From ac14b29d2320af90860d85d7ccd8fc03f40c8efb Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Thu, 26 Feb 2026 15:55:11 -0500 Subject: [PATCH 01/12] WIP sharding modeling --- .../_looptree/reuse/symbolic/symbolic.py | 45 +- examples/arches/networked/flat.yaml | 43 +- examples/arches/networked/hierarchical.yaml | 22 +- notebooks/tutorials/networks.ipynb | 456 ++++-------------- tests/test_network.py | 77 ++- 5 files changed, 209 insertions(+), 434 deletions(-) diff --git a/accelforge/model/_looptree/reuse/symbolic/symbolic.py b/accelforge/model/_looptree/reuse/symbolic/symbolic.py index 9cff4a76..e2a9a3d7 100755 --- a/accelforge/model/_looptree/reuse/symbolic/symbolic.py +++ b/accelforge/model/_looptree/reuse/symbolic/symbolic.py @@ -876,7 +876,9 @@ def analyze_spatial(node_idx, current_shape, info: AnalysisInfo): node: Spatial = mapping[node_idx] rank_var = node.rank_variable node_dim = node.name - spatial_component = find_component_object(node.component, info.job.flattened_arch) + flattened_arch = info.job.flattened_arch + arch_spec = info.job.spec.arch + spatial_component = find_component_object(node.component, flattened_arch) component_spatial_dim = spatial_component.spatial[node_dim] stride_and_shape = get_stride_and_tile_shape(node, current_shape, node_idx, info) @@ -937,15 +939,13 @@ def handle_repeated_value(repeated_shape): child_network_stats.max_hops, ) projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)] - component_object = find_component_object( - network.component, info.job.flattened_arch - ) - bits_per_value_scale = component_object.bits_per_value_scale[network.tensor] + network_object = find_component_object(network.component, flattened_arch) + bits_per_value_scale = network_object.bits_per_value_scale[network.tensor] bits_per_value = ( bits_per_value_scale * info.job.einsum.tensor_accesses[network.tensor].bits_per_value ) - bits_per_action = component_object.bits_per_action + bits_per_action = 
network_object.bits_per_action if bits_per_action is not None: actions_per_value = bits_per_value / bits_per_action else: @@ -955,8 +955,9 @@ def handle_repeated_value(repeated_shape): * actions_per_value ) - if info.job.spec.arch.is_above(node.component, network.component): + if is_component_a_above_b(node.component, network.source.level, flattened_arch): continue + source_object = find_component_object(network.source.level, flattened_arch) last_fanout = child_result.fanout.get((node.component, einsum_name), {}) last_fanout = last_fanout.get(node.name, 1) @@ -972,14 +973,18 @@ def handle_repeated_value(repeated_shape): overall_max_hops + child_network_stats.max_hops, ) elif isinstance(relevancy, Relevant): + avg_max_src_to_dst = ( + component_spatial_dim.fanout + / + source_object._get_physical_fanout_along(node.name, 1) + )-1 # Cost of unicast is the cost of delivering to each point in # the dimension with shape as stride # TODO: we should use the actual stride - total_unicast_cost = ( - 0.5 * (shape_repeats - 1) * shape_repeats * last_fanout * volume - ) - max_unicast_hops = (shape_repeats - 1) * last_fanout - overall_max_hops += max_unicast_hops + max_hops = MinGeqZero((shape_repeats-1)*last_fanout, avg_max_src_to_dst) + avg_hops = 0.5*max_hops + total_unicast_cost = avg_hops * shape_repeats * volume + overall_max_hops += max_hops accumulated_network_stats.total_hops += total_unicast_cost accumulated_network_stats.max_hops = MaxGeqZero( @@ -1070,6 +1075,22 @@ def find_component_object( raise ValueError(f"Component {component} not found in flattened arch") +def is_component_a_above_b(component_a: str, component_b: str, flattened_arch): + a_found = False + b_found = False + for node in flattened_arch: + if node.name == component_a: + a_found = True + if node.name == component_b: + b_found = True + + if a_found and not b_found: + return True + elif b_found and not a_found: + return False + raise ValueError(f"Neither {component_a} nor {component_b} found in 
flattened arch") + + def analyze_storage( node_idx: int, current_shape: dict[str, int], diff --git a/examples/arches/networked/flat.yaml b/examples/arches/networked/flat.yaml index 18c369e9..911cd59f 100644 --- a/examples/arches/networked/flat.yaml +++ b/examples/arches/networked/flat.yaml @@ -1,3 +1,5 @@ +{% set N_ROW_BUFFER = N_ROW_BUFFER | default(4) %} +{% set N_COL_BUFFER = N_COL_BUFFER | default(4) %} arch: nodes: - !Memory @@ -7,16 +9,18 @@ arch: leak_power: 0 tensors: {keep: All} actions: - - {name: read, energy: 0, latency: 0} - - {name: write, energy: 0, latency: 0} + - {name: read, energy: 100, latency: 0} + - {name: write, energy: 100, latency: 0} - !Network name: NoC area: 0 leak_power: 0 - actions: [] + actions: + - {name: hops, energy: 1, latency: 0} - !Array + name: PeArray spatial: - {name: X, fanout: 4} - {name: Y, fanout: 4} @@ -28,42 +32,42 @@ arch: leak_power: 0 tensors: {keep: ~MainMemory, may_keep: All} actions: - - {name: read, energy: 0, latency: 0} - - {name: write, energy: 0, latency: 0} + - {name: read, energy: 10, latency: 0} + - {name: write, energy: 10, latency: 0} - !Memory name: RowBuffer size: inf area: 0 leak_power: 0 - tensors: {keep: ~MainMemory, may_keep: All} + tensors: {keep: input, may_keep: input} actions: - - {name: read, energy: 0, latency: 0} - - {name: write, energy: 0, latency: 0} + - {name: read, energy: 5, latency: 0} + - {name: write, energy: 5, latency: 0} spatial: - - {name: X, fanout: 4} + - {name: X, fanout: {{N_ROW_BUFFER}}} - !Memory name: ColumnBuffer size: inf area: 0 leak_power: 0 - tensors: {keep: ~MainMemory, may_keep: All} + tensors: {keep: output, may_keep: output} actions: - - {name: read, energy: 0, latency: 0} - - {name: write, energy: 0, latency: 0} + - {name: read, energy: 5, latency: 0} + - {name: write, energy: 5, latency: 0} spatial: - - {name: Y, fanout: 4} + - {name: Y, fanout: {{N_COL_BUFFER}}} - !Memory name: DistributedBuffer size: inf area: 0 leak_power: 0 - tensors: {keep: ~MainMemory, 
may_keep: All} + tensors: {keep: weight, may_keep: weight} actions: - - {name: read, energy: 0, latency: 0} - - {name: write, energy: 0, latency: 0} + - {name: read, energy: 5, latency: 0} + - {name: write, energy: 5, latency: 0} spatial: - {name: X, fanout: 2} - {name: Y, fanout: 2} @@ -73,13 +77,14 @@ arch: size: inf area: 0 leak_power: 0 + tensors: {keep: weight, may_keep: weight} actions: - - {name: read, energy: 0, latency: 0} - - {name: write, energy: 0, latency: 0} + - {name: read, energy: 1, latency: 0} + - {name: write, energy: 1, latency: 0} - !Compute name: MAC area: 0 leak_power: 0 actions: - - {name: compute, energy: 0, latency: 0} \ No newline at end of file + - {name: compute, energy: 1, latency: 0} \ No newline at end of file diff --git a/examples/arches/networked/hierarchical.yaml b/examples/arches/networked/hierarchical.yaml index 5a7d44be..211d0e81 100644 --- a/examples/arches/networked/hierarchical.yaml +++ b/examples/arches/networked/hierarchical.yaml @@ -7,8 +7,8 @@ arch: leak_power: 0 tensors: {keep: All} actions: - - {name: read, energy: 0, latency: 0} - - {name: write, energy: 0, latency: 0} + - {name: read, energy: 100, latency: 1e-9} + - {name: write, energy: 100, latency: 1e-9} - !Memory name: GlobalBuffer @@ -17,15 +17,15 @@ arch: leak_power: 0 tensors: {keep: ~MainMemory, may_keep: All} actions: - - {name: read, energy: 0, latency: 0} - - {name: write, energy: 0, latency: 0} + - {name: read, energy: 10, latency: 1e-9/4} + - {name: write, energy: 10, latency: 1e-9/4} - !Network - name: PeArray + name: PeNoc area: 0 leak_power: 0 actions: - - {name: hops, energy: 1, latency: 0} + - {name: hops, energy: 5, latency: 1e-9/4} - !Memory name: Scratchpad @@ -34,25 +34,25 @@ arch: leak_power: 0 tensors: {keep: All} actions: - - {name: read, energy: 0, latency: 0} - - {name: write, energy: 0, latency: 0} + - {name: read, energy: 2, latency: 1e-9/16} + - {name: write, energy: 2, latency: 1e-9/16} spatial: - {name: X, fanout: 2} - {name: Y, 
fanout: 2} - !Network - name: MacArray + name: MacNoc area: 0 leak_power: 0 actions: - - {name: hops, energy: 1, latency: 0} + - {name: hops, energy: 1, latency: 1e-9/16} - !Compute name: MAC area: 0 leak_power: 0 actions: - - {name: compute, energy: 0, latency: 0} + - {name: compute, energy: 1, latency: 1e-9} spatial: - {name: X, fanout: 2} - {name: Y, fanout: 2} \ No newline at end of file diff --git a/notebooks/tutorials/networks.ipynb b/notebooks/tutorials/networks.ipynb index d809e0d9..0c09b7af 100644 --- a/notebooks/tutorials/networks.ipynb +++ b/notebooks/tutorials/networks.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "43938186", "metadata": {}, "outputs": [], @@ -20,117 +20,50 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "49a31e7a", "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "G\n", - "\n", - "\n", - "\n", - "Memory_125810196833264\n", - "\n", - "\n", - "MainMemory with size inf\n", - "\n", - "\n", - "\n", - "Memory_125810196833584\n", - "\n", - "\n", - "GlobalBuffer with size inf\n", - "\n", - "\n", - "\n", - "Memory_125810196833264--Memory_125810196833584\n", - "\n", - "\n", - "\n", - "\n", - "Network_125810196835344\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "PeArray\n", - "\n", - "\n", - "\n", - "Memory_125810196833584--Network_125810196835344\n", - "\n", - "\n", - "\n", - "\n", - "Memory_125810196835744\n", - "\n", - "\n", - "Scratchpad with size inf\n", - "[2× X, 2× Y]\n", - "\n", - "\n", - "\n", - "Network_125810196835344--Memory_125810196835744\n", - "\n", - "\n", - "\n", - "\n", - "Network_125810196835824\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "MacArray\n", - "\n", - "\n", - "\n", - "Memory_125810196835744--Network_125810196835824\n", - "\n", - "\n", - "\n", - "\n", - "Compute_125810196837184\n", - "\n", - "MAC\n", - "[2× X, 2× Y]\n", - "\n", - "\n", - "\n", 
- "Network_125810196835824--Compute_125810196837184\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "Arch(nodes=ArchNodes([Memory(name='MainMemory', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='All', may_keep='', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Memory(name='GlobalBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), 
tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Network(name='PeArray', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs()), Memory(name='Scratchpad', spatial=[Spatial(name='X', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='All', may_keep='', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Network(name='MacArray', spatial=[], component_class=None, component_model=None, 
component_modeling_log=[], actions=[], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs()), Compute(name='MAC', spatial=[Spatial(name='X', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[Action(name='compute', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras())], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs())]), variables=EvalExtras(), extra_attributes_for_all_component_models=EvalExtras())" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spec = af.Spec.from_yaml(\n", - " af.examples.arches.networked.hierarchical\n", + " af.examples.arches.networked.hierarchical,\n", + " af.examples.workloads.matmuls,\n", + " jinja_parse_data={\"N_EINSUMS\": 1}\n", ")\n", "spec.arch" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a64424bb", + "metadata": {}, + "outputs": [], + "source": [ + "spec.mapper.metrics = af.mapper.Metrics.LATENCY | af.mapper.Metrics.ENERGY\n", + "result = spec.map_workload_to_arch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cc6ed1d", + "metadata": {}, + "outputs": [], + "source": [ + "result.data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0dadcac", + 
"metadata": {}, + "outputs": [], + "source": [ + "result.energy(per_component=True)" + ] + }, { "cell_type": "markdown", "id": "389cb739", @@ -141,178 +74,76 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "9a11eec1", "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "G\n", - "\n", - "\n", - "\n", - "Memory_125810196833024\n", - "\n", - "\n", - "MainMemory with size inf\n", - "\n", - "\n", - "\n", - "Network_125810197053520\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "NoC\n", - "\n", - "\n", - "\n", - "Memory_125810196833024--Network_125810197053520\n", - "\n", - "\n", - "\n", - "\n", - "Array_125810197054240\n", - "\n", - "Array \n", - "[4× X, 4× Y]\n", - "\n", - "\n", - "\n", - "Network_125810197053520--Array_125810197054240\n", - "\n", - "\n", - "\n", - "\n", - "Memory_125810197054800\n", - "\n", - "\n", - "GlobalBuffer with size inf\n", - "\n", - "\n", - "\n", - "Array_125810197054240--Memory_125810197054800\n", - "\n", - "\n", - "\n", - "\n", - "Memory_125810197055840\n", - "\n", - "\n", - "RowBuffer with size inf\n", - "[4× X]\n", - "\n", - "\n", - "\n", - "Array_125810197054240--Memory_125810197055840\n", - "\n", - "\n", - "\n", - "\n", - "Memory_125810197056320\n", - "\n", - "\n", - "ColumnBuffer with size inf\n", - "[4× Y]\n", - "\n", - "\n", - "\n", - "Array_125810197054240--Memory_125810197056320\n", - "\n", - "\n", - "\n", - "\n", - "Memory_125810197057440\n", - "\n", - "\n", - "DistributedBuffer with size inf\n", - "[2× X, 2× Y]\n", - "\n", - "\n", - "\n", - "Array_125810197054240--Memory_125810197057440\n", - "\n", - "\n", - "\n", - "\n", - "Memory_125810197059360\n", - "\n", - "\n", - "Scratchpad with size inf\n", - "\n", - "\n", - "\n", - "Array_125810197054240--Memory_125810197059360\n", - "\n", - "\n", - "\n", - "\n", - "Compute_125810197060160\n", - "\n", - "MAC\n", - "\n", - "\n", - "\n", - 
"Memory_125810197059360--Compute_125810197060160\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "Arch(nodes=ArchNodes([Memory(name='MainMemory', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='All', may_keep='', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Network(name='NoC', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs()), Array(spatial=[Spatial(name='X', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], nodes=ArchNodes([Memory(name='GlobalBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', 
energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Memory(name='RowBuffer', spatial=[Spatial(name='X', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), 
bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Memory(name='ColumnBuffer', spatial=[Spatial(name='Y', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Memory(name='DistributedBuffer', spatial=[Spatial(name='X', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if 
bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf')])), Memory(name='Scratchpad', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='', may_keep='', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Compute(name='MAC', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[Action(name='compute', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras())], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, 
total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs())]), variables=EvalExtras(), extra_attributes_for_all_component_models=EvalExtras())" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spec = af.Spec.from_yaml(\n", - " af.examples.arches.networked.flat\n", + " af.examples.arches.networked.flat,\n", + " af.examples.workloads.matmuls,\n", + " jinja_parse_data={\"N_EINSUMS\": 1}\n", ")\n", "spec.arch" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "d2bbda8a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[Memory(name='MainMemory', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for MainMemory action read.', 'Setting MainMemory energy to action.energy=0', 'Calculating energy for MainMemory action write.', 'Setting MainMemory energy to action.energy=0', 'Calculating latency for MainMemory action read.', 'Setting MainMemory latency to action.latency=0', 'Calculating latency for MainMemory action write.', 'Setting MainMemory latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for MainMemory action read.', 'Setting MainMemory energy to action.energy=0', 'Calculating energy for MainMemory action write.', 'Setting MainMemory energy to action.energy=0', 'Calculating latency for MainMemory action read.', 'Setting MainMemory latency to action.latency=0', 'Calculating latency for MainMemory action write.', 'Setting MainMemory latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), 
bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep=InvertibleSet(frozenset()), may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf),\n", - " Network(name='NoC', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Using predefined leak power value self.leak_power=0'], actions=[], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs()),\n", - " Memory(name='GlobalBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for GlobalBuffer action read.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating energy for GlobalBuffer action write.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating latency for GlobalBuffer action read.', 'Setting GlobalBuffer latency to action.latency=0', 'Calculating latency for GlobalBuffer action write.', 'Setting GlobalBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 
'Calculating energy for GlobalBuffer action read.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating energy for GlobalBuffer action write.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating latency for GlobalBuffer action read.', 'Setting GlobalBuffer latency to action.latency=0', 'Calculating latency for GlobalBuffer action write.', 'Setting GlobalBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf),\n", - " Memory(name='RowBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for RowBuffer action read.', 'Setting RowBuffer energy to action.energy=0', 'Calculating energy for RowBuffer action write.', 'Setting RowBuffer energy to action.energy=0', 'Calculating latency for RowBuffer action read.', 'Setting RowBuffer latency to action.latency=0', 'Calculating latency for RowBuffer action write.', 'Setting RowBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 
'Calculating energy for RowBuffer action read.', 'Setting RowBuffer energy to action.energy=0', 'Calculating energy for RowBuffer action write.', 'Setting RowBuffer energy to action.energy=0', 'Calculating latency for RowBuffer action read.', 'Setting RowBuffer latency to action.latency=0', 'Calculating latency for RowBuffer action write.', 'Setting RowBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf),\n", - " Memory(name='ColumnBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for ColumnBuffer action read.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating energy for ColumnBuffer action write.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating latency for ColumnBuffer action read.', 'Setting ColumnBuffer latency to action.latency=0', 'Calculating latency for ColumnBuffer action write.', 'Setting ColumnBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 
'Calculating energy for ColumnBuffer action read.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating energy for ColumnBuffer action write.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating latency for ColumnBuffer action read.', 'Setting ColumnBuffer latency to action.latency=0', 'Calculating latency for ColumnBuffer action write.', 'Setting ColumnBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf),\n", - " Memory(name='DistributedBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for DistributedBuffer action read.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating energy for DistributedBuffer action write.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating latency for DistributedBuffer action read.', 'Setting DistributedBuffer latency to action.latency=0', 'Calculating latency for DistributedBuffer action write.', 'Setting DistributedBuffer latency to action.latency=0', 'Using predefined leak power 
value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for DistributedBuffer action read.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating energy for DistributedBuffer action write.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating latency for DistributedBuffer action read.', 'Setting DistributedBuffer latency to action.latency=0', 'Calculating latency for DistributedBuffer action write.', 'Setting DistributedBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf),\n", - " Array(spatial=[Spatial(name='X', fanout=4, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=4, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False)], nodes=[Memory(name='GlobalBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for 
GlobalBuffer action read.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating energy for GlobalBuffer action write.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating latency for GlobalBuffer action read.', 'Setting GlobalBuffer latency to action.latency=0', 'Calculating latency for GlobalBuffer action write.', 'Setting GlobalBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for GlobalBuffer action read.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating energy for GlobalBuffer action write.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating latency for GlobalBuffer action read.', 'Setting GlobalBuffer latency to action.latency=0', 'Calculating latency for GlobalBuffer action write.', 'Setting GlobalBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf), Memory(name='RowBuffer', spatial=[Spatial(name='X', fanout=4, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, 
reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for RowBuffer action read.', 'Setting RowBuffer energy to action.energy=0', 'Calculating energy for RowBuffer action write.', 'Setting RowBuffer energy to action.energy=0', 'Calculating latency for RowBuffer action read.', 'Setting RowBuffer latency to action.latency=0', 'Calculating latency for RowBuffer action write.', 'Setting RowBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for RowBuffer action read.', 'Setting RowBuffer energy to action.energy=0', 'Calculating energy for RowBuffer action write.', 'Setting RowBuffer energy to action.energy=0', 'Calculating latency for RowBuffer action read.', 'Setting RowBuffer latency to action.latency=0', 'Calculating latency for RowBuffer action write.', 'Setting RowBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf), 
Memory(name='ColumnBuffer', spatial=[Spatial(name='Y', fanout=4, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for ColumnBuffer action read.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating energy for ColumnBuffer action write.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating latency for ColumnBuffer action read.', 'Setting ColumnBuffer latency to action.latency=0', 'Calculating latency for ColumnBuffer action write.', 'Setting ColumnBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for ColumnBuffer action read.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating energy for ColumnBuffer action write.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating latency for ColumnBuffer action read.', 'Setting ColumnBuffer latency to action.latency=0', 'Calculating latency for ColumnBuffer action write.', 'Setting ColumnBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), 
tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf), Memory(name='DistributedBuffer', spatial=[Spatial(name='X', fanout=2, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=2, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for DistributedBuffer action read.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating energy for DistributedBuffer action write.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating latency for DistributedBuffer action read.', 'Setting DistributedBuffer latency to action.latency=0', 'Calculating latency for DistributedBuffer action write.', 'Setting DistributedBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for DistributedBuffer action read.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating energy for DistributedBuffer action write.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating latency for DistributedBuffer action read.', 'Setting DistributedBuffer latency to action.latency=0', 'Calculating latency for DistributedBuffer action write.', 'Setting DistributedBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, 
extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf)]),\n", - " Memory(name='Scratchpad', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for Scratchpad action read.', 'Setting Scratchpad energy to action.energy=0', 'Calculating energy for Scratchpad action write.', 'Setting Scratchpad energy to action.energy=0', 'Calculating latency for Scratchpad action read.', 'Setting Scratchpad latency to action.latency=0', 'Calculating latency for Scratchpad action write.', 'Setting Scratchpad latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for Scratchpad action read.', 'Setting Scratchpad energy to action.energy=0', 'Calculating energy for Scratchpad action write.', 'Setting Scratchpad energy to action.energy=0', 'Calculating latency for Scratchpad action read.', 'Setting Scratchpad latency to action.latency=0', 'Calculating latency for Scratchpad action write.', 'Setting Scratchpad latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, 
extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep=InvertibleSet(frozenset()), may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf),\n", - " Compute(name='MAC', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for MAC action compute.', 'Setting MAC energy to action.energy=0', 'Calculating latency for MAC action compute.', 'Setting MAC latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for MAC action compute.', 'Setting MAC energy to action.energy=0', 'Calculating latency for MAC action compute.', 'Setting MAC latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[Action(name='compute', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras())], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs())]]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], + "source": [ + "result = spec.map_workload_to_arch()\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e2b3332", + "metadata": {}, + "outputs": [], 
"source": [ - "spec.calculate_component_area_energy_latency_leak()._get_flattened_architecture()" + "from accelforge.plotting.mappings import plot_energy_breakdown\n", + "\n", + "plot_energy_breakdown([result], separate_by=[\"component\"], stack_by=[\"tensor\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "741719fa", + "metadata": {}, + "outputs": [], + "source": [ + "spec = af.Spec.from_yaml(\n", + " af.examples.arches.networked.flat,\n", + " af.examples.workloads.matmuls,\n", + " jinja_parse_data={\"N_EINSUMS\": 1, \"N_ROW_BUFFER\": 1, \"N_COL_BUFFER\": 1},\n", + ")\n", + "spec.arch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a62e6dfa", + "metadata": {}, + "outputs": [], + "source": [ + "result = spec.map_workload_to_arch()\n", + "plot_energy_breakdown([result], separate_by=[\"component\"], stack_by=[\"tensor\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "929f5399", + "metadata": {}, + "outputs": [], + "source": [ + "result.data[[c for c in result.data.columns if \"hops\" in c]]" ] }, { @@ -325,100 +156,24 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "cc2df4b6", "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "G\n", - "\n", - "\n", - "\n", - "Memory_125810197377456\n", - "\n", - "\n", - "MainMemory with size inf\n", - "\n", - "\n", - "\n", - "Network_125810192413344\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "NoC\n", - "\n", - "\n", - "\n", - "Memory_125810197377456--Network_125810192413344\n", - "\n", - "\n", - "\n", - "\n", - "Memory_125810192404624\n", - "\n", - "\n", - "HBM with size inf\n", - "[4× X, 4× Y]\n", - "\n", - "\n", - "\n", - "Network_125810192413344--Memory_125810192404624\n", - "\n", - "\n", - "\n", - "\n", - "Memory_125810192599872\n", - "\n", - "\n", - "Buffer with size inf\n", - "\n", - "\n", - "\n", - 
"Memory_125810192404624--Memory_125810192599872\n", - "\n", - "\n", - "\n", - "\n", - "Compute_125810192606352\n", - "\n", - "MAC\n", - "\n", - "\n", - "\n", - "Memory_125810192599872--Compute_125810192606352\n", - "\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "Arch(nodes=ArchNodes([Memory(name='MainMemory', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='All', may_keep='', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Network(name='NoC', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs()), Memory(name='HBM', spatial=[Spatial(name='X', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], 
component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Memory(name='Buffer', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='', may_keep='', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, 
bits_per_action=None, size='inf'), Compute(name='MAC', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[Action(name='compute', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras())], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs())]), variables=EvalExtras(), extra_attributes_for_all_component_models=EvalExtras())" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spec = af.Spec.from_yaml(\n", " af.examples.arches.networked.rack\n", ")\n", "spec.arch" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e54780a", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -436,8 +191,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/tests/test_network.py b/tests/test_network.py index e8598833..601dc713 100644 --- a/tests/test_network.py +++ b/tests/test_network.py @@ -2,18 +2,20 @@ import accelforge as af +af.set_n_parallel_jobs(1) + class TestParsing(TestCase): def test_hierarchical(self): spec = af.Spec.from_yaml( af.examples.arches.networked.hierarchical, ) - self.assertIn("PeArray", spec.arch.nodes) - self.assertEqual(spec.arch.nodes["PeArray"].get_fanout(), 1) + self.assertIn("PeNoc", spec.arch.nodes) + self.assertEqual(spec.arch.nodes["PeNoc"].get_fanout(), 1) self.assertIn("Scratchpad", spec.arch.nodes) self.assertEqual(spec.arch.nodes["Scratchpad"].get_fanout(), 4) - self.assertIn("MacArray", spec.arch.nodes) - self.assertEqual(spec.arch.nodes["MacArray"].get_fanout(), 1) + self.assertIn("MacNoc", 
spec.arch.nodes) + self.assertEqual(spec.arch.nodes["MacNoc"].get_fanout(), 1) try: spec = spec.calculate_component_area_energy_latency_leak() @@ -50,34 +52,34 @@ def test_hierarchical(self): af.examples.workloads.matmuls, af.examples.arches.networked.hierarchical, af.examples.mappings.one_matmul_to_networked_hierarchical, - jinja_parse_data={"N_EINSUMS": 1, "M": 8, "KN": 8, "MAC_TILE": MAC_TILE, "M_TILE": M_TILE} + jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN, "MAC_TILE": MAC_TILE, "M_TILE": M_TILE} ) result = spec.evaluate_mapping() self.assertEqual( - result.data["Matmul0actionMacArrayT0hops"].iloc[0], + result.data["Matmul0actionMacNocT0hops"].iloc[0], (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (0.5*MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE ) # NOTE: assuming XY routing (as defined in mapping) self.assertEqual( - result.data["Matmul0actionMacArrayT1hops"].iloc[0], + result.data["Matmul0actionMacNocT1hops"].iloc[0], (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE ) self.assertEqual( - result.data["Matmul0actionMacArrayW0hops"].iloc[0], + result.data["Matmul0actionMacNocW0hops"].iloc[0], (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE ) self.assertEqual( - result.data["Matmul0actionPeArrayT0hops"].iloc[0], + result.data["Matmul0actionPeNocT0hops"].iloc[0], (M/M_TILE) * (0.5*PE_TILE*(PE_TILE-1) + PE_TILE*(PE_TILE-1)) * M_TILE*MAC_TILE*BITS_PER_VALUE ) # NOTE: assuming XY routing (as defined in mapping) self.assertEqual( - result.data["Matmul0actionPeArrayT1hops"].iloc[0], + result.data["Matmul0actionPeNocT1hops"].iloc[0], (M/M_TILE) * (PE_TILE*(PE_TILE-1) + PE_TILE*0.5*PE_TILE*(PE_TILE-1)) * M_TILE*MAC_TILE*BITS_PER_VALUE ) self.assertEqual( - result.data["Matmul0actionPeArrayW0hops"].iloc[0], + result.data["Matmul0actionPeNocW0hops"].iloc[0], (M/M_TILE) * (PE_TILE*0.5*PE_TILE*(PE_TILE-1) + PE_TILE*0.5*PE_TILE*(PE_TILE-1)) * 
MAC_TILE**2*BITS_PER_VALUE ) @@ -87,40 +89,33 @@ def test_hierarchical(self): M = 8 KN = 8 MAC_TILE = 2 - PE_TILE = KN//MAC_TILE M_TILE = 4 - BITS_PER_VALUE = 8 spec = af.Spec.from_yaml( af.examples.workloads.matmuls, af.examples.arches.networked.hierarchical, - jinja_parse_data={"N_EINSUMS": 1, "M": 8, "KN": 8, "MAC_TILE": MAC_TILE, "M_TILE": M_TILE} + jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN} ) result = spec.map_workload_to_arch() - # self.assertEqual( - # result.data["Matmul0actionMacArrayT0hops"].iloc[0], - # (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (0.5*MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE - # ) - # # NOTE: assuming XY routing (as defined in mapping) - # self.assertEqual( - # result.data["Matmul0actionMacArrayT1hops"].iloc[0], - # (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE - # ) - # self.assertEqual( - # result.data["Matmul0actionMacArrayW0hops"].iloc[0], - # (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE - # ) - - # self.assertEqual( - # result.data["Matmul0actionPeArrayT0hops"].iloc[0], - # (M/M_TILE) * (0.5*PE_TILE*(PE_TILE-1) + PE_TILE*(PE_TILE-1)) * M_TILE*MAC_TILE*BITS_PER_VALUE - # ) - # # NOTE: assuming XY routing (as defined in mapping) - # self.assertEqual( - # result.data["Matmul0actionPeArrayT1hops"].iloc[0], - # (M/M_TILE) * (PE_TILE*(PE_TILE-1) + PE_TILE*0.5*PE_TILE*(PE_TILE-1)) * M_TILE*MAC_TILE*BITS_PER_VALUE - # ) - # self.assertEqual( - # result.data["Matmul0actionPeArrayW0hops"].iloc[0], - # (M/M_TILE) * (PE_TILE*0.5*PE_TILE*(PE_TILE-1) + PE_TILE*0.5*PE_TILE*(PE_TILE-1)) * MAC_TILE**2*BITS_PER_VALUE - # ) \ No newline at end of file + + def test_flat(self): + M = 8 + KN = 8 + + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + af.examples.arches.networked.flat, + jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN} + ) + result = spec.map_workload_to_arch() + + def 
test_flat_one_row_buffer(self): + M = 8 + KN = 8 + + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + af.examples.arches.networked.flat, + jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN, "N_ROW_BUFFER": 1} + ) + result = spec.map_workload_to_arch() \ No newline at end of file From 64470bb86a8af8e1ee39f6980b7e2a5b9afd01cf Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Thu, 21 May 2026 19:56:53 -0400 Subject: [PATCH 02/12] [model] Use flattened arch --- .../model/_looptree/reuse/symbolic/symbolic/_network.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/accelforge/model/_looptree/reuse/symbolic/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/symbolic/_network.py index 6870da58..b143f10a 100644 --- a/accelforge/model/_looptree/reuse/symbolic/symbolic/_network.py +++ b/accelforge/model/_looptree/reuse/symbolic/symbolic/_network.py @@ -34,6 +34,8 @@ def accumulate_child_result( child_shape, node, ): + flattened_arch = info.job.flattened_arch + for network, child_network_stats in child_result.network_stats.items(): if network not in self.network_stats: self.network_stats[network] = NetworkStats() @@ -48,7 +50,7 @@ def accumulate_child_result( ) projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)] component_object = find_component_object( - network.component, info.job.flattened_arch + network.component, flattened_arch ) workload_bpv = info.job.einsum.tensor_accesses[ network.tensor @@ -66,9 +68,7 @@ def accumulate_child_result( * actions_per_value ) - if info.job.spec_one_einsum.arch.is_above( - node.component, network.component - ): + if is_component_a_above_b(node.component, network.component, flattened_arch): continue relevancy = info.tensor_to_relevancy[network.tensor][node.rank_variable] From d34e44b661db3ceef44212d6a27b2f5049f57038 Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Fri, 22 May 2026 11:38:26 -0400 Subject: [PATCH 03/12] [frontend] Add comment to explain 
physical_fanout --- accelforge/frontend/arch/structure.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/accelforge/frontend/arch/structure.py b/accelforge/frontend/arch/structure.py index fe2bf580..ac75d03b 100644 --- a/accelforge/frontend/arch/structure.py +++ b/accelforge/frontend/arch/structure.py @@ -333,6 +333,10 @@ def _flatten( nodes = [] + # Nodes inside an array are flattened to fit into a hierarchical + # model in order to map. + # However, we will keep information about how these nodes are + # arranged for modeling. for node in self.nodes: try: if isinstance(node, Branch): From 4b3fcabcd670e4b81d425d1096c4a9e798859789 Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Fri, 22 May 2026 13:44:09 -0400 Subject: [PATCH 04/12] [network] Add *untested* distributed model --- accelforge/frontend/arch/spatialable.py | 4 +- .../_looptree/reuse/symbolic/_network.py | 124 ++++++++---------- tests/test_network.py | 36 ++--- 3 files changed, 72 insertions(+), 92 deletions(-) diff --git a/accelforge/frontend/arch/spatialable.py b/accelforge/frontend/arch/spatialable.py index 367f4506..a5a3e286 100644 --- a/accelforge/frontend/arch/spatialable.py +++ b/accelforge/frontend/arch/spatialable.py @@ -142,11 +142,11 @@ def _get_physical_fanout_along(self, dim_name: str, default: int = 1) -> int: return s.fanout return default - def _get_physical_stride_along(self, dim_name: str, default: int = 1) -> int: + def _get_physical_stride_along(self, dim_name: str) -> int: for s in self._physical_spatial: if s.name == dim_name: return s.stride - return default + raise ValueError(f"dimension {dim_name} not found") def _spatial_str(self, include_newline=True) -> str: if not self.spatial: diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py index 28a3b186..4c7cc9ea 100644 --- a/accelforge/model/_looptree/reuse/symbolic/_network.py +++ b/accelforge/model/_looptree/reuse/symbolic/_network.py @@ -12,10 +12,10 
@@ PartiallyRelevant, ) -from accelforge.util._sympy.broadcast_max import Min, Max, MaxGeqZero +from accelforge.util._sympy.broadcast_max import MaxGeqZero, MinGeqZero from ._common import AnalysisInfo -from ._stats import NetworkStats +from ._stats import NetworkStats, SymbolicAnalysisOutput class NetworkAnalyzer: @@ -25,7 +25,7 @@ def __init__(self, network_stats): def accumulate_child_result( self, - child_result, + child_result: SymbolicAnalysisOutput, info: AnalysisInfo, shape_repeats, einsum_name, @@ -35,6 +35,7 @@ def accumulate_child_result( flattened_arch = info.job.flattened_arch for network, child_network_stats in child_result.network_stats.items(): + src_component = flattened_arch[network.source.level] if network not in self.network_stats: self.network_stats[network] = NetworkStats() accumulated_network_stats = self.network_stats[network] @@ -64,93 +65,72 @@ def accumulate_child_result( * actions_per_value ) - if is_component_a_above_b(node.component, network.component, flattened_arch): + if flattened_arch.is_above(node.component, network.component): continue relevancy = info.tensor_to_relevancy[network.tensor][node.rank_variable] + # The fanout in this dimension in mapping nodes below, i.e., the stride last_fanout = child_result.fanout.get((node.component, einsum_name), {}) last_fanout = last_fanout.get(node.name, 1) if isinstance(relevancy, Irrelevant): - # Cost of multicasting is the cost of delivering along the dimension - multicast_hops = shape_repeats * last_fanout - multicast_cost = multicast_hops * volume - self.overall_max_hops += multicast_hops - - accumulated_network_stats.total_hops += multicast_cost - accumulated_network_stats.max_hops = MaxGeqZero( - accumulated_network_stats.max_hops, - self.overall_max_hops + child_network_stats.max_hops, - ) + # Distributed or not, the amount of total cost is the same. 
+ # However, the accesses now come from different physical memories + total_cost = multicast_cost(shape_repeats, last_fanout)*volume + max_hops = shape_repeats*last_fanout elif isinstance(relevancy, Relevant): - # Cost of unicast is the cost of delivering to each point in - # the dimension with shape as stride - # TODO: we should use the actual stride - total_unicast_cost = ( - 0.5 * (shape_repeats + 1) * shape_repeats * last_fanout * volume - ) - max_unicast_hops = shape_repeats * last_fanout - self.overall_max_hops += max_unicast_hops - - accumulated_network_stats.total_hops += total_unicast_cost - accumulated_network_stats.max_hops = MaxGeqZero( - accumulated_network_stats.max_hops, - self.overall_max_hops + child_network_stats.max_hops, - ) + # If distributed, then we bind data as locally as possible in the + # physical buffers + if src_component._get_physical_fanout_along(node.name) > 1: + physical_stride = src_component._get_physical_stride_along(node.name) + + n_dsts_per_physical = MinGeqZero( + # if last_fanout > physical_stride, set n_dst to 1, which results in 0 hops + # later (which is correct because the set of destinations always overlap + # the set of sources). 
+ MaxGeqZero(physical_stride / last_fanout, 1), + shape_repeats + ) + n_activated_physical = MaxGeqZero(shape_repeats*last_fanout/physical_stride, 1) + total_cost = ( + n_activated_physical + * + unicast_cost(n_dsts_per_physical, last_fanout) + * + volume + ) + max_hops = MinGeqZero(shape_repeats*last_fanout, physical_stride) + else: + total_cost = unicast_cost(shape_repeats, last_fanout)*volume + max_hops = shape_repeats * last_fanout elif isinstance(relevancy, PartiallyRelevant): raise NotImplementedError() else: raise RuntimeError(f"unhandled relevancy type {relevancy}") - return self.overall_max_hops - - -def reduce_dicts(dict1: dict, dict2: dict, reduce_op): - for key in dict1: - if key not in dict2: - dict2[key] = dict1[key] - else: - dict2[key] = reduce_op(dict1[key], dict2[key]) - - -def get_total_to_per_unit(total, max_per_unit): - if total == 0 and max_per_unit != 0: - raise ValueError(f"total is 0 but max_per_unit is {max_per_unit}") - if total == 0: - return 1 - return max_per_unit / total + # TODO: this is sketchy + self.overall_max_hops += max_hops + accumulated_network_stats.total_hops += total_cost + accumulated_network_stats.max_hops = MaxGeqZero( + accumulated_network_stats.max_hops, + self.overall_max_hops + child_network_stats.max_hops, + ) -def has_parent_tensor_holder( - tensor: TensorName, node_idx: int, info -) -> bool: - for node in info.mapping[:node_idx]: - if isinstance(node, TensorHolder) and tensor in node.tensors: - return True - return False + return self.overall_max_hops -def find_component_object( - component: str, flattened_arch: list[arch.Leaf] -) -> arch.TensorHolder: - for node in flattened_arch: - if node.name == component: - return node - raise ValueError(f"Component {component} not found in flattened arch") +def multicast_cost(n_dsts, stride): + """Returns total hops of multicast along a dimension.""" + return (n_dsts-1)*stride -def is_component_a_above_b(component_a: str, component_b: str, flattened_arch): - a_found = False 
- b_found = False - for node in flattened_arch: - if node.name == component_a: - a_found = True - if node.name == component_b: - b_found = True +def unicast_cost(n_dsts, stride): + """Returns total hops of unicast along a dimension.""" + # Cost of unicast is the cost of delivering to each point in + # the dimension with shape as stride + return arithmetic_sum(n_dsts-1)*stride - if a_found and not b_found: - return True - elif b_found and not a_found: - return False - raise ValueError(f"Neither {component_a} nor {component_b} found in flattened arch") +def arithmetic_sum(n): + return 0.5 * (n+1) * n \ No newline at end of file diff --git a/tests/test_network.py b/tests/test_network.py index 406a7017..e7cd640f 100644 --- a/tests/test_network.py +++ b/tests/test_network.py @@ -78,7 +78,7 @@ def test_hierarchical_1d(self): * (KN / MAC_TILE) # number of used Scratchpad * M_TILE * KN # temporal for n1 in mapping - * sum(i+1 for i in range(MAC_TILE)) # unicast along X-axis of MacArray + * sum(i for i in range(MAC_TILE)) # unicast along X-axis of MacArray * BITS_PER_VALUE, ) # NOTE: assuming XY routing (as defined in mapping) @@ -88,7 +88,7 @@ def test_hierarchical_1d(self): * (KN / MAC_TILE) * M_TILE * KN # temporal for n1 in mapping - * MAC_TILE # multicast along X-axis of MacArray + * (MAC_TILE - 1) # multicast along X-axis of MacArray * BITS_PER_VALUE, ) self.assertEqual( @@ -97,14 +97,14 @@ def test_hierarchical_1d(self): * (KN / MAC_TILE) * M_TILE * KN - * sum(i+1 for i in range(MAC_TILE)) + * sum(i for i in range(MAC_TILE)) * BITS_PER_VALUE, ) self.assertEqual( result.data["Matmul0actionPeArrayT0hops"].iloc[0], (M / M_TILE) - * sum(i+1 for i in range(KN // MAC_TILE)) # unicast along X-axis of PeArray + * sum(i for i in range(KN // MAC_TILE)) # unicast along X-axis of PeArray * M_TILE * MAC_TILE * BITS_PER_VALUE, @@ -113,7 +113,7 @@ def test_hierarchical_1d(self): self.assertEqual( result.data["Matmul0actionPeArrayT1hops"].iloc[0], (M / M_TILE) - * KN // MAC_TILE # 
multicast along X-axis of PeArray + * (KN // MAC_TILE - 1) # multicast along X-axis of PeArray * M_TILE * KN * BITS_PER_VALUE, @@ -121,7 +121,7 @@ def test_hierarchical_1d(self): self.assertEqual( result.data["Matmul0actionPeArrayW0hops"].iloc[0], (M / M_TILE) - * sum(i+1 for i in range(KN // MAC_TILE)) # unicast along PeArray + * sum(i for i in range(KN // MAC_TILE)) # unicast along PeArray * MAC_TILE * KN * BITS_PER_VALUE, @@ -156,9 +156,9 @@ def test_hierarchical(self): * (KN / MAC_TILE) ** 2 * M_TILE * ( - sum(i+1 for i in range(MAC_TILE)) # unicasting along X + sum(i for i in range(MAC_TILE)) # unicasting along X + - MAC_TILE * MAC_TILE # multicast along Y for each column + MAC_TILE * (MAC_TILE-1) # multicast along Y for each column ) * BITS_PER_VALUE, ) @@ -169,9 +169,9 @@ def test_hierarchical(self): * (KN / MAC_TILE) ** 2 * M_TILE * ( - MAC_TILE * MAC_TILE # multicast along X (the tile is shape N1, which is MAC_TILE here) + MAC_TILE * (MAC_TILE - 1) # multicast along X (the tile is shape N1, which is MAC_TILE here) + - MAC_TILE * sum(i+1 for i in range(MAC_TILE)) # unicasting along Y for each row + MAC_TILE * sum(i for i in range(MAC_TILE)) # unicasting along Y for each row ) * BITS_PER_VALUE, ) @@ -181,9 +181,9 @@ def test_hierarchical(self): * (KN / MAC_TILE) ** 2 * M_TILE * ( - MAC_TILE * sum(i+1 for i in range(MAC_TILE)) # unicast along X (the tile is shape N1, which is MAC_TILE here) + MAC_TILE * sum(i for i in range(MAC_TILE)) # unicast along X (the tile is shape N1, which is MAC_TILE here) + - MAC_TILE * sum(i+1 for i in range(MAC_TILE)) # unicasting along Y for each row + MAC_TILE * sum(i for i in range(MAC_TILE)) # unicasting along Y for each row ) * BITS_PER_VALUE, ) @@ -192,9 +192,9 @@ def test_hierarchical(self): result.data["Matmul0actionPeArrayT0hops"].iloc[0], (M / M_TILE) * ( - sum(i+1 for i in range(PE_TILE)) + sum(i for i in range(PE_TILE)) + - PE_TILE * PE_TILE + PE_TILE * (PE_TILE - 1) ) # tile shape * M_TILE @@ -206,9 +206,9 @@ def 
test_hierarchical(self): result.data["Matmul0actionPeArrayT1hops"].iloc[0], (M / M_TILE) * ( - PE_TILE * PE_TILE + PE_TILE * (PE_TILE - 1) + - PE_TILE * sum(i+1 for i in range(PE_TILE)) + PE_TILE * sum(i for i in range(PE_TILE)) ) * M_TILE * MAC_TILE @@ -218,9 +218,9 @@ def test_hierarchical(self): result.data["Matmul0actionPeArrayW0hops"].iloc[0], (M / M_TILE) * ( - PE_TILE * sum(i+1 for i in range(PE_TILE)) + PE_TILE * sum(i for i in range(PE_TILE)) + - PE_TILE * sum(i+1 for i in range(PE_TILE)) + PE_TILE * sum(i for i in range(PE_TILE)) ) * MAC_TILE**2 * BITS_PER_VALUE, From 307196ddaf4a344378baf4fca7705055f8047164 Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Fri, 22 May 2026 15:13:12 -0400 Subject: [PATCH 05/12] [network] Tested distributed buffers --- accelforge/frontend/arch/_flattened_arch.py | 33 ++++++++++ .../_looptree/reuse/symbolic/_symbolic.py | 50 +++++++------- tests/input_files/networked/flat.yaml | 31 +++------ .../networked/one_matmul_to_flat.yaml | 42 ++++++++++++ tests/test_network.py | 66 ++++++++++++++++++- 5 files changed, 177 insertions(+), 45 deletions(-) create mode 100644 tests/input_files/networked/one_matmul_to_flat.yaml diff --git a/accelforge/frontend/arch/_flattened_arch.py b/accelforge/frontend/arch/_flattened_arch.py index a5062b65..2a249a94 100644 --- a/accelforge/frontend/arch/_flattened_arch.py +++ b/accelforge/frontend/arch/_flattened_arch.py @@ -1,3 +1,12 @@ +from typing import TypeVar + + +_FIND_SENTINEL = object() + +D = TypeVar("D") +T = TypeVar("T") + + class FlattenedArch: """ A flattened arch is an architecture spec that has been @@ -51,3 +60,27 @@ def is_above(self, name_a: str, name_b: str): idx_a = self.index(name_a) idx_b = self.index(name_b) return idx_a < idx_b + + def find_first_of_type_between( + self, node_type: T, name_lower: str, name_upper: str, default: D = _FIND_SENTINEL + ) -> T | D: + """ + Returns the first node with type `node_type` above `name_lower` and under `name_upper`. 
+ + If `name` does not exist, raises an error. + + If no node of `node_type` is found, either `default` is + returned (if provided) or raises an error. + """ + upper_idx = self.index(name_upper) + lower_idx = self.index(name_lower) + + for i, node in enumerate(self.nodes): + if not isinstance(node, node_type) or i <= upper_idx or i >= lower_idx: + continue + else: + return node + if default is not _FIND_SENTINEL: + return default + else: + raise ValueError(f"node with type {node_type} between {name_upper} and {name_lower} not found") diff --git a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py index ae075b52..91a6bd32 100755 --- a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py +++ b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py @@ -903,19 +903,21 @@ def analyze_reservation(node_idx, current_shape, info: AnalysisInfo): child_result.buffet_stats[buffet] = stats # Reservation nodes are the first to produce stats for a network - network_node = info.job.spec_one_einsum.arch.find_first_of_type_above( - NetworkSpec, buffet.level, default=None - ) - if network_node is not None: - network = Network( - tensor, - einsum_name, - info.data_movement_connections.get_src(buffet), - buffet, - component=network_node.name if network_node else network_node, + src = info.data_movement_connections.get_src(buffet) + if src is not None: + network_node = info.job.flattened_arch.find_first_of_type_between( + NetworkSpec, buffet.level, src.level, default=None ) - assert network not in child_result.network_stats - child_result.network_stats[network] = NetworkStats() + if network_node is not None: + network = Network( + tensor, + einsum_name, + src, + buffet, + component=network_node.name if network_node else network_node, + ) + assert network not in child_result.network_stats + child_result.network_stats[network] = NetworkStats() fanout_key = (node.resource, einsum_name) if fanout_key not in child_result.fanout: 
@@ -965,18 +967,20 @@ def analyze_compute( stats.max_occupancy = 1 result_accumulator.buffet_stats[buffet] = stats - network_node = info.job.spec_one_einsum.arch.find_first_of_type_above( - NetworkSpec, node.component, default=None - ) - if network_node is not None: - network = Network( - tensor, - info.job.einsum_name, - info.data_movement_connections.get_src(buffet), - buffet, - component=network_node.name if network_node else network_node, + src = info.data_movement_connections.get_src(buffet) + if src is not None: + network_node = info.job.flattened_arch.find_first_of_type_between( + NetworkSpec, node.component, src.level, default=None ) - result_accumulator.network_stats[network] = NetworkStats() + if network_node is not None: + network = Network( + tensor, + info.job.einsum_name, + src, + buffet, + component=network_node.name if network_node else network_node, + ) + result_accumulator.network_stats[network] = NetworkStats() return result_accumulator diff --git a/tests/input_files/networked/flat.yaml b/tests/input_files/networked/flat.yaml index ff2a05c8..b546ba21 100644 --- a/tests/input_files/networked/flat.yaml +++ b/tests/input_files/networked/flat.yaml @@ -1,5 +1,3 @@ -{% set N_ROW_BUFFER = N_ROW_BUFFER | default(4) %} -{% set N_COL_BUFFER = N_COL_BUFFER | default(4) %} arch: nodes: - !Memory @@ -12,29 +10,12 @@ arch: - {name: read, energy: 100, latency: 0} - {name: write, energy: 100, latency: 0} - - !Network - name: NoC - area: 0 - leak_power: 0 - actions: - - {name: hops, energy: 1, latency: 0} - - !Array name: Array spatial: - {name: X, fanout: 4} - {name: Y, fanout: 4} nodes: - - !Memory - name: GlobalBuffer - size: inf - area: 0 - leak_power: 0 - tensors: {keep: ~MainMemory, may_keep: All} - actions: - - {name: read, energy: 10, latency: 0} - - {name: write, energy: 10, latency: 0} - - !Memory name: RowBuffer size: inf @@ -45,7 +26,7 @@ arch: - {name: read, energy: 5, latency: 0} - {name: write, energy: 5, latency: 0} spatial: - - {name: X, fanout: 
{{N_ROW_BUFFER}}} + - {name: X, fanout: 4} - !Memory name: ColumnBuffer @@ -57,7 +38,7 @@ arch: - {name: read, energy: 5, latency: 0} - {name: write, energy: 5, latency: 0} spatial: - - {name: Y, fanout: {{N_COL_BUFFER}}} + - {name: Y, fanout: 4} - !Memory name: DistributedBuffer @@ -72,6 +53,14 @@ arch: - {name: X, fanout: 2} - {name: Y, fanout: 2} + - !Network + name: NoC + area: 0 + leak_power: 0 + actions: + - {name: hops, energy: 1, latency: 0} + + - !Memory name: Scratchpad size: inf diff --git a/tests/input_files/networked/one_matmul_to_flat.yaml b/tests/input_files/networked/one_matmul_to_flat.yaml new file mode 100644 index 00000000..cf7d2f17 --- /dev/null +++ b/tests/input_files/networked/one_matmul_to_flat.yaml @@ -0,0 +1,42 @@ +mapping: + nodes: + - !Storage + component: MainMemory + tensors: [T0, T1, W0] + - !Storage + component: DistributedBuffer + tensors: [W0] + - !Temporal + rank_variable: m + tile_shape: {{ M_TILE }} + - !Storage + component: RowBuffer + tensors: [T0] + - !Storage + component: ColumnBuffer + tensors: [T1] + - !Spatial + rank_variable: n0 + tile_shape: {{ MAC_TILE }} + component: Array + name: X + - !Spatial + rank_variable: n1 + tile_shape: {{ MAC_TILE }} + component: Array + name: Y + - !Storage + component: Scratchpad + tensors: [T0, T1, W0] + - !Temporal + rank_variable: m + tile_shape: 1 + - !Temporal + rank_variable: n0 + tile_shape: 1 + - !Temporal + rank_variable: n1 + tile_shape: 1 + - !Compute + einsum: Matmul0 + component: MAC \ No newline at end of file diff --git a/tests/test_network.py b/tests/test_network.py index e7cd640f..546d3bf4 100644 --- a/tests/test_network.py +++ b/tests/test_network.py @@ -33,7 +33,6 @@ def test_flat(self): {n.name for n in spec.arch.get_nodes_of_type(af.spec.Leaf)}, { "MainMemory", - "GlobalBuffer", "NoC", "RowBuffer", "ColumnBuffer", @@ -226,6 +225,71 @@ def test_hierarchical(self): * BITS_PER_VALUE, ) + def test_flat(self): + M = 8 + KN = 8 + MAC_TILE = 2 + M_TILE = 4 + BITS_PER_VALUE = 8 
+ + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + INPUT_FILES_DIR / "flat.yaml", + INPUT_FILES_DIR / "one_matmul_to_flat.yaml", + jinja_parse_data={ + "N_EINSUMS": 1, + "M": 8, + "KN": 8, + "MAC_TILE": MAC_TILE, + "M_TILE": M_TILE, + }, + ) + result = spec.evaluate_mapping() + self.assertEqual( + result.data['Matmul0actionNoCT0hops'].iloc[0], + ( + M / M_TILE + * + (KN / MAC_TILE) * (KN / MAC_TILE - 1) # num rows * multicast_hops + * + M_TILE * MAC_TILE # tile shape + * + BITS_PER_VALUE + ) + ) + self.assertEqual( + result.data['Matmul0actionNoCT1hops'].iloc[0], + ( + M / M_TILE + * + (KN / MAC_TILE) * (KN / MAC_TILE - 1) # num rows * multicast_hops + * + M_TILE * MAC_TILE # tile shape + * + BITS_PER_VALUE + ) + ) + self.assertEqual( + result.data['Matmul0actionNoCW0hops'].iloc[0], + ( + M / M_TILE + * + ( + 4 # a 2x2 grid of physical buffers + * + ( + sum(i for i in range(2)) * MAC_TILE # unicast along row * tile shape + + + 2 * sum(i for i in range(2)) # num cols * unicast down col + ) + ) + * + MAC_TILE * MAC_TILE # tile shape + * + BITS_PER_VALUE + ) + ) + class TestMapper(TestCase): def test_hierarchical(self): From 0453ddceb860b9362656a9a052b18493af369069 Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Fri, 22 May 2026 16:58:44 -0400 Subject: [PATCH 06/12] [model] Read/writes of distributed buffers --- accelforge/frontend/arch/_flattened_arch.py | 103 +++++++++++++++++- accelforge/frontend/arch/spatialable.py | 9 ++ .../model/_looptree/reuse/symbolic/_stats.py | 6 + .../_looptree/reuse/symbolic/_symbolic.py | 38 +++++-- tests/input_files/networked/flat.yaml | 4 +- tests/test_network.py | 42 +++++++ 6 files changed, 188 insertions(+), 14 deletions(-) diff --git a/accelforge/frontend/arch/_flattened_arch.py b/accelforge/frontend/arch/_flattened_arch.py index 2a249a94..81e31588 100644 --- a/accelforge/frontend/arch/_flattened_arch.py +++ b/accelforge/frontend/arch/_flattened_arch.py @@ -1,4 +1,4 @@ -from typing import TypeVar +from typing 
import TypeVar, Callable _FIND_SENTINEL = object() @@ -62,7 +62,12 @@ def is_above(self, name_a: str, name_b: str): return idx_a < idx_b def find_first_of_type_between( - self, node_type: T, name_lower: str, name_upper: str, default: D = _FIND_SENTINEL + self, + node_type: T, + name_lower: str, + name_upper: str, + default: D = _FIND_SENTINEL, + top_bottom: bool = True, ) -> T | D: """ Returns the first node with type `node_type` above `name_lower` and under `name_upper`. @@ -75,7 +80,10 @@ def find_first_of_type_between( upper_idx = self.index(name_upper) lower_idx = self.index(name_lower) - for i, node in enumerate(self.nodes): + iterator = self.nodes + if not top_bottom: + iterator = reversed(top_bottom) + for i, node in enumerate(iterator): if not isinstance(node, node_type) or i <= upper_idx or i >= lower_idx: continue else: @@ -84,3 +92,92 @@ def find_first_of_type_between( return default else: raise ValueError(f"node with type {node_type} between {name_upper} and {name_lower} not found") + + def find_first_of_type_above( + self, + node_type: T, + name_lower: str, + default: D = _FIND_SENTINEL, + top_bottom: bool = True, + ) -> T | D: + """ + Returns the first node with type `node_type` above `name_lower` and under `name_upper`. + + If `name` does not exist, raises an error. + + If no node of `node_type` is found, either `default` is + returned (if provided) or raises an error. 
+ """ + lower_idx = self.index(name_lower) + + iterator = self.nodes + if not top_bottom: + iterator = reversed(top_bottom) + for i, node in enumerate(iterator): + if not isinstance(node, node_type) or i >= lower_idx: + continue + else: + return node + if default is not _FIND_SENTINEL: + return default + else: + raise ValueError(f"node with type {node_type} above {name_lower} not found") + + def find_first_of_type_below( + self, + node_type: T, + name_upper: str, + default: D = _FIND_SENTINEL, + top_bottom: bool = True, + ) -> T | D: + """ + Returns the first node with type `node_type` above `name_lower` and under `name_upper`. + + If `name` does not exist, raises an error. + + If no node of `node_type` is found, either `default` is + returned (if provided) or raises an error. + """ + upper_idx = self.index(name_upper) + + iterator = self.nodes + if not top_bottom: + iterator = reversed(top_bottom) + for i, node in enumerate(iterator): + if not isinstance(node, node_type) or i <= upper_idx: + continue + else: + return node + if default is not _FIND_SENTINEL: + return default + else: + raise ValueError(f"node with type {node_type} below {name_upper} not found") + + def first_below( + self, + name: str, + filter: Callable = None, + default: D = _FIND_SENTINEL, + ) -> T | D: + """ + Returns the first node with type `node_type` above `name_lower` and under `name_upper`. + + If `name` does not exist, raises an error. + + If no node of `node_type` is found, either `default` is + returned (if provided) or raises an error. 
+ """ + idx = self.index(name) + + if filter is None: + filter = lambda x: True + + for i, node in enumerate(self.nodes): + if not filter(node) or i <= idx: + continue + else: + return node + if default is not _FIND_SENTINEL: + return default + else: + raise ValueError(f"node below {name} not found") diff --git a/accelforge/frontend/arch/spatialable.py b/accelforge/frontend/arch/spatialable.py index a5a3e286..0b767302 100644 --- a/accelforge/frontend/arch/spatialable.py +++ b/accelforge/frontend/arch/spatialable.py @@ -136,6 +136,12 @@ def get_fanout_along(self, dim_name: str, default: int = 1) -> int: return s.fanout return default + def _has_physical_dim(self, dim_name: str) -> bool: + for s in self._physical_spatial: + if s.name == dim_name: + return True + return False + def _get_physical_fanout_along(self, dim_name: str, default: int = 1) -> int: for s in self._physical_spatial: if s.name == dim_name: @@ -153,3 +159,6 @@ def _spatial_str(self, include_newline=True) -> str: return "" result = ", ".join(f"{s.fanout}× {s.name}" for s in self.spatial) return f"\n[{result}]" if include_newline else result + + def _is_distributed(self): + return any(s.fanout > 1 for s in self._physical_spatial) \ No newline at end of file diff --git a/accelforge/model/_looptree/reuse/symbolic/_stats.py b/accelforge/model/_looptree/reuse/symbolic/_stats.py index 8368937d..1c2c8ee3 100644 --- a/accelforge/model/_looptree/reuse/symbolic/_stats.py +++ b/accelforge/model/_looptree/reuse/symbolic/_stats.py @@ -100,6 +100,12 @@ def repeat_temporal(self, factor: int, is_fully_relevant: bool) -> "BuffetStats" return new def repeat_spatial(self, factor: int, reuse_parent_accesses: bool) -> "BuffetStats": + """ + Repeat buffet stats due to spatial loop `factor` number of times. + + For accesses to parent, the amount of repetition is `factor` if `reuse_parent_access` + is False; otherwise, there is no repetition. 
+ """ new = copy.copy(self) if factor == 1: return new diff --git a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py index 91a6bd32..5ed426e7 100755 --- a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py +++ b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py @@ -606,7 +606,6 @@ def handle_repeated_value(repeated_shape): accumulated_buffet_stats = result_accumulator.buffet_stats child_stats = list(child_result.buffet_stats.items()) for i, (buffet, buffet_stats) in enumerate(child_stats): - stats = buffet_stats accumulated_stats = accumulated_buffet_stats.setdefault( buffet, BuffetStats.blank() ) @@ -628,8 +627,8 @@ def handle_repeated_value(repeated_shape): and buffet.tensor in component_spatial_dim.may_reuse ) - stats.n_loops_above = stats.n_loops_above + 1 - accumulated_stats += stats.repeat_spatial( + buffet_stats.n_loops_above = buffet_stats.n_loops_above + 1 + accumulated_stats += buffet_stats.repeat_spatial( shape_repeats, reuse_parent_accesses ) @@ -692,6 +691,7 @@ def analyze_storage( count_writes: bool = True, ): mapping = info.mapping + flattened_arch = info.job.flattened_arch einsum_name = mapping[-1].einsum node: TensorHolder = mapping[node_idx] @@ -798,25 +798,45 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any: else: write_scale = 0 + # ======================= + # For distributed buffers + n_active_physical_units = 1 + if child is not None: + next_spatial = flattened_arch.first_below( + node.component, + lambda n: isinstance(n, arch.Spatialable) and len(n.spatial) > 0, + default=None, + ) + if component_object._is_distributed() and next_spatial is not None: + for (b, e), dim_fanout in child_result.fanout.items(): + if b != next_spatial.name: + continue + for d in dim_fanout: + if not component_object._has_physical_dim(d): + continue + n_active_physical_units *= ( + dim_fanout[d] / component_object._get_physical_stride_along(d) + ) + # ========================== # 
Data exchanges with parent if count_downward_movement[tensor]: # Parent -> Me stats.total_write_actions += stats.total_reads_to_parent * write_scale stats.max_per_unit_write_actions += ( - stats.total_reads_to_parent * write_scale + stats.total_reads_to_parent * write_scale / n_active_physical_units ) stats.total_skipped_first_write_actions += ( stats.total_skipped_first_reads_to_parent * write_scale ) stats.min_per_unit_skipped_first_write_actions += ( - stats.min_per_parent_skipped_first_reads_to_parent * write_scale + stats.min_per_parent_skipped_first_reads_to_parent * write_scale / n_active_physical_units ) if count_upward_movement[tensor]: # Me -> Parent # Comment this to have the final writeback to a buffer hit both that buffer and # go directly to the parent without incurring another read from the buffer. stats.total_read_actions += stats.total_writes_to_parent * read_scale - stats.max_per_unit_read_actions += stats.total_writes_to_parent * read_scale + stats.max_per_unit_read_actions += stats.total_writes_to_parent * read_scale / n_active_physical_units # ======================== # Data exchanges with peer @@ -829,7 +849,7 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any: if count_downward_movement[tensor]: # Me -> Child stats.total_read_actions += child.total_reads_to_parent * read_scale stats.max_per_unit_read_actions += ( - child.max_per_parent_reads_to_parent * read_scale + child.max_per_parent_reads_to_parent * read_scale / n_active_physical_units ) # Skip first read if skip_initial: @@ -837,13 +857,13 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any: child.total_skipped_first_reads_to_parent * read_scale ) stats.min_per_unit_skipped_first_read_actions += ( - child.min_per_parent_skipped_first_reads_to_parent * read_scale + child.min_per_parent_skipped_first_reads_to_parent * read_scale / n_active_physical_units ) if count_upward_movement[tensor]: # Child -> Me stats.total_write_actions += child.total_writes_to_parent * 
write_scale stats.max_per_unit_write_actions += ( - child.max_per_parent_writes_to_parent * write_scale + child.max_per_parent_writes_to_parent * write_scale / n_active_physical_units ) return child_result diff --git a/tests/input_files/networked/flat.yaml b/tests/input_files/networked/flat.yaml index b546ba21..f4862b0b 100644 --- a/tests/input_files/networked/flat.yaml +++ b/tests/input_files/networked/flat.yaml @@ -23,7 +23,7 @@ arch: leak_power: 0 tensors: {keep: input, may_keep: input} actions: - - {name: read, energy: 5, latency: 0} + - {name: read, energy: 5, latency: 1} - {name: write, energy: 5, latency: 0} spatial: - {name: X, fanout: 4} @@ -47,7 +47,7 @@ arch: leak_power: 0 tensors: {keep: weight, may_keep: weight} actions: - - {name: read, energy: 5, latency: 0} + - {name: read, energy: 5, latency: 1} - {name: write, energy: 5, latency: 0} spatial: - {name: X, fanout: 2} diff --git a/tests/test_network.py b/tests/test_network.py index 546d3bf4..9c87977b 100644 --- a/tests/test_network.py +++ b/tests/test_network.py @@ -289,6 +289,48 @@ def test_flat(self): BITS_PER_VALUE ) ) + self.assertEqual( + result.data['Matmul0actionRowBufferT0read'].iloc[0], + ( + M / M_TILE + * + KN // MAC_TILE + * + M_TILE * MAC_TILE + * + BITS_PER_VALUE + ) + ) + self.assertEqual( + result.data['Matmul0latencyRowBuffer'].iloc[0], + ( + M / M_TILE + * + KN // MAC_TILE + * + M_TILE * MAC_TILE + * + BITS_PER_VALUE + / + 4 # num of physical RowBuffer + ) + ) + self.assertEqual( + result.data['Matmul0latencyDistributedBuffer'].iloc[0], + ( + M / M_TILE + * + KN // MAC_TILE + * + KN // MAC_TILE + * + MAC_TILE * MAC_TILE # tile shape + * + BITS_PER_VALUE + / + 4 # num of physical DistributedBuffer + ) + ) class TestMapper(TestCase): From 8313202c93af396a3685d6ad4e5e1e4e5c09801a Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Tue, 26 May 2026 17:38:17 -0400 Subject: [PATCH 07/12] Model latency and distributed occupancy --- accelforge/model/_looptree/latency/memory.py | 11 
+++++++++++ .../model/_looptree/reuse/symbolic/_symbolic.py | 4 ++++ 2 files changed, 15 insertions(+) diff --git a/accelforge/model/_looptree/latency/memory.py b/accelforge/model/_looptree/latency/memory.py index eb6f6426..a52989e7 100755 --- a/accelforge/model/_looptree/latency/memory.py +++ b/accelforge/model/_looptree/latency/memory.py @@ -79,6 +79,17 @@ def component_latency( f"Component {component} is not a TensorHolder or Compute" ) + for network, network_stats in looptree_results.network_stats.items(): + component = network.component + actions = component_to_actions[component] + if component not in name2component: + raise ValueError(f"Component {component} found in mapping but not arch") + + for action in name2component[component].actions: + actions[f"{action.name}_actions"] += 0 + + actions["hops_actions"] += network_stats.max_hops + longest_compute_latency = Max( 0, *[s.max_latency for s in looptree_results.compute_stats.values()] ) diff --git a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py index 5ed426e7..cb64012c 100755 --- a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py +++ b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py @@ -818,6 +818,10 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any: dim_fanout[d] / component_object._get_physical_stride_along(d) ) + # ========================== + # Recalculate usage of distributed buffers + stats.max_occupancy /= n_active_physical_units + # ========================== # Data exchanges with parent if count_downward_movement[tensor]: # Parent -> Me From db5eae4579f6493f84924ec3543e956da6a5b36e Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Wed, 27 May 2026 17:05:40 -0400 Subject: [PATCH 08/12] Implement (almost) proper latency model; waiting for hwcomponents latency/bandwidth update --- accelforge/frontend/arch/components.py | 14 +++ accelforge/model/_looptree/latency/memory.py | 31 ++++-- 
.../_looptree/reuse/symbolic/_network.py | 94 +++++++++++-------- .../model/_looptree/reuse/symbolic/_stats.py | 8 +- accelforge/model/run_model.py | 1 + .../networked/hierarchical_1d.yaml | 3 +- tests/test_network.py | 4 + 7 files changed, 106 insertions(+), 49 deletions(-) diff --git a/accelforge/frontend/arch/components.py b/accelforge/frontend/arch/components.py index d7b64dfd..8affd5ec 100644 --- a/accelforge/frontend/arch/components.py +++ b/accelforge/frontend/arch/components.py @@ -1174,6 +1174,20 @@ class Network(Component, Leaf): of the spatial nodes from top to bottom. """ + total_latency: str | int | float = "max(max_hops*actions['hops'].latency, max_link_traffic/actions['hops'].latency)" + """ + Models latency as either: + - *Latency-bound*, which means that the latency of the route with the most number of + hops dominate the overall communication latency. + - *Bandwidth-bound*, which means that the traffic over the most congested link + dominates the overall communication latency. + + Keywords: + - `max_hops` returns the number of hops in the longest route. + - `max_link_traffic` returns the amount of traffic (in bits) over the most congested + link. + """ + bits_per_value: EvalsTo[dict] = {} """ Sets the bits per value for tensors in this `TensorHolder`. 
Keys are evaluated as diff --git a/accelforge/model/_looptree/latency/memory.py b/accelforge/model/_looptree/latency/memory.py index a52989e7..491f97f2 100755 --- a/accelforge/model/_looptree/latency/memory.py +++ b/accelforge/model/_looptree/latency/memory.py @@ -14,7 +14,7 @@ from accelforge.model._looptree.reuse.symbolic import BuffetStats from accelforge.util._eval_expressions import MATH_FUNCS, eval_expression -from accelforge.util._sympy.broadcast_max import Max, Min +from accelforge.util._sympy.broadcast_max import Max, Min, MaxGeqZero import symengine as se @@ -47,6 +47,10 @@ def component_latency( component_to_actions: dict[str, dict[str, float]] = defaultdict( lambda: defaultdict(lambda: 0) ) + # Holds ``keywords" that do not map neatly to actions, e.g., max_hops for network + component_to_keywords: dict[str, dict[str, float]] = defaultdict( + lambda: defaultdict(lambda: 0) + ) name2component: dict[str, Component] = {node.name: node for node in flattened_arch} compute_obj = flattened_arch[-1] @@ -79,16 +83,28 @@ def component_latency( f"Component {component} is not a TensorHolder or Compute" ) + network_to_max_link_traffic = defaultdict(lambda: defaultdict(lambda: 0)) + network_to_max_hops = defaultdict(lambda: []) for network, network_stats in looptree_results.network_stats.items(): component = network.component - actions = component_to_actions[component] if component not in name2component: raise ValueError(f"Component {component} found in mapping but not arch") - for action in name2component[component].actions: - actions[f"{action.name}_actions"] += 0 + dim_traffic = network_to_max_link_traffic[component] + for dim, max_traffic_in_dim in network_stats.max_traffic.items(): + dim_traffic[dim] += max_traffic_in_dim - actions["hops_actions"] += network_stats.max_hops + network_to_max_hops[component].append(network_stats.max_hops) + + for network, network_stats in looptree_results.network_stats.items(): + component = network.component + keywords = 
component_to_keywords[component] + keywords["max_link_traffic"] = MaxGeqZero( + *network_to_max_link_traffic[component].values() + ) + keywords["max_hops"] = MaxGeqZero( + *network_to_max_hops[component] + ) longest_compute_latency = Max( 0, *[s.max_latency for s in looptree_results.compute_stats.values()] @@ -126,14 +142,15 @@ def component_latency( "sum": se.Add, } - for component, actions in component_to_actions.items(): - component_obj = name2component[component] + for component, component_obj in name2component.items(): + actions = component_to_actions[component] symbol_table = { "action2latency": component_to_action_latency[component], **symbol_table_base, **name2component[component].shallow_model_dump(include_None=True), **actions, **component_to_action_latency[component], + **component_to_keywords[component], } if name2component[component].total_latency is not None: component_latency[component] = eval_expression( diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py index 4c7cc9ea..2f9bfe87 100644 --- a/accelforge/model/_looptree/reuse/symbolic/_network.py +++ b/accelforge/model/_looptree/reuse/symbolic/_network.py @@ -1,9 +1,5 @@ -import copy -from accelforge.frontend import arch -from accelforge.frontend.arch import Network as NetworkSpec from accelforge.frontend.mapping import ( - TensorHolder, - TensorName + Spatial ) from accelforge.frontend._workload_isl._symbolic import ( compute_dense_tile_occupancy, @@ -20,7 +16,7 @@ class NetworkAnalyzer: def __init__(self, network_stats): - self.overall_max_hops = 0 + self.overall_max_hops: dict = {} self.network_stats = network_stats def accumulate_child_result( @@ -30,8 +26,9 @@ def accumulate_child_result( shape_repeats, einsum_name, child_shape, - node, + node: Spatial, ): + """This function is called for every repeated shape.""" flattened_arch = info.job.flattened_arch for network, child_network_stats in child_result.network_stats.items(): 
@@ -40,44 +37,37 @@ def accumulate_child_result( self.network_stats[network] = NetworkStats() accumulated_network_stats = self.network_stats[network] - accumulated_network_stats.total_hops += ( - child_network_stats.total_hops * shape_repeats - ) - accumulated_network_stats.max_hops = MaxGeqZero( - accumulated_network_stats.max_hops, - child_network_stats.max_hops, - ) - projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)] - component_object = flattened_arch[network.component] - workload_bpv = info.job.einsum.tensor_accesses[ - network.tensor - ].bits_per_value - bits_per_value = component_object.bits_per_value.get( - network.tensor, workload_bpv - ) - bits_per_action = component_object.bits_per_action - if bits_per_action is not None: - actions_per_value = bits_per_value / bits_per_action - else: - actions_per_value = bits_per_value - volume = ( - compute_dense_tile_occupancy(projection, child_shape) - * actions_per_value - ) - + # We only need to update the summary if the spatial loop is for + # a component higher than the network of interest if flattened_arch.is_above(node.component, network.component): + accumulated_network_stats.total_hops += ( + child_network_stats.total_hops * shape_repeats + ) + accumulated_network_stats.max_hops = MaxGeqZero( + accumulated_network_stats.max_hops, + child_network_stats.max_hops, + ) + for k, v in child_network_stats.max_traffic.items(): + accumulated_network_stats.max_traffic[k] = MaxGeqZero( + accumulated_network_stats.max_traffic.get(k, 0), + v + ) continue + volume = self._get_data_volume(network, einsum_name, info, child_shape) + relevancy = info.tensor_to_relevancy[network.tensor][node.rank_variable] # The fanout in this dimension in mapping nodes below, i.e., the stride last_fanout = child_result.fanout.get((node.component, einsum_name), {}) last_fanout = last_fanout.get(node.name, 1) if isinstance(relevancy, Irrelevant): + # The volume travels through link by link in one axis of the mesh # 
Distributed or not, the amount of total cost is the same. # However, the accesses now come from different physical memories total_cost = multicast_cost(shape_repeats, last_fanout)*volume max_hops = shape_repeats*last_fanout + max_traffic = volume elif isinstance(relevancy, Relevant): # If distributed, then we bind data as locally as possible in the # physical buffers @@ -99,26 +89,56 @@ def accumulate_child_result( * volume ) - max_hops = MinGeqZero(shape_repeats*last_fanout, physical_stride) + max_hops = MinGeqZero((n_dsts_per_physical-1)*last_fanout, physical_stride) + max_traffic = (n_dsts_per_physical-1)*volume else: total_cost = unicast_cost(shape_repeats, last_fanout)*volume max_hops = shape_repeats * last_fanout + max_traffic = (shape_repeats-1)*volume elif isinstance(relevancy, PartiallyRelevant): raise NotImplementedError() else: raise RuntimeError(f"unhandled relevancy type {relevancy}") - # TODO: this is sketchy - self.overall_max_hops += max_hops + # Each subsequent call to this function (i.e., over different iterations of a spatial loop) + # adds more to the max hops + self.overall_max_hops[network] = self.overall_max_hops.get(network, 0) + max_hops - accumulated_network_stats.total_hops += total_cost + accumulated_network_stats.total_hops += ( + total_cost + child_network_stats.total_hops*shape_repeats + ) accumulated_network_stats.max_hops = MaxGeqZero( accumulated_network_stats.max_hops, - self.overall_max_hops + child_network_stats.max_hops, + self.overall_max_hops[network] + child_network_stats.max_hops, + ) + accumulated_network_stats.max_traffic[node.name] = MaxGeqZero( + accumulated_network_stats.max_traffic.get(node.name, 0), + max_traffic + child_network_stats.max_traffic.get(node.name, 0) ) return self.overall_max_hops + def _get_data_volume(self, network, einsum_name, info, child_shape): + flattened_arch = info.job.flattened_arch + projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)] + component_object = 
flattened_arch[network.component] + workload_bpv = info.job.einsum.tensor_accesses[ + network.tensor + ].bits_per_value + bits_per_value = component_object.bits_per_value.get( + network.tensor, workload_bpv + ) + bits_per_action = component_object.bits_per_action + if bits_per_action is not None: + actions_per_value = bits_per_value / bits_per_action + else: + actions_per_value = bits_per_value + volume = ( + compute_dense_tile_occupancy(projection, child_shape) + * actions_per_value + ) + return volume + def multicast_cost(n_dsts, stride): """Returns total hops of multicast along a dimension.""" diff --git a/accelforge/model/_looptree/reuse/symbolic/_stats.py b/accelforge/model/_looptree/reuse/symbolic/_stats.py index 1c2c8ee3..aa2c1d90 100644 --- a/accelforge/model/_looptree/reuse/symbolic/_stats.py +++ b/accelforge/model/_looptree/reuse/symbolic/_stats.py @@ -21,7 +21,11 @@ @dataclass class NetworkStats: total_hops: Any = field(default=0) + """Total number of hops overall. Useful to calculate energy.""" max_hops: Any = field(default=0) + """Longest hops among all routes.""" + max_traffic: dict[int | str, Any] = field(default_factory=dict) + """Maximum traffic occuring on any single link along a dimension.""" def repeat(self, n_repeats): new = copy.copy(self) @@ -32,10 +36,6 @@ def repeat(self, n_repeats): new.total_hops = new.total_hops * n_repeats return new - def combine(self, other: "NetworkStats"): - self.total_hops += other.total_hops - self.max_hops = max(self.max_hops, other.max_hops) - @dataclass class BuffetStats: diff --git a/accelforge/model/run_model.py b/accelforge/model/run_model.py index eee4f60f..b68ea7c5 100644 --- a/accelforge/model/run_model.py +++ b/accelforge/model/run_model.py @@ -43,6 +43,7 @@ def run_model( ) latency = component_latency(reuse, job.flattened_arch, pmapping, spec) + print(latency) try: overall_latency = MaxGeqZero(*latency.values()) except Exception as e: diff --git a/tests/input_files/networked/hierarchical_1d.yaml 
b/tests/input_files/networked/hierarchical_1d.yaml index 49853317..1c5c60d1 100644 --- a/tests/input_files/networked/hierarchical_1d.yaml +++ b/tests/input_files/networked/hierarchical_1d.yaml @@ -24,8 +24,9 @@ arch: name: PeArray area: 0 leak_power: 0 + total_latency: "max_hops" actions: - - {name: hops, energy: 1, latency: 0} + - {name: hops, energy: 1, latency: 1} - !Memory name: Scratchpad diff --git a/tests/test_network.py b/tests/test_network.py index 109e9f68..d684f48a 100644 --- a/tests/test_network.py +++ b/tests/test_network.py @@ -125,6 +125,10 @@ def test_hierarchical_1d(self): * KN * BITS_PER_VALUE, ) + self.assertEqual( + result.data["Totallatency"].iloc[0], + 4 + ) def test_hierarchical(self): M = 8 From 7ad792a1669b389f4236f2cc94093868e656d920 Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Fri, 5 Jun 2026 10:26:07 -0400 Subject: [PATCH 09/12] [network] Update to latest spec --- accelforge/frontend/arch/components.py | 16 ++-------- accelforge/model/_looptree/latency/memory.py | 9 ++++-- tests/input_files/networked/flat.yaml | 6 ++-- tests/input_files/networked/hierarchical.yaml | 4 +-- .../networked/hierarchical_1d.yaml | 4 +-- tests/test_network.py | 29 +++++++++---------- 6 files changed, 30 insertions(+), 38 deletions(-) diff --git a/accelforge/frontend/arch/components.py b/accelforge/frontend/arch/components.py index e7f6de98..8bdf17ae 100644 --- a/accelforge/frontend/arch/components.py +++ b/accelforge/frontend/arch/components.py @@ -145,7 +145,7 @@ def _set_n_calls(self, value: int | float) -> None: @classmethod def _deprecate_latency_fields(cls, data): if isinstance(data, dict): - if "latency" in data: + if "latency" in data and not "throughput" in data: l = data.pop("latency") warnings.warn( f"Setting `latency` on `{cls.__name__}` is deprecated; use " @@ -155,16 +155,11 @@ def _deprecate_latency_fields(cls, data): DeprecationWarning, stacklevel=2, ) - if "throughput" in data: - raise ValueError( - f"Cannot specify both `latency` and 
`throughput` on " - f"`{cls.__name__}`. Drop the deprecated `latency` field." - ) l = str(l).strip() data["throughput"] = ( f"1 / ({l}) if ({l}) != 0 else float('inf')" ) - if "latency_scale" in data: + if "latency_scale" in data and not "throughput_scale" in data: ls = data.pop("latency_scale") warnings.warn( f"Setting `latency_scale` on `{cls.__name__}` is deprecated; use " @@ -174,11 +169,6 @@ def _deprecate_latency_fields(cls, data): DeprecationWarning, stacklevel=2, ) - if "throughput_scale" in data: - raise ValueError( - f"Cannot specify both `latency_scale` and `throughput_scale` " - f"on `{cls.__name__}`. Drop the deprecated `latency_scale`." - ) ls = str(ls).strip() data["throughput_scale"] = ( f"1 / ({ls}) if ({ls}) != 0 else float('inf')" @@ -1316,7 +1306,7 @@ class Network(Component, Leaf): of the spatial nodes from top to bottom. """ - total_latency: str | int | float = "max(max_hops*actions['hops'].latency, max_link_traffic/actions['hops'].latency)" + total_latency: str | int | float = "max(max_hops*actions['hops'].latency, max_link_traffic/actions['hops'].throughput)" """ Models latency as either: - *Latency-bound*, which means that the latency of the route with the most number of diff --git a/accelforge/model/_looptree/latency/memory.py b/accelforge/model/_looptree/latency/memory.py index 4571a355..ebcaa1de 100755 --- a/accelforge/model/_looptree/latency/memory.py +++ b/accelforge/model/_looptree/latency/memory.py @@ -165,13 +165,18 @@ def component_latency( "sum": _sum, } - for component in component_to_actions: + for component in name2component: + if component not in component_to_actions and component not in component_to_keywords: + continue component_obj = name2component[component] dump = component_obj.shallow_model_dump(include_None=True) # Replace serialized `actions` dump with local Action copies that carry # the correct n_calls for this job, so formulas can access `a.n_calls`, # `a.throughput`, etc. without mutating the shared spec state. 
- dump["actions"] = component_to_actions[component] + if component in component_to_actions: + dump["actions"] = component_to_actions[component] + if component in component_to_keywords: + dump |= component_to_keywords[component] symbol_table = {**symbol_table_base, **dump} if component_obj.total_latency is not None: component_latency[component] = eval_expression( diff --git a/tests/input_files/networked/flat.yaml b/tests/input_files/networked/flat.yaml index 2b511e59..28679d21 100644 --- a/tests/input_files/networked/flat.yaml +++ b/tests/input_files/networked/flat.yaml @@ -57,8 +57,8 @@ arch: leak_power: 0 tensors: {keep: weight, may_keep: weight} actions: - - {name: read, energy: 5, throughput: inf} - - {name: write, energy: 5, throughput: inf} + - {name: read, energy: 5, throughput: 1} + - {name: write, energy: 5, throughput: 1} spatial: - {name: X, fanout: 2} - {name: Y, fanout: 2} @@ -68,7 +68,7 @@ arch: area: 0 leak_power: 0 actions: - - {name: hops, energy: 1, throughput: inf} + - {name: hops, energy: 1, latency: 0, throughput: inf} - !Memory name: Scratchpad diff --git a/tests/input_files/networked/hierarchical.yaml b/tests/input_files/networked/hierarchical.yaml index 5e8634ad..f268ef7e 100644 --- a/tests/input_files/networked/hierarchical.yaml +++ b/tests/input_files/networked/hierarchical.yaml @@ -25,7 +25,7 @@ arch: area: 0 leak_power: 0 actions: - - {name: hops, energy: 1, throughput: 4e9} + - {name: hops, energy: 1, latency: 0, throughput: 4e9} - !Memory name: Scratchpad @@ -45,7 +45,7 @@ arch: area: 0 leak_power: 0 actions: - - {name: hops, energy: 1, throughput: 16e9} + - {name: hops, energy: 1, latency: 0, throughput: 16e9} - !Compute name: MAC diff --git a/tests/input_files/networked/hierarchical_1d.yaml b/tests/input_files/networked/hierarchical_1d.yaml index 4171e115..167212ff 100644 --- a/tests/input_files/networked/hierarchical_1d.yaml +++ b/tests/input_files/networked/hierarchical_1d.yaml @@ -26,7 +26,7 @@ arch: leak_power: 0 total_latency: 
"max_hops" actions: - - {name: hops, energy: 1, throughput: 1} + - {name: hops, energy: 1, latency: 0, throughput: 1} - !Memory name: Scratchpad @@ -45,7 +45,7 @@ arch: area: 0 leak_power: 0 actions: - - {name: hops, energy: 1, throughput: inf} + - {name: hops, energy: 1, latency: 1, throughput: inf} - !Compute name: MAC diff --git a/tests/test_network.py b/tests/test_network.py index 06b731d5..8a11802d 100644 --- a/tests/test_network.py +++ b/tests/test_network.py @@ -27,21 +27,6 @@ def test_flat(self): spec = af.Spec.from_yaml( INPUT_FILES_DIR / "flat.yaml", ) - print(spec.arch.nodes["NoC"]) - self.assertIn("NoC", spec.arch.nodes) - self.assertEqual(spec.arch.nodes["NoC"].get_fanout(), 1) - self.assertEqual( - {n.name for n in spec.arch.get_nodes_of_type(af.spec.Leaf)}, - { - "MainMemory", - "NoC", - "RowBuffer", - "ColumnBuffer", - "DistributedBuffer", - "Scratchpad", - "MAC", - }, - ) try: spec = spec.calculate_component_costs() @@ -320,7 +305,7 @@ def test_flat(self): ) self.assertEqual( result.data['Matmul0latencyDistributedBuffer'].iloc[0], - ( + ( # Reads from child M / M_TILE * KN // MAC_TILE @@ -333,6 +318,18 @@ def test_flat(self): / 4 # num of physical DistributedBuffer ) + + + ( # Writes from parent + KN // MAC_TILE + * + KN // MAC_TILE + * + MAC_TILE * MAC_TILE # tile shape + * + BITS_PER_VALUE + / + 4 # num of physical DistributedBuffer + ) ) From 488f4b1523dc9239b3cffb43e98048fe63d37efd Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Fri, 5 Jun 2026 15:14:24 -0400 Subject: [PATCH 10/12] [network] Refactor network cost to handle different topologies --- accelforge/frontend/arch/components.py | 2 +- accelforge/model/_looptree/latency/memory.py | 1 + .../_looptree/reuse/symbolic/_network.py | 261 ++++++++++++++---- .../_looptree/reuse/symbolic/_symbolic.py | 6 +- 4 files changed, 207 insertions(+), 63 deletions(-) diff --git a/accelforge/frontend/arch/components.py b/accelforge/frontend/arch/components.py index 8bdf17ae..dc47cbda 100644 --- 
a/accelforge/frontend/arch/components.py +++ b/accelforge/frontend/arch/components.py @@ -1294,7 +1294,7 @@ def _render_node_color(self) -> str: return "#E0EEFF" -class TopologySpec(str, enum.Enum): +class TopologySpec(enum.StrEnum): MESH = "mesh" diff --git a/accelforge/model/_looptree/latency/memory.py b/accelforge/model/_looptree/latency/memory.py index ebcaa1de..080e3c04 100755 --- a/accelforge/model/_looptree/latency/memory.py +++ b/accelforge/model/_looptree/latency/memory.py @@ -109,6 +109,7 @@ def component_latency( network_to_max_link_traffic = defaultdict(lambda: defaultdict(lambda: 0)) network_to_max_hops = defaultdict(lambda: []) + # Aggregates across tensors for network, network_stats in looptree_results.network_stats.items(): component = network.component if component not in name2component: diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py index 2f9bfe87..b493de93 100644 --- a/accelforge/model/_looptree/reuse/symbolic/_network.py +++ b/accelforge/model/_looptree/reuse/symbolic/_network.py @@ -1,6 +1,11 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any + from accelforge.frontend.mapping import ( Spatial ) +from accelforge.frontend.arch.components import TopologySpec from accelforge.frontend._workload_isl._symbolic import ( compute_dense_tile_occupancy, Irrelevant, @@ -14,22 +19,178 @@ from ._stats import NetworkStats, SymbolicAnalysisOutput -class NetworkAnalyzer: - def __init__(self, network_stats): +@dataclass +class PerLoopTransferCost: + """The per-spatial-loop cost contributed by a single network, as computed + by a :class:`TopologyModel`.""" + + total_cost: Any + """Total hops contributed by data movement over this spatial loop.""" + max_hops: Any + """Hops added to the longest route by this spatial loop.""" + max_traffic: Any + """Maximum traffic (in actions) on any single link along this dimension.""" + + +class 
TopologyModel(ABC): + """Computes the cost of moving data across a network of a given topology. + + Subclasses encapsulate everything topology-specific about how a tensor's + data is delivered across a spatial fanout. :class:`NetworkAnalyzer` selects + the model for each network from its component's + :class:`~accelforge.frontend.arch.components.TopologySpec` and remains + agnostic to the topology itself. + + Instances are stateful: they accumulate per-network max hops across the + repeated spatial-loop iterations of a single :class:`NetworkAnalyzer`, so a + fresh model is constructed for each analyzer (see :func:`get_topology_model`). + """ + + def __init__(self): + # Running total of max hops per network, accumulated across the + # repeated spatial-loop iterations handled by one NetworkAnalyzer. self.overall_max_hops: dict = {} + + def accumulate_max_hops(self, network, max_hops): + """Add this loop's ``max_hops`` to ``network``'s running total and + return the updated total. + + Each call to :meth:`NetworkAnalyzer.accumulate_child_result` (i.e., over + a different iteration of a spatial loop) adds more to the max hops. + """ + self.overall_max_hops[network] = ( + self.overall_max_hops.get(network, 0) + max_hops + ) + return self.overall_max_hops[network] + + @abstractmethod + def per_loop_transfer_cost( + self, + relevancy, + *, + shape_repeats, + last_fanout, + volume, + src_component, + dim_name: str, + ) -> PerLoopTransferCost: + """Return the :class:`PerLoopTransferCost` for moving ``volume`` of data across one + spatial loop. + + Args: + relevancy: The relevancy of the spatial loop's rank variable to the + tensor (``Irrelevant``, ``Relevant``, or ``PartiallyRelevant``). + shape_repeats: The number of iterations of this spatial loop. + last_fanout: The fanout in this dimension among mapping nodes below + (i.e., the stride). + volume: The data volume (in actions) moved per destination. 
+ src_component: The flattened-arch component sourcing the data, used + to query physical fanout/stride. + dim_name: The name of the spatial dimension (e.g., ``X`` or ``Y``). + """ + raise NotImplementedError + + +class MeshTopologyModel(TopologyModel): + """Cost model for a mesh network. + + Data travels link-by-link along one axis of the mesh. Multicast delivers a + value to every point along the dimension; unicast delivers a distinct value + to each point. When the source is physically distributed, data is bound as + locally as possible across the physical buffers. + """ + + def per_loop_transfer_cost( + self, + relevancy, + *, + shape_repeats, + last_fanout, + volume, + src_component, + dim_name, + ) -> PerLoopTransferCost: + if isinstance(relevancy, Irrelevant): + # The volume travels through link by link in one axis of the mesh + # Distributed or not, the amount of total cost is the same. + # However, the accesses now come from different physical memories + total_cost = multicast_cost(shape_repeats, last_fanout) * volume + max_hops = shape_repeats * last_fanout + max_traffic = volume + elif isinstance(relevancy, Relevant): + # If distributed, then we bind data as locally as possible in the + # physical buffers + if src_component._get_physical_fanout_along(dim_name) > 1: + physical_stride = src_component._get_physical_stride_along(dim_name) + + n_dsts_per_physical = MinGeqZero( + # if last_fanout > physical_stride, set n_dst to 1, which results in 0 hops + # later (which is correct because the set of destinations always overlap + # the set of sources). 
+ MaxGeqZero(physical_stride / last_fanout, 1), + shape_repeats + ) + n_activated_physical = MaxGeqZero(shape_repeats * last_fanout / physical_stride, 1) + total_cost = ( + n_activated_physical + * + unicast_cost(n_dsts_per_physical, last_fanout) + * + volume + ) + max_hops = MinGeqZero((n_dsts_per_physical - 1) * last_fanout, physical_stride) + max_traffic = (n_dsts_per_physical - 1) * volume + else: + total_cost = unicast_cost(shape_repeats, last_fanout) * volume + max_hops = shape_repeats * last_fanout + max_traffic = (shape_repeats - 1) * volume + elif isinstance(relevancy, PartiallyRelevant): + raise NotImplementedError() + else: + raise RuntimeError(f"unhandled relevancy type {relevancy}") + + return PerLoopTransferCost(total_cost=total_cost, max_hops=max_hops, max_traffic=max_traffic) + + +# Registry mapping each topology to the model class that costs its data +# movement. Classes (not instances) are stored because models are stateful and +# each NetworkAnalyzer needs its own. +TOPOLOGY_MODELS: dict[TopologySpec, type[TopologyModel]] = { + TopologySpec.MESH: MeshTopologyModel, +} + + +def get_topology_model(topology) -> TopologyModel: + """Construct a fresh :class:`TopologyModel` for the given topology.""" + return TOPOLOGY_MODELS[topology]() + + +class NetworkAnalyzer: + def __init__(self, network_stats, info: AnalysisInfo, einsum_name, node: Spatial): self.network_stats = network_stats + # These don't change across calls to accumulate_child_result. + self.info = info + self.einsum_name = einsum_name + self.node = node + # Each network gets its own topology model, since different networks may + # have different topologies. Models are constructed lazily, the first + # time a network needs costing, and reused for the analyzer's lifetime so + # their accumulated max hops persist. 
+ self.topology_models: dict = {} + + def _get_topology_model(self, network, topology) -> TopologyModel: + if network not in self.topology_models: + self.topology_models[network] = get_topology_model(topology) + return self.topology_models[network] def accumulate_child_result( self, child_result: SymbolicAnalysisOutput, - info: AnalysisInfo, shape_repeats, - einsum_name, child_shape, - node: Spatial, ): """This function is called for every repeated shape.""" - flattened_arch = info.job.flattened_arch + flattened_arch = self.info.job.flattened_arch for network, child_network_stats in child_result.network_stats.items(): src_component = flattened_arch[network.source.level] @@ -39,7 +200,7 @@ def accumulate_child_result( # We only need to update the summary if the spatial loop is for # a component higher than the network of interest - if flattened_arch.is_above(node.component, network.component): + if flattened_arch.is_above(self.node.component, network.component): accumulated_network_stats.total_hops += ( child_network_stats.total_hops * shape_repeats ) @@ -54,71 +215,51 @@ def accumulate_child_result( ) continue - volume = self._get_data_volume(network, einsum_name, info, child_shape) + volume = self._get_data_volume(network, child_shape) - relevancy = info.tensor_to_relevancy[network.tensor][node.rank_variable] + relevancy = self.info.tensor_to_relevancy[network.tensor][self.node.rank_variable] # The fanout in this dimension in mapping nodes below, i.e., the stride - last_fanout = child_result.fanout.get((node.component, einsum_name), {}) - last_fanout = last_fanout.get(node.name, 1) - if isinstance(relevancy, Irrelevant): - # The volume travels through link by link in one axis of the mesh - # Distributed or not, the amount of total cost is the same. 
- # However, the accesses now come from different physical memories - total_cost = multicast_cost(shape_repeats, last_fanout)*volume - max_hops = shape_repeats*last_fanout - max_traffic = volume - elif isinstance(relevancy, Relevant): - # If distributed, then we bind data as locally as possible in the - # physical buffers - if src_component._get_physical_fanout_along(node.name) > 1: - physical_stride = src_component._get_physical_stride_along(node.name) - - n_dsts_per_physical = MinGeqZero( - # if last_fanout > physical_stride, set n_dst to 1, which results in 0 hops - # later (which is correct because the set of destinations always overlap - # the set of sources). - MaxGeqZero(physical_stride / last_fanout, 1), - shape_repeats - ) - n_activated_physical = MaxGeqZero(shape_repeats*last_fanout/physical_stride, 1) - total_cost = ( - n_activated_physical - * - unicast_cost(n_dsts_per_physical, last_fanout) - * - volume - ) - max_hops = MinGeqZero((n_dsts_per_physical-1)*last_fanout, physical_stride) - max_traffic = (n_dsts_per_physical-1)*volume - else: - total_cost = unicast_cost(shape_repeats, last_fanout)*volume - max_hops = shape_repeats * last_fanout - max_traffic = (shape_repeats-1)*volume - elif isinstance(relevancy, PartiallyRelevant): - raise NotImplementedError() - else: - raise RuntimeError(f"unhandled relevancy type {relevancy}") + last_fanout = child_result.fanout.get((self.node.component, self.einsum_name), {}) + last_fanout = last_fanout.get(self.node.name, 1) - # Each subsequent call to this function (i.e., over different iterations of a spatial loop) - # adds more to the max hops - self.overall_max_hops[network] = self.overall_max_hops.get(network, 0) + max_hops + topology_model = self._get_topology_model( + network, flattened_arch[network.component].topology + ) + per_loop_transfer_cost = topology_model.per_loop_transfer_cost( + relevancy, + shape_repeats=shape_repeats, + last_fanout=last_fanout, + volume=volume, + src_component=src_component, + 
dim_name=self.node.name, + ) + + overall_max_hops = topology_model.accumulate_max_hops( + network, per_loop_transfer_cost.max_hops + ) accumulated_network_stats.total_hops += ( - total_cost + child_network_stats.total_hops*shape_repeats + per_loop_transfer_cost.total_cost + + child_network_stats.total_hops * shape_repeats ) accumulated_network_stats.max_hops = MaxGeqZero( accumulated_network_stats.max_hops, - self.overall_max_hops[network] + child_network_stats.max_hops, + overall_max_hops + child_network_stats.max_hops, ) - accumulated_network_stats.max_traffic[node.name] = MaxGeqZero( - accumulated_network_stats.max_traffic.get(node.name, 0), - max_traffic + child_network_stats.max_traffic.get(node.name, 0) + accumulated_network_stats.max_traffic[self.node.name] = MaxGeqZero( + accumulated_network_stats.max_traffic.get(self.node.name, 0), + per_loop_transfer_cost.max_traffic + child_network_stats.max_traffic.get(self.node.name, 0) ) - return self.overall_max_hops + overall_max_hops = {} + for model in self.topology_models.values(): + overall_max_hops.update(model.overall_max_hops) + return overall_max_hops - def _get_data_volume(self, network, einsum_name, info, child_shape): + def _get_data_volume(self, network, child_shape): + info = self.info + einsum_name = self.einsum_name flattened_arch = info.job.flattened_arch projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)] component_object = flattened_arch[network.component] @@ -153,4 +294,4 @@ def unicast_cost(n_dsts, stride): def arithmetic_sum(n): - return 0.5 * (n+1) * n \ No newline at end of file + return 0.5 * (n+1) * n diff --git a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py index cb64012c..d3c7b50b 100755 --- a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py +++ b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py @@ -592,7 +592,9 @@ def analyze_spatial(node_idx, current_shape, info: AnalysisInfo): 
result_accumulator = SymbolicAnalysisOutput() - network_analyzer = NetworkAnalyzer(result_accumulator.network_stats) + network_analyzer = NetworkAnalyzer( + result_accumulator.network_stats, info, einsum_name, node + ) def handle_repeated_value(repeated_shape): shape_value = repeated_shape.value @@ -633,7 +635,7 @@ def handle_repeated_value(repeated_shape): ) network_analyzer.accumulate_child_result( - child_result, info, shape_repeats, einsum_name, child_shape, node + child_result, shape_repeats, child_shape ) for einsum, child_steps in child_result.temporal_steps.items(): From b9c7d4abe41e9015775344d398eca3ce0912d65e Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Fri, 5 Jun 2026 17:04:57 -0400 Subject: [PATCH 11/12] [network] WIP review --- accelforge/frontend/arch/components.py | 1 + .../_looptree/reuse/symbolic/_network.py | 68 +++++++ .../input_files/networked/flat.yaml | 0 .../input_files/networked/hierarchical.yaml | 0 .../networked/hierarchical_1d.yaml | 0 .../networked/hierarchical_1d_all_to_all.yaml | 61 +++++++ .../networked/hierarchical_switched.yaml | 0 .../networked/one_matmul_to_flat.yaml | 0 .../one_matmul_to_networked_hierarchical.yaml | 0 ...e_matmul_to_networked_hierarchical_1d.yaml | 0 tests/{ => network}/test_network.py | 101 ++++++++++- tests/network/test_topology_model.py | 168 ++++++++++++++++++ 12 files changed, 398 insertions(+), 1 deletion(-) rename tests/{ => network}/input_files/networked/flat.yaml (100%) rename tests/{ => network}/input_files/networked/hierarchical.yaml (100%) rename tests/{ => network}/input_files/networked/hierarchical_1d.yaml (100%) create mode 100644 tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml rename tests/{ => network}/input_files/networked/hierarchical_switched.yaml (100%) rename tests/{ => network}/input_files/networked/one_matmul_to_flat.yaml (100%) rename tests/{ => network}/input_files/networked/one_matmul_to_networked_hierarchical.yaml (100%) rename tests/{ => 
network}/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml (100%) rename tests/{ => network}/test_network.py (75%) create mode 100644 tests/network/test_topology_model.py diff --git a/accelforge/frontend/arch/components.py b/accelforge/frontend/arch/components.py index dc47cbda..867cb03b 100644 --- a/accelforge/frontend/arch/components.py +++ b/accelforge/frontend/arch/components.py @@ -1296,6 +1296,7 @@ def _render_node_color(self) -> str: class TopologySpec(enum.StrEnum): MESH = "mesh" + ALL_TO_ALL = "all_to_all" class Network(Component, Leaf): diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py index b493de93..cdd9b8fa 100644 --- a/accelforge/model/_looptree/reuse/symbolic/_network.py +++ b/accelforge/model/_looptree/reuse/symbolic/_network.py @@ -152,11 +152,79 @@ def per_loop_transfer_cost( return PerLoopTransferCost(total_cost=total_cost, max_hops=max_hops, max_traffic=max_traffic) +class AllToAllTopologyModel(TopologyModel): + """Cost model for an all-to-all network built around a switch (e.g. NVLink / + NVSwitch). + + Every node connects to every other node through a central switch, so any + source reaches any destination in a constant number of hops regardless of + how far apart they are in the logical fanout. This differs from a mesh in + two ways: + + - **Uniform latency.** The longest route is a single switch traversal, so + ``max_hops`` is constant rather than growing with the distance + (``shape_repeats * stride``) between source and destination. + - **No store-and-forward accumulation.** Each destination is reached + directly, so the total (energy) cost is linear in the number of + destinations rather than quadratic as in a mesh unicast. + + The physical stride is irrelevant here (all nodes are equidistant from the + switch), so ``last_fanout`` and physical distribution are not consulted. 
+ """ + + HOPS_PER_TRANSFER = 1 + """Hops charged for one source-to-destination transfer across the switch. + One switch traversal is treated as a single hop; the per-hop energy and + latency come from the network component's ``hops`` action.""" + + def per_loop_transfer_cost( + self, + relevancy, + *, + shape_repeats, + last_fanout, + volume, + src_component, + dim_name, + ) -> PerLoopTransferCost: + hops = self.HOPS_PER_TRANSFER + + # n - 1 other instances each receive the data across the switch. The + # source already holds it (the set of destinations overlaps the set of + # sources), so it needs no transfer to itself. + n_dsts = shape_repeats - 1 + + if isinstance(relevancy, (Irrelevant, Relevant)): + # Same delivery count (and hence energy) whether the data is shared + # (multicast) or distinct per instance (unicast): each of the n - 1 + # destinations is one switch traversal away. + total_cost = n_dsts * hops * volume + # Every route is a single switch traversal, independent of distance. + max_hops = hops + if isinstance(relevancy, Irrelevant): + # Multicast: the switch replicates, so each link carries the + # value at most once. + max_traffic = volume + else: + # Unicast: the source's uplink to the switch carries all n - 1 + # distinct messages, making it the most congested link. + max_traffic = n_dsts * volume + elif isinstance(relevancy, PartiallyRelevant): + raise NotImplementedError() + else: + raise RuntimeError(f"unhandled relevancy type {relevancy}") + + return PerLoopTransferCost( + total_cost=total_cost, max_hops=max_hops, max_traffic=max_traffic + ) + + # Registry mapping each topology to the model class that costs its data # movement. Classes (not instances) are stored because models are stateful and # each NetworkAnalyzer needs its own. 
TOPOLOGY_MODELS: dict[TopologySpec, type[TopologyModel]] = { TopologySpec.MESH: MeshTopologyModel, + TopologySpec.ALL_TO_ALL: AllToAllTopologyModel, } diff --git a/tests/input_files/networked/flat.yaml b/tests/network/input_files/networked/flat.yaml similarity index 100% rename from tests/input_files/networked/flat.yaml rename to tests/network/input_files/networked/flat.yaml diff --git a/tests/input_files/networked/hierarchical.yaml b/tests/network/input_files/networked/hierarchical.yaml similarity index 100% rename from tests/input_files/networked/hierarchical.yaml rename to tests/network/input_files/networked/hierarchical.yaml diff --git a/tests/input_files/networked/hierarchical_1d.yaml b/tests/network/input_files/networked/hierarchical_1d.yaml similarity index 100% rename from tests/input_files/networked/hierarchical_1d.yaml rename to tests/network/input_files/networked/hierarchical_1d.yaml diff --git a/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml b/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml new file mode 100644 index 00000000..3d8b6d22 --- /dev/null +++ b/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml @@ -0,0 +1,61 @@ +arch: + nodes: + - !Memory + name: MainMemory + size: inf + area: 0 + leak_power: 0 + tensors: {keep: All} + actions: + - {name: read, energy: 0, throughput: inf} + - {name: write, energy: 0, throughput: inf} + + - !Memory + name: GlobalBuffer + size: inf + area: 0 + leak_power: 0 + tensors: {keep: ~MainMemory, may_keep: All} + actions: + - {name: read, energy: 0, throughput: inf} + - {name: write, energy: 0, throughput: inf} + + - !Network + name: PeArray + area: 0 + leak_power: 0 + total_latency: "max_hops" + actions: + - {name: hops, energy: 1, latency: 0, throughput: 1} + + - !Memory + name: Scratchpad + size: inf + area: 0 + leak_power: 0 + tensors: {keep: All} + actions: + - {name: read, energy: 0, throughput: inf} + - {name: write, energy: 0, throughput: inf} + spatial: 
+ - {name: X, fanout: 4} + + # All-to-all switch (NVLink-like): every node is one switch hop from every + # other, so unicast and multicast cost the same total hops and max_hops is + # constant. Fanout is 4 so this differs observably from a mesh. + - !Network + name: MacArray + topology: all_to_all + area: 0 + leak_power: 0 + actions: + - {name: hops, energy: 1, latency: 1, throughput: inf} + + - !Compute + name: MAC + area: 0 + leak_power: 0 + actions: + - {name: compute, energy: 0, throughput: inf} + spatial: + - {name: X, fanout: 4} diff --git a/tests/input_files/networked/hierarchical_switched.yaml b/tests/network/input_files/networked/hierarchical_switched.yaml similarity index 100% rename from tests/input_files/networked/hierarchical_switched.yaml rename to tests/network/input_files/networked/hierarchical_switched.yaml diff --git a/tests/input_files/networked/one_matmul_to_flat.yaml b/tests/network/input_files/networked/one_matmul_to_flat.yaml similarity index 100% rename from tests/input_files/networked/one_matmul_to_flat.yaml rename to tests/network/input_files/networked/one_matmul_to_flat.yaml diff --git a/tests/input_files/networked/one_matmul_to_networked_hierarchical.yaml b/tests/network/input_files/networked/one_matmul_to_networked_hierarchical.yaml similarity index 100% rename from tests/input_files/networked/one_matmul_to_networked_hierarchical.yaml rename to tests/network/input_files/networked/one_matmul_to_networked_hierarchical.yaml diff --git a/tests/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml b/tests/network/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml similarity index 100% rename from tests/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml rename to tests/network/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml diff --git a/tests/test_network.py b/tests/network/test_network.py similarity index 75% rename from tests/test_network.py rename to 
tests/network/test_network.py index 8a11802d..138631fc 100644 --- a/tests/test_network.py +++ b/tests/network/test_network.py @@ -34,7 +34,7 @@ def test_flat(self): self.fail(e.message) -class TestModel(TestCase): +class TestModelMesh(TestCase): def test_hierarchical_1d(self): M = 8 KN = 8 @@ -333,6 +333,105 @@ def test_flat(self): ) +class TestModelAllToAll(TestCase): + """Full-model evaluation of the 1D hierarchy where MacArray is an all-to-all + switch (NVLink-like) instead of a mesh. PeArray remains a mesh, so the two + networks can be contrasted within a single run.""" + + def test_hierarchical_1d_all_to_all(self): + M = 8 + KN = 8 + MAC_TILE = 4 + M_TILE = 4 + BITS_PER_VALUE = 8 + + spec = af.Spec.from_yaml( + af.examples.workloads.matmuls, + INPUT_FILES_DIR / "hierarchical_1d_all_to_all.yaml", + INPUT_FILES_DIR / "one_matmul_to_networked_hierarchical_1d.yaml", + jinja_parse_data={ + "N_EINSUMS": 1, + "M": M, + "KN": KN, + "MAC_TILE": MAC_TILE, + "M_TILE": M_TILE, + }, + ) + result = spec.evaluate_mapping() + + # --- MacArray: all-to-all switch --------------------------------- + # On a switch every node is one hop away, so unicast (T0, W0) collapses + # to the same (MAC_TILE - 1) linear cost as multicast (T1): all equal. + # Contrast test_hierarchical_1d, where the mesh makes T0/W0 quadratic + # (sum(range(MAC_TILE))). + all_to_all = ( + (M / M_TILE) + * (KN / MAC_TILE) # number of used Scratchpad + * M_TILE + * KN # temporal for n1 in mapping + * (MAC_TILE - 1) # one switch hop per destination, for every tensor + * BITS_PER_VALUE + ) + for tensor in ("T0", "T1", "W0"): + self.assertEqual( + result.data[ + f"Matmul0actionMacArray{tensor}hops" + ].iloc[0], + all_to_all, + msg=f"unexpected MacArray hops for {tensor}", + ) + + # Guard: a mesh would make the unicast tensors strictly more expensive. 
+ mesh_unicast = ( + (M / M_TILE) + * (KN / MAC_TILE) + * M_TILE + * KN + * sum(range(MAC_TILE)) # quadratic on a mesh + * BITS_PER_VALUE + ) + self.assertGreater(mesh_unicast, all_to_all) + + # --- PeArray: still a mesh --------------------------------------- + # Unchanged from test_hierarchical_1d, so the mesh formulas hold (now + # with MAC_TILE = 4, i.e. KN // MAC_TILE = 2). + self.assertEqual( + result.data["Matmul0actionPeArrayT0hops"].iloc[0], + (M / M_TILE) + * sum(i for i in range(KN // MAC_TILE)) # unicast along X of PeArray + * M_TILE + * MAC_TILE + * BITS_PER_VALUE, + ) + self.assertEqual( + result.data["Matmul0actionPeArrayT1hops"].iloc[0], + (M / M_TILE) + * (KN // MAC_TILE - 1) # multicast along X of PeArray + * M_TILE + * KN + * BITS_PER_VALUE, + ) + self.assertEqual( + result.data["Matmul0actionPeArrayW0hops"].iloc[0], + (M / M_TILE) + * sum(i for i in range(KN // MAC_TILE)) # unicast along PeArray + * MAC_TILE + * KN + * BITS_PER_VALUE, + ) + + # --- Latency ------------------------------------------------------ + # The switch's uniform single-hop routing gives MacArray a constant + # latency of 1, versus the mesh PeArray's distance-dependent 2. 
+ self.assertEqual( + result.data["Matmul0latencyMacArray"].iloc[0], 1 + ) + self.assertEqual( + result.data["Matmul0latencyPeArray"].iloc[0], 2 + ) + self.assertEqual(result.data["Totallatency"].iloc[0], 2) + + class TestMapper(TestCase): def test_hierarchical(self): M = 8 diff --git a/tests/network/test_topology_model.py b/tests/network/test_topology_model.py new file mode 100644 index 00000000..36dcc31c --- /dev/null +++ b/tests/network/test_topology_model.py @@ -0,0 +1,168 @@ +from unittest import TestCase + +from accelforge.frontend.arch.components import TopologySpec +from accelforge.frontend._workload_isl._symbolic import ( + Irrelevant, + PartiallyRelevant, + Relevant, +) +from accelforge.model._looptree.reuse.symbolic._network import ( + AllToAllTopologyModel, + MeshTopologyModel, + get_topology_model, +) + + +class _NoDistribution: + """Stand-in source component that is not physically distributed.""" + + def _get_physical_fanout_along(self, dim_name, default=1): + return 1 + + +class _Distributed: + """Stand-in source component physically distributed along a dimension.""" + + def __init__(self, fanout, stride): + self.fanout = fanout + self.stride = stride + + def _get_physical_fanout_along(self, dim_name, default=1): + return self.fanout + + def _get_physical_stride_along(self, dim_name): + return self.stride + + +class TestMeshTopologyModel(TestCase): + """Unit tests for the mesh cost model in isolation.""" + + def _cost(self, relevancy, *, n, stride, volume=10, src=None): + return MeshTopologyModel().per_loop_transfer_cost( + relevancy, + shape_repeats=n, + last_fanout=stride, + volume=volume, + src_component=src if src is not None else _NoDistribution(), + dim_name="X", + ) + + def test_registry_resolves_model(self): + self.assertIsInstance(get_topology_model(TopologySpec.MESH), MeshTopologyModel) + self.assertIsInstance(get_topology_model("mesh"), MeshTopologyModel) + + def test_multicast(self): + # Irrelevant: one value flows down the line, dropped 
at each of the + # (n - 1) downstream nodes. Each link carries it at most once. + n, stride, volume = 4, 2, 10 + cost = self._cost(Irrelevant(), n=n, stride=stride, volume=volume) + self.assertEqual(cost.total_cost, (n - 1) * stride * volume) + self.assertEqual(cost.max_hops, n * stride) + self.assertEqual(cost.max_traffic, volume) + + def test_unicast(self): + # Relevant (not distributed): each destination needs its own data + # delivered i*stride hops away, so the total is quadratic and the link + # nearest the source carries traffic for all (n - 1) downstream nodes. + n, stride, volume = 4, 2, 10 + cost = self._cost(Relevant("n0"), n=n, stride=stride, volume=volume) + self.assertEqual(cost.total_cost, sum(range(n)) * stride * volume) + self.assertEqual(cost.max_hops, n * stride) + self.assertEqual(cost.max_traffic, (n - 1) * volume) + + def test_unicast_distributed_binds_locally(self): + # When the source is physically distributed, data binds as locally as + # possible, reducing hops relative to the non-distributed unicast. 
+ n, stride, volume = 4, 1, 10 + src = _Distributed(fanout=2, stride=4) + cost = self._cost(Relevant("n0"), n=n, stride=stride, volume=volume, src=src) + + # physical_stride / last_fanout = 4, capped at shape_repeats = 4 + n_dsts_per_physical = 4 + n_activated_physical = 1 # n*stride / physical_stride = 4/4 + self.assertEqual( + cost.total_cost, + n_activated_physical * sum(range(n_dsts_per_physical)) * stride * volume, + ) + self.assertEqual(cost.max_hops, (n_dsts_per_physical - 1) * stride) + self.assertEqual(cost.max_traffic, (n_dsts_per_physical - 1) * volume) + + def test_partially_relevant_not_implemented(self): + with self.assertRaises(NotImplementedError): + self._cost(PartiallyRelevant("n0"), n=4, stride=2) + + +class TestAllToAllTopologyModel(TestCase): + """Unit tests for the all-to-all (switch) cost model in isolation.""" + + def _cost(self, relevancy, n, *, volume=10, last_fanout=99): + # last_fanout is deliberately large and arbitrary: an all-to-all switch + # must ignore physical stride entirely. + return AllToAllTopologyModel().per_loop_transfer_cost( + relevancy, + shape_repeats=n, + last_fanout=last_fanout, + volume=volume, + src_component=_NoDistribution(), + dim_name="X", + ) + + def test_registry_resolves_model(self): + # Resolves both by enum and by the StrEnum value (the form that survives + # the arch evaluation pipeline). + self.assertIsInstance( + get_topology_model(TopologySpec.ALL_TO_ALL), AllToAllTopologyModel + ) + self.assertIsInstance(get_topology_model("all_to_all"), AllToAllTopologyModel) + + def test_multicast(self): + n, volume = 5, 10 + cost = self._cost(Irrelevant(), n, volume=volume) + # Linear in destinations, one switch hop, shared link traffic. 
+ self.assertEqual(cost.total_cost, (n - 1) * volume) + self.assertEqual(cost.max_hops, AllToAllTopologyModel.HOPS_PER_TRANSFER) + self.assertEqual(cost.max_traffic, volume) + + def test_unicast(self): + n, volume = 5, 10 + cost = self._cost(Relevant("n0"), n, volume=volume) + # Same (linear) total cost as multicast and constant hops, but the + # source's uplink to the switch carries every distinct message. + self.assertEqual(cost.total_cost, (n - 1) * volume) + self.assertEqual(cost.max_hops, AllToAllTopologyModel.HOPS_PER_TRANSFER) + self.assertEqual(cost.max_traffic, (n - 1) * volume) + + def test_independent_of_stride(self): + # Stride (last_fanout) must not affect any component of the cost. + a = self._cost(Relevant("n0"), 5, last_fanout=1) + b = self._cost(Relevant("n0"), 5, last_fanout=1000) + self.assertEqual( + (a.total_cost, a.max_hops, a.max_traffic), + (b.total_cost, b.max_hops, b.max_traffic), + ) + + def test_linear_unlike_mesh_quadratic(self): + # Against an identical mesh scenario, all-to-all unicast is linear while + # the mesh is quadratic, and all-to-all hops are constant (< distance). + n, volume, stride = 6, 1, 1 + kwargs = dict( + shape_repeats=n, + last_fanout=stride, + volume=volume, + src_component=_NoDistribution(), + dim_name="X", + ) + a2a = AllToAllTopologyModel().per_loop_transfer_cost(Relevant("n0"), **kwargs) + mesh = MeshTopologyModel().per_loop_transfer_cost(Relevant("n0"), **kwargs) + + self.assertEqual(a2a.total_cost, (n - 1) * volume) + self.assertEqual(mesh.total_cost, sum(range(n)) * stride * volume) + self.assertLess(a2a.total_cost, mesh.total_cost) + self.assertLess(a2a.max_hops, mesh.max_hops) + + def test_accumulate_max_hops_persists(self): + # overall_max_hops accumulates across calls for a given network. 
+ model = AllToAllTopologyModel() + h = AllToAllTopologyModel.HOPS_PER_TRANSFER + self.assertEqual(model.accumulate_max_hops("net", h), h) + self.assertEqual(model.accumulate_max_hops("net", h), 2 * h) From dd4769946e78f21c950ac7119a953fdce7d5ce9f Mon Sep 17 00:00:00 2001 From: Michael Gilbert Date: Fri, 5 Jun 2026 19:13:54 -0400 Subject: [PATCH 12/12] [network] Clean up Claude output --- .../_looptree/reuse/symbolic/_network.py | 36 +++++++------------ .../networked/hierarchical_1d_all_to_all.yaml | 6 ++-- tests/network/test_network.py | 24 +++---------- 3 files changed, 18 insertions(+), 48 deletions(-) diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py index cdd9b8fa..0c833354 100644 --- a/accelforge/model/_looptree/reuse/symbolic/_network.py +++ b/accelforge/model/_looptree/reuse/symbolic/_network.py @@ -94,10 +94,10 @@ def per_loop_transfer_cost( class MeshTopologyModel(TopologyModel): """Cost model for a mesh network. - Data travels link-by-link along one axis of the mesh. Multicast delivers a - value to every point along the dimension; unicast delivers a distinct value - to each point. When the source is physically distributed, data is bound as - locally as possible across the physical buffers. + Data travels along one axis of the mesh. Multicast delivers a value to every + point along the dimension; unicast delivers a distinct value to each point. + When the source is physically distributed, data is bound as locally as + possible across the physical buffers. """ def per_loop_transfer_cost( @@ -153,23 +153,13 @@ def per_loop_transfer_cost( class AllToAllTopologyModel(TopologyModel): - """Cost model for an all-to-all network built around a switch (e.g. NVLink / - NVSwitch). - - Every node connects to every other node through a central switch, so any - source reaches any destination in a constant number of hops regardless of - how far apart they are in the logical fanout. 
This differs from a mesh in - two ways: - - - **Uniform latency.** The longest route is a single switch traversal, so - ``max_hops`` is constant rather than growing with the distance - (``shape_repeats * stride``) between source and destination. - - **No store-and-forward accumulation.** Each destination is reached - directly, so the total (energy) cost is linear in the number of - destinations rather than quadratic as in a mesh unicast. - - The physical stride is irrelevant here (all nodes are equidistant from the - switch), so ``last_fanout`` and physical distribution are not consulted. + """Cost model for an all-to-all network using a switch (e.g. NVLink). + + Every node connects to every other node through a switch, so any + source reaches any destination in one hop regardless of how far apart they are. + + Physical stride is irrelevant, so ``last_fanout`` and physical distribution + are not used. """ HOPS_PER_TRANSFER = 1 @@ -219,9 +209,7 @@ def per_loop_transfer_cost( ) -# Registry mapping each topology to the model class that costs its data -# movement. Classes (not instances) are stored because models are stateful and -# each NetworkAnalyzer needs its own. 
+# Registry of topology models TOPOLOGY_MODELS: dict[TopologySpec, type[TopologyModel]] = { TopologySpec.MESH: MeshTopologyModel, TopologySpec.ALL_TO_ALL: AllToAllTopologyModel, diff --git a/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml b/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml index 3d8b6d22..bbb14f8c 100644 --- a/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml +++ b/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml @@ -26,7 +26,7 @@ arch: leak_power: 0 total_latency: "max_hops" actions: - - {name: hops, energy: 1, latency: 0, throughput: 1} + - {name: hops, energy: 1, latency: 1, throughput: inf} - !Memory name: Scratchpad @@ -40,9 +40,7 @@ arch: spatial: - {name: X, fanout: 4} - # All-to-all switch (NVLink-like): every node is one switch hop from every - # other, so unicast and multicast cost the same total hops and max_hops is - # constant. Fanout is 4 so this differs observably from a mesh. + # All-to-all switch (NVLink-like): every node is one hop from every other - !Network name: MacArray topology: all_to_all diff --git a/tests/network/test_network.py b/tests/network/test_network.py index 138631fc..04e6e6ba 100644 --- a/tests/network/test_network.py +++ b/tests/network/test_network.py @@ -334,9 +334,7 @@ def test_flat(self): class TestModelAllToAll(TestCase): - """Full-model evaluation of the 1D hierarchy where MacArray is an all-to-all - switch (NVLink-like) instead of a mesh. PeArray remains a mesh, so the two - networks can be contrasted within a single run.""" + """MacArray is an all-to-all switch (NVLink-like). 
PeArray is a mesh.""" def test_hierarchical_1d_all_to_all(self): M = 8 @@ -360,16 +358,13 @@ def test_hierarchical_1d_all_to_all(self): result = spec.evaluate_mapping() # --- MacArray: all-to-all switch --------------------------------- - # On a switch every node is one hop away, so unicast (T0, W0) collapses - # to the same (MAC_TILE - 1) linear cost as multicast (T1): all equal. - # Contrast test_hierarchical_1d, where the mesh makes T0/W0 quadratic - # (sum(range(MAC_TILE))). + # Every node is one hop away all_to_all = ( (M / M_TILE) * (KN / MAC_TILE) # number of used Scratchpad * M_TILE * KN # temporal for n1 in mapping - * (MAC_TILE - 1) # one switch hop per destination, for every tensor + * (MAC_TILE - 1) # one hop per destination, for every tensor * BITS_PER_VALUE ) for tensor in ("T0", "T1", "W0"): @@ -381,17 +376,6 @@ def test_hierarchical_1d_all_to_all(self): msg=f"unexpected MacArray hops for {tensor}", ) - # Guard: a mesh would make the unicast tensors strictly more expensive. - mesh_unicast = ( - (M / M_TILE) - * (KN / MAC_TILE) - * M_TILE - * KN - * sum(range(MAC_TILE)) # quadratic on a mesh - * BITS_PER_VALUE - ) - self.assertGreater(mesh_unicast, all_to_all) - # --- PeArray: still a mesh --------------------------------------- # Unchanged from test_hierarchical_1d, so the mesh formulas hold (now # with MAC_TILE = 4, i.e. KN // MAC_TILE = 2). @@ -422,7 +406,7 @@ def test_hierarchical_1d_all_to_all(self): # --- Latency ------------------------------------------------------ # The switch's uniform single-hop routing gives MacArray a constant - # latency of 1, versus the mesh PeArray's distance-dependent 2. + # latency of 1, versus the mesh PeArray's 2. self.assertEqual( result.data["Matmul0latencyMacArray"].iloc[0], 1 )