From ac14b29d2320af90860d85d7ccd8fc03f40c8efb Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Thu, 26 Feb 2026 15:55:11 -0500
Subject: [PATCH 01/12] WIP sharding modeling

---
 .../_looptree/reuse/symbolic/symbolic.py      |  45 +-
 examples/arches/networked/flat.yaml           |  43 +-
 examples/arches/networked/hierarchical.yaml   |  22 +-
 notebooks/tutorials/networks.ipynb            | 456 ++++--------------
 tests/test_network.py                         |  77 ++-
 5 files changed, 209 insertions(+), 434 deletions(-)

diff --git a/accelforge/model/_looptree/reuse/symbolic/symbolic.py b/accelforge/model/_looptree/reuse/symbolic/symbolic.py
index 9cff4a76..e2a9a3d7 100755
--- a/accelforge/model/_looptree/reuse/symbolic/symbolic.py
+++ b/accelforge/model/_looptree/reuse/symbolic/symbolic.py
@@ -876,7 +876,9 @@ def analyze_spatial(node_idx, current_shape, info: AnalysisInfo):
     node: Spatial = mapping[node_idx]
     rank_var = node.rank_variable
     node_dim = node.name
-    spatial_component = find_component_object(node.component, info.job.flattened_arch)
+    flattened_arch = info.job.flattened_arch
+    arch_spec = info.job.spec.arch
+    spatial_component = find_component_object(node.component, flattened_arch)
     component_spatial_dim = spatial_component.spatial[node_dim]
     stride_and_shape = get_stride_and_tile_shape(node, current_shape, node_idx, info)
 
@@ -937,15 +939,13 @@ def handle_repeated_value(repeated_shape):
                 child_network_stats.max_hops,
             )
             projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)]
-            component_object = find_component_object(
-                network.component, info.job.flattened_arch
-            )
-            bits_per_value_scale = component_object.bits_per_value_scale[network.tensor]
+            network_object = find_component_object(network.component, flattened_arch)
+            bits_per_value_scale = network_object.bits_per_value_scale[network.tensor]
             bits_per_value = (
                 bits_per_value_scale
                 * info.job.einsum.tensor_accesses[network.tensor].bits_per_value
             )
-            bits_per_action = component_object.bits_per_action
+            bits_per_action = network_object.bits_per_action
             if bits_per_action is not None:
                 actions_per_value = bits_per_value / bits_per_action
             else:
@@ -955,8 +955,9 @@ def handle_repeated_value(repeated_shape):
                 * actions_per_value
             )
 
-            if info.job.spec.arch.is_above(node.component, network.component):
+            if is_component_a_above_b(node.component, network.source.level, flattened_arch):
                 continue
+            source_object = find_component_object(network.source.level, flattened_arch)
 
             last_fanout = child_result.fanout.get((node.component, einsum_name), {})
             last_fanout = last_fanout.get(node.name, 1)
@@ -972,14 +973,18 @@ def handle_repeated_value(repeated_shape):
                     overall_max_hops + child_network_stats.max_hops,
                 )
             elif isinstance(relevancy, Relevant):
+                avg_max_src_to_dst = (
+                    component_spatial_dim.fanout
+                    /
+                    source_object._get_physical_fanout_along(node.name, 1)
+                )-1
                 # Cost of unicast is the cost of delivering to each point in
                 # the dimension with shape as stride
                 # TODO: we should use the actual stride
-                total_unicast_cost = (
-                    0.5 * (shape_repeats - 1) * shape_repeats * last_fanout * volume
-                )
-                max_unicast_hops = (shape_repeats - 1) * last_fanout
-                overall_max_hops += max_unicast_hops
+                max_hops = MinGeqZero((shape_repeats-1)*last_fanout, avg_max_src_to_dst)
+                avg_hops = 0.5*max_hops
+                total_unicast_cost = avg_hops * shape_repeats * volume
+                overall_max_hops += max_hops
 
                 accumulated_network_stats.total_hops += total_unicast_cost
                 accumulated_network_stats.max_hops = MaxGeqZero(
@@ -1070,6 +1075,22 @@ def find_component_object(
     raise ValueError(f"Component {component} not found in flattened arch")
 
 
+def is_component_a_above_b(component_a: str, component_b: str, flattened_arch):
+    """Return True iff `component_a` appears strictly before `component_b`.
+
+    `flattened_arch` is iterated in order; the first node named after either
+    component decides the result. A component is never above itself.
+    Raises ValueError if neither name is found.
+    """
+    for node in flattened_arch:
+        is_a = node.name == component_a
+        is_b = node.name == component_b
+        if is_a or is_b:
+            # First hit decides; a node matching both names is not above itself.
+            return is_a and not is_b
+    raise ValueError(f"Neither {component_a} nor {component_b} found in flattened arch")
+
+
 def analyze_storage(
     node_idx: int,
     current_shape: dict[str, int],
diff --git a/examples/arches/networked/flat.yaml b/examples/arches/networked/flat.yaml
index 18c369e9..911cd59f 100644
--- a/examples/arches/networked/flat.yaml
+++ b/examples/arches/networked/flat.yaml
@@ -1,3 +1,5 @@
+{% set N_ROW_BUFFER = N_ROW_BUFFER | default(4) %}
+{% set N_COL_BUFFER = N_COL_BUFFER | default(4) %}
 arch:
   nodes:
   - !Memory
@@ -7,16 +9,18 @@ arch:
     leak_power: 0
     tensors: {keep: All}
     actions:
-    - {name: read, energy: 0, latency: 0}
-    - {name: write, energy: 0, latency: 0}
+    - {name: read, energy: 100, latency: 0}
+    - {name: write, energy: 100, latency: 0}
 
   - !Network
     name: NoC
     area: 0
     leak_power: 0
-    actions: []
+    actions:
+    - {name: hops, energy: 1, latency: 0}
 
   - !Array
+    name: PeArray
     spatial:
     - {name: X, fanout: 4}
     - {name: Y, fanout: 4}
@@ -28,42 +32,42 @@ arch:
       leak_power: 0
       tensors: {keep: ~MainMemory, may_keep: All}
       actions:
-      - {name: read, energy: 0, latency: 0}
-      - {name: write, energy: 0, latency: 0}
+      - {name: read, energy: 10, latency: 0}
+      - {name: write, energy: 10, latency: 0}
 
     - !Memory
       name: RowBuffer
       size: inf
       area: 0
       leak_power: 0
-      tensors: {keep: ~MainMemory, may_keep: All}
+      tensors: {keep: input, may_keep: input}
       actions:
-      - {name: read, energy: 0, latency: 0}
-      - {name: write, energy: 0, latency: 0}
+      - {name: read, energy: 5, latency: 0}
+      - {name: write, energy: 5, latency: 0}
       spatial:
-      - {name: X, fanout: 4}
+      - {name: X, fanout: {{N_ROW_BUFFER}}}
 
     - !Memory
       name: ColumnBuffer
       size: inf
       area: 0
       leak_power: 0
-      tensors: {keep: ~MainMemory, may_keep: All}
+      tensors: {keep: output, may_keep: output}
       actions:
-      - {name: read, energy: 0, latency: 0}
-      - {name: write, energy: 0, latency: 0}
+      - {name: read, energy: 5, latency: 0}
+      - {name: write, energy: 5, latency: 0}
       spatial:
-      - {name: Y, fanout: 4}
+      - {name: Y, fanout: {{N_COL_BUFFER}}}
 
     - !Memory
       name: DistributedBuffer
       size: inf
       area: 0
       leak_power: 0
-      tensors: {keep: ~MainMemory, may_keep: All}
+      tensors: {keep: weight, may_keep: weight}
       actions:
-      - {name: read, energy: 0, latency: 0}
-      - {name: write, energy: 0, latency: 0}
+      - {name: read, energy: 5, latency: 0}
+      - {name: write, energy: 5, latency: 0}
       spatial:
       - {name: X, fanout: 2}
       - {name: Y, fanout: 2}
@@ -73,13 +77,14 @@ arch:
     size: inf
     area: 0
     leak_power: 0
+    tensors: {keep: weight, may_keep: weight}
     actions:
-    - {name: read, energy: 0, latency: 0}
-    - {name: write, energy: 0, latency: 0}
+    - {name: read, energy: 1, latency: 0}
+    - {name: write, energy: 1, latency: 0}
 
   - !Compute
     name: MAC
     area: 0
     leak_power: 0
     actions:
-    - {name: compute, energy: 0, latency: 0}
\ No newline at end of file
+    - {name: compute, energy: 1, latency: 0}
\ No newline at end of file
diff --git a/examples/arches/networked/hierarchical.yaml b/examples/arches/networked/hierarchical.yaml
index 5a7d44be..211d0e81 100644
--- a/examples/arches/networked/hierarchical.yaml
+++ b/examples/arches/networked/hierarchical.yaml
@@ -7,8 +7,8 @@ arch:
     leak_power: 0
     tensors: {keep: All}
     actions:
-    - {name: read, energy: 0, latency: 0}
-    - {name: write, energy: 0, latency: 0}
+    - {name: read, energy: 100, latency: 1e-9}
+    - {name: write, energy: 100, latency: 1e-9}
 
   - !Memory
     name: GlobalBuffer
@@ -17,15 +17,15 @@ arch:
     leak_power: 0
     tensors: {keep: ~MainMemory, may_keep: All}
     actions:
-    - {name: read, energy: 0, latency: 0}
-    - {name: write, energy: 0, latency: 0}
+    - {name: read, energy: 10, latency: 1e-9/4}
+    - {name: write, energy: 10, latency: 1e-9/4}
 
   - !Network
-    name: PeArray
+    name: PeNoc
     area: 0
     leak_power: 0
     actions:
-    - {name: hops, energy: 1, latency: 0}
+    - {name: hops, energy: 5, latency: 1e-9/4}
 
   - !Memory
     name: Scratchpad
@@ -34,25 +34,25 @@ arch:
     leak_power: 0
     tensors: {keep: All}
     actions:
-    - {name: read, energy: 0, latency: 0}
-    - {name: write, energy: 0, latency: 0}
+    - {name: read, energy: 2, latency: 1e-9/16}
+    - {name: write, energy: 2, latency: 1e-9/16}
     spatial:
     - {name: X, fanout: 2}
     - {name: Y, fanout: 2}
 
   - !Network
-    name: MacArray
+    name: MacNoc
     area: 0
     leak_power: 0
     actions:
-    - {name: hops, energy: 1, latency: 0}
+    - {name: hops, energy: 1, latency: 1e-9/16}
 
   - !Compute
     name: MAC
     area: 0
     leak_power: 0
     actions:
-    - {name: compute, energy: 0, latency: 0}
+    - {name: compute, energy: 1, latency: 1e-9}
     spatial:
     - {name: X, fanout: 2}
     - {name: Y, fanout: 2}
\ No newline at end of file
diff --git a/notebooks/tutorials/networks.ipynb b/notebooks/tutorials/networks.ipynb
index d809e0d9..0c09b7af 100644
--- a/notebooks/tutorials/networks.ipynb
+++ b/notebooks/tutorials/networks.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "43938186",
    "metadata": {},
    "outputs": [],
@@ -20,117 +20,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "49a31e7a",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/svg+xml": [
-       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
-       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
-       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
-       "<!-- Generated by graphviz version 2.43.0 (0)\n",
-       " -->\n",
-       "<!-- Title: G Pages: 1 -->\n",
-       "<svg width=\"151pt\" height=\"327pt\"\n",
-       " viewBox=\"0.00 0.00 151.00 326.83\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
-       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 322.83)\">\n",
-       "<title>G</title>\n",
-       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-322.83 147,-322.83 147,4 -4,4\"/>\n",
-       "<!-- Memory_125810196833264 -->\n",
-       "<g id=\"node1\" class=\"node\">\n",
-       "<title>Memory_125810196833264</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M143,-315.56C143,-317.37 110.95,-318.83 71.5,-318.83 32.05,-318.83 0,-317.37 0,-315.56 0,-315.56 0,-286.11 0,-286.11 0,-284.3 32.05,-282.83 71.5,-282.83 110.95,-282.83 143,-284.3 143,-286.11 143,-286.11 143,-315.56 143,-315.56\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M143,-315.56C143,-313.75 110.95,-312.29 71.5,-312.29 32.05,-312.29 0,-313.75 0,-315.56\"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-297.73\" font-family=\"Arial\" font-size=\"12.00\">MainMemory with size inf</text>\n",
-       "</g>\n",
-       "<!-- Memory_125810196833584 -->\n",
-       "<g id=\"node2\" class=\"node\">\n",
-       "<title>Memory_125810196833584</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M142,-271.56C142,-273.37 110.4,-274.83 71.5,-274.83 32.6,-274.83 1,-273.37 1,-271.56 1,-271.56 1,-242.11 1,-242.11 1,-240.3 32.6,-238.83 71.5,-238.83 110.4,-238.83 142,-240.3 142,-242.11 142,-242.11 142,-271.56 142,-271.56\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M142,-271.56C142,-269.75 110.4,-268.29 71.5,-268.29 32.6,-268.29 1,-269.75 1,-271.56\"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-253.73\" font-family=\"Arial\" font-size=\"12.00\">GlobalBuffer with size inf</text>\n",
-       "</g>\n",
-       "<!-- Memory_125810196833264&#45;&#45;Memory_125810196833584 -->\n",
-       "<g id=\"edge1\" class=\"edge\">\n",
-       "<title>Memory_125810196833264&#45;&#45;Memory_125810196833584</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M71.5,-282.75C71.5,-280.21 71.5,-277.58 71.5,-275.04\"/>\n",
-       "</g>\n",
-       "<!-- Network_125810196835344 -->\n",
-       "<g id=\"node3\" class=\"node\">\n",
-       "<title>Network_125810196835344</title>\n",
-       "<polygon fill=\"#faf8c8\" stroke=\"black\" points=\"97.5,-230.83 45.5,-230.83 45.5,-178.83 97.5,-178.83 97.5,-230.83\"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"57.5,-230.83 45.5,-218.83 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"45.5,-190.83 57.5,-178.83 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"85.5,-178.83 97.5,-190.83 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"97.5,-218.83 85.5,-230.83 \"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-201.73\" font-family=\"Arial\" font-size=\"12.00\">PeArray</text>\n",
-       "</g>\n",
-       "<!-- Memory_125810196833584&#45;&#45;Network_125810196835344 -->\n",
-       "<g id=\"edge2\" class=\"edge\">\n",
-       "<title>Memory_125810196833584&#45;&#45;Network_125810196835344</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M71.5,-238.63C71.5,-236.14 71.5,-233.54 71.5,-230.94\"/>\n",
-       "</g>\n",
-       "<!-- Memory_125810196835744 -->\n",
-       "<g id=\"node4\" class=\"node\">\n",
-       "<title>Memory_125810196835744</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M139,-166.48C139,-168.81 108.75,-170.71 71.5,-170.71 34.25,-170.71 4,-168.81 4,-166.48 4,-166.48 4,-128.43 4,-128.43 4,-126.1 34.25,-124.21 71.5,-124.21 108.75,-124.21 139,-126.1 139,-128.43 139,-128.43 139,-166.48 139,-166.48\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M139,-166.48C139,-164.15 108.75,-162.25 71.5,-162.25 34.25,-162.25 4,-164.15 4,-166.48\"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-150.86\" font-family=\"Arial\" font-size=\"12.00\">Scratchpad with size inf</text>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-137.86\" font-family=\"Arial\" font-size=\"12.00\">[2× X, 2× Y]</text>\n",
-       "</g>\n",
-       "<!-- Network_125810196835344&#45;&#45;Memory_125810196835744 -->\n",
-       "<g id=\"edge3\" class=\"edge\">\n",
-       "<title>Network_125810196835344&#45;&#45;Memory_125810196835744</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M71.5,-178.74C71.5,-176.11 71.5,-173.44 71.5,-170.84\"/>\n",
-       "</g>\n",
-       "<!-- Network_125810196835824 -->\n",
-       "<g id=\"node5\" class=\"node\">\n",
-       "<title>Network_125810196835824</title>\n",
-       "<polygon fill=\"#faf8c8\" stroke=\"black\" points=\"101.5,-116.08 41.5,-116.08 41.5,-56.08 101.5,-56.08 101.5,-116.08\"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"53.5,-116.08 41.5,-104.08 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"41.5,-68.08 53.5,-56.08 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"89.5,-56.08 101.5,-68.08 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"101.5,-104.08 89.5,-116.08 \"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-82.98\" font-family=\"Arial\" font-size=\"12.00\">MacArray</text>\n",
-       "</g>\n",
-       "<!-- Memory_125810196835744&#45;&#45;Network_125810196835824 -->\n",
-       "<g id=\"edge4\" class=\"edge\">\n",
-       "<title>Memory_125810196835744&#45;&#45;Network_125810196835824</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M71.5,-124.01C71.5,-121.52 71.5,-118.95 71.5,-116.38\"/>\n",
-       "</g>\n",
-       "<!-- Compute_125810196837184 -->\n",
-       "<g id=\"node6\" class=\"node\">\n",
-       "<title>Compute_125810196837184</title>\n",
-       "<ellipse fill=\"#e0eeff\" stroke=\"black\" cx=\"71.5\" cy=\"-24.04\" rx=\"49.49\" ry=\"24.08\"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-27.44\" font-family=\"Arial\" font-size=\"12.00\">MAC</text>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-14.44\" font-family=\"Arial\" font-size=\"12.00\">[2× X, 2× Y]</text>\n",
-       "</g>\n",
-       "<!-- Network_125810196835824&#45;&#45;Compute_125810196837184 -->\n",
-       "<g id=\"edge5\" class=\"edge\">\n",
-       "<title>Network_125810196835824&#45;&#45;Compute_125810196837184</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M71.5,-55.77C71.5,-53.27 71.5,-50.76 71.5,-48.31\"/>\n",
-       "</g>\n",
-       "</g>\n",
-       "</svg>\n"
-      ],
-      "text/plain": [
-       "Arch(nodes=ArchNodes([Memory(name='MainMemory', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='All', may_keep='<Nothing if keep is defined, else All>', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Memory(name='GlobalBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], 
no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Network(name='PeArray', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs()), Memory(name='Scratchpad', spatial=[Spatial(name='X', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='All', may_keep='<Nothing if keep is defined, else All>', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Network(name='MacArray', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[], 
enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs()), Compute(name='MAC', spatial=[Spatial(name='X', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[Action(name='compute', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras())], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs())]), variables=EvalExtras(), extra_attributes_for_all_component_models=EvalExtras())"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "spec = af.Spec.from_yaml(\n",
-    "    af.examples.arches.networked.hierarchical\n",
+    "    af.examples.arches.networked.hierarchical,\n",
+    "    af.examples.workloads.matmuls,\n",
+    "    jinja_parse_data={\"N_EINSUMS\": 1}\n",
     ")\n",
     "spec.arch"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a64424bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spec.mapper.metrics = af.mapper.Metrics.LATENCY | af.mapper.Metrics.ENERGY\n",
+    "result = spec.map_workload_to_arch()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9cc6ed1d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result.data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f0dadcac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result.energy(per_component=True)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "389cb739",
@@ -141,178 +74,76 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "9a11eec1",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/svg+xml": [
-       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
-       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
-       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
-       "<!-- Generated by graphviz version 2.43.0 (0)\n",
-       " -->\n",
-       "<!-- Title: G Pages: 1 -->\n",
-       "<svg width=\"797pt\" height=\"231pt\"\n",
-       " viewBox=\"0.00 0.00 797.00 230.75\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
-       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 226.75)\">\n",
-       "<title>G</title>\n",
-       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-226.75 793,-226.75 793,4 -4,4\"/>\n",
-       "<!-- Memory_125810196833024 -->\n",
-       "<g id=\"node1\" class=\"node\">\n",
-       "<title>Memory_125810196833024</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M453,-219.48C453,-221.28 420.95,-222.75 381.5,-222.75 342.05,-222.75 310,-221.28 310,-219.48 310,-219.48 310,-190.02 310,-190.02 310,-188.22 342.05,-186.75 381.5,-186.75 420.95,-186.75 453,-188.22 453,-190.02 453,-190.02 453,-219.48 453,-219.48\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M453,-219.48C453,-217.67 420.95,-216.2 381.5,-216.2 342.05,-216.2 310,-217.67 310,-219.48\"/>\n",
-       "<text text-anchor=\"middle\" x=\"381.5\" y=\"-201.65\" font-family=\"Arial\" font-size=\"12.00\">MainMemory with size inf</text>\n",
-       "</g>\n",
-       "<!-- Network_125810197053520 -->\n",
-       "<g id=\"node2\" class=\"node\">\n",
-       "<title>Network_125810197053520</title>\n",
-       "<polygon fill=\"#faf8c8\" stroke=\"black\" points=\"399.5,-178.75 363.5,-178.75 363.5,-142.75 399.5,-142.75 399.5,-178.75\"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"375.5,-178.75 363.5,-166.75 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"363.5,-154.75 375.5,-142.75 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"387.5,-142.75 399.5,-154.75 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"399.5,-166.75 387.5,-178.75 \"/>\n",
-       "<text text-anchor=\"middle\" x=\"381.5\" y=\"-157.65\" font-family=\"Arial\" font-size=\"12.00\">NoC</text>\n",
-       "</g>\n",
-       "<!-- Memory_125810196833024&#45;&#45;Network_125810197053520 -->\n",
-       "<g id=\"edge1\" class=\"edge\">\n",
-       "<title>Memory_125810196833024&#45;&#45;Network_125810197053520</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M381.5,-186.67C381.5,-184.12 381.5,-181.5 381.5,-178.95\"/>\n",
-       "</g>\n",
-       "<!-- Array_125810197054240 -->\n",
-       "<g id=\"node3\" class=\"node\">\n",
-       "<title>Array_125810197054240</title>\n",
-       "<polygon fill=\"#fcc2fc\" stroke=\"black\" points=\"416.5,-134.75 346.5,-134.75 346.5,-98.75 416.5,-98.75 416.5,-134.75\"/>\n",
-       "<text text-anchor=\"middle\" x=\"381.5\" y=\"-120.15\" font-family=\"Arial\" font-size=\"12.00\">Array </text>\n",
-       "<text text-anchor=\"middle\" x=\"381.5\" y=\"-107.15\" font-family=\"Arial\" font-size=\"12.00\">[4× X, 4× Y]</text>\n",
-       "</g>\n",
-       "<!-- Network_125810197053520&#45;&#45;Array_125810197054240 -->\n",
-       "<g id=\"edge6\" class=\"edge\">\n",
-       "<title>Network_125810197053520&#45;&#45;Array_125810197054240</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M381.5,-142.67C381.5,-140.12 381.5,-137.5 381.5,-134.95\"/>\n",
-       "</g>\n",
-       "<!-- Memory_125810197054800 -->\n",
-       "<g id=\"node4\" class=\"node\">\n",
-       "<title>Memory_125810197054800</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M141,-82.1C141,-83.91 109.4,-85.38 70.5,-85.38 31.6,-85.38 0,-83.91 0,-82.1 0,-82.1 0,-52.65 0,-52.65 0,-50.84 31.6,-49.38 70.5,-49.38 109.4,-49.38 141,-50.84 141,-52.65 141,-52.65 141,-82.1 141,-82.1\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M141,-82.1C141,-80.3 109.4,-78.83 70.5,-78.83 31.6,-78.83 0,-80.3 0,-82.1\"/>\n",
-       "<text text-anchor=\"middle\" x=\"70.5\" y=\"-64.28\" font-family=\"Arial\" font-size=\"12.00\">GlobalBuffer with size inf</text>\n",
-       "</g>\n",
-       "<!-- Array_125810197054240&#45;&#45;Memory_125810197054800 -->\n",
-       "<g id=\"edge2\" class=\"edge\">\n",
-       "<title>Array_125810197054240&#45;&#45;Memory_125810197054800</title>\n",
-       "<path fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" d=\"M346.4,-113.76C300.9,-110.76 219.22,-103.99 150.5,-90.75 142,-89.11 133.09,-87.02 124.43,-84.76\"/>\n",
-       "</g>\n",
-       "<!-- Memory_125810197055840 -->\n",
-       "<g id=\"node5\" class=\"node\">\n",
-       "<title>Memory_125810197055840</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M289.5,-86.4C289.5,-88.73 260.37,-90.63 224.5,-90.63 188.63,-90.63 159.5,-88.73 159.5,-86.4 159.5,-86.4 159.5,-48.35 159.5,-48.35 159.5,-46.02 188.63,-44.12 224.5,-44.12 260.37,-44.12 289.5,-46.02 289.5,-48.35 289.5,-48.35 289.5,-86.4 289.5,-86.4\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M289.5,-86.4C289.5,-84.07 260.37,-82.17 224.5,-82.17 188.63,-82.17 159.5,-84.07 159.5,-86.4\"/>\n",
-       "<text text-anchor=\"middle\" x=\"224.5\" y=\"-70.78\" font-family=\"Arial\" font-size=\"12.00\">RowBuffer with size inf</text>\n",
-       "<text text-anchor=\"middle\" x=\"224.5\" y=\"-57.77\" font-family=\"Arial\" font-size=\"12.00\">[4× X]</text>\n",
-       "</g>\n",
-       "<!-- Array_125810197054240&#45;&#45;Memory_125810197055840 -->\n",
-       "<g id=\"edge3\" class=\"edge\">\n",
-       "<title>Array_125810197054240&#45;&#45;Memory_125810197055840</title>\n",
-       "<path fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" d=\"M346.27,-105.12C329.54,-100.07 309,-93.87 289.59,-88.02\"/>\n",
-       "</g>\n",
-       "<!-- Memory_125810197056320 -->\n",
-       "<g id=\"node6\" class=\"node\">\n",
-       "<title>Memory_125810197056320</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M455,-86.4C455,-88.73 422.06,-90.63 381.5,-90.63 340.94,-90.63 308,-88.73 308,-86.4 308,-86.4 308,-48.35 308,-48.35 308,-46.02 340.94,-44.12 381.5,-44.12 422.06,-44.12 455,-46.02 455,-48.35 455,-48.35 455,-86.4 455,-86.4\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M455,-86.4C455,-84.07 422.06,-82.17 381.5,-82.17 340.94,-82.17 308,-84.07 308,-86.4\"/>\n",
-       "<text text-anchor=\"middle\" x=\"381.5\" y=\"-70.78\" font-family=\"Arial\" font-size=\"12.00\">ColumnBuffer with size inf</text>\n",
-       "<text text-anchor=\"middle\" x=\"381.5\" y=\"-57.77\" font-family=\"Arial\" font-size=\"12.00\">[4× Y]</text>\n",
-       "</g>\n",
-       "<!-- Array_125810197054240&#45;&#45;Memory_125810197056320 -->\n",
-       "<g id=\"edge4\" class=\"edge\">\n",
-       "<title>Array_125810197054240&#45;&#45;Memory_125810197056320</title>\n",
-       "<path fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" d=\"M381.5,-98.65C381.5,-96.1 381.5,-93.45 381.5,-90.81\"/>\n",
-       "</g>\n",
-       "<!-- Memory_125810197057440 -->\n",
-       "<g id=\"node7\" class=\"node\">\n",
-       "<title>Memory_125810197057440</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M636,-86.4C636,-88.73 599.47,-90.63 554.5,-90.63 509.53,-90.63 473,-88.73 473,-86.4 473,-86.4 473,-48.35 473,-48.35 473,-46.02 509.53,-44.12 554.5,-44.12 599.47,-44.12 636,-46.02 636,-48.35 636,-48.35 636,-86.4 636,-86.4\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M636,-86.4C636,-84.07 599.47,-82.17 554.5,-82.17 509.53,-82.17 473,-84.07 473,-86.4\"/>\n",
-       "<text text-anchor=\"middle\" x=\"554.5\" y=\"-70.78\" font-family=\"Arial\" font-size=\"12.00\">DistributedBuffer with size inf</text>\n",
-       "<text text-anchor=\"middle\" x=\"554.5\" y=\"-57.77\" font-family=\"Arial\" font-size=\"12.00\">[2× X, 2× Y]</text>\n",
-       "</g>\n",
-       "<!-- Array_125810197054240&#45;&#45;Memory_125810197057440 -->\n",
-       "<g id=\"edge5\" class=\"edge\">\n",
-       "<title>Array_125810197054240&#45;&#45;Memory_125810197057440</title>\n",
-       "<path fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" d=\"M416.51,-106.16C434.48,-101.24 457.11,-95.04 478.82,-89.1\"/>\n",
-       "</g>\n",
-       "<!-- Memory_125810197059360 -->\n",
-       "<g id=\"node8\" class=\"node\">\n",
-       "<title>Memory_125810197059360</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M789,-82.1C789,-83.91 758.75,-85.38 721.5,-85.38 684.25,-85.38 654,-83.91 654,-82.1 654,-82.1 654,-52.65 654,-52.65 654,-50.84 684.25,-49.38 721.5,-49.38 758.75,-49.38 789,-50.84 789,-52.65 789,-52.65 789,-82.1 789,-82.1\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M789,-82.1C789,-80.3 758.75,-78.83 721.5,-78.83 684.25,-78.83 654,-80.3 654,-82.1\"/>\n",
-       "<text text-anchor=\"middle\" x=\"721.5\" y=\"-64.28\" font-family=\"Arial\" font-size=\"12.00\">Scratchpad with size inf</text>\n",
-       "</g>\n",
-       "<!-- Array_125810197054240&#45;&#45;Memory_125810197059360 -->\n",
-       "<g id=\"edge7\" class=\"edge\">\n",
-       "<title>Array_125810197054240&#45;&#45;Memory_125810197059360</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M416.82,-114.56C467.75,-112.35 564.58,-106.34 645.5,-90.75 653.74,-89.16 662.37,-87.06 670.73,-84.78\"/>\n",
-       "</g>\n",
-       "<!-- Compute_125810197060160 -->\n",
-       "<g id=\"node9\" class=\"node\">\n",
-       "<title>Compute_125810197060160</title>\n",
-       "<ellipse fill=\"#e0eeff\" stroke=\"black\" cx=\"721.5\" cy=\"-18\" rx=\"27\" ry=\"18\"/>\n",
-       "<text text-anchor=\"middle\" x=\"721.5\" y=\"-14.9\" font-family=\"Arial\" font-size=\"12.00\">MAC</text>\n",
-       "</g>\n",
-       "<!-- Memory_125810197059360&#45;&#45;Compute_125810197060160 -->\n",
-       "<g id=\"edge8\" class=\"edge\">\n",
-       "<title>Memory_125810197059360&#45;&#45;Compute_125810197060160</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M721.5,-49.27C721.5,-45.03 721.5,-40.48 721.5,-36.24\"/>\n",
-       "</g>\n",
-       "</g>\n",
-       "</svg>\n"
-      ],
-      "text/plain": [
-       "Arch(nodes=ArchNodes([Memory(name='MainMemory', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='All', may_keep='<Nothing if keep is defined, else All>', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Network(name='NoC', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs()), Array(spatial=[Spatial(name='X', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], nodes=ArchNodes([Memory(name='GlobalBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, 
extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Memory(name='RowBuffer', spatial=[Spatial(name='X', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, 
size='inf'), Memory(name='ColumnBuffer', spatial=[Spatial(name='Y', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Memory(name='DistributedBuffer', spatial=[Spatial(name='X', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=2, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], 
enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf')])), Memory(name='Scratchpad', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='<Defaults to Nothing>', may_keep='<Nothing if keep is defined, else All>', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Compute(name='MAC', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[Action(name='compute', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras())], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, 
total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs())]), variables=EvalExtras(), extra_attributes_for_all_component_models=EvalExtras())"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "spec = af.Spec.from_yaml(\n",
-    "    af.examples.arches.networked.flat\n",
+    "    af.examples.arches.networked.flat,\n",
+    "    af.examples.workloads.matmuls,\n",
+    "    jinja_parse_data={\"N_EINSUMS\": 1}\n",
     ")\n",
     "spec.arch"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "d2bbda8a",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[[Memory(name='MainMemory', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for MainMemory action read.', 'Setting MainMemory energy to action.energy=0', 'Calculating energy for MainMemory action write.', 'Setting MainMemory energy to action.energy=0', 'Calculating latency for MainMemory action read.', 'Setting MainMemory latency to action.latency=0', 'Calculating latency for MainMemory action write.', 'Setting MainMemory latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for MainMemory action read.', 'Setting MainMemory energy to action.energy=0', 'Calculating energy for MainMemory action write.', 'Setting MainMemory energy to action.energy=0', 'Calculating latency for MainMemory action read.', 'Setting MainMemory latency to action.latency=0', 'Calculating latency for MainMemory action write.', 'Setting MainMemory latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep=InvertibleSet(frozenset()), may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, 
size=inf),\n",
-       "  Network(name='NoC', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Using predefined leak power value self.leak_power=0'], actions=[], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs()),\n",
-       "  Memory(name='GlobalBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for GlobalBuffer action read.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating energy for GlobalBuffer action write.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating latency for GlobalBuffer action read.', 'Setting GlobalBuffer latency to action.latency=0', 'Calculating latency for GlobalBuffer action write.', 'Setting GlobalBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for GlobalBuffer action read.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating energy for GlobalBuffer action write.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating latency for GlobalBuffer action read.', 'Setting GlobalBuffer latency to action.latency=0', 'Calculating latency for GlobalBuffer action write.', 'Setting GlobalBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, 
bits_per_action=None, size=inf),\n",
-       "  Memory(name='RowBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for RowBuffer action read.', 'Setting RowBuffer energy to action.energy=0', 'Calculating energy for RowBuffer action write.', 'Setting RowBuffer energy to action.energy=0', 'Calculating latency for RowBuffer action read.', 'Setting RowBuffer latency to action.latency=0', 'Calculating latency for RowBuffer action write.', 'Setting RowBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for RowBuffer action read.', 'Setting RowBuffer energy to action.energy=0', 'Calculating energy for RowBuffer action write.', 'Setting RowBuffer energy to action.energy=0', 'Calculating latency for RowBuffer action read.', 'Setting RowBuffer latency to action.latency=0', 'Calculating latency for RowBuffer action write.', 'Setting RowBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf),\n",
-       "  Memory(name='ColumnBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for ColumnBuffer action read.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating energy for ColumnBuffer action write.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating latency for ColumnBuffer action read.', 'Setting ColumnBuffer latency to action.latency=0', 'Calculating latency for ColumnBuffer action write.', 'Setting ColumnBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for ColumnBuffer action read.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating energy for ColumnBuffer action write.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating latency for ColumnBuffer action read.', 'Setting ColumnBuffer latency to action.latency=0', 'Calculating latency for ColumnBuffer action write.', 'Setting ColumnBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, 
bits_per_action=None, size=inf),\n",
-       "  Memory(name='DistributedBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for DistributedBuffer action read.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating energy for DistributedBuffer action write.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating latency for DistributedBuffer action read.', 'Setting DistributedBuffer latency to action.latency=0', 'Calculating latency for DistributedBuffer action write.', 'Setting DistributedBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for DistributedBuffer action read.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating energy for DistributedBuffer action write.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating latency for DistributedBuffer action read.', 'Setting DistributedBuffer latency to action.latency=0', 'Calculating latency for DistributedBuffer action write.', 'Setting DistributedBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), 
tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf),\n",
-       "  Array(spatial=[Spatial(name='X', fanout=4, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=4, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False)], nodes=[Memory(name='GlobalBuffer', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for GlobalBuffer action read.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating energy for GlobalBuffer action write.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating latency for GlobalBuffer action read.', 'Setting GlobalBuffer latency to action.latency=0', 'Calculating latency for GlobalBuffer action write.', 'Setting GlobalBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for GlobalBuffer action read.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating energy for GlobalBuffer action write.', 'Setting GlobalBuffer energy to action.energy=0', 'Calculating latency for GlobalBuffer action read.', 'Setting GlobalBuffer latency to action.latency=0', 'Calculating latency for GlobalBuffer action write.', 'Setting GlobalBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, 
total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf), Memory(name='RowBuffer', spatial=[Spatial(name='X', fanout=4, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for RowBuffer action read.', 'Setting RowBuffer energy to action.energy=0', 'Calculating energy for RowBuffer action write.', 'Setting RowBuffer energy to action.energy=0', 'Calculating latency for RowBuffer action read.', 'Setting RowBuffer latency to action.latency=0', 'Calculating latency for RowBuffer action write.', 'Setting RowBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for RowBuffer action read.', 'Setting RowBuffer energy to action.energy=0', 'Calculating energy for RowBuffer action write.', 'Setting RowBuffer energy to action.energy=0', 'Calculating latency for RowBuffer action read.', 'Setting RowBuffer latency to action.latency=0', 'Calculating latency for RowBuffer action write.', 'Setting RowBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), 
bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf), Memory(name='ColumnBuffer', spatial=[Spatial(name='Y', fanout=4, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for ColumnBuffer action read.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating energy for ColumnBuffer action write.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating latency for ColumnBuffer action read.', 'Setting ColumnBuffer latency to action.latency=0', 'Calculating latency for ColumnBuffer action write.', 'Setting ColumnBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for ColumnBuffer action read.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating energy for ColumnBuffer action write.', 'Setting ColumnBuffer energy to action.energy=0', 'Calculating latency for ColumnBuffer action read.', 'Setting ColumnBuffer latency to action.latency=0', 'Calculating latency for ColumnBuffer action write.', 'Setting ColumnBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, 
extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf), Memory(name='DistributedBuffer', spatial=[Spatial(name='X', fanout=2, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=2, may_reuse=InvertibleSet(frozenset()), loop_bounds=[], min_usage=0.0, reuse=InvertibleSet(frozenset()), usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for DistributedBuffer action read.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating energy for DistributedBuffer action write.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating latency for DistributedBuffer action read.', 'Setting DistributedBuffer latency to action.latency=0', 'Calculating latency for DistributedBuffer action write.', 'Setting DistributedBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for DistributedBuffer action read.', 'Setting DistributedBuffer energy to action.energy=0', 'Calculating energy for DistributedBuffer action write.', 
'Setting DistributedBuffer energy to action.energy=0', 'Calculating latency for DistributedBuffer action read.', 'Setting DistributedBuffer latency to action.latency=0', 'Calculating latency for DistributedBuffer action write.', 'Setting DistributedBuffer latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, size=inf)]),\n",
-       "  Memory(name='Scratchpad', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for Scratchpad action read.', 'Setting Scratchpad energy to action.energy=0', 'Calculating energy for Scratchpad action write.', 'Setting Scratchpad energy to action.energy=0', 'Calculating latency for Scratchpad action read.', 'Setting Scratchpad latency to action.latency=0', 'Calculating latency for Scratchpad action write.', 'Setting Scratchpad latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for Scratchpad action read.', 'Setting Scratchpad energy to action.energy=0', 'Calculating energy for Scratchpad action write.', 'Setting Scratchpad energy to action.energy=0', 'Calculating latency for Scratchpad action read.', 'Setting Scratchpad latency to action.latency=0', 'Calculating latency for Scratchpad action write.', 'Setting Scratchpad latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action=1)], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep=InvertibleSet(frozenset()), may_keep=InvertibleSet(frozenset()), back=InvertibleSet(frozenset()), tile_shape=[], no_refetch_from_above=InvertibleSet(frozenset()), tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={}, bits_per_action=None, 
size=inf),\n",
-       "  Compute(name='MAC', spatial=[], component_class=None, component_model=None, component_modeling_log=['Using predefined area value self.area=0', 'Calculating energy for MAC action compute.', 'Setting MAC energy to action.energy=0', 'Calculating latency for MAC action compute.', 'Setting MAC latency to action.latency=0', 'Using predefined leak power value self.leak_power=0', 'Using predefined area value self.area=0', 'Calculating energy for MAC action compute.', 'Setting MAC energy to action.energy=0', 'Calculating latency for MAC action compute.', 'Setting MAC latency to action.latency=0', 'Using predefined leak power value self.leak_power=0'], actions=[Action(name='compute', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras())], enabled=True, area=0, total_area=0, area_scale=1, leak_power=0, total_leak_power=0, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs())]]"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
+   "source": [
+    "result = spec.map_workload_to_arch()\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e2b3332",
+   "metadata": {},
+   "outputs": [],
    "source": [
-    "spec.calculate_component_area_energy_latency_leak()._get_flattened_architecture()"
+    "from accelforge.plotting.mappings import plot_energy_breakdown\n",
+    "\n",
+    "plot_energy_breakdown([result], separate_by=[\"component\"], stack_by=[\"tensor\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "741719fa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spec = af.Spec.from_yaml(\n",
+    "    af.examples.arches.networked.flat,\n",
+    "    af.examples.workloads.matmuls,\n",
+    "    jinja_parse_data={\"N_EINSUMS\": 1, \"N_ROW_BUFFER\": 1, \"N_COL_BUFFER\": 1},\n",
+    ")\n",
+    "spec.arch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a62e6dfa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = spec.map_workload_to_arch()\n",
+    "plot_energy_breakdown([result], separate_by=[\"component\"], stack_by=[\"tensor\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "929f5399",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result.data[[c for c in result.data.columns if \"hops\" in c]]"
    ]
   },
   {
@@ -325,100 +156,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "cc2df4b6",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/svg+xml": [
-       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
-       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
-       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
-       "<!-- Generated by graphviz version 2.43.0 (0)\n",
-       " -->\n",
-       "<!-- Title: G Pages: 1 -->\n",
-       "<svg width=\"151pt\" height=\"231pt\"\n",
-       " viewBox=\"0.00 0.00 151.00 230.75\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
-       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 226.75)\">\n",
-       "<title>G</title>\n",
-       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-226.75 147,-226.75 147,4 -4,4\"/>\n",
-       "<!-- Memory_125810197377456 -->\n",
-       "<g id=\"node1\" class=\"node\">\n",
-       "<title>Memory_125810197377456</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M143,-219.48C143,-221.28 110.95,-222.75 71.5,-222.75 32.05,-222.75 0,-221.28 0,-219.48 0,-219.48 0,-190.02 0,-190.02 0,-188.22 32.05,-186.75 71.5,-186.75 110.95,-186.75 143,-188.22 143,-190.02 143,-190.02 143,-219.48 143,-219.48\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M143,-219.48C143,-217.67 110.95,-216.2 71.5,-216.2 32.05,-216.2 0,-217.67 0,-219.48\"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-201.65\" font-family=\"Arial\" font-size=\"12.00\">MainMemory with size inf</text>\n",
-       "</g>\n",
-       "<!-- Network_125810192413344 -->\n",
-       "<g id=\"node2\" class=\"node\">\n",
-       "<title>Network_125810192413344</title>\n",
-       "<polygon fill=\"#faf8c8\" stroke=\"black\" points=\"89.5,-178.75 53.5,-178.75 53.5,-142.75 89.5,-142.75 89.5,-178.75\"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"65.5,-178.75 53.5,-166.75 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"53.5,-154.75 65.5,-142.75 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"77.5,-142.75 89.5,-154.75 \"/>\n",
-       "<polyline fill=\"none\" stroke=\"black\" points=\"89.5,-166.75 77.5,-178.75 \"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-157.65\" font-family=\"Arial\" font-size=\"12.00\">NoC</text>\n",
-       "</g>\n",
-       "<!-- Memory_125810197377456&#45;&#45;Network_125810192413344 -->\n",
-       "<g id=\"edge1\" class=\"edge\">\n",
-       "<title>Memory_125810197377456&#45;&#45;Network_125810192413344</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M71.5,-186.67C71.5,-184.12 71.5,-181.5 71.5,-178.95\"/>\n",
-       "</g>\n",
-       "<!-- Memory_125810192404624 -->\n",
-       "<g id=\"node3\" class=\"node\">\n",
-       "<title>Memory_125810192404624</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M122,-130.4C122,-132.73 99.37,-134.63 71.5,-134.63 43.63,-134.63 21,-132.73 21,-130.4 21,-130.4 21,-92.35 21,-92.35 21,-90.02 43.63,-88.12 71.5,-88.12 99.37,-88.12 122,-90.02 122,-92.35 122,-92.35 122,-130.4 122,-130.4\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M122,-130.4C122,-128.07 99.37,-126.17 71.5,-126.17 43.63,-126.17 21,-128.07 21,-130.4\"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-114.78\" font-family=\"Arial\" font-size=\"12.00\">HBM with size inf</text>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-101.78\" font-family=\"Arial\" font-size=\"12.00\">[4× X, 4× Y]</text>\n",
-       "</g>\n",
-       "<!-- Network_125810192413344&#45;&#45;Memory_125810192404624 -->\n",
-       "<g id=\"edge2\" class=\"edge\">\n",
-       "<title>Network_125810192413344&#45;&#45;Memory_125810192404624</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M71.5,-142.65C71.5,-140.1 71.5,-137.45 71.5,-134.81\"/>\n",
-       "</g>\n",
-       "<!-- Memory_125810192599872 -->\n",
-       "<g id=\"node4\" class=\"node\">\n",
-       "<title>Memory_125810192599872</title>\n",
-       "<path fill=\"#d7fcd7\" stroke=\"black\" d=\"M124,-76.73C124,-78.53 100.47,-80 71.5,-80 42.53,-80 19,-78.53 19,-76.73 19,-76.73 19,-47.27 19,-47.27 19,-45.47 42.53,-44 71.5,-44 100.47,-44 124,-45.47 124,-47.27 124,-47.27 124,-76.73 124,-76.73\"/>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M124,-76.73C124,-74.92 100.47,-73.45 71.5,-73.45 42.53,-73.45 19,-74.92 19,-76.73\"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-58.9\" font-family=\"Arial\" font-size=\"12.00\">Buffer with size inf</text>\n",
-       "</g>\n",
-       "<!-- Memory_125810192404624&#45;&#45;Memory_125810192599872 -->\n",
-       "<g id=\"edge3\" class=\"edge\">\n",
-       "<title>Memory_125810192404624&#45;&#45;Memory_125810192599872</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M71.5,-88.07C71.5,-85.39 71.5,-82.67 71.5,-80.08\"/>\n",
-       "</g>\n",
-       "<!-- Compute_125810192606352 -->\n",
-       "<g id=\"node5\" class=\"node\">\n",
-       "<title>Compute_125810192606352</title>\n",
-       "<ellipse fill=\"#e0eeff\" stroke=\"black\" cx=\"71.5\" cy=\"-18\" rx=\"27\" ry=\"18\"/>\n",
-       "<text text-anchor=\"middle\" x=\"71.5\" y=\"-14.9\" font-family=\"Arial\" font-size=\"12.00\">MAC</text>\n",
-       "</g>\n",
-       "<!-- Memory_125810192599872&#45;&#45;Compute_125810192606352 -->\n",
-       "<g id=\"edge4\" class=\"edge\">\n",
-       "<title>Memory_125810192599872&#45;&#45;Compute_125810192606352</title>\n",
-       "<path fill=\"none\" stroke=\"black\" d=\"M71.5,-43.92C71.5,-41.37 71.5,-38.75 71.5,-36.2\"/>\n",
-       "</g>\n",
-       "</g>\n",
-       "</svg>\n"
-      ],
-      "text/plain": [
-       "Arch(nodes=ArchNodes([Memory(name='MainMemory', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='All', may_keep='<Nothing if keep is defined, else All>', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Network(name='NoC', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs()), Memory(name='HBM', spatial=[Spatial(name='X', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False), Spatial(name='Y', fanout=4, may_reuse='All', loop_bounds=[], min_usage=0.0, reuse='Nothing', usage_scale=1, power_gateable=False)], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), 
bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='~MainMemory', may_keep='All', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Memory(name='Buffer', spatial=[], component_class=None, component_model=None, component_modeling_log=[], actions=[TensorHolderAction(name='read', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action'), TensorHolderAction(name='write', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras(), bits_per_action='1 if bits_per_action is None else bits_per_action')], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs(), tensors=Tensors(keep='<Defaults to Nothing>', may_keep='<Nothing if keep is defined, else All>', back='Nothing', tile_shape=[], no_refetch_from_above='~All', tensor_order_options=[], force_memory_hierarchy_order=True), bits_per_value_scale={'All': 1}, bits_per_action=None, size='inf'), Compute(name='MAC', spatial=[], component_class=None, component_model=None, component_modeling_log=[], 
actions=[Action(name='compute', energy=0, energy_scale=1, latency=0, latency_scale=1, extra_attributes_for_component_model=EvalExtras())], enabled=True, area=0, total_area=None, area_scale=1, leak_power=0, total_leak_power=None, leak_power_scale=1, energy_scale=1, total_latency='sum(*action2latency.values())', latency_scale=1, n_parallel_instances=1, extra_attributes_for_component_model=_ExtraAttrs())]), variables=EvalExtras(), extra_attributes_for_all_component_models=EvalExtras())"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "spec = af.Spec.from_yaml(\n",
     "    af.examples.arches.networked.rack\n",
     ")\n",
     "spec.arch"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3e54780a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -436,8 +191,7 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "pygments_lexer": "ipython3"
   }
  },
  "nbformat": 4,
diff --git a/tests/test_network.py b/tests/test_network.py
index e8598833..601dc713 100644
--- a/tests/test_network.py
+++ b/tests/test_network.py
@@ -2,18 +2,20 @@
 
 import accelforge as af
 
+af.set_n_parallel_jobs(1)
+
 
 class TestParsing(TestCase):
     def test_hierarchical(self):
         spec = af.Spec.from_yaml(
             af.examples.arches.networked.hierarchical,
         )
-        self.assertIn("PeArray", spec.arch.nodes)
-        self.assertEqual(spec.arch.nodes["PeArray"].get_fanout(), 1)
+        self.assertIn("PeNoc", spec.arch.nodes)
+        self.assertEqual(spec.arch.nodes["PeNoc"].get_fanout(), 1)
         self.assertIn("Scratchpad", spec.arch.nodes)
         self.assertEqual(spec.arch.nodes["Scratchpad"].get_fanout(), 4)
-        self.assertIn("MacArray", spec.arch.nodes)
-        self.assertEqual(spec.arch.nodes["MacArray"].get_fanout(), 1)
+        self.assertIn("MacNoc", spec.arch.nodes)
+        self.assertEqual(spec.arch.nodes["MacNoc"].get_fanout(), 1)
 
         try:
             spec = spec.calculate_component_area_energy_latency_leak()
@@ -50,34 +52,34 @@ def test_hierarchical(self):
             af.examples.workloads.matmuls,
             af.examples.arches.networked.hierarchical,
             af.examples.mappings.one_matmul_to_networked_hierarchical,
-            jinja_parse_data={"N_EINSUMS": 1, "M": 8, "KN": 8, "MAC_TILE": MAC_TILE, "M_TILE": M_TILE}
+            jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN, "MAC_TILE": MAC_TILE, "M_TILE": M_TILE}
         )
         result = spec.evaluate_mapping()
         self.assertEqual(
-            result.data["Matmul0<SEP>action<SEP>MacArray<SEP>T0<SEP>hops"].iloc[0],
+            result.data["Matmul0<SEP>action<SEP>MacNoc<SEP>T0<SEP>hops"].iloc[0],
             (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (0.5*MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE
         )
         # NOTE: assuming XY routing (as defined in mapping)
         self.assertEqual(
-            result.data["Matmul0<SEP>action<SEP>MacArray<SEP>T1<SEP>hops"].iloc[0],
+            result.data["Matmul0<SEP>action<SEP>MacNoc<SEP>T1<SEP>hops"].iloc[0],
             (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE
         )
         self.assertEqual(
-            result.data["Matmul0<SEP>action<SEP>MacArray<SEP>W0<SEP>hops"].iloc[0],
+            result.data["Matmul0<SEP>action<SEP>MacNoc<SEP>W0<SEP>hops"].iloc[0],
             (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE
         )
 
         self.assertEqual(
-            result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T0<SEP>hops"].iloc[0],
+            result.data["Matmul0<SEP>action<SEP>PeNoc<SEP>T0<SEP>hops"].iloc[0],
             (M/M_TILE) * (0.5*PE_TILE*(PE_TILE-1) + PE_TILE*(PE_TILE-1)) * M_TILE*MAC_TILE*BITS_PER_VALUE
         )
         # NOTE: assuming XY routing (as defined in mapping)
         self.assertEqual(
-            result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T1<SEP>hops"].iloc[0],
+            result.data["Matmul0<SEP>action<SEP>PeNoc<SEP>T1<SEP>hops"].iloc[0],
             (M/M_TILE) * (PE_TILE*(PE_TILE-1) + PE_TILE*0.5*PE_TILE*(PE_TILE-1)) * M_TILE*MAC_TILE*BITS_PER_VALUE
         )
         self.assertEqual(
-            result.data["Matmul0<SEP>action<SEP>PeArray<SEP>W0<SEP>hops"].iloc[0],
+            result.data["Matmul0<SEP>action<SEP>PeNoc<SEP>W0<SEP>hops"].iloc[0],
             (M/M_TILE) * (PE_TILE*0.5*PE_TILE*(PE_TILE-1) + PE_TILE*0.5*PE_TILE*(PE_TILE-1)) * MAC_TILE**2*BITS_PER_VALUE
         )
 
@@ -87,40 +89,33 @@ def test_hierarchical(self):
         M = 8
         KN = 8
         MAC_TILE = 2
-        PE_TILE = KN//MAC_TILE
         M_TILE = 4
-        BITS_PER_VALUE = 8
 
         spec = af.Spec.from_yaml(
             af.examples.workloads.matmuls,
             af.examples.arches.networked.hierarchical,
-            jinja_parse_data={"N_EINSUMS": 1, "M": 8, "KN": 8, "MAC_TILE": MAC_TILE, "M_TILE": M_TILE}
+            jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN}
         )
         result = spec.map_workload_to_arch()
-        # self.assertEqual(
-        #     result.data["Matmul0<SEP>action<SEP>MacArray<SEP>T0<SEP>hops"].iloc[0],
-        #     (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (0.5*MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE
-        # )
-        # # NOTE: assuming XY routing (as defined in mapping)
-        # self.assertEqual(
-        #     result.data["Matmul0<SEP>action<SEP>MacArray<SEP>T1<SEP>hops"].iloc[0],
-        #     (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE
-        # )
-        # self.assertEqual(
-        #     result.data["Matmul0<SEP>action<SEP>MacArray<SEP>W0<SEP>hops"].iloc[0],
-        #     (M/M_TILE)*(KN/MAC_TILE)**2 * M_TILE * (MAC_TILE*(MAC_TILE-1) + MAC_TILE*(MAC_TILE-1)) * BITS_PER_VALUE
-        # )
-
-        # self.assertEqual(
-        #     result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T0<SEP>hops"].iloc[0],
-        #     (M/M_TILE) * (0.5*PE_TILE*(PE_TILE-1) + PE_TILE*(PE_TILE-1)) * M_TILE*MAC_TILE*BITS_PER_VALUE
-        # )
-        # # NOTE: assuming XY routing (as defined in mapping)
-        # self.assertEqual(
-        #     result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T1<SEP>hops"].iloc[0],
-        #     (M/M_TILE) * (PE_TILE*(PE_TILE-1) + PE_TILE*0.5*PE_TILE*(PE_TILE-1)) * M_TILE*MAC_TILE*BITS_PER_VALUE
-        # )
-        # self.assertEqual(
-        #     result.data["Matmul0<SEP>action<SEP>PeArray<SEP>W0<SEP>hops"].iloc[0],
-        #     (M/M_TILE) * (PE_TILE*0.5*PE_TILE*(PE_TILE-1) + PE_TILE*0.5*PE_TILE*(PE_TILE-1)) * MAC_TILE**2*BITS_PER_VALUE
-        # )
\ No newline at end of file
+
+    def test_flat(self):
+        M = 8
+        KN = 8
+
+        spec = af.Spec.from_yaml(
+            af.examples.workloads.matmuls,
+            af.examples.arches.networked.flat,
+            jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN}
+        )
+        result = spec.map_workload_to_arch()
+
+    def test_flat_one_row_buffer(self):
+        M = 8
+        KN = 8
+
+        spec = af.Spec.from_yaml(
+            af.examples.workloads.matmuls,
+            af.examples.arches.networked.flat,
+            jinja_parse_data={"N_EINSUMS": 1, "M": M, "KN": KN, "N_ROW_BUFFER": 1}
+        )
+        result = spec.map_workload_to_arch()
\ No newline at end of file

From 64470bb86a8af8e1ee39f6980b7e2a5b9afd01cf Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Thu, 21 May 2026 19:56:53 -0400
Subject: [PATCH 02/12] [model] Use flattened arch

---
 .../model/_looptree/reuse/symbolic/symbolic/_network.py   | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/accelforge/model/_looptree/reuse/symbolic/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/symbolic/_network.py
index 6870da58..b143f10a 100644
--- a/accelforge/model/_looptree/reuse/symbolic/symbolic/_network.py
+++ b/accelforge/model/_looptree/reuse/symbolic/symbolic/_network.py
@@ -34,6 +34,8 @@ def accumulate_child_result(
         child_shape,
         node,
     ):
+        flattened_arch = info.job.flattened_arch
+
         for network, child_network_stats in child_result.network_stats.items():
             if network not in self.network_stats:
                 self.network_stats[network] = NetworkStats()
@@ -48,7 +50,7 @@ def accumulate_child_result(
             )
             projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)]
             component_object = find_component_object(
-                network.component, info.job.flattened_arch
+                network.component, flattened_arch
             )
             workload_bpv = info.job.einsum.tensor_accesses[
                 network.tensor
@@ -66,9 +68,7 @@ def accumulate_child_result(
                 * actions_per_value
             )
 
-            if info.job.spec_one_einsum.arch.is_above(
-                node.component, network.component
-            ):
+            if is_component_a_above_b(node.component, network.component, flattened_arch):
                 continue
 
             relevancy = info.tensor_to_relevancy[network.tensor][node.rank_variable]

From d34e44b661db3ceef44212d6a27b2f5049f57038 Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Fri, 22 May 2026 11:38:26 -0400
Subject: [PATCH 03/12] [frontend] Add comment to explain physical_fanout

---
 accelforge/frontend/arch/structure.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/accelforge/frontend/arch/structure.py b/accelforge/frontend/arch/structure.py
index fe2bf580..ac75d03b 100644
--- a/accelforge/frontend/arch/structure.py
+++ b/accelforge/frontend/arch/structure.py
@@ -333,6 +333,10 @@ def _flatten(
 
         nodes = []
 
+        # Nodes inside an array are flattened into a single hierarchy so
+        # that the architecture can be mapped as a hierarchical model.
+        # However, we keep the information about how these nodes are
+        # arranged, because the modeling step still needs it.
         for node in self.nodes:
             try:
                 if isinstance(node, Branch):

From 4b3fcabcd670e4b81d425d1096c4a9e798859789 Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Fri, 22 May 2026 13:44:09 -0400
Subject: [PATCH 04/12] [network] Add *untested* distributed model

---
 accelforge/frontend/arch/spatialable.py       |   4 +-
 .../_looptree/reuse/symbolic/_network.py      | 124 ++++++++----------
 tests/test_network.py                         |  36 ++---
 3 files changed, 72 insertions(+), 92 deletions(-)

diff --git a/accelforge/frontend/arch/spatialable.py b/accelforge/frontend/arch/spatialable.py
index 367f4506..a5a3e286 100644
--- a/accelforge/frontend/arch/spatialable.py
+++ b/accelforge/frontend/arch/spatialable.py
@@ -142,11 +142,11 @@ def _get_physical_fanout_along(self, dim_name: str, default: int = 1) -> int:
                 return s.fanout
         return default
 
-    def _get_physical_stride_along(self, dim_name: str, default: int = 1) -> int:
+    def _get_physical_stride_along(self, dim_name: str) -> int:
         for s in self._physical_spatial:
             if s.name == dim_name:
                 return s.stride
-        return default
+        raise ValueError(f"dimension {dim_name} not found")
 
     def _spatial_str(self, include_newline=True) -> str:
         if not self.spatial:
diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py
index 28a3b186..4c7cc9ea 100644
--- a/accelforge/model/_looptree/reuse/symbolic/_network.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_network.py
@@ -12,10 +12,10 @@
     PartiallyRelevant,
 )
 
-from accelforge.util._sympy.broadcast_max import Min, Max, MaxGeqZero
+from accelforge.util._sympy.broadcast_max import MaxGeqZero, MinGeqZero
 
 from ._common import AnalysisInfo
-from ._stats import NetworkStats
+from ._stats import NetworkStats, SymbolicAnalysisOutput
 
 
 class NetworkAnalyzer:
@@ -25,7 +25,7 @@ def __init__(self, network_stats):
 
     def accumulate_child_result(
         self,
-        child_result,
+        child_result: SymbolicAnalysisOutput,
         info: AnalysisInfo,
         shape_repeats,
         einsum_name,
@@ -35,6 +35,7 @@ def accumulate_child_result(
         flattened_arch = info.job.flattened_arch
 
         for network, child_network_stats in child_result.network_stats.items():
+            src_component = flattened_arch[network.source.level]
             if network not in self.network_stats:
                 self.network_stats[network] = NetworkStats()
             accumulated_network_stats = self.network_stats[network]
@@ -64,93 +65,72 @@ def accumulate_child_result(
                 * actions_per_value
             )
 
-            if is_component_a_above_b(node.component, network.component, flattened_arch):
+            if flattened_arch.is_above(node.component, network.component):
                 continue
 
             relevancy = info.tensor_to_relevancy[network.tensor][node.rank_variable]
 
+            # The fanout in this dimension in mapping nodes below, i.e., the stride
             last_fanout = child_result.fanout.get((node.component, einsum_name), {})
             last_fanout = last_fanout.get(node.name, 1)
             if isinstance(relevancy, Irrelevant):
-                # Cost of multicasting is the cost of delivering along the dimension
-                multicast_hops = shape_repeats * last_fanout
-                multicast_cost = multicast_hops * volume
-                self.overall_max_hops += multicast_hops
-
-                accumulated_network_stats.total_hops += multicast_cost
-                accumulated_network_stats.max_hops = MaxGeqZero(
-                    accumulated_network_stats.max_hops,
-                    self.overall_max_hops + child_network_stats.max_hops,
-                )
+                # Distributed or not, the amount of total cost is the same.
+                # However, the accesses now come from different physical memories
+                total_cost = multicast_cost(shape_repeats, last_fanout)*volume
+                max_hops = shape_repeats*last_fanout
             elif isinstance(relevancy, Relevant):
-                # Cost of unicast is the cost of delivering to each point in
-                # the dimension with shape as stride
-                # TODO: we should use the actual stride
-                total_unicast_cost = (
-                    0.5 * (shape_repeats + 1) * shape_repeats * last_fanout * volume
-                )
-                max_unicast_hops = shape_repeats * last_fanout
-                self.overall_max_hops += max_unicast_hops
-
-                accumulated_network_stats.total_hops += total_unicast_cost
-                accumulated_network_stats.max_hops = MaxGeqZero(
-                    accumulated_network_stats.max_hops,
-                    self.overall_max_hops + child_network_stats.max_hops,
-                )
+                # If distributed, then we bind data as locally as possible in the
+                # physical buffers
+                if src_component._get_physical_fanout_along(node.name) > 1:
+                    physical_stride = src_component._get_physical_stride_along(node.name)
+
+                    n_dsts_per_physical = MinGeqZero(
+                        # if last_fanout > physical_stride, set n_dst to 1, which results in 0 hops
+                        # later (which is correct because the set of destinations always overlap
+                        # the set of sources).
+                        MaxGeqZero(physical_stride / last_fanout, 1),
+                        shape_repeats
+                    )
+                    n_activated_physical = MaxGeqZero(shape_repeats*last_fanout/physical_stride, 1)
+                    total_cost = (
+                        n_activated_physical
+                        *
+                        unicast_cost(n_dsts_per_physical, last_fanout)
+                        *
+                        volume
+                    )
+                    max_hops = MinGeqZero(shape_repeats*last_fanout, physical_stride)
+                else:
+                    total_cost = unicast_cost(shape_repeats, last_fanout)*volume
+                    max_hops = shape_repeats * last_fanout
             elif isinstance(relevancy, PartiallyRelevant):
                 raise NotImplementedError()
             else:
                 raise RuntimeError(f"unhandled relevancy type {relevancy}")
 
-        return self.overall_max_hops
-
-
-def reduce_dicts(dict1: dict, dict2: dict, reduce_op):
-    for key in dict1:
-        if key not in dict2:
-            dict2[key] = dict1[key]
-        else:
-            dict2[key] = reduce_op(dict1[key], dict2[key])
-
-
-def get_total_to_per_unit(total, max_per_unit):
-    if total == 0 and max_per_unit != 0:
-        raise ValueError(f"total is 0 but max_per_unit is {max_per_unit}")
-    if total == 0:
-        return 1
-    return max_per_unit / total
+            # TODO: this is sketchy
+            self.overall_max_hops += max_hops
 
+            accumulated_network_stats.total_hops += total_cost
+            accumulated_network_stats.max_hops = MaxGeqZero(
+                accumulated_network_stats.max_hops,
+                self.overall_max_hops + child_network_stats.max_hops,
+            )
 
-def has_parent_tensor_holder(
-    tensor: TensorName, node_idx: int, info
-) -> bool:
-    for node in info.mapping[:node_idx]:
-        if isinstance(node, TensorHolder) and tensor in node.tensors:
-            return True
-    return False
+        return self.overall_max_hops
 
 
-def find_component_object(
-    component: str, flattened_arch: list[arch.Leaf]
-) -> arch.TensorHolder:
-    for node in flattened_arch:
-        if node.name == component:
-            return node
-    raise ValueError(f"Component {component} not found in flattened arch")
+def multicast_cost(n_dsts, stride):
+    """Return the total hop count of a multicast along one dimension: `(n_dsts - 1) * stride`."""
+    return stride * (n_dsts - 1)
 
 
-def is_component_a_above_b(component_a: str, component_b: str, flattened_arch):
-    a_found = False
-    b_found = False
-    for node in flattened_arch:
-        if node.name == component_a:
-            a_found = True
-        if node.name == component_b:
-            b_found = True
+def unicast_cost(n_dsts, stride):
+    """Return the total hop count of unicasting to every destination along one dimension."""
+    # The result is stride * (1 + 2 + ... + (n_dsts - 1)): each successive
+    # destination is charged one more `stride` than the one before it.
+    return stride * arithmetic_sum(n_dsts - 1)
 
-        if a_found and not b_found:
-            return True
-        elif b_found and not a_found:
-            return False
-    raise ValueError(f"Neither {component_a} nor {component_b} found in flattened arch")
 
+def arithmetic_sum(n):  # closed form of 1 + 2 + ... + n
+    return 0.5 * (n+1) * n
\ No newline at end of file
diff --git a/tests/test_network.py b/tests/test_network.py
index 406a7017..e7cd640f 100644
--- a/tests/test_network.py
+++ b/tests/test_network.py
@@ -78,7 +78,7 @@ def test_hierarchical_1d(self):
             * (KN / MAC_TILE)  # number of used Scratchpad
             * M_TILE
             * KN  # temporal for n1 in mapping
-            * sum(i+1 for i in range(MAC_TILE))  # unicast along X-axis of MacArray
+            * sum(i for i in range(MAC_TILE))  # unicast along X-axis of MacArray
             * BITS_PER_VALUE,
         )
         # NOTE: assuming XY routing (as defined in mapping)
@@ -88,7 +88,7 @@ def test_hierarchical_1d(self):
             * (KN / MAC_TILE)
             * M_TILE
             * KN  # temporal for n1 in mapping
-            * MAC_TILE   # multicast along X-axis of MacArray
+            * (MAC_TILE - 1)   # multicast along X-axis of MacArray
             * BITS_PER_VALUE,
         )
         self.assertEqual(
@@ -97,14 +97,14 @@ def test_hierarchical_1d(self):
             * (KN / MAC_TILE)
             * M_TILE
             * KN
-            * sum(i+1 for i in range(MAC_TILE))
+            * sum(i for i in range(MAC_TILE))
             * BITS_PER_VALUE,
         )
 
         self.assertEqual(
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T0<SEP>hops"].iloc[0],
             (M / M_TILE)
-            * sum(i+1 for i in range(KN // MAC_TILE))  # unicast along X-axis of PeArray
+            * sum(i for i in range(KN // MAC_TILE))  # unicast along X-axis of PeArray
             * M_TILE
             * MAC_TILE
             * BITS_PER_VALUE,
@@ -113,7 +113,7 @@ def test_hierarchical_1d(self):
         self.assertEqual(
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T1<SEP>hops"].iloc[0],
             (M / M_TILE)
-            * KN // MAC_TILE  # multicast along X-axis of PeArray
+            * (KN // MAC_TILE - 1)  # multicast along X-axis of PeArray
             * M_TILE
             * KN
             * BITS_PER_VALUE,
@@ -121,7 +121,7 @@ def test_hierarchical_1d(self):
         self.assertEqual(
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>W0<SEP>hops"].iloc[0],
             (M / M_TILE)
-            * sum(i+1 for i in range(KN // MAC_TILE))  # unicast along PeArray
+            * sum(i for i in range(KN // MAC_TILE))  # unicast along PeArray
             * MAC_TILE
             * KN
             * BITS_PER_VALUE,
@@ -156,9 +156,9 @@ def test_hierarchical(self):
             * (KN / MAC_TILE) ** 2
             * M_TILE
             * (
-                sum(i+1 for i in range(MAC_TILE))  # unicasting along X
+                sum(i for i in range(MAC_TILE))  # unicasting along X
                 +
-                MAC_TILE * MAC_TILE  # multicast along Y for each column
+                MAC_TILE * (MAC_TILE-1)  # multicast along Y for each column
             )
             * BITS_PER_VALUE,
         )
@@ -169,9 +169,9 @@ def test_hierarchical(self):
             * (KN / MAC_TILE) ** 2
             * M_TILE
             * (
-                MAC_TILE * MAC_TILE  # multicast along X (the tile is shape N1, which is MAC_TILE here)
+                MAC_TILE * (MAC_TILE - 1)  # multicast along X (the tile is shape N1, which is MAC_TILE here)
                 +
-                MAC_TILE * sum(i+1 for i in range(MAC_TILE))  # unicasting along Y for each row
+                MAC_TILE * sum(i for i in range(MAC_TILE))  # unicasting along Y for each row
             )
             * BITS_PER_VALUE,
         )
@@ -181,9 +181,9 @@ def test_hierarchical(self):
             * (KN / MAC_TILE) ** 2
             * M_TILE
             * (
-                MAC_TILE * sum(i+1 for i in range(MAC_TILE))  # unicast along X (the tile is shape N1, which is MAC_TILE here)
+                MAC_TILE * sum(i for i in range(MAC_TILE))  # unicast along X (the tile is shape N1, which is MAC_TILE here)
                 +
-                MAC_TILE * sum(i+1 for i in range(MAC_TILE))  # unicasting along Y for each row
+                MAC_TILE * sum(i for i in range(MAC_TILE))  # unicasting along Y for each row
             )
             * BITS_PER_VALUE,
         )
@@ -192,9 +192,9 @@ def test_hierarchical(self):
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T0<SEP>hops"].iloc[0],
             (M / M_TILE)
             * (
-                sum(i+1 for i in range(PE_TILE))
+                sum(i for i in range(PE_TILE))
                 +
-                PE_TILE * PE_TILE
+                PE_TILE * (PE_TILE - 1)
             )
             # tile shape
             * M_TILE
@@ -206,9 +206,9 @@ def test_hierarchical(self):
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T1<SEP>hops"].iloc[0],
             (M / M_TILE)
             * (
-                PE_TILE * PE_TILE
+                PE_TILE * (PE_TILE - 1)
                 +
-                PE_TILE * sum(i+1 for i in range(PE_TILE))
+                PE_TILE * sum(i for i in range(PE_TILE))
             )
             * M_TILE
             * MAC_TILE
@@ -218,9 +218,9 @@ def test_hierarchical(self):
             result.data["Matmul0<SEP>action<SEP>PeArray<SEP>W0<SEP>hops"].iloc[0],
             (M / M_TILE)
             * (
-                PE_TILE * sum(i+1 for i in range(PE_TILE))
+                PE_TILE * sum(i for i in range(PE_TILE))
                 +
-                PE_TILE * sum(i+1 for i in range(PE_TILE))
+                PE_TILE * sum(i for i in range(PE_TILE))
             )
             * MAC_TILE**2
             * BITS_PER_VALUE,

From 307196ddaf4a344378baf4fca7705055f8047164 Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Fri, 22 May 2026 15:13:12 -0400
Subject: [PATCH 05/12] [network] Tested distributed buffers

---
 accelforge/frontend/arch/_flattened_arch.py   | 33 ++++++++++
 .../_looptree/reuse/symbolic/_symbolic.py     | 50 +++++++-------
 tests/input_files/networked/flat.yaml         | 31 +++------
 .../networked/one_matmul_to_flat.yaml         | 42 ++++++++++++
 tests/test_network.py                         | 66 ++++++++++++++++++-
 5 files changed, 177 insertions(+), 45 deletions(-)
 create mode 100644 tests/input_files/networked/one_matmul_to_flat.yaml

diff --git a/accelforge/frontend/arch/_flattened_arch.py b/accelforge/frontend/arch/_flattened_arch.py
index a5062b65..2a249a94 100644
--- a/accelforge/frontend/arch/_flattened_arch.py
+++ b/accelforge/frontend/arch/_flattened_arch.py
@@ -1,3 +1,12 @@
+from typing import TypeVar
+
+
+_FIND_SENTINEL = object()
+
+D = TypeVar("D")
+T = TypeVar("T")
+
+
 class FlattenedArch:
     """
     A flattened arch is an architecture spec that has been
@@ -51,3 +60,27 @@ def is_above(self, name_a: str, name_b: str):
         idx_a = self.index(name_a)
         idx_b = self.index(name_b)
         return idx_a < idx_b
+
+    def find_first_of_type_between(
+        self, node_type: T, name_lower: str, name_upper: str, default: D = _FIND_SENTINEL
+    ) -> T | D:
+        """
+        Returns the first node with type `node_type` above `name_lower` and under `name_upper`.
+
+        `name_lower` and `name_upper` are resolved with `self.index` (expected to raise if missing).
+
+        If no node of `node_type` lies strictly between them, either `default`
+        is returned (if provided) or a ValueError is raised.
+        """
+        upper_idx = self.index(name_upper)
+        lower_idx = self.index(name_lower)
+
+        for i, node in enumerate(self.nodes):
+            if not isinstance(node, node_type) or i <= upper_idx or i >= lower_idx:
+                continue
+            else:
+                return node
+        if default is not _FIND_SENTINEL:
+            return default
+        else:
+            raise ValueError(f"node with type {node_type} between {name_upper} and {name_lower} not found")
diff --git a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
index ae075b52..91a6bd32 100755
--- a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
@@ -903,19 +903,21 @@ def analyze_reservation(node_idx, current_shape, info: AnalysisInfo):
     child_result.buffet_stats[buffet] = stats
 
     # Reservation nodes are the first to produce stats for a network
-    network_node = info.job.spec_one_einsum.arch.find_first_of_type_above(
-        NetworkSpec, buffet.level, default=None
-    )
-    if network_node is not None:
-        network = Network(
-            tensor,
-            einsum_name,
-            info.data_movement_connections.get_src(buffet),
-            buffet,
-            component=network_node.name if network_node else network_node,
+    src = info.data_movement_connections.get_src(buffet)
+    if src is not None:
+        network_node = info.job.flattened_arch.find_first_of_type_between(
+            NetworkSpec, buffet.level, src.level, default=None
         )
-        assert network not in child_result.network_stats
-        child_result.network_stats[network] = NetworkStats()
+        if network_node is not None:
+            network = Network(
+                tensor,
+                einsum_name,
+                src,
+                buffet,
+                component=network_node.name if network_node else network_node,
+            )
+            assert network not in child_result.network_stats
+            child_result.network_stats[network] = NetworkStats()
 
     fanout_key = (node.resource, einsum_name)
     if fanout_key not in child_result.fanout:
@@ -965,18 +967,20 @@ def analyze_compute(
         stats.max_occupancy = 1
         result_accumulator.buffet_stats[buffet] = stats
 
-        network_node = info.job.spec_one_einsum.arch.find_first_of_type_above(
-            NetworkSpec, node.component, default=None
-        )
-        if network_node is not None:
-            network = Network(
-                tensor,
-                info.job.einsum_name,
-                info.data_movement_connections.get_src(buffet),
-                buffet,
-                component=network_node.name if network_node else network_node,
+        src = info.data_movement_connections.get_src(buffet)
+        if src is not None:
+            network_node = info.job.flattened_arch.find_first_of_type_between(
+                NetworkSpec, node.component, src.level, default=None
             )
-            result_accumulator.network_stats[network] = NetworkStats()
+            if network_node is not None:
+                network = Network(
+                    tensor,
+                    info.job.einsum_name,
+                    src,
+                    buffet,
+                    component=network_node.name if network_node else network_node,
+                )
+                result_accumulator.network_stats[network] = NetworkStats()
 
     return result_accumulator
 
diff --git a/tests/input_files/networked/flat.yaml b/tests/input_files/networked/flat.yaml
index ff2a05c8..b546ba21 100644
--- a/tests/input_files/networked/flat.yaml
+++ b/tests/input_files/networked/flat.yaml
@@ -1,5 +1,3 @@
-{% set N_ROW_BUFFER = N_ROW_BUFFER | default(4) %}
-{% set N_COL_BUFFER = N_COL_BUFFER | default(4) %}
 arch:
   nodes:
   - !Memory
@@ -12,29 +10,12 @@ arch:
     - {name: read, energy: 100, latency: 0}
     - {name: write, energy: 100, latency: 0}
 
-  - !Network
-    name: NoC
-    area: 0
-    leak_power: 0
-    actions:
-    - {name: hops, energy: 1, latency: 0}
-
   - !Array
     name: Array
     spatial:
     - {name: X, fanout: 4}
     - {name: Y, fanout: 4}
     nodes:
-    - !Memory
-      name: GlobalBuffer
-      size: inf
-      area: 0
-      leak_power: 0
-      tensors: {keep: ~MainMemory, may_keep: All}
-      actions:
-      - {name: read, energy: 10, latency: 0}
-      - {name: write, energy: 10, latency: 0}
-
     - !Memory
       name: RowBuffer
       size: inf
@@ -45,7 +26,7 @@ arch:
       - {name: read, energy: 5, latency: 0}
       - {name: write, energy: 5, latency: 0}
       spatial:
-      - {name: X, fanout: {{N_ROW_BUFFER}}}
+      - {name: X, fanout: 4}
 
     - !Memory
       name: ColumnBuffer
@@ -57,7 +38,7 @@ arch:
       - {name: read, energy: 5, latency: 0}
       - {name: write, energy: 5, latency: 0}
       spatial:
-      - {name: Y, fanout: {{N_COL_BUFFER}}}
+      - {name: Y, fanout: 4}
 
     - !Memory
       name: DistributedBuffer
@@ -72,6 +53,14 @@ arch:
       - {name: X, fanout: 2}
       - {name: Y, fanout: 2}
 
+    - !Network
+      name: NoC
+      area: 0
+      leak_power: 0
+      actions:
+      - {name: hops, energy: 1, latency: 0}
+
+
   - !Memory
     name: Scratchpad
     size: inf
diff --git a/tests/input_files/networked/one_matmul_to_flat.yaml b/tests/input_files/networked/one_matmul_to_flat.yaml
new file mode 100644
index 00000000..cf7d2f17
--- /dev/null
+++ b/tests/input_files/networked/one_matmul_to_flat.yaml
@@ -0,0 +1,42 @@
+mapping:
+  nodes:
+  - !Storage
+    component: MainMemory
+    tensors: [T0, T1, W0]
+  - !Storage
+    component: DistributedBuffer
+    tensors: [W0]
+  - !Temporal
+    rank_variable: m
+    tile_shape: {{ M_TILE }}
+  - !Storage
+    component: RowBuffer
+    tensors: [T0]
+  - !Storage
+    component: ColumnBuffer
+    tensors: [T1]
+  - !Spatial
+    rank_variable: n0
+    tile_shape: {{ MAC_TILE }}
+    component: Array
+    name: X
+  - !Spatial
+    rank_variable: n1
+    tile_shape: {{ MAC_TILE }}
+    component: Array
+    name: Y
+  - !Storage
+    component: Scratchpad
+    tensors: [T0, T1, W0]
+  - !Temporal
+    rank_variable: m
+    tile_shape: 1
+  - !Temporal
+    rank_variable: n0
+    tile_shape: 1
+  - !Temporal
+    rank_variable: n1
+    tile_shape: 1
+  - !Compute
+    einsum: Matmul0
+    component: MAC
\ No newline at end of file
diff --git a/tests/test_network.py b/tests/test_network.py
index e7cd640f..546d3bf4 100644
--- a/tests/test_network.py
+++ b/tests/test_network.py
@@ -33,7 +33,6 @@ def test_flat(self):
             {n.name for n in spec.arch.get_nodes_of_type(af.spec.Leaf)},
             {
                 "MainMemory",
-                "GlobalBuffer",
                 "NoC",
                 "RowBuffer",
                 "ColumnBuffer",
@@ -226,6 +225,71 @@ def test_hierarchical(self):
             * BITS_PER_VALUE,
         )
 
+    def test_flat(self):  # checks NoC hop counts for T0, T1 and W0 on the flat arch
+        M = 8
+        KN = 8
+        MAC_TILE = 2
+        M_TILE = 4
+        BITS_PER_VALUE = 8
+
+        spec = af.Spec.from_yaml(
+            af.examples.workloads.matmuls,
+            INPUT_FILES_DIR / "flat.yaml",
+            INPUT_FILES_DIR / "one_matmul_to_flat.yaml",
+            jinja_parse_data={
+                "N_EINSUMS": 1,
+                "M": M,  # same constants as the expected values below
+                "KN": KN,
+                "MAC_TILE": MAC_TILE,
+                "M_TILE": M_TILE,
+            },
+        )
+        result = spec.evaluate_mapping()
+        self.assertEqual(
+            result.data['Matmul0<SEP>action<SEP>NoC<SEP>T0<SEP>hops'].iloc[0],
+            (
+                M / M_TILE
+                *
+                (KN / MAC_TILE) * (KN / MAC_TILE - 1)   # num rows * multicast_hops
+                *
+                M_TILE * MAC_TILE  # tile shape
+                *
+                BITS_PER_VALUE
+            )
+        )
+        self.assertEqual(
+            result.data['Matmul0<SEP>action<SEP>NoC<SEP>T1<SEP>hops'].iloc[0],
+            (
+                M / M_TILE
+                *
+                (KN / MAC_TILE) * (KN / MAC_TILE - 1)   # num rows * multicast_hops
+                *
+                M_TILE * MAC_TILE  # tile shape
+                *
+                BITS_PER_VALUE
+            )
+        )
+        self.assertEqual(
+            result.data['Matmul0<SEP>action<SEP>NoC<SEP>W0<SEP>hops'].iloc[0],
+            (
+                M / M_TILE
+                *
+                (
+                    4   # a 2x2 grid of physical buffers
+                    *
+                    (
+                        sum(i for i in range(2)) * MAC_TILE  # unicast along row * tile shape
+                        +
+                        2 * sum(i for i in range(2))  # num cols * unicast down col
+                    )
+                )
+                *
+                MAC_TILE * MAC_TILE  # tile shape
+                *
+                BITS_PER_VALUE
+            )
+        )
+
 
 class TestMapper(TestCase):
     def test_hierarchical(self):

From 0453ddceb860b9362656a9a052b18493af369069 Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Fri, 22 May 2026 16:58:44 -0400
Subject: [PATCH 06/12] [model] Read/writes of distributed buffers

---
 accelforge/frontend/arch/_flattened_arch.py   | 103 +++++++++++++++++-
 accelforge/frontend/arch/spatialable.py       |   9 ++
 .../model/_looptree/reuse/symbolic/_stats.py  |   6 +
 .../_looptree/reuse/symbolic/_symbolic.py     |  38 +++++--
 tests/input_files/networked/flat.yaml         |   4 +-
 tests/test_network.py                         |  42 +++++++
 6 files changed, 188 insertions(+), 14 deletions(-)

diff --git a/accelforge/frontend/arch/_flattened_arch.py b/accelforge/frontend/arch/_flattened_arch.py
index 2a249a94..81e31588 100644
--- a/accelforge/frontend/arch/_flattened_arch.py
+++ b/accelforge/frontend/arch/_flattened_arch.py
@@ -1,4 +1,4 @@
-from typing import TypeVar
+from typing import TypeVar, Callable
 
 
 _FIND_SENTINEL = object()
@@ -62,7 +62,12 @@ def is_above(self, name_a: str, name_b: str):
         return idx_a < idx_b
 
     def find_first_of_type_between(
-        self, node_type: T, name_lower: str, name_upper: str, default: D = _FIND_SENTINEL
+        self,
+        node_type: T,
+        name_lower: str,
+        name_upper: str,
+        default: D = _FIND_SENTINEL,
+        top_bottom: bool = True,
     ) -> T | D:
         """
         Returns the first node with type `node_type` above `name_lower` and under `name_upper`.
@@ -75,7 +80,10 @@ def find_first_of_type_between(
         upper_idx = self.index(name_upper)
         lower_idx = self.index(name_lower)
 
-        for i, node in enumerate(self.nodes):
+        iterator = list(enumerate(self.nodes))  # pair with top-down index first
+        if not top_bottom:
+            iterator.reverse()  # flip scan order only; indices stay top-down
+        for i, node in iterator:
             if not isinstance(node, node_type) or i <= upper_idx or i >= lower_idx:
                 continue
             else:
@@ -84,3 +92,92 @@ def find_first_of_type_between(
             return default
         else:
             raise ValueError(f"node with type {node_type} between {name_upper} and {name_lower} not found")
+
+    def find_first_of_type_above(
+        self,
+        node_type: T,
+        name_lower: str,
+        default: D = _FIND_SENTINEL,
+        top_bottom: bool = True,
+    ) -> T | D:
+        """
+        Returns the first node with type `node_type` above `name_lower`.
+
+        Nodes are scanned top-down, or bottom-up if `top_bottom` is False.
+
+        If no node of `node_type` is found, either `default` is
+        returned (if provided) or a ValueError is raised.
+        """
+        lower_idx = self.index(name_lower)
+
+        # Pair nodes with their top-down index *before* reversing so the
+        # `i < lower_idx` test keeps its meaning in both scan directions.
+        indexed_nodes = list(enumerate(self.nodes))
+        if not top_bottom:
+            indexed_nodes.reverse()
+        for i, node in indexed_nodes:
+            if isinstance(node, node_type) and i < lower_idx:
+                return node
+        if default is not _FIND_SENTINEL:
+            return default
+        else:
+            raise ValueError(f"node with type {node_type} above {name_lower} not found")
+
+    def find_first_of_type_below(
+        self,
+        node_type: T,
+        name_upper: str,
+        default: D = _FIND_SENTINEL,
+        top_bottom: bool = True,
+    ) -> T | D:
+        """
+        Returns the first node with type `node_type` below `name_upper`.
+
+        Nodes are scanned top-down, or bottom-up if `top_bottom` is False.
+
+        If no node of `node_type` is found, either `default` is
+        returned (if provided) or a ValueError is raised.
+        """
+        upper_idx = self.index(name_upper)
+
+        # Pair nodes with their top-down index *before* reversing so the
+        # `i > upper_idx` test keeps its meaning in both scan directions.
+        indexed_nodes = list(enumerate(self.nodes))
+        if not top_bottom:
+            indexed_nodes.reverse()
+        for i, node in indexed_nodes:
+            if isinstance(node, node_type) and i > upper_idx:
+                return node
+        if default is not _FIND_SENTINEL:
+            return default
+        else:
+            raise ValueError(f"node with type {node_type} below {name_upper} not found")
+
+    def first_below(
+        self,
+        name: str,
+        filter: Callable = None,
+        default: D = _FIND_SENTINEL,
+    ) -> T | D:
+        """
+        Returns the first node below `name` for which `filter(node)` is truthy.
+
+        If `filter` is None, every node below `name` is accepted.
+
+        If no such node is found, either `default` is
+        returned (if provided) or a ValueError is raised.
+        """
+        idx = self.index(name)
+
+        if filter is None:
+            filter = lambda x: True
+
+        for i, node in enumerate(self.nodes):
+            if not filter(node) or i <= idx:
+                continue
+            else:
+                return node
+        if default is not _FIND_SENTINEL:
+            return default
+        else:
+            raise ValueError(f"node below {name} not found")
diff --git a/accelforge/frontend/arch/spatialable.py b/accelforge/frontend/arch/spatialable.py
index a5a3e286..0b767302 100644
--- a/accelforge/frontend/arch/spatialable.py
+++ b/accelforge/frontend/arch/spatialable.py
@@ -136,6 +136,12 @@ def get_fanout_along(self, dim_name: str, default: int = 1) -> int:
                 return s.fanout
         return default
 
+    def _has_physical_dim(self, dim_name: str) -> bool:
+        """Return True if any entry of `_physical_spatial` is named `dim_name`."""
+        return any(
+            dim.name == dim_name for dim in self._physical_spatial
+        )
+
     def _get_physical_fanout_along(self, dim_name: str, default: int = 1) -> int:
         for s in self._physical_spatial:
             if s.name == dim_name:
@@ -153,3 +159,6 @@ def _spatial_str(self, include_newline=True) -> str:
             return ""
         result = ", ".join(f"{s.fanout}× {s.name}" for s in self.spatial)
         return f"\n[{result}]" if include_newline else result
+
+    def _is_distributed(self):  # True if any physical spatial dim has fanout > 1
+        return any(s.fanout > 1 for s in self._physical_spatial)
\ No newline at end of file
diff --git a/accelforge/model/_looptree/reuse/symbolic/_stats.py b/accelforge/model/_looptree/reuse/symbolic/_stats.py
index 8368937d..1c2c8ee3 100644
--- a/accelforge/model/_looptree/reuse/symbolic/_stats.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_stats.py
@@ -100,6 +100,12 @@ def repeat_temporal(self, factor: int, is_fully_relevant: bool) -> "BuffetStats"
         return new
 
     def repeat_spatial(self, factor: int, reuse_parent_accesses: bool) -> "BuffetStats":
+        """
+        Repeat buffet stats due to spatial loop `factor` number of times.
+
+        For accesses to parent, the amount of repetition is `factor` if `reuse_parent_access`
+        is False; otherwise, there is no repetition.
+        """
         new = copy.copy(self)
         if factor == 1:
             return new
diff --git a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
index 91a6bd32..5ed426e7 100755
--- a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
@@ -606,7 +606,6 @@ def handle_repeated_value(repeated_shape):
         accumulated_buffet_stats = result_accumulator.buffet_stats
         child_stats = list(child_result.buffet_stats.items())
         for i, (buffet, buffet_stats) in enumerate(child_stats):
-            stats = buffet_stats
             accumulated_stats = accumulated_buffet_stats.setdefault(
                 buffet, BuffetStats.blank()
             )
@@ -628,8 +627,8 @@ def handle_repeated_value(repeated_shape):
                 and buffet.tensor in component_spatial_dim.may_reuse
             )
 
-            stats.n_loops_above = stats.n_loops_above + 1
-            accumulated_stats += stats.repeat_spatial(
+            buffet_stats.n_loops_above = buffet_stats.n_loops_above + 1
+            accumulated_stats += buffet_stats.repeat_spatial(
                 shape_repeats, reuse_parent_accesses
             )
 
@@ -692,6 +691,7 @@ def analyze_storage(
     count_writes: bool = True,
 ):
     mapping = info.mapping
+    flattened_arch = info.job.flattened_arch
     einsum_name = mapping[-1].einsum
     node: TensorHolder = mapping[node_idx]
 
@@ -798,25 +798,45 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any:
         else:
             write_scale = 0
 
+        # =======================
+        # For distributed buffers
+        n_active_physical_units = 1
+        if child is not None:
+            next_spatial = flattened_arch.first_below(
+                node.component,
+                lambda n: isinstance(n, arch.Spatialable) and len(n.spatial) > 0,
+                default=None,
+            )
+            if component_object._is_distributed() and next_spatial is not None:
+                for (b, e), dim_fanout in child_result.fanout.items():
+                    if b != next_spatial.name:
+                        continue
+                    for d in dim_fanout:
+                        if not component_object._has_physical_dim(d):
+                            continue
+                        n_active_physical_units *= (
+                            dim_fanout[d] / component_object._get_physical_stride_along(d)
+                        )
+
         # ==========================
         # Data exchanges with parent
         if count_downward_movement[tensor]:  # Parent -> Me
             stats.total_write_actions += stats.total_reads_to_parent * write_scale
             stats.max_per_unit_write_actions += (
-                stats.total_reads_to_parent * write_scale
+                stats.total_reads_to_parent * write_scale / n_active_physical_units
             )
             stats.total_skipped_first_write_actions += (
                 stats.total_skipped_first_reads_to_parent * write_scale
             )
             stats.min_per_unit_skipped_first_write_actions += (
-                stats.min_per_parent_skipped_first_reads_to_parent * write_scale
+                stats.min_per_parent_skipped_first_reads_to_parent * write_scale / n_active_physical_units
             )
 
         if count_upward_movement[tensor]:  # Me -> Parent
             # Comment this to have the final writeback to a buffer hit both that buffer and
             # go directly to the parent without incurring another read from the buffer.
             stats.total_read_actions += stats.total_writes_to_parent * read_scale
-            stats.max_per_unit_read_actions += stats.total_writes_to_parent * read_scale
+            stats.max_per_unit_read_actions += stats.total_writes_to_parent * read_scale / n_active_physical_units
 
         # ========================
         # Data exchanges with peer
@@ -829,7 +849,7 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any:
             if count_downward_movement[tensor]:  # Me -> Child
                 stats.total_read_actions += child.total_reads_to_parent * read_scale
                 stats.max_per_unit_read_actions += (
-                    child.max_per_parent_reads_to_parent * read_scale
+                    child.max_per_parent_reads_to_parent * read_scale / n_active_physical_units
                 )
                 # Skip first read
                 if skip_initial:
@@ -837,13 +857,13 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any:
                         child.total_skipped_first_reads_to_parent * read_scale
                     )
                     stats.min_per_unit_skipped_first_read_actions += (
-                        child.min_per_parent_skipped_first_reads_to_parent * read_scale
+                        child.min_per_parent_skipped_first_reads_to_parent * read_scale / n_active_physical_units
                     )
 
             if count_upward_movement[tensor]:  # Child -> Me
                 stats.total_write_actions += child.total_writes_to_parent * write_scale
                 stats.max_per_unit_write_actions += (
-                    child.max_per_parent_writes_to_parent * write_scale
+                    child.max_per_parent_writes_to_parent * write_scale / n_active_physical_units
                 )
 
     return child_result
diff --git a/tests/input_files/networked/flat.yaml b/tests/input_files/networked/flat.yaml
index b546ba21..f4862b0b 100644
--- a/tests/input_files/networked/flat.yaml
+++ b/tests/input_files/networked/flat.yaml
@@ -23,7 +23,7 @@ arch:
       leak_power: 0
       tensors: {keep: input, may_keep: input}
       actions:
-      - {name: read, energy: 5, latency: 0}
+      - {name: read, energy: 5, latency: 1}
       - {name: write, energy: 5, latency: 0}
       spatial:
       - {name: X, fanout: 4}
@@ -47,7 +47,7 @@ arch:
       leak_power: 0
       tensors: {keep: weight, may_keep: weight}
       actions:
-      - {name: read, energy: 5, latency: 0}
+      - {name: read, energy: 5, latency: 1}
       - {name: write, energy: 5, latency: 0}
       spatial:
       - {name: X, fanout: 2}
diff --git a/tests/test_network.py b/tests/test_network.py
index 546d3bf4..9c87977b 100644
--- a/tests/test_network.py
+++ b/tests/test_network.py
@@ -289,6 +289,48 @@ def test_flat(self):
                 BITS_PER_VALUE
             )
         )
+        self.assertEqual(
+            result.data['Matmul0<SEP>action<SEP>RowBuffer<SEP>T0<SEP>read'].iloc[0],
+            (
+                M / M_TILE
+                *
+                KN // MAC_TILE
+                *
+                M_TILE * MAC_TILE
+                *
+                BITS_PER_VALUE
+            )
+        )
+        self.assertEqual(
+            result.data['Matmul0<SEP>latency<SEP>RowBuffer'].iloc[0],
+            (
+                M / M_TILE
+                *
+                KN // MAC_TILE
+                *
+                M_TILE * MAC_TILE
+                *
+                BITS_PER_VALUE
+                /
+                4    # num of physical RowBuffer
+            )
+        )
+        self.assertEqual(
+            result.data['Matmul0<SEP>latency<SEP>DistributedBuffer'].iloc[0],
+            (
+                M / M_TILE
+                *
+                KN // MAC_TILE
+                *
+                KN // MAC_TILE
+                *
+                MAC_TILE * MAC_TILE  # tile shape
+                *
+                BITS_PER_VALUE
+                /
+                4    # num of physical DistributedBuffer
+            )
+        )
 
 
 class TestMapper(TestCase):

From 8313202c93af396a3685d6ad4e5e1e4e5c09801a Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Tue, 26 May 2026 17:38:17 -0400
Subject: [PATCH 07/12] Model latency and distributed occupancy

---
 accelforge/model/_looptree/latency/memory.py          | 11 +++++++++++
 .../model/_looptree/reuse/symbolic/_symbolic.py       |  4 ++++
 2 files changed, 15 insertions(+)

diff --git a/accelforge/model/_looptree/latency/memory.py b/accelforge/model/_looptree/latency/memory.py
index eb6f6426..a52989e7 100755
--- a/accelforge/model/_looptree/latency/memory.py
+++ b/accelforge/model/_looptree/latency/memory.py
@@ -79,6 +79,17 @@ def component_latency(
                 f"Component {component} is not a TensorHolder or Compute"
             )
 
+    for network, network_stats in looptree_results.network_stats.items():
+        component = network.component
+        actions = component_to_actions[component]
+        if component not in name2component:
+            raise ValueError(f"Component {component} found in mapping but not arch")
+
+        for action in name2component[component].actions:
+            actions[f"{action.name}_actions"] += 0
+
+        actions["hops_actions"] += network_stats.max_hops
+
     longest_compute_latency = Max(
         0, *[s.max_latency for s in looptree_results.compute_stats.values()]
     )
diff --git a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
index 5ed426e7..cb64012c 100755
--- a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
@@ -818,6 +818,10 @@ def inherit_add(attr: str, default_value: Any = fills) -> Any:
                             dim_fanout[d] / component_object._get_physical_stride_along(d)
                         )
 
+        # ==========================
+        # Recalculate usage of distributed buffers
+        stats.max_occupancy /= n_active_physical_units
+
         # ==========================
         # Data exchanges with parent
         if count_downward_movement[tensor]:  # Parent -> Me

From db5eae4579f6493f84924ec3543e956da6a5b36e Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Wed, 27 May 2026 17:05:40 -0400
Subject: [PATCH 08/12] Implement (almost) proper latency model; waiting for
 hwcomponents latency/bandwidth update

---
 accelforge/frontend/arch/components.py        | 14 +++
 accelforge/model/_looptree/latency/memory.py  | 31 ++++--
 .../_looptree/reuse/symbolic/_network.py      | 94 +++++++++++--------
 .../model/_looptree/reuse/symbolic/_stats.py  |  8 +-
 accelforge/model/run_model.py                 |  1 +
 .../networked/hierarchical_1d.yaml            |  3 +-
 tests/test_network.py                         |  4 +
 7 files changed, 106 insertions(+), 49 deletions(-)

diff --git a/accelforge/frontend/arch/components.py b/accelforge/frontend/arch/components.py
index d7b64dfd..8affd5ec 100644
--- a/accelforge/frontend/arch/components.py
+++ b/accelforge/frontend/arch/components.py
@@ -1174,6 +1174,20 @@ class Network(Component, Leaf):
     of the spatial nodes from top to bottom.
     """
 
+    total_latency: str | int | float = "max(max_hops*actions['hops'].latency, max_link_traffic/actions['hops'].latency)"
+    """
+    Models latency as either:
+    - *Latency-bound*, which means that the latency of the route with the most number of
+      hops dominates the overall communication latency.
+    - *Bandwidth-bound*, which means that the traffic over the most congested link
+      dominates the overall communication latency.
+
+    Keywords:
+    - `max_hops` returns the number of hops in the longest route.
+    - `max_link_traffic` returns the amount of traffic (in bits) over the most congested
+      link.
+    """
+
     bits_per_value: EvalsTo[dict] = {}
     """
     Sets the bits per value for tensors in this `TensorHolder`. Keys are evaluated as
diff --git a/accelforge/model/_looptree/latency/memory.py b/accelforge/model/_looptree/latency/memory.py
index a52989e7..491f97f2 100755
--- a/accelforge/model/_looptree/latency/memory.py
+++ b/accelforge/model/_looptree/latency/memory.py
@@ -14,7 +14,7 @@
 
 from accelforge.model._looptree.reuse.symbolic import BuffetStats
 from accelforge.util._eval_expressions import MATH_FUNCS, eval_expression
-from accelforge.util._sympy.broadcast_max import Max, Min
+from accelforge.util._sympy.broadcast_max import Max, Min, MaxGeqZero
 import symengine as se
 
 
@@ -47,6 +47,10 @@ def component_latency(
     component_to_actions: dict[str, dict[str, float]] = defaultdict(
         lambda: defaultdict(lambda: 0)
     )
+    # Holds "keywords" that do not map neatly to actions, e.g., max_hops for network
+    component_to_keywords: dict[str, dict[str, float]] = defaultdict(
+        lambda: defaultdict(lambda: 0)
+    )
     name2component: dict[str, Component] = {node.name: node for node in flattened_arch}
 
     compute_obj = flattened_arch[-1]
@@ -79,16 +83,28 @@ def component_latency(
                 f"Component {component} is not a TensorHolder or Compute"
             )
 
+    network_to_max_link_traffic = defaultdict(lambda: defaultdict(lambda: 0))
+    network_to_max_hops = defaultdict(lambda: [])
     for network, network_stats in looptree_results.network_stats.items():
         component = network.component
-        actions = component_to_actions[component]
         if component not in name2component:
             raise ValueError(f"Component {component} found in mapping but not arch")
 
-        for action in name2component[component].actions:
-            actions[f"{action.name}_actions"] += 0
+        dim_traffic = network_to_max_link_traffic[component]
+        for dim, max_traffic_in_dim in network_stats.max_traffic.items():
+            dim_traffic[dim] += max_traffic_in_dim
 
-        actions["hops_actions"] += network_stats.max_hops
+        network_to_max_hops[component].append(network_stats.max_hops)
+
+    for network, network_stats in looptree_results.network_stats.items():
+        component = network.component
+        keywords = component_to_keywords[component]
+        keywords["max_link_traffic"] = MaxGeqZero(
+            *network_to_max_link_traffic[component].values()
+        )
+        keywords["max_hops"] = MaxGeqZero(
+            *network_to_max_hops[component]
+        )
 
     longest_compute_latency = Max(
         0, *[s.max_latency for s in looptree_results.compute_stats.values()]
@@ -126,14 +142,15 @@ def component_latency(
         "sum": se.Add,
     }
 
-    for component, actions in component_to_actions.items():
-        component_obj = name2component[component]
+    for component, component_obj in name2component.items():
+        actions = component_to_actions[component]
         symbol_table = {
             "action2latency": component_to_action_latency[component],
             **symbol_table_base,
             **name2component[component].shallow_model_dump(include_None=True),
             **actions,
             **component_to_action_latency[component],
+            **component_to_keywords[component],
         }
         if name2component[component].total_latency is not None:
             component_latency[component] = eval_expression(
diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py
index 4c7cc9ea..2f9bfe87 100644
--- a/accelforge/model/_looptree/reuse/symbolic/_network.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_network.py
@@ -1,9 +1,5 @@
-import copy
-from accelforge.frontend import arch
-from accelforge.frontend.arch import Network as NetworkSpec
 from accelforge.frontend.mapping import (
-    TensorHolder,
-    TensorName
+    Spatial
 )
 from accelforge.frontend._workload_isl._symbolic import (
     compute_dense_tile_occupancy,
@@ -20,7 +16,7 @@
 
 class NetworkAnalyzer:
     def __init__(self, network_stats):
-        self.overall_max_hops = 0
+        self.overall_max_hops: dict = {}
         self.network_stats = network_stats
 
     def accumulate_child_result(
@@ -30,8 +26,9 @@ def accumulate_child_result(
         shape_repeats,
         einsum_name,
         child_shape,
-        node,
+        node: Spatial,
     ):
+        """This function is called for every repeated shape."""
         flattened_arch = info.job.flattened_arch
 
         for network, child_network_stats in child_result.network_stats.items():
@@ -40,44 +37,37 @@ def accumulate_child_result(
                 self.network_stats[network] = NetworkStats()
             accumulated_network_stats = self.network_stats[network]
 
-            accumulated_network_stats.total_hops += (
-                child_network_stats.total_hops * shape_repeats
-            )
-            accumulated_network_stats.max_hops = MaxGeqZero(
-                accumulated_network_stats.max_hops,
-                child_network_stats.max_hops,
-            )
-            projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)]
-            component_object = flattened_arch[network.component]
-            workload_bpv = info.job.einsum.tensor_accesses[
-                network.tensor
-            ].bits_per_value
-            bits_per_value = component_object.bits_per_value.get(
-                network.tensor, workload_bpv
-            )
-            bits_per_action = component_object.bits_per_action
-            if bits_per_action is not None:
-                actions_per_value = bits_per_value / bits_per_action
-            else:
-                actions_per_value = bits_per_value
-            volume = (
-                compute_dense_tile_occupancy(projection, child_shape)
-                * actions_per_value
-            )
-
+            # We only need to update the summary if the spatial loop is for
+            # a component higher than the network of interest
             if flattened_arch.is_above(node.component, network.component):
+                accumulated_network_stats.total_hops += (
+                    child_network_stats.total_hops * shape_repeats
+                )
+                accumulated_network_stats.max_hops = MaxGeqZero(
+                    accumulated_network_stats.max_hops,
+                    child_network_stats.max_hops,
+                )
+                for k, v in child_network_stats.max_traffic.items():
+                    accumulated_network_stats.max_traffic[k] = MaxGeqZero(
+                        accumulated_network_stats.max_traffic.get(k, 0),
+                        v
+                    )
                 continue
 
+            volume = self._get_data_volume(network, einsum_name, info, child_shape)
+
             relevancy = info.tensor_to_relevancy[network.tensor][node.rank_variable]
 
             # The fanout in this dimension in mapping nodes below, i.e., the stride
             last_fanout = child_result.fanout.get((node.component, einsum_name), {})
             last_fanout = last_fanout.get(node.name, 1)
             if isinstance(relevancy, Irrelevant):
+                # The volume travels through link by link in one axis of the mesh
                 # Distributed or not, the amount of total cost is the same.
                 # However, the accesses now come from different physical memories
                 total_cost = multicast_cost(shape_repeats, last_fanout)*volume
                 max_hops = shape_repeats*last_fanout
+                max_traffic = volume
             elif isinstance(relevancy, Relevant):
                 # If distributed, then we bind data as locally as possible in the
                 # physical buffers
@@ -99,26 +89,56 @@ def accumulate_child_result(
                         *
                         volume
                     )
-                    max_hops = MinGeqZero(shape_repeats*last_fanout, physical_stride)
+                    max_hops = MinGeqZero((n_dsts_per_physical-1)*last_fanout, physical_stride)
+                    max_traffic = (n_dsts_per_physical-1)*volume
                 else:
                     total_cost = unicast_cost(shape_repeats, last_fanout)*volume
                     max_hops = shape_repeats * last_fanout
+                    max_traffic = (shape_repeats-1)*volume
             elif isinstance(relevancy, PartiallyRelevant):
                 raise NotImplementedError()
             else:
                 raise RuntimeError(f"unhandled relevancy type {relevancy}")
 
-            # TODO: this is sketchy
-            self.overall_max_hops += max_hops
+            # Each subsequent call to this function (i.e., over different iterations of a spatial loop)
+            # adds more to the max hops
+            self.overall_max_hops[network] = self.overall_max_hops.get(network, 0) + max_hops
 
-            accumulated_network_stats.total_hops += total_cost
+            accumulated_network_stats.total_hops += (
+                total_cost + child_network_stats.total_hops*shape_repeats
+            )
             accumulated_network_stats.max_hops = MaxGeqZero(
                 accumulated_network_stats.max_hops,
-                self.overall_max_hops + child_network_stats.max_hops,
+                self.overall_max_hops[network] + child_network_stats.max_hops,
+            )
+            accumulated_network_stats.max_traffic[node.name] = MaxGeqZero(
+                accumulated_network_stats.max_traffic.get(node.name, 0),
+                max_traffic + child_network_stats.max_traffic.get(node.name, 0)
             )
 
         return self.overall_max_hops
 
+    def _get_data_volume(self, network, einsum_name, info, child_shape):
+        flattened_arch = info.job.flattened_arch
+        projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)]
+        component_object = flattened_arch[network.component]
+        workload_bpv = info.job.einsum.tensor_accesses[
+            network.tensor
+        ].bits_per_value
+        bits_per_value = component_object.bits_per_value.get(
+            network.tensor, workload_bpv
+        )
+        bits_per_action = component_object.bits_per_action
+        if bits_per_action is not None:
+            actions_per_value = bits_per_value / bits_per_action
+        else:
+            actions_per_value = bits_per_value
+        volume = (
+            compute_dense_tile_occupancy(projection, child_shape)
+            * actions_per_value
+        )
+        return volume
+
 
 def multicast_cost(n_dsts, stride):
     """Returns total hops of multicast along a dimension."""
diff --git a/accelforge/model/_looptree/reuse/symbolic/_stats.py b/accelforge/model/_looptree/reuse/symbolic/_stats.py
index 1c2c8ee3..aa2c1d90 100644
--- a/accelforge/model/_looptree/reuse/symbolic/_stats.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_stats.py
@@ -21,7 +21,11 @@
 @dataclass
 class NetworkStats:
     total_hops: Any = field(default=0)
+    """Total number of hops overall. Useful to calculate energy."""
     max_hops: Any = field(default=0)
+    """Longest hops among all routes."""
+    max_traffic: dict[int | str, Any] = field(default_factory=dict)
+    """Maximum traffic occurring on any single link along a dimension."""
 
     def repeat(self, n_repeats):
         new = copy.copy(self)
@@ -32,10 +36,6 @@ def repeat(self, n_repeats):
         new.total_hops = new.total_hops * n_repeats
         return new
 
-    def combine(self, other: "NetworkStats"):
-        self.total_hops += other.total_hops
-        self.max_hops = max(self.max_hops, other.max_hops)
-
 
 @dataclass
 class BuffetStats:
diff --git a/accelforge/model/run_model.py b/accelforge/model/run_model.py
index eee4f60f..b68ea7c5 100644
--- a/accelforge/model/run_model.py
+++ b/accelforge/model/run_model.py
@@ -43,6 +43,7 @@ def run_model(
     )
 
     latency = component_latency(reuse, job.flattened_arch, pmapping, spec)
+    print(latency)
     try:
         overall_latency = MaxGeqZero(*latency.values())
     except Exception as e:
diff --git a/tests/input_files/networked/hierarchical_1d.yaml b/tests/input_files/networked/hierarchical_1d.yaml
index 49853317..1c5c60d1 100644
--- a/tests/input_files/networked/hierarchical_1d.yaml
+++ b/tests/input_files/networked/hierarchical_1d.yaml
@@ -24,8 +24,9 @@ arch:
     name: PeArray
     area: 0
     leak_power: 0
+    total_latency: "max_hops"
     actions:
-    - {name: hops, energy: 1, latency: 0}
+    - {name: hops, energy: 1, latency: 1}
 
   - !Memory
     name: Scratchpad
diff --git a/tests/test_network.py b/tests/test_network.py
index 109e9f68..d684f48a 100644
--- a/tests/test_network.py
+++ b/tests/test_network.py
@@ -125,6 +125,10 @@ def test_hierarchical_1d(self):
             * KN
             * BITS_PER_VALUE,
         )
+        self.assertEqual(
+            result.data["Total<SEP>latency"].iloc[0],
+            4
+        )
 
     def test_hierarchical(self):
         M = 8

From 7ad792a1669b389f4236f2cc94093868e656d920 Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Fri, 5 Jun 2026 10:26:07 -0400
Subject: [PATCH 09/12] [network] Update to latest spec

---
 accelforge/frontend/arch/components.py        | 16 ++--------
 accelforge/model/_looptree/latency/memory.py  |  9 ++++--
 tests/input_files/networked/flat.yaml         |  6 ++--
 tests/input_files/networked/hierarchical.yaml |  4 +--
 .../networked/hierarchical_1d.yaml            |  4 +--
 tests/test_network.py                         | 29 +++++++++----------
 6 files changed, 30 insertions(+), 38 deletions(-)

diff --git a/accelforge/frontend/arch/components.py b/accelforge/frontend/arch/components.py
index e7f6de98..8bdf17ae 100644
--- a/accelforge/frontend/arch/components.py
+++ b/accelforge/frontend/arch/components.py
@@ -145,7 +145,7 @@ def _set_n_calls(self, value: int | float) -> None:
     @classmethod
     def _deprecate_latency_fields(cls, data):
         if isinstance(data, dict):
-            if "latency" in data:
+            if "latency" in data and not "throughput" in data:
                 l = data.pop("latency")
                 warnings.warn(
                     f"Setting `latency` on `{cls.__name__}` is deprecated; use "
@@ -155,16 +155,11 @@ def _deprecate_latency_fields(cls, data):
                     DeprecationWarning,
                     stacklevel=2,
                 )
-                if "throughput" in data:
-                    raise ValueError(
-                        f"Cannot specify both `latency` and `throughput` on "
-                        f"`{cls.__name__}`. Drop the deprecated `latency` field."
-                    )
                 l = str(l).strip()
                 data["throughput"] = (
                     f"1 / ({l}) if ({l}) != 0 else float('inf')"
                 )
-            if "latency_scale" in data:
+            if "latency_scale" in data and not "throughput_scale" in data:
                 ls = data.pop("latency_scale")
                 warnings.warn(
                     f"Setting `latency_scale` on `{cls.__name__}` is deprecated; use "
@@ -174,11 +169,6 @@ def _deprecate_latency_fields(cls, data):
                     DeprecationWarning,
                     stacklevel=2,
                 )
-                if "throughput_scale" in data:
-                    raise ValueError(
-                        f"Cannot specify both `latency_scale` and `throughput_scale` "
-                        f"on `{cls.__name__}`. Drop the deprecated `latency_scale`."
-                    )
                 ls = str(ls).strip()
                 data["throughput_scale"] = (
                     f"1 / ({ls}) if ({ls}) != 0 else float('inf')"
@@ -1316,7 +1306,7 @@ class Network(Component, Leaf):
     of the spatial nodes from top to bottom.
     """
 
-    total_latency: str | int | float = "max(max_hops*actions['hops'].latency, max_link_traffic/actions['hops'].latency)"
+    total_latency: str | int | float = "max(max_hops*actions['hops'].latency, max_link_traffic/actions['hops'].throughput)"
     """
     Models latency as either:
     - *Latency-bound*, which means that the latency of the route with the most number of
diff --git a/accelforge/model/_looptree/latency/memory.py b/accelforge/model/_looptree/latency/memory.py
index 4571a355..ebcaa1de 100755
--- a/accelforge/model/_looptree/latency/memory.py
+++ b/accelforge/model/_looptree/latency/memory.py
@@ -165,13 +165,18 @@ def component_latency(
         "sum": _sum,
     }
 
-    for component in component_to_actions:
+    for component in name2component:
+        if component not in component_to_actions and component not in component_to_keywords:
+            continue
         component_obj = name2component[component]
         dump = component_obj.shallow_model_dump(include_None=True)
         # Replace serialized `actions` dump with local Action copies that carry
         # the correct n_calls for this job, so formulas can access `a.n_calls`,
         # `a.throughput`, etc. without mutating the shared spec state.
-        dump["actions"] = component_to_actions[component]
+        if component in component_to_actions:
+            dump["actions"] = component_to_actions[component]
+        if component in component_to_keywords:
+            dump |= component_to_keywords[component]
         symbol_table = {**symbol_table_base, **dump}
         if component_obj.total_latency is not None:
             component_latency[component] = eval_expression(
diff --git a/tests/input_files/networked/flat.yaml b/tests/input_files/networked/flat.yaml
index 2b511e59..28679d21 100644
--- a/tests/input_files/networked/flat.yaml
+++ b/tests/input_files/networked/flat.yaml
@@ -57,8 +57,8 @@ arch:
       leak_power: 0
       tensors: {keep: weight, may_keep: weight}
       actions:
-      - {name: read, energy: 5, throughput: inf}
-      - {name: write, energy: 5, throughput: inf}
+      - {name: read, energy: 5, throughput: 1}
+      - {name: write, energy: 5, throughput: 1}
       spatial:
       - {name: X, fanout: 2}
       - {name: Y, fanout: 2}
@@ -68,7 +68,7 @@ arch:
       area: 0
       leak_power: 0
       actions:
-      - {name: hops, energy: 1, throughput: inf}
+      - {name: hops, energy: 1, latency: 0, throughput: inf}
 
   - !Memory
     name: Scratchpad
diff --git a/tests/input_files/networked/hierarchical.yaml b/tests/input_files/networked/hierarchical.yaml
index 5e8634ad..f268ef7e 100644
--- a/tests/input_files/networked/hierarchical.yaml
+++ b/tests/input_files/networked/hierarchical.yaml
@@ -25,7 +25,7 @@ arch:
     area: 0
     leak_power: 0
     actions:
-    - {name: hops, energy: 1, throughput: 4e9}
+    - {name: hops, energy: 1, latency: 0, throughput: 4e9}
 
   - !Memory
     name: Scratchpad
@@ -45,7 +45,7 @@ arch:
     area: 0
     leak_power: 0
     actions:
-    - {name: hops, energy: 1, throughput: 16e9}
+    - {name: hops, energy: 1, latency: 0, throughput: 16e9}
 
   - !Compute
     name: MAC
diff --git a/tests/input_files/networked/hierarchical_1d.yaml b/tests/input_files/networked/hierarchical_1d.yaml
index 4171e115..167212ff 100644
--- a/tests/input_files/networked/hierarchical_1d.yaml
+++ b/tests/input_files/networked/hierarchical_1d.yaml
@@ -26,7 +26,7 @@ arch:
     leak_power: 0
     total_latency: "max_hops"
     actions:
-    - {name: hops, energy: 1, throughput: 1}
+    - {name: hops, energy: 1, latency: 0, throughput: 1}
 
   - !Memory
     name: Scratchpad
@@ -45,7 +45,7 @@ arch:
     area: 0
     leak_power: 0
     actions:
-    - {name: hops, energy: 1, throughput: inf}
+    - {name: hops, energy: 1, latency: 1, throughput: inf}
 
   - !Compute
     name: MAC
diff --git a/tests/test_network.py b/tests/test_network.py
index 06b731d5..8a11802d 100644
--- a/tests/test_network.py
+++ b/tests/test_network.py
@@ -27,21 +27,6 @@ def test_flat(self):
         spec = af.Spec.from_yaml(
             INPUT_FILES_DIR / "flat.yaml",
         )
-        print(spec.arch.nodes["NoC"])
-        self.assertIn("NoC", spec.arch.nodes)
-        self.assertEqual(spec.arch.nodes["NoC"].get_fanout(), 1)
-        self.assertEqual(
-            {n.name for n in spec.arch.get_nodes_of_type(af.spec.Leaf)},
-            {
-                "MainMemory",
-                "NoC",
-                "RowBuffer",
-                "ColumnBuffer",
-                "DistributedBuffer",
-                "Scratchpad",
-                "MAC",
-            },
-        )
 
         try:
             spec = spec.calculate_component_costs()
@@ -320,7 +305,7 @@ def test_flat(self):
         )
         self.assertEqual(
             result.data['Matmul0<SEP>latency<SEP>DistributedBuffer'].iloc[0],
-            (
+            (   # Reads from child
                 M / M_TILE
                 *
                 KN // MAC_TILE
@@ -333,6 +318,18 @@ def test_flat(self):
                 /
                 4    # num of physical DistributedBuffer
             )
+            +
+            (   # Writes from parent
+                KN // MAC_TILE
+                *
+                KN // MAC_TILE
+                *
+                MAC_TILE * MAC_TILE  # tile shape
+                *
+                BITS_PER_VALUE
+                /
+                4    # num of physical DistributedBuffer
+            )
         )
 
 

From 488f4b1523dc9239b3cffb43e98048fe63d37efd Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Fri, 5 Jun 2026 15:14:24 -0400
Subject: [PATCH 10/12] [network] Refactor network cost to handle different
 topologies

---
 accelforge/frontend/arch/components.py        |   2 +-
 accelforge/model/_looptree/latency/memory.py  |   1 +
 .../_looptree/reuse/symbolic/_network.py      | 261 ++++++++++++++----
 .../_looptree/reuse/symbolic/_symbolic.py     |   6 +-
 4 files changed, 207 insertions(+), 63 deletions(-)

diff --git a/accelforge/frontend/arch/components.py b/accelforge/frontend/arch/components.py
index 8bdf17ae..dc47cbda 100644
--- a/accelforge/frontend/arch/components.py
+++ b/accelforge/frontend/arch/components.py
@@ -1294,7 +1294,7 @@ def _render_node_color(self) -> str:
         return "#E0EEFF"
 
 
-class TopologySpec(str, enum.Enum):
+class TopologySpec(enum.StrEnum):
     MESH = "mesh"
 
 
diff --git a/accelforge/model/_looptree/latency/memory.py b/accelforge/model/_looptree/latency/memory.py
index ebcaa1de..080e3c04 100755
--- a/accelforge/model/_looptree/latency/memory.py
+++ b/accelforge/model/_looptree/latency/memory.py
@@ -109,6 +109,7 @@ def component_latency(
 
     network_to_max_link_traffic = defaultdict(lambda: defaultdict(lambda: 0))
     network_to_max_hops = defaultdict(lambda: [])
+    # Aggregates across tensors
     for network, network_stats in looptree_results.network_stats.items():
         component = network.component
         if component not in name2component:
diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py
index 2f9bfe87..b493de93 100644
--- a/accelforge/model/_looptree/reuse/symbolic/_network.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_network.py
@@ -1,6 +1,11 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any
+
 from accelforge.frontend.mapping import (
     Spatial
 )
+from accelforge.frontend.arch.components import TopologySpec
 from accelforge.frontend._workload_isl._symbolic import (
     compute_dense_tile_occupancy,
     Irrelevant,
@@ -14,22 +19,178 @@
 from ._stats import NetworkStats, SymbolicAnalysisOutput
 
 
-class NetworkAnalyzer:
-    def __init__(self, network_stats):
+@dataclass
+class PerLoopTransferCost:
+    """The per-spatial-loop cost contributed by a single network, as computed
+    by a :class:`TopologyModel`."""
+
+    total_cost: Any
+    """Total hops, weighted by data volume, contributed by this spatial loop."""
+    max_hops: Any
+    """Hops added to the longest route by this spatial loop."""
+    max_traffic: Any
+    """Maximum traffic (in actions) on any single link along this dimension."""
+
+
+class TopologyModel(ABC):
+    """Computes the cost of moving data across a network of a given topology.
+
+    Subclasses encapsulate everything topology-specific about how a tensor's
+    data is delivered across a spatial fanout. :class:`NetworkAnalyzer` selects
+    the model for each network from its component's
+    :class:`~accelforge.frontend.arch.components.TopologySpec` and remains
+    agnostic to the topology itself.
+
+    Instances are stateful: they accumulate per-network max hops across the
+    repeated spatial-loop iterations of a single :class:`NetworkAnalyzer`, so a
+    fresh model is constructed for each analyzer (see :func:`get_topology_model`).
+    """
+
+    def __init__(self):
+        # Running total of max hops per network, accumulated across the
+        # repeated spatial-loop iterations handled by one NetworkAnalyzer.
         self.overall_max_hops: dict = {}
+
+    def accumulate_max_hops(self, network, max_hops):
+        """Add this loop's ``max_hops`` to ``network``'s running total and
+        return the updated total.
+
+        Each call to :meth:`NetworkAnalyzer.accumulate_child_result` (i.e., over
+        a different iteration of a spatial loop) adds more to the max hops.
+        """
+        self.overall_max_hops[network] = (
+            self.overall_max_hops.get(network, 0) + max_hops
+        )
+        return self.overall_max_hops[network]
+
+    @abstractmethod
+    def per_loop_transfer_cost(
+        self,
+        relevancy,
+        *,
+        shape_repeats,
+        last_fanout,
+        volume,
+        src_component,
+        dim_name: str,
+    ) -> PerLoopTransferCost:
+        """Return the :class:`PerLoopTransferCost` for moving ``volume`` of data across one
+        spatial loop.
+
+        Args:
+            relevancy: The relevancy of the spatial loop's rank variable to the
+                tensor (``Irrelevant``, ``Relevant``, or ``PartiallyRelevant``).
+            shape_repeats: The number of iterations of this spatial loop.
+            last_fanout: The fanout in this dimension among mapping nodes below
+                (i.e., the stride).
+            volume: The data volume (in actions) moved per destination.
+            src_component: The flattened-arch component sourcing the data, used
+                to query physical fanout/stride.
+            dim_name: The name of the spatial dimension (e.g., ``X`` or ``Y``).
+        """
+        raise NotImplementedError
+
+
+class MeshTopologyModel(TopologyModel):
+    """Cost model for a mesh network.
+
+    Data travels link-by-link along one axis of the mesh. Multicast delivers a
+    value to every point along the dimension; unicast delivers a distinct value
+    to each point. When the source is physically distributed, data is bound as
+    locally as possible across the physical buffers.
+    """
+
+    def per_loop_transfer_cost(
+        self,
+        relevancy,
+        *,
+        shape_repeats,
+        last_fanout,
+        volume,
+        src_component,
+        dim_name,
+    ) -> PerLoopTransferCost:
+        if isinstance(relevancy, Irrelevant):
+            # The volume travels link by link along one axis of the mesh.
+            # Whether or not the source is distributed, the total cost is the
+            # same; when distributed, accesses come from different physical memories.
+            total_cost = multicast_cost(shape_repeats, last_fanout) * volume
+            max_hops = shape_repeats * last_fanout
+            max_traffic = volume
+        elif isinstance(relevancy, Relevant):
+            # If distributed, then we bind data as locally as possible in the
+            # physical buffers
+            if src_component._get_physical_fanout_along(dim_name) > 1:
+                physical_stride = src_component._get_physical_stride_along(dim_name)
+
+                n_dsts_per_physical = MinGeqZero(
+                    # If last_fanout > physical_stride, clamp n_dsts_per_physical to 1, which
+                    # results in 0 hops later. That is correct because the set of destinations
+                    # always overlaps the set of sources.
+                    MaxGeqZero(physical_stride / last_fanout, 1),
+                    shape_repeats
+                )
+                n_activated_physical = MaxGeqZero(shape_repeats * last_fanout / physical_stride, 1)
+                total_cost = (
+                    n_activated_physical
+                    *
+                    unicast_cost(n_dsts_per_physical, last_fanout)
+                    *
+                    volume
+                )
+                max_hops = MinGeqZero((n_dsts_per_physical - 1) * last_fanout, physical_stride)
+                max_traffic = (n_dsts_per_physical - 1) * volume
+            else:
+                total_cost = unicast_cost(shape_repeats, last_fanout) * volume
+                max_hops = shape_repeats * last_fanout
+                max_traffic = (shape_repeats - 1) * volume
+        elif isinstance(relevancy, PartiallyRelevant):
+            raise NotImplementedError()
+        else:
+            raise RuntimeError(f"unhandled relevancy type {relevancy}")
+
+        return PerLoopTransferCost(total_cost=total_cost, max_hops=max_hops, max_traffic=max_traffic)
+
+
+# Registry mapping each topology to the model class that costs its data
+# movement. Classes (not instances) are stored because models are stateful and
+# each NetworkAnalyzer needs its own.
+TOPOLOGY_MODELS: dict[TopologySpec, type[TopologyModel]] = {
+    TopologySpec.MESH: MeshTopologyModel,
+}
+
+
+def get_topology_model(topology) -> TopologyModel:
+    """Construct a fresh :class:`TopologyModel` for the given topology."""
+    return TOPOLOGY_MODELS[topology]()
+
+
+class NetworkAnalyzer:
+    def __init__(self, network_stats, info: AnalysisInfo, einsum_name, node: Spatial):
         self.network_stats = network_stats
+        # These don't change across calls to accumulate_child_result.
+        self.info = info
+        self.einsum_name = einsum_name
+        self.node = node
+        # Each network gets its own topology model, since different networks may
+        # have different topologies. Models are constructed lazily, the first
+        # time a network needs costing, and reused for the analyzer's lifetime so
+        # their accumulated max hops persist.
+        self.topology_models: dict = {}
+
+    def _get_topology_model(self, network, topology) -> TopologyModel:
+        if network not in self.topology_models:
+            self.topology_models[network] = get_topology_model(topology)
+        return self.topology_models[network]
 
     def accumulate_child_result(
         self,
         child_result: SymbolicAnalysisOutput,
-        info: AnalysisInfo,
         shape_repeats,
-        einsum_name,
         child_shape,
-        node: Spatial,
     ):
         """This function is called for every repeated shape."""
-        flattened_arch = info.job.flattened_arch
+        flattened_arch = self.info.job.flattened_arch
 
         for network, child_network_stats in child_result.network_stats.items():
             src_component = flattened_arch[network.source.level]
@@ -39,7 +200,7 @@ def accumulate_child_result(
 
             # We only need to update the summary if the spatial loop is for
             # a component higher than the network of interest
-            if flattened_arch.is_above(node.component, network.component):
+            if flattened_arch.is_above(self.node.component, network.component):
                 accumulated_network_stats.total_hops += (
                     child_network_stats.total_hops * shape_repeats
                 )
@@ -54,71 +215,51 @@ def accumulate_child_result(
                     )
                 continue
 
-            volume = self._get_data_volume(network, einsum_name, info, child_shape)
+            volume = self._get_data_volume(network, child_shape)
 
-            relevancy = info.tensor_to_relevancy[network.tensor][node.rank_variable]
+            relevancy = self.info.tensor_to_relevancy[network.tensor][self.node.rank_variable]
 
             # The fanout in this dimension in mapping nodes below, i.e., the stride
-            last_fanout = child_result.fanout.get((node.component, einsum_name), {})
-            last_fanout = last_fanout.get(node.name, 1)
-            if isinstance(relevancy, Irrelevant):
-                # The volume travels through link by link in one axis of the mesh
-                # Distributed or not, the amount of total cost is the same.
-                # However, the accesses now come from different physical memories
-                total_cost = multicast_cost(shape_repeats, last_fanout)*volume
-                max_hops = shape_repeats*last_fanout
-                max_traffic = volume
-            elif isinstance(relevancy, Relevant):
-                # If distributed, then we bind data as locally as possible in the
-                # physical buffers
-                if src_component._get_physical_fanout_along(node.name) > 1:
-                    physical_stride = src_component._get_physical_stride_along(node.name)
-
-                    n_dsts_per_physical = MinGeqZero(
-                        # if last_fanout > physical_stride, set n_dst to 1, which results in 0 hops
-                        # later (which is correct because the set of destinations always overlap
-                        # the set of sources).
-                        MaxGeqZero(physical_stride / last_fanout, 1),
-                        shape_repeats
-                    )
-                    n_activated_physical = MaxGeqZero(shape_repeats*last_fanout/physical_stride, 1)
-                    total_cost = (
-                        n_activated_physical
-                        *
-                        unicast_cost(n_dsts_per_physical, last_fanout)
-                        *
-                        volume
-                    )
-                    max_hops = MinGeqZero((n_dsts_per_physical-1)*last_fanout, physical_stride)
-                    max_traffic = (n_dsts_per_physical-1)*volume
-                else:
-                    total_cost = unicast_cost(shape_repeats, last_fanout)*volume
-                    max_hops = shape_repeats * last_fanout
-                    max_traffic = (shape_repeats-1)*volume
-            elif isinstance(relevancy, PartiallyRelevant):
-                raise NotImplementedError()
-            else:
-                raise RuntimeError(f"unhandled relevancy type {relevancy}")
+            last_fanout = child_result.fanout.get((self.node.component, self.einsum_name), {})
+            last_fanout = last_fanout.get(self.node.name, 1)
 
-            # Each subsequent call to this function (i.e., over different iterations of a spatial loop)
-            # adds more to the max hops
-            self.overall_max_hops[network] = self.overall_max_hops.get(network, 0) + max_hops
+            topology_model = self._get_topology_model(
+                network, flattened_arch[network.component].topology
+            )
+            per_loop_transfer_cost = topology_model.per_loop_transfer_cost(
+                relevancy,
+                shape_repeats=shape_repeats,
+                last_fanout=last_fanout,
+                volume=volume,
+                src_component=src_component,
+                dim_name=self.node.name,
+            )
+
+            overall_max_hops = topology_model.accumulate_max_hops(
+                network, per_loop_transfer_cost.max_hops
+            )
 
             accumulated_network_stats.total_hops += (
-                total_cost + child_network_stats.total_hops*shape_repeats
+                per_loop_transfer_cost.total_cost
+                + child_network_stats.total_hops * shape_repeats
             )
             accumulated_network_stats.max_hops = MaxGeqZero(
                 accumulated_network_stats.max_hops,
-                self.overall_max_hops[network] + child_network_stats.max_hops,
+                overall_max_hops + child_network_stats.max_hops,
             )
-            accumulated_network_stats.max_traffic[node.name] = MaxGeqZero(
-                accumulated_network_stats.max_traffic.get(node.name, 0),
-                max_traffic + child_network_stats.max_traffic.get(node.name, 0)
+            accumulated_network_stats.max_traffic[self.node.name] = MaxGeqZero(
+                accumulated_network_stats.max_traffic.get(self.node.name, 0),
+                per_loop_transfer_cost.max_traffic + child_network_stats.max_traffic.get(self.node.name, 0)
             )
 
-        return self.overall_max_hops
+        overall_max_hops = {}
+        for model in self.topology_models.values():
+            overall_max_hops.update(model.overall_max_hops)
+        return overall_max_hops
 
-    def _get_data_volume(self, network, einsum_name, info, child_shape):
+    def _get_data_volume(self, network, child_shape):
+        info = self.info
+        einsum_name = self.einsum_name
         flattened_arch = info.job.flattened_arch
         projection = info.einsum_tensor_to_projection[(einsum_name, network.tensor)]
         component_object = flattened_arch[network.component]
@@ -153,4 +294,4 @@ def unicast_cost(n_dsts, stride):
 
 
 def arithmetic_sum(n):
-    return 0.5 * (n+1) * n
\ No newline at end of file
+    return 0.5 * (n+1) * n
diff --git a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
index cb64012c..d3c7b50b 100755
--- a/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_symbolic.py
@@ -592,7 +592,9 @@ def analyze_spatial(node_idx, current_shape, info: AnalysisInfo):
 
     result_accumulator = SymbolicAnalysisOutput()
 
-    network_analyzer = NetworkAnalyzer(result_accumulator.network_stats)
+    network_analyzer = NetworkAnalyzer(
+        result_accumulator.network_stats, info, einsum_name, node
+    )
 
     def handle_repeated_value(repeated_shape):
         shape_value = repeated_shape.value
@@ -633,7 +635,7 @@ def handle_repeated_value(repeated_shape):
             )
 
         network_analyzer.accumulate_child_result(
-            child_result, info, shape_repeats, einsum_name, child_shape, node
+            child_result, shape_repeats, child_shape
         )
 
         for einsum, child_steps in child_result.temporal_steps.items():

From b9c7d4abe41e9015775344d398eca3ce0912d65e Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Fri, 5 Jun 2026 17:04:57 -0400
Subject: [PATCH 11/12] [network] WIP review

---
 accelforge/frontend/arch/components.py        |   1 +
 .../_looptree/reuse/symbolic/_network.py      |  68 +++++++
 .../input_files/networked/flat.yaml           |   0
 .../input_files/networked/hierarchical.yaml   |   0
 .../networked/hierarchical_1d.yaml            |   0
 .../networked/hierarchical_1d_all_to_all.yaml |  61 +++++++
 .../networked/hierarchical_switched.yaml      |   0
 .../networked/one_matmul_to_flat.yaml         |   0
 .../one_matmul_to_networked_hierarchical.yaml |   0
 ...e_matmul_to_networked_hierarchical_1d.yaml |   0
 tests/{ => network}/test_network.py           | 101 ++++++++++-
 tests/network/test_topology_model.py          | 168 ++++++++++++++++++
 12 files changed, 398 insertions(+), 1 deletion(-)
 rename tests/{ => network}/input_files/networked/flat.yaml (100%)
 rename tests/{ => network}/input_files/networked/hierarchical.yaml (100%)
 rename tests/{ => network}/input_files/networked/hierarchical_1d.yaml (100%)
 create mode 100644 tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml
 rename tests/{ => network}/input_files/networked/hierarchical_switched.yaml (100%)
 rename tests/{ => network}/input_files/networked/one_matmul_to_flat.yaml (100%)
 rename tests/{ => network}/input_files/networked/one_matmul_to_networked_hierarchical.yaml (100%)
 rename tests/{ => network}/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml (100%)
 rename tests/{ => network}/test_network.py (75%)
 create mode 100644 tests/network/test_topology_model.py

diff --git a/accelforge/frontend/arch/components.py b/accelforge/frontend/arch/components.py
index dc47cbda..867cb03b 100644
--- a/accelforge/frontend/arch/components.py
+++ b/accelforge/frontend/arch/components.py
@@ -1296,6 +1296,7 @@ def _render_node_color(self) -> str:
 
 class TopologySpec(enum.StrEnum):
     MESH = "mesh"
+    ALL_TO_ALL = "all_to_all"
 
 
 class Network(Component, Leaf):
diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py
index b493de93..cdd9b8fa 100644
--- a/accelforge/model/_looptree/reuse/symbolic/_network.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_network.py
@@ -152,11 +152,79 @@ def per_loop_transfer_cost(
         return PerLoopTransferCost(total_cost=total_cost, max_hops=max_hops, max_traffic=max_traffic)
 
 
+class AllToAllTopologyModel(TopologyModel):
+    """Cost model for an all-to-all network built around a switch (e.g. NVLink /
+    NVSwitch).
+
+    Every node connects to every other node through a central switch, so any
+    source reaches any destination in a constant number of hops regardless of
+    how far apart they are in the logical fanout. This differs from a mesh in
+    two ways:
+
+    - **Uniform latency.** The longest route is a single switch traversal, so
+      each call returns a constant ``max_hops`` rather than one that grows with
+      the distance (``shape_repeats * stride``) between source and destination.
+    - **No store-and-forward accumulation.** Each destination is reached
+      directly, so the total (energy) cost is linear in the number of
+      destinations rather than quadratic as in a mesh unicast.
+
+    The physical stride is irrelevant here (all nodes are equidistant from the
+    switch), so ``last_fanout`` and physical distribution are not consulted.
+    """
+
+    HOPS_PER_TRANSFER = 1
+    """Hops charged for one source-to-destination transfer across the switch.
+    One switch traversal is treated as a single hop; the per-hop energy and
+    latency come from the network component's ``hops`` action."""
+
+    def per_loop_transfer_cost(
+        self,
+        relevancy,
+        *,
+        shape_repeats,
+        last_fanout,
+        volume,
+        src_component,
+        dim_name,
+    ) -> PerLoopTransferCost:
+        hops = self.HOPS_PER_TRANSFER
+
+        # n - 1 other instances each receive the data across the switch. The
+        # source already holds it (the set of destinations overlaps the set of
+        # sources), so it needs no transfer to itself.
+        n_dsts = shape_repeats - 1
+
+        if isinstance(relevancy, (Irrelevant, Relevant)):
+            # Same delivery count (and hence energy) whether the data is shared
+            # (multicast) or distinct per instance (unicast): each of the n - 1
+            # destinations is one switch traversal away.
+            total_cost = n_dsts * hops * volume
+            # Every route is a single switch traversal, independent of distance.
+            max_hops = hops
+            if isinstance(relevancy, Irrelevant):
+                # Multicast: the switch replicates, so each link carries the
+                # value at most once.
+                max_traffic = volume
+            else:
+                # Unicast: the source's uplink to the switch carries all n - 1
+                # distinct messages, making it the most congested link.
+                max_traffic = n_dsts * volume
+        elif isinstance(relevancy, PartiallyRelevant):
+            raise NotImplementedError()
+        else:
+            raise RuntimeError(f"unhandled relevancy type {relevancy}")
+
+        return PerLoopTransferCost(
+            total_cost=total_cost, max_hops=max_hops, max_traffic=max_traffic
+        )
+
+
 # Registry mapping each topology to the model class that costs its data
 # movement. Classes (not instances) are stored because models are stateful and
 # each NetworkAnalyzer needs its own.
 TOPOLOGY_MODELS: dict[TopologySpec, type[TopologyModel]] = {
     TopologySpec.MESH: MeshTopologyModel,
+    TopologySpec.ALL_TO_ALL: AllToAllTopologyModel,
 }
 
 
diff --git a/tests/input_files/networked/flat.yaml b/tests/network/input_files/networked/flat.yaml
similarity index 100%
rename from tests/input_files/networked/flat.yaml
rename to tests/network/input_files/networked/flat.yaml
diff --git a/tests/input_files/networked/hierarchical.yaml b/tests/network/input_files/networked/hierarchical.yaml
similarity index 100%
rename from tests/input_files/networked/hierarchical.yaml
rename to tests/network/input_files/networked/hierarchical.yaml
diff --git a/tests/input_files/networked/hierarchical_1d.yaml b/tests/network/input_files/networked/hierarchical_1d.yaml
similarity index 100%
rename from tests/input_files/networked/hierarchical_1d.yaml
rename to tests/network/input_files/networked/hierarchical_1d.yaml
diff --git a/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml b/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml
new file mode 100644
index 00000000..3d8b6d22
--- /dev/null
+++ b/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml
@@ -0,0 +1,61 @@
+arch:
+  nodes:
+  - !Memory
+    name: MainMemory
+    size: inf
+    area: 0
+    leak_power: 0
+    tensors: {keep: All}
+    actions:
+    - {name: read, energy: 0, throughput: inf}
+    - {name: write, energy: 0, throughput: inf}
+
+  - !Memory
+    name: GlobalBuffer
+    size: inf
+    area: 0
+    leak_power: 0
+    tensors: {keep: ~MainMemory, may_keep: All}
+    actions:
+    - {name: read, energy: 0, throughput: inf}
+    - {name: write, energy: 0, throughput: inf}
+
+  - !Network
+    name: PeArray
+    area: 0
+    leak_power: 0
+    total_latency: "max_hops"
+    actions:
+    - {name: hops, energy: 1, latency: 0, throughput: 1}
+
+  - !Memory
+    name: Scratchpad
+    size: inf
+    area: 0
+    leak_power: 0
+    tensors: {keep: All}
+    actions:
+    - {name: read, energy: 0, throughput: inf}
+    - {name: write, energy: 0, throughput: inf}
+    spatial:
+    - {name: X, fanout: 4}
+
+  # All-to-all switch (NVLink-like): every node is one switch hop from every
+  # other, so unicast and multicast cost the same total hops and max_hops is
+  # constant. Fanout is 4 so this differs observably from a mesh.
+  - !Network
+    name: MacArray
+    topology: all_to_all
+    area: 0
+    leak_power: 0
+    actions:
+    - {name: hops, energy: 1, latency: 1, throughput: inf}
+
+  - !Compute
+    name: MAC
+    area: 0
+    leak_power: 0
+    actions:
+    - {name: compute, energy: 0, throughput: inf}
+    spatial:
+    - {name: X, fanout: 4}
diff --git a/tests/input_files/networked/hierarchical_switched.yaml b/tests/network/input_files/networked/hierarchical_switched.yaml
similarity index 100%
rename from tests/input_files/networked/hierarchical_switched.yaml
rename to tests/network/input_files/networked/hierarchical_switched.yaml
diff --git a/tests/input_files/networked/one_matmul_to_flat.yaml b/tests/network/input_files/networked/one_matmul_to_flat.yaml
similarity index 100%
rename from tests/input_files/networked/one_matmul_to_flat.yaml
rename to tests/network/input_files/networked/one_matmul_to_flat.yaml
diff --git a/tests/input_files/networked/one_matmul_to_networked_hierarchical.yaml b/tests/network/input_files/networked/one_matmul_to_networked_hierarchical.yaml
similarity index 100%
rename from tests/input_files/networked/one_matmul_to_networked_hierarchical.yaml
rename to tests/network/input_files/networked/one_matmul_to_networked_hierarchical.yaml
diff --git a/tests/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml b/tests/network/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml
similarity index 100%
rename from tests/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml
rename to tests/network/input_files/networked/one_matmul_to_networked_hierarchical_1d.yaml
diff --git a/tests/test_network.py b/tests/network/test_network.py
similarity index 75%
rename from tests/test_network.py
rename to tests/network/test_network.py
index 8a11802d..138631fc 100644
--- a/tests/test_network.py
+++ b/tests/network/test_network.py
@@ -34,7 +34,7 @@ def test_flat(self):
             self.fail(e.message)
 
 
-class TestModel(TestCase):
+class TestModelMesh(TestCase):
     def test_hierarchical_1d(self):
         M = 8
         KN = 8
@@ -333,6 +333,105 @@ def test_flat(self):
         )
 
 
+class TestModelAllToAll(TestCase):
+    """Full-model evaluation of the 1D hierarchy where MacArray is an all-to-all
+    switch (NVLink-like) instead of a mesh. PeArray remains a mesh, so the two
+    networks can be contrasted within a single run."""
+
+    def test_hierarchical_1d_all_to_all(self):
+        M = 8
+        KN = 8
+        MAC_TILE = 4
+        M_TILE = 4
+        BITS_PER_VALUE = 8
+
+        spec = af.Spec.from_yaml(
+            af.examples.workloads.matmuls,
+            INPUT_FILES_DIR / "hierarchical_1d_all_to_all.yaml",
+            INPUT_FILES_DIR / "one_matmul_to_networked_hierarchical_1d.yaml",
+            jinja_parse_data={
+                "N_EINSUMS": 1,
+                "M": M,
+                "KN": KN,
+                "MAC_TILE": MAC_TILE,
+                "M_TILE": M_TILE,
+            },
+        )
+        result = spec.evaluate_mapping()
+
+        # --- MacArray: all-to-all switch ---------------------------------
+        # On a switch every node is one hop away, so unicast (T0, W0) collapses
+        # to the same (MAC_TILE - 1) linear cost as multicast (T1): all equal.
+        # Contrast test_hierarchical_1d, where the mesh makes T0/W0 quadratic
+        # (sum(range(MAC_TILE))).
+        all_to_all = (
+            (M / M_TILE)
+            * (KN / MAC_TILE)  # number of used Scratchpad
+            * M_TILE
+            * KN  # temporal for n1 in mapping
+            * (MAC_TILE - 1)  # one switch hop per destination, for every tensor
+            * BITS_PER_VALUE
+        )
+        for tensor in ("T0", "T1", "W0"):
+            self.assertEqual(
+                result.data[
+                    f"Matmul0<SEP>action<SEP>MacArray<SEP>{tensor}<SEP>hops"
+                ].iloc[0],
+                all_to_all,
+                msg=f"unexpected MacArray hops for {tensor}",
+            )
+
+        # Guard: a mesh would make the unicast tensors strictly more expensive.
+        mesh_unicast = (
+            (M / M_TILE)
+            * (KN / MAC_TILE)
+            * M_TILE
+            * KN
+            * sum(range(MAC_TILE))  # quadratic on a mesh
+            * BITS_PER_VALUE
+        )
+        self.assertGreater(mesh_unicast, all_to_all)
+
+        # --- PeArray: still a mesh ---------------------------------------
+        # Unchanged from test_hierarchical_1d, so the mesh formulas hold (now
+        # with MAC_TILE = 4, i.e. KN // MAC_TILE = 2).
+        self.assertEqual(
+            result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T0<SEP>hops"].iloc[0],
+            (M / M_TILE)
+            * sum(i for i in range(KN // MAC_TILE))  # unicast along X of PeArray
+            * M_TILE
+            * MAC_TILE
+            * BITS_PER_VALUE,
+        )
+        self.assertEqual(
+            result.data["Matmul0<SEP>action<SEP>PeArray<SEP>T1<SEP>hops"].iloc[0],
+            (M / M_TILE)
+            * (KN // MAC_TILE - 1)  # multicast along X of PeArray
+            * M_TILE
+            * KN
+            * BITS_PER_VALUE,
+        )
+        self.assertEqual(
+            result.data["Matmul0<SEP>action<SEP>PeArray<SEP>W0<SEP>hops"].iloc[0],
+            (M / M_TILE)
+            * sum(i for i in range(KN // MAC_TILE))  # unicast along PeArray
+            * MAC_TILE
+            * KN
+            * BITS_PER_VALUE,
+        )
+
+        # --- Latency ------------------------------------------------------
+        # The switch's uniform single-hop routing gives MacArray a constant
+        # latency of 1, versus the mesh PeArray's distance-dependent 2.
+        self.assertEqual(
+            result.data["Matmul0<SEP>latency<SEP>MacArray"].iloc[0], 1
+        )
+        self.assertEqual(
+            result.data["Matmul0<SEP>latency<SEP>PeArray"].iloc[0], 2
+        )
+        self.assertEqual(result.data["Total<SEP>latency"].iloc[0], 2)
+
+
 class TestMapper(TestCase):
     def test_hierarchical(self):
         M = 8
diff --git a/tests/network/test_topology_model.py b/tests/network/test_topology_model.py
new file mode 100644
index 00000000..36dcc31c
--- /dev/null
+++ b/tests/network/test_topology_model.py
@@ -0,0 +1,168 @@
+from unittest import TestCase
+
+from accelforge.frontend.arch.components import TopologySpec
+from accelforge.frontend._workload_isl._symbolic import (
+    Irrelevant,
+    PartiallyRelevant,
+    Relevant,
+)
+from accelforge.model._looptree.reuse.symbolic._network import (
+    AllToAllTopologyModel,
+    MeshTopologyModel,
+    get_topology_model,
+)
+
+
+class _NoDistribution:
+    """Stand-in source component that is not physically distributed."""
+
+    def _get_physical_fanout_along(self, dim_name, default=1):
+        return 1
+
+
+class _Distributed:
+    """Stand-in source component physically distributed along a dimension."""
+
+    def __init__(self, fanout, stride):
+        self.fanout = fanout
+        self.stride = stride
+
+    def _get_physical_fanout_along(self, dim_name, default=1):
+        return self.fanout
+
+    def _get_physical_stride_along(self, dim_name):
+        return self.stride
+
+
+class TestMeshTopologyModel(TestCase):
+    """Unit tests for the mesh cost model in isolation."""
+
+    def _cost(self, relevancy, *, n, stride, volume=10, src=None):
+        return MeshTopologyModel().per_loop_transfer_cost(
+            relevancy,
+            shape_repeats=n,
+            last_fanout=stride,
+            volume=volume,
+            src_component=src if src is not None else _NoDistribution(),
+            dim_name="X",
+        )
+
+    def test_registry_resolves_model(self):
+        self.assertIsInstance(get_topology_model(TopologySpec.MESH), MeshTopologyModel)
+        self.assertIsInstance(get_topology_model("mesh"), MeshTopologyModel)
+
+    def test_multicast(self):
+        # Irrelevant: one value flows down the line and is delivered to each
+        # of the (n - 1) downstream nodes. Each link carries it at most once.
+        n, stride, volume = 4, 2, 10
+        cost = self._cost(Irrelevant(), n=n, stride=stride, volume=volume)
+        self.assertEqual(cost.total_cost, (n - 1) * stride * volume)
+        self.assertEqual(cost.max_hops, n * stride)
+        self.assertEqual(cost.max_traffic, volume)
+
+    def test_unicast(self):
+        # Relevant (not distributed): each destination needs its own data
+        # delivered i*stride hops away, so the total is quadratic and the link
+        # nearest the source carries traffic for all (n - 1) downstream nodes.
+        n, stride, volume = 4, 2, 10
+        cost = self._cost(Relevant("n0"), n=n, stride=stride, volume=volume)
+        self.assertEqual(cost.total_cost, sum(range(n)) * stride * volume)
+        self.assertEqual(cost.max_hops, n * stride)
+        self.assertEqual(cost.max_traffic, (n - 1) * volume)
+
+    def test_unicast_distributed_binds_locally(self):
+        # When the source is physically distributed, data binds as locally as
+        # possible, reducing hops relative to the non-distributed unicast.
+        n, stride, volume = 4, 1, 10
+        src = _Distributed(fanout=2, stride=4)
+        cost = self._cost(Relevant("n0"), n=n, stride=stride, volume=volume, src=src)
+
+        # physical_stride / last_fanout = 4, capped at shape_repeats = 4
+        n_dsts_per_physical = 4
+        n_activated_physical = 1  # n*stride / physical_stride = 4/4
+        self.assertEqual(
+            cost.total_cost,
+            n_activated_physical * sum(range(n_dsts_per_physical)) * stride * volume,
+        )
+        self.assertEqual(cost.max_hops, (n_dsts_per_physical - 1) * stride)
+        self.assertEqual(cost.max_traffic, (n_dsts_per_physical - 1) * volume)
+
+    def test_partially_relevant_not_implemented(self):
+        with self.assertRaises(NotImplementedError):
+            self._cost(PartiallyRelevant("n0"), n=4, stride=2)
+
+
+class TestAllToAllTopologyModel(TestCase):
+    """Unit tests for the all-to-all (switch) cost model in isolation."""
+
+    def _cost(self, relevancy, n, *, volume=10, last_fanout=99):
+        # last_fanout is deliberately large and arbitrary: an all-to-all switch
+        # must ignore physical stride entirely.
+        return AllToAllTopologyModel().per_loop_transfer_cost(
+            relevancy,
+            shape_repeats=n,
+            last_fanout=last_fanout,
+            volume=volume,
+            src_component=_NoDistribution(),
+            dim_name="X",
+        )
+
+    def test_registry_resolves_model(self):
+        # Resolves both by enum and by the StrEnum value (the form that survives
+        # the arch evaluation pipeline).
+        self.assertIsInstance(
+            get_topology_model(TopologySpec.ALL_TO_ALL), AllToAllTopologyModel
+        )
+        self.assertIsInstance(get_topology_model("all_to_all"), AllToAllTopologyModel)
+
+    def test_multicast(self):
+        n, volume = 5, 10
+        cost = self._cost(Irrelevant(), n, volume=volume)
+        # Linear in destinations, one switch hop, shared link traffic.
+        self.assertEqual(cost.total_cost, (n - 1) * volume)
+        self.assertEqual(cost.max_hops, AllToAllTopologyModel.HOPS_PER_TRANSFER)
+        self.assertEqual(cost.max_traffic, volume)
+
+    def test_unicast(self):
+        n, volume = 5, 10
+        cost = self._cost(Relevant("n0"), n, volume=volume)
+        # Same (linear) total cost as multicast and constant hops, but the
+        # source's uplink to the switch carries every distinct message.
+        self.assertEqual(cost.total_cost, (n - 1) * volume)
+        self.assertEqual(cost.max_hops, AllToAllTopologyModel.HOPS_PER_TRANSFER)
+        self.assertEqual(cost.max_traffic, (n - 1) * volume)
+
+    def test_independent_of_stride(self):
+        # Stride (last_fanout) must not affect any component of the cost.
+        a = self._cost(Relevant("n0"), 5, last_fanout=1)
+        b = self._cost(Relevant("n0"), 5, last_fanout=1000)
+        self.assertEqual(
+            (a.total_cost, a.max_hops, a.max_traffic),
+            (b.total_cost, b.max_hops, b.max_traffic),
+        )
+
+    def test_linear_unlike_mesh_quadratic(self):
+        # Against an identical mesh scenario, all-to-all unicast is linear while
+        # the mesh is quadratic, and all-to-all hops are constant (< distance).
+        n, volume, stride = 6, 1, 1
+        kwargs = dict(
+            shape_repeats=n,
+            last_fanout=stride,
+            volume=volume,
+            src_component=_NoDistribution(),
+            dim_name="X",
+        )
+        a2a = AllToAllTopologyModel().per_loop_transfer_cost(Relevant("n0"), **kwargs)
+        mesh = MeshTopologyModel().per_loop_transfer_cost(Relevant("n0"), **kwargs)
+
+        self.assertEqual(a2a.total_cost, (n - 1) * volume)
+        self.assertEqual(mesh.total_cost, sum(range(n)) * stride * volume)
+        self.assertLess(a2a.total_cost, mesh.total_cost)
+        self.assertLess(a2a.max_hops, mesh.max_hops)
+
+    def test_accumulate_max_hops_persists(self):
+        # overall_max_hops accumulates across calls for a given network.
+        model = AllToAllTopologyModel()
+        h = AllToAllTopologyModel.HOPS_PER_TRANSFER
+        self.assertEqual(model.accumulate_max_hops("net", h), h)
+        self.assertEqual(model.accumulate_max_hops("net", h), 2 * h)

From dd4769946e78f21c950ac7119a953fdce7d5ce9f Mon Sep 17 00:00:00 2001
From: Michael Gilbert <gilbertm@mit.edu>
Date: Fri, 5 Jun 2026 19:13:54 -0400
Subject: [PATCH 12/12] [network] Clean up Claude output

---
 .../_looptree/reuse/symbolic/_network.py      | 36 +++++++------------
 .../networked/hierarchical_1d_all_to_all.yaml |  6 ++--
 tests/network/test_network.py                 | 24 +++----------
 3 files changed, 18 insertions(+), 48 deletions(-)

diff --git a/accelforge/model/_looptree/reuse/symbolic/_network.py b/accelforge/model/_looptree/reuse/symbolic/_network.py
index cdd9b8fa..0c833354 100644
--- a/accelforge/model/_looptree/reuse/symbolic/_network.py
+++ b/accelforge/model/_looptree/reuse/symbolic/_network.py
@@ -94,10 +94,10 @@ def per_loop_transfer_cost(
 class MeshTopologyModel(TopologyModel):
     """Cost model for a mesh network.
 
-    Data travels link-by-link along one axis of the mesh. Multicast delivers a
-    value to every point along the dimension; unicast delivers a distinct value
-    to each point. When the source is physically distributed, data is bound as
-    locally as possible across the physical buffers.
+    Data travels along one axis of the mesh. Multicast delivers a value to every
+    point along the dimension; unicast delivers a distinct value to each point.
+    When the source is physically distributed, data is bound as locally as
+    possible across the physical buffers.
     """
 
     def per_loop_transfer_cost(
@@ -153,23 +153,13 @@ def per_loop_transfer_cost(
 
 
 class AllToAllTopologyModel(TopologyModel):
-    """Cost model for an all-to-all network built around a switch (e.g. NVLink /
-    NVSwitch).
-
-    Every node connects to every other node through a central switch, so any
-    source reaches any destination in a constant number of hops regardless of
-    how far apart they are in the logical fanout. This differs from a mesh in
-    two ways:
-
-    - **Uniform latency.** The longest route is a single switch traversal, so
-      ``max_hops`` is constant rather than growing with the distance
-      (``shape_repeats * stride``) between source and destination.
-    - **No store-and-forward accumulation.** Each destination is reached
-      directly, so the total (energy) cost is linear in the number of
-      destinations rather than quadratic as in a mesh unicast.
-
-    The physical stride is irrelevant here (all nodes are equidistant from the
-    switch), so ``last_fanout`` and physical distribution are not consulted.
+    """Cost model for an all-to-all network using a switch (e.g. NVLink).
+
+    Every node connects to every other node through a switch, so any
+    source reaches any destination in one hop regardless of distance.
+
+    Physical stride is irrelevant, so ``last_fanout`` and physical distribution
+    are not used.
     """
 
     HOPS_PER_TRANSFER = 1
@@ -219,9 +209,7 @@ def per_loop_transfer_cost(
         )
 
 
-# Registry mapping each topology to the model class that costs its data
-# movement. Classes (not instances) are stored because models are stateful and
-# each NetworkAnalyzer needs its own.
+# Registry of topology models
 TOPOLOGY_MODELS: dict[TopologySpec, type[TopologyModel]] = {
     TopologySpec.MESH: MeshTopologyModel,
     TopologySpec.ALL_TO_ALL: AllToAllTopologyModel,
diff --git a/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml b/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml
index 3d8b6d22..bbb14f8c 100644
--- a/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml
+++ b/tests/network/input_files/networked/hierarchical_1d_all_to_all.yaml
@@ -26,7 +26,7 @@ arch:
     leak_power: 0
     total_latency: "max_hops"
     actions:
-    - {name: hops, energy: 1, latency: 0, throughput: 1}
+    - {name: hops, energy: 1, latency: 1, throughput: inf}
 
   - !Memory
     name: Scratchpad
@@ -40,9 +40,7 @@ arch:
     spatial:
     - {name: X, fanout: 4}
 
-  # All-to-all switch (NVLink-like): every node is one switch hop from every
-  # other, so unicast and multicast cost the same total hops and max_hops is
-  # constant. Fanout is 4 so this differs observably from a mesh.
+  # All-to-all switch (NVLink-like): every node is one hop from every other
   - !Network
     name: MacArray
     topology: all_to_all
diff --git a/tests/network/test_network.py b/tests/network/test_network.py
index 138631fc..04e6e6ba 100644
--- a/tests/network/test_network.py
+++ b/tests/network/test_network.py
@@ -334,9 +334,7 @@ def test_flat(self):
 
 
 class TestModelAllToAll(TestCase):
-    """Full-model evaluation of the 1D hierarchy where MacArray is an all-to-all
-    switch (NVLink-like) instead of a mesh. PeArray remains a mesh, so the two
-    networks can be contrasted within a single run."""
+    """MacArray is an all-to-all switch (NVLink-like). PeArray is a mesh."""
 
     def test_hierarchical_1d_all_to_all(self):
         M = 8
@@ -360,16 +358,13 @@ def test_hierarchical_1d_all_to_all(self):
         result = spec.evaluate_mapping()
 
         # --- MacArray: all-to-all switch ---------------------------------
-        # On a switch every node is one hop away, so unicast (T0, W0) collapses
-        # to the same (MAC_TILE - 1) linear cost as multicast (T1): all equal.
-        # Contrast test_hierarchical_1d, where the mesh makes T0/W0 quadratic
-        # (sum(range(MAC_TILE))).
+        # Every node is one hop away, so all tensors cost the same.
         all_to_all = (
             (M / M_TILE)
             * (KN / MAC_TILE)  # number of used Scratchpad
             * M_TILE
             * KN  # temporal for n1 in mapping
-            * (MAC_TILE - 1)  # one switch hop per destination, for every tensor
+            * (MAC_TILE - 1)  # one hop per destination, for every tensor
             * BITS_PER_VALUE
         )
         for tensor in ("T0", "T1", "W0"):
@@ -381,17 +376,6 @@ def test_hierarchical_1d_all_to_all(self):
                 msg=f"unexpected MacArray hops for {tensor}",
             )
 
-        # Guard: a mesh would make the unicast tensors strictly more expensive.
-        mesh_unicast = (
-            (M / M_TILE)
-            * (KN / MAC_TILE)
-            * M_TILE
-            * KN
-            * sum(range(MAC_TILE))  # quadratic on a mesh
-            * BITS_PER_VALUE
-        )
-        self.assertGreater(mesh_unicast, all_to_all)
-
         # --- PeArray: still a mesh ---------------------------------------
         # Unchanged from test_hierarchical_1d, so the mesh formulas hold (now
         # with MAC_TILE = 4, i.e. KN // MAC_TILE = 2).
@@ -422,7 +406,7 @@ def test_hierarchical_1d_all_to_all(self):
 
         # --- Latency ------------------------------------------------------
         # The switch's uniform single-hop routing gives MacArray a constant
-        # latency of 1, versus the mesh PeArray's distance-dependent 2.
+        # latency of 1, versus the mesh PeArray's 2.
         self.assertEqual(
             result.data["Matmul0<SEP>latency<SEP>MacArray"].iloc[0], 1
         )