diff --git a/.gitignore b/.gitignore index 8265b48..5a930ee 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,9 @@ hardware/deps/* *.tdb util/lint/sg_projects util/lint/tmp +hardware/bootrom/bootdata.cc +hardware/bootrom/bootdata_bootrom.cc +hardware/bootrom/bootrom.bin +hardware/bootrom/bootrom.dump +hardware/bootrom/bootrom.elf +hardware/bootrom/bootrom.sv diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 567242b..b54b177 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,29 +8,32 @@ variables: GIT_SUBMODULE_STRATEGY: none ROOT_DIR: '$CI_PROJECT_DIR' APPS: "tests" - PATH: '/home/gitlabci/.cargo/bin:/usr/local/bin:/usr/bin:/usr/sbin:/sbin:/usr/local/condor/bin:/usr/sepp/bin:$CI_PROJECT_DIR/install/verilator/bin:/home/gitlabci/.local/bin' + PATH: '$HOME/.cargo/bin:/usr/local/bin:/usr/bin:/usr/sbin:/sbin:/usr/local/condor/bin:/usr/sepp/bin:$CI_PROJECT_DIR/install/verilator/bin:$HOME/.local/bin' OBJCACHE: '' CC: '/usr/pack/gcc-11.2.0-af/linux-x64/bin/gcc' CXX: '/usr/pack/gcc-11.2.0-af/linux-x64/bin/g++' CMAKE: 'cmake-3.28.3' python: 'python3' python3: 'python3' + # Config to build and test + CI_CONFIG: 'cachepool_fpu_2g' + SW_PREFIX: 'test-cachepool-' default: - tags: [dolent] + tags: [shared] stages: - build + - test -.base: - artifacts: - when: always - expire_in: 1 day - -build-vsim: - extends: .base +# --------------------------------------------------------------------------- +# Build stage: compile RTL and software for CI_CONFIG. +# Parallel jobs within the same pipeline share $HOME, so the toolchain +# installed by make quick-tool is automatically available to all test jobs. 
+# --------------------------------------------------------------------------- +build: stage: build - timeout: 5h + timeout: 4h 30m script: - echo "Using CC=$CC" - echo "Using CXX=$CXX" @@ -39,10 +42,57 @@ build-vsim: - make quick-tool - make init - make dram-build - - cd util/auto-benchmark - - chmod +x ./run_ci.sh - - ./run_ci.sh + - python3 -m pip install --quiet -r requirements.txt + - make clean generate vsim config=$CI_CONFIG + artifacts: + when: always + expire_in: 2h + paths: + # QuestaSim compiled work library + - sim/work/ + # vsim wrapper scripts (exclude sim/bin/logs/ — not needed by test jobs) + - sim/bin/cachepool_cluster.vsim + - sim/bin/cachepool_cluster.vsim.gui + # DPI shared library + - sim/work-dpi/ + # Software binaries for all kernels + - software/build/CachePoolTests/ + # DRAMSys shared libraries and config files (referenced by vsim at runtime) + - hardware/deps/dram_rtl_sim/dramsys_lib/DRAMSys/build/lib/ + - hardware/deps/dram_rtl_sim/dramsys_lib/DRAMSys/configs/ +# --------------------------------------------------------------------------- +# Test stage: run each kernel in parallel on a separate runner. +# Each job downloads the build artifacts, runs one simulation, and checks +# the output log for failures. +# --------------------------------------------------------------------------- +test: + stage: test + timeout: 1h + needs: [build] + parallel: + matrix: + - KERNEL: + - spin-lock + - load-store_M16 + - fdotp-32b_M32768 + - gemv_M512_N128_K32 + - fmatmul-32b_M64_N64_K64 + - fft-32b_M1024_N16 + - multi_producer_single_consumer_double_linked_list_M1_N1350_K10 + - byte-enable + script: + # The vsim script writes a .rtlbinary marker here; ensure the dir exists. 
+ - mkdir -p sim/bin/logs + - chmod +x sim/bin/cachepool_cluster.vsim + - BIN="${SW_PREFIX}${KERNEL}" + - sim/bin/cachepool_cluster.vsim software/build/CachePoolTests/$BIN 2>&1 | tee test_${KERNEL}.log + - python3 util/auto-benchmark/check-ci.py test_${KERNEL}.log artifacts: + when: always + expire_in: 1 week paths: - - util/auto-benchmark/logs + # Full simulation log + - test_*.log + # Performance-monitor trace files written by the simulator + - sim/bin/logs/ diff --git a/Bender.lock b/Bender.lock index 9684f42..e4ec6e9 100644 --- a/Bender.lock +++ b/Bender.lock @@ -16,17 +16,34 @@ packages: - common_verification - tech_cells_generic axi_riscv_atomics: - revision: 97dcb14ef057cbe5bd70dda2060b5bb9e7e04c6d - version: 0.7.0 + revision: 97a1dd2ac643c276880420a0cf8eea697f228aa9 + version: 0.8.3 source: Git: https://github.com/pulp-platform/axi_riscv_atomics.git dependencies: - axi - common_cells - common_verification + axi_stream: + revision: 54891ff40455ca94a37641b9da4604647878cc07 + version: 0.1.1 + source: + Git: https://github.com/pulp-platform/axi_stream.git + dependencies: + - common_cells + cluster_icache: + revision: ce0ed94a5b95f5c76b9fa51940303fcce53f56e5 + version: null + source: + Git: https://github.com/pulp-platform/cluster_icache.git + dependencies: + - axi + - common_cells + - scm + - tech_cells_generic common_cells: - revision: 9afda9abb565971649c2aa0985639c096f351171 - version: 1.38.0 + revision: 9ca8a7655f741e7dd5736669a20a301325194c28 + version: 1.39.0 source: Git: https://github.com/pulp-platform/common_cells.git dependencies: @@ -45,8 +62,27 @@ packages: Git: https://github.com/pulp-platform/dram_rtl_sim.git dependencies: - axi + floo_noc: + revision: 97306733f33acbb646c7e403c03a674fc1404b44 + version: null + source: + Git: https://github.com/pulp-platform/FlooNoC.git + dependencies: + - axi + - axi_riscv_atomics + - common_cells + - common_verification + - floo_noc_pd + - fpnew + - idma + floo_noc_pd: + revision: null + version: null + source: + 
Path: hardware/deps/floo_noc/./pd + dependencies: [] fpnew: - revision: a8e0cba6dd50f357ece73c2c955d96efc3c6c315 + revision: e5aa6a01b5bbe1675c3aa8872e1203413ded83d1 version: null source: Git: https://github.com/pulp-platform/cvfpu.git @@ -61,14 +97,16 @@ packages: dependencies: - common_cells idma: - revision: b31e8f019c657eff4126bc789f0336d403da6766 - version: 0.4.2 + revision: 28a36e5e07705549e59fc33db96ab681bc1ca88e + version: 0.6.5 source: Git: https://github.com/pulp-platform/iDMA.git dependencies: - axi + - axi_stream - common_cells - common_verification + - obi - register_interface insitu-cache: revision: fa761ddebc946f9b46509d84945bf41ee1a9ec49 @@ -79,6 +117,14 @@ packages: - axi - common_cells - register_interface + obi: + revision: 0155fc34e900c7c884e081c0a1114a247937ff69 + version: 0.1.7 + source: + Git: https://github.com/pulp-platform/obi.git + dependencies: + - common_cells + - common_verification register_interface: revision: 146501d80052b61475cdc333d3aab4cd769fd5dc version: 0.3.9 @@ -96,8 +142,15 @@ packages: dependencies: - common_cells - tech_cells_generic + scm: + revision: 1976c7efb4979271eee2abe262fde0f9a20e2557 + version: 1.2.1 + source: + Git: https://github.com/pulp-platform/scm.git + dependencies: + - tech_cells_generic spatz: - revision: ed25c78dd72d839db8141287f9516d78ee399b93 + revision: 08847c5fcc2dfe2427c70076d5970de24d54af4c version: null source: Git: https://github.com/pulp-platform/spatz.git diff --git a/Bender.yml b/Bender.yml index 45b01da..7873a05 100644 --- a/Bender.yml +++ b/Bender.yml @@ -10,13 +10,14 @@ dependencies: axi_riscv_atomics: { git: "https://github.com/pulp-platform/axi_riscv_atomics.git", version: 0.7.0 } common_cells: { git: "https://github.com/pulp-platform/common_cells.git", version: 1.28.0 } FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 } - idma: { git: "https://github.com/pulp-platform/iDMA.git", version: 0.4.2 } register_interface: { git: 
"https://github.com/pulp-platform/register_interface.git", version: 0.3.8 } riscv-dbg: { git: "https://github.com/pulp-platform/riscv-dbg.git", version: 0.7.0 } tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.11 } Insitu-Cache: { git: "https://github.com/pulp-platform/Insitu-Cache.git", rev: zexin/cachepool_dev } - spatz: { git: "https://github.com/pulp-platform/spatz.git", rev: cachepool-32b } dram_rtl_sim: { git: "https://github.com/pulp-platform/dram_rtl_sim.git", rev: cachepool } + floo_noc: { git: "https://github.com/pulp-platform/FlooNoC.git", rev: main } + cluster_icache: { git: "https://github.com/pulp-platform/cluster_icache.git", rev: main } + spatz: { git: "https://github.com/pulp-platform/spatz.git", rev: cachepool-32b } workspace: checkout_dir: "./hardware/deps" @@ -29,22 +30,27 @@ sources: - hardware/src/tcdm_cache_interco.sv - hardware/src/tcdm_id_remapper.sv - hardware/src/spatz_cache_amo.sv + # FlooNoC + - hardware/generated/floo_cachepool_noc_pkg.sv # Memory-mapped register - hardware/cachepool_peripheral/cachepool_peripheral_reg_pkg.sv - hardware/cachepool_peripheral/cachepool_peripheral_reg_top.sv - hardware/cachepool_peripheral/cachepool_peripheral.sv # Bootrom - hardware/bootrom/bootrom.sv - # Barrier - - hardware/src/cachepool_tile_barrier.sv - - hardware/src/cachepool_cluster_barrier.sv # Level 1 - hardware/src/cachepool_pkg.sv - hardware/src/cachepool_cc.sv + # Barrier + - hardware/src/cachepool_tile_barrier.sv + - hardware/src/cachepool_cluster_barrier.sv + # ICache + - hardware/src/axi_hier_interco.sv # Level 2 - hardware/src/cachepool_tile.sv # Level 3 - hardware/src/cachepool_group.sv + - hardware/src/cachepool_group_noc_wrapper.sv - hardware/src/cachepool_cluster.sv # Level 4 diff --git a/Makefile b/Makefile index 588cf0f..9bd24f0 100644 --- a/Makefile +++ b/Makefile @@ -67,7 +67,7 @@ CACHE_PATH := $(shell [ -x "$(BENDER)" ] && $(BENDER) path insitu-cac # Configurations CFG_DIR ?= 
${CACHEPOOL_DIR}/config -config ?= cachepool_512 +config ?= cachepool_fpu_2g # Compiler choice for SW cmake COMPILER ?= llvm @@ -163,6 +163,30 @@ $(BOOTROM_DIR)/bootrom.sv: $(BOOTROM_DIR)/bootrom.bin $(BOOTROM_DIR)/bootdata.cc ${PYTHON} $(SCRIPTS_DIR)/generate_bootrom.py \ $< -c $(HJSON_OUT) --output $@ +########### +# FlooNoC # +########### +FLOO_DIR ?= $(shell $(BENDER_INSTALL_DIR)/bender path floo_noc) +FLOO_GEN_OUTDIR ?= $(ROOT_DIR)/hardware/generated +FLOO_CFG ?= $(ROOT_DIR)/config/floonoc_cachepool_4g.yml +FLOO_NAME = cachepool +FLOO_NOC ?= $(FLOO_GEN_OUTDIR)/floo_$(FLOO_NAME)_noc_pkg.sv + +$(info FLOO_DIR: $(FLOO_DIR)) + +# Generates the sources for FlooNoC +.PHONY: update-floonoc install-floogen clean-floonoc +install-floogen: + pip install -e $(FLOO_DIR) --quiet + +update-floonoc: $(FLOO_NOC) +$(FLOO_NOC): install-floogen $(FLOO_CFG) + mkdir -p $(FLOO_GEN_OUTDIR) + PATH="$(HOME)/.local/bin:$(PATH)" floogen pkg -c $(FLOO_CFG) -o $(FLOO_GEN_OUTDIR) --no-format + +clean-floonoc: + rm -f $(FLOO_NOC) + ########### # DramSys # ########### @@ -232,33 +256,32 @@ VLOG_FLAGS += -64 VLOG_DEFS = -DCACHEPOOL # Cluster configuration +VLOG_DEFS += -DNUM_GROUPS=$(num_groups) +VLOG_DEFS += -DNUM_GROUPS_X=$(num_groups_x) VLOG_DEFS += -DNUM_TILES=$(num_tiles) VLOG_DEFS += -DNUM_CORES=$(num_cores) VLOG_DEFS += -DDATA_WIDTH=$(data_width) VLOG_DEFS += -DADDR_WIDTH=$(addr_width) # Tile configuration -VLOG_DEFS += -DNUM_CORES_PER_TILE=$(num_cores_per_tile) VLOG_DEFS += -DREFILL_DATA_WIDTH=$(refill_data_width) # L1 Data Cache VLOG_DEFS += -DL1D_CACHELINE_WIDTH=$(l1d_cacheline_width) -VLOG_DEFS += -DL1D_SIZE=$(l1d_size) -VLOG_DEFS += -DL1D_BANK_FACTOR=$(l1d_bank_factor) VLOG_DEFS += -DL1D_COAL_WINDOW=$(l1d_coal_window) VLOG_DEFS += -DL1D_NUM_WAY=$(l1d_num_way) -VLOG_DEFS += -DL1D_TILE_SIZE=$(l1d_tile_size) VLOG_DEFS += -DL1D_TAG_DATA_WIDTH=$(l1d_tag_data_width) VLOG_DEFS += -DL1D_NUM_BANKS=$(l1d_num_banks) VLOG_DEFS += -DL1D_DEPTH=$(l1d_depth) # CachePool CC / core cluster 
-VLOG_DEFS += -DSPATZ_FPU_EN=$(spatz_fpu_en) VLOG_DEFS += -DSPATZ_NUM_FPU=$(spatz_num_fpu) VLOG_DEFS += -DSPATZ_NUM_IPU=$(spatz_num_ipu) VLOG_DEFS += -DSPATZ_MAX_TRANS=$(spatz_max_trans) VLOG_DEFS += -DSNITCH_MAX_TRANS=$(snitch_max_trans) VLOG_DEFS += -DREMOTE_PORT_PER_CORE=$(num_remote_ports_per_tile) +VLOG_DEFS += -DRG_PORT_PER_CORE=$(num_rg_ports_per_core) +VLOG_DEFS += -DNOC_PORT_PER_TILE=$(num_noc_ports_per_tile) # AXI configuration VLOG_DEFS += -DAXI_USER_WIDTH=$(axi_user_width) @@ -268,14 +291,12 @@ VLOG_DEFS += -DL2_CHANNEL=$(l2_channel) VLOG_DEFS += -DL2_BANK_WIDTH=$(l2_bank_width) VLOG_DEFS += -DL2_INTERLEAVE=$(l2_interleave) -# Peripherals / memory map -VLOG_DEFS += -DSTACK_ADDR=$(stack_addr) +# Stack / SPM (boot_addr, stack_addr, periph_start_addr, uart_addr used by hjson +# generator via environment; not consumed as SV defines) VLOG_DEFS += -DSTACK_HW_SIZE=$(stack_hw_size) VLOG_DEFS += -DSTACK_HW_DEPTH=$(stack_hw_depth) VLOG_DEFS += -DSTACK_TOT_SIZE=$(stack_tot_size) -VLOG_DEFS += -DPERIPH_START_ADDR=$(periph_start_addr) -VLOG_DEFS += -DBOOT_ADDR=$(boot_addr) -VLOG_DEFS += -DUART_ADDR=$(uart_addr) +VLOG_DEFS += -DSTACK_TOT_DEPTH=$(stack_tot_depth) ENABLE_CACHEPOOL_TESTS ?= 1 @@ -365,6 +386,9 @@ help: @echo "*generate*: generate the Spatz package and opcodes, and the cluster config HJSON" @echo "*cache-init*: source the insitu-cache environment (requires bender checkout)" @echo "*bootrom*: compile and generate the bootrom SystemVerilog module" + @echo "*update-floonoc*: regenerate FlooNoC package from FLOO_CFG (run after changing group count)" + @echo "*install-floogen*: install the floogen Python tool (required by update-floonoc)" + @echo "*clean-floonoc*: remove the generated FlooNoC package" @echo "" @echo "DRAMSys:" @echo "" diff --git a/README.md b/README.md index 48c83e9..d00a967 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,11 @@ CachePool is a Snitch–Spatz–based many-core system with a shared L1 data cac | Level | Module | Description | 
|-------|--------|-------------| | 1 | Core Complex (CC) | One 32-bit Snitch + one Spatz RVV accelerator | -| 2 | Tile | 4 CCs + 4 × 64 KiB 4-way InSitu-Cache banks | -| 3 | Group | 4 Tiles connected via crossbar | -| 4 | Cluster (WIP) | Multiple Groups connected via NoC (currently one Group) | +| 2 | Tile | 4 CCs + 4 × InSitu-Cache banks | +| 3 | Group | 4 Tiles connected via crossbar + shared L2 ICache | +| 4 | Cluster | Multiple Groups connected via FlooNoC XY mesh | -All tiles in a cluster share one unified L1 cache, interleaved across cache banks. The bank-selection offset is configurable at runtime via `l1d_xbar_config(...)`. +All tiles across all groups share one unified L1 data cache, interleaved across cache banks. The bank-selection offset is configurable at runtime via `l1d_xbar_config(...)`. ## Requirements @@ -68,10 +68,10 @@ make dram-build CMAKE=/path/to/cmake-3.28.x CC=/path/to/gcc-11.2 CXX=/path/to/g+ ### Generate Required RTL Some RTL components (e.g., package headers) must be generated prior to simulation. -Generation requires specifying a **configuration**. If none is provided, the default is `cachepool_512`. +Generation requires specifying a **configuration**. If none is provided, the default is `cachepool_fpu_2g`.
```bash -make generate config=cachepool_fpu_512 +make generate config=cachepool_fpu_2g ``` ### Build the BootROM @@ -79,7 +79,7 @@ make generate config=cachepool_fpu_512 The BootROM is built separately from the RTL generation step: ```bash -make bootrom config=cachepool_fpu_512 +make bootrom config=cachepool_fpu_2g ``` ### Compilation and Simulation @@ -87,13 +87,13 @@ make bootrom config=cachepool_fpu_512 #### Build Software Only ```bash -make sw config=cachepool_fpu_512 +make sw config=cachepool_fpu_2g ``` #### Build Hardware + Software (QuestaSim) ```bash -make vsim config=cachepool_fpu_512 +make vsim config=cachepool_fpu_2g ``` #### Run the Simulation @@ -125,7 +125,7 @@ A lightweight benchmarking automation flow is provided under `util/auto-benchmar 1. Edit `configs.sh` to list the desired configurations and kernels: - CONFIGS="cachepool_fpu_512 cachepool_fpu_256 cachepool_fpu_128" + CONFIGS="cachepool_fpu_2g cachepool_fpu_4g" KERNELS="fdotp-32b_M32768 ffft-64b_M16384 fmatmul-64b_M2048" PREFIX="test-cachepool-" ROOT_PATH=../.. @@ -147,10 +147,10 @@ A lightweight benchmarking automation flow is provided under `util/auto-benchmar Example directory after a run: logs/20251028-1230/ - ├── cachepool_fpu_512_fdotp-32b_M32768.log - ├── cachepool_fpu_512_fdotp-32b_M32768_pm/ - ├── cachepool_fpu_512_summary.txt - ├── cachepool_fpu_256_summary.txt + ├── cachepool_fpu_2g_fdotp-32b_M32768.log + ├── cachepool_fpu_2g_fdotp-32b_M32768_pm/ + ├── cachepool_fpu_2g_summary.txt + ├── cachepool_fpu_4g_summary.txt └── ... Each run includes: @@ -170,46 +170,45 @@ This setup allows quick reproducible benchmarks with all results neatly organize Usage: ```bash -python3 check_ci.py logs/latest/cachepool_fpu_512_load-store.log +python3 check_ci.py logs/latest/cachepool_fpu_2g_load-store.log ``` Exit code 0 means all tests passed; exit code 1 means at least one failure was detected. On failure the offending lines and their line numbers are printed for manual inspection. 
## Configurations -All hardware knobs live in **`config/config.mk`** (and flavor files it includes). The default configuration is **4 tiles, 16 cores**. +All hardware knobs live in **`config/config.mk`** (and flavor files it includes). The default configuration is **2 groups, 4 tiles/group, 4 cores/tile = 32 cores total**. -| Flavor file | Description | -|-------------|-------------| -| `cachepool.mk` | No floating-point support | -| `cachepool_fpu.mk` | Enables single/half precision in the Spatz vector core | +Configuration names encode the number of groups and whether the FPU is enabled: -Available named configurations (passed as `config=`): - -| Name | Cacheline | FPU | -|------|-----------|-----| -| `cachepool_512` | 512b | No | -| `cachepool_128` | 128b | No | -| `cachepool_fpu_512` | 512b | Yes | -| `cachepool_fpu_256` | 256b | Yes | -| `cachepool_fpu_128` | 128b | Yes | +| Name | Groups | Mesh | FPU | Cores | +|------|--------|------|-----|-------| +| `cachepool_2g` | 2 | 1×2 | No | 32 | +| `cachepool_fpu_2g` | 2 | 1×2 | Yes | 32 | +| `cachepool_4g` | 4 | 2×2 | No | 64 | +| `cachepool_fpu_4g` | 4 | 2×2 | Yes | 64 | +| `cachepool_fpu_16g` | 16 | 4×4 | Yes | 256 | The Spatz cluster consumes **`config/cachepool.hjson`**, which is **generated** from: - `config/cachepool.hjson.tmpl` (skeleton with comments) - `config/config.mk` (source of truth) -To switch flavors, set `config=` (or export `CACHEPOOL_CONFIGURATION=`), then rebuild: +Multi-group configurations also require a FlooNoC topology file (e.g. `config/floonoc_cachepool_4g.yml`). After changing the group count, regenerate the FlooNoC package: ```bash -make clean -make generate config=cachepool_fpu_512 +make update-floonoc ``` -> `make clean` is recommended when changing configurations. +To switch configurations, always clean first: + +```bash +make clean +make generate config=cachepool_fpu_2g +``` ### How configuration flows -1. 
**`config/config.mk`** defines all parameters (e.g., `num_tiles`, `num_cores`, `l1d_cacheline_width`, `axi_user_width`, addresses, etc.). Derived values (like `axi_user_width`) are pre-computed so tools receive integers, not expressions. +1. **`config/config.mk`** defines all parameters (e.g., `num_groups`, `num_groups_x`, `num_tiles_per_group`, `num_cores_per_tile`, `l1d_cacheline_width`, `axi_user_width`, etc.). Derived values are pre-computed so tools receive integers, not expressions. 2. `make generate` calls the Python generator to produce **`config/cachepool.hjson`** from the template. 3. The Makefile passes the same values to **QuestaSim** via `VLOG_DEFS`, keeping RTL, sim, and HJSON in sync. @@ -319,7 +318,7 @@ Cluster peripherals (including the BootROM and memory-mapped registers) are inst SpyGlass lint (optional): ```bash -make lint config=cachepool_fpu_512 +make lint config=cachepool_fpu_2g ``` --- @@ -328,6 +327,7 @@ make lint config=cachepool_fpu_512 - To see the exact macros passed to vlog, check `VLOG_DEFS` in the Makefile and `sim/work/compile.vsim.tcl`. - If you change cacheline width, `AXI_USER_WIDTH` is derived (supported widths: 128→19, 256→18, 512→17). Unsupported widths error out at generation time. -- Use `make clean` when switching flavors/configs to prevent stale build artifacts. +- When changing the number of groups, run `make update-floonoc` to regenerate the FlooNoC package before `make generate`. +- Use `make clean` when switching configs to prevent stale build artifacts. - Runtime functions `snrt_tile_id()` and `snrt_num_tiles()` are available to query tile topology from software. - Changing the partition mode or boundary address while the cache holds valid data requires a flush (`l1d_flush()` or the appropriate partition flush) before reconfiguring. 
diff --git a/config/cachepool.hjson b/config/cachepool.hjson index 2ac0947..652b4b7 100644 --- a/config/cachepool.hjson +++ b/config/cachepool.hjson @@ -53,11 +53,11 @@ register_offload_rsp: true }, - nr_tiles: 4, + nr_tiles: 8, - // Repeat the compute core template N times (driven by 16) + // Repeat the compute core template N times (driven by 32) cores: [ - { $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" } + { $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: 
"#/compute_core_template" },{ $ref: "#/compute_core_template" },{ $ref: "#/compute_core_template" } ], icache: { diff --git a/config/cachepool_512.mk b/config/cachepool_2g.mk similarity index 87% rename from config/cachepool_512.mk rename to config/cachepool_2g.mk index 6d04a68..271eedb 100644 --- a/config/cachepool_512.mk +++ b/config/cachepool_2g.mk @@ -8,11 +8,17 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 2 + +# 1×2 mesh +num_groups_x ?= 1 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,14 +26,17 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 1 + ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 @@ -36,9 +45,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -48,7 +54,7 @@ l1d_coal_window ?= 2 # L1 data cache number of ways per l1d_num_way ?= 4 -# L1 data cache size **per tile** (KiB) +# L1 data cache size per tile (KiB) l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) diff --git a/config/cachepool_128.mk b/config/cachepool_4g.mk similarity index 85% rename from config/cachepool_128.mk rename to config/cachepool_4g.mk index df52dab..8b1b300 100644 --- a/config/cachepool_128.mk +++ b/config/cachepool_4g.mk @@ -8,11 +8,17 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 4 + +# 2×2 mesh +num_groups_x ?= 2 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,24 +26,24 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 +num_remote_ports_per_tile ?= 4 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 4 + ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 ##### L1 Data Cache ##### # L1 data cacheline width (in Bit) -l1d_cacheline_width ?= 128 - -# L1 data cache size (in KiB) -l1d_size ?= 256 +l1d_cacheline_width ?= 512 # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -52,7 +58,7 @@ l1d_num_way ?= 4 l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) -l1d_tag_data_width ?= 52 +l1d_tag_data_width ?= 92 #################### ## CachePool CC ## @@ -77,7 +83,7 @@ snitch_max_trans ?= 16 ## L2 Main Memory ## ##################### # L2 number of channels -l2_channel ?= 4 +l2_channel ?= 8 # L2 bank width (DRAM width, change with care) l2_bank_width ?= 512 diff --git a/config/cachepool_fpu_128.mk b/config/cachepool_fpu_16g.mk similarity index 84% rename from config/cachepool_fpu_128.mk rename to config/cachepool_fpu_16g.mk index e60aad4..8cf8445 100644 --- a/config/cachepool_fpu_128.mk +++ b/config/cachepool_fpu_16g.mk @@ -8,11 +8,17 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 16 + +# 4×4 mesh +num_groups_x ?= 4 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,30 +26,30 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 2 + ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 ##### L1 Data Cache ##### # L1 data cacheline width (in Bit) -l1d_cacheline_width ?= 128 - -# L1 data cache size (in KiB) -l1d_size ?= 256 +l1d_cacheline_width ?= 512 # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 # L1 coalecsing window -l1d_coal_window ?= 1 +l1d_coal_window ?= 2 # L1 data cache number of ways per l1d_num_way ?= 4 @@ -52,7 +58,7 @@ l1d_num_way ?= 4 l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) -l1d_tag_data_width ?= 64 +l1d_tag_data_width ?= 92 #################### ## CachePool CC ## @@ -77,7 +83,7 @@ snitch_max_trans ?= 16 ## L2 Main Memory ## ##################### # L2 number of channels -l2_channel ?= 4 +l2_channel ?= 16 # L2 bank width (DRAM width, change with care) l2_bank_width ?= 512 diff --git a/config/cachepool_fpu_512.mk b/config/cachepool_fpu_2g.mk similarity index 88% rename from config/cachepool_fpu_512.mk rename to config/cachepool_fpu_2g.mk index 2e4c3ca..efb23fa 100644 --- a/config/cachepool_fpu_512.mk +++ b/config/cachepool_fpu_2g.mk @@ -8,11 +8,17 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 2 + +# 1×2 mesh +num_groups_x ?= 1 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,16 +26,17 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 -num_remote_ports_per_tile ?= 2 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 1 ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 @@ -38,9 +45,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -79,7 +83,7 @@ snitch_max_trans ?= 16 ## L2 Main Memory ## ##################### # L2 number of channels -l2_channel ?= 4 +l2_channel ?= 8 # L2 bank width (DRAM width, change with care) l2_bank_width ?= 512 diff --git a/config/cachepool_fpu_256.mk b/config/cachepool_fpu_4g.mk similarity index 84% rename from config/cachepool_fpu_256.mk rename to config/cachepool_fpu_4g.mk index 279dc80..90a6af8 100644 --- a/config/cachepool_fpu_256.mk +++ b/config/cachepool_fpu_4g.mk @@ -8,11 +8,17 @@ ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 4 + +# 2×2 mesh +num_groups_x ?= 2 + # Number of tiles -num_tiles ?= 4 +num_tiles_per_group ?= 4 # Number of cores -num_cores ?= 16 +num_cores_per_tile ?= 4 # Core datawidth data_width ?= 32 @@ -20,24 +26,24 @@ data_width ?= 32 # Core addrwidth addr_width ?= 32 +num_remote_ports_per_tile ?= 1 + +num_rg_ports_per_core ?= 1 + +num_noc_ports_per_tile ?= 2 + ###################### ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 ##### L1 Data Cache ##### # L1 data cacheline width (in Bit) -l1d_cacheline_width ?= 256 - -# L1 data cache size (in KiB) -l1d_size ?= 256 +l1d_cacheline_width ?= 512 # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -52,7 +58,7 @@ l1d_num_way ?= 4 l1d_tile_size ?= 256 # L1 data cache tag width (TODO: should be calcualted) -l1d_tag_data_width ?= 64 +l1d_tag_data_width ?= 92 #################### ## CachePool CC ## @@ -77,13 +83,13 @@ snitch_max_trans ?= 16 ## L2 Main Memory ## ##################### # L2 number of channels -l2_channel ?= 4 +l2_channel ?= 8 # L2 bank width (DRAM width, change with care) l2_bank_width ?= 512 # L2 interleaving factor (in order of bank_width) -l2_interleave ?= 8 +l2_interleave ?= 16 ################## diff --git a/config/config.mk b/config/config.mk index 9eee8cb..1164e89 100644 --- a/config/config.mk +++ b/config/config.mk @@ -26,13 +26,25 @@ include $(CACHEPOOL_DIR)/config/$(config).mk ## CachePool Cluster ## ######################### +# Number of groups +num_groups ?= 1 + +# X dimension of the group mesh (Y = num_groups / num_groups_x) +num_groups_x ?= 1 + # Number of tiles -num_tiles ?= 1 +num_tiles_per_group ?= 4 +num_tiles = $(shell echo $$(( $(num_groups) * $(num_tiles_per_group)))) num_remote_ports_per_tile ?= 1 # Number of cores -num_cores ?= 4 +num_cores_per_tile ?= 4 +num_cores ?= $(shell echo $$(( $(num_tiles) * $(num_cores_per_tile)))) + +num_rg_ports_per_core ?= 0 + +num_noc_ports_per_tile ?= 1 # Core datawidth data_width ?= 32 @@ -45,9 +57,6 @@ addr_width ?= 32 ## CachePool Tile ## ###################### -# Number of cores per CachePool tile -num_cores_per_tile ?= 4 - # Refill interconnection data width refill_data_width ?= 128 @@ -56,9 +65,6 @@ refill_data_width ?= 128 # L1 data cacheline width (in Bit) l1d_cacheline_width ?= 512 -# L1 data cache size (in KiB) -l1d_size ?= 256 - # L1 data cache banking factor (how many banks per core?) 
l1d_bank_factor ?= 1 @@ -116,6 +122,15 @@ endif ##################### ## L2 Main Memory ## ##################### + +# DRAM base address and size (hex: 0x8000_0000, 0x2000_0000) +dram_addr ?= 2147483648 +dram_len ?= 536870912 + +# Uncached region base address and size (hex: 0xC000_0000, 0x2000_0000) +uncached_addr ?= 3221225472 +uncached_len ?= 536870912 + # L2 number of channels l2_channel ?= 4 diff --git a/config/floonoc_cachepool_4g.yml b/config/floonoc_cachepool_4g.yml new file mode 100644 index 0000000..2c81e28 --- /dev/null +++ b/config/floonoc_cachepool_4g.yml @@ -0,0 +1,89 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +name: cachepool +description: "CachePool AXI NoC" +network_type: "axi" + +routing: + route_algo: "SRC" + use_id_table: true + +protocols: + - name: "wide_in" + type: "wide" + protocol: "AXI4" + data_width: 256 + addr_width: 32 + id_width: 2 + user_width: 1 + - name: "wide_out" + type: "wide" + protocol: "AXI4" + data_width: 256 + addr_width: 32 + id_width: 2 + user_width: 1 + +endpoints: + - name: "group" + array: [2, 2] + mgr_port_protocol: + - "wide_in" + - name: "hbm" + array: [4] + addr_range: + base: 0x8000_0000 + size: 0x0010_0000 + sbr_port_protocol: + - "wide_out" + - name: "host_peri" + addr_range: + - start: 0x0000_0000 + end: 0x7FFF_FFFF + - start: 0xA000_0000 + end: 0xC000_FFFF + mgr_port_protocol: + - "wide_in" + sbr_port_protocol: + - "wide_out" + +routers: + - name: "group_router" + array: [2, 2] + degree: 5 + +connections: + - src: "group" + dst: "group_router" + src_range: + - [0, 1] + - [0, 1] + dst_range: + - [0, 1] + - [0, 1] + dst_dir: "Eject" + # HBM West + - src: "hbm" + dst: "group_router" + src_range: + - [0, 1] + dst_range: + - [0, 0] + - [0, 1] + dst_dir: "West" + # HBM East + - src: "hbm" + dst: "group_router" + src_range: + - [2, 3] + dst_range: + - [1, 1] + - [0, 1] + dst_dir: "East" + 
# Special + - src: "host_peri" + dst: "group_router" + dst_idx: [0, 0] + dst_dir: "South" diff --git a/hardware/bootrom/bootdata.cc b/hardware/bootrom/bootdata.cc deleted file mode 100644 index f96b8ba..0000000 --- a/hardware/bootrom/bootdata.cc +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -#include - -namespace sim { - -const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 16, - .hartid_base = 0, - .tcdm_start = 0xbffff800, - .tcdm_size = 0x800, - .tcdm_offset = 0x0, - .global_mem_start = 0x80000000, - .global_mem_end = 0xa0000000, - .tile_count = 4}; - -} // namespace sim diff --git a/hardware/bootrom/bootdata_bootrom.cc b/hardware/bootrom/bootdata_bootrom.cc deleted file mode 100644 index d578d55..0000000 --- a/hardware/bootrom/bootdata_bootrom.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -#include - -// The boot data generated along with the system RTL. 
-struct BootData { - uint32_t boot_addr; - uint32_t core_count; - uint32_t hartid_base; - uint32_t tcdm_start; - uint32_t tcdm_size; - uint32_t tcdm_offset; - uint64_t global_mem_start; - uint64_t global_mem_end; - uint32_t tile_count; -}; - -extern "C" const BootData BOOTDATA = {.boot_addr = 0x1000, - .core_count = 16, - .hartid_base = 0, - .tcdm_start = 0xbffff800, - .tcdm_size = 0x800, - .tcdm_offset = 0x0, - .global_mem_start = 0x80000000, - .global_mem_end = 0xa0000000, - .tile_count = 4}; diff --git a/hardware/bootrom/bootrom.bin b/hardware/bootrom/bootrom.bin deleted file mode 100755 index d4a9322..0000000 Binary files a/hardware/bootrom/bootrom.bin and /dev/null differ diff --git a/hardware/bootrom/bootrom.dump b/hardware/bootrom/bootrom.dump deleted file mode 100644 index dad90e3..0000000 --- a/hardware/bootrom/bootrom.dump +++ /dev/null @@ -1,127 +0,0 @@ - -/scratch2/diyou/cachepool/ManyRVData/hardware/bootrom/bootrom.elf: file format elf32-littleriscv - - -Disassembly of section .text: - -00001000 <_start>: - 1000: 00000317 auipc t1,0x0 - 1004: 07832303 lw t1,120(t1) # 1078 <_GLOBAL_OFFSET_TABLE_+0x4> - 1008: 30531073 csrw mtvec,t1 - 100c: f1402573 csrr a0,mhartid - 1010: 00000597 auipc a1,0x0 - 1014: 06c5a583 lw a1,108(a1) # 107c <_GLOBAL_OFFSET_TABLE_+0x8> - 1018: 3047d073 csrwi mie,15 - 101c: 10500073 wfi - 1020: 00c5a383 lw t2,12(a1) - 1024: 0105ae03 lw t3,16(a1) - 1028: 01c383b3 add t2,t2,t3 - 102c: 02038393 addi t2,t2,32 - 1030: 0003a383 lw t2,0(t2) - 1034: 00038067 jr t2 - -00001038 : - 1038: 10500073 wfi - 103c: ffdff06f j 1038 - -Disassembly of section .rodata: - -00001040 : - 1040: 1000 .2byte 0x1000 - 1042: 0000 .2byte 0x0 - 1044: 0010 .2byte 0x10 - 1046: 0000 .2byte 0x0 - 1048: 0000 .2byte 0x0 - 104a: 0000 .2byte 0x0 - 104c: f800 .2byte 0xf800 - 104e: bfff .2byte 0xbfff - 1050: 0800 .2byte 0x800 - ... 
- 105a: 8000 .2byte 0x8000 - 105c: 0000 .2byte 0x0 - 105e: 0000 .2byte 0x0 - 1060: 0000 .2byte 0x0 - 1062: a000 .2byte 0xa000 - 1064: 0000 .2byte 0x0 - 1066: 0000 .2byte 0x0 - 1068: 0004 .2byte 0x4 - 106a: 0000 .2byte 0x0 - 106c: 0000 .2byte 0x0 - ... - -Disassembly of section .boot_section: - -00001070 : - 1070: 1038 .2byte 0x1038 - ... - -Disassembly of section .got: - -00001074 <_GLOBAL_OFFSET_TABLE_>: - 1074: 0000 .2byte 0x0 - 1076: 0000 .2byte 0x0 - 1078: 1038 .2byte 0x1038 - 107a: 0000 .2byte 0x0 - 107c: 1040 .2byte 0x1040 - ... - -Disassembly of section .got.plt: - -00001080 <.got.plt>: - 1080: ffff .2byte 0xffff - 1082: ffff .2byte 0xffff - 1084: 0000 .2byte 0x0 - ... - -Disassembly of section .riscv.attributes: - -00000000 <.riscv.attributes>: - 0: 4341 .2byte 0x4341 - 2: 0000 .2byte 0x0 - 4: 7200 .2byte 0x7200 - 6: 7369 .2byte 0x7369 - 8: 01007663 bgeu zero,a6,14 <_start-0xfec> - c: 0039 .2byte 0x39 - e: 0000 .2byte 0x0 - 10: 1004 .2byte 0x1004 - 12: 7205 .2byte 0x7205 - 14: 3376 .2byte 0x3376 - 16: 6932 .2byte 0x6932 - 18: 7032 .2byte 0x7032 - 1a: 5f31 .2byte 0x5f31 - 1c: 326d .2byte 0x326d - 1e: 3070 .2byte 0x3070 - 20: 615f 7032 5f31 .byte 0x5f, 0x61, 0x32, 0x70, 0x31, 0x5f - 26: 3266 .2byte 0x3266 - 28: 3270 .2byte 0x3270 - 2a: 7a5f 6369 7273 .byte 0x5f, 0x7a, 0x69, 0x63, 0x73, 0x72 - 30: 7032 .2byte 0x7032 - 32: 5f30 .2byte 0x5f30 - 34: 697a .2byte 0x697a - 36: 6566 .2byte 0x6566 - 38: 636e .2byte 0x636e - 3a: 6965 .2byte 0x6965 - 3c: 7032 .2byte 0x7032 - 3e: 0030 .2byte 0x30 - 40: 0108 .2byte 0x108 - 42: 0b0a .2byte 0xb0a - -Disassembly of section .comment: - -00000000 <.comment>: - 0: 3a434347 .4byte 0x3a434347 - 4: 2820 .2byte 0x2820 - 6: 736f7263 bgeu t5,s6,72a <_start-0x8d6> - a: 6f6f7473 csrrci s0,0x6f6,30 - e: 2d6c .2byte 0x2d6c - 10: 474e .2byte 0x474e - 12: 3120 .2byte 0x3120 - 14: 322e .2byte 0x322e - 16: 2e35 .2byte 0x2e35 - 18: 2e30 .2byte 0x2e30 - 1a: 3538 .2byte 0x3538 - 1c: 365f 6331 6334 .byte 0x5f, 0x36, 0x31, 0x63, 0x34, 0x63 - 22: 
20296163 bltu s2,sp,224 <_start-0xddc> - 26: 2e39 .2byte 0x2e39 - 28: 2e35 .2byte 0x2e35 - 2a: 0030 .2byte 0x30 diff --git a/hardware/bootrom/bootrom.elf b/hardware/bootrom/bootrom.elf deleted file mode 100755 index 8c26b6e..0000000 Binary files a/hardware/bootrom/bootrom.elf and /dev/null differ diff --git a/hardware/bootrom/bootrom.sv b/hardware/bootrom/bootrom.sv deleted file mode 100644 index c3b8995..0000000 --- a/hardware/bootrom/bootrom.sv +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2023 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 -// -// Description: Automatically generated bootrom -// -// Generated by util/scripts/generate_bootrom.py - -module bootrom #( - /* Automatically generated. DO NOT CHANGE! */ - parameter int unsigned DataWidth = 128, - parameter int unsigned AddrWidth = 32 -) ( - input logic clk_i, - input logic req_i, - input logic [AddrWidth-1:0] addr_i, - output logic [DataWidth-1:0] rdata_o -); - localparam int RomSize = 8; - localparam int AddrBits = RomSize > 1 ? $clog2(RomSize) : 1; - - const logic [RomSize-1:0][DataWidth-1:0] mem = { - 128'h00001040000010380000000000001038, - 128'h000000000000000400000000a0000000, - 128'h00000000800000000000000000000800, - 128'hbffff800000000000000001000001000, - 128'hffdff06f10500073000380670003a383, - 128'h0203839301c383b30105ae0300c5a383, - 128'h105000733047d07306c5a58300000597, - 128'hf1402573305310730783230300000317 - }; - - logic [AddrBits-1:0] addr_q; - - always_ff @(posedge clk_i) begin - if (req_i) begin - addr_q <= addr_i[AddrBits-1+4:4]; - end - end - - // this prevents spurious Xes from propagating into - // the speculative fetch stage of the core - assign rdata_o = (addr_q < RomSize) ? 
mem[addr_q] : '0; -endmodule diff --git a/hardware/cachepool_peripheral/cachepool_peripheral.sv b/hardware/cachepool_peripheral/cachepool_peripheral.sv index 6326cfa..d539b76 100644 --- a/hardware/cachepool_peripheral/cachepool_peripheral.sv +++ b/hardware/cachepool_peripheral/cachepool_peripheral.sv @@ -89,7 +89,7 @@ module cachepool_peripheral //////////// L1 DCache //////////// logic [NumPerfCounters-1:0][47:0] perf_counter_d, perf_counter_q; - logic [31:0] cl_clint_d, cl_clint_q; + logic [NrCores-1:0] cl_clint_d, cl_clint_q; logic [9:0] l1d_spm_size_d, l1d_spm_size_q; logic [3:0] l1d_private_d, l1d_private_q; addr_t private_start_addr_d, private_start_addr_q; @@ -172,7 +172,7 @@ module cachepool_peripheral end `FF(private_start_addr_q, private_start_addr_d, 32'hA000_0000, clk_i, rst_ni) - `FF(l1d_private_q, l1d_private_d, '0, clk_i, rst_ni) + `FF(l1d_private_q, l1d_private_d, 0, clk_i, rst_ni) `FF(l1d_lock_q, l1d_lock_d, '0, clk_i, rst_ni) // To show if the current flush/invalidation is complete assign hw2reg.l1d_flush_status.d = (l1d_lock_q != '0); diff --git a/hardware/generated/floo_cachepool_noc_pkg.sv b/hardware/generated/floo_cachepool_noc_pkg.sv new file mode 100644 index 0000000..e475728 --- /dev/null +++ b/hardware/generated/floo_cachepool_noc_pkg.sv @@ -0,0 +1,240 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// AUTOMATICALLY GENERATED! DO NOT EDIT! 
+ +`include "axi/typedef.svh" +`include "floo_noc/typedef.svh" + +package floo_cachepool_noc_pkg; + + import floo_pkg::*; + + ///////////////////// + // Address Map // + ///////////////////// + + typedef enum logic[3:0] { + GroupX0Y0 = 0, + GroupX0Y1 = 1, + GroupX1Y0 = 2, + GroupX1Y1 = 3, + Hbm0 = 4, + Hbm1 = 5, + Hbm2 = 6, + Hbm3 = 7, + HostPeri = 8, + NumEndpoints = 9} ep_id_e; + + + + typedef enum logic[2:0] { + Hbm0SamIdx = 0, + Hbm1SamIdx = 1, + Hbm2SamIdx = 2, + Hbm3SamIdx = 3, + HostPeriSamIdx = 5} sam_idx_e; + + + + typedef logic[0:0] rob_idx_t; +typedef logic[0:0] port_id_t; +typedef logic[3:0] id_t; +typedef logic[8:0] route_t; + + + typedef struct packed { + id_t idx; + id_t start_addr; + id_t end_addr; + } route_map_rule_t; + + localparam int unsigned SamNumRules = 6; + +typedef struct packed { + id_t idx; + logic [31:0] start_addr; + logic [31:0] end_addr; +} sam_rule_t; + +localparam sam_rule_t[SamNumRules-1:0] Sam = '{ +'{ idx: 8, + start_addr: 32'h00000000, + end_addr: 32'h7fffffff},// HostPeri +'{ idx: 8, + start_addr: 32'ha0000000, + end_addr: 32'hc000ffff},// HostPeri +'{ idx: 7, + start_addr: 32'h80300000, + end_addr: 32'h80400000},// Hbm3 +'{ idx: 6, + start_addr: 32'h80200000, + end_addr: 32'h80300000},// Hbm2 +'{ idx: 5, + start_addr: 32'h80100000, + end_addr: 32'h80200000},// Hbm1 +'{ idx: 4, + start_addr: 32'h80000000, + end_addr: 32'h80100000} // Hbm0 + +}; + + + localparam route_t[NumEndpoints-1:0][NumEndpoints-1:0] RoutingTables = '{ +'{ +9'b000000000,// -> host_peri_ni +9'b001001000,// -> hbm_ni_3 +9'b000001001,// -> hbm_ni_2 +9'b000011000,// -> hbm_ni_1 +9'b000000011,// -> hbm_ni_0 +9'b100001000,// -> group_ni_1_1 +9'b000100001,// -> group_ni_1_0 +9'b000100000,// -> group_ni_0_1 +9'b000000100 // -> group_ni_0_0 +}, +'{ +9'b010010011,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b000000100,// -> group_ni_1_1 +9'b000100010,// -> group_ni_1_0 
+9'b000100011,// -> group_ni_0_1 +9'b100010011 // -> group_ni_0_0 +}, +'{ +9'b000010011,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b000100000,// -> group_ni_1_1 +9'b000000100,// -> group_ni_1_0 +9'b100000011,// -> group_ni_0_1 +9'b000100011 // -> group_ni_0_0 +}, +'{ +9'b000010010,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b000100001,// -> group_ni_1_1 +9'b100001010,// -> group_ni_1_0 +9'b000000100,// -> group_ni_0_1 +9'b000100010 // -> group_ni_0_0 +}, +'{ +9'b000000010,// -> host_peri_ni +9'b000000000,// -> hbm_ni_3 +9'b000000000,// -> hbm_ni_2 +9'b000000000,// -> hbm_ni_1 +9'b000000000,// -> hbm_ni_0 +9'b100001000,// -> group_ni_1_1 +9'b000100001,// -> group_ni_1_0 +9'b000100000,// -> group_ni_0_1 +9'b000000100 // -> group_ni_0_0 +}, +'{ +9'b010010011,// -> host_peri_ni +9'b000000001,// -> hbm_ni_3 +9'b000001010,// -> hbm_ni_2 +9'b000011011,// -> hbm_ni_1 +9'b011010011,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 +9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}, +'{ +9'b000010011,// -> host_peri_ni +9'b000001000,// -> hbm_ni_3 +9'b000000001,// -> hbm_ni_2 +9'b011000011,// -> hbm_ni_1 +9'b000011011,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 +9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}, +'{ +9'b000010010,// -> host_peri_ni +9'b000001001,// -> hbm_ni_3 +9'b001001010,// -> hbm_ni_2 +9'b000000011,// -> hbm_ni_1 +9'b000011010,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 +9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}, +'{ +9'b000000010,// -> host_peri_ni +9'b001001000,// -> hbm_ni_3 +9'b000001001,// -> hbm_ni_2 +9'b000011000,// -> hbm_ni_1 +9'b000000011,// -> hbm_ni_0 +9'b000000000,// -> group_ni_1_1 
+9'b000000000,// -> group_ni_1_0 +9'b000000000,// -> group_ni_0_1 +9'b000000000 // -> group_ni_0_0 +}} +; + + + localparam route_cfg_t RouteCfg = '{ RouteAlgo: SourceRouting, + UseIdTable: 1'b1, + XYAddrOffsetX: 0, + XYAddrOffsetY: 0, + IdAddrOffset: 0, + NumSamRules: 6, + NumRoutes: 9, + CollectiveCfg: '{ OpCfg: '{ EnNarrowMulticast: 1'b0, + EnWideMulticast: 1'b0, + EnLsbAnd: 1'b0, + EnFpAdd: 1'b0, + EnFpMul: 1'b0, + EnFpMin: 1'b0, + EnFpMax: 1'b0, + EnIntAdd: 1'b0, + EnIntMul: 1'b0, + EnIntMinS: 1'b0, + EnIntMinU: 1'b0, + EnIntMaxS: 1'b0, + EnIntMaxU: 1'b0}, + NarrRedCfg: RedDefaultCfg, + WideRedCfg: RedDefaultCfg}}; + + + + typedef logic[31:0] axi_wide_in_addr_t; +typedef logic[255:0] axi_wide_in_data_t; +typedef logic[31:0] axi_wide_in_strb_t; +typedef logic[1:0] axi_wide_in_id_t; +typedef logic[0:0] axi_wide_in_user_t; +`AXI_TYPEDEF_ALL_CT(axi_wide_in, axi_wide_in_req_t, axi_wide_in_rsp_t, axi_wide_in_addr_t, axi_wide_in_id_t, axi_wide_in_data_t, axi_wide_in_strb_t, axi_wide_in_user_t) + + + typedef logic[31:0] axi_wide_out_addr_t; +typedef logic[255:0] axi_wide_out_data_t; +typedef logic[31:0] axi_wide_out_strb_t; +typedef logic[1:0] axi_wide_out_id_t; +typedef logic[0:0] axi_wide_out_user_t; +`AXI_TYPEDEF_ALL_CT(axi_wide_out, axi_wide_out_req_t, axi_wide_out_rsp_t, axi_wide_out_addr_t, axi_wide_out_id_t, axi_wide_out_data_t, axi_wide_out_strb_t, axi_wide_out_user_t) + + + + `FLOO_TYPEDEF_HDR_T(hdr_t, route_t, id_t, axi_ch_e, rob_idx_t) + localparam axi_cfg_t AxiCfg = '{ AddrWidth: 32, + DataWidth: 256, + InIdWidth: 2, + OutIdWidth: 2, + UserWidth: 1}; +`FLOO_TYPEDEF_AXI_CHAN_ALL(axi, req, rsp, axi_wide_in, AxiCfg, hdr_t) + +`FLOO_TYPEDEF_AXI_LINK_ALL(req, rsp, req, rsp) + + +endpackage diff --git a/hardware/src/axi_hier_interco.sv b/hardware/src/axi_hier_interco.sv new file mode 100644 index 0000000..b4ab001 --- /dev/null +++ b/hardware/src/axi_hier_interco.sv @@ -0,0 +1,322 @@ +// Copyright 2021 ETH Zurich and University of Bologna. 
+// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Author: Samuel Riedel + +// Implement a hierarchical AXI interconnect. Below shows one level of the interconnect. This module +// recursively instantiates itself and creates a tree of interconnects, each node with `Radix` slave +// ports. +// +// AXI Mux Read-only ID Width +// Cache Converter +// |‾╲ +// +-------->| ╲ +// | + +-------+ +-------+ +// +-------->| M | | | | | +// | U |---->| $ |---->| > |----> +// | X | | | | | +// | + +-------+ +-------+ +// +-------->| ╱ +// |_╱ +// Internal Cache +// Slave type type type Master type + +module axi_hier_interco + import cachepool_pkg::ro_cache_ctrl_t; +#( + parameter int unsigned NumSlvPorts = 0, + parameter int unsigned NumMstPorts = 0, + parameter int unsigned Radix = 2, + parameter int unsigned EnableCache = 0, + parameter int unsigned CacheLineWidth = 0, + parameter int unsigned CacheSizeByte = 0, + parameter int unsigned CacheSets = 0, + parameter int unsigned AddrWidth = 0, + parameter int unsigned DataWidth = 0, + parameter int unsigned SlvIdWidth = 0, + parameter int unsigned MstIdWidth = 0, + parameter int unsigned UserWidth = 0, + parameter type slv_req_t = logic, + parameter type slv_resp_t = logic, + parameter type mst_req_t = logic, + parameter type mst_resp_t = logic +) ( + input logic clk_i, + input logic rst_ni, + input logic test_i, + input ro_cache_ctrl_t ro_cache_ctrl_i, + input slv_req_t [NumSlvPorts-1:0] slv_req_i, + output slv_resp_t [NumSlvPorts-1:0] slv_resp_o, + output mst_req_t [NumMstPorts-1:0] mst_req_o, + input mst_resp_t [NumMstPorts-1:0] mst_resp_i +); + + //////////////// + // Typedefs // + //////////////// + + localparam int unsigned IntIdWidth = SlvIdWidth + $clog2(NumSlvPorts); + localparam int unsigned CacheIdWidth = EnableCache[0] ? 
IntIdWidth + 1: IntIdWidth; + localparam int unsigned NrAddrRules = cachepool_pkg::ROCacheNumAddrRules; + + typedef logic [AddrWidth-1:0] addr_t; + typedef logic [DataWidth-1:0] data_t; + typedef logic [DataWidth/8-1:0] strb_t; + typedef logic [SlvIdWidth-1:0] slv_id_t; + typedef logic [MstIdWidth-1:0] mst_id_t; + typedef logic [IntIdWidth-1:0] int_id_t; + typedef logic [CacheIdWidth-1:0] cache_id_t; + typedef logic [UserWidth-1:0] user_t; + + `include "axi/typedef.svh" + // Common AXI types + `AXI_TYPEDEF_W_CHAN_T(w_t, data_t, strb_t, user_t); + // Slave AXI types + `AXI_TYPEDEF_AW_CHAN_T(slv_aw_t, addr_t, slv_id_t, user_t); + `AXI_TYPEDEF_B_CHAN_T(slv_b_t, slv_id_t, user_t); + `AXI_TYPEDEF_AR_CHAN_T(slv_ar_t, addr_t, slv_id_t, user_t); + `AXI_TYPEDEF_R_CHAN_T(slv_r_t, data_t, slv_id_t, user_t); + // Intermediate AXI types + `AXI_TYPEDEF_AW_CHAN_T(int_aw_t, addr_t, int_id_t, user_t); + `AXI_TYPEDEF_B_CHAN_T(int_b_t, int_id_t, user_t); + `AXI_TYPEDEF_AR_CHAN_T(int_ar_t, addr_t, int_id_t, user_t); + `AXI_TYPEDEF_R_CHAN_T(int_r_t, data_t, int_id_t, user_t); + `AXI_TYPEDEF_REQ_T(int_req_t, int_aw_t, w_t, int_ar_t); + `AXI_TYPEDEF_RESP_T(int_resp_t, int_b_t, int_r_t ); + // Cache AXI types + `AXI_TYPEDEF_AW_CHAN_T(cache_aw_t, addr_t, cache_id_t, user_t); + `AXI_TYPEDEF_B_CHAN_T(cache_b_t, cache_id_t, user_t); + `AXI_TYPEDEF_AR_CHAN_T(cache_ar_t, addr_t, cache_id_t, user_t); + `AXI_TYPEDEF_R_CHAN_T(cache_r_t, data_t, cache_id_t, user_t); + `AXI_TYPEDEF_REQ_T(cache_req_t, cache_aw_t, w_t, cache_ar_t); + `AXI_TYPEDEF_RESP_T(cache_resp_t, cache_b_t, cache_r_t ); + + /////////////// + // Interco // + /////////////// + + // Recursive module to implement multiple hierarchy levels at once + + if (NumMstPorts > NumSlvPorts) begin : gen_error + $error("[axi_hier_interco] `NumMstPorts` must be bigger than `NumSlvPorts`."); + end else if (NumMstPorts == NumSlvPorts) begin : gen_top_level + // Top-level, connect the ports to the master ports + for (genvar i = 0; i < NumMstPorts; 
i++) begin : gen_bypasses + assign mst_req_o[i] = slv_req_i[i]; + assign slv_resp_o[i] = mst_resp_i[i]; + end + end else if (Radix <= 1) begin : gen_error + $error("[axi_hier_interco] `Radix` must be bigger than 1."); + end else if (NumSlvPorts > Radix) begin : gen_axi_level_recursive + // More than one level missing. --> Recursively call this module + // This level will contain `NumMuxes` interconnects + localparam int unsigned NumMuxes = NumSlvPorts / Radix; + if (NumMuxes * Radix != NumSlvPorts) begin : gen_error + $error("[axi_hier_interco] `NumSlvPorts` mod `Radix` must be 0."); + end else begin : gen_level + slv_req_t [NumMuxes-1:0] int_req; + slv_resp_t [NumMuxes-1:0] int_resp; + + for (genvar i = 0; i < NumMuxes; i++) begin : gen_lower_level + axi_hier_interco #( + .NumSlvPorts (Radix ), + .NumMstPorts (1 ), + .Radix (Radix ), + .EnableCache (EnableCache ), + .CacheLineWidth (CacheLineWidth), + .CacheSizeByte (CacheSizeByte ), + .CacheSets (CacheSets ), + .AddrWidth (AddrWidth ), + .DataWidth (DataWidth ), + .SlvIdWidth (SlvIdWidth ), + .MstIdWidth (SlvIdWidth ), + .UserWidth (UserWidth ), + .slv_req_t (slv_req_t ), + .slv_resp_t (slv_resp_t ), + .mst_req_t (slv_req_t ), + .mst_resp_t (slv_resp_t ) + ) i_axi_interco ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (test_i ), + .ro_cache_ctrl_i (ro_cache_ctrl_i ), + .slv_req_i (slv_req_i[i*Radix +: Radix] ), + .slv_resp_o (slv_resp_o[i*Radix +: Radix]), + .mst_req_o (int_req[i] ), + .mst_resp_i (int_resp[i] ) + ); + end + + axi_hier_interco #( + .NumSlvPorts (NumMuxes ), + .NumMstPorts (NumMstPorts ), + .Radix (Radix ), + .EnableCache (EnableCache>>1), + .CacheLineWidth (CacheLineWidth), + .CacheSizeByte (CacheSizeByte ), + .CacheSets (CacheSets ), + .AddrWidth (AddrWidth ), + .DataWidth (DataWidth ), + .SlvIdWidth (SlvIdWidth ), + .MstIdWidth (MstIdWidth ), + .UserWidth (UserWidth ), + .slv_req_t (slv_req_t ), + .slv_resp_t (slv_resp_t ), + .mst_req_t (mst_req_t ), + .mst_resp_t (mst_resp_t ) + ) 
i_axi_interco ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (test_i ), + .ro_cache_ctrl_i (ro_cache_ctrl_i), + .slv_req_i (int_req ), + .slv_resp_o (int_resp ), + .mst_req_o (mst_req_o ), + .mst_resp_i (mst_resp_i ) + ); + end + end else if (NumSlvPorts <= Radix && NumMstPorts == 1) begin : gen_bottom_level + + // Intermediate AXI channel + int_req_t int_req; + int_resp_t int_resp; + cache_req_t cache_req; + cache_resp_t cache_resp; + + axi_mux #( + // AXI parameter and channel types + .SlvAxiIDWidth (SlvIdWidth ), // AXI ID width, slave ports + .slv_aw_chan_t (slv_aw_t ), // AW Channel Type, slave ports + .mst_aw_chan_t (int_aw_t ), // AW Channel Type, master port + .w_chan_t (w_t ), // W Channel Type, all ports + .slv_b_chan_t (slv_b_t ), // B Channel Type, slave ports + .mst_b_chan_t (int_b_t ), // B Channel Type, master port + .slv_ar_chan_t (slv_ar_t ), // AR Channel Type, slave ports + .mst_ar_chan_t (int_ar_t ), // AR Channel Type, master port + .slv_r_chan_t (slv_r_t ), // R Channel Type, slave ports + .mst_r_chan_t (int_r_t ), // R Channel Type, master port + .slv_req_t (slv_req_t ), // Slave port request type + .slv_resp_t (slv_resp_t ), // Slave port response type + .mst_req_t (int_req_t ), // Master ports request type + .mst_resp_t (int_resp_t ), // Master ports response type + .NoSlvPorts (NumSlvPorts), // Number of slave ports + // Maximum number of outstanding transactions per write + .MaxWTrans (8 ), + // If enabled, this multiplexer is purely combinatorial + .FallThrough (1'b0 ), + // add spill register on write master ports, adds a cycle latency on write channels + .SpillAw (1'b1 ), + .SpillW (1'b1 ), + .SpillB (1'b1 ), + // add spill register on read master ports, adds a cycle latency on read channels + .SpillAr (1'b1 ), + .SpillR (1'b1 ) + ) i_axi_mux ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (test_i ), + .slv_reqs_i (slv_req_i ), + .slv_resps_o (slv_resp_o), + .mst_req_o (int_req ), + .mst_resp_i (int_resp ) + ); + + if 
(EnableCache[0]) begin: gen_ro_cache + localparam int unsigned LineCount = CacheSizeByte/(CacheSets*CacheLineWidth/8); + snitch_read_only_cache #( + .LineWidth (CacheLineWidth), + .LineCount (LineCount ), + .WayCount (CacheSets ), + .AxiAddrWidth (AddrWidth ), + .AxiDataWidth (DataWidth ), + .AxiIdWidth (IntIdWidth ), + .AxiUserWidth (UserWidth ), + .MaxTrans (32'd16 ), + .NrAddrRules (NrAddrRules ), + .SerialLookup (0 ), + .slv_req_t (int_req_t ), + .slv_rsp_t (int_resp_t ), + .mst_req_t (cache_req_t ), + .mst_rsp_t (cache_resp_t ) + ) i_snitch_read_only_cache ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .enable_i (ro_cache_ctrl_i.enable ), + .flush_valid_i (ro_cache_ctrl_i.flush_valid), + .flush_ready_o (/* unused */ ), + .icache_events_o (/* unused */ ), + .start_addr_i (ro_cache_ctrl_i.start_addr ), + .end_addr_i (ro_cache_ctrl_i.end_addr ), + .axi_slv_req_i (int_req ), + .axi_slv_rsp_o (int_resp ), + .axi_mst_req_o (cache_req ), + .axi_mst_rsp_i (cache_resp ), + .sram_cfg_data_i ('0 ), + .sram_cfg_tag_i ('0 ), + .sram_cfg_out_data_o (/* unused */ ), + .sram_cfg_out_tag_o (/* unused */ ) + ); + end else begin: gen_no_ro_cache + assign cache_req = int_req; + assign int_resp = cache_resp; + end + + axi_id_remap #( + .AxiSlvPortIdWidth (CacheIdWidth ), + .AxiSlvPortMaxUniqIds (2**MstIdWidth), + .AxiMaxTxnsPerId (8 ), + .AxiMstPortIdWidth (MstIdWidth ), + .slv_req_t (cache_req_t ), + .slv_resp_t (cache_resp_t ), + .mst_req_t (mst_req_t ), + .mst_resp_t (mst_resp_t ) + ) i_axi_id_remap ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (cache_req ), + .slv_resp_o (cache_resp), + .mst_req_o (mst_req_o ), + .mst_resp_i (mst_resp_i) + ); + + // Check all the AXI widths + if ($bits(slv_req_i[0].aw.addr) != AddrWidth) + $error("[axi_hier_interco] `slv_req_i.aw.addr` does not match AddrWidth."); + if ($bits(slv_req_i[0].w.data) != DataWidth) + $error("[axi_hier_interco] `slv_req_i.w.data` does not match DataWidth."); + if ($bits(slv_req_i[0].aw.id) != SlvIdWidth) + 
$error("[axi_hier_interco] `slv_req_i.aw.id` does not match SlvIdWidth."); + if ($bits(slv_req_i[0].aw.user) != UserWidth) + $error("[axi_hier_interco] `slv_req_i.aw.user` does not match UserWidth."); + + if ($bits(mst_req_o[0].aw.addr) != AddrWidth) + $error("[axi_hier_interco] `mst_req_o.aw.addr` does not match AddrWidth."); + if ($bits(mst_req_o[0].w.data) != DataWidth) + $error("[axi_hier_interco] `mst_req_o.w.data` does not match DataWidth."); + if ($bits(mst_req_o[0].aw.id) != MstIdWidth) + $error("[axi_hier_interco] `mst_req_o.aw.id` does not match MstIdWidth."); + if ($bits(mst_req_o[0].aw.user) != UserWidth) + $error("[axi_hier_interco] `mst_req_o.aw.user` does not match UserWidth."); + + if ($bits(int_req.aw.addr) != AddrWidth) + $error("[axi_hier_interco] `int_req.aw.addr` does not match AddrWidth."); + if ($bits(int_req.w.data) != DataWidth) + $error("[axi_hier_interco] `int_req.w.data` does not match DataWidth."); + if ($bits(int_req.aw.id) != IntIdWidth) + $error("[axi_hier_interco] `int_req.aw.id` does not match IntIdWidth."); + if ($bits(int_req.aw.user) != UserWidth) + $error("[axi_hier_interco] `int_req.aw.user` does not match UserWidth."); + + if ($bits(cache_req.aw.addr) != AddrWidth) + $error("[axi_hier_interco] `cache_req.aw.addr` does not match AddrWidth."); + if ($bits(cache_req.w.data) != DataWidth) + $error("[axi_hier_interco] `cache_req.w.data` does not match DataWidth."); + if ($bits(cache_req.aw.id) != CacheIdWidth) + $error("[axi_hier_interco] `cache_req.aw.id` does not match CacheIdWidth."); + if ($bits(cache_req.aw.user) != UserWidth) + $error("[axi_hier_interco] `cache_req.aw.user` does not match UserWidth."); + end else begin: gen_error + $error("[axi_hier_interco] Cannot build a tree with those parameters."); + end +endmodule diff --git a/hardware/src/cachepool_cc.sv b/hardware/src/cachepool_cc.sv index 86c8d7e..1950cd6 100644 --- a/hardware/src/cachepool_cc.sv +++ b/hardware/src/cachepool_cc.sv @@ -23,12 +23,6 @@ module 
cachepool_cc parameter int unsigned DataWidth = 0, /// User width of the buses. parameter int unsigned UserWidth = 0, - /// Data width of the AXI DMA buses. - parameter int unsigned DMADataWidth = 0, - /// Id width of the AXI DMA bus. - parameter int unsigned DMAIdWidth = 0, - parameter int unsigned DMAAxiReqFifoDepth = 0, - parameter int unsigned DMAReqFifoDepth = 0, parameter int unsigned SpmStackDepth = 512, /// Data port request type. @@ -75,7 +69,6 @@ module cachepool_cc parameter bit XF16ALT = 0, parameter bit XF8ALT = 0, /// Enable Snitch DMA - parameter bit Xdma = 0, parameter int unsigned NumIntOutstandingLoads = 0, parameter int unsigned NumIntOutstandingMem = 0, parameter int unsigned NumSpatzOutstandingLoads = 0, @@ -176,7 +169,7 @@ module cachepool_cc .VMSupport (1'b0 ), .RVE (RVE ), .FP_EN (FPEn ), - .Xdma (Xdma ), + .Xdma (1'b0 ), .RVF (RVF ), .RVD (RVD ), .RVV (RVV ), diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv index df687a3..c9102ef 100644 --- a/hardware/src/cachepool_cluster.sv +++ b/hardware/src/cachepool_cluster.sv @@ -4,21 +4,11 @@ // Author: Diyou Shen -`include "axi/assign.svh" `include "axi/typedef.svh" -`include "common_cells/assertions.svh" -`include "common_cells/registers.svh" -`include "mem_interface/assign.svh" -`include "mem_interface/typedef.svh" -`include "register_interface//assign.svh" `include "register_interface/typedef.svh" -`include "reqrsp_interface/assign.svh" -`include "reqrsp_interface/typedef.svh" -`include "snitch_vm/typedef.svh" -`include "tcdm_interface/assign.svh" -`include "tcdm_interface/typedef.svh" -/// A single-tile cluster implementation for CachePool +/// CachePool cluster: instantiates NumGroups groups connected via FlooNoC mesh, +/// with shared L2 memory and peripheral fabric. module cachepool_cluster import cachepool_pkg::*; import spatz_pkg::*; @@ -47,10 +37,6 @@ module cachepool_cluster parameter int unsigned ClusterPeriphSize = 64, /// Number of TCDM Banks. 
parameter int unsigned NrBanks = 2 * NrCores, - /// Size of DMA AXI buffer. - parameter int unsigned DMAAxiReqFifoDepth = 3, - /// Size of DMA request fifo. - parameter int unsigned DMAReqFifoDepth = 3, /// Width of a single icache line. parameter unsigned ICacheLineWidth = 0, /// Number of icache lines per set. @@ -65,8 +51,6 @@ module cachepool_cluster /// Spatz FPU/IPU Configuration parameter int unsigned NumSpatzFPUs = 4, parameter int unsigned NumSpatzIPUs = 1, - /// Per-core enabling of the custom `Xdma` ISA extensions. - parameter bit [NrCores-1:0] Xdma = '{default: '0}, /// # Per-core parameters /// Per-core integer outstanding loads parameter int unsigned NumIntOutstandingLoads = 0, @@ -115,20 +99,20 @@ module cachepool_cluster input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic [NrCores-1:0] debug_req_i, + input logic debug_req_i, /// End of Computing indicator to notify the host/tb output logic [3:0] eoc_o, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. - input logic [NrCores-1:0] meip_i, + input logic meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] mtip_i, + input logic mtip_i, /// Core software interrupt pending. Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] msip_i, + input logic msip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. 
@@ -157,39 +141,16 @@ module cachepool_cluster // Imports // --------- import snitch_pkg::*; - import snitch_icache_pkg::icache_events_t; // --------- // Constants // --------- - /// Minimum width to hold the core number. - localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores); - - // Enlarge the address width for Spatz due to cache - localparam int unsigned TCDMAddrWidth = 32; - - // Core Request, SoC Request - localparam int unsigned NrNarrowMasters = 2; - localparam int unsigned WideIdWidthOut = AxiIdWidthOut; - localparam int unsigned WideIdWidthIn = WideIdWidthOut - $clog2(NumClusterMst); - - // Cache XBar configuration struct - localparam axi_pkg::xbar_cfg_t CacheXbarCfg = '{ - NoSlvPorts : NumClusterMst*NumTiles, - NoMstPorts : ClusterWideOutAxiPorts, - MaxMstTrans : MaxMstTrans, - MaxSlvTrans : MaxSlvTrans, - FallThrough : 1'b0, - LatencyMode : XbarLatency, - AxiIdWidthSlvPorts: WideIdWidthIn, - AxiIdUsedSlvPorts : WideIdWidthIn, - UniqueIds : 1'b0, - AxiAddrWidth : AxiAddrWidth, - AxiDataWidth : AxiDataWidth, - NoAddrRules : ClusterWideOutAxiPorts - 1, - default : '0 - }; + localparam int unsigned WideIdWidthIn = WideIdWidthOut - ClusterRouteIdWidth - GroupMuxIdBits; + + // Pre-mux AXI ID width: per-group reqrsp_to_axi output. + // The multi-group axi_mux adds GroupMuxIdBits on top to reach WideIdWidthOut. 
+ localparam int unsigned WideIdWidthPreMux = WideIdWidthOut - GroupMuxIdBits; // -------- // Typedefs @@ -201,37 +162,39 @@ module cachepool_cluster typedef logic [WideIdWidthOut-1:0] id_cache_slv_t; typedef logic [AxiUserWidth-1:0] user_cache_t; - `AXI_TYPEDEF_ALL(axi_mst_cache, addr_t, id_cache_mst_t, data_cache_t, strb_cache_t, user_cache_t) - `AXI_TYPEDEF_ALL(axi_slv_cache, addr_t, id_cache_slv_t, data_cache_t, strb_cache_t, user_cache_t) - - `REG_BUS_TYPEDEF_ALL(reg_cache, addr_t, data_cache_t, strb_cache_t) + // reqrsp_to_axi output type: full GroupAxiIdOutWidth-bit IDs (decoupled from WideIdWidthPreMux + // which now equals WideRefillIdWidth after per-group ID remapping). + typedef logic [GroupAxiIdOutWidth-1:0] id_cache_premux_t; + // Remapper output / mux slave input type: bounded WideRefillIdWidth-bit IDs. + typedef logic [WideIdWidthPreMux-1:0] id_cache_remap_t; - typedef struct packed { - int unsigned idx; - addr_t start_addr; - addr_t end_addr; - } xbar_rule_t; - - `SNITCH_VM_TYPEDEF(AxiAddrWidth) + `AXI_TYPEDEF_ALL(axi_mst_cache, addr_t, id_cache_mst_t, data_cache_t, strb_cache_t, user_cache_t) + // Post-mux AXI types (same as before — used for axi_cut and output). + `AXI_TYPEDEF_ALL(axi_slv_cache, addr_t, id_cache_slv_t, data_cache_t, strb_cache_t, user_cache_t) + // reqrsp_to_axi output AXI types (full GroupAxiIdOutWidth-bit IDs). + `AXI_TYPEDEF_ALL(axi_premux_cache, addr_t, id_cache_premux_t, data_cache_t, strb_cache_t, user_cache_t) + // Remapped AXI types: WideRefillIdWidth-bit IDs, fed into the inter-group mux / future NoC. + `AXI_TYPEDEF_ALL(axi_remap_cache, addr_t, id_cache_remap_t, data_cache_t, strb_cache_t, user_cache_t) // ---------------- // Wire Definitions // ---------------- // 1. 
AXI - axi_mst_cache_req_t [NumTiles-1:0][TileNarrowAxiPorts-1:0] axi_tile_req; - axi_mst_cache_resp_t [NumTiles-1:0][TileNarrowAxiPorts-1:0] axi_tile_rsp; - axi_slv_cache_req_t [ClusterWideOutAxiPorts-1 :0] wide_axi_slv_req; - axi_slv_cache_resp_t [ClusterWideOutAxiPorts-1 :0] wide_axi_slv_rsp; - axi_narrow_req_t [NumTiles-1:0][1:0] axi_out_req; - axi_narrow_resp_t [NumTiles-1:0][1:0] axi_out_resp; - - // 2. BootROM - reg_cache_req_t [NumTiles-1:0] bootrom_reg_req; - reg_cache_rsp_t [NumTiles-1:0] bootrom_reg_rsp; + // Post-mux wide AXI (one per L2 channel, merged across groups). + axi_slv_cache_req_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_req; + axi_slv_cache_resp_t [ClusterWideOutAxiPorts-1:0] wide_axi_slv_rsp; + // Per-group pre-mux wide AXI (per group, per L2 channel): full GroupAxiIdOutWidth-bit IDs. + axi_premux_cache_req_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] wide_axi_premux_req; + axi_premux_cache_resp_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] wide_axi_premux_rsp; + // Per-group remapped wide AXI: WideRefillIdWidth-bit IDs, fed into the inter-group mux. + axi_remap_cache_req_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] wide_axi_remap_req; + axi_remap_cache_resp_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] wide_axi_remap_rsp; + // Narrow AXI per tile (UART + Periph). + axi_narrow_req_t [NumTiles-1:0][1:0] axi_out_req; + axi_narrow_resp_t [NumTiles-1:0][1:0] axi_out_resp; // 3. Peripherals axi_addr_t private_start_addr; - icache_events_t [NrCores-1:0] icache_events; logic icache_prefetch_enable; logic [NrCores-1:0] cl_interrupt; logic [$clog2(L1AddrWidth)-1:0] dynamic_offset; @@ -241,503 +204,316 @@ module cachepool_cluster logic [NumTiles-1:0] l1d_insn_ready; logic [NumTiles-1:0] l1d_busy; + // Per-group error signals. 
+ logic [NumGroups-1:0] group_error; + + // Inter-group NoC mesh signals (indexed by group, then direction, then port) + noc_group_req_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_out; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_out_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_out_ready; + noc_group_req_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_in; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_in_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_in_ready; + noc_group_rsp_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_out; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_out_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_out_ready; + noc_group_rsp_t [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_in; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_in_valid; + logic [NumGroups-1:0][3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_in_ready; + // --------------- - // CachePool Tile + // CachePool Group // --------------- - cache_trans_req_t [NumL1CacheCtrl-1 :0] cache_refill_req; - cache_trans_rsp_t [NumL1CacheCtrl-1 :0] cache_refill_rsp; - - cache_trans_req_t [NumTiles-1 :0] cache_core_req; - cache_trans_rsp_t [NumTiles-1 :0] cache_core_rsp; - - cache_trans_req_chan_t [NumTiles*NumClusterMst-1 :0] tile_req_chan; - cache_trans_rsp_chan_t [NumTiles*NumClusterMst-1 :0] tile_rsp_chan; - logic [NumTiles*NumClusterMst-1 :0] tile_req_valid, tile_req_ready, tile_rsp_valid, tile_rsp_ready; - - l2_req_t [ClusterWideOutAxiPorts-1 :0] l2_req; - l2_rsp_t [ClusterWideOutAxiPorts-1 :0] l2_rsp; - - cache_trans_req_chan_t [ClusterWideOutAxiPorts-1 :0] l2_req_chan; - cache_trans_rsp_chan_t [ClusterWideOutAxiPorts-1 :0] l2_rsp_chan; - logic 
[ClusterWideOutAxiPorts-1 :0] l2_req_valid, l2_req_ready , l2_rsp_valid, l2_rsp_ready; - - typedef logic [$clog2(NumClusterMst*NumTiles)-1:0] l2_sel_t; - // one more bit for out-of-range alert - typedef logic [$clog2(ClusterWideOutAxiPorts) :0] tile_sel_err_t; - typedef logic [$clog2(ClusterWideOutAxiPorts)-1:0] tile_sel_t; - - // Which l2 we want to select for each req - tile_sel_err_t [NumTiles*NumClusterMst-1 :0] tile_sel_err; - tile_sel_t [NumTiles*NumClusterMst-1 :0] tile_sel; - // Which tile we selected for each req - l2_sel_t [ClusterWideOutAxiPorts-1 :0] tile_selected; - // which tile we want to select for each rsp - l2_sel_t [ClusterWideOutAxiPorts-1 :0] l2_sel; - // What is the priority for response wiring? - // Here we want to make sure the responses from one burst - // continues until done - // If the rsp is a burst with blen != 0, then we will keep - // the rr same, until got a burst rsp with blen == 0 - tile_sel_t [NumTiles*NumClusterMst-1 :0] l2_rsp_rr; - - logic [NumTiles*NumClusterMst-1 :0] rr_lock_d, rr_lock_q; - tile_sel_t [NumTiles*NumClusterMst-1 :0] l2_prio_d, l2_prio_q; - - - l2_sel_t [ClusterWideOutAxiPorts-1:0] port_id; - - for (genvar i = 0; i < ClusterWideOutAxiPorts; i ++) begin - assign port_id[i] = l2_rsp[i].p.user.tile_id * NumClusterMst + l2_rsp[i].p.user.bank_id; - end - - - if (Burst_Enable) begin : gen_burst_ext_sel - `FF(rr_lock_q, rr_lock_d, 1'b0) - `FF(l2_prio_q, l2_prio_d, 1'b0) - - for (genvar port = 0; port < NumTiles*NumClusterMst; port ++) begin : gen_rsp_rr - tile_sel_t l2_rr; - logic [ClusterWideOutAxiPorts-1:0] arb_valid; - for (genvar i = 0; i < ClusterWideOutAxiPorts; i ++) begin - // Used to check the round-robin selection - assign arb_valid[i] = (port_id[i] == port) & l2_rsp_valid[i]; - end - - always_comb begin - l2_prio_d[port] = l2_prio_q[port]; - rr_lock_d[port] = rr_lock_q[port]; - - // Determine the priority we give - // round-robin or locked to previous value? 
- if (|arb_valid) begin - if (rr_lock_q[port]) begin - // rr is locked because of burst - l2_prio_d[port] = l2_prio_q[port]; - end else begin - l2_prio_d[port] = l2_rr; - end - end - // assigned to xbar rr_i - l2_rsp_rr[port] = l2_prio_d[port]; - - // Lock judgement - // Should it work on the l2_rsp instead of tile_rsp? - if (tile_rsp_chan[port].user.burst.is_burst & |arb_valid) begin - // We got a burst response - if (tile_rsp_chan[port].user.burst.burst_len == 0) begin - // this is the last transaction within a burt, remove lock - rr_lock_d[port] = 1'b0; - end else begin - // the burst response is not finished yet, lock the rr - rr_lock_d[port] = 1'b1; - end - end - end - - // We use the rr_arb_tree to get the round-robin selection - // No data is needed here, only need the handshaking - rr_arb_tree #( - .NumIn ( ClusterWideOutAxiPorts ), - .DataType ( logic ), - .ExtPrio ( 1'b0 ), - .AxiVldRdy ( 1'b1 ), - .LockIn ( 1'b1 ) - ) i_rr_arb_tree ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .flush_i ( '0 ), - .rr_i ( '0 ), - .req_i ( arb_valid ), - .gnt_o ( /*not used*/ ), - .data_i ( '0 ), - .req_o ( /*not used*/ ), - .gnt_i ( tile_rsp_ready[port] ), - .data_o ( /*not used*/ ), - .idx_o ( l2_rr ) + // Per-group L2 reqrsp ports (one per L2 channel per group). 
+ l2_req_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] l2_req; + l2_rsp_t [NumGroups-1:0][ClusterWideOutAxiPorts-1:0] l2_rsp; + + assign error_o = |group_error; + + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_group_y + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_group_x + // Flat group index: g = gy * NumGroupsX + gx + localparam int unsigned g = gy * NumGroupsX + gx; + cachepool_group_noc_wrapper #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( WideIdWidthIn ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NumCoreGroup ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks / NumGroups ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_mst_cache_req_t ), + .axi_out_resp_t ( axi_mst_cache_resp_t ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_group ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( group_error[g] ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id_i + 10'(g * 
NumCoreGroup) ), + .tile_base_id_i ( TileIDWidth'(g * NumTilesPerGroup) ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .private_start_addr_i ( private_start_addr ), + .axi_narrow_req_o ( axi_out_req [g*NumTilesPerGroup +: NumTilesPerGroup] ), + .axi_narrow_rsp_i ( axi_out_resp[g*NumTilesPerGroup +: NumTilesPerGroup] ), + // DRAM refill reqrsp (post-xbar, one per L2 channel) + .l2_req_o ( l2_req[g] ), + .l2_rsp_i ( l2_rsp[g] ), + // Peripherals + .icache_events_o ( /* unused */ ), + .icache_prefetch_enable_i ( icache_prefetch_enable ), + .cl_interrupt_i ( cl_interrupt [g*NumCoreGroup +: NumCoreGroup] ), + .dynamic_offset_i ( dynamic_offset ), + .l1d_private_i ( l1d_private ), + .l1d_insn_i ( l1d_insn ), + .l1d_insn_valid_i ( l1d_insn_valid ), + .l1d_insn_ready_o ( l1d_insn_ready[g*NumTilesPerGroup +: NumTilesPerGroup]), + .l1d_busy_i ( l1d_busy [g*NumTilesPerGroup +: NumTilesPerGroup]), + .group_xy_id_i ( group_xy_id_t'{x: gx, + y: gy, + port_id: 1'b0} ), + .noc_req_o ( noc_req_out [g] ), + .noc_req_valid_o ( noc_req_out_valid[g] ), + .noc_req_ready_i ( noc_req_out_ready[g] ), + .noc_req_i ( noc_req_in [g] ), + .noc_req_valid_i ( noc_req_in_valid [g] ), + .noc_req_ready_o ( noc_req_in_ready [g] ), + .noc_rsp_o ( noc_rsp_out [g] ), + .noc_rsp_valid_o ( noc_rsp_out_valid[g] ), + .noc_rsp_ready_i ( noc_rsp_out_ready[g] ), + .noc_rsp_i ( noc_rsp_in [g] ), + .noc_rsp_valid_i ( noc_rsp_in_valid [g] ), + .noc_rsp_ready_o ( noc_rsp_in_ready [g] ) ); end - end else begin - assign l2_prio_d = '0; - assign l2_prio_q = '0; - assign rr_lock_d = '0; - assign rr_lock_q = '0; - assign l2_rsp_rr = '0; end - if (NumTiles > 1) begin : gen_group - cachepool_group #( - .AxiAddrWidth ( AxiAddrWidth ), - .AxiDataWidth ( AxiDataWidth ), - .AxiIdWidthIn ( AxiIdWidthIn ), - .AxiIdWidthOut ( WideIdWidthIn ), - .AxiUserWidth ( AxiUserWidth ), - .BootAddr ( BootAddr ), - .UartAddr ( UartAddr ), - .ClusterPeriphSize ( ClusterPeriphSize ), - .NrCores ( NrCores ), - .TCDMDepth ( TCDMDepth ), 
- .NrBanks ( NrBanks ), - .ICacheLineWidth ( ICacheLineWidth ), - .ICacheLineCount ( ICacheLineCount ), - .ICacheSets ( ICacheSets ), - .FPUImplementation ( FPUImplementation ), - .NumSpatzFPUs ( NumSpatzFPUs ), - .NumSpatzIPUs ( NumSpatzIPUs ), - .SnitchPMACfg ( SnitchPMACfg ), - .NumIntOutstandingLoads ( NumIntOutstandingLoads ), - .NumIntOutstandingMem ( NumIntOutstandingMem ), - .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), - .axi_in_req_t ( axi_in_req_t ), - .axi_in_resp_t ( axi_in_resp_t ), - .axi_narrow_req_t ( axi_narrow_req_t ), - .axi_narrow_resp_t ( axi_narrow_resp_t ), - .axi_out_req_t ( axi_mst_cache_req_t ), - .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), - .RegisterOffloadRsp ( RegisterOffloadRsp ), - .RegisterCoreReq ( RegisterCoreReq ), - .RegisterCoreRsp ( RegisterCoreRsp ), - .RegisterTCDMCuts ( RegisterTCDMCuts ), - .RegisterExt ( RegisterExt ), - .XbarLatency ( XbarLatency ), - .MaxMstTrans ( MaxMstTrans ), - .MaxSlvTrans ( MaxSlvTrans ) - ) i_group ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .impl_i ( impl_i ), - .error_o ( error_o ), - .debug_req_i ( debug_req_i ), - .meip_i ( meip_i ), - .mtip_i ( mtip_i ), - .msip_i ( msip_i ), - .hart_base_id_i ( hart_base_id_i ), - .cluster_base_addr_i ( cluster_base_addr_i ), - .private_start_addr_i ( private_start_addr ), - .axi_narrow_req_o ( axi_out_req ), - .axi_narrow_rsp_i ( axi_out_resp ), - .axi_wide_req_o ( axi_tile_req ), - .axi_wide_rsp_i ( axi_tile_rsp ), - // Cache Refill Ports - .cache_refill_req_o ( cache_refill_req ), - .cache_refill_rsp_i ( cache_refill_rsp ), - // Peripherals - .icache_events_o ( icache_events ), - .icache_prefetch_enable_i ( icache_prefetch_enable ), - .cl_interrupt_i ( cl_interrupt ), - .dynamic_offset_i ( dynamic_offset ), - .l1d_private_i ( l1d_private ), - .l1d_insn_i ( l1d_insn ), - .l1d_insn_valid_i ( l1d_insn_valid ), - .l1d_insn_ready_o ( 
l1d_insn_ready ), - .l1d_busy_i ( l1d_busy ) - ); - // TODO: 2 axi ports converted lost correct assignments - // 1. tile id? - // 2. mux then convert? - for (genvar t = 0; t < NumTiles; t ++) begin : gen_axi_converter - axi_to_reqrsp #( - .axi_req_t ( axi_mst_cache_req_t ), - .axi_rsp_t ( axi_mst_cache_resp_t ), - .AddrWidth ( AxiAddrWidth ), - .DataWidth ( AxiDataWidth ), - .UserWidth ( $bits(refill_user_t) ), - .IdWidth ( AxiIdWidthIn ), - .BufDepth ( NumSpatzOutstandingLoads ), - .reqrsp_req_t ( cache_trans_req_t ), - .reqrsp_rsp_t ( cache_trans_rsp_t ) - ) i_axi2reqrsp ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .busy_o ( ), - .axi_req_i ( axi_tile_req [t][TileMem] ), - .axi_rsp_o ( axi_tile_rsp [t][TileMem] ), - .reqrsp_req_o ( cache_core_req[t] ), - .reqrsp_rsp_i ( cache_core_rsp[t] ) - ); + // ---------------------------- + // Inter-group NoC mesh wiring + // ---------------------------- + + // East-West (horizontal) interior connections + for (genvar gx = 0; gx < NumGroupsX-1; gx++) begin : gen_ew_conn + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_ew_conn_y + // East output of (gx,gy) → West input of (gx+1,gy) + assign noc_req_in [gx+1 + gy*NumGroupsX][3] = noc_req_out [gx + gy*NumGroupsX][1]; + assign noc_req_in_valid[gx+1 + gy*NumGroupsX][3] = noc_req_out_valid[gx + gy*NumGroupsX][1]; + assign noc_req_out_ready[gx + gy*NumGroupsX][1] = noc_req_in_ready [gx+1 + gy*NumGroupsX][3]; + assign noc_rsp_in [gx+1 + gy*NumGroupsX][3] = noc_rsp_out [gx + gy*NumGroupsX][1]; + assign noc_rsp_in_valid[gx+1 + gy*NumGroupsX][3] = noc_rsp_out_valid[gx + gy*NumGroupsX][1]; + assign noc_rsp_out_ready[gx + gy*NumGroupsX][1] = noc_rsp_in_ready [gx+1 + gy*NumGroupsX][3]; + // West output of (gx+1,gy) → East input of (gx,gy) + assign noc_req_in [gx + gy*NumGroupsX][1] = noc_req_out [gx+1 + gy*NumGroupsX][3]; + assign noc_req_in_valid[gx + gy*NumGroupsX][1] = noc_req_out_valid[gx+1 + gy*NumGroupsX][3]; + assign noc_req_out_ready[gx+1 + gy*NumGroupsX][3] = 
noc_req_in_ready[gx + gy*NumGroupsX][1]; + assign noc_rsp_in [gx + gy*NumGroupsX][1] = noc_rsp_out [gx+1 + gy*NumGroupsX][3]; + assign noc_rsp_in_valid[gx + gy*NumGroupsX][1] = noc_rsp_out_valid[gx+1 + gy*NumGroupsX][3]; + assign noc_rsp_out_ready[gx+1 + gy*NumGroupsX][3] = noc_rsp_in_ready[gx + gy*NumGroupsX][1]; end - - end else begin : gen_tile - cachepool_tile #( - .AxiAddrWidth ( AxiAddrWidth ), - .AxiDataWidth ( AxiDataWidth ), - .AxiIdWidthIn ( AxiIdWidthIn ), - .AxiIdWidthOut ( WideIdWidthIn ), - .AxiUserWidth ( AxiUserWidth ), - .BootAddr ( BootAddr ), - .UartAddr ( UartAddr ), - .ClusterPeriphSize ( ClusterPeriphSize ), - .NrCores ( NrCores ), - .TCDMDepth ( TCDMDepth ), - .NrBanks ( NrBanks ), - .ICacheLineWidth ( ICacheLineWidth ), - .ICacheLineCount ( ICacheLineCount ), - .ICacheSets ( ICacheSets ), - .FPUImplementation ( FPUImplementation ), - .NumSpatzFPUs ( NumSpatzFPUs ), - .NumSpatzIPUs ( NumSpatzIPUs ), - .SnitchPMACfg ( SnitchPMACfg ), - .TileIDWidth ( 1 ), - .NumIntOutstandingLoads ( NumIntOutstandingLoads ), - .NumIntOutstandingMem ( NumIntOutstandingMem ), - .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), - .axi_in_req_t ( axi_in_req_t ), - .axi_in_resp_t ( axi_in_resp_t ), - .axi_narrow_req_t ( axi_narrow_req_t ), - .axi_narrow_resp_t ( axi_narrow_resp_t ), - .axi_out_req_t ( axi_mst_cache_req_t ), - .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), - .RegisterOffloadRsp ( RegisterOffloadRsp ), - .RegisterCoreReq ( RegisterCoreReq ), - .RegisterCoreRsp ( RegisterCoreRsp ), - .RegisterTCDMCuts ( RegisterTCDMCuts ), - .RegisterExt ( RegisterExt ), - .XbarLatency ( XbarLatency ), - .MaxMstTrans ( MaxMstTrans ), - .MaxSlvTrans ( MaxSlvTrans ) - ) i_tile ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .impl_i ( impl_i ), - .error_o ( error_o ), - .debug_req_i ( debug_req_i ), - .meip_i ( meip_i ), - .mtip_i ( mtip_i ), - .msip_i ( msip_i ), - 
.hart_base_id_i ( hart_base_id_i ), - .cluster_base_addr_i ( cluster_base_addr_i ), - .tile_id_i ( '0 ), - .private_start_addr_i ( private_start_addr ), - .axi_out_req_o ( axi_out_req [0] ), - .axi_out_resp_i ( axi_out_resp [0] ), - // Remote Ports (not used) - .remote_req_o ( ), - .remote_req_dst_o ( ), - .remote_rsp_i ( '0 ), - .remote_rsp_ready_i ( '0 ), - .remote_req_i ( '0 ), - .remote_rsp_o ( ), - .remote_rsp_ready_o ( ), - // Cache Refill Ports - .cache_refill_req_o ( cache_refill_req ), - .cache_refill_rsp_i ( cache_refill_rsp ), - .axi_wide_req_o ( axi_tile_req[0] ), - .axi_wide_rsp_i ( axi_tile_rsp[0] ), - // Peripherals - .icache_events_o ( icache_events ), - .icache_prefetch_enable_i ( icache_prefetch_enable ), - .cl_interrupt_i ( cl_interrupt ), - .dynamic_offset_i ( dynamic_offset ), - .l1d_private_i ( l1d_private ), - .l1d_insn_i ( l1d_insn ), - .l1d_insn_valid_i ( l1d_insn_valid ), - .l1d_insn_ready_o ( l1d_insn_ready ), - .l1d_busy_i ( l1d_busy ) - ); - - axi_to_reqrsp #( - .axi_req_t ( axi_mst_cache_req_t ), - .axi_rsp_t ( axi_mst_cache_resp_t ), - .AddrWidth ( AxiAddrWidth ), - .DataWidth ( AxiDataWidth ), - .UserWidth ( $bits(refill_user_t) ), - .IdWidth ( AxiIdWidthIn ), - .BufDepth ( NumSpatzOutstandingLoads ), - .reqrsp_req_t ( cache_trans_req_t ), - .reqrsp_rsp_t ( cache_trans_rsp_t ) - ) i_axi2reqrsp ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .busy_o ( ), - .axi_req_i ( axi_tile_req [0][TileMem] ), - .axi_rsp_o ( axi_tile_rsp [0][TileMem] ), - .reqrsp_req_o ( cache_core_req[0] ), - .reqrsp_rsp_i ( cache_core_rsp[0] ) - ); end - // Additional one port for iCache connection - localparam int unsigned ReqrspPortsTile = NumL1CtrlTile + 1; - always_comb begin - for (int t = 0; t < NumTiles; t++) begin - for (int p = 0; p < ReqrspPortsTile; p++) begin - automatic int unsigned xbar_idx = t*ReqrspPortsTile + p; - automatic int unsigned refill_idx = t*NumL1CtrlTile + p-1; - - if (p == 0) begin - // connect_icache_path - tile_req_chan [xbar_idx] = 
cache_core_req [t].q; - // Scrmable address - tile_req_chan [xbar_idx].addr = scrambleAddr(cache_core_req[t].q.addr); - tile_req_valid [xbar_idx] = cache_core_req [t].q_valid; - cache_core_rsp [t].q_ready = tile_req_ready [xbar_idx]; - - cache_core_rsp [t].p = tile_rsp_chan [xbar_idx]; - cache_core_rsp [t].p_valid = tile_rsp_valid [xbar_idx]; - tile_rsp_ready [xbar_idx] = cache_core_req [t].p_ready; - // Tile ID assignment - tile_req_chan [xbar_idx].user.tile_id = t; - end else begin - // connect_refill_path - tile_req_chan [xbar_idx] = cache_refill_req[refill_idx].q; - // Scramble address - tile_req_chan [xbar_idx].addr = scrambleAddr(cache_refill_req[refill_idx].q.addr); - tile_req_valid [xbar_idx] = cache_refill_req[refill_idx].q_valid; - cache_refill_rsp[refill_idx].q_ready = tile_req_ready [xbar_idx]; - - cache_refill_rsp[refill_idx].p = tile_rsp_chan [xbar_idx]; - cache_refill_rsp[refill_idx].p_valid = tile_rsp_valid [xbar_idx]; - tile_rsp_ready [xbar_idx] = cache_refill_req[refill_idx].p_ready; - // Tile ID assignment - tile_req_chan [xbar_idx].user.tile_id = t; - end - end + // North-South (vertical) interior connections + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_ns_conn + for (genvar gy = 0; gy < NumGroupsY-1; gy++) begin : gen_ns_conn_y + // North output of (gx,gy) (dir 0) → South input of (gx,gy+1) (dir 2) + assign noc_req_in [gx + (gy+1)*NumGroupsX][2] = noc_req_out [gx + gy*NumGroupsX][0]; + assign noc_req_in_valid[gx + (gy+1)*NumGroupsX][2] = noc_req_out_valid[gx + gy*NumGroupsX][0]; + assign noc_req_out_ready[gx + gy *NumGroupsX][0] = noc_req_in_ready[gx + (gy+1)*NumGroupsX][2]; + assign noc_rsp_in [gx + (gy+1)*NumGroupsX][2] = noc_rsp_out [gx + gy*NumGroupsX][0]; + assign noc_rsp_in_valid[gx + (gy+1)*NumGroupsX][2] = noc_rsp_out_valid[gx + gy*NumGroupsX][0]; + assign noc_rsp_out_ready[gx + gy *NumGroupsX][0] = noc_rsp_in_ready[gx + (gy+1)*NumGroupsX][2]; + // South output of (gx,gy+1) (dir 2) → North input of (gx,gy) (dir 0) + assign 
noc_req_in [gx + gy *NumGroupsX][0] = noc_req_out [gx + (gy+1)*NumGroupsX][2]; + assign noc_req_in_valid[gx + gy *NumGroupsX][0] = noc_req_out_valid[gx + (gy+1)*NumGroupsX][2]; + assign noc_req_out_ready[gx + (gy+1)*NumGroupsX][2] = noc_req_in_ready[gx + gy *NumGroupsX][0]; + assign noc_rsp_in [gx + gy *NumGroupsX][0] = noc_rsp_out [gx + (gy+1)*NumGroupsX][2]; + assign noc_rsp_in_valid[gx + gy *NumGroupsX][0] = noc_rsp_out_valid[gx + (gy+1)*NumGroupsX][2]; + assign noc_rsp_out_ready[gx + (gy+1)*NumGroupsX][2] = noc_rsp_in_ready[gx + gy *NumGroupsX][0]; end end - typedef struct packed { - int unsigned idx; - logic [AxiAddrWidth-1:0] base; - logic [AxiAddrWidth-1:0] mask; - } reqrsp_rule_t; - - reqrsp_rule_t [ClusterWideOutAxiPorts-1:0] xbar_rule; - - for (genvar i = 0; i < ClusterWideOutAxiPorts; i ++) begin - assign xbar_rule[i] = '{ - idx : i, - base : DramAddr + DramPerChSize * i, - mask : ({AxiAddrWidth{1'b1}} << $clog2(DramPerChSize)) - }; + // West boundary: gx=0 has no West neighbor (dir 3) + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_west_bnd + assign noc_req_in [gy*NumGroupsX][3] = '0; + assign noc_req_in_valid[gy*NumGroupsX][3] = '0; + assign noc_req_out_ready[gy*NumGroupsX][3] = '1; + assign noc_rsp_in [gy*NumGroupsX][3] = '0; + assign noc_rsp_in_valid[gy*NumGroupsX][3] = '0; + assign noc_rsp_out_ready[gy*NumGroupsX][3] = '1; end - logic [$clog2(ClusterWideOutAxiPorts):0] default_idx; - assign default_idx = ClusterWideOutAxiPorts; - - for (genvar inp = 0; inp < NumClusterMst*NumTiles; inp ++) begin : gen_xbar_sel - addr_decode_napot #( - .NoIndices (ClusterWideOutAxiPorts+1 ), - .NoRules (ClusterWideOutAxiPorts ), - .addr_t (axi_addr_t ), - .rule_t (reqrsp_rule_t ) - ) i_snitch_decode_napot ( - .addr_i (tile_req_chan[inp].addr), - .addr_map_i (xbar_rule ), - .idx_o (tile_sel_err[inp] ), - .dec_valid_o (/* Unused */ ), - .dec_error_o (/* Unused */ ), - .en_default_idx_i (1'b1 ), - .default_idx_i (default_idx ) - ); + // East boundary: 
gx=NumGroupsX-1 has no East neighbor (dir 1) + for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_east_bnd + assign noc_req_in [(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_req_in_valid[(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_req_out_ready[(NumGroupsX-1) + gy*NumGroupsX][1] = '1; + assign noc_rsp_in [(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_rsp_in_valid[(NumGroupsX-1) + gy*NumGroupsX][1] = '0; + assign noc_rsp_out_ready[(NumGroupsX-1) + gy*NumGroupsX][1] = '1; + end - assign tile_sel[inp] = tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)-1:0]; + // South boundary: gy=0 has no South neighbor (dir 2) + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_south_bnd + assign noc_req_in [gx][2] = '0; + assign noc_req_in_valid[gx][2] = '0; + assign noc_req_out_ready[gx][2] = '1; + assign noc_rsp_in [gx][2] = '0; + assign noc_rsp_in_valid[gx][2] = '0; + assign noc_rsp_out_ready[gx][2] = '1; + end -`ifndef TARGET_SYNTHESIS - // Alert the system that we have illegal memory access - IllegalMemAccess : assert property( - @(posedge clk_i) disable iff (!rst_ni) (tile_req_valid[inp] |-> !tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)])) - else $error("Visited illegal address: time=%0t, port=%0d, addr=0x%08h", $time, inp, tile_req_chan[inp].addr); - // else $fatal (1, "Visited address is not mapped"); -`endif + // North boundary: gy=NumGroupsY-1 has no North neighbor (dir 0) + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_north_bnd + assign noc_req_in [gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_req_in_valid[gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_req_out_ready[gx + (NumGroupsY-1)*NumGroupsX][0] = '1; + assign noc_rsp_in [gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_rsp_in_valid[gx + (NumGroupsY-1)*NumGroupsX][0] = '0; + assign noc_rsp_out_ready[gx + (NumGroupsY-1)*NumGroupsX][0] = '1; end - reqrsp_xbar #( - .NumInp (NumClusterMst*NumTiles ), - .NumOut (ClusterWideOutAxiPorts ), - .PipeReg 
(1'b1 ), - .ExtReqPrio (1'b0 ), - .ExtRspPrio (Burst_Enable ), - .tcdm_req_chan_t (cache_trans_req_chan_t ), - .tcdm_rsp_chan_t (cache_trans_rsp_chan_t ) - ) i_cluster_xbar ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .slv_req_i (tile_req_chan ), - .slv_req_valid_i (tile_req_valid ), - .slv_req_ready_o (tile_req_ready ), - .slv_rsp_o (tile_rsp_chan ), - .slv_rsp_valid_o (tile_rsp_valid ), - .slv_rsp_ready_i (tile_rsp_ready ), - .slv_sel_i (tile_sel[NumTiles*NumClusterMst-1:0] ), - .slv_rr_i ('0 ), - .slv_selected_o (tile_selected ), - .mst_req_o (l2_req_chan ), - .mst_req_valid_o (l2_req_valid ), - .mst_req_ready_i (l2_req_ready ), - .mst_rsp_i (l2_rsp_chan ), - .mst_rr_i (l2_rsp_rr ), - .mst_rsp_valid_i (l2_rsp_valid ), - .mst_rsp_ready_o (l2_rsp_ready ), - .mst_sel_i (l2_sel ) - ); + // ------------- + // To Main Memory: reqrsp_to_axi per group, then axi_mux across groups + // ------------- - for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin - // To L2 Channels - always_comb begin - l2_req[ch].q = '{ - addr : l2_req_chan[ch].addr, - write: l2_req_chan[ch].write, - amo : l2_req_chan[ch].amo, - data : l2_req_chan[ch].data, - strb : l2_req_chan[ch].strb, - size : l2_req_chan[ch].size, - default: '0 - }; - l2_req[ch].q.user = l2_req_chan[ch].user; - l2_req[ch].q_valid = l2_req_valid[ch] ; - l2_req_ready[ch] = l2_rsp[ch].q_ready; - - l2_rsp_chan [ch] = '{ - data : l2_rsp[ch].p.data, - error: l2_rsp[ch].p.error, - write: l2_rsp[ch].p.write, - default: '0 - }; - l2_rsp_chan [ch].user = l2_rsp[ch].p.user; - l2_rsp_valid[ch] = l2_rsp[ch].p_valid; - l2_req[ch].p_ready = l2_rsp_ready[ch]; - // calculate the port from the tile id and bank id - // bank_id == 0 --- bypass - // bank_id == 1-4 --- cache bank 0-3 - l2_sel[ch] = l2_rsp[ch].p.user.tile_id * NumClusterMst + l2_rsp[ch].p.user.bank_id; + // Step 1: Per-group reqrsp_to_axi conversion. 
+ for (genvar gy = 0; gy < NumGroupsY; gy++) begin : gen_per_group_l2 + for (genvar gx = 0; gx < NumGroupsX; gx++) begin : gen_per_group_l2 + localparam int unsigned g = gy * NumGroupsX + gx; + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_per_ch + reqrsp_to_axi #( + .MaxTrans ( NumSpatzOutstandingLoads*2 ), + .ID ( '0 ), + .EnBurst ( 1 ), + .ShuffleId ( 1 ), + .UserWidth ( $bits(refill_user_t) ), + .ReqUserFallThrough ( 1'b0 ), + .DataWidth ( AxiDataWidth ), + .AxiUserWidth ( AxiUserWidth ), + .reqrsp_req_t ( l2_req_t ), + .reqrsp_rsp_t ( l2_rsp_t ), + .axi_req_t ( axi_premux_cache_req_t ), + .axi_rsp_t ( axi_premux_cache_resp_t ) + ) i_reqrsp2axi ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .user_i ( l2_req[g][ch].q.user ), + .reqrsp_req_i ( l2_req[g][ch] ), + .reqrsp_rsp_o ( l2_rsp[g][ch] ), + .axi_req_o ( wide_axi_premux_req[g][ch] ), + .axi_rsp_i ( wide_axi_premux_rsp[g][ch] ) + ); + end end end - for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch ++) begin : gen_output_axi - reqrsp_to_axi #( - .MaxTrans (NumSpatzOutstandingLoads*2 ), - .ID ('0 ), - .EnBurst (1 ), - .ShuffleId (1 ), - .UserWidth ($bits(refill_user_t) ), - .ReqUserFallThrough (1'b0 ), - .DataWidth (AxiDataWidth ), - .AxiUserWidth (AxiUserWidth ), - .reqrsp_req_t (l2_req_t ), - .reqrsp_rsp_t (l2_rsp_t ), - .axi_req_t (axi_slv_cache_req_t ), - .axi_rsp_t (axi_slv_cache_resp_t ) - ) i_reqrsp2axi ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .user_i (l2_req[ch].q.user ), - .reqrsp_req_i (l2_req[ch] ), - .reqrsp_rsp_o (l2_rsp[ch] ), - .axi_req_o (wide_axi_slv_req[ch] ), - .axi_rsp_i (wide_axi_slv_rsp[ch] ) - ); - end + // Step 2: Per-L2-channel axi_mux across groups. + if (NumGroups > 1) begin : gen_l2_group_mux + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_l2_ch_mux + // Per-group ID remapper: reduces GroupAxiIdOutWidth to WideRefillIdWidth before the mux. + // axi_id_remap preserves ID independence (unlike axi_id_serialize) for performance. 
+ // AxiSlvPortMaxUniqIds = NumSpatzOutstandingLoads*2 matches the reqrsp_to_axi MaxTrans + // so the remapper never stalls. + for (genvar g = 0; g < NumGroups; g++) begin : gen_l2_mux_remap + axi_id_remap #( + .AxiSlvPortIdWidth ( GroupAxiIdOutWidth ), + .AxiSlvPortMaxUniqIds ( NumSpatzOutstandingLoads * 2 ), + .AxiMaxTxnsPerId ( NumSpatzOutstandingLoads ), + .AxiMstPortIdWidth ( WideIdWidthPreMux ), + .slv_req_t ( axi_premux_cache_req_t ), + .slv_resp_t ( axi_premux_cache_resp_t ), + .mst_req_t ( axi_remap_cache_req_t ), + .mst_resp_t ( axi_remap_cache_resp_t ) + ) i_l2_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( wide_axi_premux_req[g][ch] ), + .slv_resp_o ( wide_axi_premux_rsp[g][ch] ), + .mst_req_o ( wide_axi_remap_req[g][ch] ), + .mst_resp_i ( wide_axi_remap_rsp[g][ch] ) + ); + end + // Collect remapped per-group inputs for the mux. + axi_remap_cache_req_t [NumGroups-1:0] l2_mux_slv_req; + axi_remap_cache_resp_t [NumGroups-1:0] l2_mux_slv_rsp; + + for (genvar g = 0; g < NumGroups; g++) begin : gen_l2_mux_connect + assign l2_mux_slv_req[g] = wide_axi_remap_req[g][ch]; + assign wide_axi_remap_rsp[g][ch] = l2_mux_slv_rsp[g]; + end + + axi_mux #( + .SlvAxiIDWidth ( WideIdWidthPreMux ), + .slv_aw_chan_t ( axi_remap_cache_aw_chan_t ), + .mst_aw_chan_t ( axi_slv_cache_aw_chan_t ), + .w_chan_t ( axi_slv_cache_w_chan_t ), + .slv_b_chan_t ( axi_remap_cache_b_chan_t ), + .mst_b_chan_t ( axi_slv_cache_b_chan_t ), + .slv_ar_chan_t ( axi_remap_cache_ar_chan_t ), + .mst_ar_chan_t ( axi_slv_cache_ar_chan_t ), + .slv_r_chan_t ( axi_remap_cache_r_chan_t ), + .mst_r_chan_t ( axi_slv_cache_r_chan_t ), + .slv_req_t ( axi_remap_cache_req_t ), + .slv_resp_t ( axi_remap_cache_resp_t ), + .mst_req_t ( axi_slv_cache_req_t ), + .mst_resp_t ( axi_slv_cache_resp_t ), + .NoSlvPorts ( NumGroups ), + .FallThrough ( 0 ), + .SpillAw ( XbarLatency[4] ), + .SpillW ( XbarLatency[3] ), + .SpillB ( XbarLatency[2] ), + .SpillAr ( XbarLatency[1] ), + .SpillR ( XbarLatency[0] 
), + .MaxWTrans ( 2 ) + ) i_axi_l2_mux ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( '0 ), + .slv_reqs_i ( l2_mux_slv_req ), + .slv_resps_o ( l2_mux_slv_rsp ), + .mst_req_o ( wide_axi_slv_req[ch] ), + .mst_resp_i ( wide_axi_slv_rsp[ch] ) + ); + end + end else begin : gen_l2_no_mux + // Single group: direct connection, no mux needed. + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_l2_ch_direct + assign wide_axi_slv_req[ch] = wide_axi_premux_req[0][ch]; + assign wide_axi_premux_rsp[0][ch] = wide_axi_slv_rsp[ch]; + end + end - // ------------- - // To Main Memory - // ------------- // Optionally decouple the external wide AXI master port. for (genvar port = 0; port < ClusterWideOutAxiPorts; port ++) begin : gen_axi_out_cut axi_cut #( @@ -775,20 +551,20 @@ module cachepool_cluster axi_mux #( .SlvAxiIDWidth ( CsrAxiMstIdWidth ), - .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), // AW Channel Type, slave ports - .mst_aw_chan_t ( axi_uart_aw_chan_t ), // AW Channel Type, master port - .w_chan_t ( axi_uart_w_chan_t ), // W Channel Type, all ports - .slv_b_chan_t ( axi_csr_mst_b_chan_t ), // B Channel Type, slave ports - .mst_b_chan_t ( axi_uart_b_chan_t ), // B Channel Type, master port - .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), // AR Channel Type, slave ports - .mst_ar_chan_t ( axi_uart_ar_chan_t ), // AR Channel Type, master port - .slv_r_chan_t ( axi_csr_mst_r_chan_t ), // R Channel Type, slave ports - .mst_r_chan_t ( axi_uart_r_chan_t ), // R Channel Type, master port + .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), + .mst_aw_chan_t ( axi_uart_aw_chan_t ), + .w_chan_t ( axi_uart_w_chan_t ), + .slv_b_chan_t ( axi_csr_mst_b_chan_t ), + .mst_b_chan_t ( axi_uart_b_chan_t ), + .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), + .mst_ar_chan_t ( axi_uart_ar_chan_t ), + .slv_r_chan_t ( axi_csr_mst_r_chan_t ), + .mst_r_chan_t ( axi_uart_r_chan_t ), .slv_req_t ( axi_csr_mst_req_t ), .slv_resp_t ( axi_csr_mst_resp_t ), .mst_req_t ( axi_uart_req_t ), .mst_resp_t ( 
axi_uart_resp_t ), - .NoSlvPorts ( NumTiles ), // Number of Masters for the module + .NoSlvPorts ( NumTiles ), .FallThrough ( 0 ), .SpillAw ( XbarLatency[4] ), .SpillW ( XbarLatency[3] ), @@ -797,9 +573,9 @@ module cachepool_cluster .SpillR ( XbarLatency[0] ), .MaxWTrans ( 2 ) ) i_axi_uart_mux ( - .clk_i ( clk_i ), // Clock - .rst_ni ( rst_ni ), // Asynchronous reset active low - .test_i ( '0 ), // Test Mode enable + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( '0 ), .slv_reqs_i ( axi_uart_mux_req ), .slv_resps_o ( axi_uart_mux_rsp ), .mst_req_o ( axi_narrow_req_o ), @@ -810,43 +586,6 @@ module cachepool_cluster assign axi_out_resp[0][ClusterUart] = axi_narrow_resp_i; end - /***** BootROM ****/ - for (genvar t = 0; t < NumTiles; t++) begin : gen_bootrom - axi_to_reg #( - .ADDR_WIDTH (AxiAddrWidth ), - .DATA_WIDTH (AxiDataWidth ), - .AXI_MAX_WRITE_TXNS (1 ), - .AXI_MAX_READ_TXNS (1 ), - .DECOUPLE_W (0 ), - .ID_WIDTH (WideIdWidthIn ), - .USER_WIDTH (AxiUserWidth ), - .axi_req_t (axi_mst_cache_req_t ), - .axi_rsp_t (axi_mst_cache_resp_t), - .reg_req_t (reg_cache_req_t ), - .reg_rsp_t (reg_cache_rsp_t ) - ) i_axi_to_reg_bootrom ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .testmode_i (1'b0 ), - .axi_req_i (axi_tile_req[t][TileBootROM] ), - .axi_rsp_o (axi_tile_rsp[t][TileBootROM] ), - .reg_req_o (bootrom_reg_req[t] ), - .reg_rsp_i (bootrom_reg_rsp[t] ) - ); - - bootrom i_bootrom ( - .clk_i (clk_i ), - .req_i (bootrom_reg_req[t].valid ), - .addr_i (addr_t'(bootrom_reg_req[t].addr) ), - .rdata_o(bootrom_reg_rsp[t].rdata ) - ); - - `FF(bootrom_reg_rsp[t].ready, bootrom_reg_req[t].valid, 1'b0) - - assign bootrom_reg_rsp[t].error = 1'b0; - end - - /***** CSR/Peripherals *****/ `REG_BUS_TYPEDEF_ALL(reg, narrow_addr_t, narrow_data_t, narrow_strb_t) @@ -860,6 +599,11 @@ module cachepool_cluster axi_narrow_req_t [NumTiles-1:0] axi_core_csr_req, axi_barrier_req; axi_narrow_resp_t [NumTiles-1:0] axi_core_csr_rsp, axi_barrier_rsp; + // Serialized CSR signals: one entry per 
tile plus one for the external axi_in port. + // Index [NumTiles] = axi_in_req_i, indices [NumTiles-1:0] = per-tile CSR outputs. + axi_csr_ser_req_t [NumTiles:0] axi_csr_pre_mux_req; + axi_csr_ser_resp_t [NumTiles:0] axi_csr_pre_mux_rsp; + for (genvar t = 0; t < NumTiles; t++) begin assign axi_barrier_req[t] = axi_out_req [t][ClusterPeriph]; @@ -895,23 +639,81 @@ module cachepool_cluster .cluster_periph_start_address_i ( tcdm_end_address ) ); + // Per-tile CSR ID serializers: reduce CsrAxiMstIdWidth to CsrSerIdWidth before the mux + // so the mux output stays bounded regardless of NumTiles. + for (genvar t = 0; t < NumTiles; t++) begin : gen_csr_id_serialize + axi_id_serialize #( + .AxiSlvPortIdWidth ( CsrAxiMstIdWidth ), + .AxiSlvPortMaxTxns ( 2 ), + .AxiMstPortIdWidth ( CsrSerIdWidth ), + .AxiMstPortMaxUniqIds ( 1 ), + .AxiMstPortMaxTxnsPerId ( 2 ), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( SpatzAxiNarrowDataWidth ), + .AxiUserWidth ( SpatzAxiUserWidth ), + .AtopSupport ( 1'b0 ), + .slv_req_t ( axi_narrow_req_t ), + .slv_resp_t ( axi_narrow_resp_t ), + .mst_req_t ( axi_csr_ser_req_t ), + .mst_resp_t ( axi_csr_ser_resp_t ), + // Provide one dummy entry to avoid [IdMapNumEntries-1:0] underflow when 0. + // Entry maps ID 0 -> 0, which is identical to the default modulo formula. + .IdMapNumEntries ( 1 ), + .IdMap ( '{'{32'd0, 32'd0}} ) + ) i_csr_id_serialize ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_core_csr_req[t] ), + .slv_resp_o ( axi_core_csr_rsp[t] ), + .mst_req_o ( axi_csr_pre_mux_req[t] ), + .mst_resp_i ( axi_csr_pre_mux_rsp[t] ) + ); + end + + // Serializer for the external axi_in port (SoC CSR access). 
+ axi_id_serialize #( + .AxiSlvPortIdWidth ( AxiIdWidthIn ), + .AxiSlvPortMaxTxns ( 2 ), + .AxiMstPortIdWidth ( CsrSerIdWidth ), + .AxiMstPortMaxUniqIds ( 1 ), + .AxiMstPortMaxTxnsPerId ( 2 ), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( SpatzAxiNarrowDataWidth ), + .AxiUserWidth ( SpatzAxiUserWidth ), + .AtopSupport ( 1'b0 ), + .slv_req_t ( axi_in_req_t ), + .slv_resp_t ( axi_in_resp_t ), + .mst_req_t ( axi_csr_ser_req_t ), + .mst_resp_t ( axi_csr_ser_resp_t ), + // Provide one dummy entry to avoid [IdMapNumEntries-1:0] underflow when 0. + // Entry maps ID 0 -> 0, which is identical to the default modulo formula. + .IdMapNumEntries ( 1 ), + .IdMap ( '{'{32'd0, 32'd0}} ) + ) i_csr_in_id_serialize ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_in_req_i ), + .slv_resp_o ( axi_in_resp_o ), + .mst_req_o ( axi_csr_pre_mux_req[NumTiles] ), + .mst_resp_i ( axi_csr_pre_mux_rsp[NumTiles] ) + ); axi_mux #( - .SlvAxiIDWidth ( CsrAxiMstIdWidth ), - .slv_aw_chan_t ( axi_csr_mst_aw_chan_t ), // AW Channel Type, slave ports - .mst_aw_chan_t ( axi_csr_slv_aw_chan_t ), // AW Channel Type, master port - .w_chan_t ( axi_csr_slv_w_chan_t ), // W Channel Type, all ports - .slv_b_chan_t ( axi_csr_mst_b_chan_t ), // B Channel Type, slave ports - .mst_b_chan_t ( axi_csr_slv_b_chan_t ), // B Channel Type, master port - .slv_ar_chan_t ( axi_csr_mst_ar_chan_t ), // AR Channel Type, slave ports - .mst_ar_chan_t ( axi_csr_slv_ar_chan_t ), // AR Channel Type, master port - .slv_r_chan_t ( axi_csr_mst_r_chan_t ), // R Channel Type, slave ports - .mst_r_chan_t ( axi_csr_slv_r_chan_t ), // R Channel Type, master port - .slv_req_t ( axi_csr_mst_req_t ), - .slv_resp_t ( axi_csr_mst_resp_t ), + .SlvAxiIDWidth ( CsrSerIdWidth ), + .slv_aw_chan_t ( axi_csr_ser_aw_chan_t ), + .mst_aw_chan_t ( axi_csr_slv_aw_chan_t ), + .w_chan_t ( axi_csr_slv_w_chan_t ), + .slv_b_chan_t ( axi_csr_ser_b_chan_t ), + .mst_b_chan_t ( axi_csr_slv_b_chan_t ), + .slv_ar_chan_t ( axi_csr_ser_ar_chan_t ), 
+ .mst_ar_chan_t ( axi_csr_slv_ar_chan_t ), + .slv_r_chan_t ( axi_csr_ser_r_chan_t ), + .mst_r_chan_t ( axi_csr_slv_r_chan_t ), + .slv_req_t ( axi_csr_ser_req_t ), + .slv_resp_t ( axi_csr_ser_resp_t ), .mst_req_t ( axi_csr_slv_req_t ), .mst_resp_t ( axi_csr_slv_resp_t ), - .NoSlvPorts ( NumTiles + 1 ), // Number of Masters for the module + .NoSlvPorts ( NumTiles + 1 ), .FallThrough ( 0 ), .SpillAw ( XbarLatency[4] ), .SpillW ( XbarLatency[3] ), @@ -920,13 +722,13 @@ module cachepool_cluster .SpillR ( XbarLatency[0] ), .MaxWTrans ( 2 ) ) i_axi_csr_mux ( - .clk_i ( clk_i ), // Clock - .rst_ni ( rst_ni ), // Asynchronous reset active low - .test_i ('0 ), // Test Mode enable - .slv_reqs_i ( {axi_in_req_i, axi_core_csr_req} ), - .slv_resps_o ( {axi_in_resp_o, axi_core_csr_rsp} ), - .mst_req_o ( axi_csr_req ), - .mst_resp_i ( axi_csr_rsp ) + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ('0 ), + .slv_reqs_i ( axi_csr_pre_mux_req ), + .slv_resps_o ( axi_csr_pre_mux_rsp ), + .mst_req_o ( axi_csr_req ), + .mst_resp_i ( axi_csr_rsp ) ); axi_to_reg #( @@ -942,39 +744,15 @@ module cachepool_cluster .reg_req_t (reg_req_t ), .reg_rsp_t (reg_rsp_t ) ) i_csr_axi_to_reg ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .testmode_i (1'b0 ), - .axi_req_i (axi_csr_req ), - .axi_rsp_o (axi_csr_rsp ), - .reg_req_o (reg_req ), - .reg_rsp_i (reg_rsp ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (1'b0 ), + .axi_req_i (axi_csr_req ), + .axi_rsp_o (axi_csr_rsp ), + .reg_req_o (reg_req ), + .reg_rsp_i (reg_rsp ) ); - - // Event counter increments for the TCDM. - typedef struct packed { - /// Number requests going in - logic [$clog2(5):0] inc_accessed; - /// Number of requests stalled due to congestion - logic [$clog2(5):0] inc_congested; - } tcdm_events_t; - - // Event counter increments for DMA. 
- typedef struct packed { - logic aw_stall, ar_stall, r_stall, w_stall, - buf_w_stall, buf_r_stall; - logic aw_valid, aw_ready, aw_done, aw_bw; - logic ar_valid, ar_ready, ar_done, ar_bw; - logic r_valid, r_ready, r_done, r_bw; - logic w_valid, w_ready, w_done, w_bw; - logic b_valid, b_ready, b_done; - logic dma_busy; - axi_pkg::len_t aw_len, ar_len; - axi_pkg::size_t aw_size, ar_size; - logic [$clog2(SpatzAxiNarrowDataWidth/8):0] num_bytes_written; - } dma_events_t; - cachepool_peripheral #( .AddrWidth (AxiAddrWidth ), .SPMWidth ($clog2(L1NumSet)), diff --git a/hardware/src/cachepool_group.sv b/hardware/src/cachepool_group.sv index b14d1ac..1e6e5d4 100644 --- a/hardware/src/cachepool_group.sv +++ b/hardware/src/cachepool_group.sv @@ -4,19 +4,9 @@ // Author: Diyou Shen -`include "axi/assign.svh" `include "axi/typedef.svh" -`include "common_cells/assertions.svh" `include "common_cells/registers.svh" -`include "mem_interface/assign.svh" -`include "mem_interface/typedef.svh" -`include "register_interface//assign.svh" `include "register_interface/typedef.svh" -`include "reqrsp_interface/assign.svh" -`include "reqrsp_interface/typedef.svh" -`include "snitch_vm/typedef.svh" -`include "tcdm_interface/assign.svh" -`include "tcdm_interface/typedef.svh" /// Group implementation for CachePool module cachepool_group @@ -24,7 +14,7 @@ module cachepool_group import spatz_pkg::*; import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; - import snitch_icache_pkg::icache_events_t; + import snitch_icache_pkg::icache_l1_events_t; #( /// Width of physical address. parameter int unsigned AxiAddrWidth = 48, @@ -48,10 +38,6 @@ module cachepool_group parameter int unsigned ClusterPeriphSize = 64, /// Number of TCDM Banks. parameter int unsigned NrBanks = 2 * NrCores, - /// Size of DMA AXI buffer. - parameter int unsigned DMAAxiReqFifoDepth = 3, - /// Size of DMA request fifo. - parameter int unsigned DMAReqFifoDepth = 3, /// Width of a single icache line. 
parameter unsigned ICacheLineWidth = 0, /// Number of icache lines per set. @@ -65,8 +51,6 @@ module cachepool_group /// Spatz FPU/IPU Configuration parameter int unsigned NumSpatzFPUs = 4, parameter int unsigned NumSpatzIPUs = 1, - /// Per-core enabling of the custom `Xdma` ISA extensions. - parameter bit [NrCores-1:0] Xdma = '{default: '0}, /// # Per-core parameters /// Per-core integer outstanding loads parameter int unsigned NumIntOutstandingLoads = 0, @@ -107,7 +91,12 @@ module cachepool_group parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, /// # SRAM Configuration rules needed: L1D Tag + L1D Data + L1D FIFO + L1I Tag + L1I Data /*** ATTENTION: `NrSramCfg` should be changed if `L1NumDataBank` and `L1NumTagBank` is changed ***/ - parameter int unsigned NrSramCfg = 1 + parameter int unsigned NrSramCfg = 1, + + localparam int unsigned TotRGPorts = (NumRemoteGroupPortCore == 0) ? 0 : + NumTilesPerGroup*NumRemoteGroupPortCore*NrTCDMPortsPerCore-1, + localparam int unsigned NumRemoteGroupPortTile = (NumRemoteGroupPortCore == 0) ? 1 : + NumRemoteGroupPortCore * NrTCDMPortsPerCore ) ( /// System clock. input logic clk_i, @@ -115,48 +104,60 @@ module cachepool_group input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic [NrCores-1:0] debug_req_i, + input logic debug_req_i, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. - input logic [NrCores-1:0] meip_i, + input logic meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] mtip_i, + input logic mtip_i, /// Core software interrupt pending. 
Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] msip_i, + input logic msip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. input logic [9:0] hart_base_id_i, + /// Globally-unique tile ID of the first tile in this group (= group_index * NumTilesPerGroup). + input logic [TileIDWidth-1:0] tile_base_id_i, /// Base address of cluster. TCDM and cluster peripheral location are derived from /// it. This signal is pseudo-static. input axi_addr_t cluster_base_addr_i, /// Partitioning address input axi_addr_t private_start_addr_i, /// AXI Narrow out-port (UART/Peripheral) - output axi_narrow_req_t [GroupNarrowAxiPorts-1:0] axi_narrow_req_o, - input axi_narrow_resp_t [GroupNarrowAxiPorts-1:0] axi_narrow_rsp_i, - /// Wide AXI ports to cluster level - output axi_out_req_t [GroupWideAxiPorts-1:0] axi_wide_req_o, - input axi_out_resp_t [GroupWideAxiPorts-1:0] axi_wide_rsp_i, + output axi_narrow_req_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_req_o, + input axi_narrow_resp_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_rsp_i, - /// Cache refill ports - output cache_trans_req_t [NumL1CacheCtrl-1:0] cache_refill_req_o, - input cache_trans_rsp_t [NumL1CacheCtrl-1:0] cache_refill_rsp_i, + /// DRAM refill reqrsp ports (post-xbar, one per L2 channel) + output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, + input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, /// Peripheral signals - output icache_events_t [NrCores-1:0] icache_events_o, + output icache_l1_events_t [NrCores-1:0] icache_events_o, input logic icache_prefetch_enable_i, input logic [NrCores-1:0] cl_interrupt_i, input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, input logic [3:0] l1d_private_i, - input cache_insn_t l1d_insn_i, + input cache_insn_t 
l1d_insn_i, input logic l1d_insn_valid_i, - output logic [NumTiles-1:0] l1d_insn_ready_o, - input logic [NumTiles-1:0] l1d_busy_i, + output logic [NumTilesPerGroup-1:0] l1d_insn_ready_o, + input logic [NumTilesPerGroup-1:0] l1d_busy_i, + + /// Inter-group remote access ports (to other groups). + /// Layout: [NumTilesPerGroup-1:0][NumRemoteGroupPortTile-1:0] flattened to + /// [NumTilesPerGroup * NumRemoteGroupPortTile - 1 : 0]. + /// Per-tile flat index: j + r * NrTCDMPortsPerCore (j = interco instance, + /// r = inter-group slot within that instance). + /// NumRemoteGroupPortTile = NumRemoteGroupPortCore * NrTCDMPortsPerCore. + /// Uses REQRSP-style types with built-in ready and remote_group_user_t. + output remote_group_req_t [TotRGPorts:0] remote_group_req_o, + input remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_i, + /// Inter-group remote access ports (from other groups) + input remote_group_req_t [TotRGPorts:0] remote_group_req_i, + output remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_o, /// SRAM Configuration input impl_in_t [NrSramCfg-1:0] impl_i, @@ -173,17 +174,9 @@ module cachepool_group // --------- // Constants // --------- - /// Minimum width to hold the core number. - localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores); - localparam int unsigned TileIDWidth = cf_math_pkg::idx_width(NumTiles); - - // Enlarge the address width for Spatz due to cache - localparam int unsigned TCDMAddrWidth = L1AddrWidth; + // Per-group overrides of package-level constants that depend on NumTiles/NumCores. 
+ localparam int unsigned NumL1CacheCtrlLocal = NrCores; - // Core Request, SoC Request - localparam int unsigned NrNarrowMasters = 2; - - localparam int unsigned WideIdWidthOut = AxiIdWidthOut; localparam int unsigned WideIdWidthIn = AxiIdWidthOut; @@ -194,13 +187,9 @@ module cachepool_group typedef logic [AxiDataWidth-1:0] data_cache_t; typedef logic [AxiDataWidth/8-1:0] strb_cache_t; typedef logic [WideIdWidthIn-1:0] id_cache_mst_t; - typedef logic [WideIdWidthOut-1:0] id_cache_slv_t; typedef logic [AxiUserWidth-1:0] user_cache_t; `AXI_TYPEDEF_ALL(axi_mst_cache, addr_t, id_cache_mst_t, data_cache_t, strb_cache_t, user_cache_t) - `AXI_TYPEDEF_ALL(axi_slv_cache, addr_t, id_cache_slv_t, data_cache_t, strb_cache_t, user_cache_t) - - `REG_BUS_TYPEDEF_ALL(reg_cache, addr_t, data_cache_t, strb_cache_t) typedef struct packed { int unsigned idx; @@ -208,43 +197,472 @@ module cachepool_group addr_t end_addr; } xbar_rule_t; - `SNITCH_VM_TYPEDEF(AxiAddrWidth) - // --------------- // CachePool Tile // --------------- - logic [NumTiles-1:0] error; + logic [NumTilesPerGroup-1:0] error; assign error_o = |error; + // Internal tile-side wide AXI: split into two flat arrays by port function + // BootROM (TileBootROM=0): muxed into single shared bootrom in this group + axi_mst_cache_req_t [NumTilesPerGroup-1:0] axi_tile_bootrom_req; + axi_mst_cache_resp_t [NumTilesPerGroup-1:0] axi_tile_bootrom_rsp; + // TileMem (TileMem=1): stays in group, fed into axi_to_reqrsp + axi_mst_cache_req_t [NumTilesPerGroup-1:0] axi_tile_mem_req; + axi_mst_cache_resp_t [NumTilesPerGroup-1:0] axi_tile_mem_rsp; + + // Per-group bootrom mux AXI type: the mux prepends $clog2(NumTilesPerGroup) + // bits to the ID, not $clog2(NumTiles) as the package assumes. 
+ localparam int unsigned LocalBootRomIdWidth = WideIdWidthIn + $clog2(NumTilesPerGroup); + typedef logic [LocalBootRomIdWidth-1:0] local_bootrom_id_t; + `AXI_TYPEDEF_ALL(local_bootrom, addr_t, local_bootrom_id_t, data_cache_t, strb_cache_t, user_cache_t) + + // Mux all per-tile BootROM AXI ports into a single bootrom instance + local_bootrom_req_t axi_bootrom_mux_req; + local_bootrom_resp_t axi_bootrom_mux_rsp; + + if (NumTilesPerGroup > 1) begin : gen_bootrom_mux + axi_mux #( + .SlvAxiIDWidth ( WideIdWidthIn ), + .slv_aw_chan_t ( axi_mst_cache_aw_chan_t ), + .mst_aw_chan_t ( local_bootrom_aw_chan_t ), + .w_chan_t ( axi_mst_cache_w_chan_t ), + .slv_b_chan_t ( axi_mst_cache_b_chan_t ), + .mst_b_chan_t ( local_bootrom_b_chan_t ), + .slv_ar_chan_t ( axi_mst_cache_ar_chan_t ), + .mst_ar_chan_t ( local_bootrom_ar_chan_t ), + .slv_r_chan_t ( axi_mst_cache_r_chan_t ), + .mst_r_chan_t ( local_bootrom_r_chan_t ), + .slv_req_t ( axi_mst_cache_req_t ), + .slv_resp_t ( axi_mst_cache_resp_t ), + .mst_req_t ( local_bootrom_req_t ), + .mst_resp_t ( local_bootrom_resp_t ), + .NoSlvPorts ( NumTilesPerGroup ), + .FallThrough ( 0 ), + .SpillAw ( XbarLatency[4] ), + .SpillW ( XbarLatency[3] ), + .SpillB ( XbarLatency[2] ), + .SpillAr ( XbarLatency[1] ), + .SpillR ( XbarLatency[0] ), + .MaxWTrans ( 2 ) + ) i_axi_bootrom_mux ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( '0 ), + .slv_reqs_i ( axi_tile_bootrom_req ), + .slv_resps_o( axi_tile_bootrom_rsp ), + .mst_req_o ( axi_bootrom_mux_req ), + .mst_resp_i ( axi_bootrom_mux_rsp ) + ); + end else begin : gen_bootrom_connect + // NumTilesPerGroup==1: direct connect, no ID widening needed + assign axi_bootrom_mux_req = local_bootrom_req_t'(axi_tile_bootrom_req[0]); + assign axi_tile_bootrom_rsp[0] = axi_mst_cache_resp_t'(axi_bootrom_mux_rsp); + end + + // Single BootROM instance shared across all tiles in the group + `REG_BUS_TYPEDEF_ALL(reg_bootrom, addr_t, data_cache_t, strb_cache_t) + reg_bootrom_req_t bootrom_reg_req; + 
reg_bootrom_rsp_t bootrom_reg_rsp; + + axi_to_reg #( + .ADDR_WIDTH ( AxiAddrWidth ), + .DATA_WIDTH ( AxiDataWidth ), + .AXI_MAX_WRITE_TXNS ( 1 ), + .AXI_MAX_READ_TXNS ( 1 ), + .DECOUPLE_W ( 0 ), + .ID_WIDTH ( LocalBootRomIdWidth ), + .USER_WIDTH ( AxiUserWidth ), + .axi_req_t ( local_bootrom_req_t ), + .axi_rsp_t ( local_bootrom_resp_t ), + .reg_req_t ( reg_bootrom_req_t ), + .reg_rsp_t ( reg_bootrom_rsp_t ) + ) i_axi_to_reg_bootrom ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .testmode_i ( 1'b0 ), + .axi_req_i ( axi_bootrom_mux_req ), + .axi_rsp_o ( axi_bootrom_mux_rsp ), + .reg_req_o ( bootrom_reg_req ), + .reg_rsp_i ( bootrom_reg_rsp ) + ); + + bootrom i_bootrom ( + .clk_i ( clk_i ), + .req_i ( bootrom_reg_req.valid ), + .addr_i ( addr_t'(bootrom_reg_req.addr) ), + .rdata_o ( bootrom_reg_rsp.rdata ) + ); + + `FF(bootrom_reg_rsp.ready, bootrom_reg_req.valid, 1'b0) + assign bootrom_reg_rsp.error = 1'b0; + + // Cache refill ports from tiles (NumL1CacheCtrlLocal = NrCores entries, one per core in this group) + cache_trans_req_t [NumL1CacheCtrlLocal-1:0] cache_refill_req; + cache_trans_rsp_t [NumL1CacheCtrlLocal-1:0] cache_refill_rsp; + + // L2 Group ICache AXI master output (from axi_hier_interco) + axi_mst_cache_req_t axi_l2icache_mst_req; + axi_mst_cache_resp_t axi_l2icache_mst_rsp; + // L2 Group ICache reqrsp output (to xbar port 0) + cache_trans_req_t cache_l2icache_req; + cache_trans_rsp_t cache_l2icache_rsp; + // L2 Group ICache control (hardwired) + ro_cache_ctrl_t l2icache_ctrl; + + // Flat xbar input channels: NumTilesPerGroup * NumClusterMst ports + cache_trans_req_chan_t [NumTilesPerGroup*NumClusterMst-1:0] tile_req_chan; + cache_trans_rsp_chan_t [NumTilesPerGroup*NumClusterMst-1:0] tile_rsp_chan; + logic [NumTilesPerGroup*NumClusterMst-1:0] tile_req_valid, tile_req_ready, + tile_rsp_valid, tile_rsp_ready; + + // Xbar output channels: one per L2 channel + cache_trans_req_chan_t [ClusterWideOutAxiPorts-1:0] l2_req_chan; + cache_trans_rsp_chan_t [ClusterWideOutAxiPorts-1:0]
l2_rsp_chan; + logic [ClusterWideOutAxiPorts-1:0] l2_req_valid, l2_req_ready, + l2_rsp_valid, l2_rsp_ready; + + // Selection types + typedef logic [$clog2(NumClusterMst*NumTilesPerGroup)-1:0] l2_sel_t; + typedef logic [$clog2(ClusterWideOutAxiPorts) :0] tile_sel_err_t; // one extra bit for OOB + typedef logic [$clog2(ClusterWideOutAxiPorts)-1:0] tile_sel_t; + + tile_sel_err_t [NumTilesPerGroup*NumClusterMst-1:0] tile_sel_err; + tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] tile_sel; + l2_sel_t [ClusterWideOutAxiPorts-1:0] l2_sel; + tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] l2_rsp_rr; + + logic [NumTilesPerGroup*NumClusterMst-1:0] rr_lock_d, rr_lock_q; + tile_sel_t [NumTilesPerGroup*NumClusterMst-1:0] l2_prio_d, l2_prio_q; + + // port_id: which xbar input port does each L2 channel response target + l2_sel_t [ClusterWideOutAxiPorts-1:0] port_id; + for (genvar i = 0; i < ClusterWideOutAxiPorts; i++) begin + assign port_id[i] = l2_rsp_i[i].p.user.tile_id * NumClusterMst + + l2_rsp_i[i].p.user.bank_id; + end + + // --------------------- + // L2 Group ICache: NumTilesPerGroup-to-1 AXI mux + read-only cache + ID remap + // --------------------- + always_comb begin + l2icache_ctrl = '0; + l2icache_ctrl.enable = 1'b1; + l2icache_ctrl.flush_valid = 1'b0; + l2icache_ctrl.start_addr[0] = DramAddr; + l2icache_ctrl.end_addr[0] = DramAddr + DramSize; + end + + axi_hier_interco #( + .NumSlvPorts ( NumTilesPerGroup ), + .NumMstPorts ( 1 ), + .Radix ( NumTilesPerGroup ), + .EnableCache ( 1 ), + .CacheLineWidth ( L2ICacheLineWidth ), + .CacheSizeByte ( L2ICacheSizeByte ), + .CacheSets ( L2ICacheSets ), + .AddrWidth ( AxiAddrWidth ), + .DataWidth ( AxiDataWidth ), + .SlvIdWidth ( WideIdWidthIn ), + .MstIdWidth ( WideIdWidthIn ), + .UserWidth ( AxiUserWidth ), + .slv_req_t ( axi_mst_cache_req_t ), + .slv_resp_t ( axi_mst_cache_resp_t ), + .mst_req_t ( axi_mst_cache_req_t ), + .mst_resp_t ( axi_mst_cache_resp_t ) + ) i_l2icache_interco ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i (
1'b0 ), + .ro_cache_ctrl_i ( l2icache_ctrl ), + .slv_req_i ( axi_tile_mem_req ), + .slv_resp_o ( axi_tile_mem_rsp ), + .mst_req_o ( axi_l2icache_mst_req ), + .mst_resp_i ( axi_l2icache_mst_rsp ) + ); + + // Single axi_to_reqrsp for the L2 ICache master output + axi_to_reqrsp #( + .axi_req_t ( axi_mst_cache_req_t ), + .axi_rsp_t ( axi_mst_cache_resp_t ), + .AddrWidth ( AxiAddrWidth ), + .DataWidth ( AxiDataWidth ), + .UserWidth ( $bits(refill_user_t) ), + .IdWidth ( WideIdWidthIn ), + .BufDepth ( NumSpatzOutstandingLoads ), + .reqrsp_req_t ( cache_trans_req_t ), + .reqrsp_rsp_t ( cache_trans_rsp_t ) + ) i_l2icache_axi2reqrsp ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .busy_o ( ), + .axi_req_i ( axi_l2icache_mst_req ), + .axi_rsp_o ( axi_l2icache_mst_rsp ), + .reqrsp_req_o ( cache_l2icache_req ), + .reqrsp_rsp_i ( cache_l2icache_rsp ) + ); + + // --------------------- + // Wiring: assemble flat xbar input from icache-bypass and refill paths + // --------------------- + // Port layout per tile: p=0 -> L2 ICache output (t=0) or unused (t>0), + // p=1..NumL1CtrlTile -> refill (cache_refill_req) + localparam int unsigned ReqrspPortsTile = NumL1CtrlTile + 1; + always_comb begin + for (int t = 0; t < NumTilesPerGroup; t++) begin + for (int p = 0; p < ReqrspPortsTile; p++) begin + automatic int unsigned xbar_idx = t * ReqrspPortsTile + p; + automatic int unsigned refill_idx = t * NumL1CtrlTile + p - 1; + + if (p == 0) begin + if (t == 0) begin + // L2 ICache output → xbar port 0 + tile_req_chan [xbar_idx] = cache_l2icache_req.q; + tile_req_chan [xbar_idx].addr = scrambleAddr(cache_l2icache_req.q.addr); + tile_req_valid [xbar_idx] = cache_l2icache_req.q_valid; + cache_l2icache_rsp.q_ready = tile_req_ready[xbar_idx]; + + cache_l2icache_rsp.p = tile_rsp_chan [xbar_idx]; + cache_l2icache_rsp.p_valid = tile_rsp_valid[xbar_idx]; + tile_rsp_ready [xbar_idx] = cache_l2icache_req.p_ready; + tile_req_chan [xbar_idx].user.tile_id = '0; + end else begin + // unused icache-bypass 
ports (tiles 1-3) + tile_req_chan [xbar_idx] = '0; + tile_req_valid [xbar_idx] = 1'b0; + tile_rsp_ready [xbar_idx] = 1'b0; + end + end else begin + // refill path + tile_req_chan [xbar_idx] = cache_refill_req[refill_idx].q; + tile_req_chan [xbar_idx].addr = scrambleAddr(cache_refill_req[refill_idx].q.addr); + tile_req_valid [xbar_idx] = cache_refill_req[refill_idx].q_valid; + cache_refill_rsp[refill_idx].q_ready = tile_req_ready[xbar_idx]; + + cache_refill_rsp[refill_idx].p = tile_rsp_chan [xbar_idx]; + cache_refill_rsp[refill_idx].p_valid = tile_rsp_valid[xbar_idx]; + tile_rsp_ready [xbar_idx] = cache_refill_req[refill_idx].p_ready; + tile_req_chan [xbar_idx].user.tile_id = t; + end + end + end + end + + // --------------------- + // Address decoder: select L2 channel per xbar input port + // --------------------- + typedef struct packed { + int unsigned idx; + logic [AxiAddrWidth-1:0] base; + logic [AxiAddrWidth-1:0] mask; + } reqrsp_rule_t; + + reqrsp_rule_t [ClusterWideOutAxiPorts-1:0] xbar_rule; + for (genvar i = 0; i < ClusterWideOutAxiPorts; i++) begin + assign xbar_rule[i] = '{ + idx : i, + base : DramAddr + DramPerChSize * i, + mask : ({AxiAddrWidth{1'b1}} << $clog2(DramPerChSize)) + }; + end + + logic [$clog2(ClusterWideOutAxiPorts):0] default_idx; + assign default_idx = ClusterWideOutAxiPorts; + + for (genvar inp = 0; inp < NumClusterMst*NumTilesPerGroup; inp++) begin : gen_xbar_sel + addr_decode_napot #( + .NoIndices ( ClusterWideOutAxiPorts+1 ), + .NoRules ( ClusterWideOutAxiPorts ), + .addr_t ( axi_addr_t ), + .rule_t ( reqrsp_rule_t ) + ) i_snitch_decode_napot ( + .addr_i ( tile_req_chan[inp].addr ), + .addr_map_i ( xbar_rule ), + .idx_o ( tile_sel_err[inp] ), + .dec_valid_o ( /* unused */ ), + .dec_error_o ( /* unused */ ), + .en_default_idx_i ( 1'b1 ), + .default_idx_i ( default_idx ) + ); + assign tile_sel[inp] = tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)-1:0]; + +`ifndef TARGET_SYNTHESIS + IllegalMemAccess : assert property ( + @(posedge 
clk_i) disable iff (!rst_ni) + (tile_req_valid[inp] |-> !tile_sel_err[inp][$clog2(ClusterWideOutAxiPorts)])) + else $error("Visited illegal address: time=%0t, port=%0d, addr=0x%08h", + $time, inp, tile_req_chan[inp].addr); +`endif + end + + // --------------------- + // Burst protection logic + // --------------------- + if (Burst_Enable) begin : gen_burst_ext_sel + `FF(rr_lock_q, rr_lock_d, 1'b0) + `FF(l2_prio_q, l2_prio_d, 1'b0) + + for (genvar port = 0; port < NumTilesPerGroup*NumClusterMst; port++) begin : gen_rsp_rr + tile_sel_t l2_rr; + logic [ClusterWideOutAxiPorts-1:0] arb_valid; + + for (genvar i = 0; i < ClusterWideOutAxiPorts; i++) begin + assign arb_valid[i] = (port_id[i] == port) & l2_rsp_valid[i]; + end + + always_comb begin + l2_prio_d[port] = l2_prio_q[port]; + rr_lock_d[port] = rr_lock_q[port]; + + if (|arb_valid) begin + if (rr_lock_q[port]) begin + l2_prio_d[port] = l2_prio_q[port]; + end else begin + l2_prio_d[port] = l2_rr; + end + end + l2_rsp_rr[port] = l2_prio_d[port]; + + if (tile_rsp_chan[port].user.burst.is_burst & |arb_valid) begin + if (tile_rsp_chan[port].user.burst.burst_len == 0) begin + rr_lock_d[port] = 1'b0; + end else begin + rr_lock_d[port] = 1'b1; + end + end + end + + rr_arb_tree #( + .NumIn ( ClusterWideOutAxiPorts ), + .DataType ( logic ), + .ExtPrio ( 1'b0 ), + .AxiVldRdy ( 1'b1 ), + .LockIn ( 1'b1 ) + ) i_rr_arb_tree ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .flush_i ( '0 ), + .rr_i ( '0 ), + .req_i ( arb_valid ), + .gnt_o ( /* not used */ ), + .data_i ( '0 ), + .req_o ( /* not used */ ), + .gnt_i ( tile_rsp_ready[port]), + .data_o ( /* not used */ ), + .idx_o ( l2_rr ) + ); + end + end else begin + assign l2_prio_d = '0; + assign l2_prio_q = '0; + assign rr_lock_d = '0; + assign rr_lock_q = '0; + assign l2_rsp_rr = '0; + end + + // --------------------- + // Refill (DRAM) xbar + // --------------------- + reqrsp_xbar #( + .NumInp ( NumClusterMst*NumTilesPerGroup ), + .NumOut ( ClusterWideOutAxiPorts ), + .PipeReg ( 
1'b1 ), + .ExtReqPrio ( 1'b0 ), + .ExtRspPrio ( Burst_Enable ), + .tcdm_req_chan_t ( cache_trans_req_chan_t ), + .tcdm_rsp_chan_t ( cache_trans_rsp_chan_t ) + ) i_refill_xbar ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( tile_req_chan ), + .slv_req_valid_i ( tile_req_valid ), + .slv_req_ready_o ( tile_req_ready ), + .slv_rsp_o ( tile_rsp_chan ), + .slv_rsp_valid_o ( tile_rsp_valid ), + .slv_rsp_ready_i ( tile_rsp_ready ), + .slv_sel_i ( tile_sel[NumTilesPerGroup*NumClusterMst-1:0] ), + .slv_rr_i ( '0 ), + .slv_selected_o ( /* unused */ ), + .mst_req_o ( l2_req_chan ), + .mst_req_valid_o ( l2_req_valid ), + .mst_req_ready_i ( l2_req_ready ), + .mst_rsp_i ( l2_rsp_chan ), + .mst_rr_i ( l2_rsp_rr ), + .mst_rsp_valid_i ( l2_rsp_valid ), + .mst_rsp_ready_o ( l2_rsp_ready ), + .mst_sel_i ( l2_sel ) + ); + + // --------------------- + // l2_req/rsp packing: bridge xbar channels <-> l2_req_t/l2_rsp_t port + // --------------------- + for (genvar ch = 0; ch < ClusterWideOutAxiPorts; ch++) begin : gen_l2_pack + always_comb begin + // Request: xbar -> group output port + l2_req_o[ch].q = '{ + addr : l2_req_chan[ch].addr, + write : l2_req_chan[ch].write, + amo : l2_req_chan[ch].amo, + data : l2_req_chan[ch].data, + strb : l2_req_chan[ch].strb, + size : l2_req_chan[ch].size, + default: '0 + }; + l2_req_o[ch].q.user = l2_req_chan[ch].user; + l2_req_o[ch].q_valid = l2_req_valid[ch]; + l2_req_ready[ch] = l2_rsp_i[ch].q_ready; + + // Response: group input port -> xbar + l2_rsp_chan[ch] = '{ + data : l2_rsp_i[ch].p.data, + error : l2_rsp_i[ch].p.error, + write : l2_rsp_i[ch].p.write, + default: '0 + }; + l2_rsp_chan[ch].user = l2_rsp_i[ch].p.user; + l2_rsp_valid[ch] = l2_rsp_i[ch].p_valid; + l2_req_o[ch].p_ready = l2_rsp_ready[ch]; + + // Response demux: which xbar input port does this response target? 
+ l2_sel[ch] = l2_rsp_i[ch].p.user.tile_id * NumClusterMst + + l2_rsp_i[ch].p.user.bank_id; + end + end + // Tile remote access signals // In/Out relative to the tile (out--leave a tile; in--enter a tile) // Tile-side flat layout: index = j + r*NrTCDMPortsPerCore (j=xbar idx, r=remote slot within xbar) - tcdm_req_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_out_req; - tcdm_rsp_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_out_rsp; - logic [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_ready, tile_remote_out_ready; + tcdm_req_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_out_req; + tcdm_rsp_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_out_rsp; + logic [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_in_ready, tile_remote_out_ready; - tcdm_req_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_req; - tcdm_rsp_t [NumTiles-1:0][NumRemotePortTile-1:0] tile_remote_in_rsp; + tcdm_req_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_in_req; + tcdm_rsp_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] tile_remote_in_rsp; - // Xbar-side: NrTCDMPortsPerCore xbars, each with NumTiles*NumRemotePortCore ports + // Xbar-side: NrTCDMPortsPerCore xbars, each with NumTilesPerGroup*NumRemotePortCore ports // Xbar port index = t*NumRemotePortCore + r - tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_req_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_req_valid, tile_remote_out_req_ready; - tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_rsp_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_out_rsp_valid, tile_remote_out_rsp_ready; + tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_req_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_req_valid, tile_remote_out_req_ready; + 
tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_rsp_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_out_rsp_valid, tile_remote_out_rsp_ready; + + tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_req_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_req_valid, tile_remote_in_req_ready; + tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_rsp_chan; + logic [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] tile_remote_in_rsp_valid, tile_remote_in_rsp_ready; - tcdm_req_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_req_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_req_valid, tile_remote_in_req_ready; - tcdm_rsp_chan_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_rsp_chan; - logic [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] tile_remote_in_rsp_valid, tile_remote_in_rsp_ready; + // Per-group override of package-level remote xbar selection width. + // The package uses NumTiles (total), but the group's xbar is sized per-group. 
+ localparam int unsigned LocalRemoteXbarSelWidth = (NumTilesPerGroup * NumRemotePortCore > 1) ? $clog2(NumTilesPerGroup * NumRemotePortCore) : 1; // clamped to >= 1 so a single-port xbar does not yield a zero-width select type + typedef logic [LocalRemoteXbarSelWidth-1:0] local_remote_xbar_sel_t; // Tile-side selection: narrow type, only carries tile_id - remote_tile_sel_t [NumTiles-1:0][NumRemotePortTile-1:0] remote_out_sel_tile; + remote_tile_sel_t [NumTilesPerGroup-1:0][NumRemotePortTile-1:0] remote_out_sel_tile; // Xbar-side selection: wider type, encodes tile_id*NumRemotePortCore + core_id%NumRemotePortCore - remote_xbar_sel_t [NrTCDMPortsPerCore-1:0][NumTiles*NumRemotePortCore-1:0] remote_out_sel_xbar, remote_in_sel_xbar; + local_remote_xbar_sel_t [NrTCDMPortsPerCore-1:0][NumTilesPerGroup*NumRemotePortCore-1:0] remote_out_sel_xbar, remote_in_sel_xbar; - for (genvar t = 0; t < NumTiles; t++) begin + for (genvar t = 0; t < NumTilesPerGroup; t++) begin for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin for (genvar r = 0; r < NumRemotePortCore; r++) begin // tile flat index: j + r*NrTCDMPortsPerCore @@ -265,113 +683,197 @@ module cachepool_group assign tile_remote_in_rsp_valid[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].p_valid; assign tile_remote_in_req_ready[j][t*NumRemotePortCore+r] = tile_remote_in_rsp[t][j+r*NrTCDMPortsPerCore].q_ready; - // Request selection: route to target tile's remote-in slot based on - // target tile ID, so that all requests to the same destination tile - // travel through one pipeline — preserving write-before-read ordering. - assign remote_out_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'( + assign remote_out_sel_xbar[j][t*NumRemotePortCore+r] = local_remote_xbar_sel_t'( remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] * NumRemotePortCore - + remote_out_sel_tile[t][j+r*NrTCDMPortsPerCore] % NumRemotePortCore); + + tile_remote_out_req_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore); - // Response selection: route back to source tile's remote-out slot. 
- // The originator (tile_id in user field) sent on slot - // (target_tile % NumRemotePortCore). The responding tile is `t` - // (genvar), so target_tile = t. - assign remote_in_sel_xbar[j][t*NumRemotePortCore+r] = remote_xbar_sel_t'( + assign remote_in_sel_xbar[j][t*NumRemotePortCore+r] = local_remote_xbar_sel_t'( tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.tile_id * NumRemotePortCore - + t % NumRemotePortCore); + + tile_remote_in_rsp_chan[j][t*NumRemotePortCore+r].user.core_id % NumRemotePortCore); end end end - for (genvar t = 0; t < NumTiles; t ++) begin : gen_tiles + for (genvar t = 0; t < NumTilesPerGroup; t ++) begin : gen_tiles logic [9:0] hart_base_id; assign hart_base_id = hart_base_id_i + t * NumCoresTile; logic [TileIDWidth-1:0] tile_id; - assign tile_id = t; - - cachepool_tile #( - .AxiAddrWidth ( AxiAddrWidth ), - .AxiDataWidth ( AxiDataWidth ), - .AxiIdWidthIn ( AxiIdWidthIn ), - .AxiIdWidthOut ( WideIdWidthIn ), - .AxiUserWidth ( AxiUserWidth ), - .BootAddr ( BootAddr ), - .UartAddr ( UartAddr ), - .ClusterPeriphSize ( ClusterPeriphSize ), - .NrCores ( NumCoresTile ), - .TCDMDepth ( TCDMDepth ), - .NrBanks ( NrBanks ), - .ICacheLineWidth ( ICacheLineWidth ), - .ICacheLineCount ( ICacheLineCount ), - .ICacheSets ( ICacheSets ), - .FPUImplementation ( FPUImplementation ), - .NumSpatzFPUs ( NumSpatzFPUs ), - .NumSpatzIPUs ( NumSpatzIPUs ), - .SnitchPMACfg ( SnitchPMACfg ), - .NumIntOutstandingLoads ( NumIntOutstandingLoads ), - .NumIntOutstandingMem ( NumIntOutstandingMem ), - .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), - .axi_in_req_t ( axi_in_req_t ), - .axi_in_resp_t ( axi_in_resp_t ), - .axi_narrow_req_t ( axi_narrow_req_t ), - .axi_narrow_resp_t ( axi_narrow_resp_t ), - .axi_out_req_t ( axi_mst_cache_req_t ), - .axi_out_resp_t ( axi_mst_cache_resp_t ), - .Xdma ( Xdma ), - .TileIDWidth ( TileIDWidth ), - .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), - .DMAReqFifoDepth ( DMAReqFifoDepth ), - .RegisterOffloadRsp ( 
RegisterOffloadRsp ), - .RegisterCoreReq ( RegisterCoreReq ), - .RegisterCoreRsp ( RegisterCoreRsp ), - .RegisterTCDMCuts ( RegisterTCDMCuts ), - .RegisterExt ( RegisterExt ), - .XbarLatency ( XbarLatency ), - .MaxMstTrans ( MaxMstTrans ), - .MaxSlvTrans ( MaxSlvTrans ) - ) i_tile ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .impl_i ( impl_i ), - .error_o ( error[t] ), - // TODO: remove hardcode - .debug_req_i ( debug_req_i [t*NumCoresTile+:NumCoresTile] ), - .meip_i ( meip_i [t*NumCoresTile+:NumCoresTile] ), - .mtip_i ( mtip_i [t*NumCoresTile+:NumCoresTile] ), - .msip_i ( msip_i [t*NumCoresTile+:NumCoresTile] ), - .hart_base_id_i ( hart_base_id ), - .cluster_base_addr_i ( cluster_base_addr_i ), - .tile_id_i ( tile_id ), - .private_start_addr_i ( private_start_addr_i ), - // AXI out for UART - .axi_out_req_o ( axi_narrow_req_o [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), - .axi_out_resp_i ( axi_narrow_rsp_i [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), - // Remote Access Ports - .remote_req_o ( tile_remote_out_req[t] ), - .remote_req_dst_o ( remote_out_sel_tile[t] ), - .remote_rsp_i ( tile_remote_out_rsp[t] ), - .remote_rsp_ready_i ( tile_remote_out_ready[t] ), - .remote_req_i ( tile_remote_in_req [t] ), - .remote_rsp_o ( tile_remote_in_rsp [t] ), - .remote_rsp_ready_o ( tile_remote_in_ready[t] ), - // Cache Refill Ports - .cache_refill_req_o ( cache_refill_req_o[t*NumL1CtrlTile+:NumL1CtrlTile] ), - .cache_refill_rsp_i ( cache_refill_rsp_i[t*NumL1CtrlTile+:NumL1CtrlTile] ), - // BootROM / Core-side Cache Bypass - .axi_wide_req_o ( axi_wide_req_o [t*TileWideAxiPorts+:TileWideAxiPorts] ), - .axi_wide_rsp_i ( axi_wide_rsp_i [t*TileWideAxiPorts+:TileWideAxiPorts] ), - // Peripherals - .icache_events_o ( /* unused */ ), - .icache_prefetch_enable_i ( icache_prefetch_enable_i ), - .cl_interrupt_i ( cl_interrupt_i [t*NumCoresTile+:NumCoresTile] ), - .dynamic_offset_i ( dynamic_offset_i ), - .l1d_insn_i ( l1d_insn_i ), - .l1d_private_i ( l1d_private_i ), - .l1d_insn_valid_i 
( l1d_insn_valid_i ), - .l1d_insn_ready_o ( l1d_insn_ready_o [t] ), - .l1d_busy_i ( l1d_busy_i [t] ) - ); + assign tile_id = tile_base_id_i + TileIDWidth'(t); + + if (NumRemoteGroupPortCore == 0) begin : gen_tile + cachepool_tile #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( WideIdWidthIn ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NumCoresTile ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_mst_cache_req_t ), + .axi_out_resp_t ( axi_mst_cache_resp_t ), + .TileIDWidth ( TileIDWidth ), + .NumRemoteGroupPortCore ( NumRemoteGroupPortCore ), + .NumTilesPerGroup ( NumTilesPerGroup ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_tile ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( error [t] ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .tile_id_i ( tile_id ), + .private_start_addr_i 
( private_start_addr_i ), + // AXI out for UART + .axi_out_req_o ( axi_narrow_req_o [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + .axi_out_resp_i ( axi_narrow_rsp_i [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + // Remote Access Ports + .remote_req_o ( tile_remote_out_req [t] ), + .remote_req_dst_o ( remote_out_sel_tile [t] ), + .remote_rsp_i ( tile_remote_out_rsp [t] ), + .remote_rsp_ready_i ( tile_remote_out_ready[t] ), + .remote_req_i ( tile_remote_in_req [t] ), + .remote_rsp_o ( tile_remote_in_rsp [t] ), + .remote_rsp_ready_o ( tile_remote_in_ready [t] ), + // Inter-group Remote Access Ports (unused when NumRemoteGroupPortCore == 0: outputs open, inputs tied to '0) + .remote_group_req_o ( ), + .remote_group_rsp_i ( '0 ), + .remote_group_req_i ( '0 ), + .remote_group_rsp_o ( ), + // Cache Refill Ports (now internal, connected to group-level xbar) + .cache_refill_req_o ( cache_refill_req[t*NumL1CtrlTile+:NumL1CtrlTile] ), + .cache_refill_rsp_i ( cache_refill_rsp[t*NumL1CtrlTile+:NumL1CtrlTile] ), + // BootROM (goes to cluster) / Core-side Cache Bypass (stays in group) + .axi_wide_req_o ( {axi_tile_mem_req[t], axi_tile_bootrom_req[t]} ), + .axi_wide_rsp_i ( {axi_tile_mem_rsp[t], axi_tile_bootrom_rsp[t]} ), + // Peripherals + .icache_events_o ( /* unused */ ), + .icache_prefetch_enable_i ( icache_prefetch_enable_i ), + .cl_interrupt_i ( cl_interrupt_i [t*NumCoresTile+:NumCoresTile] ), + .dynamic_offset_i ( dynamic_offset_i ), + .l1d_insn_i ( l1d_insn_i ), + .l1d_private_i ( l1d_private_i ), + .l1d_insn_valid_i ( l1d_insn_valid_i ), + .l1d_insn_ready_o ( l1d_insn_ready_o [t] ), + .l1d_busy_i ( l1d_busy_i [t] ) + ); + end else begin : gen_tile + cachepool_tile #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( WideIdWidthIn ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NumCoresTile ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks ), + 
.ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_mst_cache_req_t ), + .axi_out_resp_t ( axi_mst_cache_resp_t ), + .TileIDWidth ( TileIDWidth ), + .NumRemoteGroupPortCore ( NumRemoteGroupPortCore ), + .NumTilesPerGroup ( NumTilesPerGroup ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_tile ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( error [t] ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .tile_id_i ( tile_id ), + .private_start_addr_i ( private_start_addr_i ), + // AXI out for UART + .axi_out_req_o ( axi_narrow_req_o [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + .axi_out_resp_i ( axi_narrow_rsp_i [t*TileNarrowAxiPorts+:TileNarrowAxiPorts]), + // Remote Access Ports + .remote_req_o ( tile_remote_out_req [t] ), + .remote_req_dst_o ( remote_out_sel_tile [t] ), + .remote_rsp_i ( tile_remote_out_rsp [t] ), + .remote_rsp_ready_i ( tile_remote_out_ready[t] ), + .remote_req_i ( tile_remote_in_req [t] ), + .remote_rsp_o ( tile_remote_in_rsp [t] ), + .remote_rsp_ready_o ( tile_remote_in_ready [t] ), + // Inter-group 
Remote Access Ports (directly exposed to group I/O) + .remote_group_req_o ( remote_group_req_o [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + .remote_group_rsp_i ( remote_group_rsp_i [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + .remote_group_req_i ( remote_group_req_i [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + .remote_group_rsp_o ( remote_group_rsp_o [t*NumRemoteGroupPortTile+:NumRemoteGroupPortTile]), + // Cache Refill Ports (now internal, connected to group-level xbar) + .cache_refill_req_o ( cache_refill_req[t*NumL1CtrlTile+:NumL1CtrlTile] ), + .cache_refill_rsp_i ( cache_refill_rsp[t*NumL1CtrlTile+:NumL1CtrlTile] ), + // BootROM (goes to cluster) / Core-side Cache Bypass (stays in group) + .axi_wide_req_o ( {axi_tile_mem_req[t], axi_tile_bootrom_req[t]} ), + .axi_wide_rsp_i ( {axi_tile_mem_rsp[t], axi_tile_bootrom_rsp[t]} ), + // Peripherals + .icache_events_o ( /* unused */ ), + .icache_prefetch_enable_i ( icache_prefetch_enable_i ), + .cl_interrupt_i ( cl_interrupt_i [t*NumCoresTile+:NumCoresTile] ), + .dynamic_offset_i ( dynamic_offset_i ), + .l1d_insn_i ( l1d_insn_i ), + .l1d_private_i ( l1d_private_i ), + .l1d_insn_valid_i ( l1d_insn_valid_i ), + .l1d_insn_ready_o ( l1d_insn_ready_o [t] ), + .l1d_busy_i ( l1d_busy_i [t] ) + ); + end end // ------------ @@ -379,11 +881,10 @@ module cachepool_group // ------------ for (genvar p = 0; p < NrTCDMPortsPerCore; p++) begin : gen_remote_tile_xbar - // Decide which tile to go reqrsp_xbar #( - .NumInp (NumTiles * NumRemotePortCore ), - .NumOut (NumTiles * NumRemotePortCore ), + .NumInp (NumTilesPerGroup * NumRemotePortCore ), + .NumOut (NumTilesPerGroup * NumRemotePortCore ), .PipeReg (1'b1 ), .RspReg (1'b1 ), .ExtReqPrio (1'b0 ), diff --git a/hardware/src/cachepool_group_noc_wrapper.sv b/hardware/src/cachepool_group_noc_wrapper.sv new file mode 100644 index 0000000..85ea868 --- /dev/null +++ b/hardware/src/cachepool_group_noc_wrapper.sv @@ -0,0 +1,596 @@ +// Copyright 2025 ETH Zurich 
and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Description: Wrapper around cachepool_group that handles inter-group +// interconnection: master-side concentration xbar, flit packing, floo_router +// instances (req + rsp), and a slave-side dispatch xbar. +// +// Author: Diyou Shen + + +module cachepool_group_noc_wrapper + import cachepool_pkg::*; + import floo_pkg::*; + import spatz_pkg::*; + import fpnew_pkg::fpu_implementation_t; + import snitch_pma_pkg::snitch_pma_t; + import snitch_icache_pkg::icache_l1_events_t; + #( + parameter int unsigned AxiAddrWidth = 48, + parameter int unsigned AxiDataWidth = 512, + parameter int unsigned AxiIdWidthIn = 2, + parameter int unsigned AxiIdWidthOut = 2, + parameter int unsigned AxiUserWidth = 1, + parameter logic [31:0] BootAddr = 32'h0, + parameter logic [31:0] UartAddr = 32'h0, + parameter int unsigned NrCores = 0, + parameter int unsigned TCDMDepth = 1024, + parameter int unsigned ClusterPeriphSize = 64, + parameter int unsigned NrBanks = 2 * NrCores, + parameter int unsigned ICacheLineWidth = 0, + parameter int unsigned ICacheLineCount = 0, + parameter int unsigned ICacheSets = 0, + parameter fpu_implementation_t FPUImplementation = fpu_implementation_t'(0), + parameter int unsigned NumSpatzFPUs = 1, + parameter int unsigned NumSpatzIPUs = 1, + parameter snitch_pma_t SnitchPMACfg = '0, + parameter int unsigned NumIntOutstandingLoads = 1, + parameter int unsigned NumIntOutstandingMem = 4, + parameter int unsigned NumSpatzOutstandingLoads = 4, + parameter bit RegisterOffloadRsp = 1, + parameter bit RegisterCoreReq = 0, + parameter bit RegisterCoreRsp = 0, + parameter bit RegisterTCDMCuts = 1'b0, + parameter bit RegisterExt = 1'b0, + parameter axi_pkg::xbar_latency_e XbarLatency = axi_pkg::CUT_ALL_PORTS, + parameter int unsigned MaxMstTrans = 4, + parameter int unsigned MaxSlvTrans = 4, + parameter type axi_in_req_t = logic, + 
parameter type axi_in_resp_t = logic, + parameter type axi_narrow_req_t = logic, + parameter type axi_narrow_resp_t = logic, + parameter type axi_out_req_t = logic, + parameter type axi_out_resp_t = logic, + parameter type impl_in_t = logic, + parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, + parameter int unsigned NrSramCfg = 1 + ) ( + input logic clk_i, + input logic rst_ni, + input logic debug_req_i, + input logic meip_i, + input logic mtip_i, + input logic msip_i, + input logic [9:0] hart_base_id_i, + input logic [TileIDWidth-1:0] tile_base_id_i, + input axi_addr_t cluster_base_addr_i, + input axi_addr_t private_start_addr_i, + output axi_narrow_req_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_req_o, + input axi_narrow_resp_t [TileNarrowAxiPorts*NumTilesPerGroup-1:0] axi_narrow_rsp_i, + output l2_req_t [ClusterWideOutAxiPorts-1:0] l2_req_o, + input l2_rsp_t [ClusterWideOutAxiPorts-1:0] l2_rsp_i, + output icache_l1_events_t [NrCores-1:0] icache_events_o, + input logic icache_prefetch_enable_i, + input logic [NrCores-1:0] cl_interrupt_i, + input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, + input logic [3:0] l1d_private_i, + input cache_insn_t l1d_insn_i, + input logic l1d_insn_valid_i, + output logic [NumTilesPerGroup-1:0] l1d_insn_ready_o, + input logic [NumTilesPerGroup-1:0] l1d_busy_i, + input impl_in_t [NrSramCfg-1:0] impl_i, + output logic error_o, + // XY coordinates of this group in the inter-group mesh + input group_xy_id_t group_xy_id_i, + // Inter-group req mesh: 4 directions (N=0,E=1,S=2,W=3) + // dim1: direction, dim2: tile*NumNoCPortsPerTile+channel + output noc_group_req_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_o, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_valid_o, + input logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_ready_i, + input noc_group_req_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_i, + input logic 
[3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_valid_i, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_req_ready_o, + // Inter-group rsp mesh + output noc_group_rsp_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_o, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_valid_o, + input logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_ready_i, + input noc_group_rsp_t [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_i, + input logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_valid_i, + output logic [3:0][NumTilesPerGroup*NumNoCPortsPerTile-1:0] noc_rsp_ready_o + ); + + + // ------------------------------------------------------------------------- + // Localparams + // ------------------------------------------------------------------------- + localparam int unsigned NumRemoteGroupPortTile = (NumRemoteGroupPortCore == 0) ? 1 + : NumRemoteGroupPortCore * NrTCDMPortsPerCore; + localparam int unsigned NumRemoteGroupPortGroup = NumRemoteGroupPortTile * NumTilesPerGroup; + localparam int unsigned NumNoCPortsGroup = NumNoCPortsPerTile * NumTilesPerGroup; + localparam int unsigned SlvXbarSelW = (NumRemoteGroupPortGroup > 1) ? $clog2(NumRemoteGroupPortGroup) : 1; + localparam int unsigned MstXbarSelW = (NumNoCPortsGroup > 1) ? $clog2(NumNoCPortsGroup) : 1; + + // -- Struct / xbar field widths (always >= 1 to avoid zero-width ports) ------ + localparam int unsigned NocCacheBankBits = $clog2(NrBanks); + localparam int unsigned NocAddrTileWidth = (NumTilesPerGroup > 1) ? $clog2(NumTilesPerGroup) : 1; + // -- Actual bit counts inside dst_tile_id (can be 0 when that dimension = 1) - + // dst_tile_id layout: [ group_y (NocGroupBitsY) | group_x (NocGroupBitsX) | local_tile (NocGroupOffset) ] + // where NocGroupOffset = $clog2(NumTilesPerGroup) (0 when NumTilesPerGroup == 1). + localparam int unsigned NocGroupOffset = $clog2(NumTilesPerGroup); + localparam int unsigned NocGroupBitsX = (NumGroupsX > 1) ? 
$clog2(NumGroupsX) : 0; + localparam int unsigned NocGroupBitsY = (NumGroupsY > 1) ? $clog2(NumGroupsY) : 0; + + + // ------------------------------------------------------------------------- + // Group ↔ wrapper boundary signals + // ------------------------------------------------------------------------- + remote_group_req_t [NumRemoteGroupPortGroup-1:0] remote_group_req_to_group; + remote_group_rsp_t [NumRemoteGroupPortGroup-1:0] remote_group_rsp_from_group; + remote_group_req_t [NumRemoteGroupPortGroup-1:0] remote_group_req_from_group; + remote_group_rsp_t [NumRemoteGroupPortGroup-1:0] remote_group_rsp_to_group; + + + // ------------------------------------------------------------------------- + // Mesh signals [tile][ch][dir=3:0] and transposition to/from ports + // ------------------------------------------------------------------------- + noc_group_req_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_out; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_out_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_out_ready; + noc_group_req_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_in; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_in_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] req_mesh_in_ready; + + noc_group_rsp_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_out; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_out_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_out_ready; + noc_group_rsp_t [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_in; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_in_valid; + logic [NumTilesPerGroup-1:0][NumNoCPortsPerTile-1:0][3:0] rsp_mesh_in_ready; + + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_mesh_trans_t + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_mesh_trans_n + for (genvar d = 0; 
d < 4; d++) begin : gen_mesh_trans_d + // Mute the channel when not valid for debugging + assign noc_req_o[d][t*NumNoCPortsPerTile+n] = req_mesh_out_valid[t][n][d] ? req_mesh_out[t][n][d] : '0; + assign noc_req_valid_o[d][t*NumNoCPortsPerTile+n] = req_mesh_out_valid[t][n][d]; + assign req_mesh_out_ready[t][n][d] = noc_req_ready_i[d][t*NumNoCPortsPerTile+n]; + assign req_mesh_in[t][n][d] = noc_req_i[d][t*NumNoCPortsPerTile+n]; + assign req_mesh_in_valid[t][n][d] = noc_req_valid_i[d][t*NumNoCPortsPerTile+n]; + assign noc_req_ready_o[d][t*NumNoCPortsPerTile+n] = req_mesh_in_ready[t][n][d]; + + assign noc_rsp_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_out_valid[t][n][d] ? rsp_mesh_out[t][n][d] : '0; + assign noc_rsp_valid_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_out_valid[t][n][d]; + assign rsp_mesh_out_ready[t][n][d] = noc_rsp_ready_i[d][t*NumNoCPortsPerTile+n]; + assign rsp_mesh_in[t][n][d] = noc_rsp_i[d][t*NumNoCPortsPerTile+n]; + assign rsp_mesh_in_valid[t][n][d] = noc_rsp_valid_i[d][t*NumNoCPortsPerTile+n]; + assign noc_rsp_ready_o[d][t*NumNoCPortsPerTile+n] = rsp_mesh_in_ready[t][n][d]; + end + end + end + + + if (NumRemoteGroupPortCore > 0) begin : gen_noc + + // ----------------------------------------------------------------------- + // Router inject/eject signals (flat 1D index noc_port = t*NumNoCPortsPerTile+n) + // ----------------------------------------------------------------------- + noc_group_req_t [NumNoCPortsGroup-1:0] packed_req; + logic [NumNoCPortsGroup-1:0] packed_req_valid; + logic [NumNoCPortsGroup-1:0] packed_req_ready; + + noc_group_req_t [NumNoCPortsGroup-1:0] eject_req; + logic [NumNoCPortsGroup-1:0] eject_req_valid; + logic [NumNoCPortsGroup-1:0] eject_req_ready; + + noc_group_rsp_t [NumNoCPortsGroup-1:0] inject_rsp; + logic [NumNoCPortsGroup-1:0] inject_rsp_valid; + logic [NumNoCPortsGroup-1:0] inject_rsp_ready; + + noc_group_rsp_t [NumNoCPortsGroup-1:0] eject_rsp; + logic [NumNoCPortsGroup-1:0] eject_rsp_valid; + logic [NumNoCPortsGroup-1:0] 
eject_rsp_ready; + + // Master xbar output (one concentrated req/rsp channel per tile/channel) + remote_group_req_chan_t [NumNoCPortsGroup-1:0] mst_xbar_req; + logic [NumNoCPortsGroup-1:0] mst_xbar_req_valid; + logic [NumNoCPortsGroup-1:0] mst_xbar_req_ready; + + // Slave xbar signals + noc_group_req_t [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_req; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_req_valid; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_req_ready; + noc_group_rsp_t [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_rsp; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_rsp_valid; + logic [NumRemoteGroupPortGroup-1:0] slv_xbar_mst_rsp_ready; + noc_group_rsp_t [NumNoCPortsGroup-1:0] slv_xbar_slv_rsp; + logic [NumNoCPortsGroup-1:0] slv_xbar_slv_rsp_valid; + logic [NumNoCPortsGroup-1:0] slv_xbar_slv_rsp_ready; + + logic [NumNoCPortsGroup-1:0][SlvXbarSelW-1:0] slv_xbar_slv_sel; + logic [NumRemoteGroupPortGroup-1:0][MstXbarSelW-1:0] slv_xbar_mst_sel; + + + // ----------------------------------------------------------------------- + // Master-side per-tile concentration xbar + flit packing + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_mst_t + + remote_group_req_chan_t [NumRemoteGroupPortTile-1:0] mst_slv_req; + logic [NumRemoteGroupPortTile-1:0] mst_slv_req_valid; + logic [NumRemoteGroupPortTile-1:0] mst_slv_req_ready; + remote_group_rsp_chan_t [NumRemoteGroupPortTile-1:0] mst_slv_rsp; + logic [NumRemoteGroupPortTile-1:0] mst_slv_rsp_valid; + logic [NumRemoteGroupPortTile-1:0] mst_slv_rsp_ready; + remote_group_rsp_chan_t [NumNoCPortsPerTile-1:0] eject_rsp_payload; + portid_t [NumNoCPortsPerTile-1:0] mst_xbar_mst_sel; + portid_t [NumNoCPortsPerTile-1:0] mst_xbar_slv_selected; + + for (genvar p = 0; p < NumRemoteGroupPortTile; p++) begin : gen_mst_port_p + assign mst_slv_req[p] = remote_group_req_from_group[t*NumRemoteGroupPortTile+p].q; + assign mst_slv_req_valid[p] 
= remote_group_req_from_group[t*NumRemoteGroupPortTile+p].q_valid; + assign remote_group_rsp_to_group[t*NumRemoteGroupPortTile+p].q_ready = mst_slv_req_ready[p]; + assign remote_group_rsp_to_group[t*NumRemoteGroupPortTile+p].p = mst_slv_rsp[p]; + assign remote_group_rsp_to_group[t*NumRemoteGroupPortTile+p].p_valid = mst_slv_rsp_valid[p]; + assign mst_slv_rsp_ready[p] = + remote_group_req_from_group[t*NumRemoteGroupPortTile+p].p_ready; + end + + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_mst_eject_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; + assign eject_rsp_payload[n] = eject_rsp[noc_port].payload; + assign mst_xbar_mst_sel[n] = eject_rsp[noc_port].hdr.src_port_id; + end + + // Static port-to-NoC-channel mapping: each flat port p has xbar index + // j = p % NrTCDMPortsPerCore, and is steered to NoC channel j % NumNoCPortsPerTile. + // Spatz ports (j=0..NrTCDMPortsPerCore-2) divide evenly across channels; + // Snitch (j=NrTCDMPortsPerCore-1) maps by the same modulo. + localparam int unsigned NocMstSelWidth = (NumNoCPortsPerTile > 1) + ? 
$clog2(NumNoCPortsPerTile) : 1; + logic [NumRemoteGroupPortTile-1:0][NocMstSelWidth-1:0] noc_mst_sel; + for (genvar p = 0; p < NumRemoteGroupPortTile; p++) begin : gen_noc_mst_sel + assign noc_mst_sel[p] = NocMstSelWidth'((p % NrTCDMPortsPerCore) % NumNoCPortsPerTile); + end + + reqrsp_xbar #( + .NumInp ( NumRemoteGroupPortTile ), + .NumOut ( NumNoCPortsPerTile ), + .tcdm_req_chan_t ( remote_group_req_chan_t ), + .tcdm_rsp_chan_t ( remote_group_rsp_chan_t ) + ) i_noc_mst_xbar ( + .clk_i, + .rst_ni, + .slv_req_i ( mst_slv_req ), + .slv_rr_i ( '0 ), + .slv_req_valid_i ( mst_slv_req_valid ), + .slv_req_ready_o ( mst_slv_req_ready ), + .slv_rsp_o ( mst_slv_rsp ), + .slv_rsp_valid_o ( mst_slv_rsp_valid ), + .slv_rsp_ready_i ( mst_slv_rsp_ready ), + .slv_sel_i ( noc_mst_sel ), + .slv_selected_o ( mst_xbar_slv_selected ), + .mst_req_o ( mst_xbar_req[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_req_valid_o ( mst_xbar_req_valid[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_req_ready_i ( mst_xbar_req_ready[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_rsp_i ( eject_rsp_payload ), + .mst_rr_i ( '0 ), + .mst_rsp_valid_i ( eject_rsp_valid[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_rsp_ready_o ( eject_rsp_ready[t*NumNoCPortsPerTile +: + NumNoCPortsPerTile] ), + .mst_sel_i ( mst_xbar_mst_sel ) + ); + + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_pack_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; + assign packed_req[noc_port].hdr.collective_op = '0; + assign packed_req[noc_port].hdr.src_id = group_xy_id_i; + // dst_tile_id set by tcdm_cache_interco: bits [NocGroupOffset +: NocGroupBitsX] = group_x, + // bits [(NocGroupOffset+NocGroupBitsX) +: NocGroupBitsY] = group_y. + // When a dimension has only 1 group, no bits are consumed and the coordinate is 0. 
+ if (NumGroupsX > 1) begin : gen_dst_x + assign packed_req[noc_port].hdr.dst_id.x = + mst_xbar_req[noc_port].user.dst_tile_id[NocGroupOffset +: NocGroupBitsX]; + end else begin : gen_dst_x + assign packed_req[noc_port].hdr.dst_id.x = '0; + end + if (NumGroupsY > 1) begin : gen_dst_y + assign packed_req[noc_port].hdr.dst_id.y = + mst_xbar_req[noc_port].user.dst_tile_id[(NocGroupOffset + NocGroupBitsX) +: NocGroupBitsY]; + end else begin : gen_dst_y + assign packed_req[noc_port].hdr.dst_id.y = '0; + end + assign packed_req[noc_port].hdr.dst_id.port_id = '0; + assign packed_req[noc_port].hdr.src_tile_id = group_tile_sel_t'(t); + assign packed_req[noc_port].hdr.src_port_id = mst_xbar_slv_selected[n]; + assign packed_req[noc_port].hdr.last = 1'b1; + assign packed_req[noc_port].payload = mst_xbar_req[noc_port]; + assign packed_req_valid[noc_port] = mst_xbar_req_valid[noc_port]; + assign mst_xbar_req_ready[noc_port] = packed_req_ready[noc_port]; + + end + + end : gen_mst_t + + + // ----------------------------------------------------------------------- + // Per-tile per-channel req floo_router + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_req_router_t + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_req_router_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; + floo_router #( + .NumRoutes ( 5 ), + .NumVirtChannels ( 1 ), + .NumPhysChannels ( 1 ), + .InFifoDepth ( 2 ), + .OutFifoDepth ( 0 ), + .RouteAlgo ( XYRouting ), + .IdWidth ( $bits(group_xy_id_t) ), + .id_t ( group_xy_id_t ), + .NumAddrRules ( 1 ), + .addr_rule_t ( logic ), + .flit_t ( noc_group_req_t ), + .hdr_t ( noc_group_hdr_t ) + ) i_req_router ( + .clk_i, + .rst_ni, + .test_enable_i ( 1'b0 ), + .xy_id_i ( group_xy_id_i ), + .id_route_map_i ( '0 ), + .valid_i ( {packed_req_valid[noc_port], + req_mesh_in_valid[t][n][3:0]} ), + .ready_o ( {packed_req_ready[noc_port], + 
req_mesh_in_ready[t][n][3:0]} ), + .data_i ( {packed_req[noc_port], + req_mesh_in[t][n][3:0]} ), + .credit_o ( ), + .valid_o ( {eject_req_valid[noc_port], + req_mesh_out_valid[t][n][3:0]} ), + .ready_i ( {eject_req_ready[noc_port], + req_mesh_out_ready[t][n][3:0]} ), + .data_o ( {eject_req[noc_port], + req_mesh_out[t][n][3:0]} ), + .credit_i ( '1 ), + .offload_req_o ( ), + .offload_rsp_i ( '0 ) + ); + end + end + + + // ----------------------------------------------------------------------- + // Per-tile per-channel rsp floo_router + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_rsp_router_t + for (genvar n = 0; n < NumNoCPortsPerTile; n++) begin : gen_rsp_router_n + localparam int unsigned noc_port = t * NumNoCPortsPerTile + n; + floo_router #( + .NumRoutes ( 5 ), + .NumVirtChannels ( 1 ), + .NumPhysChannels ( 1 ), + .InFifoDepth ( 2 ), + .OutFifoDepth ( 0 ), + .RouteAlgo ( XYRouting ), + .IdWidth ( $bits(group_xy_id_t) ), + .id_t ( group_xy_id_t ), + .NumAddrRules ( 1 ), + .addr_rule_t ( logic ), + .flit_t ( noc_group_rsp_t ), + .hdr_t ( noc_group_hdr_t ) + ) i_rsp_router ( + .clk_i, + .rst_ni, + .test_enable_i ( 1'b0 ), + .xy_id_i ( group_xy_id_i ), + .id_route_map_i ( '0 ), + .valid_i ( {inject_rsp_valid[noc_port], + rsp_mesh_in_valid[t][n][3:0]} ), + .ready_o ( {inject_rsp_ready[noc_port], + rsp_mesh_in_ready[t][n][3:0]} ), + .data_i ( {inject_rsp[noc_port], + rsp_mesh_in[t][n][3:0]} ), + .credit_o ( ), + .valid_o ( {eject_rsp_valid[noc_port], + rsp_mesh_out_valid[t][n][3:0]} ), + .ready_i ( {eject_rsp_ready[noc_port], + rsp_mesh_out_ready[t][n][3:0]} ), + .data_o ( {eject_rsp[noc_port], + rsp_mesh_out[t][n][3:0]} ), + .credit_i ( '1 ), + .offload_req_o ( ), + .offload_rsp_i ( '0 ) + ); + end + end + + + // ----------------------------------------------------------------------- + // Slave xbar selection signals + inject_rsp ↔ slv_xbar_slv_rsp + // 
----------------------------------------------------------------------- + for (genvar noc_port = 0; noc_port < NumNoCPortsGroup; noc_port++) begin : gen_slv_sel + assign slv_xbar_slv_sel[noc_port] = (NumTilesPerGroup == 1) + ? SlvXbarSelW'(eject_req[noc_port].hdr.src_port_id) + : SlvXbarSelW'(eject_req[noc_port].payload.addr[(dynamic_offset_i + NocCacheBankBits) +: NocAddrTileWidth] + * NumRemoteGroupPortTile + + eject_req[noc_port].hdr.src_port_id); + + end + + assign inject_rsp = slv_xbar_slv_rsp; + assign inject_rsp_valid = slv_xbar_slv_rsp_valid; + assign slv_xbar_slv_rsp_ready = inject_rsp_ready; + + + // ----------------------------------------------------------------------- + // Slave-side group-wide dispatch xbar + // ----------------------------------------------------------------------- + reqrsp_xbar #( + .NumInp ( NumNoCPortsGroup ), + .NumOut ( NumRemoteGroupPortGroup), + .tcdm_req_chan_t ( noc_group_req_t ), + .tcdm_rsp_chan_t ( noc_group_rsp_t ) + ) i_noc_slv_xbar ( + .clk_i, + .rst_ni, + .slv_req_i ( eject_req ), + .slv_rr_i ( '0 ), + .slv_req_valid_i ( eject_req_valid ), + .slv_req_ready_o ( eject_req_ready ), + .slv_rsp_o ( slv_xbar_slv_rsp ), + .slv_rsp_valid_o ( slv_xbar_slv_rsp_valid ), + .slv_rsp_ready_i ( slv_xbar_slv_rsp_ready ), + .slv_sel_i ( slv_xbar_slv_sel ), + .slv_selected_o ( ), + .mst_req_o ( slv_xbar_mst_req ), + .mst_req_valid_o ( slv_xbar_mst_req_valid ), + .mst_req_ready_i ( slv_xbar_mst_req_ready ), + .mst_rsp_i ( slv_xbar_mst_rsp ), + .mst_rr_i ( '0 ), + .mst_rsp_valid_i ( slv_xbar_mst_rsp_valid ), + .mst_rsp_ready_o ( slv_xbar_mst_rsp_ready ), + .mst_sel_i ( slv_xbar_mst_sel ) + ); + + + // ----------------------------------------------------------------------- + // Slave delivery: unpack xbar output → group slave ports + rsp packing + // ----------------------------------------------------------------------- + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_slv_deliver_t + for (genvar p = 0; p < 
NumRemoteGroupPortTile; p++) begin : gen_slv_deliver_p + localparam int unsigned port = t * NumRemoteGroupPortTile + p; + + // Placeholder response routing: route response back via the NoC channel + // of the same tile (t). Correct cross-tile response routing is deferred. + assign slv_xbar_mst_sel[port] = MstXbarSelW'(t * NumNoCPortsPerTile); + + always_comb begin : proc_req_unpack + remote_group_req_to_group[port].q = slv_xbar_mst_req[port].payload; + remote_group_req_to_group[port].q.user.src_group_x = + slv_xbar_mst_req[port].hdr.src_id.x; + remote_group_req_to_group[port].q.user.src_group_y = + slv_xbar_mst_req[port].hdr.src_id.y; + end + + assign remote_group_req_to_group[port].q_valid = slv_xbar_mst_req_valid[port]; + assign slv_xbar_mst_req_ready[port] = + remote_group_rsp_from_group[port].q_ready; + assign remote_group_req_to_group[port].p_ready = slv_xbar_mst_rsp_ready[port]; + + + assign slv_xbar_mst_rsp[port].payload = + remote_group_rsp_from_group[port].p; + assign slv_xbar_mst_rsp[port].hdr.collective_op = '0; + assign slv_xbar_mst_rsp[port].hdr.src_id = group_xy_id_i; + if (NumGroupsX > 1) begin : gen_rsp_dst_x + assign slv_xbar_mst_rsp[port].hdr.dst_id.x = + remote_group_rsp_from_group[port].p.user.tile_id[NocGroupOffset +: NocGroupBitsX]; + end else begin : gen_rsp_dst_x + assign slv_xbar_mst_rsp[port].hdr.dst_id.x = '0; + end + if (NumGroupsY > 1) begin : gen_rsp_dst_y + assign slv_xbar_mst_rsp[port].hdr.dst_id.y = + remote_group_rsp_from_group[port].p.user.tile_id[(NocGroupOffset + NocGroupBitsX) +: NocGroupBitsY]; + end else begin : gen_rsp_dst_y + assign slv_xbar_mst_rsp[port].hdr.dst_id.y = '0; + end + assign slv_xbar_mst_rsp[port].hdr.dst_id.port_id = '0; + assign slv_xbar_mst_rsp[port].hdr.src_tile_id = group_tile_sel_t'(t); + assign slv_xbar_mst_rsp[port].hdr.src_port_id = remote_group_rsp_from_group[port].p.user.port_id; + assign slv_xbar_mst_rsp[port].hdr.last = 1'b1; + assign slv_xbar_mst_rsp_valid[port] = + 
remote_group_rsp_from_group[port].p_valid; + end + end + + + end else begin : gen_noc_disabled + + assign remote_group_req_to_group = '0; + assign remote_group_rsp_to_group = '0; + assign req_mesh_out = '0; + assign req_mesh_out_valid = '0; + assign req_mesh_in_ready = '0; + assign rsp_mesh_out = '0; + assign rsp_mesh_out_valid = '0; + assign rsp_mesh_in_ready = '0; + + end + + + // ------------------------------------------------------------------------- + // Group instantiation + // ------------------------------------------------------------------------- + cachepool_group #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( AxiIdWidthOut ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .UartAddr ( UartAddr ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NrCores ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_narrow_req_t ( axi_narrow_req_t ), + .axi_narrow_resp_t ( axi_narrow_resp_t ), + .axi_out_req_t ( axi_out_req_t ), + .axi_out_resp_t ( axi_out_resp_t ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_group ( + .clk_i, + .rst_ni, + .impl_i ( impl_i ), + .error_o ( error_o ), + .debug_req_i ( debug_req_i ), + .meip_i ( 
meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id_i ), + .tile_base_id_i ( tile_base_id_i ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .private_start_addr_i ( private_start_addr_i ), + .axi_narrow_req_o ( axi_narrow_req_o ), + .axi_narrow_rsp_i ( axi_narrow_rsp_i ), + .l2_req_o ( l2_req_o ), + .l2_rsp_i ( l2_rsp_i ), + .remote_group_req_o ( remote_group_req_from_group ), + .remote_group_rsp_i ( remote_group_rsp_to_group ), + .remote_group_req_i ( remote_group_req_to_group ), + .remote_group_rsp_o ( remote_group_rsp_from_group ), + .icache_events_o ( icache_events_o ), + .icache_prefetch_enable_i ( icache_prefetch_enable_i ), + .cl_interrupt_i ( cl_interrupt_i ), + .dynamic_offset_i ( dynamic_offset_i ), + .l1d_private_i ( l1d_private_i ), + .l1d_insn_i ( l1d_insn_i ), + .l1d_insn_valid_i ( l1d_insn_valid_i ), + .l1d_insn_ready_o ( l1d_insn_ready_o ), + .l1d_busy_i ( l1d_busy_i ) + ); + +endmodule diff --git a/hardware/src/cachepool_pkg.sv b/hardware/src/cachepool_pkg.sv index 737bc70..7029bed 100644 --- a/hardware/src/cachepool_pkg.sv +++ b/hardware/src/cachepool_pkg.sv @@ -52,19 +52,39 @@ package cachepool_pkg; // TILE CONFIG // /////////////////// // How many cores for each tile? - localparam int unsigned NumCoresTile = NumCores / NumTiles; + localparam int unsigned NumCoresTile = NumCores / NumTiles; // How many remote ports for each tile per core's port? - localparam int unsigned NumRemotePortCore = `ifdef REMOTE_PORT_PER_CORE `REMOTE_PORT_PER_CORE `else 0 `endif; + localparam int unsigned NumRemotePortCore = `ifdef REMOTE_PORT_PER_CORE `REMOTE_PORT_PER_CORE `else 0 `endif; // How many cores within a tile? This is used to select the ports within a tile. 
- localparam int unsigned LogNumCoresTile = $clog2(NumCoresTile); + localparam int unsigned LogNumCoresTile = $clog2(NumCoresTile); // 4 ports from Spatz + 1 shared port from Snitch/FPU - localparam int unsigned NrTCDMPortsPerCore = 5; + localparam int unsigned NrTCDMPortsPerCore = 5; // How many remote ports for each tile in total? - localparam int unsigned NumRemotePortTile = NumRemotePortCore * NrTCDMPortsPerCore; + localparam int unsigned NumRemotePortTile = NumRemotePortCore * NrTCDMPortsPerCore; + + //////////////////// + // GROUP CONFIG // + //////////////////// + // How many tiles for each group? + localparam int unsigned NumTilesPerGroup = NumTiles / NumGroups; + + // How many cores for each group? + localparam int unsigned NumCoreGroup = NumCores / NumGroups; + + // How many remote group ports for each tile per core's port? + localparam int unsigned NumRemoteGroupPortCore = `ifdef RG_PORT_PER_CORE `RG_PORT_PER_CORE `else 0 `endif; + + // Number of inter-group NoC router channels per tile (x in the 5-to-x concentration xbar). + localparam int unsigned NumNoCPortsPerTile = `ifdef NOC_PORT_PER_TILE `NOC_PORT_PER_TILE `else 1 `endif; + + // Group mesh dimensions. NumGroupsX comes from NUM_GROUPS_X (defaults to 1 when undefined); NumGroupsY is derived from it. + localparam int unsigned NumGroupsX = `ifdef NUM_GROUPS_X `NUM_GROUPS_X `else 1 `endif; + localparam int unsigned NumGroupsY = NumGroups / NumGroupsX; + //////////////////// // CLUSTER HW // @@ -76,6 +96,12 @@ package cachepool_pkg; localparam int unsigned ICacheLineCount = 128; localparam int unsigned ICacheSets = 4; + // Group-level L2 ICache (shared read-only cache, primarily for coalescing) + localparam int unsigned L2ICacheLineWidth = 128; + localparam int unsigned L2ICacheSets = 4; + localparam int unsigned L2ICacheSizeByte = 65536; + localparam int unsigned L2ICacheLineCount = L2ICacheSizeByte / (L2ICacheSets * L2ICacheLineWidth / 8); + // Be careful on unsigned long int passed in from configuration. // Currently use fixed values.
localparam int unsigned TCDMStartAddr = 32'hBFFF_F800; @@ -172,26 +198,63 @@ package cachepool_pkg; localparam int unsigned ClusterRouteIdWidth = $clog2(NumClusterMst); /***** ID Width Topology (Tile -> Group -> Cluster) *****/ + // TileAxiIdWidth: base iCache/DMA AXI ID bits per tile before tile-index bits are added. + // Determines how many outstanding refills the iCache can track (2^TileAxiIdWidth = 8). + // This is the "tile_local_bits" field described above. localparam int unsigned TileAxiIdWidth = 3; localparam int unsigned GroupAxiIdWidth = TileAxiIdWidth + $clog2(NumTiles); localparam int unsigned ClusterAxiIdWidth = GroupAxiIdWidth + ClusterRouteIdWidth; - - // legacy naming + // Alias used by the Spatz-generated wrapper and testbench templates. localparam int unsigned SpatzAxiIdInWidth = ClusterAxiIdWidth; - // localparam int unsigned SpatzAxiIdInWidth = TileAxiIdWidth; - localparam int unsigned SpatzAxiIdOutWidth = ClusterAxiIdWidth + 1; + + // Per-group AXI output ID width (pre multi-group mux). + // The +1 comes from reqrsp_to_axi, which tags each burst with one extra bit. + localparam int unsigned GroupAxiIdOutWidth = ClusterAxiIdWidth + 1; + // Bounded per-group refill ID width: uses NumTilesPerGroup (not NumTiles) so the + // ID space stays fixed regardless of total system size. axi_id_remap at each group + // output reduces GroupAxiIdOutWidth to this before the inter-group mux / future NoC. + // For NumGroups == 1, NumTilesPerGroup == NumTiles so this equals GroupAxiIdOutWidth. + localparam int unsigned WideRefillIdWidth = TileAxiIdWidth + $clog2(NumTilesPerGroup) + ClusterRouteIdWidth + 1; + // Cluster-level AXI output ID width: widened by multi-group mux. + // When NumGroups == 1, $clog2(1) == 0 so this equals WideRefillIdWidth == GroupAxiIdOutWidth. + localparam int unsigned GroupMuxIdBits = (NumGroups > 1) ? 
$clog2(NumGroups) : 0; + localparam int unsigned SpatzAxiIdOutWidth = WideRefillIdWidth + GroupMuxIdBits; // Fixed AXI ID width for IWC localparam int unsigned IwcAxiIdOutWidth = SpatzAxiIdOutWidth + 1; - localparam int unsigned CsrAxiMstIdWidth = ClusterAxiIdWidth; - localparam int unsigned CsrAxiSlvIdWidth = ClusterAxiIdWidth + $clog2(NumTiles+1); + // Cluster wrapper external output AXI ID width, after the wrapper-level axi_id_remap. + // Reduces the fat SpatzAxiIdOutWidth presented to the DRAM controller. + // Must satisfy: WrapperAxiIdOutWidth >= $clog2(NumAxiMaxTrans) = $clog2(32) = 5. + localparam int unsigned WrapperAxiIdOutWidth = 6; + // External SoC/testbench input AXI ID width (host → cluster direction). + // axi_id_remap in the wrapper expands these to SpatzAxiIdInWidth internally. + localparam int unsigned WrapperAxiIdInWidth = 4; + // External narrow output AXI ID width for the UART port (cluster → SoC direction). + // axi_id_remap in the wrapper compresses SpatzAxiUartIdWidth to this. + localparam int unsigned WrapperAxiNarrowIdOutWidth = 4; - // Base ID width 6, plus tile mux => adding clog(tile) - localparam int unsigned SpatzAxiNarrowIdWidth = 6 + $clog2(NumTiles); - // UART ID width, with an extra xbar + localparam int unsigned CsrAxiMstIdWidth = ClusterAxiIdWidth; + // ID width after per-master serialization before the CSR mux. + // axi_id_serialize at each CSR master reduces CsrAxiMstIdWidth to this, + // keeping the mux output (CsrAxiSlvIdWidth) bounded regardless of NumTiles. + // Must be > 1: axi_id_serialize internally uses axi_id_prepend which requires + // AxiMstPortIdWidth > MuxIdWidth (= 1 when AxiMstPortMaxUniqIds = 1). + localparam int unsigned CsrSerIdWidth = 2; + localparam int unsigned CsrAxiSlvIdWidth = CsrSerIdWidth + $clog2(NumTiles+1); + + // Narrow AXI ID width = ClusterAxiIdWidth (same field structure, used on the narrow path). 
+ localparam int unsigned SpatzAxiNarrowIdWidth = ClusterAxiIdWidth; + // UART ID width: narrow path muxed across all tiles adds $clog2(NumTiles) bits. localparam int unsigned SpatzAxiUartIdWidth = SpatzAxiNarrowIdWidth + $clog2(NumTiles); + // BootROM AXI ID width: wide data bus, muxed from NumTilesPerGroup tile ports per group. + // The group's axi_mst_cache slave ID width = GroupAxiIdWidth + 1 + // (cluster passes WideIdWidthIn = SpatzAxiIdOutWidth - ClusterRouteIdWidth - GroupMuxIdBits + // = GroupAxiIdWidth + 1). + // The per-group BootROM mux master adds $clog2(NumTilesPerGroup) bits on top. + localparam int unsigned BootRomAxiSlvIdWidth = GroupAxiIdWidth + 1 + $clog2(NumTilesPerGroup); + /***** Tile Ports *****/ // We have three sets of AXI ports for each tile: // 1) Wide output bus for BootRom & L2 (from ICache) @@ -228,8 +291,6 @@ package cachepool_pkg; // Wide AXI ports: X to DRAM (X=4 for now) localparam int unsigned ClusterWideOutAxiPorts = NumL2Channel; - // TODO: multi-tile support - // One more from the Snitch core ////////////////// // L2 / DRAM // @@ -274,6 +335,7 @@ package cachepool_pkg; typedef logic [SpatzAxiIdInWidth-1:0] axi_id_in_t; typedef logic [SpatzAxiIdOutWidth-1:0] axi_id_out_t; + typedef logic [GroupAxiIdOutWidth-1:0] axi_id_group_out_t; typedef logic [SpatzAxiNarrowIdWidth-1:0] axi_narrow_id_t; // legacy name; TODO: remove @@ -282,9 +344,15 @@ package cachepool_pkg; typedef logic [SpatzAxiUartIdWidth-1:0] axi_uart_id_t; typedef logic [CsrAxiMstIdWidth-1:0] axi_id_csr_mst_t; + typedef logic [CsrSerIdWidth-1:0] axi_id_csr_ser_t; typedef logic [CsrAxiSlvIdWidth-1:0] axi_id_csr_slv_t; - typedef logic [IwcAxiIdOutWidth-1:0] axi_id_out_iwc_t; + typedef logic [IwcAxiIdOutWidth-1:0] axi_id_out_iwc_t; + typedef logic [WrapperAxiIdOutWidth-1:0] axi_id_wrapper_out_t; + typedef logic [WrapperAxiIdInWidth-1:0] axi_id_wrapper_in_t; + typedef logic [WrapperAxiNarrowIdOutWidth-1:0] axi_id_wrapper_narrow_out_t; + + typedef logic 
[BootRomAxiSlvIdWidth-1:0] axi_bootrom_slv_id_t; ////////////////// // TILE TYPES // @@ -362,7 +430,64 @@ package cachepool_pkg; // GROUP TYPES // /////////////////// - typedef logic [RemoteXbarSelWidth-1:0] remote_xbar_sel_t; + typedef logic [RemoteXbarSelWidth-1:0] remote_xbar_sel_t; + typedef logic [$clog2(NrTCDMPortsPerCore)-1:0] portid_t; + + typedef struct packed { + logic [CoreIDWidth-1:0] core_id; + logic [TileIDWidth-1:0] tile_id; + reqid_t req_id; + logic is_fpu; + portid_t port_id; + logic [idx_width(NumGroupsX)-1:0] src_group_x; + logic [idx_width(NumGroupsY)-1:0] src_group_y; + // Globally-unique destination tile ID, set by tcdm_cache_interco for + // inter-group requests. Upper bits (above $clog2(NumTilesPerGroup)) are + // the linear group index; lower bits are the local tile within the group. + logic [TileIDWidth-1:0] dst_tile_id; + } remote_group_user_t; + + `REQRSP_TYPEDEF_ALL(remote_group, narrow_addr_t, narrow_data_t, narrow_strb_t, remote_group_user_t) + + // XY mesh coordinates for a group. port_id selects the eject port (always 0 for single-link). + typedef struct packed { + logic [idx_width(NumGroupsX)-1:0] x; + logic [idx_width(NumGroupsY)-1:0] y; + logic port_id; + } group_xy_id_t; + + // Per-group tile index used by dispatch xbar selection. + typedef logic [idx_width(NumTilesPerGroup)-1:0] group_tile_sel_t; + + // Routing header embedded in every inter-group NoC flit. + typedef struct packed { + logic [3:0] collective_op; + group_xy_id_t src_id; + group_xy_id_t dst_id; + group_tile_sel_t src_tile_id; + portid_t src_port_id; + logic last; + } noc_group_hdr_t; + + // Inter-group NoC flit types (payload + routing header). 
+ typedef struct packed { + remote_group_req_chan_t payload; + noc_group_hdr_t hdr; + } noc_group_req_t; + + typedef struct packed { + remote_group_rsp_chan_t payload; + noc_group_hdr_t hdr; + } noc_group_rsp_t; + + // Group ICache (L2 read-only cache control) + localparam int unsigned ROCacheNumAddrRules = 1; + typedef struct packed { + logic enable; + logic flush_valid; + axi_addr_t [ROCacheNumAddrRules-1:0] start_addr; + axi_addr_t [ROCacheNumAddrRules-1:0] end_addr; + } ro_cache_ctrl_t; ///////////////////// @@ -421,12 +546,24 @@ package cachepool_pkg; // AXI typedef bundles `AXI_TYPEDEF_ALL(spatz_axi_narrow, axi_addr_t, axi_narrow_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) `AXI_TYPEDEF_ALL(spatz_axi_in, axi_addr_t, axi_id_in_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(spatz_axi_out, axi_addr_t, axi_id_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(spatz_axi_iwc_out, axi_addr_t, axi_id_out_iwc_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) - - `AXI_TYPEDEF_ALL(axi_uart, axi_addr_t, axi_uart_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(axi_csr_mst, axi_addr_t, axi_id_csr_mst_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) - `AXI_TYPEDEF_ALL(axi_csr_slv, axi_addr_t, axi_id_csr_slv_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(spatz_axi_out, axi_addr_t, axi_id_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + // Per-group AXI output: narrower ID (pre multi-group mux). + `AXI_TYPEDEF_ALL(spatz_axi_group_out, axi_addr_t, axi_id_group_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(spatz_axi_iwc_out, axi_addr_t, axi_id_out_iwc_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + // Wrapper-level external output type: ID narrowed from SpatzAxiIdOutWidth to WrapperAxiIdOutWidth. 
+ `AXI_TYPEDEF_ALL(spatz_axi_wrapper_out, axi_addr_t, axi_id_wrapper_out_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) + // Wrapper-level external input type: narrow ID from SoC (WrapperAxiIdInWidth → SpatzAxiIdInWidth inside). + `AXI_TYPEDEF_ALL(spatz_axi_wrapper_in, axi_addr_t, axi_id_wrapper_in_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + // Wrapper-level external narrow output type: ID compressed from SpatzAxiUartIdWidth to WrapperAxiNarrowIdOutWidth. + `AXI_TYPEDEF_ALL(spatz_axi_wrapper_narrow_out, axi_addr_t, axi_id_wrapper_narrow_out_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + + `AXI_TYPEDEF_ALL(axi_uart, axi_addr_t, axi_uart_id_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(axi_csr_mst, axi_addr_t, axi_id_csr_mst_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + // Serialized CSR type: CsrSerIdWidth-bit ID output of axi_id_serialize, fed into the CSR mux slave ports. + `AXI_TYPEDEF_ALL(axi_csr_ser, axi_addr_t, axi_id_csr_ser_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + `AXI_TYPEDEF_ALL(axi_csr_slv, axi_addr_t, axi_id_csr_slv_t, axi_narrow_data_t, axi_narrow_strb_t, axi_user_t) + // BootROM: wide data bus (same payload as cache), slv = post-mux (widened ID) + `AXI_TYPEDEF_ALL(axi_bootrom_slv, axi_addr_t, axi_bootrom_slv_id_t, axi_wide_data_t, axi_wide_strb_t, axi_user_t) /************************************************************** * FUNCTIONS diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv index e01c0ac..0fa53fb 100644 --- a/hardware/src/cachepool_tile.sv +++ b/hardware/src/cachepool_tile.sv @@ -4,19 +4,11 @@ // Author: Diyou Shen -`include "axi/assign.svh" `include "axi/typedef.svh" `include "common_cells/assertions.svh" `include "common_cells/registers.svh" -`include "mem_interface/assign.svh" -`include "mem_interface/typedef.svh" -`include "register_interface//assign.svh" -`include "register_interface/typedef.svh" -`include "reqrsp_interface/assign.svh" `include
"reqrsp_interface/typedef.svh" `include "snitch_vm/typedef.svh" -`include "tcdm_interface/assign.svh" -`include "tcdm_interface/typedef.svh" /// Tile implementation for CachePool module cachepool_tile @@ -24,7 +16,7 @@ module cachepool_tile import spatz_pkg::*; import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; - import snitch_icache_pkg::icache_events_t; + import snitch_icache_pkg::icache_l1_events_t; #( /// Width of physical address. parameter int unsigned AxiAddrWidth = 48, @@ -48,10 +40,6 @@ module cachepool_tile parameter int unsigned ClusterPeriphSize = 64, /// Number of TCDM Banks. parameter int unsigned NrBanks = 2 * NrCores, - /// Size of DMA AXI buffer. - parameter int unsigned DMAAxiReqFifoDepth = 3, - /// Size of DMA request fifo. - parameter int unsigned DMAReqFifoDepth = 3, /// Width of a single icache line. parameter unsigned ICacheLineWidth = 0, /// Number of icache lines per set. @@ -66,10 +54,14 @@ module cachepool_tile /// Spatz FPU/IPU Configuration parameter int unsigned NumSpatzFPUs = 4, parameter int unsigned NumSpatzIPUs = 1, - /// Per-core enabling of the custom `Xdma` ISA extensions. - parameter bit [NrCores-1:0] Xdma = '{default: '0}, /// Tile ID Width parameter int unsigned TileIDWidth = 0, + /// Number of dedicated inter-group remote ports per xbar plane. + /// When 0, no inter-group ports are generated (single-group mode). + parameter int unsigned NumRemoteGroupPortCore = 0, + /// Number of tiles within a single group (passed to interco for + /// group-id extraction from the address). 
+ parameter int unsigned NumTilesPerGroup = 0, /// # Per-core parameters /// Per-core integer outstanding loads parameter int unsigned NumIntOutstandingLoads = '0, @@ -110,67 +102,76 @@ module cachepool_tile parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts, /// # SRAM Configuration rules needed: L1D Tag + L1D Data + L1D FIFO + L1I Tag + L1I Data /*** ATTENTION: `NrSramCfg` should be changed if `L1NumDataBank` and `L1NumTagBank` is changed ***/ - parameter int unsigned NrSramCfg = 1 + parameter int unsigned NrSramCfg = 1, + localparam int unsigned TotRGPorts = (NumRemoteGroupPortCore == 0) ? 0 : NumRemoteGroupPortCore*NrTCDMPortsPerCore-1 ) ( /// System clock. - input logic clk_i, + input logic clk_i, /// Asynchronous active high reset. This signal is assumed to be _async_. - input logic rst_ni, + input logic rst_ni, /// Per-core debug request signal. Asserting this signals puts the /// corresponding core into debug mode. This signal is assumed to be _async_. - input logic [NrCores-1:0] debug_req_i, - /// End of Computing indicator to notify the host/tb - // output logic eoc_o, + input logic debug_req_i, /// Machine external interrupt pending. Usually those interrupts come from a /// platform-level interrupt controller. This signal is assumed to be _async_. - input logic [NrCores-1:0] meip_i, + input logic meip_i, /// Machine timer interrupt pending. Usually those interrupts come from a /// core-local interrupt controller such as a timer/RTC. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] mtip_i, + input logic mtip_i, /// Core software interrupt pending. Usually those interrupts come from /// another core to facilitate inter-processor-interrupts. This signal is /// assumed to be _async_. - input logic [NrCores-1:0] msip_i, + input logic msip_i, /// First hartid of the cluster. Cores of a cluster are monotonically /// increasing without a gap, i.e., a cluster with 8 cores and a /// `hart_base_id_i` of 5 get the hartids 5 - 12. 
- input logic [9:0] hart_base_id_i, + input logic [9:0] hart_base_id_i, /// Base address of cluster. TCDM and cluster peripheral location are derived from /// it. This signal is pseudo-static. - input axi_addr_t cluster_base_addr_i, + input axi_addr_t cluster_base_addr_i, /// Tile ID, internal ID, the base is always 0, in theory should not change during use - input remote_tile_sel_t tile_id_i, + input remote_tile_sel_t tile_id_i, /// Partitioning address - input axi_addr_t private_start_addr_i, + input axi_addr_t private_start_addr_i, /// AXI Narrow out-port (UART/Peripheral) - output axi_narrow_req_t [1:0] axi_out_req_o, - input axi_narrow_resp_t [1:0] axi_out_resp_i, + output axi_narrow_req_t [1:0] axi_out_req_o, + input axi_narrow_resp_t [1:0] axi_out_resp_i, /// Cache Refill ports - output cache_trans_req_t [NumL1CtrlTile-1:0] cache_refill_req_o, - input cache_trans_rsp_t [NumL1CtrlTile-1:0] cache_refill_rsp_i, + output cache_trans_req_t [NumL1CtrlTile-1:0] cache_refill_req_o, + input cache_trans_rsp_t [NumL1CtrlTile-1:0] cache_refill_rsp_i, /// Wide AXI ports to cluster level - output axi_out_req_t [TileNarrowAxiPorts-1:0] axi_wide_req_o, - input axi_out_resp_t [TileNarrowAxiPorts-1:0] axi_wide_rsp_i, + output axi_out_req_t [TileNarrowAxiPorts-1:0] axi_wide_req_o, + input axi_out_resp_t [TileNarrowAxiPorts-1:0] axi_wide_rsp_i, /// Remote Tile access ports (to remote tiles) - output tcdm_req_t [NumRemotePortTile-1:0] remote_req_o, - output remote_tile_sel_t [NumRemotePortTile-1:0] remote_req_dst_o, - input tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_i, - input logic [NumRemotePortTile-1:0] remote_rsp_ready_i, + output tcdm_req_t [NumRemotePortTile-1:0] remote_req_o, + output remote_tile_sel_t [NumRemotePortTile-1:0] remote_req_dst_o, + input tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_i, + input logic [NumRemotePortTile-1:0] remote_rsp_ready_i, /// Remote Tile access ports (from remote tiles) - input tcdm_req_t [NumRemotePortTile-1:0] remote_req_i, - output 
tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_o, - output logic [NumRemotePortTile-1:0] remote_rsp_ready_o, + input tcdm_req_t [NumRemotePortTile-1:0] remote_req_i, + output tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_o, + output logic [NumRemotePortTile-1:0] remote_rsp_ready_o, + /// Inter-group remote access ports (to other groups). + /// Flat layout: flat index = j + r * NrTCDMPortsPerCore, + /// where j is the interco instance and r is the inter-group remote slot. + /// Total count: NumRemoteGroupPortCore * NrTCDMPortsPerCore. + /// Uses REQRSP-style types with built-in ready and remote_group_user_t. + output remote_group_req_t [TotRGPorts:0] remote_group_req_o, + input remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_i, + /// Inter-group remote access ports (from other groups) + input remote_group_req_t [TotRGPorts:0] remote_group_req_i, + output remote_group_rsp_t [TotRGPorts:0] remote_group_rsp_o, /// Peripheral signals - output icache_events_t [NrCores-1:0] icache_events_o, - input logic icache_prefetch_enable_i, - input logic [NrCores-1:0] cl_interrupt_i, - input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, - input cache_insn_t l1d_insn_i, - input logic [3:0] l1d_private_i, - input logic l1d_insn_valid_i, - output logic l1d_insn_ready_o, - input logic l1d_busy_i, + output icache_l1_events_t [NrCores-1:0] icache_events_o, + input logic icache_prefetch_enable_i, + input logic [NrCores-1:0] cl_interrupt_i, + input logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset_i, + input cache_insn_t l1d_insn_i, + input logic [3:0] l1d_private_i, + input logic l1d_insn_valid_i, + output logic l1d_insn_ready_o, + input logic l1d_busy_i, @@ -189,7 +190,6 @@ module cachepool_tile // --------- // TODO: Should be imported from Memory-mapped Reg logic [2:0] num_private_cache; - // half-half assign num_private_cache = l1d_private_i[2:0]; /// Minimum width to hold the core number. 
@@ -304,11 +304,6 @@ module cachepool_tile `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t, tcdm_user_t) - `MEM_TYPEDEF_ALL(mem, tcdm_mem_addr_t, data_t, strb_t, tcdm_user_t) - - `REG_BUS_TYPEDEF_ALL(reg, addr_t, data_t, strb_t) - - typedef struct packed { int unsigned idx; addr_t start_addr; @@ -405,7 +400,7 @@ module cachepool_tile core_events_t [NrCores-1:0] core_events; - snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events; + // snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events; // 4. Memory Subsystem (Core side). reqrsp_req_t [NrCores-1:0] core_req, filtered_core_req; @@ -418,6 +413,12 @@ module cachepool_tile tcdm_req_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_req, cache_xbar_req; tcdm_rsp_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_rsp, cache_xbar_rsp; + // Post-xbar gated copies. + // cache_ctrl_req : xbar output with q_valid suppressed during flush. + // cache_bank_rsp : raw response from the bank/AMO stage; q_ready is gated before + // being returned to the interco as cache_xbar_rsp. 
+ tcdm_req_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_ctrl_req; + tcdm_rsp_t [NrTCDMPortsPerCore-1:0][NumL1CtrlTile-1:0] cache_bank_rsp; tcdm_req_t [NumL1CtrlTile-1:0] cache_amo_req; tcdm_rsp_t [NumL1CtrlTile-1:0] cache_amo_rsp; @@ -517,16 +518,14 @@ module cachepool_tile always_comb begin : cache_flush_protection for (int j = 0; unsigned'(j) < NrTCDMPortsCores; j++) begin /***** REQ *****/ - // Wire to Cache outputs unmerge_req[j].q = tcdm_req[j].q; - // invalidate the request when cache is busy - unmerge_req[j].q_valid = tcdm_req[j].q_valid && !l1d_busy_i; + unmerge_req[j].q_valid = tcdm_req[j].q_valid; unmerge_pready[j] = 1'b1; /***** RSP *****/ tcdm_rsp[j].p = unmerge_rsp[j].p; tcdm_rsp[j].p_valid = unmerge_rsp[j].p_valid; - tcdm_rsp[j].q_ready = unmerge_rsp[j].q_ready && !l1d_busy_i; + tcdm_rsp[j].q_ready = unmerge_rsp[j].q_ready; end end @@ -545,25 +544,21 @@ module cachepool_tile // Used to determine the mapping policy between different cache banks. // Set through CSR - logic [$clog2(TCDMAddrWidth)-1:0] dynamic_offset; + logic [$clog2(AxiAddrWidth)-1:0] dynamic_offset; assign dynamic_offset = dynamic_offset_i; // One entry per flat remote port: flat index = j + r*NrTCDMPortsPerCore // where j is the xbar index and r is the remote slot within that xbar. logic [NumRemotePortTile-1:0] remote_out_pready, remote_in_pready; - // Flush protection for remote ports. - // - // During a flush (l1d_busy_i) remote tiles must be fully stalled: - // - q_valid gated : stops new requests being presented to the xbar - // - q_ready gated : stops the xbar accepting a request that is already - // sitting at the input (spill register would otherwise - // pop it, and the transaction would be lost because the - // cache is unavailable) - // - remote_in_pready gated : stops response-ready from propagating back, - // preventing in-flight completions during the flush window + // Intra-group remote port wiring. 
+ // q_valid and q_ready for incoming requests are passed through without gating: + // the after-xbar flush gate (cache_xbar_flush_gate) provides the authoritative + // protection at the cache bank boundary and naturally back-pressures through + // the interco to the remote sender. + // response-ready (remote_in_pready) is still gated to prevent draining in-flight + // completions during the flush window. tcdm_req_t [NumRemotePortTile-1:0] remote_req_gated; - // Intermediate response signals from the xbar before q_ready gating. tcdm_rsp_t [NumRemotePortTile-1:0] remote_rsp_xbar; always_comb begin : remote_flush_protection @@ -571,14 +566,10 @@ module cachepool_tile for (int r = 0; r < NumRemotePortCore; r++) begin automatic int unsigned flat = j + r * NrTCDMPortsPerCore; - // Gate q_valid: prevent new requests entering the xbar. remote_req_gated[flat].q = remote_req_i[flat].q; - remote_req_gated[flat].q_valid = remote_req_i[flat].q_valid && !l1d_busy_i; + remote_req_gated[flat].q_valid = remote_req_i[flat].q_valid; - // Pass the full xbar response through, then gate only q_ready so the - // remote tile cannot complete a handshake during a flush. remote_rsp_o[flat] = remote_rsp_xbar[flat]; - remote_rsp_o[flat].q_ready = remote_rsp_xbar[flat].q_ready && !l1d_busy_i; // Gate response-ready back to us: prevent draining completions // of requests that arrived just before the flush. @@ -589,9 +580,149 @@ module cachepool_tile assign remote_rsp_ready_o = remote_out_pready; + // ------------------------------------------------------------------------- + // Inter-group remote ports – type conversion and flush protection + // ------------------------------------------------------------------------- + // External ports use REQRSP-style remote_group_req_t / remote_group_rsp_t + // (with built-in ready and remote_group_user_t). + // Internal interco uses TCDM-style tcdm_req_t / tcdm_rsp_t. + // This section bridges the two and applies flush gating. 
+ // + // Same flat layout as remote ports: flat = j + r * NrTCDMPortsPerCore. + // Total count: NumRemoteGroupPortCore * NrTCDMPortsPerCore. + + localparam int unsigned NumRemoteGroupPortTile = NumRemoteGroupPortCore * NrTCDMPortsPerCore; + + // Internal TCDM-style signals going to/from the interco. + tcdm_req_t [NumRemoteGroupPortTile-1:0] rg_interco_in_req; // incoming requests to interco + tcdm_rsp_t [NumRemoteGroupPortTile-1:0] rg_interco_in_rsp; // responses from interco (for incoming) + logic [NumRemoteGroupPortTile-1:0] rg_interco_in_pready; // response ready for incoming + + tcdm_req_t [NumRemoteGroupPortTile-1:0] rg_interco_out_req; // outgoing requests from interco + tcdm_rsp_t [NumRemoteGroupPortTile-1:0] rg_interco_out_rsp; // responses returning (for outgoing) + logic [NumRemoteGroupPortTile-1:0] rg_interco_out_pready;// response ready for outgoing + remote_tile_sel_t [NumRemoteGroupPortTile-1:0] rg_interco_out_dst; // target tile from interco + + if (NumRemoteGroupPortCore > 0) begin : gen_remote_group_ports + always_comb begin + for (int j = 0; j < NrTCDMPortsPerCore; j++) begin + for (int r = 0; r < NumRemoteGroupPortCore; r++) begin + automatic int unsigned flat = j + r * NrTCDMPortsPerCore; + + // ----------------------------------------------------------- + // Incoming: REQRSP → TCDM conversion → interco + // q_valid and q_ready are passed through without gating; the + // after-xbar flush gate (cache_xbar_flush_gate) is the authoritative + // protection point and naturally back-pressures through the interco. 
+ // ----------------------------------------------------------- + rg_interco_in_req[flat] = '{ + q: '{ + addr: remote_group_req_i[flat].q.addr, + write: remote_group_req_i[flat].q.write, + data: remote_group_req_i[flat].q.data, + strb: remote_group_req_i[flat].q.strb, + amo: remote_group_req_i[flat].q.amo, + user: '{ + core_id: remote_group_req_i[flat].q.user.core_id, + tile_id: remote_group_req_i[flat].q.user.tile_id, + req_id: remote_group_req_i[flat].q.user.req_id, + is_fpu: remote_group_req_i[flat].q.user.is_fpu, + default: '0 + }, + default: '0 + }, + q_valid: remote_group_req_i[flat].q_valid, + default: '0 + }; + + // Interco response (TCDM) → REQRSP for remote_group_rsp_o. + remote_group_rsp_o[flat] = '{ + p: '{ + data: rg_interco_in_rsp[flat].p.data, + write: rg_interco_in_rsp[flat].p.write, + user: '{ + core_id: rg_interco_in_rsp[flat].p.user.core_id, + tile_id: rg_interco_in_rsp[flat].p.user.tile_id, + req_id: rg_interco_in_rsp[flat].p.user.req_id, + is_fpu: rg_interco_in_rsp[flat].p.user.is_fpu, + port_id: portid_t'(j), + default: '0 + }, + default: '0 + }, + p_valid: rg_interco_in_rsp[flat].p_valid, + q_ready: rg_interco_in_rsp[flat].q_ready, + default: '0 + }; + + // Response ready from the external port (REQRSP p_ready). 
+ rg_interco_in_pready[flat] = remote_group_req_i[flat].p_ready && !l1d_busy_i; + + // ----------------------------------------------------------- + // Outgoing: interco → flush gating → TCDM to REQRSP → output + // ----------------------------------------------------------- + remote_group_req_o[flat] = '{ + q: '{ + addr: rg_interco_out_req[flat].q.addr, + write: rg_interco_out_req[flat].q.write, + data: rg_interco_out_req[flat].q.data, + strb: rg_interco_out_req[flat].q.strb, + amo: rg_interco_out_req[flat].q.amo, + user: '{ + core_id: rg_interco_out_req[flat].q.user.core_id, + tile_id: rg_interco_out_req[flat].q.user.tile_id, + req_id: rg_interco_out_req[flat].q.user.req_id, + is_fpu: rg_interco_out_req[flat].q.user.is_fpu, + port_id: portid_t'(j), + dst_tile_id: rg_interco_out_dst[flat], + default: '0 + }, + default: '0 + }, + q_valid: rg_interco_out_req[flat].q_valid && !l1d_busy_i, + p_ready: rg_interco_out_pready[flat] && !l1d_busy_i, + default: '0 + }; + + // Returning response (REQRSP) → TCDM for the interco. + rg_interco_out_rsp[flat] = '{ + p: '{ + data: remote_group_rsp_i[flat].p.data, + write: remote_group_rsp_i[flat].p.write, + user: '{ + core_id: remote_group_rsp_i[flat].p.user.core_id, + tile_id: remote_group_rsp_i[flat].p.user.tile_id, + req_id: remote_group_rsp_i[flat].p.user.req_id, + is_fpu: remote_group_rsp_i[flat].p.user.is_fpu, + default: '0 + }, + default: '0 + }, + p_valid: remote_group_rsp_i[flat].p_valid, + q_ready: remote_group_rsp_i[flat].q_ready, + default: '0 + }; + end + end + end + end else begin : gen_remote_group_no_ports + // No inter-group remote ports: tie off outputs. 
+ assign remote_group_rsp_o = '0; + assign remote_group_req_o = '0; + assign rg_interco_in_req = '0; + assign rg_interco_in_pready = '0; + assign rg_interco_out_rsp = '0; + assign rg_interco_out_pready = '0; + assign rg_interco_in_rsp = '0; + assign rg_interco_out_req = '0; + assign rg_interco_out_dst = '0; + end + /// Wire requests after strb handling to the cache controller. /// Each xbar j handles NumRemotePortCore remote slots at flat indices /// j + r*NrTCDMPortsPerCore for r in [0, NumRemotePortCore). + /// Similarly, each xbar j handles NumRemoteGroupPortCore inter-group remote slots at flat indices + /// j + r*NrTCDMPortsPerCore for r in [0, NumRemoteGroupPortCore). for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin : gen_cache_xbar // Collect the NumRemotePortCore remote slots for this xbar. tcdm_req_t [NumRemotePortCore-1:0] xbar_remote_req_gated; @@ -613,33 +744,92 @@ module cachepool_tile assign remote_req_o [flat] = xbar_remote_req_o [r]; end - tcdm_cache_interco #( - .NumTiles (NumTiles ), - .NumCores (NrCores ), - .NumCache (NumL1CtrlTile ), - .NumTotCache (NumL1CacheCtrl ), - .NumRemotePort (NumRemotePortCore ), - .AddrWidth (TCDMAddrWidth ), - .TileIDWidth (TileIDWidth ), - .tcdm_req_t (tcdm_req_t ), - .tcdm_rsp_t (tcdm_rsp_t ), - .tcdm_req_chan_t (tcdm_req_chan_t ), - .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) - ) i_cache_xbar ( - .clk_i ( clk_i ), - .rst_ni ( rst_ni ), - .tile_id_i ( tile_id_i ), - .dynamic_offset_i ( dynamic_offset ), - .private_start_addr_i ( private_start_addr_i ), - .num_private_cache_i ( num_private_cache ), - .core_req_i ({xbar_remote_req_gated, cache_req [j]} ), - .core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ), - .core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ), - .tile_sel_o ( xbar_remote_req_dst ), - .mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]} ), - .mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]} ), - .mem_rsp_i ({xbar_remote_rsp_i, cache_xbar_rsp [j]} ) - ); + // Collect the 
NumRemoteGroupPortCore inter-group remote slots for this xbar (same flat layout). + // When NumRemoteGroupPortCore == 0, no inter-group remote signals exist and the interco is + // instantiated without inter-group remote ports (backward-compatible). + if (NumRemoteGroupPortCore > 0) begin : gen_remote_group_slice + tcdm_req_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_in_req; + tcdm_rsp_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_in_rsp; + logic [NumRemoteGroupPortCore-1:0] xbar_remote_group_in_pready; + tcdm_req_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_req; + tcdm_rsp_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_rsp; + logic [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_pready; + remote_tile_sel_t [NumRemoteGroupPortCore-1:0] xbar_remote_group_out_dst; + + for (genvar r = 0; r < NumRemoteGroupPortCore; r++) begin : gen_remote_group_slice_r + localparam int unsigned flat = j + r * NrTCDMPortsPerCore; + // Incoming: from conversion/flush → interco input + assign xbar_remote_group_in_req [r] = rg_interco_in_req [flat]; + assign xbar_remote_group_in_pready [r] = rg_interco_in_pready [flat]; + assign rg_interco_in_rsp [flat] = xbar_remote_group_in_rsp [r]; + // Outgoing: interco output → conversion/flush + assign rg_interco_out_req [flat] = xbar_remote_group_out_req [r]; + assign rg_interco_out_dst [flat] = xbar_remote_group_out_dst [r]; + assign xbar_remote_group_out_rsp [r] = rg_interco_out_rsp [flat]; + assign rg_interco_out_pready [flat] = xbar_remote_group_out_pready[r]; + end + + tcdm_cache_interco #( + .NumTiles (NumTiles ), + .NumCores (NrCores ), + .NumCache (NumL1CtrlTile ), + .NumTotCache (NumL1CacheCtrl ), + .NumRemotePort (NumRemotePortCore ), + .NumRemoteGroupPort (NumRemoteGroupPortCore ), + .NumTilesPerGroup (NumTilesPerGroup ), + .AddrWidth (TCDMAddrWidth ), + .TileIDWidth (TileIDWidth ), + .tcdm_req_t (tcdm_req_t ), + .tcdm_rsp_t (tcdm_rsp_t ), + .tcdm_req_chan_t (tcdm_req_chan_t ), + .tcdm_rsp_chan_t (tcdm_rsp_chan_t 
) + ) i_cache_xbar ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tile_id_i ( tile_id_i ), + .dynamic_offset_i ( dynamic_offset ), + .private_start_addr_i ( private_start_addr_i ), + .num_private_cache_i ( num_private_cache ), + .core_req_i ({xbar_remote_group_in_req, xbar_remote_req_gated, cache_req [j]}), + .core_rsp_ready_i ({xbar_remote_group_in_pready, xbar_remote_in_pready, cache_pready [j]}), + .core_rsp_o ({xbar_remote_group_in_rsp, xbar_remote_rsp_xbar, cache_rsp [j]}), + .tile_sel_o ( xbar_remote_req_dst ), + .remote_group_sel_o ( xbar_remote_group_out_dst ), + .mem_req_o ({xbar_remote_group_out_req, xbar_remote_req_o, cache_xbar_req [j]}), + .mem_rsp_ready_o ({xbar_remote_group_out_pready, xbar_remote_out_pready, cache_xbar_pready[j]}), + .mem_rsp_i ({xbar_remote_group_out_rsp, xbar_remote_rsp_i, cache_xbar_rsp [j]}) + ); + end else begin : gen_no_remote_group + // No inter-group remote ports: instantiate interco without inter-group remote ports (backward-compatible). + tcdm_cache_interco #( + .NumTiles (NumTiles ), + .NumCores (NrCores ), + .NumCache (NumL1CtrlTile ), + .NumTotCache (NumL1CacheCtrl ), + .NumRemotePort (NumRemotePortCore ), + .AddrWidth (TCDMAddrWidth ), + .TileIDWidth (TileIDWidth ), + .tcdm_req_t (tcdm_req_t ), + .tcdm_rsp_t (tcdm_rsp_t ), + .tcdm_req_chan_t (tcdm_req_chan_t ), + .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) + ) i_cache_xbar ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .tile_id_i ( tile_id_i ), + .dynamic_offset_i ( dynamic_offset ), + .private_start_addr_i ( private_start_addr_i ), + .num_private_cache_i ( num_private_cache ), + .core_req_i ({xbar_remote_req_gated, cache_req [j]} ), + .core_rsp_ready_i ({xbar_remote_in_pready, cache_pready [j]} ), + .core_rsp_o ({xbar_remote_rsp_xbar, cache_rsp [j]} ), + .tile_sel_o ( xbar_remote_req_dst ), + .remote_group_sel_o ( ), + .mem_req_o ({xbar_remote_req_o, cache_xbar_req [j]}), + .mem_rsp_ready_o ({xbar_remote_out_pready, cache_xbar_pready[j]}), + .mem_rsp_i ({xbar_remote_rsp_i, 
cache_xbar_rsp [j]}) + ); + end end for (genvar cb = 0; cb < NumL1CtrlTile; cb++) begin : gen_cache_connect @@ -659,9 +849,9 @@ module cachepool_tile ) i_cache_amo ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .core_req_i (cache_xbar_req [j][cb] ), + .core_req_i (cache_ctrl_req [j][cb] ), .core_rsp_ready_i (cache_xbar_pready[j][cb] ), - .core_rsp_o (cache_xbar_rsp [j][cb] ), + .core_rsp_o (cache_bank_rsp [j][cb] ), .mem_req_o (cache_amo_req [cb] ), .mem_rsp_ready_o (cache_amo_pready [cb] ), .mem_rsp_i (cache_amo_rsp [cb] ) @@ -713,22 +903,67 @@ module cachepool_tile assign cache_rsp_reg.p.write = cache_rsp_write[cb][j]; end else begin : gen_no_amo - // Bypass AMO and registers - assign cache_req_valid[cb][j] = cache_xbar_req [j][cb].q_valid; - assign cache_rsp_ready[cb][j] = cache_xbar_pready[j][cb]; - assign cache_req_addr [cb][j] = cache_xbar_req [j][cb].q.addr; - assign cache_req_meta [cb][j] = cache_xbar_req [j][cb].q.user; - assign cache_req_write[cb][j] = cache_xbar_req [j][cb].q.write; - assign cache_req_data [cb][j] = cache_xbar_req [j][cb].q.data; - assign cache_req_strb [cb][j] = cache_xbar_req [j][cb].q.strb; - - assign cache_xbar_rsp[j][cb].p_valid = cache_rsp_valid[cb][j]; - assign cache_xbar_rsp[j][cb].q_ready = cache_req_ready[cb][j]; - assign cache_xbar_rsp[j][cb].p.data = cache_rsp_data [cb][j]; - assign cache_xbar_rsp[j][cb].p.user = cache_rsp_meta [cb][j]; - - assign cache_xbar_rsp[j][cb].p.write = cache_rsp_write[cb][j]; + // Spill registers to cut the L1 xbar → coalescer critical path, + // matching the timing budget of the Snitch AMO path above. 
+ tcdm_req_t cache_req_reg; + tcdm_rsp_t cache_rsp_reg; + + spill_register #( + .T ( tcdm_req_chan_t ), + .Bypass ( 1'b0 ) + ) i_spill_reg_cache_req ( + .clk_i , + .rst_ni ( rst_ni ), + .valid_i ( cache_ctrl_req[j][cb].q_valid ), + .ready_o ( cache_bank_rsp[j][cb].q_ready ), + .data_i ( cache_ctrl_req[j][cb].q ), + .valid_o ( cache_req_reg.q_valid ), + .ready_i ( cache_rsp_reg.q_ready ), + .data_o ( cache_req_reg.q ) + ); + spill_register #( + .T ( tcdm_rsp_chan_t ), + .Bypass ( 1'b1 ) + ) i_spill_reg_cache_rsp ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .valid_i ( cache_rsp_reg.p_valid ), + .ready_o ( cache_rsp_ready[cb][j] ), + .data_i ( cache_rsp_reg.p ), + .valid_o ( cache_bank_rsp[j][cb].p_valid ), + .ready_i ( cache_xbar_pready[j][cb] ), + .data_o ( cache_bank_rsp[j][cb].p ) + ); + + assign cache_req_valid[cb][j] = cache_req_reg.q_valid; + assign cache_req_addr [cb][j] = cache_req_reg.q.addr; + assign cache_req_meta [cb][j] = cache_req_reg.q.user; + assign cache_req_write[cb][j] = cache_req_reg.q.write; + assign cache_req_data [cb][j] = cache_req_reg.q.data; + assign cache_req_strb [cb][j] = cache_req_reg.q.strb; + + assign cache_rsp_reg.p_valid = cache_rsp_valid[cb][j]; + assign cache_rsp_reg.q_ready = cache_req_ready[cb][j]; + assign cache_rsp_reg.p.data = cache_rsp_data [cb][j]; + assign cache_rsp_reg.p.user = cache_rsp_meta [cb][j]; + assign cache_rsp_reg.p.write = cache_rsp_write[cb][j]; + + end + end + end + + // Post-xbar flush gate (applied uniformly across all ports). + // Suppresses q_valid going into the bank so no new cache accesses are processed + // while a flush is in progress, and gates q_ready going back to the interco so the + // xbar cannot dequeue a buffered request that is already sitting at its output. 
+ always_comb begin : cache_xbar_flush_gate + for (int j = 0; j < NrTCDMPortsPerCore; j++) begin + for (int cb = 0; cb < NumL1CtrlTile; cb++) begin + cache_ctrl_req[j][cb] = cache_xbar_req[j][cb]; + cache_ctrl_req[j][cb].q_valid = cache_xbar_req[j][cb].q_valid && !l1d_busy_i; + cache_xbar_rsp[j][cb] = cache_bank_rsp[j][cb]; + cache_xbar_rsp[j][cb].q_ready = cache_bank_rsp[j][cb].q_ready && !l1d_busy_i; end end end @@ -745,6 +980,7 @@ module cachepool_tile localparam NumWordPerLine = L1LineWidth / DataWidth; localparam int unsigned WordBytes = DataWidth / 8; +`ifndef TARGET_SYNTHESIS initial begin $display("Cache Configuration:"); $display(" NumCtrl : %0d", NumL1CtrlTile); @@ -759,6 +995,7 @@ module cachepool_tile $display(" RefillDataWidth: %0d", RefillDataWidth); $display(" DynamicOffset : %0d", dynamic_offset); end +`endif // CL-offset mask: bits below dynamic_offset, verbatim in both directions. logic [SpatzAxiAddrWidth-1:0] bitmask_lo; @@ -885,6 +1122,7 @@ module cachepool_tile .CacheLineWidth (L1LineWidth ), .SetAssociativity (L1AssoPerCtrl ), .BankFactor (L1BankFactor ), + // .LogDebug (0 ), .RefillDataWidth (RefillDataWidth ), // Type .core_meta_t (tcdm_user_t ), @@ -1056,7 +1294,7 @@ module cachepool_tile .clk_i (clk_i ), .rst_ni (rst_ni ), .impl_i ('0 ), - .impl_o (/* unsed */ ), + .impl_o (/* unused */ ), .req_i (l1_tag_bank_req [cb][j]), .we_i (l1_tag_bank_we [cb][j]), .addr_i (l1_tag_bank_addr [cb][j]), @@ -1087,7 +1325,7 @@ module cachepool_tile .clk_i (clk_i ), .rst_ni (rst_ni ), .impl_i ('0 ), - .impl_o (/* unsed */ ), + .impl_o (/* unused */ ), .req_i ( l1_data_bank_req [cb][BaseIdx] ), .we_i ( l1_data_bank_we [cb][BaseIdx] ), .addr_i ( l1_data_bank_addr [cb][BaseIdx] ), @@ -1111,7 +1349,7 @@ module cachepool_tile // .clk_i (clk_i ), // .rst_ni (rst_ni ), // .impl_i ('0 ), - // .impl_o (/* unsed */ ), + // .impl_o (/* unused */ ), // .req_i (l1_data_bank_req [cb][j]), // .we_i (l1_data_bank_we [cb][j]), // .addr_i (l1_data_bank_addr [cb][j]), @@ 
-1134,13 +1372,13 @@ module cachepool_tile interrupts_t irq; sync #(.STAGES (2)) - i_sync_debug (.clk_i, .rst_ni, .serial_i (debug_req_i[i]), .serial_o (irq.debug)); + i_sync_debug (.clk_i, .rst_ni, .serial_i (debug_req_i), .serial_o (irq.debug)); sync #(.STAGES (2)) - i_sync_meip (.clk_i, .rst_ni, .serial_i (meip_i[i]), .serial_o (irq.meip)); + i_sync_meip (.clk_i, .rst_ni, .serial_i (meip_i), .serial_o (irq.meip)); sync #(.STAGES (2)) - i_sync_mtip (.clk_i, .rst_ni, .serial_i (mtip_i[i]), .serial_o (irq.mtip)); + i_sync_mtip (.clk_i, .rst_ni, .serial_i (mtip_i), .serial_o (irq.mtip)); sync #(.STAGES (2)) - i_sync_msip (.clk_i, .rst_ni, .serial_i (msip_i[i]), .serial_o (irq.msip)); + i_sync_msip (.clk_i, .rst_ni, .serial_i (msip_i), .serial_o (irq.msip)); assign irq.mcip = cl_interrupt_i[i]; tcdm_req_t [TcdmPorts-1:0] tcdm_req_wo_user; @@ -1155,15 +1393,10 @@ module cachepool_tile .RVF (RVF ), .RVD (RVD ), .RVV (RVV ), - .Xdma (Xdma[i] ), .AddrWidth (AxiAddrWidth ), .DataWidth (NarrowDataWidth ), .UserWidth (AxiUserWidth ), - .DMADataWidth (AxiDataWidth ), - .DMAIdWidth (AxiIdWidthIn ), .SnitchPMACfg (SnitchPMACfg ), - .DMAAxiReqFifoDepth (DMAAxiReqFifoDepth ), - .DMAReqFifoDepth (DMAReqFifoDepth ), .dreq_t (reqrsp_req_t ), .drsp_t (reqrsp_rsp_t ), .dreq_chan_t (reqrsp_req_chan_t ), @@ -1255,14 +1488,14 @@ module cachepool_tile .L0_LINE_COUNT ( 8 ), .LINE_WIDTH ( ICacheLineWidth ), .LINE_COUNT ( ICacheLineCount ), - .SET_COUNT ( ICacheSets ), + .WAY_COUNT ( ICacheSets ), .FETCH_AW ( AxiAddrWidth ), .FETCH_DW ( 32 ), .FILL_AW ( AxiAddrWidth ), .FILL_DW ( AxiDataWidth ), .EARLY_LATCH ( 0 ), .L0_EARLY_TAG_WIDTH ( snitch_pkg::PAGE_SHIFT - $clog2(ICacheLineWidth/8) ), - .ISO_CROSSING ( 1'b0 ), + .ISO_CROSSING ( 1'b1 ), .axi_req_t ( axi_mst_tile_wide_req_t ), .axi_rsp_t ( axi_mst_tile_wide_resp_t ), .sram_cfg_data_t ( impl_in_t ), @@ -1272,7 +1505,9 @@ module cachepool_tile .clk_d2_i ( clk_i ), .rst_ni ( rst_ni ), .enable_prefetching_i ( icache_prefetch_enable_i ), - 
.icache_events_o ( icache_events_o ), + .enable_branch_pred_i ( '0 ), + .icache_l0_events_o ( ), + .icache_l1_events_o ( ), .flush_valid_i ( flush_valid ), .flush_ready_o ( flush_ready ), .inst_addr_i ( inst_addr ), @@ -1283,6 +1518,8 @@ module cachepool_tile .inst_error_o ( inst_error ), .sram_cfg_tag_i ( '0 ), .sram_cfg_data_i ( '0 ), + .sram_cfg_out_data_o (), + .sram_cfg_out_tag_o (), .axi_req_o ( wide_axi_mst_req[ICache] ), .axi_rsp_i ( wide_axi_mst_rsp[ICache] ) ); @@ -1471,12 +1708,7 @@ module cachepool_tile // ------------- // Sanity Checks // ------------- - // Sanity check the parameters. Not every configuration makes sense. - `ASSERT_INIT(CheckSuperBankSanity, NrBanks >= BanksPerSuperBank); - `ASSERT_INIT(CheckSuperBankFactor, (NrBanks % BanksPerSuperBank) == 0); // Check that the cluster base address aligns to the TCDMSize. `ASSERT(ClusterBaseAddrAlign, ((TCDMSize - 1) & cluster_base_addr_i) == 0) - // Make sure we only have one DMA in the system. - `ASSERT_INIT(NumberDMA, $onehot0(Xdma)) endmodule diff --git a/hardware/src/tcdm_cache_interco.sv b/hardware/src/tcdm_cache_interco.sv index ba49e11..5287208 100644 --- a/hardware/src/tcdm_cache_interco.sv +++ b/hardware/src/tcdm_cache_interco.sv @@ -21,24 +21,57 @@ // private_bank = addr_bank_bits % num_private_cache_q // shared_bank = num_private_cache_q + (addr_bank_bits % num_shared_cache_q) // For non-power-of-2 partition sizes this causes uneven bank utilisation. +// +// Multi-group support (NumRemoteGroupPort > 0): +// +// When the cluster contains multiple groups, tile IDs are globally unique +// and encode both the group and tile-within-group: +// tile_id = {group_id, local_tile_id} +// +// The xbar performs three-way routing for shared (non-private) requests: +// 1. Local : same tile -> local cache bank +// 2. Intra-group : same group, diff tile -> remote port (existing xbar) +// 3. 
Inter-group : different group -> inter-group remote port (new) +// +// inter-group remote ports are appended after the remote ports on both input and output +// sides of the xbar, preserving full backward compatibility when +// NumRemoteGroupPort == 0. `include "common_cells/registers.svh" +`include "common_cells/assertions.svh" module tcdm_cache_interco #( /// Number of Tiles ('>= 1') parameter int unsigned NumTiles = 32'd1, /// Number of inputs into the interconnect (Cores per Tile) (`> 0`). parameter int unsigned NumCores = 32'd0, - /// Number of remote ports added to xbar ('>= 0'). + /// Number of remote ports added to xbar for intra-group traffic ('>= 0'). parameter int unsigned NumRemotePort = 32'd0, + /// Number of dedicated inter-group remote ports ('>= 0'). + /// When 0, the module behaves identically to the single-group configuration. + /// Each inter-group remote port serves as both an output (requests to other groups) and an + /// input (requests arriving from other groups), mirroring NumRemotePort. + parameter int unsigned NumRemoteGroupPort = 32'd0, /// Number of outputs from the interconnect (Cache banks per Tile) (`> 0`). parameter int unsigned NumCache = 32'd0, /// Number of total cache banks across all tiles (used for address scramble). + /// For multi-group, this must cover all tiles across all groups. parameter int unsigned NumTotCache = 32'd0, /// Address width in bits (cacheline offset: 512b => 6 bits). parameter int unsigned AddrWidth = 32'd32, /// Tile ID width ('> 0'). + /// In multi-group configurations, TileIDWidth covers the globally unique + /// tile ID which encodes both group and tile-within-group: + /// tile_id = {group_id, local_tile_id} parameter int unsigned TileIDWidth = 32'd1, + /// DRAM base address, used to check if we get illegal access + parameter int unsigned DramBaseAddr = 32'h8000_0000, + /// Number of tiles within a single group. 
+ /// Used to extract the group portion from the address tile field: + /// group_id = addr_tile_bits / NumTilesPerGroup + /// Only relevant when NumRemoteGroupPort > 0. Defaults to NumTiles for + /// backward compatibility (single-group: all tiles are in one group). + parameter int unsigned NumTilesPerGroup = NumTiles, /// Port type of the data request ports. parameter type tcdm_req_t = logic, @@ -52,7 +85,9 @@ module tcdm_cache_interco #( parameter snitch_pkg::topo_e Topology = snitch_pkg::LogarithmicInterconnect, /// Dependency parameters – do not override. parameter type tile_id_t = logic [TileIDWidth-1:0], - parameter type addr_t = logic [AddrWidth-1:0] + parameter type addr_t = logic [AddrWidth-1:0], + localparam TotInPorts = NumCores+NumRemotePort+NumRemoteGroupPort, + localparam TotOutPorts = NumCache+NumRemotePort+NumRemoteGroupPort ) ( /// Clock, positive edge triggered. @@ -68,69 +103,79 @@ module tcdm_cache_interco #( input logic [$clog2(NumCache):0] num_private_cache_i, /// Partitioning address input addr_t private_start_addr_i, - /// Request port (cores + remote-in) ---------------------------------- - input tcdm_req_t [NumCores+NumRemotePort-1:0] core_req_i, + /// Request port (cores + intra-group remote-in + inter-group remote-in). + input tcdm_req_t [TotInPorts-1:0] core_req_i, /// Response ready in. - input logic [NumCores+NumRemotePort-1:0] core_rsp_ready_i, - /// Response port (cores + remote-in). - output tcdm_rsp_t [NumCores+NumRemotePort-1:0] core_rsp_o, + input logic [TotInPorts-1:0] core_rsp_ready_i, + /// Response port (cores + intra-group remote-in + inter-group remote-in). + output tcdm_rsp_t [TotInPorts-1:0] core_rsp_o, /// Memory side ------------------------------------------------------- - /// Which remote tile is targeted (one entry per remote output port). + /// Which remote tile is targeted (one entry per intra-group remote output). 
output tile_id_t [NumRemotePort-1:0] tile_sel_o, - // output logic remote_group_o, - /// Requests to cache banks and remote output ports. - output tcdm_req_t [NumCache+NumRemotePort-1:0] mem_req_o, + /// Which tile is targeted via inter-group remote (one entry per inter-group remote output). + /// Carries the full globally-unique tile ID; the wrapper decomposes it + /// into group XY coordinates for the router and local tile ID for the + /// receiving-side xbar. + output tile_id_t [NumRemoteGroupPort-1:0] remote_group_sel_o, + /// Requests to cache banks, intra-group remote, and inter-group remote ports. + output tcdm_req_t [TotOutPorts-1:0] mem_req_o, /// Response ready out. - output logic [NumCache+NumRemotePort-1:0] mem_rsp_ready_o, - /// Responses from cache banks and remote output ports. - input tcdm_rsp_t [NumCache+NumRemotePort-1:0] mem_rsp_i + output logic [TotOutPorts-1:0] mem_rsp_ready_o, + /// Responses from cache banks, intra-group remote, and inter-group remote ports. + input tcdm_rsp_t [TotOutPorts-1:0] mem_rsp_i ); // ------------------------------------------------------------------------- // Local parameters // ------------------------------------------------------------------------- - // Bits to index into xbar outputs (local banks + one remote slot). - localparam int unsigned NumOutSelBits = $clog2(NumCache + NumRemotePort); + // Total number of xbar input and output ports. + localparam int unsigned NumInp = NumCores + NumRemotePort + NumRemoteGroupPort; + localparam int unsigned NumOut = NumCache + NumRemotePort + NumRemoteGroupPort; + // Bits to index into xbar outputs. + localparam int unsigned NumOutSelBits = $clog2(NumOut); // Bits to index into xbar inputs. - localparam int unsigned NumInpSelBits = $clog2(NumCores + NumRemotePort); + localparam int unsigned NumInpSelBits = $clog2(NumInp); // Bits needed to select among local cache banks. 
- localparam int unsigned CacheBankBits = $clog2(NumCache); + localparam int unsigned CacheBankBits = $clog2(NumCache); // Bits needed to select the tile in the shared address space. - // Equals TileIDWidth by construction (NumTotCache / NumCache == NumTiles). - localparam int unsigned TileBits = $clog2(NumTotCache / NumCache); + // Equals TileIDWidth by construction (NumTotCache / NumCache == NumTotalTiles). + localparam int unsigned TileBits = $clog2(NumTotCache / NumCache); + + // Group extraction: number of bits to identify the group within TileID. + // LocalTileBits = $clog2(NumTilesPerGroup); GroupBits = TileBits - LocalTileBits. + // Only meaningful when NumRemoteGroupPort > 0. + localparam int unsigned LocalTileBits = $clog2(NumTilesPerGroup); // ------------------------------------------------------------------------- // Types // ------------------------------------------------------------------------- typedef logic [NumInpSelBits-1:0] mem_sel_t; - typedef logic [NumOutSelBits -1:0] core_sel_t; + typedef logic [NumOutSelBits-1:0] core_sel_t; // ------------------------------------------------------------------------- // Internal signals // ------------------------------------------------------------------------- // Xbar routing signals. - core_sel_t [NumCores+NumRemotePort-1:0] core_req_sel; - mem_sel_t [NumCache+NumRemotePort-1:0] mem_rsp_sel; - // '1' when this request stays on local banks. - logic [NumCores+NumRemotePort-1:0] local_sel; + core_sel_t [NumInp-1:0] core_req_sel; + mem_sel_t [NumOut-1:0] mem_rsp_sel; // '1' when a request targets the private partition. - logic [NumCores+NumRemotePort-1:0] is_private; + logic [NumInp-1:0] is_private; // Xbar channel signals. 
- tcdm_req_chan_t [NumCores+NumRemotePort-1:0] core_req; - logic [NumCores+NumRemotePort-1:0] core_req_valid, core_req_ready; + tcdm_req_chan_t [NumInp-1:0] core_req; + logic [NumInp-1:0] core_req_valid, core_req_ready; - tcdm_req_chan_t [NumCache+NumRemotePort-1:0] mem_req; - logic [NumCache+NumRemotePort-1:0] mem_req_valid, mem_req_ready; + tcdm_req_chan_t [NumOut-1:0] mem_req; + logic [NumOut-1:0] mem_req_valid, mem_req_ready; - tcdm_rsp_chan_t [NumCores+NumRemotePort-1:0] core_rsp; - logic [NumCores+NumRemotePort-1:0] core_rsp_valid, core_rsp_ready; + tcdm_rsp_chan_t [NumInp-1:0] core_rsp; + logic [NumInp-1:0] core_rsp_valid, core_rsp_ready; - tcdm_rsp_chan_t [NumCache+NumRemotePort-1:0] mem_rsp; - logic [NumCache+NumRemotePort-1:0] mem_rsp_valid, mem_rsp_ready; + tcdm_rsp_chan_t [NumOut-1:0] mem_rsp; + logic [NumOut-1:0] mem_rsp_valid, mem_rsp_ready; // ------------------------------------------------------------------------- // Partition control – registered to ease timing @@ -155,7 +200,7 @@ module tcdm_cache_interco #( // Private/shared classification (request side, before xbar) // ------------------------------------------------------------------------- - for (genvar inp = 0; inp < NumCores+NumRemotePort; inp++) begin : gen_is_private + for (genvar inp = 0; inp < NumInp; inp++) begin : gen_is_private assign is_private[inp] = (core_req[inp].addr >= private_start_addr_q); end @@ -164,103 +209,131 @@ module tcdm_cache_interco #( // ------------------------------------------------------------------------- reqrsp_xbar #( - .NumInp (NumCores + NumRemotePort), - .NumOut (NumCache + NumRemotePort), + .NumInp (NumInp ), + .NumOut (NumOut ), .PipeReg (1'b0 ), .ExtReqPrio (1'b0 ), .ExtRspPrio (1'b0 ), .tcdm_req_chan_t (tcdm_req_chan_t ), .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) ) i_cache_xbar ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .slv_req_i (core_req ), - .slv_rr_i ('0 ), - .slv_req_valid_i (core_req_valid ), - .slv_req_ready_o (core_req_ready ), - .slv_rsp_o 
(core_rsp ), - .slv_rsp_valid_o (core_rsp_valid ), - .slv_rsp_ready_i (core_rsp_ready ), - .slv_sel_i (core_req_sel ), - .slv_selected_o (/* unused */ ), - .mst_req_o (mem_req ), - .mst_rr_i ('0 ), - .mst_req_valid_o (mem_req_valid ), - .mst_req_ready_i (mem_req_ready ), - .mst_rsp_i (mem_rsp ), - .mst_rsp_valid_i (mem_rsp_valid ), - .mst_rsp_ready_o (mem_rsp_ready ), - .mst_sel_i (mem_rsp_sel ) + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (core_req ), + .slv_rr_i ('0 ), + .slv_req_valid_i (core_req_valid ), + .slv_req_ready_o (core_req_ready ), + .slv_rsp_o (core_rsp ), + .slv_rsp_valid_o (core_rsp_valid ), + .slv_rsp_ready_i (core_rsp_ready ), + .slv_sel_i (core_req_sel ), + .slv_selected_o (/* unused */ ), + .mst_req_o (mem_req ), + .mst_rr_i ('0 ), + .mst_req_valid_o (mem_req_valid ), + .mst_req_ready_i (mem_req_ready ), + .mst_rsp_i (mem_rsp ), + .mst_rsp_valid_i (mem_rsp_valid ), + .mst_rsp_ready_o (mem_rsp_ready ), + .mst_sel_i (mem_rsp_sel ) ); // ------------------------------------------------------------------------- // Request routing (xbar input-side selection) // ------------------------------------------------------------------------- // - // Address layout (example: offset=6, CacheBankBits=2, TileBits=2): + // Address layout (example: offset=6, CacheBankBits=2, TileBits=4 with + // LocalTileBits=2 and GroupBits=2): // - // 31 14 | 13 12 | 11 10 | 9 7 | 5 0 - // Tag | TileID | BankSel | Index | CL offset - // ^-- [offset+CacheBankBits+TileBits-1 : offset+CacheBankBits] - // ^-- [offset+CacheBankBits-1 : offset] + // 31 16 | 15 14 | 13 12 | 11 10 | 9 7 | 5 0 + // Tag | GroupID | LclTID | BankSel | Index | CL offset + // ^-- [offset+CacheBankBits+TileBits-1 : offset+CacheBankBits+LocalTileBits] + // ^-- [offset+CacheBankBits+LocalTileBits-1 : offset+CacheBankBits] + // ^-- [offset+CacheBankBits-1 : offset] // - // Partitioning supports any num_private_cache_q in [0..NumCache]: - // Private banks : ports [0 .. 
num_private_cache_q-1] - // Shared banks : ports [num_private_cache_q .. NumCache-1] + // Three-way routing classification: + // 1. Local : addr tile == my tile -> route to cache bank + // 2. Intra-group : same group, different tile -> route to remote port + // 3. Inter-group : different group -> route to inter-group remote port // - // Bank selection uses modulo folding: - // private_bank = (addr_bank_bits % num_private_cache_q) - // shared_bank = num_private_cache_q + (addr_bank_bits % num_shared_cache_q) + // Partitioning (private/shared) interacts as follows: + // - Private requests are always local (same as before). + // - Shared requests use the full three-way classification. // - // For power-of-2 partition sizes this reduces to a simple bit mask. - // For non-power-of-2 sizes (e.g. 3) the modulo is a small comparator since - // addr_bank_bits is only CacheBankBits wide. + // The original two-way classification (local vs. remote) is preserved + // when NumRemoteGroupPort == 0, ensuring backward compatibility. + + // Derive this tile's group ID from the globally-unique tile_id_i. + logic [TileBits-1:0] my_group_id; + if (NumRemoteGroupPort == 0) begin + assign my_group_id = tile_id_i; + end else begin + assign my_group_id = tile_id_i[TileBits-1:LocalTileBits]; + end - for (genvar port = 0; port < NumCores+NumRemotePort; port++) begin : gen_req_sel + for (genvar port = 0; port < NumInp; port++) begin : gen_req_sel logic [CacheBankBits-1:0] addr_bank; - logic [TileIDWidth-1:0] addr_tile; + // Full tile ID extracted from the address (covers group + local tile). + logic [TileBits-1:0] addr_tile_id; + // Group portion of the address tile field. + logic [TileBits-1:0] addr_group_id; + // Whether the addressed group matches this tile's group. + logic same_group; always_comb begin // Defaults. - local_sel[port] = 1'b1; core_req_sel[port] = '0; // Extract the raw BankSel field from the address. 
- addr_bank = core_req[port].addr[dynamic_offset_i +: CacheBankBits]; - // Extract the target TileID from the address (used for remote port selection). - addr_tile = core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth]; - - if (num_private_cache_q == ($clog2(NumCache)+1)'(NumCache) || NumTiles == 1) begin - // All-private or single-tile: every request is local. + addr_bank = core_req[port].addr[dynamic_offset_i +: CacheBankBits]; + // Extract the full tile ID (group + local) from the address. + addr_tile_id = core_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileBits]; + // Extract group portion (upper bits of tile ID). + addr_group_id = addr_tile_id >> LocalTileBits; + // Compare group IDs. + same_group = (addr_group_id == my_group_id); + + if (num_private_cache_q == ($clog2(NumCache)+1)'(NumCache) + || (NumTiles == 1 && NumRemoteGroupPort == 0)) begin + // All-private, or single-tile single-group: every request is local. // Use the full BankSel field directly (no folding needed). - local_sel[port] = 1'b1; core_req_sel[port] = core_sel_t'(addr_bank); end else if (num_private_cache_q == '0) begin - // All-shared: check TileID to decide local vs. remote. - // Use the full BankSel field directly (no folding needed). - local_sel[port] = (addr_tile == tile_id_i); - // Route remote requests by target tile ID so that all accesses to the - // same tile share a single pipeline, preserving write-before-read - // ordering across barriers. - core_req_sel[port] = local_sel[port] - ? core_sel_t'(addr_bank) - : core_sel_t'(NumCache + (addr_tile % NumRemotePort)); + // All-shared: full three-way classification. + if (NumRemoteGroupPort > 0 && !same_group) begin + // Inter-group: route to inter-group remote port. 
+ core_req_sel[port] = core_sel_t'(NumCache + NumRemotePort + + (port % NumRemoteGroupPort)); + end else if (addr_tile_id[LocalTileBits-1:0] != tile_id_i[LocalTileBits-1:0] + && !(NumTiles == 1)) begin + // Intra-group remote: different tile, same group. + core_req_sel[port] = core_sel_t'(NumCache + (port % NumRemotePort)); + end else begin + // Local: same tile. + core_req_sel[port] = core_sel_t'(addr_bank); + end end else begin - // Mixed: fold addr_bank into the appropriate partition via modulo. + // Mixed partition: fold addr_bank into the appropriate partition. if (is_private[port]) begin // Private request: always local. - // bank = addr_bank % num_private_cache_q, offset from bank 0. - local_sel[port] = 1'b1; core_req_sel[port] = core_sel_t'(addr_bank % num_private_cache_q); end else begin - // Shared request: check TileID to decide local vs. remote. - // bank = num_private_cache_q + (addr_bank % num_shared_cache_q). - local_sel[port] = (addr_tile == tile_id_i); - core_req_sel[port] = local_sel[port] - ? core_sel_t'(num_private_cache_q + (addr_bank % num_shared_cache_q)) - : core_sel_t'(NumCache + (addr_tile % NumRemotePort)); + // Shared request: three-way classification. + if (NumRemoteGroupPort > 0 && !same_group) begin + // Inter-group: route to inter-group remote port. + core_req_sel[port] = core_sel_t'(NumCache + NumRemotePort + + (port % NumRemoteGroupPort)); + end else if (addr_tile_id[LocalTileBits-1:0] != tile_id_i[LocalTileBits-1:0] + && !(NumTiles == 1)) begin + // Intra-group remote: different tile, same group. + core_req_sel[port] = core_sel_t'(NumCache + (port % NumRemotePort)); + end else begin + // Local: same tile. 
+ core_req_sel[port] = core_sel_t'(num_private_cache_q + + (addr_bank % num_shared_cache_q)); + end end end end @@ -269,16 +342,35 @@ module tcdm_cache_interco #( // ------------------------------------------------------------------------- // Response routing (xbar output-side selection) // ------------------------------------------------------------------------- + // + // Responses from local cache banks are routed back to the originating + // core using core_id. Responses from intra-group remote tiles and + // inter-group remote ports carry a tile_id that differs from tile_id_i; + // these are forwarded to the corresponding remote-in or inter-group remote-in port. + + for (genvar port = 0; port < NumOut; port++) begin : gen_rsp_sel + logic [TileBits-1:0] rsp_group_id; + if (NumRemoteGroupPort == 0) begin + assign rsp_group_id = my_group_id; + end else begin + assign rsp_group_id = mem_rsp[port].user.tile_id[TileBits-1:LocalTileBits]; + end - for (genvar port = 0; port < NumCache+NumRemotePort; port++) begin : gen_rsp_sel always_comb begin mem_rsp_sel[port] = mem_rsp[port].user.core_id; if (mem_rsp[port].user.tile_id != tile_id_i) begin - // Response destined for a remote tile: forward to the remote interco - // port that matches the incoming request path. The group-level xbar - // routes requests from source tile S to our remote-in slot - // (S % NumRemotePort), so responses must return via the same slot. - mem_rsp_sel[port] = mem_sel_t'(NumCores + (mem_rsp[port].user.tile_id % NumRemotePort)); + // Response originates from a different tile (intra-group remote or + // inter-group remote). Determine which input port set it came from. + if (NumRemoteGroupPort > 0 + && rsp_group_id != my_group_id) begin + // Inter-group: forward to the inter-group remote-in input port. + mem_rsp_sel[port] = mem_sel_t'(NumCores + NumRemotePort + + (mem_rsp[port].user.core_id % NumRemoteGroupPort)); + end else begin + // Intra-group: forward to the remote-in input port. 
+ mem_rsp_sel[port] = mem_sel_t'(NumCores + + (mem_rsp[port].user.core_id % NumRemotePort)); + end end end end @@ -287,7 +379,7 @@ module tcdm_cache_interco #( // Input-side pipeline registers // ------------------------------------------------------------------------- - for (genvar port = 0; port < NumCores+NumRemotePort; port++) begin : gen_cache_interco_reg + for (genvar port = 0; port < NumInp; port++) begin : gen_cache_interco_reg spill_register #( .T (tcdm_req_chan_t ) ) i_tcdm_req_reg ( @@ -349,11 +441,11 @@ module tcdm_cache_interco #( // // lower = addr & ((1 << offset) - 1) // CLoffset, verbatim // rot_field = (addr >> offset) & ((1 << N) - 1) // N routing bits - // upper = addr >> (offset + N) // Tag+Index + // upper = addr >> (offset + N) // Tag+Index // // addr_rot = lower - // | (upper << offset) // close the hole - // | (rot_field << (AddrWidth - N)) // park at MSB + // | (upper << offset) // close the hole + // | (rot_field << (AddrWidth - N)) // park at MSB // Width of bits_to_rotate signal: must hold values up to CacheBankBits+TileBits. localparam int unsigned RotWidth = $clog2(CacheBankBits + TileBits + 1) + 1; @@ -408,7 +500,7 @@ module tcdm_cache_interco #( // Output assignment // ------------------------------------------------------------------------- - for (genvar port = 0; port < NumCache + NumRemotePort; port++) begin : gen_cache_io + for (genvar port = 0; port < NumOut; port++) begin : gen_cache_io always_comb begin mem_req_o[port] = '{ q : mem_req[port], @@ -419,10 +511,14 @@ module tcdm_cache_interco #( if (port < NumCache) begin // Local bank: forward address with routing bits rotated to MSB. mem_req_o[port].q.addr = addr_rot[port]; - end else begin - // Remote port: pass address untouched; extract target tile ID. + end else if (port < NumCache + NumRemotePort) begin + // Intra-group remote port: pass address untouched; extract target tile ID. 
tile_sel_o[port - NumCache] = - mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileIDWidth]; + mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileBits]; + end else begin + // Inter-group remote port: pass address untouched; extract target tile ID. + remote_group_sel_o[port - NumCache - NumRemotePort] = + mem_req[port].addr[(dynamic_offset_i + CacheBankBits) +: TileBits]; end end @@ -433,4 +529,21 @@ module tcdm_cache_interco #( assign mem_rsp_ready_o = mem_rsp_ready; + // ------------------------------------------------------------------------- + // Assertions + // ------------------------------------------------------------------------- +`ifndef TARGET_SYNTHESIS + // This is used to ensure we will not have illegal visits to DRAM + // This kind of error can be latent in the system until the entry is evicted + for (genvar x = 0; x < TotInPorts; x++) begin : gen_addr_assert + CoreReqAddrAboveDram: assert property ( + @(posedge clk_i) disable iff (!rst_ni !== '0) + core_req_i[x].q_valid |-> core_req_i[x].q.addr >= addr_t'(DramBaseAddr) + ) else begin + $error("[%m] port %0d: addr 0x%08x is below DramBaseAddr 0x%08x", + x, core_req_i[x].q.addr, DramBaseAddr); + end + end +`endif + endmodule diff --git a/hardware/tb/cachepool_cluster_wrapper.sv b/hardware/tb/cachepool_cluster_wrapper.sv index a9dba20..e412f7c 100644 --- a/hardware/tb/cachepool_cluster_wrapper.sv +++ b/hardware/tb/cachepool_cluster_wrapper.sv @@ -13,50 +13,65 @@ module cachepool_cluster_wrapper import fpnew_pkg::fpu_implementation_t; import snitch_pma_pkg::snitch_pma_t; #( - parameter int unsigned AxiAddrWidth = SpatzAxiAddrWidth, - parameter int unsigned AxiDataWidth = SpatzAxiDataWidth, - parameter int unsigned AxiUserWidth = SpatzAxiUserWidth, - parameter int unsigned AxiInIdWidth = SpatzAxiIdInWidth, - parameter int unsigned AxiOutIdWidth = SpatzAxiIdOutWidth, - - parameter type axi_in_resp_t = spatz_axi_in_resp_t, - parameter type axi_in_req_t = spatz_axi_in_req_t, - - parameter 
type axi_out_resp_t = spatz_axi_out_resp_t, - parameter type axi_out_req_t = spatz_axi_out_req_t, - - parameter type axi_narrow_req_t = spatz_axi_narrow_req_t, - parameter type axi_narrow_resp_t = spatz_axi_narrow_resp_t + parameter int unsigned AxiAddrWidth = SpatzAxiAddrWidth, + parameter int unsigned AxiDataWidth = SpatzAxiDataWidth, + parameter int unsigned AxiUserWidth = SpatzAxiUserWidth, + // External input ID width (SoC/testbench → wrapper); remapped to SpatzAxiIdInWidth inside. + parameter int unsigned AxiInIdWidth = WrapperAxiIdInWidth, + // External wide output ID width (wrapper → DRAM); remapped from SpatzAxiIdOutWidth inside. + parameter int unsigned AxiOutIdWidth = WrapperAxiIdOutWidth, + // External narrow output ID width (UART, wrapper → SoC); remapped from SpatzAxiUartIdWidth inside. + parameter int unsigned AxiNarrowOutIdWidth = WrapperAxiNarrowIdOutWidth, + + // External input types use the wrapper-narrowed ID (WrapperAxiIdInWidth). + parameter type axi_in_req_t = spatz_axi_wrapper_in_req_t, + parameter type axi_in_resp_t = spatz_axi_wrapper_in_resp_t, + + // External wide output types use the wrapper-narrowed ID (WrapperAxiIdOutWidth). + parameter type axi_out_req_t = spatz_axi_wrapper_out_req_t, + parameter type axi_out_resp_t = spatz_axi_wrapper_out_resp_t, + + // External narrow output types use the wrapper-narrowed ID (WrapperAxiNarrowIdOutWidth). 
+ parameter type axi_narrow_out_req_t = spatz_axi_wrapper_narrow_out_req_t, + parameter type axi_narrow_out_resp_t = spatz_axi_wrapper_narrow_out_resp_t )( - input logic clk_i, - input logic rst_ni, - output logic [3:0] eoc_o, - input logic debug_req_i, - - input logic meip_i, - input logic mtip_i, - input logic msip_i, - output logic cluster_probe_o, - input axi_in_req_t axi_in_req_i, - output axi_in_resp_t axi_in_resp_o, - /// AXI Narrow out-port (UART) - output axi_uart_req_t axi_narrow_req_o, - input axi_uart_resp_t axi_narrow_resp_i, - output axi_out_req_t [NumClusterSlv-1:0] axi_out_req_o, - input axi_out_resp_t [NumClusterSlv-1:0] axi_out_resp_i + input logic clk_i, + input logic rst_ni, + output logic [3:0] eoc_o, + input logic debug_req_i, + + input logic meip_i, + input logic mtip_i, + input logic msip_i, + output logic cluster_probe_o, + // AXI slave port (from SoC/testbench); external ID = AxiInIdWidth. + input axi_in_req_t axi_in_req_i, + output axi_in_resp_t axi_in_resp_o, + /// AXI Narrow out-port (UART); external ID = AxiNarrowOutIdWidth. + output axi_narrow_out_req_t axi_narrow_req_o, + input axi_narrow_out_resp_t axi_narrow_resp_i, + // AXI wide master ports (to DRAM); external ID = AxiOutIdWidth. + output axi_out_req_t [NumClusterSlv-1:0] axi_out_req_o, + input axi_out_resp_t [NumClusterSlv-1:0] axi_out_resp_i ); - - spatz_axi_iwc_out_req_t [NumClusterSlv-1:0] axi_from_cluster_iwc_req; - spatz_axi_iwc_out_resp_t [NumClusterSlv-1:0] axi_from_cluster_iwc_resp; + // Internal signals between wrapper remappers and cluster (fat IDs). + spatz_axi_in_req_t axi_cluster_in_req; + spatz_axi_in_resp_t axi_cluster_in_resp; + axi_uart_req_t axi_cluster_narrow_req; + axi_uart_resp_t axi_cluster_narrow_resp; + spatz_axi_out_req_t [NumClusterSlv-1:0] axi_cluster_out_req; + spatz_axi_out_resp_t [NumClusterSlv-1:0] axi_cluster_out_resp; // Spatz cluster under test. + // Internal AXI types are fixed (full-width IDs); the wrapper remaps at both boundaries. 
cachepool_cluster #( .AxiAddrWidth (AxiAddrWidth ), .AxiDataWidth (AxiDataWidth ), - .AxiIdWidthIn (AxiInIdWidth ), - .AxiIdWidthOut (AxiOutIdWidth ), + // Cluster always sees the full internal ID width on its slave port. + .AxiIdWidthIn (SpatzAxiIdInWidth ), + .AxiIdWidthOut (SpatzAxiIdOutWidth ), .AxiUserWidth (AxiUserWidth ), .BootAddr (BootAddr ), .UartAddr (UartAddr ), @@ -74,15 +89,15 @@ module cachepool_cluster_wrapper .NumIntOutstandingLoads (NumIntOutstandingLoads ), .NumIntOutstandingMem (NumIntOutstandingMem ), .NumSpatzOutstandingLoads (NumSpatzOutstandingLoads ), - .axi_in_req_t (axi_in_req_t ), - .axi_in_resp_t (axi_in_resp_t ), - .axi_narrow_req_t (axi_narrow_req_t ), - .axi_narrow_resp_t (axi_narrow_resp_t ), - .axi_out_req_t (axi_out_req_t ), - .axi_out_resp_t (axi_out_resp_t ), - .Xdma (4'h0 ), - .DMAAxiReqFifoDepth (3 ), - .DMAReqFifoDepth (3 ), + // Cluster slave port uses full internal type (remap is above this level). + .axi_in_req_t (spatz_axi_in_req_t ), + .axi_in_resp_t (spatz_axi_in_resp_t ), + // Cluster per-tile narrow type (internal crossbar width, not the UART mux output). + .axi_narrow_req_t (spatz_axi_narrow_req_t ), + .axi_narrow_resp_t (spatz_axi_narrow_resp_t ), + // Cluster internally uses the fat output type; the wrapper remaps it. 
+ .axi_out_req_t (spatz_axi_out_req_t ), + .axi_out_resp_t (spatz_axi_out_resp_t ), .RegisterOffloadRsp (1 ), .RegisterCoreReq (1 ), .RegisterCoreRsp (1 ), @@ -97,22 +112,86 @@ module cachepool_cluster_wrapper .eoc_o (eoc_o ), .impl_i ('0 ), .error_o ( ), - .debug_req_i ({NumCores{debug_req_i}} ), - .meip_i ({NumCores{meip_i}} ), - .mtip_i ({NumCores{mtip_i}} ), - .msip_i ({NumCores{msip_i}} ), + .debug_req_i (debug_req_i ), + .meip_i (meip_i ), + .mtip_i (mtip_i ), + .msip_i (msip_i ), .hart_base_id_i (10'h0 ), .cluster_base_addr_i (TCDMStartAddr ), .cluster_probe_o (cluster_probe_o ), - .axi_in_req_i , - .axi_in_resp_o , - .axi_narrow_req_o , - .axi_narrow_resp_i , - // AXI Master Port - .axi_out_req_o ( axi_out_req_o ), - .axi_out_resp_i ( axi_out_resp_i ) + // Remapped internal connections. + .axi_in_req_i (axi_cluster_in_req ), + .axi_in_resp_o (axi_cluster_in_resp ), + .axi_narrow_req_o (axi_cluster_narrow_req ), + .axi_narrow_resp_i (axi_cluster_narrow_resp ), + // AXI Master Port (fat IDs; wrapper remaps before external port). + .axi_out_req_o (axi_cluster_out_req ), + .axi_out_resp_i (axi_cluster_out_resp ) + ); + + // Expand WrapperAxiIdInWidth -> SpatzAxiIdInWidth on the cluster slave port. + // The external SoC/testbench drives narrow IDs; the cluster expects full-width IDs. + axi_id_remap #( + .AxiSlvPortIdWidth ( WrapperAxiIdInWidth ), + // Up to 2^WrapperAxiIdInWidth = 16 unique IDs from external host. 
+ .AxiSlvPortMaxUniqIds ( 2**WrapperAxiIdInWidth ), + .AxiMaxTxnsPerId ( NumAxiMaxTrans ), + .AxiMstPortIdWidth ( SpatzAxiIdInWidth ), + .slv_req_t ( axi_in_req_t ), + .slv_resp_t ( axi_in_resp_t ), + .mst_req_t ( spatz_axi_in_req_t ), + .mst_resp_t ( spatz_axi_in_resp_t ) + ) i_in_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_in_req_i ), + .slv_resp_o ( axi_in_resp_o ), + .mst_req_o ( axi_cluster_in_req ), + .mst_resp_i ( axi_cluster_in_resp ) ); + // Compress SpatzAxiUartIdWidth -> WrapperAxiNarrowIdOutWidth on the UART master port. + axi_id_remap #( + .AxiSlvPortIdWidth ( SpatzAxiUartIdWidth ), + // Cap at 2^WrapperAxiNarrowIdOutWidth unique IDs toward the SoC. + .AxiSlvPortMaxUniqIds ( 2**WrapperAxiNarrowIdOutWidth ), + .AxiMaxTxnsPerId ( NumAxiMaxTrans ), + .AxiMstPortIdWidth ( WrapperAxiNarrowIdOutWidth ), + .slv_req_t ( axi_uart_req_t ), + .slv_resp_t ( axi_uart_resp_t ), + .mst_req_t ( axi_narrow_out_req_t ), + .mst_resp_t ( axi_narrow_out_resp_t ) + ) i_narrow_out_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_cluster_narrow_req ), + .slv_resp_o ( axi_cluster_narrow_resp ), + .mst_req_o ( axi_narrow_req_o ), + .mst_resp_i ( axi_narrow_resp_i ) + ); + + // Reduce SpatzAxiIdOutWidth -> WrapperAxiIdOutWidth per DRAM channel. + // NumAxiMaxTrans = 32 outstanding per channel; 6 bits gives 64 unique ID slots. 
+ for (genvar ch = 0; ch < NumClusterSlv; ch++) begin : gen_out_id_remap + axi_id_remap #( + .AxiSlvPortIdWidth ( SpatzAxiIdOutWidth ), + .AxiSlvPortMaxUniqIds ( NumAxiMaxTrans ), + .AxiMaxTxnsPerId ( NumAxiMaxTrans ), + .AxiMstPortIdWidth ( WrapperAxiIdOutWidth ), + .slv_req_t ( spatz_axi_out_req_t ), + .slv_resp_t ( spatz_axi_out_resp_t ), + .mst_req_t ( spatz_axi_wrapper_out_req_t ), + .mst_resp_t ( spatz_axi_wrapper_out_resp_t ) + ) i_out_id_remap ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_cluster_out_req [ch] ), + .slv_resp_o ( axi_cluster_out_resp [ch] ), + .mst_req_o ( axi_out_req_o [ch] ), + .mst_resp_i ( axi_out_resp_i [ch] ) + ); + end + // AXI utilization monitor `ifndef TARGET_SYNTHESIS typedef logic [31:0] cnt_t; @@ -241,11 +320,14 @@ module cachepool_cluster_wrapper if (AxiUserWidth != SpatzAxiUserWidth) $error("[spatz_cluster_wrapper] AXI User Width does not match the configuration."); - if (AxiInIdWidth != SpatzAxiIdInWidth) + if (AxiInIdWidth != WrapperAxiIdInWidth) $error("[spatz_cluster_wrapper] AXI Id Width (In) does not match the configuration."); - if (AxiOutIdWidth != SpatzAxiIdOutWidth) + if (AxiOutIdWidth != WrapperAxiIdOutWidth) $error("[spatz_cluster_wrapper] AXI Id Width (Out) does not match the configuration."); + + if (AxiNarrowOutIdWidth != WrapperAxiNarrowIdOutWidth) + $error("[spatz_cluster_wrapper] AXI Narrow Id Width (Out) does not match the configuration."); `endif endmodule diff --git a/hardware/tb/tb_cachepool.sv b/hardware/tb/tb_cachepool.sv index 7da3e78..385d699 100644 --- a/hardware/tb/tb_cachepool.sv +++ b/hardware/tb/tb_cachepool.sv @@ -66,18 +66,19 @@ module tb_cachepool; localparam NumAXISlaves = 2; localparam NumRules = NumAXISlaves-1; - // Spatz wide port to SoC (currently dram) - spatz_axi_out_req_t [NumL2Channel-1:0] axi_from_cluster_req; - spatz_axi_out_resp_t [NumL2Channel-1:0] axi_from_cluster_resp; - // From SoC to Spatz - spatz_axi_in_req_t axi_to_cluster_req; - spatz_axi_in_resp_t 
axi_to_cluster_resp; + // Spatz wide port to SoC (currently dram); IDs narrowed by wrapper-level axi_id_remap. + spatz_axi_wrapper_out_req_t [NumL2Channel-1:0] axi_from_cluster_req; + spatz_axi_wrapper_out_resp_t [NumL2Channel-1:0] axi_from_cluster_resp; + // From SoC to Spatz; IDs expanded by wrapper-level axi_id_remap (WrapperAxiIdInWidth → SpatzAxiIdInWidth). + spatz_axi_wrapper_in_req_t axi_to_cluster_req; + spatz_axi_wrapper_in_resp_t axi_to_cluster_resp; - axi_uart_req_t axi_uart_req; - axi_uart_resp_t axi_uart_rsp; + // UART; IDs compressed by wrapper-level axi_id_remap (SpatzAxiUartIdWidth → WrapperAxiNarrowIdOutWidth). + spatz_axi_wrapper_narrow_out_req_t axi_uart_req; + spatz_axi_wrapper_narrow_out_resp_t axi_uart_rsp; // DRAM Scrambled request - spatz_axi_out_req_t [NumL2Channel-1:0] axi_dram_req; + spatz_axi_wrapper_out_req_t [NumL2Channel-1:0] axi_dram_req; /********* @@ -141,13 +142,13 @@ module tb_cachepool; reqrsp_cluster_in_rsp_t to_cluster_rsp; reqrsp_to_axi #( - .DataWidth (SpatzDataWidth ), - .AxiUserWidth(SpatzAxiUserWidth ), - .UserWidth ($bits(tcdm_user_t) ), - .axi_req_t (spatz_axi_in_req_t ), - .axi_rsp_t (spatz_axi_in_resp_t ), - .reqrsp_req_t(reqrsp_cluster_in_req_t), - .reqrsp_rsp_t(reqrsp_cluster_in_rsp_t) + .DataWidth (SpatzDataWidth ), + .AxiUserWidth(SpatzAxiUserWidth ), + .UserWidth ($bits(tcdm_user_t) ), + .axi_req_t (spatz_axi_wrapper_in_req_t ), + .axi_rsp_t (spatz_axi_wrapper_in_resp_t ), + .reqrsp_req_t(reqrsp_cluster_in_req_t ), + .reqrsp_rsp_t(reqrsp_cluster_in_rsp_t ) ) i_reqrsp_to_axi ( .clk_i (clk ), .rst_ni (rst_n ), @@ -211,6 +212,72 @@ module tb_cachepool; to_cluster_req = '0; + // Initialize L1D cache before waking up cores + // Step 1: Write init instruction (flush + invalidate) + to_cluster_req = '{ + q: '{ + addr : PeriStartAddr + CACHEPOOL_PERIPHERAL_CFG_L1D_INSN_OFFSET, + data : 32'h3, + write : 1'b1, + strb : '1, + amo : reqrsp_pkg::AMONone, + default: '0 + }, + q_valid: 1'b1, + p_ready: 1'b0 + }; + 
`wait_for(to_cluster_rsp.q_ready); + to_cluster_req = '0; + `wait_for(to_cluster_rsp.p_valid); + to_cluster_req = '{p_ready: 1'b1, q: '{amo: reqrsp_pkg::AMONone, default: '0}, default: '0}; + @(posedge clk); + to_cluster_req = '0; + + // Step 2: Commit the instruction + to_cluster_req = '{ + q: '{ + addr : PeriStartAddr + CACHEPOOL_PERIPHERAL_L1D_INSN_COMMIT_OFFSET, + data : 32'h1, + write : 1'b1, + strb : '1, + amo : reqrsp_pkg::AMONone, + default: '0 + }, + q_valid: 1'b1, + p_ready: 1'b0 + }; + `wait_for(to_cluster_rsp.q_ready); + to_cluster_req = '0; + `wait_for(to_cluster_rsp.p_valid); + to_cluster_req = '{p_ready: 1'b1, q: '{amo: reqrsp_pkg::AMONone, default: '0}, default: '0}; + @(posedge clk); + to_cluster_req = '0; + + // Step 3: Poll until flush complete + begin + automatic logic [31:0] flush_status; + do begin + to_cluster_req = '{ + q: '{ + addr : PeriStartAddr + CACHEPOOL_PERIPHERAL_L1D_FLUSH_STATUS_OFFSET, + write : 1'b0, + strb : '0, + amo : reqrsp_pkg::AMONone, + default: '0 + }, + q_valid: 1'b1, + p_ready: 1'b0 + }; + `wait_for(to_cluster_rsp.q_ready); + to_cluster_req = '0; + `wait_for(to_cluster_rsp.p_valid); + flush_status = to_cluster_rsp.p.data; + to_cluster_req = '{p_ready: 1'b1, q: '{amo: reqrsp_pkg::AMONone, default: '0}, default: '0}; + @(posedge clk); + to_cluster_req = '0; + end while (flush_status[0]); + end + // Wake up cores debug_req = '1; @(posedge clk); @@ -227,8 +294,8 @@ module tb_cachepool; **********/ axi_uart #( - .axi_req_t (axi_uart_req_t ), - .axi_resp_t(axi_uart_resp_t) + .axi_req_t (spatz_axi_wrapper_narrow_out_req_t ), + .axi_resp_t(spatz_axi_wrapper_narrow_out_resp_t) ) i_axi_uart ( .clk_i (clk ), .rst_ni (rst_n ), @@ -314,19 +381,19 @@ module tb_cachepool; for (genvar mem = 0; mem < NumL2Channel; mem++) begin: gen_dram axi_dram_sim #( - .BASE ( DramBase ), - .DRAMType ( DramType ), - .AxiAddrWidth ( SpatzAxiAddrWidth ), - .AxiDataWidth ( SpatzAxiDataWidth ), - .AxiIdWidth ( SpatzAxiIdOutWidth ), - .AxiUserWidth ( 
SpatzAxiUserWidth ), - .axi_req_t ( spatz_axi_out_req_t ), - .axi_resp_t ( spatz_axi_out_resp_t ), - .axi_ar_t ( spatz_axi_out_ar_chan_t ), - .axi_r_t ( spatz_axi_out_r_chan_t ), - .axi_aw_t ( spatz_axi_out_aw_chan_t ), - .axi_w_t ( spatz_axi_out_w_chan_t ), - .axi_b_t ( spatz_axi_out_b_chan_t ) + .BASE ( DramBase ), + .DRAMType ( DramType ), + .AxiAddrWidth ( SpatzAxiAddrWidth ), + .AxiDataWidth ( SpatzAxiDataWidth ), + .AxiIdWidth ( WrapperAxiIdOutWidth ), + .AxiUserWidth ( SpatzAxiUserWidth ), + .axi_req_t ( spatz_axi_wrapper_out_req_t ), + .axi_resp_t ( spatz_axi_wrapper_out_resp_t ), + .axi_ar_t ( spatz_axi_wrapper_out_ar_chan_t ), + .axi_r_t ( spatz_axi_wrapper_out_r_chan_t ), + .axi_aw_t ( spatz_axi_wrapper_out_aw_chan_t ), + .axi_w_t ( spatz_axi_wrapper_out_w_chan_t ), + .axi_b_t ( spatz_axi_wrapper_out_b_chan_t ) ) i_axi_dram_sim ( .clk_i ( clk ), .rst_ni ( rst_n ), diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..216cd0d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +# Python packages required for hardware code generation (make generate). +# dataclasses is a stdlib backport needed on Python 3.6; it is a no-op on 3.7+. 
+dataclasses +hjson +jsonref +jsonschema +mako +termcolor diff --git a/sim/scripts/vsim_cluster.tcl b/sim/scripts/vsim_cluster.tcl index c8d66f3..2213a16 100644 --- a/sim/scripts/vsim_cluster.tcl +++ b/sim/scripts/vsim_cluster.tcl @@ -5,11 +5,7 @@ # Create group for Cluster onerror {resume} -set cluster_path $1 - -add wave -noupdate -group Cluster -group xbar -group req_xbar ${cluster_path}/i_cluster_xbar/i_req_xbar/* -add wave -noupdate -group Cluster -group xbar -group rsp_xbar ${cluster_path}/i_cluster_xbar/i_rsp_xbar/* -add wave -noupdate -group Cluster -group xbar ${cluster_path}/i_cluster_xbar/* +quietly set cluster_path $1 add wave -noupdate -group Cluster -group CSR ${cluster_path}/i_cachepool_cluster_peripheral/* diff --git a/sim/scripts/vsim_core.tcl b/sim/scripts/vsim_core.tcl index 9510e33..30ee61a 100644 --- a/sim/scripts/vsim_core.tcl +++ b/sim/scripts/vsim_core.tcl @@ -4,179 +4,192 @@ # Create group for core $1 onerror {resume} - -set core_path ${3} - -add wave -noupdate -group tile[$1]_core[$2] -group scalar_xbar ${core_path}/i_cachepool_cc/i_scalar_xbar/* - -add wave -noupdate -group tile[$1]_core[$2] -group Params ${core_path}/i_cachepool_cc/BootAddr -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/clk_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/rst_i -add wave -noupdate -group tile[$1]_core[$2] -radix unsigned ${core_path}/i_cachepool_cc/i_snitch/hart_id_i - -add wave -noupdate -group tile[$1]_core[$2] -divider Instructions -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_addr_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_data_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_valid_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/inst_ready_i - -add wave -noupdate -group tile[$1]_core[$2] -divider Load/Store -add wave 
-noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/data_req_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/data_rsp_i - -add wave -noupdate -group tile[$1]_core[$2] -divider Accelerator -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qreq_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qrsp_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qvalid_o -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_qready_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_prsp_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_pvalid_i -add wave -noupdate -group tile[$1]_core[$2] ${core_path}/i_cachepool_cc/i_snitch/acc_pready_o - -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/illegal_inst -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/stall -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_stall -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_stall -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/zero_lsb -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_d -add wave -noupdate -group tile[$1]_core[$2] -group 
Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch -divider LSU -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_size -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_amo -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qvalid -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pvalid -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_load -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_i -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_acc -add wave -noupdate -group tile[$1]_core[$2] -group Snitch -divider ALU -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/iimm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uimm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/jimm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/bimm -add wave -noupdate -group 
tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/simm -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/adder_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs1 -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs2 -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_raddr -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_rdata -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_waddr -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_wdata -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_we -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/consec_pc -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_load -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_store -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_signed -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_misaligned -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_addr_misaligned -add wave -noupdate -group 
tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/st_addr_misaligned -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/valid_instr -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/exception -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_op -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_select -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb_select -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/write_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uses_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/next_pc -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_select -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_bypass -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_branch -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_rvalue -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_en -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_register_rd -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/operands_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/dst_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch 
${core_path}/i_cachepool_cc/i_snitch/opb_ready -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_reversed -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left_result -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_ext -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result_ext -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_arithmetic -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opa -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opb -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_writeback -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_d -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_q -add wave -noupdate -group tile[$1]_core[$2] -group Snitch ${core_path}/i_cachepool_cc/i_snitch/core_events_o - -add wave -noupdate -group tile[$1]_core[$2] -group Snitch -group Internal -group RF ${core_path}/i_cachepool_cc/i_snitch/i_snitch_regfile/* -add wave -noupdate -group 
tile[$1]_core[$2] -group Snitch -group Internal ${core_path}/i_cachepool_cc/i_snitch/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_valid_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_ready_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_req_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_rsp_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_valid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_ready_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_valid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_ready_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_valid_i - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/* -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" -group FPR ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fpr/* -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" -group LSU ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fp_lsu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group Controller 
${core_path}/i_cachepool_cc/i_spatz/i_controller/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider RegisterWrite -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wvalid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider RegisterRead -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/re_i -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rvalid_o -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider Internal -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr -add wave -noupdate -group 
tile[$1]_core[$2] -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VSLDU ${core_path}/i_cachepool_cc/i_spatz/i_vsldu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VFU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group FPU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/gen_fpu/* - -add wave -noupdate -group tile[$1]_core[$2] -group Internal ${core_path}/i_cachepool_cc/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU -group ROB0 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[0]/i_reorder_buffer/* -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU -group ROB1 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[1]/i_reorder_buffer/* - -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_valid -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_ready -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_empty -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_pop -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_push -add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo_bypass +quietly WaveActivateNextPane {} 0 + +quietly set core_path ${4} 
+quietly set name g_${1}_t_${2}_c_${3} + +# Build the parent group prefix list from optional args 5 (GroupWP) and 6 (tile) +quietly set parent_grp [list] +if {$argc > 4 && "${5}" != ""} { + quietly lappend parent_grp -group ${5} +} +if {$argc > 5 && "${6}" != ""} { + quietly lappend parent_grp -group ${6} +} + +# The {*} syntax safely expands the list. +# If $parent_grp is empty, it safely ignores it instead of passing "". +add wave -noupdate {*}$parent_grp -group ${name} -group scalar_xbar ${core_path}/i_cachepool_cc/i_scalar_xbar/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Params ${core_path}/i_cachepool_cc/BootAddr +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/clk_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/rst_i +add wave -noupdate {*}$parent_grp -group ${name} -radix unsigned ${core_path}/i_cachepool_cc/i_snitch/hart_id_i + +add wave -noupdate {*}$parent_grp -group ${name} -divider Instructions +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_addr_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_data_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_valid_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/inst_ready_i + +add wave -noupdate {*}$parent_grp -group ${name} -divider Load/Store +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/data_req_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/data_rsp_i + +add wave -noupdate {*}$parent_grp -group ${name} -divider Accelerator +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qreq_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qrsp_i +add wave -noupdate {*}$parent_grp -group 
${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qvalid_o +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_qready_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_prsp_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_pvalid_i +add wave -noupdate {*}$parent_grp -group ${name} ${core_path}/i_cachepool_cc/i_snitch/acc_pready_o + +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/illegal_inst +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/stall +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_stall +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_stall +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/zero_lsb +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/pc_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/wfi_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/fcsr_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -divider LSU +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_size +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_amo +add wave -noupdate {*}$parent_grp -group ${name} 
-group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_qvalid +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pvalid +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_pready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/lsu_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_load +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_i +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/retire_acc +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -divider ALU +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/iimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/jimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/bimm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/simm +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/adder_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_result +add 
wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs1 +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rs2 +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_raddr +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_rdata +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_waddr +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_wdata +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/gpr_we +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/consec_pc +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/sb_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_load +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_store +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_signed +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ls_misaligned +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/ld_addr_misaligned +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/st_addr_misaligned +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/valid_instr +add wave 
-noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/exception +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_op +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_select +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb_select +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/write_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/uses_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/next_pc +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_select +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/rd_bypass +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/is_branch +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_rvalue +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/csr_en +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_register_rd +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/operands_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/dst_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opa_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/opb_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa +add 
wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_reversed +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left_result +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_opa_ext +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_right_result_ext +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_left +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/shift_arithmetic +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opa +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_opb +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/alu_writeback +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_cnt_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_d +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/acc_mem_str_cnt_q +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch ${core_path}/i_cachepool_cc/i_snitch/core_events_o + +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -group Internal -group RF ${core_path}/i_cachepool_cc/i_snitch/i_snitch_regfile/* +add wave -noupdate {*}$parent_grp -group ${name} -group Snitch -group Internal 
${core_path}/i_cachepool_cc/i_snitch/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_valid_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_ready_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_req_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/issue_rsp_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_valid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_ready_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/rsp_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_valid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_req_ready_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz ${core_path}/i_cachepool_cc/i_spatz/spatz_mem_rsp_valid_i + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group "FPU Sequencer" ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/* +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group "FPU Sequencer" -group FPR ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fpr/* +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group "FPU Sequencer" -group LSU ${core_path}/i_cachepool_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fp_lsu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group 
Controller ${core_path}/i_cachepool_cc/i_spatz/i_controller/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF -divider RegisterWrite +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wvalid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF -divider RegisterRead +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/re_i +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rvalid_o +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF -divider Internal +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/waddr +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wdata +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/we +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/wbe +add wave -noupdate {*}$parent_grp -group ${name} -group 
Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/raddr +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VRF ${core_path}/i_cachepool_cc/i_spatz/i_vrf/rdata + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VLSU ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VSLDU ${core_path}/i_cachepool_cc/i_spatz/i_vsldu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VFU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group FPU ${core_path}/i_cachepool_cc/i_spatz/i_vfu/gen_fpu/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Internal ${core_path}/i_cachepool_cc/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VLSU -group ROB0 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[0]/i_reorder_buffer/* +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group VLSU -group ROB1 ${core_path}/i_cachepool_cc/i_spatz/i_vlsu/gen_rob[1]/i_reorder_buffer/* + +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_valid +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_ready +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_empty +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_pop +add wave -noupdate {*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_rsp_push +add wave -noupdate 
{*}$parent_grp -group ${name} -group Spatz -group RSP_FIFO ${core_path}/i_cachepool_cc/spatz_mem_fifo_bypass diff --git a/sim/scripts/vsim_group.tcl b/sim/scripts/vsim_group.tcl index 8edb7e5..876c4b2 100644 --- a/sim/scripts/vsim_group.tcl +++ b/sim/scripts/vsim_group.tcl @@ -2,18 +2,21 @@ # Solderpad Hardware License, Version 0.51, see LICENSE for details. # SPDX-License-Identifier: SHL-0.51 -# Create group for Tile $1 onerror {resume} -set group_path $1 +quietly set group_path $1 +quietly set parent_grp $3 # Add waves for remote xbar for {set p 0} {$p < $2} {incr p} { onerror {resume} - set xbar_path ${group_path}/gen_remote_tile_xbar[$p]/i_tile_remote_xbar + quietly set xbar_path ${group_path}/gen_remote_tile_xbar[$p]/i_tile_remote_xbar - add wave -noupdate -group Group -group remote_xbar[$p] ${xbar_path}/* + add wave -noupdate -group "${parent_grp}" -group remote_xbar[$p] ${xbar_path}/* } -add wave -noupdate -group Group -group Internal ${group_path}/* +add wave -noupdate -group "${parent_grp}" -group refill_xbar -group req_xbar ${group_path}/i_refill_xbar/i_req_xbar/* +add wave -noupdate -group "${parent_grp}" -group refill_xbar -group rsp_xbar ${group_path}/i_refill_xbar/i_rsp_xbar/* + +add wave -noupdate -group "${parent_grp}" -group Internal ${group_path}/* diff --git a/sim/scripts/vsim_tile.tcl b/sim/scripts/vsim_tile.tcl index 8763440..5a9565e 100644 --- a/sim/scripts/vsim_tile.tcl +++ b/sim/scripts/vsim_tile.tcl @@ -5,38 +5,44 @@ # Create group for Tile $1 onerror {resume} -set tile_path $2 +quietly set tile_path $3 +quietly set parent_grp $4 + +# --- Configuration Variables --- +# NrTCDMPortsPerCore: 4 Spatz ports + 1 Snitch port +quietly set NUM_XBARS 5 +quietly set SNITCH_IDX [expr {$NUM_XBARS - 1}] # Add waves for tcdm_mapper and csrs -# add wave -noupdate -group tile[$1] -group Barrier ${tile_path}/i_tile/i_snitch_barrier/* -# add wave -noupdate -group tile[$1] -group axi2reqrsp ${tile_path}/i_axi2reqrsp/* +# add wave -noupdate -group ${parent_grp} 
-group tile[$1] -group Barrier ${tile_path}/i_tile/i_snitch_barrier/* +# add wave -noupdate -group ${parent_grp} -group tile[$1] -group axi2reqrsp ${tile_path}/i_axi2reqrsp/* # Add waves for xbars -add wave -noupdate -group tile[$1] -group narrow_xbar ${tile_path}/i_tile/i_axi_narrow_xbar/* -add wave -noupdate -group tile[$1] -group wide_xbar ${tile_path}/i_tile/i_axi_wide_xbar/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group narrow_xbar ${tile_path}/i_tile/i_axi_narrow_xbar/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group wide_xbar ${tile_path}/i_tile/i_axi_wide_xbar/* -add wave -noupdate -group Barrier -group tile[$1] ${tile_path}/i_tile/i_cachepool_tile_barrier/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group Barrier ${tile_path}/i_tile/i_cachepool_tile_barrier/* # Add waves for cache controller for {set c 0} {$c < 4} {incr c} { onerror {resume} - set cache_path ${tile_path}/i_tile/gen_l1_cache_ctrl[$c]/i_l1_controller + quietly set cache_path ${tile_path}/i_tile/gen_l1_cache_ctrl[$c]/i_l1_controller - add wave -noupdate -group tile[$1] -group cache[$c] -group amo ${tile_path}/i_tile/gen_cache_connect[$c]/gen_cache_amo_connect[4]/gen_amo/i_cache_amo/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group amo ${tile_path}/i_tile/gen_cache_connect[$c]/gen_cache_amo_connect[${SNITCH_IDX}]/gen_amo/i_cache_amo/* - add wave -noupdate -group tile[$1] -group cache[$c] -group coalescer ${cache_path}/i_par_coalescer_for_spatz/gen_extend_window/i_par_coalescer_extend_window/i_par_coalescer/* - add wave -noupdate -group tile[$1] -group cache[$c] -group core ${cache_path}/i_insitu_cache_tcdm_wrapper/i_insitu_cache_core/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl0 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[0]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl1 
${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[1]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl2 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[2]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group meta_ctrl3 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[3]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group coalescer ${cache_path}/i_par_coalescer_for_spatz/gen_extend_window/i_par_coalescer_extend_window/i_par_coalescer/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group core ${cache_path}/i_insitu_cache_tcdm_wrapper/i_insitu_cache_core/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl0 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[0]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl1 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[1]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl2 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[2]/i_access_ctrl_for_meta/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group meta_ctrl3 ${cache_path}/i_insitu_cache_tcdm_wrapper/gen_cache_banks[3]/i_access_ctrl_for_meta/* - add wave -noupdate -group tile[$1] -group cache[$c] -group Internal ${cache_path}/* + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache[$c] -group Internal ${cache_path}/* } -for {set c 0} {$c < 5} {incr c} { - add wave -noupdate -group tile[$1] -group cache_xbar -group xbar[$c] ${tile_path}/i_tile/gen_cache_xbar[$c]/i_cache_xbar/* +for {set c 0} {$c < $NUM_XBARS} {incr c} { + add wave -noupdate -group ${parent_grp} -group tile[$1] -group cache_xbar -group xbar[$c] 
${tile_path}/i_tile/gen_cache_xbar[$c]/gen_remote_group_slice/i_cache_xbar/* } # Add waves for remaining signals -add wave -noupdate -group tile[$1] -group Internal ${tile_path}/i_tile/* +add wave -noupdate -group ${parent_grp} -group tile[$1] -group Internal ${tile_path}/i_tile/* diff --git a/sim/scripts/vsim_wave.tcl b/sim/scripts/vsim_wave.tcl index d5fa528..a5f9347 100644 --- a/sim/scripts/vsim_wave.tcl +++ b/sim/scripts/vsim_wave.tcl @@ -1,12 +1,16 @@ -# Copyright 2021 ETH Zurich and University of Bologna. +# Copyright 2026 ETH Zurich and University of Bologna. # Solderpad Hardware License, Version 0.51, see LICENSE for details. # SPDX-License-Identifier: SHL-0.51 onerror {resume} quietly WaveActivateNextPane {} 0 -set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster -set group_path ${cluster_path}/gen_group/i_group +# --- Configuration Variables --- +quietly set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster +quietly set NUM_GROUPS 4 ;# Total number of groups +quietly set NUM_GROUPS_X 2 ;# X dimension of group mesh (NUM_GROUPS_Y = NUM_GROUPS / NUM_GROUPS_X) +quietly set NUM_TILES 4 ;# Tiles per group +quietly set NUM_CORES 4 ;# Cores per tile # Add the cluster probe add wave /tb_cachepool/cluster_probe @@ -14,23 +18,37 @@ add wave /tb_cachepool/cluster_probe # Cluster do sim/scripts/vsim_cluster.tcl ${cluster_path} -# Group -# add wave -noupdate -group Group ${group_path}/* -do sim/scripts/vsim_group.tcl ${group_path} 5 - -# Tile and Core -for {set tile 0} {$tile < 4} {incr tile} { - set tile_path ${group_path}/gen_tiles[$tile] - - do sim/scripts/vsim_tile.tcl $tile ${tile_path} - # Add all cores in Tile 0 - for {set core 0} {$core < 4} {incr core} { - set core_path ${tile_path}/i_tile/gen_core[$core] - do sim/scripts/vsim_core.tcl $tile $core ${core_path} - } - - for {set ch 0} {$ch < 4} {incr ch} { - add wave -noupdate -group DramSys$ch /tb_cachepool/gen_dram[$ch]/i_axi_dram_sim/* +# Iterate through all groups using 2D coordinates +for {set g 
0} {$g < $NUM_GROUPS} {incr g} { + quietly set gx [expr {$g % $NUM_GROUPS_X}] + quietly set gy [expr {$g / $NUM_GROUPS_X}] + quietly set group_wp_path ${cluster_path}/gen_group_y[${gy}]/gen_group_x[${gx}]/i_group + quietly set group_path ${group_wp_path}/i_group + quietly set gwp_name "GroupWP_x${gx}_y${gy}" + + # 1. Plot GroupWP signals for this group (always, all groups) + add wave -noupdate -group "${gwp_name}" ${group_wp_path}/* + + # 2. Plot Group-level signals nested inside GroupWP (always, all groups) + do sim/scripts/vsim_group.tcl ${group_path} 5 "${gwp_name}" + + # 3. Plot all tiles and cores for the diagonal groups: (0,0) always, + # and (1,1) if the mesh has at least 2 columns and 2 rows + if {($gx == 0 && $gy == 0) || ($gx == 1 && $gy == 1 && $NUM_GROUPS_X >= 2)} { + for {set tile 0} {$tile < $NUM_TILES} {incr tile} { + quietly set tile_path ${group_path}/gen_tiles[${tile}]/gen_tile + do sim/scripts/vsim_tile.tcl $tile $g ${tile_path} "${gwp_name}" + + # 4. Plot all cores grouped under their tile + for {set core 0} {$core < $NUM_CORES} {incr core} { + quietly set core_path ${tile_path}/i_tile/gen_core[${core}] + do sim/scripts/vsim_core.tcl $g $tile $core ${core_path} "${gwp_name}" "tile[${tile}]" + } + } } } +# Add DRAM waves once at the end +for {set ch 0} {$ch < 4} {incr ch} { + add wave -noupdate -group "DramSys_$ch" /tb_cachepool/gen_dram[$ch]/i_axi_dram_sim/* +} diff --git a/sim/scripts/vsim_wave_single_tile.tcl b/sim/scripts/vsim_wave_single_tile.tcl index 28e54e5..125d849 100644 --- a/sim/scripts/vsim_wave_single_tile.tcl +++ b/sim/scripts/vsim_wave_single_tile.tcl @@ -5,9 +5,10 @@ onerror {resume} quietly WaveActivateNextPane {} 0 -set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster -set group_path ${cluster_path} -set tile_path ${group_path}/gen_tile +quietly set cluster_path /tb_cachepool/i_cluster_wrapper/i_cluster +quietly set group_wp_path ${cluster_path}/gen_group[0][0]/i_group +quietly set group_path ${group_wp_path}/i_group 
+quietly set tile_path ${group_path}/gen_tiles[0]/gen_tile # Add the cluster probe @@ -15,11 +16,11 @@ add wave /tb_cachepool/cluster_probe do sim/scripts/vsim_cluster.tcl ${cluster_path} -do sim/scripts/vsim_tile.tcl 0 ${tile_path} +do sim/scripts/vsim_tile.tcl 0 0 ${tile_path} # Add all cores in Tile 0 for {set core 0} {$core < 4} {incr core} { - set core_path ${tile_path}/i_tile/gen_core[$core] - do sim/scripts/vsim_core.tcl 0 $core ${core_path} + quietly set core_path ${tile_path}/i_tile/gen_core[$core] + do sim/scripts/vsim_core.tcl 0 0 $core ${core_path} "" } for {set ch 0} {$ch < 4} {incr ch} { diff --git a/sim/sim.mk b/sim/sim.mk index 56cee77..80342e8 100644 --- a/sim/sim.mk +++ b/sim/sim.mk @@ -82,7 +82,7 @@ ${SIM_DIR}/${DPI_LIB}/cachepool_dpi.so: ${dpi_target} # ----------------- ${WORK_DIR}/${FESVR_VERSION}_unzip: mkdir -p $(dir $@) - wget -O $(dir $@)/${FESVR_VERSION} https://github.com/riscv/riscv-isa-sim/tarball/${FESVR_VERSION} + curl -fL -o $(dir $@)/${FESVR_VERSION} https://github.com/riscv/riscv-isa-sim/tarball/${FESVR_VERSION} tar xfm $(dir $@)${FESVR_VERSION} --strip-components=1 -C $(dir $@) touch $@ @@ -100,17 +100,24 @@ ${WORK_DIR}/compile.vsim.tcl: ${SNLIB_DIR}/rtl_lib.cc ${SNLIB_DIR}/common_lib.cc echo 'return 0' >> $@ # Wrapper script & GUI script +# The generated scripts derive ROOT_DIR from their own location at runtime so +# that they remain portable across different checkout paths (CI runners, moved +# repos). All absolute paths baked in by make are replaced by a single sed pass. define QUESTASIM ${VSIM} -c -do "source $<; quit" | tee $(dir $<)vsim.log @! grep -P "Errors: [1-9]*," $(dir $<)vsim.log @mkdir -p $(SIMBIN_DIR) $(SIMBIN_DIR)/logs - @echo "#!/bin/bash" > $(SIMBIN_DIR)/cachepool_cluster.vsim + @echo '#!/bin/bash' > $(SIMBIN_DIR)/cachepool_cluster.vsim + @echo 'ROOT_DIR="$$(cd "$$(dirname "$$(readlink -f "$$0")")/../.." 
&& pwd)"' >> $(SIMBIN_DIR)/cachepool_cluster.vsim @echo 'echo `realpath $$1` > ${SIMBIN_DIR}/logs/.rtlbinary' >> $(SIMBIN_DIR)/cachepool_cluster.vsim @echo '${VSIM} +permissive ${VSIM_FLAGS} -do "run -a" -work ${WORK_DIR} -c -ldflags "-Wl,-rpath,${GCC_LIB} -L${FESVR}/lib -lfesvr_vsim -lutil" $1 +permissive-off ++$$1 +PRELOAD=$$1' >> $(SIMBIN_DIR)/cachepool_cluster.vsim + @sed -i 's|$(CACHEPOOL_DIR)|$${ROOT_DIR}|g' $(SIMBIN_DIR)/cachepool_cluster.vsim @chmod +x $(SIMBIN_DIR)/cachepool_cluster.vsim - @echo "#!/bin/bash" > $(SIMBIN_DIR)/cachepool_cluster.vsim.gui + @echo '#!/bin/bash' > $(SIMBIN_DIR)/cachepool_cluster.vsim.gui + @echo 'ROOT_DIR="$$(cd "$$(dirname "$$(readlink -f "$$0")")/../.." && pwd)"' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui @echo 'echo `realpath $$1` > ${SIMBIN_DIR}/logs/.rtlbinary' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui @echo '${VSIM} +permissive ${VSIM_FLAGS} -do "log -r /*; source ${WAVE_FILE}; run -a" -work ${WORK_DIR} -ldflags "-Wl,-rpath,${GCC_LIB} -L${FESVR}/lib -lfesvr_vsim -lutil" $1 +permissive-off ++$$1 +PRELOAD=$$1' >> $(SIMBIN_DIR)/cachepool_cluster.vsim.gui + @sed -i 's|$(CACHEPOOL_DIR)|$${ROOT_DIR}|g' $(SIMBIN_DIR)/cachepool_cluster.vsim.gui @chmod +x $(SIMBIN_DIR)/cachepool_cluster.vsim.gui endef diff --git a/software/snRuntime/include/l1cache.h b/software/snRuntime/include/l1cache.h index ecde97b..ba6e9d2 100644 --- a/software/snRuntime/include/l1cache.h +++ b/software/snRuntime/include/l1cache.h @@ -17,6 +17,8 @@ void l1d_xbar_commit(); void l1d_commit(); void l1d_init(uint32_t size); void l1d_flush(); +void l1d_shared_flush(); +void l1d_private_flush(uint32_t tile); void l1d_wait(); void l1d_spm_config (uint32_t size); void l1d_part (uint32_t size); diff --git a/software/snRuntime/include/snrt.h b/software/snRuntime/include/snrt.h index ae91213..1d5358a 100644 --- a/software/snRuntime/include/snrt.h +++ b/software/snRuntime/include/snrt.h @@ -291,6 +291,14 @@ static inline void snrt_mutex_release(volatile uint32_t *pmtx) 
{ dm_exit(); \ snrt_cluster_hw_barrier(); +//================================================================================ +// Printf functions +//================================================================================ + +// Print a float value without promoting to double (avoids fcvt.d.s / fsd, +// which are illegal on rv32imaf). All arithmetic stays in single precision. +extern void snrt_printf_float(float val); + #ifdef __cplusplus } #endif diff --git a/software/snRuntime/src/l1cache.c b/software/snRuntime/src/l1cache.c index 79700d1..c52803c 100644 --- a/software/snRuntime/src/l1cache.c +++ b/software/snRuntime/src/l1cache.c @@ -20,6 +20,9 @@ void l1d_xbar_config(uint32_t offset) { (uint32_t *)(_snrt_team_current->root->cluster_mem.end + CACHEPOOL_PERIPHERAL_XBAR_OFFSET_REG_OFFSET); *cfg = offset; + // Flush cache before commit xbar changes + l1d_flush(); + l1d_wait(); l1d_xbar_commit(); } @@ -96,28 +99,29 @@ void l1d_wait() { } } -void l1d_spm_config (uint32_t size) { - // flush the cache before reconfiguration - l1d_flush(); - l1d_wait(); - // free all allocated region - snrt_l1alloc_reset(); - // set the pointers - volatile uint32_t *cfg_size = - (uint32_t *)(_snrt_team_current->root->cluster_mem.end + - CACHEPOOL_PERIPHERAL_CFG_L1D_SPM_REG_OFFSET); - volatile uint32_t *commit = - (uint32_t *)(_snrt_team_current->root->cluster_mem.end + - CACHEPOOL_PERIPHERAL_L1D_SPM_COMMIT_REG_OFFSET); - // Make sure dummy region will not be optimized away - volatile double *dummy; - // Should be (L1_size - size) * 128 - int cache_region = (128 - size) * 128; - dummy = (volatile double *)snrt_l1alloc(cache_region * sizeof(double)); - // change size and commit the change - *cfg_size = size; - *commit = 1; -} +// Used for hybrid SPM/cache, unused in CachePool now +// void l1d_spm_config (uint32_t size) { +// // flush the cache before reconfiguration +// l1d_flush(); +// l1d_wait(); +// // free all allocated region +// snrt_l1alloc_reset(); +// // set the pointers 
+// volatile uint32_t *cfg_size = +// (uint32_t *)(_snrt_team_current->root->cluster_mem.end + +// CACHEPOOL_PERIPHERAL_CFG_L1D_SPM_REG_OFFSET); +// volatile uint32_t *commit = +// (uint32_t *)(_snrt_team_current->root->cluster_mem.end + +// CACHEPOOL_PERIPHERAL_L1D_SPM_COMMIT_REG_OFFSET); +// // Make sure dummy region will not be optimized away +// volatile double *dummy; +// // Should be (L1_size - size) * 128 +// int cache_region = (128 - size) * 128; +// dummy = (volatile double *)snrt_l1alloc(cache_region * sizeof(double)); +// // change size and commit the change +// *cfg_size = size; +// *commit = 1; +// } // Used to configure the number of private cache banks per tile void l1d_part (uint32_t size) { diff --git a/software/snRuntime/src/printf.c b/software/snRuntime/src/printf.c index 744a441..7c78245 100644 --- a/software/snRuntime/src/printf.c +++ b/software/snRuntime/src/printf.c @@ -26,3 +26,33 @@ void snrt_putchar(char character); // Include the vendorized tiny printf implementation. #include "../vendor/printf.c" + +// Print a single-precision float as a decimal string without promoting to +// double. Passing a float through a variadic (...) argument promotes it to +// double per the C standard, generating fcvt.d.s / fsd which are illegal on +// rv32imaf. This wrapper takes the value as a named argument (no promotion) +// and keeps all arithmetic in single precision. +void snrt_printf_float(float val) { + uint32_t bits; + __builtin_memcpy(&bits, &val, sizeof(uint32_t)); + + uint32_t exp = (bits >> 23) & 0xFFU; + uint32_t mant = bits & 0x7FFFFFU; + + if (exp == 0xFFU) { + if (mant != 0U) + printf("NaN"); + else + printf("%sInf", (bits >> 31) ? 
"-" : ""); + return; + } + + if (bits >> 31) { + _putchar('-'); + val = -val; + } + + uint32_t int_part = (uint32_t)val; + uint32_t frac_part = (uint32_t)((val - (float)int_part) * 1000000.0f); + printf("%u.%06u", int_part, frac_part); +} diff --git a/software/snRuntime/src/start.S b/software/snRuntime/src/start.S index 34c0d93..7b2ce87 100644 --- a/software/snRuntime/src/start.S +++ b/software/snRuntime/src/start.S @@ -53,6 +53,14 @@ snrt.crt0.init_bss: blt t0, t1, 1b 2: +snrt.crt0.init_vec_registers: + li t0, -1 + vsetvli zero, t0, e32, m8, ta, ma + vmv.v.i v0, 0 + vmv.v.i v8, 0 + vmv.v.i v16, 0 + vmv.v.i v24, 0 + snrt.crt0.init_registers: # Clear FP registers csrr t0, misa diff --git a/software/tests/CMakeLists.txt b/software/tests/CMakeLists.txt index c090359..a3e85e5 100644 --- a/software/tests/CMakeLists.txt +++ b/software/tests/CMakeLists.txt @@ -123,3 +123,5 @@ add_spatz_test_oneParam(idotp-32b idotp-32b/main.c 32768) add_spatz_test_oneParam(load-store load-store/main.c 16) + +add_spatz_test_zeroParam(bandwidth bandwidth/main.c) diff --git a/software/tests/bandwidth/data/data.h b/software/tests/bandwidth/data/data.h new file mode 100644 index 0000000..c5be45f --- /dev/null +++ b/software/tests/bandwidth/data/data.h @@ -0,0 +1,336 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// This file was generated automatically by script/gen_data.py. 
+ +#include <stdint.h> + +const uint32_t M = 4096; +const uint32_t R = 8; + +static int data_dram[4096] __attribute__((section(".data"))) = { + 2, 79, -8, -86, 6, -29, 88, -80, 2, 21, -26, -13, 16, -1, 3, 51, + 30, 49, -48, -99, -13, 57, -63, 29, 91, 87, -80, 60, -43, -79, -12, -52, + -42, 69, 87, -86, 89, 89, 74, 89, -50, 7, -46, -37, 30, -50, 34, -80, + -28, 66, -83, 31, -12, -41, -87, -92, -11, -48, 29, -17, -9, 10, 87, 98, + 71, -93, 74, -66, -20, 63, -51, 3, 31, -99, 33, -47, 5, -97, -47, 90, + 45, -57, 61, 89, -87, -6, -53, -86, 99, 89, -61, -19, 10, -48, -77, 53, + 87, 23, -60, 56, -86, -56, -36, -12, -30, -92, -13, 28, 35, -38, 38, -20, + 35, 62, 62, -68, 22, -96, -60, -73, 34, -29, -89, 61, -68, -53, 50, -39, + -64, -2, 71, 3, -66, 92, 0, 74, 30, -100, -96, 41, 2, -74, 36, -86, + -11, -59, 23, 78, -38, -5, -49, -5, 31, 50, 42, 70, -72, -65, -88, 59, + -30, 86, -15, -73, -35, 69, -56, -39, 84, 33, -73, -73, 7, -57, -17, -71, + 89, -26, 27, -9, 89, 28, 20, -74, 89, 20, 15, -98, 2, 97, 99, 54, + 36, -39, 64, -50, 71, 51, -42, 17, 59, -5, 79, 12, -39, 85, -49, -89, + -62, 29, 30, 12, 0, 12, 83, -20, 86, 12, -99, 29, -47, -14, 28, 46, + 25, 29, -48, 71, 59, 97, 59, -33, 82, 83, 22, 44, -63, -77, -32, 15, + -3, 97, 38, 43, -4, 23, 86, -31, -8, -98, 47, 86, 63, 46, -11, 94, + 46, 47, -5, 98, -49, 60, 67, 27, -62, -19, 3, 28, -90, 84, 77, 50, + 58, -59, -2, -94, 43, -11, 11, -41, 12, -99, 28, -53, 39, 96, -64, 59, + -92, -2, 46, -53, 30, 47, 51, -47, 19, 60, 51, 15, -26, 12, 99, 63, + 65, 3, -17, 11, -2, 52, -8, 45, 27, 9, -19, 93, -47, 62, 88, 68, + 60, -33, -68, 41, -80, -53, 47, 27, 35, 34, 94, 44, 27, -68, 75, 86, + 14, 18, -79, 57, -63, 8, -50, 81, -93, -74, -74, -80, -71, -4, -73, 10, + 91, 96, -40, -53, 46, -97, -66, 91, -52, -84, 71, 57, -55, 16, -95, -2, + 23, -64, -77, -8, -55, 80, -6, -2, 87, 15, 90, 59, 60, -34, 27, -83, + -76, -47, -43, -34, 3, 73, -77, 13, -69, 74, -15, 50, 93, 26, 54, 29, + -84, 3, 60, 36, -58, 75, -62, 69, -75, -2, -51, 52, 51, -88, -41, 34, 
+ -44, -65, 72, -81, -36, -93, 43, 41, 14, 42, -9, -3, -35, -69, 90, -15, + -50, 52, 85, -38, 89, 24, 49, -43, -43, -15, -52, 79, 69, -31, -86, -47, + 87, 0, -93, -48, -41, 7, -96, 2, 95, -95, 8, 15, -7, -54, -2, -46, + 67, -49, 43, -88, 13, 23, 5, 57, 46, 44, 19, -38, -82, -9, -43, 82, + -11, 16, -39, -78, 26, 36, 39, 28, -43, 21, -100, -67, -5, 25, 17, -53, + -12, 16, 28, -85, 88, 91, 90, -32, -79, -8, 94, -25, 53, 43, 78, -15, + 84, -72, -32, -54, -7, 89, 96, 43, 75, -16, -62, -1, -68, 0, -78, -91, + -32, -1, -67, 79, 37, 46, 85, -5, -100, -32, -97, -85, -77, -21, -99, 27, + 59, -17, 51, 39, 77, 62, 23, -68, 60, 88, 78, 70, 0, -89, -34, -36, + 60, 67, -27, -58, -57, -72, 40, -89, -6, -55, 29, -66, -20, -11, -93, -8, + 53, -11, 61, 14, 4, 34, 95, -43, 13, -26, 56, 19, 63, -80, 63, 37, + 0, 51, 91, 76, -2, -65, -5, 51, 50, 89, -64, -89, 12, 82, -88, -78, + -2, 4, -71, -84, 12, -39, -17, 11, -15, 40, 86, -82, 76, -1, 39, 88, + 46, -25, -92, 98, -73, 27, -49, -18, 10, 43, 16, -32, -2, 39, -76, 79, + 22, -48, 50, 43, -44, -62, 8, 80, -59, 85, 66, 41, 21, 32, 62, -26, + 45, -25, -92, -27, 85, 44, -94, 73, 40, 67, 69, 36, 77, -74, 21, 93, + -96, -72, 64, 65, -18, 35, 8, -36, -15, 44, -30, -56, 31, -65, -31, 58, + -82, 88, 7, 81, 66, -10, -11, -82, -62, 25, 94, 72, 40, 25, -43, 47, + 99, -40, 26, 4, 66, -100, 30, -9, 89, 90, 12, 52, -45, 60, 65, 16, + 33, -43, -57, 72, 59, 72, -40, -54, 48, -21, 17, 63, 46, -81, 84, 45, + -54, -52, -87, 42, 58, -100, 16, -47, 17, -98, 43, 84, -89, -27, -85, 99, + 1, 51, 55, 16, -93, 21, -9, 63, -11, 35, 85, -41, 77, -73, 0, -60, + 91, 54, 90, 44, 60, 56, 40, -55, -66, 33, -19, 14, 96, -54, 52, 93, + -91, -45, -71, 8, -96, 18, -68, 17, -36, 45, 76, -90, -16, -75, -38, -15, + -42, -74, 76, 60, -3, 4, -2, 28, 48, -46, -95, 96, -6, 32, 1, -98, + 80, -78, -48, 64, -18, 44, -16, -23, 9, -100, -50, 72, -97, 89, 92, 12, + -69, -67, -9, -6, -29, -62, 53, 61, 81, 17, -98, 22, -51, -89, 92, -47, + 32, -44, 44, 11, -54, 50, -16, 41, -35, -26, 78, 
2, 65, 91, -63, -51, + -3, -19, -71, -22, -10, 78, 90, -49, 65, -22, -71, 5, -50, -20, 32, -72, + 31, 37, 83, 44, -27, -84, -17, -32, -67, -95, -48, 93, 25, -58, 14, 10, + 50, 82, -21, -6, 17, 43, -93, 31, 3, 31, 83, -76, 94, -5, 94, 54, + -8, 59, 77, -40, 21, -50, 46, -80, -96, -9, 69, -40, -79, 48, -31, -100, + 32, -89, -11, -55, -67, 76, -23, -56, 54, -28, -75, -54, 20, -45, -7, 6, + -38, -53, -40, -20, -75, -65, -100, -93, 12, -2, 79, -54, 26, -45, -87, -73, + -23, 29, 8, 53, -87, 86, -45, 14, -94, -98, 10, 50, 6, -83, -63, 14, + -86, 91, 18, -73, -62, 84, -84, -15, 25, -57, -76, 44, -88, -76, -33, 37, + -34, 8, 45, 10, 10, -67, 10, -93, 67, 12, -18, -59, 68, 0, -95, 79, + -75, -37, 86, 83, -42, 8, 97, 20, -68, 80, 49, -80, 97, -31, 11, -97, + -7, -26, 89, -39, -7, -6, 51, -46, 36, 30, -70, -61, -65, 51, -95, -35, + -26, -97, -22, 33, 17, 12, -7, 78, -39, 84, 93, -22, 35, -75, 78, 72, + -57, 32, -31, 53, 17, -33, -82, -81, 12, 39, -54, -100, -11, 41, -37, -63, + -64, 25, 38, -1, -24, -98, 60, 33, 77, -91, -96, 50, 37, 71, 29, -88, + 67, 29, -17, -36, -38, 0, -28, -84, -92, 42, 51, -63, 62, -7, -6, -52, + -32, -39, 87, 77, -23, 16, 19, 36, 61, -25, -66, -100, -61, -37, 49, -41, + -37, -8, -29, -90, 4, -87, -41, -71, -66, 64, -96, -18, 22, -23, 53, -39, + 31, -12, -59, -12, 45, -61, -29, -62, 3, -87, -69, 78, 24, 65, -4, -78, + -38, 42, 52, 44, 93, -48, 3, 3, 78, -62, 78, 97, 33, -34, 34, 78, + 99, -59, 91, -86, 10, 56, -68, 54, 63, 56, -63, 84, -4, 54, 82, -68, + 95, -15, -35, -91, -96, 22, -27, -4, 17, -63, 40, 58, -54, 8, 79, 83, + -86, 56, 35, -96, 56, -54, -33, -25, 72, 29, 54, -6, 14, 63, -65, -75, + -58, -74, -32, 47, -90, -27, 3, -63, 33, 99, -78, -54, -11, -55, 6, 39, + 40, 3, 89, 10, 3, -41, 70, -1, -33, 32, 19, -64, -29, 26, 5, -9, + -70, 21, 36, -50, -72, -61, -60, -90, 50, -100, -55, 0, 48, -11, 9, 63, + 81, -44, -100, -38, 81, -46, 14, 67, 42, 48, 74, -48, 36, 79, -44, 16, + 53, 68, -66, -38, -76, -11, -26, -63, 29, 34, 61, -10, -84, 27, 70, 86, + 
7, 78, 81, 51, -76, 98, 79, 97, -13, 60, 76, -72, -38, 49, 53, 55, + 13, -52, 98, -20, -17, 4, 76, 47, 17, -15, -38, 88, 3, -52, -30, -100, + 40, 78, 22, 83, -39, -69, 57, -72, -52, 72, -71, 43, 67, -82, -83, -100, + -23, 14, 74, 93, 12, 13, 65, 78, -38, 31, 28, -93, 56, -46, 2, 30, + -69, -91, 9, 61, 23, -14, -46, -69, 77, 34, -8, -93, -36, -44, -34, 86, + -29, -47, -34, -50, -4, -9, -93, 61, -66, -5, -13, -23, -69, 19, -55, 43, + 95, 64, 81, 41, -46, 75, -94, -27, -94, 60, 50, -16, 46, 11, 24, 46, + -65, -72, 87, -19, -99, 28, -54, 96, 47, -90, 29, 94, 18, 39, -81, -96, + -64, 65, 36, -48, 71, -2, -15, 7, -77, -20, 57, 19, -42, 27, 13, 14, + -87, -92, -61, 93, -76, 2, -79, -97, 53, 85, -72, 8, -3, -64, -26, -31, + 45, 27, -16, -59, -1, -60, 65, -67, -84, 64, 52, 21, -25, -14, 16, 5, + -15, -74, -44, -70, 33, 67, -3, -89, -48, 98, 37, 72, 44, 53, 89, 21, + -55, -37, 29, -47, 6, 92, -50, 80, -65, 14, -75, -72, 48, -90, -61, -90, + 63, 12, 86, -62, -47, -3, -46, 10, 51, 49, -32, -45, -68, -65, 47, -20, + 89, 8, 53, 93, -28, -61, -84, 28, -40, 70, 3, -59, -76, -62, -66, 30, + 7, 71, 78, 1, -3, -89, -82, 0, -57, 86, -52, 88, -84, -27, -44, -46, + -54, -89, -39, -21, -18, -93, 48, -20, 17, 97, 99, -76, -12, -89, 42, -42, + 53, 8, -75, -54, -69, -91, -85, 98, 44, 50, -75, -15, 34, -87, -22, 16, + -94, 36, 14, 75, -29, -42, -19, 15, 66, 45, 86, 2, -84, -87, 58, 51, + 7, 87, -56, -3, 30, -64, 70, 67, 82, 7, -62, -86, 19, -97, -8, 52, + 40, 60, -85, 69, 7, 25, 93, -46, 69, -67, 57, -88, 40, -83, 59, -5, + 22, -62, -55, -72, 89, -39, 84, -85, 83, 20, -91, 57, 52, 2, -17, 32, + -36, 76, -98, 72, 41, 19, 57, -33, 45, 89, -64, 52, 75, 92, 80, -28, + 42, -52, 20, -13, 95, 39, 86, -64, -40, 70, 23, 97, -62, 83, -38, -55, + -13, -90, -39, -24, 52, 98, -49, -97, -42, -29, 47, -8, -38, -47, 16, 9, + -27, 6, -44, 68, -98, 19, -95, -96, -96, -47, -54, 76, -92, -2, -81, 21, + -40, 22, 62, 77, -19, -39, 44, -98, 59, -88, 19, 24, -12, -28, -33, 41, + 0, -3, -90, 4, 83, -34, 22, -72, 36, 
94, 52, -25, 69, -92, -61, -76, + -96, 38, 86, 23, -63, -29, 32, 75, -11, 45, 97, -64, -41, 75, -73, 96, + -20, 13, -61, -56, -39, -8, 14, 85, -34, -45, -61, -3, -61, -10, -68, -95, + 46, -21, 67, -9, -4, -42, -42, -53, 68, -88, 6, -56, -12, 80, 69, -1, + 90, 54, 65, 87, -94, 32, -56, -43, -70, -78, 69, 37, 59, 6, 80, 57, + -62, 58, -19, -1, -99, -24, -89, -62, 93, -88, -55, 54, 12, -84, 3, -3, + 87, 91, -13, 8, 62, -31, 57, 98, -27, -38, 55, -41, -22, -64, -79, 95, + 47, -5, -33, 91, -84, -89, 49, 44, 37, 17, 92, -79, 19, -51, -87, 91, + 74, -72, 63, 79, 24, -94, -72, 38, -68, 33, 10, -22, 59, 78, -43, -75, + 73, -72, -97, -2, 40, -62, -86, 18, -72, 56, -26, 15, -69, 29, 74, -17, + -79, -74, 94, -20, 25, 83, -9, -80, -54, 0, 86, -17, -95, 46, 84, 96, + 30, -100, -43, 23, 65, -86, 61, 88, 62, -37, -76, -77, -89, -83, -86, -74, + -29, -41, 73, 44, -8, -48, -93, -22, -74, 73, -51, 98, 30, -8, 91, 71, + 80, -77, -41, 21, 30, 47, -55, -86, 52, 98, -84, 25, -17, -92, 65, 75, + 87, -38, 16, 29, -13, -29, 38, -37, -29, -80, -83, -37, -19, 9, -6, 65, + -69, -90, -56, -68, 1, 68, 35, -90, 78, -13, -60, 44, -25, 20, -55, 59, + -21, -47, -15, -9, 47, 60, 67, 59, -4, -99, -68, 57, 60, -24, 15, 53, + 49, 2, 93, 65, 2, -11, 73, 21, 42, 5, 18, 71, -52, 3, -40, 94, + 22, 33, -92, -95, -28, 2, 59, -60, 13, -93, -29, 77, -9, -39, -17, -39, + 10, 26, -94, -17, 31, 33, -43, -79, 53, 26, -98, 26, -60, 87, 5, -87, + -89, -14, -89, 40, -76, -56, 21, 46, -46, -1, 72, 35, 18, -48, 82, -69, + -50, -57, -19, 97, 45, 49, -64, -5, -45, 4, -42, 30, 55, -66, 88, -8, + 81, 90, -4, 34, -25, -52, -7, 58, 85, 88, -77, 47, -64, -97, 7, 25, + -81, 87, 51, -51, 76, 78, -5, 4, 63, 16, -3, -35, -22, 61, 63, -95, + -62, 69, 14, 85, 24, -89, 30, -83, -61, -80, 42, -2, -75, -93, -1, -14, + -59, 71, -87, -23, -99, 18, -64, -32, 65, -60, -39, -41, 24, 10, -87, -17, + 29, -31, 44, 22, -12, -73, -35, 62, -96, 92, 67, 83, 63, -56, 50, -54, + -70, 38, -91, 94, 17, -87, -21, -8, -91, -25, -7, 4, -100, 32, -81, 
-93, + 70, -100, -44, -37, 17, -73, 20, 6, 64, 74, 64, -52, 19, -57, -45, 55, + 10, 33, 7, 88, 47, 13, -23, 29, -64, -81, -75, -72, -100, 18, -66, 97, + -42, -32, 60, 78, -89, -81, 80, -58, 25, -51, -72, 84, -62, 83, 70, 77, + 50, 71, -63, -17, 50, 27, 25, -56, -49, -21, 77, 23, 18, -93, -16, 28, + -93, 56, -5, 1, -96, -15, 74, -67, 92, -46, 18, 70, 57, 13, 44, -75, + 35, -54, 12, -25, -29, -12, -55, 95, -68, -100, 46, -70, 41, -48, -53, -32, + 4, -11, 89, 93, 91, 97, -38, -28, 60, 68, -12, 75, -88, -28, -24, 58, + -33, -3, 24, 1, 31, 1, -82, 97, -19, 40, -78, -18, 8, -26, 28, -13, + -47, 98, -12, 42, -36, -25, 98, 42, 48, 46, -41, -8, 60, -52, -66, 71, + 1, -90, -2, 99, 9, -27, 93, 73, -92, -98, -16, -75, -29, -53, 93, 97, + 55, 16, 53, 52, -89, 32, -53, -64, -57, -86, -58, 75, -14, -31, -55, 7, + 22, 70, -80, 50, 6, -21, -54, 16, 23, 42, -76, -65, -50, -86, -33, -80, + 40, -82, -67, -23, 24, 95, 16, -49, -74, 56, 55, -66, 35, 43, 5, 63, + 57, 45, -99, 76, -24, 31, -11, -12, -7, -94, -6, 7, -27, -62, 40, 58, + -30, 90, -45, 69, 88, -59, 83, -65, -17, 95, 17, -94, -47, 28, 71, 19, + 72, 5, -54, 22, 40, 18, 49, 17, -80, 36, -75, -92, -86, -91, 21, 20, + 24, 69, -29, 51, -52, 53, -10, -64, -33, -51, 51, -82, -1, -41, 66, -12, + 45, 46, 63, 82, -32, 74, -52, 7, 60, 18, 48, 49, 75, -64, 42, 62, + 98, 59, -90, 37, -23, -24, 84, 52, -99, -37, -66, -94, -16, 45, -34, -59, + 17, 24, 66, 68, -77, 87, -55, -82, -33, 1, -59, -69, 66, 8, -65, -4, + 63, 30, -28, 35, -4, 22, -31, 68, 95, -88, -52, -41, 29, 87, -96, 71, + -74, 59, 48, -93, 99, -51, -34, -57, 24, -43, -98, -15, -31, 29, 71, -99, + 1, -66, -3, -71, 15, -93, 49, -57, 85, -65, 96, -24, -70, -54, -46, 5, + 89, 57, 20, -46, 43, -89, 44, 43, 39, 35, -98, 43, -71, -78, -9, 14, + 3, 17, -96, 3, 7, -36, 20, -71, 48, -17, 97, 49, 99, -69, 60, -85, + -63, 41, -65, 50, 13, -35, 4, -16, -99, 92, -26, -34, 79, 89, 50, 53, + 95, -46, -95, 62, 49, 70, 42, -15, -41, 0, -89, -52, -59, 65, -19, 72, + -66, 57, 57, 33, -34, -21, -94, 
79, -87, -28, 42, 48, 62, -35, -10, 29, + 62, -81, -91, -76, 93, -76, -47, -68, 63, -6, -87, -36, 9, -21, -47, 24, + 79, 30, 68, 33, -76, 40, 16, 35, 48, 95, -76, -22, -33, 72, 87, -89, + -69, 27, -5, -9, 40, -70, -23, -81, 45, -41, 90, 33, 59, -8, -58, 29, + -41, 50, -70, -96, 61, -91, -12, 29, 25, -90, 77, 58, -45, -18, 36, -2, + 48, -28, 55, 89, 14, -94, -21, 43, -4, -97, -97, 2, -61, -84, 0, 2, + 23, -49, -49, 63, 23, 65, 77, -38, 15, 90, -39, -94, 54, -9, -87, 98, + -15, 32, 82, 29, -39, 48, -22, 48, 37, -43, 19, 66, 32, -36, -88, 80, + -19, 72, 59, 63, 87, 38, 90, 48, -49, 82, 16, -75, 95, -67, -49, 11, + -19, 79, 56, 65, -59, -80, -59, 40, 12, 4, 91, -84, 19, 36, 68, 28, + 29, -41, -53, -72, -4, -91, -43, 87, -80, -73, -67, -93, 86, 99, 76, -22, + 44, 87, -14, -13, 72, -32, 59, 71, 92, -14, 0, -93, -24, 99, -14, -58, + 28, 8, 78, 51, 51, -84, 32, 47, -80, -43, 58, 9, 38, 79, 42, 34, + -73, -50, 85, -96, -96, -7, -5, -93, -37, 37, -63, 10, 11, -20, 35, 3, + 52, -28, 93, 77, 94, -52, -65, 25, -33, -23, 4, 68, 21, 18, 89, 13, + -91, 48, 63, 39, -10, -99, 17, -88, 80, 59, -6, 43, -69, -69, 23, -90, + -91, -86, -84, 67, 45, 82, 52, -42, -23, 23, 13, -79, 57, 71, -69, 78, + 73, 12, 19, 48, 21, -69, 18, -62, -89, 74, -83, 48, 40, 48, 25, 35, + -55, 3, 34, 50, -15, 49, -12, -79, -9, -72, -88, -99, -72, 79, -74, -51, + -6, 50, -13, -57, -79, 70, 64, 88, -4, 83, -98, 90, 17, -82, -78, 56, + -71, -62, 7, 10, -76, 27, 10, 35, 5, 85, 43, -29, 28, 7, -96, -98, + 92, 31, 3, -37, 5, -3, -91, 48, 27, -55, 54, -35, 68, 11, -88, 36, + 63, -15, -11, -43, -47, -89, -87, 20, 61, 80, 20, -81, -26, 43, -31, -45, + 0, 85, 31, 10, 24, 31, -81, 37, -77, 53, -64, -47, 2, 48, -63, 73, + -97, 87, -44, 72, -81, -84, 18, -30, -97, -87, -98, -21, 85, -46, 31, 19, + -77, 61, 57, -19, 46, 58, 87, -41, 46, -52, -38, 9, 49, 51, 8, -25, + -71, -43, 16, 58, -1, 91, -82, 75, 72, -7, 82, 12, 82, 45, -38, 15, + 12, -54, -44, 17, -56, 60, 44, 97, -97, 47, -54, 42, 97, -54, -23, 23, + -26, 2, 36, -11, 
43, -32, -40, 37, -51, -98, 6, 12, -7, 17, 97, -20, + 8, -50, 46, 69, -66, -55, 53, -32, -8, -37, 64, -52, 48, -27, 91, -96, + -1, -31, -11, -97, 25, -62, 62, 28, -77, -44, -1, -31, -60, 81, 73, -41, + 7, 87, -44, -70, -17, 51, -65, 57, 51, 7, 5, 57, -15, 34, 62, 82, + -87, 19, 37, -21, 43, 81, 5, -31, -57, -28, -56, 83, 6, -17, -19, 6, + 84, -80, 95, 77, 60, -94, -31, 26, 42, 59, 47, 42, 75, 98, 48, 69, + 53, 19, 27, 41, -44, 64, -43, -80, -19, -32, 30, -14, 64, -52, 46, 14, + -12, -16, 27, 39, 88, 68, 60, 77, 16, 95, 26, -30, -15, -68, -20, 51, + -40, -40, -3, -73, 85, -58, 68, 66, -15, 76, -84, -3, 97, 97, -3, -36, + -27, -16, -96, -99, -79, 11, -44, -35, -27, -3, 43, 11, 78, 24, 66, -15, + -25, -13, 67, -63, 87, -42, 37, -2, 91, -38, -52, 91, -69, 34, -80, -81, + 53, 26, -33, 9, 15, 42, 19, -22, 15, -9, -87, 54, 24, 67, -43, -73, + -75, 12, 90, -35, 52, 30, 99, -57, -4, -59, -95, 66, 79, -78, -12, 22, + 77, 70, 46, 96, -50, -90, -89, -27, -93, 51, -60, 63, -16, -29, -46, 36, + -54, 42, 75, 3, -59, -41, 97, -89, 29, -94, -21, 30, 8, -98, -74, 13, + 6, -61, 60, 28, -93, 0, 59, 23, 75, 6, 0, -97, 97, 9, -82, -69, + 36, -89, 38, -55, 18, -85, 28, -38, 8, 18, -80, 57, -88, 61, -97, 55, + 1, -42, 27, -74, -32, -76, 30, -4, 95, -76, 47, 61, 53, 25, 32, 88, + -92, 15, 77, 33, -41, 71, 20, 38, 93, 74, 7, 7, 30, -25, -71, -80, + 55, 89, 60, 62, 79, -67, -60, -50, 74, -19, -43, -26, 84, -78, 75, 27, + 49, -84, -32, 7, 38, -7, -86, -10, 62, -68, 24, -85, 61, -35, 67, 55, + -54, 66, 97, 47, 23, -100, 99, -14, 27, 90, -34, -97, 85, 52, -54, -63, + 42, 90, -80, -44, 16, 80, -55, 41, 44, -62, 36, -2, -24, 14, 33, 50, + 86, -10, -3, 19, -55, -63, 41, -1, -29, 89, -66, 46, -71, -23, -17, 12, + -76, 67, -13, -87, 53, 1, 49, 75, -18, 9, 78, 30, 69, -82, -73, 72, + -30, -38, -28, 23, 18, 20, -82, 74, 41, 19, 65, 80, -40, 63, 67, -71, + -92, -37, 60, 19, -49, -12, 41, -48, 86, -23, -60, 95, -33, -93, -99, 4, + 24, 9, -15, 68, 19, 26, 82, -85, -39, 12, 75, 57, -89, -66, -90, -37, + 
-51, 48, -32, -53, -81, 0, 94, 71, 60, 88, 36, -33, -66, -36, -93, -56, + -14, 74, 67, -25, 81, -25, 73, -54, -86, 4, 46, -75, -49, -8, 49, -80, + -50, -63, -81, 56, 78, 31, -46, 98, -9, -73, -97, 49, -25, 72, -34, -15, + 93, 61, -10, 82, 4, -100, -29, -14, 65, -95, 52, 44, -40, -89, -95, -74, + 35, 23, 5, 92, 84, -8, 8, -2, 67, 41, -66, 75, -17, -45, 76, 40, + 88, -52, -55, 12, 89, 35, -58, 47, 23, -4, -11, -62, -16, 76, 91, -39, + 5, 6, -47, 25, -92, -13, -64, -52, -60, -84, -13, -80, -50, -42, 66, 82, + -60, -62, 40, 91, -90, 46, 38, -24, 26, -55, -56, -100, 67, 7, -82, -25, + -29, 99, 88, 8, -89, 88, 60, -56, -11, 2, 96, -93, 52, -10, 73, -14, + 57, 74, 76, 74, -17, -19, 45, 93, -33, 60, 26, 63, -19, 46, 55, 67, + -17, 12, -25, 21, 42, -60, 66, -99, 90, -72, 4, 93, -50, -26, -91, 41, + -50, 17, 20, -62, -73, -98, -17, 97, 54, 16, -99, -8, 65, 30, 43, -7, + -61, -99, -20, 68, -89, 62, -36, -46, -56, 0, -39, -52, 26, 52, -78, 51, + -63, -73, -80, 81, 6, 45, 63, -2, 72, 18, -28, 80, 21, -71, 87, 7, + -3, -69, 94, 94, 55, 71, -22, -60, -38, -78, 3, -26, -53, 59, -44, 17, + -1, 23, 60, 51, 66, -67, 83, 55, 64, -94, 95, 64, -45, -36, 92, -54, + 47, -82, 56, 87, 45, 62, 70, 25, 38, -52, 12, -63, -37, -32, 35, 61, + 75, 15, 83, -36, -70, 31, 12, 8, 77, -32, 57, 19, -13, 86, -34, -26, + -56, -97, 53, 54, 73, 6, 91, -50, -51, -95, -42, 24, 42, -71, 69, 21, + 24, -64, 69, 23, 0, -66, -14, 33, 95, -83, 12, -72, -24, -42, 74, 25, + 44, 8, -79, -33, -32, 64, -52, 40, 61, 8, -15, -48, 22, -14, 78, 30, + -59, -3, 59, 8, -50, -16, 73, 26, -80, -57, 38, 2, -33, 28, -9, 90, + -77, 24, 40, 10, -3, 38, 78, -84, -2, -19, 3, 51, -82, 64, -16, -93, + -73, -96, 44, 0, 63, 17, 46, -37, -10, 16, -3, -90, -47, 72, -54, 39, + 48, 38, -96, 83, -81, 68, -55, -89, -31, -84, 14, 46, 53, -53, 7, -49, + -5, -31, 82, 1, -26, 60, 89, 50, 42, 54, 91, 78, 80, -19, -9, 78, + 74, -97, 34, 8, 62, -53, -28, -29, 15, 72, -20, -20, 12, -5, -29, 46, + -38, -63, -34, 86, -32, -88, 74, 88, -70, 13, -79, -8, 
70, -34, -47, 37, + -59, -37, 25, -80, 93, 68, 51, -66, 83, -71, -44, -54, 62, -82, 13, -93, + -75, 12, -30, -71, 34, -34, -36, -65, -76, -64, -100, 42, -64, 40, 35, 32, + 87, 27, 26, -100, 63, -66, 93, 90, 3, -78, -92, -67, 73, 27, -97, 57, + 1, -26, -2, -29, -98, 41, 87, 72, -67, -56, -40, -18, 22, 1, -85, -35, + 67, 44, 18, 61, 23, 43, 1, 87, -35, 93, -88, 83, 10, 58, -67, -62, + 94, 35, 19, 25, -46, -74, 17, -48, -62, -36, 44, 57, 80, -88, -80, 46, + -48, -82, -44, -76, -70, -20, 82, -78, -69, 89, -27, -91, 99, -47, -49, 47, + 59, 13, -87, 77, 55, 9, -48, 51, -22, -79, -71, 90, -97, -42, -93, -89, + 18, 58, -67, -19, -3, 26, -26, 4, -52, -22, 84, -8, 53, -83, -5, -63 +}; + +static uint32_t offset_dram[1024] __attribute__((section(".data"))) = { + 1024, 1408, 1472, 3136, 448, 2560, 320, 256, 2752, 3584, 4032, 1792, 1728, 640, 3584, 0, + 2688, 1152, 512, 1664, 2112, 2752, 192, 2240, 1088, 896, 3200, 768, 1152, 3072, 320, 3008, + 3520, 704, 3200, 2304, 1024, 3648, 1408, 1600, 1792, 2048, 3776, 1664, 1536, 3520, 3456, 576, + 2944, 1408, 1280, 1280, 2688, 768, 2304, 2048, 1472, 2752, 640, 1024, 192, 2944, 1536, 1856, + 512, 1280, 1216, 1152, 1152, 704, 2304, 2624, 1024, 2304, 3392, 3008, 2432, 1728, 4032, 2560, + 2176, 2944, 3008, 1088, 4032, 832, 2176, 2752, 2560, 3776, 384, 2560, 768, 1088, 2304, 768, + 2240, 1920, 64, 960, 512, 512, 2368, 640, 3392, 3520, 1344, 2048, 2240, 2816, 1408, 2816, + 2816, 2112, 512, 1472, 448, 384, 2240, 1088, 128, 1728, 320, 2752, 2752, 1280, 1728, 2496, + 3392, 3392, 0, 2240, 1280, 3776, 2688, 2176, 768, 3584, 3072, 3392, 3392, 576, 3328, 2752, + 2944, 3328, 576, 2880, 768, 2304, 2368, 1472, 64, 1344, 1024, 2240, 832, 576, 2176, 1536, + 128, 2944, 2368, 1024, 704, 3392, 3264, 1472, 1792, 2688, 1536, 2240, 3648, 3776, 1600, 2368, + 0, 1216, 3456, 1728, 960, 2944, 1024, 2496, 1728, 3648, 704, 1664, 3392, 3328, 2624, 3712, + 2944, 1856, 2560, 2112, 2304, 320, 384, 1984, 1984, 3328, 1536, 2048, 3456, 3648, 384, 3264, + 3584, 3904, 704, 
960, 960, 2688, 3328, 2240, 768, 384, 2240, 2944, 3200, 512, 1984, 640, + 448, 2048, 1408, 1536, 3520, 448, 2816, 1344, 3712, 192, 2496, 128, 1792, 2560, 1792, 2112, + 2112, 1728, 768, 896, 3776, 1408, 2752, 1600, 192, 1600, 2496, 512, 3456, 256, 3520, 1984, + 1216, 3136, 640, 896, 1984, 1728, 1344, 2560, 1024, 3712, 1600, 2752, 384, 2624, 3584, 1600, + 1536, 576, 3712, 2816, 1152, 640, 768, 3328, 448, 2496, 192, 2944, 1920, 1728, 640, 2688, + 0, 3072, 2944, 3264, 2368, 1472, 3392, 3776, 3584, 2944, 1920, 2560, 2880, 3136, 1600, 3072, + 3136, 256, 1216, 3264, 2560, 640, 3520, 3392, 2048, 3136, 2368, 1664, 2112, 896, 1984, 3840, + 3648, 1280, 2880, 256, 320, 2368, 3904, 1344, 3904, 1088, 832, 576, 2880, 768, 1920, 1856, + 128, 1856, 320, 1792, 448, 2752, 2816, 2688, 2752, 2048, 1024, 3008, 640, 768, 3136, 640, + 1920, 1728, 2752, 320, 2560, 960, 192, 2176, 3776, 3008, 3520, 4032, 3712, 1600, 3904, 3584, + 704, 640, 2304, 1536, 1088, 1152, 1408, 3456, 2368, 3392, 1088, 1600, 320, 192, 2752, 3200, + 1792, 3712, 3840, 2112, 1024, 2688, 2688, 3328, 960, 64, 2112, 3456, 448, 768, 2624, 448, + 1856, 1984, 2240, 2944, 1856, 0, 2688, 2432, 1024, 3712, 1216, 1472, 3136, 1344, 1792, 1344, + 2752, 512, 2304, 576, 2240, 2304, 1728, 4032, 1728, 2624, 3968, 2688, 1280, 1472, 2624, 3776, + 2496, 64, 3456, 448, 3008, 1664, 1344, 2304, 1664, 3584, 1792, 128, 2752, 768, 2816, 576, + 2240, 512, 3392, 3520, 3456, 1664, 2112, 3200, 0, 1984, 1536, 1792, 384, 3072, 128, 1216, + 1216, 3328, 1856, 2176, 2176, 3072, 2432, 1664, 1600, 832, 1024, 1088, 2816, 3264, 1792, 1216, + 3328, 1664, 1088, 192, 0, 3136, 0, 3648, 1600, 2816, 1216, 3264, 1408, 576, 3712, 640, + 2560, 2560, 256, 3648, 0, 1024, 1344, 2112, 3520, 2816, 2560, 1792, 192, 256, 4032, 640, + 3520, 3200, 1280, 2176, 704, 1600, 3200, 1152, 2816, 3776, 384, 2944, 3456, 384, 1600, 2496, + 256, 1280, 832, 192, 1408, 2624, 1216, 896, 3136, 3264, 2816, 1792, 448, 2560, 3008, 1152, + 3008, 2432, 2880, 1728, 2880, 3584, 1664, 192, 3648, 
896, 2176, 2304, 0, 128, 2944, 1472, + 1664, 2496, 3968, 3840, 2240, 2112, 1792, 1472, 1472, 2624, 192, 128, 2112, 3584, 3072, 1408, + 3328, 3520, 3264, 1408, 2816, 256, 1920, 3904, 3328, 3904, 2560, 3584, 3840, 3584, 3776, 3392, + 320, 2304, 3584, 576, 2496, 384, 2432, 2368, 1280, 3840, 1216, 128, 4032, 3200, 1344, 2944, + 192, 1728, 704, 2368, 3200, 3968, 2240, 2944, 2560, 256, 768, 1408, 3648, 3136, 1856, 3904, + 2304, 1536, 3008, 1664, 3200, 832, 3584, 3904, 3008, 3712, 2944, 3392, 2240, 2432, 960, 3520, + 1344, 1280, 3392, 1280, 448, 3072, 960, 3712, 3456, 1920, 2624, 576, 1984, 3712, 2112, 3968, + 3904, 512, 1024, 640, 1984, 576, 1344, 3584, 1024, 192, 3008, 3264, 2944, 1536, 3712, 1280, + 2304, 2368, 3072, 0, 3968, 1536, 3200, 2240, 1024, 2560, 3968, 1472, 64, 2496, 896, 1984, + 2432, 3712, 320, 0, 2944, 1408, 3648, 2432, 3840, 3904, 3584, 1024, 2112, 3712, 2560, 1792, + 512, 768, 320, 320, 1216, 3264, 3328, 128, 1024, 3968, 3072, 704, 3008, 2944, 2816, 3456, + 4032, 0, 1024, 128, 1600, 896, 2240, 1984, 1984, 4032, 192, 1088, 3072, 1152, 2688, 3264, + 2176, 2496, 832, 3648, 2048, 832, 2112, 3840, 3456, 192, 1472, 3712, 1792, 1152, 320, 3584, + 2048, 3072, 2496, 1152, 3328, 0, 256, 1664, 960, 320, 2496, 768, 3008, 4032, 3904, 2176, + 1152, 704, 960, 3712, 2688, 576, 1536, 1216, 1920, 192, 2752, 1408, 64, 3520, 1344, 3200, + 3584, 2304, 448, 384, 832, 3520, 3456, 128, 3136, 960, 3456, 1152, 3200, 1088, 3264, 960, + 448, 3776, 2048, 3072, 704, 3904, 3328, 640, 256, 1664, 704, 1024, 320, 2048, 256, 3584, + 1728, 832, 256, 3392, 3520, 3712, 2048, 3904, 3136, 3008, 2560, 1152, 1344, 256, 3712, 64, + 1856, 2944, 2432, 192, 2048, 768, 1216, 3392, 384, 896, 1408, 832, 1408, 1152, 4032, 3392, + 1792, 3520, 1216, 1792, 1472, 1088, 3968, 576, 2560, 3584, 448, 1600, 3904, 1792, 576, 1024, + 64, 320, 3648, 256, 1728, 3904, 2560, 256, 1088, 1600, 1216, 2560, 3072, 1664, 3072, 704, + 384, 3712, 128, 576, 2112, 1536, 2176, 960, 64, 3776, 1472, 576, 2368, 1344, 1600, 2688, + 
3456, 2432, 1664, 1792, 2944, 2368, 1216, 1664, 2560, 3392, 1216, 2752, 1152, 1216, 2176, 2944, + 64, 1856, 3328, 2112, 1344, 128, 320, 896, 1664, 576, 512, 2048, 1984, 3008, 320, 2560, + 3712, 2240, 3712, 3776, 64, 1792, 192, 0, 1152, 1600, 1536, 576, 3328, 896, 3712, 1344, + 3200, 3008, 1280, 3712, 3520, 1344, 2240, 2112, 1856, 1536, 2816, 1728, 0, 3072, 2816, 192, + 384, 3072, 512, 2240, 1152, 0, 576, 2240, 512, 2816, 2880, 2624, 1984, 1984, 320, 3008, + 2368, 1024, 576, 3136, 1664, 3392, 3136, 2176, 3264, 3712, 1280, 3968, 1536, 1728, 2944, 896, + 192, 1792, 3712, 512, 1664, 1600, 1152, 3520, 1728, 1024, 2688, 3584, 2560, 2624, 3136, 1216, + 2816, 2496, 2432, 3072, 3328, 1408, 2944, 1984, 3264, 2944, 4032, 3968, 2048, 3584, 2880, 256 +}; + diff --git a/software/tests/bandwidth/main.c b/software/tests/bandwidth/main.c new file mode 100644 index 0000000..a817c18 --- /dev/null +++ b/software/tests/bandwidth/main.c @@ -0,0 +1,157 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Diyou Shen + +#include +#include +#include +#include + +#include "data/data.h" + +// #define DEBUG + +// Random-load bandwidth benchmark — measures L1-to-core interconnect bandwidth. +// +// Phase 1 (warmup): all cores together stream through all of data_dram so that +// every cache line is resident in the L1 before measurement begins. 
+// Requires M * sizeof(int) <= total L1 cache capacity. +// +// Phase 2 (measurement): each core issues random vector loads drawn from a +// pre-generated offset table. Because all data is already cached, this +// measures the L1 hit bandwidth (local + remote tile xbar), not DRAM refill +// latency. No correctness check is performed — only the cycle count matters. + +int main() { + const uint32_t num_cores = snrt_cluster_core_num(); + const uint32_t cid = snrt_cluster_core_idx(); + const uint32_t c_per_tile = 4; + const uint32_t cid_tile = cid % c_per_tile; + + // Spatz vector config: lmul=4, element width 32 b, VLEN=512 b + const uint32_t lmul = 4; + const uint32_t vlen_bits = 512; + const uint32_t elem_bits = 32; + // Elements transferred by one vle32.v with lmul=4 + const uint32_t v_len = lmul * vlen_bits / elem_bits; + + const uint32_t measure_iterations = R; + + // offset = 6 = log2(64 B cacheline) — the hardware minimum. + // + // With this offset and 256-byte-aligned accesses (step = v_len = 64 elems): + // addr[7:6] = bank within tile (changes at +64, +128, +192 → banks 0..3) + // addr[9:8] = tile ID (unchanged across the 256-byte span) + // + // Every vle32.v load (256 bytes, 4 cachelines) therefore maps entirely to + // ONE tile while distributing its 4 cachelines across the 4 banks of that + // tile. The random offset in offset_dram picks which tile is targeted + // (local or remote), so ~75 % of loads hit a remote tile and generate + // traffic on the inter-tile xbar. + const uint32_t scramble_bits = 6; + + if (cid == 0) { + l1d_xbar_config(scramble_bits); + // Fully shared + l1d_part(4); +#ifdef DEBUG + printf("scramble_bits=%u v_len=%u\n", scramble_bits, v_len); +#endif + } + + snrt_cluster_hw_barrier(); + + // ----------------------------------------------------------------------- + // Phase 1: warmup — fill L1 cache with all of data_dram. 
+ // Each core streams through its own slice (M / num_cores elements) using + // the widest LMUL so that every tile's banks are populated in parallel. + // ----------------------------------------------------------------------- + // const uint32_t elems_per_core = M / num_cores; + const uint32_t elems_per_core = M; + // const int *wp = data_dram + cid * elems_per_core; + const int *wp = data_dram + cid_tile * elems_per_core; + uint32_t avl = elems_per_core; + uint32_t wvl; + do { + asm volatile("vsetvli %0, %1, e32, m8, ta, ma" : "=r"(wvl) : "r"(avl)); + asm volatile("vle32.v v0, (%0)" :: "r"(wp)); + wp += wvl; + avl -= wvl; + } while (avl > 0); + + // Barrier: every tile's banks must be populated before measurement starts. + snrt_cluster_hw_barrier(); + + // ----------------------------------------------------------------------- + // Phase 2: measurement — random loads from the now-hot cache. + // ----------------------------------------------------------------------- + + // Per-core pointer into the offset table. + // Layout: interleaved by core — + // [core0_round0, core1_round0, ..., coreN_round0, core0_round1, ...] + const uint32_t *offset_p = offset_dram + cid; + + const int *data = data_dram; + const int *addr1, *addr2, *addr3, *addr4; + + uint32_t vl; + asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(v_len)); + + uint32_t timer = 0; + + if (cid == 0) { + start_kernel(); + timer = benchmark_get_cycle(); + } + + // Four loads per inner iteration to overlap address computation with loads. 
+ for (uint32_t i = 0; i < measure_iterations / 4; i++) { + addr1 = data + *offset_p; offset_p += num_cores; + addr2 = data + *offset_p; offset_p += num_cores; + asm volatile("vle32.v v0, (%0)" :: "r"(addr1)); + asm volatile("vle32.v v4, (%0)" :: "r"(addr2)); + addr3 = data + *offset_p; offset_p += num_cores; + addr4 = data + *offset_p; offset_p += num_cores; + asm volatile("vle32.v v8, (%0)" :: "r"(addr3)); + asm volatile("vle32.v v12, (%0)" :: "r"(addr4)); + } + + snrt_cluster_hw_barrier(); + + if (cid == 0) { + timer = benchmark_get_cycle() - timer; + stop_kernel(); + + // elements loaded by one core / timer + uint32_t performance = measure_iterations * v_len * 1000 / timer; + // 1000‰ = one vector load per cycle (peak throughput) + uint32_t utilization = performance / v_len; + + printf("\n----- random-load bw: %u iters x %u elems -----\n", + measure_iterations, v_len); + printf("Total cycles: %u, avg per load: %u\n", + timer, timer / measure_iterations); + printf("Performance: %u elems/1000cyc (%u%%o utilization)\n", + performance, utilization); + + write_cyc(timer); + } + + snrt_cluster_hw_barrier(); + + return 0; +} diff --git a/software/tests/bandwidth/script/bw.json b/software/tests/bandwidth/script/bw.json new file mode 100644 index 0000000..7abcd4d --- /dev/null +++ b/software/tests/bandwidth/script/bw.json @@ -0,0 +1,17 @@ +// Parameters for CachePool Random-Load Bandwidth Benchmark +// +// M : elements in data_dram (4096 int32 = 16 KB; exceeds default L1 to +// force DRAM refills on random accesses) +// round: measurement rounds per core; must be divisible by 4 +// prec : element width in bits (32 = int) +// core : active cores (matches default cluster: 4 tiles × 4 cores = 16) +// step : load granularity in elements = v_len for lmul=4, VLEN=512 +// (4 × 512/32 = 64 elements per vle32.v) + +{ + M: 4096, + round: 64, + prec: 32, + core: 16, + step: 64 +} diff --git a/software/tests/bandwidth/script/gen_data.py b/software/tests/bandwidth/script/gen_data.py 
new file mode 100644 index 0000000..38408d8 --- /dev/null +++ b/software/tests/bandwidth/script/gen_data.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Diyou Shen +# +# Generate data/data.h for the CachePool random-load bandwidth benchmark. +# +# Layout in the generated header: +# const uint32_t M = ; +# const uint32_t R = ; +# static int data_dram[M] — random payload in DRAM +# static uint32_t offset_dram[R * cores] — interleaved per-core offsets +# +# Each offset is a v_len-aligned element index into data_dram (step = v_len). +# Offsets span the full data_dram range so that repeated accesses across all +# rounds cover more than the L1 cache capacity and exercise DRAM bandwidth. + +import numpy as np +import argparse +import pathlib +import hjson + +np.random.seed(42) + + +def array_to_cstr(a): + """Format a numpy integer array as a C initialiser list.""" + out = '{\n' + values_per_line = 16 + flat = a.flatten() + for i in range(0, len(flat), values_per_line): + chunk = flat[i:i + values_per_line] + line = ', '.join(str(v) for v in chunk) + if i + values_per_line < len(flat): + out += ' ' + line + ',\n' + else: + out += ' ' + line + '\n' + out += '}' + return out + + +def rand_data_generator(shape, prec): + """Return a random integer numpy array of the requested precision.""" + dtype_map = {64: np.int64, 32: np.int32, 16: np.int16, 8: np.int8} + dtype = dtype_map[prec] + return np.random.randint(-100, 100, size=shape, dtype=dtype) + + +def rand_offset_generator(num_entries, data_elems, step): + """Return v_len-aligned random element offsets that span data_dram. + + Each offset satisfies: + offset + step <= data_elems (in-bounds for a v_len-wide load) + Offsets are expressed in elements (not bytes). 
+ + Parameters + ---------- + num_entries : int + Total number of offset entries (cores * rounds). + data_elems : int + Size of data_dram in elements. + step : int + Load granularity in elements (= v_len, e.g. 64 for lmul=4 + VLEN=512). + """ + max_idx = data_elems // step - 1 # last valid aligned block index + if max_idx < 0: + raise ValueError(f"data_elems ({data_elems}) < step ({step})") + indices = np.random.randint(0, max_idx + 1, size=num_entries, dtype=np.uint32) + return indices * step + + +def emit_header(data_arr, offset_arr, M, R, cores, prec): + ctypes = {64: 'int64_t', 32: 'int', 16: 'int16_t', 8: 'int8_t'} + dtype = ctypes[prec] + + offset_size = R * cores + + s = '// Copyright 2022 ETH Zurich and University of Bologna.\n' + s += '// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n' + s += '// SPDX-License-Identifier: Apache-2.0\n' + s += '// This file was generated automatically by script/gen_data.py.\n\n' + s += '#include \n\n' + s += f'const uint32_t M = {M};\n' + s += f'const uint32_t R = {R};\n\n' + # data_dram: payload array accessed by random vector loads + s += (f'static {dtype} data_dram[{M}]' + f' __attribute__((section(".data"))) = ' + + array_to_cstr(data_arr) + ';\n\n') + # offset_dram: interleaved per-core random element offsets + # Layout: [core0_r0, core1_r0, ..., coreN_r0, core0_r1, ...] + s += (f'static uint32_t offset_dram[{offset_size}]' + f' __attribute__((section(".data"))) = ' + + array_to_cstr(offset_arr) + ';\n\n') + return s + + +def main(): + parser = argparse.ArgumentParser( + description='Generate data.h for the CachePool bandwidth benchmark') + parser.add_argument('-c', '--cfg', type=pathlib.Path, required=True, + help='Path to parameter JSON (e.g. 
script/bw.json)') + args = parser.parse_args() + + with args.cfg.open() as f: + p = hjson.loads(f.read()) + + M = p['M'] # elements in data_dram + R = p['round'] # measurement rounds per core + prec = p['prec'] # element width in bits (32) + cores = p['core'] # number of active cores + step = p['step'] # load granularity in elements (= v_len) + + data_arr = rand_data_generator((M,), prec) + offset_arr = rand_offset_generator(R * cores, M, step) + + out_path = pathlib.Path(__file__).parent.parent / 'data' / 'data.h' + out_path.parent.mkdir(parents=True, exist_ok=True) + with out_path.open('w') as f: + f.write(emit_header(data_arr, offset_arr, M, R, cores, prec)) + + print(f'Generated {out_path} (M={M}, R={R}, cores={cores}, step={step})') + + +if __name__ == '__main__': + main() diff --git a/software/tests/byte-enable/main.c b/software/tests/byte-enable/main.c index 7266687..3d64f07 100644 --- a/software/tests/byte-enable/main.c +++ b/software/tests/byte-enable/main.c @@ -400,7 +400,6 @@ int main(void) { const unsigned int core_id = snrt_cluster_core_idx(); if (core_id == 0) { - l1d_init(0); uint32_t offset = 31U - __builtin_clz((unsigned int)L1LineWidth); l1d_xbar_config(offset); } diff --git a/software/tests/fdotp-32b/main.c b/software/tests/fdotp-32b/main.c index 0592ff3..bf2ccb8 100644 --- a/software/tests/fdotp-32b/main.c +++ b/software/tests/fdotp-32b/main.c @@ -56,7 +56,7 @@ int main() { } else { if (cid == 0) { printf("FATAL: Problem size too small!\n"); - return 0; + return -2; } } @@ -81,8 +81,6 @@ int main() { if (cid == 0) { // Set xbar policy l1d_xbar_config(l1_scramble_bits); - // Initialize the cache - l1d_init(0); } snrt_cluster_hw_barrier(); @@ -126,7 +124,7 @@ int main() { else if (lmul >= 1) acc = fdotp_v32b_lmul1(a_int, b_int, elem_jump_per_round, elem_per_round, rounds); else - return 0; + return -3; result[cid] = acc; @@ -143,11 +141,22 @@ int main() { stop_kernel(); } - // Final reduction + // Final reduction: two-level tree with group size 4 
+ const uint32_t red_group = 4; + + // Level 1: lead core of each group accumulates its group + if (cid % red_group == 0) { + for (uint32_t i = 1; i < red_group && (cid + i) < num_cores; ++i) + acc += result[cid + i]; + result[cid] = acc; + } + + snrt_cluster_hw_barrier(); + + // Level 2: core 0 sums all group results if (cid == 0) { - // timer_tmp = benchmark_get_cycle() - timer_tmp; - for (uint32_t i = 1; i < num_cores; ++i) - acc += result[i]; + for (uint32_t g = red_group; g < num_cores; g += red_group) + acc += result[g]; result[0] = acc; } @@ -176,6 +185,12 @@ int main() { if (cid == 0) { if (fp_check(result[0], dotp_result*measure_iter)) { printf("Check Failed!\n"); + printf("Calc:"); + snrt_printf_float(result[0]); + printf(", Exp:"); + snrt_printf_float((float)(dotp_result * measure_iter)); + printf("\n"); + return -1; } } diff --git a/software/tests/fmatmul-32b/main.c b/software/tests/fmatmul-32b/main.c index abaf908..a36c01f 100644 --- a/software/tests/fmatmul-32b/main.c +++ b/software/tests/fmatmul-32b/main.c @@ -77,10 +77,8 @@ int main() { // Set xbar policy // All cores will access the same B // Scramble based on cacheline - // l1d_xbar_config(5); l1d_xbar_config(5); - // Init the cache - l1d_init(0); + l1d_part(4); } // Wait for all cores to finish @@ -137,7 +135,7 @@ int main() { } else if (kernel_size == 8) { matmul_8xVL(gemm_C_dram, gemm_A_dram, gemm_B_dram, m_start, m_end, gemm_l.K, gemm_l.N, p_start, p_end); } else { - return -2; + return -1; } // Wait for all cores to finish @@ -164,9 +162,13 @@ int main() { snrt_cluster_hw_barrier(); if (cid == 0) { - for (uint32_t j = 0; j < num_cores; j++) { - printf("Core %d error %d\n", j, error[j]); - // error[0] += error[j]; + if (error[0] != 0) + printf("Core 0 error %d\n", error[0]); + + for (uint32_t j = 1; j < num_cores; j++) { + error[0] += error[j]; + if (error[j] != 0) + printf("Core %d error %d\n", j, error[j]); } } else { @@ -174,14 +176,6 @@ int main() { } snrt_cluster_hw_barrier(); - - // if 
(error[0] != 0) { - // if (cid == 0) { - // printf("Check failed, error count:%d\n", error[0]); - // // printf("First iter took %u cycles\n", timer_iter1); - // } - // // return -1; - // } } } @@ -207,6 +201,8 @@ int main() { // Wait for all cores to finish snrt_cluster_hw_barrier(); + if (error[0] > 0) + return -1; return 0; } diff --git a/software/tests/gemv/main.c b/software/tests/gemv/main.c index 6fb3bd0..fd0ae8f 100644 --- a/software/tests/gemv/main.c +++ b/software/tests/gemv/main.c @@ -62,8 +62,9 @@ int main() { // Allocate the matrices if (cid == 0) { - // Set xbar policy + // We use all-private mode for this kernel l1d_xbar_config(offset); + l1d_part(4); } // Reset timer @@ -126,7 +127,11 @@ int main() { for (uint32_t j = 0; j < gemv_l.M; j++) { if (fp_check(&result[j], &gemv_result[j])) { - printf("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], gemv_result[i]); + printf("Error: ID: %i Calc", i); + snrt_printf_float(result[i]); + printf(",Exp:"); + snrt_printf_float(gemv_result[i]); + printf("\n"); } } } diff --git a/software/tests/idotp-32b/main.c b/software/tests/idotp-32b/main.c index bbd21ce..0143e31 100644 --- a/software/tests/idotp-32b/main.c +++ b/software/tests/idotp-32b/main.c @@ -51,9 +51,7 @@ int main() { if (cid == 0) { // Set xbar policy - l1d_init(0); l1d_xbar_config(offset); - // Initialize the cache printf ("round:%u, lmul:%u, dim:%u\n", rounds, lmul, dim); } diff --git a/software/tests/load-store/main.c b/software/tests/load-store/main.c index 4c43947..79ccec1 100644 --- a/software/tests/load-store/main.c +++ b/software/tests/load-store/main.c @@ -105,7 +105,6 @@ static int check_const(uint32_t *ptr, uint32_t count, uint32_t value, static void cache_cfg(uint32_t cid, uint32_t xbar_offset, uint32_t part) { if (cid == 0) { l1d_xbar_config(xbar_offset); - l1d_init(0); l1d_part(part); } sync_all(); diff --git a/toolchain.mk b/toolchain.mk index 2d40a65..97e58f8 100644 --- a/toolchain.mk +++ b/toolchain.mk @@ -69,7 +69,7 @@ 
${TOOLCHAIN_DIR}/riscv-isa-sim: ${TOOLCHAIN_DIR}/riscv-isa-sim.version ${TOOLCHAIN_DIR}/dtc: mkdir -p ${TOOLCHAIN_DIR}/dtc - cd ${TOOLCHAIN_DIR}/dtc && wget -c https://git.kernel.org/pub/scm/utils/dtc/dtc.git/snapshot/dtc-1.7.0.tar.gz + cd ${TOOLCHAIN_DIR}/dtc && curl -fLO https://git.kernel.org/pub/scm/utils/dtc/dtc.git/snapshot/dtc-1.7.0.tar.gz cd ${TOOLCHAIN_DIR}/dtc && tar xf dtc-1.7.0.tar.gz # ---------- Build toolchains ---------- diff --git a/util/auto-benchmark/check-ci.py b/util/auto-benchmark/check-ci.py index fa4ef63..1bbf5e4 100644 --- a/util/auto-benchmark/check-ci.py +++ b/util/auto-benchmark/check-ci.py @@ -20,7 +20,7 @@ def main(): # Matches "error " anywhere in a line, captured as group 1. error_val_re = re.compile(r'\berror\s+(\d+)\b', re.IGNORECASE) # Matches FAIL or [FAIL] anywhere in a line. - fail_re = re.compile(r'\bFAIL\b', re.IGNORECASE) + fail_re = re.compile(r'\bFailed\b', re.IGNORECASE) failures = [] diff --git a/util/auto-benchmark/configs-ci.sh b/util/auto-benchmark/configs-ci.sh index 11fe23e..70557e2 100644 --- a/util/auto-benchmark/configs-ci.sh +++ b/util/auto-benchmark/configs-ci.sh @@ -1,5 +1,5 @@ # Configs and kernel suffixes (without prefix) -CONFIGS="cachepool_fpu_512" -KERNELS="spin-lock load-store_M16 fdotp-32b_M32768 gemv-opt_M512_N128_K32 fmatmul-32b_M32_N32_K32 fft-32b_M1024_N16 multi_producer_single_consumer_double_linked_list_M1_N1350_K10 byte-enable" +CONFIGS="cachepool_fpu_2g" +KERNELS="spin-lock load-store_M16 fdotp-32b_M32768 gemv_M512_N128_K32 fmatmul-32b_M64_N64_K64 fft-32b_M1024_N16 multi_producer_single_consumer_double_linked_list_M1_N1350_K10 byte-enable" PREFIX="test-cachepool-" # common prefix for all kernels ROOT_PATH=../.. 
# adjust if needed (path to repo root) diff --git a/util/auto-benchmark/configs.sh b/util/auto-benchmark/configs.sh index 1a545be..bb8fa6c 100755 --- a/util/auto-benchmark/configs.sh +++ b/util/auto-benchmark/configs.sh @@ -1,12 +1,8 @@ # Configs and kernel suffixes (without prefix) -# CONFIGS="cachepool_fpu_512" -CONFIGS="cachepool_fpu_128 cachepool_fpu_256 cachepool_fpu_512" +CONFIGS="cachepool_fpu_2g cachepool_fpu_4g" -# KERNELS="spin-lock fdotp-32b_M8192 fmatmul-32b_M32_N32_K32" -# KERNELS="fdotp-32b_M65536 gemv-opt_M1024_N128_K32 gemv_M1024_N128_K32" KERNELS="spin-lock fdotp-32b_M65536 gemv-opt_M1024_N128_K32 gemv_M1024_N128_K32 fmatmul-32b_M64_N64_K64 multi_producer_single_consumer_double_linked_list_M1_N1350_K100 byte-enable" -# KERNELS="spin-lock fdotp-32b_M32768" PREFIX="test-cachepool-" # common prefix for all kernels ROOT_PATH=../.. # adjust if needed (path to repo root) diff --git a/util/auto-benchmark/write_results.py b/util/auto-benchmark/write_results.py index 4d254fe..2035e38 100644 --- a/util/auto-benchmark/write_results.py +++ b/util/auto-benchmark/write_results.py @@ -18,7 +18,7 @@ def extract_uart_lines(input_file_path, output_file_path, config=None, kernel=No # Copy only lines containing '[UART]' for line in input_file: - if '[UART]' in line: + if '[UART]' in line or '[EOC]' in line: output_file.write(line) output_file.write("\n----------------------------------------\n")