From daf951b3f80e76ddab9c4d3159780832bf02c332 Mon Sep 17 00:00:00 2001 From: Yinrong Li Date: Sun, 24 May 2026 03:19:44 +0200 Subject: [PATCH 1/4] [hardware] Unify the SPM address scrambler and tile interconnect --- Bender.yml | 6 +- hardware/src/address_scrambler.sv | 89 ---- hardware/src/mempool_addr_scrambler.sv | 95 +++++ hardware/src/mempool_bank_id_remapper.sv | 89 ---- hardware/src/mempool_dma_tile_id_remapper.sv | 43 -- hardware/src/mempool_group.sv | 27 +- hardware/src/mempool_tcdm_bank_interco.sv | 415 +++++++++++++++++++ hardware/src/mempool_tile.sv | 276 ++++-------- hardware/src/tcdm_shim.sv | 40 +- hardware/src/tcdm_wide_narrow_mux.sv | 143 ------- 10 files changed, 615 insertions(+), 608 deletions(-) delete mode 100644 hardware/src/address_scrambler.sv create mode 100644 hardware/src/mempool_addr_scrambler.sv delete mode 100644 hardware/src/mempool_bank_id_remapper.sv delete mode 100644 hardware/src/mempool_dma_tile_id_remapper.sv create mode 100644 hardware/src/mempool_tcdm_bank_interco.sv delete mode 100644 hardware/src/tcdm_wide_narrow_mux.sv diff --git a/Bender.yml b/Bender.yml index ccf4897b..6629bcd1 100644 --- a/Bender.yml +++ b/Bender.yml @@ -34,13 +34,11 @@ sources: - hardware/src/snitch_addr_demux.sv - hardware/src/tcdm_adapter.sv - hardware/src/tcdm_shim.sv - - hardware/src/tcdm_wide_narrow_mux.sv - - hardware/src/address_scrambler.sv + - hardware/src/mempool_addr_scrambler.sv + - hardware/src/mempool_tcdm_bank_interco.sv - hardware/src/axi_L2_interleaver.sv - hardware/src/bootrom.sv - hardware/src/selector.sv - - hardware/src/mempool_bank_id_remapper.sv - - hardware/src/mempool_dma_tile_id_remapper.sv - hardware/src/mempool_tile_rw_demux.sv - hardware/src/control_registers/control_registers_reg_pkg.sv - hardware/src/control_registers/control_registers_reg_top.sv diff --git a/hardware/src/address_scrambler.sv b/hardware/src/address_scrambler.sv deleted file mode 100644 index 78377a48..00000000 --- a/hardware/src/address_scrambler.sv +++ 
/dev/null @@ -1,89 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -// Description: Scrambles the address in such a way, that part of the memory is accessed -// sequentially and part is interleaved. -// Current constraints: - -// Author: Samuel Riedel - -module address_scrambler -#( - parameter int unsigned AddrWidth = 32, - parameter int unsigned ByteOffset = 2, - parameter int unsigned NumTiles = 2, - parameter int unsigned NumTilesPerDma = 16, - parameter int unsigned NumBanksPerTile = 2, - parameter bit Bypass = 0, - parameter int unsigned SeqMemSizePerTile = 4*1024, - parameter logic [AddrWidth-1:0] TCDMBaseAddr = 32'b0, - parameter logic [31:0] TCDMMask = '1 << 28 -) ( - input logic [AddrWidth-1:0] address_i, - output logic [AddrWidth-1:0] address_o -); - localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); - localparam int unsigned TileIdBits = $clog2(NumTiles); - localparam int unsigned TileIdBitsPerDma = $clog2(NumTilesPerDma); - localparam int unsigned SeqPerTileBits = $clog2(SeqMemSizePerTile); - localparam int unsigned SeqTotalBits = SeqPerTileBits+TileIdBits; - localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; - localparam int unsigned ScrambleBits = SeqPerTileBits-ConstantBitsLSB; - - logic not_io_address; - assign not_io_address = (address_i & TCDMMask) == TCDMBaseAddr; - - function automatic logic [TileIdBitsPerDma-1:0] spm_tile_id_remap ( - logic [TileIdBitsPerDma-1:0] data_in, - logic [TileIdBitsPerDma-1:0] idx_i - ); - if (mempool_pkg::TileIdRemap == 1) begin - spm_tile_id_remap = data_in + idx_i; - end else begin - spm_tile_id_remap = data_in; - end - endfunction - - if (Bypass || NumTiles < 2) begin - assign address_o = address_i; - end else begin - logic [ScrambleBits-1:0] scramble; // Address bits that have to be shuffled around - logic [TileIdBits-1:0] tile_id; // Which tile does 
this address region belong to - - // Leave this part of the address unchanged - // The LSBs that correspond to the offset inside a tile. These are the byte offset (bank width) - // and the Bank offset (Number of Banks in tile) - assign address_o[ConstantBitsLSB-1:0] = address_i[ConstantBitsLSB-1:0]; - // The MSBs that are outside of the sequential memory size. Currently the sequential memory size - // always starts at 0. These are all the MSBs up to SeqMemSizePerTile*NumTiles - assign address_o[AddrWidth-1:SeqTotalBits] = address_i[AddrWidth-1:SeqTotalBits]; - - // Scramble the middle part - // Bits that would have gone to different tiles but now go to increasing lines in the same tile - assign scramble = address_i[SeqPerTileBits-1:ConstantBitsLSB]; // Bits that would - // Bits that would have gone to increasing lines in the same tile but now go to different tiles - assign tile_id = address_i[SeqTotalBits-1:SeqPerTileBits]; - - always_comb begin - // Default: Unscrambled - address_o[SeqTotalBits-1:ConstantBitsLSB] = {tile_id, scramble}; - // If not in bypass mode and address is in sequential region and more than one tile - if (address_i < (NumTiles * SeqMemSizePerTile)) begin - address_o[SeqTotalBits-1:ConstantBitsLSB] = {scramble, tile_id}; - end else if(not_io_address) begin - address_o[ConstantBitsLSB +: TileIdBitsPerDma] = - spm_tile_id_remap( - address_i[ConstantBitsLSB +: TileIdBitsPerDma], - address_i[(ConstantBitsLSB + TileIdBits) +: TileIdBitsPerDma] - ); - end - end - end - - // Check for unsupported configurations - if (NumBanksPerTile < 2) - $fatal(1, "NumBanksPerTile must be greater than 2. 
The special case '1' is currently not supported!"); - if (SeqMemSizePerTile % (2**ByteOffset*NumBanksPerTile) != 0) - $fatal(1, "SeqMemSizePerTile must be a multiple of BankWidth*NumBanksPerTile!"); -endmodule : address_scrambler diff --git a/hardware/src/mempool_addr_scrambler.sv b/hardware/src/mempool_addr_scrambler.sv new file mode 100644 index 00000000..0f8ef2c4 --- /dev/null +++ b/hardware/src/mempool_addr_scrambler.sv @@ -0,0 +1,95 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// SPM address scrambler. Single combinational module applying up to two +// independently-gated address transforms: +// Stage 1 (EnableSeqInterleaveSwap, sequential region): swap the +// {tile_id, scramble} fields so HW sees an interleaved-bank layout. +// Stage 2 (EnableTileIdRemap, non-sequential TCDM): spread accesses +// across the NumTilesPerDma tiles of a DMA group. +// +// Used by snitch cores pre-tcdm_shim (both stages on) and by the DMA +// tile-id path in mempool_group.sv (Stage 1 off, Stage 2 on; caller then +// slices tile_id_remap out of address_o for the reqrsp_demux select). +// Replaces address_scrambler.sv and mempool_dma_tile_id_remapper.sv. +// +// NOTE: bank-id remap is intentionally NOT here — it lives in the tile's +// bank-side network (mempool_tcdm_bank_interco). 
+ +module mempool_addr_scrambler +#( + parameter int unsigned AddrWidth = 32, + parameter int unsigned ByteOffset = 2, + parameter int unsigned NumTiles = 2, + parameter int unsigned NumTilesPerDma = 16, + parameter int unsigned NumBanksPerTile = 2, + parameter bit Bypass = 1'b0, + parameter int unsigned SeqMemSizePerTile = 4*1024, + parameter logic [AddrWidth-1:0] TCDMBaseAddr = 32'b0, + parameter logic [31:0] TCDMMask = '1 << 28, + parameter bit EnableSeqInterleaveSwap = 1'b1, + parameter bit EnableTileIdRemap = 1'b0 +) ( + input logic [AddrWidth-1:0] address_i, + output logic [AddrWidth-1:0] address_o +); + + `define max(a,b) (((a) > (b))? (a) : (b)) + + localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); + localparam int unsigned TileIdBits = $clog2(NumTiles); + localparam int unsigned TileIdBitsPerDma = `max(1, $clog2(NumTilesPerDma)); + localparam int unsigned SeqPerTileBits = $clog2(SeqMemSizePerTile); + localparam int unsigned SeqTotalBits = SeqPerTileBits + TileIdBits; + localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; + localparam int unsigned ScrambleBits = SeqPerTileBits - ConstantBitsLSB; + + logic not_io_address; + assign not_io_address = (address_i & TCDMMask) == TCDMBaseAddr; + + function automatic logic [TileIdBitsPerDma-1:0] spm_tile_id_remap ( + logic [TileIdBitsPerDma-1:0] data_in, + logic [TileIdBitsPerDma-1:0] idx_i + ); + if (EnableTileIdRemap) begin + spm_tile_id_remap = data_in + idx_i; + end else begin + spm_tile_id_remap = data_in; + end + endfunction + + if (Bypass || NumTiles < 2) begin : gen_bypass + assign address_o = address_i; + end else begin : gen_active + logic [ScrambleBits-1:0] scramble; + logic [TileIdBits-1:0] tile_id; + + assign scramble = address_i[SeqPerTileBits-1:ConstantBitsLSB]; + assign tile_id = address_i[SeqTotalBits-1:SeqPerTileBits]; + + always_comb begin + address_o = address_i; + + // Stage 1: sequential→interleaved field swap (sequential region only). 
+ if (address_i < (NumTiles * SeqMemSizePerTile)) begin + if (EnableSeqInterleaveSwap) address_o[SeqTotalBits-1:ConstantBitsLSB] = {scramble, tile_id}; + end + // Stage 2: tile-id remap. Never applied inside the sequential region, even when Stage 1 is disabled. + else if (EnableTileIdRemap && not_io_address) begin + address_o[ConstantBitsLSB +: TileIdBitsPerDma] = + spm_tile_id_remap( + address_i[ConstantBitsLSB +: TileIdBitsPerDma], + address_i[(ConstantBitsLSB + TileIdBits) +: TileIdBitsPerDma] + ); + end + end + end : gen_active + + // Check for unsupported configurations + if (NumBanksPerTile < 2) + $fatal(1, "NumBanksPerTile must be at least 2. The special case '1' is currently not supported!"); + if (SeqMemSizePerTile % (2**ByteOffset*NumBanksPerTile) != 0) + $fatal(1, "SeqMemSizePerTile must be a multiple of BankWidth*NumBanksPerTile!"); + +endmodule : mempool_addr_scrambler diff --git a/hardware/src/mempool_bank_id_remapper.sv b/hardware/src/mempool_bank_id_remapper.sv deleted file mode 100644 index ef012b58..00000000 --- a/hardware/src/mempool_bank_id_remapper.sv +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2021 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
-// SPDX-License-Identifier: SHL-0.51 - -`include "mempool/mempool.svh" - -module mempool_bank_id_remapper - import mempool_pkg::*; - import cf_math_pkg::idx_width; -#( - parameter int unsigned NumCoresPerTile = 4, - parameter int unsigned NumRemoteReqPortsPerTile = 4, - parameter int unsigned NumBanksPerTile = 16, - parameter int unsigned TCDMAddrMemWidth = 8, - parameter bit SpmBankIdRemap = 0 -) ( - input tcdm_dma_req_t tcdm_dma_req_i, - input tcdm_slave_req_t [NumRemoteReqPortsPerTile-1:0] tcdm_slave_req_i, - input tcdm_slave_req_t [NumCoresPerTile-1:0] local_req_interco_payload_i, - output tcdm_dma_req_t tcdm_dma_req_remapped_o, - output tcdm_slave_req_t [NumRemoteReqPortsPerTile-1:0] tcdm_slave_req_remapped_o, - output tcdm_slave_req_t [NumCoresPerTile-1:0] local_req_interco_payload_remapped_o -); - - /***************** - * Definitions * - *****************/ - - // Compute shift amount: use lower of address width and bank index bits - `define min(a,b) (((a) < (b))? (a) : (b)) - `define max(a,b) (((a) > (b))? 
(a) : (b)) - - typedef logic [idx_width(NumRemoteReqPortsPerTile)-1:0] remote_ports_index_t; - localparam SHIFT_AMOUNT = `min(TCDMAddrMemWidth, idx_width(NumBanksPerTile)); // or 4 - localparam RemoteReqBits = `max(1, $clog2(NumRemoteReqPortsPerTile-1)); - - // Rotate bank ID by index bits for spreading - function automatic logic [idx_width(NumBanksPerTile)-1:0] spm_bank_id_remap ( - logic [idx_width(NumBanksPerTile)-1:0] data_in, - logic [SHIFT_AMOUNT-1:0] idx_i - ); - if (SpmBankIdRemap == 1) begin - spm_bank_id_remap = data_in + idx_i; - end else begin - spm_bank_id_remap = data_in; - end - endfunction - - /************************ - * Bank Address Remap * - ************************/ - // Apply to DMA, remote, and local requests - tcdm_dma_req_t tcdm_dma_req_remapped; - tcdm_slave_req_t [NumRemoteReqPortsPerTile-1:0] tcdm_slave_req_remapped; - tcdm_slave_req_t [NumCoresPerTile-1:0] local_req_interco_payload_remapped; - - always_comb begin - // DMA request: remap low bits of tgt_addr - tcdm_dma_req_remapped = tcdm_dma_req_i; - tcdm_dma_req_remapped.tgt_addr[idx_width(NumBanksPerTile)-1:0] = - spm_bank_id_remap( - tcdm_dma_req_i.tgt_addr[idx_width(NumBanksPerTile)-1:0], - tcdm_dma_req_i.tgt_addr[idx_width(NumBanksPerTile) +: SHIFT_AMOUNT] - ); - // Remote requests: remap low bits of tgt_addr - for(int rp = 0; rp < NumRemoteReqPortsPerTile; rp++) begin - tcdm_slave_req_remapped[rp] = tcdm_slave_req_i[rp]; - tcdm_slave_req_remapped[rp].tgt_addr[idx_width(NumBanksPerTile)-1:0] = - spm_bank_id_remap( - tcdm_slave_req_i[rp].tgt_addr[idx_width(NumBanksPerTile)-1:0], - tcdm_slave_req_i[rp].tgt_addr[idx_width(NumBanksPerTile) +: SHIFT_AMOUNT] - ); - end - // Local requests: remap low bits of tgt_addr - for(int c = 0; c < NumCoresPerTile; c++) begin - local_req_interco_payload_remapped[c] = local_req_interco_payload_i[c]; - local_req_interco_payload_remapped[c].tgt_addr[idx_width(NumBanksPerTile)-1:0] = - spm_bank_id_remap( - 
local_req_interco_payload_i[c].tgt_addr[idx_width(NumBanksPerTile)-1:0], - local_req_interco_payload_i[c].tgt_addr[idx_width(NumBanksPerTile) +: SHIFT_AMOUNT] - ); - end - end - - // Drive outputs - assign tcdm_dma_req_remapped_o = tcdm_dma_req_remapped; - assign tcdm_slave_req_remapped_o = tcdm_slave_req_remapped; - assign local_req_interco_payload_remapped_o = local_req_interco_payload_remapped; -endmodule diff --git a/hardware/src/mempool_dma_tile_id_remapper.sv b/hardware/src/mempool_dma_tile_id_remapper.sv deleted file mode 100644 index 751fc6c7..00000000 --- a/hardware/src/mempool_dma_tile_id_remapper.sv +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2025 ETH Zurich and University of Bologna. -// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -module mempool_dma_tile_id_remapper - import mempool_pkg::*; - import cf_math_pkg::idx_width; -( - input reqrsp_req_t dma_reqrsp_req_i, - output logic [idx_width(NumTilesPerDma)-1:0] tile_id_remap_o -); - - // Address slice offsets - localparam int TILE_ID_LOW_OFFSET = ByteOffset - + idx_width(NumBanksPerTile); - localparam int TILE_ID_REMAPPED_OFFSET = TILE_ID_LOW_OFFSET - + idx_width(NumTilesPerGroup) - + idx_width(NumGroups); - localparam int TILE_ID_WIDTH = idx_width(NumTilesPerDma); - - // Extract fields from address - logic [TILE_ID_WIDTH-1:0] tile_id_remap_before; - logic [TILE_ID_WIDTH-1:0] tile_id_remap; - - assign tile_id_remap_before = dma_reqrsp_req_i.q.addr[TILE_ID_LOW_OFFSET +: TILE_ID_WIDTH]; - assign tile_id_remap = tile_id_remap_before + - dma_reqrsp_req_i.q.addr[TILE_ID_REMAPPED_OFFSET +: TILE_ID_WIDTH]; - - generate - if (TileIdRemap == 1) begin : gen_remap_enabled - always_comb begin - if (dma_reqrsp_req_i.q.addr < (NumTiles * SeqMemSizePerTile)) begin - tile_id_remap_o = tile_id_remap_before; - end else begin - tile_id_remap_o = tile_id_remap; - end - end - end else begin : gen_remap_disabled - assign tile_id_remap_o = tile_id_remap_before; - 
end - endgenerate - -endmodule diff --git a/hardware/src/mempool_group.sv b/hardware/src/mempool_group.sv index 9c8bd039..0cf66c21 100644 --- a/hardware/src/mempool_group.sv +++ b/hardware/src/mempool_group.sv @@ -536,11 +536,32 @@ module mempool_group .reqrsp_rsp_i(dma_reqrsp_rsp) ); - mempool_dma_tile_id_remapper i_mempool_group_tile_id_remapper ( - .dma_reqrsp_req_i (dma_reqrsp_req), - .tile_id_remap_o (tile_id_remap[d]) + // SPM address pipeline for the DMA request: tile-id remap selects the dest tile. + // The seq↔interleave swap is SKIPPED (DMA addresses are already interleaved). + addr_t dma_scrambled_addr; + + mempool_addr_scrambler #( + .AddrWidth (AddrWidth ), + .ByteOffset (ByteOffset ), + .NumTiles (NumTiles ), + .NumTilesPerDma (NumTilesPerDma ), + .NumBanksPerTile (NumBanksPerTile ), + .Bypass (0 ), + .SeqMemSizePerTile (SeqMemSizePerTile), + .TCDMBaseAddr (TCDMBaseAddr ), + .TCDMMask (TCDMMask ), + .EnableSeqInterleaveSwap (1'b0 ), + .EnableTileIdRemap (TileIdRemap ) + ) i_dma_addr_scrambler ( + .address_i (dma_reqrsp_req.q.addr ), + .address_o (dma_scrambled_addr ) ); + // Extract the (post-remap) tile-id slice for the reqrsp_demux select. + assign tile_id_remap[d] = + dma_scrambled_addr[ByteOffset + idx_width(NumBanksPerTile) + +: idx_width(NumTilesPerDma)]; + if (NumTilesPerDma > 1) begin: gen_dma_reqrsp_demux reqrsp_demux #( .NrPorts (NumTilesPerDma ), diff --git a/hardware/src/mempool_tcdm_bank_interco.sv b/hardware/src/mempool_tcdm_bank_interco.sv new file mode 100644 index 00000000..68aeddd9 --- /dev/null +++ b/hardware/src/mempool_tcdm_bank_interco.sv @@ -0,0 +1,415 @@ +// Copyright 2026 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +`include "mempool/mempool.svh" + +// Tile-internal TCDM bank-side interconnect: narrow req/resp crossbars, the +// wide DMA superbank demux/mux, per-bank wide-over-narrow priority, and the +// within-superbank bank-id remap. Replaces the legacy i_local_req_interco / +// i_local_resp_interco / i_dma_req_interco / i_dma_resp_interco stream_xbars, +// the per-superbank tcdm_wide_narrow_mux, and mempool_bank_id_remapper.sv. +// +// Bank-id remap is within-superbank only: only the low log2(NumBanksPerSB) +// bits of bank_id are rewritten (add the row-id slice). bank_id_hi (selects +// superbank for wide DMA) passes through unchanged so the wide chunk lands in +// the same superbank a narrow access of its base address would pick; the wide +// fork then rotates within that superbank to stay coherent with the narrow path. +// +// gen_superbank_resp_ini_addr (NumRemoteReqPortsPerTile vs +// NumRemoteRespPortsPerTile asymmetry) stays OUTSIDE this module: the caller +// passes the post-conditioning resp idx via mst_resp_ini_addr_i. + +module mempool_tcdm_bank_interco + import mempool_pkg::*; + import cf_math_pkg::idx_width; +#( + parameter int unsigned NumNarrowReq = 8, + parameter int unsigned NumNarrowResp = 8, + parameter int unsigned NumBanksPerTile = 16, + parameter int unsigned NumSuperbanks = 1, + parameter int unsigned NarrowDataWidth = 32, + parameter int unsigned WideDataWidth = 512, + parameter int unsigned ByteOffset = 2, + parameter bit SpmBankIdRemap = 0, + parameter type narrow_req_t = tcdm_slave_req_t, + parameter type narrow_resp_t = tcdm_slave_resp_t, + parameter type wide_req_t = tcdm_dma_req_t, + parameter type wide_resp_t = tcdm_dma_resp_t, + parameter type group_id_t = mempool_pkg::group_id_t, + // Must be wide enough to address NumNarrowResp ports (logic [idx_width(NumNarrowResp)-1:0]). 
+ parameter type resp_idx_t = logic [3:0], + // derived + localparam int unsigned NumBanksPerSB = NumBanksPerTile / NumSuperbanks, + localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile), + localparam int unsigned BankIdLoBits = $clog2(NumBanksPerSB), + localparam int unsigned SBSelBits = (NumSuperbanks > 1) ? $clog2(NumSuperbanks) : 1, + localparam int unsigned NarrowBeWidth = NarrowDataWidth / 8, + localparam int unsigned RotBits = (NumBanksPerSB > 1) ? $clog2(NumBanksPerSB) : 1, + // Bank-id remap offset width: always BankIdLoBits, no clamp. NOTE(review): assumes tgt_addr has >= BankIdLoBits row-id bits above bank_id - confirm. + localparam int unsigned BankRemapShiftAmt = + BankIdLoBits +) ( + input logic clk_i, + input logic rst_ni, + input group_id_t group_id_i, + + // ----- Narrow request inputs ----- + input narrow_req_t [NumNarrowReq-1:0] slv_narrow_req_i, + input logic [NumNarrowReq-1:0] slv_narrow_req_valid_i, + output logic [NumNarrowReq-1:0] slv_narrow_req_ready_o, + + // ----- Narrow response outputs ----- + output narrow_resp_t [NumNarrowResp-1:0] slv_narrow_resp_o, + output logic [NumNarrowResp-1:0] slv_narrow_resp_valid_o, + input logic [NumNarrowResp-1:0] slv_narrow_resp_ready_i, + + // ----- Wide DMA request input (single, demuxed internally) ----- + input wide_req_t slv_wide_req_i, + input logic slv_wide_req_valid_i, + output logic slv_wide_req_ready_o, + + // ----- Wide DMA response output (single, muxed internally) ----- + output wide_resp_t slv_wide_resp_o, + output logic slv_wide_resp_valid_o, + input logic slv_wide_resp_ready_i, + + // ----- Bank-side request ports (per bank) ----- + output narrow_req_t [NumBanksPerTile-1:0] mst_req_o, + output logic [NumBanksPerTile-1:0] mst_req_valid_o, + input logic [NumBanksPerTile-1:0] mst_req_ready_i, + output logic [NumBanksPerTile-1:0] mst_req_wide_o, + output resp_idx_t [NumBanksPerTile-1:0] mst_req_ini_addr_o, + + // ----- Bank-side response ports (per bank) ----- + input narrow_resp_t 
[NumBanksPerTile-1:0] mst_resp_i, + input logic [NumBanksPerTile-1:0] mst_resp_valid_i, + output logic [NumBanksPerTile-1:0] mst_resp_ready_o, + input logic [NumBanksPerTile-1:0] mst_resp_wide_i, + input resp_idx_t [NumBanksPerTile-1:0] mst_resp_ini_addr_i +); + + // Depth to absorb the bank pipeline plus slack (a couple of wide reqs + // outstanding per superbank; 8 leaves comfortable slack). + localparam int unsigned RotFifoDepth = 8; + + // ============================================================ + // Narrow-request: bank-id-lo remap + xbar + // ============================================================ + // Within-superbank constraint: only the low BankIdLoBits of bank_id are rewritten. + logic [NumNarrowReq-1:0][BankOffsetBits-1:0] narrow_sel; + for (genvar i = 0; i < NumNarrowReq; i++) begin : gen_narrow_sel + logic [BankIdLoBits-1:0] raw_lo, new_lo; + assign raw_lo = slv_narrow_req_i[i].tgt_addr[0 +: BankIdLoBits]; + if (SpmBankIdRemap) begin : gen_remap_active + // Offset comes from the row-id slice just above bank_id. 
+ logic [BankRemapShiftAmt-1:0] offset_lo; + assign offset_lo = slv_narrow_req_i[i].tgt_addr[BankOffsetBits +: BankRemapShiftAmt]; + assign new_lo = raw_lo + offset_lo; + end else begin : gen_remap_passthrough + assign new_lo = raw_lo; + end + if (NumSuperbanks > 1) begin : gen_hi_passthrough + // bank_id_hi passes through untouched + assign narrow_sel[i] = + {slv_narrow_req_i[i].tgt_addr[BankIdLoBits +: (BankOffsetBits - BankIdLoBits)], + new_lo}; + end else begin : gen_no_hi + assign narrow_sel[i] = new_lo; + end + end + + narrow_req_t [NumBanksPerTile-1:0] narrow_to_bank; + logic [NumBanksPerTile-1:0] narrow_to_bank_valid; + logic [NumBanksPerTile-1:0] narrow_to_bank_ready; + resp_idx_t [NumBanksPerTile-1:0] narrow_to_bank_idx; + + stream_xbar #( + .NumInp (NumNarrowReq ), + .NumOut (NumBanksPerTile ), + .payload_t(narrow_req_t ) + ) i_narrow_req_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i(1'b0 ), + .rr_i ('0 ), + .data_i (slv_narrow_req_i ), + .valid_i(slv_narrow_req_valid_i ), + .ready_o(slv_narrow_req_ready_o ), + .sel_i (narrow_sel ), + .data_o (narrow_to_bank ), + .valid_o(narrow_to_bank_valid ), + .ready_i(narrow_to_bank_ready ), + .idx_o (narrow_to_bank_idx ) + ); + + // ============================================================ + // Wide DMA: demux single wide req to per-superbank streams. + // ============================================================ + wide_req_t [NumSuperbanks-1:0] wide_to_sb; + logic [NumSuperbanks-1:0] wide_to_sb_valid; + logic [NumSuperbanks-1:0] wide_to_sb_ready; + + if (NumSuperbanks > 1) begin : gen_wide_req_demux + stream_xbar #( + .NumInp (1 ), + .NumOut (NumSuperbanks), + .payload_t(wide_req_t ) + ) i_wide_req_demux ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i(1'b0 ), + .rr_i ('0 ), + .data_i (slv_wide_req_i ), + .valid_i(slv_wide_req_valid_i), + .ready_o(slv_wide_req_ready_o), + // Superbank select uses the upper bits of bank_id (unchanged by remap). 
+ .sel_i (slv_wide_req_i.tgt_addr[BankIdLoBits +: SBSelBits]), + .data_o (wide_to_sb ), + .valid_o(wide_to_sb_valid ), + .ready_i(wide_to_sb_ready ), + .idx_o (/* unused */ ) + ); + end else begin : gen_wide_req_bypass + assign wide_to_sb[0] = slv_wide_req_i; + assign wide_to_sb_valid[0] = slv_wide_req_valid_i; + assign slv_wide_req_ready_o = wide_to_sb_ready[0]; + end + + // ============================================================ + // Per-superbank wide handling: fork-with-rotation on the req side, + // join-with-inverse-rotation on the resp side. Rotation amount per wide + // req is queued in a per-superbank FIFO so outstanding reqs reassemble. + // ============================================================ + wide_resp_t [NumSuperbanks-1:0] sb_wide_resp; + logic [NumSuperbanks-1:0] sb_wide_resp_valid; + logic [NumSuperbanks-1:0] sb_wide_resp_ready; + + // Per-bank wide req/resp signals coming out of the fork / into the join. + logic [NumBanksPerTile-1:0] wide_fork_valid; + logic [NumBanksPerTile-1:0] wide_fork_ready; + narrow_req_t [NumBanksPerTile-1:0] wide_req_for_bank; + logic [NumBanksPerTile-1:0] wide_join_in_valid; + logic [NumBanksPerTile-1:0] wide_join_in_ready; + + for (genvar d = 0; d < NumSuperbanks; d++) begin : gen_sb + // Rotation amount = (remapped) bank_id_lo of the wide req's tgt_addr. + logic [RotBits-1:0] req_rot; + if (NumBanksPerSB > 1) begin : gen_rot + // Same bank-id-lo remap arithmetic as the narrow xbar sel, so rotations match. 
+ logic [BankIdLoBits-1:0] raw_lo, post_lo; + assign raw_lo = wide_to_sb[d].tgt_addr[0 +: BankIdLoBits]; + if (SpmBankIdRemap) begin : gen_remap_on + logic [BankRemapShiftAmt-1:0] off_lo; + assign off_lo = wide_to_sb[d].tgt_addr[BankOffsetBits +: BankRemapShiftAmt]; + assign post_lo = raw_lo + off_lo; + end else begin : gen_remap_off + assign post_lo = raw_lo; + end + assign req_rot = post_lo; + end else begin : gen_no_rot + assign req_rot = '0; + end + + // Rotation FIFO: push at fork accept, pop at wide resp out. + logic [RotBits-1:0] resp_rot; + logic rot_fifo_push, rot_fifo_pop; + logic rot_fifo_full; + + fifo_v3 #( + .DEPTH (RotFifoDepth), + .DATA_WIDTH (RotBits ) + ) i_rot_fifo ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i (1'b0 ), + .testmode_i (1'b0 ), + .data_i (req_rot ), + .push_i (rot_fifo_push), + .full_o (rot_fifo_full), + .data_o (resp_rot ), + .pop_i (rot_fifo_pop ), + .empty_o (/* unused */ ), + .usage_o (/* unused */ ) + ); + + // Fork the wide req across the superbank's banks. + logic [NumBanksPerSB-1:0] sb_fork_valid; + logic [NumBanksPerSB-1:0] sb_fork_ready; + stream_fork #(.N_OUP(NumBanksPerSB)) i_wide_fork ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .valid_i(wide_to_sb_valid[d] ), + .ready_o(wide_to_sb_ready[d] ), + .valid_o(sb_fork_valid ), + .ready_i(sb_fork_ready ) + ); + assign rot_fifo_push = wide_to_sb_valid[d] & wide_to_sb_ready[d]; + + // Compose each bank's wide req: rotated word slice + shared address. + for (genvar b = 0; b < NumBanksPerSB; b++) begin : gen_sb_bank_req + localparam int unsigned global_bank = d*NumBanksPerSB + b; + always_comb begin + // word j = (b - req_rot) mod NumBanksPerSB feeds bank port b. 
+ automatic int unsigned j = (b + NumBanksPerSB - req_rot) % NumBanksPerSB; + wide_req_for_bank[global_bank] = '{ + wdata: '{ + meta_id: wide_to_sb[d].wdata.meta_id, + core_id: wide_to_sb[d].wdata.core_id, + amo: wide_to_sb[d].wdata.amo, + data: wide_to_sb[d].wdata.data[j*NarrowDataWidth +: NarrowDataWidth] + }, + wen: wide_to_sb[d].wen, + be: wide_to_sb[d].be[j*NarrowBeWidth +: NarrowBeWidth], + tgt_addr: wide_to_sb[d].tgt_addr, + ini_addr: '0, + src_group_id: group_id_i + }; + end + assign wide_fork_valid[global_bank] = sb_fork_valid[b]; + assign sb_fork_ready[b] = wide_fork_ready[global_bank]; + end + + // Wide resp join across the superbank's banks; inverse rotation uses + // resp_rot from the FIFO head. + logic [NumBanksPerSB-1:0] sb_join_valid; + logic [NumBanksPerSB-1:0] sb_join_ready; + stream_join #(.N_INP(NumBanksPerSB)) i_wide_join ( + .inp_valid_i(sb_join_valid ), + .inp_ready_o(sb_join_ready ), + .oup_valid_o(sb_wide_resp_valid[d] ), + .oup_ready_i(sb_wide_resp_ready[d] ) + ); + assign rot_fifo_pop = sb_wide_resp_valid[d] & sb_wide_resp_ready[d]; + + // Wire the per-bank wide-side response handshake into the join. + for (genvar b = 0; b < NumBanksPerSB; b++) begin : gen_sb_bank_resp + localparam int unsigned global_bank = d*NumBanksPerSB + b; + assign sb_join_valid[b] = wide_join_in_valid[global_bank]; + assign wide_join_in_ready[global_bank] = sb_join_ready[b]; + end + + // Reassemble wide rdata with inverse rotation, then carry metadata. + always_comb begin + sb_wide_resp[d] = '{rdata: '{default: '0}}; + for (int b = 0; b < NumBanksPerSB; b++) begin + // Bank port b carries word j = (b - resp_rot) mod NumBanksPerSB. + automatic int unsigned j = (b + NumBanksPerSB - resp_rot) % NumBanksPerSB; + sb_wide_resp[d].rdata.data[j*NarrowDataWidth +: NarrowDataWidth] + = mst_resp_i[d*NumBanksPerSB + b].rdata.data; + end + // Metadata from the lowest bank — all banks of a wide chunk share it. 
+ sb_wide_resp[d].rdata.meta_id = mst_resp_i[d*NumBanksPerSB].rdata.meta_id; + sb_wide_resp[d].rdata.core_id = mst_resp_i[d*NumBanksPerSB].rdata.core_id; + sb_wide_resp[d].rdata.amo = mst_resp_i[d*NumBanksPerSB].rdata.amo; + end + + // synopsys translate_off +`ifndef SYNTHESIS + always_ff @(posedge clk_i) begin + if (rst_ni && rot_fifo_push && rot_fifo_full) begin + $fatal(1, "[mempool_tcdm_bank_interco] sb=%0d rot FIFO overflow (depth=%0d)", d, RotFifoDepth); + end + end +`endif + // synopsys translate_on + end + + // ============================================================ + // Per-bank priority: wide overrides narrow at the request port. + // ============================================================ + for (genvar b = 0; b < NumBanksPerTile; b++) begin : gen_bank_mux + always_comb begin + if (wide_fork_valid[b]) begin + mst_req_o[b] = wide_req_for_bank[b]; + mst_req_valid_o[b] = wide_fork_valid[b]; + mst_req_wide_o[b] = 1'b1; + wide_fork_ready[b] = mst_req_ready_i[b]; + narrow_to_bank_ready[b] = 1'b0; + mst_req_ini_addr_o[b] = '0; + end else begin + mst_req_o[b] = narrow_to_bank[b]; + mst_req_valid_o[b] = narrow_to_bank_valid[b]; + mst_req_wide_o[b] = 1'b0; + wide_fork_ready[b] = 1'b0; + narrow_to_bank_ready[b] = mst_req_ready_i[b]; + mst_req_ini_addr_o[b] = narrow_to_bank_idx[b]; + end + end + end + + // ============================================================ + // Per-bank response split: wide → join, narrow → resp xbar. 
+ // ============================================================ + narrow_resp_t [NumBanksPerTile-1:0] narrow_resp_to_xbar; + logic [NumBanksPerTile-1:0] narrow_resp_to_xbar_valid; + logic [NumBanksPerTile-1:0] narrow_resp_to_xbar_ready; + + for (genvar b = 0; b < NumBanksPerTile; b++) begin : gen_resp_split + assign narrow_resp_to_xbar[b] = mst_resp_i[b]; + always_comb begin + if (mst_resp_wide_i[b]) begin + wide_join_in_valid[b] = mst_resp_valid_i[b]; + mst_resp_ready_o[b] = wide_join_in_ready[b]; + narrow_resp_to_xbar_valid[b] = 1'b0; + end else begin + wide_join_in_valid[b] = 1'b0; + mst_resp_ready_o[b] = narrow_resp_to_xbar_ready[b]; + narrow_resp_to_xbar_valid[b] = mst_resp_valid_i[b]; + end + end + end + + // ============================================================ + // Narrow response xbar (NumBanksPerTile → NumNarrowResp). + // sel = caller-supplied resp idx (already conditioned for any + // NumRemoteReqPortsPerTile vs NumRemoteRespPortsPerTile asymmetry). + // ============================================================ + stream_xbar #( + .NumInp (NumBanksPerTile ), + .NumOut (NumNarrowResp ), + .payload_t(narrow_resp_t ) + ) i_narrow_resp_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i(1'b0 ), + .rr_i ('0 ), + .data_i (narrow_resp_to_xbar ), + .valid_i(narrow_resp_to_xbar_valid ), + .ready_o(narrow_resp_to_xbar_ready ), + .sel_i (mst_resp_ini_addr_i ), + .data_o (slv_narrow_resp_o ), + .valid_o(slv_narrow_resp_valid_o ), + .ready_i(slv_narrow_resp_ready_i ), + .idx_o (/* unused */ ) + ); + + // ============================================================ + // Wide response mux (NumSuperbanks → 1). 
+ // ============================================================ + if (NumSuperbanks > 1) begin : gen_wide_resp_mux + stream_xbar #( + .NumInp (NumSuperbanks ), + .NumOut (1 ), + .payload_t(wide_resp_t ) + ) i_wide_resp_mux ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i(1'b0 ), + .rr_i ('0 ), + .data_i (sb_wide_resp ), + .valid_i(sb_wide_resp_valid ), + .ready_o(sb_wide_resp_ready ), + .sel_i ('0 ), + .data_o (slv_wide_resp_o ), + .valid_o(slv_wide_resp_valid_o ), + .ready_i(slv_wide_resp_ready_i ), + .idx_o (/* unused */ ) + ); + end else begin : gen_wide_resp_bypass + assign slv_wide_resp_o = sb_wide_resp[0]; + assign slv_wide_resp_valid_o = sb_wide_resp_valid[0]; + assign sb_wide_resp_ready[0] = slv_wide_resp_ready_i; + end + +endmodule diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv index 1ed11816..e0e9539e 100644 --- a/hardware/src/mempool_tile.sv +++ b/hardware/src/mempool_tile.sv @@ -88,13 +88,6 @@ module mempool_tile assign group_id = '0; end: gen_group_id - /************************ - * Bank Address Remap * - ************************/ - tcdm_dma_req_t tcdm_dma_req_remapped; - tcdm_slave_req_t [NumRemoteReqPortsPerTile-1:0] tcdm_slave_req_remapped; - tcdm_slave_req_t [NumCoresPerTile-1:0] local_req_interco_payload_remapped; - /*********** * Cores * ***********/ @@ -367,20 +360,6 @@ module mempool_tile } bank_metadata_t; // Memory interfaces - tcdm_dma_req_t [NumSuperbanks-1:0] tcdm_dma_req; - logic [NumSuperbanks-1:0] tcdm_dma_req_valid; - logic [NumSuperbanks-1:0] tcdm_dma_req_ready; - tcdm_dma_resp_t [NumSuperbanks-1:0] tcdm_dma_resp; - logic [NumSuperbanks-1:0] tcdm_dma_resp_valid; - logic [NumSuperbanks-1:0] tcdm_dma_resp_ready; - - logic [NumBanksPerTile-1:0] superbank_req_valid; - logic [NumBanksPerTile-1:0] superbank_req_ready; - local_req_interco_addr_t [NumBanksPerTile-1:0] superbank_req_ini_addr; - tcdm_slave_req_t [NumBanksPerTile-1:0] superbank_req_payload; - logic [NumBanksPerTile-1:0] superbank_resp_valid; - 
logic [NumBanksPerTile-1:0] superbank_resp_ready; - tcdm_slave_resp_t [NumBanksPerTile-1:0] superbank_resp_payload; local_resp_interco_addr_t[NumBanksPerTile-1:0] superbank_resp_ini_addr; logic [NumBanksPerTile-1:0] bank_req_valid; @@ -394,63 +373,11 @@ module mempool_tile logic [NumBanksPerTile-1:0] bank_resp_wide; local_resp_interco_addr_t[NumBanksPerTile-1:0] bank_resp_ini_addr; - tcdm_dma_req_t tcdm_dma_req_i_struct; - assign tcdm_dma_req_i_struct = tcdm_dma_req_remapped; - - if (NumSuperbanks == 1) begin : gen_dma_interco_bypass - assign tcdm_dma_req = tcdm_dma_req_i_struct; - assign tcdm_dma_req_valid = tcdm_dma_req_valid_i; - assign tcdm_dma_req_ready_o = tcdm_dma_req_ready; - assign tcdm_dma_resp_o = tcdm_dma_resp; - assign tcdm_dma_resp_valid_o = tcdm_dma_resp_valid; - assign tcdm_dma_resp_ready = tcdm_dma_resp_ready_i; - end else begin : gen_dma_interco - stream_xbar #( - .NumInp (1 ), - .NumOut (NumSuperbanks ), - .payload_t(tcdm_dma_req_t) - ) i_dma_req_interco ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .flush_i(1'b0 ), - // External priority flag - .rr_i ('0 ), - // Master - .data_i (tcdm_dma_req_i_struct ), - .valid_i(tcdm_dma_req_valid_i ), - .ready_o(tcdm_dma_req_ready_o ), - .sel_i (tcdm_dma_req_i_struct.tgt_addr[idx_width(NumBanksPerTile)-1:$clog2(DmaNumWords)]), - // Slave - .data_o (tcdm_dma_req ), - .valid_o(tcdm_dma_req_valid ), - .ready_i(tcdm_dma_req_ready ), - .idx_o (/* Unused */ ) - ); - - stream_xbar #( - .NumInp (NumSuperbanks ), - .NumOut (1 ), - .payload_t(tcdm_dma_resp_t) - ) i_dma_resp_interco ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .flush_i(1'b0 ), - // External priority flag - .rr_i ('0 ), - // Master - .data_i (tcdm_dma_resp ), - .valid_i(tcdm_dma_resp_valid ), - .ready_o(tcdm_dma_resp_ready ), - .sel_i ('0 ), - // Slave - .data_o (tcdm_dma_resp_o ), - .valid_o(tcdm_dma_resp_valid_o ), - .ready_i(tcdm_dma_resp_ready_i ), - .idx_o (/* Unused */ ) - ); - end - - assign bank_req_ini_addr = superbank_req_ini_addr; + // 
gen_superbank_resp_ini_addr conditions bank_resp_ini_addr (the local + // input-port index stored in the bank's metadata) into the resp xbar's + // sel, handling NumRemoteReqPortsPerTile vs NumRemoteRespPortsPerTile + // asymmetry. The conditioned value feeds mempool_tcdm_bank_interco's + // mst_resp_ini_addr_i input. for (genvar b = 0; unsigned'(b) < NumBanksPerTile; b++) begin: gen_superbank_resp_ini_addr if(NumRemoteReqPortsPerTile > NumRemoteRespPortsPerTile ) begin: gen_superbank_resp_ini_addr_req_gt_resp always_comb begin @@ -507,41 +434,9 @@ module mempool_tile end end - for (genvar d = 0; unsigned'(d) < NumSuperbanks; d++) begin: gen_dma_mux - tcdm_wide_narrow_mux #( - .NarrowDataWidth(DataWidth ), - .WideDataWidth (DmaDataWidth ), - .narrow_req_t (tcdm_slave_req_t ), - .narrow_rsp_t (tcdm_slave_resp_t), - .wide_req_t (tcdm_dma_req_t ), - .wide_rsp_t (tcdm_dma_resp_t ), - .group_id_t (group_id_t ) - ) i_tcdm_wide_narrow_mux ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .group_id_i (group_id ), // FlooNoC Added - .slv_narrow_req_i (superbank_req_payload[d*DmaNumWords+:DmaNumWords] ), - .slv_narrow_req_valid_i(superbank_req_valid[d*DmaNumWords+:DmaNumWords] ), - .slv_narrow_req_ready_o(superbank_req_ready[d*DmaNumWords+:DmaNumWords] ), - .slv_narrow_rsp_o (superbank_resp_payload[d*DmaNumWords+:DmaNumWords]), - .slv_narrow_rsp_valid_o(superbank_resp_valid[d*DmaNumWords+:DmaNumWords] ), - .slv_narrow_rsp_ready_i(superbank_resp_ready[d*DmaNumWords+:DmaNumWords] ), - .slv_wide_req_i (tcdm_dma_req[d] ), - .slv_wide_req_valid_i (tcdm_dma_req_valid[d] ), - .slv_wide_req_ready_o (tcdm_dma_req_ready[d] ), - .slv_wide_rsp_o (tcdm_dma_resp[d] ), - .slv_wide_rsp_valid_o (tcdm_dma_resp_valid[d] ), - .slv_wide_rsp_ready_i (tcdm_dma_resp_ready[d] ), - .mst_req_o (bank_req_payload[d*DmaNumWords+:DmaNumWords] ), - .mst_req_wide_o (bank_req_wide[d*DmaNumWords+:DmaNumWords] ), - .mst_req_valid_o (bank_req_valid[d*DmaNumWords+:DmaNumWords] ), - .mst_req_ready_i 
(bank_req_ready[d*DmaNumWords+:DmaNumWords] ), - .mst_rsp_i (bank_resp_payload[d*DmaNumWords+:DmaNumWords] ), - .mst_rsp_wide_i (bank_resp_wide[d*DmaNumWords+:DmaNumWords] ), - .mst_rsp_valid_i (bank_resp_valid[d*DmaNumWords+:DmaNumWords] ), - .mst_rsp_ready_o (bank_resp_ready[d*DmaNumWords+:DmaNumWords] ) - ); - end + // (mempool_tcdm_bank_interco is instantiated further below, after the + // signal declarations for local_req_interco_payload / postreg_tcdm_slave_req + // / etc. that it connects to.) `ifndef TARGET_SYNTHESIS `ifndef TARGET_VERILATOR @@ -720,7 +615,7 @@ module mempool_tile .rst_ni (rst_ni ), .clr_i (1'b0 ), .testmode_i(1'b0 ), - .data_i (tcdm_slave_req_remapped[h] ), + .data_i (tcdm_slave_req_i[h] ), .valid_i (tcdm_slave_req_valid_i[h] ), .ready_o (tcdm_slave_req_ready_o[h] ), .data_o (postreg_tcdm_slave_req[h] ), @@ -849,75 +744,61 @@ module mempool_tile tcdm_slave_resp_t [NumCoresPerTile-1:0] local_resp_interco_payload; addr_t [NumCoresPerTile-1:0] local_req_interco_addr_int; - logic [NumCoresPerTile+NumRemoteReqPortsPerTile-1:0][idx_width(NumBanksPerTile)-1:0] local_req_interco_tgt_sel; - for (genvar j = 0; unsigned'(j) < NumCoresPerTile; j++) begin: gen_local_req_interco_tgt_sel_local - assign local_req_interco_tgt_sel[j] = local_req_interco_payload_remapped[j].tgt_addr[idx_width(NumBanksPerTile)-1:0]; - end: gen_local_req_interco_tgt_sel_local - for (genvar j = 0; unsigned'(j) < NumRemoteReqPortsPerTile; j++) begin: gen_local_req_interco_tgt_sel_remote - assign local_req_interco_tgt_sel[j + NumCoresPerTile] = postreg_tcdm_slave_req[j].tgt_addr[idx_width(NumBanksPerTile)-1:0]; - end: gen_local_req_interco_tgt_sel_remote - - stream_xbar #( - .NumInp (NumCoresPerTile + NumRemoteReqPortsPerTile ), - .NumOut (NumBanksPerTile ), - .payload_t(tcdm_slave_req_t ) - ) i_local_req_interco ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .flush_i(1'b0 ), - // External priority flag - .rr_i ('0 ), - // Master - .data_i ({postreg_tcdm_slave_req, 
local_req_interco_payload_remapped}), - .valid_i({postreg_tcdm_slave_req_valid, local_req_interco_valid} ), - .ready_o({postreg_tcdm_slave_req_ready, local_req_interco_ready} ), - .sel_i (local_req_interco_tgt_sel ), - // Slave - .data_o (superbank_req_payload ), - .valid_o(superbank_req_valid ), - .ready_i(superbank_req_ready ), - .idx_o (superbank_req_ini_addr ) + // Bank-side TCDM interconnect: combines narrow-req routing, narrow-resp + // routing, wide-DMA superbank demux/mux, wide-narrow priority arbitration + // at each bank, and within-superbank bank-id remap. + mempool_tcdm_bank_interco #( + .NumNarrowReq (NumCoresPerTile + NumRemoteReqPortsPerTile), + .NumNarrowResp (NumCoresPerTile + NumRemoteRespPortsPerTile), + .NumBanksPerTile(NumBanksPerTile ), + .NumSuperbanks (NumSuperbanks ), + .NarrowDataWidth(DataWidth ), + .WideDataWidth (DmaDataWidth ), + .ByteOffset (ByteOffset ), + .SpmBankIdRemap (SpmBankIdRemap ), + .narrow_req_t (tcdm_slave_req_t ), + .narrow_resp_t (tcdm_slave_resp_t), + .wide_req_t (tcdm_dma_req_t ), + .wide_resp_t (tcdm_dma_resp_t ), + .group_id_t (group_id_t ), + .resp_idx_t (local_resp_interco_addr_t) + ) i_tcdm_bank_interco ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .group_id_i (group_id), + // Narrow request inputs: cores at [0..NumCoresPerTile-1], remote slaves + // at [NumCoresPerTile..NumCoresPerTile+NumRemoteReqPortsPerTile-1] + // (matches the legacy concat order in i_local_req_interco). + .slv_narrow_req_i ({postreg_tcdm_slave_req, local_req_interco_payload}), + .slv_narrow_req_valid_i ({postreg_tcdm_slave_req_valid, local_req_interco_valid }), + .slv_narrow_req_ready_o ({postreg_tcdm_slave_req_ready, local_req_interco_ready }), + // Narrow response outputs: cores at [0..NumCoresPerTile-1], remote resp + // ports at [NumCoresPerTile..NumCoresPerTile+NumRemoteRespPortsPerTile-1]. 
+ .slv_narrow_resp_o ({prereg_tcdm_slave_resp, local_resp_interco_payload}), + .slv_narrow_resp_valid_o({prereg_tcdm_slave_resp_valid, local_resp_interco_valid }), + .slv_narrow_resp_ready_i({prereg_tcdm_slave_resp_ready, local_resp_interco_ready }), + // Wide DMA (single, demuxed/muxed across superbanks inside the module) + .slv_wide_req_i (tcdm_dma_req_i ), + .slv_wide_req_valid_i (tcdm_dma_req_valid_i), + .slv_wide_req_ready_o (tcdm_dma_req_ready_o), + .slv_wide_resp_o (tcdm_dma_resp_o ), + .slv_wide_resp_valid_o (tcdm_dma_resp_valid_o), + .slv_wide_resp_ready_i (tcdm_dma_resp_ready_i), + // Bank-side request / response. + .mst_req_o (bank_req_payload ), + .mst_req_valid_o (bank_req_valid ), + .mst_req_ready_i (bank_req_ready ), + .mst_req_wide_o (bank_req_wide ), + .mst_req_ini_addr_o (bank_req_ini_addr ), + .mst_resp_i (bank_resp_payload ), + .mst_resp_valid_i (bank_resp_valid ), + .mst_resp_ready_o (bank_resp_ready ), + .mst_resp_wide_i (bank_resp_wide ), + // gen_superbank_resp_ini_addr conditions bank_resp_ini_addr into the + // resp xbar's sel (handles req/resp port count asymmetry). 
+ .mst_resp_ini_addr_i (superbank_resp_ini_addr) ); - stream_xbar #( - .NumInp (NumBanksPerTile ), - .NumOut (NumCoresPerTile + NumRemoteRespPortsPerTile ), - .payload_t(tcdm_slave_resp_t ) - ) i_local_resp_interco ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .flush_i(1'b0 ), - // External priority flag - .rr_i ('0 ), - // Master - .data_i (superbank_resp_payload ), - .valid_i(superbank_resp_valid ), - .ready_o(superbank_resp_ready ), - .sel_i (superbank_resp_ini_addr ), - // Slave - .data_o ({prereg_tcdm_slave_resp, local_resp_interco_payload} ), - .valid_o({prereg_tcdm_slave_resp_valid, local_resp_interco_valid}), - .ready_i({prereg_tcdm_slave_resp_ready, local_resp_interco_ready}), - .idx_o (/* Unused */ ) - ); - - /******************** - * ID Remapping * - ********************/ - mempool_bank_id_remapper #( - .NumCoresPerTile (NumCoresPerTile ), - .NumRemoteReqPortsPerTile (NumRemoteReqPortsPerTile ), - .NumBanksPerTile (NumBanksPerTile ), - .TCDMAddrMemWidth (TCDMAddrMemWidth ), - .SpmBankIdRemap (SpmBankIdRemap ) - ) i_mempool_tile_id_remapper ( - .tcdm_dma_req_i (tcdm_dma_req_i ), - .tcdm_slave_req_i (tcdm_slave_req_i ), - .local_req_interco_payload_i (local_req_interco_payload ), - .tcdm_dma_req_remapped_o (tcdm_dma_req_remapped ), - .tcdm_slave_req_remapped_o (tcdm_slave_req_remapped ), - .local_req_interco_payload_remapped_o (local_req_interco_payload_remapped ) - ); /******************* * Core De/mux * @@ -1022,17 +903,21 @@ module mempool_tile assign remote_req_interco_wen [c] = remote_req_interco[c].wen; assign remote_req_interco_amoen [c] = |remote_req_interco[c].wdata.amo; - // Scramble address before entering TCDM shim for sequential+interleaved memory map - address_scrambler #( - .AddrWidth (AddrWidth ), - .ByteOffset (ByteOffset ), - .NumTiles (NumTiles ), - .NumBanksPerTile (NumBanksPerTile ), - .Bypass (0 ), - .SeqMemSizePerTile (SeqMemSizePerTile), - .TCDMBaseAddr (TCDMBaseAddr ), - .TCDMMask (TCDMMask ) - ) i_address_scrambler ( + // Scramble + 
remap address before the TCDM shim: stage 1 seq->interleave + // swap always on; stage 2 tile-id remap gated by TileIdRemap. + mempool_addr_scrambler #( + .AddrWidth (AddrWidth ), + .ByteOffset (ByteOffset ), + .NumTiles (NumTiles ), + .NumTilesPerDma (NumTilesPerDma ), + .NumBanksPerTile (NumBanksPerTile ), + .Bypass (0 ), + .SeqMemSizePerTile (SeqMemSizePerTile), + .TCDMBaseAddr (TCDMBaseAddr ), + .TCDMMask (TCDMMask ), + .EnableSeqInterleaveSwap (1'b1 ), + .EnableTileIdRemap (TileIdRemap ) + ) i_addr_scrambler ( .address_i (snitch_data_qaddr[c] ), .address_o (snitch_data_qaddr_scrambled[c]) ); @@ -1044,12 +929,7 @@ module mempool_tile .MaxOutStandingTrans (snitch_pkg::NumIntOutstandingLoads), .NrTCDM (2 ), .NrSoC (1 ), - .NumRules (3 ), - .ByteOffset (ByteOffset ), - .NumTiles (NumTiles ), - .NumTilesPerDma (NumTilesPerDma ), - .NumBanksPerTile (NumBanksPerTile ), - .SeqMemSizePerTile (SeqMemSizePerTile ) + .NumRules (3 ) ) i_tcdm_shim ( .clk_i (clk_i ), .rst_ni (rst_ni ), diff --git a/hardware/src/tcdm_shim.sv b/hardware/src/tcdm_shim.sv index d1ca964f..5da72f99 100644 --- a/hardware/src/tcdm_shim.sv +++ b/hardware/src/tcdm_shim.sv @@ -17,13 +17,6 @@ module tcdm_shim parameter int unsigned NrTCDM = 2 , parameter int unsigned NrSoC = 1 , parameter int unsigned NumRules = 1 , // Routing rules - - parameter int unsigned ByteOffset = 2 , - parameter int unsigned NumTiles = 256 , - parameter int unsigned NumTilesPerDma = 16 , - parameter int unsigned NumBanksPerTile = 16 , - parameter int unsigned SeqMemSizePerTile = 4*1024 , - localparam int unsigned StrbWidth = DataWidth/8 , localparam int unsigned NumOutput = NrTCDM + NrSoC, localparam int unsigned MetaIdWidth = idx_width(MaxOutStandingTrans) @@ -80,28 +73,10 @@ module tcdm_shim // Includes `include "common_cells/registers.svh" - `define max(a,b) (((a) > (b))? 
(a) : (b)) - - localparam int unsigned BankOffsetBits = $clog2(NumBanksPerTile); - localparam int unsigned TileIdBits = $clog2(NumTiles); - localparam int unsigned TileIdBitsPerDma = `max(1, $clog2(NumTilesPerDma)); - localparam int unsigned ConstantBitsLSB = ByteOffset + BankOffsetBits; - - function automatic logic [TileIdBitsPerDma-1:0] spm_tile_id_remap ( - logic [TileIdBitsPerDma-1:0] data_in, - logic [TileIdBitsPerDma-1:0] idx_i - ); - // if (mempool_pkg::TileIdRemap == 1) begin - // spm_tile_id_remap = data_in + idx_i; - // end else begin - spm_tile_id_remap = data_in; - // end - endfunction dreq_t data_qpayload ; dreq_t [NrSoC-1:0] soc_qpayload ; dreq_t [NrTCDM-1:0] tcdm_qpayload; - logic [NrTCDM-1:0][AddrWidth-1:0] tcdm_qpayload_addr_remapped; dresp_t data_ppayload ; dresp_t [NrSoC-1:0] soc_ppayload ; @@ -184,21 +159,8 @@ module tcdm_shim ); // Connect TCDM output ports - always_comb begin - for (int i = 0; i < NrTCDM; i++) begin - tcdm_qpayload_addr_remapped[i] = tcdm_qpayload[i].addr; - if (tcdm_qpayload[i].addr >= (NumTiles * SeqMemSizePerTile)) begin - tcdm_qpayload_addr_remapped[i][ConstantBitsLSB +: TileIdBitsPerDma] = - spm_tile_id_remap( - tcdm_qpayload[i].addr[ConstantBitsLSB +: TileIdBitsPerDma], - tcdm_qpayload[i].addr[(ConstantBitsLSB + TileIdBits) +: TileIdBitsPerDma] - ); - end - end - end - for (genvar i = 0; i < NrTCDM; i++) begin : gen_tcdm_con - assign tcdm_req_tgt_addr_o[i] = tcdm_qpayload_addr_remapped[i] ; + assign tcdm_req_tgt_addr_o[i] = tcdm_qpayload[i].addr ; assign tcdm_req_wdata_o[i] = tcdm_qpayload[i].data ; assign tcdm_req_amo_o[i] = tcdm_qpayload[i].amo ; assign tcdm_req_id_o[i] = tcdm_qpayload[i].id ; diff --git a/hardware/src/tcdm_wide_narrow_mux.sv b/hardware/src/tcdm_wide_narrow_mux.sv deleted file mode 100644 index 33382130..00000000 --- a/hardware/src/tcdm_wide_narrow_mux.sv +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. 
-// Solderpad Hardware License, Version 0.51, see LICENSE for details. -// SPDX-License-Identifier: SHL-0.51 - -// Author: Samuel Riedel - -// This module multiplexes many narrow ports and one wide port onto many narrow -// ports. The wide port is prioritized. -module tcdm_wide_narrow_mux #( - // Width of narrow data. - parameter int unsigned NarrowDataWidth = 0, - // Width of wide data. - parameter int unsigned WideDataWidth = 0, - // Request type of narrow inputs. - parameter type narrow_req_t = logic, - // Response type of narrow inputs. - parameter type narrow_rsp_t = logic, - // Request type of wide inputs. - parameter type wide_req_t = logic, - // Response type of wide inputs. - parameter type wide_rsp_t = logic, - // Group ID type, FlooNoC Added - parameter type group_id_t = logic, - // Derived. *Do not override* - // Number of narrow inputs. - parameter int unsigned NrPorts = WideDataWidth / NarrowDataWidth -) ( - input logic clk_i, - input logic rst_ni, - // Group ID, FlooNoC Added - input group_id_t group_id_i, - // Narrow inputs - input narrow_req_t [NrPorts-1:0] slv_narrow_req_i, - input logic [NrPorts-1:0] slv_narrow_req_valid_i, - output logic [NrPorts-1:0] slv_narrow_req_ready_o, - output narrow_rsp_t [NrPorts-1:0] slv_narrow_rsp_o, - output logic [NrPorts-1:0] slv_narrow_rsp_valid_o, - input logic [NrPorts-1:0] slv_narrow_rsp_ready_i, - // Wide input - input wide_req_t slv_wide_req_i, - input logic slv_wide_req_valid_i, - output logic slv_wide_req_ready_o, - output wide_rsp_t slv_wide_rsp_o, - output logic slv_wide_rsp_valid_o, - input logic slv_wide_rsp_ready_i, - // Multiplexed outputs - output narrow_req_t [NrPorts-1:0] mst_req_o, - output logic [NrPorts-1:0] mst_req_wide_o, - output logic [NrPorts-1:0] mst_req_valid_o, - input logic [NrPorts-1:0] mst_req_ready_i, - input narrow_rsp_t [NrPorts-1:0] mst_rsp_i, - input logic [NrPorts-1:0] mst_rsp_wide_i, - input logic [NrPorts-1:0] mst_rsp_valid_i, - output logic [NrPorts-1:0] mst_rsp_ready_o -); - 
- localparam int unsigned NarrowBeWidth = NarrowDataWidth/8; - - // Request path - logic [NrPorts-1:0] forked_wide_req_valid; - logic [NrPorts-1:0] forked_wide_req_ready; - - // Fork the wide request into multiple narrow ones - stream_fork #( - .N_OUP (NrPorts) - ) i_wide_stream_fork ( - .clk_i (clk_i ), - .rst_ni (rst_ni ), - .valid_i(slv_wide_req_valid_i ), - .ready_o(slv_wide_req_ready_o ), - .valid_o(forked_wide_req_valid), - .ready_i(forked_wide_req_ready) - ); - - always_comb begin - // Feed-through narrow ports by default - mst_req_valid_o = slv_narrow_req_valid_i; - slv_narrow_req_ready_o = mst_req_ready_i; - mst_req_wide_o = '0; - mst_req_o = slv_narrow_req_i; - // Block wide by default - forked_wide_req_ready = '0; - - for (int i = 0; i < NrPorts; i++) begin - if (forked_wide_req_valid[i]) begin - // Select the wide port - mst_req_valid_o[i] = forked_wide_req_valid[i]; - forked_wide_req_ready[i] = mst_req_ready_i[i]; - mst_req_wide_o[i] = 1'b1; - mst_req_o[i] = '{ - wdata: slv_wide_req_i.wdata[i*NarrowDataWidth+:NarrowDataWidth], - wen: slv_wide_req_i.wen, - be: slv_wide_req_i.be[i*NarrowBeWidth+:NarrowBeWidth], - tgt_addr: slv_wide_req_i.tgt_addr, - ini_addr: '0, - src_group_id: group_id_i // FlooNoC Added - }; - // Block access from narrow ports. 
- slv_narrow_req_ready_o[i] = 1'b0; - end - end - end - - // Response path - logic [NrPorts-1:0] forked_wide_rsp_valid; - logic [NrPorts-1:0] forked_wide_rsp_ready; - - // Join the multiple narrow requests into one wide one - stream_join #( - .N_INP (NrPorts) - ) i_wide_stream_join ( - .inp_valid_i(forked_wide_rsp_valid), - .inp_ready_o(forked_wide_rsp_ready), - .oup_valid_o(slv_wide_rsp_valid_o ), - .oup_ready_i(slv_wide_rsp_ready_i ) - ); - - always_comb begin - // Broadcast data - slv_narrow_rsp_o = mst_rsp_i; - // Tie off both interfaces by default - slv_narrow_rsp_valid_o = '0; - forked_wide_rsp_valid = '0; - mst_rsp_ready_o = '0; - for (int i = 0; i < NrPorts; i++) begin - // Broadcast data from all banks. - slv_wide_rsp_o.rdata[i*NarrowDataWidth+:NarrowDataWidth] = mst_rsp_i[i].rdata; - // Connect handshake based on selection - if (mst_rsp_wide_i[i]) begin - forked_wide_rsp_valid[i] = mst_rsp_valid_i[i]; - mst_rsp_ready_o[i] = forked_wide_rsp_ready[i]; - end else begin - slv_narrow_rsp_valid_o[i] = mst_rsp_valid_i[i]; - mst_rsp_ready_o[i] = slv_narrow_rsp_ready_i[i]; - end - end - end - - // Check parameters - if (NrPorts*NarrowDataWidth != WideDataWidth) begin - $error("[tcdm_wide_narrow_mux] WideDataWidth must be divisible by NarrowDataWidth."); - end -endmodule From 7b7a10cb59a6088d96c8183a70f54b2427c8f59a Mon Sep 17 00:00:00 2001 From: Yinrong Li Date: Wed, 10 Jun 2026 22:34:40 +0200 Subject: [PATCH 2/4] [script] Fix the VCS simulation flow. 
--- hardware/Makefile | 16 ++- hardware/scripts/vcs/dump_all.tcl | 9 ++ hardware/scripts/vcs/run.tcl | 14 ++- hardware/scripts/vcs/wave.tcl | 145 ++++++---------------------- hardware/scripts/vcs/wave_cache.tcl | 28 ------ hardware/scripts/vcs/wave_core.tcl | 141 --------------------------- hardware/scripts/vcs/wave_tile.tcl | 57 ----------- 7 files changed, 66 insertions(+), 344 deletions(-) create mode 100644 hardware/scripts/vcs/dump_all.tcl delete mode 100644 hardware/scripts/vcs/wave_cache.tcl delete mode 100644 hardware/scripts/vcs/wave_core.tcl delete mode 100644 hardware/scripts/vcs/wave_tile.tcl diff --git a/hardware/Makefile b/hardware/Makefile index 055670f9..2f060c55 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -35,6 +35,7 @@ questa_config ?= vcs_version ?= 2024.09-zr vcs_cmd ?= vcs-$(vcs_version) vcs_config ?= +vcs_gui ?= -verdi # Path to the application binaries app_path ?= $(abspath $(ROOT_DIR)/../software/bin) # Bender @@ -96,6 +97,8 @@ vlogan_args += -assert svaext +v2k -override_timescale=1ns/1ps -kdb ifdef preload vcs_args += +PRELOAD=$(preload) endif +vcs_assert_args ?= -assert nopostproc +vcs_args += $(vcs_assert_args) # ============================================================================ # DPI and Trace Configuration @@ -365,13 +368,20 @@ $(buildpath)/mempool_simvopt: $(buildpath)/compilevcs.sh $(buildpath)/$(dpi_libr $(vcs_cmd) vcs -full64 $(top_level) -cc $(CC) -cpp $(CXX) -ld $(CXX) $(dpi_library)/mempool_vcs_dpi.so $(vcs_config) -assert disable_cover -o mempool_simvopt # Simulation -simvcs: compile_vcs_simv +simvcs: clean-dasm compile_vcs_simv cd $(buildpath) && \ - ./mempool_simv $(vcs_args) -ucli -l transcript -do ../scripts/vcs/run.tcl -gui + ./mempool_simv $(vcs_args) $(vcs_gui) -ucli -do ../scripts/vcs/run.tcl -l transcript + ./scripts/return_status.sh $(buildpath)/transcript -simcvcs: compile_vcs_simvopt +simcvcs: clean-dasm compile_vcs_simvopt cd $(buildpath) && \ ./mempool_simvopt $(vcs_args) -l transcript + 
./scripts/return_status.sh $(buildpath)/transcript + +simcvcs_fsdb: clean-dasm compile_vcs_simv + cd $(buildpath) && \ + ./mempool_simv $(vcs_args) -ucli -do ../scripts/vcs/dump_all.tcl -l transcript + ./scripts/return_status.sh $(buildpath)/transcript # DPIs .PHONY: dpivcs diff --git a/hardware/scripts/vcs/dump_all.tcl b/hardware/scripts/vcs/dump_all.tcl new file mode 100644 index 00000000..4feb2aca --- /dev/null +++ b/hardware/scripts/vcs/dump_all.tcl @@ -0,0 +1,9 @@ +# Copyright 2021 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 + +# Headless full-signal FSDB dump (no GUI). Like run.tcl but skips the nWave +# window population (the wv* commands need the Verdi GUI). +dump -file mempool.fsdb -type FSDB +dump -add /mempool_tb -depth 0 -aggregates -fsdb_opt +mda+packedmda+struct +run \ No newline at end of file diff --git a/hardware/scripts/vcs/run.tcl b/hardware/scripts/vcs/run.tcl index e60f4829..cb29716b 100644 --- a/hardware/scripts/vcs/run.tcl +++ b/hardware/scripts/vcs/run.tcl @@ -2,5 +2,17 @@ # Solderpad Hardware License, Version 0.51, see LICENSE for details. # SPDX-License-Identifier: SHL-0.51 -do ./wave.tcl +# VCS + Verdi run script. Under `-verdi` the simv `-do` file is played by Verdi's +# TclPlay console, which runs both simulator (dump/run) and Verdi/nWave commands. +# +# 1. Full-signal FSDB dump (= QuestaSim `log -r *`): -depth 0 = all levels, +# -aggregates keeps structs/arrays, -fsdb_opt also captures memories. +dump -file mempool.fsdb -type FSDB +dump -add /mempool_tb -depth 0 -aggregates -fsdb_opt +mda+packedmda+struct + +# 2. Populate nWave. wave.tcl uses Verdi-only console commands, so source it +# here (under -verdi), not via -ucli. +source ../scripts/vcs/wave.tcl + +# 3. Run until the testbench's $finish. 
run diff --git a/hardware/scripts/vcs/wave.tcl b/hardware/scripts/vcs/wave.tcl index 97fb29d3..5c1fa6fe 100644 --- a/hardware/scripts/vcs/wave.tcl +++ b/hardware/scripts/vcs/wave.tcl @@ -2,117 +2,34 @@ # Solderpad Hardware License, Version 0.51, see LICENSE for details. # SPDX-License-Identifier: SHL-0.51 -# Create an nWave window -wvCreateWindow - -# Add a vector of the core's wfi signal to quickly see which cores are active -wvAddGroup wfi -wvAddSignal -group {wfi {mempool_tb/wfi}} - -# Add min function (DVE does not support TCL8.5) -proc min args { - set minval [lindex args 0] - foreach arg $args { - if { $arg < $minval } { - set minval $arg - } - } - return $minval -} - -# Add all cores from group 0 tile 0 -set group 0 -set tile 0 -for {set core 0} {$core < [min 4 [get -radix dec mempool_pkg::NumCoresPerTile]]} {incr core} { - source ../scripts/vcs/wave_core.tcl -} - -# Add specific cores from different tiles -set group 1 -set tile 0 -set core 0 -source ../scripts/vcs/wave_core.tcl - -# Add groups -for {set group 0} {$group < [get -radix dec mempool_pkg::NumGroups]} {incr group} { - # Create the group - wvAddGroup group\[$group\] - - # Add tiles - for {set tile 0} {$tile < [min 2 [get -radix dec mempool_pkg::NumTilesPerGroup]]} {incr tile} { - source ../scripts/vcs/wave_tile.tcl - } - - # Interconnects - for {set tgtgroup 0} {$tgtgroup < [get -radix dec mempool_pkg::NumGroups]} {incr tgtgroup} { - if {$tgtgroup != $group} { - set interco_idx [expr $group ^ $tgtgroup] - wvSelectGroup group\[$group\] - wvAddSubGroup interconnect_to_group\[$tgtgroup\] - wvSetPosition [subst {(group\[$group\]/interconnect_to_group\[$tgtgroup\] last)}] - - wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/clk_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/rst_ni \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_valid_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_ready_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_tgt_addr_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_wen_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_wdata_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_be_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/resp_valid_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/resp_ready_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/resp_rdata_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_valid_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_ready_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_ini_addr_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_tgt_addr_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_wen_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_wdata_o \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/req_be_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/resp_valid_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/resp_ready_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/resp_ini_addr_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_remote_interco\[$interco_idx\]/i_remote_interco/resp_rdata_i - } - } - - wvSelectGroup group\[$group\] - wvAddSubGroup local_interconnect - wvSetPosition [subst {(group\[$group\]/local_interconnect last)}] - - wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/clk_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/rst_ni \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_valid_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_ready_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_tgt_addr_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_wen_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_wdata_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_be_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/resp_valid_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/resp_ready_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/resp_rdata_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_valid_o \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_ready_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_ini_addr_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_tgt_addr_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_wen_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_wdata_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/req_be_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/resp_valid_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/resp_ready_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/resp_ini_addr_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/i_local_interco/resp_rdata_i -} - -wvAddGroup Control_Registers -wvSetPosition {(Control_Registers last)} -wvAddSignal mempool_tb/dut/i_ctrl_registers/clk_i \ - mempool_tb/dut/i_ctrl_registers/rst_ni \ - mempool_tb/dut/i_ctrl_registers/axi_lite_slave_req_i \ - mempool_tb/dut/i_ctrl_registers/axi_lite_slave_resp_o \ - mempool_tb/dut/i_ctrl_registers/eoc_o \ - mempool_tb/dut/i_ctrl_registers/eoc_valid_o \ - mempool_tb/dut/i_ctrl_registers/wake_up_o \ - mempool_tb/dut/i_ctrl_registers/tcdm_start_address_o \ - mempool_tb/dut/i_ctrl_registers/tcdm_end_address_o \ - mempool_tb/dut/i_ctrl_registers/num_cores_o +# VCS + Verdi (nWave) curated overview for the 2D-mesh FlooNoC topology. +# `make simvcs` auto-sources this from run.tcl; re-source any time from Verdi's +# Tcl console: source ../scripts/vcs/wave.tcl. run.tcl dumps EVERY signal to the +# FSDB, so anything not listed here can still be dragged into nWave (no re-run). 
+# +# GOTCHA: interpreted by VERDI (nWave), NOT the UCLI shell -- use only `wv*` +# commands (no `dump`/`run`/`get`) and hard-code loop bounds (no +# `get mempool_pkg::...`). Wrap `[0]`/`[3]` indices in {braces} so Tcl does not +# treat them as command substitution. + +# GOTCHA: capture wvCreateWindow's return -- it does NOT auto-set $_nWave2 from +# the -do/TclPlay console, so `wvAddSignal -win $_nWave2` would otherwise fail. +set _nWave2 [wvCreateWindow] + +# --- System overview (testbench level) --- +wvAddSignal -win $_nWave2 {/mempool_tb/wfi} +wvAddSignal -win $_nWave2 {/mempool_tb/eoc_valid} +wvAddSignal -win $_nWave2 {/mempool_tb/snitch_utilization} +wvAddSignal -win $_nWave2 {/mempool_tb/lsu_utilization} + +# --- Control registers (end-of-computation / wake-up) --- +wvAddSignal -win $_nWave2 {/mempool_tb/dut/i_ctrl_registers/eoc_o} +wvAddSignal -win $_nWave2 {/mempool_tb/dut/i_ctrl_registers/eoc_valid_o} +wvAddSignal -win $_nWave2 {/mempool_tb/dut/i_ctrl_registers/wake_up_o} + +# --- Group 0 / Tile 0, core 0 (snitch front-end) --- +wvAddSignal -win $_nWave2 {/mempool_tb/dut/i_mempool_cluster/gen_groups_x[0]/gen_groups_y[0]/gen_rtl_group/i_group/i_mempool_group/gen_tiles[0]/i_tile/gen_cores[0]/gen_mempool_cc/riscv_core/i_snitch/pc_q} +wvAddSignal -win $_nWave2 {/mempool_tb/dut/i_mempool_cluster/gen_groups_x[0]/gen_groups_y[0]/gen_rtl_group/i_group/i_mempool_group/gen_tiles[0]/i_tile/gen_cores[0]/gen_mempool_cc/riscv_core/i_snitch/wfi_q} + + +wvZoomAll -win $_nWave2 diff --git a/hardware/scripts/vcs/wave_cache.tcl b/hardware/scripts/vcs/wave_cache.tcl deleted file mode 100644 index bc198935..00000000 --- a/hardware/scripts/vcs/wave_cache.tcl +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright 2021 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. 
-# SPDX-License-Identifier: SHL-0.51 - -# Create cache for core $3 from group $1 tile $2 (core_id=NUM_CORES_PER_group*$1+NUM_CORES_PER_TILE*$2+$3) - -add_wave -group core\[$1\]\[$2\]\[$3\] -divider Parameters -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/NR_FETCH_PORTS -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/L0_LINE_COUNT -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/LINE_WIDTH -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/LINE_COUNT -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/SET_COUNT -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/FETCH_DW -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/FILL_AW -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/FILL_DW -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/EARLY_LATCH -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/L0_EARLY_TAG_WIDTH -add_wave -group cache\[$1\]\[$2\]\[$3\] 
/mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/ISO_CROSSING -add_wave -group core\[$1\]\[$2\]\[$3\] -divider Signals -add_wave -group cache\[$1\]\[$2\]\[$3\] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/* - -for {set i 0} {$i < [get -radix dec /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/NR_FETCH_PORTS]} {incr i} { - add_wave -group cache\[$1\]\[$2\]\[$3\]|refill[$i] /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/gen_prefetcher[$i]/i_snitch_icache_l0/* -} - -add_wave -group cache\[$1\]\[$2\]\[$3\]|lookup /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/gen_serial_lookup/i_lookup/* -add_wave -group cache\[$1\]\[$2\]\[$3\]|handler /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/i_handler/* -add_wave -group cache\[$1\]\[$2\]\[$3\]|refill /mempool_tb/dut/i_mempool_cluster/gen_groups\[$1\]/i_group/gen_tiles\[$2\]/i_tile/gen_caches\[$3\]/i_snitch_icache/i_refill/* diff --git a/hardware/scripts/vcs/wave_core.tcl b/hardware/scripts/vcs/wave_core.tcl deleted file mode 100644 index 2f4ef0be..00000000 --- a/hardware/scripts/vcs/wave_core.tcl +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright 2021 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. 
-# SPDX-License-Identifier: SHL-0.51 - -# Create group for core $core from group $group tile $tile (core_id=NUM_CORES_PER_group*$group+NUM_CORES_PER_TILE*$tile+$core) - -wvAddGroup core\[$group\]\[$tile\]\[$core\] -wvSetPosition [subst {(core\[$group\]\[$tile\]\[$core\] last)}] - -add_wave -divider Clock -wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/clk_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/rst_ni \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/hart_id_i - -add_wave -divider Instructions -wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/inst_addr_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/inst_data_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/inst_valid_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/inst_ready_i - -add_wave -divider "Load and Store" -wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_qaddr_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_qwrite_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_qamo_o \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_qdata_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_qstrb_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_qvalid_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_qready_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_pdata_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_perror_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_pvalid_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/data_pready_o - -add_wave -divider Accelerator -wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_qaddr_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_qid_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_qdata_op_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_qdata_arga_o \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_qdata_argb_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_qdata_argc_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_qvalid_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_qready_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_pdata_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_pid_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_perror_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_pvalid_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_pready_o \ - -wvSelectGroup core\[$group\]\[$tile\]\[$core\] -wvAddSubGroup Internal -wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/clk_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/rst_ni \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/illegal_inst \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/stall \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/lsu_stall \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_stall \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/zero_lsb \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/pc_d \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/pc_q \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/wfi_d \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/wfi_q \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/wake_up_sync_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/wake_up_d \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/wake_up_q -add_wave -divider LSU -wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/ls_size \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/ls_amo \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/ld_result \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/lsu_qready \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/lsu_qvalid \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/lsu_pvalid \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/lsu_pready \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/lsu_rd \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/retire_load \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/retire_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/retire_acc -add_wave -divider ALU -wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/opa \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/opb \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/iimm \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/uimm \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/jimm \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/bimm \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/simm \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/adder_result \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/alu_result \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/rd \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/rs1 \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/rs2 \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/gpr_raddr \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/gpr_rdata \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/gpr_waddr \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/gpr_wdata \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/gpr_we \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/consec_pc \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/sb_d \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/sb_q \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/is_load \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/is_store \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/is_signed \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/is_fp_load \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/is_fp_store \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/ls_misaligned \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/ld_addr_misaligned \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/st_addr_misaligned \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/valid_instr \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/exception \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/alu_op \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/opa_select \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/opb_select \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/write_rd \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/uses_rd \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/next_pc \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/rd_select \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/rd_bypass \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/is_branch \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/csr_rvalue \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/csr_en \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/cycle_q \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/instret_q \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/acc_register_rd \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/operands_ready \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/dst_ready \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/opa_ready \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/opb_ready \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/shift_opa \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/shift_opa_reversed \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/shift_right_result \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/shift_left_result \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/shift_opa_ext \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/shift_right_result_ext \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/shift_left \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/shift_arithmetic \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/alu_opa \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/alu_opb \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/alu_writeback \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/csr_trace_q \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/csr_trace_en \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/gen_cores\[$core\]/gen_mempool_cc/riscv_core/i_snitch/core_events_o - -wvCollapseAllGroups diff --git a/hardware/scripts/vcs/wave_tile.tcl b/hardware/scripts/vcs/wave_tile.tcl deleted file mode 100644 index 20f06693..00000000 --- a/hardware/scripts/vcs/wave_tile.tcl +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright 
2021 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# Create group for group $group tile $tile - -wvSelectGroup group\[$group\] -wvAddSubGroup tile\[$tile\] - -wvSetPosition [subst {(group\[$group\]/tile\[$tile\] last)}] - -add_wave -divider Clock -wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/clk_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/rst_ni \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tile_id_i \ - -add_wave -divider TCDM -wvAddSignal mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_master_req_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_master_req_valid_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_master_req_ready_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_master_resp_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_master_resp_valid_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_master_resp_ready_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_slave_req_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_slave_req_valid_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_slave_req_ready_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_slave_resp_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_slave_resp_valid_o \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/tcdm_slave_resp_ready_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/axi_mst_req_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/axi_mst_resp_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_inst_addr \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_inst_data \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_inst_valid \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_inst_ready \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_qaddr \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_qwrite \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_qamo \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_qdata \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_qstrb \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_qid \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_qvalid \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_qready \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_pdata \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_perror \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_pid \ - 
mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_pvalid \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/snitch_data_pready \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/mask_map \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/soc_req_o \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/soc_resp_i \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/soc_qvalid \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/soc_qready \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/soc_pvalid \ - mempool_tb/dut/i_mempool_cluster/gen_groups\[$group\]/i_group/gen_tiles\[$tile\]/i_tile/soc_pready - -wvCollapseAllGroups From 4559112aa577c2a31ba1e0c716a494a1775d703c Mon Sep 17 00:00:00 2001 From: Yinrong Li Date: Thu, 11 Jun 2026 03:29:15 +0200 Subject: [PATCH 3/4] [hardware] Add NoC, SPM, and PE-port profiling traces. 
--- hardware/src/mempool_pkg.sv | 44 - hardware/src/mempool_tile.sv | 36 +- hardware/tb/mempool_tb.sv | 1 + hardware/tb/tb_noc_profiling.svh | 2001 +++++------------------------- hardware/tb/tb_spm_profiling.svh | 349 ++++++ 5 files changed, 709 insertions(+), 1722 deletions(-) create mode 100644 hardware/tb/tb_spm_profiling.svh diff --git a/hardware/src/mempool_pkg.sv b/hardware/src/mempool_pkg.sv index 1771b6d9..92b372a0 100644 --- a/hardware/src/mempool_pkg.sv +++ b/hardware/src/mempool_pkg.sv @@ -505,50 +505,6 @@ package mempool_pkg; int unsigned write_cycles[$]; // dynamic array to store cycles of write accesses } profile_t; - // tile level profiling - typedef struct { - // tile remote ports profile - int unsigned req_vld_cyc_num[NumRemoteReqPortsPerTile-1]; - int unsigned req_hsk_cyc_num[NumRemoteReqPortsPerTile-1]; - } tile_level_profile_t; - - // group level profiling - typedef struct { - // group xbar ports profile - int unsigned req_vld_cyc_num [NumRemoteReqPortsPerTile-1]; - int unsigned req_hsk_cyc_num [NumRemoteReqPortsPerTile-1]; - int unsigned req_vld_cyc_more_than_one_hit_same_bank_num; - } group_level_profile_t; - - // router level profile - typedef struct { - // noc router ports profile - int unsigned in_vld_cyc_num [4]; // 4: 4 directions - int unsigned in_hsk_cyc_num [4]; // 4: 4 directions - int unsigned out_vld_cyc_num[4]; // 4: 4 directions - int unsigned out_hsk_cyc_num[4]; // 4: 4 directions - } router_level_profile_t; - - // router local ports profile - // noc router local req ports profile - typedef struct { - int unsigned read_req_num; - int unsigned write_req_num; - } router_local_req_port_profile_t; - // noc router local resp ports profile - typedef struct { - int unsigned req_num; - } router_local_resp_port_profile_t; - - typedef struct { - // noc router ports profile - int unsigned in_vld_cyc_num [5]; - int unsigned in_hsk_cyc_num [5]; - int unsigned hol_stall_cyc_num [5]; - int unsigned out_congst_cyc_num [5][5]; - int unsigned 
cur_stall_cyc_num [5]; - int unsigned max_stall_cyc_num [5]; - } router_input_profile_t; `endif `endif diff --git a/hardware/src/mempool_tile.sv b/hardware/src/mempool_tile.sv index e0e9539e..a660e673 100644 --- a/hardware/src/mempool_tile.sv +++ b/hardware/src/mempool_tile.sv @@ -373,11 +373,8 @@ module mempool_tile logic [NumBanksPerTile-1:0] bank_resp_wide; local_resp_interco_addr_t[NumBanksPerTile-1:0] bank_resp_ini_addr; - // gen_superbank_resp_ini_addr conditions bank_resp_ini_addr (the local - // input-port index stored in the bank's metadata) into the resp xbar's - // sel, handling NumRemoteReqPortsPerTile vs NumRemoteRespPortsPerTile - // asymmetry. The conditioned value feeds mempool_tcdm_bank_interco's - // mst_resp_ini_addr_i input. + // Condition bank_resp_ini_addr into the resp xbar's sel, handling + // req/resp port-count asymmetry; feeds i_tcdm_bank_interco.mst_resp_ini_addr_i. for (genvar b = 0; unsigned'(b) < NumBanksPerTile; b++) begin: gen_superbank_resp_ini_addr if(NumRemoteReqPortsPerTile > NumRemoteRespPortsPerTile ) begin: gen_superbank_resp_ini_addr_req_gt_resp always_comb begin @@ -434,13 +431,16 @@ module mempool_tile end end - // (mempool_tcdm_bank_interco is instantiated further below, after the - // signal declarations for local_req_interco_payload / postreg_tcdm_slave_req - // / etc. that it connects to.) + // mempool_tcdm_bank_interco is instantiated further below, after its + // connected signal declarations. `ifndef TARGET_SYNTHESIS `ifndef TARGET_VERILATOR + // DISABLED (commented out): heavy per-word SPM profiler (profile_d) — its + // unbounded [bank][2^addr] cycle lists balloon VCS memory; superseded by the + // lightweight per-bank trace in tb/tb_spm_profiling.svh. 
`ifdef SPM_PROFILING + /* logic [63:0] cycle_q; profile_t profile_d [NumBanksPerTile-1:0][2**TCDMAddrMemWidth-1:0]; // profile_t profile_q [NumBanksPerTile-1:0][2**TCDMAddrMemWidth-1:0]; @@ -453,6 +453,7 @@ module mempool_tile // profile_q <= profile_d; end end + */ `endif `endif `endif @@ -535,6 +536,8 @@ module mempool_tile `ifndef TARGET_SYNTHESIS `ifndef TARGET_VERILATOR `ifdef SPM_PROFILING + // DISABLED (commented out) with the profile_d decl above. + /* always_ff @(posedge clk_i or negedge rst_ni) begin // profile_d[b] = profile_q[b]; if(~rst_ni) begin @@ -568,6 +571,7 @@ module mempool_tile end end end + */ `endif `endif `endif @@ -744,9 +748,8 @@ module mempool_tile tcdm_slave_resp_t [NumCoresPerTile-1:0] local_resp_interco_payload; addr_t [NumCoresPerTile-1:0] local_req_interco_addr_int; - // Bank-side TCDM interconnect: combines narrow-req routing, narrow-resp - // routing, wide-DMA superbank demux/mux, wide-narrow priority arbitration - // at each bank, and within-superbank bank-id remap. + // Bank-side TCDM interconnect: narrow req/resp routing, wide-DMA superbank + // demux/mux, per-bank wide-narrow arbitration, and within-superbank bank-id remap. mempool_tcdm_bank_interco #( .NumNarrowReq (NumCoresPerTile + NumRemoteReqPortsPerTile), .NumNarrowResp (NumCoresPerTile + NumRemoteRespPortsPerTile), @@ -766,14 +769,12 @@ module mempool_tile .clk_i (clk_i), .rst_ni (rst_ni), .group_id_i (group_id), - // Narrow request inputs: cores at [0..NumCoresPerTile-1], remote slaves - // at [NumCoresPerTile..NumCoresPerTile+NumRemoteReqPortsPerTile-1] - // (matches the legacy concat order in i_local_req_interco). + // Narrow req inputs: cores at low indices, remote slaves above; concat + // order must match the legacy i_local_req_interco order. 
.slv_narrow_req_i ({postreg_tcdm_slave_req, local_req_interco_payload}), .slv_narrow_req_valid_i ({postreg_tcdm_slave_req_valid, local_req_interco_valid }), .slv_narrow_req_ready_o ({postreg_tcdm_slave_req_ready, local_req_interco_ready }), - // Narrow response outputs: cores at [0..NumCoresPerTile-1], remote resp - // ports at [NumCoresPerTile..NumCoresPerTile+NumRemoteRespPortsPerTile-1]. + // Narrow resp outputs: cores at low indices, remote resp ports above. .slv_narrow_resp_o ({prereg_tcdm_slave_resp, local_resp_interco_payload}), .slv_narrow_resp_valid_o({prereg_tcdm_slave_resp_valid, local_resp_interco_valid }), .slv_narrow_resp_ready_i({prereg_tcdm_slave_resp_ready, local_resp_interco_ready }), @@ -794,8 +795,7 @@ module mempool_tile .mst_resp_valid_i (bank_resp_valid ), .mst_resp_ready_o (bank_resp_ready ), .mst_resp_wide_i (bank_resp_wide ), - // gen_superbank_resp_ini_addr conditions bank_resp_ini_addr into the - // resp xbar's sel (handles req/resp port count asymmetry). + // Conditioned resp xbar sel (see gen_superbank_resp_ini_addr above). .mst_resp_ini_addr_i (superbank_resp_ini_addr) ); diff --git a/hardware/tb/mempool_tb.sv b/hardware/tb/mempool_tb.sv index 4ce4fc14..d389276f 100644 --- a/hardware/tb/mempool_tb.sv +++ b/hardware/tb/mempool_tb.sv @@ -457,5 +457,6 @@ module mempool_tb; * NoC Profiling * ****************/ `include "tb_noc_profiling.svh" +`include "tb_spm_profiling.svh" endmodule : mempool_tb diff --git a/hardware/tb/tb_noc_profiling.svh b/hardware/tb/tb_noc_profiling.svh index 73622ae0..2fe24a93 100644 --- a/hardware/tb/tb_noc_profiling.svh +++ b/hardware/tb/tb_noc_profiling.svh @@ -17,1116 +17,224 @@ end `ifdef NOC_PROFILING - string app, log_path, dump_time; + // Hierarchical path to group g's FlooNoC wrapper (i_group). 
+ `define NOC_GRP(grp) dut.i_mempool_cluster.gen_groups_x[(grp) / NumY].gen_groups_y[(grp) % NumY].gen_rtl_group.i_group + + // ONE FULL per-router / per-tile NoC trace; the export-time --noc-slices flag + // chooses the granularity. Each line is one of: + // S <portidx> <io> <start> <end> <state> run-length: idle(0)/stall(1)/read(2)/write(3) + // P <portidx> <io> <cycle> ... one accepted REQUEST flit (addr + XY routing), emitted every handshake cycle + // portidx 0..3 = N/E/S/W mesh ports, 4 = local; io 0=input, 1=output. + // To stay under VCS's open-file limit, files are MERGED and lines PREFIXED with a demux <idx>: + // routers -> router_g<g>_{req,resp}.log (one per group; <idx> = in-group router id + // <rid>, a flat slot t*ports+p -- routers are remapped so NOT tile/port) + // cores -> pe_g<g>_t<t>.log (one per tile; <idx> = core) + // tiles -> tile_g<g>_t<t>.log (already one per tile; NO prefix) + string app, log_path; integer retval; - // File handles and filenames for various profiling logs - int f_2, f_3, f_4, f_5; - int f_final_2, f_final_3, f_final_4, f_final_5; - string fn_2, fn_3, fn_4, fn_5; - string fn_final_2, fn_final_3, fn_final_4, fn_final_5; - // Input/output log file descriptors - int req_floo_input_log_fd, resp_floo_input_log_fd; + // MERGED file handles: routers -> ONE req + ONE resp file per group (line tagged with + // router id); Snitch cores -> ONE file per tile (line tagged with core idx). PE = the + // Snitch core's data memory port. 
+ int f_rreq [NumGroups]; + int f_rresp [NumGroups]; + int f_tile [NumGroups][NumTilesPerGroup]; + int f_pe [NumGroups][NumTilesPerGroup]; initial begin - // Read APP name from command line argument void'($value$plusargs("APP=%s", app)); - // Set profiling output path - $sformat(log_path, "noc_profiling"); - // Create log directory - retval = $system({"mkdir -p ", log_path}); - // Open input/output log files - req_floo_input_log_fd = $fopen($sformatf("%s/req_floo_input.log", log_path), "w"); - resp_floo_input_log_fd = $fopen($sformatf("%s/resp_floo_input.log", log_path), "w"); - end - - // ------------------------------------------------------------ - // Profiling structures - // ------------------------------------------------------------ - - tile_level_profile_t tile_level_profile_q [NumGroups-1:0][NumTilesPerGroup-1:0]; - group_level_profile_t group_level_profile_q [NumGroups-1:0]; - - router_level_profile_t router_level_profile_req_q [NumGroups-1:0][NumTilesPerGroup-1:0][NumWideRemoteReqPortsPerTile-1:0]; - router_local_req_port_profile_t router_local_req_port_profile_q [NumGroups-1:0][NumTilesPerGroup-1:0][NumWideRemoteReqPortsPerTile-1:0]; - - router_level_profile_t router_level_profile_resp_q [NumGroups-1:0][NumTilesPerGroup-1:0][NumRemoteRespPortsPerTile-2:0]; - router_local_resp_port_profile_t router_local_resp_port_profile_q [NumGroups-1:0][NumTilesPerGroup-1:0][NumRemoteRespPortsPerTile-2:0]; - - // ------------------------------------------------------------ - // Tile-level profiling: counts valid and handshake cycles - // ------------------------------------------------------------ - - generate - for (genvar g = 0; g < NumGroups; g++) begin : gen_group - for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_tile - always_ff @(posedge clk or negedge rst_n) begin - if (!rst_n) begin - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - tile_level_profile_q[g][t].req_vld_cyc_num[p] <= '0; - tile_level_profile_q[g][t].req_hsk_cyc_num[p] <= 
'0; - end - end else begin - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - tile_level_profile_q[g][t].req_vld_cyc_num[p] <= - tile_level_profile_q[g][t].req_vld_cyc_num[p] + - $countones( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .i_mempool_group - .gen_tiles[t] - .i_tile - .tcdm_master_req_valid_o[p + 1] - ); - - tile_level_profile_q[g][t].req_hsk_cyc_num[p] <= - tile_level_profile_q[g][t].req_hsk_cyc_num[p] + - $countones( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .i_mempool_group - .gen_tiles[t] - .i_tile - .tcdm_master_req_valid_o[p + 1] & - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .i_mempool_group - .gen_tiles[t] - .i_tile - .tcdm_master_req_ready_i[p + 1] - ); - end - end - end - end - end - endgenerate - - // ------------------------------------------------------------ - // Group-level profiling signals - // ------------------------------------------------------------ - - // Count of requests targeting the same bank from multiple tiles - logic [NumGroups-1:0] - [NumTilesPerGroup * NumBanksPerTile - 1:0] - [$clog2(NumTilesPerGroup * (NumRemoteReqPortsPerTile - 1)) : 0] - group_xbar_req_to_same_bank_count; - - // Count of bank access conflicts from multiple requesters - logic [NumGroups-1:0] - [NumTilesPerGroup * NumBanksPerTile - 1:0] - [$clog2(NumTilesPerGroup * (NumRemoteReqPortsPerTile - 1)) : 0] - group_xbar_req_to_same_bank_conflict_count; - - // Sum of conflict counts across all banks in a group - logic [NumGroups-1:0] - [$clog2(NumTilesPerGroup * (NumRemoteReqPortsPerTile - 1)) : 0] - group_xbar_req_to_same_bank_conflict_count_sum; - - // Per-port valid signal for incoming requests to TCDM crossbar - logic [NumX-1:0] - [NumY-1:0] - [NumRemoteReqPortsPerTile - 2:0] - [NumTilesPerGroup-1:0] - tcdm_slave_req_valid; - - // Per-port target address for incoming requests to TCDM crossbar - logic 
[NumX-1:0] - [NumY-1:0] - [NumRemoteReqPortsPerTile - 2:0] - [NumTilesPerGroup-1:0] - [idx_width(NumTilesPerGroup) + idx_width(NumBanksPerTile) - 1 : 0] - tcdm_slave_req_tgt_addr; - - // ------------------------------------------------------------ - // Capture TCDM slave port request valid & address per tile/port - // ------------------------------------------------------------ - generate - for (genvar x_dim = 0; x_dim < NumX; x_dim++) begin : gen_x - for (genvar y_dim = 0; y_dim < NumY; y_dim++) begin : gen_y - for (genvar p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin : gen_port - for (genvar t_i = 0; t_i < NumTilesPerGroup; t_i++) begin : gen_tile - assign tcdm_slave_req_valid[x_dim][y_dim][p][t_i] = - dut.i_mempool_cluster - .gen_groups_x[x_dim] - .gen_groups_y[y_dim] - .i_group - .floo_req_from_router_before_xbar_valid_per_port[p + 1][t_i]; - - assign tcdm_slave_req_tgt_addr[x_dim][y_dim][p][t_i] = - dut.i_mempool_cluster - .gen_groups_x[x_dim] - .gen_groups_y[y_dim] - .i_group - .floo_req_from_router[t_i][p + 1] - .hdr.tgt_addr[ - idx_width(NumTilesPerGroup) + idx_width(NumBanksPerTile) - 1 : 0 - ]; - end - end - end - end - endgenerate - - always_comb begin - group_xbar_req_to_same_bank_count = '0; - + log_path = "noc_profiling"; + retval = $system({"mkdir -p ", log_path}); + // Tiles keep their id. Routers use a FLAT in-group router id (r = t*NumPortsPerTile + p): + // the req/resp remappers shuffle logical traffic across physical routers, so [tile][port] + // is just a physical slot, not a tile/port assignment. 
for (int g = 0; g < NumGroups; g++) begin - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - for (int t_i = 0; t_i < NumTilesPerGroup; t_i++) begin - // If source port from router is valid - if (tcdm_slave_req_valid[g / NumY][g % NumY][p][t_i]) begin - // Then destination port count +1 - group_xbar_req_to_same_bank_count[g][ - tcdm_slave_req_tgt_addr[g / NumY][g % NumY][p][t_i] - ] += 1; - end - end + f_rreq[g] = $fopen($sformatf("%s/router_g%0d_req.log", log_path, g), "w"); + f_rresp[g] = $fopen($sformatf("%s/router_g%0d_resp.log", log_path, g), "w"); + for (int t = 0; t < NumTilesPerGroup; t++) begin + f_tile[g][t] = $fopen($sformatf("%s/tile_g%0d_t%0d.log", log_path, g, t), "w"); + f_pe[g][t] = $fopen($sformatf("%s/pe_g%0d_t%0d.log", log_path, g, t), "w"); end end end - always_comb begin - group_xbar_req_to_same_bank_conflict_count = '0; - group_xbar_req_to_same_bank_conflict_count_sum = '0; - - for (int g = 0; g < NumGroups; g++) begin - for (int b = 0; b < NumTilesPerGroup * NumBanksPerTile; b++) begin - if (group_xbar_req_to_same_bank_count[g][b] > 0) begin - // Minus the one that is not a conflict - group_xbar_req_to_same_bank_conflict_count[g][b] = - group_xbar_req_to_same_bank_count[g][b] - 1; - end - - group_xbar_req_to_same_bank_conflict_count_sum[g] += - group_xbar_req_to_same_bank_conflict_count[g][b]; - end - end - end + // Module-scope state registers so the end-of-sim `final` below can flush them. 
+ // router: [group][tile][port][portidx 0..4][io 0..1] + logic [1:0] rsq_st [NumGroups][NumTilesPerGroup][NumWideRemoteReqPortsPerTile][5][2]; + logic [63:0] rsq_start [NumGroups][NumTilesPerGroup][NumWideRemoteReqPortsPerTile][5][2]; + logic [1:0] rsp_st [NumGroups][NumTilesPerGroup][NumRemoteRespPortsPerTile-1][5][2]; + logic [63:0] rsp_start [NumGroups][NumTilesPerGroup][NumRemoteRespPortsPerTile-1][5][2]; + // tile: [group][tile][port] + logic [1:0] ts_mreq_st [NumGroups][NumTilesPerGroup][NumRemoteReqPortsPerTile]; logic [63:0] ts_mreq_s [NumGroups][NumTilesPerGroup][NumRemoteReqPortsPerTile]; + logic [1:0] ts_sreq_st [NumGroups][NumTilesPerGroup][NumRemoteReqPortsPerTile]; logic [63:0] ts_sreq_s [NumGroups][NumTilesPerGroup][NumRemoteReqPortsPerTile]; + logic [1:0] ts_mrsp_st [NumGroups][NumTilesPerGroup][NumRemoteRespPortsPerTile]; logic [63:0] ts_mrsp_s [NumGroups][NumTilesPerGroup][NumRemoteRespPortsPerTile]; + logic [1:0] ts_srsp_st [NumGroups][NumTilesPerGroup][NumRemoteRespPortsPerTile]; logic [63:0] ts_srsp_s [NumGroups][NumTilesPerGroup][NumRemoteRespPortsPerTile]; + // PE (Snitch core data port): req = data_q* (out, read/write by qwrite), resp = data_p* (in). + logic [1:0] pe_req_st [NumGroups][NumTilesPerGroup][NumCoresPerTile]; logic [63:0] pe_req_s [NumGroups][NumTilesPerGroup][NumCoresPerTile]; + logic [1:0] pe_rsp_st [NumGroups][NumTilesPerGroup][NumCoresPerTile]; logic [63:0] pe_rsp_s [NumGroups][NumTilesPerGroup][NumCoresPerTile]; + // ------------------------------------------------------------ + // Router port capture (4 mesh dirs + local, input + output) -> per-router file. + // S lines RLE idle/stall/read/write; P lines log every accepted request flit + // (req routers only). read/write split via payload.wen. 
+ // ------------------------------------------------------------ generate - for (genvar g = 0; g < NumGroups; g++) begin : gen_group_level_profile + for (genvar g = 0; g < NumGroups; g++) begin : gen_rstate_g always_ff @(posedge clk or negedge rst_n) begin if (!rst_n) begin - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - group_level_profile_q[g].req_vld_cyc_num[p] = '0; - group_level_profile_q[g].req_hsk_cyc_num[p] = '0; - end - group_level_profile_q[g].req_vld_cyc_more_than_one_hit_same_bank_num = '0; + rsq_st[g] <= '{default: '0}; rsq_start[g] <= '{default: '0}; + rsp_st[g] <= '{default: '0}; rsp_start[g] <= '{default: '0}; end else begin - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - group_level_profile_q[g].req_vld_cyc_num[p] += - $countones( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .floo_req_from_router_before_xbar_valid_per_port[p + 1] - [NumTilesPerGroup - 1 : 0] - ); - - group_level_profile_q[g].req_hsk_cyc_num[p] += - $countones( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .floo_req_from_router_before_xbar_valid_per_port[p + 1] - [NumTilesPerGroup - 1 : 0] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .floo_req_from_router_before_xbar_ready_per_port[p + 1] - [NumTilesPerGroup - 1 : 0] - ); - end - - group_level_profile_q[g].req_vld_cyc_more_than_one_hit_same_bank_num += - group_xbar_req_to_same_bank_conflict_count_sum[g]; - end - end - end - endgenerate - - // router level profiling - generate - for (genvar g = 0; g < NumGroups; g++) begin : gen_router_profile_per_group - for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_router_profile_per_tile - for (genvar p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin : gen_req_router_profile_per_remote_port - if (p < NumNarrowRemoteReqPortsPerTile) begin - always_ff @(posedge clk or negedge rst_n) begin - if (!rst_n) begin - 
router_local_req_port_profile_q[g][t][p].read_req_num = '0; - router_local_req_port_profile_q[g][t][p].write_req_num = '0; - - for (int router_p = 0; router_p < 4; router_p++) begin - router_level_profile_req_q[g][t][p].in_vld_cyc_num[router_p] = '0; - router_level_profile_req_q[g][t][p].in_hsk_cyc_num[router_p] = '0; - router_level_profile_req_q[g][t][p].out_vld_cyc_num[router_p] = '0; - router_level_profile_req_q[g][t][p].out_hsk_cyc_num[router_p] = '0; + for (int t = 0; t < NumTilesPerGroup; t++) begin + // ---- wide-req routers ---- + for (int p = 0; p < NumWideRemoteReqPortsPerTile; p++) begin + automatic int rid = t*NumWideRemoteReqPortsPerTile + p; // merged-file router tag + for (int d = 0; d < 4; d++) begin + automatic logic [1:0] si = `NOC_GRP(g).floo_tcdm_wide_req_valid_in_trans[t][p][d] + ? (`NOC_GRP(g).floo_tcdm_wide_req_ready_out_trans[t][p][d] + ? (`NOC_GRP(g).floo_tcdm_wide_req_in_trans[t][p][d].payload.wen ? 2'd3 : 2'd2) : 2'd1) : 2'd0; + automatic logic [1:0] so = `NOC_GRP(g).floo_tcdm_wide_req_valid_out_trans[t][p][d] + ? (`NOC_GRP(g).floo_tcdm_wide_req_ready_in_trans[t][p][d] + ? (`NOC_GRP(g).floo_tcdm_wide_req_out_trans[t][p][d].payload.wen ? 
2'd3 : 2'd2) : 2'd1) : 2'd0; + if (si != rsq_st[g][t][p][d][0]) begin + $fwrite(f_rreq[g], "%0d S %0d 0 %0d %0d %0d\n", rid, d, rsq_start[g][t][p][d][0], cycle_q, rsq_st[g][t][p][d][0]); + rsq_st[g][t][p][d][0] <= si; rsq_start[g][t][p][d][0] <= cycle_q; end - end else begin - router_local_req_port_profile_q[g][t][p].read_req_num += - $countones( - (|( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_narrow_req_router_j[p] - .i_floo_narrow_req_router.valid_i[0] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_narrow_req_router_j[p] - .i_floo_narrow_req_router.ready_i[0] - )) & ~dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_narrow_req_router_j[p] - .i_floo_narrow_req_router.data_i[0][0].payload.wen - ); - - for (int router_p = 0; router_p < 4; router_p++) begin - // narrow req router - - router_level_profile_req_q[g][t][p].in_vld_cyc_num[router_p] += - $countones( - |dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_narrow_req_router_j[p] - .i_floo_narrow_req_router.valid_i[router_p + 1] - ); - - router_level_profile_req_q[g][t][p].in_hsk_cyc_num[router_p] += - $countones( - |( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_narrow_req_router_j[p] - .i_floo_narrow_req_router.valid_i[router_p + 1] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_narrow_req_router_j[p] - .i_floo_narrow_req_router.ready_o[router_p + 1] - ) - ); - - router_level_profile_req_q[g][t][p].out_vld_cyc_num[router_p] += - $countones( - |dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - 
.gen_router_router_i[t] - .gen_router_narrow_req_router_j[p] - .i_floo_narrow_req_router.valid_o[router_p + 1] - ); - - router_level_profile_req_q[g][t][p].out_hsk_cyc_num[router_p] += - $countones( - |( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_narrow_req_router_j[p] - .i_floo_narrow_req_router.valid_o[router_p + 1] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_narrow_req_router_j[p] - .i_floo_narrow_req_router.ready_i[router_p + 1] - ) - ); + if (so != rsq_st[g][t][p][d][1]) begin + $fwrite(f_rreq[g], "%0d S %0d 1 %0d %0d %0d\n", rid, d, rsq_start[g][t][p][d][1], cycle_q, rsq_st[g][t][p][d][1]); + rsq_st[g][t][p][d][1] <= so; rsq_start[g][t][p][d][1] <= cycle_q; end + // P: portidx io cycle wen tgt_addr dst_x dst_y src_x src_y src_tile core meta_id + if (si >= 2) // input handshake -> one request flit accepted + $fwrite(f_rreq[g], "%0d P %0d 0 %0d %0d %0h %0d %0d %0d %0d %0d %0d %0d\n", rid, d, cycle_q, + `NOC_GRP(g).floo_tcdm_wide_req_in_trans[t][p][d].payload.wen, + `NOC_GRP(g).floo_tcdm_wide_req_in_trans[t][p][d].hdr.tgt_addr, + `NOC_GRP(g).floo_tcdm_wide_req_in_trans[t][p][d].hdr.dst_id.x, + `NOC_GRP(g).floo_tcdm_wide_req_in_trans[t][p][d].hdr.dst_id.y, + `NOC_GRP(g).floo_tcdm_wide_req_in_trans[t][p][d].hdr.src_id.x, + `NOC_GRP(g).floo_tcdm_wide_req_in_trans[t][p][d].hdr.src_id.y, + `NOC_GRP(g).floo_tcdm_wide_req_in_trans[t][p][d].hdr.src_tile_id, + `NOC_GRP(g).floo_tcdm_wide_req_in_trans[t][p][d].hdr.core_id, + `NOC_GRP(g).floo_tcdm_wide_req_in_trans[t][p][d].hdr.meta_id); + if (so >= 2) // output handshake + $fwrite(f_rreq[g], "%0d P %0d 1 %0d %0d %0h %0d %0d %0d %0d %0d %0d %0d\n", rid, d, cycle_q, + `NOC_GRP(g).floo_tcdm_wide_req_out_trans[t][p][d].payload.wen, + `NOC_GRP(g).floo_tcdm_wide_req_out_trans[t][p][d].hdr.tgt_addr, + 
`NOC_GRP(g).floo_tcdm_wide_req_out_trans[t][p][d].hdr.dst_id.x, + `NOC_GRP(g).floo_tcdm_wide_req_out_trans[t][p][d].hdr.dst_id.y, + `NOC_GRP(g).floo_tcdm_wide_req_out_trans[t][p][d].hdr.src_id.x, + `NOC_GRP(g).floo_tcdm_wide_req_out_trans[t][p][d].hdr.src_id.y, + `NOC_GRP(g).floo_tcdm_wide_req_out_trans[t][p][d].hdr.src_tile_id, + `NOC_GRP(g).floo_tcdm_wide_req_out_trans[t][p][d].hdr.core_id, + `NOC_GRP(g).floo_tcdm_wide_req_out_trans[t][p][d].hdr.meta_id); end - end - end - else begin - always_ff @(posedge clk or negedge rst_n) begin - if (!rst_n) begin - router_local_req_port_profile_q[g][t][p].read_req_num = '0; - router_local_req_port_profile_q[g][t][p].write_req_num = '0; - - for (int router_p = 0; router_p < 4; router_p++) begin - router_level_profile_req_q[g][t][p].in_vld_cyc_num[router_p] = '0; - router_level_profile_req_q[g][t][p].in_hsk_cyc_num[router_p] = '0; - router_level_profile_req_q[g][t][p].out_vld_cyc_num[router_p] = '0; - router_level_profile_req_q[g][t][p].out_hsk_cyc_num[router_p] = '0; + begin : req_local + automatic logic [1:0] sli = `NOC_GRP(g).floo_tcdm_rdwr_req_to_router_vc_valid[t][p] + ? (`NOC_GRP(g).floo_tcdm_rdwr_req_to_router_vc_ready[t][p] + ? (`NOC_GRP(g).floo_tcdm_rdwr_req_to_router[t][p].payload.wen ? 2'd3 : 2'd2) : 2'd1) : 2'd0; + automatic logic [1:0] slo = `NOC_GRP(g).floo_tcdm_rdwr_req_from_router_vc_valid[t][p] + ? (`NOC_GRP(g).floo_tcdm_rdwr_req_from_router_vc_ready[t][p] + ? (`NOC_GRP(g).floo_tcdm_rdwr_req_from_router[t][p].payload.wen ? 
2'd3 : 2'd2) : 2'd1) : 2'd0; + if (sli != rsq_st[g][t][p][4][0]) begin + $fwrite(f_rreq[g], "%0d S 4 0 %0d %0d %0d\n", rid, rsq_start[g][t][p][4][0], cycle_q, rsq_st[g][t][p][4][0]); + rsq_st[g][t][p][4][0] <= sli; rsq_start[g][t][p][4][0] <= cycle_q; end - end else begin - router_local_req_port_profile_q[g][t][p].read_req_num += - $countones( - (|( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.valid_i[0] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.ready_i[0] - )) & ~dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.data_i[0][0].payload.wen - ); - - router_local_req_port_profile_q[g][t][p].write_req_num += - $countones( - (|( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.valid_i[0] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.ready_i[0] - )) & dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.data_i[0][0].payload.wen - ); - - for (int router_p = 0; router_p < 4; router_p++) begin - // wide req router - - router_level_profile_req_q[g][t][p].in_vld_cyc_num[router_p] += - $countones( - |dut.i_mempool_cluster - 
.gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.valid_i[router_p + 1] - ); - - router_level_profile_req_q[g][t][p].in_hsk_cyc_num[router_p] += - $countones( - |( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.valid_i[router_p + 1] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.ready_o[router_p + 1] - ) - ); - - router_level_profile_req_q[g][t][p].out_vld_cyc_num[router_p] += - $countones( - |dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.valid_o[router_p + 1] - ); - - router_level_profile_req_q[g][t][p].out_hsk_cyc_num[router_p] += - $countones( - |( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.valid_o[router_p + 1] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_req_router_j[p - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.ready_i[router_p + 1] - ) - ); + if (slo != rsq_st[g][t][p][4][1]) begin + $fwrite(f_rreq[g], "%0d S 4 1 %0d %0d %0d\n", rid, rsq_start[g][t][p][4][1], cycle_q, rsq_st[g][t][p][4][1]); + rsq_st[g][t][p][4][1] <= slo; rsq_start[g][t][p][4][1] <= cycle_q; end + if (sli >= 2) // local input (injection from tile) handshake + $fwrite(f_rreq[g], "%0d P 4 0 %0d %0d %0h %0d %0d %0d %0d %0d 
%0d %0d\n", rid, cycle_q, + `NOC_GRP(g).floo_tcdm_rdwr_req_to_router[t][p].payload.wen, + `NOC_GRP(g).floo_tcdm_rdwr_req_to_router[t][p].hdr.tgt_addr, + `NOC_GRP(g).floo_tcdm_rdwr_req_to_router[t][p].hdr.dst_id.x, + `NOC_GRP(g).floo_tcdm_rdwr_req_to_router[t][p].hdr.dst_id.y, + `NOC_GRP(g).floo_tcdm_rdwr_req_to_router[t][p].hdr.src_id.x, + `NOC_GRP(g).floo_tcdm_rdwr_req_to_router[t][p].hdr.src_id.y, + `NOC_GRP(g).floo_tcdm_rdwr_req_to_router[t][p].hdr.src_tile_id, + `NOC_GRP(g).floo_tcdm_rdwr_req_to_router[t][p].hdr.core_id, + `NOC_GRP(g).floo_tcdm_rdwr_req_to_router[t][p].hdr.meta_id); + if (slo >= 2) // local output (ejection to tile) handshake + $fwrite(f_rreq[g], "%0d P 4 1 %0d %0d %0h %0d %0d %0d %0d %0d %0d %0d\n", rid, cycle_q, + `NOC_GRP(g).floo_tcdm_rdwr_req_from_router[t][p].payload.wen, + `NOC_GRP(g).floo_tcdm_rdwr_req_from_router[t][p].hdr.tgt_addr, + `NOC_GRP(g).floo_tcdm_rdwr_req_from_router[t][p].hdr.dst_id.x, + `NOC_GRP(g).floo_tcdm_rdwr_req_from_router[t][p].hdr.dst_id.y, + `NOC_GRP(g).floo_tcdm_rdwr_req_from_router[t][p].hdr.src_id.x, + `NOC_GRP(g).floo_tcdm_rdwr_req_from_router[t][p].hdr.src_id.y, + `NOC_GRP(g).floo_tcdm_rdwr_req_from_router[t][p].hdr.src_tile_id, + `NOC_GRP(g).floo_tcdm_rdwr_req_from_router[t][p].hdr.core_id, + `NOC_GRP(g).floo_tcdm_rdwr_req_from_router[t][p].hdr.meta_id); end end - end - end - for (genvar p = 0; p < (NumRemoteRespPortsPerTile - 1); p++) begin : gen_resp_router_profile_per_remote_port - always_ff @(posedge clk or negedge rst_n) begin - if (!rst_n) begin - router_local_resp_port_profile_q[g][t][p].req_num = '0; - - for (int router_p = 0; router_p < 4; router_p++) begin - router_level_profile_resp_q[g][t][p].in_vld_cyc_num[router_p] = '0; - router_level_profile_resp_q[g][t][p].in_hsk_cyc_num[router_p] = '0; - router_level_profile_resp_q[g][t][p].out_vld_cyc_num[router_p] = '0; - router_level_profile_resp_q[g][t][p].out_hsk_cyc_num[router_p] = '0; - end - end else begin - 
router_local_resp_port_profile_q[g][t][p].req_num += - $countones( - |( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_resp_router_j[p + 1] - .i_floo_wide_resp_router.valid_i[0] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_resp_router_j[p + 1] - .i_floo_wide_resp_router.ready_i[0] - ) - ); - - for (int router_p = 0; router_p < 4; router_p++) begin - // resp router - - router_level_profile_resp_q[g][t][p].in_vld_cyc_num[router_p] += - $countones( - |dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_resp_router_j[p + 1] - .i_floo_wide_resp_router.valid_i[router_p + 1] - ); - - router_level_profile_resp_q[g][t][p].in_hsk_cyc_num[router_p] += - $countones( - |( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_resp_router_j[p + 1] - .i_floo_wide_resp_router.valid_i[router_p + 1] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_resp_router_j[p + 1] - .i_floo_wide_resp_router.ready_o[router_p + 1] - ) - ); - - router_level_profile_resp_q[g][t][p].out_vld_cyc_num[router_p] += - $countones( - |dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_resp_router_j[p + 1] - .i_floo_wide_resp_router.valid_o[router_p + 1] - ); - - router_level_profile_resp_q[g][t][p].out_hsk_cyc_num[router_p] += - $countones( - |( - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_resp_router_j[p + 1] - .i_floo_wide_resp_router.valid_o[router_p + 1] & - - dut.i_mempool_cluster - .gen_groups_x[g / NumY] - .gen_groups_y[g % 
NumY] - .i_group - .gen_router_router_i[t] - .gen_router_wide_resp_router_j[p + 1] - .i_floo_wide_resp_router.ready_i[router_p + 1] - ) - ); - end - end - end - end - end - end - endgenerate - - always_ff @(posedge clk) begin - if (rst_n) begin - // if (cycle_q[19:0] == 'h80000) begin - if ( - ((cycle_q[63:0] < 'h8000) && - ((cycle_q[10:0] == 11'h400) || (cycle_q[10:0] == 11'h000))) || - (cycle_q[15:0] == 'h8000) - ) begin - - $sformat(fn_2, "%s/tile_level_profile_q_%8x.log", log_path, cycle_q); - f_2 = $fopen(fn_2, "w"); - $display("[Tracer] Logging tile_level_profile_q to %s", fn_2); - - $sformat(fn_3, "%s/group_level_profile_q_%8x.log", log_path, cycle_q); - f_3 = $fopen(fn_3, "w"); - $display("[Tracer] Logging group_level_profile_q to %s", fn_3); - - $sformat(fn_4, "%s/router_level_profile_q_%8x.log", log_path, cycle_q); - f_4 = $fopen(fn_4, "w"); - $display("[Tracer] Logging router_level_profile_q to %s", fn_4); - - $sformat(fn_5, "%s/router_local_input_profile_q_%8x.log", log_path, cycle_q); - f_5 = $fopen(fn_5, "w"); - $display("[Tracer] Logging router_local_input_profile_q to %s", fn_5); - - $timeformat(-9, 0, "", 10); - $sformat(dump_time, "dump time %t, cycle %8d #;\n", $time, cycle_q); - $fwrite(f_2, dump_time); - $fwrite(f_3, dump_time); - $fwrite(f_4, dump_time); - $fwrite(f_5, dump_time); - - // tile level - for (int g = 0; g < NumGroups; g++) begin - for (int t_i = 0; t_i < NumTilesPerGroup; t_i++) begin - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - automatic string extras_str_2; - extras_str_2 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'req_vld_cyc_num': %03d, 'req_hsk_cyc_num': %03d, 'util': %.2f\n", - g, t_i, p, - tile_level_profile_q[g][t_i].req_vld_cyc_num[p], - tile_level_profile_q[g][t_i].req_hsk_cyc_num[p], - (tile_level_profile_q[g][t_i].req_vld_cyc_num[p] == 0) ? 
0.0 : - ((tile_level_profile_q[g][t_i].req_hsk_cyc_num[p] * 1.0) / - (tile_level_profile_q[g][t_i].req_vld_cyc_num[p] * 1.0)) - ); - $fwrite(f_2, extras_str_2); - end - end - end - $fclose(f_2); - - // group level - for (int g = 0; g < NumGroups; g++) begin - int unsigned req_vld_cyc_num_sum; - int unsigned req_hsk_cyc_num_sum; - automatic string extras_str_3; - - req_vld_cyc_num_sum = 0; - req_hsk_cyc_num_sum = 0; - - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - req_vld_cyc_num_sum += group_level_profile_q[g].req_vld_cyc_num[p]; - req_hsk_cyc_num_sum += group_level_profile_q[g].req_hsk_cyc_num[p]; - end - - extras_str_3 = $sformatf( - "{'GROUP': %03d, 'req_vld_cyc_num': %03d, 'req_hsk_cyc_num': %03d, " - "'req_vld_cyc_more_than_one_hit_same_bank_num': %03d, 'util': %.2f\n", - g, - req_vld_cyc_num_sum, - req_hsk_cyc_num_sum, - group_level_profile_q[g].req_vld_cyc_more_than_one_hit_same_bank_num, - ((req_vld_cyc_num_sum - group_level_profile_q[g].req_vld_cyc_more_than_one_hit_same_bank_num) == 0) ? 
- 0.0 : - ((req_hsk_cyc_num_sum * 1.0) / - ((req_vld_cyc_num_sum - group_level_profile_q[g].req_vld_cyc_more_than_one_hit_same_bank_num) * 1.0)) - ); - - $fwrite(f_3, extras_str_3); - end - $fclose(f_3); - - // router level - for (int g = 0; g < NumGroups; g++) begin - for (int t = 0; t < NumTilesPerGroup; t++) begin - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - if (p < NumNarrowRemoteReqPortsPerTile) begin - // narrow req - for (int dir = 0; dir < 4; dir++) begin - automatic string extras_str_4; - extras_str_4 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'REQ_RSP': 0, " - "'TYPE': 0, 'DIR': %03d, 'in_vld_cyc_num': %03d, 'in_hsk_cyc_num': %03d, " - "'out_vld_cyc_num': %03d, 'out_hsk_cyc_num': %03d, " - "'in_util': %.2f, 'out_util': %.2f\n", - g, t, p, dir, - router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir], - router_level_profile_req_q[g][t][p].in_hsk_cyc_num[dir], - router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir], - router_level_profile_req_q[g][t][p].out_hsk_cyc_num[dir], - (router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir] > 0) ? - (router_level_profile_req_q[g][t][p].in_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir] * 1.0) : - 0, - (router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir] > 0) ? - (router_level_profile_req_q[g][t][p].out_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir] * 1.0) : - 0 - ); - $fwrite(f_4, extras_str_4); + // ---- resp routers (router slot p+1; no read/write split). P lines log every + // accepted RESPONSE flit; a resp flit has no tgt_addr. hdr carries dst_id + // (requester it returns to), src_id (responder), tile_id/core_id (requester). 
+ // P ---- + for (int p = 0; p < NumRemoteRespPortsPerTile-1; p++) begin + automatic int rid = t*(NumRemoteRespPortsPerTile-1) + p; // merged-file router tag + for (int d = 0; d < 4; d++) begin + automatic logic [1:0] si = `NOC_GRP(g).floo_tcdm_resp_valid_in_trans[t][p+1][d] + ? (`NOC_GRP(g).floo_tcdm_resp_ready_out_trans[t][p+1][d] ? 2'd2 : 2'd1) : 2'd0; + automatic logic [1:0] so = `NOC_GRP(g).floo_tcdm_resp_valid_out_trans[t][p+1][d] + ? (`NOC_GRP(g).floo_tcdm_resp_ready_in_trans[t][p+1][d] ? 2'd2 : 2'd1) : 2'd0; + if (si != rsp_st[g][t][p][d][0]) begin + $fwrite(f_rresp[g], "%0d S %0d 0 %0d %0d %0d\n", rid, d, rsp_start[g][t][p][d][0], cycle_q, rsp_st[g][t][p][d][0]); + rsp_st[g][t][p][d][0] <= si; rsp_start[g][t][p][d][0] <= cycle_q; end - end else begin - // wide req - for (int dir = 0; dir < 4; dir++) begin - automatic string extras_str_4; - extras_str_4 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'REQ_RSP': 0, " - "'TYPE': 1, 'DIR': %03d, 'in_vld_cyc_num': %03d, 'in_hsk_cyc_num': %03d, " - "'out_vld_cyc_num': %03d, 'out_hsk_cyc_num': %03d, " - "'in_util': %.2f, 'out_util': %.2f\n", - g, t, p, dir, - router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir], - router_level_profile_req_q[g][t][p].in_hsk_cyc_num[dir], - router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir], - router_level_profile_req_q[g][t][p].out_hsk_cyc_num[dir], - (router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir] > 0) ? - (router_level_profile_req_q[g][t][p].in_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir] * 1.0) : - 0, - (router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir] > 0) ? 
- (router_level_profile_req_q[g][t][p].out_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir] * 1.0) : - 0 - ); - $fwrite(f_4, extras_str_4); + if (so != rsp_st[g][t][p][d][1]) begin + $fwrite(f_rresp[g], "%0d S %0d 1 %0d %0d %0d\n", rid, d, rsp_start[g][t][p][d][1], cycle_q, rsp_st[g][t][p][d][1]); + rsp_st[g][t][p][d][1] <= so; rsp_start[g][t][p][d][1] <= cycle_q; end + if (si >= 2) // mesh input handshake -> one response flit accepted + $fwrite(f_rresp[g], "%0d P %0d 0 %0d %0d %0d %0d %0d %0d %0d %0d\n", rid, d, cycle_q, + `NOC_GRP(g).floo_tcdm_resp_in_trans[t][p+1][d].hdr.dst_id.x, + `NOC_GRP(g).floo_tcdm_resp_in_trans[t][p+1][d].hdr.dst_id.y, + `NOC_GRP(g).floo_tcdm_resp_in_trans[t][p+1][d].hdr.src_id.x, + `NOC_GRP(g).floo_tcdm_resp_in_trans[t][p+1][d].hdr.src_id.y, + `NOC_GRP(g).floo_tcdm_resp_in_trans[t][p+1][d].hdr.tile_id, + `NOC_GRP(g).floo_tcdm_resp_in_trans[t][p+1][d].hdr.core_id, + `NOC_GRP(g).floo_tcdm_resp_in_trans[t][p+1][d].hdr.meta_id); + if (so >= 2) // mesh output handshake + $fwrite(f_rresp[g], "%0d P %0d 1 %0d %0d %0d %0d %0d %0d %0d %0d\n", rid, d, cycle_q, + `NOC_GRP(g).floo_tcdm_resp_out_trans[t][p+1][d].hdr.dst_id.x, + `NOC_GRP(g).floo_tcdm_resp_out_trans[t][p+1][d].hdr.dst_id.y, + `NOC_GRP(g).floo_tcdm_resp_out_trans[t][p+1][d].hdr.src_id.x, + `NOC_GRP(g).floo_tcdm_resp_out_trans[t][p+1][d].hdr.src_id.y, + `NOC_GRP(g).floo_tcdm_resp_out_trans[t][p+1][d].hdr.tile_id, + `NOC_GRP(g).floo_tcdm_resp_out_trans[t][p+1][d].hdr.core_id, + `NOC_GRP(g).floo_tcdm_resp_out_trans[t][p+1][d].hdr.meta_id); end - end - - // resp - for (int p = 0; p < (NumRemoteRespPortsPerTile - 1); p++) begin - for (int dir = 0; dir < 4; dir++) begin - automatic string extras_str_4; - - extras_str_4 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'REQ_RSP': 1, " - "'TYPE': 1, 'DIR': %03d, 'in_vld_cyc_num': %03d, 'in_hsk_cyc_num': %03d, " - "'out_vld_cyc_num': %03d, 'out_hsk_cyc_num': %03d, " - "'in_util': %.2f, 'out_util': 
%.2f\n", - g, t, p, dir, - router_level_profile_resp_q[g][t][p].in_vld_cyc_num[dir], - router_level_profile_resp_q[g][t][p].in_hsk_cyc_num[dir], - router_level_profile_resp_q[g][t][p].out_vld_cyc_num[dir], - router_level_profile_resp_q[g][t][p].out_hsk_cyc_num[dir], - (router_level_profile_resp_q[g][t][p].in_vld_cyc_num[dir] > 0) ? - (router_level_profile_resp_q[g][t][p].in_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_resp_q[g][t][p].in_vld_cyc_num[dir] * 1.0) : - 0, - (router_level_profile_resp_q[g][t][p].out_vld_cyc_num[dir] > 0) ? - (router_level_profile_resp_q[g][t][p].out_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_resp_q[g][t][p].out_vld_cyc_num[dir] * 1.0) : - 0 - ); - - $fwrite(f_4, extras_str_4); - end - end - end - end - $fclose(f_4); - - // router local port - for (int g = 0; g < NumGroups; g++) begin - for (int t = 0; t < NumTilesPerGroup; t++) begin - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - if (p < NumNarrowRemoteReqPortsPerTile) begin - // narrow req - automatic string extras_str_5; - extras_str_5 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'REQ_RSP': 0, 'TYPE': 0, " - "'req_read_in_num': %03d, 'req_write_in_num': %03d\n", - g, t, p, - router_local_req_port_profile_q[g][t][p].read_req_num, - router_local_req_port_profile_q[g][t][p].write_req_num - ); - $fwrite(f_5, extras_str_5); - end else begin - // wide req - automatic string extras_str_5; - extras_str_5 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'REQ_RSP': 0, 'TYPE': 1, " - "'req_read_in_num': %03d, 'req_write_in_num': %03d\n", - g, t, p, - router_local_req_port_profile_q[g][t][p].read_req_num, - router_local_req_port_profile_q[g][t][p].write_req_num - ); - $fwrite(f_5, extras_str_5); - end - end - - // resp - for (int p = 0; p < (NumRemoteRespPortsPerTile - 1); p++) begin - automatic string extras_str_5; - extras_str_5 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'REQ_RSP': 1, " - "'TYPE': 1, 'resp_in_num': 
%03d\n", - g, t, p, - router_local_resp_port_profile_q[g][t][p].req_num - ); - $fwrite(f_5, extras_str_5); - end - end - end - - $fclose(f_5); - end - end - end - - final begin - $sformat(fn_final_2, "%s/tile_level_profile_q.log", log_path); - f_final_2 = $fopen(fn_final_2, "w"); - $display("[Tracer] Final Logging Banks to %s", fn_final_2); - - $sformat(fn_final_3, "%s/group_level_profile_q.log", log_path); - f_final_3 = $fopen(fn_final_3, "w"); - $display("[Tracer] Final Logging Banks to %s", fn_final_3); - - $sformat(fn_final_4, "%s/router_level_profile_q.log", log_path); - f_final_4 = $fopen(fn_final_4, "w"); - $display("[Tracer] Final Logging Banks to %s", fn_final_4); - - $timeformat(-9, 0, "", 10); - $sformat(dump_time, "dump time %t, cycle %8d #;\n", $time, cycle_q); - $fwrite(f_final_2, dump_time); - $fwrite(f_final_3, dump_time); - $fwrite(f_final_4, dump_time); - - // tile level - for (int g = 0; g < NumGroups; g++) begin - for (int t_i = 0; t_i < NumTilesPerGroup; t_i++) begin - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - automatic string extras_str_final_2; - - extras_str_final_2 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, " - "'req_vld_cyc_num': %03d, 'req_hsk_cyc_num': %03d, 'util': %.2f\n", - g, t_i, p, - tile_level_profile_q[g][t_i].req_vld_cyc_num[p], - tile_level_profile_q[g][t_i].req_hsk_cyc_num[p], - (tile_level_profile_q[g][t_i].req_vld_cyc_num[p] == 0) ? 
- 0.0 : - (tile_level_profile_q[g][t_i].req_hsk_cyc_num[p] * 1.0) / - (tile_level_profile_q[g][t_i].req_vld_cyc_num[p] * 1.0) - ); - - $fwrite(f_final_2, extras_str_final_2); - end - end - end - $fclose(f_final_2); - - // group level - for (int g = 0; g < NumGroups; g++) begin - int unsigned req_vld_cyc_num_sum; - int unsigned req_hsk_cyc_num_sum; - automatic string extras_str_final_3; - - req_vld_cyc_num_sum = 0; - req_hsk_cyc_num_sum = 0; - - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - req_vld_cyc_num_sum += group_level_profile_q[g].req_vld_cyc_num[p]; - req_hsk_cyc_num_sum += group_level_profile_q[g].req_hsk_cyc_num[p]; - end - - extras_str_final_3 = $sformatf( - "{'GROUP': %03d, 'req_vld_cyc_num': %03d, 'req_hsk_cyc_num': %03d, " - "'req_vld_cyc_more_than_one_hit_same_bank_num': %03d, 'util': %.2f\n", - g, - req_vld_cyc_num_sum, - req_hsk_cyc_num_sum, - group_level_profile_q[g].req_vld_cyc_more_than_one_hit_same_bank_num, - ((req_vld_cyc_num_sum - group_level_profile_q[g].req_vld_cyc_more_than_one_hit_same_bank_num) == 0) ? 
- 0.0 : - (req_hsk_cyc_num_sum * 1.0) / - ((req_vld_cyc_num_sum - group_level_profile_q[g].req_vld_cyc_more_than_one_hit_same_bank_num) * 1.0) - ); - - $fwrite(f_final_3, extras_str_final_3); - end - $fclose(f_final_3); - - // router level - for (int g = 0; g < NumGroups; g++) begin - for (int t = 0; t < NumTilesPerGroup; t++) begin - for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin - if (p < NumNarrowRemoteReqPortsPerTile) begin - // narrow req - for (int dir = 0; dir < 4; dir++) begin - automatic string extras_str_final_4; - extras_str_final_4 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'REQ_RSP': 0, 'TYPE': 0, " - "'DIR': %03d, 'in_vld_cyc_num': %03d, 'in_hsk_cyc_num': %03d, " - "'out_vld_cyc_num': %03d, 'out_hsk_cyc_num': %03d, " - "'in_util': %.2f, 'out_util': %.2f\n", - g, t, p, dir, - router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir], - router_level_profile_req_q[g][t][p].in_hsk_cyc_num[dir], - router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir], - router_level_profile_req_q[g][t][p].out_hsk_cyc_num[dir], - (router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir] > 0) ? - (router_level_profile_req_q[g][t][p].in_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir] * 1.0) : 0, - (router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir] > 0) ? 
- (router_level_profile_req_q[g][t][p].out_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir] * 1.0) : 0 - ); - $fwrite(f_final_4, extras_str_final_4); - end - end else begin - // wide req - for (int dir = 0; dir < 4; dir++) begin - automatic string extras_str_final_4; - extras_str_final_4 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'REQ_RSP': 0, 'TYPE': 1, " - "'DIR': %03d, 'in_vld_cyc_num': %03d, 'in_hsk_cyc_num': %03d, " - "'out_vld_cyc_num': %03d, 'out_hsk_cyc_num': %03d, " - "'in_util': %.2f, 'out_util': %.2f\n", - g, t, p, dir, - router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir], - router_level_profile_req_q[g][t][p].in_hsk_cyc_num[dir], - router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir], - router_level_profile_req_q[g][t][p].out_hsk_cyc_num[dir], - (router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir] > 0) ? - (router_level_profile_req_q[g][t][p].in_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_req_q[g][t][p].in_vld_cyc_num[dir] * 1.0) : 0, - (router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir] > 0) ? 
- (router_level_profile_req_q[g][t][p].out_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_req_q[g][t][p].out_vld_cyc_num[dir] * 1.0) : 0 - ); - $fwrite(f_final_4, extras_str_final_4); - end - end - end - - // resp - for (int p = 0; p < NumRemoteRespPortsPerTile; p++) begin - for (int dir = 0; dir < 4; dir++) begin - automatic string extras_str_final_4; - extras_str_final_4 = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'REQ_RSP': 1, 'TYPE': 1, " - "'DIR': %03d, 'in_vld_cyc_num': %03d, 'in_hsk_cyc_num': %03d, " - "'out_vld_cyc_num': %03d, 'out_hsk_cyc_num': %03d, " - "'in_util': %.2f, 'out_util': %.2f\n", - g, t, p, dir, - router_level_profile_resp_q[g][t][p].in_vld_cyc_num[dir], - router_level_profile_resp_q[g][t][p].in_hsk_cyc_num[dir], - router_level_profile_resp_q[g][t][p].out_vld_cyc_num[dir], - router_level_profile_resp_q[g][t][p].out_hsk_cyc_num[dir], - (router_level_profile_resp_q[g][t][p].in_vld_cyc_num[dir] > 0) ? - (router_level_profile_resp_q[g][t][p].in_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_resp_q[g][t][p].in_vld_cyc_num[dir] * 1.0) : 0, - (router_level_profile_resp_q[g][t][p].out_vld_cyc_num[dir] > 0) ? 
- (router_level_profile_resp_q[g][t][p].out_hsk_cyc_num[dir] * 1.0) / - (router_level_profile_resp_q[g][t][p].out_vld_cyc_num[dir] * 1.0) : 0 - ); - $fwrite(f_final_4, extras_str_final_4); - end - end - end - end - $fclose(f_final_4); - end - - router_input_profile_t req_router_input_profile_q[NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteReqPortsPerTile-1)-1:0]; - floo_rdwr_req_t floo_req_input_queue[NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteReqPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0][$]; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteReqPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_req_input_fifo_ready_o; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteReqPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_req_input_fifo_valid_i; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteReqPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_req_input_fifo_ready_i; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteReqPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_req_input_fifo_valid_o; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteReqPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_req_output_fifo_ready_o; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteReqPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_req_output_fifo_valid_i; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteReqPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_req_output_fifo_ready_i; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteReqPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_req_output_fifo_valid_o; - - generate - for (genvar g = 0; g < NumGroups; g++) begin : gen_req_router_input_queue_per_group - for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_req_router_input_queue_per_tile - for (genvar r = 0; r < (NumRemoteReqPortsPerTile - 1); r++) begin : gen_req_router_input_queue_per_remote_port - for (genvar router_p = 0; router_p < 5; router_p++) begin : 
gen_req_router_input_queue_per_dir - if (r < NumNarrowRemoteReqPortsPerTile) begin - assign floo_req_input_fifo_ready_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_narrow_req_router_j[r] - .i_floo_narrow_req_router.ready_o[router_p]; - - assign floo_req_input_fifo_valid_i[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_narrow_req_router_j[r] - .i_floo_narrow_req_router.valid_i[router_p]; - - assign floo_req_input_fifo_ready_i[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_narrow_req_router_j[r] - .i_floo_narrow_req_router.in_ready[router_p]; - - assign floo_req_input_fifo_valid_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_narrow_req_router_j[r] - .i_floo_narrow_req_router.in_valid[router_p]; - - assign floo_req_output_fifo_ready_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_narrow_req_router_j[r] - .i_floo_narrow_req_router.out_ready[router_p]; - - assign floo_req_output_fifo_valid_i[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_narrow_req_router_j[r] - .i_floo_narrow_req_router.out_valid[router_p]; - - assign floo_req_output_fifo_ready_i[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_narrow_req_router_j[r] - .i_floo_narrow_req_router.out_buffered_ready[router_p]; - - assign floo_req_output_fifo_valid_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - 
.i_group.gen_router_router_i[t].gen_router_narrow_req_router_j[r] - .i_floo_narrow_req_router.out_buffered_valid[router_p]; - - for (genvar v = 0; v < NumVirtualChannel; v++) begin : gen_req_router_input_queue_per_vc - always_ff @(posedge clk) begin - if (rst_n) begin - if (floo_req_input_fifo_valid_i[g][t][r][router_p][v] & - floo_req_input_fifo_ready_o[g][t][r][router_p][v]) begin - floo_req_input_queue[g][t][r][router_p][v].push_back( - floo_rdwr_req_t'{ - hdr: dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_narrow_req_router_j[r] - .i_floo_narrow_req_router.data_i[router_p].hdr, - payload: '0 - }); - end - - if (floo_req_input_fifo_valid_o[g][t][r][router_p][v] & - floo_req_input_fifo_ready_i[g][t][r][router_p][v]) begin - floo_req_input_queue[g][t][r][router_p][v].delete(0); - end - end + begin : rsp_local + automatic logic [1:0] sli = `NOC_GRP(g).floo_tcdm_resp_to_router_vc_valid[t][p+1] + ? (`NOC_GRP(g).floo_tcdm_resp_to_router_vc_ready[t][p+1] ? 2'd2 : 2'd1) : 2'd0; + automatic logic [1:0] slo = `NOC_GRP(g).floo_tcdm_resp_from_router_vc_valid[t][p+1] + ? (`NOC_GRP(g).floo_tcdm_resp_from_router_vc_ready[t][p+1] ? 
2'd2 : 2'd1) : 2'd0; + if (sli != rsp_st[g][t][p][4][0]) begin + $fwrite(f_rresp[g], "%0d S 4 0 %0d %0d %0d\n", rid, rsp_start[g][t][p][4][0], cycle_q, rsp_st[g][t][p][4][0]); + rsp_st[g][t][p][4][0] <= sli; rsp_start[g][t][p][4][0] <= cycle_q; end - end - end else begin - assign floo_req_input_fifo_ready_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_req_router_j[r - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.ready_o[router_p]; - - assign floo_req_input_fifo_valid_i[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_req_router_j[r - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.valid_i[router_p]; - - assign floo_req_input_fifo_ready_i[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_req_router_j[r - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.in_ready[router_p]; - - assign floo_req_input_fifo_valid_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_req_router_j[r - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.in_valid[router_p]; - - assign floo_req_output_fifo_ready_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_req_router_j[r - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.out_ready[router_p]; - - assign floo_req_output_fifo_valid_i[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_req_router_j[r - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.out_valid[router_p]; - - assign floo_req_output_fifo_ready_i[g][t][r][router_p] = - 
dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_req_router_j[r - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.out_buffered_ready[router_p]; - - assign floo_req_output_fifo_valid_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_req_router_j[r - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.out_buffered_valid[router_p]; - - for (genvar v = 0; v < NumVirtualChannel; v++) begin : gen_req_router_input_queue_per_vc - always_ff @(posedge clk) begin - if (rst_n) begin - if (floo_req_input_fifo_valid_i[g][t][r][router_p][v] & - floo_req_input_fifo_ready_o[g][t][r][router_p][v]) begin - floo_req_input_queue[g][t][r][router_p][v].push_back( - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_req_router_j[r - NumNarrowRemoteReqPortsPerTile] - .i_floo_wide_req_router.data_i[router_p]); - end - - if (floo_req_input_fifo_valid_o[g][t][r][router_p][v] & - floo_req_input_fifo_ready_i[g][t][r][router_p][v]) begin - floo_req_input_queue[g][t][r][router_p][v].delete(0); - end - end + if (slo != rsp_st[g][t][p][4][1]) begin + $fwrite(f_rresp[g], "%0d S 4 1 %0d %0d %0d\n", rid, rsp_start[g][t][p][4][1], cycle_q, rsp_st[g][t][p][4][1]); + rsp_st[g][t][p][4][1] <= slo; rsp_start[g][t][p][4][1] <= cycle_q; end + if (sli >= 2) // local input (injection from tile = slave_resp) handshake + $fwrite(f_rresp[g], "%0d P 4 0 %0d %0d %0d %0d %0d %0d %0d %0d\n", rid, cycle_q, + `NOC_GRP(g).floo_tcdm_resp_to_router[t][p+1].hdr.dst_id.x, + `NOC_GRP(g).floo_tcdm_resp_to_router[t][p+1].hdr.dst_id.y, + `NOC_GRP(g).floo_tcdm_resp_to_router[t][p+1].hdr.src_id.x, + `NOC_GRP(g).floo_tcdm_resp_to_router[t][p+1].hdr.src_id.y, + `NOC_GRP(g).floo_tcdm_resp_to_router[t][p+1].hdr.tile_id, + `NOC_GRP(g).floo_tcdm_resp_to_router[t][p+1].hdr.core_id, + 
`NOC_GRP(g).floo_tcdm_resp_to_router[t][p+1].hdr.meta_id); + if (slo >= 2) // local output (ejection to tile = master_resp) handshake + $fwrite(f_rresp[g], "%0d P 4 1 %0d %0d %0d %0d %0d %0d %0d %0d\n", rid, cycle_q, + `NOC_GRP(g).floo_tcdm_resp_from_router[t][p+1].hdr.dst_id.x, + `NOC_GRP(g).floo_tcdm_resp_from_router[t][p+1].hdr.dst_id.y, + `NOC_GRP(g).floo_tcdm_resp_from_router[t][p+1].hdr.src_id.x, + `NOC_GRP(g).floo_tcdm_resp_from_router[t][p+1].hdr.src_id.y, + `NOC_GRP(g).floo_tcdm_resp_from_router[t][p+1].hdr.tile_id, + `NOC_GRP(g).floo_tcdm_resp_from_router[t][p+1].hdr.core_id, + `NOC_GRP(g).floo_tcdm_resp_from_router[t][p+1].hdr.meta_id); end end end @@ -1135,389 +243,87 @@ end endgenerate - function route_direction_e xy_routing (group_xy_id_t group_id, floo_rdwr_req_t floo_req); - automatic group_xy_id_t dest_id = group_xy_id_t'(floo_req.hdr.dst_id); - if (dest_id == group_id) begin - xy_routing = Eject; - end else if (dest_id.x == group_id.x) begin - if (dest_id.y < group_id.y) begin - xy_routing = South; - end else begin - xy_routing = North; - end - end else begin - if (dest_id.x < group_id.x) begin - xy_routing = West; - end else begin - xy_routing = East; - end - end - endfunction - - function group_xy_id_t get_next_hop (group_xy_id_t group_id, route_direction_e out_dir); - if (out_dir == Eject) begin - get_next_hop = group_id; - end else if (out_dir == South) begin - get_next_hop = '{x:group_id.x, y:group_id.y-1}; - end else if (out_dir == North) begin - get_next_hop = '{x:group_id.x, y:group_id.y+1}; - end else if (out_dir == East) begin - get_next_hop = '{x:group_id.x+1, y:group_id.y}; - end else if (out_dir == West) begin - get_next_hop = '{x:group_id.x-1, y:group_id.y}; - end - endfunction - - function int onehot_to_bin (logic [NumVirtualChannel-1:0] onehot); - for (int i = 0; i < NumVirtualChannel; i++) begin - if (onehot[i]) begin - onehot_to_bin = i; - break; - end - end - endfunction - + // 
------------------------------------------------------------ + // Tile port capture -> per-tile file. req=master_req(out)/slave_req(in) split + // read/write by .wen + P packet lines; resp=master_resp(in)/slave_resp(out) + // single handshake state (no address). genvar t: gen_tiles[t].i_tile constant. + // ------------------------------------------------------------ generate - for (genvar g = 0; g < NumGroups; g++) begin : gen_req_router_input_profile_per_group - for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_req_router_input_profile_per_tile - for (genvar r = 0; r < (NumRemoteReqPortsPerTile - 1); r++) begin : gen_req_router_input_profile_per_remote_port - for (genvar router_p = 0; router_p < 5; router_p++) begin : gen_req_router_input_profile_per_dir - always_ff @(posedge clk or negedge rst_n) begin - if (!rst_n) begin - req_router_input_profile_q[g][t][r].in_vld_cyc_num[router_p] = '0; - req_router_input_profile_q[g][t][r].in_hsk_cyc_num[router_p] = '0; - req_router_input_profile_q[g][t][r].hol_stall_cyc_num[router_p] = '0; - req_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p] = {'0, '0, '0, '0, '0}; - req_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p] = '0; - req_router_input_profile_q[g][t][r].max_stall_cyc_num[router_p] = '0; - end else begin - if ((cycle_q % 200) == 0) begin - req_router_input_profile_q[g][t][r].in_vld_cyc_num[router_p] = '0; - req_router_input_profile_q[g][t][r].in_hsk_cyc_num[router_p] = '0; - req_router_input_profile_q[g][t][r].hol_stall_cyc_num[router_p] = '0; - req_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p] = {'0, '0, '0, '0, '0}; - req_router_input_profile_q[g][t][r].max_stall_cyc_num[router_p] = '0; - end - - if (|floo_req_input_fifo_valid_i[g][t][r][router_p]) begin - req_router_input_profile_q[g][t][r].in_vld_cyc_num[router_p] += 1; - - if (|(floo_req_input_fifo_ready_o[g][t][r][router_p] & - floo_req_input_fifo_valid_i[g][t][r][router_p])) begin - 
req_router_input_profile_q[g][t][r].in_hsk_cyc_num[router_p] += 1; - - if (req_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p] > 0) begin - if (req_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p] > - req_router_input_profile_q[g][t][r].max_stall_cyc_num[router_p]) begin - req_router_input_profile_q[g][t][r].max_stall_cyc_num[router_p] = - req_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p]; - end - req_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p] = 0; - end - end else begin - automatic int vc_idx = onehot_to_bin(floo_req_input_fifo_valid_i[g][t][r][router_p]); - assert(|floo_req_input_fifo_valid_o[g][t][r][router_p]); - - req_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p] += 1; - - `ifdef XY_ROUTING - if (~floo_req_input_fifo_ready_i[g][t][r][router_p][vc_idx]) begin - automatic route_direction_e in_dir = route_direction_e'(router_p); - automatic route_direction_e out_dir = xy_routing(g, floo_req_input_queue[g][t][r][router_p][vc_idx][0]); - automatic group_xy_id_t cur_hop = g; - automatic logic cont = '1; - - req_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][out_dir] += 1; - - assert(floo_req_output_fifo_valid_i[g][t][r][out_dir][vc_idx]); - - while ('1) begin - for (int i = 1; i < floo_req_input_queue[cur_hop][t][r][in_dir][vc_idx].size(); i++) begin - out_dir = xy_routing(cur_hop, floo_req_input_queue[cur_hop][t][r][in_dir][vc_idx][i]); - - if (~floo_req_output_fifo_valid_i[cur_hop][t][r][out_dir][vc_idx] & - floo_req_output_fifo_ready_o[cur_hop][t][r][out_dir][vc_idx]) begin - req_router_input_profile_q[g][t][r].hol_stall_cyc_num[router_p] += 1; - cont = '0; - break; - end - end - - if (~cont) break; - - out_dir = xy_routing(cur_hop, floo_req_input_queue[cur_hop][t][r][in_dir][vc_idx][0]); - assert(floo_req_output_fifo_valid_i[cur_hop][t][r][out_dir][vc_idx]); - - if (floo_req_output_fifo_ready_o[cur_hop][t][r][out_dir][vc_idx] | - 
floo_req_output_fifo_ready_i[cur_hop][t][r][out_dir][vc_idx]) begin - break; - end - - if (out_dir == Eject) break; - - cur_hop = get_next_hop(cur_hop, out_dir); - - if (out_dir == North) begin - in_dir = South; - end else if (out_dir == South) begin - in_dir = North; - end else if (out_dir == East) begin - in_dir = West; - end else if (out_dir == West) begin - in_dir = East; - end - - if (floo_req_input_fifo_ready_i[cur_hop][t][r][in_dir][vc_idx]) begin - break; - end - end - end - `endif - end - end + for (genvar g = 0; g < NumGroups; g++) begin : gen_tstate_g + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_tstate_t + always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) begin + ts_mreq_st[g][t] <= '{default: '0}; ts_mreq_s[g][t] <= '{default: '0}; + ts_sreq_st[g][t] <= '{default: '0}; ts_sreq_s[g][t] <= '{default: '0}; + ts_mrsp_st[g][t] <= '{default: '0}; ts_mrsp_s[g][t] <= '{default: '0}; + ts_srsp_st[g][t] <= '{default: '0}; ts_srsp_s[g][t] <= '{default: '0}; + end else begin + for (int p = 0; p < NumRemoteReqPortsPerTile; p++) begin + automatic logic [1:0] smo = `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_req_valid_o[p] + ? (`NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_req_ready_i[p] + ? (`NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_req_o[p].wen ? 2'd3 : 2'd2) : 2'd1) : 2'd0; + automatic logic [1:0] ssi = `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_req_valid_i[p] + ? (`NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_req_ready_o[p] + ? (`NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_req_i[p].wen ? 
2'd3 : 2'd2) : 2'd1) : 2'd0; + if (smo != ts_mreq_st[g][t][p]) begin + $fwrite(f_tile[g][t], "S 0 1 %0d %0d %0d %0d\n", p, ts_mreq_s[g][t][p], cycle_q, ts_mreq_st[g][t][p]); + ts_mreq_st[g][t][p] <= smo; ts_mreq_s[g][t][p] <= cycle_q; end - end - end - end - end - end - endgenerate - - always_ff @(negedge clk) begin : log_req_router_input_profile - if (rst_n) begin - for (int g = 0; g < NumGroups; g++) begin - for (int t = 0; t < NumTilesPerGroup; t++) begin - for (int r = 0; r < (NumRemoteReqPortsPerTile - 1); r++) begin - for (int router_p = 0; router_p < 5; router_p++) begin - if ((cycle_q % 200) == 199) begin - automatic string log_str; - - log_str = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'DIR': %03d, " - "'start_cycle': %03d, 'end_cycle': %03d, " - "'in_vld_cyc_num': %03d, 'in_hsk_cyc_num': %03d, " - "'hol_stall_cyc_num': %03d, 'max_stall_cyc_num': %03d, " - "'out_dir0_cong_cyc_num': %03d, 'out_dir1_cong_cyc_num': %03d, " - "'out_dir2_cong_cyc_num': %03d, 'out_dir3_cong_cyc_num': %03d, " - "'out_dir4_cong_cyc_num': %03d}\n", - g, t, r, router_p, cycle_q - 199, cycle_q, - req_router_input_profile_q[g][t][r].in_vld_cyc_num[router_p], - req_router_input_profile_q[g][t][r].in_hsk_cyc_num[router_p], - req_router_input_profile_q[g][t][r].hol_stall_cyc_num[router_p], - req_router_input_profile_q[g][t][r].max_stall_cyc_num[router_p], - req_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][0], - req_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][1], - req_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][2], - req_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][3], - req_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][4] - ); - - $fwrite(req_floo_input_log_fd, log_str); + if (ssi != ts_sreq_st[g][t][p]) begin + $fwrite(f_tile[g][t], "S 0 0 %0d %0d %0d %0d\n", p, ts_sreq_s[g][t][p], cycle_q, ts_sreq_st[g][t][p]); + ts_sreq_st[g][t][p] <= ssi; ts_sreq_s[g][t][p] <= cycle_q; end + // P: io 
port cycle wen tgt_addr src_group dst_group req_tile req_core meta_id. + // src/dst groups are LINEAR ids (exporter renders as mesh (x,y)); + // requester group == src group, so not emitted separately. + if (smo >= 2) // master_req out (this tile issues a request) + $fwrite(f_tile[g][t], "P 1 %0d %0d %0d %0h %0d %0d %0d %0d %0d\n", p, cycle_q, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_req_o[p].wen, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_req_o[p].tgt_addr, + g, // src group + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_req_o[p].tgt_group_id, // dst group + t, // requester tile + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_req_o[p].wdata.core_id, // requester core + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_req_o[p].wdata.meta_id);// meta_id + if (ssi >= 2) // slave_req in (a request arrives at this tile's SPM) + $fwrite(f_tile[g][t], "P 0 %0d %0d %0d %0h %0d %0d %0d %0d %0d\n", p, cycle_q, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_req_i[p].wen, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_req_i[p].tgt_addr, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_req_i[p].src_group_id, // src group + g, // dst group + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_req_i[p].ini_addr, // requester tile + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_req_i[p].wdata.core_id, // requester core + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_req_i[p].wdata.meta_id); // meta_id end - end - end - end - end - end - - router_input_profile_t resp_router_input_profile_q[NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteRespPortsPerTile-1)-1:0]; - floo_resp_t floo_resp_input_queue[NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteRespPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0][$]; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteRespPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] 
floo_resp_input_fifo_ready_o; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteRespPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_resp_input_fifo_valid_i; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteRespPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_resp_input_fifo_ready_i; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteRespPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_resp_input_fifo_valid_o; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteRespPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_resp_output_fifo_ready_o; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteRespPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_resp_output_fifo_valid_i; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteRespPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_resp_output_fifo_ready_i; - logic [NumGroups-1:0][NumTilesPerGroup-1:0][(NumRemoteRespPortsPerTile-1)-1:0][4:0][NumVirtualChannel-1:0] floo_resp_output_fifo_valid_o; - - generate - for (genvar g = 0; g < NumGroups; g++) begin : gen_resp_router_input_queue_per_group - for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_resp_router_input_queue_per_tile - for (genvar r = 0; r < (NumRemoteRespPortsPerTile - 1); r++) begin : gen_resp_router_input_queue_per_remote_port - for (genvar router_p = 0; router_p < 5; router_p++) begin : gen_resp_router_input_queue_per_dir - assign floo_resp_input_fifo_ready_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_resp_router_j[r + 1] - .i_floo_wide_resp_router.ready_o[router_p]; - - assign floo_resp_input_fifo_valid_i[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_resp_router_j[r + 1] - .i_floo_wide_resp_router.valid_i[router_p]; - - assign floo_resp_input_fifo_ready_i[g][t][r][router_p] = - 
dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_resp_router_j[r + 1] - .i_floo_wide_resp_router.in_ready[router_p]; - - assign floo_resp_input_fifo_valid_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_resp_router_j[r + 1] - .i_floo_wide_resp_router.in_valid[router_p]; - - assign floo_resp_output_fifo_ready_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_resp_router_j[r + 1] - .i_floo_wide_resp_router.out_ready[router_p]; - - assign floo_resp_output_fifo_valid_i[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_resp_router_j[r + 1] - .i_floo_wide_resp_router.out_valid[router_p]; - - assign floo_resp_output_fifo_ready_i[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_resp_router_j[r + 1] - .i_floo_wide_resp_router.out_buffered_ready[router_p]; - - assign floo_resp_output_fifo_valid_o[g][t][r][router_p] = - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_resp_router_j[r + 1] - .i_floo_wide_resp_router.out_buffered_valid[router_p]; - - for (genvar v = 0; v < NumVirtualChannel; v++) begin : gen_resp_router_input_queue_per_vc - always_ff @(posedge clk) begin - if (rst_n) begin - if (floo_resp_input_fifo_valid_i[g][t][r][router_p][v] & - floo_resp_input_fifo_ready_o[g][t][r][router_p][v]) begin - floo_resp_input_queue[g][t][r][router_p][v].push_back( - dut.i_mempool_cluster.gen_groups_x[g / NumY].gen_groups_y[g % NumY] - .i_group.gen_router_router_i[t].gen_router_wide_resp_router_j[r + 1] - .i_floo_wide_resp_router.data_i[router_p]); - end - - if 
(floo_resp_input_fifo_valid_o[g][t][r][router_p][v] & - floo_resp_input_fifo_ready_i[g][t][r][router_p][v]) begin - floo_resp_input_queue[g][t][r][router_p][v].delete(0); - end - end + for (int p = 0; p < NumRemoteRespPortsPerTile; p++) begin + automatic logic [1:0] smi = `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_resp_valid_i[p] + ? (`NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_resp_ready_o[p] ? 2'd2 : 2'd1) : 2'd0; + automatic logic [1:0] sso = `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_resp_valid_o[p] + ? (`NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_resp_ready_i[p] ? 2'd2 : 2'd1) : 2'd0; + // Tile RESP packets use Q lines (kept apart from P req lines): + // Q + // master_resp(in,io0): responder not on tcdm boundary -> srcvalid=0, dst=this group. + // slave_resp(out,io1): src=this group, dst=src_group_id (requester it returns to). + if (smi != ts_mrsp_st[g][t][p]) begin + $fwrite(f_tile[g][t], "S 1 0 %0d %0d %0d %0d\n", p, ts_mrsp_s[g][t][p], cycle_q, ts_mrsp_st[g][t][p]); + ts_mrsp_st[g][t][p] <= smi; ts_mrsp_s[g][t][p] <= cycle_q; end - end - end - end - end - end - endgenerate - - function route_direction_e resp_xy_routing (group_xy_id_t group_id, floo_resp_t floo_resp); - automatic group_xy_id_t dest_id = group_xy_id_t'(floo_resp.hdr.dst_id); - if (dest_id == group_id) begin - resp_xy_routing = Eject; - end else if (dest_id.x == group_id.x) begin - if (dest_id.y < group_id.y) begin - resp_xy_routing = South; - end else begin - resp_xy_routing = North; - end - end else begin - if (dest_id.x < group_id.x) begin - resp_xy_routing = West; - end else begin - resp_xy_routing = East; - end - end - endfunction - - generate - for (genvar g = 0; g < NumGroups; g++) begin : gen_resp_router_input_profile_per_group - for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_resp_router_input_profile_per_tile - for (genvar r = 0; r < (NumRemoteRespPortsPerTile - 1); r++) begin : 
gen_resp_router_input_profile_per_remote_port - for (genvar router_p = 0; router_p < 5; router_p++) begin : gen_resp_router_input_profile_per_dir - always_ff @(posedge clk or negedge rst_n) begin - if (!rst_n) begin - resp_router_input_profile_q[g][t][r].in_vld_cyc_num[router_p] = '0; - resp_router_input_profile_q[g][t][r].in_hsk_cyc_num[router_p] = '0; - resp_router_input_profile_q[g][t][r].hol_stall_cyc_num[router_p] = '0; - resp_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p] = {'0, '0, '0, '0, '0}; - resp_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p] = '0; - resp_router_input_profile_q[g][t][r].max_stall_cyc_num[router_p] = '0; - end else begin - if ((cycle_q % 200) == 0) begin - resp_router_input_profile_q[g][t][r].in_vld_cyc_num[router_p] = '0; - resp_router_input_profile_q[g][t][r].in_hsk_cyc_num[router_p] = '0; - resp_router_input_profile_q[g][t][r].hol_stall_cyc_num[router_p] = '0; - resp_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p] = {'0, '0, '0, '0, '0}; - resp_router_input_profile_q[g][t][r].max_stall_cyc_num[router_p] = '0; - end - - if (|floo_resp_input_fifo_valid_i[g][t][r][router_p]) begin - resp_router_input_profile_q[g][t][r].in_vld_cyc_num[router_p] += 1; - - if (|(floo_resp_input_fifo_ready_o[g][t][r][router_p] & - floo_resp_input_fifo_valid_i[g][t][r][router_p])) begin - resp_router_input_profile_q[g][t][r].in_hsk_cyc_num[router_p] += 1; - - if (resp_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p] > 0) begin - if (resp_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p] > - resp_router_input_profile_q[g][t][r].max_stall_cyc_num[router_p]) begin - resp_router_input_profile_q[g][t][r].max_stall_cyc_num[router_p] = - resp_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p]; - end - resp_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p] = 0; - end - end else begin - automatic int vc_idx = onehot_to_bin(floo_resp_input_fifo_valid_i[g][t][r][router_p]); - 
assert(|floo_resp_input_fifo_valid_o[g][t][r][router_p]); - - resp_router_input_profile_q[g][t][r].cur_stall_cyc_num[router_p] += 1; - - `ifdef XY_ROUTING - if (~floo_resp_input_fifo_ready_i[g][t][r][router_p][vc_idx]) begin - automatic route_direction_e in_dir = route_direction_e'(router_p); - automatic route_direction_e out_dir = resp_xy_routing(g, floo_resp_input_queue[g][t][r][router_p][vc_idx][0]); - automatic group_xy_id_t cur_hop = g; - automatic logic cont = '1; - - resp_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][out_dir] += 1; - assert(floo_resp_output_fifo_valid_i[g][t][r][out_dir][vc_idx]); - - while ('1) begin - for (int i = 1; i < floo_resp_input_queue[cur_hop][t][r][in_dir][vc_idx].size(); i++) begin - out_dir = resp_xy_routing(cur_hop, floo_resp_input_queue[cur_hop][t][r][in_dir][vc_idx][i]); - - if (~floo_resp_output_fifo_valid_i[cur_hop][t][r][out_dir][vc_idx] & - floo_resp_output_fifo_ready_o[cur_hop][t][r][out_dir][vc_idx]) begin - resp_router_input_profile_q[g][t][r].hol_stall_cyc_num[router_p] += 1; - cont = '0; - break; - end - end - - if (~cont) break; - - out_dir = resp_xy_routing(cur_hop, floo_resp_input_queue[cur_hop][t][r][in_dir][vc_idx][0]); - assert(floo_resp_output_fifo_valid_i[cur_hop][t][r][out_dir][vc_idx]); - - if (floo_resp_output_fifo_ready_o[cur_hop][t][r][out_dir][vc_idx] | - floo_resp_output_fifo_ready_i[cur_hop][t][r][out_dir][vc_idx]) begin - break; - end - - if (out_dir == Eject) break; - - cur_hop = get_next_hop(cur_hop, out_dir); - - if (out_dir == North) begin - in_dir = South; - end else if (out_dir == South) begin - in_dir = North; - end else if (out_dir == East) begin - in_dir = West; - end else if (out_dir == West) begin - in_dir = East; - end - - if (floo_resp_input_fifo_ready_i[cur_hop][t][r][in_dir][vc_idx]) begin - break; - end - end - end - `endif - end - end + if (smi >= 2) // master_resp in: response arrives for this tile's core + $fwrite(f_tile[g][t], "Q 0 %0d %0d 0 %0d %0d %0d %0d %0d\n", 
p, cycle_q, + g, g, t, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_resp_i[p].rdata.core_id, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_master_resp_i[p].rdata.meta_id); + if (sso != ts_srsp_st[g][t][p]) begin + $fwrite(f_tile[g][t], "S 1 1 %0d %0d %0d %0d\n", p, ts_srsp_s[g][t][p], cycle_q, ts_srsp_st[g][t][p]); + ts_srsp_st[g][t][p] <= sso; ts_srsp_s[g][t][p] <= cycle_q; end + if (sso >= 2) // slave_resp out: this tile's SPM responds to a requester + $fwrite(f_tile[g][t], "Q 1 %0d %0d 1 %0d %0d %0d %0d %0d\n", p, cycle_q, + g, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_resp_o[p].src_group_id, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_resp_o[p].ini_addr, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_resp_o[p].rdata.core_id, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.tcdm_slave_resp_o[p].rdata.meta_id); end end end @@ -1525,222 +331,97 @@ end endgenerate - always_ff @(negedge clk) begin : log_resp_router_input_profile - if (rst_n) begin - for (int g = 0; g < NumGroups; g++) begin - for (int t = 0; t < NumTilesPerGroup; t++) begin - for (int r = 0; r < (NumRemoteRespPortsPerTile - 1); r++) begin - for (int router_p = 0; router_p < 5; router_p++) begin - if ((cycle_q % 200) == 199) begin - automatic string log_str; - - log_str = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'PORT': %03d, 'DIR': %03d, " - "'start_cycle': %03d, 'end_cycle': %03d, " - "'in_vld_cyc_num': %03d, 'in_hsk_cyc_num': %03d, " - "'hol_stall_cyc_num': %03d, 'max_stall_cyc_num': %03d, " - "'out_dir0_cong_cyc_num': %03d, 'out_dir1_cong_cyc_num': %03d, " - "'out_dir2_cong_cyc_num': %03d, 'out_dir3_cong_cyc_num': %03d, " - "'out_dir4_cong_cyc_num': %03d}\n", - g, t, r, router_p, cycle_q - 199, cycle_q, - resp_router_input_profile_q[g][t][r].in_vld_cyc_num[router_p], - resp_router_input_profile_q[g][t][r].in_hsk_cyc_num[router_p], - resp_router_input_profile_q[g][t][r].hol_stall_cyc_num[router_p], - 
resp_router_input_profile_q[g][t][r].max_stall_cyc_num[router_p], - resp_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][0], - resp_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][1], - resp_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][2], - resp_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][3], - resp_router_input_profile_q[g][t][r].out_congst_cyc_num[router_p][4] - ); - - $fwrite(resp_floo_input_log_fd, log_str); - end - end - end - end - end - end - end -`endif // NOC_PROFILING - -`ifdef SPM_PROFILING - int f_0, f_1, f_final_0, f_final_1; - string fn_0, fn_1, fn_final_0, fn_final_1; - string app, log_path; - - initial begin - void'($value$plusargs("APP=%s", app)); - $sformat(log_path, "../scripts/spm_profiling/run_logs/%s", app); - end - - profile_t dbg_profile_q[NumGroups-1:0][NumTilesPerGroup-1:0][NumBanksPerTile-1:0][2**TCDMAddrMemWidth-1:0]; - + // ------------------------------------------------------------ + // PE (Snitch core) data-port capture -> per-tile file (one line per core). Same + // format as a tile port; req = snitch_data_q* (out, read/write by qwrite, addr=qaddr), + // resp = snitch_data_p* (in). No bw/util derived for PE ports. 
+ // S ; P + // ------------------------------------------------------------ generate - for (genvar g = 0; g < NumGroups; g++) begin - for (genvar t = 0; t < NumTilesPerGroup; t++) begin - for (genvar b = 0; b < NumBanksPerTile; b++) begin - for(genvar i = 0; i < 2**TCDMAddrMemWidth; i++) begin - always_ff @(posedge clk or posedge rst_n) begin - if(cycle_q[7:0] == 'h80) begin - dbg_profile_q[g][t][b][i].initiated = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].initiated; - dbg_profile_q[g][t][b][i].initial_cycle = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].initial_cycle; - dbg_profile_q[g][t][b][i].last_read_cycle = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].last_read_cycle; - dbg_profile_q[g][t][b][i].last_write_cycle = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].last_write_cycle; - dbg_profile_q[g][t][b][i].last_access_cycle = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].last_access_cycle; - dbg_profile_q[g][t][b][i].access_read_number = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].access_read_number; - dbg_profile_q[g][t][b][i].access_write_number = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].access_write_number; - dbg_profile_q[g][t][b][i].access_number = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].access_number; - dbg_profile_q[g][t][b][i].read_cycles = 
dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].read_cycles; - dbg_profile_q[g][t][b][i].write_cycles = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].write_cycles; + for (genvar g = 0; g < NumGroups; g++) begin : gen_pe_g + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_pe_t + always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) begin + pe_req_st[g][t] <= '{default: '0}; pe_req_s[g][t] <= '{default: '0}; + pe_rsp_st[g][t] <= '{default: '0}; pe_rsp_s[g][t] <= '{default: '0}; + end else begin + for (int c = 0; c < NumCoresPerTile; c++) begin + automatic logic [1:0] rq = `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.snitch_data_qvalid[c] + ? (`NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.snitch_data_qready[c] + ? (`NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.snitch_data_qwrite[c] ? 2'd3 : 2'd2) : 2'd1) : 2'd0; + automatic logic [1:0] rp = `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.snitch_data_pvalid[c] + ? (`NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.snitch_data_pready[c] ? 
2'd2 : 2'd1) : 2'd0; + if (rq != pe_req_st[g][t][c]) begin + $fwrite(f_pe[g][t], "%0d S 0 1 0 %0d %0d %0d\n", c, pe_req_s[g][t][c], cycle_q, pe_req_st[g][t][c]); + pe_req_st[g][t][c] <= rq; pe_req_s[g][t][c] <= cycle_q; end - end - end - end - end - end - endgenerate - - always_ff @(posedge clk or posedge rst_n) begin - if (rst_n) begin - if ((cycle_q[63:0] == 'h100) || - (cycle_q[63:0] == 'h200) || - (cycle_q[63:0] == 'h400) || - (cycle_q[63:0] == 'h800) || - (cycle_q[63:0] == 'h1000) || - (cycle_q[15:0] == 'h8000)) begin - - $sformat(fn_0, "%s/trace_banks_cyc_%8x.dasm", log_path, cycle_q); - $sformat(fn_1, "%s/trace_banks_cyc_%8x_inited.dasm", log_path, cycle_q); - f_1 = $fopen(fn_1, "w"); - $display("[Tracer] Logging Banks to %s, %s", fn_0, fn_1); - - for (int g = 0; g < NumGroups; g++) begin - for (int t = 0; t < NumTilesPerGroup; t++) begin - for (int b = 0; b < NumBanksPerTile; b++) begin - for (int i = 0; i < 2 ** TCDMAddrMemWidth; i++) begin - automatic string trace_entry; - automatic string extras_str; - - extras_str = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'BANK': %03d, 'IDX': 0x%x, " - "'inited': %03d, 'ini_cyc': %03d, 'last_rd_cyc': %03d, " - "'last_wr_cyc': %03d, 'last_acc_cyc': %03d, " - "'acc_rd_num': %03d, 'acc_wr_num': %03d, 'acc_num': %03d, ", - g, t, b, i, - dbg_profile_q[g][t][b][i].initiated, - dbg_profile_q[g][t][b][i].initial_cycle, - dbg_profile_q[g][t][b][i].last_read_cycle, - dbg_profile_q[g][t][b][i].last_write_cycle, - dbg_profile_q[g][t][b][i].last_access_cycle, - dbg_profile_q[g][t][b][i].access_read_number, - dbg_profile_q[g][t][b][i].access_write_number, - dbg_profile_q[g][t][b][i].access_number - ); - - // Append read cycles - extras_str = $sformatf("%s'rd_cyc': ", extras_str); - foreach (dbg_profile_q[g][t][b][i].read_cycles[cycle_idx]) begin - extras_str = $sformatf( - "%s%03d ", extras_str, - dbg_profile_q[g][t][b][i].read_cycles[cycle_idx] - ); - end - extras_str = $sformatf("%s, ", extras_str); - - // Append write cycles - 
extras_str = $sformatf("%s'wr_cyc': ", extras_str); - foreach (dbg_profile_q[g][t][b][i].write_cycles[cycle_idx]) begin - extras_str = $sformatf( - "%s%03d ", extras_str, - dbg_profile_q[g][t][b][i].write_cycles[cycle_idx] - ); - end - extras_str = $sformatf("%s}", extras_str); - - // Conditionally log only initialized banks - if (dbg_profile_q[g][t][b][i].initiated) begin - $sformat(trace_entry, "%8d #; %s\n", cycle_q, extras_str); - $fwrite(f_1, trace_entry); - end + if (rp != pe_rsp_st[g][t][c]) begin + $fwrite(f_pe[g][t], "%0d S 1 0 0 %0d %0d %0d\n", c, pe_rsp_s[g][t][c], cycle_q, pe_rsp_st[g][t][c]); + pe_rsp_st[g][t][c] <= rp; pe_rsp_s[g][t][c] <= cycle_q; end + if (rq >= 2) // req handshake -> one core load/store accepted (qid = meta_id) + // mask qid to the low snitch_pkg::MetaIdWidth bits (upper bits are X) so the + // printed id matches the NoC's zero-extended meta_id for this request. + $fwrite(f_pe[g][t], "%0d P 1 0 %0d %0d %0h %0d\n", c, cycle_q, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.snitch_data_qwrite[c], + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.snitch_data_qaddr[c], + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.snitch_data_qid[c][snitch_pkg::MetaIdWidth-1:0]); + if (rp >= 2) // resp handshake -> a response returns to this core (pid = meta_id) + $fwrite(f_pe[g][t], "%0d P 0 0 %0d %0d\n", c, cycle_q, + `NOC_GRP(g).i_mempool_group.gen_tiles[t].i_tile.snitch_data_pid[c]); end end end - $fclose(f_1); end end - end + endgenerate + // ------------------------------------------------------------ + // End-of-sim flush: write every port's still-open S run, then close every file. + // P lines are written as they happen, so they need no flush. 
+ // ------------------------------------------------------------ final begin - $sformat(fn_final_0, "%s/trace_banks_cyc_%8x_final.dasm", log_path, cycle_q); - $sformat(fn_final_1, "%s/trace_banks_cyc_%8x_inited_final.dasm", log_path, cycle_q); - f_final_0 = $fopen(fn_final_0, "w"); - f_final_1 = $fopen(fn_final_1, "w"); - - $display("[Tracer] Final Logging Banks to %s, %s", fn_final_0, fn_final_1); - for (int g = 0; g < NumGroups; g++) begin for (int t = 0; t < NumTilesPerGroup; t++) begin - for (int b = 0; b < NumBanksPerTile; b++) begin - for (int i = 0; i < 2 ** TCDMAddrMemWidth; i++) begin - automatic string trace_entry_final; - automatic string extras_str_final; - - extras_str_final = $sformatf( - "{'GROUP': %03d, 'TILE': %03d, 'BANK': %03d, 'IDX': 0x%x, " - "'inited': %03d, 'ini_cyc': %03d, 'last_rd_cyc': %03d, " - "'last_wr_cyc': %03d, 'last_acc_cyc': %03d, " - "'acc_rd_num': %03d, 'acc_wr_num': %03d, 'acc_num': %03d, ", - g, t, b, i, - dbg_profile_q[g][t][b][i].initiated, - dbg_profile_q[g][t][b][i].initial_cycle, - dbg_profile_q[g][t][b][i].last_read_cycle, - dbg_profile_q[g][t][b][i].last_write_cycle, - dbg_profile_q[g][t][b][i].last_access_cycle, - dbg_profile_q[g][t][b][i].access_read_number, - dbg_profile_q[g][t][b][i].access_write_number, - dbg_profile_q[g][t][b][i].access_number - ); - - // Append read cycles - extras_str_final = $sformatf("%s'rd_cyc': ", extras_str_final); - foreach (dbg_profile_q[g][t][b][i].read_cycles[cycle_idx]) begin - extras_str_final = $sformatf( - "%s%03d ", extras_str_final, - dbg_profile_q[g][t][b][i].read_cycles[cycle_idx] - ); - end - extras_str_final = $sformatf("%s, ", extras_str_final); - - // Append write cycles - extras_str_final = $sformatf("%s'wr_cyc': ", extras_str_final); - foreach (dbg_profile_q[g][t][b][i].write_cycles[cycle_idx]) begin - extras_str_final = $sformatf( - "%s%03d ", extras_str_final, - dbg_profile_q[g][t][b][i].write_cycles[cycle_idx] - ); - end - extras_str_final = $sformatf("%s}", 
extras_str_final); - - // Log to inited trace file if applicable - if (dbg_profile_q[g][t][b][i].initiated) begin - $sformat(trace_entry_final, "%8d #; %s\n", cycle_q, extras_str_final); - $fwrite(f_final_1, trace_entry_final); - end - - // Log to full trace file - $sformat(trace_entry_final, "%8d #; %s\n", cycle_q, extras_str_final); - $fwrite(f_final_0, trace_entry_final); + for (int p = 0; p < NumWideRemoteReqPortsPerTile; p++) begin + automatic int rid = t*NumWideRemoteReqPortsPerTile + p; + for (int d = 0; d < 4; d++) begin + $fwrite(f_rreq[g], "%0d S %0d 0 %0d %0d %0d\n", rid, d, rsq_start[g][t][p][d][0], cycle_q, rsq_st[g][t][p][d][0]); + $fwrite(f_rreq[g], "%0d S %0d 1 %0d %0d %0d\n", rid, d, rsq_start[g][t][p][d][1], cycle_q, rsq_st[g][t][p][d][1]); + end + $fwrite(f_rreq[g], "%0d S 4 0 %0d %0d %0d\n", rid, rsq_start[g][t][p][4][0], cycle_q, rsq_st[g][t][p][4][0]); + $fwrite(f_rreq[g], "%0d S 4 1 %0d %0d %0d\n", rid, rsq_start[g][t][p][4][1], cycle_q, rsq_st[g][t][p][4][1]); + end + for (int p = 0; p < NumRemoteRespPortsPerTile-1; p++) begin + automatic int rid = t*(NumRemoteRespPortsPerTile-1) + p; + for (int d = 0; d < 4; d++) begin + $fwrite(f_rresp[g], "%0d S %0d 0 %0d %0d %0d\n", rid, d, rsp_start[g][t][p][d][0], cycle_q, rsp_st[g][t][p][d][0]); + $fwrite(f_rresp[g], "%0d S %0d 1 %0d %0d %0d\n", rid, d, rsp_start[g][t][p][d][1], cycle_q, rsp_st[g][t][p][d][1]); end + $fwrite(f_rresp[g], "%0d S 4 0 %0d %0d %0d\n", rid, rsp_start[g][t][p][4][0], cycle_q, rsp_st[g][t][p][4][0]); + $fwrite(f_rresp[g], "%0d S 4 1 %0d %0d %0d\n", rid, rsp_start[g][t][p][4][1], cycle_q, rsp_st[g][t][p][4][1]); + end + for (int p = 0; p < NumRemoteReqPortsPerTile; p++) begin + $fwrite(f_tile[g][t], "S 0 1 %0d %0d %0d %0d\n", p, ts_mreq_s[g][t][p], cycle_q, ts_mreq_st[g][t][p]); + $fwrite(f_tile[g][t], "S 0 0 %0d %0d %0d %0d\n", p, ts_sreq_s[g][t][p], cycle_q, ts_sreq_st[g][t][p]); end + for (int p = 0; p < NumRemoteRespPortsPerTile; p++) begin + $fwrite(f_tile[g][t], "S 1 0 
%0d %0d %0d %0d\n", p, ts_mrsp_s[g][t][p], cycle_q, ts_mrsp_st[g][t][p]); + $fwrite(f_tile[g][t], "S 1 1 %0d %0d %0d %0d\n", p, ts_srsp_s[g][t][p], cycle_q, ts_srsp_st[g][t][p]); + end + $fclose(f_tile[g][t]); + for (int c = 0; c < NumCoresPerTile; c++) begin + $fwrite(f_pe[g][t], "%0d S 0 1 0 %0d %0d %0d\n", c, pe_req_s[g][t][c], cycle_q, pe_req_st[g][t][c]); + $fwrite(f_pe[g][t], "%0d S 1 0 0 %0d %0d %0d\n", c, pe_rsp_s[g][t][c], cycle_q, pe_rsp_st[g][t][c]); + end + $fclose(f_pe[g][t]); end + $fclose(f_rreq[g]); + $fclose(f_rresp[g]); end - - $fclose(f_final_0); - $fclose(f_final_1); end -`endif // SPM_PROFILING + +`endif // NOC_PROFILING `endif // TARGET_VERILATOR `endif // TARGET_SYNTHESIS diff --git a/hardware/tb/tb_spm_profiling.svh b/hardware/tb/tb_spm_profiling.svh new file mode 100644 index 00000000..0416bf6b --- /dev/null +++ b/hardware/tb/tb_spm_profiling.svh @@ -0,0 +1,349 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// SPM bank-activity time-series trace (enabled by SPM_PROFILING). +// +// One lightweight log per tile (spm_profiling/bank_g_t.log) covering all +// its banks. Taps the post-arbitration bank grant + winning request payload, so +// it shows per-bank idle/stall/read/write over time AND which core won each bank +// cycle (explains tile slave_req-in stalls from bank contention). +// S state 0=idle 1=stall 2=read 3=write +// P +// loc=1 iff winning input port is local; wide=1 = DMA. Local requests zero +// their src_{grp,tile,core} payload fields, so use port for local accesses; +// remote accesses carry the true NoC origin in src_{grp,tile,core}. +// +// The old per-cycle bank-conflict counters and heavy per-word profiler +// (dbg_profile_q, mirroring mempool_tile.profile_d) are commented out at the +// bottom -- their unbounded dynamic cycle lists balloon VCS memory. 
mempool_tile's +// matching profile_d is likewise commented out (under its SPM_PROFILING gate). +// Relies on cycle_q (declared in tb_noc_profiling.svh, included first). + +`ifndef TB_SPM_PROFILING_SVH_ +`define TB_SPM_PROFILING_SVH_ + +`ifndef TARGET_SYNTHESIS +`ifndef TARGET_VERILATOR +`ifdef SPM_PROFILING + + // Hierarchical path to group g, tile t's i_tile. Defined locally because + // NOC_GRP (tb_noc_profiling.svh) only exists under NOC_PROFILING, whereas this + // trace must work whenever SPM_PROFILING is set. + `define SPM_TILE(gg,tt) dut.i_mempool_cluster.gen_groups_x[(gg)/NumY].gen_groups_y[(gg)%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[tt].i_tile + + string spm_bank_log_path; + integer spm_bank_retval; + int f_bank [NumGroups][NumTilesPerGroup]; + // Per-bank run state (0=idle 1=stall 2=read 3=write) and start cycle of the + // currently-open run. + logic [1:0] bank_st [NumGroups][NumTilesPerGroup][NumBanksPerTile]; + logic [63:0] bank_s [NumGroups][NumTilesPerGroup][NumBanksPerTile]; + + initial begin + spm_bank_log_path = "spm_profiling"; + spm_bank_retval = $system({"mkdir -p ", spm_bank_log_path}); + for (int g = 0; g < NumGroups; g++) + for (int t = 0; t < NumTilesPerGroup; t++) + f_bank[g][t] = $fopen($sformatf("%s/bank_g%0d_t%0d.log", + spm_bank_log_path, g, t), "w"); + end + + // Per-bank access capture. bank_req_valid/ready[b] is the post-arbitration + // grant; bank_req_ini_addr[b] (winning input port) + bank_req_wide[b] identify + // the requester (payload src fields valid only for remote inputs). + generate + for (genvar g = 0; g < NumGroups; g++) begin : gen_spm_bank_g + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_spm_bank_t + // local/remote boundary = NumCoresPerTile core ports on the local side. 
+ localparam int unsigned NLP = NumCoresPerTile; + always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) begin + bank_st[g][t] <= '{default: '0}; bank_s[g][t] <= '{default: '0}; + end else begin + for (int b = 0; b < NumBanksPerTile; b++) begin + automatic logic [1:0] rb = `SPM_TILE(g,t).bank_req_valid[b] + ? (`SPM_TILE(g,t).bank_req_ready[b] + ? (`SPM_TILE(g,t).bank_req_payload[b].wen ? 2'd3 : 2'd2) : 2'd1) : 2'd0; + if (rb != bank_st[g][t][b]) begin + $fwrite(f_bank[g][t], "S %0d %0d %0d %0d\n", b, bank_s[g][t][b], cycle_q, bank_st[g][t][b]); + bank_st[g][t][b] <= rb; bank_s[g][t][b] <= cycle_q; + end + if (rb >= 2) begin + // Who won the bank: bank_req_ini_addr[b] = winning input-port index + // (< NLP = local), bank_req_wide[b]=1 = DMA. Local inputs zero their + // src fields, so port identifies them; remote inputs carry origin. + automatic int unsigned winp = `SPM_TILE(g,t).bank_req_ini_addr[b]; + $fwrite(f_bank[g][t], "P %0d %0d %0d %0h %0d %0d %0d %0d %0d %0d %0d\n", + b, cycle_q, + `SPM_TILE(g,t).bank_req_payload[b].wen, + `SPM_TILE(g,t).bank_req_payload[b].tgt_addr, + (winp < NLP) ? 1 : 0, // loc: local port + `SPM_TILE(g,t).bank_req_wide[b], // wide: DMA access + winp, // winning input port + `SPM_TILE(g,t).bank_req_payload[b].src_group_id, // remote origin grp + `SPM_TILE(g,t).bank_req_payload[b].ini_addr, // remote origin tile + `SPM_TILE(g,t).bank_req_payload[b].wdata.core_id, // remote origin core + `SPM_TILE(g,t).bank_req_payload[b].wdata.meta_id); // meta_id + end + end + end + end + end + end + endgenerate + + // End-of-sim flush: emit each bank's still-open run, then close every file. 
+ final begin + for (int g = 0; g < NumGroups; g++) + for (int t = 0; t < NumTilesPerGroup; t++) begin + for (int b = 0; b < NumBanksPerTile; b++) + $fwrite(f_bank[g][t], "S %0d %0d %0d %0d\n", b, bank_s[g][t][b], cycle_q, bank_st[g][t][b]); + $fclose(f_bank[g][t]); + end + end + + /* ===================== DISABLED previous profiler ===================== + * Old per-cycle bank-conflict counters and per-word dbg_profile_q profiler. + * The per-word profiler mirrors mempool_tile.profile_d, whose unbounded dynamic + * cycle lists balloon VCS. Kept commented for reference. + * ====================================================================== + string spm_app, spm_log_path; + integer spm_retval; + int f_0, f_1, f_final_0, f_final_1; + string fn_0, fn_1, fn_final_0, fn_final_1; + int f_bc; + + initial begin + void'($value$plusargs("APP=%s", spm_app)); + $sformat(spm_log_path, "../scripts/spm_profiling/run_logs/%s", spm_app); + spm_retval = $system({"mkdir -p ", spm_log_path}); + f_bc = $fopen({spm_log_path, "/bank_conflict.log"}, "w"); + end + + // ------------------------------------------------------------ + // Bank-conflict profiling (same-bank contention from multiple tiles) + // ------------------------------------------------------------ + // Count of requests targeting the same bank from multiple tiles + logic [NumGroups-1:0] + [NumTilesPerGroup * NumBanksPerTile - 1:0] + [$clog2(NumTilesPerGroup * (NumRemoteReqPortsPerTile - 1)) : 0] + group_xbar_req_to_same_bank_count; + logic [NumGroups-1:0] + [NumTilesPerGroup * NumBanksPerTile - 1:0] + [$clog2(NumTilesPerGroup * (NumRemoteReqPortsPerTile - 1)) : 0] + group_xbar_req_to_same_bank_conflict_count; + logic [NumGroups-1:0] + [$clog2(NumTilesPerGroup * (NumRemoteReqPortsPerTile - 1)) : 0] + group_xbar_req_to_same_bank_conflict_count_sum; + logic [NumX-1:0][NumY-1:0][NumRemoteReqPortsPerTile - 2:0][NumTilesPerGroup-1:0] + tcdm_slave_req_valid; + logic [NumX-1:0][NumY-1:0][NumRemoteReqPortsPerTile - 
2:0][NumTilesPerGroup-1:0] + [idx_width(NumTilesPerGroup) + idx_width(NumBanksPerTile) - 1 : 0] + tcdm_slave_req_tgt_addr; + + generate + for (genvar x_dim = 0; x_dim < NumX; x_dim++) begin : gen_x + for (genvar y_dim = 0; y_dim < NumY; y_dim++) begin : gen_y + for (genvar p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) begin : gen_port + for (genvar t = 0; t < NumTilesPerGroup; t++) begin : gen_tile + assign tcdm_slave_req_valid[x_dim][y_dim][p][t] = + dut.i_mempool_cluster.gen_groups_x[x_dim].gen_groups_y[y_dim] + .gen_rtl_group.i_group + .floo_tcdm_req_from_router_before_xbar_valid_per_port[p + 1][t]; + assign tcdm_slave_req_tgt_addr[x_dim][y_dim][p][t] = + dut.i_mempool_cluster.gen_groups_x[x_dim].gen_groups_y[y_dim] + .gen_rtl_group.i_group.floo_tcdm_req_from_router[t][p + 1] + .hdr.tgt_addr[idx_width(NumTilesPerGroup) + idx_width(NumBanksPerTile) - 1 : 0]; + end + end + end + end + endgenerate + + always_comb begin + group_xbar_req_to_same_bank_count = '0; + for (int g = 0; g < NumGroups; g++) + for (int p = 0; p < (NumRemoteReqPortsPerTile - 1); p++) + for (int t = 0; t < NumTilesPerGroup; t++) + if (tcdm_slave_req_valid[g / NumY][g % NumY][p][t]) + group_xbar_req_to_same_bank_count[g][ + tcdm_slave_req_tgt_addr[g / NumY][g % NumY][p][t]] += 1; + end + + always_comb begin + group_xbar_req_to_same_bank_conflict_count = '0; + group_xbar_req_to_same_bank_conflict_count_sum = '0; + for (int g = 0; g < NumGroups; g++) + for (int b = 0; b < NumTilesPerGroup * NumBanksPerTile; b++) begin + if (group_xbar_req_to_same_bank_count[g][b] > 0) + group_xbar_req_to_same_bank_conflict_count[g][b] = + group_xbar_req_to_same_bank_count[g][b] - 1; // minus the winner + group_xbar_req_to_same_bank_conflict_count_sum[g] += + group_xbar_req_to_same_bank_conflict_count[g][b]; + end + end + + // Cumulative per-group bank-conflict cycles, dumped as a time series. 
+ int unsigned bank_conflict_q [NumGroups]; + always_ff @(posedge clk or negedge rst_n) begin + if (!rst_n) + for (int g = 0; g < NumGroups; g++) bank_conflict_q[g] = '0; + else + for (int g = 0; g < NumGroups; g++) + bank_conflict_q[g] += group_xbar_req_to_same_bank_conflict_count_sum[g]; + end + + always_ff @(posedge clk) begin + if (rst_n && ((cycle_q % 1024) == 0)) begin + $timeformat(-9, 0, "", 10); + $fwrite(f_bc, "dump time %t, cycle %8d #;\n", $time, cycle_q); + for (int g = 0; g < NumGroups; g++) + $fwrite(f_bc, "{'GROUP': %03d, 'bank_conflict_cyc_num': %0d}\n", g, bank_conflict_q[g]); + end + end + + // ------------------------------------------------------------ + // SPM bank-activity profiling (per-bank-word access trace) + // ------------------------------------------------------------ + profile_t dbg_profile_q[NumGroups-1:0][NumTilesPerGroup-1:0][NumBanksPerTile-1:0][2**TCDMAddrMemWidth-1:0]; + + generate + for (genvar g = 0; g < NumGroups; g++) begin + for (genvar t = 0; t < NumTilesPerGroup; t++) begin + for (genvar b = 0; b < NumBanksPerTile; b++) begin + for(genvar i = 0; i < 2**TCDMAddrMemWidth; i++) begin + always_ff @(posedge clk or posedge rst_n) begin + if(cycle_q[7:0] == 'h80) begin + dbg_profile_q[g][t][b][i].initiated = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].initiated; + dbg_profile_q[g][t][b][i].initial_cycle = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].initial_cycle; + dbg_profile_q[g][t][b][i].last_read_cycle = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].last_read_cycle; + dbg_profile_q[g][t][b][i].last_write_cycle = 
dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].last_write_cycle; + dbg_profile_q[g][t][b][i].last_access_cycle = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].last_access_cycle; + dbg_profile_q[g][t][b][i].access_read_number = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].access_read_number; + dbg_profile_q[g][t][b][i].access_write_number = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].access_write_number; + dbg_profile_q[g][t][b][i].access_number = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].access_number; + dbg_profile_q[g][t][b][i].read_cycles = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].read_cycles; + dbg_profile_q[g][t][b][i].write_cycles = dut.i_mempool_cluster.gen_groups_x[g/NumY].gen_groups_y[g%NumY].gen_rtl_group.i_group.i_mempool_group.gen_tiles[t].i_tile.profile_d[b][i].write_cycles; + end + end + end + end + end + end + endgenerate + + always_ff @(posedge clk or posedge rst_n) begin + if (rst_n) begin + if ((cycle_q[63:0] == 'h100) || + (cycle_q[63:0] == 'h200) || + (cycle_q[63:0] == 'h400) || + (cycle_q[63:0] == 'h800) || + (cycle_q[63:0] == 'h1000) || + (cycle_q[15:0] == 'h8000)) begin + + $sformat(fn_0, "%s/trace_banks_cyc_%8x.dasm", spm_log_path, cycle_q); + $sformat(fn_1, "%s/trace_banks_cyc_%8x_inited.dasm", spm_log_path, cycle_q); + f_1 = $fopen(fn_1, "w"); + $display("[Tracer] Logging Banks to %s, %s", fn_0, fn_1); + + for (int g = 0; g < NumGroups; g++) begin + for (int t = 0; t < NumTilesPerGroup; 
t++) begin + for (int b = 0; b < NumBanksPerTile; b++) begin + for (int i = 0; i < 2 ** TCDMAddrMemWidth; i++) begin + automatic string trace_entry; + automatic string extras_str; + + extras_str = $sformatf( + "{'GROUP': %03d, 'TILE': %03d, 'BANK': %03d, 'IDX': 0x%x, 'inited': %03d, 'ini_cyc': %03d, 'last_rd_cyc': %03d, 'last_wr_cyc': %03d, 'last_acc_cyc': %03d, 'acc_rd_num': %03d, 'acc_wr_num': %03d, 'acc_num': %03d, ", + g, t, b, i, + dbg_profile_q[g][t][b][i].initiated, + dbg_profile_q[g][t][b][i].initial_cycle, + dbg_profile_q[g][t][b][i].last_read_cycle, + dbg_profile_q[g][t][b][i].last_write_cycle, + dbg_profile_q[g][t][b][i].last_access_cycle, + dbg_profile_q[g][t][b][i].access_read_number, + dbg_profile_q[g][t][b][i].access_write_number, + dbg_profile_q[g][t][b][i].access_number + ); + + extras_str = $sformatf("%s'rd_cyc': ", extras_str); + foreach (dbg_profile_q[g][t][b][i].read_cycles[cycle_idx]) + extras_str = $sformatf("%s%03d ", extras_str, dbg_profile_q[g][t][b][i].read_cycles[cycle_idx]); + extras_str = $sformatf("%s, ", extras_str); + + extras_str = $sformatf("%s'wr_cyc': ", extras_str); + foreach (dbg_profile_q[g][t][b][i].write_cycles[cycle_idx]) + extras_str = $sformatf("%s%03d ", extras_str, dbg_profile_q[g][t][b][i].write_cycles[cycle_idx]); + extras_str = $sformatf("%s}", extras_str); + + if (dbg_profile_q[g][t][b][i].initiated) begin + $sformat(trace_entry, "%8d #; %s\n", cycle_q, extras_str); + $fwrite(f_1, trace_entry); + end + end + end + end + end + $fclose(f_1); + end + end + end + + final begin + $sformat(fn_final_0, "%s/trace_banks_cyc_%8x_final.dasm", spm_log_path, cycle_q); + $sformat(fn_final_1, "%s/trace_banks_cyc_%8x_inited_final.dasm", spm_log_path, cycle_q); + f_final_0 = $fopen(fn_final_0, "w"); + f_final_1 = $fopen(fn_final_1, "w"); + $display("[Tracer] Final Logging Banks to %s, %s", fn_final_0, fn_final_1); + + for (int g = 0; g < NumGroups; g++) begin + for (int t = 0; t < NumTilesPerGroup; t++) begin + for (int b = 0; b < 
NumBanksPerTile; b++) begin + for (int i = 0; i < 2 ** TCDMAddrMemWidth; i++) begin + automatic string trace_entry_final; + automatic string extras_str_final; + + extras_str_final = $sformatf( + "{'GROUP': %03d, 'TILE': %03d, 'BANK': %03d, 'IDX': 0x%x, 'inited': %03d, 'ini_cyc': %03d, 'last_rd_cyc': %03d, 'last_wr_cyc': %03d, 'last_acc_cyc': %03d, 'acc_rd_num': %03d, 'acc_wr_num': %03d, 'acc_num': %03d, ", + g, t, b, i, + dbg_profile_q[g][t][b][i].initiated, + dbg_profile_q[g][t][b][i].initial_cycle, + dbg_profile_q[g][t][b][i].last_read_cycle, + dbg_profile_q[g][t][b][i].last_write_cycle, + dbg_profile_q[g][t][b][i].last_access_cycle, + dbg_profile_q[g][t][b][i].access_read_number, + dbg_profile_q[g][t][b][i].access_write_number, + dbg_profile_q[g][t][b][i].access_number + ); + + extras_str_final = $sformatf("%s'rd_cyc': ", extras_str_final); + foreach (dbg_profile_q[g][t][b][i].read_cycles[cycle_idx]) + extras_str_final = $sformatf("%s%03d ", extras_str_final, dbg_profile_q[g][t][b][i].read_cycles[cycle_idx]); + extras_str_final = $sformatf("%s, ", extras_str_final); + + extras_str_final = $sformatf("%s'wr_cyc': ", extras_str_final); + foreach (dbg_profile_q[g][t][b][i].write_cycles[cycle_idx]) + extras_str_final = $sformatf("%s%03d ", extras_str_final, dbg_profile_q[g][t][b][i].write_cycles[cycle_idx]); + extras_str_final = $sformatf("%s}", extras_str_final); + + if (dbg_profile_q[g][t][b][i].initiated) begin + $sformat(trace_entry_final, "%8d #; %s\n", cycle_q, extras_str_final); + $fwrite(f_final_1, trace_entry_final); + end + $sformat(trace_entry_final, "%8d #; %s\n", cycle_q, extras_str_final); + $fwrite(f_final_0, trace_entry_final); + end + end + end + end + $fclose(f_final_0); + $fclose(f_final_1); + $fclose(f_bc); + end + ===================== end DISABLED previous profiler ===================== */ + +`endif // SPM_PROFILING +`endif // TARGET_VERILATOR +`endif // TARGET_SYNTHESIS +`endif // TB_SPM_PROFILING_SVH_ From 
c28a607256f37f2313cca744eb7c2e0d0424f20a Mon Sep 17 00:00:00 2001 From: Yinrong Li Date: Thu, 11 Jun 2026 03:29:44 +0200 Subject: [PATCH 4/4] [script] Add Perfetto-based trace visualization. --- Makefile | 23 + hardware/Makefile | 50 + hardware/scripts/perfetto_gen.py | 1716 ++++++++++++++++++++++++++++++ 3 files changed, 1789 insertions(+) create mode 100644 hardware/scripts/perfetto_gen.py diff --git a/Makefile b/Makefile index 1727ea19..4d2cdc40 100644 --- a/Makefile +++ b/Makefile @@ -192,6 +192,29 @@ $(VERILATOR_INSTALL_DIR)/bin/verilator: toolchain/verilator Makefile cp toolchain/verilator/bin/verilator_bin $(VERILATOR_INSTALL_DIR)/share/verilator/bin/verilator_bin cp toolchain/verilator/bin/verilator_bin $(VERILATOR_INSTALL_DIR)/bin/verilator_bin +# Perfetto trace_processor (native, large-trace acceleration). The prebuilt +# shell needs a newer GLIBC than older hosts ship, so run it in an Ubuntu +# singularity container; shell + image install under install/perfetto. +PERFETTO_INSTALL_DIR ?= $(INSTALL_DIR)/perfetto +PERFETTO_IMAGE ?= docker://ubuntu:24.04 +PERFETTO_SHELL := $(PERFETTO_INSTALL_DIR)/trace_processor_shell +PERFETTO_SIF := $(PERFETTO_INSTALL_DIR)/ubuntu2404.sif + +.PHONY: perfetto +perfetto: $(PERFETTO_SHELL) $(PERFETTO_SIF) +$(PERFETTO_SHELL): + mkdir -p $(PERFETTO_INSTALL_DIR) + @url=$$(curl -sSL --fail https://get.perfetto.dev/trace_processor | \ + grep -oE 'https://\S+linux-amd64/trace_processor_shell' | head -1); \ + test -n "$$url" || { echo "ERROR: cannot resolve latest trace_processor URL"; exit 1; }; \ + echo ">> latest trace_processor: $$url"; \ + curl -L --fail -o $@.part "$$url" && mv $@.part $@ # rename only after a complete download, so an interrupted transfer never leaves a truncated target that make considers up to date + chmod +x $@ +$(PERFETTO_SIF): + mkdir -p $(PERFETTO_INSTALL_DIR) + SINGULARITY_CACHEDIR=$(PERFETTO_INSTALL_DIR)/.cache singularity build --force $@ $(PERFETTO_IMAGE) + rm -rf $(PERFETTO_INSTALL_DIR)/.cache + # Update and patch hardware dependencies for MemPool # Previous changes will be stashed.
Clear all the stashes with `git stash clear` .PHONY: update-deps diff --git a/hardware/Makefile b/hardware/Makefile index 2f060c55..16dd8ea0 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -519,6 +519,56 @@ $(buildpath)/%.trace: $(buildpath)/%.dasm tracevis: $(MEMPOOL_DIR)/scripts/tracevis.py $(preload) $(buildpath)/*.trace -o $(buildpath)/tracevis.json +# Perfetto protobuf export. Knobs: +# slices=function|instruction|none core slice granularity (instruction needs spike-dasm) +# noc_slices=state|packet NoC port slice granularity +# free_range=1 auto-scale counter y-axes (default: pinned to [0,1]) +# flows=1 correlate packets to core requests via (requester, meta_id); forces noc_slices=packet +# freq= clock freq for the real-ns axis (default 500 MHz) +slices ?= function +freq ?= 500 +noc_slices ?= state +ifeq ($(flows),1) +override noc_slices := packet +endif +# Optional cycle window: window=START:END exports only that cycle range (smaller/faster trace). +ifdef window + win_start := $(word 1,$(subst :, ,$(window))) + win_end := $(word 2,$(subst :, ,$(window))) +endif +.PHONY: perfetto-gen +perfetto-gen: + $(python) $(ROOT_DIR)/scripts/perfetto_gen.py $(buildpath)/trace_hart_*.dasm \ + --cores-per-tile $(num_cores_per_tile) \ + --tiles-per-group $(shell echo $$(( $(num_cores) / $(num_cores_per_tile) / $(num_groups) ))) \ + --mesh-y $(shell echo $$(( $(num_groups) / $(num_x) ))) \ + --clk-freq $(freq) \ + --slices $(slices) $(if $(preload),--elf $(preload),) \ + $(if $(wildcard $(buildpath)/noc_profiling),--noc $(buildpath)/noc_profiling,) \ + $(if $(wildcard $(buildpath)/spm_profiling),--spm $(buildpath)/spm_profiling,) \ + --noc-slices $(noc_slices) $(if $(filter 1,$(free_range)),--free-range,) \ + $(if $(filter 1,$(flows)),--flows,) \ + $(if $(win_start),--cycle-start $(win_start),) $(if $(win_end),--cycle-end $(win_end),) \ + -o $(buildpath)/perf.perfetto-trace + +# Native Perfetto trace_processor lives under $(INSTALL_DIR)/perfetto, installed +# by the 
root `make perfetto`. perfetto-gen exports the trace; perfetto-view +# serves it through the installed shell + container. +perfetto_dir := $(INSTALL_DIR)/perfetto +tp_shell := $(perfetto_dir)/trace_processor_shell +perfetto_sif := $(perfetto_dir)/ubuntu2404.sif + +.PHONY: perfetto-view +perfetto-view: + @test -f $(tp_shell) -a -f $(perfetto_sif) || \ + { echo "ERROR: trace_processor not installed -- run 'make perfetto' in the repo root"; exit 1; } + @test -f $(buildpath)/perf.perfetto-trace || \ + { echo "ERROR: $(buildpath)/perf.perfetto-trace not found -- run 'make perfetto-gen' first"; exit 1; } + @echo ">> Serving $(abspath $(buildpath))/perf.perfetto-trace on 127.0.0.1:9001" + @echo ">> Reload https://ui.perfetto.dev and click YES on 'Trace Processor native acceleration'" + singularity exec -B $(abspath $(buildpath)) -B $(perfetto_dir) $(perfetto_sif) \ + $(tp_shell) --httpd $(abspath $(buildpath))/perf.perfetto-trace + ############################ # Unit tests simulation # ############################ diff --git a/hardware/scripts/perfetto_gen.py b/hardware/scripts/perfetto_gen.py new file mode 100644 index 00000000..3644a652 --- /dev/null +++ b/hardware/scripts/perfetto_gen.py @@ -0,0 +1,1716 @@ +#!/usr/bin/env python3 +# Copyright 2026 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# perfetto_gen.py -- Phase 1 of the performance-visualization plan. +# +# Reads the per-hart Snitch traces (trace_hart_*.dasm) and emits a Perfetto +# *protobuf* trace (https://ui.perfetto.dev). Scale-ready successor to and +# SUPERSET of `make tracevis`: group>tile>core nested track tree, per-core +# function/instruction slices, and per-core IPC + stall-breakdown COUNTER +# tracks (windowed over --window-ns), which tracevis never had. 
+# +# Counter semantics (matters for correctness): the .dasm emits a line only when +# an instruction RETIRES, and its stall_* fields count the cycles of the gap +# that PRECEDED that retirement, so each instruction's stalls are spread across +# the windows the retirement gap spans; every active window is emitted so an +# idle window reads a true 0 (no sample-and-hold). +# +# Everything comes from the raw .dasm (no `make trace`): stalls + pc in the +# line, function/source via addr2line on the ELF, disasm via spike-dasm. +# +# Usage: +# scripts/perfetto_gen.py build_vcs/trace_hart_*.dasm -o out.perfetto-trace +# scripts/perfetto_gen.py ... --slices instruction # full per-insn timeline +# scripts/perfetto_gen.py ... --slices none # counters only +# (geometry defaults match tensorpool64: 4 cores/tile, 4 tiles/group) + +import argparse +import bisect +import collections +import glob +import os +import re +import subprocess +import sys + +from perfetto.trace_builder.proto_builder import TraceProtoBuilder +from perfetto.protos.perfetto.trace.perfetto_trace_pb2 import TrackEvent + +SEQ = 1 # trusted_packet_sequence_id (single producer; absolute timestamps) + +# One .dasm line: