diff --git a/Bender.yml b/Bender.yml index ccf4897b..6629bcd1 100644 --- a/Bender.yml +++ b/Bender.yml @@ -34,13 +34,11 @@ sources: - hardware/src/snitch_addr_demux.sv - hardware/src/tcdm_adapter.sv - hardware/src/tcdm_shim.sv - - hardware/src/tcdm_wide_narrow_mux.sv - - hardware/src/address_scrambler.sv + - hardware/src/mempool_addr_scrambler.sv + - hardware/src/mempool_tcdm_bank_interco.sv - hardware/src/axi_L2_interleaver.sv - hardware/src/bootrom.sv - hardware/src/selector.sv - - hardware/src/mempool_bank_id_remapper.sv - - hardware/src/mempool_dma_tile_id_remapper.sv - hardware/src/mempool_tile_rw_demux.sv - hardware/src/control_registers/control_registers_reg_pkg.sv - hardware/src/control_registers/control_registers_reg_top.sv diff --git a/Makefile b/Makefile index 1727ea19..4d2cdc40 100644 --- a/Makefile +++ b/Makefile @@ -192,6 +192,29 @@ $(VERILATOR_INSTALL_DIR)/bin/verilator: toolchain/verilator Makefile cp toolchain/verilator/bin/verilator_bin $(VERILATOR_INSTALL_DIR)/share/verilator/bin/verilator_bin cp toolchain/verilator/bin/verilator_bin $(VERILATOR_INSTALL_DIR)/bin/verilator_bin +# Perfetto trace_processor (native, large-trace acceleration). The prebuilt +# shell needs a newer GLIBC than older hosts ship, so run it in an Ubuntu +# singularity container; shell + image install under install/perfetto. 
+PERFETTO_INSTALL_DIR ?= $(INSTALL_DIR)/perfetto +PERFETTO_IMAGE ?= docker://ubuntu:24.04 +PERFETTO_SHELL := $(PERFETTO_INSTALL_DIR)/trace_processor_shell +PERFETTO_SIF := $(PERFETTO_INSTALL_DIR)/ubuntu2404.sif + +.PHONY: perfetto +perfetto: $(PERFETTO_SHELL) $(PERFETTO_SIF) +$(PERFETTO_SHELL): + mkdir -p $(PERFETTO_INSTALL_DIR) + @url=$$(curl -sSL --fail https://get.perfetto.dev/trace_processor | \ + grep -oE 'https://\S+linux-amd64/trace_processor_shell' | head -1); \ + test -n "$$url" || { echo "ERROR: cannot resolve latest trace_processor URL"; exit 1; }; \ + echo ">> latest trace_processor: $$url"; \ + curl -L --fail -o $@ "$$url" + chmod +x $@ +$(PERFETTO_SIF): + mkdir -p $(PERFETTO_INSTALL_DIR) + SINGULARITY_CACHEDIR=$(PERFETTO_INSTALL_DIR)/.cache singularity build --force $@ $(PERFETTO_IMAGE) + rm -rf $(PERFETTO_INSTALL_DIR)/.cache + # Update and patch hardware dependencies for MemPool # Previous changes will be stashed. Clear all the stashes with `git stash clear` .PHONY: update-deps diff --git a/hardware/Makefile b/hardware/Makefile index 055670f9..16dd8ea0 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -35,6 +35,7 @@ questa_config ?= vcs_version ?= 2024.09-zr vcs_cmd ?= vcs-$(vcs_version) vcs_config ?= +vcs_gui ?= -verdi # Path to the application binaries app_path ?= $(abspath $(ROOT_DIR)/../software/bin) # Bender @@ -96,6 +97,8 @@ vlogan_args += -assert svaext +v2k -override_timescale=1ns/1ps -kdb ifdef preload vcs_args += +PRELOAD=$(preload) endif +vcs_assert_args ?= -assert nopostproc +vcs_args += $(vcs_assert_args) # ============================================================================ # DPI and Trace Configuration @@ -365,13 +368,20 @@ $(buildpath)/mempool_simvopt: $(buildpath)/compilevcs.sh $(buildpath)/$(dpi_libr $(vcs_cmd) vcs -full64 $(top_level) -cc $(CC) -cpp $(CXX) -ld $(CXX) $(dpi_library)/mempool_vcs_dpi.so $(vcs_config) -assert disable_cover -o mempool_simvopt # Simulation -simvcs: compile_vcs_simv +simvcs: clean-dasm 
compile_vcs_simv cd $(buildpath) && \ - ./mempool_simv $(vcs_args) -ucli -l transcript -do ../scripts/vcs/run.tcl -gui + ./mempool_simv $(vcs_args) $(vcs_gui) -ucli -do ../scripts/vcs/run.tcl -l transcript + ./scripts/return_status.sh $(buildpath)/transcript -simcvcs: compile_vcs_simvopt +simcvcs: clean-dasm compile_vcs_simvopt cd $(buildpath) && \ ./mempool_simvopt $(vcs_args) -l transcript + ./scripts/return_status.sh $(buildpath)/transcript + +simcvcs_fsdb: clean-dasm compile_vcs_simv + cd $(buildpath) && \ + ./mempool_simv $(vcs_args) -ucli -do ../scripts/vcs/dump_all.tcl -l transcript + ./scripts/return_status.sh $(buildpath)/transcript # DPIs .PHONY: dpivcs @@ -509,6 +519,56 @@ $(buildpath)/%.trace: $(buildpath)/%.dasm tracevis: $(MEMPOOL_DIR)/scripts/tracevis.py $(preload) $(buildpath)/*.trace -o $(buildpath)/tracevis.json +# Perfetto protobuf export. Knobs: +# slices=function|instruction|none core slice granularity (instruction needs spike-dasm) +# noc_slices=state|packet NoC port slice granularity +# free_range=1 auto-scale counter y-axes (default: pinned to [0,1]) +# flows=1 correlate packets to core requests via (requester, meta_id); forces noc_slices=packet +# freq=MHZ clock freq for the real-ns axis (default 500 MHz) +slices ?= function +freq ?= 500 +noc_slices ?= state +ifeq ($(flows),1) +override noc_slices := packet +endif +# Optional cycle window: window=START:END exports only that cycle range (smaller/faster trace). 
+ifdef window + win_start := $(word 1,$(subst :, ,$(window))) + win_end := $(word 2,$(subst :, ,$(window))) +endif +.PHONY: perfetto-gen +perfetto-gen: + $(python) $(ROOT_DIR)/scripts/perfetto_gen.py $(buildpath)/trace_hart_*.dasm \ + --cores-per-tile $(num_cores_per_tile) \ + --tiles-per-group $(shell echo $$(( $(num_cores) / $(num_cores_per_tile) / $(num_groups) ))) \ + --mesh-y $(shell echo $$(( $(num_groups) / $(num_x) ))) \ + --clk-freq $(freq) \ + --slices $(slices) $(if $(preload),--elf $(preload),) \ + $(if $(wildcard $(buildpath)/noc_profiling),--noc $(buildpath)/noc_profiling,) \ + $(if $(wildcard $(buildpath)/spm_profiling),--spm $(buildpath)/spm_profiling,) \ + --noc-slices $(noc_slices) $(if $(filter 1,$(free_range)),--free-range,) \ + $(if $(filter 1,$(flows)),--flows,) \ + $(if $(win_start),--cycle-start $(win_start),) $(if $(win_end),--cycle-end $(win_end),) \ + -o $(buildpath)/perf.perfetto-trace + +# Native Perfetto trace_processor lives under $(INSTALL_DIR)/perfetto, installed +# by the root `make perfetto`. perfetto-gen exports the trace; perfetto-view +# serves it through the installed shell + container. 
+perfetto_dir := $(INSTALL_DIR)/perfetto +tp_shell := $(perfetto_dir)/trace_processor_shell +perfetto_sif := $(perfetto_dir)/ubuntu2404.sif + +.PHONY: perfetto-view +perfetto-view: + @test -f $(tp_shell) -a -f $(perfetto_sif) || \ + { echo "ERROR: trace_processor not installed -- run 'make perfetto' in the repo root"; exit 1; } + @test -f $(buildpath)/perf.perfetto-trace || \ + { echo "ERROR: $(buildpath)/perf.perfetto-trace not found -- run 'make perfetto-gen' first"; exit 1; } + @echo ">> Serving $(abspath $(buildpath))/perf.perfetto-trace on 127.0.0.1:9001" + @echo ">> Reload https://ui.perfetto.dev and click YES on 'Trace Processor native acceleration'" + singularity exec -B $(abspath $(buildpath)) -B $(perfetto_dir) $(perfetto_sif) \ + $(tp_shell) --httpd $(abspath $(buildpath))/perf.perfetto-trace + ############################ # Unit tests simulation # ############################ diff --git a/hardware/scripts/perfetto_gen.py b/hardware/scripts/perfetto_gen.py new file mode 100644 index 00000000..3644a652 --- /dev/null +++ b/hardware/scripts/perfetto_gen.py @@ -0,0 +1,1716 @@ +#!/usr/bin/env python3 +# Copyright 2026 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# perfetto_gen.py -- Phase 1 of the performance-visualization plan. +# +# Reads the per-hart Snitch traces (trace_hart_*.dasm) and emits a Perfetto +# *protobuf* trace (https://ui.perfetto.dev). Scale-ready successor to and +# SUPERSET of `make tracevis`: group>tile>core nested track tree, per-core +# function/instruction slices, and per-core IPC + stall-breakdown COUNTER +# tracks (windowed over --window-ns), which tracevis never had. 
+# +# Counter semantics (matters for correctness): the .dasm emits a line only when +# an instruction RETIRES, and its stall_* count the cycles of the gap that +# PRECEDED that retirement, so each instruction's stalls are spread across the +# windows the retirement gap spans; every active window is emitted so an idle +# window reads a true 0 (no sample-and-hold). +# +# Everything comes from the raw .dasm (no `make trace`): stalls + pc in the +# line, function/source via addr2line on the elf, disasm via spike-dasm. +# +# Usage: +# scripts/perfetto_gen.py build_vcs/trace_hart_*.dasm -o out.perfetto-trace +# scripts/perfetto_gen.py ... --slices instruction # full per-insn timeline +# scripts/perfetto_gen.py ... --slices none # counters only +# (geometry defaults match tensorpool64: 4 cores/tile, 4 tiles/group) + +import argparse +import bisect +import collections +import glob +import os +import re +import subprocess +import sys + +from perfetto.trace_builder.proto_builder import TraceProtoBuilder +from perfetto.protos.perfetto.trace.perfetto_trace_pb2 import TrackEvent + +SEQ = 1 # trusted_packet_sequence_id (single producer; absolute timestamps) + +# One .dasm line: