From 8f6a3eb9d1eb8f1680c891728900f7d20290de83 Mon Sep 17 00:00:00 2001 From: Rob Taylor Date: Tue, 17 Feb 2026 18:32:59 +0000 Subject: [PATCH 1/3] Integrate gpu_sim co-simulation into loom cosim Extract the Metal co-simulation engine (SPI flash, UART, Wishbone bus trace models) from the standalone gpu_sim binary into a library module at src/sim/cosim_metal.rs. Replace the cmd_cosim stub in loom.rs with a thin wrapper that loads the design via setup::load_design() and calls cosim_metal::run_cosim(). Key changes: - New cosim_metal module exposes run_cosim()/CosimOpts/CosimResult - Design loading reuses setup::load_design() instead of duplicating it - DesignArgs gains clock_period_ps for configurable SDF clock period - load_sdf() accepts optional clock_period_ps (default 25000ps) - CI updated to use `loom cosim` instead of `gpu_sim` - gpu_sim binary removed from Cargo.toml and deleted Co-developed-by: Claude Code v2.1.44 (claude-opus-4-6) --- .github/workflows/ci.yml | 18 +- Cargo.toml | 4 - src/bin/loom.rs | 81 +++- src/lib.rs | 4 + src/{bin/gpu_sim.rs => sim/cosim_metal.rs} | 407 ++++++--------------- src/sim/mod.rs | 2 + src/sim/setup.rs | 16 +- src/sky130_pdk.rs | 7 +- 8 files changed, 211 insertions(+), 328 deletions(-) rename src/{bin/gpu_sim.rs => sim/cosim_metal.rs} (92%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dd07ade..45d0c71 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -437,24 +437,24 @@ jobs: build_artifacts/6_final.v build_artifacts/result.gemparts \ --top-module top - - name: Run GPU simulation with SDF timing (100K ticks) + - name: Run GPU co-simulation with SDF timing (100K ticks) timeout-minutes: 10 run: | - cargo run --release --features metal --bin gpu_sim -- \ + cargo run --release --features metal --bin loom -- cosim \ + build_artifacts/6_final.v build_artifacts/result.gemparts \ --config tests/mcu_soc/sim_config.json --top-module top \ --sdf build_artifacts/6_final.sdf --sdf-corner typ \ --max-cycles 100000 \ - build_artifacts/6_final.v build_artifacts/result.gemparts \ - 2>&1 | tee gpu_sim_output.txt + 2>&1 | tee cosim_output.txt - name: Verify UART boot output run: | - if grep -q "nyaa" gpu_sim_output.txt; then + if grep -q "nyaa" cosim_output.txt; then echo "MCU SoC booted successfully - UART output detected" else echo "ERROR: Expected UART output 'nyaa' not found" echo "--- Last 50 lines of simulation output ---" - tail -50 gpu_sim_output.txt + tail -50 cosim_output.txt exit 1 fi @@ -462,9 +462,9 @@ jobs: if: always() run: | { - echo "## MCU SoC Metal Simulation (with SDF timing)" + echo "## MCU SoC Metal Co-simulation (with SDF timing)" echo "\`\`\`" - tail -20 gpu_sim_output.txt + tail -20 cosim_output.txt echo "\`\`\`" } >> "$GITHUB_STEP_SUMMARY" @@ -474,7 +474,7 @@ jobs: with: name: mcu-soc-results path: | - gpu_sim_output.txt + cosim_output.txt tests/mcu_soc/sim_config.json # Build documentation (cargo doc + mdbook) and deploy to GitHub Pages diff --git a/Cargo.toml b/Cargo.toml index 2b08094..8fc2c4e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,10 +60,6 @@ required-features = ["cuda"] name = "metal_test" required-features = ["metal"] -[[bin]] -name = "gpu_sim" -required-features = ["metal"] - [dev-dependencies] criterion = "0.5" diff --git a/src/bin/loom.rs b/src/bin/loom.rs index eb576dd..60f9989 100644 --- a/src/bin/loom.rs +++ b/src/bin/loom.rs @@ -360,6 +360,7 @@ fn cmd_sim(args: SimArgs) { sdf: args.sdf.clone(), sdf_corner: args.sdf_corner.clone(), sdf_debug: args.sdf_debug, + clock_period_ps: None, }; #[allow(unused_mut)] @@ -1038,26 +1039,66 @@ fn cmd_cosim(args: CosimArgs) { #[cfg(feature = "metal")] { - // The co-simulation logic is complex (SPI flash, UART, batch GPU encoding). - // For now, delegate to gpu_sim binary. Full integration planned for a future release. - eprintln!( - "loom cosim is not yet fully integrated.\n\ - \n\ - The co-simulation logic (SPI flash model, UART, GPU batch encoding)\n\ - is currently available in the gpu_sim binary:\n\ - \n cargo run -r --features metal --bin gpu_sim -- \\\n\ - {:?} {:?} --config {:?} \\\n\ - --num-blocks {}{}{}\n", - args.netlist_verilog, - args.gemparts, - args.config, - args.num_blocks, - args.max_cycles - .map_or(String::new(), |c| format!(" --max-cycles {}", c)), - args.sdf + use gem::sim::cosim_metal::CosimOpts; + use gem::sim::setup; + use gem::testbench::TestbenchConfig; + + // Load testbench config + let file = std::fs::File::open(&args.config).expect("Failed to open config file"); + let reader = std::io::BufReader::new(file); + let config: TestbenchConfig = + serde_json::from_reader(reader).expect("Failed to parse config JSON"); + clilog::info!("Loaded testbench config: {:?}", config); + + // Determine clock period for SDF loading + let clock_period_ps = args + .clock_period + .or(config.clock_period_ps) + .or(config.timing.as_ref().map(|t| t.clock_period_ps)); + + // Determine SDF path: CLI --sdf takes priority, then config.timing.sdf_file + let sdf = args.sdf.clone().or_else(|| { + config + .timing .as_ref() - .map_or(String::new(), |s| format!(" --sdf {:?}", s)), - ); - std::process::exit(1); + .map(|t| std::path::PathBuf::from(&t.sdf_file)) + }); + let sdf_corner = if args.sdf.is_some() { + args.sdf_corner.clone() + } else if let Some(ref t) = config.timing { + t.sdf_corner.clone() + } else { + "typ".to_string() + }; + let sdf_debug = args.sdf_debug; + + let design_args = DesignArgs { + netlist_verilog: args.netlist_verilog.clone(), + top_module: args.top_module.clone(), + level_split: args.level_split.clone(), + gemparts: args.gemparts.clone(), + num_blocks: args.num_blocks, + json_path: None, + sdf, + sdf_corner, + sdf_debug, + clock_period_ps, + }; + + let mut design = setup::load_design(&design_args); + let timing_constraints = setup::build_timing_constraints(&design.script); + + let opts = CosimOpts { + max_cycles: args.max_cycles, + num_blocks: args.num_blocks, + flash_verbose: args.flash_verbose, + check_with_cpu: args.check_with_cpu, + gpu_profile: args.gpu_profile, + clock_period: args.clock_period, + }; + + let result = + gem::sim::cosim_metal::run_cosim(&mut design, &config, &opts, &timing_constraints); + std::process::exit(if result.passed { 0 } else { 1 }); } } diff --git a/src/lib.rs b/src/lib.rs index f1f71d0..6f913d6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,6 +34,10 @@ //! - [`testbench`] — Testbench configuration and VCD-driven simulation setup //! - [`display`] — Display/assertion support infrastructure +#[cfg(feature = "metal")] +#[macro_use] +extern crate objc; + pub mod aigpdk; pub mod sky130; diff --git a/src/bin/gpu_sim.rs b/src/sim/cosim_metal.rs similarity index 92% rename from src/bin/gpu_sim.rs rename to src/sim/cosim_metal.rs index 92de869..68a6b90 100644 --- a/src/bin/gpu_sim.rs +++ b/src/sim/cosim_metal.rs @@ -1,98 +1,38 @@ // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! GPU-only simulation with Metal: gate evaluation, SPI flash, and UART -//! all run on GPU. No per-tick CPU interaction needed. +//! Metal GPU co-simulation engine with SPI flash, UART, and Wishbone bus trace models. //! -//! Flash and UART IO models execute as GPU kernels after each tick's -//! simulate passes, eliminating the CPU round-trip bottleneck. -//! -//! Usage: -//! cargo run -r --features metal --bin gpu_sim -- \ -//! --config [--max-cycles N] [--num-blocks N] - -use gem::aig::{DriverType, AIG}; -use gem::aigpdk::AIGPDKLeafPins; -use gem::flatten::FlattenedScriptV1; -use gem::pe::Partition; -use gem::sky130::{detect_library_from_file, CellLibrary, SKY130LeafPins}; -use gem::staging::build_staged_aigs; -use gem::testbench::{CppSpiFlash, PortMapping, TestbenchConfig}; -use indexmap::IndexSet; -use netlistdb::{GeneralPinName, NetlistDB}; -use std::collections::HashMap; -use std::fs::File; -use std::io::BufReader; -use std::path::PathBuf; -use ulib::{AsUPtr, Device}; +//! Extracted from the `gpu_sim` binary. All IO models (flash, UART, bus trace) +//! run as GPU kernels — no per-tick CPU interaction needed. -#[macro_use] -extern crate objc; +use std::collections::HashMap; +use crate::aig::{DriverType, AIG}; +use crate::flatten::FlattenedScriptV1; +use crate::sim::setup::{self, LoadedDesign}; +use crate::testbench::{CppSpiFlash, PortMapping, TestbenchConfig, UartEvent}; use metal::{ CommandQueue, ComputePipelineState, Device as MTLDevice, MTLResourceOptions, MTLSize, SharedEvent, }; +use netlistdb::{GeneralPinName, NetlistDB}; +use ulib::{AsUPtr, Device}; -// ── CLI Arguments ──────────────────────────────────────────────────────────── - -#[derive(clap::Parser, Debug)] -#[command(name = "gpu_sim")] -#[command(about = "Hybrid GPU/CPU co-simulation with Metal")] -struct Args { - /// Gate-level verilog path synthesized in AIGPDK library. - netlist_verilog: PathBuf, - - /// Pre-compiled partition mapping (.gemparts file). - gemparts: PathBuf, - - /// Testbench configuration JSON file. - #[clap(long)] - config: PathBuf, - - /// Top module type in netlist. - #[clap(long)] - top_module: Option, - - /// Level split thresholds (comma-separated). - #[clap(long, value_delimiter = ',')] - level_split: Vec, - - /// Number of GPU threadgroups (blocks). Should be ~2x GPU SM count. - #[clap(long, default_value = "64")] - num_blocks: usize, - - /// Maximum system clock ticks to simulate. - #[clap(long)] - max_cycles: Option, - - /// Enable verbose flash model debug output. - #[clap(long)] - flash_verbose: bool, - - /// Clock period in picoseconds (overrides config file value, for UART baud calc). - #[clap(long)] - clock_period: Option, - - /// Verify GPU results against CPU baseline. - #[clap(long)] - check_with_cpu: bool, - - /// Run GPU kernel profiling: isolate each kernel in its own command buffer - /// and measure per-kernel GPU execution time. - #[clap(long)] - gpu_profile: bool, - - /// Path to SDF file for per-instance back-annotated delays. - #[clap(long)] - sdf: Option, - - /// SDF corner selection: min, typ, or max (default: typ). - #[clap(long, default_value = "typ")] - sdf_corner: String, +/// Runtime options for co-simulation. +pub struct CosimOpts { + pub max_cycles: Option, + pub num_blocks: usize, + pub flash_verbose: bool, + pub check_with_cpu: bool, + pub gpu_profile: bool, + pub clock_period: Option, +} - /// Enable SDF debug output (reports unmatched instances). - #[clap(long)] - sdf_debug: bool, +/// Result of a co-simulation run. +pub struct CosimResult { + pub passed: bool, + pub uart_events: Vec, + pub ticks_simulated: usize, } // ── Simulation Parameters (must match Metal shader) ────────────────────────── @@ -1077,7 +1017,7 @@ impl MetalSimulator { // ── GPIO ↔ State Buffer Mapping ────────────────────────────────────────────── /// Maps GPIO pin indices to bit positions in the packed u32 state buffer. -struct GpioMapping { +pub(crate) struct GpioMapping { /// gpio_in[idx] → (aigpin, state bit position) input_bits: HashMap, /// gpio_out[idx] → state bit position in output_map @@ -1114,7 +1054,7 @@ fn clear_bit(state: &mut [u32], pos: u32) { /// When `port_mapping` is provided, maps GPIO indices to port names explicitly /// (for designs like ChipFlow that use named ports instead of gpio_in[N]/gpio_out[N]). /// Falls back to parsing gpio_in[N]/gpio_out[N] from pin names when no mapping given. -fn build_gpio_mapping( +pub(crate) fn build_gpio_mapping( aig: &AIG, netlistdb: &NetlistDB, script: &FlattenedScriptV1, @@ -1836,7 +1776,7 @@ fn simulate_block_v1_inner( sram_data: &mut [u32], diag: bool, ) { - use gem::aigpdk::AIGPDK_SRAM_SIZE; + use crate::aigpdk::AIGPDK_SRAM_SIZE; let mut script_pi = 0; loop { let num_stages = script[script_pi]; @@ -2043,7 +1983,7 @@ fn simulate_block_v1_inner( .count(); let changed_bits: usize = (0..num_ios as usize) .map(|i| { - let old_wo = input_state[(io_offset as usize + i)]; + let old_wo = input_state[io_offset as usize + i]; let clken = clken_perm[i]; let wo = (old_wo & !clken) | (writeouts[i] & clken); (old_wo ^ wo).count_ones() as usize @@ -2086,157 +2026,59 @@ fn simulate_block_v1_inner( assert_eq!(script_pi, script.len()); } -// ── Main ───────────────────────────────────────────────────────────────────── +// ── Public Entry Point ─────────────────────────────────────────────────────── -fn main() { - clilog::init_stderr_color_debug(); - clilog::enable_timer("gpu_sim"); - clilog::enable_timer("gem"); - clilog::set_max_print_count(clilog::Level::Warn, "NL_SV_LIT", 1); - eprintln!("WARNING: gpu_sim is deprecated. Use `loom cosim` instead:"); - eprintln!(" cargo run -r --features metal --bin loom -- cosim ..."); - eprintln!(); - - let args = ::parse(); - clilog::info!("gpu_sim args:\n{:#?}", args); - - // ── Load testbench config ──────────────────────────────────────────── - - let file = File::open(&args.config).expect("Failed to open config file"); - let reader = BufReader::new(file); - let config: TestbenchConfig = - serde_json::from_reader(reader).expect("Failed to parse config JSON"); - clilog::info!("Loaded testbench config: {:?}", config); - - let max_ticks = args.max_cycles.unwrap_or(config.num_cycles); - - // ── Load netlist and build AIG ─────────────────────────────────────── - - let timer_load = clilog::stimer!("load_netlist"); - - // Detect cell library (AIGPDK vs SKY130) - let cell_library = detect_library_from_file(&args.netlist_verilog) - .expect("Failed to read netlist file for library detection"); - clilog::info!("Detected cell library: {}", cell_library); - - let netlistdb = match cell_library { - CellLibrary::SKY130 => NetlistDB::from_sverilog_file( - &args.netlist_verilog, - args.top_module.as_deref(), - &SKY130LeafPins, - ), - CellLibrary::AIGPDK | CellLibrary::Mixed => NetlistDB::from_sverilog_file( - &args.netlist_verilog, - args.top_module.as_deref(), - &AIGPDKLeafPins(), - ), - } - .expect("cannot build netlist"); - - let aig = AIG::from_netlistdb(&netlistdb); - clilog::info!( - "AIG: {} pins, {} DFFs, {} SRAMs", - aig.num_aigpins, - aig.dffs.len(), - aig.srams.len() - ); - clilog::finish!(timer_load); - - // ── Build staged AIGs and load partitions ──────────────────────────── - - let timer_script = clilog::stimer!("build_script"); - let stageds = build_staged_aigs(&aig, &args.level_split); - - let f = std::fs::File::open(&args.gemparts).unwrap(); - let mut buf = std::io::BufReader::new(f); - let parts_in_stages: Vec> = serde_bare::from_reader(&mut buf).unwrap(); - clilog::info!( - "Partitions per stage: {:?}", - parts_in_stages - .iter() - .map(|ps| ps.len()) - .collect::>() - ); - - let mut input_layout = Vec::new(); - for (i, driv) in aig.drivers.iter().enumerate() { - if let DriverType::InputPort(_) | DriverType::InputClockFlag(_, _) = driv { - input_layout.push(i); - } - } - - let mut script = FlattenedScriptV1::from( - &aig, - &stageds - .iter() - .map(|(_, _, staged)| staged) - .collect::>(), - &parts_in_stages - .iter() - .map(|ps| ps.as_slice()) - .collect::>(), - args.num_blocks, - input_layout, - ); - clilog::info!( - "Script: state_size={}, sram_storage={}, blocks={}, stages={}", - script.reg_io_state_size, - script.sram_storage_size, - script.num_blocks, - script.num_major_stages - ); - clilog::finish!(timer_script); - - // ── Load SDF timing data (from CLI or testbench config) ────────────── - { - let sdf_path = args.sdf.clone().or_else(|| { - config +/// Run a GPU co-simulation with testbench config. +/// +/// `design` should already have basic SDF loaded if `--sdf` was passed on CLI. +/// This function also checks `config.timing` for additional SDF configuration. +pub fn run_cosim( + design: &mut LoadedDesign, + config: &TestbenchConfig, + opts: &CosimOpts, + timing_constraints: &Option>, +) -> CosimResult { + // Load SDF from testbench config if not already loaded via CLI --sdf + if !design.script.timing_enabled { + let sdf_path_from_config = config + .timing + .as_ref() + .map(|t| std::path::PathBuf::from(&t.sdf_file)); + if let Some(ref sdf_path) = sdf_path_from_config { + let sdf_corner_str = config .timing .as_ref() - .map(|t| std::path::PathBuf::from(&t.sdf_file)) - }); - let sdf_corner_str = if args.sdf.is_some() { - &args.sdf_corner - } else if let Some(ref t) = config.timing { - &t.sdf_corner - } else { - "typ" - }; - let sdf_corner = match sdf_corner_str { - "min" => gem::sdf_parser::SdfCorner::Min, - "max" => gem::sdf_parser::SdfCorner::Max, - _ => gem::sdf_parser::SdfCorner::Typ, - }; - - if let Some(ref sdf_path) = sdf_path { - clilog::info!("Loading SDF: {:?} (corner: {})", sdf_path, sdf_corner_str); - match gem::sdf_parser::SdfFile::parse_file(sdf_path, sdf_corner) { - Ok(sdf) => { - clilog::info!("SDF loaded: {}", sdf.summary()); - let clock_ps = config - .timing - .as_ref() - .map(|t| t.clock_period_ps) - .or(config.clock_period_ps) - .unwrap_or(25000); - script.load_timing_from_sdf( - &aig, - &netlistdb, - &sdf, - clock_ps, - None, - args.sdf_debug, - ); - script.inject_timing_to_script(); - } - Err(e) => clilog::warn!("Failed to load SDF: {}", e), - } + .map(|t| t.sdf_corner.as_str()) + .unwrap_or("typ"); + let clock_ps = config + .timing + .as_ref() + .map(|t| t.clock_period_ps) + .or(config.clock_period_ps) + .unwrap_or(25000); + setup::load_sdf( + &mut design.script, + &design.aig, + &design.netlistdb, + sdf_path, + sdf_corner_str, + false, + Some(clock_ps), + ); } } + let max_ticks = opts.max_cycles.unwrap_or(config.num_cycles); + let script = &design.script; + let aig = &design.aig; + let netlistdb = &design.netlistdb; + let num_blocks = script.num_blocks; + let num_major_stages = script.num_major_stages; + let state_size = script.reg_io_state_size as usize; + // ── Build GPIO mapping ─────────────────────────────────────────────── - let gpio_map = build_gpio_mapping(&aig, &netlistdb, &script, config.port_mapping.as_ref()); + let gpio_map = build_gpio_mapping(aig, netlistdb, script, config.port_mapping.as_ref()); // Verify we found the expected GPIO pins let clock_gpio = config.clock_gpio; @@ -2288,7 +2130,7 @@ fn main() { "flash_csn", ]; for sig in &diag_signals { - let pos = resolve_signal_pos(&aig, &netlistdb, &script, sig); + let pos = resolve_signal_pos(aig, netlistdb, script, sig); if pos != 0xFFFFFFFF { clilog::info!("Diagnostic signal '{}' → output state pos {}", sig, pos); } else { @@ -2298,9 +2140,9 @@ fn main() { // ── Initialize peripheral models (CPU-side, kept for --check-with-cpu) ── - let flash: Option = if let Some(ref flash_cfg) = config.flash { + let _flash: Option = if let Some(ref flash_cfg) = config.flash { let mut fl = CppSpiFlash::new(16 * 1024 * 1024); - fl.set_verbose(args.flash_verbose); + fl.set_verbose(opts.flash_verbose); let firmware_path = std::path::Path::new(&flash_cfg.firmware); match fl.load_firmware(firmware_path, flash_cfg.firmware_offset) { Ok(size) => clilog::info!( @@ -2316,7 +2158,7 @@ fn main() { }; // CLI --clock-period overrides config file clock_period_ps; default 1000ps (1GHz) if neither set - let clock_period_ps = args.clock_period.or(config.clock_period_ps).unwrap_or(1000); + let clock_period_ps = opts.clock_period.or(config.clock_period_ps).unwrap_or(1000); let clock_hz = 1_000_000_000_000u64 / clock_period_ps; clilog::info!( "Clock period: {} ps ({} MHz), UART cycles_per_bit: {}", @@ -2330,9 +2172,7 @@ fn main() { // ── Initialize Metal simulator and GPU state buffers ───────────────── let timer_init = clilog::stimer!("init_gpu"); - let simulator = MetalSimulator::new(script.num_major_stages); - - let state_size = script.reg_io_state_size as usize; + let simulator = MetalSimulator::new(num_major_stages); // States: [input state (state_size)] [output state (state_size)] let states_buffer = simulator.device.new_buffer( @@ -2415,40 +2255,23 @@ fn main() { ); // Event buffer (for $stop/$finish/assertions) - let event_buffer = Box::new(gem::event_buffer::EventBuffer::new()); + let event_buffer = Box::new(crate::event_buffer::EventBuffer::new()); let event_buffer_ptr = Box::into_raw(event_buffer); let event_buffer_metal = simulator.device.new_buffer_with_bytes_no_copy( event_buffer_ptr as *const _, - std::mem::size_of::() as u64, + std::mem::size_of::() as u64, MTLResourceOptions::StorageModeShared, None, ); // Timing constraint buffer for GPU-side setup/hold checking. - // Format: [clock_period_ps:u32, constraints[0], constraints[1], ...] - // Each constraint word packs [setup_ps:16][hold_ps:16] for that state word. - let timing_constraints_buffer = if script.timing_enabled && !script.dff_constraints.is_empty() { - let (clock_ps, constraints) = script.build_timing_constraint_buffer(); - let non_zero = constraints.iter().filter(|&&v| v != 0).count(); - clilog::info!( - "Timing constraints: {} words, {} with DFF constraints, clock_period={}ps", - constraints.len(), - non_zero, - clock_ps - ); - // Prepend clock_period_ps as first element - let mut buf = Vec::with_capacity(1 + constraints.len()); - buf.push(clock_ps); - buf.extend_from_slice(&constraints); - let metal_buf = simulator.device.new_buffer_with_data( + let timing_constraints_buffer = timing_constraints.as_ref().map(|buf| { + simulator.device.new_buffer_with_data( buf.as_ptr() as *const _, (buf.len() * std::mem::size_of::()) as u64, MTLResourceOptions::StorageModeShared, - ); - Some(metal_buf) - } else { - None - }; + ) + }); clilog::finish!(timer_init); @@ -2456,8 +2279,6 @@ fn main() { let timer_prep = clilog::stimer!("build_state_prep_buffers"); let reset_cycles = config.reset_cycles; - let num_major_stages = script.num_major_stages; - let num_blocks = script.num_blocks; // Initial reset value let reset_val_active = if config.reset_active_high { 1u8 } else { 0u8 }; @@ -2606,7 +2427,7 @@ fn main() { if let Some(ref flash_cfg) = config.flash { use std::io::Read; let firmware_path = std::path::Path::new(&flash_cfg.firmware); - let mut file = File::open(firmware_path).expect("Failed to open firmware file"); + let mut file = std::fs::File::open(firmware_path).expect("Failed to open firmware file"); let mut data = Vec::new(); file.read_to_end(&mut data) .expect("Failed to read firmware"); @@ -2673,7 +2494,7 @@ fn main() { // ── GPU Wishbone Bus Trace buffers ──────────────────────────────── - let wb_trace_params = build_wb_trace_params(&aig, &netlistdb, &script); + let wb_trace_params = build_wb_trace_params(aig, netlistdb, script); let wb_trace_params_buffer = simulator.device.new_buffer( std::mem::size_of::() as u64, MTLResourceOptions::StorageModeShared, @@ -2707,8 +2528,8 @@ fn main() { // ── GPU Kernel Profiling (optional) ────────────────────────────────── - if args.gpu_profile { - let profile_ticks = args.max_cycles.unwrap_or(1000).min(5000); + if opts.gpu_profile { + let profile_ticks = opts.max_cycles.unwrap_or(1000).min(5000); simulator.profile_gpu_kernels( profile_ticks, num_blocks, @@ -2739,7 +2560,11 @@ fn main() { unsafe { drop(Box::from_raw(event_buffer_ptr)); } - return; + return CosimResult { + passed: true, + uart_events: Vec::new(), + ticks_simulated: 0, + }; } // ── GPU-only simulation loop ───────────────────────────────────────── @@ -2785,7 +2610,7 @@ fn main() { } // UART event collection (CPU-side, populated from channel drain) - let mut uart_events: Vec = Vec::new(); + let mut uart_events: Vec = Vec::new(); let mut uart_read_head: u32 = 0; let mut wb_trace_read_head: u32 = 0; @@ -2808,24 +2633,24 @@ fn main() { let mut diag_sram_write_count: usize = 0; // CPU verification state (--check-with-cpu) - let mut cpu_states: Vec = if args.check_with_cpu { + let mut cpu_states: Vec = if opts.check_with_cpu { vec![0u32; 2 * state_size] } else { Vec::new() }; - let mut cpu_sram: Vec = if args.check_with_cpu { + let mut cpu_sram: Vec = if opts.check_with_cpu { vec![0u32; script.sram_storage_size as usize] } else { Vec::new() }; let mut cpu_check_mismatches: usize = 0; - let cpu_check_max_ticks = if args.check_with_cpu { 500 } else { 0 }; + let cpu_check_max_ticks = if opts.check_with_cpu { 500 } else { 0 }; let mut post_reset_state_snapshot: Option> = None; let mut tick: usize = 0; while tick < max_ticks { - let batch = if args.check_with_cpu && tick < cpu_check_max_ticks { + let batch = if opts.check_with_cpu && tick < cpu_check_max_ticks { 1 // single tick for CPU comparison } else if trace_ticks > 0 && tick < reset_cycles + trace_ticks { 1 // single tick for tracing @@ -2859,7 +2684,7 @@ fn main() { // Save pre-tick state for CPU verification let saved_flash_d_i: u8; - if args.check_with_cpu && tick < cpu_check_max_ticks && batch == 1 { + if opts.check_with_cpu && tick < cpu_check_max_ticks && batch == 1 { let gpu_states: &[u32] = unsafe { std::slice::from_raw_parts(states_buffer.contents() as *const u32, 2 * state_size) }; @@ -2964,7 +2789,7 @@ fn main() { } // ── CPU verification: simulate same tick on CPU and compare ── - if args.check_with_cpu && tick < cpu_check_max_ticks && batch == 1 { + if opts.check_with_cpu && tick < cpu_check_max_ticks && batch == 1 { // CPU state_prep(fall): copy output → input, apply fall_ops // Read from the Metal buffer (updated by update_reset_in_ops each tick) cpu_states.copy_within(state_size..2 * state_size, 0); @@ -3119,13 +2944,13 @@ fn main() { if tick >= reset_cycles && tick <= reset_cycles + 5 { let (input_half, output_half) = cpu_states.split_at(state_size); compare_aig_vs_flattened( - &aig, + aig, input_half, output_half, - &script, + script, state_size, tick, - Some(&netlistdb), + Some(netlistdb), ); } @@ -3157,7 +2982,7 @@ fn main() { let gpu_output = &gpu_states[state_size..2 * state_size]; let cpu_output = &cpu_states[state_size..2 * state_size]; let mut mismatches = 0; - let mut first_mismatch_word = 0; + let mut _first_mismatch_word = 0; for i in 0..state_size { if gpu_output[i] != cpu_output[i] { if mismatches < 5 { @@ -3174,7 +2999,7 @@ fn main() { ); } if mismatches == 0 { - first_mismatch_word = i; + _first_mismatch_word = i; } mismatches += 1; } @@ -3217,7 +3042,7 @@ fn main() { // Per-tick output state change tracking if tick >= reset_cycles.saturating_sub(2) && tick <= reset_cycles + 15 { let gpu_output = &gpu_states[state_size..2 * state_size]; - let changed_words: usize = gpu_output + let _changed_words: usize = gpu_output .iter() .zip(cpu_output.iter()) .filter(|(a, b)| a != b) @@ -3253,7 +3078,7 @@ fn main() { '.' }; clilog::info!("UART TX: 0x{:02X} '{}'", byte, ch); - uart_events.push(gem::testbench::UartEvent { + uart_events.push(UartEvent { timestamp: tick, // approximate tick peripheral: "uart_0".to_string(), event: "tx".to_string(), @@ -3578,13 +3403,13 @@ fn main() { if let Some(ref output_path) = config.output_events { #[derive(serde::Serialize)] struct EventsOutput { - events: Vec, + events: Vec, } let output = EventsOutput { events: uart_events.clone(), }; let json = serde_json::to_string_pretty(&output).expect("Failed to serialize events"); - let mut file = File::create(output_path).expect("Failed to create events file"); + let mut file = std::fs::File::create(output_path).expect("Failed to create events file"); use std::io::Write; file.write_all(json.as_bytes()) .expect("Failed to write events"); @@ -3597,7 +3422,7 @@ fn main() { if let Some(ref ref_path) = config.events_reference { #[derive(serde::Deserialize)] struct EventsFile { - events: Vec, + events: Vec, } let ref_file = std::fs::read_to_string(ref_path) .unwrap_or_else(|e| panic!("Failed to read events reference {}: {}", ref_path, e)); @@ -3662,7 +3487,7 @@ fn main() { // ── Optional CPU verification ──────────────────────────────────────── - if args.check_with_cpu { + if opts.check_with_cpu { if cpu_check_mismatches == 0 { clilog::info!( "CPU verification: PASSED ({} ticks checked)", @@ -3677,9 +3502,6 @@ fn main() { } } - // Keep flash alive (for --check-with-cpu in future) - let _ = flash; - // Clean up event buffer unsafe { drop(Box::from_raw(event_buffer_ptr)); @@ -3690,6 +3512,11 @@ fn main() { println!("SIMULATION: PASSED"); } else { println!("SIMULATION: FAILED (event mismatch)"); - std::process::exit(1); + } + + CosimResult { + passed: events_passed, + uart_events, + ticks_simulated: max_ticks, } } diff --git a/src/sim/mod.rs b/src/sim/mod.rs index 3825cc8..89e2a60 100644 --- a/src/sim/mod.rs +++ b/src/sim/mod.rs @@ -9,6 +9,8 @@ //! - [`vcd_io`] — VCD input parsing and output writing utilities //! - [`setup`] — Design loading pipeline (netlist → AIG → script) +#[cfg(feature = "metal")] +pub mod cosim_metal; pub mod cpu_reference; pub mod setup; pub mod vcd_io; diff --git a/src/sim/setup.rs b/src/sim/setup.rs index a9b97dd..daf405b 100644 --- a/src/sim/setup.rs +++ b/src/sim/setup.rs @@ -26,6 +26,8 @@ pub struct DesignArgs { pub sdf: Option, pub sdf_corner: String, pub sdf_debug: bool, + /// Clock period in picoseconds for SDF timing. Defaults to 25000 if not set. + pub clock_period_ps: Option, } /// Result of loading a design: everything needed for simulation. @@ -133,6 +135,7 @@ pub fn load_design(args: &DesignArgs) -> LoadedDesign { sdf_path, &args.sdf_corner, args.sdf_debug, + args.clock_period_ps, ); } @@ -152,6 +155,8 @@ pub fn load_design(args: &DesignArgs) -> LoadedDesign { } /// Load SDF timing data into a script. +/// +/// `clock_period_ps` overrides the default 25000ps clock period used for timing. pub fn load_sdf( script: &mut FlattenedScriptV1, aig: &AIG, @@ -159,17 +164,24 @@ pub fn load_sdf( sdf_path: &Path, sdf_corner: &str, sdf_debug: bool, + clock_period_ps: Option, ) { + let clock_ps = clock_period_ps.unwrap_or(25000); let corner = match sdf_corner { "min" => crate::sdf_parser::SdfCorner::Min, "max" => crate::sdf_parser::SdfCorner::Max, _ => crate::sdf_parser::SdfCorner::Typ, }; - clilog::info!("Loading SDF: {:?} (corner: {})", sdf_path, sdf_corner); + clilog::info!( + "Loading SDF: {:?} (corner: {}, clock_period={}ps)", + sdf_path, + sdf_corner, + clock_ps + ); match crate::sdf_parser::SdfFile::parse_file(sdf_path, corner) { Ok(sdf) => { clilog::info!("SDF loaded: {}", sdf.summary()); - script.load_timing_from_sdf(aig, netlistdb, &sdf, 25000, None, sdf_debug); + script.load_timing_from_sdf(aig, netlistdb, &sdf, clock_ps, None, sdf_debug); script.inject_timing_to_script(); } Err(e) => clilog::warn!("Failed to load SDF: {}", e), diff --git a/src/sky130_pdk.rs b/src/sky130_pdk.rs index baafd1f..f5ddfaa 100644 --- a/src/sky130_pdk.rs +++ b/src/sky130_pdk.rs @@ -1375,9 +1375,10 @@ mod tests { #[test] fn test_parse_ha() { - let src = - std::fs::read_to_string("vendor/sky130_fd_sc_hd/cells/ha/sky130_fd_sc_hd__ha.functional.v") - .unwrap(); + let src = std::fs::read_to_string( + "vendor/sky130_fd_sc_hd/cells/ha/sky130_fd_sc_hd__ha.functional.v", + ) + .unwrap(); let model = parse_functional_model(&src).unwrap(); assert_eq!(model.module_name, "sky130_fd_sc_hd__ha"); assert_eq!(model.inputs, vec!["A", "B"]); From 9192a5b60a2fb59eaba5df9d80b567c544ff5c6c Mon Sep 17 00:00:00 2001 From: Rob Taylor Date: Tue, 17 Feb 2026 16:05:27 +0000 Subject: [PATCH 2/3] Add WNS/TNS timing analysis, clock uncertainty, and event buffer draining - Extend SimStats with worst/total slack tracking (WNS/TNS/WHS/THS) and per-endpoint HashSet for unique violation counting - Add --clock-uncertainty-ps CLI flag to gpu_sim for modeling clock jitter; uncertainty is added to setup/hold constraints via build_timing_constraint_buffer() - Drain event buffer in gpu_sim.rs GPU-only simulation loop after each batch, accumulating timing violation statistics across the full run - Print timing analysis summary (WNS/TNS, unique endpoints, PASS/FAIL) at end of simulation when timing is enabled - Add unit tests for WNS/TNS accumulation across single and multiple drains Co-developed-by: Claude Code v2.1.44 (claude-opus-4-6) --- src/bin/loom.rs | 10 +++- src/event_buffer.rs | 103 +++++++++++++++++++++++++++++++++++++++-- src/flatten.rs | 31 ++++++++----- src/sim/cosim_metal.rs | 61 ++++++++++++++++++++++++ src/sim/setup.rs | 7 ++- 5 files changed, 193 insertions(+), 19 deletions(-) diff --git a/src/bin/loom.rs b/src/bin/loom.rs index 60f9989..ebf2877 100644 --- a/src/bin/loom.rs +++ b/src/bin/loom.rs @@ -209,6 +209,10 @@ struct CosimArgs { /// Enable SDF debug output. #[clap(long)] sdf_debug: bool, + + /// Clock uncertainty in picoseconds (added to setup/hold constraints to model jitter). + #[clap(long, default_value = "0")] + clock_uncertainty_ps: u64, } /// Invoke the mt-kahypar partitioner. @@ -365,7 +369,7 @@ fn cmd_sim(args: SimArgs) { #[allow(unused_mut)] let mut design = setup::load_design(&design_args); - let timing_constraints = setup::build_timing_constraints(&design.script); + let timing_constraints = setup::build_timing_constraints(&design.script, 0); // Parse input VCD let input_vcd = std::fs::File::open(&args.input_vcd).unwrap(); @@ -1086,7 +1090,8 @@ fn cmd_cosim(args: CosimArgs) { }; let mut design = setup::load_design(&design_args); - let timing_constraints = setup::build_timing_constraints(&design.script); + let timing_constraints = + setup::build_timing_constraints(&design.script, args.clock_uncertainty_ps as u16); let opts = CosimOpts { max_cycles: args.max_cycles, @@ -1095,6 +1100,7 @@ fn cmd_cosim(args: CosimArgs) { check_with_cpu: args.check_with_cpu, gpu_profile: args.gpu_profile, clock_period: args.clock_period, + clock_uncertainty_ps: args.clock_uncertainty_ps, }; let result = diff --git a/src/event_buffer.rs b/src/event_buffer.rs index 0a54b79..5ea4ced 100644 --- a/src/event_buffer.rs +++ b/src/event_buffer.rs @@ -6,6 +6,7 @@ //! (e.g., $stop, $finish, $display, assertion failures) and the CPU processes //! them between simulation stages. +use std::collections::HashSet; use std::sync::atomic::{AtomicU32, Ordering}; /// Maximum number of events that can be buffered per cycle. @@ -211,7 +212,7 @@ pub enum AssertAction { } /// Statistics tracked during simulation. -#[derive(Debug, Default, Clone)] +#[derive(Debug, Clone)] pub struct SimStats { /// Number of assertion failures encountered pub assertion_failures: u32, @@ -219,10 +220,40 @@ pub struct SimStats { pub stop_count: u32, /// Number of events dropped due to overflow pub events_dropped: u32, - /// Number of setup timing violations (Experiment 4) + /// Number of setup timing violations pub setup_violations: u32, - /// Number of hold timing violations (Experiment 4) + /// Number of hold timing violations pub hold_violations: u32, + /// Worst negative slack for setup (most negative = worst), in picoseconds + pub worst_setup_slack_ps: i32, + /// Worst negative slack for hold (most negative = worst), in picoseconds + pub worst_hold_slack_ps: i32, + /// Total negative slack for setup (sum of all negative slacks), in picoseconds + pub total_setup_slack_ps: i64, + /// Total negative slack for hold (sum of all negative slacks), in picoseconds + pub total_hold_slack_ps: i64, + /// Unique state word IDs with setup violations + pub setup_violating_endpoints: HashSet, + /// Unique state word IDs with hold violations + pub hold_violating_endpoints: HashSet, +} + +impl Default for SimStats { + fn default() -> Self { + Self { + assertion_failures: 0, + stop_count: 0, + events_dropped: 0, + setup_violations: 0, + hold_violations: 0, + worst_setup_slack_ps: 0, + worst_hold_slack_ps: 0, + total_setup_slack_ps: 0, + total_hold_slack_ps: 0, + setup_violating_endpoints: HashSet::new(), + hold_violating_endpoints: HashSet::new(), + } + } } /// Process events from the buffer and determine simulation control. @@ -313,6 +344,11 @@ where slack ); stats.setup_violations += 1; + stats.worst_setup_slack_ps = stats.worst_setup_slack_ps.min(slack); + if slack < 0 { + stats.total_setup_slack_ps += slack as i64; + } + stats.setup_violating_endpoints.insert(word_id); } EventType::HoldViolation => { // data[0] = state word index, data[1] = slack (signed as u32) @@ -330,6 +366,11 @@ where slack ); stats.hold_violations += 1; + stats.worst_hold_slack_ps = stats.worst_hold_slack_ps.min(slack); + if slack < 0 { + stats.total_hold_slack_ps += slack as i64; + } + stats.hold_violating_endpoints.insert(word_id); } } } @@ -658,4 +699,60 @@ mod tests { assert_eq!(stats.hold_violations, 1); assert_eq!(stats.stop_count, 1); } + + #[test] + fn test_wns_tns_accumulation() { + let mut buf = EventBuffer::new(); + // Three setup violations with different slacks and word IDs + add_full_timing_event(&mut buf, EventType::SetupViolation, 10, 5, -100, 900, 200); + add_full_timing_event(&mut buf, EventType::SetupViolation, 20, 8, -450, 1200, 300); + add_full_timing_event(&mut buf, EventType::SetupViolation, 30, 5, -200, 1000, 250); + // Two hold violations + add_full_timing_event(&mut buf, EventType::HoldViolation, 15, 3, -120, 10, 50); + add_full_timing_event(&mut buf, EventType::HoldViolation, 25, 7, -280, 5, 80); + + let config = AssertConfig::default(); + let mut stats = SimStats::default(); + process_events(&buf, &config, &mut stats, |_, _, _| {}); + + // Setup: WNS = -450 (worst), TNS = -100 + -450 + -200 = -750 + assert_eq!(stats.setup_violations, 3); + assert_eq!(stats.worst_setup_slack_ps, -450); + assert_eq!(stats.total_setup_slack_ps, -750); + // 2 unique endpoints (word IDs 5 and 8) + assert_eq!(stats.setup_violating_endpoints.len(), 2); + assert!(stats.setup_violating_endpoints.contains(&5)); + assert!(stats.setup_violating_endpoints.contains(&8)); + + // Hold: WHS = -280 (worst), THS = -120 + -280 = -400 + assert_eq!(stats.hold_violations, 2); + assert_eq!(stats.worst_hold_slack_ps, -280); + assert_eq!(stats.total_hold_slack_ps, -400); + assert_eq!(stats.hold_violating_endpoints.len(), 2); + assert!(stats.hold_violating_endpoints.contains(&3)); + assert!(stats.hold_violating_endpoints.contains(&7)); + } + + #[test] + fn test_wns_tns_across_multiple_drains() { + // Simulate two separate event buffer drains (as happens across GPU batches) + let config = AssertConfig::default(); + let mut stats = SimStats::default(); + + // First batch + let mut buf1 = EventBuffer::new(); + add_full_timing_event(&mut buf1, EventType::SetupViolation, 10, 1, -100, 900, 200); + process_events(&buf1, &config, &mut stats, |_, _, _| {}); + + // Second batch (worse slack on same endpoint) + let mut buf2 = EventBuffer::new(); + add_full_timing_event(&mut buf2, EventType::SetupViolation, 20, 1, -300, 1100, 200); + process_events(&buf2, &config, &mut stats, |_, _, _| {}); + + assert_eq!(stats.setup_violations, 2); + assert_eq!(stats.worst_setup_slack_ps, -300); + assert_eq!(stats.total_setup_slack_ps, -400); + // Same endpoint (word_id=1) across both drains + assert_eq!(stats.setup_violating_endpoints.len(), 1); + } } diff --git a/src/flatten.rs b/src/flatten.rs index 2618093..ae675e2 100644 --- a/src/flatten.rs +++ b/src/flatten.rs @@ -1407,7 +1407,11 @@ impl FlattenedScriptV1 { /// Words with no DFF constraints have value 0 (skipped by the kernel). /// This is conservative: the max arrival across the word is compared against the /// min constraint, which may over-report violations but never misses real ones. - pub fn build_timing_constraint_buffer(&self) -> (u32, Vec) { + /// Build a per-word timing constraint buffer for GPU-side setup/hold checking. + /// + /// `clock_uncertainty_ps` adds margin to both setup and hold constraints, + /// modelling clock jitter. Effectively tightens the timing window by this amount. + pub fn build_timing_constraint_buffer(&self, clock_uncertainty_ps: u16) -> (u32, Vec) { let num_words = self.reg_io_state_size as usize; let mut constraints = vec![0u32; num_words]; for c in &self.dff_constraints { @@ -1421,16 +1425,19 @@ impl FlattenedScriptV1 { let existing = constraints[word_idx]; let old_setup = (existing >> 16) as u16; let old_hold = (existing & 0xFFFF) as u16; + // Add clock uncertainty to both setup and hold constraints + let effective_setup = c.setup_ps.saturating_add(clock_uncertainty_ps); + let effective_hold = c.hold_ps.saturating_add(clock_uncertainty_ps); // Most restrictive (min) constraint per word, treating 0 as "not yet set" let new_setup = if old_setup == 0 { - c.setup_ps + effective_setup } else { - old_setup.min(c.setup_ps) + old_setup.min(effective_setup) }; let new_hold = if old_hold == 0 { - c.hold_ps + effective_hold } else { - old_hold.min(c.hold_ps) + old_hold.min(effective_hold) }; constraints[word_idx] = ((new_setup as u32) << 16) | (new_hold as u32); } @@ -2223,7 +2230,7 @@ mod constraint_buffer_tests { #[test] fn test_empty_constraints() { let script = make_script_with_constraints(4, 10000, Vec::new()); - let (clock_ps, buf) = script.build_timing_constraint_buffer(); + let (clock_ps, buf) = script.build_timing_constraint_buffer(0); assert_eq!(clock_ps, 10000); assert_eq!(buf.len(), 4); assert!(buf.iter().all(|&v| v == 0)); @@ -2241,7 +2248,7 @@ mod constraint_buffer_tests { cell_id: 1, }], ); - let (clock_ps, buf) = script.build_timing_constraint_buffer(); + let (clock_ps, buf) = script.build_timing_constraint_buffer(0); assert_eq!(clock_ps, 25000); // data_state_pos 35 → word_idx 1 assert_eq!(buf[0], 0); @@ -2270,7 +2277,7 @@ mod constraint_buffer_tests { }, ], ); - let (_clock_ps, buf) = script.build_timing_constraint_buffer(); + let (_clock_ps, buf) = script.build_timing_constraint_buffer(0); // Both in word 0 → min(300,150)=150 setup, min(100,200)=100 hold assert_eq!(buf[0], (150u32 << 16) | 100); } @@ -2287,7 +2294,7 @@ mod constraint_buffer_tests { cell_id: 1, }], ); - let (_clock_ps, buf) = script.build_timing_constraint_buffer(); + let (_clock_ps, buf) = script.build_timing_constraint_buffer(0); assert!(buf.iter().all(|&v| v == 0)); } @@ -2303,7 +2310,7 @@ mod constraint_buffer_tests { cell_id: 1, }], ); - let (_clock_ps, buf) = script.build_timing_constraint_buffer(); + let (_clock_ps, buf) = script.build_timing_constraint_buffer(0); // word_idx = 100/32 = 3, but num_words = 2 → skipped assert!(buf.iter().all(|&v| v == 0)); } @@ -2311,7 +2318,7 @@ mod constraint_buffer_tests { #[test] fn test_clock_period_saturation() { let script = make_script_with_constraints(1, u64::MAX, Vec::new()); - let (clock_ps, _buf) = script.build_timing_constraint_buffer(); + let (clock_ps, _buf) = script.build_timing_constraint_buffer(0); assert_eq!(clock_ps, u32::MAX); } @@ -2440,7 +2447,7 @@ mod constraint_buffer_tests { cell_id: 1, }], ); - let (clock_ps, buf) = script.build_timing_constraint_buffer(); + let (clock_ps, buf) = script.build_timing_constraint_buffer(0); assert_eq!(clock_ps, 1000); // Verify constraint is at word 5 (160/32 = 5) diff --git a/src/sim/cosim_metal.rs b/src/sim/cosim_metal.rs index 68a6b90..cd495e8 100644 --- a/src/sim/cosim_metal.rs +++ b/src/sim/cosim_metal.rs @@ -26,6 +26,8 @@ pub struct CosimOpts { pub check_with_cpu: bool, pub gpu_profile: bool, pub clock_period: Option, + /// Clock uncertainty in picoseconds (added to setup/hold constraints to model jitter). + pub clock_uncertainty_ps: u64, } /// Result of a co-simulation run. @@ -2614,6 +2616,10 @@ pub fn run_cosim( let mut uart_read_head: u32 = 0; let mut wb_trace_read_head: u32 = 0; + // Timing violation statistics (accumulated from event buffer) + let assert_config = crate::event_buffer::AssertConfig::default(); + let mut sim_stats = crate::event_buffer::SimStats::default(); + // Profiling accumulators let mut prof_batch_encode: u64 = 0; let mut prof_gpu_wait: u64 = 0; @@ -3121,6 +3127,21 @@ pub fn run_cosim( wb_trace_read_head += 1; } } + // Drain event buffer for timing violations + sim control events + { + let eb = unsafe { &*(event_buffer_ptr as *const crate::event_buffer::EventBuffer) }; + let control = crate::event_buffer::process_events( + eb, + &assert_config, + &mut sim_stats, + |_, _, _| {}, + ); + eb.reset(); + if control == crate::event_buffer::SimControl::Terminate { + clilog::warn!("Simulation terminated by event at tick {}", tick); + break; + } + } prof_drain += t_drain.elapsed().as_nanos() as u64; total_batches += 1; @@ -3355,6 +3376,46 @@ pub fn run_cosim( ); } + // ── Timing Analysis Summary ────────────────────────────────────────── + + if script.timing_enabled { + println!(); + println!("=== Timing Analysis ==="); + println!("Clock period: {}ps", script.clock_period_ps); + if opts.clock_uncertainty_ps > 0 { + println!("Clock uncertainty: {}ps", opts.clock_uncertainty_ps); + } + println!( + "Setup violations: {} ({} unique endpoints)", + sim_stats.setup_violations, + sim_stats.setup_violating_endpoints.len() + ); + println!( + "Hold violations: {} ({} unique endpoints)", + sim_stats.hold_violations, + sim_stats.hold_violating_endpoints.len() + ); + if sim_stats.setup_violations > 0 { + println!("WNS (setup): {}ps", sim_stats.worst_setup_slack_ps); + println!("TNS (setup): {}ps", sim_stats.total_setup_slack_ps); + } + if sim_stats.hold_violations > 0 { + println!("WHS (hold): {}ps", sim_stats.worst_hold_slack_ps); + println!("THS (hold): {}ps", sim_stats.total_hold_slack_ps); + } + if sim_stats.setup_violations == 0 && sim_stats.hold_violations == 0 { + println!("TIMING: PASSED"); + } else { + println!("TIMING: VIOLATIONS DETECTED"); + } + if sim_stats.events_dropped > 0 { + println!( + "WARNING: {} events were dropped (buffer overflow), slack metrics may be approximate", + sim_stats.events_dropped + ); + } + } + // ── Results ────────────────────────────────────────────────────────── println!(); diff --git a/src/sim/setup.rs b/src/sim/setup.rs index daf405b..7718c9d 100644 --- a/src/sim/setup.rs +++ b/src/sim/setup.rs @@ -192,9 +192,12 @@ pub fn load_sdf( /// /// Returns `Some((clock_ps, constraint_buffer))` if timing is enabled, /// where `constraint_buffer` = `[clock_ps, constraints[0], constraints[1], ...]`. -pub fn build_timing_constraints(script: &FlattenedScriptV1) -> Option> { +pub fn build_timing_constraints( + script: &FlattenedScriptV1, + clock_uncertainty_ps: u16, +) -> Option> { if script.timing_enabled && !script.dff_constraints.is_empty() { - let (clock_ps, constraints) = script.build_timing_constraint_buffer(); + let (clock_ps, constraints) = script.build_timing_constraint_buffer(clock_uncertainty_ps); let non_zero = constraints.iter().filter(|&&v| v != 0).count(); clilog::info!( "Timing constraints: {} words, {} with DFF constraints, clock_period={}ps", From c1ce430e899e5f025641f8163e7439ec209b6ba9 Mon Sep 17 00:00:00 2001 From: Rob Taylor Date: Tue, 17 Feb 2026 16:12:40 +0000 Subject: [PATCH 3/3] Update timing violations docs: WNS/TNS, clock uncertainty, multi-corner - Replace old summary statistics section with WNS/TNS/WHS/THS metric definitions and example output - Add --clock-uncertainty-ps to CLI flags table with explanation - Add multi-corner analysis section with max/min corner workflow - Add clock uncertainty section explaining the jitter model - Update gpu_sim example to show --clock-uncertainty-ps flag Co-developed-by: Claude Code v2.1.44 (claude-opus-4-6) --- docs/timing-violations.md | 77 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/docs/timing-violations.md b/docs/timing-violations.md index e9ea33e..7d2ad12 100644 --- a/docs/timing-violations.md +++ b/docs/timing-violations.md @@ -51,7 +51,8 @@ Setup and hold violations occur when data arrives too late (setup) or too early design.gv design.gemparts \ --config testbench.json \ --sdf design.sdf \ - --sdf-corner typ + --sdf-corner typ \ + --clock-uncertainty-ps 500 ``` ### CLI Flags Reference @@ -61,6 +62,7 @@ Setup and hold violations occur when data arrives too late (setup) or too early | `--sdf ` | all | Path to SDF file with back-annotated delays | | `--sdf-corner ` | all | Which SDF corner to use (default: `typ`) | | `--sdf-debug` | all | Print unmatched SDF instances for debugging | +| `--clock-uncertainty-ps ` | `gpu_sim` | Clock uncertainty/jitter margin in picoseconds (default: 0). Added to both setup and hold constraints, tightening the timing window. | | `--enable-timing` | `cuda_test` | Enable timing analysis (arrival + violation checks) | | `--timing-clock-period ` | `cuda_test` | Clock period in picoseconds (default: 1000) | | `--timing-report-violations` | `cuda_test` | Report all violations, not just summary | @@ -113,14 +115,52 @@ cargo run -r --features metal --bin metal_test -- \ | **hold** | DFF hold time constraint from SDF/Liberty (picoseconds) | | **slack** | `arrival - hold`. Negative = violation amount | -### Summary Statistics +### Timing Analysis Summary + +At the end of simulation, `gpu_sim` prints a timing analysis summary with standard signoff metrics: + +``` +=== Timing Analysis === +Clock period: 1200ps +Clock uncertainty: 500ps +Setup violations: 12 (8 unique endpoints) +Hold violations: 3 (2 unique endpoints) +WNS (setup): -450ps +TNS (setup): -2340ps +WHS (hold): -120ps +THS (hold): -280ps +TIMING: VIOLATIONS DETECTED +``` + +| Metric | Meaning | +|--------|---------| +| **WNS** (Worst Negative Slack) | The most negative setup slack across the entire simulation. Indicates the single worst timing path. | +| **TNS** (Total Negative Slack) | Sum of all negative setup slacks. Indicates overall timing health — a large TNS means many paths are failing. | +| **WHS** (Worst Hold Slack) | The most negative hold slack. | +| **THS** (Total Hold Slack) | Sum of all negative hold slacks. | +| **Unique endpoints** | Number of distinct state words (each covering 32 DFF data inputs) that had at least one violation. | -At the end of simulation, GEM prints totals: +If no violations are detected, the summary shows: ``` -Simulation complete: 1000 cycles, 5 setup violations, 0 hold violations +=== Timing Analysis === +Clock period: 1200ps +Setup violations: 0 (0 unique endpoints) +Hold violations: 0 (0 unique endpoints) +TIMING: PASSED ``` +**Event buffer overflow**: The GPU-side event buffer holds up to 1024 events per batch. If a design has very many violations per batch, some events may be dropped. When this happens, a warning is printed and WNS/TNS metrics may be approximate (but never optimistic — missed events only mean the true TNS is worse). + +### Clock Uncertainty + +The `--clock-uncertainty-ps` flag models clock jitter and skew by adding a margin to both setup and hold constraints. This effectively tightens the timing window: + +- **Setup check**: `arrival + (setup + uncertainty) > clock_period` triggers a violation +- **Hold check**: `arrival < (hold + uncertainty)` triggers a violation + +Use this when your design has known clock tree uncertainty from P&R reports. A typical value for SKY130 is 100-500ps depending on clock tree quality. + ## Tracing Violations to Source Signals When you see a violation on a specific word, follow this workflow to identify the offending signals and their logic cone. @@ -186,6 +226,35 @@ If a violation is reported but you suspect it's a false positive from the approx 1. **Use `timing_sim_cpu`** for per-signal accuracy (see [Detailed CPU Timing Analysis](#4-detailed-cpu-timing-analysis) above). 2. **Timing-aware bit packing** groups signals with similar arrival times into the same thread, reducing the approximation error. See `docs/timing-simulation.md` § "Timing-Aware Bit Packing" for details. +## Multi-Corner Analysis + +A single SDF corner only catches one class of violations: the **max** (slow) corner reveals setup violations, while the **min** (fast) corner reveals hold violations. For complete timing signoff, run both corners: + +```bash +# Max corner: catches setup violations (slow paths) +cargo run -r --features metal --bin gpu_sim -- \ + design.gv design.gemparts \ + --config testbench.json \ + --sdf design.sdf --sdf-corner max \ + --clock-uncertainty-ps 500 + +# Min corner: catches hold violations (fast paths) +cargo run -r --features metal --bin gpu_sim -- \ + design.gv design.gemparts \ + --config testbench.json \ + --sdf design.sdf --sdf-corner min \ + --clock-uncertainty-ps 500 +``` + +Both runs use the same `.gemparts` file (compilation is cached), so the overhead is just the simulation time. In CI, add both as separate steps: + +```yaml +- name: Timing check (max corner - setup) + run: cargo run -r --features metal --bin gpu_sim -- ... --sdf-corner max +- name: Timing check (min corner - hold) + run: cargo run -r --features metal --bin gpu_sim -- ... --sdf-corner min +``` + ## Common Scenarios **Setup violations on many words, same cycle**: The clock period is likely too tight for the design. The combinational logic depth exceeds what can settle in one clock period. Try increasing the clock period.