From 8f6a3eb9d1eb8f1680c891728900f7d20290de83 Mon Sep 17 00:00:00 2001
From: Rob Taylor <rob.taylor@chipflow.io>
Date: Tue, 17 Feb 2026 18:32:59 +0000
Subject: [PATCH 1/3] Integrate gpu_sim co-simulation into loom cosim

Extract the Metal co-simulation engine (SPI flash, UART, Wishbone bus
trace models) from the standalone gpu_sim binary into a library module
at src/sim/cosim_metal.rs. Replace the cmd_cosim stub in loom.rs with
a thin wrapper that loads the design via setup::load_design() and calls
cosim_metal::run_cosim().

Key changes:
- New cosim_metal module exposes run_cosim()/CosimOpts/CosimResult
- Design loading reuses setup::load_design() instead of duplicating it
- DesignArgs gains clock_period_ps for configurable SDF clock period
- load_sdf() accepts optional clock_period_ps (default 25000ps)
- CI updated to use `loom cosim` instead of `gpu_sim`
- gpu_sim binary removed from Cargo.toml and deleted

Co-developed-by: Claude Code v2.1.44 (claude-opus-4-6)
---
 .github/workflows/ci.yml                   |  18 +-
 Cargo.toml                                 |   4 -
 src/bin/loom.rs                            |  81 +++-
 src/lib.rs                                 |   4 +
 src/{bin/gpu_sim.rs => sim/cosim_metal.rs} | 407 ++++++---------------
 src/sim/mod.rs                             |   2 +
 src/sim/setup.rs                           |  16 +-
 src/sky130_pdk.rs                          |   7 +-
 8 files changed, 211 insertions(+), 328 deletions(-)
 rename src/{bin/gpu_sim.rs => sim/cosim_metal.rs} (92%)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index dd07ade..45d0c71 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -437,24 +437,24 @@ jobs:
             build_artifacts/6_final.v build_artifacts/result.gemparts \
             --top-module top
 
-      - name: Run GPU simulation with SDF timing (100K ticks)
+      - name: Run GPU co-simulation with SDF timing (100K ticks)
         timeout-minutes: 10
         run: |
-          cargo run --release --features metal --bin gpu_sim -- \
+          cargo run --release --features metal --bin loom -- cosim \
+            build_artifacts/6_final.v build_artifacts/result.gemparts \
             --config tests/mcu_soc/sim_config.json --top-module top \
             --sdf build_artifacts/6_final.sdf --sdf-corner typ \
             --max-cycles 100000 \
-            build_artifacts/6_final.v build_artifacts/result.gemparts \
-            2>&1 | tee gpu_sim_output.txt
+            2>&1 | tee cosim_output.txt
 
       - name: Verify UART boot output
         run: |
-          if grep -q "nyaa" gpu_sim_output.txt; then
+          if grep -q "nyaa" cosim_output.txt; then
             echo "MCU SoC booted successfully - UART output detected"
           else
             echo "ERROR: Expected UART output 'nyaa' not found"
             echo "--- Last 50 lines of simulation output ---"
-            tail -50 gpu_sim_output.txt
+            tail -50 cosim_output.txt
             exit 1
           fi
 
@@ -462,9 +462,9 @@ jobs:
         if: always()
         run: |
           {
-            echo "## MCU SoC Metal Simulation (with SDF timing)"
+            echo "## MCU SoC Metal Co-simulation (with SDF timing)"
             echo "\`\`\`"
-            tail -20 gpu_sim_output.txt
+            tail -20 cosim_output.txt
             echo "\`\`\`"
           } >> "$GITHUB_STEP_SUMMARY"
 
@@ -474,7 +474,7 @@ jobs:
         with:
           name: mcu-soc-results
           path: |
-            gpu_sim_output.txt
+            cosim_output.txt
             tests/mcu_soc/sim_config.json
 
   # Build documentation (cargo doc + mdbook) and deploy to GitHub Pages
diff --git a/Cargo.toml b/Cargo.toml
index 2b08094..8fc2c4e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -60,10 +60,6 @@ required-features = ["cuda"]
 name = "metal_test"
 required-features = ["metal"]
 
-[[bin]]
-name = "gpu_sim"
-required-features = ["metal"]
-
 [dev-dependencies]
 criterion = "0.5"
 
diff --git a/src/bin/loom.rs b/src/bin/loom.rs
index eb576dd..60f9989 100644
--- a/src/bin/loom.rs
+++ b/src/bin/loom.rs
@@ -360,6 +360,7 @@ fn cmd_sim(args: SimArgs) {
         sdf: args.sdf.clone(),
         sdf_corner: args.sdf_corner.clone(),
         sdf_debug: args.sdf_debug,
+        clock_period_ps: None,
     };
 
     #[allow(unused_mut)]
@@ -1038,26 +1039,66 @@ fn cmd_cosim(args: CosimArgs) {
 
     #[cfg(feature = "metal")]
     {
-        // The co-simulation logic is complex (SPI flash, UART, batch GPU encoding).
-        // For now, delegate to gpu_sim binary. Full integration planned for a future release.
-        eprintln!(
-            "loom cosim is not yet fully integrated.\n\
-             \n\
-             The co-simulation logic (SPI flash model, UART, GPU batch encoding)\n\
-             is currently available in the gpu_sim binary:\n\
-             \n  cargo run -r --features metal --bin gpu_sim -- \\\n\
-               {:?} {:?} --config {:?} \\\n\
-               --num-blocks {}{}{}\n",
-            args.netlist_verilog,
-            args.gemparts,
-            args.config,
-            args.num_blocks,
-            args.max_cycles
-                .map_or(String::new(), |c| format!(" --max-cycles {}", c)),
-            args.sdf
+        use gem::sim::cosim_metal::CosimOpts;
+        use gem::sim::setup;
+        use gem::testbench::TestbenchConfig;
+
+        // Load testbench config
+        let file = std::fs::File::open(&args.config).expect("Failed to open config file");
+        let reader = std::io::BufReader::new(file);
+        let config: TestbenchConfig =
+            serde_json::from_reader(reader).expect("Failed to parse config JSON");
+        clilog::info!("Loaded testbench config: {:?}", config);
+
+        // Determine clock period for SDF loading
+        let clock_period_ps = args
+            .clock_period
+            .or(config.clock_period_ps)
+            .or(config.timing.as_ref().map(|t| t.clock_period_ps));
+
+        // Determine SDF path: CLI --sdf takes priority, then config.timing.sdf_file
+        let sdf = args.sdf.clone().or_else(|| {
+            config
+                .timing
                 .as_ref()
-                .map_or(String::new(), |s| format!(" --sdf {:?}", s)),
-        );
-        std::process::exit(1);
+                .map(|t| std::path::PathBuf::from(&t.sdf_file))
+        });
+        let sdf_corner = if args.sdf.is_some() {
+            args.sdf_corner.clone()
+        } else if let Some(ref t) = config.timing {
+            t.sdf_corner.clone()
+        } else {
+            "typ".to_string()
+        };
+        let sdf_debug = args.sdf_debug;
+
+        let design_args = DesignArgs {
+            netlist_verilog: args.netlist_verilog.clone(),
+            top_module: args.top_module.clone(),
+            level_split: args.level_split.clone(),
+            gemparts: args.gemparts.clone(),
+            num_blocks: args.num_blocks,
+            json_path: None,
+            sdf,
+            sdf_corner,
+            sdf_debug,
+            clock_period_ps,
+        };
+
+        let mut design = setup::load_design(&design_args);
+        let timing_constraints = setup::build_timing_constraints(&design.script);
+
+        let opts = CosimOpts {
+            max_cycles: args.max_cycles,
+            num_blocks: args.num_blocks,
+            flash_verbose: args.flash_verbose,
+            check_with_cpu: args.check_with_cpu,
+            gpu_profile: args.gpu_profile,
+            clock_period: args.clock_period,
+        };
+
+        let result =
+            gem::sim::cosim_metal::run_cosim(&mut design, &config, &opts, &timing_constraints);
+        std::process::exit(if result.passed { 0 } else { 1 });
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index f1f71d0..6f913d6 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -34,6 +34,10 @@
 //! - [`testbench`] — Testbench configuration and VCD-driven simulation setup
 //! - [`display`] — Display/assertion support infrastructure
 
+#[cfg(feature = "metal")]
+#[macro_use]
+extern crate objc;
+
 pub mod aigpdk;
 
 pub mod sky130;
diff --git a/src/bin/gpu_sim.rs b/src/sim/cosim_metal.rs
similarity index 92%
rename from src/bin/gpu_sim.rs
rename to src/sim/cosim_metal.rs
index 92de869..68a6b90 100644
--- a/src/bin/gpu_sim.rs
+++ b/src/sim/cosim_metal.rs
@@ -1,98 +1,38 @@
 // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-//! GPU-only simulation with Metal: gate evaluation, SPI flash, and UART
-//! all run on GPU. No per-tick CPU interaction needed.
+//! Metal GPU co-simulation engine with SPI flash, UART, and Wishbone bus trace models.
 //!
-//! Flash and UART IO models execute as GPU kernels after each tick's
-//! simulate passes, eliminating the CPU round-trip bottleneck.
-//!
-//! Usage:
-//!   cargo run -r --features metal --bin gpu_sim -- <netlist.gv> <gemparts> \
-//!     --config <testbench.json> [--max-cycles N] [--num-blocks N]
-
-use gem::aig::{DriverType, AIG};
-use gem::aigpdk::AIGPDKLeafPins;
-use gem::flatten::FlattenedScriptV1;
-use gem::pe::Partition;
-use gem::sky130::{detect_library_from_file, CellLibrary, SKY130LeafPins};
-use gem::staging::build_staged_aigs;
-use gem::testbench::{CppSpiFlash, PortMapping, TestbenchConfig};
-use indexmap::IndexSet;
-use netlistdb::{GeneralPinName, NetlistDB};
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::BufReader;
-use std::path::PathBuf;
-use ulib::{AsUPtr, Device};
+//! Extracted from the `gpu_sim` binary. All IO models (flash, UART, bus trace)
+//! run as GPU kernels — no per-tick CPU interaction needed.
 
-#[macro_use]
-extern crate objc;
+use std::collections::HashMap;
 
+use crate::aig::{DriverType, AIG};
+use crate::flatten::FlattenedScriptV1;
+use crate::sim::setup::{self, LoadedDesign};
+use crate::testbench::{CppSpiFlash, PortMapping, TestbenchConfig, UartEvent};
 use metal::{
     CommandQueue, ComputePipelineState, Device as MTLDevice, MTLResourceOptions, MTLSize,
     SharedEvent,
 };
+use netlistdb::{GeneralPinName, NetlistDB};
+use ulib::{AsUPtr, Device};
 
-// ── CLI Arguments ────────────────────────────────────────────────────────────
-
-#[derive(clap::Parser, Debug)]
-#[command(name = "gpu_sim")]
-#[command(about = "Hybrid GPU/CPU co-simulation with Metal")]
-struct Args {
-    /// Gate-level verilog path synthesized in AIGPDK library.
-    netlist_verilog: PathBuf,
-
-    /// Pre-compiled partition mapping (.gemparts file).
-    gemparts: PathBuf,
-
-    /// Testbench configuration JSON file.
-    #[clap(long)]
-    config: PathBuf,
-
-    /// Top module type in netlist.
-    #[clap(long)]
-    top_module: Option<String>,
-
-    /// Level split thresholds (comma-separated).
-    #[clap(long, value_delimiter = ',')]
-    level_split: Vec<usize>,
-
-    /// Number of GPU threadgroups (blocks). Should be ~2x GPU SM count.
-    #[clap(long, default_value = "64")]
-    num_blocks: usize,
-
-    /// Maximum system clock ticks to simulate.
-    #[clap(long)]
-    max_cycles: Option<usize>,
-
-    /// Enable verbose flash model debug output.
-    #[clap(long)]
-    flash_verbose: bool,
-
-    /// Clock period in picoseconds (overrides config file value, for UART baud calc).
-    #[clap(long)]
-    clock_period: Option<u64>,
-
-    /// Verify GPU results against CPU baseline.
-    #[clap(long)]
-    check_with_cpu: bool,
-
-    /// Run GPU kernel profiling: isolate each kernel in its own command buffer
-    /// and measure per-kernel GPU execution time.
-    #[clap(long)]
-    gpu_profile: bool,
-
-    /// Path to SDF file for per-instance back-annotated delays.
-    #[clap(long)]
-    sdf: Option<PathBuf>,
-
-    /// SDF corner selection: min, typ, or max (default: typ).
-    #[clap(long, default_value = "typ")]
-    sdf_corner: String,
+/// Runtime options for co-simulation.
+pub struct CosimOpts {
+    pub max_cycles: Option<usize>,
+    pub num_blocks: usize,
+    pub flash_verbose: bool,
+    pub check_with_cpu: bool,
+    pub gpu_profile: bool,
+    pub clock_period: Option<u64>,
+}
 
-    /// Enable SDF debug output (reports unmatched instances).
-    #[clap(long)]
-    sdf_debug: bool,
+/// Result of a co-simulation run.
+pub struct CosimResult {
+    pub passed: bool,
+    pub uart_events: Vec<UartEvent>,
+    pub ticks_simulated: usize,
 }
 
 // ── Simulation Parameters (must match Metal shader) ──────────────────────────
@@ -1077,7 +1017,7 @@ impl MetalSimulator {
 // ── GPIO ↔ State Buffer Mapping ──────────────────────────────────────────────
 
 /// Maps GPIO pin indices to bit positions in the packed u32 state buffer.
-struct GpioMapping {
+pub(crate) struct GpioMapping {
     /// gpio_in[idx] → (aigpin, state bit position)
     input_bits: HashMap<usize, u32>,
     /// gpio_out[idx] → state bit position in output_map
@@ -1114,7 +1054,7 @@ fn clear_bit(state: &mut [u32], pos: u32) {
 /// When `port_mapping` is provided, maps GPIO indices to port names explicitly
 /// (for designs like ChipFlow that use named ports instead of gpio_in[N]/gpio_out[N]).
 /// Falls back to parsing gpio_in[N]/gpio_out[N] from pin names when no mapping given.
-fn build_gpio_mapping(
+pub(crate) fn build_gpio_mapping(
     aig: &AIG,
     netlistdb: &NetlistDB,
     script: &FlattenedScriptV1,
@@ -1836,7 +1776,7 @@ fn simulate_block_v1_inner(
     sram_data: &mut [u32],
     diag: bool,
 ) {
-    use gem::aigpdk::AIGPDK_SRAM_SIZE;
+    use crate::aigpdk::AIGPDK_SRAM_SIZE;
     let mut script_pi = 0;
     loop {
         let num_stages = script[script_pi];
@@ -2043,7 +1983,7 @@ fn simulate_block_v1_inner(
                 .count();
             let changed_bits: usize = (0..num_ios as usize)
                 .map(|i| {
-                    let old_wo = input_state[(io_offset as usize + i)];
+                    let old_wo = input_state[io_offset as usize + i];
                     let clken = clken_perm[i];
                     let wo = (old_wo & !clken) | (writeouts[i] & clken);
                     (old_wo ^ wo).count_ones() as usize
@@ -2086,157 +2026,59 @@ fn simulate_block_v1_inner(
     assert_eq!(script_pi, script.len());
 }
 
-// ── Main ─────────────────────────────────────────────────────────────────────
+// ── Public Entry Point ───────────────────────────────────────────────────────
 
-fn main() {
-    clilog::init_stderr_color_debug();
-    clilog::enable_timer("gpu_sim");
-    clilog::enable_timer("gem");
-    clilog::set_max_print_count(clilog::Level::Warn, "NL_SV_LIT", 1);
-    eprintln!("WARNING: gpu_sim is deprecated. Use `loom cosim` instead:");
-    eprintln!("  cargo run -r --features metal --bin loom -- cosim ...");
-    eprintln!();
-
-    let args = <Args as clap::Parser>::parse();
-    clilog::info!("gpu_sim args:\n{:#?}", args);
-
-    // ── Load testbench config ────────────────────────────────────────────
-
-    let file = File::open(&args.config).expect("Failed to open config file");
-    let reader = BufReader::new(file);
-    let config: TestbenchConfig =
-        serde_json::from_reader(reader).expect("Failed to parse config JSON");
-    clilog::info!("Loaded testbench config: {:?}", config);
-
-    let max_ticks = args.max_cycles.unwrap_or(config.num_cycles);
-
-    // ── Load netlist and build AIG ───────────────────────────────────────
-
-    let timer_load = clilog::stimer!("load_netlist");
-
-    // Detect cell library (AIGPDK vs SKY130)
-    let cell_library = detect_library_from_file(&args.netlist_verilog)
-        .expect("Failed to read netlist file for library detection");
-    clilog::info!("Detected cell library: {}", cell_library);
-
-    let netlistdb = match cell_library {
-        CellLibrary::SKY130 => NetlistDB::from_sverilog_file(
-            &args.netlist_verilog,
-            args.top_module.as_deref(),
-            &SKY130LeafPins,
-        ),
-        CellLibrary::AIGPDK | CellLibrary::Mixed => NetlistDB::from_sverilog_file(
-            &args.netlist_verilog,
-            args.top_module.as_deref(),
-            &AIGPDKLeafPins(),
-        ),
-    }
-    .expect("cannot build netlist");
-
-    let aig = AIG::from_netlistdb(&netlistdb);
-    clilog::info!(
-        "AIG: {} pins, {} DFFs, {} SRAMs",
-        aig.num_aigpins,
-        aig.dffs.len(),
-        aig.srams.len()
-    );
-    clilog::finish!(timer_load);
-
-    // ── Build staged AIGs and load partitions ────────────────────────────
-
-    let timer_script = clilog::stimer!("build_script");
-    let stageds = build_staged_aigs(&aig, &args.level_split);
-
-    let f = std::fs::File::open(&args.gemparts).unwrap();
-    let mut buf = std::io::BufReader::new(f);
-    let parts_in_stages: Vec<Vec<Partition>> = serde_bare::from_reader(&mut buf).unwrap();
-    clilog::info!(
-        "Partitions per stage: {:?}",
-        parts_in_stages
-            .iter()
-            .map(|ps| ps.len())
-            .collect::<Vec<_>>()
-    );
-
-    let mut input_layout = Vec::new();
-    for (i, driv) in aig.drivers.iter().enumerate() {
-        if let DriverType::InputPort(_) | DriverType::InputClockFlag(_, _) = driv {
-            input_layout.push(i);
-        }
-    }
-
-    let mut script = FlattenedScriptV1::from(
-        &aig,
-        &stageds
-            .iter()
-            .map(|(_, _, staged)| staged)
-            .collect::<Vec<_>>(),
-        &parts_in_stages
-            .iter()
-            .map(|ps| ps.as_slice())
-            .collect::<Vec<_>>(),
-        args.num_blocks,
-        input_layout,
-    );
-    clilog::info!(
-        "Script: state_size={}, sram_storage={}, blocks={}, stages={}",
-        script.reg_io_state_size,
-        script.sram_storage_size,
-        script.num_blocks,
-        script.num_major_stages
-    );
-    clilog::finish!(timer_script);
-
-    // ── Load SDF timing data (from CLI or testbench config) ──────────────
-    {
-        let sdf_path = args.sdf.clone().or_else(|| {
-            config
+/// Run a GPU co-simulation with testbench config.
+///
+/// `design` should already have basic SDF loaded if `--sdf` was passed on CLI.
+/// This function also checks `config.timing` for additional SDF configuration.
+pub fn run_cosim(
+    design: &mut LoadedDesign,
+    config: &TestbenchConfig,
+    opts: &CosimOpts,
+    timing_constraints: &Option<Vec<u32>>,
+) -> CosimResult {
+    // Load SDF from testbench config if not already loaded via CLI --sdf
+    if !design.script.timing_enabled {
+        let sdf_path_from_config = config
+            .timing
+            .as_ref()
+            .map(|t| std::path::PathBuf::from(&t.sdf_file));
+        if let Some(ref sdf_path) = sdf_path_from_config {
+            let sdf_corner_str = config
                 .timing
                 .as_ref()
-                .map(|t| std::path::PathBuf::from(&t.sdf_file))
-        });
-        let sdf_corner_str = if args.sdf.is_some() {
-            &args.sdf_corner
-        } else if let Some(ref t) = config.timing {
-            &t.sdf_corner
-        } else {
-            "typ"
-        };
-        let sdf_corner = match sdf_corner_str {
-            "min" => gem::sdf_parser::SdfCorner::Min,
-            "max" => gem::sdf_parser::SdfCorner::Max,
-            _ => gem::sdf_parser::SdfCorner::Typ,
-        };
-
-        if let Some(ref sdf_path) = sdf_path {
-            clilog::info!("Loading SDF: {:?} (corner: {})", sdf_path, sdf_corner_str);
-            match gem::sdf_parser::SdfFile::parse_file(sdf_path, sdf_corner) {
-                Ok(sdf) => {
-                    clilog::info!("SDF loaded: {}", sdf.summary());
-                    let clock_ps = config
-                        .timing
-                        .as_ref()
-                        .map(|t| t.clock_period_ps)
-                        .or(config.clock_period_ps)
-                        .unwrap_or(25000);
-                    script.load_timing_from_sdf(
-                        &aig,
-                        &netlistdb,
-                        &sdf,
-                        clock_ps,
-                        None,
-                        args.sdf_debug,
-                    );
-                    script.inject_timing_to_script();
-                }
-                Err(e) => clilog::warn!("Failed to load SDF: {}", e),
-            }
+                .map(|t| t.sdf_corner.as_str())
+                .unwrap_or("typ");
+            let clock_ps = config
+                .timing
+                .as_ref()
+                .map(|t| t.clock_period_ps)
+                .or(config.clock_period_ps)
+                .unwrap_or(25000);
+            setup::load_sdf(
+                &mut design.script,
+                &design.aig,
+                &design.netlistdb,
+                sdf_path,
+                sdf_corner_str,
+                false,
+                Some(clock_ps),
+            );
         }
     }
 
+    let max_ticks = opts.max_cycles.unwrap_or(config.num_cycles);
+    let script = &design.script;
+    let aig = &design.aig;
+    let netlistdb = &design.netlistdb;
+    let num_blocks = script.num_blocks;
+    let num_major_stages = script.num_major_stages;
+    let state_size = script.reg_io_state_size as usize;
+
     // ── Build GPIO mapping ───────────────────────────────────────────────
 
-    let gpio_map = build_gpio_mapping(&aig, &netlistdb, &script, config.port_mapping.as_ref());
+    let gpio_map = build_gpio_mapping(aig, netlistdb, script, config.port_mapping.as_ref());
 
     // Verify we found the expected GPIO pins
     let clock_gpio = config.clock_gpio;
@@ -2288,7 +2130,7 @@ fn main() {
         "flash_csn",
     ];
     for sig in &diag_signals {
-        let pos = resolve_signal_pos(&aig, &netlistdb, &script, sig);
+        let pos = resolve_signal_pos(aig, netlistdb, script, sig);
         if pos != 0xFFFFFFFF {
             clilog::info!("Diagnostic signal '{}' → output state pos {}", sig, pos);
         } else {
@@ -2298,9 +2140,9 @@ fn main() {
 
     // ── Initialize peripheral models (CPU-side, kept for --check-with-cpu) ──
 
-    let flash: Option<CppSpiFlash> = if let Some(ref flash_cfg) = config.flash {
+    let _flash: Option<CppSpiFlash> = if let Some(ref flash_cfg) = config.flash {
         let mut fl = CppSpiFlash::new(16 * 1024 * 1024);
-        fl.set_verbose(args.flash_verbose);
+        fl.set_verbose(opts.flash_verbose);
         let firmware_path = std::path::Path::new(&flash_cfg.firmware);
         match fl.load_firmware(firmware_path, flash_cfg.firmware_offset) {
             Ok(size) => clilog::info!(
@@ -2316,7 +2158,7 @@ fn main() {
     };
 
     // CLI --clock-period overrides config file clock_period_ps; default 1000ps (1GHz) if neither set
-    let clock_period_ps = args.clock_period.or(config.clock_period_ps).unwrap_or(1000);
+    let clock_period_ps = opts.clock_period.or(config.clock_period_ps).unwrap_or(1000);
     let clock_hz = 1_000_000_000_000u64 / clock_period_ps;
     clilog::info!(
         "Clock period: {} ps ({} MHz), UART cycles_per_bit: {}",
@@ -2330,9 +2172,7 @@ fn main() {
     // ── Initialize Metal simulator and GPU state buffers ─────────────────
 
     let timer_init = clilog::stimer!("init_gpu");
-    let simulator = MetalSimulator::new(script.num_major_stages);
-
-    let state_size = script.reg_io_state_size as usize;
+    let simulator = MetalSimulator::new(num_major_stages);
 
     // States: [input state (state_size)] [output state (state_size)]
     let states_buffer = simulator.device.new_buffer(
@@ -2415,40 +2255,23 @@ fn main() {
     );
 
     // Event buffer (for $stop/$finish/assertions)
-    let event_buffer = Box::new(gem::event_buffer::EventBuffer::new());
+    let event_buffer = Box::new(crate::event_buffer::EventBuffer::new());
     let event_buffer_ptr = Box::into_raw(event_buffer);
     let event_buffer_metal = simulator.device.new_buffer_with_bytes_no_copy(
         event_buffer_ptr as *const _,
-        std::mem::size_of::<gem::event_buffer::EventBuffer>() as u64,
+        std::mem::size_of::<crate::event_buffer::EventBuffer>() as u64,
         MTLResourceOptions::StorageModeShared,
         None,
     );
 
     // Timing constraint buffer for GPU-side setup/hold checking.
-    // Format: [clock_period_ps:u32, constraints[0], constraints[1], ...]
-    // Each constraint word packs [setup_ps:16][hold_ps:16] for that state word.
-    let timing_constraints_buffer = if script.timing_enabled && !script.dff_constraints.is_empty() {
-        let (clock_ps, constraints) = script.build_timing_constraint_buffer();
-        let non_zero = constraints.iter().filter(|&&v| v != 0).count();
-        clilog::info!(
-            "Timing constraints: {} words, {} with DFF constraints, clock_period={}ps",
-            constraints.len(),
-            non_zero,
-            clock_ps
-        );
-        // Prepend clock_period_ps as first element
-        let mut buf = Vec::with_capacity(1 + constraints.len());
-        buf.push(clock_ps);
-        buf.extend_from_slice(&constraints);
-        let metal_buf = simulator.device.new_buffer_with_data(
+    let timing_constraints_buffer = timing_constraints.as_ref().map(|buf| {
+        simulator.device.new_buffer_with_data(
             buf.as_ptr() as *const _,
             (buf.len() * std::mem::size_of::<u32>()) as u64,
             MTLResourceOptions::StorageModeShared,
-        );
-        Some(metal_buf)
-    } else {
-        None
-    };
+        )
+    });
 
     clilog::finish!(timer_init);
 
@@ -2456,8 +2279,6 @@ fn main() {
 
     let timer_prep = clilog::stimer!("build_state_prep_buffers");
     let reset_cycles = config.reset_cycles;
-    let num_major_stages = script.num_major_stages;
-    let num_blocks = script.num_blocks;
 
     // Initial reset value
     let reset_val_active = if config.reset_active_high { 1u8 } else { 0u8 };
@@ -2606,7 +2427,7 @@ fn main() {
     if let Some(ref flash_cfg) = config.flash {
         use std::io::Read;
         let firmware_path = std::path::Path::new(&flash_cfg.firmware);
-        let mut file = File::open(firmware_path).expect("Failed to open firmware file");
+        let mut file = std::fs::File::open(firmware_path).expect("Failed to open firmware file");
         let mut data = Vec::new();
         file.read_to_end(&mut data)
             .expect("Failed to read firmware");
@@ -2673,7 +2494,7 @@ fn main() {
 
     // ── GPU Wishbone Bus Trace buffers ────────────────────────────────
 
-    let wb_trace_params = build_wb_trace_params(&aig, &netlistdb, &script);
+    let wb_trace_params = build_wb_trace_params(aig, netlistdb, script);
     let wb_trace_params_buffer = simulator.device.new_buffer(
         std::mem::size_of::<WbTraceParams>() as u64,
         MTLResourceOptions::StorageModeShared,
@@ -2707,8 +2528,8 @@ fn main() {
 
     // ── GPU Kernel Profiling (optional) ──────────────────────────────────
 
-    if args.gpu_profile {
-        let profile_ticks = args.max_cycles.unwrap_or(1000).min(5000);
+    if opts.gpu_profile {
+        let profile_ticks = opts.max_cycles.unwrap_or(1000).min(5000);
         simulator.profile_gpu_kernels(
             profile_ticks,
             num_blocks,
@@ -2739,7 +2560,11 @@ fn main() {
         unsafe {
             drop(Box::from_raw(event_buffer_ptr));
         }
-        return;
+        return CosimResult {
+            passed: true,
+            uart_events: Vec::new(),
+            ticks_simulated: 0,
+        };
     }
 
     // ── GPU-only simulation loop ─────────────────────────────────────────
@@ -2785,7 +2610,7 @@ fn main() {
     }
 
     // UART event collection (CPU-side, populated from channel drain)
-    let mut uart_events: Vec<gem::testbench::UartEvent> = Vec::new();
+    let mut uart_events: Vec<UartEvent> = Vec::new();
     let mut uart_read_head: u32 = 0;
     let mut wb_trace_read_head: u32 = 0;
 
@@ -2808,24 +2633,24 @@ fn main() {
     let mut diag_sram_write_count: usize = 0;
 
     // CPU verification state (--check-with-cpu)
-    let mut cpu_states: Vec<u32> = if args.check_with_cpu {
+    let mut cpu_states: Vec<u32> = if opts.check_with_cpu {
         vec![0u32; 2 * state_size]
     } else {
         Vec::new()
     };
-    let mut cpu_sram: Vec<u32> = if args.check_with_cpu {
+    let mut cpu_sram: Vec<u32> = if opts.check_with_cpu {
         vec![0u32; script.sram_storage_size as usize]
     } else {
         Vec::new()
     };
     let mut cpu_check_mismatches: usize = 0;
-    let cpu_check_max_ticks = if args.check_with_cpu { 500 } else { 0 };
+    let cpu_check_max_ticks = if opts.check_with_cpu { 500 } else { 0 };
 
     let mut post_reset_state_snapshot: Option<Vec<u32>> = None;
 
     let mut tick: usize = 0;
     while tick < max_ticks {
-        let batch = if args.check_with_cpu && tick < cpu_check_max_ticks {
+        let batch = if opts.check_with_cpu && tick < cpu_check_max_ticks {
             1 // single tick for CPU comparison
         } else if trace_ticks > 0 && tick < reset_cycles + trace_ticks {
             1 // single tick for tracing
@@ -2859,7 +2684,7 @@ fn main() {
 
         // Save pre-tick state for CPU verification
         let saved_flash_d_i: u8;
-        if args.check_with_cpu && tick < cpu_check_max_ticks && batch == 1 {
+        if opts.check_with_cpu && tick < cpu_check_max_ticks && batch == 1 {
             let gpu_states: &[u32] = unsafe {
                 std::slice::from_raw_parts(states_buffer.contents() as *const u32, 2 * state_size)
             };
@@ -2964,7 +2789,7 @@ fn main() {
         }
 
         // ── CPU verification: simulate same tick on CPU and compare ──
-        if args.check_with_cpu && tick < cpu_check_max_ticks && batch == 1 {
+        if opts.check_with_cpu && tick < cpu_check_max_ticks && batch == 1 {
             // CPU state_prep(fall): copy output → input, apply fall_ops
             // Read from the Metal buffer (updated by update_reset_in_ops each tick)
             cpu_states.copy_within(state_size..2 * state_size, 0);
@@ -3119,13 +2944,13 @@ fn main() {
             if tick >= reset_cycles && tick <= reset_cycles + 5 {
                 let (input_half, output_half) = cpu_states.split_at(state_size);
                 compare_aig_vs_flattened(
-                    &aig,
+                    aig,
                     input_half,
                     output_half,
-                    &script,
+                    script,
                     state_size,
                     tick,
-                    Some(&netlistdb),
+                    Some(netlistdb),
                 );
             }
 
@@ -3157,7 +2982,7 @@ fn main() {
             let gpu_output = &gpu_states[state_size..2 * state_size];
             let cpu_output = &cpu_states[state_size..2 * state_size];
             let mut mismatches = 0;
-            let mut first_mismatch_word = 0;
+            let mut _first_mismatch_word = 0;
             for i in 0..state_size {
                 if gpu_output[i] != cpu_output[i] {
                     if mismatches < 5 {
@@ -3174,7 +2999,7 @@ fn main() {
                         );
                     }
                     if mismatches == 0 {
-                        first_mismatch_word = i;
+                        _first_mismatch_word = i;
                     }
                     mismatches += 1;
                 }
@@ -3217,7 +3042,7 @@ fn main() {
             // Per-tick output state change tracking
             if tick >= reset_cycles.saturating_sub(2) && tick <= reset_cycles + 15 {
                 let gpu_output = &gpu_states[state_size..2 * state_size];
-                let changed_words: usize = gpu_output
+                let _changed_words: usize = gpu_output
                     .iter()
                     .zip(cpu_output.iter())
                     .filter(|(a, b)| a != b)
@@ -3253,7 +3078,7 @@ fn main() {
                     '.'
                 };
                 clilog::info!("UART TX: 0x{:02X} '{}'", byte, ch);
-                uart_events.push(gem::testbench::UartEvent {
+                uart_events.push(UartEvent {
                     timestamp: tick, // approximate tick
                     peripheral: "uart_0".to_string(),
                     event: "tx".to_string(),
@@ -3578,13 +3403,13 @@ fn main() {
     if let Some(ref output_path) = config.output_events {
         #[derive(serde::Serialize)]
         struct EventsOutput {
-            events: Vec<gem::testbench::UartEvent>,
+            events: Vec<UartEvent>,
         }
         let output = EventsOutput {
             events: uart_events.clone(),
         };
         let json = serde_json::to_string_pretty(&output).expect("Failed to serialize events");
-        let mut file = File::create(output_path).expect("Failed to create events file");
+        let mut file = std::fs::File::create(output_path).expect("Failed to create events file");
         use std::io::Write;
         file.write_all(json.as_bytes())
             .expect("Failed to write events");
@@ -3597,7 +3422,7 @@ fn main() {
     if let Some(ref ref_path) = config.events_reference {
         #[derive(serde::Deserialize)]
         struct EventsFile {
-            events: Vec<gem::testbench::UartEvent>,
+            events: Vec<UartEvent>,
         }
         let ref_file = std::fs::read_to_string(ref_path)
             .unwrap_or_else(|e| panic!("Failed to read events reference {}: {}", ref_path, e));
@@ -3662,7 +3487,7 @@ fn main() {
 
     // ── Optional CPU verification ────────────────────────────────────────
 
-    if args.check_with_cpu {
+    if opts.check_with_cpu {
         if cpu_check_mismatches == 0 {
             clilog::info!(
                 "CPU verification: PASSED ({} ticks checked)",
@@ -3677,9 +3502,6 @@ fn main() {
         }
     }
 
-    // Keep flash alive (for --check-with-cpu in future)
-    let _ = flash;
-
     // Clean up event buffer
     unsafe {
         drop(Box::from_raw(event_buffer_ptr));
@@ -3690,6 +3512,11 @@ fn main() {
         println!("SIMULATION: PASSED");
     } else {
         println!("SIMULATION: FAILED (event mismatch)");
-        std::process::exit(1);
+    }
+
+    CosimResult {
+        passed: events_passed,
+        uart_events,
+        ticks_simulated: max_ticks,
     }
 }
diff --git a/src/sim/mod.rs b/src/sim/mod.rs
index 3825cc8..89e2a60 100644
--- a/src/sim/mod.rs
+++ b/src/sim/mod.rs
@@ -9,6 +9,8 @@
 //! - [`vcd_io`] — VCD input parsing and output writing utilities
 //! - [`setup`] — Design loading pipeline (netlist → AIG → script)
 
+#[cfg(feature = "metal")]
+pub mod cosim_metal;
 pub mod cpu_reference;
 pub mod setup;
 pub mod vcd_io;
diff --git a/src/sim/setup.rs b/src/sim/setup.rs
index a9b97dd..daf405b 100644
--- a/src/sim/setup.rs
+++ b/src/sim/setup.rs
@@ -26,6 +26,8 @@ pub struct DesignArgs {
     pub sdf: Option<PathBuf>,
     pub sdf_corner: String,
     pub sdf_debug: bool,
+    /// Clock period in picoseconds for SDF timing. Defaults to 25000 if not set.
+    pub clock_period_ps: Option<u64>,
 }
 
 /// Result of loading a design: everything needed for simulation.
@@ -133,6 +135,7 @@ pub fn load_design(args: &DesignArgs) -> LoadedDesign {
             sdf_path,
             &args.sdf_corner,
             args.sdf_debug,
+            args.clock_period_ps,
         );
     }
 
@@ -152,6 +155,8 @@ pub fn load_design(args: &DesignArgs) -> LoadedDesign {
 }
 
 /// Load SDF timing data into a script.
+///
+/// `clock_period_ps` overrides the default 25000ps clock period used for timing.
 pub fn load_sdf(
     script: &mut FlattenedScriptV1,
     aig: &AIG,
@@ -159,17 +164,24 @@ pub fn load_sdf(
     sdf_path: &Path,
     sdf_corner: &str,
     sdf_debug: bool,
+    clock_period_ps: Option<u64>,
 ) {
+    let clock_ps = clock_period_ps.unwrap_or(25000);
     let corner = match sdf_corner {
         "min" => crate::sdf_parser::SdfCorner::Min,
         "max" => crate::sdf_parser::SdfCorner::Max,
         _ => crate::sdf_parser::SdfCorner::Typ,
     };
-    clilog::info!("Loading SDF: {:?} (corner: {})", sdf_path, sdf_corner);
+    clilog::info!(
+        "Loading SDF: {:?} (corner: {}, clock_period={}ps)",
+        sdf_path,
+        sdf_corner,
+        clock_ps
+    );
     match crate::sdf_parser::SdfFile::parse_file(sdf_path, corner) {
         Ok(sdf) => {
             clilog::info!("SDF loaded: {}", sdf.summary());
-            script.load_timing_from_sdf(aig, netlistdb, &sdf, 25000, None, sdf_debug);
+            script.load_timing_from_sdf(aig, netlistdb, &sdf, clock_ps, None, sdf_debug);
             script.inject_timing_to_script();
         }
         Err(e) => clilog::warn!("Failed to load SDF: {}", e),
diff --git a/src/sky130_pdk.rs b/src/sky130_pdk.rs
index baafd1f..f5ddfaa 100644
--- a/src/sky130_pdk.rs
+++ b/src/sky130_pdk.rs
@@ -1375,9 +1375,10 @@ mod tests {
 
     #[test]
     fn test_parse_ha() {
-        let src =
-            std::fs::read_to_string("vendor/sky130_fd_sc_hd/cells/ha/sky130_fd_sc_hd__ha.functional.v")
-                .unwrap();
+        let src = std::fs::read_to_string(
+            "vendor/sky130_fd_sc_hd/cells/ha/sky130_fd_sc_hd__ha.functional.v",
+        )
+        .unwrap();
         let model = parse_functional_model(&src).unwrap();
         assert_eq!(model.module_name, "sky130_fd_sc_hd__ha");
         assert_eq!(model.inputs, vec!["A", "B"]);

From 9192a5b60a2fb59eaba5df9d80b567c544ff5c6c Mon Sep 17 00:00:00 2001
From: Rob Taylor <rob.taylor@chipflow.io>
Date: Tue, 17 Feb 2026 16:05:27 +0000
Subject: [PATCH 2/3] Add WNS/TNS timing analysis, clock uncertainty, and event
 buffer draining

- Extend SimStats with worst/total slack tracking (WNS/TNS/WHS/THS) and
  per-endpoint HashSet for unique violation counting
- Add --clock-uncertainty-ps CLI flag to gpu_sim for modeling clock jitter;
  uncertainty is added to setup/hold constraints via build_timing_constraint_buffer()
- Drain event buffer in gpu_sim.rs GPU-only simulation loop after each batch,
  accumulating timing violation statistics across the full run
- Print timing analysis summary (WNS/TNS, unique endpoints, PASS/FAIL) at
  end of simulation when timing is enabled
- Add unit tests for WNS/TNS accumulation across single and multiple drains

Co-developed-by: Claude Code v2.1.44 (claude-opus-4-6)
---
 src/bin/loom.rs        |  10 +++-
 src/event_buffer.rs    | 103 +++++++++++++++++++++++++++++++++++++++--
 src/flatten.rs         |  31 ++++++++-----
 src/sim/cosim_metal.rs |  61 ++++++++++++++++++++++++
 src/sim/setup.rs       |   7 ++-
 5 files changed, 193 insertions(+), 19 deletions(-)

diff --git a/src/bin/loom.rs b/src/bin/loom.rs
index 60f9989..ebf2877 100644
--- a/src/bin/loom.rs
+++ b/src/bin/loom.rs
@@ -209,6 +209,10 @@ struct CosimArgs {
     /// Enable SDF debug output.
     #[clap(long)]
     sdf_debug: bool,
+
+    /// Clock uncertainty in picoseconds (added to setup/hold constraints to model jitter).
+    #[clap(long, default_value = "0")]
+    clock_uncertainty_ps: u64,
 }
 
 /// Invoke the mt-kahypar partitioner.
@@ -365,7 +369,7 @@ fn cmd_sim(args: SimArgs) {
 
     #[allow(unused_mut)]
     let mut design = setup::load_design(&design_args);
-    let timing_constraints = setup::build_timing_constraints(&design.script);
+    let timing_constraints = setup::build_timing_constraints(&design.script, 0);
 
     // Parse input VCD
     let input_vcd = std::fs::File::open(&args.input_vcd).unwrap();
@@ -1086,7 +1090,8 @@ fn cmd_cosim(args: CosimArgs) {
         };
 
         let mut design = setup::load_design(&design_args);
-        let timing_constraints = setup::build_timing_constraints(&design.script);
+        let timing_constraints =
+            setup::build_timing_constraints(&design.script, args.clock_uncertainty_ps as u16);
 
         let opts = CosimOpts {
             max_cycles: args.max_cycles,
@@ -1095,6 +1100,7 @@ fn cmd_cosim(args: CosimArgs) {
             check_with_cpu: args.check_with_cpu,
             gpu_profile: args.gpu_profile,
             clock_period: args.clock_period,
+            clock_uncertainty_ps: args.clock_uncertainty_ps,
         };
 
         let result =
diff --git a/src/event_buffer.rs b/src/event_buffer.rs
index 0a54b79..5ea4ced 100644
--- a/src/event_buffer.rs
+++ b/src/event_buffer.rs
@@ -6,6 +6,7 @@
 //! (e.g., $stop, $finish, $display, assertion failures) and the CPU processes
 //! them between simulation stages.
 
+use std::collections::HashSet;
 use std::sync::atomic::{AtomicU32, Ordering};
 
 /// Maximum number of events that can be buffered per cycle.
@@ -211,7 +212,7 @@ pub enum AssertAction {
 }
 
 /// Statistics tracked during simulation.
-#[derive(Debug, Default, Clone)]
+#[derive(Debug, Clone)]
 pub struct SimStats {
     /// Number of assertion failures encountered
     pub assertion_failures: u32,
@@ -219,10 +220,40 @@ pub struct SimStats {
     pub stop_count: u32,
     /// Number of events dropped due to overflow
     pub events_dropped: u32,
-    /// Number of setup timing violations (Experiment 4)
+    /// Number of setup timing violations
     pub setup_violations: u32,
-    /// Number of hold timing violations (Experiment 4)
+    /// Number of hold timing violations
     pub hold_violations: u32,
+    /// Worst negative slack for setup (most negative = worst), in picoseconds
+    pub worst_setup_slack_ps: i32,
+    /// Worst negative slack for hold (most negative = worst), in picoseconds
+    pub worst_hold_slack_ps: i32,
+    /// Total negative slack for setup (sum of all negative slacks), in picoseconds
+    pub total_setup_slack_ps: i64,
+    /// Total negative slack for hold (sum of all negative slacks), in picoseconds
+    pub total_hold_slack_ps: i64,
+    /// Unique state word IDs with setup violations
+    pub setup_violating_endpoints: HashSet<u32>,
+    /// Unique state word IDs with hold violations
+    pub hold_violating_endpoints: HashSet<u32>,
+}
+
+impl Default for SimStats {
+    fn default() -> Self {
+        Self {
+            assertion_failures: 0,
+            stop_count: 0,
+            events_dropped: 0,
+            setup_violations: 0,
+            hold_violations: 0,
+            worst_setup_slack_ps: 0,
+            worst_hold_slack_ps: 0,
+            total_setup_slack_ps: 0,
+            total_hold_slack_ps: 0,
+            setup_violating_endpoints: HashSet::new(),
+            hold_violating_endpoints: HashSet::new(),
+        }
+    }
 }
 
 /// Process events from the buffer and determine simulation control.
@@ -313,6 +344,11 @@ where
                     slack
                 );
                 stats.setup_violations += 1;
+                stats.worst_setup_slack_ps = stats.worst_setup_slack_ps.min(slack);
+                if slack < 0 {
+                    stats.total_setup_slack_ps += slack as i64;
+                }
+                stats.setup_violating_endpoints.insert(word_id);
             }
             EventType::HoldViolation => {
                 // data[0] = state word index, data[1] = slack (signed as u32)
@@ -330,6 +366,11 @@ where
                     slack
                 );
                 stats.hold_violations += 1;
+                stats.worst_hold_slack_ps = stats.worst_hold_slack_ps.min(slack);
+                if slack < 0 {
+                    stats.total_hold_slack_ps += slack as i64;
+                }
+                stats.hold_violating_endpoints.insert(word_id);
             }
         }
     }
@@ -658,4 +699,60 @@ mod tests {
         assert_eq!(stats.hold_violations, 1);
         assert_eq!(stats.stop_count, 1);
     }
+
+    #[test]
+    fn test_wns_tns_accumulation() {
+        let mut buf = EventBuffer::new();
+        // Three setup violations with different slacks and word IDs
+        add_full_timing_event(&mut buf, EventType::SetupViolation, 10, 5, -100, 900, 200);
+        add_full_timing_event(&mut buf, EventType::SetupViolation, 20, 8, -450, 1200, 300);
+        add_full_timing_event(&mut buf, EventType::SetupViolation, 30, 5, -200, 1000, 250);
+        // Two hold violations
+        add_full_timing_event(&mut buf, EventType::HoldViolation, 15, 3, -120, 10, 50);
+        add_full_timing_event(&mut buf, EventType::HoldViolation, 25, 7, -280, 5, 80);
+
+        let config = AssertConfig::default();
+        let mut stats = SimStats::default();
+        process_events(&buf, &config, &mut stats, |_, _, _| {});
+
+        // Setup: WNS = -450 (worst), TNS = -100 + -450 + -200 = -750
+        assert_eq!(stats.setup_violations, 3);
+        assert_eq!(stats.worst_setup_slack_ps, -450);
+        assert_eq!(stats.total_setup_slack_ps, -750);
+        // 2 unique endpoints (word IDs 5 and 8)
+        assert_eq!(stats.setup_violating_endpoints.len(), 2);
+        assert!(stats.setup_violating_endpoints.contains(&5));
+        assert!(stats.setup_violating_endpoints.contains(&8));
+
+        // Hold: WHS = -280 (worst), THS = -120 + -280 = -400
+        assert_eq!(stats.hold_violations, 2);
+        assert_eq!(stats.worst_hold_slack_ps, -280);
+        assert_eq!(stats.total_hold_slack_ps, -400);
+        assert_eq!(stats.hold_violating_endpoints.len(), 2);
+        assert!(stats.hold_violating_endpoints.contains(&3));
+        assert!(stats.hold_violating_endpoints.contains(&7));
+    }
+
+    #[test]
+    fn test_wns_tns_across_multiple_drains() {
+        // Simulate two separate event buffer drains (as happens across GPU batches)
+        let config = AssertConfig::default();
+        let mut stats = SimStats::default();
+
+        // First batch
+        let mut buf1 = EventBuffer::new();
+        add_full_timing_event(&mut buf1, EventType::SetupViolation, 10, 1, -100, 900, 200);
+        process_events(&buf1, &config, &mut stats, |_, _, _| {});
+
+        // Second batch (worse slack on same endpoint)
+        let mut buf2 = EventBuffer::new();
+        add_full_timing_event(&mut buf2, EventType::SetupViolation, 20, 1, -300, 1100, 200);
+        process_events(&buf2, &config, &mut stats, |_, _, _| {});
+
+        assert_eq!(stats.setup_violations, 2);
+        assert_eq!(stats.worst_setup_slack_ps, -300);
+        assert_eq!(stats.total_setup_slack_ps, -400);
+        // Same endpoint (word_id=1) across both drains
+        assert_eq!(stats.setup_violating_endpoints.len(), 1);
+    }
 }
diff --git a/src/flatten.rs b/src/flatten.rs
index 2618093..ae675e2 100644
--- a/src/flatten.rs
+++ b/src/flatten.rs
@@ -1407,7 +1407,11 @@ impl FlattenedScriptV1 {
     /// Words with no DFF constraints have value 0 (skipped by the kernel).
     /// This is conservative: the max arrival across the word is compared against the
     /// min constraint, which may over-report violations but never misses real ones.
-    pub fn build_timing_constraint_buffer(&self) -> (u32, Vec<u32>) {
+    /// Build a per-word timing constraint buffer for GPU-side setup/hold checking.
+    ///
+    /// `clock_uncertainty_ps` adds margin to both setup and hold constraints,
+    /// modelling clock jitter. Effectively tightens the timing window by this amount.
+    pub fn build_timing_constraint_buffer(&self, clock_uncertainty_ps: u16) -> (u32, Vec<u32>) {
         let num_words = self.reg_io_state_size as usize;
         let mut constraints = vec![0u32; num_words];
         for c in &self.dff_constraints {
@@ -1421,16 +1425,19 @@ impl FlattenedScriptV1 {
             let existing = constraints[word_idx];
             let old_setup = (existing >> 16) as u16;
             let old_hold = (existing & 0xFFFF) as u16;
+            // Add clock uncertainty to both setup and hold constraints
+            let effective_setup = c.setup_ps.saturating_add(clock_uncertainty_ps);
+            let effective_hold = c.hold_ps.saturating_add(clock_uncertainty_ps);
             // Most restrictive (min) constraint per word, treating 0 as "not yet set"
             let new_setup = if old_setup == 0 {
-                c.setup_ps
+                effective_setup
             } else {
-                old_setup.min(c.setup_ps)
+                old_setup.min(effective_setup)
             };
             let new_hold = if old_hold == 0 {
-                c.hold_ps
+                effective_hold
             } else {
-                old_hold.min(c.hold_ps)
+                old_hold.min(effective_hold)
             };
             constraints[word_idx] = ((new_setup as u32) << 16) | (new_hold as u32);
         }
@@ -2223,7 +2230,7 @@ mod constraint_buffer_tests {
     #[test]
     fn test_empty_constraints() {
         let script = make_script_with_constraints(4, 10000, Vec::new());
-        let (clock_ps, buf) = script.build_timing_constraint_buffer();
+        let (clock_ps, buf) = script.build_timing_constraint_buffer(0);
         assert_eq!(clock_ps, 10000);
         assert_eq!(buf.len(), 4);
         assert!(buf.iter().all(|&v| v == 0));
@@ -2241,7 +2248,7 @@ mod constraint_buffer_tests {
                 cell_id: 1,
             }],
         );
-        let (clock_ps, buf) = script.build_timing_constraint_buffer();
+        let (clock_ps, buf) = script.build_timing_constraint_buffer(0);
         assert_eq!(clock_ps, 25000);
         // data_state_pos 35 → word_idx 1
         assert_eq!(buf[0], 0);
@@ -2270,7 +2277,7 @@ mod constraint_buffer_tests {
                 },
             ],
         );
-        let (_clock_ps, buf) = script.build_timing_constraint_buffer();
+        let (_clock_ps, buf) = script.build_timing_constraint_buffer(0);
         // Both in word 0 → min(300,150)=150 setup, min(100,200)=100 hold
         assert_eq!(buf[0], (150u32 << 16) | 100);
     }
@@ -2287,7 +2294,7 @@ mod constraint_buffer_tests {
                 cell_id: 1,
             }],
         );
-        let (_clock_ps, buf) = script.build_timing_constraint_buffer();
+        let (_clock_ps, buf) = script.build_timing_constraint_buffer(0);
         assert!(buf.iter().all(|&v| v == 0));
     }
 
@@ -2303,7 +2310,7 @@ mod constraint_buffer_tests {
                 cell_id: 1,
             }],
         );
-        let (_clock_ps, buf) = script.build_timing_constraint_buffer();
+        let (_clock_ps, buf) = script.build_timing_constraint_buffer(0);
         // word_idx = 100/32 = 3, but num_words = 2 → skipped
         assert!(buf.iter().all(|&v| v == 0));
     }
@@ -2311,7 +2318,7 @@ mod constraint_buffer_tests {
     #[test]
     fn test_clock_period_saturation() {
         let script = make_script_with_constraints(1, u64::MAX, Vec::new());
-        let (clock_ps, _buf) = script.build_timing_constraint_buffer();
+        let (clock_ps, _buf) = script.build_timing_constraint_buffer(0);
         assert_eq!(clock_ps, u32::MAX);
     }
 
@@ -2440,7 +2447,7 @@ mod constraint_buffer_tests {
                 cell_id: 1,
             }],
         );
-        let (clock_ps, buf) = script.build_timing_constraint_buffer();
+        let (clock_ps, buf) = script.build_timing_constraint_buffer(0);
         assert_eq!(clock_ps, 1000);
 
         // Verify constraint is at word 5 (160/32 = 5)
diff --git a/src/sim/cosim_metal.rs b/src/sim/cosim_metal.rs
index 68a6b90..cd495e8 100644
--- a/src/sim/cosim_metal.rs
+++ b/src/sim/cosim_metal.rs
@@ -26,6 +26,8 @@ pub struct CosimOpts {
     pub check_with_cpu: bool,
     pub gpu_profile: bool,
     pub clock_period: Option<u64>,
+    /// Clock uncertainty in picoseconds (added to setup/hold constraints to model jitter).
+    pub clock_uncertainty_ps: u64,
 }
 
 /// Result of a co-simulation run.
@@ -2614,6 +2616,10 @@ pub fn run_cosim(
     let mut uart_read_head: u32 = 0;
     let mut wb_trace_read_head: u32 = 0;
 
+    // Timing violation statistics (accumulated from event buffer)
+    let assert_config = crate::event_buffer::AssertConfig::default();
+    let mut sim_stats = crate::event_buffer::SimStats::default();
+
     // Profiling accumulators
     let mut prof_batch_encode: u64 = 0;
     let mut prof_gpu_wait: u64 = 0;
@@ -3121,6 +3127,21 @@ pub fn run_cosim(
                 wb_trace_read_head += 1;
             }
         }
+        // Drain event buffer for timing violations + sim control events
+        {
+            let eb = unsafe { &*(event_buffer_ptr as *const crate::event_buffer::EventBuffer) };
+            let control = crate::event_buffer::process_events(
+                eb,
+                &assert_config,
+                &mut sim_stats,
+                |_, _, _| {},
+            );
+            eb.reset();
+            if control == crate::event_buffer::SimControl::Terminate {
+                clilog::warn!("Simulation terminated by event at tick {}", tick);
+                break;
+            }
+        }
         prof_drain += t_drain.elapsed().as_nanos() as u64;
 
         total_batches += 1;
@@ -3355,6 +3376,46 @@ pub fn run_cosim(
         );
     }
 
+    // ── Timing Analysis Summary ──────────────────────────────────────────
+
+    if script.timing_enabled {
+        println!();
+        println!("=== Timing Analysis ===");
+        println!("Clock period: {}ps", script.clock_period_ps);
+        if opts.clock_uncertainty_ps > 0 {
+            println!("Clock uncertainty: {}ps", opts.clock_uncertainty_ps);
+        }
+        println!(
+            "Setup violations: {} ({} unique endpoints)",
+            sim_stats.setup_violations,
+            sim_stats.setup_violating_endpoints.len()
+        );
+        println!(
+            "Hold violations:  {} ({} unique endpoints)",
+            sim_stats.hold_violations,
+            sim_stats.hold_violating_endpoints.len()
+        );
+        if sim_stats.setup_violations > 0 {
+            println!("WNS (setup): {}ps", sim_stats.worst_setup_slack_ps);
+            println!("TNS (setup): {}ps", sim_stats.total_setup_slack_ps);
+        }
+        if sim_stats.hold_violations > 0 {
+            println!("WHS (hold):  {}ps", sim_stats.worst_hold_slack_ps);
+            println!("THS (hold):  {}ps", sim_stats.total_hold_slack_ps);
+        }
+        if sim_stats.setup_violations == 0 && sim_stats.hold_violations == 0 {
+            println!("TIMING: PASSED");
+        } else {
+            println!("TIMING: VIOLATIONS DETECTED");
+        }
+        if sim_stats.events_dropped > 0 {
+            println!(
+                "WARNING: {} events were dropped (buffer overflow), slack metrics may be approximate",
+                sim_stats.events_dropped
+            );
+        }
+    }
+
     // ── Results ──────────────────────────────────────────────────────────
 
     println!();
diff --git a/src/sim/setup.rs b/src/sim/setup.rs
index daf405b..7718c9d 100644
--- a/src/sim/setup.rs
+++ b/src/sim/setup.rs
@@ -192,9 +192,12 @@ pub fn load_sdf(
 ///
 /// Returns `Some((clock_ps, constraint_buffer))` if timing is enabled,
 /// where `constraint_buffer` = `[clock_ps, constraints[0], constraints[1], ...]`.
-pub fn build_timing_constraints(script: &FlattenedScriptV1) -> Option<Vec<u32>> {
+pub fn build_timing_constraints(
+    script: &FlattenedScriptV1,
+    clock_uncertainty_ps: u16,
+) -> Option<Vec<u32>> {
     if script.timing_enabled && !script.dff_constraints.is_empty() {
-        let (clock_ps, constraints) = script.build_timing_constraint_buffer();
+        let (clock_ps, constraints) = script.build_timing_constraint_buffer(clock_uncertainty_ps);
         let non_zero = constraints.iter().filter(|&&v| v != 0).count();
         clilog::info!(
             "Timing constraints: {} words, {} with DFF constraints, clock_period={}ps",

From c1ce430e899e5f025641f8163e7439ec209b6ba9 Mon Sep 17 00:00:00 2001
From: Rob Taylor <rob.taylor@chipflow.io>
Date: Tue, 17 Feb 2026 16:12:40 +0000
Subject: [PATCH 3/3] Update timing violations docs: WNS/TNS, clock
 uncertainty, multi-corner

- Replace old summary statistics section with WNS/TNS/WHS/THS metric
  definitions and example output
- Add --clock-uncertainty-ps to CLI flags table with explanation
- Add multi-corner analysis section with max/min corner workflow
- Add clock uncertainty section explaining the jitter model
- Update gpu_sim example to show --clock-uncertainty-ps flag

Co-developed-by: Claude Code v2.1.44 (claude-opus-4-6)
---
 docs/timing-violations.md | 77 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 4 deletions(-)

diff --git a/docs/timing-violations.md b/docs/timing-violations.md
index e9ea33e..7d2ad12 100644
--- a/docs/timing-violations.md
+++ b/docs/timing-violations.md
@@ -51,7 +51,8 @@ Setup and hold violations occur when data arrives too late (setup) or too early
        design.gv design.gemparts \
        --config testbench.json \
        --sdf design.sdf \
-       --sdf-corner typ
+       --sdf-corner typ \
+       --clock-uncertainty-ps 500
    ```
 
 ### CLI Flags Reference
@@ -61,6 +62,7 @@ Setup and hold violations occur when data arrives too late (setup) or too early
 | `--sdf <path>` | all | Path to SDF file with back-annotated delays |
 | `--sdf-corner <min\|typ\|max>` | all | Which SDF corner to use (default: `typ`) |
 | `--sdf-debug` | all | Print unmatched SDF instances for debugging |
+| `--clock-uncertainty-ps <ps>` | `gpu_sim` | Clock uncertainty/jitter margin in picoseconds (default: 0). Added to both setup and hold constraints, tightening the timing window. |
 | `--enable-timing` | `cuda_test` | Enable timing analysis (arrival + violation checks) |
 | `--timing-clock-period <ps>` | `cuda_test` | Clock period in picoseconds (default: 1000) |
 | `--timing-report-violations` | `cuda_test` | Report all violations, not just summary |
@@ -113,14 +115,52 @@ cargo run -r --features metal --bin metal_test -- \
 | **hold** | DFF hold time constraint from SDF/Liberty (picoseconds) |
 | **slack** | `arrival - hold`. Negative = violation amount |
 
-### Summary Statistics
+### Timing Analysis Summary
+
+At the end of simulation, `gpu_sim` prints a timing analysis summary with standard signoff metrics:
+
+```
+=== Timing Analysis ===
+Clock period: 1200ps
+Clock uncertainty: 500ps
+Setup violations: 12 (8 unique endpoints)
+Hold violations:  3 (2 unique endpoints)
+WNS (setup): -450ps
+TNS (setup): -2340ps
+WHS (hold):  -120ps
+THS (hold):  -280ps
+TIMING: VIOLATIONS DETECTED
+```
+
+| Metric | Meaning |
+|--------|---------|
+| **WNS** (Worst Negative Slack) | The most negative setup slack across the entire simulation. Indicates the single worst timing path. |
+| **TNS** (Total Negative Slack) | Sum of all negative setup slacks. Indicates overall timing health — a large TNS means many paths are failing. |
+| **WHS** (Worst Hold Slack) | The most negative hold slack. |
+| **THS** (Total Hold Slack) | Sum of all negative hold slacks. |
+| **Unique endpoints** | Number of distinct state words (each covering 32 DFF data inputs) that had at least one violation. |
 
-At the end of simulation, GEM prints totals:
+If no violations are detected, the summary shows:
 
 ```
-Simulation complete: 1000 cycles, 5 setup violations, 0 hold violations
+=== Timing Analysis ===
+Clock period: 1200ps
+Setup violations: 0 (0 unique endpoints)
+Hold violations:  0 (0 unique endpoints)
+TIMING: PASSED
 ```
 
+**Event buffer overflow**: The GPU-side event buffer holds up to 1024 events per batch. If a design has very many violations per batch, some events may be dropped. When this happens, a warning is printed and WNS/TNS metrics may be approximate (but never optimistic — missed events only mean the true TNS is worse).
+
+### Clock Uncertainty
+
+The `--clock-uncertainty-ps` flag models clock jitter and skew by adding a margin to both setup and hold constraints. This effectively tightens the timing window:
+
+- **Setup check**: `arrival + (setup + uncertainty) > clock_period` triggers a violation
+- **Hold check**: `arrival < (hold + uncertainty)` triggers a violation
+
+Use this when your design has known clock tree uncertainty from P&R reports. A typical value for SKY130 is 100-500ps depending on clock tree quality.
+
 ## Tracing Violations to Source Signals
 
 When you see a violation on a specific word, follow this workflow to identify the offending signals and their logic cone.
@@ -186,6 +226,35 @@ If a violation is reported but you suspect it's a false positive from the approx
 1. **Use `timing_sim_cpu`** for per-signal accuracy (see [Detailed CPU Timing Analysis](#4-detailed-cpu-timing-analysis) above).
 2. **Timing-aware bit packing** groups signals with similar arrival times into the same thread, reducing the approximation error. See `docs/timing-simulation.md` § "Timing-Aware Bit Packing" for details.
 
+## Multi-Corner Analysis
+
+A single SDF corner only catches one class of violations: the **max** (slow) corner reveals setup violations, while the **min** (fast) corner reveals hold violations. For complete timing signoff, run both corners:
+
+```bash
+# Max corner: catches setup violations (slow paths)
+cargo run -r --features metal --bin gpu_sim -- \
+    design.gv design.gemparts \
+    --config testbench.json \
+    --sdf design.sdf --sdf-corner max \
+    --clock-uncertainty-ps 500
+
+# Min corner: catches hold violations (fast paths)
+cargo run -r --features metal --bin gpu_sim -- \
+    design.gv design.gemparts \
+    --config testbench.json \
+    --sdf design.sdf --sdf-corner min \
+    --clock-uncertainty-ps 500
+```
+
+Both runs use the same `.gemparts` file (compilation is cached), so the overhead is just the simulation time. In CI, add both as separate steps:
+
+```yaml
+- name: Timing check (max corner - setup)
+  run: cargo run -r --features metal --bin gpu_sim -- ... --sdf-corner max
+- name: Timing check (min corner - hold)
+  run: cargo run -r --features metal --bin gpu_sim -- ... --sdf-corner min
+```
+
 ## Common Scenarios
 
 **Setup violations on many words, same cycle**: The clock period is likely too tight for the design. The combinational logic depth exceeds what can settle in one clock period. Try increasing the clock period.