From 7652d13bc9830ce0668e919472e173139871bc63 Mon Sep 17 00:00:00 2001 From: Archith Date: Sun, 3 May 2026 11:48:44 -0700 Subject: [PATCH 1/3] plan: define startup time measurement pass Co-authored-by: Cursor --- .planning/startup-time-under-10ms.plan.md | 131 ++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 .planning/startup-time-under-10ms.plan.md diff --git a/.planning/startup-time-under-10ms.plan.md b/.planning/startup-time-under-10ms.plan.md new file mode 100644 index 0000000..71a7ada --- /dev/null +++ b/.planning/startup-time-under-10ms.plan.md @@ -0,0 +1,131 @@ +# Startup Time Under 10ms Measurement Plan + +## Roadmap Item + +- `ROADMAP.md`: `Startup time: < 10ms (vs ~100ms for interpreter)` + +## Scope Decision + +This PR does **not** claim the `<10ms` target is achieved. It establishes repeatable startup measurement and report-only CI visibility so the next optimization PR can be judged against real data. The roadmap checkbox must stay unchecked until the measured target is met. + +The target should apply to the standalone/native execution path, not ordinary `forge run app.fg`. A source file run still has to start the Rust CLI, parse CLI args, read source, lex, parse, typecheck, initialize runtime state, and execute. The native/standalone path is the only realistic place for `<10ms`. + +## Current State + +- `forge run app.fg` goes through the full CLI/frontend/interpreter path. +- `forge run app.fgc` skips lex/parse but still starts the CLI and VM. +- `forge build --native` can now produce a standalone source-runtime binary when `libforge_lang.a` is present. +- Existing `benches/fork_for_serving.rs` measures per-request fork cost, not process startup. +- There is no repeatable startup benchmark, no CI trend signal, and no agreed measurement definition. + +## Measurement Definition + +Measure cold-ish process startup wall time from parent process spawn to child process exit for short-lived programs. + +Initial modes: + +1. 
`source-run`: `forge run hello.fg` +2. `bytecode-run`: `forge run hello.fgc` +3. `native-source-runtime`: generated `forge build --native hello.fg` binary when `libforge_lang.a` is available +4. `aot-bytecode`: generated `forge build --aot hello.fg` binary when `libforge_lang.a` is available + +Short-lived fixture: + +```forge +42 +``` + +The harness must assert correctness on every run. A child process that exits nonzero, segfaults, times out, or prints unexpected output must fail the measurement instead of looking like a fast startup. + +Use a small `println("ok")` fixture for every mode so the harness can assert stdout-based correctness. Avoid server startup, networking, shell builtins, or filesystem writes in the measured child program. + +## Implementation Units + +### U1. Startup Measurement Harness + +Files: +- Create: `tools/startup_time.rs` or `tests/startup_time.rs` as a small Rust harness binary/test helper +- Modify: `Cargo.toml` only if using a cargo bench/bin target is necessary + +Do **not** use Criterion for process startup measurement. Criterion is optimized for in-process function benchmarking and its warmup/statistical model is a poor fit for fork/exec wall time. + +Add a custom wall-time harness (or a thin wrapper around `hyperfine` only if introducing that dependency/tool is cleaner) that: +- Locates the `forge` binary under test. +- Creates an isolated temp fixture directory. +- Writes `hello.fg`. +- Builds `hello.fgc`. +- Requires the caller/CI job to provide `FORGE_LIB_DIR` pointing at an existing `libforge_lang.a`. +- Builds native artifacts with `FORGE_LIB_DIR` set so standalone modes are actually measured. +- Measures process spawn-to-exit wall time for each mode using `std::process::Command` and `Instant`. +- Runs enough repetitions to report min/median/p95 or min/mean/p95. +- Asserts every child exits successfully and emits expected output where applicable. +- Times out child processes so hangs fail fast. 
+ +Harness output should be simple, line-oriented, and easy to paste into PRs, for example: + +```text +startup.source_run median=... +startup.bytecode_run median=... +startup.native_source_runtime median=... +startup.aot_bytecode median=... +``` + +### U2. Report-Only CI Job + +Files: +- Modify: `.github/workflows/ci.yml` + +Add a startup benchmark job that: +- Builds the Forge binary in release mode. +- Builds `libforge_lang.a` explicitly. +- Sets `FORGE_LIB_DIR` to the directory containing `libforge_lang.a`. +- Runs the startup measurement harness. + +Keep this report-only for now: +- The job should fail if the harness does not compile/run or any measured child fails/times out. +- It should not fail because the measured value is above 10ms yet. + +Rationale: shared CI runners are noisy; the first step is a trend signal. + +### U3. Budget Documentation + +Files: +- Create: `docs/performance/startup.md` or update an existing performance doc if one exists +- Modify: `CHANGELOG.md` + +Document: +- Measurement modes and what each means. +- Why `<10ms` applies to standalone/native startup, not `forge run`. +- Current status: report-only startup harness exists; hard gate follows after optimization. +- Future hard-gate proposal: native startup p50/p95 budget once stable baseline is known. +- CI explicitly builds and measures the standalone native path; native modes must not be silently skipped. + +### U4. Local Developer Command + +Files: +- Optional create: `scripts/measure_startup.sh` + +Add a script only if it materially improves developer ergonomics by wrapping the Rust harness with the right release-build and `FORGE_LIB_DIR` setup. Avoid duplicating measurement logic between shell and Rust. + +## Risks + +- Process startup benchmarks are noisy on GitHub-hosted runners. +- Harness setup must not accidentally measure build time. 
+- Native source-runtime binaries embed the interpreter and may not get close to `<10ms`; if so, the next item may require a bytecode/native runner fast path rather than optimizing the source-runtime path. +- Launcher-mode native binaries must be labeled separately from standalone source-runtime binaries; the roadmap target cares about standalone. +- Without storing historical baselines, CI output is visibility-only; this PR should not pretend to provide trend analysis yet. +- The native measurements require a working C compiler (`cc`) and static library; CI must install/use the available platform toolchain explicitly. + +## Verification + +- `cargo fmt -- --check` +- `cargo test` +- `cargo clippy --all-targets -- -A clippy::approx_constant -A clippy::result_large_err -A clippy::only_used_in_recursion -A clippy::len_zero` +- The new startup measurement command/harness +- Existing Forge integration tests remain green. + +## Success Criteria + +- Developers can run one command to see startup timings for source, bytecode, and available native modes. +- CI exposes startup timing regressions as benchmark output. +- The roadmap item remains unchecked, with a clear next optimization target based on measured data. 
From d2d5070860ab8ca4f1a867b00faea629d2ef9434 Mon Sep 17 00:00:00 2001 From: Archith Date: Sun, 3 May 2026 11:53:28 -0700 Subject: [PATCH 2/3] perf(startup): add report-only startup measurement Co-authored-by: Cursor --- .github/workflows/ci.yml | 16 ++ CHANGELOG.md | 1 + docs/performance/startup.md | 32 ++++ tools/startup_time.rs | 282 ++++++++++++++++++++++++++++++++++++ 4 files changed, 331 insertions(+) create mode 100644 docs/performance/startup.md create mode 100644 tools/startup_time.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2b13cb1..c32375f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,6 +45,22 @@ jobs: - uses: Swatinem/rust-cache@v2 - run: cargo bench --bench fork_for_serving -- --warm-up-time 3 --measurement-time 5 --sample-size 50 + startup-benchmark: + name: Startup benchmark + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - name: Build release forge and static runtime + run: cargo build --release --lib --bin forge + - name: Build startup measurement harness + run: rustc tools/startup_time.rs -O -o target/startup_time + - name: Measure startup modes + env: + FORGE_LIB_DIR: target/release + run: ./target/startup_time --forge ./target/release/forge --warmups 2 --reps 20 + fmt: name: Format runs-on: ubuntu-latest diff --git a/CHANGELOG.md b/CHANGELOG.md index e4ee7d7..ef74539 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - **Standalone source-runtime native binaries for Forge servers** — `forge build --native` now links against `libforge_lang.a` when available and emits a single executable that embeds Forge source and starts interpreter-only runtime features like `@server` without shelling out to the `forge` CLI. 
`--aot` remains bytecode/VM-only and continues to reject decorator-driven servers with guidance to use `--native`. +- **Startup time measurement harness** — `tools/startup_time.rs` measures source, bytecode, native source-runtime, and bytecode AOT process startup with correctness checks. CI runs it as a report-only signal before the `<10ms` native startup target becomes a hard gate. - **Structured concurrency with `squad` blocks** — `squad { spawn { } spawn { } }` runs tasks concurrently with automatic join, cooperative cancellation on failure, and error propagation. Returns an array of results in spawn order. Works in both interpreter and VM engines. - **First-class `Set` type** — `set([1, 2, 3])` or `set((1, 2, 3))` builds a deduplicated set. Methods: `.has(x)`, `.add(x)`, `.remove(x)`, `.union(other)`, `.intersect(other)`, `.diff(other)`, `.to_array()`. Supports `len()`, `contains()`, iteration, order-independent equality, and is truthy when non-empty. Works across interpreter, VM, bytecode round-trip, and JIT. - **First-class `Map` type** — `map([("a", 1), ("b", 2)])` or `map()` builds an ordered key/value map with any-type keys. Methods: `.get(k)`, `.set(k, v)`, `.has(k)`, `.remove(k)`, `.keys()`, `.values()`, `.len()`, `.to_array()`. Insertion order is preserved on overwrite. Key equality uses container semantics (int/float collision, NaN self-match). Supports `for k, v in m` iteration (which also unlocks `for k, v in obj` parity for plain objects under the VM), `len()`, `contains()`, order-independent equality, and is truthy when non-empty. `json.stringify` emits JSON objects for maps with string keys and errors on non-string keys. Works across interpreter, VM, bytecode round-trip, and JIT. 
diff --git a/docs/performance/startup.md b/docs/performance/startup.md new file mode 100644 index 0000000..bf74a5b --- /dev/null +++ b/docs/performance/startup.md @@ -0,0 +1,32 @@ +# Startup Time Measurement + +Forge's roadmap target of `<10ms` startup applies to standalone/native execution paths, not to `forge run app.fg`. + +`forge run app.fg` intentionally does more work: starts the CLI, reads source, lexes, parses, typechecks, initializes the runtime, and executes. Native and bytecode paths can skip parts of that work and are the realistic target for sub-10ms startup. + +## Harness + +Startup timing is measured by `tools/startup_time.rs`, a small Rust process-level harness. It measures wall time from parent process spawn to child process exit and verifies each child prints `ok`. + +The harness measures: + +- `startup.source_run`: `forge run hello.fg` +- `startup.bytecode_run`: `forge run hello.fgc` +- `startup.native_source_runtime`: standalone source-runtime binary from `forge build --native` +- `startup.aot_bytecode`: standalone bytecode binary from `forge build --aot` + +The native modes require `FORGE_LIB_DIR` to point at a directory containing `libforge_lang.a`. + +## Local Run + +```bash +cargo build --release --lib --bin forge +rustc tools/startup_time.rs -O -o target/startup_time +FORGE_LIB_DIR=target/release ./target/startup_time --forge ./target/release/forge --warmups 2 --reps 20 +``` + +## CI Status + +CI runs this harness as report-only. The job fails if the harness fails to compile, if fixture builds fail, if any child process exits unsuccessfully, or if output is wrong. It does not yet fail because startup is above 10ms. + +The hard `<10ms` gate should be added after we have stable baseline data and an optimization PR that actually reaches the native startup target. 
diff --git a/tools/startup_time.rs b/tools/startup_time.rs new file mode 100644 index 0000000..97ff929 --- /dev/null +++ b/tools/startup_time.rs @@ -0,0 +1,282 @@ +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; +use std::process::{Command, Output}; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +const FIXTURE_SOURCE: &str = r#"println("ok")"#; + +#[derive(Debug)] +struct Config { + forge: PathBuf, + reps: usize, + warmups: usize, +} + +#[derive(Debug)] +struct Mode { + name: &'static str, + command: PathBuf, + args: Vec<String>, + envs: Vec<(String, String)>, +} + +#[derive(Debug)] +struct Stats { + min: Duration, + median: Duration, + p95: Duration, + max: Duration, +} + +fn main() { + if let Err(err) = run() { + eprintln!("startup_time: {err}"); + std::process::exit(1); + } +} + +fn run() -> Result<(), String> { + let config = parse_args()?; + let lib_dir = env::var("FORGE_LIB_DIR").map(PathBuf::from).map_err(|_| { + "FORGE_LIB_DIR must point at a directory containing libforge_lang.a".to_string() + })?; + let lib_dir = fs::canonicalize(&lib_dir).map_err(|err| { + format!( + "failed to canonicalize FORGE_LIB_DIR {}: {err}", + lib_dir.display() + ) + })?; + let lib_path = lib_dir.join("libforge_lang.a"); + if !lib_path.exists() { + return Err(format!("{} does not exist", lib_path.display())); + } + + let workdir = unique_workdir()?; + let result = run_in_workdir(&config, &lib_dir, &workdir); + let _ = fs::remove_dir_all(&workdir); + result +} + +fn parse_args() -> Result<Config, String> { + let mut forge = None; + let mut reps = 20usize; + let mut warmups = 3usize; + let args = env::args().skip(1).collect::<Vec<String>>(); + let mut i = 0; + while i < args.len() { + match args[i].as_str() { + "--forge" => { + i += 1; + forge = args.get(i).map(PathBuf::from); + } + "--reps" => { + i += 1; + reps = args + .get(i) + .ok_or("--reps requires a value")?
+ .parse() + .map_err(|_| "--reps must be a positive integer".to_string())?; + } + "--warmups" => { + i += 1; + warmups = args + .get(i) + .ok_or("--warmups requires a value")? + .parse() + .map_err(|_| "--warmups must be a non-negative integer".to_string())?; + } + "--help" | "-h" => { + println!( + "Usage: startup_time --forge [--reps N] [--warmups N]\n\ + Requires FORGE_LIB_DIR to contain libforge_lang.a." + ); + std::process::exit(0); + } + other => return Err(format!("unknown argument: {other}")), + } + i += 1; + } + + let forge = forge.ok_or("--forge is required")?; + let forge = fs::canonicalize(&forge).map_err(|err| { + format!( + "failed to canonicalize forge binary {}: {err}", + forge.display() + ) + })?; + if reps == 0 { + return Err("--reps must be greater than zero".to_string()); + } + + Ok(Config { + forge, + reps, + warmups, + }) +} + +fn unique_workdir() -> Result<PathBuf, String> { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|err| format!("system clock before unix epoch: {err}"))?
+ .as_nanos(); + let dir = env::temp_dir().join(format!("forge-startup-{}-{nanos}", std::process::id())); + fs::create_dir_all(&dir).map_err(|err| format!("failed to create {}: {err}", dir.display()))?; + Ok(dir) +} + +fn run_in_workdir(config: &Config, lib_dir: &Path, workdir: &Path) -> Result<(), String> { + let source_run = write_fixture(workdir, "source_run.fg")?; + let bytecode_run = write_fixture(workdir, "bytecode_run.fg")?; + let native_source = write_fixture(workdir, "native_source.fg")?; + let aot_bytecode = write_fixture(workdir, "aot_bytecode.fg")?; + + checked_command( + Command::new(&config.forge) + .arg("build") + .arg(&bytecode_run) + .current_dir(workdir), + "build bytecode fixture", + )?; + checked_command( + Command::new(&config.forge) + .arg("build") + .arg("--native") + .arg(&native_source) + .env("FORGE_LIB_DIR", lib_dir) + .current_dir(workdir), + "build native source-runtime fixture", + )?; + checked_command( + Command::new(&config.forge) + .arg("build") + .arg("--aot") + .arg(&aot_bytecode) + .env("FORGE_LIB_DIR", lib_dir) + .current_dir(workdir), + "build native bytecode fixture", + )?; + + let modes = vec![ + Mode { + name: "source_run", + command: config.forge.clone(), + args: vec!["run".to_string(), source_run.display().to_string()], + envs: vec![], + }, + Mode { + name: "bytecode_run", + command: config.forge.clone(), + args: vec![ + "run".to_string(), + bytecode_run.with_extension("fgc").display().to_string(), + ], + envs: vec![], + }, + Mode { + name: "native_source_runtime", + command: native_source.with_extension(""), + args: vec![], + envs: vec![], + }, + Mode { + name: "aot_bytecode", + command: aot_bytecode.with_extension(""), + args: vec![], + envs: vec![], + }, + ]; + + println!( + "startup_time reps={} warmups={} forge={}", + config.reps, + config.warmups, + config.forge.display() + ); + for mode in modes { + let stats = measure_mode(&mode, config.reps, config.warmups)?; + println!( + "startup.{name} min_ms={:.3} 
median_ms={:.3} p95_ms={:.3} max_ms={:.3}", + millis(stats.min), + millis(stats.median), + millis(stats.p95), + millis(stats.max), + name = mode.name + ); + } + + Ok(()) +} + +fn write_fixture(workdir: &Path, name: &str) -> Result<PathBuf, String> { + let path = workdir.join(name); + fs::write(&path, FIXTURE_SOURCE) + .map_err(|err| format!("failed to write {}: {err}", path.display()))?; + Ok(path) +} + +fn checked_command(command: &mut Command, label: &str) -> Result<Output, String> { + let output = command + .output() + .map_err(|err| format!("{label}: failed to spawn: {err}"))?; + if !output.status.success() { + return Err(format!( + "{label}: failed with status {}\nstdout:\n{}\nstderr:\n{}", + output.status, + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + )); + } + Ok(output) +} + +fn measure_mode(mode: &Mode, reps: usize, warmups: usize) -> Result<Stats, String> { + for _ in 0..warmups { + run_child(mode)?; + } + + let mut times = Vec::with_capacity(reps); + for _ in 0..reps { + let started = Instant::now(); + run_child(mode)?; + times.push(started.elapsed()); + } + times.sort_unstable(); + + let min = times[0]; + let median = times[times.len() / 2]; + let p95_idx = ((times.len() * 95).div_ceil(100)).saturating_sub(1); + let p95 = times[p95_idx.min(times.len() - 1)]; + let max = times[times.len() - 1]; + + Ok(Stats { + min, + median, + p95, + max, + }) +} + +fn run_child(mode: &Mode) -> Result<(), String> { + let mut command = Command::new(&mode.command); + command.args(&mode.args); + for (key, value) in &mode.envs { + command.env(key, value); + } + let output = checked_command(&mut command, mode.name)?; + let stdout = String::from_utf8_lossy(&output.stdout); + if stdout.trim() != "ok" { + return Err(format!( + "{}: expected stdout 'ok', got {:?}\nstderr:\n{}", + mode.name, + stdout, + String::from_utf8_lossy(&output.stderr) + )); + } + Ok(()) +} + +fn millis(duration: Duration) -> f64 { + duration.as_secs_f64() * 1000.0 +} From e6bff65877b1c37a649128eeb1a0cf9903d2cbb0
Mon Sep 17 00:00:00 2001 From: Archith Date: Sun, 3 May 2026 11:55:58 -0700 Subject: [PATCH 3/3] fix(startup): add child timeout to measurement harness Co-authored-by: Cursor --- .planning/startup-time-under-10ms.plan.md | 2 +- tools/startup_time.rs | 41 +++++++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/.planning/startup-time-under-10ms.plan.md b/.planning/startup-time-under-10ms.plan.md index 71a7ada..35dae9f 100644 --- a/.planning/startup-time-under-10ms.plan.md +++ b/.planning/startup-time-under-10ms.plan.md @@ -32,7 +32,7 @@ Initial modes: Short-lived fixture: ```forge -42 +println("ok") ``` The harness must assert correctness on every run. A child process that exits nonzero, segfaults, times out, or prints unexpected output must fail the measurement instead of looking like a fast startup. diff --git a/tools/startup_time.rs b/tools/startup_time.rs index 97ff929..10571bf 100644 --- a/tools/startup_time.rs +++ b/tools/startup_time.rs @@ -1,10 +1,11 @@ use std::env; use std::fs; use std::path::{Path, PathBuf}; -use std::process::{Command, Output}; +use std::process::{Command, Output, Stdio}; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; const FIXTURE_SOURCE: &str = r#"println("ok")"#; +const CHILD_TIMEOUT: Duration = Duration::from_secs(5); #[derive(Debug)] struct Config { @@ -261,10 +262,46 @@ fn measure_mode(mode: &Mode, reps: usize, warmups: usize) -> Result<Stats, Strin fn run_child(mode: &Mode) -> Result<(), String> { let mut command = Command::new(&mode.command); command.args(&mode.args); + command.stdout(Stdio::piped()).stderr(Stdio::piped()); for (key, value) in &mode.envs { command.env(key, value); } - let output = checked_command(&mut command, mode.name)?; + let mut child = command + .spawn() + .map_err(|err| format!("{}: failed to spawn: {err}", mode.name))?; + let deadline = Instant::now() + CHILD_TIMEOUT; + loop { + if child + .try_wait() + .map_err(|err| format!("{}: failed to poll child: {err}", mode.name))?
+ .is_some() + { + break; + } + if Instant::now() >= deadline { + let _ = child.kill(); + let _ = child.wait(); + return Err(format!( + "{}: child timed out after {:.1}s", + mode.name, + CHILD_TIMEOUT.as_secs_f64() + )); + } + std::thread::sleep(Duration::from_millis(5)); + } + + let output = child + .wait_with_output() + .map_err(|err| format!("{}: failed to collect child output: {err}", mode.name))?; + if !output.status.success() { + return Err(format!( + "{}: failed with status {}\nstdout:\n{}\nstderr:\n{}", + mode.name, + output.status, + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + )); + } let stdout = String::from_utf8_lossy(&output.stdout); if stdout.trim() != "ok" { return Err(format!(