From 177d307ed91ddc9f7732b9e74874494ea1bc9950 Mon Sep 17 00:00:00 2001 From: Leszek Date: Mon, 15 Jun 2026 04:02:57 +0200 Subject: [PATCH] test: harden two load-sensitive tests against CPU oversubscription MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test-only; production code untouched. An investigation (1220 suite runs under CPU saturation, zero failures) showed the suite is deterministic under a normal single-cargo run — the "flake" seen during a parallel-agent wave was shared-worktree cargo contention, not test logic. Two latent fragilities are hardened defensively: - viewer: test_viewer_loader_drop_cancels_worker waited on a detached busy-wait (yield_now) worker with recv_timeout(1s).unwrap(); under heavy load the starved worker could miss the 1s deadline. Drop sets the cancel flag (Release; worker loads Relaxed, eventual visibility guaranteed), so the worker is guaranteed to send — widen to a generous 30s hang-guard that tolerates starvation yet still fails fast on a real deadlock instead of hanging the test binary. - debug_log: log_truncates_oversized_file shares process-global LOG_FILE/CHECK_COUNTER/TEST_CACHE_HOME with concurrent production log() calls from other parallel tests. Re-create the oversized file each attempt so every retry genuinely exercises truncation, name the retry budget (RETRY_BUDGET) with a rationale, and document the interference and the per-test-redirect tech-debt follow-up. --- src/app/debug_log.rs | 35 ++++++++++++++++++++++++++++------- src/ui/viewer/tests.rs | 11 ++++++++--- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/src/app/debug_log.rs b/src/app/debug_log.rs index 1314994..35fbb5b 100644 --- a/src/app/debug_log.rs +++ b/src/app/debug_log.rs @@ -231,15 +231,36 @@ mod tests { let _ = std::fs::create_dir_all(parent); } - { - let mut f = std::fs::File::create(&path).expect("create oversized log"); - std::io::Write::write_all(&mut f, &vec![b'X'; (MAX_LOG_SIZE_BYTES + 1) as usize]) - .expect("write oversized log"); - } + // The truncation guard only fires on a *freshly opened* file (or every + // CHECK_INTERVAL writes), so each attempt re-creates the oversized file + // and re-opens via `reset_for_test()` to genuinely exercise truncation. + // Creating it once before the loop would only test the first iteration: + // after any `log()` truncates, the file is small and later attempts just + // append, no longer exercising the truncation path. + // + // Cross-test interference is expected and the retry loop MUST NOT be + // collapsed into a single attempt. Production `log()` relies on + // PROCESS-GLOBAL statics: `CHECK_COUNTER` (AtomicU32), `LOG_FILE` (Mutex) + // and the `TEST_CACHE_HOME` redirect. `TEST_MUTEX` only serializes the two + // debug_log tests against each other; it does NOT exclude the hundreds of + // other tests on parallel threads that invoke the production + // `debug_log!`/`log()` macro. Once this test installs its tempdir into the + // process-global `TEST_CACHE_HOME`, those concurrent `log()` calls bump the + // shared `CHECK_COUNTER` and write to this test's tempdir, perturbing the + // size dance below. The retry loop tolerates that until one cycle wins. + // + // TODO(tech-debt): the principled fix is a per-test redirect gate that + // production `log()` consults first, isolating these globals; tracked as + // follow-up. Until then the test is empirically — not provably — robust. + // Empirically sufficient under N-way parallel `cargo test`; ~250ms worst + // case (RETRY_BUDGET iterations * the 5ms back-off below). + const RETRY_BUDGET: usize = 50; let mut truncated = false; - for attempt in 0..20 { + for attempt in 0..RETRY_BUDGET { reset_for_test(); + std::fs::write(&path, vec![b'X'; (MAX_LOG_SIZE_BYTES + 1) as usize]) + .expect("write oversized log"); log(format_args!("attempt {attempt}")); let len = std::fs::metadata(&path).map(|m| m.len()).unwrap_or(0); if len > 0 && len < MAX_LOG_SIZE_BYTES { @@ -258,7 +279,7 @@ mod tests { reset_for_test(); assert!( truncated, - "log() never acquired mutex to truncate oversized file after 20 retries" + "log() never truncated the oversized file within {RETRY_BUDGET} retries" ); } } diff --git a/src/ui/viewer/tests.rs b/src/ui/viewer/tests.rs index c931ef7..3394c8d 100644 --- a/src/ui/viewer/tests.rs +++ b/src/ui/viewer/tests.rs @@ -19,7 +19,6 @@ use std::thread; use tempfile::NamedTempFile; const DEFAULT_PAGE_HEIGHT: usize = 20; -const TEST_CHANNEL_TIMEOUT_SECS: u64 = 1; fn create_test_file(content: &str) -> NamedTempFile { let mut file = NamedTempFile::new().unwrap(); @@ -89,9 +88,15 @@ fn test_viewer_loader_drop_cancels_worker() { drop(loader); assert!(cancel.load(Ordering::Relaxed)); + // Drop stores `cancel` with `Release`; the worker loads it `Relaxed` in a + // `yield_now` loop, which still guarantees eventual visibility of the flag, + // so the worker is guaranteed to send. The generous 30s timeout (vs the old + // 1s) tolerates CPU starvation of the detached worker under heavy load, yet + // still bounds a genuine deadlock so CI fails fast instead of hanging the + // whole test binary. done_rx - .recv_timeout(std::time::Duration::from_secs(TEST_CHANNEL_TIMEOUT_SECS)) - .unwrap(); + .recv_timeout(std::time::Duration::from_secs(30)) + .expect("worker should send done after observing cancel"); } #[test]