From 368f1180e75faa548cba6a867e8ff96d387c7956 Mon Sep 17 00:00:00 2001 From: Leszek Date: Mon, 15 Jun 2026 04:02:57 +0200 Subject: [PATCH] test: harden two load-sensitive tests against CPU oversubscription MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both are test-only changes; production code is untouched. An investigation (1220 suite runs under CPU saturation, zero failures) showed the suite is deterministic under a normal single-cargo run — the "flake" seen during a parallel-agent wave was shared-worktree cargo contention, not test logic. These two latent fragilities are hardened defensively: - viewer: test_viewer_loader_drop_cancels_worker waited on a detached busy-wait (yield_now) worker with recv_timeout(1s).unwrap(); under heavy load the starved worker could miss the 1s deadline. Drop sets the cancel flag, so the worker is guaranteed to send — switch to a blocking recv() and drop the now-dead TEST_CHANNEL_TIMEOUT_SECS constant. - debug_log: log_truncates_oversized_file shares process-global LOG_FILE/CHECK_COUNTER/TEST_CACHE_HOME with concurrent production log() calls from other parallel tests. Widen the retry budget (20 -> 50) and document why the loop must not be collapsed into a single attempt. --- src/app/debug_log.rs | 17 +++++++++++++++-- src/ui/viewer/tests.rs | 8 +++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/app/debug_log.rs b/src/app/debug_log.rs index 1314994..b043cde 100644 --- a/src/app/debug_log.rs +++ b/src/app/debug_log.rs @@ -237,8 +237,21 @@ mod tests { .expect("write oversized log"); } + // Cross-test interference is expected here and the retry loop below MUST + // NOT be collapsed into a single attempt. Production `log()` relies on + // PROCESS-GLOBAL statics: `CHECK_COUNTER` (AtomicU32), `LOG_FILE` (Mutex) + // and the `TEST_CACHE_HOME` redirect. `TEST_MUTEX` only serializes the two + // debug_log tests against each other; it does NOT exclude the hundreds of + // other tests running on parallel threads, many of which invoke the + // production `debug_log!`/`log()` macro. Once this test installs its + // tempdir into the process-global `TEST_CACHE_HOME`, those concurrent + // `log()` calls (a) bump the shared `CHECK_COUNTER` and (b) write to + // `log_path()`, which now points at THIS test's tempdir. So concurrent + // appends and counter bumps can perturb the careful size dance below. + // The retry loop tolerates that interference: we keep trying until one of + // our own `reset_for_test()` + `log()` cycles wins the size check. let mut truncated = false; - for attempt in 0..20 { + for attempt in 0..50 { reset_for_test(); log(format_args!("attempt {attempt}")); let len = std::fs::metadata(&path).map(|m| m.len()).unwrap_or(0); @@ -258,7 +271,7 @@ mod tests { reset_for_test(); assert!( truncated, - "log() never acquired mutex to truncate oversized file after 20 retries" + "log() never acquired mutex to truncate oversized file after 50 retries" ); } } diff --git a/src/ui/viewer/tests.rs b/src/ui/viewer/tests.rs index c931ef7..d4643ae 100644 --- a/src/ui/viewer/tests.rs +++ b/src/ui/viewer/tests.rs @@ -19,7 +19,6 @@ use std::thread; use tempfile::NamedTempFile; const DEFAULT_PAGE_HEIGHT: usize = 20; -const TEST_CHANNEL_TIMEOUT_SECS: u64 = 1; fn create_test_file(content: &str) -> NamedTempFile { let mut file = NamedTempFile::new().unwrap(); @@ -89,9 +88,12 @@ fn test_viewer_loader_drop_cancels_worker() { drop(loader); assert!(cancel.load(Ordering::Relaxed)); + // Drop sets `cancel`, so the worker is guaranteed to send once it observes it. + // A blocking recv removes the arbitrary 1s deadline that made this test fail + // under heavy CPU load when the detached yield-looping worker got starved. done_rx - .recv_timeout(std::time::Duration::from_secs(TEST_CHANNEL_TIMEOUT_SECS)) - .unwrap(); + .recv() + .expect("worker should send done after observing cancel"); } #[test]