diff --git a/.jules/bolt.md b/.jules/bolt.md index 2c736dc8..46442e20 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -1,3 +1,7 @@ ## 2026-06-04 - Borrowing strings during clustering graph traversal **Learning:** In `crates/clarion-analysis/src/lib.rs`, the fallback algorithm `local_weighted_components` cloned module string IDs deeply while populating neighbor lists (`neighbors`), seen sets (`seen`), and graph traversal stacks (`stack`). For a typical graph with hundreds or thousands of nodes, this causes extensive unnecessary memory allocations and CPU overhead during clustering. **Action:** Replace `String` with `&str` references inside internal clustering structures. Rust's borrow checker can perfectly track the lifetimes bound to the original `ModuleGraph`, and `.to_owned()` only needs to be called when pushing a module into the final partitioned `Vec` results. This dramatically reduces heap allocations. + +## 2026-06-05 - Avoid redundant Regex compilation in repeated function calls +**Learning:** `Regex::new` is an expensive operation that compiles a pattern into DFA/NFA representations. In `crates/clarion-scanner/src/patterns.rs`, `Scanner::new()` was recompiling the default rule-floor regexes every time it was called. The `regex` crate's `Regex` and `RegexSet` types wrap their compiled state in an `Arc`, so cloning them is very cheap compared with compiling them again. +**Action:** Use `std::sync::OnceLock` to statically cache expensive struct instances that hold `Regex` fields (or the `Regex` fields themselves) when they depend only on static configuration. Subsequent calls can then safely `.clone()` the cached structure: the clone skips recompilation, although the struct's other fields are still copied as ordinary clones. 
diff --git a/crates/clarion-scanner/src/patterns.rs b/crates/clarion-scanner/src/patterns.rs index bb4671de..9a718940 100644 --- a/crates/clarion-scanner/src/patterns.rs +++ b/crates/clarion-scanner/src/patterns.rs @@ -14,14 +14,14 @@ pub struct PatternMeta { capture_group: Option, } -#[derive(Debug)] +#[derive(Debug, Clone)] struct CompiledPattern { meta: PatternMeta, regex: Regex, } /// Rust-native port of the ADR-013 v0.1 secret rule floor. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Scanner { patterns: RegexSet, pattern_meta: Vec, @@ -46,28 +46,36 @@ impl Scanner { /// Panics only if one of the compiled-in regular expressions is invalid. #[must_use] pub fn new() -> Self { - let pattern_meta = default_pattern_meta(); - let patterns = RegexSet::new(pattern_meta.iter().map(|meta| meta.pattern)) - .expect("default secret patterns compile"); - let compiled_patterns = pattern_meta - .iter() - .cloned() - .map(|meta| CompiledPattern { - regex: Regex::new(meta.pattern).expect("default secret pattern compiles"), - meta, + // Build the default `Scanner` once and keep it in a function-local `OnceLock`; every call returns a clone of it. + // Cloning a `Regex` or `RegexSet` reuses the already-compiled state (the `regex` crate reference-counts it), + // so repeated calls skip recompilation; the remaining fields are copied as ordinary clones. 
+ static DEFAULTS: std::sync::OnceLock = std::sync::OnceLock::new(); + DEFAULTS + .get_or_init(|| { + let pattern_meta = default_pattern_meta(); + let patterns = RegexSet::new(pattern_meta.iter().map(|meta| meta.pattern)) + .expect("default secret patterns compile"); + let compiled_patterns = pattern_meta + .iter() + .cloned() + .map(|meta| CompiledPattern { + regex: Regex::new(meta.pattern).expect("default secret pattern compiles"), + meta, + }) + .collect(); + Self { + patterns, + pattern_meta, + compiled_patterns, + entropy_b64: EntropyTuning::BASE64, + entropy_hex: EntropyTuning::HEX, + entropy_b64_re: Regex::new(r"[A-Za-z0-9+/]{20,}={0,2}") + .expect("base64 candidate regex compiles"), + entropy_hex_re: Regex::new(r"\b[a-fA-F0-9]{40,}\b") + .expect("hex candidate regex compiles"), + } }) - .collect(); - Self { - patterns, - pattern_meta, - compiled_patterns, - entropy_b64: EntropyTuning::BASE64, - entropy_hex: EntropyTuning::HEX, - entropy_b64_re: Regex::new(r"[A-Za-z0-9+/]{20,}={0,2}") - .expect("base64 candidate regex compiles"), - entropy_hex_re: Regex::new(r"\b[a-fA-F0-9]{40,}\b") - .expect("hex candidate regex compiles"), - } + .clone() } #[must_use]