From 42ad67e9740f64fbe159164d51e33d18aa5a9d00 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 5 Jun 2026 16:30:33 +0000 Subject: [PATCH 1/2] =?UTF-8?q?=E2=9A=A1=20Bolt:=20Cache=20compiled=20Rege?= =?UTF-8?q?x=20instances=20to=20speed=20up=20Scanner=20creation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: tachyon-beep <544926+tachyon-beep@users.noreply.github.com> --- .jules/bolt.md | 4 ++ crates/clarion-scanner/src/patterns.rs | 54 ++++++++++++++------------ 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index 2c736dc8..46442e20 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -1,3 +1,7 @@ ## 2026-06-04 - Borrowing strings during clustering graph traversal **Learning:** In `crates/clarion-analysis/src/lib.rs`, the fallback algorithm `local_weighted_components` cloned module string IDs deeply while populating neighbor lists (`neighbors`), seen sets (`seen`), and graph traversal stacks (`stack`). For a typical graph with hundreds or thousands of nodes, this causes extensive unnecessary memory allocations and CPU overhead during clustering. **Action:** Replace `String` with `&str` references inside internal clustering structures. Rust's borrow checker can perfectly track the lifetimes bound to the original `ModuleGraph`, and `.to_owned()` only needs to be called when pushing a module into the final partitioned `Vec` results. This dramatically reduces heap allocations. + +## 2026-06-05 - Avoid redundant Regex compilation in repeated function calls +**Learning:** `Regex::new` is an expensive operation that compiles regexes into DFA/NFA representations. In `crates/clarion-scanner/src/patterns.rs`, `Scanner::new()` was compiling the default rule floor regexes every time it was called. 
Rust's `Regex` and `RegexSet` objects from the `regex` crate wrap their compiled state in an `Arc`, meaning they are very cheap to clone. +**Action:** Use `std::sync::OnceLock` to statically cache expensive struct instances that hold `Regex` fields (or the `Regex` fields themselves) if they only depend on static configurations. You can safely and efficiently `.clone()` the cached structures for subsequent calls. diff --git a/crates/clarion-scanner/src/patterns.rs b/crates/clarion-scanner/src/patterns.rs index bb4671de..887ab6e7 100644 --- a/crates/clarion-scanner/src/patterns.rs +++ b/crates/clarion-scanner/src/patterns.rs @@ -14,14 +14,14 @@ pub struct PatternMeta { capture_group: Option, } -#[derive(Debug)] +#[derive(Debug, Clone)] struct CompiledPattern { meta: PatternMeta, regex: Regex, } /// Rust-native port of the ADR-013 v0.1 secret rule floor. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Scanner { patterns: RegexSet, pattern_meta: Vec, @@ -46,28 +46,34 @@ impl Scanner { /// Panics only if one of the compiled-in regular expressions is invalid. #[must_use] pub fn new() -> Self { - let pattern_meta = default_pattern_meta(); - let patterns = RegexSet::new(pattern_meta.iter().map(|meta| meta.pattern)) - .expect("default secret patterns compile"); - let compiled_patterns = pattern_meta - .iter() - .cloned() - .map(|meta| CompiledPattern { - regex: Regex::new(meta.pattern).expect("default secret pattern compiles"), - meta, - }) - .collect(); - Self { - patterns, - pattern_meta, - compiled_patterns, - entropy_b64: EntropyTuning::BASE64, - entropy_hex: EntropyTuning::HEX, - entropy_b64_re: Regex::new(r"[A-Za-z0-9+/]{20,}={0,2}") - .expect("base64 candidate regex compiles"), - entropy_hex_re: Regex::new(r"\b[a-fA-F0-9]{40,}\b") - .expect("hex candidate regex compiles"), - } + // Cache compiled Regexes and Scanner instance behind a OnceLock. 
+ // Cloning a `Regex` or `RegexSet` is extremely cheap (just bumping an Arc), + // preventing redundant compilation overhead whenever a Scanner is instantiated. + static DEFAULTS: std::sync::OnceLock<Scanner> = std::sync::OnceLock::new(); + DEFAULTS.get_or_init(|| { + let pattern_meta = default_pattern_meta(); + let patterns = RegexSet::new(pattern_meta.iter().map(|meta| meta.pattern)) + .expect("default secret patterns compile"); + let compiled_patterns = pattern_meta + .iter() + .cloned() + .map(|meta| CompiledPattern { + regex: Regex::new(meta.pattern).expect("default secret pattern compiles"), + meta, + }) + .collect(); + Self { + patterns, + pattern_meta, + compiled_patterns, + entropy_b64: EntropyTuning::BASE64, + entropy_hex: EntropyTuning::HEX, + entropy_b64_re: Regex::new(r"[A-Za-z0-9+/]{20,}={0,2}") + .expect("base64 candidate regex compiles"), + entropy_hex_re: Regex::new(r"\b[a-fA-F0-9]{40,}\b") + .expect("hex candidate regex compiles"), + } + }).clone() } #[must_use] From 663715f0d2361ea6bc8f907a45ab95bcdd63092c Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 5 Jun 2026 16:36:29 +0000 Subject: [PATCH 2/2] =?UTF-8?q?=E2=9A=A1=20Bolt:=20Cache=20compiled=20Rege?= =?UTF-8?q?x=20instances=20to=20speed=20up=20Scanner=20creation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: tachyon-beep <544926+tachyon-beep@users.noreply.github.com> --- crates/clarion-scanner/src/patterns.rs | 50 +++++++++++++------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/crates/clarion-scanner/src/patterns.rs b/crates/clarion-scanner/src/patterns.rs index 887ab6e7..9a718940 100644 --- a/crates/clarion-scanner/src/patterns.rs +++ b/crates/clarion-scanner/src/patterns.rs @@ -50,30 +50,32 @@ impl Scanner { // Cloning a `Regex` or `RegexSet` is extremely cheap (just bumping an Arc), // preventing redundant compilation overhead whenever
a Scanner is instantiated. static DEFAULTS: std::sync::OnceLock<Scanner> = std::sync::OnceLock::new(); - DEFAULTS.get_or_init(|| { - let pattern_meta = default_pattern_meta(); - let patterns = RegexSet::new(pattern_meta.iter().map(|meta| meta.pattern)) - .expect("default secret patterns compile"); - let compiled_patterns = pattern_meta - .iter() - .cloned() - .map(|meta| CompiledPattern { - regex: Regex::new(meta.pattern).expect("default secret pattern compiles"), - meta, - }) - .collect(); - Self { - patterns, - pattern_meta, - compiled_patterns, - entropy_b64: EntropyTuning::BASE64, - entropy_hex: EntropyTuning::HEX, - entropy_b64_re: Regex::new(r"[A-Za-z0-9+/]{20,}={0,2}") - .expect("base64 candidate regex compiles"), - entropy_hex_re: Regex::new(r"\b[a-fA-F0-9]{40,}\b") - .expect("hex candidate regex compiles"), - } - }).clone() + DEFAULTS + .get_or_init(|| { + let pattern_meta = default_pattern_meta(); + let patterns = RegexSet::new(pattern_meta.iter().map(|meta| meta.pattern)) + .expect("default secret patterns compile"); + let compiled_patterns = pattern_meta + .iter() + .cloned() + .map(|meta| CompiledPattern { + regex: Regex::new(meta.pattern).expect("default secret pattern compiles"), + meta, + }) + .collect(); + Self { + patterns, + pattern_meta, + compiled_patterns, + entropy_b64: EntropyTuning::BASE64, + entropy_hex: EntropyTuning::HEX, + entropy_b64_re: Regex::new(r"[A-Za-z0-9+/]{20,}={0,2}") + .expect("base64 candidate regex compiles"), + entropy_hex_re: Regex::new(r"\b[a-fA-F0-9]{40,}\b") + .expect("hex candidate regex compiles"), + } + }) + .clone() } #[must_use]