From a9ebdf7048f91367ee68f8ae152e063f8da17896 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Wed, 27 May 2026 12:01:48 +0200 Subject: [PATCH] feat: add fast scan mode to scanner Introduces a "fast scan" mode that optimizes scanning performance by avoiding the tracking of all pattern matches when they are not strictly required by a rule's condition. This mode is beneficial for rules with simple boolean conditions (e.g., `$a`) where only the presence of a match matters, not its specific offsets or count. When enabled, only the first occurrence of a fast-scanned pattern is reported. Patterns used in contexts like count (`#a`), offset (`@a`), length (`!a`), anchored matches, or within `for ... of` loops requiring detailed match information, automatically disable fast scan for themselves during compilation. --- capi/include/yara_x.h | 13 +++++++ capi/src/scanner.rs | 38 ++++++++++++++++++ capi/src/tests.rs | 48 ++++++++++++++++++++--- go/scanner.go | 14 +++++++ go/scanner_test.go | 18 +++++++++ lib/src/compiler/ir/ast2ir.rs | 65 ++++++++++++++++++++++++++++--- lib/src/compiler/ir/mod.rs | 24 +++++++++++- lib/src/compiler/mod.rs | 14 +++++++ lib/src/compiler/rules.rs | 8 +++- lib/src/compiler/tests/mod.rs | 2 +- lib/src/scanner/blocks.rs | 50 ++++++++++++++++++++++++ lib/src/scanner/context.rs | 9 ++++- lib/src/scanner/mod.rs | 15 ++++++++ lib/src/scanner/tests.rs | 68 +++++++++++++++++++++++++++++++++ py/src/lib.rs | 14 +++++++ py/tests/test_api.py | 12 ++++++ py/yara_x.pyi | 14 +++++++ site/content/docs/api/c.md | 22 +++++++++++ site/content/docs/api/python.md | 12 ++++++ 19 files changed, 444 insertions(+), 16 deletions(-) diff --git a/capi/include/yara_x.h b/capi/include/yara_x.h index 9b793eb60..75355452f 100644 --- a/capi/include/yara_x.h +++ b/capi/include/yara_x.h @@ -703,6 +703,19 @@ void yrx_scanner_destroy(struct YRX_SCANNER *scanner); enum YRX_RESULT yrx_scanner_set_timeout(struct YRX_SCANNER *scanner, uint64_t timeout); +// Enables or disables fast 
scan mode for the scanner. +// +// In fast scan mode, the scanner avoids tracking matches for patterns when it +// is not necessary (e.g. when a rule condition only performs a simple boolean +// check `$a`). +// +// Note that using fast scan mode implies that not all matches will be +// reported. For instance, when iterating the matches of a pattern, you +// won't get all occurrences of the pattern in the file, only the first +// one. +enum YRX_RESULT yrx_scanner_fast_scan(struct YRX_SCANNER *scanner, + bool yes); + // Scans a data buffer. // // `data` can be null as long as `len` is 0. In such cases its handled as diff --git a/capi/src/scanner.rs b/capi/src/scanner.rs index f34dae071..d52a821a7 100644 --- a/capi/src/scanner.rs +++ b/capi/src/scanner.rs @@ -34,6 +34,19 @@ impl<'r> InnerScanner<'r> { self } + fn fast_scan(&mut self, yes: bool) -> &mut Self { + match self { + InnerScanner::SingleBlock(s) => { + s.fast_scan(yes); + } + InnerScanner::MultiBlock(s) => { + s.fast_scan(yes); + } + InnerScanner::None => unreachable!(), + } + self + } + fn make_multi_block(&mut self) -> &mut yara_x::blocks::Scanner<'r> { // Already a multi-block scanner, nothing else to do. if let Self::MultiBlock(s) = self { @@ -153,6 +166,31 @@ pub unsafe extern "C" fn yrx_scanner_set_timeout( YRX_RESULT::YRX_SUCCESS } +/// Enables or disables fast scan mode for the scanner. +/// +/// In fast scan mode, the scanner avoids tracking matches for patterns when it +/// is not necessary (e.g. when a rule condition only performs a simple boolean +/// check `$a`). +/// +/// Note that using fast scan mode implies that not all matches will be +/// reported. For instance, when iterating the matches of a pattern, you +/// won't get all occurrences of the pattern in the file, only the first +/// one. 
+#[unsafe(no_mangle)] +pub unsafe extern "C" fn yrx_scanner_fast_scan( + scanner: *mut YRX_SCANNER, + yes: bool, +) -> YRX_RESULT { + let scanner = match scanner.as_mut() { + Some(s) => s, + None => return YRX_RESULT::YRX_INVALID_ARGUMENT, + }; + + scanner.inner.fast_scan(yes); + + YRX_RESULT::YRX_SUCCESS +} + /// Scans a data buffer. /// /// `data` can be null as long as `len` is 0. In such cases its handled as diff --git a/capi/src/tests.rs b/capi/src/tests.rs index 40835e018..ce6388bc8 100644 --- a/capi/src/tests.rs +++ b/capi/src/tests.rs @@ -12,12 +12,13 @@ use crate::{ yrx_rule_iter_metadata, yrx_rule_iter_patterns, yrx_rule_iter_tags, yrx_rule_namespace, yrx_rules_deserialize, yrx_rules_destroy, yrx_rules_iter, yrx_rules_iter_imports, yrx_rules_serialize, - yrx_scanner_create, yrx_scanner_destroy, yrx_scanner_finish, - yrx_scanner_on_console_log, yrx_scanner_on_matching_rule, - yrx_scanner_scan, yrx_scanner_scan_block, yrx_scanner_set_global_bool, - yrx_scanner_set_global_float, yrx_scanner_set_global_int, - yrx_scanner_set_global_json, yrx_scanner_set_global_str, - yrx_scanner_set_module_data, yrx_scanner_set_timeout, + yrx_scanner_create, yrx_scanner_destroy, yrx_scanner_fast_scan, + yrx_scanner_finish, yrx_scanner_on_console_log, + yrx_scanner_on_matching_rule, yrx_scanner_scan, yrx_scanner_scan_block, + yrx_scanner_set_global_bool, yrx_scanner_set_global_float, + yrx_scanner_set_global_int, yrx_scanner_set_global_json, + yrx_scanner_set_global_str, yrx_scanner_set_module_data, + yrx_scanner_set_timeout, }; use std::ffi::{CStr, c_char, c_void}; @@ -457,3 +458,38 @@ fn capi_errors() { yrx_compiler_destroy(compiler); } } + +#[test] +fn capi_fast_scan() { + unsafe { + let mut compiler = std::ptr::null_mut(); + yrx_compiler_create(0, &mut compiler); + + let src = c"rule test { strings: $a = \"foo\" condition: $a }"; + yrx_compiler_add_source(compiler, src.as_ptr()); + + let rules = yrx_compiler_build(compiler); + yrx_compiler_destroy(compiler); + + let mut 
scanner = std::ptr::null_mut(); + yrx_scanner_create(rules, &mut scanner); + + // Enable fast scan mode + yrx_scanner_fast_scan(scanner, true); + + let mut matches = 0; + yrx_scanner_on_matching_rule( + scanner, + on_rule_match_increase_counter, + &mut matches as *mut i32 as *mut c_void, + ); + + let data = b"foofoofoo"; + yrx_scanner_scan(scanner, data.as_ptr(), data.len()); + + assert_eq!(matches, 1); + + yrx_scanner_destroy(scanner); + yrx_rules_destroy(rules); + } +} diff --git a/go/scanner.go b/go/scanner.go index 848a49a47..c1926c56e 100644 --- a/go/scanner.go +++ b/go/scanner.go @@ -116,6 +116,20 @@ func (s *Scanner) SetTimeout(timeout time.Duration) { runtime.KeepAlive(s) } +// FastScan enables or disables fast scan mode. +// +// In fast scan mode, the scanner avoids tracking matches for patterns when it +// is not necessary (e.g. when a rule condition only performs a simple boolean +// check `$a`). +// +// Note that using fast scan mode implies that not all matches will be +// reported. For instance, when iterating matches, you won't get all occurrences +// of the pattern in the file, only the first one. +func (s *Scanner) FastScan(yes bool) { + C.yrx_scanner_fast_scan(s.cScanner, C.bool(yes)) + runtime.KeepAlive(s) +} + var ErrTimeout = errors.New("timeout") // SetGlobal sets the value of a global variable. 
diff --git a/go/scanner_test.go b/go/scanner_test.go index 84023b0f3..219343d3a 100644 --- a/go/scanner_test.go +++ b/go/scanner_test.go @@ -160,3 +160,21 @@ func BenchmarkScan(b *testing.B) { } } } + +func TestScannerFastScan(t *testing.T) { + r, _ := Compile(` + rule t { + strings: + $a = "foo" + condition: + $a + }`) + s := NewScanner(r) + s.FastScan(true) + scanResults, _ := s.Scan([]byte("foofoofoo")) + matchingRules := scanResults.MatchingRules() + + assert.Len(t, matchingRules, 1) + assert.Len(t, matchingRules[0].Patterns(), 1) + assert.Len(t, matchingRules[0].Patterns()[0].Matches(), 1) +} diff --git a/lib/src/compiler/ir/ast2ir.rs b/lib/src/compiler/ir/ast2ir.rs index 14b9d3a9c..2bfdf3ccd 100644 --- a/lib/src/compiler/ir/ast2ir.rs +++ b/lib/src/compiler/ir/ast2ir.rs @@ -249,6 +249,7 @@ pub(in crate::compiler) fn text_pattern_from_ast<'src>( identifier: pattern.identifier.clone(), in_use: false, span: pattern.span(), + fast_scan_allowed: true, pattern: Pattern::Text(LiteralPattern { flags, text, @@ -305,6 +306,7 @@ pub(in crate::compiler) fn hex_pattern_from_ast<'src>( identifier: pattern.identifier.clone(), in_use: false, span: pattern.span(), + fast_scan_allowed: true, pattern: Pattern::Hex(RegexpPattern { hir, flags: PatternFlags::Ascii, @@ -440,6 +442,7 @@ pub(in crate::compiler) fn regexp_pattern_from_ast<'src>( identifier: pattern.identifier.clone(), in_use: false, span: pattern.span(), + fast_scan_allowed: true, pattern: Pattern::Regexp(RegexpPattern { flags, hir, @@ -772,6 +775,10 @@ fn expr_from_ast( pattern.make_non_anchorable(); } + if !matches!(anchor, MatchAnchor::None) { + pattern.disallow_fast_scan(); + } + ctx.ir.pattern_match(pattern_idx, anchor) } } @@ -805,13 +812,19 @@ fn expr_from_ast( let range = range_from_ast(ctx, range)?; let (pattern_idx, pattern) = ctx.get_pattern_mut(&p.identifier)?; - pattern.make_non_anchorable().mark_as_used(); + pattern + .make_non_anchorable() + .mark_as_used() + .disallow_fast_scan(); 
ctx.ir.pattern_count(pattern_idx, Some(range)) } (_, None) => { let (pattern_idx, pattern) = ctx.get_pattern_mut(&p.identifier)?; - pattern.make_non_anchorable().mark_as_used(); + pattern + .make_non_anchorable() + .mark_as_used() + .disallow_fast_scan(); ctx.ir.pattern_count(pattern_idx, None) } } @@ -847,13 +860,19 @@ fn expr_from_ast( integer_in_range_from_ast(ctx, index, 1..=i64::MAX)?; let (pattern_idx, pattern) = ctx.get_pattern_mut(&p.identifier)?; - pattern.make_non_anchorable().mark_as_used(); + pattern + .make_non_anchorable() + .mark_as_used() + .disallow_fast_scan(); ctx.ir.pattern_offset(pattern_idx, Some(range)) } (_, None) => { let (pattern_idx, pattern) = ctx.get_pattern_mut(&p.identifier)?; - pattern.make_non_anchorable().mark_as_used(); + pattern + .make_non_anchorable() + .mark_as_used() + .disallow_fast_scan(); ctx.ir.pattern_offset(pattern_idx, None) } } @@ -889,13 +908,19 @@ fn expr_from_ast( integer_in_range_from_ast(ctx, index, 1..=i64::MAX)?; let (pattern_idx, pattern) = ctx.get_pattern_mut(&p.identifier)?; - pattern.make_non_anchorable().mark_as_used(); + pattern + .make_non_anchorable() + .mark_as_used() + .disallow_fast_scan(); ctx.ir.pattern_length(pattern_idx, Some(index)) } (_, None) => { let (pattern_idx, pattern) = ctx.get_pattern_mut(&p.identifier)?; - pattern.make_non_anchorable().mark_as_used(); + pattern + .make_non_anchorable() + .mark_as_used() + .disallow_fast_scan(); ctx.ir.pattern_length(pattern_idx, None) } } @@ -1247,6 +1272,34 @@ fn for_of_expr_from_ast( let body = bool_expr_from_ast(ctx, &for_of.body)?; + let mut allow_fast_scan = true; + + for event in ctx.ir.dfs_iter(body) { + if let dfs::Event::Enter((_, expr, _)) = event + && (matches!( + expr, + Expr::PatternCountVar { .. } + | Expr::PatternOffsetVar { .. } + | Expr::PatternLengthVar { .. } + ) || (match expr { + Expr::PatternMatchVar { anchor, .. 
} => { + !matches!(anchor, MatchAnchor::None) + } + _ => false, + })) + { + allow_fast_scan = false; + break; + } + } + + if !allow_fast_scan { + for &pattern_idx in &pattern_set { + ctx.current_rule_patterns[pattern_idx.as_usize()] + .disallow_fast_scan(); + } + } + ctx.for_of_depth -= 1; ctx.symbol_table.pop(); ctx.vars.unwind(&stack_frame); diff --git a/lib/src/compiler/ir/mod.rs b/lib/src/compiler/ir/mod.rs index e1428741a..5dadd0423 100644 --- a/lib/src/compiler/ir/mod.rs +++ b/lib/src/compiler/ir/mod.rs @@ -107,6 +107,7 @@ pub(crate) struct PatternInRule<'src> { pattern: Pattern, span: Span, in_use: bool, + fast_scan_allowed: bool, } impl<'src> PatternInRule<'src> { @@ -183,6 +184,27 @@ impl<'src> PatternInRule<'src> { self.in_use = true; self } + + /// Returns true if this pattern can be fast-scanned. + /// + /// A pattern can be fast-scanned if its occurrences are only evaluated + /// as simple boolean checks (e.g. `$a`), meaning the scanner can stop + /// tracking matches for it once the first match has been found. + #[inline] + pub fn fast_scan_allowed(&self) -> bool { + self.fast_scan_allowed + } + + /// Disallows fast-scanning for this pattern. + /// + /// This is called when the pattern is used in a context that requires + /// tracking all matches (such as count `#a`, offset `@a`, length `!a`, + /// anchored checks, or loop equivalents). + #[inline] + pub fn disallow_fast_scan(&mut self) -> &mut Self { + self.fast_scan_allowed = false; + self + } } /// Represents a pattern in YARA. 
@@ -1560,7 +1582,7 @@ impl IR { pub fn matches_regex_set( &mut self, lhs: ExprId, - regex_set: crate::compiler::RegexSetId, + regex_set: RegexSetId, ) -> ExprId { let expr_id = ExprId::from(self.nodes.len()); self.parents[lhs.0 as usize] = expr_id; diff --git a/lib/src/compiler/mod.rs b/lib/src/compiler/mod.rs index 97110a42c..0e4d9fde2 100644 --- a/lib/src/compiler/mod.rs +++ b/lib/src/compiler/mod.rs @@ -344,6 +344,10 @@ pub struct Compiler<'a> { /// Next (not used yet) [`PatternId`]. next_pattern_id: PatternId, + /// Vector where the N-th boolean indicates whether the pattern with + /// PatternId = N is a fast-scan pattern. + fast_scan_patterns: Vec<bool>, + /// Map used for de-duplicating pattern. Keys are the pattern's IR and /// values are the `PatternId` assigned to each pattern. Every time a rule /// declares a pattern, this map is used for determining if the same @@ -483,6 +487,7 @@ impl<'a> Compiler<'a> { error_on_slow_pattern: false, error_on_slow_loop: false, next_pattern_id: PatternId(0), + fast_scan_patterns: Vec::new(), current_namespace: default_namespace, features: FxHashSet::default(), warnings: Warnings::default(), @@ -809,6 +814,7 @@ impl<'a> Compiler<'a> { warnings: self.warnings.into(), filesize_bounds: self.filesize_bounds, regex_sets: self.regex_sets, + fast_scan_patterns: self.fast_scan_patterns, }; rules.build_ac_automaton(); @@ -1190,6 +1196,7 @@ impl Compiler<'_> { re_code_len: self.re_code.len(), sub_patterns_len: self.sub_patterns.len(), symbol_table_len: self.symbol_table.len(), + fast_scan_patterns_len: self.fast_scan_patterns.len(), } } @@ -1204,6 +1211,7 @@ impl Compiler<'_> { self.re_code.truncate(snapshot.re_code_len); self.atoms.truncate(snapshot.atoms_len); self.symbol_table.truncate(snapshot.symbol_table_len); + self.fast_scan_patterns.truncate(snapshot.fast_scan_patterns_len); // Pattern IDs that are >= next_pattern_id, are being discarded. Any pattern // or file size bound associated to such IDs must be removed. 
@@ -1716,12 +1724,17 @@ impl Compiler<'_> { Entry::Vacant(entry) => { let pattern_id = self.next_pattern_id; self.next_pattern_id.incr(1); + self.fast_scan_patterns.push(true); pending_patterns.insert(pattern_id); entry.insert(pattern_id); pattern_id } }; + if !pattern.fast_scan_allowed() { + self.fast_scan_patterns[usize::from(pattern_id)] = false; + } + let kind = match pattern.pattern() { Pattern::Text(_) => PatternKind::Text, Pattern::Regexp(_) => PatternKind::Regexp, @@ -2994,6 +3007,7 @@ struct Snapshot { re_code_len: usize, sub_patterns_len: usize, symbol_table_len: usize, + fast_scan_patterns_len: usize, } /// Represents a list of warnings. diff --git a/lib/src/compiler/rules.rs b/lib/src/compiler/rules.rs index d5faaedf5..5131d8280 100644 --- a/lib/src/compiler/rules.rs +++ b/lib/src/compiler/rules.rs @@ -33,7 +33,7 @@ const MAGIC: &[u8] = b"YARA-X\0\0"; /// /// This version is incremented every time a change is made to the binary /// format in a way that breaks backwards compatibility. -const SERIALIZATION_VERSION: u32 = 1; +const SERIALIZATION_VERSION: u32 = 2; /// Aho-Corasick automaton bundled with an optional Teddy scanner if the /// number of patterns is low enough. If the Teddy scanner is present, and @@ -164,6 +164,7 @@ pub struct Rules { /// in the source code, allowing them to be compiled into a unified /// set automata for single-pass evaluation. 
pub(in crate::compiler) regex_sets: FxHashMap>, + pub(in crate::compiler) fast_scan_patterns: Vec<bool>, } impl Rules { @@ -565,6 +566,11 @@ impl Rules { ) -> Option<&FilesizeBounds> { self.filesize_bounds.get(&pattern_id) } + + #[inline] + pub(crate) fn is_fast_scan(&self, pattern_id: PatternId) -> bool { + self.fast_scan_patterns[usize::from(pattern_id)] + } } #[cfg(feature = "native-code-serialization")] diff --git a/lib/src/compiler/tests/mod.rs b/lib/src/compiler/tests/mod.rs index 25fe2624d..1342dce6b 100644 --- a/lib/src/compiler/tests/mod.rs +++ b/lib/src/compiler/tests/mod.rs @@ -28,7 +28,7 @@ fn serialization() { // `DecodeError`. let mut data = Vec::new(); data.extend(b"YARA-X\0\0"); - data.extend(1u32.to_le_bytes()); + data.extend(2u32.to_le_bytes()); data.extend(b"foo"); assert!(matches!( diff --git a/lib/src/scanner/blocks.rs b/lib/src/scanner/blocks.rs index c534e064f..9cb5744db 100644 --- a/lib/src/scanner/blocks.rs +++ b/lib/src/scanner/blocks.rs @@ -280,6 +280,21 @@ impl<'r> Scanner<'r> { self } + /// Enables or disables fast scan mode. + /// + /// In fast scan mode, the scanner avoids tracking matches for patterns + /// when it is not necessary (e.g. when a rule condition only performs a + /// simple boolean check `$a`). + /// + /// Note that using fast scan mode implies that not all matches will be + /// reported. For instance, when iterating matches using [`ScanResults`], + /// you won't get all occurrences of the pattern in the file, only the first + /// one. + pub fn fast_scan(&mut self, yes: bool) -> &mut Self { + self.scan_context_mut().tracker.fast_scan = yes; + self + } + /// Sets a callback that is invoked every time a YARA rule calls the /// `console` module. 
/// @@ -561,4 +576,39 @@ mod tests { assert_eq!(results.matching_rules().len(), 1); } + + #[test] + fn block_scanner_fast_scan() { + let rules = compile( + r#" + rule test { + strings: + $a = "foo" + condition: + $a + }"#, + ) + .unwrap(); + + let mut scanner = Scanner::new(&rules); + let results = scanner + .fast_scan(true) + .scan(0, b"foofoofoo") + .unwrap() + .finish() + .unwrap(); + + assert_eq!(results.matching_rules().len(), 1); + + let rule = results.matching_rules().next().unwrap(); + let pattern = rule.patterns().next().unwrap(); + let mut matches = pattern.matches(); + + // Only a single match is returned because of the fast scan mode! + let match1 = matches.next().unwrap(); + assert_eq!(match1.data(), b"foo".as_slice()); + assert_eq!(match1.range(), 0..3); + + assert!(matches.next().is_none()); + } } diff --git a/lib/src/scanner/context.rs b/lib/src/scanner/context.rs index d41ea17d6..7e3786502 100644 --- a/lib/src/scanner/context.rs +++ b/lib/src/scanner/context.rs @@ -67,6 +67,7 @@ pub(crate) struct MatchTracker<'r> { pub unconfirmed_matches: FxHashMap>, pub limit_reached: FxHashSet, pub compiled_rules: &'r Rules, + pub fast_scan: bool, } /// Structure that holds information about WASM memories and variables used @@ -1745,7 +1746,12 @@ fn track_pattern_match( bits.set(pattern_id.into(), true); - if !tracker.pattern_matches.add(pattern_id, match_, replace_if_longer) { + let added = + tracker.pattern_matches.add(pattern_id, match_, replace_if_longer); + if !added + || (tracker.fast_scan + && tracker.compiled_rules.is_fast_scan(pattern_id)) + { tracker.limit_reached.insert(pattern_id); } } @@ -1909,6 +1915,7 @@ pub fn create_wasm_store_and_ctx<'r>( unconfirmed_matches: FxHashMap::default(), limit_reached: FxHashSet::default(), compiled_rules: rules, + fast_scan: false, }, deadline: 0, regex_cache: RefCell::new(FxHashMap::default()), diff --git a/lib/src/scanner/mod.rs b/lib/src/scanner/mod.rs index 63f56d539..a46aa7df4 100644 --- a/lib/src/scanner/mod.rs 
+++ b/lib/src/scanner/mod.rs @@ -227,6 +227,21 @@ impl<'r> Scanner<'r> { self } + /// Enables or disables fast scan mode. + /// + /// In fast scan mode, the scanner avoids tracking matches for patterns + /// when it is not necessary (e.g. when a rule condition only performs a + /// simple boolean check `$a`). + /// + /// Note that using fast scan mode implies that not all matches will be + /// reported. For instance, when iterating matches using [`ScanResults`], + /// you won't get all occurrences of the pattern in the file, only the first + /// one. + pub fn fast_scan(&mut self, yes: bool) -> &mut Self { + self.scan_context_mut().tracker.fast_scan = yes; + self + } + /// Specifies whether [`Scanner::scan_file`] and [`Scanner::scan_file_with_options`] /// may use memory-mapped files to read input. /// diff --git a/lib/src/scanner/tests.rs b/lib/src/scanner/tests.rs index 27723eda6..83ca724aa 100644 --- a/lib/src/scanner/tests.rs +++ b/lib/src/scanner/tests.rs @@ -985,3 +985,71 @@ fn regex_set_optimization() { results.matching_rules().map(|r| r.identifier().to_string()).collect(); assert_eq!(matching_rules, vec!["test_match"]); } + +#[test] +fn fast_scan_mode() { + let rules = crate::compile( + r#" + rule test_boolean { + strings: + $a = "foo" + $b = "bar" + condition: + $a and $b + } + rule test_count { + strings: + $c = "baz" + condition: + #c > 1 + } + "#, + ) + .unwrap(); + + // Test standard scan first (fast_scan = false by default) + let mut scanner = Scanner::new(&rules); + let results = scanner.scan(b"foofoobarbarbazbaz").unwrap(); + + // Check pattern $a matches + let test_boolean = results + .matching_rules() + .find(|r| r.identifier() == "test_boolean") + .unwrap(); + let mut patterns_a = + test_boolean.patterns().filter(|p| p.identifier() == "$a"); + assert_eq!(patterns_a.next().unwrap().matches().len(), 2); // foofoo has 2 matches + + // Check pattern $c matches + let test_count = results + .matching_rules() + .find(|r| r.identifier() == "test_count") + 
.unwrap(); + let mut patterns_c = + test_count.patterns().filter(|p| p.identifier() == "$c"); + assert_eq!(patterns_c.next().unwrap().matches().len(), 2); // bazbaz has 2 matches + + // Test fast scan mode (fast_scan = true) + let mut scanner = Scanner::new(&rules); + scanner.fast_scan(true); + let results = scanner.scan(b"foofoobarbarbazbaz").unwrap(); + + // Rule test_boolean still matches + let test_boolean = results + .matching_rules() + .find(|r| r.identifier() == "test_boolean") + .unwrap(); + // But pattern $a must only have 1 match because it is fast-scanned! + let mut patterns_a = + test_boolean.patterns().filter(|p| p.identifier() == "$a"); + assert_eq!(patterns_a.next().unwrap().matches().len(), 1); + + // Pattern $c must still have 2 matches because #c is used, disabling fast scan! + let test_count = results + .matching_rules() + .find(|r| r.identifier() == "test_count") + .unwrap(); + let mut patterns_c = + test_count.patterns().filter(|p| p.identifier() == "$c"); + assert_eq!(patterns_c.next().unwrap().matches().len(), 2); +} diff --git a/py/src/lib.rs b/py/src/lib.rs index 64a01c9d8..b131ef2e1 100644 --- a/py/src/lib.rs +++ b/py/src/lib.rs @@ -1014,6 +1014,20 @@ impl Scanner { self.inner.max_matches_per_pattern(matches); } + /// Enables or disables fast scan mode. + /// + /// In fast scan mode, the scanner avoids tracking matches for patterns when + /// it is not necessary (e.g. when a rule condition only performs a simple + /// boolean check `$a`). + /// + /// Note that using fast scan mode implies that not all matches will be + /// reported. For instance, when iterating matches using [`ScanResults`], + /// you won't get all occurrences of the pattern in the file, only the first + /// one. + fn fast_scan(&mut self, yes: bool) { + self.inner.fast_scan(yes); + } + /// Sets a callback that is invoked every time a YARA rule calls the /// `console` module. 
/// diff --git a/py/tests/test_api.py b/py/tests/test_api.py index 9734856cc..1b7f6e2f2 100644 --- a/py/tests/test_api.py +++ b/py/tests/test_api.py @@ -238,6 +238,18 @@ def test_scanner_max_matches_per_pattern(): assert len(matching_rules) == 1 +def test_scanner_fast_scan(): + compiler = yara_x.Compiler() + compiler.add_source('rule test {strings: $a = "foo" condition: $a}') + + scanner = yara_x.Scanner(compiler.build()) + scanner.fast_scan(True) + matching_rules = scanner.scan(b'foofoofoo').matching_rules + assert len(matching_rules) == 1 + assert len(matching_rules[0].patterns) == 1 + assert len(matching_rules[0].patterns[0].matches) == 1 + + def test_scan_options(): if 'test_proto2' not in yara_x.module_names(): return diff --git a/py/yara_x.pyi b/py/yara_x.pyi index cc90de6f6..4f9b09465 100644 --- a/py/yara_x.pyi +++ b/py/yara_x.pyi @@ -288,6 +288,20 @@ class Scanner: """ ... + def fast_scan(self, yes: bool) -> None: + r""" + Enables or disables fast scan mode. + + In fast scan mode, the scanner avoids tracking matches for patterns when + it is not necessary (e.g. when a rule condition only performs a simple + boolean check `$a`). + + Note that using fast scan mode implies that not all matches will be + reported. For instance, when iterating matches, you won't get all occurrences + of the pattern in the file, only the first one. + """ + ... + def console_log(self, callback: collections.abc.Callable[[str], Any]) -> None: r""" Sets a callback that is invoked every time a YARA rule calls the diff --git a/site/content/docs/api/c.md b/site/content/docs/api/c.md index 73ddec1e4..3d6c926d6 100644 --- a/site/content/docs/api/c.md +++ b/site/content/docs/api/c.md @@ -742,6 +742,28 @@ enum YRX_RESULT yrx_scanner_set_timeout( uint64_t timeout); ``` +------ + +### yrx_scanner_fast_scan + +```c +enum YRX_RESULT yrx_scanner_fast_scan( + struct YRX_SCANNER *scanner, + bool yes); +``` + +Enables or disables fast scan mode for the scanner. 
+ +In fast scan mode, the scanner avoids tracking matches for patterns when it is +not necessary (e.g. when a rule condition only performs a simple boolean check +`$a`). + +Note that using fast scan mode implies that not all matches will be reported. +For instance, when iterating matches, you won't get all occurrences of the +pattern in the file, only the first one. + +------ + ### yrx_scanner_set_global_xxxx ```c diff --git a/site/content/docs/api/python.md b/site/content/docs/api/python.md index 44ca03354..4b8e642bc 100644 --- a/site/content/docs/api/python.md +++ b/site/content/docs/api/python.md @@ -574,6 +574,18 @@ if the type of `value` is not one of the supported ones. Sets a timeout for each scan. Scans will abort after the specified `seconds`. +#### .fast_scan(bool) + +Enables or disables fast scan mode. + +In fast scan mode, the scanner avoids tracking matches for patterns when it is +not necessary (e.g. when a rule condition only performs a simple boolean check +`$a`). + +Note that using fast scan mode implies that not all matches will be reported. +For instance, when iterating matches, you won't get all occurrences of the +pattern in the file, only the first one. + --------- ### ScanOptions