Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
[Unreleased]

- Add multi-language support for Markdown files: fenced code blocks are now spell-checked using the appropriate language grammar (Python, Rust, Bash, etc.)
- Add language injection system via `@injection.*` capture tags in `.scm` query files. Adding multi-language support to a new file type now requires only a `.scm` change, with no Rust code changes
- Add HTML block injection in Markdown. Block-level HTML is spell-checked using the HTML grammar
- Add language alias resolution for Markdown code blocks (e.g., `py`, `js`, `sh`, `rs`, `yml`, `c++`)
- Pre-compile all tree-sitter queries at startup for faster spell-checking and earlier error detection
- Reduce per-word memory allocations in the spell-check pipeline
- Fix Erlang query producing duplicate captures for function name atoms
- Refactor: split spell-checking into separate extraction (`parser.rs`) and checking (`checker.rs`) modules

[0.3.35]

- Add tag-based filtering (`include_tags`/`exclude_tags`) to control which parts of code are spell-checked (comments, strings, identifiers, etc.)
Expand Down
123 changes: 123 additions & 0 deletions crates/codebook/src/checker.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
use std::collections::{HashMap, HashSet};

use crate::dictionaries::dictionary::Dictionary;
use crate::parser::{TextRange, WordLocation};
use codebook_config::CodebookConfig;

/// A candidate word extracted from a text node, with its position
/// in original-document byte offsets. Borrows the word text from the
/// source document to avoid per-word String allocations.
#[derive(Debug, Clone, PartialEq)]
pub struct WordCandidate<'a> {
    /// The word text, borrowed from the source document.
    pub word: &'a str,
    /// Byte offset of the word's first byte in the original document.
    pub start_byte: usize,
    /// Byte offset one past the word's last byte (exclusive end).
    pub end_byte: usize,
}

/// Check candidate words against dictionaries and config rules.
/// Returns WordLocations for misspelled words, grouping all locations
/// of the same word together.
///
/// Candidates are grouped by word text so each unique word is checked
/// exactly once, however many times it appears in the document.
///
/// Decision order per unique word:
/// 1. `should_flag_word` — always reported, bypassing every other check.
/// 2. Shorter than `get_min_word_length` — skipped.
/// 3. `is_allowed_word` — skipped.
/// 4. Otherwise reported unless at least one dictionary knows the word.
pub fn check_words(
    candidates: &[WordCandidate<'_>],
    dictionaries: &[std::sync::Arc<dyn Dictionary>],
    config: &dyn CodebookConfig,
) -> Vec<WordLocation> {
    // Group candidates by word text, deduplicating identical spans.
    let mut word_positions: HashMap<&str, HashSet<TextRange>> = HashMap::new();
    for candidate in candidates {
        let location = TextRange {
            start_byte: candidate.start_byte,
            end_byte: candidate.end_byte,
        };
        let added = word_positions
            .entry(candidate.word)
            .or_default()
            .insert(location);

        // A duplicate span means the tree-sitter query captured the same
        // node twice; surface that in debug builds so the query gets fixed.
        debug_assert!(
            added,
            "Two of the same locations found. Make a better query. Word: {}, Location: {:?}",
            candidate.word, location
        );
    }

    // Check each unique word once. Only materialize the positions Vec on
    // the paths that actually report the word, so correctly-spelled words
    // (the common case) cost no extra allocation.
    let mut results = Vec::new();
    for (word, positions) in word_positions {
        if config.should_flag_word(word) {
            results.push(WordLocation::new(
                word.to_string(),
                positions.into_iter().collect(),
            ));
            continue;
        }
        if word.len() < config.get_min_word_length() {
            continue;
        }
        if config.is_allowed_word(word) {
            continue;
        }
        let is_correct = dictionaries.iter().any(|dict| dict.check(word));
        if !is_correct {
            results.push(WordLocation::new(
                word.to_string(),
                positions.into_iter().collect(),
            ));
        }
    }
    results
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dictionaries::dictionary::TextDictionary;
    use std::sync::Arc;

    /// Build borrowed `WordCandidate`s from `(word, start, end)` triples.
    fn make_candidates<'a>(words: &[(&'a str, usize, usize)]) -> Vec<WordCandidate<'a>> {
        let mut candidates = Vec::with_capacity(words.len());
        for &(word, start_byte, end_byte) in words {
            candidates.push(WordCandidate {
                word,
                start_byte,
                end_byte,
            });
        }
        candidates
    }

    #[test]
    fn test_check_words_flags_unknown() {
        // "wrld" is not in the dictionary and must be the only result.
        let dictionary = Arc::new(TextDictionary::new("hello\nworld\n"));
        let cfg = Arc::new(codebook_config::CodebookConfigMemory::default());
        let input = make_candidates(&[("hello", 0, 5), ("wrld", 6, 10)]);
        let found = check_words(&input, &[dictionary], cfg.as_ref());
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].word, "wrld");
    }

    #[test]
    fn test_check_words_groups_locations() {
        // The same misspelling at two spans yields one result with both spans.
        let dictionary = Arc::new(TextDictionary::new("hello\n"));
        let cfg = Arc::new(codebook_config::CodebookConfigMemory::default());
        let input = make_candidates(&[("wrld", 0, 4), ("wrld", 10, 14)]);
        let found = check_words(&input, &[dictionary], cfg.as_ref());
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].word, "wrld");
        assert_eq!(found[0].locations.len(), 2);
    }

    #[test]
    fn test_check_words_respects_min_length() {
        let dictionary = Arc::new(TextDictionary::new(""));
        let cfg = Arc::new(codebook_config::CodebookConfigMemory::default());
        // Default min word length is 3
        let input = make_candidates(&[("ab", 0, 2)]);
        let found = check_words(&input, &[dictionary], cfg.as_ref());
        assert!(found.is_empty(), "Short words should be skipped");
    }

    #[test]
    fn test_check_words_respects_allowed_words() {
        let dictionary = Arc::new(TextDictionary::new(""));
        let cfg = Arc::new(codebook_config::CodebookConfigMemory::default());
        cfg.add_word("codebook").unwrap();
        let input = make_candidates(&[("codebook", 0, 8)]);
        let found = check_words(&input, &[dictionary], cfg.as_ref());
        assert!(found.is_empty(), "Allowed words should not be flagged");
    }
}
15 changes: 0 additions & 15 deletions crates/codebook/src/dictionaries/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@ use std::{
sync::{Arc, RwLock},
};

use crate::parser::{WordLocation, find_locations};
use crate::queries::LanguageType;
use regex::Regex;

pub trait Dictionary: Send + Sync {
fn check(&self, word: &str) -> bool;
fn suggest(&self, word: &str) -> Vec<String>;
Expand Down Expand Up @@ -170,17 +166,6 @@ impl TextDictionary {
}
}

/// Integration helper to use any Dictionary trait with optimized batch processing
pub fn find_locations_with_dictionary_batch(
text: &str,
language: LanguageType,
dictionary: &dyn Dictionary,
skip_patterns: &[Regex],
) -> Vec<WordLocation> {
// For non-HashSet dictionaries, we still get deduplication benefits
find_locations(text, language, |word| dictionary.check(word), |_| true, skip_patterns)
}

#[cfg(test)]
mod dictionary_tests {
use super::*;
Expand Down
87 changes: 30 additions & 57 deletions crates/codebook/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod checker;
pub mod dictionaries;
mod logging;
pub mod parser;
Expand All @@ -6,6 +7,7 @@ pub mod regexes;
mod splitter;

use crate::regexes::get_default_skip_patterns;
use std::collections::HashSet;
use std::path::Path;
use std::sync::Arc;

Expand Down Expand Up @@ -38,56 +40,42 @@ impl Codebook {
file_path: Option<&str>,
) -> Vec<parser::WordLocation> {
if let Some(file_path) = file_path {
// ignore_paths is a blocklist and has higher precedence than include_paths
if self.config.should_ignore_path(Path::new(file_path)) {
return Vec::new();
}
// include_paths is an allowlist; empty list means "include everything"
if !self.config.should_include_path(Path::new(file_path)) {
return Vec::new();
}
}
// get needed dictionary names
// get needed dictionaries
// call spell check on each dictionary

let language = self.resolve_language(language, file_path);
let dictionaries = self.get_dictionaries(Some(language));
// Combine default and user patterns

// Combine default and user skip patterns
let mut all_patterns = get_default_skip_patterns().clone();
if let Some(user_patterns) = self.config.get_ignore_patterns() {
all_patterns.extend(user_patterns);
}
parser::find_locations(

// Extract all words, recursively following injections
let (candidates, languages_found) = parser::extract_all_words(
text,
language,
|word| {
if self.config.should_flag_word(word) {
return false;
}
if word.len() < self.config.get_min_word_length() {
return true;
}
if self.config.is_allowed_word(word) {
return true;
}
for dictionary in &dictionaries {
if dictionary.check(word) {
return true;
}
}
false
},
|tag| self.config.should_check_tag(tag),
&|tag| self.config.should_check_tag(tag),
&all_patterns,
)
);

// Load dictionaries for all languages encountered
let dictionaries = self.get_dictionaries_for_languages(&languages_found);

// Check words against dictionaries
checker::check_words(&candidates, &dictionaries, self.config.as_ref())
}

fn resolve_language(
&self,
language_type: Option<queries::LanguageType>,
path: Option<&str>,
) -> queries::LanguageType {
// Check if we have a language_id first, fallback to path, fall back to text
match language_type {
Some(lang) => lang,
None => match path {
Expand All @@ -97,21 +85,26 @@ impl Codebook {
}
}

fn get_dictionaries(
/// Gather dictionaries for all languages encountered in a file.
fn get_dictionaries_for_languages(
&self,
language: Option<queries::LanguageType>,
languages: &HashSet<queries::LanguageType>,
) -> Vec<Arc<dyn Dictionary>> {
let mut dictionary_ids = self.config.get_dictionary_ids();
if let Some(lang) = language {
let language_dictionary_ids = lang.dictionary_ids();
dictionary_ids.extend(language_dictionary_ids);
};

for lang in languages {
dictionary_ids.extend(lang.dictionary_ids());
}

dictionary_ids.extend(DEFAULT_DICTIONARIES.iter().map(|f| f.to_string()));

dictionary_ids.sort();
dictionary_ids.dedup();

let mut dictionaries = Vec::with_capacity(dictionary_ids.len());
debug!("Checking text with dictionaries: {dictionary_ids:?}");
for dictionary_id in dictionary_ids {
let dictionary = self.manager.get_dictionary(&dictionary_id);
if let Some(d) = dictionary {
if let Some(d) = self.manager.get_dictionary(&dictionary_id) {
dictionaries.push(d);
}
}
Expand All @@ -125,9 +118,8 @@ impl Codebook {
}

pub fn get_suggestions(&self, word: &str) -> Option<Vec<String>> {
// Get top suggestions and return the first 5 suggestions in round robin order
let max_results = 5;
let dictionaries = self.get_dictionaries(None);
let dictionaries = self.get_dictionaries_for_languages(&HashSet::new());
let mut is_misspelled = false;
let suggestions: Vec<Vec<String>> = dictionaries
.iter()
Expand Down Expand Up @@ -176,9 +168,7 @@ mod tests {
vec!["date", "elderberry", "fig"],
vec!["grape", "honeydew", "kiwi"],
];

let result = collect_round_robin(&sources, 5);
// Round-robin order: first from each source, then second from each source
assert_eq!(
result,
vec!["apple", "date", "grape", "banana", "elderberry"]
Expand All @@ -192,13 +182,6 @@ mod tests {
vec!["banana", "cherry", "date"],
vec!["cherry", "date", "elderberry"],
];

// In round-robin, we get:
// 1. apple (1st from 1st source)
// 2. banana (1st from 2nd source) - cherry already taken
// 3. cherry (1st from 3rd source)
// 4. banana (2nd from 1st source)
// 5. date (3rd from 2nd source) - cherry already taken
let result = collect_round_robin(&sources, 5);
assert_eq!(
result,
Expand All @@ -213,8 +196,6 @@ mod tests {
vec!["elderberry"],
vec!["fig", "grape"],
];

// Round-robin order with uneven sources
let result = collect_round_robin(&sources, 7);
assert_eq!(
result,
Expand All @@ -240,17 +221,13 @@ mod tests {
#[test]
fn test_collect_round_robin_some_empty_sources() {
let sources = vec![vec!["apple", "banana"], vec![], vec!["cherry", "date"]];

// Round-robin order, skipping empty source
let result = collect_round_robin(&sources, 4);
assert_eq!(result, vec!["apple", "cherry", "banana", "date"]);
}

#[test]
fn test_collect_round_robin_with_numbers() {
let sources = vec![vec![1, 3, 5], vec![2, 4, 6]];

// Round-robin order with numbers
let result = collect_round_robin(&sources, 6);
assert_eq!(result, vec![1, 2, 3, 4, 5, 6]);
}
Expand All @@ -262,17 +239,13 @@ mod tests {
vec!["date", "elderberry", "fig"],
vec!["grape", "honeydew", "kiwi"],
];

// First round of round-robin (first from each source)
let result = collect_round_robin(&sources, 3);
assert_eq!(result, vec!["apple", "date", "grape"]);
}

#[test]
fn test_collect_round_robin_max_count_higher_than_available() {
let sources = vec![vec!["apple", "banana"], vec!["cherry", "date"]];

// Round-robin order for all available elements
let result = collect_round_robin(&sources, 10);
assert_eq!(result, vec!["apple", "banana", "cherry", "date"]);
}
Expand Down
Loading
Loading