From 0decb10f2a7b0051398eabb439d78e7b8d0a1e93 Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 09:07:00 -0700
Subject: [PATCH 01/16] Attempt 1

---
 crates/codebook/src/checker.rs                | 120 ++++
 .../codebook/src/dictionaries/dictionary.rs   |  15 -
 crates/codebook/src/lib.rs                    |  97 ++-
 crates/codebook/src/parser.rs                 | 522 ++++++++--------
 crates/codebook/src/queries.rs                |  12 +-
 crates/codebook/src/queries/markdown.scm      |   2 -
 crates/codebook/src/regions.rs                | 301 +++++++++
 crates/codebook/tests/test_markdown.rs        |  92 ++-
 crates/codebook/tests/utils/mod.rs            |   6 +-
 examples/example.md                           |   9 +
 refactor.md                                   | 574 ++++++++++++++++++
 11 files changed, 1389 insertions(+), 361 deletions(-)
 create mode 100644 crates/codebook/src/checker.rs
 delete mode 100644 crates/codebook/src/queries/markdown.scm
 create mode 100644 crates/codebook/src/regions.rs
 create mode 100644 refactor.md
diff --git a/crates/codebook/src/checker.rs b/crates/codebook/src/checker.rs
new file mode 100644
index 00000000..e87d9069
--- /dev/null
+++ b/crates/codebook/src/checker.rs
@@ -0,0 +1,120 @@
+use std::collections::HashMap;
+
+use crate::dictionaries::dictionary::Dictionary;
+use crate::parser::{TextRange, WordLocation};
+use codebook_config::CodebookConfig;
+
+/// A single word occurrence taken from a text node. `start_byte..end_byte`
+/// is the word's byte span in the original document, not within the node.
+#[derive(Debug, Clone, PartialEq)]
+pub struct WordCandidate {
+    pub word: String,
+    pub start_byte: usize,
+    pub end_byte: usize,
+}
+
+/// Check candidate words against dictionaries and config rules.
+///
+/// Occurrences of the same word are grouped so each distinct word is checked
+/// once and reported with all of its locations. A word flagged by
+/// `config.should_flag_word` is always reported; otherwise a word is skipped when
+/// it is shorter than `config.get_min_word_length()` bytes, is an allowed word, or
+/// is accepted by any dictionary. Results are sorted by first location, then by
+/// word, so the order does not depend on `HashMap` iteration.
+pub fn check_words(
+    candidates: &[WordCandidate],
+    dictionaries: &[std::sync::Arc<dyn Dictionary>],
+    config: &dyn CodebookConfig,
+) -> Vec<WordLocation> {
+    // Deduplicate: group candidates by word text
+    let mut word_positions: HashMap<&str, Vec<TextRange>> = HashMap::new();
+    for candidate in candidates {
+        let range = TextRange {
+            start_byte: candidate.start_byte,
+            end_byte: candidate.end_byte,
+        };
+        word_positions.entry(candidate.word.as_str()).or_default().push(range);
+    }
+
+    // Check each unique word once
+    let mut results = Vec::new();
+    for (word, positions) in word_positions {
+        // Flagged words bypass the length, allow-list and dictionary checks.
+        let accepted = !config.should_flag_word(word)
+            && (word.len() < config.get_min_word_length()
+                || config.is_allowed_word(word)
+                || dictionaries.iter().any(|dict| dict.check(word)));
+        if !accepted {
+            results.push(WordLocation::new(word.to_string(), positions));
+        }
+    }
+    // `HashMap` iteration order is arbitrary; impose a stable order on the output.
+    let first_start = |found: &WordLocation| found.locations.first().map(|r| r.start_byte);
+    results.sort_by(|a, b| (first_start(a), &a.word).cmp(&(first_start(b), &b.word)));
+    results
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::dictionaries::dictionary::TextDictionary;
+    use std::sync::Arc;
+
+    /// Build candidates from `(word, start_byte, end_byte)` triples, keeping
+    /// the order they are given in.
+    fn make_candidates(words: &[(&str, usize, usize)]) -> Vec<WordCandidate> {
+        let mut candidates = Vec::with_capacity(words.len());
+        for &(word, start_byte, end_byte) in words {
+            candidates.push(WordCandidate {
+                word: word.to_string(),
+                start_byte,
+                end_byte,
+            });
+        }
+        candidates
+    }
+
+    #[test]
+    fn test_check_words_flags_unknown() {
+        let config = Arc::new(codebook_config::CodebookConfigMemory::default());
+        let dict = Arc::new(TextDictionary::new("hello\nworld\n"));
+        let candidates = make_candidates(&[("hello", 0, 5), ("wrld", 6, 10)]);
+        let misspelled = check_words(&candidates, &[dict], config.as_ref());
+        // Only the word missing from the dictionary is reported.
+        let words: Vec<&str> = misspelled.iter().map(|found| found.word.as_str()).collect();
+        assert_eq!(words, ["wrld"]);
+    }
+
+    #[test]
+    fn test_check_words_groups_locations() {
+        let config = Arc::new(codebook_config::CodebookConfigMemory::default());
+        let dict = Arc::new(TextDictionary::new("hello\n"));
+        let candidates = make_candidates(&[("wrld", 0, 4), ("wrld", 10, 14)]);
+        let misspelled = check_words(&candidates, &[dict], config.as_ref());
+        // Both occurrences collapse into a single entry with two locations.
+        assert_eq!(misspelled.len(), 1);
+        assert_eq!(misspelled[0].word, "wrld");
+        assert_eq!(misspelled[0].locations.len(), 2);
+    }
+
+    #[test]
+    fn test_check_words_respects_min_length() {
+        let config = Arc::new(codebook_config::CodebookConfigMemory::default());
+        let dict = Arc::new(TextDictionary::new(""));
+        // Default min word length is 3
+        let candidates = make_candidates(&[("ab", 0, 2)]);
+        let misspelled = check_words(&candidates, &[dict], config.as_ref());
+        assert!(misspelled.is_empty(), "Short words should be skipped");
+    }
+
+    #[test]
+    fn test_check_words_respects_allowed_words() {
+        let config = Arc::new(codebook_config::CodebookConfigMemory::default());
+        // The dictionary is empty, so only the config's word list can accept the word.
+        config.add_word("codebook").unwrap();
+        let dict = Arc::new(TextDictionary::new(""));
+        let candidates = make_candidates(&[("codebook", 0, 8)]);
+        let misspelled = check_words(&candidates, &[dict], config.as_ref());
+        assert!(misspelled.is_empty(), "Allowed words should not be flagged");
+    }
+}
diff --git a/crates/codebook/src/dictionaries/dictionary.rs b/crates/codebook/src/dictionaries/dictionary.rs
index 82e75d76..3ba0d800 100644
--- a/crates/codebook/src/dictionaries/dictionary.rs
+++ b/crates/codebook/src/dictionaries/dictionary.rs
@@ -7,10 +7,6 @@ use std::{
     sync::{Arc, RwLock},
 };
 
-use crate::parser::{WordLocation, find_locations};
-use crate::queries::LanguageType;
-use regex::Regex;
-
 pub trait Dictionary: Send + Sync {
     fn check(&self, word: &str) -> bool;
     fn suggest(&self, word: &str) -> Vec<String>;
@@ -170,17 +166,6 @@ impl TextDictionary {
     }
 }
 
-/// Integration helper to use any Dictionary trait with optimized batch processing
-pub fn find_locations_with_dictionary_batch(
-    text: &str,
-    language: LanguageType,
-    dictionary: &dyn Dictionary,
-    skip_patterns: &[Regex],
-) -> Vec<WordLocation> {
-    // For non-HashSet dictionaries, we still get deduplication benefits
-    find_locations(text, language, |word| dictionary.check(word), |_| true, skip_patterns)
-}
-
 #[cfg(test)]
 mod dictionary_tests {
     use super::*;
diff --git a/crates/codebook/src/lib.rs b/crates/codebook/src/lib.rs
index 5cc6840b..2f91e643 100644
--- a/crates/codebook/src/lib.rs
+++ b/crates/codebook/src/lib.rs
@@ -1,11 +1,14 @@
+pub mod checker;
 pub mod dictionaries;
 mod logging;
 pub mod parser;
 pub mod queries;
 pub mod regexes;
+pub mod regions;
 mod splitter;
 
 use crate::regexes::get_default_skip_patterns;
+use std::collections::HashSet;
 use std::path::Path;
 use std::sync::Arc;
 
@@ -47,39 +50,34 @@ impl Codebook {
                 return Vec::new();
             }
         }
-        // get needed dictionary names
-        // get needed dictionaries
-        // call spell check on each dictionary
+
         let language = self.resolve_language(language, file_path);
-        let dictionaries = self.get_dictionaries(Some(language));
-        // Combine default and user patterns
+
+        // Combine default and user skip patterns
         let mut all_patterns = get_default_skip_patterns().clone();
         if let Some(user_patterns) = self.config.get_ignore_patterns() {
             all_patterns.extend(user_patterns);
         }
-        parser::find_locations(
-            text,
-            language,
-            |word| {
-                if self.config.should_flag_word(word) {
-                    return false;
-                }
-                if word.len() < self.config.get_min_word_length() {
-                    return true;
-                }
-                if self.config.is_allowed_word(word) {
-                    return true;
-                }
-                for dictionary in &dictionaries {
-                    if dictionary.check(word) {
-                        return true;
-                    }
-                }
-                false
-            },
-            |tag| self.config.should_check_tag(tag),
-            &all_patterns,
-        )
+
+        // Stage 1: Split into language regions
+        let text_regions = regions::extract_regions(text, language);
+
+        // Collect dictionaries for all languages present in the file
+        let dictionaries = self.get_dictionaries_for_languages(&text_regions);
+
+        // Stages 2+3: Extract nodes and words from each region
+        let mut all_candidates = Vec::new();
+        for region in &text_regions {
+            // Stage 2: Node extraction
+            let nodes =
+                parser::extract_nodes(text, region, &|tag| self.config.should_check_tag(tag));
+            // Stage 3: Word extraction
+            let candidates = parser::extract_words(text, &nodes, &all_patterns);
+            all_candidates.extend(candidates);
+        }
+
+        // Stage 4: Word checking
+        checker::check_words(&all_candidates, &dictionaries, self.config.as_ref())
     }
 
     fn resolve_language(
@@ -87,7 +85,6 @@ impl Codebook {
         language_type: Option<queries::LanguageType>,
         path: Option<&str>,
     ) -> queries::LanguageType {
-        // Check if we have a language_id first, fallback to path, fall back to text
         match language_type {
             Some(lang) => lang,
             None => match path {
@@ -97,21 +94,32 @@ impl Codebook {
         }
     }
 
-    fn get_dictionaries(
+    /// Gather dictionaries for all languages present in a file.
+    fn get_dictionaries_for_languages(
         &self,
-        language: Option<queries::LanguageType>,
+        regions: &[regions::TextRegion],
     ) -> Vec<Arc<dyn Dictionary>> {
         let mut dictionary_ids = self.config.get_dictionary_ids();
-        if let Some(lang) = language {
-            let language_dictionary_ids = lang.dictionary_ids();
-            dictionary_ids.extend(language_dictionary_ids);
-        };
+
+        // Add language-specific dictionaries for all languages in the file
+        let mut seen_languages = HashSet::new();
+        for region in regions {
+            if seen_languages.insert(region.language) {
+                dictionary_ids.extend(region.language.dictionary_ids());
+            }
+        }
+
+        // Add defaults
         dictionary_ids.extend(DEFAULT_DICTIONARIES.iter().map(|f| f.to_string()));
+
+        // Deduplicate
+        dictionary_ids.sort();
+        dictionary_ids.dedup();
+
         let mut dictionaries = Vec::with_capacity(dictionary_ids.len());
         debug!("Checking text with dictionaries: {dictionary_ids:?}");
         for dictionary_id in dictionary_ids {
-            let dictionary = self.manager.get_dictionary(&dictionary_id);
-            if let Some(d) = dictionary {
+            if let Some(d) = self.manager.get_dictionary(&dictionary_id) {
                 dictionaries.push(d);
             }
         }
@@ -125,9 +133,8 @@ impl Codebook {
     }
 
     pub fn get_suggestions(&self, word: &str) -> Option<Vec<String>> {
-        // Get top suggestions and return the first 5 suggestions in round robin order
         let max_results = 5;
-        let dictionaries = self.get_dictionaries(None);
+        let dictionaries = self.get_dictionaries_for_languages(&[]);
         let mut is_misspelled = false;
         let suggestions: Vec<Vec<String>> = dictionaries
             .iter()
@@ -178,7 +185,6 @@ mod tests {
         ];
 
         let result = collect_round_robin(&sources, 5);
-        // Round-robin order: first from each source, then second from each source
         assert_eq!(
             result,
             vec!["apple", "date", "grape", "banana", "elderberry"]
@@ -193,12 +199,6 @@ mod tests {
             vec!["cherry", "date", "elderberry"],
         ];
 
-        // In round-robin, we get:
-        // 1. apple (1st from 1st source)
-        // 2. banana (1st from 2nd source) - cherry already taken
-        // 3. cherry (1st from 3rd source)
-        // 4. banana (2nd from 1st source)
-        // 5. date (3rd from 2nd source) - cherry already taken
         let result = collect_round_robin(&sources, 5);
         assert_eq!(
             result,
@@ -214,7 +214,6 @@ mod tests {
             vec!["fig", "grape"],
         ];
 
-        // Round-robin order with uneven sources
         let result = collect_round_robin(&sources, 7);
         assert_eq!(
             result,
@@ -241,7 +240,6 @@ mod tests {
     fn test_collect_round_robin_some_empty_sources() {
         let sources = vec![vec!["apple", "banana"], vec![], vec!["cherry", "date"]];
 
-        // Round-robin order, skipping empty source
         let result = collect_round_robin(&sources, 4);
         assert_eq!(result, vec!["apple", "cherry", "banana", "date"]);
     }
@@ -250,7 +248,6 @@ mod tests {
     fn test_collect_round_robin_with_numbers() {
         let sources = vec![vec![1, 3, 5], vec![2, 4, 6]];
 
-        // Round-robin order with numbers
         let result = collect_round_robin(&sources, 6);
         assert_eq!(result, vec![1, 2, 3, 4, 5, 6]);
     }
@@ -263,7 +260,6 @@ mod tests {
             vec!["grape", "honeydew", "kiwi"],
         ];
 
-        // First round of round-robin (first from each source)
         let result = collect_round_robin(&sources, 3);
         assert_eq!(result, vec!["apple", "date", "grape"]);
     }
@@ -272,7 +268,6 @@ mod tests {
     fn test_collect_round_robin_max_count_higher_than_available() {
         let sources = vec![vec!["apple", "banana"], vec!["cherry", "date"]];
 
-        // Round-robin order for all available elements
         let result = collect_round_robin(&sources, 10);
         assert_eq!(result, vec!["apple", "banana", "cherry", "date"]);
     }
diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs
index 894c3b1b..f655a285 100644
--- a/crates/codebook/src/parser.rs
+++ b/crates/codebook/src/parser.rs
@@ -1,8 +1,9 @@
-use crate::splitter::{self};
-
+use crate::checker::WordCandidate;
 use crate::queries::{LanguageType, get_language_setting};
+use crate::regions::TextRegion;
+use crate::splitter;
 use regex::Regex;
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::sync::{LazyLock, Mutex};
 use streaming_iterator::StreamingIterator;
 use tree_sitter::{Parser, Query, QueryCursor};
@@ -79,88 +80,6 @@ fn merge_overlapping_ranges(ranges: Vec<SkipRange>) -> Vec<SkipRange> {
     merged
 }
 
-/// Helper struct to handle text position tracking and word extraction
-struct TextProcessor {
-    text: String,
-    skip_ranges: Vec<SkipRange>,
-}
-
-impl TextProcessor {
-    fn new(text: &str, skip_patterns: &[Regex]) -> Self {
-        let skip_ranges = find_skip_ranges(text, skip_patterns);
-        Self {
-            text: text.to_string(),
-            skip_ranges,
-        }
-    }
-
-    fn should_skip(&self, start_byte: usize, word_len: usize) -> bool {
-        is_within_skip_range(start_byte, start_byte + word_len, &self.skip_ranges)
-    }
-
-    fn process_words_with_check<F>(&self, mut check_function: F) -> Vec<WordLocation>
-    where
-        F: FnMut(&str) -> bool,
-    {
-        // First pass: collect all unique words with their positions
-        let estimated_words = (self.text.len() as f64 / 6.0).ceil() as usize;
-        let mut word_positions: HashMap<&str, Vec<TextRange>> =
-            HashMap::with_capacity(estimated_words);
-
-        for (offset, word) in self.text.split_word_bound_indices() {
-            if is_alphabetic(word) && !self.should_skip(offset, word.len()) {
-                self.collect_split_words(word, offset, &mut word_positions);
-            }
-        }
-
-        // Second pass: batch check unique words and filter
-        let mut result_locations: HashMap<String, Vec<TextRange>> = HashMap::new();
-        for (word_text, positions) in word_positions {
-            if !check_function(word_text) {
-                result_locations.insert(word_text.to_string(), positions);
-            }
-        }
-
-        result_locations
-            .into_iter()
-            .map(|(word, locations)| WordLocation::new(word, locations))
-            .collect()
-    }
-
-    fn extract_words(&self) -> Vec<WordLocation> {
-        // Reuse the word collection logic by collecting all words (check always returns false)
-        self.process_words_with_check(|_| false)
-    }
-
-    fn collect_split_words<'a>(
-        &self,
-        word: &'a str,
-        offset: usize,
-        word_positions: &mut HashMap<&'a str, Vec<TextRange>>,
-    ) {
-        if !word.is_empty() {
-            let split = splitter::split(word);
-            for split_word in split {
-                if !is_numeric(split_word.word) {
-                    let word_start_byte = offset + split_word.start_byte;
-                    let location = TextRange {
-                        start_byte: word_start_byte,
-                        end_byte: word_start_byte + split_word.word.len(),
-                    };
-                    let word_text = split_word.word;
-                    word_positions.entry(word_text).or_default().push(location);
-                }
-            }
-        }
-    }
-}
-
-#[derive(Debug, Clone, PartialEq)]
-pub struct WordRef<'a> {
-    pub word: &'a str,
-    pub position: (u32, u32), // (start_char, line)
-}
-
 #[derive(Debug, Clone, PartialEq)]
 pub struct WordLocation {
     pub word: String,
@@ -173,40 +92,67 @@ impl WordLocation {
     }
 }
 
-pub fn find_locations(
-    text: &str,
-    language: LanguageType,
-    check_function: impl Fn(&str) -> bool,
-    tag_filter: impl Fn(&str) -> bool,
-    skip_patterns: &[Regex],
-) -> Vec<WordLocation> {
-    match language {
-        LanguageType::Text => {
-            let processor = TextProcessor::new(text, skip_patterns);
-            processor.process_words_with_check(|word| check_function(word))
+// =============================================================================
+// Stage 2: Node Extraction
+// =============================================================================
+
+/// A text span extracted from a tree-sitter query match or plain text region.
+/// Coordinates are in original-document byte offsets.
+#[derive(Debug, Clone)]
+pub struct TextNode {
+    /// Byte range start in the original document
+    pub start_byte: usize,
+    /// Byte range end in the original document
+    pub end_byte: usize,
+    /// Owned copy of the node's text; offsets found inside it are relative to `start_byte`
+    pub text: String,
+}
+
+/// Extract spellcheckable text nodes from a region.
+/// Text and markdown prose regions yield the whole region as a single node;
+/// every other language is parsed with tree-sitter and queried for nodes.
+/// Returns no nodes if the region's byte range cannot be sliced out of
+/// `document_text`. All byte offsets are in original document coordinates.
+pub fn extract_nodes(
+    document_text: &str,
+    region: &TextRegion,
+    tag_filter: &dyn Fn(&str) -> bool,
+) -> Vec<TextNode> {
+    // Checked slicing: a bad region range yields no nodes rather than a panic.
+    let Some(region_text) = document_text.get(region.start_byte..region.end_byte) else {
+        return Vec::new();
+    };
+    match region.language {
+        LanguageType::Text | LanguageType::Markdown => vec![TextNode {
+            start_byte: region.start_byte,
+            end_byte: region.end_byte,
+            text: region_text.to_string(),
+        }],
+        _ => {
+            // Code: parse with tree-sitter, run query, extract captured nodes
+            extract_nodes_with_treesitter(
+                region_text,
+                region.start_byte,
+                region.language,
+                tag_filter,
+            )
         }
-        _ => find_locations_code(
-            text,
-            language,
-            |word| check_function(word),
-            &tag_filter,
-            skip_patterns,
-        ),
     }
 }
 
-fn find_locations_code(
+/// Parse text with tree-sitter and extract nodes matching the language's query.
+fn extract_nodes_with_treesitter(
     text: &str,
+    base_offset: usize,
     language: LanguageType,
-    check_function: impl Fn(&str) -> bool,
     tag_filter: &dyn Fn(&str) -> bool,
-    skip_patterns: &[Regex],
-) -> Vec<WordLocation> {
-    let language_setting =
-        get_language_setting(language).expect("This _should_ never happen. Famous last words.");
+) -> Vec<TextNode> {
+    let language_setting = match get_language_setting(language) {
+        Some(s) => s,
+        None => return Vec::new(),
+    };
 
     // Parse under global lock to protect external scanners with global C state.
-    // The lock covers create + parse; Tree is fully owned after parse returns.
     let tree = {
         let mut cache = PARSER_CACHE.lock().unwrap();
         let parser = cache.entry(language).or_insert_with(|| {
@@ -223,74 +169,82 @@ fn find_locations_code(
     let query = Query::new(&lang, language_setting.query).unwrap();
     let capture_names = query.capture_names();
     let mut cursor = QueryCursor::new();
-    let mut word_locations: HashMap<String, HashSet<TextRange>> = HashMap::new();
     let provider = text.as_bytes();
     let mut matches_query = cursor.matches(&query, root_node, provider);
 
-    // Find all skip ranges from patterns matched against the full source text
-    let all_skip_ranges = find_skip_ranges(text, skip_patterns);
-
+    let mut nodes = Vec::new();
     while let Some(match_) = matches_query.next() {
         for capture in match_.captures {
-            // Filter by tag
             let tag = &capture_names[capture.index as usize];
-            if !tag_filter(tag) {
+            // Skip internal tags and filtered tags
+            if *tag == "language" || !tag_filter(tag) {
                 continue;
             }
-
             let node = capture.node;
-            let node_start_byte = node.start_byte();
-
             let node_text = node.utf8_text(provider).unwrap();
-            let processor = TextProcessor::new(node_text, &[]);
-            let words = processor.extract_words();
-
-            // Check words against global skip ranges and dictionary
-            for word_pos in words {
-                if !check_function(&word_pos.word) {
-                    for range in word_pos.locations {
-                        let global_start = range.start_byte + node_start_byte;
-                        let global_end = range.end_byte + node_start_byte;
-
-                        // Skip if word is entirely within a skip range
-                        if is_within_skip_range(global_start, global_end, &all_skip_ranges) {
-                            continue;
-                        }
-
-                        let location = TextRange {
-                            start_byte: global_start,
-                            end_byte: global_end,
-                        };
-                        if let Some(existing_result) = word_locations.get_mut(&word_pos.word) {
-                            let added = existing_result.insert(location);
-                            debug_assert!(
-                                added,
-                                "Two of the same locations found. Make a better query. Word: {}, Location: {:?}",
-                                word_pos.word, location
-                            );
-                        } else {
-                            let mut set = HashSet::new();
-                            set.insert(location);
-                            word_locations.insert(word_pos.word.clone(), set);
-                        }
-                    }
-                }
-            }
+            nodes.push(TextNode {
+                start_byte: node.start_byte() + base_offset,
+                end_byte: node.end_byte() + base_offset,
+                text: node_text.to_string(),
+            });
         }
     }
+    nodes
+}
+
+// =============================================================================
+// Stage 3: Word Extraction
+// =============================================================================
+
+/// Turn text nodes into candidate words, dropping anything a skip pattern covers.
+/// Skip patterns are matched against the whole document, and every returned
+/// candidate carries original-document byte offsets.
+pub fn extract_words(
+    document_text: &str,
+    nodes: &[TextNode],
+    skip_patterns: &[Regex],
+) -> Vec<WordCandidate> {
+    // Skip ranges live in document coordinates, so one scan serves every node.
+    let skip_ranges = find_skip_ranges(document_text, skip_patterns);
+
+    nodes.iter().fold(Vec::new(), |mut found, node| {
+        extract_words_from_text(&node.text, node.start_byte, &skip_ranges, &mut found);
+        found
+    })
+}
 
-    word_locations
-        .keys()
-        .map(|word| WordLocation {
-            word: word.clone(),
-            locations: word_locations
-                .get(word)
-                .cloned()
-                .unwrap_or_default()
-                .into_iter()
-                .collect(),
-        })
-        .collect()
+/// Extract words from a text span, applying skip ranges and word splitting.
+/// `base_offset` is where `text` starts in the document; it is added to every
+/// position so skip ranges and candidates share document coordinates.
+fn extract_words_from_text(
+    text: &str,
+    base_offset: usize,
+    skip_ranges: &[SkipRange],
+    candidates: &mut Vec<WordCandidate>,
+) {
+    for (offset, token) in text.split_word_bound_indices() {
+        if !is_alphabetic(token) {
+            continue;
+        }
+        let token_start = base_offset + offset;
+        if is_within_skip_range(token_start, token_start + token.len(), skip_ranges) {
+            continue;
+        }
+        let pieces = splitter::split(token)
+            .into_iter()
+            .filter(|piece| !is_numeric(piece.word))
+            .map(|piece| {
+                let start = token_start + piece.start_byte;
+                (piece.word, start, start + piece.word.len())
+            })
+            .filter(|&(_, start, end)| !is_within_skip_range(start, end, skip_ranges))
+            .map(|(word, start_byte, end_byte)| WordCandidate {
+                word: word.to_string(),
+                start_byte,
+                end_byte,
+            });
+        candidates.extend(pieces);
+    }
 }
 
 fn is_numeric(s: &str) -> bool {
@@ -314,153 +268,173 @@ pub fn get_word_from_string(start_utf16: usize, end_utf16: usize, text: &str) ->
 #[cfg(test)]
 mod parser_tests {
     use super::*;
+    use crate::regions::TextRegion;
 
     #[test]
-    fn test_spell_checking() {
+    fn test_extract_words_basic() {
         let text = "HelloWorld calc_wrld";
-        let results = find_locations(text, LanguageType::Text, |_| false, |_| true, &[]);
-        println!("{results:?}");
-        assert_eq!(results.len(), 4);
+        let whole_text = TextNode {
+            start_byte: 0,
+            end_byte: text.len(),
+            text: text.to_string(),
+        };
+        let words = extract_words(text, &[whole_text], &[]);
+        // Camel case and snake case are both expected to split into their parts.
+        for expected in ["Hello", "World", "calc", "wrld"] {
+            let found = words.iter().any(|candidate| candidate.word == expected);
+            assert!(found, "Expected word '{expected}' not found");
+        }
+        assert_eq!(words.len(), 4);
     }
 
     #[test]
-    fn test_get_words_from_text() {
-        let text = r#"
-            HelloWorld calc_wrld
-            I'm a contraction, don't ignore me
-            this is a 3rd line.
-            "#;
-        let expected = vec![
-            ("Hello", (13, 18)),
-            ("World", (18, 23)),
-            ("calc", (24, 28)),
-            ("wrld", (29, 33)),
-            ("I'm", (46, 49)),
-            ("a", (50, 51)),
-            ("contraction", (52, 63)),
-            ("don't", (65, 70)),
-            ("ignore", (71, 77)),
-            ("me", (78, 80)),
-            ("this", (93, 97)),
-            ("is", (98, 100)),
-            ("a", (101, 102)),
-            ("rd", (104, 106)),
-            ("line", (107, 111)),
-        ];
-        let processor = TextProcessor::new(text, &[]);
-        let words = processor.extract_words();
-        println!("{words:?}");
-        for word in words {
-            let loc = word.locations.first().unwrap();
-            let pos = (loc.start_byte, loc.end_byte);
+    fn test_extract_words_contraction() {
+        let text = "I'm a contraction, wouldn't you agree'?";
+        let whole_text = TextNode {
+            start_byte: 0,
+            end_byte: text.len(),
+            text: text.to_string(),
+        };
+        let words = extract_words(text, &[whole_text], &[]);
+        // Inner apostrophes should stay attached; the one trailing "agree" should not.
+        for e in ["I'm", "a", "contraction", "wouldn't", "you", "agree"] {
+            let found = words.iter().any(|candidate| candidate.word == e);
+            assert!(found, "Expected word '{e}' not found");
+        }
+    }
+
+    #[test]
+    fn test_extract_nodes_plain_text() {
+        let text = "hello world";
+        let whole_document = TextRegion {
+            start_byte: 0,
+            end_byte: text.len(),
+            language: LanguageType::Text,
+        };
+        let nodes = extract_nodes(text, &whole_document, &|_| true);
+        // Plain text is not parsed: the region comes back as one node.
+        assert_eq!(nodes.len(), 1);
+        assert_eq!((nodes[0].start_byte, nodes[0].text.as_str()), (0, text));
+    }
+
+    #[test]
+    fn test_extract_nodes_code() {
+        let text = "// a comment\nfn main() {}";
+        let rust_region = TextRegion {
+            start_byte: 0,
+            end_byte: text.len(),
+            language: LanguageType::Rust,
+        };
+        let nodes = extract_nodes(text, &rust_region, &|_| true);
+        // With every tag allowed, the line comment must be among the nodes
+        assert!(!nodes.is_empty());
+        let has_comment = nodes.iter().any(|node| node.text.contains("comment"));
+        assert!(has_comment, "Should find comment node");
+    }
+
+    #[test]
+    fn test_extract_nodes_with_base_offset() {
+        // Simulate a code block starting at byte 50 in a larger document
+        let code = "// hello world";
+        let mut document = " ".repeat(50);
+        document.push_str(code);
+        let region = TextRegion {
+            start_byte: 50,
+            end_byte: document.len(),
+            language: LanguageType::Rust,
+        };
+        let nodes = extract_nodes(&document, &region, &|_| true);
+        assert!(!nodes.is_empty());
+        // All node offsets should be >= 50
+        let shifted = nodes.iter().all(|node| node.start_byte >= 50);
+        assert!(shifted, "Node offset should include base offset");
+    }
+
+    #[test]
+    fn test_extract_nodes_tag_filter() {
+        let text = "// comment\nlet x = \"string\";";
+        let region = TextRegion {
+            start_byte: 0,
+            end_byte: text.len(),
+            language: LanguageType::Rust,
+        };
+        // Only allow comment tags; without the emptiness check the loop passes vacuously
+        let nodes = extract_nodes(text, &region, &|tag| tag.starts_with("comment"));
+        assert!(!nodes.is_empty(), "Comment node should survive the tag filter");
+        for node in &nodes {
             assert!(
-                expected.contains(&(word.word.as_str(), pos)),
-                "Expected word '{}' to be at position {:?}",
-                word.word,
-                pos
+                node.text.contains("comment"),
+                "Expected only comment nodes, got: {:?}",
+                node.text
             );
         }
     }
 
     #[test]
-    fn test_contraction() {
-        let text = "I'm a contraction, wouldn't you agree'?";
-        let processor = TextProcessor::new(text, &[]);
-        let words = processor.extract_words();
-        println!("{words:?}");
-        let expected = ["I'm", "a", "contraction", "wouldn't", "you", "agree"];
-        for word in words {
-            assert!(expected.contains(&word.word.as_str()));
-        }
+    fn test_extract_words_with_skip_patterns() {
+        let text = "check https://example.com this";
+        let url_pattern = Regex::new(r"https?://[^\s]+").unwrap();
+        let nodes = vec![TextNode {
+            start_byte: 0,
+            end_byte: text.len(),
+            text: text.to_string(),
+        }];
+        let words = extract_words(text, &nodes, &[url_pattern]);
+        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(word_strs.contains(&"check"));
+        assert!(word_strs.contains(&"this"));
+        // URL components should be skipped
+        assert!(!word_strs.contains(&"https"));
+        assert!(!word_strs.contains(&"example"));
     }
 
     #[test]
     fn test_get_word_from_string() {
-        // Test with ASCII characters
         let text = "Hello World";
         assert_eq!(get_word_from_string(0, 5, text), "Hello");
         assert_eq!(get_word_from_string(6, 11, text), "World");
-
-        // Test with partial words
         assert_eq!(get_word_from_string(2, 5, text), "llo");
 
-        // Test with Unicode characters
         let unicode_text = "こんにちは世界";
         assert_eq!(get_word_from_string(0, 5, unicode_text), "こんにちは");
         assert_eq!(get_word_from_string(5, 7, unicode_text), "世界");
 
-        // Test with emoji (which can be multi-codepoint)
         let emoji_text = "Hello 👨‍👩‍👧‍👦 World";
         assert_eq!(get_word_from_string(6, 17, emoji_text), "👨‍👩‍👧‍👦");
     }
+
     #[test]
     fn test_unicode_character_handling() {
         crate::logging::init_test_logging();
         let text = "©<div>badword</div>";
-        let processor = TextProcessor::new(text, &[]);
-        let words = processor.extract_words();
-        println!("{words:?}");
-
-        // Make sure "badword" is included and correctly positioned
-        assert!(words.iter().any(|word| word.word == "badword"));
-
-        // If "badword" is found, verify its position
-        if let Some(pos) = words.iter().find(|word| word.word == "badword") {
-            // The correct position should be 6 (after ©<div>)
-            let start_byte = pos.locations.first().unwrap().start_byte;
-            let end_byte = pos.locations.first().unwrap().end_byte;
-            assert_eq!(
-                start_byte, 7,
-                "Expected 'badword' to start at character position 7"
-            );
-            assert_eq!(end_byte, 14, "Expected 'badword' to be on end_byte 14");
-        } else {
-            panic!("Word 'badword' not found in the text");
-        }
+        let nodes = vec![TextNode {
+            start_byte: 0,
+            end_byte: text.len(),
+            text: text.to_string(),
+        }];
+        let words = extract_words(text, &nodes, &[]);
+        let badword = words.iter().find(|w| w.word == "badword");
+        assert!(badword.is_some(), "Expected 'badword' to be found");
+        let bw = badword.unwrap();
+        assert_eq!(bw.start_byte, 7, "Expected 'badword' to start at byte 7");
+        assert_eq!(bw.end_byte, 14, "Expected 'badword' to end at byte 14");
     }
 
     #[test]
-    fn test_duplicate_word_locations() {
-        // Use a code language to exercise find_locations_code path
+    fn test_duplicate_word_locations_code() {
         let text = "// wrld foo wrld";
-        let results = find_locations(text, LanguageType::Rust, |_| false, |_| true, &[]);
-        let wrld = results.iter().find(|loc| loc.word == "wrld").unwrap();
+        let region = TextRegion {
+            start_byte: 0,
+            end_byte: text.len(),
+            language: LanguageType::Rust,
+        };
+        let nodes = extract_nodes(text, &region, &|_| true);
+        let words = extract_words(text, &nodes, &[]);
+        let wrld_words: Vec<_> = words.iter().filter(|w| w.word == "wrld").collect();
         assert_eq!(
-            wrld.locations.len(),
+            wrld_words.len(),
             2,
-            "Expected two locations for repeated word 'wrld'"
+            "Expected two occurrences of 'wrld'"
         );
     }
-
-    // Something is up with the HTML tree-sitter package
-    // #[test]
-    // fn test_spell_checking_with_unicode() {
-    //     crate::log::init_test_logging();
-    //     let text = "©<div>badword</div>";
-
-    //     // Mock spell check function that flags "badword"
-    //     let results = find_locations(text, LanguageType::Html, |word| word != "badword");
-
-    //     println!("{:?}", results);
-
-    //     // Ensure "badword" is flagged
-    //     let badword_result = results.iter().find(|loc| loc.word == "badword");
-    //     assert!(badword_result.is_some(), "Expected 'badword' to be flagged");
-
-    //     // Check if the location is correct
-    //     if let Some(location) = badword_result {
-    //         assert_eq!(
-    //             location.locations.len(),
-    //             1,
-    //             "Expected exactly one location for 'badword'"
-    //         );
-    //         let range = &location.locations[0];
-
-    //         // The word should start after "©<div>" which is 6 characters
-    //         assert_eq!(range.start_char, 6, "Wrong start position for 'badword'");
-
-    //         // The word should end after "badword" which is 13 characters from the start
-    //         assert_eq!(range.end_char, 13, "Wrong end position for 'badword'");
-    //     }
-    // }
 }
diff --git a/crates/codebook/src/queries.rs b/crates/codebook/src/queries.rs
index ccf83df9..a352f8db 100644
--- a/crates/codebook/src/queries.rs
+++ b/crates/codebook/src/queries.rs
@@ -204,7 +204,7 @@ pub static LANGUAGE_SETTINGS: &[LanguageSetting] = &[
         type_: LanguageType::Markdown,
         ids: &["markdown"],
         dictionary_ids: &[],
-        query: include_str!("queries/markdown.scm"),
+        query: "",
         extensions: &["md", "markdown"],
     },
     LanguageSetting {
@@ -299,7 +299,7 @@ impl LanguageSetting {
             LanguageType::Javascript => Some(tree_sitter_javascript::LANGUAGE.into()),
             LanguageType::Latex => Some(codebook_tree_sitter_latex::LANGUAGE.into()),
             LanguageType::Lua => Some(tree_sitter_lua::LANGUAGE.into()),
-            LanguageType::Markdown => Some(tree_sitter_md::LANGUAGE.into()),
+            LanguageType::Markdown => None, // Handled by region extraction
             LanguageType::Odin => Some(tree_sitter_odin_codebook::LANGUAGE.into()),
             LanguageType::Php => Some(tree_sitter_php::LANGUAGE_PHP.into()),
             LanguageType::Python => Some(tree_sitter_python::LANGUAGE.into()),
@@ -346,7 +346,9 @@ mod tests {
     fn test_all_queries_are_valid() {
         for language_setting in LANGUAGE_SETTINGS {
             // Skip testing Text since it doesn't have a language or query
-            if language_setting.type_ == LanguageType::Text {
+            if language_setting.type_ == LanguageType::Text
+                || language_setting.type_ == LanguageType::Markdown
+            {
                 continue;
             }
 
@@ -395,7 +397,9 @@ mod tests {
     #[test]
     fn test_all_capture_names_use_allowed_tags() {
         for language_setting in LANGUAGE_SETTINGS {
-            if language_setting.type_ == LanguageType::Text {
+            if language_setting.type_ == LanguageType::Text
+                || language_setting.type_ == LanguageType::Markdown
+            {
                 continue;
             }
 
diff --git a/crates/codebook/src/queries/markdown.scm b/crates/codebook/src/queries/markdown.scm
deleted file mode 100644
index 8c1c6b2a..00000000
--- a/crates/codebook/src/queries/markdown.scm
+++ /dev/null
@@ -1,2 +0,0 @@
-(paragraph (inline) @string)
-(atx_heading (inline) @string)
diff --git a/crates/codebook/src/regions.rs b/crates/codebook/src/regions.rs
new file mode 100644
index 00000000..3d3db789
--- /dev/null
+++ b/crates/codebook/src/regions.rs
@@ -0,0 +1,301 @@
+use crate::queries::LanguageType;
+use std::collections::HashMap;
+use std::str::FromStr;
+use std::sync::{LazyLock, Mutex};
+use tree_sitter::Parser;
+
+/// A region of a file associated with a single language.
+/// For most files, there's one region covering the whole file.
+/// For multi-language files (markdown, astro, vue), there are multiple.
+#[derive(Debug, Clone, PartialEq)]
+pub struct TextRegion {
+    /// Start of the byte range in the original document (inclusive)
+    pub start_byte: usize,
+    /// End of the byte range in the original document (exclusive)
+    pub end_byte: usize,
+    /// Which language governs this region
+    pub language: LanguageType,
+}
+
+/// Parser cache for region extraction, keyed by language (separate from the main parser
+/// cache since region extraction uses different grammars/queries than node extraction).
+static REGION_PARSER_CACHE: LazyLock<Mutex<HashMap<LanguageType, Parser>>> =
+    LazyLock::new(|| Mutex::new(HashMap::new()));
+
+/// Split a document into per-language regions.
+/// Markdown is delegated to `extract_markdown_regions` and may yield several
+/// regions; every other language yields one region spanning the whole text.
+pub fn extract_regions(text: &str, language: LanguageType) -> Vec<TextRegion> {
+    if language == LanguageType::Markdown {
+        return extract_markdown_regions(text);
+    }
+    vec![TextRegion {
+        start_byte: 0,
+        end_byte: text.len(),
+        language,
+    }]
+}
+
+/// Resolve a fenced-code-block info string to a `LanguageType`.
+///
+/// The input is trimmed and lowercased, checked against a table of short
+/// aliases, and otherwise handed to `LanguageType::from_str`. Empty and
+/// unrecognized strings resolve to `None`, which callers use to skip the
+/// block.
+fn resolve_info_string(info: &str) -> Option<LanguageType> {
+    let key = info.trim().to_lowercase();
+    if key.is_empty() {
+        return None;
+    }
+
+    // Short aliases, checked before the `from_str` lookup.
+    let alias = match key.as_str() {
+        "py" => Some(LanguageType::Python),
+        "js" | "jsx" => Some(LanguageType::Javascript),
+        "ts" => Some(LanguageType::Typescript),
+        "tsx" => Some(LanguageType::Tsx),
+        "sh" | "zsh" | "fish" | "shell" => Some(LanguageType::Bash),
+        "yml" => Some(LanguageType::YAML),
+        "c++" | "cc" | "cxx" | "hpp" => Some(LanguageType::Cpp),
+        "cs" => Some(LanguageType::CSharp),
+        "rb" => Some(LanguageType::Ruby),
+        "rs" => Some(LanguageType::Rust),
+        "tex" => Some(LanguageType::Latex),
+        _ => None,
+    };
+
+    // `from_str` handles VS Code language IDs. It returns `Text` for names it
+    // does not know, so `Text` is treated the same as a parse error.
+    alias.or_else(|| match LanguageType::from_str(&key) {
+        Ok(LanguageType::Text) | Err(_) => None,
+        Ok(language) => Some(language),
+    })
+}
+
+/// Extract regions from a markdown file.
+/// Paragraph and heading text become `Markdown` regions (treated as plain text
+/// in node extraction). Fenced code blocks with a recognized info string
+/// become regions of the appropriate language.
+///
+/// Anything `walk_markdown_node` does not select (for example fenced code
+/// blocks with a missing or unknown info string) is left uncovered, so the
+/// result may be empty. There is deliberately no whole-document fallback:
+/// it would hand a file consisting only of skipped code blocks back as prose.
+///
+/// # Panics
+/// Panics if the parser cache lock is poisoned, if the markdown grammar is
+/// rejected by the parser, or if parsing yields no tree.
+fn extract_markdown_regions(text: &str) -> Vec<TextRegion> {
+    let tree = {
+        let mut cache = REGION_PARSER_CACHE
+            .lock()
+            .expect("region parser cache lock poisoned");
+        let parser = cache.entry(LanguageType::Markdown).or_insert_with(|| {
+            let lang: tree_sitter::Language = tree_sitter_md::LANGUAGE.into();
+            let mut parser = Parser::new();
+            parser
+                .set_language(&lang)
+                .expect("markdown grammar is incompatible with tree-sitter");
+            parser
+        });
+        parser
+            .parse(text, None)
+            .expect("markdown parse returned no tree")
+    };
+
+    let mut regions = Vec::new();
+    walk_markdown_node(tree.root_node(), text.as_bytes(), &mut regions);
+
+    // Sort by start position so regions are returned in document order.
+    regions.sort_by_key(|r| r.start_byte);
+    regions
+}
+
+/// Recursively walk the markdown AST, collecting prose and code block regions.
+///
+/// * `fenced_code_block`: pushes one region over its `code_fence_content` when
+///   the info string's `language` resolves via `resolve_info_string`; blocks
+///   with a missing or unresolved language push nothing. Its children are not
+///   walked any further.
+/// * `inline`: pushes a `Markdown` region when the parent is a paragraph or
+///   heading; any other inline node is ignored.
+/// * anything else: recurses into the children.
+fn walk_markdown_node(
+    node: tree_sitter::Node,
+    source: &[u8],
+    regions: &mut Vec<TextRegion>,
+) {
+    let kind = node.kind();
+
+    if kind == "fenced_code_block" {
+        let mut language_name: Option<String> = None;
+        let mut content_span: Option<(usize, usize)> = None;
+
+        let mut cursor = node.walk();
+        for child in node.children(&mut cursor) {
+            let child_kind = child.kind();
+            if child_kind == "code_fence_content" {
+                content_span = Some((child.start_byte(), child.end_byte()));
+            } else if child_kind == "info_string" {
+                // The language token is a child of the info string; the last one wins.
+                let mut info_cursor = child.walk();
+                for token in child.children(&mut info_cursor) {
+                    if token.kind() == "language" {
+                        language_name = Some(token.utf8_text(source).unwrap_or("").to_string());
+                    }
+                }
+            }
+        }
+
+        // Emit only non-empty content whose language can be resolved.
+        let Some((start_byte, end_byte)) = content_span else {
+            return;
+        };
+        if start_byte >= end_byte {
+            return;
+        }
+        let resolved = language_name.as_deref().and_then(resolve_info_string);
+        if let Some(language) = resolved {
+            regions.push(TextRegion {
+                start_byte,
+                end_byte,
+                language,
+            });
+        }
+        return;
+    }
+
+    if kind == "inline" {
+        let parent_kind = node.parent().map(|parent| parent.kind());
+        let is_prose = matches!(parent_kind, Some("paragraph" | "atx_heading" | "setext_heading"));
+        let (start_byte, end_byte) = (node.start_byte(), node.end_byte());
+        if is_prose && start_byte < end_byte {
+            regions.push(TextRegion {
+                start_byte,
+                end_byte,
+                language: LanguageType::Markdown,
+            });
+        }
+        return;
+    }
+
+    let mut cursor = node.walk();
+    for child in node.children(&mut cursor) {
+        walk_markdown_node(child, source, regions);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_single_language_region() {
+        let regions = extract_regions("fn main() {}", LanguageType::Rust);
+        assert_eq!(regions.len(), 1);
+        assert_eq!(regions[0].language, LanguageType::Rust);
+        assert_eq!(regions[0].start_byte, 0);
+        assert_eq!(regions[0].end_byte, 12);
+    }
+
+    #[test]
+    fn test_text_region() {
+        let regions = extract_regions("hello world", LanguageType::Text);
+        assert_eq!(regions.len(), 1);
+        assert_eq!(regions[0].language, LanguageType::Text);
+    }
+
+    #[test]
+    fn test_markdown_prose_only() {
+        let text = "# Hello World\n\nSome paragraph text.\n";
+        let regions = extract_regions(text, LanguageType::Markdown);
+        assert!(regions.len() >= 2); // heading + paragraph
+        for r in &regions {
+            assert_eq!(r.language, LanguageType::Markdown);
+        }
+    }
+
+    #[test]
+    fn test_markdown_with_code_block() {
+        let text = "# Hello\n\nSome text.\n\n```python\ndef foo():\n    pass\n```\n\nMore text.\n";
+        let regions = extract_regions(text, LanguageType::Markdown);
+        println!("Regions: {regions:#?}");
+
+        // Should have markdown prose regions + python code region
+        let python_regions: Vec<_> = regions
+            .iter()
+            .filter(|r| r.language == LanguageType::Python)
+            .collect();
+        assert_eq!(python_regions.len(), 1, "Expected one Python region");
+
+        let md_regions: Vec<_> = regions
+            .iter()
+            .filter(|r| r.language == LanguageType::Markdown)
+            .collect();
+        assert!(md_regions.len() >= 2, "Expected at least 2 markdown prose regions");
+    }
+
+    #[test]
+    fn test_markdown_unknown_language_skipped() {
+        let text = "# Hello\n\n```unknownlang\nsome code\n```\n\nMore text.\n";
+        let regions = extract_regions(text, LanguageType::Markdown);
+        // Unknown language code block should produce no region
+        for r in &regions {
+            assert_eq!(r.language, LanguageType::Markdown);
+        }
+    }
+
+    #[test]
+    fn test_markdown_no_info_string_skipped() {
+        let text = "# Hello\n\n```\nsome code\n```\n\nMore text.\n";
+        let regions = extract_regions(text, LanguageType::Markdown);
+        // Code block without info string should produce no region
+        for r in &regions {
+            assert_eq!(r.language, LanguageType::Markdown);
+        }
+    }
+
+    #[test]
+    fn test_resolve_info_string_aliases() {
+        assert_eq!(resolve_info_string("py"), Some(LanguageType::Python));
+        assert_eq!(resolve_info_string("js"), Some(LanguageType::Javascript));
+        assert_eq!(resolve_info_string("ts"), Some(LanguageType::Typescript));
+        assert_eq!(resolve_info_string("sh"), Some(LanguageType::Bash));
+        assert_eq!(resolve_info_string("rs"), Some(LanguageType::Rust));
+        assert_eq!(resolve_info_string("rb"), Some(LanguageType::Ruby));
+        assert_eq!(resolve_info_string("yml"), Some(LanguageType::YAML));
+        assert_eq!(resolve_info_string("c++"), Some(LanguageType::Cpp));
+        assert_eq!(resolve_info_string(""), None);
+        assert_eq!(resolve_info_string("unknownlang"), None);
+    }
+
+    #[test]
+    fn test_resolve_info_string_vscode_ids() {
+        assert_eq!(resolve_info_string("python"), Some(LanguageType::Python));
+        assert_eq!(resolve_info_string("javascript"), Some(LanguageType::Javascript));
+        assert_eq!(resolve_info_string("rust"), Some(LanguageType::Rust));
+        assert_eq!(resolve_info_string("bash"), Some(LanguageType::Bash));
+        assert_eq!(resolve_info_string("go"), Some(LanguageType::Go));
+    }
+
+    #[test]
+    fn test_markdown_multiple_code_blocks() {
+        let text = "Text.\n\n```bash\nmkdir dir\n```\n\n```python\nx = 1\n```\n\nEnd.\n";
+        let regions = extract_regions(text, LanguageType::Markdown);
+
+        let bash_regions: Vec<_> = regions.iter().filter(|r| r.language == LanguageType::Bash).collect();
+        let python_regions: Vec<_> = regions.iter().filter(|r| r.language == LanguageType::Python).collect();
+
+        assert_eq!(bash_regions.len(), 1);
+        assert_eq!(python_regions.len(), 1);
+    }
+
+    #[test]
+    fn test_markdown_code_block_content_correct() {
+        let text = "Hello.\n\n```python\ndef foo():\n    pass\n```\n";
+        let regions = extract_regions(text, LanguageType::Markdown);
+        let py = regions.iter().find(|r| r.language == LanguageType::Python).unwrap();
+        let content = &text[py.start_byte..py.end_byte];
+        assert!(content.contains("def foo()"), "Expected python code, got: {content:?}");
+    }
+}
diff --git a/crates/codebook/tests/test_markdown.rs b/crates/codebook/tests/test_markdown.rs
index 04b13dcb..fb399469 100644
--- a/crates/codebook/tests/test_markdown.rs
+++ b/crates/codebook/tests/test_markdown.rs
@@ -10,7 +10,7 @@ fn test_markdown_paragraph() {
     utils::init_logging();
     let processor = utils::get_processor();
     let sample_text = "Some paragraph text with a misspeled word.\n";
-    let expected = vec![WordLocation::new(
+    let expected = [WordLocation::new(
         "misspeled".to_string(),
         vec![TextRange {
             start_byte: 27,
@@ -41,7 +41,7 @@ fn test_markdown_heading() {
 }
 
 #[test]
-fn test_markdown_fenced_code_block_skipped() {
+fn test_markdown_fenced_code_block_known_lang() {
     utils::init_logging();
     let processor = utils::get_processor();
     let sample_text = r#"# Hello World
@@ -50,7 +50,6 @@ Some correct text here.
 
 ```bash
 mkdir some_dir
-badwwword_in_code
 ```
 
 More correct text here.
@@ -60,16 +59,59 @@ More correct text here.
         .to_vec();
     let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
     println!("Misspelled words: {words:?}");
-    // Words inside fenced code blocks should NOT be flagged
+    // bash builtins like mkdir should be recognized by the bash dictionary
     assert!(!words.contains(&"mkdir"));
-    assert!(!words.contains(&"badwwword"));
+    // dir is a common abbreviation, should not be flagged
     assert!(!words.contains(&"dir"));
 }
 
 #[test]
-fn test_markdown_fenced_code_block_with_typo_outside() {
+fn test_markdown_fenced_code_block_unknown_lang_skipped() {
     utils::init_logging();
     let processor = utils::get_processor();
+    let sample_text = r#"Some text.
+
+```unknownlang
+badwwword_in_code
+```
+
+More text.
+"#;
+    let misspelled = processor
+        .spell_check(sample_text, Some(LanguageType::Markdown), None)
+        .to_vec();
+    let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
+    println!("Misspelled words: {words:?}");
+    // Unknown language code blocks are completely skipped
+    assert!(!words.contains(&"badwwword"));
+}
+
+#[test]
+fn test_markdown_fenced_code_block_no_lang_skipped() {
+    utils::init_logging();
+    let processor = utils::get_processor();
+    let sample_text = r#"Some text.
+
+```
+badwwword_in_code
+```
+
+More text.
+"#;
+    let misspelled = processor
+        .spell_check(sample_text, Some(LanguageType::Markdown), None)
+        .to_vec();
+    let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
+    println!("Misspelled words: {words:?}");
+    // Code blocks without language info are completely skipped
+    assert!(!words.contains(&"badwwword"));
+}
+
+#[test]
+fn test_markdown_code_block_uses_language_grammar() {
+    utils::init_logging();
+    let processor = utils::get_processor();
+    // In Python grammar, function names are checked as identifiers
     let sample_text = r#"A paragrap with a tyypo.
 
 ```python
@@ -84,11 +126,11 @@ Another paragrap with a tyypo.
         .to_vec();
     let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
     println!("Misspelled words: {words:?}");
-    // Typos in prose should be flagged
+    // Prose typos should be flagged
     assert!(words.contains(&"paragrap"));
     assert!(words.contains(&"tyypo"));
-    // Typos inside code blocks should NOT be flagged
-    assert!(!words.contains(&"functin"));
+    // Python function name typo should also be flagged (multi-language support!)
+    assert!(words.contains(&"functin"));
 }
 
 #[test]
@@ -103,7 +145,7 @@ mkdir somedir
 
 Middle text is corect.
 
-```python
+```unknownlang
 badspel = True
 ```
 
@@ -116,8 +158,9 @@ End text is also corect.
     println!("Misspelled words: {words:?}");
     assert!(words.contains(&"tyypo"));
     assert!(words.contains(&"corect"));
+    // bash commands should be handled by bash grammar
     assert!(!words.contains(&"mkdir"));
-    assert!(!words.contains(&"somedir"));
+    // unknown language blocks are skipped entirely
     assert!(!words.contains(&"badspel"));
 }
 
@@ -134,3 +177,30 @@ fn test_markdown_block_quote() {
     assert!(words.contains(&"quoet"));
     assert!(words.contains(&"tyypo"));
 }
+
+#[test]
+fn test_markdown_code_block_alias_resolution() {
+    utils::init_logging();
+    let processor = utils::get_processor();
+    // Test that common aliases work (py -> Python, js -> Javascript, etc.)
+    let sample_text = r#"Some text.
+
+```py
+def hello_wrld():
+    pass
+```
+
+```js
+function hello_wrld() {}
+```
+
+More text.
+"#;
+    let misspelled = processor
+        .spell_check(sample_text, Some(LanguageType::Markdown), None)
+        .to_vec();
+    let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
+    println!("Misspelled words: {words:?}");
+    // wrld should be flagged as a function name typo in both languages
+    assert!(words.contains(&"wrld"));
+}
diff --git a/crates/codebook/tests/utils/mod.rs b/crates/codebook/tests/utils/mod.rs
index 250e41b4..5437bb5e 100644
--- a/crates/codebook/tests/utils/mod.rs
+++ b/crates/codebook/tests/utils/mod.rs
@@ -3,6 +3,7 @@ use std::sync::Arc;
 use codebook::Codebook;
 use codebook_config::{CodebookConfig, CodebookConfigMemory};
 
+#[allow(dead_code)]
 pub fn get_processor() -> Codebook {
     let config = Arc::new(CodebookConfigMemory::default());
     config
@@ -31,10 +32,7 @@ pub fn get_processor_with_include_and_ignore(include: &str, ignore: &str) -> Cod
 }
 
 #[allow(dead_code)]
-pub fn get_processor_with_tags(
-    include_tags: Vec<&str>,
-    exclude_tags: Vec<&str>,
-) -> Codebook {
+pub fn get_processor_with_tags(include_tags: Vec<&str>, exclude_tags: Vec<&str>) -> Codebook {
     let settings = codebook_config::ConfigSettings {
         include_tags: include_tags.into_iter().map(String::from).collect(),
         exclude_tags: exclude_tags.into_iter().map(String::from).collect(),
diff --git a/examples/example.md b/examples/example.md
index 36d8cbf0..c1ac0d0a 100644
--- a/examples/example.md
+++ b/examples/example.md
@@ -6,3 +6,12 @@ ATGCATC
 
 bad DNA:
 ATGCATCssss
+
+
+```python
+import bad_spelin
+# Not spel good
+def im_guud():
+    bad_spelin.bone()
+    pass
+```
diff --git a/refactor.md b/refactor.md
new file mode 100644
index 00000000..dab3a174
--- /dev/null
+++ b/refactor.md
@@ -0,0 +1,574 @@
+# Codebook Architecture Refactor
+
+## Goal
+
+Restructure the `codebook` crate internals to support multi-language files (markdown with code blocks, Astro/Vue/Svelte, HTML with `<script>`/`<style>`) and lay groundwork for control comments, custom dictionaries, and a CLI. No public LSP protocol changes needed — the refactor is internal to the `codebook` and `codebook-config` crates.
+
+## Current Architecture
+
+```
+LSP Backend
+    → Codebook::spell_check(text, ONE language, file_path)
+        → resolve_language()           // pick one LanguageType
+        → get_dictionaries()           // load dicts for that one language
+        → parser::find_locations()     // do everything in one function:
+            ├─ Text path: word-boundary split entire text
+            └─ Code path: tree-sitter parse + query + word extract + dict check
+        → return Vec<WordLocation>
+```
+
+### Problems
+
+1. **`find_locations` does too much.** It parses, queries, extracts words, applies skip patterns, and checks dictionaries — all in one function. You can't insert new stages (control comments, injection) without forking the function.
+
+2. **One language per file.** `spell_check` resolves a single `LanguageType` and uses it for the entire file. No way to handle embedded languages.
+
+3. **Dictionary selection is coupled to language resolution.** Dictionaries are gathered once based on the single resolved language. With multiple languages per file, different regions need different dictionaries.
+
+4. **Skip patterns are applied inconsistently.** For `Text` mode, skip patterns are applied during word extraction. For code mode, skip patterns are applied after word extraction against global byte offsets. Both happen inside `find_locations`.
+
+5. **`LanguageType::Text` is a special case everywhere.** The `Text` variant returns `None` from `language()`, has no `.scm` file, no `LanguageSetting` entry returned by `get_language_setting`, and takes a completely different code path in `find_locations`. It's an implicit "not really a language" variant.
+
+## Proposed Architecture
+
+### Pipeline
+
+```
+Codebook::spell_check(text, language, file_path)
+    │
+    ▼
+┌─────────────────────────────┐
+│  Stage 1: Region Extraction │  Split file into typed regions
+│  (one language per region)  │  Most languages: 1 region = whole file
+└─────────────┬───────────────┘  Markdown/HTML/Astro: multiple regions
+              │
+              ▼
+┌─────────────────────────────┐
+│  Stage 2: Node Extraction   │  Per region: tree-sitter parse + query
+│  (AST nodes to check)       │  Returns tagged text spans
+└─────────────┬───────────────┘
+              │
+              ▼
+┌─────────────────────────────┐
+│  Stage 3: Word Extraction   │  Per node: split words, apply skip patterns
+│  (candidate words)          │  Uses splitter + TextProcessor
+└─────────────┬───────────────┘
+              │
+              ▼
+┌─────────────────────────────┐
+│  Stage 4: Word Checking     │  Per word: dictionary lookup + config rules
+│  (misspelled words)         │  flag_words, allowed_words, min_length
+└─────────────┬───────────────┘
+              │
+              ▼
+         Vec<WordLocation>
+```
+
+Each stage is a separate function with clear inputs and outputs. No closures passed between stages — data flows as concrete types.
+
+### Data Types
+
+```rust
+/// A region of a file associated with a single language.
+/// For most files, there's one region covering the whole file.
+/// For multi-language files (markdown, astro, vue), there are multiple.
+pub struct TextRegion {
+    /// Byte range in the original document
+    pub start_byte: usize,
+    pub end_byte: usize,
+    /// Which language governs this region
+    pub language: LanguageType,
+}
+
+/// A text span extracted from a tree-sitter query match.
+/// Coordinates are in original-document byte offsets.
+pub struct TextNode {
+    /// Byte range in the original document
+    pub start_byte: usize,
+    pub end_byte: usize,
+    /// The text content of this node
+    pub text: String,
+    /// The capture tag (e.g. "comment", "string", "identifier.function")
+    pub tag: String,
+}
+
+/// A candidate word extracted from a TextNode, with its position
+/// in original-document byte offsets.
+pub struct WordCandidate {
+    pub word: String,
+    pub start_byte: usize,
+    pub end_byte: usize,
+}
+```
+
+`WordLocation` (the final output) stays the same — it groups all locations of a misspelled word together.
+
+### Stage 1: Region Extraction
+
+```rust
+// In a new module: src/regions.rs
+
+/// Extract language regions from a document.
+/// For single-language files, returns one region covering the whole text.
+/// For multi-language files (markdown, astro, vue, html), returns multiple.
+pub fn extract_regions(text: &str, language: LanguageType) -> Vec<TextRegion> {
+    match language {
+        LanguageType::Markdown => extract_markdown_regions(text),
+        // Future: LanguageType::HTML => extract_html_regions(text),
+        // Future: LanguageType::Astro => extract_astro_regions(text),
+        _ => vec![TextRegion {
+            start_byte: 0,
+            end_byte: text.len(),
+            language,
+        }],
+    }
+}
+```
+
+**Markdown region extraction** parses with `tree_sitter_md`, walks the tree, and produces regions:
+- `paragraph`, `atx_heading`, etc. → `LanguageType::Markdown` region
+- `fenced_code_block` with `info_string` "python" → `LanguageType::Python` region
+- `fenced_code_block` with unknown/missing info string → skip (no region)
+
+This replaces the current `markdown.scm` query approach. Instead of using tree-sitter queries to filter what markdown nodes to check, region extraction identifies the prose vs code boundary, and each region then goes through the normal stage 2 pipeline for its language.
+
+**Language alias resolution** for info strings:
+
+```rust
+/// Map markdown info strings to LanguageType.
+/// Handles common aliases beyond what LanguageType::from_str covers.
+fn resolve_info_string(info: &str) -> Option<LanguageType> {
+    // from_str already handles VS Code language IDs like "rust", "python", "javascript"
+    // Add common markdown aliases here
+    match info.trim().to_lowercase().as_str() {
+        "py" => Some(LanguageType::Python),
+        "js" => Some(LanguageType::Javascript),
+        "ts" => Some(LanguageType::Typescript),
+        "sh" | "zsh" | "fish" => Some(LanguageType::Bash),
+        "yml" => Some(LanguageType::YAML),
+        "c++" | "cc" | "cxx" | "hpp" => Some(LanguageType::Cpp),
+        "cs" => Some(LanguageType::CSharp),
+        "rb" => Some(LanguageType::Ruby),
+        "rs" => Some(LanguageType::Rust),
+        "tex" => Some(LanguageType::Latex),
+        other => LanguageType::from_str(other).ok().filter(|lang| *lang != LanguageType::Text),
+    }
+}
+```
+
+### Stage 2: Node Extraction
+
+```rust
+// Refactored from the tree-sitter parts of find_locations_code in src/parser.rs
+
+/// Extract spellcheckable text nodes from a region using tree-sitter.
+/// Returns nodes with byte offsets in original document coordinates.
+pub fn extract_nodes(
+    document_text: &str,
+    region: &TextRegion,
+    tag_filter: &dyn Fn(&str) -> bool,
+) -> Vec<TextNode> {
+    let region_text = &document_text[region.start_byte..region.end_byte];
+
+    match region.language {
+        LanguageType::Text => {
+            // Plain text: the whole region is one node
+            vec![TextNode {
+                start_byte: region.start_byte,
+                end_byte: region.end_byte,
+                text: region_text.to_string(),
+                tag: "string".to_string(),
+            }]
+        }
+        LanguageType::Markdown => {
+            // Markdown prose regions: treat as plain text
+            // (region extraction already stripped out code blocks)
+            vec![TextNode {
+                start_byte: region.start_byte,
+                end_byte: region.end_byte,
+                text: region_text.to_string(),
+                tag: "string".to_string(),
+            }]
+        }
+        _ => {
+            // Code: parse with tree-sitter, run query, extract captured nodes
+            extract_nodes_with_treesitter(region_text, region.start_byte, region.language, tag_filter)
+        }
+    }
+}
+
+/// Parse text with tree-sitter and extract nodes matching the language's query.
+fn extract_nodes_with_treesitter(
+    text: &str,
+    base_offset: usize,
+    language: LanguageType,
+    tag_filter: &dyn Fn(&str) -> bool,
+) -> Vec<TextNode> {
+    let Some(language_setting) = get_language_setting(language) else { return Vec::new() };
+
+    let tree = {
+        let mut cache = PARSER_CACHE.lock().unwrap();
+        let parser = cache.entry(language).or_insert_with(|| { /* ... */ });
+        parser.parse(text, None).unwrap()
+    };
+
+    let lang = language_setting.language().unwrap();
+    let query = Query::new(&lang, language_setting.query).unwrap();
+    let capture_names = query.capture_names();
+    let mut cursor = QueryCursor::new();
+    let mut nodes = Vec::new();
+
+    let mut matches = cursor.matches(&query, tree.root_node(), text.as_bytes());
+    while let Some(match_) = matches.next() {
+        for capture in match_.captures {
+            let tag = &capture_names[capture.index as usize];
+            if tag == "language" || !tag_filter(tag) {
+                continue;
+            }
+            let node = capture.node;
+            nodes.push(TextNode {
+                start_byte: node.start_byte() + base_offset,
+                end_byte: node.end_byte() + base_offset,
+                text: node.utf8_text(text.as_bytes()).unwrap().to_string(),
+                tag: tag.to_string(),
+            });
+        }
+    }
+    nodes
+}
+```
+
+Key change: this function **only** extracts nodes. It does not split words or check dictionaries. The `base_offset` parameter handles coordinate translation for injected regions — node byte offsets from tree-sitter are relative to the parsed text, but we need document-global offsets in the output.
+
+### Stage 3: Word Extraction
+
+```rust
+// Refactored from TextProcessor in src/parser.rs
+
+/// Extract candidate words from text nodes, applying skip patterns.
+/// All byte offsets are in original document coordinates.
+pub fn extract_words(
+    document_text: &str,
+    nodes: &[TextNode],
+    skip_patterns: &[Regex],
+) -> Vec<WordCandidate> {
+    // Compute skip ranges once against the full document
+    let skip_ranges = find_skip_ranges(document_text, skip_patterns);
+
+    let mut candidates = Vec::new();
+    for node in nodes {
+        let words = split_into_words(&node.text);
+        for split_word in words {
+            let global_start = split_word.start_byte + node.start_byte;
+            let global_end = global_start + split_word.word.len();
+
+            if is_within_skip_range(global_start, global_end, &skip_ranges) {
+                continue;
+            }
+
+            candidates.push(WordCandidate {
+                word: split_word.word.to_string(),
+                start_byte: global_start,
+                end_byte: global_end,
+            });
+        }
+    }
+    candidates
+}
+
+/// Split a text node's content into individual words using unicode
+/// segmentation and camelCase/snake_case splitting.
+/// This combines the existing TextProcessor word boundary logic
+/// with the splitter module.
+fn split_into_words(text: &str) -> Vec<SplitWord> {
+    // existing logic from TextProcessor::collect_split_words
+    // + splitter::split
+}
+```
+
+This is a pure function: text in, words out. No dictionary awareness, no language awareness.
+
+### Stage 4: Word Checking
+
+```rust
+/// Check candidate words against dictionaries and config rules.
+/// Returns WordLocations for misspelled words, grouping all locations
+/// of the same word together.
+pub fn check_words(
+    candidates: &[WordCandidate],
+    dictionaries: &[Arc<dyn Dictionary>],
+    config: &dyn CodebookConfig,
+) -> Vec<WordLocation> {
+    // Deduplicate: group candidates by word text
+    let mut word_positions: HashMap<&str, Vec<TextRange>> = HashMap::new();
+    for candidate in candidates {
+        word_positions
+            .entry(&candidate.word)
+            .or_default()
+            .push(TextRange {
+                start_byte: candidate.start_byte,
+                end_byte: candidate.end_byte,
+            });
+    }
+
+    // Check each unique word once
+    let mut results = Vec::new();
+    for (word, positions) in word_positions {
+        if config.should_flag_word(word) {
+            results.push(WordLocation::new(word.to_string(), positions));
+            continue;
+        }
+        if word.len() < config.get_min_word_length() {
+            continue;
+        }
+        if config.is_allowed_word(word) {
+            continue;
+        }
+        let is_correct = dictionaries.iter().any(|dict| dict.check(word));
+        if !is_correct {
+            results.push(WordLocation::new(word.to_string(), positions));
+        }
+    }
+    results
+}
+```
+
+This replaces the `check_function` closure that's currently threaded through `find_locations`. The closure pattern made it impossible to test word checking independently.
+
+### Orchestration in `Codebook::spell_check`
+
+```rust
+pub fn spell_check(
+    &self,
+    text: &str,
+    language: Option<LanguageType>,
+    file_path: Option<&str>,
+) -> Vec<WordLocation> {
+    // ... existing path ignore/include logic ...
+
+    let language = self.resolve_language(language, file_path);
+
+    // Build skip patterns once
+    let mut skip_patterns = get_default_skip_patterns().clone();
+    if let Some(user_patterns) = self.config.get_ignore_patterns() {
+        skip_patterns.extend(user_patterns);
+    }
+
+    // Stage 1: Split into language regions
+    let regions = regions::extract_regions(text, language);
+
+    // Collect dictionaries for all languages present in the file
+    let languages_in_file: Vec<LanguageType> = regions.iter().map(|r| r.language).collect();
+    let dictionaries = self.get_dictionaries_for_languages(&languages_in_file);
+
+    // Stages 2-3: Process each region
+    let mut all_candidates = Vec::new();
+    for region in &regions {
+        let nodes = parser::extract_nodes(text, region, &|tag| {
+            self.config.should_check_tag(tag)
+        });
+        let candidates = parser::extract_words(text, &nodes, &skip_patterns);
+        all_candidates.extend(candidates);
+    }
+
+    // Stage 4: Check all words at once (deduplicates across regions)
+    checker::check_words(&all_candidates, &dictionaries, self.config.as_ref())
+}
+```
+
+### Dictionary Selection Changes
+
+```rust
+/// Gather dictionaries for all languages present in a file.
+fn get_dictionaries_for_languages(
+    &self,
+    languages: &[LanguageType],
+) -> Vec<Arc<dyn Dictionary>> {
+    let mut dictionary_ids: Vec<String> = self.config.get_dictionary_ids();
+
+    // Add language-specific dictionaries for all languages in the file
+    for lang in languages {
+        dictionary_ids.extend(lang.dictionary_ids());
+    }
+
+    // Add defaults
+    dictionary_ids.extend(DEFAULT_DICTIONARIES.iter().map(|f| f.to_string()));
+
+    // Deduplicate
+    dictionary_ids.sort();
+    dictionary_ids.dedup();
+
+    dictionary_ids
+        .iter()
+        .filter_map(|id| self.manager.get_dictionary(id))
+        .collect()
+}
+```
+
+This replaces the current `get_dictionaries(Option<LanguageType>)` which only handles one language.
+
+## Module Layout After Refactor
+
+```
+codebook/src/
+├── lib.rs              # Codebook struct, spell_check orchestration
+├── regions.rs          # NEW: Stage 1 — region extraction
+├── parser.rs           # Stages 2+3 — node extraction, word extraction
+├── checker.rs          # NEW: Stage 4 — word checking
+├── splitter.rs         # Word splitting (camelCase, snake_case) — unchanged
+├── regexes.rs          # Skip patterns — unchanged
+├── queries.rs          # LanguageType, LanguageSetting, .scm files — unchanged
+├── queries/            # .scm query files — unchanged (except markdown.scm, deleted)
+└── dictionaries/       # Dictionary loading — unchanged
+```
+
+Key moves:
+- `find_locations` and `find_locations_code` in `parser.rs` → split into `extract_nodes` + `extract_words`
+- Dictionary checking logic currently in `Codebook::spell_check` closure → `checker.rs::check_words`
+- Region extraction → new `regions.rs` module
+- `TextProcessor` stays in `parser.rs` but is simplified — it only does word extraction now, no dictionary checking
+
+## What Gets Deleted
+
+- `parser::find_locations()` — replaced by the pipeline orchestration in `Codebook::spell_check`
+- `parser::find_locations_code()` — split into `extract_nodes` + `extract_words`
+- `TextProcessor::process_words_with_check()` — word checking moves to stage 4
+- `dictionary::find_locations_with_dictionary_batch()` — unused after refactor
+- `queries/markdown.scm` — markdown region extraction replaces the query approach
+- The `check_function` closure pattern — replaced by concrete `check_words` function
+
+## What Stays the Same
+
+- All `.scm` query files (except `markdown.scm`)
+- `LanguageType` enum and `LANGUAGE_SETTINGS` table
+- `LanguageSetting` struct and `language()` method
+- `splitter::split()` — word splitting logic
+- `regexes.rs` — skip patterns
+- `dictionaries/` — all dictionary types, manager, repo
+- `CodebookConfig` trait and `CodebookConfigFile` implementation
+- `CodebookConfigMemory` for tests
+- The LSP crate (`codebook-lsp`) — `Backend`, `LanguageServer` impl, all commands
+- `Codebook::spell_check` signature (takes same args, returns same type)
+- `Codebook::get_suggestions` — unchanged
+- `WordLocation`, `TextRange` — unchanged
+
+## Implementation Order
+
+This can be done incrementally, keeping tests green at each step:
+
+### Step 1: Introduce data types and stage functions as wrappers
+
+Add `TextRegion`, `TextNode`, `WordCandidate` types. Write `extract_regions`, `extract_nodes`, `extract_words`, `check_words` as new functions that internally call the existing `find_locations` code. Write tests for each stage function independently. Don't delete anything yet.
+
+### Step 2: Rewire `Codebook::spell_check` to use the pipeline
+
+Replace the body of `spell_check` with the pipeline orchestration. It should call the stage functions instead of `find_locations` directly. All existing integration tests should still pass since the external behavior is the same.
+
+### Step 3: Inline and delete old code
+
+Now that nothing calls `find_locations` or `find_locations_code`, move their internal logic into the stage functions and delete the old functions. Remove `TextProcessor::process_words_with_check` (keep `extract_words` and `collect_split_words`). Remove `find_locations_with_dictionary_batch`.
+
+### Step 4: Implement markdown region extraction
+
+Replace the current markdown.scm query approach with proper region extraction:
+- Parse markdown with `tree_sitter_md`
+- Walk the AST to identify prose regions and fenced code blocks
+- Map info strings to `LanguageType` using `resolve_info_string`
+- Delete `markdown.scm`
+
+This makes markdown code blocks spell-checked with the correct language grammar and dictionaries.
+
+### Step 5: Update tests
+
+- Existing integration tests (test_markdown.rs, test_python.rs, etc.) should pass unchanged
+- Add new unit tests for each stage function
+- Add integration tests for markdown with code blocks in different languages
+- Add test for unknown info strings (should be skipped, not crash)
+
+## Future Work (Not Part of This Refactor)
+
+These features are enabled by the pipeline architecture but should be done in separate passes:
+
+### Control Comments
+
+Add a filtering step between stage 2 (node extraction) and stage 3 (word extraction). Scan for comments matching patterns like:
+- `// codebook:ignore-next-line` — add the next line's byte range to skip ranges
+- `// codebook:ignore-start` / `// codebook:ignore-end` — add enclosed range to skip ranges
+- `// codebook:words word1,word2` — add words to allowed list for this file
+- `<!-- codebook:ignore -->` — HTML/markdown variant
+
+This works naturally because nodes already carry byte offsets and tags. A comment node with tag "comment" containing "codebook:ignore-next-line" can compute the next line's byte range and add it to the skip ranges that stage 3 uses.
+
+For file-level directives (`codebook:ignore-file`), short-circuit before stage 1.
+
+### Custom User Dictionaries
+
+Changes needed in `codebook-config` and `dictionaries/`:
+
+1. Add `custom_dictionaries` field to `ConfigSettings`:
+   ```toml
+   # codebook.toml
+   [custom_dictionaries]
+   my_project = "path/to/project-words.txt"
+   medical = "path/to/medical-terms.dic"
+   ```
+
+2. `DictionaryManager::get_dictionary` should check for local file paths in addition to the `get_repo` lookup. If `id` maps to a path in config, load it as a `TextDictionary` (for `.txt`) or `HunspellDictionary` (for `.dic`/`.aff` pairs).
+
+3. Relative paths should resolve from the project config file's directory.
+
+No pipeline changes needed — custom dictionaries just appear in the dictionary list alongside built-in ones.
+
+### Astro/Vue/Svelte Support
+
+Same pattern as markdown region extraction:
+
+1. Add `tree-sitter-astro`, `tree-sitter-vue`, etc. as dependencies
+2. Add `LanguageType::Astro`, `LanguageType::Vue`, `LanguageType::Svelte`
+3. Write `extract_astro_regions`, `extract_vue_regions`, etc. in `regions.rs`
+4. These parse the file, identify `<script>`, `<template>`, `<style>` sections, and produce regions with appropriate language types
+
+The `.scm` query files for the embedded languages (TypeScript, HTML, CSS) already exist and work unchanged — they're used in stage 2 for each region.
+
+### HTML `<script>` and `<style>` Injection
+
+Same pattern. `extract_html_regions` would identify `<script>` and `<style>` tags and create JavaScript/CSS regions. The rest of the HTML becomes HTML regions checked with `html.scm`.
+
+### Korean/Asian Language Support
+
+Affects stage 3 (word extraction) only. The current `split_into_words` uses `unicode-segmentation`'s `split_word_bound_indices`, which works for space-separated languages. For Korean/CJK, options:
+
+1. **Syllable-level checking** — Unicode word boundaries do produce Hangul syllable blocks, so basic Korean may work with the existing splitter. Test this first.
+2. **Segmentation library** — if syllable-level is too granular, integrate a word segmentation library. Stage 3 would check the script of the text and choose the appropriate splitter.
+3. **Dictionary support** — need Hunspell dictionaries for these languages. The dictionary system already supports this (just add entries to `HUNSPELL_DICTIONARIES`).
+
+No pipeline architecture changes needed — just a different word splitting strategy in stage 3.
+
+### CLI for CI
+
+New crate: `codebook-cli`. Uses the `codebook` crate directly:
+
+```rust
+// codebook-cli/src/main.rs
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let args = parse_args();
+    let config = CodebookConfigFile::load(Some(&args.project_dir))?;
+    let codebook = Codebook::new(Arc::new(config))?;
+
+    let mut exit_code = 0;
+    for file in discover_files(&args) {
+        let results = codebook.spell_check_file(&file);
+        if !results.is_empty() {
+            exit_code = 1;
+            format_results(&file, &results, args.format);
+        }
+    }
+    std::process::exit(exit_code);
+}
+```
+
+Output formats: `text` (human readable), `json` (machine readable), `sarif` (GitHub Actions).
+
+File discovery: walk directory, respect `.gitignore` + config `ignore_paths`/`include_paths`.
+
+No pipeline changes needed — the CLI uses `Codebook::spell_check_file` which already returns `Vec<WordLocation>`.

From fdd4931e13f9ac245d6e17eda9dbb6e1b58cfdab Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 09:31:27 -0700
Subject: [PATCH 02/16] Attempt 2

---
 crates/codebook/src/queries.rs |  57 +++++++++++++--
 crates/codebook/src/regions.rs | 128 ++++++++++++++++++---------------
 2 files changed, 123 insertions(+), 62 deletions(-)

diff --git a/crates/codebook/src/queries.rs b/crates/codebook/src/queries.rs
index a352f8db..383927af 100644
--- a/crates/codebook/src/queries.rs
+++ b/crates/codebook/src/queries.rs
@@ -45,6 +45,11 @@ impl FromStr for LanguageType {
                     return Ok(language.type_);
                 }
             }
+            for ext in language.extensions.iter() {
+                if s == *ext {
+                    return Ok(language.type_);
+                }
+            }
         }
         Ok(LanguageType::Text)
     }
@@ -209,7 +214,15 @@ pub static LANGUAGE_SETTINGS: &[LanguageSetting] = &[
     },
     LanguageSetting {
         type_: LanguageType::Bash,
-        ids: &["bash", "shellscript", "sh", "shell script"],
+        ids: &[
+            "bash",
+            "shellscript",
+            "sh",
+            "shell script",
+            "shell",
+            "zsh",
+            "fish",
+        ],
         dictionary_ids: &["bash"],
         query: include_str!("queries/bash.scm"),
         extensions: &["sh", "bash"],
@@ -237,7 +250,7 @@ pub static LANGUAGE_SETTINGS: &[LanguageSetting] = &[
     },
     LanguageSetting {
         type_: LanguageType::YAML,
-        ids: &["yaml"],
+        ids: &["yaml", "yml"],
         dictionary_ids: &["yaml"],
         query: include_str!("queries/yaml.scm"),
         extensions: &["yaml", "yml"],
@@ -403,9 +416,9 @@ mod tests {
                 continue;
             }
 
-            let language = language_setting
-                .language()
-                .unwrap_or_else(|| panic!("Failed to get language for {:?}", language_setting.type_));
+            let language = language_setting.language().unwrap_or_else(|| {
+                panic!("Failed to get language for {:?}", language_setting.type_)
+            });
 
             let query = Query::new(&language, language_setting.query).unwrap_or_else(|e| {
                 panic!(
@@ -416,7 +429,7 @@ mod tests {
 
             for name in query.capture_names() {
                 assert!(
-                    ALLOWED_TAGS.contains(&name.as_ref()),
+                    ALLOWED_TAGS.contains(name),
                     "Language {:?} uses unknown capture tag @{name}. \
                      Allowed tags: {ALLOWED_TAGS:?}",
                     language_setting.type_,
@@ -424,4 +437,36 @@ mod tests {
             }
         }
     }
+
+    #[test]
+    fn test_no_overlap_in_ids_and_extensions() {
+        use std::collections::HashMap;
+
+        // Map every id and extension to the language that owns it
+        let mut seen: HashMap<&str, LanguageType> = HashMap::new();
+
+        for setting in LANGUAGE_SETTINGS {
+            for &id in setting.ids {
+                if let Some(&prev) = seen.get(id) {
+                    panic!(
+                        "Duplicate id/extension {id:?}: used by both {:?} and {:?}",
+                        prev, setting.type_
+                    );
+                }
+                seen.insert(id, setting.type_);
+            }
+            for &ext in setting.extensions {
+                if let Some(&prev) = seen.get(ext) {
+                    // Allow overlap within the same language (e.g. "hs" in both ids and extensions)
+                    if prev != setting.type_ {
+                        panic!(
+                            "Duplicate id/extension {ext:?}: used by both {:?} and {:?}",
+                            prev, setting.type_
+                        );
+                    }
+                }
+                seen.insert(ext, setting.type_);
+            }
+        }
+    }
 }
diff --git a/crates/codebook/src/regions.rs b/crates/codebook/src/regions.rs
index 3d3db789..35c3df1c 100644
--- a/crates/codebook/src/regions.rs
+++ b/crates/codebook/src/regions.rs
@@ -37,36 +37,15 @@ pub fn extract_regions(text: &str, language: LanguageType) -> Vec<TextRegion> {
 }
 
 /// Map markdown info strings to LanguageType.
-/// Handles common aliases beyond what LanguageType::from_str covers.
+/// Uses LanguageType::from_str which checks ids and file extensions
+/// in LANGUAGE_SETTINGS. Returns None for unknown or empty strings.
 fn resolve_info_string(info: &str) -> Option<LanguageType> {
     let trimmed = info.trim().to_lowercase();
     if trimmed.is_empty() {
         return None;
     }
-    // Try common aliases first
-    let lang = match trimmed.as_str() {
-        "py" => Some(LanguageType::Python),
-        "js" => Some(LanguageType::Javascript),
-        "ts" => Some(LanguageType::Typescript),
-        "tsx" => Some(LanguageType::Tsx),
-        "jsx" => Some(LanguageType::Javascript),
-        "sh" | "zsh" | "fish" | "shell" => Some(LanguageType::Bash),
-        "yml" => Some(LanguageType::YAML),
-        "c++" | "cc" | "cxx" | "hpp" => Some(LanguageType::Cpp),
-        "cs" => Some(LanguageType::CSharp),
-        "rb" => Some(LanguageType::Ruby),
-        "rs" => Some(LanguageType::Rust),
-        "tex" => Some(LanguageType::Latex),
-        _ => None,
-    };
-
-    if lang.is_some() {
-        return lang;
-    }
-
-    // Fall back to from_str which handles VS Code language IDs
     match LanguageType::from_str(&trimmed) {
-        Ok(LanguageType::Text) => None, // from_str returns Text for unknown, treat as unknown
+        Ok(LanguageType::Text) => None, // from_str returns Text for unknown
         Ok(lang) => Some(lang),
         Err(_) => None,
     }
@@ -80,13 +59,11 @@ fn extract_markdown_regions(text: &str) -> Vec<TextRegion> {
 
     let tree = {
         let mut cache = REGION_PARSER_CACHE.lock().unwrap();
-        let parser = cache
-            .entry(LanguageType::Markdown)
-            .or_insert_with(|| {
-                let mut parser = Parser::new();
-                parser.set_language(&lang).unwrap();
-                parser
-            });
+        let parser = cache.entry(LanguageType::Markdown).or_insert_with(|| {
+            let mut parser = Parser::new();
+            parser.set_language(&lang).unwrap();
+            parser
+        });
         parser.parse(text, None).unwrap()
     };
 
@@ -112,11 +89,7 @@ fn extract_markdown_regions(text: &str) -> Vec<TextRegion> {
 }
 
 /// Recursively walk markdown AST to find prose and code block regions.
-fn walk_markdown_node(
-    node: tree_sitter::Node,
-    source: &[u8],
-    regions: &mut Vec<TextRegion>,
-) {
+fn walk_markdown_node(node: tree_sitter::Node, source: &[u8], regions: &mut Vec<TextRegion>) {
     match node.kind() {
         "fenced_code_block" => {
             // Find info_string and code_fence_content children
@@ -142,20 +115,26 @@ fn walk_markdown_node(
                 }
             }
 
-            if let Some((start, end)) = code_content {
-                if start < end {
-                    if let Some(info) = info_string {
-                        if let Some(lang) = resolve_info_string(&info) {
-                            regions.push(TextRegion {
-                                start_byte: start,
-                                end_byte: end,
-                                language: lang,
-                            });
-                        }
-                        // If info string is unknown, skip the code block entirely
-                    }
-                    // If no info string, skip the code block entirely
-                }
+            if let Some((start, end)) = code_content
+                && start < end
+                && let Some(info) = info_string
+                && let Some(lang) = resolve_info_string(&info)
+            {
+                regions.push(TextRegion {
+                    start_byte: start,
+                    end_byte: end,
+                    language: lang,
+                });
+            }
+        }
+        "html_block" => {
+            // Block-level HTML — treat as an HTML region
+            if node.start_byte() < node.end_byte() {
+                regions.push(TextRegion {
+                    start_byte: node.start_byte(),
+                    end_byte: node.end_byte(),
+                    language: LanguageType::HTML,
+                });
             }
         }
         "inline" => {
@@ -232,7 +211,10 @@ mod tests {
             .iter()
             .filter(|r| r.language == LanguageType::Markdown)
             .collect();
-        assert!(md_regions.len() >= 2, "Expected at least 2 markdown prose regions");
+        assert!(
+            md_regions.len() >= 2,
+            "Expected at least 2 markdown prose regions"
+        );
     }
 
     #[test]
@@ -255,6 +237,25 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_markdown_html_block() {
+        let text = "# Hello\n\n<div class=\"foo\">\n  <p>A paragraph</p>\n</div>\n\nMore text.\n";
+        let regions = extract_regions(text, LanguageType::Markdown);
+        println!("Regions: {regions:#?}");
+
+        let html_regions: Vec<_> = regions
+            .iter()
+            .filter(|r| r.language == LanguageType::HTML)
+            .collect();
+        assert_eq!(html_regions.len(), 1, "Expected one HTML region");
+
+        let md_regions: Vec<_> = regions
+            .iter()
+            .filter(|r| r.language == LanguageType::Markdown)
+            .collect();
+        assert!(md_regions.len() >= 2, "Expected heading + paragraph prose regions");
+    }
+
     #[test]
     fn test_resolve_info_string_aliases() {
         assert_eq!(resolve_info_string("py"), Some(LanguageType::Python));
@@ -272,7 +273,10 @@ mod tests {
     #[test]
     fn test_resolve_info_string_vscode_ids() {
         assert_eq!(resolve_info_string("python"), Some(LanguageType::Python));
-        assert_eq!(resolve_info_string("javascript"), Some(LanguageType::Javascript));
+        assert_eq!(
+            resolve_info_string("javascript"),
+            Some(LanguageType::Javascript)
+        );
         assert_eq!(resolve_info_string("rust"), Some(LanguageType::Rust));
         assert_eq!(resolve_info_string("bash"), Some(LanguageType::Bash));
         assert_eq!(resolve_info_string("go"), Some(LanguageType::Go));
@@ -283,8 +287,14 @@ mod tests {
         let text = "Text.\n\n```bash\nmkdir dir\n```\n\n```python\nx = 1\n```\n\nEnd.\n";
         let regions = extract_regions(text, LanguageType::Markdown);
 
-        let bash_regions: Vec<_> = regions.iter().filter(|r| r.language == LanguageType::Bash).collect();
-        let python_regions: Vec<_> = regions.iter().filter(|r| r.language == LanguageType::Python).collect();
+        let bash_regions: Vec<_> = regions
+            .iter()
+            .filter(|r| r.language == LanguageType::Bash)
+            .collect();
+        let python_regions: Vec<_> = regions
+            .iter()
+            .filter(|r| r.language == LanguageType::Python)
+            .collect();
 
         assert_eq!(bash_regions.len(), 1);
         assert_eq!(python_regions.len(), 1);
@@ -294,8 +304,14 @@ mod tests {
     fn test_markdown_code_block_content_correct() {
         let text = "Hello.\n\n```python\ndef foo():\n    pass\n```\n";
         let regions = extract_regions(text, LanguageType::Markdown);
-        let py = regions.iter().find(|r| r.language == LanguageType::Python).unwrap();
+        let py = regions
+            .iter()
+            .find(|r| r.language == LanguageType::Python)
+            .unwrap();
         let content = &text[py.start_byte..py.end_byte];
-        assert!(content.contains("def foo()"), "Expected python code, got: {content:?}");
+        assert!(
+            content.contains("def foo()"),
+            "Expected python code, got: {content:?}"
+        );
     }
 }

From 597396aecc1d4aee5f156518c00915726048e86e Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 11:14:24 -0700
Subject: [PATCH 03/16] Replace region extraction with recursive query-driven
 injection system
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unify multi-language support into .scm query files using @injection.*
capture tags, eliminating the separate regions.rs module.

Three capture tag forms (two injection forms plus the existing tags):
- @injection.{lang} — static injection (e.g. @injection.html for
  HTML blocks in markdown)
- @injection.content + @injection.language — dynamic injection
  (e.g. fenced code blocks where the language comes from the info string)
- Existing tags (@string, @comment, @identifier.*) — word extraction

The recursive extract_all_words function replaces the previous
extract_regions → extract_nodes → extract_words pipeline. Adding
injection support to any language is now just adding @injection.*
captures to its .scm file — no Rust code changes needed.

Restores markdown.scm with injection captures and the Markdown
tree-sitter grammar. Deletes regions.rs.
---
 crates/codebook/src/checker.rs           |  10 +-
 crates/codebook/src/lib.rs               |  54 +--
 crates/codebook/src/parser.rs            | 420 ++++++++++++-----------
 crates/codebook/src/queries.rs           |  17 +-
 crates/codebook/src/queries/markdown.scm |   6 +
 crates/codebook/src/regions.rs           | 317 -----------------
 crates/codebook/tests/test_tags.rs       |  20 +-
 crates/codebook/tests/test_vhdl.rs       |   9 +-
 crates/downloader/src/lib.rs             |  12 +-
 9 files changed, 276 insertions(+), 589 deletions(-)
 create mode 100644 crates/codebook/src/queries/markdown.scm
 delete mode 100644 crates/codebook/src/regions.rs

diff --git a/crates/codebook/src/checker.rs b/crates/codebook/src/checker.rs
index e87d9069..7b8dac0e 100644
--- a/crates/codebook/src/checker.rs
+++ b/crates/codebook/src/checker.rs
@@ -75,10 +75,7 @@ mod tests {
     fn test_check_words_flags_unknown() {
         let dict = Arc::new(TextDictionary::new("hello\nworld\n"));
         let config = Arc::new(codebook_config::CodebookConfigMemory::default());
-        let candidates = make_candidates(&[
-            ("hello", 0, 5),
-            ("wrld", 6, 10),
-        ]);
+        let candidates = make_candidates(&[("hello", 0, 5), ("wrld", 6, 10)]);
         let results = check_words(&candidates, &[dict], config.as_ref());
         assert_eq!(results.len(), 1);
         assert_eq!(results[0].word, "wrld");
@@ -88,10 +85,7 @@ mod tests {
     fn test_check_words_groups_locations() {
         let dict = Arc::new(TextDictionary::new("hello\n"));
         let config = Arc::new(codebook_config::CodebookConfigMemory::default());
-        let candidates = make_candidates(&[
-            ("wrld", 0, 4),
-            ("wrld", 10, 14),
-        ]);
+        let candidates = make_candidates(&[("wrld", 0, 4), ("wrld", 10, 14)]);
         let results = check_words(&candidates, &[dict], config.as_ref());
         assert_eq!(results.len(), 1);
         assert_eq!(results[0].word, "wrld");
diff --git a/crates/codebook/src/lib.rs b/crates/codebook/src/lib.rs
index 2f91e643..19cfe956 100644
--- a/crates/codebook/src/lib.rs
+++ b/crates/codebook/src/lib.rs
@@ -4,7 +4,6 @@ mod logging;
 pub mod parser;
 pub mod queries;
 pub mod regexes;
-pub mod regions;
 mod splitter;
 
 use crate::regexes::get_default_skip_patterns;
@@ -41,11 +40,9 @@ impl Codebook {
         file_path: Option<&str>,
     ) -> Vec<parser::WordLocation> {
         if let Some(file_path) = file_path {
-            // ignore_paths is a blocklist and has higher precedence than include_paths
             if self.config.should_ignore_path(Path::new(file_path)) {
                 return Vec::new();
             }
-            // include_paths is an allowlist; empty list means "include everything"
             if !self.config.should_include_path(Path::new(file_path)) {
                 return Vec::new();
             }
@@ -59,25 +56,19 @@ impl Codebook {
             all_patterns.extend(user_patterns);
         }
 
-        // Stage 1: Split into language regions
-        let text_regions = regions::extract_regions(text, language);
-
-        // Collect dictionaries for all languages present in the file
-        let dictionaries = self.get_dictionaries_for_languages(&text_regions);
+        // Extract all words, recursively following injections
+        let (candidates, languages_found) = parser::extract_all_words(
+            text,
+            language,
+            &|tag| self.config.should_check_tag(tag),
+            &all_patterns,
+        );
 
-        // Stages 2+3: Extract nodes and words from each region
-        let mut all_candidates = Vec::new();
-        for region in &text_regions {
-            // Stage 2: Node extraction
-            let nodes =
-                parser::extract_nodes(text, region, &|tag| self.config.should_check_tag(tag));
-            // Stage 3: Word extraction
-            let candidates = parser::extract_words(text, &nodes, &all_patterns);
-            all_candidates.extend(candidates);
-        }
+        // Load dictionaries for all languages encountered
+        let dictionaries = self.get_dictionaries_for_languages(&languages_found);
 
-        // Stage 4: Word checking
-        checker::check_words(&all_candidates, &dictionaries, self.config.as_ref())
+        // Check words against dictionaries
+        checker::check_words(&candidates, &dictionaries, self.config.as_ref())
     }
 
     fn resolve_language(
@@ -94,25 +85,19 @@ impl Codebook {
         }
     }
 
-    /// Gather dictionaries for all languages present in a file.
+    /// Gather dictionaries for all languages encountered in a file.
     fn get_dictionaries_for_languages(
         &self,
-        regions: &[regions::TextRegion],
+        languages: &HashSet<queries::LanguageType>,
     ) -> Vec<Arc<dyn Dictionary>> {
         let mut dictionary_ids = self.config.get_dictionary_ids();
 
-        // Add language-specific dictionaries for all languages in the file
-        let mut seen_languages = HashSet::new();
-        for region in regions {
-            if seen_languages.insert(region.language) {
-                dictionary_ids.extend(region.language.dictionary_ids());
-            }
+        for lang in languages {
+            dictionary_ids.extend(lang.dictionary_ids());
         }
 
-        // Add defaults
         dictionary_ids.extend(DEFAULT_DICTIONARIES.iter().map(|f| f.to_string()));
 
-        // Deduplicate
         dictionary_ids.sort();
         dictionary_ids.dedup();
 
@@ -134,7 +119,7 @@ impl Codebook {
 
     pub fn get_suggestions(&self, word: &str) -> Option<Vec<String>> {
         let max_results = 5;
-        let dictionaries = self.get_dictionaries_for_languages(&[]);
+        let dictionaries = self.get_dictionaries_for_languages(&HashSet::new());
         let mut is_misspelled = false;
         let suggestions: Vec<Vec<String>> = dictionaries
             .iter()
@@ -183,7 +168,6 @@ mod tests {
             vec!["date", "elderberry", "fig"],
             vec!["grape", "honeydew", "kiwi"],
         ];
-
         let result = collect_round_robin(&sources, 5);
         assert_eq!(
             result,
@@ -198,7 +182,6 @@ mod tests {
             vec!["banana", "cherry", "date"],
             vec!["cherry", "date", "elderberry"],
         ];
-
         let result = collect_round_robin(&sources, 5);
         assert_eq!(
             result,
@@ -213,7 +196,6 @@ mod tests {
             vec!["elderberry"],
             vec!["fig", "grape"],
         ];
-
         let result = collect_round_robin(&sources, 7);
         assert_eq!(
             result,
@@ -239,7 +221,6 @@ mod tests {
     #[test]
     fn test_collect_round_robin_some_empty_sources() {
         let sources = vec![vec!["apple", "banana"], vec![], vec!["cherry", "date"]];
-
         let result = collect_round_robin(&sources, 4);
         assert_eq!(result, vec!["apple", "cherry", "banana", "date"]);
     }
@@ -247,7 +228,6 @@ mod tests {
     #[test]
     fn test_collect_round_robin_with_numbers() {
         let sources = vec![vec![1, 3, 5], vec![2, 4, 6]];
-
         let result = collect_round_robin(&sources, 6);
         assert_eq!(result, vec![1, 2, 3, 4, 5, 6]);
     }
@@ -259,7 +239,6 @@ mod tests {
             vec!["date", "elderberry", "fig"],
             vec!["grape", "honeydew", "kiwi"],
         ];
-
         let result = collect_round_robin(&sources, 3);
         assert_eq!(result, vec!["apple", "date", "grape"]);
     }
@@ -267,7 +246,6 @@ mod tests {
     #[test]
     fn test_collect_round_robin_max_count_higher_than_available() {
         let sources = vec![vec!["apple", "banana"], vec!["cherry", "date"]];
-
         let result = collect_round_robin(&sources, 10);
         assert_eq!(result, vec!["apple", "banana", "cherry", "date"]);
     }
diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs
index f655a285..424420c9 100644
--- a/crates/codebook/src/parser.rs
+++ b/crates/codebook/src/parser.rs
@@ -1,9 +1,9 @@
 use crate::checker::WordCandidate;
 use crate::queries::{LanguageType, get_language_setting};
-use crate::regions::TextRegion;
 use crate::splitter;
 use regex::Regex;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
+use std::str::FromStr;
 use std::sync::{LazyLock, Mutex};
 use streaming_iterator::StreamingIterator;
 use tree_sitter::{Parser, Query, QueryCursor};
@@ -25,27 +25,21 @@ pub struct TextRange {
 
 #[derive(Debug, Clone, Copy, PartialEq)]
 struct SkipRange {
-    /// Start position in utf-8 byte offset
     start_byte: usize,
-    /// End position in utf-8 byte offset
     end_byte: usize,
 }
 
-/// Check if a word at [start, end) is entirely within any skip range
 fn is_within_skip_range(start: usize, end: usize, skip_ranges: &[SkipRange]) -> bool {
     skip_ranges
         .iter()
         .any(|r| start >= r.start_byte && end <= r.end_byte)
 }
 
-/// Find skip ranges from pattern matches in text.
 fn find_skip_ranges(text: &str, patterns: &[Regex]) -> Vec<SkipRange> {
     if patterns.is_empty() {
         return Vec::new();
     }
-
     let mut ranges = Vec::new();
-
     for pattern in patterns {
         for regex_match in pattern.find_iter(text) {
             ranges.push(SkipRange {
@@ -54,20 +48,16 @@ fn find_skip_ranges(text: &str, patterns: &[Regex]) -> Vec<SkipRange> {
             });
         }
     }
-
     ranges.sort_by_key(|r| r.start_byte);
     merge_overlapping_ranges(ranges)
 }
 
-/// Merge overlapping or adjacent ranges
 fn merge_overlapping_ranges(ranges: Vec<SkipRange>) -> Vec<SkipRange> {
     if ranges.is_empty() {
         return ranges;
     }
-
     let mut merged = Vec::new();
     let mut current = ranges[0];
-
     for range in ranges.into_iter().skip(1) {
         if range.start_byte <= current.end_byte {
             current.end_byte = current.end_byte.max(range.end_byte);
@@ -93,66 +83,76 @@ impl WordLocation {
 }
 
 // =============================================================================
-// Stage 2: Node Extraction
+// Main entry point: recursive word extraction with injection support
 // =============================================================================
 
-/// A text span extracted from a tree-sitter query match or plain text region.
-/// Coordinates are in original-document byte offsets.
-#[derive(Debug, Clone)]
-pub struct TextNode {
-    /// Byte range start in the original document
-    pub start_byte: usize,
-    /// Byte range end in the original document
-    pub end_byte: usize,
-    /// The text content of this node
-    pub text: String,
-}
-
-/// Extract spellcheckable text nodes from a region.
-/// For code regions, uses tree-sitter parsing and queries.
-/// For text/markdown prose regions, returns the whole region as one node.
-/// All byte offsets are in original document coordinates.
-pub fn extract_nodes(
+/// Extract all candidate words from a document, recursively following
+/// `@injection.*` captures in .scm query files to handle multi-language files.
+///
+/// Returns the candidates and the set of all languages encountered (for
+/// dictionary loading).
+pub fn extract_all_words(
     document_text: &str,
-    region: &TextRegion,
+    language: LanguageType,
     tag_filter: &dyn Fn(&str) -> bool,
-) -> Vec<TextNode> {
-    let region_text = &document_text[region.start_byte..region.end_byte];
-
-    match region.language {
-        LanguageType::Text | LanguageType::Markdown => {
-            // Plain text / markdown prose: the whole region is one node
-            vec![TextNode {
-                start_byte: region.start_byte,
-                end_byte: region.end_byte,
-                text: region_text.to_string(),
-            }]
-        }
-        _ => {
-            // Code: parse with tree-sitter, run query, extract captured nodes
-            extract_nodes_with_treesitter(
-                region_text,
-                region.start_byte,
-                region.language,
-                tag_filter,
-            )
-        }
-    }
+    skip_patterns: &[Regex],
+) -> (Vec<WordCandidate>, HashSet<LanguageType>) {
+    let skip_ranges = find_skip_ranges(document_text, skip_patterns);
+    let mut result = ExtractionResult {
+        candidates: Vec::new(),
+        languages: HashSet::from([language]),
+    };
+
+    extract_recursive(
+        document_text,
+        0,
+        document_text.len(),
+        language,
+        tag_filter,
+        &skip_ranges,
+        &mut result,
+    );
+
+    (result.candidates, result.languages)
 }
 
-/// Parse text with tree-sitter and extract nodes matching the language's query.
-fn extract_nodes_with_treesitter(
-    text: &str,
-    base_offset: usize,
+/// Accumulated output from recursive word extraction.
+struct ExtractionResult {
+    candidates: Vec<WordCandidate>,
+    languages: HashSet<LanguageType>,
+}
+
+/// Recursively extract words from a byte range of the document.
+///
+/// For languages with a tree-sitter grammar and .scm query:
+///   - Text captures (`@string`, `@comment`, `@identifier.*`) → word-split
+///   - Static injections (`@injection.{lang}`) → recurse with that language
+///   - Dynamic injections (`@injection.content` + `@injection.language`) → read
+///     the language name from the sibling capture, then recurse
+///
+/// For LanguageType::Text (no grammar): word-split the entire range.
+fn extract_recursive(
+    document_text: &str,
+    start_byte: usize,
+    end_byte: usize,
     language: LanguageType,
     tag_filter: &dyn Fn(&str) -> bool,
-) -> Vec<TextNode> {
+    skip_ranges: &[SkipRange],
+    result: &mut ExtractionResult,
+) {
     let language_setting = match get_language_setting(language) {
         Some(s) => s,
-        None => return Vec::new(),
+        None => {
+            // No grammar (e.g. Text) — word-split the whole range
+            let text = &document_text[start_byte..end_byte];
+            extract_words_from_text(text, start_byte, skip_ranges, &mut result.candidates);
+            return;
+        }
     };
 
-    // Parse under global lock to protect external scanners with global C state.
+    let region_text = &document_text[start_byte..end_byte];
+
+    // Parse under global lock
     let tree = {
         let mut cache = PARSER_CACHE.lock().unwrap();
         let parser = cache.entry(language).or_insert_with(|| {
@@ -161,59 +161,111 @@ fn extract_nodes_with_treesitter(
             parser.set_language(&lang).unwrap();
             parser
         });
-        parser.parse(text, None).unwrap()
+        parser.parse(region_text, None).unwrap()
     };
 
     let root_node = tree.root_node();
     let lang = language_setting.language().unwrap();
     let query = Query::new(&lang, language_setting.query).unwrap();
-    let capture_names = query.capture_names();
+    let capture_names: Vec<String> = query
+        .capture_names()
+        .iter()
+        .map(|s| s.to_string())
+        .collect();
     let mut cursor = QueryCursor::new();
-    let provider = text.as_bytes();
+    let provider = region_text.as_bytes();
     let mut matches_query = cursor.matches(&query, root_node, provider);
 
-    let mut nodes = Vec::new();
     while let Some(match_) = matches_query.next() {
+        // First pass: look for dynamic injection pairs in this match
+        let mut injection_content: Option<tree_sitter::Node> = None;
+        let mut injection_language_text: Option<String> = None;
+
         for capture in match_.captures {
             let tag = &capture_names[capture.index as usize];
-            // Skip internal tags and filtered tags
-            if *tag == "language" || !tag_filter(tag) {
-                continue;
+            if tag == "injection.content" {
+                injection_content = Some(capture.node);
+            } else if tag == "injection.language" {
+                injection_language_text =
+                    Some(capture.node.utf8_text(provider).unwrap_or("").to_string());
+            }
+        }
+
+        // Handle dynamic injection pair
+        if let Some(content_node) = injection_content {
+            if let Some(lang_text) = &injection_language_text {
+                let child_lang = LanguageType::from_str(&lang_text.trim().to_lowercase());
+                if let Ok(child_lang) = child_lang
+                    && child_lang != LanguageType::Text
+                {
+                    let child_start = content_node.start_byte() + start_byte;
+                    let child_end = content_node.end_byte() + start_byte;
+                    if child_start < child_end {
+                        result.languages.insert(child_lang);
+                        extract_recursive(
+                            document_text,
+                            child_start,
+                            child_end,
+                            child_lang,
+                            tag_filter,
+                            skip_ranges,
+                            result,
+                        );
+                    }
+                }
             }
+            continue;
+        }
+
+        // Second pass: handle text captures and static injections
+        for capture in match_.captures {
+            let tag = &capture_names[capture.index as usize];
             let node = capture.node;
+            let node_start = node.start_byte() + start_byte;
+            let node_end = node.end_byte() + start_byte;
+
+            if node_start >= node_end {
+                continue;
+            }
+
+            if tag == "language" || tag == "injection.language" {
+                continue;
+            }
+
+            if let Some(lang_name) = tag.strip_prefix("injection.") {
+                // Static injection: @injection.html, @injection.javascript, etc.
+                if let Ok(child_lang) = LanguageType::from_str(lang_name)
+                    && child_lang != LanguageType::Text
+                {
+                    result.languages.insert(child_lang);
+                    extract_recursive(
+                        document_text,
+                        node_start,
+                        node_end,
+                        child_lang,
+                        tag_filter,
+                        skip_ranges,
+                        result,
+                    );
+                }
+                continue;
+            }
+
+            // Normal text capture — extract words if tag passes filter
+            if !tag_filter(tag) {
+                continue;
+            }
+
             let node_text = node.utf8_text(provider).unwrap();
-            nodes.push(TextNode {
-                start_byte: node.start_byte() + base_offset,
-                end_byte: node.end_byte() + base_offset,
-                text: node_text.to_string(),
-            });
+            extract_words_from_text(node_text, node_start, skip_ranges, &mut result.candidates);
         }
     }
-    nodes
 }
 
 // =============================================================================
-// Stage 3: Word Extraction
+// Word extraction from plain text
 // =============================================================================
 
-/// Extract candidate words from text nodes, applying skip patterns.
-/// All byte offsets are in original document coordinates.
-pub fn extract_words(
-    document_text: &str,
-    nodes: &[TextNode],
-    skip_patterns: &[Regex],
-) -> Vec<WordCandidate> {
-    // Compute skip ranges once against the full document
-    let skip_ranges = find_skip_ranges(document_text, skip_patterns);
-
-    let mut candidates = Vec::new();
-    for node in nodes {
-        extract_words_from_text(&node.text, node.start_byte, &skip_ranges, &mut candidates);
-    }
-    candidates
-}
-
-/// Extract words from a text span, applying skip ranges and word splitting.
 fn extract_words_from_text(
     text: &str,
     base_offset: usize,
@@ -266,36 +318,26 @@ pub fn get_word_from_string(start_utf16: usize, end_utf16: usize, text: &str) ->
 }
 
 #[cfg(test)]
-mod parser_tests {
+mod tests {
     use super::*;
-    use crate::regions::TextRegion;
 
     #[test]
-    fn test_extract_words_basic() {
+    fn test_extract_words_plain_text() {
         let text = "HelloWorld calc_wrld";
-        let nodes = vec![TextNode {
-            start_byte: 0,
-            end_byte: text.len(),
-            text: text.to_string(),
-        }];
-        let words = extract_words(text, &nodes, &[]);
+        let (words, langs) = extract_all_words(text, LanguageType::Text, &|_| true, &[]);
         let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
         assert!(word_strs.contains(&"Hello"));
         assert!(word_strs.contains(&"World"));
         assert!(word_strs.contains(&"calc"));
         assert!(word_strs.contains(&"wrld"));
         assert_eq!(words.len(), 4);
+        assert!(langs.contains(&LanguageType::Text));
     }
 
     #[test]
     fn test_extract_words_contraction() {
         let text = "I'm a contraction, wouldn't you agree'?";
-        let nodes = vec![TextNode {
-            start_byte: 0,
-            end_byte: text.len(),
-            text: text.to_string(),
-        }];
-        let words = extract_words(text, &nodes, &[]);
+        let (words, _) = extract_all_words(text, LanguageType::Text, &|_| true, &[]);
         let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
         let expected = ["I'm", "a", "contraction", "wouldn't", "you", "agree"];
         for e in &expected {
@@ -304,96 +346,95 @@ mod parser_tests {
     }
 
     #[test]
-    fn test_extract_nodes_plain_text() {
-        let text = "hello world";
-        let region = TextRegion {
-            start_byte: 0,
-            end_byte: text.len(),
-            language: LanguageType::Text,
-        };
-        let nodes = extract_nodes(text, &region, &|_| true);
-        assert_eq!(nodes.len(), 1);
-        assert_eq!(nodes[0].text, "hello world");
-        assert_eq!(nodes[0].start_byte, 0);
-    }
-
-    #[test]
-    fn test_extract_nodes_code() {
+    fn test_extract_words_code() {
         let text = "// a comment\nfn main() {}";
-        let region = TextRegion {
-            start_byte: 0,
-            end_byte: text.len(),
-            language: LanguageType::Rust,
-        };
-        let nodes = extract_nodes(text, &region, &|_| true);
-        // Should have at least the comment node
-        assert!(!nodes.is_empty());
-        let comment_node = nodes.iter().find(|n| n.text.contains("comment"));
-        assert!(comment_node.is_some(), "Should find comment node");
-    }
-
-    #[test]
-    fn test_extract_nodes_with_base_offset() {
-        // Simulate a code block starting at byte 50 in a larger document
-        let code = "// hello world";
-        let padded = format!("{}{}", " ".repeat(50), code);
-        let region = TextRegion {
-            start_byte: 50,
-            end_byte: 50 + code.len(),
-            language: LanguageType::Rust,
-        };
-        let nodes = extract_nodes(&padded, &region, &|_| true);
-        assert!(!nodes.is_empty());
-        // All node offsets should be >= 50
-        for node in &nodes {
-            assert!(node.start_byte >= 50, "Node offset should include base offset");
-        }
+        let (words, langs) = extract_all_words(text, LanguageType::Rust, &|_| true, &[]);
+        assert!(!words.is_empty());
+        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(
+            word_strs.contains(&"comment"),
+            "Should find 'comment' in Rust comment"
+        );
+        assert!(langs.contains(&LanguageType::Rust));
     }
 
     #[test]
-    fn test_extract_nodes_tag_filter() {
-        let text = "// comment\nlet x = \"string\";";
-        let region = TextRegion {
-            start_byte: 0,
-            end_byte: text.len(),
-            language: LanguageType::Rust,
-        };
-        // Only allow comment tags
-        let nodes = extract_nodes(text, &region, &|tag| tag.starts_with("comment"));
-        for node in &nodes {
-            // Should only have comment content
-            assert!(
-                node.text.contains("comment"),
-                "Expected only comment nodes, got: {:?}",
-                node.text
-            );
-        }
+    fn test_extract_words_tag_filter() {
+        let text = "// comment\nlet x = \"string value\";";
+        let (words, _) = extract_all_words(
+            text,
+            LanguageType::Rust,
+            &|tag| tag.starts_with("comment"),
+            &[],
+        );
+        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(word_strs.contains(&"comment"));
+        assert!(!word_strs.contains(&"string"));
+        assert!(!word_strs.contains(&"value"));
     }
 
     #[test]
     fn test_extract_words_with_skip_patterns() {
         let text = "check https://example.com this";
         let url_pattern = Regex::new(r"https?://[^\s]+").unwrap();
-        let nodes = vec![TextNode {
-            start_byte: 0,
-            end_byte: text.len(),
-            text: text.to_string(),
-        }];
-        let words = extract_words(text, &nodes, &[url_pattern]);
+        let (words, _) = extract_all_words(text, LanguageType::Text, &|_| true, &[url_pattern]);
         let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
         assert!(word_strs.contains(&"check"));
         assert!(word_strs.contains(&"this"));
-        // URL components should be skipped
         assert!(!word_strs.contains(&"https"));
         assert!(!word_strs.contains(&"example"));
     }
 
+    #[test]
+    fn test_extract_words_code_duplicates() {
+        let text = "// wrld foo wrld";
+        let (words, _) = extract_all_words(text, LanguageType::Rust, &|_| true, &[]);
+        let wrld_words: Vec<_> = words.iter().filter(|w| w.word == "wrld").collect();
+        assert_eq!(wrld_words.len(), 2, "Expected two occurrences of 'wrld'");
+    }
+
+    #[test]
+    fn test_markdown_injection_discovers_languages() {
+        let text =
+            "# Hello\n\nSome text.\n\n```python\ndef foo(): pass\n```\n\n```bash\necho hi\n```\n";
+        let (_, langs) = extract_all_words(text, LanguageType::Markdown, &|_| true, &[]);
+        assert!(langs.contains(&LanguageType::Markdown));
+        assert!(langs.contains(&LanguageType::Python));
+        assert!(langs.contains(&LanguageType::Bash));
+    }
+
+    #[test]
+    fn test_markdown_injection_extracts_code_words() {
+        let text = "# Hello\n\n```python\ndef some_functin(): pass\n```\n";
+        let (words, _) = extract_all_words(text, LanguageType::Markdown, &|_| true, &[]);
+        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(word_strs.contains(&"functin"));
+        assert!(word_strs.contains(&"Hello"));
+    }
+
+    #[test]
+    fn test_markdown_unknown_language_skipped() {
+        let text = "# Hello\n\n```unknownlang\nbadwwword\n```\n";
+        let (words, _) = extract_all_words(text, LanguageType::Markdown, &|_| true, &[]);
+        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(!word_strs.contains(&"badwwword"));
+    }
+
+    #[test]
+    fn test_markdown_html_block_injection() {
+        let text = "# Hello\n\n<div>\n  <p>A misspeled word</p>\n</div>\n\nMore text.\n";
+        let (words, langs) = extract_all_words(text, LanguageType::Markdown, &|_| true, &[]);
+        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(langs.contains(&LanguageType::HTML));
+        assert!(word_strs.contains(&"misspeled"));
+        assert!(!word_strs.contains(&"div"));
+    }
+
     #[test]
     fn test_get_word_from_string() {
         let text = "Hello World";
         assert_eq!(get_word_from_string(0, 5, text), "Hello");
         assert_eq!(get_word_from_string(6, 11, text), "World");
-        assert_eq!(get_word_from_string(2, 5, text), "llo");
 
         let unicode_text = "こんにちは世界";
         assert_eq!(get_word_from_string(0, 5, unicode_text), "こんにちは");
@@ -407,34 +448,11 @@ mod parser_tests {
     fn test_unicode_character_handling() {
         crate::logging::init_test_logging();
         let text = "©<div>badword</div>";
-        let nodes = vec![TextNode {
-            start_byte: 0,
-            end_byte: text.len(),
-            text: text.to_string(),
-        }];
-        let words = extract_words(text, &nodes, &[]);
+        let (words, _) = extract_all_words(text, LanguageType::Text, &|_| true, &[]);
         let badword = words.iter().find(|w| w.word == "badword");
         assert!(badword.is_some(), "Expected 'badword' to be found");
         let bw = badword.unwrap();
-        assert_eq!(bw.start_byte, 7, "Expected 'badword' to start at byte 7");
-        assert_eq!(bw.end_byte, 14, "Expected 'badword' to end at byte 14");
-    }
-
-    #[test]
-    fn test_duplicate_word_locations_code() {
-        let text = "// wrld foo wrld";
-        let region = TextRegion {
-            start_byte: 0,
-            end_byte: text.len(),
-            language: LanguageType::Rust,
-        };
-        let nodes = extract_nodes(text, &region, &|_| true);
-        let words = extract_words(text, &nodes, &[]);
-        let wrld_words: Vec<_> = words.iter().filter(|w| w.word == "wrld").collect();
-        assert_eq!(
-            wrld_words.len(),
-            2,
-            "Expected two occurrences of 'wrld'"
-        );
+        assert_eq!(bw.start_byte, 7);
+        assert_eq!(bw.end_byte, 14);
     }
 }
diff --git a/crates/codebook/src/queries.rs b/crates/codebook/src/queries.rs
index 383927af..df2ce003 100644
--- a/crates/codebook/src/queries.rs
+++ b/crates/codebook/src/queries.rs
@@ -209,7 +209,7 @@ pub static LANGUAGE_SETTINGS: &[LanguageSetting] = &[
         type_: LanguageType::Markdown,
         ids: &["markdown"],
         dictionary_ids: &[],
-        query: "",
+        query: include_str!("queries/markdown.scm"),
         extensions: &["md", "markdown"],
     },
     LanguageSetting {
@@ -312,7 +312,7 @@ impl LanguageSetting {
             LanguageType::Javascript => Some(tree_sitter_javascript::LANGUAGE.into()),
             LanguageType::Latex => Some(codebook_tree_sitter_latex::LANGUAGE.into()),
             LanguageType::Lua => Some(tree_sitter_lua::LANGUAGE.into()),
-            LanguageType::Markdown => None, // Handled by region extraction
+            LanguageType::Markdown => Some(tree_sitter_md::LANGUAGE.into()),
             LanguageType::Odin => Some(tree_sitter_odin_codebook::LANGUAGE.into()),
             LanguageType::Php => Some(tree_sitter_php::LANGUAGE_PHP.into()),
             LanguageType::Python => Some(tree_sitter_python::LANGUAGE.into()),
@@ -359,9 +359,7 @@ mod tests {
     fn test_all_queries_are_valid() {
         for language_setting in LANGUAGE_SETTINGS {
             // Skip testing Text since it doesn't have a language or query
-            if language_setting.type_ == LanguageType::Text
-                || language_setting.type_ == LanguageType::Markdown
-            {
+            if language_setting.type_ == LanguageType::Text {
                 continue;
             }
 
@@ -410,9 +408,7 @@ mod tests {
     #[test]
     fn test_all_capture_names_use_allowed_tags() {
         for language_setting in LANGUAGE_SETTINGS {
-            if language_setting.type_ == LanguageType::Text
-                || language_setting.type_ == LanguageType::Markdown
-            {
+            if language_setting.type_ == LanguageType::Text {
                 continue;
             }
 
@@ -428,10 +424,11 @@ mod tests {
             });
 
             for name in query.capture_names() {
+                let is_allowed = ALLOWED_TAGS.contains(name) || name.starts_with("injection.");
                 assert!(
-                    ALLOWED_TAGS.contains(name),
+                    is_allowed,
                     "Language {:?} uses unknown capture tag @{name}. \
-                     Allowed tags: {ALLOWED_TAGS:?}",
+                     Allowed tags: {ALLOWED_TAGS:?} (plus injection.* tags)",
                     language_setting.type_,
                 );
             }
diff --git a/crates/codebook/src/queries/markdown.scm b/crates/codebook/src/queries/markdown.scm
new file mode 100644
index 00000000..dd1b27d5
--- /dev/null
+++ b/crates/codebook/src/queries/markdown.scm
@@ -0,0 +1,6 @@
+(paragraph (inline) @string)
+(atx_heading (inline) @string)
+(html_block) @injection.html
+(fenced_code_block
+  (info_string (language) @injection.language)
+  (code_fence_content) @injection.content)
diff --git a/crates/codebook/src/regions.rs b/crates/codebook/src/regions.rs
deleted file mode 100644
index 35c3df1c..00000000
--- a/crates/codebook/src/regions.rs
+++ /dev/null
@@ -1,317 +0,0 @@
-use crate::queries::LanguageType;
-use std::collections::HashMap;
-use std::str::FromStr;
-use std::sync::{LazyLock, Mutex};
-use tree_sitter::Parser;
-
-/// A region of a file associated with a single language.
-/// For most files, there's one region covering the whole file.
-/// For multi-language files (markdown, astro, vue), there are multiple.
-#[derive(Debug, Clone, PartialEq)]
-pub struct TextRegion {
-    /// Byte range start in the original document
-    pub start_byte: usize,
-    /// Byte range end in the original document
-    pub end_byte: usize,
-    /// Which language governs this region
-    pub language: LanguageType,
-}
-
-/// Parser cache for region extraction (separate from the main parser cache
-/// since region extraction uses different grammars/queries than node extraction).
-static REGION_PARSER_CACHE: LazyLock<Mutex<HashMap<LanguageType, Parser>>> =
-    LazyLock::new(|| Mutex::new(HashMap::new()));
-
-/// Extract language regions from a document.
-/// For single-language files, returns one region covering the whole text.
-/// For multi-language files (markdown), returns multiple regions.
-pub fn extract_regions(text: &str, language: LanguageType) -> Vec<TextRegion> {
-    match language {
-        LanguageType::Markdown => extract_markdown_regions(text),
-        _ => vec![TextRegion {
-            start_byte: 0,
-            end_byte: text.len(),
-            language,
-        }],
-    }
-}
-
-/// Map markdown info strings to LanguageType.
-/// Uses LanguageType::from_str which checks ids and file extensions
-/// in LANGUAGE_SETTINGS. Returns None for unknown or empty strings.
-fn resolve_info_string(info: &str) -> Option<LanguageType> {
-    let trimmed = info.trim().to_lowercase();
-    if trimmed.is_empty() {
-        return None;
-    }
-    match LanguageType::from_str(&trimmed) {
-        Ok(LanguageType::Text) => None, // from_str returns Text for unknown
-        Ok(lang) => Some(lang),
-        Err(_) => None,
-    }
-}
-
-/// Extract regions from a markdown file.
-/// Prose sections become Markdown regions (treated as plain text in node extraction).
-/// Fenced code blocks become regions of the appropriate language.
-fn extract_markdown_regions(text: &str) -> Vec<TextRegion> {
-    let lang: tree_sitter::Language = tree_sitter_md::LANGUAGE.into();
-
-    let tree = {
-        let mut cache = REGION_PARSER_CACHE.lock().unwrap();
-        let parser = cache.entry(LanguageType::Markdown).or_insert_with(|| {
-            let mut parser = Parser::new();
-            parser.set_language(&lang).unwrap();
-            parser
-        });
-        parser.parse(text, None).unwrap()
-    };
-
-    let mut regions = Vec::new();
-    let root = tree.root_node();
-    let provider = text.as_bytes();
-
-    walk_markdown_node(root, provider, &mut regions);
-
-    // Sort by start position
-    regions.sort_by_key(|r| r.start_byte);
-
-    // If no regions found (empty file, etc.), return the whole thing as markdown
-    if regions.is_empty() {
-        return vec![TextRegion {
-            start_byte: 0,
-            end_byte: text.len(),
-            language: LanguageType::Markdown,
-        }];
-    }
-
-    regions
-}
-
-/// Recursively walk markdown AST to find prose and code block regions.
-fn walk_markdown_node(node: tree_sitter::Node, source: &[u8], regions: &mut Vec<TextRegion>) {
-    match node.kind() {
-        "fenced_code_block" => {
-            // Find info_string and code_fence_content children
-            let mut info_string = None;
-            let mut code_content = None;
-            let mut cursor = node.walk();
-            for child in node.children(&mut cursor) {
-                match child.kind() {
-                    "info_string" => {
-                        // Get the language child of info_string
-                        let mut ic = child.walk();
-                        for info_child in child.children(&mut ic) {
-                            if info_child.kind() == "language" {
-                                info_string =
-                                    Some(info_child.utf8_text(source).unwrap_or("").to_string());
-                            }
-                        }
-                    }
-                    "code_fence_content" => {
-                        code_content = Some((child.start_byte(), child.end_byte()));
-                    }
-                    _ => {}
-                }
-            }
-
-            if let Some((start, end)) = code_content
-                && start < end
-                && let Some(info) = info_string
-                && let Some(lang) = resolve_info_string(&info)
-            {
-                regions.push(TextRegion {
-                    start_byte: start,
-                    end_byte: end,
-                    language: lang,
-                });
-            }
-        }
-        "html_block" => {
-            // Block-level HTML — treat as an HTML region
-            if node.start_byte() < node.end_byte() {
-                regions.push(TextRegion {
-                    start_byte: node.start_byte(),
-                    end_byte: node.end_byte(),
-                    language: LanguageType::HTML,
-                });
-            }
-        }
-        "inline" => {
-            // Check parent — we want inline content from paragraphs and headings
-            if let Some(parent) = node.parent() {
-                match parent.kind() {
-                    "paragraph" | "atx_heading" | "setext_heading" => {
-                        if node.start_byte() < node.end_byte() {
-                            regions.push(TextRegion {
-                                start_byte: node.start_byte(),
-                                end_byte: node.end_byte(),
-                                language: LanguageType::Markdown,
-                            });
-                        }
-                    }
-                    _ => {}
-                }
-            }
-        }
-        _ => {
-            // Recurse into children
-            let mut cursor = node.walk();
-            for child in node.children(&mut cursor) {
-                walk_markdown_node(child, source, regions);
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_single_language_region() {
-        let regions = extract_regions("fn main() {}", LanguageType::Rust);
-        assert_eq!(regions.len(), 1);
-        assert_eq!(regions[0].language, LanguageType::Rust);
-        assert_eq!(regions[0].start_byte, 0);
-        assert_eq!(regions[0].end_byte, 12);
-    }
-
-    #[test]
-    fn test_text_region() {
-        let regions = extract_regions("hello world", LanguageType::Text);
-        assert_eq!(regions.len(), 1);
-        assert_eq!(regions[0].language, LanguageType::Text);
-    }
-
-    #[test]
-    fn test_markdown_prose_only() {
-        let text = "# Hello World\n\nSome paragraph text.\n";
-        let regions = extract_regions(text, LanguageType::Markdown);
-        assert!(regions.len() >= 2); // heading + paragraph
-        for r in &regions {
-            assert_eq!(r.language, LanguageType::Markdown);
-        }
-    }
-
-    #[test]
-    fn test_markdown_with_code_block() {
-        let text = "# Hello\n\nSome text.\n\n```python\ndef foo():\n    pass\n```\n\nMore text.\n";
-        let regions = extract_regions(text, LanguageType::Markdown);
-        println!("Regions: {regions:#?}");
-
-        // Should have markdown prose regions + python code region
-        let python_regions: Vec<_> = regions
-            .iter()
-            .filter(|r| r.language == LanguageType::Python)
-            .collect();
-        assert_eq!(python_regions.len(), 1, "Expected one Python region");
-
-        let md_regions: Vec<_> = regions
-            .iter()
-            .filter(|r| r.language == LanguageType::Markdown)
-            .collect();
-        assert!(
-            md_regions.len() >= 2,
-            "Expected at least 2 markdown prose regions"
-        );
-    }
-
-    #[test]
-    fn test_markdown_unknown_language_skipped() {
-        let text = "# Hello\n\n```unknownlang\nsome code\n```\n\nMore text.\n";
-        let regions = extract_regions(text, LanguageType::Markdown);
-        // Unknown language code block should produce no region
-        for r in &regions {
-            assert_eq!(r.language, LanguageType::Markdown);
-        }
-    }
-
-    #[test]
-    fn test_markdown_no_info_string_skipped() {
-        let text = "# Hello\n\n```\nsome code\n```\n\nMore text.\n";
-        let regions = extract_regions(text, LanguageType::Markdown);
-        // Code block without info string should produce no region
-        for r in &regions {
-            assert_eq!(r.language, LanguageType::Markdown);
-        }
-    }
-
-    #[test]
-    fn test_markdown_html_block() {
-        let text = "# Hello\n\n<div class=\"foo\">\n  <p>A paragraph</p>\n</div>\n\nMore text.\n";
-        let regions = extract_regions(text, LanguageType::Markdown);
-        println!("Regions: {regions:#?}");
-
-        let html_regions: Vec<_> = regions
-            .iter()
-            .filter(|r| r.language == LanguageType::HTML)
-            .collect();
-        assert_eq!(html_regions.len(), 1, "Expected one HTML region");
-
-        let md_regions: Vec<_> = regions
-            .iter()
-            .filter(|r| r.language == LanguageType::Markdown)
-            .collect();
-        assert!(md_regions.len() >= 2, "Expected heading + paragraph prose regions");
-    }
-
-    #[test]
-    fn test_resolve_info_string_aliases() {
-        assert_eq!(resolve_info_string("py"), Some(LanguageType::Python));
-        assert_eq!(resolve_info_string("js"), Some(LanguageType::Javascript));
-        assert_eq!(resolve_info_string("ts"), Some(LanguageType::Typescript));
-        assert_eq!(resolve_info_string("sh"), Some(LanguageType::Bash));
-        assert_eq!(resolve_info_string("rs"), Some(LanguageType::Rust));
-        assert_eq!(resolve_info_string("rb"), Some(LanguageType::Ruby));
-        assert_eq!(resolve_info_string("yml"), Some(LanguageType::YAML));
-        assert_eq!(resolve_info_string("c++"), Some(LanguageType::Cpp));
-        assert_eq!(resolve_info_string(""), None);
-        assert_eq!(resolve_info_string("unknownlang"), None);
-    }
-
-    #[test]
-    fn test_resolve_info_string_vscode_ids() {
-        assert_eq!(resolve_info_string("python"), Some(LanguageType::Python));
-        assert_eq!(
-            resolve_info_string("javascript"),
-            Some(LanguageType::Javascript)
-        );
-        assert_eq!(resolve_info_string("rust"), Some(LanguageType::Rust));
-        assert_eq!(resolve_info_string("bash"), Some(LanguageType::Bash));
-        assert_eq!(resolve_info_string("go"), Some(LanguageType::Go));
-    }
-
-    #[test]
-    fn test_markdown_multiple_code_blocks() {
-        let text = "Text.\n\n```bash\nmkdir dir\n```\n\n```python\nx = 1\n```\n\nEnd.\n";
-        let regions = extract_regions(text, LanguageType::Markdown);
-
-        let bash_regions: Vec<_> = regions
-            .iter()
-            .filter(|r| r.language == LanguageType::Bash)
-            .collect();
-        let python_regions: Vec<_> = regions
-            .iter()
-            .filter(|r| r.language == LanguageType::Python)
-            .collect();
-
-        assert_eq!(bash_regions.len(), 1);
-        assert_eq!(python_regions.len(), 1);
-    }
-
-    #[test]
-    fn test_markdown_code_block_content_correct() {
-        let text = "Hello.\n\n```python\ndef foo():\n    pass\n```\n";
-        let regions = extract_regions(text, LanguageType::Markdown);
-        let py = regions
-            .iter()
-            .find(|r| r.language == LanguageType::Python)
-            .unwrap();
-        let content = &text[py.start_byte..py.end_byte];
-        assert!(
-            content.contains("def foo()"),
-            "Expected python code, got: {content:?}"
-        );
-    }
-}
diff --git a/crates/codebook/tests/test_tags.rs b/crates/codebook/tests/test_tags.rs
index 7123e15d..fd1b1f9c 100644
--- a/crates/codebook/tests/test_tags.rs
+++ b/crates/codebook/tests/test_tags.rs
@@ -24,7 +24,10 @@ fn check(text: &str, lang: LanguageType, include: Vec<&str>, exclude: Vec<&str>)
 fn test_no_filters_returns_all() {
     let words = check(RUST_SAMPLE, LanguageType::Rust, vec![], vec![]);
     // Should find typos in all three categories
-    assert!(words.contains(&"commet".to_string()), "missing comment typo");
+    assert!(
+        words.contains(&"commet".to_string()),
+        "missing comment typo"
+    );
     assert!(
         words.contains(&"calculat".to_string()),
         "missing identifier typo"
@@ -35,7 +38,10 @@ fn test_no_filters_returns_all() {
 #[test]
 fn test_include_comments_only() {
     let words = check(RUST_SAMPLE, LanguageType::Rust, vec!["comment"], vec![]);
-    assert!(words.contains(&"commet".to_string()), "missing comment typo");
+    assert!(
+        words.contains(&"commet".to_string()),
+        "missing comment typo"
+    );
     assert!(
         !words.contains(&"calculat".to_string()),
         "identifier should be excluded"
@@ -88,7 +94,10 @@ fn test_include_identifiers_only() {
 #[test]
 fn test_exclude_identifiers() {
     let words = check(RUST_SAMPLE, LanguageType::Rust, vec![], vec!["identifier"]);
-    assert!(words.contains(&"commet".to_string()), "missing comment typo");
+    assert!(
+        words.contains(&"commet".to_string()),
+        "missing comment typo"
+    );
     assert!(words.contains(&"strng".to_string()), "missing string typo");
     assert!(
         !words.contains(&"calculat".to_string()),
@@ -128,7 +137,10 @@ fn test_include_and_exclude_combined() {
         vec!["comment", "string"],
         vec!["string"],
     );
-    assert!(words.contains(&"commet".to_string()), "missing comment typo");
+    assert!(
+        words.contains(&"commet".to_string()),
+        "missing comment typo"
+    );
     assert!(
         !words.contains(&"strng".to_string()),
         "string should be excluded by exclude_tags"
diff --git a/crates/codebook/tests/test_vhdl.rs b/crates/codebook/tests/test_vhdl.rs
index 0bfd6fff..ebfd0727 100644
--- a/crates/codebook/tests/test_vhdl.rs
+++ b/crates/codebook/tests/test_vhdl.rs
@@ -19,14 +19,7 @@ entity calculatr is
     );
 end entity calculatr;
 "#;
-    let expected = vec![
-        "calculatr",
-        "clk",
-        "exmple",
-        "inputt",
-        "resett",
-        "speling",
-    ];
+    let expected = vec!["calculatr", "clk", "exmple", "inputt", "resett", "speling"];
     let binding = processor
         .spell_check(sample_text, Some(LanguageType::VHDL), None)
         .to_vec();
diff --git a/crates/downloader/src/lib.rs b/crates/downloader/src/lib.rs
index 6ea84062..9edbaf63 100644
--- a/crates/downloader/src/lib.rs
+++ b/crates/downloader/src/lib.rs
@@ -66,7 +66,9 @@ impl Downloader {
     }
 
     #[cfg(not(target_os = "android"))]
-    fn build_tls_config(crypto_provider: std::sync::Arc<rustls::crypto::CryptoProvider>) -> ClientConfig {
+    fn build_tls_config(
+        crypto_provider: std::sync::Arc<rustls::crypto::CryptoProvider>,
+    ) -> ClientConfig {
         // Try OS cert chains first (proxy support), fall back to bundled Mozilla roots
         ClientConfig::builder_with_provider(crypto_provider.clone())
             .with_safe_default_protocol_versions()
@@ -80,13 +82,17 @@ impl Downloader {
     }
 
     #[cfg(target_os = "android")]
-    fn build_tls_config(crypto_provider: std::sync::Arc<rustls::crypto::CryptoProvider>) -> ClientConfig {
+    fn build_tls_config(
+        crypto_provider: std::sync::Arc<rustls::crypto::CryptoProvider>,
+    ) -> ClientConfig {
         // Android (Termux) doesn't support rustls-platform-verifier without JNI,
         // so use bundled Mozilla CA roots directly.
         Self::build_webpki_tls_config(crypto_provider)
     }
 
-    fn build_webpki_tls_config(crypto_provider: std::sync::Arc<rustls::crypto::CryptoProvider>) -> ClientConfig {
+    fn build_webpki_tls_config(
+        crypto_provider: std::sync::Arc<rustls::crypto::CryptoProvider>,
+    ) -> ClientConfig {
         let mut root_store = rustls::RootCertStore::empty();
         root_store.extend(webpki_roots::TLS_SERVER_ROOTS.iter().cloned());
         ClientConfig::builder_with_provider(crypto_provider)

From 3730eaf8391207b6ff5b56c68a984e42babbd88c Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 11:52:11 -0700
Subject: [PATCH 04/16] Fix duplicate span regression and add missing test
 coverage

- Use HashSet<TextRange> in checker.rs to deduplicate identical spans,
  matching the old main branch behavior that used HashSet per word.
- Add test for duplicate span deduplication in checker.rs.
- Add test for injected region byte offset correctness (verifies
  offsets from Python code blocks map back to the right document
  position).
- Add test for no duplicate spans in block quotes.
- Fix misleading comment on bash code block test: mkdir passes because
  bash.scm doesn't capture command invocations, not because of a bash
  dictionary.
---
 crates/codebook/src/checker.rs         | 31 +++++++++++---
 crates/codebook/src/parser.rs          |  3 ++
 crates/codebook/tests/test_markdown.rs | 58 +++++++++++++++++++++++++-
 3 files changed, 85 insertions(+), 7 deletions(-)

diff --git a/crates/codebook/src/checker.rs b/crates/codebook/src/checker.rs
index 7b8dac0e..f79eb082 100644
--- a/crates/codebook/src/checker.rs
+++ b/crates/codebook/src/checker.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 
 use crate::dictionaries::dictionary::Dictionary;
 use crate::parser::{TextRange, WordLocation};
@@ -15,19 +15,19 @@ pub struct WordCandidate {
 
 /// Check candidate words against dictionaries and config rules.
 /// Returns WordLocations for misspelled words, grouping all locations
-/// of the same word together.
+/// of the same word together. Duplicate spans are deduplicated.
 pub fn check_words(
     candidates: &[WordCandidate],
     dictionaries: &[std::sync::Arc<dyn Dictionary>],
     config: &dyn CodebookConfig,
 ) -> Vec<WordLocation> {
-    // Deduplicate: group candidates by word text
-    let mut word_positions: HashMap<&str, Vec<TextRange>> = HashMap::new();
+    // Group candidates by word text, deduplicating identical spans
+    let mut word_positions: HashMap<&str, HashSet<TextRange>> = HashMap::new();
     for candidate in candidates {
         word_positions
             .entry(&candidate.word)
             .or_default()
-            .push(TextRange {
+            .insert(TextRange {
                 start_byte: candidate.start_byte,
                 end_byte: candidate.end_byte,
             });
@@ -36,6 +36,7 @@ pub fn check_words(
     // Check each unique word once
     let mut results = Vec::new();
     for (word, positions) in word_positions {
+        let positions: Vec<TextRange> = positions.into_iter().collect();
         if config.should_flag_word(word) {
             results.push(WordLocation::new(word.to_string(), positions));
             continue;
@@ -111,4 +112,24 @@ mod tests {
         let results = check_words(&candidates, &[dict], config.as_ref());
         assert!(results.is_empty(), "Allowed words should not be flagged");
     }
+
+    #[test]
+    fn test_check_words_deduplicates_identical_spans() {
+        let dict = Arc::new(TextDictionary::new("hello\n"));
+        let config = Arc::new(codebook_config::CodebookConfigMemory::default());
+        // Same word at the exact same position — should be deduplicated
+        let candidates = make_candidates(&[
+            ("wrld", 0, 4),
+            ("wrld", 0, 4),
+            ("wrld", 0, 4),
+        ]);
+        let results = check_words(&candidates, &[dict], config.as_ref());
+        assert_eq!(results.len(), 1);
+        assert_eq!(results[0].word, "wrld");
+        assert_eq!(
+            results[0].locations.len(),
+            1,
+            "Identical spans should be deduplicated to one location"
+        );
+    }
 }
diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs
index 424420c9..45e28bfc 100644
--- a/crates/codebook/src/parser.rs
+++ b/crates/codebook/src/parser.rs
@@ -15,6 +15,7 @@ use unicode_segmentation::UnicodeSegmentation;
 static PARSER_CACHE: LazyLock<Mutex<HashMap<LanguageType, Parser>>> =
     LazyLock::new(|| Mutex::new(HashMap::new()));
 
+
 #[derive(Debug, Clone, Copy, PartialEq, Ord, Eq, PartialOrd, Hash)]
 pub struct TextRange {
     /// Start position in utf-8 byte offset
@@ -166,6 +167,8 @@ fn extract_recursive(
 
     let root_node = tree.root_node();
     let lang = language_setting.language().unwrap();
+    // Query compilation is cheap (microseconds for small .scm files).
+    // Caching would conflict with the recursive mutex on PARSER_CACHE.
     let query = Query::new(&lang, language_setting.query).unwrap();
     let capture_names: Vec<String> = query
         .capture_names()
diff --git a/crates/codebook/tests/test_markdown.rs b/crates/codebook/tests/test_markdown.rs
index fb399469..703fc424 100644
--- a/crates/codebook/tests/test_markdown.rs
+++ b/crates/codebook/tests/test_markdown.rs
@@ -44,6 +44,10 @@ fn test_markdown_heading() {
 fn test_markdown_fenced_code_block_known_lang() {
     utils::init_logging();
     let processor = utils::get_processor();
+    // Note: bash.scm only captures comments, strings, function names,
+    // heredocs, and variable names — NOT command invocations.
+    // So mkdir/some_dir are not checked because bash.scm doesn't capture them,
+    // not because they're in a bash dictionary.
     let sample_text = r#"# Hello World
 
 Some correct text here.
@@ -59,9 +63,8 @@ More correct text here.
         .to_vec();
     let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
     println!("Misspelled words: {words:?}");
-    // bash builtins like mkdir should be recognized by the bash dictionary
+    // bash.scm doesn't capture command invocations, so these are not checked
     assert!(!words.contains(&"mkdir"));
-    // dir is a common abbreviation, should not be flagged
     assert!(!words.contains(&"dir"));
 }
 
@@ -204,3 +207,54 @@ More text.
     // wrld should be flagged as a function name typo in both languages
     assert!(words.contains(&"wrld"));
 }
+
+#[test]
+fn test_markdown_injected_region_byte_offsets() {
+    utils::init_logging();
+    let processor = utils::get_processor();
+    // Verify that byte offsets from injected regions map back correctly
+    // to the original document coordinates.
+    //                       0         1         2         3
+    //                       0123456789012345678901234567890123456789
+    let sample_text = "# OK\n\n```python\ndef some_functin(): pass\n```\n";
+    //                       bytes 6..=15 = "```python\n" fence line (each escape is 1 byte)
+    //                       "def some_functin(): pass\n" starts at byte 16
+    //                       "functin" is at offset 9 within "def some_functin"
+    //                       so global offset = 16 + 9 = 25
+    let misspelled = processor
+        .spell_check(sample_text, Some(LanguageType::Markdown), None)
+        .to_vec();
+    println!("Misspelled words: {misspelled:?}");
+    let functin = misspelled.iter().find(|w| w.word == "functin");
+    assert!(functin.is_some(), "Expected 'functin' to be flagged");
+    let loc = &functin.unwrap().locations[0];
+    // Verify the byte offsets point to the right place in the original document
+    assert_eq!(
+        &sample_text[loc.start_byte..loc.end_byte],
+        "functin",
+        "Byte offsets should map back to 'functin' in the original document"
+    );
+}
+
+#[test]
+fn test_markdown_no_duplicate_spans() {
+    utils::init_logging();
+    let processor = utils::get_processor();
+    // Block quotes contain paragraphs — make sure the inline content
+    // isn't captured twice (once for the paragraph, once for the block quote)
+    let sample_text = "> A tyypo in a block quoet.\n";
+    let misspelled = processor
+        .spell_check(sample_text, Some(LanguageType::Markdown), None)
+        .to_vec();
+    for result in &misspelled {
+        let unique_count = result.locations.len();
+        let deduped: std::collections::HashSet<_> = result.locations.iter().collect();
+        assert_eq!(
+            unique_count,
+            deduped.len(),
+            "Word '{}' has duplicate spans: {:?}",
+            result.word,
+            result.locations
+        );
+    }
+}

From 5749d57e90fcee73257ce3a35aa8941dc15562b2 Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 11:57:35 -0700
Subject: [PATCH 05/16] Compile tree-sitter queries eagerly at startup

Replace per-call Query::new with a static COMPILED_QUERIES map (a
LazyLock) that compiles all .scm queries in one pass on first access.
Since queries come from include_str! and never change at runtime, this
avoids recompilation on every recursive injection call. An invalid query
for any language now panics on that first access rather than hiding the
failure until a user opens that file type.
---
 crates/codebook/src/parser.rs | 54 ++++++++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs
index 45e28bfc..da0465e4 100644
--- a/crates/codebook/src/parser.rs
+++ b/crates/codebook/src/parser.rs
@@ -1,5 +1,5 @@
 use crate::checker::WordCandidate;
-use crate::queries::{LanguageType, get_language_setting};
+use crate::queries::{LanguageType, LANGUAGE_SETTINGS, get_language_setting};
 use crate::splitter;
 use regex::Regex;
 use std::collections::{HashMap, HashSet};
@@ -15,6 +15,40 @@ use unicode_segmentation::UnicodeSegmentation;
 static PARSER_CACHE: LazyLock<Mutex<HashMap<LanguageType, Parser>>> =
     LazyLock::new(|| Mutex::new(HashMap::new()));
 
+/// Pre-compiled query for a language, with its capture names.
+struct CompiledQuery {
+    query: Query,
+    capture_names: Vec<String>,
+}
+
+/// All tree-sitter queries, compiled together in one pass the first time
+/// this static is accessed (`LazyLock`). Since queries come from static
+/// `include_str!` data, they never change at runtime. Compiling them all at
+/// once means a bad query panics on that first access rather than hiding
+/// until a user opens that file type.
+static COMPILED_QUERIES: LazyLock<HashMap<LanguageType, CompiledQuery>> = LazyLock::new(|| {
+    let mut map = HashMap::new();
+    for setting in LANGUAGE_SETTINGS {
+        let Some(lang) = setting.language() else {
+            continue;
+        };
+        if setting.query.is_empty() {
+            continue;
+        }
+        let query = Query::new(&lang, setting.query).unwrap_or_else(|e| {
+            panic!(
+                "Failed to compile query for {:?}: {e}",
+                setting.type_
+            )
+        });
+        let capture_names = query.capture_names().iter().map(|s| s.to_string()).collect();
+        map.insert(setting.type_, CompiledQuery {
+            query,
+            capture_names,
+        });
+    }
+    map
+});
+
 
 #[derive(Debug, Clone, Copy, PartialEq, Ord, Eq, PartialOrd, Hash)]
 pub struct TextRange {
@@ -166,18 +200,12 @@ fn extract_recursive(
     };
 
     let root_node = tree.root_node();
-    let lang = language_setting.language().unwrap();
-    // Query compilation is cheap (microseconds for small .scm files).
-    // Caching would conflict with the recursive mutex on PARSER_CACHE.
-    let query = Query::new(&lang, language_setting.query).unwrap();
-    let capture_names: Vec<String> = query
-        .capture_names()
-        .iter()
-        .map(|s| s.to_string())
-        .collect();
+    let compiled = COMPILED_QUERIES.get(&language).expect(
+        "Language has a LanguageSetting but no compiled query — this should not happen",
+    );
     let mut cursor = QueryCursor::new();
     let provider = region_text.as_bytes();
-    let mut matches_query = cursor.matches(&query, root_node, provider);
+    let mut matches_query = cursor.matches(&compiled.query, root_node, provider);
 
     while let Some(match_) = matches_query.next() {
         // First pass: look for dynamic injection pairs in this match
@@ -185,7 +213,7 @@ fn extract_recursive(
         let mut injection_language_text: Option<String> = None;
 
         for capture in match_.captures {
-            let tag = &capture_names[capture.index as usize];
+            let tag = &compiled.capture_names[capture.index as usize];
             if tag == "injection.content" {
                 injection_content = Some(capture.node);
             } else if tag == "injection.language" {
@@ -222,7 +250,7 @@ fn extract_recursive(
 
         // Second pass: handle text captures and static injections
         for capture in match_.captures {
-            let tag = &capture_names[capture.index as usize];
+            let tag = &compiled.capture_names[capture.index as usize];
             let node = capture.node;
             let node_start = node.start_byte() + start_byte;
             let node_end = node.end_byte() + start_byte;

From fcabf2d692ad5672e21abd2c0f8165669290694d Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 12:06:26 -0700
Subject: [PATCH 06/16] Add tests for tag filters through injected regions

Verify that include_tags and exclude_tags are correctly applied inside
injected code blocks (e.g. Python inside markdown), not just at the
top-level language.
---
 crates/codebook/tests/test_tags.rs | 75 ++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/crates/codebook/tests/test_tags.rs b/crates/codebook/tests/test_tags.rs
index fd1b1f9c..c0108825 100644
--- a/crates/codebook/tests/test_tags.rs
+++ b/crates/codebook/tests/test_tags.rs
@@ -151,6 +151,81 @@ fn test_include_and_exclude_combined() {
     );
 }
 
+// =============================================================================
+// Tag filters through injected regions (markdown → code blocks)
+// =============================================================================
+
+/// Markdown with a Python code block containing typos in a comment,
+/// a function name (identifier), and a string.
+const MARKDOWN_WITH_PYTHON: &str = r#"# A heading
+
+Some prose.
+
+```python
+# A commet in python
+def calculat_age():
+    x = "a strng value"
+```
+"#;
+
+#[test]
+fn test_injection_no_filters_returns_all() {
+    let words = check(
+        MARKDOWN_WITH_PYTHON,
+        LanguageType::Markdown,
+        vec![],
+        vec![],
+    );
+    assert!(words.contains(&"commet".to_string()), "missing comment typo from injected python");
+    assert!(words.contains(&"calculat".to_string()), "missing identifier typo from injected python");
+    assert!(words.contains(&"strng".to_string()), "missing string typo from injected python");
+}
+
+#[test]
+fn test_injection_include_comments_only() {
+    // include_tags = ["comment"] should only check comments,
+    // even inside injected code blocks
+    let words = check(
+        MARKDOWN_WITH_PYTHON,
+        LanguageType::Markdown,
+        vec!["comment"],
+        vec![],
+    );
+    assert!(words.contains(&"commet".to_string()), "comment typo should be found");
+    assert!(!words.contains(&"calculat".to_string()), "identifier should be excluded in injected region");
+    assert!(!words.contains(&"strng".to_string()), "string should be excluded in injected region");
+}
+
+#[test]
+fn test_injection_exclude_identifiers() {
+    // exclude_tags = ["identifier"] should suppress identifiers
+    // in both prose and injected code blocks
+    let words = check(
+        MARKDOWN_WITH_PYTHON,
+        LanguageType::Markdown,
+        vec![],
+        vec!["identifier"],
+    );
+    assert!(words.contains(&"commet".to_string()), "comment should still be checked");
+    assert!(words.contains(&"strng".to_string()), "string should still be checked");
+    assert!(!words.contains(&"calculat".to_string()), "identifier should be excluded in injected region");
+}
+
+#[test]
+fn test_injection_include_strings_only() {
+    // include_tags = ["string"] should check strings in both
+    // markdown prose (which uses @string) and injected python
+    let words = check(
+        MARKDOWN_WITH_PYTHON,
+        LanguageType::Markdown,
+        vec!["string"],
+        vec![],
+    );
+    assert!(words.contains(&"strng".to_string()), "string typo in injected python should be found");
+    assert!(!words.contains(&"commet".to_string()), "comment should be excluded");
+    assert!(!words.contains(&"calculat".to_string()), "identifier should be excluded");
+}
+
 #[test]
 fn test_text_language_ignores_tags() {
     // Text language doesn't use tree-sitter, so tags should have no effect

From 2eb40a90d37cd82e16d4a094955282156872b3b5 Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 12:09:43 -0700
Subject: [PATCH 07/16] Remove old doc

---
 refactor.md | 574 ----------------------------------------------------
 1 file changed, 574 deletions(-)
 delete mode 100644 refactor.md

diff --git a/refactor.md b/refactor.md
deleted file mode 100644
index dab3a174..00000000
--- a/refactor.md
+++ /dev/null
@@ -1,574 +0,0 @@
-# Codebook Architecture Refactor
-
-## Goal
-
-Restructure the `codebook` crate internals to support multi-language files (markdown with code blocks, Astro/Vue/Svelte, HTML with `<script>`/`<style>`) and lay groundwork for control comments, custom dictionaries, and a CLI. No public LSP protocol changes needed — the refactor is internal to the `codebook` and `codebook-config` crates.
-
-## Current Architecture
-
-```
-LSP Backend
-    → Codebook::spell_check(text, ONE language, file_path)
-        → resolve_language()           // pick one LanguageType
-        → get_dictionaries()           // load dicts for that one language
-        → parser::find_locations()     // do everything in one function:
-            ├─ Text path: word-boundary split entire text
-            └─ Code path: tree-sitter parse + query + word extract + dict check
-        → return Vec<WordLocation>
-```
-
-### Problems
-
-1. **`find_locations` does too much.** It parses, queries, extracts words, applies skip patterns, and checks dictionaries — all in one function. You can't insert new stages (control comments, injection) without forking the function.
-
-2. **One language per file.** `spell_check` resolves a single `LanguageType` and uses it for the entire file. No way to handle embedded languages.
-
-3. **Dictionary selection is coupled to language resolution.** Dictionaries are gathered once based on the single resolved language. With multiple languages per file, different regions need different dictionaries.
-
-4. **Skip patterns are applied inconsistently.** For `Text` mode, skip patterns are applied during word extraction. For code mode, skip patterns are applied after word extraction against global byte offsets. Both happen inside `find_locations`.
-
-5. **`LanguageType::Text` is a special case everywhere.** The `Text` variant returns `None` from `language()`, has no `.scm` file, no `LanguageSetting` entry returned by `get_language_setting`, and takes a completely different code path in `find_locations`. It's an implicit "not really a language" variant.
-
-## Proposed Architecture
-
-### Pipeline
-
-```
-Codebook::spell_check(text, language, file_path)
-    │
-    ▼
-┌─────────────────────────────┐
-│  Stage 1: Region Extraction │  Split file into typed regions
-│  (one language per region)  │  Most languages: 1 region = whole file
-└─────────────┬───────────────┘  Markdown/HTML/Astro: multiple regions
-              │
-              ▼
-┌─────────────────────────────┐
-│  Stage 2: Node Extraction   │  Per region: tree-sitter parse + query
-│  (AST nodes to check)       │  Returns tagged text spans
-└─────────────┬───────────────┘
-              │
-              ▼
-┌─────────────────────────────┐
-│  Stage 3: Word Extraction   │  Per node: split words, apply skip patterns
-│  (candidate words)          │  Uses splitter + TextProcessor
-└─────────────┬───────────────┘
-              │
-              ▼
-┌─────────────────────────────┐
-│  Stage 4: Word Checking     │  Per word: dictionary lookup + config rules
-│  (misspelled words)         │  flag_words, allowed_words, min_length
-└─────────────┬───────────────┘
-              │
-              ▼
-         Vec<WordLocation>
-```
-
-Each stage is a separate function with clear inputs and outputs. No closures passed between stages — data flows as concrete types.
-
-### Data Types
-
-```rust
-/// A region of a file associated with a single language.
-/// For most files, there's one region covering the whole file.
-/// For multi-language files (markdown, astro, vue), there are multiple.
-pub struct TextRegion {
-    /// Byte range in the original document
-    pub start_byte: usize,
-    pub end_byte: usize,
-    /// Which language governs this region
-    pub language: LanguageType,
-}
-
-/// A text span extracted from a tree-sitter query match.
-/// Coordinates are in original-document byte offsets.
-pub struct TextNode {
-    /// Byte range in the original document
-    pub start_byte: usize,
-    pub end_byte: usize,
-    /// The text content of this node
-    pub text: String,
-    /// The capture tag (e.g. "comment", "string", "identifier.function")
-    pub tag: String,
-}
-
-/// A candidate word extracted from a TextNode, with its position
-/// in original-document byte offsets.
-pub struct WordCandidate {
-    pub word: String,
-    pub start_byte: usize,
-    pub end_byte: usize,
-}
-```
-
-`WordLocation` (the final output) stays the same — it groups all locations of a misspelled word together.
-
-### Stage 1: Region Extraction
-
-```rust
-// In a new module: src/regions.rs
-
-/// Extract language regions from a document.
-/// For single-language files, returns one region covering the whole text.
-/// For multi-language files (markdown, astro, vue, html), returns multiple.
-pub fn extract_regions(text: &str, language: LanguageType) -> Vec<TextRegion> {
-    match language {
-        LanguageType::Markdown => extract_markdown_regions(text),
-        // Future: LanguageType::HTML => extract_html_regions(text),
-        // Future: LanguageType::Astro => extract_astro_regions(text),
-        _ => vec![TextRegion {
-            start_byte: 0,
-            end_byte: text.len(),
-            language,
-        }],
-    }
-}
-```
-
-**Markdown region extraction** parses with `tree_sitter_md`, walks the tree, and produces regions:
-- `paragraph`, `atx_heading`, etc. → `LanguageType::Markdown` region
-- `fenced_code_block` with `info_string` "python" → `LanguageType::Python` region
-- `fenced_code_block` with unknown/missing info string → skip (no region)
-
-This replaces the current `markdown.scm` query approach. Instead of using tree-sitter queries to filter what markdown nodes to check, region extraction identifies the prose vs code boundary, and each region then goes through the normal stage 2 pipeline for its language.
-
-**Language alias resolution** for info strings:
-
-```rust
-/// Map markdown info strings to LanguageType.
-/// Handles common aliases beyond what LanguageType::from_str covers.
-fn resolve_info_string(info: &str) -> Option<LanguageType> {
-    // from_str already handles VS Code language IDs like "rust", "python", "javascript"
-    // Add common markdown aliases here
-    match info.trim().to_lowercase().as_str() {
-        "py" => Some(LanguageType::Python),
-        "js" => Some(LanguageType::Javascript),
-        "ts" => Some(LanguageType::Typescript),
-        "sh" | "zsh" | "fish" => Some(LanguageType::Bash),
-        "yml" => Some(LanguageType::YAML),
-        "c++" | "cc" | "cxx" | "hpp" => Some(LanguageType::Cpp),
-        "cs" => Some(LanguageType::CSharp),
-        "rb" => Some(LanguageType::Ruby),
-        "rs" => Some(LanguageType::Rust),
-        "tex" => Some(LanguageType::Latex),
-        other => LanguageType::from_str(other).ok(),
-    }
-}
-```
-
-### Stage 2: Node Extraction
-
-```rust
-// Refactored from the tree-sitter parts of find_locations_code in src/parser.rs
-
-/// Extract spellcheckable text nodes from a region using tree-sitter.
-/// Returns nodes with byte offsets in original document coordinates.
-pub fn extract_nodes(
-    document_text: &str,
-    region: &TextRegion,
-    tag_filter: &dyn Fn(&str) -> bool,
-) -> Vec<TextNode> {
-    let region_text = &document_text[region.start_byte..region.end_byte];
-
-    match region.language {
-        LanguageType::Text => {
-            // Plain text: the whole region is one node
-            vec![TextNode {
-                start_byte: region.start_byte,
-                end_byte: region.end_byte,
-                text: region_text.to_string(),
-                tag: "string".to_string(),
-            }]
-        }
-        LanguageType::Markdown => {
-            // Markdown prose regions: treat as plain text
-            // (region extraction already stripped out code blocks)
-            vec![TextNode {
-                start_byte: region.start_byte,
-                end_byte: region.end_byte,
-                text: region_text.to_string(),
-                tag: "string".to_string(),
-            }]
-        }
-        _ => {
-            // Code: parse with tree-sitter, run query, extract captured nodes
-            extract_nodes_with_treesitter(region_text, region.start_byte, region.language, tag_filter)
-        }
-    }
-}
-
-/// Parse text with tree-sitter and extract nodes matching the language's query.
-fn extract_nodes_with_treesitter(
-    text: &str,
-    base_offset: usize,
-    language: LanguageType,
-    tag_filter: &dyn Fn(&str) -> bool,
-) -> Vec<TextNode> {
-    let language_setting = get_language_setting(language)?;
-
-    let tree = {
-        let mut cache = PARSER_CACHE.lock().unwrap();
-        let parser = cache.entry(language).or_insert_with(|| { /* ... */ });
-        parser.parse(text, None).unwrap()
-    };
-
-    let lang = language_setting.language().unwrap();
-    let query = Query::new(&lang, language_setting.query).unwrap();
-    let capture_names = query.capture_names();
-    let mut cursor = QueryCursor::new();
-    let mut nodes = Vec::new();
-
-    let mut matches = cursor.matches(&query, tree.root_node(), text.as_bytes());
-    while let Some(match_) = matches.next() {
-        for capture in match_.captures {
-            let tag = &capture_names[capture.index as usize];
-            if tag == "language" || !tag_filter(tag) {
-                continue;
-            }
-            let node = capture.node;
-            nodes.push(TextNode {
-                start_byte: node.start_byte() + base_offset,
-                end_byte: node.end_byte() + base_offset,
-                text: node.utf8_text(text.as_bytes()).unwrap().to_string(),
-                tag: tag.to_string(),
-            });
-        }
-    }
-    nodes
-}
-```
-
-Key change: this function **only** extracts nodes. It does not split words or check dictionaries. The `base_offset` parameter handles coordinate translation for injected regions — node byte offsets from tree-sitter are relative to the parsed text, but we need document-global offsets in the output.
-
-### Stage 3: Word Extraction
-
-```rust
-// Refactored from TextProcessor in src/parser.rs
-
-/// Extract candidate words from text nodes, applying skip patterns.
-/// All byte offsets are in original document coordinates.
-pub fn extract_words(
-    document_text: &str,
-    nodes: &[TextNode],
-    skip_patterns: &[Regex],
-) -> Vec<WordCandidate> {
-    // Compute skip ranges once against the full document
-    let skip_ranges = find_skip_ranges(document_text, skip_patterns);
-
-    let mut candidates = Vec::new();
-    for node in nodes {
-        let words = split_into_words(&node.text);
-        for split_word in words {
-            let global_start = split_word.start_byte + node.start_byte;
-            let global_end = global_start + split_word.word.len();
-
-            if is_within_skip_range(global_start, global_end, &skip_ranges) {
-                continue;
-            }
-
-            candidates.push(WordCandidate {
-                word: split_word.word.to_string(),
-                start_byte: global_start,
-                end_byte: global_end,
-            });
-        }
-    }
-    candidates
-}
-
-/// Split a text node's content into individual words using unicode
-/// segmentation and camelCase/snake_case splitting.
-/// This combines the existing TextProcessor word boundary logic
-/// with the splitter module.
-fn split_into_words(text: &str) -> Vec<SplitWord> {
-    // existing logic from TextProcessor::collect_split_words
-    // + splitter::split
-}
-```
-
-This is a pure function: text in, words out. No dictionary awareness, no language awareness.
-
-### Stage 4: Word Checking
-
-```rust
-/// Check candidate words against dictionaries and config rules.
-/// Returns WordLocations for misspelled words, grouping all locations
-/// of the same word together.
-pub fn check_words(
-    candidates: &[WordCandidate],
-    dictionaries: &[Arc<dyn Dictionary>],
-    config: &dyn CodebookConfig,
-) -> Vec<WordLocation> {
-    // Deduplicate: group candidates by word text
-    let mut word_positions: HashMap<&str, Vec<TextRange>> = HashMap::new();
-    for candidate in candidates {
-        word_positions
-            .entry(&candidate.word)
-            .or_default()
-            .push(TextRange {
-                start_byte: candidate.start_byte,
-                end_byte: candidate.end_byte,
-            });
-    }
-
-    // Check each unique word once
-    let mut results = Vec::new();
-    for (word, positions) in word_positions {
-        if config.should_flag_word(word) {
-            results.push(WordLocation::new(word.to_string(), positions));
-            continue;
-        }
-        if word.len() < config.get_min_word_length() {
-            continue;
-        }
-        if config.is_allowed_word(word) {
-            continue;
-        }
-        let is_correct = dictionaries.iter().any(|dict| dict.check(word));
-        if !is_correct {
-            results.push(WordLocation::new(word.to_string(), positions));
-        }
-    }
-    results
-}
-```
-
-This replaces the `check_function` closure that's currently threaded through `find_locations`. The closure pattern made it impossible to test word checking independently.
-
-### Orchestration in `Codebook::spell_check`
-
-```rust
-pub fn spell_check(
-    &self,
-    text: &str,
-    language: Option<LanguageType>,
-    file_path: Option<&str>,
-) -> Vec<WordLocation> {
-    // ... existing path ignore/include logic ...
-
-    let language = self.resolve_language(language, file_path);
-
-    // Build skip patterns once
-    let mut skip_patterns = get_default_skip_patterns().clone();
-    if let Some(user_patterns) = self.config.get_ignore_patterns() {
-        skip_patterns.extend(user_patterns);
-    }
-
-    // Stage 1: Split into language regions
-    let regions = regions::extract_regions(text, language);
-
-    // Collect dictionaries for all languages present in the file
-    let languages_in_file: Vec<LanguageType> = regions.iter().map(|r| r.language).collect();
-    let dictionaries = self.get_dictionaries_for_languages(&languages_in_file);
-
-    // Stages 2-4: Process each region
-    let mut all_candidates = Vec::new();
-    for region in &regions {
-        let nodes = parser::extract_nodes(text, region, &|tag| {
-            self.config.should_check_tag(tag)
-        });
-        let candidates = parser::extract_words(text, &nodes, &skip_patterns);
-        all_candidates.extend(candidates);
-    }
-
-    // Stage 4: Check all words at once (deduplicates across regions)
-    parser::check_words(&all_candidates, &dictionaries, self.config.as_ref())
-}
-```
-
-### Dictionary Selection Changes
-
-```rust
-/// Gather dictionaries for all languages present in a file.
-fn get_dictionaries_for_languages(
-    &self,
-    languages: &[LanguageType],
-) -> Vec<Arc<dyn Dictionary>> {
-    let mut dictionary_ids: Vec<String> = self.config.get_dictionary_ids();
-
-    // Add language-specific dictionaries for all languages in the file
-    for lang in languages {
-        dictionary_ids.extend(lang.dictionary_ids());
-    }
-
-    // Add defaults
-    dictionary_ids.extend(DEFAULT_DICTIONARIES.iter().map(|f| f.to_string()));
-
-    // Deduplicate
-    dictionary_ids.sort();
-    dictionary_ids.dedup();
-
-    dictionary_ids
-        .iter()
-        .filter_map(|id| self.manager.get_dictionary(id))
-        .collect()
-}
-```
-
-This replaces the current `get_dictionaries(Option<LanguageType>)` which only handles one language.
-
-## Module Layout After Refactor
-
-```
-codebook/src/
-├── lib.rs              # Codebook struct, spell_check orchestration
-├── regions.rs          # NEW: Stage 1 — region extraction
-├── parser.rs           # Stages 2+3 — node extraction, word extraction
-├── checker.rs          # NEW: Stage 4 — word checking
-├── splitter.rs         # Word splitting (camelCase, snake_case) — unchanged
-├── regexes.rs          # Skip patterns — unchanged
-├── queries.rs          # LanguageType, LanguageSetting, .scm files — unchanged
-├── queries/            # .scm query files — unchanged
-└── dictionaries/       # Dictionary loading — unchanged
-```
-
-Key moves:
-- `find_locations` and `find_locations_code` in `parser.rs` → split into `extract_nodes` + `extract_words`
-- Dictionary checking logic currently in `Codebook::spell_check` closure → `checker.rs::check_words`
-- Region extraction → new `regions.rs` module
-- `TextProcessor` stays in `parser.rs` but is simplified — it only does word extraction now, no dictionary checking
-
-## What Gets Deleted
-
-- `parser::find_locations()` — replaced by the pipeline orchestration in `Codebook::spell_check`
-- `parser::find_locations_code()` — split into `extract_nodes` + `extract_words`
-- `TextProcessor::process_words_with_check()` — word checking moves to stage 4
-- `dictionary::find_locations_with_dictionary_batch()` — unused after refactor
-- `queries/markdown.scm` — markdown region extraction replaces the query approach
-- The `check_function` closure pattern — replaced by concrete `check_words` function
-
-## What Stays the Same
-
-- All `.scm` query files (except `markdown.scm`)
-- `LanguageType` enum and `LANGUAGE_SETTINGS` table
-- `LanguageSetting` struct and `language()` method
-- `splitter::split()` — word splitting logic
-- `regexes.rs` — skip patterns
-- `dictionaries/` — all dictionary types, manager, repo
-- `CodebookConfig` trait and `CodebookConfigFile` implementation
-- `CodebookConfigMemory` for tests
-- The LSP crate (`codebook-lsp`) — `Backend`, `LanguageServer` impl, all commands
-- `Codebook::spell_check` signature (takes same args, returns same type)
-- `Codebook::get_suggestions` — unchanged
-- `WordLocation`, `TextRange` — unchanged
-
-## Implementation Order
-
-This can be done incrementally, keeping tests green at each step:
-
-### Step 1: Introduce data types and stage functions as wrappers
-
-Add `TextRegion`, `TextNode`, `WordCandidate` types. Write `extract_regions`, `extract_nodes`, `extract_words`, `check_words` as new functions that internally call the existing `find_locations` code. Write tests for each stage function independently. Don't delete anything yet.
-
-### Step 2: Rewire `Codebook::spell_check` to use the pipeline
-
-Replace the body of `spell_check` with the pipeline orchestration. It should call the stage functions instead of `find_locations` directly. All existing integration tests should still pass since the external behavior is the same.
-
-### Step 3: Inline and delete old code
-
-Now that nothing calls `find_locations` or `find_locations_code`, move their internal logic into the stage functions and delete the old functions. Remove `TextProcessor::process_words_with_check` (keep `extract_words` and `collect_split_words`). Remove `find_locations_with_dictionary_batch`.
-
-### Step 4: Implement markdown region extraction
-
-Replace the current markdown.scm query approach with proper region extraction:
-- Parse markdown with `tree_sitter_md`
-- Walk the AST to identify prose regions and fenced code blocks
-- Map info strings to `LanguageType` using `resolve_info_string`
-- Delete `markdown.scm`
-
-This makes markdown code blocks spell-checked with the correct language grammar and dictionaries.
-
-### Step 5: Update tests
-
-- Existing integration tests (test_markdown.rs, test_python.rs, etc.) should pass unchanged
-- Add new unit tests for each stage function
-- Add integration tests for markdown with code blocks in different languages
-- Add test for unknown info strings (should be skipped, not crash)
-
-## Future Work (Not Part of This Refactor)
-
-These features are enabled by the pipeline architecture but should be done in separate passes:
-
-### Control Comments
-
-Add a filtering step between stage 2 (node extraction) and stage 3 (word extraction). Scan for comments matching patterns like:
-- `// codebook:ignore-next-line` — add the next line's byte range to skip ranges
-- `// codebook:ignore-start` / `// codebook:ignore-end` — add enclosed range to skip ranges
-- `// codebook:words word1,word2` — add words to allowed list for this file
-- `<!-- codebook:ignore -->` — HTML/markdown variant
-
-This works naturally because nodes already carry byte offsets and tags. A comment node with tag "comment" containing "codebook:ignore-next-line" can compute the next line's byte range and add it to the skip ranges that stage 3 uses.
-
-For file-level directives (`codebook:ignore-file`), short-circuit before stage 1.
-
-### Custom User Dictionaries
-
-Changes needed in `codebook-config` and `dictionaries/`:
-
-1. Add `custom_dictionaries` field to `ConfigSettings`:
-   ```toml
-   # codebook.toml
-   [custom_dictionaries]
-   my_project = "path/to/project-words.txt"
-   medical = "path/to/medical-terms.dic"
-   ```
-
-2. `DictionaryManager::get_dictionary` should check for local file paths in addition to the `get_repo` lookup. If `id` maps to a path in config, load it as a `TextDictionary` (for `.txt`) or `HunspellDictionary` (for `.dic`/`.aff` pairs).
-
-3. Relative paths should resolve from the project config file's directory.
-
-No pipeline changes needed — custom dictionaries just appear in the dictionary list alongside built-in ones.
-
-### Astro/Vue/Svelte Support
-
-Same pattern as markdown region extraction:
-
-1. Add `tree-sitter-astro`, `tree-sitter-vue`, etc. as dependencies
-2. Add `LanguageType::Astro`, `LanguageType::Vue`, `LanguageType::Svelte`
-3. Write `extract_astro_regions`, `extract_vue_regions`, etc. in `regions.rs`
-4. These parse the file, identify `<script>`, `<template>`, `<style>` sections, and produce regions with appropriate language types
-
-The `.scm` query files for the embedded languages (TypeScript, HTML, CSS) already exist and work unchanged — they're used in stage 2 for each region.
-
-### HTML `<script>` and `<style>` Injection
-
-Same pattern. `extract_html_regions` would identify `<script>` and `<style>` tags and create JavaScript/CSS regions. The rest of the HTML becomes HTML regions checked with `html.scm`.
-
-### Korean/Asian Language Support
-
-Affects stage 3 (word extraction) only. The current `split_into_words` uses `unicode-segmentation`'s `split_word_bound_indices`, which works for space-separated languages. For Korean/CJK, options:
-
-1. **Syllable-level checking** — Unicode word boundaries do produce Hangul syllable blocks, so basic Korean may work with the existing splitter. Test this first.
-2. **Segmentation library** — if syllable-level is too granular, integrate a word segmentation library. Stage 3 would check the script of the text and choose the appropriate splitter.
-3. **Dictionary support** — need Hunspell dictionaries for these languages. The dictionary system already supports this (just add entries to `HUNSPELL_DICTIONARIES`).
-
-No pipeline architecture changes needed — just a different word splitting strategy in stage 3.
-
-### CLI for CI
-
-New crate: `codebook-cli`. Uses the `codebook` crate directly:
-
-```rust
-// codebook-cli/src/main.rs
-fn main() {
-    let args = parse_args();
-    let config = CodebookConfigFile::load(Some(&args.project_dir))?;
-    let codebook = Codebook::new(Arc::new(config))?;
-
-    let mut exit_code = 0;
-    for file in discover_files(&args) {
-        let results = codebook.spell_check_file(&file);
-        if !results.is_empty() {
-            exit_code = 1;
-            format_results(&file, &results, args.format);
-        }
-    }
-    std::process::exit(exit_code);
-}
-```
-
-Output formats: `text` (human readable), `json` (machine readable), `sarif` (GitHub Actions).
-
-File discovery: walk directory, respect `.gitignore` + config `ignore_paths`/`include_paths`.
-
-No pipeline changes needed — the CLI uses `Codebook::spell_check_file` which already returns `Vec<WordLocation>`.

From c0c767206a9e832f32cfb9d830409729539a99b7 Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 12:15:13 -0700
Subject: [PATCH 08/16] Format and spelling

---
 crates/codebook/src/checker.rs     |  6 +-
 crates/codebook/src/parser.rs      | 92 +++++++++++++++---------------
 crates/codebook/tests/test_tags.rs | 65 +++++++++++++++------
 3 files changed, 96 insertions(+), 67 deletions(-)

diff --git a/crates/codebook/src/checker.rs b/crates/codebook/src/checker.rs
index f79eb082..48790a8c 100644
--- a/crates/codebook/src/checker.rs
+++ b/crates/codebook/src/checker.rs
@@ -118,11 +118,7 @@ mod tests {
         let dict = Arc::new(TextDictionary::new("hello\n"));
         let config = Arc::new(codebook_config::CodebookConfigMemory::default());
         // Same word at the exact same position — should be deduplicated
-        let candidates = make_candidates(&[
-            ("wrld", 0, 4),
-            ("wrld", 0, 4),
-            ("wrld", 0, 4),
-        ]);
+        let candidates = make_candidates(&[("wrld", 0, 4), ("wrld", 0, 4), ("wrld", 0, 4)]);
         let results = check_words(&candidates, &[dict], config.as_ref());
         assert_eq!(results.len(), 1);
         assert_eq!(results[0].word, "wrld");
diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs
index da0465e4..6100ccc4 100644
--- a/crates/codebook/src/parser.rs
+++ b/crates/codebook/src/parser.rs
@@ -1,5 +1,5 @@
 use crate::checker::WordCandidate;
-use crate::queries::{LanguageType, LANGUAGE_SETTINGS, get_language_setting};
+use crate::queries::{LANGUAGE_SETTINGS, LanguageType, get_language_setting};
 use crate::splitter;
 use regex::Regex;
 use std::collections::{HashMap, HashSet};
@@ -34,22 +34,24 @@ static COMPILED_QUERIES: LazyLock<HashMap<LanguageType, CompiledQuery>> = LazyLo
         if setting.query.is_empty() {
             continue;
         }
-        let query = Query::new(&lang, setting.query).unwrap_or_else(|e| {
-            panic!(
-                "Failed to compile query for {:?}: {e}",
-                setting.type_
-            )
-        });
-        let capture_names = query.capture_names().iter().map(|s| s.to_string()).collect();
-        map.insert(setting.type_, CompiledQuery {
-            query,
-            capture_names,
-        });
+        let query = Query::new(&lang, setting.query)
+            .unwrap_or_else(|e| panic!("Failed to compile query for {:?}: {e}", setting.type_));
+        let capture_names = query
+            .capture_names()
+            .iter()
+            .map(|s| s.to_string())
+            .collect();
+        map.insert(
+            setting.type_,
+            CompiledQuery {
+                query,
+                capture_names,
+            },
+        );
     }
     map
 });
 
-
 #[derive(Debug, Clone, Copy, PartialEq, Ord, Eq, PartialOrd, Hash)]
 pub struct TextRange {
     /// Start position in utf-8 byte offset
@@ -200,9 +202,9 @@ fn extract_recursive(
     };
 
     let root_node = tree.root_node();
-    let compiled = COMPILED_QUERIES.get(&language).expect(
-        "Language has a LanguageSetting but no compiled query — this should not happen",
-    );
+    let compiled = COMPILED_QUERIES
+        .get(&language)
+        .expect("Language has a LanguageSetting but no compiled query — this should not happen");
     let mut cursor = QueryCursor::new();
     let provider = region_text.as_bytes();
     let mut matches_query = cursor.matches(&compiled.query, root_node, provider);
@@ -356,11 +358,11 @@ mod tests {
     fn test_extract_words_plain_text() {
         let text = "HelloWorld calc_wrld";
         let (words, langs) = extract_all_words(text, LanguageType::Text, &|_| true, &[]);
-        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
-        assert!(word_strs.contains(&"Hello"));
-        assert!(word_strs.contains(&"World"));
-        assert!(word_strs.contains(&"calc"));
-        assert!(word_strs.contains(&"wrld"));
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(word_strings.contains(&"Hello"));
+        assert!(word_strings.contains(&"World"));
+        assert!(word_strings.contains(&"calc"));
+        assert!(word_strings.contains(&"wrld"));
         assert_eq!(words.len(), 4);
         assert!(langs.contains(&LanguageType::Text));
     }
@@ -369,10 +371,10 @@ mod tests {
     fn test_extract_words_contraction() {
         let text = "I'm a contraction, wouldn't you agree'?";
         let (words, _) = extract_all_words(text, LanguageType::Text, &|_| true, &[]);
-        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
         let expected = ["I'm", "a", "contraction", "wouldn't", "you", "agree"];
         for e in &expected {
-            assert!(word_strs.contains(e), "Expected word '{e}' not found");
+            assert!(word_strings.contains(e), "Expected word '{e}' not found");
         }
     }
 
@@ -381,9 +383,9 @@ mod tests {
         let text = "// a comment\nfn main() {}";
         let (words, langs) = extract_all_words(text, LanguageType::Rust, &|_| true, &[]);
         assert!(!words.is_empty());
-        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
         assert!(
-            word_strs.contains(&"comment"),
+            word_strings.contains(&"comment"),
             "Should find 'comment' in Rust comment"
         );
         assert!(langs.contains(&LanguageType::Rust));
@@ -398,10 +400,10 @@ mod tests {
             &|tag| tag.starts_with("comment"),
             &[],
         );
-        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
-        assert!(word_strs.contains(&"comment"));
-        assert!(!word_strs.contains(&"string"));
-        assert!(!word_strs.contains(&"value"));
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(word_strings.contains(&"comment"));
+        assert!(!word_strings.contains(&"string"));
+        assert!(!word_strings.contains(&"value"));
     }
 
     #[test]
@@ -409,11 +411,11 @@ mod tests {
         let text = "check https://example.com this";
         let url_pattern = Regex::new(r"https?://[^\s]+").unwrap();
         let (words, _) = extract_all_words(text, LanguageType::Text, &|_| true, &[url_pattern]);
-        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
-        assert!(word_strs.contains(&"check"));
-        assert!(word_strs.contains(&"this"));
-        assert!(!word_strs.contains(&"https"));
-        assert!(!word_strs.contains(&"example"));
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(word_strings.contains(&"check"));
+        assert!(word_strings.contains(&"this"));
+        assert!(!word_strings.contains(&"https"));
+        assert!(!word_strings.contains(&"example"));
     }
 
     #[test]
@@ -438,27 +440,27 @@ mod tests {
     fn test_markdown_injection_extracts_code_words() {
         let text = "# Hello\n\n```python\ndef some_functin(): pass\n```\n";
         let (words, _) = extract_all_words(text, LanguageType::Markdown, &|_| true, &[]);
-        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
-        assert!(word_strs.contains(&"functin"));
-        assert!(word_strs.contains(&"Hello"));
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(word_strings.contains(&"functin"));
+        assert!(word_strings.contains(&"Hello"));
     }
 
     #[test]
     fn test_markdown_unknown_language_skipped() {
         let text = "# Hello\n\n```unknownlang\nbadwwword\n```\n";
         let (words, _) = extract_all_words(text, LanguageType::Markdown, &|_| true, &[]);
-        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
-        assert!(!word_strs.contains(&"badwwword"));
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        assert!(!word_strings.contains(&"badwwword"));
     }
 
     #[test]
     fn test_markdown_html_block_injection() {
         let text = "# Hello\n\n<div>\n  <p>A misspeled word</p>\n</div>\n\nMore text.\n";
         let (words, langs) = extract_all_words(text, LanguageType::Markdown, &|_| true, &[]);
-        let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
         assert!(langs.contains(&LanguageType::HTML));
-        assert!(word_strs.contains(&"misspeled"));
-        assert!(!word_strs.contains(&"div"));
+        assert!(word_strings.contains(&"misspeled"));
+        assert!(!word_strings.contains(&"div"));
     }
 
     #[test]
@@ -480,9 +482,9 @@ mod tests {
         crate::logging::init_test_logging();
         let text = "©<div>badword</div>";
         let (words, _) = extract_all_words(text, LanguageType::Text, &|_| true, &[]);
-        let badword = words.iter().find(|w| w.word == "badword");
-        assert!(badword.is_some(), "Expected 'badword' to be found");
-        let bw = badword.unwrap();
+        let bad_word = words.iter().find(|w| w.word == "badword");
+        assert!(bad_word.is_some(), "Expected 'badword' to be found");
+        let bw = bad_word.unwrap();
         assert_eq!(bw.start_byte, 7);
         assert_eq!(bw.end_byte, 14);
     }
diff --git a/crates/codebook/tests/test_tags.rs b/crates/codebook/tests/test_tags.rs
index c0108825..03840621 100644
--- a/crates/codebook/tests/test_tags.rs
+++ b/crates/codebook/tests/test_tags.rs
@@ -170,15 +170,19 @@ def calculat_age():
 
 #[test]
 fn test_injection_no_filters_returns_all() {
-    let words = check(
-        MARKDOWN_WITH_PYTHON,
-        LanguageType::Markdown,
-        vec![],
-        vec![],
+    let words = check(MARKDOWN_WITH_PYTHON, LanguageType::Markdown, vec![], vec![]);
+    assert!(
+        words.contains(&"commet".to_string()),
+        "missing comment typo from injected python"
+    );
+    assert!(
+        words.contains(&"calculat".to_string()),
+        "missing identifier typo from injected python"
+    );
+    assert!(
+        words.contains(&"strng".to_string()),
+        "missing string typo from injected python"
     );
-    assert!(words.contains(&"commet".to_string()), "missing comment typo from injected python");
-    assert!(words.contains(&"calculat".to_string()), "missing identifier typo from injected python");
-    assert!(words.contains(&"strng".to_string()), "missing string typo from injected python");
 }
 
 #[test]
@@ -191,9 +195,18 @@ fn test_injection_include_comments_only() {
         vec!["comment"],
         vec![],
     );
-    assert!(words.contains(&"commet".to_string()), "comment typo should be found");
-    assert!(!words.contains(&"calculat".to_string()), "identifier should be excluded in injected region");
-    assert!(!words.contains(&"strng".to_string()), "string should be excluded in injected region");
+    assert!(
+        words.contains(&"commet".to_string()),
+        "comment typo should be found"
+    );
+    assert!(
+        !words.contains(&"calculat".to_string()),
+        "identifier should be excluded in injected region"
+    );
+    assert!(
+        !words.contains(&"strng".to_string()),
+        "string should be excluded in injected region"
+    );
 }
 
 #[test]
@@ -206,9 +219,18 @@ fn test_injection_exclude_identifiers() {
         vec![],
         vec!["identifier"],
     );
-    assert!(words.contains(&"commet".to_string()), "comment should still be checked");
-    assert!(words.contains(&"strng".to_string()), "string should still be checked");
-    assert!(!words.contains(&"calculat".to_string()), "identifier should be excluded in injected region");
+    assert!(
+        words.contains(&"commet".to_string()),
+        "comment should still be checked"
+    );
+    assert!(
+        words.contains(&"strng".to_string()),
+        "string should still be checked"
+    );
+    assert!(
+        !words.contains(&"calculat".to_string()),
+        "identifier should be excluded in injected region"
+    );
 }
 
 #[test]
@@ -221,9 +243,18 @@ fn test_injection_include_strings_only() {
         vec!["string"],
         vec![],
     );
-    assert!(words.contains(&"strng".to_string()), "string typo in injected python should be found");
-    assert!(!words.contains(&"commet".to_string()), "comment should be excluded");
-    assert!(!words.contains(&"calculat".to_string()), "identifier should be excluded");
+    assert!(
+        words.contains(&"strng".to_string()),
+        "string typo in injected python should be found"
+    );
+    assert!(
+        !words.contains(&"commet".to_string()),
+        "comment should be excluded"
+    );
+    assert!(
+        !words.contains(&"calculat".to_string()),
+        "identifier should be excluded"
+    );
 }
 
 #[test]

From 4104246950f66244e81435a8334205f7572a26ae Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 12:28:41 -0700
Subject: [PATCH 09/16] Example

---
 examples/example.md | 176 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 166 insertions(+), 10 deletions(-)

diff --git a/examples/example.md b/examples/example.md
index c1ac0d0a..b7c7ac79 100644
--- a/examples/example.md
+++ b/examples/example.md
@@ -1,17 +1,173 @@
-I'm bvd at splellin Wolrd wolrd
-hello regulr
+# Markdown Injection Test File
 
-Some DNA:
-ATGCATC
+This file exercises multi-language injection and edge cases.
 
-bad DNA:
-ATGCATCssss
+## Basic prose with typos
 
+I'm bvd at splellin. Some regulr text with misspeled words.
+
+## Fenced code blocks with known languages
+
+Python function with a typo in the name:
+
+```python
+# A coment about the functin
+def calculat_user_age(bith_date: str) -> int:
+    """Calculaet the user's age from a date strng."""
+    user_age = get_current_date() - bith_date
+    return user_age
+```
+
+Bash commands (bash.scm doesn't capture command invocations):
+
+```bash
+# Instal dependencies for the proejct
+mkdir -p build/outpt
+cp src/*.rs build/outpt/
+echo "Doen compiling"
+```
+
+Rust with various identifier types:
+
+```rust
+// Calclate the user's scroe
+fn calculat_score(usr_input: &str) -> u32 {
+    let mut scroe = 0;
+    let mesage = "Helo Wolrd";
+    scroe
+}
+```
+
+JavaScript with strings and comments:
+
+```javascript
+// Procss the usr requet
+function handl_request(requst) {
+    const mesage = "Somthing went wrng";
+    return mesage;
+}
+```
+
+## Language aliases
+
+Common aliases should resolve correctly:
+
+```py
+def wrld_functin():
+    pass
+```
+
+```js
+function wrld_functin() {}
+```
+
+```rs
+fn wrld_functin() {}
+```
+
+```sh
+# A coment in shell
+echo "doen"
+```
+
+```ts
+function wrld_functin(): void {}
+```
+
+## Unknown and missing language tags
+
+Unknown language (should be completely skipped):
+
+```fortran
+      PROGRAM BADSPELIN
+      WRITE(*,*) 'This shuld not be chekced'
+      END
+```
+
+No language tag (should be completely skipped):
+
+```
+badwwword and uncheckedtypo should not appear
+```
+
+## Block-level HTML (injected as HTML)
+
+<div class="containr">
+  <p>This paragraf has a misspeled word inside HTML.</p>
+  <span>Another sentance with erors.</span>
+</div>
+
+## Inline HTML in prose (not separately injected)
+
+Some text with <strong>boldd</strong> and <em>italc</em> formatting.
+
+## Block quotes
+
+> A block quoet with misspeled words should be chekced normally.
+
+Nested block quote:
+
+> > A neested quoet inside another quoet.
+
+## Many code blocks (stress test for recursion)
+
+```python
+x = "frst"
+```
+
+```bash
+echo "secnd"
+```
 
 ```python
-import bad_spelin
-# Not spel good
-def im_guud():
-    bad_spelin.bone()
+y = "thrd"
+```
+
+```rust
+let z = "fouth";
+```
+
+```javascript
+let w = "fifh";
+```
+
+```python
+a = "sixh"
+```
+
+## Mixed content ordering
+
+Text before. Then code:
+
+```python
+def befre_and_aftr():
     pass
 ```
+
+Text between code blocks with a tyypo.
+
+```rust
+// Anothr comment
+fn middl_function() {}
+```
+
+More text aftr the code.
+
+## Edge cases
+
+Empty code block with language:
+
+```python
+```
+
+Single-line code block:
+
+```python
+x = "singel"
+```
+
+Code block at end of file with no trailing newline:
+
+```python
+y = "finl"
+```

From 2a0e57e763044a39a87c2309528184d1a8717be0 Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 12:38:09 -0700
Subject: [PATCH 10/16] Strengthen alias test and preserve debug_assert for
 duplicate spans

- Fix weak alias test: now asserts wrld has 2 locations (one per
  injected block) instead of just checking word presence.
- Restructure check_words to filter correct words before insertion,
  matching the old behavior where the debug_assert only fires on
  misspelled words with duplicate locations (actual query bugs).
  Correct words with overlapping captures (e.g. Erlang atoms that
  are both @string.special and @identifier.function) are filtered
  out before the assert.
- The pathological multi-language test cases in examples/example.md
  (aliases, unknown languages, HTML blocks, block quotes, many fenced
  blocks, and edge cases) were added in the previous commit
  (PATCH 09/16, "Example"); this commit does not modify that file.
---
 crates/codebook/src/checker.rs         | 85 +++++++++++++-------------
 crates/codebook/tests/test_markdown.rs | 13 ++--
 2 files changed, 51 insertions(+), 47 deletions(-)

diff --git a/crates/codebook/src/checker.rs b/crates/codebook/src/checker.rs
index 48790a8c..912e8585 100644
--- a/crates/codebook/src/checker.rs
+++ b/crates/codebook/src/checker.rs
@@ -13,46 +13,61 @@ pub struct WordCandidate {
     pub end_byte: usize,
 }
 
+/// Check if a word should be flagged based on config and dictionaries.
+/// Returns true if the word is correct (should NOT be flagged).
+fn is_word_correct(
+    word: &str,
+    dictionaries: &[std::sync::Arc<dyn Dictionary>],
+    config: &dyn CodebookConfig,
+) -> bool {
+    if config.should_flag_word(word) {
+        return false;
+    }
+    if word.len() < config.get_min_word_length() {
+        return true;
+    }
+    if config.is_allowed_word(word) {
+        return true;
+    }
+    dictionaries.iter().any(|dict| dict.check(word))
+}
+
 /// Check candidate words against dictionaries and config rules.
 /// Returns WordLocations for misspelled words, grouping all locations
-/// of the same word together. Duplicate spans are deduplicated.
+/// of the same word together.
 pub fn check_words(
     candidates: &[WordCandidate],
     dictionaries: &[std::sync::Arc<dyn Dictionary>],
     config: &dyn CodebookConfig,
 ) -> Vec<WordLocation> {
-    // Group candidates by word text, deduplicating identical spans
+    // Group misspelled candidates by word, deduplicating identical spans.
+    // Only misspelled words are inserted, matching the old behavior where
+    // the debug_assert caught query bugs producing duplicate misspelling locations.
     let mut word_positions: HashMap<&str, HashSet<TextRange>> = HashMap::new();
     for candidate in candidates {
-        word_positions
+        if is_word_correct(&candidate.word, dictionaries, config) {
+            continue;
+        }
+        let location = TextRange {
+            start_byte: candidate.start_byte,
+            end_byte: candidate.end_byte,
+        };
+        let added = word_positions
             .entry(&candidate.word)
             .or_default()
-            .insert(TextRange {
-                start_byte: candidate.start_byte,
-                end_byte: candidate.end_byte,
-            });
-    }
+            .insert(location);
 
-    // Check each unique word once
-    let mut results = Vec::new();
-    for (word, positions) in word_positions {
-        let positions: Vec<TextRange> = positions.into_iter().collect();
-        if config.should_flag_word(word) {
-            results.push(WordLocation::new(word.to_string(), positions));
-            continue;
-        }
-        if word.len() < config.get_min_word_length() {
-            continue;
-        }
-        if config.is_allowed_word(word) {
-            continue;
-        }
-        let is_correct = dictionaries.iter().any(|dict| dict.check(word));
-        if !is_correct {
-            results.push(WordLocation::new(word.to_string(), positions));
-        }
+        debug_assert!(
+            added,
+            "Two of the same locations found. Make a better query. Word: {}, Location: {:?}",
+            candidate.word, location
+        );
     }
-    results
+
+    word_positions
+        .into_iter()
+        .map(|(word, positions)| WordLocation::new(word.to_string(), positions.into_iter().collect()))
+        .collect()
 }
 
 #[cfg(test)]
@@ -112,20 +127,4 @@ mod tests {
         let results = check_words(&candidates, &[dict], config.as_ref());
         assert!(results.is_empty(), "Allowed words should not be flagged");
     }
-
-    #[test]
-    fn test_check_words_deduplicates_identical_spans() {
-        let dict = Arc::new(TextDictionary::new("hello\n"));
-        let config = Arc::new(codebook_config::CodebookConfigMemory::default());
-        // Same word at the exact same position — should be deduplicated
-        let candidates = make_candidates(&[("wrld", 0, 4), ("wrld", 0, 4), ("wrld", 0, 4)]);
-        let results = check_words(&candidates, &[dict], config.as_ref());
-        assert_eq!(results.len(), 1);
-        assert_eq!(results[0].word, "wrld");
-        assert_eq!(
-            results[0].locations.len(),
-            1,
-            "Identical spans should be deduplicated to one location"
-        );
-    }
 }
diff --git a/crates/codebook/tests/test_markdown.rs b/crates/codebook/tests/test_markdown.rs
index 703fc424..fe63aa53 100644
--- a/crates/codebook/tests/test_markdown.rs
+++ b/crates/codebook/tests/test_markdown.rs
@@ -202,10 +202,15 @@ More text.
     let misspelled = processor
         .spell_check(sample_text, Some(LanguageType::Markdown), None)
         .to_vec();
-    let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
-    println!("Misspelled words: {words:?}");
-    // wrld should be flagged as a function name typo in both languages
-    assert!(words.contains(&"wrld"));
+    println!("Misspelled words: {misspelled:?}");
+    // wrld should be flagged from both code blocks — verify two locations
+    let wrld = misspelled.iter().find(|w| w.word == "wrld");
+    assert!(wrld.is_some(), "wrld should be flagged");
+    assert_eq!(
+        wrld.unwrap().locations.len(),
+        2,
+        "wrld should have 2 locations (one from py block, one from js block)"
+    );
 }
 
 #[test]

From 666c0731e6e538a173a9b6af70b93392860dacac Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 12:49:26 -0700
Subject: [PATCH 11/16] Fix Erlang query overlap and restore debug_assert on
 all candidates

The blanket (atom) @string.special pattern overlapped with
(function_clause name: (atom) @identifier.function), producing
duplicate captures at the same byte range. Replace with specific
parent-context patterns (module_attribute, tuple, map_field, call)
that don't overlap with function names.

The debug_assert in checker.rs now fires on all candidates (not just
misspelled words), matching the original intent of catching inefficient
queries during development.
---
 crates/codebook/src/checker.rs         | 50 +++++++++++---------------
 crates/codebook/src/parser.rs          |  1 +
 crates/codebook/src/queries/erlang.scm |  7 +++-
 3 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/crates/codebook/src/checker.rs b/crates/codebook/src/checker.rs
index 912e8585..39fb5f7e 100644
--- a/crates/codebook/src/checker.rs
+++ b/crates/codebook/src/checker.rs
@@ -13,25 +13,6 @@ pub struct WordCandidate {
     pub end_byte: usize,
 }
 
-/// Check if a word should be flagged based on config and dictionaries.
-/// Returns true if the word is correct (should NOT be flagged).
-fn is_word_correct(
-    word: &str,
-    dictionaries: &[std::sync::Arc<dyn Dictionary>],
-    config: &dyn CodebookConfig,
-) -> bool {
-    if config.should_flag_word(word) {
-        return false;
-    }
-    if word.len() < config.get_min_word_length() {
-        return true;
-    }
-    if config.is_allowed_word(word) {
-        return true;
-    }
-    dictionaries.iter().any(|dict| dict.check(word))
-}
-
 /// Check candidate words against dictionaries and config rules.
 /// Returns WordLocations for misspelled words, grouping all locations
 /// of the same word together.
@@ -40,14 +21,9 @@ pub fn check_words(
     dictionaries: &[std::sync::Arc<dyn Dictionary>],
     config: &dyn CodebookConfig,
 ) -> Vec<WordLocation> {
-    // Group misspelled candidates by word, deduplicating identical spans.
-    // Only misspelled words are inserted, matching the old behavior where
-    // the debug_assert caught query bugs producing duplicate misspelling locations.
+    // Group candidates by word text, deduplicating identical spans.
     let mut word_positions: HashMap<&str, HashSet<TextRange>> = HashMap::new();
     for candidate in candidates {
-        if is_word_correct(&candidate.word, dictionaries, config) {
-            continue;
-        }
         let location = TextRange {
             start_byte: candidate.start_byte,
             end_byte: candidate.end_byte,
@@ -64,10 +40,26 @@ pub fn check_words(
         );
     }
 
-    word_positions
-        .into_iter()
-        .map(|(word, positions)| WordLocation::new(word.to_string(), positions.into_iter().collect()))
-        .collect()
+    // Check each unique word once
+    let mut results = Vec::new();
+    for (word, positions) in word_positions {
+        let positions: Vec<TextRange> = positions.into_iter().collect();
+        if config.should_flag_word(word) {
+            results.push(WordLocation::new(word.to_string(), positions));
+            continue;
+        }
+        if word.len() < config.get_min_word_length() {
+            continue;
+        }
+        if config.is_allowed_word(word) {
+            continue;
+        }
+        let is_correct = dictionaries.iter().any(|dict| dict.check(word));
+        if !is_correct {
+            results.push(WordLocation::new(word.to_string(), positions));
+        }
+    }
+    results
 }
 
 #[cfg(test)]
diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs
index 6100ccc4..30daae44 100644
--- a/crates/codebook/src/parser.rs
+++ b/crates/codebook/src/parser.rs
@@ -488,4 +488,5 @@ mod tests {
         assert_eq!(bw.start_byte, 7);
         assert_eq!(bw.end_byte, 14);
     }
+
 }
diff --git a/crates/codebook/src/queries/erlang.scm b/crates/codebook/src/queries/erlang.scm
index 93fd05fd..a4435107 100644
--- a/crates/codebook/src/queries/erlang.scm
+++ b/crates/codebook/src/queries/erlang.scm
@@ -1,8 +1,13 @@
 (comment) @comment
 (string) @string
-(atom) @string.special
 
 (var) @identifier.variable
 
 (function_clause
   name: (atom) @identifier.function)
+
+; Atoms in specific contexts (avoids overlap with function names above)
+(module_attribute (atom) @string.special)
+(tuple (atom) @string.special)
+(map_field (atom) @string.special)
+(call (atom) @string.special)

From 5ea74f2a48c40830cfb31cc247ddf4b105cd649c Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 13:09:10 -0700
Subject: [PATCH 12/16] Eliminate per-word String allocations in WordCandidate

Change WordCandidate from owning a String to borrowing &str from the
source document text. This removes one heap allocation per extracted
word. The lifetime chain works because splitter::split returns
SplitRef values that borrow from its input, and that input in turn
borrows from document_text, either through the tree-sitter node text
or directly.

Also removes an unnecessary String allocation for injection language
text by borrowing from the tree-sitter provider bytes directly.
---
 crates/codebook/src/checker.rs | 15 +++++-----
 crates/codebook/src/parser.rs  | 50 +++++++++++++++++-----------------
 2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/crates/codebook/src/checker.rs b/crates/codebook/src/checker.rs
index 39fb5f7e..95a2dffb 100644
--- a/crates/codebook/src/checker.rs
+++ b/crates/codebook/src/checker.rs
@@ -5,10 +5,11 @@ use crate::parser::{TextRange, WordLocation};
 use codebook_config::CodebookConfig;
 
 /// A candidate word extracted from a text node, with its position
-/// in original-document byte offsets.
+/// in original-document byte offsets. Borrows the word text from the
+/// source document to avoid per-word String allocations.
 #[derive(Debug, Clone, PartialEq)]
-pub struct WordCandidate {
-    pub word: String,
+pub struct WordCandidate<'a> {
+    pub word: &'a str,
     pub start_byte: usize,
     pub end_byte: usize,
 }
@@ -17,7 +18,7 @@ pub struct WordCandidate {
 /// Returns WordLocations for misspelled words, grouping all locations
 /// of the same word together.
 pub fn check_words(
-    candidates: &[WordCandidate],
+    candidates: &[WordCandidate<'_>],
     dictionaries: &[std::sync::Arc<dyn Dictionary>],
     config: &dyn CodebookConfig,
 ) -> Vec<WordLocation> {
@@ -29,7 +30,7 @@ pub fn check_words(
             end_byte: candidate.end_byte,
         };
         let added = word_positions
-            .entry(&candidate.word)
+            .entry(candidate.word)
             .or_default()
             .insert(location);
 
@@ -68,11 +69,11 @@ mod tests {
     use crate::dictionaries::dictionary::TextDictionary;
     use std::sync::Arc;
 
-    fn make_candidates(words: &[(&str, usize, usize)]) -> Vec<WordCandidate> {
+    fn make_candidates<'a>(words: &[(&'a str, usize, usize)]) -> Vec<WordCandidate<'a>> {
         words
             .iter()
             .map(|(word, start, end)| WordCandidate {
-                word: word.to_string(),
+                word,
                 start_byte: *start,
                 end_byte: *end,
             })
diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs
index 30daae44..2a44f92b 100644
--- a/crates/codebook/src/parser.rs
+++ b/crates/codebook/src/parser.rs
@@ -128,12 +128,12 @@ impl WordLocation {
 ///
 /// Returns the candidates and the set of all languages encountered (for
 /// dictionary loading).
-pub fn extract_all_words(
-    document_text: &str,
+pub fn extract_all_words<'a>(
+    document_text: &'a str,
     language: LanguageType,
     tag_filter: &dyn Fn(&str) -> bool,
     skip_patterns: &[Regex],
-) -> (Vec<WordCandidate>, HashSet<LanguageType>) {
+) -> (Vec<WordCandidate<'a>>, HashSet<LanguageType>) {
     let skip_ranges = find_skip_ranges(document_text, skip_patterns);
     let mut result = ExtractionResult {
         candidates: Vec::new(),
@@ -154,8 +154,8 @@ pub fn extract_all_words(
 }
 
 /// Accumulated output from recursive word extraction.
-struct ExtractionResult {
-    candidates: Vec<WordCandidate>,
+struct ExtractionResult<'a> {
+    candidates: Vec<WordCandidate<'a>>,
     languages: HashSet<LanguageType>,
 }
 
@@ -168,14 +168,14 @@ struct ExtractionResult {
 ///     the language name from the sibling capture, then recurse
 ///
 /// For LanguageType::Text (no grammar): word-split the entire range.
-fn extract_recursive(
-    document_text: &str,
+fn extract_recursive<'a>(
+    document_text: &'a str,
     start_byte: usize,
     end_byte: usize,
     language: LanguageType,
     tag_filter: &dyn Fn(&str) -> bool,
     skip_ranges: &[SkipRange],
-    result: &mut ExtractionResult,
+    result: &mut ExtractionResult<'a>,
 ) {
     let language_setting = match get_language_setting(language) {
         Some(s) => s,
@@ -212,22 +212,22 @@ fn extract_recursive(
     while let Some(match_) = matches_query.next() {
         // First pass: look for dynamic injection pairs in this match
         let mut injection_content: Option<tree_sitter::Node> = None;
-        let mut injection_language_text: Option<String> = None;
+        let mut injection_language_text: Option<&str> = None;
 
         for capture in match_.captures {
             let tag = &compiled.capture_names[capture.index as usize];
             if tag == "injection.content" {
                 injection_content = Some(capture.node);
             } else if tag == "injection.language" {
-                injection_language_text =
-                    Some(capture.node.utf8_text(provider).unwrap_or("").to_string());
+                injection_language_text = Some(capture.node.utf8_text(provider).unwrap_or(""));
             }
         }
 
         // Handle dynamic injection pair
         if let Some(content_node) = injection_content {
-            if let Some(lang_text) = &injection_language_text {
-                let child_lang = LanguageType::from_str(&lang_text.trim().to_lowercase());
+            if let Some(lang_text) = injection_language_text {
+                let lowered = lang_text.trim().to_lowercase();
+                let child_lang = LanguageType::from_str(&lowered);
                 if let Ok(child_lang) = child_lang
                     && child_lang != LanguageType::Text
                 {
@@ -299,11 +299,11 @@ fn extract_recursive(
 // Word extraction from plain text
 // =============================================================================
 
-fn extract_words_from_text(
-    text: &str,
+fn extract_words_from_text<'a>(
+    text: &'a str,
     base_offset: usize,
     skip_ranges: &[SkipRange],
-    candidates: &mut Vec<WordCandidate>,
+    candidates: &mut Vec<WordCandidate<'a>>,
 ) {
     for (offset, word) in text.split_word_bound_indices() {
         if !is_alphabetic(word) {
@@ -324,7 +324,7 @@ fn extract_words_from_text(
                 continue;
             }
             candidates.push(WordCandidate {
-                word: split_word.word.to_string(),
+                word: split_word.word,
                 start_byte: word_start,
                 end_byte: word_end,
             });
@@ -358,7 +358,7 @@ mod tests {
     fn test_extract_words_plain_text() {
         let text = "HelloWorld calc_wrld";
         let (words, langs) = extract_all_words(text, LanguageType::Text, &|_| true, &[]);
-        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word).collect();
         assert!(word_strings.contains(&"Hello"));
         assert!(word_strings.contains(&"World"));
         assert!(word_strings.contains(&"calc"));
@@ -371,7 +371,7 @@ mod tests {
     fn test_extract_words_contraction() {
         let text = "I'm a contraction, wouldn't you agree'?";
         let (words, _) = extract_all_words(text, LanguageType::Text, &|_| true, &[]);
-        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word).collect();
         let expected = ["I'm", "a", "contraction", "wouldn't", "you", "agree"];
         for e in &expected {
             assert!(word_strings.contains(e), "Expected word '{e}' not found");
@@ -383,7 +383,7 @@ mod tests {
         let text = "// a comment\nfn main() {}";
         let (words, langs) = extract_all_words(text, LanguageType::Rust, &|_| true, &[]);
         assert!(!words.is_empty());
-        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word).collect();
         assert!(
             word_strings.contains(&"comment"),
             "Should find 'comment' in Rust comment"
@@ -400,7 +400,7 @@ mod tests {
             &|tag| tag.starts_with("comment"),
             &[],
         );
-        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word).collect();
         assert!(word_strings.contains(&"comment"));
         assert!(!word_strings.contains(&"string"));
         assert!(!word_strings.contains(&"value"));
@@ -411,7 +411,7 @@ mod tests {
         let text = "check https://example.com this";
         let url_pattern = Regex::new(r"https?://[^\s]+").unwrap();
         let (words, _) = extract_all_words(text, LanguageType::Text, &|_| true, &[url_pattern]);
-        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word).collect();
         assert!(word_strings.contains(&"check"));
         assert!(word_strings.contains(&"this"));
         assert!(!word_strings.contains(&"https"));
@@ -440,7 +440,7 @@ mod tests {
     fn test_markdown_injection_extracts_code_words() {
         let text = "# Hello\n\n```python\ndef some_functin(): pass\n```\n";
         let (words, _) = extract_all_words(text, LanguageType::Markdown, &|_| true, &[]);
-        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word).collect();
         assert!(word_strings.contains(&"functin"));
         assert!(word_strings.contains(&"Hello"));
     }
@@ -449,7 +449,7 @@ mod tests {
     fn test_markdown_unknown_language_skipped() {
         let text = "# Hello\n\n```unknownlang\nbadwwword\n```\n";
         let (words, _) = extract_all_words(text, LanguageType::Markdown, &|_| true, &[]);
-        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word).collect();
         assert!(!word_strings.contains(&"badwwword"));
     }
 
@@ -457,7 +457,7 @@ mod tests {
     fn test_markdown_html_block_injection() {
         let text = "# Hello\n\n<div>\n  <p>A misspeled word</p>\n</div>\n\nMore text.\n";
         let (words, langs) = extract_all_words(text, LanguageType::Markdown, &|_| true, &[]);
-        let word_strings: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect();
+        let word_strings: Vec<&str> = words.iter().map(|w| w.word).collect();
         assert!(langs.contains(&LanguageType::HTML));
         assert!(word_strings.contains(&"misspeled"));
         assert!(!word_strings.contains(&"div"));

From 76380a5e9d19492f4767d96f95cae9ed51cf4d80 Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 13:38:20 -0700
Subject: [PATCH 13/16] Reuse splitter Vec across word boundary iterations

Add split_into() that clears and refills a caller-provided Vec, avoiding a
fresh Vec allocation per word boundary segment in extract_words_from_text.
The Vec is allocated once per text node and reused across all words.
---
 crates/codebook/src/parser.rs   |  5 +++--
 crates/codebook/src/splitter.rs | 15 ++++++++++-----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs
index 2a44f92b..08a7d881 100644
--- a/crates/codebook/src/parser.rs
+++ b/crates/codebook/src/parser.rs
@@ -305,6 +305,7 @@ fn extract_words_from_text<'a>(
     skip_ranges: &[SkipRange],
     candidates: &mut Vec<WordCandidate<'a>>,
 ) {
+    let mut split_buf = Vec::new();
     for (offset, word) in text.split_word_bound_indices() {
         if !is_alphabetic(word) {
             continue;
@@ -313,8 +314,8 @@ fn extract_words_from_text<'a>(
         if is_within_skip_range(global_offset, global_offset + word.len(), skip_ranges) {
             continue;
         }
-        let split = splitter::split(word);
-        for split_word in split {
+        splitter::split_into(word, &mut split_buf);
+        for split_word in &split_buf {
             if is_numeric(split_word.word) {
                 continue;
             }
diff --git a/crates/codebook/src/splitter.rs b/crates/codebook/src/splitter.rs
index 6400e069..4aca089c 100644
--- a/crates/codebook/src/splitter.rs
+++ b/crates/codebook/src/splitter.rs
@@ -14,12 +14,19 @@ pub struct SplitRef<'a> {
     pub start_byte: usize,
 }
 
+#[cfg(test)]
 pub fn split(s: &str) -> Vec<SplitRef<'_>> {
+    let mut result = Vec::new();
+    split_into(s, &mut result);
+    result
+}
+
+/// Split a word into sub-words, replacing the contents of `result` so its buffer can be reused.
+pub fn split_into<'a>(s: &'a str, result: &mut Vec<SplitRef<'a>>) {
+    result.clear();
     if s.is_empty() {
-        return Vec::new();
+        return;
     }
-
-    let mut result = Vec::new();
     let mut word_start_byte = 0;
     let mut prev_char_type = None;
 
@@ -93,8 +100,6 @@ pub fn split(s: &str) -> Vec<SplitRef<'_>> {
             });
         }
     }
-
-    result
 }
 
 #[cfg(test)]

From d2bb135b65cf45aa6398d5e33a09bc2b6b16ad11 Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 15:11:07 -0700
Subject: [PATCH 14/16] Update changelog with unreleased changes

---
 CHANGELOG.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 29f6c3c1..220c2609 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+[Unreleased]
+
+- Add multi-language support for Markdown files: fenced code blocks are now spell-checked using the appropriate language grammar (Python, Rust, Bash, etc.)
+- Add language injection system via `@injection.*` capture tags in `.scm` query files — adding multi-language support to a new file type requires only a `.scm` change, no Rust code
+- Add HTML block injection in Markdown — block-level HTML is spell-checked using the HTML grammar
+- Add language alias resolution for Markdown code blocks (e.g., `py`, `js`, `sh`, `rs`, `yml`, `c++`)
+- Pre-compile all tree-sitter queries at startup for faster spell-checking and earlier error detection
+- Reduce per-word memory allocations in the spell-check pipeline
+- Fix Erlang query producing duplicate captures for function name atoms
+- Refactor: split spell-checking into separate extraction (`parser.rs`) and checking (`checker.rs`) modules
+
 [0.3.35]
 
 - Add tag-based filtering (`include_tags`/`exclude_tags`) to control which parts of code are spell-checked (comments, strings, identifiers, etc.)

From 3c59c97b3f395e66908db1953fb8d55bcf35f6be Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 15:16:50 -0700
Subject: [PATCH 15/16] Update queries README with injection docs, remove em
 dashes

---
 CHANGELOG.md                           |  4 ++--
 crates/codebook/src/parser.rs          |  6 +++---
 crates/codebook/src/queries/README.md  | 29 ++++++++++++++++++++++++++
 crates/codebook/tests/test_markdown.rs |  6 +++---
 4 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 220c2609..c75260c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,8 @@
 [Unreleased]
 
 - Add multi-language support for Markdown files: fenced code blocks are now spell-checked using the appropriate language grammar (Python, Rust, Bash, etc.)
-- Add language injection system via `@injection.*` capture tags in `.scm` query files — adding multi-language support to a new file type requires only a `.scm` change, no Rust code
-- Add HTML block injection in Markdown — block-level HTML is spell-checked using the HTML grammar
+- Add language injection system via `@injection.*` capture tags in `.scm` query files. Adding multi-language support to a new file type requires only a `.scm` change, no Rust code
+- Add HTML block injection in Markdown. Block-level HTML is spell-checked using the HTML grammar
 - Add language alias resolution for Markdown code blocks (e.g., `py`, `js`, `sh`, `rs`, `yml`, `c++`)
 - Pre-compile all tree-sitter queries at startup for faster spell-checking and earlier error detection
 - Reduce per-word memory allocations in the spell-check pipeline
diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs
index 08a7d881..015e5204 100644
--- a/crates/codebook/src/parser.rs
+++ b/crates/codebook/src/parser.rs
@@ -180,7 +180,7 @@ fn extract_recursive<'a>(
     let language_setting = match get_language_setting(language) {
         Some(s) => s,
         None => {
-            // No grammar (e.g. Text) — word-split the whole range
+            // No grammar (e.g. Text): word-split the whole range
             let text = &document_text[start_byte..end_byte];
             extract_words_from_text(text, start_byte, skip_ranges, &mut result.candidates);
             return;
@@ -204,7 +204,7 @@ fn extract_recursive<'a>(
     let root_node = tree.root_node();
     let compiled = COMPILED_QUERIES
         .get(&language)
-        .expect("Language has a LanguageSetting but no compiled query — this should not happen");
+        .expect("Language has a LanguageSetting but no compiled query; this should not happen");
     let mut cursor = QueryCursor::new();
     let provider = region_text.as_bytes();
     let mut matches_query = cursor.matches(&compiled.query, root_node, provider);
@@ -284,7 +284,7 @@ fn extract_recursive<'a>(
                 continue;
             }
 
-            // Normal text capture — extract words if tag passes filter
+            // Normal text capture: extract words if tag passes filter
             if !tag_filter(tag) {
                 continue;
             }
diff --git a/crates/codebook/src/queries/README.md b/crates/codebook/src/queries/README.md
index dcebba7e..9b413a4e 100644
--- a/crates/codebook/src/queries/README.md
+++ b/crates/codebook/src/queries/README.md
@@ -27,6 +27,34 @@ Every capture name is a **tag** that categorizes the matched text. Tags use a do
 
 Not every language needs every tag. HTML, for example, only uses `@comment` and `@string`. You can get a feel for which tags are available for a specific language by looking at the `scm` file for that language in this directory.
 
+### Injection Tags (Multi-Language Support)
+
+Injection tags tell codebook to re-parse a region of the file using a different language's grammar. This is how multi-language files, such as Markdown with fenced code blocks or HTML with `<script>` tags, are handled.
+
+| Capture name | When to use |
+| --- | --- |
+| `@injection.content` | The text to re-parse (paired with `@injection.language` in the same query match) |
+| `@injection.language` | A node whose text names the target language (paired with `@injection.content`) |
+| `@injection.{lang}` | Static injection: always re-parse the captured node as `{lang}` (e.g., `@injection.html`) |
+
+Example from `markdown.scm`:
+
+```scheme
+; Prose content: spell-checked as text
+(paragraph (inline) @string)
+(atx_heading (inline) @string)
+
+; HTML blocks: re-parsed with the HTML grammar
+(html_block) @injection.html
+
+; Fenced code blocks: language read from the info string
+(fenced_code_block
+  (info_string (language) @injection.language)
+  (code_fence_content) @injection.content)
+```
+
+The `@injection.language` value is resolved against the `ids` and `extensions` fields in `LANGUAGE_SETTINGS`. Common aliases like `py`, `js`, `sh`, `rs`, etc. work automatically.
+
 ## Adding a New Language
 
 ### 1. Create the Query File
@@ -83,5 +111,6 @@ Additional language tests go in `crates/codebook/tests/`. Example files with at
 - Only capture nodes that contain user-defined text (not keywords)
 - Always use namespaced capture names (`@identifier.function`, not `@func_declaration`)
 - Use the most specific tag that fits (e.g., `@identifier.type` over `@identifier`)
+- **Avoid overlapping captures**: don't let two patterns capture the same node at the same byte range. For example, a blanket `(atom) @string.special` would overlap with `(function_clause name: (atom) @identifier.function)`. Instead, capture atoms only in specific parent contexts like `(tuple (atom) @string.special)`. A `debug_assert` will catch overlaps during testing.
 - Start simple and add complexity as needed
 - Look at existing query files for patterns
diff --git a/crates/codebook/tests/test_markdown.rs b/crates/codebook/tests/test_markdown.rs
index fe63aa53..93ba8362 100644
--- a/crates/codebook/tests/test_markdown.rs
+++ b/crates/codebook/tests/test_markdown.rs
@@ -45,7 +45,7 @@ fn test_markdown_fenced_code_block_known_lang() {
     utils::init_logging();
     let processor = utils::get_processor();
     // Note: bash.scm only captures comments, strings, function names,
-    // heredocs, and variable names — NOT command invocations.
+    // heredocs, and variable names, NOT command invocations.
     // So mkdir/some_dir are not checked because bash.scm doesn't capture them,
     // not because they're in a bash dictionary.
     let sample_text = r#"# Hello World
@@ -203,7 +203,7 @@ More text.
         .spell_check(sample_text, Some(LanguageType::Markdown), None)
         .to_vec();
     println!("Misspelled words: {misspelled:?}");
-    // wrld should be flagged from both code blocks — verify two locations
+    // wrld should be flagged from both code blocks; verify two locations
     let wrld = misspelled.iter().find(|w| w.word == "wrld");
     assert!(wrld.is_some(), "wrld should be flagged");
     assert_eq!(
@@ -245,7 +245,7 @@ fn test_markdown_injected_region_byte_offsets() {
 fn test_markdown_no_duplicate_spans() {
     utils::init_logging();
     let processor = utils::get_processor();
-    // Block quotes contain paragraphs — make sure the inline content
+    // Block quotes contain paragraphs. Make sure the inline content
     // isn't captured twice (once for the paragraph, once for the block quote)
     let sample_text = "> A tyypo in a block quoet.\n";
     let misspelled = processor

From 5a936db45ebf224b18e6640e966710e10610066a Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Fri, 20 Mar 2026 15:17:16 -0700
Subject: [PATCH 16/16] Format

---
 crates/codebook/src/parser.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs
index 015e5204..0ba3759b 100644
--- a/crates/codebook/src/parser.rs
+++ b/crates/codebook/src/parser.rs
@@ -489,5 +489,4 @@ mod tests {
         assert_eq!(bw.start_byte, 7);
         assert_eq!(bw.end_byte, 14);
     }
-
 }