From 0decb10f2a7b0051398eabb439d78e7b8d0a1e93 Mon Sep 17 00:00:00 2001 From: Bo Lopker Date: Fri, 20 Mar 2026 09:07:00 -0700 Subject: [PATCH 01/16] Attempt 1 --- crates/codebook/src/checker.rs | 120 ++++ .../codebook/src/dictionaries/dictionary.rs | 15 - crates/codebook/src/lib.rs | 97 ++- crates/codebook/src/parser.rs | 522 ++++++++-------- crates/codebook/src/queries.rs | 12 +- crates/codebook/src/queries/markdown.scm | 2 - crates/codebook/src/regions.rs | 301 +++++++++ crates/codebook/tests/test_markdown.rs | 92 ++- crates/codebook/tests/utils/mod.rs | 6 +- examples/example.md | 9 + refactor.md | 574 ++++++++++++++++++ 11 files changed, 1389 insertions(+), 361 deletions(-) create mode 100644 crates/codebook/src/checker.rs delete mode 100644 crates/codebook/src/queries/markdown.scm create mode 100644 crates/codebook/src/regions.rs create mode 100644 refactor.md diff --git a/crates/codebook/src/checker.rs b/crates/codebook/src/checker.rs new file mode 100644 index 00000000..e87d9069 --- /dev/null +++ b/crates/codebook/src/checker.rs @@ -0,0 +1,120 @@ +use std::collections::HashMap; + +use crate::dictionaries::dictionary::Dictionary; +use crate::parser::{TextRange, WordLocation}; +use codebook_config::CodebookConfig; + +/// A candidate word extracted from a text node, with its position +/// in original-document byte offsets. +#[derive(Debug, Clone, PartialEq)] +pub struct WordCandidate { + pub word: String, + pub start_byte: usize, + pub end_byte: usize, +} + +/// Check candidate words against dictionaries and config rules. +/// Returns WordLocations for misspelled words, grouping all locations +/// of the same word together. 
+pub fn check_words( + candidates: &[WordCandidate], + dictionaries: &[std::sync::Arc], + config: &dyn CodebookConfig, +) -> Vec { + // Deduplicate: group candidates by word text + let mut word_positions: HashMap<&str, Vec> = HashMap::new(); + for candidate in candidates { + word_positions + .entry(&candidate.word) + .or_default() + .push(TextRange { + start_byte: candidate.start_byte, + end_byte: candidate.end_byte, + }); + } + + // Check each unique word once + let mut results = Vec::new(); + for (word, positions) in word_positions { + if config.should_flag_word(word) { + results.push(WordLocation::new(word.to_string(), positions)); + continue; + } + if word.len() < config.get_min_word_length() { + continue; + } + if config.is_allowed_word(word) { + continue; + } + let is_correct = dictionaries.iter().any(|dict| dict.check(word)); + if !is_correct { + results.push(WordLocation::new(word.to_string(), positions)); + } + } + results +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dictionaries::dictionary::TextDictionary; + use std::sync::Arc; + + fn make_candidates(words: &[(&str, usize, usize)]) -> Vec { + words + .iter() + .map(|(word, start, end)| WordCandidate { + word: word.to_string(), + start_byte: *start, + end_byte: *end, + }) + .collect() + } + + #[test] + fn test_check_words_flags_unknown() { + let dict = Arc::new(TextDictionary::new("hello\nworld\n")); + let config = Arc::new(codebook_config::CodebookConfigMemory::default()); + let candidates = make_candidates(&[ + ("hello", 0, 5), + ("wrld", 6, 10), + ]); + let results = check_words(&candidates, &[dict], config.as_ref()); + assert_eq!(results.len(), 1); + assert_eq!(results[0].word, "wrld"); + } + + #[test] + fn test_check_words_groups_locations() { + let dict = Arc::new(TextDictionary::new("hello\n")); + let config = Arc::new(codebook_config::CodebookConfigMemory::default()); + let candidates = make_candidates(&[ + ("wrld", 0, 4), + ("wrld", 10, 14), + ]); + let results = 
check_words(&candidates, &[dict], config.as_ref()); + assert_eq!(results.len(), 1); + assert_eq!(results[0].word, "wrld"); + assert_eq!(results[0].locations.len(), 2); + } + + #[test] + fn test_check_words_respects_min_length() { + let dict = Arc::new(TextDictionary::new("")); + let config = Arc::new(codebook_config::CodebookConfigMemory::default()); + // Default min word length is 3 + let candidates = make_candidates(&[("ab", 0, 2)]); + let results = check_words(&candidates, &[dict], config.as_ref()); + assert!(results.is_empty(), "Short words should be skipped"); + } + + #[test] + fn test_check_words_respects_allowed_words() { + let dict = Arc::new(TextDictionary::new("")); + let config = Arc::new(codebook_config::CodebookConfigMemory::default()); + config.add_word("codebook").unwrap(); + let candidates = make_candidates(&[("codebook", 0, 8)]); + let results = check_words(&candidates, &[dict], config.as_ref()); + assert!(results.is_empty(), "Allowed words should not be flagged"); + } +} diff --git a/crates/codebook/src/dictionaries/dictionary.rs b/crates/codebook/src/dictionaries/dictionary.rs index 82e75d76..3ba0d800 100644 --- a/crates/codebook/src/dictionaries/dictionary.rs +++ b/crates/codebook/src/dictionaries/dictionary.rs @@ -7,10 +7,6 @@ use std::{ sync::{Arc, RwLock}, }; -use crate::parser::{WordLocation, find_locations}; -use crate::queries::LanguageType; -use regex::Regex; - pub trait Dictionary: Send + Sync { fn check(&self, word: &str) -> bool; fn suggest(&self, word: &str) -> Vec; @@ -170,17 +166,6 @@ impl TextDictionary { } } -/// Integration helper to use any Dictionary trait with optimized batch processing -pub fn find_locations_with_dictionary_batch( - text: &str, - language: LanguageType, - dictionary: &dyn Dictionary, - skip_patterns: &[Regex], -) -> Vec { - // For non-HashSet dictionaries, we still get deduplication benefits - find_locations(text, language, |word| dictionary.check(word), |_| true, skip_patterns) -} - #[cfg(test)] mod 
dictionary_tests { use super::*; diff --git a/crates/codebook/src/lib.rs b/crates/codebook/src/lib.rs index 5cc6840b..2f91e643 100644 --- a/crates/codebook/src/lib.rs +++ b/crates/codebook/src/lib.rs @@ -1,11 +1,14 @@ +pub mod checker; pub mod dictionaries; mod logging; pub mod parser; pub mod queries; pub mod regexes; +pub mod regions; mod splitter; use crate::regexes::get_default_skip_patterns; +use std::collections::HashSet; use std::path::Path; use std::sync::Arc; @@ -47,39 +50,34 @@ impl Codebook { return Vec::new(); } } - // get needed dictionary names - // get needed dictionaries - // call spell check on each dictionary + let language = self.resolve_language(language, file_path); - let dictionaries = self.get_dictionaries(Some(language)); - // Combine default and user patterns + + // Combine default and user skip patterns let mut all_patterns = get_default_skip_patterns().clone(); if let Some(user_patterns) = self.config.get_ignore_patterns() { all_patterns.extend(user_patterns); } - parser::find_locations( - text, - language, - |word| { - if self.config.should_flag_word(word) { - return false; - } - if word.len() < self.config.get_min_word_length() { - return true; - } - if self.config.is_allowed_word(word) { - return true; - } - for dictionary in &dictionaries { - if dictionary.check(word) { - return true; - } - } - false - }, - |tag| self.config.should_check_tag(tag), - &all_patterns, - ) + + // Stage 1: Split into language regions + let text_regions = regions::extract_regions(text, language); + + // Collect dictionaries for all languages present in the file + let dictionaries = self.get_dictionaries_for_languages(&text_regions); + + // Stages 2+3: Extract nodes and words from each region + let mut all_candidates = Vec::new(); + for region in &text_regions { + // Stage 2: Node extraction + let nodes = + parser::extract_nodes(text, region, &|tag| self.config.should_check_tag(tag)); + // Stage 3: Word extraction + let candidates = 
parser::extract_words(text, &nodes, &all_patterns); + all_candidates.extend(candidates); + } + + // Stage 4: Word checking + checker::check_words(&all_candidates, &dictionaries, self.config.as_ref()) } fn resolve_language( @@ -87,7 +85,6 @@ impl Codebook { language_type: Option, path: Option<&str>, ) -> queries::LanguageType { - // Check if we have a language_id first, fallback to path, fall back to text match language_type { Some(lang) => lang, None => match path { @@ -97,21 +94,32 @@ impl Codebook { } } - fn get_dictionaries( + /// Gather dictionaries for all languages present in a file. + fn get_dictionaries_for_languages( &self, - language: Option, + regions: &[regions::TextRegion], ) -> Vec> { let mut dictionary_ids = self.config.get_dictionary_ids(); - if let Some(lang) = language { - let language_dictionary_ids = lang.dictionary_ids(); - dictionary_ids.extend(language_dictionary_ids); - }; + + // Add language-specific dictionaries for all languages in the file + let mut seen_languages = HashSet::new(); + for region in regions { + if seen_languages.insert(region.language) { + dictionary_ids.extend(region.language.dictionary_ids()); + } + } + + // Add defaults dictionary_ids.extend(DEFAULT_DICTIONARIES.iter().map(|f| f.to_string())); + + // Deduplicate + dictionary_ids.sort(); + dictionary_ids.dedup(); + let mut dictionaries = Vec::with_capacity(dictionary_ids.len()); debug!("Checking text with dictionaries: {dictionary_ids:?}"); for dictionary_id in dictionary_ids { - let dictionary = self.manager.get_dictionary(&dictionary_id); - if let Some(d) = dictionary { + if let Some(d) = self.manager.get_dictionary(&dictionary_id) { dictionaries.push(d); } } @@ -125,9 +133,8 @@ impl Codebook { } pub fn get_suggestions(&self, word: &str) -> Option> { - // Get top suggestions and return the first 5 suggestions in round robin order let max_results = 5; - let dictionaries = self.get_dictionaries(None); + let dictionaries = self.get_dictionaries_for_languages(&[]); let mut 
is_misspelled = false; let suggestions: Vec> = dictionaries .iter() @@ -178,7 +185,6 @@ mod tests { ]; let result = collect_round_robin(&sources, 5); - // Round-robin order: first from each source, then second from each source assert_eq!( result, vec!["apple", "date", "grape", "banana", "elderberry"] @@ -193,12 +199,6 @@ mod tests { vec!["cherry", "date", "elderberry"], ]; - // In round-robin, we get: - // 1. apple (1st from 1st source) - // 2. banana (1st from 2nd source) - cherry already taken - // 3. cherry (1st from 3rd source) - // 4. banana (2nd from 1st source) - // 5. date (3rd from 2nd source) - cherry already taken let result = collect_round_robin(&sources, 5); assert_eq!( result, @@ -214,7 +214,6 @@ mod tests { vec!["fig", "grape"], ]; - // Round-robin order with uneven sources let result = collect_round_robin(&sources, 7); assert_eq!( result, @@ -241,7 +240,6 @@ mod tests { fn test_collect_round_robin_some_empty_sources() { let sources = vec![vec!["apple", "banana"], vec![], vec!["cherry", "date"]]; - // Round-robin order, skipping empty source let result = collect_round_robin(&sources, 4); assert_eq!(result, vec!["apple", "cherry", "banana", "date"]); } @@ -250,7 +248,6 @@ mod tests { fn test_collect_round_robin_with_numbers() { let sources = vec![vec![1, 3, 5], vec![2, 4, 6]]; - // Round-robin order with numbers let result = collect_round_robin(&sources, 6); assert_eq!(result, vec![1, 2, 3, 4, 5, 6]); } @@ -263,7 +260,6 @@ mod tests { vec!["grape", "honeydew", "kiwi"], ]; - // First round of round-robin (first from each source) let result = collect_round_robin(&sources, 3); assert_eq!(result, vec!["apple", "date", "grape"]); } @@ -272,7 +268,6 @@ mod tests { fn test_collect_round_robin_max_count_higher_than_available() { let sources = vec![vec!["apple", "banana"], vec!["cherry", "date"]]; - // Round-robin order for all available elements let result = collect_round_robin(&sources, 10); assert_eq!(result, vec!["apple", "banana", "cherry", "date"]); } 
diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs index 894c3b1b..f655a285 100644 --- a/crates/codebook/src/parser.rs +++ b/crates/codebook/src/parser.rs @@ -1,8 +1,9 @@ -use crate::splitter::{self}; - +use crate::checker::WordCandidate; use crate::queries::{LanguageType, get_language_setting}; +use crate::regions::TextRegion; +use crate::splitter; use regex::Regex; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::sync::{LazyLock, Mutex}; use streaming_iterator::StreamingIterator; use tree_sitter::{Parser, Query, QueryCursor}; @@ -79,88 +80,6 @@ fn merge_overlapping_ranges(ranges: Vec) -> Vec { merged } -/// Helper struct to handle text position tracking and word extraction -struct TextProcessor { - text: String, - skip_ranges: Vec, -} - -impl TextProcessor { - fn new(text: &str, skip_patterns: &[Regex]) -> Self { - let skip_ranges = find_skip_ranges(text, skip_patterns); - Self { - text: text.to_string(), - skip_ranges, - } - } - - fn should_skip(&self, start_byte: usize, word_len: usize) -> bool { - is_within_skip_range(start_byte, start_byte + word_len, &self.skip_ranges) - } - - fn process_words_with_check(&self, mut check_function: F) -> Vec - where - F: FnMut(&str) -> bool, - { - // First pass: collect all unique words with their positions - let estimated_words = (self.text.len() as f64 / 6.0).ceil() as usize; - let mut word_positions: HashMap<&str, Vec> = - HashMap::with_capacity(estimated_words); - - for (offset, word) in self.text.split_word_bound_indices() { - if is_alphabetic(word) && !self.should_skip(offset, word.len()) { - self.collect_split_words(word, offset, &mut word_positions); - } - } - - // Second pass: batch check unique words and filter - let mut result_locations: HashMap> = HashMap::new(); - for (word_text, positions) in word_positions { - if !check_function(word_text) { - result_locations.insert(word_text.to_string(), positions); - } - } - - result_locations - .into_iter() - 
.map(|(word, locations)| WordLocation::new(word, locations)) - .collect() - } - - fn extract_words(&self) -> Vec { - // Reuse the word collection logic by collecting all words (check always returns false) - self.process_words_with_check(|_| false) - } - - fn collect_split_words<'a>( - &self, - word: &'a str, - offset: usize, - word_positions: &mut HashMap<&'a str, Vec>, - ) { - if !word.is_empty() { - let split = splitter::split(word); - for split_word in split { - if !is_numeric(split_word.word) { - let word_start_byte = offset + split_word.start_byte; - let location = TextRange { - start_byte: word_start_byte, - end_byte: word_start_byte + split_word.word.len(), - }; - let word_text = split_word.word; - word_positions.entry(word_text).or_default().push(location); - } - } - } - } -} - -#[derive(Debug, Clone, PartialEq)] -pub struct WordRef<'a> { - pub word: &'a str, - pub position: (u32, u32), // (start_char, line) -} - #[derive(Debug, Clone, PartialEq)] pub struct WordLocation { pub word: String, @@ -173,40 +92,67 @@ impl WordLocation { } } -pub fn find_locations( - text: &str, - language: LanguageType, - check_function: impl Fn(&str) -> bool, - tag_filter: impl Fn(&str) -> bool, - skip_patterns: &[Regex], -) -> Vec { - match language { - LanguageType::Text => { - let processor = TextProcessor::new(text, skip_patterns); - processor.process_words_with_check(|word| check_function(word)) +// ============================================================================= +// Stage 2: Node Extraction +// ============================================================================= + +/// A text span extracted from a tree-sitter query match or plain text region. +/// Coordinates are in original-document byte offsets. 
+#[derive(Debug, Clone)] +pub struct TextNode { + /// Byte range start in the original document + pub start_byte: usize, + /// Byte range end in the original document + pub end_byte: usize, + /// The text content of this node + pub text: String, +} + +/// Extract spellcheckable text nodes from a region. +/// For code regions, uses tree-sitter parsing and queries. +/// For text/markdown prose regions, returns the whole region as one node. +/// All byte offsets are in original document coordinates. +pub fn extract_nodes( + document_text: &str, + region: &TextRegion, + tag_filter: &dyn Fn(&str) -> bool, +) -> Vec { + let region_text = &document_text[region.start_byte..region.end_byte]; + + match region.language { + LanguageType::Text | LanguageType::Markdown => { + // Plain text / markdown prose: the whole region is one node + vec![TextNode { + start_byte: region.start_byte, + end_byte: region.end_byte, + text: region_text.to_string(), + }] + } + _ => { + // Code: parse with tree-sitter, run query, extract captured nodes + extract_nodes_with_treesitter( + region_text, + region.start_byte, + region.language, + tag_filter, + ) } - _ => find_locations_code( - text, - language, - |word| check_function(word), - &tag_filter, - skip_patterns, - ), } } -fn find_locations_code( +/// Parse text with tree-sitter and extract nodes matching the language's query. +fn extract_nodes_with_treesitter( text: &str, + base_offset: usize, language: LanguageType, - check_function: impl Fn(&str) -> bool, tag_filter: &dyn Fn(&str) -> bool, - skip_patterns: &[Regex], -) -> Vec { - let language_setting = - get_language_setting(language).expect("This _should_ never happen. Famous last words."); +) -> Vec { + let language_setting = match get_language_setting(language) { + Some(s) => s, + None => return Vec::new(), + }; // Parse under global lock to protect external scanners with global C state. - // The lock covers create + parse; Tree is fully owned after parse returns. 
let tree = { let mut cache = PARSER_CACHE.lock().unwrap(); let parser = cache.entry(language).or_insert_with(|| { @@ -223,74 +169,82 @@ fn find_locations_code( let query = Query::new(&lang, language_setting.query).unwrap(); let capture_names = query.capture_names(); let mut cursor = QueryCursor::new(); - let mut word_locations: HashMap> = HashMap::new(); let provider = text.as_bytes(); let mut matches_query = cursor.matches(&query, root_node, provider); - // Find all skip ranges from patterns matched against the full source text - let all_skip_ranges = find_skip_ranges(text, skip_patterns); - + let mut nodes = Vec::new(); while let Some(match_) = matches_query.next() { for capture in match_.captures { - // Filter by tag let tag = &capture_names[capture.index as usize]; - if !tag_filter(tag) { + // Skip internal tags and filtered tags + if *tag == "language" || !tag_filter(tag) { continue; } - let node = capture.node; - let node_start_byte = node.start_byte(); - let node_text = node.utf8_text(provider).unwrap(); - let processor = TextProcessor::new(node_text, &[]); - let words = processor.extract_words(); - - // Check words against global skip ranges and dictionary - for word_pos in words { - if !check_function(&word_pos.word) { - for range in word_pos.locations { - let global_start = range.start_byte + node_start_byte; - let global_end = range.end_byte + node_start_byte; - - // Skip if word is entirely within a skip range - if is_within_skip_range(global_start, global_end, &all_skip_ranges) { - continue; - } - - let location = TextRange { - start_byte: global_start, - end_byte: global_end, - }; - if let Some(existing_result) = word_locations.get_mut(&word_pos.word) { - let added = existing_result.insert(location); - debug_assert!( - added, - "Two of the same locations found. Make a better query. 
Word: {}, Location: {:?}", - word_pos.word, location - ); - } else { - let mut set = HashSet::new(); - set.insert(location); - word_locations.insert(word_pos.word.clone(), set); - } - } - } - } + nodes.push(TextNode { + start_byte: node.start_byte() + base_offset, + end_byte: node.end_byte() + base_offset, + text: node_text.to_string(), + }); } } + nodes +} + +// ============================================================================= +// Stage 3: Word Extraction +// ============================================================================= + +/// Extract candidate words from text nodes, applying skip patterns. +/// All byte offsets are in original document coordinates. +pub fn extract_words( + document_text: &str, + nodes: &[TextNode], + skip_patterns: &[Regex], +) -> Vec { + // Compute skip ranges once against the full document + let skip_ranges = find_skip_ranges(document_text, skip_patterns); + + let mut candidates = Vec::new(); + for node in nodes { + extract_words_from_text(&node.text, node.start_byte, &skip_ranges, &mut candidates); + } + candidates +} - word_locations - .keys() - .map(|word| WordLocation { - word: word.clone(), - locations: word_locations - .get(word) - .cloned() - .unwrap_or_default() - .into_iter() - .collect(), - }) - .collect() +/// Extract words from a text span, applying skip ranges and word splitting. 
+fn extract_words_from_text( + text: &str, + base_offset: usize, + skip_ranges: &[SkipRange], + candidates: &mut Vec, +) { + for (offset, word) in text.split_word_bound_indices() { + if !is_alphabetic(word) { + continue; + } + let global_offset = base_offset + offset; + if is_within_skip_range(global_offset, global_offset + word.len(), skip_ranges) { + continue; + } + let split = splitter::split(word); + for split_word in split { + if is_numeric(split_word.word) { + continue; + } + let word_start = global_offset + split_word.start_byte; + let word_end = word_start + split_word.word.len(); + if is_within_skip_range(word_start, word_end, skip_ranges) { + continue; + } + candidates.push(WordCandidate { + word: split_word.word.to_string(), + start_byte: word_start, + end_byte: word_end, + }); + } + } } fn is_numeric(s: &str) -> bool { @@ -314,153 +268,173 @@ pub fn get_word_from_string(start_utf16: usize, end_utf16: usize, text: &str) -> #[cfg(test)] mod parser_tests { use super::*; + use crate::regions::TextRegion; #[test] - fn test_spell_checking() { + fn test_extract_words_basic() { let text = "HelloWorld calc_wrld"; - let results = find_locations(text, LanguageType::Text, |_| false, |_| true, &[]); - println!("{results:?}"); - assert_eq!(results.len(), 4); + let nodes = vec![TextNode { + start_byte: 0, + end_byte: text.len(), + text: text.to_string(), + }]; + let words = extract_words(text, &nodes, &[]); + let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect(); + assert!(word_strs.contains(&"Hello")); + assert!(word_strs.contains(&"World")); + assert!(word_strs.contains(&"calc")); + assert!(word_strs.contains(&"wrld")); + assert_eq!(words.len(), 4); } #[test] - fn test_get_words_from_text() { - let text = r#" - HelloWorld calc_wrld - I'm a contraction, don't ignore me - this is a 3rd line. 
- "#; - let expected = vec![ - ("Hello", (13, 18)), - ("World", (18, 23)), - ("calc", (24, 28)), - ("wrld", (29, 33)), - ("I'm", (46, 49)), - ("a", (50, 51)), - ("contraction", (52, 63)), - ("don't", (65, 70)), - ("ignore", (71, 77)), - ("me", (78, 80)), - ("this", (93, 97)), - ("is", (98, 100)), - ("a", (101, 102)), - ("rd", (104, 106)), - ("line", (107, 111)), - ]; - let processor = TextProcessor::new(text, &[]); - let words = processor.extract_words(); - println!("{words:?}"); - for word in words { - let loc = word.locations.first().unwrap(); - let pos = (loc.start_byte, loc.end_byte); + fn test_extract_words_contraction() { + let text = "I'm a contraction, wouldn't you agree'?"; + let nodes = vec![TextNode { + start_byte: 0, + end_byte: text.len(), + text: text.to_string(), + }]; + let words = extract_words(text, &nodes, &[]); + let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect(); + let expected = ["I'm", "a", "contraction", "wouldn't", "you", "agree"]; + for e in &expected { + assert!(word_strs.contains(e), "Expected word '{e}' not found"); + } + } + + #[test] + fn test_extract_nodes_plain_text() { + let text = "hello world"; + let region = TextRegion { + start_byte: 0, + end_byte: text.len(), + language: LanguageType::Text, + }; + let nodes = extract_nodes(text, &region, &|_| true); + assert_eq!(nodes.len(), 1); + assert_eq!(nodes[0].text, "hello world"); + assert_eq!(nodes[0].start_byte, 0); + } + + #[test] + fn test_extract_nodes_code() { + let text = "// a comment\nfn main() {}"; + let region = TextRegion { + start_byte: 0, + end_byte: text.len(), + language: LanguageType::Rust, + }; + let nodes = extract_nodes(text, &region, &|_| true); + // Should have at least the comment node + assert!(!nodes.is_empty()); + let comment_node = nodes.iter().find(|n| n.text.contains("comment")); + assert!(comment_node.is_some(), "Should find comment node"); + } + + #[test] + fn test_extract_nodes_with_base_offset() { + // Simulate a code block starting at 
byte 50 in a larger document + let code = "// hello world"; + let padded = format!("{}{}", " ".repeat(50), code); + let region = TextRegion { + start_byte: 50, + end_byte: 50 + code.len(), + language: LanguageType::Rust, + }; + let nodes = extract_nodes(&padded, &region, &|_| true); + assert!(!nodes.is_empty()); + // All node offsets should be >= 50 + for node in &nodes { + assert!(node.start_byte >= 50, "Node offset should include base offset"); + } + } + + #[test] + fn test_extract_nodes_tag_filter() { + let text = "// comment\nlet x = \"string\";"; + let region = TextRegion { + start_byte: 0, + end_byte: text.len(), + language: LanguageType::Rust, + }; + // Only allow comment tags + let nodes = extract_nodes(text, &region, &|tag| tag.starts_with("comment")); + for node in &nodes { + // Should only have comment content assert!( - expected.contains(&(word.word.as_str(), pos)), - "Expected word '{}' to be at position {:?}", - word.word, - pos + node.text.contains("comment"), + "Expected only comment nodes, got: {:?}", + node.text ); } } #[test] - fn test_contraction() { - let text = "I'm a contraction, wouldn't you agree'?"; - let processor = TextProcessor::new(text, &[]); - let words = processor.extract_words(); - println!("{words:?}"); - let expected = ["I'm", "a", "contraction", "wouldn't", "you", "agree"]; - for word in words { - assert!(expected.contains(&word.word.as_str())); - } + fn test_extract_words_with_skip_patterns() { + let text = "check https://example.com this"; + let url_pattern = Regex::new(r"https?://[^\s]+").unwrap(); + let nodes = vec![TextNode { + start_byte: 0, + end_byte: text.len(), + text: text.to_string(), + }]; + let words = extract_words(text, &nodes, &[url_pattern]); + let word_strs: Vec<&str> = words.iter().map(|w| w.word.as_str()).collect(); + assert!(word_strs.contains(&"check")); + assert!(word_strs.contains(&"this")); + // URL components should be skipped + assert!(!word_strs.contains(&"https")); + 
assert!(!word_strs.contains(&"example")); } #[test] fn test_get_word_from_string() { - // Test with ASCII characters let text = "Hello World"; assert_eq!(get_word_from_string(0, 5, text), "Hello"); assert_eq!(get_word_from_string(6, 11, text), "World"); - - // Test with partial words assert_eq!(get_word_from_string(2, 5, text), "llo"); - // Test with Unicode characters let unicode_text = "こんにちは世界"; assert_eq!(get_word_from_string(0, 5, unicode_text), "こんにちは"); assert_eq!(get_word_from_string(5, 7, unicode_text), "世界"); - // Test with emoji (which can be multi-codepoint) let emoji_text = "Hello 👨‍👩‍👧‍👦 World"; assert_eq!(get_word_from_string(6, 17, emoji_text), "👨‍👩‍👧‍👦"); } + #[test] fn test_unicode_character_handling() { crate::logging::init_test_logging(); let text = "©
badword
"; - let processor = TextProcessor::new(text, &[]); - let words = processor.extract_words(); - println!("{words:?}"); - - // Make sure "badword" is included and correctly positioned - assert!(words.iter().any(|word| word.word == "badword")); - - // If "badword" is found, verify its position - if let Some(pos) = words.iter().find(|word| word.word == "badword") { - // The correct position should be 6 (after ©
) - let start_byte = pos.locations.first().unwrap().start_byte; - let end_byte = pos.locations.first().unwrap().end_byte; - assert_eq!( - start_byte, 7, - "Expected 'badword' to start at character position 7" - ); - assert_eq!(end_byte, 14, "Expected 'badword' to be on end_byte 14"); - } else { - panic!("Word 'badword' not found in the text"); - } + let nodes = vec![TextNode { + start_byte: 0, + end_byte: text.len(), + text: text.to_string(), + }]; + let words = extract_words(text, &nodes, &[]); + let badword = words.iter().find(|w| w.word == "badword"); + assert!(badword.is_some(), "Expected 'badword' to be found"); + let bw = badword.unwrap(); + assert_eq!(bw.start_byte, 7, "Expected 'badword' to start at byte 7"); + assert_eq!(bw.end_byte, 14, "Expected 'badword' to end at byte 14"); } #[test] - fn test_duplicate_word_locations() { - // Use a code language to exercise find_locations_code path + fn test_duplicate_word_locations_code() { let text = "// wrld foo wrld"; - let results = find_locations(text, LanguageType::Rust, |_| false, |_| true, &[]); - let wrld = results.iter().find(|loc| loc.word == "wrld").unwrap(); + let region = TextRegion { + start_byte: 0, + end_byte: text.len(), + language: LanguageType::Rust, + }; + let nodes = extract_nodes(text, &region, &|_| true); + let words = extract_words(text, &nodes, &[]); + let wrld_words: Vec<_> = words.iter().filter(|w| w.word == "wrld").collect(); assert_eq!( - wrld.locations.len(), + wrld_words.len(), 2, - "Expected two locations for repeated word 'wrld'" + "Expected two occurrences of 'wrld'" ); } - - // Something is up with the HTML tree-sitter package - // #[test] - // fn test_spell_checking_with_unicode() { - // crate::log::init_test_logging(); - // let text = "©
badword
"; - - // // Mock spell check function that flags "badword" - // let results = find_locations(text, LanguageType::Html, |word| word != "badword"); - - // println!("{:?}", results); - - // // Ensure "badword" is flagged - // let badword_result = results.iter().find(|loc| loc.word == "badword"); - // assert!(badword_result.is_some(), "Expected 'badword' to be flagged"); - - // // Check if the location is correct - // if let Some(location) = badword_result { - // assert_eq!( - // location.locations.len(), - // 1, - // "Expected exactly one location for 'badword'" - // ); - // let range = &location.locations[0]; - - // // The word should start after "©
" which is 6 characters - // assert_eq!(range.start_char, 6, "Wrong start position for 'badword'"); - - // // The word should end after "badword" which is 13 characters from the start - // assert_eq!(range.end_char, 13, "Wrong end position for 'badword'"); - // } - // } } diff --git a/crates/codebook/src/queries.rs b/crates/codebook/src/queries.rs index ccf83df9..a352f8db 100644 --- a/crates/codebook/src/queries.rs +++ b/crates/codebook/src/queries.rs @@ -204,7 +204,7 @@ pub static LANGUAGE_SETTINGS: &[LanguageSetting] = &[ type_: LanguageType::Markdown, ids: &["markdown"], dictionary_ids: &[], - query: include_str!("queries/markdown.scm"), + query: "", extensions: &["md", "markdown"], }, LanguageSetting { @@ -299,7 +299,7 @@ impl LanguageSetting { LanguageType::Javascript => Some(tree_sitter_javascript::LANGUAGE.into()), LanguageType::Latex => Some(codebook_tree_sitter_latex::LANGUAGE.into()), LanguageType::Lua => Some(tree_sitter_lua::LANGUAGE.into()), - LanguageType::Markdown => Some(tree_sitter_md::LANGUAGE.into()), + LanguageType::Markdown => None, // Handled by region extraction LanguageType::Odin => Some(tree_sitter_odin_codebook::LANGUAGE.into()), LanguageType::Php => Some(tree_sitter_php::LANGUAGE_PHP.into()), LanguageType::Python => Some(tree_sitter_python::LANGUAGE.into()), @@ -346,7 +346,9 @@ mod tests { fn test_all_queries_are_valid() { for language_setting in LANGUAGE_SETTINGS { // Skip testing Text since it doesn't have a language or query - if language_setting.type_ == LanguageType::Text { + if language_setting.type_ == LanguageType::Text + || language_setting.type_ == LanguageType::Markdown + { continue; } @@ -395,7 +397,9 @@ mod tests { #[test] fn test_all_capture_names_use_allowed_tags() { for language_setting in LANGUAGE_SETTINGS { - if language_setting.type_ == LanguageType::Text { + if language_setting.type_ == LanguageType::Text + || language_setting.type_ == LanguageType::Markdown + { continue; } diff --git 
a/crates/codebook/src/queries/markdown.scm b/crates/codebook/src/queries/markdown.scm deleted file mode 100644 index 8c1c6b2a..00000000 --- a/crates/codebook/src/queries/markdown.scm +++ /dev/null @@ -1,2 +0,0 @@ -(paragraph (inline) @string) -(atx_heading (inline) @string) diff --git a/crates/codebook/src/regions.rs b/crates/codebook/src/regions.rs new file mode 100644 index 00000000..3d3db789 --- /dev/null +++ b/crates/codebook/src/regions.rs @@ -0,0 +1,301 @@ +use crate::queries::LanguageType; +use std::collections::HashMap; +use std::str::FromStr; +use std::sync::{LazyLock, Mutex}; +use tree_sitter::Parser; + +/// A region of a file associated with a single language. +/// For most files, there's one region covering the whole file. +/// For multi-language files (markdown, astro, vue), there are multiple. +#[derive(Debug, Clone, PartialEq)] +pub struct TextRegion { + /// Byte range start in the original document + pub start_byte: usize, + /// Byte range end in the original document + pub end_byte: usize, + /// Which language governs this region + pub language: LanguageType, +} + +/// Parser cache for region extraction (separate from the main parser cache +/// since region extraction uses different grammars/queries than node extraction). +static REGION_PARSER_CACHE: LazyLock>> = + LazyLock::new(|| Mutex::new(HashMap::new())); + +/// Extract language regions from a document. +/// For single-language files, returns one region covering the whole text. +/// For multi-language files (markdown), returns multiple regions. +pub fn extract_regions(text: &str, language: LanguageType) -> Vec { + match language { + LanguageType::Markdown => extract_markdown_regions(text), + _ => vec![TextRegion { + start_byte: 0, + end_byte: text.len(), + language, + }], + } +} + +/// Map markdown info strings to LanguageType. +/// Handles common aliases beyond what LanguageType::from_str covers. 
+fn resolve_info_string(info: &str) -> Option<LanguageType> { + let trimmed = info.trim().to_lowercase(); + if trimmed.is_empty() { + return None; + } + // Try common aliases first + let lang = match trimmed.as_str() { + "py" => Some(LanguageType::Python), + "js" => Some(LanguageType::Javascript), + "ts" => Some(LanguageType::Typescript), + "tsx" => Some(LanguageType::Tsx), + "jsx" => Some(LanguageType::Javascript), + "sh" | "zsh" | "fish" | "shell" => Some(LanguageType::Bash), + "yml" => Some(LanguageType::YAML), + "c++" | "cc" | "cxx" | "hpp" => Some(LanguageType::Cpp), + "cs" => Some(LanguageType::CSharp), + "rb" => Some(LanguageType::Ruby), + "rs" => Some(LanguageType::Rust), + "tex" => Some(LanguageType::Latex), + _ => None, + }; + + if lang.is_some() { + return lang; + } + + // Fall back to from_str which handles VS Code language IDs + match LanguageType::from_str(&trimmed) { + Ok(LanguageType::Text) => None, // from_str returns Text for unknown, treat as unknown + Ok(lang) => Some(lang), + Err(_) => None, + } +} + +/// Extract regions from a markdown file. +/// Prose sections become Markdown regions (treated as plain text in node extraction). +/// Fenced code blocks become regions of the appropriate language.
+fn extract_markdown_regions(text: &str) -> Vec<TextRegion> { + let lang: tree_sitter::Language = tree_sitter_md::LANGUAGE.into(); + + let tree = { + let mut cache = REGION_PARSER_CACHE.lock().unwrap(); + let parser = cache + .entry(LanguageType::Markdown) + .or_insert_with(|| { + let mut parser = Parser::new(); + parser.set_language(&lang).unwrap(); + parser + }); + parser.parse(text, None).unwrap() + }; + + let mut regions = Vec::new(); + let root = tree.root_node(); + let provider = text.as_bytes(); + + walk_markdown_node(root, provider, &mut regions); + + // Sort by start position + regions.sort_by_key(|r| r.start_byte); + + // If no regions found (empty file, etc.), return the whole thing as markdown + if regions.is_empty() { + return vec![TextRegion { + start_byte: 0, + end_byte: text.len(), + language: LanguageType::Markdown, + }]; + } + + regions +} + +/// Recursively walk markdown AST to find prose and code block regions. +fn walk_markdown_node( + node: tree_sitter::Node, + source: &[u8], + regions: &mut Vec<TextRegion>, +) { + match node.kind() { + "fenced_code_block" => { + // Find info_string and code_fence_content children + let mut info_string = None; + let mut code_content = None; + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + match child.kind() { + "info_string" => { + // Get the language child of info_string + let mut ic = child.walk(); + for info_child in child.children(&mut ic) { + if info_child.kind() == "language" { + info_string = + Some(info_child.utf8_text(source).unwrap_or("").to_string()); + } + } + } + "code_fence_content" => { + code_content = Some((child.start_byte(), child.end_byte())); + } + _ => {} + } + } + + if let Some((start, end)) = code_content { + if start < end { + if let Some(info) = info_string { + if let Some(lang) = resolve_info_string(&info) { + regions.push(TextRegion { + start_byte: start, + end_byte: end, + language: lang, + }); + } + // If info string is unknown, skip the code block 
entirely + } + // If no
info string, skip the code block entirely + } + } + } + "inline" => { + // Check parent — we want inline content from paragraphs and headings + if let Some(parent) = node.parent() { + match parent.kind() { + "paragraph" | "atx_heading" | "setext_heading" => { + if node.start_byte() < node.end_byte() { + regions.push(TextRegion { + start_byte: node.start_byte(), + end_byte: node.end_byte(), + language: LanguageType::Markdown, + }); + } + } + _ => {} + } + } + } + _ => { + // Recurse into children + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + walk_markdown_node(child, source, regions); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_single_language_region() { + let regions = extract_regions("fn main() {}", LanguageType::Rust); + assert_eq!(regions.len(), 1); + assert_eq!(regions[0].language, LanguageType::Rust); + assert_eq!(regions[0].start_byte, 0); + assert_eq!(regions[0].end_byte, 12); + } + + #[test] + fn test_text_region() { + let regions = extract_regions("hello world", LanguageType::Text); + assert_eq!(regions.len(), 1); + assert_eq!(regions[0].language, LanguageType::Text); + } + + #[test] + fn test_markdown_prose_only() { + let text = "# Hello World\n\nSome paragraph text.\n"; + let regions = extract_regions(text, LanguageType::Markdown); + assert!(regions.len() >= 2); // heading + paragraph + for r in &regions { + assert_eq!(r.language, LanguageType::Markdown); + } + } + + #[test] + fn test_markdown_with_code_block() { + let text = "# Hello\n\nSome text.\n\n```python\ndef foo():\n pass\n```\n\nMore text.\n"; + let regions = extract_regions(text, LanguageType::Markdown); + println!("Regions: {regions:#?}"); + + // Should have markdown prose regions + python code region + let python_regions: Vec<_> = regions + .iter() + .filter(|r| r.language == LanguageType::Python) + .collect(); + assert_eq!(python_regions.len(), 1, "Expected one Python region"); + + let md_regions: Vec<_> = regions + 
.iter() +
.filter(|r| r.language == LanguageType::Markdown) + .collect(); + assert!(md_regions.len() >= 2, "Expected at least 2 markdown prose regions"); + } + + #[test] + fn test_markdown_unknown_language_skipped() { + let text = "# Hello\n\n```unknownlang\nsome code\n```\n\nMore text.\n"; + let regions = extract_regions(text, LanguageType::Markdown); + // Unknown language code block should produce no region + for r in &regions { + assert_eq!(r.language, LanguageType::Markdown); + } + } + + #[test] + fn test_markdown_no_info_string_skipped() { + let text = "# Hello\n\n```\nsome code\n```\n\nMore text.\n"; + let regions = extract_regions(text, LanguageType::Markdown); + // Code block without info string should produce no region + for r in &regions { + assert_eq!(r.language, LanguageType::Markdown); + } + } + + #[test] + fn test_resolve_info_string_aliases() { + assert_eq!(resolve_info_string("py"), Some(LanguageType::Python)); + assert_eq!(resolve_info_string("js"), Some(LanguageType::Javascript)); + assert_eq!(resolve_info_string("ts"), Some(LanguageType::Typescript)); + assert_eq!(resolve_info_string("sh"), Some(LanguageType::Bash)); + assert_eq!(resolve_info_string("rs"), Some(LanguageType::Rust)); + assert_eq!(resolve_info_string("rb"), Some(LanguageType::Ruby)); + assert_eq!(resolve_info_string("yml"), Some(LanguageType::YAML)); + assert_eq!(resolve_info_string("c++"), Some(LanguageType::Cpp)); + assert_eq!(resolve_info_string(""), None); + assert_eq!(resolve_info_string("unknownlang"), None); + } + + #[test] + fn test_resolve_info_string_vscode_ids() { + assert_eq!(resolve_info_string("python"), Some(LanguageType::Python)); + assert_eq!(resolve_info_string("javascript"), Some(LanguageType::Javascript)); + assert_eq!(resolve_info_string("rust"), Some(LanguageType::Rust)); + assert_eq!(resolve_info_string("bash"), Some(LanguageType::Bash)); + assert_eq!(resolve_info_string("go"), Some(LanguageType::Go)); + } + + #[test] + fn 
test_markdown_multiple_code_blocks() { + let text =
"Text.\n\n```bash\nmkdir dir\n```\n\n```python\nx = 1\n```\n\nEnd.\n"; + let regions = extract_regions(text, LanguageType::Markdown); + + let bash_regions: Vec<_> = regions.iter().filter(|r| r.language == LanguageType::Bash).collect(); + let python_regions: Vec<_> = regions.iter().filter(|r| r.language == LanguageType::Python).collect(); + + assert_eq!(bash_regions.len(), 1); + assert_eq!(python_regions.len(), 1); + } + + #[test] + fn test_markdown_code_block_content_correct() { + let text = "Hello.\n\n```python\ndef foo():\n pass\n```\n"; + let regions = extract_regions(text, LanguageType::Markdown); + let py = regions.iter().find(|r| r.language == LanguageType::Python).unwrap(); + let content = &text[py.start_byte..py.end_byte]; + assert!(content.contains("def foo()"), "Expected python code, got: {content:?}"); + } +} diff --git a/crates/codebook/tests/test_markdown.rs b/crates/codebook/tests/test_markdown.rs index 04b13dcb..fb399469 100644 --- a/crates/codebook/tests/test_markdown.rs +++ b/crates/codebook/tests/test_markdown.rs @@ -10,7 +10,7 @@ fn test_markdown_paragraph() { utils::init_logging(); let processor = utils::get_processor(); let sample_text = "Some paragraph text with a misspeled word.\n"; - let expected = vec![WordLocation::new( + let expected = [WordLocation::new( "misspeled".to_string(), vec![TextRange { start_byte: 27, @@ -41,7 +41,7 @@ fn test_markdown_heading() { } #[test] -fn test_markdown_fenced_code_block_skipped() { +fn test_markdown_fenced_code_block_known_lang() { utils::init_logging(); let processor = utils::get_processor(); let sample_text = r#"# Hello World @@ -50,7 +50,6 @@ Some correct text here. ```bash mkdir some_dir -badwwword_in_code ``` More correct text here. @@ -60,16 +59,59 @@ More correct text here. 
.to_vec(); let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect(); println!("Misspelled words: {words:?}"); - // Words inside fenced code blocks should NOT be flagged + // bash builtins like mkdir should be recognized by the bash dictionary assert!(!words.contains(&"mkdir")); - assert!(!words.contains(&"badwwword")); + // dir is a common abbreviation, should not be flagged assert!(!words.contains(&"dir")); } #[test] -fn test_markdown_fenced_code_block_with_typo_outside() { +fn test_markdown_fenced_code_block_unknown_lang_skipped() { utils::init_logging(); let processor = utils::get_processor(); + let sample_text = r#"Some text. + +```unknownlang +badwwword_in_code +``` + +More text. +"#; + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::Markdown), None) + .to_vec(); + let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect(); + println!("Misspelled words: {words:?}"); + // Unknown language code blocks are completely skipped + assert!(!words.contains(&"badwwword")); +} + +#[test] +fn test_markdown_fenced_code_block_no_lang_skipped() { + utils::init_logging(); + let processor = utils::get_processor(); + let sample_text = r#"Some text. + +``` +badwwword_in_code +``` + +More text. +"#; + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::Markdown), None) + .to_vec(); + let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect(); + println!("Misspelled words: {words:?}"); + // Code blocks without language info are completely skipped + assert!(!words.contains(&"badwwword")); +} + +#[test] +fn test_markdown_code_block_uses_language_grammar() { + utils::init_logging(); + let processor = utils::get_processor(); + // In Python grammar, function names are checked as identifiers let sample_text = r#"A paragrap with a tyypo. ```python @@ -84,11 +126,11 @@ Another paragrap with a tyypo. 
.to_vec(); let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect(); println!("Misspelled words: {words:?}"); - // Typos in prose should be flagged + // Prose typos should be flagged assert!(words.contains(&"paragrap")); assert!(words.contains(&"tyypo")); - // Typos inside code blocks should NOT be flagged - assert!(!words.contains(&"functin")); + // Python function name typo should also be flagged (multi-language support!) + assert!(words.contains(&"functin")); } #[test] @@ -103,7 +145,7 @@ mkdir somedir Middle text is corect. -```python +```unknownlang badspel = True ``` @@ -116,8 +158,9 @@ End text is also corect. println!("Misspelled words: {words:?}"); assert!(words.contains(&"tyypo")); assert!(words.contains(&"corect")); + // bash commands should be handled by bash grammar assert!(!words.contains(&"mkdir")); - assert!(!words.contains(&"somedir")); + // unknown language blocks are skipped entirely assert!(!words.contains(&"badspel")); } @@ -134,3 +177,30 @@ fn test_markdown_block_quote() { assert!(words.contains(&"quoet")); assert!(words.contains(&"tyypo")); } + +#[test] +fn test_markdown_code_block_alias_resolution() { + utils::init_logging(); + let processor = utils::get_processor(); + // Test that common aliases work (py -> Python, js -> Javascript, etc.) + let sample_text = r#"Some text. + +```py +def hello_wrld(): + pass +``` + +```js +function hello_wrld() {} +``` + +More text. 
+"#; + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::Markdown), None) + .to_vec(); + let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect(); + println!("Misspelled words: {words:?}"); + // wrld should be flagged as a function name typo in both languages + assert!(words.contains(&"wrld")); +} diff --git a/crates/codebook/tests/utils/mod.rs b/crates/codebook/tests/utils/mod.rs index 250e41b4..5437bb5e 100644 --- a/crates/codebook/tests/utils/mod.rs +++ b/crates/codebook/tests/utils/mod.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use codebook::Codebook; use codebook_config::{CodebookConfig, CodebookConfigMemory}; +#[allow(dead_code)] pub fn get_processor() -> Codebook { let config = Arc::new(CodebookConfigMemory::default()); config @@ -31,10 +32,7 @@ pub fn get_processor_with_include_and_ignore(include: &str, ignore: &str) -> Cod } #[allow(dead_code)] -pub fn get_processor_with_tags( - include_tags: Vec<&str>, - exclude_tags: Vec<&str>, -) -> Codebook { +pub fn get_processor_with_tags(include_tags: Vec<&str>, exclude_tags: Vec<&str>) -> Codebook { let settings = codebook_config::ConfigSettings { include_tags: include_tags.into_iter().map(String::from).collect(), exclude_tags: exclude_tags.into_iter().map(String::from).collect(), diff --git a/examples/example.md b/examples/example.md index 36d8cbf0..c1ac0d0a 100644 --- a/examples/example.md +++ b/examples/example.md @@ -6,3 +6,12 @@ ATGCATC bad DNA: ATGCATCssss + + +```python +import bad_spelin +# Not spel good +def im_guud(): + bad_spelin.bone() + pass +``` diff --git a/refactor.md b/refactor.md new file mode 100644 index 00000000..dab3a174 --- /dev/null +++ b/refactor.md @@ -0,0 +1,574 @@ +# Codebook Architecture Refactor + +## Goal + +Restructure the `codebook` crate internals to support multi-language files (markdown with code blocks, Astro/Vue/Svelte, HTML with `