Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
[Unreleased]

- Add multi-language support for Markdown files: fenced code blocks are now spell-checked using the appropriate language grammar (Python, Rust, Bash, etc.)
- Add language injection system via `@injection.*` capture tags in `.scm` query files. Adding multi-language support to a new file type now requires only a `.scm` change, with no Rust code changes
- Add HTML block injection in Markdown. Block-level HTML is spell-checked using the HTML grammar
- Add language alias resolution for Markdown code blocks (e.g., `py`, `js`, `sh`, `rs`, `yml`, `c++`)
- Pre-compile all tree-sitter queries at startup for faster spell-checking and earlier error detection
- Reduce per-word memory allocations in the spell-check pipeline
- Fix Erlang query producing duplicate captures for function name atoms
- Refactor: split spell-checking into separate extraction (`parser.rs`) and checking (`checker.rs`) modules

[0.3.35]

- Add tag-based filtering (`include_tags`/`exclude_tags`) to control which parts of code are spell-checked (comments, strings, identifiers, etc.)
Expand Down
123 changes: 123 additions & 0 deletions crates/codebook/src/checker.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
use std::collections::{HashMap, HashSet};

use crate::dictionaries::dictionary::Dictionary;
use crate::parser::{TextRange, WordLocation};
use codebook_config::CodebookConfig;

/// A candidate word extracted from a text node, with its position
/// in original-document byte offsets. Borrows the word text from the
/// source document to avoid per-word String allocations.
#[derive(Debug, Clone, PartialEq)]
pub struct WordCandidate<'a> {
    /// The word text, borrowed from the source document.
    pub word: &'a str,
    /// Byte offset of the word's first byte in the original document.
    pub start_byte: usize,
    /// Byte offset one past the word's last byte (exclusive end).
    pub end_byte: usize,
}

/// Check candidate words against dictionaries and config rules.
/// Returns WordLocations for misspelled words, grouping all locations
/// of the same word together.
///
/// Candidates are grouped by word text so each unique word is checked
/// exactly once, however many times it appears in the document.
///
/// Decision order per unique word:
/// 1. `should_flag_word` — always reported, bypassing every other check.
/// 2. Shorter than `get_min_word_length` — skipped.
/// 3. `is_allowed_word` — skipped.
/// 4. Otherwise reported unless at least one dictionary knows the word.
pub fn check_words(
    candidates: &[WordCandidate<'_>],
    dictionaries: &[std::sync::Arc<dyn Dictionary>],
    config: &dyn CodebookConfig,
) -> Vec<WordLocation> {
    // Group candidates by word text, deduplicating identical spans.
    let mut word_positions: HashMap<&str, HashSet<TextRange>> = HashMap::new();
    for candidate in candidates {
        let location = TextRange {
            start_byte: candidate.start_byte,
            end_byte: candidate.end_byte,
        };
        let added = word_positions
            .entry(candidate.word)
            .or_default()
            .insert(location);

        // A duplicate span means the tree-sitter query captured the same
        // node twice; surface that in debug builds so the query gets fixed.
        debug_assert!(
            added,
            "Two of the same locations found. Make a better query. Word: {}, Location: {:?}",
            candidate.word, location
        );
    }

    // Check each unique word once. Only materialize the positions Vec on
    // the paths that actually report the word, so correctly-spelled words
    // (the common case) cost no extra allocation.
    let mut results = Vec::new();
    for (word, positions) in word_positions {
        if config.should_flag_word(word) {
            results.push(WordLocation::new(
                word.to_string(),
                positions.into_iter().collect(),
            ));
            continue;
        }
        if word.len() < config.get_min_word_length() {
            continue;
        }
        if config.is_allowed_word(word) {
            continue;
        }
        let is_correct = dictionaries.iter().any(|dict| dict.check(word));
        if !is_correct {
            results.push(WordLocation::new(
                word.to_string(),
                positions.into_iter().collect(),
            ));
        }
    }
    results
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dictionaries::dictionary::TextDictionary;
    use std::sync::Arc;

    /// Build borrowed `WordCandidate`s from `(word, start, end)` triples.
    fn make_candidates<'a>(words: &[(&'a str, usize, usize)]) -> Vec<WordCandidate<'a>> {
        let mut candidates = Vec::with_capacity(words.len());
        for &(word, start_byte, end_byte) in words {
            candidates.push(WordCandidate {
                word,
                start_byte,
                end_byte,
            });
        }
        candidates
    }

    #[test]
    fn test_check_words_flags_unknown() {
        // "wrld" is not in the dictionary and must be the only result.
        let dictionary = Arc::new(TextDictionary::new("hello\nworld\n"));
        let cfg = Arc::new(codebook_config::CodebookConfigMemory::default());
        let input = make_candidates(&[("hello", 0, 5), ("wrld", 6, 10)]);
        let found = check_words(&input, &[dictionary], cfg.as_ref());
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].word, "wrld");
    }

    #[test]
    fn test_check_words_groups_locations() {
        // The same misspelling at two spans yields one result with both spans.
        let dictionary = Arc::new(TextDictionary::new("hello\n"));
        let cfg = Arc::new(codebook_config::CodebookConfigMemory::default());
        let input = make_candidates(&[("wrld", 0, 4), ("wrld", 10, 14)]);
        let found = check_words(&input, &[dictionary], cfg.as_ref());
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].word, "wrld");
        assert_eq!(found[0].locations.len(), 2);
    }

    #[test]
    fn test_check_words_respects_min_length() {
        let dictionary = Arc::new(TextDictionary::new(""));
        let cfg = Arc::new(codebook_config::CodebookConfigMemory::default());
        // Default min word length is 3
        let input = make_candidates(&[("ab", 0, 2)]);
        let found = check_words(&input, &[dictionary], cfg.as_ref());
        assert!(found.is_empty(), "Short words should be skipped");
    }

    #[test]
    fn test_check_words_respects_allowed_words() {
        let dictionary = Arc::new(TextDictionary::new(""));
        let cfg = Arc::new(codebook_config::CodebookConfigMemory::default());
        cfg.add_word("codebook").unwrap();
        let input = make_candidates(&[("codebook", 0, 8)]);
        let found = check_words(&input, &[dictionary], cfg.as_ref());
        assert!(found.is_empty(), "Allowed words should not be flagged");
    }
}
15 changes: 0 additions & 15 deletions crates/codebook/src/dictionaries/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@ use std::{
sync::{Arc, RwLock},
};

use crate::parser::{WordLocation, find_locations};
use crate::queries::LanguageType;
use regex::Regex;

pub trait Dictionary: Send + Sync {
fn check(&self, word: &str) -> bool;
fn suggest(&self, word: &str) -> Vec<String>;
Expand Down Expand Up @@ -170,17 +166,6 @@ impl TextDictionary {
}
}

/// Integration helper to use any Dictionary trait with optimized batch processing
pub fn find_locations_with_dictionary_batch(
text: &str,
language: LanguageType,
dictionary: &dyn Dictionary,
skip_patterns: &[Regex],
) -> Vec<WordLocation> {
// For non-HashSet dictionaries, we still get deduplication benefits
find_locations(text, language, |word| dictionary.check(word), |_| true, skip_patterns)
}

#[cfg(test)]
mod dictionary_tests {
use super::*;
Expand Down
87 changes: 30 additions & 57 deletions crates/codebook/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod checker;
pub mod dictionaries;
mod logging;
pub mod parser;
Expand All @@ -6,6 +7,7 @@ pub mod regexes;
mod splitter;

use crate::regexes::get_default_skip_patterns;
use std::collections::HashSet;
use std::path::Path;
use std::sync::Arc;

Expand Down Expand Up @@ -38,56 +40,42 @@ impl Codebook {
file_path: Option<&str>,
) -> Vec<parser::WordLocation> {
if let Some(file_path) = file_path {
// ignore_paths is a blocklist and has higher precedence than include_paths
if self.config.should_ignore_path(Path::new(file_path)) {
return Vec::new();
}
// include_paths is an allowlist; empty list means "include everything"
if !self.config.should_include_path(Path::new(file_path)) {
return Vec::new();
}
}
// get needed dictionary names
// get needed dictionaries
// call spell check on each dictionary

let language = self.resolve_language(language, file_path);
let dictionaries = self.get_dictionaries(Some(language));
// Combine default and user patterns

// Combine default and user skip patterns
let mut all_patterns = get_default_skip_patterns().clone();
if let Some(user_patterns) = self.config.get_ignore_patterns() {
all_patterns.extend(user_patterns);
}
parser::find_locations(

// Extract all words, recursively following injections
let (candidates, languages_found) = parser::extract_all_words(
text,
language,
|word| {
if self.config.should_flag_word(word) {
return false;
}
if word.len() < self.config.get_min_word_length() {
return true;
}
if self.config.is_allowed_word(word) {
return true;
}
for dictionary in &dictionaries {
if dictionary.check(word) {
return true;
}
}
false
},
|tag| self.config.should_check_tag(tag),
&|tag| self.config.should_check_tag(tag),
&all_patterns,
)
);

// Load dictionaries for all languages encountered
let dictionaries = self.get_dictionaries_for_languages(&languages_found);

// Check words against dictionaries
checker::check_words(&candidates, &dictionaries, self.config.as_ref())
}

fn resolve_language(
&self,
language_type: Option<queries::LanguageType>,
path: Option<&str>,
) -> queries::LanguageType {
// Check if we have a language_id first, fallback to path, fall back to text
match language_type {
Some(lang) => lang,
None => match path {
Expand All @@ -97,21 +85,26 @@ impl Codebook {
}
}

fn get_dictionaries(
/// Gather dictionaries for all languages encountered in a file.
fn get_dictionaries_for_languages(
&self,
language: Option<queries::LanguageType>,
languages: &HashSet<queries::LanguageType>,
) -> Vec<Arc<dyn Dictionary>> {
let mut dictionary_ids = self.config.get_dictionary_ids();
if let Some(lang) = language {
let language_dictionary_ids = lang.dictionary_ids();
dictionary_ids.extend(language_dictionary_ids);
};

for lang in languages {
dictionary_ids.extend(lang.dictionary_ids());
}

dictionary_ids.extend(DEFAULT_DICTIONARIES.iter().map(|f| f.to_string()));

dictionary_ids.sort();
dictionary_ids.dedup();

let mut dictionaries = Vec::with_capacity(dictionary_ids.len());
debug!("Checking text with dictionaries: {dictionary_ids:?}");
for dictionary_id in dictionary_ids {
let dictionary = self.manager.get_dictionary(&dictionary_id);
if let Some(d) = dictionary {
if let Some(d) = self.manager.get_dictionary(&dictionary_id) {
dictionaries.push(d);
}
}
Expand All @@ -125,9 +118,8 @@ impl Codebook {
}

pub fn get_suggestions(&self, word: &str) -> Option<Vec<String>> {
// Get top suggestions and return the first 5 suggestions in round robin order
let max_results = 5;
let dictionaries = self.get_dictionaries(None);
let dictionaries = self.get_dictionaries_for_languages(&HashSet::new());
let mut is_misspelled = false;
let suggestions: Vec<Vec<String>> = dictionaries
.iter()
Expand Down Expand Up @@ -176,9 +168,7 @@ mod tests {
vec!["date", "elderberry", "fig"],
vec!["grape", "honeydew", "kiwi"],
];

let result = collect_round_robin(&sources, 5);
// Round-robin order: first from each source, then second from each source
assert_eq!(
result,
vec!["apple", "date", "grape", "banana", "elderberry"]
Expand All @@ -192,13 +182,6 @@ mod tests {
vec!["banana", "cherry", "date"],
vec!["cherry", "date", "elderberry"],
];

// In round-robin, we get:
// 1. apple (1st from 1st source)
// 2. banana (1st from 2nd source) - cherry already taken
// 3. cherry (1st from 3rd source)
// 4. banana (2nd from 1st source)
// 5. date (3rd from 2nd source) - cherry already taken
let result = collect_round_robin(&sources, 5);
assert_eq!(
result,
Expand All @@ -213,8 +196,6 @@ mod tests {
vec!["elderberry"],
vec!["fig", "grape"],
];

// Round-robin order with uneven sources
let result = collect_round_robin(&sources, 7);
assert_eq!(
result,
Expand All @@ -240,17 +221,13 @@ mod tests {
#[test]
fn test_collect_round_robin_some_empty_sources() {
let sources = vec![vec!["apple", "banana"], vec![], vec!["cherry", "date"]];

// Round-robin order, skipping empty source
let result = collect_round_robin(&sources, 4);
assert_eq!(result, vec!["apple", "cherry", "banana", "date"]);
}

#[test]
fn test_collect_round_robin_with_numbers() {
let sources = vec![vec![1, 3, 5], vec![2, 4, 6]];

// Round-robin order with numbers
let result = collect_round_robin(&sources, 6);
assert_eq!(result, vec![1, 2, 3, 4, 5, 6]);
}
Expand All @@ -262,17 +239,13 @@ mod tests {
vec!["date", "elderberry", "fig"],
vec!["grape", "honeydew", "kiwi"],
];

// First round of round-robin (first from each source)
let result = collect_round_robin(&sources, 3);
assert_eq!(result, vec!["apple", "date", "grape"]);
}

#[test]
fn test_collect_round_robin_max_count_higher_than_available() {
let sources = vec![vec!["apple", "banana"], vec!["cherry", "date"]];

// Round-robin order for all available elements
let result = collect_round_robin(&sources, 10);
assert_eq!(result, vec!["apple", "banana", "cherry", "date"]);
}
Expand Down
Loading
Loading