diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 59fcdd2..788510d 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -13,7 +13,8 @@ "Bash(git remote get-url:*)", "Bash(gh issue list:*)", "Bash(gh issue view:*)", - "Bash(gh repo view:*)" + "Bash(gh repo view:*)", + "Bash(cargo build:*)" ] } } diff --git a/CHANGELOG.md b/CHANGELOG.md index af0effe..684d95b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +[Unreleased] + +- Add tag-based filtering (`include_tags`/`exclude_tags`) to control which parts of code are spell-checked (comments, strings, identifiers, etc.) +- Rename tree-sitter capture names to use dot-separated namespace convention (e.g., `@identifier.function` instead of `@func_declaration`) + [0.3.34] - Fix crash in Termux by falling back to bundled Mozilla CA roots on Android (#230) diff --git a/README.md b/README.md index d6550f1..262a22c 100644 --- a/README.md +++ b/README.md @@ -293,6 +293,19 @@ ignore_patterns = [ # Set to 2 to check words with 2 or more characters min_word_length = 3 +# Filter which parts of your code are spell-checked by tag. +# Tags use a dot-separated hierarchy (e.g., "comment", "identifier.function"). +# Matching is prefix-based: "comment" matches "comment", "comment.line", +# "comment.block", etc. +# +# Only check these tags (if set, everything else is excluded) +# Default: [] (empty = check everything) +include_tags = ["comment", "string"] +# +# Exclude these tags from checking (takes precedence over include_tags) +# Default: [] +exclude_tags = ["string.heredoc"] + # Whether to use global configuration (project config only) # Set to false to completely ignore global settings # Default: true @@ -355,6 +368,26 @@ ignore_patterns = [ **Tip**: Include the identifier in your pattern. `'vim\.opt\.[a-z]+'` skips `showmode` in `vim.opt.showmode`, but `'vim\.opt\.'` alone won't (it only matches up to the dot). +### Tag-Based Filtering + +Codebook categorizes every piece of text it checks using **tags** — dot-separated labels like `comment`, `string`, `identifier.function`, etc. You can use `include_tags` and `exclude_tags` to control which categories are spell-checked. + +Matching is **prefix-based**: `"comment"` matches `comment`, `comment.line`, `comment.block`, etc. `include_tags` narrows what is checked (allowlist), and `exclude_tags` removes from that set (blocklist, takes precedence). This works the same way as `include_paths`/`ignore_paths`. + +```toml +# Only check comments and strings, ignore all identifiers +include_tags = ["comment", "string"] + +# Check everything except variable and parameter names +exclude_tags = ["identifier.variable", "identifier.parameter"] + +# Both can be combined: check comments and strings, but skip heredocs +include_tags = ["comment", "string"] +exclude_tags = ["string.heredoc"] +``` + +For the full list of available tags, see the [query tag reference](crates/codebook/src/queries/README.md). + ### LSP Initialization Options Editors can pass `initializationOptions` when starting the Codebook LSP for LSP-specific options. Refer to your editor's documentation for how to apply these options. All values are optional, omit them for the default behavior: @@ -451,68 +484,7 @@ For plain text dictionaries, use `TextRepo::new()` instead and add to `TEXT_DICT ## Adding New Programming Language Support -Codebook uses Tree-sitter support additional programming languages. Here's how to add support for a new language: - -### 1. Create a Tree-sitter Query - -Each language needs a Tree-sitter query file that defines which parts of the code should be checked for spelling issues. The query needs to capture: - -- Identifiers (variable names, function names, class names, etc.) -- String literals -- Comments - -Create a new `.scm` file in `codebook/crates/codebook/src/queries/` named after your language (e.g., `java.scm`). - -### 2. Understand the Language's AST - -To write an effective query, you need to understand the Abstract Syntax Tree (AST) structure of your language. Use these tools: - -- [Tree-sitter Playground](https://tree-sitter.github.io/tree-sitter/7-playground.html): Interactively explore how Tree-sitter parses code -- [Tree-sitter Visualizer](https://blopker.github.io/ts-visualizer/): Visualize the AST of your code in a more detailed way - -A good approach is to: - -1. Write sample code with identifiers, strings, and comments -2. Paste it into the playground/visualizer -3. Observe the node types used for each element -4. Create capture patterns that target only definition nodes, not usages - -### 3. Update the Language Settings - -Add your language to `codebook/crates/codebook/src/queries.rs`: - -1. Add a new variant to the `LanguageType` enum -2. Add a new entry to the `LANGUAGE_SETTINGS` array with: - - The language type - - File extensions for your language - - Language identifiers - - Path to your query file - -### 4. Add the Tree-sitter Grammar - -Make sure the appropriate Tree-sitter grammar is added as a dependency in `Cargo.toml` and update the `language()` function in `queries.rs` to return the correct language parser. - -### 5. Test Your Implementation - -Run the tests to ensure your query is valid: - -```bash -cargo test -p codebook queries::tests::test_all_queries_are_valid -``` - -Additional language tests should go in `codebook/tests`. There are many example tests to copy. - -You can also test with real code files to verify that Codebook correctly identifies spelling issues in your language. Example files should go in `examples/` and contain at least one spelling error to pass integration tests. - -### Tips for Writing Effective Queries - -- Focus on capturing definitions, not usages -- Include only nodes that contain user-defined text (not keywords) -- Test with representative code samples -- Start simple and add complexity as needed -- Look at existing language queries for patterns - -If you've successfully added support for a new language, please consider contributing it back to Codebook with a pull request! +See the [query development guide](crates/codebook/src/queries/README.md) for instructions on adding Tree-sitter queries for new languages, the tag naming convention, and tips for writing effective queries. ## Running Tests diff --git a/crates/codebook-config/src/helpers.rs b/crates/codebook-config/src/helpers.rs index c8c0d5f..393eaac 100644 --- a/crates/codebook-config/src/helpers.rs +++ b/crates/codebook-config/src/helpers.rs @@ -1,5 +1,3 @@ -use crate::settings::ConfigSettings; -use glob::Pattern; use log::error; use regex::{Regex, RegexBuilder}; use std::env; @@ -57,86 +55,6 @@ pub(crate) fn unix_cache_dir() -> PathBuf { env::temp_dir().join("codebook").join("cache") } -/// Insert a word into the allowlist, returning true when it was newly added. -pub(crate) fn insert_word(settings: &mut ConfigSettings, word: &str) -> bool { - let word = word.to_ascii_lowercase(); - if settings.words.contains(&word) { - return false; - } - settings.words.push(word); - settings.words.sort(); - settings.words.dedup(); - true -} - -/// Insert a path into the ignore list, returning true when it was newly added. -pub(crate) fn insert_ignore(settings: &mut ConfigSettings, file: &str) -> bool { - let file = file.to_string(); - if settings.ignore_paths.contains(&file) { - return false; - } - settings.ignore_paths.push(file); - settings.ignore_paths.sort(); - settings.ignore_paths.dedup(); - true -} - -/// Insert a path into the include list, returning true when it was newly added. -pub(crate) fn insert_include(settings: &mut ConfigSettings, file: &str) -> bool { - let file = file.to_string(); - if settings.include_paths.contains(&file) { - return false; - } - settings.include_paths.push(file); - settings.include_paths.sort(); - settings.include_paths.dedup(); - true -} - -/// Resolve configured dictionary IDs, providing a default when none are set. -pub(crate) fn dictionary_ids(settings: &ConfigSettings) -> Vec { - if settings.dictionaries.is_empty() { - vec!["en_us".to_string()] - } else { - settings.dictionaries.clone() - } -} - -fn match_pattern(pattern: &[String], path_str: &str) -> bool { - pattern.iter().any(|pattern| { - Pattern::new(pattern) - .map(|p| p.matches(path_str)) - .unwrap_or(false) - }) -} - -/// Determine whether a path should be included based on the configured glob patterns. -pub(crate) fn should_include_path(settings: &ConfigSettings, path: &Path) -> bool { - if settings.include_paths.is_empty() { - return true; - } - let path_str = path.to_string_lossy(); - match_pattern(&settings.include_paths, &path_str) -} - -/// Determine whether a path should be ignored based on the configured glob patterns. -pub(crate) fn should_ignore_path(settings: &ConfigSettings, path: &Path) -> bool { - let path_str = path.to_string_lossy(); - match_pattern(&settings.ignore_paths, &path_str) -} - -/// Check if a word is explicitly allowed. -pub(crate) fn is_allowed_word(settings: &ConfigSettings, word: &str) -> bool { - let word = word.to_ascii_lowercase(); - settings.words.iter().any(|w| w == &word) -} - -/// Check if a word should be flagged. -pub(crate) fn should_flag_word(settings: &ConfigSettings, word: &str) -> bool { - let word = word.to_ascii_lowercase(); - settings.flag_words.iter().any(|w| w == &word) -} - /// Compile user-provided ignore regex patterns, dropping invalid entries. /// Patterns are compiled with multiline mode so `^` and `$` match line boundaries. pub(crate) fn build_ignore_regexes(patterns: &[String]) -> Vec { @@ -154,11 +72,6 @@ pub(crate) fn build_ignore_regexes(patterns: &[String]) -> Vec { .collect() } -/// Retrieve the configured minimum word length. -pub(crate) fn min_word_length(settings: &ConfigSettings) -> usize { - settings.min_word_length -} - pub(crate) fn expand_tilde>(path_user_input: P) -> Option { let p = path_user_input.as_ref(); if !p.starts_with("~") { diff --git a/crates/codebook-config/src/lib.rs b/crates/codebook-config/src/lib.rs index 3e70a9c..29ab8f4 100644 --- a/crates/codebook-config/src/lib.rs +++ b/crates/codebook-config/src/lib.rs @@ -2,7 +2,8 @@ mod helpers; mod settings; mod watched_file; use crate::helpers::expand_tilde; -use crate::settings::ConfigSettings; +pub use crate::settings::ConfigSettings; + use crate::watched_file::WatchedFile; use log::debug; use log::info; @@ -32,6 +33,7 @@ pub trait CodebookConfig: Sync + Send + Debug { fn should_flag_word(&self, word: &str) -> bool; fn get_ignore_patterns(&self) -> Option>; fn get_min_word_length(&self) -> usize; + fn should_check_tag(&self, tag: &str) -> bool; fn cache_dir(&self) -> &Path; } @@ -474,51 +476,51 @@ impl CodebookConfigFile { impl CodebookConfig for CodebookConfigFile { /// Add a word to the project configs allowlist fn add_word(&self, word: &str) -> Result { - Ok(self.update_project_settings(|settings| helpers::insert_word(settings, word))) + Ok(self.update_project_settings(|settings| settings.insert_word(word))) } /// Add a word to the global configs allowlist fn add_word_global(&self, word: &str) -> Result { - Ok(self.update_global_settings(|settings| helpers::insert_word(settings, word))) + Ok(self.update_global_settings(|settings| settings.insert_word(word))) } /// Add a file to the ignore list fn add_ignore(&self, file: &str) -> Result { - Ok(self.update_project_settings(|settings| helpers::insert_ignore(settings, file))) + Ok(self.update_project_settings(|settings| settings.insert_ignore(file))) } /// Add a file to the include list fn add_include(&self, file: &str) -> Result { - Ok(self.update_project_settings(|settings| helpers::insert_include(settings, file))) + Ok(self.update_project_settings(|settings| settings.insert_include(file))) } /// Get dictionary IDs from effective configuration fn get_dictionary_ids(&self) -> Vec { let snapshot = self.snapshot(); - helpers::dictionary_ids(&snapshot) + snapshot.dictionary_ids() } /// Check if a path is included based on the effective configuration fn should_include_path(&self, path: &Path) -> bool { let snapshot = self.snapshot(); - helpers::should_include_path(&snapshot, path) + snapshot.should_include_path(path) } /// Check if a path should be ignored based on the effective configuration fn should_ignore_path(&self, path: &Path) -> bool { let snapshot = self.snapshot(); - helpers::should_ignore_path(&snapshot, path) + snapshot.should_ignore_path(path) } /// Check if a word is in the effective allowlist fn is_allowed_word(&self, word: &str) -> bool { let snapshot = self.snapshot(); - helpers::is_allowed_word(&snapshot, word) + snapshot.is_allowed_word(word) } /// Check if a word should be flagged according to effective configuration fn should_flag_word(&self, word: &str) -> bool { let snapshot = self.snapshot(); - helpers::should_flag_word(&snapshot, word) + snapshot.should_flag_word(word) } /// Get the list of user-defined ignore patterns @@ -534,7 +536,11 @@ impl CodebookConfig for CodebookConfigFile { /// Get the minimum word length which should be checked fn get_min_word_length(&self) -> usize { - helpers::min_word_length(&self.snapshot()) + self.snapshot().min_word_length() + } + + fn should_check_tag(&self, tag: &str) -> bool { + self.snapshot().should_check_tag(tag) } fn cache_dir(&self) -> &Path { @@ -576,7 +582,7 @@ impl CodebookConfigMemory { impl CodebookConfig for CodebookConfigMemory { fn add_word(&self, word: &str) -> Result { let mut settings = self.settings.write().unwrap(); - Ok(helpers::insert_word(&mut settings, word)) + Ok(settings.insert_word(word)) } fn add_word_global(&self, word: &str) -> Result { @@ -585,37 +591,37 @@ impl CodebookConfig for CodebookConfigMemory { fn add_ignore(&self, file: &str) -> Result { let mut settings = self.settings.write().unwrap(); - Ok(helpers::insert_ignore(&mut settings, file)) + Ok(settings.insert_ignore(file)) } fn add_include(&self, file: &str) -> Result { let mut settings = self.settings.write().unwrap(); - Ok(helpers::insert_include(&mut settings, file)) + Ok(settings.insert_include(file)) } fn get_dictionary_ids(&self) -> Vec { let snapshot = self.snapshot(); - helpers::dictionary_ids(&snapshot) + snapshot.dictionary_ids() } fn should_include_path(&self, path: &Path) -> bool { let snapshot = self.snapshot(); - helpers::should_include_path(&snapshot, path) + snapshot.should_include_path(path) } fn should_ignore_path(&self, path: &Path) -> bool { let snapshot = self.snapshot(); - helpers::should_ignore_path(&snapshot, path) + snapshot.should_ignore_path(path) } fn is_allowed_word(&self, word: &str) -> bool { let snapshot = self.snapshot(); - helpers::is_allowed_word(&snapshot, word) + snapshot.is_allowed_word(word) } fn should_flag_word(&self, word: &str) -> bool { let snapshot = self.snapshot(); - helpers::should_flag_word(&snapshot, word) + snapshot.should_flag_word(word) } fn get_ignore_patterns(&self) -> Option> { @@ -624,7 +630,11 @@ impl CodebookConfig for CodebookConfigMemory { } fn get_min_word_length(&self) -> usize { - helpers::min_word_length(&self.snapshot()) + self.snapshot().min_word_length() + } + + fn should_check_tag(&self, tag: &str) -> bool { + self.snapshot().should_check_tag(tag) } fn cache_dir(&self) -> &Path { diff --git a/crates/codebook-config/src/settings.rs b/crates/codebook-config/src/settings.rs index 708674d..1a3e89e 100644 --- a/crates/codebook-config/src/settings.rs +++ b/crates/codebook-config/src/settings.rs @@ -1,4 +1,6 @@ +use glob::Pattern; use serde::{Deserialize, Serialize}; +use std::path::Path; #[derive(Debug, Serialize, Clone, PartialEq)] pub struct ConfigSettings { /// List of dictionaries to use for spell checking @@ -38,6 +40,14 @@ pub struct ConfigSettings { skip_serializing_if = "is_default_min_word_length" )] pub min_word_length: usize, + + /// Tag prefixes to include (if non-empty, only matching tags are checked) + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub include_tags: Vec, + + /// Tag prefixes to exclude (takes precedence over include_tags) + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub exclude_tags: Vec, } fn default_use_global() -> bool { @@ -67,6 +77,8 @@ impl Default for ConfigSettings { ignore_patterns: Vec::new(), use_global: true, min_word_length: default_min_word_length(), + include_tags: Vec::new(), + exclude_tags: Vec::new(), } } } @@ -97,6 +109,10 @@ impl<'de> Deserialize<'de> for ConfigSettings { use_global: bool, #[serde(default = "default_min_word_length")] min_word_length: usize, + #[serde(default)] + include_tags: Vec, + #[serde(default)] + exclude_tags: Vec, } let helper = Helper::deserialize(deserializer)?; @@ -109,6 +125,8 @@ impl<'de> Deserialize<'de> for ConfigSettings { ignore_patterns: helper.ignore_patterns, use_global: helper.use_global, min_word_length: helper.min_word_length, + include_tags: helper.include_tags, + exclude_tags: helper.exclude_tags, }) } } @@ -123,6 +141,8 @@ impl ConfigSettings { self.include_paths.extend(other.include_paths); self.ignore_paths.extend(other.ignore_paths); self.ignore_patterns.extend(other.ignore_patterns); + self.include_tags.extend(other.include_tags); + self.exclude_tags.extend(other.exclude_tags); // The use_global setting from the other config is ignored during merging // as this is a per-config setting @@ -145,9 +165,125 @@ impl ConfigSettings { sort_and_dedup(&mut self.include_paths); sort_and_dedup(&mut self.ignore_paths); sort_and_dedup(&mut self.ignore_patterns); + sort_and_dedup(&mut self.include_tags); + sort_and_dedup(&mut self.exclude_tags); + } +} + +/// Check if a tag matches a pattern using prefix matching. +/// "comment" matches "comment", "comment.line", "comment.block", etc. +fn tag_matches_pattern(tag: &str, pattern: &str) -> bool { + tag == pattern || tag.starts_with(pattern) && tag.as_bytes().get(pattern.len()) == Some(&b'.') +} + +impl ConfigSettings { + /// Determine whether a capture tag should be spell-checked based on + /// include_tags and exclude_tags. exclude_tags takes precedence. + pub fn should_check_tag(&self, tag: &str) -> bool { + // exclude_tags takes precedence + if self + .exclude_tags + .iter() + .any(|p| tag_matches_pattern(tag, p)) + { + return false; + } + // if include_tags is set, tag must match at least one + if !self.include_tags.is_empty() { + return self + .include_tags + .iter() + .any(|p| tag_matches_pattern(tag, p)); + } + true + } + + /// Insert a word into the allowlist, returning true when it was newly added. + pub fn insert_word(&mut self, word: &str) -> bool { + let word = word.to_ascii_lowercase(); + if self.words.contains(&word) { + return false; + } + self.words.push(word); + self.words.sort(); + self.words.dedup(); + true + } + + /// Insert a path into the ignore list, returning true when it was newly added. + pub fn insert_ignore(&mut self, file: &str) -> bool { + let file = file.to_string(); + if self.ignore_paths.contains(&file) { + return false; + } + self.ignore_paths.push(file); + self.ignore_paths.sort(); + self.ignore_paths.dedup(); + true + } + + /// Insert a path into the include list, returning true when it was newly added. + pub fn insert_include(&mut self, file: &str) -> bool { + let file = file.to_string(); + if self.include_paths.contains(&file) { + return false; + } + self.include_paths.push(file); + self.include_paths.sort(); + self.include_paths.dedup(); + true + } + + /// Resolve configured dictionary IDs, providing a default when none are set. + pub fn dictionary_ids(&self) -> Vec { + if self.dictionaries.is_empty() { + vec!["en_us".to_string()] + } else { + self.dictionaries.clone() + } + } + + /// Determine whether a path should be included based on the configured glob patterns. + pub fn should_include_path(&self, path: &Path) -> bool { + if self.include_paths.is_empty() { + return true; + } + let path_str = path.to_string_lossy(); + match_pattern(&self.include_paths, &path_str) + } + + /// Determine whether a path should be ignored based on the configured glob patterns. + pub fn should_ignore_path(&self, path: &Path) -> bool { + let path_str = path.to_string_lossy(); + match_pattern(&self.ignore_paths, &path_str) + } + + /// Check if a word is explicitly allowed. + pub fn is_allowed_word(&self, word: &str) -> bool { + let word = word.to_ascii_lowercase(); + self.words.iter().any(|w| w == &word) + } + + /// Check if a word should be flagged. + pub fn should_flag_word(&self, word: &str) -> bool { + let word = word.to_ascii_lowercase(); + self.flag_words.iter().any(|w| w == &word) + } + + /// Retrieve the configured minimum word length. + pub fn min_word_length(&self) -> usize { + self.min_word_length } } +fn match_pattern(patterns: &[String], path_str: &str) -> bool { + patterns.iter().any(|pattern| { + Pattern::new(pattern) + .map(|p| p.matches(path_str)) + .unwrap_or(false) + }) +} + /// Helper function to sort and deduplicate a Vec of strings fn sort_and_dedup(vec: &mut Vec) { vec.sort(); @@ -257,6 +393,7 @@ mod tests { ignore_patterns: vec!["^```.*$".to_string()], use_global: true, min_word_length: 3, + ..Default::default() }; let other = ConfigSettings { @@ -268,6 +405,7 @@ mod tests { ignore_patterns: vec!["^//.*$".to_string()], use_global: false, min_word_length: 2, + ..Default::default() }; base.merge(other); @@ -337,6 +475,7 @@ mod tests { ], use_global: true, min_word_length: 3, + ..Default::default() }; config.sort_and_dedup(); @@ -370,6 +509,138 @@ mod tests { assert_eq!(config, ConfigSettings::default()); } + #[test] + fn test_include_tags_deserialization() { + let toml_str = r#" + include_tags = ["comment", "string"] + "#; + let config: ConfigSettings = toml::from_str(toml_str).unwrap(); + assert_eq!(config.include_tags, vec!["comment", "string"]); + assert!(config.exclude_tags.is_empty()); + } + + #[test] + fn test_exclude_tags_deserialization() { + let toml_str = r#" + exclude_tags = ["identifier.variable", "identifier.parameter"] + "#; + let config: ConfigSettings = toml::from_str(toml_str).unwrap(); + assert!(config.include_tags.is_empty()); + assert_eq!( + config.exclude_tags, + vec!["identifier.variable", "identifier.parameter"] + ); + } + + #[test] + fn test_tags_default_empty() { + let config = ConfigSettings::default(); + assert!(config.include_tags.is_empty()); + assert!(config.exclude_tags.is_empty()); + } + + #[test] + fn test_tags_serialization_omitted_when_empty() { + let config = ConfigSettings::default(); + let serialized = toml::to_string(&config).unwrap(); + assert!(!serialized.contains("include_tags")); + assert!(!serialized.contains("exclude_tags")); + } + + #[test] + fn test_tags_serialization_present_when_set() { + let config = ConfigSettings { + include_tags: vec!["comment".to_string()], + ..Default::default() + }; + let serialized = toml::to_string(&config).unwrap(); + assert!(serialized.contains("include_tags")); + } + + #[test] + fn test_tags_merge() { + let mut base = ConfigSettings { + include_tags: vec!["comment".to_string()], + exclude_tags: vec!["identifier.type".to_string()], + ..Default::default() + }; + let other = ConfigSettings { + include_tags: vec!["string".to_string(), "comment".to_string()], + exclude_tags: vec!["identifier.module".to_string()], + ..Default::default() + }; + base.merge(other); + assert_eq!(base.include_tags, vec!["comment", "string"]); + assert_eq!( + base.exclude_tags, + vec!["identifier.module", "identifier.type"] + ); + } + + #[test] + fn test_should_check_tag_no_filters() { + let config = ConfigSettings::default(); + assert!(config.should_check_tag("comment")); + assert!(config.should_check_tag("string")); + assert!(config.should_check_tag("identifier.function")); + } + + #[test] + fn test_should_check_tag_include_only() { + let config = ConfigSettings { + include_tags: vec!["comment".to_string(), "string".to_string()], + ..Default::default() + }; + assert!(config.should_check_tag("comment")); + assert!(config.should_check_tag("comment.line")); + assert!(config.should_check_tag("comment.block")); + assert!(config.should_check_tag("string")); + assert!(config.should_check_tag("string.special")); + assert!(!config.should_check_tag("identifier")); + assert!(!config.should_check_tag("identifier.function")); + } + + #[test] + fn test_should_check_tag_exclude_only() { + let config = ConfigSettings { + exclude_tags: vec!["identifier.variable".to_string()], + ..Default::default() + }; + assert!(config.should_check_tag("comment")); + assert!(config.should_check_tag("identifier.function")); + assert!(!config.should_check_tag("identifier.variable")); + } + + #[test] + fn test_should_check_tag_both_include_and_exclude() { + // include comments and strings, but exclude string.heredoc + let config = ConfigSettings { + include_tags: vec!["comment".to_string(), "string".to_string()], + exclude_tags: vec!["string.heredoc".to_string()], + ..Default::default() + }; + assert!(config.should_check_tag("comment")); + assert!(config.should_check_tag("comment.line")); + assert!(config.should_check_tag("string")); + assert!(config.should_check_tag("string.special")); + assert!(!config.should_check_tag("string.heredoc")); + assert!(!config.should_check_tag("identifier.function")); + } + + #[test] + fn test_should_check_tag_exclude_prefix() { + // Excluding "identifier" should exclude all identifier sub-tags + let config = ConfigSettings { + exclude_tags: vec!["identifier".to_string()], + ..Default::default() + }; + assert!(config.should_check_tag("comment")); + assert!(config.should_check_tag("string")); + assert!(!config.should_check_tag("identifier")); + assert!(!config.should_check_tag("identifier.function")); + assert!(!config.should_check_tag("identifier.type")); + } + #[test] fn test_partial_deserialization() { let toml_str = r#" diff --git a/crates/codebook/src/dictionaries/dictionary.rs b/crates/codebook/src/dictionaries/dictionary.rs index 0feccba..82e75d7 100644 --- a/crates/codebook/src/dictionaries/dictionary.rs +++ b/crates/codebook/src/dictionaries/dictionary.rs @@ -178,7 +178,7 @@ pub fn find_locations_with_dictionary_batch( skip_patterns: &[Regex], ) -> Vec { // For non-HashSet dictionaries, we still get deduplication benefits - find_locations(text, language, |word| dictionary.check(word), skip_patterns) + find_locations(text, language, |word| dictionary.check(word), |_| true, skip_patterns) } #[cfg(test)] diff --git a/crates/codebook/src/lib.rs b/crates/codebook/src/lib.rs index d04e1f8..5cc6840 100644 --- a/crates/codebook/src/lib.rs +++ b/crates/codebook/src/lib.rs @@ -77,6 +77,7 @@ impl Codebook { } false }, + |tag| self.config.should_check_tag(tag), &all_patterns, ) } diff --git a/crates/codebook/src/parser.rs b/crates/codebook/src/parser.rs index fff3649..894c3b1 100644 --- a/crates/codebook/src/parser.rs +++ b/crates/codebook/src/parser.rs @@ -177,6 +177,7 @@ pub fn find_locations( text: &str, language: LanguageType, check_function: impl Fn(&str) -> bool, + tag_filter: impl Fn(&str) -> bool, skip_patterns: &[Regex], ) -> Vec { match language { @@ -184,7 +185,13 @@ pub fn find_locations( let processor = TextProcessor::new(text, skip_patterns); processor.process_words_with_check(|word| check_function(word)) } - _ => find_locations_code(text, language, |word| check_function(word), skip_patterns), + _ => find_locations_code( + text, + language, + |word| check_function(word), + &tag_filter, + skip_patterns, + ), } } @@ -192,6 +199,7 @@ fn find_locations_code( text: &str, language: LanguageType, check_function: impl Fn(&str) -> bool, + tag_filter: &dyn Fn(&str) -> bool, skip_patterns: &[Regex], ) -> Vec { let language_setting = @@ -213,6 +221,7 @@ fn find_locations_code( let root_node = tree.root_node(); let lang = language_setting.language().unwrap(); let query = Query::new(&lang, language_setting.query).unwrap(); + let capture_names = query.capture_names(); let mut cursor = QueryCursor::new(); let mut word_locations: HashMap> = HashMap::new(); let provider = text.as_bytes(); @@ -223,6 +232,12 @@ fn find_locations_code( while let Some(match_) = matches_query.next() { for capture in match_.captures { + // Filter by tag + let tag = &capture_names[capture.index as usize]; + if !tag_filter(tag) { + continue; + } + let node = capture.node; let node_start_byte = node.start_byte(); @@ -303,7 +318,7 @@ mod parser_tests { #[test] fn test_spell_checking() { let text = "HelloWorld calc_wrld"; - let results = find_locations(text, LanguageType::Text, |_| false, &[]); + let results = find_locations(text, LanguageType::Text, |_| false, |_| true, &[]); println!("{results:?}"); assert_eq!(results.len(), 4); } @@ -408,7 +423,7 @@ mod parser_tests { fn test_duplicate_word_locations() { // Use a code language to exercise find_locations_code path let text = "// wrld foo wrld"; - let results = find_locations(text, LanguageType::Rust, |_| false, &[]); + let results = find_locations(text, LanguageType::Rust, |_| false, |_| true, &[]); let wrld = results.iter().find(|loc| loc.word == "wrld").unwrap(); assert_eq!( wrld.locations.len(), diff --git a/crates/codebook/src/queries.rs b/crates/codebook/src/queries.rs index 2fdf447..9501cf6 100644 --- a/crates/codebook/src/queries.rs +++ b/crates/codebook/src/queries.rs @@ -361,4 +361,54 @@ mod tests { ); } } + + /// Allowed full capture names. Any capture in a .scm file must be one of these. + /// The special "language" tag is used internally (e.g., ruby heredocs) and is + /// not exposed for user filtering. + const ALLOWED_TAGS: &[&str] = &[ + "comment", + "comment.line", + "comment.block", + "string", + "string.special", + "string.heredoc", + "identifier", + "identifier.function", + "identifier.type", + "identifier.parameter", + "identifier.field", + "identifier.variable", + "identifier.constant", + "identifier.module", + "language", + ]; + + #[test] + fn test_all_capture_names_use_allowed_tags() { + for language_setting in LANGUAGE_SETTINGS { + if language_setting.type_ == LanguageType::Text { + continue; + } + + let language = language_setting + .language() + .unwrap_or_else(|| panic!("Failed to get language for {:?}", language_setting.type_)); + + let query = Query::new(&language, language_setting.query).unwrap_or_else(|e| { + panic!( + "Invalid query for language {:?}: {:?}", + language_setting.type_, e + ) + }); + + for name in query.capture_names() { + assert!( + ALLOWED_TAGS.contains(&name.as_ref()), + "Language {:?} uses unknown capture tag @{name}. \ + Allowed tags: {ALLOWED_TAGS:?}", + language_setting.type_, + ); + } + } + } } diff --git a/crates/codebook/src/queries/README.md b/crates/codebook/src/queries/README.md new file mode 100644 index 0000000..c08fb47 --- /dev/null +++ b/crates/codebook/src/queries/README.md @@ -0,0 +1,87 @@ +# Tree-sitter Queries + +This directory contains Tree-sitter query files (`.scm`) that define which parts of each language's source code are extracted for spell checking. + +## Tag Convention + +Every capture name is a **tag** that categorizes the matched text. Tags use a dot-separated hierarchy so users can filter what gets checked via `include_tags`/`exclude_tags` in `codebook.toml`. Matching is prefix-based: `"comment"` matches `comment`, `comment.line`, `comment.block`, etc. + +### Available Tags + +| Capture name | When to use | +| --- | --- | +| `@comment` | Generic comments (when line/block aren't distinguished) | +| `@comment.line` | Line comments (`//`, `#`, `--`, etc.) | +| `@comment.block` | Block comments (`/* */`, `{- -}`, etc.) | +| `@string` | String literals and string content | +| `@string.special` | Atoms, symbols, struct tags, and other non-standard strings | +| `@string.heredoc` | Heredoc bodies | +| `@identifier` | Fallback for ambiguous identifiers that don't fit below | +| `@identifier.function` | Function and method name definitions | +| `@identifier.type` | Type, class, struct, interface, enum name definitions | +| `@identifier.parameter` | Function parameter names | +| `@identifier.field` | Struct field and object property names | +| `@identifier.variable` | Variable declaration names | +| `@identifier.constant` | Constant and enum member names | +| `@identifier.module` | Package, module, and namespace names | + +Not every language needs every tag. HTML, for example, only uses `@comment` and `@string`. + +## Adding a New Language + +### 1. Create the Query File + +Create a new `.scm` file in this directory named after your language (e.g., `java.scm`). + +Use namespaced capture names from the table above. Example: + +```scheme +(comment) @comment +(string_content) @string +(function_declaration name: (identifier) @identifier.function) +(parameter name: (identifier) @identifier.parameter) +(variable_declaration (identifier) @identifier.variable) +(class_declaration name: (identifier) @identifier.type) +``` + +### 2. Understand the Language's AST + +Use these tools to explore the grammar's node types: + +- [Tree-sitter Playground](https://tree-sitter.github.io/tree-sitter/7-playground.html) +- [Tree-sitter Visualizer](https://blopker.github.io/ts-visualizer/) + +A good approach: + +1. Write sample code with identifiers, strings, and comments +2. Paste it into the playground/visualizer +3. Observe the node types used for each element +4. Create capture patterns that target only definition nodes, not usages + +### 3. Update the Language Settings + +Add your language to `queries.rs`: + +1. Add a new variant to the `LanguageType` enum +2. Add a new entry to the `LANGUAGE_SETTINGS` array with the language type, file extensions, language identifiers, and path to your query file + +### 4. Add the Tree-sitter Grammar + +Add the grammar as a dependency in `Cargo.toml` and update the `language()` function in `queries.rs` to return the correct parser. + +### 5. Test + +```bash +cargo test -p codebook queries::tests::test_all_queries_are_valid +``` + +Additional language tests go in `crates/codebook/tests/`. Example files with at least one spelling error go in `examples/`. + +## Tips + +- Focus on capturing **definitions**, not usages +- Only capture nodes that contain user-defined text (not keywords) +- Always use namespaced capture names (`@identifier.function`, not `@func_declaration`) +- Use the most specific tag that fits (e.g., `@identifier.type` over `@identifier`) +- Start simple and add complexity as needed +- Look at existing query files for patterns diff --git a/crates/codebook/src/queries/bash.scm b/crates/codebook/src/queries/bash.scm index bb810e2..513c433 100644 --- a/crates/codebook/src/queries/bash.scm +++ b/crates/codebook/src/queries/bash.scm @@ -1,7 +1,7 @@ (comment) @comment (string_content) @string (function_definition - name: (word) @identifier) -(heredoc_body) @string + name: (word) @identifier.function) +(heredoc_body) @string.heredoc (variable_assignment - name: (variable_name) @identifier) + name: (variable_name) @identifier.variable) diff --git a/crates/codebook/src/queries/c.scm b/crates/codebook/src/queries/c.scm index c56077e..ff5b2ee 100644 --- a/crates/codebook/src/queries/c.scm +++ b/crates/codebook/src/queries/c.scm @@ -1,29 +1,29 @@ (comment) @comment (preproc_def - name: (identifier) @identifier) + name: (identifier) @identifier.constant) (type_definition - declarator: (type_identifier) @identifier) + declarator: (type_identifier) @identifier.type) (struct_specifier - name: (type_identifier) @identifier) + name: (type_identifier) @identifier.type) (field_declaration - declarator: (field_identifier) @identifier) + declarator: (field_identifier) @identifier.field) (pointer_declarator - declarator: (field_identifier) @identifier) + declarator: (field_identifier) @identifier.field) (enum_specifier - name: (type_identifier) @identifier) + name: (type_identifier) @identifier.type) (enumerator - name: (identifier) @identifier) + name: (identifier) @identifier.constant) (init_declarator - declarator: (identifier) @identifier) + declarator: (identifier) @identifier.variable) (pointer_declarator - declarator: (identifier) @identifier) + declarator: (identifier) @identifier.variable) (init_declarator (string_literal - (string_content) @string_content)) + (string_content) @string)) (function_declarator - declarator: (identifier) @identifier) + declarator: (identifier) @identifier.function) (parameter_declaration - declarator: (identifier) @identifier) + declarator: (identifier) @identifier.parameter) (call_expression (argument_list (string_literal diff --git a/crates/codebook/src/queries/cpp.scm b/crates/codebook/src/queries/cpp.scm index 1668098..c06c459 100644 --- a/crates/codebook/src/queries/cpp.scm +++ b/crates/codebook/src/queries/cpp.scm @@ -1,44 +1,44 @@ (comment) @comment (preproc_def - name: (identifier) @identifier) + name: (identifier) @identifier.constant) (type_definition - declarator: (type_identifier) @identifier) + declarator: (type_identifier) @identifier.type) (struct_specifier - name: (type_identifier) @identifier) + name: (type_identifier) @identifier.type) (class_specifier - name: (type_identifier) @identifier) + name: (type_identifier) @identifier.type) (field_declaration - declarator: (field_identifier) @identifier) + declarator: (field_identifier) @identifier.field) (pointer_declarator - declarator: (field_identifier) @identifier) + declarator: (field_identifier) @identifier.field) (enum_specifier - name: (type_identifier) @identifier) + name: (type_identifier) @identifier.type) (enumerator - name: (identifier) @identifier) + name: (identifier) @identifier.constant) (init_declarator - declarator: (identifier) @identifier) + declarator: (identifier) @identifier.variable) (pointer_declarator - declarator: (identifier) @identifier) + declarator: (identifier) @identifier.variable) (init_declarator (string_literal - (string_content) @string_content)) + (string_content) @string)) (function_declarator - declarator: (identifier) @identifier) + declarator: (identifier) @identifier.function) (parameter_declaration - declarator: (identifier) @identifier) + declarator: (identifier) @identifier.parameter) (call_expression (argument_list diff --git a/crates/codebook/src/queries/csharp.scm b/crates/codebook/src/queries/csharp.scm index 06a9ae4..b9535ff 100644 --- a/crates/codebook/src/queries/csharp.scm +++ b/crates/codebook/src/queries/csharp.scm @@ -3,55 +3,55 @@ ;; Methods / functions (method_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.function) (local_function_statement - name: (identifier) @identifier) + name: (identifier) @identifier.function) (constructor_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.function) (destructor_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.function) ;; Parameters (parameter - name: (identifier) @identifier) + name: (identifier) @identifier.parameter) ; Variable/Field definitions ; local variables (local_declaration_statement (variable_declaration (variable_declarator - (identifier) @identifier))) + (identifier) @identifier.variable))) ; fields in classes/structs (field_declaration (variable_declaration (variable_declarator - (identifier) @identifier))) + (identifier) @identifier.field))) ; Struct/Type definitions (interface_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.type) (class_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.type) (enum_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.type) (struct_declaration - (identifier) @identifier) + (identifier) @identifier.type) (record_declaration - (identifier) @identifier) + (identifier) @identifier.type) (namespace_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.module) (enum_member_declaration - (identifier) @identifier) + (identifier) @identifier.constant) ; String literals (interpolated_string_expression diff --git a/crates/codebook/src/queries/css.scm b/crates/codebook/src/queries/css.scm index f1e9f7a..0b58b77 100644 --- a/crates/codebook/src/queries/css.scm +++ b/crates/codebook/src/queries/css.scm @@ -1,6 +1,6 @@ -(class_name) @identifier -(id_name) @identifier -(property_name) @identifier +(class_name) @identifier.type +(id_name) @identifier.type +(property_name) @identifier.field (comment) @comment (string_value) @string (plain_value) @identifier diff --git a/crates/codebook/src/queries/elixir.scm b/crates/codebook/src/queries/elixir.scm index 4a38009..747dada 100644 --- a/crates/codebook/src/queries/elixir.scm +++ b/crates/codebook/src/queries/elixir.scm @@ -8,17 +8,17 @@ (quoted_atom) (keyword) (quoted_keyword) -] @string.special.symbol +] @string.special (comment) @comment -(alias) @identifier +(alias) @identifier.type (call (arguments (identifier) @identifier)) (call - (identifier) @identifier) + (identifier) @identifier.function) (binary_operator - left: (identifier) @identifier) + left: (identifier) @identifier.variable) diff --git a/crates/codebook/src/queries/erlang.scm b/crates/codebook/src/queries/erlang.scm index 231ebc2..93fd05f 100644 --- a/crates/codebook/src/queries/erlang.scm +++ b/crates/codebook/src/queries/erlang.scm @@ -1,8 +1,8 @@ (comment) @comment (string) @string -(atom) @string.special.symbol +(atom) @string.special -(var) @identifier +(var) @identifier.variable (function_clause - name: (atom) @identifier) + name: (atom) @identifier.function) diff --git a/crates/codebook/src/queries/go.scm b/crates/codebook/src/queries/go.scm index 34d5ca9..bd507a8 100644 --- a/crates/codebook/src/queries/go.scm +++ b/crates/codebook/src/queries/go.scm @@ -1,18 +1,18 @@ (comment) @comment -(function_declaration (identifier) @func_declaration) -(var_spec (identifier) @var_spec) +(function_declaration (identifier) @identifier.function) +(var_spec (identifier) @identifier.variable) (short_var_declaration - (expression_list (identifier) @short_var)) + (expression_list (identifier) @identifier.variable)) (parameter_declaration - name: (identifier) @parameter_name) -(field_identifier) @field -(type_identifier) @type_name + name: (identifier) @identifier.parameter) +(field_identifier) @identifier.field +(type_identifier) @identifier.type (import_spec - name: (package_identifier) @import_alias) -(package_clause (package_identifier) @package_name) -(label_name) @label -(field_declaration tag: (raw_string_literal) @struct_tag) -(const_spec name: (identifier) @const_name) -(range_clause left: (expression_list (identifier) @range_var)) -(interpreted_string_literal) @string_literal -(array_type (interpreted_string_literal) @array_string) + name: (package_identifier) @identifier.module) +(package_clause (package_identifier) @identifier.module) +(label_name) @identifier +(field_declaration tag: (raw_string_literal) @string.special) +(const_spec name: (identifier) @identifier.constant) +(range_clause left: (expression_list (identifier) @identifier.variable)) +(interpreted_string_literal) @string +(array_type (interpreted_string_literal) @string) diff --git a/crates/codebook/src/queries/haskell.scm b/crates/codebook/src/queries/haskell.scm index 60200e5..93f99d0 100644 --- a/crates/codebook/src/queries/haskell.scm +++ b/crates/codebook/src/queries/haskell.scm @@ -1,5 +1,5 @@ (comment) @comment (string) @string -(variable) @identifier -(module) @identifier +(variable) @identifier.variable +(module) @identifier.module (name) @identifier diff --git a/crates/codebook/src/queries/java.scm b/crates/codebook/src/queries/java.scm index 3b470dc..9824e7b 100644 --- a/crates/codebook/src/queries/java.scm +++ b/crates/codebook/src/queries/java.scm @@ -7,18 +7,18 @@ (string_literal) ] @string (variable_declarator - name: (identifier) @identifier) + name: (identifier) @identifier.variable) (interface_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.type) (class_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.type) (method_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.function) (enum_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.type) (enum_constant - name: (identifier) @identifier) + name: (identifier) @identifier.constant) (formal_parameter - name: (identifier) @identifier) + name: (identifier) @identifier.parameter) (catch_formal_parameter - name: (identifier) @identifier) + name: (identifier) @identifier.parameter) diff --git a/crates/codebook/src/queries/javascript.scm b/crates/codebook/src/queries/javascript.scm index f48e91d..b6d7ab5 100644 --- a/crates/codebook/src/queries/javascript.scm +++ b/crates/codebook/src/queries/javascript.scm @@ -1,20 +1,20 @@ (comment) @comment (string_fragment) @string (variable_declarator - name: (identifier) @identifier) + name: (identifier) @identifier.variable) (object (pair - key: (property_identifier) @property_name)) + key: (property_identifier) @identifier.field)) (catch_clause - parameter: (identifier) @identifier) + parameter: (identifier) @identifier.parameter) (jsx_text) @string -(shorthand_property_identifier) @identifier +(shorthand_property_identifier) @identifier.field (function_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.function) (function_declaration parameters: (formal_parameters - (identifier) @identifier)) + (identifier) @identifier.parameter)) (method_definition - name: (property_identifier) @identifier) + name: (property_identifier) @identifier.function) (class_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.type) diff --git a/crates/codebook/src/queries/lua.scm b/crates/codebook/src/queries/lua.scm index 7c9febf..d97ece5 100644 --- a/crates/codebook/src/queries/lua.scm +++ b/crates/codebook/src/queries/lua.scm @@ -6,26 +6,26 @@ ; Function declarations (function_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.function) (function_declaration (method_index_expression - method: (identifier) @identifier)) + method: (identifier) @identifier.function)) ; Variable assignments (assignment_statement (variable_list - (identifier) @identifier)) + (identifier) @identifier.variable)) (assignment_statement (variable_list (dot_index_expression - field: (identifier) @identifier))) + field: (identifier) @identifier.field))) ; Function parameters (parameters - (identifier) @identifier) + (identifier) @identifier.parameter) ; Table fields (field - name: (identifier) @identifier) + name: (identifier) @identifier.field) diff --git a/crates/codebook/src/queries/odin.scm b/crates/codebook/src/queries/odin.scm index 894c679..0ce30b9 100644 --- a/crates/codebook/src/queries/odin.scm +++ b/crates/codebook/src/queries/odin.scm @@ -1,44 +1,44 @@ ; Comments -(comment) @comment -(block_comment) @comment +(comment) @comment.line +(block_comment) @comment.block ; Procedure declarations (including parameter names) (procedure_declaration - (expression) @identifier) + (expression) @identifier.function) (overloaded_procedure_declaration - (expression) @identifier) + (expression) @identifier.function) (parameter - (identifier) @identifier) + (identifier) @identifier.parameter) (default_parameter - (identifier) @identifier) + (identifier) @identifier.parameter) ; Variables and constants identifiers (declaration-only) (var_declaration - (expression) @identifier ":") + (expression) @identifier.variable ":") (assignment_statement - (expression) @identifier ":=") + (expression) @identifier.variable ":=") (const_declaration - (expression)+ @identifier) + (expression)+ @identifier.constant) (const_type_declaration - (expression)+ @identifier) + (expression)+ @identifier.constant) ; Struct, enum, union, bit_fields names (struct_declaration - (expression) @identifier) + (expression) @identifier.type) (enum_declaration - (expression) @identifier) + (expression) @identifier.type) (union_declaration - (expression) @identifier) + (expression) @identifier.type) (bit_field_declaration - (expression) @identifier "::") + (expression) @identifier.type "::") ; Fields and enum variant names (field - (identifier) @identifier) + (identifier) @identifier.field) (bit_field_member - name: (identifier) @identifier) + name: (identifier) @identifier.field) (enum_member - name: (identifier) @identifier) + name: (identifier) @identifier.constant) ; Strings (string_content) @string diff --git a/crates/codebook/src/queries/php.scm b/crates/codebook/src/queries/php.scm index 6b9afa5..0fb8616 100644 --- a/crates/codebook/src/queries/php.scm +++ b/crates/codebook/src/queries/php.scm @@ -6,20 +6,20 @@ ; Names (covers function names, class names, etc.) (class_declaration - name: (name) @identifier) + name: (name) @identifier.type) (const_declaration - (const_element (name) @identifier)) + (const_element (name) @identifier.constant)) (namespace_definition - (namespace_name (name) @identifier)) + (namespace_name (name) @identifier.module)) (property_element - (variable_name (name) @identifier)) + (variable_name (name) @identifier.field)) (method_declaration - name: (name) @identifier) + name: (name) @identifier.function) (assignment_expression - left: (variable_name (name) @identifier)) + left: (variable_name (name) @identifier.variable)) (function_definition - name: (name) @identifier) + name: (name) @identifier.function) (simple_parameter - (variable_name (name) @identifier)) + (variable_name (name) @identifier.parameter)) (catch_clause - (variable_name (name) @identifier)) + (variable_name (name) @identifier.parameter)) diff --git a/crates/codebook/src/queries/python.scm b/crates/codebook/src/queries/python.scm index c03d1e4..6eb3286 100644 --- a/crates/codebook/src/queries/python.scm +++ b/crates/codebook/src/queries/python.scm @@ -3,35 +3,35 @@ (string_content) @string (function_definition - name: (identifier) @identifier) + name: (identifier) @identifier.function) (class_definition - name: (identifier) @identifier) + name: (identifier) @identifier.type) (assignment - left: (identifier) @identifier) + left: (identifier) @identifier.variable) (import_statement name: (aliased_import - alias: (identifier) @identifier)) + alias: (identifier) @identifier.module)) (import_from_statement name: (aliased_import - alias: (identifier) @identifier)) + alias: (identifier) @identifier.module)) (parameters - (identifier) @identifier) + (identifier) @identifier.parameter) ; Matches typed parameters (e.g., "name: str") ; The identifier for the name is a *direct child* of typed_parameter, ; while the type identifier is nested inside a (type) node. (typed_parameter - (identifier) @identifier) + (identifier) @identifier.parameter) ; Matches parameters with default values (e.g., "limit=10") (default_parameter - (identifier) @identifier) + (identifier) @identifier.parameter) ; Matches typed parameters with default values (e.g., "limit: int = 10") (typed_default_parameter - (identifier) @identifier) + (identifier) @identifier.parameter) diff --git a/crates/codebook/src/queries/r.scm b/crates/codebook/src/queries/r.scm index 7ac0853..96252ea 100644 --- a/crates/codebook/src/queries/r.scm +++ b/crates/codebook/src/queries/r.scm @@ -1,14 +1,14 @@ (comment) @comment (string) @string -(parameter name: (identifier) @identifier) +(parameter name: (identifier) @identifier.parameter) (binary_operator - lhs: (identifier) @identifier + lhs: (identifier) @identifier.variable operator: ["<-" "="]) (binary_operator operator: "->" - rhs: (identifier) @identifier) + rhs: (identifier) @identifier.variable) ;--------------------------------------- ; Less clear-cut spell checking targets: @@ -17,7 +17,7 @@ ; Functions with ... args sometimes use the argument names similarly to ; new variable definitions which should be spell-checked. ; e.g. dplyr::mutate(data_table, new_column_name=col_a + col_b) should check `new_column_name` -(argument name: (identifier) @identifier) +(argument name: (identifier) @identifier.parameter) ; Assignments with `$` can similarly define new names ; For chains, only check the last name since the earlier names are not being newly defined @@ -25,11 +25,11 @@ (binary_operator lhs: (extract_operator operator: "$" - rhs: (identifier) @identifier) + rhs: (identifier) @identifier.field) operator: ["<-" "="]) (binary_operator operator: "->" rhs: (extract_operator operator: "$" - rhs: (identifier) @identifier)) + rhs: (identifier) @identifier.field)) diff --git a/crates/codebook/src/queries/ruby.scm b/crates/codebook/src/queries/ruby.scm index 676a579..8943f89 100644 --- a/crates/codebook/src/queries/ruby.scm +++ b/crates/codebook/src/queries/ruby.scm @@ -1,12 +1,12 @@ (string) @string (comment) @comment -(assignment (identifier) @identifier) +(assignment (identifier) @identifier.variable) (method - (method_parameters (keyword_parameter (identifier) @identifier))) + (method_parameters (keyword_parameter (identifier) @identifier.parameter))) (method - (method_parameters (identifier) @identifier)) -(method name: (identifier) @identifier) + (method_parameters (identifier) @identifier.parameter)) +(method name: (identifier) @identifier.function) (heredoc_body - (heredoc_content) @string + (heredoc_content) @string.heredoc (heredoc_end) @language (#downcase! @language)) diff --git a/crates/codebook/src/queries/rust.scm b/crates/codebook/src/queries/rust.scm index 38126fd..aca5e37 100644 --- a/crates/codebook/src/queries/rust.scm +++ b/crates/codebook/src/queries/rust.scm @@ -4,26 +4,26 @@ !trait body: (declaration_list (function_item - name: (identifier) @identifier))) + name: (identifier) @identifier.function))) ; Functions in trait definitions (trait_item body: (declaration_list (function_item - name: (identifier) @identifier))) + name: (identifier) @identifier.function))) ; Top-level functions -(source_file (function_item name: (identifier) @identifier)) +(source_file (function_item name: (identifier) @identifier.function)) ; Functions in modules -(mod_item body: (declaration_list (function_item name: (identifier) @identifier))) +(mod_item body: (declaration_list (function_item name: (identifier) @identifier.function))) ; Nested functions (inside blocks) -(block (function_item name: (identifier) @identifier)) +(block (function_item name: (identifier) @identifier.function)) (parameter - pattern: (identifier) @identifier) + pattern: (identifier) @identifier.parameter) (let_declaration - pattern: (identifier) @identifier) + pattern: (identifier) @identifier.variable) (struct_item - name: (type_identifier) @identifier) + name: (type_identifier) @identifier.type) (field_declaration - name: (field_identifier) @identifier) -(block_comment) @comment -(line_comment) @comment + name: (field_identifier) @identifier.field) +(block_comment) @comment.block +(line_comment) @comment.line (string_content) @string diff --git a/crates/codebook/src/queries/swift.scm b/crates/codebook/src/queries/swift.scm index 41fd386..9258a79 100644 --- a/crates/codebook/src/queries/swift.scm +++ b/crates/codebook/src/queries/swift.scm @@ -1,20 +1,20 @@ -(comment) @comment -(multiline_comment) @comment +(comment) @comment.line +(multiline_comment) @comment.block (class_declaration - name: (type_identifier) @identifier) + name: (type_identifier) @identifier.type) (function_declaration - name: (simple_identifier) @identifier) + name: (simple_identifier) @identifier.function) (protocol_declaration - name: (type_identifier) @identifier) + name: (type_identifier) @identifier.type) (property_declaration - name: (pattern) @identifier) + name: (pattern) @identifier.field) (parameter - name: (simple_identifier) @identifier) + name: (simple_identifier) @identifier.parameter) (line_string_literal) @string (multi_line_string_literal) @string diff --git a/crates/codebook/src/queries/typescript.scm b/crates/codebook/src/queries/typescript.scm index 815acce..1274358 100644 --- a/crates/codebook/src/queries/typescript.scm +++ b/crates/codebook/src/queries/typescript.scm @@ -1,30 +1,30 @@ (comment) @comment (string_fragment) @string (variable_declarator - name: (identifier) @identifier) + name: (identifier) @identifier.variable) (object (pair - key: (property_identifier) @property_name)) + key: (property_identifier) @identifier.field)) (interface_declaration - name: (type_identifier) @identifier) + name: (type_identifier) @identifier.type) (interface_body (property_signature - name: (property_identifier) @property_name)) + name: (property_identifier) @identifier.field)) (catch_clause - parameter: (identifier) @identifier) + parameter: (identifier) @identifier.parameter) (jsx_text) @string -(shorthand_property_identifier) @identifier +(shorthand_property_identifier) @identifier.field (function_declaration - name: (identifier) @identifier) + name: (identifier) @identifier.function) (formal_parameters (required_parameter - pattern: (identifier) @identifier)) + pattern: (identifier) @identifier.parameter)) (formal_parameters (optional_parameter - pattern: (identifier) @identifier)) + pattern: (identifier) @identifier.parameter)) (method_definition - name: (property_identifier) @identifier) + name: (property_identifier) @identifier.function) (class_declaration - name: (type_identifier) @identifier) + name: (type_identifier) @identifier.type) (public_field_definition - name: (property_identifier) @identifier) + name: (property_identifier) @identifier.field) diff --git a/crates/codebook/src/queries/typst.scm b/crates/codebook/src/queries/typst.scm index 4f34d06..623b6c0 100644 --- a/crates/codebook/src/queries/typst.scm +++ b/crates/codebook/src/queries/typst.scm @@ -17,28 +17,28 @@ (label) @identifier ; Import rename -(as (ident) @identifier) +(as (ident) @identifier.variable) ; Collect remainder of array in a new variable -(elude (ident) @identifier) +(elude (ident) @identifier.variable) -(let pattern: (ident) @identifier) +(let pattern: (ident) @identifier.variable) ; Destructuring assignment -(let pattern: (group (ident) @identifier)) +(let pattern: (group (ident) @identifier.variable)) ; Destructuring dict into a new variable -(let pattern: (group (tagged field: (ident) (ident) @identifier))) +(let pattern: (group (tagged field: (ident) (ident) @identifier.variable))) ; Dictionary type -(let pattern: (ident) value: (group (tagged field: (ident) @identifier))) +(let pattern: (ident) value: (group (tagged field: (ident) @identifier.field))) -(for pattern: (ident) @identifier) -(for pattern: (group (ident) @identifier)) +(for pattern: (ident) @identifier.variable) +(for pattern: (group (ident) @identifier.variable)) ; Function -(let pattern: (call item: (ident) @identifier)) -(let pattern: (call item: (ident) (group (ident) @identifier))) -(let pattern: (call item: (ident) (group (tagged field: (ident) @identifier)))) +(let pattern: (call item: (ident) @identifier.function)) +(let pattern: (call item: (ident) (group (ident) @identifier.parameter))) +(let pattern: (call item: (ident) (group (tagged field: (ident) @identifier.parameter)))) ; Anonymous function -(lambda pattern: (ident) @identifier) -(lambda pattern: (group (ident) @identifier)) -(lambda pattern: (group (group (ident) @identifier))) +(lambda pattern: (ident) @identifier.parameter) +(lambda pattern: (group (ident) @identifier.parameter)) +(lambda pattern: (group (group (ident) @identifier.parameter))) diff --git a/crates/codebook/src/queries/vhdl.scm b/crates/codebook/src/queries/vhdl.scm index 71fbf96..9385b10 100644 --- a/crates/codebook/src/queries/vhdl.scm +++ b/crates/codebook/src/queries/vhdl.scm @@ -1,72 +1,72 @@ ; Comments - capture comment content for spell checking (line_comment - (comment_content) @comment) + (comment_content) @comment.line) (block_comment - (comment_content) @comment) + (comment_content) @comment.block) ; String literals (string_literal) @string ; Entity declarations (entity_declaration - entity: (identifier) @identifier) + entity: (identifier) @identifier.type) ; Architecture definitions (architecture_definition - architecture: (identifier) @identifier) + architecture: (identifier) @identifier.type) ; Signal declarations (signal_declaration (identifier_list - (identifier) @identifier)) + (identifier) @identifier.variable)) ; Variable declarations (variable_declaration (identifier_list - (identifier) @identifier)) + (identifier) @identifier.variable)) ; Constant declarations (constant_declaration (identifier_list - (identifier) @identifier)) + (identifier) @identifier.constant)) ; Function specifications (function_specification - function: (identifier) @identifier) + function: (identifier) @identifier.function) ; Procedure specifications (procedure_specification - procedure: (identifier) @identifier) + procedure: (identifier) @identifier.function) ; Component declarations (component_declaration - component: (identifier) @identifier) + component: (identifier) @identifier.type) ; Type declarations (type_declaration - type: (identifier) @identifier) + type: (identifier) @identifier.type) ; Subtype declarations (subtype_declaration - type: (identifier) @identifier) + type: (identifier) @identifier.type) ; Port/generic interface declarations (interface_declaration (identifier_list - (identifier) @identifier)) + (identifier) @identifier.parameter)) (interface_signal_declaration (identifier_list - (identifier) @identifier)) + (identifier) @identifier.parameter)) (interface_variable_declaration (identifier_list - (identifier) @identifier)) + (identifier) @identifier.parameter)) (interface_constant_declaration (identifier_list - (identifier) @identifier)) + (identifier) @identifier.parameter)) ; Labels (label) @identifier ; Alias declarations (alias_declaration - (identifier) @identifier) + (identifier) @identifier.variable) diff --git a/crates/codebook/src/queries/yaml.scm b/crates/codebook/src/queries/yaml.scm index 4a0b805..2ccb720 100644 --- a/crates/codebook/src/queries/yaml.scm +++ b/crates/codebook/src/queries/yaml.scm @@ -17,7 +17,7 @@ [ (double_quote_scalar) (single_quote_scalar) - ] @identifier)) + ] @identifier.field)) (flow_mapping (_ @@ -25,4 +25,4 @@ [ (double_quote_scalar) (single_quote_scalar) - ] @identifier))) + ] @identifier.field))) diff --git a/crates/codebook/src/queries/zig.scm b/crates/codebook/src/queries/zig.scm index dd967cf..dd98520 100644 --- a/crates/codebook/src/queries/zig.scm +++ b/crates/codebook/src/queries/zig.scm @@ -6,20 +6,20 @@ ; Variable declarations (const/var declarations) (variable_declaration - (identifier) @identifier) + (identifier) @identifier.variable) ; Function declarations (function_declaration - (identifier) @identifier) + (identifier) @identifier.function) ; Function parameters (parameter - (identifier) @identifier) + (identifier) @identifier.parameter) ; Payload identifiers (capture variables in for/while loops) (payload - (identifier) @identifier) + (identifier) @identifier.variable) (struct_declaration (container_field - (identifier) @identifier)) + (identifier) @identifier.field)) diff --git a/crates/codebook/tests/test_tags.rs b/crates/codebook/tests/test_tags.rs new file mode 100644 index 0000000..7123e15 --- /dev/null +++ b/crates/codebook/tests/test_tags.rs @@ -0,0 +1,156 @@ +use codebook::queries::LanguageType; +mod utils; + +/// Sample Rust code with misspellings in comments, strings, and identifiers. +const RUST_SAMPLE: &str = r#" + // A commet with a typo + fn calculat_age() { + let nmber = "a strng value"; + } +"#; + +fn check(text: &str, lang: LanguageType, include: Vec<&str>, exclude: Vec<&str>) -> Vec { + let processor = utils::get_processor_with_tags(include, exclude); + let mut words: Vec = processor + .spell_check(text, Some(lang), None) + .iter() + .map(|r| r.word.clone()) + .collect(); + words.sort(); + words +} + +#[test] +fn test_no_filters_returns_all() { + let words = check(RUST_SAMPLE, LanguageType::Rust, vec![], vec![]); + // Should find typos in all three categories + assert!(words.contains(&"commet".to_string()), "missing comment typo"); + assert!( + words.contains(&"calculat".to_string()), + "missing identifier typo" + ); + assert!(words.contains(&"strng".to_string()), "missing string typo"); +} + +#[test] +fn test_include_comments_only() { + let words = check(RUST_SAMPLE, LanguageType::Rust, vec!["comment"], vec![]); + assert!(words.contains(&"commet".to_string()), "missing comment typo"); + assert!( + !words.contains(&"calculat".to_string()), + "identifier should be excluded" + ); + assert!( + !words.contains(&"strng".to_string()), + "string should be excluded" + ); + assert!( + !words.contains(&"nmber".to_string()), + "variable should be excluded" + ); +} + +#[test] +fn test_include_strings_only() { + let words = check(RUST_SAMPLE, LanguageType::Rust, vec!["string"], vec![]); + assert!(words.contains(&"strng".to_string()), "missing string typo"); + assert!( + !words.contains(&"commet".to_string()), + "comment should be excluded" + ); + assert!( + !words.contains(&"calculat".to_string()), + "identifier should be excluded" + ); +} + +#[test] +fn test_include_identifiers_only() { + let words = check(RUST_SAMPLE, LanguageType::Rust, vec!["identifier"], vec![]); + assert!( + words.contains(&"calculat".to_string()), + "missing function name typo" + ); + assert!( + words.contains(&"nmber".to_string()), + "missing variable name typo" + ); + assert!( + !words.contains(&"commet".to_string()), + "comment should be excluded" + ); + assert!( + !words.contains(&"strng".to_string()), + "string should be excluded" + ); +} + +#[test] +fn test_exclude_identifiers() { + let words = check(RUST_SAMPLE, LanguageType::Rust, vec![], vec!["identifier"]); + assert!(words.contains(&"commet".to_string()), "missing comment typo"); + assert!(words.contains(&"strng".to_string()), "missing string typo"); + assert!( + !words.contains(&"calculat".to_string()), + "identifier should be excluded" + ); + assert!( + !words.contains(&"nmber".to_string()), + "variable should be excluded" + ); +} + +#[test] +fn test_exclude_specific_subtag() { + // Exclude only identifier.variable, keep identifier.function + let words = check( + RUST_SAMPLE, + LanguageType::Rust, + vec![], + vec!["identifier.variable"], + ); + assert!( + words.contains(&"calculat".to_string()), + "function name should still be checked" + ); + assert!( + !words.contains(&"nmber".to_string()), + "variable should be excluded" + ); +} + +#[test] +fn test_include_and_exclude_combined() { + // Include comments and strings, but exclude string specifically + let words = check( + RUST_SAMPLE, + LanguageType::Rust, + vec!["comment", "string"], + vec!["string"], + ); + assert!(words.contains(&"commet".to_string()), "missing comment typo"); + assert!( + !words.contains(&"strng".to_string()), + "string should be excluded by exclude_tags" + ); + assert!( + !words.contains(&"calculat".to_string()), + "identifier not in include_tags" + ); +} + +#[test] +fn test_text_language_ignores_tags() { + // Text language doesn't use tree-sitter, so tags should have no effect + let processor = utils::get_processor_with_tags(vec!["comment"], vec![]); + let text = "This has a tset typo"; + let words: Vec = processor + .spell_check(text, Some(LanguageType::Text), None) + .iter() + .map(|r| r.word.clone()) + .collect(); + assert!( + words.contains(&"tset".to_string()), + "Text mode should check everything regardless of tags" + ); +} diff --git a/crates/codebook/tests/utils/mod.rs b/crates/codebook/tests/utils/mod.rs index 424c0f7..250e41b 100644 --- a/crates/codebook/tests/utils/mod.rs +++ b/crates/codebook/tests/utils/mod.rs @@ -30,6 +30,20 @@ pub fn get_processor_with_include_and_ignore(include: &str, ignore: &str) -> Cod Codebook::new(config).unwrap() } +#[allow(dead_code)] +pub fn get_processor_with_tags( + include_tags: Vec<&str>, + exclude_tags: Vec<&str>, +) -> Codebook { + let settings = codebook_config::ConfigSettings { + include_tags: include_tags.into_iter().map(String::from).collect(), + exclude_tags: exclude_tags.into_iter().map(String::from).collect(), + ..Default::default() + }; + let config = Arc::new(CodebookConfigMemory::new(settings)); + Codebook::new(config).unwrap() +} + #[allow(dead_code)] pub fn init_logging() { let _ = env_logger::builder().is_test(true).try_init();