From 7e5dc285c397d9c11cd691d93beabf080a24db5f Mon Sep 17 00:00:00 2001 From: Bo Lopker Date: Thu, 19 Mar 2026 22:43:47 -0700 Subject: [PATCH] Add tree-sitter Markdown support to skip fenced code blocks Adds a proper Markdown language type using tree-sitter-md instead of treating Markdown as plain text. Fenced code blocks are no longer spell-checked, fixing false positives like `mkdir` in bash snippets. Closes #198 --- Cargo.lock | 11 ++ Cargo.toml | 1 + crates/codebook/Cargo.toml | 1 + crates/codebook/src/queries.rs | 9 ++ crates/codebook/src/queries/markdown.scm | 2 + crates/codebook/tests/test_markdown.rs | 136 +++++++++++++++++++++++ 6 files changed, 160 insertions(+) create mode 100644 crates/codebook/src/queries/markdown.scm create mode 100644 crates/codebook/tests/test_markdown.rs diff --git a/Cargo.lock b/Cargo.lock index 42b516a..515d020 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -426,6 +426,7 @@ dependencies = [ "tree-sitter-java", "tree-sitter-javascript", "tree-sitter-lua", + "tree-sitter-md", "tree-sitter-odin-codebook", "tree-sitter-php", "tree-sitter-python", @@ -3021,6 +3022,16 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-md" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2efd398be546456c814598ee56c0f51769a77241511b4a58077815d120afa882" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-odin-codebook" version = "1.4.0" diff --git a/Cargo.toml b/Cargo.toml index 800d64f..1e33f56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,7 @@ tree-sitter-html = "<0.25.0" tree-sitter-java = "<0.25.0" tree-sitter-javascript = "<0.26.0" tree-sitter-lua = "<0.25.0" +tree-sitter-md = "<0.6.0" tree-sitter-odin-codebook = "1.4.0" tree-sitter-php = "<0.25.0" tree-sitter-python = "<0.26.0" diff --git a/crates/codebook/Cargo.toml b/crates/codebook/Cargo.toml index b5c8028..74270ae 100644 --- a/crates/codebook/Cargo.toml +++ 
b/crates/codebook/Cargo.toml @@ -42,6 +42,7 @@ tree-sitter-java.workspace = true tree-sitter-javascript.workspace = true codebook-tree-sitter-latex.workspace = true tree-sitter-lua.workspace = true +tree-sitter-md.workspace = true tree-sitter-odin-codebook.workspace = true tree-sitter-php.workspace = true tree-sitter-python.workspace = true diff --git a/crates/codebook/src/queries.rs b/crates/codebook/src/queries.rs index 9501cf6..ccf83df 100644 --- a/crates/codebook/src/queries.rs +++ b/crates/codebook/src/queries.rs @@ -18,6 +18,7 @@ pub enum LanguageType { Javascript, Latex, Lua, + Markdown, Odin, Php, Python, @@ -199,6 +200,13 @@ pub static LANGUAGE_SETTINGS: &[LanguageSetting] = &[ query: include_str!("queries/lua.scm"), extensions: &["lua"], }, + LanguageSetting { + type_: LanguageType::Markdown, + ids: &["markdown"], + dictionary_ids: &[], + query: include_str!("queries/markdown.scm"), + extensions: &["md", "markdown"], + }, LanguageSetting { type_: LanguageType::Bash, ids: &["bash", "shellscript", "sh", "shell script"], @@ -291,6 +299,7 @@ impl LanguageSetting { LanguageType::Javascript => Some(tree_sitter_javascript::LANGUAGE.into()), LanguageType::Latex => Some(codebook_tree_sitter_latex::LANGUAGE.into()), LanguageType::Lua => Some(tree_sitter_lua::LANGUAGE.into()), + LanguageType::Markdown => Some(tree_sitter_md::LANGUAGE.into()), LanguageType::Odin => Some(tree_sitter_odin_codebook::LANGUAGE.into()), LanguageType::Php => Some(tree_sitter_php::LANGUAGE_PHP.into()), LanguageType::Python => Some(tree_sitter_python::LANGUAGE.into()), diff --git a/crates/codebook/src/queries/markdown.scm b/crates/codebook/src/queries/markdown.scm new file mode 100644 index 0000000..8c1c6b2 --- /dev/null +++ b/crates/codebook/src/queries/markdown.scm @@ -0,0 +1,2 @@ +(paragraph (inline) @string) +(atx_heading (inline) @string) diff --git a/crates/codebook/tests/test_markdown.rs b/crates/codebook/tests/test_markdown.rs new file mode 100644 index 0000000..04b13dc --- /dev/null 
+++ b/crates/codebook/tests/test_markdown.rs @@ -0,0 +1,136 @@ +use codebook::{ + parser::{TextRange, WordLocation}, + queries::LanguageType, +}; + +mod utils; + +#[test] /* Checks that the single misspelled word in a plain paragraph is the only result and is located at bytes 27..36 of the sample. */ +fn test_markdown_paragraph() { + utils::init_logging(); + let processor = utils::get_processor(); + let sample_text = "Some paragraph text with a misspeled word.\n"; + let expected = vec![WordLocation::new( + "misspeled".to_string(), + vec![TextRange { + start_byte: 27, + end_byte: 36, + }], + )]; + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::Markdown), None) + .to_vec(); + println!("Misspelled words: {misspelled:?}"); + assert_eq!(misspelled.len(), 1); + assert_eq!(misspelled[0].word, expected[0].word); + assert_eq!(misspelled[0].locations, expected[0].locations); +} + +#[test] /* Checks that both typos on a heading line are reported. */ +fn test_markdown_heading() { + utils::init_logging(); + let processor = utils::get_processor(); + let sample_text = "# A headng with a tyypo\n"; + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::Markdown), None) + .to_vec(); + let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect(); + println!("Misspelled words: {words:?}"); + assert!(words.contains(&"headng")); + assert!(words.contains(&"tyypo")); +} + +#[test] /* Checks that words appearing only inside a fenced bash block are absent from the results. */ +fn test_markdown_fenced_code_block_skipped() { + utils::init_logging(); + let processor = utils::get_processor(); + let sample_text = r#"# Hello World + +Some correct text here. + +```bash +mkdir some_dir +badwwword_in_code +``` + +More correct text here. 
+"#; + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::Markdown), None) + .to_vec(); + let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect(); + println!("Misspelled words: {words:?}"); + // Words inside fenced code blocks should NOT be flagged + assert!(!words.contains(&"mkdir")); + assert!(!words.contains(&"badwwword")); + assert!(!words.contains(&"dir")); +} + +#[test] /* Checks that prose typos before and after a fenced python block are reported while the typo inside the block is not. */ +fn test_markdown_fenced_code_block_with_typo_outside() { + utils::init_logging(); + let processor = utils::get_processor(); + let sample_text = r#"A paragrap with a tyypo. + +```python +def some_functin(): + pass +``` + +Another paragrap with a tyypo. +"#; + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::Markdown), None) + .to_vec(); + let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect(); + println!("Misspelled words: {words:?}"); + // Typos in prose should be flagged + assert!(words.contains(&"paragrap")); + assert!(words.contains(&"tyypo")); + // Typos inside code blocks should NOT be flagged + assert!(!words.contains(&"functin")); +} + +#[test] /* Checks that prose typos are reported and that the contents of two separate fenced blocks are not. */ +fn test_markdown_multiple_code_blocks() { + utils::init_logging(); + let processor = utils::get_processor(); + let sample_text = r#"Some text with a tyypo. + +```bash +mkdir somedir +``` + +Middle text is corect. + +```python +badspel = True +``` + +End text is also corect. 
+"#; + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::Markdown), None) + .to_vec(); + let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect(); + println!("Misspelled words: {words:?}"); + assert!(words.contains(&"tyypo")); + assert!(words.contains(&"corect")); + assert!(!words.contains(&"mkdir")); + assert!(!words.contains(&"somedir")); + assert!(!words.contains(&"badspel")); +} + +#[test] /* Checks that typos inside a block quote line are reported. */ +fn test_markdown_block_quote() { + utils::init_logging(); + let processor = utils::get_processor(); + let sample_text = "> A block quoet with a tyypo.\n"; + let misspelled = processor + .spell_check(sample_text, Some(LanguageType::Markdown), None) + .to_vec(); + let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect(); + println!("Misspelled words: {words:?}"); + assert!(words.contains(&"quoet")); + assert!(words.contains(&"tyypo")); +}