Skip to content

Commit 7871a46

Browse files
authored
Add tree-sitter Markdown support to skip fenced code blocks (#233)
Adds a proper Markdown language type using tree-sitter-md instead of treating Markdown as plain text. Fenced code blocks are no longer spell-checked, fixing false positives like `mkdir` in bash snippets. Closes #198
1 parent 95c5afb commit 7871a46

6 files changed

Lines changed: 160 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 11 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ tree-sitter-html = "<0.25.0"
5757
tree-sitter-java = "<0.25.0"
5858
tree-sitter-javascript = "<0.26.0"
5959
tree-sitter-lua = "<0.25.0"
60+
tree-sitter-md = "<0.6.0"
6061
tree-sitter-odin-codebook = "1.4.0"
6162
tree-sitter-php = "<0.25.0"
6263
tree-sitter-python = "<0.26.0"

crates/codebook/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@ tree-sitter-java.workspace = true
4242
tree-sitter-javascript.workspace = true
4343
codebook-tree-sitter-latex.workspace = true
4444
tree-sitter-lua.workspace = true
45+
tree-sitter-md.workspace = true
4546
tree-sitter-odin-codebook.workspace = true
4647
tree-sitter-php.workspace = true
4748
tree-sitter-python.workspace = true

crates/codebook/src/queries.rs

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ pub enum LanguageType {
1818
Javascript,
1919
Latex,
2020
Lua,
21+
Markdown,
2122
Odin,
2223
Php,
2324
Python,
@@ -199,6 +200,13 @@ pub static LANGUAGE_SETTINGS: &[LanguageSetting] = &[
199200
query: include_str!("queries/lua.scm"),
200201
extensions: &["lua"],
201202
},
203+
LanguageSetting {
204+
type_: LanguageType::Markdown,
205+
ids: &["markdown"],
206+
dictionary_ids: &[],
207+
query: include_str!("queries/markdown.scm"),
208+
extensions: &["md", "markdown"],
209+
},
202210
LanguageSetting {
203211
type_: LanguageType::Bash,
204212
ids: &["bash", "shellscript", "sh", "shell script"],
@@ -291,6 +299,7 @@ impl LanguageSetting {
291299
LanguageType::Javascript => Some(tree_sitter_javascript::LANGUAGE.into()),
292300
LanguageType::Latex => Some(codebook_tree_sitter_latex::LANGUAGE.into()),
293301
LanguageType::Lua => Some(tree_sitter_lua::LANGUAGE.into()),
302+
LanguageType::Markdown => Some(tree_sitter_md::LANGUAGE.into()),
294303
LanguageType::Odin => Some(tree_sitter_odin_codebook::LANGUAGE.into()),
295304
LanguageType::Php => Some(tree_sitter_php::LANGUAGE_PHP.into()),
296305
LanguageType::Python => Some(tree_sitter_python::LANGUAGE.into()),
crates/codebook/src/queries/markdown.scm

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
; Capture the inline text of paragraphs and ATX headings as @string.
; No other node kinds (for example fenced code blocks) are captured here.
(paragraph (inline) @string)
2+
(atx_heading (inline) @string)
(new Rust test file for Markdown; its path was not captured on this page)

Lines changed: 136 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,136 @@
1+
use codebook::{
2+
parser::{TextRange, WordLocation},
3+
queries::LanguageType,
4+
};
5+
6+
mod utils;
7+
8+
/// Checks that a single misspelled word in a plain Markdown paragraph is
/// reported exactly once, with the expected word text and byte range.
#[test]
9+
fn test_markdown_paragraph() {
10+
utils::init_logging();
11+
let processor = utils::get_processor();
12+
let sample_text = "Some paragraph text with a misspeled word.\n";
13+
let expected = vec![WordLocation::new(
14+
"misspeled".to_string(),
15+
vec![TextRange {
16+
// "misspeled" occupies bytes 27..36 of `sample_text`.
start_byte: 27,
17+
end_byte: 36,
18+
}],
19+
)];
20+
let misspelled = processor
21+
.spell_check(sample_text, Some(LanguageType::Markdown), None)
22+
.to_vec();
23+
println!("Misspelled words: {misspelled:?}");
24+
// Exactly one result is expected; its word and locations must match `expected`.
assert_eq!(misspelled.len(), 1);
25+
assert_eq!(misspelled[0].word, expected[0].word);
26+
assert_eq!(misspelled[0].locations, expected[0].locations);
27+
}
28+
29+
/// Checks that misspellings inside an ATX heading (`# ...`) are reported.
///
/// Only membership of the two typos is asserted; the total number of
/// results is not checked.
#[test]
30+
fn test_markdown_heading() {
31+
utils::init_logging();
32+
let processor = utils::get_processor();
33+
let sample_text = "# A headng with a tyypo\n";
34+
let misspelled = processor
35+
.spell_check(sample_text, Some(LanguageType::Markdown), None)
36+
.to_vec();
37+
// Reduce the results to just the reported word strings for the checks below.
let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
38+
println!("Misspelled words: {words:?}");
39+
assert!(words.contains(&"headng"));
40+
assert!(words.contains(&"tyypo"));
41+
}
42+
43+
/// Checks that words appearing only inside a fenced code block are not
/// reported as misspelled.
///
/// The test asserts only the absence of the code-block words; it makes no
/// assertion about the surrounding heading and paragraphs.
#[test]
44+
fn test_markdown_fenced_code_block_skipped() {
45+
utils::init_logging();
46+
let processor = utils::get_processor();
47+
let sample_text = r#"# Hello World
48+
49+
Some correct text here.
50+
51+
```bash
52+
mkdir some_dir
53+
badwwword_in_code
54+
```
55+
56+
More correct text here.
57+
"#;
58+
let misspelled = processor
59+
.spell_check(sample_text, Some(LanguageType::Markdown), None)
60+
.to_vec();
61+
let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
62+
println!("Misspelled words: {words:?}");
63+
// Words inside fenced code blocks should NOT be flagged
64+
assert!(!words.contains(&"mkdir"));
65+
assert!(!words.contains(&"badwwword"));
66+
assert!(!words.contains(&"dir"));
67+
}
68+
69+
/// Checks that typos in prose around a fenced code block are still reported
/// while a typo inside the code block is not.
#[test]
70+
fn test_markdown_fenced_code_block_with_typo_outside() {
71+
utils::init_logging();
72+
let processor = utils::get_processor();
73+
let sample_text = r#"A paragrap with a tyypo.
74+
75+
```python
76+
def some_functin():
77+
pass
78+
```
79+
80+
Another paragrap with a tyypo.
81+
"#;
82+
let misspelled = processor
83+
.spell_check(sample_text, Some(LanguageType::Markdown), None)
84+
.to_vec();
85+
let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
86+
println!("Misspelled words: {words:?}");
87+
// Typos in prose should be flagged
88+
assert!(words.contains(&"paragrap"));
89+
assert!(words.contains(&"tyypo"));
90+
// Typos inside code blocks should NOT be flagged
91+
assert!(!words.contains(&"functin"));
92+
}
93+
94+
/// Checks a document with two fenced code blocks separated by prose:
/// typos in the prose are reported, and words that appear only inside
/// either code block are not.
#[test]
95+
fn test_markdown_multiple_code_blocks() {
96+
utils::init_logging();
97+
let processor = utils::get_processor();
98+
let sample_text = r#"Some text with a tyypo.
99+
100+
```bash
101+
mkdir somedir
102+
```
103+
104+
Middle text is corect.
105+
106+
```python
107+
badspel = True
108+
```
109+
110+
End text is also corect.
111+
"#;
112+
let misspelled = processor
113+
.spell_check(sample_text, Some(LanguageType::Markdown), None)
114+
.to_vec();
115+
let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
116+
println!("Misspelled words: {words:?}");
117+
// Prose typos must be present in the results.
assert!(words.contains(&"tyypo"));
118+
assert!(words.contains(&"corect"));
119+
// Words found only inside the two code blocks must be absent.
assert!(!words.contains(&"mkdir"));
120+
assert!(!words.contains(&"somedir"));
121+
assert!(!words.contains(&"badspel"));
122+
}
123+
124+
/// Checks that misspellings inside a block quote (`> ...`) are reported.
///
/// Only membership of the two typos is asserted; the total number of
/// results is not checked.
#[test]
125+
fn test_markdown_block_quote() {
126+
utils::init_logging();
127+
let processor = utils::get_processor();
128+
let sample_text = "> A block quoet with a tyypo.\n";
129+
let misspelled = processor
130+
.spell_check(sample_text, Some(LanguageType::Markdown), None)
131+
.to_vec();
132+
let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
133+
println!("Misspelled words: {words:?}");
134+
assert!(words.contains(&"quoet"));
135+
assert!(words.contains(&"tyypo"));
136+
}

0 commit comments

Comments
 (0)