Skip to content

Commit 708bb4a

Browse files
committed
Switch tree-sitter to WASM runtime for cross-platform CI compatibility
Native tree-sitter required C++ compilation via node-gyp, which failed on GitHub Actions with Node 24 (V8 C++20 requirement). Switch to web-tree-sitter (WASM) with grammar files from @vscode/tree-sitter-wasm.

- Replace native tree-sitter + grammar packages with web-tree-sitter + @vscode/tree-sitter-wasm
- Make parseSource/parseCodeFile/parseFile/extractSymbols async (WASM loading is async)
- Remove native tree-sitter jest mock, use --experimental-vm-modules instead
- Remove npm overrides (no more native peer dep conflicts)
- Update all tests for async parser API (beforeAll pattern)
1 parent 272f1a7 commit 708bb4a

13 files changed

Lines changed: 271 additions & 599 deletions

package-lock.json

Lines changed: 14 additions & 92 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
"dev": "tsc --watch",
1515
"cli": "node dist/cli/index.js",
1616
"cli:dev": "tsx src/cli/index.ts",
17-
"test": "jest",
18-
"test:watch": "jest --watch"
17+
"test": "NODE_OPTIONS='--experimental-vm-modules' jest",
18+
"test:watch": "NODE_OPTIONS='--experimental-vm-modules' jest --watch"
1919
},
2020
"repository": {
2121
"type": "git",
@@ -49,6 +49,7 @@
4949
"dependencies": {
5050
"@huggingface/transformers": "^3.8.1",
5151
"@modelcontextprotocol/sdk": "^1.27.1",
52+
"@vscode/tree-sitter-wasm": "^0.3.0",
5253
"chokidar": "^5.0.0",
5354
"commander": "^14.0.3",
5455
"cookie-parser": "^1.4.7",
@@ -60,9 +61,7 @@
6061
"micromatch": "^4.0.8",
6162
"mime": "^4.1.0",
6263
"multer": "^2.1.1",
63-
"tree-sitter": "^0.25.0",
64-
"tree-sitter-javascript": "^0.25.0",
65-
"tree-sitter-typescript": "^0.23.2",
64+
"web-tree-sitter": "^0.26.7",
6665
"ws": "^8.19.0",
6766
"yaml": "^2.8.2",
6867
"zod": "^4.3.6"
@@ -85,10 +84,5 @@
8584
"tsc-alias": "^1.8.16",
8685
"tsx": "^4.21.0",
8786
"typescript": "^5.9.3"
88-
},
89-
"overrides": {
90-
"tree-sitter-typescript": {
91-
"tree-sitter": "$tree-sitter"
92-
}
9387
}
9488
}

src/cli/indexer.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ export function createProjectIndexer(
102102
const fileId = path.relative(config.projectDir, absolutePath);
103103
if (getFileMtime(docGraph, fileId) === mtime) return;
104104
const content = fs.readFileSync(absolutePath, 'utf-8');
105-
const chunks = parseFile(content, absolutePath, config.projectDir, config.chunkDepth);
105+
const chunks = await parseFile(content, absolutePath, config.projectDir, config.chunkDepth);
106106
// Batch-embed all chunks + file-level in one forward pass
107107
const batchInputs = chunks.map(c => ({ title: c.title, content: c.content }));
108108
const rootChunk = chunks.find(c => c.level === 1);
@@ -139,7 +139,7 @@ export function createProjectIndexer(
139139
const mtime = stat.mtimeMs;
140140
const fileId = path.relative(config.projectDir, absolutePath);
141141
if (getCodeFileMtime(codeGraph, fileId) === mtime) return;
142-
const parsed = parseCodeFile(absolutePath, config.projectDir, mtime);
142+
const parsed = await parseCodeFile(absolutePath, config.projectDir, mtime);
143143
// Batch-embed all symbols + file-level in one forward pass
144144
const batchInputs = parsed.nodes.map(({ attrs }) => ({ title: attrs.signature, content: attrs.docComment }));
145145
batchInputs.push({ title: fileId, content: '' });

src/lib/parsers/code.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,11 @@ function hasFile(p: string): boolean {
5252
// Main parser
5353
// ---------------------------------------------------------------------------
5454

55-
export function parseCodeFile(
55+
export async function parseCodeFile(
5656
absolutePath: string,
5757
codeDir: string,
5858
mtime: number,
59-
): ParsedFile {
59+
): Promise<ParsedFile> {
6060
const fileId = path.relative(codeDir, absolutePath);
6161

6262
// Determine language from file extension
@@ -77,7 +77,7 @@ export function parseCodeFile(
7777
}
7878

7979
const source = fs.readFileSync(absolutePath, 'utf-8');
80-
const rootNode = parseSource(source, language);
80+
const rootNode = await parseSource(source, language);
8181

8282
if (!rootNode) {
8383
return {

src/lib/parsers/codeblock.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@ const TAG_TO_LANGUAGE: Record<string, string> = {
1414
* Extract top-level symbol names from a code block using tree-sitter.
1515
* Returns [] for unsupported languages or on parse failure.
1616
*/
17-
export function extractSymbols(code: string, language: string): string[] {
17+
export async function extractSymbols(code: string, language: string): Promise<string[]> {
1818
const lang = TAG_TO_LANGUAGE[language.toLowerCase()];
1919
if (!lang || !isLanguageSupported(lang)) return [];
2020

2121
try {
22-
const rootNode = parseSource(code, lang);
22+
const rootNode = await parseSource(code, lang);
2323
if (!rootNode) return [];
2424

2525
const mapper = getMapper(lang)!;

src/lib/parsers/docs.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@ export interface Chunk {
1515
}
1616

1717
// Parse a markdown file into chunks split by headings
18-
export function parseFile(
18+
export async function parseFile(
1919
content: string,
2020
absolutePath: string,
2121
projectDir: string,
2222
chunkDepth: number,
23-
): Chunk[] {
23+
): Promise<Chunk[]> {
2424
const fileId = path.relative(projectDir, absolutePath);
2525
const lines = content.split('\n');
2626

@@ -65,14 +65,14 @@ export function parseFile(
6565
};
6666
});
6767

68-
return spliceCodeBlocks(textChunks, seenIds);
68+
return await spliceCodeBlocks(textChunks, seenIds);
6969
}
7070

7171
// --- code block extraction ---
7272

7373
const FENCE_RE = /^(`{3,}|~{3,})(\S*)\s*\n([\s\S]*?)^\1\s*$/gm;
7474

75-
function spliceCodeBlocks(chunks: Chunk[], seenIds: Set<string>): Chunk[] {
75+
async function spliceCodeBlocks(chunks: Chunk[], seenIds: Set<string>): Promise<Chunk[]> {
7676
const result: Chunk[] = [];
7777

7878
for (const chunk of chunks) {
@@ -102,7 +102,7 @@ function spliceCodeBlocks(chunks: Chunk[], seenIds: Set<string>): Chunk[] {
102102
seenIds.add(id);
103103

104104
const lang = cb.language || undefined;
105-
const symbols = lang ? extractSymbols(cb.code, lang) : [];
105+
const symbols = lang ? await extractSymbols(cb.code, lang) : [];
106106

107107
result.push({
108108
id,

src/lib/parsers/languages/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
export { registerLanguage, getLanguageEntry, isLanguageSupported, parseSource, getMapper, listLanguages } from './registry';
1+
export { registerLanguage, isLanguageSupported, parseSource, getMapper, listLanguages, initParser } from './registry';
22
export type { LanguageMapper, ExtractedSymbol, ExtractedEdge, ExtractedImport } from './types';
33
export { registerTypescript } from './typescript';
44

src/lib/parsers/languages/registry.ts

Lines changed: 46 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,77 @@
1-
import Parser from 'tree-sitter';
1+
import path from 'path';
22
import type { LanguageMapper } from './types';
33

44
export { type LanguageMapper, type ExtractedSymbol, type ExtractedEdge, type ExtractedImport } from './types';
55

6+
// web-tree-sitter types (loaded lazily)
7+
type WTSLanguage = any;
8+
69
interface LanguageEntry {
7-
/** tree-sitter Language object */
8-
grammar: any;
10+
/** WASM file name, e.g. 'tree-sitter-typescript.wasm' */
11+
wasmFile: string;
12+
/** Loaded Language instance (null until loadLanguage is called) */
13+
language: WTSLanguage | null;
914
/** Mapper that extracts symbols, edges, imports from the AST */
1015
mapper: LanguageMapper;
1116
}
1217

1318
/** Map from language name (matching file-lang.ts names) to entry. */
1419
const languages = new Map<string, LanguageEntry>();
1520

16-
/** Shared parser instance — setLanguage() is cheap, no need for one per language. */
17-
let _parser: Parser | undefined;
21+
/** WASM directory containing grammar .wasm files */
22+
const WASM_DIR = path.join(
23+
path.dirname(require.resolve('@vscode/tree-sitter-wasm/package.json')),
24+
'wasm',
25+
);
26+
27+
let _initPromise: Promise<void> | null = null;
28+
29+
/** web-tree-sitter module (lazy loaded) */
30+
let _wts: any = null;
31+
32+
/** Initialize the WASM parser runtime. Must be called before parsing. */
33+
export async function initParser(): Promise<void> {
34+
if (_wts) return;
35+
if (_initPromise) return _initPromise;
36+
37+
_initPromise = (async () => {
38+
_wts = require('web-tree-sitter');
39+
await _wts.Parser.init();
40+
})();
1841

19-
function getParser(): Parser {
20-
if (!_parser) _parser = new Parser();
21-
return _parser;
42+
return _initPromise;
2243
}
2344

24-
/** Register a language. Called at module load time by each language module. */
25-
export function registerLanguage(name: string, grammar: any, mapper: LanguageMapper): void {
26-
languages.set(name, { grammar, mapper });
45+
/** Register a language (sync — only stores metadata). */
46+
export function registerLanguage(name: string, wasmFile: string, mapper: LanguageMapper): void {
47+
languages.set(name, { wasmFile, language: null, mapper });
2748
}
2849

29-
/** Get a registered language entry. Returns undefined for unsupported languages. */
30-
export function getLanguageEntry(languageName: string): LanguageEntry | undefined {
31-
return languages.get(languageName);
50+
/** Load a language WASM if not already loaded. */
51+
async function loadLanguage(entry: LanguageEntry): Promise<WTSLanguage> {
52+
if (entry.language) return entry.language;
53+
await initParser();
54+
const wasmPath = path.join(WASM_DIR, entry.wasmFile);
55+
entry.language = await _wts.Language.load(wasmPath);
56+
return entry.language;
3257
}
3358

34-
/** Check if a language is supported. */
59+
/** Check if a language is registered. */
3560
export function isLanguageSupported(languageName: string): boolean {
3661
return languages.has(languageName);
3762
}
3863

3964
/** Parse source code with the appropriate language grammar. Returns root node or null. */
40-
export function parseSource(code: string, languageName: string): any | null {
65+
export async function parseSource(code: string, languageName: string): Promise<any | null> {
4166
const entry = languages.get(languageName);
4267
if (!entry) return null;
4368

44-
const parser = getParser();
45-
parser.setLanguage(entry.grammar);
69+
await initParser();
70+
const lang = await loadLanguage(entry);
71+
const parser = new _wts.Parser();
72+
parser.setLanguage(lang);
4673
const tree = parser.parse(code);
47-
return tree.rootNode;
74+
return tree?.rootNode ?? null;
4875
}
4976

5077
/** Get the mapper for a language. Returns undefined for unsupported languages. */

0 commit comments

Comments
 (0)