diff --git a/src/api.ts b/src/api.ts index ecd3505..26dee19 100644 --- a/src/api.ts +++ b/src/api.ts @@ -1,4 +1,4 @@ -import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, NewlineConfig, TokenPattern } from './types.ts'; +import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, NewlineConfig, StringInterpolation, TokenPattern } from './types.ts'; import { altPattern, anyChar, followedBy, isTokenPattern, lit, never, noneOf, notFollowedBy, notPrecededBy, oneOf, optPattern, plus, precededBy, range, repeat, @@ -17,6 +17,9 @@ interface TokenOptions { skip?: boolean; scope?: string; escape?: TokenPatternInput; + // Highlight-only interpolation regions for ordinary string tokens (e.g. env-spec `${…}` / `$(…)`). + // The parser/lexer stay token-based; generators re-express these as nested regions. + interpolation?: StringInterpolation | StringInterpolation[]; // A regex matching exactly one well-formed escape sequence. Engine-scanned tokens // (templates) validate each `\`-escape against it and reject any that don't match — // unlike `escape` (highlight-only), this drives tokenization. Skipped in tag @@ -414,6 +417,9 @@ export function defineGrammar(config: GrammarConfig): CstGrammar & { name: strin flags, scope: tok.opts.scope, escapePattern: tok.opts.escape, + interpolation: tok.opts.interpolation + ? (Array.isArray(tok.opts.interpolation) ? tok.opts.interpolation : [tok.opts.interpolation]).map((i) => ({ ...i })) + : undefined, escapeValidPattern: tok.opts.escapeValid, embed: tok.opts.embed, identifier: tok.opts.identifier, diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts index d827de9..b773558 100644 --- a/src/gen-lexer.ts +++ b/src/gen-lexer.ts @@ -966,7 +966,7 @@ export function createLexer(grammar: CstGrammar) { // • LINE-LEAD at the document root (a bare top-level `"a\nb`, or `---\n"a\nb`) → -1. 
// Blank (whitespace-only) continuation lines are skipped — they are folded line breaks, legal // at any column. Flow is exempt (indentation suspended). yaml-test-suite DK95[1] / QB6E. - if (tm.isString && indent && flowDepth === 0 && m[0].includes('\n')) { + if (tm.isString && indent?.blockScalar && flowDepth === 0 && m[0].includes('\n')) { const prevT = tokens[tokens.length - 1]; const prevIsDocMarker = !!prevT && blockScalarDocMarkers.includes(prevT.text); let parentCol: number; diff --git a/src/gen-monarch.ts b/src/gen-monarch.ts index 63860e9..a2b4a40 100644 --- a/src/gen-monarch.ts +++ b/src/gen-monarch.ts @@ -488,6 +488,11 @@ export function generateMonarch(grammar: CstGrammar): MonarchLanguage { const stringTopRules: MonarchRule[] = []; // entered from root/value const stringNestedRules: MonarchRule[] = []; // entered from interpolation holes + // Highlight-only string interpolation regions (e.g. env-spec `${…}` / `$(…)`): per region we add a + // begin rule into the string body and build a dedicated interp state (re-enter the expression body, + // pop on the region's end). Specs are collected here; the states are built after templates, once the + // nested string/template rules they include are populated. + const interpStateSpecs: { name: string; end: string }[] = []; for (const t of grammar.tokens) { if (t.flags.includes('skip') || t.flags.includes('regex') || t.template) continue; @@ -505,7 +510,19 @@ export function generateMonarch(grammar: CstGrammar): MonarchLanguage { const body: MonarchRule[] = []; const escapePattern = tokenEscapePatternSource(t); if (escapePattern) body.push([anchoredSource(escapePattern), 'string.escape']); - body.push([`[^${escapeForCharClass(delim[0])}\\\\]+`, tok]); + // Interpolation openers come BEFORE the content run so they win; the content run then excludes + // any position that begins an interpolation (negative lookahead) so it can't swallow `${`. + const interps = t.interpolation ?? 
[]; + interps.forEach((interp, i) => { + const name = `string_interp_${suffix}_${i + 1}`; + body.push([escapeRegex(interp.begin), { token: 'delimiter.bracket', next: `@${name}` }]); + interpStateSpecs.push({ name, end: interp.end }); + }); + const dc = escapeForCharClass(delim[0]); + const content = interps.length + ? `(?:(?!${interps.map(p => escapeRegex(p.begin)).join('|')})[^${dc}\\\\])+` + : `[^${dc}\\\\]+`; + body.push([content, tok]); body.push(['\\\\.', 'string.escape']); tokenizer[bodyState] = body; } @@ -591,6 +608,28 @@ export function generateMonarch(grammar: CstGrammar): MonarchLanguage { ]; } + // String-interpolation states (collected in the string loop above). Built here, after templates, + // so the nested string/template rules they include are populated; `@interpExprBody` is a lazy + // include resolved by Monarch. A bare `{` pushes a brace-counting frame (shared with templates). + if (interpStateSpecs.length) { + if (!tokenizer['bracketCounting']) { + tokenizer['bracketCounting'] = [ + wsRule, ...commentRules, ...stringNestedRules, ...templateNestedRules, + ['\\{', { token: 'delimiter.bracket', next: '@bracketCounting' }], + ['\\}', { token: 'delimiter.bracket', next: '@pop' }], + { include: '@interpExprBody' }, + ]; + } + for (const spec of interpStateSpecs) { + tokenizer[spec.name] = [ + wsRule, ...commentRules, ...stringNestedRules, ...templateNestedRules, + ['\\{', { token: 'delimiter.bracket', next: '@bracketCounting' }], + [escapeRegex(spec.end), { token: 'delimiter.bracket', next: '@pop' }], + { include: '@interpExprBody' }, + ]; + } + } + // ── Numbers (most-specific first; token decl order encodes specificity) ── const numberRules: MonarchRule[] = []; for (const t of grammar.tokens) { diff --git a/src/gen-tm.ts b/src/gen-tm.ts index 3857443..1cbd3fa 100644 --- a/src/gen-tm.ts +++ b/src/gen-tm.ts @@ -4464,12 +4464,26 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra } else if (tokenEscapePatternSource(tok) 
&& scope.startsWith('string.')) { // String with escape sequences: generate begin/end for each delimiter const escapePat: TmPattern = { match: tokenEscapePatternSource(tok)!, name: `constant.character.escape.${langName}` }; + // Highlight-only interpolation regions (e.g. env-spec `${…}` / `$(…)`): each becomes a nested + // begin/end region — the same shape a template literal's hole gets. `begin`/`end` are + // author-supplied LITERAL delimiters (not regex sources), so they are regex-escaped here. + const interpPats: TmPattern[] = (tok.interpolation ?? []).map((interp) => { + const p: TmPattern = { begin: escapeRegex(interp.begin), end: escapeRegex(interp.end), patterns: [{ include: interp.include ?? '$self' }] }; + if (interp.beginScope) p.beginCaptures = { '0': { name: `${interp.beginScope}.${langName}` } }; + if (interp.endScope) p.endCaptures = { '0': { name: `${interp.endScope}.${langName}` } }; + if (interp.contentScope) p.name = `${interp.contentScope}.${langName}`; + return p; + }); + const stringPats: (TmPattern | { include: string })[] = [escapePat, ...interpPats]; const delimiters: [string, string][] = []; + // Drive the delimiter scope off the EXTRACTED delimiter generically: `"`/`'` keep their + // canonical scopes; any other delimiter (e.g. a backtick string) takes the token's own scope + // instead of the old loop's `"`-fallback (which mis-delimited backtick strings). + const scopeForDelim = (d: string) => d === '"' ? 'string.quoted.double' : d === "'" ? 
'string.quoted.single' : scope; for (const delim of tokenPatternStringDelimiters(tok)) { - if (delim === '"') delimiters.push(['"', 'string.quoted.double']); - else if (delim === "'") delimiters.push(["'", 'string.quoted.single']); + delimiters.push([delim, scopeForDelim(delim)]); } - if (delimiters.length === 0) delimiters.push(['"', scope]); // fallback + if (delimiters.length === 0) delimiters.push(['"', scope]); // fallback: no delimiter extractable if (delimiters.length === 1) { const [delim, delimScope] = delimiters[0]; @@ -4479,7 +4493,7 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra beginCaptures: { '0': { name: `punctuation.definition.string.begin.${langName}` } }, end: `${escapeRegex(delim)}|$`, endCaptures: { '0': { name: `punctuation.definition.string.end.${langName}` } }, - patterns: [escapePat], + patterns: stringPats, }; topPatterns.push({ include: `#${key}` }); rememberLiteralKey(delimScope, key, tok.name); @@ -4493,7 +4507,7 @@ export function generateTmLanguage(grammar: CstGrammar, langName: string): TmGra beginCaptures: { '0': { name: `punctuation.definition.string.begin.${langName}` } }, end: `${escapeRegex(delim)}|$`, endCaptures: { '0': { name: `punctuation.definition.string.end.${langName}` } }, - patterns: [escapePat], + patterns: stringPats, }; topPatterns.push({ include: `#${subKey}` }); rememberLiteralKey(delimScope, subKey, tok.name); diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index b29300e..dda7dea 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -149,6 +149,9 @@ interface GrammarJsContext { * `template_chars` token. `null` when no template token exists. */ templatePlan: TemplatePlan | null; + /** String tokens carrying highlight-only interpolation regions, each re-expressed as a rule + * backed by an external `_chars` token (parallel to `templatePlan`). Empty if none. 
*/ + interpolationPlans: InterpolationPlan[]; /** * Ref nodes (the identifier right after a definition keyword) that should be * wrapped in `field('name', …)` so highlights.scm can target them with the @@ -358,6 +361,8 @@ function buildTokenBody(name: string, ctx: GrammarJsContext): string | null { // The interpolated-template token is re-expressed as a `template` RULE (with // `${ … }` holes that re-enter the expression grammar), emitted separately. if (ctx.templatePlan && ctx.templatePlan.tokenName === name) return null; + // A string token with interpolation regions is likewise re-expressed as a rule (emitted separately). + if (ctx.interpolationPlans.some(ip => ip.tokenName === name)) return null; // Skip-flagged tokens (comments, whitespace) go in `extras`, not as a named // rule reference — but we still emit them so highlights can capture comments. // tree-sitter's token() DFA rejects zero-width assertions, so strip them first. @@ -538,6 +543,43 @@ function planTemplate(grammar: CstGrammar): TemplatePlan | null { }; } +/** + * A string token carrying highlight-only interpolation regions (e.g. env-spec `${…}` / `$(…)`), + * re-expressed as a tree-sitter RULE (open delim + chars/interpolation runs + close delim) — the + * same shape a template literal gets. The literal text between regions is an external + * `_chars` token (the scanner stops it at the close delim or any region opener). + */ +interface InterpolationPlan { + tokenName: string; // original token name (e.g. 'DQ') — now emitted as a rule, not a token + ruleSnake: string; // snake rule name (e.g. 'dq') — keeps `$.dq` references valid + charsSnake: string; // external scanner symbol for the literal text (e.g. 'dq_chars') + open: string; // opening delimiter (e.g. 
'"') + close: string; // closing delimiter (same as open for a string token) + regions: { ruleSnake: string; open: string; close: string }[]; // one sub-rule per interpolation entry +} + +function planInterpolations(grammar: CstGrammar): InterpolationPlan[] { + const plans: InterpolationPlan[] = []; + for (const tok of grammar.tokens) { + if (!tok.interpolation?.length) continue; + const open = tokenPatternStringDelimiters(tok)[0] ?? '"'; + const ruleSnake = toSnake(tok.name); + plans.push({ + tokenName: tok.name, + ruleSnake, + charsSnake: ruleSnake + '_chars', + open, + close: open, + regions: tok.interpolation.map((interp, i) => ({ + ruleSnake: `${ruleSnake}_interpolation_${i + 1}`, + open: interp.begin, + close: interp.end, + })), + }); + } + return plans; +} + /** Determine which tokens the external scanner must provide. */ function planScannerTokens(grammar: CstGrammar): Map { const map = new Map(); @@ -560,6 +602,7 @@ function planScannerTokens(grammar: CstGrammar): Map { function externalSymbols(ctx: GrammarJsContext): string[] { const syms = [...ctx.scannerTokenFor.values()]; if (ctx.templatePlan) syms.push(ctx.templatePlan.charsSnake); + for (const ip of ctx.interpolationPlans) syms.push(ip.charsSnake); return syms; } @@ -725,8 +768,10 @@ export function generateTreeSitter(grammar: CstGrammar, langName?: string): Tree const scannerTokenFor = planScannerTokens(grammar); const templatePlan = planTemplate(grammar); + const interpolationPlans = planInterpolations(grammar); const externalSnake = new Set([...scannerTokenFor.values()]); if (templatePlan) externalSnake.add(templatePlan.charsSnake); + for (const ip of interpolationPlans) externalSnake.add(ip.charsSnake); // Find the identifier nodes that follow a declaration keyword, so we can wrap // them in `field('name', …)` in grammar.js AND emit standard `name:` highlight @@ -736,6 +781,7 @@ export function generateTreeSitter(grammar: CstGrammar, langName?: string): Tree const ctx: GrammarJsContext = { 
grammar, tokenNames, ruleSnake, tokenSnake, prattRules, externalSnake, scannerTokenFor, templatePlan, + interpolationPlans, nameFieldNodes: nameFields.nodes, }; @@ -859,6 +905,27 @@ function buildGrammarJs(ctx: GrammarJsContext, grammarName: string): string { ); } + // String-interpolation tokens: re-expressed as a rule (open + chars/interpolation runs + close); + // each interpolation region is a sub-rule whose hole re-enters the expression grammar (like a template). + const interpExprName = [...ctx.prattRules][0]; + const interpExprSnake = interpExprName ? ctx.ruleSnake.get(interpExprName)! : null; + const interpHole = interpExprSnake ? `optional($.${interpExprSnake})` : 'blank()'; + for (const ip of ctx.interpolationPlans) { + const choices = [`$.${ip.charsSnake}`, ...ip.regions.map(r => `$.${r.ruleSnake}`)].join(', '); + ruleEntries.push( + ` ${ip.ruleSnake}: $ => seq(\n` + + ` ${jsString(ip.open)},\n` + + ` repeat(choice(${choices})),\n` + + ` ${jsString(ip.close)}\n` + + ` )`, + ); + for (const r of ip.regions) { + ruleEntries.push( + ` ${r.ruleSnake}: $ => seq(${jsString(r.open)}, ${interpHole}, ${jsString(r.close)})`, + ); + } + } + lines.push(ruleEntries.join(',\n\n')); lines.push(' }'); lines.push('});'); @@ -1087,6 +1154,15 @@ function buildHighlightsScm( tokenNodeCaptures.push({ query: `(${tpl.substRuleSnake} ${jsString(tpl.interpOpen)})`, capture: '@punctuation.special' }); tokenNodeCaptures.push({ query: `(${tpl.substRuleSnake} ${jsString(tpl.interpClose)})`, capture: '@punctuation.special' }); } + // String-interpolation regions: the literal text reads as string; the region delimiters as + // punctuation — same treatment as a template hole, derived from the interpolation metadata. 
+ for (const ip of ctx.interpolationPlans) { + tokenNodeCaptures.push({ query: `(${ip.charsSnake})`, capture: '@string' }); + for (const r of ip.regions) { + tokenNodeCaptures.push({ query: `(${r.ruleSnake} ${jsString(r.open)})`, capture: '@punctuation.special' }); + tokenNodeCaptures.push({ query: `(${r.ruleSnake} ${jsString(r.close)})`, capture: '@punctuation.special' }); + } + } // ── D. Contextual node captures via emitted fields ── // Operators carry an `operator` field in Pratt rules; they're already covered by @@ -1753,6 +1829,49 @@ function buildScannerC( L.push(''); } + // ── Interpolated-string char scanners (one per string token carrying interpolation) ── + // Each scans the literal run inside the string, stopping before the close delimiter or any + // interpolation opener (so the opener re-enters the expression grammar via its sub-rule). The + // openers are DATA from the interpolation metadata (decoded literals, length 1–2). + { + const cChar = (ch: string) => ch === '\\' ? "'\\\\'" : ch === "'" ? 
"'\\''" : `'${ch}'`; + for (const ip of ctx.interpolationPlans) { + const charsSym = ip.charsSnake.toUpperCase(); + const up = ip.ruleSnake.toUpperCase(); + const openerInit = ip.regions.map(r => jsString(r.open)).join(', '); + L.push(`// ── Interpolated-string scan (${ip.tokenName}): literal text up to the close delim or an opener ──`); + L.push(`static const char *${up}_OPENERS[] = { ${openerInit} };`); + L.push(`static const unsigned ${up}_OPENER_COUNT = ${ip.regions.length};`); + L.push(`static bool scan_${ip.ruleSnake}_chars(TSLexer *lexer) {`); + L.push(' bool has_content = false;'); + L.push(' for (;;) {'); + L.push(' lexer->mark_end(lexer);'); + L.push(' int32_t c = lexer->lookahead;'); + L.push(' if (c == 0) return false; // EOF — let the CFG report the unterminated string'); + L.push(` if (c == ${cChar(ip.close)}) break; // closing delimiter`); + L.push(' bool first_match = false;'); + L.push(` for (unsigned i = 0; i < ${up}_OPENER_COUNT; i++) if ((int32_t)${up}_OPENERS[i][0] == c) { first_match = true; break; }`); + L.push(' if (first_match) {'); + L.push(' advance(lexer); // peek past the opener\'s first char'); + L.push(' int32_t c2 = lexer->lookahead;'); + L.push(' bool real = false;'); + L.push(` for (unsigned i = 0; i < ${up}_OPENER_COUNT; i++)`); + L.push(` if ((int32_t)${up}_OPENERS[i][0] == c && (${up}_OPENERS[i][1] == 0 || (int32_t)${up}_OPENERS[i][1] == c2)) { real = true; break; }`); + L.push(' if (real) break; // a real opener — token ends before it (mark_end frozen above)'); + L.push(' has_content = true; continue; // lone first char → literal content'); + L.push(' }'); + L.push(' if (c == \'\\\\\') { advance(lexer); if (lexer->lookahead != 0) advance(lexer); has_content = true; continue; }'); + L.push(' advance(lexer);'); + L.push(' has_content = true;'); + L.push(' }'); + L.push(' if (!has_content) return false;'); + L.push(` lexer->result_symbol = ${charsSym};`); + L.push(' return true;'); + L.push('}'); + L.push(''); + } + } + // ── 
scan() entry ── L.push('bool tree_sitter_' + grammarName + '_external_scanner_scan(void *payload, TSLexer *lexer,'); L.push(' const bool *valid_symbols) {'); @@ -1797,6 +1916,14 @@ function buildScannerC( L.push(' }'); L.push(''); } + for (const ip of ctx.interpolationPlans) { + const charsSym = ip.charsSnake.toUpperCase(); + L.push(` // ${ip.tokenName} interpolated-string literal text (whitespace inside is content, not skipped).`); + L.push(` if (valid_symbols[${charsSym}]) {`); + L.push(` if (scan_${ip.ruleSnake}_chars(lexer)) return true;`); + L.push(' }'); + L.push(''); + } L.push(' return false;'); L.push('}'); L.push(''); diff --git a/src/types.ts b/src/types.ts index fa41ff3..89b7de7 100644 --- a/src/types.ts +++ b/src/types.ts @@ -20,6 +20,7 @@ export interface TokenDecl { flags: string[]; scope?: string; // @scope(...) override escapePattern?: TokenPattern; // @escape pattern — escape sequence pattern (highlight only) + interpolation?: StringInterpolation[]; // highlight-only interpolation regions inside a string token (e.g. `${…}` / `$(…)`) escapeValidPattern?: TokenPattern; // one well-formed escape; engine-scanned tokens reject non-matching `\`-escapes (skipped in tag position) embed?: string; // @embed(lang) — embedded language scope name // ── Lexer hints (keep the engine language-agnostic; all optional) ── @@ -45,6 +46,21 @@ export interface TokenDecl { blockOnly?: boolean; } +/** + * A highlight-only interpolation region inside a string token (e.g. an env-spec `"…${expr}…"` + * or `"…$(expr)…"`). The lexer/parser stay token-based — these only tell the highlight + * generators (TextMate / Monarch / tree-sitter) to re-express the string as nested regions. + * `begin`/`end` are LITERAL delimiters (generators regex-escape them); scopes omit the language suffix (generators add it). + */ +export interface StringInterpolation { + begin: string; // LITERAL begin delimiter, NOT a regex (e.g. 
'${'); generators escape it as needed + end: string; // LITERAL end delimiter, NOT a regex (e.g. '}') + beginScope?: string; // delimiter scope for the opener (without language suffix) + endScope?: string; // delimiter scope for the closer (without language suffix) + contentScope?: string; // body / container scope (without language suffix) + include?: string; // TextMate include inside the body (default '$self') +} + /** Delimiters an interpolated template literal is made of (e.g. JS: `` ` ``, `${`, `}`). */ export interface TemplateDelimiters { open: string; // starts AND ends a template literal (e.g. '`') diff --git a/test/env-spec-regressions.ts b/test/env-spec-regressions.ts new file mode 100644 index 0000000..33b5be9 --- /dev/null +++ b/test/env-spec-regressions.ts @@ -0,0 +1,91 @@ +// Regression contracts for env-spec-style DSL grammars (originally PR #9, ported to the +// current token-pattern-IR API). These lock down two user-facing behaviors: +// 1. an escaped backtick string keeps backtick delimiters in TextMate (no `"` fallback) +// 2. an indentation grammar WITHOUT `indent.blockScalar` does not enforce YAML multiline +// quoted-scalar continuation rules (so `KEY="line1\nline2"` parses) +// +// Run with: node test/env-spec-regressions.ts +import { createParser } from '../src/gen-parser.ts'; +import { defineGrammar, many, opt, rule, token, seq, star, alt, lit, oneOf, noneOf, anyChar, never, range, plus, followedBy } from '../src/api.ts'; +import { generateTmLanguage } from '../src/gen-tm.ts'; + +let ok = 0; +let fail = 0; +const check = (label: string, cond: boolean) => { + if (cond) ok++; + else { fail++; console.log(` ✗ ${label}`); } +}; + +// --------------------------------------------------------------------------- +// Regression 1: escaped backtick strings keep backtick delimiters in TextMate. +// token pattern: `(?:\\.|[^`\\])*` escape: \\. 
+// --------------------------------------------------------------------------- +{ + const BT = token( + seq(lit('`'), star(alt(seq(lit('\\'), anyChar()), noneOf(oneOf('`', '\\')))), lit('`')), + { scope: 'string.quoted.other', string: true, escape: seq(lit('\\'), anyChar()) }, + ); + const File = rule(() => [[BT]]); + const grammar = defineGrammar({ name: 'backtick-string', tokens: { BT }, rules: { File }, entry: File }); + + const tm = generateTmLanguage(grammar, 'backtick-string'); + const btRepo = tm.repository.bt; + check('tm: backtick token repository entry exists', !!btRepo); + check('tm: backtick token begin delimiter is `', btRepo?.begin === '`'); + check('tm: backtick token end delimiter is `|$', btRepo?.end === '`|$'); +} + +// --------------------------------------------------------------------------- +// Regression 2: indentation grammars without blockScalar must NOT enforce YAML +// multiline quoted-scalar indentation rules. +// --------------------------------------------------------------------------- +{ + const WS = token(plus(oneOf(' ', '\t')), { skip: true }); + const INDENT = token(never(), {}); + const DEDENT = token(never(), {}); + const NEWLINE = token(never(), {}); + // KEY is `[A-Z_][A-Z0-9_]*` immediately followed by `=` (a lookahead). 
+ const KEY = token( + seq(oneOf(range('A', 'Z'), '_'), star(oneOf(range('A', 'Z'), range('0', '9'), '_')), followedBy(lit('='))), + { identifier: true }, + ); + const DQ = token( + seq(lit('"'), star(alt(seq(lit('\\'), anyChar()), noneOf(oneOf('"', '\\')))), lit('"')), + { string: true, escape: seq(lit('\\'), anyChar()) }, + ); + + const Value = rule(() => [[DQ]]); + const Statement = rule(() => [[KEY, '=', Value, opt(NEWLINE)]]); + const File = rule(() => [[many(Statement)]]); + + const grammar = defineGrammar({ + name: 'indent-no-blockscalar', + tokens: { WS, INDENT, DEDENT, NEWLINE, KEY, DQ }, + rules: { Value, Statement, File }, + indent: { + indentToken: 'INDENT', + dedentToken: 'DEDENT', + newlineToken: 'NEWLINE', + flowOpen: ['('], + flowClose: [')'], + }, + entry: File, + }); + + const parser = createParser(grammar); + let threw = false; + try { + // Regressed when YAML block-scalar continuation checks ran for ALL indentation grammars: KEY="a\nb" + parser.parse('KEY="line1\nline2"'); + } catch { + threw = true; + } + check('parser: multiline inline quoted value is accepted without blockScalar', !threw); +} + +console.log( + fail === 0 + ? `\n${ok}/${ok} env-spec regression checks pass` + : `\n${fail} FAILED (of ${ok + fail})`, +); +process.exit(fail === 0 ? 0 : 1); diff --git a/test/interpolation-metadata.ts b/test/interpolation-metadata.ts new file mode 100644 index 0000000..6e4b568 --- /dev/null +++ b/test/interpolation-metadata.ts @@ -0,0 +1,132 @@ +// Contract: token-level string `interpolation` metadata propagates to TextMate, Monarch, and +// tree-sitter (originally PR #9, ported to the current token-pattern-IR API). A `string` token +// declares interpolation regions (`${…}` / `$(…)`); each generator re-expresses them as nested +// regions / states / rules. `begin`/`end` are LITERAL delimiters (highlight-only), unaffected +// by the token IR. 
+// +// Run with: node test/interpolation-metadata.ts +import { defineGrammar, many, rule, token, seq, star, alt, lit, oneOf, noneOf, anyChar, range, plus } from '../src/api.ts'; +import { generateTmLanguage } from '../src/gen-tm.ts'; +import { generateMonarch } from '../src/gen-monarch.ts'; +import { generateTreeSitter } from '../src/gen-treesitter.ts'; +import { execFileSync } from 'node:child_process'; +import { mkdtempSync, writeFileSync, mkdirSync, existsSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +let ok = 0; +let fail = 0; +const check = (label: string, cond: boolean) => { + if (cond) ok++; + else { fail++; console.log(` ✗ ${label}`); } +}; + +const WS = token(plus(oneOf(' ', '\t')), { skip: true }); +const NL = token(seq(star(lit('\r')), lit('\n')), { skip: true }); +const KEY = token(seq(oneOf(range('A', 'Z'), '_'), star(oneOf(range('A', 'Z'), range('0', '9'), '_'))), { identifier: true }); +const DQ = token( + seq(lit('"'), star(alt(seq(lit('\\'), anyChar()), noneOf(oneOf('"', '\\')))), lit('"')), + { + string: true, + escape: seq(lit('\\'), anyChar()), + interpolation: [ + { + begin: '${', + end: '}', + beginScope: 'punctuation.definition.interpolation.begin', + endScope: 'punctuation.definition.interpolation.end', + contentScope: 'variable.function', + }, + { + begin: '$(', + end: ')', + beginScope: 'punctuation.definition.interpolation.begin', + endScope: 'punctuation.definition.interpolation.end', + contentScope: 'variable.function', + }, + ], + }, +); + +const Value = rule(() => [[DQ]]); +const Line = rule(() => [[KEY, '=', Value]]); +const File = rule(() => [[many(Line)]]); + +const grammar = defineGrammar({ + name: 'interpolation-metadata', + tokens: { WS, NL, KEY, DQ }, + rules: { Value, Line, File }, + scopes: { 'keyword.operator.assignment': ['='] }, + entry: File, +}); + +// ── TextMate generation ── +const tm = generateTmLanguage(grammar, 'interpolation-metadata'); +const dqTm = 
tm.repository.dq; +check('tm: DQ repository entry exists', !!dqTm); +check('tm: DQ ${ interpolation begin is the escaped literal', JSON.stringify(dqTm).includes('"begin":"\\\\$\\\\{"')); +check('tm: DQ $( interpolation begin is the escaped literal', JSON.stringify(dqTm).includes('"begin":"\\\\$\\\\("')); +check('tm: interpolation begin scope emitted', JSON.stringify(dqTm).includes('punctuation.definition.interpolation.begin.interpolation-metadata')); +check('tm: interpolation end scope emitted', JSON.stringify(dqTm).includes('punctuation.definition.interpolation.end.interpolation-metadata')); + +// ── Monarch generation ── +const monarch = generateMonarch(grammar); +const bodyStateName = Object.keys(monarch.tokenizer).find(s => s.startsWith('string_dquote_body')); +check('monarch: has double-quote string body state', !!bodyStateName); +const bodyRules = bodyStateName ? monarch.tokenizer[bodyStateName] : []; +check('monarch: body has ${ interpolation begin rule', bodyRules.some(r => Array.isArray(r) && r[0] === '\\$\\{')); +check('monarch: body has $( interpolation begin rule', bodyRules.some(r => Array.isArray(r) && r[0] === '\\$\\(')); +check('monarch: creates interpolation state', Object.keys(monarch.tokenizer).some(s => s.startsWith('string_interp_dquote_'))); + +// ── Tree-sitter generation ── +const ts = generateTreeSitter(grammar, 'interpolation-metadata'); +check('treesitter: re-emits DQ token as rule', ts.grammarJs.includes('dq: $ => seq(')); +check('treesitter: emits first interpolation rule', ts.grammarJs.includes('dq_interpolation_1')); +check('treesitter: emits second interpolation rule', ts.grammarJs.includes('dq_interpolation_2')); +check('treesitter: scanner has dq chars scan fn', ts.scannerC.includes('scan_dq_chars')); +check('treesitter: scanner openers include ${', ts.scannerC.includes('"${"')); +check('treesitter: scanner openers include $(', ts.scannerC.includes('"$("')); +check('treesitter: highlights capture interpolation punctuation', 
ts.highlightsScm.includes('(dq_interpolation_1 "${") @punctuation.special')); +check('treesitter: highlights capture interpolation punctuation for $(', ts.highlightsScm.includes('(dq_interpolation_2 "$(") @punctuation.special')); + +// ── Optional: real tree-sitter CLI — generate + parse proves scanner.c COMPILES and that the +// `dq_chars` external + interpolation rules actually tokenize an interpolated string. ── +const tsBin = join(process.cwd(), 'node_modules', '.bin', 'tree-sitter'); +if (existsSync(tsBin)) { + console.log('\ntree-sitter CLI found — generating + parsing an interpolated string…'); + const dir = mkdtempSync(join(tmpdir(), 'monogram-interp-')); + mkdirSync(join(dir, 'src'), { recursive: true }); + mkdirSync(join(dir, 'queries'), { recursive: true }); + writeFileSync(join(dir, 'grammar.js'), ts.grammarJs); + writeFileSync(join(dir, 'src', 'scanner.c'), ts.scannerC); + writeFileSync(join(dir, 'queries', 'highlights.scm'), ts.highlightsScm); + writeFileSync(join(dir, 'package.json'), JSON.stringify({ name: 'tree-sitter-interp-monogram', version: '0.0.0' })); + let generated = false; + try { + execFileSync(tsBin, ['generate'], { cwd: dir, stdio: 'pipe' }); + generated = true; + } catch (e: any) { + console.log(' generate failed:', ((e.stderr || e.message || '') + '').split('\n').slice(0, 8).join('\n ')); + } + check('tree-sitter generate succeeds (interpolation rules + scanner consistent)', generated); + if (generated) { + // `a` and `b` are dq_chars runs; `${}` / `$()` are interpolation regions (empty holes — the + // tiny grammar has no expression rule, so blank() — which still exercises the scanner stops). 
+ writeFileSync(join(dir, 'in.env'), 'A="a${}b$()c"\n'); + let tree = ''; + try { tree = execFileSync(tsBin, ['parse', 'in.env'], { cwd: dir, encoding: 'utf8' }); } + catch (e: any) { tree = ((e.stdout || '') + '\n' + (e.stderr || '')); } + check('parse: both interpolation regions present, no ERROR', + tree.includes('dq_interpolation_1') && tree.includes('dq_interpolation_2') && !tree.includes('ERROR')); + } + console.log(` (artifacts in ${dir})`); +} else { + console.log('\ntree-sitter CLI not found — structural validation only (not a failure).'); +} + +console.log( + fail === 0 + ? `\n${ok}/${ok} interpolation-metadata checks pass` + : `\n${fail} FAILED (of ${ok + fail})`, +); +process.exit(fail === 0 ? 0 : 1);