diff --git a/src/api.ts b/src/api.ts index c2218c0..ecd3505 100644 --- a/src/api.ts +++ b/src/api.ts @@ -1,4 +1,4 @@ -import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, TokenPattern } from './types.ts'; +import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, NewlineConfig, TokenPattern } from './types.ts'; import { altPattern, anyChar, followedBy, isTokenPattern, lit, never, noneOf, notFollowedBy, notPrecededBy, oneOf, optPattern, plus, precededBy, range, repeat, @@ -381,6 +381,7 @@ interface GrammarConfig { entry: RuleRef; markup?: MarkupConfig; // opt-in markup-mode tokenization (HTML/Vue) indent?: IndentConfig; // opt-in indentation-sensitive tokenization (YAML) + newline?: NewlineConfig; // opt-in NEWLINE-sensitive tokenization, independent of indent (no indent stack) expression?: RuleRef; // the rule that produces an EXPRESSION; enables a derived `#expression` sub-grammar (expression-only embeds) aliasScopes?: { scope: string; file: string }[]; // extra grammars re-exposing this one under another scopeName (e.g. text.html.derivative) canonicalRepoNames?: Record; // official repo KEY NAME → structural key(s) for the SAME construct; gen-tm RENAMES the structural key (or synthesises a union wrapper) to emit the official name natively (the 限制器; see CstGrammar.canonicalRepoNames) @@ -388,6 +389,11 @@ interface GrammarConfig { } export function defineGrammar(config: GrammarConfig): CstGrammar & { name: string; scopeName?: string } { + // `indent` is the richer layer built on top of newline-significant line boundaries, so the two + // modes are mutually exclusive — declaring both is a configuration error, not a merge. 
+ if (config.indent && config.newline) { + throw new Error('A grammar may declare `indent` OR `newline`, not both — `indent` already implies newline-significant line boundaries.'); + } const names = new Map(); for (const [name, tok] of Object.entries(config.tokens)) { names.set(tok, name); @@ -453,5 +459,5 @@ export function defineGrammar(config: GrammarConfig): CstGrammar & { name: strin } } - return { name: config.name, scopeName: config.scopeName, tokens, precs, rules, scopeOverrides, markup: config.markup, indent: config.indent, expressionRule: config.expression ? names.get(config.expression) : undefined, aliasScopes: config.aliasScopes, canonicalRepoNames: config.canonicalRepoNames, manifest: config.manifest }; + return { name: config.name, scopeName: config.scopeName, tokens, precs, rules, scopeOverrides, markup: config.markup, indent: config.indent, newline: config.newline, expressionRule: config.expression ? names.get(config.expression) : undefined, aliasScopes: config.aliasScopes, canonicalRepoNames: config.canonicalRepoNames, manifest: config.manifest }; } diff --git a/src/emit-parser.ts b/src/emit-parser.ts index fec1d90..a9bbd7f 100644 --- a/src/emit-parser.ts +++ b/src/emit-parser.ts @@ -635,6 +635,7 @@ export function emitParser(grammar: CstGrammar): string { rules: [{ name: '$lits', body: litRuleBody, flags: [] }], markup: grammar.markup, indent: grammar.indent, + newline: grammar.newline, scopeOverrides: [], }; diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts index 7872a20..d827de9 100644 --- a/src/gen-lexer.ts +++ b/src/gen-lexer.ts @@ -185,16 +185,24 @@ export function createLexer(grammar: CstGrammar) { // → null, and a `tagOpen` always opens a tag (legacy behaviour, unchanged for other grammars). const tagOpenAfterRe = markup?.tagOpenAfter ? 
new RegExp('[' + markup.tagOpenAfter + ']') : null; - // ── Indentation mode (opt-in; dormant unless the grammar declares `indent`) ── + // ── Indentation / newline mode (opt-in; dormant unless the grammar declares `indent` or `newline`) ── // Like markup, the INDENT/DEDENT/NEWLINE tokens are EMITTED by a state machine (not matched // by a regex) — so they are skipped in the regex loop and their grammar patterns are // placeholders. Indentation is suspended inside flow delimiters via a flow-depth counter. + // `newline` is the line-boundary + flow-suspension LAYER that `indent` builds on: an indent + // grammar gets the full stack + INDENT/DEDENT/NEWLINE; a newline-only grammar emits just the + // NEWLINE token at each significant line boundary (no stack). `lineSensitive` gates the shared + // machinery; `indent`/`newline` are mutually exclusive (defineGrammar rejects declaring both). const indent = grammar.indent; + const newline = grammar.newline; + const lineSensitive = !!indent || !!newline; + const lineComment = (indent ?? newline)?.comment; // line-comment introducer (both modes skip comment-only lines) const indentTokenNames = new Set( - indent ? ([indent.indentToken, indent.dedentToken, indent.newlineToken].filter(Boolean) as string[]) : [], + indent ? ([indent.indentToken, indent.dedentToken, indent.newlineToken].filter(Boolean) as string[]) + : newline ? [newline.token] : [], ); - const flowOpenSet = new Set(indent?.flowOpen ?? []); - const flowCloseSet = new Set(indent?.flowClose ?? []); + const flowOpenSet = new Set((indent ?? newline)?.flowOpen ?? []); + const flowCloseSet = new Set((indent ?? newline)?.flowClose ?? []); // String-literal token names (the `string`-flagged tokens — quoted scalars in YAML). 
Used by the // flow mapping-separator guard below: a quoted scalar can never run past its closing quote, so a // `:` immediately after one (inside flow) is ALWAYS the mapping `key: value` separator, never the @@ -416,7 +424,7 @@ export function createLexer(grammar: CstGrammar) { && tagOpenAfterRe.test(source[i + markup!.tagOpen.length])); // Indentation state — active only when `indent` is declared (dormant otherwise). let flowDepth = 0; // >0 while inside flow delimiters ([ ] { }) → indentation suspended - let lineStart = !!indent; // at a block-context line boundary (file start counts as one) + let lineStart = lineSensitive; // at a block-context line boundary (file start counts as one) let emittedContent = false; // any real (non-structural) token emitted yet — suppress a leading NEWLINE/DEDENT let currentLineCol = 0; // leading-space column of the current logical line (bounds block scalars) let atLineLead = false; // the next emitted token is the FIRST content token of its line (compact-indicator probe) @@ -446,7 +454,7 @@ export function createLexer(grammar: CstGrammar) { if (pendingComment) { t.commentBefore = true; pendingComment = false; } if (pendingMultilineFlow) { t.multilineFlowBefore = true; pendingMultilineFlow = false; } tokens.push(t); - if (indent) { + if (lineSensitive) { if (!indentTokenNames.has(t.type)) { emittedContent = true; // a real token (not INDENT/DEDENT/NEWLINE) atLineLead = false; // line-lead consumed once a real token lands @@ -456,7 +464,8 @@ export function createLexer(grammar: CstGrammar) { // Entering the OUTERMOST flow (0→1): if it opens right after a `:`/`-` block indicator, // it is a block VALUE/ITEM → arm the §7.4 indent rule with n = the current block column // (the indent-stack top). Anywhere else (top-level / after `,` / as a key) the rule is OFF. - if (flowDepth === 0) { + // The §7.4 / multi-line-flow bookkeeping is indent-only (a newline grammar has no stack). 
+ if (flowDepth === 0 && indent) { const prevTok = tokens[tokens.length - 2]; // the token before this just-pushed open flowValueIndent = (prevTok && prevTok.type === '' && (prevTok.text === ':' || prevTok.text === '-')) ? indentStack[indentStack.length - 1] : -1; @@ -465,7 +474,7 @@ export function createLexer(grammar: CstGrammar) { flowDepth++; } else if (flowCloseSet.has(t.text)) { flowDepth = Math.max(0, flowDepth - 1); - if (flowDepth === 0) { + if (flowDepth === 0 && indent) { flowValueIndent = -1; if (flowSawNewline) pendingMultilineFlow = true; // a multi-line flow just closed → flag the next token flowSawNewline = false; @@ -566,7 +575,7 @@ export function createLexer(grammar: CstGrammar) { // ── Indentation mode: at a block-context line start, skip blank/comment lines, measure // the next content line's leading-space column, and emit NEWLINE / INDENT / DEDENT(s) // before that line's tokens (relative to the indentation stack). ── - if (indent && flowDepth === 0 && lineStart) { + if (lineSensitive && flowDepth === 0 && lineStart) { let p = pos, col = 0; while (p < source.length && source[p] === ' ') { p++; col++; } const ch = source[p]; @@ -600,18 +609,26 @@ export function createLexer(grammar: CstGrammar) { // (its emitted NEWLINE/DEDENT either reject in value position or are harmless between // siblings — matching the `yaml` oracle, which is context-sensitive there). We reject only // the structural case, so no valid leaf-continuation is mis-rejected. 
- if (ch === '\t') { + if (indent && ch === '\t') { // §6.1 tab-in-indentation error is YAML-specific (newline mode has no stack) let q = p; while (q < source.length && (source[q] === ' ' || source[q] === '\t')) q++; const after = source[q]; if (q < source.length && after !== '\n' && after !== '\r' && startsBlockStructuralNode(source, q)) { throw new Error(`Tab character used in indentation at offset ${p}`); } } - if (indent.comment && source.startsWith(indent.comment, p)) { // comment-only line — ignored + if (lineComment && source.startsWith(lineComment, p)) { // comment-only line — ignored let e = p; while (e < source.length && source[e] !== '\n') e++; pos = e; pendingComment = true; continue; // next iteration consumes the newline } pos = p; // consume the leading indentation + // ── newline-only mode: no indent stack — emit ONE NEWLINE at this real line boundary (a + // leading boundary before any content is suppressed via emittedContent) and move on. ── + if (!indent) { + if (emittedContent) push({ type: newline!.token, text: '', offset: pos }); + lineStart = false; + atLineLead = true; + continue; + } currentLineCol = col; // bounds a block scalar started on this line const top = indentStack[indentStack.length - 1]; if (col > top) { @@ -664,10 +681,10 @@ export function createLexer(grammar: CstGrammar) { continue; } - // Whitespace. In indentation mode, inline spaces/tabs are skipped but a NEWLINE is a - // block-context line boundary (sets lineStart so the routine above runs next) — except - // inside flow delimiters, where newlines are insignificant. Otherwise skip any run. - if (indent) { + // Whitespace. In an indentation / newline grammar, inline spaces/tabs are skipped but a + // NEWLINE is a block-context line boundary (sets lineStart so the routine above runs next) — + // except inside flow delimiters, where newlines are insignificant. Otherwise skip any run. 
+ if (lineSensitive) { const c = source[pos]; if (c === ' ' || c === '\t') { // A TAB between a block indicator (`-`/`?`/map-`:`) and a NESTED block-structural node it @@ -677,7 +694,7 @@ export function createLexer(grammar: CstGrammar) { // separation, so the structural sniff gates it. After a `:` a node PROPERTY is the inline // value (`key:\t&a x` is legal), so the `:` case excludes properties (allowProperty=false) // while `-`/`?` include them (`-\t&a x` IS an error). Block context only (flowDepth===0). - if (flowDepth === 0) { + if (indent && flowDepth === 0) { // §6.1 tab-after-indicator error is YAML-specific const prev = tokens[tokens.length - 1]; const isIndicator = prev && prev.type === '' && (prev.text === '-' || prev.text === '?' || prev.text === ':'); if (isIndicator) { @@ -692,7 +709,7 @@ export function createLexer(grammar: CstGrammar) { if (c === '\n' || c === '\r') { pos++; if (c === '\r' && source[pos] === '\n') pos++; if (flowDepth === 0) lineStart = true; - else { + else if (indent) { flowSawNewline = true; // this outermost flow spans >1 line → it can't be an implicit block key // §7.4: inside a value/item-position flow, a CONTENT line must be indented MORE than the // enclosing block column `n` (flowValueIndent). The indentation column is the leading-SPACE diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index 2635c1a..b29300e 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -541,6 +541,10 @@ function planTemplate(grammar: CstGrammar): TemplatePlan | null { /** Determine which tokens the external scanner must provide. */ function planScannerTokens(grammar: CstGrammar): Map { const map = new Map(); + // A newline-sensitive grammar's NEWLINE token is engine-emitted; in tree-sitter it becomes a + // stateless external token (the scanner emits it at each significant line boundary). Listed + // FIRST so it heads the enum / externals order. 
+ if (grammar.newline) map.set(grammar.newline.token, toSnake(grammar.newline.token)); // The regex token: '/' is context-sensitive (regex vs division). The scanner // resolves it. const regexTok = grammar.tokens.find(t => t.flags.includes('regex')); @@ -1625,6 +1629,26 @@ function buildScannerC( L.push('static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }'); L.push(''); + const nl = grammar.newline; + if (nl) { + const nlSym = ctx.scannerTokenFor.get(nl.token)!.toUpperCase(); + L.push('// ── Newline scan ────────────────────────────────────────────────'); + L.push('// A newline-sensitive grammar emits one NEWLINE token at each significant line'); + L.push('// boundary. tree-sitter only asks for it where the grammar permits it (statement'); + L.push('// boundaries); inside flow delimiters the rules never reference NEWLINE, so'); + L.push('// valid_symbols[NEWLINE] is false there and the line break falls through to'); + L.push('// `extras` as ordinary whitespace. Stateless: one line break (\\n / \\r / \\r\\n) per token.'); + L.push('static bool scan_newline(TSLexer *lexer) {'); + L.push(' if (lexer->lookahead == \'\\r\') { advance(lexer); if (lexer->lookahead == \'\\n\') advance(lexer); }'); + L.push(' else if (lexer->lookahead == \'\\n\') advance(lexer);'); + L.push(' else return false;'); + L.push(` lexer->result_symbol = ${nlSym};`); + L.push(' lexer->mark_end(lexer);'); + L.push(' return true;'); + L.push('}'); + L.push(''); + } + if (regexTok) { // Derive the regex literal scan from the token pattern + hints. const flagChars = tokenPatternTrailingCharClass(regexTok) ?? 
'gimsuyd'; @@ -1734,6 +1758,14 @@ function buildScannerC( L.push(' const bool *valid_symbols) {'); L.push(' (void)payload;'); L.push(''); + if (grammar.newline) { + const nlSym = ctx.scannerTokenFor.get(grammar.newline.token)!.toUpperCase(); + L.push(' // Newline first: a significant line boundary outranks every other external token.'); + L.push(` if (valid_symbols[${nlSym}] && (lexer->lookahead == '\\n' || lexer->lookahead == '\\r')) {`); + L.push(' if (scan_newline(lexer)) return true;'); + L.push(' }'); + L.push(''); + } if (tp && regexTok) { const charsSym = tp.charsSnake.toUpperCase(); const regexSym = ctx.scannerTokenFor.get(regexTok.name)!.toUpperCase(); diff --git a/src/types.ts b/src/types.ts index d0dff86..fa41ff3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -329,6 +329,25 @@ export interface IndentConfig { }; } +/** + * Opt-in NEWLINE-sensitive tokenization, INDEPENDENT of `indent`. For grammars that are + * newline-aware but NOT indentation-aware — statements are line-delimited, but nesting is via + * delimiters / expressions, not indentation (e.g. dotenv-style env specs). The lexer emits a single + * NEWLINE token at each significant line boundary (suppressed inside flow delimiters, and on blank / + * comment-only lines), with NO indent stack and NO INDENT/DEDENT tokens. `indent` is the richer + * layer built ON TOP of this same line-boundary + flow-suspension machinery (indent = newline + + * indent stack + YAML block-scalar semantics), so declaring BOTH is rejected. The NEWLINE token is + * engine-emitted (declared with a placeholder `never()` pattern and named here), exactly like the + * indent tokens. ABSENT for token-stream / indentation languages → dormant, tokenization + * byte-identical. + */ +export interface NewlineConfig { + token: string; // token TYPE emitted at each significant line boundary (engine-emitted, like the indent tokens) + flowOpen?: string[]; // punctuation that SUSPENDS newline significance while open (e.g. 
['(', '[', '{']) + flowClose?: string[]; // matching closers (e.g. [')', ']', '}']) + comment?: string; // line-comment introducer; a comment-only line emits no NEWLINE (e.g. '#') +} + export interface PrecOperator { value: string; position: 'infix' | 'prefix' | 'postfix'; @@ -391,6 +410,7 @@ export interface CstGrammar { scopeName?: string; // declared TextMate scope name (e.g. source.ts); its suffix drives every scope's language tag markup?: MarkupConfig; // opt-in markup-mode tokenization (HTML/Vue); absent for token-stream languages indent?: IndentConfig; // opt-in indentation-sensitive tokenization (YAML); absent → byte-identical token stream + newline?: NewlineConfig; // opt-in NEWLINE-sensitive tokenization, independent of indent (no indent stack); absent → byte-identical token stream expressionRule?: string; // name of the rule that produces an EXPRESSION; lets gen-tm derive a `#expression` sub-grammar (for expression-only embeds, e.g. Vue `{{ }}`) // Extra TextMate grammars that just RE-EXPOSE this one under another scopeName (thin // `{scopeName, patterns:[{include: }]}` wrappers). HTML declares diff --git a/test/newline-mode.ts b/test/newline-mode.ts new file mode 100644 index 0000000..d5235b7 --- /dev/null +++ b/test/newline-mode.ts @@ -0,0 +1,171 @@ +// Regression test for issue #10 — a `newline`-sensitive mode INDEPENDENT of `indent`. +// +// A tiny dotenv / env-spec-flavoured grammar (KEY=value statements, one per line; a +// value is a scalar or a function call whose args may span lines INSIDE `( … )`; `#` +// line comments) exercises the LAYERED newline machinery: the lexer emits a single +// NEWLINE token at each significant line boundary, with NO indent stack and NO +// INDENT/DEDENT tokens, and all four backends (parser / TextMate / Monarch / +// tree-sitter) stay coherent. The grammar is defined INLINE (like test/agnostic.ts) — +// no new language file is added to the repo. 
+// +// Run with: node test/newline-mode.ts +import { execFileSync } from 'node:child_process'; +import { mkdtempSync, writeFileSync, mkdirSync, existsSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import vm from 'node:vm'; +import { token, rule, defineGrammar, many, opt, sep, seq, plus, oneOf, range, star, noneOf, never } from '../src/api.ts'; +import { createLexer } from '../src/gen-lexer.ts'; +import { createParser } from '../src/gen-parser.ts'; +import { generateTmLanguage } from '../src/gen-tm.ts'; +import { generateMonarch } from '../src/gen-monarch.ts'; +import { generateTreeSitter } from '../src/gen-treesitter.ts'; +import type { NewlineConfig } from '../src/types.ts'; + +let ok = 0, fail = 0; +const check = (label: string, cond: boolean) => { if (cond) ok++; else { fail++; console.log(' ✗', label); } }; + +// ── A minimal newline-aware, NON-indent grammar (env-spec flavour) ── +const Newline = token(never(), {}); // engine-emitted at each significant line boundary +const Ident = token(plus(oneOf(range('a', 'z'), range('A', 'Z'), range('0', '9'), '_')), { identifier: true }); +const Comment = token(seq('#', star(noneOf('\n'))), { skip: true }); + +const Value = rule(($: any) => [Ident, [Ident, '(', sep($, ','), ')']]); // scalar OR call (call args may span lines) +const Stmt = rule(() => [[Ident, '=', Value]]); +const Program = rule(() => [[opt(Stmt), many(Newline, opt(Stmt))]]); + +const newline: NewlineConfig = { token: 'Newline', flowOpen: ['('], flowClose: [')'], comment: '#' }; +const g = defineGrammar({ + name: 'envspec', scopeName: 'source.envspec', + tokens: { Comment, Ident, Newline }, + rules: { Value, Stmt, Program }, entry: Program, + newline, +}); + +// ── 0. 
defineGrammar rejects declaring both indent and newline ── +// The config below is otherwise complete (Value is registered too) and the catch matches the guard's own message, so an unrelated defineGrammar failure cannot satisfy this check. +let bothRejected = false; +try { + defineGrammar({ + name: 'bad', tokens: { Ident, Newline }, rules: { Value, Stmt, Program }, entry: Program, + newline, indent: { indentToken: 'X', dedentToken: 'Y', newlineToken: 'Newline' }, + }); +} catch (e) { bothRejected = e instanceof Error && e.message.includes('`indent` OR `newline`'); } +check('defineGrammar rejects declaring BOTH indent and newline', bothRejected); + +// ── 1. Lexer: NEWLINE emission, flow suspension, blank/comment lines, NO indent tokens ── +const { tokenize } = createLexer(g); +const countNL = (s: string) => tokenize(s).filter(t => t.type === 'Newline').length; +const hasIndentTokens = (s: string) => tokenize(s).some(t => t.type === 'Indent' || t.type === 'Dedent'); + +check('two statements separated by exactly one NEWLINE', countNL('A=1\nB=2') === 1); +check('never emits INDENT/DEDENT (newline ≠ indent)', !hasIndentTokens('A=1\nB=2\nC=3')); +check('blank lines collapse to a single NEWLINE', countNL('A=1\n\n\nB=2') === 1); +check('comment-only line is not a separator', countNL('A=1\n# note\nB=2') === 1); +check('newline INSIDE flow ( … ) is suspended', countNL('A=fn(1,\n2)') === 0); +check('flow value still produces no indent tokens', !hasIndentTokens('A=fn(1,\n2)')); +check('leading boundary suppressed (no NEWLINE before first content)', tokenize('\n\nA=1')[0]?.type !== 'Newline'); + +// ── 2. 
Parser: accepts line-delimited / flow-spanning input, rejects malformed ── +const { parse } = createParser(g); +const accepts = (s: string) => { try { return parse(s).kind === 'node'; } catch { return false; } }; +check('accepts a single statement', accepts('A=1')); +check('accepts newline-separated statements', accepts('A=1\nB=2')); +check('accepts a trailing newline', accepts('A=1\n')); +check('accepts a comment line between statements', accepts('A=1\n# c\nB=2')); +check('accepts a function-call value spanning lines in ( … )', accepts('A=fn(1,\n2)\nB=3')); +check('rejects a statement with no `=`', !accepts('A B')); +check('rejects a statement with no value', !accepts('A=')); + +// ── 3. TextMate: generates without error; the NEWLINE never() token yields only an inert never-match `(?!)` rule ── +const tm = generateTmLanguage(g, 'envspec'); +check('TextMate grammar has a non-empty repository', !!tm.repository && Object.keys(tm.repository).length > 0); +check('TextMate grammar has patterns', Array.isArray(tm.patterns) && tm.patterns.length > 0); +check('TextMate: NEWLINE is an invisible never-match (?!) rule (same convention as YAML indent tokens)', tm.repository.newline?.match === '(?!)'); + +// ── 4. Monarch: generates without error ── +const mon = generateMonarch(g); +check('Monarch tokenizer has a root state', !!mon.tokenizer && !!mon.tokenizer.root); + +// ── 5. 
tree-sitter: NEWLINE is a stateless external token; coherent grammar.js + scanner.c ── +const { grammarJs, scannerC, highlightsScm, externalTokens } = generateTreeSitter(g, 'envspec'); +check('tree-sitter declares externals', /externals:\s*\$ =>/.test(grammarJs)); +check('tree-sitter externalTokens include newline', externalTokens.includes('newline')); +check('grammar rules reference $.newline (external) as separator', grammarJs.includes('$.newline')); +check('scanner.c declares the NEWLINE enum', /enum TokenType\s*\{[^}]*\bNEWLINE\b/.test(scannerC)); +check('scanner.c implements scan_newline', scannerC.includes('scan_newline')); +check('scanner.c dispatches NEWLINE in scan()', scannerC.includes('valid_symbols[NEWLINE]')); + +// grammar.js must parse & execute as real JS through stubbed tree-sitter DSL globals. +// Returns { ok: true } when the script runs to completion, otherwise { ok: false, err } carrying the failure text. +function grammarExecutes(src: string): { ok: boolean; err?: string } { + const mk = (name: string) => (...args: unknown[]) => ({ type: name, args }); + const prec = Object.assign((...a: unknown[]) => mk('prec')(...a), { + left: (...a: unknown[]) => mk('prec.left')(...a), + right: (...a: unknown[]) => mk('prec.right')(...a), + }); + const sandbox: Record<string, unknown> = { + module: { exports: {} as { rules?: Record<string, unknown> } }, + grammar: (def: any) => { + const $ = new Proxy({}, { get: (_t, k) => ({ type: 'ref', name: String(k) }) }); + for (const fn of Object.values(def.rules)) (fn as (x: unknown) => unknown)($); + for (const k of ['extras', 'word', 'externals', 'conflicts']) if (def[k]) def[k]($); + return def; + }, + seq: mk('seq'), choice: mk('choice'), optional: mk('optional'), + repeat: mk('repeat'), repeat1: mk('repeat1'), token: mk('token'), + field: mk('field'), blank: mk('blank'), prec, + }; + vm.createContext(sandbox); + try { vm.runInContext(src, sandbox, { filename: 'grammar.js' }); return { ok: true }; } + // Read `message` structurally: an error raised inside the vm context is not `instanceof` this realm's Error. + catch (e: unknown) { return { ok: false, err: String((e as { message?: unknown } | null | undefined)?.message ?? e) }; } +} +const exec = grammarExecutes(grammarJs); +check(`tree-sitter grammar.js parses & executes${exec.err 
? ' (' + exec.err + ')' : ''}`, exec.ok); + +// ── 6. Optional: real tree-sitter CLI — generate + parse proves scanner.c COMPILES +// and that NEWLINE fires at boundaries but is suppressed inside flow ( … ). ── +const tsBin = join(process.cwd(), 'node_modules', '.bin', 'tree-sitter'); +function hasCli(): boolean { + if (!existsSync(tsBin)) return false; + try { execFileSync(tsBin, ['--version'], { stdio: 'ignore' }); return true; } + catch { return false; } +} +if (hasCli()) { + console.log('\ntree-sitter CLI found — generating + parsing to validate scanner.c…'); + const dir = mkdtempSync(join(tmpdir(), 'monogram-nl-')); + mkdirSync(join(dir, 'src'), { recursive: true }); + mkdirSync(join(dir, 'queries'), { recursive: true }); + writeFileSync(join(dir, 'grammar.js'), grammarJs); + writeFileSync(join(dir, 'src', 'scanner.c'), scannerC); + writeFileSync(join(dir, 'queries', 'highlights.scm'), highlightsScm); + writeFileSync(join(dir, 'package.json'), JSON.stringify({ name: 'tree-sitter-envspec-monogram', version: '0.0.0' }, null, 2)); + let generated = false; + try { + execFileSync(tsBin, ['generate'], { cwd: dir, stdio: 'pipe' }); + generated = true; + } catch (e: any) { + console.log(' generate failed:', ((e.stderr || e.message || '') + '').split('\n').slice(0, 8).join('\n ')); + } + check('tree-sitter generate succeeds (externals/scanner consistent)', generated); + if (generated) { + const parseTree = (input: string) => { + writeFileSync(join(dir, 'in.env'), input); + // `tree-sitter parse` auto-builds (compiles parser.c + scanner.c) then parses; + // it exits non-zero when the tree contains an ERROR but still prints the tree. 
+ try { return execFileSync(tsBin, ['parse', 'in.env'], { cwd: dir, encoding: 'utf8' }); } + catch (e: any) { return ((e.stdout || '') + '\n' + (e.stderr || '')); } + }; + const t1 = parseTree('A=1\nB=2\n'); + check('parse: NEWLINE node present between statements, no ERROR', t1.includes('newline') && !t1.includes('ERROR')); + const t2 = parseTree('A=fn(1,\n2)\nB=3\n'); + // fn(1,\n2) must parse with NO newline node inside it (flow suspends the line break); + // the only two newline nodes are the A→B separator and the trailing line break. + check('parse: flow-internal newline suppressed (no ERROR; newlines only at statement boundaries)', + !t2.includes('ERROR') && (t2.match(/\(newline /g) ?? []).length === 2); + } + console.log(` (artifacts in ${dir})`); +} else { + console.log('\ntree-sitter CLI not found — structural validation only (not a failure).'); +} + +console.log(fail === 0 ? `\n${ok}/${ok} newline-mode checks pass` : `\n${fail} of ${ok + fail} FAILED`); +process.exit(fail === 0 ? 0 : 1);