Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions src/api.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, TokenPattern } from './types.ts';
import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, NewlineConfig, TokenPattern } from './types.ts';
import {
altPattern, anyChar, followedBy, isTokenPattern, lit, never, noneOf, notFollowedBy,
notPrecededBy, oneOf, optPattern, plus, precededBy, range, repeat,
Expand Down Expand Up @@ -381,13 +381,19 @@ interface GrammarConfig {
entry: RuleRef;
markup?: MarkupConfig; // opt-in markup-mode tokenization (HTML/Vue)
indent?: IndentConfig; // opt-in indentation-sensitive tokenization (YAML)
newline?: NewlineConfig; // opt-in NEWLINE-sensitive tokenization, independent of indent (no indent stack)
expression?: RuleRef; // the rule that produces an EXPRESSION; enables a derived `#expression` sub-grammar (expression-only embeds)
aliasScopes?: { scope: string; file: string }[]; // extra grammars re-exposing this one under another scopeName (e.g. text.html.derivative)
canonicalRepoNames?: Record<string, string | string[]>; // official repo KEY NAME → structural key(s) for the SAME construct; gen-tm RENAMES the structural key (or synthesises a union wrapper) to emit the official name natively (the 限制器; see CstGrammar.canonicalRepoNames)
manifest?: import('./types.ts').ContributesManifest; // VS Code `contributes` packaging (emits a pasteable snippet)
}

export function defineGrammar(config: GrammarConfig): CstGrammar & { name: string; scopeName?: string } {
// `indent` is the richer layer built on top of newline-significant line boundaries, so the two
// modes are mutually exclusive — declaring both is a configuration error, not a merge.
if (config.indent && config.newline) {
throw new Error('A grammar may declare `indent` OR `newline`, not both — `indent` already implies newline-significant line boundaries.');
}
const names = new Map<object, string>();
for (const [name, tok] of Object.entries(config.tokens)) {
names.set(tok, name);
Expand Down Expand Up @@ -453,5 +459,5 @@ export function defineGrammar(config: GrammarConfig): CstGrammar & { name: strin
}
}

return { name: config.name, scopeName: config.scopeName, tokens, precs, rules, scopeOverrides, markup: config.markup, indent: config.indent, expressionRule: config.expression ? names.get(config.expression) : undefined, aliasScopes: config.aliasScopes, canonicalRepoNames: config.canonicalRepoNames, manifest: config.manifest };
return { name: config.name, scopeName: config.scopeName, tokens, precs, rules, scopeOverrides, markup: config.markup, indent: config.indent, newline: config.newline, expressionRule: config.expression ? names.get(config.expression) : undefined, aliasScopes: config.aliasScopes, canonicalRepoNames: config.canonicalRepoNames, manifest: config.manifest };
}
1 change: 1 addition & 0 deletions src/emit-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,7 @@ export function emitParser(grammar: CstGrammar): string {
rules: [{ name: '$lits', body: litRuleBody, flags: [] }],
markup: grammar.markup,
indent: grammar.indent,
newline: grammar.newline,
scopeOverrides: [],
};

Expand Down
51 changes: 34 additions & 17 deletions src/gen-lexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -185,16 +185,24 @@ export function createLexer(grammar: CstGrammar) {
// → null, and a `tagOpen` always opens a tag (legacy behaviour, unchanged for other grammars).
const tagOpenAfterRe = markup?.tagOpenAfter ? new RegExp('[' + markup.tagOpenAfter + ']') : null;

// ── Indentation mode (opt-in; dormant unless the grammar declares `indent`) ──
// ── Indentation / newline mode (opt-in; dormant unless the grammar declares `indent` or `newline`) ──
// Like markup, the INDENT/DEDENT/NEWLINE tokens are EMITTED by a state machine (not matched
// by a regex) — so they are skipped in the regex loop and their grammar patterns are
// placeholders. Indentation is suspended inside flow delimiters via a flow-depth counter.
// `newline` is the line-boundary + flow-suspension LAYER that `indent` builds on: an indent
// grammar gets the full stack + INDENT/DEDENT/NEWLINE; a newline-only grammar emits just the
// NEWLINE token at each significant line boundary (no stack). `lineSensitive` gates the shared
// machinery; `indent`/`newline` are mutually exclusive (defineGrammar rejects declaring both).
const indent = grammar.indent;
const newline = grammar.newline;
const lineSensitive = !!indent || !!newline;
const lineComment = (indent ?? newline)?.comment; // line-comment introducer (both modes skip comment-only lines)
const indentTokenNames = new Set<string>(
indent ? ([indent.indentToken, indent.dedentToken, indent.newlineToken].filter(Boolean) as string[]) : [],
indent ? ([indent.indentToken, indent.dedentToken, indent.newlineToken].filter(Boolean) as string[])
: newline ? [newline.token] : [],
);
const flowOpenSet = new Set(indent?.flowOpen ?? []);
const flowCloseSet = new Set(indent?.flowClose ?? []);
const flowOpenSet = new Set((indent ?? newline)?.flowOpen ?? []);
const flowCloseSet = new Set((indent ?? newline)?.flowClose ?? []);
// String-literal token names (the `string`-flagged tokens — quoted scalars in YAML). Used by the
// flow mapping-separator guard below: a quoted scalar can never run past its closing quote, so a
// `:` immediately after one (inside flow) is ALWAYS the mapping `key: value` separator, never the
Expand Down Expand Up @@ -416,7 +424,7 @@ export function createLexer(grammar: CstGrammar) {
&& tagOpenAfterRe.test(source[i + markup!.tagOpen.length]));
// Indentation / line-boundary state — active only when `indent` or `newline` is declared (dormant otherwise).
let flowDepth = 0; // >0 while inside flow delimiters ([ ] { }) → indentation suspended
let lineStart = !!indent; // at a block-context line boundary (file start counts as one)
let lineStart = lineSensitive; // at a block-context line boundary (file start counts as one)
let emittedContent = false; // any real (non-structural) token emitted yet — suppress a leading NEWLINE/DEDENT
let currentLineCol = 0; // leading-space column of the current logical line (bounds block scalars)
let atLineLead = false; // the next emitted token is the FIRST content token of its line (compact-indicator probe)
Expand Down Expand Up @@ -446,7 +454,7 @@ export function createLexer(grammar: CstGrammar) {
if (pendingComment) { t.commentBefore = true; pendingComment = false; }
if (pendingMultilineFlow) { t.multilineFlowBefore = true; pendingMultilineFlow = false; }
tokens.push(t);
if (indent) {
if (lineSensitive) {
if (!indentTokenNames.has(t.type)) {
emittedContent = true; // a real token (not INDENT/DEDENT/NEWLINE)
atLineLead = false; // line-lead consumed once a real token lands
Expand All @@ -456,7 +464,8 @@ export function createLexer(grammar: CstGrammar) {
// Entering the OUTERMOST flow (0→1): if it opens right after a `:`/`-` block indicator,
// it is a block VALUE/ITEM → arm the §7.4 indent rule with n = the current block column
// (the indent-stack top). Anywhere else (top-level / after `,` / as a key) the rule is OFF.
if (flowDepth === 0) {
// The §7.4 / multi-line-flow bookkeeping is indent-only (a newline grammar has no stack).
if (flowDepth === 0 && indent) {
const prevTok = tokens[tokens.length - 2]; // the token before this just-pushed open
flowValueIndent = (prevTok && prevTok.type === '' && (prevTok.text === ':' || prevTok.text === '-'))
? indentStack[indentStack.length - 1] : -1;
Expand All @@ -465,7 +474,7 @@ export function createLexer(grammar: CstGrammar) {
flowDepth++;
} else if (flowCloseSet.has(t.text)) {
flowDepth = Math.max(0, flowDepth - 1);
if (flowDepth === 0) {
if (flowDepth === 0 && indent) {
flowValueIndent = -1;
if (flowSawNewline) pendingMultilineFlow = true; // a multi-line flow just closed → flag the next token
flowSawNewline = false;
Expand Down Expand Up @@ -566,7 +575,7 @@ export function createLexer(grammar: CstGrammar) {
// ── Indentation mode: at a block-context line start, skip blank/comment lines, measure
// the next content line's leading-space column, and emit NEWLINE / INDENT / DEDENT(s)
// before that line's tokens (relative to the indentation stack). ──
if (indent && flowDepth === 0 && lineStart) {
if (lineSensitive && flowDepth === 0 && lineStart) {
let p = pos, col = 0;
while (p < source.length && source[p] === ' ') { p++; col++; }
const ch = source[p];
Expand Down Expand Up @@ -600,18 +609,26 @@ export function createLexer(grammar: CstGrammar) {
// (its emitted NEWLINE/DEDENT either reject in value position or are harmless between
// siblings — matching the `yaml` oracle, which is context-sensitive there). We reject only
// the structural case, so no valid leaf-continuation is mis-rejected.
if (ch === '\t') {
if (indent && ch === '\t') { // §6.1 tab-in-indentation error is YAML-specific (newline mode has no stack)
let q = p; while (q < source.length && (source[q] === ' ' || source[q] === '\t')) q++;
const after = source[q];
if (q < source.length && after !== '\n' && after !== '\r' && startsBlockStructuralNode(source, q)) {
throw new Error(`Tab character used in indentation at offset ${p}`);
}
}
if (indent.comment && source.startsWith(indent.comment, p)) { // comment-only line — ignored
if (lineComment && source.startsWith(lineComment, p)) { // comment-only line — ignored
let e = p; while (e < source.length && source[e] !== '\n') e++;
pos = e; pendingComment = true; continue; // next iteration consumes the newline
}
pos = p; // consume the leading indentation
// ── newline-only mode: no indent stack — emit ONE NEWLINE at this real line boundary (a
// leading boundary before any content is suppressed via emittedContent) and move on. ──
if (!indent) {
if (emittedContent) push({ type: newline!.token, text: '', offset: pos });
lineStart = false;
atLineLead = true;
continue;
}
currentLineCol = col; // bounds a block scalar started on this line
const top = indentStack[indentStack.length - 1];
if (col > top) {
Expand Down Expand Up @@ -664,10 +681,10 @@ export function createLexer(grammar: CstGrammar) {
continue;
}

// Whitespace. In indentation mode, inline spaces/tabs are skipped but a NEWLINE is a
// block-context line boundary (sets lineStart so the routine above runs next) — except
// inside flow delimiters, where newlines are insignificant. Otherwise skip any run.
if (indent) {
// Whitespace. In an indentation / newline grammar, inline spaces/tabs are skipped but a
// NEWLINE is a block-context line boundary (sets lineStart so the routine above runs next) —
// except inside flow delimiters, where newlines are insignificant. Otherwise skip any run.
if (lineSensitive) {
const c = source[pos];
if (c === ' ' || c === '\t') {
// A TAB between a block indicator (`-`/`?`/map-`:`) and a NESTED block-structural node it
Expand All @@ -677,7 +694,7 @@ export function createLexer(grammar: CstGrammar) {
// separation, so the structural sniff gates it. After a `:` a node PROPERTY is the inline
// value (`key:\t&a x` is legal), so the `:` case excludes properties (allowProperty=false)
// while `-`/`?` include them (`-\t&a x` IS an error). Block context only (flowDepth===0).
if (flowDepth === 0) {
if (indent && flowDepth === 0) { // §6.1 tab-after-indicator error is YAML-specific
const prev = tokens[tokens.length - 1];
const isIndicator = prev && prev.type === '' && (prev.text === '-' || prev.text === '?' || prev.text === ':');
if (isIndicator) {
Expand All @@ -692,7 +709,7 @@ export function createLexer(grammar: CstGrammar) {
if (c === '\n' || c === '\r') {
pos++; if (c === '\r' && source[pos] === '\n') pos++;
if (flowDepth === 0) lineStart = true;
else {
else if (indent) {
flowSawNewline = true; // this outermost flow spans >1 line → it can't be an implicit block key
// §7.4: inside a value/item-position flow, a CONTENT line must be indented MORE than the
// enclosing block column `n` (flowValueIndent). The indentation column is the leading-SPACE
Expand Down
32 changes: 32 additions & 0 deletions src/gen-treesitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,10 @@ function planTemplate(grammar: CstGrammar): TemplatePlan | null {
/** Determine which tokens the external scanner must provide. */
function planScannerTokens(grammar: CstGrammar): Map<string, string> {
const map = new Map<string, string>();
// A newline-sensitive grammar's NEWLINE token is engine-emitted; in tree-sitter it becomes a
// stateless external token (the scanner emits it at each significant line boundary). Listed
// FIRST so it heads the enum / externals order.
if (grammar.newline) map.set(grammar.newline.token, toSnake(grammar.newline.token));
// The regex token: '/' is context-sensitive (regex vs division). The scanner
// resolves it.
const regexTok = grammar.tokens.find(t => t.flags.includes('regex'));
Expand Down Expand Up @@ -1625,6 +1629,26 @@ function buildScannerC(
L.push('static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }');
L.push('');

const nl = grammar.newline;
if (nl) {
const nlSym = ctx.scannerTokenFor.get(nl.token)!.toUpperCase();
L.push('// ── Newline scan ────────────────────────────────────────────────');
L.push('// A newline-sensitive grammar emits one NEWLINE token at each significant line');
L.push('// boundary. tree-sitter only asks for it where the grammar permits it (statement');
L.push('// boundaries); inside flow delimiters the rules never reference NEWLINE, so');
L.push('// valid_symbols[NEWLINE] is false there and the line break falls through to');
L.push('// `extras` as ordinary whitespace. Stateless: one line break (\\n / \\r / \\r\\n) per token.');
L.push('static bool scan_newline(TSLexer *lexer) {');
L.push(' if (lexer->lookahead == \'\\r\') { advance(lexer); if (lexer->lookahead == \'\\n\') advance(lexer); }');
L.push(' else if (lexer->lookahead == \'\\n\') advance(lexer);');
L.push(' else return false;');
L.push(` lexer->result_symbol = ${nlSym};`);
L.push(' lexer->mark_end(lexer);');
L.push(' return true;');
L.push('}');
L.push('');
}

if (regexTok) {
// Derive the regex literal scan from the token pattern + hints.
const flagChars = tokenPatternTrailingCharClass(regexTok) ?? 'gimsuyd';
Expand Down Expand Up @@ -1734,6 +1758,14 @@ function buildScannerC(
L.push(' const bool *valid_symbols) {');
L.push(' (void)payload;');
L.push('');
if (grammar.newline) {
const nlSym = ctx.scannerTokenFor.get(grammar.newline.token)!.toUpperCase();
L.push(' // Newline first: a significant line boundary outranks every other external token.');
L.push(` if (valid_symbols[${nlSym}] && (lexer->lookahead == '\\n' || lexer->lookahead == '\\r')) {`);
L.push(' if (scan_newline(lexer)) return true;');
L.push(' }');
L.push('');
}
if (tp && regexTok) {
const charsSym = tp.charsSnake.toUpperCase();
const regexSym = ctx.scannerTokenFor.get(regexTok.name)!.toUpperCase();
Expand Down
20 changes: 20 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,25 @@ export interface IndentConfig {
};
}

/**
 * Configuration for NEWLINE-sensitive tokenization that is INDEPENDENT of `indent`.
 *
 * Meant for grammars that are newline-aware but NOT indentation-aware: statements are
 * line-delimited, while nesting comes from delimiters / expressions rather than indentation
 * (e.g. dotenv-style env specs).
 *
 * - The lexer emits a single NEWLINE token at each significant line boundary; boundaries inside
 *   flow delimiters and on blank / comment-only lines are suppressed.
 * - There is NO indent stack and there are NO INDENT/DEDENT tokens.
 * - `indent` is the richer layer built ON TOP of the same line-boundary + flow-suspension
 *   machinery (indent = newline + indent stack + YAML block-scalar semantics), so declaring
 *   BOTH is rejected.
 * - The NEWLINE token is engine-emitted (declared with a placeholder `never()` pattern and named
 *   here), exactly like the indent tokens.
 * - ABSENT for token-stream / indentation languages → dormant, tokenization byte-identical.
 */
export interface NewlineConfig {
  /** Token TYPE emitted at each significant line boundary (engine-emitted, like the indent tokens). */
  token: string;
  /** Punctuation that SUSPENDS newline significance while open (e.g. ['(', '[', '{']). */
  flowOpen?: string[];
  /** Closers matching `flowOpen` (e.g. [')', ']', '}']). */
  flowClose?: string[];
  /** Line-comment introducer; a comment-only line emits no NEWLINE (e.g. '#'). */
  comment?: string;
}

export interface PrecOperator {
value: string;
position: 'infix' | 'prefix' | 'postfix';
Expand Down Expand Up @@ -391,6 +410,7 @@ export interface CstGrammar {
scopeName?: string; // declared TextMate scope name (e.g. source.ts); its suffix drives every scope's language tag
markup?: MarkupConfig; // opt-in markup-mode tokenization (HTML/Vue); absent for token-stream languages
indent?: IndentConfig; // opt-in indentation-sensitive tokenization (YAML); absent → byte-identical token stream
newline?: NewlineConfig; // opt-in NEWLINE-sensitive tokenization, independent of indent (no indent stack); absent → byte-identical token stream
expressionRule?: string; // name of the rule that produces an EXPRESSION; lets gen-tm derive a `#expression` sub-grammar (for expression-only embeds, e.g. Vue `{{ }}`)
// Extra TextMate grammars that just RE-EXPOSE this one under another scopeName (thin
// `{scopeName, patterns:[{include: <this.scopeName>}]}` wrappers). HTML declares
Expand Down
Loading
Loading