johnsoncodehk · johnsoncodehk · Jun 6, 2026 · Jun 6, 2026
diff --git a/src/api.ts b/src/api.ts
@@ -1,4 +1,4 @@
-import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, TokenPattern } from './types.ts';
+import type { CstGrammar, TokenDecl, PrecLevel, PrecOperator, RuleDecl, RuleExpr, MarkupConfig, IndentConfig, NewlineConfig, TokenPattern } from './types.ts';
 import {
   altPattern, anyChar, followedBy, isTokenPattern, lit, never, noneOf, notFollowedBy,
   notPrecededBy, oneOf, optPattern, plus, precededBy, range, repeat,
@@ -381,13 +381,19 @@ interface GrammarConfig {
   entry: RuleRef;
   markup?: MarkupConfig;  // opt-in markup-mode tokenization (HTML/Vue)
   indent?: IndentConfig;  // opt-in indentation-sensitive tokenization (YAML)
+  newline?: NewlineConfig;  // opt-in NEWLINE-sensitive tokenization, independent of indent (no indent stack)
   expression?: RuleRef;   // the rule that produces an EXPRESSION; enables a derived `#expression` sub-grammar (expression-only embeds)
   aliasScopes?: { scope: string; file: string }[];  // extra grammars re-exposing this one under another scopeName (e.g. text.html.derivative)
   canonicalRepoNames?: Record<string, string | string[]>;  // official repo KEY NAME → structural key(s) for the SAME construct; gen-tm RENAMES the structural key (or synthesises a union wrapper) to emit the official name natively (the 限制器; see CstGrammar.canonicalRepoNames)
   manifest?: import('./types.ts').ContributesManifest;  // VS Code `contributes` packaging (emits a pasteable snippet)
 }
 
 export function defineGrammar(config: GrammarConfig): CstGrammar & { name: string; scopeName?: string } {
+  // `indent` is the richer layer built on top of newline-significant line boundaries, so the two
+  // modes are mutually exclusive — declaring both is a configuration error, not a merge.
+  if (config.indent && config.newline) {
+    throw new Error('A grammar may declare `indent` OR `newline`, not both — `indent` already implies newline-significant line boundaries.');
+  }
   const names = new Map<object, string>();
   for (const [name, tok] of Object.entries(config.tokens)) {
     names.set(tok, name);
@@ -453,5 +459,5 @@ export function defineGrammar(config: GrammarConfig): CstGrammar & { name: strin
     }
   }
 
-  return { name: config.name, scopeName: config.scopeName, tokens, precs, rules, scopeOverrides, markup: config.markup, indent: config.indent, expressionRule: config.expression ? names.get(config.expression) : undefined, aliasScopes: config.aliasScopes, canonicalRepoNames: config.canonicalRepoNames, manifest: config.manifest };
+  return { name: config.name, scopeName: config.scopeName, tokens, precs, rules, scopeOverrides, markup: config.markup, indent: config.indent, newline: config.newline, expressionRule: config.expression ? names.get(config.expression) : undefined, aliasScopes: config.aliasScopes, canonicalRepoNames: config.canonicalRepoNames, manifest: config.manifest };
 }
diff --git a/src/emit-parser.ts b/src/emit-parser.ts
@@ -635,6 +635,7 @@ export function emitParser(grammar: CstGrammar): string {
     rules: [{ name: '$lits', body: litRuleBody, flags: [] }],
     markup: grammar.markup,
     indent: grammar.indent,
+    newline: grammar.newline,
     scopeOverrides: [],
   };
 

diff --git a/src/gen-lexer.ts b/src/gen-lexer.ts
@@ -185,16 +185,24 @@ export function createLexer(grammar: CstGrammar) {
   // → null, and a `tagOpen` always opens a tag (legacy behaviour, unchanged for other grammars).
   const tagOpenAfterRe = markup?.tagOpenAfter ? new RegExp('[' + markup.tagOpenAfter + ']') : null;
 
-  // ── Indentation mode (opt-in; dormant unless the grammar declares `indent`) ──
+  // ── Indentation / newline mode (opt-in; dormant unless the grammar declares `indent` or `newline`) ──
   // Like markup, the INDENT/DEDENT/NEWLINE tokens are EMITTED by a state machine (not matched
   // by a regex) — so they are skipped in the regex loop and their grammar patterns are
   // placeholders. Indentation is suspended inside flow delimiters via a flow-depth counter.
+  // `newline` is the line-boundary + flow-suspension LAYER that `indent` builds on: an indent
+  // grammar gets the full stack + INDENT/DEDENT/NEWLINE; a newline-only grammar emits just the
+  // NEWLINE token at each significant line boundary (no stack). `lineSensitive` gates the shared
+  // machinery; `indent`/`newline` are mutually exclusive (defineGrammar rejects declaring both).
   const indent = grammar.indent;
+  const newline = grammar.newline;
+  const lineSensitive = !!indent || !!newline;
+  const lineComment = (indent ?? newline)?.comment;   // line-comment introducer (both modes skip comment-only lines)
   const indentTokenNames = new Set<string>(
-    indent ? ([indent.indentToken, indent.dedentToken, indent.newlineToken].filter(Boolean) as string[]) : [],
+    indent ? ([indent.indentToken, indent.dedentToken, indent.newlineToken].filter(Boolean) as string[])
+           : newline ? [newline.token] : [],
   );
-  const flowOpenSet = new Set(indent?.flowOpen ?? []);
-  const flowCloseSet = new Set(indent?.flowClose ?? []);
+  const flowOpenSet = new Set((indent ?? newline)?.flowOpen ?? []);
+  const flowCloseSet = new Set((indent ?? newline)?.flowClose ?? []);
   // String-literal token names (the `string`-flagged tokens — quoted scalars in YAML). Used by the
   // flow mapping-separator guard below: a quoted scalar can never run past its closing quote, so a
   // `:` immediately after one (inside flow) is ALWAYS the mapping `key: value` separator, never the
@@ -416,7 +424,7 @@ export function createLexer(grammar: CstGrammar) {
         && tagOpenAfterRe.test(source[i + markup!.tagOpen.length]));
     // Indentation state — active only when `indent` is declared (dormant otherwise).
     let flowDepth = 0;               // >0 while inside flow delimiters ([ ] { }) → indentation suspended
-    let lineStart = !!indent;        // at a block-context line boundary (file start counts as one)
+    let lineStart = lineSensitive;   // at a block-context line boundary (file start counts as one)
     let emittedContent = false;      // any real (non-structural) token emitted yet — suppress a leading NEWLINE/DEDENT
     let currentLineCol = 0;          // leading-space column of the current logical line (bounds block scalars)
     let atLineLead = false;          // the next emitted token is the FIRST content token of its line (compact-indicator probe)
@@ -446,7 +454,7 @@ export function createLexer(grammar: CstGrammar) {
       if (pendingComment) { t.commentBefore = true; pendingComment = false; }
       if (pendingMultilineFlow) { t.multilineFlowBefore = true; pendingMultilineFlow = false; }
       tokens.push(t);
-      if (indent) {
+      if (lineSensitive) {
         if (!indentTokenNames.has(t.type)) {
           emittedContent = true;                                     // a real token (not INDENT/DEDENT/NEWLINE)
           atLineLead = false;                                        // line-lead consumed once a real token lands
@@ -456,7 +464,8 @@ export function createLexer(grammar: CstGrammar) {
             // Entering the OUTERMOST flow (0→1): if it opens right after a `:`/`-` block indicator,
             // it is a block VALUE/ITEM → arm the §7.4 indent rule with n = the current block column
             // (the indent-stack top). Anywhere else (top-level / after `,` / as a key) the rule is OFF.
-            if (flowDepth === 0) {
+            // The §7.4 / multi-line-flow bookkeeping is indent-only (a newline grammar has no stack).
+            if (flowDepth === 0 && indent) {
               const prevTok = tokens[tokens.length - 2];   // the token before this just-pushed open
               flowValueIndent = (prevTok && prevTok.type === '' && (prevTok.text === ':' || prevTok.text === '-'))
                 ? indentStack[indentStack.length - 1] : -1;
@@ -465,7 +474,7 @@ export function createLexer(grammar: CstGrammar) {
             flowDepth++;
           } else if (flowCloseSet.has(t.text)) {
             flowDepth = Math.max(0, flowDepth - 1);
-            if (flowDepth === 0) {
+            if (flowDepth === 0 && indent) {
               flowValueIndent = -1;
               if (flowSawNewline) pendingMultilineFlow = true;   // a multi-line flow just closed → flag the next token
               flowSawNewline = false;
@@ -566,7 +575,7 @@ export function createLexer(grammar: CstGrammar) {
       // ── Indentation mode: at a block-context line start, skip blank/comment lines, measure
       // the next content line's leading-space column, and emit NEWLINE / INDENT / DEDENT(s)
       // before that line's tokens (relative to the indentation stack). ──
-      if (indent && flowDepth === 0 && lineStart) {
+      if (lineSensitive && flowDepth === 0 && lineStart) {
         let p = pos, col = 0;
         while (p < source.length && source[p] === ' ') { p++; col++; }
         const ch = source[p];
@@ -600,18 +609,26 @@ export function createLexer(grammar: CstGrammar) {
         // (its emitted NEWLINE/DEDENT either reject in value position or are harmless between
         // siblings — matching the `yaml` oracle, which is context-sensitive there). We reject only
         // the structural case, so no valid leaf-continuation is mis-rejected.
-        if (ch === '\t') {
+        if (indent && ch === '\t') {   // §6.1 tab-in-indentation error is YAML-specific (newline mode has no stack)
           let q = p; while (q < source.length && (source[q] === ' ' || source[q] === '\t')) q++;
           const after = source[q];
           if (q < source.length && after !== '\n' && after !== '\r' && startsBlockStructuralNode(source, q)) {
             throw new Error(`Tab character used in indentation at offset ${p}`);
           }
         }
-        if (indent.comment && source.startsWith(indent.comment, p)) {       // comment-only line — ignored
+        if (lineComment && source.startsWith(lineComment, p)) {             // comment-only line — ignored
           let e = p; while (e < source.length && source[e] !== '\n') e++;
           pos = e; pendingComment = true; continue;                         // next iteration consumes the newline
         }
         pos = p;                                                            // consume the leading indentation
+        // ── newline-only mode: no indent stack — emit ONE NEWLINE at this real line boundary (a
+        // leading boundary before any content is suppressed via emittedContent) and move on. ──
+        if (!indent) {
+          if (emittedContent) push({ type: newline!.token, text: '', offset: pos });
+          lineStart = false;
+          atLineLead = true;
+          continue;
+        }
         currentLineCol = col;                                               // bounds a block scalar started on this line
         const top = indentStack[indentStack.length - 1];
         if (col > top) {
@@ -664,10 +681,10 @@ export function createLexer(grammar: CstGrammar) {
         continue;
       }
 
-      // Whitespace. In indentation mode, inline spaces/tabs are skipped but a NEWLINE is a
-      // block-context line boundary (sets lineStart so the routine above runs next) — except
-      // inside flow delimiters, where newlines are insignificant. Otherwise skip any run.
-      if (indent) {
+      // Whitespace. In an indentation / newline grammar, inline spaces/tabs are skipped but a
+      // NEWLINE is a block-context line boundary (sets lineStart so the routine above runs next) —
+      // except inside flow delimiters, where newlines are insignificant. Otherwise skip any run.
+      if (lineSensitive) {
         const c = source[pos];
         if (c === ' ' || c === '\t') {
           // A TAB between a block indicator (`-`/`?`/map-`:`) and a NESTED block-structural node it
@@ -677,7 +694,7 @@ export function createLexer(grammar: CstGrammar) {
           // separation, so the structural sniff gates it. After a `:` a node PROPERTY is the inline
           // value (`key:\t&a x` is legal), so the `:` case excludes properties (allowProperty=false)
           // while `-`/`?` include them (`-\t&a x` IS an error). Block context only (flowDepth===0).
-          if (flowDepth === 0) {
+          if (indent && flowDepth === 0) {   // §6.1 tab-after-indicator error is YAML-specific
             const prev = tokens[tokens.length - 1];
             const isIndicator = prev && prev.type === '' && (prev.text === '-' || prev.text === '?' || prev.text === ':');
             if (isIndicator) {
@@ -692,7 +709,7 @@ export function createLexer(grammar: CstGrammar) {
         if (c === '\n' || c === '\r') {
           pos++; if (c === '\r' && source[pos] === '\n') pos++;
           if (flowDepth === 0) lineStart = true;
-          else {
+          else if (indent) {
             flowSawNewline = true;   // this outermost flow spans >1 line → it can't be an implicit block key
             // §7.4: inside a value/item-position flow, a CONTENT line must be indented MORE than the
             // enclosing block column `n` (flowValueIndent). The indentation column is the leading-SPACE

diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
@@ -541,6 +541,10 @@ function planTemplate(grammar: CstGrammar): TemplatePlan | null {
 /** Determine which tokens the external scanner must provide. */
 function planScannerTokens(grammar: CstGrammar): Map<string, string> {
   const map = new Map<string, string>();
+  // A newline-sensitive grammar's NEWLINE token is engine-emitted; in tree-sitter it becomes a
+  // stateless external token (the scanner emits it at each significant line boundary). Listed
+  // FIRST so it heads the enum / externals order.
+  if (grammar.newline) map.set(grammar.newline.token, toSnake(grammar.newline.token));
   // The regex token: '/' is context-sensitive (regex vs division). The scanner
   // resolves it.
   const regexTok = grammar.tokens.find(t => t.flags.includes('regex'));
@@ -1625,6 +1629,26 @@ function buildScannerC(
   L.push('static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }');
   L.push('');
 
+  const nl = grammar.newline;
+  if (nl) {
+    const nlSym = ctx.scannerTokenFor.get(nl.token)!.toUpperCase();
+    L.push('// ── Newline scan ────────────────────────────────────────────────');
+    L.push('// A newline-sensitive grammar emits one NEWLINE token at each significant line');
+    L.push('// boundary. tree-sitter only asks for it where the grammar permits it (statement');
+    L.push('// boundaries); inside flow delimiters the rules never reference NEWLINE, so');
+    L.push('// valid_symbols[NEWLINE] is false there and the line break falls through to');
+    L.push('// `extras` as ordinary whitespace. Stateless: one line break (\\n / \\r / \\r\\n) per token.');
+    L.push('static bool scan_newline(TSLexer *lexer) {');
+    L.push('  if (lexer->lookahead == \'\\r\') { advance(lexer); if (lexer->lookahead == \'\\n\') advance(lexer); }');
+    L.push('  else if (lexer->lookahead == \'\\n\') advance(lexer);');
+    L.push('  else return false;');
+    L.push(`  lexer->result_symbol = ${nlSym};`);
+    L.push('  lexer->mark_end(lexer);');
+    L.push('  return true;');
+    L.push('}');
+    L.push('');
+  }
+
   if (regexTok) {
     // Derive the regex literal scan from the token pattern + hints.
     const flagChars = tokenPatternTrailingCharClass(regexTok) ?? 'gimsuyd';
@@ -1734,6 +1758,14 @@ function buildScannerC(
   L.push('                                                          const bool *valid_symbols) {');
   L.push('  (void)payload;');
   L.push('');
+  if (grammar.newline) {
+    const nlSym = ctx.scannerTokenFor.get(grammar.newline.token)!.toUpperCase();
+    L.push('  // Newline first: a significant line boundary outranks every other external token.');
+    L.push(`  if (valid_symbols[${nlSym}] && (lexer->lookahead == '\\n' || lexer->lookahead == '\\r')) {`);
+    L.push('    if (scan_newline(lexer)) return true;');
+    L.push('  }');
+    L.push('');
+  }
   if (tp && regexTok) {
     const charsSym = tp.charsSnake.toUpperCase();
     const regexSym = ctx.scannerTokenFor.get(regexTok.name)!.toUpperCase();

diff --git a/src/types.ts b/src/types.ts
@@ -329,6 +329,25 @@ export interface IndentConfig {
   };
 }
 
+/**
+ * Opt-in NEWLINE-sensitive tokenization, independent of `indent`. Intended for grammars that are
+ * newline-aware but not indentation-aware: statements are line-delimited, while nesting comes from
+ * delimiters / expressions (e.g. dotenv-style env specs). The lexer emits one NEWLINE token (empty
+ * text) at each significant line boundary. No NEWLINE is emitted inside flow delimiters, on blank
+ * or comment-only lines, or before the first content token. There is no indent stack and no
+ * INDENT/DEDENT. `indent` is the richer layer on the same line-boundary + flow-suspension
+ * machinery (indent = newline + indent stack + YAML block-scalar semantics), so the two are
+ * mutually exclusive: `defineGrammar` throws when both are declared. The NEWLINE token is
+ * engine-emitted (declare it with a placeholder `never()` pattern and name it in `token`), like
+ * the indent tokens. When absent the mode is dormant and tokenization is byte-identical.
+ */
+export interface NewlineConfig {
+  token: string;        // TYPE of the token emitted at each significant line boundary (engine-emitted; its text is empty)
+  flowOpen?: string[];  // token texts that SUSPEND newline significance while open; nesting is depth-counted (e.g. ['(', '[', '{'])
+  flowClose?: string[]; // matching closer texts; each lowers the flow depth, never below zero (e.g. [')', ']', '}'])
+  comment?: string;     // line-comment introducer; a line starting with it (after leading spaces) emits no NEWLINE (e.g. '#')
+}
+
 export interface PrecOperator {
   value: string;
   position: 'infix' | 'prefix' | 'postfix';
@@ -391,6 +410,7 @@ export interface CstGrammar {
   scopeName?: string;  // declared TextMate scope name (e.g. source.ts); its suffix drives every scope's language tag
   markup?: MarkupConfig;  // opt-in markup-mode tokenization (HTML/Vue); absent for token-stream languages
   indent?: IndentConfig;  // opt-in indentation-sensitive tokenization (YAML); absent → byte-identical token stream
+  newline?: NewlineConfig;  // opt-in NEWLINE-sensitive tokenization, independent of indent (no indent stack); absent → byte-identical token stream
   expressionRule?: string;  // name of the rule that produces an EXPRESSION; lets gen-tm derive a `#expression` sub-grammar (for expression-only embeds, e.g. Vue `{{ }}`)
   // Extra TextMate grammars that just RE-EXPOSE this one under another scopeName (thin
   // `{scopeName, patterns:[{include: <this.scopeName>}]}` wrappers). HTML declares