diff --git a/packages/compiler/test/test-cstReader.js b/packages/compiler/test/test-cstReader.js index 3de65707..0b97acdf 100644 --- a/packages/compiler/test/test-cstReader.js +++ b/packages/compiler/test/test-cstReader.js @@ -1,10 +1,11 @@ import test from 'ava'; import * as fc from 'fast-check'; -import {createHandle, createReader, CstNodeType} from '../../runtime/src/cstReader.ts'; +import {createReader, CstNodeType} from '../../runtime/src/cstReader.ts'; +import {createHandle} from '../../runtime/src/cstReaderShared.ts'; import {compileAndLoad, matchWithInput} from './_helpers.js'; -const childrenOf = (reader, handle, i) => { +const childrenOf = (reader, handle) => { const arr = []; reader.forEachChild(handle, c => arr.push(c)); return arr; @@ -31,8 +32,8 @@ test('terminal children', async t => { g.match('abcd').use(mr => { const reader = createReader(mr); const children = []; - reader.forEachChild(reader.root, (child, leadingSpaces, startIdx, index) => { - children.push({child, leadingSpaces, startIdx, index}); + reader.forEachChild(reader.root, (child, leadingSpaces, index) => { + children.push({child, leadingSpaces, startIdx: reader.startIdx(child), index}); }); t.is(children.length, 2); @@ -57,8 +58,8 @@ test('nonterminal children', async t => { g.match('xy').use(mr => { const reader = createReader(mr); const children = []; - reader.forEachChild(reader.root, (child, ls, startIdx, i) => { - children.push({child, ls, startIdx, i}); + reader.forEachChild(reader.root, (child, ls, i) => { + children.push({child, ls, startIdx: reader.startIdx(child), i}); }); t.is(children.length, 2); t.is(reader.ctorName(children[0].child), 'a'); @@ -137,6 +138,71 @@ test('optional node: absent', async t => { }); }); +test('withChildren, tupleArity, forEachTuple, and isPresent', async t => { + const g = await compileAndLoad('G { start = ("a" "b"?)* }'); + g.match('abab').use(mr => { + const reader = createReader(mr); + let list; + reader.forEachChild(reader.root, child => { + list = child; + }); + + t.is(reader.tupleArity(list), 2); + + const tuples = []; + reader.forEachTuple(list, (a, b) => { + tuples.push( + reader.sourceString(a) + + reader.withChildren(b, (_handle, child) => + reader.isPresent(b) ? reader.sourceString(child) : '' + ) + ); + }); + t.deepEqual(tuples, ['ab', 'ab']); + + let emptyOpt; + g.match('a').use(mr2 => { + const reader2 = createReader(mr2); + reader2.forEachChild(reader2.root, child => { + list = child; + }); + reader2.forEachTuple(list, (_a, b) => { + emptyOpt = b; + }); + t.false(reader2.isPresent(emptyOpt)); + t.is( + reader2.withChildren(emptyOpt, (_handle, child) => + child === undefined ? 'missing' : 'present' + ), + 'missing' + ); + }); + }); +}); + +test('type-specific helpers assert on the wrong handle kind', async t => { + const g = await compileAndLoad('G { Start = ("a" "b"?)* }'); + g.match('ab').use(mr => { + const reader = createReader(mr); + let list; + reader.forEachChild(reader.root, child => { + list = child; + }); + + let terminal; + let opt; + reader.forEachTuple(list, (a, b) => { + terminal = a; + opt = b; + }); + + t.throws(() => reader.ruleId(list), {message: 'Not a nonterminal'}); + t.throws(() => reader.tupleArity(reader.root), {message: 'Not a list'}); + t.throws(() => reader.isPresent(terminal), {message: 'Not an opt'}); + t.true(reader.isPresent(opt)); + }); +}); + // --- unparse via walk --- test('unparse: simple terminals', async t => { @@ -215,7 +281,7 @@ test('rootLeadingSpacesLen: present', async t => { g.match(' x').use(mr => { const reader = createReader(mr); t.is(reader.rootLeadingSpacesLen, 2); - t.is(reader.sourceSlice(0, reader.rootLeadingSpacesLen), ' '); + t.is(reader.input.slice(0, reader.rootLeadingSpacesLen), ' '); t.is(reader.startIdx(reader.root), 2); }); }); @@ -233,14 +299,15 @@ test('child leadingSpaces in syntactic rule', async t => { g.match('a b').use(mr => { const reader = createReader(mr); const spacesInfo = []; - reader.forEachChild(reader.root, (child, leadingSpacesLen, childStartIdx, index) => { + reader.forEachChild(reader.root, (child, leadingSpacesLen, index) => { + const childStartIdx = reader.startIdx(child); spacesInfo.push({ index, hasSpaces: leadingSpacesLen > 0, spacesLen: leadingSpacesLen, spacesStr: leadingSpacesLen > 0 - ? reader.sourceSlice(childStartIdx - leadingSpacesLen, leadingSpacesLen) + ? reader.input.slice(childStartIdx - leadingSpacesLen, childStartIdx) : '', }); }); @@ -272,8 +339,8 @@ const spaceMemoIgnored = test.macro(async (t, twoBody, input = '> xx') => { const reader = createReader(mr); const [two] = childrenOf(reader, reader.root); const children = []; - reader.forEachChild(two, (child, leadingSpacesLen, childStartIdx) => { - children.push({child, leadingSpacesLen, childStartIdx}); + reader.forEachChild(two, (child, leadingSpacesLen) => { + children.push({child, leadingSpacesLen, childStartIdx: reader.startIdx(child)}); }); t.deepEqual( children.map(({leadingSpacesLen}) => leadingSpacesLen), @@ -305,15 +372,13 @@ test( '> x' ); -// --- details --- +// --- rule metadata --- -test('details returns ruleId for nonterminals', async t => { +test('ruleId returns a stable rule index for nonterminals', async t => { const g = await compileAndLoad('G { start = a\na = "x" }'); g.match('x').use(mr => { const reader = createReader(mr); - // Root is 'start', details should be its ruleId (>= 0). - const d = reader.details(reader.root); - t.true(d >= 0); + t.true(reader.ruleId(reader.root) >= 0); }); }); @@ -433,7 +498,8 @@ function checkInvariants(reader, handle, isLexicalParent) { let cursor = start; let reconstructed = ''; - reader.forEachChild(handle, (child, leadingSpacesLen, childStartIdx, index) => { + reader.forEachChild(handle, (child, leadingSpacesLen, index) => { + const childStartIdx = reader.startIdx(child); indices.push(index); callbackCount++; @@ -467,7 +533,7 @@ function checkInvariants(reader, handle, isLexicalParent) { // Round-trip reconstruction: interleave spaces + child text. if (leadingSpacesLen > 0) { - reconstructed += reader.sourceSlice(childStartIdx - leadingSpacesLen, leadingSpacesLen); + reconstructed += reader.input.slice(childStartIdx - leadingSpacesLen, childStartIdx); } reconstructed += reader.sourceString(child); @@ -528,7 +594,7 @@ function checkMatch(reader) { } // -- Root round-trip: leadingSpaces + render(root) === input -- - const rootSpaces = reader.sourceSlice(0, rootLeadingSpacesLen); + const rootSpaces = input.slice(0, rootLeadingSpacesLen); const rootText = reader.sourceString(root); if (rootSpaces + rootText !== input) { errors.push( diff --git a/packages/compiler/test/test-wasm.js b/packages/compiler/test/test-wasm.js index 744bf0d5..471164f8 100644 --- a/packages/compiler/test/test-wasm.js +++ b/packages/compiler/test/test-wasm.js @@ -2127,3 +2127,48 @@ test('edge flag: tagged terminal decoding with HAS_LEADING_SPACES bit', async t t.is(letter.sourceString, 'c'); t.falsy(letter.leadingSpaces); }); + +// Regression: MatchResult.input must reflect the input from *its* match, +// not the most recent match on the same grammar. +test('MatchResult.input is stable after a subsequent match', async t => { + const g = await compileAndLoad('G { start = letter+ }'); + g.match('abc').use(r1 => { + g.match('xy').use(r2 => { + t.is(r1.input, 'abc'); + t.is(r2.input, 'xy'); + }); + }); +}); + +// Regression: getRightmostFailures() must not silently return wrong data +// when wasm state has been overwritten by a subsequent match(). +test('FailedMatchResult.getRightmostFailures throws if not the most recent match', async t => { + const g = await compileAndLoad('G { start = "ok" end }'); + + g.match('bad').use(r1 => { + t.true(r1.failed()); + + // A subsequent match overwrites the wasm state. + g.match('ok').use(r2 => { + t.true(r2.succeeded()); + + // Accessing failures on the stale result should throw. + t.throws(() => r1.getRightmostFailures(), { + message: /not the most recent match/, + }); + }); + }); +}); + +// getRightmostFailures() works when called on the most recent match. +test('FailedMatchResult.getRightmostFailures works on most recent match', async t => { + const g = await compileAndLoad('G { start = "ok" end }'); + + g.match('bad').use(r1 => { + t.true(r1.failed()); + + const failures = r1.getRightmostFailures(); + t.true(failures.length > 0); + t.is(r1.getRightmostFailurePosition(), 0); + }); +}); diff --git a/packages/lang-python/convertToOhm.ts b/packages/lang-python/convertToOhm.ts index 3e124cb7..4bdfe3cb 100644 --- a/packages/lang-python/convertToOhm.ts +++ b/packages/lang-python/convertToOhm.ts @@ -1,8 +1,8 @@ import assert from 'node:assert'; import {grammar} from '@ohm-js/compiler/compat'; -import type {Operation} from '@ohm-js/semantics/src/types.ts'; -import {createOperation} from '@ohm-js/semantics/src/index.ts'; +import {createOperation} from '@ohm-js/semantics'; +import type {Operation} from '@ohm-js/semantics'; const hasOwn = (obj: object, prop: string) => Object.hasOwnProperty.call(obj, prop); diff --git a/packages/runtime/src/cstReader.ts b/packages/runtime/src/cstReader.ts index d8942024..5b6af244 100644 --- a/packages/runtime/src/cstReader.ts +++ b/packages/runtime/src/cstReader.ts @@ -2,80 +2,43 @@ import { CST_CHILD_COUNT_OFFSET, CST_CHILDREN_OFFSET, CST_MATCH_LENGTH_OFFSET, + CST_HAS_LEADING_SPACES_FLAG, CST_TYPE_AND_DETAILS_OFFSET, CstNodeType, isTaggedTerminal, MatchRecordType, rawMatchRecordType, } from './miniohm.ts'; +import {assert} from './assert.ts'; +import {createReaderFromCtx} from './cstReaderFactory.ts'; +import {createHandle, rawHandle, unpackStartIdx} from './cstReaderShared.ts'; import type {MatchContext, SucceededMatchResult} from './miniohm.ts'; export {CstNodeType}; -const HANDLE_BITS = 27; -const SHIFT = 2 ** HANDLE_BITS; // 134217728 -const MASK = SHIFT - 1; // 0x7FFFFFF - -/** - * Pack a raw CST handle and startIdx into a single Number handle. - * Uses 53 of the available integer-precision bits in an IEEE 754 double - * (27 bits for the pointer, 26 bits for startIdx). Accessor methods - * (isTerminal, matchLength, etc.) extract the low 27 bits via `& MASK`. - */ -function pack(rawHandle: number, startIdx: number): number { - return startIdx * SHIFT + rawHandle; -} - -function unpackStartIdx(handle: number): number { - const raw = handle & MASK; - return (handle - raw) / SHIFT; -} - -/** Extract the raw CST pointer from a packed handle. */ -export function rawHandle(handle: number): number { - return handle & MASK; -} - -/** - * Create a packed handle from a raw pointer and startIdx. - * Validates that both values fit in the packed representation. - */ -export function createHandle(rawPtr: number, startIdx: number): number { - if (rawPtr >= SHIFT) { - throw new Error( - `Raw CST pointer ${rawPtr} exceeds ${HANDLE_BITS}-bit limit (max ${SHIFT - 1})` - ); - } - const startIdxLimit = 2 ** (53 - HANDLE_BITS); - if (startIdx >= startIdxLimit) { - throw new Error( - `startIdx ${startIdx} exceeds ${53 - HANDLE_BITS}-bit limit (max ${startIdxLimit - 1})` - ); - } - return pack(rawPtr, startIdx); +function nextEdgePos(reader: CstReader, child: number): number { + return reader.startIdx(child) + reader.matchLength(child); } /** * Zero-allocation access to the CST stored in Wasm linear memory. * - * Handles have startIdx packed in the upper bits. Accessor methods - * (isTerminal, matchLength, childCount, ctorName, details) extract - * the raw pointer via `& MASK`. + * Handles have startIdx packed in the upper bits. * * forEachChild(handle, fn) iterates visible children. The callback receives - * (childHandle, leadingSpacesLen, childStartIdx, index). + * (childHandle, leadingSpacesLen, index). * * Leading spaces are edge data (they belong to the parent→child relationship), * not node data. For each child edge: - * - childStartIdx === startIdx(childHandle) + * - startIdx(childHandle) is the child's start position * - leadingSpacesLen >= 0 - * - leading spaces span: start = childStartIdx - leadingSpacesLen, length = leadingSpacesLen - * - child source span: start = childStartIdx, length = matchLength(childHandle) + * - leading spaces span: start = startIdx(childHandle) - leadingSpacesLen, length = leadingSpacesLen + * - child source span: start = startIdx(childHandle), length = matchLength(childHandle) * * For root: * - startIdx(root) === rootLeadingSpacesLen - * - leading spaces before root are sourceSlice(0, rootLeadingSpacesLen) + * - leading spaces before root are input.slice(0, rootLeadingSpacesLen) */ export class CstReader { /** @internal */ @@ -100,7 +63,7 @@ export class CstReader { /** Node type: NONTERMINAL, TERMINAL, LIST, or OPT. */ type(handle: number): CstNodeType { - const raw = handle & MASK; + const raw = rawHandle(handle); if (isTaggedTerminal(raw)) return CstNodeType.TERMINAL; const mrType = rawMatchRecordType(this._ctx.view, raw); if (mrType === MatchRecordType.NONTERMINAL) return CstNodeType.NONTERMINAL; @@ -111,14 +74,14 @@ export class CstReader { /** Number of raw children stored in this match record. */ childCount(handle: number): number { - const raw = handle & MASK; + const raw = rawHandle(handle); if (isTaggedTerminal(raw)) return 0; return this._ctx.view.getUint32(raw + CST_CHILD_COUNT_OFFSET, true); } /** Length of matched input (in UTF-16 code units). */ matchLength(handle: number): number { - const raw = handle & MASK; + const raw = rawHandle(handle); if (isTaggedTerminal(raw)) return raw >>> 2; return this._ctx.view.getUint32(raw + CST_MATCH_LENGTH_OFFSET, true); } @@ -128,7 +91,7 @@ export class CstReader { * For other types: '_terminal', '_list', '_opt'. */ ctorName(handle: number): string { - const raw = handle & MASK; + const raw = rawHandle(handle); if (isTaggedTerminal(raw)) return '_terminal'; const type = rawMatchRecordType(this._ctx.view, raw); if (type === MatchRecordType.NONTERMINAL) { @@ -140,47 +103,62 @@ export class CstReader { return '_opt'; } - /** - * Upper bits of typeAndDetails. For NONTERMINAL: the ruleId. - * For ITER_FLAG: the arity (children per iteration). - */ - details(handle: number): number { - const raw = handle & MASK; + /** @internal */ + private _details(handle: number): number { + const raw = rawHandle(handle); if (isTaggedTerminal(raw)) return 0; return this._ctx.view.getInt32(raw + CST_TYPE_AND_DETAILS_OFFSET, true) >>> 2; } + /** Rule ID for a nonterminal node. */ + ruleId(handle: number): number { + assert(this.type(handle) === CstNodeType.NONTERMINAL, 'Not a nonterminal'); + return this._details(handle); + } + + /** Children per tuple for a list node. */ + tupleArity(handle: number): number { + assert(this.type(handle) === CstNodeType.LIST, 'Not a list'); + return this._details(handle); + } + + /** Whether an optional node has a child. */ + isPresent(handle: number): boolean { + assert(this.type(handle) === CstNodeType.OPT, 'Not an opt'); + return this.childCount(handle) > 0; + } + /** Source string for a node (startIdx is extracted from the handle). */ sourceString(handle: number): string { const si = unpackStartIdx(handle); return this._ctx.input.slice(si, si + this.matchLength(handle)); } - /** Extract a substring from the input. */ - sourceSlice(startIdx: number, len: number): string { - return this._ctx.input.slice(startIdx, startIdx + len); - } - /** The full input string that was parsed. */ get input(): string { return this._ctx.input; } + /** The array of rule names, indexed by rule ID. */ + get ruleNames(): readonly string[] { + return this._ctx.ruleNames; + } + /** * Iterate over children. The callback receives (childHandle, leadingSpacesLen, - * childStartIdx, index). + * index). * - * Leading spaces belong to the parent→child edge. Use sourceSlice() to - * extract the spaces text: sourceSlice(childStartIdx - leadingSpacesLen, leadingSpacesLen). + * Leading spaces belong to the parent→child edge. Use `startIdx(child)` and + * `input.slice()` to recover the spaces text. * * Only NONTERMINAL and TERMINAL children may have leading spaces; * LIST and OPT children always have leadingSpacesLen === 0. */ forEachChild( handle: number, - fn: (child: number, leadingSpacesLen: number, childStartIdx: number, index: number) => void + fn: (child: number, leadingSpacesLen: number, index: number) => void ): void { - const raw = handle & MASK; + const raw = rawHandle(handle); if (isTaggedTerminal(raw)) return; const count = this._ctx.view.getUint32(raw + CST_CHILD_COUNT_OFFSET, true); @@ -190,10 +168,8 @@ export class CstReader { for (let i = 0; i < count; i++) { const slot = this._ctx.view.getUint32(raw + CST_CHILDREN_OFFSET + i * 4, true); - // Bit 1 of the child slot is the HAS_LEADING_SPACES edge flag. - const hasLeadingSpaces = (slot & 2) !== 0; - // Strip the edge flag to get the actual value. - const rawChild = slot & ~2; + const hasLeadingSpaces = (slot & CST_HAS_LEADING_SPACES_FLAG) !== 0; + const rawChild = slot & ~CST_HAS_LEADING_SPACES_FLAG; const leadingSpacesLen = hasLeadingSpaces && getSpacesLenAt && this._hasParentSpaces(rawChild) @@ -203,7 +179,7 @@ export class CstReader { const childStartIdx = edgeStartIdx + leadingSpacesLen; const childHandle = createHandle(rawChild, childStartIdx); - fn(childHandle, leadingSpacesLen, childStartIdx, i); + fn(childHandle, leadingSpacesLen, i); const len = isTaggedTerminal(rawChild) ? rawChild >>> 2 @@ -213,12 +189,131 @@ export class CstReader { } } + /** + * Get the handle of the child at `index`, given the current `edgeStartIdx`. + * The caller must track `edgeStartIdx`: for the first child, it's + * `startIdx(parentHandle)`; for subsequent children, it's + * `startIdx(prevChild) + matchLength(prevChild)`. + * @internal + */ + childAt(handle: number, index: number, edgeStartIdx: number): number { + const raw = rawHandle(handle); + const slot = this._ctx.view.getUint32(raw + CST_CHILDREN_OFFSET + index * 4, true); + const hasLeadingSpaces = (slot & CST_HAS_LEADING_SPACES_FLAG) !== 0; + const rawChild = slot & ~CST_HAS_LEADING_SPACES_FLAG; + + const {getSpacesLenAt} = this._ctx; + const leadingSpacesLen = + hasLeadingSpaces && getSpacesLenAt && this._hasParentSpaces(rawChild) + ? Math.max(0, getSpacesLenAt(edgeStartIdx)) + : 0; + + return createHandle(rawChild, edgeStartIdx + leadingSpacesLen); + } + + /** + * Call `fn` with the node handle followed by its children. + * Avoids allocation for nodes with up to 7 children. + */ + withChildren(handle: number, fn: (handle: number, ...children: number[]) => R): R { + const count = this.childCount(handle); + let edgeStartIdx = this.startIdx(handle); + + if (count < 8) { + if (count === 0) return fn(handle); + + const c0 = this.childAt(handle, 0, edgeStartIdx); + if (count === 1) return fn(handle, c0); + + edgeStartIdx = nextEdgePos(this, c0); + const c1 = this.childAt(handle, 1, edgeStartIdx); + if (count === 2) return fn(handle, c0, c1); + + edgeStartIdx = nextEdgePos(this, c1); + const c2 = this.childAt(handle, 2, edgeStartIdx); + if (count === 3) return fn(handle, c0, c1, c2); + + edgeStartIdx = nextEdgePos(this, c2); + const c3 = this.childAt(handle, 3, edgeStartIdx); + if (count === 4) return fn(handle, c0, c1, c2, c3); + + edgeStartIdx = nextEdgePos(this, c3); + const c4 = this.childAt(handle, 4, edgeStartIdx); + if (count === 5) return fn(handle, c0, c1, c2, c3, c4); + + edgeStartIdx = nextEdgePos(this, c4); + const c5 = this.childAt(handle, 5, edgeStartIdx); + if (count === 6) return fn(handle, c0, c1, c2, c3, c4, c5); + + edgeStartIdx = nextEdgePos(this, c5); + const c6 = this.childAt(handle, 6, edgeStartIdx); + return fn(handle, c0, c1, c2, c3, c4, c5, c6); + } + + const children: number[] = []; + for (let i = 0; i < count; i++) { + const child = this.childAt(handle, i, edgeStartIdx); + children.push(child); + edgeStartIdx = nextEdgePos(this, child); + } + return fn(handle, ...children); + } + + /** + * Iterate over a list node in tuple-sized groups. + * Avoids allocation for arities up to 3. + */ + forEachTuple(handle: number, fn: (...children: number[]) => void): void { + const arity = this.tupleArity(handle); + if (arity <= 1) { + this.forEachChild(handle, child => fn(child)); + return; + } + + const count = this.childCount(handle); + let edgeStartIdx = this.startIdx(handle); + + if (arity === 2) { + for (let i = 0; i < count; i += 2) { + const c0 = this.childAt(handle, i, edgeStartIdx); + edgeStartIdx = nextEdgePos(this, c0); + const c1 = this.childAt(handle, i + 1, edgeStartIdx); + edgeStartIdx = nextEdgePos(this, c1); + fn(c0, c1); + } + return; + } + + if (arity === 3) { + for (let i = 0; i < count; i += 3) { + const c0 = this.childAt(handle, i, edgeStartIdx); + edgeStartIdx = nextEdgePos(this, c0); + const c1 = this.childAt(handle, i + 1, edgeStartIdx); + edgeStartIdx = nextEdgePos(this, c1); + const c2 = this.childAt(handle, i + 2, edgeStartIdx); + edgeStartIdx = nextEdgePos(this, c2); + fn(c0, c1, c2); + } + return; + } + + const tuple = new Array(arity); + for (let i = 0; i < count; ) { + for (let j = 0; j < arity; j++, i++) { + const child = this.childAt(handle, i, edgeStartIdx); + tuple[j] = child; + edgeStartIdx = nextEdgePos(this, child); + } + fn(...tuple); + } + } + /** * Whether the nonterminal at `handle` is a syntactic rule. * Uses cached metadata (ruleIsSyntactic), not string formatting. */ isSyntactic(handle: number): boolean { - const raw = handle & MASK; + const raw = rawHandle(handle); if (isTaggedTerminal(raw)) return false; const mrType = rawMatchRecordType(this._ctx.view, raw); if (mrType !== MatchRecordType.NONTERMINAL) return false; @@ -253,36 +348,6 @@ export class CstReader { } } -/** - * Create a CstReader from a MatchContext and Wasm exports. - * Validates packed-handle limits (heap size and input length). - * @internal - */ -export function createReaderFromCtx(ctx: MatchContext, exports: any): CstReader { - const heapTop = exports.__offset.value; - if (heapTop >= SHIFT) { - throw new Error( - `Wasm heap too large for CstReader: ${heapTop} bytes exceeds ${HANDLE_BITS}-bit limit (${SHIFT} bytes)` - ); - } - // Two constraints on input length: - // 1. startIdx must fit in (53 - HANDLE_BITS) bits when packed. - // 2. Tagged terminals encode as (matchLength << 2) | flags, so - // matchLength (≤ input.length) must fit in (HANDLE_BITS - 2) bits. - const startIdxLimit = 2 ** (53 - HANDLE_BITS); - const terminalLimit = 2 ** (HANDLE_BITS - 2); - const inputLimit = Math.min(startIdxLimit, terminalLimit); - if (ctx.input.length >= inputLimit) { - throw new Error( - `Input too long for CstReader: ${ctx.input.length} chars exceeds limit (${inputLimit} chars)` - ); - } - - const rootLeadingSpacesLen = Math.max(0, exports.getSpacesLenAt(0)); - const rootPtr = exports.bindingsAt(0); - return new CstReader(ctx, createHandle(rootPtr, rootLeadingSpacesLen), rootLeadingSpacesLen); -} - export function createReader(result: SucceededMatchResult): CstReader { const exports = (result.grammar as any)._instance.exports; return createReaderFromCtx(result._ctx, exports); diff --git a/packages/runtime/src/cstReaderFactory.ts b/packages/runtime/src/cstReaderFactory.ts new file mode 100644 index 00000000..4e05e2d0 --- /dev/null +++ b/packages/runtime/src/cstReaderFactory.ts @@ -0,0 +1,26 @@ +import type {MatchContext} from './miniohm.ts'; + +import {CstReader} from './cstReader.ts'; +import {createHandle, HANDLE_BITS, INPUT_LENGTH_LIMIT, SHIFT} from './cstReaderShared.ts'; + +/** + * Create a CstReader from a MatchContext and Wasm exports. + * Validates packed-handle limits (heap size and input length). + */ +export function createReaderFromCtx(ctx: MatchContext, exports: any): CstReader { + const heapTop = exports.__offset.value; + if (heapTop >= SHIFT) { + throw new Error( + `Wasm heap too large for CstReader: ${heapTop} bytes exceeds ${HANDLE_BITS}-bit limit (${SHIFT} bytes)` + ); + } + if (ctx.input.length >= INPUT_LENGTH_LIMIT) { + throw new Error( + `Input too long for CstReader: ${ctx.input.length} chars exceeds limit (${INPUT_LENGTH_LIMIT} chars)` + ); + } + + const rootLeadingSpacesLen = Math.max(0, exports.getSpacesLenAt(0)); + const rootPtr = exports.bindingsAt(0); + return new CstReader(ctx, createHandle(rootPtr, rootLeadingSpacesLen), rootLeadingSpacesLen); +} diff --git a/packages/runtime/src/cstReaderShared.ts b/packages/runtime/src/cstReaderShared.ts new file mode 100644 index 00000000..5ff82ffd --- /dev/null +++ b/packages/runtime/src/cstReaderShared.ts @@ -0,0 +1,47 @@ +const HANDLE_BITS = 27; +const SHIFT = 2 ** HANDLE_BITS; // 134217728 +const MASK = SHIFT - 1; // 0x7FFFFFF +const START_IDX_BITS = 53 - HANDLE_BITS; +const START_IDX_LIMIT = 2 ** START_IDX_BITS; +const TERMINAL_LENGTH_LIMIT = 2 ** (HANDLE_BITS - 2); +const INPUT_LENGTH_LIMIT = Math.min(START_IDX_LIMIT, TERMINAL_LENGTH_LIMIT); + +/** + * Pack a raw CST handle and startIdx into a single Number handle. + * Uses 53 of the available integer-precision bits in an IEEE 754 double + * (27 bits for the pointer, 26 bits for startIdx). + */ +function pack(rawHandle: number, startIdx: number): number { + return startIdx * SHIFT + rawHandle; +} + +/** Extract the raw CST pointer from a packed handle. */ +export function rawHandle(handle: number): number { + return handle & MASK; +} + +/** Extract the startIdx from a packed handle. */ +export function unpackStartIdx(handle: number): number { + const raw = rawHandle(handle); + return (handle - raw) / SHIFT; +} + +/** + * Create a packed handle from a raw pointer and startIdx. + * Validates that both values fit in the packed representation. + */ +export function createHandle(rawPtr: number, startIdx: number): number { + if (rawPtr >= SHIFT) { + throw new Error( + `Raw CST pointer ${rawPtr} exceeds ${HANDLE_BITS}-bit limit (max ${SHIFT - 1})` + ); + } + if (startIdx >= START_IDX_LIMIT) { + throw new Error( + `startIdx ${startIdx} exceeds ${START_IDX_BITS}-bit limit (max ${START_IDX_LIMIT - 1})` + ); + } + return pack(rawPtr, startIdx); +} + +export {HANDLE_BITS, INPUT_LENGTH_LIMIT, SHIFT}; diff --git a/packages/runtime/src/miniohm.ts b/packages/runtime/src/miniohm.ts index a3e01184..9e34e4ab 100644 --- a/packages/runtime/src/miniohm.ts +++ b/packages/runtime/src/miniohm.ts @@ -1,5 +1,7 @@ import {assert, checkNotNull} from './assert.ts'; -import {CstReader, createHandle, createReaderFromCtx, rawHandle} from './cstReader.ts'; +import {CstReader} from './cstReader.ts'; +import {createReaderFromCtx} from './cstReaderFactory.ts'; +import {createHandle, rawHandle} from './cstReaderShared.ts'; import {getLineAndColumn, getLineAndColumnMessage} from './extras.ts'; export const MATCH_RECORD_TYPE_MASK = 0b11; @@ -10,6 +12,9 @@ export const CST_TYPE_AND_DETAILS_OFFSET = 4; export const CST_CHILD_COUNT_OFFSET = 8; export const CST_CHILDREN_OFFSET = 16; +/** Bit 1 of a child slot is the HAS_LEADING_SPACES edge flag. */ +export const CST_HAS_LEADING_SPACES_FLAG = 2; + // Tagged terminal: (matchLength << 2) | 1. Bit 0 distinguishes from real pointers. // Bit 1 is the HAS_LEADING_SPACES edge flag (set on child slots, not on root handles). export function isTaggedTerminal(handle: number): boolean { @@ -630,7 +635,7 @@ class CstNodeImpl implements CstNodeBase { : new SeqNodeImpl(n.children, n.source, n.sourceString); return new OptNodeImpl(child, n.source, n.sourceString); } else if (type === CstNodeType.LIST) { - const arity = n._reader.details(n._handle); + const arity = n._reader.tupleArity(n._handle); if (arity <= 1) { return new ListNodeImpl(n.children, n.source, n.sourceString); } @@ -640,7 +645,7 @@ class CstNodeImpl implements CstNodeBase { // FIXME: We don't need any of this nonsense if we actually build the SeqNodes at parse time. const seqChildren = n.children.slice(i, i + arity); const endIdx = checkNotNull(seqChildren.at(-1)).source.endIdx; - const sourceString = n._reader.sourceSlice(startIdx, endIdx - startIdx); + const sourceString = n._reader.input.slice(startIdx, endIdx); arr.push(new SeqNodeImpl(seqChildren, {startIdx, endIdx}, sourceString)); startIdx = endIdx; } @@ -708,7 +713,10 @@ class LazySpacesNode implements NonterminalNode { get sourceString(): string { if (this._sourceString === undefined) { - this._sourceString = this._reader.sourceSlice(this._startIdx, this._matchLength); + this._sourceString = this._reader.input.slice( + this._startIdx, + this._startIdx + this._matchLength + ); } return this._sourceString; } @@ -914,7 +922,7 @@ export abstract class MatchResult { } get input(): string { - return (this.grammar as any)._input; + return this._ctx.input; } // `using` accesses [Symbol.dispose] at declaration time to get the @@ -1009,14 +1017,20 @@ export class FailedMatchResult extends MatchResult { this._rightmostFailurePosition = rightmostFailurePosition; } - /** @internal */ - private _assertAttached(property: string) { + private _assertMostRecent(method: string) { if (!this._attached) { throw new Error( - `Cannot access '${property}' after MatchResult has been disposed. ` + + `Cannot access '${method}' after MatchResult has been disposed. ` + `Access failure information before calling dispose(), or use result.use().` ); } + const stack = (this.grammar as any)._resultStack; + if (stack.at(-1) !== this) { + throw new Error( + `Cannot call ${method} on a FailedMatchResult that is not the most recent match. ` + + `Failure information is only available before a subsequent match() call.` + ); + } } getRightmostFailurePosition(): number { @@ -1025,11 +1039,11 @@ export class FailedMatchResult extends MatchResult { getRightmostFailures(): Failure[] { if (this._rightmostFailures === null) { - this._assertAttached('getRightmostFailures()'); + this._assertMostRecent('getRightmostFailures()'); const {exports} = (this.grammar as any)._instance; const ruleIds = (this.grammar as any)._ruleIds; const ruleNames = (this.grammar as any)._ruleNames; - const inputLength = (this.grammar as any)._input.length; + const inputLength = this._ctx.input.length; exports.recordFailures(inputLength, ruleIds.get(ruleNames[0])); // Use a Map to deduplicate by description while preserving fluffy status. diff --git a/packages/runtime/src/unstableDebug.ts b/packages/runtime/src/unstableDebug.ts index af2c4797..ae6b0def 100644 --- a/packages/runtime/src/unstableDebug.ts +++ b/packages/runtime/src/unstableDebug.ts @@ -1,3 +1,4 @@ +import {CST_HAS_LEADING_SPACES_FLAG} from './miniohm.ts'; import type {SucceededMatchResult} from './miniohm.ts'; const MATCH_RECORD_TYPE_MASK = 0b11; @@ -108,8 +109,7 @@ function walkRecordTree( // Tagged terminal (bit 0 = 1). Bit 1 may be the edge flag — not a heap object either way. stats.countByType.terminal++; } else { - // Heap pointer — strip bit 1 (HAS_LEADING_SPACES edge flag). - const childPtr = slot & ~2; + const childPtr = slot & ~CST_HAS_LEADING_SPACES_FLAG; if (!visited.has(childPtr)) { visited.add(childPtr); stack.push(childPtr); diff --git a/packages/semantics/bench.ts b/packages/semantics/bench.ts new file mode 100644 index 00000000..8770a907 --- /dev/null +++ b/packages/semantics/bench.ts @@ -0,0 +1,108 @@ +import {readFileSync} from 'node:fs'; +import process from 'node:process'; + +import {Bench} from 'tinybench'; +import * as ohm from '@ohm-js/compiler/compat'; +import {createReader} from 'ohm-js/cstReader'; + +import {createOperation} from './src/index.ts'; +import {createReaderOperation} from './src/reader.ts'; + +const smallSize = process.argv.includes('--small-size'); + +const scriptRel = (relPath: string) => new URL(relPath, import.meta.url); +const es5Source = readFileSync(scriptRel('../../examples/ecmascript/src/es5.ohm'), 'utf8'); +const g = ohm.grammars(es5Source).ES5; + +const input = smallSize + ? 'var x = 1 + 2;' + : readFileSync(scriptRel('../compiler/test/data/_underscore-1.8.3.js'), 'utf8'); + +// --- CstNode-based (createOperation) --- + +const countNodesCstNode = createOperation('countNodes', { + _nonterminal(ctx, ...children) { + let sum = 1; + for (const c of children) sum += countNodesCstNode(c); + return sum; + }, + _terminal(ctx) { + return 1; + }, + _default(ctx, ...children) { + let sum = 1; + for (const c of children) sum += countNodesCstNode(c); + return sum; + }, +}); + +// --- CstReader-based (createReaderOperation) --- + +let _rd: ReturnType; + +const countNodesCstReader = createReaderOperation('countNodes', { + _nonterminal(h) { + let sum = 1; + _rd.forEachChild(h, child => { + sum += countNodesCstReader(_rd, child); + }); + return sum; + }, + _terminal(h) { + return 1; + }, + _default(h) { + let sum = 1; + _rd.forEachChild(h, child => { + sum += countNodesCstReader(_rd, child); + }); + return sum; + }, +}); + +// --- Benchmark --- + +const opts = { + afterEach() { + process.stderr.write('.'); + }, +}; + +const bench = new Bench({ + iterations: smallSize ? 1 : 10, + time: 0, + warmup: !smallSize, + throws: true, +}); + +bench.add( + 'createOperation (CstNode)', + () => g.match(input).use((r: any) => countNodesCstNode(r.getCstRoot())), + opts +); + +bench.add( + 'createReaderOperation (CstReader)', + () => + g.match(input).use((r: any) => { + _rd = createReader(r); + return countNodesCstReader(_rd, _rd.root); + }), + opts +); + +console.log(`Input: ${smallSize ? 'small' : 'underscore-1.8.3.js'} (${input.length} bytes)\n`); + +(async () => { + await bench.run(); + process.stderr.write('\n'); + + for (const task of bench.tasks) { + const {mean, sd, samplesCount} = task.result!.latency; + console.log(`${task.name}: ${mean.toFixed(0)}ms ± ${sd.toFixed(0)}ms (n=${samplesCount})`); + } + + const cstNodeMean = bench.tasks[0].result!.latency.mean; + const cstReaderMean = bench.tasks[1].result!.latency.mean; + console.log(`\nSpeedup: ${(cstNodeMean / cstReaderMean).toFixed(2)}x`); +})(); diff --git a/packages/semantics/package.json b/packages/semantics/package.json index 37efaf6a..d4fa353e 100644 --- a/packages/semantics/package.json +++ b/packages/semantics/package.json @@ -15,13 +15,24 @@ "author": "Patrick Dubroy ", "type": "module", "main": "dist/index.js", + "exports": { + ".": { + "types": "./dist/src/index.d.ts", + "default": "./dist/src/index.js" + }, + "./reader": { + "types": "./dist/src/reader.d.ts", + "default": "./dist/src/reader.js" + } + }, "scripts": { "build": "tsc", - "test": "ava" + "test": "ava && node --experimental-strip-types bench.ts --small-size" }, "devDependencies": { "@ohm-js/compiler": "workspace:^", - "ava": "^6.0.0" + "ava": "^6.0.0", + "tinybench": "^6.0.0" }, "peerDependencies": { "@ohm-js/compiler": "workspace:^", diff --git a/packages/semantics/src/index.test.ts b/packages/semantics/src/index.test.ts index 3e07c676..f49bc635 100644 --- a/packages/semantics/src/index.test.ts +++ b/packages/semantics/src/index.test.ts @@ -1,10 +1,9 @@ /* global URL */ import * as ohm from '@ohm-js/compiler/compat'; -import type {CstNode, ListNode, SucceededMatchResult, TerminalNode} from 'ohm-js'; +import type {CstNode} from 'ohm-js'; import test from 'ava'; import {readFileSync} from 'node:fs'; -import * as ohmJs from 'ohm-js'; import type {Operation, VisitorCtx} from './types.ts'; import {createOperation} from './index.ts'; @@ -103,3 +102,35 @@ test('it handles v17 CSTs', t => { t.fail('parse failed'); } }); + +// Regression: a missing-action error in a nested call should not corrupt +// the global action stack (the finally block should only pop if this frame pushed). +test('missing action does not corrupt the action stack', t => { + // 'start' has an action that catches the missing-action error for 'broken' + // (which has 2 children, so no default action applies) and then visits + // 'alsoBroken' (also 2 children, no action). + // With the bug: broken's finally pops start's frame, then start's finally + // pops from empty. When alsoBroken throws, the stack trace is empty — + // it should still show start. + const twoChildG = ohm.grammar( + 'G { start = broken alsoBroken broken = "a" "b" alsoBroken = "c" "d" }' + ); + const op: Operation = createOperation('op', { + start(ctx, broken, alsoBroken) { + try { + op(broken); + } catch {} + return op(alsoBroken); + }, + _terminal(ctx) { + return (ctx.thisNode as any).sourceString; + }, + }); + const r = twoChildG.match('abcd'); + assert(r.succeeded(), 'match should succeed'); + const err = t.throws(() => op(r.getCstRoot()), { + message: /missing semantic action: alsoBroken/, + }); + // The error trace should show that we're inside 'start'. + t.regex(err!.message, /op > start/); +}); diff --git a/packages/semantics/src/index.ts b/packages/semantics/src/index.ts index 9d89c3f7..8b7776aa 100644 --- a/packages/semantics/src/index.ts +++ b/packages/semantics/src/index.ts @@ -1,5 +1,6 @@ import type {CstNode, NonterminalNode} from 'ohm-js'; import type {ActionDict, Operation, VisitorCtx} from './types.ts'; +export type {Operation} from './types.ts'; const globalActionStack: [string, string, string][] = []; @@ -21,6 +22,7 @@ export function createOperation>( }; // Ported from Operation.execute in ohm-js/src/Semantics.js + const stackLen = globalActionStack.length; try { // Look for a semantic action whose name matches the node's constructor name, which is either // the name of a rule in the grammar, or '_terminal' (for a terminal node), or '_iter' (for an @@ -66,7 +68,7 @@ export function createOperation>( throw new Error(`missing semantic action: ${ctorName}` + getActionStackTrace()); // End inlined logic } finally { - globalActionStack.pop(); + globalActionStack.length = stackLen; } }; return doIt; diff --git a/packages/semantics/src/reader.test.ts b/packages/semantics/src/reader.test.ts new file mode 100644 index 00000000..fa7621e2 --- /dev/null +++ b/packages/semantics/src/reader.test.ts @@ -0,0 +1,100 @@ +/* global URL */ + +import * as ohm from '@ohm-js/compiler/compat'; +import {createReader} from 'ohm-js/cstReader'; +import test from 'ava'; +import {readFileSync} from 'node:fs'; + +import type {ReaderOperation} from './reader.ts'; +import {createReaderOperation} from './reader.ts'; + +const scriptRel = (relPath: string) => new URL(relPath, import.meta.url); + +test('reader-based: arithmetic', t => { + const g2 = ohm.grammar(readFileSync(scriptRel('../../ohm-js/test/arithmetic.ohm'), 'utf8')); + g2.match('1+(2*3)').use(r => { + if (!r.succeeded()) return t.fail('parse failed'); + const rd = createReader(r); + + const evalIt: ReaderOperation = createReaderOperation('evalIt', { + addExp_plus(h, a, _, b) { + return evalIt(rd, a) + evalIt(rd, b); + }, + addExp_minus(h, a, _, b) { + return evalIt(rd, a) - evalIt(rd, b); + }, + mulExp_times(h, a, _, b) { + return evalIt(rd, a) * evalIt(rd, b); + }, + mulExp_divide(h, a, _, b) { + return evalIt(rd, a) / evalIt(rd, b); + }, + priExp_paren(h, _, e, _2) { + return evalIt(rd, e); + }, + number(h, _) { + return parseInt(rd.sourceString(h), 10); + }, + }); + t.is(evalIt(rd, rd.root), 7); + }); +}); + +test('reader-based: list and opt', t => { + const g = ohm.grammar(String.raw` + G { + Start = ~end #"a" &(letter "c") ("b"+ letter?)* punc? + punc = ("!" space?)+ + } + `); + + g.match('abcbc!!').use(r => { + if (!r.succeeded()) return t.fail('parse failed'); + const rd = createReader(r); + + const reversed: ReaderOperation = createReaderOperation('reversed', { + Start(h, a, list, opt) { + const parts: string[] = []; + rd.forEachTuple(list, (b, optLetter) => { + parts.push(reversed(rd, optLetter) + reversed(rd, b)); + }); + return reversed(rd, opt) + parts.reverse().join('') + reversed(rd, a); + }, + punc(h, list) { + return reversed(rd, list); + }, + _list(h) { + const parts: string[] = []; + rd.forEachTuple(h, (...children) => { + let text = ''; + for (const child of children) { + text += reversed(rd, child); + } + parts.push(text); + }); + return parts.join(''); + }, + _opt(h) { + if (!rd.isPresent(h)) return ''; + return rd.withChildren(h, (_handle, ...children) => { + let text = ''; + for (const child of children) { + text += reversed(rd, child); + } + return text; + }); + }, + _terminal(h) { + return rd.sourceString(h); + }, + _default(h) { + let result = ''; + rd.forEachChild(h, child => { + result += reversed(rd, child); + }); + return result; + }, + }); + t.is(reversed(rd, rd.root), '!!cbcba'); + }); +}); diff --git a/packages/semantics/src/reader.ts b/packages/semantics/src/reader.ts new file mode 100644 index 00000000..ee7aae50 --- /dev/null +++ b/packages/semantics/src/reader.ts @@ -0,0 +1,109 @@ +import type {CstReader} from 'ohm-js/cstReader'; +import {CstNodeType} from 'ohm-js/cstReader'; + +export type ReaderActionDict = { + _list?: (handle: number) => R; + _nonterminal?: (handle: number) => R; + _opt?: (handle: number) => R; + _terminal?: (handle: number) => R; + _default?: (handle: number) => R; + [ruleName: string]: ((handle: number, ...children: number[]) => R) | undefined; +}; + +export type ReaderOperation = (reader: CstReader, handle: number) => R; + +type ActionFn = (handle: number, ...children: number[]) => R; + +// Sentinel values used in the dispatch table for fallback actions. +const NO_ACTION = 0; +const USE_NONTERMINAL = 1; +const USE_DEFAULT = 2; + +export function createReaderOperation( + name: string, + actions: ReaderActionDict +): ReaderOperation { + // Lazily-built dispatch table: actionTable[ruleId] is either an action + // function or a sentinel (NO_ACTION / USE_NONTERMINAL / USE_DEFAULT). + let actionTable: (ActionFn | number)[] | undefined; + let cachedRuleNames: readonly string[] | undefined; + const listAction = actions._list; + const terminalAction = actions._terminal; + const nonterminalAction = actions._nonterminal; + const optAction = actions._opt; + const defaultAction = actions._default; + + function fail(reader: CstReader, handle: number): never { + throw new Error(`missing semantic action for '${reader.ctorName(handle)}' in '${name}'`); + } + + function buildTable(ruleNames: readonly string[]): (ActionFn | number)[] { + const table: (ActionFn | number)[] = new Array(ruleNames.length); + for (let i = 0; i < ruleNames.length; i++) { + const ctorName = ruleNames[i].split('<')[0]; + const action = actions[ctorName]; + if (action) { + table[i] = action; + } else if (nonterminalAction) { + table[i] = USE_NONTERMINAL; + } else if (defaultAction) { + table[i] = USE_DEFAULT; + } else { + table[i] = NO_ACTION; + } + } + return table; + } + + function getTable(reader: CstReader): (ActionFn | number)[] { + const ruleNames = reader.ruleNames; + if (actionTable && cachedRuleNames === ruleNames) return actionTable; + cachedRuleNames = ruleNames; + actionTable = buildTable(ruleNames); + return actionTable; + } + + const doIt: ReaderOperation = (reader: CstReader, handle: number): R => { + const nodeType = reader.type(handle); + + // Terminal — no children, no table lookup needed. + if (nodeType === CstNodeType.TERMINAL) { + if (terminalAction) return terminalAction(handle); + if (defaultAction) return defaultAction(handle); + return fail(reader, handle); + } + + if (nodeType === CstNodeType.LIST) { + if (listAction) return listAction(handle); + if (defaultAction) return defaultAction(handle); + return fail(reader, handle); + } + + if (nodeType === CstNodeType.OPT) { + if (optAction) return optAction(handle); + if (defaultAction) return defaultAction(handle); + return fail(reader, handle); + } + + // Nonterminal — use dispatch table indexed by ruleId. + const table = getTable(reader); + const ruleId = reader.ruleId(handle); + const entry = table[ruleId]; + + if (typeof entry === 'function') { + return reader.withChildren(handle, entry); + } + if (entry === USE_NONTERMINAL) { + return nonterminalAction!(handle); + } + if (entry === USE_DEFAULT) { + return defaultAction!(handle); + } + if (reader.childCount(handle) === 1) { + return reader.withChildren(handle, (_handle, child) => doIt(reader, child)); + } + return fail(reader, handle); + }; + + return doIt; +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8c0d4c13..128f1555 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -356,6 +356,9 @@ importers: ava: specifier: ^6.0.0 version: 6.2.0(rollup@4.43.0) + tinybench: + specifier: ^6.0.0 + version: 6.0.0 packages/to-ast-compat: dependencies: