diff --git a/doc/releases/ohm-golang-v18.0.md b/doc/releases/ohm-golang-v18.0.md new file mode 100644 index 00000000..3d433800 --- /dev/null +++ b/doc/releases/ohm-golang-v18.0.md @@ -0,0 +1,361 @@ +# Ohm Go v18.0 — API Specification + +This document specifies the Go runtime API for Ohm v18. The Go implementation consumes `.wasm` files produced by the Ohm compiler. This document is the authoritative specification for the implementation; portions marked **[not yet implemented]** describe the target behavior. + +## TL;DR + +Prerequisites +- Golang +- Docker (for generating wasm files from ohm source) + +```bash +cd golang/examples +# Step 1 +go generate +# expected output 'Wrote Wasm to my-grammar.wasm' + +# Step 2 +go build +# note, if Step 1 failed, the build will fail with a message '... pattern my-grammar.wasm: no matching files found' + +# Step 3 +./ohmgo-examples +# or +./ohmgo-examples "" +# expected output, 'match failed' or 'match succeeded' +``` + +## Compiling grammars + +In v18 the grammar is compiled to a `.wasm` binary at build time, then loaded at runtime. The compiler is packaged as a Docker image. + +### Command line (Docker) + +The compiler is packaged as a Docker image. 
Mount your working directory to `/local` and use the `compile` subcommand: + +```bash +docker run --rm -v "$(pwd):/local" ohm:latest compile my-grammar.ohm +# writes my-grammar.wasm in the current directory +``` + +With an explicit output path or grammar name: + +```bash +docker run --rm -v "$(pwd):/local" ohm:latest compile -o out/my-grammar.wasm my-grammar.ohm +docker run --rm -v "$(pwd):/local" ohm:latest compile --grammarName MyGrammar my-grammar.ohm +``` + +### Build integration + +Add a `go:generate` directive so `go generate` keeps the `.wasm` up to date: + +**Note: this depends on the OS setting a PWD environment variable** + +```go +//go:generate docker run --rm -v "$PWD:/local" ohm:latest compile my-grammar.ohm +``` + +## Loading and using grammars + +```go +import ( + "context" + "os" + + goohm "github.com/ohmjs/goohm" +) + +wasmBytes, err := os.ReadFile("my-grammar.wasm") +if err != nil { + // handle +} + +ctx := context.Background() +g, err := goohm.NewGrammar(ctx, wasmBytes) +if err != nil { + // handle +} +defer g.Close() +``` + +`NewGrammar` compiles the Wasm module and initialises the runtime. Call `Close` when the grammar is no longer needed to release all WebAssembly resources. + +## MatchResult lifecycle + +Parse results live in Wasm linear memory and **must be explicitly disposed**. Results must be disposed in LIFO order (most recent first). + +### `defer` (recommended) + +```go +result, err := g.Match(input) +if err != nil { + // handle +} +defer result.Close() + +if result.Succeeded() { + // ... use result ... +} +``` + +### Explicit close + +When you need precise control over disposal order (e.g. nested matches): + +```go +result1, _ := g.Match(input1) +result2, _ := g.Match(input2) + +result2.Close() // must close most-recent first +result1.Close() +``` + +### Notes + +- Results must be disposed in LIFO order (most recent first). +- Failing to dispose a result will prevent subsequent `Match()` calls from succeeding. 
+ +## Matching with a start rule + +By default, `Match` uses the grammar's default start rule. Pass an optional start rule name to override: + +```go +result, err := g.Match(input, "MyRule") +``` + +## MatchResult API + +```go +type MatchResult struct { /* ... */ } + +func (r *MatchResult) Succeeded() bool +func (r *MatchResult) Failed() bool +func (r *MatchResult) Input() string +func (r *MatchResult) Close() + +// CST access (only valid when Succeeded() is true) +func (r *MatchResult) GetCstRoot() (*CstNode, error) +``` + +## CST nodes + +`MatchResult` gives you the CST directly — there is no Semantics layer. + +```go +result, err := g.Match(input) +if err != nil { + // handle +} +defer result.Close() + +if result.Succeeded() { + root, err := result.GetCstRoot() + if err != nil { + // handle + } + fmt.Println(root.CtorName()) // rule name + fmt.Println(root.SourceString()) // matched text + start, end := root.Source() + children := root.Children() +} +``` + +### Node types + +All nodes share: `CtorName()`, `SourceString()`, `Source()` (returns `startIdx, endIdx`), `Children()`. + +| Type | `CtorName()` | Description | +|------|-------------|-------------| +| `NonterminalNode` | rule name | Has `IsSyntactic()`, `IsLexical()`, `LeadingSpaces()` | +| `TerminalNode` | `"_terminal"` | Has `Value()` | +| `ListNode` | `"_list"` | From `*` or `+`. Has `Collect(cb)` **[not yet implemented]** | +| `OptNode` | `"_opt"` | From `?`. Has `IfPresent(cb, orElse)`, `IsPresent()`, `IsEmpty()` **[not yet implemented]** | +| `SeqNode` | `"_seq"` | Grouped sequence. Has `Unpack(cb)` **[not yet implemented]** | + +Type guards: + +```go +node.IsNonterminal() bool +node.IsTerminal() bool +node.IsList() bool +node.IsOptional() bool +node.IsSeq() bool // [not yet implemented] +``` + +### Arity + +- Iter (`*`/`+`) and Opt (`?`) nodes are **not flattened**. A rule `a b c*` produces 3 children; the third is a `ListNode`. +- Positive lookahead (`&e`) **does not bind a node**. 
+ +### Working with ListNode + +`Collect` maps over the items in a `ListNode`. When an item is a `SeqNode`, its children are unpacked as separate arguments to the callback. + +**[not yet implemented]** + +```go +// Signature (generic, exact form TBD with Go generics or interface{}) +func (n *CstNode) Collect(cb func(items ...*CstNode) any) []any + +// Example: rule like items = (name ":" value)* +items := listNode.Collect(func(parts ...*CstNode) any { + // parts[0] = name, parts[1] = ":", parts[2] = value + return map[string]string{ + "name": parts[0].SourceString(), + "value": parts[2].SourceString(), + } +}) +``` + +### Working with OptNode + +`IfPresent` calls the first callback if the option matched and the second (optional) callback otherwise. When the value is a `SeqNode`, its children are unpacked. + +**[not yet implemented]** + +```go +func (n *CstNode) IsPresent() bool +func (n *CstNode) IsEmpty() bool +func (n *CstNode) IfPresent(present func(parts ...*CstNode) any, orElse func() any) any + +// Example +val := optNode.IfPresent( + func(parts ...*CstNode) any { return parts[0].SourceString() }, + func() any { return "default" }, +) +``` + +### Working with SeqNode + +`Unpack` spreads the children of a `SeqNode` as arguments to the callback. + +**[not yet implemented]** + +```go +func (n *CstNode) Unpack(cb func(parts ...*CstNode)) + +seqNode.Unpack(func(parts ...*CstNode) { + left, op, right := parts[0], parts[1], parts[2] + _ = left; _ = op; _ = right +}) +``` + +### LeadingSpaces + +For syntactic rules, nonterminal nodes may carry implicit leading whitespace. `LeadingSpaces()` returns a `*CstNode` of type `ListNode` (possibly empty) representing those spaces. 
+ +**[not yet implemented as a public method]** + +```go +func (n *CstNode) LeadingSpaces() *CstNode // ListNode; nil if not applicable +``` + +## Error handling + +**[not yet implemented — fields below are the target API]** + +When `result.Failed()` is true, the following fields are available: + +```go +type MatchResult struct { /* ... */ } + +func (r *MatchResult) Message() string // Full message with line/col and input excerpt +func (r *MatchResult) ShortMessage() string // "Line 1, col 5: expected ..." +func (r *MatchResult) GetExpectedText() string // "letter or digit" +func (r *MatchResult) RightmostFailurePosition() int +func (r *MatchResult) RightmostFailures() []Failure +``` + +```go +result, _ := g.Match(input) +defer result.Close() + +if result.Failed() { + fmt.Println(result.Message()) + fmt.Println(result.ShortMessage()) + fmt.Println(result.GetExpectedText()) + fmt.Println(result.RightmostFailurePosition()) +} +``` + +## Syntactic vs. lexical rule classification + +`IsSyntactic()` checks whether the **first letter** of the rule name is an upper-case letter. Rule names that start with a non-letter character (e.g. `_ident`, `0digit`) are classified as lexical, not syntactic. + +```go +node.IsSyntactic() bool // true iff first letter in CtorName() is upper-case +node.IsLexical() bool // !IsSyntactic() for nonterminals +``` + +## Grammar introspection + +```go +g.GetRuleNames() []string // all rule names defined in the grammar +g.GetRuleId(name string) (int, bool) // rule name → internal id; false if not found +``` + +## Removed / not-yet-available APIs + +The following are not present in v18 (mirroring the JS v18 status): + +- **Semantics** — no `CreateSemantics`, `AddOperation`, `AddAttribute`. Traverse the CST directly. +- **Matcher** — no incremental parsing. +- **Tracing** — no `Trace()`. +- **Grammar introspection via `.Rules`** — use `GetRuleNames()` instead. 
+ +## Full example + +```go +package main + +import ( + "context" + "fmt" + "os" + + goohm "github.com/ohmjs/goohm" +) + +func main() { + wasmBytes, err := os.ReadFile("arithmetic.wasm") + if err != nil { + panic(err) + } + + ctx := context.Background() + g, err := goohm.NewGrammar(ctx, wasmBytes) + if err != nil { + panic(err) + } + defer g.Close() + + result, err := g.Match("1 + 2 * 3") + if err != nil { + panic(err) + } + defer result.Close() + + if result.Failed() { + fmt.Println("Parse failed:", result.Message()) + return + } + + root, err := result.GetCstRoot() + if err != nil { + panic(err) + } + + walk(root, 0) +} + +func walk(node *goohm.CstNode, depth int) { + indent := "" + for i := 0; i < depth; i++ { + indent += " " + } + fmt.Printf("%s%s %q\n", indent, node.CtorName(), node.SourceString()) + for _, child := range node.Children() { + walk(child, depth+1) + } +} +``` diff --git a/doc/releases/ohm-js-18.0.md b/doc/releases/ohm-js-18.0.md index 272215de..587e57c1 100644 --- a/doc/releases/ohm-js-18.0.md +++ b/doc/releases/ohm-js-18.0.md @@ -19,6 +19,13 @@ In v17, grammars were parsed at runtime. 
In v18, the recommendation is to compil npx ohm2wasm my-grammar.ohm # writes my-grammar.wasm ``` +or using the ohm docker image + +```bash +docker run --rm -v "$(pwd):/local" ohm:latest compile my-grammar.ohm +# writes my-grammar.wasm in the current directory +``` + ### Programmatic ```js diff --git a/go.work b/go.work new file mode 100644 index 00000000..5d076e17 --- /dev/null +++ b/go.work @@ -0,0 +1,9 @@ +go 1.24.2 + +use ( + ./golang/cli + ./golang/examples + ./golang/runtime + ./golang/test + ./packages/compiler/test/go +) diff --git a/go.work.sum b/go.work.sum new file mode 100644 index 00000000..ae1a3e30 --- /dev/null +++ b/go.work.sum @@ -0,0 +1,12 @@ +github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/UYA= +github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= +github.com/hashicorp/go-multierror v1.0.0 h1:iVjPR7a6H0tWELX5NxNe7bYopibicUzc7uPribsnS6o= +github.com/hashicorp/go-multierror v1.0.0/go.mod h1:dHtQlpGsu+cZNNAkkCN/P3hoUDHhCYQXV3UM06sGGrk= +github.com/millergarym/opts v1.2.4 h1:4txZ+77JE8jfP5HOoz+AqYzLmba6z1g1CwxETn+bEF0= +github.com/millergarym/opts v1.2.4/go.mod h1:7p7X/vlpKZmtaDFYKs956EujFqA6aCrOkcCaS6UBcR4= +github.com/posener/complete v1.2.2-0.20190308074557-af07aa5181b3 h1:GqpA1/5oN1NgsxoSA4RH0YWTaqvUlQNeOpHXD/JRbOQ= +github.com/posener/complete v1.2.2-0.20190308074557-af07aa5181b3/go.mod h1:6gapUrK/U1TAN7ciCoNRIdVC5sbdBTUh1DKN0g6uH7E= +github.com/tetratelabs/wazero v1.11.0 h1:+gKemEuKCTevU4d7ZTzlsvgd1uaToIDtlQlmNbwqYhA= +github.com/tetratelabs/wazero v1.11.0/go.mod h1:eV28rsN8Q+xwjogd7f4/Pp4xFxO7uOGbLcD/LzB1wiU= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= diff --git a/golang/cli/go.mod b/golang/cli/go.mod new file mode 100644 index 00000000..5a682db7 --- /dev/null +++ b/golang/cli/go.mod @@ -0,0 +1,3 @@ +module github.com/ohmjs/ohmgo + +go 1.24.2 diff --git 
a/golang/examples/.gitignore b/golang/examples/.gitignore new file mode 100644 index 00000000..b432a803 --- /dev/null +++ b/golang/examples/.gitignore @@ -0,0 +1,2 @@ +my-grammar.wasm +ohmgo-examples \ No newline at end of file diff --git a/golang/examples/generate.go b/golang/examples/generate.go new file mode 100644 index 00000000..0ab78719 --- /dev/null +++ b/golang/examples/generate.go @@ -0,0 +1,3 @@ +package main + +//go:generate docker run --rm -v "$PWD:/local" ohm:latest compile my-grammar.ohm diff --git a/golang/examples/go.mod b/golang/examples/go.mod new file mode 100644 index 00000000..9f35cbeb --- /dev/null +++ b/golang/examples/go.mod @@ -0,0 +1,3 @@ +module github.com/ohmjs/ohmgo-examples + +go 1.24.2 diff --git a/golang/examples/load_and_use.go b/golang/examples/load_and_use.go new file mode 100644 index 00000000..7898a435 --- /dev/null +++ b/golang/examples/load_and_use.go @@ -0,0 +1,39 @@ +package main + +import ( + "context" + _ "embed" + "log" + "os" + + goohm "github.com/ohmjs/goohm" +) + +//go:embed my-grammar.wasm +var wasmBytes []byte + +func main() { + // or, if you prefer, read the .wasm file from disk + // wasmBytes, err := os.ReadFile("my-grammar.wasm") + ctx := context.Background() + grmr, err := goohm.NewGrammar(ctx, wasmBytes) + if err != nil { + log.Fatalf("creating grammar: %v", err) + } + defer grmr.Close() + // use the grammar to match some input text + input := "Hello, world!" + if len(os.Args) > 1 && os.Args[1] != "" { + input = os.Args[1] + } + result, err := grmr.Match(input) + if err != nil { + log.Fatalf("matching: %v", err) + } + defer result.Close() + if !result.Succeeded() { + log.Printf("match failed") + os.Exit(1) + } + log.Printf("match succeeded") +} diff --git a/golang/examples/my-grammar.ohm b/golang/examples/my-grammar.ohm new file mode 100644 index 00000000..42637aaf --- /dev/null +++ b/golang/examples/my-grammar.ohm @@ -0,0 +1,9 @@ +MyGrammar { + Start = hello name "!"? 
+ hello = + | "hello," + | "Hello," + | "hello" + | "Hello" + name = letter+ +} diff --git a/golang/runtime/README.md b/golang/runtime/README.md new file mode 100644 index 00000000..66442943 --- /dev/null +++ b/golang/runtime/README.md @@ -0,0 +1,49 @@ +# miniohm-go + +A Go implementation of the [miniohm][] interface, for using Ohm grammars from Go. + +A _grammar blob_ is an [Ohm][] grammar that has been compiled to .wasm via the `@ohm-js/wasm` NPM package. To use a grammar blob to match some input, you need a miniohm implementation for your host language of choice (JavaScript, Go, Python, etc.). This package provides a miniohm implementation for the [Go Programming Language][go]. + +[miniohm]: https://github.com/ohmjs/ohm/blob/main/doc/design/miniohm.md +[Ohm]: https://ohmjs.org +[go]: https://go.dev/ + +## Overview + +- The `miniohm` module is in matcher.go and cst.go. +- An example can be found in cmd/example/main.go. + +## Compiling grammars to Wasm + +Use the `ohm2wasm` command from the `@ohm-js/wasm` NPM package to compile a .ohm file to a Wasm grammar blob. For example: + +``` +npx ohm2wasm myGrammar.ohm +``` + +See Makefile for an example. + +## Matching input + +Create a new `WasmMatcher` and use the `Match` function: + +```go +matcher := NewWasmMatcher(ctx) +err := matcher.LoadModule("path/to/grammar.wasm") +matcher.SetInput("text to match") +success, err := matcher.Match() +cstRoot, err := matcher.GetCstRoot() +``` + +## Walking the CST + +A full implementation of semantics, operations, etc. is not part of the miniohm interface. Instead, you can walk the CST (concrete syntax tree) directly using the CstNode interface. See cmd/example/main.go for an example. 
+ +## Developing + +Useful commands: + +```sh +make # Build +make test # Run tests +``` diff --git a/golang/runtime/cst.go b/golang/runtime/cst.go new file mode 100644 index 00000000..8161442c --- /dev/null +++ b/golang/runtime/cst.go @@ -0,0 +1,195 @@ +package goohm + +import ( + "unicode/utf16" + "unsafe" + + "github.com/tetratelabs/wazero/api" +) + +// CstNodeType constants matching the ohm-js CstNodeType enum. +const ( + CstNodeTypeNonterminal = 0 + CstNodeTypeTerminal = 1 + CstNodeTypeList = 2 + CstNodeTypeOpt = 3 +) + +const ( + matchRecordTypeMask = 3 + cstNodeHeaderSize = 16 + slotSize = 4 +) + +// cstContext holds shared state for all CstNodes from a single match result. +type cstContext struct { + ruleNames []string + memory api.Memory + inputUTF16 []uint16 +} + +// CstNode represents a node in the concrete syntax tree. +// Its API mirrors the ohm-js CstNode interface. +type CstNode struct { + ctx *cstContext + base uint32 + startIdx int // position in the input (UTF-16 code units) +} + +func newCstNode(ctx *cstContext, base uint32, startIdx int) *CstNode { + return &CstNode{ctx: ctx, base: base, startIdx: startIdx} +} + +// --- internal field accessors --- + +func (n *CstNode) typeAndDetails() int32 { + data, ok := n.ctx.memory.Read(n.base+8, 4) + if !ok { + return 0 + } + return readInt32(data, 0) +} + +func (n *CstNode) matchRecordType() int32 { + return n.typeAndDetails() & matchRecordTypeMask +} + +func (n *CstNode) ruleID() int32 { + return n.typeAndDetails() >> 2 +} + +func (n *CstNode) count() uint32 { + data, ok := n.ctx.memory.Read(n.base, 4) + if !ok { + return 0 + } + return readUint32(data, 0) +} + +// --- public API (matches the ohm-js CstNode interface) --- + +// Type returns the CstNodeType for this node. 
+func (n *CstNode) Type() int { + switch n.matchRecordType() { + case 0: + return CstNodeTypeNonterminal + case 1: + return CstNodeTypeTerminal + case 2: + return CstNodeTypeList + case 3: + return CstNodeTypeOpt + default: + return -1 + } +} + +// CtorName returns the constructor name for this node. +// For nonterminals this is the rule name; for other types it is +// "_terminal", "_list", or "_opt". +func (n *CstNode) CtorName() string { + switch n.Type() { + case CstNodeTypeNonterminal: + id := n.ruleID() + if int(id) < len(n.ctx.ruleNames) { + return n.ctx.ruleNames[id] + } + return "" + case CstNodeTypeTerminal: + return "_terminal" + case CstNodeTypeList: + return "_list" + case CstNodeTypeOpt: + return "_opt" + default: + return "" + } +} + +// MatchLength returns the number of UTF-16 code units consumed by this node. +func (n *CstNode) MatchLength() int { + data, ok := n.ctx.memory.Read(n.base+4, 4) + if !ok { + return 0 + } + return int(readUint32(data, 0)) +} + +// Source returns the start and end indices (UTF-16 code units) in the input. +func (n *CstNode) Source() (startIdx, endIdx int) { + return n.startIdx, n.startIdx + n.MatchLength() +} + +// SourceString returns the portion of the input matched by this node. +func (n *CstNode) SourceString() string { + start := n.startIdx + end := start + n.MatchLength() + if end > len(n.ctx.inputUTF16) { + end = len(n.ctx.inputUTF16) + } + if start >= end { + return "" + } + return string(utf16.Decode(n.ctx.inputUTF16[start:end])) +} + +// Value returns the matched text. 
+func (n *CstNode) Value() string { + return n.SourceString() +} + +func (n *CstNode) IsNonterminal() bool { return n.Type() == CstNodeTypeNonterminal } +func (n *CstNode) IsTerminal() bool { return n.Type() == CstNodeTypeTerminal } +func (n *CstNode) IsList() bool { return n.Type() == CstNodeTypeList } +func (n *CstNode) IsOptional() bool { return n.Type() == CstNodeTypeOpt } + +// IsSyntactic returns true if this is a nonterminal whose rule name starts +// with an ASCII uppercase letter (A-Z); only the first byte is examined. +func (n *CstNode) IsSyntactic() bool { + if !n.IsNonterminal() { + return false + } + name := n.CtorName() + return len(name) > 0 && name[0] >= 'A' && name[0] <= 'Z' +} + +// IsLexical returns true if this is a nonterminal that is not syntactic, +// i.e. its rule name does not start with an ASCII uppercase letter (A-Z). +func (n *CstNode) IsLexical() bool { + return n.IsNonterminal() && !n.IsSyntactic() +} + +func (n *CstNode) NumChildren() uint32 { + return n.count() +} + +// Children returns the child nodes; each child's startIdx is n.startIdx plus the match lengths of its preceding siblings, and a failed slot read returns only the children read so far. +func (n *CstNode) Children() []*CstNode { + count := n.count() + if count == 0 { + return nil + } + children := make([]*CstNode, count) + startIdx := n.startIdx + for i := uint32(0); i < count; i++ { + slotOffset := n.base + cstNodeHeaderSize + i*slotSize + data, ok := n.ctx.memory.Read(slotOffset, 4) + if !ok { + return children[:i] + } + childBase := readUint32(data, 0) + child := newCstNode(n.ctx, childBase, startIdx) + children[i] = child + startIdx += child.MatchLength() + } + return children +} + +// Helpers that reinterpret 4 bytes at data[offset] in host byte order via unsafe; NOTE(review): this yields little-endian values only on little-endian hosts. 
+func readUint32(data []byte, offset uint32) uint32 { + return *(*uint32)(unsafe.Pointer(&data[offset])) +} + +func readInt32(data []byte, offset uint32) int32 { + return *(*int32)(unsafe.Pointer(&data[offset])) +} diff --git a/golang/runtime/go.mod b/golang/runtime/go.mod new file mode 100644 index 00000000..eaffd370 --- /dev/null +++ b/golang/runtime/go.mod @@ -0,0 +1,7 @@ +module github.com/ohmjs/goohm + +go 1.24.2 + +require github.com/tetratelabs/wazero v1.11.0 + +require golang.org/x/sys v0.38.0 // indirect diff --git a/golang/runtime/go.sum b/golang/runtime/go.sum new file mode 100644 index 00000000..6cfca8b4 --- /dev/null +++ b/golang/runtime/go.sum @@ -0,0 +1,4 @@ +github.com/tetratelabs/wazero v1.11.0 h1:+gKemEuKCTevU4d7ZTzlsvgd1uaToIDtlQlmNbwqYhA= +github.com/tetratelabs/wazero v1.11.0/go.mod h1:eV28rsN8Q+xwjogd7f4/Pp4xFxO7uOGbLcD/LzB1wiU= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= diff --git a/golang/runtime/grammar.go b/golang/runtime/grammar.go new file mode 100644 index 00000000..f8cf832c --- /dev/null +++ b/golang/runtime/grammar.go @@ -0,0 +1,513 @@ +package goohm + +import ( + "context" + "encoding/binary" + "fmt" + "io" + "regexp" + "unicode" + "unicode/utf16" + + "github.com/tetratelabs/wazero" + "github.com/tetratelabs/wazero/api" +) + +// Grammar is a Go implementation of the JavaScript Grammar class from miniohm. +type Grammar struct { + runtime wazero.Runtime + module api.Module + input string + inputUTF16 []uint16 // UTF-16 code units of the current input + ctx context.Context + ruleIds map[string]int + ruleNames []string + strings []string // strings table from the custom section + resultStack []*MatchResult +} + +// GetModule returns the WebAssembly module +func (g *Grammar) GetModule() api.Module { + return g.module +} + +// NewGrammar compiles and instantiates a grammar from the given Wasm bytes. 
+// This parallels Grammar.instantiate() in the JS API. +func NewGrammar(ctx context.Context, wasmBytes []byte) (*Grammar, error) { + config := wazero.NewRuntimeConfig().WithCustomSections(true) + + g := &Grammar{ + runtime: wazero.NewRuntimeWithConfig(ctx, config), + ctx: ctx, + ruleIds: make(map[string]int), + } + + // Create the env module with the abort function + _, err := g.runtime.NewHostModuleBuilder("env"). + NewFunctionBuilder(). + WithFunc(func(a, b, c, d int32) { + panic("WebAssembly module aborted execution") + }). + Export("abort"). + Instantiate(g.ctx) + if err != nil { + return nil, fmt.Errorf("failed to create env module: %v", err) + } + + // Create the ohmRuntime module with required host functions + _, err = g.runtime.NewHostModuleBuilder("ohmRuntime"). + NewFunctionBuilder(). + WithFunc(func(ctx context.Context, mod api.Module, dest, length uint32) uint32 { + return g.fillInputBuffer(ctx, mod, dest, length) + }). + Export("fillInputBuffer"). + NewFunctionBuilder(). + WithFunc(func(ctx context.Context, mod api.Module, categoryBitmap uint32) uint32 { + return g.matchUnicodeChar(ctx, mod, categoryBitmap) + }). + Export("matchUnicodeChar"). + NewFunctionBuilder(). + WithFunc(func(ctx context.Context, mod api.Module, stringIdx uint32) uint32 { + return g.matchCaseInsensitive(ctx, mod, stringIdx) + }). + Export("matchCaseInsensitive"). 
+ Instantiate(g.ctx) + if err != nil { + return nil, fmt.Errorf("failed to create ohmRuntime module: %v", err) + } + + // Compile the module to access the custom sections + compiledModule, err := g.runtime.CompileModule(g.ctx, wasmBytes) + if err != nil { + return nil, fmt.Errorf("error compiling module: %v", err) + } + + // Parse custom sections + customSections := compiledModule.CustomSections() + if customSections == nil { + return nil, fmt.Errorf("no custom sections found in module") + } + + for _, section := range customSections { + switch section.Name() { + case "ruleNames": + g.ruleNames, err = parseLEB128Strings(section.Data()) + if err != nil { + return nil, fmt.Errorf("failed to parse ruleNames: %v", err) + } + case "strings": + g.strings, err = parseLEB128Strings(section.Data()) + if err != nil { + return nil, fmt.Errorf("failed to parse strings: %v", err) + } + } + } + + if g.ruleNames == nil { + return nil, fmt.Errorf("required custom section 'ruleNames' not found") + } + + // Instantiate the module + g.module, err = g.runtime.InstantiateModule(g.ctx, compiledModule, wazero.NewModuleConfig()) + if err != nil { + return nil, fmt.Errorf("error instantiating module: %v", err) + } + + // Build the ruleIds map + g.ruleIds = make(map[string]int, len(g.ruleNames)) + for i, name := range g.ruleNames { + g.ruleIds[name] = i + } + + return g, nil +} + +// parseLEB128Strings parses a LEB128-encoded vector of strings from a custom section. +// Used for both ruleNames and strings sections. 
+func parseLEB128Strings(data []byte) ([]string, error) { + if len(data) == 0 { + return nil, fmt.Errorf("empty custom section data") + } + + numUint64, bytesRead := binary.Uvarint(data) + if bytesRead <= 0 { + return nil, fmt.Errorf("failed to read count: %v", io.ErrUnexpectedEOF) + } + if numUint64 > uint64(^uint32(0)) { + return nil, fmt.Errorf("count exceeds maximum uint32 value") + } + + num := uint32(numUint64) + data = data[bytesRead:] + + result := make([]string, num) + for i := uint32(0); i < num; i++ { + lenUint64, bytesRead := binary.Uvarint(data) + if bytesRead <= 0 { + return nil, fmt.Errorf("failed to read string length: %v", io.ErrUnexpectedEOF) + } + if lenUint64 > uint64(^uint32(0)) { + return nil, fmt.Errorf("string length exceeds maximum uint32 value") + } + + strLen := uint32(lenUint64) + data = data[bytesRead:] + + if uint64(len(data)) < uint64(strLen) { + return nil, fmt.Errorf("buffer too small to read string bytes") + } + + result[i] = string(data[:strLen]) + data = data[strLen:] + } + + return result, nil +} + +/* + * Wasm heap memory management + * =========================== + * + * The Wasm module uses a bump-pointer allocator (AssemblyScript's "stub" + * runtime). Each match() call allocates a memo table and CST nodes on + * the Wasm heap. There is no way to free individual allocations — you + * can only reset the bump pointer. + * + * To allow incremental freeing, we exploit two facts: + * + * 1. MatchResult disposal is LIFO — you must dispose the most recent + * result first (enforced by Dispose). + * + * 2. Allocations for match N are always contiguous and sit above + * match N-1's allocations on the heap. + * + * Before each match, we snapshot the bump pointer (`__offset`) as a + * "watermark" and store it on the MatchResult. On dispose, we reset the + * bump pointer to that watermark, freeing exactly that match's + * allocations while keeping earlier results intact. 
+ */ + +// readOffset reads the __offset global (bump-pointer allocator position). +func (g *Grammar) readOffset() uint64 { + return g.module.ExportedGlobal("__offset").(api.MutableGlobal).Get() +} + +// writeOffset sets the __offset global (bump-pointer allocator position). +func (g *Grammar) writeOffset(val uint64) { + g.module.ExportedGlobal("__offset").(api.MutableGlobal).Set(val) +} + +// Match matches the input against the grammar, using the given start rule +// (or the grammar's default start rule if none is specified). +// Returns a MatchResult that can be inspected for success/failure and CST access. +// The caller must call Close() on the result when done. +func (g *Grammar) Match(input string, startRule ...string) (*MatchResult, error) { + g.input = input + g.inputUTF16 = utf16.Encode([]rune(input)) + + // Resolve the rule ID + var ruleId uint64 + startExpr := "" + if len(startRule) > 0 && startRule[0] != "" { + startExpr = startRule[0] + id, ok := g.ruleIds[startExpr] + if !ok { + return nil, fmt.Errorf("rule not found: %s", startExpr) + } + ruleId = uint64(id) + } + + // Snapshot the heap bump pointer before the match. + heapWatermark := g.readOffset() + + // Call match(inputLength, startRuleId) + matchFunc := g.module.ExportedFunction("match") + if matchFunc == nil { + return nil, fmt.Errorf("match function not exported by module") + } + + inputLength := uint64(len(g.inputUTF16)) + results, err := matchFunc.Call(g.ctx, inputLength, ruleId) + if err != nil { + return nil, fmt.Errorf("error calling match function: %v", err) + } + + result := &MatchResult{ + grammar: g, + input: input, + inputUTF16: g.inputUTF16, + succeeded: results[0] != 0, + startExpr: startExpr, + heapWatermark: heapWatermark, + } + g.resultStack = append(g.resultStack, result) + return result, nil +} + +// MatchResult holds the result of matching input against a grammar. +// The caller must call Close() when done to free the CST memory. 
+// Disposal is LIFO: the most recent MatchResult must be closed first. +type MatchResult struct { + grammar *Grammar + input string + inputUTF16 []uint16 + succeeded bool + startExpr string + heapWatermark uint64 +} + +// Succeeded returns true if the match was successful. +func (r *MatchResult) Succeeded() bool { return r.succeeded } + +// Failed returns true if the match was unsuccessful. +func (r *MatchResult) Failed() bool { return !r.succeeded } + +// Input returns the input string that was matched. +func (r *MatchResult) Input() string { return r.input } + +// Close frees the CST memory for this match result by resetting the +// bump-pointer allocator to the watermark recorded before this match. +// It panics unless r is the most recent MatchResult on the grammar's result stack (LIFO order). +func (r *MatchResult) Close() { + g := r.grammar + stack := g.resultStack + if len(stack) == 0 || stack[len(stack)-1] != r { + panic("You can only Close() the most recent MatchResult") + } + g.resultStack = stack[:len(stack)-1] + g.writeOffset(r.heapWatermark) +} + +func (r *MatchResult) cstContext() *cstContext { + return &cstContext{ + ruleNames: r.grammar.ruleNames, + memory: r.grammar.module.Memory(), + inputUTF16: r.inputUTF16, + } +} + +// GetCstRoot returns the root CST node by using bindingsAt(0) and handling +// the $spaces leading node, mirroring miniohm.ts _getCstRoot. 
+func (r *MatchResult) GetCstRoot() (*CstNode, error) { + g := r.grammar + ctx := r.cstContext() + + bindingsAtFunc := g.module.ExportedFunction("bindingsAt") + if bindingsAtFunc == nil { + return nil, fmt.Errorf("bindingsAt function not exported") + } + + getBindingsLengthFunc := g.module.ExportedFunction("getBindingsLength") + if getBindingsLengthFunc == nil { + return nil, fmt.Errorf("getBindingsLength function not exported") + } + + // Get first binding + results, err := bindingsAtFunc.Call(g.ctx, 0) + if err != nil { + return nil, fmt.Errorf("error calling bindingsAt(0): %v", err) + } + firstNode := newCstNode(ctx, uint32(results[0]), 0) + + if firstNode.CtorName() != "$spaces" { + return firstNode, nil + } + + // If first node is $spaces, the actual root is at binding 1 + lenResults, err := getBindingsLengthFunc.Call(g.ctx) + if err != nil { + return nil, fmt.Errorf("error calling getBindingsLength: %v", err) + } + if lenResults[0] <= 1 { + return nil, fmt.Errorf("expected more than 1 binding, got %d", lenResults[0]) + } + + results, err = bindingsAtFunc.Call(g.ctx, 1) + if err != nil { + return nil, fmt.Errorf("error calling bindingsAt(1): %v", err) + } + return newCstNode(ctx, uint32(results[0]), firstNode.MatchLength()), nil +} + +// GetAllBindings returns all top-level binding nodes from the match. +// For syntactic start rules, this is [$spaces, root]. +// For lexical start rules, this is just [root]. 
+func (r *MatchResult) GetAllBindings() ([]*CstNode, error) { + g := r.grammar + ctx := r.cstContext() + + bindingsAtFunc := g.module.ExportedFunction("bindingsAt") + if bindingsAtFunc == nil { + return nil, fmt.Errorf("bindingsAt function not exported") + } + + getBindingsLengthFunc := g.module.ExportedFunction("getBindingsLength") + if getBindingsLengthFunc == nil { + return nil, fmt.Errorf("getBindingsLength function not exported") + } + + lenResults, err := getBindingsLengthFunc.Call(g.ctx) + if err != nil { + return nil, fmt.Errorf("error calling getBindingsLength: %v", err) + } + numBindings := int(lenResults[0]) + + nodes := make([]*CstNode, numBindings) + startIdx := 0 + for i := 0; i < numBindings; i++ { + results, err := bindingsAtFunc.Call(g.ctx, uint64(i)) + if err != nil { + return nil, fmt.Errorf("error calling bindingsAt(%d): %v", i, err) + } + node := newCstNode(ctx, uint32(results[0]), startIdx) + nodes[i] = node + startIdx += node.MatchLength() + } + + return nodes, nil +} + +// GetRuleNames returns the list of rule names in the grammar +func (g *Grammar) GetRuleNames() []string { + return g.ruleNames +} + +// readPos reads the `pos` global from the Wasm module +func (g *Grammar) readPos() uint32 { + posGlobal := g.module.ExportedGlobal("pos") + if posGlobal == nil { + return 0 + } + return uint32(posGlobal.Get()) +} + +// writePos writes to the `pos` global in the Wasm module +func (g *Grammar) writePos(val uint32) { + posGlobal := g.module.ExportedGlobal("pos") + if posGlobal == nil { + return + } + posGlobal.(api.MutableGlobal).Set(uint64(val)) +} + +// fillInputBuffer writes UTF-16LE code units to the dest pointer provided by the Wasm module. 
+func (g *Grammar) fillInputBuffer(ctx context.Context, mod api.Module, dest, length uint32) uint32 { + memory := mod.Memory() + if memory == nil { + panic("WebAssembly module has no memory") + } + + // Write UTF-16LE code units + numUnits := uint32(len(g.inputUTF16)) + if length < numUnits { + numUnits = length + } + + for i := uint32(0); i < numUnits; i++ { + offset := dest + i*2 + memory.WriteUint16Le(offset, g.inputUTF16[i]) + } + + return numUnits +} + +// unicodeCategoryBitmapMap maps bit positions in the category bitmap to +// Unicode categories and properties that the Ohm runtime can match. +var unicodeCategoryBitmapMap = map[int]func(rune) bool{ + 0: func(r rune) bool { return unicode.IsLetter(r) }, // Lu|Ll|Lt|Lm|Lo + 1: func(r rune) bool { return unicode.Is(unicode.Nl, r) }, // Nl + 2: func(r rune) bool { return unicode.Is(unicode.Mn, r) }, // Mn + 3: func(r rune) bool { return unicode.Is(unicode.Mc, r) }, // Mc + 4: func(r rune) bool { return unicode.Is(unicode.Nd, r) }, // Nd + 5: func(r rune) bool { return unicode.Is(unicode.Pc, r) }, // Pc + 6: func(r rune) bool { return unicode.IsUpper(r) }, // Lu + 7: func(r rune) bool { return unicode.IsLower(r) }, // Ll + 8: func(r rune) bool { return unicode.Is(unicode.Lt, r) }, // Lt + 9: func(r rune) bool { return unicode.Is(unicode.Lm, r) }, // Lm + 10: func(r rune) bool { return unicode.Is(unicode.Lo, r) }, // Lo +} + +// matchUnicodeChar matches a character at the current `pos` against a Unicode category bitmap. +// Reads `pos` from the Wasm global, advances it on success. 
+func (g *Grammar) matchUnicodeChar(ctx context.Context, mod api.Module, categoryBitmap uint32) uint32 { + pos := g.readPos() + if int(pos) >= len(g.inputUTF16) { + return 0 + } + + // Decode the rune at pos (may be a surrogate pair) + codeUnit := g.inputUTF16[pos] + var r rune + var advance uint32 = 1 + if utf16.IsSurrogate(rune(codeUnit)) && int(pos+1) < len(g.inputUTF16) { + r = utf16.DecodeRune(rune(codeUnit), rune(g.inputUTF16[pos+1])) + advance = 2 + } else { + r = rune(codeUnit) + } + + // Check each category bit + for bit := 0; bit < 32; bit++ { + if categoryBitmap&(1<= len(g.strings) { + return 0 + } + + str := g.strings[stringIdx] + pos := g.readPos() + + // Build a regex for case-insensitive match + pattern := "(?i)" + regexp.QuoteMeta(str) + re := regexp.MustCompile(pattern) + + // Convert input from pos onward back to a Go string for matching + remaining := g.inputUTF16[pos:] + remainingStr := string(utf16.Decode(remaining)) + + loc := re.FindStringIndex(remainingStr) + if loc == nil || loc[0] != 0 { + return 0 + } + + // Advance pos by the number of UTF-16 code units consumed + matched := remainingStr[:loc[1]] + matchedUTF16 := utf16.Encode([]rune(matched)) + g.writePos(pos + uint32(len(matchedUTF16))) + return 1 +} + +// GetRuleId returns the ID for a rule name +func (g *Grammar) GetRuleId(ruleName string) (int, bool) { + id, ok := g.ruleIds[ruleName] + return id, ok +} + +// Close releases all resources +func (g *Grammar) Close() error { + if g.module != nil { + if err := g.module.Close(g.ctx); err != nil { + return err + } + } + + if g.runtime != nil { + if err := g.runtime.Close(g.ctx); err != nil { + return err + } + } + + return nil +} diff --git a/golang/test/go.mod b/golang/test/go.mod new file mode 100644 index 00000000..e1e17d11 --- /dev/null +++ b/golang/test/go.mod @@ -0,0 +1,7 @@ +module github.com/ohmjs/goohm-test + +go 1.24.2 + +require github.com/tetratelabs/wazero v1.11.0 + +require golang.org/x/sys v0.38.0 // indirect diff --git 
a/golang/test/helper.go b/golang/test/helper.go new file mode 100644 index 00000000..0eea01a6 --- /dev/null +++ b/golang/test/helper.go @@ -0,0 +1,46 @@ +package goohm_test + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + + "github.com/ohmjs/goohm" +) + +func compileAndLoad(source string) (*goohm.Grammar, error) { + // 1. create a temporary directory + tmpDir, err := os.MkdirTemp("", "goohm-test-*") + if err != nil { + return nil, fmt.Errorf("creating temp dir: %w", err) + } + defer os.RemoveAll(tmpDir) + // 2. write the source to the file source.ohm in the temporary directory + ohmFile := filepath.Join(tmpDir, "source.ohm") + if err := os.WriteFile(ohmFile, []byte(source), 0644); err != nil { + return nil, fmt.Errorf("writing source.ohm: %w", err) + } + // 3. use `docker run --rm -v "$PWD:/local" ohm:latest compile source.ohm` to create source.wasm file + // TODO: see if it would be better to use https://github.com/docker/go-sdk/tree/main/container or https://github.com/ory/dockertest + cmd := exec.Command("docker", "run", "--rm", + "-v", tmpDir+":/local", + "ohm:latest", "compile", "source.ohm") + if out, err := cmd.CombinedOutput(); err != nil { + return nil, fmt.Errorf("compiling source.ohm: %w\n%s", err, out) + } + // 4. read the .wasm file into memory + wasmFile := filepath.Join(tmpDir, "source.wasm") + wasmBytes, err := os.ReadFile(wasmFile) + if err != nil { + return nil, fmt.Errorf("reading source.wasm: %w", err) + } + // 5. 
create a Grammar from the .wasm bytes and return it + ctx := context.Background() + gmr, err := goohm.NewGrammar(ctx, wasmBytes) + if err != nil { + return nil, fmt.Errorf("creating grammar: %w", err) + } + return gmr, nil +} diff --git a/golang/test/wasm_test.go b/golang/test/wasm_test.go new file mode 100644 index 00000000..3bf61dd9 --- /dev/null +++ b/golang/test/wasm_test.go @@ -0,0 +1,86 @@ +package goohm_test + +import "testing" + +func AssertEqual[T comparable](t *testing.T, got, want T) { + t.Helper() // Marks this func as a helper for better line reporting + if got != want { + t.Errorf("got %v, want %v", got, want) + } +} + +func AssertTrue(t *testing.T, got bool) { + t.Helper() // Marks this func as a helper for better line reporting + if !got { + t.Errorf("expected true, got false") + } +} + +func TestCstReturns01(t *testing.T) { + gmr, err := compileAndLoad(`G { start = "a" | "b" }`) + if err != nil { + t.Fatalf("compiling and loading grammar: %v", err) + } + defer gmr.Close() + result, err := gmr.Match("a") + if err != nil { + t.Fatalf("matching: %v", err) + } + defer result.Close() + AssertTrue(t, result.Succeeded()) + root, err := result.GetCstRoot() + if err != nil { + t.Fatalf("getting CST root: %v", err) + } + // start + AssertEqual(t, root.NumChildren(), 1) + AssertEqual(t, root.MatchLength(), 1) + AssertEqual(t, root.CtorName(), "start") + // "a" + term := root.Children()[0] + AssertEqual(t, term.NumChildren(), 0) + AssertEqual(t, term.MatchLength(), 1) + AssertTrue(t, term.IsTerminal()) +} + +func TestCstReturns02(t *testing.T) { + gmr, err := compileAndLoad(` +G { + start = "a" b + b = "b" +}`) + if err != nil { + t.Fatalf("compiling and loading grammar: %v", err) + } + defer gmr.Close() + result, err := gmr.Match("ab") + if err != nil { + t.Fatalf("matching: %v", err) + } + defer result.Close() + AssertTrue(t, result.Succeeded()) + root, err := result.GetCstRoot() + if err != nil { + t.Fatalf("getting CST root: %v", err) + } + // start + 
AssertEqual(t, root.NumChildren(), 2) + AssertEqual(t, root.MatchLength(), 2) + AssertEqual(t, root.CtorName(), "start") + // "a" + childA := root.Children()[0] + AssertEqual(t, childA.NumChildren(), 0) + AssertEqual(t, childA.MatchLength(), 1) + AssertTrue(t, childA.IsTerminal()) + // NonterminalNode for b + childB := root.Children()[1] + AssertEqual(t, childB.NumChildren(), 1) + AssertEqual(t, childB.MatchLength(), 1) + AssertEqual(t, childB.CtorName(), "b") + // TerminalNode for "b" + term := childB.Children()[0] + AssertEqual(t, term.NumChildren(), 0) + AssertEqual(t, term.MatchLength(), 1) + AssertTrue(t, term.IsTerminal()) + AssertEqual(t, term.CtorName(), "_terminal") +}