From d94d23d2aef5de9587ec687d6d42b3c535a0916e Mon Sep 17 00:00:00 2001 From: James McHugh Date: Tue, 14 Apr 2026 19:30:25 +1000 Subject: [PATCH] fix(sqlite): convert ANTLR character indices to byte offsets for source extraction ANTLR's InputStream operates on characters (runes), so token positions returned by GetStop() are character indices. However, source.Pluck() slices Go strings using byte offsets. When multi-byte UTF-8 characters (e.g. em-dash U+2014) appear in SQL comments, this mismatch causes queries to be extracted at wrong positions -- truncating parameter placeholders and leaking comment text into generated Go code. Build a rune-to-byte offset lookup table and use it to translate ANTLR positions before storing StmtLocation and StmtLen. Fixes #4372 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../testdata/sqlite_unicode_comment/db/db.go | 31 ++++++++++++++++ .../sqlite_unicode_comment/db/models.go | 10 +++++ .../sqlite_unicode_comment/db/query.sql.go | 37 +++++++++++++++++++ .../testdata/sqlite_unicode_comment/query.sql | 7 ++++ .../sqlite_unicode_comment/schema.sql | 1 + .../testdata/sqlite_unicode_comment/sqlc.json | 16 ++++++++ internal/engine/sqlite/parse.go | 31 ++++++++++++++-- 7 files changed, 129 insertions(+), 4 deletions(-) create mode 100644 internal/endtoend/testdata/sqlite_unicode_comment/db/db.go create mode 100644 internal/endtoend/testdata/sqlite_unicode_comment/db/models.go create mode 100644 internal/endtoend/testdata/sqlite_unicode_comment/db/query.sql.go create mode 100644 internal/endtoend/testdata/sqlite_unicode_comment/query.sql create mode 100644 internal/endtoend/testdata/sqlite_unicode_comment/schema.sql create mode 100644 internal/endtoend/testdata/sqlite_unicode_comment/sqlc.json diff --git a/internal/endtoend/testdata/sqlite_unicode_comment/db/db.go b/internal/endtoend/testdata/sqlite_unicode_comment/db/db.go new file mode 100644 index 0000000000..cd5bbb8e08 --- /dev/null +++ b/internal/endtoend/testdata/sqlite_unicode_comment/db/db.go @@ -0,0 +1,31 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 + +package db + +import ( + "context" + "database/sql" +) + +type DBTX interface { + ExecContext(context.Context, string, ...interface{}) (sql.Result, error) + PrepareContext(context.Context, string) (*sql.Stmt, error) + QueryContext(context.Context, string, ...interface{}) (*sql.Rows, error) + QueryRowContext(context.Context, string, ...interface{}) *sql.Row +} + +func New(db DBTX) *Queries { + return &Queries{db: db} +} + +type Queries struct { + db DBTX +} + +func (q *Queries) WithTx(tx *sql.Tx) *Queries { + return &Queries{ + db: tx, + } +} diff --git a/internal/endtoend/testdata/sqlite_unicode_comment/db/models.go b/internal/endtoend/testdata/sqlite_unicode_comment/db/models.go new file mode 100644 index 0000000000..b8d77e1021 --- /dev/null +++ b/internal/endtoend/testdata/sqlite_unicode_comment/db/models.go @@ -0,0 +1,10 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 + +package db + +type Item struct { + ID int64 + Name string +} diff --git a/internal/endtoend/testdata/sqlite_unicode_comment/db/query.sql.go b/internal/endtoend/testdata/sqlite_unicode_comment/db/query.sql.go new file mode 100644 index 0000000000..2ad41c67c3 --- /dev/null +++ b/internal/endtoend/testdata/sqlite_unicode_comment/db/query.sql.go @@ -0,0 +1,37 @@ +// Code generated by sqlc. DO NOT EDIT. +// versions: +// sqlc v1.30.0 +// source: query.sql + +package db + +import ( + "context" +) + +const getItem = `-- name: GetItem :one +SELECT id, name FROM items WHERE id = ? +` + +func (q *Queries) GetItem(ctx context.Context, id int64) (Item, error) { + row := q.db.QueryRowContext(ctx, getItem, id) + var i Item + err := row.Scan(&i.ID, &i.Name) + return i, err +} + +const updateItem = `-- name: UpdateItem :exec + +UPDATE items SET name = ? WHERE id = ? +` + +type UpdateItemParams struct { + Name string + ID int64 +} + +// section — divider +func (q *Queries) UpdateItem(ctx context.Context, arg UpdateItemParams) error { + _, err := q.db.ExecContext(ctx, updateItem, arg.Name, arg.ID) + return err +} diff --git a/internal/endtoend/testdata/sqlite_unicode_comment/query.sql b/internal/endtoend/testdata/sqlite_unicode_comment/query.sql new file mode 100644 index 0000000000..dd4e5cdc29 --- /dev/null +++ b/internal/endtoend/testdata/sqlite_unicode_comment/query.sql @@ -0,0 +1,7 @@ +-- name: GetItem :one +SELECT id, name FROM items WHERE id = ?; + +-- section — divider + +-- name: UpdateItem :exec +UPDATE items SET name = ? WHERE id = ?; diff --git a/internal/endtoend/testdata/sqlite_unicode_comment/schema.sql b/internal/endtoend/testdata/sqlite_unicode_comment/schema.sql new file mode 100644 index 0000000000..93e4173ad2 --- /dev/null +++ b/internal/endtoend/testdata/sqlite_unicode_comment/schema.sql @@ -0,0 +1 @@ +CREATE TABLE items (id INTEGER PRIMARY KEY, name TEXT NOT NULL); diff --git a/internal/endtoend/testdata/sqlite_unicode_comment/sqlc.json b/internal/endtoend/testdata/sqlite_unicode_comment/sqlc.json new file mode 100644 index 0000000000..cbd787d930 --- /dev/null +++ b/internal/endtoend/testdata/sqlite_unicode_comment/sqlc.json @@ -0,0 +1,16 @@ +{ + "version": "2", + "sql": [ + { + "engine": "sqlite", + "queries": "query.sql", + "schema": "schema.sql", + "gen": { + "go": { + "package": "db", + "out": "db" + } + } + } + ] +} diff --git a/internal/engine/sqlite/parse.go b/internal/engine/sqlite/parse.go index 13425b156e..2a42909e94 100644 --- a/internal/engine/sqlite/parse.go +++ b/internal/engine/sqlite/parse.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "io" + "unicode/utf8" "github.com/antlr4-go/antlr/v4" "github.com/sqlc-dev/sqlc/internal/engine/sqlite/parser" @@ -42,7 +43,8 @@ func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) { if err != nil { return nil, err } - input := antlr.NewInputStream(string(blob)) + src := string(blob) + input := antlr.NewInputStream(src) lexer := parser.NewSQLiteLexer(input) stream := antlr.NewCommonTokenStream(lexer, 0) pp := parser.NewSQLiteParser(stream) @@ -57,6 +59,13 @@ func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) { if !ok { return nil, fmt.Errorf("expected ParserContext; got %T\n", tree) } + + // ANTLR's InputStream operates on characters (runes), so token + // positions are character indices. source.Pluck slices with byte + // offsets. Build a lookup table so we can translate correctly when + // the input contains multi-byte UTF-8 characters (e.g. em-dash). + runeToByteOffset := buildRuneToByteOffsets(src) + var stmts []ast.Statement for _, istmt := range pctx.AllSql_stmt_list() { list, ok := istmt.(*parser.Sql_stmt_listContext) @@ -72,12 +81,13 @@ func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) { loc = stmt.GetStop().GetStop() + 2 continue } - len := (stmt.GetStop().GetStop() + 1) - loc + byteLoc := runeToByteOffset[loc] + byteEnd := runeToByteOffset[stmt.GetStop().GetStop()+1] stmts = append(stmts, ast.Statement{ Raw: &ast.RawStmt{ Stmt: out, - StmtLocation: loc, - StmtLen: len, + StmtLocation: byteLoc, + StmtLen: byteEnd - byteLoc, }, }) loc = stmt.GetStop().GetStop() + 2 @@ -86,6 +96,19 @@ func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) { return stmts, nil } +// buildRuneToByteOffsets returns a slice mapping rune index to byte offset. +// Entry i holds the byte offset where rune i begins; the final entry holds +// len(s) so that an exclusive end position can be looked up safely. +func buildRuneToByteOffsets(s string) []int { + n := utf8.RuneCountInString(s) + offsets := make([]int, 0, n+1) + for bytePos := range s { + offsets = append(offsets, bytePos) + } + offsets = append(offsets, len(s)) + return offsets +} + func (p *Parser) CommentSyntax() source.CommentSyntax { return source.CommentSyntax{ Dash: true,