From 98f807193d4f58c189d1ae9c647e75d8fb6a8645 Mon Sep 17 00:00:00 2001 From: Stephen Waits Date: Thu, 21 May 2026 19:18:41 -0600 Subject: [PATCH] fix(scanner): avoid GCC -O2 strict-aliasing miscompile in scan_start_tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parsing any document containing an HTML tag (even `<br>`) segfaults when the scanner is compiled with GCC at -O2 — the default for `tree-sitter build` and for the parsers that editors such as Neovim ship. Debug (-O0) builds are unaffected, so test suites miss it. scan_start_tag() appends to the tag stack with `array_push(&state->html->tags, tag)`. array_push() (tree-sitter array.h) reallocates the backing buffer through the generic `Array *` type, then writes the new element back through the concrete `Array(Tag) *` type. Because the array is reached through the `state->html` pointer, GCC's -O2 strict-aliasing analysis treats the realloc's `contents` store and the element store as non-aliasing, elides the reload of `contents`, and writes the element through a stale NULL pointer. Do the grow + append explicitly, keeping every access on the concrete `Array(Tag)` type so there is no `Array *` type-pun for the optimizer to break. Applied to the canonical tree-sitter-htmlx scanner and its vendored copy in tree-sitter-svelte. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/tree-sitter-htmlx/src/scanner.c | 24 ++++++++++++++++++- crates/tree-sitter-svelte/src/htmlx/scanner.c | 24 ++++++++++++++++++- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/crates/tree-sitter-htmlx/src/scanner.c b/crates/tree-sitter-htmlx/src/scanner.c index fde9732..dfae624 100644 --- a/crates/tree-sitter-htmlx/src/scanner.c +++ b/crates/tree-sitter-htmlx/src/scanner.c @@ -345,7 +345,29 @@ static bool scan_start_tag(State *state, TSLexer *lexer, const bool *valid) { lexer->mark_end(lexer); } Tag tag = htmlx_tag_for_svelte_name(name, saw_ascii_upper); - array_push(&state->html->tags, tag); + + // Append onto the HTML scanner's tag stack. + // + // This is done explicitly rather than with + // `array_push(&state->html->tags, tag)`. array_push() reallocates the + // backing buffer through array.h's generic `Array *` type, then writes + // the new element back through the concrete `Array(Tag) *` type. 
When + // the array is reached through the `state->html` pointer, GCC's + // strict-aliasing analysis at -O2 treats the realloc store and the + // element store as non-aliasing and elides the reload of `contents` — + // leaving a stale NULL pointer and segfaulting on the first tag of + // every document. Keeping every access on the concrete `Array(Tag)` + // type avoids the miscompile. + Scanner *html = state->html; + if (html->tags.size + 1 > html->tags.capacity) { + uint32_t new_capacity = + html->tags.capacity ? html->tags.capacity * 2 : 8; + html->tags.contents = (Tag *)ts_realloc( + html->tags.contents, (size_t)new_capacity * sizeof(Tag)); + html->tags.capacity = new_capacity; + } + html->tags.contents[html->tags.size++] = tag; + state->open_tag_is_namespaced = false; switch (tag.type) { diff --git a/crates/tree-sitter-svelte/src/htmlx/scanner.c b/crates/tree-sitter-svelte/src/htmlx/scanner.c index 6885b39..0ecd353 100644 --- a/crates/tree-sitter-svelte/src/htmlx/scanner.c +++ b/crates/tree-sitter-svelte/src/htmlx/scanner.c @@ -349,7 +349,29 @@ static bool scan_start_tag(State *state, TSLexer *lexer, const bool *valid) { lexer->mark_end(lexer); } Tag tag = htmlx_tag_for_svelte_name(name, saw_ascii_upper); - array_push(&state->html->tags, tag); + + // Append onto the HTML scanner's tag stack. + // + // This is done explicitly rather than with + // `array_push(&state->html->tags, tag)`. array_push() reallocates the + // backing buffer through array.h's generic `Array *` type, then writes + // the new element back through the concrete `Array(Tag) *` type. When + // the array is reached through the `state->html` pointer, GCC's + // strict-aliasing analysis at -O2 treats the realloc store and the + // element store as non-aliasing and elides the reload of `contents` — + // leaving a stale NULL pointer and segfaulting on the first tag of + // every document. Keeping every access on the concrete `Array(Tag)` + // type avoids the miscompile. 
+ Scanner *html = state->html; + if (html->tags.size + 1 > html->tags.capacity) { + uint32_t new_capacity = + html->tags.capacity ? html->tags.capacity * 2 : 8; + html->tags.contents = (Tag *)ts_realloc( + html->tags.contents, (size_t)new_capacity * sizeof(Tag)); + html->tags.capacity = new_capacity; + } + html->tags.contents[html->tags.size++] = tag; + state->open_tag_is_namespaced = false; switch (tag.type) {