From 2f430c3589572b943fddd940a2df5ead7b700d82 Mon Sep 17 00:00:00 2001 From: Richard Gibson Date: Tue, 16 Jun 2026 02:40:48 -0400 Subject: [PATCH 1/4] When formatting, replace inline ASCII quotes with typographic quotes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ref https://github.com/tc39/ecma262/pull/3861#pullrequestreview-4412001453 Ref #173 Ref #317 Note that this processing cannot be scoped to individual text nodes because e.g. `a "binary64 value"` should get rewritten into `a “binary64 value”` (but the same would not be true if the medial element were block-level rather than inline). ASCII quotes are not replaced inside of HTML comments, `<code>`/`<emu-val>` elements, backtick spans (e.g., ``` `code` ```), asterisk spans (e.g., `*"string"*`), or after equals signs (as in HTML element attributes). --- src/formatter/ecmarkup.ts | 100 +++++++++++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 8 deletions(-) diff --git a/src/formatter/ecmarkup.ts b/src/formatter/ecmarkup.ts index 7622ee37..43e62298 100644 --- a/src/formatter/ecmarkup.ts +++ b/src/formatter/ecmarkup.ts @@ -15,6 +15,7 @@ const RAW_CONTENT_ELEMENTS = new Set([ 'script', 'style', 'code', + 'emu-val', ]); // https://html.spec.whatwg.org/multipage/syntax.html#void-elements @@ -84,7 +85,7 @@ export async function printDocument(src: string): Promise { output.appendLine(``); for (const comment of leadingComments) { - output.append(await printChildNodes(src, [comment], false, false, 0)); + output.append(await printChildNodes(src, [comment], 'block', false, false, 0)); output.linebreak(); } @@ -119,8 +120,8 @@ export async function printDocument(src: string): Promise { output.append(await printElement(src, head, 0)); output.append(await printElement(src, body, 0)); } else { - output.append(await printChildNodes(src, head.childNodes, false, false, 0)); - output.append(await printChildNodes(src, body.childNodes, false, false, 0)); + output.append(await 
printChildNodes(src, head.childNodes, 'block', false, false, 0)); + output.append(await printChildNodes(src, body.childNodes, 'block', false, false, 0)); } while (output.lines[0] === '') { output.lines.shift(); @@ -166,7 +167,7 @@ export async function printElement( if (PARAGRAPH_LIKE_ELEMENTS.has(node.tagName)) { output.firstLineIsPartial = false; output.appendText(printStartTag(node)); - const body = await printChildNodes(src, childNodes, false, false, indent + 1); + const body = await printChildNodes(src, childNodes, 'block', false, false, indent + 1); body.trim(); if (body.lines.length > 1) { output.linebreak(); @@ -288,7 +289,14 @@ export async function printElement( const type = node.attrs.find(a => a.name === 'type')?.value ?? null; const printedHeader = printHeader(parseResult, type, indent + 2); output.append( - await printChildNodes(src, childNodes.slice(0, maybeH1Index), true, true, indent + 1), + await printChildNodes( + src, + childNodes.slice(0, maybeH1Index), + 'block', + true, + true, + indent + 1, + ), ); if (output.last !== '') { output.linebreak(); @@ -301,7 +309,9 @@ export async function printElement( dropLeadingLinebreaks = false; } } - output.append(await printChildNodes(src, childNodes, dropLeadingLinebreaks, true, indent + 1)); + output.append( + await printChildNodes(src, childNodes, 'block', dropLeadingLinebreaks, true, indent + 1), + ); --output.indent; output.appendLine(``); @@ -360,13 +370,14 @@ export async function printElement( if (block) { output.appendLine(printStartTag(node)); ++output.indent; - output.append(await printChildNodes(src, childNodes, true, true, indent + 1)); + output.append(await printChildNodes(src, childNodes, 'block', true, true, indent + 1)); --output.indent; output.appendLine(``); } else { output.appendText(printStartTag(node)); ++output.indent; - output.append(await printChildNodes(src, childNodes, false, true, indent + 1)); + const flowContext = node.tagName === 'emu-note' ? 
'block' : 'inline'; + output.append(await printChildNodes(src, childNodes, flowContext, false, true, indent + 1)); --output.indent; const trailingSpace = output.last.endsWith(' '); if (trailingSpace) { @@ -383,12 +394,14 @@ export async function printElement( async function printChildNodes( src: string, nodes: Node[], + flowContext: 'block' | 'inline', dropLeadingLinebreaks: boolean, dropTrailingLinebreaks: boolean, indent: number, ): Promise { const output = new LineBuilder(indent); let skipNextElement = false; + let inlineRunFirstLine = 0; for (let i = 0; i < nodes.length; ++i) { const node = nodes[i]; if (node.nodeName === '#comment') { @@ -464,14 +477,85 @@ async function printChildNodes( } } } else { + const inlineRunEnded = flowContext === 'block' && isBlockElement(ele); + if (inlineRunEnded) { + fixAsciiQuotes(output.lines, inlineRunFirstLine); + } output.append(await printElement(src, ele, indent)); + if (inlineRunEnded) { + inlineRunFirstLine = output.lines.length; + } } } } + if (flowContext === 'block') { + fixAsciiQuotes(output.lines, inlineRunFirstLine); + } return output; } +// this regular expression is not perfect, but generally follows +// https://html.spec.whatwg.org/multipage/parsing.html#tokenization +// (and spec source text tends to avoid the sort of edge cases that would +// reveal its flaws) +const rHtmlTag = (() => { + const SPACE_CHAR = '[\\t\\n\\f ]'; + const TOKEN_CHAR = '[^\\t\\n\\f />]'; + const ATTR = `(?=${TOKEN_CHAR}|=)${TOKEN_CHAR}*(?:=${SPACE_CHAR}*(?:"[^"]*"|'[^']*'|(?!"|')${TOKEN_CHAR}*)?)?`; + return new RegExp( + String.raw`(`, + 'gi', + ); +})(); + +const rMaybeAsciiQuoted = new RegExp( + String.raw`${'`'}(?:[^${'`'}\\]|\\.)*${'`'}|<(${[...RAW_CONTENT_ELEMENTS].join('|')})\b[^>]*>.*?]*>|=".*?"|\*".*?"\*|"(.*?)"`, + 'gi', +); + +function fixAsciiQuotes(lines: string[], i: number = 0) { + for (let inComment = false; i < lines.length; i++) { + let line = lines[i]; + + // handle multi-line comments + const preservedPrefix = 
inComment ? line.match(/^.*?-->/)?.[0] || line : ''; + if (preservedPrefix) { + inComment = false; + line = line.substring(preservedPrefix.length); + } else if (inComment) { + continue; + } + + // replace escapes/comments and tags with placeholders (in that order) + const placeholders: string[] = []; + line = line.replace(/&\d|/g, c => `&${placeholders.push(c) - 1};`); + line = line.replace(rHtmlTag, (tag, prefix, name, attrs) => { + if (RAW_CONTENT_ELEMENTS.has(name)) { + // keep the tag name but replace any attributes with a placeholder + return attrs ? `${prefix}&${placeholders.push(tag.slice(prefix.length, -1)) - 1};>` : tag; + } + return `&${placeholders.push(tag) - 1};`; + }); + + const preservedSuffix = line.match(/ + and quotes can "follow comments" + + + 1. [x="a"] Assert: Quotes are also "detected" in "algorithm steps". + + `, + dedentKeepingTrailingNewline` +
+ “quotes” can span “inline elements” + + and quotes can “follow comments” +
+ + 1. [x="a"] Assert: Quotes are also “detected” in “algorithm steps”. + + `, + ); + }); + + it('preserves ASCII quotes that are code', async () => { + await assertRoundTrips( + `

ASCII quotes are not replaced in "code elements", "emu-val elements", ${'`'}"backtick spans"${'`'}, or *"inline language strings"*.

\n`, + ); + }); }); describe('grammar formatting', () => {