comunica · jitsedesmet · Jun 2, 2026
diff --git a/packages/rules-sparql-1-1/lib/utils.ts b/packages/rules-sparql-1-1/lib/utils.ts
@@ -2,30 +2,140 @@ import { TransformerSubTyped } from '@traqula/core';
 import type { Sparql11Nodes } from './Sparql11types.js';
 
 /**
- * Transform input in accordance to [19.2](https://www.w3.org/TR/sparql11-query/#codepointEscape)
- * and validate unicode codepoints.
+ * Replace every \uXXXX / \UXXXXXXXX escape in a string-literal or IRI-ref chunk with its
+ * codepoint, throwing if an escape names a surrogate (U+D800–U+DFFF; see SPARQL section 19.2).
+ * Afterwards, throw if a raw high surrogate (U+D800–U+DBFF) is left without a following low
+ * surrogate; an unpaired low surrogate is not detected by this check.
  */
-export function sparqlCodepointEscape(input: string): string {
-  const sanitizedInput = input.replaceAll(
+function processChunk(chunk: string): string {
+  const processed = chunk.replaceAll(
     /\\u([0-9a-fA-F]{4})|\\U([0-9a-fA-F]{8})/gu,
-    (_, unicode4: string, unicode8: string) => {
-      if (unicode4) {
-        const charCode = Number.parseInt(unicode4, 16);
-        return String.fromCodePoint(charCode);
-      }
-      const charCode = Number.parseInt(unicode8, 16);
-      if (charCode < 0xFFFF) {
-        return String.fromCodePoint(charCode);
+    (_, u4: string, u8: string) => {
+      const charCode = Number.parseInt(u4 ?? u8, 16);
+      if (charCode >= 0xD800 && charCode <= 0xDFFF) {
+        throw new Error(`Invalid unicode codepoint of surrogate pair`);
       }
-      const substractedCharCode = charCode - 0x10000;
-      return String.fromCodePoint(0xD800 + (substractedCharCode >> 10), 0xDC00 + (substractedCharCode & 0x3FF));
+      return String.fromCodePoint(charCode); // throws RangeError for values above 0x10FFFF
     },
   );
-  // Test for invalid unicode surrogate pairs
-  if (/[\uD800-\uDBFF](?:[^\uDC00-\uDFFF]|$)/u.test(sanitizedInput)) {
+  // Validate no lone high surrogate remains (from raw embedded surrogate chars)
+  if (/[\uD800-\uDBFF](?:[^\uDC00-\uDFFF]|$)/u.test(processed)) {
     throw new Error(`Invalid unicode codepoint of surrogate pair without corresponding codepoint`);
   }
-  return sanitizedInput;
+  return processed;
+}
+
+/**
+ * Returns true when the character at position `pos` is not a legal IRI-ref body character
+ * per the SPARQL grammar production IRIREF := '<' ([^<>"{}|^`\]-[#x00-#x20])* '>'.
+ * Every '\\' is reported as invalid here; the caller must skip UCHAR prefixes (\u / \U) first.
+ */
+function isInvalidIriChar(input: string, pos: number): boolean {
+  const c = input.codePointAt(pos)!;
+  // Excluded from IRIREF body: control chars and space (#x00-#x20), " < > \ ^ ` { | }
+  return c <= 0x20 || c === 0x22 || c === 0x3C || c === 0x3E ||
+    c === 0x5C || c === 0x5E || c === 0x60 || c === 0x7B || c === 0x7C || c === 0x7D;
+}
+
+/**
+ * Transform input in accordance to [19.2](https://www.w3.org/TR/sparql11-query/#codepointEscape).
+ * Codepoint escapes (\uXXXX / \UXXXXXXXX) are only applied within IRI references and string
+ * literals; anywhere else except inside # comments they throw. Escaped surrogates are always
+ * rejected, as are raw high surrogates left unpaired inside string/IRI chunks.
+ */
+export function sparqlCodepointEscape(input: string): string {
+  let result = '';
+  let i = 0;
+
+  while (i < input.length) {
+    // Skip # comments (pass through to end of line unchanged)
+    if (input[i] === '#') {
+      const eol = input.indexOf('\n', i);
+      if (eol === -1) {
+        result += input.slice(i);
+        return result;
+      }
+      result += input.slice(i, eol + 1);
+      i = eol + 1;
+      continue;
+    }
+
+    // Long string literals — must be checked before short strings
+    if (input.startsWith('"""', i) || input.startsWith('\'\'\'', i)) {
+      const delim = input.startsWith('"""', i) ? '"""' : '\'\'\'';
+      let end = i + 3;
+      while (end < input.length) {
+        if (input[end] === '\\') {
+          // Skip escape sequence (incl. \uXXXX prefix; processChunk handles expansion)
+          end += 2;
+        } else if (input.startsWith(delim, end)) {
+          end += 3;
+          break;
+        } else {
+          end++;
+        }
+      }
+      result += processChunk(input.slice(i, end));
+      i = end;
+      continue;
+    }
+
+    // Short string literals (the scan stops at the closing quote, a line break, or end of input)
+    if (input[i] === '"' || input[i] === '\'') {
+      const delim = input[i];
+      let end = i + 1;
+      while (end < input.length && input[end] !== delim && input[end] !== '\n' && input[end] !== '\r') {
+        if (input[end] === '\\') {
+          // Skip the backslash and the escaped char so an escaped quote does not end the literal
+          end += 2;
+        } else {
+          end++;
+        }
+      }
+      if (end < input.length && input[end] === delim) {
+        end++;
+      }
+      result += processChunk(input.slice(i, end));
+      i = end;
+      continue;
+    }
+
+    // IRI references: '<' not followed by '<' (which is the SPARQL 1.2 '<<' triple-term delimiter)
+    if (input[i] === '<' && input[i + 1] !== '<') {
+      // Validate IRI body characters to distinguish an IRI ref from a comparison operator.
+      // Treat '<' as a plain character if an invalid IRI char or the end of input precedes '>'.
+      let end = i + 1;
+      let validIriRef = true;
+      while (end < input.length && input[end] !== '>') {
+        if (input[end] === '\\' && (input[end + 1] === 'u' || input[end + 1] === 'U')) {
+          // UCHAR prefix (hex digits are not checked here); processChunk expands well-formed ones
+          end += 2;
+        } else if (isInvalidIriChar(input, end)) {
+          validIriRef = false;
+          break;
+        } else {
+          end++;
+        }
+      }
+      if (validIriRef && end < input.length) {
+        // Consume closing '>'
+        end++;
+        result += processChunk(input.slice(i, end));
+        i = end;
+        continue;
+      }
+      // Not a valid IRI ref (e.g. comparison operator) — fall through
+    }
+
+    // Codepoint escape outside an allowed context is an error
+    if (input[i] === '\\' && (input[i + 1] === 'u' || input[i + 1] === 'U')) {
+      throw new Error(`Codepoint escape not allowed outside of string literals or IRI references`);
+    }
+
+    result += input[i++];
+  }
+
+  return result;
 }
 
 /**

diff --git a/packages/rules-sparql-1-1/test/utils.test.ts b/packages/rules-sparql-1-1/test/utils.test.ts
@@ -2,27 +2,86 @@ import { describe, it } from 'vitest';
 import { sparqlCodepointEscape } from '../lib/index.js';
 
 describe('sparqlCodepointEscape', () => {
-  it('converts \\uXXXX escapes to unicode characters', ({ expect }) => {
-    expect(sparqlCodepointEscape('hello\\u0041world')).toBe('helloAworld');
-    expect(sparqlCodepointEscape('\\u0048\\u0069')).toBe('Hi');
-  });
+  describe('within IRI references', () => {
+    it('converts \\uXXXX escapes inside <...>', ({ expect }) => {
+      expect(sparqlCodepointEscape('<hello\\u0041world>')).toBe('<helloAworld>');
+      expect(sparqlCodepointEscape('<\\u0048\\u0069>')).toBe('<Hi>');
+    });
+
+    it('converts \\UXXXXXXXX escapes inside <...>', ({ expect }) => {
+      expect(sparqlCodepointEscape('<\\U00000041>')).toBe('<A>');
+      expect(sparqlCodepointEscape('<test\\U00000042end>')).toBe('<testBend>');
+    });
+
+    it('handles supplementary characters (above U+FFFF) inside <...>', ({ expect }) => {
+      // U+1F600 (😀)
+      expect(sparqlCodepointEscape('<\\U0001F600>')).toBe('<😀>');
+    });
+
+    it('throws on surrogate codepoints in \\uXXXX escape inside <...>', ({ expect }) => {
+      expect(() => sparqlCodepointEscape('<\\uD800>')).toThrowError(/surrogate/u);
+      expect(() => sparqlCodepointEscape('<\\uDFFF>')).toThrowError(/surrogate/u);
+    });
 
-  it('converts \\UXXXXXXXX escapes to unicode characters', ({ expect }) => {
-    expect(sparqlCodepointEscape('\\U00000041')).toBe('A');
-    expect(sparqlCodepointEscape('test\\U00000042end')).toBe('testBend');
+    it('throws on raw lone surrogate inside <...>', ({ expect }) => {
+      expect(() => sparqlCodepointEscape('<\uD800>')).toThrowError(/Invalid unicode codepoint/u);
+    });
   });
 
-  it('handles characters above 0xFFFF (surrogate pairs)', ({ expect }) => {
-    // U+1F600 (😀) = 0x1F600 = 128512
-    expect(sparqlCodepointEscape('\\U0001F600')).toBe('😀');
+  describe('within string literals', () => {
+    it('converts \\uXXXX escapes inside double-quoted strings', ({ expect }) => {
+      expect(sparqlCodepointEscape('"\\u0041"')).toBe('"A"');
+    });
+
+    it('converts \\uXXXX escapes inside single-quoted strings', ({ expect }) => {
+      expect(sparqlCodepointEscape('\'\\u0041\'')).toBe('\'A\'');
+    });
+
+    it('converts \\uXXXX escapes inside long double-quoted strings', ({ expect }) => {
+      expect(sparqlCodepointEscape('"""\\u0041"""')).toBe('"""A"""');
+    });
+
+    it('converts \\uXXXX escapes inside long single-quoted strings', ({ expect }) => {
+      expect(sparqlCodepointEscape('\'\'\'\\u0041\'\'\'')).toBe('\'\'\'A\'\'\'');
+    });
+
+    it('throws on surrogate codepoints in \\uXXXX escape inside string', ({ expect }) => {
+      expect(() => sparqlCodepointEscape('"\\uD83C"')).toThrowError(/surrogate/u);
+    });
+
+    it('throws on raw lone high surrogate inside string', ({ expect }) => {
+      expect(() => sparqlCodepointEscape('"\uD800"')).toThrowError(/Invalid unicode codepoint/u);
+    });
   });
 
-  it('throws on invalid unicode surrogate pairs', ({ expect }) => {
-    // A high surrogate (D800-DBFF) not followed by a low surrogate
-    expect(() => sparqlCodepointEscape('\uD800')).toThrowError(/Invalid unicode codepoint/u);
+  describe('outside string/IRI contexts', () => {
+    it('throws on \\uXXXX escape in SPARQL keyword position', ({ expect }) => {
+      expect(() => sparqlCodepointEscape('\\u0041SK {}')).toThrowError(/Codepoint escape not allowed/u);
+    });
+
+    it('throws on \\uXXXX escape in variable name', ({ expect }) => {
+      const query = 'SELECT * { ?a\\u0062c <:p> ?o }';
+      expect(() => sparqlCodepointEscape(query)).toThrowError(/Codepoint escape not allowed/u);
+    });
+
+    it('does not process \\uXXXX in # comments', ({ expect }) => {
+      // Comments pass through unchanged; no error thrown
+      expect(sparqlCodepointEscape('# \\u0041\nSELECT * {}')).toBe('# \\u0041\nSELECT * {}');
+    });
+
+    it('does not enter IRI mode for comparison operators', ({ expect }) => {
+      // '<' followed by a space cannot open an IRI ref (space is excluded), so it stays a comparison
+      expect(sparqlCodepointEscape('SELECT * { FILTER(?x < 5) }')).toBe('SELECT * { FILTER(?x < 5) }');
+    });
   });
 
   it('passes through normal strings unchanged', ({ expect }) => {
-    expect(sparqlCodepointEscape('hello world')).toBe('hello world');
+    expect(sparqlCodepointEscape('SELECT * WHERE { ?s ?p ?o }')).toBe('SELECT * WHERE { ?s ?p ?o }');
+  });
+
+  it('handles unterminated short string at end of input gracefully', ({ expect }) => {
+    // No closing quote before EOF: the unterminated chunk is returned as-is, without throwing
+    expect(sparqlCodepointEscape('"abc')).toBe('"abc');
+    expect(sparqlCodepointEscape('\'abc')).toBe('\'abc');
   });
 });
diff --git a/packages/test-utils/statics/algebra/algebra-blank-to-var/sparql12/codepoint-esc-05.json b/packages/test-utils/statics/algebra/algebra-blank-to-var/sparql12/codepoint-esc-05.json
@@ -0,0 +1,38 @@
+{
+  "type": "project",
+  "input": {
+    "type": "bgp",
+    "patterns": [
+      {
+        "type": "pattern",
+        "termType": "Quad",
+        "subject": {
+          "termType": "NamedNode",
+          "value": "http://example/abc"
+        },
+        "predicate": {
+          "termType": "Variable",
+          "value": "p"
+        },
+        "object": {
+          "termType": "Variable",
+          "value": "o"
+        },
+        "graph": {
+          "termType": "DefaultGraph",
+          "value": ""
+        }
+      }
+    ]
+  },
+  "variables": [
+    {
+      "termType": "Variable",
+      "value": "o"
+    },
+    {
+      "termType": "Variable",
+      "value": "p"
+    }
+  ]
+}
diff --git a/packages/test-utils/statics/algebra/algebra-blank-to-var/sparql12/codepoint-esc-06.json b/packages/test-utils/statics/algebra/algebra-blank-to-var/sparql12/codepoint-esc-06.json
@@ -0,0 +1,38 @@
+{
+  "type": "project",
+  "input": {
+    "type": "bgp",
+    "patterns": [
+      {
+        "type": "pattern",
+        "termType": "Quad",
+        "subject": {
+          "termType": "NamedNode",
+          "value": "http://example/abc"
+        },
+        "predicate": {
+          "termType": "Variable",
+          "value": "p"
+        },
+        "object": {
+          "termType": "Literal",
+          "value": "abc",
+          "datatype": {
+            "termType": "NamedNode",
+            "value": "http://www.w3.org/2001/XMLSchema#string"
+          }
+        },
+        "graph": {
+          "termType": "DefaultGraph",
+          "value": ""
+        }
+      }
+    ]
+  },
+  "variables": [
+    {
+      "termType": "Variable",
+      "value": "p"
+    }
+  ]
+}
diff --git a/packages/test-utils/statics/algebra/algebra-blank-to-var/sparql12/codepoint-esc-07.json b/packages/test-utils/statics/algebra/algebra-blank-to-var/sparql12/codepoint-esc-07.json
@@ -0,0 +1,38 @@
+{
+  "type": "project",
+  "input": {
+    "type": "bgp",
+    "patterns": [
+      {
+        "type": "pattern",
+        "termType": "Quad",
+        "subject": {
+          "termType": "NamedNode",
+          "value": "http://example/abc"
+        },
+        "predicate": {
+          "termType": "Variable",
+          "value": "p"
+        },
+        "object": {
+          "termType": "Literal",
+          "value": "abc",
+          "datatype": {
+            "termType": "NamedNode",
+            "value": "http://www.w3.org/2001/XMLSchema#string"
+          }
+        },
+        "graph": {
+          "termType": "DefaultGraph",
+          "value": ""
+        }
+      }
+    ]
+  },
+  "variables": [
+    {
+      "termType": "Variable",
+      "value": "p"
+    }
+  ]
+}