Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 127 additions & 17 deletions packages/rules-sparql-1-1/lib/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,140 @@ import { TransformerSubTyped } from '@traqula/core';
import type { Sparql11Nodes } from './Sparql11types.js';

/**
* Transform input in accordance to [19.2](https://www.w3.org/TR/sparql11-query/#codepointEscape)
* and validate unicode codepoints.
* Apply codepoint escape substitution within a string literal or IRI ref chunk, and validate
* that no lone surrogate (from raw embedded chars) remains after substitution.
* Per SPARQL spec section 19.2, \uXXXX/\UXXXXXXXX escapes resolve to Unicode codepoints,
* and surrogate codepoints (U+D800–U+DFFF) are never legal as escaped values.
*/
export function sparqlCodepointEscape(input: string): string {
const sanitizedInput = input.replaceAll(
function processChunk(chunk: string): string {
const processed = chunk.replaceAll(
/\\u([0-9a-fA-F]{4})|\\U([0-9a-fA-F]{8})/gu,
(_, unicode4: string, unicode8: string) => {
if (unicode4) {
const charCode = Number.parseInt(unicode4, 16);
return String.fromCodePoint(charCode);
}
const charCode = Number.parseInt(unicode8, 16);
if (charCode < 0xFFFF) {
return String.fromCodePoint(charCode);
(_, u4: string, u8: string) => {
const charCode = Number.parseInt(u4 ?? u8, 16);
if (charCode >= 0xD800 && charCode <= 0xDFFF) {
throw new Error(`Invalid unicode codepoint of surrogate pair`);
}
const substractedCharCode = charCode - 0x10000;
return String.fromCodePoint(0xD800 + (substractedCharCode >> 10), 0xDC00 + (substractedCharCode & 0x3FF));
return String.fromCodePoint(charCode);
},
);
// Test for invalid unicode surrogate pairs
if (/[\uD800-\uDBFF](?:[^\uDC00-\uDFFF]|$)/u.test(sanitizedInput)) {
// Validate no lone high surrogate remains (from raw embedded surrogate chars)
if (/[\uD800-\uDBFF](?:[^\uDC00-\uDFFF]|$)/u.test(processed)) {
throw new Error(`Invalid unicode codepoint of surrogate pair without corresponding codepoint`);
}
return sanitizedInput;
return processed;
}

/**
 * Tells whether the character at index `pos` of `input` is excluded from the body of an
 * IRI reference by the SPARQL grammar production
 * IRIREF := '<' ([^<>"{}|^`\]-[#x00-#x20])* '>'.
 *
 * Every backslash is reported as invalid; a caller that wants to accept `\uXXXX` /
 * `\UXXXXXXXX` escapes inside an IRI has to recognise them before calling this function.
 *
 * @param input - The text being scanned.
 * @param pos - Index of the character to inspect.
 * @returns `true` for code points up to and including U+0020 and for any of
 *   `"`, `<`, `>`, `\`, `^`, `` ` ``, `{`, `|`, `}`; `false` otherwise, including when
 *   `pos` lies outside `input`.
 */
function isInvalidIriChar(input: string, pos: number): boolean {
  const codePoint = input.codePointAt(pos);
  if (codePoint === undefined) {
    // Nothing at this index, so there is no offending character to report.
    return false;
  }
  if (codePoint <= 0x20) {
    // Control characters and the space character.
    return true;
  }
  switch (codePoint) {
    case 0x22: // "
    case 0x3C: // <
    case 0x3E: // >
    case 0x5C: // \
    case 0x5E: // ^
    case 0x60: // `
    case 0x7B: // {
    case 0x7C: // |
    case 0x7D: // }
      return true;
    default:
      return false;
  }
}

/**
 * Transform input in accordance to [19.2](https://www.w3.org/TR/sparql11-query/#codepointEscape).
 *
 * The input is scanned once, left to right, and split into regions:
 * - `#` comments are copied verbatim up to the end of the line (`\n` or `\r`) or end of input;
 * - string literals (short or long, single- or double-quoted) are handed to `processChunk`;
 * - IRI references (`<...>`) are handed to `processChunk`;
 * - a backslash outside those regions is copied together with the character it escapes, so
 *   that local-name escapes such as `\#` and `\'` cannot be mistaken for a comment or a string;
 * - everything else is copied verbatim.
 *
 * @param input - The raw query text.
 * @returns The query text in which string-literal and IRI-reference chunks have been replaced
 *   by the result of `processChunk`; all other text is unchanged.
 * @throws Error when `\u` or `\U` appears outside a comment, string literal or IRI reference.
 *   Errors thrown by `processChunk` for the chunks handed to it propagate unchanged.
 */
export function sparqlCodepointEscape(input: string): string {
  let result = '';
  let i = 0;

  while (i < input.length) {
    const current = input[i];

    // Comments: copy through unchanged. The grammar ends a comment at #xA or #xD, so stop at
    // whichever comes first; the line terminator itself is copied by the default path below.
    if (current === '#') {
      let eol = i + 1;
      while (eol < input.length && input[eol] !== '\n' && input[eol] !== '\r') {
        eol++;
      }
      result += input.slice(i, eol);
      i = eol;
      continue;
    }

    // Long string literals — must be checked before short strings
    if (input.startsWith('"""', i) || input.startsWith('\'\'\'', i)) {
      const delim = input.startsWith('"""', i) ? '"""' : '\'\'\'';
      let end = i + 3;
      while (end < input.length) {
        if (input[end] === '\\') {
          // Skip the backslash and the escaped character so an escaped quote cannot close the
          // literal (for \uXXXX only the prefix is skipped; processChunk handles expansion)
          end += 2;
        } else if (input.startsWith(delim, end)) {
          end += 3;
          break;
        } else {
          end++;
        }
      }
      // `end` may overshoot the input after a trailing backslash; slice clamps it
      result += processChunk(input.slice(i, end));
      i = end;
      continue;
    }

    // Short string literals: end at the matching quote, a line break, or end of input
    if (current === '"' || current === '\'') {
      const delim = current;
      let end = i + 1;
      while (end < input.length && input[end] !== delim && input[end] !== '\n' && input[end] !== '\r') {
        if (input[end] === '\\') {
          // Skip escape sequence
          end += 2;
        } else {
          end++;
        }
      }
      if (end < input.length && input[end] === delim) {
        // Include the closing quote in the chunk
        end++;
      }
      result += processChunk(input.slice(i, end));
      i = end;
      continue;
    }

    // IRI references: '<' not followed by '<' (which is the SPARQL 1.2 '<<' triple-term delimiter)
    if (current === '<' && input[i + 1] !== '<') {
      // Validate IRI body characters to distinguish an IRI ref from a comparison operator.
      // Abort and treat '<' as a plain character if any invalid IRI char is found before '>'.
      let end = i + 1;
      let validIriRef = true;
      while (end < input.length && input[end] !== '>') {
        if (input[end] === '\\' && (input[end + 1] === 'u' || input[end + 1] === 'U')) {
          // Valid UCHAR prefix inside IRI; processChunk will expand it
          end += 2;
        } else if (isInvalidIriChar(input, end)) {
          validIriRef = false;
          break;
        } else {
          end++;
        }
      }
      if (validIriRef && end < input.length) {
        // Consume closing '>'
        end++;
        result += processChunk(input.slice(i, end));
        i = end;
        continue;
      }
      // Not a valid IRI ref (e.g. comparison operator, or no closing '>') — fall through
    }

    if (current === '\\') {
      const next = input[i + 1];
      // Codepoint escape outside an allowed context is an error
      if (next === 'u' || next === 'U') {
        throw new Error(`Codepoint escape not allowed outside of string literals or IRI references`);
      }
      // Any other backslash escapes the next character (e.g. PN_LOCAL_ESC `\#`, `\'`).
      // Copy both together so the escaped character is not re-scanned as a comment or
      // string opener. A trailing lone backslash is copied as-is.
      result += input.slice(i, i + 2);
      i += 2;
      continue;
    }

    result += current;
    i++;
  }

  return result;
}

/**
Expand Down
87 changes: 73 additions & 14 deletions packages/rules-sparql-1-1/test/utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,86 @@ import { describe, it } from 'vitest';
import { sparqlCodepointEscape } from '../lib/index.js';

describe('sparqlCodepointEscape', () => {
it('converts \\uXXXX escapes to unicode characters', ({ expect }) => {
expect(sparqlCodepointEscape('hello\\u0041world')).toBe('helloAworld');
expect(sparqlCodepointEscape('\\u0048\\u0069')).toBe('Hi');
});
describe('within IRI references', () => {
it('converts \\uXXXX escapes inside <...>', ({ expect }) => {
expect(sparqlCodepointEscape('<hello\\u0041world>')).toBe('<helloAworld>');
expect(sparqlCodepointEscape('<\\u0048\\u0069>')).toBe('<Hi>');
});

it('converts \\UXXXXXXXX escapes inside <...>', ({ expect }) => {
expect(sparqlCodepointEscape('<\\U00000041>')).toBe('<A>');
expect(sparqlCodepointEscape('<test\\U00000042end>')).toBe('<testBend>');
});

it('handles supplementary characters (above U+FFFF) inside <...>', ({ expect }) => {
// U+1F600 (😀)
expect(sparqlCodepointEscape('<\\U0001F600>')).toBe('<😀>');
});

it('throws on surrogate codepoints in \\uXXXX escape inside <...>', ({ expect }) => {
expect(() => sparqlCodepointEscape('<\\uD800>')).toThrowError(/surrogate/u);
expect(() => sparqlCodepointEscape('<\\uDFFF>')).toThrowError(/surrogate/u);
});

it('converts \\UXXXXXXXX escapes to unicode characters', ({ expect }) => {
expect(sparqlCodepointEscape('\\U00000041')).toBe('A');
expect(sparqlCodepointEscape('test\\U00000042end')).toBe('testBend');
it('throws on raw lone surrogate inside <...>', ({ expect }) => {
expect(() => sparqlCodepointEscape('<\uD800>')).toThrowError(/Invalid unicode codepoint/u);
});
});

it('handles characters above 0xFFFF (surrogate pairs)', ({ expect }) => {
// U+1F600 (😀) = 0x1F600 = 128512
expect(sparqlCodepointEscape('\\U0001F600')).toBe('😀');
describe('within string literals', () => {
it('converts \\uXXXX escapes inside double-quoted strings', ({ expect }) => {
expect(sparqlCodepointEscape('"\\u0041"')).toBe('"A"');
});

it('converts \\uXXXX escapes inside single-quoted strings', ({ expect }) => {
expect(sparqlCodepointEscape('\'\\u0041\'')).toBe('\'A\'');
});

it('converts \\uXXXX escapes inside long double-quoted strings', ({ expect }) => {
expect(sparqlCodepointEscape('"""\\u0041"""')).toBe('"""A"""');
});

it('converts \\uXXXX escapes inside long single-quoted strings', ({ expect }) => {
expect(sparqlCodepointEscape('\'\'\'\\u0041\'\'\'')).toBe('\'\'\'A\'\'\'');
});

it('throws on surrogate codepoints in \\uXXXX escape inside string', ({ expect }) => {
expect(() => sparqlCodepointEscape('"\\uD83C"')).toThrowError(/surrogate/u);
});

it('throws on raw lone high surrogate inside string', ({ expect }) => {
expect(() => sparqlCodepointEscape('"\uD800"')).toThrowError(/Invalid unicode codepoint/u);
});
});

it('throws on invalid unicode surrogate pairs', ({ expect }) => {
// A high surrogate (D800-DBFF) not followed by a low surrogate
expect(() => sparqlCodepointEscape('\uD800')).toThrowError(/Invalid unicode codepoint/u);
describe('outside string/IRI contexts', () => {
it('throws on \\uXXXX escape in SPARQL keyword position', ({ expect }) => {
expect(() => sparqlCodepointEscape('\\u0041SK {}')).toThrowError(/Codepoint escape not allowed/u);
});

it('throws on \\uXXXX escape in variable name', ({ expect }) => {
const query = 'SELECT * { ?a\\u0062c <:p> ?o }';
expect(() => sparqlCodepointEscape(query)).toThrowError(/Codepoint escape not allowed/u);
});

it('does not process \\uXXXX in # comments', ({ expect }) => {
// Comments pass through unchanged; no error thrown
expect(sparqlCodepointEscape('# \\u0041\nSELECT * {}')).toBe('# \\u0041\nSELECT * {}');
});

it('does not enter IRI mode for comparison operators', ({ expect }) => {
// '< ' (with space) is a comparison, not an IRI ref
expect(sparqlCodepointEscape('SELECT * { FILTER(?x < 5) }')).toBe('SELECT * { FILTER(?x < 5) }');
});
});

it('passes through normal strings unchanged', ({ expect }) => {
expect(sparqlCodepointEscape('hello world')).toBe('hello world');
expect(sparqlCodepointEscape('SELECT * WHERE { ?s ?p ?o }')).toBe('SELECT * WHERE { ?s ?p ?o }');
});

it('handles unterminated short string at end of input gracefully', ({ expect }) => {
// A string that is never closed (no closing quote before EOF)
expect(sparqlCodepointEscape('"abc')).toBe('"abc');
expect(sparqlCodepointEscape('\'abc')).toBe('\'abc');
});
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"type": "project",
"input": {
"type": "bgp",
"patterns": [
{
"type": "pattern",
"termType": "Quad",
"subject": {
"termType": "NamedNode",
"value": "http://example/abc"
},
"predicate": {
"termType": "Variable",
"value": "p"
},
"object": {
"termType": "Variable",
"value": "o"
},
"graph": {
"termType": "DefaultGraph",
"value": ""
}
}
]
},
"variables": [
{
"termType": "Variable",
"value": "o"
},
{
"termType": "Variable",
"value": "p"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"type": "project",
"input": {
"type": "bgp",
"patterns": [
{
"type": "pattern",
"termType": "Quad",
"subject": {
"termType": "NamedNode",
"value": "http://example/abc"
},
"predicate": {
"termType": "Variable",
"value": "p"
},
"object": {
"termType": "Literal",
"value": "abc",
"datatype": {
"termType": "NamedNode",
"value": "http://www.w3.org/2001/XMLSchema#string"
}
},
"graph": {
"termType": "DefaultGraph",
"value": ""
}
}
]
},
"variables": [
{
"termType": "Variable",
"value": "p"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"type": "project",
"input": {
"type": "bgp",
"patterns": [
{
"type": "pattern",
"termType": "Quad",
"subject": {
"termType": "NamedNode",
"value": "http://example/abc"
},
"predicate": {
"termType": "Variable",
"value": "p"
},
"object": {
"termType": "Literal",
"value": "abc",
"datatype": {
"termType": "NamedNode",
"value": "http://www.w3.org/2001/XMLSchema#string"
}
},
"graph": {
"termType": "DefaultGraph",
"value": ""
}
}
]
},
"variables": [
{
"termType": "Variable",
"value": "p"
}
]
}
Loading
Loading