diff --git a/.gitignore b/.gitignore index 7e091d77..57920331 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ build/ # Generated runtime artifacts src/main/cod/src/bin/ src/main/cod/src/idx/ +src/main/cod/demo/src/bin/ +src/main/cod/demo/src/idx/ diff --git a/src/main/cod/demo/src/main/test/unicode/UnicodeStandardLibraryComprehensive.cod b/src/main/cod/demo/src/main/test/unicode/UnicodeStandardLibraryComprehensive.cod deleted file mode 100644 index a4315154..00000000 --- a/src/main/cod/demo/src/main/test/unicode/UnicodeStandardLibraryComprehensive.cod +++ /dev/null @@ -1,42 +0,0 @@ -unit test.unicode - -use {unicode} - -share UnicodeStandardLibraryComprehensive { - share check(label: text, actual: text, expected: text) { - if actual == expected { - out("PASS " + label) - return - } - out("FAIL " + label) - out(" actual: " + actual) - out(" expected: " + expected) - } - - share checkBool(label: text, actual: bool, expected: bool) { - if actual == expected { - out("PASS " + label) - return - } - out("FAIL " + label) - out(" actual: " + actual) - out(" expected: " + expected) - } - - share main() { - out("== unicode standard library comprehensive ==") - - UnicodeStandardLibraryComprehensive.check("normalize simple", Unicode.normalizeEscaped("\\u0041"), "\\u0041") - UnicodeStandardLibraryComprehensive.check("normalize lowercase", Unicode.normalizeEscaped("\\u03c9"), "\\u03C9") - UnicodeStandardLibraryComprehensive.check("normalize surrogate pair", Unicode.normalizeEscaped("\\ud83d\\ude00"), "\\uD83D\\uDE00") - - UnicodeStandardLibraryComprehensive.checkBool("valid empty", Unicode.isValidEscaped(""), true) - UnicodeStandardLibraryComprehensive.checkBool("valid basic", Unicode.isValidEscaped("\\u0041"), true) - UnicodeStandardLibraryComprehensive.checkBool("valid surrogate", Unicode.isValidEscaped("\\uD83D\\uDE00"), true) - UnicodeStandardLibraryComprehensive.checkBool("invalid bad hex", Unicode.isValidEscaped("\\u00G1"), false) - UnicodeStandardLibraryComprehensive.checkBool("invalid lone high", Unicode.isValidEscaped("\\uD83D"), false) - UnicodeStandardLibraryComprehensive.checkBool("invalid lone low", Unicode.isValidEscaped("\\uDE00"), false) - - out("== done ==") - } -} diff --git a/src/main/cod/std/json/Json.cod b/src/main/cod/std/json/Json.cod index c6fa9ccc..c0c1c09c 100644 --- a/src/main/cod/std/json/Json.cod +++ b/src/main/cod/std/json/Json.cod @@ -1,7 +1,5 @@ unit json -use {unicode} - // JSON standard library for Coderive. // Supports parsing, mutation, and compact/pretty serialization of JSON values. share JsonValue { @@ -303,12 +301,82 @@ share Json { if start + 5 >= raw.length { ~> (false) } if raw[start] != "\\" { ~> (false) } if raw[start + 1] != "u" { ~> (false) } - if !Unicode.isHexDigit(raw[start + 2]) { ~> (false) } - if !Unicode.isHexDigit(raw[start + 3]) { ~> (false) } - if !Unicode.isHexDigit(raw[start + 4]) { ~> (false) } - if !Unicode.isHexDigit(raw[start + 5]) { ~> (false) } + if !Json.isHexDigit(raw[start + 2]) { ~> (false) } + if !Json.isHexDigit(raw[start + 3]) { ~> (false) } + if !Json.isHexDigit(raw[start + 4]) { ~> (false) } + if !Json.isHexDigit(raw[start + 5]) { ~> (false) } ~> (true) } + + share isDigit(ch: text) :: bool { + ~> (any[ch == "0", ch == "1", ch == "2", ch == "3", ch == "4", ch == "5", ch == "6", ch == "7", ch == "8", ch == "9"]) + } + + share isHexDigit(ch: text) :: bool { + ~> (any[ + Json.isDigit(ch), + any[ch == "a", ch == "b", ch == "c", ch == "d", ch == "e", ch == "f"], + any[ch == "A", ch == "B", ch == "C", ch == "D", ch == "E", ch == "F"] + ]) + } + + share hexValue(ch: text) :: int { + if ch == "0" { ~> (0) } + if ch == "1" { ~> (1) } + if ch == "2" { ~> (2) } + if ch == "3" { ~> (3) } + if ch == "4" { ~> (4) } + if ch == "5" { ~> (5) } + if ch == "6" { ~> (6) } + if ch == "7" { ~> (7) } + if ch == "8" { ~> (8) } + if ch == "9" { ~> (9) } + if any[ch == "a", ch == "A"] { ~> (10) } + if any[ch == "b", ch == "B"] { ~> (11) } + if any[ch == "c", ch == "C"] { ~> (12) } + if any[ch == "d", ch == "D"] { ~> (13) } + if any[ch == "e", ch == "E"] { ~> (14) } + if any[ch == "f", ch == "F"] { ~> (15) } + ~> (-1) + } + + share hexDigit(value: int) :: text { + if value == 0 { ~> ("0") } + if value == 1 { ~> ("1") } + if value == 2 { ~> ("2") } + if value == 3 { ~> ("3") } + if value == 4 { ~> ("4") } + if value == 5 { ~> ("5") } + if value == 6 { ~> ("6") } + if value == 7 { ~> ("7") } + if value == 8 { ~> ("8") } + if value == 9 { ~> ("9") } + if value == 10 { ~> ("A") } + if value == 11 { ~> ("B") } + if value == 12 { ~> ("C") } + if value == 13 { ~> ("D") } + if value == 14 { ~> ("E") } + ~> ("F") + } + + share hex4(value: int) :: text { + x: int = value + if x < 0 { x = 0 } + if x > 65535 { x = x % 65536 } + d0 := (x / 4096) % 16 + d1 := (x / 256) % 16 + d2 := (x / 16) % 16 + d3 := x % 16 + ~> (Json.hexDigit(d0) + Json.hexDigit(d1) + Json.hexDigit(d2) + Json.hexDigit(d3)) + } + + share isHighSurrogate(unit: int) :: bool { + ~> (all[unit >= 55296, unit <= 56319]) + } + + share isLowSurrogate(unit: int) :: bool { + ~> (all[unit >= 56320, unit <= 57343]) + } share indent(depth: int) :: text { textValue := "" @@ -359,7 +427,7 @@ share JsonParser { if ch == "\"" { ~> (this.parseText()) } if ch == "[" { ~> (this.parseArray()) } if ch == "{" { ~> (this.parseObject()) } - if any[ch == "-", Unicode.isDigit(ch)] { ~> (this.parseNumber()) } + if any[ch == "-", Json.isDigit(ch)] { ~> (this.parseNumber()) } ~> (JsonValue.makeError("invalid json value at index " + this.index)) } @@ -431,9 +499,9 @@ share JsonParser { ~> (JsonValue.makeError("invalid unicode escape at index " + this.index)) } this.index = this.index + 4 - firstText := "\\u" + Unicode.hex4(firstUnit) + firstText := "\\u" + Json.hex4(firstUnit) - if Unicode.isHighSurrogate(firstUnit) { + if Json.isHighSurrogate(firstUnit) { if this.index + 5 >= this.source.length { ~> (JsonValue.makeError("missing low surrogate at index " + this.index)) } @@ -442,15 +510,15 @@ share JsonParser { } this.index = this.index + 2 secondUnit := this.parseHex4At(this.index) - if any[secondUnit < 0, !Unicode.isLowSurrogate(secondUnit)] { + if any[secondUnit < 0, !Json.isLowSurrogate(secondUnit)] { ~> (JsonValue.makeError("invalid low surrogate at index " + this.index)) } this.index = this.index + 4 - outText = outText + firstText + "\\u" + Unicode.hex4(secondUnit) + outText = outText + firstText + "\\u" + Json.hex4(secondUnit) continue } - if Unicode.isLowSurrogate(firstUnit) { + if Json.isLowSurrogate(firstUnit) { ~> (JsonValue.makeError("unexpected low surrogate at index " + (this.index - 4))) } @@ -480,24 +548,24 @@ share JsonParser { if this.source[this.index] == "0" { this.index = this.index + 1 } else { - if !Unicode.isDigit(this.source[this.index]) { + if !Json.isDigit(this.source[this.index]) { ~> (JsonValue.makeError("invalid number at index " + start)) } for i of this.index to this.source.length { if this.index >= this.source.length { break } - if !Unicode.isDigit(this.source[this.index]) { break } + if !Json.isDigit(this.source[this.index]) { break } this.index = this.index + 1 } } if all[this.index < this.source.length, this.source[this.index] == "."] { this.index = this.index + 1 - if any[this.index >= this.source.length, !Unicode.isDigit(this.source[this.index])] { + if any[this.index >= this.source.length, !Json.isDigit(this.source[this.index])] { ~> (JsonValue.makeError("invalid number fraction at index " + start)) } for i of this.index to this.source.length { if this.index >= this.source.length { break } - if !Unicode.isDigit(this.source[this.index]) { break } + if !Json.isDigit(this.source[this.index]) { break } this.index = this.index + 1 } } @@ -507,12 +575,12 @@ share JsonParser { if all[this.index < this.source.length, any[this.source[this.index] == "+", this.source[this.index] == "-"]] { this.index = this.index + 1 } - if any[this.index >= this.source.length, !Unicode.isDigit(this.source[this.index])] { + if any[this.index >= this.source.length, !Json.isDigit(this.source[this.index])] { ~> (JsonValue.makeError("invalid number exponent at index " + start)) } for i of this.index to this.source.length { if this.index >= this.source.length { break } - if !Unicode.isDigit(this.source[this.index]) { break } + if !Json.isDigit(this.source[this.index]) { break } this.index = this.index + 1 } } @@ -648,7 +716,7 @@ share JsonParser { if start + 3 >= this.source.length { ~> (-1) } value: int = 0 for j of 0 to 3 { - digit := Unicode.hexValue(this.source[start + j]) + digit := Json.hexValue(this.source[start + j]) if digit < 0 { ~> (-1) } value = value * 16 + digit } diff --git a/src/main/cod/std/unicode/Unicode.cod b/src/main/cod/std/unicode/Unicode.cod deleted file mode 100644 index f9f92a31..00000000 --- a/src/main/cod/std/unicode/Unicode.cod +++ /dev/null @@ -1,132 +0,0 @@ -unit unicode - -share Unicode { - share isDigit(ch: text) :: bool { - ~> (any[ch == "0", ch == "1", ch == "2", ch == "3", ch == "4", ch == "5", ch == "6", ch == "7", ch == "8", ch == "9"]) - } - - share isHexDigit(ch: text) :: bool { - ~> (any[ - Unicode.isDigit(ch), - any[ch == "a", ch == "b", ch == "c", ch == "d", ch == "e", ch == "f"], - any[ch == "A", ch == "B", ch == "C", ch == "D", ch == "E", ch == "F"] - ]) - } - - share normalizeEscaped(raw: text) :: text { - outText := "" - idx: int = 0 - for i of 0 to raw.length { - if idx >= raw.length { break } - ch := raw[idx] - if ch != "\\" { - outText = outText + ch - idx = idx + 1 - continue - } - - if idx + 1 >= raw.length { - ~> ("") - } - - if raw[idx + 1] != "u" { - outText = outText + ch + raw[idx + 1] - idx = idx + 2 - continue - } - - if idx + 5 >= raw.length { ~> ("") } - first := Unicode.parseHex4(raw, idx + 2) - if first < 0 { ~> ("") } - firstText := "\\u" + Unicode.hex4(first) - idx = idx + 6 - - if Unicode.isHighSurrogate(first) { - if idx + 5 >= raw.length { ~> ("") } - if any[raw[idx] != "\\", raw[idx + 1] != "u"] { ~> ("") } - second := Unicode.parseHex4(raw, idx + 2) - if any[second < 0, !Unicode.isLowSurrogate(second)] { ~> ("") } - outText = outText + firstText + "\\u" + Unicode.hex4(second) - idx = idx + 6 - continue - } - - if Unicode.isLowSurrogate(first) { ~> ("") } - outText = outText + firstText - } - ~> (outText) - } - - share isValidEscaped(raw: text) :: bool { - ~> (!Unicode.normalizeEscaped(raw).isEmpty() || raw.isEmpty()) - } - - share parseHex4(raw: text, start: int) :: int { - if start + 3 >= raw.length { ~> (-1) } - value: int = 0 - for j of 0 to 3 { - digit := Unicode.hexValue(raw[start + j]) - if digit < 0 { ~> (-1) } - value = value * 16 + digit - } - ~> (value) - } - - share hexValue(ch: text) :: int { - if ch == "0" { ~> (0) } - if ch == "1" { ~> (1) } - if ch == "2" { ~> (2) } - if ch == "3" { ~> (3) } - if ch == "4" { ~> (4) } - if ch == "5" { ~> (5) } - if ch == "6" { ~> (6) } - if ch == "7" { ~> (7) } - if ch == "8" { ~> (8) } - if ch == "9" { ~> (9) } - if any[ch == "a", ch == "A"] { ~> (10) } - if any[ch == "b", ch == "B"] { ~> (11) } - if any[ch == "c", ch == "C"] { ~> (12) } - if any[ch == "d", ch == "D"] { ~> (13) } - if any[ch == "e", ch == "E"] { ~> (14) } - if any[ch == "f", ch == "F"] { ~> (15) } - ~> (-1) - } - - share hex4(value: int) :: text { - x: int = value - if x < 0 { x = 0 } - if x > 65535 { x = x % 65536 } - d0 := (x / 4096) % 16 - d1 := (x / 256) % 16 - d2 := (x / 16) % 16 - d3 := x % 16 - ~> (Unicode.hexDigit(d0) + Unicode.hexDigit(d1) + Unicode.hexDigit(d2) + Unicode.hexDigit(d3)) - } - - share hexDigit(value: int) :: text { - if value == 0 { ~> ("0") } - if value == 1 { ~> ("1") } - if value == 2 { ~> ("2") } - if value == 3 { ~> ("3") } - if value == 4 { ~> ("4") } - if value == 5 { ~> ("5") } - if value == 6 { ~> ("6") } - if value == 7 { ~> ("7") } - if value == 8 { ~> ("8") } - if value == 9 { ~> ("9") } - if value == 10 { ~> ("A") } - if value == 11 { ~> ("B") } - if value == 12 { ~> ("C") } - if value == 13 { ~> ("D") } - if value == 14 { ~> ("E") } - ~> ("F") - } - - share isHighSurrogate(unit: int) :: bool { - ~> (all[unit >= 55296, unit <= 56319]) - } - - share isLowSurrogate(unit: int) :: bool { - ~> (all[unit >= 56320, unit <= 57343]) - } -} diff --git a/src/main/java/cod/lexer/StringLexer.java b/src/main/java/cod/lexer/StringLexer.java index 555ebb4b..7fea5d5f 100644 --- a/src/main/java/cod/lexer/StringLexer.java +++ b/src/main/java/cod/lexer/StringLexer.java @@ -6,11 +6,85 @@ public class StringLexer { private final MainLexer lexer; private final List extractedStrings; + + private static final class UnicodeEscapeResult { + private final String text; + private final int consumedChars; + + private UnicodeEscapeResult(String text, int consumedChars) { + this.text = text; + this.consumedChars = consumedChars; + } + } public StringLexer(MainLexer lexer) { this.lexer = lexer; this.extractedStrings = new ArrayList(); } + + private boolean isHexDigit(char c) { + return (c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F'); + } + + private int hexValue(char c) { + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + return -1; + } + + private int readUnicodeUnit() { + if (lexer.getPosition() + 4 > lexer.getInput().length) { + throw new RuntimeException("Syntax Error: Incomplete Unicode escape at line " + lexer.line); + } + + int value = 0; + for (int i = 0; i < 4; i++) { + char digit = lexer.peek(); + if (!isHexDigit(digit)) { + throw new RuntimeException("Syntax Error: Invalid Unicode escape at line " + lexer.line); + } + lexer.consume(); + value = (value << 4) + hexValue(digit); + } + return value; + } + + private UnicodeEscapeResult decodeUnicodeEscape() { + int consumed = 0; + int firstUnit = readUnicodeUnit(); + consumed += 4; + + if (Character.isLowSurrogate((char)firstUnit)) { + throw new RuntimeException("Syntax Error: Unexpected low surrogate in Unicode escape at line " + lexer.line); + } + + if (Character.isHighSurrogate((char)firstUnit)) { + if (lexer.getPosition() + 5 >= lexer.getInput().length) { + throw new RuntimeException("Syntax Error: Missing low surrogate in Unicode escape at line " + lexer.line); + } + if (lexer.peek() != '\\' || lexer.peek(1) != 'u') { + throw new RuntimeException("Syntax Error: Expected low surrogate escape at line " + lexer.line); + } + + lexer.consume(); + lexer.consume(); + consumed += 2; + + int secondUnit = readUnicodeUnit(); + consumed += 4; + + if (!Character.isLowSurrogate((char)secondUnit)) { + throw new RuntimeException("Syntax Error: Invalid low surrogate in Unicode escape at line " + lexer.line); + } + + return new UnicodeEscapeResult(new String(new char[] {(char)firstUnit, (char)secondUnit}), consumed); + } + + return new UnicodeEscapeResult(String.valueOf((char)firstUnit), consumed); + } public Token scan() { if (lexer.peek() == '|' && lexer.peek(1) == '"') { @@ -98,20 +172,27 @@ private Token readText() { char escaped = lexer.consume(); length++; - // Convert escape sequence to actual character - char actualChar; - switch (escaped) { - case 'n': actualChar = '\n'; break; - case 't': actualChar = '\t'; break; - case 'r': actualChar = '\r'; break; - case '\\': actualChar = '\\'; break; - case '"': actualChar = '"'; break; - case '{': actualChar = '{'; break; - default: actualChar = escaped; break; + // Convert escape sequence to actual character(s) + String escapedStr; + if (escaped == 'u') { + UnicodeEscapeResult unicodeResult = decodeUnicodeEscape(); + escapedStr = unicodeResult.text; + length += unicodeResult.consumedChars; + } else { + char actualChar; + switch (escaped) { + case 'n': actualChar = '\n'; break; + case 't': actualChar = '\t'; break; + case 'r': actualChar = '\r'; break; + case '\\': actualChar = '\\'; break; + case '"': actualChar = '"'; break; + case '{': actualChar = '{'; break; + default: actualChar = escaped; break; + } + escapedStr = String.valueOf(actualChar); } - // Add escaped character as a text literal (already without quotes) - String escapedStr = String.valueOf(actualChar); + // Add escaped character(s) as a text literal (already without quotes) Token escapedToken = Token.createTextLiteral(escapedStr, lexer.line, lexer.column - 1); parts.add(escapedToken); childTokens.add(escapedToken); @@ -406,17 +487,24 @@ private Token readMultilineText() { currentColumnInLine++; // Convert escape sequences to actual characters - char actualChar; - switch (escaped) { - case 'n': actualChar = '\n'; break; - case 't': actualChar = '\t'; break; - case 'r': actualChar = '\r'; break; - case '\\': actualChar = '\\'; break; - case '"': actualChar = '"'; break; - case '{': actualChar = '{'; break; - default: actualChar = escaped; break; + if (escaped == 'u') { + UnicodeEscapeResult unicodeResult = decodeUnicodeEscape(); + currentLine.append(unicodeResult.text); + length += unicodeResult.consumedChars; + currentColumnInLine += unicodeResult.consumedChars; + } else { + char actualChar; + switch (escaped) { + case 'n': actualChar = '\n'; break; + case 't': actualChar = '\t'; break; + case 'r': actualChar = '\r'; break; + case '\\': actualChar = '\\'; break; + case '"': actualChar = '"'; break; + case '{': actualChar = '{'; break; + default: actualChar = escaped; break; + } + currentLine.append(actualChar); } - currentLine.append(actualChar); continue; } diff --git a/src/main/java/cod/parser/ExpressionParser.java b/src/main/java/cod/parser/ExpressionParser.java index f26f3f2b..a3b57dd4 100644 --- a/src/main/java/cod/parser/ExpressionParser.java +++ b/src/main/java/cod/parser/ExpressionParser.java @@ -19,6 +19,16 @@ public class ExpressionParser extends BaseParser { private static final String SELF_CALL_PLACEHOLDER = "<~"; + private static final class UnicodeEscapeParseResult { + private final String text; + private final int nextPos; + + private UnicodeEscapeParseResult(String text, int nextPos) { + this.text = text; + this.nextPos = nextPos; + } + } + private static final int PREC_ASSIGNMENT = 10; private static final int PREC_EQUALITY = 50; private static final int PREC_CHAIN = 55; @@ -1034,6 +1044,12 @@ private Expr parseInterpolatedText(Token token) { case '\\': current.append('\\'); break; case '"': current.append('"'); break; case '{': current.append('{'); break; + case 'u': + UnicodeEscapeParseResult unicodeResult = decodeUnicodeEscape(text, pos, token); + current.append(unicodeResult.text); + pos = unicodeResult.nextPos; + inEscape = false; + continue; default: current.append('\\').append(c); break; } inEscape = false; @@ -1088,6 +1104,58 @@ private Expr parseInterpolatedText(Token token) { return result; } + + private int hexValue(char ch) { + if (ch >= '0' && ch <= '9') return ch - '0'; + if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10; + if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10; + return -1; + } + + private int parseUnicodeUnit(String text, int start, Token token) { + if (start + 4 > text.length()) { + throw error("Incomplete Unicode escape in text literal", token); + } + + int value = 0; + for (int i = 0; i < 4; i++) { + int digit = hexValue(text.charAt(start + i)); + if (digit < 0) { + throw error("Invalid Unicode escape in text literal", token); + } + value = (value << 4) + digit; + } + return value; + } + + private UnicodeEscapeParseResult decodeUnicodeEscape(String text, int escapePos, Token token) { + int firstUnit = parseUnicodeUnit(text, escapePos + 1, token); + int nextPos = escapePos + 5; + + if (Character.isLowSurrogate((char)firstUnit)) { + throw error("Unexpected low surrogate in text Unicode escape", token); + } + + if (Character.isHighSurrogate((char)firstUnit)) { + if (nextPos + 5 >= text.length()) { + throw error("Missing low surrogate in text Unicode escape", token); + } + if (text.charAt(nextPos) != '\\' || text.charAt(nextPos + 1) != 'u') { + throw error("Expected low surrogate Unicode escape", token); + } + + int secondUnit = parseUnicodeUnit(text, nextPos + 2, token); + if (!Character.isLowSurrogate((char)secondUnit)) { + throw error("Invalid low surrogate in text Unicode escape", token); + } + + return new UnicodeEscapeParseResult( + new String(new char[] {(char)firstUnit, (char)secondUnit}), + nextPos + 6); + } + + return new UnicodeEscapeParseResult(String.valueOf((char)firstUnit), nextPos); + } private Expr parseInterpolationExpressionDirectly(String exprText, Token textToken) { ParserState savedState = getCurrentState();