|
| 1 | +# ============================================================================= |
| 2 | +# Self-Hosting Lexer (Phase V, milestone 1) |
| 3 | +# ============================================================================= |
| 4 | +# A lexer for a subset of OMNIcode, written entirely in OMNIcode itself. |
| 5 | +# Run on the Rust OMC interpreter, this demonstrates that the language is |
| 6 | +# expressive enough to introspect its own source. |
| 7 | +# |
| 8 | +# Pipeline: |
| 9 | +# string source --> this lexer --> array of ["KIND", "value"] tokens |
| 10 | +# |
# Scope: handles identifiers, integers, single-character punctuation
# (= , ; ( ) { } [ ] + - * / % < > . : @ & | ^ ~), the keyword table in
# classify_word (`h`, `fn`, `if`, `else`, `while`, `for`, `return`, ...), and
# string literals in double quotes. Comments (# ...) are skipped.
#
# Not yet handled: float literals, triple-quoted strings, escape sequences,
# and multi-character operators (==, !=, <=, >=, ->, <<, >>). The
# single-character bitwise operators (& | ^ ~) are tokenized, but the shifts
# (<< >>) come out as two separate single-character tokens.
# Those are the next milestone.
| 18 | +# ============================================================================= |
| 19 | + |
# ---------------------------------------------------------------------------
# Character class predicates. OMC has no ord(), so each class is tested by
# looking the character up in a literal alphabet with str_contains. Results
# are 1 / 0, like the canonical Python OMC convention.
# ---------------------------------------------------------------------------

# Returns 1 when `c` is exactly one decimal digit, 0 otherwise.
# The length guard is needed because str_contains is used here as a
# substring test (assumed — TODO confirm against the builtin): without the
# guard, "" and "12" would both be reported as digits.
fn is_digit(c) {
    if str_len(c) == 1 {
        return str_contains("0123456789", c);
    }
    return 0;
}
| 28 | + |
# Returns 1 when `c` is a single ASCII letter (either case) or an
# underscore, 0 otherwise. These are the characters allowed to start an
# identifier in tokenize.
# The alphabet lookups only run for one-character input: str_contains is
# used as a substring test (assumed — TODO confirm against the builtin), so
# without the guard "" and "ab" would be reported as letters.
fn is_alpha(c) {
    if c == "_" { return 1; }
    if str_len(c) == 1 {
        if str_contains("abcdefghijklmnopqrstuvwxyz", c) == 1 { return 1; }
        if str_contains("ABCDEFGHIJKLMNOPQRSTUVWXYZ", c) == 1 { return 1; }
    }
    return 0;
}
| 35 | + |
# Returns 1 when `c` is accepted by is_alpha or is_digit, 0 otherwise.
# These are the characters allowed after the first character of an
# identifier (see read_ident).
fn is_alnum(c) {
    h verdict = 0;
    if is_digit(c) == 1 { verdict = 1; }
    if is_alpha(c) == 1 { verdict = 1; }
    return verdict;
}
| 41 | + |
# Returns 1 when `c` is a space, tab, carriage return or newline, and 0 for
# anything else.
fn is_space(c) {
    h verdict = 0;
    if c == "\r" { verdict = 1; }
    if c == "\t" { verdict = 1; }
    if c == "\n" { verdict = 1; }
    if c == " " { verdict = 1; }
    return verdict;
}
| 49 | + |
# ---------------------------------------------------------------------------
# Advance past whitespace and `#` line comments beginning at `pos`, and
# return the first position that is neither (or the position where the
# source ran out). The position is taken and handed back by value because
# OMC doesn't have mutable references.
# ---------------------------------------------------------------------------
fn skip_ws(source, pos) {
    h limit = str_len(source);
    h cursor = pos;
    while cursor < limit {
        h ch = str_slice(source, cursor, cursor + 1);
        if ch == "#" {
            # Line comment: consume up to and including the newline, or to
            # the end of the source if the comment is on the last line.
            while cursor < limit {
                h seen = str_slice(source, cursor, cursor + 1);
                cursor = cursor + 1;
                if seen == "\n" {
                    break;
                }
            }
        } else {
            if is_space(ch) == 1 {
                cursor = cursor + 1;
            } else {
                # First significant character — stop here.
                break;
            }
        }
    }
    return cursor;
}
| 80 | + |
# ---------------------------------------------------------------------------
# Map a scanned word to its token kind. Known keywords yield their own kind
# string; `true` and `false` both yield "BOOL"; every other word is an
# "IDENT". Mirrors the OMC lexer's keyword table.
# ---------------------------------------------------------------------------
fn classify_word(word) -> string {
    # Start from the fallback and overwrite it on a match. A word can match
    # at most one entry, so the order of the checks does not matter.
    h label = "IDENT";
    if word == "h" { label = "H"; }
    if word == "fn" { label = "FN"; }
    if word == "if" { label = "IF"; }
    if word == "else" { label = "ELSE"; }
    if word == "while" { label = "WHILE"; }
    if word == "for" { label = "FOR"; }
    if word == "in" { label = "IN"; }
    if word == "return" { label = "RETURN"; }
    if word == "break" { label = "BREAK"; }
    if word == "continue" { label = "CONTINUE"; }
    if word == "print" { label = "PRINT"; }
    if word == "import" { label = "IMPORT"; }
    if word == "and" { label = "AND"; }
    if word == "or" { label = "OR"; }
    if word == "not" { label = "NOT"; }
    if word == "res" { label = "RES"; }
    if word == "fold" { label = "FOLD"; }
    if word == "true" { label = "BOOL"; }
    if word == "false" { label = "BOOL"; }
    return label;
}
| 107 | + |
# ---------------------------------------------------------------------------
# Scan the identifier or keyword that begins at `pos`. Returns a 3-element
# array [kind, value, end_pos]: kind comes from classify_word, value is the
# scanned word, and end_pos is the index of the first character after it.
# ---------------------------------------------------------------------------
fn read_ident(source, pos) {
    h limit = str_len(source);
    h stop = pos;
    # Extend the word for as long as the next character is alphanumeric.
    while stop < limit {
        if is_alnum(str_slice(source, stop, stop + 1)) == 1 {
            stop = stop + 1;
        } else {
            break;
        }
    }
    h lexeme = str_slice(source, pos, stop);
    return [classify_word(lexeme), lexeme, stop];
}
| 127 | + |
# ---------------------------------------------------------------------------
# Scan the run of decimal digits that begins at `pos`. Returns
# ["NUMBER", digits, end_pos], where end_pos is the index of the first
# character after the run. Integers only — there is no float handling here.
# ---------------------------------------------------------------------------
fn read_number(source, pos) {
    h limit = str_len(source);
    h stop = pos;
    h scanning = 1;
    while scanning == 1 {
        if stop < limit {
            if is_digit(str_slice(source, stop, stop + 1)) == 1 {
                stop = stop + 1;
            } else {
                # Hit a non-digit: the literal ends here.
                scanning = 0;
            }
        } else {
            # Ran off the end of the source.
            scanning = 0;
        }
    }
    return ["NUMBER", str_slice(source, pos, stop), stop];
}
| 145 | + |
# ---------------------------------------------------------------------------
# Read a double-quoted string literal whose opening quote is at `pos`.
# Returns a 3-element array [kind, content, end_pos]:
#   closed literal   -> ["STRING", text between the quotes, index just past
#                        the closing quote]
#   no closing quote -> ["STRING_UNCLOSED", everything after the opening
#                        quote, position where scanning stopped]
# The unterminated case keeps the text it scanned, so those characters are
# not silently dropped from the token stream.
# Does NOT handle backslash escapes (next milestone): the first `"` after
# the opening quote always ends the literal.
# ---------------------------------------------------------------------------
fn read_string_literal(source, pos) {
    h n = str_len(source);
    h start = pos + 1;   # first character after the opening quote
    h end = start;
    if end >= n {
        # Nothing follows the opening quote; there is no content to slice.
        return ["STRING_UNCLOSED", "", end];
    }
    while end < n {
        if str_slice(source, end, end + 1) == "\"" {
            return ["STRING", str_slice(source, start, end), end + 1];
        }
        end = end + 1;
    }
    return ["STRING_UNCLOSED", str_slice(source, start, end), end];
}
| 163 | + |
# ---------------------------------------------------------------------------
# Look up the token kind for a single punctuation character. Returns the
# kind name, or the empty string when `c` is not a recognized punctuation
# character (tokenize uses the empty result to emit UNKNOWN).
# ---------------------------------------------------------------------------
fn punct_kind(c) -> string {
    # Start from "not punctuation" and overwrite on a match. `c` can equal
    # at most one entry, so the order of the checks does not matter.
    h label = "";

    # Brackets and separators.
    if c == "(" { label = "LPAREN"; }
    if c == ")" { label = "RPAREN"; }
    if c == "{" { label = "LBRACE"; }
    if c == "}" { label = "RBRACE"; }
    if c == "[" { label = "LBRACKET"; }
    if c == "]" { label = "RBRACKET"; }
    if c == ";" { label = "SEMI"; }
    if c == "," { label = "COMMA"; }
    if c == "." { label = "DOT"; }
    if c == ":" { label = "COLON"; }
    if c == "@" { label = "AT"; }

    # Assignment, arithmetic and comparison.
    if c == "=" { label = "EQ"; }
    if c == "+" { label = "PLUS"; }
    if c == "-" { label = "MINUS"; }
    if c == "*" { label = "STAR"; }
    if c == "/" { label = "SLASH"; }
    if c == "%" { label = "PERCENT"; }
    if c == "<" { label = "LT"; }
    if c == ">" { label = "GT"; }

    # Single-character bitwise operators.
    if c == "&" { label = "AMP"; }
    if c == "|" { label = "PIPE"; }
    if c == "^" { label = "CARET"; }
    if c == "~" { label = "TILDE"; }

    return label;
}
| 193 | + |
# ---------------------------------------------------------------------------
# Tokenize the whole source string. Returns an array of [kind, value]
# tokens, always terminated by ["EOF", ""]. Positions are tracked internally
# and never appear in the result.
# ---------------------------------------------------------------------------
fn tokenize(source) {
    h n = str_len(source);
    h tokens = arr_new(0, 0);
    h pos = 0;

    while pos < n {
        pos = skip_ws(source, pos);
        if pos >= n { break; }

        h c = str_slice(source, pos, pos + 1);

        # Every case below produces a [kind, value, end_pos] triple. The
        # default covers a character nothing else recognizes: it is emitted
        # as UNKNOWN and skipped.
        h tok = ["UNKNOWN", c, pos + 1];

        if is_alpha(c) == 1 {
            # Identifier or keyword.
            tok = read_ident(source, pos);
        } else {
            if is_digit(c) == 1 {
                # Integer literal.
                tok = read_number(source, pos);
            } else {
                if c == "\"" {
                    # String literal.
                    tok = read_string_literal(source, pos);
                } else {
                    # Single-character punctuation, if recognized.
                    h kind = punct_kind(c);
                    if str_len(kind) > 0 {
                        tok = [kind, c, pos + 1];
                    }
                }
            }
        }

        # Keep only kind and value in the stream; end_pos drives the scan.
        arr_push(tokens, [arr_get(tok, 0), arr_get(tok, 1)]);
        pos = arr_get(tok, 2);
    }

    arr_push(tokens, ["EOF", ""]);
    return tokens;
}
| 246 | + |
# ---------------------------------------------------------------------------
# Print a token stream, one line per token, showing the token's index, its
# kind and its value. Expects the [kind, value] pairs produced by tokenize.
# ---------------------------------------------------------------------------
fn print_tokens(tokens) {
    h count = arr_len(tokens);
    h idx = 0;
    while idx < count {
        h entry = arr_get(tokens, idx);
        print(concat_many(" [", idx, "] ", arr_get(entry, 0), " ", arr_get(entry, 1)));
        idx = idx + 1;
    }
}
| 261 | + |
# ---------------------------------------------------------------------------
# Drive the lexer on representative inputs. Each case prints a label that
# shows the input, tokenizes it, and prints the resulting token stream.
# ---------------------------------------------------------------------------
print("== Self-Hosting Lexer Demo (Phase V, milestone 1) ==");
print("");

# Test 1: simplest possible OMC program.
print("--- Input 1: h x = 89; ---");
h src1 = "h x = 89;";
h toks1 = tokenize(src1);
print_tokens(toks1);
print("");

# Test 2: a function definition with arithmetic.
print("--- Input 2: fn add(a, b) { return a + b; } ---");
h src2 = "fn add(a, b) { return a + b; }";
h toks2 = tokenize(src2);
print_tokens(toks2);
print("");

# Test 3: with a comment and a string literal. The source string spans two
# lines on purpose: the embedded newline is what ends the `#` comment.
print("--- Input 3: # greet\\nprint(\"hi\"); ---");
h src3 = "# greet
print(\"hi\");";
h toks3 = tokenize(src3);
print_tokens(toks3);
print("");

# Test 4: harmonic-flavored — uses res() and a Fibonacci constant.
print("--- Input 4: h r = res(89); ---");
h src4 = "h r = res(89);";
h toks4 = tokenize(src4);
print_tokens(toks4);
print("");

# Closing notes. The operator list names `>=` (it previously read `=>`),
# matching the list of unhandled operators in the file header.
print("== Observations ==");
print("- This lexer runs on the Rust OMC interpreter and emits tokens for");
print(" programs the SAME interpreter could parse. Self-introspection.");
print("- Position-threading by return value is verbose but works without");
print(" mutable references — a real constraint of the language as it stands.");
print("- Next milestones: multi-char operators (== <= >= !=), float literals,");
print(" string-escape handling. Then a parser. Then a codegen. Then the");
print(" fixpoint: OMC-compiled-by-OMC produces the same output as itself.");
0 commit comments