From 36c263703a3ff54b5ac143ad850f9eb03027b5ee Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Mon, 9 Jun 2025 23:17:44 +0000 Subject: [PATCH] organization: Split into logically distinct subpieces As I mentioned in #560, and as contemplated in #536, I'd like to try re-using JuliaParser infrastructure to replace parsers I've written for some other languages. This takes the first step to do so by moving various files into directories depending on whether they are language-dependent or not. Right now there is still some coupling and of course, there are no actual abstractions between these pieces. The idea would be to introduce those over time. For now, if we put in this refactoring, the way to use this would be to copy the appropriate pieces (at least `core/`) into your downstream parser and then rewrite it to those APIs. I'm planning to do that with a parser or two to see if I hit any big API issues and see what it would take to actually make the re-use happen. - core: Core functionality for parsing - julia: Core functionality for parsing *julia* - integration: Integration code to use as the parser for base - porcelain: Other syntax tree types for external users of the package The `integration` and `porcelain` components should not depend on each other. Otherwise it's layered as expected. This is just the reorganization. Additional work is required to actually separate the abstractions. 
--- src/JuliaSyntax.jl | 27 +-- src/{ => core}/diagnostics.jl | 0 src/{ => core}/parse_stream.jl | 314 ---------------------------- src/{ => core}/source_files.jl | 0 src/{ => core}/tree_cursors.jl | 0 src/{ => integration}/expr.jl | 0 src/{ => integration}/hooks.jl | 0 src/julia/julia_parse_stream.jl | 315 +++++++++++++++++++++++++++++ src/{ => julia}/kinds.jl | 0 src/{ => julia}/literal_parsing.jl | 0 src/{ => julia}/parser.jl | 0 src/{ => julia}/parser_api.jl | 0 src/{ => julia}/tokenize.jl | 0 src/{ => porcelain}/green_node.jl | 0 src/{ => porcelain}/syntax_tree.jl | 0 src/precompile.jl | 2 +- 16 files changed, 330 insertions(+), 328 deletions(-) rename src/{ => core}/diagnostics.jl (100%) rename src/{ => core}/parse_stream.jl (77%) rename src/{ => core}/source_files.jl (100%) rename src/{ => core}/tree_cursors.jl (100%) rename src/{ => integration}/expr.jl (100%) rename src/{ => integration}/hooks.jl (100%) create mode 100644 src/julia/julia_parse_stream.jl rename src/{ => julia}/kinds.jl (100%) rename src/{ => julia}/literal_parsing.jl (100%) rename src/{ => julia}/parser.jl (100%) rename src/{ => julia}/parser_api.jl (100%) rename src/{ => julia}/tokenize.jl (100%) rename src/{ => porcelain}/green_node.jl (100%) rename src/{ => porcelain}/syntax_tree.jl (100%) diff --git a/src/JuliaSyntax.jl b/src/JuliaSyntax.jl index 3c276984..da5861c0 100644 --- a/src/JuliaSyntax.jl +++ b/src/JuliaSyntax.jl @@ -79,29 +79,30 @@ export SyntaxNode # Helper utilities include("utils.jl") -include("kinds.jl") +include("julia/kinds.jl") # Lexing uses a significantly modified version of Tokenize.jl -include("tokenize.jl") +include("julia/tokenize.jl") # Source and diagnostics -include("source_files.jl") -include("diagnostics.jl") +include("core/source_files.jl") +include("core/diagnostics.jl") # Parsing -include("parse_stream.jl") -include("parser.jl") -include("parser_api.jl") -include("literal_parsing.jl") +include("core/parse_stream.jl") +include("core/tree_cursors.jl") 
+include("julia/julia_parse_stream.jl") +include("julia/parser.jl") +include("julia/parser_api.jl") +include("julia/literal_parsing.jl") # Tree data structures -include("tree_cursors.jl") -include("green_node.jl") -include("syntax_tree.jl") -include("expr.jl") +include("porcelain/green_node.jl") +include("porcelain/syntax_tree.jl") +include("integration/expr.jl") # Hooks to integrate the parser with Base -include("hooks.jl") +include("integration/hooks.jl") include("precompile.jl") end diff --git a/src/diagnostics.jl b/src/core/diagnostics.jl similarity index 100% rename from src/diagnostics.jl rename to src/core/diagnostics.jl diff --git a/src/parse_stream.jl b/src/core/parse_stream.jl similarity index 77% rename from src/parse_stream.jl rename to src/core/parse_stream.jl index 1000fdaa..fd66b2b4 100644 --- a/src/parse_stream.jl +++ b/src/core/parse_stream.jl @@ -9,93 +9,11 @@ const EMPTY_FLAGS = RawFlags(0) # Set for tokens or ranges which are syntax trivia after parsing const TRIVIA_FLAG = RawFlags(1<<0) -# Token flags - may be set for operator kinded tokens -# Operator is dotted -const DOTOP_FLAG = RawFlags(1<<1) -# Operator has a suffix -const SUFFIXED_FLAG = RawFlags(1<<2) - -# Set for K"call", K"dotcall" or any syntactic operator heads -# Distinguish various syntaxes which are mapped to K"call" -const PREFIX_CALL_FLAG = RawFlags(0<<3) -const INFIX_FLAG = RawFlags(1<<3) -const PREFIX_OP_FLAG = RawFlags(2<<3) -const POSTFIX_OP_FLAG = RawFlags(3<<3) - -# The following flags are quite head-specific and may overlap - -""" -Set when K"string" or K"cmdstring" was triple-delimited as with \"\"\" or ``` -""" -const TRIPLE_STRING_FLAG = RawFlags(1<<5) - -""" -Set when a K"string", K"cmdstring" or K"Identifier" needs raw string unescaping -""" -const RAW_STRING_FLAG = RawFlags(1<<6) - -""" -Set for K"tuple", K"block" or K"macrocall" which are delimited by parentheses -""" -const PARENS_FLAG = RawFlags(1<<5) - -""" -Set for various delimited constructs when they 
contains a trailing comma. For -example, to distinguish `(a,b,)` vs `(a,b)`, and `f(a)` vs `f(a,)`. Kinds where -this applies are: `tuple call dotcall macrocall vect curly braces <: >:`. -""" -const TRAILING_COMMA_FLAG = RawFlags(1<<6) - -""" -Set for K"quote" for the short form `:x` as opposed to long form `quote x end` -""" -const COLON_QUOTE = RawFlags(1<<5) - -""" -Set for K"toplevel" which is delimited by parentheses -""" -const TOPLEVEL_SEMICOLONS_FLAG = RawFlags(1<<5) - -""" -Set for K"function" in short form definitions such as `f() = 1` -""" -const SHORT_FORM_FUNCTION_FLAG = RawFlags(1<<5) - -""" -Set for K"struct" when mutable -""" -const MUTABLE_FLAG = RawFlags(1<<5) - -""" -Set for K"module" when it's not bare (`module`, not `baremodule`) -""" -const BARE_MODULE_FLAG = RawFlags(1<<5) - """ Set for nodes that are non-terminals """ const NON_TERMINAL_FLAG = RawFlags(1<<7) -# Flags holding the dimension of an nrow or other UInt8 not held in the source -# TODO: Given this is only used for nrow/ncat, we could actually use all the flags? -const NUMERIC_FLAGS = RawFlags(RawFlags(0xff)<<8) - -function set_numeric_flags(n::Integer) - f = RawFlags((n << 8) & NUMERIC_FLAGS) - if numeric_flags(f) != n - error("Numeric flags unable to hold large integer $n") - end - f -end - -function call_type_flags(f::RawFlags) - f & 0b11000 -end - -function numeric_flags(f::RawFlags) - Int((f >> 8) % UInt8) -end - function remove_flags(n::RawFlags, fs...) RawFlags(n & ~(RawFlags((|)(fs...)))) end @@ -138,47 +56,6 @@ function Base.summary(head::SyntaxHead) untokenize(head, unique=false, include_flag_suff=false) end -function untokenize(head::SyntaxHead; unique=true, include_flag_suff=true) - str = (is_error(kind(head)) ? untokenize(kind(head); unique=false) : - untokenize(kind(head); unique=unique))::String - if is_dotted(head) - str = "."*str - end - if include_flag_suff - # Ignore DOTOP_FLAG - it's represented above with . 
prefix - is_trivia(head) && (str = str*"-t") - is_infix_op_call(head) && (str = str*"-i") - is_prefix_op_call(head) && (str = str*"-pre") - is_postfix_op_call(head) && (str = str*"-post") - - k = kind(head) - if k in KSet"string cmdstring Identifier" - has_flags(head, TRIPLE_STRING_FLAG) && (str = str*"-s") - has_flags(head, RAW_STRING_FLAG) && (str = str*"-r") - elseif k in KSet"tuple block macrocall" - has_flags(head, PARENS_FLAG) && (str = str*"-p") - elseif k == K"quote" - has_flags(head, COLON_QUOTE) && (str = str*"-:") - elseif k == K"toplevel" - has_flags(head, TOPLEVEL_SEMICOLONS_FLAG) && (str = str*"-;") - elseif k == K"function" - has_flags(head, SHORT_FORM_FUNCTION_FLAG) && (str = str*"-=") - elseif k == K"struct" - has_flags(head, MUTABLE_FLAG) && (str = str*"-mut") - elseif k == K"module" - has_flags(head, BARE_MODULE_FLAG) && (str = str*"-bare") - end - if k in KSet"tuple call dotcall macrocall vect curly braces <: >:" && - has_flags(head, TRAILING_COMMA_FLAG) - str *= "-," - end - is_suffixed(head) && (str = str*"-suf") - n = numeric_flags(head) - n != 0 && (str = str*"-"*string(n)) - end - str -end - #------------------------------------------------------------------------------- # Generic interface for types `T` which have kind and flags. Either: # 1. Define kind(::T) and flags(::T), or @@ -200,65 +77,6 @@ invisible to the parser (eg, whitespace) or implied by the structure of the AST """ is_trivia(x) = has_flags(x, TRIVIA_FLAG) -""" - is_prefix_call(x) - -Return true for normal prefix function call syntax such as the `f` call node -parsed from `f(x)`. -""" -is_prefix_call(x) = call_type_flags(x) == PREFIX_CALL_FLAG - -""" - is_infix_op_call(x) - -Return true for infix operator calls such as the `+` call node parsed from -`x + y`. -""" -is_infix_op_call(x) = call_type_flags(x) == INFIX_FLAG - -""" - is_prefix_op_call(x) - -Return true for prefix operator calls such as the `+` call node parsed from `+x`. 
-""" -is_prefix_op_call(x) = call_type_flags(x) == PREFIX_OP_FLAG - -""" - is_postfix_op_call(x) - -Return true for postfix operator calls such as the `'ᵀ` call node parsed from `x'ᵀ`. -""" -is_postfix_op_call(x) = call_type_flags(x) == POSTFIX_OP_FLAG - -""" - is_dotted(x) - -Return true for dotted syntax tokens -""" -is_dotted(x) = has_flags(x, DOTOP_FLAG) - -""" - is_suffixed(x) - -Return true for operators which have suffixes, such as `+₁` -""" -is_suffixed(x) = has_flags(x, SUFFIXED_FLAG) - -""" - is_decorated(x) - -Return true for operators which are decorated with a dot or suffix. -""" -is_decorated(x) = is_dotted(x) || is_suffixed(x) - -""" - numeric_flags(x) - -Return the number attached to a `SyntaxHead`. This is only for kinds `K"nrow"` -and `K"ncat"`, for now. -""" -numeric_flags(x) = numeric_flags(flags(x)) - #------------------------------------------------------------------------------- """ `SyntaxToken` is a token covering a contiguous byte range in the input text. @@ -280,7 +98,6 @@ function Base.show(io::IO, tok::SyntaxToken) end head(tok::SyntaxToken) = tok.head -flags(tok::SyntaxToken) = remove_flags(flags(tok.head), NUMERIC_FLAGS) preceding_whitespace(tok::SyntaxToken) = tok.preceding_whitespace @@ -962,45 +779,6 @@ function bump_glue(stream::ParseStream, kind, flags) return position(stream) end -""" - bump_split(stream, token_spec1, [token_spec2 ...]) - -Bump the next token, splitting it into several pieces - -Tokens are defined by a number of `token_spec` of shape `(nbyte, kind, flags)`. -If all `nbyte` are positive, the sum must equal the token length. If one -`nbyte` is negative, that token is given `tok_len + nbyte` bytes and the sum of -all `nbyte` must equal zero. - -This is a hack which helps resolves the occasional lexing ambiguity. For -example -* Whether .+ should be a single token or the composite (. +) which is used for - standalone operators. -* Whether ... is splatting (most of the time) or three . 
tokens in import paths - -TODO: Are these the only cases? Can we replace this general utility with a -simpler one which only splits preceding dots? -""" -function bump_split(stream::ParseStream, split_spec::Vararg{Any, N}) where {N} - tok = stream.lookahead[stream.lookahead_index] - stream.lookahead_index += 1 - start_b = _next_byte(stream) - toklen = tok.next_byte - start_b - prev_b = start_b - for (i, (nbyte, k, f)) in enumerate(split_spec) - h = SyntaxHead(k, f) - actual_nbyte = nbyte < 0 ? (toklen + nbyte) : nbyte - orig_k = k == K"." ? K"." : kind(tok) - node = RawGreenNode(h, actual_nbyte, orig_k) - push!(stream.output, node) - prev_b += actual_nbyte - stream.next_byte += actual_nbyte - end - @assert tok.next_byte == prev_b - stream.peek_count = 0 - return position(stream) -end - """ Reset kind or flags of an existing node in the output stream @@ -1129,98 +907,6 @@ function emit_diagnostic(diagnostics::AbstractVector{Diagnostic}, push!(diagnostics, Diagnostic(first(byterange), last(byterange); kws...)) end -#------------------------------------------------------------------------------- -# ParseStream Post-processing - -function validate_tokens(stream::ParseStream) - txtbuf = unsafe_textbuf(stream) - charbuf = IOBuffer() - - # Process terminal nodes in the output - fbyte = stream.output[1].byte_span+1 # Start after sentinel - for i = 2:length(stream.output) - node = stream.output[i] - if !is_terminal(node) || kind(node) == K"TOMBSTONE" - continue - end - - k = kind(node) - nbyte = fbyte + node.byte_span - tokrange = fbyte:nbyte-1 - error_kind = K"None" - - if k in KSet"Integer BinInt OctInt HexInt" - # The following shouldn't be able to error... 
- # parse_int_literal - # parse_uint_literal - elseif k == K"Float" || k == K"Float32" - underflow0 = false - if k == K"Float" - x, code = parse_float_literal(Float64, txtbuf, fbyte, nbyte) - # jl_strtod_c can return "underflow" even for valid cases such - # as `5e-324` where the source is an exact representation of - # `x`. So only warn when underflowing to zero. - underflow0 = code === :underflow && x == 0 - else - x, code = parse_float_literal(Float32, txtbuf, fbyte, nbyte) - underflow0 = code === :underflow && x == 0 - end - if code === :ok - # pass - elseif code === :overflow - emit_diagnostic(stream, tokrange, - error="overflow in floating point literal") - error_kind = K"ErrorNumericOverflow" - elseif underflow0 - emit_diagnostic(stream, tokrange, - warning="underflow to zero in floating point literal") - end - elseif k == K"Char" - @assert fbyte < nbyte # Already handled in the parser - truncate(charbuf, 0) - had_error = unescape_julia_string(charbuf, txtbuf, fbyte, - nbyte, stream.diagnostics) - if had_error - error_kind = K"ErrorInvalidEscapeSequence" - else - seek(charbuf,0) - read(charbuf, Char) - if !eof(charbuf) - error_kind = K"ErrorOverLongCharacter" - emit_diagnostic(stream, tokrange, - error="character literal contains multiple characters") - end - end - elseif k == K"String" && !has_flags(node, RAW_STRING_FLAG) - had_error = unescape_julia_string(devnull, txtbuf, fbyte, - nbyte, stream.diagnostics) - if had_error - error_kind = K"ErrorInvalidEscapeSequence" - end - elseif is_error(k) && k != K"error" - # Emit messages for non-generic token errors - tokstr = String(txtbuf[tokrange]) - msg = if k in KSet"ErrorInvisibleChar ErrorUnknownCharacter ErrorIdentifierStart" - "$(_token_error_descriptions[k]) $(repr(tokstr[1]))" - elseif k in KSet"ErrorInvalidUTF8 ErrorBidiFormatting" - "$(_token_error_descriptions[k]) $(repr(tokstr))" - else - _token_error_descriptions[k] - end - emit_diagnostic(stream, tokrange, error=msg) - end - - if error_kind != 
K"None" - # Update the node with new error kind - stream.output[i] = RawGreenNode(SyntaxHead(error_kind, EMPTY_FLAGS), - node.byte_span, node.orig_kind) - end - - fbyte = nbyte - end - sort!(stream.diagnostics, by=first_byte) -end - # Tree construction from the list of text ranges held by ParseStream # API for extracting results from ParseStream diff --git a/src/source_files.jl b/src/core/source_files.jl similarity index 100% rename from src/source_files.jl rename to src/core/source_files.jl diff --git a/src/tree_cursors.jl b/src/core/tree_cursors.jl similarity index 100% rename from src/tree_cursors.jl rename to src/core/tree_cursors.jl diff --git a/src/expr.jl b/src/integration/expr.jl similarity index 100% rename from src/expr.jl rename to src/integration/expr.jl diff --git a/src/hooks.jl b/src/integration/hooks.jl similarity index 100% rename from src/hooks.jl rename to src/integration/hooks.jl diff --git a/src/julia/julia_parse_stream.jl b/src/julia/julia_parse_stream.jl new file mode 100644 index 00000000..aab8a547 --- /dev/null +++ b/src/julia/julia_parse_stream.jl @@ -0,0 +1,315 @@ +# Token flags - may be set for operator kinded tokens +# Operator is dotted +const DOTOP_FLAG = RawFlags(1<<1) +# Operator has a suffix +const SUFFIXED_FLAG = RawFlags(1<<2) + +# Set for K"call", K"dotcall" or any syntactic operator heads +# Distinguish various syntaxes which are mapped to K"call" +const PREFIX_CALL_FLAG = RawFlags(0<<3) +const INFIX_FLAG = RawFlags(1<<3) +const PREFIX_OP_FLAG = RawFlags(2<<3) +const POSTFIX_OP_FLAG = RawFlags(3<<3) + +# The following flags are quite head-specific and may overlap + +""" +Set when K"string" or K"cmdstring" was triple-delimited as with \"\"\" or ``` +""" +const TRIPLE_STRING_FLAG = RawFlags(1<<5) + +""" +Set when a K"string", K"cmdstring" or K"Identifier" needs raw string unescaping +""" +const RAW_STRING_FLAG = RawFlags(1<<6) + +""" +Set for K"tuple", K"block" or K"macrocall" which are delimited by parentheses +""" +const 
PARENS_FLAG = RawFlags(1<<5) + +""" +Set for various delimited constructs when they contains a trailing comma. For +example, to distinguish `(a,b,)` vs `(a,b)`, and `f(a)` vs `f(a,)`. Kinds where +this applies are: `tuple call dotcall macrocall vect curly braces <: >:`. +""" +const TRAILING_COMMA_FLAG = RawFlags(1<<6) + +""" +Set for K"quote" for the short form `:x` as opposed to long form `quote x end` +""" +const COLON_QUOTE = RawFlags(1<<5) + +""" +Set for K"toplevel" which is delimited by parentheses +""" +const TOPLEVEL_SEMICOLONS_FLAG = RawFlags(1<<5) + +""" +Set for K"function" in short form definitions such as `f() = 1` +""" +const SHORT_FORM_FUNCTION_FLAG = RawFlags(1<<5) + +""" +Set for K"struct" when mutable +""" +const MUTABLE_FLAG = RawFlags(1<<5) + +""" +Set for K"module" when it's not bare (`module`, not `baremodule`) +""" +const BARE_MODULE_FLAG = RawFlags(1<<5) + +# Flags holding the dimension of an nrow or other UInt8 not held in the source +# TODO: Given this is only used for nrow/ncat, we could actually use all the flags? +const NUMERIC_FLAGS = RawFlags(RawFlags(0xff)<<8) + +function set_numeric_flags(n::Integer) + f = RawFlags((n << 8) & NUMERIC_FLAGS) + if numeric_flags(f) != n + error("Numeric flags unable to hold large integer $n") + end + f +end + +function call_type_flags(f::RawFlags) + f & 0b11000 +end + +function numeric_flags(f::RawFlags) + Int((f >> 8) % UInt8) +end + +flags(tok::SyntaxToken) = remove_flags(flags(tok.head), NUMERIC_FLAGS) + +""" + is_prefix_call(x) + +Return true for normal prefix function call syntax such as the `f` call node +parsed from `f(x)`. +""" +is_prefix_call(x) = call_type_flags(x) == PREFIX_CALL_FLAG + +""" + is_infix_op_call(x) + +Return true for infix operator calls such as the `+` call node parsed from +`x + y`. +""" +is_infix_op_call(x) = call_type_flags(x) == INFIX_FLAG + +""" + is_prefix_op_call(x) + +Return true for prefix operator calls such as the `+` call node parsed from `+x`. 
+""" +is_prefix_op_call(x) = call_type_flags(x) == PREFIX_OP_FLAG + +""" + is_postfix_op_call(x) + +Return true for postfix operator calls such as the `'ᵀ` call node parsed from `x'ᵀ`. +""" +is_postfix_op_call(x) = call_type_flags(x) == POSTFIX_OP_FLAG + +""" + is_dotted(x) + +Return true for dotted syntax tokens +""" +is_dotted(x) = has_flags(x, DOTOP_FLAG) + +""" + is_suffixed(x) + +Return true for operators which have suffixes, such as `+₁` +""" +is_suffixed(x) = has_flags(x, SUFFIXED_FLAG) + +""" + is_decorated(x) + +Return true for operators which are decorated with a dot or suffix. +""" +is_decorated(x) = is_dotted(x) || is_suffixed(x) + +""" + numeric_flags(x) + +Return the number attached to a `SyntaxHead`. This is only for kinds `K"nrow"` +and `K"ncat"`, for now. +""" +numeric_flags(x) = numeric_flags(flags(x)) + +function untokenize(head::SyntaxHead; unique=true, include_flag_suff=true) + str = (is_error(kind(head)) ? untokenize(kind(head); unique=false) : + untokenize(kind(head); unique=unique))::String + if is_dotted(head) + str = "."*str + end + if include_flag_suff + # Ignore DOTOP_FLAG - it's represented above with . 
prefix + is_trivia(head) && (str = str*"-t") + is_infix_op_call(head) && (str = str*"-i") + is_prefix_op_call(head) && (str = str*"-pre") + is_postfix_op_call(head) && (str = str*"-post") + + k = kind(head) + if k in KSet"string cmdstring Identifier" + has_flags(head, TRIPLE_STRING_FLAG) && (str = str*"-s") + has_flags(head, RAW_STRING_FLAG) && (str = str*"-r") + elseif k in KSet"tuple block macrocall" + has_flags(head, PARENS_FLAG) && (str = str*"-p") + elseif k == K"quote" + has_flags(head, COLON_QUOTE) && (str = str*"-:") + elseif k == K"toplevel" + has_flags(head, TOPLEVEL_SEMICOLONS_FLAG) && (str = str*"-;") + elseif k == K"function" + has_flags(head, SHORT_FORM_FUNCTION_FLAG) && (str = str*"-=") + elseif k == K"struct" + has_flags(head, MUTABLE_FLAG) && (str = str*"-mut") + elseif k == K"module" + has_flags(head, BARE_MODULE_FLAG) && (str = str*"-bare") + end + if k in KSet"tuple call dotcall macrocall vect curly braces <: >:" && + has_flags(head, TRAILING_COMMA_FLAG) + str *= "-," + end + is_suffixed(head) && (str = str*"-suf") + n = numeric_flags(head) + n != 0 && (str = str*"-"*string(n)) + end + str +end + + +#------------------------------------------------------------------------------- +# ParseStream Post-processing + +function validate_tokens(stream::ParseStream) + txtbuf = unsafe_textbuf(stream) + charbuf = IOBuffer() + + # Process terminal nodes in the output + fbyte = stream.output[1].byte_span+1 # Start after sentinel + for i = 2:length(stream.output) + node = stream.output[i] + if !is_terminal(node) || kind(node) == K"TOMBSTONE" + continue + end + + k = kind(node) + nbyte = fbyte + node.byte_span + tokrange = fbyte:nbyte-1 + error_kind = K"None" + + if k in KSet"Integer BinInt OctInt HexInt" + # The following shouldn't be able to error... 
+ # parse_int_literal + # parse_uint_literal + elseif k == K"Float" || k == K"Float32" + underflow0 = false + if k == K"Float" + x, code = parse_float_literal(Float64, txtbuf, fbyte, nbyte) + # jl_strtod_c can return "underflow" even for valid cases such + # as `5e-324` where the source is an exact representation of + # `x`. So only warn when underflowing to zero. + underflow0 = code === :underflow && x == 0 + else + x, code = parse_float_literal(Float32, txtbuf, fbyte, nbyte) + underflow0 = code === :underflow && x == 0 + end + if code === :ok + # pass + elseif code === :overflow + emit_diagnostic(stream, tokrange, + error="overflow in floating point literal") + error_kind = K"ErrorNumericOverflow" + elseif underflow0 + emit_diagnostic(stream, tokrange, + warning="underflow to zero in floating point literal") + end + elseif k == K"Char" + @assert fbyte < nbyte # Already handled in the parser + truncate(charbuf, 0) + had_error = unescape_julia_string(charbuf, txtbuf, fbyte, + nbyte, stream.diagnostics) + if had_error + error_kind = K"ErrorInvalidEscapeSequence" + else + seek(charbuf,0) + read(charbuf, Char) + if !eof(charbuf) + error_kind = K"ErrorOverLongCharacter" + emit_diagnostic(stream, tokrange, + error="character literal contains multiple characters") + end + end + elseif k == K"String" && !has_flags(node, RAW_STRING_FLAG) + had_error = unescape_julia_string(devnull, txtbuf, fbyte, + nbyte, stream.diagnostics) + if had_error + error_kind = K"ErrorInvalidEscapeSequence" + end + elseif is_error(k) && k != K"error" + # Emit messages for non-generic token errors + tokstr = String(txtbuf[tokrange]) + msg = if k in KSet"ErrorInvisibleChar ErrorUnknownCharacter ErrorIdentifierStart" + "$(_token_error_descriptions[k]) $(repr(tokstr[1]))" + elseif k in KSet"ErrorInvalidUTF8 ErrorBidiFormatting" + "$(_token_error_descriptions[k]) $(repr(tokstr))" + else + _token_error_descriptions[k] + end + emit_diagnostic(stream, tokrange, error=msg) + end + + if error_kind != 
K"None" + # Update the node with new error kind + stream.output[i] = RawGreenNode(SyntaxHead(error_kind, EMPTY_FLAGS), + node.byte_span, node.orig_kind) + end + + fbyte = nbyte + end + sort!(stream.diagnostics, by=first_byte) +end + +""" + bump_split(stream, token_spec1, [token_spec2 ...]) + +Bump the next token, splitting it into several pieces + +Tokens are defined by a number of `token_spec` of shape `(nbyte, kind, flags)`. +If all `nbyte` are positive, the sum must equal the token length. If one +`nbyte` is negative, that token is given `tok_len + nbyte` bytes and the sum of +all `nbyte` must equal zero. + +This is a hack which helps resolves the occasional lexing ambiguity. For +example +* Whether .+ should be a single token or the composite (. +) which is used for + standalone operators. +* Whether ... is splatting (most of the time) or three . tokens in import paths + +TODO: Are these the only cases? Can we replace this general utility with a +simpler one which only splits preceding dots? +""" +function bump_split(stream::ParseStream, split_spec::Vararg{Any, N}) where {N} + tok = stream.lookahead[stream.lookahead_index] + stream.lookahead_index += 1 + start_b = _next_byte(stream) + toklen = tok.next_byte - start_b + prev_b = start_b + for (i, (nbyte, k, f)) in enumerate(split_spec) + h = SyntaxHead(k, f) + actual_nbyte = nbyte < 0 ? (toklen + nbyte) : nbyte + orig_k = k == K"." ? K"." 
: kind(tok) + node = RawGreenNode(h, actual_nbyte, orig_k) + push!(stream.output, node) + prev_b += actual_nbyte + stream.next_byte += actual_nbyte + end + @assert tok.next_byte == prev_b + stream.peek_count = 0 + return position(stream) +end diff --git a/src/kinds.jl b/src/julia/kinds.jl similarity index 100% rename from src/kinds.jl rename to src/julia/kinds.jl diff --git a/src/literal_parsing.jl b/src/julia/literal_parsing.jl similarity index 100% rename from src/literal_parsing.jl rename to src/julia/literal_parsing.jl diff --git a/src/parser.jl b/src/julia/parser.jl similarity index 100% rename from src/parser.jl rename to src/julia/parser.jl diff --git a/src/parser_api.jl b/src/julia/parser_api.jl similarity index 100% rename from src/parser_api.jl rename to src/julia/parser_api.jl diff --git a/src/tokenize.jl b/src/julia/tokenize.jl similarity index 100% rename from src/tokenize.jl rename to src/julia/tokenize.jl diff --git a/src/green_node.jl b/src/porcelain/green_node.jl similarity index 100% rename from src/green_node.jl rename to src/porcelain/green_node.jl diff --git a/src/syntax_tree.jl b/src/porcelain/syntax_tree.jl similarity index 100% rename from src/syntax_tree.jl rename to src/porcelain/syntax_tree.jl diff --git a/src/precompile.jl b/src/precompile.jl index 5a80d92d..9fb71c74 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -1,5 +1,5 @@ # Just parse some file as a precompile workload -let filename = joinpath(@__DIR__, "literal_parsing.jl") +let filename = joinpath(@__DIR__, "julia/literal_parsing.jl") text = read(filename, String) parseall(Expr, text) parseall(SyntaxNode, text)