# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import Iterable, Union

from libcst._exceptions import EOFSentinel
from libcst._parser.parso.pgen2.generator import ReservedString
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.types.token import Token

_EOF_STR: str = "end of file (EOF)"
_INDENT_STR: str = "an indent"
_DEDENT_STR: str = "a dedent"


def get_expected_str(
    encountered: Union[Token, EOFSentinel],
    expected: Union[Iterable[Union[TokenType, ReservedString]], EOFSentinel],
) -> str:
    """
    Build the human-readable tail of a syntax error message.

    ``encountered`` is the token (or EOF sentinel) the parser stopped on. An
    ENDMARKER token is reported the same way as the EOF sentinel, INDENT and
    DEDENT tokens get a fixed description, and anything else is shown as the
    ``repr`` of the token's string.

    ``expected`` is either the EOF sentinel or an iterable of the token types
    and reserved strings that would have been accepted. Token types are shown
    by ``repr`` of their ``name``, reserved strings by ``repr`` of their
    ``value``, and the resulting names are sorted.

    Returns ``"Encountered X, but expected A, B, or C."`` when there are between
    one and ten expected names, and the abbreviated
    ``"Unexpectedly encountered X."`` otherwise.
    """
    if (
        isinstance(encountered, EOFSentinel)
        or encountered.type is PythonTokenTypes.ENDMARKER
    ):
        encountered_str = _EOF_STR
    elif encountered.type is PythonTokenTypes.INDENT:
        encountered_str = _INDENT_STR
    elif encountered.type is PythonTokenTypes.DEDENT:
        encountered_str = _DEDENT_STR
    else:
        encountered_str = repr(encountered.string)

    if isinstance(expected, EOFSentinel):
        expected_names = [_EOF_STR]
    else:
        expected_names = sorted(
            repr(el.name) if isinstance(el, TokenType) else repr(el.value)
            for el in expected
        )

    if not expected_names or len(expected_names) > 10:
        # Either there is nothing to list (indexing an empty list below would
        # raise IndexError and hide the real syntax error), or there are too
        # many possibilities for the list to be useful. Abbreviate the message.
        return f"Unexpectedly encountered {encountered_str}."

    if len(expected_names) == 1:
        expected_str = expected_names[0]
    else:
        expected_str = f"{', '.join(expected_names[:-1])}, or {expected_names[-1]}"
    return f"Encountered {encountered_str}, but expected {expected_str}."
diff --git a/libcst/_parser/base_parser.py b/libcst/_parser/base_parser.py deleted file mode 100644 index d349bb149..000000000 --- a/libcst/_parser/base_parser.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. - -# Modifications: -# Copyright David Halter and Contributors -# Modifications are dual-licensed: MIT and PSF. -# 99% of the code is different from pgen2, now. - -# A fork of `parso.parser`. -# https://github.com/davidhalter/parso/blob/v0.3.4/parso/parser.py -# -# The following changes were made: -# - Typing was added. -# - Error recovery is removed. -# - The Jedi-specific _allowed_transition_names_and_token_types API is removed. -# - Improved error messages by using our exceptions module. -# - node_map/leaf_map were removed in favor of just calling convert_*. -# - convert_node/convert_leaf were renamed to convert_nonterminal/convert_terminal -# - convert_nonterminal is called regardless of the number of children. Parso avoids -# calling it in some cases to avoid creating extra nodes. -# - The parser is constructed with the tokens to allow us to track a bit more state. -# As a consequence, a parser may only be used once. -# - Supports our custom Token class, instead of `parso.python.tokenize.Token`. 
- - -from dataclasses import dataclass, field -from typing import Generic, Iterable, List, Sequence, TypeVar, Union - -from libcst._exceptions import EOFSentinel, ParserSyntaxError, PartialParserSyntaxError -from libcst._parser._parsing_check import get_expected_str -from libcst._parser.parso.pgen2.generator import DFAState, Grammar, ReservedString -from libcst._parser.parso.python.token import TokenType -from libcst._parser.types.token import Token - -_NodeT = TypeVar("_NodeT") -_TokenTypeT = TypeVar("_TokenTypeT", bound=TokenType) -_TokenT = TypeVar("_TokenT", bound=Token) - - -@dataclass(frozen=False) -class StackNode(Generic[_TokenTypeT, _NodeT]): - dfa: "DFAState[_TokenTypeT]" - nodes: List[_NodeT] = field(default_factory=list) - - @property - def nonterminal(self) -> str: - return self.dfa.from_rule - - -def _token_to_transition( - grammar: "Grammar[_TokenTypeT]", type_: _TokenTypeT, value: str -) -> Union[ReservedString, _TokenTypeT]: - # Map from token to label - if type_.contains_syntax: - # Check for reserved words (keywords) - try: - return grammar.reserved_syntax_strings[value] - except KeyError: - pass - - return type_ - - -# TODO: This should be an ABC, but there's a metaclass conflict between Generic and ABC -# that's fixed in Python 3.7. -class BaseParser(Generic[_TokenT, _TokenTypeT, _NodeT]): - """Parser engine. - - A Parser instance contains state pertaining to the current token - sequence, and should not be used concurrently by different threads - to parse separate token sequences. - - See python/tokenize.py for how to get input tokens by a string. - """ - - tokens: Iterable[_TokenT] - lines: Sequence[str] # used when generating parse errors - _pgen_grammar: "Grammar[_TokenTypeT]" - stack: List[StackNode[_TokenTypeT, _NodeT]] - # Keep track of if parse was called. Because a parser may keep global mutable state, - # each BaseParser instance should only be used once. 
- __was_parse_called: bool - - def __init__( - self, - *, - tokens: Iterable[_TokenT], - lines: Sequence[str], - pgen_grammar: "Grammar[_TokenTypeT]", - start_nonterminal: str, - ) -> None: - self.tokens = tokens - self.lines = lines - self._pgen_grammar = pgen_grammar - first_dfa = pgen_grammar.nonterminal_to_dfas[start_nonterminal][0] - self.stack = [StackNode(first_dfa)] - self.__was_parse_called = False - - def parse(self) -> _NodeT: - # Ensure that we don't re-use parsers. - if self.__was_parse_called: - raise ValueError("Each parser object may only be used to parse once.") - self.__was_parse_called = True - - for token in self.tokens: - self._add_token(token) - - while True: - tos = self.stack[-1] - if not tos.dfa.is_final: - expected_str = get_expected_str( - EOFSentinel.EOF, tos.dfa.transitions.keys() - ) - raise ParserSyntaxError( - f"Incomplete input. {expected_str}", - lines=self.lines, - raw_line=len(self.lines), - raw_column=len(self.lines[-1]), - ) - - if len(self.stack) > 1: - self._pop() - else: - return self.convert_nonterminal(tos.nonterminal, tos.nodes) - - def convert_nonterminal( - self, nonterminal: str, children: Sequence[_NodeT] - ) -> _NodeT: ... - - def convert_terminal(self, token: _TokenT) -> _NodeT: ... - - def _add_token(self, token: _TokenT) -> None: - """ - This is the only core function for parsing. Here happens basically - everything. Everything is well prepared by the parser generator and we - only apply the necessary steps here. - """ - grammar = self._pgen_grammar - stack = self.stack - # pyre-fixme[6]: Expected `_TokenTypeT` for 2nd param but got `TokenType`. - transition = _token_to_transition(grammar, token.type, token.string) - - while True: - try: - plan = stack[-1].dfa.transitions[transition] - break - except KeyError: - if stack[-1].dfa.is_final: - try: - self._pop() - except PartialParserSyntaxError as ex: - # Upconvert the PartialParserSyntaxError to a ParserSyntaxError - # by backfilling the line/column information. 
- raise ParserSyntaxError( - ex.message, - lines=self.lines, - raw_line=token.start_pos[0], - raw_column=token.start_pos[1], - ) - except Exception as ex: - # convert_nonterminal may fail due to a bug in our code. Try to - # recover enough to at least tell us where in the file it - # failed. - raise ParserSyntaxError( - f"Internal error: {ex}", - lines=self.lines, - raw_line=token.start_pos[0], - raw_column=token.start_pos[1], - ) - else: - # We never broke out -- EOF is too soon -- Unfinished statement. - # - # BUG: The `expected_str` may not be complete because we already - # popped the other possibilities off the stack at this point, but - # it still seems useful to list some of the possibilities that we - # could've expected. - expected_str = get_expected_str( - token, stack[-1].dfa.transitions.keys() - ) - raise ParserSyntaxError( - f"Incomplete input. {expected_str}", - lines=self.lines, - raw_line=token.start_pos[0], - raw_column=token.start_pos[1], - ) - except IndexError: - # I don't think this will ever happen with Python's grammar, because if - # there are any extra tokens at the end of the input, we'll instead - # complain that we expected ENDMARKER. - # - # However, let's leave it just in case. - expected_str = get_expected_str(token, EOFSentinel.EOF) - raise ParserSyntaxError( - f"Too much input. {expected_str}", - lines=self.lines, - raw_line=token.start_pos[0], - raw_column=token.start_pos[1], - ) - - # Logically, `plan` is always defined, but pyre can't reasonably determine that. - stack[-1].dfa = plan.next_dfa - - for push in plan.dfa_pushes: - stack.append(StackNode(push)) - - leaf = self.convert_terminal(token) - stack[-1].nodes.append(leaf) - - def _pop(self) -> None: - tos = self.stack.pop() - # Unlike parso and lib2to3, we call `convert_nonterminal` unconditionally - # instead of only when we have more than one child. This allows us to create a - # far more consistent and predictable tree. 
- new_node = self.convert_nonterminal(tos.dfa.from_rule, tos.nodes) - self.stack[-1].nodes.append(new_node) diff --git a/libcst/_parser/conversions/README.md b/libcst/_parser/conversions/README.md deleted file mode 100644 index 798e3d187..000000000 --- a/libcst/_parser/conversions/README.md +++ /dev/null @@ -1,209 +0,0 @@ -# Parser Conversions Developer Guide - -Parser conversions take grammar productions and convert them to CST nodes, or to some -"partial" value that will later be converted to a CST node. - -The grammar production that parser conversions are associated with is co-located -alongside the conversion function using our `@with_production` decorator. This is -similar to the API that [rply](https://github.com/alex/rply/) uses. - -Grammar productions are collected when the parser is first called, and converted into a -state machine by Parso's pgen2 fork. - -Unlike rply's API, productions are not automatically gathered, because that would be -dependent on implicit import-time side-effects. Instead all conversion functions must be -listed in `_grammar.py`. - -# What's a production? - -A production is a line in our BNF-like grammar definition. A production has a name (the -first argument of `@with_production`), and a sequence of children (the second argument -of `@with_production`). - -Python's full grammar is here: https://docs.python.org/3/reference/grammar.html - -We use Parso's fork of pgen2, and therefore support the same BNF-like syntax that -Python's documentation uses. - -# Why is everything `Any`-typed? Isn't that bad? - -Yes, `Any` types indicate a gap in static type coverage. Unfortunately, this isn't -easily solved. - -The value of `children` given to a conversion function is dependent on textual grammar -representation and pgen2's implementation, which the type system is unaware of. 
Unless -we extend the type system to support pgen2 (unlikely) or add a layer of -machine-generated code (possible, but we're not there), there's no way for the type -system to validate any annotations on `children`. - -We could add annotations to `children`, but they're usually complicated types (so they -wouldn't be very human-readable), and they wouldn't actually provide any type safety -because the type checker doesn't know about them. - -Similarly, we could add return type annotations, but that's just duplicating the -type we're already expressing in our return statement (so it doesn't improve readability -much), and it's not providing any static type safety. - -We do perform runtime type checks inside tests, and we hope that this test coverage will -help compensate for the lack of static type safety. - -# Where's the whitespace? - -The most important difference between an Abstract Syntax Tree and a Concrete Syntax -Tree (for our purposes) is that the CST contains enough information to exactly reproduce -the original program. This means that we must somehow capture and store whitespace. - -The grammar does not contain whitespace information, and there are no explicit tokens -for whitespace. If the grammar did contain whitespace information, the grammar likely -wouldn't be LL(1), and while we could use another context-free grammar parsing -algorithm, it would add complexity and likely wouldn't be as efficient. - -Instead, we have a hand-written re-entrant recursive-descent parser for whitespace. It's -the responsibility of conversion functions to call into this parser given whitespace -states before and after a token. - -# Token and WhitespaceState Data Structures - -A token is defined as: - -``` -class Token: - type: TokenType - string: str - # The start of where `string` is in the source, not including leading whitespace. - start_pos: Tuple[int, int] - # The end of where `string` is in the source, not including trailing whitespace. 
- end_pos: Tuple[int, int] - whitespace_before: WhitespaceState - whitespace_after: WhitespaceState -``` - -Or, in the order that these pieces appear lexically in a parsed program: - -``` -+-------------------+--------+-------------------+ -| whitespace_before | string | whitespace_after | -| (WhitespaceState) | (str) | (WhitespaceState) | -+-------------------+--------+-------------------+ -``` - -Tokens are immutable, but only shallowly, because their whitespace fields are mutable -WhitespaceState objects. - -WhitespaceStates are opaque objects that the whitespace parser consumes and mutates. -WhitespaceState nodes are shared across multiple tokens, so `whitespace_after` is the -same object as `whitespace_before` in the next token. - -# Parser Execution Order - -The parser generator we use (`pgen2`) is bottom-up, meaning that child productions -are called before their parents. In contrast, our hand-written whitespace parser is -top-down. - -Inside each production, child conversion functions are called from left to right. - -As an example, assume we're given the following simple grammar and program: - -``` -add_expr: NUMBER ['+' add_expr] -``` - -``` -1 + 2 + 3 -``` - -which forms the parse tree: - -``` - [H] add_expr - / | \ -[A] 1 [B] '+' [G] add_expr - / | \ - [C] 2 [D] '+' [F] add_expr - | - [E] 3 -``` - -The conversion functions would be called in the labeled alphabetical order, with `A` -converted first, and `H` converted last. - -# Who owns whitespace? - -There are a lot of pitfalls between you and a correct whitespace representation, but these -can be divided into a few categories of potential mistakes: - -## Forgetting to Parse Whitespace - -Fortunately, the inverse (parsing the same whitespace twice) should not be possible, -because whitespace is "consumed" by the whitespace parser. - -This kind of mistake is easily caught with tests. - -## Assigning Whitespace to the Wrong Owner - -This is probably the easiest mistake to make. 
The general convention is that the -top-most possible node owns whitespace, but in a bottom-up parser like ours, the -children are parsed before their parents. - -In other words, when there are multiple possible owners for whitespace in our tree, -the best owner is usually the top-most node. - -As an example, assume we have the following grammar and program: - -``` -simple_stmt: (pass_stmt ';')* NEWLINE -``` - -``` -pass; # comment -``` - -Since `cst.Semicolon` and `cst.SimpleStatement` can both store some whitespace -after themselves, there's some ambiguity about who should own the space character before -the comment. However, since `cst.SimpleStatement` is the parent, the convention is that -it should own it. - -Unfortunately, since nodes are processed bottom-to-top and left-to-right, the semicolon -under `simple_stmt` will get processed before `simple_stmt` is. This means that in a -naive implementation, the semicolon's conversion function would have a chance to consume -the whitespace before `simple_stmt` can. - -To solve this problem, you must "fix" the whitespace in the parent node's conversion -function or grammar. This can be done in a number of ways. In order of preference: - -1. Split the child's grammar production into two separate productions, one that consumes - its leading or trailing whitespace, and one that doesn't. Depending on the parent, - use the appropriate version of the child. -2. Construct a "partial" node in the child that doesn't consume the whitespace, and then - consume the correct whitespace in the parent. Be careful about what whitespace a - node's siblings consume. -3. "Steal" the whitespace from the child by replacing the child with a new version that - doesn't have the whitespace. - -This mistake is probably hard to catch with tests, because the CST will still reprint -correctly, but it creates ergonomic issues for tools consuming the CST. 
- -## Consuming Whitespace in the Wrong Order - -This mistake is probably the hardest to make by accident, but it could still happen, -and may be hard to catch with tests. - -Given the following piece of code: - -``` -pass # trailing -# empty line -pass -``` - -The first statement should own `# trailing` (parsed using `parse_trailing_whitespace`). -The second statement should then own `# empty line` (parsed using `parse_empty_lines`). - -However, it's possible that if you somehow called `parse_empty_lines` on the second -statement before calling `parse_trailing_whitespace` on the first statement, -`parse_empty_lines` could accidentally end up consuming the `# trailing` comment, -because `parse_trailing_whitespace` hasn't yet consumed it. - -However, this circumstance is unlikely, because you'd explicitly have to handle the -children out-of-order, and we have assertions inside the whitespace parser to prevent -some possible mistakes, like the one described above. diff --git a/libcst/_parser/conversions/__init__.py b/libcst/_parser/conversions/__init__.py deleted file mode 100644 index 7bec24cb1..000000000 --- a/libcst/_parser/conversions/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. diff --git a/libcst/_parser/conversions/expression.py b/libcst/_parser/conversions/expression.py deleted file mode 100644 index 79d7ad783..000000000 --- a/libcst/_parser/conversions/expression.py +++ /dev/null @@ -1,1630 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
-# pyre-unsafe - -import re -import typing -from tokenize import ( - Floatnumber as FLOATNUMBER_RE, - Imagnumber as IMAGNUMBER_RE, - Intnumber as INTNUMBER_RE, -) - -from libcst import CSTLogicError -from libcst._exceptions import ParserSyntaxError, PartialParserSyntaxError -from libcst._maybe_sentinel import MaybeSentinel -from libcst._nodes.expression import ( - Arg, - Asynchronous, - Attribute, - Await, - BinaryOperation, - BooleanOperation, - Call, - Comparison, - ComparisonTarget, - CompFor, - CompIf, - ConcatenatedString, - Dict, - DictComp, - DictElement, - Element, - Ellipsis, - Float, - FormattedString, - FormattedStringExpression, - FormattedStringText, - From, - GeneratorExp, - IfExp, - Imaginary, - Index, - Integer, - Lambda, - LeftCurlyBrace, - LeftParen, - LeftSquareBracket, - List, - ListComp, - Name, - NamedExpr, - Param, - Parameters, - RightCurlyBrace, - RightParen, - RightSquareBracket, - Set, - SetComp, - Slice, - StarredDictElement, - StarredElement, - Subscript, - SubscriptElement, - Tuple, - UnaryOperation, - Yield, -) -from libcst._nodes.op import ( - Add, - And, - AssignEqual, - BaseBinaryOp, - BaseBooleanOp, - BaseCompOp, - BitAnd, - BitInvert, - BitOr, - BitXor, - Colon, - Comma, - Divide, - Dot, - Equal, - FloorDivide, - GreaterThan, - GreaterThanEqual, - In, - Is, - IsNot, - LeftShift, - LessThan, - LessThanEqual, - MatrixMultiply, - Minus, - Modulo, - Multiply, - Not, - NotEqual, - NotIn, - Or, - Plus, - Power, - RightShift, - Subtract, -) -from libcst._nodes.whitespace import SimpleWhitespace -from libcst._parser.custom_itertools import grouper -from libcst._parser.production_decorator import with_production -from libcst._parser.types.config import ParserConfig -from libcst._parser.types.partials import ( - ArglistPartial, - AttributePartial, - CallPartial, - FormattedStringConversionPartial, - FormattedStringFormatSpecPartial, - SlicePartial, - SubscriptPartial, - WithLeadingWhitespace, -) -from libcst._parser.types.token import 
Token -from libcst._parser.whitespace_parser import parse_parenthesizable_whitespace - -BINOP_TOKEN_LUT: typing.Dict[str, typing.Type[BaseBinaryOp]] = { - "*": Multiply, - "@": MatrixMultiply, - "/": Divide, - "%": Modulo, - "//": FloorDivide, - "+": Add, - "-": Subtract, - "<<": LeftShift, - ">>": RightShift, - "&": BitAnd, - "^": BitXor, - "|": BitOr, -} - - -BOOLOP_TOKEN_LUT: typing.Dict[str, typing.Type[BaseBooleanOp]] = {"and": And, "or": Or} - - -COMPOP_TOKEN_LUT: typing.Dict[str, typing.Type[BaseCompOp]] = { - "<": LessThan, - ">": GreaterThan, - "==": Equal, - "<=": LessThanEqual, - ">=": GreaterThanEqual, - "in": In, - "is": Is, -} - - -# N.B. This uses a `testlist | star_expr`, not a `testlist_star_expr` because -# `testlist_star_expr` may not always be representable by a non-partial node, since it's -# only used as part of `expr_stmt`. -@with_production("expression_input", "(testlist | star_expr) ENDMARKER") -def convert_expression_input( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - (child, endmarker) = children - # HACK: UGLY! REMOVE THIS SOON! - # Unwrap WithLeadingWhitespace if it exists. It shouldn't exist by this point, but - # testlist isn't fully implemented, and we currently leak these partial objects. 
- if isinstance(child, WithLeadingWhitespace): - child = child.value - return child - - -@with_production("namedexpr_test", "test [':=' test]", version=">=3.8") -def convert_namedexpr_test( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - test, *assignment = children - if len(assignment) == 0: - return test - - # Convert all of the operations that have no precedence in a loop - (walrus, value) = assignment - return WithLeadingWhitespace( - NamedExpr( - target=test.value, - whitespace_before_walrus=parse_parenthesizable_whitespace( - config, walrus.whitespace_before - ), - whitespace_after_walrus=parse_parenthesizable_whitespace( - config, walrus.whitespace_after - ), - value=value.value, - ), - test.whitespace_before, - ) - - -@with_production("test", "or_test ['if' or_test 'else' test] | lambdef") -def convert_test( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - (child,) = children - return child - else: - (body, if_token, test, else_token, orelse) = children - return WithLeadingWhitespace( - IfExp( - body=body.value, - test=test.value, - orelse=orelse.value, - whitespace_before_if=parse_parenthesizable_whitespace( - config, if_token.whitespace_before - ), - whitespace_after_if=parse_parenthesizable_whitespace( - config, if_token.whitespace_after - ), - whitespace_before_else=parse_parenthesizable_whitespace( - config, else_token.whitespace_before - ), - whitespace_after_else=parse_parenthesizable_whitespace( - config, else_token.whitespace_after - ), - ), - body.whitespace_before, - ) - - -@with_production("test_nocond", "or_test | lambdef_nocond") -def convert_test_nocond( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - (child,) = children - return child - - -@with_production("lambdef", "'lambda' [varargslist] ':' test") -@with_production("lambdef_nocond", "'lambda' [varargslist] ':' test_nocond") -def convert_lambda( - config: 
ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - lambdatoken, *params, colontoken, test = children - - # Grab the whitespace around the colon. If there are no params, then - # the colon owns the whitespace before and after it. If there are - # any params, then the last param owns the whitespace before the colon. - # We handle the parameter movement below. - colon = Colon( - whitespace_before=parse_parenthesizable_whitespace( - config, colontoken.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, colontoken.whitespace_after - ), - ) - - # Unpack optional parameters - if len(params) == 0: - parameters = Parameters() - whitespace_after_lambda = MaybeSentinel.DEFAULT - else: - (parameters,) = params - whitespace_after_lambda = parse_parenthesizable_whitespace( - config, lambdatoken.whitespace_after - ) - - # Handle pre-colon whitespace - if parameters.star_kwarg is not None: - if parameters.star_kwarg.comma == MaybeSentinel.DEFAULT: - parameters = parameters.with_changes( - star_kwarg=parameters.star_kwarg.with_changes( - whitespace_after_param=colon.whitespace_before - ) - ) - elif parameters.kwonly_params: - if parameters.kwonly_params[-1].comma == MaybeSentinel.DEFAULT: - parameters = parameters.with_changes( - kwonly_params=( - *parameters.kwonly_params[:-1], - parameters.kwonly_params[-1].with_changes( - whitespace_after_param=colon.whitespace_before - ), - ) - ) - elif isinstance(parameters.star_arg, Param): - if parameters.star_arg.comma == MaybeSentinel.DEFAULT: - parameters = parameters.with_changes( - star_arg=parameters.star_arg.with_changes( - whitespace_after_param=colon.whitespace_before - ) - ) - elif parameters.params: - if parameters.params[-1].comma == MaybeSentinel.DEFAULT: - parameters = parameters.with_changes( - params=( - *parameters.params[:-1], - parameters.params[-1].with_changes( - whitespace_after_param=colon.whitespace_before - ), - ) - ) - - # Colon doesn't own its own 
pre-whitespace now. - colon = colon.with_changes(whitespace_before=SimpleWhitespace("")) - - # Return a lambda - return WithLeadingWhitespace( - Lambda( - whitespace_after_lambda=whitespace_after_lambda, - params=parameters, - body=test.value, - colon=colon, - ), - lambdatoken.whitespace_before, - ) - - -@with_production("or_test", "and_test ('or' and_test)*") -@with_production("and_test", "not_test ('and' not_test)*") -def convert_boolop( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - leftexpr, *rightexprs = children - if len(rightexprs) == 0: - return leftexpr - - whitespace_before = leftexpr.whitespace_before - leftexpr = leftexpr.value - - # Convert all of the operations that have no precedence in a loop - for op, rightexpr in grouper(rightexprs, 2): - if op.string not in BOOLOP_TOKEN_LUT: - raise ParserSyntaxError( - f"Unexpected token '{op.string}'!", - lines=config.lines, - raw_line=0, - raw_column=0, - ) - leftexpr = BooleanOperation( - left=leftexpr, - # pyre-ignore Pyre thinks that the type of the LUT is CSTNode. 
- operator=BOOLOP_TOKEN_LUT[op.string]( - whitespace_before=parse_parenthesizable_whitespace( - config, op.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, op.whitespace_after - ), - ), - right=rightexpr.value, - ) - return WithLeadingWhitespace(leftexpr, whitespace_before) - - -@with_production("not_test", "'not' not_test | comparison") -def convert_not_test( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - (child,) = children - return child - else: - nottoken, nottest = children - return WithLeadingWhitespace( - UnaryOperation( - operator=Not( - whitespace_after=parse_parenthesizable_whitespace( - config, nottoken.whitespace_after - ) - ), - expression=nottest.value, - ), - nottoken.whitespace_before, - ) - - -@with_production("comparison", "expr (comp_op expr)*") -def convert_comparison( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - (child,) = children - return child - - lhs, *rest = children - - comparisons: typing.List[ComparisonTarget] = [] - for operator, comparator in grouper(rest, 2): - comparisons.append( - ComparisonTarget(operator=operator, comparator=comparator.value) - ) - - return WithLeadingWhitespace( - Comparison(left=lhs.value, comparisons=tuple(comparisons)), - lhs.whitespace_before, - ) - - -@with_production( - "comp_op", "('<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not')" -) -def convert_comp_op( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - (op,) = children - if op.string in COMPOP_TOKEN_LUT: - # A regular comparison containing one token - # pyre-ignore Pyre thinks that the type of the LUT is CSTNode. 
- return COMPOP_TOKEN_LUT[op.string]( - whitespace_before=parse_parenthesizable_whitespace( - config, op.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, op.whitespace_after - ), - ) - elif op.string in ["!=", "<>"]: - # Not equal, which can take two forms in some cases - return NotEqual( - whitespace_before=parse_parenthesizable_whitespace( - config, op.whitespace_before - ), - value=op.string, - whitespace_after=parse_parenthesizable_whitespace( - config, op.whitespace_after - ), - ) - else: - # this should be unreachable - raise ParserSyntaxError( - f"Unexpected token '{op.string}'!", - lines=config.lines, - raw_line=0, - raw_column=0, - ) - else: - # A two-token comparison - leftcomp, rightcomp = children - - if leftcomp.string == "not" and rightcomp.string == "in": - return NotIn( - whitespace_before=parse_parenthesizable_whitespace( - config, leftcomp.whitespace_before - ), - whitespace_between=parse_parenthesizable_whitespace( - config, leftcomp.whitespace_after - ), - whitespace_after=parse_parenthesizable_whitespace( - config, rightcomp.whitespace_after - ), - ) - elif leftcomp.string == "is" and rightcomp.string == "not": - return IsNot( - whitespace_before=parse_parenthesizable_whitespace( - config, leftcomp.whitespace_before - ), - whitespace_between=parse_parenthesizable_whitespace( - config, leftcomp.whitespace_after - ), - whitespace_after=parse_parenthesizable_whitespace( - config, rightcomp.whitespace_after - ), - ) - else: - # this should be unreachable - raise ParserSyntaxError( - f"Unexpected token '{leftcomp.string} {rightcomp.string}'!", - lines=config.lines, - raw_line=0, - raw_column=0, - ) - - -@with_production("star_expr", "'*' expr") -def convert_star_expr( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - star, expr = children - return WithLeadingWhitespace( - StarredElement( - expr.value, - whitespace_before_value=parse_parenthesizable_whitespace( - config, 
expr.whitespace_before - ), - # atom is responsible for parenthesis and trailing_whitespace if they exist - # testlist_comp, exprlist, dictorsetmaker, etc are responsible for the comma - # if it exists. - ), - whitespace_before=star.whitespace_before, - ) - - -@with_production("expr", "xor_expr ('|' xor_expr)*") -@with_production("xor_expr", "and_expr ('^' and_expr)*") -@with_production("and_expr", "shift_expr ('&' shift_expr)*") -@with_production("shift_expr", "arith_expr (('<<'|'>>') arith_expr)*") -@with_production("arith_expr", "term (('+'|'-') term)*") -@with_production("term", "factor (('*'|'@'|'/'|'%'|'//') factor)*", version=">=3.5") -@with_production("term", "factor (('*'|'/'|'%'|'//') factor)*", version="<3.5") -def convert_binop( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - leftexpr, *rightexprs = children - if len(rightexprs) == 0: - return leftexpr - - whitespace_before = leftexpr.whitespace_before - leftexpr = leftexpr.value - - # Convert all of the operations that have no precedence in a loop - for op, rightexpr in grouper(rightexprs, 2): - if op.string not in BINOP_TOKEN_LUT: - raise ParserSyntaxError( - f"Unexpected token '{op.string}'!", - lines=config.lines, - raw_line=0, - raw_column=0, - ) - leftexpr = BinaryOperation( - left=leftexpr, - # pyre-ignore Pyre thinks that the type of the LUT is CSTNode. 
- operator=BINOP_TOKEN_LUT[op.string]( - whitespace_before=parse_parenthesizable_whitespace( - config, op.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, op.whitespace_after - ), - ), - right=rightexpr.value, - ) - return WithLeadingWhitespace(leftexpr, whitespace_before) - - -@with_production("factor", "('+'|'-'|'~') factor | power") -def convert_factor( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - (child,) = children - return child - - op, factor = children - - # First, tokenize the unary operator - if op.string == "+": - opnode = Plus( - whitespace_after=parse_parenthesizable_whitespace( - config, op.whitespace_after - ) - ) - elif op.string == "-": - opnode = Minus( - whitespace_after=parse_parenthesizable_whitespace( - config, op.whitespace_after - ) - ) - elif op.string == "~": - opnode = BitInvert( - whitespace_after=parse_parenthesizable_whitespace( - config, op.whitespace_after - ) - ) - else: - raise ParserSyntaxError( - f"Unexpected token '{op.string}'!", - lines=config.lines, - raw_line=0, - raw_column=0, - ) - - return WithLeadingWhitespace( - UnaryOperation(operator=opnode, expression=factor.value), op.whitespace_before - ) - - -@with_production("power", "atom_expr ['**' factor]") -def convert_power( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - (child,) = children - return child - - left, power, right = children - return WithLeadingWhitespace( - BinaryOperation( - left=left.value, - operator=Power( - whitespace_before=parse_parenthesizable_whitespace( - config, power.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, power.whitespace_after - ), - ), - right=right.value, - ), - left.whitespace_before, - ) - - -@with_production("atom_expr", "atom_expr_await | atom_expr_trailer") -def convert_atom_expr( - config: ParserConfig, children: 
typing.Sequence[typing.Any] -) -> typing.Any: - (child,) = children - return child - - -@with_production("atom_expr_await", "AWAIT atom_expr_trailer") -def convert_atom_expr_await( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - keyword, expr = children - return WithLeadingWhitespace( - Await( - whitespace_after_await=parse_parenthesizable_whitespace( - config, keyword.whitespace_after - ), - expression=expr.value, - ), - keyword.whitespace_before, - ) - - -@with_production("atom_expr_trailer", "atom trailer*") -def convert_atom_expr_trailer( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - atom, *trailers = children - whitespace_before = atom.whitespace_before - atom = atom.value - - # Need to walk through all trailers from left to right and construct - # a series of nodes based on each partial type. We can't do this with - # left recursion due to limits in the parser. - for trailer in trailers: - if isinstance(trailer, SubscriptPartial): - atom = Subscript( - value=atom, - whitespace_after_value=parse_parenthesizable_whitespace( - config, trailer.whitespace_before - ), - lbracket=trailer.lbracket, - # pyre-fixme[6]: Expected `Sequence[SubscriptElement]` for 4th param - # but got `Union[typing.Sequence[SubscriptElement], Index, Slice]`. - slice=trailer.slice, - rbracket=trailer.rbracket, - ) - elif isinstance(trailer, AttributePartial): - atom = Attribute(value=atom, dot=trailer.dot, attr=trailer.attr) - elif isinstance(trailer, CallPartial): - # If the trailing argument doesn't have a comma, then it owns the - # trailing whitespace before the rpar. Otherwise, the comma owns - # it. 
- if ( - len(trailer.args) > 0 - and trailer.args[-1].comma == MaybeSentinel.DEFAULT - ): - args = ( - *trailer.args[:-1], - trailer.args[-1].with_changes( - whitespace_after_arg=trailer.rpar.whitespace_before - ), - ) - else: - args = trailer.args - atom = Call( - func=atom, - whitespace_after_func=parse_parenthesizable_whitespace( - config, trailer.lpar.whitespace_before - ), - whitespace_before_args=trailer.lpar.value.whitespace_after, - # pyre-fixme[6]: Expected `Sequence[Arg]` for 4th param but got - # `Tuple[object, ...]`. - args=tuple(args), - ) - else: - # This is an invalid trailer, so lets give up - raise CSTLogicError() - return WithLeadingWhitespace(atom, whitespace_before) - - -@with_production( - "trailer", "trailer_arglist | trailer_subscriptlist | trailer_attribute" -) -def convert_trailer( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - (child,) = children - return child - - -@with_production("trailer_arglist", "'(' [arglist] ')'") -def convert_trailer_arglist( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - lpar, *arglist, rpar = children - return CallPartial( - lpar=WithLeadingWhitespace( - LeftParen( - whitespace_after=parse_parenthesizable_whitespace( - config, lpar.whitespace_after - ) - ), - lpar.whitespace_before, - ), - args=() if not arglist else arglist[0].args, - rpar=RightParen( - whitespace_before=parse_parenthesizable_whitespace( - config, rpar.whitespace_before - ) - ), - ) - - -@with_production("trailer_subscriptlist", "'[' subscriptlist ']'") -def convert_trailer_subscriptlist( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - (lbracket, subscriptlist, rbracket) = children - return SubscriptPartial( - lbracket=LeftSquareBracket( - whitespace_after=parse_parenthesizable_whitespace( - config, lbracket.whitespace_after - ) - ), - slice=subscriptlist.value, - rbracket=RightSquareBracket( - 
whitespace_before=parse_parenthesizable_whitespace( - config, rbracket.whitespace_before - ) - ), - whitespace_before=lbracket.whitespace_before, - ) - - -@with_production("subscriptlist", "subscript (',' subscript)* [',']") -def convert_subscriptlist( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - # This is a list of SubscriptElement, so construct as such by grouping every - # subscript with an optional comma and adding to a list. - elements = [] - for slice, comma in grouper(children, 2): - if comma is None: - elements.append(SubscriptElement(slice=slice.value)) - else: - elements.append( - SubscriptElement( - slice=slice.value, - comma=Comma( - whitespace_before=parse_parenthesizable_whitespace( - config, comma.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, comma.whitespace_after - ), - ), - ) - ) - return WithLeadingWhitespace(elements, children[0].whitespace_before) - - -@with_production("subscript", "test | [test] ':' [test] [sliceop]") -def convert_subscript( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1 and not isinstance(children[0], Token): - # This is just an index node - (test,) = children - return WithLeadingWhitespace(Index(test.value), test.whitespace_before) - - if isinstance(children[-1], SlicePartial): - # We got a partial slice as the final param. Extract the final - # bits of the full subscript. - *others, sliceop = children - whitespace_before = others[0].whitespace_before - second_colon = sliceop.second_colon - step = sliceop.step - else: - # We can just parse this below, without taking extras from the - # partial child. - others = children - whitespace_before = others[0].whitespace_before - second_colon = MaybeSentinel.DEFAULT - step = None - - # We need to create a partial slice to pass up. So, align so we have - # a list that's always [Optional[Test], Colon, Optional[Test]]. 
- if isinstance(others[0], Token): - # First token is a colon, so insert an empty test on the LHS. We - # know the RHS is a test since it's not a sliceop. - slicechildren = [None, *others] - else: - # First token is non-colon, so its a test. - slicechildren = [*others] - - if len(slicechildren) < 3: - # Now, we have to fill in the RHS. We know its two long - # at this point if its not already 3. - slicechildren = [*slicechildren, None] - - lower, first_colon, upper = slicechildren - return WithLeadingWhitespace( - Slice( - lower=lower.value if lower is not None else None, - first_colon=Colon( - whitespace_before=parse_parenthesizable_whitespace( - config, - first_colon.whitespace_before, - ), - whitespace_after=parse_parenthesizable_whitespace( - config, - first_colon.whitespace_after, - ), - ), - upper=upper.value if upper is not None else None, - second_colon=second_colon, - step=step, - ), - whitespace_before=whitespace_before, - ) - - -@with_production("sliceop", "':' [test]") -def convert_sliceop( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 2: - colon, test = children - step = test.value - else: - (colon,) = children - step = None - return SlicePartial( - second_colon=Colon( - whitespace_before=parse_parenthesizable_whitespace( - config, colon.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, colon.whitespace_after - ), - ), - step=step, - ) - - -@with_production("trailer_attribute", "'.' 
NAME") -def convert_trailer_attribute( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - dot, name = children - return AttributePartial( - dot=Dot( - whitespace_before=parse_parenthesizable_whitespace( - config, dot.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, dot.whitespace_after - ), - ), - attr=Name(name.string), - ) - - -@with_production( - "atom", - "atom_parens | atom_squarebrackets | atom_curlybraces | atom_string | atom_basic | atom_ellipses", -) -def convert_atom( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - (child,) = children - return child - - -@with_production("atom_basic", "NAME | NUMBER | 'None' | 'True' | 'False'") -def convert_atom_basic( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - (child,) = children - if child.type.name == "NAME": - # This also handles 'None', 'True', and 'False' directly, but we - # keep it in the grammar to be more correct. - return WithLeadingWhitespace(Name(child.string), child.whitespace_before) - elif child.type.name == "NUMBER": - # We must determine what type of number it is since we split node - # types up this way. 
- if re.fullmatch(INTNUMBER_RE, child.string): - return WithLeadingWhitespace(Integer(child.string), child.whitespace_before) - elif re.fullmatch(FLOATNUMBER_RE, child.string): - return WithLeadingWhitespace(Float(child.string), child.whitespace_before) - elif re.fullmatch(IMAGNUMBER_RE, child.string): - return WithLeadingWhitespace( - Imaginary(child.string), child.whitespace_before - ) - else: - raise ParserSyntaxError( - f"Unparseable number {child.string}", - lines=config.lines, - raw_line=0, - raw_column=0, - ) - else: - raise ParserSyntaxError( - f"Logic error, unexpected token {child.type.name}", - lines=config.lines, - raw_line=0, - raw_column=0, - ) - - -@with_production("atom_squarebrackets", "'[' [testlist_comp_list] ']'") -def convert_atom_squarebrackets( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - lbracket_tok, *body, rbracket_tok = children - lbracket = LeftSquareBracket( - whitespace_after=parse_parenthesizable_whitespace( - config, lbracket_tok.whitespace_after - ) - ) - - rbracket = RightSquareBracket( - whitespace_before=parse_parenthesizable_whitespace( - config, rbracket_tok.whitespace_before - ) - ) - - if len(body) == 0: - list_node = List((), lbracket=lbracket, rbracket=rbracket) - else: # len(body) == 1 - # body[0] is a List or ListComp - list_node = body[0].value.with_changes(lbracket=lbracket, rbracket=rbracket) - - return WithLeadingWhitespace(list_node, lbracket_tok.whitespace_before) - - -@with_production("atom_curlybraces", "'{' [dictorsetmaker] '}'") -def convert_atom_curlybraces( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - lbrace_tok, *body, rbrace_tok = children - lbrace = LeftCurlyBrace( - whitespace_after=parse_parenthesizable_whitespace( - config, lbrace_tok.whitespace_after - ) - ) - - rbrace = RightCurlyBrace( - whitespace_before=parse_parenthesizable_whitespace( - config, rbrace_tok.whitespace_before - ) - ) - - if len(body) == 0: - dict_or_set_node = 
Dict((), lbrace=lbrace, rbrace=rbrace) - else: # len(body) == 1 - dict_or_set_node = body[0].value.with_changes(lbrace=lbrace, rbrace=rbrace) - - return WithLeadingWhitespace(dict_or_set_node, lbrace_tok.whitespace_before) - - -@with_production("atom_parens", "'(' [yield_expr|testlist_comp_tuple] ')'") -def convert_atom_parens( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - lpar_tok, *atoms, rpar_tok = children - - lpar = LeftParen( - whitespace_after=parse_parenthesizable_whitespace( - config, lpar_tok.whitespace_after - ) - ) - - rpar = RightParen( - whitespace_before=parse_parenthesizable_whitespace( - config, rpar_tok.whitespace_before - ) - ) - - if len(atoms) == 1: - # inner_atom is a _BaseParenthesizedNode - inner_atom = atoms[0].value - return WithLeadingWhitespace( - inner_atom.with_changes( - # pyre-fixme[60]: Expected to unpack an iterable, but got `unknown`. - lpar=(lpar, *inner_atom.lpar), - # pyre-fixme[60]: Expected to unpack an iterable, but got `unknown`. 
- rpar=(*inner_atom.rpar, rpar), - ), - lpar_tok.whitespace_before, - ) - else: - return WithLeadingWhitespace( - Tuple((), lpar=(lpar,), rpar=(rpar,)), lpar_tok.whitespace_before - ) - - -@with_production("atom_ellipses", "'...'") -def convert_atom_ellipses( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - (token,) = children - return WithLeadingWhitespace(Ellipsis(), token.whitespace_before) - - -@with_production("atom_string", "(STRING | fstring) [atom_string]") -def convert_atom_string( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - return children[0] - else: - left, right = children - return WithLeadingWhitespace( - ConcatenatedString( - left=left.value, - whitespace_between=parse_parenthesizable_whitespace( - config, right.whitespace_before - ), - right=right.value, - ), - left.whitespace_before, - ) - - -@with_production("fstring", "FSTRING_START fstring_content* FSTRING_END") -def convert_fstring( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - start, *content, end = children - return WithLeadingWhitespace( - FormattedString(start=start.string, parts=tuple(content), end=end.string), - start.whitespace_before, - ) - - -@with_production("fstring_content", "FSTRING_STRING | fstring_expr") -def convert_fstring_content( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - (child,) = children - if isinstance(child, Token): - # Construct and return a raw string portion. - return FormattedStringText(child.string) - else: - # Pass the expression up one production. - return child - - -@with_production("fstring_conversion", "'!' NAME") -def convert_fstring_conversion( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - exclaim, name = children - # There cannot be a space between the two tokens, so no need to preserve this. 
- return FormattedStringConversionPartial(name.string, exclaim.whitespace_before) - - -@with_production("fstring_equality", "'='", version=">=3.8") -def convert_fstring_equality( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - (equal,) = children - return AssignEqual( - whitespace_before=parse_parenthesizable_whitespace( - config, equal.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, equal.whitespace_after - ), - ) - - -@with_production( - "fstring_expr", - "'{' (testlist_comp_tuple | yield_expr) [ fstring_equality ] [ fstring_conversion ] [ fstring_format_spec ] '}'", - version=">=3.8", -) -@with_production( - "fstring_expr", - "'{' (testlist_comp_tuple | yield_expr) [ fstring_conversion ] [ fstring_format_spec ] '}'", - version="<3.8", -) -def convert_fstring_expr( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - openbrkt, testlist, *conversions, closebrkt = children - - # Extract any optional equality (self-debugging expressions) - if len(conversions) > 0 and isinstance(conversions[0], AssignEqual): - equal = conversions[0] - conversions = conversions[1:] - else: - equal = None - - # Extract any optional conversion - if len(conversions) > 0 and isinstance( - conversions[0], FormattedStringConversionPartial - ): - conversion = conversions[0].value - conversions = conversions[1:] - else: - conversion = None - - # Extract any optional format spec - if len(conversions) > 0: - format_spec = conversions[0].values - else: - format_spec = None - - # Fix up any spacing issue we find due to the fact that the equal can - # have whitespace and is also at the end of the expression. 
- if equal is not None: - whitespace_after_expression = SimpleWhitespace("") - else: - whitespace_after_expression = parse_parenthesizable_whitespace( - config, children[2].whitespace_before - ) - - return FormattedStringExpression( - whitespace_before_expression=parse_parenthesizable_whitespace( - config, testlist.whitespace_before - ), - expression=testlist.value, - equal=equal, - whitespace_after_expression=whitespace_after_expression, - conversion=conversion, - format_spec=format_spec, - ) - - -@with_production("fstring_format_spec", "':' fstring_content*") -def convert_fstring_format_spec( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - colon, *content = children - return FormattedStringFormatSpecPartial(tuple(content), colon.whitespace_before) - - -@with_production( - "testlist_comp_tuple", - "(namedexpr_test|star_expr) ( comp_for | (',' (namedexpr_test|star_expr))* [','] )", - version=">=3.8", -) -@with_production( - "testlist_comp_tuple", - "(test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )", - version=">=3.5,<3.8", -) -@with_production( - "testlist_comp_tuple", - "(test) ( comp_for | (',' (test))* [','] )", - version="<3.5", -) -def convert_testlist_comp_tuple( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - return _convert_testlist_comp( - config, - children, - single_child_is_sequence=False, - sequence_type=Tuple, - comprehension_type=GeneratorExp, - ) - - -@with_production( - "testlist_comp_list", - "(namedexpr_test|star_expr) ( comp_for | (',' (namedexpr_test|star_expr))* [','] )", - version=">=3.8", -) -@with_production( - "testlist_comp_list", - "(test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )", - version=">=3.5,<3.8", -) -@with_production( - "testlist_comp_list", - "(test) ( comp_for | (',' (test))* [','] )", - version="<3.5", -) -def convert_testlist_comp_list( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - return 
_convert_testlist_comp( - config, - children, - single_child_is_sequence=True, - sequence_type=List, - comprehension_type=ListComp, - ) - - -def _convert_testlist_comp( - config: ParserConfig, - children: typing.Sequence[typing.Any], - single_child_is_sequence: bool, - sequence_type: typing.Union[ - typing.Type[Tuple], typing.Type[List], typing.Type[Set] - ], - comprehension_type: typing.Union[ - typing.Type[GeneratorExp], typing.Type[ListComp], typing.Type[SetComp] - ], -) -> typing.Any: - # This is either a single-element list, or the second token is a comma, so we're not - # in a generator. - if len(children) == 1 or isinstance(children[1], Token): - return _convert_sequencelike( - config, children, single_child_is_sequence, sequence_type - ) - else: - # N.B. The parent node (e.g. atom) is responsible for computing and attaching - # whitespace information on any parenthesis, square brackets, or curly braces - elt, for_in = children - return WithLeadingWhitespace( - comprehension_type(elt=elt.value, for_in=for_in, lpar=(), rpar=()), - elt.whitespace_before, - ) - - -@with_production("testlist_star_expr", "(test|star_expr) (',' (test|star_expr))* [',']") -@with_production("testlist", "test (',' test)* [',']") -@with_production("exprlist", "(expr|star_expr) (',' (expr|star_expr))* [',']") -def convert_test_or_expr_list( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - # Used by expression statements and assignments. Neither of these cases want to - # treat a single child as a sequence. - return _convert_sequencelike( - config, children, single_child_is_sequence=False, sequence_type=Tuple - ) - - -def _convert_sequencelike( - config: ParserConfig, - children: typing.Sequence[typing.Any], - single_child_is_sequence: bool, - sequence_type: typing.Union[ - typing.Type[Tuple], typing.Type[List], typing.Type[Set] - ], -) -> typing.Any: - if not single_child_is_sequence and len(children) == 1: - return children[0] - # N.B. 
The parent node (e.g. atom) is responsible for computing and attaching - # whitespace information on any parenthesis, square brackets, or curly braces - elements = [] - for wrapped_expr_or_starred_element, comma_token in grouper(children, 2): - expr_or_starred_element = wrapped_expr_or_starred_element.value - if comma_token is None: - comma = MaybeSentinel.DEFAULT - else: - comma = Comma( - whitespace_before=parse_parenthesizable_whitespace( - config, comma_token.whitespace_before - ), - # Only compute whitespace_after if we're not a trailing comma. - # If we're a trailing comma, that whitespace should be consumed by the - # TrailingWhitespace, parenthesis, etc. - whitespace_after=( - parse_parenthesizable_whitespace( - config, comma_token.whitespace_after - ) - if comma_token is not children[-1] - else SimpleWhitespace("") - ), - ) - - if isinstance(expr_or_starred_element, StarredElement): - starred_element = expr_or_starred_element - elements.append(starred_element.with_changes(comma=comma)) - else: - expr = expr_or_starred_element - elements.append(Element(value=expr, comma=comma)) - - # lpar/rpar are the responsibility of our parent - return WithLeadingWhitespace( - sequence_type(elements, lpar=(), rpar=()), - children[0].whitespace_before, - ) - - -@with_production( - "dictorsetmaker", - ( - "( ((test ':' test | '**' expr)" - + " (comp_for | (',' (test ':' test | '**' expr))* [','])) |" - + "((test | star_expr) " - + " (comp_for | (',' (test | star_expr))* [','])) )" - ), - version=">=3.5", -) -@with_production( - "dictorsetmaker", - ( - "( ((test ':' test)" - + " (comp_for | (',' (test ':' test))* [','])) |" - + "((test) " - + " (comp_for | (',' (test))* [','])) )" - ), - version="<3.5", -) -def convert_dictorsetmaker( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - # We'll always have at least one child. `atom_curlybraces` handles empty - # dicts. 
- if len(children) > 1 and ( - (isinstance(children[1], Token) and children[1].string == ":") - or (isinstance(children[0], Token) and children[0].string == "**") - ): - return _convert_dict(config, children) - else: - return _convert_set(config, children) - - -def _convert_dict_element( - config: ParserConfig, - children_iter: typing.Iterator[typing.Any], - last_child: typing.Any, -) -> typing.Union[DictElement, StarredDictElement]: - first = next(children_iter) - if isinstance(first, Token) and first.string == "**": - expr = next(children_iter) - element = StarredDictElement( - expr.value, - whitespace_before_value=parse_parenthesizable_whitespace( - config, expr.whitespace_before - ), - ) - else: - key = first - colon_tok = next(children_iter) - value = next(children_iter) - element = DictElement( - key.value, - value.value, - whitespace_before_colon=parse_parenthesizable_whitespace( - config, colon_tok.whitespace_before - ), - whitespace_after_colon=parse_parenthesizable_whitespace( - config, colon_tok.whitespace_after - ), - ) - # Handle the trailing comma (if there is one) - try: - comma_token = next(children_iter) - element = element.with_changes( - comma=Comma( - whitespace_before=parse_parenthesizable_whitespace( - config, comma_token.whitespace_before - ), - # Only compute whitespace_after if we're not a trailing comma. - # If we're a trailing comma, that whitespace should be consumed by the - # RightBracket. 
- whitespace_after=( - parse_parenthesizable_whitespace( - config, comma_token.whitespace_after - ) - if comma_token is not last_child - else SimpleWhitespace("") - ), - ) - ) - except StopIteration: - pass - return element - - -def _convert_dict( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - is_first_starred = isinstance(children[0], Token) and children[0].string == "**" - if is_first_starred: - possible_comp_for = None if len(children) < 3 else children[2] - else: - possible_comp_for = None if len(children) < 4 else children[3] - if isinstance(possible_comp_for, CompFor): - if is_first_starred: - raise PartialParserSyntaxError( - "dict unpacking cannot be used in dict comprehension" - ) - return _convert_dict_comp(config, children) - - children_iter = iter(children) - last_child = children[-1] - elements = [] - while True: - try: - elements.append(_convert_dict_element(config, children_iter, last_child)) - except StopIteration: - break - # lbrace, rbrace, lpar, and rpar will be attached as-needed by the atom grammar - return WithLeadingWhitespace(Dict(tuple(elements)), children[0].whitespace_before) - - -def _convert_dict_comp(config, children: typing.Sequence[typing.Any]) -> typing.Any: - key, colon_token, value, comp_for = children - return WithLeadingWhitespace( - DictComp( - key.value, - value.value, - comp_for, - # lbrace, rbrace, lpar, and rpar will be attached as-needed by the atom grammar - whitespace_before_colon=parse_parenthesizable_whitespace( - config, colon_token.whitespace_before - ), - whitespace_after_colon=parse_parenthesizable_whitespace( - config, colon_token.whitespace_after - ), - ), - key.whitespace_before, - ) - - -def _convert_set( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - return _convert_testlist_comp( - config, - children, - single_child_is_sequence=True, - sequence_type=Set, - comprehension_type=SetComp, - ) - - -@with_production("arglist", "argument (',' 
argument)* [',']") -def convert_arglist( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - args = [] - for argument, comma in grouper(children, 2): - if comma is None: - args.append(argument) - else: - args.append( - argument.with_changes( - comma=Comma( - whitespace_before=parse_parenthesizable_whitespace( - config, comma.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, comma.whitespace_after - ), - ) - ) - ) - return ArglistPartial(args) - - -@with_production("argument", "arg_assign_comp_for | star_arg") -def convert_argument( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - (child,) = children - return child - - -@with_production( - "arg_assign_comp_for", "test [comp_for] | test '=' test", version="<=3.7" -) -@with_production( - "arg_assign_comp_for", - "test [comp_for] | test ':=' test | test '=' test", - version=">=3.8", -) -def convert_arg_assign_comp_for( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - # Simple test - (child,) = children - return Arg(value=child.value) - elif len(children) == 2: - elt, for_in = children - return Arg(value=GeneratorExp(elt.value, for_in, lpar=(), rpar=())) - else: - lhs, equal, rhs = children - # "key := value" assignment; positional - if equal.string == ":=": - val = convert_namedexpr_test(config, children) - if not isinstance(val, WithLeadingWhitespace): - raise TypeError( - f"convert_namedexpr_test returned {val!r}, not WithLeadingWhitespace" - ) - return Arg(value=val.value) - # "key = value" assignment; keyword argument - return Arg( - keyword=lhs.value, - equal=AssignEqual( - whitespace_before=parse_parenthesizable_whitespace( - config, equal.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, equal.whitespace_after - ), - ), - value=rhs.value, - ) - - -@with_production("star_arg", "'**' test | '*' test") -def convert_star_arg( 
- config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - star, test = children - return Arg( - star=star.string, - whitespace_after_star=parse_parenthesizable_whitespace( - config, star.whitespace_after - ), - value=test.value, - ) - - -@with_production("sync_comp_for", "'for' exprlist 'in' or_test comp_if* [comp_for]") -def convert_sync_comp_for( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - # unpack - for_tok, target, in_tok, iter, *trailing = children - if len(trailing) and isinstance(trailing[-1], CompFor): - *ifs, inner_for_in = trailing - else: - ifs, inner_for_in = trailing, None - - return CompFor( - target=target.value, - iter=iter.value, - ifs=ifs, - inner_for_in=inner_for_in, - whitespace_before=parse_parenthesizable_whitespace( - config, for_tok.whitespace_before - ), - whitespace_after_for=parse_parenthesizable_whitespace( - config, for_tok.whitespace_after - ), - whitespace_before_in=parse_parenthesizable_whitespace( - config, in_tok.whitespace_before - ), - whitespace_after_in=parse_parenthesizable_whitespace( - config, in_tok.whitespace_after - ), - ) - - -@with_production("comp_for", "[ASYNC] sync_comp_for", version=">=3.6") -@with_production("comp_for", "sync_comp_for", version="<=3.5") -def convert_comp_for( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - (sync_comp_for,) = children - return sync_comp_for - else: - (async_tok, sync_comp_for) = children - return sync_comp_for.with_changes( - # asynchronous steals the `CompFor`'s `whitespace_before`. - asynchronous=Asynchronous(whitespace_after=sync_comp_for.whitespace_before), - # But, in exchange, `CompFor` gets to keep `async_tok`'s leading - # whitespace, because that's now the beginning of the `CompFor`. 
- whitespace_before=parse_parenthesizable_whitespace( - config, async_tok.whitespace_before - ), - ) - - -@with_production("comp_if", "'if' test_nocond") -def convert_comp_if( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if_tok, test = children - return CompIf( - test.value, - whitespace_before=parse_parenthesizable_whitespace( - config, if_tok.whitespace_before - ), - whitespace_before_test=parse_parenthesizable_whitespace( - config, test.whitespace_before - ), - ) - - -@with_production("yield_expr", "'yield' [yield_arg]") -def convert_yield_expr( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - # Yielding implicit none - (yield_token,) = children - yield_node = Yield(value=None) - else: - # Yielding explicit value - (yield_token, yield_arg) = children - yield_node = Yield( - value=yield_arg.value, - whitespace_after_yield=parse_parenthesizable_whitespace( - config, yield_arg.whitespace_before - ), - ) - - return WithLeadingWhitespace(yield_node, yield_token.whitespace_before) - - -@with_production("yield_arg", "testlist", version="<3.3") -@with_production("yield_arg", "'from' test | testlist", version=">=3.3,<3.8") -@with_production("yield_arg", "'from' test | testlist_star_expr", version=">=3.8") -def convert_yield_arg( - config: ParserConfig, children: typing.Sequence[typing.Any] -) -> typing.Any: - if len(children) == 1: - # Just a regular testlist, pass it up - (child,) = children - return child - else: - # Its a yield from - (from_token, test) = children - - return WithLeadingWhitespace( - From( - item=test.value, - whitespace_after_from=parse_parenthesizable_whitespace( - config, test.whitespace_before - ), - ), - from_token.whitespace_before, - ) diff --git a/libcst/_parser/conversions/module.py b/libcst/_parser/conversions/module.py deleted file mode 100644 index b40641d09..000000000 --- a/libcst/_parser/conversions/module.py +++ /dev/null @@ -1,46 +0,0 @@ -# 
Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# pyre-unsafe - -from typing import Any, Sequence - -from libcst._nodes.module import Module -from libcst._nodes.whitespace import NEWLINE_RE -from libcst._parser.production_decorator import with_production -from libcst._parser.types.config import ParserConfig - - -@with_production("file_input", "(NEWLINE | stmt)* ENDMARKER") -def convert_file_input(config: ParserConfig, children: Sequence[Any]) -> Any: - *body, footer = children - if len(body) == 0: - # If there's no body, the header and footer are ambiguous. The header is more - # important, and should own the EmptyLine nodes instead of the footer. - header = footer - footer = () - if ( - len(config.lines) == 2 - and NEWLINE_RE.fullmatch(config.lines[0]) - and config.lines[1] == "" - ): - # This is an empty file (not even a comment), so special-case this to an - # empty list instead of a single dummy EmptyLine (which is what we'd - # normally parse). - header = () - else: - # Steal the leading lines from the first statement, and move them into the - # header. - first_stmt = body[0] - header = first_stmt.leading_lines - body[0] = first_stmt.with_changes(leading_lines=()) - return Module( - header=header, - body=body, - footer=footer, - encoding=config.encoding, - default_indent=config.default_indent, - default_newline=config.default_newline, - has_trailing_newline=config.has_trailing_newline, - ) diff --git a/libcst/_parser/conversions/params.py b/libcst/_parser/conversions/params.py deleted file mode 100644 index 5b29f95d1..000000000 --- a/libcst/_parser/conversions/params.py +++ /dev/null @@ -1,346 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
-# pyre-unsafe - -from typing import Any, List, Optional, Sequence, Union - -from libcst import CSTLogicError -from libcst._exceptions import PartialParserSyntaxError -from libcst._maybe_sentinel import MaybeSentinel -from libcst._nodes.expression import ( - Annotation, - Name, - Param, - Parameters, - ParamSlash, - ParamStar, -) -from libcst._nodes.op import AssignEqual, Comma -from libcst._parser.custom_itertools import grouper -from libcst._parser.production_decorator import with_production -from libcst._parser.types.config import ParserConfig -from libcst._parser.types.partials import ParamStarPartial -from libcst._parser.whitespace_parser import parse_parenthesizable_whitespace - - -@with_production( # noqa: C901: too complex - "typedargslist", - """( - (tfpdef_assign (',' tfpdef_assign)* ',' tfpdef_posind [',' [ tfpdef_assign ( - ',' tfpdef_assign)* [',' [ - tfpdef_star (',' tfpdef_assign)* [',' [tfpdef_starstar [',']]] - | tfpdef_starstar [',']]] - | tfpdef_star (',' tfpdef_assign)* [',' [tfpdef_starstar [',']]] - | tfpdef_starstar [',']]] ) - | (tfpdef_assign (',' tfpdef_assign)* [',' [ - tfpdef_star (',' tfpdef_assign)* [',' [tfpdef_starstar [',']]] - | tfpdef_starstar [',']]] - | tfpdef_star (',' tfpdef_assign)* [',' [tfpdef_starstar [',']]] - | tfpdef_starstar [',']) - )""", - version=">=3.8", -) -@with_production( # noqa: C901: too complex - "typedargslist", - ( - "(tfpdef_assign (',' tfpdef_assign)* " - + "[',' [tfpdef_star (',' tfpdef_assign)* [',' [tfpdef_starstar [',']]] | tfpdef_starstar [',']]]" - + "| tfpdef_star (',' tfpdef_assign)* [',' [tfpdef_starstar [',']]] | tfpdef_starstar [','])" - ), - version=">=3.6,<=3.7", -) -@with_production( # noqa: C901: too complex - "typedargslist", - ( - "(tfpdef_assign (',' tfpdef_assign)* " - + "[',' [tfpdef_star (',' tfpdef_assign)* [',' tfpdef_starstar] | tfpdef_starstar]]" - + "| tfpdef_star (',' tfpdef_assign)* [',' tfpdef_starstar] | tfpdef_starstar)" - ), - version="<=3.5", -) -@with_production( - 
"varargslist", - """vfpdef_assign (',' vfpdef_assign)* ',' vfpdef_posind [',' [ (vfpdef_assign (',' vfpdef_assign)* [',' [ - vfpdef_star (',' vfpdef_assign)* [',' [vfpdef_starstar [',']]] - | vfpdef_starstar [',']]] - | vfpdef_star (',' vfpdef_assign)* [',' [vfpdef_starstar [',']]] - | vfpdef_starstar [',']) ]] | (vfpdef_assign (',' vfpdef_assign)* [',' [ - vfpdef_star (',' vfpdef_assign)* [',' [vfpdef_starstar [',']]] - | vfpdef_starstar [',']]] - | vfpdef_star (',' vfpdef_assign)* [',' [vfpdef_starstar [',']]] - | vfpdef_starstar [','] - )""", - version=">=3.8", -) -@with_production( - "varargslist", - ( - "(vfpdef_assign (',' vfpdef_assign)* " - + "[',' [vfpdef_star (',' vfpdef_assign)* [',' [vfpdef_starstar [',']]] | vfpdef_starstar [',']]]" - + "| vfpdef_star (',' vfpdef_assign)* [',' [vfpdef_starstar [',']]] | vfpdef_starstar [','])" - ), - version=">=3.6,<=3.7", -) -@with_production( - "varargslist", - ( - "(vfpdef_assign (',' vfpdef_assign)* " - + "[',' [vfpdef_star (',' vfpdef_assign)* [',' vfpdef_starstar] | vfpdef_starstar]]" - + "| vfpdef_star (',' vfpdef_assign)* [',' vfpdef_starstar] | vfpdef_starstar)" - ), - version="<=3.5", -) -def convert_argslist( # noqa: C901 - config: ParserConfig, children: Sequence[Any] -) -> Any: - posonly_params: List[Param] = [] - posonly_ind: Union[ParamSlash, MaybeSentinel] = MaybeSentinel.DEFAULT - params: List[Param] = [] - seen_default: bool = False - star_arg: Union[Param, ParamStar, MaybeSentinel] = MaybeSentinel.DEFAULT - kwonly_params: List[Param] = [] - star_kwarg: Optional[Param] = None - - def add_param( - current_param: Optional[List[Param]], param: Union[Param, ParamStar] - ) -> Optional[List[Param]]: - nonlocal star_arg - nonlocal star_kwarg - nonlocal seen_default - nonlocal posonly_params - nonlocal posonly_ind - nonlocal params - - if isinstance(param, ParamStar): - # Only can add this if we don't already have a "*" or a "*param". 
- if current_param is params: - star_arg = param - current_param = kwonly_params - else: - # Example code: - # def fn(*abc, *): ... - # This should be unreachable, the grammar already disallows it. - raise ValueError( - "Cannot have multiple star ('*') markers in a single argument " - + "list." - ) - elif isinstance(param, ParamSlash): - # Only can add this if we don't already have a "/" or a "*" or a "*param". - if current_param is params and len(posonly_params) == 0: - posonly_ind = param - posonly_params = params - params = [] - current_param = params - else: - # Example code: - # def fn(foo, /, *, /, bar): ... - # This should be unreachable, the grammar already disallows it. - raise ValueError( - "Cannot have multiple slash ('/') markers in a single argument " - + "list." - ) - elif isinstance(param.star, str) and param.star == "" and param.default is None: - # Can only add this if we're in the params or kwonly_params section - if current_param is params and not seen_default: - params.append(param) - elif current_param is kwonly_params: - kwonly_params.append(param) - else: - # Example code: - # def fn(first=None, second): ... - # This code is reachable, so we should use a PartialParserSyntaxError. - raise PartialParserSyntaxError( - "Cannot have a non-default argument following a default argument." - ) - elif ( - isinstance(param.star, str) - and param.star == "" - and param.default is not None - ): - # Can only add this if we're not yet at star args. - if current_param is params: - seen_default = True - params.append(param) - elif current_param is kwonly_params: - kwonly_params.append(param) - else: - # Example code: - # def fn(**kwargs, trailing=None) - # This should be unreachable, the grammar already disallows it. 
- raise ValueError("Cannot have any arguments after a kwargs expansion.") - elif ( - isinstance(param.star, str) and param.star == "*" and param.default is None - ): - # Can only add this if we're in params, since we only allow one of - # "*" or "*param". - if current_param is params: - star_arg = param - current_param = kwonly_params - else: - # Example code: - # def fn(*first, *second): ... - # This should be unreachable, the grammar already disallows it. - raise ValueError( - "Expected a keyword argument but found a starred positional " - + "argument expansion." - ) - elif ( - isinstance(param.star, str) and param.star == "**" and param.default is None - ): - # Can add this in all cases where we don't have a star_kwarg - # yet. - if current_param is not None: - star_kwarg = param - current_param = None - else: - # Example code: - # def fn(**first, **second) - # This should be unreachable, the grammar already disallows it. - raise ValueError( - "Multiple starred keyword argument expansions are not allowed in a " - + "single argument list" - ) - else: - # The state machine should never end up here. - raise CSTLogicError("Logic error!") - - return current_param - - # The parameter list we are adding to - current: Optional[List[Param]] = params - - # We should have every other item in the group as a param or a comma by now, - # so split them up, add commas and then put them in the appropriate group. - for parameter, comma in grouper(children, 2): - if comma is None: - if isinstance(parameter, ParamStarPartial): - # Example: - # def fn(abc, *): ... - # - # There's also the case where we have bare * with a trailing comma. - # That's handled later. - # - # It's not valid to construct a ParamStar object without a comma, so we - # need to catch the non-comma case separately. - raise PartialParserSyntaxError( - "Named (keyword) arguments must follow a bare *." 
- ) - else: - current = add_param(current, parameter) - else: - comma = Comma( - whitespace_before=parse_parenthesizable_whitespace( - config, comma.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, comma.whitespace_after - ), - ) - if isinstance(parameter, ParamStarPartial): - current = add_param(current, ParamStar(comma=comma)) - else: - current = add_param(current, parameter.with_changes(comma=comma)) - - if isinstance(star_arg, ParamStar) and len(kwonly_params) == 0: - # Example: - # def fn(abc, *,): ... - # - # This will raise a validation error, but we want to make sure to raise a syntax - # error instead. - # - # The case where there's no trailing comma is already handled by this point, so - # this conditional is only for the case where we have a trailing comma. - raise PartialParserSyntaxError( - "Named (keyword) arguments must follow a bare *." - ) - - return Parameters( - posonly_params=tuple(posonly_params), - posonly_ind=posonly_ind, - params=tuple(params), - star_arg=star_arg, - kwonly_params=tuple(kwonly_params), - star_kwarg=star_kwarg, - ) - - -@with_production("tfpdef_star", "'*' [tfpdef]") -@with_production("vfpdef_star", "'*' [vfpdef]") -def convert_fpdef_star(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 1: - (star,) = children - return ParamStarPartial() - else: - star, param = children - return param.with_changes( - star=star.string, - whitespace_after_star=parse_parenthesizable_whitespace( - config, star.whitespace_after - ), - ) - - -@with_production("tfpdef_starstar", "'**' tfpdef") -@with_production("vfpdef_starstar", "'**' vfpdef") -def convert_fpdef_starstar(config: ParserConfig, children: Sequence[Any]) -> Any: - starstar, param = children - return param.with_changes( - star=starstar.string, - whitespace_after_star=parse_parenthesizable_whitespace( - config, starstar.whitespace_after - ), - ) - - -@with_production("tfpdef_assign", "tfpdef ['=' test]") 
-@with_production("vfpdef_assign", "vfpdef ['=' test]") -def convert_fpdef_assign(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 1: - (child,) = children - return child - - param, equal, default = children - return param.with_changes( - equal=AssignEqual( - whitespace_before=parse_parenthesizable_whitespace( - config, equal.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, equal.whitespace_after - ), - ), - default=default.value, - ) - - -@with_production("tfpdef", "NAME [':' test]") -@with_production("vfpdef", "NAME") -def convert_fpdef(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 1: - # This is just a parameter - (child,) = children - namenode = Name(child.string) - annotation = None - else: - # This is a parameter with a type hint - name, colon, typehint = children - namenode = Name(name.string) - annotation = Annotation( - whitespace_before_indicator=parse_parenthesizable_whitespace( - config, colon.whitespace_before - ), - whitespace_after_indicator=parse_parenthesizable_whitespace( - config, colon.whitespace_after - ), - annotation=typehint.value, - ) - - return Param(star="", name=namenode, annotation=annotation, default=None) - - -@with_production("tfpdef_posind", "'/'") -@with_production("vfpdef_posind", "'/'") -def convert_fpdef_slash(config: ParserConfig, children: Sequence[Any]) -> Any: - return ParamSlash() diff --git a/libcst/_parser/conversions/statement.py b/libcst/_parser/conversions/statement.py deleted file mode 100644 index f96c6ea21..000000000 --- a/libcst/_parser/conversions/statement.py +++ /dev/null @@ -1,1381 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
-# pyre-unsafe - -from typing import Any, Dict, List, Optional, Sequence, Tuple, Type - -from libcst import CSTLogicError -from libcst._exceptions import ParserSyntaxError, PartialParserSyntaxError -from libcst._maybe_sentinel import MaybeSentinel -from libcst._nodes.expression import ( - Annotation, - Arg, - Asynchronous, - Attribute, - Call, - From, - LeftParen, - Name, - Param, - Parameters, - RightParen, -) -from libcst._nodes.op import ( - AddAssign, - AssignEqual, - BaseAugOp, - BitAndAssign, - BitOrAssign, - BitXorAssign, - Comma, - DivideAssign, - Dot, - FloorDivideAssign, - ImportStar, - LeftShiftAssign, - MatrixMultiplyAssign, - ModuloAssign, - MultiplyAssign, - PowerAssign, - RightShiftAssign, - Semicolon, - SubtractAssign, -) -from libcst._nodes.statement import ( - AnnAssign, - AsName, - Assert, - Assign, - AssignTarget, - AugAssign, - Break, - ClassDef, - Continue, - Decorator, - Del, - Else, - ExceptHandler, - Expr, - Finally, - For, - FunctionDef, - Global, - If, - Import, - ImportAlias, - ImportFrom, - IndentedBlock, - NameItem, - Nonlocal, - Pass, - Raise, - Return, - SimpleStatementLine, - SimpleStatementSuite, - Try, - While, - With, - WithItem, -) -from libcst._nodes.whitespace import EmptyLine, SimpleWhitespace -from libcst._parser.custom_itertools import grouper -from libcst._parser.production_decorator import with_production -from libcst._parser.types.config import ParserConfig -from libcst._parser.types.partials import ( - AnnAssignPartial, - AssignPartial, - AugAssignPartial, - DecoratorPartial, - ExceptClausePartial, - FuncdefPartial, - ImportPartial, - ImportRelativePartial, - SimpleStatementPartial, - WithLeadingWhitespace, -) -from libcst._parser.types.token import Token -from libcst._parser.whitespace_parser import ( - parse_empty_lines, - parse_parenthesizable_whitespace, - parse_simple_whitespace, -) - -AUGOP_TOKEN_LUT: Dict[str, Type[BaseAugOp]] = { - "+=": AddAssign, - "-=": SubtractAssign, - "*=": MultiplyAssign, - "@=": 
MatrixMultiplyAssign, - "/=": DivideAssign, - "%=": ModuloAssign, - "&=": BitAndAssign, - "|=": BitOrAssign, - "^=": BitXorAssign, - "<<=": LeftShiftAssign, - ">>=": RightShiftAssign, - "**=": PowerAssign, - "//=": FloorDivideAssign, -} - - -@with_production("stmt_input", "stmt ENDMARKER") -def convert_stmt_input(config: ParserConfig, children: Sequence[Any]) -> Any: - (child, endmarker) = children - return child - - -@with_production("stmt", "simple_stmt_line | compound_stmt") -def convert_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - (child,) = children - return child - - -@with_production("simple_stmt_partial", "small_stmt (';' small_stmt)* [';'] NEWLINE") -def convert_simple_stmt_partial(config: ParserConfig, children: Sequence[Any]) -> Any: - *statements, trailing_whitespace = children - - last_stmt = len(statements) / 2 - body = [] - for i, (stmt_body, semi) in enumerate(grouper(statements, 2)): - if semi is not None: - if i == (last_stmt - 1): - # Trailing semicolons only own the whitespace before. - semi = Semicolon( - whitespace_before=parse_simple_whitespace( - config, semi.whitespace_before - ), - whitespace_after=SimpleWhitespace(""), - ) - else: - # Middle semicolons own the whitespace before and after. 
- semi = Semicolon( - whitespace_before=parse_simple_whitespace( - config, semi.whitespace_before - ), - whitespace_after=parse_simple_whitespace( - config, semi.whitespace_after - ), - ) - else: - semi = MaybeSentinel.DEFAULT - body.append(stmt_body.value.with_changes(semicolon=semi)) - return SimpleStatementPartial( - body, - whitespace_before=statements[0].whitespace_before, - trailing_whitespace=trailing_whitespace, - ) - - -@with_production("simple_stmt_line", "simple_stmt_partial") -def convert_simple_stmt_line(config: ParserConfig, children: Sequence[Any]) -> Any: - """ - This function is similar to convert_simple_stmt_suite, but yields a different type - """ - (partial,) = children - return SimpleStatementLine( - partial.body, - leading_lines=parse_empty_lines(config, partial.whitespace_before), - trailing_whitespace=partial.trailing_whitespace, - ) - - -@with_production("simple_stmt_suite", "simple_stmt_partial") -def convert_simple_stmt_suite(config: ParserConfig, children: Sequence[Any]) -> Any: - """ - This function is similar to convert_simple_stmt_line, but yields a different type - """ - (partial,) = children - return SimpleStatementSuite( - partial.body, - leading_whitespace=parse_simple_whitespace(config, partial.whitespace_before), - trailing_whitespace=partial.trailing_whitespace, - ) - - -@with_production( - "small_stmt", - ( - "expr_stmt | del_stmt | pass_stmt | break_stmt | continue_stmt | return_stmt" - + "| raise_stmt | yield_stmt | import_stmt | global_stmt | nonlocal_stmt" - + "| assert_stmt" - ), -) -def convert_small_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - # Doesn't construct SmallStatement, because we don't know about semicolons yet. - # convert_simple_stmt will construct the SmallStatement nodes. 
- (small_stmt_body,) = children - return small_stmt_body - - -@with_production( - "expr_stmt", - "testlist_star_expr (annassign | augassign | assign* )", - version=">=3.6", -) -@with_production( - "expr_stmt", "testlist_star_expr (augassign | assign* )", version="<=3.5" -) -@with_production("yield_stmt", "yield_expr") -def convert_expr_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 1: - # This is an unassigned expr statement (like a function call) - (test_node,) = children - return WithLeadingWhitespace( - Expr(value=test_node.value), test_node.whitespace_before - ) - elif len(children) == 2: - lhs, rhs = children - if isinstance(rhs, AnnAssignPartial): - return WithLeadingWhitespace( - AnnAssign( - target=lhs.value, - annotation=rhs.annotation, - equal=MaybeSentinel.DEFAULT if rhs.equal is None else rhs.equal, - value=rhs.value, - ), - lhs.whitespace_before, - ) - elif isinstance(rhs, AugAssignPartial): - return WithLeadingWhitespace( - AugAssign(target=lhs.value, operator=rhs.operator, value=rhs.value), - lhs.whitespace_before, - ) - # The only thing it could be at this point is an assign with one or more targets. - # So, walk the children moving the equals ownership back one and constructing a - # list of AssignTargets. 
- targets = [] - for i in range(len(children) - 1): - target = children[i].value - equal = children[i + 1].equal - - targets.append( - AssignTarget( - target=target, - whitespace_before_equal=equal.whitespace_before, - whitespace_after_equal=equal.whitespace_after, - ) - ) - - return WithLeadingWhitespace( - Assign(targets=tuple(targets), value=children[-1].value), - children[0].whitespace_before, - ) - - -@with_production("annassign", "':' test ['=' test]", version=">=3.6,<3.8") -@with_production( - "annassign", "':' test ['=' (yield_expr|testlist_star_expr)]", version=">=3.8" -) -def convert_annassign(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 2: - # Variable annotation only - colon, annotation = children - annotation = annotation.value - equal = None - value = None - elif len(children) == 4: - # Variable annotation and assignment - colon, annotation, equal, value = children - annotation = annotation.value - value = value.value - equal = AssignEqual( - whitespace_before=parse_simple_whitespace(config, equal.whitespace_before), - whitespace_after=parse_simple_whitespace(config, equal.whitespace_after), - ) - else: - raise ParserSyntaxError( - "Invalid parser state!", lines=config.lines, raw_line=0, raw_column=0 - ) - - return AnnAssignPartial( - annotation=Annotation( - whitespace_before_indicator=parse_simple_whitespace( - config, colon.whitespace_before - ), - whitespace_after_indicator=parse_simple_whitespace( - config, colon.whitespace_after - ), - annotation=annotation, - ), - equal=equal, - value=value, - ) - - -@with_production( - "augassign", - ( - "('+=' | '-=' | '*=' | '@=' | '/=' | '%=' | '&=' | '|=' | '^=' | '<<=' | " - + "'>>=' | '**=' | '//=') (yield_expr | testlist)" - ), - version=">=3.5", -) -@with_production( - "augassign", - ( - "('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | '<<=' | " - + "'>>=' | '**=' | '//=') (yield_expr | testlist)" - ), - version="<3.5", -) -def convert_augassign(config: 
ParserConfig, children: Sequence[Any]) -> Any: - op, expr = children - if op.string not in AUGOP_TOKEN_LUT: - raise ParserSyntaxError( - f"Unexpected token '{op.string}'!", - lines=config.lines, - raw_line=0, - raw_column=0, - ) - - return AugAssignPartial( - # pyre-ignore Pyre seems to think that the value of this LUT is CSTNode - operator=AUGOP_TOKEN_LUT[op.string]( - whitespace_before=parse_simple_whitespace(config, op.whitespace_before), - whitespace_after=parse_simple_whitespace(config, op.whitespace_after), - ), - value=expr.value, - ) - - -@with_production("assign", "'=' (yield_expr|testlist_star_expr)") -def convert_assign(config: ParserConfig, children: Sequence[Any]) -> Any: - equal, expr = children - return AssignPartial( - equal=AssignEqual( - whitespace_before=parse_simple_whitespace(config, equal.whitespace_before), - whitespace_after=parse_simple_whitespace(config, equal.whitespace_after), - ), - value=expr.value, - ) - - -@with_production("pass_stmt", "'pass'") -def convert_pass_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - (name,) = children - return WithLeadingWhitespace(Pass(), name.whitespace_before) - - -@with_production("del_stmt", "'del' exprlist") -def convert_del_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - (del_name, exprlist) = children - return WithLeadingWhitespace( - Del( - target=exprlist.value, - whitespace_after_del=parse_simple_whitespace( - config, del_name.whitespace_after - ), - ), - del_name.whitespace_before, - ) - - -@with_production("continue_stmt", "'continue'") -def convert_continue_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - (name,) = children - return WithLeadingWhitespace(Continue(), name.whitespace_before) - - -@with_production("break_stmt", "'break'") -def convert_break_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - (name,) = children - return WithLeadingWhitespace(Break(), name.whitespace_before) - - -@with_production("return_stmt", "'return' 
[testlist]", version="<=3.7") -@with_production("return_stmt", "'return' [testlist_star_expr]", version=">=3.8") -def convert_return_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 1: - (keyword,) = children - return WithLeadingWhitespace( - Return(whitespace_after_return=SimpleWhitespace("")), - keyword.whitespace_before, - ) - else: - (keyword, testlist) = children - return WithLeadingWhitespace( - Return( - value=testlist.value, - whitespace_after_return=parse_simple_whitespace( - config, keyword.whitespace_after - ), - ), - keyword.whitespace_before, - ) - - -@with_production("import_stmt", "import_name | import_from") -def convert_import_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - (child,) = children - return child - - -@with_production("import_name", "'import' dotted_as_names") -def convert_import_name(config: ParserConfig, children: Sequence[Any]) -> Any: - importtoken, names = children - return WithLeadingWhitespace( - Import( - names=names.names, - whitespace_after_import=parse_simple_whitespace( - config, importtoken.whitespace_after - ), - ), - importtoken.whitespace_before, - ) - - -@with_production("import_relative", "('.' | '...')* dotted_name | ('.' 
| '...')+") -def convert_import_relative(config: ParserConfig, children: Sequence[Any]) -> Any: - dots = [] - dotted_name = None - for child in children: - if isinstance(child, Token): - # Special case for "...", which is part of the grammar - if child.string == "...": - dots.extend( - [ - Dot(), - Dot(), - Dot( - whitespace_after=parse_simple_whitespace( - config, child.whitespace_after - ) - ), - ] - ) - else: - dots.append( - Dot( - whitespace_after=parse_simple_whitespace( - config, child.whitespace_after - ) - ) - ) - else: - # This should be the dotted name, and we can't get more than - # one, but lets be sure anyway - if dotted_name is not None: - raise CSTLogicError() - dotted_name = child - - return ImportRelativePartial(relative=tuple(dots), module=dotted_name) - - -@with_production( - "import_from", - "'from' import_relative 'import' ('*' | '(' import_as_names ')' | import_as_names)", -) -def convert_import_from(config: ParserConfig, children: Sequence[Any]) -> Any: - fromtoken, import_relative, importtoken, *importlist = children - - if len(importlist) == 1: - (possible_star,) = importlist - if isinstance(possible_star, Token): - # Its a "*" import, so we must construct this node. - names = ImportStar() - else: - # Its an import as names partial, grab the names from that. - names = possible_star.names - lpar = None - rpar = None - else: - # Its an import as names partial with parens - lpartoken, namespartial, rpartoken = importlist - lpar = LeftParen( - whitespace_after=parse_parenthesizable_whitespace( - config, lpartoken.whitespace_after - ) - ) - names = namespartial.names - rpar = RightParen( - whitespace_before=parse_parenthesizable_whitespace( - config, rpartoken.whitespace_before - ) - ) - - # If we have a relative-only import, then we need to relocate the space - # after the final dot to be owned by the import token. 
- if len(import_relative.relative) > 0 and import_relative.module is None: - whitespace_before_import = import_relative.relative[-1].whitespace_after - relative = ( - *import_relative.relative[:-1], - import_relative.relative[-1].with_changes( - whitespace_after=SimpleWhitespace("") - ), - ) - else: - whitespace_before_import = parse_simple_whitespace( - config, importtoken.whitespace_before - ) - relative = import_relative.relative - - return WithLeadingWhitespace( - ImportFrom( - whitespace_after_from=parse_simple_whitespace( - config, fromtoken.whitespace_after - ), - relative=relative, - module=import_relative.module, - whitespace_before_import=whitespace_before_import, - whitespace_after_import=parse_simple_whitespace( - config, importtoken.whitespace_after - ), - lpar=lpar, - names=names, - rpar=rpar, - ), - fromtoken.whitespace_before, - ) - - -@with_production("import_as_name", "NAME ['as' NAME]") -def convert_import_as_name(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 1: - (dotted_name,) = children - return ImportAlias(name=Name(dotted_name.string), asname=None) - else: - dotted_name, astoken, name = children - return ImportAlias( - name=Name(dotted_name.string), - asname=AsName( - whitespace_before_as=parse_simple_whitespace( - config, astoken.whitespace_before - ), - whitespace_after_as=parse_simple_whitespace( - config, astoken.whitespace_after - ), - name=Name(name.string), - ), - ) - - -@with_production("dotted_as_name", "dotted_name ['as' NAME]") -def convert_dotted_as_name(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 1: - (dotted_name,) = children - return ImportAlias(name=dotted_name, asname=None) - else: - dotted_name, astoken, name = children - return ImportAlias( - name=dotted_name, - asname=AsName( - whitespace_before_as=parse_parenthesizable_whitespace( - config, astoken.whitespace_before - ), - whitespace_after_as=parse_parenthesizable_whitespace( - config, 
astoken.whitespace_after - ), - name=Name(name.string), - ), - ) - - -@with_production("import_as_names", "import_as_name (',' import_as_name)* [',']") -def convert_import_as_names(config: ParserConfig, children: Sequence[Any]) -> Any: - return _gather_import_names(config, children) - - -@with_production("dotted_as_names", "dotted_as_name (',' dotted_as_name)*") -def convert_dotted_as_names(config: ParserConfig, children: Sequence[Any]) -> Any: - return _gather_import_names(config, children) - - -def _gather_import_names( - config: ParserConfig, children: Sequence[Any] -) -> ImportPartial: - names = [] - for name, comma in grouper(children, 2): - if comma is None: - names.append(name) - else: - names.append( - name.with_changes( - comma=Comma( - whitespace_before=parse_parenthesizable_whitespace( - config, comma.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, comma.whitespace_after - ), - ) - ) - ) - - return ImportPartial(names=names) - - -@with_production("dotted_name", "NAME ('.' 
NAME)*") -def convert_dotted_name(config: ParserConfig, children: Sequence[Any]) -> Any: - left, *rest = children - node = Name(left.string) - - for dot, right in grouper(rest, 2): - node = Attribute( - value=node, - dot=Dot( - whitespace_before=parse_parenthesizable_whitespace( - config, dot.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, dot.whitespace_after - ), - ), - attr=Name(right.string), - ) - - return node - - -@with_production("raise_stmt", "'raise' [test ['from' test]]") -def convert_raise_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 1: - (raise_token,) = children - whitespace_after_raise = MaybeSentinel.DEFAULT - exc = None - cause = None - elif len(children) == 2: - (raise_token, test) = children - whitespace_after_raise = parse_simple_whitespace(config, test.whitespace_before) - exc = test.value - cause = None - elif len(children) == 4: - (raise_token, test, from_token, source) = children - whitespace_after_raise = parse_simple_whitespace(config, test.whitespace_before) - exc = test.value - cause = From( - whitespace_before_from=parse_simple_whitespace( - config, from_token.whitespace_before - ), - whitespace_after_from=parse_simple_whitespace( - config, source.whitespace_before - ), - item=source.value, - ) - else: - raise CSTLogicError() - - return WithLeadingWhitespace( - Raise(whitespace_after_raise=whitespace_after_raise, exc=exc, cause=cause), - raise_token.whitespace_before, - ) - - -def _construct_nameitems(config: ParserConfig, names: Sequence[Any]) -> List[NameItem]: - nameitems: List[NameItem] = [] - for name, maybe_comma in grouper(names, 2): - if maybe_comma is None: - nameitems.append(NameItem(Name(name.string))) - else: - nameitems.append( - NameItem( - Name(name.string), - comma=Comma( - whitespace_before=parse_simple_whitespace( - config, maybe_comma.whitespace_before - ), - whitespace_after=parse_simple_whitespace( - config, maybe_comma.whitespace_after - ), 
- ), - ) - ) - return nameitems - - -@with_production("global_stmt", "'global' NAME (',' NAME)*") -def convert_global_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - (global_token, *names) = children - return WithLeadingWhitespace( - Global( - names=tuple(_construct_nameitems(config, names)), - whitespace_after_global=parse_simple_whitespace( - config, names[0].whitespace_before - ), - ), - global_token.whitespace_before, - ) - - -@with_production("nonlocal_stmt", "'nonlocal' NAME (',' NAME)*") -def convert_nonlocal_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - (nonlocal_token, *names) = children - return WithLeadingWhitespace( - Nonlocal( - names=tuple(_construct_nameitems(config, names)), - whitespace_after_nonlocal=parse_simple_whitespace( - config, names[0].whitespace_before - ), - ), - nonlocal_token.whitespace_before, - ) - - -@with_production("assert_stmt", "'assert' test [',' test]") -def convert_assert_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 2: - (assert_token, test) = children - assert_node = Assert( - whitespace_after_assert=parse_simple_whitespace( - config, test.whitespace_before - ), - test=test.value, - msg=None, - ) - else: - (assert_token, test, comma_token, msg) = children - assert_node = Assert( - whitespace_after_assert=parse_simple_whitespace( - config, test.whitespace_before - ), - test=test.value, - comma=Comma( - whitespace_before=parse_simple_whitespace( - config, comma_token.whitespace_before - ), - whitespace_after=parse_simple_whitespace(config, msg.whitespace_before), - ), - msg=msg.value, - ) - - return WithLeadingWhitespace(assert_node, assert_token.whitespace_before) - - -@with_production( - "compound_stmt", - ("if_stmt | while_stmt | asyncable_stmt | try_stmt | classdef | decorated"), -) -def convert_compound_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - (stmt,) = children - return stmt - - -@with_production( - "if_stmt", "'if' test ':' suite 
[if_stmt_elif|if_stmt_else]", version="<=3.7" -) -@with_production( - "if_stmt", - "'if' namedexpr_test ':' suite [if_stmt_elif|if_stmt_else]", - version=">=3.8", -) -def convert_if_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - if_tok, test, colon_tok, suite, *tail = children - - if len(tail) > 0: - (orelse,) = tail - else: - orelse = None - - return If( - leading_lines=parse_empty_lines(config, if_tok.whitespace_before), - whitespace_before_test=parse_simple_whitespace(config, if_tok.whitespace_after), - test=test.value, - whitespace_after_test=parse_simple_whitespace( - config, colon_tok.whitespace_before - ), - body=suite, - orelse=orelse, - ) - - -@with_production( - "if_stmt_elif", "'elif' test ':' suite [if_stmt_elif|if_stmt_else]", version="<=3.7" -) -@with_production( - "if_stmt_elif", - "'elif' namedexpr_test ':' suite [if_stmt_elif|if_stmt_else]", - version=">=3.8", -) -def convert_if_stmt_elif(config: ParserConfig, children: Sequence[Any]) -> Any: - # this behaves exactly the same as `convert_if_stmt`, except that the leading token - # has a different string value. 
- return convert_if_stmt(config, children) - - -@with_production("if_stmt_else", "'else' ':' suite") -def convert_if_stmt_else(config: ParserConfig, children: Sequence[Any]) -> Any: - else_tok, colon_tok, suite = children - return Else( - leading_lines=parse_empty_lines(config, else_tok.whitespace_before), - whitespace_before_colon=parse_simple_whitespace( - config, colon_tok.whitespace_before - ), - body=suite, - ) - - -@with_production( - "while_stmt", "'while' test ':' suite ['else' ':' suite]", version="<=3.7" -) -@with_production( - "while_stmt", "'while' namedexpr_test ':' suite ['else' ':' suite]", version=">=3.8" -) -def convert_while_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - while_token, test, while_colon_token, while_suite, *else_block = children - - if len(else_block) > 0: - (else_token, else_colon_token, else_suite) = else_block - orelse = Else( - leading_lines=parse_empty_lines(config, else_token.whitespace_before), - whitespace_before_colon=parse_simple_whitespace( - config, else_colon_token.whitespace_before - ), - body=else_suite, - ) - else: - orelse = None - - return While( - leading_lines=parse_empty_lines(config, while_token.whitespace_before), - whitespace_after_while=parse_simple_whitespace( - config, while_token.whitespace_after - ), - test=test.value, - whitespace_before_colon=parse_simple_whitespace( - config, while_colon_token.whitespace_before - ), - body=while_suite, - orelse=orelse, - ) - - -@with_production( - "for_stmt", "'for' exprlist 'in' testlist ':' suite ['else' ':' suite]" -) -def convert_for_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - ( - for_token, - expr, - in_token, - test, - for_colon_token, - for_suite, - *else_block, - ) = children - - if len(else_block) > 0: - (else_token, else_colon_token, else_suite) = else_block - orelse = Else( - leading_lines=parse_empty_lines(config, else_token.whitespace_before), - whitespace_before_colon=parse_simple_whitespace( - config, 
else_colon_token.whitespace_before - ), - body=else_suite, - ) - else: - orelse = None - - return WithLeadingWhitespace( - For( - whitespace_after_for=parse_simple_whitespace( - config, for_token.whitespace_after - ), - target=expr.value, - whitespace_before_in=parse_simple_whitespace( - config, in_token.whitespace_before - ), - whitespace_after_in=parse_simple_whitespace( - config, in_token.whitespace_after - ), - iter=test.value, - whitespace_before_colon=parse_simple_whitespace( - config, for_colon_token.whitespace_before - ), - body=for_suite, - orelse=orelse, - ), - for_token.whitespace_before, - ) - - -@with_production( - "try_stmt", - "('try' ':' suite ((except_clause ':' suite)+ ['else' ':' suite] ['finally' ':' suite] | 'finally' ':' suite))", -) -def convert_try_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - trytoken, try_colon_token, try_suite, *rest = children - handlers: List[ExceptHandler] = [] - orelse: Optional[Else] = None - finalbody: Optional[Finally] = None - - for clause, colon_token, suite in grouper(rest, 3): - if isinstance(clause, Token): - if clause.string == "else": - if orelse is not None: - raise CSTLogicError("Logic error!") - orelse = Else( - leading_lines=parse_empty_lines(config, clause.whitespace_before), - whitespace_before_colon=parse_simple_whitespace( - config, colon_token.whitespace_before - ), - body=suite, - ) - elif clause.string == "finally": - if finalbody is not None: - raise CSTLogicError("Logic error!") - finalbody = Finally( - leading_lines=parse_empty_lines(config, clause.whitespace_before), - whitespace_before_colon=parse_simple_whitespace( - config, colon_token.whitespace_before - ), - body=suite, - ) - else: - raise CSTLogicError("Logic error!") - elif isinstance(clause, ExceptClausePartial): - handlers.append( - ExceptHandler( - body=suite, - type=clause.type, - name=clause.name, - leading_lines=clause.leading_lines, - whitespace_after_except=clause.whitespace_after_except, - 
whitespace_before_colon=parse_simple_whitespace( - config, colon_token.whitespace_before - ), - ) - ) - else: - raise CSTLogicError("Logic error!") - - return Try( - leading_lines=parse_empty_lines(config, trytoken.whitespace_before), - whitespace_before_colon=parse_simple_whitespace( - config, try_colon_token.whitespace_before - ), - body=try_suite, - handlers=tuple(handlers), - orelse=orelse, - finalbody=finalbody, - ) - - -@with_production("except_clause", "'except' [test ['as' NAME]]") -def convert_except_clause(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 1: - (except_token,) = children - whitespace_after_except = SimpleWhitespace("") - test = None - name = None - elif len(children) == 2: - (except_token, test_node) = children - whitespace_after_except = parse_simple_whitespace( - config, except_token.whitespace_after - ) - test = test_node.value - name = None - else: - (except_token, test_node, as_token, name_token) = children - whitespace_after_except = parse_simple_whitespace( - config, except_token.whitespace_after - ) - test = test_node.value - name = AsName( - whitespace_before_as=parse_simple_whitespace( - config, as_token.whitespace_before - ), - whitespace_after_as=parse_simple_whitespace( - config, as_token.whitespace_after - ), - name=Name(name_token.string), - ) - - return ExceptClausePartial( - leading_lines=parse_empty_lines(config, except_token.whitespace_before), - whitespace_after_except=whitespace_after_except, - type=test, - name=name, - ) - - -@with_production( - "with_stmt", "'with' with_item (',' with_item)* ':' suite", version=">=3.1" -) -@with_production("with_stmt", "'with' with_item ':' suite", version="<3.1") -def convert_with_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - (with_token, *items, colon_token, suite) = children - item_nodes: List[WithItem] = [] - - for with_item, maybe_comma in grouper(items, 2): - if maybe_comma is not None: - item_nodes.append( - with_item.with_changes( - 
comma=Comma( - whitespace_before=parse_parenthesizable_whitespace( - config, maybe_comma.whitespace_before - ), - whitespace_after=parse_parenthesizable_whitespace( - config, maybe_comma.whitespace_after - ), - ) - ) - ) - else: - item_nodes.append(with_item) - - return WithLeadingWhitespace( - With( - whitespace_after_with=parse_simple_whitespace( - config, with_token.whitespace_after - ), - items=tuple(item_nodes), - whitespace_before_colon=parse_simple_whitespace( - config, colon_token.whitespace_before - ), - body=suite, - ), - with_token.whitespace_before, - ) - - -@with_production("with_item", "test ['as' expr]") -def convert_with_item(config: ParserConfig, children: Sequence[Any]) -> Any: - if len(children) == 3: - (test, as_token, expr_node) = children - test_node = test.value - asname = AsName( - whitespace_before_as=parse_simple_whitespace( - config, as_token.whitespace_before - ), - whitespace_after_as=parse_simple_whitespace( - config, as_token.whitespace_after - ), - name=expr_node.value, - ) - else: - (test,) = children - test_node = test.value - asname = None - - return WithItem(item=test_node, asname=asname) - - -def _extract_async( - config: ParserConfig, children: Sequence[Any] -) -> Tuple[List[EmptyLine], Optional[Asynchronous], Any]: - if len(children) == 1: - (stmt,) = children - - whitespace_before = stmt.whitespace_before - asyncnode = None - else: - asynctoken, stmt = children - - whitespace_before = asynctoken.whitespace_before - asyncnode = Asynchronous( - whitespace_after=parse_simple_whitespace( - config, asynctoken.whitespace_after - ) - ) - - return (parse_empty_lines(config, whitespace_before), asyncnode, stmt.value) - - -@with_production("asyncable_funcdef", "[ASYNC] funcdef", version=">=3.5") -@with_production("asyncable_funcdef", "funcdef", version="<3.5") -def convert_asyncable_funcdef(config: ParserConfig, children: Sequence[Any]) -> Any: - leading_lines, asyncnode, funcdef = _extract_async(config, children) - - return 
funcdef.with_changes( - asynchronous=asyncnode, leading_lines=leading_lines, lines_after_decorators=() - ) - - -@with_production("funcdef", "'def' NAME parameters [funcdef_annotation] ':' suite") -def convert_funcdef(config: ParserConfig, children: Sequence[Any]) -> Any: - defnode, namenode, param_partial, *annotation, colon, suite = children - - # If the trailing paremeter doesn't have a comma, then it owns the trailing - # whitespace before the rpar. Otherwise, the comma owns it (and will have - # already parsed it). We don't check/update ParamStar because if it exists - # then we are guaranteed have at least one kwonly_param. - parameters = param_partial.params - if parameters.star_kwarg is not None: - if parameters.star_kwarg.comma == MaybeSentinel.DEFAULT: - parameters = parameters.with_changes( - star_kwarg=parameters.star_kwarg.with_changes( - whitespace_after_param=param_partial.rpar.whitespace_before - ) - ) - elif parameters.kwonly_params: - if parameters.kwonly_params[-1].comma == MaybeSentinel.DEFAULT: - parameters = parameters.with_changes( - kwonly_params=( - *parameters.kwonly_params[:-1], - parameters.kwonly_params[-1].with_changes( - whitespace_after_param=param_partial.rpar.whitespace_before - ), - ) - ) - elif isinstance(parameters.star_arg, Param): - if parameters.star_arg.comma == MaybeSentinel.DEFAULT: - parameters = parameters.with_changes( - star_arg=parameters.star_arg.with_changes( - whitespace_after_param=param_partial.rpar.whitespace_before - ) - ) - elif parameters.params: - if parameters.params[-1].comma == MaybeSentinel.DEFAULT: - parameters = parameters.with_changes( - params=( - *parameters.params[:-1], - parameters.params[-1].with_changes( - whitespace_after_param=param_partial.rpar.whitespace_before - ), - ) - ) - - return WithLeadingWhitespace( - FunctionDef( - whitespace_after_def=parse_simple_whitespace( - config, defnode.whitespace_after - ), - name=Name(namenode.string), - whitespace_after_name=parse_simple_whitespace( - 
config, namenode.whitespace_after - ), - whitespace_before_params=param_partial.lpar.whitespace_after, - params=parameters, - returns=None if not annotation else annotation[0], - whitespace_before_colon=parse_simple_whitespace( - config, colon.whitespace_before - ), - body=suite, - ), - defnode.whitespace_before, - ) - - -@with_production("parameters", "'(' [typedargslist] ')'") -def convert_parameters(config: ParserConfig, children: Sequence[Any]) -> Any: - lpar, *paramlist, rpar = children - return FuncdefPartial( - lpar=LeftParen( - whitespace_after=parse_parenthesizable_whitespace( - config, lpar.whitespace_after - ) - ), - params=Parameters() if not paramlist else paramlist[0], - rpar=RightParen( - whitespace_before=parse_parenthesizable_whitespace( - config, rpar.whitespace_before - ) - ), - ) - - -@with_production("funcdef_annotation", "'->' test") -def convert_funcdef_annotation(config: ParserConfig, children: Sequence[Any]) -> Any: - arrow, typehint = children - return Annotation( - whitespace_before_indicator=parse_parenthesizable_whitespace( - config, arrow.whitespace_before - ), - whitespace_after_indicator=parse_parenthesizable_whitespace( - config, arrow.whitespace_after - ), - annotation=typehint.value, - ) - - -@with_production("classdef", "'class' NAME ['(' [arglist] ')'] ':' suite") -def convert_classdef(config: ParserConfig, children: Sequence[Any]) -> Any: - classdef, name, *arglist, colon, suite = children - - # First, parse out the comments and empty lines before the statement. 
- leading_lines = parse_empty_lines(config, classdef.whitespace_before) - - # Compute common whitespace and nodes - whitespace_after_class = parse_simple_whitespace(config, classdef.whitespace_after) - namenode = Name(name.string) - whitespace_after_name = parse_simple_whitespace(config, name.whitespace_after) - - # Now, construct the classdef node itself - if not arglist: - # No arglist, so no arguments to this class - return ClassDef( - leading_lines=leading_lines, - lines_after_decorators=(), - whitespace_after_class=whitespace_after_class, - name=namenode, - whitespace_after_name=whitespace_after_name, - body=suite, - ) - else: - # Unwrap arglist partial, because its valid to not have any - lpar, *args, rpar = arglist - args = args[0].args if args else [] - - bases: List[Arg] = [] - keywords: List[Arg] = [] - - current_arg = bases - for arg in args: - if arg.star == "**" or arg.keyword is not None: - current_arg = keywords - # Some quick validation - if current_arg is keywords and ( - arg.star == "*" or (arg.star == "" and arg.keyword is None) - ): - raise PartialParserSyntaxError( - "Positional argument follows keyword argument." 
- ) - current_arg.append(arg) - - return ClassDef( - leading_lines=leading_lines, - lines_after_decorators=(), - whitespace_after_class=whitespace_after_class, - name=namenode, - whitespace_after_name=whitespace_after_name, - lpar=LeftParen( - whitespace_after=parse_parenthesizable_whitespace( - config, lpar.whitespace_after - ) - ), - bases=bases, - keywords=keywords, - rpar=RightParen( - whitespace_before=parse_parenthesizable_whitespace( - config, rpar.whitespace_before - ) - ), - whitespace_before_colon=parse_simple_whitespace( - config, colon.whitespace_before - ), - body=suite, - ) - - -@with_production("decorator", "'@' dotted_name [ '(' [arglist] ')' ] NEWLINE") -def convert_decorator(config: ParserConfig, children: Sequence[Any]) -> Any: - atsign, name, *arglist, newline = children - if not arglist: - # This is either a name or an attribute node, so just extract it. - decoratornode = name - else: - # This needs to be converted into a call node, and we have the - # arglist partial. - lpar, *args, rpar = arglist - args = args[0].args if args else [] - - # If the trailing argument doesn't have a comma, then it owns the - # trailing whitespace before the rpar. Otherwise, the comma owns - # it. 
- if len(args) > 0 and args[-1].comma == MaybeSentinel.DEFAULT: - args[-1] = args[-1].with_changes( - whitespace_after_arg=parse_parenthesizable_whitespace( - config, rpar.whitespace_before - ) - ) - - decoratornode = Call( - func=name, - whitespace_after_func=parse_simple_whitespace( - config, lpar.whitespace_before - ), - whitespace_before_args=parse_parenthesizable_whitespace( - config, lpar.whitespace_after - ), - args=tuple(args), - ) - - return Decorator( - leading_lines=parse_empty_lines(config, atsign.whitespace_before), - whitespace_after_at=parse_simple_whitespace(config, atsign.whitespace_after), - decorator=decoratornode, - trailing_whitespace=newline, - ) - - -@with_production("decorators", "decorator+") -def convert_decorators(config: ParserConfig, children: Sequence[Any]) -> Any: - return DecoratorPartial(decorators=children) - - -@with_production("decorated", "decorators (classdef | asyncable_funcdef)") -def convert_decorated(config: ParserConfig, children: Sequence[Any]) -> Any: - partial, class_or_func = children - - # First, split up the spacing on the first decorator - leading_lines = partial.decorators[0].leading_lines - - # Now, redistribute ownership of the whitespace - decorators = ( - partial.decorators[0].with_changes(leading_lines=()), - *partial.decorators[1:], - ) - - # Now, modify the original function or class to add the decorators. - return class_or_func.with_changes( - leading_lines=leading_lines, - # pyre-fixme[60]: Concatenation not yet support for multiple variadic - # tuples: `*class_or_func.leading_lines, - # *class_or_func.lines_after_decorators`. - # pyre-fixme[60]: Expected to unpack an iterable, but got `unknown`. 
- lines_after_decorators=( - *class_or_func.leading_lines, - *class_or_func.lines_after_decorators, - ), - decorators=decorators, - ) - - -@with_production( - "asyncable_stmt", "[ASYNC] (funcdef | with_stmt | for_stmt)", version=">=3.5" -) -@with_production("asyncable_stmt", "funcdef | with_stmt | for_stmt", version="<3.5") -def convert_asyncable_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: - leading_lines, asyncnode, stmtnode = _extract_async(config, children) - if isinstance(stmtnode, FunctionDef): - return stmtnode.with_changes( - asynchronous=asyncnode, - leading_lines=leading_lines, - lines_after_decorators=(), - ) - elif isinstance(stmtnode, With): - return stmtnode.with_changes( - asynchronous=asyncnode, leading_lines=leading_lines - ) - elif isinstance(stmtnode, For): - return stmtnode.with_changes( - asynchronous=asyncnode, leading_lines=leading_lines - ) - else: - raise CSTLogicError("Logic error!") - - -@with_production("suite", "simple_stmt_suite | indented_suite") -def convert_suite(config: ParserConfig, children: Sequence[Any]) -> Any: - (suite,) = children - return suite - - -@with_production("indented_suite", "NEWLINE INDENT stmt+ DEDENT") -def convert_indented_suite(config: ParserConfig, children: Sequence[Any]) -> Any: - newline, indent, *stmts, dedent = children - return IndentedBlock( - header=newline, - indent=( - None - if indent.relative_indent == config.default_indent - else indent.relative_indent - ), - body=stmts, - # We want to be able to only keep comments in the footer that are actually for - # this IndentedBlock. We do so by assuming that lines which are indented to the - # same level as the block itself are comments that go at the footer of the - # block. Comments that are indented to less than this indent are assumed to - # belong to the next line of code. We override the indent here because the - # dedent node's absolute indent is the resulting indentation after the dedent - # is performed. 
Its this way because the whitespace state for both the dedent's - # whitespace_after and the next BaseCompoundStatement's whitespace_before is - # shared. This allows us to partially parse here and parse the rest of the - # whitespace and comments on the next line, effectively making sure that - # comments are attached to the correct node. - footer=parse_empty_lines( - config, - dedent.whitespace_after, - override_absolute_indent=indent.whitespace_before.absolute_indent, - ), - ) diff --git a/libcst/_parser/conversions/terminals.py b/libcst/_parser/conversions/terminals.py deleted file mode 100644 index f5697229e..000000000 --- a/libcst/_parser/conversions/terminals.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# pyre-unsafe - -from typing import Any - -from libcst._nodes.expression import SimpleString -from libcst._parser.types.config import ParserConfig -from libcst._parser.types.partials import WithLeadingWhitespace -from libcst._parser.types.token import Token -from libcst._parser.whitespace_parser import ( - parse_empty_lines, - parse_trailing_whitespace, -) - - -def convert_NAME(config: ParserConfig, token: Token) -> Any: - return token - - -def convert_NUMBER(config: ParserConfig, token: Token) -> Any: - return token - - -def convert_STRING(config: ParserConfig, token: Token) -> Any: - return WithLeadingWhitespace(SimpleString(token.string), token.whitespace_before) - - -def convert_OP(config: ParserConfig, token: Token) -> Any: - return token - - -def convert_NEWLINE(config: ParserConfig, token: Token) -> Any: - # A NEWLINE token is only emitted for semantic newlines, which means that this - # corresponds to a TrailingWhitespace, since that's the only semantic - # newline-containing node. - - # N.B. 
Because this token is whitespace, and because the whitespace parser doesn't - # try to prevent overflows, `token.whitespace_before` will end up overflowing into - # the value of this newline token, so `parse_trailing_whitespace` will include - # token.string's value. This is expected and desired behavior. - return parse_trailing_whitespace(config, token.whitespace_before) - - -def convert_INDENT(config: ParserConfig, token: Token) -> Any: - return token - - -def convert_DEDENT(config: ParserConfig, token: Token) -> Any: - return token - - -def convert_ENDMARKER(config: ParserConfig, token: Token) -> Any: - # Parse any and all empty lines with an indent similar to the header. That is, - # indent of nothing and including all indents. In some cases, like when the - # footer parser follows an indented suite, the state's indent can be wrong - # due to the fact that it is shared with the _DEDENT node. We know that if - # we're parsing the end of a file, we will have no indent. - return parse_empty_lines( - config, token.whitespace_before, override_absolute_indent="" - ) - - -def convert_FSTRING_START(config: ParserConfig, token: Token) -> Any: - return token - - -def convert_FSTRING_END(config: ParserConfig, token: Token) -> Any: - return token - - -def convert_FSTRING_STRING(config: ParserConfig, token: Token) -> Any: - return token - - -def convert_ASYNC(config: ParserConfig, token: Token) -> Any: - return token - - -def convert_AWAIT(config: ParserConfig, token: Token) -> Any: - return token diff --git a/libcst/_parser/custom_itertools.py b/libcst/_parser/custom_itertools.py deleted file mode 100644 index 81cfdb4b2..000000000 --- a/libcst/_parser/custom_itertools.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from itertools import zip_longest -from typing import Iterable, Iterator, TypeVar - -_T = TypeVar("_T") - - -# https://docs.python.org/3/library/itertools.html#itertools-recipes -def grouper(iterable: Iterable[_T], n: int, fillvalue: _T = None) -> Iterator[_T]: - "Collect data into fixed-length chunks or blocks" - # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" - args = [iter(iterable)] * n - return zip_longest(*args, fillvalue=fillvalue) diff --git a/libcst/_parser/detect_config.py b/libcst/_parser/detect_config.py index 375a4f07c..3cb052f98 100644 --- a/libcst/_parser/detect_config.py +++ b/libcst/_parser/detect_config.py @@ -1,8 +1,3 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - import itertools import re @@ -11,29 +6,7 @@ from tokenize import detect_encoding as py_tokenize_detect_encoding from typing import FrozenSet, Iterable, Iterator, Pattern, Set, Tuple, Union -from libcst._nodes.whitespace import NEWLINE_RE -from libcst._parser.parso.python.token import PythonTokenTypes, TokenType -from libcst._parser.parso.utils import split_lines from libcst._parser.types.config import AutoConfig, ParserConfig, PartialParserConfig -from libcst._parser.types.token import Token -from libcst._parser.wrapped_tokenize import tokenize_lines - -_INDENT: TokenType = PythonTokenTypes.INDENT -_NAME: TokenType = PythonTokenTypes.NAME -_NEWLINE: TokenType = PythonTokenTypes.NEWLINE -_STRING: TokenType = PythonTokenTypes.STRING - -_FALLBACK_DEFAULT_NEWLINE = "\n" -_FALLBACK_DEFAULT_INDENT = " " -_CONTINUATION_RE: Pattern[str] = re.compile(r"\\(\r\n?|\n)", re.UNICODE) - - -@dataclass(frozen=True) -class ConfigDetectionResult: - # The config is a set of constant values used by the parser. - config: ParserConfig - # The tokens iterator is mutated by the parser. 
- tokens: Iterator[Token] def _detect_encoding(source: Union[str, bytes]) -> str: @@ -49,71 +22,6 @@ def _detect_encoding(source: Union[str, bytes]) -> str: return py_tokenize_detect_encoding(BytesIO(source).readline)[0] -def _detect_default_newline(source_str: str) -> str: - """ - Finds the first newline, and uses that value as the default newline. - """ - # Don't use `NEWLINE_RE` for this, because it might match multiple newlines as a - # single newline. - match = NEWLINE_RE.search(source_str) - return match.group(0) if match is not None else _FALLBACK_DEFAULT_NEWLINE - - -def _detect_indent(tokens: Iterable[Token]) -> str: - """ - Finds the first INDENT token, and uses that as the value of the default indent. - """ - try: - first_indent = next(t for t in tokens if t.type is _INDENT) - except StopIteration: - return _FALLBACK_DEFAULT_INDENT - first_indent_str = first_indent.relative_indent - assert first_indent_str is not None, "INDENT tokens must contain a relative_indent" - return first_indent_str - - -def _detect_trailing_newline(source_str: str) -> bool: - if len(source_str) == 0 or not NEWLINE_RE.fullmatch(source_str[-1]): - return False - # Make sure that the last newline wasn't following a continuation - return not ( - _CONTINUATION_RE.fullmatch(source_str[-2:]) - or _CONTINUATION_RE.fullmatch(source_str[-3:]) - ) - - -def _detect_future_imports(tokens: Iterable[Token]) -> FrozenSet[str]: - """ - Finds __future__ imports in their proper locations. 
- - See `https://www.python.org/dev/peps/pep-0236/`_ - """ - future_imports: Set[str] = set() - state = 0 - for tok in tokens: - if state == 0 and tok.type in (_STRING, _NEWLINE): - continue - elif state == 0 and tok.string == "from": - state = 1 - elif state == 1 and tok.string == "__future__": - state = 2 - elif state == 2 and tok.string == "import": - state = 3 - elif state == 3 and tok.string == "as": - state = 4 - elif state == 3 and tok.type == _NAME: - future_imports.add(tok.string) - elif state == 4 and tok.type == _NAME: - state = 3 - elif state == 3 and tok.string in "(),": - continue - elif state == 3 and tok.type == _NEWLINE: - state = 0 - else: - break - return frozenset(future_imports) - - def convert_to_utf8( source: Union[str, bytes], *, partial: PartialParserConfig ) -> Tuple[str, str]: @@ -130,81 +38,3 @@ def convert_to_utf8( source_str = source if isinstance(source, str) else source.decode(encoding) return (encoding, source_str) - -def detect_config( - source: Union[str, bytes], - *, - partial: PartialParserConfig, - detect_trailing_newline: bool, - detect_default_newline: bool, -) -> ConfigDetectionResult: - """ - Computes a ParserConfig given the current source code to be parsed and a partial - config. - """ - - python_version = partial.parsed_python_version - - encoding, source_str = convert_to_utf8(source, partial=partial) - - partial_default_newline = partial.default_newline - default_newline = ( - ( - _detect_default_newline(source_str) - if detect_default_newline - else _FALLBACK_DEFAULT_NEWLINE - ) - if isinstance(partial_default_newline, AutoConfig) - else partial_default_newline - ) - - # HACK: The grammar requires a trailing newline, but python doesn't actually require - # a trailing newline. Add one onto the end to make the parser happy. We'll strip it - # out again during cst.Module's codegen. - # - # I think parso relies on error recovery support to handle this, which we don't - # have. 
lib2to3 doesn't handle this case at all AFAICT. - has_trailing_newline = detect_trailing_newline and _detect_trailing_newline( - source_str - ) - if detect_trailing_newline and not has_trailing_newline: - source_str += default_newline - - lines = split_lines(source_str, keepends=True) - - tokens = tokenize_lines(source_str, lines, python_version) - - partial_default_indent = partial.default_indent - if isinstance(partial_default_indent, AutoConfig): - # We need to clone `tokens` before passing it to `_detect_indent`, because - # `_detect_indent` consumes some tokens, mutating `tokens`. - # - # Implementation detail: CPython's `itertools.tee` uses weakrefs to reduce the - # size of its FIFO, so this doesn't retain items (leak memory) for `tokens_dup` - # once `token_dup` is freed at the end of this method (subject to - # GC/refcounting). - tokens, tokens_dup = itertools.tee(tokens) - default_indent = _detect_indent(tokens_dup) - else: - default_indent = partial_default_indent - - partial_future_imports = partial.future_imports - if isinstance(partial_future_imports, AutoConfig): - # Same note as above re itertools.tee, we will consume tokens. - tokens, tokens_dup = itertools.tee(tokens) - future_imports = _detect_future_imports(tokens_dup) - else: - future_imports = partial_future_imports - - return ConfigDetectionResult( - config=ParserConfig( - lines=lines, - encoding=encoding, - default_indent=default_indent, - default_newline=default_newline, - has_trailing_newline=has_trailing_newline, - version=python_version, - future_imports=future_imports, - ), - tokens=tokens, - ) diff --git a/libcst/_parser/grammar.py b/libcst/_parser/grammar.py deleted file mode 100644 index ee65ef72f..000000000 --- a/libcst/_parser/grammar.py +++ /dev/null @@ -1,413 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import re -from functools import lru_cache -from typing import FrozenSet, Iterator, Mapping, Optional, Tuple, Union - -from libcst._parser.conversions.expression import ( - convert_arg_assign_comp_for, - convert_arglist, - convert_argument, - convert_atom, - convert_atom_basic, - convert_atom_curlybraces, - convert_atom_ellipses, - convert_atom_expr, - convert_atom_expr_await, - convert_atom_expr_trailer, - convert_atom_parens, - convert_atom_squarebrackets, - convert_atom_string, - convert_binop, - convert_boolop, - convert_comp_for, - convert_comp_if, - convert_comp_op, - convert_comparison, - convert_dictorsetmaker, - convert_expression_input, - convert_factor, - convert_fstring, - convert_fstring_content, - convert_fstring_conversion, - convert_fstring_equality, - convert_fstring_expr, - convert_fstring_format_spec, - convert_lambda, - convert_namedexpr_test, - convert_not_test, - convert_power, - convert_sliceop, - convert_star_arg, - convert_star_expr, - convert_subscript, - convert_subscriptlist, - convert_sync_comp_for, - convert_test, - convert_test_nocond, - convert_test_or_expr_list, - convert_testlist_comp_list, - convert_testlist_comp_tuple, - convert_trailer, - convert_trailer_arglist, - convert_trailer_attribute, - convert_trailer_subscriptlist, - convert_yield_arg, - convert_yield_expr, -) -from libcst._parser.conversions.module import convert_file_input -from libcst._parser.conversions.params import ( - convert_argslist, - convert_fpdef, - convert_fpdef_assign, - convert_fpdef_slash, - convert_fpdef_star, - convert_fpdef_starstar, -) -from libcst._parser.conversions.statement import ( - convert_annassign, - convert_assert_stmt, - convert_assign, - convert_asyncable_funcdef, - convert_asyncable_stmt, - convert_augassign, - convert_break_stmt, - convert_classdef, - convert_compound_stmt, - convert_continue_stmt, - convert_decorated, - convert_decorator, - convert_decorators, - convert_del_stmt, - convert_dotted_as_name, - convert_dotted_as_names, 
- convert_dotted_name, - convert_except_clause, - convert_expr_stmt, - convert_for_stmt, - convert_funcdef, - convert_funcdef_annotation, - convert_global_stmt, - convert_if_stmt, - convert_if_stmt_elif, - convert_if_stmt_else, - convert_import_as_name, - convert_import_as_names, - convert_import_from, - convert_import_name, - convert_import_relative, - convert_import_stmt, - convert_indented_suite, - convert_nonlocal_stmt, - convert_parameters, - convert_pass_stmt, - convert_raise_stmt, - convert_return_stmt, - convert_simple_stmt_line, - convert_simple_stmt_partial, - convert_simple_stmt_suite, - convert_small_stmt, - convert_stmt, - convert_stmt_input, - convert_suite, - convert_try_stmt, - convert_while_stmt, - convert_with_item, - convert_with_stmt, -) -from libcst._parser.conversions.terminals import ( - convert_ASYNC, - convert_AWAIT, - convert_DEDENT, - convert_ENDMARKER, - convert_FSTRING_END, - convert_FSTRING_START, - convert_FSTRING_STRING, - convert_INDENT, - convert_NAME, - convert_NEWLINE, - convert_NUMBER, - convert_OP, - convert_STRING, -) -from libcst._parser.parso.pgen2.generator import generate_grammar, Grammar -from libcst._parser.parso.python.token import PythonTokenTypes, TokenType -from libcst._parser.parso.utils import parse_version_string, PythonVersionInfo -from libcst._parser.production_decorator import get_productions -from libcst._parser.types.config import AutoConfig -from libcst._parser.types.conversions import NonterminalConversion, TerminalConversion -from libcst._parser.types.production import Production - -# Keep this sorted alphabetically -_TERMINAL_CONVERSIONS_SEQUENCE: Tuple[TerminalConversion, ...] 
= ( - convert_DEDENT, - convert_ENDMARKER, - convert_INDENT, - convert_NAME, - convert_NEWLINE, - convert_NUMBER, - convert_OP, - convert_STRING, - convert_FSTRING_START, - convert_FSTRING_END, - convert_FSTRING_STRING, - convert_ASYNC, - convert_AWAIT, -) - -# Try to match the order of https://docs.python.org/3/reference/grammar.html -_NONTERMINAL_CONVERSIONS_SEQUENCE: Tuple[NonterminalConversion, ...] = ( - convert_file_input, - convert_stmt_input, # roughly equivalent to single_input - convert_expression_input, # roughly equivalent to eval_input - convert_stmt, - convert_simple_stmt_partial, - convert_simple_stmt_line, - convert_simple_stmt_suite, - convert_small_stmt, - convert_expr_stmt, - convert_annassign, - convert_augassign, - convert_assign, - convert_pass_stmt, - convert_continue_stmt, - convert_break_stmt, - convert_del_stmt, - convert_import_stmt, - convert_import_name, - convert_import_relative, - convert_import_from, - convert_import_as_name, - convert_dotted_as_name, - convert_import_as_names, - convert_dotted_as_names, - convert_dotted_name, - convert_return_stmt, - convert_raise_stmt, - convert_global_stmt, - convert_nonlocal_stmt, - convert_assert_stmt, - convert_compound_stmt, - convert_if_stmt, - convert_if_stmt_elif, - convert_if_stmt_else, - convert_while_stmt, - convert_for_stmt, - convert_try_stmt, - convert_except_clause, - convert_with_stmt, - convert_with_item, - convert_asyncable_funcdef, - convert_funcdef, - convert_classdef, - convert_decorator, - convert_decorators, - convert_decorated, - convert_asyncable_stmt, - convert_parameters, - convert_argslist, - convert_fpdef_slash, - convert_fpdef_star, - convert_fpdef_starstar, - convert_fpdef_assign, - convert_fpdef, - convert_funcdef_annotation, - convert_suite, - convert_indented_suite, - convert_namedexpr_test, - convert_test, - convert_test_nocond, - convert_lambda, - convert_boolop, - convert_not_test, - convert_comparison, - convert_comp_op, - convert_star_expr, - convert_binop, - 
convert_factor, - convert_power, - convert_atom_expr, - convert_atom_expr_await, - convert_atom_expr_trailer, - convert_trailer, - convert_trailer_attribute, - convert_trailer_subscriptlist, - convert_subscriptlist, - convert_subscript, - convert_sliceop, - convert_trailer_arglist, - convert_atom, - convert_atom_basic, - convert_atom_parens, - convert_atom_squarebrackets, - convert_atom_curlybraces, - convert_atom_string, - convert_fstring, - convert_fstring_content, - convert_fstring_conversion, - convert_fstring_equality, - convert_fstring_expr, - convert_fstring_format_spec, - convert_atom_ellipses, - convert_testlist_comp_tuple, - convert_testlist_comp_list, - convert_test_or_expr_list, - convert_dictorsetmaker, - convert_arglist, - convert_argument, - convert_arg_assign_comp_for, - convert_star_arg, - convert_sync_comp_for, - convert_comp_for, - convert_comp_if, - convert_yield_expr, - convert_yield_arg, -) - - -def get_grammar_str(version: PythonVersionInfo, future_imports: FrozenSet[str]) -> str: - """ - Returns an BNF-like grammar text that `parso.pgen2.generator.generate_grammar` can - handle. - - While you should generally use `get_grammar` instead, this can be useful for - debugging the grammar. - """ - lines = [] - for p in get_nonterminal_productions(version, future_imports): - lines.append(str(p)) - return "\n".join(lines) + "\n" - - -# TODO: We should probably provide an on-disk cache like parso and lib2to3 do. Because -# of how we're defining our grammar, efficient cache invalidation is harder, though not -# impossible. 
-@lru_cache() -def get_grammar( - version: PythonVersionInfo, - future_imports: Union[FrozenSet[str], AutoConfig], -) -> "Grammar[TokenType]": - if isinstance(future_imports, AutoConfig): - # For easier testing, if not provided assume no __future__ imports - future_imports = frozenset(()) - return generate_grammar(get_grammar_str(version, future_imports), PythonTokenTypes) - - -@lru_cache() -def get_terminal_conversions() -> Mapping[str, TerminalConversion]: - """ - Returns a mapping from terminal type name to the conversion function that should be - called by the parser. - """ - return { - # pyre-fixme[16]: Optional type has no attribute `group`. - re.match("convert_(.*)", fn.__name__).group(1): fn - for fn in _TERMINAL_CONVERSIONS_SEQUENCE - } - - -@lru_cache() -def validate_grammar() -> None: - for fn in _NONTERMINAL_CONVERSIONS_SEQUENCE: - fn_productions = get_productions(fn) - if all(p.name == fn_productions[0].name for p in fn_productions): - # all the production names are the same, ensure that the `convert_` function - # is named correctly - production_name = fn_productions[0].name - expected_name = f"convert_{production_name}" - if fn.__name__ != expected_name: - raise ValueError( - f"The conversion function for '{production_name}' " - + f"must be called '{expected_name}', not '{fn.__name__}'." 
- ) - - -def _get_version_comparison(version: str) -> Tuple[str, PythonVersionInfo]: - if version[:2] in (">=", "<=", "==", "!="): - return (version[:2], parse_version_string(version[2:].strip())) - if version[:1] in (">", "<"): - return (version[:1], parse_version_string(version[1:].strip())) - raise ValueError(f"Invalid version comparison specifier '{version}'") - - -def _compare_versions( - requested_version: PythonVersionInfo, - actual_version: PythonVersionInfo, - comparison: str, -) -> bool: - if comparison == ">=": - return actual_version >= requested_version - if comparison == "<=": - return actual_version <= requested_version - if comparison == "==": - return actual_version == requested_version - if comparison == "!=": - return actual_version != requested_version - if comparison == ">": - return actual_version > requested_version - if comparison == "<": - return actual_version < requested_version - raise ValueError(f"Invalid version comparison specifier '{comparison}'") - - -def _should_include( - requested_version: Optional[str], actual_version: PythonVersionInfo -) -> bool: - if requested_version is None: - return True - for version in requested_version.split(","): - comparison, parsed_version = _get_version_comparison(version.strip()) - if not _compare_versions(parsed_version, actual_version, comparison): - return False - return True - - -def _should_include_future( - future: Optional[str], - future_imports: FrozenSet[str], -) -> bool: - if future is None: - return True - if future[:1] == "!": - return future[1:] not in future_imports - return future in future_imports - - -def get_nonterminal_productions( - version: PythonVersionInfo, future_imports: FrozenSet[str] -) -> Iterator[Production]: - for conversion in _NONTERMINAL_CONVERSIONS_SEQUENCE: - for production in get_productions(conversion): - if not _should_include(production.version, version): - continue - if not _should_include_future(production.future, future_imports): - continue - yield 
production - - -@lru_cache() -def get_nonterminal_conversions( - version: PythonVersionInfo, - future_imports: FrozenSet[str], -) -> Mapping[str, NonterminalConversion]: - """ - Returns a mapping from nonterminal production name to the conversion function that - should be called by the parser. - """ - conversions = {} - for fn in _NONTERMINAL_CONVERSIONS_SEQUENCE: - for fn_production in get_productions(fn): - if not _should_include(fn_production.version, version): - continue - if not _should_include_future(fn_production.future, future_imports): - continue - if fn_production.name in conversions: - raise ValueError( - f"Found duplicate '{fn_production.name}' production in grammar" - ) - conversions[fn_production.name] = fn - - return conversions diff --git a/libcst/_parser/parso/__init__.py b/libcst/_parser/parso/__init__.py deleted file mode 100644 index 7bec24cb1..000000000 --- a/libcst/_parser/parso/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. diff --git a/libcst/_parser/parso/pgen2/__init__.py b/libcst/_parser/parso/pgen2/__init__.py deleted file mode 100644 index 7bec24cb1..000000000 --- a/libcst/_parser/parso/pgen2/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. diff --git a/libcst/_parser/parso/pgen2/generator.py b/libcst/_parser/parso/pgen2/generator.py deleted file mode 100644 index 5e83741bd..000000000 --- a/libcst/_parser/parso/pgen2/generator.py +++ /dev/null @@ -1,364 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. -# -# Modifications: -# Copyright David Halter and Contributors -# Modifications are dual-licensed: MIT and PSF. 
-# 99% of the code is different from pgen2, now. -# -# A fork of `parso.pgen2.generator`. -# https://github.com/davidhalter/parso/blob/master/parso/pgen2/generator.py -# -# The following changes were made: -# - Type stubs were directly applied. -# pyre-unsafe - -""" -This module defines the data structures used to represent a grammar. - -Specifying grammars in pgen is possible with this grammar:: - - grammar: (NEWLINE | rule)* ENDMARKER - rule: NAME ':' rhs NEWLINE - rhs: items ('|' items)* - items: item+ - item: '[' rhs ']' | atom ['+' | '*'] - atom: '(' rhs ')' | NAME | STRING - -This grammar is self-referencing. - -This parser generator (pgen2) was created by Guido Rossum and used for lib2to3. -Most of the code has been refactored to make it more Pythonic. Since this was a -"copy" of the CPython Parser parser "pgen", there was some work needed to make -it more readable. It should also be slightly faster than the original pgen2, -because we made some optimizations. -""" - -from ast import literal_eval -from typing import Any, Generic, Mapping, Sequence, Set, TypeVar, Union - -from libcst._parser.parso.pgen2.grammar_parser import GrammarParser, NFAState - -_TokenTypeT = TypeVar("_TokenTypeT") - - -class DFAPlan: - """ - Plans are used for the parser to create stack nodes and do the proper - DFA state transitions. - """ - - def __init__( - self, next_dfa: "DFAState", dfa_pushes: Sequence["DFAState"] = [] - ) -> None: - self.next_dfa = next_dfa - self.dfa_pushes = dfa_pushes - - def __repr__(self) -> str: - return "%s(%s, %s)" % (self.__class__.__name__, self.next_dfa, self.dfa_pushes) - - -class DFAState(Generic[_TokenTypeT]): - """ - The DFAState object is the core class for pretty much anything. DFAState - are the vertices of an ordered graph while arcs and transitions are the - edges. - - Arcs are the initial edges, where most DFAStates are not connected and - transitions are then calculated to connect the DFA state machines that have - different nonterminals. 
- """ - - def __init__(self, from_rule: str, nfa_set: Set[NFAState], final: NFAState) -> None: - self.from_rule = from_rule - self.nfa_set = nfa_set - self.arcs: Mapping[str, DFAState] = ( - {} - ) # map from terminals/nonterminals to DFAState - # In an intermediary step we set these nonterminal arcs (which has the - # same structure as arcs). These don't contain terminals anymore. - self.nonterminal_arcs: Mapping[str, DFAState] = {} - - # Transitions are basically the only thing that the parser is using - # with is_final. Everyting else is purely here to create a parser. - self.transitions: Mapping[Union[_TokenTypeT, ReservedString], DFAPlan] = {} - self.is_final = final in nfa_set - - def add_arc(self, next_, label): - assert isinstance(label, str) - assert label not in self.arcs - assert isinstance(next_, DFAState) - self.arcs[label] = next_ - - def unifystate(self, old, new): - for label, next_ in self.arcs.items(): - if next_ is old: - self.arcs[label] = new - - def __eq__(self, other): - # Equality test -- ignore the nfa_set instance variable - assert isinstance(other, DFAState) - if self.is_final != other.is_final: - return False - # Can't just return self.arcs == other.arcs, because that - # would invoke this method recursively, with cycles... - if len(self.arcs) != len(other.arcs): - return False - for label, next_ in self.arcs.items(): - if next_ is not other.arcs.get(label): - return False - return True - - def __repr__(self) -> str: - return "<%s: %s is_final=%s>" % ( - self.__class__.__name__, - self.from_rule, - self.is_final, - ) - - -class ReservedString: - """ - Most grammars will have certain keywords and operators that are mentioned - in the grammar as strings (e.g. "if") and not token types (e.g. NUMBER). - This class basically is the former. 
- """ - - def __init__(self, value: str) -> None: - self.value = value - - def __repr__(self) -> str: - return "%s(%s)" % (self.__class__.__name__, self.value) - - -class Grammar(Generic[_TokenTypeT]): - """ - Once initialized, this class supplies the grammar tables for the - parsing engine implemented by parse.py. The parsing engine - accesses the instance variables directly. - - The only important part in this parsers are dfas and transitions between - dfas. - """ - - def __init__( - self, - start_nonterminal: str, - rule_to_dfas: Mapping[str, Sequence[DFAState[_TokenTypeT]]], - reserved_syntax_strings: Mapping[str, ReservedString], - ) -> None: - self.nonterminal_to_dfas = rule_to_dfas - self.reserved_syntax_strings = reserved_syntax_strings - self.start_nonterminal = start_nonterminal - - -def _simplify_dfas(dfas): - """ - This is not theoretically optimal, but works well enough. - Algorithm: repeatedly look for two states that have the same - set of arcs (same labels pointing to the same nodes) and - unify them, until things stop changing. - - dfas is a list of DFAState instances - """ - changes = True - while changes: - changes = False - for i, state_i in enumerate(dfas): - for j in range(i + 1, len(dfas)): - state_j = dfas[j] - if state_i == state_j: - # print " unify", i, j - del dfas[j] - for state in dfas: - state.unifystate(state_j, state_i) - changes = True - break - - -def _make_dfas(start, finish): - """ - Uses the powerset construction algorithm to create DFA states from sets of - NFA states. - - Also does state reduction if some states are not needed. - """ - # To turn an NFA into a DFA, we define the states of the DFA - # to correspond to *sets* of states of the NFA. Then do some - # state reduction. 
- assert isinstance(start, NFAState) - assert isinstance(finish, NFAState) - - def addclosure(nfa_state, base_nfa_set): - assert isinstance(nfa_state, NFAState) - if nfa_state in base_nfa_set: - return - base_nfa_set.add(nfa_state) - for nfa_arc in nfa_state.arcs: - if nfa_arc.nonterminal_or_string is None: - addclosure(nfa_arc.next, base_nfa_set) - - base_nfa_set = set() - addclosure(start, base_nfa_set) - states = [DFAState(start.from_rule, base_nfa_set, finish)] - for state in states: # NB states grows while we're iterating - arcs = {} - # Find state transitions and store them in arcs. - for nfa_state in state.nfa_set: - for nfa_arc in nfa_state.arcs: - if nfa_arc.nonterminal_or_string is not None: - nfa_set = arcs.setdefault(nfa_arc.nonterminal_or_string, set()) - addclosure(nfa_arc.next, nfa_set) - - # Now create the dfa's with no None's in arcs anymore. All Nones have - # been eliminated and state transitions (arcs) are properly defined, we - # just need to create the dfa's. - for nonterminal_or_string, nfa_set in arcs.items(): - for nested_state in states: - if nested_state.nfa_set == nfa_set: - # The DFA state already exists for this rule. - break - else: - nested_state = DFAState(start.from_rule, nfa_set, finish) - states.append(nested_state) - - state.add_arc(nested_state, nonterminal_or_string) - return states # List of DFAState instances; first one is start - - -def generate_grammar(bnf_grammar: str, token_namespace: Any) -> Grammar[Any]: - """ - ``bnf_text`` is a grammar in extended BNF (using * for repetition, + for - at-least-once repetition, [] for optional parts, | for alternatives and () - for grouping). - - It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its - own parser. 
- """ - rule_to_dfas = {} - start_nonterminal = None - for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse(): - dfas = _make_dfas(nfa_a, nfa_z) - _simplify_dfas(dfas) - rule_to_dfas[nfa_a.from_rule] = dfas - - if start_nonterminal is None: - start_nonterminal = nfa_a.from_rule - - reserved_strings = {} - for nonterminal, dfas in rule_to_dfas.items(): - for dfa_state in dfas: - for terminal_or_nonterminal, next_dfa in dfa_state.arcs.items(): - if terminal_or_nonterminal in rule_to_dfas: - dfa_state.nonterminal_arcs[terminal_or_nonterminal] = next_dfa - else: - transition = _make_transition( - token_namespace, reserved_strings, terminal_or_nonterminal - ) - dfa_state.transitions[transition] = DFAPlan(next_dfa) - - _calculate_tree_traversal(rule_to_dfas) - if start_nonterminal is None: - raise ValueError("could not find starting nonterminal!") - return Grammar(start_nonterminal, rule_to_dfas, reserved_strings) - - -def _make_transition(token_namespace, reserved_syntax_strings, label): - """ - Creates a reserved string ("if", "for", "*", ...) or returns the token type - (NUMBER, STRING, ...) for a given grammar terminal. - """ - if label[0].isalpha(): - # A named token (e.g. NAME, NUMBER, STRING) - return getattr(token_namespace, label) - else: - # Either a keyword or an operator - assert label[0] in ('"', "'"), label - assert not label.startswith('"""') and not label.startswith("'''") - value = literal_eval(label) - try: - return reserved_syntax_strings[value] - except KeyError: - r = reserved_syntax_strings[value] = ReservedString(value) - return r - - -def _calculate_tree_traversal(nonterminal_to_dfas): - """ - By this point we know how dfas can move around within a stack node, but we - don't know how we can add a new stack node (nonterminal transitions). - """ - # Map from grammar rule (nonterminal) name to a set of tokens. 
- first_plans = {} - - nonterminals = list(nonterminal_to_dfas.keys()) - nonterminals.sort() - for nonterminal in nonterminals: - if nonterminal not in first_plans: - _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal) - - # Now that we have calculated the first terminals, we are sure that - # there is no left recursion. - - for dfas in nonterminal_to_dfas.values(): - for dfa_state in dfas: - transitions = dfa_state.transitions - for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items(): - for transition, pushes in first_plans[nonterminal].items(): - if transition in transitions: - prev_plan = transitions[transition] - # Make sure these are sorted so that error messages are - # at least deterministic - choices = sorted( - [ - ( - prev_plan.dfa_pushes[0].from_rule - if prev_plan.dfa_pushes - else prev_plan.next_dfa.from_rule - ), - (pushes[0].from_rule if pushes else next_dfa.from_rule), - ] - ) - raise ValueError( - ( - "Rule %s is ambiguous; given a %s token, we " - + "can't determine if we should evaluate %s or %s." - ) - % ((dfa_state.from_rule, transition) + tuple(choices)) - ) - transitions[transition] = DFAPlan(next_dfa, pushes) - - -def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal): - """ - Calculates the first plan in the first_plans dictionary for every given - nonterminal. This is going to be used to know when to create stack nodes. - """ - dfas = nonterminal_to_dfas[nonterminal] - new_first_plans = {} - first_plans[nonterminal] = None # dummy to detect left recursion - # We only need to check the first dfa. All the following ones are not - # interesting to find first terminals. - state = dfas[0] - for transition, next_ in state.transitions.items(): - # It's a string. We have finally found a possible first token. 
- new_first_plans[transition] = [next_.next_dfa] - - for nonterminal2, next_ in state.nonterminal_arcs.items(): - # It's a nonterminal and we have either a left recursion issue - # in the grammar or we have to recurse. - try: - first_plans2 = first_plans[nonterminal2] - except KeyError: - first_plans2 = _calculate_first_plans( - nonterminal_to_dfas, first_plans, nonterminal2 - ) - else: - if first_plans2 is None: - raise ValueError("left recursion for rule %r" % nonterminal) - - for t, pushes in first_plans2.items(): - new_first_plans[t] = [next_] + pushes - - first_plans[nonterminal] = new_first_plans - return new_first_plans diff --git a/libcst/_parser/parso/pgen2/grammar_parser.py b/libcst/_parser/parso/pgen2/grammar_parser.py deleted file mode 100644 index 0d30199d3..000000000 --- a/libcst/_parser/parso/pgen2/grammar_parser.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. -# -# Modifications: -# Copyright David Halter and Contributors -# Modifications are dual-licensed: MIT and PSF. -# 99% of the code is different from pgen2, now. -# -# A fork of `parso.pgen2.grammar_parser`. -# https://github.com/davidhalter/parso/blob/master/parso/pgen2/grammar_parser.py -# -# The following changes were made: -# - Type stubs were directly applied. 
-# pyre-unsafe - -from typing import Generator, List, Optional, Tuple - -from libcst._parser.parso.python.token import PythonTokenTypes -from libcst._parser.parso.python.tokenize import tokenize -from libcst._parser.parso.utils import parse_version_string - - -class NFAArc: - def __init__(self, next_: "NFAState", nonterminal_or_string: Optional[str]) -> None: - self.next: NFAState = next_ - self.nonterminal_or_string: Optional[str] = nonterminal_or_string - - def __repr__(self) -> str: - return "<%s: %s>" % (self.__class__.__name__, self.nonterminal_or_string) - - -class NFAState: - def __init__(self, from_rule: str) -> None: - self.from_rule = from_rule - self.arcs: List[NFAArc] = [] - - def add_arc( - self, next_: "NFAState", nonterminal_or_string: Optional[str] = None - ) -> None: - self.arcs.append(NFAArc(next_, nonterminal_or_string)) - - def __repr__(self) -> str: - return "<%s: from %s>" % (self.__class__.__name__, self.from_rule) - - -class GrammarParser: - """ - The parser for Python grammar files. - """ - - def __init__(self, bnf_grammar: str) -> None: - self._bnf_grammar: str = bnf_grammar - self.generator = tokenize(bnf_grammar, version_info=parse_version_string("3.6")) - self._gettoken() # Initialize lookahead - - def parse(self) -> Generator[Tuple[NFAState, NFAState], None, None]: - # grammar: (NEWLINE | rule)* ENDMARKER - while self.type != PythonTokenTypes.ENDMARKER: - while self.type == PythonTokenTypes.NEWLINE: - self._gettoken() - - # rule: NAME ':' rhs NEWLINE - # pyre-ignore Pyre is unhappy with the fact that we haven't put - # _current_rule_name in the constructor. 
- self._current_rule_name = self._expect(PythonTokenTypes.NAME) - self._expect(PythonTokenTypes.OP, ":") - - a, z = self._parse_rhs() - self._expect(PythonTokenTypes.NEWLINE) - - yield a, z - - def _parse_rhs(self): - # rhs: items ('|' items)* - a, z = self._parse_items() - if self.value != "|": - return a, z - else: - aa = NFAState(self._current_rule_name) - zz = NFAState(self._current_rule_name) - while True: - # Add the possibility to go into the state of a and come back - # to finish. - aa.add_arc(a) - z.add_arc(zz) - if self.value != "|": - break - - self._gettoken() - a, z = self._parse_items() - return aa, zz - - def _parse_items(self): - # items: item+ - a, b = self._parse_item() - while self.type in ( - PythonTokenTypes.NAME, - PythonTokenTypes.STRING, - ) or self.value in ("(", "["): - c, d = self._parse_item() - # Need to end on the next item. - b.add_arc(c) - b = d - return a, b - - def _parse_item(self): - # item: '[' rhs ']' | atom ['+' | '*'] - if self.value == "[": - self._gettoken() - a, z = self._parse_rhs() - self._expect(PythonTokenTypes.OP, "]") - # Make it also possible that there is no token and change the - # state. - a.add_arc(z) - return a, z - else: - a, z = self._parse_atom() - value = self.value - if value not in ("+", "*"): - return a, z - self._gettoken() - # Make it clear that we can go back to the old state and repeat. - z.add_arc(a) - if value == "+": - return a, z - else: - # The end state is the same as the beginning, nothing must - # change. - return a, a - - def _parse_atom(self): - # atom: '(' rhs ')' | NAME | STRING - if self.value == "(": - self._gettoken() - a, z = self._parse_rhs() - self._expect(PythonTokenTypes.OP, ")") - return a, z - elif self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING): - a = NFAState(self._current_rule_name) - z = NFAState(self._current_rule_name) - # Make it clear that the state transition requires that value. 
- a.add_arc(z, self.value) - self._gettoken() - return a, z - else: - self._raise_error( - "expected (...) or NAME or STRING, got %s/%s", self.type, self.value - ) - - def _expect(self, type_, value=None): - if self.type != type_: - self._raise_error("expected %s, got %s [%s]", type_, self.type, self.value) - if value is not None and self.value != value: - self._raise_error("expected %s, got %s", value, self.value) - value = self.value - self._gettoken() - return value - - def _gettoken(self) -> None: - tup = next(self.generator) - self.type, self.value, self.begin, prefix = tup - - def _raise_error(self, msg: str, *args: object) -> None: - if args: - try: - msg = msg % args - except Exception: - msg = " ".join([msg] + list(map(str, args))) - line = self._bnf_grammar.splitlines()[self.begin[0] - 1] - raise SyntaxError(msg, ("", self.begin[0], self.begin[1], line)) diff --git a/libcst/_parser/parso/python/__init__.py b/libcst/_parser/parso/python/__init__.py deleted file mode 100644 index 7bec24cb1..000000000 --- a/libcst/_parser/parso/python/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. diff --git a/libcst/_parser/parso/python/py_token.py b/libcst/_parser/parso/python/py_token.py deleted file mode 100644 index 204ce94d9..000000000 --- a/libcst/_parser/parso/python/py_token.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. -# -# Modifications: -# Copyright David Halter and Contributors -# Modifications are dual-licensed: MIT and PSF. -# 99% of the code is different from pgen2, now. -# -# A fork of `parso.python.token`. 
-# https://github.com/davidhalter/parso/blob/master/parso/python/token.py -# -# The following changes were made: -# - Explicit TokenType references instead of dynamic creation. -# - Use dataclasses instead of raw classes. -# pyre-unsafe - -from dataclasses import dataclass - - -@dataclass(frozen=True) -class TokenType: - name: str - contains_syntax: bool = False - - def __repr__(self) -> str: - return "%s(%s)" % (self.__class__.__name__, self.name) - - -class PythonTokenTypes: - """ - Basically an enum, but Python 2 doesn't have enums in the standard library. - """ - - STRING: TokenType = TokenType("STRING") - NUMBER: TokenType = TokenType("NUMBER") - NAME: TokenType = TokenType("NAME", contains_syntax=True) - ERRORTOKEN: TokenType = TokenType("ERRORTOKEN") - NEWLINE: TokenType = TokenType("NEWLINE") - INDENT: TokenType = TokenType("INDENT") - DEDENT: TokenType = TokenType("DEDENT") - ERROR_DEDENT: TokenType = TokenType("ERROR_DEDENT") - ASYNC: TokenType = TokenType("ASYNC") - AWAIT: TokenType = TokenType("AWAIT") - FSTRING_STRING: TokenType = TokenType("FSTRING_STRING") - FSTRING_START: TokenType = TokenType("FSTRING_START") - FSTRING_END: TokenType = TokenType("FSTRING_END") - OP: TokenType = TokenType("OP", contains_syntax=True) - ENDMARKER: TokenType = TokenType("ENDMARKER") diff --git a/libcst/_parser/parso/python/token.py b/libcst/_parser/parso/python/token.py deleted file mode 100644 index 164262b92..000000000 --- a/libcst/_parser/parso/python/token.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -try: - from libcst_native import token_type as native_token_type - - TokenType = native_token_type.TokenType - - class PythonTokenTypes: - STRING: TokenType = native_token_type.STRING - NUMBER: TokenType = native_token_type.NUMBER - NAME: TokenType = native_token_type.NAME - NEWLINE: TokenType = native_token_type.NEWLINE - INDENT: TokenType = native_token_type.INDENT - DEDENT: TokenType = native_token_type.DEDENT - ASYNC: TokenType = native_token_type.ASYNC - AWAIT: TokenType = native_token_type.AWAIT - FSTRING_STRING: TokenType = native_token_type.FSTRING_STRING - FSTRING_START: TokenType = native_token_type.FSTRING_START - FSTRING_END: TokenType = native_token_type.FSTRING_END - OP: TokenType = native_token_type.OP - ENDMARKER: TokenType = native_token_type.ENDMARKER - # unused dummy tokens for backwards compat with the parso tokenizer - ERRORTOKEN: TokenType = native_token_type.ERRORTOKEN - ERROR_DEDENT: TokenType = native_token_type.ERROR_DEDENT - -except ImportError: - from libcst._parser.parso.python.py_token import ( # noqa: F401 - PythonTokenTypes, - TokenType, - ) diff --git a/libcst/_parser/parso/python/tokenize.py b/libcst/_parser/parso/python/tokenize.py deleted file mode 100644 index 7f7998638..000000000 --- a/libcst/_parser/parso/python/tokenize.py +++ /dev/null @@ -1,1155 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. -# -# Modifications: -# Copyright David Halter and Contributors -# Modifications are dual-licensed: MIT and PSF. -# 99% of the code is different from pgen2, now. -# -# A fork of `parso.python.tokenize`. 
-# https://github.com/davidhalter/parso/blob/master/parso/python/tokenize.py -# -# The following changes were made: -# - Changes to be compatible with PythonTokenTypes -# - Removed main section -# - Applied type stubs directly -# - Removed Python 2 shims -# - Added support for Python 3.6 ASYNC/AWAIT hacks -# -# -*- coding: utf-8 -*- -# This tokenizer has been copied from the ``tokenize.py`` standard library -# tokenizer. The reason was simple: The standard library tokenizer fails -# if the indentation is not right. To make it possible to do error recovery the -# tokenizer needed to be rewritten. -# -# Basically this is a stripped down version of the standard library module, so -# you can read the documentation there. Additionally we included some speed and -# memory optimizations here. -# pyre-unsafe -from __future__ import absolute_import - -import itertools as _itertools -import re -import sys -from codecs import BOM_UTF8 -from collections import namedtuple -from dataclasses import dataclass -from typing import Dict, Generator, Iterable, Optional, Pattern, Set, Tuple - -from libcst import CSTLogicError -from libcst._parser.parso.python.token import PythonTokenTypes -from libcst._parser.parso.utils import PythonVersionInfo, split_lines - -# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) -MAX_UNICODE = "\U0010ffff" -BOM_UTF8_STRING = BOM_UTF8.decode("utf-8") - -STRING = PythonTokenTypes.STRING -NAME = PythonTokenTypes.NAME -NUMBER = PythonTokenTypes.NUMBER -OP = PythonTokenTypes.OP -NEWLINE = PythonTokenTypes.NEWLINE -INDENT = PythonTokenTypes.INDENT -DEDENT = PythonTokenTypes.DEDENT -ASYNC = PythonTokenTypes.ASYNC -AWAIT = PythonTokenTypes.AWAIT -ENDMARKER = PythonTokenTypes.ENDMARKER -ERRORTOKEN = PythonTokenTypes.ERRORTOKEN -ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT -FSTRING_START = PythonTokenTypes.FSTRING_START -FSTRING_STRING = PythonTokenTypes.FSTRING_STRING -FSTRING_END = PythonTokenTypes.FSTRING_END - - -@dataclass(frozen=True) -class 
TokenCollection: - pseudo_token: Pattern - single_quoted: Set[str] - triple_quoted: Set[str] - endpats: Dict[str, Pattern] - whitespace: Pattern - fstring_pattern_map: Dict[str, str] - always_break_tokens: Set[str] - - -_token_collection_cache: Dict[PythonVersionInfo, TokenCollection] = {} - - -def group(*choices: str, **kwargs: object) -> str: - capture = kwargs.pop("capture", False) # Python 2, arrghhhhh :( - assert not kwargs - - start = "(" - if not capture: - start += "?:" - return start + "|".join(choices) + ")" - - -def maybe(*choices: str) -> str: - return group(*choices) + "?" - - -# Return the empty string, plus all of the valid string prefixes. -def _all_string_prefixes( - version_info: PythonVersionInfo, - include_fstring: bool = False, - only_fstring: bool = False, -) -> Set[str]: - def different_case_versions(prefix): - for s in _itertools.product(*[(c, c.upper()) for c in prefix]): - yield "".join(s) - - # The valid string prefixes. Only contain the lower case versions, - # and don't contain any permuations (include 'fr', but not - # 'rf'). The various permutations will be generated. - valid_string_prefixes = ["b", "r"] - if version_info >= (3, 0): - valid_string_prefixes.append("br") - if version_info < (3, 0) or version_info >= (3, 3): - valid_string_prefixes.append("u") - - result = {""} - if version_info >= (3, 6) and include_fstring: - f = ["f", "fr"] - if only_fstring: - valid_string_prefixes = f - result = set() - else: - valid_string_prefixes += f - elif only_fstring: - return set() - - # if we add binary f-strings, add: ['fb', 'fbr'] - for prefix in valid_string_prefixes: - for t in _itertools.permutations(prefix): - # create a list with upper and lower versions of each - # character - result.update(different_case_versions(t)) - if version_info <= (2, 7): - # In Python 2 the order cannot just be random. 
- result.update(different_case_versions("ur")) - result.update(different_case_versions("br")) - return result - - -def _compile(expr: str) -> Pattern: - return re.compile(expr, re.UNICODE) - - -def _get_token_collection(version_info: PythonVersionInfo) -> TokenCollection: - try: - return _token_collection_cache[version_info] - except KeyError: - _token_collection_cache[version_info] = result = _create_token_collection( - version_info - ) - return result - - -fstring_raw_string = _compile(r"(?:[^{}]+|\{\{|\}\})+") - -unicode_character_name = r"[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*" -fstring_string_single_line = _compile( - r"(?:\{\{|\}\}|\\N\{" - + unicode_character_name - + r"\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+" -) -fstring_string_multi_line = _compile( - r"(?:\{\{|\}\}|\\N\{" + unicode_character_name + r"\}|\\[^N]|[^{}\\])+" -) - -fstring_format_spec_single_line = _compile(r"(?:\\(?:\r\n?|\n)|[^{}\r\n])+") -fstring_format_spec_multi_line = _compile(r"[^{}]+") - - -def _create_token_collection( # noqa: C901 - version_info: PythonVersionInfo, -) -> TokenCollection: - # Note: we use unicode matching for names ("\w") but ascii matching for - # number literals. - Whitespace = r"[ \f\t]*" - Comment = r"#[^\r\n]*" - # Python 2 is pretty much not working properly anymore, we just ignore - # parsing unicode properly, which is fine, I guess. - if version_info.major == 2: - Name = r"([A-Za-z_0-9]+)" - elif sys.version_info[0] == 2: - # Unfortunately the regex engine cannot deal with the regex below, so - # just use this one. 
- Name = r"(\w+)" - else: - Name = "([A-Za-z_0-9\u0080-" + MAX_UNICODE + "]+)" - - if version_info >= (3, 6): - Hexnumber = r"0[xX](?:_?[0-9a-fA-F])+" - Binnumber = r"0[bB](?:_?[01])+" - Octnumber = r"0[oO](?:_?[0-7])+" - Decnumber = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)" - Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) - Exponent = r"[eE][-+]?[0-9](?:_?[0-9])*" - Pointfloat = group( - r"[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?", r"\.[0-9](?:_?[0-9])*" - ) + maybe(Exponent) - Expfloat = r"[0-9](?:_?[0-9])*" + Exponent - Floatnumber = group(Pointfloat, Expfloat) - Imagnumber = group(r"[0-9](?:_?[0-9])*[jJ]", Floatnumber + r"[jJ]") - else: - Hexnumber = r"0[xX][0-9a-fA-F]+" - Binnumber = r"0[bB][01]+" - if version_info >= (3, 0): - Octnumber = r"0[oO][0-7]+" - else: - Octnumber = "0[oO]?[0-7]+" - Decnumber = r"(?:0+|[1-9][0-9]*)" - Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) - if version_info.major < 3: - Intnumber += "[lL]?" - Exponent = r"[eE][-+]?[0-9]+" - Pointfloat = group(r"[0-9]+\.[0-9]*", r"\.[0-9]+") + maybe(Exponent) - Expfloat = r"[0-9]+" + Exponent - Floatnumber = group(Pointfloat, Expfloat) - Imagnumber = group(r"[0-9]+[jJ]", Floatnumber + r"[jJ]") - Number = group(Imagnumber, Floatnumber, Intnumber) - - # Note that since _all_string_prefixes includes the empty string, - # StringPrefix can be the empty string (making it optional). - possible_prefixes = _all_string_prefixes(version_info) - StringPrefix = group(*possible_prefixes) - StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True)) - fstring_prefixes = _all_string_prefixes( - version_info, include_fstring=True, only_fstring=True - ) - FStringStart = group(*fstring_prefixes) - - # Tail end of ' string. - Single = r"(?:\\.|[^'\\])*'" - # Tail end of " string. - Double = r'(?:\\.|[^"\\])*"' - # Tail end of ''' string. - Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''" - # Tail end of """ string. 
- Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""' - Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""') - - # Because of leftmost-then-longest match semantics, be sure to put the - # longest operators first (e.g., if = came before ==, == would get - # recognized as two instances of =). - Operator = group( - r"\*\*=?", r">>=?", r"<<=?", r"//=?", r"->", r"[+\-*/%&@`|^!=<>]=?", r"~" - ) - - Bracket = "[][(){}]" - - special_args = [r"\r\n?", r"\n", r"[;.,@]"] - if version_info >= (3, 0): - special_args.insert(0, r"\.\.\.") - if version_info >= (3, 8): - special_args.insert(0, ":=?") - else: - special_args.insert(0, ":") - Special = group(*special_args) - - Funny = group(Operator, Bracket, Special) - - # First (or only) line of ' or " string. - ContStr = group( - StringPrefix - + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*" - + group("'", r"\\(?:\r\n?|\n)"), - StringPrefix - + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*' - + group('"', r"\\(?:\r\n?|\n)"), - ) - pseudo_extra_pool = [Comment, Triple] - all_quotes = '"', "'", '"""', "'''" - if fstring_prefixes: - pseudo_extra_pool.append(FStringStart + group(*all_quotes)) - - PseudoExtras = group(r"\\(?:\r\n?|\n)|\Z", *pseudo_extra_pool) - PseudoToken = group(Whitespace, capture=True) + group( - PseudoExtras, Number, Funny, ContStr, Name, capture=True - ) - - # For a given string prefix plus quotes, endpats maps it to a regex - # to match the remainder of that string. _prefix can be empty, for - # a normal single or triple quoted string (with no prefix). - endpats = {} - for _prefix in possible_prefixes: - endpats[_prefix + "'"] = _compile(Single) - endpats[_prefix + '"'] = _compile(Double) - endpats[_prefix + "'''"] = _compile(Single3) - endpats[_prefix + '"""'] = _compile(Double3) - - # A set of all of the single and triple quoted string prefixes, - # including the opening quotes. 
- single_quoted = set() - triple_quoted = set() - fstring_pattern_map = {} - for t in possible_prefixes: - for quote in '"', "'": - single_quoted.add(t + quote) - - for quote in '"""', "'''": - triple_quoted.add(t + quote) - - for t in fstring_prefixes: - for quote in all_quotes: - fstring_pattern_map[t + quote] = quote - - pseudo_token_compiled = _compile(PseudoToken) - return TokenCollection( - pseudo_token_compiled, - single_quoted, - triple_quoted, - endpats, - _compile(Whitespace), - fstring_pattern_map, - { - ";", - "import", - "class", - "def", - "try", - "except", - "finally", - "while", - "with", - "return", - }, - ) - - -class Token(namedtuple("Token", ["type", "string", "start_pos", "prefix"])): - @property - def end_pos(self): - lines = split_lines(self.string) - if len(lines) > 1: - return self.start_pos[0] + len(lines) - 1, 0 - else: - return self.start_pos[0], self.start_pos[1] + len(self.string) - - -class PythonToken(Token): - def __repr__(self): - return "TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)" % self._replace( - type=self.type.name - ) - - -class FStringNode: - def __init__(self, quote, raw): - self.quote = quote - self.raw = raw - self.parentheses_count = 0 - self.previous_lines = "" - self.last_string_start_pos = None - # In the syntax there can be multiple format_spec's nested: - # {x:{y:3}} - self.format_spec_count = 0 - - def open_parentheses(self, character): - self.parentheses_count += 1 - - def close_parentheses(self, character): - self.parentheses_count -= 1 - if self.parentheses_count == 0: - # No parentheses means that the format spec is also finished. 
- self.format_spec_count = 0 - - def allow_multiline(self): - return len(self.quote) == 3 - - def is_in_expr(self): - return self.parentheses_count > self.format_spec_count - - def is_in_format_spec(self): - return not self.is_in_expr() and self.format_spec_count - - -def _close_fstring_if_necessary(fstring_stack, string, start_pos, additional_prefix): - for fstring_stack_index, node in enumerate(fstring_stack): - if string.startswith(node.quote): - token = PythonToken( - FSTRING_END, node.quote, start_pos, prefix=additional_prefix - ) - additional_prefix = "" - assert not node.previous_lines - del fstring_stack[fstring_stack_index:] - return token, "", len(node.quote) - return None, additional_prefix, 0 - - -def _find_fstring_string(endpats, fstring_stack, line, lnum, pos): - tos = fstring_stack[-1] - allow_multiline = tos.allow_multiline() - if tos.is_in_format_spec(): - if allow_multiline: - regex = fstring_format_spec_multi_line - else: - regex = fstring_format_spec_single_line - else: - if tos.raw: - regex = fstring_raw_string - elif allow_multiline: - regex = fstring_string_multi_line - else: - regex = fstring_string_single_line - - match = regex.match(line, pos) - if match is None: - return tos.previous_lines, pos - - if not tos.previous_lines: - tos.last_string_start_pos = (lnum, pos) - - string = match.group(0) - for fstring_stack_node in fstring_stack: - end_match = endpats[fstring_stack_node.quote].match(string) - if end_match is not None: - string = end_match.group(0)[: -len(fstring_stack_node.quote)] - - new_pos = pos - new_pos += len(string) - # even if allow_multiline is False, we still need to check for trailing - # newlines, because a single-line f-string can contain line continuations - if string.endswith("\n") or string.endswith("\r"): - tos.previous_lines += string - string = "" - else: - string = tos.previous_lines + string - - return string, new_pos - - -def tokenize( - code: str, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = 
(1, 0) -) -> Generator[PythonToken, None, None]: - """Generate tokens from a source code (string).""" - lines = split_lines(code, keepends=True) - return tokenize_lines(lines, version_info, start_pos=start_pos) - - -def tokenize_lines( # noqa: C901 - lines: Iterable[str], - version_info: PythonVersionInfo, - start_pos: Tuple[int, int] = (1, 0), -) -> Generator[PythonToken, None, None]: - token_collection = _get_token_collection(version_info) - if version_info >= PythonVersionInfo(3, 7): - return _tokenize_lines_py37_or_above( - lines, version_info, token_collection, start_pos=start_pos - ) - else: - return _tokenize_lines_py36_or_below( - lines, version_info, token_collection, start_pos=start_pos - ) - - -def _tokenize_lines_py36_or_below( # noqa: C901 - lines: Iterable[str], - version_info: PythonVersionInfo, - token_collection: TokenCollection, - start_pos: Tuple[int, int] = (1, 0), -) -> Generator[PythonToken, None, None]: - """ - A heavily modified Python standard library tokenizer. - - Additionally to the default information, yields also the prefix of each - token. This idea comes from lib2to3. The prefix contains all information - that is irrelevant for the parser like newlines in parentheses or comments. - """ - - paren_level = 0 # count parentheses - indents = [0] - max = 0 - numchars = "0123456789" - contstr = "" - contline = None - # We start with a newline. This makes indent at the first position - # possible. It's not valid Python, but still better than an INDENT in the - # second line (and not in the first). This makes quite a few things in - # Jedi's fast parser possible. 
- new_line = True - prefix = "" # Should never be required, but here for safety - endprog = None # Should not be required, but here for lint - contstr_start: Optional[Tuple[int, int]] = None - additional_prefix = "" - first = True - lnum = start_pos[0] - 1 - fstring_stack = [] - # stash and async_* are used for async/await parsing - stashed: Optional[PythonToken] = None - async_def: bool = False - async_def_indent: int = 0 - async_def_newline: bool = False - - def dedent_if_necessary(start): - nonlocal stashed - nonlocal async_def - nonlocal async_def_indent - nonlocal async_def_newline - - while start < indents[-1]: - if start > indents[-2]: - yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "") - break - if stashed is not None: - yield stashed - stashed = None - if async_def and async_def_newline and async_def_indent >= indents[-1]: - # We exited an 'async def' block, so stop tracking for indents - async_def = False - async_def_newline = False - async_def_indent = 0 - yield PythonToken(DEDENT, "", spos, "") - indents.pop() - - for line in lines: # loop over lines in stream - lnum += 1 - pos = 0 - max = len(line) - if first: - if line.startswith(BOM_UTF8_STRING): - additional_prefix = BOM_UTF8_STRING - line = line[1:] - max = len(line) - - # Fake that the part before was already parsed. 
- line = "^" * start_pos[1] + line - pos = start_pos[1] - max += start_pos[1] - - first = False - - if contstr: # continued string - if endprog is None: - raise CSTLogicError("Logic error!") - endmatch = endprog.match(line) - if endmatch: - pos = endmatch.end(0) - if contstr_start is None: - raise CSTLogicError("Logic error!") - if stashed is not None: - raise CSTLogicError("Logic error!") - yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix) - contstr = "" - contline = None - else: - contstr = contstr + line - contline = contline + line - continue - - while pos < max: - if fstring_stack: - tos = fstring_stack[-1] - if not tos.is_in_expr(): - string, pos = _find_fstring_string( - token_collection.endpats, fstring_stack, line, lnum, pos - ) - if string: - if stashed is not None: - raise CSTLogicError("Logic error!") - yield PythonToken( - FSTRING_STRING, - string, - tos.last_string_start_pos, - # Never has a prefix because it can start anywhere and - # include whitespace. - prefix="", - ) - tos.previous_lines = "" - continue - if pos == max: - break - - rest = line[pos:] - ( - fstring_end_token, - additional_prefix, - quote_length, - ) = _close_fstring_if_necessary( - fstring_stack, rest, (lnum, pos), additional_prefix - ) - pos += quote_length - if fstring_end_token is not None: - if stashed is not None: - raise CSTLogicError("Logic error!") - yield fstring_end_token - continue - - pseudomatch = token_collection.pseudo_token.match(line, pos) - if not pseudomatch: # scan for tokens - match = token_collection.whitespace.match(line, pos) - if pos == 0: - # pyre-fixme[16]: `Optional` has no attribute `end`. - yield from dedent_if_necessary(match.end()) - pos = match.end() - new_line = False - yield PythonToken( - ERRORTOKEN, - line[pos], - (lnum, pos), - # pyre-fixme[16]: `Optional` has no attribute `group`. 
- additional_prefix + match.group(0), - ) - additional_prefix = "" - pos += 1 - continue - - prefix = additional_prefix + pseudomatch.group(1) - additional_prefix = "" - start, pos = pseudomatch.span(2) - spos = (lnum, start) - token = pseudomatch.group(2) - if token == "": - assert prefix - additional_prefix = prefix - # This means that we have a line with whitespace/comments at - # the end, which just results in an endmarker. - break - initial = token[0] - - if new_line and initial not in "\r\n\\#": - new_line = False - if paren_level == 0 and not fstring_stack: - i = 0 - indent_start = start - while line[i] == "\f": - i += 1 - # TODO don't we need to change spos as well? - indent_start -= 1 - if indent_start > indents[-1]: - if stashed is not None: - yield stashed - stashed = None - yield PythonToken(INDENT, "", spos, "") - indents.append(indent_start) - yield from dedent_if_necessary(indent_start) - - if initial in numchars or ( # ordinary number - initial == "." and token != "." and token != "..." - ): - if stashed is not None: - yield stashed - stashed = None - yield PythonToken(NUMBER, token, spos, prefix) - elif pseudomatch.group(3) is not None: # ordinary name - if token in token_collection.always_break_tokens: - fstring_stack[:] = [] - paren_level = 0 - # We only want to dedent if the token is on a new line. - if re.match(r"[ \f\t]*$", line[:start]): - while True: - indent = indents.pop() - if indent > start: - if ( - async_def - and async_def_newline - and async_def_indent >= indent - ): - # We dedented outside of an 'async def' block. - async_def = False - async_def_newline = False - async_def_indent = 0 - if stashed is not None: - yield stashed - stashed = None - yield PythonToken(DEDENT, "", spos, "") - else: - indents.append(indent) - break - if str.isidentifier(token): - should_yield_identifier = True - if token in ("async", "await") and async_def: - # We're inside an 'async def' block, all async/await are - # tokens. 
- if token == "async": - yield PythonToken(ASYNC, token, spos, prefix) - else: - yield PythonToken(AWAIT, token, spos, prefix) - should_yield_identifier = False - - # We are possibly starting an 'async def' section - elif token == "async" and not stashed: - stashed = PythonToken(NAME, token, spos, prefix) - should_yield_identifier = False - - # We actually are starting an 'async def' section - elif ( - token == "def" - and stashed is not None - and stashed[0] is NAME - and stashed[1] == "async" - ): - async_def = True - async_def_indent = indents[-1] - yield PythonToken(ASYNC, stashed[1], stashed[2], stashed[3]) - stashed = None - - # We are either not stashed, or we output an ASYNC token above. - elif stashed: - yield stashed - stashed = None - - # If we didn't bail early due to possibly recognizing an 'async def', - # then we should yield this token as normal. - if should_yield_identifier: - yield PythonToken(NAME, token, spos, prefix) - else: - yield from _split_illegal_unicode_name(token, spos, prefix) - elif initial in "\r\n": - if any(not f.allow_multiline() for f in fstring_stack): - # Would use fstring_stack.clear, but that's not available - # in Python 2. 
- fstring_stack[:] = [] - - if not new_line and paren_level == 0 and not fstring_stack: - if async_def: - async_def_newline = True - if stashed: - yield stashed - stashed = None - yield PythonToken(NEWLINE, token, spos, prefix) - else: - additional_prefix = prefix + token - new_line = True - elif initial == "#": # Comments - assert not token.endswith("\n") - additional_prefix = prefix + token - elif token in token_collection.triple_quoted: - endprog = token_collection.endpats[token] - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - pos = endmatch.end(0) - token = line[start:pos] - if stashed is not None: - yield stashed - stashed = None - yield PythonToken(STRING, token, spos, prefix) - else: - contstr_start = (lnum, start) # multiple lines - contstr = line[start:] - contline = line - break - - # Check up to the first 3 chars of the token to see if - # they're in the single_quoted set. If so, they start - # a string. - # We're using the first 3, because we're looking for - # "rb'" (for example) at the start of the token. If - # we switch to longer prefixes, this needs to be - # adjusted. - # Note that initial == token[:1]. - # Also note that single quote checking must come after - # triple quote checking (above). - elif ( - initial in token_collection.single_quoted - or token[:2] in token_collection.single_quoted - or token[:3] in token_collection.single_quoted - ): - if token[-1] in "\r\n": # continued string - # This means that a single quoted string ends with a - # backslash and is continued. - contstr_start = lnum, start - endprog = ( - token_collection.endpats.get(initial) - or token_collection.endpats.get(token[1]) - or token_collection.endpats.get(token[2]) - ) - contstr = line[start:] - contline = line - break - else: # ordinary string - if stashed is not None: - yield stashed - stashed = None - yield PythonToken(STRING, token, spos, prefix) - elif ( - token in token_collection.fstring_pattern_map - ): # The start of an fstring. 
- fstring_stack.append( - FStringNode( - token_collection.fstring_pattern_map[token], - "r" in token or "R" in token, - ) - ) - if stashed is not None: - yield stashed - stashed = None - yield PythonToken(FSTRING_START, token, spos, prefix) - elif initial == "\\" and line[start:] in ( - "\\\n", - "\\\r\n", - "\\\r", - ): # continued stmt - additional_prefix += prefix + line[start:] - break - else: - if token in "([{": - if fstring_stack: - fstring_stack[-1].open_parentheses(token) - else: - paren_level += 1 - elif token in ")]}": - if fstring_stack: - fstring_stack[-1].close_parentheses(token) - else: - if paren_level: - paren_level -= 1 - elif ( - token == ":" - and fstring_stack - and fstring_stack[-1].parentheses_count - - fstring_stack[-1].format_spec_count - == 1 - ): - fstring_stack[-1].format_spec_count += 1 - - if stashed is not None: - yield stashed - stashed = None - yield PythonToken(OP, token, spos, prefix) - - if contstr: - yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix) - if contstr.endswith("\n") or contstr.endswith("\r"): - new_line = True - - if stashed is not None: - yield stashed - stashed = None - - end_pos = lnum, max - # As the last position we just take the maximally possible position. We - # remove -1 for the last new line. - for indent in indents[1:]: - yield PythonToken(DEDENT, "", end_pos, "") - yield PythonToken(ENDMARKER, "", end_pos, additional_prefix) - - -def _tokenize_lines_py37_or_above( # noqa: C901 - lines: Iterable[str], - version_info: PythonVersionInfo, - token_collection: TokenCollection, - start_pos: Tuple[int, int] = (1, 0), -) -> Generator[PythonToken, None, None]: - """ - A heavily modified Python standard library tokenizer. - - Additionally to the default information, yields also the prefix of each - token. This idea comes from lib2to3. The prefix contains all information - that is irrelevant for the parser like newlines in parentheses or comments. 
- """ - - def dedent_if_necessary(start): - while start < indents[-1]: - if start > indents[-2]: - yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "") - break - yield PythonToken(DEDENT, "", spos, "") - indents.pop() - - paren_level = 0 # count parentheses - indents = [0] - max = 0 - numchars = "0123456789" - contstr = "" - contline = None - # We start with a newline. This makes indent at the first position - # possible. It's not valid Python, but still better than an INDENT in the - # second line (and not in the first). This makes quite a few things in - # Jedi's fast parser possible. - new_line = True - prefix = "" # Should never be required, but here for safety - endprog = None # Should not be required, but here for lint - contstr_start: Optional[Tuple[int, int]] = None - additional_prefix = "" - first = True - lnum = start_pos[0] - 1 - fstring_stack = [] - for line in lines: # loop over lines in stream - lnum += 1 - pos = 0 - max = len(line) - if first: - if line.startswith(BOM_UTF8_STRING): - additional_prefix = BOM_UTF8_STRING - line = line[1:] - max = len(line) - - # Fake that the part before was already parsed. 
- line = "^" * start_pos[1] + line - pos = start_pos[1] - max += start_pos[1] - - first = False - - if contstr: # continued string - if endprog is None: - raise CSTLogicError("Logic error!") - endmatch = endprog.match(line) - if endmatch: - pos = endmatch.end(0) - if contstr_start is None: - raise CSTLogicError("Logic error!") - yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix) - contstr = "" - contline = None - else: - contstr = contstr + line - contline = contline + line - continue - - while pos < max: - if fstring_stack: - tos = fstring_stack[-1] - if not tos.is_in_expr(): - string, pos = _find_fstring_string( - token_collection.endpats, fstring_stack, line, lnum, pos - ) - if string: - yield PythonToken( - FSTRING_STRING, - string, - tos.last_string_start_pos, - # Never has a prefix because it can start anywhere and - # include whitespace. - prefix="", - ) - tos.previous_lines = "" - continue - if pos == max: - break - - rest = line[pos:] - ( - fstring_end_token, - additional_prefix, - quote_length, - ) = _close_fstring_if_necessary( - fstring_stack, rest, (lnum, pos), additional_prefix - ) - pos += quote_length - if fstring_end_token is not None: - yield fstring_end_token - continue - - pseudomatch = token_collection.pseudo_token.match(line, pos) - if not pseudomatch: # scan for tokens - match = token_collection.whitespace.match(line, pos) - if pos == 0: - # pyre-fixme[16]: `Optional` has no attribute `end`. - for t in dedent_if_necessary(match.end()): - yield t - pos = match.end() - new_line = False - yield PythonToken( - ERRORTOKEN, - line[pos], - (lnum, pos), - # pyre-fixme[16]: `Optional` has no attribute `group`. 
- additional_prefix + match.group(0), - ) - additional_prefix = "" - pos += 1 - continue - - prefix = additional_prefix + pseudomatch.group(1) - additional_prefix = "" - start, pos = pseudomatch.span(2) - spos = (lnum, start) - token = pseudomatch.group(2) - if token == "": - assert prefix - additional_prefix = prefix - # This means that we have a line with whitespace/comments at - # the end, which just results in an endmarker. - break - initial = token[0] - - if new_line and initial not in "\r\n\\#": - new_line = False - if paren_level == 0 and not fstring_stack: - i = 0 - indent_start = start - while line[i] == "\f": - i += 1 - # TODO don't we need to change spos as well? - indent_start -= 1 - if indent_start > indents[-1]: - yield PythonToken(INDENT, "", spos, "") - indents.append(indent_start) - for t in dedent_if_necessary(indent_start): - yield t - - if initial in numchars or ( # ordinary number - initial == "." and token != "." and token != "..." - ): - yield PythonToken(NUMBER, token, spos, prefix) - elif pseudomatch.group(3) is not None: # ordinary name - if token in token_collection.always_break_tokens: - fstring_stack[:] = [] - paren_level = 0 - # We only want to dedent if the token is on a new line. - if re.match(r"[ \f\t]*$", line[:start]): - while True: - indent = indents.pop() - if indent > start: - yield PythonToken(DEDENT, "", spos, "") - else: - indents.append(indent) - break - if str.isidentifier(token): - # py37 doesn't need special tokens for async/await, and we could - # emit NAME, but then we'd need different grammar for py36 and py37. 
- if token == "async": - yield PythonToken(ASYNC, token, spos, prefix) - elif token == "await": - yield PythonToken(AWAIT, token, spos, prefix) - else: - yield PythonToken(NAME, token, spos, prefix) - else: - for t in _split_illegal_unicode_name(token, spos, prefix): - yield t # yield from Python 2 - elif initial in "\r\n": - if any(not f.allow_multiline() for f in fstring_stack): - # Would use fstring_stack.clear, but that's not available - # in Python 2. - fstring_stack[:] = [] - - if not new_line and paren_level == 0 and not fstring_stack: - yield PythonToken(NEWLINE, token, spos, prefix) - else: - additional_prefix = prefix + token - new_line = True - elif initial == "#": # Comments - assert not token.endswith("\n") - additional_prefix = prefix + token - elif token in token_collection.triple_quoted: - endprog = token_collection.endpats[token] - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - pos = endmatch.end(0) - token = line[start:pos] - yield PythonToken(STRING, token, spos, prefix) - else: - contstr_start = (lnum, start) # multiple lines - contstr = line[start:] - contline = line - break - - # Check up to the first 3 chars of the token to see if - # they're in the single_quoted set. If so, they start - # a string. - # We're using the first 3, because we're looking for - # "rb'" (for example) at the start of the token. If - # we switch to longer prefixes, this needs to be - # adjusted. - # Note that initial == token[:1]. - # Also note that single quote checking must come after - # triple quote checking (above). - elif ( - initial in token_collection.single_quoted - or token[:2] in token_collection.single_quoted - or token[:3] in token_collection.single_quoted - ): - if token[-1] in "\r\n": # continued string - # This means that a single quoted string ends with a - # backslash and is continued. 
- contstr_start = lnum, start - endprog = ( - token_collection.endpats.get(initial) - or token_collection.endpats.get(token[1]) - or token_collection.endpats.get(token[2]) - ) - contstr = line[start:] - contline = line - break - else: # ordinary string - yield PythonToken(STRING, token, spos, prefix) - elif ( - token in token_collection.fstring_pattern_map - ): # The start of an fstring. - fstring_stack.append( - FStringNode( - token_collection.fstring_pattern_map[token], - "r" in token or "R" in token, - ) - ) - yield PythonToken(FSTRING_START, token, spos, prefix) - elif initial == "\\" and line[start:] in ( - "\\\n", - "\\\r\n", - "\\\r", - ): # continued stmt - additional_prefix += prefix + line[start:] - break - else: - if token in "([{": - if fstring_stack: - fstring_stack[-1].open_parentheses(token) - else: - paren_level += 1 - elif token in ")]}": - if fstring_stack: - fstring_stack[-1].close_parentheses(token) - else: - if paren_level: - paren_level -= 1 - elif ( - token == ":" - and fstring_stack - and fstring_stack[-1].parentheses_count - - fstring_stack[-1].format_spec_count - == 1 - ): - fstring_stack[-1].format_spec_count += 1 - - yield PythonToken(OP, token, spos, prefix) - - if contstr: - yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix) - if contstr.endswith("\n") or contstr.endswith("\r"): - new_line = True - - end_pos = lnum, max - # As the last position we just take the maximally possible position. We - # remove -1 for the last new line. 
- for indent in indents[1:]: - yield PythonToken(DEDENT, "", end_pos, "") - yield PythonToken(ENDMARKER, "", end_pos, additional_prefix) - - -def _split_illegal_unicode_name( - token: str, start_pos: Tuple[int, int], prefix: str -) -> Generator[PythonToken, None, None]: - def create_token(): - return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix) - - found = "" - is_illegal = False - pos = start_pos - for i, char in enumerate(token): - if is_illegal: - if str.isidentifier(char): - yield create_token() - found = char - is_illegal = False - prefix = "" - pos = start_pos[0], start_pos[1] + i - else: - found += char - else: - new_found = found + char - if str.isidentifier(new_found): - found = new_found - else: - if found: - yield create_token() - prefix = "" - pos = start_pos[0], start_pos[1] + i - found = char - is_illegal = True - - if found: - yield create_token() diff --git a/libcst/_parser/parso/tests/__init__.py b/libcst/_parser/parso/tests/__init__.py deleted file mode 100644 index 7bec24cb1..000000000 --- a/libcst/_parser/parso/tests/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. diff --git a/libcst/_parser/parso/tests/test_fstring.py b/libcst/_parser/parso/tests/test_fstring.py deleted file mode 100644 index 255366bba..000000000 --- a/libcst/_parser/parso/tests/test_fstring.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. -# -# Modifications: -# Copyright David Halter and Contributors -# Modifications are dual-licensed: MIT and PSF. -# 99% of the code is different from pgen2, now. 
-# -# A fork of Parso's tokenize test -# https://github.com/davidhalter/parso/blob/master/test/test_tokenize.py -# -# The following changes were made: -# - Convert base test to Unittet -# - Remove grammar-specific tests -# pyre-unsafe -from libcst._parser.parso.python.tokenize import tokenize -from libcst._parser.parso.utils import parse_version_string -from libcst.testing.utils import data_provider, UnitTest - - -class ParsoTokenizeTest(UnitTest): - @data_provider( - ( - # 2 times 2, 5 because python expr and endmarker. - ('f"}{"', [(1, 0), (1, 2), (1, 3), (1, 4), (1, 5)]), - ( - 'f" :{ 1 : } "', - [ - (1, 0), - (1, 2), - (1, 4), - (1, 6), - (1, 8), - (1, 9), - (1, 10), - (1, 11), - (1, 12), - (1, 13), - ], - ), - ( - 'f"""\n {\nfoo\n }"""', - [(1, 0), (1, 4), (2, 1), (3, 0), (4, 1), (4, 2), (4, 5)], - ), - ) - ) - def test_tokenize_start_pos(self, code, positions): - tokens = list(tokenize(code, version_info=parse_version_string("3.6"))) - assert positions == [p.start_pos for p in tokens] diff --git a/libcst/_parser/parso/tests/test_tokenize.py b/libcst/_parser/parso/tests/test_tokenize.py deleted file mode 100644 index c8180047e..000000000 --- a/libcst/_parser/parso/tests/test_tokenize.py +++ /dev/null @@ -1,428 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. -# -# Modifications: -# Copyright David Halter and Contributors -# Modifications are dual-licensed: MIT and PSF. -# 99% of the code is different from pgen2, now. -# -# A fork of Parso's tokenize test -# https://github.com/davidhalter/parso/blob/master/test/test_tokenize.py -# -# The following changes were made: -# - Convert base test to Unittet -# - Remove grammar-specific tests -# pyre-unsafe - -# -*- coding: utf-8 # This file contains Unicode characters. 
-from textwrap import dedent - -from libcst._parser.parso.python.token import PythonTokenTypes -from libcst._parser.parso.python.tokenize import PythonToken, tokenize -from libcst._parser.parso.utils import parse_version_string, split_lines -from libcst.testing.utils import data_provider, UnitTest - -# To make it easier to access some of the token types, just put them here. -NAME = PythonTokenTypes.NAME -NEWLINE = PythonTokenTypes.NEWLINE -STRING = PythonTokenTypes.STRING -NUMBER = PythonTokenTypes.NUMBER -INDENT = PythonTokenTypes.INDENT -DEDENT = PythonTokenTypes.DEDENT -ERRORTOKEN = PythonTokenTypes.ERRORTOKEN -OP = PythonTokenTypes.OP -ENDMARKER = PythonTokenTypes.ENDMARKER -ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT -FSTRING_START = PythonTokenTypes.FSTRING_START -FSTRING_STRING = PythonTokenTypes.FSTRING_STRING -FSTRING_END = PythonTokenTypes.FSTRING_END - - -def _get_token_list(string, version=None): - # Load the current version. - version_info = parse_version_string(version) - return list(tokenize(string, version_info)) - - -class ParsoTokenizerTest(UnitTest): - def test_simple_no_whitespace(self): - # Test a simple one line string, no preceding whitespace - simple_docstring = '"""simple one line docstring"""' - token_list = _get_token_list(simple_docstring) - _, value, _, prefix = token_list[0] - assert prefix == "" - assert value == '"""simple one line docstring"""' - - def test_simple_with_whitespace(self): - # Test a simple one line string with preceding whitespace and newline - simple_docstring = ' """simple one line docstring""" \r\n' - token_list = _get_token_list(simple_docstring) - assert token_list[0][0] == INDENT - typ, value, start_pos, prefix = token_list[1] - assert prefix == " " - assert value == '"""simple one line docstring"""' - assert typ == STRING - typ, value, start_pos, prefix = token_list[2] - assert prefix == " " - assert typ == NEWLINE - - def test_function_whitespace(self): - # Test function definition whitespace identification - 
fundef = dedent( - """ - def test_whitespace(*args, **kwargs): - x = 1 - if x > 0: - print(True) - """ - ) - token_list = _get_token_list(fundef) - for _, value, _, prefix in token_list: - if value == "test_whitespace": - assert prefix == " " - if value == "(": - assert prefix == "" - if value == "*": - assert prefix == "" - if value == "**": - assert prefix == " " - if value == "print": - assert prefix == " " - if value == "if": - assert prefix == " " - - def test_tokenize_multiline_I(self): - # Make sure multiline string having newlines have the end marker on the - # next line - fundef = '''""""\n''' - token_list = _get_token_list(fundef) - assert token_list == [ - PythonToken(ERRORTOKEN, '""""\n', (1, 0), ""), - PythonToken(ENDMARKER, "", (2, 0), ""), - ] - - def test_tokenize_multiline_II(self): - # Make sure multiline string having no newlines have the end marker on - # same line - fundef = '''""""''' - token_list = _get_token_list(fundef) - assert token_list == [ - PythonToken(ERRORTOKEN, '""""', (1, 0), ""), - PythonToken(ENDMARKER, "", (1, 4), ""), - ] - - def test_tokenize_multiline_III(self): - # Make sure multiline string having newlines have the end marker on the - # next line even if several newline - fundef = '''""""\n\n''' - token_list = _get_token_list(fundef) - assert token_list == [ - PythonToken(ERRORTOKEN, '""""\n\n', (1, 0), ""), - PythonToken(ENDMARKER, "", (3, 0), ""), - ] - - def test_identifier_contains_unicode(self): - fundef = dedent( - """ - def 我あφ(): - pass - """ - ) - token_list = _get_token_list(fundef) - unicode_token = token_list[1] - assert unicode_token[0] == NAME - - def test_ur_literals(self): - """ - Decided to parse `u''` literals regardless of Python version. This makes - probably sense: - - - Python 3+ doesn't support it, but it doesn't hurt - not be. While this is incorrect, it's just incorrect for one "old" and in - the future not very important version. - - All the other Python versions work very well with it. 
- """ - - def check(literal, is_literal=True): - token_list = _get_token_list(literal) - typ, result_literal, _, _ = token_list[0] - if is_literal: - if typ != FSTRING_START: - assert typ == STRING - assert result_literal == literal - else: - assert typ == NAME - - check('u""') - check('ur""', is_literal=False) - check('Ur""', is_literal=False) - check('UR""', is_literal=False) - check('bR""') - # Starting with Python 3.3 this ordering is also possible. - check('Rb""') - - # Starting with Python 3.6 format strings where introduced. - check('fr""', is_literal=True) - check('rF""', is_literal=True) - check('f""', is_literal=True) - check('F""', is_literal=True) - - def test_error_literal(self): - error_token, newline, endmarker = _get_token_list('"\n') - assert error_token.type == ERRORTOKEN - assert error_token.string == '"' - assert newline.type == NEWLINE - assert endmarker.type == ENDMARKER - assert endmarker.prefix == "" - - bracket, error_token, endmarker = _get_token_list('( """') - assert error_token.type == ERRORTOKEN - assert error_token.prefix == " " - assert error_token.string == '"""' - assert endmarker.type == ENDMARKER - assert endmarker.prefix == "" - - def test_endmarker_end_pos(self): - def check(code): - tokens = _get_token_list(code) - lines = split_lines(code) - assert tokens[-1].end_pos == (len(lines), len(lines[-1])) - - check("#c") - check("#c\n") - check("a\n") - check("a") - check(r"a\\n") - check("a\\") - - @data_provider( - ( - # Indentation - (" foo", [INDENT, NAME, DEDENT]), - (" foo\n bar", [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]), - ( - " foo\n bar \n baz", - [ - INDENT, - NAME, - NEWLINE, - ERROR_DEDENT, - NAME, - NEWLINE, - ERROR_DEDENT, - NAME, - DEDENT, - ], - ), - (" foo\nbar", [INDENT, NAME, NEWLINE, DEDENT, NAME]), - # Name stuff - ("1foo1", [NUMBER, NAME]), - ("மெல்லினம்", [NAME]), - ("²", [ERRORTOKEN]), - ("ä²ö", [NAME, ERRORTOKEN, NAME]), - ("ää²¹öö", [NAME, ERRORTOKEN, NAME]), - ) - ) - def test_token_types(self, 
code, types): - actual_types = [t.type for t in _get_token_list(code)] - assert actual_types == types + [ENDMARKER] - - def test_error_string(self): - t1, newline, endmarker = _get_token_list(' "\n') - assert t1.type == ERRORTOKEN - assert t1.prefix == " " - assert t1.string == '"' - assert newline.type == NEWLINE - assert endmarker.prefix == "" - assert endmarker.string == "" - - def test_indent_error_recovery(self): - code = dedent( - """\ - str( - from x import a - def - """ - ) - lst = _get_token_list(code) - expected = [ - # `str(` - INDENT, - NAME, - OP, - # `from parso` - NAME, - NAME, - # `import a` on same line as the previous from parso - NAME, - NAME, - NEWLINE, - # Dedent happens, because there's an import now and the import - # statement "breaks" out of the opening paren on the first line. - DEDENT, - # `b` - NAME, - NEWLINE, - ENDMARKER, - ] - assert [t.type for t in lst] == expected - - def test_error_token_after_dedent(self): - code = dedent( - """\ - class C: - pass - $foo - """ - ) - lst = _get_token_list(code) - expected = [ - NAME, - NAME, - OP, - NEWLINE, - INDENT, - NAME, - NEWLINE, - DEDENT, - # $foo\n - ERRORTOKEN, - NAME, - NEWLINE, - ENDMARKER, - ] - assert [t.type for t in lst] == expected - - def test_brackets_no_indentation(self): - """ - There used to be an issue that the parentheses counting would go below - zero. This should not happen. 
- """ - code = dedent( - """\ - } - { - } - """ - ) - lst = _get_token_list(code) - assert [t.type for t in lst] == [OP, NEWLINE, OP, OP, NEWLINE, ENDMARKER] - - def test_form_feed(self): - error_token, endmarker = _get_token_list( - dedent( - '''\ - \f"""''' - ) - ) - assert error_token.prefix == "\f" - assert error_token.string == '"""' - assert endmarker.prefix == "" - - def test_carriage_return(self): - lst = _get_token_list(" =\\\rclass") - assert [t.type for t in lst] == [INDENT, OP, DEDENT, NAME, ENDMARKER] - - def test_backslash(self): - code = "\\\n# 1 \n" - (endmarker,) = _get_token_list(code) - assert endmarker.prefix == code - - @data_provider( - ( - ('f"', [FSTRING_START], "3.7"), - ('f""', [FSTRING_START, FSTRING_END], "3.7"), - ('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END], "3.7"), - ('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP], "3.7"), - (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"), - (r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"), - # format spec - ( - r'f"Some {x:.2f}{y}"', - [ - FSTRING_START, - FSTRING_STRING, - OP, - NAME, - OP, - FSTRING_STRING, - OP, - OP, - NAME, - OP, - FSTRING_END, - ], - "3.7", - ), - # multiline f-string - ('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"), - ( - 'f"""abc{\n123}def"""', - [ - FSTRING_START, - FSTRING_STRING, - OP, - NUMBER, - OP, - FSTRING_STRING, - FSTRING_END, - ], - "3.7", - ), - # a line continuation inside of an fstring_string - ('f"abc\\\ndef"', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"), - ( - 'f"\\\n{123}\\\n"', - [ - FSTRING_START, - FSTRING_STRING, - OP, - NUMBER, - OP, - FSTRING_STRING, - FSTRING_END, - ], - "3.7", - ), - # a line continuation inside of an fstring_expr - ('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END], "3.7"), - # a line continuation inside of an format spec - ( - 'f"{123:.2\\\nf}"', - [FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END], - "3.7", 
- ), - # a newline without a line continuation inside a single-line string is - # wrong, and will generate an ERRORTOKEN - ( - 'f"abc\ndef"', - [FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN], - "3.7", - ), - # a more complex example - ( - r'print(f"Some {x:.2f}a{y}")', - [ - NAME, - OP, - FSTRING_START, - FSTRING_STRING, - OP, - NAME, - OP, - FSTRING_STRING, - OP, - FSTRING_STRING, - OP, - NAME, - OP, - FSTRING_END, - OP, - ], - "3.7", - ), - ) - ) - def test_fstring(self, code, types, py_version): - actual_types = [t.type for t in _get_token_list(code, py_version)] - assert types + [ENDMARKER] == actual_types diff --git a/libcst/_parser/parso/tests/test_utils.py b/libcst/_parser/parso/tests/test_utils.py deleted file mode 100644 index 1f548ef4a..000000000 --- a/libcst/_parser/parso/tests/test_utils.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. -# -# Modifications: -# Copyright David Halter and Contributors -# Modifications are dual-licensed: MIT and PSF. -# 99% of the code is different from pgen2, now. 
-# -# A fork of Parso's tokenize test -# https://github.com/davidhalter/parso/blob/master/test/test_tokenize.py -# -# The following changes were made: -# - Convert base test to Unittet -# - Remove grammar-specific tests -# pyre-unsafe -from libcst._parser.parso.utils import python_bytes_to_unicode, split_lines -from libcst.testing.utils import data_provider, UnitTest - - -class ParsoUtilsTest(UnitTest): - @data_provider( - ( - ("asd\r\n", ["asd", ""], False), - ("asd\r\n", ["asd\r\n", ""], True), - ("asd\r", ["asd", ""], False), - ("asd\r", ["asd\r", ""], True), - ("asd\n", ["asd", ""], False), - ("asd\n", ["asd\n", ""], True), - ("asd\r\n\f", ["asd", "\f"], False), - ("asd\r\n\f", ["asd\r\n", "\f"], True), - ("\fasd\r\n", ["\fasd", ""], False), - ("\fasd\r\n", ["\fasd\r\n", ""], True), - ("", [""], False), - ("", [""], True), - ("\n", ["", ""], False), - ("\n", ["\n", ""], True), - ("\r", ["", ""], False), - ("\r", ["\r", ""], True), - # Invalid line breaks - ("a\vb", ["a\vb"], False), - ("a\vb", ["a\vb"], True), - ("\x1c", ["\x1c"], False), - ("\x1c", ["\x1c"], True), - ) - ) - def test_split_lines(self, string, expected_result, keepends): - assert split_lines(string, keepends=keepends) == expected_result - - def test_python_bytes_to_unicode_unicode_text(self): - source = ( - b"# vim: fileencoding=utf-8\n" - + b"# \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a\n" - ) - actual = python_bytes_to_unicode(source) - expected = source.decode("utf-8") - assert actual == expected diff --git a/libcst/_parser/parso/utils.py b/libcst/_parser/parso/utils.py deleted file mode 100644 index 54517123e..000000000 --- a/libcst/_parser/parso/utils.py +++ /dev/null @@ -1,218 +0,0 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. -# -# Modifications: -# Copyright David Halter and Contributors -# Modifications are dual-licensed: MIT and PSF. -# 99% of the code is different from pgen2, now. 
-# -# A fork of `parso.utils`. -# https://github.com/davidhalter/parso/blob/master/parso/utils.py -# -# The following changes were made: -# - Drop Python 2 compatibility layer -# - Use dataclasses instead of namedtuple -# - Apply type hints directly to files -# - Make PythonVersionInfo directly usable in hashmaps -# - Unroll total ordering because Pyre doesn't understand it - - -import re -import sys -from ast import literal_eval -from dataclasses import dataclass -from typing import Optional, Sequence, Tuple, Union - -# The following is a list in Python that are line breaks in str.splitlines, but -# not in Python. In Python only \r (Carriage Return, 0xD) and \n (Line Feed, -# 0xA) are allowed to split lines. -_NON_LINE_BREAKS = ( - "\v", # Vertical Tabulation 0xB - "\f", # Form Feed 0xC - "\x1c", # File Separator - "\x1d", # Group Separator - "\x1e", # Record Separator - "\x85", # Next Line (NEL - Equivalent to CR+LF. - # Used to mark end-of-line on some IBM mainframes.) - "\u2028", # Line Separator - "\u2029", # Paragraph Separator -) - - -@dataclass(frozen=True) -class Version: - major: int - minor: int - micro: int - - -def split_lines(string: str, keepends: bool = False) -> Sequence[str]: - r""" - Intended for Python code. In contrast to Python's :py:meth:`str.splitlines`, - looks at form feeds and other special characters as normal text. Just - splits ``\n`` and ``\r\n``. - Also different: Returns ``[""]`` for an empty string input. - - In Python 2.7 form feeds are used as normal characters when using - str.splitlines. However in Python 3 somewhere there was a decision to split - also on form feeds. - """ - if keepends: - lst = string.splitlines(True) - - # We have to merge lines that were broken by form feed characters. 
- merge = [] - for i, line in enumerate(lst): - try: - last_chr = line[-1] - except IndexError: - pass - else: - if last_chr in _NON_LINE_BREAKS: - merge.append(i) - - for index in reversed(merge): - try: - lst[index] = lst[index] + lst[index + 1] - del lst[index + 1] - except IndexError: - # index + 1 can be empty and therefore there's no need to - # merge. - pass - - # The stdlib's implementation of the end is inconsistent when calling - # it with/without keepends. One time there's an empty string in the - # end, one time there's none. - if string.endswith("\n") or string.endswith("\r") or string == "": - lst.append("") - return lst - else: - return re.split(r"\n|\r\n|\r", string) - - -def python_bytes_to_unicode( - source: Union[str, bytes], encoding: str = "utf-8", errors: str = "strict" -) -> str: - """ - Checks for unicode BOMs and PEP 263 encoding declarations. Then returns a - unicode object like in :py:meth:`bytes.decode`. - - :param encoding: See :py:meth:`bytes.decode` documentation. - :param errors: See :py:meth:`bytes.decode` documentation. ``errors`` can be - ``'strict'``, ``'replace'`` or ``'ignore'``. - """ - - def detect_encoding() -> Union[str, bytes]: - """ - For the implementation of encoding definitions in Python, look at: - - http://www.python.org/dev/peps/pep-0263/ - - http://docs.python.org/2/reference/lexical_analysis.html#encoding-declarations - """ - byte_mark = literal_eval(r"b'\xef\xbb\xbf'") - if source.startswith(byte_mark): - # UTF-8 byte-order mark - return b"utf-8" - - # pyre-ignore Pyre can't see that Union[str, bytes] conforms to AnyStr. 
- first_two_match = re.match(rb"(?:[^\n]*\n){0,2}", source) - if first_two_match is None: - return encoding - first_two_lines = first_two_match.group(0) - possible_encoding = re.search(rb"coding[=:]\s*([-\w.]+)", first_two_lines) - if possible_encoding: - return possible_encoding.group(1) - else: - # the default if nothing else has been set -> PEP 263 - return encoding - - if isinstance(source, str): - # only cast bytes - return source - - actual_encoding = detect_encoding() - if not isinstance(actual_encoding, str): - actual_encoding = actual_encoding.decode("utf-8", "replace") - - # Cast to str - return source.decode(actual_encoding, errors) - - -@dataclass(frozen=True) -class PythonVersionInfo: - major: int - minor: int - - def __gt__(self, other: Union["PythonVersionInfo", Tuple[int, int]]) -> bool: - if isinstance(other, tuple): - if len(other) != 2: - raise ValueError("Can only compare to tuples of length 2.") - return (self.major, self.minor) > other - - return (self.major, self.minor) > (other.major, other.minor) - - def __ge__(self, other: Union["PythonVersionInfo", Tuple[int, int]]) -> bool: - return self.__gt__(other) or self.__eq__(other) - - def __lt__(self, other: Union["PythonVersionInfo", Tuple[int, int]]) -> bool: - if isinstance(other, tuple): - if len(other) != 2: - raise ValueError("Can only compare to tuples of length 2.") - return (self.major, self.minor) < other - - return (self.major, self.minor) < (other.major, other.minor) - - def __le__(self, other: Union["PythonVersionInfo", Tuple[int, int]]) -> bool: - return self.__lt__(other) or self.__eq__(other) - - def __eq__(self, other: Union["PythonVersionInfo", Tuple[int, int]]) -> bool: - if isinstance(other, tuple): - if len(other) != 2: - raise ValueError("Can only compare to tuples of length 2.") - return (self.major, self.minor) == other - - return (self.major, self.minor) == (other.major, other.minor) - - def __ne__(self, other: Union["PythonVersionInfo", Tuple[int, int]]) -> bool: - 
return not self.__eq__(other) - - def __hash__(self) -> int: - return hash((self.major, self.minor)) - - -def _parse_version(version: str) -> PythonVersionInfo: - match = re.match(r"(\d+)(?:\.(\d+)(?:\.\d+)?)?$", version) - if match is None: - raise ValueError( - ( - "The given version is not in the right format. " - + 'Use something like "3.2" or "3".' - ) - ) - - major = int(match.group(1)) - minor = match.group(2) - if minor is None: - # Use the latest Python in case it's not exactly defined, because the - # grammars are typically backwards compatible? - if major == 2: - minor = "7" - elif major == 3: - minor = "6" - else: - raise NotImplementedError( - "Sorry, no support yet for those fancy new/old versions." - ) - minor = int(minor) - return PythonVersionInfo(major, minor) - - -def parse_version_string(version: Optional[str] = None) -> PythonVersionInfo: - """ - Checks for a valid version number (e.g. `3.2` or `2.7.1` or `3`) and - returns a corresponding version info that is always two characters long in - decimal. - """ - if version is None: - version = "%s.%s" % sys.version_info[:2] - - return _parse_version(version) diff --git a/libcst/_parser/production_decorator.py b/libcst/_parser/production_decorator.py deleted file mode 100644 index d5ba52deb..000000000 --- a/libcst/_parser/production_decorator.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Callable, Optional, Sequence, TypeVar - -from libcst._parser.types.conversions import NonterminalConversion -from libcst._parser.types.production import Production - -_NonterminalConversionT = TypeVar( - "_NonterminalConversionT", bound=NonterminalConversion -) - - -# We could version our grammar at a later point by adding a version metadata kwarg to -# this decorator. 
-def with_production( - production_name: str, - children: str, - *, - version: Optional[str] = None, - future: Optional[str] = None, - # pyre-fixme[34]: `Variable[_NonterminalConversionT (bound to - # typing.Callable[[libcst_native.parser_config.ParserConfig, - # typing.Sequence[typing.Any]], typing.Any])]` isn't present in the function's - # parameters. -) -> Callable[[_NonterminalConversionT], _NonterminalConversionT]: - """ - Attaches a bit of grammar to a conversion function. The parser extracts all of these - production strings, and uses it to form the language's full grammar. - - If you need to attach multiple productions to the same conversion function - """ - - def inner(fn: _NonterminalConversionT) -> _NonterminalConversionT: - if not hasattr(fn, "productions"): - fn.productions = [] - # pyre-ignore: Pyre doesn't think that fn has a __name__ attribute - fn_name = fn.__name__ - if not fn_name.startswith("convert_"): - raise ValueError( - "A function with a production must be named 'convert_X', not " - + f"'{fn_name}'." - ) - # pyre-ignore: Pyre doesn't know about this magic field we added - fn.productions.append(Production(production_name, children, version, future)) - return fn - - return inner - - -def get_productions(fn: NonterminalConversion) -> Sequence[Production]: - # pyre-ignore Pyre doesn't know about this magic field we added - return fn.productions diff --git a/libcst/_parser/py_whitespace_parser.py b/libcst/_parser/py_whitespace_parser.py deleted file mode 100644 index 6b6573a65..000000000 --- a/libcst/_parser/py_whitespace_parser.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from typing import List, Optional, Sequence, Tuple, Union - -from libcst import CSTLogicError, ParserSyntaxError -from libcst._nodes.whitespace import ( - Comment, - COMMENT_RE, - EmptyLine, - Newline, - NEWLINE_RE, - ParenthesizedWhitespace, - SIMPLE_WHITESPACE_RE, - SimpleWhitespace, - TrailingWhitespace, -) -from libcst._parser.types.config import BaseWhitespaceParserConfig -from libcst._parser.types.whitespace_state import WhitespaceState as State - -# BEGIN PARSER ENTRYPOINTS - - -def parse_simple_whitespace( - config: BaseWhitespaceParserConfig, state: State -) -> SimpleWhitespace: - # The match never fails because the pattern can match an empty string - lines = config.lines - # pyre-fixme[16]: Optional type has no attribute `group`. - ws_line = SIMPLE_WHITESPACE_RE.match(lines[state.line - 1], state.column).group(0) - ws_line_list = [ws_line] - while "\\" in ws_line: - # continuation character - state.line += 1 - state.column = 0 - ws_line = SIMPLE_WHITESPACE_RE.match(lines[state.line - 1], state.column).group( - 0 - ) - ws_line_list.append(ws_line) - - # TODO: we could special-case the common case where there's no continuation - # character to avoid list construction and joining. - - # once we've finished collecting continuation characters - state.column += len(ws_line) - return SimpleWhitespace("".join(ws_line_list)) - - -def parse_empty_lines( - config: BaseWhitespaceParserConfig, - state: State, - *, - override_absolute_indent: Optional[str] = None, -) -> Sequence[EmptyLine]: - # If override_absolute_indent is true, then we need to parse all lines up - # to and including the last line that is indented at our level. These all - # belong to the footer and not to the next line's leading_lines. All lines - # that have indent=False and come after the last line where indent=True - # do not belong to this node. 
- state_for_line = State( - state.line, state.column, state.absolute_indent, state.is_parenthesized - ) - lines: List[Tuple[State, EmptyLine]] = [] - while True: - el = _parse_empty_line( - config, state_for_line, override_absolute_indent=override_absolute_indent - ) - if el is None: - break - - # Store the updated state with the element we parsed. Then make a new state - # clone for the next element. - lines.append((state_for_line, el)) - state_for_line = State( - state_for_line.line, - state_for_line.column, - state.absolute_indent, - state.is_parenthesized, - ) - - if override_absolute_indent is not None: - # We need to find the last element that is indented, and then split the list - # at that point. - for i in range(len(lines) - 1, -1, -1): - if lines[i][1].indent: - lines = lines[: (i + 1)] - break - else: - # We didn't find any lines, throw them all away - lines = [] - - if lines: - # Update the state line and column to match the last line actually parsed. - final_state: State = lines[-1][0] - state.line = final_state.line - state.column = final_state.column - return [r[1] for r in lines] - - -def parse_trailing_whitespace( - config: BaseWhitespaceParserConfig, state: State -) -> TrailingWhitespace: - trailing_whitespace = _parse_trailing_whitespace(config, state) - if trailing_whitespace is None: - raise ParserSyntaxError( - "Internal Error: Failed to parse TrailingWhitespace. This should never " - + "happen because a TrailingWhitespace is never optional in the grammar, " - + "so this error should've been caught by parso first.", - lines=config.lines, - raw_line=state.line, - raw_column=state.column, - ) - return trailing_whitespace - - -def parse_parenthesizable_whitespace( - config: BaseWhitespaceParserConfig, state: State -) -> Union[SimpleWhitespace, ParenthesizedWhitespace]: - if state.is_parenthesized: - # First, try parenthesized (don't need speculation because it either - # parses or doesn't modify state). 
- parenthesized_whitespace = _parse_parenthesized_whitespace(config, state) - if parenthesized_whitespace is not None: - return parenthesized_whitespace - # Now, just parse and return a simple whitespace - return parse_simple_whitespace(config, state) - - -# END PARSER ENTRYPOINTS -# BEGIN PARSER INTERNAL PRODUCTIONS - - -def _parse_empty_line( - config: BaseWhitespaceParserConfig, - state: State, - *, - override_absolute_indent: Optional[str] = None, -) -> Optional[EmptyLine]: - # begin speculative parsing - speculative_state = State( - state.line, state.column, state.absolute_indent, state.is_parenthesized - ) - try: - indent = _parse_indent( - config, speculative_state, override_absolute_indent=override_absolute_indent - ) - except Exception: - # We aren't on a new line, speculative parsing failed - return None - whitespace = parse_simple_whitespace(config, speculative_state) - comment = _parse_comment(config, speculative_state) - newline = _parse_newline(config, speculative_state) - if newline is None: - # speculative parsing failed - return None - # speculative parsing succeeded - state.line = speculative_state.line - state.column = speculative_state.column - # don't need to copy absolute_indent/is_parenthesized because they don't change. - return EmptyLine(indent, whitespace, comment, newline) - - -def _parse_indent( - config: BaseWhitespaceParserConfig, - state: State, - *, - override_absolute_indent: Optional[str] = None, -) -> bool: - """ - Returns True if indentation was found, otherwise False. - """ - absolute_indent = ( - override_absolute_indent - if override_absolute_indent is not None - else state.absolute_indent - ) - line_str = config.lines[state.line - 1] - if state.column != 0: - if state.column == len(line_str) and state.line == len(config.lines): - # We're at EOF, treat this as a failed speculative parse - return False - raise CSTLogicError( - "Internal Error: Column should be 0 when parsing an indent." 
- ) - if line_str.startswith(absolute_indent, state.column): - state.column += len(absolute_indent) - return True - return False - - -def _parse_comment( - config: BaseWhitespaceParserConfig, state: State -) -> Optional[Comment]: - comment_match = COMMENT_RE.match(config.lines[state.line - 1], state.column) - if comment_match is None: - return None - comment = comment_match.group(0) - state.column += len(comment) - return Comment(comment) - - -def _parse_newline( - config: BaseWhitespaceParserConfig, state: State -) -> Optional[Newline]: - # begin speculative parsing - line_str = config.lines[state.line - 1] - newline_match = NEWLINE_RE.match(line_str, state.column) - if newline_match is not None: - # speculative parsing succeeded - newline_str = newline_match.group(0) - state.column += len(newline_str) - if state.column != len(line_str): - raise ParserSyntaxError( - "Internal Error: Found a newline, but it wasn't the EOL.", - lines=config.lines, - raw_line=state.line, - raw_column=state.column, - ) - if state.line < len(config.lines): - # this newline was the end of a line, and there's another line, - # therefore we should move to the next line - state.line += 1 - state.column = 0 - if newline_str == config.default_newline: - # Just inherit it from the Module instead of explicitly setting it. 
- return Newline() - else: - return Newline(newline_str) - else: # no newline was found, speculative parsing failed - return None - - -def _parse_trailing_whitespace( - config: BaseWhitespaceParserConfig, state: State -) -> Optional[TrailingWhitespace]: - # Begin speculative parsing - speculative_state = State( - state.line, state.column, state.absolute_indent, state.is_parenthesized - ) - whitespace = parse_simple_whitespace(config, speculative_state) - comment = _parse_comment(config, speculative_state) - newline = _parse_newline(config, speculative_state) - if newline is None: - # Speculative parsing failed - return None - # Speculative parsing succeeded - state.line = speculative_state.line - state.column = speculative_state.column - # don't need to copy absolute_indent/is_parenthesized because they don't change. - return TrailingWhitespace(whitespace, comment, newline) - - -def _parse_parenthesized_whitespace( - config: BaseWhitespaceParserConfig, state: State -) -> Optional[ParenthesizedWhitespace]: - first_line = _parse_trailing_whitespace(config, state) - if first_line is None: - # Speculative parsing failed - return None - empty_lines = () - while True: - empty_line = _parse_empty_line(config, state) - if empty_line is None: - # This isn't an empty line, so parse it below - break - empty_lines = empty_lines + (empty_line,) - indent = _parse_indent(config, state) - last_line = parse_simple_whitespace(config, state) - return ParenthesizedWhitespace(first_line, empty_lines, indent, last_line) diff --git a/libcst/_parser/python_parser.py b/libcst/_parser/python_parser.py deleted file mode 100644 index 7f3d53db4..000000000 --- a/libcst/_parser/python_parser.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
-# pyre-unsafe - -from typing import Any, Iterable, Mapping, Sequence - -from libcst._parser.base_parser import BaseParser -from libcst._parser.grammar import get_nonterminal_conversions, get_terminal_conversions -from libcst._parser.parso.pgen2.generator import Grammar -from libcst._parser.parso.python.token import TokenType -from libcst._parser.types.config import ParserConfig -from libcst._parser.types.conversions import NonterminalConversion, TerminalConversion -from libcst._parser.types.token import Token - - -class PythonCSTParser(BaseParser[Token, TokenType, Any]): - config: ParserConfig - terminal_conversions: Mapping[str, TerminalConversion] - nonterminal_conversions: Mapping[str, NonterminalConversion] - - def __init__( - self, - *, - tokens: Iterable[Token], - config: ParserConfig, - pgen_grammar: "Grammar[TokenType]", - start_nonterminal: str = "file_input", - ) -> None: - super().__init__( - tokens=tokens, - lines=config.lines, - pgen_grammar=pgen_grammar, - start_nonterminal=start_nonterminal, - ) - self.config = config - self.terminal_conversions = get_terminal_conversions() - self.nonterminal_conversions = get_nonterminal_conversions( - config.version, config.future_imports - ) - - def convert_nonterminal(self, nonterminal: str, children: Sequence[Any]) -> Any: - return self.nonterminal_conversions[nonterminal](self.config, children) - - def convert_terminal(self, token: Token) -> Any: - return self.terminal_conversions[token.type.name](self.config, token) diff --git a/libcst/_parser/tests/test_config.py b/libcst/_parser/tests/test_config.py index d9c31db5f..02e4db492 100644 --- a/libcst/_parser/tests/test_config.py +++ b/libcst/_parser/tests/test_config.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. 
# pyre-strict -from libcst._parser.parso.utils import PythonVersionInfo +from libcst._parser.utils import PythonVersionInfo from libcst._parser.types.config import _pick_compatible_python_version from libcst.testing.utils import UnitTest diff --git a/libcst/_parser/tests/test_detect_config.py b/libcst/_parser/tests/test_detect_config.py deleted file mode 100644 index 6d9eaa6cc..000000000 --- a/libcst/_parser/tests/test_detect_config.py +++ /dev/null @@ -1,331 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Union - -from libcst._parser.detect_config import detect_config -from libcst._parser.parso.utils import PythonVersionInfo -from libcst._parser.types.config import ( - parser_config_asdict, - ParserConfig, - PartialParserConfig, -) -from libcst.testing.utils import data_provider, UnitTest - - -class TestDetectConfig(UnitTest): - @data_provider( - { - "empty_input": { - "source": b"", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=["\n", ""], - encoding="utf-8", - default_indent=" ", - default_newline="\n", - has_trailing_newline=False, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "detect_trailing_newline_disabled": { - "source": b"", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": False, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=[""], # the trailing newline isn't inserted - encoding="utf-8", - default_indent=" ", - default_newline="\n", - has_trailing_newline=False, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "detect_default_newline_disabled": { - "source": b"pass\r", - "partial": PartialParserConfig(python_version="3.7"), - 
"detect_trailing_newline": False, - "detect_default_newline": False, - "expected_config": ParserConfig( - lines=["pass\r", ""], # the trailing newline isn't inserted - encoding="utf-8", - default_indent=" ", - default_newline="\n", - has_trailing_newline=False, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "newline_inferred": { - "source": b"first_line\r\n\nsomething\n", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=["first_line\r\n", "\n", "something\n", ""], - encoding="utf-8", - default_indent=" ", - default_newline="\r\n", - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "newline_partial_given": { - "source": b"first_line\r\nsecond_line\r\n", - "partial": PartialParserConfig( - default_newline="\n", python_version="3.7" - ), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=["first_line\r\n", "second_line\r\n", ""], - encoding="utf-8", - default_indent=" ", - default_newline="\n", # The given partial disables inference - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "indent_inferred": { - "source": b"if test:\n\t something\n", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=["if test:\n", "\t something\n", ""], - encoding="utf-8", - default_indent="\t ", - default_newline="\n", - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "indent_partial_given": { - "source": b"if test:\n\t something\n", - "partial": PartialParserConfig( - default_indent=" ", python_version="3.7" - ), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": 
ParserConfig( - lines=["if test:\n", "\t something\n", ""], - encoding="utf-8", - default_indent=" ", - default_newline="\n", - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "encoding_inferred": { - "source": b"#!/usr/bin/python3\n# -*- coding: latin-1 -*-\npass\n", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=[ - "#!/usr/bin/python3\n", - "# -*- coding: latin-1 -*-\n", - "pass\n", - "", - ], - encoding="iso-8859-1", # this is an alias for latin-1 - default_indent=" ", - default_newline="\n", - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "encoding_partial_given": { - "source": b"#!/usr/bin/python3\n# -*- coding: latin-1 -*-\npass\n", - "partial": PartialParserConfig( - encoding="us-ascii", python_version="3.7" - ), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=[ - "#!/usr/bin/python3\n", - "# -*- coding: latin-1 -*-\n", - "pass\n", - "", - ], - encoding="us-ascii", - default_indent=" ", - default_newline="\n", - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "encoding_str_not_bytes_disables_inference": { - "source": "#!/usr/bin/python3\n# -*- coding: latin-1 -*-\npass\n", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=[ - "#!/usr/bin/python3\n", - "# -*- coding: latin-1 -*-\n", - "pass\n", - "", - ], - encoding="utf-8", # because source is a str, don't infer latin-1 - default_indent=" ", - default_newline="\n", - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "encoding_non_ascii_compatible_utf_16_with_bom": { - "source": 
b"\xff\xfet\x00e\x00s\x00t\x00", - "partial": PartialParserConfig(encoding="utf-16", python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=["test\n", ""], - encoding="utf-16", - default_indent=" ", - default_newline="\n", - has_trailing_newline=False, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "detect_trailing_newline_missing_newline": { - "source": b"test", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=["test\n", ""], - encoding="utf-8", - default_indent=" ", - default_newline="\n", - has_trailing_newline=False, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "detect_trailing_newline_has_newline": { - "source": b"test\n", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=["test\n", ""], - encoding="utf-8", - default_indent=" ", - default_newline="\n", - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "detect_trailing_newline_missing_newline_after_line_continuation": { - "source": b"test\\\n", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=["test\\\n", "\n", ""], - encoding="utf-8", - default_indent=" ", - default_newline="\n", - has_trailing_newline=False, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "detect_trailing_newline_has_newline_after_line_continuation": { - "source": b"test\\\n\n", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=["test\\\n", "\n", ""], - 
encoding="utf-8", - default_indent=" ", - default_newline="\n", - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset(), - ), - }, - "future_imports_in_correct_position": { - "source": b"# C\n''' D '''\nfrom __future__ import a as b\n", - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=[ - "# C\n", - "''' D '''\n", - "from __future__ import a as b\n", - "", - ], - encoding="utf-8", - default_indent=" ", - default_newline="\n", - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset({"a"}), - ), - }, - "future_imports_in_mixed_position": { - "source": ( - b"from __future__ import a, b\nimport os\n" - + b"from __future__ import c\n" - ), - "partial": PartialParserConfig(python_version="3.7"), - "detect_trailing_newline": True, - "detect_default_newline": True, - "expected_config": ParserConfig( - lines=[ - "from __future__ import a, b\n", - "import os\n", - "from __future__ import c\n", - "", - ], - encoding="utf-8", - default_indent=" ", - default_newline="\n", - has_trailing_newline=True, - version=PythonVersionInfo(3, 7), - future_imports=frozenset({"a", "b"}), - ), - }, - } - ) - def test_detect_module_config( - self, - *, - source: Union[str, bytes], - partial: PartialParserConfig, - detect_trailing_newline: bool, - detect_default_newline: bool, - expected_config: ParserConfig, - ) -> None: - self.assertEqual( - parser_config_asdict( - detect_config( - source, - partial=partial, - detect_trailing_newline=detect_trailing_newline, - detect_default_newline=detect_default_newline, - ).config - ), - parser_config_asdict(expected_config), - ) diff --git a/libcst/_parser/tests/test_version_compare.py b/libcst/_parser/tests/test_version_compare.py index 102657d6b..a68c00100 100644 --- a/libcst/_parser/tests/test_version_compare.py +++ b/libcst/_parser/tests/test_version_compare.py @@ 
-4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. from libcst._parser.grammar import _should_include -from libcst._parser.parso.utils import PythonVersionInfo +from libcst._parser.utils import PythonVersionInfo from libcst.testing.utils import data_provider, UnitTest diff --git a/libcst/_parser/tests/test_wrapped_tokenize.py b/libcst/_parser/tests/test_wrapped_tokenize.py deleted file mode 100644 index dbaf37002..000000000 --- a/libcst/_parser/tests/test_wrapped_tokenize.py +++ /dev/null @@ -1,1906 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# pyre-unsafe - -from typing import Sequence - -from libcst._exceptions import ParserSyntaxError -from libcst._parser.parso.python.token import PythonTokenTypes -from libcst._parser.parso.utils import parse_version_string, PythonVersionInfo -from libcst._parser.types.whitespace_state import WhitespaceState -from libcst._parser.wrapped_tokenize import Token, tokenize -from libcst.testing.utils import data_provider, UnitTest - -_PY38 = parse_version_string("3.8.0") -_PY37 = parse_version_string("3.7.0") -_PY36 = parse_version_string("3.6.0") -_PY35 = parse_version_string("3.5.0") - - -class WrappedTokenizeTest(UnitTest): - maxDiff = 10000 - - @data_provider( - { - "simple_py35": ( - "pass;\n", - _PY35, - ( - Token( - type=PythonTokenTypes.NAME, - string="pass", - start_pos=(1, 0), - end_pos=(1, 4), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=4, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=";", - start_pos=(1, 4), - end_pos=(1, 5), - whitespace_before=WhitespaceState( - line=1, column=4, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - 
line=1, column=5, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 5), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(2, 0), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "with_indent_py35": ( - "if foo:\n bar\n", - _PY35, - ( - Token( - type=PythonTokenTypes.NAME, - string="if", - start_pos=(1, 0), - end_pos=(1, 2), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=2, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="foo", - start_pos=(1, 3), - end_pos=(1, 6), - whitespace_before=WhitespaceState( - line=1, column=2, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=":", - start_pos=(1, 6), - end_pos=(1, 7), - whitespace_before=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=7, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 7), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, column=7, absolute_indent="", 
is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.INDENT, - string="", - start_pos=(2, 4), - end_pos=(2, 4), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=" ", - ), - Token( - type=PythonTokenTypes.NAME, - string="bar", - start_pos=(2, 4), - end_pos=(2, 7), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=7, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(2, 7), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=2, - column=7, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.DEDENT, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "async_py35": ( - "async def foo():\n return await bar\n", - _PY35, - ( - Token( - type=PythonTokenTypes.ASYNC, - 
string="async", - start_pos=(1, 0), - end_pos=(1, 5), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="def", - start_pos=(1, 6), - end_pos=(1, 9), - whitespace_before=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=9, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="foo", - start_pos=(1, 10), - end_pos=(1, 13), - whitespace_before=WhitespaceState( - line=1, column=9, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, - column=13, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string="(", - start_pos=(1, 13), - end_pos=(1, 14), - whitespace_before=WhitespaceState( - line=1, - column=13, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=1, column=14, absolute_indent="", is_parenthesized=True - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=")", - start_pos=(1, 14), - end_pos=(1, 15), - whitespace_before=WhitespaceState( - line=1, column=14, absolute_indent="", is_parenthesized=True - ), - whitespace_after=WhitespaceState( - line=1, - column=15, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=":", - start_pos=(1, 15), - end_pos=(1, 16), - whitespace_before=WhitespaceState( - line=1, - column=15, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=1, - column=16, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - 
type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 16), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, - column=16, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.INDENT, - string="", - start_pos=(2, 4), - end_pos=(2, 4), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=" ", - ), - Token( - type=PythonTokenTypes.NAME, - string="return", - start_pos=(2, 4), - end_pos=(2, 10), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=10, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.AWAIT, - string="await", - start_pos=(2, 11), - end_pos=(2, 16), - whitespace_before=WhitespaceState( - line=2, - column=10, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=16, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="bar", - start_pos=(2, 17), - end_pos=(2, 20), - whitespace_before=WhitespaceState( - line=2, - column=16, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=20, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(2, 20), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=2, - column=20, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=3, 
column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.DEDENT, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "async_no_token_35": ( - "async;\n", - _PY35, - ( - Token( - type=PythonTokenTypes.NAME, - string="async", - start_pos=(1, 0), - end_pos=(1, 5), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=";", - start_pos=(1, 5), - end_pos=(1, 6), - whitespace_before=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 6), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(2, 0), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False 
- ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "simple_py36": ( - "pass;\n", - _PY36, - ( - Token( - type=PythonTokenTypes.NAME, - string="pass", - start_pos=(1, 0), - end_pos=(1, 4), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=4, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=";", - start_pos=(1, 4), - end_pos=(1, 5), - whitespace_before=WhitespaceState( - line=1, column=4, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 5), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(2, 0), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "with_indent_py36": ( - "if foo:\n bar\n", - _PY36, - ( - Token( - type=PythonTokenTypes.NAME, - string="if", - start_pos=(1, 0), - end_pos=(1, 2), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=2, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="foo", - start_pos=(1, 3), - 
end_pos=(1, 6), - whitespace_before=WhitespaceState( - line=1, column=2, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=":", - start_pos=(1, 6), - end_pos=(1, 7), - whitespace_before=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=7, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 7), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, column=7, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.INDENT, - string="", - start_pos=(2, 4), - end_pos=(2, 4), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=" ", - ), - Token( - type=PythonTokenTypes.NAME, - string="bar", - start_pos=(2, 4), - end_pos=(2, 7), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=7, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(2, 7), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=2, - column=7, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.DEDENT, - 
string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "async_py36": ( - "async def foo():\n return await bar\n", - _PY36, - ( - Token( - type=PythonTokenTypes.ASYNC, - string="async", - start_pos=(1, 0), - end_pos=(1, 5), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="def", - start_pos=(1, 6), - end_pos=(1, 9), - whitespace_before=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=9, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="foo", - start_pos=(1, 10), - end_pos=(1, 13), - whitespace_before=WhitespaceState( - line=1, column=9, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, - column=13, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string="(", - start_pos=(1, 13), - end_pos=(1, 14), - whitespace_before=WhitespaceState( - line=1, - column=13, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=1, column=14, absolute_indent="", 
is_parenthesized=True - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=")", - start_pos=(1, 14), - end_pos=(1, 15), - whitespace_before=WhitespaceState( - line=1, column=14, absolute_indent="", is_parenthesized=True - ), - whitespace_after=WhitespaceState( - line=1, - column=15, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=":", - start_pos=(1, 15), - end_pos=(1, 16), - whitespace_before=WhitespaceState( - line=1, - column=15, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=1, - column=16, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 16), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, - column=16, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.INDENT, - string="", - start_pos=(2, 4), - end_pos=(2, 4), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=" ", - ), - Token( - type=PythonTokenTypes.NAME, - string="return", - start_pos=(2, 4), - end_pos=(2, 10), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=10, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.AWAIT, - string="await", - start_pos=(2, 11), - end_pos=(2, 16), - whitespace_before=WhitespaceState( - line=2, - column=10, - absolute_indent=" ", - is_parenthesized=False, - ), - 
whitespace_after=WhitespaceState( - line=2, - column=16, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="bar", - start_pos=(2, 17), - end_pos=(2, 20), - whitespace_before=WhitespaceState( - line=2, - column=16, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=20, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(2, 20), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=2, - column=20, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.DEDENT, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "async_no_token_36": ( - "async;\n", - _PY36, - ( - Token( - type=PythonTokenTypes.NAME, - string="async", - start_pos=(1, 0), - end_pos=(1, 5), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=";", - start_pos=(1, 5), - end_pos=(1, 6), - 
whitespace_before=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 6), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(2, 0), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "simple_py37": ( - "pass;\n", - _PY37, - ( - Token( - type=PythonTokenTypes.NAME, - string="pass", - start_pos=(1, 0), - end_pos=(1, 4), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=4, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=";", - start_pos=(1, 4), - end_pos=(1, 5), - whitespace_before=WhitespaceState( - line=1, column=4, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 5), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - 
string="", - start_pos=(2, 0), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "with_indent_py37": ( - "if foo:\n bar\n", - _PY37, - ( - Token( - type=PythonTokenTypes.NAME, - string="if", - start_pos=(1, 0), - end_pos=(1, 2), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=2, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="foo", - start_pos=(1, 3), - end_pos=(1, 6), - whitespace_before=WhitespaceState( - line=1, column=2, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=":", - start_pos=(1, 6), - end_pos=(1, 7), - whitespace_before=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=7, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 7), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, column=7, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.INDENT, - string="", - start_pos=(2, 4), - end_pos=(2, 4), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - 
), - relative_indent=" ", - ), - Token( - type=PythonTokenTypes.NAME, - string="bar", - start_pos=(2, 4), - end_pos=(2, 7), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=7, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(2, 7), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=2, - column=7, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.DEDENT, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "async_py37": ( - "async def foo():\n return await bar\n", - _PY37, - ( - Token( - type=PythonTokenTypes.ASYNC, - string="async", - start_pos=(1, 0), - end_pos=(1, 5), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="def", - start_pos=(1, 6), - end_pos=(1, 9), - whitespace_before=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - 
whitespace_after=WhitespaceState( - line=1, column=9, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="foo", - start_pos=(1, 10), - end_pos=(1, 13), - whitespace_before=WhitespaceState( - line=1, column=9, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, - column=13, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string="(", - start_pos=(1, 13), - end_pos=(1, 14), - whitespace_before=WhitespaceState( - line=1, - column=13, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=1, column=14, absolute_indent="", is_parenthesized=True - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=")", - start_pos=(1, 14), - end_pos=(1, 15), - whitespace_before=WhitespaceState( - line=1, column=14, absolute_indent="", is_parenthesized=True - ), - whitespace_after=WhitespaceState( - line=1, - column=15, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=":", - start_pos=(1, 15), - end_pos=(1, 16), - whitespace_before=WhitespaceState( - line=1, - column=15, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=1, - column=16, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 16), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, - column=16, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.INDENT, - string="", - start_pos=(2, 4), - end_pos=(2, 4), - whitespace_before=WhitespaceState( - line=2, - column=0, - 
absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=" ", - ), - Token( - type=PythonTokenTypes.NAME, - string="return", - start_pos=(2, 4), - end_pos=(2, 10), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=10, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.AWAIT, - string="await", - start_pos=(2, 11), - end_pos=(2, 16), - whitespace_before=WhitespaceState( - line=2, - column=10, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=16, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="bar", - start_pos=(2, 17), - end_pos=(2, 20), - whitespace_before=WhitespaceState( - line=2, - column=16, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=20, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(2, 20), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=2, - column=20, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.DEDENT, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(3, 0), 
- end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "simple_py38": ( - "pass;\n", - _PY38, - ( - Token( - type=PythonTokenTypes.NAME, - string="pass", - start_pos=(1, 0), - end_pos=(1, 4), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=4, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=";", - start_pos=(1, 4), - end_pos=(1, 5), - whitespace_before=WhitespaceState( - line=1, column=4, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 5), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(2, 0), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "with_indent_py38": ( - "if foo:\n bar\n", - _PY38, - ( - Token( - type=PythonTokenTypes.NAME, - string="if", - start_pos=(1, 0), - end_pos=(1, 2), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=2, absolute_indent="", is_parenthesized=False 
- ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="foo", - start_pos=(1, 3), - end_pos=(1, 6), - whitespace_before=WhitespaceState( - line=1, column=2, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=":", - start_pos=(1, 6), - end_pos=(1, 7), - whitespace_before=WhitespaceState( - line=1, column=6, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=7, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 7), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, column=7, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.INDENT, - string="", - start_pos=(2, 4), - end_pos=(2, 4), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=" ", - ), - Token( - type=PythonTokenTypes.NAME, - string="bar", - start_pos=(2, 4), - end_pos=(2, 7), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=7, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(2, 7), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=2, - column=7, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=3, column=0, 
absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.DEDENT, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - "async_py38": ( - "async def foo():\n return await bar\n", - _PY38, - ( - Token( - type=PythonTokenTypes.ASYNC, - string="async", - start_pos=(1, 0), - end_pos=(1, 5), - whitespace_before=WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="def", - start_pos=(1, 6), - end_pos=(1, 9), - whitespace_before=WhitespaceState( - line=1, column=5, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, column=9, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="foo", - start_pos=(1, 10), - end_pos=(1, 13), - whitespace_before=WhitespaceState( - line=1, column=9, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=1, - column=13, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string="(", - start_pos=(1, 13), - end_pos=(1, 14), - whitespace_before=WhitespaceState( - line=1, - column=13, - 
absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=1, column=14, absolute_indent="", is_parenthesized=True - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=")", - start_pos=(1, 14), - end_pos=(1, 15), - whitespace_before=WhitespaceState( - line=1, column=14, absolute_indent="", is_parenthesized=True - ), - whitespace_after=WhitespaceState( - line=1, - column=15, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.OP, - string=":", - start_pos=(1, 15), - end_pos=(1, 16), - whitespace_before=WhitespaceState( - line=1, - column=15, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=1, - column=16, - absolute_indent="", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(1, 16), - end_pos=(2, 0), - whitespace_before=WhitespaceState( - line=1, - column=16, - absolute_indent="", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.INDENT, - string="", - start_pos=(2, 4), - end_pos=(2, 4), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=" ", - ), - Token( - type=PythonTokenTypes.NAME, - string="return", - start_pos=(2, 4), - end_pos=(2, 10), - whitespace_before=WhitespaceState( - line=2, - column=0, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=10, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.AWAIT, - string="await", - start_pos=(2, 11), - 
end_pos=(2, 16), - whitespace_before=WhitespaceState( - line=2, - column=10, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=16, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NAME, - string="bar", - start_pos=(2, 17), - end_pos=(2, 20), - whitespace_before=WhitespaceState( - line=2, - column=16, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=2, - column=20, - absolute_indent=" ", - is_parenthesized=False, - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.NEWLINE, - string="\n", - start_pos=(2, 20), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=2, - column=20, - absolute_indent=" ", - is_parenthesized=False, - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.DEDENT, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - Token( - type=PythonTokenTypes.ENDMARKER, - string="", - start_pos=(3, 0), - end_pos=(3, 0), - whitespace_before=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - whitespace_after=WhitespaceState( - line=3, column=0, absolute_indent="", is_parenthesized=False - ), - relative_indent=None, - ), - ), - ), - } - ) - def test_tokenize( - self, code: str, ver: PythonVersionInfo, expected: Sequence[Token] - ) -> None: - tokens = tuple(tokenize(code, ver)) - self.assertSequenceEqual(tokens, expected) - for a, b in zip(tokens, tokens[1:]): - # These must be the same object, so if whitespace gets consumed (mutated) at - # the end of token a, it shows up at the 
beginning of token b. - self.assertIs(a.whitespace_after, b.whitespace_before) - - def test_errortoken(self) -> None: - for version in [_PY36, _PY37, _PY38]: - with self.assertRaisesRegex(ParserSyntaxError, "not a valid token"): - # use tuple() to read everything - # The copyright symbol isn't a valid token - tuple(tokenize("\u00a9", version)) - - def test_error_dedent(self) -> None: - for version in [_PY36, _PY37, _PY38]: - with self.assertRaisesRegex(ParserSyntaxError, "Inconsistent indentation"): - # create some inconsistent indents to generate an ERROR_DEDENT token - tuple(tokenize(" a\n b", version)) diff --git a/libcst/_parser/types/config.py b/libcst/_parser/types/config.py index 289fd8aef..1476b1276 100644 --- a/libcst/_parser/types/config.py +++ b/libcst/_parser/types/config.py @@ -12,18 +12,13 @@ from libcst._add_slots import add_slots from libcst._nodes.whitespace import NEWLINE_RE -from libcst._parser.parso.utils import parse_version_string, PythonVersionInfo +from libcst._parser.utils import parse_version_string, PythonVersionInfo _INDENT_RE: Pattern[str] = re.compile(r"[ \t]+") -try: - from libcst_native import parser_config as config_mod +from libcst._parser.types import py_config as config_mod - MockWhitespaceParserConfig = config_mod.BaseWhitespaceParserConfig -except ImportError: - from libcst._parser.types import py_config as config_mod - - MockWhitespaceParserConfig = config_mod.MockWhitespaceParserConfig +MockWhitespaceParserConfig = config_mod.MockWhitespaceParserConfig BaseWhitespaceParserConfig = config_mod.BaseWhitespaceParserConfig ParserConfig = config_mod.ParserConfig @@ -168,7 +163,10 @@ def __repr__(self) -> str: def _pick_compatible_python_version(version: Optional[str] = None) -> PythonVersionInfo: - max_version = parse_version_string(version) + if version is None: + max_version = sys.version_info[:2] + else: + max_version = parse_version_string(version) for v in KNOWN_PYTHON_VERSION_STRINGS[::-1]: tmp = parse_version_string(v) if 
tmp <= max_version: diff --git a/libcst/_parser/types/py_config.py b/libcst/_parser/types/py_config.py index d77325912..490f450f5 100644 --- a/libcst/_parser/types/py_config.py +++ b/libcst/_parser/types/py_config.py @@ -7,7 +7,7 @@ from dataclasses import asdict, dataclass from typing import Any, FrozenSet, Mapping, Sequence -from libcst._parser.parso.utils import PythonVersionInfo +from libcst._parser.utils import PythonVersionInfo class BaseWhitespaceParserConfig(abc.ABC): diff --git a/libcst/_parser/types/py_token.py b/libcst/_parser/types/py_token.py deleted file mode 100644 index d2f9b5379..000000000 --- a/libcst/_parser/types/py_token.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -from dataclasses import dataclass -from typing import Optional, Tuple - -from libcst._add_slots import add_slots -from libcst._parser.parso.python.token import TokenType -from libcst._parser.types.whitespace_state import WhitespaceState - - -@add_slots -@dataclass(frozen=True) -class Token: - type: TokenType - string: str - # The start of where `string` is in the source, not including leading whitespace. - start_pos: Tuple[int, int] - # The end of where `string` is in the source, not including trailing whitespace. - end_pos: Tuple[int, int] - whitespace_before: WhitespaceState - whitespace_after: WhitespaceState - # The relative indent this token adds. - relative_indent: Optional[str] diff --git a/libcst/_parser/types/py_whitespace_state.py b/libcst/_parser/types/py_whitespace_state.py deleted file mode 100644 index 6359e83eb..000000000 --- a/libcst/_parser/types/py_whitespace_state.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from dataclasses import dataclass - -from libcst._add_slots import add_slots - - -@add_slots -@dataclass(frozen=False) -class WhitespaceState: - """ - A frequently mutated store of the whitespace parser's current state. This object - must be cloned prior to speculative parsing. - - This is in contrast to the `config` object each whitespace parser function takes, - which is frozen and never mutated. - - Whitespace parsing works by mutating this state object. By encapsulating saving, and - re-using state objects inside the top-level python parser, the whitespace parser is - able to be reentrant. One 'convert' function can consume part of the whitespace, and - another 'convert' function can consume the rest, depending on who owns what - whitespace. - - This is similar to the approach you might take to parse nested languages (e.g. - JavaScript inside of HTML). We're treating whitespace as a separate language and - grammar from the rest of Python's grammar. - """ - - line: int # one-indexed (to match parso's behavior) - column: int # zero-indexed (to match parso's behavior) - # What to look for when executing `_parse_indent`. - absolute_indent: str - is_parenthesized: bool diff --git a/libcst/_parser/types/token.py b/libcst/_parser/types/token.py deleted file mode 100644 index 54d904efe..000000000 --- a/libcst/_parser/types/token.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- - -try: - from libcst_native import tokenize - - Token = tokenize.Token -except ImportError: - from libcst._parser.types.py_token import Token # noqa: F401 diff --git a/libcst/_parser/types/whitespace_state.py b/libcst/_parser/types/whitespace_state.py index 7eaeab32b..9ff420c66 100644 --- a/libcst/_parser/types/whitespace_state.py +++ b/libcst/_parser/types/whitespace_state.py @@ -7,9 +7,6 @@ Defines the state object used by the whitespace parser. """ -try: - from libcst_native import whitespace_state as mod -except ImportError: - from libcst._parser.types import py_whitespace_state as mod +from libcst.native import whitespace_state as mod WhitespaceState = mod.WhitespaceState diff --git a/libcst/_parser/utils.py b/libcst/_parser/utils.py new file mode 100644 index 000000000..e2068e990 --- /dev/null +++ b/libcst/_parser/utils.py @@ -0,0 +1,38 @@ + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import re +from typing import List, Tuple + +PythonVersionInfo = Tuple[int, int] + + +def parse_version_string(version: str) -> PythonVersionInfo: + try: + v = tuple(map(int, version.split("."))) + except ValueError: + raise ValueError("The given version is not in the right format") + + if len(v) < 2 or len(v) > 3: + raise ValueError("The given version is not in the right format") + return (v[0], v[1]) + + +# Regular expression to split lines found in parso +_SPLIT_LINES_RE = re.compile(r"([^\n\r]*[\n\r]*)") + + +def split_lines(s: str, keepends: bool = False) -> List[str]: + """ + Split a string into lines, optionally keeping the line endings. + This implementation mimics `parso.utils.split_lines` but uses standard python methods where possible. + Note: parso's split_lines had specific behavior for mixed newlines, but s.splitlines() is usually close enough. 
+ However, parso behavior might be slightly different regarding universal newlines. + For LibCST, correct line splitting is critical for source fidelity. + + Using standard splitlines for now as a safe default for modern python. + """ + return s.splitlines(keepends=keepends) diff --git a/libcst/_parser/whitespace_parser.py b/libcst/_parser/whitespace_parser.py index 1c29efc5d..7567c38c1 100644 --- a/libcst/_parser/whitespace_parser.py +++ b/libcst/_parser/whitespace_parser.py @@ -15,13 +15,7 @@ hand-rolled recursive descent parser. """ -try: - # It'd be better to do `from libcst_native.whitespace_parser import *`, but we're - # blocked on https://github.com/PyO3/pyo3/issues/759 - # (which ultimately seems to be a limitation of how importlib works) - from libcst_native import whitespace_parser as mod -except ImportError: - from libcst._parser import py_whitespace_parser as mod +from libcst.native import whitespace_parser as mod parse_simple_whitespace = mod.parse_simple_whitespace parse_empty_lines = mod.parse_empty_lines diff --git a/libcst/_parser/wrapped_tokenize.py b/libcst/_parser/wrapped_tokenize.py deleted file mode 100644 index 8d6010523..000000000 --- a/libcst/_parser/wrapped_tokenize.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -""" -Parso's tokenize doesn't give us tokens in the format that we'd ideally like, so this -performs a small number of transformations to the token stream: - -- `end_pos` is precomputed as a property, instead of lazily as a method, for more - efficient access. -- `whitespace_before` and `whitespace_after` have been added. These include the correct - indentation information. -- `prefix` is removed, since we don't use it anywhere. -- `ERRORTOKEN` and `ERROR_DEDENT` have been removed, because we don't intend to support - error recovery. 
If we encounter token errors, we'll raise a ParserSyntaxError instead. - -If performance becomes a concern, we can rewrite this later as a fork of the original -tokenize module, instead of as a wrapper. -""" - -from dataclasses import dataclass, field -from enum import Enum -from typing import Generator, Iterator, List, Optional, Sequence - -from libcst._add_slots import add_slots -from libcst._exceptions import ParserSyntaxError -from libcst._parser.parso.python.token import PythonTokenTypes, TokenType -from libcst._parser.parso.python.tokenize import ( - Token as OrigToken, - tokenize_lines as orig_tokenize_lines, -) -from libcst._parser.parso.utils import PythonVersionInfo, split_lines -from libcst._parser.types.token import Token -from libcst._parser.types.whitespace_state import WhitespaceState - -_ERRORTOKEN: TokenType = PythonTokenTypes.ERRORTOKEN -_ERROR_DEDENT: TokenType = PythonTokenTypes.ERROR_DEDENT - -_INDENT: TokenType = PythonTokenTypes.INDENT -_DEDENT: TokenType = PythonTokenTypes.DEDENT -_ENDMARKER: TokenType = PythonTokenTypes.ENDMARKER - -_FSTRING_START: TokenType = PythonTokenTypes.FSTRING_START -_FSTRING_END: TokenType = PythonTokenTypes.FSTRING_END - -_OP: TokenType = PythonTokenTypes.OP - - -class _ParenthesisOrFStringStackEntry(Enum): - PARENTHESIS = 0 - FSTRING = 0 - - -_PARENTHESIS_STACK_ENTRY: _ParenthesisOrFStringStackEntry = ( - _ParenthesisOrFStringStackEntry.PARENTHESIS -) -_FSTRING_STACK_ENTRY: _ParenthesisOrFStringStackEntry = ( - _ParenthesisOrFStringStackEntry.FSTRING -) - - -@add_slots -@dataclass(frozen=False) -class _TokenizeState: - lines: Sequence[str] - previous_whitespace_state: WhitespaceState = field( - default_factory=lambda: WhitespaceState( - line=1, column=0, absolute_indent="", is_parenthesized=False - ) - ) - indents: List[str] = field(default_factory=lambda: [""]) - parenthesis_or_fstring_stack: List[_ParenthesisOrFStringStackEntry] = field( - default_factory=list - ) - - -def tokenize(code: str, version_info: 
PythonVersionInfo) -> Iterator[Token]: - try: - from libcst_native import tokenize as native_tokenize - - return native_tokenize.tokenize(code) - except ImportError: - lines = split_lines(code, keepends=True) - return tokenize_lines(code, lines, version_info) - - -def tokenize_lines( - code: str, lines: Sequence[str], version_info: PythonVersionInfo -) -> Iterator[Token]: - try: - from libcst_native import tokenize as native_tokenize - - # TODO: pass through version_info - return native_tokenize.tokenize(code) - except ImportError: - return tokenize_lines_py(code, lines, version_info) - - -def tokenize_lines_py( - code: str, lines: Sequence[str], version_info: PythonVersionInfo -) -> Generator[Token, None, None]: - state = _TokenizeState(lines) - orig_tokens_iter = iter(orig_tokenize_lines(lines, version_info)) - - # Iterate over the tokens and pass them to _convert_token, providing a one-token - # lookahead, to enable proper indent handling. - try: - curr_token = next(orig_tokens_iter) - except StopIteration: - pass # empty file - else: - for next_token in orig_tokens_iter: - yield _convert_token(state, curr_token, next_token) - curr_token = next_token - yield _convert_token(state, curr_token, None) - - -def _convert_token( # noqa: C901: too complex - state: _TokenizeState, curr_token: OrigToken, next_token: Optional[OrigToken] -) -> Token: - ct_type = curr_token.type - ct_string = curr_token.string - ct_start_pos = curr_token.start_pos - if ct_type is _ERRORTOKEN: - raise ParserSyntaxError( - f"{ct_string!r} is not a valid token.", - lines=state.lines, - raw_line=ct_start_pos[0], - raw_column=ct_start_pos[1], - ) - if ct_type is _ERROR_DEDENT: - raise ParserSyntaxError( - "Inconsistent indentation. 
Expected a dedent.", - lines=state.lines, - raw_line=ct_start_pos[0], - raw_column=ct_start_pos[1], - ) - - # Compute relative indent changes for indent/dedent nodes - relative_indent: Optional[str] = None - if ct_type is _INDENT: - old_indent = "" if len(state.indents) < 2 else state.indents[-2] - new_indent = state.indents[-1] - relative_indent = new_indent[len(old_indent) :] - - if next_token is not None: - nt_type = next_token.type - if nt_type is _INDENT: - nt_line, nt_column = next_token.start_pos - state.indents.append(state.lines[nt_line - 1][:nt_column]) - elif nt_type is _DEDENT: - state.indents.pop() - - whitespace_before = state.previous_whitespace_state - - if ct_type is _INDENT or ct_type is _DEDENT or ct_type is _ENDMARKER: - # Don't update whitespace state for these dummy tokens. This makes it possible - # to partially parse whitespace for IndentedBlock footers, and then parse the - # rest of the whitespace in the following statement's leading_lines. - # Unfortunately, that means that the indentation is either wrong for the footer - # comments, or for the next line. We've chosen to allow it to be wrong for the - # IndentedBlock footer and manually override the state when parsing whitespace - # in that particular node. - whitespace_after = whitespace_before - ct_end_pos = ct_start_pos - else: - # Not a dummy token, so update the whitespace state. - - # Compute our own end_pos, since parso's end_pos is wrong for triple-strings. - lines = split_lines(ct_string) - if len(lines) > 1: - ct_end_pos = ct_start_pos[0] + len(lines) - 1, len(lines[-1]) - else: - ct_end_pos = (ct_start_pos[0], ct_start_pos[1] + len(ct_string)) - - # Figure out what mode the whitespace parser should use. If we're inside - # parentheses, certain whitespace (e.g. newlines) are allowed where they would - # otherwise not be. f-strings override and disable this behavior, however. 
- # - # Parso's tokenizer tracks this internally, but doesn't expose it, so we have to - # duplicate that logic here. - - pof_stack = state.parenthesis_or_fstring_stack - try: - if ct_type is _FSTRING_START: - pof_stack.append(_FSTRING_STACK_ENTRY) - elif ct_type is _FSTRING_END: - pof_stack.pop() - elif ct_type is _OP: - if ct_string in "([{": - pof_stack.append(_PARENTHESIS_STACK_ENTRY) - elif ct_string in ")]}": - pof_stack.pop() - except IndexError: - # pof_stack may be empty by the time we need to read from it due to - # mismatched braces. - raise ParserSyntaxError( - "Encountered a closing brace without a matching opening brace.", - lines=state.lines, - raw_line=ct_start_pos[0], - raw_column=ct_start_pos[1], - ) - is_parenthesized = ( - len(pof_stack) > 0 and pof_stack[-1] == _PARENTHESIS_STACK_ENTRY - ) - - whitespace_after = WhitespaceState( - ct_end_pos[0], ct_end_pos[1], state.indents[-1], is_parenthesized - ) - - # Hold onto whitespace_after, so we can use it as whitespace_before in the next - # node. - state.previous_whitespace_state = whitespace_after - - return Token( - ct_type, - ct_string, - ct_start_pos, - ct_end_pos, - whitespace_before, - whitespace_after, - relative_indent, - ) diff --git a/libcst/tool.py b/libcst/tool.py index a2164b11e..bd41f730f 100644 --- a/libcst/tool.py +++ b/libcst/tool.py @@ -25,7 +25,7 @@ import yaml from libcst import CSTLogicError, LIBCST_VERSION, parse_module, PartialParserConfig -from libcst._parser.parso.utils import parse_version_string +from libcst._parser.utils import parse_version_string from libcst.codemod import ( CodemodCommand, CodemodContext,