# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
7"""
8Parso's tokenize doesn't give us tokens in the format that we'd ideally like, so this
9performs a small number of transformations to the token stream:
11- `end_pos` is precomputed as a property, instead of lazily as a method, for more
12 efficient access.
13- `whitespace_before` and `whitespace_after` have been added. These include the correct
14 indentation information.
15- `prefix` is removed, since we don't use it anywhere.
16- `ERRORTOKEN` and `ERROR_DEDENT` have been removed, because we don't intend to support
17 error recovery. If we encounter token errors, we'll raise a ParserSyntaxError instead.
19If performance becomes a concern, we can rewrite this later as a fork of the original
20tokenize module, instead of as a wrapper.
21"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Generator, Iterator, List, Optional, Sequence

from libcst._add_slots import add_slots
from libcst._exceptions import ParserSyntaxError
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.parso.python.tokenize import (
    Token as OrigToken,
    tokenize_lines as orig_tokenize_lines,
)
from libcst._parser.parso.utils import PythonVersionInfo, split_lines
from libcst._parser.types.token import Token
from libcst._parser.types.whitespace_state import WhitespaceState
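
# Parso token types that get special handling below, bound to module-level names
# up front (shorter to reference, and presumably avoids repeated attribute
# lookups in the hot token loop).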
_ERRORTOKEN: TokenType = PythonTokenTypes.ERRORTOKEN
_ERROR_DEDENT: TokenType = PythonTokenTypes.ERROR_DEDENT

_INDENT: TokenType = PythonTokenTypes.INDENT
_DEDENT: TokenType = PythonTokenTypes.DEDENT
_ENDMARKER: TokenType = PythonTokenTypes.ENDMARKER

_FSTRING_START: TokenType = PythonTokenTypes.FSTRING_START
_FSTRING_END: TokenType = PythonTokenTypes.FSTRING_END

_OP: TokenType = PythonTokenTypes.OP


class _ParenthesisOrFStringStackEntry(Enum):
    PARENTHESIS = 0
    FSTRING = 1


_PARENTHESIS_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.PARENTHESIS
)
_FSTRING_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.FSTRING
)
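

# Mutable state threaded through a single tokenize run: the source lines, the
# WhitespaceState left behind by the previous token, a stack of absolute
# indentation strings (one entry per open indentation level, seeded with ""),
# and a stack tracking parenthesis/f-string nesting.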


@add_slots
@dataclass(frozen=False)
class _TokenizeState:
    lines: Sequence[str]
    previous_whitespace_state: WhitespaceState = field(
        default_factory=lambda: WhitespaceState(
            line=1, column=0, absolute_indent="", is_parenthesized=False
        )
    )
    indents: List[str] = field(default_factory=lambda: [""])
    parenthesis_or_fstring_stack: List[_ParenthesisOrFStringStackEntry] = field(
        default_factory=list
    )


def tokenize(code: str, version_info: PythonVersionInfo) -> Iterator[Token]:
    try:
        from libcst_native import tokenize as native_tokenize

        return native_tokenize.tokenize(code)
    except ImportError:
        lines = split_lines(code, keepends=True)
        return tokenize_lines(code, lines, version_info)
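

# A minimal usage sketch (hypothetical, not part of this module); it assumes the
# vendored parso utils expose `parse_version_string`, as upstream parso does:
#
#     from libcst._parser.parso.utils import parse_version_string
#
#     for tok in tokenize("x = 1\n", parse_version_string("3.8")):
#         print(tok.type, repr(tok.string), tok.start_pos, tok.end_pos)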


def tokenize_lines(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Iterator[Token]:
    try:
        from libcst_native import tokenize as native_tokenize

        # TODO: pass through version_info
        return native_tokenize.tokenize(code)
    except ImportError:
        return tokenize_lines_py(code, lines, version_info)


def tokenize_lines_py(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Generator[Token, None, None]:
    state = _TokenizeState(lines)
    orig_tokens_iter = iter(orig_tokenize_lines(lines, version_info))

    # Iterate over the tokens and pass them to _convert_token, providing a one-token
    # lookahead, to enable proper indent handling.
    try:
        curr_token = next(orig_tokens_iter)
    except StopIteration:
        pass  # empty file
    else:
        for next_token in orig_tokens_iter:
            yield _convert_token(state, curr_token, next_token)
            curr_token = next_token
        yield _convert_token(state, curr_token, None)


def _convert_token(  # noqa: C901: too complex
    state: _TokenizeState, curr_token: OrigToken, next_token: Optional[OrigToken]
) -> Token:
    ct_type = curr_token.type
    ct_string = curr_token.string
    ct_start_pos = curr_token.start_pos
    if ct_type is _ERRORTOKEN:
        raise ParserSyntaxError(
            f"{ct_string!r} is not a valid token.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )
    if ct_type is _ERROR_DEDENT:
        raise ParserSyntaxError(
            "Inconsistent indentation. Expected a dedent.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )

    # Compute relative indent changes for indent/dedent nodes
    relative_indent: Optional[str] = None
    if ct_type is _INDENT:
        old_indent = "" if len(state.indents) < 2 else state.indents[-2]
        new_indent = state.indents[-1]
        relative_indent = new_indent[len(old_indent) :]
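        # Worked example: if the enclosing block is indented with four spaces
        # and this one with eight, relative_indent is the four-space suffix
        # new_indent[4:].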

    if next_token is not None:
        nt_type = next_token.type
        if nt_type is _INDENT:
            nt_line, nt_column = next_token.start_pos
            state.indents.append(state.lines[nt_line - 1][:nt_column])
        elif nt_type is _DEDENT:
            state.indents.pop()
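
    # The indents stack is updated one token early, via lookahead: by the time
    # the INDENT token itself reaches the branch above, state.indents[-1] must
    # already hold the new absolute indent. The slice up to the INDENT token's
    # start column captures exactly the line's leading whitespace.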

    whitespace_before = state.previous_whitespace_state

    if ct_type is _INDENT or ct_type is _DEDENT or ct_type is _ENDMARKER:
        # Don't update whitespace state for these dummy tokens. This makes it possible
        # to partially parse whitespace for IndentedBlock footers, and then parse the
        # rest of the whitespace in the following statement's leading_lines.
        # Unfortunately, that means that the indentation is either wrong for the footer
        # comments, or for the next line. We've chosen to allow it to be wrong for the
        # IndentedBlock footer and manually override the state when parsing whitespace
        # in that particular node.
        whitespace_after = whitespace_before
        ct_end_pos = ct_start_pos
    else:
        # Not a dummy token, so update the whitespace state.

        # Compute our own end_pos, since parso's end_pos is wrong for triple-quoted
        # strings.
        lines = split_lines(ct_string)
        if len(lines) > 1:
            ct_end_pos = (ct_start_pos[0] + len(lines) - 1, len(lines[-1]))
        else:
            ct_end_pos = (ct_start_pos[0], ct_start_pos[1] + len(ct_string))
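        # Worked example: for ct_string == "'''a\nbc'''" starting at (5, 0),
        # split_lines yields ["'''a", "bc'''"], so ct_end_pos is
        # (5 + 2 - 1, len("bc'''")) == (6, 5).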

        # Figure out what mode the whitespace parser should use. If we're inside
        # parentheses, certain whitespace (e.g. newlines) are allowed where they would
        # otherwise not be. f-strings override and disable this behavior, however.
        #
        # Parso's tokenizer tracks this internally, but doesn't expose it, so we have to
        # duplicate that logic here.

        pof_stack = state.parenthesis_or_fstring_stack
        try:
            if ct_type is _FSTRING_START:
                pof_stack.append(_FSTRING_STACK_ENTRY)
            elif ct_type is _FSTRING_END:
                pof_stack.pop()
            elif ct_type is _OP:
                if ct_string in "([{":
                    pof_stack.append(_PARENTHESIS_STACK_ENTRY)
                elif ct_string in ")]}":
                    pof_stack.pop()
        except IndexError:
            # pof_stack may be empty by the time we need to read from it due to
            # mismatched braces.
            raise ParserSyntaxError(
                "Encountered a closing brace without a matching opening brace.",
                lines=state.lines,
                raw_line=ct_start_pos[0],
                raw_column=ct_start_pos[1],
            )
        is_parenthesized = (
            len(pof_stack) > 0 and pof_stack[-1] == _PARENTHESIS_STACK_ENTRY
        )
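
        # Note that an f-string entry on top of the stack masks any enclosing
        # parentheses: is_parenthesized is False inside an f-string even when
        # the f-string itself appears inside parentheses.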

        whitespace_after = WhitespaceState(
            ct_end_pos[0], ct_end_pos[1], state.indents[-1], is_parenthesized
        )

    # Hold onto whitespace_after, so we can use it as whitespace_before in the next
    # node.
    state.previous_whitespace_state = whitespace_after

    return Token(
        ct_type,
        ct_string,
        ct_start_pos,
        ct_end_pos,
        whitespace_before,
        whitespace_after,
        relative_indent,
    )
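

# Note: a given token's whitespace_after is the very same WhitespaceState object
# as the next token's whitespace_before (it is stashed on _TokenizeState and
# reused), so any in-place update to that shared state is observed from both
# sides.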