# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
#
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
#
# A fork of `parso.python.tokenize`.
# https://github.com/davidhalter/parso/blob/master/parso/python/tokenize.py
#
# The following changes were made:
# - Changes to be compatible with PythonTokenTypes
# - Removed main section
# - Applied type stubs directly
# - Removed Python 2 shims
# - Added support for Python 3.6 ASYNC/AWAIT hacks
#
# -*- coding: utf-8 -*-
# This tokenizer has been copied from the ``tokenize.py`` standard library
# tokenizer. The reason was simple: The standard library tokenizer fails
# if the indentation is not right. To make it possible to do error recovery the
# tokenizer needed to be rewritten.
#
# Basically this is a stripped down version of the standard library module, so
# you can read the documentation there. Additionally we included some speed and
# memory optimizations here.
# pyre-unsafe
from __future__ import absolute_import

import itertools as _itertools
import re
import sys
from codecs import BOM_UTF8
from collections import namedtuple
from dataclasses import dataclass
from typing import Dict, Generator, Iterable, Optional, Pattern, Set, Tuple

from libcst._parser.parso.python.token import PythonTokenTypes
from libcst._parser.parso.utils import PythonVersionInfo, split_lines

# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = "\U0010ffff"
BOM_UTF8_STRING = BOM_UTF8.decode("utf-8")

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ASYNC = PythonTokenTypes.ASYNC
AWAIT = PythonTokenTypes.AWAIT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


@dataclass(frozen=True)
class TokenCollection:
    pseudo_token: Pattern
    single_quoted: Set[str]
    triple_quoted: Set[str]
    endpats: Dict[str, Pattern]
    whitespace: Pattern
    fstring_pattern_map: Dict[str, str]
    always_break_tokens: Set[str]


_token_collection_cache: Dict[PythonVersionInfo, TokenCollection] = {}


def group(*choices: str, **kwargs: object) -> str:
    capture = kwargs.pop("capture", False)  # Python 2, arrghhhhh :(
    assert not kwargs

    start = "("
    if not capture:
        start += "?:"
    return start + "|".join(choices) + ")"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"
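

# Illustrative sketch (not part of the original module): `group` and `maybe`
# are tiny regex builders used throughout this file. Given the definitions
# above, they behave like this:
#
#     group("a", "b")                # -> "(?:a|b)"   (non-capturing by default)
#     group("a", "b", capture=True)  # -> "(a|b)"
#     maybe(r"\d+")                  # -> "(?:\d+)?"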


# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(
    version_info: PythonVersionInfo,
    include_fstring: bool = False,
    only_fstring: bool = False,
) -> Set[str]:
    def different_case_versions(prefix):
        for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
            yield "".join(s)

    # The valid string prefixes. Only contains the lower case versions,
    # and doesn't contain any permutations (includes 'fr', but not
    # 'rf'). The various permutations will be generated.
    valid_string_prefixes = ["b", "r"]
    if version_info >= (3, 0):
        valid_string_prefixes.append("br")
    if version_info < (3, 0) or version_info >= (3, 3):
        valid_string_prefixes.append("u")

    result = {""}
    if version_info >= (3, 6) and include_fstring:
        f = ["f", "fr"]
        if only_fstring:
            valid_string_prefixes = f
            result = set()
        else:
            valid_string_prefixes += f
    elif only_fstring:
        return set()

    # if we add binary f-strings, add: ['fb', 'fbr']
    for prefix in valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            result.update(different_case_versions(t))
    if version_info <= (2, 7):
        # In Python 2 the order cannot just be random.
        result.update(different_case_versions("ur"))
        result.update(different_case_versions("br"))
    return result
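

# Rough example of the generated prefix sets (a sketch, assuming a 3.8
# version_info; the exact contents depend on the version checks above):
#
#     _all_string_prefixes(PythonVersionInfo(3, 8))
#     # -> {'', 'b', 'B', 'r', 'R', 'u', 'U', 'br', 'bR', 'Br', 'BR',
#     #     'rb', 'rB', 'Rb', 'RB'}
#     _all_string_prefixes(PythonVersionInfo(3, 8), include_fstring=True,
#                          only_fstring=True)
#     # -> case/order permutations of 'f' and 'fr' ('f', 'F', 'fr', 'rf',
#     #    'fR', ...), without the empty string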


def _compile(expr: str) -> Pattern:
    return re.compile(expr, re.UNICODE)


def _get_token_collection(version_info: PythonVersionInfo) -> TokenCollection:
    try:
        return _token_collection_cache[version_info]
    except KeyError:
        _token_collection_cache[version_info] = result = _create_token_collection(
            version_info
        )
        return result


fstring_raw_string = _compile(r"(?:[^{}]+|\{\{|\}\})+")

unicode_character_name = r"[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*"
fstring_string_single_line = _compile(
    r"(?:\{\{|\}\}|\\N\{"
    + unicode_character_name
    + r"\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+"
)
fstring_string_multi_line = _compile(
    r"(?:\{\{|\}\}|\\N\{" + unicode_character_name + r"\}|\\[^N]|[^{}\\])+"
)

fstring_format_spec_single_line = _compile(r"(?:\\(?:\r\n?|\n)|[^{}\r\n])+")
fstring_format_spec_multi_line = _compile(r"[^{}]+")
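

# These patterns match the literal (non-expression) part of an f-string.  As an
# illustrative sketch (not from the original source): on the body
# 'hello {name!r} world' the single-line pattern matches 'hello ' and stops at
# the opening '{'; doubled braces ('{{', '}}') and \N{...} escapes are consumed
# as literal text instead of terminating the match.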


def _create_token_collection(  # noqa: C901
    version_info: PythonVersionInfo,
) -> TokenCollection:
    # Note: we use unicode matching for names ("\w") but ascii matching for
    # number literals.
    Whitespace = r"[ \f\t]*"
    Comment = r"#[^\r\n]*"
    # Python 2 is pretty much not working properly anymore, we just ignore
    # parsing unicode properly, which is fine, I guess.
    if version_info.major == 2:
        Name = r"([A-Za-z_0-9]+)"
    elif sys.version_info[0] == 2:
        # Unfortunately the regex engine cannot deal with the regex below, so
        # just use this one.
        Name = r"(\w+)"
    else:
        Name = "([A-Za-z_0-9\u0080-" + MAX_UNICODE + "]+)"

    if version_info >= (3, 6):
        Hexnumber = r"0[xX](?:_?[0-9a-fA-F])+"
        Binnumber = r"0[bB](?:_?[01])+"
        Octnumber = r"0[oO](?:_?[0-7])+"
        Decnumber = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        Exponent = r"[eE][-+]?[0-9](?:_?[0-9])*"
        Pointfloat = group(
            r"[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?", r"\.[0-9](?:_?[0-9])*"
        ) + maybe(Exponent)
        Expfloat = r"[0-9](?:_?[0-9])*" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9](?:_?[0-9])*[jJ]", Floatnumber + r"[jJ]")
    else:
        Hexnumber = r"0[xX][0-9a-fA-F]+"
        Binnumber = r"0[bB][01]+"
        if version_info >= (3, 0):
            Octnumber = r"0[oO][0-7]+"
        else:
            Octnumber = "0[oO]?[0-7]+"
        Decnumber = r"(?:0+|[1-9][0-9]*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        if version_info.major < 3:
            Intnumber += "[lL]?"
        Exponent = r"[eE][-+]?[0-9]+"
        Pointfloat = group(r"[0-9]+\.[0-9]*", r"\.[0-9]+") + maybe(Exponent)
        Expfloat = r"[0-9]+" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9]+[jJ]", Floatnumber + r"[jJ]")
    Number = group(Imagnumber, Floatnumber, Intnumber)

    # Note that since _all_string_prefixes includes the empty string,
    # StringPrefix can be the empty string (making it optional).
    possible_prefixes = _all_string_prefixes(version_info)
    StringPrefix = group(*possible_prefixes)
    StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
    fstring_prefixes = _all_string_prefixes(
        version_info, include_fstring=True, only_fstring=True
    )
    FStringStart = group(*fstring_prefixes)

    # Tail end of ' string.
    Single = r"(?:\\.|[^'\\])*'"
    # Tail end of " string.
    Double = r'(?:\\.|[^"\\])*"'
    # Tail end of ''' string.
    Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''"
    # Tail end of """ string.
    Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""'
    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')

    # Because of leftmost-then-longest match semantics, be sure to put the
    # longest operators first (e.g., if = came before ==, == would get
    # recognized as two instances of =).
    Operator = group(
        r"\*\*=?", r">>=?", r"<<=?", r"//=?", r"->", r"[+\-*/%&@`|^!=<>]=?", r"~"
    )

    Bracket = "[][(){}]"

    special_args = [r"\r\n?", r"\n", r"[;.,@]"]
    if version_info >= (3, 0):
        special_args.insert(0, r"\.\.\.")
    if version_info >= (3, 8):
        special_args.insert(0, ":=?")
    else:
        special_args.insert(0, ":")
    Special = group(*special_args)

    Funny = group(Operator, Bracket, Special)

    # First (or only) line of ' or " string.
    ContStr = group(
        StringPrefix
        + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*"
        + group("'", r"\\(?:\r\n?|\n)"),
        StringPrefix
        + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*'
        + group('"', r"\\(?:\r\n?|\n)"),
    )
    pseudo_extra_pool = [Comment, Triple]
    all_quotes = '"', "'", '"""', "'''"
    if fstring_prefixes:
        pseudo_extra_pool.append(FStringStart + group(*all_quotes))

    PseudoExtras = group(r"\\(?:\r\n?|\n)|\Z", *pseudo_extra_pool)
    PseudoToken = group(Whitespace, capture=True) + group(
        PseudoExtras, Number, Funny, ContStr, Name, capture=True
    )

    # For a given string prefix plus quotes, endpats maps it to a regex
    # to match the remainder of that string. _prefix can be empty, for
    # a normal single or triple quoted string (with no prefix).
    endpats = {}
    for _prefix in possible_prefixes:
        endpats[_prefix + "'"] = _compile(Single)
        endpats[_prefix + '"'] = _compile(Double)
        endpats[_prefix + "'''"] = _compile(Single3)
        endpats[_prefix + '"""'] = _compile(Double3)

    # A set of all of the single and triple quoted string prefixes,
    # including the opening quotes.
    single_quoted = set()
    triple_quoted = set()
    fstring_pattern_map = {}
    for t in possible_prefixes:
        for quote in '"', "'":
            single_quoted.add(t + quote)

        for quote in '"""', "'''":
            triple_quoted.add(t + quote)

    for t in fstring_prefixes:
        for quote in all_quotes:
            fstring_pattern_map[t + quote] = quote

    pseudo_token_compiled = _compile(PseudoToken)
    return TokenCollection(
        pseudo_token_compiled,
        single_quoted,
        triple_quoted,
        endpats,
        _compile(Whitespace),
        fstring_pattern_map,
        {
            ";",
            "import",
            "class",
            "def",
            "try",
            "except",
            "finally",
            "while",
            "with",
            "return",
        },
    )
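

# A sketch of how the compiled pseudo_token pattern is consumed by the
# tokenizer loops below (illustrative, not part of the original module):
# group(1) is the whitespace prefix, group(2) is the token text, and group(3)
# is only set when the token matched the Name alternative.
#
#     collection = _get_token_collection(PythonVersionInfo(3, 8))
#     m = collection.pseudo_token.match("    foo = 1\n", 0)
#     m.group(1), m.group(2), m.group(3)  # -> ('    ', 'foo', 'foo')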


class Token(namedtuple("Token", ["type", "string", "start_pos", "prefix"])):
    @property
    def end_pos(self):
        lines = split_lines(self.string)
        if len(lines) > 1:
            return self.start_pos[0] + len(lines) - 1, 0
        else:
            return self.start_pos[0], self.start_pos[1] + len(self.string)
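

# A quick sketch of the end_pos computation above (illustrative only): a
# single-line token advances the column, while a multi-line token advances the
# row and resets the column to 0.
#
#     Token(NAME, "foo", (1, 4), "").end_pos           # -> (1, 7)
#     Token(STRING, "'''a\nb'''", (1, 0), "").end_pos  # -> (2, 0)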


class PythonToken(Token):
    def __repr__(self):
        return "TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)" % self._replace(
            type=self.type.name
        )


class FStringNode:
    def __init__(self, quote, raw):
        self.quote = quote
        self.raw = raw
        self.parentheses_count = 0
        self.previous_lines = ""
        self.last_string_start_pos = None
        # In the syntax there can be multiple format_spec's nested:
        # {x:{y:3}}
        self.format_spec_count = 0

    def open_parentheses(self, character):
        self.parentheses_count += 1

    def close_parentheses(self, character):
        self.parentheses_count -= 1
        if self.parentheses_count == 0:
            # No parentheses means that the format spec is also finished.
            self.format_spec_count = 0

    def allow_multiline(self):
        return len(self.quote) == 3

    def is_in_expr(self):
        return self.parentheses_count > self.format_spec_count

    def is_in_format_spec(self):
        return not self.is_in_expr() and self.format_spec_count
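

# Illustrative walk-through (a sketch, not from the original source): while
# tokenizing f"{x:{y}}", the outer '{' bumps parentheses_count to 1, the ':'
# bumps format_spec_count to 1 (so is_in_format_spec() is truthy), and the
# inner '{' raises parentheses_count to 2, which makes is_in_expr() true again
# until the matching '}' closes it.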


def _close_fstring_if_necessary(fstring_stack, string, start_pos, additional_prefix):
    for fstring_stack_index, node in enumerate(fstring_stack):
        if string.startswith(node.quote):
            token = PythonToken(
                FSTRING_END, node.quote, start_pos, prefix=additional_prefix
            )
            additional_prefix = ""
            assert not node.previous_lines
            del fstring_stack[fstring_stack_index:]
            return token, "", len(node.quote)
    return None, additional_prefix, 0


def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
    tos = fstring_stack[-1]
    allow_multiline = tos.allow_multiline()
    if tos.is_in_format_spec():
        if allow_multiline:
            regex = fstring_format_spec_multi_line
        else:
            regex = fstring_format_spec_single_line
    else:
        if tos.raw:
            regex = fstring_raw_string
        elif allow_multiline:
            regex = fstring_string_multi_line
        else:
            regex = fstring_string_single_line

    match = regex.match(line, pos)
    if match is None:
        return tos.previous_lines, pos

    if not tos.previous_lines:
        tos.last_string_start_pos = (lnum, pos)

    string = match.group(0)
    for fstring_stack_node in fstring_stack:
        end_match = endpats[fstring_stack_node.quote].match(string)
        if end_match is not None:
            string = end_match.group(0)[: -len(fstring_stack_node.quote)]

    new_pos = pos
    new_pos += len(string)
    # even if allow_multiline is False, we still need to check for trailing
    # newlines, because a single-line f-string can contain line continuations
    if string.endswith("\n") or string.endswith("\r"):
        tos.previous_lines += string
        string = ""
    else:
        string = tos.previous_lines + string

    return string, new_pos


def tokenize(
    code: str, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Generator[PythonToken, None, None]:
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info, start_pos=start_pos)
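

# Example usage (a sketch, not part of the original module); the exact token
# stream depends on the grammar version passed in:
#
#     list(tokenize("f(x)\n", PythonVersionInfo(3, 8)))
#     # -> roughly: NAME 'f', OP '(', NAME 'x', OP ')', NEWLINE '\n',
#     #    ENDMARKER ''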


def tokenize_lines(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    token_collection = _get_token_collection(version_info)
    if version_info >= PythonVersionInfo(3, 7):
        return _tokenize_lines_py37_or_above(
            lines, version_info, token_collection, start_pos=start_pos
        )
    else:
        return _tokenize_lines_py36_or_below(
            lines, version_info, token_collection, start_pos=start_pos
        )


def _tokenize_lines_py36_or_below(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, it also yields the prefix of each
    token. This idea comes from lib2to3. The prefix contains all information
    that is irrelevant for the parser, like newlines in parentheses or comments.
    """
    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    # stashed and async_* are used for async/await parsing
    stashed: Optional[PythonToken] = None
    async_def: bool = False
    async_def_indent: int = 0
    async_def_newline: bool = False

    def dedent_if_necessary(start):
        nonlocal stashed
        nonlocal async_def
        nonlocal async_def_indent
        nonlocal async_def_newline

        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            if stashed is not None:
                yield stashed
                stashed = None
            if async_def and async_def_newline and async_def_indent >= indents[-1]:
                # We exited an 'async def' block, so stop tracking its indents.
                async_def = False
                async_def_newline = False
                async_def_indent = 0
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise Exception("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise Exception("Logic error!")
                if stashed is not None:
                    raise Exception("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        if stashed is not None:
                            raise Exception("Logic error!")
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    if stashed is not None:
                        raise Exception("Logic error!")
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    yield from dedent_if_necessary(match.end())
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        if stashed is not None:
                            yield stashed
                            stashed = None
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    yield from dedent_if_necessary(indent_start)

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                if (
                                    async_def
                                    and async_def_newline
                                    and async_def_indent >= indent
                                ):
                                    # We dedented outside of an 'async def' block.
                                    async_def = False
                                    async_def_newline = False
                                    async_def_indent = 0
                                if stashed is not None:
                                    yield stashed
                                    stashed = None
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    should_yield_identifier = True
                    if token in ("async", "await") and async_def:
                        # We're inside an 'async def' block; async/await are
                        # keyword tokens here.
                        if token == "async":
                            yield PythonToken(ASYNC, token, spos, prefix)
                        else:
                            yield PythonToken(AWAIT, token, spos, prefix)
                        should_yield_identifier = False

                    # We are possibly starting an 'async def' section
                    elif token == "async" and not stashed:
                        stashed = PythonToken(NAME, token, spos, prefix)
                        should_yield_identifier = False

                    # We actually are starting an 'async def' section
                    elif (
                        token == "def"
                        and stashed is not None
                        and stashed[0] is NAME
                        and stashed[1] == "async"
                    ):
                        async_def = True
                        async_def_indent = indents[-1]
                        yield PythonToken(ASYNC, stashed[1], stashed[2], stashed[3])
                        stashed = None

                    # We are either not stashed, or we output an ASYNC token above.
                    elif stashed:
                        yield stashed
                        stashed = None

                    # If we didn't bail early due to possibly recognizing an
                    # 'async def', then we should yield this token as normal.
                    if should_yield_identifier:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    yield from _split_illegal_unicode_name(token, spos, prefix)
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    if async_def:
                        async_def_newline = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    if stashed is not None:
        yield stashed
        stashed = None

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)
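

# A sketch of why the pre-3.7 path needs the `stashed` bookkeeping above
# (illustrative, not from the original source): on 3.6 grammars, `async` is
# only a keyword when it starts an `async def`, so the token is held back
# until the following token is seen.
#
#     # "async def f(): ..."  -> ASYNC 'async', NAME 'def', NAME 'f', ...
#     # "async = 1"           -> NAME 'async', OP '=', NUMBER '1', ...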


def _tokenize_lines_py37_or_above(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, it also yields the prefix of each
    token. This idea comes from lib2to3. The prefix contains all information
    that is irrelevant for the parser, like newlines in parentheses or comments.
    """

    def dedent_if_necessary(start):
        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise Exception("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise Exception("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    for t in dedent_if_necessary(match.end()):
                        yield t
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    for t in dedent_if_necessary(indent_start):
                        yield t

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    # py37 doesn't need special tokens for async/await, and we could
                    # emit NAME, but then we'd need different grammar for py36 and py37.
                    if token == "async":
                        yield PythonToken(ASYNC, token, spos, prefix)
                    elif token == "await":
                        yield PythonToken(AWAIT, token, spos, prefix)
                    else:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    for t in _split_illegal_unicode_name(token, spos, prefix):
                        yield t  # yield from Python 2
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)
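

# Note (editorial sketch, not from the original source): on 3.7+ grammars
# `async` and `await` are proper keywords, so the branch above emits
# ASYNC/AWAIT unconditionally and no `stashed` token bookkeeping is needed;
# e.g. "async = 1" starts with an ASYNC token on this path rather than a NAME.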


def _split_illegal_unicode_name(
    token: str, start_pos: Tuple[int, int], prefix: str
) -> Generator[PythonToken, None, None]:
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)

    found = ""
    is_illegal = False
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if str.isidentifier(char):
                yield create_token()
                found = char
                is_illegal = False
                prefix = ""
                pos = start_pos[0], start_pos[1] + i
            else:
                found += char
        else:
            new_found = found + char
            if str.isidentifier(new_found):
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ""
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()
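

# Illustrative sketch of the splitting behaviour above (not from the original
# source): characters that cannot continue an identifier are grouped into
# ERRORTOKEN chunks while the legal pieces stay NAME tokens.
#
#     list(_split_illegal_unicode_name("ab§cd", (1, 0), ""))
#     # -> roughly: NAME 'ab' at (1, 0), ERRORTOKEN '§' at (1, 2),
#     #    NAME 'cd' at (1, 3)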