# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
#
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
#
# A fork of `parso.python.tokenize`.
# https://github.com/davidhalter/parso/blob/master/parso/python/tokenize.py
#
# The following changes were made:
# - Changes to be compatible with PythonTokenTypes
# - Removed main section
# - Applied type stubs directly
# - Removed Python 2 shims
# - Added support for Python 3.6 ASYNC/AWAIT hacks
#
# -*- coding: utf-8 -*-
# This tokenizer has been copied from the ``tokenize.py`` standard library
# tokenizer. The reason was simple: The standard library tokenizer fails
# if the indentation is not right. To make it possible to do error recovery the
# tokenizer needed to be rewritten.
#
# Basically this is a stripped down version of the standard library module, so
# you can read the documentation there. Additionally we included some speed and
# memory optimizations here.
# pyre-unsafe
from __future__ import absolute_import

import itertools as _itertools
import re
import sys
from codecs import BOM_UTF8
from collections import namedtuple
from dataclasses import dataclass
from typing import Dict, Generator, Iterable, Optional, Pattern, Set, Tuple

from libcst import CSTLogicError
from libcst._parser.parso.python.token import PythonTokenTypes
from libcst._parser.parso.utils import PythonVersionInfo, split_lines

# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = "\U0010ffff"
BOM_UTF8_STRING = BOM_UTF8.decode("utf-8")

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ASYNC = PythonTokenTypes.ASYNC
AWAIT = PythonTokenTypes.AWAIT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


@dataclass(frozen=True)
class TokenCollection:
    pseudo_token: Pattern
    single_quoted: Set[str]
    triple_quoted: Set[str]
    endpats: Dict[str, Pattern]
    whitespace: Pattern
    fstring_pattern_map: Dict[str, str]
    always_break_tokens: Set[str]


_token_collection_cache: Dict[PythonVersionInfo, TokenCollection] = {}


def group(*choices: str, **kwargs: object) -> str:
    capture = kwargs.pop("capture", False)  # Python 2, arrghhhhh :(
    assert not kwargs

    start = "("
    if not capture:
        start += "?:"
    return start + "|".join(choices) + ")"


def maybe(*choices: str) -> str:
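    # For example, maybe("abc") produces "(?:abc)?"; group("a", "b") produces
    # the non-capturing alternation "(?:a|b)", or "(a|b)" with capture=True.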
    return group(*choices) + "?"


# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(
    version_info: PythonVersionInfo,
    include_fstring: bool = False,
    only_fstring: bool = False,
) -> Set[str]:
    def different_case_versions(prefix):
        for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
            yield "".join(s)

    # The valid string prefixes. Only the lower case versions are listed
    # here, and no permutations (e.g. 'fr' but not 'rf'); the various
    # permutations will be generated below.
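    # For example, the prefix "br" expands to "br", "bR", "Br", "BR",
    # "rb", "rB", "Rb" and "RB".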
    valid_string_prefixes = ["b", "r"]
    if version_info >= (3, 0):
        valid_string_prefixes.append("br")
    if version_info < (3, 0) or version_info >= (3, 3):
        valid_string_prefixes.append("u")

    result = {""}
    if version_info >= (3, 6) and include_fstring:
        f = ["f", "fr"]
        if only_fstring:
            valid_string_prefixes = f
            result = set()
        else:
            valid_string_prefixes += f
    elif only_fstring:
        return set()

    # if we add binary f-strings, add: ['fb', 'fbr']
    for prefix in valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            result.update(different_case_versions(t))
    if version_info <= (2, 7):
        # In Python 2 the order cannot just be random.
        result.update(different_case_versions("ur"))
        result.update(different_case_versions("br"))
    return result


def _compile(expr: str) -> Pattern:
    return re.compile(expr, re.UNICODE)


def _get_token_collection(version_info: PythonVersionInfo) -> TokenCollection:
    try:
        return _token_collection_cache[version_info]
    except KeyError:
        _token_collection_cache[version_info] = result = _create_token_collection(
            version_info
        )
        return result


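# Patterns for the literal (non-expression) parts of an f-string. Doubled
# braces ("{{" and "}}") are literal text; a single "{" or "}" ends the match
# so that the expression / format spec handling below can take over.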
fstring_raw_string = _compile(r"(?:[^{}]+|\{\{|\}\})+")

unicode_character_name = r"[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*"
fstring_string_single_line = _compile(
    r"(?:\{\{|\}\}|\\N\{"
    + unicode_character_name
    + r"\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+"
)
fstring_string_multi_line = _compile(
    r"(?:\{\{|\}\}|\\N\{" + unicode_character_name + r"\}|\\[^N]|[^{}\\])+"
)

fstring_format_spec_single_line = _compile(r"(?:\\(?:\r\n?|\n)|[^{}\r\n])+")
fstring_format_spec_multi_line = _compile(r"[^{}]+")


def _create_token_collection(  # noqa: C901
    version_info: PythonVersionInfo,
) -> TokenCollection:
    # Note: we use unicode matching for names ("\w") but ascii matching for
    # number literals.
    Whitespace = r"[ \f\t]*"
    Comment = r"#[^\r\n]*"
    # Python 2 is pretty much not working properly anymore; we just skip
    # parsing unicode properly, which is fine, I guess.
    if version_info.major == 2:
        Name = r"([A-Za-z_0-9]+)"
    elif sys.version_info[0] == 2:
        # Unfortunately the regex engine cannot deal with the regex below, so
        # just use this one.
        Name = r"(\w+)"
    else:
        Name = "([A-Za-z_0-9\u0080-" + MAX_UNICODE + "]+)"

    if version_info >= (3, 6):
        Hexnumber = r"0[xX](?:_?[0-9a-fA-F])+"
        Binnumber = r"0[bB](?:_?[01])+"
        Octnumber = r"0[oO](?:_?[0-7])+"
        Decnumber = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        Exponent = r"[eE][-+]?[0-9](?:_?[0-9])*"
        Pointfloat = group(
            r"[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?", r"\.[0-9](?:_?[0-9])*"
        ) + maybe(Exponent)
        Expfloat = r"[0-9](?:_?[0-9])*" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9](?:_?[0-9])*[jJ]", Floatnumber + r"[jJ]")
    else:
        Hexnumber = r"0[xX][0-9a-fA-F]+"
        Binnumber = r"0[bB][01]+"
        if version_info >= (3, 0):
            Octnumber = r"0[oO][0-7]+"
        else:
            Octnumber = "0[oO]?[0-7]+"
        Decnumber = r"(?:0+|[1-9][0-9]*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        if version_info.major < 3:
            Intnumber += "[lL]?"
        Exponent = r"[eE][-+]?[0-9]+"
        Pointfloat = group(r"[0-9]+\.[0-9]*", r"\.[0-9]+") + maybe(Exponent)
        Expfloat = r"[0-9]+" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9]+[jJ]", Floatnumber + r"[jJ]")
    Number = group(Imagnumber, Floatnumber, Intnumber)

    # Note that since _all_string_prefixes includes the empty string,
    # StringPrefix can be the empty string (making it optional).
    possible_prefixes = _all_string_prefixes(version_info)
    StringPrefix = group(*possible_prefixes)
    StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
    fstring_prefixes = _all_string_prefixes(
        version_info, include_fstring=True, only_fstring=True
    )
    FStringStart = group(*fstring_prefixes)

    # Tail end of ' string.
    Single = r"(?:\\.|[^'\\])*'"
    # Tail end of " string.
    Double = r'(?:\\.|[^"\\])*"'
    # Tail end of ''' string.
    Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''"
    # Tail end of """ string.
    Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""'
    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')

    # Because of leftmost-then-longest match semantics, be sure to put the
    # longest operators first (e.g., if = came before ==, == would get
    # recognized as two instances of =).
    Operator = group(
        r"\*\*=?", r">>=?", r"<<=?", r"//=?", r"->", r"[+\-*/%&@`|^!=<>]=?", r"~"
    )

    Bracket = "[][(){}]"

    special_args = [r"\r\n?", r"\n", r"[;.,@]"]
    if version_info >= (3, 0):
        special_args.insert(0, r"\.\.\.")
    if version_info >= (3, 8):
        special_args.insert(0, ":=?")
    else:
        special_args.insert(0, ":")
    Special = group(*special_args)

    Funny = group(Operator, Bracket, Special)

    # First (or only) line of ' or " string.
    ContStr = group(
        StringPrefix
        + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*"
        + group("'", r"\\(?:\r\n?|\n)"),
        StringPrefix
        + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*'
        + group('"', r"\\(?:\r\n?|\n)"),
    )
    pseudo_extra_pool = [Comment, Triple]
    all_quotes = '"', "'", '"""', "'''"
    if fstring_prefixes:
        pseudo_extra_pool.append(FStringStart + group(*all_quotes))

    PseudoExtras = group(r"\\(?:\r\n?|\n)|\Z", *pseudo_extra_pool)
    PseudoToken = group(Whitespace, capture=True) + group(
        PseudoExtras, Number, Funny, ContStr, Name, capture=True
    )

    # For a given string prefix plus quotes, endpats maps it to a regex
    # to match the remainder of that string. _prefix can be empty, for
    # a normal single or triple quoted string (with no prefix).
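    # For example, endpats["r'"] is the pattern for the remainder of an
    # r'...' literal, up to and including the closing quote.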
    endpats = {}
    for _prefix in possible_prefixes:
        endpats[_prefix + "'"] = _compile(Single)
        endpats[_prefix + '"'] = _compile(Double)
        endpats[_prefix + "'''"] = _compile(Single3)
        endpats[_prefix + '"""'] = _compile(Double3)

    # A set of all of the single and triple quoted string prefixes,
    # including the opening quotes.
    single_quoted = set()
    triple_quoted = set()
    fstring_pattern_map = {}
    for t in possible_prefixes:
        for quote in '"', "'":
            single_quoted.add(t + quote)

        for quote in '"""', "'''":
            triple_quoted.add(t + quote)

    for t in fstring_prefixes:
        for quote in all_quotes:
            fstring_pattern_map[t + quote] = quote

    pseudo_token_compiled = _compile(PseudoToken)
    return TokenCollection(
        pseudo_token_compiled,
        single_quoted,
        triple_quoted,
        endpats,
        _compile(Whitespace),
        fstring_pattern_map,
        {
            ";",
            "import",
            "class",
            "def",
            "try",
            "except",
            "finally",
            "while",
            "with",
            "return",
        },
    )


class Token(namedtuple("Token", ["type", "string", "start_pos", "prefix"])):
    @property
    def end_pos(self):
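        # For a token spanning multiple lines (e.g. a triple-quoted string)
        # the end column is reported as 0 on the last line; otherwise it is
        # the start column plus the token length.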
        lines = split_lines(self.string)
        if len(lines) > 1:
            return self.start_pos[0] + len(lines) - 1, 0
        else:
            return self.start_pos[0], self.start_pos[1] + len(self.string)


class PythonToken(Token):
    def __repr__(self):
        return "TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)" % self._replace(
            type=self.type.name
        )


class FStringNode:
    def __init__(self, quote, raw):
        self.quote = quote
        self.raw = raw
        self.parentheses_count = 0
        self.previous_lines = ""
        self.last_string_start_pos = None
        # In the syntax there can be multiple format_spec's nested:
        # {x:{y:3}}
        self.format_spec_count = 0

    def open_parentheses(self, character):
        self.parentheses_count += 1

    def close_parentheses(self, character):
        self.parentheses_count -= 1
        if self.parentheses_count == 0:
            # No parentheses means that the format spec is also finished.
            self.format_spec_count = 0

    def allow_multiline(self):
        return len(self.quote) == 3

    def is_in_expr(self):
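        # For example, in f"{a:{b}}", right after the inner "{" we have
        # parentheses_count == 2 and format_spec_count == 1, so we are back
        # inside an expression.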
        return self.parentheses_count > self.format_spec_count

    def is_in_format_spec(self):
        return not self.is_in_expr() and self.format_spec_count


def _close_fstring_if_necessary(fstring_stack, string, start_pos, additional_prefix):
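    # If the remaining text starts with the closing quote of any f-string on
    # the stack, emit FSTRING_END for it and drop that node (and everything
    # nested inside it) from the stack.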
    for fstring_stack_index, node in enumerate(fstring_stack):
        if string.startswith(node.quote):
            token = PythonToken(
                FSTRING_END, node.quote, start_pos, prefix=additional_prefix
            )
            additional_prefix = ""
            assert not node.previous_lines
            del fstring_stack[fstring_stack_index:]
            return token, "", len(node.quote)
    return None, additional_prefix, 0


def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
    tos = fstring_stack[-1]
    allow_multiline = tos.allow_multiline()
    if tos.is_in_format_spec():
        if allow_multiline:
            regex = fstring_format_spec_multi_line
        else:
            regex = fstring_format_spec_single_line
    else:
        if tos.raw:
            regex = fstring_raw_string
        elif allow_multiline:
            regex = fstring_string_multi_line
        else:
            regex = fstring_string_single_line

    match = regex.match(line, pos)
    if match is None:
        return tos.previous_lines, pos

    if not tos.previous_lines:
        tos.last_string_start_pos = (lnum, pos)

    string = match.group(0)
    for fstring_stack_node in fstring_stack:
        end_match = endpats[fstring_stack_node.quote].match(string)
        if end_match is not None:
            string = end_match.group(0)[: -len(fstring_stack_node.quote)]

    new_pos = pos
    new_pos += len(string)
    # even if allow_multiline is False, we still need to check for trailing
    # newlines, because a single-line f-string can contain line continuations
    if string.endswith("\n") or string.endswith("\r"):
        tos.previous_lines += string
        string = ""
    else:
        string = tos.previous_lines + string

    return string, new_pos


def tokenize(
    code: str, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Generator[PythonToken, None, None]:
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info, start_pos=start_pos)
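
# A minimal usage sketch (the version number is illustrative):
#
#   for t in tokenize("x = 1\n", PythonVersionInfo(3, 8)):
#       print(t.type, repr(t.string), t.start_pos, repr(t.prefix))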


def tokenize_lines(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    token_collection = _get_token_collection(version_info)
    if version_info >= PythonVersionInfo(3, 7):
        return _tokenize_lines_py37_or_above(
            lines, version_info, token_collection, start_pos=start_pos
        )
    else:
        return _tokenize_lines_py36_or_below(
            lines, version_info, token_collection, start_pos=start_pos
        )


def _tokenize_lines_py36_or_below(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, this also yields the prefix of
    each token. This idea comes from lib2to3. The prefix contains all the
    information that is irrelevant for the parser, such as newlines in
    parentheses or comments.
    """

    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    # stash and async_* are used for async/await parsing
    stashed: Optional[PythonToken] = None
    async_def: bool = False
    async_def_indent: int = 0
    async_def_newline: bool = False

    def dedent_if_necessary(start):
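        # Pops indentation levels until indents[-1] <= start, yielding a DEDENT
        # for each popped level (flushing any stashed token first). If start
        # does not line up with an existing level, a single ERROR_DEDENT is
        # yielded instead.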
        nonlocal stashed
        nonlocal async_def
        nonlocal async_def_indent
        nonlocal async_def_newline

        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            if stashed is not None:
                yield stashed
                stashed = None
            if async_def and async_def_newline and async_def_indent >= indents[-1]:
                # We exited an 'async def' block, so stop tracking for indents
                async_def = False
                async_def_newline = False
                async_def_indent = 0
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise CSTLogicError("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise CSTLogicError("Logic error!")
                if stashed is not None:
                    raise CSTLogicError("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        if stashed is not None:
                            raise CSTLogicError("Logic error!")
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    if stashed is not None:
                        raise CSTLogicError("Logic error!")
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    yield from dedent_if_necessary(match.end())
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        if stashed is not None:
                            yield stashed
                            stashed = None
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    yield from dedent_if_necessary(indent_start)

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                if (
                                    async_def
                                    and async_def_newline
                                    and async_def_indent >= indent
                                ):
                                    # We dedented outside of an 'async def' block.
                                    async_def = False
                                    async_def_newline = False
                                    async_def_indent = 0
                                if stashed is not None:
                                    yield stashed
                                    stashed = None
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    should_yield_identifier = True
                    if token in ("async", "await") and async_def:
                        # We're inside an 'async def' block, all async/await are
                        # tokens.
                        if token == "async":
                            yield PythonToken(ASYNC, token, spos, prefix)
                        else:
                            yield PythonToken(AWAIT, token, spos, prefix)
                        should_yield_identifier = False

                    # We are possibly starting an 'async def' section
                    elif token == "async" and not stashed:
                        stashed = PythonToken(NAME, token, spos, prefix)
                        should_yield_identifier = False

                    # We actually are starting an 'async def' section
                    elif (
                        token == "def"
                        and stashed is not None
                        and stashed[0] is NAME
                        and stashed[1] == "async"
                    ):
                        async_def = True
                        async_def_indent = indents[-1]
                        yield PythonToken(ASYNC, stashed[1], stashed[2], stashed[3])
                        stashed = None

                    # We are either not stashed, or we output an ASYNC token above.
                    elif stashed:
                        yield stashed
                        stashed = None

                    # If we didn't bail early due to possibly recognizing an 'async def',
                    # then we should yield this token as normal.
                    if should_yield_identifier:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    yield from _split_illegal_unicode_name(token, spos, prefix)
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    if async_def:
                        async_def_newline = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    if stashed is not None:
        yield stashed
        stashed = None

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)


def _tokenize_lines_py37_or_above(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, this also yields the prefix of
    each token. This idea comes from lib2to3. The prefix contains all the
    information that is irrelevant for the parser, such as newlines in
    parentheses or comments.
    """

    def dedent_if_necessary(start):
        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise CSTLogicError("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise CSTLogicError("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    for t in dedent_if_necessary(match.end()):
                        yield t
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    for t in dedent_if_necessary(indent_start):
                        yield t

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    # py37 doesn't need special tokens for async/await, and we could
                    # emit NAME, but then we'd need different grammar for py36 and py37.
                    if token == "async":
                        yield PythonToken(ASYNC, token, spos, prefix)
                    elif token == "await":
                        yield PythonToken(AWAIT, token, spos, prefix)
                    else:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    for t in _split_illegal_unicode_name(token, spos, prefix):
                        yield t  # yield from Python 2
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)


def _split_illegal_unicode_name(
    token: str, start_pos: Tuple[int, int], prefix: str
) -> Generator[PythonToken, None, None]:
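    # Splits a "name" that is not a valid identifier into alternating NAME and
    # ERRORTOKEN pieces, e.g. "abc$$def" yields NAME "abc", ERRORTOKEN "$$"
    # and NAME "def".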
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)

    found = ""
    is_illegal = False
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if str.isidentifier(char):
                yield create_token()
                found = char
                is_illegal = False
                prefix = ""
                pos = start_pos[0], start_pos[1] + i
            else:
                found += char
        else:
            new_found = found + char
            if str.isidentifier(new_found):
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ""
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()