Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/libcst/_parser/parso/python/tokenize.py: 10%


684 statements  

# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
#
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
#
# A fork of `parso.python.tokenize`.
# https://github.com/davidhalter/parso/blob/master/parso/python/tokenize.py
#
# The following changes were made:
# - Changes to be compatible with PythonTokenTypes
# - Removed main section
# - Applied type stubs directly
# - Removed Python 2 shims
# - Added support for Python 3.6 ASYNC/AWAIT hacks
#
# -*- coding: utf-8 -*-
# This tokenizer has been copied from the ``tokenize.py`` standard library
# tokenizer. The reason was simple: the standard library tokenizer fails
# if the indentation is not right. To make error recovery possible, the
# tokenizer needed to be rewritten.
#
# Basically this is a stripped down version of the standard library module, so
# you can read the documentation there. Additionally we included some speed and
# memory optimizations here.
# pyre-unsafe
from __future__ import absolute_import

import itertools as _itertools
import re
import sys
from codecs import BOM_UTF8
from collections import namedtuple
from dataclasses import dataclass
from typing import Dict, Generator, Iterable, Optional, Pattern, Set, Tuple

from libcst import CSTLogicError
from libcst._parser.parso.python.token import PythonTokenTypes
from libcst._parser.parso.utils import PythonVersionInfo, split_lines

# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = "\U0010ffff"
BOM_UTF8_STRING = BOM_UTF8.decode("utf-8")

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ASYNC = PythonTokenTypes.ASYNC
AWAIT = PythonTokenTypes.AWAIT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


@dataclass(frozen=True)
class TokenCollection:
    pseudo_token: Pattern
    single_quoted: Set[str]
    triple_quoted: Set[str]
    endpats: Dict[str, Pattern]
    whitespace: Pattern
    fstring_pattern_map: Dict[str, str]
    always_break_tokens: Set[str]


_token_collection_cache: Dict[PythonVersionInfo, TokenCollection] = {}


def group(*choices: str, **kwargs: object) -> str:
    capture = kwargs.pop("capture", False)  # Python 2, arrghhhhh :(
    assert not kwargs

    start = "("
    if not capture:
        start += "?:"
    return start + "|".join(choices) + ")"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"
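
# A quick illustration of what these helpers produce (not part of the original
# module; the expansions below were checked by hand):
#
#     group("a", "b")                 -> "(?:a|b)"
#     group("a", "b", capture=True)   -> "(a|b)"
#     maybe("x")                      -> "(?:x)?"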


# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(
    version_info: PythonVersionInfo,
    include_fstring: bool = False,
    only_fstring: bool = False,
) -> Set[str]:
    def different_case_versions(prefix):
        for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
            yield "".join(s)

    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    valid_string_prefixes = ["b", "r"]
    if version_info >= (3, 0):
        valid_string_prefixes.append("br")
    if version_info < (3, 0) or version_info >= (3, 3):
        valid_string_prefixes.append("u")

    result = {""}
    if version_info >= (3, 6) and include_fstring:
        f = ["f", "fr"]
        if only_fstring:
            valid_string_prefixes = f
            result = set()
        else:
            valid_string_prefixes += f
    elif only_fstring:
        return set()

    # if we add binary f-strings, add: ['fb', 'fbr']
    for prefix in valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            result.update(different_case_versions(t))
    if version_info <= (2, 7):
        # In Python 2 the order cannot just be random.
        result.update(different_case_versions("ur"))
        result.update(different_case_versions("br"))
    return result
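
# A hand-checked sketch of the output: for a (3, 8) version_info this returns
# the empty string plus every case/order permutation of 'b', 'r', 'br' and
# 'u', e.g. {'', 'b', 'B', 'r', 'R', 'u', 'U', 'br', 'bR', 'Br', 'BR', 'rb',
# 'rB', 'Rb', 'RB'}, while include_fstring=True with only_fstring=True yields
# just the f-string prefixes: {'f', 'F', 'fr', 'fR', 'Fr', 'FR', 'rf', 'rF',
# 'Rf', 'RF'}.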


def _compile(expr: str) -> Pattern:
    return re.compile(expr, re.UNICODE)


def _get_token_collection(version_info: PythonVersionInfo) -> TokenCollection:
    try:
        return _token_collection_cache[version_info]
    except KeyError:
        _token_collection_cache[version_info] = result = _create_token_collection(
            version_info
        )
        return result


fstring_raw_string = _compile(r"(?:[^{}]+|\{\{|\}\})+")

unicode_character_name = r"[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*"
fstring_string_single_line = _compile(
    r"(?:\{\{|\}\}|\\N\{"
    + unicode_character_name
    + r"\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+"
)
fstring_string_multi_line = _compile(
    r"(?:\{\{|\}\}|\\N\{" + unicode_character_name + r"\}|\\[^N]|[^{}\\])+"
)

fstring_format_spec_single_line = _compile(r"(?:\\(?:\r\n?|\n)|[^{}\r\n])+")
fstring_format_spec_multi_line = _compile(r"[^{}]+")


def _create_token_collection(  # noqa: C901
    version_info: PythonVersionInfo,
) -> TokenCollection:
    # Note: we use unicode matching for names ("\w") but ascii matching for
    # number literals.
    Whitespace = r"[ \f\t]*"
    Comment = r"#[^\r\n]*"
    # Python 2 is pretty much not working properly anymore, we just ignore
    # parsing unicode properly, which is fine, I guess.
    if version_info.major == 2:
        Name = r"([A-Za-z_0-9]+)"
    elif sys.version_info[0] == 2:
        # Unfortunately the regex engine cannot deal with the regex below, so
        # just use this one.
        Name = r"(\w+)"
    else:
        Name = "([A-Za-z_0-9\u0080-" + MAX_UNICODE + "]+)"

    if version_info >= (3, 6):
        Hexnumber = r"0[xX](?:_?[0-9a-fA-F])+"
        Binnumber = r"0[bB](?:_?[01])+"
        Octnumber = r"0[oO](?:_?[0-7])+"
        Decnumber = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        Exponent = r"[eE][-+]?[0-9](?:_?[0-9])*"
        Pointfloat = group(
            r"[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?", r"\.[0-9](?:_?[0-9])*"
        ) + maybe(Exponent)
        Expfloat = r"[0-9](?:_?[0-9])*" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9](?:_?[0-9])*[jJ]", Floatnumber + r"[jJ]")
    else:
        Hexnumber = r"0[xX][0-9a-fA-F]+"
        Binnumber = r"0[bB][01]+"
        if version_info >= (3, 0):
            Octnumber = r"0[oO][0-7]+"
        else:
            Octnumber = "0[oO]?[0-7]+"
        Decnumber = r"(?:0+|[1-9][0-9]*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        if version_info.major < 3:
            Intnumber += "[lL]?"
        Exponent = r"[eE][-+]?[0-9]+"
        Pointfloat = group(r"[0-9]+\.[0-9]*", r"\.[0-9]+") + maybe(Exponent)
        Expfloat = r"[0-9]+" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9]+[jJ]", Floatnumber + r"[jJ]")
    Number = group(Imagnumber, Floatnumber, Intnumber)

    # Note that since _all_string_prefixes includes the empty string,
    # StringPrefix can be the empty string (making it optional).
    possible_prefixes = _all_string_prefixes(version_info)
    StringPrefix = group(*possible_prefixes)
    StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
    fstring_prefixes = _all_string_prefixes(
        version_info, include_fstring=True, only_fstring=True
    )
    FStringStart = group(*fstring_prefixes)

    # Tail end of ' string.
    Single = r"(?:\\.|[^'\\])*'"
    # Tail end of " string.
    Double = r'(?:\\.|[^"\\])*"'
    # Tail end of ''' string.
    Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''"
    # Tail end of """ string.
    Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""'
    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')

    # Because of leftmost-then-longest match semantics, be sure to put the
    # longest operators first (e.g., if = came before ==, == would get
    # recognized as two instances of =).
    Operator = group(
        r"\*\*=?", r">>=?", r"<<=?", r"//=?", r"->", r"[+\-*/%&@`|^!=<>]=?", r"~"
    )

    Bracket = "[][(){}]"

    special_args = [r"\r\n?", r"\n", r"[;.,@]"]
    if version_info >= (3, 0):
        special_args.insert(0, r"\.\.\.")
    if version_info >= (3, 8):
        special_args.insert(0, ":=?")
    else:
        special_args.insert(0, ":")
    Special = group(*special_args)

    Funny = group(Operator, Bracket, Special)

    # First (or only) line of ' or " string.
    ContStr = group(
        StringPrefix
        + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*"
        + group("'", r"\\(?:\r\n?|\n)"),
        StringPrefix
        + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*'
        + group('"', r"\\(?:\r\n?|\n)"),
    )
    pseudo_extra_pool = [Comment, Triple]
    all_quotes = '"', "'", '"""', "'''"
    if fstring_prefixes:
        pseudo_extra_pool.append(FStringStart + group(*all_quotes))

    PseudoExtras = group(r"\\(?:\r\n?|\n)|\Z", *pseudo_extra_pool)
    PseudoToken = group(Whitespace, capture=True) + group(
        PseudoExtras, Number, Funny, ContStr, Name, capture=True
    )

    # For a given string prefix plus quotes, endpats maps it to a regex
    # to match the remainder of that string. _prefix can be empty, for
    # a normal single or triple quoted string (with no prefix).
    endpats = {}
    for _prefix in possible_prefixes:
        endpats[_prefix + "'"] = _compile(Single)
        endpats[_prefix + '"'] = _compile(Double)
        endpats[_prefix + "'''"] = _compile(Single3)
        endpats[_prefix + '"""'] = _compile(Double3)

    # A set of all of the single and triple quoted string prefixes,
    # including the opening quotes.
    single_quoted = set()
    triple_quoted = set()
    fstring_pattern_map = {}
    for t in possible_prefixes:
        for quote in '"', "'":
            single_quoted.add(t + quote)

        for quote in '"""', "'''":
            triple_quoted.add(t + quote)

    for t in fstring_prefixes:
        for quote in all_quotes:
            fstring_pattern_map[t + quote] = quote

    pseudo_token_compiled = _compile(PseudoToken)
    return TokenCollection(
        pseudo_token_compiled,
        single_quoted,
        triple_quoted,
        endpats,
        _compile(Whitespace),
        fstring_pattern_map,
        {
            ";",
            "import",
            "class",
            "def",
            "try",
            "except",
            "finally",
            "while",
            "with",
            "return",
        },
    )
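
# Rough shape of the result, hand-checked for a (3, 8) version_info:
# single_quoted holds openings like "'", '"', "b'" or 'rb"'; triple_quoted
# holds the same prefixes with tripled quotes; endpats maps each opening
# (prefix + quote) to the regex matching the remainder of that string; and
# fstring_pattern_map maps openings like 'f"' to their bare quote, '"'.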


class Token(namedtuple("Token", ["type", "string", "start_pos", "prefix"])):
    @property
    def end_pos(self):
        lines = split_lines(self.string)
        if len(lines) > 1:
            return self.start_pos[0] + len(lines) - 1, 0
        else:
            return self.start_pos[0], self.start_pos[1] + len(self.string)
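
# end_pos, hand-evaluated against the property above (illustration only):
#
#     Token(NAME, "foo", (1, 0), "").end_pos      -> (1, 3)
#     Token(STRING, '"a\nb"', (1, 4), "").end_pos -> (2, 0)
#
# Note that the multi-line case reports column 0 rather than the length of
# the last line.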


class PythonToken(Token):
    def __repr__(self):
        return "TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)" % self._replace(
            type=self.type.name
        )


class FStringNode:
    def __init__(self, quote, raw):
        self.quote = quote
        self.raw = raw
        self.parentheses_count = 0
        self.previous_lines = ""
        self.last_string_start_pos = None
        # In the syntax there can be multiple format_spec's nested:
        # {x:{y:3}}
        self.format_spec_count = 0

    def open_parentheses(self, character):
        self.parentheses_count += 1

    def close_parentheses(self, character):
        self.parentheses_count -= 1
        if self.parentheses_count == 0:
            # No parentheses means that the format spec is also finished.
            self.format_spec_count = 0

    def allow_multiline(self):
        return len(self.quote) == 3

    def is_in_expr(self):
        return self.parentheses_count > self.format_spec_count

    def is_in_format_spec(self):
        return not self.is_in_expr() and self.format_spec_count
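
# How the two counters above interact, hand-traced on f"{x:{y:3}}": the outer
# "{" makes parentheses_count 1; the first ":" satisfies
# parentheses_count - format_spec_count == 1 (see the OP handling in the
# tokenizer loops below), so format_spec_count becomes 1 and
# is_in_format_spec() turns true; the nested "{" raises parentheses_count to
# 2, putting us back in an expression; the closing "}"s then unwind both.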


def _close_fstring_if_necessary(fstring_stack, string, start_pos, additional_prefix):
    for fstring_stack_index, node in enumerate(fstring_stack):
        if string.startswith(node.quote):
            token = PythonToken(
                FSTRING_END, node.quote, start_pos, prefix=additional_prefix
            )
            additional_prefix = ""
            assert not node.previous_lines
            del fstring_stack[fstring_stack_index:]
            return token, "", len(node.quote)
    return None, additional_prefix, 0


def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
    tos = fstring_stack[-1]
    allow_multiline = tos.allow_multiline()
    if tos.is_in_format_spec():
        if allow_multiline:
            regex = fstring_format_spec_multi_line
        else:
            regex = fstring_format_spec_single_line
    else:
        if tos.raw:
            regex = fstring_raw_string
        elif allow_multiline:
            regex = fstring_string_multi_line
        else:
            regex = fstring_string_single_line

    match = regex.match(line, pos)
    if match is None:
        return tos.previous_lines, pos

    if not tos.previous_lines:
        tos.last_string_start_pos = (lnum, pos)

    string = match.group(0)
    for fstring_stack_node in fstring_stack:
        end_match = endpats[fstring_stack_node.quote].match(string)
        if end_match is not None:
            string = end_match.group(0)[: -len(fstring_stack_node.quote)]

    new_pos = pos
    new_pos += len(string)
    # even if allow_multiline is False, we still need to check for trailing
    # newlines, because a single-line f-string can contain line continuations
    if string.endswith("\n") or string.endswith("\r"):
        tos.previous_lines += string
        string = ""
    else:
        string = tos.previous_lines + string

    return string, new_pos
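
# Hand-traced example: scanning f"a{b}" right after the FSTRING_START token,
# the single-line string regex matches "a" and stops at "{" (the expression
# inside is tokenized normally), so this returns ("a", pos + 1) and the
# caller emits an FSTRING_STRING token for "a"; the endpats loop trims a
# closing quote off the match when the string ends on this line.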


def tokenize(
    code: str, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Generator[PythonToken, None, None]:
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info, start_pos=start_pos)
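
# Example usage, hand-written for illustration:
#
#     >>> for t in tokenize("f(1)\n", PythonVersionInfo(3, 8)):
#     ...     print(t.type.name, repr(t.string))
#     NAME 'f'
#     OP '('
#     NUMBER '1'
#     OP ')'
#     NEWLINE '\n'
#     ENDMARKER ''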


def tokenize_lines(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    token_collection = _get_token_collection(version_info)
    if version_info >= PythonVersionInfo(3, 7):
        return _tokenize_lines_py37_or_above(
            lines, version_info, token_collection, start_pos=start_pos
        )
    else:
        return _tokenize_lines_py36_or_below(
            lines, version_info, token_collection, start_pos=start_pos
        )


def _tokenize_lines_py36_or_below(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, this also yields the prefix of
    each token. This idea comes from lib2to3. The prefix contains all the
    information that is irrelevant to the parser, like newlines in
    parentheses or comments.
    """

    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    # stash and async_* are used for async/await parsing
    stashed: Optional[PythonToken] = None
    async_def: bool = False
    async_def_indent: int = 0
    async_def_newline: bool = False

    def dedent_if_necessary(start):
        nonlocal stashed
        nonlocal async_def
        nonlocal async_def_indent
        nonlocal async_def_newline

        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            if stashed is not None:
                yield stashed
                stashed = None
            if async_def and async_def_newline and async_def_indent >= indents[-1]:
                # We exited an 'async def' block, so stop tracking for indents
                async_def = False
                async_def_newline = False
                async_def_indent = 0
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise CSTLogicError("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise CSTLogicError("Logic error!")
                if stashed is not None:
                    raise CSTLogicError("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        if stashed is not None:
                            raise CSTLogicError("Logic error!")
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    if stashed is not None:
                        raise CSTLogicError("Logic error!")
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    yield from dedent_if_necessary(match.end())
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        if stashed is not None:
                            yield stashed
                            stashed = None
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    yield from dedent_if_necessary(indent_start)

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                if (
                                    async_def
                                    and async_def_newline
                                    and async_def_indent >= indent
                                ):
                                    # We dedented outside of an 'async def' block.
                                    async_def = False
                                    async_def_newline = False
                                    async_def_indent = 0
                                if stashed is not None:
                                    yield stashed
                                    stashed = None
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    should_yield_identifier = True
                    if token in ("async", "await") and async_def:
                        # We're inside an 'async def' block, all async/await are
                        # tokens.
                        if token == "async":
                            yield PythonToken(ASYNC, token, spos, prefix)
                        else:
                            yield PythonToken(AWAIT, token, spos, prefix)
                        should_yield_identifier = False

                    # We are possibly starting an 'async def' section
                    elif token == "async" and not stashed:
                        stashed = PythonToken(NAME, token, spos, prefix)
                        should_yield_identifier = False

                    # We actually are starting an 'async def' section
                    elif (
                        token == "def"
                        and stashed is not None
                        and stashed[0] is NAME
                        and stashed[1] == "async"
                    ):
                        async_def = True
                        async_def_indent = indents[-1]
                        yield PythonToken(ASYNC, stashed[1], stashed[2], stashed[3])
                        stashed = None

                    # We are either not stashed, or we output an ASYNC token above.
                    elif stashed:
                        yield stashed
                        stashed = None

                    # If we didn't bail early due to possibly recognizing an
                    # 'async def', then we should yield this token as normal.
                    if should_yield_identifier:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    yield from _split_illegal_unicode_name(token, spos, prefix)
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    if async_def:
                        async_def_newline = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    if stashed is not None:
        yield stashed
        stashed = None

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)
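
# A hand-traced note on the stashing above (pre-3.7 path): for the source
# "async def f(): pass", the "async" NAME is stashed, the following "def"
# confirms an 'async def' header, and the stash is emitted as an ASYNC token;
# for "async = 3" the next token is an OP, so the stash is flushed as a plain
# NAME instead.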


def _tokenize_lines_py37_or_above(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, this also yields the prefix of
    each token. This idea comes from lib2to3. The prefix contains all the
    information that is irrelevant to the parser, like newlines in
    parentheses or comments.
    """

    def dedent_if_necessary(start):
        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise CSTLogicError("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise CSTLogicError("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    for t in dedent_if_necessary(match.end()):
                        yield t
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    for t in dedent_if_necessary(indent_start):
                        yield t

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    # py37 doesn't need special tokens for async/await, and we could
                    # emit NAME, but then we'd need different grammar for py36 and py37.
                    if token == "async":
                        yield PythonToken(ASYNC, token, spos, prefix)
                    elif token == "await":
                        yield PythonToken(AWAIT, token, spos, prefix)
                    else:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    for t in _split_illegal_unicode_name(token, spos, prefix):
                        yield t  # `yield from`, spelled out for Python 2
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)


def _split_illegal_unicode_name(
    token: str, start_pos: Tuple[int, int], prefix: str
) -> Generator[PythonToken, None, None]:
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)

    found = ""
    is_illegal = False
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if str.isidentifier(char):
                yield create_token()
                found = char
                is_illegal = False
                prefix = ""
                pos = start_pos[0], start_pos[1] + i
            else:
                found += char
        else:
            new_found = found + char
            if str.isidentifier(new_found):
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ""
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()
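
# Hand-traced example: for a token like "a€b" ("€" matches the tokenizer's
# broad Name regex but is not a valid identifier character), this yields
# NAME "a", ERRORTOKEN "€", then NAME "b", attaching the original prefix
# only to the first piece.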