Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/libcst/_parser/parso/python/tokenize.py: 10% of 680 statements (coverage.py v7.3.1, created at 2023-09-25 06:43 +0000)

# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
#
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
#
# A fork of `parso.python.tokenize`.
# https://github.com/davidhalter/parso/blob/master/parso/python/tokenize.py
#
# The following changes were made:
# - Changes to be compatible with PythonTokenTypes
# - Removed main section
# - Applied type stubs directly
# - Removed Python 2 shims
# - Added support for Python 3.6 ASYNC/AWAIT hacks
#
# -*- coding: utf-8 -*-
# This tokenizer has been copied from the ``tokenize.py`` standard library
# tokenizer. The reason was simple: The standard library tokenizer fails
# if the indentation is not right. To make it possible to do error recovery the
# tokenizer needed to be rewritten.
#
# Basically this is a stripped down version of the standard library module, so
# you can read the documentation there. Additionally we included some speed and
# memory optimizations here.
# pyre-unsafe

from __future__ import absolute_import

import itertools as _itertools
import re
import sys
from codecs import BOM_UTF8
from collections import namedtuple
from dataclasses import dataclass
from typing import Dict, Generator, Iterable, Optional, Pattern, Set, Tuple

from libcst._parser.parso.python.token import PythonTokenTypes
from libcst._parser.parso.utils import PythonVersionInfo, split_lines

# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = "\U0010ffff"
BOM_UTF8_STRING = BOM_UTF8.decode("utf-8")

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ASYNC = PythonTokenTypes.ASYNC
AWAIT = PythonTokenTypes.AWAIT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


@dataclass(frozen=True)
class TokenCollection:
    pseudo_token: Pattern
    single_quoted: Set[str]
    triple_quoted: Set[str]
    endpats: Dict[str, Pattern]
    whitespace: Pattern
    fstring_pattern_map: Dict[str, str]
    always_break_tokens: Set[str]
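
# A TokenCollection bundles all of the version-dependent compiled patterns and
# token sets the tokenizer needs, so it only has to be built once per Python
# version and can then be cached below.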

_token_collection_cache: Dict[PythonVersionInfo, TokenCollection] = {}


def group(*choices: str, **kwargs: object) -> str:
    capture = kwargs.pop("capture", False)  # Python 2, arrghhhhh :(
    assert not kwargs

    start = "("
    if not capture:
        start += "?:"
    return start + "|".join(choices) + ")"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"
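
# For example, group("a", "b") builds the non-capturing alternation "(?:a|b)",
# group("a", "b", capture=True) builds "(a|b)", and maybe("x") builds
# "(?:x)?".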


# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes(
    version_info: PythonVersionInfo,
    include_fstring: bool = False,
    only_fstring: bool = False,
) -> Set[str]:
    def different_case_versions(prefix):
        for s in _itertools.product(*[(c, c.upper()) for c in prefix]):
            yield "".join(s)

    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    valid_string_prefixes = ["b", "r"]
    if version_info >= (3, 0):
        valid_string_prefixes.append("br")
    if version_info < (3, 0) or version_info >= (3, 3):
        valid_string_prefixes.append("u")

    result = {""}
    if version_info >= (3, 6) and include_fstring:
        f = ["f", "fr"]
        if only_fstring:
            valid_string_prefixes = f
            result = set()
        else:
            valid_string_prefixes += f
    elif only_fstring:
        return set()

    # if we add binary f-strings, add: ['fb', 'fbr']
    for prefix in valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            result.update(different_case_versions(t))
    if version_info <= (2, 7):
        # In Python 2 the order cannot just be random.
        result.update(different_case_versions("ur"))
        result.update(different_case_versions("br"))
    return result
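
# For example, for Python 3.6 this returns the empty string plus every case
# variant of 'b', 'r' and 'u' and every case/order variant of 'br' ('br',
# 'rb', 'bR', 'Rb', ...); with include_fstring=True the 'f' and 'fr' variants
# are included as well.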


def _compile(expr: str) -> Pattern:
    return re.compile(expr, re.UNICODE)


def _get_token_collection(version_info: PythonVersionInfo) -> TokenCollection:
    try:
        return _token_collection_cache[version_info]
    except KeyError:
        _token_collection_cache[version_info] = result = _create_token_collection(
            version_info
        )
        return result
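
# Note: repeated calls with the same PythonVersionInfo return the same cached
# TokenCollection instance; only the first call pays the regex compilation
# cost.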


fstring_raw_string = _compile(r"(?:[^{}]+|\{\{|\}\})+")

unicode_character_name = r"[A-Za-z0-9\-]+(?: [A-Za-z0-9\-]+)*"
fstring_string_single_line = _compile(
    r"(?:\{\{|\}\}|\\N\{"
    + unicode_character_name
    + r"\}|\\(?:\r\n?|\n)|\\[^\r\nN]|[^{}\r\n\\])+"
)
fstring_string_multi_line = _compile(
    r"(?:\{\{|\}\}|\\N\{" + unicode_character_name + r"\}|\\[^N]|[^{}\\])+"
)

fstring_format_spec_single_line = _compile(r"(?:\\(?:\r\n?|\n)|[^{}\r\n])+")
fstring_format_spec_multi_line = _compile(r"[^{}]+")
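
# Illustration: when tokenizing f"a{x!r:>10}b", fstring_string_single_line
# matches the literal chunk "a" (stopping at the '{'), while
# fstring_format_spec_single_line matches ">10" after the ':' that opens the
# format spec.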


def _create_token_collection(  # noqa: C901
    version_info: PythonVersionInfo,
) -> TokenCollection:
    # Note: we use unicode matching for names ("\w") but ascii matching for
    # number literals.
    Whitespace = r"[ \f\t]*"
    Comment = r"#[^\r\n]*"
    # Python 2 is pretty much not working properly anymore; we simply don't
    # bother to parse unicode names correctly there.
    if version_info.major == 2:
        Name = r"([A-Za-z_0-9]+)"
    elif sys.version_info[0] == 2:
        # Unfortunately the regex engine cannot deal with the regex below, so
        # just use this one.
        Name = r"(\w+)"
    else:
        Name = "([A-Za-z_0-9\u0080-" + MAX_UNICODE + "]+)"

    if version_info >= (3, 6):
        Hexnumber = r"0[xX](?:_?[0-9a-fA-F])+"
        Binnumber = r"0[bB](?:_?[01])+"
        Octnumber = r"0[oO](?:_?[0-7])+"
        Decnumber = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        Exponent = r"[eE][-+]?[0-9](?:_?[0-9])*"
        Pointfloat = group(
            r"[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?", r"\.[0-9](?:_?[0-9])*"
        ) + maybe(Exponent)
        Expfloat = r"[0-9](?:_?[0-9])*" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9](?:_?[0-9])*[jJ]", Floatnumber + r"[jJ]")
    else:
        Hexnumber = r"0[xX][0-9a-fA-F]+"
        Binnumber = r"0[bB][01]+"
        if version_info >= (3, 0):
            Octnumber = r"0[oO][0-7]+"
        else:
            Octnumber = "0[oO]?[0-7]+"
        Decnumber = r"(?:0+|[1-9][0-9]*)"
        Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
        if version_info.major < 3:
            Intnumber += "[lL]?"
        Exponent = r"[eE][-+]?[0-9]+"
        Pointfloat = group(r"[0-9]+\.[0-9]*", r"\.[0-9]+") + maybe(Exponent)
        Expfloat = r"[0-9]+" + Exponent
        Floatnumber = group(Pointfloat, Expfloat)
        Imagnumber = group(r"[0-9]+[jJ]", Floatnumber + r"[jJ]")
    Number = group(Imagnumber, Floatnumber, Intnumber)

    # Note that since _all_string_prefixes includes the empty string,
    # StringPrefix can be the empty string (making it optional).
    possible_prefixes = _all_string_prefixes(version_info)
    StringPrefix = group(*possible_prefixes)
    StringPrefixWithF = group(*_all_string_prefixes(version_info, include_fstring=True))
    fstring_prefixes = _all_string_prefixes(
        version_info, include_fstring=True, only_fstring=True
    )
    FStringStart = group(*fstring_prefixes)

    # Tail end of ' string.
    Single = r"(?:\\.|[^'\\])*'"
    # Tail end of " string.
    Double = r'(?:\\.|[^"\\])*"'
    # Tail end of ''' string.
    Single3 = r"(?:\\.|'(?!'')|[^'\\])*'''"
    # Tail end of """ string.
    Double3 = r'(?:\\.|"(?!"")|[^"\\])*"""'
    Triple = group(StringPrefixWithF + "'''", StringPrefixWithF + '"""')

    # Because of leftmost-then-longest match semantics, be sure to put the
    # longest operators first (e.g., if = came before ==, == would get
    # recognized as two instances of =).
    Operator = group(
        r"\*\*=?", r">>=?", r"<<=?", r"//=?", r"->", r"[+\-*/%&@`|^!=<>]=?", r"~"
    )

    Bracket = "[][(){}]"

    special_args = [r"\r\n?", r"\n", r"[;.,@]"]
    if version_info >= (3, 0):
        special_args.insert(0, r"\.\.\.")
    if version_info >= (3, 8):
        special_args.insert(0, ":=?")
    else:
        special_args.insert(0, ":")
    Special = group(*special_args)

    Funny = group(Operator, Bracket, Special)

    # First (or only) line of ' or " string.
    ContStr = group(
        StringPrefix
        + r"'[^\r\n'\\]*(?:\\.[^\r\n'\\]*)*"
        + group("'", r"\\(?:\r\n?|\n)"),
        StringPrefix
        + r'"[^\r\n"\\]*(?:\\.[^\r\n"\\]*)*'
        + group('"', r"\\(?:\r\n?|\n)"),
    )
    pseudo_extra_pool = [Comment, Triple]
    all_quotes = '"', "'", '"""', "'''"
    if fstring_prefixes:
        pseudo_extra_pool.append(FStringStart + group(*all_quotes))

    PseudoExtras = group(r"\\(?:\r\n?|\n)|\Z", *pseudo_extra_pool)
    PseudoToken = group(Whitespace, capture=True) + group(
        PseudoExtras, Number, Funny, ContStr, Name, capture=True
    )

    # For a given string prefix plus quotes, endpats maps it to a regex
    # to match the remainder of that string. _prefix can be empty, for
    # a normal single or triple quoted string (with no prefix).
    endpats = {}
    for _prefix in possible_prefixes:
        endpats[_prefix + "'"] = _compile(Single)
        endpats[_prefix + '"'] = _compile(Double)
        endpats[_prefix + "'''"] = _compile(Single3)
        endpats[_prefix + '"""'] = _compile(Double3)

    # A set of all of the single and triple quoted string prefixes,
    # including the opening quotes.
    single_quoted = set()
    triple_quoted = set()
    fstring_pattern_map = {}
    for t in possible_prefixes:
        for quote in '"', "'":
            single_quoted.add(t + quote)

        for quote in '"""', "'''":
            triple_quoted.add(t + quote)

    for t in fstring_prefixes:
        for quote in all_quotes:
            fstring_pattern_map[t + quote] = quote

    pseudo_token_compiled = _compile(PseudoToken)
    return TokenCollection(
        pseudo_token_compiled,
        single_quoted,
        triple_quoted,
        endpats,
        _compile(Whitespace),
        fstring_pattern_map,
        {
            ";",
            "import",
            "class",
            "def",
            "try",
            "except",
            "finally",
            "while",
            "with",
            "return",
        },
    )


class Token(namedtuple("Token", ["type", "string", "start_pos", "prefix"])):
    @property
    def end_pos(self):
        lines = split_lines(self.string)
        if len(lines) > 1:
            return self.start_pos[0] + len(lines) - 1, 0
        else:
            return self.start_pos[0], self.start_pos[1] + len(self.string)
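
# Example: Token(OP, "(", (1, 4), "").end_pos == (1, 5); a token whose string
# spans multiple lines reports (start_row + number_of_extra_lines, 0).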


class PythonToken(Token):
    def __repr__(self):
        return "TokenInfo(type=%s, string=%r, start_pos=%r, prefix=%r)" % self._replace(
            type=self.type.name
        )


class FStringNode:
    def __init__(self, quote, raw):
        self.quote = quote
        self.raw = raw
        self.parentheses_count = 0
        self.previous_lines = ""
        self.last_string_start_pos = None
        # In the syntax, multiple format_specs can be nested:
        # {x:{y:3}}
        self.format_spec_count = 0

    def open_parentheses(self, character):
        self.parentheses_count += 1

    def close_parentheses(self, character):
        self.parentheses_count -= 1
        if self.parentheses_count == 0:
            # No parentheses means that the format spec is also finished.
            self.format_spec_count = 0

    def allow_multiline(self):
        return len(self.quote) == 3

    def is_in_expr(self):
        return self.parentheses_count > self.format_spec_count

    def is_in_format_spec(self):
        return not self.is_in_expr() and self.format_spec_count
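
# Example: while tokenizing f"{x:{y:3}}", parentheses_count grows with each
# '{' and format_spec_count with each format-spec ':', so is_in_expr() is True
# while reading 'x' and 'y' and is_in_format_spec() is True while reading '3'.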


def _close_fstring_if_necessary(fstring_stack, string, start_pos, additional_prefix):
    for fstring_stack_index, node in enumerate(fstring_stack):
        if string.startswith(node.quote):
            token = PythonToken(
                FSTRING_END, node.quote, start_pos, prefix=additional_prefix
            )
            additional_prefix = ""
            assert not node.previous_lines
            del fstring_stack[fstring_stack_index:]
            return token, "", len(node.quote)
    return None, additional_prefix, 0
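
# The triple returned above is (FSTRING_END token or None, the remaining
# additional_prefix, number of quote characters consumed); closing a quote
# also pops that f-string and everything nested above it off the stack.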


def _find_fstring_string(endpats, fstring_stack, line, lnum, pos):
    tos = fstring_stack[-1]
    allow_multiline = tos.allow_multiline()
    if tos.is_in_format_spec():
        if allow_multiline:
            regex = fstring_format_spec_multi_line
        else:
            regex = fstring_format_spec_single_line
    else:
        if tos.raw:
            regex = fstring_raw_string
        elif allow_multiline:
            regex = fstring_string_multi_line
        else:
            regex = fstring_string_single_line

    match = regex.match(line, pos)
    if match is None:
        return tos.previous_lines, pos

    if not tos.previous_lines:
        tos.last_string_start_pos = (lnum, pos)

    string = match.group(0)
    for fstring_stack_node in fstring_stack:
        end_match = endpats[fstring_stack_node.quote].match(string)
        if end_match is not None:
            string = end_match.group(0)[: -len(fstring_stack_node.quote)]

    new_pos = pos
    new_pos += len(string)
    # even if allow_multiline is False, we still need to check for trailing
    # newlines, because a single-line f-string can contain line continuations
    if string.endswith("\n") or string.endswith("\r"):
        tos.previous_lines += string
        string = ""
    else:
        string = tos.previous_lines + string

    return string, new_pos
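
# Note: when the matched text ends in a newline it is buffered on
# tos.previous_lines and an empty string is returned, so a multiline
# FSTRING_STRING is accumulated and emitted as a single token once its last
# line is reached.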


def tokenize(
    code: str, version_info: PythonVersionInfo, start_pos: Tuple[int, int] = (1, 0)
) -> Generator[PythonToken, None, None]:
    """Generate tokens from the source code (string)."""
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info, start_pos=start_pos)
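
# Usage sketch:
#
#     for token in tokenize("async def f():\n    await g()\n",
#                           PythonVersionInfo(3, 8)):
#         print(token.type, repr(token.string), token.start_pos)
#
# This yields ASYNC, NAME, OP, ... tokens and ends with DEDENT and ENDMARKER.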


def tokenize_lines(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    token_collection = _get_token_collection(version_info)
    if version_info >= PythonVersionInfo(3, 7):
        return _tokenize_lines_py37_or_above(
            lines, version_info, token_collection, start_pos=start_pos
        )
    else:
        return _tokenize_lines_py36_or_below(
            lines, version_info, token_collection, start_pos=start_pos
        )


def _tokenize_lines_py36_or_below(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, this also yields the prefix of
    each token. This idea comes from lib2to3. The prefix contains all
    information that is irrelevant to the parser, like newlines in
    parentheses or comments.
    """

    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    # stashed and the async_* variables are used for async/await parsing
    stashed: Optional[PythonToken] = None
    async_def: bool = False
    async_def_indent: int = 0
    async_def_newline: bool = False

    def dedent_if_necessary(start):
        nonlocal stashed
        nonlocal async_def
        nonlocal async_def_indent
        nonlocal async_def_newline

        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            if stashed is not None:
                yield stashed
                stashed = None
            if async_def and async_def_newline and async_def_indent >= indents[-1]:
                # We exited an 'async def' block, so stop tracking its indents
                async_def = False
                async_def_newline = False
                async_def_indent = 0
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()
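
    # Example: with indents == [0, 4], a line starting at column 2 yields an
    # ERROR_DEDENT (2 is above 0 but below 4), while a line at column 0 pops
    # the level and yields a normal DEDENT.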

    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise Exception("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise Exception("Logic error!")
                if stashed is not None:
                    raise Exception("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        if stashed is not None:
                            raise Exception("Logic error!")
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    if stashed is not None:
                        raise Exception("Logic error!")
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    yield from dedent_if_necessary(match.end())
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        if stashed is not None:
                            yield stashed
                            stashed = None
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    yield from dedent_if_necessary(indent_start)

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                if (
                                    async_def
                                    and async_def_newline
                                    and async_def_indent >= indent
                                ):
                                    # We dedented outside of an 'async def' block.
                                    async_def = False
                                    async_def_newline = False
                                    async_def_indent = 0
                                if stashed is not None:
                                    yield stashed
                                    stashed = None
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    should_yield_identifier = True
                    if token in ("async", "await") and async_def:
                        # We're inside an 'async def' block, all async/await are
                        # tokens.
                        if token == "async":
                            yield PythonToken(ASYNC, token, spos, prefix)
                        else:
                            yield PythonToken(AWAIT, token, spos, prefix)
                        should_yield_identifier = False

                    # We are possibly starting an 'async def' section
                    elif token == "async" and not stashed:
                        stashed = PythonToken(NAME, token, spos, prefix)
                        should_yield_identifier = False

                    # We actually are starting an 'async def' section
                    elif (
                        token == "def"
                        and stashed is not None
                        and stashed[0] is NAME
                        and stashed[1] == "async"
                    ):
                        async_def = True
                        async_def_indent = indents[-1]
                        yield PythonToken(ASYNC, stashed[1], stashed[2], stashed[3])
                        stashed = None

                    # We are either not stashed, or we output an ASYNC token above.
                    elif stashed:
                        yield stashed
                        stashed = None

                    # If we didn't bail early due to possibly recognizing an
                    # 'async def', then we should yield this token as normal.
                    if should_yield_identifier:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    yield from _split_illegal_unicode_name(token, spos, prefix)
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    if async_def:
                        async_def_newline = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    if stashed is not None:
                        yield stashed
                        stashed = None
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                if stashed is not None:
                    yield stashed
                    stashed = None
                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    if stashed is not None:
        yield stashed
        stashed = None

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)
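
# Note on the Python 3.6 ASYNC/AWAIT handling above: an 'async' NAME is
# stashed until the next token is seen; if that token is 'def', the stash is
# re-emitted as an ASYNC token and async_def tracking begins, so async/await
# only become keyword-like tokens inside 'async def' blocks.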


def _tokenize_lines_py37_or_above(  # noqa: C901
    lines: Iterable[str],
    version_info: PythonVersionInfo,
    token_collection: TokenCollection,
    start_pos: Tuple[int, int] = (1, 0),
) -> Generator[PythonToken, None, None]:
    """
    A heavily modified Python standard library tokenizer.

    In addition to the default information, this also yields the prefix of
    each token. This idea comes from lib2to3. The prefix contains all
    information that is irrelevant to the parser, like newlines in
    parentheses or comments.
    """

    def dedent_if_necessary(start):
        while start < indents[-1]:
            if start > indents[-2]:
                yield PythonToken(ERROR_DEDENT, "", (lnum, 0), "")
                break
            yield PythonToken(DEDENT, "", spos, "")
            indents.pop()

    paren_level = 0  # count parentheses
    indents = [0]
    max = 0
    numchars = "0123456789"
    contstr = ""
    contline = None
    # We start with a newline. This makes indent at the first position
    # possible. It's not valid Python, but still better than an INDENT in the
    # second line (and not in the first). This makes quite a few things in
    # Jedi's fast parser possible.
    new_line = True
    prefix = ""  # Should never be required, but here for safety
    endprog = None  # Should not be required, but here for lint
    contstr_start: Optional[Tuple[int, int]] = None
    additional_prefix = ""
    first = True
    lnum = start_pos[0] - 1
    fstring_stack = []
    for line in lines:  # loop over lines in stream
        lnum += 1
        pos = 0
        max = len(line)
        if first:
            if line.startswith(BOM_UTF8_STRING):
                additional_prefix = BOM_UTF8_STRING
                line = line[1:]
                max = len(line)

            # Fake that the part before was already parsed.
            line = "^" * start_pos[1] + line
            pos = start_pos[1]
            max += start_pos[1]

            first = False

        if contstr:  # continued string
            if endprog is None:
                raise Exception("Logic error!")
            endmatch = endprog.match(line)
            if endmatch:
                pos = endmatch.end(0)
                if contstr_start is None:
                    raise Exception("Logic error!")
                yield PythonToken(STRING, contstr + line[:pos], contstr_start, prefix)
                contstr = ""
                contline = None
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        while pos < max:
            if fstring_stack:
                tos = fstring_stack[-1]
                if not tos.is_in_expr():
                    string, pos = _find_fstring_string(
                        token_collection.endpats, fstring_stack, line, lnum, pos
                    )
                    if string:
                        yield PythonToken(
                            FSTRING_STRING,
                            string,
                            tos.last_string_start_pos,
                            # Never has a prefix because it can start anywhere and
                            # include whitespace.
                            prefix="",
                        )
                        tos.previous_lines = ""
                        continue
                    if pos == max:
                        break

                rest = line[pos:]
                (
                    fstring_end_token,
                    additional_prefix,
                    quote_length,
                ) = _close_fstring_if_necessary(
                    fstring_stack, rest, (lnum, pos), additional_prefix
                )
                pos += quote_length
                if fstring_end_token is not None:
                    yield fstring_end_token
                    continue

            pseudomatch = token_collection.pseudo_token.match(line, pos)
            if not pseudomatch:  # scan for tokens
                match = token_collection.whitespace.match(line, pos)
                if pos == 0:
                    # pyre-fixme[16]: `Optional` has no attribute `end`.
                    for t in dedent_if_necessary(match.end()):
                        yield t
                pos = match.end()
                new_line = False
                yield PythonToken(
                    ERRORTOKEN,
                    line[pos],
                    (lnum, pos),
                    # pyre-fixme[16]: `Optional` has no attribute `group`.
                    additional_prefix + match.group(0),
                )
                additional_prefix = ""
                pos += 1
                continue

            prefix = additional_prefix + pseudomatch.group(1)
            additional_prefix = ""
            start, pos = pseudomatch.span(2)
            spos = (lnum, start)
            token = pseudomatch.group(2)
            if token == "":
                assert prefix
                additional_prefix = prefix
                # This means that we have a line with whitespace/comments at
                # the end, which just results in an endmarker.
                break
            initial = token[0]

            if new_line and initial not in "\r\n\\#":
                new_line = False
                if paren_level == 0 and not fstring_stack:
                    i = 0
                    indent_start = start
                    while line[i] == "\f":
                        i += 1
                        # TODO don't we need to change spos as well?
                        indent_start -= 1
                    if indent_start > indents[-1]:
                        yield PythonToken(INDENT, "", spos, "")
                        indents.append(indent_start)
                    for t in dedent_if_necessary(indent_start):
                        yield t

            if initial in numchars or (  # ordinary number
                initial == "." and token != "." and token != "..."
            ):
                yield PythonToken(NUMBER, token, spos, prefix)
            elif pseudomatch.group(3) is not None:  # ordinary name
                if token in token_collection.always_break_tokens:
                    fstring_stack[:] = []
                    paren_level = 0
                    # We only want to dedent if the token is on a new line.
                    if re.match(r"[ \f\t]*$", line[:start]):
                        while True:
                            indent = indents.pop()
                            if indent > start:
                                yield PythonToken(DEDENT, "", spos, "")
                            else:
                                indents.append(indent)
                                break
                if str.isidentifier(token):
                    # py37 doesn't need special tokens for async/await, and we could
                    # emit NAME, but then we'd need different grammar for py36 and py37.
                    if token == "async":
                        yield PythonToken(ASYNC, token, spos, prefix)
                    elif token == "await":
                        yield PythonToken(AWAIT, token, spos, prefix)
                    else:
                        yield PythonToken(NAME, token, spos, prefix)
                else:
                    for t in _split_illegal_unicode_name(token, spos, prefix):
                        yield t  # yield from Python 2
            elif initial in "\r\n":
                if any(not f.allow_multiline() for f in fstring_stack):
                    # Would use fstring_stack.clear, but that's not available
                    # in Python 2.
                    fstring_stack[:] = []

                if not new_line and paren_level == 0 and not fstring_stack:
                    yield PythonToken(NEWLINE, token, spos, prefix)
                else:
                    additional_prefix = prefix + token
                    new_line = True
            elif initial == "#":  # Comments
                assert not token.endswith("\n")
                additional_prefix = prefix + token
            elif token in token_collection.triple_quoted:
                endprog = token_collection.endpats[token]
                endmatch = endprog.match(line, pos)
                if endmatch:  # all on one line
                    pos = endmatch.end(0)
                    token = line[start:pos]
                    yield PythonToken(STRING, token, spos, prefix)
                else:
                    contstr_start = (lnum, start)  # multiple lines
                    contstr = line[start:]
                    contline = line
                    break

            # Check up to the first 3 chars of the token to see if
            # they're in the single_quoted set. If so, they start
            # a string.
            # We're using the first 3, because we're looking for
            # "rb'" (for example) at the start of the token. If
            # we switch to longer prefixes, this needs to be
            # adjusted.
            # Note that initial == token[:1].
            # Also note that single quote checking must come after
            # triple quote checking (above).
            elif (
                initial in token_collection.single_quoted
                or token[:2] in token_collection.single_quoted
                or token[:3] in token_collection.single_quoted
            ):
                if token[-1] in "\r\n":  # continued string
                    # This means that a single quoted string ends with a
                    # backslash and is continued.
                    contstr_start = lnum, start
                    endprog = (
                        token_collection.endpats.get(initial)
                        or token_collection.endpats.get(token[1])
                        or token_collection.endpats.get(token[2])
                    )
                    contstr = line[start:]
                    contline = line
                    break
                else:  # ordinary string
                    yield PythonToken(STRING, token, spos, prefix)
            elif (
                token in token_collection.fstring_pattern_map
            ):  # The start of an fstring.
                fstring_stack.append(
                    FStringNode(
                        token_collection.fstring_pattern_map[token],
                        "r" in token or "R" in token,
                    )
                )
                yield PythonToken(FSTRING_START, token, spos, prefix)
            elif initial == "\\" and line[start:] in (
                "\\\n",
                "\\\r\n",
                "\\\r",
            ):  # continued stmt
                additional_prefix += prefix + line[start:]
                break
            else:
                if token in "([{":
                    if fstring_stack:
                        fstring_stack[-1].open_parentheses(token)
                    else:
                        paren_level += 1
                elif token in ")]}":
                    if fstring_stack:
                        fstring_stack[-1].close_parentheses(token)
                    else:
                        if paren_level:
                            paren_level -= 1
                elif (
                    token == ":"
                    and fstring_stack
                    and fstring_stack[-1].parentheses_count
                    - fstring_stack[-1].format_spec_count
                    == 1
                ):
                    fstring_stack[-1].format_spec_count += 1

                yield PythonToken(OP, token, spos, prefix)

    if contstr:
        yield PythonToken(ERRORTOKEN, contstr, contstr_start, prefix)
        if contstr.endswith("\n") or contstr.endswith("\r"):
            new_line = True

    end_pos = lnum, max
    # As the last position we just take the maximally possible position. We
    # remove -1 for the last new line.
    for indent in indents[1:]:
        yield PythonToken(DEDENT, "", end_pos, "")
    yield PythonToken(ENDMARKER, "", end_pos, additional_prefix)


def _split_illegal_unicode_name(
    token: str, start_pos: Tuple[int, int], prefix: str
) -> Generator[PythonToken, None, None]:
    def create_token():
        return PythonToken(ERRORTOKEN if is_illegal else NAME, found, pos, prefix)

    found = ""
    is_illegal = False
    pos = start_pos
    for i, char in enumerate(token):
        if is_illegal:
            if str.isidentifier(char):
                yield create_token()
                found = char
                is_illegal = False
                prefix = ""
                pos = start_pos[0], start_pos[1] + i
            else:
                found += char
        else:
            new_found = found + char
            if str.isidentifier(new_found):
                found = new_found
            else:
                if found:
                    yield create_token()
                    prefix = ""
                    pos = start_pos[0], start_pos[1] + i
                found = char
                is_illegal = True

    if found:
        yield create_token()
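
# Example: _split_illegal_unicode_name("a²b", (1, 0), "") yields NAME "a",
# ERRORTOKEN "²", NAME "b", splitting identifier-legal runs from characters
# that are not valid in Python identifiers.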