Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 65%

370 statements  

coverage.py v7.2.7, created at 2023-06-07 06:15 +0000

# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

import sys
from typing import (
    Callable,
    Iterable,
    Iterator,
    List,
    Optional,
    Set,
    Text,
    Tuple,
    Pattern,
    Union,
    cast,
)

if sys.version_info >= (3, 8):
    from typing import Final
else:
    from typing_extensions import Final

from blib2to3.pgen2.token import *
from blib2to3.pgen2.grammar import Grammar

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

from . import token

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del token
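

# --- Editor's illustrative sketch (not part of the original module) ---
# The module docstring above describes the 5-tuples that generate_tokens()
# yields.  This uncalled helper shows one way to consume them; the helper
# name and the use of io.StringIO are assumptions made for the example.
def _example_iter_tokens(source: str) -> None:
    import io

    readline = io.StringIO(source).readline
    for tok_type, tok_string, start, end, logical_line in generate_tokens(readline):
        # start/end are (row, column) pairs; tok_name maps the numeric token
        # type back to its symbolic name (NAME, OP, NEWLINE, ...).
        print(start, end, tok_name[tok_type], repr(tok_string))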


def group(*choices: str) -> str:
    return "(" + "|".join(choices) + ")"


def any(*choices: str) -> str:
    return group(*choices) + "*"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"


def _combinations(*l: str) -> Set[str]:
    return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
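

# --- Editor's illustrative sketch (not part of the original module) ---
# A quick record of what the small regex combinators above build.  The helper
# is never called; the asserts only document expected outputs.
def _example_regex_helpers() -> None:
    assert group("'", '"') == "('|\")"
    assert maybe(r"\d") == r"(\d)?"
    assert any(r"\d") == r"(\d)*"
    # _combinations() pairs prefixes case-insensitively, skipping duplicates
    # that differ only in case (e.g. there is no "rR"):
    assert "rb" in _combinations("r", "b")
    assert "rR" not in _combinations("r", "R")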


Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)
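

# --- Editor's illustrative sketch (not part of the original module) ---
# The ordering note above Operator is the reason multi-character operators
# come first in the alternation: listed this way, "**=" is matched as a
# single token rather than being split into "*", "*", "=".  The locally
# compiled pattern below exists only for this uncalled demonstration.
def _example_operator_ordering() -> None:
    op = re.compile(Operator)
    match = op.match("**= rest")
    assert match is not None and match.group(0) == "**="
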

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)
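

# --- Editor's illustrative sketch (not part of the original module) ---
# The "tail end" patterns and the endprogs table above are what
# generate_tokens() uses to close a string literal that was opened on an
# earlier line.  The uncalled helper below shows the lookup for a
# triple-quoted opener; the sample text is an assumption.
def _example_endprog_lookup() -> None:
    endprog = endprogs["'''"]  # single3prog, compiled from Single3
    tail = endprog.match("rest of the string'''")
    assert tail is not None and tail.group(0).endswith("'''")
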

tabsize = 8


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


Coord = Tuple[int, int]


def printtoken(
    type: int, token: Text, srow_col: Coord, erow_col: Coord, line: Text
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )


TokenEater = Callable[[int, Text, Coord, Coord, Text], None]


def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline: Callable[[], Text], tokeneater: TokenEater) -> None:
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
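

# --- Editor's illustrative sketch (not part of the original module) ---
# tokenize() drives the callback interface described in its docstring: the
# tokeneater receives the same five fields that generate_tokens() yields.
# The collector below (an assumed name, never called here) shows a minimal
# replacement for the default printtoken.
def _example_collect_tokens(readline: Callable[[], Text]) -> List[Tuple[int, Text]]:
    collected: List[Tuple[int, Text]] = []

    def eater(type: int, token: Text, start: Coord, end: Coord, line: Text) -> None:
        collected.append((type, token))

    tokenize(readline, eater)
    return collected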


GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]


class Untokenizer:
    tokens: List[Text]
    prev_row: int
    prev_col: int

    def __init__(self) -> None:
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
        for t in iterable:
            if len(t) == 2:
                self.compat(cast(Tuple[int, str], t), iterable)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, Text, Coord, Coord, Text], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)


def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc
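

# --- Editor's illustrative sketch (not part of the original module) ---
# _get_normal_name() canonicalises the spellings that commonly appear in
# coding cookies; the uncalled asserts below record a few expected mappings.
def _example_normal_names() -> None:
    assert _get_normal_name("UTF8") == "UTF8"      # no dash/underscore: passes through
    assert _get_normal_name("utf_8") == "utf-8"
    assert _get_normal_name("Latin-1") == "iso-8859-1"
    assert _get_normal_name("cp1252") == "cp1252"  # unknown names pass through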


def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop() -> bytes:
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line: bytes) -> Optional[str]:
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
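

# --- Editor's illustrative sketch (not part of the original module) ---
# detect_encoding() consumes at most two raw lines, as its docstring says.
# This uncalled helper shows the typical pattern of feeding it the readline
# of a binary stream; io.BytesIO and the sample cookie are assumptions.
def _example_detect_encoding() -> None:
    import io

    source = b"# -*- coding: latin-1 -*-\nx = 1\n"
    encoding, consumed_lines = detect_encoding(io.BytesIO(source).readline)
    assert encoding == "iso-8859-1"
    assert consumed_lines == [b"# -*- coding: latin-1 -*-\n"]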


def untokenize(iterable: Iterable[TokenInfo]) -> Text:
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(keepends=True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
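

# --- Editor's illustrative sketch (not part of the original module) ---
# A self-contained check of the "full input" round trip promised by the
# untokenize() docstring: with complete 5-tuples, the rebuilt source matches
# the original exactly.  The sample source and helper name are assumptions;
# the helper is never called.
def _example_untokenize_roundtrip() -> None:
    import io

    source = "x = 1\ny = 2\n"
    tokens = list(generate_tokens(io.StringIO(source).readline))
    assert untokenize(tokens) == source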


def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    contstr, needcont = "", 0
    contline: Optional[str] = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
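

# --- Editor's illustrative sketch (not part of the original module) ---
# The async/await handling above is contextual when grammar.async_keywords is
# not set: "async" is stashed as a NAME and only promoted to an ASYNC token
# when it introduces an "async def" (or appears inside one).  The uncalled
# helper and sample snippets below are assumptions recording that behaviour.
def _example_async_tokens() -> None:
    import io

    def kinds(source: str) -> List[int]:
        return [tok[0] for tok in generate_tokens(io.StringIO(source).readline)]

    # Used as a plain name, "async" stays a NAME token...
    assert ASYNC not in kinds("async = 1\n")
    # ...but in "async def" the stashed NAME is emitted as ASYNC.
    assert ASYNC in kinds("async def f():\n    pass\n")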


if __name__ == "__main__":  # testing
    import sys

    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)