Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 65%
370 statements
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
import sys
from typing import (
    Callable,
    Iterable,
    Iterator,
    List,
    Optional,
    Set,
    Text,
    Tuple,
    Pattern,
    Union,
    cast,
)

if sys.version_info >= (3, 8):
    from typing import Final
else:
    from typing_extensions import Final

from blib2to3.pgen2.token import *
from blib2to3.pgen2.grammar import Grammar

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

from . import token

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del token


def group(*choices: str) -> str:
    return "(" + "|".join(choices) + ")"


def any(*choices: str) -> str:
    return group(*choices) + "*"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"


def _combinations(*l: str) -> Set[str]:
    return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
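# For illustration, the helpers above compose as follows (the values shown are
# what the calls evaluate to):
#
#     group("a", "b")          == "(a|b)"
#     maybe("a", "b")          == "(a|b)?"
#     _combinations("r", "b")  == {"r", "b", "rb", "br"}
#
# _combinations excludes pairs built from the same letter twice (e.g. "rR"),
# so the string-prefix sets constructed below never repeat a letter.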
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

tabsize = 8
class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


Coord = Tuple[int, int]


def printtoken(
    type: int, token: Text, srow_col: Coord, erow_col: Coord, line: Text
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )


TokenEater = Callable[[int, Text, Coord, Coord, Text], None]


def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline: Callable[[], Text], tokeneater: TokenEater) -> None:
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
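# Example (an illustrative sketch; the helper and the callback below are
# hypothetical, not part of this module's API): driving tokenize() with a
# custom tokeneater instead of the default printtoken(), collecting
# (type, string) pairs from an in-memory source string.
def _example_collect_tokens(source: str) -> List[Tuple[int, Text]]:
    """Illustrative only: gather (token type, token string) pairs."""
    collected: List[Tuple[int, Text]] = []

    def eater(typ: int, string: Text, start: Coord, end: Coord, line: Text) -> None:
        collected.append((typ, string))

    tokenize(iter(source.splitlines(keepends=True)).__next__, eater)
    return collected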
class Untokenizer:
    tokens: List[Text]
    prev_row: int
    prev_col: int

    def __init__(self) -> None:
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
        for t in iterable:
            if len(t) == 2:
                self.compat(cast(Tuple[int, str], t), iterable)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, Text, Coord, Coord, Text], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)


def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc
def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop() -> bytes:
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line: bytes) -> Optional[str]:
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
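# Example (an illustrative sketch; the helper below is not used by the
# tokenizer itself, and its name and parameter are hypothetical): reading a
# source file with whatever encoding detect_encoding() reports for it.
def _example_read_with_detected_encoding(path: str) -> str:
    """Illustrative only: return the decoded text of the file at ``path``."""
    with open(path, "rb") as f:
        encoding, _lines_read = detect_encoding(f.readline)
    with open(path, encoding=encoding) as f:
        return f.read()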
def untokenize(iterable: Iterable[TokenInfo]) -> Text:
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(keepends=True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
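# Example (an illustrative sketch; the helper and its name are hypothetical):
# the "limited input" round trip described in the docstring above, written out
# as a runnable check over an in-memory source string.
def _example_roundtrip(source: str) -> bool:
    """Illustrative only: do 2-tuple tokens survive untokenize/retokenize?"""
    readline = iter(source.splitlines(keepends=True)).__next__
    t1 = [tok[:2] for tok in generate_tokens(readline)]
    newcode = untokenize(t1)
    readline = iter(newcode.splitlines(keepends=True)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    return t1 == t2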
def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    contstr, needcont = "", 0
    contline: Optional[str] = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]
    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
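# Example (an illustrative sketch; the helper below is hypothetical): dumping
# the tokens generate_tokens() produces for an in-memory string, one per line
# with its source coordinates.
def _example_dump_tokens(source: str) -> None:
    """Illustrative only: print each token with its start/end coordinates."""
    readline = iter(source.splitlines(keepends=True)).__next__
    for tok_type, tok_string, start, end, _line in generate_tokens(readline):
        print("%s-%s\t%s\t%r" % (start, end, tok_name[tok_type], tok_string))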
if __name__ == "__main__":  # testing
    import sys

    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)