Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 65%
370 statements
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
import sys
from typing import (
    Callable,
    Iterable,
    Iterator,
    List,
    Optional,
    Set,
    Text,
    Tuple,
    Pattern,
    Union,
    cast,
)

if sys.version_info >= (3, 8):
    from typing import Final
else:
    from typing_extensions import Final

from blib2to3.pgen2.token import *
from blib2to3.pgen2.grammar import Grammar

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import re
from codecs import BOM_UTF8, lookup
from blib2to3.pgen2.token import *

from . import token

__all__ = [x for x in dir(token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del token


def group(*choices: str) -> str:
    return "(" + "|".join(choices) + ")"


def any(*choices: str) -> str:
    return group(*choices) + "*"


def maybe(*choices: str) -> str:
    return group(*choices) + "?"


def _combinations(*l: str) -> Set[str]:
    return set(x + y for x in l for y in l + ("",) if x.casefold() != y.casefold())
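# For illustration, the helpers above compose as follows (the values shown are
# what the calls evaluate to):
#
#     group("a", "b")          == "(a|b)"
#     maybe("a", "b")          == "(a|b)?"
#     _combinations("r", "b")  == {"r", "b", "rb", "br"}
#
# _combinations excludes pairs built from the same letter twice (e.g. "rR"),
# so the string-prefix sets constructed below never repeat a letter.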
Whitespace = r"[ \f\t]*"
Comment = r"#[^\r\n]*"
Ignore = Whitespace + any(r"\\\r?\n" + Whitespace) + maybe(Comment)
Name = (  # this is invalid but it's fine because Name comes after Number in all groups
    r"[^\s#\(\)\[\]\{\}+\-*/!@$%^&=|;:'\",\.<>/?`~\\]+"
)

Binnumber = r"0[bB]_?[01]+(?:_[01]+)*"
Hexnumber = r"0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?"
Octnumber = r"0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?"
Decnumber = group(r"[1-9]\d*(?:_\d+)*[lL]?", "0[lL]?")
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r"[eE][-+]?\d+(?:_\d+)*"
Pointfloat = group(r"\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?", r"\.\d+(?:_\d+)*") + maybe(
    Exponent
)
Expfloat = r"\d+(?:_\d+)*" + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r"\d+(?:_\d+)*[jJ]", Floatnumber + r"[jJ]")
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(
    r"\*\*=?",
    r">>=?",
    r"<<=?",
    r"<>",
    r"!=",
    r"//=?",
    r"->",
    r"[+\-*/%&@|^=<>:]=?",
    r"~",
)

Bracket = "[][(){}]"
Special = group(r"\r?\n", r"[:;.,`@]")
Funny = group(Operator, Bracket, Special)

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r"\\\r?\n"),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r"\\\r?\n"),
)
PseudoExtras = group(r"\\\r?\n", Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

pseudoprog: Final = re.compile(PseudoToken, re.UNICODE)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

_strprefixes = (
    _combinations("r", "R", "f", "F")
    | _combinations("r", "R", "b", "B")
    | {"u", "U", "ur", "uR", "Ur", "UR"}
)

endprogs: Final = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{f"{prefix}'''": single3prog for prefix in _strprefixes},
    **{f'{prefix}"""': double3prog for prefix in _strprefixes},
}

triple_quoted: Final = (
    {"'''", '"""'}
    | {f"{prefix}'''" for prefix in _strprefixes}
    | {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted: Final = (
    {"'", '"'}
    | {f"{prefix}'" for prefix in _strprefixes}
    | {f'{prefix}"' for prefix in _strprefixes}
)

tabsize = 8
class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


Coord = Tuple[int, int]


def printtoken(
    type: int, token: Text, srow_col: Coord, erow_col: Coord, line: Text
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(
        "%d,%d-%d,%d:\t%s\t%s" % (srow, scol, erow, ecol, tok_name[type], repr(token))
    )


TokenEater = Callable[[int, Text, Coord, Coord, Text], None]


def tokenize(readline: Callable[[], Text], tokeneater: TokenEater = printtoken) -> None:
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline: Callable[[], Text], tokeneater: TokenEater) -> None:
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


GoodTokenInfo = Tuple[int, Text, Coord, Coord, Text]
TokenInfo = Union[Tuple[int, str], GoodTokenInfo]
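# Example (an illustrative sketch; the helper and the callback below are
# hypothetical, not part of this module's API): driving tokenize() with a
# custom tokeneater instead of the default printtoken(), collecting
# (type, string) pairs from an in-memory source string.
def _example_collect_tokens(source: str) -> List[Tuple[int, Text]]:
    """Illustrative only: gather (token type, token string) pairs."""
    collected: List[Tuple[int, Text]] = []

    def eater(typ: int, string: Text, start: Coord, end: Coord, line: Text) -> None:
        collected.append((typ, string))

    tokenize(iter(source.splitlines(keepends=True)).__next__, eater)
    return collected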
class Untokenizer:
    tokens: List[Text]
    prev_row: int
    prev_col: int

    def __init__(self) -> None:
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start: Coord) -> None:
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable: Iterable[TokenInfo]) -> Text:
        for t in iterable:
            if len(t) == 2:
                self.compat(cast(Tuple[int, str], t), iterable)
                break
            tok_type, token, start, end, line = cast(
                Tuple[int, Text, Coord, Coord, Text], t
            )
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token: Tuple[int, Text], iterable: Iterable[TokenInfo]) -> None:
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += " "
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += " "

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
cookie_re = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.ASCII)
blank_re = re.compile(rb"^[ \t\f]*(?:[#\r\n]|$)", re.ASCII)


def _get_normal_name(orig_enc: str) -> str:
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or enc.startswith(
        ("latin-1-", "iso-8859-1-", "iso-latin-1-")
    ):
        return "iso-8859-1"
    return orig_enc
def detect_encoding(readline: Callable[[], bytes]) -> Tuple[str, List[bytes]]:
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = "utf-8"

    def read_or_stop() -> bytes:
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line: bytes) -> Optional[str]:
        try:
            line_string = line.decode("ascii")
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != "utf-8":
                # This behaviour mimics the Python interpreter
                raise SyntaxError("encoding problem: utf-8")
            encoding += "-sig"
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = "utf-8-sig"
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
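# Example (an illustrative sketch; the helper below is not used by the
# tokenizer itself, and its name and parameter are hypothetical): reading a
# source file with whatever encoding detect_encoding() reports for it.
def _example_read_with_detected_encoding(path: str) -> str:
    """Illustrative only: return the decoded text of the file at ``path``."""
    with open(path, "rb") as f:
        encoding, _lines_read = detect_encoding(f.readline)
    with open(path, encoding=encoding) as f:
        return f.read()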
def untokenize(iterable: Iterable[TokenInfo]) -> Text:
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(keepends=True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
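# Example (an illustrative sketch; the helper and its name are hypothetical):
# the "limited input" round trip described in the docstring above, written out
# as a runnable check over an in-memory source string.
def _example_roundtrip(source: str) -> bool:
    """Illustrative only: do 2-tuple tokens survive untokenize/retokenize?"""
    readline = iter(source.splitlines(keepends=True)).__next__
    t1 = [tok[:2] for tok in generate_tokens(readline)]
    newcode = untokenize(t1)
    readline = iter(newcode.splitlines(keepends=True)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    return t1 == t2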
def generate_tokens(
    readline: Callable[[], Text], grammar: Optional[Grammar] = None
) -> Iterator[GoodTokenInfo]:
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    numchars: Final[str] = "0123456789"
    contstr, needcont = "", 0
    contline: Optional[str] = None
    indents = [0]

    # If we know we're parsing 3.7+, we can unconditionally parse `async` and
    # `await` as keywords.
    async_keywords = False if grammar is None else grammar.async_keywords
    # 'stashed' and 'async_*' are used for async/await parsing
    stashed: Optional[GoodTokenInfo] = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    strstart: Tuple[int, int]
    endprog: Pattern[str]
    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ""
        lnum += 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            assert contline is not None
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (
                    STRING,
                    contstr + line[:end],
                    strstart,
                    (lnum, end),
                    contline + line,
                )
                contstr, needcont = "", 0
                contline = None
            elif needcont and line[-2:] != "\\\n" and line[-3:] != "\\\r\n":
                yield (
                    ERRORTOKEN,
                    contstr + line,
                    strstart,
                    (lnum, len(line)),
                    contline,
                )
                contstr = ""
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == " ":
                    column += 1
                elif line[pos] == "\t":
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == "\f":
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in "\r\n":  # skip blank lines
                yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue

            if line[pos] == "#":  # skip comments
                comment_token = line[pos:].rstrip("\r\n")
                nl_pos = pos + len(comment_token)
                yield (
                    COMMENT,
                    comment_token,
                    (lnum, pos),
                    (lnum, nl_pos),
                    line,
                )
                yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)

            while column < indents[-1]:  # count dedents
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line),
                    )
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, "", (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or (
                    initial == "." and token != "."
                ):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in "\r\n":
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == "#":
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (
                    initial in single_quoted
                    or token[:2] in single_quoted
                    or token[:3] in single_quoted
                ):
                    if token[-1] == "\n":  # continued string
                        strstart = (lnum, start)
                        maybe_endprog = (
                            endprogs.get(initial)
                            or endprogs.get(token[1])
                            or endprogs.get(token[2])
                        )
                        assert (
                            maybe_endprog is not None
                        ), f"endprog not found for {token}"
                        endprog = maybe_endprog
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ("async", "await"):
                        if async_keywords or async_def:
                            yield (
                                ASYNC if token == "async" else AWAIT,
                                token,
                                spos,
                                epos,
                                line,
                            )
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == "async" and not stashed:
                        stashed = tok
                        continue

                    if token in ("def", "for"):
                        if stashed and stashed[0] == NAME and stashed[1] == "async":
                            if token == "def":
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (
                                ASYNC,
                                stashed[1],
                                stashed[2],
                                stashed[3],
                                stashed[4],
                            )
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == "\\":  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in "([{":
                        parenlev += 1
                    elif initial in ")]}":
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, "", (lnum, 0), (lnum, 0), "")
    yield (ENDMARKER, "", (lnum, 0), (lnum, 0), "")
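# Example (an illustrative sketch; the helper below is hypothetical): dumping
# the tokens generate_tokens() produces for an in-memory string, one per line
# with its source coordinates.
def _example_dump_tokens(source: str) -> None:
    """Illustrative only: print each token with its start/end coordinates."""
    readline = iter(source.splitlines(keepends=True)).__next__
    for tok_type, tok_string, start, end, _line in generate_tokens(readline):
        print("%s-%s\t%s\t%r" % (start, end, tok_name[tok_type], tok_string))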
if __name__ == "__main__":  # testing
    import sys

    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)