Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 74%
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
import sys
from collections.abc import Iterator
from typing import Optional

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    FSTRING_END,
    FSTRING_MIDDLE,
    FSTRING_START,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    TSTRING_END,
    TSTRING_MIDDLE,
    TSTRING_START,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import pytokens
from pytokens import TokenType

from . import token as _token

__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

Coord = tuple[int, int]
TokenInfo = tuple[int, str, Coord, Coord, str]
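
# Illustrative sketch (not part of the upstream module): for the 5-tuple layout
# described in the module docstring, tokenizing the source "x = 1\n" with the
# tokenize() generator below yields TokenInfo tuples roughly of the form
#
#     (NAME,    "x",  (1, 0), (1, 1), "x = 1")
#     (OP,      "=",  (1, 2), (1, 3), "x = 1")
#     (NUMBER,  "1",  (1, 4), (1, 5), "x = 1")
#     (NEWLINE, "\n", (1, 5), (2, 0), "x = 1")
#     (ENDMARKER, "", (2, 0), (2, 0), "")
#
# Exact coordinates come from the pytokens backend; this only shows the shape
# of the tuples.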
TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.tstring_start: TSTRING_START,
    TokenType.tstring_middle: TSTRING_MIDDLE,
    TokenType.tstring_end: TSTRING_END,
    TokenType.endmarker: ENDMARKER,
}


class TokenError(Exception): ...


def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: Optional[pytokens.Token]
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as an 'NL' token, while it is
    ignored as whitespace in the regular Python parser.
    But only the first one: if another `\\\n` follows it (that is, a backslash
    by itself on a line), that one is not turned into an NL token.
    """
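    # Rough sketch of the intent (a descriptive note, not from the upstream
    # source): when pytokens reports a whitespace token whose text begins with
    # a backslash-newline ("\\\n", "\\\r\n", or "\\\r"), it is rewritten below
    # into a short NL token covering just those characters, unless the previous
    # token was already an NL/NEWLINE (the "only the first one" rule above).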
    if (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    ):
        token_str = source[token.start_index : token.end_index]
        if token_str.startswith("\\\r\n"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 3,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 3,
            )
        elif token_str.startswith("\\\n") or token_str.startswith("\\\r"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 2,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 2,
            )

    return token


def tokenize(source: str, grammar: Optional[Grammar] = None) -> Iterator[TokenInfo]:
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline

    line, column = 1, 0

    prev_token: Optional[pytokens.Token] = None
    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield an empty NEWLINE token at the end of a file
                # when the file doesn't end with a newline.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses dedicated ASYNC and AWAIT token types just for these
                # two keywords.
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet; yield three DOT
                # (OP ".") tokens instead.
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                token_type = TOKEN_TYPE_MAP.get(token.type)
                if token_type is None:
                    raise ValueError(f"Unknown token type: {token.type!r}")
                yield (
                    token_type,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))
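

# Illustrative usage (a sketch, not part of the upstream module):
#
#     for tok_type, tok_str, start, end, src_line in tokenize("async def f(): ...\n"):
#         print(tok_name[tok_type], repr(tok_str), start, end)
#
# This would emit an ASYNC token for "async", NAME tokens for "def" and "f", OP
# tokens for the punctuation, and three separate OP "." tokens for the "...",
# as handled above.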


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(f"{srow},{scol}-{erow},{ecol}:\t{tok_name[type]}\t{token!r}")


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as source_file:
            token_iterator = tokenize(source_file.read())
    else:
        token_iterator = tokenize(sys.stdin.read())

    for tok in token_iterator:
        printtoken(*tok)
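
# Example invocation (a sketch; assumes blib2to3 is importable, e.g. from an
# installed Black checkout):
#
#     python -m blib2to3.pgen2.tokenize some_file.py
#
# Each token is printed by printtoken() as "row,col-row,col:<TAB>TYPE<TAB>'text'".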