1# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
2# All rights reserved.
3
4# mypy: allow-untyped-defs, allow-untyped-calls
5
6"""Tokenization help for Python programs.
7
8generate_tokens(readline) is a generator that breaks a stream of
9text into Python tokens. It accepts a readline-like method which is called
10repeatedly to get the next line of input (or "" for EOF). It generates
115-tuples with these members:
12
13 the token type (see token.py)
14 the token (a string)
15 the starting (row, column) indices of the token (a 2-tuple of ints)
16 the ending (row, column) indices of the token (a 2-tuple of ints)
17 the original line (string)
18
19It is designed to match the working of the Python tokenizer exactly, except
20that it produces COMMENT tokens for comments and gives type OP for all
21operators
22
23Older entry points
24 tokenize_loop(readline, tokeneater)
25 tokenize(readline, tokeneater=printtoken)
26are the same, except instead of generating tokens, tokeneater is a callback
27function to which the 5 fields described above are passed as 5 arguments,
28each time a new token is found."""
29
30import sys
31from collections.abc import Iterator
32
33from blib2to3.pgen2.grammar import Grammar
34from blib2to3.pgen2.token import (
35 ASYNC,
36 AWAIT,
37 COMMENT,
38 DEDENT,
39 ENDMARKER,
40 FSTRING_END,
41 FSTRING_MIDDLE,
42 FSTRING_START,
43 INDENT,
44 LAZY,
45 NAME,
46 NEWLINE,
47 NL,
48 NUMBER,
49 OP,
50 STRING,
51 TSTRING_END,
52 TSTRING_MIDDLE,
53 TSTRING_START,
54 tok_name,
55)
56
57__author__ = "Ka-Ping Yee <ping@lfw.org>"
58__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"
59
60import pytokens
61from pytokens import TokenType
62
63from . import token as _token
64
65__all__ = [x for x in dir(_token) if x[0] != "_"] + [
66 "tokenize",
67 "generate_tokens",
68 "untokenize",
69]
70del _token
71
# A (row, column) source position. Rows are 1-based here (see the
# `lines[token.start_line - 1]` lookup in tokenize); columns appear to be
# 0-based as emitted by pytokens — TODO confirm against pytokens docs.
Coord = tuple[int, int]
# The 5-tuple described in the module docstring:
# (token type, token string, start coord, end coord, source line).
TokenInfo = tuple[int, str, Coord, Coord, str]
# A deferred `lazy` identifier awaiting disambiguation:
# (the pytokens token, its source text, the full source line it came from).
LazyStash = tuple[pytokens.Token, str, str]

# Maps pytokens token types onto blib2to3's token numbers. Types absent here
# (e.g. whitespace) are handled specially or rejected by tokenize().
TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    # All punctuation collapses to the single OP type (see module docstring).
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.tstring_start: TSTRING_START,
    TokenType.tstring_middle: TSTRING_MIDDLE,
    TokenType.tstring_end: TSTRING_END,
    TokenType.endmarker: ENDMARKER,
}
102
103
104class TokenError(Exception): ...
105
106
def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: pytokens.Token | None
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as a 'NL' token, while it
    is ignored as whitespace in the regular Python parser.
    But, only the first one. If there's a `\\\n` following it
    (as in, a \ just by itself on a line), that is not made into NL.
    """
    # Only whitespace tokens that do NOT directly follow a line break are
    # candidates for rewriting; everything else passes through untouched.
    if token.type != TokenType.whitespace or prev_token is None:
        return token
    if prev_token.type in (TokenType.nl, TokenType.newline):
        return token

    text = source[token.start_index : token.end_index]
    # Check the 3-char Windows escape first so it isn't shadowed by "\\\r".
    for escape in ("\\\r\n", "\\\n", "\\\r"):
        if text.startswith(escape):
            width = len(escape)
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + width,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + width,
            )

    return token
144
145
def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]:
    """Yield blib2to3-style 5-tuples (type, string, start, end, line) for *source*.

    Wraps ``pytokens.tokenize`` and adapts its output:

    * trailing backslash-newlines become NL tokens (see transform_whitespace);
    * the empty NEWLINE emitted for files without a trailing newline is dropped;
    * ``async``/``await`` identifiers get their dedicated token types;
    * a ``...`` operator is split into three single-dot OP tokens;
    * a ``lazy`` identifier at statement start is stashed and re-emitted as the
      LAZY keyword only when the next significant token is ``import``/``from``;
      otherwise it is emitted as a plain NAME.

    *grammar* is accepted for interface compatibility but unused.

    Raises:
        TokenError: with args (message, (line, column)) if pytokens fails.
    """
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0

    prev_token: pytokens.Token | None = None
    lazy_stashed: LazyStash | None = None
    stmt_start = True  # True when the next significant token starts a statement

    def emit_stashed_lazy(*, as_keyword: bool) -> Iterator[TokenInfo]:
        # Flush the stashed `lazy` identifier, either as the LAZY keyword or
        # as an ordinary NAME, then clear the stash.
        nonlocal lazy_stashed
        if lazy_stashed is None:
            return

        stashed_token, stashed_str, stashed_line = lazy_stashed
        yield (
            LAZY if as_keyword else NAME,
            stashed_str,
            (stashed_token.start_line, stashed_token.start_col),
            (stashed_token.end_line, stashed_token.end_col),
            stashed_line,
        )
        lazy_stashed = None

    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield empty newline tokens at the end of a file
                # if there's no newline at the end of a file.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if lazy_stashed is not None and not (
                token.type == TokenType.identifier and token_str in ("import", "from")
            ):
                # The stashed `lazy` was not followed by import/from, so it
                # was just an ordinary name.
                yield from emit_stashed_lazy(as_keyword=False)

            if (
                token.type == TokenType.identifier
                and token_str == "lazy"
                and stmt_start
            ):
                # Defer emitting: whether `lazy` is a keyword depends on the
                # next significant token.
                lazy_stashed = (token, token_str, source_line)
                prev_token = token
                stmt_start = False
                continue

            if lazy_stashed is not None:
                # Current token is import/from: the stashed `lazy` is a keyword.
                yield from emit_stashed_lazy(as_keyword=True)

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses `async` and `await` token types just for those two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet, yield 3 DOTs instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        OP,  # TokenType.op always maps to OP
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                token_type = TOKEN_TYPE_MAP.get(token.type)
                if token_type is None:
                    raise ValueError(f"Unknown token type: {token.type!r}")
                yield (
                    token_type,  # fixed: reuse the looked-up value instead of re-indexing
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

            # Track statement boundaries for the `lazy` soft keyword: these
            # token types all mean the next token starts a new statement.
            if token.type in {
                TokenType.indent,
                TokenType.dedent,
                TokenType.newline,
                TokenType.semicolon,
                TokenType.colon,
            }:
                stmt_start = True
            elif token.type not in {TokenType.comment, TokenType.nl}:
                stmt_start = False

        # EOF with a still-stashed `lazy`: emit it as a plain NAME.
        yield from emit_stashed_lazy(as_keyword=False)

    except pytokens.UnexpectedEOF as exc:
        raise TokenError(
            "Unexpected EOF in multi-line statement", (line, column)
        ) from exc
    except pytokens.TokenizeError as exc:
        raise TokenError(
            f"Failed to parse: {type(exc).__name__}", (line, column)
        ) from exc
260
261
def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    """Print one token as 'srow,scol-erow,ecol:<TAB>NAME<TAB>repr' (debug aid)."""
    (srow, scol), (erow, ecol) = srow_col, erow_col
    print(
        "{},{}-{},{}:\t{}\t{!r}".format(srow, scol, erow, ecol, tok_name[type], token)
    )
268
269
270if __name__ == "__main__": # testing
271 if len(sys.argv) > 1:
272 token_iterator = tokenize(open(sys.argv[1]).read())
273 else:
274 token_iterator = tokenize(sys.stdin.read())
275
276 for tok in token_iterator:
277 printtoken(*tok)