# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
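
# Note: the `tokenize` defined below takes the full source text as a string
# (wrapping pytokens), not a readline callable as the historical docstring
# above describes.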

import sys
from collections.abc import Iterator
from typing import Optional

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    FSTRING_END,
    FSTRING_MIDDLE,
    FSTRING_START,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    TSTRING_END,
    TSTRING_MIDDLE,
    TSTRING_START,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import pytokens
from pytokens import TokenType

from . import token as _token

__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

Coord = tuple[int, int]
TokenInfo = tuple[int, str, Coord, Coord, str]

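# Map pytokens token types onto blib2to3 token ids. pytokens distinguishes
# individual punctuation tokens (semicolon, parens, brackets, braces, colon);
# blib2to3 folds them all into the generic OP type.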
TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.tstring_start: TSTRING_START,
    TokenType.tstring_middle: TSTRING_MIDDLE,
    TokenType.tstring_end: TSTRING_END,
    TokenType.endmarker: ENDMARKER,
}


class TokenError(Exception): ...


def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: Optional[pytokens.Token]
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as a 'NL' token, while it
    is ignored as whitespace in the regular Python parser.
    But only the first one: if there's a `\\\n` following it
    (as in, a \ just by itself on a line), that is not made into NL.
    """
    if (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    ):
        token_str = source[token.start_index : token.end_index]
        if token_str.startswith("\\\r\n"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 3,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 3,
            )
        elif token_str.startswith("\\\n") or token_str.startswith("\\\r"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 2,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 2,
            )

    return token


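# A rough sketch of the output (illustrative only, not a doctest): tokenizing
# "x = 1\n" yields 5-tuples roughly like
#   (NAME, "x", (1, 0), (1, 1), "x = 1"),
#   (OP, "=", (1, 2), (1, 3), "x = 1"),
#   (NUMBER, "1", (1, 4), (1, 5), "x = 1"),
#   (NEWLINE, "\n", (1, 5), (1, 6), "x = 1"),
#   (ENDMARKER, "", (2, 0), (2, 0), ""),
# where each tuple is (type, string, (start_row, start_col), (end_row, end_col), line).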
def tokenize(source: str, grammar: Optional[Grammar] = None) -> Iterator[TokenInfo]:
    """Generate 5-tuples (type, string, start, end, line) for *source*.

    Wraps pytokens.tokenize() and adapts its tokens to the blib2to3 token
    types described in the module docstring.
    """
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0
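    # `line` and `column` track the start of the most recently seen token so
    # the TokenError raised in the except clauses below can report an
    # approximate location.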

    prev_token: Optional[pytokens.Token] = None
    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield an empty NEWLINE token when the file
                # doesn't end in a newline.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses `async` and `await` token types just for those two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet; yield 3 DOTs instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                token_type = TOKEN_TYPE_MAP.get(token.type)
                if token_type is None:
                    raise ValueError(f"Unknown token type: {token.type!r}")
                yield (
                    token_type,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(f"{srow},{scol}-{erow},{ecol}:\t{tok_name[type]}\t{token!r}")


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        token_iterator = tokenize(open(sys.argv[1]).read())
    else:
        token_iterator = tokenize(sys.stdin.read())

    for tok in token_iterator:
        printtoken(*tok)