Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 74%
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls
6"""Tokenization help for Python programs.
8generate_tokens(readline) is a generator that breaks a stream of
9text into Python tokens. It accepts a readline-like method which is called
10repeatedly to get the next line of input (or "" for EOF). It generates
115-tuples with these members:
13 the token type (see token.py)
14 the token (a string)
15 the starting (row, column) indices of the token (a 2-tuple of ints)
16 the ending (row, column) indices of the token (a 2-tuple of ints)
17 the original line (string)
19It is designed to match the working of the Python tokenizer exactly, except
20that it produces COMMENT tokens for comments and gives type OP for all
21operators
23Older entry points
24 tokenize_loop(readline, tokeneater)
25 tokenize(readline, tokeneater=printtoken)
26are the same, except instead of generating tokens, tokeneater is a callback
27function to which the 5 fields described above are passed as 5 arguments,
28each time a new token is found."""

import sys
from collections.abc import Iterator

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    FSTRING_END,
    FSTRING_MIDDLE,
    FSTRING_START,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    TSTRING_END,
    TSTRING_MIDDLE,
    TSTRING_START,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import pytokens
from pytokens import TokenType

from . import token as _token

__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

Coord = tuple[int, int]
TokenInfo = tuple[int, str, Coord, Coord, str]
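
# Illustrative only: for the source "x = 1", the TokenInfo produced for the
# name `x` would look roughly like
#     (NAME, "x", (1, 0), (1, 1), "x = 1")
# i.e. (token type, token string, start (row, col), end (row, col), source line).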

TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.tstring_start: TSTRING_START,
    TokenType.tstring_middle: TSTRING_MIDDLE,
    TokenType.tstring_end: TSTRING_END,
    TokenType.endmarker: ENDMARKER,
}


class TokenError(Exception): ...


def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: pytokens.Token | None
) -> pytokens.Token:
108 r"""
109 Black treats `\\\n` at the end of a line as a 'NL' token, while it
110 is ignored as whitespace in the regular Python parser.
111 But, only the first one. If there's a `\\\n` following it
112 (as in, a \ just by itself on a line), that is not made into NL.
113 """
    if (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    ):
        token_str = source[token.start_index : token.end_index]
        if token_str.startswith("\\\r\n"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 3,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 3,
            )
        elif token_str.startswith("\\\n") or token_str.startswith("\\\r"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 2,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 2,
            )

    return token
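
# Illustrative sketch of transform_whitespace (the token values are made up,
# not taken from a real pytokens run): if pytokens reports a whitespace token
# whose text starts with "\\\n" and the previous token is not NL/NEWLINE, the
# token is rewritten into a two-character NL token at the same position:
#
#     ws = pytokens.Token(TokenType.whitespace, 8, 10, 1, 8, 2, 0)
#     transform_whitespace(ws, "x = 1 + \\\n    2\n", prev_token=plus_token)
#     # -> Token(TokenType.nl, 8, 10, 1, 8, 1, 10)
#
# where `plus_token` is a stand-in for whatever token preceded the whitespace.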


def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]:
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0

    prev_token: pytokens.Token | None = None
    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield empty newline tokens at the end of a file
                # if there's no newline at the end of a file.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses `async` and `await` token types just for those two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet, yield 3 DOTs instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                token_type = TOKEN_TYPE_MAP.get(token.type)
                if token_type is None:
                    raise ValueError(f"Unknown token type: {token.type!r}")
                yield (
                    token_type,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))
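
# Minimal usage sketch (illustrative; assumes blib2to3 and pytokens are
# importable):
#
#     for tok_type, tok_str, start, end, line in tokenize("x = 1\n"):
#         print(tok_name[tok_type], repr(tok_str), start, end)
#
# which prints one entry per token, roughly: NAME 'x' (1, 0) (1, 1), followed
# by OP, NUMBER, NEWLINE and ENDMARKER entries.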


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(f"{srow},{scol}-{erow},{ecol}:\t{tok_name[type]}\t{token!r}")


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as source_file:
            token_iterator = tokenize(source_file.read())
    else:
        token_iterator = tokenize(sys.stdin.read())

    for tok in token_iterator:
        printtoken(*tok)