Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 75%
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
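
# For example (an illustrative sketch; exact coordinates come from the
# tokenizer): feeding the single line "x = 1\n" through the tokenizer defined
# below yields 5-tuples along the lines of
#
#     (NAME,   "x", (1, 0), (1, 1), "x = 1")
#     (OP,     "=", (1, 2), (1, 3), "x = 1")
#     (NUMBER, "1", (1, 4), (1, 5), "x = 1")
#
# followed by NEWLINE and ENDMARKER tokens. Rows are 1-based and columns are
# 0-based.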

import sys
from collections.abc import Iterator
from typing import Optional

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    ERRORTOKEN,
    FSTRING_END,
    FSTRING_MIDDLE,
    FSTRING_START,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import pytokens
from pytokens import TokenType

from . import token as _token

__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

Coord = tuple[int, int]
TokenInfo = tuple[int, str, Coord, Coord, str]

TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.endmarker: ENDMARKER,
}
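
# In other words, every delimiter (parentheses, brackets, braces, colons,
# semicolons) and every general operator collapses to the single OP type here,
# matching the docstring above, which says the tokenizer "gives type OP for all
# operators". For instance, TokenType.lparen for "(" comes out as an OP token
# rather than a dedicated LPAR token.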


class TokenError(Exception): ...


def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: Optional[pytokens.Token]
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as an 'NL' token, while it
    is ignored as whitespace in the regular Python parser.
    But only the first one: if there's another `\\\n` following it
    (as in, a \ just by itself on a line), that one is not made into NL.
    """
    if (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    ):
        token_str = source[token.start_index : token.end_index]
        if token_str.startswith("\\\r\n"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 3,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 3,
            )
        elif token_str.startswith("\\\n") or token_str.startswith("\\\r"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 2,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 2,
            )

    return token
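
# Sketch of the effect (assuming pytokens emits the trailing backslash and the
# line break as part of a whitespace token): in
#
#     x = 1 + \
#         2
#
# the whitespace run that begins with "\\\n" after the "+" operator is rewritten
# into an NL token covering just the backslash and the line break, so downstream
# code sees an explicit NL there instead of plain whitespace.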


def tokenize(source: str, grammar: Optional[Grammar] = None) -> Iterator[TokenInfo]:
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0

    prev_token: Optional[pytokens.Token] = None
    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield an empty NEWLINE token at the end of a
                # file that doesn't end in a newline.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses the `async` and `await` token types just for those
                # two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet; yield 3 single-dot
                # OP tokens instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                yield (
                    TOKEN_TYPE_MAP[token.type],
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))
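
# Example of the "..." special case above (a sketch; exact coordinates come
# from pytokens): tokenizing "x = ...\n" emits the ellipsis as three
# consecutive one-character OP tokens,
#
#     (OP, ".", (1, 4), (1, 5), "x = ...")
#     (OP, ".", (1, 5), (1, 6), "x = ...")
#     (OP, ".", (1, 6), (1, 7), "x = ...")
#
# and the identifiers "async"/"await" come out with the dedicated ASYNC/AWAIT
# token types rather than NAME.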


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(f"{srow},{scol}-{erow},{ecol}:\t{tok_name[type]}\t{token!r}")
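
# With the ellipsis example above, printtoken would emit lines such as (tabs
# shown as spaces):
#
#     1,4-1,5:    OP    '.'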


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        token_iterator = tokenize(open(sys.argv[1]).read())
    else:
        token_iterator = tokenize(sys.stdin.read())

    for tok in token_iterator:
        printtoken(*tok)
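
# Example invocation (assuming blib2to3 is importable, e.g. from a Black
# checkout or installation):
#
#     python -m blib2to3.pgen2.tokenize some_file.py
#     echo "x = 1" | python -m blib2to3.pgen2.tokenize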