Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 74%

# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
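
# Illustrative sketch (assumptions: exact columns and the trailing
# NEWLINE/ENDMARKER pair depend on the installed pytokens version): with the
# string-based tokenize() defined below, the 5-tuples described above come out
# roughly like this for the input "x = 1\n":
#
#     >>> for tok in tokenize("x = 1\n"):
#     ...     print(tok)
#     (NAME, "x", (1, 0), (1, 1), "x = 1")
#     (OP, "=", (1, 2), (1, 3), "x = 1")
#     (NUMBER, "1", (1, 4), (1, 5), "x = 1")
#     (NEWLINE, "\n", (1, 5), (1, 6), "x = 1")
#     (ENDMARKER, "", (2, 0), (2, 0), "")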

import sys
from collections.abc import Iterator
from typing import Optional

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    FSTRING_END,
    FSTRING_MIDDLE,
    FSTRING_START,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    TSTRING_END,
    TSTRING_MIDDLE,
    TSTRING_START,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import pytokens
from pytokens import TokenType

from . import token as _token

__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

Coord = tuple[int, int]
TokenInfo = tuple[int, str, Coord, Coord, str]

TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.tstring_start: TSTRING_START,
    TokenType.tstring_middle: TSTRING_MIDDLE,
    TokenType.tstring_end: TSTRING_END,
    TokenType.endmarker: ENDMARKER,
}


class TokenError(Exception): ...


def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: Optional[pytokens.Token]
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as a 'NL' token, while it
    is ignored as whitespace in the regular Python parser.
    But, only the first one. If there's a `\\\n` following it
    (as in, a \ just by itself on a line), that is not made into NL.
    """
    if (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    ):
        token_str = source[token.start_index : token.end_index]
        if token_str.startswith("\\\r\n"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 3,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 3,
            )
        elif token_str.startswith("\\\n") or token_str.startswith("\\\r"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 2,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 2,
            )

    return token
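
# A minimal sketch of what the function above does (assuming `ws` is a pytokens
# whitespace token whose text starts with a backslash immediately followed by
# "\n", and `prev` is a preceding non-newline token):
#
#     nl = transform_whitespace(ws, source, prev)
#     assert nl.type == TokenType.nl
#     assert source[nl.start_index : nl.end_index] == "\\\n"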


def tokenize(source: str, grammar: Optional[Grammar] = None) -> Iterator[TokenInfo]:
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0

    prev_token: Optional[pytokens.Token] = None
    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield empty newline tokens at the end of a file
                # if there's no newline at the end of a file.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses `async` and `await` token types just for those two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet, yield 3 DOTs instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                token_type = TOKEN_TYPE_MAP.get(token.type)
                if token_type is None:
                    raise ValueError(f"Unknown token type: {token.type!r}")
                yield (
                    token_type,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))
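
# Hedged usage sketch for tokenize() (the exact exception raised for
# unterminated input comes from pytokens and is assumed here to be
# UnexpectedEOF):
#
#     try:
#         for tok_type, tok_str, start, end, source_line in tokenize("x = (1,\n"):
#             print(tok_type, tok_str, start, end)
#     except TokenError as exc:
#         message, (lineno, col) = exc.args
#
# Note that, per the inline comment above, a literal "..." is yielded as three
# consecutive OP "." tokens rather than a single ellipsis token.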


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(f"{srow},{scol}-{erow},{ecol}:\t{tok_name[type]}\t{token!r}")


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as source_file:
            token_iterator = tokenize(source_file.read())
    else:
        token_iterator = tokenize(sys.stdin.read())

    for tok in token_iterator:
        printtoken(*tok)
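
# Hedged note on the testing entry point above (invocation and path are just an
# example; this assumes blib2to3 is importable, since the relative import above
# prevents running the file directly as a script):
#
#     python -m blib2to3.pgen2.tokenize some_module.py
#
# prints one "row,col-row,col:<TAB>TOKEN_NAME<TAB>'token'" line per token; with
# no filename argument it reads the source from stdin instead.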