Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 76%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

92 statements  

1# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. 

2# All rights reserved. 

3 

4# mypy: allow-untyped-defs, allow-untyped-calls 

5 

6"""Tokenization help for Python programs. 

7 

8generate_tokens(readline) is a generator that breaks a stream of 

9text into Python tokens. It accepts a readline-like method which is called 

10repeatedly to get the next line of input (or "" for EOF). It generates 

115-tuples with these members: 

12 

13 the token type (see token.py) 

14 the token (a string) 

15 the starting (row, column) indices of the token (a 2-tuple of ints) 

16 the ending (row, column) indices of the token (a 2-tuple of ints) 

17 the original line (string) 

18 

19It is designed to match the working of the Python tokenizer exactly, except 

20that it produces COMMENT tokens for comments and gives type OP for all 

21operators 

22 

23Older entry points 

24 tokenize_loop(readline, tokeneater) 

25 tokenize(readline, tokeneater=printtoken) 

26are the same, except instead of generating tokens, tokeneater is a callback 

27function to which the 5 fields described above are passed as 5 arguments, 

28each time a new token is found.""" 

29 

30import sys 

31from collections.abc import Iterator 

32 

33from blib2to3.pgen2.grammar import Grammar 

34from blib2to3.pgen2.token import ( 

35 ASYNC, 

36 AWAIT, 

37 COMMENT, 

38 DEDENT, 

39 ENDMARKER, 

40 FSTRING_END, 

41 FSTRING_MIDDLE, 

42 FSTRING_START, 

43 INDENT, 

44 LAZY, 

45 NAME, 

46 NEWLINE, 

47 NL, 

48 NUMBER, 

49 OP, 

50 STRING, 

51 TSTRING_END, 

52 TSTRING_MIDDLE, 

53 TSTRING_START, 

54 tok_name, 

55) 

56 

57__author__ = "Ka-Ping Yee <ping@lfw.org>" 

58__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro" 

59 

60import pytokens 

61from pytokens import TokenType 

62 

63from . import token as _token 

64 

# Re-export every public name from the token module, plus this module's own
# entry points; the temporary alias is deleted so it doesn't leak as a
# public attribute.
__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

71 

# A (row, column) source position.
Coord = tuple[int, int]
# The 5-tuple yielded by tokenize(): (type, string, start, end, source line).
TokenInfo = tuple[int, str, Coord, Coord, str]
# A deferred `lazy` soft-keyword candidate: (token, token string, source line).
LazyStash = tuple[pytokens.Token, str, str]

# Maps pytokens token types onto blib2to3 token codes.  All punctuation
# token types collapse to the single OP type, matching the docstring's
# "gives type OP for all operators" contract.
TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.tstring_start: TSTRING_START,
    TokenType.tstring_middle: TSTRING_MIDDLE,
    TokenType.tstring_end: TSTRING_END,
    TokenType.endmarker: ENDMARKER,
}

102 

103 

class TokenError(Exception):
    """Raised when the source text cannot be tokenized."""

105 

106 

def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: pytokens.Token | None
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as a 'NL' token, while it
    is ignored as whitespace in the regular Python parser.
    But, only the first one. If there's a `\\\n` following it
    (as in, a \ just by itself on a line), that is not made into NL.
    """
    is_candidate = (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    )
    if not is_candidate:
        return token

    token_str = source[token.start_index : token.end_index]
    # Width of the backslash-continuation prefix, if any: backslash plus
    # the line ending ("\r\n" is 3 chars total, "\n" or "\r" is 2).
    if token_str.startswith("\\\r\n"):
        width = 3
    elif token_str.startswith(("\\\n", "\\\r")):
        width = 2
    else:
        return token

    # Re-emit just the continuation as an NL token; positions cover only
    # the backslash and its line ending.
    return pytokens.Token(
        TokenType.nl,
        token.start_index,
        token.start_index + width,
        token.start_line,
        token.start_col,
        token.start_line,
        token.start_col + width,
    )

144 

145 

def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]:
    """Yield blib2to3 ``TokenInfo`` 5-tuples for *source*.

    Wraps ``pytokens.tokenize`` and adapts its stream to what Black's
    parser expects: whitespace tokens are dropped, backslash line
    continuations become NL (via ``transform_whitespace``), ``async`` /
    ``await`` and the ``lazy`` soft keyword get dedicated token types,
    and an ``...`` operator is split into three DOT ops.

    *grammar* is accepted for interface compatibility and is not used here.

    Raises TokenError if the underlying tokenizer fails; its second arg
    is the (line, column) of the last token position seen.
    """
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0  # last position seen, reported in TokenError

    prev_token: pytokens.Token | None = None
    # A pending `lazy` identifier.  Whether it is the soft keyword depends
    # on the token that follows: `import`/`from` make it LAZY, anything
    # else makes it a plain NAME.
    lazy_stashed: LazyStash | None = None
    stmt_start = True  # True when the next token can begin a statement

    def emit_stashed_lazy(*, as_keyword: bool) -> Iterator[TokenInfo]:
        """Flush the stashed `lazy` token as LAZY or NAME, if one is pending."""
        nonlocal lazy_stashed
        if lazy_stashed is None:
            return

        stashed_token, stashed_str, stashed_line = lazy_stashed
        yield (
            LAZY if as_keyword else NAME,
            stashed_str,
            (stashed_token.start_line, stashed_token.start_col),
            (stashed_token.end_line, stashed_token.end_col),
            stashed_line,
        )
        lazy_stashed = None

    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield empty newline tokens at the end of a file
                # if there's no newline at the end of a file.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            # Anything other than `import`/`from` after a stashed `lazy`
            # means it was just an ordinary name.
            if lazy_stashed is not None and not (
                token.type == TokenType.identifier and token_str in ("import", "from")
            ):
                yield from emit_stashed_lazy(as_keyword=False)

            if (
                token.type == TokenType.identifier
                and token_str == "lazy"
                and stmt_start
            ):
                # Defer judgement until we see the following token.
                lazy_stashed = (token, token_str, source_line)
                prev_token = token
                stmt_start = False
                continue

            if lazy_stashed is not None:
                # Followed by `import`/`from`: it really is the soft keyword.
                yield from emit_stashed_lazy(as_keyword=True)

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses `async` and `await` token types just for those two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet, yield 3 DOTs instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                token_type = TOKEN_TYPE_MAP.get(token.type)
                if token_type is None:
                    raise ValueError(f"Unknown token type: {token.type!r}")
                yield (
                    # Reuse the lookup above instead of re-indexing the map.
                    token_type,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

            # Track statement boundaries so `lazy` is only considered a
            # soft-keyword candidate at the start of a statement; comments
            # and NL tokens don't affect the boundary state.
            if token.type in {
                TokenType.indent,
                TokenType.dedent,
                TokenType.newline,
                TokenType.semicolon,
                TokenType.colon,
            }:
                stmt_start = True
            elif token.type not in {TokenType.comment, TokenType.nl}:
                stmt_start = False

        # EOF with a still-pending `lazy`: it was a plain name.
        yield from emit_stashed_lazy(as_keyword=False)

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))

260 

261 

def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    """Debug helper: print a token as 'srow,scol-erow,ecol:\\tTYPE\\trepr'."""
    start = f"{srow_col[0]},{srow_col[1]}"
    end = f"{erow_col[0]},{erow_col[1]}"
    print(f"{start}-{end}:\t{tok_name[type]}\t{token!r}")

268 

269 

if __name__ == "__main__":  # testing
    # Tokenize a file given on the command line, or stdin otherwise.
    if len(sys.argv) > 1:
        # Use a context manager so the file handle is closed promptly
        # (the original left it to the garbage collector).
        with open(sys.argv[1]) as source_file:
            source_text = source_file.read()
    else:
        source_text = sys.stdin.read()

    for tok in tokenize(source_text):
        printtoken(*tok)