Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 76%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

92 statements  

1# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. 

2# All rights reserved. 

3 

4# mypy: allow-untyped-defs, allow-untyped-calls 

5 

6"""Tokenization help for Python programs. 

7 

8generate_tokens(readline) is a generator that breaks a stream of 

9text into Python tokens. It accepts a readline-like method which is called 

10repeatedly to get the next line of input (or "" for EOF). It generates 

115-tuples with these members: 

12 

13 the token type (see token.py) 

14 the token (a string) 

15 the starting (row, column) indices of the token (a 2-tuple of ints) 

16 the ending (row, column) indices of the token (a 2-tuple of ints) 

17 the original line (string) 

18 

19It is designed to match the working of the Python tokenizer exactly, except 

20that it produces COMMENT tokens for comments and gives type OP for all 

21operators 

22 

23Older entry points 

24 tokenize_loop(readline, tokeneater) 

25 tokenize(readline, tokeneater=printtoken) 

26are the same, except instead of generating tokens, tokeneater is a callback 

27function to which the 5 fields described above are passed as 5 arguments, 

28each time a new token is found.""" 

29 

30import sys 

31from collections.abc import Iterator 

32 

33from blib2to3.pgen2.grammar import Grammar 

34from blib2to3.pgen2.token import ( 

35 ASYNC, 

36 AWAIT, 

37 COMMENT, 

38 DEDENT, 

39 ENDMARKER, 

40 FSTRING_END, 

41 FSTRING_MIDDLE, 

42 FSTRING_START, 

43 INDENT, 

44 LAZY, 

45 NAME, 

46 NEWLINE, 

47 NL, 

48 NUMBER, 

49 OP, 

50 STRING, 

51 TSTRING_END, 

52 TSTRING_MIDDLE, 

53 TSTRING_START, 

54 tok_name, 

55) 

56 

57__author__ = "Ka-Ping Yee <ping@lfw.org>" 

58__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro" 

59 

60import pytokens 

61from pytokens import TokenType 

62 

63from . import token as _token 

64 

# Re-export every public name from the token module, plus this module's own
# entry points; the temporary alias is deleted so it doesn't leak as a
# public attribute.
__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

71 

# A (row, column) source position.
Coord = tuple[int, int]
# The 5-tuple yielded by tokenize(): (type, string, start, end, source line).
TokenInfo = tuple[int, str, Coord, Coord, str]
# A deferred `lazy` soft-keyword candidate: (token, token string, source line).
LazyStash = tuple[pytokens.Token, str, str]

# Maps pytokens token types onto blib2to3 token codes.  All punctuation
# token types collapse to the single OP type, matching the docstring's
# "gives type OP for all operators" contract.
TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.tstring_start: TSTRING_START,
    TokenType.tstring_middle: TSTRING_MIDDLE,
    TokenType.tstring_end: TSTRING_END,
    TokenType.endmarker: ENDMARKER,
}

102 

103 

class TokenError(Exception):
    """Raised when the source text cannot be tokenized."""

105 

106 

def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: pytokens.Token | None
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as a 'NL' token, while it
    is ignored as whitespace in the regular Python parser.
    But, only the first one. If there's a `\\\n` following it
    (as in, a \ just by itself on a line), that is not made into NL.
    """
    is_candidate = (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    )
    if not is_candidate:
        return token

    token_str = source[token.start_index : token.end_index]
    # Width of the backslash-continuation prefix, if any: backslash plus
    # the line ending ("\r\n" is 3 chars total, "\n" or "\r" is 2).
    if token_str.startswith("\\\r\n"):
        width = 3
    elif token_str.startswith(("\\\n", "\\\r")):
        width = 2
    else:
        return token

    # Re-emit just the continuation as an NL token; positions cover only
    # the backslash and its line ending.
    return pytokens.Token(
        TokenType.nl,
        token.start_index,
        token.start_index + width,
        token.start_line,
        token.start_col,
        token.start_line,
        token.start_col + width,
    )

144 

145 

def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]:
    """Yield blib2to3 ``TokenInfo`` 5-tuples for *source*.

    Wraps ``pytokens.tokenize`` and adapts its stream to what Black's
    parser expects: whitespace tokens are dropped, backslash line
    continuations become NL (via ``transform_whitespace``), ``async`` /
    ``await`` and the ``lazy`` soft keyword get dedicated token types,
    and an ``...`` operator is split into three DOT ops.

    *grammar* is accepted for interface compatibility and is not used here.

    Raises TokenError if the underlying tokenizer fails; its second arg
    is the (line, column) of the last token position seen.
    """
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0  # last position seen, reported in TokenError

    prev_token: pytokens.Token | None = None
    # A pending `lazy` identifier.  Whether it is the soft keyword depends
    # on the token that follows: `import`/`from` make it LAZY, anything
    # else makes it a plain NAME.
    lazy_stashed: LazyStash | None = None
    stmt_start = True  # True when the next token can begin a statement

    def emit_stashed_lazy(*, as_keyword: bool) -> Iterator[TokenInfo]:
        """Flush the stashed `lazy` token as LAZY or NAME, if one is pending."""
        nonlocal lazy_stashed
        if lazy_stashed is None:
            return

        stashed_token, stashed_str, stashed_line = lazy_stashed
        yield (
            LAZY if as_keyword else NAME,
            stashed_str,
            (stashed_token.start_line, stashed_token.start_col),
            (stashed_token.end_line, stashed_token.end_col),
            stashed_line,
        )
        lazy_stashed = None

    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield empty newline tokens at the end of a file
                # if there's no newline at the end of a file.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            # Anything other than `import`/`from` after a stashed `lazy`
            # means it was just an ordinary name.
            if lazy_stashed is not None and not (
                token.type == TokenType.identifier and token_str in ("import", "from")
            ):
                yield from emit_stashed_lazy(as_keyword=False)

            if (
                token.type == TokenType.identifier
                and token_str == "lazy"
                and stmt_start
            ):
                # Defer judgement until we see the following token.
                lazy_stashed = (token, token_str, source_line)
                prev_token = token
                stmt_start = False
                continue

            if lazy_stashed is not None:
                # Followed by `import`/`from`: it really is the soft keyword.
                yield from emit_stashed_lazy(as_keyword=True)

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses `async` and `await` token types just for those two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet, yield 3 DOTs instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                token_type = TOKEN_TYPE_MAP.get(token.type)
                if token_type is None:
                    raise ValueError(f"Unknown token type: {token.type!r}")
                yield (
                    # Reuse the lookup above instead of re-indexing the map.
                    token_type,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

            # Track statement boundaries so `lazy` is only considered a
            # soft-keyword candidate at the start of a statement; comments
            # and NL tokens don't affect the boundary state.
            if token.type in {
                TokenType.indent,
                TokenType.dedent,
                TokenType.newline,
                TokenType.semicolon,
                TokenType.colon,
            }:
                stmt_start = True
            elif token.type not in {TokenType.comment, TokenType.nl}:
                stmt_start = False

        # EOF with a still-pending `lazy`: it was a plain name.
        yield from emit_stashed_lazy(as_keyword=False)

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))

260 

261 

def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    """Debug helper: print a token as 'srow,scol-erow,ecol:\\tTYPE\\trepr'."""
    start = f"{srow_col[0]},{srow_col[1]}"
    end = f"{erow_col[0]},{erow_col[1]}"
    print(f"{start}-{end}:\t{tok_name[type]}\t{token!r}")

268 

269 

if __name__ == "__main__":  # testing
    # Tokenize a file given on the command line, or stdin otherwise.
    if len(sys.argv) > 1:
        # Use a context manager so the file handle is closed promptly
        # (the original left it to the garbage collector).
        with open(sys.argv[1]) as source_file:
            source_text = source_file.read()
    else:
        source_text = sys.stdin.read()

    for tok in tokenize(source_text):
        printtoken(*tok)