Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 74%

# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
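
# Illustrative sketch (assumptions: exact columns and the trailing
# NEWLINE/ENDMARKER pair depend on the installed pytokens version): with the
# string-based tokenize() defined below, the 5-tuples described above come out
# roughly like this for the input "x = 1\n":
#
#     >>> for tok in tokenize("x = 1\n"):
#     ...     print(tok)
#     (NAME, "x", (1, 0), (1, 1), "x = 1")
#     (OP, "=", (1, 2), (1, 3), "x = 1")
#     (NUMBER, "1", (1, 4), (1, 5), "x = 1")
#     (NEWLINE, "\n", (1, 5), (1, 6), "x = 1")
#     (ENDMARKER, "", (2, 0), (2, 0), "")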

import sys
from collections.abc import Iterator
from typing import Optional

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    FSTRING_END,
    FSTRING_MIDDLE,
    FSTRING_START,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    TSTRING_END,
    TSTRING_MIDDLE,
    TSTRING_START,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import pytokens
from pytokens import TokenType

from . import token as _token

__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

Coord = tuple[int, int]
TokenInfo = tuple[int, str, Coord, Coord, str]

TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.tstring_start: TSTRING_START,
    TokenType.tstring_middle: TSTRING_MIDDLE,
    TokenType.tstring_end: TSTRING_END,
    TokenType.endmarker: ENDMARKER,
}


class TokenError(Exception): ...


def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: Optional[pytokens.Token]
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as a 'NL' token, while it
    is ignored as whitespace in the regular Python parser.
    But, only the first one. If there's a `\\\n` following it
    (as in, a \ just by itself on a line), that is not made into NL.
    """
    if (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    ):
        token_str = source[token.start_index : token.end_index]
        if token_str.startswith("\\\r\n"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 3,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 3,
            )
        elif token_str.startswith("\\\n") or token_str.startswith("\\\r"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 2,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 2,
            )

    return token
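
# A minimal sketch of what the function above does (assuming `ws` is a pytokens
# whitespace token whose text starts with a backslash immediately followed by
# "\n", and `prev` is a preceding non-newline token):
#
#     nl = transform_whitespace(ws, source, prev)
#     assert nl.type == TokenType.nl
#     assert source[nl.start_index : nl.end_index] == "\\\n"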


def tokenize(source: str, grammar: Optional[Grammar] = None) -> Iterator[TokenInfo]:
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0

    prev_token: Optional[pytokens.Token] = None
    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield empty newline tokens at the end of a file
                # if there's no newline at the end of a file.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses `async` and `await` token types just for those two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet, yield 3 DOTs instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                token_type = TOKEN_TYPE_MAP.get(token.type)
                if token_type is None:
                    raise ValueError(f"Unknown token type: {token.type!r}")
                yield (
                    token_type,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))
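
# Hedged usage sketch for tokenize() (the exact exception raised for
# unterminated input comes from pytokens and is assumed here to be
# UnexpectedEOF):
#
#     try:
#         for tok_type, tok_str, start, end, source_line in tokenize("x = (1,\n"):
#             print(tok_type, tok_str, start, end)
#     except TokenError as exc:
#         message, (lineno, col) = exc.args
#
# Note that, per the inline comment above, a literal "..." is yielded as three
# consecutive OP "." tokens rather than a single ellipsis token.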


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(f"{srow},{scol}-{erow},{ecol}:\t{tok_name[type]}\t{token!r}")


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as source_file:
            token_iterator = tokenize(source_file.read())
    else:
        token_iterator = tokenize(sys.stdin.read())

    for tok in token_iterator:
        printtoken(*tok)
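
# Hedged note on the testing entry point above (invocation and path are just an
# example; this assumes blib2to3 is importable, since the relative import above
# prevents running the file directly as a script):
#
#     python -m blib2to3.pgen2.tokenize some_module.py
#
# prints one "row,col-row,col:<TAB>TOKEN_NAME<TAB>'token'" line per token; with
# no filename argument it reads the source from stdin instead.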