Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/blib2to3/pgen2/tokenize.py: 74%


# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
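
# A minimal usage sketch of the 5-tuple shape described above, using the
# `tokenize(source)` generator defined later in this module (illustrative
# only, not part of the original file):
#
#     for tok_type, tok_str, start, end, line in tokenize("x = 1\n"):
#         print(tok_name[tok_type], repr(tok_str), start, end)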

import sys
from collections.abc import Iterator

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    FSTRING_END,
    FSTRING_MIDDLE,
    FSTRING_START,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    TSTRING_END,
    TSTRING_MIDDLE,
    TSTRING_START,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import pytokens
from pytokens import TokenType

from . import token as _token

__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

Coord = tuple[int, int]
TokenInfo = tuple[int, str, Coord, Coord, str]

TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.tstring_start: TSTRING_START,
    TokenType.tstring_middle: TSTRING_MIDDLE,
    TokenType.tstring_end: TSTRING_END,
    TokenType.endmarker: ENDMARKER,
}


class TokenError(Exception): ...


def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: pytokens.Token | None
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as a 'NL' token, while it
    is ignored as whitespace in the regular Python parser.
    But, only the first one. If there's a `\\\n` following it
    (as in, a \ just by itself on a line), that is not made into NL.
    """
    if (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    ):
        token_str = source[token.start_index : token.end_index]
        if token_str.startswith("\\\r\n"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 3,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 3,
            )
        elif token_str.startswith("\\\n") or token_str.startswith("\\\r"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 2,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 2,
            )

    return token
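

# Illustrative sketch (not part of the original module): per the docstring of
# transform_whitespace, a trailing backslash-newline should surface as an NL
# token in the stream produced by tokenize() below. The helper name is
# hypothetical and nothing calls it at import time.
def _demo_backslash_continuation() -> None:
    source = "x = 1 +\\\n    2\n"
    for tok_type, tok_str, start, end, _line in tokenize(source):
        print(tok_name[tok_type], repr(tok_str), start, end)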


def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]:
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0

    prev_token: pytokens.Token | None = None
    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield an empty newline token at the end of a
                # file if the file doesn't end with a newline.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses the `async` and `await` token types just for
                # those two keywords.
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet, so yield three DOT
                # (OP) tokens instead.
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                token_type = TOKEN_TYPE_MAP.get(token.type)
                if token_type is None:
                    raise ValueError(f"Unknown token type: {token.type!r}")
                yield (
                    token_type,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))
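

# Illustrative sketch (not part of the original module): the "..." branch in
# tokenize() above splits an ellipsis into three consecutive OP "." tokens
# rather than yielding a single ellipsis token. The helper name is
# hypothetical and nothing calls it at import time.
def _demo_ellipsis_tokens() -> None:
    for tok_type, tok_str, start, end, _line in tokenize("...\n"):
        print(tok_name[tok_type], repr(tok_str), start, end)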


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(f"{srow},{scol}-{erow},{ecol}:\t{tok_name[type]}\t{token!r}")


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        token_iterator = tokenize(open(sys.argv[1]).read())
    else:
        token_iterator = tokenize(sys.stdin.read())

    for tok in token_iterator:
        printtoken(*tok)