Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/blib2to3/pgen2/tokenize.py: 75%

67 statements  

# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
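
# Illustrative example (not part of the original module; coordinates assumed
# from the 5-tuple layout described above): for the source "x = 1\n", the
# tokenize() generator defined below yields tuples such as
#
#     (NAME,   "x", (1, 0), (1, 1), "x = 1")
#     (OP,     "=", (1, 2), (1, 3), "x = 1")
#     (NUMBER, "1", (1, 4), (1, 5), "x = 1")
#
# followed by NEWLINE and ENDMARKER tokens; rows are 1-based, columns 0-based.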

import sys
from collections.abc import Iterator
from typing import Optional

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    ERRORTOKEN,
    FSTRING_END,
    FSTRING_MIDDLE,
    FSTRING_START,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import pytokens
from pytokens import TokenType

from . import token as _token

__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

Coord = tuple[int, int]
TokenInfo = tuple[int, str, Coord, Coord, str]

TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.endmarker: ENDMARKER,
}


class TokenError(Exception): ...


def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: Optional[pytokens.Token]
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as a 'NL' token, while it
    is ignored as whitespace in the regular Python parser.
    But, only the first one. If there's a `\\\n` following it
    (as in, a \ just by itself on a line), that is not made into NL.
    """
    if (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    ):
        token_str = source[token.start_index : token.end_index]
        if token_str.startswith("\\\r\n"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 3,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 3,
            )
        elif token_str.startswith("\\\n") or token_str.startswith("\\\r"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 2,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 2,
            )

    return token
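
# Illustrative example (hypothetical token values, not part of the original
# module; field order follows the pytokens.Token calls above):
#
#     src = "x = 1 \\\n+ 2\n"   # a backslash line continuation
#     ws = pytokens.Token(TokenType.whitespace, 6, 8, 1, 6, 2, 0)  # covers "\\\n"
#     num = ...                  # the NUMBER token produced for "1"
#     transform_whitespace(ws, src, num).type  # -> TokenType.nl
#
# A second backslash continuation right after that NL (a lone "\" on its own
# line) keeps its whitespace type, because its prev_token is already an NL.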


def tokenize(source: str, grammar: Optional[Grammar] = None) -> Iterator[TokenInfo]:
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0

    prev_token: Optional[pytokens.Token] = None
    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield empty newline tokens at the end of a file
                # if there's no newline at the end of a file.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses `async` and `await` token types just for those two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet, yield 3 DOTs instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                yield (
                    TOKEN_TYPE_MAP[token.type],
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))
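
# Illustrative example (not part of the original module): "..." comes back as
# three one-character OP tokens, so
#
#     for tok in tokenize("x = ...\n"):
#         printtoken(*tok)   # printtoken() is defined below
#
# prints output along the lines of
#
#     1,0-1,1:    NAME    'x'
#     1,2-1,3:    OP      '='
#     1,4-1,5:    OP      '.'
#     1,5-1,6:    OP      '.'
#     1,6-1,7:    OP      '.'
#
# followed by the NEWLINE and ENDMARKER lines.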


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(f"{srow},{scol}-{erow},{ecol}:\t{tok_name[type]}\t{token!r}")


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        token_iterator = tokenize(open(sys.argv[1]).read())
    else:
        token_iterator = tokenize(sys.stdin.read())

    for tok in token_iterator:
        printtoken(*tok)
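
# Illustrative usage (assumed invocation, not part of the original module):
# running the module as a script tokenizes a file named on the command line,
# or stdin otherwise, printing one line per token via printtoken(), e.g.
#
#     python -m blib2to3.pgen2.tokenize some_file.py
#     echo "x = 1" | python -m blib2to3.pgen2.tokenize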