Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 74%
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls
6"""Tokenization help for Python programs.
8generate_tokens(readline) is a generator that breaks a stream of
9text into Python tokens. It accepts a readline-like method which is called
10repeatedly to get the next line of input (or "" for EOF). It generates
115-tuples with these members:
13 the token type (see token.py)
14 the token (a string)
15 the starting (row, column) indices of the token (a 2-tuple of ints)
16 the ending (row, column) indices of the token (a 2-tuple of ints)
17 the original line (string)
19It is designed to match the working of the Python tokenizer exactly, except
20that it produces COMMENT tokens for comments and gives type OP for all
21operators
23Older entry points
24 tokenize_loop(readline, tokeneater)
25 tokenize(readline, tokeneater=printtoken)
26are the same, except instead of generating tokens, tokeneater is a callback
27function to which the 5 fields described above are passed as 5 arguments,
28each time a new token is found."""

import sys
from collections.abc import Iterator

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    FSTRING_END,
    FSTRING_MIDDLE,
    FSTRING_START,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    TSTRING_END,
    TSTRING_MIDDLE,
    TSTRING_START,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import pytokens
from pytokens import TokenType

from . import token as _token

__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

Coord = tuple[int, int]
TokenInfo = tuple[int, str, Coord, Coord, str]
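
# Illustrative only: for the source "x = 1", the TokenInfo produced for the
# name `x` would look roughly like
#     (NAME, "x", (1, 0), (1, 1), "x = 1")
# i.e. (token type, token string, start (row, col), end (row, col), source line).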

TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.tstring_start: TSTRING_START,
    TokenType.tstring_middle: TSTRING_MIDDLE,
    TokenType.tstring_end: TSTRING_END,
    TokenType.endmarker: ENDMARKER,
}


class TokenError(Exception): ...


def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: pytokens.Token | None
) -> pytokens.Token:
108 r"""
109 Black treats `\\\n` at the end of a line as a 'NL' token, while it
110 is ignored as whitespace in the regular Python parser.
111 But, only the first one. If there's a `\\\n` following it
112 (as in, a \ just by itself on a line), that is not made into NL.
113 """
    if (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    ):
        token_str = source[token.start_index : token.end_index]
        if token_str.startswith("\\\r\n"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 3,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 3,
            )
        elif token_str.startswith("\\\n") or token_str.startswith("\\\r"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 2,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 2,
            )

    return token
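
# Illustrative sketch of transform_whitespace (the token values are made up,
# not taken from a real pytokens run): if pytokens reports a whitespace token
# whose text starts with "\\\n" and the previous token is not NL/NEWLINE, the
# token is rewritten into a two-character NL token at the same position:
#
#     ws = pytokens.Token(TokenType.whitespace, 8, 10, 1, 8, 2, 0)
#     transform_whitespace(ws, "x = 1 + \\\n    2\n", prev_token=plus_token)
#     # -> Token(TokenType.nl, 8, 10, 1, 8, 1, 10)
#
# where `plus_token` is a stand-in for whatever token preceded the whitespace.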


def tokenize(source: str, grammar: Grammar | None = None) -> Iterator[TokenInfo]:
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0

    prev_token: pytokens.Token | None = None
    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield empty newline tokens at the end of a file
                # if there's no newline at the end of a file.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses `async` and `await` token types just for those two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet, yield 3 DOTs instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                token_type = TOKEN_TYPE_MAP.get(token.type)
                if token_type is None:
                    raise ValueError(f"Unknown token type: {token.type!r}")
                yield (
                    token_type,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))
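
# Minimal usage sketch (illustrative; assumes blib2to3 and pytokens are
# importable):
#
#     for tok_type, tok_str, start, end, line in tokenize("x = 1\n"):
#         print(tok_name[tok_type], repr(tok_str), start, end)
#
# which prints one entry per token, roughly: NAME 'x' (1, 0) (1, 1), followed
# by OP, NUMBER, NEWLINE and ENDMARKER entries.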


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(f"{srow},{scol}-{erow},{ecol}:\t{tok_name[type]}\t{token!r}")


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as source_file:
            token_iterator = tokenize(source_file.read())
    else:
        token_iterator = tokenize(sys.stdin.read())

    for tok in token_iterator:
        printtoken(*tok)