Coverage for /pythoncovmergedfiles/medio/medio/src/black/src/blib2to3/pgen2/tokenize.py: 75%
# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

# mypy: allow-untyped-defs, allow-untyped-calls

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
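
# For example (an illustrative sketch; exact coordinates come from the
# tokenizer): feeding the single line "x = 1\n" through the tokenizer defined
# below yields 5-tuples along the lines of
#
#     (NAME,   "x", (1, 0), (1, 1), "x = 1")
#     (OP,     "=", (1, 2), (1, 3), "x = 1")
#     (NUMBER, "1", (1, 4), (1, 5), "x = 1")
#
# followed by NEWLINE and ENDMARKER tokens. Rows are 1-based and columns are
# 0-based.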

import sys
from collections.abc import Iterator
from typing import Optional

from blib2to3.pgen2.grammar import Grammar
from blib2to3.pgen2.token import (
    ASYNC,
    AWAIT,
    COMMENT,
    DEDENT,
    ENDMARKER,
    ERRORTOKEN,
    FSTRING_END,
    FSTRING_MIDDLE,
    FSTRING_START,
    INDENT,
    NAME,
    NEWLINE,
    NL,
    NUMBER,
    OP,
    STRING,
    tok_name,
)

__author__ = "Ka-Ping Yee <ping@lfw.org>"
__credits__ = "GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro"

import pytokens
from pytokens import TokenType

from . import token as _token

__all__ = [x for x in dir(_token) if x[0] != "_"] + [
    "tokenize",
    "generate_tokens",
    "untokenize",
]
del _token

Coord = tuple[int, int]
TokenInfo = tuple[int, str, Coord, Coord, str]

TOKEN_TYPE_MAP = {
    TokenType.indent: INDENT,
    TokenType.dedent: DEDENT,
    TokenType.newline: NEWLINE,
    TokenType.nl: NL,
    TokenType.comment: COMMENT,
    TokenType.semicolon: OP,
    TokenType.lparen: OP,
    TokenType.rparen: OP,
    TokenType.lbracket: OP,
    TokenType.rbracket: OP,
    TokenType.lbrace: OP,
    TokenType.rbrace: OP,
    TokenType.colon: OP,
    TokenType.op: OP,
    TokenType.identifier: NAME,
    TokenType.number: NUMBER,
    TokenType.string: STRING,
    TokenType.fstring_start: FSTRING_START,
    TokenType.fstring_middle: FSTRING_MIDDLE,
    TokenType.fstring_end: FSTRING_END,
    TokenType.endmarker: ENDMARKER,
}
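
# In other words, every delimiter (parentheses, brackets, braces, colons,
# semicolons) and every general operator collapses to the single OP type here,
# matching the docstring above, which says the tokenizer "gives type OP for all
# operators". For instance, TokenType.lparen for "(" comes out as an OP token
# rather than a dedicated LPAR token.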


class TokenError(Exception): ...


def transform_whitespace(
    token: pytokens.Token, source: str, prev_token: Optional[pytokens.Token]
) -> pytokens.Token:
    r"""
    Black treats `\\\n` at the end of a line as an 'NL' token, while it
    is ignored as whitespace in the regular Python parser.
    But only the first one: if there's another `\\\n` following it
    (as in, a \ just by itself on a line), that one is not made into NL.
    """
    if (
        token.type == TokenType.whitespace
        and prev_token is not None
        and prev_token.type not in (TokenType.nl, TokenType.newline)
    ):
        token_str = source[token.start_index : token.end_index]
        if token_str.startswith("\\\r\n"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 3,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 3,
            )
        elif token_str.startswith("\\\n") or token_str.startswith("\\\r"):
            return pytokens.Token(
                TokenType.nl,
                token.start_index,
                token.start_index + 2,
                token.start_line,
                token.start_col,
                token.start_line,
                token.start_col + 2,
            )

    return token
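
# Sketch of the effect (assuming pytokens emits the trailing backslash and the
# line break as part of a whitespace token): in
#
#     x = 1 + \
#         2
#
# the whitespace run that begins with "\\\n" after the "+" operator is rewritten
# into an NL token covering just the backslash and the line break, so downstream
# code sees an explicit NL there instead of plain whitespace.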


def tokenize(source: str, grammar: Optional[Grammar] = None) -> Iterator[TokenInfo]:
    lines = source.split("\n")
    lines += [""]  # For newline tokens in files that don't end in a newline
    line, column = 1, 0

    prev_token: Optional[pytokens.Token] = None
    try:
        for token in pytokens.tokenize(source):
            token = transform_whitespace(token, source, prev_token)

            line, column = token.start_line, token.start_col
            if token.type == TokenType.whitespace:
                continue

            token_str = source[token.start_index : token.end_index]

            if token.type == TokenType.newline and token_str == "":
                # Black doesn't yield an empty NEWLINE token at the end of a
                # file that doesn't end in a newline.
                prev_token = token
                continue

            source_line = lines[token.start_line - 1]

            if token.type == TokenType.identifier and token_str in ("async", "await"):
                # Black uses the `async` and `await` token types just for those
                # two keywords
                yield (
                    ASYNC if token_str == "async" else AWAIT,
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            elif token.type == TokenType.op and token_str == "...":
                # Black doesn't have an ellipsis token yet; yield 3 single-dot
                # OP tokens instead
                assert token.start_line == token.end_line
                assert token.end_col == token.start_col + 3

                token_str = "."
                for start_col in range(token.start_col, token.start_col + 3):
                    end_col = start_col + 1
                    yield (
                        TOKEN_TYPE_MAP[token.type],
                        token_str,
                        (token.start_line, start_col),
                        (token.end_line, end_col),
                        source_line,
                    )
            else:
                yield (
                    TOKEN_TYPE_MAP[token.type],
                    token_str,
                    (token.start_line, token.start_col),
                    (token.end_line, token.end_col),
                    source_line,
                )
            prev_token = token

    except pytokens.UnexpectedEOF:
        raise TokenError("Unexpected EOF in multi-line statement", (line, column))
    except pytokens.TokenizeError as exc:
        raise TokenError(f"Failed to parse: {type(exc).__name__}", (line, column))
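
# Example of the "..." special case above (a sketch; exact coordinates come
# from pytokens): tokenizing "x = ...\n" emits the ellipsis as three
# consecutive one-character OP tokens,
#
#     (OP, ".", (1, 4), (1, 5), "x = ...")
#     (OP, ".", (1, 5), (1, 6), "x = ...")
#     (OP, ".", (1, 6), (1, 7), "x = ...")
#
# and the identifiers "async"/"await" come out with the dedicated ASYNC/AWAIT
# token types rather than NAME.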


def printtoken(
    type: int, token: str, srow_col: Coord, erow_col: Coord, line: str
) -> None:  # for testing
    (srow, scol) = srow_col
    (erow, ecol) = erow_col
    print(f"{srow},{scol}-{erow},{ecol}:\t{tok_name[type]}\t{token!r}")
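
# With the ellipsis example above, printtoken would emit lines such as (tabs
# shown as spaces):
#
#     1,4-1,5:    OP    '.'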


if __name__ == "__main__":  # testing
    if len(sys.argv) > 1:
        token_iterator = tokenize(open(sys.argv[1]).read())
    else:
        token_iterator = tokenize(sys.stdin.read())

    for tok in token_iterator:
        printtoken(*tok)
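
# Example invocation (assuming blib2to3 is importable, e.g. from a Black
# checkout or installation):
#
#     python -m blib2to3.pgen2.tokenize some_file.py
#     echo "x = 1" | python -m blib2to3.pgen2.tokenize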