# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


"""
Parso's tokenize doesn't give us tokens in the format that we'd ideally like, so this
performs a small number of transformations to the token stream:

- `end_pos` is precomputed as a property, instead of lazily as a method, for more
  efficient access.
- `whitespace_before` and `whitespace_after` have been added. These include the correct
  indentation information.
- `prefix` is removed, since we don't use it anywhere.
- `ERRORTOKEN` and `ERROR_DEDENT` have been removed, because we don't intend to support
  error recovery. If we encounter token errors, we'll raise a ParserSyntaxError instead.

If performance becomes a concern, we can rewrite this later as a fork of the original
tokenize module, instead of as a wrapper.
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Generator, Iterator, List, Optional, Sequence

from libcst._add_slots import add_slots
from libcst._exceptions import ParserSyntaxError
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.parso.python.tokenize import (
    Token as OrigToken,
    tokenize_lines as orig_tokenize_lines,
)
from libcst._parser.parso.utils import PythonVersionInfo, split_lines
from libcst._parser.types.token import Token
from libcst._parser.types.whitespace_state import WhitespaceState

_ERRORTOKEN: TokenType = PythonTokenTypes.ERRORTOKEN
_ERROR_DEDENT: TokenType = PythonTokenTypes.ERROR_DEDENT

_INDENT: TokenType = PythonTokenTypes.INDENT
_DEDENT: TokenType = PythonTokenTypes.DEDENT
_ENDMARKER: TokenType = PythonTokenTypes.ENDMARKER

_FSTRING_START: TokenType = PythonTokenTypes.FSTRING_START
_FSTRING_END: TokenType = PythonTokenTypes.FSTRING_END

_OP: TokenType = PythonTokenTypes.OP


class _ParenthesisOrFStringStackEntry(Enum):
    PARENTHESIS = 0
    FSTRING = 1


_PARENTHESIS_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.PARENTHESIS
)
_FSTRING_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.FSTRING
)
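# Note: the two enum values must stay distinct. With equal values, Enum would
# treat FSTRING as an alias of PARENTHESIS, and the is_parenthesized comparison
# below could never tell the two stack entries apart.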


@add_slots
@dataclass(frozen=False)
class _TokenizeState:
    lines: Sequence[str]
    previous_whitespace_state: WhitespaceState = field(
        default_factory=lambda: WhitespaceState(
            line=1, column=0, absolute_indent="", is_parenthesized=False
        )
    )
    indents: List[str] = field(default_factory=lambda: [""])
    parenthesis_or_fstring_stack: List[_ParenthesisOrFStringStackEntry] = field(
        default_factory=list
    )


def tokenize(code: str, version_info: PythonVersionInfo) -> Iterator[Token]:
    try:
        # Prefer the optional native tokenizer extension when it's installed;
        # fall back to the pure-Python wrapper otherwise.
        from libcst_native import tokenize as native_tokenize

        return native_tokenize.tokenize(code)
    except ImportError:
        lines = split_lines(code, keepends=True)
        return tokenize_lines(code, lines, version_info)


def tokenize_lines(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Iterator[Token]:
    try:
        from libcst_native import tokenize as native_tokenize

        # TODO: pass through version_info
        return native_tokenize.tokenize(code)
    except ImportError:
        return tokenize_lines_py(code, lines, version_info)


def tokenize_lines_py(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Generator[Token, None, None]:
    state = _TokenizeState(lines)
    orig_tokens_iter = iter(orig_tokenize_lines(lines, version_info))

    # Iterate over the tokens and pass them to _convert_token, providing a one-token
    # lookahead, to enable proper indent handling.
    try:
        curr_token = next(orig_tokens_iter)
    except StopIteration:
        pass  # empty file
    else:
        for next_token in orig_tokens_iter:
            yield _convert_token(state, curr_token, next_token)
            curr_token = next_token
        yield _convert_token(state, curr_token, None)
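
# The loop above is the classic one-token-lookahead (pairwise) idiom. A rough
# equivalent sketch, assuming Python 3.10+ for `itertools.pairwise` (newer than
# what this module targets):
#
#   from itertools import chain, pairwise
#
#   for curr, nxt in pairwise(chain(orig_tokens_iter, [None])):
#       yield _convert_token(state, curr, nxt)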


def _convert_token(  # noqa: C901: too complex
    state: _TokenizeState, curr_token: OrigToken, next_token: Optional[OrigToken]
) -> Token:
    ct_type = curr_token.type
    ct_string = curr_token.string
    ct_start_pos = curr_token.start_pos
    if ct_type is _ERRORTOKEN:
        raise ParserSyntaxError(
            f"{ct_string!r} is not a valid token.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )
    if ct_type is _ERROR_DEDENT:
        raise ParserSyntaxError(
            "Inconsistent indentation. Expected a dedent.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )

    # Compute relative indent changes for indent/dedent nodes
    relative_indent: Optional[str] = None
    if ct_type is _INDENT:
        old_indent = "" if len(state.indents) < 2 else state.indents[-2]
        new_indent = state.indents[-1]
        relative_indent = new_indent[len(old_indent) :]
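    # Worked example (illustrative): if state.indents is ["", "    ", "        "]
    # when an INDENT token arrives, old_indent is "    ", new_indent is "        ",
    # and relative_indent is the newly added "    ".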

    # Use the one-token lookahead to maintain the indentation stack: an upcoming
    # INDENT pushes the new line's whitespace prefix, an upcoming DEDENT pops it.
    if next_token is not None:
        nt_type = next_token.type
        if nt_type is _INDENT:
            nt_line, nt_column = next_token.start_pos
            state.indents.append(state.lines[nt_line - 1][:nt_column])
        elif nt_type is _DEDENT:
            state.indents.pop()

    whitespace_before = state.previous_whitespace_state

    if ct_type is _INDENT or ct_type is _DEDENT or ct_type is _ENDMARKER:
        # Don't update whitespace state for these dummy tokens. This makes it possible
        # to partially parse whitespace for IndentedBlock footers, and then parse the
        # rest of the whitespace in the following statement's leading_lines.
        # Unfortunately, that means that the indentation is either wrong for the footer
        # comments, or for the next line. We've chosen to allow it to be wrong for the
        # IndentedBlock footer and manually override the state when parsing whitespace
        # in that particular node.
        whitespace_after = whitespace_before
        ct_end_pos = ct_start_pos
    else:
        # Not a dummy token, so update the whitespace state.

        # Compute our own end_pos, since parso's end_pos is wrong for triple-strings.
        lines = split_lines(ct_string)
        if len(lines) > 1:
            ct_end_pos = ct_start_pos[0] + len(lines) - 1, len(lines[-1])
        else:
            ct_end_pos = (ct_start_pos[0], ct_start_pos[1] + len(ct_string))
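        # Worked example (illustrative): a '"""a\nb"""' token starting at (1, 0)
        # splits into ['"""a', 'b"""'], giving ct_end_pos == (1 + 2 - 1, 4) == (2, 4).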

        # Figure out what mode the whitespace parser should use. If we're inside
        # parentheses, certain whitespace (e.g. newlines) are allowed where they would
        # otherwise not be. f-strings override and disable this behavior, however.
        #
        # Parso's tokenizer tracks this internally, but doesn't expose it, so we have
        # to duplicate that logic here.

        pof_stack = state.parenthesis_or_fstring_stack
        try:
            if ct_type is _FSTRING_START:
                pof_stack.append(_FSTRING_STACK_ENTRY)
            elif ct_type is _FSTRING_END:
                pof_stack.pop()
            elif ct_type is _OP:
                if ct_string in "([{":
                    pof_stack.append(_PARENTHESIS_STACK_ENTRY)
                elif ct_string in ")]}":
                    pof_stack.pop()
        except IndexError:
            # pof_stack may be empty by the time we need to read from it due to
            # mismatched braces.
            raise ParserSyntaxError(
                "Encountered a closing brace without a matching opening brace.",
                lines=state.lines,
                raw_line=ct_start_pos[0],
                raw_column=ct_start_pos[1],
            )
        is_parenthesized = (
            len(pof_stack) > 0 and pof_stack[-1] == _PARENTHESIS_STACK_ENTRY
        )
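        # Example (illustrative): in `(f"{x}")`, the stack is [PARENTHESIS] outside
        # the f-string but [PARENTHESIS, FSTRING] while inside it, so the `x` token
        # is not treated as parenthesized until FSTRING_END pops the entry.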

        whitespace_after = WhitespaceState(
            ct_end_pos[0], ct_end_pos[1], state.indents[-1], is_parenthesized
        )

    # Hold onto whitespace_after, so we can use it as whitespace_before in the next
    # node.
    state.previous_whitespace_state = whitespace_after

    return Token(
        ct_type,
        ct_string,
        ct_start_pos,
        ct_end_pos,
        whitespace_before,
        whitespace_after,
        relative_indent,
    )