Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/libcst/_parser/detect_config.py: 39%

92 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-25 06:43 +0000

1# Copyright (c) Meta Platforms, Inc. and affiliates. 

2# 

3# This source code is licensed under the MIT license found in the 

4# LICENSE file in the root directory of this source tree. 

5 

6 

7import itertools 

8import re 

9from dataclasses import dataclass 

10from io import BytesIO 

11from tokenize import detect_encoding as py_tokenize_detect_encoding 

12from typing import FrozenSet, Iterable, Iterator, Pattern, Set, Tuple, Union 

13 

14from libcst._nodes.whitespace import NEWLINE_RE 

15from libcst._parser.parso.python.token import PythonTokenTypes, TokenType 

16from libcst._parser.parso.utils import split_lines 

17from libcst._parser.types.config import AutoConfig, ParserConfig, PartialParserConfig 

18from libcst._parser.types.token import Token 

19from libcst._parser.wrapped_tokenize import tokenize_lines 

20 

# Short aliases for the token types this module inspects repeatedly.
_INDENT: TokenType = PythonTokenTypes.INDENT
_NAME: TokenType = PythonTokenTypes.NAME
_NEWLINE: TokenType = PythonTokenTypes.NEWLINE
_STRING: TokenType = PythonTokenTypes.STRING

# Defaults used when the source contains nothing to detect from (no newline /
# no indented block).
_FALLBACK_DEFAULT_NEWLINE = "\n"
# NOTE(review): coverage-report extraction may have collapsed whitespace here;
# upstream libcst uses a multi-space (4-space) indent default — verify.
_FALLBACK_DEFAULT_INDENT = " "
# Matches a backslash line-continuation together with the newline it escapes.
_CONTINUATION_RE: Pattern[str] = re.compile(r"\\(\r\n?|\n)", re.UNICODE)

29 

30 

@dataclass(frozen=True)
class ConfigDetectionResult:
    """
    The result of :func:`detect_config`: the derived parser configuration
    paired with the token stream it was computed from.
    """

    # The config is a set of constant values used by the parser.
    config: ParserConfig
    # The tokens iterator is mutated by the parser.
    tokens: Iterator[Token]

37 

38 

39def _detect_encoding(source: Union[str, bytes]) -> str: 

40 """ 

41 Detects the encoding from the presence of a UTF-8 BOM or an encoding cookie as 

42 specified in PEP 263. 

43 

44 If given a string (instead of bytes) the encoding is assumed to be utf-8. 

45 """ 

46 

47 if isinstance(source, str): 

48 return "utf-8" 

49 return py_tokenize_detect_encoding(BytesIO(source).readline)[0] 

50 

51 

def _detect_default_newline(source_str: str) -> str:
    """
    Finds the first newline, and uses that value as the default newline.
    """
    # Don't use `NEWLINE_RE` for this, because it might match multiple newlines as a
    # single newline.
    first_newline = NEWLINE_RE.search(source_str)
    if first_newline is None:
        return _FALLBACK_DEFAULT_NEWLINE
    return first_newline.group(0)

60 

61 

def _detect_indent(tokens: Iterable[Token]) -> str:
    """
    Finds the first INDENT token, and uses that as the value of the default indent.
    """
    for tok in tokens:
        if tok.type is not _INDENT:
            continue
        relative_indent = tok.relative_indent
        assert relative_indent is not None, "INDENT tokens must contain a relative_indent"
        return relative_indent
    # No indented block anywhere in the source.
    return _FALLBACK_DEFAULT_INDENT

73 

74 

def _detect_trailing_newline(source_str: str) -> bool:
    """
    Returns True when the source ends with a real trailing newline — one that is
    not merely the tail of a backslash line-continuation.
    """
    if not source_str:
        return False
    if not NEWLINE_RE.fullmatch(source_str[-1]):
        return False
    # A final "\\\n" (2 chars) or "\\\r\n" (3 chars) is a continuation, not a
    # trailing newline.
    if _CONTINUATION_RE.fullmatch(source_str[-2:]):
        return False
    if _CONTINUATION_RE.fullmatch(source_str[-3:]):
        return False
    return True

83 

84 

def _detect_future_imports(tokens: Iterable[Token]) -> FrozenSet[str]:
    """
    Finds __future__ imports in their proper locations.

    See `https://www.python.org/dev/peps/pep-0236/`_

    Implemented as a small state machine over the token stream:
        0 - start of a logical line; docstrings and NEWLINEs are skipped
        1 - saw ``from``
        2 - saw ``__future__``
        3 - saw ``import``; NAME tokens here are collected as features
        4 - saw ``as``; the following NAME is an alias and is not collected
    Scanning stops at the first token that cannot belong to the future-import
    prologue.
    """
    future_imports: Set[str] = set()
    state = 0
    for tok in tokens:
        if state == 0 and tok.type in (_STRING, _NEWLINE):
            continue
        elif state == 0 and tok.string == "from":
            state = 1
        elif state == 1 and tok.string == "__future__":
            state = 2
        elif state == 2 and tok.string == "import":
            state = 3
        # ``as`` must be checked before the generic NAME branch below, since an
        # ``as`` token is itself a NAME.
        elif state == 3 and tok.string == "as":
            state = 4
        elif state == 3 and tok.type == _NAME:
            future_imports.add(tok.string)
        elif state == 4 and tok.type == _NAME:
            state = 3
        # BUGFIX: this was ``tok.string in "(),"`` — a substring test, which is
        # also True for the empty string (e.g. an ENDMARKER's ""), causing such
        # tokens to be skipped instead of terminating the scan.
        elif state == 3 and tok.string in ("(", ")", ","):
            continue
        elif state == 3 and tok.type == _NEWLINE:
            state = 0
        else:
            break
    return frozenset(future_imports)

115 

116 

def convert_to_utf8(
    source: Union[str, bytes], *, partial: PartialParserConfig
) -> Tuple[str, str]:
    """
    Returns an (original encoding, converted source) tuple.
    """
    # An explicitly configured encoding wins; otherwise sniff it from the bytes.
    configured_encoding = partial.encoding
    if isinstance(configured_encoding, AutoConfig):
        encoding = _detect_encoding(source)
    else:
        encoding = configured_encoding

    if isinstance(source, str):
        return (encoding, source)
    return (encoding, source.decode(encoding))

132 

133 

def detect_config(
    source: Union[str, bytes],
    *,
    partial: PartialParserConfig,
    detect_trailing_newline: bool,
    detect_default_newline: bool,
) -> ConfigDetectionResult:
    """
    Computes a ParserConfig given the current source code to be parsed and a partial
    config.
    """
    python_version = partial.parsed_python_version
    encoding, source_str = convert_to_utf8(source, partial=partial)

    # Resolve the default newline: explicit config wins, then detection (when
    # requested), then the hard-coded fallback.
    configured_newline = partial.default_newline
    if not isinstance(configured_newline, AutoConfig):
        default_newline = configured_newline
    elif detect_default_newline:
        default_newline = _detect_default_newline(source_str)
    else:
        default_newline = _FALLBACK_DEFAULT_NEWLINE

    # HACK: The grammar requires a trailing newline, but python doesn't actually
    # require one. Append one so the parser is happy; cst.Module's codegen strips
    # it back out. (parso relies on error recovery for this, which we don't have;
    # lib2to3 doesn't handle the case at all AFAICT.)
    has_trailing_newline = detect_trailing_newline and _detect_trailing_newline(
        source_str
    )
    if detect_trailing_newline and not has_trailing_newline:
        source_str += default_newline

    lines = split_lines(source_str, keepends=True)
    tokens = tokenize_lines(source_str, lines, python_version)

    configured_indent = partial.default_indent
    if isinstance(configured_indent, AutoConfig):
        # `_detect_indent` consumes tokens, so hand it a tee'd duplicate and keep
        # the other branch for the parser. CPython's `itertools.tee` uses
        # weakrefs in its FIFO, so the duplicate doesn't retain items once it is
        # freed at the end of this function (subject to GC/refcounting).
        tokens, indent_probe = itertools.tee(tokens)
        default_indent = _detect_indent(indent_probe)
    else:
        default_indent = configured_indent

    configured_future_imports = partial.future_imports
    if isinstance(configured_future_imports, AutoConfig):
        # Same tee trick as above: `_detect_future_imports` also consumes tokens.
        tokens, future_probe = itertools.tee(tokens)
        future_imports = _detect_future_imports(future_probe)
    else:
        future_imports = configured_future_imports

    return ConfigDetectionResult(
        config=ParserConfig(
            lines=lines,
            encoding=encoding,
            default_indent=default_indent,
            default_newline=default_newline,
            has_trailing_newline=has_trailing_newline,
            version=python_version,
            future_imports=future_imports,
        ),
        tokens=tokens,
    )