Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/libcst/_parser/detect_config.py: 39%
92 statements
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import itertools
import re
from dataclasses import dataclass
from io import BytesIO
from tokenize import detect_encoding as py_tokenize_detect_encoding
from typing import FrozenSet, Iterable, Iterator, Pattern, Set, Tuple, Union

from libcst._nodes.whitespace import NEWLINE_RE
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.parso.utils import split_lines
from libcst._parser.types.config import AutoConfig, ParserConfig, PartialParserConfig
from libcst._parser.types.token import Token
from libcst._parser.wrapped_tokenize import tokenize_lines

_INDENT: TokenType = PythonTokenTypes.INDENT
_NAME: TokenType = PythonTokenTypes.NAME
_NEWLINE: TokenType = PythonTokenTypes.NEWLINE
_STRING: TokenType = PythonTokenTypes.STRING

_FALLBACK_DEFAULT_NEWLINE = "\n"
_FALLBACK_DEFAULT_INDENT = "    "
_CONTINUATION_RE: Pattern[str] = re.compile(r"\\(\r\n?|\n)", re.UNICODE)


@dataclass(frozen=True)
class ConfigDetectionResult:
    # The config is a set of constant values used by the parser.
    config: ParserConfig
    # The tokens iterator is mutated by the parser.
    tokens: Iterator[Token]


def _detect_encoding(source: Union[str, bytes]) -> str:
    """
    Detects the encoding from the presence of a UTF-8 BOM or an encoding cookie as
    specified in PEP 263.

    If given a string (instead of bytes) the encoding is assumed to be utf-8.
    """
    if isinstance(source, str):
        return "utf-8"
    return py_tokenize_detect_encoding(BytesIO(source).readline)[0]
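
# Illustrative sketch, not part of the original module: expected results given
# the stdlib `tokenize.detect_encoding` semantics this function delegates to.
#
#   _detect_encoding(b"# -*- coding: iso-8859-1 -*-\npass\n")  -> "iso-8859-1"
#   _detect_encoding(b"\xef\xbb\xbfpass\n")                    -> "utf-8-sig"
#   _detect_encoding("already a str")                          -> "utf-8"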


def _detect_default_newline(source_str: str) -> str:
    """
    Finds the first newline, and uses that value as the default newline.
    """
    # `NEWLINE_RE.search` returns the first newline in the document. Because the
    # pattern prefers the two-character `\r\n` over a bare `\r`, a Windows-style
    # newline is matched as a single newline rather than two.
    match = NEWLINE_RE.search(source_str)
    return match.group(0) if match is not None else _FALLBACK_DEFAULT_NEWLINE
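
# Illustrative sketch, not part of the original module:
#
#   _detect_default_newline("a = 1\r\nb = 2\n")   -> "\r\n"  (first newline wins)
#   _detect_default_newline("no newline at all")  -> "\n"    (fallback)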


def _detect_indent(tokens: Iterable[Token]) -> str:
    """
    Finds the first INDENT token, and uses that as the value of the default indent.
    """
    try:
        first_indent = next(t for t in tokens if t.type is _INDENT)
    except StopIteration:
        return _FALLBACK_DEFAULT_INDENT
    first_indent_str = first_indent.relative_indent
    assert first_indent_str is not None, "INDENT tokens must contain a relative_indent"
    return first_indent_str
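
# Illustrative sketch, not part of the original module, reusing the module's own
# `split_lines`/`tokenize_lines` helpers:
#
#   source = "if x:\n\tpass\n"
#   version = PartialParserConfig().parsed_python_version
#   toks = tokenize_lines(source, split_lines(source, keepends=True), version)
#   _detect_indent(toks)  -> "\t"
#
# A module with no indented block falls back to _FALLBACK_DEFAULT_INDENT.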


def _detect_trailing_newline(source_str: str) -> bool:
    if len(source_str) == 0 or not NEWLINE_RE.fullmatch(source_str[-1]):
        return False
    # Make sure that the last newline isn't preceded by a backslash continuation.
    # The two-character slice covers `\\\n` and `\\\r`; the three-character slice
    # covers `\\\r\n`.
    return not (
        _CONTINUATION_RE.fullmatch(source_str[-2:])
        or _CONTINUATION_RE.fullmatch(source_str[-3:])
    )
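
# Illustrative sketch, not part of the original module:
#
#   _detect_trailing_newline("x = 1\n")    -> True
#   _detect_trailing_newline("x = 1")      -> False  (no final newline)
#   _detect_trailing_newline("x = 1\\\n")  -> False  (the newline is part of a
#                                                     backslash continuation)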


def _detect_future_imports(tokens: Iterable[Token]) -> FrozenSet[str]:
    """
    Finds __future__ imports in their proper locations.

    See `https://www.python.org/dev/peps/pep-0236/`_
    """
    future_imports: Set[str] = set()
    # State machine: 0 = module prelude (docstrings and newlines may precede
    # future imports), 1 = saw `from`, 2 = saw `__future__`, 3 = inside the
    # import list, 4 = saw `as` (skipping the alias name).
    state = 0
    for tok in tokens:
        if state == 0 and tok.type in (_STRING, _NEWLINE):
            continue
        elif state == 0 and tok.string == "from":
            state = 1
        elif state == 1 and tok.string == "__future__":
            state = 2
        elif state == 2 and tok.string == "import":
            state = 3
        elif state == 3 and tok.string == "as":
            state = 4
        elif state == 3 and tok.type == _NAME:
            future_imports.add(tok.string)
        elif state == 4 and tok.type == _NAME:
            state = 3
        elif state == 3 and tok.string in "(),":
            continue
        elif state == 3 and tok.type == _NEWLINE:
            state = 0
        else:
            break
    return frozenset(future_imports)
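
# Illustrative sketch, not part of the original module:
#
#   src = '"""doc"""\nfrom __future__ import annotations, division\nx = 1\n'
#   version = PartialParserConfig().parsed_python_version
#   toks = tokenize_lines(src, split_lines(src, keepends=True), version)
#   _detect_future_imports(toks)  -> frozenset({"annotations", "division"})
#
# The docstring and newlines are skipped in state 0, and scanning stops at the
# first token that can't be part of a future-import statement (`x` here).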


def convert_to_utf8(
    source: Union[str, bytes], *, partial: PartialParserConfig
) -> Tuple[str, str]:
    """
    Returns an (original encoding, converted source) tuple.
    """
    partial_encoding = partial.encoding
    encoding = (
        _detect_encoding(source)
        if isinstance(partial_encoding, AutoConfig)
        else partial_encoding
    )

    source_str = source if isinstance(source, str) else source.decode(encoding)
    return (encoding, source_str)
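
# Illustrative sketch, not part of the original module: with the default
# PartialParserConfig (encoding left as AutoConfig), the cookie is honored.
#
#   convert_to_utf8(b"# -*- coding: iso-8859-1 -*-\nv = 1\n",
#                   partial=PartialParserConfig())
#   -> ("iso-8859-1", "# -*- coding: iso-8859-1 -*-\nv = 1\n")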


def detect_config(
    source: Union[str, bytes],
    *,
    partial: PartialParserConfig,
    detect_trailing_newline: bool,
    detect_default_newline: bool,
) -> ConfigDetectionResult:
    """
    Computes a ParserConfig given the current source code to be parsed and a partial
    config.
    """

    python_version = partial.parsed_python_version

    encoding, source_str = convert_to_utf8(source, partial=partial)

    partial_default_newline = partial.default_newline
    default_newline = (
        (
            _detect_default_newline(source_str)
            if detect_default_newline
            else _FALLBACK_DEFAULT_NEWLINE
        )
        if isinstance(partial_default_newline, AutoConfig)
        else partial_default_newline
    )

    # HACK: The grammar requires a trailing newline, but Python doesn't actually
    # require one. Add a newline onto the end to make the parser happy; we'll strip
    # it out again during cst.Module's codegen.
    #
    # I think parso relies on error recovery support to handle this, which we don't
    # have. lib2to3 doesn't handle this case at all AFAICT.
    has_trailing_newline = detect_trailing_newline and _detect_trailing_newline(
        source_str
    )
    if detect_trailing_newline and not has_trailing_newline:
        source_str += default_newline

    lines = split_lines(source_str, keepends=True)

    tokens = tokenize_lines(source_str, lines, python_version)

    partial_default_indent = partial.default_indent
    if isinstance(partial_default_indent, AutoConfig):
        # We need to clone `tokens` before passing it to `_detect_indent`, because
        # `_detect_indent` consumes some tokens, mutating `tokens`.
        #
        # Implementation detail: CPython's `itertools.tee` uses weakrefs to reduce
        # the size of its FIFO, so this doesn't retain items (leak memory) for
        # `tokens_dup` once `tokens_dup` is freed at the end of this function
        # (subject to GC/refcounting).
        tokens, tokens_dup = itertools.tee(tokens)
        default_indent = _detect_indent(tokens_dup)
    else:
        default_indent = partial_default_indent

    partial_future_imports = partial.future_imports
    if isinstance(partial_future_imports, AutoConfig):
        # Same note as above re: itertools.tee; `_detect_future_imports` will
        # consume tokens from its clone.
        tokens, tokens_dup = itertools.tee(tokens)
        future_imports = _detect_future_imports(tokens_dup)
    else:
        future_imports = partial_future_imports

    return ConfigDetectionResult(
        config=ParserConfig(
            lines=lines,
            encoding=encoding,
            default_indent=default_indent,
            default_newline=default_newline,
            has_trailing_newline=has_trailing_newline,
            version=python_version,
            future_imports=future_imports,
        ),
        tokens=tokens,
    )
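
# Illustrative usage sketch, not part of the original module:
#
#   result = detect_config(
#       b"if x:\n    pass",
#       partial=PartialParserConfig(),
#       detect_trailing_newline=True,
#       detect_default_newline=True,
#   )
#   result.config.encoding              -> "utf-8"
#   result.config.default_indent        -> "    "
#   result.config.default_newline       -> "\n"
#   result.config.has_trailing_newline  -> False  (a newline was appended
#                                                  internally to satisfy the grammar)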