Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/libcst/_parser/detect_config.py: 39%
92 statements
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import itertools
import re
from dataclasses import dataclass
from io import BytesIO
from tokenize import detect_encoding as py_tokenize_detect_encoding
from typing import FrozenSet, Iterable, Iterator, Pattern, Set, Tuple, Union

from libcst._nodes.whitespace import NEWLINE_RE
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.parso.utils import split_lines
from libcst._parser.types.config import AutoConfig, ParserConfig, PartialParserConfig
from libcst._parser.types.token import Token
from libcst._parser.wrapped_tokenize import tokenize_lines

_INDENT: TokenType = PythonTokenTypes.INDENT
_NAME: TokenType = PythonTokenTypes.NAME
_NEWLINE: TokenType = PythonTokenTypes.NEWLINE
_STRING: TokenType = PythonTokenTypes.STRING

_FALLBACK_DEFAULT_NEWLINE = "\n"
_FALLBACK_DEFAULT_INDENT = "    "
_CONTINUATION_RE: Pattern[str] = re.compile(r"\\(\r\n?|\n)", re.UNICODE)


@dataclass(frozen=True)
class ConfigDetectionResult:
    # The config is a set of constant values used by the parser.
    config: ParserConfig
    # The tokens iterator is mutated by the parser.
    tokens: Iterator[Token]


def _detect_encoding(source: Union[str, bytes]) -> str:
    """
    Detects the encoding from the presence of a UTF-8 BOM or an encoding cookie as
    specified in PEP 263.

    If given a string (instead of bytes) the encoding is assumed to be utf-8.
    """
    if isinstance(source, str):
        return "utf-8"
    return py_tokenize_detect_encoding(BytesIO(source).readline)[0]
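
# Illustrative sketch, not part of the original module: expected results given
# the stdlib `tokenize.detect_encoding` semantics this function delegates to.
#
#   _detect_encoding(b"# -*- coding: iso-8859-1 -*-\npass\n")  -> "iso-8859-1"
#   _detect_encoding(b"\xef\xbb\xbfpass\n")                    -> "utf-8-sig"
#   _detect_encoding("already a str")                          -> "utf-8"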


def _detect_default_newline(source_str: str) -> str:
    """
    Finds the first newline, and uses that value as the default newline.
    """
    # `NEWLINE_RE.search` returns the first newline in the document. Because the
    # pattern prefers the two-character `\r\n` over a bare `\r`, a Windows-style
    # newline is matched as a single newline rather than two.
    match = NEWLINE_RE.search(source_str)
    return match.group(0) if match is not None else _FALLBACK_DEFAULT_NEWLINE
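
# Illustrative sketch, not part of the original module:
#
#   _detect_default_newline("a = 1\r\nb = 2\n")   -> "\r\n"  (first newline wins)
#   _detect_default_newline("no newline at all")  -> "\n"    (fallback)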


def _detect_indent(tokens: Iterable[Token]) -> str:
    """
    Finds the first INDENT token, and uses that as the value of the default indent.
    """
    try:
        first_indent = next(t for t in tokens if t.type is _INDENT)
    except StopIteration:
        return _FALLBACK_DEFAULT_INDENT
    first_indent_str = first_indent.relative_indent
    assert first_indent_str is not None, "INDENT tokens must contain a relative_indent"
    return first_indent_str
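
# Illustrative sketch, not part of the original module, reusing the module's own
# `split_lines`/`tokenize_lines` helpers:
#
#   source = "if x:\n\tpass\n"
#   version = PartialParserConfig().parsed_python_version
#   toks = tokenize_lines(source, split_lines(source, keepends=True), version)
#   _detect_indent(toks)  -> "\t"
#
# A module with no indented block falls back to _FALLBACK_DEFAULT_INDENT.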


def _detect_trailing_newline(source_str: str) -> bool:
    if len(source_str) == 0 or not NEWLINE_RE.fullmatch(source_str[-1]):
        return False
    # Make sure that the last newline isn't preceded by a backslash continuation.
    # The two-character slice covers `\\\n` and `\\\r`; the three-character slice
    # covers `\\\r\n`.
    return not (
        _CONTINUATION_RE.fullmatch(source_str[-2:])
        or _CONTINUATION_RE.fullmatch(source_str[-3:])
    )
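
# Illustrative sketch, not part of the original module:
#
#   _detect_trailing_newline("x = 1\n")    -> True
#   _detect_trailing_newline("x = 1")      -> False  (no final newline)
#   _detect_trailing_newline("x = 1\\\n")  -> False  (the newline is part of a
#                                                     backslash continuation)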


def _detect_future_imports(tokens: Iterable[Token]) -> FrozenSet[str]:
    """
    Finds __future__ imports in their proper locations.

    See `https://www.python.org/dev/peps/pep-0236/`_
    """
    future_imports: Set[str] = set()
    # State machine: 0 = module prelude (docstrings and newlines may precede
    # future imports), 1 = saw `from`, 2 = saw `__future__`, 3 = inside the
    # import list, 4 = saw `as` (skipping the alias name).
    state = 0
    for tok in tokens:
        if state == 0 and tok.type in (_STRING, _NEWLINE):
            continue
        elif state == 0 and tok.string == "from":
            state = 1
        elif state == 1 and tok.string == "__future__":
            state = 2
        elif state == 2 and tok.string == "import":
            state = 3
        elif state == 3 and tok.string == "as":
            state = 4
        elif state == 3 and tok.type == _NAME:
            future_imports.add(tok.string)
        elif state == 4 and tok.type == _NAME:
            state = 3
        elif state == 3 and tok.string in "(),":
            continue
        elif state == 3 and tok.type == _NEWLINE:
            state = 0
        else:
            break
    return frozenset(future_imports)
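
# Illustrative sketch, not part of the original module:
#
#   src = '"""doc"""\nfrom __future__ import annotations, division\nx = 1\n'
#   version = PartialParserConfig().parsed_python_version
#   toks = tokenize_lines(src, split_lines(src, keepends=True), version)
#   _detect_future_imports(toks)  -> frozenset({"annotations", "division"})
#
# The docstring and newlines are skipped in state 0, and scanning stops at the
# first token that can't be part of a future-import statement (`x` here).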


def convert_to_utf8(
    source: Union[str, bytes], *, partial: PartialParserConfig
) -> Tuple[str, str]:
    """
    Returns an (original encoding, converted source) tuple.
    """
    partial_encoding = partial.encoding
    encoding = (
        _detect_encoding(source)
        if isinstance(partial_encoding, AutoConfig)
        else partial_encoding
    )

    source_str = source if isinstance(source, str) else source.decode(encoding)
    return (encoding, source_str)
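
# Illustrative sketch, not part of the original module: with the default
# PartialParserConfig (encoding left as AutoConfig), the cookie is honored.
#
#   convert_to_utf8(b"# -*- coding: iso-8859-1 -*-\nv = 1\n",
#                   partial=PartialParserConfig())
#   -> ("iso-8859-1", "# -*- coding: iso-8859-1 -*-\nv = 1\n")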


def detect_config(
    source: Union[str, bytes],
    *,
    partial: PartialParserConfig,
    detect_trailing_newline: bool,
    detect_default_newline: bool,
) -> ConfigDetectionResult:
    """
    Computes a ParserConfig given the current source code to be parsed and a partial
    config.
    """

    python_version = partial.parsed_python_version

    encoding, source_str = convert_to_utf8(source, partial=partial)

    partial_default_newline = partial.default_newline
    default_newline = (
        (
            _detect_default_newline(source_str)
            if detect_default_newline
            else _FALLBACK_DEFAULT_NEWLINE
        )
        if isinstance(partial_default_newline, AutoConfig)
        else partial_default_newline
    )

    # HACK: The grammar requires a trailing newline, but Python doesn't actually
    # require one. Add a newline onto the end to make the parser happy; we'll strip
    # it out again during cst.Module's codegen.
    #
    # I think parso relies on error recovery support to handle this, which we don't
    # have. lib2to3 doesn't handle this case at all AFAICT.
    has_trailing_newline = detect_trailing_newline and _detect_trailing_newline(
        source_str
    )
    if detect_trailing_newline and not has_trailing_newline:
        source_str += default_newline

    lines = split_lines(source_str, keepends=True)

    tokens = tokenize_lines(source_str, lines, python_version)

    partial_default_indent = partial.default_indent
    if isinstance(partial_default_indent, AutoConfig):
        # We need to clone `tokens` before passing it to `_detect_indent`, because
        # `_detect_indent` consumes some tokens, mutating `tokens`.
        #
        # Implementation detail: CPython's `itertools.tee` uses weakrefs to reduce
        # the size of its FIFO, so this doesn't retain items (leak memory) for
        # `tokens_dup` once `tokens_dup` is freed at the end of this function
        # (subject to GC/refcounting).
        tokens, tokens_dup = itertools.tee(tokens)
        default_indent = _detect_indent(tokens_dup)
    else:
        default_indent = partial_default_indent

    partial_future_imports = partial.future_imports
    if isinstance(partial_future_imports, AutoConfig):
        # Same note as above re: itertools.tee; `_detect_future_imports` will
        # consume tokens from its clone.
        tokens, tokens_dup = itertools.tee(tokens)
        future_imports = _detect_future_imports(tokens_dup)
    else:
        future_imports = partial_future_imports

    return ConfigDetectionResult(
        config=ParserConfig(
            lines=lines,
            encoding=encoding,
            default_indent=default_indent,
            default_newline=default_newline,
            has_trailing_newline=has_trailing_newline,
            version=python_version,
            future_imports=future_imports,
        ),
        tokens=tokens,
    )
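
# Illustrative usage sketch, not part of the original module:
#
#   result = detect_config(
#       b"if x:\n    pass",
#       partial=PartialParserConfig(),
#       detect_trailing_newline=True,
#       detect_default_newline=True,
#   )
#   result.config.encoding              -> "utf-8"
#   result.config.default_indent        -> "    "
#   result.config.default_newline       -> "\n"
#   result.config.has_trailing_newline  -> False  (a newline was appended
#                                                  internally to satisfy the grammar)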