Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/black/strings.py: 72%
165 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:15 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:15 +0000
1"""
2Simple formatting on strings. Further string formatting code is in trans.py.
3"""
5import re
6import sys
7from functools import lru_cache
8from typing import List, Match, Pattern
10from blib2to3.pytree import Leaf
12if sys.version_info < (3, 8):
13 from typing_extensions import Final
14else:
15 from typing import Final
17from black._width_table import WIDTH_TABLE
19STRING_PREFIX_CHARS: Final = "furbFURB" # All possible string prefix characters.
20STRING_PREFIX_RE: Final = re.compile(
21 r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
22)
23FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
24UNICODE_ESCAPE_RE: Final = re.compile(
25 r"(?P<backslashes>\\+)(?P<body>"
26 r"(u(?P<u>[a-fA-F0-9]{4}))" # Character with 16-bit hex value xxxx
27 r"|(U(?P<U>[a-fA-F0-9]{8}))" # Character with 32-bit hex value xxxxxxxx
28 r"|(x(?P<x>[a-fA-F0-9]{2}))" # Character with hex value hh
29 r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})" # Character named name in the Unicode database
30 r")",
31 re.VERBOSE,
32)
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply the `regex` -> `replacement` substitution to `original` twice.

    String normalization uses this to handle overlapping matches that a
    single pass of `re.sub` would leave behind.
    """
    once = regex.sub(replacement, original)
    return regex.sub(replacement, once)
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string starts with three quotation characters
        (after any string prefix such as f/r/b/u is skipped).
    """
    body = string.lstrip(STRING_PREFIX_CHARS)
    return body.startswith(('"""', "'''"))
def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Split `s` into lines, expanding tabs only within each line's leading
    whitespace (following the normal Python tab-expansion rules).
    """
    result: List[str] = []
    for raw_line in s.splitlines():
        # Locate the first non-whitespace character that follows leading
        # whitespace containing at least one tab.
        found = FIRST_NON_WHITESPACE_RE.match(raw_line)
        if found is None:
            # No tab in the leading whitespace: keep the line untouched.
            result.append(raw_line)
            continue
        split_at = found.start(1)
        result.append(raw_line[:split_at].expandtabs() + raw_line[split_at:])
    return result
def fix_docstring(docstring: str, prefix: str) -> str:
    """Dedent `docstring` per PEP 257 and re-indent its body with `prefix`.

    https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    """
    if not docstring:
        return ""
    lines = lines_with_leading_tabs_expanded(docstring)
    # Minimum indentation among the non-blank continuation lines; the first
    # line is special and never counts.
    indent = min(
        (len(line) - len(line.lstrip()) for line in lines[1:] if line.lstrip()),
        default=sys.maxsize,
    )
    trimmed = [lines[0].strip()]
    if indent < sys.maxsize:
        last = len(lines) - 2
        for idx, line in enumerate(lines[1:]):
            dedented = line[indent:].rstrip()
            # Blank interior lines become empty; the final line keeps its
            # prefix even when blank.
            trimmed.append(prefix + dedented if dedented or idx == last else "")
    return "\n".join(trimmed)
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        @string's prefix (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    # Collect prefix characters until the opening quote is reached.
    prefix_chars = []
    pos = 0
    while string[pos] in STRING_PREFIX_CHARS:
        prefix_chars.append(string[pos])
        pos += 1
    return "".join(prefix_chars)
def assert_is_leaf_string(string: str) -> None:
    """
    Checks the pre-condition that @string has the format that you would expect
    of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
    token.STRING`. A more precise description of the pre-conditions that are
    checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
          `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    dq = string.find('"')
    sq = string.find("'")
    # When one quote style is absent (find() == -1), take the other one;
    # otherwise the earlier occurrence marks the end of the prefix.
    if dq != -1 and sq != -1:
        quote_idx = min(dq, sq)
    else:
        quote_idx = max(dq, sq)

    assert 0 <= quote_idx < len(string) - 1, (
        f"{string!r} is missing a starting quote character (' or \")."
    )
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    prefix_chars = set(string[:quote_idx])
    assert prefix_chars.issubset(set(STRING_PREFIX_CHARS)), (
        f"{prefix_chars} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
    )
def normalize_string_prefix(s: str) -> str:
    """Make all string prefixes lowercase and drop redundant u/U prefixes."""
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    old_prefix = match.group(1)
    # Lowercase F and B; u/U carries no meaning in Python 3, so delete it.
    new_prefix = old_prefix.translate(
        str.maketrans({"F": "f", "B": "b", "U": None, "u": None})
    )

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and new_prefix[0].lower() != "r":
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{match.group(2)}"
171# Re(gex) does actually cache patterns internally but this still improves
172# performance on a long list literal of strings by 5-9% since lru_cache's
173# caching overhead is much lower.
174@lru_cache(maxsize=64)
175def _cached_compile(pattern: str) -> Pattern[str]:
176 return re.compile(pattern)
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.
    """
    # The string proper, with any f/r/b/u prefix characters stripped off.
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    # new_quote preceded by an even number of backslashes, i.e. NOT escaped.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # new_quote preceded by an odd number of backslashes, i.e. escaped.
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    # orig_quote preceded by an odd number of backslashes, i.e. escaped.
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        # (sub_twice is used because the matches can overlap)
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        # Scan interpolated {...} expressions ({{ / }} are literal braces).
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a trailing " would run into the closing """ delimiter
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation."""
    text = leaf.value
    prefix = get_string_prefix(text)
    if "r" in prefix.lower():
        # Raw strings have no escape sequences to normalize.
        return

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        slashes = groups["backslashes"]

        # An even number of backslashes means the backslash before the body
        # is itself escaped, so this is not an escape sequence at all.
        if len(slashes) % 2 == 0:
            return slashes + groups["body"]

        # \uXXXX, \UXXXXXXXX and \xHH: lowercase the hex digits.
        for marker in ("u", "U", "x"):
            if groups[marker]:
                return slashes + marker + groups[marker].lower()

        assert groups["N"], f"Unexpected match: {m}"
        # \N{...}: Unicode database names are uppercased instead.
        return slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)
@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters are counted as 2, while half width characters are
    counted as 1. Also control characters are counted as 0.
    """
    table = WIDTH_TABLE
    codepoint = ord(char)
    # Binary search over the table of (start, end, width) codepoint ranges.
    highest = len(table) - 1
    lowest = 0
    idx = highest // 2
    while True:
        start_codepoint, end_codepoint, width = table[idx]
        if codepoint < start_codepoint:
            highest = idx - 1
        elif codepoint > end_codepoint:
            lowest = idx + 1
        else:
            # Found a containing range; negative widths are clamped to 0.
            return 0 if width < 0 else width
        if highest < lowest:
            # Search space exhausted: codepoint is in no table range.
            break
        idx = (highest + lowest) // 2
    # Codepoints absent from the table default to half width.
    return 1
def str_width(line_str: str) -> int:
    """Return the display width of `line_str` in a terminal or editor that
    respects Unicode East Asian Width.

    Useful, for example, to decide whether a string is too wide to display.
    """
    if line_str.isascii():
        # Fast path: every ASCII character occupies exactly one column.
        return len(line_str)
    return sum(char_width(ch) for ch in line_str)
def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Count how many leading characters of `line_str` fit within `max_width`
    display columns in a terminal or editor (which respects Unicode East
    Asian Width).
    """
    used = 0
    for idx, ch in enumerate(line_str):
        width = char_width(ch)
        if used + width > max_width:
            # This character would overflow the budget; stop before it.
            return idx
        used += width
    return len(line_str)