Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/black/strings.py: 13%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Simple formatting on strings. Further string formatting code is in trans.py.
3"""
5import re
6import sys
7from functools import lru_cache
8from re import Match, Pattern
9from typing import Final
11from black._width_table import WIDTH_TABLE
12from blib2to3.pytree import Leaf
STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
# Splits a STRING leaf's text into (prefix)(rest-including-quotes).
# re.DOTALL lets `.*` span the newlines inside triple-quoted strings.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches a Unicode escape sequence together with the entire run of
# backslashes in front of it, so callers can distinguish a real escape
# (odd-length run) from an escaped backslash followed by literal text
# (even-length run).
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply the `regex` -> `replacement` substitution to `original` twice.

    String normalization performs replaces on overlapping matches; a single
    `re.sub` pass can miss a match created or exposed by the first pass, so
    the substitution is run a second time on the intermediate result.
    """
    once = regex.sub(replacement, original)
    return regex.sub(replacement, once)
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string opens with three quotation characters (after any
        string prefix such as f/r/b/u).
    """
    unprefixed = string.lstrip(STRING_PREFIX_CHARS)
    return unprefixed.startswith(('"""', "'''"))
def lines_with_leading_tabs_expanded(s: str) -> list[str]:
    """
    Split `s` into lines, expanding tabs only within each line's leading
    whitespace (following the normal Python tab-expansion rules).
    """
    result: list[str] = []
    for raw in s.splitlines():
        content = raw.lstrip()
        if content and content != raw:
            # Expand only the whitespace prefix; tabs in the body stay intact.
            indent_width = len(raw) - len(content)
            result.append(raw[:indent_width].expandtabs() + content)
        else:
            # Blank/whitespace-only lines and unindented lines pass through.
            result.append(raw)
    if s.endswith("\n"):
        # splitlines() drops the final empty line; restore it.
        result.append("")
    return result
def fix_multiline_docstring(docstring: str, prefix: str) -> str:
    """Dedent a multiline docstring and re-indent its lines with `prefix`.

    Implements the indentation-handling algorithm from PEP 257:
    https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    """
    assert docstring, "INTERNAL ERROR: Multiline docstrings cannot be empty"
    lines = lines_with_leading_tabs_expanded(docstring)
    # Smallest indentation among the continuation lines; the first line sits
    # right after the opening quotes, so it never participates.
    common_indent = sys.maxsize
    for line in lines[1:]:
        content = line.lstrip()
        if content:
            common_indent = min(common_indent, len(line) - len(content))
    trimmed = [lines[0].strip()]
    if common_indent < sys.maxsize:
        last_idx = len(lines) - 2  # index of the final line within lines[1:]
        for idx, line in enumerate(lines[1:]):
            dedented = line[common_indent:].rstrip()
            if dedented or idx == last_idx:
                # Keep the closing line even when blank so the quotes stay put.
                trimmed.append(prefix + dedented)
            else:
                trimmed.append("")
    return "\n".join(trimmed)
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        @string's prefix (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    # Scan forward until the first non-prefix character (a quote, per the
    # pre-condition) and slice the prefix off in one go.
    end = 0
    while string[end] in STRING_PREFIX_CHARS:
        end += 1
    return string[:end]
def assert_is_leaf_string(string: str) -> None:
    """
    Checks the pre-condition that @string has the format that you would expect
    of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
    token.STRING`. A more precise description of the pre-conditions that are
    checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
        `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    dquote_idx = string.find('"')
    squote_idx = string.find("'")
    # find() returns -1 for an absent quote style; max() then selects the one
    # real index. When both styles appear, the earlier one is the delimiter.
    if dquote_idx == -1 or squote_idx == -1:
        quote_idx = max(dquote_idx, squote_idx)
    else:
        quote_idx = min(dquote_idx, squote_idx)

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    prefix_chars = set(string[:quote_idx])
    assert prefix_chars.issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{prefix_chars} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
def normalize_string_prefix(s: str) -> str:
    """Make all string prefixes lowercase."""
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    orig_prefix = match.group(1)
    # Lowercase F and B; drop the legacy u/U prefix entirely (it is a no-op
    # in Python 3). Any other prefix character (r/R/f/b) passes through.
    translation = {"F": "f", "B": "b", "U": "", "u": ""}
    new_prefix = "".join(translation.get(ch, ch) for ch in orig_prefix)

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{match.group(2)}"
# `re` keeps its own internal pattern cache, but routing through lru_cache is
# still a 5-9% win on a long list literal of strings because lru_cache's
# lookup overhead is much lower.
@lru_cache(maxsize=64)
def _cached_compile(pattern: str) -> Pattern[str]:
    """Compile `pattern`, memoizing the resulting Pattern object."""
    compiled: Pattern[str] = re.compile(pattern)
    return compiled
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate.

    `s` is the full string token text, prefix and quotes included; the
    (possibly rewritten) token text is returned.
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    # Triple-double-quoted strings are already in the preferred form.
    if value[:3] == '"""':
        return s
    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    assert first_quote_pos != -1, f"INTERNAL ERROR: Malformed string {s!r}"
    prefix = s[:first_quote_pos]
    # unescaped_new_quote: new_quote preceded by an even number of backslashes
    # (i.e. genuinely unescaped); the escaped_* patterns match a quote behind
    # an odd number of backslashes.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    # The body is everything between the opening and closing quote runs.
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)

    if "f" in prefix.casefold():
        # Find the top-level {...} interpolations of the f-string (single
        # braces only; {{ and }} are literal brace escapes).
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{ # start of the string or a non-{ followed by a single {
                ([^{].*?) # contents of the brackets except if begins with {{
            \}(?:(?!\})|$) # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a trailing " would merge into the closing """ delimiter,
        # so it must be escaped
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
def normalize_fstring_quotes(
    quote: str,
    middles: list[Leaf],
    is_raw_fstring: bool,
) -> tuple[list[Leaf], str]:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate.

    The f-string counterpart of normalize_string_quotes: the literal text
    lives in the `middles` leaves (the chunks between interpolations), which
    are edited in place. Returns the (possibly modified) middles and the
    quote style to use.
    """
    if quote == '"""':
        # Triple-double-quoted f-strings are already in the preferred form.
        return middles, quote

    elif quote == "'''":
        new_quote = '"""'
    elif quote == '"':
        new_quote = "'"
    else:
        new_quote = '"'

    # Quote preceded by an even (unescaped) / odd (escaped) run of backslashes.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){quote}")
    if is_raw_fstring:
        for middle in middles:
            if unescaped_new_quote.search(middle.value):
                # There's at least one unescaped new_quote in this raw string
                # so converting is impossible
                return middles, quote

        # Do not introduce or remove backslashes in raw strings, just use double quote
        return middles, '"'

    new_segments: list[str] = []
    for middle in middles:
        segment = middle.value
        # remove unnecessary escapes
        new_segment = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", segment)
        if segment != new_segment:
            # Consider the string without unnecessary escapes as the original
            middle.value = new_segment

        new_segment = sub_twice(escaped_orig_quote, rf"\1\2{quote}", new_segment)
        new_segment = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_segment)
        new_segments.append(new_segment)

    if new_quote == '"""' and new_segments[-1].endswith('"'):
        # edge case: a trailing " would merge into the closing """ delimiter,
        # so it must be escaped
        new_segments[-1] = new_segments[-1][:-1] + '\\"'

    # First pass: bail out without mutating anything if any segment would end
    # up with more escaping (or if double quotes are already in use).
    for middle, new_segment in zip(middles, new_segments):
        orig_escape_count = middle.value.count("\\")
        new_escape_count = new_segment.count("\\")

        if new_escape_count > orig_escape_count:
            return middles, quote  # Do not introduce more escaping

        if new_escape_count == orig_escape_count and quote == '"':
            return middles, quote  # Prefer double quotes

    # Second pass: commit the converted text into the leaves.
    for middle, new_segment in zip(middles, new_segments):
        middle.value = new_segment

    return middles, new_quote
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation."""
    text = leaf.value
    if "r" in get_string_prefix(text).lower():
        # Raw strings have no escape sequences to normalize.
        return

    def replace(m: Match[str]) -> str:
        parts = m.groupdict()
        slashes = parts["backslashes"]

        # An even-length backslash run means the "escape" is really an escaped
        # backslash followed by literal text, so leave it untouched.
        if len(slashes) % 2 == 0:
            return slashes + parts["body"]

        if parts["u"]:
            # \u
            return slashes + "u" + parts["u"].lower()
        if parts["U"]:
            # \U
            return slashes + "U" + parts["U"].lower()
        if parts["x"]:
            # \x
            return slashes + "x" + parts["x"].lower()
        assert parts["N"], f"Unexpected match: {m}"
        # \N{...}: names are canonically uppercase in the Unicode database.
        return slashes + "N{" + parts["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)
@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters are counted as 2, while half width characters are
    counted as 1. Also control characters are counted as 0.
    """
    codepoint = ord(char)
    # Binary search over the sorted (start, end, width) ranges of WIDTH_TABLE.
    lo = 0
    hi = len(WIDTH_TABLE) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        range_start, range_end, width = WIDTH_TABLE[mid]
        if codepoint < range_start:
            hi = mid - 1
        elif codepoint > range_end:
            lo = mid + 1
        else:
            # Negative table entries denote zero-width (control) characters.
            return 0 if width < 0 else width
    # Codepoints absent from the table default to half width.
    return 1
def str_width(line_str: str) -> int:
    """Return the width of `line_str` as it would be displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    You could utilize this function to determine, for example, if a string
    is too wide to display in a terminal or editor.
    """
    if line_str.isascii():
        # Fast path: every ASCII character is exactly one column wide.
        return len(line_str)
    return sum(char_width(ch) for ch in line_str)
def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Count how many leading characters of `line_str` fit within a display
    width of `max_width` columns (which respects Unicode East Asian Width).
    """
    used = 0
    for idx, ch in enumerate(line_str):
        width = char_width(ch)
        if used + width > max_width:
            # `ch` would overflow the budget, so exactly `idx` chars fit.
            return idx
        used += width
    # The whole string fits.
    return len(line_str)