Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/black/strings.py: 13%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Simple formatting on strings. Further string formatting code is in trans.py.
3"""
5import re
6import sys
7from functools import lru_cache
8from re import Match, Pattern
9from typing import Final
11from black._width_table import WIDTH_TABLE
12from blib2to3.pytree import Leaf
STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
# Splits a STRING leaf's text into (prefix)(rest-including-quotes).
# re.DOTALL lets `.*` span the newlines inside triple-quoted strings.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches a Unicode escape sequence together with the entire run of
# backslashes in front of it, so callers can distinguish a real escape
# (odd-length run) from an escaped backslash followed by literal text
# (even-length run).
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply the `regex` -> `replacement` substitution to `original` twice.

    String normalization performs replaces on overlapping matches; a single
    `re.sub` pass can miss a match created or exposed by the first pass, so
    the substitution is run a second time on the intermediate result.
    """
    once = regex.sub(replacement, original)
    return regex.sub(replacement, once)
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string opens with three quotation characters (after any
        string prefix such as f/r/b/u).
    """
    unprefixed = string.lstrip(STRING_PREFIX_CHARS)
    return unprefixed.startswith(('"""', "'''"))
def lines_with_leading_tabs_expanded(s: str) -> list[str]:
    """
    Split `s` into lines, expanding tabs only within each line's leading
    whitespace (following the normal Python tab-expansion rules).
    """
    result: list[str] = []
    for raw in s.splitlines():
        content = raw.lstrip()
        if content and content != raw:
            # Expand only the whitespace prefix; tabs in the body stay intact.
            indent_width = len(raw) - len(content)
            result.append(raw[:indent_width].expandtabs() + content)
        else:
            # Blank/whitespace-only lines and unindented lines pass through.
            result.append(raw)
    if s.endswith("\n"):
        # splitlines() drops the final empty line; restore it.
        result.append("")
    return result
def fix_multiline_docstring(docstring: str, prefix: str) -> str:
    """Dedent a multiline docstring and re-indent its lines with `prefix`.

    Implements the indentation-handling algorithm from PEP 257:
    https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    """
    assert docstring, "INTERNAL ERROR: Multiline docstrings cannot be empty"
    lines = lines_with_leading_tabs_expanded(docstring)
    # Smallest indentation among the continuation lines; the first line sits
    # right after the opening quotes, so it never participates.
    common_indent = sys.maxsize
    for line in lines[1:]:
        content = line.lstrip()
        if content:
            common_indent = min(common_indent, len(line) - len(content))
    trimmed = [lines[0].strip()]
    if common_indent < sys.maxsize:
        last_idx = len(lines) - 2  # index of the final line within lines[1:]
        for idx, line in enumerate(lines[1:]):
            dedented = line[common_indent:].rstrip()
            if dedented or idx == last_idx:
                # Keep the closing line even when blank so the quotes stay put.
                trimmed.append(prefix + dedented)
            else:
                trimmed.append("")
    return "\n".join(trimmed)
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        @string's prefix (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    # Scan forward until the first non-prefix character (a quote, per the
    # pre-condition) and slice the prefix off in one go.
    end = 0
    while string[end] in STRING_PREFIX_CHARS:
        end += 1
    return string[:end]
def assert_is_leaf_string(string: str) -> None:
    """
    Checks the pre-condition that @string has the format that you would expect
    of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
    token.STRING`. A more precise description of the pre-conditions that are
    checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
        `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    dquote_idx = string.find('"')
    squote_idx = string.find("'")
    # find() returns -1 for an absent quote style; max() then selects the one
    # real index. When both styles appear, the earlier one is the delimiter.
    if dquote_idx == -1 or squote_idx == -1:
        quote_idx = max(dquote_idx, squote_idx)
    else:
        quote_idx = min(dquote_idx, squote_idx)

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    prefix_chars = set(string[:quote_idx])
    assert prefix_chars.issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{prefix_chars} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
def normalize_string_prefix(s: str) -> str:
    """Make all string prefixes lowercase."""
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    orig_prefix = match.group(1)
    # Lowercase F and B; drop the legacy u/U prefix entirely (it is a no-op
    # in Python 3). Any other prefix character (r/R/f/b) passes through.
    translation = {"F": "f", "B": "b", "U": "", "u": ""}
    new_prefix = "".join(translation.get(ch, ch) for ch in orig_prefix)

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and "r" != new_prefix[0].lower():
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{match.group(2)}"
# `re` keeps its own internal pattern cache, but routing through lru_cache is
# still a 5-9% win on a long list literal of strings because lru_cache's
# lookup overhead is much lower.
@lru_cache(maxsize=64)
def _cached_compile(pattern: str) -> Pattern[str]:
    """Compile `pattern`, memoizing the resulting Pattern object."""
    compiled: Pattern[str] = re.compile(pattern)
    return compiled
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate.

    `s` is the full string token text, prefix and quotes included; the
    (possibly rewritten) token text is returned.
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    # Triple-double-quoted strings are already in the preferred form.
    if value[:3] == '"""':
        return s
    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    assert first_quote_pos != -1, f"INTERNAL ERROR: Malformed string {s!r}"
    prefix = s[:first_quote_pos]
    # unescaped_new_quote: new_quote preceded by an even number of backslashes
    # (i.e. genuinely unescaped); the escaped_* patterns match a quote behind
    # an odd number of backslashes.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    # The body is everything between the opening and closing quote runs.
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)

    if "f" in prefix.casefold():
        # Find the top-level {...} interpolations of the f-string (single
        # braces only; {{ and }} are literal brace escapes).
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{ # start of the string or a non-{ followed by a single {
                ([^{].*?) # contents of the brackets except if begins with {{
            \}(?:(?!\})|$) # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a trailing " would merge into the closing """ delimiter,
        # so it must be escaped
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
def normalize_fstring_quotes(
    quote: str,
    middles: list[Leaf],
    is_raw_fstring: bool,
) -> tuple[list[Leaf], str]:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate.

    The f-string counterpart of normalize_string_quotes: the literal text
    lives in the `middles` leaves (the chunks between interpolations), which
    are edited in place. Returns the (possibly modified) middles and the
    quote style to use.
    """
    if quote == '"""':
        # Triple-double-quoted f-strings are already in the preferred form.
        return middles, quote

    elif quote == "'''":
        new_quote = '"""'
    elif quote == '"':
        new_quote = "'"
    else:
        new_quote = '"'

    # Quote preceded by an even (unescaped) / odd (escaped) run of backslashes.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){quote}")
    if is_raw_fstring:
        for middle in middles:
            if unescaped_new_quote.search(middle.value):
                # There's at least one unescaped new_quote in this raw string
                # so converting is impossible
                return middles, quote

        # Do not introduce or remove backslashes in raw strings, just use double quote
        return middles, '"'

    new_segments: list[str] = []
    for middle in middles:
        segment = middle.value
        # remove unnecessary escapes
        new_segment = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", segment)
        if segment != new_segment:
            # Consider the string without unnecessary escapes as the original
            middle.value = new_segment

        new_segment = sub_twice(escaped_orig_quote, rf"\1\2{quote}", new_segment)
        new_segment = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_segment)
        new_segments.append(new_segment)

    if new_quote == '"""' and new_segments[-1].endswith('"'):
        # edge case: a trailing " would merge into the closing """ delimiter,
        # so it must be escaped
        new_segments[-1] = new_segments[-1][:-1] + '\\"'

    # First pass: bail out without mutating anything if any segment would end
    # up with more escaping (or if double quotes are already in use).
    for middle, new_segment in zip(middles, new_segments):
        orig_escape_count = middle.value.count("\\")
        new_escape_count = new_segment.count("\\")

        if new_escape_count > orig_escape_count:
            return middles, quote  # Do not introduce more escaping

        if new_escape_count == orig_escape_count and quote == '"':
            return middles, quote  # Prefer double quotes

    # Second pass: commit the converted text into the leaves.
    for middle, new_segment in zip(middles, new_segments):
        middle.value = new_segment

    return middles, new_quote
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation."""
    text = leaf.value
    if "r" in get_string_prefix(text).lower():
        # Raw strings have no escape sequences to normalize.
        return

    def replace(m: Match[str]) -> str:
        parts = m.groupdict()
        slashes = parts["backslashes"]

        # An even-length backslash run means the "escape" is really an escaped
        # backslash followed by literal text, so leave it untouched.
        if len(slashes) % 2 == 0:
            return slashes + parts["body"]

        if parts["u"]:
            # \u
            return slashes + "u" + parts["u"].lower()
        if parts["U"]:
            # \U
            return slashes + "U" + parts["U"].lower()
        if parts["x"]:
            # \x
            return slashes + "x" + parts["x"].lower()
        assert parts["N"], f"Unexpected match: {m}"
        # \N{...}: names are canonically uppercase in the Unicode database.
        return slashes + "N{" + parts["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)
@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters are counted as 2, while half width characters are
    counted as 1. Also control characters are counted as 0.
    """
    codepoint = ord(char)
    # Binary search over the sorted (start, end, width) ranges of WIDTH_TABLE.
    lo = 0
    hi = len(WIDTH_TABLE) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        range_start, range_end, width = WIDTH_TABLE[mid]
        if codepoint < range_start:
            hi = mid - 1
        elif codepoint > range_end:
            lo = mid + 1
        else:
            # Negative table entries denote zero-width (control) characters.
            return 0 if width < 0 else width
    # Codepoints absent from the table default to half width.
    return 1
def str_width(line_str: str) -> int:
    """Return the width of `line_str` as it would be displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    You could utilize this function to determine, for example, if a string
    is too wide to display in a terminal or editor.
    """
    if line_str.isascii():
        # Fast path: every ASCII character is exactly one column wide.
        return len(line_str)
    return sum(char_width(ch) for ch in line_str)
def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Count how many leading characters of `line_str` fit within a display
    width of `max_width` columns (which respects Unicode East Asian Width).
    """
    used = 0
    for idx, ch in enumerate(line_str):
        width = char_width(ch)
        if used + width > max_width:
            # `ch` would overflow the budget, so exactly `idx` chars fit.
            return idx
        used += width
    # The whole string fits.
    return len(line_str)