Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/black/strings.py: 13%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

204 statements  

1""" 

2Simple formatting on strings. Further string formatting code is in trans.py. 

3""" 

4 

5import re 

6import sys 

7from functools import lru_cache 

8from re import Match, Pattern 

9from typing import Final 

10 

11from black._width_table import WIDTH_TABLE 

12from blib2to3.pytree import Leaf 

13 

STRING_PREFIX_CHARS: Final = "fturbFTURB"  # All possible string prefix characters.
# Splits a string token into (prefix, remainder-including-quotes);
# DOTALL so multiline string bodies are captured by the second group.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches a run of backslashes followed by a \u / \U / \x / \N{...} escape body.
# The parity of the backslash run decides whether the escape is "live"
# (see normalize_unicode_escape_sequences).
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)

27 

28 

def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply `regex.sub(replacement, ...)` to `original` two times in a row.

    String normalization uses this to handle overlapping matches: a second
    pass picks up occurrences the first pass could not see.
    """
    once = regex.sub(replacement, original)
    return regex.sub(replacement, once)

36 

37 

def has_triple_quotes(string: str) -> bool:
    """Return True iff @string opens with a triple quote (''' or three ").

    Any string prefix characters (r, b, f, u in either case) are skipped
    before looking at the quotes.
    """
    body = string.lstrip(STRING_PREFIX_CHARS)
    return body.startswith(('"""', "'''"))

45 

46 

def lines_with_leading_tabs_expanded(s: str) -> list[str]:
    """Split `s` into lines, expanding tabs only within each line's leading
    whitespace (following the normal Python tab-stop rules).

    A trailing newline in `s` produces a final empty entry, mirroring how
    the original text ended.
    """
    result: list[str] = []
    for raw in s.splitlines():
        body = raw.lstrip()
        if body and body != raw:
            # Expand tabs only in the leading-whitespace prefix; tabs inside
            # the line's content are left alone.
            indent = raw[: len(raw) - len(body)].expandtabs()
            result.append(indent + body)
        else:
            # Either an all-whitespace line or one with no leading whitespace.
            result.append(raw)
    if s.endswith("\n"):
        result.append("")
    return result

64 

65 

def fix_multiline_docstring(docstring: str, prefix: str) -> str:
    """Dedent a multiline docstring per PEP 257 and re-indent with `prefix`.

    https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    """
    assert docstring, "INTERNAL ERROR: Multiline docstrings cannot be empty"
    lines = lines_with_leading_tabs_expanded(docstring)
    # Smallest indentation over all continuation lines; the first line sits
    # right after the opening quotes, so it is excluded.
    indents = [len(ln) - len(ln.lstrip()) for ln in lines[1:] if ln.lstrip()]
    indent = min(indents, default=sys.maxsize)
    # The first line keeps no indentation at all.
    trimmed = [lines[0].strip()]
    if indent < sys.maxsize:
        last = len(lines) - 2  # Index (within lines[1:]) of the final line.
        for i, ln in enumerate(lines[1:]):
            content = ln[indent:].rstrip()
            if content or i == last:
                trimmed.append(prefix + content)
            else:
                # Blank interior lines carry no prefix.
                trimmed.append("")
    return "\n".join(trimmed)

87 

88 

def get_string_prefix(string: str) -> str:
    """Return @string's prefix (e.g. '', 'r', 'f', or 'rf').

    Pre-conditions:
        * assert_is_leaf_string(@string)
    """
    assert_is_leaf_string(string)

    # Scan forward until the first character that is not a prefix character
    # (that will be the opening quote, per the precondition above).
    end = 0
    while end < len(string) and string[end] in STRING_PREFIX_CHARS:
        end += 1
    return string[:end]

106 

107 

def assert_is_leaf_string(string: str) -> None:
    """Validate that @string looks like the `leaf.value` of a STRING token.

    Pre-conditions checked:
        * @string starts with either ', ", <prefix>', or <prefix>" where
          `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if any of the pre-conditions above fails.
    """
    dq = string.find('"')
    sq = string.find("'")
    if dq == -1 or sq == -1:
        # At most one quote style is present; take whichever was found
        # (or -1 when neither was).
        quote_idx = max(dq, sq)
    else:
        # Both styles occur; the opening quote is whichever comes first.
        quote_idx = min(dq, sq)

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    assert set(string[:quote_idx]).issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."

141 

142 

def normalize_string_prefix(s: str) -> str:
    """Lowercase the prefix of string token `s`; drop redundant u/U prefixes."""
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    old = match.group(1)
    # Lowercase f/b; the legacy u/U prefix is a no-op in Python 3, remove it.
    new = old.replace("F", "f").replace("B", "b").replace("U", "").replace("u", "")

    # Python syntax guarantees max 2 prefixes and that one of them is "r";
    # put the "r" first so the ordering is canonical.
    if len(new) == 2 and new[0].lower() != "r":
        new = new[::-1]
    return f"{new}{match.group(2)}"

159 

160 

# Re(gex) does actually cache patterns internally but this still improves
# performance on a long list literal of strings by 5-9% since lru_cache's
# caching overhead is much lower.
@lru_cache(maxsize=64)
def _cached_compile(pattern: str) -> Pattern[str]:
    """Compile `pattern` once and memoize the compiled Pattern object."""
    return re.compile(pattern)

167 

168 

def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. `s` is a complete string
    token, including any prefix characters and the quotes themselves.
    Returns `s` unchanged whenever switching quotes would add escapes.
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        # Already the preferred triple-quote style; nothing to do.
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    assert first_quote_pos != -1, f"INTERNAL ERROR: Malformed string {s!r}"

    prefix = s[:first_quote_pos]
    # A quote preceded by an even run of backslashes is unescaped; an odd
    # run means it's escaped. These patterns capture the preceding context
    # so replacements can preserve it via \1/\2 backreferences.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        # Un-escape the original quote (it won't need escaping under the new
        # quote) and escape any unescaped occurrences of the new quote.
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)

    if "f" in prefix.casefold():
        # Find top-level {...} interpolation fields of the f-string.
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
            ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: escape the final quote so it doesn't merge into the
        # closing triple quote
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"

240 

241 

def normalize_fstring_quotes(
    quote: str,
    middles: list[Leaf],
    is_raw_fstring: bool,
) -> tuple[list[Leaf], str]:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. `middles` holds the literal
    text segments of the f-string (between interpolations); their `.value`
    may be rewritten in place. Returns the (possibly updated) middles and
    the quote character(s) to use.
    """
    if quote == '"""':
        # Triple double quotes are already the preferred style.
        return middles, quote

    elif quote == "'''":
        new_quote = '"""'
    elif quote == '"':
        new_quote = "'"
    else:
        new_quote = '"'

    # A quote preceded by an even run of backslashes is unescaped; an odd
    # run means it's escaped (same patterns as normalize_string_quotes).
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){quote}")
    if is_raw_fstring:
        for middle in middles:
            if unescaped_new_quote.search(middle.value):
                # There's at least one unescaped new_quote in this raw string
                # so converting is impossible
                return middles, quote

        # Do not introduce or remove backslashes in raw strings, just use double quote
        return middles, '"'

    new_segments = []
    for middle in middles:
        segment = middle.value
        # remove unnecessary escapes
        new_segment = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", segment)
        if segment != new_segment:
            # Consider the string without unnecessary escapes as the original
            middle.value = new_segment

        # Un-escape the original quote and escape unescaped new quotes.
        new_segment = sub_twice(escaped_orig_quote, rf"\1\2{quote}", new_segment)
        new_segment = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_segment)
        new_segments.append(new_segment)

    if new_quote == '"""' and new_segments[-1].endswith('"'):
        # edge case: escape the final quote so it doesn't merge into the
        # closing triple quote
        new_segments[-1] = new_segments[-1][:-1] + '\\"'

    # Count escapes before and after; only convert if it doesn't add any.
    orig_escape_count = 0
    new_escape_count = 0
    for middle, new_segment in zip(middles, new_segments, strict=True):
        orig_escape_count += middle.value.count("\\")
        new_escape_count += new_segment.count("\\")

    if new_escape_count > orig_escape_count:
        return middles, quote  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and quote == '"':
        return middles, quote  # Prefer double quotes

    # Commit the converted segments and switch to the new quote.
    for middle, new_segment in zip(middles, new_segments, strict=True):
        middle.value = new_segment

    return middles, new_quote

307 

308 

def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation.

    Mutates `leaf.value` in place. \\N{...} character names are uppercased
    instead. Raw strings are left untouched.
    """
    text = leaf.value
    prefix = get_string_prefix(text)
    if "r" in prefix.lower():
        # Raw strings contain no live escape sequences; nothing to normalize.
        return

    def replace(m: Match[str]) -> str:
        # Rewrite one UNICODE_ESCAPE_RE match (backslash run + escape body).
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        if len(back_slashes) % 2 == 0:
            # Even number of backslashes: the "escape" is itself escaped
            # (e.g. "\\\\u1234" is a literal backslash plus text), keep as-is.
            return back_slashes + groups["body"]

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        elif groups["x"]:
            # \x
            return back_slashes + "x" + groups["x"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{} — character names are uppercased rather than lowercased.
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)

338 

339 

@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters count as 2, half width characters as 1, and
    control characters as 0. Codepoints not covered by WIDTH_TABLE
    default to width 1.
    """
    codepoint = ord(char)
    lo = 0
    hi = len(WIDTH_TABLE) - 1
    # Binary-search WIDTH_TABLE, whose rows are (start, end, width)
    # codepoint ranges sorted by start.
    while lo <= hi:
        mid = (lo + hi) // 2
        start, end, width = WIDTH_TABLE[mid]
        if codepoint < start:
            hi = mid - 1
        elif codepoint > end:
            lo = mid + 1
        else:
            # Negative table widths mark zero-width (control) characters.
            return max(width, 0)
    return 1

365 

366 

def str_width(line_str: str) -> int:
    """Return the width of `line_str` as it would be displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    Useful, for example, to decide whether a line is too wide to display.
    """
    if line_str.isascii():
        # ASCII characters are always one column wide; skip table lookups.
        return len(line_str)
    return sum(char_width(ch) for ch in line_str)

378 

379 

def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Return how many leading characters of `line_str` fit within
    `max_width` display columns (which respects Unicode East Asian Width).
    """
    used = 0
    for idx, ch in enumerate(line_str):
        width = char_width(ch)
        if used + width > max_width:
            # `ch` would overflow the budget; only the chars before it fit.
            return idx
        used += width
    return len(line_str)