Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/black/strings.py: 13%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

204 statements  

1""" 

2Simple formatting on strings. Further string formatting code is in trans.py. 

3""" 

4 

5import re 

6import sys 

7from functools import lru_cache 

8from re import Match, Pattern 

9from typing import Final 

10 

11from black._width_table import WIDTH_TABLE 

12from blib2to3.pytree import Leaf 

13 

STRING_PREFIX_CHARS: Final = "fturbFTURB"  # All possible string prefix characters.
# Splits a string token into (prefix, remainder-including-quotes);
# DOTALL so multiline string bodies are captured by the second group.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches a run of backslashes followed by a \u / \U / \x / \N{...} escape body.
# The parity of the backslash run decides whether the escape is "live"
# (see normalize_unicode_escape_sequences).
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)

27 

28 

def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply `regex.sub(replacement, ...)` to `original` two times in a row.

    String normalization uses this to handle overlapping matches: a second
    pass picks up occurrences the first pass could not see.
    """
    once = regex.sub(replacement, original)
    return regex.sub(replacement, once)

36 

37 

def has_triple_quotes(string: str) -> bool:
    """Return True iff @string opens with a triple quote (''' or three ").

    Any string prefix characters (r, b, f, u in either case) are skipped
    before looking at the quotes.
    """
    body = string.lstrip(STRING_PREFIX_CHARS)
    return body.startswith(('"""', "'''"))

45 

46 

def lines_with_leading_tabs_expanded(s: str) -> list[str]:
    """Split `s` into lines, expanding tabs only within each line's leading
    whitespace (following the normal Python tab-stop rules).

    A trailing newline in `s` produces a final empty entry, mirroring how
    the original text ended.
    """
    result: list[str] = []
    for raw in s.splitlines():
        body = raw.lstrip()
        if body and body != raw:
            # Expand tabs only in the leading-whitespace prefix; tabs inside
            # the line's content are left alone.
            indent = raw[: len(raw) - len(body)].expandtabs()
            result.append(indent + body)
        else:
            # Either an all-whitespace line or one with no leading whitespace.
            result.append(raw)
    if s.endswith("\n"):
        result.append("")
    return result

64 

65 

def fix_multiline_docstring(docstring: str, prefix: str) -> str:
    """Dedent a multiline docstring per PEP 257 and re-indent with `prefix`.

    https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    """
    assert docstring, "INTERNAL ERROR: Multiline docstrings cannot be empty"
    lines = lines_with_leading_tabs_expanded(docstring)
    # Smallest indentation over all continuation lines; the first line sits
    # right after the opening quotes, so it is excluded.
    indents = [len(ln) - len(ln.lstrip()) for ln in lines[1:] if ln.lstrip()]
    indent = min(indents, default=sys.maxsize)
    # The first line keeps no indentation at all.
    trimmed = [lines[0].strip()]
    if indent < sys.maxsize:
        last = len(lines) - 2  # Index (within lines[1:]) of the final line.
        for i, ln in enumerate(lines[1:]):
            content = ln[indent:].rstrip()
            if content or i == last:
                trimmed.append(prefix + content)
            else:
                # Blank interior lines carry no prefix.
                trimmed.append("")
    return "\n".join(trimmed)

87 

88 

def get_string_prefix(string: str) -> str:
    """Return @string's prefix (e.g. '', 'r', 'f', or 'rf').

    Pre-conditions:
        * assert_is_leaf_string(@string)
    """
    assert_is_leaf_string(string)

    # Scan forward until the first character that is not a prefix character
    # (that will be the opening quote, per the precondition above).
    end = 0
    while end < len(string) and string[end] in STRING_PREFIX_CHARS:
        end += 1
    return string[:end]

106 

107 

def assert_is_leaf_string(string: str) -> None:
    """Validate that @string looks like the `leaf.value` of a STRING token.

    Pre-conditions checked:
        * @string starts with either ', ", <prefix>', or <prefix>" where
          `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if any of the pre-conditions above fails.
    """
    dq = string.find('"')
    sq = string.find("'")
    if dq == -1 or sq == -1:
        # At most one quote style is present; take whichever was found
        # (or -1 when neither was).
        quote_idx = max(dq, sq)
    else:
        # Both styles occur; the opening quote is whichever comes first.
        quote_idx = min(dq, sq)

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    assert set(string[:quote_idx]).issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."

141 

142 

def normalize_string_prefix(s: str) -> str:
    """Lowercase the prefix of string token `s`; drop redundant u/U prefixes."""
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    old = match.group(1)
    # Lowercase f/b; the legacy u/U prefix is a no-op in Python 3, remove it.
    new = old.replace("F", "f").replace("B", "b").replace("U", "").replace("u", "")

    # Python syntax guarantees max 2 prefixes and that one of them is "r";
    # put the "r" first so the ordering is canonical.
    if len(new) == 2 and new[0].lower() != "r":
        new = new[::-1]
    return f"{new}{match.group(2)}"

159 

160 

# Re(gex) does actually cache patterns internally but this still improves
# performance on a long list literal of strings by 5-9% since lru_cache's
# caching overhead is much lower.
@lru_cache(maxsize=64)
def _cached_compile(pattern: str) -> Pattern[str]:
    """Compile `pattern` once and memoize the compiled Pattern object."""
    return re.compile(pattern)

167 

168 

def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. `s` is a complete string
    token, including any prefix characters and the quotes themselves.
    Returns `s` unchanged whenever switching quotes would add escapes.
    """
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        # Already the preferred triple-quote style; nothing to do.
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    assert first_quote_pos != -1, f"INTERNAL ERROR: Malformed string {s!r}"

    prefix = s[:first_quote_pos]
    # A quote preceded by an even run of backslashes is unescaped; an odd
    # run means it's escaped. These patterns capture the preceding context
    # so replacements can preserve it via \1/\2 backreferences.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        # Un-escape the original quote (it won't need escaping under the new
        # quote) and escape any unescaped occurrences of the new quote.
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)

    if "f" in prefix.casefold():
        # Find top-level {...} interpolation fields of the f-string.
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
            ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: escape the final quote so it doesn't merge into the
        # closing triple quote
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"

240 

241 

def normalize_fstring_quotes(
    quote: str,
    middles: list[Leaf],
    is_raw_fstring: bool,
) -> tuple[list[Leaf], str]:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. `middles` holds the literal
    text segments of the f-string (between interpolations); their `.value`
    may be rewritten in place. Returns the (possibly updated) middles and
    the quote character(s) to use.
    """
    if quote == '"""':
        # Triple double quotes are already the preferred style.
        return middles, quote

    elif quote == "'''":
        new_quote = '"""'
    elif quote == '"':
        new_quote = "'"
    else:
        new_quote = '"'

    # A quote preceded by an even run of backslashes is unescaped; an odd
    # run means it's escaped (same patterns as normalize_string_quotes).
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){quote}")
    if is_raw_fstring:
        for middle in middles:
            if unescaped_new_quote.search(middle.value):
                # There's at least one unescaped new_quote in this raw string
                # so converting is impossible
                return middles, quote

        # Do not introduce or remove backslashes in raw strings, just use double quote
        return middles, '"'

    new_segments = []
    for middle in middles:
        segment = middle.value
        # remove unnecessary escapes
        new_segment = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", segment)
        if segment != new_segment:
            # Consider the string without unnecessary escapes as the original
            middle.value = new_segment

        # Un-escape the original quote and escape unescaped new quotes.
        new_segment = sub_twice(escaped_orig_quote, rf"\1\2{quote}", new_segment)
        new_segment = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_segment)
        new_segments.append(new_segment)

    if new_quote == '"""' and new_segments[-1].endswith('"'):
        # edge case: escape the final quote so it doesn't merge into the
        # closing triple quote
        new_segments[-1] = new_segments[-1][:-1] + '\\"'

    # Count escapes before and after; only convert if it doesn't add any.
    orig_escape_count = 0
    new_escape_count = 0
    for middle, new_segment in zip(middles, new_segments, strict=True):
        orig_escape_count += middle.value.count("\\")
        new_escape_count += new_segment.count("\\")

    if new_escape_count > orig_escape_count:
        return middles, quote  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and quote == '"':
        return middles, quote  # Prefer double quotes

    # Commit the converted segments and switch to the new quote.
    for middle, new_segment in zip(middles, new_segments, strict=True):
        middle.value = new_segment

    return middles, new_quote

307 

308 

def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation.

    Mutates `leaf.value` in place. \\N{...} character names are uppercased
    instead. Raw strings are left untouched.
    """
    text = leaf.value
    prefix = get_string_prefix(text)
    if "r" in prefix.lower():
        # Raw strings contain no live escape sequences; nothing to normalize.
        return

    def replace(m: Match[str]) -> str:
        # Rewrite one UNICODE_ESCAPE_RE match (backslash run + escape body).
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        if len(back_slashes) % 2 == 0:
            # Even number of backslashes: the "escape" is itself escaped
            # (e.g. "\\\\u1234" is a literal backslash plus text), keep as-is.
            return back_slashes + groups["body"]

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        elif groups["x"]:
            # \x
            return back_slashes + "x" + groups["x"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{} — character names are uppercased rather than lowercased.
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)

338 

339 

@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters count as 2, half width characters as 1, and
    control characters as 0. Codepoints not covered by WIDTH_TABLE
    default to width 1.
    """
    codepoint = ord(char)
    lo = 0
    hi = len(WIDTH_TABLE) - 1
    # Binary-search WIDTH_TABLE, whose rows are (start, end, width)
    # codepoint ranges sorted by start.
    while lo <= hi:
        mid = (lo + hi) // 2
        start, end, width = WIDTH_TABLE[mid]
        if codepoint < start:
            hi = mid - 1
        elif codepoint > end:
            lo = mid + 1
        else:
            # Negative table widths mark zero-width (control) characters.
            return max(width, 0)
    return 1

365 

366 

def str_width(line_str: str) -> int:
    """Return the width of `line_str` as it would be displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    Useful, for example, to decide whether a line is too wide to display.
    """
    if line_str.isascii():
        # ASCII characters are always one column wide; skip table lookups.
        return len(line_str)
    return sum(char_width(ch) for ch in line_str)

378 

379 

def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Return how many leading characters of `line_str` fit within
    `max_width` display columns (which respects Unicode East Asian Width).
    """
    used = 0
    for idx, ch in enumerate(line_str):
        width = char_width(ch)
        if used + width > max_width:
            # `ch` would overflow the budget; only the chars before it fit.
            return idx
        used += width
    return len(line_str)