Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/black/strings.py: 13%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

202 statements  

1""" 

2Simple formatting on strings. Further string formatting code is in trans.py. 

3""" 

4 

5import re 

6import sys 

7from functools import lru_cache 

8from re import Match, Pattern 

9from typing import Final 

10 

11from black._width_table import WIDTH_TABLE 

12from blib2to3.pytree import Leaf 

13 

# All possible string prefix characters (both cases of f, u, r, b).
STRING_PREFIX_CHARS: Final = "furbFURB"
# Splits a string literal into (prefix, rest). DOTALL so the "rest" group
# spans newlines in triple-quoted strings.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches a run of backslashes followed by one escape-sequence body; the
# replace callback checks the backslash count's parity to tell a real escape
# from a literal backslash pair.
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)

27 

28 

def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply `regex.sub(replacement, ...)` to `original` two times in a row.

    String normalization uses this because a single `re.sub` pass cannot
    rewrite overlapping matches; a second pass catches matches created or
    exposed by the first.
    """
    first_pass = regex.sub(replacement, original)
    return regex.sub(replacement, first_pass)

36 

37 

def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string starts with three quotation characters
        (after any string prefix such as r/b/f is skipped).
    """
    without_prefix = string.lstrip(STRING_PREFIX_CHARS)
    return without_prefix.startswith(('"""', "'''"))

45 

46 

def lines_with_leading_tabs_expanded(s: str) -> list[str]:
    """
    Splits string into lines and expands only leading tabs (following the normal
    Python rules)
    """
    result: list[str] = []
    for line in s.splitlines():
        body = line.lstrip()
        if body and body != line:
            # Expand tabs only in the leading-whitespace margin; tabs inside
            # the text of the line are left untouched.
            margin = line[: len(line) - len(body)]
            result.append(margin.expandtabs() + body)
        else:
            # Blank line or no leading whitespace: keep it as-is.
            result.append(line)
    if s.endswith("\n"):
        # splitlines() drops a trailing newline; preserve it as an empty line.
        result.append("")
    return result

64 

65 

def fix_multiline_docstring(docstring: str, prefix: str) -> str:
    """Re-indent a multiline docstring body to `prefix`.

    Follows the PEP 257 indentation-handling algorithm:
    https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    """
    assert docstring, "INTERNAL ERROR: Multiline docstrings cannot be empty"
    expanded = lines_with_leading_tabs_expanded(docstring)
    # Find the smallest indentation among non-blank continuation lines; the
    # first line never participates per PEP 257.
    margin = sys.maxsize
    for line in expanded[1:]:
        content = line.lstrip()
        if content:
            margin = min(margin, len(line) - len(content))
    result = [expanded[0].strip()]
    if margin < sys.maxsize:
        final_idx = len(expanded) - 2  # index of the last line within expanded[1:]
        for pos, line in enumerate(expanded[1:]):
            dedented = line[margin:].rstrip()
            if dedented or pos == final_idx:
                # Non-blank lines (and the closing line, even when blank) get
                # the caller-supplied prefix.
                result.append(prefix + dedented)
            else:
                # Interior blank lines stay completely empty.
                result.append("")
    return "\n".join(result)

87 

88 

def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        @string's prefix (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    # Advance past every prefix character; the slice up to that point is the
    # prefix itself.
    idx = 0
    while string[idx] in STRING_PREFIX_CHARS:
        idx += 1
    return string[:idx]

106 

107 

def assert_is_leaf_string(string: str) -> None:
    """
    Checks the pre-condition that @string has the format that you would expect
    of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
    token.STRING`. A more precise description of the pre-conditions that are
    checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
          `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    dquote_idx = string.find('"')
    squote_idx = string.find("'")
    # Locate the first quote of either kind. When one kind is absent (find
    # returned -1), max() picks the other; when both are present, min() picks
    # whichever occurs first.
    if dquote_idx == -1 or squote_idx == -1:
        quote_idx = max(dquote_idx, squote_idx)
    else:
        quote_idx = min(squote_idx, dquote_idx)

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    assert set(string[:quote_idx]).issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."

141 

142 

def normalize_string_prefix(s: str) -> str:
    """Make all string prefixes lowercase.

    F and B are lowercased, u/U is dropped entirely; R is deliberately kept
    as-is (its case is preserved).
    """
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    rebuilt: list[str] = []
    for ch in match.group(1):
        if ch in "uU":
            continue  # the u prefix is meaningless in Python 3 — drop it
        if ch == "F":
            rebuilt.append("f")
        elif ch == "B":
            rebuilt.append("b")
        else:
            rebuilt.append(ch)
    new_prefix = "".join(rebuilt)

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and new_prefix[0].lower() != "r":
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{match.group(2)}"

159 

160 

# Re(gex) does actually cache patterns internally but this still improves
# performance on a long list literal of strings by 5-9% since lru_cache's
# caching overhead is much lower.
@lru_cache(maxsize=64)
def _cached_compile(pattern: str) -> Pattern[str]:
    """Compile `pattern`, memoizing the result across calls."""
    compiled = re.compile(pattern)
    return compiled

167 

168 

def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate.
    """
    # Look at the string body (prefix chars stripped) to pick the quote style.
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        # Already a double-quoted docstring-style literal: nothing to do.
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    assert first_quote_pos != -1, f"INTERNAL ERROR: Malformed string {s!r}"

    prefix = s[:first_quote_pos]
    # new_quote preceded by an even number of backslashes (i.e. not escaped).
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # new_quote preceded by an odd number of backslashes (i.e. escaped).
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        # Escaped orig_quote no longer needs escaping under the new quote;
        # unescaped new_quote now does.
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)

    if "f" in prefix.casefold():
        # Collect the contents of each {...} interpolation field.
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
            ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a trailing '"' would run into the '"""' delimiter, so it
        # must be escaped.
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"

240 

241 

def normalize_fstring_quotes(
    quote: str,
    middles: list[Leaf],
    is_raw_fstring: bool,
) -> tuple[list[Leaf], str]:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate.

    `middles` holds the literal string segments of the f-string (mutated in
    place when conversion succeeds); the returned tuple is the (possibly
    updated) segments plus the quote to use.
    """
    if quote == '"""':
        # Already a triple-double-quoted f-string: nothing to do.
        return middles, quote

    elif quote == "'''":
        new_quote = '"""'
    elif quote == '"':
        new_quote = "'"
    else:
        new_quote = '"'

    # new_quote preceded by an even number of backslashes (i.e. not escaped).
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # new_quote preceded by an odd number of backslashes (i.e. escaped).
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){quote}")
    if is_raw_fstring:
        for middle in middles:
            if unescaped_new_quote.search(middle.value):
                # There's at least one unescaped new_quote in this raw string
                # so converting is impossible
                return middles, quote

        # Do not introduce or remove backslashes in raw strings, just use double quote
        return middles, '"'

    new_segments: list[str] = []
    for middle in middles:
        segment = middle.value
        # remove unnecessary escapes
        new_segment = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", segment)
        if segment != new_segment:
            # Consider the string without unnecessary escapes as the original
            middle.value = new_segment

        # Escaped orig quote no longer needs escaping; unescaped new quote
        # now does.
        new_segment = sub_twice(escaped_orig_quote, rf"\1\2{quote}", new_segment)
        new_segment = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_segment)
        new_segments.append(new_segment)

    if new_quote == '"""' and new_segments[-1].endswith('"'):
        # edge case: a trailing '"' would run into the '"""' delimiter, so it
        # must be escaped.
        new_segments[-1] = new_segments[-1][:-1] + '\\"'

    # Bail out (leaving `middles` as updated above) if any segment would need
    # more escaping, or if nothing is gained over existing double quotes.
    for middle, new_segment in zip(middles, new_segments):
        orig_escape_count = middle.value.count("\\")
        new_escape_count = new_segment.count("\\")

        if new_escape_count > orig_escape_count:
            return middles, quote  # Do not introduce more escaping

        if new_escape_count == orig_escape_count and quote == '"':
            return middles, quote  # Prefer double quotes

    # Conversion accepted: commit the rewritten segments.
    for middle, new_segment in zip(middles, new_segments):
        middle.value = new_segment

    return middles, new_quote

305 

306 

def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation."""
    text = leaf.value
    if "r" in get_string_prefix(text).lower():
        # Raw strings contain no escape sequences to normalize.
        return

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        slashes = groups["backslashes"]

        # An even backslash count means the "escape" is really a literal
        # backslash pair followed by plain text — leave it untouched.
        if len(slashes) % 2 == 0:
            return slashes + groups["body"]

        if groups["u"]:  # \uXXXX
            return slashes + "u" + groups["u"].lower()
        if groups["U"]:  # \UXXXXXXXX
            return slashes + "U" + groups["U"].lower()
        if groups["x"]:  # \xHH
            return slashes + "x" + groups["x"].lower()
        assert groups["N"], f"Unexpected match: {m}"
        # \N{NAME}: Unicode names are conventionally uppercase.
        return slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = UNICODE_ESCAPE_RE.sub(replace, text)

336 

337 

@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters are counted as 2, while half width characters are
    counted as 1. Also control characters are counted as 0.
    """
    codepoint = ord(char)
    # Binary search over WIDTH_TABLE, whose rows are
    # (start_codepoint, end_codepoint, width) ranges.
    lo = 0
    hi = len(WIDTH_TABLE) - 1
    mid = hi // 2
    while lo <= hi:
        range_start, range_end, width = WIDTH_TABLE[mid]
        if codepoint < range_start:
            hi = mid - 1
        elif codepoint > range_end:
            lo = mid + 1
        else:
            # Negative table widths mark zero-width (e.g. control) characters.
            return max(width, 0)
        mid = (lo + hi) // 2
    # Codepoints absent from the table default to half width.
    return 1

363 

364 

def str_width(line_str: str) -> int:
    """Return the width of `line_str` as it would be displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    You could utilize this function to determine, for example, if a string
    is too wide to display in a terminal or editor.
    """
    if line_str.isascii():
        # Fast path: every ASCII character counts as width 1 here.
        return len(line_str)
    return sum(char_width(ch) for ch in line_str)

376 

377 

def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Count the number of characters in `line_str` that would fit in a
    terminal or editor of `max_width` (which respects Unicode East Asian
    Width).
    """
    used = 0
    for idx, ch in enumerate(line_str):
        width = char_width(ch)
        if used + width > max_width:
            # The character at `idx` is the first one that doesn't fit.
            return idx
        used += width
    # The whole string fits.
    return len(line_str)