Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/black/strings.py: 72%

165 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:15 +0000

1""" 

2Simple formatting on strings. Further string formatting code is in trans.py. 

3""" 

4 

5import re 

6import sys 

7from functools import lru_cache 

8from typing import List, Match, Pattern 

9 

10from blib2to3.pytree import Leaf 

11 

12if sys.version_info < (3, 8): 

13 from typing_extensions import Final 

14else: 

15 from typing import Final 

16 

17from black._width_table import WIDTH_TABLE 

18 

STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
# Splits a string token into its (possibly empty) leading run of prefix
# characters (group 1) and everything after it (group 2). DOTALL lets group 2
# span the newlines of a multi-line string.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches leading whitespace containing at least one tab and captures the first
# non-whitespace character that follows it (group 1).
# NOTE(review): three adjacent whitespace quantifiers mean a failed match on a
# long whitespace run can backtrack heavily.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
# Matches a run of backslashes followed by the body of a \u, \U, \x or \N{...}
# escape. Named groups: `backslashes`, `body`, and exactly one of `u`, `U`, `x`
# or `N` holding the hex digits / character name.
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)

33 

34 

def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply the `regex` -> `replacement` substitution to `original` two times.

    A single substitution pass only rewrites non-overlapping matches, so
    string normalization runs a second pass over the result of the first to
    catch matches that overlapped with ones already replaced.
    """
    result = original
    for _ in range(2):
        result = regex.sub(replacement, result)
    return result

42 

43 

def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string, once any leading string prefix characters are
        stripped, begins with three double quotes or three single quotes.
    """
    unprefixed = string.lstrip(STRING_PREFIX_CHARS)
    return unprefixed.startswith(('"""', "'''"))

51 

52 

def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Splits string into lines and expands only leading tabs (following the normal
    Python rules).

    A line is rewritten only when its leading whitespace contains at least one
    tab and is followed by a non-whitespace character. Tabs that appear after
    the first non-whitespace character, whitespace-only lines, and lines whose
    indentation contains no tab are returned unchanged.

    Returns:
        The lines of @s as produced by `str.splitlines()` (no line endings),
        with the leading whitespace of qualifying lines passed through
        `str.expandtabs()`.
    """
    lines = []
    for line in s.splitlines():
        stripped_line = line.lstrip()
        indent = line[: len(line) - len(stripped_line)]
        # Plain slicing keeps this linear in the length of the line. Matching a
        # regex with several adjacent whitespace quantifiers instead backtracks
        # super-linearly on long runs of tabs/spaces that are not followed by
        # any non-whitespace character.
        if stripped_line and "\t" in indent:
            lines.append(indent.expandtabs() + stripped_line)
        else:
            lines.append(line)
    return lines

73 

74 

def fix_docstring(docstring: str, prefix: str) -> str:
    """Re-indent the body of `docstring` so continuation lines start with `prefix`.

    The first line is stripped of surrounding whitespace. The common
    indentation of the remaining non-blank lines is removed, trailing
    whitespace is dropped, and `prefix` is prepended to every non-blank line
    and to the final line. If no continuation line has any content, only the
    stripped first line is returned.
    """
    # https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    if not docstring:
        return ""
    lines = lines_with_leading_tabs_expanded(docstring)
    first_line = lines[0]
    continuation = lines[1:]

    # Indentation widths of the non-blank continuation lines (first line
    # doesn't count).
    margins = [
        len(text) - len(text.lstrip()) for text in continuation if text.lstrip()
    ]

    result = [first_line.strip()]
    if margins:
        margin = min(margins)
        final_pos = len(continuation) - 1
        for pos, text in enumerate(continuation):
            dedented = text[margin:].rstrip()
            # Blank lines stay empty, except the very last line, which always
            # receives the prefix.
            if dedented or pos == final_pos:
                result.append(prefix + dedented)
            else:
                result.append("")
    return "\n".join(result)

97 

98 

def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        The leading run of string prefix characters of @string
        (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    # Scan forward to the first character that is not a prefix character.
    prefix_end = 0
    while string[prefix_end] in STRING_PREFIX_CHARS:
        prefix_end += 1

    return string[:prefix_end]

116 

117 

def assert_is_leaf_string(string: str) -> None:
    """
    Verifies that @string looks like the `leaf.value` of a Leaf whose type is
    `token.STRING`. The exact conditions checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
        `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    quote_positions = [
        pos for pos in (string.find('"'), string.find("'")) if pos != -1
    ]
    # Index of the earliest quote character of either kind, or -1 if none.
    quote_idx = min(quote_positions) if quote_positions else -1

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    leading_chars = set(string[:quote_idx])
    assert leading_chars.issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{leading_chars} is NOT a subset of {set(STRING_PREFIX_CHARS)}."

151 

152 

def normalize_string_prefix(s: str) -> str:
    """Normalize the prefix of the string token `s`.

    "F" and "B" are lowercased, "u"/"U" are removed, and "r"/"R" keep their
    case. In a two-character prefix the "r"/"R" is moved to the front.
    """
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    old_prefix, remainder = match.groups()
    new_prefix = old_prefix.translate(str.maketrans("FB", "fb", "Uu"))

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and new_prefix[0].lower() != "r":
        new_prefix = new_prefix[1] + new_prefix[0]
    return new_prefix + remainder

169 

170 

171# Re(gex) does actually cache patterns internally but this still improves 

172# performance on a long list literal of strings by 5-9% since lru_cache's 

173# caching overhead is much lower. 

174@lru_cache(maxsize=64) 

175def _cached_compile(pattern: str) -> Pattern[str]: 

176 return re.compile(pattern) 

177 

178 

def normalize_string_quotes(s: str) -> str:
    """Switch `s` to double quotes unless that would require more escaping.

    Backslashes in front of quotes are added or removed as needed. Strings
    nested inside f-string replacement fields are not parsed or fixed; if the
    conversion would put a backslash inside such a field, `s` is returned
    unchanged.
    """
    unprefixed = s.lstrip(STRING_PREFIX_CHARS)
    opening = unprefixed[:3]
    if opening == '"""':
        return s

    if opening == "'''":
        orig_quote, new_quote = "'''", '"""'
    elif unprefixed[0] == '"':
        orig_quote, new_quote = '"', "'"
    else:
        orig_quote, new_quote = "'", '"'

    quote_start = s.find(orig_quote)
    if quote_start == -1:
        return s  # There's an internal error

    prefix = s[:quote_start]
    folded_prefix = prefix.casefold()
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[quote_start + len(orig_quote) : -len(orig_quote)]

    if "r" in folded_prefix:
        if unescaped_new_quote.search(body):
            # A raw string containing an unescaped new_quote cannot be
            # converted, because escapes can't be added to it.
            return s

        # Raw strings must keep their backslashes exactly as they are.
        new_body = body
    else:
        # Drop escapes in front of new_quote; they aren't needed while the
        # string is still delimited by orig_quote.
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # From here on, treat the de-escaped string as the original.
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)

    if "f" in folded_prefix:
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        if any("\\" in str(m) for m in matches):
            # Do not introduce backslashes in interpolated expressions
            return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a body ending in " would merge with the closing """
        new_body = new_body[:-1] + '\\"'

    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"

251 

252 

def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation.

    Rewrites `leaf.value` in place: the hex digits of `\\x`, `\\u` and `\\U`
    escapes are lowercased and the character name of `\\N{...}` escapes is
    uppercased. Raw strings are left untouched. In bytes literals only `\\x`
    is normalized, because `\\u`, `\\U` and `\\N{...}` are not escape sequences
    there and changing their case would change the value of the literal.
    """
    text = leaf.value
    prefix = get_string_prefix(text).lower()
    if "r" in prefix:
        return  # There's no escaping in raw strings

    is_bytes = "b" in prefix

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        if len(back_slashes) % 2 == 0:
            # An even number of backslashes is a run of escaped backslashes;
            # what follows is ordinary text, not an escape sequence.
            return back_slashes + groups["body"]

        if groups["x"]:
            # \x (valid in both str and bytes literals)
            return back_slashes + "x" + groups["x"].lower()

        if is_bytes:
            # \u, \U and \N{} are literal text in a bytes literal.
            return m.group(0)

        if groups["u"]:
            # \u
            return back_slashes + "u" + groups["u"].lower()
        elif groups["U"]:
            # \U
            return back_slashes + "U" + groups["U"].lower()
        else:
            assert groups["N"], f"Unexpected match: {m}"
            # \N{}
            return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)

282 

283 

@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return how many columns `char` occupies when displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    Full width characters count as 2 and half width characters as 1. A
    negative width in the table (control characters) is reported as 0. A
    code point that falls in no table range counts as 1.
    """
    codepoint = ord(char)
    # Binary search over the (start, end, width) ranges of WIDTH_TABLE.
    low, high = 0, len(WIDTH_TABLE) - 1
    mid = high // 2
    while True:
        range_start, range_end, width = WIDTH_TABLE[mid]
        if range_start <= codepoint <= range_end:
            return max(width, 0)
        if codepoint < range_start:
            high = mid - 1
        else:
            low = mid + 1
        if high < low:
            return 1
        mid = (high + low) // 2

309 

310 

def str_width(line_str: str) -> int:
    """Return how many columns `line_str` occupies when displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Useful, for example, to decide whether a string is too wide to display in
    a terminal or editor.
    """
    if not line_str.isascii():
        return sum(char_width(char) for char in line_str)
    # Fast path: every character of a pure-ASCII line is counted as one column.
    return len(line_str)

322 

323 

def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Return how many leading characters of `line_str` fit into a terminal
    or editor that is `max_width` columns wide (respecting Unicode East Asian
    Width).
    """
    remaining = max_width
    for index, char in enumerate(line_str):
        remaining -= char_width(char)
        if remaining < 0:
            # This character would overflow; only the ones before it fit.
            return index
    return len(line_str)