Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/black/strings.py: 72%
165 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:15 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:15 +0000
1"""
2Simple formatting on strings. Further string formatting code is in trans.py.
3"""
5import re
6import sys
7from functools import lru_cache
8from typing import List, Match, Pattern
10from blib2to3.pytree import Leaf
12if sys.version_info < (3, 8):
13 from typing_extensions import Final
14else:
15 from typing import Final
17from black._width_table import WIDTH_TABLE
19STRING_PREFIX_CHARS: Final = "furbFURB" # All possible string prefix characters.
20STRING_PREFIX_RE: Final = re.compile(
21 r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
22)
23FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
24UNICODE_ESCAPE_RE: Final = re.compile(
25 r"(?P<backslashes>\\+)(?P<body>"
26 r"(u(?P<u>[a-fA-F0-9]{4}))" # Character with 16-bit hex value xxxx
27 r"|(U(?P<U>[a-fA-F0-9]{8}))" # Character with 32-bit hex value xxxxxxxx
28 r"|(x(?P<x>[a-fA-F0-9]{2}))" # Character with hex value hh
29 r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})" # Character named name in the Unicode database
30 r")",
31 re.VERBOSE,
32)
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply the `regex` -> `replacement` substitution to `original` twice.

    String normalization uses this to handle overlapping matches that a
    single pass of `re.sub` would leave behind.
    """
    once = regex.sub(replacement, original)
    return regex.sub(replacement, once)
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string starts with three quotation characters
        (after any string prefix such as f/r/b/u is skipped).
    """
    body = string.lstrip(STRING_PREFIX_CHARS)
    return body.startswith(('"""', "'''"))
def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Split `s` into lines, expanding tabs only within each line's leading
    whitespace (following the normal Python tab-expansion rules).
    """
    result: List[str] = []
    for raw_line in s.splitlines():
        # Locate the first non-whitespace character that follows leading
        # whitespace containing at least one tab.
        found = FIRST_NON_WHITESPACE_RE.match(raw_line)
        if found is None:
            # No tab in the leading whitespace: keep the line untouched.
            result.append(raw_line)
            continue
        split_at = found.start(1)
        result.append(raw_line[:split_at].expandtabs() + raw_line[split_at:])
    return result
def fix_docstring(docstring: str, prefix: str) -> str:
    """Dedent `docstring` per PEP 257 and re-indent its body with `prefix`.

    https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    """
    if not docstring:
        return ""
    lines = lines_with_leading_tabs_expanded(docstring)
    # Minimum indentation among the non-blank continuation lines; the first
    # line is special and never counts.
    indent = min(
        (len(line) - len(line.lstrip()) for line in lines[1:] if line.lstrip()),
        default=sys.maxsize,
    )
    trimmed = [lines[0].strip()]
    if indent < sys.maxsize:
        last = len(lines) - 2
        for idx, line in enumerate(lines[1:]):
            dedented = line[indent:].rstrip()
            # Blank interior lines become empty; the final line keeps its
            # prefix even when blank.
            trimmed.append(prefix + dedented if dedented or idx == last else "")
    return "\n".join(trimmed)
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        @string's prefix (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    # Collect prefix characters until the opening quote is reached.
    prefix_chars = []
    pos = 0
    while string[pos] in STRING_PREFIX_CHARS:
        prefix_chars.append(string[pos])
        pos += 1
    return "".join(prefix_chars)
def assert_is_leaf_string(string: str) -> None:
    """
    Checks the pre-condition that @string has the format that you would expect
    of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
    token.STRING`. A more precise description of the pre-conditions that are
    checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
          `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    dq = string.find('"')
    sq = string.find("'")
    # When one quote style is absent (find() == -1), take the other one;
    # otherwise the earlier occurrence marks the end of the prefix.
    if dq != -1 and sq != -1:
        quote_idx = min(dq, sq)
    else:
        quote_idx = max(dq, sq)

    assert 0 <= quote_idx < len(string) - 1, (
        f"{string!r} is missing a starting quote character (' or \")."
    )
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    prefix_chars = set(string[:quote_idx])
    assert prefix_chars.issubset(set(STRING_PREFIX_CHARS)), (
        f"{prefix_chars} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
    )
def normalize_string_prefix(s: str) -> str:
    """Make all string prefixes lowercase and drop redundant u/U prefixes."""
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    old_prefix = match.group(1)
    # Lowercase F and B; u/U carries no meaning in Python 3, so delete it.
    new_prefix = old_prefix.translate(
        str.maketrans({"F": "f", "B": "b", "U": None, "u": None})
    )

    # Python syntax guarantees max 2 prefixes and that one of them is "r"
    if len(new_prefix) == 2 and new_prefix[0].lower() != "r":
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{match.group(2)}"
171# Re(gex) does actually cache patterns internally but this still improves
172# performance on a long list literal of strings by 5-9% since lru_cache's
173# caching overhead is much lower.
174@lru_cache(maxsize=64)
175def _cached_compile(pattern: str) -> Pattern[str]:
176 return re.compile(pattern)
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.
    """
    # The string proper, with any f/r/b/u prefix characters stripped off.
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    # new_quote preceded by an even number of backslashes, i.e. NOT escaped.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    # new_quote preceded by an odd number of backslashes, i.e. escaped.
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    # orig_quote preceded by an odd number of backslashes, i.e. escaped.
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        # (sub_twice is used because the matches can overlap)
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        # Scan interpolated {...} expressions ({{ / }} are literal braces).
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{  # start of the string or a non-{ followed by a single {
                ([^{].*?)  # contents of the brackets except if begins with {{
            \}(?:(?!\})|$)  # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a trailing " would run into the closing """ delimiter
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation."""
    text = leaf.value
    prefix = get_string_prefix(text)
    if "r" in prefix.lower():
        # Raw strings have no escape sequences to normalize.
        return

    def replace(m: Match[str]) -> str:
        groups = m.groupdict()
        slashes = groups["backslashes"]

        # An even number of backslashes means the backslash before the body
        # is itself escaped, so this is not an escape sequence at all.
        if len(slashes) % 2 == 0:
            return slashes + groups["body"]

        # \uXXXX, \UXXXXXXXX and \xHH: lowercase the hex digits.
        for marker in ("u", "U", "x"):
            if groups[marker]:
                return slashes + marker + groups[marker].lower()

        assert groups["N"], f"Unexpected match: {m}"
        # \N{...}: Unicode database names are uppercased instead.
        return slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text)
@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the width of a single character as it would be displayed in a
    terminal or editor (which respects Unicode East Asian Width).

    Full width characters are counted as 2, while half width characters are
    counted as 1. Also control characters are counted as 0.
    """
    table = WIDTH_TABLE
    codepoint = ord(char)
    # Binary search over the table of (start, end, width) codepoint ranges.
    highest = len(table) - 1
    lowest = 0
    idx = highest // 2
    while True:
        start_codepoint, end_codepoint, width = table[idx]
        if codepoint < start_codepoint:
            highest = idx - 1
        elif codepoint > end_codepoint:
            lowest = idx + 1
        else:
            # Found a containing range; negative widths are clamped to 0.
            return 0 if width < 0 else width
        if highest < lowest:
            # Search space exhausted: codepoint is in no table range.
            break
        idx = (highest + lowest) // 2
    # Codepoints absent from the table default to half width.
    return 1
def str_width(line_str: str) -> int:
    """Return the display width of `line_str` in a terminal or editor that
    respects Unicode East Asian Width.

    Useful, for example, to decide whether a string is too wide to display.
    """
    if line_str.isascii():
        # Fast path: every ASCII character occupies exactly one column.
        return len(line_str)
    return sum(char_width(ch) for ch in line_str)
def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Count how many leading characters of `line_str` fit within `max_width`
    display columns in a terminal or editor (which respects Unicode East
    Asian Width).
    """
    used = 0
    for idx, ch in enumerate(line_str):
        width = char_width(ch)
        if used + width > max_width:
            # This character would overflow the budget; stop before it.
            return idx
        used += width
    return len(line_str)