Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/ftfy/chardata.py: 98%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

54 statements  

1""" 

2This gives other modules access to the gritty details about characters and the 

3encodings that use them. 

4""" 

5 

6from __future__ import annotations 

7 

8import html 

9import itertools 

10import re 

11import unicodedata 

12 

# These are the encodings we will try to fix in ftfy, in the
# order that they should be tried.
CHARMAP_ENCODINGS = [
    "latin-1",
    "sloppy-windows-1252",
    "sloppy-windows-1251",
    "sloppy-windows-1250",
    "sloppy-windows-1253",
    "sloppy-windows-1254",
    "sloppy-windows-1257",
    "iso-8859-2",
    "macroman",
    "cp437",
]

# Matches U+02BC or any single character in the range U+2018..U+201B.
SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]")
# Matches any single character in the range U+201C..U+201F.
DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]")

30 

31 

def _build_regexes() -> dict[str, re.Pattern[str]]:
    """
    Build a mapping from encoding name to a compiled regex that matches a
    whole string only if every character fits that encoding.

    ENCODING_REGEXES contain reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.

    The result has one entry for "ascii" plus one for each name in
    CHARMAP_ENCODINGS.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")}

    # Bytes \x80 to \xFF, as well as byte \x1A, which is used to represent
    # the replacement character (U+FFFD) in the sloppy-* encodings. The byte
    # sequence is the same for every encoding, so it is built only once.
    byte_range = bytes([*range(0x80, 0x100), 0x1A])

    for encoding in CHARMAP_ENCODINGS:
        # The sequence of characters that those bytes decode to in this
        # particular encoding.
        charlist = byte_range.decode(encoding)

        # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
        # to \x7F -- will decode as those ASCII characters in any encoding we
        # support, so we can just include them as ranges. This also lets us
        # not worry about escaping regex special characters, because all of
        # them are in the \x1B to \x7F range.
        regex = f"^[\x00-\x19\x1b-\x7f{charlist}]*$"
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes


ENCODING_REGEXES = _build_regexes()

60 

61 

def _build_html_entities() -> dict[str, str]:
    """
    Build a dictionary mapping HTML entity text (with its leading "&" and
    trailing ";") to the string that the entity stands for.

    The dictionary is based on the built-in HTML5 entity dictionary. It also
    contains a limited set of entities whose names have been case-folded to
    uppercase, such as mapping "&NTILDE;" to "Ñ".
    """
    entities = {}
    for name, char in html.entities.html5.items():  # type: ignore
        # Only the spellings that end with a semicolon are accepted.
        if not name.endswith(";"):
            continue
        entities["&" + name] = char

        # Restrict the set of characters we can attempt to decode if their
        # name has been uppercased. If we tried to handle all entity names,
        # the results would be ambiguous.
        if name != name.lower():
            continue
        entity_upper = "&" + name.upper()
        # Add the uppercased spelling only when the standard library does
        # not already decode it to something.
        if html.unescape(entity_upper) == entity_upper:
            entities[entity_upper] = char.upper()
    return entities


# Matches a named or numeric HTML entity of 1 to 24 alphanumeric characters,
# terminated by a semicolon.
HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")
HTML_ENTITIES = _build_html_entities()

84 

85 

def possible_encoding(text: str, encoding: str) -> bool:
    """
    Given text and a single-byte encoding, check whether that text could have
    been decoded from that single-byte encoding.

    In other words, check whether it can be encoded in that encoding, possibly
    sloppily.

    `encoding` must be a key of ENCODING_REGEXES; any other name raises
    KeyError.
    """
    return bool(ENCODING_REGEXES[encoding].match(text))

95 

96 

def _build_control_char_mapping() -> dict[int, None]:
    """
    Build a translate mapping that strips likely-unintended control characters.
    See :func:`ftfy.fixes.remove_control_chars` for a description of these
    codepoint ranges and why they should be removed.

    Every listed codepoint maps to None, which makes `str.translate` delete
    that character.
    """
    codepoints = itertools.chain(
        range(0x00, 0x09),  # U+0000..U+0008 (stops before U+0009)
        [0x0B],
        range(0x0E, 0x20),  # U+000E..U+001F (skips U+000C and U+000D)
        [0x7F],
        range(0x206A, 0x2070),  # U+206A..U+206F
        [0xFEFF],
        range(0xFFF9, 0xFFFD),  # U+FFF9..U+FFFC (stops before U+FFFD)
    )
    return dict.fromkeys(codepoints)


CONTROL_CHARS = _build_control_char_mapping()

120 

121 

# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
# that some Windows-1252 program converted to a plain space.
#
# The smaller values are included on a case-by-case basis, because we don't want
# to decode likely input sequences to unlikely characters. These are the ones
# that *do* form likely characters before 0xa0:
#
#   0xc2 -> U+A0 NO-BREAK SPACE
#   0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE
#   0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
#   0xce -> U+3A0 GREEK CAPITAL LETTER PI
#   0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
#   0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO
#
# In three-character sequences, we exclude some lead bytes in some cases.
#
# When the lead byte is immediately followed by 0xA0, we shouldn't accept
# a space there, because it leads to some less-likely character ranges:
#
#   0xe0 -> Samaritan script
#   0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common)
#
# We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and
# higher point mostly to CJK characters, which we generally don't want to
# decode near Latin lowercase letters.
#
# In four-character sequences, the lead byte must be F0, because that accounts
# for almost all of the usage of high-numbered codepoints (tag characters whose
# UTF-8 starts with the byte F3 are only used in some rare new emoji sequences).
#
# This is meant to be applied to encodings of text that tests true for `is_bad`.
# Any of these could represent characters that legitimately appear surrounded by
# spaces, particularly U+C5 (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to ... in the future.
# I've seen it once, but the text still wasn't recoverable.

ALTERED_UTF8_RE = re.compile(
    # Two-byte sequence: lead byte, then a space in place of 0xA0.
    b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]"
    # Three-byte sequence with the space in the second position.
    b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]"
    # Three-byte sequence with the space in the third position.
    b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]"
    # Four-byte sequences with the space in each continuation position.
    b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]"
)

167 

168 

# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
#
# In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per
# sequence.
LOSSY_UTF8_RE = re.compile(
    # Two-byte sequences.
    b"[\xc2-\xdf][\x1a]"
    b"|[\xc2-\xc3][?]"
    # Six-byte sequences made of two three-byte halves that each start
    # with 0xED.
    b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]"
    b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]"
    # Three-byte sequences.
    b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]"
    b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]"
    # Four-byte sequences.
    b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]"
    # A lone 0x1a byte.
    b"|\x1a"
)

189 

190 

# This regex matches C1 control characters, which occupy some of the positions
# in the Latin-1 character map that Windows assigns to other characters instead.
C1_CONTROL_RE = re.compile(r"[\x80-\x9f]")

194 

195 

# A translate mapping that breaks ligatures made of Latin letters. While
# ligatures may be important to the representation of other languages, in Latin
# letters they tend to represent a copy/paste error. It omits ligatures such
# as æ that are frequently used intentionally.
#
# This list additionally includes some Latin digraphs that represent two
# characters for legacy encoding reasons, not for typographical reasons.
#
# Ligatures and digraphs may also be separated by NFKC normalization, but that
# is sometimes more normalization than you want.
#
# The keys are written as explicit codepoints, because each key must be a
# single character for `str.translate`, and a ligature character is easy to
# corrupt into its separate letters when the file is copied or normalized.

LIGATURES = {
    0x0132: "IJ",  # LATIN CAPITAL LIGATURE IJ -- Dutch ligatures
    0x0133: "ij",  # LATIN SMALL LIGATURE IJ
    # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE -- Afrikaans digraph meant
    # to avoid auto-curled quote
    0x0149: "ʼn",
    # Serbian/Croatian digraphs for Cyrillic conversion
    0x01F1: "DZ",  # LATIN CAPITAL LETTER DZ
    0x01F2: "Dz",  # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
    0x01F3: "dz",  # LATIN SMALL LETTER DZ
    0x01C4: "DŽ",  # LATIN CAPITAL LETTER DZ WITH CARON
    0x01C5: "Dž",  # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
    0x01C6: "dž",  # LATIN SMALL LETTER DZ WITH CARON
    0x01C7: "LJ",  # LATIN CAPITAL LETTER LJ
    0x01C8: "Lj",  # LATIN CAPITAL LETTER L WITH SMALL LETTER J
    0x01C9: "lj",  # LATIN SMALL LETTER LJ
    0x01CA: "NJ",  # LATIN CAPITAL LETTER NJ
    0x01CB: "Nj",  # LATIN CAPITAL LETTER N WITH SMALL LETTER J
    0x01CC: "nj",  # LATIN SMALL LETTER NJ
    # Latin typographical ligatures
    0xFB00: "ff",  # LATIN SMALL LIGATURE FF
    0xFB01: "fi",  # LATIN SMALL LIGATURE FI
    0xFB02: "fl",  # LATIN SMALL LIGATURE FL
    0xFB03: "ffi",  # LATIN SMALL LIGATURE FFI
    0xFB04: "ffl",  # LATIN SMALL LIGATURE FFL
    0xFB05: "ſt",  # LATIN SMALL LIGATURE LONG S T
    0xFB06: "st",  # LATIN SMALL LIGATURE ST
}

231 

232 

def _build_width_map() -> dict[int, str]:
    """
    Build a translate mapping that replaces halfwidth and fullwidth forms
    with their standard-width forms.

    The keys are codepoints and the values are replacement strings, as
    `str.translate` expects.
    """
    # Though it's not listed as a fullwidth character, we'll want to convert
    # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
    # with that in the dictionary.
    width_map = {0x3000: " "}
    for codepoint in range(0xFF01, 0xFFF0):
        original = chr(codepoint)
        normalized = unicodedata.normalize("NFKC", original)
        # Only record the characters that NFKC normalization actually changes.
        if normalized != original:
            width_map[codepoint] = normalized
    return width_map


WIDTH_MAP = _build_width_map()

251 

252 

# Character classes that help us pinpoint embedded mojibake. These can
# include common characters, because we'll also check them for 'badness'.
#
# Though they go on for many lines, the members of this dictionary are
# single concatenated strings.
#
# This code is generated using scripts/char_data_table.py.
UTF8_CLUES: dict[str, str] = {
    # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding
    "utf8_first_of_2": (
        "\N{LATIN CAPITAL LETTER A WITH BREVE}"  # windows-1250:C3
        "\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}"  # latin-1:C2
        "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}"  # latin-1:C4
        "\N{LATIN CAPITAL LETTER A WITH MACRON}"  # windows-1257:C2
        "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}"  # latin-1:C5
        "\N{LATIN CAPITAL LETTER A WITH TILDE}"  # latin-1:C3
        "\N{LATIN CAPITAL LETTER AE}"  # latin-1:C6
        "\N{LATIN CAPITAL LETTER C WITH ACUTE}"  # windows-1250:C6
        "\N{LATIN CAPITAL LETTER C WITH CARON}"  # windows-1250:C8
        "\N{LATIN CAPITAL LETTER C WITH CEDILLA}"  # latin-1:C7
        "\N{LATIN CAPITAL LETTER D WITH CARON}"  # windows-1250:CF
        "\N{LATIN CAPITAL LETTER D WITH STROKE}"  # windows-1250:D0
        "\N{LATIN CAPITAL LETTER E WITH ACUTE}"  # latin-1:C9
        "\N{LATIN CAPITAL LETTER E WITH CARON}"  # windows-1250:CC
        "\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}"  # latin-1:CA
        "\N{LATIN CAPITAL LETTER E WITH DIAERESIS}"  # latin-1:CB
        "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}"  # windows-1257:CB
        "\N{LATIN CAPITAL LETTER E WITH GRAVE}"  # latin-1:C8
        "\N{LATIN CAPITAL LETTER E WITH MACRON}"  # windows-1257:C7
        "\N{LATIN CAPITAL LETTER E WITH OGONEK}"  # windows-1250:CA
        "\N{LATIN CAPITAL LETTER ETH}"  # latin-1:D0
        "\N{LATIN CAPITAL LETTER G WITH BREVE}"  # windows-1254:D0
        "\N{LATIN CAPITAL LETTER G WITH CEDILLA}"  # windows-1257:CC
        "\N{LATIN CAPITAL LETTER I WITH ACUTE}"  # latin-1:CD
        "\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}"  # latin-1:CE
        "\N{LATIN CAPITAL LETTER I WITH DIAERESIS}"  # latin-1:CF
        "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"  # windows-1254:DD
        "\N{LATIN CAPITAL LETTER I WITH GRAVE}"  # latin-1:CC
        "\N{LATIN CAPITAL LETTER I WITH MACRON}"  # windows-1257:CE
        "\N{LATIN CAPITAL LETTER K WITH CEDILLA}"  # windows-1257:CD
        "\N{LATIN CAPITAL LETTER L WITH ACUTE}"  # windows-1250:C5
        "\N{LATIN CAPITAL LETTER L WITH CEDILLA}"  # windows-1257:CF
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"  # windows-1257:D9
        "\N{LATIN CAPITAL LETTER N WITH ACUTE}"  # windows-1250:D1
        "\N{LATIN CAPITAL LETTER N WITH CARON}"  # windows-1250:D2
        "\N{LATIN CAPITAL LETTER N WITH CEDILLA}"  # windows-1257:D2
        "\N{LATIN CAPITAL LETTER N WITH TILDE}"  # latin-1:D1
        "\N{LATIN CAPITAL LETTER O WITH ACUTE}"  # latin-1:D3
        "\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}"  # latin-1:D4
        "\N{LATIN CAPITAL LETTER O WITH DIAERESIS}"  # latin-1:D6
        "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}"  # windows-1250:D5
        "\N{LATIN CAPITAL LETTER O WITH GRAVE}"  # latin-1:D2
        "\N{LATIN CAPITAL LETTER O WITH MACRON}"  # windows-1257:D4
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"  # latin-1:D8
        "\N{LATIN CAPITAL LETTER O WITH TILDE}"  # latin-1:D5
        "\N{LATIN CAPITAL LETTER R WITH CARON}"  # windows-1250:D8
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"  # windows-1257:DA
        "\N{LATIN CAPITAL LETTER S WITH CARON}"  # windows-1257:D0
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"  # windows-1254:DE
        "\N{LATIN CAPITAL LETTER T WITH CEDILLA}"  # windows-1250:DE
        "\N{LATIN CAPITAL LETTER THORN}"  # latin-1:DE
        "\N{LATIN CAPITAL LETTER U WITH ACUTE}"  # latin-1:DA
        "\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}"  # latin-1:DB
        "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}"  # latin-1:DC
        "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}"  # windows-1250:DB
        "\N{LATIN CAPITAL LETTER U WITH GRAVE}"  # latin-1:D9
        "\N{LATIN CAPITAL LETTER U WITH MACRON}"  # windows-1257:DB
        "\N{LATIN CAPITAL LETTER U WITH OGONEK}"  # windows-1257:D8
        "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}"  # windows-1250:D9
        "\N{LATIN CAPITAL LETTER Y WITH ACUTE}"  # latin-1:DD
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"  # windows-1257:CA
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"  # windows-1257:DE
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"  # windows-1257:DD
        "\N{LATIN SMALL LETTER SHARP S}"  # latin-1:DF
        "\N{MULTIPLICATION SIGN}"  # latin-1:D7
        "\N{GREEK CAPITAL LETTER BETA}"  # windows-1253:C2
        "\N{GREEK CAPITAL LETTER GAMMA}"  # windows-1253:C3
        "\N{GREEK CAPITAL LETTER DELTA}"  # windows-1253:C4
        "\N{GREEK CAPITAL LETTER EPSILON}"  # windows-1253:C5
        "\N{GREEK CAPITAL LETTER ZETA}"  # windows-1253:C6
        "\N{GREEK CAPITAL LETTER ETA}"  # windows-1253:C7
        "\N{GREEK CAPITAL LETTER THETA}"  # windows-1253:C8
        "\N{GREEK CAPITAL LETTER IOTA}"  # windows-1253:C9
        "\N{GREEK CAPITAL LETTER KAPPA}"  # windows-1253:CA
        "\N{GREEK CAPITAL LETTER LAMDA}"  # windows-1253:CB
        "\N{GREEK CAPITAL LETTER MU}"  # windows-1253:CC
        "\N{GREEK CAPITAL LETTER NU}"  # windows-1253:CD
        "\N{GREEK CAPITAL LETTER XI}"  # windows-1253:CE
        "\N{GREEK CAPITAL LETTER OMICRON}"  # windows-1253:CF
        "\N{GREEK CAPITAL LETTER PI}"  # windows-1253:D0
        "\N{GREEK CAPITAL LETTER RHO}"  # windows-1253:D1
        "\N{GREEK CAPITAL LETTER SIGMA}"  # windows-1253:D3
        "\N{GREEK CAPITAL LETTER TAU}"  # windows-1253:D4
        "\N{GREEK CAPITAL LETTER UPSILON}"  # windows-1253:D5
        "\N{GREEK CAPITAL LETTER PHI}"  # windows-1253:D6
        "\N{GREEK CAPITAL LETTER CHI}"  # windows-1253:D7
        "\N{GREEK CAPITAL LETTER PSI}"  # windows-1253:D8
        "\N{GREEK CAPITAL LETTER OMEGA}"  # windows-1253:D9
        "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}"  # windows-1253:DA
        "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}"  # windows-1253:DB
        "\N{GREEK SMALL LETTER ALPHA WITH TONOS}"  # windows-1253:DC
        "\N{GREEK SMALL LETTER EPSILON WITH TONOS}"  # windows-1253:DD
        "\N{GREEK SMALL LETTER ETA WITH TONOS}"  # windows-1253:DE
        "\N{GREEK SMALL LETTER IOTA WITH TONOS}"  # windows-1253:DF
        "\N{CYRILLIC CAPITAL LETTER VE}"  # windows-1251:C2
        "\N{CYRILLIC CAPITAL LETTER GHE}"  # windows-1251:C3
        "\N{CYRILLIC CAPITAL LETTER DE}"  # windows-1251:C4
        "\N{CYRILLIC CAPITAL LETTER IE}"  # windows-1251:C5
        "\N{CYRILLIC CAPITAL LETTER ZHE}"  # windows-1251:C6
        "\N{CYRILLIC CAPITAL LETTER ZE}"  # windows-1251:C7
        "\N{CYRILLIC CAPITAL LETTER I}"  # windows-1251:C8
        "\N{CYRILLIC CAPITAL LETTER SHORT I}"  # windows-1251:C9
        "\N{CYRILLIC CAPITAL LETTER KA}"  # windows-1251:CA
        "\N{CYRILLIC CAPITAL LETTER EL}"  # windows-1251:CB
        "\N{CYRILLIC CAPITAL LETTER EM}"  # windows-1251:CC
        "\N{CYRILLIC CAPITAL LETTER EN}"  # windows-1251:CD
        "\N{CYRILLIC CAPITAL LETTER O}"  # windows-1251:CE
        "\N{CYRILLIC CAPITAL LETTER PE}"  # windows-1251:CF
        "\N{CYRILLIC CAPITAL LETTER ER}"  # windows-1251:D0
        "\N{CYRILLIC CAPITAL LETTER ES}"  # windows-1251:D1
        "\N{CYRILLIC CAPITAL LETTER TE}"  # windows-1251:D2
        "\N{CYRILLIC CAPITAL LETTER U}"  # windows-1251:D3
        "\N{CYRILLIC CAPITAL LETTER EF}"  # windows-1251:D4
        "\N{CYRILLIC CAPITAL LETTER HA}"  # windows-1251:D5
        "\N{CYRILLIC CAPITAL LETTER TSE}"  # windows-1251:D6
        "\N{CYRILLIC CAPITAL LETTER CHE}"  # windows-1251:D7
        "\N{CYRILLIC CAPITAL LETTER SHA}"  # windows-1251:D8
        "\N{CYRILLIC CAPITAL LETTER SHCHA}"  # windows-1251:D9
        "\N{CYRILLIC CAPITAL LETTER HARD SIGN}"  # windows-1251:DA
        "\N{CYRILLIC CAPITAL LETTER YERU}"  # windows-1251:DB
        "\N{CYRILLIC CAPITAL LETTER SOFT SIGN}"  # windows-1251:DC
        "\N{CYRILLIC CAPITAL LETTER E}"  # windows-1251:DD
        "\N{CYRILLIC CAPITAL LETTER YU}"  # windows-1251:DE
        "\N{CYRILLIC CAPITAL LETTER YA}"  # windows-1251:DF
    ),
    # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding
    "utf8_first_of_3": (
        "\N{LATIN SMALL LETTER A WITH ACUTE}"  # latin-1:E1
        "\N{LATIN SMALL LETTER A WITH BREVE}"  # windows-1250:E3
        "\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}"  # latin-1:E2
        "\N{LATIN SMALL LETTER A WITH DIAERESIS}"  # latin-1:E4
        "\N{LATIN SMALL LETTER A WITH GRAVE}"  # latin-1:E0
        "\N{LATIN SMALL LETTER A WITH MACRON}"  # windows-1257:E2
        "\N{LATIN SMALL LETTER A WITH OGONEK}"  # windows-1257:E0
        "\N{LATIN SMALL LETTER A WITH RING ABOVE}"  # latin-1:E5
        "\N{LATIN SMALL LETTER A WITH TILDE}"  # latin-1:E3
        "\N{LATIN SMALL LETTER AE}"  # latin-1:E6
        "\N{LATIN SMALL LETTER C WITH ACUTE}"  # windows-1250:E6
        "\N{LATIN SMALL LETTER C WITH CARON}"  # windows-1250:E8
        "\N{LATIN SMALL LETTER C WITH CEDILLA}"  # latin-1:E7
        "\N{LATIN SMALL LETTER D WITH CARON}"  # windows-1250:EF
        "\N{LATIN SMALL LETTER E WITH ACUTE}"  # latin-1:E9
        "\N{LATIN SMALL LETTER E WITH CARON}"  # windows-1250:EC
        "\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}"  # latin-1:EA
        "\N{LATIN SMALL LETTER E WITH DIAERESIS}"  # latin-1:EB
        "\N{LATIN SMALL LETTER E WITH DOT ABOVE}"  # windows-1257:EB
        "\N{LATIN SMALL LETTER E WITH GRAVE}"  # latin-1:E8
        "\N{LATIN SMALL LETTER E WITH MACRON}"  # windows-1257:E7
        "\N{LATIN SMALL LETTER E WITH OGONEK}"  # windows-1250:EA
        "\N{LATIN SMALL LETTER E WITH OGONEK}"  # windows-1250:EA
        "\N{LATIN SMALL LETTER G WITH CEDILLA}"  # windows-1257:EC
        "\N{LATIN SMALL LETTER I WITH ACUTE}"  # latin-1:ED
        "\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}"  # latin-1:EE
        "\N{LATIN SMALL LETTER I WITH DIAERESIS}"  # latin-1:EF
        "\N{LATIN SMALL LETTER I WITH GRAVE}"  # latin-1:EC
        "\N{LATIN SMALL LETTER I WITH MACRON}"  # windows-1257:EE
        "\N{LATIN SMALL LETTER I WITH OGONEK}"  # windows-1257:E1
        "\N{LATIN SMALL LETTER K WITH CEDILLA}"  # windows-1257:ED
        "\N{LATIN SMALL LETTER L WITH ACUTE}"  # windows-1250:E5
        "\N{LATIN SMALL LETTER L WITH CEDILLA}"  # windows-1257:EF
        "\N{LATIN SMALL LETTER R WITH ACUTE}"  # windows-1250:E0
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"  # windows-1257:EA
        "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}"  # windows-1253:E0
        "\N{GREEK SMALL LETTER ALPHA}"  # windows-1253:E1
        "\N{GREEK SMALL LETTER BETA}"  # windows-1253:E2
        "\N{GREEK SMALL LETTER GAMMA}"  # windows-1253:E3
        "\N{GREEK SMALL LETTER DELTA}"  # windows-1253:E4
        "\N{GREEK SMALL LETTER EPSILON}"  # windows-1253:E5
        "\N{GREEK SMALL LETTER ZETA}"  # windows-1253:E6
        "\N{GREEK SMALL LETTER ETA}"  # windows-1253:E7
        "\N{GREEK SMALL LETTER THETA}"  # windows-1253:E8
        "\N{GREEK SMALL LETTER IOTA}"  # windows-1253:E9
        "\N{GREEK SMALL LETTER KAPPA}"  # windows-1253:EA
        "\N{GREEK SMALL LETTER LAMDA}"  # windows-1253:EB
        "\N{GREEK SMALL LETTER MU}"  # windows-1253:EC
        "\N{GREEK SMALL LETTER NU}"  # windows-1253:ED
        "\N{GREEK SMALL LETTER XI}"  # windows-1253:EE
        "\N{GREEK SMALL LETTER OMICRON}"  # windows-1253:EF
        "\N{CYRILLIC SMALL LETTER A}"  # windows-1251:E0
        "\N{CYRILLIC SMALL LETTER BE}"  # windows-1251:E1
        "\N{CYRILLIC SMALL LETTER VE}"  # windows-1251:E2
        "\N{CYRILLIC SMALL LETTER GHE}"  # windows-1251:E3
        "\N{CYRILLIC SMALL LETTER DE}"  # windows-1251:E4
        "\N{CYRILLIC SMALL LETTER IE}"  # windows-1251:E5
        "\N{CYRILLIC SMALL LETTER ZHE}"  # windows-1251:E6
        "\N{CYRILLIC SMALL LETTER ZE}"  # windows-1251:E7
        "\N{CYRILLIC SMALL LETTER I}"  # windows-1251:E8
        "\N{CYRILLIC SMALL LETTER SHORT I}"  # windows-1251:E9
        "\N{CYRILLIC SMALL LETTER KA}"  # windows-1251:EA
        "\N{CYRILLIC SMALL LETTER EL}"  # windows-1251:EB
        "\N{CYRILLIC SMALL LETTER EM}"  # windows-1251:EC
        "\N{CYRILLIC SMALL LETTER EN}"  # windows-1251:ED
        "\N{CYRILLIC SMALL LETTER O}"  # windows-1251:EE
        "\N{CYRILLIC SMALL LETTER PE}"  # windows-1251:EF
    ),
    # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding.
    # (Other leading bytes correspond only to unassigned codepoints)
    "utf8_first_of_4": (
        "\N{LATIN SMALL LETTER D WITH STROKE}"  # windows-1250:F0
        "\N{LATIN SMALL LETTER ETH}"  # latin-1:F0
        "\N{LATIN SMALL LETTER G WITH BREVE}"  # windows-1254:F0
        "\N{LATIN SMALL LETTER O WITH ACUTE}"  # latin-1:F3
        "\N{LATIN SMALL LETTER S WITH CARON}"  # windows-1257:F0
        "\N{GREEK SMALL LETTER PI}"  # windows-1253:F0
        "\N{GREEK SMALL LETTER SIGMA}"  # windows-1253:F3
        "\N{CYRILLIC SMALL LETTER ER}"  # windows-1251:F0
        "\N{CYRILLIC SMALL LETTER U}"  # windows-1251:F3
    ),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # including a space standing in for 0xA0
    "utf8_continuation": (
        "\x80-\xbf"
        "\N{SPACE}"  # modification of latin-1:A0, NO-BREAK SPACE
        "\N{LATIN CAPITAL LETTER A WITH OGONEK}"  # windows-1250:A5
        "\N{LATIN CAPITAL LETTER AE}"  # windows-1257:AF
        "\N{LATIN CAPITAL LETTER L WITH CARON}"  # windows-1250:BC
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"  # windows-1250:A3
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"  # windows-1257:A8
        "\N{LATIN CAPITAL LETTER R WITH CEDILLA}"  # windows-1257:AA
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"  # windows-1250:8C
        "\N{LATIN CAPITAL LETTER S WITH CARON}"  # windows-1252:8A
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"  # windows-1250:AA
        "\N{LATIN CAPITAL LETTER T WITH CARON}"  # windows-1250:8D
        "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"  # windows-1252:9F
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"  # windows-1250:8F
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"  # windows-1252:8E
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"  # windows-1250:AF
        "\N{LATIN CAPITAL LIGATURE OE}"  # windows-1252:8C
        "\N{LATIN SMALL LETTER A WITH OGONEK}"  # windows-1250:B9
        "\N{LATIN SMALL LETTER AE}"  # windows-1257:BF
        "\N{LATIN SMALL LETTER F WITH HOOK}"  # windows-1252:83
        "\N{LATIN SMALL LETTER L WITH CARON}"  # windows-1250:BE
        "\N{LATIN SMALL LETTER L WITH STROKE}"  # windows-1250:B3
        "\N{LATIN SMALL LETTER O WITH STROKE}"  # windows-1257:B8
        "\N{LATIN SMALL LETTER R WITH CEDILLA}"  # windows-1257:BA
        "\N{LATIN SMALL LETTER S WITH ACUTE}"  # windows-1250:9C
        "\N{LATIN SMALL LETTER S WITH CARON}"  # windows-1252:9A
        "\N{LATIN SMALL LETTER S WITH CEDILLA}"  # windows-1250:BA
        "\N{LATIN SMALL LETTER T WITH CARON}"  # windows-1250:9D
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"  # windows-1250:9F
        "\N{LATIN SMALL LETTER Z WITH CARON}"  # windows-1252:9E
        "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"  # windows-1250:BF
        "\N{LATIN SMALL LIGATURE OE}"  # windows-1252:9C
        "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}"  # windows-1252:88
        "\N{CARON}"  # windows-1250:A1
        "\N{BREVE}"  # windows-1250:A2
        "\N{OGONEK}"  # windows-1250:B2
        "\N{SMALL TILDE}"  # windows-1252:98
        "\N{DOUBLE ACUTE ACCENT}"  # windows-1250:BD
        "\N{GREEK TONOS}"  # windows-1253:B4
        "\N{GREEK DIALYTIKA TONOS}"  # windows-1253:A1
        "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"  # windows-1253:A2
        "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"  # windows-1253:B8
        "\N{GREEK CAPITAL LETTER ETA WITH TONOS}"  # windows-1253:B9
        "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"  # windows-1253:BA
        "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"  # windows-1253:BC
        "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"  # windows-1253:BE
        "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"  # windows-1253:BF
        "\N{CYRILLIC CAPITAL LETTER IO}"  # windows-1251:A8
        "\N{CYRILLIC CAPITAL LETTER DJE}"  # windows-1251:80
        "\N{CYRILLIC CAPITAL LETTER GJE}"  # windows-1251:81
        "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}"  # windows-1251:AA
        "\N{CYRILLIC CAPITAL LETTER DZE}"  # windows-1251:BD
        "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}"  # windows-1251:B2
        "\N{CYRILLIC CAPITAL LETTER YI}"  # windows-1251:AF
        "\N{CYRILLIC CAPITAL LETTER JE}"  # windows-1251:A3
        "\N{CYRILLIC CAPITAL LETTER LJE}"  # windows-1251:8A
        "\N{CYRILLIC CAPITAL LETTER NJE}"  # windows-1251:8C
        "\N{CYRILLIC CAPITAL LETTER TSHE}"  # windows-1251:8E
        "\N{CYRILLIC CAPITAL LETTER KJE}"  # windows-1251:8D
        "\N{CYRILLIC CAPITAL LETTER SHORT U}"  # windows-1251:A1
        "\N{CYRILLIC CAPITAL LETTER DZHE}"  # windows-1251:8F
        "\N{CYRILLIC SMALL LETTER IO}"  # windows-1251:B8
        "\N{CYRILLIC SMALL LETTER DJE}"  # windows-1251:90
        "\N{CYRILLIC SMALL LETTER GJE}"  # windows-1251:83
        "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}"  # windows-1251:BA
        "\N{CYRILLIC SMALL LETTER DZE}"  # windows-1251:BE
        "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"  # windows-1251:B3
        "\N{CYRILLIC SMALL LETTER YI}"  # windows-1251:BF
        "\N{CYRILLIC SMALL LETTER JE}"  # windows-1251:BC
        "\N{CYRILLIC SMALL LETTER LJE}"  # windows-1251:9A
        "\N{CYRILLIC SMALL LETTER NJE}"  # windows-1251:9C
        "\N{CYRILLIC SMALL LETTER TSHE}"  # windows-1251:9E
        "\N{CYRILLIC SMALL LETTER KJE}"  # windows-1251:9D
        "\N{CYRILLIC SMALL LETTER SHORT U}"  # windows-1251:A2
        "\N{CYRILLIC SMALL LETTER DZHE}"  # windows-1251:9F
        "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"  # windows-1251:A5
        "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"  # windows-1251:B4
        "\N{EN DASH}"  # windows-1252:96
        "\N{EM DASH}"  # windows-1252:97
        "\N{HORIZONTAL BAR}"  # windows-1253:AF
        "\N{LEFT SINGLE QUOTATION MARK}"  # windows-1252:91
        "\N{RIGHT SINGLE QUOTATION MARK}"  # windows-1252:92
        "\N{SINGLE LOW-9 QUOTATION MARK}"  # windows-1252:82
        "\N{LEFT DOUBLE QUOTATION MARK}"  # windows-1252:93
        "\N{RIGHT DOUBLE QUOTATION MARK}"  # windows-1252:94
        "\N{DOUBLE LOW-9 QUOTATION MARK}"  # windows-1252:84
        "\N{DAGGER}"  # windows-1252:86
        "\N{DOUBLE DAGGER}"  # windows-1252:87
        "\N{BULLET}"  # windows-1252:95
        "\N{HORIZONTAL ELLIPSIS}"  # windows-1252:85
        "\N{PER MILLE SIGN}"  # windows-1252:89
        "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"  # windows-1252:8B
        "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"  # windows-1252:9B
        "\N{EURO SIGN}"  # windows-1252:80
        "\N{NUMERO SIGN}"  # windows-1251:B9
        "\N{TRADE MARK SIGN}"  # windows-1252:99
    ),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # and don't usually stand for themselves when adjacent to mojibake.
    # This excludes spaces, dashes, 'bullet', quotation marks, and ellipses.
    "utf8_continuation_strict": (
        "\x80-\xbf"
        "\N{LATIN CAPITAL LETTER A WITH OGONEK}"  # windows-1250:A5
        "\N{LATIN CAPITAL LETTER AE}"  # windows-1257:AF
        "\N{LATIN CAPITAL LETTER L WITH CARON}"  # windows-1250:BC
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"  # windows-1250:A3
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"  # windows-1257:A8
        "\N{LATIN CAPITAL LETTER R WITH CEDILLA}"  # windows-1257:AA
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"  # windows-1250:8C
        "\N{LATIN CAPITAL LETTER S WITH CARON}"  # windows-1252:8A
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"  # windows-1250:AA
        "\N{LATIN CAPITAL LETTER T WITH CARON}"  # windows-1250:8D
        "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"  # windows-1252:9F
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"  # windows-1250:8F
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"  # windows-1252:8E
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"  # windows-1250:AF
        "\N{LATIN CAPITAL LIGATURE OE}"  # windows-1252:8C
        "\N{LATIN SMALL LETTER A WITH OGONEK}"  # windows-1250:B9
        "\N{LATIN SMALL LETTER AE}"  # windows-1257:BF
        "\N{LATIN SMALL LETTER F WITH HOOK}"  # windows-1252:83
        "\N{LATIN SMALL LETTER L WITH CARON}"  # windows-1250:BE
        "\N{LATIN SMALL LETTER L WITH STROKE}"  # windows-1250:B3
        "\N{LATIN SMALL LETTER O WITH STROKE}"  # windows-1257:B8
        "\N{LATIN SMALL LETTER R WITH CEDILLA}"  # windows-1257:BA
        "\N{LATIN SMALL LETTER S WITH ACUTE}"  # windows-1250:9C
        "\N{LATIN SMALL LETTER S WITH CARON}"  # windows-1252:9A
        "\N{LATIN SMALL LETTER S WITH CEDILLA}"  # windows-1250:BA
        "\N{LATIN SMALL LETTER T WITH CARON}"  # windows-1250:9D
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"  # windows-1250:9F
        "\N{LATIN SMALL LETTER Z WITH CARON}"  # windows-1252:9E
        "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"  # windows-1250:BF
        "\N{LATIN SMALL LIGATURE OE}"  # windows-1252:9C
        "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}"  # windows-1252:88
        "\N{CARON}"  # windows-1250:A1
        "\N{BREVE}"  # windows-1250:A2
        "\N{OGONEK}"  # windows-1250:B2
        "\N{SMALL TILDE}"  # windows-1252:98
        "\N{DOUBLE ACUTE ACCENT}"  # windows-1250:BD
        "\N{GREEK TONOS}"  # windows-1253:B4
        "\N{GREEK DIALYTIKA TONOS}"  # windows-1253:A1
        "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"  # windows-1253:A2
        "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"  # windows-1253:B8
        "\N{GREEK CAPITAL LETTER ETA WITH TONOS}"  # windows-1253:B9
        "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"  # windows-1253:BA
        "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"  # windows-1253:BC
        "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"  # windows-1253:BE
        "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"  # windows-1253:BF
        "\N{CYRILLIC CAPITAL LETTER IO}"  # windows-1251:A8
        "\N{CYRILLIC CAPITAL LETTER DJE}"  # windows-1251:80
        "\N{CYRILLIC CAPITAL LETTER GJE}"  # windows-1251:81
        "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}"  # windows-1251:AA
        "\N{CYRILLIC CAPITAL LETTER DZE}"  # windows-1251:BD
        "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}"  # windows-1251:B2
        "\N{CYRILLIC CAPITAL LETTER YI}"  # windows-1251:AF
        "\N{CYRILLIC CAPITAL LETTER JE}"  # windows-1251:A3
        "\N{CYRILLIC CAPITAL LETTER LJE}"  # windows-1251:8A
        "\N{CYRILLIC CAPITAL LETTER NJE}"  # windows-1251:8C
        "\N{CYRILLIC CAPITAL LETTER TSHE}"  # windows-1251:8E
        "\N{CYRILLIC CAPITAL LETTER KJE}"  # windows-1251:8D
        "\N{CYRILLIC CAPITAL LETTER SHORT U}"  # windows-1251:A1
        "\N{CYRILLIC CAPITAL LETTER DZHE}"  # windows-1251:8F
        "\N{CYRILLIC SMALL LETTER IO}"  # windows-1251:B8
        "\N{CYRILLIC SMALL LETTER DJE}"  # windows-1251:90
        "\N{CYRILLIC SMALL LETTER GJE}"  # windows-1251:83
        "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}"  # windows-1251:BA
        "\N{CYRILLIC SMALL LETTER DZE}"  # windows-1251:BE
        "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"  # windows-1251:B3
        "\N{CYRILLIC SMALL LETTER YI}"  # windows-1251:BF
        "\N{CYRILLIC SMALL LETTER JE}"  # windows-1251:BC
        "\N{CYRILLIC SMALL LETTER LJE}"  # windows-1251:9A
        "\N{CYRILLIC SMALL LETTER NJE}"  # windows-1251:9C
        "\N{CYRILLIC SMALL LETTER TSHE}"  # windows-1251:9E
        "\N{CYRILLIC SMALL LETTER KJE}"  # windows-1251:9D
        "\N{CYRILLIC SMALL LETTER SHORT U}"  # windows-1251:A2
        "\N{CYRILLIC SMALL LETTER DZHE}"  # windows-1251:9F
        "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"  # windows-1251:A5
        "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"  # windows-1251:B4
        "\N{DAGGER}"  # windows-1252:86
        "\N{DOUBLE DAGGER}"  # windows-1252:87
        "\N{PER MILLE SIGN}"  # windows-1252:89
        "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"  # windows-1252:8B
        "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"  # windows-1252:9B
        "\N{EURO SIGN}"  # windows-1252:80
        "\N{NUMERO SIGN}"  # windows-1251:B9
        "\N{TRADE MARK SIGN}"  # windows-1252:99
    ),
}

661 

# This regex uses UTF8_CLUES to find sequences of likely mojibake.
# It matches them with + so that several adjacent UTF-8-looking sequences
# get coalesced into one, allowing them to be fixed more efficiently
# and not requiring every individual subsequence to be detected as 'badness'.
#
# We accept spaces in place of "utf8_continuation", because spaces might have
# been intended to be U+A0 NO-BREAK SPACE.
#
# We do a lookbehind to make sure the previous character isn't a
# "utf8_continuation_strict" character, so that we don't fix just a few
# characters in a huge garble and make the situation worse.
#
# Unfortunately, the matches to this regular expression won't show their
# surrounding context, and including context would make the expression much
# less efficient. The 'badness' rules that require context, such as a preceding
# lowercase letter, will prevent some cases of inconsistent UTF-8 from being
# fixed when they don't see it.
#
# The pattern is compiled with re.VERBOSE, so the whitespace used for layout
# outside the character classes is ignored. The doubled braces are literal
# repetition counts that survive the call to str.format.
UTF8_DETECTOR_RE = re.compile(
    """
    (?<! [{utf8_continuation_strict}])
    (
        [{utf8_first_of_2}] [{utf8_continuation}]
        |
        [{utf8_first_of_3}] [{utf8_continuation}]{{2}}
        |
        [{utf8_first_of_4}] [{utf8_continuation}]{{3}}
    )+
    """.format(**UTF8_CLUES),
    re.VERBOSE,
)