Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/ftfy/chardata.py: 98%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""
This gives other modules access to the gritty details about characters and the
encodings that use them.
"""
6from __future__ import annotations
8import html
9import itertools
10import re
11import unicodedata
# These are the encodings we will try to fix in ftfy, in the
# order that they should be tried.
#
# Each name is handed to `bytes.decode()` when the per-encoding detector
# regexes are built, so every entry must be a codec name that is registered
# by the time this module is imported.
CHARMAP_ENCODINGS = [
    "latin-1",
    "sloppy-windows-1252",
    "sloppy-windows-1251",
    "sloppy-windows-1250",
    "sloppy-windows-1253",
    "sloppy-windows-1254",
    "sloppy-windows-1257",
    "iso-8859-2",
    "macroman",
    "cp437",
]
# Matches a single character that is U+02BC or lies in U+2018 through U+201B.
SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]")
# Matches a single character in the range U+201C through U+201F.
DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]")
32def _build_regexes() -> dict[str, re.Pattern[str]]:
33 """
34 ENCODING_REGEXES contain reasonably fast ways to detect if we
35 could represent a given string in a given encoding. The simplest one is
36 the 'ascii' detector, which of course just determines if all characters
37 are between U+0000 and U+007F.
38 """
39 # Define a regex that matches ASCII text.
40 encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")}
42 for encoding in CHARMAP_ENCODINGS:
43 # Make a sequence of characters that bytes \x80 to \xFF decode to
44 # in each encoding, as well as byte \x1A, which is used to represent
45 # the replacement character � in the sloppy-* encodings.
46 byte_range = bytes([*range(0x80, 0x100), 0x1A])
47 charlist = byte_range.decode(encoding)
49 # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
50 # to \x7F -- will decode as those ASCII characters in any encoding we
51 # support, so we can just include them as ranges. This also lets us
52 # not worry about escaping regex special characters, because all of
53 # them are in the \x1B to \x7F range.
54 regex = f"^[\x00-\x19\x1b-\x7f{charlist}]*$"
55 encoding_regexes[encoding] = re.compile(regex)
56 return encoding_regexes
# Detector regexes keyed by encoding name, built once at import time.
ENCODING_REGEXES = _build_regexes()
62def _build_html_entities() -> dict[str, str]:
63 entities = {}
64 # Create a dictionary based on the built-in HTML5 entity dictionary.
65 # Add a limited set of HTML entities that we'll also decode if they've
66 # been case-folded to uppercase, such as decoding Ñ as "Ñ".
67 for name, char in html.entities.html5.items(): # type: ignore
68 if name.endswith(";"):
69 entities["&" + name] = char
71 # Restrict the set of characters we can attempt to decode if their
72 # name has been uppercased. If we tried to handle all entity names,
73 # the results would be ambiguous.
74 if name == name.lower():
75 name_upper = name.upper()
76 entity_upper = "&" + name_upper
77 if html.unescape(entity_upper) == entity_upper:
78 entities[entity_upper] = char.upper()
79 return entities
82HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")
83HTML_ENTITIES = _build_html_entities()
def possible_encoding(text: str, encoding: str) -> bool:
    """
    Given text and a single-byte encoding, check whether that text could have
    been decoded from that single-byte encoding.

    In other words, check whether it can be encoded in that encoding, possibly
    sloppily.

    `encoding` must be one of the keys of ENCODING_REGEXES; looking up any
    other name raises KeyError.
    """
    return bool(ENCODING_REGEXES[encoding].match(text))
97def _build_control_char_mapping() -> dict[int, None]:
98 """
99 Build a translate mapping that strips likely-unintended control characters.
100 See :func:`ftfy.fixes.remove_control_chars` for a description of these
101 codepoint ranges and why they should be removed.
102 """
103 control_chars: dict[int, None] = {}
105 for i in itertools.chain(
106 range(0x00, 0x09),
107 [0x0B],
108 range(0x0E, 0x20),
109 [0x7F],
110 range(0x206A, 0x2070),
111 [0xFEFF],
112 range(0xFFF9, 0xFFFD),
113 ):
114 control_chars[i] = None
116 return control_chars
119CONTROL_CHARS = _build_control_char_mapping()
# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
# that some Windows-1252 program converted to a plain space.
#
# The smaller values are included on a case-by-case basis, because we don't want
# to decode likely input sequences to unlikely characters. These are the ones
# that *do* form likely characters before 0xa0:
#
#   0xc2 -> U+A0 NO-BREAK SPACE
#   0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE
#   0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
#   0xce -> U+3A0 GREEK CAPITAL LETTER PI
#   0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
#   0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO
#
# In three-character sequences, we exclude some lead bytes in some cases.
#
# When the lead byte is immediately followed by 0xA0, we shouldn't accept
# a space there, because it leads to some less-likely character ranges:
#
#   0xe0 -> Samaritan script
#   0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common)
#
# We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and
# higher point mostly to CJK characters, which we generally don't want to
# decode near Latin lowercase letters.
#
# In four-character sequences, the lead byte must be F0, because that accounts
# for almost all of the usage of high-numbered codepoints (tag characters whose
# UTF-8 starts with the byte F3 are only used in some rare new emoji sequences).
#
# This is meant to be applied to encodings of text that tests true for `is_bad`.
# Any of these could represent characters that legitimately appear surrounded by
# spaces, particularly U+C5 (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to "..." (three ASCII
# periods) in the future. I've seen it once, but the text still wasn't
# recoverable.
#
# Note that the continuation-byte classes below leave out \x85 and \xa0.

ALTERED_UTF8_RE = re.compile(
    b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]"
    b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]"
    b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]"
    b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]"
)
# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
#
# In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per
# sequence.
#
# The final alternative matches a lone 0x1a byte that is not part of any of the
# longer sequences listed before it.
LOSSY_UTF8_RE = re.compile(
    b"[\xc2-\xdf][\x1a]"
    b"|[\xc2-\xc3][?]"
    b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]"
    b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]"
    b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]"
    b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]"
    b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]"
    b"|\x1a"
)
# This regex matches C1 control characters, which occupy some of the positions
# in the Latin-1 character map that Windows assigns to other characters instead.
# It matches one character at a time, in the range U+0080 through U+009F.
C1_CONTROL_RE = re.compile(r"[\x80-\x9f]")
# A translate mapping that breaks ligatures made of Latin letters. While
# ligatures may be important to the representation of other languages, in Latin
# letters they tend to represent a copy/paste error. It omits ligatures such
# as æ that are frequently used intentionally.
#
# This list additionally includes some Latin digraphs that represent two
# characters for legacy encoding reasons, not for typographical reasons.
#
# Ligatures and digraphs may also be separated by NFKC normalization, but that
# is sometimes more normalization than you want.
#
# The keys are written as numeric code points rather than as literal ligature
# characters. A literal ligature is silently expanded into several characters
# by any tool that normalizes this file's text, after which `ord()` raises
# TypeError. Non-ASCII characters in the values are escaped for the same
# reason.

LIGATURES = {
    0x0132: "IJ",  # LATIN CAPITAL LIGATURE IJ (Dutch ligatures)
    0x0133: "ij",  # LATIN SMALL LIGATURE IJ
    # Afrikaans digraph meant to avoid auto-curled quote
    0x0149: "\u02bcn",  # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
    # Serbian/Croatian digraphs for Cyrillic conversion
    0x01F1: "DZ",  # LATIN CAPITAL LETTER DZ
    0x01F2: "Dz",  # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
    0x01F3: "dz",  # LATIN SMALL LETTER DZ
    0x01C4: "D\u017d",  # LATIN CAPITAL LETTER DZ WITH CARON
    0x01C5: "D\u017e",  # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
    0x01C6: "d\u017e",  # LATIN SMALL LETTER DZ WITH CARON
    0x01C7: "LJ",  # LATIN CAPITAL LETTER LJ
    0x01C8: "Lj",  # LATIN CAPITAL LETTER L WITH SMALL LETTER J
    0x01C9: "lj",  # LATIN SMALL LETTER LJ
    0x01CA: "NJ",  # LATIN CAPITAL LETTER NJ
    0x01CB: "Nj",  # LATIN CAPITAL LETTER N WITH SMALL LETTER J
    0x01CC: "nj",  # LATIN SMALL LETTER NJ
    # Latin typographical ligatures
    0xFB00: "ff",  # LATIN SMALL LIGATURE FF
    0xFB01: "fi",  # LATIN SMALL LIGATURE FI
    0xFB02: "fl",  # LATIN SMALL LIGATURE FL
    0xFB03: "ffi",  # LATIN SMALL LIGATURE FFI
    0xFB04: "ffl",  # LATIN SMALL LIGATURE FFL
    0xFB05: "\u017ft",  # LATIN SMALL LIGATURE LONG S T
    0xFB06: "st",  # LATIN SMALL LIGATURE ST
}
233def _build_width_map() -> dict[int, str]:
234 """
235 Build a translate mapping that replaces halfwidth and fullwidth forms
236 with their standard-width forms.
237 """
238 # Though it's not listed as a fullwidth character, we'll want to convert
239 # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
240 # with that in the dictionary.
241 width_map = {0x3000: " "}
242 for i in range(0xFF01, 0xFFF0):
243 char = chr(i)
244 alternate = unicodedata.normalize("NFKC", char)
245 if alternate != char:
246 width_map[i] = alternate
247 return width_map
250WIDTH_MAP = _build_width_map()
# Character classes that help us pinpoint embedded mojibake. These can
# include common characters, because we'll also check them for 'badness'.
#
# Though they go on for many lines, the members of this dictionary are
# single concatenated strings.
#
# Each value is written so that it can be placed between "[" and "]" in a
# regular expression; that is why two of them begin with the range \x80-\xbf.
# The trailing comment on each line names an encoding and the byte that the
# character stands for in it.
#
# This code is generated using scripts/char_data_table.py.
UTF8_CLUES: dict[str, str] = {
    # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding
    "utf8_first_of_2": (
        "\N{LATIN CAPITAL LETTER A WITH BREVE}"  # windows-1250:C3
        "\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}"  # latin-1:C2
        "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}"  # latin-1:C4
        "\N{LATIN CAPITAL LETTER A WITH MACRON}"  # windows-1257:C2
        "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}"  # latin-1:C5
        "\N{LATIN CAPITAL LETTER A WITH TILDE}"  # latin-1:C3
        "\N{LATIN CAPITAL LETTER AE}"  # latin-1:C6
        "\N{LATIN CAPITAL LETTER C WITH ACUTE}"  # windows-1250:C6
        "\N{LATIN CAPITAL LETTER C WITH CARON}"  # windows-1250:C8
        "\N{LATIN CAPITAL LETTER C WITH CEDILLA}"  # latin-1:C7
        "\N{LATIN CAPITAL LETTER D WITH CARON}"  # windows-1250:CF
        "\N{LATIN CAPITAL LETTER D WITH STROKE}"  # windows-1250:D0
        "\N{LATIN CAPITAL LETTER E WITH ACUTE}"  # latin-1:C9
        "\N{LATIN CAPITAL LETTER E WITH CARON}"  # windows-1250:CC
        "\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}"  # latin-1:CA
        "\N{LATIN CAPITAL LETTER E WITH DIAERESIS}"  # latin-1:CB
        "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}"  # windows-1257:CB
        "\N{LATIN CAPITAL LETTER E WITH GRAVE}"  # latin-1:C8
        "\N{LATIN CAPITAL LETTER E WITH MACRON}"  # windows-1257:C7
        "\N{LATIN CAPITAL LETTER E WITH OGONEK}"  # windows-1250:CA
        "\N{LATIN CAPITAL LETTER ETH}"  # latin-1:D0
        "\N{LATIN CAPITAL LETTER G WITH BREVE}"  # windows-1254:D0
        "\N{LATIN CAPITAL LETTER G WITH CEDILLA}"  # windows-1257:CC
        "\N{LATIN CAPITAL LETTER I WITH ACUTE}"  # latin-1:CD
        "\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}"  # latin-1:CE
        "\N{LATIN CAPITAL LETTER I WITH DIAERESIS}"  # latin-1:CF
        "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"  # windows-1254:DD
        "\N{LATIN CAPITAL LETTER I WITH GRAVE}"  # latin-1:CC
        "\N{LATIN CAPITAL LETTER I WITH MACRON}"  # windows-1257:CE
        "\N{LATIN CAPITAL LETTER K WITH CEDILLA}"  # windows-1257:CD
        "\N{LATIN CAPITAL LETTER L WITH ACUTE}"  # windows-1250:C5
        "\N{LATIN CAPITAL LETTER L WITH CEDILLA}"  # windows-1257:CF
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"  # windows-1257:D9
        "\N{LATIN CAPITAL LETTER N WITH ACUTE}"  # windows-1250:D1
        "\N{LATIN CAPITAL LETTER N WITH CARON}"  # windows-1250:D2
        "\N{LATIN CAPITAL LETTER N WITH CEDILLA}"  # windows-1257:D2
        "\N{LATIN CAPITAL LETTER N WITH TILDE}"  # latin-1:D1
        "\N{LATIN CAPITAL LETTER O WITH ACUTE}"  # latin-1:D3
        "\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}"  # latin-1:D4
        "\N{LATIN CAPITAL LETTER O WITH DIAERESIS}"  # latin-1:D6
        "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}"  # windows-1250:D5
        "\N{LATIN CAPITAL LETTER O WITH GRAVE}"  # latin-1:D2
        "\N{LATIN CAPITAL LETTER O WITH MACRON}"  # windows-1257:D4
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"  # latin-1:D8
        "\N{LATIN CAPITAL LETTER O WITH TILDE}"  # latin-1:D5
        "\N{LATIN CAPITAL LETTER R WITH CARON}"  # windows-1250:D8
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"  # windows-1257:DA
        "\N{LATIN CAPITAL LETTER S WITH CARON}"  # windows-1257:D0
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"  # windows-1254:DE
        "\N{LATIN CAPITAL LETTER T WITH CEDILLA}"  # windows-1250:DE
        "\N{LATIN CAPITAL LETTER THORN}"  # latin-1:DE
        "\N{LATIN CAPITAL LETTER U WITH ACUTE}"  # latin-1:DA
        "\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}"  # latin-1:DB
        "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}"  # latin-1:DC
        "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}"  # windows-1250:DB
        "\N{LATIN CAPITAL LETTER U WITH GRAVE}"  # latin-1:D9
        "\N{LATIN CAPITAL LETTER U WITH MACRON}"  # windows-1257:DB
        "\N{LATIN CAPITAL LETTER U WITH OGONEK}"  # windows-1257:D8
        "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}"  # windows-1250:D9
        "\N{LATIN CAPITAL LETTER Y WITH ACUTE}"  # latin-1:DD
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"  # windows-1257:CA
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"  # windows-1257:DE
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"  # windows-1257:DD
        "\N{LATIN SMALL LETTER SHARP S}"  # latin-1:DF
        "\N{MULTIPLICATION SIGN}"  # latin-1:D7
        "\N{GREEK CAPITAL LETTER BETA}"  # windows-1253:C2
        "\N{GREEK CAPITAL LETTER GAMMA}"  # windows-1253:C3
        "\N{GREEK CAPITAL LETTER DELTA}"  # windows-1253:C4
        "\N{GREEK CAPITAL LETTER EPSILON}"  # windows-1253:C5
        "\N{GREEK CAPITAL LETTER ZETA}"  # windows-1253:C6
        "\N{GREEK CAPITAL LETTER ETA}"  # windows-1253:C7
        "\N{GREEK CAPITAL LETTER THETA}"  # windows-1253:C8
        "\N{GREEK CAPITAL LETTER IOTA}"  # windows-1253:C9
        "\N{GREEK CAPITAL LETTER KAPPA}"  # windows-1253:CA
        "\N{GREEK CAPITAL LETTER LAMDA}"  # windows-1253:CB
        "\N{GREEK CAPITAL LETTER MU}"  # windows-1253:CC
        "\N{GREEK CAPITAL LETTER NU}"  # windows-1253:CD
        "\N{GREEK CAPITAL LETTER XI}"  # windows-1253:CE
        "\N{GREEK CAPITAL LETTER OMICRON}"  # windows-1253:CF
        "\N{GREEK CAPITAL LETTER PI}"  # windows-1253:D0
        "\N{GREEK CAPITAL LETTER RHO}"  # windows-1253:D1
        "\N{GREEK CAPITAL LETTER SIGMA}"  # windows-1253:D3
        "\N{GREEK CAPITAL LETTER TAU}"  # windows-1253:D4
        "\N{GREEK CAPITAL LETTER UPSILON}"  # windows-1253:D5
        "\N{GREEK CAPITAL LETTER PHI}"  # windows-1253:D6
        "\N{GREEK CAPITAL LETTER CHI}"  # windows-1253:D7
        "\N{GREEK CAPITAL LETTER PSI}"  # windows-1253:D8
        "\N{GREEK CAPITAL LETTER OMEGA}"  # windows-1253:D9
        "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}"  # windows-1253:DA
        "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}"  # windows-1253:DB
        "\N{GREEK SMALL LETTER ALPHA WITH TONOS}"  # windows-1253:DC
        "\N{GREEK SMALL LETTER EPSILON WITH TONOS}"  # windows-1253:DD
        "\N{GREEK SMALL LETTER ETA WITH TONOS}"  # windows-1253:DE
        "\N{GREEK SMALL LETTER IOTA WITH TONOS}"  # windows-1253:DF
        "\N{CYRILLIC CAPITAL LETTER VE}"  # windows-1251:C2
        "\N{CYRILLIC CAPITAL LETTER GHE}"  # windows-1251:C3
        "\N{CYRILLIC CAPITAL LETTER DE}"  # windows-1251:C4
        "\N{CYRILLIC CAPITAL LETTER IE}"  # windows-1251:C5
        "\N{CYRILLIC CAPITAL LETTER ZHE}"  # windows-1251:C6
        "\N{CYRILLIC CAPITAL LETTER ZE}"  # windows-1251:C7
        "\N{CYRILLIC CAPITAL LETTER I}"  # windows-1251:C8
        "\N{CYRILLIC CAPITAL LETTER SHORT I}"  # windows-1251:C9
        "\N{CYRILLIC CAPITAL LETTER KA}"  # windows-1251:CA
        "\N{CYRILLIC CAPITAL LETTER EL}"  # windows-1251:CB
        "\N{CYRILLIC CAPITAL LETTER EM}"  # windows-1251:CC
        "\N{CYRILLIC CAPITAL LETTER EN}"  # windows-1251:CD
        "\N{CYRILLIC CAPITAL LETTER O}"  # windows-1251:CE
        "\N{CYRILLIC CAPITAL LETTER PE}"  # windows-1251:CF
        "\N{CYRILLIC CAPITAL LETTER ER}"  # windows-1251:D0
        "\N{CYRILLIC CAPITAL LETTER ES}"  # windows-1251:D1
        "\N{CYRILLIC CAPITAL LETTER TE}"  # windows-1251:D2
        "\N{CYRILLIC CAPITAL LETTER U}"  # windows-1251:D3
        "\N{CYRILLIC CAPITAL LETTER EF}"  # windows-1251:D4
        "\N{CYRILLIC CAPITAL LETTER HA}"  # windows-1251:D5
        "\N{CYRILLIC CAPITAL LETTER TSE}"  # windows-1251:D6
        "\N{CYRILLIC CAPITAL LETTER CHE}"  # windows-1251:D7
        "\N{CYRILLIC CAPITAL LETTER SHA}"  # windows-1251:D8
        "\N{CYRILLIC CAPITAL LETTER SHCHA}"  # windows-1251:D9
        "\N{CYRILLIC CAPITAL LETTER HARD SIGN}"  # windows-1251:DA
        "\N{CYRILLIC CAPITAL LETTER YERU}"  # windows-1251:DB
        "\N{CYRILLIC CAPITAL LETTER SOFT SIGN}"  # windows-1251:DC
        "\N{CYRILLIC CAPITAL LETTER E}"  # windows-1251:DD
        "\N{CYRILLIC CAPITAL LETTER YU}"  # windows-1251:DE
        "\N{CYRILLIC CAPITAL LETTER YA}"  # windows-1251:DF
    ),
    # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding
    "utf8_first_of_3": (
        "\N{LATIN SMALL LETTER A WITH ACUTE}"  # latin-1:E1
        "\N{LATIN SMALL LETTER A WITH BREVE}"  # windows-1250:E3
        "\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}"  # latin-1:E2
        "\N{LATIN SMALL LETTER A WITH DIAERESIS}"  # latin-1:E4
        "\N{LATIN SMALL LETTER A WITH GRAVE}"  # latin-1:E0
        "\N{LATIN SMALL LETTER A WITH MACRON}"  # windows-1257:E2
        "\N{LATIN SMALL LETTER A WITH OGONEK}"  # windows-1257:E0
        "\N{LATIN SMALL LETTER A WITH RING ABOVE}"  # latin-1:E5
        "\N{LATIN SMALL LETTER A WITH TILDE}"  # latin-1:E3
        "\N{LATIN SMALL LETTER AE}"  # latin-1:E6
        "\N{LATIN SMALL LETTER C WITH ACUTE}"  # windows-1250:E6
        "\N{LATIN SMALL LETTER C WITH CARON}"  # windows-1250:E8
        "\N{LATIN SMALL LETTER C WITH CEDILLA}"  # latin-1:E7
        "\N{LATIN SMALL LETTER D WITH CARON}"  # windows-1250:EF
        "\N{LATIN SMALL LETTER E WITH ACUTE}"  # latin-1:E9
        "\N{LATIN SMALL LETTER E WITH CARON}"  # windows-1250:EC
        "\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}"  # latin-1:EA
        "\N{LATIN SMALL LETTER E WITH DIAERESIS}"  # latin-1:EB
        "\N{LATIN SMALL LETTER E WITH DOT ABOVE}"  # windows-1257:EB
        "\N{LATIN SMALL LETTER E WITH GRAVE}"  # latin-1:E8
        "\N{LATIN SMALL LETTER E WITH MACRON}"  # windows-1257:E7
        "\N{LATIN SMALL LETTER E WITH OGONEK}"  # windows-1250:EA
        # NOTE(review): the next entry repeats the previous one, label included.
        # A repeated character is harmless inside a regex character class, so
        # it is kept to leave the generated string unchanged. Presumably the
        # generator emitted it for a second encoding -- verify against
        # scripts/char_data_table.py.
        "\N{LATIN SMALL LETTER E WITH OGONEK}"  # windows-1250:EA
        "\N{LATIN SMALL LETTER G WITH CEDILLA}"  # windows-1257:EC
        "\N{LATIN SMALL LETTER I WITH ACUTE}"  # latin-1:ED
        "\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}"  # latin-1:EE
        "\N{LATIN SMALL LETTER I WITH DIAERESIS}"  # latin-1:EF
        "\N{LATIN SMALL LETTER I WITH GRAVE}"  # latin-1:EC
        "\N{LATIN SMALL LETTER I WITH MACRON}"  # windows-1257:EE
        "\N{LATIN SMALL LETTER I WITH OGONEK}"  # windows-1257:E1
        "\N{LATIN SMALL LETTER K WITH CEDILLA}"  # windows-1257:ED
        "\N{LATIN SMALL LETTER L WITH ACUTE}"  # windows-1250:E5
        "\N{LATIN SMALL LETTER L WITH CEDILLA}"  # windows-1257:EF
        "\N{LATIN SMALL LETTER R WITH ACUTE}"  # windows-1250:E0
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"  # windows-1257:EA
        "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}"  # windows-1253:E0
        "\N{GREEK SMALL LETTER ALPHA}"  # windows-1253:E1
        "\N{GREEK SMALL LETTER BETA}"  # windows-1253:E2
        "\N{GREEK SMALL LETTER GAMMA}"  # windows-1253:E3
        "\N{GREEK SMALL LETTER DELTA}"  # windows-1253:E4
        "\N{GREEK SMALL LETTER EPSILON}"  # windows-1253:E5
        "\N{GREEK SMALL LETTER ZETA}"  # windows-1253:E6
        "\N{GREEK SMALL LETTER ETA}"  # windows-1253:E7
        "\N{GREEK SMALL LETTER THETA}"  # windows-1253:E8
        "\N{GREEK SMALL LETTER IOTA}"  # windows-1253:E9
        "\N{GREEK SMALL LETTER KAPPA}"  # windows-1253:EA
        "\N{GREEK SMALL LETTER LAMDA}"  # windows-1253:EB
        "\N{GREEK SMALL LETTER MU}"  # windows-1253:EC
        "\N{GREEK SMALL LETTER NU}"  # windows-1253:ED
        "\N{GREEK SMALL LETTER XI}"  # windows-1253:EE
        "\N{GREEK SMALL LETTER OMICRON}"  # windows-1253:EF
        "\N{CYRILLIC SMALL LETTER A}"  # windows-1251:E0
        "\N{CYRILLIC SMALL LETTER BE}"  # windows-1251:E1
        "\N{CYRILLIC SMALL LETTER VE}"  # windows-1251:E2
        "\N{CYRILLIC SMALL LETTER GHE}"  # windows-1251:E3
        "\N{CYRILLIC SMALL LETTER DE}"  # windows-1251:E4
        "\N{CYRILLIC SMALL LETTER IE}"  # windows-1251:E5
        "\N{CYRILLIC SMALL LETTER ZHE}"  # windows-1251:E6
        "\N{CYRILLIC SMALL LETTER ZE}"  # windows-1251:E7
        "\N{CYRILLIC SMALL LETTER I}"  # windows-1251:E8
        "\N{CYRILLIC SMALL LETTER SHORT I}"  # windows-1251:E9
        "\N{CYRILLIC SMALL LETTER KA}"  # windows-1251:EA
        "\N{CYRILLIC SMALL LETTER EL}"  # windows-1251:EB
        "\N{CYRILLIC SMALL LETTER EM}"  # windows-1251:EC
        "\N{CYRILLIC SMALL LETTER EN}"  # windows-1251:ED
        "\N{CYRILLIC SMALL LETTER O}"  # windows-1251:EE
        "\N{CYRILLIC SMALL LETTER PE}"  # windows-1251:EF
    ),
    # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding.
    # (Other leading bytes correspond only to unassigned codepoints)
    "utf8_first_of_4": (
        "\N{LATIN SMALL LETTER D WITH STROKE}"  # windows-1250:F0
        "\N{LATIN SMALL LETTER ETH}"  # latin-1:F0
        "\N{LATIN SMALL LETTER G WITH BREVE}"  # windows-1254:F0
        "\N{LATIN SMALL LETTER O WITH ACUTE}"  # latin-1:F3
        "\N{LATIN SMALL LETTER S WITH CARON}"  # windows-1257:F0
        "\N{GREEK SMALL LETTER PI}"  # windows-1253:F0
        "\N{GREEK SMALL LETTER SIGMA}"  # windows-1253:F3
        "\N{CYRILLIC SMALL LETTER ER}"  # windows-1251:F0
        "\N{CYRILLIC SMALL LETTER U}"  # windows-1251:F3
    ),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # including a space standing in for 0xA0
    "utf8_continuation": (
        "\x80-\xbf"
        "\N{SPACE}"  # modification of latin-1:A0, NO-BREAK SPACE
        "\N{LATIN CAPITAL LETTER A WITH OGONEK}"  # windows-1250:A5
        "\N{LATIN CAPITAL LETTER AE}"  # windows-1257:AF
        "\N{LATIN CAPITAL LETTER L WITH CARON}"  # windows-1250:BC
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"  # windows-1250:A3
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"  # windows-1257:A8
        "\N{LATIN CAPITAL LETTER R WITH CEDILLA}"  # windows-1257:AA
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"  # windows-1250:8C
        "\N{LATIN CAPITAL LETTER S WITH CARON}"  # windows-1252:8A
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"  # windows-1250:AA
        "\N{LATIN CAPITAL LETTER T WITH CARON}"  # windows-1250:8D
        "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"  # windows-1252:9F
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"  # windows-1250:8F
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"  # windows-1252:8E
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"  # windows-1250:AF
        "\N{LATIN CAPITAL LIGATURE OE}"  # windows-1252:8C
        "\N{LATIN SMALL LETTER A WITH OGONEK}"  # windows-1250:B9
        "\N{LATIN SMALL LETTER AE}"  # windows-1257:BF
        "\N{LATIN SMALL LETTER F WITH HOOK}"  # windows-1252:83
        "\N{LATIN SMALL LETTER L WITH CARON}"  # windows-1250:BE
        "\N{LATIN SMALL LETTER L WITH STROKE}"  # windows-1250:B3
        "\N{LATIN SMALL LETTER O WITH STROKE}"  # windows-1257:B8
        "\N{LATIN SMALL LETTER R WITH CEDILLA}"  # windows-1257:BA
        "\N{LATIN SMALL LETTER S WITH ACUTE}"  # windows-1250:9C
        "\N{LATIN SMALL LETTER S WITH CARON}"  # windows-1252:9A
        "\N{LATIN SMALL LETTER S WITH CEDILLA}"  # windows-1250:BA
        "\N{LATIN SMALL LETTER T WITH CARON}"  # windows-1250:9D
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"  # windows-1250:9F
        "\N{LATIN SMALL LETTER Z WITH CARON}"  # windows-1252:9E
        "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"  # windows-1250:BF
        "\N{LATIN SMALL LIGATURE OE}"  # windows-1252:9C
        "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}"  # windows-1252:88
        "\N{CARON}"  # windows-1250:A1
        "\N{BREVE}"  # windows-1250:A2
        "\N{OGONEK}"  # windows-1250:B2
        "\N{SMALL TILDE}"  # windows-1252:98
        "\N{DOUBLE ACUTE ACCENT}"  # windows-1250:BD
        "\N{GREEK TONOS}"  # windows-1253:B4
        "\N{GREEK DIALYTIKA TONOS}"  # windows-1253:A1
        "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"  # windows-1253:A2
        "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"  # windows-1253:B8
        "\N{GREEK CAPITAL LETTER ETA WITH TONOS}"  # windows-1253:B9
        "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"  # windows-1253:BA
        "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"  # windows-1253:BC
        "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"  # windows-1253:BE
        "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"  # windows-1253:BF
        "\N{CYRILLIC CAPITAL LETTER IO}"  # windows-1251:A8
        "\N{CYRILLIC CAPITAL LETTER DJE}"  # windows-1251:80
        "\N{CYRILLIC CAPITAL LETTER GJE}"  # windows-1251:81
        "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}"  # windows-1251:AA
        "\N{CYRILLIC CAPITAL LETTER DZE}"  # windows-1251:BD
        "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}"  # windows-1251:B2
        "\N{CYRILLIC CAPITAL LETTER YI}"  # windows-1251:AF
        "\N{CYRILLIC CAPITAL LETTER JE}"  # windows-1251:A3
        "\N{CYRILLIC CAPITAL LETTER LJE}"  # windows-1251:8A
        "\N{CYRILLIC CAPITAL LETTER NJE}"  # windows-1251:8C
        "\N{CYRILLIC CAPITAL LETTER TSHE}"  # windows-1251:8E
        "\N{CYRILLIC CAPITAL LETTER KJE}"  # windows-1251:8D
        "\N{CYRILLIC CAPITAL LETTER SHORT U}"  # windows-1251:A1
        "\N{CYRILLIC CAPITAL LETTER DZHE}"  # windows-1251:8F
        "\N{CYRILLIC SMALL LETTER IO}"  # windows-1251:B8
        "\N{CYRILLIC SMALL LETTER DJE}"  # windows-1251:90
        "\N{CYRILLIC SMALL LETTER GJE}"  # windows-1251:83
        "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}"  # windows-1251:BA
        "\N{CYRILLIC SMALL LETTER DZE}"  # windows-1251:BE
        "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"  # windows-1251:B3
        "\N{CYRILLIC SMALL LETTER YI}"  # windows-1251:BF
        "\N{CYRILLIC SMALL LETTER JE}"  # windows-1251:BC
        "\N{CYRILLIC SMALL LETTER LJE}"  # windows-1251:9A
        "\N{CYRILLIC SMALL LETTER NJE}"  # windows-1251:9C
        "\N{CYRILLIC SMALL LETTER TSHE}"  # windows-1251:9E
        "\N{CYRILLIC SMALL LETTER KJE}"  # windows-1251:9D
        "\N{CYRILLIC SMALL LETTER SHORT U}"  # windows-1251:A2
        "\N{CYRILLIC SMALL LETTER DZHE}"  # windows-1251:9F
        "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"  # windows-1251:A5
        "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"  # windows-1251:B4
        "\N{EN DASH}"  # windows-1252:96
        "\N{EM DASH}"  # windows-1252:97
        "\N{HORIZONTAL BAR}"  # windows-1253:AF
        "\N{LEFT SINGLE QUOTATION MARK}"  # windows-1252:91
        "\N{RIGHT SINGLE QUOTATION MARK}"  # windows-1252:92
        "\N{SINGLE LOW-9 QUOTATION MARK}"  # windows-1252:82
        "\N{LEFT DOUBLE QUOTATION MARK}"  # windows-1252:93
        "\N{RIGHT DOUBLE QUOTATION MARK}"  # windows-1252:94
        "\N{DOUBLE LOW-9 QUOTATION MARK}"  # windows-1252:84
        "\N{DAGGER}"  # windows-1252:86
        "\N{DOUBLE DAGGER}"  # windows-1252:87
        "\N{BULLET}"  # windows-1252:95
        "\N{HORIZONTAL ELLIPSIS}"  # windows-1252:85
        "\N{PER MILLE SIGN}"  # windows-1252:89
        "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"  # windows-1252:8B
        "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"  # windows-1252:9B
        "\N{EURO SIGN}"  # windows-1252:80
        "\N{NUMERO SIGN}"  # windows-1251:B9
        "\N{TRADE MARK SIGN}"  # windows-1252:99
    ),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # and don't usually stand for themselves when adjacent to mojibake.
    # This excludes spaces, dashes, 'bullet', quotation marks, and ellipses.
    "utf8_continuation_strict": (
        "\x80-\xbf"
        "\N{LATIN CAPITAL LETTER A WITH OGONEK}"  # windows-1250:A5
        "\N{LATIN CAPITAL LETTER AE}"  # windows-1257:AF
        "\N{LATIN CAPITAL LETTER L WITH CARON}"  # windows-1250:BC
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"  # windows-1250:A3
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"  # windows-1257:A8
        "\N{LATIN CAPITAL LETTER R WITH CEDILLA}"  # windows-1257:AA
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"  # windows-1250:8C
        "\N{LATIN CAPITAL LETTER S WITH CARON}"  # windows-1252:8A
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"  # windows-1250:AA
        "\N{LATIN CAPITAL LETTER T WITH CARON}"  # windows-1250:8D
        "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"  # windows-1252:9F
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"  # windows-1250:8F
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"  # windows-1252:8E
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"  # windows-1250:AF
        "\N{LATIN CAPITAL LIGATURE OE}"  # windows-1252:8C
        "\N{LATIN SMALL LETTER A WITH OGONEK}"  # windows-1250:B9
        "\N{LATIN SMALL LETTER AE}"  # windows-1257:BF
        "\N{LATIN SMALL LETTER F WITH HOOK}"  # windows-1252:83
        "\N{LATIN SMALL LETTER L WITH CARON}"  # windows-1250:BE
        "\N{LATIN SMALL LETTER L WITH STROKE}"  # windows-1250:B3
        "\N{LATIN SMALL LETTER O WITH STROKE}"  # windows-1257:B8
        "\N{LATIN SMALL LETTER R WITH CEDILLA}"  # windows-1257:BA
        "\N{LATIN SMALL LETTER S WITH ACUTE}"  # windows-1250:9C
        "\N{LATIN SMALL LETTER S WITH CARON}"  # windows-1252:9A
        "\N{LATIN SMALL LETTER S WITH CEDILLA}"  # windows-1250:BA
        "\N{LATIN SMALL LETTER T WITH CARON}"  # windows-1250:9D
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"  # windows-1250:9F
        "\N{LATIN SMALL LETTER Z WITH CARON}"  # windows-1252:9E
        "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"  # windows-1250:BF
        "\N{LATIN SMALL LIGATURE OE}"  # windows-1252:9C
        "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}"  # windows-1252:88
        "\N{CARON}"  # windows-1250:A1
        "\N{BREVE}"  # windows-1250:A2
        "\N{OGONEK}"  # windows-1250:B2
        "\N{SMALL TILDE}"  # windows-1252:98
        "\N{DOUBLE ACUTE ACCENT}"  # windows-1250:BD
        "\N{GREEK TONOS}"  # windows-1253:B4
        "\N{GREEK DIALYTIKA TONOS}"  # windows-1253:A1
        "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"  # windows-1253:A2
        "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"  # windows-1253:B8
        "\N{GREEK CAPITAL LETTER ETA WITH TONOS}"  # windows-1253:B9
        "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"  # windows-1253:BA
        "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"  # windows-1253:BC
        "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"  # windows-1253:BE
        "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"  # windows-1253:BF
        "\N{CYRILLIC CAPITAL LETTER IO}"  # windows-1251:A8
        "\N{CYRILLIC CAPITAL LETTER DJE}"  # windows-1251:80
        "\N{CYRILLIC CAPITAL LETTER GJE}"  # windows-1251:81
        "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}"  # windows-1251:AA
        "\N{CYRILLIC CAPITAL LETTER DZE}"  # windows-1251:BD
        "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}"  # windows-1251:B2
        "\N{CYRILLIC CAPITAL LETTER YI}"  # windows-1251:AF
        "\N{CYRILLIC CAPITAL LETTER JE}"  # windows-1251:A3
        "\N{CYRILLIC CAPITAL LETTER LJE}"  # windows-1251:8A
        "\N{CYRILLIC CAPITAL LETTER NJE}"  # windows-1251:8C
        "\N{CYRILLIC CAPITAL LETTER TSHE}"  # windows-1251:8E
        "\N{CYRILLIC CAPITAL LETTER KJE}"  # windows-1251:8D
        "\N{CYRILLIC CAPITAL LETTER SHORT U}"  # windows-1251:A1
        "\N{CYRILLIC CAPITAL LETTER DZHE}"  # windows-1251:8F
        "\N{CYRILLIC SMALL LETTER IO}"  # windows-1251:B8
        "\N{CYRILLIC SMALL LETTER DJE}"  # windows-1251:90
        "\N{CYRILLIC SMALL LETTER GJE}"  # windows-1251:83
        "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}"  # windows-1251:BA
        "\N{CYRILLIC SMALL LETTER DZE}"  # windows-1251:BE
        "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"  # windows-1251:B3
        "\N{CYRILLIC SMALL LETTER YI}"  # windows-1251:BF
        "\N{CYRILLIC SMALL LETTER JE}"  # windows-1251:BC
        "\N{CYRILLIC SMALL LETTER LJE}"  # windows-1251:9A
        "\N{CYRILLIC SMALL LETTER NJE}"  # windows-1251:9C
        "\N{CYRILLIC SMALL LETTER TSHE}"  # windows-1251:9E
        "\N{CYRILLIC SMALL LETTER KJE}"  # windows-1251:9D
        "\N{CYRILLIC SMALL LETTER SHORT U}"  # windows-1251:A2
        "\N{CYRILLIC SMALL LETTER DZHE}"  # windows-1251:9F
        "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"  # windows-1251:A5
        "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"  # windows-1251:B4
        "\N{DAGGER}"  # windows-1252:86
        "\N{DOUBLE DAGGER}"  # windows-1252:87
        "\N{PER MILLE SIGN}"  # windows-1252:89
        "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"  # windows-1252:8B
        "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"  # windows-1252:9B
        "\N{EURO SIGN}"  # windows-1252:80
        "\N{NUMERO SIGN}"  # windows-1251:B9
        "\N{TRADE MARK SIGN}"  # windows-1252:99
    ),
}
# This regex uses UTF8_CLUES to find sequences of likely mojibake.
# It matches them with + so that several adjacent UTF-8-looking sequences
# get coalesced into one, allowing them to be fixed more efficiently
# and not requiring every individual subsequence to be detected as 'badness'.
#
# We accept spaces in place of "utf8_continuation", because spaces might have
# been intended to be U+A0 NO-BREAK SPACE.
#
# We do a lookbehind to make sure the previous character isn't a
# "utf8_continuation_strict" character, so that we don't fix just a few
# characters in a huge garble and make the situation worse.
#
# Unfortunately, the matches to this regular expression won't show their
# surrounding context, and including context would make the expression much
# less efficient. The 'badness' rules that require context, such as a preceding
# lowercase letter, will prevent some cases of inconsistent UTF-8 from being
# fixed when they don't see it.
#
# The pattern is a `str.format` template, so the literal repetition counts are
# written with doubled braces ({{2}}, {{3}}). It is compiled with re.VERBOSE,
# so the whitespace used for layout outside the character classes is ignored.
UTF8_DETECTOR_RE = re.compile(
    """
    (?<! [{utf8_continuation_strict}])
    (
        [{utf8_first_of_2}] [{utf8_continuation}]
        |
        [{utf8_first_of_3}] [{utf8_continuation}]{{2}}
        |
        [{utf8_first_of_4}] [{utf8_continuation}]{{3}}
    )+
    """.format(**UTF8_CLUES),
    re.VERBOSE,
)