1"""
2This gives other modules access to the gritty details about characters and the
3encodings that use them.
4"""
5from __future__ import annotations
6import html
7import itertools
8import re
9import unicodedata

# These are the encodings we will try to fix in ftfy, in the
# order that they should be tried.
CHARMAP_ENCODINGS = [
    "latin-1",
    "sloppy-windows-1252",
    "sloppy-windows-1251",
    "sloppy-windows-1250",
    "sloppy-windows-1253",
    "sloppy-windows-1254",
    "iso-8859-2",
    "macroman",
    "cp437",
]

SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]")
DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]")
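# For illustration, these are meant to be used with re.sub to straighten
# curly quotes:
#
#     SINGLE_QUOTE_RE.sub("'", "It’s")     # "It's"
#     DOUBLE_QUOTE_RE.sub('"', "“quote”")  # '"quote"'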

def _build_regexes():
    """
    ENCODING_REGEXES contain reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")}

    for encoding in CHARMAP_ENCODINGS:
        # Make a sequence of characters that bytes \x80 to \xFF decode to
        # in each encoding, as well as byte \x1A, which is used to represent
        # the replacement character � in the sloppy-* encodings.
        byte_range = bytes(list(range(0x80, 0x100)) + [0x1A])
        charlist = byte_range.decode(encoding)

        # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
        # to \x7F -- will decode as those ASCII characters in any encoding we
        # support, so we can just include them as ranges. This also lets us
        # not worry about escaping regex special characters, because all of
        # them are in the \x1B to \x7F range.
        regex = "^[\x00-\x19\x1b-\x7f{0}]*$".format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes


ENCODING_REGEXES = _build_regexes()
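# A rough sketch of what one of these compiled patterns amounts to: for
# "latin-1", the character class covers every codepoint from U+0000 through
# U+00FF, so that entry behaves like re.compile("^[\x00-\xff]*$"). The
# sloppy-windows-* entries accept a different (partly non-Latin) set of
# characters above U+007F.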

def _build_html_entities():
    entities = {}
    # Create a dictionary based on the built-in HTML5 entity dictionary.
    # Add a limited set of HTML entities that we'll also decode if they've
    # been case-folded to uppercase, such as decoding &NTILDE; as "Ñ".
    for name, char in html.entities.html5.items():  # type: ignore
        if name.endswith(";"):
            entities["&" + name] = char

            # Restrict the set of characters we can attempt to decode if their
            # name has been uppercased. If we tried to handle all entity names,
            # the results would be ambiguous.
            if name == name.lower():
                name_upper = name.upper()
                entity_upper = "&" + name_upper
                if html.unescape(entity_upper) == entity_upper:
                    entities[entity_upper] = char.upper()
    return entities


HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")
HTML_ENTITIES = _build_html_entities()
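# For illustration, after the table is built:
#
#     HTML_ENTITIES["&ntilde;"]  # "ñ", straight from html.entities.html5
#     HTML_ENTITIES["&NTILDE;"]  # "Ñ", added by the uppercase fallback above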

def possible_encoding(text, encoding):
    """
    Given text and a single-byte encoding, check whether that text could have
    been decoded from that single-byte encoding.

    In other words, check whether it can be encoded in that encoding, possibly
    sloppily.
    """
    return bool(ENCODING_REGEXES[encoding].match(text))
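# For illustration:
#
#     possible_encoding("café", "latin-1")                # True
#     possible_encoding("Привет", "latin-1")              # False
#     possible_encoding("Привет", "sloppy-windows-1251")  # True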

def _build_control_char_mapping():
    """
    Build a translate mapping that strips likely-unintended control characters.
    See :func:`ftfy.fixes.remove_control_chars` for a description of these
    codepoint ranges and why they should be removed.
    """
    control_chars: dict[int, None] = {}

    for i in itertools.chain(
        range(0x00, 0x09),
        [0x0B],
        range(0x0E, 0x20),
        [0x7F],
        range(0x206A, 0x2070),
        [0xFEFF],
        range(0xFFF9, 0xFFFD),
    ):
        control_chars[i] = None

    return control_chars


CONTROL_CHARS = _build_control_char_mapping()
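# CONTROL_CHARS is a str.translate table: codepoints mapped to None are
# dropped. For illustration:
#
#     "hello\x00\ufeff world".translate(CONTROL_CHARS)  # "hello world"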

# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
# that some Windows-1252 program converted to a plain space.
#
# The smaller values are included on a case-by-case basis, because we don't want
# to decode likely input sequences to unlikely characters. These are the ones
# that *do* form likely characters before 0xa0:
#
# 0xc2 -> U+A0 NO-BREAK SPACE
# 0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE
# 0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
# 0xce -> U+3A0 GREEK CAPITAL LETTER PI
# 0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
# 0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO
#
# In three-character sequences, we exclude some lead bytes in some cases.
#
# When the lead byte is immediately followed by 0xA0, we shouldn't accept
# a space there, because it leads to some less-likely character ranges:
#
# 0xe0 -> Samaritan script
# 0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common)
#
# We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and
# higher point mostly to CJK characters, which we generally don't want to
# decode near Latin lowercase letters.
#
# In four-character sequences, the lead byte must be F0, because that accounts
# for almost all of the usage of high-numbered codepoints (tag characters whose
# UTF-8 starts with the byte F3 are only used in some rare new emoji sequences).
#
# This is meant to be applied to encodings of text that tests true for `is_bad`.
# Any of these could represent characters that legitimately appear surrounded by
# spaces, particularly U+C5 (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to ... in the future.
# I've seen it once, but the text still wasn't recoverable.

ALTERED_UTF8_RE = re.compile(
    b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]"
    b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]"
    b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]"
    b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]"
)
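# For illustration: "à" is b"\xc3\xa0" in UTF-8, so if a legacy tool turned the
# 0xA0 into a plain space, b"voil\xc3\xa0" becomes b"voil\xc3 ", and
# ALTERED_UTF8_RE.search(b"voil\xc3 ") matches the trailing b"\xc3 ".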

# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
#
# In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per
# sequence.
LOSSY_UTF8_RE = re.compile(
    b"[\xc2-\xdf][\x1a]"
    b"|[\xc2-\xc3][?]"
    b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]"
    b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]"
    b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]"
    b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]"
    b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]"
    b"|\x1a"
)
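# For illustration: "é" is b"\xc3\xa9" in UTF-8. If the continuation byte was
# lost and replaced by the 0x1A stand-in, LOSSY_UTF8_RE.search(b"caf\xc3\x1a")
# matches b"\xc3\x1a", so the sequence can still be decoded as a single U+FFFD.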

# This regex matches C1 control characters, which occupy some of the positions
# in the Latin-1 character map that Windows assigns to other characters instead.
C1_CONTROL_RE = re.compile(r"[\x80-\x9f]")


# A translate mapping that breaks ligatures made of Latin letters. While
# ligatures may be important to the representation of other languages, in Latin
# letters they tend to represent a copy/paste error. It omits ligatures such
# as æ that are frequently used intentionally.
#
# This list additionally includes some Latin digraphs that represent two
# characters for legacy encoding reasons, not for typographical reasons.
#
# Ligatures and digraphs may also be separated by NFKC normalization, but that
# is sometimes more normalization than you want.

LIGATURES = {
    ord("Ĳ"): "IJ",  # Dutch ligatures
    ord("ĳ"): "ij",
    ord("ŉ"): "ʼn",  # Afrikaans digraph meant to avoid auto-curled quote
    ord("Ǳ"): "DZ",  # Serbian/Croatian digraphs for Cyrillic conversion
    ord("ǲ"): "Dz",
    ord("ǳ"): "dz",
    ord("Ǆ"): "DŽ",
    ord("ǅ"): "Dž",
    ord("ǆ"): "dž",
    ord("Ǉ"): "LJ",
    ord("ǈ"): "Lj",
    ord("ǉ"): "lj",
    ord("Ǌ"): "NJ",
    ord("ǋ"): "Nj",
    ord("ǌ"): "nj",
    ord("ﬀ"): "ff",  # Latin typographical ligatures
    ord("ﬁ"): "fi",
    ord("ﬂ"): "fl",
    ord("ﬃ"): "ffi",
    ord("ﬄ"): "ffl",
    ord("ﬅ"): "ſt",
    ord("ﬆ"): "st",
}
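# LIGATURES is also a str.translate table. For illustration:
#
#     "ﬁnal oﬀer".translate(LIGATURES)  # "final offer"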

def _build_width_map():
    """
    Build a translate mapping that replaces halfwidth and fullwidth forms
    with their standard-width forms.
    """
    # Though it's not listed as a fullwidth character, we'll want to convert
    # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
    # with that in the dictionary.
    width_map = {0x3000: " "}
    for i in range(0xFF01, 0xFFF0):
        char = chr(i)
        alternate = unicodedata.normalize("NFKC", char)
        if alternate != char:
            width_map[i] = alternate
    return width_map


WIDTH_MAP = _build_width_map()
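# WIDTH_MAP is another str.translate table. For illustration:
#
#     "ｆｕｌｌ　ｗｉｄｔｈ".translate(WIDTH_MAP)  # "full width"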

# Character classes that help us pinpoint embedded mojibake. These can
# include common characters, because we'll also check them for 'badness'.
UTF8_CLUES = {
    # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding
    "utf8_first_of_2": (
        "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ"
        "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
    ),
    # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding
    "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕΰαβγδεζηθικλμνξοабвгдежзийклмноп"),
    # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding.
    # (Other leading bytes correspond only to unassigned codepoints)
    "utf8_first_of_4": ("ðóđğπσру"),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # including a space standing in for 0xA0
    "utf8_continuation": (
        "\x80-\xbf"
        "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
        "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
        "–—―‘’‚“”„†‡•…‰‹›€№™"
        " "
    ),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # and don't usually stand for themselves when adjacent to mojibake.
    # This excludes spaces, dashes, quotation marks, and ellipses.
    "utf8_continuation_strict": (
        "\x80-\xbf"
        "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
        "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
        "†‡•‰‹›€№™"
    ),
}

# This regex uses UTF8_CLUES to find sequences of likely mojibake.
# It matches them with + so that several adjacent UTF-8-looking sequences
# get coalesced into one, allowing them to be fixed more efficiently
# and not requiring every individual subsequence to be detected as 'badness'.
#
# We accept spaces in place of "utf8_continuation", because spaces might have
# been intended to be U+A0 NO-BREAK SPACE.
#
# We do a lookbehind to make sure the previous character isn't a
# "utf8_continuation_strict" character, so that we don't fix just a few
# characters in a huge garble and make the situation worse.
#
# Unfortunately, the matches to this regular expression won't show their
# surrounding context, and including context would make the expression much
# less efficient. The 'badness' rules that require context, such as a preceding
# lowercase letter, will prevent some cases of inconsistent UTF-8 from being
# fixed when they don't see it.
UTF8_DETECTOR_RE = re.compile(
    """
    (?<! [{utf8_continuation_strict}])
    (
        [{utf8_first_of_2}] [{utf8_continuation}]
        |
        [{utf8_first_of_3}] [{utf8_continuation}]{{2}}
        |
        [{utf8_first_of_4}] [{utf8_continuation}]{{3}}
    )+
""".format(
        **UTF8_CLUES
    ),
    re.VERBOSE,
)
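# For illustration: the UTF-8 bytes for "é" (0xC3 0xA9) misread as
# windows-1252 come out as "Ã©", and UTF8_DETECTOR_RE.search("Ã©") matches
# that whole two-character sequence. Adjacent sequences such as "Ã©Ã¨"
# coalesce into a single match because of the trailing "+".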