1"""
2This gives other modules access to the gritty details about characters and the
3encodings that use them.
4"""
5from __future__ import annotations
6import html
7import itertools
8import re
9import unicodedata

# These are the encodings we will try to fix in ftfy, in the
# order that they should be tried.
CHARMAP_ENCODINGS = [
    "latin-1",
    "sloppy-windows-1252",
    "sloppy-windows-1251",
    "sloppy-windows-1250",
    "sloppy-windows-1253",
    "sloppy-windows-1254",
    "iso-8859-2",
    "macroman",
    "cp437",
]

SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]")
DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]")
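# For illustration, these are meant to be used with re.sub to straighten
# curly quotes:
#
#     SINGLE_QUOTE_RE.sub("'", "It’s")     # "It's"
#     DOUBLE_QUOTE_RE.sub('"', "“quote”")  # '"quote"'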

def _build_regexes():
    """
    ENCODING_REGEXES contain reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")}

    for encoding in CHARMAP_ENCODINGS:
        # Make a sequence of characters that bytes \x80 to \xFF decode to
        # in each encoding, as well as byte \x1A, which is used to represent
        # the replacement character � in the sloppy-* encodings.
        byte_range = bytes(list(range(0x80, 0x100)) + [0x1A])
        charlist = byte_range.decode(encoding)

        # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
        # to \x7F -- will decode as those ASCII characters in any encoding we
        # support, so we can just include them as ranges. This also lets us
        # not worry about escaping regex special characters, because all of
        # them are in the \x1B to \x7F range.
        regex = "^[\x00-\x19\x1b-\x7f{0}]*$".format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes


ENCODING_REGEXES = _build_regexes()
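# A rough sketch of what one of these compiled patterns amounts to: for
# "latin-1", the character class covers every codepoint from U+0000 through
# U+00FF, so that entry behaves like re.compile("^[\x00-\xff]*$"). The
# sloppy-windows-* entries accept a different (partly non-Latin) set of
# characters above U+007F.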

def _build_html_entities():
    entities = {}
    # Create a dictionary based on the built-in HTML5 entity dictionary.
    # Add a limited set of HTML entities that we'll also decode if they've
    # been case-folded to uppercase, such as decoding &NTILDE; as "Ñ".
    for name, char in html.entities.html5.items():  # type: ignore
        if name.endswith(";"):
            entities["&" + name] = char

            # Restrict the set of characters we can attempt to decode if their
            # name has been uppercased. If we tried to handle all entity names,
            # the results would be ambiguous.
            if name == name.lower():
                name_upper = name.upper()
                entity_upper = "&" + name_upper
                if html.unescape(entity_upper) == entity_upper:
                    entities[entity_upper] = char.upper()
    return entities


HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")
HTML_ENTITIES = _build_html_entities()
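# For illustration, after the table is built:
#
#     HTML_ENTITIES["&ntilde;"]  # "ñ", straight from html.entities.html5
#     HTML_ENTITIES["&NTILDE;"]  # "Ñ", added by the uppercase fallback above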

def possible_encoding(text, encoding):
    """
    Given text and a single-byte encoding, check whether that text could have
    been decoded from that single-byte encoding.

    In other words, check whether it can be encoded in that encoding, possibly
    sloppily.
    """
    return bool(ENCODING_REGEXES[encoding].match(text))
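# For illustration:
#
#     possible_encoding("café", "latin-1")                # True
#     possible_encoding("Привет", "latin-1")              # False
#     possible_encoding("Привет", "sloppy-windows-1251")  # True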

def _build_control_char_mapping():
    """
    Build a translate mapping that strips likely-unintended control characters.
    See :func:`ftfy.fixes.remove_control_chars` for a description of these
    codepoint ranges and why they should be removed.
    """
    control_chars: dict[int, None] = {}

    for i in itertools.chain(
        range(0x00, 0x09),
        [0x0B],
        range(0x0E, 0x20),
        [0x7F],
        range(0x206A, 0x2070),
        [0xFEFF],
        range(0xFFF9, 0xFFFD),
    ):
        control_chars[i] = None

    return control_chars


CONTROL_CHARS = _build_control_char_mapping()
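# CONTROL_CHARS is a str.translate table: codepoints mapped to None are
# dropped. For illustration:
#
#     "hello\x00\ufeff world".translate(CONTROL_CHARS)  # "hello world"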

# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
# that some Windows-1252 program converted to a plain space.
#
# The smaller values are included on a case-by-case basis, because we don't want
# to decode likely input sequences to unlikely characters. These are the ones
# that *do* form likely characters before 0xa0:
#
# 0xc2 -> U+A0 NO-BREAK SPACE
# 0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE
# 0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
# 0xce -> U+3A0 GREEK CAPITAL LETTER PI
# 0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
# 0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO
#
# In three-character sequences, we exclude some lead bytes in some cases.
#
# When the lead byte is immediately followed by 0xA0, we shouldn't accept
# a space there, because it leads to some less-likely character ranges:
#
# 0xe0 -> Samaritan script
# 0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common)
#
# We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and
# higher point mostly to CJK characters, which we generally don't want to
# decode near Latin lowercase letters.
#
# In four-character sequences, the lead byte must be F0, because that accounts
# for almost all of the usage of high-numbered codepoints (tag characters whose
# UTF-8 starts with the byte F3 are only used in some rare new emoji sequences).
#
# This is meant to be applied to encodings of text that tests true for `is_bad`.
# Any of these could represent characters that legitimately appear surrounded by
# spaces, particularly U+C5 (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to ... in the future.
# I've seen it once, but the text still wasn't recoverable.

ALTERED_UTF8_RE = re.compile(
    b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]"
    b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]"
    b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]"
    b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]"
)
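# For illustration: "à" is b"\xc3\xa0" in UTF-8, so if a legacy tool turned the
# 0xA0 into a plain space, b"voil\xc3\xa0" becomes b"voil\xc3 ", and
# ALTERED_UTF8_RE.search(b"voil\xc3 ") matches the trailing b"\xc3 ".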

# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
#
# In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per
# sequence.
LOSSY_UTF8_RE = re.compile(
    b"[\xc2-\xdf][\x1a]"
    b"|[\xc2-\xc3][?]"
    b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]"
    b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]"
    b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]"
    b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]"
    b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]"
    b"|\x1a"
)
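# For illustration: "é" is b"\xc3\xa9" in UTF-8. If the continuation byte was
# lost and replaced by the 0x1A stand-in, LOSSY_UTF8_RE.search(b"caf\xc3\x1a")
# matches b"\xc3\x1a", so the sequence can still be decoded as a single U+FFFD.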

# This regex matches C1 control characters, which occupy some of the positions
# in the Latin-1 character map that Windows assigns to other characters instead.
C1_CONTROL_RE = re.compile(r"[\x80-\x9f]")


# A translate mapping that breaks ligatures made of Latin letters. While
# ligatures may be important to the representation of other languages, in Latin
# letters they tend to represent a copy/paste error. It omits ligatures such
# as æ that are frequently used intentionally.
#
# This list additionally includes some Latin digraphs that represent two
# characters for legacy encoding reasons, not for typographical reasons.
#
# Ligatures and digraphs may also be separated by NFKC normalization, but that
# is sometimes more normalization than you want.

LIGATURES = {
    ord("Ĳ"): "IJ",  # Dutch ligatures
    ord("ĳ"): "ij",
    ord("ŉ"): "ʼn",  # Afrikaans digraph meant to avoid auto-curled quote
    ord("Ǳ"): "DZ",  # Serbian/Croatian digraphs for Cyrillic conversion
    ord("ǲ"): "Dz",
    ord("ǳ"): "dz",
    ord("Ǆ"): "DŽ",
    ord("ǅ"): "Dž",
    ord("ǆ"): "dž",
    ord("Ǉ"): "LJ",
    ord("ǈ"): "Lj",
    ord("ǉ"): "lj",
    ord("Ǌ"): "NJ",
    ord("ǋ"): "Nj",
    ord("ǌ"): "nj",
    ord("ﬀ"): "ff",  # Latin typographical ligatures
    ord("ﬁ"): "fi",
    ord("ﬂ"): "fl",
    ord("ﬃ"): "ffi",
    ord("ﬄ"): "ffl",
    ord("ﬅ"): "ſt",
    ord("ﬆ"): "st",
}
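# LIGATURES is also a str.translate table. For illustration:
#
#     "ﬁnal oﬀer".translate(LIGATURES)  # "final offer"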

def _build_width_map():
    """
    Build a translate mapping that replaces halfwidth and fullwidth forms
    with their standard-width forms.
    """
    # Though it's not listed as a fullwidth character, we'll want to convert
    # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
    # with that in the dictionary.
    width_map = {0x3000: " "}
    for i in range(0xFF01, 0xFFF0):
        char = chr(i)
        alternate = unicodedata.normalize("NFKC", char)
        if alternate != char:
            width_map[i] = alternate
    return width_map


WIDTH_MAP = _build_width_map()
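# WIDTH_MAP is another str.translate table. For illustration:
#
#     "ｆｕｌｌ　ｗｉｄｔｈ".translate(WIDTH_MAP)  # "full width"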

# Character classes that help us pinpoint embedded mojibake. These can
# include common characters, because we'll also check them for 'badness'.
UTF8_CLUES = {
    # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding
    "utf8_first_of_2": (
        "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ"
        "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
    ),
    # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding
    "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕΰαβγδεζηθικλμνξοабвгдежзийклмноп"),
    # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding.
    # (Other leading bytes correspond only to unassigned codepoints)
    "utf8_first_of_4": ("ðóđğπσру"),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # including a space standing in for 0xA0
    "utf8_continuation": (
        "\x80-\xbf"
        "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
        "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
        "–—―‘’‚“”„†‡•…‰‹›€№™"
        " "
    ),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # and don't usually stand for themselves when adjacent to mojibake.
    # This excludes spaces, dashes, quotation marks, and ellipses.
    "utf8_continuation_strict": (
        "\x80-\xbf"
        "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
        "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
        "†‡•‰‹›€№™"
    ),
}

# This regex uses UTF8_CLUES to find sequences of likely mojibake.
# It matches them with + so that several adjacent UTF-8-looking sequences
# get coalesced into one, allowing them to be fixed more efficiently
# and not requiring every individual subsequence to be detected as 'badness'.
#
# We accept spaces in place of "utf8_continuation", because spaces might have
# been intended to be U+A0 NO-BREAK SPACE.
#
# We do a lookbehind to make sure the previous character isn't a
# "utf8_continuation_strict" character, so that we don't fix just a few
# characters in a huge garble and make the situation worse.
#
# Unfortunately, the matches to this regular expression won't show their
# surrounding context, and including context would make the expression much
# less efficient. The 'badness' rules that require context, such as a preceding
# lowercase letter, will prevent some cases of inconsistent UTF-8 from being
# fixed when they don't see it.
UTF8_DETECTOR_RE = re.compile(
    """
    (?<! [{utf8_continuation_strict}])
    (
        [{utf8_first_of_2}] [{utf8_continuation}]
        |
        [{utf8_first_of_3}] [{utf8_continuation}]{{2}}
        |
        [{utf8_first_of_4}] [{utf8_continuation}]{{3}}
    )+
""".format(
        **UTF8_CLUES
    ),
    re.VERBOSE,
)
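# For illustration: the UTF-8 bytes for "é" (0xC3 0xA9) misread as
# windows-1252 come out as "Ã©", and UTF8_DETECTOR_RE.search("Ã©") matches
# that whole two-character sequence. Adjacent sequences such as "Ã©Ã¨"
# coalesce into a single match because of the trailing "+".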