1""" 

2This gives other modules access to the gritty details about characters and the 

3encodings that use them. 

4""" 

5from __future__ import annotations 

6import html 

7import itertools 

8import re 

9import unicodedata 

10 

11 

12# These are the encodings we will try to fix in ftfy, in the 

13# order that they should be tried. 

14CHARMAP_ENCODINGS = [ 

15 "latin-1", 

16 "sloppy-windows-1252", 

17 "sloppy-windows-1251", 

18 "sloppy-windows-1250", 

19 "sloppy-windows-1253", 

20 "sloppy-windows-1254", 

21 "iso-8859-2", 

22 "macroman", 

23 "cp437", 

24] 

25 

26SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]") 

27DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]") 

28 

29 
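
# For example, SINGLE_QUOTE_RE.sub("'", "‘tis") gives "'tis", and
# DOUBLE_QUOTE_RE.sub('"', "“fancy”") gives '"fancy"'.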


def _build_regexes():
    """
    ENCODING_REGEXES contains reasonably fast ways to detect if we
    could represent a given string in a given encoding. The simplest one is
    the 'ascii' detector, which of course just determines if all characters
    are between U+0000 and U+007F.
    """
    # Define a regex that matches ASCII text.
    encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")}

    for encoding in CHARMAP_ENCODINGS:
        # Make a sequence of characters that bytes \x80 to \xFF decode to
        # in each encoding, as well as byte \x1A, which is used to represent
        # the replacement character � in the sloppy-* encodings.
        byte_range = bytes(list(range(0x80, 0x100)) + [0x1A])
        charlist = byte_range.decode(encoding)

        # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
        # to \x7F -- will decode as those ASCII characters in any encoding we
        # support, so we can just include them as ranges. This also lets us
        # not worry about escaping regex special characters, because all of
        # them are in the \x1B to \x7F range.
        regex = "^[\x00-\x19\x1b-\x7f{0}]*$".format(charlist)
        encoding_regexes[encoding] = re.compile(regex)
    return encoding_regexes


ENCODING_REGEXES = _build_regexes()
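
# For example, ENCODING_REGEXES["latin-1"] matches "café" (é is U+00E9,
# which latin-1 can represent) but not "café™" (™ is U+2122, which it
# cannot), and ENCODING_REGEXES["ascii"] matches neither string.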


def _build_html_entities():
    entities = {}
    # Create a dictionary based on the built-in HTML5 entity dictionary.
    # Add a limited set of HTML entities that we'll also decode if they've
    # been case-folded to uppercase, such as decoding &NTILDE; as "Ñ".
    for name, char in html.entities.html5.items():  # type: ignore
        if name.endswith(";"):
            entities["&" + name] = char

            # Restrict the set of characters we can attempt to decode if their
            # name has been uppercased. If we tried to handle all entity names,
            # the results would be ambiguous.
            if name == name.lower():
                name_upper = name.upper()
                entity_upper = "&" + name_upper
                if html.unescape(entity_upper) == entity_upper:
                    entities[entity_upper] = char.upper()
    return entities


HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")
HTML_ENTITIES = _build_html_entities()
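
# For instance, HTML_ENTITIES["&ntilde;"] is "ñ", and the case-folded entry
# HTML_ENTITIES["&NTILDE;"] is "Ñ". Names the HTML5 list already defines in
# mixed case, such as "&Ntilde;", are taken from the list as-is.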


def possible_encoding(text, encoding):
    """
    Given text and a single-byte encoding, check whether that text could have
    been decoded from that single-byte encoding.

    In other words, check whether it can be encoded in that encoding, possibly
    sloppily.
    """
    return bool(ENCODING_REGEXES[encoding].match(text))
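
# For example, possible_encoding("naïve", "latin-1") is True, but
# possible_encoding("№ 5", "latin-1") is False, because "№" has no
# Latin-1 representation.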


def _build_control_char_mapping():
    """
    Build a translate mapping that strips likely-unintended control characters.
    See :func:`ftfy.fixes.remove_control_chars` for a description of these
    codepoint ranges and why they should be removed.
    """
    control_chars: dict[int, None] = {}

    for i in itertools.chain(
        range(0x00, 0x09),
        [0x0B],
        range(0x0E, 0x20),
        [0x7F],
        range(0x206A, 0x2070),
        [0xFEFF],
        range(0xFFF9, 0xFFFD),
    ):
        control_chars[i] = None

    return control_chars


CONTROL_CHARS = _build_control_char_mapping()
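
# This mapping is meant for str.translate; for example,
# "hello\x00 world\ufeff".translate(CONTROL_CHARS) gives "hello world".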


# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
# that some Windows-1252 program converted to a plain space.
#
# The smaller values are included on a case-by-case basis, because we don't want
# to decode likely input sequences to unlikely characters. These are the ones
# that *do* form likely characters before 0xa0:
#
# 0xc2 -> U+A0 NO-BREAK SPACE
# 0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE
# 0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
# 0xce -> U+3A0 GREEK CAPITAL LETTER PI
# 0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
# 0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO
#
# In three-character sequences, we exclude some lead bytes in some cases.
#
# When the lead byte is immediately followed by 0xA0, we shouldn't accept
# a space there, because it leads to some less-likely character ranges:
#
# 0xe0 -> Samaritan script
# 0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common)
#
# We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and
# higher point mostly to CJK characters, which we generally don't want to
# decode near Latin lowercase letters.
#
# In four-character sequences, the lead byte must be F0, because that accounts
# for almost all of the usage of high-numbered codepoints (tag characters whose
# UTF-8 starts with the byte F3 are only used in some rare new emoji sequences).
#
# This is meant to be applied to encodings of text that tests true for `is_bad`.
# Any of these could represent characters that legitimately appear surrounded by
# spaces, particularly U+C5 (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to '...' in the future.
# I've seen it once, but the text still wasn't recoverable.

ALTERED_UTF8_RE = re.compile(
    b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]"
    b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]"
    b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]"
    b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]"
    b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]"
)
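
# For example, "à" is b"\xc3\xa0" in UTF-8. If its \xa0 byte was flattened
# to a plain space, the remaining b"\xc3 " matches the first alternative
# above, so the sequence can still be recognized as altered UTF-8.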


# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
#
# In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per
# sequence.
LOSSY_UTF8_RE = re.compile(
    b"[\xc2-\xdf][\x1a]"
    b"|[\xc2-\xc3][?]"
    b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]"
    b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]"
    b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]"
    b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]"
    b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]"
    b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]"
    b"|\x1a"
)
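
# For example, the trademark sign "™" is b"\xe2\x84\xa2" in UTF-8. If the
# middle byte came through as the replacement byte \x1a, the sequence
# b"\xe2\x1a\xa2" still matches the [\xe0-\xef][\x1a?][\x1a\x80-\xbf]
# branch, so it can be decoded as \ufffd rather than failing entirely.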


# This regex matches C1 control characters, which occupy some of the positions
# in the Latin-1 character map that Windows assigns to other characters instead.
C1_CONTROL_RE = re.compile(r"[\x80-\x9f]")
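
# For example, C1_CONTROL_RE.sub("", "\x93hello\x94") gives "hello".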


# A translate mapping that breaks ligatures made of Latin letters. While
# ligatures may be important to the representation of other languages, in Latin
# letters they tend to represent a copy/paste error. It omits ligatures such
# as æ that are frequently used intentionally.
#
# This list additionally includes some Latin digraphs that represent two
# characters for legacy encoding reasons, not for typographical reasons.
#
# Ligatures and digraphs may also be separated by NFKC normalization, but that
# is sometimes more normalization than you want.

LIGATURES = {
    ord("Ĳ"): "IJ",  # Dutch ligatures
    ord("ĳ"): "ij",
    ord("ŉ"): "ʼn",  # Afrikaans digraph meant to avoid auto-curled quote
    ord("Ǳ"): "DZ",  # Serbian/Croatian digraphs for Cyrillic conversion
    ord("ǲ"): "Dz",
    ord("ǳ"): "dz",
    ord("Ǆ"): "DŽ",
    ord("ǅ"): "Dž",
    ord("ǆ"): "dž",
    ord("Ǉ"): "LJ",
    ord("ǈ"): "Lj",
    ord("ǉ"): "lj",
    ord("Ǌ"): "NJ",
    ord("ǋ"): "Nj",
    ord("ǌ"): "nj",
    ord("ﬀ"): "ff",  # Latin typographical ligatures
    ord("ﬁ"): "fi",
    ord("ﬂ"): "fl",
    ord("ﬃ"): "ffi",
    ord("ﬄ"): "ffl",
    ord("ﬅ"): "ſt",
    ord("ﬆ"): "st",
}
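
# LIGATURES works with str.translate; for example,
# "ﬂuﬃer".translate(LIGATURES) gives "fluffier".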


def _build_width_map():
    """
    Build a translate mapping that replaces halfwidth and fullwidth forms
    with their standard-width forms.
    """
    # Though it's not listed as a fullwidth character, we'll want to convert
    # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
    # with that in the dictionary.
    width_map = {0x3000: " "}
    for i in range(0xFF01, 0xFFF0):
        char = chr(i)
        alternate = unicodedata.normalize("NFKC", char)
        if alternate != char:
            width_map[i] = alternate
    return width_map


WIDTH_MAP = _build_width_map()
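
# For example, "Ｆｕｌｌｗｉｄｔｈ".translate(WIDTH_MAP) gives "Fullwidth",
# and halfwidth katakana such as "ｱｲｳ" become standard-width "アイウ".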


# Character classes that help us pinpoint embedded mojibake. These can
# include common characters, because we'll also check them for 'badness'.
UTF8_CLUES = {
    # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding
    "utf8_first_of_2": (
        "ÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßĂĆČĎĐĘĚĞİĹŃŇŐŘŞŢŮŰ"
        "ΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
    ),
    # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding
    "utf8_first_of_3": ("àáâãäåæçèéêëìíîïăćčďęěĺŕΰαβγδεζηθικλμνξοабвгдежзийклмноп"),
    # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding.
    # (Other leading bytes correspond only to unassigned codepoints)
    "utf8_first_of_4": ("ðóđğπσру"),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # including a space standing in for 0xA0
    "utf8_continuation": (
        "\x80-\xbf"
        "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
        "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
        "–—―‘’‚“”„†‡•…‰‹›€№™"
        " "
    ),
    # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding,
    # and don't usually stand for themselves when adjacent to mojibake.
    # This excludes spaces, dashes, quotation marks, and ellipses.
    "utf8_continuation_strict": (
        "\x80-\xbf"
        "ĄąĽľŁłŒœŚśŞşŠšŤťŸŹźŻżŽžƒˆˇ˘˛˜˝΄΅"
        "ΆΈΉΊΌΎΏЁЂЃЄЅІЇЈЉЊЋЌЎЏёђѓєѕіїјљњћќўџҐґ"
        "†‡•‰‹›€№™"
    ),
}


# This regex uses UTF8_CLUES to find sequences of likely mojibake.
# It matches them with + so that several adjacent UTF-8-looking sequences
# get coalesced into one, allowing them to be fixed more efficiently
# and not requiring every individual subsequence to be detected as 'badness'.
#
# We accept spaces in place of "utf8_continuation", because spaces might have
# been intended to be U+A0 NO-BREAK SPACE.
#
# We do a lookbehind to make sure the previous character isn't a
# "utf8_continuation_strict" character, so that we don't fix just a few
# characters in a huge garble and make the situation worse.
#
# Unfortunately, the matches to this regular expression won't show their
# surrounding context, and including context would make the expression much
# less efficient. The 'badness' rules that require context, such as a preceding
# lowercase letter, will prevent some cases of inconsistent UTF-8 from being
# fixed when they don't see it.
UTF8_DETECTOR_RE = re.compile(
    """
    (?<! [{utf8_continuation_strict}])
    (
        [{utf8_first_of_2}] [{utf8_continuation}]
        |
        [{utf8_first_of_3}] [{utf8_continuation}]{{2}}
        |
        [{utf8_first_of_4}] [{utf8_continuation}]{{3}}
    )+
""".format(
        **UTF8_CLUES
    ),
    re.VERBOSE,
)
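
# For example, in "les rÃ©sultats â€” classement", this regex finds the
# mojibake sequences "Ã©" (UTF-8 for "é" read as Latin-1) and "â€”"
# (UTF-8 for "—" read as Windows-1252).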