Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/ftfy/badness.py: 67%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

12 statements  

1""" 

2`ftfy.badness` contains a heuristic that detects likely mojibake. 

3 

4This heuristic signals to ftfy which segments of text need to be fixed, and 

5also indicates when the text can stop being fixed. 

6 

7The design of this heuristic is that we categorize the approximately 400 

8Unicode characters that occur in UTF-8 mojibake, specifically the characters 

9that come from mixing up UTF-8 with the other encodings we support. We 

10identify sequences and contexts of these characters that are much more likely 

11to be mojibake than intended strings, such as lowercase accented letters 

12followed immediately by currency symbols. 

13""" 

14 

15import warnings 

16import re 

17 

18 

19# There are only a few hundred characters that occur in known UTF-8 mojibake, and we can 

20# characterize them: 

21 

# Each value in this dict is a fragment destined for a regex character class
# ([...]), so bare characters, ranges like "a-z", and \N{...} named escapes
# can simply be concatenated. The categories drive BADNESS_RE below.
MOJIBAKE_CATEGORIES = {
    # Characters that appear in many different contexts. Sequences that contain
    # them are not inherently mojibake
    "common": (
        "\N{NO-BREAK SPACE}"
        "\N{SOFT HYPHEN}"
        "\N{MIDDLE DOT}"
        "\N{ACUTE ACCENT}"
        "\N{EN DASH}"
        "\N{EM DASH}"
        "\N{HORIZONTAL BAR}"
        "\N{HORIZONTAL ELLIPSIS}"
        "\N{RIGHT SINGLE QUOTATION MARK}"
    ),
    # the C1 control character range, which have no uses outside of mojibake anymore
    "c1": "\x80-\x9f",
    # Characters that are nearly 100% used in mojibake
    "bad": (
        "\N{BROKEN BAR}"
        "\N{CURRENCY SIGN}"
        "\N{DIAERESIS}"
        "\N{NOT SIGN}"
        "\N{MACRON}"
        "\N{CEDILLA}"
        "\N{LATIN SMALL LETTER F WITH HOOK}"
        "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}"  # it's not a modifier
        "\N{CARON}"
        "\N{BREVE}"
        "\N{OGONEK}"
        "\N{SMALL TILDE}"
        "\N{DAGGER}"
        "\N{DOUBLE DAGGER}"
        "\N{PER MILLE SIGN}"
        "\N{REVERSED NOT SIGN}"
        "\N{LOZENGE}"
        "\ufffd"  # the replacement character, a direct sign of decoding trouble
        # Theoretically these would appear in 'numeric' contexts, but when they
        # co-occur with other mojibake characters, it's not really ambiguous
        "\N{FEMININE ORDINAL INDICATOR}"
        "\N{MASCULINE ORDINAL INDICATOR}"
    ),
    # Characters used in legalese
    "law": (
        "\N{PILCROW SIGN}"
        "\N{SECTION SIGN}"
    ),
    # Currency symbols: plausible in text, suspicious next to accented letters
    "currency": (
        "\N{CENT SIGN}"
        "\N{POUND SIGN}"
        "\N{YEN SIGN}"
        "\N{PESETA SIGN}"
        "\N{EURO SIGN}"
    ),
    # Punctuation that typically opens a span of text
    "start_punctuation": (
        "\N{INVERTED EXCLAMATION MARK}"
        "\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}"
        "\N{INVERTED QUESTION MARK}"
        "\N{COPYRIGHT SIGN}"
        "\N{GREEK TONOS}"
        "\N{GREEK DIALYTIKA TONOS}"
        "\N{LEFT SINGLE QUOTATION MARK}"
        "\N{SINGLE LOW-9 QUOTATION MARK}"
        "\N{LEFT DOUBLE QUOTATION MARK}"
        "\N{DOUBLE LOW-9 QUOTATION MARK}"
        "\N{BULLET}"
        "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}"
        "\uf8ff"  # OS-specific symbol, usually the Apple logo
    ),
    # Punctuation that typically closes a span of text
    "end_punctuation": (
        "\N{REGISTERED SIGN}"
        "\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
        "\N{DOUBLE ACUTE ACCENT}"
        "\N{RIGHT DOUBLE QUOTATION MARK}"
        "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}"
        "\N{TRADE MARK SIGN}"
    ),
    # Characters that plausibly appear in numeric or mathematical contexts
    "numeric": (
        "\N{SUPERSCRIPT TWO}"
        "\N{SUPERSCRIPT THREE}"
        "\N{SUPERSCRIPT ONE}"
        "\N{PLUS-MINUS SIGN}"
        "\N{VULGAR FRACTION ONE QUARTER}"
        "\N{VULGAR FRACTION ONE HALF}"
        "\N{VULGAR FRACTION THREE QUARTERS}"
        "\N{MULTIPLICATION SIGN}"
        "\N{MICRO SIGN}"
        "\N{DIVISION SIGN}"
        "\N{FRACTION SLASH}"
        "\N{PARTIAL DIFFERENTIAL}"
        "\N{INCREMENT}"
        "\N{N-ARY PRODUCT}"
        "\N{N-ARY SUMMATION}"
        "\N{SQUARE ROOT}"
        "\N{INFINITY}"
        "\N{INTERSECTION}"
        "\N{INTEGRAL}"
        "\N{ALMOST EQUAL TO}"
        "\N{NOT EQUAL TO}"
        "\N{IDENTICAL TO}"
        "\N{LESS-THAN OR EQUAL TO}"
        "\N{GREATER-THAN OR EQUAL TO}"
        "\N{NUMERO SIGN}"
    ),
    # Letters that might be used to make emoticon faces (kaomoji), and
    # therefore might need to appear in more improbable-looking contexts.
    #
    # These are concatenated character ranges for use in a regex. I know
    # they look like faces themselves. I think expressing the ranges like
    # this helps to illustrate why we need to be careful with these
    # characters.
    "kaomoji": (
        "Ò-Ö"
        "Ù-Ü"
        "ò-ö"
        "ø-ü"
        "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}"
        "\N{LATIN CAPITAL LETTER O WITH MACRON}"
        "\N{LATIN CAPITAL LETTER U WITH MACRON}"
        "\N{LATIN CAPITAL LETTER U WITH OGONEK}"
        "\N{DEGREE SIGN}"
    ),
    # Uppercase accented letters (the O's and U's usable in kaomoji excluded)
    "upper_accented": (
        # LATIN CAPITAL LETTER A WITH GRAVE - LATIN CAPITAL LETTER N WITH TILDE
        "\xc0-\xd1"
        # skip capital O's and U's that could be used in kaomoji, but
        # include Ø because it's very common in Arabic mojibake:
        "\N{LATIN CAPITAL LETTER O WITH STROKE}"
        "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER Y WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER A WITH BREVE}"
        "\N{LATIN CAPITAL LETTER A WITH MACRON}"
        "\N{LATIN CAPITAL LETTER A WITH OGONEK}"
        "\N{LATIN CAPITAL LETTER C WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER C WITH CARON}"
        "\N{LATIN CAPITAL LETTER D WITH CARON}"
        "\N{LATIN CAPITAL LETTER D WITH STROKE}"
        "\N{LATIN CAPITAL LETTER E WITH OGONEK}"
        "\N{LATIN CAPITAL LETTER E WITH CARON}"
        "\N{LATIN CAPITAL LETTER E WITH MACRON}"
        "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}"
        "\N{LATIN CAPITAL LETTER G WITH BREVE}"
        "\N{LATIN CAPITAL LETTER G WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}"
        "\N{LATIN CAPITAL LETTER I WITH MACRON}"
        "\N{LATIN CAPITAL LETTER K WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER L WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER L WITH CARON}"
        "\N{LATIN CAPITAL LETTER L WITH STROKE}"
        "\N{LATIN CAPITAL LETTER L WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER N WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER N WITH CARON}"
        "\N{LATIN CAPITAL LETTER N WITH CEDILLA}"
        "\N{LATIN CAPITAL LIGATURE OE}"
        "\N{LATIN CAPITAL LETTER R WITH CARON}"
        "\N{LATIN CAPITAL LETTER S WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER S WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER S WITH CARON}"
        "\N{LATIN CAPITAL LETTER T WITH CEDILLA}"
        "\N{LATIN CAPITAL LETTER T WITH CARON}"
        "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}"
        "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}"
        "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"
        "\N{LATIN CAPITAL LETTER Z WITH ACUTE}"
        "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}"
        "\N{LATIN CAPITAL LETTER Z WITH CARON}"
        "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}"
    ),
    # Lowercase accented letters (the o's and u's usable in kaomoji excluded)
    "lower_accented": (
        "\N{LATIN SMALL LETTER SHARP S}"
        # LATIN SMALL LETTER A WITH GRAVE - LATIN SMALL LETTER N WITH TILDE
        "\xe0-\xf1"
        # skip o's and u's that could be used in kaomoji
        "\N{LATIN SMALL LETTER A WITH BREVE}"
        "\N{LATIN SMALL LETTER A WITH OGONEK}"
        "\N{LATIN SMALL LETTER A WITH MACRON}"
        "\N{LATIN SMALL LETTER C WITH ACUTE}"
        "\N{LATIN SMALL LETTER C WITH CARON}"
        "\N{LATIN SMALL LETTER D WITH CARON}"
        "\N{LATIN SMALL LETTER D WITH STROKE}"
        "\N{LATIN SMALL LETTER E WITH OGONEK}"
        "\N{LATIN SMALL LETTER E WITH CARON}"
        "\N{LATIN SMALL LETTER E WITH MACRON}"
        "\N{LATIN SMALL LETTER E WITH DOT ABOVE}"
        "\N{LATIN SMALL LETTER G WITH BREVE}"
        "\N{LATIN SMALL LETTER G WITH CEDILLA}"
        "\N{LATIN SMALL LETTER I WITH OGONEK}"
        "\N{LATIN SMALL LETTER I WITH MACRON}"
        "\N{LATIN SMALL LETTER K WITH CEDILLA}"
        "\N{LATIN SMALL LETTER L WITH ACUTE}"
        "\N{LATIN SMALL LETTER L WITH CARON}"
        "\N{LATIN SMALL LETTER L WITH STROKE}"
        "\N{LATIN SMALL LETTER L WITH CEDILLA}"
        "\N{LATIN SMALL LIGATURE OE}"
        "\N{LATIN SMALL LETTER R WITH ACUTE}"
        "\N{LATIN SMALL LETTER S WITH ACUTE}"
        "\N{LATIN SMALL LETTER S WITH CEDILLA}"
        "\N{LATIN SMALL LETTER S WITH CARON}"
        "\N{LATIN SMALL LETTER T WITH CARON}"
        "\N{LATIN SMALL LETTER U WITH DIAERESIS}"
        "\N{LATIN SMALL LETTER Z WITH ACUTE}"
        "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}"
        "\N{LATIN SMALL LETTER Z WITH CARON}"
        "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}"
        "\N{LATIN SMALL LIGATURE FI}"
        "\N{LATIN SMALL LIGATURE FL}"
    ),
    # Unaccented-but-uncommon uppercase letters (Greek and Cyrillic ranges)
    "upper_common": (
        "\N{LATIN CAPITAL LETTER THORN}"
        "\N{GREEK CAPITAL LETTER ALPHA}-\N{GREEK CAPITAL LETTER OMEGA}"
        # not included under 'accented' because these can commonly
        # occur at ends of words, in positions where they'd be detected
        # as mojibake
        "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER ETA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}"
        "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}"
        "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}"
        "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}"
        "\N{CYRILLIC CAPITAL LETTER IO}-\N{CYRILLIC CAPITAL LETTER YA}"
    ),
    # Unaccented-but-uncommon lowercase letters (Greek and Cyrillic ranges)
    "lower_common": (
        # lowercase thorn does not appear in mojibake
        "\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER OMEGA}"
        "\N{GREEK SMALL LETTER ALPHA WITH TONOS}"
        "\N{GREEK SMALL LETTER EPSILON WITH TONOS}"
        "\N{GREEK SMALL LETTER ETA WITH TONOS}"
        "\N{GREEK SMALL LETTER IOTA WITH TONOS}"
        "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}"
        "\N{CYRILLIC SMALL LETTER A}-\N{CYRILLIC SMALL LETTER DZHE}"
    ),
    # Box-drawing and block characters
    "box": (
        # omit the single horizontal line, might be used in kaomoji
        "│┌┐┘├┤┬┼"
        "\N{BOX DRAWINGS DOUBLE HORIZONTAL}-\N{BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL}"
        "▀▄█▌▐░▒▓"
    ),
}

262 

263 

264# We can now build a regular expression that detects unlikely juxtapositions 

265# of characters, mostly based on their categories. 

266# 

267# Another regular expression, which detects sequences that look more specifically 

268# like UTF-8 mojibake, appears in chardata.py. 

269# 

270# This is a verbose regular expression, with whitespace added for somewhat more 

271# readability. Remember that the only spaces that count as literal spaces in this 

272# expression are ones inside character classes (square brackets). 

273 

# A verbose regex: every alternative is a character-sequence pattern that is
# much more likely to be mojibake than intended text. The {category} slots are
# filled in from MOJIBAKE_CATEGORIES by str.format below, producing character
# classes. Remember that in re.VERBOSE, whitespace outside of square brackets
# is ignored, and '#' starts a comment.
BADNESS_RE = re.compile(
    r"""
    [{c1}]
    |
    [{bad}{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}{law}] [{bad}]
    |
    [a-zA-Z] [{lower_common}{upper_common}] [{bad}]
    |
    [{bad}] [{lower_accented}{upper_accented}{box}{start_punctuation}{end_punctuation}{currency}{numeric}{law}]
    |
    [{lower_accented}{lower_common}{box}{end_punctuation}{currency}{numeric}] [{upper_accented}]
    |
    [{box}{end_punctuation}{currency}{numeric}] [{lower_accented}]
    |
    [{lower_accented}{box}{end_punctuation}] [{currency}]
    |
    \s [{upper_accented}] [{currency}]
    |
    [{upper_accented}{box}] [{numeric}{law}]
    |
    [{lower_accented}{upper_accented}{box}{currency}{end_punctuation}] [{start_punctuation}] [{numeric}]
    |
    [{lower_accented}{upper_accented}{currency}{numeric}{box}{law}] [{end_punctuation}] [{start_punctuation}]
    |
    [{currency}{numeric}{box}] [{start_punctuation}]
    |
    [a-z] [{upper_accented}] [{start_punctuation}{currency}]
    |
    [{box}] [{kaomoji}]
    |
    [{lower_accented}{upper_accented}{currency}{numeric}{start_punctuation}{end_punctuation}{law}] [{box}]
    |
    [{box}] [{end_punctuation}]
    |
    [{lower_accented}{upper_accented}] [{start_punctuation}{end_punctuation}] \w
    |

    # The ligature œ when not followed by an unaccented Latin letter
    [Œœ][^A-Za-z]
    |

    # Degree signs after capital letters
    [{upper_accented}]°
    |

    # Common Windows-1252 2-character mojibake that isn't covered by the cases above
    [ÂÃÎÐ][€œŠš¢£Ÿž\xa0\xad®©°·»{start_punctuation}{end_punctuation}–—´]
    |
    × [²³]
    |
    # Windows-1252 mojibake of Arabic words needs to include the 'common' characters.
    # To compensate, we require four characters to be matched.
    [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»]
    [ØÙ] [{common}{currency}{bad}{numeric}{start_punctuation}ŸŠ®°µ»]
    |

    # Windows-1252 mojibake that starts 3-character sequences for some South Asian
    # alphabets
    à[²µ¹¼½¾]
    |

    # MacRoman mojibake that isn't covered by the cases above
    √[±∂†≠®™´≤≥¥µø]
    |
    ≈[°¢]
    |
    ‚Ä[ìîïòôúùû†°¢π]
    |
    ‚[âó][àä°ê]
    |

    # Windows-1251 mojibake of characters in the U+2000 range
    # NOTE(review): this lone character may have been truncated by the text
    # extraction this file went through — verify against the upstream source.
    †
    |

    # Windows-1251 mojibake of Latin-1 characters and/or the Cyrillic alphabet.
    # Because the 2-character sequences involved here may be common, we require
    # seeing a 3-character sequence.
    [ВГРС][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°µ][ВГРС]
    |
    # A distinctive five-character sequence of Cyrillic letters, which can be
    # Windows-1251 mojibake on top of Latin-1 mojibake of Windows-1252 characters.
    # Require a Latin letter nearby.
    ГўВЂВ.[A-Za-z ]
    |

    # Windows-1252 encodings of 'à' and 'á', as well as \xa0 itself
    Ã[\xa0¡]
    |
    [a-z]\s?[ÃÂ][ ]
    |
    ^[ÃÂ][ ]
    |

    # Cases where Â precedes a character as an encoding of exactly the same
    # character, and the character is common enough
    [a-z.,?!{end_punctuation}] Â [ {start_punctuation}{end_punctuation}]
    |

    # Windows-1253 mojibake of characters in the U+2000 range
    β€[™\xa0Ά\xad®°]
    |

    # Windows-1253 mojibake of Latin-1 characters and/or the Greek alphabet
    [ΒΓΞΟ][{c1}{bad}{start_punctuation}{end_punctuation}{currency}°][ΒΓΞΟ]
    |

    # Windows-1257 mojibake of characters in the U+2000 range
    # NOTE(review): this lone character may also be an extraction artifact —
    # verify against the upstream source.
    †
    """.format(
        **MOJIBAKE_CATEGORIES
    ),
    re.VERBOSE,
)

388 

389 

def sequence_weirdness(text: str) -> int:
    """
    This was the name of the heuristic used in ftfy 2.x through 5.x. As an
    attempt at compatibility with external code that calls the heuristic
    directly, we redirect to our new heuristic, :func:`badness`.
    """
    # Emit a proper DeprecationWarning (the default UserWarning can't be
    # filtered as a deprecation), and point the warning at the caller's frame
    # rather than this shim via stacklevel=2.
    warnings.warn(
        "`sequence_weirdness()` is an old heuristic, and the current "
        "closest equivalent is `ftfy.badness.badness()`",
        DeprecationWarning,
        stacklevel=2,
    )
    return badness(text)

401 

402 

def badness(text: str) -> int:
    """
    Count the unlikely character sequences ("mojibake clues") that appear
    in `text`.

    Any result greater than 0 indicates that some of the text seems to be
    mojibake.
    """
    # Iterate over non-overlapping matches and tally them, one per clue.
    return sum(1 for _ in BADNESS_RE.finditer(text))

410 

411 

def is_bad(text: str) -> bool:
    """
    Tell whether `text` appears to contain mojibake.

    This is faster than computing `badness`, because it stops scanning at
    the first match of the heuristic regex instead of counting every match.
    Keep in mind that longer strings have more opportunities to match, and
    so are more likely to come back True.
    """
    return BADNESS_RE.search(text) is not None