######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <https://www.gnu.org/licenses/>.
######################### END LICENSE BLOCK #########################

import logging
import re
from typing import Optional, Union

from .enums import EncodingEra, LanguageFilter, ProbingState
from .metadata.charsets import Charset, get_charset

INTERNATIONAL_WORDS_PATTERN = re.compile(
    # Pattern rationale (see paper section 4.7, Two-Char Sequence Distribution):
    # we drop words composed solely of ASCII letters for scripts without Latin
    # letters, retaining any word that contains at least one high-byte (>= 0x80)
    # character. Structure: optional ASCII prefix + one or more high-byte chars
    # + optional ASCII suffix + optional single trailing marker.
    b"[a-zA-Z]*[\x80-\xff]+[a-zA-Z]*[^a-zA-Z\x80-\xff]?"
)
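
# Added illustration (not part of the original module): pure-ASCII words never
# match, while a word containing at least one high-byte character is captured
# together with at most one trailing marker byte:
#
#     >>> INTERNATIONAL_WORDS_PATTERN.findall(b"the caf\xe9 is open")
#     [b'caf\xe9 ']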


class CharSetProber:
    # A prober may shortcut straight to FOUND_IT once its confidence
    # exceeds this threshold.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(
        self,
        *,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.ALL,
    ) -> None:
        self._state = ProbingState.DETECTING
        self.active = True
        self.lang_filter = lang_filter
        self.encoding_era = encoding_era
        self.logger = logging.getLogger(__name__)

    def reset(self) -> None:
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self) -> Optional[str]:
        return None

    @property
    def charset(self) -> Optional[Charset]:
        """Return the Charset metadata for this prober's encoding."""
        name = self.charset_name
        if name is None:
            return None
        return get_charset(name)

    @property
    def language(self) -> Optional[str]:
        raise NotImplementedError

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        raise NotImplementedError

    @property
    def state(self) -> ProbingState:
        return self._state

    def get_confidence(self) -> float:
        return 0.0
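
    # A usage sketch added for illustration (not part of the original
    # module). ``SomeProber`` is a hypothetical stand-in for any concrete
    # subclass, which overrides ``feed`` (and usually ``get_confidence``):
    #
    #     prober = SomeProber()
    #     prober.reset()
    #     for chunk in chunks:
    #         if prober.feed(chunk) == ProbingState.FOUND_IT:
    #             break
    #     print(prober.charset_name, prober.get_confidence())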

    @staticmethod
    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
        # Collapse every run of ASCII bytes (0x00-0x7f) into a single space,
        # leaving only the high-byte characters for analysis.
        buf = re.sub(b"([\x00-\x7f])+", b" ", buf)
        return buf
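
    # Added illustration (not part of the original module): every ASCII run
    # collapses to one space, so only high-byte characters survive:
    #
    #     >>> CharSetProber.filter_high_byte_only(b"abc\xe9\xe8def\xfa")
    #     b' \xe9\xe8 \xfa'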

    @staticmethod
    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
        r"""Filter out ASCII-only words for non-Latin scripts.

        Byte classes:

        - alphabet: ASCII letters ``[a-zA-Z]``
        - international: bytes with the high bit set ``[\x80-\xff]``
        - marker: everything else ``[^a-zA-Z\x80-\xff]``

        The buffer is treated as a sequence of "words" separated by marker
        bytes. We KEEP only those words that contain at least one high-byte
        character, i.e. that match the pattern: optional ASCII prefix + one
        or more high-byte characters + optional ASCII suffix, plus at most
        one trailing marker. Pure ASCII words are discarded as noise when the
        target language model excludes ASCII letters ("English words in
        other-language pages"; see the paper's section 4.7 summary).

        Why we retain surrounding ASCII letters instead of stripping them:

        - It preserves real adjacency for bigram modeling around high-byte
          letters.
        - It avoids creating artificial bigrams between non-adjacent
          high-byte characters.

        Trailing marker normalization: a single marker at the end of a word
        is converted to a space if it is a non-letter ASCII byte, collapsing
        runs of markers into one delimiter (this reduces noise such as
        repeated punctuation or HTML artifacts).

        Usage is conditional: callers apply this ONLY when the language
        model's ``keep_ascii_letters`` is False (see
        ``SingleByteCharSetProber.feed``). Latin-script languages skip this
        filter and use ``remove_xml_tags`` instead.

        This behavior mirrors the original universalchardet / uchardet
        approach and aligns with the training pipeline, which excludes ASCII
        letters for non-Latin alphabets.
        """
        filtered = bytearray()

        # This regex keeps only words that have at least one international
        # character. The word may include one marker character at the end.
        words = INTERNATIONAL_WORDS_PATTERN.findall(buf)

        for word in words:
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with
            # a space, as markers shouldn't affect our analysis (they are
            # used similarly across all languages and may thus have similar
            # frequencies).
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b"\x80":
                last_char = b" "
            filtered.extend(last_char)

        return filtered
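
    # Added illustration (not part of the original module): the pure-ASCII
    # words disappear and the trailing comma is normalized to a space:
    #
    #     >>> bytes(CharSetProber.filter_international_words(b"Hello caf\xe9, world"))
    #     b'caf\xe9 '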

    @staticmethod
    def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytearray:
        """
        Return a copy of ``buf`` that retains only the sequences of English
        alphabet and high-byte characters that are not between ``<>``
        characters. This filter can be applied to all scripts that contain
        both English characters and extended ASCII characters, but it is
        currently only used by ``Latin1Prober``.
        """
        filtered = bytearray()
        in_tag = False
        prev = 0
        buf_view = memoryview(buf).cast("c")

        for curr, buf_char in enumerate(buf_view):
            # Check if we're coming out of or entering an XML tag.

            # https://github.com/python/typeshed/issues/8182
            if buf_char == b">":  # type: ignore[comparison-overlap]
                prev = curr + 1
                in_tag = False
            # https://github.com/python/typeshed/issues/8182
            elif buf_char == b"<":  # type: ignore[comparison-overlap]
                if curr > prev and not in_tag:
                    # Keep everything after the last non-extended-ASCII,
                    # non-alphabetic character.
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit the stretch we kept.
                    filtered.extend(b" ")
                in_tag = True

        # If we're not in a tag...
        if not in_tag:
            # Keep everything after the last non-extended-ASCII,
            # non-alphabetic character.
            filtered.extend(buf[prev:])

        return filtered
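
    # Added illustration (not part of the original module): tag bodies are
    # dropped and each kept stretch is followed by a delimiting space:
    #
    #     >>> bytes(CharSetProber.remove_xml_tags(b"<p>caf\xe9</p> bar"))
    #     b'caf\xe9  bar'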