######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <https://www.gnu.org/licenses/>.
######################### END LICENSE BLOCK #########################

import logging
import re
from typing import Optional, Union

from .enums import EncodingEra, LanguageFilter, ProbingState
from .metadata.charsets import Charset, get_charset

INTERNATIONAL_WORDS_PATTERN = re.compile(
    # Pattern rationale (see paper section 4.7, Two-Char Sequence Distribution):
    # we drop words composed solely of ASCII letters for scripts without Latin
    # letters, retaining any word containing at least one high-byte (>= 0x80)
    # character.
    # Structure: optional ASCII prefix + one or more high-byte chars + optional
    # ASCII suffix + optional single trailing marker.
    b"[a-zA-Z]*[\x80-\xff]+[a-zA-Z]*[^a-zA-Z\x80-\xff]?"
)
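# Illustrative behavior of the pattern (the bytes happen to be CP1251 for the
# Russian word "Привет"; the encoding choice is only an example):
#     >>> INTERNATIONAL_WORDS_PATTERN.findall(b"hello \xcf\xf0\xe8\xe2\xe5\xf2! world")
#     [b'\xcf\xf0\xe8\xe2\xe5\xf2!']
# The pure-ASCII words "hello" and "world" do not match; the single trailing
# marker ("!") is captured together with the word.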


class CharSetProber:
    # A prober may shortcut to ProbingState.FOUND_IT once its confidence
    # exceeds this threshold.
    SHORTCUT_THRESHOLD = 0.95
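
    # Typical driver loop over the prober API (an illustrative sketch only; in
    # chardet proper the orchestration lives in UniversalDetector, and the
    # names ``chunks``/``encoding`` below are placeholders):
    #
    #     prober.reset()
    #     for chunk in chunks:
    #         if prober.feed(chunk) == ProbingState.FOUND_IT:
    #             break
    #     if prober.get_confidence() >= CharSetProber.SHORTCUT_THRESHOLD:
    #         encoding = prober.charset_name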

    def __init__(
        self,
        *,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.ALL,
    ) -> None:
        self._state = ProbingState.DETECTING
        self.active = True
        self.lang_filter = lang_filter
        self.encoding_era = encoding_era
        self.logger = logging.getLogger(__name__)

    def reset(self) -> None:
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self) -> Optional[str]:
        return None

    @property
    def charset(self) -> Optional[Charset]:
        """Return the Charset metadata for this prober's encoding."""
        name = self.charset_name
        if name is None:
            return None
        return get_charset(name)

    @property
    def language(self) -> Optional[str]:
        raise NotImplementedError

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        raise NotImplementedError

    @property
    def state(self) -> ProbingState:
        return self._state

    def get_confidence(self) -> float:
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
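        # Collapse every run of ASCII bytes (0x00-0x7F) into a single space,
        # keeping only the high-byte sequences. Illustrative example (the
        # input bytes happen to be UTF-8 for "аб"):
        #     >>> CharSetProber.filter_high_byte_only(b"abc\xd0\xb0\xd0\xb1 def")
        #     b' \xd0\xb0\xd0\xb1 '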
        buf = re.sub(b"[\x00-\x7f]+", b" ", buf)
        return buf

    @staticmethod
    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
        r"""Filter out ASCII-only words for non-Latin scripts.

        Byte classes:
          - alphabet: ASCII letters [a-zA-Z]
          - international: bytes with the high bit set [\x80-\xff]
          - marker: everything else [^a-zA-Z\x80-\xff]

        The buffer is treated as a sequence of "words" separated by marker
        bytes. We KEEP only the words that contain at least one high-byte
        character, i.e. those matching the pattern: optional ASCII prefix +
        one or more high-byte chars + optional ASCII suffix, plus at most one
        trailing marker. Pure-ASCII words are discarded as noise when the
        target language model excludes ASCII letters ("English words in
        other-language pages", per the paper's section 4.7 summary).

        Why we retain the surrounding ASCII letters instead of stripping
        them:
          - It preserves real adjacency for bigram modeling around high-byte
            letters.
          - It avoids creating artificial bigrams between non-adjacent
            high-byte characters.

        Trailing marker normalization: a single marker at the end of a word
        (by construction a non-alphabetic ASCII byte) is replaced with a
        space, so runs of markers between words collapse into one delimiter.
        This reduces noise such as repeated punctuation or HTML artifacts.

        Usage is conditional: callers apply this filter ONLY when the
        language model's ``keep_ascii_letters`` is False (see
        ``SingleByteCharSetProber.feed``); Latin-script languages skip it and
        use ``remove_xml_tags`` instead.

        This behavior mirrors the original universalchardet / uchardet
        approach and matches the training pipeline, which excludes ASCII
        letters for non-Latin alphabets.
        """
        filtered = bytearray()

        # The regex keeps only words that contain at least one international
        # (high-byte) character; a word may also include a single trailing
        # marker character.
        words = INTERNATIONAL_WORDS_PATTERN.findall(buf)

        for word in words:
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with
            # a space: markers shouldn't affect our analysis, since they are
            # used similarly across all languages and would therefore have
            # similar frequencies everywhere.
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b"\x80":
                last_char = b" "
            filtered.extend(last_char)

        return filtered

    @staticmethod
    def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytearray:
        """
        Return a copy of ``buf`` that retains only the byte sequences lying
        outside ``<``...``>`` tags, i.e. strips the markup and keeps the text
        between tags, delimiting each kept stretch with a space. This filter
        is applicable to any script that mixes ASCII letters with
        extended-ASCII characters, but it is currently only used by
        ``Latin1Prober``.
        """
        filtered = bytearray()
        in_tag = False
        prev = 0
        buf_view = memoryview(buf).cast("c")

        for curr, buf_char in enumerate(buf_view):
            # Check if we're coming out of or entering an XML tag

            # https://github.com/python/typeshed/issues/8182
            if buf_char == b">":  # type: ignore[comparison-overlap]
                prev = curr + 1
                in_tag = False
            # https://github.com/python/typeshed/issues/8182
            elif buf_char == b"<":  # type: ignore[comparison-overlap]
                if curr > prev and not in_tag:
                    # Keep the text accumulated since the end of the last tag
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit the stretch we kept
                    filtered.extend(b" ")
                in_tag = True

        # If the buffer didn't end inside a tag, keep the trailing text since
        # the end of the last tag
        if not in_tag:
            filtered.extend(buf[prev:])

        return filtered
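

if __name__ == "__main__":
    # Ad-hoc demonstration of the two filters chained together (an
    # illustrative sketch, not part of the library API). Because this module
    # uses relative imports, run it via ``python -m`` from the package root.
    # The sample bytes are CP1251 for the Russian word "Привет".
    sample = b"<p>hello \xcf\xf0\xe8\xe2\xe5\xf2!</p>"
    text = CharSetProber.remove_xml_tags(sample)
    print(bytes(text))  # b'hello \xcf\xf0\xe8\xe2\xe5\xf2! '
    words = CharSetProber.filter_international_words(text)
    print(bytes(words))  # b'\xcf\xf0\xe8\xe2\xe5\xf2 '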