######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <https://www.gnu.org/licenses/>.
######################### END LICENSE BLOCK #########################

import logging
import re
from typing import Optional, Union

from .enums import EncodingEra, LanguageFilter, ProbingState
from .metadata.charsets import Charset, get_charset

INTERNATIONAL_WORDS_PATTERN = re.compile(
    # Pattern rationale (see paper section 4.7, Two-Char Sequence Distribution):
    # we drop words composed solely of ASCII letters for scripts without Latin
    # letters, retaining any word containing at least one high-byte (>= 0x80)
    # character.
    # Structure: optional ASCII prefix + one or more high-byte chars + optional
    # ASCII suffix + optional single trailing marker.
    b"[a-zA-Z]*[\x80-\xff]+[a-zA-Z]*[^a-zA-Z\x80-\xff]?"
)
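# Illustrative behavior of the pattern (the bytes happen to be CP1251 for the
# Russian word "Привет"; the encoding choice is only an example):
#     >>> INTERNATIONAL_WORDS_PATTERN.findall(b"hello \xcf\xf0\xe8\xe2\xe5\xf2! world")
#     [b'\xcf\xf0\xe8\xe2\xe5\xf2!']
# The pure-ASCII words "hello" and "world" do not match; the single trailing
# marker ("!") is captured together with the word.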


class CharSetProber:
    # A prober may shortcut to ProbingState.FOUND_IT once its confidence
    # exceeds this threshold.
    SHORTCUT_THRESHOLD = 0.95
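
    # Typical driver loop over the prober API (an illustrative sketch only; in
    # chardet proper the orchestration lives in UniversalDetector, and the
    # names ``chunks``/``encoding`` below are placeholders):
    #
    #     prober.reset()
    #     for chunk in chunks:
    #         if prober.feed(chunk) == ProbingState.FOUND_IT:
    #             break
    #     if prober.get_confidence() >= CharSetProber.SHORTCUT_THRESHOLD:
    #         encoding = prober.charset_name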

    def __init__(
        self,
        *,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.ALL,
    ) -> None:
        self._state = ProbingState.DETECTING
        self.active = True
        self.lang_filter = lang_filter
        self.encoding_era = encoding_era
        self.logger = logging.getLogger(__name__)

    def reset(self) -> None:
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self) -> Optional[str]:
        return None

    @property
    def charset(self) -> Optional[Charset]:
        """Return the Charset metadata for this prober's encoding."""
        name = self.charset_name
        if name is None:
            return None
        return get_charset(name)

    @property
    def language(self) -> Optional[str]:
        raise NotImplementedError

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        raise NotImplementedError

    @property
    def state(self) -> ProbingState:
        return self._state

    def get_confidence(self) -> float:
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
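        # Collapse every run of ASCII bytes (0x00-0x7F) into a single space,
        # keeping only the high-byte sequences. Illustrative example (the
        # input bytes happen to be UTF-8 for "аб"):
        #     >>> CharSetProber.filter_high_byte_only(b"abc\xd0\xb0\xd0\xb1 def")
        #     b' \xd0\xb0\xd0\xb1 '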
        buf = re.sub(b"[\x00-\x7f]+", b" ", buf)
        return buf

    @staticmethod
    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
        r"""Filter out ASCII-only words for non-Latin scripts.

        Byte classes:
          - alphabet: ASCII letters [a-zA-Z]
          - international: bytes with the high bit set [\x80-\xff]
          - marker: everything else [^a-zA-Z\x80-\xff]

        The buffer is treated as a sequence of "words" separated by marker
        bytes. We KEEP only the words that contain at least one high-byte
        character, i.e. those matching the pattern: optional ASCII prefix +
        one or more high-byte chars + optional ASCII suffix, plus at most one
        trailing marker. Pure-ASCII words are discarded as noise when the
        target language model excludes ASCII letters ("English words in
        other-language pages", per the paper's section 4.7 summary).

        Why we retain the surrounding ASCII letters instead of stripping
        them:
          - It preserves real adjacency for bigram modeling around high-byte
            letters.
          - It avoids creating artificial bigrams between non-adjacent
            high-byte characters.

        Trailing marker normalization: a single marker at the end of a word
        (by construction a non-alphabetic ASCII byte) is replaced with a
        space, so runs of markers between words collapse into one delimiter.
        This reduces noise such as repeated punctuation or HTML artifacts.

        Usage is conditional: callers apply this filter ONLY when the
        language model's ``keep_ascii_letters`` is False (see
        ``SingleByteCharSetProber.feed``); Latin-script languages skip it and
        use ``remove_xml_tags`` instead.

        This behavior mirrors the original universalchardet / uchardet
        approach and matches the training pipeline, which excludes ASCII
        letters for non-Latin alphabets.
        """
        filtered = bytearray()

        # The regex keeps only words that contain at least one international
        # (high-byte) character; a word may also include a single trailing
        # marker character.
        words = INTERNATIONAL_WORDS_PATTERN.findall(buf)

        for word in words:
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with
            # a space: markers shouldn't affect our analysis, since they are
            # used similarly across all languages and would therefore have
            # similar frequencies everywhere.
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b"\x80":
                last_char = b" "
            filtered.extend(last_char)

        return filtered

    @staticmethod
    def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytearray:
        """
        Return a copy of ``buf`` that retains only the byte sequences lying
        outside ``<``...``>`` tags, i.e. strips the markup and keeps the text
        between tags, delimiting each kept stretch with a space. This filter
        is applicable to any script that mixes ASCII letters with
        extended-ASCII characters, but it is currently only used by
        ``Latin1Prober``.
        """
        filtered = bytearray()
        in_tag = False
        prev = 0
        buf_view = memoryview(buf).cast("c")

        for curr, buf_char in enumerate(buf_view):
            # Check if we're coming out of or entering an XML tag

            # https://github.com/python/typeshed/issues/8182
            if buf_char == b">":  # type: ignore[comparison-overlap]
                prev = curr + 1
                in_tag = False
            # https://github.com/python/typeshed/issues/8182
            elif buf_char == b"<":  # type: ignore[comparison-overlap]
                if curr > prev and not in_tag:
                    # Keep the text accumulated since the end of the last tag
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit the stretch we kept
                    filtered.extend(b" ")
                in_tag = True

        # If the buffer didn't end inside a tag, keep the trailing text since
        # the end of the last tag
        if not in_tag:
            filtered.extend(buf[prev:])

        return filtered
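

if __name__ == "__main__":
    # Ad-hoc demonstration of the two filters chained together (an
    # illustrative sketch, not part of the library API). Because this module
    # uses relative imports, run it via ``python -m`` from the package root.
    # The sample bytes are CP1251 for the Russian word "Привет".
    sample = b"<p>hello \xcf\xf0\xe8\xe2\xe5\xf2!</p>"
    text = CharSetProber.remove_xml_tags(sample)
    print(bytes(text))  # b'hello \xcf\xf0\xe8\xe2\xe5\xf2! '
    words = CharSetProber.filter_international_words(text)
    print(bytes(words))  # b'\xcf\xf0\xe8\xe2\xe5\xf2 '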