1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Universal charset detector code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 2001
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11# Shy Shalom - original C code
12#
13# This library is free software; you can redistribute it and/or
14# modify it under the terms of the GNU Lesser General Public
15# License as published by the Free Software Foundation; either
16# version 2.1 of the License, or (at your option) any later version.
17#
18# This library is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# Lesser General Public License for more details.
22#
23# You should have received a copy of the GNU Lesser General Public
24# License along with this library; if not, see
25# <https://www.gnu.org/licenses/>.
26######################### END LICENSE BLOCK #########################
27
28import logging
29import re
30from typing import Optional, Union
31
32from .enums import LanguageFilter, ProbingState
33
# Matches one "word" containing at least one high byte (0x80-0xFF): optional
# ASCII letters, one or more high bytes, optional ASCII letters, and then at
# most one trailing byte that is neither an ASCII letter nor a high byte.
# Compiled once at import time; used by
# CharSetProber.filter_international_words.
INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*[\x80-\xff]+[a-zA-Z]*[^a-zA-Z\x80-\xff]?"
)
37
38
class CharSetProber:
    """Prober skeleton plus byte-filtering helpers.

    ``feed`` and ``language`` raise ``NotImplementedError`` here, so concrete
    probers are expected to override them. ``charset_name`` and
    ``get_confidence`` return neutral defaults (``None`` and ``0.0``).
    """

    # Not referenced inside this class; presumably a confidence cut-off
    # consulted by subclasses or callers -- confirm against them.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
        """Start in the DETECTING state, marked active, with its own logger."""
        self.lang_filter = lang_filter
        self.active = True
        self._state = ProbingState.DETECTING
        self.logger = logging.getLogger(__name__)

    def reset(self) -> None:
        """Put the prober back into the DETECTING state.

        Only ``_state`` is touched; ``active`` and ``lang_filter`` keep their
        current values.
        """
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self) -> Optional[str]:
        """Name of the detected charset; this implementation has none."""
        return None

    @property
    def language(self) -> Optional[str]:
        """Detected language; must be supplied by an overriding class."""
        raise NotImplementedError

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume a chunk of input; must be supplied by an overriding class."""
        raise NotImplementedError

    @property
    def state(self) -> ProbingState:
        """Current probing state (read-only view of ``_state``)."""
        return self._state

    def get_confidence(self) -> float:
        """Confidence in the detection; this implementation reports 0.0."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
        """Collapse each run of bytes in 0x00-0x7F into one ASCII space.

        Bytes at or above 0x80 pass through unchanged.
        """
        return re.sub(b"([\x00-\x7f])+", b" ", buf)

    @staticmethod
    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
        """Keep only the words that contain at least one high byte.

        Three kinds of bytes are distinguished:
            alphabet: ASCII letters (a-z, A-Z)
            international: bytes in the range 0x80-0xFF
            marker: every other byte

        Each match of ``INTERNATIONAL_WORDS_PATTERN`` is copied to the output.
        When a match ends with a marker, that marker is written as a single
        ASCII space instead. Bytes that are not part of any match are left
        out of the result.
        """
        result = bytearray()

        for word in INTERNATIONAL_WORDS_PATTERN.findall(buf):
            body, tail = word[:-1], word[-1:]
            result.extend(body)

            # A trailing marker (below 0x80 and not an ASCII letter) is
            # normalised to a space: markers are used alike across languages,
            # so their frequencies should not influence the analysis.
            tail_is_marker = tail < b"\x80" and not tail.isalpha()
            result.extend(b" " if tail_is_marker else tail)

        return result

    @staticmethod
    def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytearray:
        """Return a copy of ``buf`` without the bytes enclosed in ``<...>``.

        The buffer is scanned one byte at a time:

        * ``<`` opens a tag. If not already inside a tag, and at least one
          byte has been seen since the current segment start, that segment is
          copied to the output followed by a single space.
        * ``>`` closes a tag and moves the segment start to the byte after
          it, whether or not a tag was open.
        * At the end of the buffer, the remaining segment is copied as-is
          unless the scan finished inside a tag.

        The original documentation states this is currently used only by
        ``Latin1Prober``; that cannot be confirmed from this file.
        """
        kept = bytearray()
        segment_start = 0
        inside_tag = False
        # One-byte ``bytes`` items, so they can be compared with b"<" / b">".
        byte_view = memoryview(buf).cast("c")

        for index, char in enumerate(byte_view):
            # https://github.com/python/typeshed/issues/8182
            if char == b">":  # type: ignore[comparison-overlap]
                # Tag closed: the next kept segment begins right after it.
                segment_start = index + 1
                inside_tag = False
                continue

            # https://github.com/python/typeshed/issues/8182
            if char != b"<":  # type: ignore[comparison-overlap]
                continue

            # Tag opened: flush the pending non-empty segment, then add a
            # space so that consecutive segments stay delimited.
            if not inside_tag and index > segment_start:
                kept.extend(buf[segment_start:index])
                kept.extend(b" ")
            inside_tag = True

        # Whatever follows the last closing ">" is kept too, unless the
        # buffer ended in the middle of a tag.
        if not inside_tag:
            kept.extend(buf[segment_start:])

        return kept