1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Universal charset detector code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 2001
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11# Shy Shalom - original C code
12#
13# This library is free software; you can redistribute it and/or
14# modify it under the terms of the GNU Lesser General Public
15# License as published by the Free Software Foundation; either
16# version 2.1 of the License, or (at your option) any later version.
17#
18# This library is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# Lesser General Public License for more details.
22#
23# You should have received a copy of the GNU Lesser General Public
24# License along with this library; if not, write to the Free Software
25# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26# 02110-1301 USA
27######################### END LICENSE BLOCK #########################
28
29import logging
30import re
31from typing import Optional, Union
32
33from .enums import LanguageFilter, ProbingState
34
# Matches one "international word" in a bytes buffer: optional ASCII letters,
# one or more high bytes (0x80-0xFF), optional ASCII letters, and then at most
# one trailing byte that is neither an ASCII letter nor a high byte.
# Compiled once at import time; used by
# ``CharSetProber.filter_international_words``.
INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*[\x80-\xff]+[a-zA-Z]*[^a-zA-Z\x80-\xff]?"
)
38
39
class CharSetProber:
    """
    Common state and buffer-filtering helpers for charset probers.

    ``language`` and ``feed`` raise ``NotImplementedError`` here, so concrete
    probers are expected to override them. ``charset_name`` and
    ``get_confidence`` return neutral defaults (``None`` and ``0.0``).
    """

    # NOTE(review): not referenced inside this class; presumably the
    # confidence above which a caller may stop probing early -- confirm
    # against the callers.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
        """
        Create a prober in the ``DETECTING`` state.

        ``lang_filter`` is stored on the instance as ``lang_filter``; this
        class does not read it itself.
        """
        self._state = ProbingState.DETECTING
        # NOTE(review): ``active`` is never read in this class; presumably it
        # is toggled by code that manages several probers -- confirm.
        self.active = True
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self) -> None:
        """
        Put the prober back into the ``DETECTING`` state.

        Only ``_state`` is reset; ``active`` and ``lang_filter`` keep their
        current values.
        """
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self) -> Optional[str]:
        """Name of the detected charset; this implementation returns ``None``."""
        return None

    @property
    def language(self) -> Optional[str]:
        """Detected language; not implemented in this class."""
        raise NotImplementedError

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume ``byte_str`` and return a state; not implemented here."""
        raise NotImplementedError

    @property
    def state(self) -> ProbingState:
        """Current probing state (read-only view of ``_state``)."""
        return self._state

    def get_confidence(self) -> float:
        """Return the confidence of this prober; this implementation returns 0.0."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
        """
        Replace every run of ASCII bytes (0x00-0x7F) in ``buf`` with a single
        space, leaving the high bytes (0x80-0xFF) as they are.
        """
        return re.sub(b"([\x00-\x7f])+", b" ", buf)

    @staticmethod
    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
        r"""
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [\x80-\xff]
        marker: everything else [^a-zA-Z\x80-\xff]
        The input buffer can be thought to contain a series of words delimited
        by markers. This function keeps only the words that contain at least
        one international character; a marker that directly follows such a
        word is replaced by a single ASCII space, and everything else is
        dropped.
        This filter applies to all scripts which do not use English characters.
        """
        filtered = bytearray()

        # Each match is a word with at least one international byte,
        # optionally followed by exactly one marker byte.
        for word in INTERNATIONAL_WORDS_PATTERN.findall(buf):
            filtered.extend(word[:-1])

            # The last byte is a marker only when it is neither an ASCII
            # letter nor a high byte. Markers are used similarly across all
            # languages, so they are normalised to a space to keep them from
            # affecting the analysis.
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b"\x80":
                last_char = b" "
            filtered.extend(last_char)

        return filtered

    @staticmethod
    def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytearray:
        """
        Return a ``bytearray`` copy of ``buf`` without the bytes that lie
        between ``<`` and ``>``.

        Each stretch of text that is terminated by a tag is copied verbatim
        and followed by a single space; text after the last ``>`` is copied
        verbatim unless the buffer ends inside an unclosed tag. No filtering
        by byte class is performed on the text that is kept.

        Note that a ``>`` always restarts the kept stretch, so any text that
        precedes a ``>`` without a matching ``<`` is discarded as well
        (presumably to cope with buffers that begin in the middle of a tag --
        confirm before relying on it).
        """
        tag_open = ord("<")
        tag_close = ord(">")

        filtered = bytearray()
        in_tag = False
        prev = 0  # start of the current stretch of text outside any tag
        # View the buffer as unsigned bytes: iteration yields ints and slices
        # are taken without copying.
        view = memoryview(buf).cast("B")

        for curr, byte in enumerate(view):
            if byte == tag_close:
                # Leaving a tag: the next stretch starts after the ">".
                prev = curr + 1
                in_tag = False
            elif byte == tag_open:
                if curr > prev and not in_tag:
                    # Keep the text seen since the last ">" (or the start of
                    # the buffer) and delimit it with a space.
                    filtered.extend(view[prev:curr])
                    filtered.extend(b" ")
                in_tag = True

        # Keep the trailing text unless the buffer ended inside a tag.
        if not in_tag:
            filtered.extend(view[prev:])

        return filtered