Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/chardet/charsetprober.py: 95%

1######################## BEGIN LICENSE BLOCK ########################

2# The Original Code is Mozilla Universal charset detector code.

4# The Initial Developer of the Original Code is

5# Netscape Communications Corporation.

9# Contributor(s):

10# Mark Pilgrim - port to Python

11# Shy Shalom - original C code

12#

13# This library is free software; you can redistribute it and/or

14# modify it under the terms of the GNU Lesser General Public

15# License as published by the Free Software Foundation; either

16# version 2.1 of the License, or (at your option) any later version.

17#

18# This library is distributed in the hope that it will be useful,

19# but WITHOUT ANY WARRANTY; without even the implied warranty of

20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

21# Lesser General Public License for more details.

22#

23# You should have received a copy of the GNU Lesser General Public

24# License along with this library; if not, write to the Free Software

25# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

26# 02110-1301 USA

27######################### END LICENSE BLOCK #########################

29import logging

30import re

31from typing import Optional, Union

33from .enums import LanguageFilter, ProbingState

# Matches one "international word" in a byte buffer: optional ASCII letters,
# then at least one high byte (0x80-0xFF), then optional ASCII letters, and
# finally at most one trailing "marker" byte (anything that is neither an
# ASCII letter nor a high byte). The two adjacent literals below are joined
# by the compiler into a single pattern.
INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*[\x80-\xff]+[a-zA-Z]*"
    b"[^a-zA-Z\x80-\xff]?"
)

class CharSetProber:
    """Common base for character-set probers.

    The base class only tracks a probing state and offers byte-buffer
    filtering helpers. ``feed`` and ``language`` raise
    ``NotImplementedError`` here, so a usable prober has to override them;
    ``charset_name`` and ``get_confidence`` return the neutral values
    ``None`` and ``0.0``.
    """

    # Not referenced anywhere inside this class; presumably a confidence
    # cut-off consulted by subclasses or callers -- TODO confirm.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
        """Create a prober that starts out in ``ProbingState.DETECTING``.

        :param lang_filter: stored unchanged on ``self.lang_filter``; this
            class itself never reads it.
        """
        self._state = ProbingState.DETECTING
        self.active = True
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self) -> None:
        """Put the prober back into ``ProbingState.DETECTING``.

        Only the state is touched; ``active``, ``lang_filter`` and
        ``logger`` keep their current values.
        """
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self) -> Optional[str]:
        """Name of the detected charset; the base class always gives ``None``."""
        return None

    @property
    def language(self) -> Optional[str]:
        """Detected language; not implemented in the base class.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume a chunk of input; not implemented in the base class.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError

    @property
    def state(self) -> ProbingState:
        """Current probing state (read-only view of ``self._state``)."""
        return self._state

    def get_confidence(self) -> float:
        """Confidence of the current guess; the base class always gives ``0.0``."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
        """Collapse every run of ASCII bytes (0x00-0x7F) into one space.

        High bytes (0x80-0xFF) are passed through untouched.
        """
        return re.sub(b"([\x00-\x7f])+", b" ", buf)

    @staticmethod
    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
        """Keep only the words of ``buf`` that contain a high byte.

        Bytes fall into three classes:

        * alphabet: ASCII letters ``a-z`` / ``A-Z``
        * international: high bytes 0x80-0xFF
        * marker: every other byte

        ``INTERNATIONAL_WORDS_PATTERN`` picks out each stretch of alphabet
        and international bytes that has at least one international byte,
        together with at most one marker byte directly after it. Those
        matches are concatenated into the result, with a trailing marker
        replaced by a single ASCII space; everything the pattern does not
        match is dropped.
        """
        result = bytearray()

        for match in INTERNATIONAL_WORDS_PATTERN.findall(buf):
            head, tail = match[:-1], match[-1:]
            result.extend(head)

            # A final byte that is neither a high byte nor an ASCII letter
            # is a marker. Markers are used alike across languages, so they
            # are normalised to a space rather than kept verbatim.
            if tail < b"\x80" and not tail.isalpha():
                tail = b" "
            result.extend(tail)

        return result

    @staticmethod
    def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
        """Return a copy of ``buf`` without the text enclosed in ``<...>``.

        The buffer is scanned byte by byte:

        * ``>`` ends any open tag and restarts the current text stretch
          right after it. Text pending before a ``>`` that had no opening
          ``<`` is therefore discarded as well.
        * ``<`` opens a tag. A non-empty text stretch in front of it is
          copied to the output followed by one space.
        * At the end of the buffer the remaining stretch is copied (without
          a trailing space) unless a tag is still open.

        The object actually returned is a ``bytearray``, even though the
        annotation says ``bytes``.
        """
        open_angle = ord("<")
        close_angle = ord(">")

        kept = bytearray()
        inside_tag = False
        stretch_start = 0
        # Byte-wise, zero-copy view: iteration yields ints and slices do not
        # copy the underlying buffer.
        view = memoryview(buf).cast("B")

        for index, byte in enumerate(view):
            if byte == close_angle:
                # Leaving a tag: the next text stretch begins after '>'.
                stretch_start = index + 1
                inside_tag = False
            elif byte == open_angle:
                # Entering a tag: flush the text collected so far, if any,
                # and delimit it with a space.
                if not inside_tag and index > stretch_start:
                    kept.extend(view[stretch_start:index])
                    kept.extend(b" ")
                inside_tag = True

        # Trailing text after the last tag (or the whole buffer when it
        # contains no tags at all).
        if not inside_tag:
            kept.extend(view[stretch_start:])

        return kept