Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/chardet/charsetprober.py: 95%

1######################## BEGIN LICENSE BLOCK ########################

2# The Original Code is Mozilla Universal charset detector code.

4# The Initial Developer of the Original Code is

5# Netscape Communications Corporation.

9# Contributor(s):

10# Mark Pilgrim - port to Python

11# Shy Shalom - original C code

12#

13# This library is free software; you can redistribute it and/or

14# modify it under the terms of the GNU Lesser General Public

15# License as published by the Free Software Foundation; either

16# version 2.1 of the License, or (at your option) any later version.

17#

18# This library is distributed in the hope that it will be useful,

19# but WITHOUT ANY WARRANTY; without even the implied warranty of

20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

21# Lesser General Public License for more details.

22#

23# You should have received a copy of the GNU Lesser General Public

24# License along with this library; if not, write to the Free Software

25# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

26# 02110-1301 USA

27######################### END LICENSE BLOCK #########################

29import logging

30import re

31from typing import Optional, Union

33from .enums import LanguageFilter, ProbingState

# Matches one "international word" in a byte string: optional ASCII letters,
# then at least one high byte (0x80-0xFF), then optional ASCII letters, and
# finally at most one trailing byte that is neither an ASCII letter nor a high
# byte. The pattern has no capture groups, so ``findall`` yields whole matches.
INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
)

class CharSetProber:
    """
    Base class for charset probers.

    It holds the probing state shared by every prober and supplies three
    static byte-filtering helpers. ``feed`` and ``language`` raise
    ``NotImplementedError`` here, while ``charset_name`` and
    ``get_confidence`` return neutral placeholders (``None`` and ``0.0``).
    """

    # Not referenced inside this class; presumably the confidence above which
    # a prober may stop early -- confirm against the subclasses that use it.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
        """
        Put the prober in the ``DETECTING`` state and mark it active.

        :param lang_filter: language filter stored on the instance as
            ``self.lang_filter``; it is not otherwise used by this class.
        """
        self._state = ProbingState.DETECTING
        self.active = True
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self) -> None:
        """Return the prober to the ``DETECTING`` state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self) -> Optional[str]:
        """Name of the detected charset; this base class always gives ``None``."""
        return None

    @property
    def language(self) -> Optional[str]:
        """Detected language; not implemented in this base class."""
        raise NotImplementedError

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume ``byte_str``; not implemented in this base class."""
        raise NotImplementedError

    @property
    def state(self) -> ProbingState:
        """Current probing state (read-only view of ``self._state``)."""
        return self._state

    def get_confidence(self) -> float:
        """Confidence in the current guess; this base class always gives 0.0."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
        """
        Replace every run of ASCII bytes (0x00-0x7F) in ``buf`` with a single
        space, leaving the high bytes (0x80-0xFF) as they are.
        """
        return re.sub(b"([\x00-\x7F])+", b" ", buf)

    @staticmethod
    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
        """
        Keep only the words of ``buf`` that contain a high byte.

        Bytes fall into three classes:
            alphabet: ASCII letters a-z and A-Z
            international: high bytes 0x80-0xFF
            marker: every other byte
        A word is a run of alphabet and international bytes that holds at
        least one international byte, optionally followed by one marker.
        Each such word is copied to the result, with a trailing marker
        replaced by an ASCII space. Everything else in ``buf`` is dropped.
        """
        kept = bytearray()

        for match in INTERNATIONAL_WORDS_PATTERN.finditer(buf):
            word = match.group()
            body, tail = word[:-1], word[-1:]
            kept += body

            # Markers are used much the same way in every language, so their
            # frequencies say little about the charset: normalise a trailing
            # marker to a plain space. A trailing letter or high byte is part
            # of the word and is kept unchanged.
            tail_is_marker = tail < b"\x80" and not tail.isalpha()
            kept += b" " if tail_is_marker else tail

        return kept

    @staticmethod
    def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
        """
        Return the content of ``buf`` that lies outside ``<...>`` tags.

        Each non-empty stretch of bytes that precedes a ``<`` is copied to the
        result followed by one space; bytes after the final ``>`` are copied
        without a trailing space. A ``>`` always restarts the current stretch,
        so bytes in front of an unmatched ``>`` are dropped, and so is
        everything after a ``<`` that is never closed.

        The original documentation notes that this filter is currently only
        used by ``Latin1Prober``. The object returned is a ``bytearray``, even
        though the annotation says ``bytes``.
        """
        kept = bytearray()
        inside_tag = False
        stretch_start = 0
        # The "c" format makes iteration yield length-1 bytes objects, which
        # can be compared directly with b"<" and b">".
        view = memoryview(buf).cast("c")

        for index, char in enumerate(view):
            # https://github.com/python/typeshed/issues/8182
            if char == b">":  # type: ignore[comparison-overlap]
                # Leaving a tag: the next stretch starts right after it.
                stretch_start = index + 1
                inside_tag = False
                continue

            # https://github.com/python/typeshed/issues/8182
            if char != b"<":  # type: ignore[comparison-overlap]
                continue

            # Entering a tag: flush the stretch collected since the last ">"
            # and delimit it with a space.
            if not inside_tag and index > stretch_start:
                kept.extend(view[stretch_start:index])
                kept.extend(b" ")
            inside_tag = True

        # Trailing bytes only count when the buffer does not end inside a tag.
        if not inside_tag:
            kept.extend(view[stretch_start:])

        return kept