Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/chardet/charsetprober.py: 95%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

60 statements  

1######################## BEGIN LICENSE BLOCK ######################## 

2# The Original Code is Mozilla Universal charset detector code. 

3# 

4# The Initial Developer of the Original Code is 

5# Netscape Communications Corporation. 

6# Portions created by the Initial Developer are Copyright (C) 2001 

7# the Initial Developer. All Rights Reserved. 

8# 

9# Contributor(s): 

10# Mark Pilgrim - port to Python 

11# Shy Shalom - original C code 

12# 

13# This library is free software; you can redistribute it and/or 

14# modify it under the terms of the GNU Lesser General Public 

15# License as published by the Free Software Foundation; either 

16# version 2.1 of the License, or (at your option) any later version. 

17# 

18# This library is distributed in the hope that it will be useful, 

19# but WITHOUT ANY WARRANTY; without even the implied warranty of 

20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

21# Lesser General Public License for more details. 

22# 

23# You should have received a copy of the GNU Lesser General Public 

24# License along with this library; if not, write to the Free Software 

25# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 

26# 02110-1301 USA 

27######################### END LICENSE BLOCK ######################### 

28 

29import logging 

30import re 

31from typing import Optional, Union 

32 

33from .enums import LanguageFilter, ProbingState 

34 

# Matches one "word" containing at least one international (high, >= 0x80)
# byte: optional ASCII letters, one or more high bytes, optional ASCII
# letters, then at most one trailing marker byte (anything that is neither
# an ASCII letter nor a high byte).  Used by
# CharSetProber.filter_international_words.
INTERNATIONAL_WORDS_PATTERN = re.compile(
    b"[a-zA-Z]*[\x80-\xff]+[a-zA-Z]*[^a-zA-Z\x80-\xff]?"
)

38 

39 

class CharSetProber:
    """Abstract base class for all charset probers.

    A prober is fed successive chunks of a byte stream via ``feed`` and
    maintains a :class:`ProbingState` plus a confidence estimate for the
    charset it targets.  Subclasses must implement ``feed`` and the
    ``language`` property; the static helpers below pre-filter input so
    that only analytically useful bytes reach the frequency models.
    """

    # A prober whose confidence reaches this value is considered certain
    # enough that the detector may stop early.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
        """Initialize the prober in the DETECTING state.

        :param lang_filter: restricts which languages this prober should
            consider (``LanguageFilter.NONE`` means no restriction).
        """
        self._state = ProbingState.DETECTING
        self.active = True
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self) -> None:
        """Reset the prober to its initial (detecting) state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self) -> Optional[str]:
        """Name of the detected charset; ``None`` in the base class."""
        return None

    @property
    def language(self) -> Optional[str]:
        """Human language associated with this charset (subclass duty)."""
        raise NotImplementedError

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume a chunk of input and return the updated probing state.

        Must be overridden by subclasses.
        """
        raise NotImplementedError

    @property
    def state(self) -> ProbingState:
        """Current probing state (DETECTING / FOUND_IT / NOT_ME)."""
        return self._state

    def get_confidence(self) -> float:
        """Confidence in [0.0, 1.0] that the input matches this charset.

        The base class has no model, so it reports 0.0.
        """
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
        """Collapse every run of ASCII bytes (0x00-0x7F) to a single space.

        Only high bytes (>= 0x80) survive, which is what single-byte
        charset models analyze.
        """
        # The original pattern b"([\x00-\x7f])+" wrapped the class in a
        # redundant capturing group; [\x00-\x7f]+ matches identically.
        return re.sub(b"[\x00-\x7f]+", b" ", buf)

    @staticmethod
    def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
        """
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [\x80-\xff]
        marker: everything else [^a-zA-Z\x80-\xff]
        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.
        This filter applies to all scripts which do not use English characters.
        """
        filtered = bytearray()

        # This regex expression filters out only words that have at-least one
        # international character. The word may include one marker character at
        # the end.
        words = INTERNATIONAL_WORDS_PATTERN.findall(buf)

        for word in words:
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with a
            # space as markers shouldn't affect our analysis (they are used
            # similarly across all languages and may thus have similar
            # frequencies).
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b"\x80":
                last_char = b" "
            filtered.extend(last_char)

        return filtered

    @staticmethod
    def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
        """
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        """
        filtered = bytearray()
        in_tag = False
        prev = 0
        # Cast to a view of single-byte elements so iteration yields
        # length-1 bytes objects without copying the buffer.
        buf = memoryview(buf).cast("c")

        for curr, buf_char in enumerate(buf):
            # Check if we're coming out of or entering an XML tag

            # https://github.com/python/typeshed/issues/8182
            if buf_char == b">":  # type: ignore[comparison-overlap]
                prev = curr + 1
                in_tag = False
            # https://github.com/python/typeshed/issues/8182
            elif buf_char == b"<":  # type: ignore[comparison-overlap]
                if curr > prev and not in_tag:
                    # Keep everything after last non-extended-ASCII,
                    # non-alphabetic character
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit stretch we kept
                    filtered.extend(b" ")
                in_tag = True

        # If we're not in a tag...
        if not in_tag:
            # Keep everything after last non-extended-ASCII, non-alphabetic
            # character
            filtered.extend(buf[prev:])

        # Convert to bytes so the return value matches the annotated type
        # (the original returned the internal bytearray directly).
        return bytes(filtered)