Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/escprober.py: 98%

1######################## BEGIN LICENSE BLOCK ########################

2# The Original Code is mozilla.org code.

4# The Initial Developer of the Original Code is

5# Netscape Communications Corporation.

9# Contributor(s):

10# Mark Pilgrim - port to Python

11#

12# This library is free software; you can redistribute it and/or

13# modify it under the terms of the GNU Lesser General Public

14# License as published by the Free Software Foundation; either

15# version 2.1 of the License, or (at your option) any later version.

16#

17# This library is distributed in the hope that it will be useful,

18# but WITHOUT ANY WARRANTY; without even the implied warranty of

19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

20# Lesser General Public License for more details.

21#

22# You should have received a copy of the GNU Lesser General Public

23# License along with this library; if not, see

24# <https://www.gnu.org/licenses/>.

25######################### END LICENSE BLOCK #########################

27from typing import Optional, Union

29from .charsetprober import CharSetProber

30from .codingstatemachine import CodingStateMachine

31from .enums import EncodingEra, LanguageFilter, MachineState, ProbingState

32from .escsm import (

33 HZ_SM_MODEL,

34 ISO2022JP_SM_MODEL,

35 ISO2022KR_SM_MODEL,

36)

class EscCharSetProber(CharSetProber):
    """
    Prober for encodings that announce themselves with escape or shift
    sequences (the "code scheme" approach).

    One ``CodingStateMachine`` is created per candidate model (HZ,
    ISO-2022-JP, ISO-2022-KR) that the language filter lets through. Every
    input byte is pushed through each machine that is still active; the first
    machine to report ``MachineState.ITS_ME`` decides the detected charset.
    """

    def __init__(
        self,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.ALL,
    ) -> None:
        """
        Build the state machines selected by ``lang_filter`` and reset.

        :param lang_filter: Mask tested with ``&`` against the Chinese
            (simplified), Japanese and Korean filter values to decide which
            state machines are instantiated.
        :param encoding_era: Forwarded unchanged to the base class.
        """
        super().__init__(lang_filter=lang_filter, encoding_era=encoding_era)
        # (language filter value, state machine model) pairs, kept in the
        # order in which the machines are consulted for every byte.
        candidates = (
            (LanguageFilter.CHINESE_SIMPLIFIED, HZ_SM_MODEL),
            (LanguageFilter.JAPANESE, ISO2022JP_SM_MODEL),
            (LanguageFilter.KOREAN, ISO2022KR_SM_MODEL),
        )
        self.coding_sm = [
            CodingStateMachine(model)
            for language_flag, model in candidates
            if self.lang_filter & language_flag
        ]
        self.active_sm_count = 0
        self._detected_charset: Optional[str] = None
        self._detected_language: Optional[str] = None
        self._state = ProbingState.DETECTING
        # reset() recomputes active_sm_count and clears the detection fields.
        self.reset()

    def reset(self) -> None:
        """Re-activate every state machine and forget any earlier detection."""
        super().reset()
        for machine in self.coding_sm:
            machine.active = True
            machine.reset()
        self.active_sm_count = len(self.coding_sm)
        self._detected_charset = None
        self._detected_language = None

    @property
    def charset_name(self) -> Optional[str]:
        """Charset recorded by ``feed``, or ``None`` if nothing matched yet."""
        return self._detected_charset

    @property
    def language(self) -> Optional[str]:
        """Language recorded by ``feed``, or ``None`` if nothing matched yet."""
        return self._detected_language

    def get_confidence(self) -> float:
        """Return 0.99 once a charset has been detected, otherwise 0.0."""
        if self._detected_charset:
            return 0.99
        return 0.00

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """
        Push ``byte_str`` through the active state machines, byte by byte.

        :param byte_str: Data to examine.
        :returns: ``self.state`` -- returned early as soon as one machine
            reports ``ITS_ME`` (state set to ``FOUND_IT``) or the last active
            machine reports ``ERROR`` (state set to ``NOT_ME``); otherwise
            returned after all bytes have been consumed.
        """
        for byte in byte_str:
            for machine in self.coding_sm:
                if not machine.active:
                    continue
                outcome = machine.next_state(byte)
                if outcome == MachineState.ITS_ME:
                    # Positive match: remember what this machine stands for.
                    self._state = ProbingState.FOUND_IT
                    self._detected_charset = machine.get_coding_state_machine()
                    self._detected_language = machine.language
                    return self.state
                if outcome != MachineState.ERROR:
                    continue
                # This machine rejected the input; take it out of rotation.
                machine.active = False
                self.active_sm_count -= 1
                if self.active_sm_count <= 0:
                    # No candidates left, so the input is none of ours.
                    self._state = ProbingState.NOT_ME
                    return self.state

        return self.state