Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/escprober.py: 95%

1######################## BEGIN LICENSE BLOCK ########################

2# The Original Code is mozilla.org code.

4# The Initial Developer of the Original Code is

5# Netscape Communications Corporation.

9# Contributor(s):

10# Mark Pilgrim - port to Python

11#

12# This library is free software; you can redistribute it and/or

13# modify it under the terms of the GNU Lesser General Public

14# License as published by the Free Software Foundation; either

15# version 2.1 of the License, or (at your option) any later version.

16#

17# This library is distributed in the hope that it will be useful,

18# but WITHOUT ANY WARRANTY; without even the implied warranty of

19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

20# Lesser General Public License for more details.

21#

22# You should have received a copy of the GNU Lesser General Public

23# License along with this library; if not, see

24# <https://www.gnu.org/licenses/>.

25######################### END LICENSE BLOCK #########################

27from typing import Optional, Union

29from .charsetprober import CharSetProber

30from .codingstatemachine import CodingStateMachine

31from .enums import LanguageFilter, MachineState, ProbingState

32from .escsm import (

33 HZ_SM_MODEL,

34 ISO2022CN_SM_MODEL,

35 ISO2022JP_SM_MODEL,

36 ISO2022KR_SM_MODEL,

37)

class EscCharSetProber(CharSetProber):
    """
    Prober for encodings that announce themselves with escape or shift
    sequences (the "code scheme" approach).

    One coding state machine is created per candidate encoding, selected by
    the language filter.  Each fed byte is pushed through every machine that
    is still active; the first machine to report a match decides the
    detected charset and language.
    """

    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
        """
        Build the state machines enabled by ``lang_filter`` and reset the
        prober to its initial detecting state.
        """
        super().__init__(lang_filter=lang_filter)
        # Language flag -> state machine models to instantiate, listed in the
        # order the machines are consulted by feed().
        models_by_language = (
            (LanguageFilter.CHINESE_SIMPLIFIED, (HZ_SM_MODEL, ISO2022CN_SM_MODEL)),
            (LanguageFilter.JAPANESE, (ISO2022JP_SM_MODEL,)),
            (LanguageFilter.KOREAN, (ISO2022KR_SM_MODEL,)),
        )
        self.coding_sm = []
        for language_flag, sm_models in models_by_language:
            if self.lang_filter & language_flag:
                for sm_model in sm_models:
                    self.coding_sm.append(CodingStateMachine(sm_model))
        self.active_sm_count = 0
        self._detected_charset: Optional[str] = None
        self._detected_language: Optional[str] = None
        self._state = ProbingState.DETECTING
        self.reset()

    def reset(self) -> None:
        """
        Re-activate and reset every state machine and forget any previously
        detected charset and language.
        """
        super().reset()
        for machine in self.coding_sm:
            machine.active = True
            machine.reset()
        # Every machine is a live candidate again.
        self.active_sm_count = len(self.coding_sm)
        self._detected_charset = None
        self._detected_language = None

    @property
    def charset_name(self) -> Optional[str]:
        """Charset recorded by feed(), or None if nothing has matched yet."""
        return self._detected_charset

    @property
    def language(self) -> Optional[str]:
        """Language recorded by feed(), or None if nothing has matched yet."""
        return self._detected_language

    def get_confidence(self) -> float:
        """Return 0.99 once a charset has been detected, otherwise 0.0."""
        if self._detected_charset:
            return 0.99
        return 0.00

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """
        Push each byte of ``byte_str`` through every active state machine.

        Returns the prober state: FOUND_IT as soon as one machine reports
        ITS_ME, NOT_ME once the last active machine has reported ERROR, and
        otherwise whatever the state was after all bytes were consumed.
        """
        for byte in byte_str:
            for machine in self.coding_sm:
                if not machine.active:
                    continue
                outcome = machine.next_state(byte)
                if outcome == MachineState.ITS_ME:
                    # First positive match wins; record what it identified.
                    self._state = ProbingState.FOUND_IT
                    self._detected_charset = machine.get_coding_state_machine()
                    self._detected_language = machine.language
                    return self.state
                if outcome != MachineState.ERROR:
                    continue
                # This machine rejected the input; retire it.
                machine.active = False
                self.active_sm_count -= 1
                if self.active_sm_count > 0:
                    continue
                # No candidates remain, so none of these encodings apply.
                self._state = ProbingState.NOT_ME
                return self.state

        return self.state