Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/chardet/escprober.py: 100%

1######################## BEGIN LICENSE BLOCK ########################

2# The Original Code is mozilla.org code.

4# The Initial Developer of the Original Code is

5# Netscape Communications Corporation.

9# Contributor(s):

10# Mark Pilgrim - port to Python

11#

12# This library is free software; you can redistribute it and/or

13# modify it under the terms of the GNU Lesser General Public

14# License as published by the Free Software Foundation; either

15# version 2.1 of the License, or (at your option) any later version.

16#

17# This library is distributed in the hope that it will be useful,

18# but WITHOUT ANY WARRANTY; without even the implied warranty of

19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

20# Lesser General Public License for more details.

21#

22# You should have received a copy of the GNU Lesser General Public

23# License along with this library; if not, write to the Free Software

24# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

25# 02110-1301 USA

26######################### END LICENSE BLOCK #########################

28from typing import Optional, Union

30from .charsetprober import CharSetProber

31from .codingstatemachine import CodingStateMachine

32from .enums import LanguageFilter, MachineState, ProbingState

33from .escsm import (

34 HZ_SM_MODEL,

35 ISO2022CN_SM_MODEL,

36 ISO2022JP_SM_MODEL,

37 ISO2022KR_SM_MODEL,

38)

class EscCharSetProber(CharSetProber):
    """
    Prober for encodings that are recognised by their escape or shift
    sequences (a "code scheme" approach).

    One coding state machine is built per candidate encoding enabled by the
    language filter. Each input byte is handed to every machine that is still
    active until one of them reports a match or all of them have errored out.
    """

    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
        """Build the state machines enabled by ``lang_filter`` and reset."""
        super().__init__(lang_filter=lang_filter)

        # Language flag -> state machine models to instantiate, in the order
        # the machines are appended (and therefore consulted by ``feed``).
        models_by_language = (
            (LanguageFilter.CHINESE_SIMPLIFIED, (HZ_SM_MODEL, ISO2022CN_SM_MODEL)),
            (LanguageFilter.JAPANESE, (ISO2022JP_SM_MODEL,)),
            (LanguageFilter.KOREAN, (ISO2022KR_SM_MODEL,)),
        )
        self.coding_sm = []
        for language_flag, sm_models in models_by_language:
            if self.lang_filter & language_flag:
                for sm_model in sm_models:
                    self.coding_sm.append(CodingStateMachine(sm_model))

        self.active_sm_count = 0
        self._detected_charset: Optional[str] = None
        self._detected_language: Optional[str] = None
        self._state = ProbingState.DETECTING
        self.reset()

    def reset(self) -> None:
        """Reactivate every state machine and forget any earlier detection."""
        super().reset()
        for machine in self.coding_sm:
            machine.active = True
            machine.reset()
        self.active_sm_count = len(self.coding_sm)
        self._detected_charset = self._detected_language = None

    @property
    def charset_name(self) -> Optional[str]:
        """Detected charset, or ``None`` when nothing has matched yet."""
        return self._detected_charset

    @property
    def language(self) -> Optional[str]:
        """Language of the matching state machine, or ``None`` if no match."""
        return self._detected_language

    def get_confidence(self) -> float:
        """Return 0.99 once a charset has been detected, otherwise 0.00."""
        if self._detected_charset:
            return 0.99
        return 0.00

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """
        Run ``byte_str`` through the active state machines, byte by byte.

        Stops early and returns ``self.state`` as soon as one machine reports
        ``ITS_ME`` (state becomes ``FOUND_IT``) or the last active machine
        reports ``ERROR`` (state becomes ``NOT_ME``). Otherwise returns
        ``self.state`` after all bytes have been consumed.
        """
        for byte in byte_str:
            for machine in self.coding_sm:
                if not machine.active:
                    continue

                outcome = machine.next_state(byte)

                if outcome == MachineState.ERROR:
                    # This machine is ruled out; give up only when it was the
                    # last one still in play.
                    machine.active = False
                    self.active_sm_count -= 1
                    if self.active_sm_count > 0:
                        continue
                    self._state = ProbingState.NOT_ME
                    return self.state

                if outcome == MachineState.ITS_ME:
                    self._state = ProbingState.FOUND_IT
                    self._detected_charset = machine.get_coding_state_machine()
                    self._detected_language = machine.language
                    return self.state

        return self.state