Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/escprober.py: 95%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is mozilla.org code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 1998
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11#
12# This library is free software; you can redistribute it and/or
13# modify it under the terms of the GNU Lesser General Public
14# License as published by the Free Software Foundation; either
15# version 2.1 of the License, or (at your option) any later version.
16#
17# This library is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20# Lesser General Public License for more details.
21#
22# You should have received a copy of the GNU Lesser General Public
23# License along with this library; if not, see
24# <https://www.gnu.org/licenses/>.
25######################### END LICENSE BLOCK #########################
27from typing import Optional, Union
29from .charsetprober import CharSetProber
30from .codingstatemachine import CodingStateMachine
31from .enums import LanguageFilter, MachineState, ProbingState
32from .escsm import (
33 HZ_SM_MODEL,
34 ISO2022CN_SM_MODEL,
35 ISO2022JP_SM_MODEL,
36 ISO2022KR_SM_MODEL,
37)
class EscCharSetProber(CharSetProber):
    """
    Prober for encodings that reveal themselves through escape or shift
    sequences (the "code scheme" approach).

    One coding state machine is kept per candidate encoding. Every input byte
    is pushed through each machine that is still active, until one of them
    reports a match or all of them have reported an error.
    """

    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
        """Create the state machines enabled by ``lang_filter`` and reset."""
        super().__init__(lang_filter=lang_filter)
        # Each language flag enables the models listed beside it; the order
        # of this table is the order in which the machines are later fed.
        models_by_language = (
            (LanguageFilter.CHINESE_SIMPLIFIED, (HZ_SM_MODEL, ISO2022CN_SM_MODEL)),
            (LanguageFilter.JAPANESE, (ISO2022JP_SM_MODEL,)),
            (LanguageFilter.KOREAN, (ISO2022KR_SM_MODEL,)),
        )
        self.coding_sm = []
        for language_flag, models in models_by_language:
            if self.lang_filter & language_flag:
                self.coding_sm.extend(CodingStateMachine(model) for model in models)
        self.active_sm_count = 0
        self._detected_charset: Optional[str] = None
        self._detected_language: Optional[str] = None
        self._state = ProbingState.DETECTING
        self.reset()

    def reset(self) -> None:
        """Reactivate every state machine and forget any previous detection."""
        super().reset()
        for machine in self.coding_sm:
            machine.active = True
            machine.reset()
        self.active_sm_count = len(self.coding_sm)
        self._detected_charset = None
        self._detected_language = None

    @property
    def charset_name(self) -> Optional[str]:
        """Charset recorded by the matching state machine, or ``None``."""
        return self._detected_charset

    @property
    def language(self) -> Optional[str]:
        """Language recorded by the matching state machine, or ``None``."""
        return self._detected_language

    def get_confidence(self) -> float:
        """Return 0.99 once a charset has been detected, otherwise 0.00."""
        if self._detected_charset:
            return 0.99
        return 0.00

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """
        Push ``byte_str`` through the active state machines, byte by byte.

        Returns the prober state: processing stops early as soon as one
        machine reports ``ITS_ME`` (state becomes ``FOUND_IT``) or the last
        active machine reports ``ERROR`` (state becomes ``NOT_ME``).
        """
        for byte in byte_str:
            for machine in self.coding_sm:
                if not machine.active:
                    continue
                verdict = machine.next_state(byte)
                if verdict == MachineState.ERROR:
                    # This encoding is ruled out; retire its machine.
                    machine.active = False
                    self.active_sm_count -= 1
                    if self.active_sm_count > 0:
                        continue
                    # No candidate encodings are left.
                    self._state = ProbingState.NOT_ME
                    return self.state
                if verdict == MachineState.ITS_ME:
                    self._state = ProbingState.FOUND_IT
                    self._detected_charset = machine.get_coding_state_machine()
                    self._detected_language = machine.language
                    return self.state

        return self.state