Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/eucjpprober.py: 94%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is mozilla.org code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 1998
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11#
12# This library is free software; you can redistribute it and/or
13# modify it under the terms of the GNU Lesser General Public
14# License as published by the Free Software Foundation; either
15# version 2.1 of the License, or (at your option) any later version.
16#
17# This library is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20# Lesser General Public License for more details.
21#
22# You should have received a copy of the GNU Lesser General Public
23# License along with this library; if not, see
24# <https://www.gnu.org/licenses/>.
25######################### END LICENSE BLOCK #########################
27from typing import Union
29from .chardistribution import EUCJPDistributionAnalysis
30from .codingstatemachine import CodingStateMachine
31from .enums import MachineState, ProbingState
32from .jpcntx import EUCJPContextAnalysis
33from .mbcharsetprober import MultiByteCharSetProber
34from .mbcssm import EUCJP_SM_MODEL
class EUCJPProber(MultiByteCharSetProber):
    """Charset prober that decides whether a byte stream is EUC-JP text.

    Each fed byte is run through a coding state machine built from
    ``EUCJP_SM_MODEL``.  Whenever the machine returns to its start state,
    the most recent two bytes are handed to a character-distribution
    analyzer and a context analyzer, and the reported confidence is the
    larger of the two analyzers' confidences.
    """

    def __init__(self) -> None:
        """Create the state machine and both analyzers, then reset."""
        super().__init__()
        self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
        self.distribution_analyzer = EUCJPDistributionAnalysis()
        self.context_analyzer = EUCJPContextAnalysis()
        self.reset()

    def reset(self) -> None:
        """Reset the inherited prober state and the context analyzer."""
        super().reset()
        self.context_analyzer.reset()

    @property
    def charset_name(self) -> str:
        """Name of the charset this prober detects."""
        return "EUC-JP"

    @property
    def language(self) -> str:
        """Language associated with this prober."""
        return "Japanese"

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume a chunk of bytes and return the updated probing state.

        Processing stops early when the state machine reports an error
        (state becomes ``NOT_ME``) or a definite match (state becomes
        ``FOUND_IT``).  An empty ``byte_str`` is accepted and leaves the
        carried-over byte from the previous call untouched.

        Args:
            byte_str: The next chunk of the stream being probed.

        Returns:
            The prober's state after this chunk has been processed.
        """
        assert self.coding_sm is not None
        assert self.distribution_analyzer is not None

        for i, byte in enumerate(byte_str):
            # PY3K: byte_str is a byte array, so byte is an int, not a byte
            coding_state = self.coding_sm.next_state(byte)
            if coding_state == MachineState.ERROR:
                self.logger.debug(
                    "%s %s prober hit error at byte %s",
                    self.charset_name,
                    self.language,
                    i,
                )
                self._state = ProbingState.NOT_ME
                break
            if coding_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            if coding_state == MachineState.START:
                char_len = self.coding_sm.get_current_charlen()
                pair: Union[bytes, bytearray]
                if i == 0:
                    # The preceding byte belongs to the previous chunk; it
                    # was saved in slot 0 of _last_char at the end of the
                    # previous feed() call, so complete the pair in slot 1.
                    self._last_char[1] = byte
                    pair = self._last_char
                else:
                    pair = byte_str[i - 1 : i + 1]
                self.context_analyzer.feed(pair, char_len)
                self.distribution_analyzer.feed(pair, char_len)

        # Remember the final byte so the next call can pair it with that
        # chunk's first byte.  Guarded because indexing an empty chunk
        # with [-1] would raise IndexError.
        if byte_str:
            self._last_char[0] = byte_str[-1]

        if self.state == ProbingState.DETECTING:
            # Shortcut: declare a match as soon as the context analyzer has
            # seen enough data and confidence clears the threshold.
            if self.context_analyzer.got_enough_data() and (
                self.get_confidence() > self.SHORTCUT_THRESHOLD
            ):
                self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self) -> float:
        """Return the higher of the context and distribution confidences."""
        assert self.distribution_analyzer is not None

        context_conf = self.context_analyzer.get_confidence()
        distrib_conf = self.distribution_analyzer.get_confidence()
        return max(context_conf, distrib_conf)