Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/sjisprober.py: 93%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is mozilla.org code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 1998
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11#
12# This library is free software; you can redistribute it and/or
13# modify it under the terms of the GNU Lesser General Public
14# License as published by the Free Software Foundation; either
15# version 2.1 of the License, or (at your option) any later version.
16#
17# This library is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20# Lesser General Public License for more details.
21#
22# You should have received a copy of the GNU Lesser General Public
23# License along with this library; if not, see
24# <https://www.gnu.org/licenses/>.
25######################### END LICENSE BLOCK #########################
27from typing import Union
29from .chardistribution import SJISDistributionAnalysis
30from .codingstatemachine import CodingStateMachine
31from .enums import MachineState, ProbingState
32from .jpcntx import SJISContextAnalysis
33from .mbcharsetprober import MultiByteCharSetProber
34from .mbcssm import SJIS_SM_MODEL
class SJISProber(MultiByteCharSetProber):
    """Prober that scores how likely a byte stream is SJIS-encoded Japanese.

    It drives three collaborators: a ``CodingStateMachine`` built from
    ``SJIS_SM_MODEL``, an ``SJISDistributionAnalysis`` and an
    ``SJISContextAnalysis``. The reported confidence is the larger of the
    two analysers' confidences.
    """

    def __init__(self) -> None:
        """Create the state machine and both analysers, then reset."""
        super().__init__()
        self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
        self.distribution_analyzer = SJISDistributionAnalysis()
        self.context_analyzer = SJISContextAnalysis()
        self.reset()

    def reset(self) -> None:
        """Reset the inherited prober state and the context analyser."""
        super().reset()
        self.context_analyzer.reset()

    @property
    def charset_name(self) -> str:
        """Charset name, as reported by the context analyser."""
        return self.context_analyzer.charset_name

    @property
    def language(self) -> str:
        """Language this prober detects (always ``"Japanese"``)."""
        return "Japanese"

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume one chunk of input and return the resulting probing state.

        Every byte is pushed through the coding state machine. An ``ERROR``
        state marks the prober ``NOT_ME``; an ``ITS_ME`` state marks it
        ``FOUND_IT``; either one stops processing of the chunk. Each time
        the machine returns to ``START`` the bytes of the character that
        just ended are handed to the context and distribution analysers.

        An empty ``byte_str`` is accepted: no bytes are analysed and the
        carried-over last byte is left unchanged.

        Args:
            byte_str: The next chunk of the stream being probed.

        Returns:
            The prober's state after this chunk.
        """
        assert self.coding_sm is not None
        assert self.distribution_analyzer is not None

        for i, byte in enumerate(byte_str):
            coding_state = self.coding_sm.next_state(byte)
            if coding_state == MachineState.ERROR:
                self.logger.debug(
                    "%s %s prober hit error at byte %s",
                    self.charset_name,
                    self.language,
                    i,
                )
                self._state = ProbingState.NOT_ME
                break
            if coding_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            if coding_state == MachineState.START:
                char_len = self.coding_sm.get_current_charlen()
                if i == 0:
                    # The character ending on the first byte of this chunk
                    # may have started in the previous chunk: pair the byte
                    # saved in _last_char[0] with the current one.
                    self._last_char[1] = byte
                    self.context_analyzer.feed(
                        self._last_char[2 - char_len :], char_len
                    )
                    self.distribution_analyzer.feed(self._last_char, char_len)
                else:
                    self.context_analyzer.feed(
                        byte_str[i + 1 - char_len : i + 3 - char_len], char_len
                    )
                    self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)

        # Remember the final byte for the i == 0 case of the next call.
        # Guarded because byte_str[-1] raises IndexError on an empty chunk;
        # in that case the previously saved byte must stay in place.
        if byte_str:
            self._last_char[0] = byte_str[-1]

        if self.state == ProbingState.DETECTING:
            # Shortcut: stop detecting once the context analyser has seen
            # enough data and the confidence exceeds the threshold.
            if self.context_analyzer.got_enough_data() and (
                self.get_confidence() > self.SHORTCUT_THRESHOLD
            ):
                self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self) -> float:
        """Return the higher of the context and distribution confidences."""
        assert self.distribution_analyzer is not None

        context_conf = self.context_analyzer.get_confidence()
        distrib_conf = self.distribution_analyzer.get_confidence()
        return max(context_conf, distrib_conf)