1######################## BEGIN LICENSE BLOCK ########################
2# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
3# The Original Code is Mozilla Universal charset detector code.
4#
5# The Initial Developer of the Original Code is
6# Netscape Communications Corporation.
7# Portions created by the Initial Developer are Copyright (C) 2001
8# the Initial Developer. All Rights Reserved.
9#
10# Contributor(s):
11# Rob Speer - adapt to MacRoman encoding
12# Mark Pilgrim - port to Python
13# Shy Shalom - original C code
14#
15# This library is free software; you can redistribute it and/or
16# modify it under the terms of the GNU Lesser General Public
17# License as published by the Free Software Foundation; either
18# version 2.1 of the License, or (at your option) any later version.
19#
20# This library is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23# Lesser General Public License for more details.
24#
25# You should have received a copy of the GNU Lesser General Public
26# License along with this library; if not, write to the Free Software
27# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
28# 02110-1301 USA
29######################### END LICENSE BLOCK #########################
30
31from typing import List, Union
32
33from .charsetprober import CharSetProber
34from .enums import ProbingState
35
# Number of likelihood categories a class pair can fall into; the prober keeps
# one counter per category.
FREQ_CAT_NUM = 4

# Character classes assigned to individual byte values. The ids are
# consecutive integers starting at zero so they can be used directly as
# row/column indices into the class-pair model.
(
    UDF,  # undefined
    OTH,  # other
    ASC,  # ascii capital letter
    ASS,  # ascii small letter
    ACV,  # accent capital vowel
    ACO,  # accent capital other
    ASV,  # accent small vowel
    ASO,  # accent small other
    ODD,  # character that is unlikely to appear
) = range(9)
CLASS_NUM = 9  # total classes
48
49# The change from Latin1 is that we explicitly look for extended characters
50# that are infrequently-occurring symbols, and consider them to always be
51# improbable. This should let MacRoman get out of the way of more likely
52# encodings in most situations.
53
54# fmt: off
# Lookup table from a byte value (used as the index, 0x00 - 0xFF) to the
# character class of that byte. Each row below covers eight consecutive byte
# values; the trailing comment on a row gives the hexadecimal range it covers.
MacRoman_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
    ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
    OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
    OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
    OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
    ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
    OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
    ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
    ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
)
89
# Likelihood of one character class directly following another.
#
# The tuple is a flattened CLASS_NUM x CLASS_NUM matrix. The row (labelled in
# the trailing comment of each line) is the class of the previous byte and the
# column (labelled in the header comment) is the class of the current byte, so
# an entry is read as MacRomanClassModel[previous * CLASS_NUM + current].
#
# Entry values:
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
MacRomanClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO ODD
    0,  0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  1,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  1,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  1,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  1,  # ASO
    0,  1,  1,  1,  1,  1,  1,  1,  1,  # ODD
)
106# fmt: on
107
108
class MacRomanProber(CharSetProber):
    """Charset prober that scores how plausible a byte stream is as MacRoman.

    Every byte is mapped to a character class; each (previous class, current
    class) pair is rated by ``MacRomanClassModel`` and tallied per rating.
    The confidence is derived from those tallies.
    """

    def __init__(self) -> None:
        """Create the prober and put it into its initial state."""
        super().__init__()
        # Declared here so the attributes exist (and are typed) before
        # reset() assigns their real starting values.
        self._last_char_class = OTH
        self._freq_counter: List[int] = []
        self.reset()

    def reset(self) -> None:
        """Discard everything learned so far and start over."""
        self._last_char_class = OTH

        counters = [0] * FREQ_CAT_NUM
        # Express the prior that MacRoman is a somewhat rare encoding: begin
        # with ten "normal" observations already counted. They add to the
        # total without adding to the "very likely" tally, so real evidence
        # has to outweigh them first.
        counters[2] = 10
        self._freq_counter = counters

        super().reset()

    @property
    def charset_name(self) -> str:
        """Name of the charset this prober detects."""
        return "MacRoman"

    @property
    def language(self) -> str:
        """Language tied to this prober; always the empty string."""
        return ""

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Consume a chunk of input and update the class-pair tallies.

        The chunk is first passed through the inherited ``remove_xml_tags``
        helper. Processing stops at the first class pair the model rates as
        illegal (0), at which point the prober's state becomes ``NOT_ME``.

        Returns:
            The prober's state after the chunk has been processed.
        """
        cleaned = self.remove_xml_tags(byte_str)
        previous_class = self._last_char_class
        for byte in cleaned:
            current_class = MacRoman_CharToClass[byte]
            rating = MacRomanClassModel[previous_class * CLASS_NUM + current_class]
            if rating == 0:
                # An illegal transition rules this charset out entirely.
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[rating] += 1
            previous_class = current_class
        # Remember the last accepted class so the next chunk continues the
        # sequence where this one left off.
        self._last_char_class = previous_class

        return self.state

    def get_confidence(self) -> float:
        """Return the current confidence that the input is MacRoman.

        Returns:
            ``0.01`` once the prober has ruled itself out; otherwise a
            non-negative score computed from the tallies and scaled by 0.73.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        total = sum(self._freq_counter)
        if total < 0.01:
            # Nothing tallied at all: no evidence either way.
            return 0.0

        very_likely = self._freq_counter[3]
        very_unlikely = self._freq_counter[1]
        # Each "very unlikely" pair cancels twenty "very likely" ones.
        score = (very_likely - very_unlikely * 20.0) / total
        # Clamp at zero, then lower the confidence of MacRoman so that other,
        # more accurate detectors can take priority.
        return max(score, 0.0) * 0.73