Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/chardet/hebrewprober.py: 98%

1######################## BEGIN LICENSE BLOCK ########################

2# The Original Code is Mozilla Universal charset detector code.

4# The Initial Developer of the Original Code is

5# Shy Shalom

9# Contributor(s):

10# Mark Pilgrim - port to Python

11#

12# This library is free software; you can redistribute it and/or

13# modify it under the terms of the GNU Lesser General Public

14# License as published by the Free Software Foundation; either

15# version 2.1 of the License, or (at your option) any later version.

16#

17# This library is distributed in the hope that it will be useful,

18# but WITHOUT ANY WARRANTY; without even the implied warranty of

19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

20# Lesser General Public License for more details.

21#

22# You should have received a copy of the GNU Lesser General Public

23# License along with this library; if not, write to the Free Software

24# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

25# 02110-1301 USA

26######################### END LICENSE BLOCK #########################

28from typing import Optional, Union

30from .charsetprober import CharSetProber

31from .enums import ProbingState

32from .sbcharsetprober import SingleByteCharSetProber

34# This prober doesn't actually recognize a language or a charset.

35# It is a helper prober for the use of the Hebrew model probers

37### General ideas of the Hebrew charset recognition ###

38#

39# Four main charsets exist in Hebrew:

40# "ISO-8859-8" - Visual Hebrew

41# "windows-1255" - Logical Hebrew

42# "ISO-8859-8-I" - Logical Hebrew

43# "x-mac-hebrew" - ?? Logical Hebrew ??

44#

45# Both "ISO" charsets use a completely identical set of code points, whereas

46# "windows-1255" and "x-mac-hebrew" are two different proper supersets of

47# these code points. windows-1255 defines additional characters in the range

48# 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific

49# diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.

50# x-mac-hebrew defines similar additional code points but with a different

51# mapping.

52#

53# As far as an average Hebrew text with no diacritics is concerned, all four

54# charsets are identical with respect to code points. Meaning that for the

55# main Hebrew alphabet, all four map the same values to all 27 Hebrew letters

56# (including final letters).

57#

58# The dominant difference between these charsets is their directionality.

59# "Visual" directionality means that the text is ordered as if the renderer is

60# not aware of a BIDI rendering algorithm. The renderer sees the text and

61# draws it from left to right. The text itself when ordered naturally is read

62# backwards. A buffer of Visual Hebrew generally looks like so:

63# "[last word of first line spelled backwards] [whole line ordered backwards

64# and spelled backwards] [first word of first line spelled backwards]

65# [end of line] [last word of second line] ... etc' "

66# adding punctuation marks, numbers and English text to visual text is

67# naturally also "visual" and from left to right.

68#

69# "Logical" directionality means the text is ordered "naturally" according to

70# the order it is read. It is the responsibility of the renderer to display

71# the text from right to left. A BIDI algorithm is used to place general

72# punctuation marks, numbers and English text in the text.

73#

74# Texts in x-mac-hebrew are almost impossible to find on the Internet. From

75# what little evidence I could find, it seems that its general directionality

76# is Logical.

77#

78# To sum up all of the above, the Hebrew probing mechanism knows about two

79# charsets:

80# Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are

81# backwards while line order is natural. For charset recognition purposes

82# the line order is unimportant (In fact, for this implementation, even

83# word order is unimportant).

84# Logical Hebrew - "windows-1255" - normal, naturally ordered text.

85#

86# "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be

87# specifically identified.

88# "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew

89# that contains special punctuation marks or diacritics is displayed with

90# some unconverted characters showing as question marks. This problem might

91# be corrected using another model prober for x-mac-hebrew. Due to the fact

92# that x-mac-hebrew texts are so rare, writing another model prober isn't

93# worth the effort and performance hit.

94#

95#### The Prober ####

96#

97# The prober is divided between two SBCharSetProbers and a HebrewProber,

98# all of which are managed, created, fed data, inquired and deleted by the

99# SBCSGroupProber. The two SBCharSetProbers identify that the text is in

100# fact some kind of Hebrew, Logical or Visual. The final decision between the

101# two is made by the HebrewProber by combining final-letter scores

102# with the scores of the two SBCharSetProbers to produce a final answer.

103#

104# The SBCSGroupProber is responsible for stripping the original text of HTML

105# tags, English characters, numbers, low-ASCII punctuation characters, spaces

106# and new lines. It reduces any sequence of such characters to a single space.

107# The buffer fed to each prober in the SBCS group prober is pure text in

108# high-ASCII.

109# The two SBCharSetProbers (model probers) share the same language model:

110# Win1255Model.

111# The first SBCharSetProber uses the model normally as any other

112# SBCharSetProber does, to recognize windows-1255, upon which this model was

113# built. The second SBCharSetProber is told to make the pair-of-letter

114# lookup in the language model backwards. This in practice exactly simulates

115# a visual Hebrew model using the windows-1255 logical Hebrew model.

116#

117# The HebrewProber is not using any language model. All it does is look for

118# final-letter evidence suggesting the text is either logical Hebrew or visual

119# Hebrew. Disjointed from the model probers, the results of the HebrewProber

120# alone are meaningless. HebrewProber always returns 0.00 as confidence

121# since it never identifies a charset by itself. Instead, the pointer to the

122# HebrewProber is passed to the model probers as a helper "Name Prober".

123# When the Group prober receives a positive identification from any prober,

124# it asks for the name of the charset identified. If the prober queried is a

125# Hebrew model prober, the model prober forwards the call to the

126# HebrewProber to make the final decision. In the HebrewProber, the

127# decision is made according to the final-letter scores maintained and both

128# model probers' scores. The answer is returned in the form of the name of the

129# charset identified, either "windows-1255" or "ISO-8859-8".

130

131

class HebrewProber(CharSetProber):
    """Helper prober that chooses between logical and visual Hebrew.

    The prober keeps two counters that are driven by where final-form and
    non-final-form letters appear relative to word boundaries.  In
    ``charset_name`` those counters are combined with the confidences of
    the two model probers registered through ``set_model_probers`` to pick
    either ``LOGICAL_HEBREW_NAME`` or ``VISUAL_HEBREW_NAME``.
    """

    SPACE = 0x20
    # windows-1255 / ISO-8859-8 code points of interest
    FINAL_KAF = 0xEA
    NORMAL_KAF = 0xEB
    FINAL_MEM = 0xED
    NORMAL_MEM = 0xEE
    FINAL_NUN = 0xEF
    NORMAL_NUN = 0xF0
    FINAL_PE = 0xF3
    NORMAL_PE = 0xF4
    FINAL_TSADI = 0xF5
    NORMAL_TSADI = 0xF6

    # Smallest gap between the logical and visual final-letter scores that
    # is trusted on its own, without consulting the model probers.
    MIN_FINAL_CHAR_DISTANCE = 5

    # Smallest gap between the logical and visual model confidences that is
    # taken into account at all.
    MIN_MODEL_DISTANCE = 0.01

    VISUAL_HEBREW_NAME = "ISO-8859-8"
    LOGICAL_HEBREW_NAME = "windows-1255"

    def __init__(self) -> None:
        """Create a prober with zeroed scores and no model probers attached."""
        super().__init__()
        self._logical_prober: Optional[SingleByteCharSetProber] = None
        self._visual_prober: Optional[SingleByteCharSetProber] = None
        self._before_prev = self.SPACE
        self._prev = self.SPACE
        self._final_char_visual_score = 0
        self._final_char_logical_score = 0
        self.reset()

    def reset(self) -> None:
        """Zero both final-letter scores and clear the two-byte look-behind.

        The model probers are intentionally left untouched here; they are
        owned by the group prober.
        """
        self._final_char_logical_score = 0
        self._final_char_visual_score = 0
        # _prev and _before_prev hold the last two bytes seen so far.  They
        # start out as spaces so that the very first byte fed looks like the
        # beginning of a word.
        self._before_prev = self.SPACE
        self._prev = self.SPACE

    def set_model_probers(
        self,
        logical_prober: SingleByteCharSetProber,
        visual_prober: SingleByteCharSetProber,
    ) -> None:
        """Register the logical and visual model probers consulted later."""
        self._logical_prober = logical_prober
        self._visual_prober = visual_prober

    def is_final(self, c: int) -> bool:
        """Return True when ``c`` is the final form of Kaf, Mem, Nun, Pe or Tsadi."""
        final_forms = (
            self.FINAL_KAF,
            self.FINAL_MEM,
            self.FINAL_NUN,
            self.FINAL_PE,
            self.FINAL_TSADI,
        )
        return c in final_forms

    def is_non_final(self, c: int) -> bool:
        """Return True when ``c`` is the normal form of Kaf, Mem, Nun or Pe.

        Normal Tsadi is deliberately left out: in words such as 'lechotet'
        (to chat) it is followed by an apostrophe, and that apostrophe is
        turned into a space by the filtering step, which makes the normal
        Tsadi look like it ends a word although it does not in the original
        text.  Normal Pe and Kaf can also legitimately end a word ('Pop',
        'Winamp', 'Mubarak'), but such words are rare enough that keeping
        these two letters helps more than it hurts.
        """
        normal_forms = (
            self.NORMAL_KAF,
            self.NORMAL_MEM,
            self.NORMAL_NUN,
            self.NORMAL_PE,
        )
        return c in normal_forms

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Update the final-letter scores from ``byte_str``.

        Three patterns are counted:

        1. A word longer than one letter that ends with a final letter.
           The final letter really is at the end, so the text looks
           naturally ordered: +1 to the logical score.
        2. A word longer than one letter that ends with a non-final letter
           (see ``is_non_final`` for the exceptions).  Normal Hebrew words
           do not end this way, so the text looks reversed: +1 to the
           visual score.
        3. A word longer than one letter that starts with a final letter.
           Final letters do not begin words, so the text looks reversed:
           +1 to the visual score.

        Final letters in the middle of a word are ignored because they do
        not point towards either ordering.  The two scores are compared in
        ``charset_name``.

        The input is first passed through ``filter_high_byte_only`` so that
        word-boundary detection works properly.

        Returns ``ProbingState.NOT_ME`` without looking at the data when
        ``state`` already reports ``NOT_ME``; otherwise always returns
        ``ProbingState.DETECTING``.  Reading ``state`` asserts that both
        model probers have been registered.
        """
        if self.state == ProbingState.NOT_ME:
            # Both model probers have given up, so there is nothing to decide.
            return ProbingState.NOT_ME

        filtered = self.filter_high_byte_only(byte_str)

        for cur in filtered:
            if cur != self.SPACE:
                # Inside a word.
                if self._before_prev == self.SPACE and self.is_final(self._prev):
                    # case (3) [-2:space][-1:final letter][cur:not space]
                    self._final_char_visual_score += 1
            elif self._before_prev != self.SPACE:
                # A word just ended, and because the byte before the previous
                # one was not a space, that word had more than one letter.
                if self.is_final(self._prev):
                    # case (1) [-2:not space][-1:final letter][cur:space]
                    self._final_char_logical_score += 1
                elif self.is_non_final(self._prev):
                    # case (2) [-2:not space][-1:Non-Final letter][cur:space]
                    self._final_char_visual_score += 1
            # Slide the two-byte look-behind window forward.
            self._before_prev, self._prev = self._prev, cur

        # Keep detecting until the data ends or both model probers report
        # NOT_ME (handled at the top of this method).
        return ProbingState.DETECTING

    @property
    def charset_name(self) -> str:
        """Return ``LOGICAL_HEBREW_NAME`` or ``VISUAL_HEBREW_NAME``.

        The decision is made in three steps: a final-letter score gap of at
        least ``MIN_FINAL_CHAR_DISTANCE`` wins outright; otherwise a model
        confidence gap larger than ``MIN_MODEL_DISTANCE`` decides; otherwise
        the sign of the final-letter gap decides, with logical as the
        default when there is no gap.
        """
        logical_prober = self._logical_prober
        visual_prober = self._visual_prober
        assert logical_prober is not None
        assert visual_prober is not None

        # Step 1: trust the final-letter evidence when it is dominant.
        letter_margin = self._final_char_logical_score - self._final_char_visual_score
        letter_threshold = self.MIN_FINAL_CHAR_DISTANCE
        if letter_margin >= letter_threshold:
            return self.LOGICAL_HEBREW_NAME
        if letter_margin <= -letter_threshold:
            return self.VISUAL_HEBREW_NAME

        # Step 2: fall back to the model probers' confidences.
        logical_confidence = logical_prober.get_confidence()
        visual_confidence = visual_prober.get_confidence()
        model_margin = logical_confidence - visual_confidence
        model_threshold = self.MIN_MODEL_DISTANCE
        if model_margin > model_threshold:
            return self.LOGICAL_HEBREW_NAME
        if model_margin < -model_threshold:
            return self.VISUAL_HEBREW_NAME

        # Step 3: use whatever the final-letter gap hints at; a positive gap
        # or no gap at all both resolve to logical.
        return (
            self.VISUAL_HEBREW_NAME
            if letter_margin < 0.0
            else self.LOGICAL_HEBREW_NAME
        )

    @property
    def language(self) -> str:
        """Return the language name, always ``"Hebrew"``."""
        return "Hebrew"

    @property
    def state(self) -> ProbingState:
        """Return ``NOT_ME`` once both model probers report ``NOT_ME``.

        As long as at least one model prober is in any other state this
        prober reports ``DETECTING``.
        """
        logical_prober = self._logical_prober
        visual_prober = self._visual_prober
        assert logical_prober is not None
        assert visual_prober is not None

        both_rejected = (
            logical_prober.state == ProbingState.NOT_ME
            and visual_prober.state == ProbingState.NOT_ME
        )
        return ProbingState.NOT_ME if both_rejected else ProbingState.DETECTING