Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/mbcssm.py: 98%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

44 statements  

1######################## BEGIN LICENSE BLOCK ######################## 

2# The Original Code is mozilla.org code. 

3# 

4# The Initial Developer of the Original Code is 

5# Netscape Communications Corporation. 

6# Portions created by the Initial Developer are Copyright (C) 1998 

7# the Initial Developer. All Rights Reserved. 

8# 

9# Contributor(s): 

10# Mark Pilgrim - port to Python 

11# 

12# This library is free software; you can redistribute it and/or 

13# modify it under the terms of the GNU Lesser General Public 

14# License as published by the Free Software Foundation; either 

15# version 2.1 of the License, or (at your option) any later version. 

16# 

17# This library is distributed in the hope that it will be useful, 

18# but WITHOUT ANY WARRANTY; without even the implied warranty of 

19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

20# Lesser General Public License for more details. 

21# 

22# You should have received a copy of the GNU Lesser General Public 

23# License along with this library; if not, see 

24# <https://www.gnu.org/licenses/>. 

25######################### END LICENSE BLOCK ######################### 

26 

27from .codingstatemachinedict import CodingStateMachineDict 

28from .enums import MachineState 

29 

30# BIG5 

31 

32# fmt: off 

# Big5 byte-class table: 256 entries, one per byte value. The trailing comment
# on each row gives the hex range of the 8 byte values on that row.
# Assignment as listed: class 0 = 0x0e, 0x0f, 0x1b, 0xff; class 1 = the rest of
# 0x00-0x3f plus 0x7f; class 2 = 0x40-0x7e; class 3 = 0xa1-0xfe;
# class 4 = 0x80-0xa0.
33BIG5_CLS = ( 

34 1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 #allow 0x00 as legal value 

35 1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f 

36 1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17 

37 1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f 

38 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27 

39 1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f 

40 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37 

41 1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f 

42 2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47 

43 2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f 

44 2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57 

45 2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f 

46 2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67 

47 2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f 

48 2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77 

49 2, 2, 2, 2, 2, 2, 2, 1, # 78 - 7f 

50 4, 4, 4, 4, 4, 4, 4, 4, # 80 - 87 

51 4, 4, 4, 4, 4, 4, 4, 4, # 88 - 8f 

52 4, 4, 4, 4, 4, 4, 4, 4, # 90 - 97 

53 4, 4, 4, 4, 4, 4, 4, 4, # 98 - 9f 

54 4, 3, 3, 3, 3, 3, 3, 3, # a0 - a7 

55 3, 3, 3, 3, 3, 3, 3, 3, # a8 - af 

56 3, 3, 3, 3, 3, 3, 3, 3, # b0 - b7 

57 3, 3, 3, 3, 3, 3, 3, 3, # b8 - bf 

58 3, 3, 3, 3, 3, 3, 3, 3, # c0 - c7 

59 3, 3, 3, 3, 3, 3, 3, 3, # c8 - cf 

60 3, 3, 3, 3, 3, 3, 3, 3, # d0 - d7 

61 3, 3, 3, 3, 3, 3, 3, 3, # d8 - df 

62 3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7 

63 3, 3, 3, 3, 3, 3, 3, 3, # e8 - ef 

64 3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7 

65 3, 3, 3, 3, 3, 3, 3, 0 # f8 - ff 

66) 

67 

# Big5 state-transition table, stored as one flat tuple of 24 entries.
# The trailing "#00-07" style comments are flat hex indices (8 entries per
# source row); they are not state numbers.
# NOTE(review): presumably looked up as state * class_factor + byte_class
# (class_factor is 5 in BIG5_SM_MODEL), with the bare integer 3 naming an
# intermediate state -- the lookup code is not in this file; confirm.
68BIG5_ST = ( 

69 MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07 

70 MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,#08-0f 

71 MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START#10-17 

72) 

73# fmt: on 

74 

# One entry per Big5 byte class (5 entries, classes 0-4).
# NOTE(review): presumably the byte length of a character whose first byte has
# that class -- confirm against the code that consumes this table.
75BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0) 

76 

# Bundles the Big5 tables into one model dict. "class_factor" (5) equals the
# number of entries in BIG5_CHAR_LEN_TABLE.
77BIG5_SM_MODEL: CodingStateMachineDict = { 

78 "class_table": BIG5_CLS, 

79 "class_factor": 5, 

80 "state_table": BIG5_ST, 

81 "char_len_table": BIG5_CHAR_LEN_TABLE, 

82 "name": "Big5", 

83} 

84 

85# CP949 

86# fmt: off 

87 

88""" 

89# Classes 

900: Unused 

911: 00-40, 5B-60, 7B-7F : Ascii 

922: C7-FD 

933: C9,FE : User-Defined Area 

944: 41-52 

955: 53-5A, 61-7A 

966: 81-A0 

977: A1-AC, B0-C5 

988: AD-AF 

999: C6 

100 

101# Byte 1 

102Ascii: 00-7F : 1 + 4 + 5 

103State 3: 81-AC, B0-C5 : 6 + 7 

104State 4: AD-AF : 8 

105State 5: C6 : 9 

106State 6: C7-FE : 2 (+ 3) 

107 

108 

109# Byte 2 

110State 3: 41-5A, 61-7A, 81-FE : 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 

111State 4: 41-5A, 61-7A, 81-A0 : 4 + 5 + 6 

112State 5: 41-52, A1-FE : 2 + 3 + 4 + 7 + 8 + 9 

113State 6: A1-FE : 2 + 3 + 7 + 8 + 9 

114""" 

115 

# CP949 byte-class table: 256 entries, 16 byte values per source row (hex range
# in the trailing comment), classes 0-9. The class legend is the string literal
# that precedes this table. Per the rows below, class 0 holds 0x0e, 0x0f, 0x1b,
# 0x80 and 0xff, and 0xc9 is class 3 even though it lies inside the legend's
# "C7-FD" range for class 2.
116CP949_CLS = ( 

117 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, # 00 - 0f 

118 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, # 10 - 1f 

119 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 2f 

120 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 3f 

121 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, # 40 - 4f 

122 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, # 50 - 5f 

123 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, # 60 - 6f 

124 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, # 70 - 7f 

125 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, # 80 - 8f 

126 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, # 90 - 9f 

127 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, # a0 - af 

128 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, # b0 - bf 

129 7, 7, 7, 7, 7, 7, 9, 2, 2, 3, 2, 2, 2, 2, 2, 2, # c0 - cf 

130 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # d0 - df 

131 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # e0 - ef 

132 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 0, # f0 - ff 

133) 

134 

# CP949 state-transition table: one source row per current state (named in the
# trailing comment) and one column per byte class 0-9 (header comment). Each
# entry is the state entered next; bare integers 3-6 are the lead-byte states
# described in the legend string above the class table.
135CP949_ST = ( 

136 # 0 1 2 3 4 5 6 7 8 9 

137 MachineState.ERROR, MachineState.START, 6, MachineState.ERROR, MachineState.START, MachineState.START, 3, 3, 4, 5, # START 

138 MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # ERROR 

139 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # ITSME 

140 MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, # 3 

141 MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, # 4 

142 MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, # 5 

143 MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, # 6 

144) 

145# fmt: on 

146 

# One entry per CP949 byte class (10 entries, classes 0-9).
# NOTE(review): presumably the byte length of a character whose first byte has
# that class -- confirm against the code that consumes this table.
147CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 2, 2) 

148 

# Bundles the CP949 tables into one model dict. "class_factor" (10) equals the
# number of entries in CP949_CHAR_LEN_TABLE and the column count of CP949_ST.
149CP949_SM_MODEL: CodingStateMachineDict = { 

150 "class_table": CP949_CLS, 

151 "class_factor": 10, 

152 "state_table": CP949_ST, 

153 "char_len_table": CP949_CHAR_LEN_TABLE, 

154 "name": "CP949", 

155} 

156 

157# EUC-JP 

158# fmt: off 

# EUC-JP byte-class table: 256 entries, one per byte value, 8 per source row
# (hex range in the trailing comment).
# Assignment as listed: class 4 = 0x00-0x7f except 0x0e, 0x0f, 0x1b;
# class 5 = 0x0e, 0x0f, 0x1b, 0x80-0x8d, 0x90-0xa0, 0xff; class 1 = 0x8e;
# class 3 = 0x8f; class 2 = 0xa1-0xdf; class 0 = 0xe0-0xfe.
159EUCJP_CLS = ( 

160 4, 4, 4, 4, 4, 4, 4, 4, # 00 - 07 

161 4, 4, 4, 4, 4, 4, 5, 5, # 08 - 0f 

162 4, 4, 4, 4, 4, 4, 4, 4, # 10 - 17 

163 4, 4, 4, 5, 4, 4, 4, 4, # 18 - 1f 

164 4, 4, 4, 4, 4, 4, 4, 4, # 20 - 27 

165 4, 4, 4, 4, 4, 4, 4, 4, # 28 - 2f 

166 4, 4, 4, 4, 4, 4, 4, 4, # 30 - 37 

167 4, 4, 4, 4, 4, 4, 4, 4, # 38 - 3f 

168 4, 4, 4, 4, 4, 4, 4, 4, # 40 - 47 

169 4, 4, 4, 4, 4, 4, 4, 4, # 48 - 4f 

170 4, 4, 4, 4, 4, 4, 4, 4, # 50 - 57 

171 4, 4, 4, 4, 4, 4, 4, 4, # 58 - 5f 

172 4, 4, 4, 4, 4, 4, 4, 4, # 60 - 67 

173 4, 4, 4, 4, 4, 4, 4, 4, # 68 - 6f 

174 4, 4, 4, 4, 4, 4, 4, 4, # 70 - 77 

175 4, 4, 4, 4, 4, 4, 4, 4, # 78 - 7f 

176 5, 5, 5, 5, 5, 5, 5, 5, # 80 - 87 

177 5, 5, 5, 5, 5, 5, 1, 3, # 88 - 8f 

178 5, 5, 5, 5, 5, 5, 5, 5, # 90 - 97 

179 5, 5, 5, 5, 5, 5, 5, 5, # 98 - 9f 

180 5, 2, 2, 2, 2, 2, 2, 2, # a0 - a7 

181 2, 2, 2, 2, 2, 2, 2, 2, # a8 - af 

182 2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7 

183 2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf 

184 2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7 

185 2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf 

186 2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7 

187 2, 2, 2, 2, 2, 2, 2, 2, # d8 - df 

188 0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7 

189 0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef 

190 0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7 

191 0, 0, 0, 0, 0, 0, 0, 5 # f8 - ff 

192) 

193 

# EUC-JP state-transition table, stored as one flat tuple of 40 entries.
# The trailing "#00-07" style comments are flat hex indices (8 entries per
# source row); they are not state numbers.
# NOTE(review): presumably looked up as state * class_factor + byte_class
# (class_factor is 6 in EUCJP_SM_MODEL), with the bare integers 3-5 naming
# intermediate states -- the lookup code is not in this file; confirm.
194EUCJP_ST = ( 

195 3, 4, 3, 5,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07 

196 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f 

197 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#10-17 

198 MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,#18-1f 

199 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START#20-27 

200) 

201# fmt: on 

202 

# One entry per EUC-JP byte class (6 entries, classes 0-5).
# NOTE(review): presumably the byte length of a character whose first byte has
# that class -- confirm against the code that consumes this table.
203EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0) 

204 

# Bundles the EUC-JP tables into one model dict. "class_factor" (6) equals the
# number of entries in EUCJP_CHAR_LEN_TABLE.
205EUCJP_SM_MODEL: CodingStateMachineDict = { 

206 "class_table": EUCJP_CLS, 

207 "class_factor": 6, 

208 "state_table": EUCJP_ST, 

209 "char_len_table": EUCJP_CHAR_LEN_TABLE, 

210 "name": "EUC-JP", 

211} 

212 

213# EUC-KR 

214# fmt: off 

# EUC-KR byte-class table: 256 entries, one per byte value, 8 per source row
# (hex range in the trailing comment).
# Assignment as listed: class 0 = 0x0e, 0x0f, 0x1b, 0x80-0xa0, 0xff;
# class 1 = the rest of 0x00-0x7f; class 3 = 0xad-0xaf and 0xc9;
# class 2 = the rest of 0xa1-0xfe.
215EUCKR_CLS = ( 

216 1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 

217 1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f 

218 1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17 

219 1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f 

220 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27 

221 1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f 

222 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37 

223 1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f 

224 1, 1, 1, 1, 1, 1, 1, 1, # 40 - 47 

225 1, 1, 1, 1, 1, 1, 1, 1, # 48 - 4f 

226 1, 1, 1, 1, 1, 1, 1, 1, # 50 - 57 

227 1, 1, 1, 1, 1, 1, 1, 1, # 58 - 5f 

228 1, 1, 1, 1, 1, 1, 1, 1, # 60 - 67 

229 1, 1, 1, 1, 1, 1, 1, 1, # 68 - 6f 

230 1, 1, 1, 1, 1, 1, 1, 1, # 70 - 77 

231 1, 1, 1, 1, 1, 1, 1, 1, # 78 - 7f 

232 0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87 

233 0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f 

234 0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97 

235 0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f 

236 0, 2, 2, 2, 2, 2, 2, 2, # a0 - a7 

237 2, 2, 2, 2, 2, 3, 3, 3, # a8 - af 

238 2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7 

239 2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf 

240 2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7 

241 2, 3, 2, 2, 2, 2, 2, 2, # c8 - cf 

242 2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7 

243 2, 2, 2, 2, 2, 2, 2, 2, # d8 - df 

244 2, 2, 2, 2, 2, 2, 2, 2, # e0 - e7 

245 2, 2, 2, 2, 2, 2, 2, 2, # e8 - ef 

246 2, 2, 2, 2, 2, 2, 2, 2, # f0 - f7 

247 2, 2, 2, 2, 2, 2, 2, 0 # f8 - ff 

248) 

249 

# EUC-KR state-transition table, stored as one flat tuple of 16 entries.
# The trailing "#00-07" style comments are flat hex indices (8 entries per
# source row); they are not state numbers.
# NOTE(review): presumably looked up as state * class_factor + byte_class
# (class_factor is 4 in EUCKR_SM_MODEL), with the bare integer 3 naming an
# intermediate state -- the lookup code is not in this file; confirm.
250EUCKR_ST = ( 

251 MachineState.ERROR,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07 

252 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #08-0f 

253) 

254# fmt: on 

255 

# One entry per EUC-KR byte class (4 entries, classes 0-3).
# NOTE(review): presumably the byte length of a character whose first byte has
# that class -- confirm against the code that consumes this table.
256EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0) 

257 

# Bundles the EUC-KR tables into one model dict. "class_factor" (4) equals the
# number of entries in EUCKR_CHAR_LEN_TABLE.
258EUCKR_SM_MODEL: CodingStateMachineDict = { 

259 "class_table": EUCKR_CLS, 

260 "class_factor": 4, 

261 "state_table": EUCKR_ST, 

262 "char_len_table": EUCKR_CHAR_LEN_TABLE, 

263 "name": "EUC-KR", 

264} 

265 

266# JOHAB 

267# fmt: off 

# Johab byte-class table: 256 entries, one per byte value, 8 per source row
# (hex range in the trailing comment), classes 0-9.
# Assignment as listed: class 0 = 0x0e, 0x0f, 0x1b, 0xff; class 4 = the rest of
# 0x00-0x30; class 3 = 0x31-0x40; class 1 = 0x41-0x7e; class 2 = 0x7f;
# class 6 = 0x80-0x83; class 8 = 0x84-0x90; class 7 = 0x91-0xd3;
# class 5 = 0xd4-0xd8, 0xdf, 0xfa-0xfe; class 9 = 0xd9-0xde, 0xe0-0xf9.
268JOHAB_CLS = ( 

269 4,4,4,4,4,4,4,4, # 00 - 07 

270 4,4,4,4,4,4,0,0, # 08 - 0f 

271 4,4,4,4,4,4,4,4, # 10 - 17 

272 4,4,4,0,4,4,4,4, # 18 - 1f 

273 4,4,4,4,4,4,4,4, # 20 - 27 

274 4,4,4,4,4,4,4,4, # 28 - 2f 

275 4,3,3,3,3,3,3,3, # 30 - 37 

276 3,3,3,3,3,3,3,3, # 38 - 3f 

277 3,1,1,1,1,1,1,1, # 40 - 47 

278 1,1,1,1,1,1,1,1, # 48 - 4f 

279 1,1,1,1,1,1,1,1, # 50 - 57 

280 1,1,1,1,1,1,1,1, # 58 - 5f 

281 1,1,1,1,1,1,1,1, # 60 - 67 

282 1,1,1,1,1,1,1,1, # 68 - 6f 

283 1,1,1,1,1,1,1,1, # 70 - 77 

284 1,1,1,1,1,1,1,2, # 78 - 7f 

285 6,6,6,6,8,8,8,8, # 80 - 87 

286 8,8,8,8,8,8,8,8, # 88 - 8f 

287 8,7,7,7,7,7,7,7, # 90 - 97 

288 7,7,7,7,7,7,7,7, # 98 - 9f 

289 7,7,7,7,7,7,7,7, # a0 - a7 

290 7,7,7,7,7,7,7,7, # a8 - af 

291 7,7,7,7,7,7,7,7, # b0 - b7 

292 7,7,7,7,7,7,7,7, # b8 - bf 

293 7,7,7,7,7,7,7,7, # c0 - c7 

294 7,7,7,7,7,7,7,7, # c8 - cf 

295 7,7,7,7,5,5,5,5, # d0 - d7 

296 5,9,9,9,9,9,9,5, # d8 - df 

297 9,9,9,9,9,9,9,9, # e0 - e7 

298 9,9,9,9,9,9,9,9, # e8 - ef 

299 9,9,9,9,9,9,9,9, # f0 - f7 

300 9,9,5,5,5,5,5,0 # f8 - ff 

301) 

302 

303JOHAB_ST = ( 

304# cls = 0 1 2 3 4 5 6 7 8 9 

305 MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,3 ,3 ,4 , # MachineState.START 

306 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # MachineState.ITS_ME 

307 MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR ,MachineState.ERROR , # MachineState.ERROR 

308 MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.ERROR ,MachineState.ERROR ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START ,MachineState.START , # 3 

309 MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START ,MachineState.ERROR ,MachineState.START , # 4 

310) 

311# fmt: on 

312 

# One entry per Johab byte class (10 entries, classes 0-9).
# NOTE(review): presumably the byte length of a character whose first byte has
# that class -- confirm against the code that consumes this table.
313JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2) 

314 

# Bundles the Johab tables into one model dict. "class_factor" (10) equals the
# number of entries in JOHAB_CHAR_LEN_TABLE and the column count of JOHAB_ST.
315JOHAB_SM_MODEL: CodingStateMachineDict = { 

316 "class_table": JOHAB_CLS, 

317 "class_factor": 10, 

318 "state_table": JOHAB_ST, 

319 "char_len_table": JOHAB_CHAR_LEN_TABLE, 

320 "name": "Johab", 

321} 

322 

323# GB2312 - REMOVED 

324# GB2312 is a subset of GB18030. The GB18030 state machine and prober now 

325# correctly detect GB2312 content with the same confidence as the old GB2312 

326# prober (both use GB2312DistributionAnalysis). The LEGACY_MAP renames 

327# GB2312 → GB18030 for backward compatibility. 

328# Having both probers was redundant after fixing GB18030's char_len_table. 

329 

330# GB18030 

331# GB18030 is a superset of GB2312 and GBK 

332# It supports: 

333# - 1-byte: ASCII (0x00-0x7F) 

334# - 2-byte: lead 0x81-0xFE, trail 0x40-0x7E or 0x80-0xFE (GBK/GB2312 compatible) 

335# - 4-byte: 0x81-0xFE, 0x30-0x39, 0x81-0xFE, 0x30-0x39 (GB18030 extension) 

336# 

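337# Byte classes (as assigned in GB18030_CLS below): 

338# 0: Invalid (0x0E, 0x0F, 0x1B, 0xFF) 

339# 1: Remaining single-byte values in 0x00-0x3F (everything there not in class 0 or 2) 

340# 2: Digit 0x30-0x39 (can be 2nd or 4th byte in 4-byte sequence) 

341# 3: 0x40-0x7E (a single byte on its own, or a valid 2-byte trail) 

342# 4: 0x7F (a single byte on its own; not valid as a trail byte) 

343# 5: 0x80 (valid 2-byte trail only; cannot start a sequence) 

344# 6: 0x81-0xFE - can start a 2-byte or 4-byte sequence, be a 2-byte trail, or be the 3rd byte in a 4-byte sequence 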

345 

346# fmt: off 

# GB18030 byte-class table: 256 entries, one per byte value, 8 per source row
# (hex range in the trailing comment), classes 0-6.
# Assignment as listed: class 0 = 0x0e, 0x0f, 0x1b, 0xff; class 2 = 0x30-0x39;
# class 1 = the rest of 0x00-0x3f; class 3 = 0x40-0x7e; class 4 = 0x7f;
# class 5 = 0x80; class 6 = 0x81-0xfe.
347GB18030_CLS = ( 

348 1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 

349 1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f 

350 1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17 

351 1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f 

352 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27 

353 1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f 

354 2, 2, 2, 2, 2, 2, 2, 2, # 30 - 37 

355 2, 2, 1, 1, 1, 1, 1, 1, # 38 - 3f 

356 3, 3, 3, 3, 3, 3, 3, 3, # 40 - 47 

357 3, 3, 3, 3, 3, 3, 3, 3, # 48 - 4f 

358 3, 3, 3, 3, 3, 3, 3, 3, # 50 - 57 

359 3, 3, 3, 3, 3, 3, 3, 3, # 58 - 5f 

360 3, 3, 3, 3, 3, 3, 3, 3, # 60 - 67 

361 3, 3, 3, 3, 3, 3, 3, 3, # 68 - 6f 

362 3, 3, 3, 3, 3, 3, 3, 3, # 70 - 77 

363 3, 3, 3, 3, 3, 3, 3, 4, # 78 - 7f 

364 5, 6, 6, 6, 6, 6, 6, 6, # 80 - 87 0x80 can be trail byte (class 5) 

365 6, 6, 6, 6, 6, 6, 6, 6, # 88 - 8f 

366 6, 6, 6, 6, 6, 6, 6, 6, # 90 - 97 

367 6, 6, 6, 6, 6, 6, 6, 6, # 98 - 9f 

368 6, 6, 6, 6, 6, 6, 6, 6, # a0 - a7 

369 6, 6, 6, 6, 6, 6, 6, 6, # a8 - af 

370 6, 6, 6, 6, 6, 6, 6, 6, # b0 - b7 

371 6, 6, 6, 6, 6, 6, 6, 6, # b8 - bf 

372 6, 6, 6, 6, 6, 6, 6, 6, # c0 - c7 

373 6, 6, 6, 6, 6, 6, 6, 6, # c8 - cf 

374 6, 6, 6, 6, 6, 6, 6, 6, # d0 - d7 

375 6, 6, 6, 6, 6, 6, 6, 6, # d8 - df 

376 6, 6, 6, 6, 6, 6, 6, 6, # e0 - e7 

377 6, 6, 6, 6, 6, 6, 6, 6, # e8 - ef 

378 6, 6, 6, 6, 6, 6, 6, 6, # f0 - f7 

379 6, 6, 6, 6, 6, 6, 6, 0 # f8 - ff 0xFF is invalid 

380) 

381 

382# States: 

383# START (0): Initial state 

384# ERROR (1): Error state 

385# ITS_ME (2): Definitive match 

386# FIRST (3): After receiving lead byte (0x81-0xFE) 

387# SECOND_4BYTE (4): After digit as 2nd byte in potential 4-byte sequence 

388# THIRD_4BYTE (5): After 3rd byte (0x81-0xFE) in 4-byte sequence 

# GB18030 state-transition table: one source row per current state (named and
# numbered in the trailing comment) and one column per byte class 0-6 (header
# comment). Each entry is the state entered next. A 4-byte sequence walks
# START -> 3 -> 4 -> 5 -> START; a 2-byte sequence walks START -> 3 -> START.
389GB18030_ST = ( 

390# cls: 0 1 2 3 4 5 6 

391 MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START,MachineState.ERROR, 3, # START (0) 

392 MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # ERROR (1) 

393 MachineState.ITS_ME, MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # ITS_ME (2) 

394 MachineState.ERROR, MachineState.ERROR, 4,MachineState.START, MachineState.ERROR,MachineState.START,MachineState.START, # FIRST (3): classes 3, 5, 6 complete a 2-byte sequence; a digit (class 2) continues a 4-byte one 

395 MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,MachineState.ERROR, 5, # SECOND_4BYTE (4): after the digit 2nd byte; only a class-6 3rd byte is accepted 

396 MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # THIRD_4BYTE (5): after 3rd byte; only a digit 4th byte is accepted 

397) 

398# fmt: on 

399 

400# Character length table for distribution analysis 

401# Class 6 (lead byte) is marked as 2 bytes since that's the most common case 

402# (2-byte GB2312/GBK sequences). 4-byte sequences will be detected by the state 

403# machine but won't contribute to character distribution analysis. 

# One entry per byte class (7 entries, classes 0-6). Class 5 (0x80) is also
# listed as 2 even though GB18030_ST rejects it as a first byte.
404GB18030_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 2, 2) 

405 

# Bundles the GB18030 tables into one model dict. "class_factor" (7) equals the
# number of entries in GB18030_CHAR_LEN_TABLE and the column count of
# GB18030_ST.
406GB18030_SM_MODEL: CodingStateMachineDict = { 

407 "class_table": GB18030_CLS, 

408 "class_factor": 7, 

409 "state_table": GB18030_ST, 

410 "char_len_table": GB18030_CHAR_LEN_TABLE, 

411 "name": "GB18030", 

412} 

413 

414# Shift_JIS 

415# fmt: off 

# Shift_JIS byte-class table: 256 entries, one per byte value, 8 per source row
# (hex range in the trailing comment).
# Assignment as listed: class 0 = 0x0e, 0x0f, 0x1b, 0xfd-0xff; class 1 = the
# rest of 0x00-0x3f plus 0x7f; class 2 = 0x40-0x7e, 0x85, 0x86, 0xa0-0xdf;
# class 3 = 0x80-0x84, 0x87-0x9f, 0xe0-0xec, 0xf0-0xfc; class 4 = 0xed-0xef.
416SJIS_CLS = ( 

417 1, 1, 1, 1, 1, 1, 1, 1, # 00 - 07 

418 1, 1, 1, 1, 1, 1, 0, 0, # 08 - 0f 

419 1, 1, 1, 1, 1, 1, 1, 1, # 10 - 17 

420 1, 1, 1, 0, 1, 1, 1, 1, # 18 - 1f 

421 1, 1, 1, 1, 1, 1, 1, 1, # 20 - 27 

422 1, 1, 1, 1, 1, 1, 1, 1, # 28 - 2f 

423 1, 1, 1, 1, 1, 1, 1, 1, # 30 - 37 

424 1, 1, 1, 1, 1, 1, 1, 1, # 38 - 3f 

425 2, 2, 2, 2, 2, 2, 2, 2, # 40 - 47 

426 2, 2, 2, 2, 2, 2, 2, 2, # 48 - 4f 

427 2, 2, 2, 2, 2, 2, 2, 2, # 50 - 57 

428 2, 2, 2, 2, 2, 2, 2, 2, # 58 - 5f 

429 2, 2, 2, 2, 2, 2, 2, 2, # 60 - 67 

430 2, 2, 2, 2, 2, 2, 2, 2, # 68 - 6f 

431 2, 2, 2, 2, 2, 2, 2, 2, # 70 - 77 

432 2, 2, 2, 2, 2, 2, 2, 1, # 78 - 7f 

433 3, 3, 3, 3, 3, 2, 2, 3, # 80 - 87 

434 3, 3, 3, 3, 3, 3, 3, 3, # 88 - 8f 

435 3, 3, 3, 3, 3, 3, 3, 3, # 90 - 97 

436 3, 3, 3, 3, 3, 3, 3, 3, # 98 - 9f 

437 # 0xa0 is illegal in the Shift_JIS encoding, but some pages do 

438 # contain this byte, so we need to be more forgiving of errors. 

439 2, 2, 2, 2, 2, 2, 2, 2, # a0 - a7 

440 2, 2, 2, 2, 2, 2, 2, 2, # a8 - af 

441 2, 2, 2, 2, 2, 2, 2, 2, # b0 - b7 

442 2, 2, 2, 2, 2, 2, 2, 2, # b8 - bf 

443 2, 2, 2, 2, 2, 2, 2, 2, # c0 - c7 

444 2, 2, 2, 2, 2, 2, 2, 2, # c8 - cf 

445 2, 2, 2, 2, 2, 2, 2, 2, # d0 - d7 

446 2, 2, 2, 2, 2, 2, 2, 2, # d8 - df 

447 3, 3, 3, 3, 3, 3, 3, 3, # e0 - e7 

448 3, 3, 3, 3, 3, 4, 4, 4, # e8 - ef 

449 3, 3, 3, 3, 3, 3, 3, 3, # f0 - f7 

450 3, 3, 3, 3, 3, 0, 0, 0, # f8 - ff 

451) 

452 

# Shift_JIS state-transition table, stored as one flat tuple of 24 entries.
# The trailing "#00-07" style comments are flat hex indices (8 entries per
# source row); they are not state numbers.
# NOTE(review): presumably looked up as state * class_factor + byte_class
# (class_factor is 6 in SJIS_SM_MODEL), with the bare integer 3 naming an
# intermediate state -- the lookup code is not in this file; confirm.
453SJIS_ST = ( 

454 MachineState.ERROR,MachineState.START,MachineState.START, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,#00-07 

455 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f 

456 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START #10-17 

457) 

458# fmt: on 

459 

# Six entries, matching "class_factor" below; SJIS_CLS itself only assigns
# classes 0-4, so the last entry has no byte mapped to it.
# NOTE(review): presumably the byte length of a character whose first byte has
# that class -- confirm against the code that consumes this table.
460SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0) 

461 

# Bundles the Shift_JIS tables into one model dict. "class_factor" (6) equals
# the number of entries in SJIS_CHAR_LEN_TABLE.
462SJIS_SM_MODEL: CodingStateMachineDict = { 

463 "class_table": SJIS_CLS, 

464 "class_factor": 6, 

465 "state_table": SJIS_ST, 

466 "char_len_table": SJIS_CHAR_LEN_TABLE, 

467 "name": "Shift_JIS", 

468} 

469 

470# UCS2-BE 

471# fmt: off 

# UCS2-BE byte-class table: 256 entries, one per byte value, 8 per source row
# (hex range in the trailing comment).
# Assignment as listed: class 1 = 0x0a; class 2 = 0x0d; class 3 = 0x1b and
# 0x29-0x2d; class 4 = 0xfe; class 5 = 0xff; class 0 = every other byte.
472UCS2BE_CLS = ( 

473 0, 0, 0, 0, 0, 0, 0, 0, # 00 - 07 

474 0, 0, 1, 0, 0, 2, 0, 0, # 08 - 0f 

475 0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17 

476 0, 0, 0, 3, 0, 0, 0, 0, # 18 - 1f 

477 0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27 

478 0, 3, 3, 3, 3, 3, 0, 0, # 28 - 2f 

479 0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37 

480 0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f 

481 0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47 

482 0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f 

483 0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57 

484 0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f 

485 0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67 

486 0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f 

487 0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77 

488 0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f 

489 0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87 

490 0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f 

491 0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97 

492 0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f 

493 0, 0, 0, 0, 0, 0, 0, 0, # a0 - a7 

494 0, 0, 0, 0, 0, 0, 0, 0, # a8 - af 

495 0, 0, 0, 0, 0, 0, 0, 0, # b0 - b7 

496 0, 0, 0, 0, 0, 0, 0, 0, # b8 - bf 

497 0, 0, 0, 0, 0, 0, 0, 0, # c0 - c7 

498 0, 0, 0, 0, 0, 0, 0, 0, # c8 - cf 

499 0, 0, 0, 0, 0, 0, 0, 0, # d0 - d7 

500 0, 0, 0, 0, 0, 0, 0, 0, # d8 - df 

501 0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7 

502 0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef 

503 0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7 

504 0, 0, 0, 0, 0, 0, 4, 5 # f8 - ff 

505) 

506 

# UCS2-BE state-transition table, stored as one flat tuple of 56 entries.
# The trailing "#00-07" style comments are flat hex indices (8 entries per
# source row); they are not state numbers.
# NOTE(review): presumably looked up as state * class_factor + byte_class
# (class_factor is 6 in UCS2BE_SM_MODEL), with the bare integers 3-8 naming
# intermediate states -- the lookup code is not in this file; confirm.
507UCS2BE_ST = ( 

508 5, 7, 7,MachineState.ERROR, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07 

509 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f 

510 MachineState.ITS_ME,MachineState.ITS_ME, 6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,#10-17 

511 6, 6, 6, 6, 6,MachineState.ITS_ME, 6, 6,#18-1f 

512 6, 6, 6, 6, 5, 7, 7,MachineState.ERROR,#20-27 

513 5, 8, 6, 6,MachineState.ERROR, 6, 6, 6,#28-2f 

514 6, 6, 6, 6,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START #30-37 

515) 

516# fmt: on 

517 

# One entry per UCS2-BE byte class (6 entries, classes 0-5).
# NOTE(review): presumably the byte length of a character whose first byte has
# that class -- confirm against the code that consumes this table.
518UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2) 

519 

# Bundles the UCS2-BE tables into one model dict; note the reported name is
# "UTF-16BE". "class_factor" (6) equals the number of entries in
# UCS2BE_CHAR_LEN_TABLE.
520UCS2BE_SM_MODEL: CodingStateMachineDict = { 

521 "class_table": UCS2BE_CLS, 

522 "class_factor": 6, 

523 "state_table": UCS2BE_ST, 

524 "char_len_table": UCS2BE_CHAR_LEN_TABLE, 

525 "name": "UTF-16BE", 

526} 

527 

528# UCS2-LE 

529# fmt: off 

# UCS2-LE byte-class table: 256 entries, one per byte value, 8 per source row
# (hex range in the trailing comment). The values are the same as UCS2BE_CLS.
# Assignment as listed: class 1 = 0x0a; class 2 = 0x0d; class 3 = 0x1b and
# 0x29-0x2d; class 4 = 0xfe; class 5 = 0xff; class 0 = every other byte.
530UCS2LE_CLS = ( 

531 0, 0, 0, 0, 0, 0, 0, 0, # 00 - 07 

532 0, 0, 1, 0, 0, 2, 0, 0, # 08 - 0f 

533 0, 0, 0, 0, 0, 0, 0, 0, # 10 - 17 

534 0, 0, 0, 3, 0, 0, 0, 0, # 18 - 1f 

535 0, 0, 0, 0, 0, 0, 0, 0, # 20 - 27 

536 0, 3, 3, 3, 3, 3, 0, 0, # 28 - 2f 

537 0, 0, 0, 0, 0, 0, 0, 0, # 30 - 37 

538 0, 0, 0, 0, 0, 0, 0, 0, # 38 - 3f 

539 0, 0, 0, 0, 0, 0, 0, 0, # 40 - 47 

540 0, 0, 0, 0, 0, 0, 0, 0, # 48 - 4f 

541 0, 0, 0, 0, 0, 0, 0, 0, # 50 - 57 

542 0, 0, 0, 0, 0, 0, 0, 0, # 58 - 5f 

543 0, 0, 0, 0, 0, 0, 0, 0, # 60 - 67 

544 0, 0, 0, 0, 0, 0, 0, 0, # 68 - 6f 

545 0, 0, 0, 0, 0, 0, 0, 0, # 70 - 77 

546 0, 0, 0, 0, 0, 0, 0, 0, # 78 - 7f 

547 0, 0, 0, 0, 0, 0, 0, 0, # 80 - 87 

548 0, 0, 0, 0, 0, 0, 0, 0, # 88 - 8f 

549 0, 0, 0, 0, 0, 0, 0, 0, # 90 - 97 

550 0, 0, 0, 0, 0, 0, 0, 0, # 98 - 9f 

551 0, 0, 0, 0, 0, 0, 0, 0, # a0 - a7 

552 0, 0, 0, 0, 0, 0, 0, 0, # a8 - af 

553 0, 0, 0, 0, 0, 0, 0, 0, # b0 - b7 

554 0, 0, 0, 0, 0, 0, 0, 0, # b8 - bf 

555 0, 0, 0, 0, 0, 0, 0, 0, # c0 - c7 

556 0, 0, 0, 0, 0, 0, 0, 0, # c8 - cf 

557 0, 0, 0, 0, 0, 0, 0, 0, # d0 - d7 

558 0, 0, 0, 0, 0, 0, 0, 0, # d8 - df 

559 0, 0, 0, 0, 0, 0, 0, 0, # e0 - e7 

560 0, 0, 0, 0, 0, 0, 0, 0, # e8 - ef 

561 0, 0, 0, 0, 0, 0, 0, 0, # f0 - f7 

562 0, 0, 0, 0, 0, 0, 4, 5 # f8 - ff 

563) 

564 

# UCS2-LE state-transition table, stored as one flat tuple of 56 entries.
# The trailing "#00-07" style comments are flat hex indices (8 entries per
# source row); they are not state numbers.
# NOTE(review): presumably looked up as state * class_factor + byte_class
# (class_factor is 6 in UCS2LE_SM_MODEL), with the bare integers 3-8 naming
# intermediate states -- the lookup code is not in this file; confirm.
565UCS2LE_ST = ( 

566 6, 6, 7, 6, 4, 3,MachineState.ERROR,MachineState.ERROR,#00-07 

567 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,#08-0f 

568 MachineState.ITS_ME,MachineState.ITS_ME, 5, 5, 5,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,#10-17 

569 5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR, 6, 6,#18-1f 

570 7, 6, 8, 8, 5, 5, 5,MachineState.ERROR,#20-27 

571 5, 5, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 5,#28-2f 

572 5, 5, 5,MachineState.ERROR, 5,MachineState.ERROR,MachineState.START,MachineState.START #30-37 

573) 

574# fmt: on 

575 

# One entry per UCS2-LE byte class (6 entries, classes 0-5); every class is 2.
# NOTE(review): presumably the byte length of a character whose first byte has
# that class -- confirm against the code that consumes this table.
576UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2) 

577 

# Bundles the UCS2-LE tables into one model dict; note the reported name is
# "UTF-16LE". "class_factor" (6) equals the number of entries in
# UCS2LE_CHAR_LEN_TABLE.
578UCS2LE_SM_MODEL: CodingStateMachineDict = { 

579 "class_table": UCS2LE_CLS, 

580 "class_factor": 6, 

581 "state_table": UCS2LE_ST, 

582 "char_len_table": UCS2LE_CHAR_LEN_TABLE, 

583 "name": "UTF-16LE", 

584} 

585 

586# UTF-8 

587# Adapted from Björn Höhrmann's DFA UTF-8 decoder 

588# See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 

589# Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de> 

590# fmt: off 

# UTF-8 byte-class table: 256 entries, 16 byte values per source row (hex range
# in the trailing comment), classes 0-11.
# Assignment as listed: class 0 = 0x00-0x7f; class 1 = 0x80-0x8f;
# class 9 = 0x90-0x9f; class 7 = 0xa0-0xbf; class 8 = 0xc0, 0xc1, 0xf5-0xff;
# class 2 = 0xc2-0xdf; class 10 = 0xe0; class 3 = 0xe1-0xec, 0xee, 0xef;
# class 4 = 0xed; class 11 = 0xf0; class 6 = 0xf1-0xf3; class 5 = 0xf4.
591UTF8_CLS = ( 

592 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00-0f 

593 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 10-1f 

594 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20-2f 

595 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 30-3f 

596 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40-4f 

597 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 50-5f 

598 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60-6f 

599 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 70-7f 

600 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 80-8f 

601 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, # 90-9f 

602 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, # a0-af 

603 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, # b0-bf 

604 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # c0-cf 

605 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # d0-df 

606 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, # e0-ef 

607 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, # f0-ff 

608) 

609 

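610# Höhrmann's DFA has nine states (0,12,24,36,48,60,72,84,96): ACCEPT, REJECT and seven intermediate states. 

611# Here state 0=ACCEPT (START), state 1=REJECT (ERROR), state 2=ITS_ME (an extra row with no counterpart in that list), and states 3-9 are the seven intermediate states 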

# UTF-8 state-transition table: one source row per current state (numbered in
# the trailing comment, ten rows for states 0-9) and 12 entries per row, one
# per byte class 0-11 of UTF8_CLS. Each entry is the state entered next; bare
# integers 3-9 are the intermediate (mid-sequence) states.
612UTF8_ST = ( 

613 MachineState.START,MachineState.ERROR, 3, 4, 6, 9, 8,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5, 7, # state 0 (START) 

614 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # state 1 (ERROR) 

615 MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME, # state 2 (ITS_ME) 

616 MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.START,MachineState.ERROR,MachineState.ERROR, # state 3 

617 MachineState.ERROR, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR, 3,MachineState.ERROR,MachineState.ERROR, # state 4 

618 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # state 5 

619 MachineState.ERROR, 3,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 3,MachineState.ERROR,MachineState.ERROR, # state 6 

620 MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR, # state 7 

621 MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR, # state 8 

622 MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, # state 9 

623) 

624# fmt: on 

625 

# One entry per UTF-8 byte class (12 entries, classes 0-11).
# NOTE(review): presumably the byte length of a character whose first byte has
# that class -- confirm against the code that consumes this table.
626UTF8_CHAR_LEN_TABLE = (1, 1, 2, 3, 3, 4, 4, 1, 1, 1, 3, 4) 

627 

# Bundles the UTF-8 tables into one model dict. "class_factor" (12) equals the
# number of entries in UTF8_CHAR_LEN_TABLE and the row width of UTF8_ST.
628UTF8_SM_MODEL: CodingStateMachineDict = { 

629 "class_table": UTF8_CLS, 

630 "class_factor": 12, 

631 "state_table": UTF8_ST, 

632 "char_len_table": UTF8_CHAR_LEN_TABLE, 

633 "name": "UTF-8", 

634}