/src/kcodecs/src/probers/nsMBCSGroupProber.cpp
Line | Count | Source |
1 | | /* -*- C++ -*- |
2 | | SPDX-FileCopyrightText: 1998 Netscape Communications Corporation <developer@mozilla.org> |
3 | | |
4 | | SPDX-License-Identifier: MIT |
5 | | */ |
6 | | |
7 | | #include "nsMBCSGroupProber.h" |
8 | | |
9 | | #include "UnicodeGroupProber.h" |
10 | | #include "nsBig5Prober.h" |
11 | | #include "nsEUCJPProber.h" |
12 | | #include "nsEUCKRProber.h" |
13 | | #include "nsGB2312Prober.h" |
14 | | #include "nsSJISProber.h" |
15 | | |
16 | | #include <stdio.h> |
17 | | #include <stdlib.h> |
18 | | |
19 | | namespace kencodingprober |
20 | | { |
21 | | #ifdef DEBUG_PROBE |
// Human-readable labels used by DumpStatus(). The entries are indexed in
// parallel with mProbers, so the order here has to follow the order in which
// the constructor creates the probers.
static constexpr const char *const ProberName[] = {
    "Unicode", // 0
    "SJIS", // 1
    "EUCJP", // 2
    "GB18030", // 3
    "EUCKR", // 4
    "Big5", // 5
};
30 | | |
31 | | #endif |
32 | | |
33 | | namespace |
34 | | { |
35 | | using Prober = nsMBCSGroupProber::Prober; |
36 | | constexpr std::array<bool, 6> fromSelectedList(std::span<const Prober> selected) |
37 | 14.0k | { |
38 | 14.0k | std::array<bool, 6> isSelected{false}; |
39 | 59.2k | for (auto p : selected) { |
40 | 59.2k | const auto i = static_cast<std::underlying_type_t<Prober>>(p); |
41 | 59.2k | if (i >= NUM_OF_PROBERS) { |
42 | 0 | continue; |
43 | 0 | } |
44 | 59.2k | isSelected[i] = true; |
45 | 59.2k | } |
46 | 14.0k | return isSelected; |
47 | 14.0k | } |
48 | | static_assert(fromSelectedList({})[0] == false); |
49 | | static_assert(fromSelectedList({})[5] == false); |
50 | | static_assert(fromSelectedList(std::array{Prober::Unicode})[0] == true); |
51 | | static_assert(fromSelectedList(std::array{Prober::Unicode})[5] == false); |
52 | | static_assert(fromSelectedList(std::array{Prober::SJIS, Prober::Big5})[0] == false); |
53 | | static_assert(fromSelectedList(std::array{Prober::SJIS, Prober::Big5})[1] == true); |
54 | | static_assert(fromSelectedList(std::array{Prober::SJIS, Prober::Big5})[2] == false); |
55 | | static_assert(fromSelectedList(std::array{Prober::SJIS, Prober::Big5})[3] == false); |
56 | | static_assert(fromSelectedList(std::array{Prober::SJIS, Prober::Big5})[4] == false); |
57 | | static_assert(fromSelectedList(std::array{Prober::SJIS, Prober::Big5})[5] == true); |
58 | | |
59 | | } // namespace <anonymous> |
60 | | |
61 | | nsMBCSGroupProber::nsMBCSGroupProber(std::span<const Prober> selected) |
62 | 14.0k | : mProbers{std::make_unique<UnicodeGroupProber>(), |
63 | 14.0k | std::make_unique<nsSJISProber>(), |
64 | 14.0k | std::make_unique<nsEUCJPProber>(), |
65 | 14.0k | std::make_unique<nsGB18030Prober>(), |
66 | 14.0k | std::make_unique<nsEUCKRProber>(), |
67 | 14.0k | std::make_unique<nsBig5Prober>(), |
68 | 14.0k | } |
69 | 14.0k | , mIsSelected(fromSelectedList(selected)) |
70 | 14.0k | { |
71 | 98.5k | for (unsigned int i = 0; i < NUM_OF_PROBERS; i++) { |
72 | 84.5k | if (mProbers[i] && mIsSelected[i]) { |
73 | 59.2k | mIsActive[i] = true; |
74 | 59.2k | ++mActiveNum; |
75 | 59.2k | } |
76 | 84.5k | } |
77 | 14.0k | } |
78 | | |
79 | | nsMBCSGroupProber::nsMBCSGroupProber() |
80 | 5.67k | : nsMBCSGroupProber(std::array{ |
81 | 5.67k | Prober::Unicode, |
82 | 5.67k | Prober::SJIS, |
83 | 5.67k | Prober::EUCJP, |
84 | 5.67k | Prober::GB18030, |
85 | 5.67k | Prober::EUCKR, |
86 | 5.67k | Prober::Big5, |
87 | 5.67k | }) |
88 | 5.67k | { |
89 | 5.67k | } |
90 | | |
91 | | const char *nsMBCSGroupProber::GetCharSetName() |
92 | 89 | { |
93 | 89 | if (mBestGuess == -1) { |
94 | 0 | GetConfidence(); |
95 | 0 | if (mBestGuess == -1) { |
96 | 0 | mBestGuess = 0; |
97 | 0 | } |
98 | 0 | } |
99 | 89 | return mProbers[mBestGuess]->GetCharSetName(); |
100 | 89 | } |
101 | | |
102 | | nsProbingState nsMBCSGroupProber::HandleData(const char *aBuf, unsigned int aLen) |
103 | 7.69k | { |
104 | | // do filtering to reduce load to probers |
105 | 7.69k | char *highbyteBuf; |
106 | 7.69k | char *hptr; |
107 | 7.69k | bool keepNext = true; // assume previous is not ascii, it will do no harm except add some noise |
108 | 7.69k | hptr = highbyteBuf = (char *)malloc(aLen); |
109 | 7.69k | if (!hptr) { |
110 | 0 | return mState; |
111 | 0 | } |
112 | 178M | for (unsigned int i = 0; i < aLen; ++i) { |
113 | 178M | if (aBuf[i] & 0x80) { |
114 | 118M | *hptr++ = aBuf[i]; |
115 | 118M | keepNext = true; |
116 | 118M | } else { |
117 | | // if previous is highbyte, keep this even it is a ASCII |
118 | 59.8M | if (keepNext) { |
119 | 2.55M | *hptr++ = aBuf[i]; |
120 | 2.55M | keepNext = false; |
121 | 2.55M | } |
122 | 59.8M | } |
123 | 178M | } |
124 | | |
125 | 51.6k | for (unsigned int i = 0; i < NUM_OF_PROBERS; ++i) { |
126 | 44.8k | if (!mIsActive[i]) { |
127 | 11.7k | continue; |
128 | 11.7k | } |
129 | 33.1k | nsProbingState st = mProbers[i]->HandleData(highbyteBuf, hptr - highbyteBuf); |
130 | 33.1k | if (st == eFoundIt) { |
131 | 302 | mBestGuess = i; |
132 | 302 | mState = eFoundIt; |
133 | 302 | break; |
134 | 32.8k | } else if (st == eNotMe) { |
135 | 11.1k | mIsActive[i] = false; |
136 | 11.1k | mActiveNum--; |
137 | 11.1k | if (mActiveNum == 0) { |
138 | 604 | mState = eNotMe; |
139 | 604 | break; |
140 | 604 | } |
141 | 11.1k | } |
142 | 33.1k | } |
143 | | |
144 | 7.69k | free(highbyteBuf); |
145 | | |
146 | 7.69k | return mState; |
147 | 7.69k | } |
148 | | |
149 | | float nsMBCSGroupProber::GetConfidence(void) |
150 | 0 | { |
151 | 0 | float bestConf = 0.0; |
152 | |
|
153 | 0 | switch (mState) { |
154 | 0 | case eFoundIt: |
155 | 0 | return 0.99f; |
156 | 0 | case eNotMe: |
157 | 0 | return 0.01f; |
158 | 0 | default: |
159 | 0 | for (unsigned int i = 0; i < NUM_OF_PROBERS; ++i) { |
160 | 0 | if (!mIsActive[i]) { |
161 | 0 | continue; |
162 | 0 | } |
163 | 0 | float cf = mProbers[i]->GetConfidence(); |
164 | 0 | if (bestConf < cf) { |
165 | 0 | bestConf = cf; |
166 | 0 | mBestGuess = i; |
167 | 0 | } |
168 | 0 | } |
169 | 0 | } |
170 | 0 | return bestConf; |
171 | 0 | } |
172 | | |
173 | | #ifdef DEBUG_PROBE |
174 | | void nsMBCSGroupProber::DumpStatus() |
175 | | { |
176 | | GetConfidence(); |
177 | | for (size_t i = 0; i < NUM_OF_PROBERS; i++) { |
178 | | if (!mIsSelected[i]) { |
179 | | printf(" MBCS deselected: [%s][%s]\r\n", ProberName[i], mProbers[i]->GetCharSetName()); |
180 | | } else if (!mIsActive[i]) { |
181 | | printf(" MBCS inactive: [%s][%s] (confidence is too low).\r\n", ProberName[i], mProbers[i]->GetCharSetName()); |
182 | | } else { |
183 | | float cf = mProbers[i]->GetConfidence(); |
184 | | printf(" MBCS %1.3f: [%s][%s]\r\n", cf, ProberName[i], mProbers[i]->GetCharSetName()); |
185 | | mProbers[i]->DumpStatus(); |
186 | | } |
187 | | } |
188 | | } |
189 | | #endif |
190 | | } |