Line | Count | Source (jump to first uncovered line) |
1 | | |
2 | | #include "wc.h" |
3 | | #include "gb18030.h" |
4 | | #include "search.h" |
5 | | #include "wtf.h" |
6 | | #ifdef USE_UNICODE |
7 | | #include "ucs.h" |
8 | | #endif |
9 | | #include "map/gb18030_ucs.map" |
10 | | |
11 | | #define C0 WC_GB18030_MAP_C0 |
12 | | #define GL WC_GB18030_MAP_GL |
13 | 377k | #define C1 WC_GB18030_MAP_C1 |
14 | 1.12M | #define LB WC_GB18030_MAP_LB |
15 | 1.21M | #define UB WC_GB18030_MAP_UB |
16 | 229k | #define L4 WC_GB18030_MAP_L4 |
17 | | |
18 | | wc_uint8 WC_GB18030_MAP[ 0x100 ] = { |
19 | | C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, |
20 | | C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, |
21 | | GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, |
22 | | L4, L4, L4, L4, L4, L4, L4, L4, L4, L4, GL, GL, GL, GL, GL, GL, |
23 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, |
24 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, |
25 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, |
26 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, C0, |
27 | | |
28 | | LB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
29 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
30 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
31 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
32 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
33 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
34 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
35 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, C1, |
36 | | }; |
37 | | |
38 | | wc_wchar_t |
39 | | wc_gbk_ext_to_cs128w(wc_wchar_t cc) |
40 | 469k | { |
41 | 469k | cc.code = WC_GBK_N(cc.code); |
42 | 469k | if (cc.code < 0x4000) |
43 | 441k | cc.ccs = WC_CCS_GBK_EXT_1; |
44 | 27.6k | else { |
45 | 27.6k | cc.ccs = WC_CCS_GBK_EXT_2; |
46 | 27.6k | cc.code -= 0x4000; |
47 | 27.6k | } |
48 | 469k | cc.code = WC_N_CS128W(cc.code); |
49 | 469k | return cc; |
50 | 469k | } |
51 | | |
52 | | wc_wchar_t |
53 | | wc_cs128w_to_gbk_ext(wc_wchar_t cc) |
54 | 445k | { |
55 | 445k | cc.code = WC_CS128W_N(cc.code); |
56 | 445k | if (cc.ccs == WC_CCS_GBK_EXT_2) |
57 | 5.54k | cc.code += 0x4000; |
58 | 445k | cc.ccs = WC_CCS_GBK_EXT; |
59 | 445k | cc.code = WC_N_GBK(cc.code); |
60 | 445k | return cc; |
61 | 445k | } |
62 | | |
63 | | static wc_ccs |
64 | 981k | wc_gbk_or_gbk_ext(wc_uint16 code) { |
65 | 981k | return wc_map3_range_search(code, |
66 | 981k | gbk_ext_ucs_map, N_gbk_ext_ucs_map) |
67 | 981k | ? WC_CCS_GBK_EXT : WC_CCS_GBK; |
68 | 981k | } |
69 | | |
70 | | #ifdef USE_UNICODE |
71 | | wc_uint32 |
72 | | wc_gb18030_to_ucs(wc_wchar_t cc) |
73 | 175k | { |
74 | 175k | wc_map3 *map; |
75 | | |
76 | 175k | switch (WC_CCS_SET(cc.ccs)) { |
77 | 0 | case WC_CCS_GBK_EXT_1: |
78 | 0 | case WC_CCS_GBK_EXT_2: |
79 | 0 | cc = wc_cs128w_to_gbk_ext(cc); |
80 | 171k | case WC_CCS_GBK_EXT: |
81 | 171k | map = wc_map3_range_search((wc_uint16)cc.code, |
82 | 171k | gbk_ext_ucs_map, N_gbk_ext_ucs_map); |
83 | 171k | if (map) |
84 | 171k | return map->code3 + WC_GBK_N(cc.code) - WC_GBK_N(map->code2); |
85 | 356 | return WC_C_UCS4_ERROR; |
86 | 3.84k | case WC_CCS_GB18030: |
87 | 3.84k | break; |
88 | 0 | default: |
89 | 0 | return wc_any_to_ucs(cc); |
90 | 175k | } |
91 | 3.84k | if (cc.code >= WC_C_GB18030_UCS2 && cc.code <= WC_C_GB18030_UCS2_END) { |
92 | 1.41k | int i, min = 0, max = N_ucs_gb18030_map - 1; |
93 | | |
94 | 1.41k | cc.code = WC_GB18030_N(cc.code) - WC_GB18030_N(WC_C_GB18030_UCS2); |
95 | 1.41k | if (cc.code >= ucs_gb18030_map[max].code3) |
96 | 690 | i = max; |
97 | 727 | else { |
98 | 5.41k | while(1) { |
99 | 5.41k | i = (min + max) / 2; |
100 | 5.41k | if (min == max) |
101 | 349 | break; |
102 | 5.06k | if (cc.code < ucs_gb18030_map[i].code3) |
103 | 2.66k | max = i - 1; |
104 | 2.40k | else if (cc.code >= ucs_gb18030_map[i+1].code3) |
105 | 2.02k | min = i + 1; |
106 | 378 | else |
107 | 378 | break; |
108 | 5.06k | } |
109 | 727 | } |
110 | 1.41k | return ucs_gb18030_map[i].code + cc.code - ucs_gb18030_map[i].code3; |
111 | 1.41k | } |
112 | 2.42k | if (cc.code >= WC_C_GB18030_UCS4 && cc.code <= WC_C_GB18030_UCS4_END) |
113 | 943 | return WC_GB18030_N(cc.code) - WC_GB18030_N(WC_C_GB18030_UCS4) |
114 | 943 | + 0x10000; |
115 | 1.48k | return WC_C_UCS4_ERROR; |
116 | 2.42k | } |
117 | | |
118 | | wc_wchar_t |
119 | | wc_ucs_to_gb18030(wc_uint32 ucs) |
120 | 14.5M | { |
121 | 14.5M | wc_wchar_t cc; |
122 | 14.5M | wc_map3 *map; |
123 | | |
124 | 14.5M | if (ucs <= WC_C_UCS2_END) { |
125 | 14.5M | map = wc_map3_range_search((wc_uint16)ucs, |
126 | 14.5M | ucs_gbk_ext_map, N_ucs_gbk_ext_map); |
127 | 14.5M | if (map) { |
128 | 360 | cc.code = WC_GBK_N(map->code3) + ucs - map->code; |
129 | 360 | cc.code = WC_N_GBK(cc.code); |
130 | 360 | cc.ccs = WC_CCS_GBK_EXT; |
131 | 360 | return cc; |
132 | 360 | } |
133 | 14.5M | map = wc_map3_range_search((wc_uint16)ucs, |
134 | 14.5M | ucs_gb18030_map, N_ucs_gb18030_map); |
135 | 14.5M | if (map) { |
136 | 14.5M | cc.code = map->code3 + ucs - map->code + WC_GB18030_N(WC_C_GB18030_UCS2); |
137 | 14.5M | cc.code = WC_N_GB18030(cc.code); |
138 | 14.5M | if (WcOption.gb18030_as_ucs) |
139 | 0 | cc.ccs = WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET); |
140 | 14.5M | else |
141 | 14.5M | cc.ccs = WC_CCS_GB18030_W; |
142 | 14.5M | return cc; |
143 | 14.5M | } |
144 | 14.5M | } else if (ucs <= WC_C_UNICODE_END) { |
145 | 1.17k | cc.code = ucs - 0x10000 + WC_GB18030_N(WC_C_GB18030_UCS4); |
146 | 1.17k | cc.code = WC_N_GB18030(cc.code); |
147 | 1.17k | if (WcOption.gb18030_as_ucs) |
148 | 0 | cc.ccs = WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET); |
149 | 1.17k | else |
150 | 1.17k | cc.ccs = WC_CCS_GB18030_W; |
151 | 1.17k | return cc; |
152 | 1.17k | } |
153 | 5.00k | cc.ccs = WC_CCS_UNKNOWN; |
154 | 5.00k | cc.code = 0; |
155 | 5.00k | return cc; |
156 | 14.5M | } |
157 | | #endif |
158 | | |
159 | | Str |
160 | | wc_conv_from_gb18030(Str is, wc_ces ces) |
161 | 1.52k | { |
162 | 1.52k | Str os; |
163 | 1.52k | wc_uchar *sp = (wc_uchar *)is->ptr; |
164 | 1.52k | wc_uchar *ep = sp + is->length; |
165 | 1.52k | wc_uchar *p; |
166 | 1.52k | int state = WC_GB18030_NOSTATE; |
167 | 1.52k | wc_uint32 gbk; |
168 | 1.52k | wc_wchar_t cc; |
169 | 1.52k | #ifdef USE_UNICODE |
170 | 1.52k | wc_uint32 ucs; |
171 | 1.52k | #endif |
172 | | |
173 | 4.15k | for (p = sp; p < ep && *p < 0x80; p++) |
174 | 2.62k | ; |
175 | 1.52k | if (p == ep) |
176 | 11 | return is; |
177 | 1.51k | os = Strnew_size(is->length); |
178 | 1.51k | if (p > sp) |
179 | 317 | Strcat_charp_n(os, (char *)is->ptr, (int)(p - sp)); |
180 | | |
181 | 3.28M | for (; p < ep; p++) { |
182 | 3.28M | switch (state) { |
183 | 1.97M | case WC_GB18030_NOSTATE: |
184 | 1.97M | switch (WC_GB18030_MAP[*p]) { |
185 | 1.12M | case UB: |
186 | 1.12M | state = WC_GB18030_MBYTE1; |
187 | 1.12M | break; |
188 | 377k | case C1: |
189 | 377k | wtf_push_unknown(os, p, 1); |
190 | 377k | break; |
191 | 476k | default: |
192 | 476k | Strcat_char(os, (char)*p); |
193 | 476k | break; |
194 | 1.97M | } |
195 | 1.97M | break; |
196 | 1.97M | case WC_GB18030_MBYTE1: |
197 | 1.12M | if (WC_GB18030_MAP[*p] & LB) { |
198 | 981k | gbk = ((wc_uint32)*(p-1) << 8) | *p; |
199 | 981k | if (wc_gbk_or_gbk_ext(gbk) == WC_CCS_GBK_EXT) |
200 | 469k | wtf_push(os, WC_CCS_GBK_EXT, gbk); |
201 | 512k | else if (*(p-1) >= 0xA1 && *p >= 0xA1) |
202 | 417k | wtf_push(os, wc_gb2312_or_gbk(gbk), gbk); |
203 | 95.0k | else |
204 | 95.0k | wtf_push(os, WC_CCS_GBK, gbk); |
205 | 981k | } else if (WC_GB18030_MAP[*p] == L4) { |
206 | 93.9k | state = WC_GB18030_MBYTE2; |
207 | 93.9k | break; |
208 | 93.9k | } else |
209 | 45.2k | wtf_push_unknown(os, p-1, 2); |
210 | 1.02M | state = WC_GB18030_NOSTATE; |
211 | 1.02M | break; |
212 | 93.9k | case WC_GB18030_MBYTE2: |
213 | 93.9k | if (WC_GB18030_MAP[*p] == UB) { |
214 | 89.9k | state = WC_GB18030_MBYTE3; |
215 | 89.9k | break; |
216 | 89.9k | } else |
217 | 3.93k | wtf_push_unknown(os, p-2, 3); |
218 | 3.93k | state = WC_GB18030_NOSTATE; |
219 | 3.93k | break; |
220 | 89.9k | case WC_GB18030_MBYTE3: |
221 | 89.9k | if (WC_GB18030_MAP[*p] == L4) { |
222 | 86.5k | cc.ccs = WC_CCS_GB18030_W; |
223 | 86.5k | cc.code = ((wc_uint32)*(p-3) << 24) |
224 | 86.5k | | ((wc_uint32)*(p-2) << 16) |
225 | 86.5k | | ((wc_uint32)*(p-1) << 8) |
226 | 86.5k | | *p; |
227 | 86.5k | #ifdef USE_UNICODE |
228 | 86.5k | if (WcOption.gb18030_as_ucs && |
229 | 86.5k | (ucs = wc_gb18030_to_ucs(cc)) != WC_C_UCS4_ERROR) |
230 | 0 | wtf_push(os, WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET), cc.code); |
231 | 86.5k | else |
232 | 86.5k | #endif |
233 | 86.5k | wtf_push(os, cc.ccs, cc.code); |
234 | 86.5k | } else |
235 | 3.45k | wtf_push_unknown(os, p-3, 4); |
236 | 89.9k | state = WC_GB18030_NOSTATE; |
237 | 89.9k | break; |
238 | 3.28M | } |
239 | 3.28M | } |
240 | 1.51k | switch (state) { |
241 | 515 | case WC_GB18030_MBYTE1: |
242 | 515 | wtf_push_unknown(os, p-1, 1); |
243 | 515 | break; |
244 | 14 | case WC_GB18030_MBYTE2: |
245 | 14 | wtf_push_unknown(os, p-2, 2); |
246 | 14 | break; |
247 | 13 | case WC_GB18030_MBYTE3: |
248 | 13 | wtf_push_unknown(os, p-3, 3); |
249 | 13 | break; |
250 | 1.51k | } |
251 | 1.51k | return os; |
252 | 1.51k | } |
253 | | |
254 | | void |
255 | | wc_push_to_gb18030(Str os, wc_wchar_t cc, wc_status *st) |
256 | 17.2M | { |
257 | 33.6M | while (1) { |
258 | 33.6M | switch (WC_CCS_SET(cc.ccs)) { |
259 | 1.36M | case WC_CCS_US_ASCII: |
260 | 1.36M | Strcat_char(os, (char)cc.code); |
261 | 1.36M | return; |
262 | 650k | case WC_CCS_GB_2312: |
263 | 650k | Strcat_char(os, (char)((cc.code >> 8) | 0x80)); |
264 | 650k | Strcat_char(os, (char)((cc.code & 0xff) | 0x80)); |
265 | 650k | return; |
266 | 13.6k | case WC_CCS_GBK_1: |
267 | 14.8k | case WC_CCS_GBK_2: |
268 | 14.8k | cc = wc_cs128w_to_gbk(cc); |
269 | 40.3k | case WC_CCS_GBK: |
270 | 40.3k | Strcat_char(os, (char)(cc.code >> 8)); |
271 | 40.3k | Strcat_char(os, (char)(cc.code & 0xff)); |
272 | 40.3k | return; |
273 | 425 | case WC_CCS_GBK_EXT_1: |
274 | 836 | case WC_CCS_GBK_EXT_2: |
275 | 836 | cc = wc_cs128w_to_gbk(cc); |
276 | 274k | case WC_CCS_GBK_EXT: |
277 | 274k | Strcat_char(os, (char)(cc.code >> 8)); |
278 | 274k | Strcat_char(os, (char)(cc.code & 0xff)); |
279 | 274k | return; |
280 | 14.5M | case WC_CCS_GB18030: |
281 | 14.5M | Strcat_char(os, (char)((cc.code >> 24) & 0xff)); |
282 | 14.5M | Strcat_char(os, (char)((cc.code >> 16) & 0xff)); |
283 | 14.5M | Strcat_char(os, (char)((cc.code >> 8) & 0xff)); |
284 | 14.5M | Strcat_char(os, (char)(cc.code & 0xff)); |
285 | 14.5M | return; |
286 | 9.76k | case WC_CCS_UNKNOWN_W: |
287 | 9.76k | if (!WcOption.no_replace) |
288 | 9.76k | Strcat_charp(os, WC_REPLACE_W); |
289 | 9.76k | return; |
290 | 361k | case WC_CCS_UNKNOWN: |
291 | 361k | if (!WcOption.no_replace) |
292 | 361k | Strcat_charp(os, WC_REPLACE); |
293 | 361k | return; |
294 | 16.3M | default: |
295 | 16.3M | #ifdef USE_UNICODE |
296 | 16.3M | if (WcOption.ucs_conv) |
297 | 16.3M | cc = wc_any_to_any_ces(cc, st); |
298 | 0 | else |
299 | 0 | #endif |
300 | 0 | cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN; |
301 | 16.3M | continue; |
302 | 33.6M | } |
303 | 33.6M | } |
304 | 17.2M | } |
305 | | |
306 | | Str |
307 | | wc_char_conv_from_gb18030(wc_uchar c, wc_status *st) |
308 | 0 | { |
309 | 0 | static Str os; |
310 | 0 | static wc_uchar gb[4]; |
311 | 0 | wc_uint32 gbk; |
312 | 0 | wc_wchar_t cc; |
313 | 0 | #ifdef USE_UNICODE |
314 | 0 | wc_uint32 ucs; |
315 | 0 | #endif |
316 | |
|
317 | 0 | if (st->state == -1) { |
318 | 0 | st->state = WC_GB18030_NOSTATE; |
319 | 0 | os = Strnew_size(8); |
320 | 0 | } |
321 | |
|
322 | 0 | switch (st->state) { |
323 | 0 | case WC_GB18030_NOSTATE: |
324 | 0 | switch (WC_GB18030_MAP[c]) { |
325 | 0 | case UB: |
326 | 0 | gb[0] = c; |
327 | 0 | st->state = WC_GB18030_MBYTE1; |
328 | 0 | return NULL; |
329 | 0 | case C1: |
330 | 0 | break; |
331 | 0 | default: |
332 | 0 | Strcat_char(os, (char)c); |
333 | 0 | break; |
334 | 0 | } |
335 | 0 | break; |
336 | 0 | case WC_GB18030_MBYTE1: |
337 | 0 | if (WC_GB18030_MAP[c] & LB) { |
338 | 0 | gbk = ((wc_uint32)gb[0] << 8) | c; |
339 | 0 | if (wc_gbk_or_gbk_ext(gbk) == WC_CCS_GBK_EXT) |
340 | 0 | wtf_push(os, WC_CCS_GBK_EXT, gbk); |
341 | 0 | else if (gb[0] >= 0xA1 && c >= 0xA1) |
342 | 0 | wtf_push(os, wc_gb2312_or_gbk(gbk), gbk); |
343 | 0 | else |
344 | 0 | wtf_push(os, WC_CCS_GBK, gbk); |
345 | 0 | } else if (WC_GB18030_MAP[c] == L4) { |
346 | 0 | gb[1] = c; |
347 | 0 | st->state = WC_GB18030_MBYTE2; |
348 | 0 | return NULL; |
349 | 0 | } |
350 | 0 | break; |
351 | 0 | case WC_GB18030_MBYTE2: |
352 | 0 | if (WC_GB18030_MAP[c] == UB) { |
353 | 0 | gb[2] = c; |
354 | 0 | st->state = WC_GB18030_MBYTE3; |
355 | 0 | return NULL; |
356 | 0 | } |
357 | 0 | break; |
358 | 0 | case WC_GB18030_MBYTE3: |
359 | 0 | if (WC_GB18030_MAP[c] == L4) { |
360 | 0 | cc.ccs = WC_CCS_GB18030_W; |
361 | 0 | cc.code = ((wc_uint32)gb[0] << 24) |
362 | 0 | | ((wc_uint32)gb[1] << 16) |
363 | 0 | | ((wc_uint32)gb[2] << 8) |
364 | 0 | | c; |
365 | 0 | #ifdef USE_UNICODE |
366 | 0 | if (WcOption.gb18030_as_ucs && |
367 | 0 | (ucs = wc_gb18030_to_ucs(cc)) != WC_C_UCS4_ERROR) |
368 | 0 | wtf_push(os, WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET), cc.code); |
369 | 0 | else |
370 | 0 | #endif |
371 | 0 | wtf_push(os, cc.ccs, cc.code); |
372 | 0 | } |
373 | 0 | break; |
374 | 0 | } |
375 | 0 | st->state = -1; |
376 | 0 | return os; |
377 | 0 | } |