Line | Count | Source |
1 | | |
2 | | #include "wc.h" |
3 | | #include "gb18030.h" |
4 | | #include "search.h" |
5 | | #include "wtf.h" |
6 | | #ifdef USE_UNICODE |
7 | | #include "ucs.h" |
8 | | #endif |
9 | | #include "map/gb18030_ucs.map" |
10 | | |
11 | | #define C0 WC_GB18030_MAP_C0 |
12 | | #define GL WC_GB18030_MAP_GL |
13 | 546k | #define C1 WC_GB18030_MAP_C1 |
14 | 1.80M | #define LB WC_GB18030_MAP_LB |
15 | 1.88M | #define UB WC_GB18030_MAP_UB |
16 | 277k | #define L4 WC_GB18030_MAP_L4 |
17 | | |
18 | | wc_uint8 WC_GB18030_MAP[ 0x100 ] = { |
19 | | C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, |
20 | | C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, |
21 | | GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, |
22 | | L4, L4, L4, L4, L4, L4, L4, L4, L4, L4, GL, GL, GL, GL, GL, GL, |
23 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, |
24 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, |
25 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, |
26 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, C0, |
27 | | |
28 | | LB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
29 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
30 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
31 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
32 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
33 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
34 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
35 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, C1, |
36 | | }; |
37 | | |
38 | | wc_wchar_t |
39 | | wc_gbk_ext_to_cs128w(wc_wchar_t cc) |
40 | 312k | { |
41 | 312k | cc.code = WC_GBK_N(cc.code); |
42 | 312k | if (cc.code < 0x4000) |
43 | 235k | cc.ccs = WC_CCS_GBK_EXT_1; |
44 | 77.4k | else { |
45 | 77.4k | cc.ccs = WC_CCS_GBK_EXT_2; |
46 | 77.4k | cc.code -= 0x4000; |
47 | 77.4k | } |
48 | 312k | cc.code = WC_N_CS128W(cc.code); |
49 | 312k | return cc; |
50 | 312k | } |
51 | | |
52 | | wc_wchar_t |
53 | | wc_cs128w_to_gbk_ext(wc_wchar_t cc) |
54 | 297k | { |
55 | 297k | cc.code = WC_CS128W_N(cc.code); |
56 | 297k | if (cc.ccs == WC_CCS_GBK_EXT_2) |
57 | 68.8k | cc.code += 0x4000; |
58 | 297k | cc.ccs = WC_CCS_GBK_EXT; |
59 | 297k | cc.code = WC_N_GBK(cc.code); |
60 | 297k | return cc; |
61 | 297k | } |
62 | | |
63 | | static wc_ccs |
64 | 1.59M | wc_gbk_or_gbk_ext(wc_uint16 code) { |
65 | 1.59M | return wc_map3_range_search(code, |
66 | 1.59M | gbk_ext_ucs_map, N_gbk_ext_ucs_map) |
67 | 1.59M | ? WC_CCS_GBK_EXT : WC_CCS_GBK; |
68 | 1.59M | } |
69 | | |
70 | | #ifdef USE_UNICODE |
71 | | wc_uint32 |
72 | | wc_gb18030_to_ucs(wc_wchar_t cc) |
73 | 339k | { |
74 | 339k | wc_map3 *map; |
75 | | |
76 | 339k | switch (WC_CCS_SET(cc.ccs)) { |
77 | 0 | case WC_CCS_GBK_EXT_1: |
78 | 0 | case WC_CCS_GBK_EXT_2: |
79 | 0 | cc = wc_cs128w_to_gbk_ext(cc); |
80 | 277k | case WC_CCS_GBK_EXT: |
81 | 277k | map = wc_map3_range_search((wc_uint16)cc.code, |
82 | 277k | gbk_ext_ucs_map, N_gbk_ext_ucs_map); |
83 | 277k | if (map) |
84 | 277k | return map->code3 + WC_GBK_N(cc.code) - WC_GBK_N(map->code2); |
85 | 253 | return WC_C_UCS4_ERROR; |
86 | 61.9k | case WC_CCS_GB18030: |
87 | 61.9k | break; |
88 | 0 | default: |
89 | 0 | return wc_any_to_ucs(cc); |
90 | 339k | } |
91 | 61.9k | if (cc.code >= WC_C_GB18030_UCS2 && cc.code <= WC_C_GB18030_UCS2_END) { |
92 | 8.63k | int i, min = 0, max = N_ucs_gb18030_map - 1; |
93 | | |
94 | 8.63k | cc.code = WC_GB18030_N(cc.code) - WC_GB18030_N(WC_C_GB18030_UCS2); |
95 | 8.63k | if (cc.code >= ucs_gb18030_map[max].code3) |
96 | 5.80k | i = max; |
97 | 2.82k | else { |
98 | 19.8k | while(1) { |
99 | 19.8k | i = (min + max) / 2; |
100 | 19.8k | if (min == max) |
101 | 195 | break; |
102 | 19.6k | if (cc.code < ucs_gb18030_map[i].code3) |
103 | 13.5k | max = i - 1; |
104 | 6.09k | else if (cc.code >= ucs_gb18030_map[i+1].code3) |
105 | 3.46k | min = i + 1; |
106 | 2.63k | else |
107 | 2.63k | break; |
108 | 19.6k | } |
109 | 2.82k | } |
110 | 8.63k | return ucs_gb18030_map[i].code + cc.code - ucs_gb18030_map[i].code3; |
111 | 8.63k | } |
112 | 53.3k | if (cc.code >= WC_C_GB18030_UCS4 && cc.code <= WC_C_GB18030_UCS4_END) |
113 | 20.5k | return WC_GB18030_N(cc.code) - WC_GB18030_N(WC_C_GB18030_UCS4) |
114 | 20.5k | + 0x10000; |
115 | 32.7k | return WC_C_UCS4_ERROR; |
116 | 53.3k | } |
117 | | |
118 | | wc_wchar_t |
119 | | wc_ucs_to_gb18030(wc_uint32 ucs) |
120 | 9.95M | { |
121 | 9.95M | wc_wchar_t cc; |
122 | 9.95M | wc_map3 *map; |
123 | | |
124 | 9.95M | if (ucs <= WC_C_UCS2_END) { |
125 | 9.95M | map = wc_map3_range_search((wc_uint16)ucs, |
126 | 9.95M | ucs_gbk_ext_map, N_ucs_gbk_ext_map); |
127 | 9.95M | if (map) { |
128 | 1.35k | cc.code = WC_GBK_N(map->code3) + ucs - map->code; |
129 | 1.35k | cc.code = WC_N_GBK(cc.code); |
130 | 1.35k | cc.ccs = WC_CCS_GBK_EXT; |
131 | 1.35k | return cc; |
132 | 1.35k | } |
133 | 9.95M | map = wc_map3_range_search((wc_uint16)ucs, |
134 | 9.95M | ucs_gb18030_map, N_ucs_gb18030_map); |
135 | 9.95M | if (map) { |
136 | 9.95M | cc.code = map->code3 + ucs - map->code + WC_GB18030_N(WC_C_GB18030_UCS2); |
137 | 9.95M | cc.code = WC_N_GB18030(cc.code); |
138 | 9.95M | if (WcOption.gb18030_as_ucs) |
139 | 0 | cc.ccs = WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET); |
140 | 9.95M | else |
141 | 9.95M | cc.ccs = WC_CCS_GB18030_W; |
142 | 9.95M | return cc; |
143 | 9.95M | } |
144 | 9.95M | } else if (ucs <= WC_C_UNICODE_END) { |
145 | 315 | cc.code = ucs - 0x10000 + WC_GB18030_N(WC_C_GB18030_UCS4); |
146 | 315 | cc.code = WC_N_GB18030(cc.code); |
147 | 315 | if (WcOption.gb18030_as_ucs) |
148 | 0 | cc.ccs = WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET); |
149 | 315 | else |
150 | 315 | cc.ccs = WC_CCS_GB18030_W; |
151 | 315 | return cc; |
152 | 315 | } |
153 | 4.27k | cc.ccs = WC_CCS_UNKNOWN; |
154 | 4.27k | cc.code = 0; |
155 | 4.27k | return cc; |
156 | 9.95M | } |
157 | | #endif |
158 | | |
159 | | Str |
160 | | wc_conv_from_gb18030(Str is, wc_ces ces) |
161 | 1.35k | { |
162 | 1.35k | Str os; |
163 | 1.35k | wc_uchar *sp = (wc_uchar *)is->ptr; |
164 | 1.35k | wc_uchar *ep = sp + is->length; |
165 | 1.35k | wc_uchar *p; |
166 | 1.35k | int state = WC_GB18030_NOSTATE; |
167 | 1.35k | wc_uint32 gbk; |
168 | 1.35k | wc_wchar_t cc; |
169 | 1.35k | #ifdef USE_UNICODE |
170 | 1.35k | wc_uint32 ucs; |
171 | 1.35k | #endif |
172 | | |
173 | 2.02k | for (p = sp; p < ep && *p < 0x80; p++) |
174 | 665 | ; |
175 | 1.35k | if (p == ep) |
176 | 7 | return is; |
177 | 1.35k | os = Strnew_size(is->length); |
178 | 1.35k | if (p > sp) |
179 | 82 | Strcat_charp_n(os, (char *)is->ptr, (int)(p - sp)); |
180 | | |
181 | 6.07M | for (; p < ep; p++) { |
182 | 6.07M | switch (state) { |
183 | 4.13M | case WC_GB18030_NOSTATE: |
184 | 4.13M | switch (WC_GB18030_MAP[*p]) { |
185 | 1.81M | case UB: |
186 | 1.81M | state = WC_GB18030_MBYTE1; |
187 | 1.81M | break; |
188 | 546k | case C1: |
189 | 546k | wtf_push_unknown(os, p, 1); |
190 | 546k | break; |
191 | 1.77M | default: |
192 | 1.77M | Strcat_char(os, (char)*p); |
193 | 1.77M | break; |
194 | 4.13M | } |
195 | 4.13M | break; |
196 | 4.13M | case WC_GB18030_MBYTE1: |
197 | 1.80M | if (WC_GB18030_MAP[*p] & LB) { |
198 | 1.59M | gbk = ((wc_uint32)*(p-1) << 8) | *p; |
199 | 1.59M | if (wc_gbk_or_gbk_ext(gbk) == WC_CCS_GBK_EXT) |
200 | 312k | wtf_push(os, WC_CCS_GBK_EXT, gbk); |
201 | 1.28M | else if (*(p-1) >= 0xA1 && *p >= 0xA1) |
202 | 850k | wtf_push(os, wc_gb2312_or_gbk(gbk), gbk); |
203 | 429k | else |
204 | 429k | wtf_push(os, WC_CCS_GBK, gbk); |
205 | 1.59M | } else if (WC_GB18030_MAP[*p] == L4) { |
206 | 70.3k | state = WC_GB18030_MBYTE2; |
207 | 70.3k | break; |
208 | 70.3k | } else |
209 | 145k | wtf_push_unknown(os, p-1, 2); |
210 | 1.73M | state = WC_GB18030_NOSTATE; |
211 | 1.73M | break; |
212 | 70.2k | case WC_GB18030_MBYTE2: |
213 | 70.2k | if (WC_GB18030_MAP[*p] == UB) { |
214 | 61.3k | state = WC_GB18030_MBYTE3; |
215 | 61.3k | break; |
216 | 61.3k | } else |
217 | 8.89k | wtf_push_unknown(os, p-2, 3); |
218 | 8.89k | state = WC_GB18030_NOSTATE; |
219 | 8.89k | break; |
220 | 61.3k | case WC_GB18030_MBYTE3: |
221 | 61.3k | if (WC_GB18030_MAP[*p] == L4) { |
222 | 53.1k | cc.ccs = WC_CCS_GB18030_W; |
223 | 53.1k | cc.code = ((wc_uint32)*(p-3) << 24) |
224 | 53.1k | | ((wc_uint32)*(p-2) << 16) |
225 | 53.1k | | ((wc_uint32)*(p-1) << 8) |
226 | 53.1k | | *p; |
227 | 53.1k | #ifdef USE_UNICODE |
228 | 53.1k | if (WcOption.gb18030_as_ucs && |
229 | 0 | (ucs = wc_gb18030_to_ucs(cc)) != WC_C_UCS4_ERROR) |
230 | 0 | wtf_push(os, WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET), cc.code); |
231 | 53.1k | else |
232 | 53.1k | #endif |
233 | 53.1k | wtf_push(os, cc.ccs, cc.code); |
234 | 53.1k | } else |
235 | 8.24k | wtf_push_unknown(os, p-3, 4); |
236 | 61.3k | state = WC_GB18030_NOSTATE; |
237 | 61.3k | break; |
238 | 6.07M | } |
239 | 6.07M | } |
240 | 1.35k | switch (state) { |
241 | 590 | case WC_GB18030_MBYTE1: |
242 | 590 | wtf_push_unknown(os, p-1, 1); |
243 | 590 | break; |
244 | 8 | case WC_GB18030_MBYTE2: |
245 | 8 | wtf_push_unknown(os, p-2, 2); |
246 | 8 | break; |
247 | 5 | case WC_GB18030_MBYTE3: |
248 | 5 | wtf_push_unknown(os, p-3, 3); |
249 | 5 | break; |
250 | 1.35k | } |
251 | 1.35k | return os; |
252 | 1.35k | } |
253 | | |
254 | | void |
255 | | wc_push_to_gb18030(Str os, wc_wchar_t cc, wc_status *st) |
256 | 11.6M | { |
257 | 22.6M | while (1) { |
258 | 22.6M | switch (WC_CCS_SET(cc.ccs)) { |
259 | 3.85k | case WC_CCS_US_ASCII: |
260 | 3.85k | Strcat_char(os, (char)cc.code); |
261 | 3.85k | return; |
262 | 884k | case WC_CCS_GB_2312: |
263 | 884k | Strcat_char(os, (char)((cc.code >> 8) | 0x80)); |
264 | 884k | Strcat_char(os, (char)((cc.code & 0xff) | 0x80)); |
265 | 884k | return; |
266 | 123k | case WC_CCS_GBK_1: |
267 | 127k | case WC_CCS_GBK_2: |
268 | 127k | cc = wc_cs128w_to_gbk(cc); |
269 | 201k | case WC_CCS_GBK: |
270 | 201k | Strcat_char(os, (char)(cc.code >> 8)); |
271 | 201k | Strcat_char(os, (char)(cc.code & 0xff)); |
272 | 201k | return; |
273 | 207 | case WC_CCS_GBK_EXT_1: |
274 | 701 | case WC_CCS_GBK_EXT_2: |
275 | 701 | cc = wc_cs128w_to_gbk(cc); |
276 | 22.5k | case WC_CCS_GBK_EXT: |
277 | 22.5k | Strcat_char(os, (char)(cc.code >> 8)); |
278 | 22.5k | Strcat_char(os, (char)(cc.code & 0xff)); |
279 | 22.5k | return; |
280 | 9.95M | case WC_CCS_GB18030: |
281 | 9.95M | Strcat_char(os, (char)((cc.code >> 24) & 0xff)); |
282 | 9.95M | Strcat_char(os, (char)((cc.code >> 16) & 0xff)); |
283 | 9.95M | Strcat_char(os, (char)((cc.code >> 8) & 0xff)); |
284 | 9.95M | Strcat_char(os, (char)(cc.code & 0xff)); |
285 | 9.95M | return; |
286 | 7.56k | case WC_CCS_UNKNOWN_W: |
287 | 7.56k | if (!WcOption.no_replace) |
288 | 7.56k | Strcat_charp(os, WC_REPLACE_W); |
289 | 7.56k | return; |
290 | 629k | case WC_CCS_UNKNOWN: |
291 | 629k | if (!WcOption.no_replace) |
292 | 629k | Strcat_charp(os, WC_REPLACE); |
293 | 629k | return; |
294 | 10.9M | default: |
295 | 10.9M | #ifdef USE_UNICODE |
296 | 10.9M | if (WcOption.ucs_conv) |
297 | 10.9M | cc = wc_any_to_any_ces(cc, st); |
298 | 0 | else |
299 | 0 | #endif |
300 | 0 | cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN; |
301 | 10.9M | continue; |
302 | 22.6M | } |
303 | 22.6M | } |
304 | 11.6M | } |
305 | | |
306 | | Str |
307 | | wc_char_conv_from_gb18030(wc_uchar c, wc_status *st) |
308 | 0 | { |
309 | 0 | static Str os; |
310 | 0 | static wc_uchar gb[4]; |
311 | 0 | wc_uint32 gbk; |
312 | 0 | wc_wchar_t cc; |
313 | 0 | #ifdef USE_UNICODE |
314 | 0 | wc_uint32 ucs; |
315 | 0 | #endif |
316 | |
|
317 | 0 | if (st->state == -1) { |
318 | 0 | st->state = WC_GB18030_NOSTATE; |
319 | 0 | os = Strnew_size(8); |
320 | 0 | } |
321 | |
|
322 | 0 | switch (st->state) { |
323 | 0 | case WC_GB18030_NOSTATE: |
324 | 0 | switch (WC_GB18030_MAP[c]) { |
325 | 0 | case UB: |
326 | 0 | gb[0] = c; |
327 | 0 | st->state = WC_GB18030_MBYTE1; |
328 | 0 | return NULL; |
329 | 0 | case C1: |
330 | 0 | break; |
331 | 0 | default: |
332 | 0 | Strcat_char(os, (char)c); |
333 | 0 | break; |
334 | 0 | } |
335 | 0 | break; |
336 | 0 | case WC_GB18030_MBYTE1: |
337 | 0 | if (WC_GB18030_MAP[c] & LB) { |
338 | 0 | gbk = ((wc_uint32)gb[0] << 8) | c; |
339 | 0 | if (wc_gbk_or_gbk_ext(gbk) == WC_CCS_GBK_EXT) |
340 | 0 | wtf_push(os, WC_CCS_GBK_EXT, gbk); |
341 | 0 | else if (gb[0] >= 0xA1 && c >= 0xA1) |
342 | 0 | wtf_push(os, wc_gb2312_or_gbk(gbk), gbk); |
343 | 0 | else |
344 | 0 | wtf_push(os, WC_CCS_GBK, gbk); |
345 | 0 | } else if (WC_GB18030_MAP[c] == L4) { |
346 | 0 | gb[1] = c; |
347 | 0 | st->state = WC_GB18030_MBYTE2; |
348 | 0 | return NULL; |
349 | 0 | } |
350 | 0 | break; |
351 | 0 | case WC_GB18030_MBYTE2: |
352 | 0 | if (WC_GB18030_MAP[c] == UB) { |
353 | 0 | gb[2] = c; |
354 | 0 | st->state = WC_GB18030_MBYTE3; |
355 | 0 | return NULL; |
356 | 0 | } |
357 | 0 | break; |
358 | 0 | case WC_GB18030_MBYTE3: |
359 | 0 | if (WC_GB18030_MAP[c] == L4) { |
360 | 0 | cc.ccs = WC_CCS_GB18030_W; |
361 | 0 | cc.code = ((wc_uint32)gb[0] << 24) |
362 | 0 | | ((wc_uint32)gb[1] << 16) |
363 | 0 | | ((wc_uint32)gb[2] << 8) |
364 | 0 | | c; |
365 | 0 | #ifdef USE_UNICODE |
366 | 0 | if (WcOption.gb18030_as_ucs && |
367 | 0 | (ucs = wc_gb18030_to_ucs(cc)) != WC_C_UCS4_ERROR) |
368 | 0 | wtf_push(os, WC_CCS_GB18030 | (wc_ucs_to_ccs(ucs) & ~WC_CCS_A_SET), cc.code); |
369 | 0 | else |
370 | 0 | #endif |
371 | 0 | wtf_push(os, cc.ccs, cc.code); |
372 | 0 | } |
373 | 0 | break; |
374 | 0 | } |
375 | 0 | st->state = -1; |
376 | 0 | return os; |
377 | 0 | } |