Line | Count | Source (jump to first uncovered line) |
1 | | |
2 | | #include "wc.h" |
3 | | #include "gbk.h" |
4 | | #include "search.h" |
5 | | #include "wtf.h" |
6 | | #ifdef USE_UNICODE |
7 | | #include "ucs.h" |
8 | | #endif |
9 | | |
10 | | #include "map/gb2312_gbk.map" |
11 | | /* Short local aliases for the WC_GBK_MAP_* byte-class constants; their values are defined elsewhere and are not visible in this listing. */ |
12 | | #define C0 WC_GBK_MAP_C0 |
13 | | #define GL WC_GBK_MAP_GL |
14 | 86.3k | #define C1 WC_GBK_MAP_C1 |
15 | 1.25M | #define LB WC_GBK_MAP_LB |
16 | 1.25M | #define UB WC_GBK_MAP_UB |
17 | 61.3k | #define C80 WC_GBK_MAP_80 |
18 | | /* Byte-class table indexed by byte value, 16 entries per row: 0x00-0x1F C0, 0x20-0x3F GL, 0x40-0x7E LB, 0x7F C0, 0x80 C80, 0x81-0xFE UB, 0xFF C1. The converters below start a two-byte sequence on a UB byte and test the following byte with a bitwise AND against LB. */ |
19 | | wc_uint8 WC_GBK_MAP[ 0x100 ] = { |
20 | | C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, |
21 | | C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, C0, |
22 | | GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, |
23 | | GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, GL, |
24 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, |
25 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, |
26 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, |
27 | | LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, LB, C0, |
28 | | /* entries for 0x80 - 0xFF follow */ |
29 | | C80,UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
30 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
31 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
32 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
33 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
34 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
35 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, |
36 | | UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, UB, C1, |
37 | | }; |
38 | | /* Picks the charset tag for a two-byte code: WC_CCS_GBK when wc_map_range_search finds the code in gb2312_gbk_map, otherwise WC_CCS_GB_2312. The converters in this file call it only when both bytes are >= 0xA1. Presumably the map lists the codes in that area that are GBK-only - confirm against map/gb2312_gbk.map. */ |
39 | | wc_ccs |
40 | 979k | wc_gb2312_or_gbk(wc_uint16 code) { |
41 | 979k | return wc_map_range_search(code, |
42 | 979k | gb2312_gbk_map, N_gb2312_gbk_map) |
43 | 979k | ? WC_CCS_GBK : WC_CCS_GB_2312; |
44 | 979k | } |
45 | | /* Splits a GBK character across two charsets: the code is converted with WC_GBK_N, values below 0x4000 are tagged WC_CCS_GBK_1, the rest are tagged WC_CCS_GBK_2 and rebased by 0x4000, and the result is re-encoded with WC_N_CS128W. Returns the modified copy of cc. The WC_* macros are defined elsewhere. */ |
46 | | wc_wchar_t |
47 | | wc_gbk_to_cs128w(wc_wchar_t cc) |
48 | 1.08M | { |
49 | 1.08M | cc.code = WC_GBK_N(cc.code); |
50 | 1.08M | if (cc.code < 0x4000) |
51 | 1.04M | cc.ccs = WC_CCS_GBK_1; |
52 | 40.8k | else { |
53 | 40.8k | cc.ccs = WC_CCS_GBK_2; |
54 | 40.8k | cc.code -= 0x4000; /* rebase so the second charset starts at 0 */ |
55 | 40.8k | } |
56 | 1.08M | cc.code = WC_N_CS128W(cc.code); |
57 | 1.08M | return cc; |
58 | 1.08M | } |
59 | | /* Reverses the steps of wc_gbk_to_cs128w: decodes the code with WC_CS128W_N, adds 0x4000 back when the charset is WC_CCS_GBK_2, tags the result WC_CCS_GBK and re-encodes it with WC_N_GBK. Any charset other than WC_CCS_GBK_2 is treated as having no offset. */ |
60 | | wc_wchar_t |
61 | | wc_cs128w_to_gbk(wc_wchar_t cc) |
62 | 1.07M | { |
63 | 1.07M | cc.code = WC_CS128W_N(cc.code); |
64 | 1.07M | if (cc.ccs == WC_CCS_GBK_2) |
65 | 39.6k | cc.code += 0x4000; /* undo the rebase applied when splitting */ |
66 | 1.07M | cc.ccs = WC_CCS_GBK; |
67 | 1.07M | cc.code = WC_N_GBK(cc.code); |
68 | 1.07M | return cc; |
69 | 1.07M | } |
70 | | /* Maps a GBK code c to an index: WC_GBK_N(c), minus 0x5E for each lead-byte step above 0xA1, plus a constant correction that depends on the range c falls in. The bounds are tested in ascending order, so each branch only sees codes above the previous bound. The range comments give the codes each branch is meant for; codes outside those ranges are not rejected. */ |
71 | | wc_uint32 |
72 | | wc_gbk_to_N(wc_uint32 c) |
73 | 102k | { |
74 | 102k | if (c <= 0xA1A0) /* 0x8140 - 0xA1A0 */ |
75 | 49.9k | return WC_GBK_N(c); |
76 | 52.8k | if (c <= 0xA2AA) /* 0xA240 - 0xA2A0, 0xA2A1 - 0xA2AA */ |
77 | 1.06k | return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E; |
78 | 51.7k | if (c <= 0xA6A0) /* originally annotated 0xA240 - 0xA6A0; presumably 0xA340 - 0xA6A0, since everything up to 0xA2AA returned above - confirm */ |
79 | 1.78k | return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A; |
80 | 49.9k | if (c <= 0xA6F5) /* 0xA6E0 - 0xA6F5 */ |
81 | 1.06k | return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A - 0x3F; |
82 | 48.9k | if (c <= 0xA8A0) /* 0xA7A0 - 0xA8A0 */ |
83 | 944 | return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16; |
84 | 47.9k | if (c <= 0xA8C0) /* 0xA8BB - 0xA8C0 */ |
85 | 363 | return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16 - 0x1A; |
86 | | /* 0xA940 - 0xFEA0 */ |
87 | 47.6k | return WC_GBK_N(c) - ((c >> 8) - 0xA1) * 0x5E + 0x0A + 0x16 + 0x06; |
88 | 47.9k | } |
89 | | /* Converts a whole GBK byte string to the internal form built by the wtf_push calls. If every byte is below 0x80 the input Str itself is returned; otherwise a new Str is built. The ces parameter is not referenced in the body. */ |
90 | | Str |
91 | | wc_conv_from_gbk(Str is, wc_ces ces) |
92 | 256 | { |
93 | 256 | Str os; |
94 | 256 | wc_uchar *sp = (wc_uchar *)is->ptr; |
95 | 256 | wc_uchar *ep = sp + is->length; |
96 | 256 | wc_uchar *p; |
97 | 256 | int state = WC_GBK_NOSTATE; |
98 | 256 | wc_uint32 gbk; |
99 | | |
100 | 18.0k | for (p = sp; p < ep && *p < 0x80; p++) /* skip the leading run of bytes below 0x80 */ |
101 | 17.7k | ; |
102 | 256 | if (p == ep) /* no byte >= 0x80 anywhere: nothing to convert */ |
103 | 25 | return is; |
104 | 231 | os = Strnew_size(is->length); |
105 | 231 | if (p > sp) /* copy the skipped prefix unchanged */ |
106 | 20 | Strcat_charp_n(os, (char *)is->ptr, (int)(p - sp)); |
107 | | |
108 | 3.20M | for (; p < ep; p++) { |
109 | 3.20M | switch (state) { |
110 | 1.94M | case WC_GBK_NOSTATE: |
111 | 1.94M | switch (WC_GBK_MAP[*p]) { |
112 | 1.25M | case UB: /* first byte of a two-byte sequence; it is re-read as *(p-1) on the next iteration */ |
113 | 1.25M | state = WC_GBK_MBYTE1; |
114 | 1.25M | break; |
115 | 61.3k | case C80: |
116 | 61.3k | wtf_push(os, WC_CCS_GBK_80, *p); |
117 | 61.3k | break; |
118 | 86.3k | case C1: |
119 | 86.3k | wtf_push_unknown(os, p, 1); |
120 | 86.3k | break; |
121 | 541k | default: /* every other class is copied through as a single byte */ |
122 | 541k | Strcat_char(os, (char)*p); |
123 | 541k | break; |
124 | 1.94M | } |
125 | 1.94M | break; |
126 | 1.94M | case WC_GBK_MBYTE1: |
127 | 1.25M | if (WC_GBK_MAP[*p] & LB) { /* bitwise test: which classes pass depends on the WC_GBK_MAP_* values, not visible here */ |
128 | 992k | gbk = ((wc_uint32)*(p-1) << 8) | *p; |
129 | 992k | if (*(p-1) >= 0xA1 && *p >= 0xA1) /* both bytes >= 0xA1: let wc_gb2312_or_gbk choose the charset */ |
130 | 857k | wtf_push(os, wc_gb2312_or_gbk(gbk), gbk); |
131 | 134k | else |
132 | 134k | wtf_push(os, WC_CCS_GBK, gbk); |
133 | 992k | } else /* second byte rejected: both bytes are pushed as unknown */ |
134 | 263k | wtf_push_unknown(os, p-1, 2); |
135 | 1.25M | state = WC_GBK_NOSTATE; |
136 | 1.25M | break; |
137 | 3.20M | } |
138 | 3.20M | } |
139 | 231 | switch (state) { |
140 | 26 | case WC_GBK_MBYTE1: /* input ended right after a first byte; p equals ep here, so p-1 is that byte */ |
141 | 26 | wtf_push_unknown(os, p-1, 1); |
142 | 26 | break; |
143 | 231 | } |
144 | 231 | return os; |
145 | 231 | } |
146 | | /* Appends the GBK byte encoding of cc to os. Charsets handled directly return after writing; any other charset is rewritten in the default case and the switch is retried. st is only passed on to wc_any_to_any_ces. */ |
147 | | void |
148 | | wc_push_to_gbk(Str os, wc_wchar_t cc, wc_status *st) |
149 | 7.52M | { |
150 | 14.4M | while (1) { /* loops only so the default case can replace cc and dispatch again */ |
151 | 14.4M | switch (cc.ccs) { |
152 | 9.80k | case WC_CCS_US_ASCII: |
153 | 9.80k | Strcat_char(os, (char)cc.code); |
154 | 9.80k | return; |
155 | 1.97M | case WC_CCS_GB_2312: /* two bytes, each with bit 0x80 set */ |
156 | 1.97M | Strcat_char(os, (char)((cc.code >> 8) | 0x80)); |
157 | 1.97M | Strcat_char(os, (char)((cc.code & 0xff) | 0x80)); |
158 | 1.97M | return; |
159 | 166k | case WC_CCS_GBK_80: /* single byte with bit 0x80 set */ |
160 | 166k | Strcat_char(os, (char)(cc.code | 0x80)); |
161 | 166k | return; |
162 | 853k | case WC_CCS_GBK_1: |
163 | 858k | case WC_CCS_GBK_2: |
164 | 858k | cc = wc_cs128w_to_gbk(cc); /* no break: falls through to write the two GBK bytes */ |
165 | 906k | case WC_CCS_GBK: |
166 | 906k | Strcat_char(os, (char)(cc.code >> 8)); |
167 | 906k | Strcat_char(os, (char)(cc.code & 0xff)); |
168 | 906k | return; |
169 | 3.69M | case WC_CCS_UNKNOWN_W: |
170 | 3.69M | if (!WcOption.no_replace) /* with no_replace set, nothing is written */ |
171 | 3.69M | Strcat_charp(os, WC_REPLACE_W); |
172 | 3.69M | return; |
173 | 779k | case WC_CCS_UNKNOWN: |
174 | 779k | if (!WcOption.no_replace) |
175 | 779k | Strcat_charp(os, WC_REPLACE); |
176 | 779k | return; |
177 | 6.88M | default: /* with USE_UNICODE the else below pairs with the ucs_conv test; without it the assignment to cc.ccs is unconditional */ |
178 | 6.88M | #ifdef USE_UNICODE |
179 | 6.88M | if (WcOption.ucs_conv) |
180 | 6.88M | cc = wc_any_to_any_ces(cc, st); |
181 | 0 | else |
182 | 0 | #endif |
183 | 0 | cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN; |
184 | 6.88M | continue; /* dispatch again on the rewritten cc */ |
185 | 14.4M | } |
186 | 14.4M | } |
187 | 7.52M | } |
188 | | /* Byte-at-a-time variant of the GBK decoder. Returns NULL after the first byte of a two-byte sequence; otherwise returns the output Str and resets st->state to -1. The output Str and the pending first byte are kept in function-level statics, so they are shared by every caller. */ |
189 | | Str |
190 | | wc_char_conv_from_gbk(wc_uchar c, wc_status *st) |
191 | 0 | { |
192 | 0 | static Str os; /* static: must survive the call that returns NULL */ |
193 | 0 | static wc_uchar gbku; /* pending first byte of a two-byte sequence */ |
194 | 0 | wc_uint32 gbk; |
195 |

196 | 0 | if (st->state == -1) { /* -1 marks a fresh start: allocate a new output Str */ |
197 | 0 | st->state = WC_GBK_NOSTATE; |
198 | 0 | os = Strnew_size(8); |
199 | 0 | } |
200 |

201 | 0 | switch (st->state) { |
202 | 0 | case WC_GBK_NOSTATE: |
203 | 0 | switch (WC_GBK_MAP[c]) { |
204 | 0 | case UB: |
205 | 0 | gbku = c; |
206 | 0 | st->state = WC_GBK_MBYTE1; |
207 | 0 | return NULL; /* nothing to hand back until the second byte arrives */ |
208 | 0 | case C80: |
209 | 0 | wtf_push(os, WC_CCS_GBK_80, c); |
210 | 0 | break; |
211 | 0 | case C1: /* NOTE(review): nothing is appended here, whereas wc_conv_from_gbk calls wtf_push_unknown for this class - confirm this is intended */ |
212 | 0 | break; |
213 | 0 | default: |
214 | 0 | Strcat_char(os, (char)c); |
215 | 0 | break; |
216 | 0 | } |
217 | 0 | break; |
218 | 0 | case WC_GBK_MBYTE1: |
219 | 0 | if (WC_GBK_MAP[c] & LB) { /* bitwise test, same as in wc_conv_from_gbk */ |
220 | 0 | gbk = ((wc_uint32)gbku << 8) | c; |
221 | 0 | if (gbku >= 0xA1 && c >= 0xA1) |
222 | 0 | wtf_push(os, wc_gb2312_or_gbk(gbk), gbk); |
223 | 0 | else |
224 | 0 | wtf_push(os, WC_CCS_GBK, gbk); |
225 | 0 | } /* no else: a rejected second byte appends nothing, and os is returned as it stands */ |
226 | 0 | break; |
227 | 0 | } |
228 | 0 | st->state = -1; /* the next call starts over with a new output Str */ |
229 | 0 | return os; |
230 | 0 | } |