Line | Count | Source |
1 | | #ifdef USE_UNICODE |
2 | | |
3 | | #include "wc.h" |
4 | | #include "ucs.h" |
5 | | #include "utf7.h" |
6 | | #include "wtf.h" |
7 | | |
/*
 * Short aliases for the WC_UTF7_MAP_* character-class constants (defined
 * outside this file).  They keep the WC_UTF7_MAP table below readable and
 * are also used as case labels when switching on a table entry.
 *
 *   SD, SO, SB - alias WC_UTF7_MAP_SET_D / _SET_O / _SET_B
 *   BB         - alias WC_UTF7_MAP_BASE64
 *   BP, BM     - alias WC_UTF7_MAP_PLUS / _MINUS ('+' and '-' in the table)
 *   CD, CB     - SET_D / SET_B combined with the WC_UTF7_MAP_C0 flag
 *   C1         - alias WC_UTF7_MAP_C1 (used for bytes 0x80-0xFF in the table)
 */
#define SD WC_UTF7_MAP_SET_D
#define SO WC_UTF7_MAP_SET_O
#define SB WC_UTF7_MAP_SET_B
#define BB WC_UTF7_MAP_BASE64
#define BP WC_UTF7_MAP_PLUS
#define BM WC_UTF7_MAP_MINUS
#define CD (WC_UTF7_MAP_SET_D | WC_UTF7_MAP_C0)
#define CB (WC_UTF7_MAP_SET_B | WC_UTF7_MAP_C0)
#define C1 WC_UTF7_MAP_C1
17 | | |
18 | | wc_uint8 WC_UTF7_MAP[ 0x100 ] = { |
19 | | /* TAB NL CR */ |
20 | | CB, CB, CB, CB, CB, CB, CB, CB, CB, CD, CD, CB, CB, CD, CB, CB, |
21 | | /* */ |
22 | | CB, CB, CB, CB, CB, CB, CB, CB, CB, CB, CB, CB, CB, CB, CB, CB, |
23 | | /* SP ! " # $ % & ' ( ) * + , - . / */ |
24 | | SD, SO, SO, SO, SO, SO, SO, SD, SD, SD, SO, BP, SD, BM, SD, BB, |
25 | | /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ |
26 | | BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, SD, SO, SO, SO, SO, SD, |
27 | | /* @ A B C D E F G H I J K L M N O */ |
28 | | BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, |
29 | | /* P Q R S T U V W X Y Z [ \ ] ^ _ */ |
30 | | BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, SO, SB, SO, SO, SO, |
31 | | /* ` a b c d e f g h i j k l m n o */ |
32 | | SO, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, |
33 | | /* p q r s t u v w x y z { | } ~ DEL */ |
34 | | BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, SO, SO, SO, SB, CB, |
35 | | |
36 | | C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, |
37 | | C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, |
38 | | C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, |
39 | | C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, |
40 | | C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, |
41 | | C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, |
42 | | C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, |
43 | | C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, C1, |
44 | | }; |
45 | | |
/*
 * Base64 digit values for the characters 0x20..0x7F, indexed by
 * (character - 0x20); see C_BASE64().  Characters outside the base64
 * alphabet map to -1.
 *
 * The element type is explicitly `signed char`: plain `char` may be
 * unsigned, in which case the -1 sentinel would silently become 255.
 * The table is never written, hence `const`.
 */
static const signed char c_base64_map[ 0x60 ] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
};
54 | | |
/* Base64 alphabet: base64_c_map[v] is the character for the 6-bit value v (0..63). */
static char base64_c_map[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* 6-bit value -> base64 character.  x is used unchecked as the index. */
#define BASE64_C(x) base64_c_map[(x)]
/*
 * Character -> base64 value via c_base64_map, which starts at 0x20.
 * x is not range-checked here; callers in this file only apply it to
 * bytes that WC_UTF7_MAP classifies as BB or BP.
 */
#define C_BASE64(x) c_base64_map[(x) - 0x20]
60 | | |
61 | | Str |
62 | | wc_conv_from_utf7(Str is, wc_ces ces) |
63 | 284 | { |
64 | 284 | Str os; |
65 | 284 | wc_uchar *sp = (wc_uchar *)is->ptr; |
66 | 284 | wc_uchar *ep = sp + is->length; |
67 | 284 | wc_uchar *p; |
68 | 284 | int state = WC_UTF7_NOSTATE; |
69 | 284 | wc_uint32 b, high = 0; |
70 | 284 | wc_status st; |
71 | | |
72 | 719 | for (p = sp; p < ep && *p < 0x80 && *p != WC_C_UTF7_PLUS; p++) |
73 | 435 | ; |
74 | 284 | if (p == ep) |
75 | 16 | return is; |
76 | 268 | os = Strnew_size(is->length + is->length / 3); |
77 | 268 | if (p > sp) |
78 | 33 | Strcat_charp_n(os, is->ptr, (int)(p - sp)); |
79 | | |
80 | 268 | st.tag = NULL; |
81 | 268 | st.ntag = 0; |
82 | 3.01M | for (; p < ep; p++) { |
83 | 3.01M | switch (state) { |
84 | 944k | case WC_UTF7_NOSTATE: |
85 | 944k | if (*p == WC_C_UTF7_PLUS) { |
86 | 2.81k | state = WC_UTF7_PLUS; |
87 | 2.81k | st.shift = 16; |
88 | 2.81k | st.base = 0; |
89 | 2.81k | high = 0; |
90 | 2.81k | continue; |
91 | 2.81k | } |
92 | 941k | break; |
93 | 941k | case WC_UTF7_PLUS: |
94 | 8.30k | if (*p == WC_C_UTF7_MINUS) |
95 | 238 | wtf_push_ucs(os, (wc_uint32)WC_C_UTF7_PLUS, &st); |
96 | 2.07M | case WC_UTF7_BASE64: |
97 | 2.07M | switch (WC_UTF7_MAP[*p]) { |
98 | 191k | case BB: /* [A-Za-z0-9/] */ |
99 | 225k | case BP: /* '+' */ |
100 | 225k | b = C_BASE64(*p); |
101 | 225k | st.shift -= 6; |
102 | 225k | if (st.shift <= 0) { |
103 | 83.5k | st.base |= b >> (- st.shift); |
104 | 83.5k | if (st.base >= WC_C_UCS2_SURROGATE && |
105 | 21.4k | st.base < WC_C_UCS2_SURROGATE_LOW) { |
106 | 1.72k | if (! high) |
107 | 1.41k | high = st.base; |
108 | 311 | else |
109 | 311 | high = 0; /* error */ |
110 | 81.7k | } else if (st.base >= WC_C_UCS2_SURROGATE_LOW && |
111 | 19.6k | st.base <= WC_C_UCS2_SURROGATE_END) { |
112 | 1.84k | if (high) |
113 | 667 | wtf_push_ucs(os, wc_utf16_to_ucs(high, st.base), &st); |
114 | | /* else; */ /* error */ |
115 | 1.84k | high = 0; |
116 | 79.9k | } else if (st.base != WC_C_UCS2_BOM) |
117 | 79.7k | wtf_push_ucs(os, st.base, &st); |
118 | 83.5k | st.shift += 16; |
119 | 83.5k | st.base = 0; |
120 | 83.5k | } |
121 | 225k | st.base |= (b << st.shift) & 0xffff; |
122 | 225k | state = WC_UTF7_BASE64; |
123 | 225k | continue; |
124 | 2.62k | case BM: /* '-' */ |
125 | 2.62k | state = WC_UTF7_NOSTATE; |
126 | 2.62k | continue; |
127 | 2.07M | } |
128 | 3.01M | } |
129 | 2.78M | switch (WC_UTF7_MAP[*p]) { |
130 | 11.9k | case CD: |
131 | 958k | case CB: |
132 | 958k | Strcat_char(os, (char)*p); |
133 | 958k | break; |
134 | 689k | case C1: |
135 | 689k | wtf_push_unknown(os, p, 1); |
136 | 689k | break; |
137 | 1.13M | default: |
138 | 1.13M | wtf_push_ucs(os, (wc_uint32)*p, &st); |
139 | 1.13M | break; |
140 | 2.78M | } |
141 | 2.78M | } |
142 | 268 | return os; |
143 | 268 | } |
144 | | |
145 | | static void |
146 | | wc_push_ucs_to_utf7(Str os, wc_uint32 ucs, wc_status *st) |
147 | 1.05G | { |
148 | 1.05G | if (ucs > WC_C_UNICODE_END) |
149 | 1.22k | return; |
150 | 1.05G | if (ucs > WC_C_UCS2_END) { |
151 | 337M | ucs = wc_ucs_to_utf16(ucs); |
152 | 337M | wc_push_ucs_to_utf7(os, ucs >> 16, st); |
153 | 337M | wc_push_ucs_to_utf7(os, ucs & 0xffff, st); |
154 | 337M | return; |
155 | 337M | } |
156 | 716M | if (ucs < 0x80) { |
157 | 40.3M | switch (WC_UTF7_MAP[ucs]) { |
158 | 17.2M | case BB: |
159 | 17.2M | case BM: |
160 | 35.2M | case SD: |
161 | 35.2M | case CD: |
162 | 35.2M | if (st->state == WC_UTF7_BASE64) { |
163 | 31.9M | Strcat_char(os, BASE64_C(st->base)); |
164 | 31.9M | Strcat_char(os, WC_C_UTF7_MINUS); |
165 | 31.9M | st->state = WC_UTF7_NOSTATE; |
166 | 31.9M | } |
167 | 35.2M | Strcat_char(os, (char)ucs); |
168 | 35.2M | return; |
169 | 1.50M | case BP: |
170 | 1.50M | if (st->state == WC_UTF7_BASE64) { |
171 | 1.38M | Strcat_char(os, BASE64_C(st->base)); |
172 | 1.38M | Strcat_char(os, WC_C_UTF7_MINUS); |
173 | 1.38M | st->state = WC_UTF7_NOSTATE; |
174 | 1.38M | } |
175 | 1.50M | Strcat_char(os, WC_C_UTF7_PLUS); |
176 | 1.50M | Strcat_char(os, WC_C_UTF7_MINUS); |
177 | 1.50M | return; |
178 | 40.3M | } |
179 | 40.3M | } |
180 | 679M | if (st->state == WC_UTF7_BASE64 && st->shift) { |
181 | 446M | st->shift += 16; |
182 | 446M | st->base |= ucs >> st->shift; |
183 | 446M | Strcat_char(os, BASE64_C(st->base)); |
184 | 446M | } else { |
185 | 232M | if (st->state != WC_UTF7_BASE64) { |
186 | 33.3M | Strcat_char(os, WC_C_UTF7_PLUS); |
187 | 33.3M | st->state = WC_UTF7_BASE64; |
188 | 33.3M | } |
189 | 232M | st->shift = 16; |
190 | 232M | st->base = 0; |
191 | 232M | } |
192 | 679M | st->shift -= 6; |
193 | 679M | Strcat_char(os, BASE64_C((ucs >> st->shift) & 0x3f)); |
194 | 679M | st->shift -= 6; |
195 | 679M | Strcat_char(os, BASE64_C((ucs >> st->shift) & 0x3f)); |
196 | 679M | if (st->shift) { |
197 | 464M | st->shift -= 6; |
198 | 464M | st->base = (ucs << (- st->shift)) & 0x3f; |
199 | 464M | } |
200 | 679M | return; |
201 | 716M | } |
202 | | |
203 | | static int |
204 | | wc_push_tag_to_utf7(Str os, int ntag, wc_status *st) |
205 | 35.6M | { |
206 | 35.6M | char *p; |
207 | | |
208 | 35.6M | if (ntag) { |
209 | 17.8M | p = wc_ucs_get_tag(ntag); |
210 | 17.8M | if (p == NULL) |
211 | 606 | ntag = 0; |
212 | 17.8M | } |
213 | 35.6M | if (ntag) { |
214 | 17.8M | wc_push_ucs_to_utf7(os, WC_C_LANGUAGE_TAG, st); |
215 | 319M | for (; *p; p++) |
216 | 301M | wc_push_ucs_to_utf7(os, WC_C_LANGUAGE_TAG0 | *p, st); |
217 | 17.8M | } else |
218 | 17.8M | wc_push_ucs_to_utf7(os, WC_C_CANCEL_TAG, st); |
219 | 35.6M | return ntag; |
220 | 35.6M | } |
221 | | |
222 | | void |
223 | | wc_push_to_utf7(Str os, wc_wchar_t cc, wc_status *st) |
224 | 41.4M | { |
225 | 41.4M | char *p; |
226 | | |
227 | 42.7M | while (1) { |
228 | 42.7M | switch (WC_CCS_SET(cc.ccs)) { |
229 | 1.99k | case WC_CCS_UCS4: |
230 | 1.99k | if (cc.code > WC_C_UNICODE_END) { |
231 | 1.71k | cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN; |
232 | 1.71k | continue; |
233 | 1.71k | } |
234 | 3.60M | case WC_CCS_US_ASCII: |
235 | 4.79M | case WC_CCS_UCS2: |
236 | 4.79M | if (st->ntag) |
237 | 2.28M | st->ntag = wc_push_tag_to_utf7(os, 0, st); |
238 | 4.79M | wc_push_ucs_to_utf7(os, cc.code, st); |
239 | 4.79M | return; |
240 | 19.3M | case WC_CCS_UCS_TAG: |
241 | 19.3M | if (WcOption.use_language_tag && wc_ucs_tag_to_tag(cc.code) != st->ntag) |
242 | 17.8M | st->ntag = wc_push_tag_to_utf7(os, wc_ucs_tag_to_tag(cc.code), st); |
243 | 19.3M | wc_push_ucs_to_utf7(os, wc_ucs_tag_to_ucs(cc.code), st); |
244 | 19.3M | return; |
245 | 497 | case WC_CCS_ISO_8859_1: |
246 | 497 | if (st->ntag) |
247 | 245 | st->ntag = wc_push_tag_to_utf7(os, 0, st); |
248 | 497 | wc_push_ucs_to_utf7(os, cc.code | 0x80, st); |
249 | 497 | return; |
250 | 184k | case WC_CCS_UNKNOWN_W: |
251 | 184k | if (!WcOption.no_replace) { |
252 | 184k | if (st->ntag) |
253 | 238 | st->ntag = wc_push_tag_to_utf7(os, 0, st); |
254 | 553k | for (p = WC_REPLACE_W; *p; p++) |
255 | 368k | wc_push_ucs_to_utf7(os, (wc_uint32)*p, st); |
256 | 184k | } |
257 | 184k | return; |
258 | 17.1M | case WC_CCS_UNKNOWN: |
259 | 17.1M | if (!WcOption.no_replace) { |
260 | 17.1M | if (st->ntag) |
261 | 15.5M | st->ntag = wc_push_tag_to_utf7(os, 0, st); |
262 | 34.2M | for (p = WC_REPLACE; *p; p++) |
263 | 17.1M | wc_push_ucs_to_utf7(os, (wc_uint32)*p, st); |
264 | 17.1M | } |
265 | 17.1M | return; |
266 | 1.35M | default: |
267 | 1.35M | if (WcOption.ucs_conv && |
268 | 1.35M | (cc.code = wc_any_to_ucs(cc)) != WC_C_UCS4_ERROR) |
269 | 1.16M | cc.ccs = WC_CCS_UCS2; |
270 | 191k | else |
271 | 191k | cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN; |
272 | 1.35M | continue; |
273 | 42.7M | } |
274 | 42.7M | } |
275 | 41.4M | } |
276 | | |
277 | | void |
278 | | wc_push_to_utf7_end(Str os, wc_status *st) |
279 | 700 | { |
280 | 700 | if (st->ntag) |
281 | 257 | st->ntag = wc_push_tag_to_utf7(os, 0, st); |
282 | 700 | if (st->state == WC_UTF7_BASE64) { |
283 | 472 | if (st->shift) |
284 | 369 | Strcat_char(os, BASE64_C(st->base)); |
285 | 472 | Strcat_char(os, WC_C_UTF7_MINUS); |
286 | 472 | } |
287 | 700 | return; |
288 | 700 | } |
289 | | |
/*
 * Incremental UTF-7 decoder: consume one input byte `c`.
 *
 * st - decoder state kept by the caller between calls.  st->state == -1
 *      means "start a fresh output string"; st->shift / st->base accumulate
 *      base64 bits exactly as in wc_conv_from_utf7().
 *
 * Returns NULL when the byte was consumed without producing a result (the
 * opening '+' or a closing '-'), otherwise the function-static output
 * string `os`.
 *
 * `os` and `high` are function-static, so the decoder keeps state outside
 * `st` as well.
 * NOTE(review): while inside a base64 run `os` is returned after every
 * digit, including digits that did not complete a code unit, and it is only
 * re-created once st->state is -1 again - confirm how callers consume the
 * returned string.
 * NOTE(review): unlike the table lookup C_BASE64() result, nothing here
 * checks that `b` is a valid 6-bit value - this relies on WC_UTF7_MAP only
 * classing real base64 characters as BB/BP.
 */
Str
wc_char_conv_from_utf7(wc_uchar c, wc_status *st)
{
    static Str os;
    static wc_uint32 high;
    wc_uint32 b;

    /* -1 marks the start of a new output unit: reset and allocate. */
    if (st->state == -1) {
        st->state = WC_UTF7_NOSTATE;
        os = Strnew_size(8);
    }

    switch (st->state) {
    case WC_UTF7_NOSTATE:
        if (c == WC_C_UTF7_PLUS) {
            /* '+' opens a shifted run; nothing to output yet. */
            st->state = WC_UTF7_PLUS;
            st->shift = 16;
            st->base = 0;
            high = 0;
            return NULL;
        }
        break;
    case WC_UTF7_PLUS:
        /* "+-" is a literal '+'. */
        if (c == WC_C_UTF7_MINUS) {
            wtf_push_ucs(os, (wc_uint32)WC_C_UTF7_PLUS, st);
            st->state = -1;
            return os;
        }
        /* falls through: the first byte after '+' is handled as run content */
    case WC_UTF7_BASE64:
        switch (WC_UTF7_MAP[c]) {
        case BB:    /* [A-Za-z0-9/] */
        case BP:    /* '+' */
            b = C_BASE64(c);
            st->shift -= 6;
            if (st->shift <= 0) {
                /* A 16-bit code unit is complete. */
                st->base |= b >> (- st->shift);
                if (st->base >= WC_C_UCS2_SURROGATE &&
                    st->base < WC_C_UCS2_SURROGATE_LOW) {
                    if (! high)
                        high = st->base;
                    else
                        high = 0;   /* error */
                } else if (st->base >= WC_C_UCS2_SURROGATE_LOW &&
                           st->base <= WC_C_UCS2_SURROGATE_END) {
                    if (high)
                        wtf_push_ucs(os, wc_utf16_to_ucs(high, st->base), st);
                    /* else; */     /* error */
                    high = 0;
                } else if (st->base != WC_C_UCS2_BOM)
                    wtf_push_ucs(os, st->base, st);
                st->shift += 16;
                st->base = 0;
            }
            st->base |= (b << st->shift) & 0xffff;
            st->state = WC_UTF7_BASE64;
            return os;
        case BM:    /* '-' */
            /* Closing '-' is absorbed; next call starts a fresh string. */
            st->state = -1;
            return NULL;
        }
    }
    /* Ordinary byte (or a byte that ended a shifted run). */
    switch (WC_UTF7_MAP[c]) {
    case CD:
    case CB:
        Strcat_char(os, (char)c);
        break;
    case C1:
        /* bytes classed C1 produce no output here */
        break;
    default:
        wtf_push_ucs(os, (wc_uint32)c, st);
        break;
    }
    st->state = -1;
    return os;
}
365 | | |
366 | | #endif |
367 | | |
368 | | |