Line | Count | Source (jump to first uncovered line) |
1 | | |
2 | | #ifdef USE_UNICODE |
3 | | |
4 | | #include "wc.h" |
5 | | #include "ucs.h" |
6 | | #include "utf8.h" |
7 | | #include "wtf.h" |
8 | | |
9 | | wc_uint8 WC_UTF8_MAP[ 0x100 ] = { |
10 | | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
11 | | 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, |
12 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
13 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
14 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
15 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
16 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
17 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, |
18 | | |
19 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
20 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
21 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
22 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
23 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
24 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
25 | | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
26 | | 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, |
27 | | }; |
28 | | |
29 | | static wc_uchar utf8_buf[7]; |
30 | | |
31 | | size_t |
32 | | wc_ucs_to_utf8(wc_uint32 ucs, wc_uchar *utf8) |
33 | 59.4M | { |
34 | 59.4M | if (ucs < WC_C_UTF8_L2) { |
35 | 3.26M | utf8[0] = ucs; |
36 | 3.26M | utf8[1] = 0; |
37 | 3.26M | return 1; |
38 | 56.1M | } else if (ucs < WC_C_UTF8_L3) { |
39 | 2.56k | utf8[0] = (ucs >> 6) | 0xc0; |
40 | 2.56k | utf8[1] = (ucs & 0x3f) | 0x80; |
41 | 2.56k | utf8[2] = 0; |
42 | 2.56k | return 2; |
43 | 56.1M | } else if (ucs < WC_C_UTF8_L4) { |
44 | 65.8k | utf8[0] = (ucs >> 12) | 0xe0; |
45 | 65.8k | utf8[1] = ((ucs >> 6) & 0x3f) | 0x80; |
46 | 65.8k | utf8[2] = (ucs & 0x3f) | 0x80; |
47 | 65.8k | utf8[3] = 0; |
48 | 65.8k | return 3; |
49 | 56.1M | } else if (ucs < WC_C_UTF8_L5) { |
50 | 56.1M | utf8[0] = (ucs >> 18) | 0xf0; |
51 | 56.1M | utf8[1] = ((ucs >> 12) & 0x3f) | 0x80; |
52 | 56.1M | utf8[2] = ((ucs >> 6) & 0x3f) | 0x80; |
53 | 56.1M | utf8[3] = (ucs & 0x3f) | 0x80; |
54 | 56.1M | utf8[4] = 0; |
55 | 56.1M | return 4; |
56 | 56.1M | } else if (ucs < WC_C_UTF8_L6) { |
57 | 399 | utf8[0] = (ucs >> 24) | 0xf8; |
58 | 399 | utf8[1] = ((ucs >> 18) & 0x3f) | 0x80; |
59 | 399 | utf8[2] = ((ucs >> 12) & 0x3f) | 0x80; |
60 | 399 | utf8[3] = ((ucs >> 6) & 0x3f) | 0x80; |
61 | 399 | utf8[4] = (ucs & 0x3f) | 0x80; |
62 | 399 | utf8[5] = 0; |
63 | 399 | return 5; |
64 | 954 | } else if (ucs <= WC_C_UCS4_END) { |
65 | 715 | utf8[0] = (ucs >> 30) | 0xfc; |
66 | 715 | utf8[1] = ((ucs >> 24) & 0x3f) | 0x80; |
67 | 715 | utf8[2] = ((ucs >> 18) & 0x3f) | 0x80; |
68 | 715 | utf8[3] = ((ucs >> 12) & 0x3f) | 0x80; |
69 | 715 | utf8[4] = ((ucs >> 6) & 0x3f) | 0x80; |
70 | 715 | utf8[5] = (ucs & 0x3f) | 0x80; |
71 | 715 | utf8[6] = 0; |
72 | 715 | return 6; |
73 | 715 | } else { |
74 | 239 | utf8[0] = 0; |
75 | 239 | return 0; |
76 | 239 | } |
77 | 59.4M | } |
78 | | |
79 | | wc_uint32 |
80 | | wc_utf8_to_ucs(wc_uchar *utf8) |
81 | 146k | { |
82 | 146k | wc_uint32 ucs; |
83 | | |
84 | 146k | switch (WC_UTF8_MAP[utf8[0]]) { |
85 | 0 | case 1: |
86 | 0 | ucs = (wc_uint32) utf8[0]; |
87 | 0 | if (ucs >= WC_C_UTF8_L2) |
88 | 0 | break; |
89 | 0 | return ucs; |
90 | 75.2k | case 2: |
91 | 75.2k | ucs = ((wc_uint32)(utf8[0] & 0x1f) << 6) |
92 | 75.2k | | (wc_uint32)(utf8[1] & 0x3f); |
93 | 75.2k | if (ucs < WC_C_UTF8_L2) |
94 | 9.59k | break; |
95 | 65.6k | return ucs; |
96 | 11.9k | case 3: |
97 | 11.9k | ucs = ((wc_uint32)(utf8[0] & 0x0f) << 12) |
98 | 11.9k | | ((wc_uint32)(utf8[1] & 0x3f) << 6) |
99 | 11.9k | | (wc_uint32)(utf8[2] & 0x3f); |
100 | 11.9k | if (ucs < WC_C_UTF8_L3) |
101 | 626 | break; |
102 | 11.2k | return ucs; |
103 | 56.1k | case 4: |
104 | 56.1k | ucs = ((wc_uint32)(utf8[0] & 0x07) << 18) |
105 | 56.1k | | ((wc_uint32)(utf8[1] & 0x3f) << 12) |
106 | 56.1k | | ((wc_uint32)(utf8[2] & 0x3f) << 6) |
107 | 56.1k | | (wc_uint32)(utf8[3] & 0x3f); |
108 | 56.1k | if (ucs < WC_C_UTF8_L4) |
109 | 219 | break; |
110 | 55.9k | return ucs; |
111 | 1.28k | case 5: |
112 | 1.28k | ucs = ((wc_uint32)(utf8[0] & 0x03) << 24) |
113 | 1.28k | | ((wc_uint32)(utf8[1] & 0x3f) << 18) |
114 | 1.28k | | ((wc_uint32)(utf8[2] & 0x3f) << 12) |
115 | 1.28k | | ((wc_uint32)(utf8[3] & 0x3f) << 6) |
116 | 1.28k | | (wc_uint32)(utf8[4] & 0x3f); |
117 | 1.28k | if (ucs < WC_C_UTF8_L5) |
118 | 216 | break; |
119 | 1.07k | return ucs; |
120 | 1.65k | case 6: |
121 | 1.65k | ucs = ((wc_uint32)(utf8[0] & 0x01) << 30) |
122 | 1.65k | | ((wc_uint32)(utf8[1] & 0x3f) << 24) |
123 | 1.65k | | ((wc_uint32)(utf8[2] & 0x3f) << 18) |
124 | 1.65k | | ((wc_uint32)(utf8[3] & 0x3f) << 12) |
125 | 1.65k | | ((wc_uint32)(utf8[4] & 0x3f) << 6) |
126 | 1.65k | | (wc_uint32)(utf8[5] & 0x3f); |
127 | 1.65k | if (ucs < WC_C_UTF8_L6) |
128 | 212 | break; |
129 | 1.44k | return ucs; |
130 | 0 | default: |
131 | 0 | break; |
132 | 146k | } |
133 | 10.8k | return WC_C_UCS4_ERROR; |
134 | 146k | } |
135 | | |
136 | | Str |
137 | | wc_conv_from_utf8(Str is, wc_ces ces) |
138 | 1.95k | { |
139 | 1.95k | Str os; |
140 | 1.95k | wc_uchar *sp = (wc_uchar *)is->ptr; |
141 | 1.95k | wc_uchar *ep = sp + is->length; |
142 | 1.95k | wc_uchar *p; |
143 | 1.95k | wc_uchar *q = NULL; |
144 | 1.95k | int state = WC_UTF8_NOSTATE; |
145 | 1.95k | size_t next = 0; |
146 | 1.95k | wc_uint32 ucs; |
147 | 1.95k | wc_status st; |
148 | | |
149 | 3.26k | for (p = sp; p < ep && *p < 0x80; p++) |
150 | 1.30k | ; |
151 | 1.95k | if (p == ep) |
152 | 79 | return is; |
153 | 1.87k | os = Strnew_size(is->length + is->length / 3); |
154 | 1.87k | if (p > sp) |
155 | 298 | Strcat_charp_n(os, is->ptr, (int)(p - sp)); |
156 | | |
157 | 1.87k | st.tag = NULL; |
158 | 1.87k | st.ntag = 0; |
159 | 74.1M | for (; p < ep; p++) { |
160 | 74.1M | switch (state) { |
161 | 73.0M | case WC_UTF8_NOSTATE: |
162 | 73.0M | next = WC_UTF8_MAP[*p]; |
163 | 73.0M | switch (next) { |
164 | 42.5M | case 1: |
165 | 42.5M | wtf_push_ucs(os, (wc_uint32)*p, &st); |
166 | 42.5M | break; |
167 | 4.00M | case 8: |
168 | 4.00M | Strcat_char(os, (char)*p); |
169 | 4.00M | break; |
170 | 25.5M | case 0: |
171 | 25.6M | case 7: |
172 | 25.6M | wtf_push_unknown(os, p, 1); |
173 | 25.6M | break; |
174 | 851k | default: |
175 | 851k | q = p; |
176 | 851k | next--; |
177 | 851k | state = WC_UTF8_NEXT; |
178 | 851k | break; |
179 | 73.0M | } |
180 | 73.0M | break; |
181 | 73.0M | case WC_UTF8_NEXT: |
182 | 1.06M | if (WC_UTF8_MAP[*p]) { |
183 | 705k | wtf_push_unknown(os, q, p - q + 1); |
184 | 705k | state = WC_UTF8_NOSTATE; |
185 | 705k | break; |
186 | 705k | } |
187 | 362k | if (--next) |
188 | 215k | break; |
189 | 146k | state = WC_UTF8_NOSTATE; |
190 | 146k | ucs = wc_utf8_to_ucs(q); |
191 | 146k | if (ucs == WC_C_UCS4_ERROR || |
192 | 146k | (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END)) |
193 | 11.4k | wtf_push_unknown(os, q, p - q + 1); |
194 | 134k | else if (ucs != WC_C_UCS2_BOM) |
195 | 134k | wtf_push_ucs(os, ucs, &st); |
196 | 146k | break; |
197 | 74.1M | } |
198 | 74.1M | } |
199 | 1.87k | switch (state) { |
200 | 58 | case WC_UTF8_NEXT: |
201 | 58 | wtf_push_unknown(os, q, p - q); |
202 | 58 | break; |
203 | 1.87k | } |
204 | 1.87k | return os; |
205 | 1.87k | } |
206 | | |
207 | | static int |
208 | | wc_push_tag_to_utf8(Str os, int ntag) |
209 | 5.92M | { |
210 | 5.92M | char *p; |
211 | | |
212 | 5.92M | if (ntag) { |
213 | 2.96M | p = wc_ucs_get_tag(ntag); |
214 | 2.96M | if (p == NULL) |
215 | 30 | ntag = 0; |
216 | 2.96M | } |
217 | 5.92M | if (ntag) { |
218 | 2.96M | wc_ucs_to_utf8(WC_C_LANGUAGE_TAG, utf8_buf); |
219 | 2.96M | Strcat_charp(os, (char *)utf8_buf); |
220 | 53.1M | for (; *p; p++) { |
221 | 50.2M | wc_ucs_to_utf8(WC_C_LANGUAGE_TAG0 | *p, utf8_buf); |
222 | 50.2M | Strcat_charp(os, (char *)utf8_buf); |
223 | 50.2M | } |
224 | 2.96M | } else { |
225 | 2.96M | wc_ucs_to_utf8(WC_C_CANCEL_TAG, utf8_buf); |
226 | 2.96M | Strcat_charp(os, (char *)utf8_buf); |
227 | 2.96M | } |
228 | 5.92M | return ntag; |
229 | 5.92M | } |
230 | | |
231 | | void |
232 | | wc_push_to_utf8(Str os, wc_wchar_t cc, wc_status *st) |
233 | 6.56M | { |
234 | 6.63M | while (1) { |
235 | 6.63M | switch (WC_CCS_SET(cc.ccs)) { |
236 | 1.55M | case WC_CCS_US_ASCII: |
237 | 1.55M | if (st->ntag) |
238 | 1.49M | st->ntag = wc_push_tag_to_utf8(os, 0); |
239 | 1.55M | Strcat_char(os, (char)(cc.code & 0x7f)); |
240 | 1.55M | return; |
241 | 66.2k | case WC_CCS_UCS2: |
242 | 67.9k | case WC_CCS_UCS4: |
243 | 67.9k | if (st->ntag) |
244 | 457 | st->ntag = wc_push_tag_to_utf8(os, 0); |
245 | 67.9k | wc_ucs_to_utf8(cc.code, utf8_buf); |
246 | 67.9k | Strcat_charp(os, (char *)utf8_buf); |
247 | 67.9k | return; |
248 | 3.26M | case WC_CCS_UCS_TAG: |
249 | 3.26M | if (WcOption.use_language_tag && wc_ucs_tag_to_tag(cc.code) != st->ntag) |
250 | 2.96M | st->ntag = wc_push_tag_to_utf8(os, wc_ucs_tag_to_tag(cc.code)); |
251 | 3.26M | wc_ucs_to_utf8(wc_ucs_tag_to_ucs(cc.code), utf8_buf); |
252 | 3.26M | Strcat_charp(os, (char *)utf8_buf); |
253 | 3.26M | return; |
254 | 703 | case WC_CCS_ISO_8859_1: |
255 | 703 | if (st->ntag) |
256 | 494 | st->ntag = wc_push_tag_to_utf8(os, 0); |
257 | 703 | wc_ucs_to_utf8((cc.code | 0x80), utf8_buf); |
258 | 703 | Strcat_charp(os, (char *)utf8_buf); |
259 | 703 | return; |
260 | 1.09k | case WC_CCS_UNKNOWN_W: |
261 | 1.09k | if (!WcOption.no_replace) { |
262 | 1.09k | if (st->ntag) |
263 | 207 | st->ntag = wc_push_tag_to_utf8(os, 0); |
264 | 1.09k | Strcat_charp(os, WC_REPLACE_W); |
265 | 1.09k | } |
266 | 1.09k | return; |
267 | 1.67M | case WC_CCS_UNKNOWN: |
268 | 1.67M | if (!WcOption.no_replace) { |
269 | 1.67M | if (st->ntag) |
270 | 1.46M | st->ntag = wc_push_tag_to_utf8(os, 0); |
271 | 1.67M | Strcat_charp(os, WC_REPLACE); |
272 | 1.67M | } |
273 | 1.67M | return; |
274 | 70.7k | default: |
275 | 70.7k | if (WcOption.ucs_conv && |
276 | 70.7k | (cc.code = wc_any_to_ucs(cc)) != WC_C_UCS4_ERROR) |
277 | 66.2k | cc.ccs = WC_CCS_UCS2; |
278 | 4.48k | else |
279 | 4.48k | cc.ccs = WC_CCS_IS_WIDE(cc.ccs) ? WC_CCS_UNKNOWN_W : WC_CCS_UNKNOWN; |
280 | 70.7k | continue; |
281 | 6.63M | } |
282 | 6.63M | } |
283 | 6.56M | } |
284 | | |
285 | | void |
286 | | wc_push_to_utf8_end(Str os, wc_status *st) |
287 | 513 | { |
288 | 513 | if (st->ntag) |
289 | 93 | st->ntag = wc_push_tag_to_utf8(os, 0); |
290 | 513 | return; |
291 | 513 | } |
292 | | |
293 | | Str |
294 | | wc_char_conv_from_utf8(wc_uchar c, wc_status *st) |
295 | 0 | { |
296 | 0 | static Str os; |
297 | 0 | static wc_uchar buf[6]; |
298 | 0 | static size_t nbuf, next; |
299 | 0 | wc_uint32 ucs; |
300 | |
|
301 | 0 | if (st->state == -1) { |
302 | 0 | st->state = WC_UTF8_NOSTATE; |
303 | 0 | os = Strnew_size(8); |
304 | 0 | st->tag = NULL; |
305 | 0 | st->ntag = 0; |
306 | 0 | nbuf = 0; |
307 | 0 | } |
308 | |
|
309 | 0 | switch (st->state) { |
310 | 0 | case WC_UTF8_NOSTATE: |
311 | 0 | switch (next = WC_UTF8_MAP[c]) { |
312 | 0 | case 1: |
313 | 0 | wtf_push_ucs(os, (wc_uint32)c, st); |
314 | 0 | break; |
315 | 0 | case 8: |
316 | 0 | Strcat_char(os, (char)c); |
317 | 0 | break; |
318 | 0 | case 0: |
319 | 0 | case 7: |
320 | 0 | break; |
321 | 0 | default: |
322 | 0 | buf[nbuf++] = c; |
323 | 0 | next--; |
324 | 0 | st->state = WC_UTF8_NEXT; |
325 | 0 | return NULL; |
326 | 0 | } |
327 | 0 | break; |
328 | 0 | case WC_UTF8_NEXT: |
329 | 0 | if (WC_UTF8_MAP[c]) |
330 | 0 | break; |
331 | 0 | buf[nbuf++] = c; |
332 | 0 | if (--next) |
333 | 0 | return NULL; |
334 | 0 | ucs = wc_utf8_to_ucs(buf); |
335 | 0 | if (ucs == WC_C_UCS4_ERROR || |
336 | 0 | (ucs >= WC_C_UCS2_SURROGATE && ucs <= WC_C_UCS2_SURROGATE_END)) |
337 | 0 | break; |
338 | 0 | if (ucs != WC_C_UCS2_BOM) |
339 | 0 | wtf_push_ucs(os, ucs, st); |
340 | 0 | break; |
341 | 0 | } |
342 | 0 | st->state = -1; |
343 | 0 | return os; |
344 | 0 | } |
345 | | |
346 | | #endif |