/src/CMake/Utilities/cmlibarchive/libarchive/archive_string.c
Line | Count | Source |
1 | | /*- |
2 | | * Copyright (c) 2003-2011 Tim Kientzle |
3 | | * Copyright (c) 2011-2012 Michihiro NAKAJIMA |
4 | | * All rights reserved. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions |
8 | | * are met: |
9 | | * 1. Redistributions of source code must retain the above copyright |
10 | | * notice, this list of conditions and the following disclaimer. |
11 | | * 2. Redistributions in binary form must reproduce the above copyright |
12 | | * notice, this list of conditions and the following disclaimer in the |
13 | | * documentation and/or other materials provided with the distribution. |
14 | | * |
15 | | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR |
16 | | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
17 | | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
18 | | * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, |
19 | | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
20 | | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
21 | | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
22 | | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
23 | | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
24 | | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | | */ |
26 | | |
27 | | #include "archive_platform.h" |
28 | | |
29 | | /* |
30 | | * Basic resizable string support, to simplify manipulating arbitrary-sized |
31 | | * strings while minimizing heap activity. |
32 | | * |
33 | | * In particular, the buffer used by a string object is only grown, it |
34 | | * never shrinks, so you can clear and reuse the same string object |
35 | | * without incurring additional memory allocations. |
36 | | */ |
37 | | |
38 | | #ifdef HAVE_ERRNO_H |
39 | | #include <errno.h> |
40 | | #endif |
41 | | #ifdef HAVE_ICONV_H |
42 | | #include <iconv.h> |
43 | | #endif |
44 | | #ifdef HAVE_LANGINFO_H |
45 | | #include <langinfo.h> |
46 | | #endif |
47 | | #ifdef HAVE_LOCALCHARSET_H |
48 | | #include <localcharset.h> |
49 | | #endif |
50 | | #ifdef HAVE_STDLIB_H |
51 | | #include <stdlib.h> |
52 | | #endif |
53 | | #ifdef HAVE_STRING_H |
54 | | #include <string.h> |
55 | | #endif |
56 | | #ifdef HAVE_WCHAR_H |
57 | | #include <wchar.h> |
58 | | #endif |
59 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
60 | | #include <windows.h> |
61 | | #include <locale.h> |
62 | | #endif |
63 | | |
64 | | #include "archive_endian.h" |
65 | | #include "archive_private.h" |
66 | | #include "archive_string.h" |
67 | | #include "archive_string_composition.h" |
68 | | |
69 | | #if !defined(HAVE_WMEMCPY) && !defined(wmemcpy) |
70 | | #define wmemcpy(a,b,i) (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t)) |
71 | | #endif |
72 | | |
73 | | #if !defined(HAVE_WMEMMOVE) && !defined(wmemmove) |
74 | | #define wmemmove(a,b,i) (wchar_t *)memmove((a), (b), (i) * sizeof(wchar_t)) |
75 | | #endif |
76 | | |
77 | | #undef max |
78 | 26 | #define max(a, b) ((a)>(b)?(a):(b)) |
79 | | |
/*
 * State for one character-set conversion ("from" charset -> "to" charset).
 *
 * The SCONV_* macros defined inline below are bit values for `flag`.
 * Note that bit 5 (1<<5) is not assigned to any flag here.
 */
struct archive_string_conv {
	/* Link to another conversion object (presumably a singly linked
	 * list owned by the archive handle -- confirm against callers). */
	struct archive_string_conv	*next;
	char				*from_charset;
	char				*to_charset;
	/* Code pages matching from_charset/to_charset; the Windows code
	 * paths pass these to the Win32 conversion APIs. */
	unsigned			 from_cp;
	unsigned			 to_cp;
	/* Set 1 if from_charset and to_charset are the same. */
	int				 same;
	/* Bitwise OR of the SCONV_* values below. */
	int				 flag;
#define SCONV_TO_CHARSET	1	/* MBS is being converted to specified
					 * charset. */
#define SCONV_FROM_CHARSET	(1<<1)	/* MBS is being converted from
					 * specified charset. */
#define SCONV_BEST_EFFORT 	(1<<2)	/* Copy at least ASCII code. */
#define SCONV_WIN_CP	 	(1<<3)	/* Use Windows API for converting
					 * MBS. */
#define SCONV_UTF8_LIBARCHIVE_2 (1<<4)	/* Incorrect UTF-8 made by libarchive
					 * 2.x in the wrong assumption. */
#define SCONV_NORMALIZATION_C	(1<<6)	/* Need normalization to be Form C.
					 * Before UTF-8 characters are actually
					 * processed. */
#define SCONV_NORMALIZATION_D	(1<<7)	/* Need normalization to be Form D.
					 * Before UTF-8 characters are actually
					 * processed.
					 * Currently this is only used on
					 * Mac OS X. */
#define SCONV_TO_UTF8		(1<<8)	/* "to charset" side is UTF-8. */
#define SCONV_FROM_UTF8		(1<<9)	/* "from charset" side is UTF-8. */
#define SCONV_TO_UTF16BE 	(1<<10)	/* "to charset" side is UTF-16BE. */
#define SCONV_FROM_UTF16BE 	(1<<11)	/* "from charset" side is UTF-16BE. */
#define SCONV_TO_UTF16LE 	(1<<12)	/* "to charset" side is UTF-16LE. */
#define SCONV_FROM_UTF16LE 	(1<<13)	/* "from charset" side is UTF-16LE. */
/* Convenience masks: either byte order of UTF-16 on the given side. */
#define SCONV_TO_UTF16		(SCONV_TO_UTF16BE | SCONV_TO_UTF16LE)
#define SCONV_FROM_UTF16	(SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE)

#if HAVE_ICONV
	iconv_t				 cd;
	iconv_t				 cd_w;/* Use at archive_mstring on
				 	       * Windows. */
#endif
	/* A temporary buffer for normalization. */
	struct archive_string		 utftmp;
	/* Up to two converter callbacks; `nconverter` is presumably the
	 * number of entries in use -- confirm where the array is filled. */
	int (*converter[2])(struct archive_string *, const void *, size_t,
	    struct archive_string_conv *);
	int				 nconverter;
};
125 | | |
/* Code page identifiers used by this file. */
#define CP_C_LOCALE	0	/* "C" locale only for this file. */
#define CP_UTF16LE	1200
#define CP_UTF16BE	1201

/* Range tests on a 16-bit code unit / code point value `uc`. */
#define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF)
#define IS_LOW_SURROGATE_LA(uc)	 ((uc) >= 0xDC00 && (uc) <= 0xDFFF)
/* NOTE: despite its name this tests a single value: it is true when `uc`
 * lies anywhere in the surrogate range (high or low). */
#define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF)
/* Largest value accepted as a Unicode code point. */
#define UNICODE_MAX		0x10FFFF
#define UNICODE_R_CHAR		0xFFFD	/* Replacement character. */
/* Set U+FFFD(Replacement character) in UTF-8. */
/* The array is exactly three bytes long and is NOT NUL-terminated. */
static const char utf8_replacement_char[] = {0xef, 0xbf, 0xbd};
137 | | |
138 | | static struct archive_string_conv *find_sconv_object(struct archive *, |
139 | | const char *, const char *); |
140 | | static void add_sconv_object(struct archive *, struct archive_string_conv *); |
141 | | static struct archive_string_conv *create_sconv_object(const char *, |
142 | | const char *, unsigned, int); |
143 | | static void free_sconv_object(struct archive_string_conv *); |
144 | | static struct archive_string_conv *get_sconv_object(struct archive *, |
145 | | const char *, const char *, int); |
146 | | static unsigned make_codepage_from_charset(const char *); |
147 | | static unsigned get_current_codepage(void); |
148 | | static unsigned get_current_oemcp(void); |
149 | | static size_t mbsnbytes(const void *, size_t); |
150 | | static size_t utf16nbytes(const void *, size_t); |
151 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
152 | | static int archive_wstring_append_from_mbs_in_codepage( |
153 | | struct archive_wstring *, const char *, size_t, |
154 | | struct archive_string_conv *); |
155 | | static int archive_string_append_from_wcs_in_codepage(struct archive_string *, |
156 | | const wchar_t *, size_t, struct archive_string_conv *); |
157 | | static int strncat_in_codepage(struct archive_string *, const void *, |
158 | | size_t, struct archive_string_conv *); |
159 | | static int win_strncat_from_utf16be(struct archive_string *, const void *, |
160 | | size_t, struct archive_string_conv *); |
161 | | static int win_strncat_from_utf16le(struct archive_string *, const void *, |
162 | | size_t, struct archive_string_conv *); |
163 | | static int win_strncat_to_utf16be(struct archive_string *, const void *, |
164 | | size_t, struct archive_string_conv *); |
165 | | static int win_strncat_to_utf16le(struct archive_string *, const void *, |
166 | | size_t, struct archive_string_conv *); |
167 | | #endif |
168 | | static int best_effort_strncat_from_utf16be(struct archive_string *, |
169 | | const void *, size_t, struct archive_string_conv *); |
170 | | static int best_effort_strncat_from_utf16le(struct archive_string *, |
171 | | const void *, size_t, struct archive_string_conv *); |
172 | | static int best_effort_strncat_to_utf16be(struct archive_string *, |
173 | | const void *, size_t, struct archive_string_conv *); |
174 | | static int best_effort_strncat_to_utf16le(struct archive_string *, |
175 | | const void *, size_t, struct archive_string_conv *); |
176 | | #if defined(HAVE_ICONV) |
177 | | static int iconv_strncat_in_locale(struct archive_string *, const void *, |
178 | | size_t, struct archive_string_conv *); |
179 | | #endif |
180 | | static int best_effort_strncat_in_locale(struct archive_string *, |
181 | | const void *, size_t, struct archive_string_conv *); |
182 | | static int _utf8_to_unicode(uint32_t *, const char *, size_t); |
183 | | static int utf8_to_unicode(uint32_t *, const char *, size_t); |
184 | | static inline uint32_t combine_surrogate_pair(uint32_t, uint32_t); |
185 | | static int cesu8_to_unicode(uint32_t *, const char *, size_t); |
186 | | static size_t unicode_to_utf8(char *, size_t, uint32_t); |
187 | | static int utf16_to_unicode(uint32_t *, const char *, size_t, int); |
188 | | static size_t unicode_to_utf16be(char *, size_t, uint32_t); |
189 | | static size_t unicode_to_utf16le(char *, size_t, uint32_t); |
190 | | static int strncat_from_utf8_libarchive2(struct archive_string *, |
191 | | const void *, size_t, struct archive_string_conv *); |
192 | | static int strncat_from_utf8_to_utf8(struct archive_string *, const void *, |
193 | | size_t, struct archive_string_conv *); |
194 | | static int archive_string_normalize_C(struct archive_string *, const void *, |
195 | | size_t, struct archive_string_conv *); |
196 | | static int archive_string_normalize_D(struct archive_string *, const void *, |
197 | | size_t, struct archive_string_conv *); |
198 | | static int archive_string_append_unicode(struct archive_string *, |
199 | | const void *, size_t, struct archive_string_conv *); |
200 | | |
/*
 * IS_BIG_ENDIAN evaluates to non-zero on big-endian hosts.
 *
 * Compiler-provided macros are tried first so the value is a compile-time
 * constant; only when none of them is available does it fall back to the
 * run-time probe is_big_endian().
 */
#if defined __LITTLE_ENDIAN__
  #define IS_BIG_ENDIAN 0
#elif defined __BIG_ENDIAN__
  #define IS_BIG_ENDIAN 1
#elif defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
  #define IS_BIG_ENDIAN 0
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  #define IS_BIG_ENDIAN 1
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || defined(_M_ARM64))
  #define IS_BIG_ENDIAN 0
#else
// Detect endianness at runtime.
/*
 * Stores the value 1 in a native 16-bit integer and decodes its bytes as
 * big-endian; the result is 1 only if the native byte order is big-endian.
 */
static int
is_big_endian(void)
{
	uint16_t d = 1;

	return (archive_be16dec(&d) == 1);
}

#define IS_BIG_ENDIAN is_big_endian()
#endif
223 | | |
224 | | static struct archive_string * |
225 | | archive_string_append(struct archive_string *as, const char *p, size_t s) |
226 | 7.51M | { |
227 | 7.51M | if (archive_string_ensure(as, as->length + s + 1) == NULL) |
228 | 0 | return (NULL); |
229 | 7.51M | if (s) |
230 | 7.46M | memmove(as->s + as->length, p, s); |
231 | 7.51M | as->length += s; |
232 | 7.51M | as->s[as->length] = 0; |
233 | 7.51M | return (as); |
234 | 7.51M | } |
235 | | |
236 | | static struct archive_wstring * |
237 | | archive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s) |
238 | 28.3k | { |
239 | 28.3k | if (archive_wstring_ensure(as, as->length + s + 1) == NULL) |
240 | 0 | return (NULL); |
241 | 28.3k | if (s) |
242 | 3.78k | wmemmove(as->s + as->length, p, s); |
243 | 28.3k | as->length += s; |
244 | 28.3k | as->s[as->length] = 0; |
245 | 28.3k | return (as); |
246 | 28.3k | } |
247 | | |
/*
 * Public entry point for appending `s` raw bytes from `p` to `as`.
 * Forwards to archive_string_append(); returns `as`, or NULL on
 * allocation failure.
 */
struct archive_string *
archive_array_append(struct archive_string *as, const char *p, size_t s)
{
	struct archive_string *result;

	result = archive_string_append(as, p, s);
	return (result);
}
253 | | |
254 | | void |
255 | | archive_string_concat(struct archive_string *dest, struct archive_string *src) |
256 | 55.8k | { |
257 | 55.8k | if (archive_string_append(dest, src->s, src->length) == NULL) |
258 | 0 | __archive_errx(1, "Out of memory"); |
259 | 55.8k | } |
260 | | |
261 | | void |
262 | | archive_wstring_concat(struct archive_wstring *dest, |
263 | | struct archive_wstring *src) |
264 | 27.9k | { |
265 | 27.9k | if (archive_wstring_append(dest, src->s, src->length) == NULL) |
266 | 0 | __archive_errx(1, "Out of memory"); |
267 | 27.9k | } |
268 | | |
269 | | void |
270 | | archive_string_free(struct archive_string *as) |
271 | 6.75M | { |
272 | 6.75M | as->length = 0; |
273 | 6.75M | as->buffer_length = 0; |
274 | 6.75M | free(as->s); |
275 | 6.75M | as->s = NULL; |
276 | 6.75M | } |
277 | | |
278 | | void |
279 | | archive_wstring_free(struct archive_wstring *as) |
280 | 331k | { |
281 | 331k | as->length = 0; |
282 | 331k | as->buffer_length = 0; |
283 | 331k | free(as->s); |
284 | 331k | as->s = NULL; |
285 | 331k | } |
286 | | |
/*
 * Make sure the wide string `as` can hold at least `s` wide characters
 * (the count includes any terminator the caller intends to store).
 *
 * The request is converted to bytes and forwarded to
 * archive_string_ensure(), so `buffer_length` of a wide string is kept
 * in bytes.
 *
 * Returns `as` on success.  On failure the string is wiped, errno is set
 * to ENOMEM and NULL is returned -- the same contract as
 * archive_string_ensure().
 */
struct archive_wstring *
archive_wstring_ensure(struct archive_wstring *as, size_t s)
{
	/*
	 * Be safe: s * sizeof(wchar_t) must not wrap around, otherwise a
	 * much smaller buffer than requested would be allocated and the
	 * caller would write past its end.
	 */
	if (s > (size_t)-1 / sizeof(wchar_t)) {
		/* On failure, wipe the string and return NULL. */
		archive_wstring_free(as);
		errno = ENOMEM;/* Make sure errno has ENOMEM. */
		return (NULL);
	}
	return (struct archive_wstring *)
		archive_string_ensure((struct archive_string *)as,
					s * sizeof(wchar_t));
}
294 | | |
295 | | /* Returns NULL on any allocation failure. */ |
296 | | struct archive_string * |
297 | | archive_string_ensure(struct archive_string *as, size_t s) |
298 | 9.18M | { |
299 | 9.18M | char *p; |
300 | 9.18M | size_t new_length; |
301 | | |
302 | | /* If buffer is already big enough, don't reallocate. */ |
303 | 9.18M | if (as->s && (s <= as->buffer_length)) |
304 | 8.97M | return (as); |
305 | | |
306 | | /* |
307 | | * Growing the buffer at least exponentially ensures that |
308 | | * append operations are always linear in the number of |
309 | | * characters appended. Using a smaller growth rate for |
310 | | * larger buffers reduces memory waste somewhat at the cost of |
311 | | * a larger constant factor. |
312 | | */ |
313 | 201k | if (as->buffer_length < 32) |
314 | | /* Start with a minimum 32-character buffer. */ |
315 | 199k | new_length = 32; |
316 | 2.46k | else if (as->buffer_length < 8192) |
317 | | /* Buffers under 8k are doubled for speed. */ |
318 | 2.46k | new_length = as->buffer_length + as->buffer_length; |
319 | 0 | else { |
320 | | /* Buffers 8k and over grow by at least 25% each time. */ |
321 | 0 | new_length = as->buffer_length + as->buffer_length / 4; |
322 | | /* Be safe: If size wraps, fail. */ |
323 | 0 | if (new_length < as->buffer_length) { |
324 | | /* On failure, wipe the string and return NULL. */ |
325 | 0 | archive_string_free(as); |
326 | 0 | errno = ENOMEM;/* Make sure errno has ENOMEM. */ |
327 | 0 | return (NULL); |
328 | 0 | } |
329 | 0 | } |
330 | | /* |
331 | | * The computation above is a lower limit to how much we'll |
332 | | * grow the buffer. In any case, we have to grow it enough to |
333 | | * hold the request. |
334 | | */ |
335 | 201k | if (new_length < s) |
336 | 94.5k | new_length = s; |
337 | | /* Now we can reallocate the buffer. */ |
338 | 201k | p = realloc(as->s, new_length); |
339 | 201k | if (p == NULL) { |
340 | | /* On failure, wipe the string and return NULL. */ |
341 | 0 | archive_string_free(as); |
342 | 0 | errno = ENOMEM;/* Make sure errno has ENOMEM. */ |
343 | 0 | return (NULL); |
344 | 0 | } |
345 | | |
346 | 201k | as->s = p; |
347 | 201k | as->buffer_length = new_length; |
348 | 201k | return (as); |
349 | 201k | } |
350 | | |
/*
 * Append at most `n` bytes of the string at `_p` to `as`, stopping early
 * at the first NUL byte.  Returns `as`; calls __archive_errx() when
 * memory cannot be allocated.
 *
 * TODO: The source is walked twice (once to measure, once to copy).
 * Find out whether that can be avoided, and measure whether doing so
 * helps at all: arguments are almost always short, so it may not.
 */
struct archive_string *
archive_strncat(struct archive_string *as, const void *_p, size_t n)
{
	const char *src = (const char *)_p;
	size_t used;

	/* Bounded strlen(): never looks at src[n] or anything beyond it. */
	for (used = 0; used < n && src[used] != '\0'; used++)
		;

	as = archive_string_append(as, src, used);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
377 | | |
/*
 * Append at most `n` wide characters of the string at `p` to `as`,
 * stopping early at the first wide NUL.  Returns `as`; calls
 * __archive_errx() when memory cannot be allocated.
 */
struct archive_wstring *
archive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n)
{
	size_t used;

	/* Bounded wcslen(): never looks at p[n] or anything beyond it. */
	for (used = 0; used < n && p[used] != 0; used++)
		;

	as = archive_wstring_append(as, p, used);
	if (as == NULL)
		__archive_errx(1, "Out of memory");
	return (as);
}
395 | | |
/*
 * Append the NUL-terminated string `p` to `as`.
 *
 * Implemented as archive_strncat() with a 16MB cap, on the assumption
 * that no caller ever passes a longer source string; anything beyond
 * the cap is not copied.
 *
 * TODO: Review all uses of strcat in the source
 * and try to replace them with strncat().
 */
struct archive_string *
archive_strcat(struct archive_string *as, const void *p)
{
	const size_t limit = 0x1000000;

	return (archive_strncat(as, p, limit));
}
407 | | |
/*
 * Append the NUL-terminated wide string `p` to `as`.
 *
 * Implemented as archive_wstrncat() with a cap of 0x1000000 wide
 * characters; anything beyond the cap is not copied.
 */
struct archive_wstring *
archive_wstrcat(struct archive_wstring *as, const wchar_t *p)
{
	const size_t limit = 0x1000000;

	return (archive_wstrncat(as, p, limit));
}
414 | | |
/*
 * Append the single byte `c` to `as`.  Returns `as`; calls
 * __archive_errx() when memory cannot be allocated.
 */
struct archive_string *
archive_strappend_char(struct archive_string *as, char c)
{
	struct archive_string *result;

	result = archive_string_append(as, &c, 1);
	if (result == NULL)
		__archive_errx(1, "Out of memory");
	return (result);
}
422 | | |
/*
 * Append the single wide character `c` to `as`.  Returns `as`; calls
 * __archive_errx() when memory cannot be allocated.
 */
struct archive_wstring *
archive_wstrappend_wchar(struct archive_wstring *as, wchar_t c)
{
	struct archive_wstring *result;

	result = archive_wstring_append(as, &c, 1);
	if (result == NULL)
		__archive_errx(1, "Out of memory");
	return (result);
}
430 | | |
/*
 * Pick the character-set name to hand to iconv.
 *
 * A non-empty `charset` is returned unchanged.  For NULL or "" the name
 * of the current locale's character set is looked up instead:
 *
 *  - FreeBSD's iconv treats "" as "the current locale's encoding", so
 *    nothing extra would be needed there.
 *  - iconv on Mac OS 10.6 does not seem to handle "" correctly, so the
 *    name has to be obtained explicitly through nl_langinfo().  Other
 *    platforms have not been checked.
 *  - GNU libiconv does not recognize some of the names that a platform's
 *    nl_langinfo(CODESET) returns, so locale_charset() is preferred over
 *    nl_langinfo(CODESET) when it is available.
 *
 * When neither lookup function is available, "" is returned.
 */
static const char *
default_iconv_charset(const char *charset) {
	if (charset == NULL || charset[0] == '\0') {
#if HAVE_LOCALE_CHARSET && !defined(__APPLE__)
		/* locale_charset() is broken on Mac OS */
		return locale_charset();
#elif HAVE_NL_LANGINFO
		return nl_langinfo(CODESET);
#else
		return "";
#endif
	}
	return charset;
}
457 | | |
458 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
459 | | |
/*
 * Convert MBS to WCS (Windows build).
 *
 * Appends the multibyte string `p` (`len` bytes) to `dest` as wide
 * characters.  No conversion object is supplied, which makes the helper
 * use the current code page.
 * Note: returns -1 if conversion fails.
 */
int
archive_wstring_append_from_mbs(struct archive_wstring *dest,
    const char *p, size_t len)
{
	struct archive_string_conv *no_conv = NULL;
	int status;

	status = archive_wstring_append_from_mbs_in_codepage(dest, p, len,
	    no_conv);
	return (status);
}
470 | | |
471 | | static int |
472 | | archive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest, |
473 | | const char *s, size_t length, struct archive_string_conv *sc) |
474 | | { |
475 | | int ret = 0; |
476 | | size_t count; |
477 | | UINT from_cp; |
478 | | |
479 | | if (sc != NULL) |
480 | | from_cp = sc->from_cp; |
481 | | else |
482 | | from_cp = get_current_codepage(); |
483 | | |
484 | | if (from_cp == CP_C_LOCALE) { |
485 | | /* |
486 | | * "C" locale special processing. |
487 | | */ |
488 | | wchar_t *ws; |
489 | | const unsigned char *mp; |
490 | | |
491 | | if (NULL == archive_wstring_ensure(dest, |
492 | | dest->length + length + 1)) |
493 | | return (-1); |
494 | | |
495 | | ws = dest->s + dest->length; |
496 | | mp = (const unsigned char *)s; |
497 | | count = 0; |
498 | | while (count < length && *mp) { |
499 | | *ws++ = (wchar_t)*mp++; |
500 | | count++; |
501 | | } |
502 | | } else if (sc != NULL && |
503 | | (sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) { |
504 | | /* |
505 | | * Normalize UTF-8 and UTF-16BE and convert it directly |
506 | | * to UTF-16 as wchar_t. |
507 | | */ |
508 | | struct archive_string u16; |
509 | | int saved_flag = sc->flag;/* save current flag. */ |
510 | | |
511 | | if (IS_BIG_ENDIAN) |
512 | | sc->flag |= SCONV_TO_UTF16BE; |
513 | | else |
514 | | sc->flag |= SCONV_TO_UTF16LE; |
515 | | |
516 | | if (sc->flag & SCONV_FROM_UTF16) { |
517 | | /* |
518 | | * UTF-16BE/LE NFD ===> UTF-16 NFC |
519 | | * UTF-16BE/LE NFC ===> UTF-16 NFD |
520 | | */ |
521 | | count = utf16nbytes(s, length); |
522 | | } else { |
523 | | /* |
524 | | * UTF-8 NFD ===> UTF-16 NFC |
525 | | * UTF-8 NFC ===> UTF-16 NFD |
526 | | */ |
527 | | count = mbsnbytes(s, length); |
528 | | } |
529 | | u16.s = (char *)dest->s; |
530 | | u16.length = dest->length << 1; |
531 | | u16.buffer_length = dest->buffer_length; |
532 | | if (sc->flag & SCONV_NORMALIZATION_C) |
533 | | ret = archive_string_normalize_C(&u16, s, count, sc); |
534 | | else |
535 | | ret = archive_string_normalize_D(&u16, s, count, sc); |
536 | | dest->s = (wchar_t *)u16.s; |
537 | | dest->length = u16.length >> 1; |
538 | | dest->buffer_length = u16.buffer_length; |
539 | | sc->flag = saved_flag;/* restore the saved flag. */ |
540 | | return (ret); |
541 | | } else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) { |
542 | | count = utf16nbytes(s, length); |
543 | | count >>= 1; /* to be WCS length */ |
544 | | /* Allocate memory for WCS. */ |
545 | | if (NULL == archive_wstring_ensure(dest, |
546 | | dest->length + count + 1)) |
547 | | return (-1); |
548 | | wmemcpy(dest->s + dest->length, (const wchar_t *)s, count); |
549 | | if ((sc->flag & SCONV_FROM_UTF16BE) && !IS_BIG_ENDIAN) { |
550 | | uint16_t *u16 = (uint16_t *)(dest->s + dest->length); |
551 | | size_t b; |
552 | | for (b = 0; b < count; b++) { |
553 | | uint16_t val = archive_le16dec(u16+b); |
554 | | archive_be16enc(u16+b, val); |
555 | | } |
556 | | } else if ((sc->flag & SCONV_FROM_UTF16LE) && IS_BIG_ENDIAN) { |
557 | | uint16_t *u16 = (uint16_t *)(dest->s + dest->length); |
558 | | size_t b; |
559 | | for (b = 0; b < count; b++) { |
560 | | uint16_t val = archive_be16dec(u16+b); |
561 | | archive_le16enc(u16+b, val); |
562 | | } |
563 | | } |
564 | | } else { |
565 | | DWORD mbflag; |
566 | | size_t buffsize; |
567 | | |
568 | | if (sc == NULL) |
569 | | mbflag = 0; |
570 | | else if (sc->flag & SCONV_FROM_CHARSET) { |
571 | | /* Do not trust the length which comes from |
572 | | * an archive file. */ |
573 | | length = mbsnbytes(s, length); |
574 | | mbflag = 0; |
575 | | } else |
576 | | mbflag = MB_PRECOMPOSED; |
577 | | |
578 | | /* FIXME(CMake#26903): Offer control over encoding conversion. |
579 | | For now, we instead tolerate invalid characters as |
580 | | libarchive 3.7.2 / CMake 3.30 and below did. */ |
581 | | #if 0 |
582 | | mbflag |= MB_ERR_INVALID_CHARS; |
583 | | #endif |
584 | | |
585 | | buffsize = dest->length + length + 1; |
586 | | do { |
587 | | int r; |
588 | | |
589 | | /* MultiByteToWideChar is limited to int. */ |
590 | | if (length > (size_t)INT_MAX || |
591 | | (dest->buffer_length >> 1) > (size_t)INT_MAX) |
592 | | return (-1); |
593 | | /* Allocate memory for WCS. */ |
594 | | if (NULL == archive_wstring_ensure(dest, buffsize)) |
595 | | return (-1); |
596 | | /* Convert MBS to WCS. */ |
597 | | r = MultiByteToWideChar(from_cp, |
598 | | mbflag, s, (int)length, dest->s + dest->length, |
599 | | (int)(dest->buffer_length >> 1) -1); |
600 | | if (r == 0 && |
601 | | GetLastError() == ERROR_INSUFFICIENT_BUFFER) { |
602 | | /* Expand the WCS buffer. */ |
603 | | buffsize = dest->buffer_length << 1; |
604 | | continue; |
605 | | } |
606 | | if (r == 0 && length != 0) |
607 | | ret = -1; |
608 | | count = (size_t)r; |
609 | | break; |
610 | | } while (1); |
611 | | } |
612 | | dest->length += count; |
613 | | dest->s[dest->length] = L'\0'; |
614 | | return (ret); |
615 | | } |
616 | | |
617 | | #else |
618 | | |
619 | | /* |
620 | | * Convert MBS to WCS. |
621 | | * Note: returns -1 if conversion fails. |
622 | | */ |
623 | | int |
624 | | archive_wstring_append_from_mbs(struct archive_wstring *dest, |
625 | | const char *p, size_t len) |
626 | 4.08k | { |
627 | 4.08k | size_t r; |
628 | 4.08k | int ret_val = 0; |
629 | | /* |
630 | | * No single byte will be more than one wide character, |
631 | | * so this length estimate will always be big enough. |
632 | | */ |
633 | | // size_t wcs_length = len; |
634 | 4.08k | size_t mbs_length = len; |
635 | 4.08k | const char *mbs = p; |
636 | 4.08k | wchar_t *wcs; |
637 | 4.08k | #if HAVE_MBRTOWC |
638 | 4.08k | mbstate_t shift_state; |
639 | | |
640 | 4.08k | memset(&shift_state, 0, sizeof(shift_state)); |
641 | 4.08k | #endif |
642 | | /* |
643 | | * As we decided to have wcs_length == mbs_length == len |
644 | | * we can use len here instead of wcs_length |
645 | | */ |
646 | 4.08k | if (NULL == archive_wstring_ensure(dest, dest->length + len + 1)) |
647 | 0 | return (-1); |
648 | 4.08k | wcs = dest->s + dest->length; |
649 | | /* |
650 | | * We cannot use mbsrtowcs/mbstowcs here because those may convert |
651 | | * extra MBS when strlen(p) > len and one wide character consists of |
652 | | * multi bytes. |
653 | | */ |
654 | 20.1k | while (*mbs && mbs_length > 0) { |
655 | | /* |
656 | | * The buffer we allocated is always big enough. |
657 | | * Keep this code path in a comment if we decide to choose |
658 | | * smaller wcs_length in the future |
659 | | */ |
660 | | /* |
661 | | if (wcs_length == 0) { |
662 | | dest->length = wcs - dest->s; |
663 | | dest->s[dest->length] = L'\0'; |
664 | | wcs_length = mbs_length; |
665 | | if (NULL == archive_wstring_ensure(dest, |
666 | | dest->length + wcs_length + 1)) |
667 | | return (-1); |
668 | | wcs = dest->s + dest->length; |
669 | | } |
670 | | */ |
671 | 17.1k | #if HAVE_MBRTOWC |
672 | 17.1k | r = mbrtowc(wcs, mbs, mbs_length, &shift_state); |
673 | | #else |
674 | | r = mbtowc(wcs, mbs, mbs_length); |
675 | | #endif |
676 | 17.1k | if (r == (size_t)-1 || r == (size_t)-2) { |
677 | 996 | ret_val = -1; |
678 | 996 | break; |
679 | 996 | } |
680 | 16.1k | if (r == 0 || r > mbs_length) |
681 | 0 | break; |
682 | 16.1k | wcs++; |
683 | | // wcs_length--; |
684 | 16.1k | mbs += r; |
685 | 16.1k | mbs_length -= r; |
686 | 16.1k | } |
687 | 4.08k | dest->length = wcs - dest->s; |
688 | 4.08k | dest->s[dest->length] = L'\0'; |
689 | 4.08k | return (ret_val); |
690 | 4.08k | } |
691 | | |
692 | | #endif |
693 | | |
694 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
695 | | |
/*
 * WCS ==> MBS (Windows build).
 * Note: returns -1 if conversion fails.
 *
 * Win32 builds go through WideCharToMultiByte from the Windows API.
 * (Perhaps Cygwin should as well: WideCharToMultiByte knows far more
 * about local character encodings than the wcrtomb() wrapper does.)
 *
 * No conversion object is supplied, which makes the helper use the
 * current code page.
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	struct archive_string_conv *no_conv = NULL;
	int status;

	status = archive_string_append_from_wcs_in_codepage(as, w, len,
	    no_conv);
	return (status);
}
711 | | |
712 | | static int |
713 | | archive_string_append_from_wcs_in_codepage(struct archive_string *as, |
714 | | const wchar_t *ws, size_t len, struct archive_string_conv *sc) |
715 | | { |
716 | | BOOL defchar_used, *dp; |
717 | | int ret = 0; |
718 | | UINT to_cp; |
719 | | size_t count, wslen = len; |
720 | | |
721 | | if (sc != NULL) |
722 | | to_cp = sc->to_cp; |
723 | | else |
724 | | to_cp = get_current_codepage(); |
725 | | |
726 | | if (to_cp == CP_C_LOCALE) { |
727 | | /* |
728 | | * "C" locale special processing. |
729 | | */ |
730 | | const wchar_t *wp = ws; |
731 | | char *p; |
732 | | |
733 | | if (NULL == archive_string_ensure(as, |
734 | | as->length + wslen +1)) |
735 | | return (-1); |
736 | | p = as->s + as->length; |
737 | | count = 0; |
738 | | defchar_used = 0; |
739 | | while (count < wslen && *wp) { |
740 | | if (*wp > 255) { |
741 | | *p++ = '?'; |
742 | | wp++; |
743 | | defchar_used = 1; |
744 | | } else |
745 | | *p++ = (char)*wp++; |
746 | | count++; |
747 | | } |
748 | | } else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) { |
749 | | uint16_t *u16; |
750 | | |
751 | | if (NULL == |
752 | | archive_string_ensure(as, as->length + len * 2 + 2)) |
753 | | return (-1); |
754 | | u16 = (uint16_t *)(as->s + as->length); |
755 | | count = 0; |
756 | | defchar_used = 0; |
757 | | if (sc->flag & SCONV_TO_UTF16BE) { |
758 | | while (count < len && *ws) { |
759 | | archive_be16enc(u16+count, *ws); |
760 | | ws++; |
761 | | count++; |
762 | | } |
763 | | } else { |
764 | | while (count < len && *ws) { |
765 | | archive_le16enc(u16+count, *ws); |
766 | | ws++; |
767 | | count++; |
768 | | } |
769 | | } |
770 | | count <<= 1; /* to be byte size */ |
771 | | } else { |
772 | | /* Make sure the MBS buffer has plenty to set. */ |
773 | | if (NULL == |
774 | | archive_string_ensure(as, as->length + len * 2 + 1)) |
775 | | return (-1); |
776 | | do { |
777 | | int r; |
778 | | |
779 | | defchar_used = 0; |
780 | | if (to_cp == CP_UTF8 || sc == NULL) |
781 | | dp = NULL; |
782 | | else |
783 | | dp = &defchar_used; |
784 | | /* WideCharToMultiByte is limited to int. */ |
785 | | if (as->buffer_length - as->length - 1 > (size_t)INT_MAX || |
786 | | wslen > (size_t)INT_MAX) |
787 | | return (-1); |
788 | | r = WideCharToMultiByte(to_cp, 0, ws, (int)wslen, |
789 | | as->s + as->length, |
790 | | (int)(as->buffer_length - as->length - 1), NULL, dp); |
791 | | if (r == 0 && |
792 | | GetLastError() == ERROR_INSUFFICIENT_BUFFER) { |
793 | | /* Expand the MBS buffer and retry. */ |
794 | | if (NULL == archive_string_ensure(as, |
795 | | as->buffer_length + len)) |
796 | | return (-1); |
797 | | continue; |
798 | | } |
799 | | if (r == 0) |
800 | | ret = -1; |
801 | | count = (size_t)r; |
802 | | break; |
803 | | } while (1); |
804 | | } |
805 | | as->length += count; |
806 | | as->s[as->length] = '\0'; |
807 | | return (defchar_used?-1:ret); |
808 | | } |
809 | | |
810 | | #elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB) |
811 | | |
812 | | /* |
813 | | * Translates a wide character string into current locale character set |
814 | | * and appends to the archive_string. Note: returns -1 if conversion |
815 | | * fails. |
816 | | */ |
817 | | int |
818 | | archive_string_append_from_wcs(struct archive_string *as, |
819 | | const wchar_t *w, size_t len) |
820 | 230 | { |
821 | | /* We cannot use the standard wcstombs() here because it |
822 | | * cannot tell us how big the output buffer should be. So |
823 | | * I've built a loop around wcrtomb() or wctomb() that |
824 | | * converts a character at a time and resizes the string as |
825 | | * needed. We prefer wcrtomb() when it's available because |
826 | | * it's thread-safe. */ |
827 | 230 | int n, ret_val = 0; |
828 | 230 | char *p; |
829 | 230 | char *end; |
830 | 230 | #if HAVE_WCRTOMB |
831 | 230 | mbstate_t shift_state; |
832 | | |
833 | 230 | memset(&shift_state, 0, sizeof(shift_state)); |
834 | | #else |
835 | | /* Clear the shift state before starting. */ |
836 | | wctomb(NULL, L'\0'); |
837 | | #endif |
838 | | /* |
839 | | * Allocate buffer for MBS. |
840 | | * We need this allocation here since it is possible that |
841 | | * as->s is still NULL. |
842 | | */ |
843 | 230 | if (archive_string_ensure(as, as->length + len + 1) == NULL) |
844 | 0 | return (-1); |
845 | | |
846 | 230 | p = as->s + as->length; |
847 | 230 | end = as->s + as->buffer_length - MB_CUR_MAX -1; |
848 | 5.03k | while (*w != L'\0' && len > 0) { |
849 | 4.80k | if (p >= end) { |
850 | 26 | as->length = p - as->s; |
851 | 26 | as->s[as->length] = '\0'; |
852 | | /* Re-allocate buffer for MBS. */ |
853 | 26 | if (archive_string_ensure(as, |
854 | 26 | as->length + max(len * 2, |
855 | 26 | (size_t)MB_CUR_MAX) + 1) == NULL) |
856 | 0 | return (-1); |
857 | 26 | p = as->s + as->length; |
858 | 26 | end = as->s + as->buffer_length - MB_CUR_MAX -1; |
859 | 26 | } |
860 | 4.80k | #if HAVE_WCRTOMB |
861 | 4.80k | n = wcrtomb(p, *w++, &shift_state); |
862 | | #else |
863 | | n = wctomb(p, *w++); |
864 | | #endif |
865 | 4.80k | if (n == -1) { |
866 | 0 | if (errno == EILSEQ) { |
867 | | /* Skip an illegal wide char. */ |
868 | 0 | *p++ = '?'; |
869 | 0 | ret_val = -1; |
870 | 0 | } else { |
871 | 0 | ret_val = -1; |
872 | 0 | break; |
873 | 0 | } |
874 | 0 | } else |
875 | 4.80k | p += n; |
876 | 4.80k | len--; |
877 | 4.80k | } |
878 | 230 | as->length = p - as->s; |
879 | 230 | as->s[as->length] = '\0'; |
880 | 230 | return (ret_val); |
881 | 230 | } |
882 | | |
883 | | #else /* HAVE_WCTOMB || HAVE_WCRTOMB */ |
884 | | |
/*
 * Variant for platforms that provide neither wcrtomb() nor wctomb():
 * no conversion is performed and every call fails with errno set to
 * ENOSYS.
 *
 * TODO: Test if __STDC_ISO_10646__ is defined; a built-in UTF-8
 * conversion could serve as the fallback in that case.
 */
int
archive_string_append_from_wcs(struct archive_string *as,
    const wchar_t *w, size_t len)
{
	/* None of the arguments are used by this variant. */
	(void)len;
	(void)w;
	(void)as;

	errno = ENOSYS;
	return (-1);
}
901 | | |
902 | | #endif /* HAVE_WCTOMB || HAVE_WCRTOMB */ |
903 | | |
904 | | /* |
905 | | * Find a string conversion object by a pair of 'from' charset name |
906 | | * and 'to' charset name from an archive object. |
907 | | * Return NULL if not found. |
908 | | */ |
909 | | static struct archive_string_conv * |
910 | | find_sconv_object(struct archive *a, const char *fc, const char *tc) |
911 | 2.84k | { |
912 | 2.84k | struct archive_string_conv *sc; |
913 | | |
914 | 2.84k | if (a == NULL) |
915 | 0 | return (NULL); |
916 | | |
917 | 2.88k | for (sc = a->sconv; sc != NULL; sc = sc->next) { |
918 | 162 | if (strcmp(sc->from_charset, fc) == 0 && |
919 | 118 | strcmp(sc->to_charset, tc) == 0) |
920 | 118 | break; |
921 | 162 | } |
922 | 2.84k | return (sc); |
923 | 2.84k | } |
924 | | |
925 | | /* |
926 | | * Register a string object to an archive object. |
927 | | */ |
928 | | static void |
929 | | add_sconv_object(struct archive *a, struct archive_string_conv *sc) |
930 | 2.72k | { |
931 | 2.72k | struct archive_string_conv **psc; |
932 | | |
933 | | /* Add a new sconv to sconv list. */ |
934 | 2.72k | psc = &(a->sconv); |
935 | 2.75k | while (*psc != NULL) |
936 | 36 | psc = &((*psc)->next); |
937 | 2.72k | *psc = sc; |
938 | 2.72k | } |
939 | | |
940 | | static void |
941 | | add_converter(struct archive_string_conv *sc, int (*converter) |
942 | | (struct archive_string *, const void *, size_t, |
943 | | struct archive_string_conv *)) |
944 | 5.33k | { |
945 | 5.33k | if (sc == NULL || sc->nconverter >= 2) |
946 | 0 | __archive_errx(1, "Programming error"); |
947 | 5.33k | sc->converter[sc->nconverter++] = converter; |
948 | 5.33k | } |
949 | | |
950 | | static void |
951 | | setup_converter(struct archive_string_conv *sc) |
952 | 2.72k | { |
953 | | |
954 | | /* Reset. */ |
955 | 2.72k | sc->nconverter = 0; |
956 | | |
957 | | /* |
958 | | * Perform special sequence for the incorrect UTF-8 filenames |
959 | | * made by libarchive2.x. |
960 | | */ |
961 | 2.72k | if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) { |
962 | 0 | add_converter(sc, strncat_from_utf8_libarchive2); |
963 | 0 | return; |
964 | 0 | } |
965 | | |
966 | | /* |
967 | | * Convert a string to UTF-16BE/LE. |
968 | | */ |
969 | 2.72k | if (sc->flag & SCONV_TO_UTF16) { |
970 | | /* |
971 | | * If the current locale is UTF-8, we can translate |
972 | | * a UTF-8 string into a UTF-16BE string. |
973 | | */ |
974 | 0 | if (sc->flag & SCONV_FROM_UTF8) { |
975 | 0 | add_converter(sc, archive_string_append_unicode); |
976 | 0 | return; |
977 | 0 | } |
978 | | |
979 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
980 | | if (sc->flag & SCONV_WIN_CP) { |
981 | | if (sc->flag & SCONV_TO_UTF16BE) |
982 | | add_converter(sc, win_strncat_to_utf16be); |
983 | | else |
984 | | add_converter(sc, win_strncat_to_utf16le); |
985 | | return; |
986 | | } |
987 | | #endif |
988 | | |
989 | | #if defined(HAVE_ICONV) |
990 | | if (sc->cd != (iconv_t)-1) { |
991 | | add_converter(sc, iconv_strncat_in_locale); |
992 | | return; |
993 | | } |
994 | | #endif |
995 | | |
996 | 0 | if (sc->flag & SCONV_BEST_EFFORT) { |
997 | 0 | if (sc->flag & SCONV_TO_UTF16BE) |
998 | 0 | add_converter(sc, |
999 | 0 | best_effort_strncat_to_utf16be); |
1000 | 0 | else |
1001 | 0 | add_converter(sc, |
1002 | 0 | best_effort_strncat_to_utf16le); |
1003 | 0 | } else |
1004 | | /* Make sure we have no converter. */ |
1005 | 0 | sc->nconverter = 0; |
1006 | 0 | return; |
1007 | 0 | } |
1008 | | |
1009 | | /* |
1010 | | * Convert a string from UTF-16BE/LE. |
1011 | | */ |
1012 | 2.72k | if (sc->flag & SCONV_FROM_UTF16) { |
1013 | | /* |
1014 | | * At least we should normalize a UTF-16BE string. |
1015 | | */ |
1016 | 78 | if (sc->flag & SCONV_NORMALIZATION_D) |
1017 | 0 | add_converter(sc,archive_string_normalize_D); |
1018 | 78 | else if (sc->flag & SCONV_NORMALIZATION_C) |
1019 | 78 | add_converter(sc, archive_string_normalize_C); |
1020 | | |
1021 | 78 | if (sc->flag & SCONV_TO_UTF8) { |
1022 | | /* |
1023 | | * If the current locale is UTF-8, we can translate |
1024 | | * a UTF-16BE/LE string into a UTF-8 string directly. |
1025 | | */ |
1026 | 0 | if (!(sc->flag & |
1027 | 0 | (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) |
1028 | 0 | add_converter(sc, |
1029 | 0 | archive_string_append_unicode); |
1030 | 0 | return; |
1031 | 0 | } |
1032 | | |
1033 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
1034 | | if (sc->flag & SCONV_WIN_CP) { |
1035 | | if (sc->flag & SCONV_FROM_UTF16BE) |
1036 | | add_converter(sc, win_strncat_from_utf16be); |
1037 | | else |
1038 | | add_converter(sc, win_strncat_from_utf16le); |
1039 | | return; |
1040 | | } |
1041 | | #endif |
1042 | | |
1043 | | #if defined(HAVE_ICONV) |
1044 | | if (sc->cd != (iconv_t)-1) { |
1045 | | add_converter(sc, iconv_strncat_in_locale); |
1046 | | return; |
1047 | | } |
1048 | | #endif |
1049 | | |
1050 | 78 | if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) |
1051 | 78 | == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) |
1052 | 0 | add_converter(sc, best_effort_strncat_from_utf16be); |
1053 | 78 | else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) |
1054 | 78 | == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) |
1055 | 78 | add_converter(sc, best_effort_strncat_from_utf16le); |
1056 | 0 | else |
1057 | | /* Make sure we have no converter. */ |
1058 | 0 | sc->nconverter = 0; |
1059 | 78 | return; |
1060 | 78 | } |
1061 | | |
1062 | 2.64k | if (sc->flag & SCONV_FROM_UTF8) { |
1063 | | /* |
1064 | | * At least we should normalize a UTF-8 string. |
1065 | | */ |
1066 | 2.53k | if (sc->flag & SCONV_NORMALIZATION_D) |
1067 | 0 | add_converter(sc,archive_string_normalize_D); |
1068 | 2.53k | else if (sc->flag & SCONV_NORMALIZATION_C) |
1069 | 2.53k | add_converter(sc, archive_string_normalize_C); |
1070 | | |
1071 | | /* |
1072 | | * Copy UTF-8 string with a check of CESU-8. |
1073 | | * Apparently, iconv does not check surrogate pairs in UTF-8 |
1074 | | * when both from-charset and to-charset are UTF-8, and then |
1075 | | * we use our UTF-8 copy code. |
1076 | | */ |
1077 | 2.53k | if (sc->flag & SCONV_TO_UTF8) { |
1078 | | /* |
1079 | | * If the current locale is UTF-8, we can translate |
1080 | | * a UTF-16BE string into a UTF-8 string directly. |
1081 | | */ |
1082 | 0 | if (!(sc->flag & |
1083 | 0 | (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) |
1084 | 0 | add_converter(sc, strncat_from_utf8_to_utf8); |
1085 | 0 | return; |
1086 | 0 | } |
1087 | 2.53k | } |
1088 | | |
1089 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
1090 | | /* |
1091 | | * On Windows we can use Windows API for a string conversion. |
1092 | | */ |
1093 | | if (sc->flag & SCONV_WIN_CP) { |
1094 | | add_converter(sc, strncat_in_codepage); |
1095 | | return; |
1096 | | } |
1097 | | #endif |
1098 | | |
1099 | | #if HAVE_ICONV |
1100 | | if (sc->cd != (iconv_t)-1) { |
1101 | | add_converter(sc, iconv_strncat_in_locale); |
1102 | | /* |
1103 | | * iconv generally does not support UTF-8-MAC and so |
1104 | | * we have to the output of iconv from NFC to NFD if |
1105 | | * need. |
1106 | | */ |
1107 | | if ((sc->flag & SCONV_FROM_CHARSET) && |
1108 | | (sc->flag & SCONV_TO_UTF8)) { |
1109 | | if (sc->flag & SCONV_NORMALIZATION_D) |
1110 | | add_converter(sc, archive_string_normalize_D); |
1111 | | } |
1112 | | return; |
1113 | | } |
1114 | | #endif |
1115 | | |
1116 | | /* |
1117 | | * Try conversion in the best effort or no conversion. |
1118 | | */ |
1119 | 2.64k | if ((sc->flag & SCONV_BEST_EFFORT) || sc->same) |
1120 | 2.64k | add_converter(sc, best_effort_strncat_in_locale); |
1121 | 0 | else |
1122 | | /* Make sure we have no converter. */ |
1123 | 0 | sc->nconverter = 0; |
1124 | 2.64k | } |
1125 | | |
/*
 * Return the canonical spelling of a charset name.  Only the names
 * UTF-8, UTF-16BE, UTF-16LE and CP932 are recognized (compared without
 * regard to ASCII case; the UTF names may be written without the
 * hyphen).  NULL, the empty string, names longer than 15 characters
 * and unrecognized names are returned unchanged.
 */
static const char *
canonical_charset_name(const char *charset)
{
	static const struct {
		const char *alias;
		const char *canonical;
	} known[] = {
		{ "UTF-8",	"UTF-8" },
		{ "UTF8",	"UTF-8" },
		{ "UTF-16BE",	"UTF-16BE" },
		{ "UTF16BE",	"UTF-16BE" },
		{ "UTF-16LE",	"UTF-16LE" },
		{ "UTF16LE",	"UTF-16LE" },
		{ "CP932",	"CP932" },
	};
	char upper[16];
	size_t i, n;

	if (charset == NULL || charset[0] == '\0')
		return (charset);
	n = strlen(charset);
	if (n > 15)
		return (charset);

	/* Make an upper-case copy; only ASCII a-z is folded. */
	for (i = 0; i < n; i++) {
		char c = charset[i];
		if (c >= 'a' && c <= 'z')
			c -= 'a' - 'A';
		upper[i] = c;
	}
	upper[n] = '\0';

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(upper, known[i].alias) == 0)
			return (known[i].canonical);
	}
	return (charset);
}
1165 | | |
1166 | | /* |
1167 | | * Create a string conversion object. |
1168 | | */ |
1169 | | static struct archive_string_conv * |
1170 | | create_sconv_object(const char *fc, const char *tc, |
1171 | | unsigned current_codepage, int flag) |
1172 | 2.72k | { |
1173 | 2.72k | struct archive_string_conv *sc; |
1174 | | |
1175 | 2.72k | sc = calloc(1, sizeof(*sc)); |
1176 | 2.72k | if (sc == NULL) |
1177 | 0 | return (NULL); |
1178 | 2.72k | sc->next = NULL; |
1179 | 2.72k | sc->from_charset = strdup(fc); |
1180 | 2.72k | if (sc->from_charset == NULL) { |
1181 | 0 | free(sc); |
1182 | 0 | return (NULL); |
1183 | 0 | } |
1184 | 2.72k | sc->to_charset = strdup(tc); |
1185 | 2.72k | if (sc->to_charset == NULL) { |
1186 | 0 | free(sc->from_charset); |
1187 | 0 | free(sc); |
1188 | 0 | return (NULL); |
1189 | 0 | } |
1190 | 2.72k | archive_string_init(&sc->utftmp); |
1191 | | |
1192 | 2.72k | if (flag & SCONV_TO_CHARSET) { |
1193 | | /* |
1194 | | * Convert characters from the current locale charset to |
1195 | | * a specified charset. |
1196 | | */ |
1197 | 0 | sc->from_cp = current_codepage; |
1198 | 0 | sc->to_cp = make_codepage_from_charset(tc); |
1199 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
1200 | | if (IsValidCodePage(sc->to_cp)) |
1201 | | flag |= SCONV_WIN_CP; |
1202 | | #endif |
1203 | 2.72k | } else if (flag & SCONV_FROM_CHARSET) { |
1204 | | /* |
1205 | | * Convert characters from a specified charset to |
1206 | | * the current locale charset. |
1207 | | */ |
1208 | 2.72k | sc->to_cp = current_codepage; |
1209 | 2.72k | sc->from_cp = make_codepage_from_charset(fc); |
1210 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
1211 | | if (IsValidCodePage(sc->from_cp)) |
1212 | | flag |= SCONV_WIN_CP; |
1213 | | #endif |
1214 | 2.72k | } |
1215 | | |
1216 | | /* |
1217 | | * Check if "from charset" and "to charset" are the same. |
1218 | | */ |
1219 | 2.72k | if (strcmp(fc, tc) == 0 || |
1220 | 2.72k | (sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp)) |
1221 | 0 | sc->same = 1; |
1222 | 2.72k | else |
1223 | 2.72k | sc->same = 0; |
1224 | | |
1225 | | /* |
1226 | | * Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE. |
1227 | | */ |
1228 | 2.72k | if (strcmp(tc, "UTF-8") == 0) |
1229 | 0 | flag |= SCONV_TO_UTF8; |
1230 | 2.72k | else if (strcmp(tc, "UTF-16BE") == 0) |
1231 | 0 | flag |= SCONV_TO_UTF16BE; |
1232 | 2.72k | else if (strcmp(tc, "UTF-16LE") == 0) |
1233 | 0 | flag |= SCONV_TO_UTF16LE; |
1234 | 2.72k | if (strcmp(fc, "UTF-8") == 0) |
1235 | 2.53k | flag |= SCONV_FROM_UTF8; |
1236 | 190 | else if (strcmp(fc, "UTF-16BE") == 0) |
1237 | 0 | flag |= SCONV_FROM_UTF16BE; |
1238 | 190 | else if (strcmp(fc, "UTF-16LE") == 0) |
1239 | 78 | flag |= SCONV_FROM_UTF16LE; |
1240 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
1241 | | if (sc->to_cp == CP_UTF8) |
1242 | | flag |= SCONV_TO_UTF8; |
1243 | | else if (sc->to_cp == CP_UTF16BE) |
1244 | | flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP; |
1245 | | else if (sc->to_cp == CP_UTF16LE) |
1246 | | flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP; |
1247 | | if (sc->from_cp == CP_UTF8) |
1248 | | flag |= SCONV_FROM_UTF8; |
1249 | | else if (sc->from_cp == CP_UTF16BE) |
1250 | | flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP; |
1251 | | else if (sc->from_cp == CP_UTF16LE) |
1252 | | flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP; |
1253 | | #endif |
1254 | | |
1255 | | /* |
1256 | | * Set a flag for Unicode NFD. Usually iconv cannot correctly |
1257 | | * handle it. So we have to translate NFD characters to NFC ones |
1258 | | * ourselves before iconv handles. Another reason is to prevent |
1259 | | * that the same sight of two filenames, one is NFC and other |
1260 | | * is NFD, would be in its directory. |
1261 | | * On Mac OS X, although its filesystem layer automatically |
1262 | | * convert filenames to NFD, it would be useful for filename |
1263 | | * comparing to find out the same filenames that we normalize |
1264 | | * that to be NFD ourselves. |
1265 | | */ |
1266 | 2.72k | if ((flag & SCONV_FROM_CHARSET) && |
1267 | 2.72k | (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) { |
1268 | | #if defined(__APPLE__) |
1269 | | if (flag & SCONV_TO_UTF8) |
1270 | | flag |= SCONV_NORMALIZATION_D; |
1271 | | else |
1272 | | #endif |
1273 | 2.61k | flag |= SCONV_NORMALIZATION_C; |
1274 | 2.61k | } |
1275 | | #if defined(__APPLE__) |
1276 | | /* |
1277 | | * In case writing an archive file, make sure that a filename |
1278 | | * going to be passed to iconv is a Unicode NFC string since |
1279 | | * a filename in HFS Plus filesystem is a Unicode NFD one and |
1280 | | * iconv cannot handle it with "UTF-8" charset. It is simpler |
1281 | | * than a use of "UTF-8-MAC" charset. |
1282 | | */ |
1283 | | if ((flag & SCONV_TO_CHARSET) && |
1284 | | (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && |
1285 | | !(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) |
1286 | | flag |= SCONV_NORMALIZATION_C; |
1287 | | /* |
1288 | | * In case reading an archive file. make sure that a filename |
1289 | | * will be passed to users is a Unicode NFD string in order to |
1290 | | * correctly compare the filename with other one which comes |
1291 | | * from HFS Plus filesystem. |
1292 | | */ |
1293 | | if ((flag & SCONV_FROM_CHARSET) && |
1294 | | !(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && |
1295 | | (flag & SCONV_TO_UTF8)) |
1296 | | flag |= SCONV_NORMALIZATION_D; |
1297 | | #endif |
1298 | | |
1299 | | #if defined(HAVE_ICONV) |
1300 | | sc->cd_w = (iconv_t)-1; |
1301 | | /* |
1302 | | * Create an iconv object. |
1303 | | */ |
1304 | | if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) && |
1305 | | (flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) || |
1306 | | (flag & SCONV_WIN_CP)) { |
1307 | | /* This case we won't use iconv. */ |
1308 | | sc->cd = (iconv_t)-1; |
1309 | | } else { |
1310 | | sc->cd = iconv_open(tc, fc); |
1311 | | if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) { |
1312 | | /* |
1313 | | * Unfortunately, all of iconv implements do support |
1314 | | * "CP932" character-set, so we should use "SJIS" |
1315 | | * instead if iconv_open failed. |
1316 | | */ |
1317 | | if (strcmp(tc, "CP932") == 0) |
1318 | | sc->cd = iconv_open("SJIS", fc); |
1319 | | else if (strcmp(fc, "CP932") == 0) |
1320 | | sc->cd = iconv_open(tc, "SJIS"); |
1321 | | } |
1322 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
1323 | | /* |
1324 | | * archive_mstring on Windows directly convert multi-bytes |
1325 | | * into archive_wstring in order not to depend on locale |
1326 | | * so that you can do a I18N programming. This will be |
1327 | | * used only in archive_mstring_copy_mbs_len_l so far. |
1328 | | */ |
1329 | | if (flag & SCONV_FROM_CHARSET) { |
1330 | | sc->cd_w = iconv_open("UTF-8", fc); |
1331 | | if (sc->cd_w == (iconv_t)-1 && |
1332 | | (sc->flag & SCONV_BEST_EFFORT)) { |
1333 | | if (strcmp(fc, "CP932") == 0) |
1334 | | sc->cd_w = iconv_open("UTF-8", "SJIS"); |
1335 | | } |
1336 | | } |
1337 | | #endif /* _WIN32 && !__CYGWIN__ */ |
1338 | | } |
1339 | | #endif /* HAVE_ICONV */ |
1340 | | |
1341 | 2.72k | sc->flag = flag; |
1342 | | |
1343 | | /* |
1344 | | * Set up converters. |
1345 | | */ |
1346 | 2.72k | setup_converter(sc); |
1347 | | |
1348 | 2.72k | return (sc); |
1349 | 2.72k | } |
1350 | | |
1351 | | /* |
1352 | | * Free a string conversion object. |
1353 | | */ |
1354 | | static void |
1355 | | free_sconv_object(struct archive_string_conv *sc) |
1356 | 2.72k | { |
1357 | 2.72k | free(sc->from_charset); |
1358 | 2.72k | free(sc->to_charset); |
1359 | 2.72k | archive_string_free(&sc->utftmp); |
1360 | | #if HAVE_ICONV |
1361 | | if (sc->cd != (iconv_t)-1) |
1362 | | iconv_close(sc->cd); |
1363 | | if (sc->cd_w != (iconv_t)-1) |
1364 | | iconv_close(sc->cd_w); |
1365 | | #endif |
1366 | 2.72k | free(sc); |
1367 | 2.72k | } |
1368 | | |
1369 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
1370 | | # if defined(WINAPI_FAMILY_PARTITION) && !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) |
1371 | | # define GetOEMCP() CP_OEMCP |
1372 | | # endif |
1373 | | |
/*
 * Parse a string consisting only of decimal digits into an unsigned
 * value.  An empty string yields 0.  Returns (unsigned)-1 when the
 * string contains any non-digit character or when the value does not
 * fit in an unsigned; previously an over-long digit string silently
 * wrapped around and could be mistaken for a valid code page.
 */
static unsigned
my_atoi(const char *p)
{
	unsigned cp = 0;

	for (; *p != '\0'; p++) {
		unsigned digit;

		if (*p < '0' || *p > '9')
			return ((unsigned)-1);
		digit = (unsigned)(*p - '0');
		/* Refuse the digit if cp * 10 + digit would overflow. */
		if (cp > (UINT_MAX - digit) / 10)
			return ((unsigned)-1);
		cp = cp * 10 + digit;
	}
	return (cp);
}
1389 | | |
1390 | | /* |
1391 | | * Translate Charset name (as used by iconv) into CodePage (as used by Windows) |
1392 | | * Return -1 if failed. |
1393 | | * |
1394 | | * Note: This translation code may be insufficient. |
1395 | | */ |
1396 | | static struct charset { |
1397 | | const char *name; |
1398 | | unsigned cp; |
1399 | | } charsets[] = { |
1400 | | /* MUST BE SORTED! */ |
1401 | | {"ASCII", 1252}, |
1402 | | {"ASMO-708", 708}, |
1403 | | {"BIG5", 950}, |
1404 | | {"CHINESE", 936}, |
1405 | | {"CP367", 1252}, |
1406 | | {"CP819", 1252}, |
1407 | | {"CP1025", 21025}, |
1408 | | {"DOS-720", 720}, |
1409 | | {"DOS-862", 862}, |
1410 | | {"EUC-CN", 51936}, |
1411 | | {"EUC-JP", 51932}, |
1412 | | {"EUC-KR", 949}, |
1413 | | {"EUCCN", 51936}, |
1414 | | {"EUCJP", 51932}, |
1415 | | {"EUCKR", 949}, |
1416 | | {"GB18030", 54936}, |
1417 | | {"GB2312", 936}, |
1418 | | {"HEBREW", 1255}, |
1419 | | {"HZ-GB-2312", 52936}, |
1420 | | {"IBM273", 20273}, |
1421 | | {"IBM277", 20277}, |
1422 | | {"IBM278", 20278}, |
1423 | | {"IBM280", 20280}, |
1424 | | {"IBM284", 20284}, |
1425 | | {"IBM285", 20285}, |
1426 | | {"IBM290", 20290}, |
1427 | | {"IBM297", 20297}, |
1428 | | {"IBM367", 1252}, |
1429 | | {"IBM420", 20420}, |
1430 | | {"IBM423", 20423}, |
1431 | | {"IBM424", 20424}, |
1432 | | {"IBM819", 1252}, |
1433 | | {"IBM871", 20871}, |
1434 | | {"IBM880", 20880}, |
1435 | | {"IBM905", 20905}, |
1436 | | {"IBM924", 20924}, |
1437 | | {"ISO-8859-1", 28591}, |
1438 | | {"ISO-8859-13", 28603}, |
1439 | | {"ISO-8859-15", 28605}, |
1440 | | {"ISO-8859-2", 28592}, |
1441 | | {"ISO-8859-3", 28593}, |
1442 | | {"ISO-8859-4", 28594}, |
1443 | | {"ISO-8859-5", 28595}, |
1444 | | {"ISO-8859-6", 28596}, |
1445 | | {"ISO-8859-7", 28597}, |
1446 | | {"ISO-8859-8", 28598}, |
1447 | | {"ISO-8859-9", 28599}, |
1448 | | {"ISO8859-1", 28591}, |
1449 | | {"ISO8859-13", 28603}, |
1450 | | {"ISO8859-15", 28605}, |
1451 | | {"ISO8859-2", 28592}, |
1452 | | {"ISO8859-3", 28593}, |
1453 | | {"ISO8859-4", 28594}, |
1454 | | {"ISO8859-5", 28595}, |
1455 | | {"ISO8859-6", 28596}, |
1456 | | {"ISO8859-7", 28597}, |
1457 | | {"ISO8859-8", 28598}, |
1458 | | {"ISO8859-9", 28599}, |
1459 | | {"JOHAB", 1361}, |
1460 | | {"KOI8-R", 20866}, |
1461 | | {"KOI8-U", 21866}, |
1462 | | {"KS_C_5601-1987", 949}, |
1463 | | {"LATIN1", 1252}, |
1464 | | {"LATIN2", 28592}, |
1465 | | {"MACINTOSH", 10000}, |
1466 | | {"SHIFT-JIS", 932}, |
1467 | | {"SHIFT_JIS", 932}, |
1468 | | {"SJIS", 932}, |
1469 | | {"US", 1252}, |
1470 | | {"US-ASCII", 1252}, |
1471 | | {"UTF-16", 1200}, |
1472 | | {"UTF-16BE", 1201}, |
1473 | | {"UTF-16LE", 1200}, |
1474 | | {"UTF-8", CP_UTF8}, |
1475 | | {"X-EUROPA", 29001}, |
1476 | | {"X-MAC-ARABIC", 10004}, |
1477 | | {"X-MAC-CE", 10029}, |
1478 | | {"X-MAC-CHINESEIMP", 10008}, |
1479 | | {"X-MAC-CHINESETRAD", 10002}, |
1480 | | {"X-MAC-CROATIAN", 10082}, |
1481 | | {"X-MAC-CYRILLIC", 10007}, |
1482 | | {"X-MAC-GREEK", 10006}, |
1483 | | {"X-MAC-HEBREW", 10005}, |
1484 | | {"X-MAC-ICELANDIC", 10079}, |
1485 | | {"X-MAC-JAPANESE", 10001}, |
1486 | | {"X-MAC-KOREAN", 10003}, |
1487 | | {"X-MAC-ROMANIAN", 10010}, |
1488 | | {"X-MAC-THAI", 10021}, |
1489 | | {"X-MAC-TURKISH", 10081}, |
1490 | | {"X-MAC-UKRAINIAN", 10017}, |
1491 | | }; |
1492 | | static unsigned |
1493 | | make_codepage_from_charset(const char *charset) |
1494 | | { |
1495 | | char cs[16]; |
1496 | | char *p; |
1497 | | unsigned cp; |
1498 | | int a, b; |
1499 | | |
1500 | | if (charset == NULL || strlen(charset) > 15) |
1501 | | return -1; |
1502 | | |
1503 | | /* Copy name to uppercase. */ |
1504 | | p = cs; |
1505 | | while (*charset) { |
1506 | | char c = *charset++; |
1507 | | if (c >= 'a' && c <= 'z') |
1508 | | c -= 'a' - 'A'; |
1509 | | *p++ = c; |
1510 | | } |
1511 | | *p++ = '\0'; |
1512 | | cp = -1; |
1513 | | |
1514 | | /* Look it up in the table first, so that we can easily |
1515 | | * override CP367, which we map to 1252 instead of 367. */ |
1516 | | a = 0; |
1517 | | b = sizeof(charsets)/sizeof(charsets[0]); |
1518 | | while (b > a) { |
1519 | | int c = (b + a) / 2; |
1520 | | int r = strcmp(charsets[c].name, cs); |
1521 | | if (r < 0) |
1522 | | a = c + 1; |
1523 | | else if (r > 0) |
1524 | | b = c; |
1525 | | else |
1526 | | return charsets[c].cp; |
1527 | | } |
1528 | | |
1529 | | /* If it's not in the table, try to parse it. */ |
1530 | | switch (*cs) { |
1531 | | case 'C': |
1532 | | if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') { |
1533 | | cp = my_atoi(cs + 2); |
1534 | | } else if (strcmp(cs, "CP_ACP") == 0) |
1535 | | cp = get_current_codepage(); |
1536 | | else if (strcmp(cs, "CP_OEMCP") == 0) |
1537 | | cp = get_current_oemcp(); |
1538 | | break; |
1539 | | case 'I': |
1540 | | if (cs[1] == 'B' && cs[2] == 'M' && |
1541 | | cs[3] >= '0' && cs[3] <= '9') { |
1542 | | cp = my_atoi(cs + 3); |
1543 | | } |
1544 | | break; |
1545 | | case 'W': |
1546 | | if (strncmp(cs, "WINDOWS-", 8) == 0) { |
1547 | | cp = my_atoi(cs + 8); |
1548 | | if (cp != 874 && (cp < 1250 || cp > 1258)) |
1549 | | cp = -1;/* This may invalid code. */ |
1550 | | } |
1551 | | break; |
1552 | | } |
1553 | | return (cp); |
1554 | | } |
1555 | | |
1556 | | /* |
1557 | | * Return ANSI Code Page of current locale set by setlocale(). |
1558 | | */ |
1559 | | static unsigned |
1560 | | get_current_codepage(void) |
1561 | | { |
1562 | | char *locale, *p; |
1563 | | unsigned cp; |
1564 | | |
1565 | | locale = setlocale(LC_CTYPE, NULL); |
1566 | | if (locale == NULL) |
1567 | | return (GetACP()); |
1568 | | if (locale[0] == 'C' && locale[1] == '\0') |
1569 | | return (CP_C_LOCALE); |
1570 | | p = strrchr(locale, '.'); |
1571 | | if (p == NULL) |
1572 | | return (GetACP()); |
1573 | | if ((strcmp(p+1, "utf8") == 0) || (strcmp(p+1, "UTF-8") == 0)) |
1574 | | return CP_UTF8; |
1575 | | cp = my_atoi(p+1); |
1576 | | if ((int)cp <= 0) |
1577 | | return (GetACP()); |
1578 | | return (cp); |
1579 | | } |
1580 | | |
/*
 * Translation table between a locale name (the part before the '.')
 * and its ANSI code page (acp) and OEM code page (ocp).  The table is
 * terminated by an entry whose acp is 0.  A few locale names appear
 * twice with identical values.
 */
static struct {
	unsigned acp;
	unsigned ocp;
	const char *locale;
} acp_ocp_map[] = {
	{  950,  950, "Chinese_Taiwan" },
	{  936,  936, "Chinese_People's Republic of China" },
	{  950,  950, "Chinese_Taiwan" },
	{ 1250,  852, "Czech_Czech Republic" },
	{ 1252,  850, "Danish_Denmark" },
	{ 1252,  850, "Dutch_Netherlands" },
	{ 1252,  850, "Dutch_Belgium" },
	{ 1252,  437, "English_United States" },
	{ 1252,  850, "English_Australia" },
	{ 1252,  850, "English_Canada" },
	{ 1252,  850, "English_New Zealand" },
	{ 1252,  850, "English_United Kingdom" },
	{ 1252,  437, "English_United States" },
	{ 1252,  850, "Finnish_Finland" },
	{ 1252,  850, "French_France" },
	{ 1252,  850, "French_Belgium" },
	{ 1252,  850, "French_Canada" },
	{ 1252,  850, "French_Switzerland" },
	{ 1252,  850, "German_Germany" },
	{ 1252,  850, "German_Austria" },
	{ 1252,  850, "German_Switzerland" },
	{ 1253,  737, "Greek_Greece" },
	{ 1250,  852, "Hungarian_Hungary" },
	{ 1252,  850, "Icelandic_Iceland" },
	{ 1252,  850, "Italian_Italy" },
	{ 1252,  850, "Italian_Switzerland" },
	{  932,  932, "Japanese_Japan" },
	{  949,  949, "Korean_Korea" },
	{ 1252,  850, "Norwegian (BokmOl)_Norway" },
	{ 1252,  850, "Norwegian (BokmOl)_Norway" },
	{ 1252,  850, "Norwegian-Nynorsk_Norway" },
	{ 1250,  852, "Polish_Poland" },
	{ 1252,  850, "Portuguese_Portugal" },
	{ 1252,  850, "Portuguese_Brazil" },
	{ 1251,  866, "Russian_Russia" },
	{ 1250,  852, "Slovak_Slovakia" },
	{ 1252,  850, "Spanish_Spain" },
	{ 1252,  850, "Spanish_Mexico" },
	{ 1252,  850, "Spanish_Spain" },
	{ 1252,  850, "Swedish_Sweden" },
	{ 1254,  857, "Turkish_Turkey" },
	{    0,    0, NULL }
};
1632 | | |
1633 | | /* |
1634 | | * Return OEM Code Page of current locale set by setlocale(). |
1635 | | */ |
1636 | | static unsigned |
1637 | | get_current_oemcp(void) |
1638 | | { |
1639 | | int i; |
1640 | | char *locale, *p; |
1641 | | size_t len; |
1642 | | |
1643 | | locale = setlocale(LC_CTYPE, NULL); |
1644 | | if (locale == NULL) |
1645 | | return (GetOEMCP()); |
1646 | | if (locale[0] == 'C' && locale[1] == '\0') |
1647 | | return (CP_C_LOCALE); |
1648 | | |
1649 | | p = strrchr(locale, '.'); |
1650 | | if (p == NULL) |
1651 | | return (GetOEMCP()); |
1652 | | len = p - locale; |
1653 | | for (i = 0; acp_ocp_map[i].acp; i++) { |
1654 | | if (strncmp(acp_ocp_map[i].locale, locale, len) == 0) |
1655 | | return (acp_ocp_map[i].ocp); |
1656 | | } |
1657 | | return (GetOEMCP()); |
1658 | | } |
1659 | | #else |
1660 | | |
/*
 * POSIX platforms do not use code pages, so the code page helpers in
 * this branch all report "unknown".
 */

/* Return (unsigned)-1: the current code page is unknown here. */
static unsigned
get_current_codepage(void)
{
	return ((unsigned)-1);
}
/* Return (unsigned)-1 for every charset name: no code pages here. */
static unsigned
make_codepage_from_charset(const char *charset)
{
	(void)charset; /* UNUSED */

	return ((unsigned)-1);
}
/* Return (unsigned)-1: the current OEM code page is unknown here. */
static unsigned
get_current_oemcp(void)
{
	return ((unsigned)-1);
}
1681 | | |
1682 | | #endif /* defined(_WIN32) && !defined(__CYGWIN__) */ |
1683 | | |
1684 | | /* |
1685 | | * Return a string conversion object. |
1686 | | */ |
1687 | | static struct archive_string_conv * |
1688 | | get_sconv_object(struct archive *a, const char *fc, const char *tc, int flag) |
1689 | 2.84k | { |
1690 | 2.84k | struct archive_string_conv *sc; |
1691 | 2.84k | unsigned current_codepage; |
1692 | | |
1693 | | /* Check if we have made the sconv object. */ |
1694 | 2.84k | sc = find_sconv_object(a, fc, tc); |
1695 | 2.84k | if (sc != NULL) |
1696 | 118 | return (sc); |
1697 | | |
1698 | 2.72k | if (a == NULL) |
1699 | 0 | current_codepage = get_current_codepage(); |
1700 | 2.72k | else |
1701 | 2.72k | current_codepage = a->current_codepage; |
1702 | | |
1703 | 2.72k | sc = create_sconv_object(canonical_charset_name(fc), |
1704 | 2.72k | canonical_charset_name(tc), current_codepage, flag); |
1705 | 2.72k | if (sc == NULL) { |
1706 | 0 | if (a != NULL) |
1707 | 0 | archive_set_error(a, ENOMEM, |
1708 | 0 | "Could not allocate memory for " |
1709 | 0 | "a string conversion object"); |
1710 | 0 | return (NULL); |
1711 | 0 | } |
1712 | | |
1713 | | /* |
1714 | | * If there is no converter for current string conversion object, |
1715 | | * we cannot handle this conversion. |
1716 | | */ |
1717 | 2.72k | if (sc->nconverter == 0) { |
1718 | 0 | if (a != NULL) { |
1719 | | #if HAVE_ICONV |
1720 | | archive_set_error(a, ARCHIVE_ERRNO_MISC, |
1721 | | "iconv_open failed : Cannot handle ``%s''", |
1722 | | (flag & SCONV_TO_CHARSET)?tc:fc); |
1723 | | #else |
1724 | 0 | archive_set_error(a, ARCHIVE_ERRNO_MISC, |
1725 | 0 | "A character-set conversion not fully supported " |
1726 | 0 | "on this platform"); |
1727 | 0 | #endif |
1728 | 0 | } |
1729 | | /* Failed; free a sconv object. */ |
1730 | 0 | free_sconv_object(sc); |
1731 | 0 | return (NULL); |
1732 | 0 | } |
1733 | | |
1734 | | /* |
1735 | | * Success! |
1736 | | */ |
1737 | 2.72k | if (a != NULL) |
1738 | 2.72k | add_sconv_object(a, sc); |
1739 | 2.72k | return (sc); |
1740 | 2.72k | } |
1741 | | |
1742 | | static const char * |
1743 | | get_current_charset(struct archive *a) |
1744 | 2.84k | { |
1745 | 2.84k | const char *cur_charset; |
1746 | | |
1747 | 2.84k | if (a == NULL) |
1748 | 0 | cur_charset = default_iconv_charset(""); |
1749 | 2.84k | else { |
1750 | 2.84k | cur_charset = default_iconv_charset(a->current_code); |
1751 | 2.84k | if (a->current_code == NULL) { |
1752 | 2.69k | a->current_code = strdup(cur_charset); |
1753 | 2.69k | a->current_codepage = get_current_codepage(); |
1754 | 2.69k | a->current_oemcp = get_current_oemcp(); |
1755 | 2.69k | } |
1756 | 2.84k | } |
1757 | 2.84k | return (cur_charset); |
1758 | 2.84k | } |
1759 | | |
1760 | | /* |
1761 | | * Make and Return a string conversion object. |
1762 | | * Return NULL if the platform does not support the specified conversion |
1763 | | * and best_effort is 0. |
1764 | | * If best_effort is set, A string conversion object must be returned |
1765 | | * unless memory allocation for the object fails, but the conversion |
1766 | | * might fail when non-ASCII code is found. |
1767 | | */ |
1768 | | struct archive_string_conv * |
1769 | | archive_string_conversion_to_charset(struct archive *a, const char *charset, |
1770 | | int best_effort) |
1771 | 0 | { |
1772 | 0 | int flag = SCONV_TO_CHARSET; |
1773 | |
|
1774 | 0 | if (best_effort) |
1775 | 0 | flag |= SCONV_BEST_EFFORT; |
1776 | 0 | return (get_sconv_object(a, get_current_charset(a), charset, flag)); |
1777 | 0 | } |
1778 | | |
1779 | | struct archive_string_conv * |
1780 | | archive_string_conversion_from_charset(struct archive *a, const char *charset, |
1781 | | int best_effort) |
1782 | 2.84k | { |
1783 | 2.84k | int flag = SCONV_FROM_CHARSET; |
1784 | | |
1785 | 2.84k | if (best_effort) |
1786 | 2.84k | flag |= SCONV_BEST_EFFORT; |
1787 | 2.84k | return (get_sconv_object(a, charset, get_current_charset(a), flag)); |
1788 | 2.84k | } |
1789 | | |
/*
 * archive_string_default_conversion_for_read() and
 * archive_string_default_conversion_for_write() are provided for the
 * Windows platform because other archiver applications use CP_OEMCP with
 * MultiByteToWideChar() and WideCharToMultiByte() for the filenames
 * in tar or zip files, whereas mbstowcs/wcstombs (CRT) usually use CP_ACP
 * unless you call setlocale(LC_ALL, ".OCP") (which selects CP_OEMCP).
 * So we should make a string conversion between CP_ACP and CP_OEMCP
 * for compatibility.
 */
1799 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
1800 | | struct archive_string_conv * |
1801 | | archive_string_default_conversion_for_read(struct archive *a) |
1802 | | { |
1803 | | const char *cur_charset = get_current_charset(a); |
1804 | | char oemcp[16]; |
1805 | | |
1806 | | /* NOTE: a check of cur_charset is unneeded but we need |
1807 | | * that get_current_charset() has been surely called at |
1808 | | * this time whatever C compiler optimized. */ |
1809 | | if (cur_charset != NULL && |
1810 | | (a->current_codepage == CP_C_LOCALE || |
1811 | | a->current_codepage == a->current_oemcp)) |
1812 | | return (NULL);/* no conversion. */ |
1813 | | |
1814 | | _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp); |
1815 | | /* Make sure a null termination must be set. */ |
1816 | | oemcp[sizeof(oemcp)-1] = '\0'; |
1817 | | return (get_sconv_object(a, oemcp, cur_charset, |
1818 | | SCONV_FROM_CHARSET)); |
1819 | | } |
1820 | | |
1821 | | struct archive_string_conv * |
1822 | | archive_string_default_conversion_for_write(struct archive *a) |
1823 | | { |
1824 | | const char *cur_charset = get_current_charset(a); |
1825 | | char oemcp[16]; |
1826 | | |
1827 | | /* NOTE: a check of cur_charset is unneeded but we need |
1828 | | * that get_current_charset() has been surely called at |
1829 | | * this time whatever C compiler optimized. */ |
1830 | | if (cur_charset != NULL && |
1831 | | (a->current_codepage == CP_C_LOCALE || |
1832 | | a->current_codepage == a->current_oemcp)) |
1833 | | return (NULL);/* no conversion. */ |
1834 | | |
1835 | | _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp); |
1836 | | /* Make sure a null termination must be set. */ |
1837 | | oemcp[sizeof(oemcp)-1] = '\0'; |
1838 | | return (get_sconv_object(a, cur_charset, oemcp, |
1839 | | SCONV_TO_CHARSET)); |
1840 | | } |
1841 | | #else |
/* Non-Windows build: there is no default conversion for reading. */
struct archive_string_conv *
archive_string_default_conversion_for_read(struct archive *a)
{
	(void)a; /* UNUSED */

	return (NULL);
}
1848 | | |
/* Non-Windows build: there is no default conversion for writing. */
struct archive_string_conv *
archive_string_default_conversion_for_write(struct archive *a)
{
	(void)a; /* UNUSED */

	return (NULL);
}
1855 | | #endif |
1856 | | |
1857 | | /* |
1858 | | * Dispose of all character conversion objects in the archive object. |
1859 | | */ |
1860 | | void |
1861 | | archive_string_conversion_free(struct archive *a) |
1862 | 58.1k | { |
1863 | 58.1k | struct archive_string_conv *sc; |
1864 | 58.1k | struct archive_string_conv *sc_next; |
1865 | | |
1866 | 60.8k | for (sc = a->sconv; sc != NULL; sc = sc_next) { |
1867 | 2.72k | sc_next = sc->next; |
1868 | 2.72k | free_sconv_object(sc); |
1869 | 2.72k | } |
1870 | 58.1k | a->sconv = NULL; |
1871 | 58.1k | free(a->current_code); |
1872 | 58.1k | a->current_code = NULL; |
1873 | 58.1k | } |
1874 | | |
1875 | | /* |
1876 | | * Return a conversion charset name. |
1877 | | */ |
1878 | | const char * |
1879 | | archive_string_conversion_charset_name(struct archive_string_conv *sc) |
1880 | 850 | { |
1881 | 850 | if (sc->flag & SCONV_TO_CHARSET) |
1882 | 0 | return (sc->to_charset); |
1883 | 850 | else |
1884 | 850 | return (sc->from_charset); |
1885 | 850 | } |
1886 | | |
/*
 * Change the behavior of a string conversion.
 *
 * `opt' selects one of the SCONV_SET_OPT_* options.  When an option
 * actually changes sc->flag, setup_converter() is called again so the
 * converters match the new flags.  Unknown options are ignored.
 */
void
archive_string_conversion_set_opt(struct archive_string_conv *sc, int opt)
{
	switch (opt) {
	/*
	 * A filename in UTF-8 was made with libarchive 2.x in a wrong
	 * assumption that wchar_t was Unicode.
	 * This option enables simulating the assumption in order to read
	 * that filename correctly.
	 */
	case SCONV_SET_OPT_UTF8_LIBARCHIVE2X:
#if (defined(_WIN32) && !defined(__CYGWIN__)) \
	 || defined(__STDC_ISO_10646__) || defined(__APPLE__)
		/*
		 * Nothing to do for it since wchar_t on these platforms
		 * is really Unicode.
		 */
		(void)sc; /* UNUSED */
#else
		/* Only reconfigure when the flag is not already set. */
		if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) {
			sc->flag |= SCONV_UTF8_LIBARCHIVE_2;
			/* Set up string converters. */
			setup_converter(sc);
		}
#endif
		break;
	case SCONV_SET_OPT_NORMALIZATION_C:
		/* The two normalization flags are mutually exclusive. */
		if ((sc->flag & SCONV_NORMALIZATION_C) == 0) {
			sc->flag |= SCONV_NORMALIZATION_C;
			sc->flag &= ~SCONV_NORMALIZATION_D;
			/* Set up string converters. */
			setup_converter(sc);
		}
		break;
	case SCONV_SET_OPT_NORMALIZATION_D:
#if defined(HAVE_ICONV)
		/*
		 * If iconv will take the string, do not change the
		 * setting of the normalization.
		 */
		if (!(sc->flag & SCONV_WIN_CP) &&
		     (sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) &&
		    !(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8)))
			break;
#endif
		/* The two normalization flags are mutually exclusive. */
		if ((sc->flag & SCONV_NORMALIZATION_D) == 0) {
			sc->flag |= SCONV_NORMALIZATION_D;
			sc->flag &= ~SCONV_NORMALIZATION_C;
			/* Set up string converters. */
			setup_converter(sc);
		}
		break;
	default:
		/* Unknown option: silently ignored. */
		break;
	}
}
1946 | | |
1947 | | /* |
1948 | | * |
1949 | | * Copy one archive_string to another in locale conversion. |
1950 | | * |
1951 | | * archive_strncat_l(); |
1952 | | * archive_strncpy_l(); |
1953 | | * |
1954 | | */ |
1955 | | |
/*
 * Bounded strlen(): count the bytes before the first NUL in _p, looking
 * at no more than the first n bytes.  A NULL pointer yields 0.
 */
static size_t
mbsnbytes(const void *_p, size_t n)
{
	const char *bytes = (const char *)_p;
	size_t len;

	if (bytes == NULL)
		return (0);

	for (len = 0; len < n && bytes[len] != '\0'; len++)
		continue;
	return (len);
}
1975 | | |
/*
 * Bounded length of a UTF-16 string, in bytes: count whole 2-byte units
 * before the first all-zero unit, looking at no more than n/2 units.
 * A NULL pointer yields 0.
 */
static size_t
utf16nbytes(const void *_p, size_t n)
{
	const char *bytes = (const char *)_p;
	const size_t max_units = n >> 1;
	size_t units;

	if (bytes == NULL)
		return (0);

	for (units = 0; units < max_units; units++) {
		const char *unit = bytes + (units << 1);

		if (unit[0] == 0 && unit[1] == 0)
			break;
	}
	return (units << 1);
}
1996 | | |
1997 | | int |
1998 | | archive_strncpy_l(struct archive_string *as, const void *_p, size_t n, |
1999 | | struct archive_string_conv *sc) |
2000 | 5.45k | { |
2001 | 5.45k | as->length = 0; |
2002 | 5.45k | return (archive_strncat_l(as, _p, n, sc)); |
2003 | 5.45k | } |
2004 | | |
2005 | | int |
2006 | | archive_strncat_l(struct archive_string *as, const void *_p, size_t n, |
2007 | | struct archive_string_conv *sc) |
2008 | 5.45k | { |
2009 | 5.45k | const void *s; |
2010 | 5.45k | size_t length = 0; |
2011 | 5.45k | int i, r = 0, r2; |
2012 | | |
2013 | 5.45k | if (_p != NULL && n > 0) { |
2014 | 5.14k | if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) |
2015 | 0 | length = utf16nbytes(_p, n); |
2016 | 5.14k | else |
2017 | 5.14k | length = mbsnbytes(_p, n); |
2018 | 5.14k | } |
2019 | | |
2020 | | /* We must allocate memory even if there is no data for conversion |
2021 | | * or copy. This simulates archive_string_append behavior. */ |
2022 | 5.45k | if (length == 0) { |
2023 | 338 | size_t tn = 1; |
2024 | 338 | if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) |
2025 | 0 | tn = 2; |
2026 | 338 | if (archive_string_ensure(as, as->length + tn) == NULL) |
2027 | 0 | return (-1); |
2028 | 338 | as->s[as->length] = 0; |
2029 | 338 | if (tn == 2) |
2030 | 0 | as->s[as->length+1] = 0; |
2031 | 338 | return (0); |
2032 | 338 | } |
2033 | | |
2034 | | /* |
2035 | | * If sc is NULL, we just make a copy. |
2036 | | */ |
2037 | 5.11k | if (sc == NULL) { |
2038 | 2.72k | if (archive_string_append(as, _p, length) == NULL) |
2039 | 0 | return (-1);/* No memory */ |
2040 | 2.72k | return (0); |
2041 | 2.72k | } |
2042 | | |
2043 | 2.39k | s = _p; |
2044 | 2.39k | i = 0; |
2045 | 2.39k | if (sc->nconverter > 1) { |
2046 | 2.39k | sc->utftmp.length = 0; |
2047 | 2.39k | r2 = sc->converter[0](&(sc->utftmp), s, length, sc); |
2048 | 2.39k | if (r2 != 0 && errno == ENOMEM) |
2049 | 0 | return (r2); |
2050 | 2.39k | if (r > r2) |
2051 | 822 | r = r2; |
2052 | 2.39k | s = sc->utftmp.s; |
2053 | 2.39k | length = sc->utftmp.length; |
2054 | 2.39k | ++i; |
2055 | 2.39k | } |
2056 | 2.39k | r2 = sc->converter[i](as, s, length, sc); |
2057 | 2.39k | if (r > r2) |
2058 | 34 | r = r2; |
2059 | 2.39k | return (r); |
2060 | 2.39k | } |
2061 | | |
2062 | | struct archive_string * |
2063 | | archive_string_dirname(struct archive_string *as) |
2064 | 0 | { |
2065 | | /* strip trailing separators */ |
2066 | 0 | while (as->length > 1 && as->s[as->length - 1] == '/') |
2067 | 0 | as->length--; |
2068 | | /* strip final component */ |
2069 | 0 | while (as->length > 0 && as->s[as->length - 1] != '/') |
2070 | 0 | as->length--; |
2071 | | /* empty path -> cwd */ |
2072 | 0 | if (as->length == 0) |
2073 | 0 | return (archive_strcat(as, ".")); |
2074 | | /* strip separator(s) */ |
2075 | 0 | while (as->length > 1 && as->s[as->length - 1] == '/') |
2076 | 0 | as->length--; |
2077 | | /* terminate */ |
2078 | 0 | as->s[as->length] = '\0'; |
2079 | 0 | return (as); |
2080 | 0 | } |
2081 | | |
2082 | | #if HAVE_ICONV |
2083 | | |
2084 | | /* |
2085 | | * Return -1 if conversion fails. |
2086 | | */ |
2087 | | static int |
2088 | | iconv_strncat_in_locale(struct archive_string *as, const void *_p, |
2089 | | size_t length, struct archive_string_conv *sc) |
2090 | | { |
2091 | | ICONV_CONST char *itp; |
2092 | | size_t remaining; |
2093 | | iconv_t cd; |
2094 | | char *outp; |
2095 | | size_t avail, bs; |
2096 | | int return_value = 0; /* success */ |
2097 | | size_t to_size, from_size; |
2098 | | |
2099 | | if (sc->flag & SCONV_TO_UTF16) |
2100 | | to_size = 2; |
2101 | | else |
2102 | | to_size = 1; |
2103 | | if (sc->flag & SCONV_FROM_UTF16) |
2104 | | from_size = 2; |
2105 | | else |
2106 | | from_size = 1; |
2107 | | |
2108 | | if (archive_string_ensure(as, as->length + length*2+to_size) == NULL) |
2109 | | return (-1); |
2110 | | |
2111 | | cd = sc->cd; |
2112 | | itp = (char *)(uintptr_t)_p; |
2113 | | remaining = length; |
2114 | | outp = as->s + as->length; |
2115 | | avail = as->buffer_length - as->length - to_size; |
2116 | | while (remaining >= from_size) { |
2117 | | size_t result = iconv(cd, &itp, &remaining, &outp, &avail); |
2118 | | |
2119 | | if (result != (size_t)-1) |
2120 | | break; /* Conversion completed. */ |
2121 | | |
2122 | | if (errno == EILSEQ || errno == EINVAL) { |
2123 | | /* |
2124 | | * If an output charset is UTF-8 or UTF-16BE/LE, |
2125 | | * unknown character should be U+FFFD |
2126 | | * (replacement character). |
2127 | | */ |
2128 | | if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) { |
2129 | | size_t rbytes; |
2130 | | if (sc->flag & SCONV_TO_UTF8) |
2131 | | rbytes = sizeof(utf8_replacement_char); |
2132 | | else |
2133 | | rbytes = 2; |
2134 | | |
2135 | | if (avail < rbytes) { |
2136 | | as->length = outp - as->s; |
2137 | | bs = as->buffer_length + |
2138 | | (remaining * to_size) + rbytes; |
2139 | | if (NULL == |
2140 | | archive_string_ensure(as, bs)) |
2141 | | return (-1); |
2142 | | outp = as->s + as->length; |
2143 | | avail = as->buffer_length |
2144 | | - as->length - to_size; |
2145 | | } |
2146 | | if (sc->flag & SCONV_TO_UTF8) |
2147 | | memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char)); |
2148 | | else if (sc->flag & SCONV_TO_UTF16BE) |
2149 | | archive_be16enc(outp, UNICODE_R_CHAR); |
2150 | | else |
2151 | | archive_le16enc(outp, UNICODE_R_CHAR); |
2152 | | outp += rbytes; |
2153 | | avail -= rbytes; |
2154 | | } else { |
2155 | | /* Skip the illegal input bytes. */ |
2156 | | *outp++ = '?'; |
2157 | | avail--; |
2158 | | } |
2159 | | itp += from_size; |
2160 | | remaining -= from_size; |
2161 | | return_value = -1; /* failure */ |
2162 | | } else { |
2163 | | /* E2BIG no output buffer, |
2164 | | * Increase an output buffer. */ |
2165 | | as->length = outp - as->s; |
2166 | | bs = as->buffer_length + remaining * 2; |
2167 | | if (NULL == archive_string_ensure(as, bs)) |
2168 | | return (-1); |
2169 | | outp = as->s + as->length; |
2170 | | avail = as->buffer_length - as->length - to_size; |
2171 | | } |
2172 | | } |
2173 | | as->length = outp - as->s; |
2174 | | as->s[as->length] = 0; |
2175 | | if (to_size == 2) |
2176 | | as->s[as->length+1] = 0; |
2177 | | return (return_value); |
2178 | | } |
2179 | | |
2180 | | #endif /* HAVE_ICONV */ |
2181 | | |
2182 | | |
2183 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
2184 | | |
2185 | | /* |
2186 | | * Translate a string from a some CodePage to an another CodePage by |
2187 | | * Windows APIs, and copy the result. Return -1 if conversion fails. |
2188 | | */ |
2189 | | static int |
2190 | | strncat_in_codepage(struct archive_string *as, |
2191 | | const void *_p, size_t length, struct archive_string_conv *sc) |
2192 | | { |
2193 | | const char *s = (const char *)_p; |
2194 | | struct archive_wstring aws; |
2195 | | size_t l; |
2196 | | int r, saved_flag; |
2197 | | |
2198 | | archive_string_init(&aws); |
2199 | | saved_flag = sc->flag; |
2200 | | sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C); |
2201 | | r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc); |
2202 | | sc->flag = saved_flag; |
2203 | | if (r != 0) { |
2204 | | archive_wstring_free(&aws); |
2205 | | if (errno != ENOMEM) |
2206 | | archive_string_append(as, s, length); |
2207 | | return (-1); |
2208 | | } |
2209 | | |
2210 | | l = as->length; |
2211 | | r = archive_string_append_from_wcs_in_codepage( |
2212 | | as, aws.s, aws.length, sc); |
2213 | | if (r != 0 && errno != ENOMEM && l == as->length) |
2214 | | archive_string_append(as, s, length); |
2215 | | archive_wstring_free(&aws); |
2216 | | return (r); |
2217 | | } |
2218 | | |
2219 | | /* |
2220 | | * Test whether MBS ==> WCS is okay. |
2221 | | */ |
2222 | | static int |
2223 | | invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) |
2224 | | { |
2225 | | const char *p = (const char *)_p; |
2226 | | unsigned codepage; |
2227 | | DWORD mbflag = MB_ERR_INVALID_CHARS; |
2228 | | |
2229 | | if (sc->flag & SCONV_FROM_CHARSET) |
2230 | | codepage = sc->to_cp; |
2231 | | else |
2232 | | codepage = sc->from_cp; |
2233 | | |
2234 | | if (codepage == CP_C_LOCALE) |
2235 | | return (0); |
2236 | | if (codepage != CP_UTF8) |
2237 | | mbflag |= MB_PRECOMPOSED; |
2238 | | |
2239 | | if (n > (size_t)INT_MAX) |
2240 | | return (-1); /* Invalid */ |
2241 | | if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0) |
2242 | | return (-1); /* Invalid */ |
2243 | | return (0); /* Okay */ |
2244 | | } |
2245 | | |
2246 | | #else |
2247 | | |
/*
 * Test whether MBS ==> WCS is okay.
 * Walks the first n bytes of _p one multibyte character at a time and
 * returns -1 as soon as one cannot be decoded; returns 0 otherwise.
 * Decoding stops early at a NUL character.
 */
static int
invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc)
{
	const char *cursor = (const char *)_p;
	size_t left = n;
#if HAVE_MBRTOWC
	mbstate_t state;

	memset(&state, 0, sizeof(state));
#else
	/* Clear the shift state before starting. */
	mbtowc(NULL, NULL, 0);
#endif
	(void)sc; /* UNUSED */

	while (left != 0) {
		wchar_t wc;
		size_t used;

#if HAVE_MBRTOWC
		used = mbrtowc(&wc, cursor, left, &state);
#else
		used = mbtowc(&wc, cursor, left);
#endif
		if (used == (size_t)-1 || used == (size_t)-2)
			return (-1);/* Invalid. */
		if (used == 0)
			break;
		cursor += used;
		left -= used;
	}
	return (0); /* All okay. */
}
2283 | | |
2284 | | #endif /* defined(_WIN32) && !defined(__CYGWIN__) */ |
2285 | | |
2286 | | /* |
2287 | | * Basically returns -1 because we cannot make a conversion of charset |
2288 | | * without iconv but in some cases this would return 0. |
2289 | | * Returns 0 if all copied characters are ASCII. |
2290 | | * Returns 0 if both from-locale and to-locale are the same and those |
2291 | | * can be WCS with no error. |
2292 | | */ |
2293 | | static int |
2294 | | best_effort_strncat_in_locale(struct archive_string *as, const void *_p, |
2295 | | size_t length, struct archive_string_conv *sc) |
2296 | 2.39k | { |
2297 | 2.39k | size_t remaining; |
2298 | 2.39k | const uint8_t *itp; |
2299 | 2.39k | int return_value = 0; /* success */ |
2300 | | |
2301 | | /* |
2302 | | * If both from-locale and to-locale is the same, this makes a copy. |
2303 | | * And then this checks all copied MBS can be WCS if so returns 0. |
2304 | | */ |
2305 | 2.39k | if (sc->same) { |
2306 | 0 | if (archive_string_append(as, _p, length) == NULL) |
2307 | 0 | return (-1);/* No memory */ |
2308 | 0 | return (invalid_mbs(_p, length, sc)); |
2309 | 0 | } |
2310 | | |
2311 | | /* |
2312 | | * If a character is ASCII, this just copies it. If not, this |
2313 | | * assigns '?' character instead but in UTF-8 locale this assigns |
2314 | | * byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD, |
2315 | | * a Replacement Character in Unicode. |
2316 | | */ |
2317 | | |
2318 | 2.39k | remaining = length; |
2319 | 2.39k | itp = (const uint8_t *)_p; |
2320 | 45.3k | while (*itp && remaining > 0) { |
2321 | 42.9k | if (*itp > 127) { |
2322 | | // Non-ASCII: Substitute with suitable replacement |
2323 | 28.4k | if (sc->flag & SCONV_TO_UTF8) { |
2324 | 0 | if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) { |
2325 | 0 | __archive_errx(1, "Out of memory"); |
2326 | 0 | } |
2327 | 28.4k | } else { |
2328 | 28.4k | archive_strappend_char(as, '?'); |
2329 | 28.4k | } |
2330 | 28.4k | return_value = -1; |
2331 | 28.4k | } else { |
2332 | 14.5k | archive_strappend_char(as, *itp); |
2333 | 14.5k | } |
2334 | 42.9k | ++itp; |
2335 | 42.9k | } |
2336 | 2.39k | return (return_value); |
2337 | 2.39k | } |
2338 | | |
2339 | | |
2340 | | /* |
2341 | | * Unicode conversion functions. |
2342 | | * - UTF-8 <===> UTF-8 in removing surrogate pairs. |
2343 | | * - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs. |
2344 | | * - UTF-8 made by libarchive 2.x ===> UTF-8. |
2345 | | * - UTF-16BE <===> UTF-8. |
2346 | | * |
2347 | | */ |
2348 | | |
/*
 * Utility to convert a single UTF-8 sequence.
 *
 * pwc receives the decoded code point, s points at the sequence and n is
 * the number of bytes available at s.
 *
 * Usually return used bytes, return used byte in negative value when
 * a unicode character is replaced with U+FFFD.
 * Returns 0 (leaving *pwc untouched) when n is 0 or the first byte is NUL.
 * See also http://unicode.org/review/pr-121.html Public Review Issue #121
 * Recommended Practice for Replacement Characters.
 */
static int
_utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	/*
	 * Expected sequence length indexed by the first byte; 0 marks a byte
	 * that cannot start a valid sequence.
	 */
	static const char utf8_count[256] = {
		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */
		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */
		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */
		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */
		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */
		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */
		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */
		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */
		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */
		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */
		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */
		 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */
		 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */
		 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */
		 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */
		 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */
	};
	int ch, i;
	int cnt;
	uint32_t wc;

	/* Sanity check. */
	if (n == 0)
		return (0);
	/*
	 * Decode 1-4 bytes depending on the value of the first byte.
	 */
	ch = (unsigned char)*s;
	if (ch == 0)
		return (0); /* Standard: return 0 for end-of-string. */
	cnt = utf8_count[ch];

	/* Invalid sequence or there are not plenty bytes. */
	if (n < (size_t)cnt) {
		/*
		 * Truncated input: report as consumed the first byte plus
		 * the continuation bytes (10xxxxxx) that follow it.
		 */
		cnt = (int)n;
		for (i = 1; i < cnt; i++) {
			if ((s[i] & 0xc0) != 0x80) {
				cnt = i;
				break;
			}
		}
		goto invalid_sequence;
	}

	/* Make a Unicode code point from a single UTF-8 sequence. */
	switch (cnt) {
	case 1:	/* 1 byte sequence. */
		*pwc = ch & 0x7f;
		return (cnt);
	case 2:	/* 2 bytes sequence. */
		/* On a bad continuation byte, consume only the bytes before it. */
		if ((s[1] & 0xc0) != 0x80) {
			cnt = 1;
			goto invalid_sequence;
		}
		*pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f);
		return (cnt);
	case 3:	/* 3 bytes sequence. */
		if ((s[1] & 0xc0) != 0x80) {
			cnt = 1;
			goto invalid_sequence;
		}
		if ((s[2] & 0xc0) != 0x80) {
			cnt = 2;
			goto invalid_sequence;
		}
		wc = ((ch & 0x0f) << 12)
		    | ((s[1] & 0x3f) << 6)
		    | (s[2] & 0x3f);
		if (wc < 0x800)
			goto invalid_sequence;/* Overlong sequence. */
		break;
	case 4:	/* 4 bytes sequence. */
		if ((s[1] & 0xc0) != 0x80) {
			cnt = 1;
			goto invalid_sequence;
		}
		if ((s[2] & 0xc0) != 0x80) {
			cnt = 2;
			goto invalid_sequence;
		}
		if ((s[3] & 0xc0) != 0x80) {
			cnt = 3;
			goto invalid_sequence;
		}
		wc = ((ch & 0x07) << 18)
		    | ((s[1] & 0x3f) << 12)
		    | ((s[2] & 0x3f) << 6)
		    | (s[3] & 0x3f);
		if (wc < 0x10000)
			goto invalid_sequence;/* Overlong sequence. */
		break;
	default: /* Others are all invalid sequence. */
		/*
		 * Pick the length this first byte would announce, so that
		 * the whole bogus sequence can be skipped as one unit.
		 */
		if (ch == 0xc0 || ch == 0xc1)
			cnt = 2;
		else if (ch >= 0xf5 && ch <= 0xf7)
			cnt = 4;
		else if (ch >= 0xf8 && ch <= 0xfb)
			cnt = 5;
		else if (ch == 0xfc || ch == 0xfd)
			cnt = 6;
		else
			cnt = 1;
		/* Never claim more bytes than are available ... */
		if (n < (size_t)cnt)
			cnt = (int)n;
		/* ... and stop at the first non-continuation byte. */
		for (i = 1; i < cnt; i++) {
			if ((s[i] & 0xc0) != 0x80) {
				cnt = i;
				break;
			}
		}
		goto invalid_sequence;
	}

	/* The code point larger than 0x10FFFF is not legal
	 * Unicode values. */
	if (wc > UNICODE_MAX)
		goto invalid_sequence;
	/* Correctly gets a Unicode, returns used bytes. */
	*pwc = wc;
	return (cnt);
invalid_sequence:
	*pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */
	return (cnt * -1);
}
2485 | | |
/*
 * Convert a single UTF-8 sequence like _utf8_to_unicode(), but report a
 * successfully decoded 3-byte sequence as invalid (-3) when its code
 * point is a surrogate.  Note that *pwc is left as decoded in that case.
 */
static int
utf8_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int used = _utf8_to_unicode(pwc, s, n);

	/* Any of Surrogate pair is not legal Unicode values. */
	if (used != 3)
		return (used);
	return (IS_SURROGATE_PAIR_LA(*pwc) ? -3 : used);
}
2497 | | |
/*
 * Assemble a code point from a UTF-16 surrogate pair: uc is the high
 * (leading) surrogate and uc2 the low (trailing) one.  The arguments are
 * not validated.
 */
static inline uint32_t
combine_surrogate_pair(uint32_t uc, uint32_t uc2)
{
	const uint32_t high_bits = (uc - 0xD800) << 10;
	const uint32_t low_bits = uc2 - 0xDC00;

	return (high_bits + low_bits + 0x10000);
}
2507 | | |
2508 | | /* |
2509 | | * Convert a single UTF-8/CESU-8 sequence to a Unicode code point in |
2510 | | * removing surrogate pairs. |
2511 | | * |
2512 | | * CESU-8: The Compatibility Encoding Scheme for UTF-16. |
2513 | | * |
2514 | | * Usually return used bytes, return used byte in negative value when |
2515 | | * a unicode character is replaced with U+FFFD. |
2516 | | */ |
2517 | | static int |
2518 | | cesu8_to_unicode(uint32_t *pwc, const char *s, size_t n) |
2519 | 28.5k | { |
2520 | 28.5k | uint32_t wc = 0; |
2521 | 28.5k | int cnt; |
2522 | | |
2523 | 28.5k | cnt = _utf8_to_unicode(&wc, s, n); |
2524 | 28.5k | if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) { |
2525 | 134 | uint32_t wc2 = 0; |
2526 | 134 | if (n - 3 < 3) { |
2527 | | /* Invalid byte sequence. */ |
2528 | 12 | goto invalid_sequence; |
2529 | 12 | } |
2530 | 122 | cnt = _utf8_to_unicode(&wc2, s+3, n-3); |
2531 | 122 | if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) { |
2532 | | /* Invalid byte sequence. */ |
2533 | 100 | goto invalid_sequence; |
2534 | 100 | } |
2535 | 22 | wc = combine_surrogate_pair(wc, wc2); |
2536 | 22 | cnt = 6; |
2537 | 28.3k | } else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) { |
2538 | | /* Invalid byte sequence. */ |
2539 | 18 | goto invalid_sequence; |
2540 | 18 | } |
2541 | 28.3k | *pwc = wc; |
2542 | 28.3k | return (cnt); |
2543 | 130 | invalid_sequence: |
2544 | 130 | *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ |
2545 | 130 | if (cnt > 0) |
2546 | 80 | cnt *= -1; |
2547 | 130 | return (cnt); |
2548 | 28.5k | } |
2549 | | |
2550 | | /* |
2551 | | * Convert a Unicode code point to a single UTF-8 sequence. |
2552 | | * |
2553 | | * NOTE:This function does not check if the Unicode is legal or not. |
2554 | | * Please you definitely check it before calling this. |
2555 | | */ |
2556 | | static size_t |
2557 | | unicode_to_utf8(char *p, size_t remaining, uint32_t uc) |
2558 | 10.1k | { |
2559 | 10.1k | char *_p = p; |
2560 | | |
2561 | | /* Invalid Unicode char maps to Replacement character */ |
2562 | 10.1k | if (uc > UNICODE_MAX) |
2563 | 0 | uc = UNICODE_R_CHAR; |
2564 | | /* Translate code point to UTF8 */ |
2565 | 10.1k | if (uc <= 0x7f) { |
2566 | 482 | if (remaining == 0) |
2567 | 16 | return (0); |
2568 | 466 | *p++ = (char)uc; |
2569 | 9.62k | } else if (uc <= 0x7ff) { |
2570 | 2.25k | if (remaining < 2) |
2571 | 104 | return (0); |
2572 | 2.14k | *p++ = 0xc0 | ((uc >> 6) & 0x1f); |
2573 | 2.14k | *p++ = 0x80 | (uc & 0x3f); |
2574 | 7.37k | } else if (uc <= 0xffff) { |
2575 | 7.25k | if (remaining < 3) |
2576 | 278 | return (0); |
2577 | 6.97k | *p++ = 0xe0 | ((uc >> 12) & 0x0f); |
2578 | 6.97k | *p++ = 0x80 | ((uc >> 6) & 0x3f); |
2579 | 6.97k | *p++ = 0x80 | (uc & 0x3f); |
2580 | 6.97k | } else { |
2581 | 122 | if (remaining < 4) |
2582 | 16 | return (0); |
2583 | 106 | *p++ = 0xf0 | ((uc >> 18) & 0x07); |
2584 | 106 | *p++ = 0x80 | ((uc >> 12) & 0x3f); |
2585 | 106 | *p++ = 0x80 | ((uc >> 6) & 0x3f); |
2586 | 106 | *p++ = 0x80 | (uc & 0x3f); |
2587 | 106 | } |
2588 | 9.69k | return (p - _p); |
2589 | 10.1k | } |
2590 | | |
/* Decode one UTF-16BE sequence; see utf16_to_unicode(). */
static int
utf16be_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 1;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}
2596 | | |
/* Decode one UTF-16LE sequence; see utf16_to_unicode(). */
static int
utf16le_to_unicode(uint32_t *pwc, const char *s, size_t n)
{
	const int big_endian = 0;

	return (utf16_to_unicode(pwc, s, n, big_endian));
}
2602 | | |
2603 | | static int |
2604 | | utf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be) |
2605 | 0 | { |
2606 | 0 | const char *utf16 = s; |
2607 | 0 | unsigned uc; |
2608 | |
|
2609 | 0 | if (n == 0) |
2610 | 0 | return (0); |
2611 | 0 | if (n == 1) { |
2612 | | /* set the Replacement Character instead. */ |
2613 | 0 | *pwc = UNICODE_R_CHAR; |
2614 | 0 | return (-1); |
2615 | 0 | } |
2616 | | |
2617 | 0 | if (be) |
2618 | 0 | uc = archive_be16dec(utf16); |
2619 | 0 | else |
2620 | 0 | uc = archive_le16dec(utf16); |
2621 | 0 | utf16 += 2; |
2622 | | |
2623 | | /* If this is a surrogate pair, assemble the full code point.*/ |
2624 | 0 | if (IS_HIGH_SURROGATE_LA(uc)) { |
2625 | 0 | unsigned uc2; |
2626 | |
|
2627 | 0 | if (n >= 4) { |
2628 | 0 | if (be) |
2629 | 0 | uc2 = archive_be16dec(utf16); |
2630 | 0 | else |
2631 | 0 | uc2 = archive_le16dec(utf16); |
2632 | 0 | } else |
2633 | 0 | uc2 = 0; |
2634 | 0 | if (IS_LOW_SURROGATE_LA(uc2)) { |
2635 | 0 | uc = combine_surrogate_pair(uc, uc2); |
2636 | 0 | utf16 += 2; |
2637 | 0 | } else { |
2638 | | /* Undescribed code point should be U+FFFD |
2639 | | * (replacement character). */ |
2640 | 0 | *pwc = UNICODE_R_CHAR; |
2641 | 0 | return (-2); |
2642 | 0 | } |
2643 | 0 | } |
2644 | | |
2645 | | /* |
2646 | | * Surrogate pair values(0xd800 through 0xdfff) are only |
2647 | | * used by UTF-16, so, after above calculation, the code |
2648 | | * must not be surrogate values, and Unicode has no codes |
2649 | | * larger than 0x10ffff. Thus, those are not legal Unicode |
2650 | | * values. |
2651 | | */ |
2652 | 0 | if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) { |
2653 | | /* Undescribed code point should be U+FFFD |
2654 | | * (replacement character). */ |
2655 | 0 | *pwc = UNICODE_R_CHAR; |
2656 | 0 | return (((int)(utf16 - s)) * -1); |
2657 | 0 | } |
2658 | 0 | *pwc = uc; |
2659 | 0 | return ((int)(utf16 - s)); |
2660 | 0 | } |
2661 | | |
/*
 * Encode the code point `uc` as big-endian UTF-16 at `p`.
 *
 * Returns the number of bytes written (2, or 4 for a surrogate pair),
 * or 0 without writing anything when `remaining` is too small.
 */
static size_t
unicode_to_utf16be(char *p, size_t remaining, uint32_t uc)
{
	uint32_t offset;

	if (uc <= 0xffff) {
		/* Fits in a single 16-bit code unit. */
		if (remaining < 2)
			return (0);
		archive_be16enc(p, (uint16_t)uc);
		return (2);
	}

	/* Needs a surrogate pair: two 16-bit code units. */
	if (remaining < 4)
		return (0);
	offset = uc - 0x10000;
	archive_be16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
	archive_be16enc(p + 2, (offset & 0x3ff) + 0xDC00);
	return (4);
}
2683 | | |
/*
 * Encode the code point `uc` as little-endian UTF-16 at `p`.
 *
 * Returns the number of bytes written (2, or 4 for a surrogate pair),
 * or 0 without writing anything when `remaining` is too small.
 */
static size_t
unicode_to_utf16le(char *p, size_t remaining, uint32_t uc)
{
	uint32_t offset;

	if (uc <= 0xffff) {
		/* Fits in a single 16-bit code unit. */
		if (remaining < 2)
			return (0);
		archive_le16enc(p, (uint16_t)uc);
		return (2);
	}

	/* Needs a surrogate pair: two 16-bit code units. */
	if (remaining < 4)
		return (0);
	offset = uc - 0x10000;
	archive_le16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
	archive_le16enc(p + 2, (offset & 0x3ff) + 0xDC00);
	return (4);
}
2705 | | |
2706 | | /* |
2707 | | * Append new UTF-8 string to existing UTF-8 string. |
2708 | | * Existing string is assumed to already be in proper form; |
2709 | | * the new string will have invalid sequences replaced and |
2710 | | * surrogate pairs canonicalized. |
2711 | | */ |
2712 | | static int |
2713 | | strncat_from_utf8_to_utf8(struct archive_string *as, const void *_src, |
2714 | | size_t len, struct archive_string_conv *sc) |
2715 | 0 | { |
2716 | 0 | int ret = 0; |
2717 | 0 | const char *src = _src; |
2718 | 0 | (void)sc; /* UNUSED */ |
2719 | | |
2720 | | /* Pre-extend the destination */ |
2721 | 0 | if (archive_string_ensure(as, as->length + len + 1) == NULL) |
2722 | 0 | return (-1); |
2723 | | |
2724 | | /* Invariant: src points to the first UTF8 byte that hasn't |
2725 | | * been copied to the destination `as`. */ |
2726 | 0 | for (;;) { |
2727 | 0 | int n; |
2728 | 0 | uint32_t uc; |
2729 | 0 | const char *e = src; |
2730 | | |
2731 | | /* Skip UTF-8 sequences until we reach end-of-string or |
2732 | | * a code point that needs conversion. */ |
2733 | 0 | while ((n = utf8_to_unicode(&uc, e, len)) > 0) { |
2734 | 0 | e += n; |
2735 | 0 | len -= n; |
2736 | 0 | } |
2737 | | /* Copy the part that doesn't need conversion */ |
2738 | 0 | if (e > src) { |
2739 | 0 | if (archive_string_append(as, src, e - src) == NULL) |
2740 | 0 | return (-1); |
2741 | 0 | src = e; |
2742 | 0 | } |
2743 | | |
2744 | 0 | if (n == 0) { |
2745 | | /* We reached end-of-string */ |
2746 | 0 | return (ret); |
2747 | 0 | } else { |
2748 | | /* Next code point needs conversion */ |
2749 | 0 | char t[4]; |
2750 | 0 | size_t w; |
2751 | | |
2752 | | /* Try decoding a surrogate pair */ |
2753 | 0 | if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) { |
2754 | 0 | n = cesu8_to_unicode(&uc, src, len); |
2755 | 0 | } |
2756 | | /* Not a (valid) surrogate, so use a replacement char */ |
2757 | 0 | if (n < 0) { |
2758 | 0 | ret = -1; /* Return -1 if we used any replacement */ |
2759 | 0 | n *= -1; |
2760 | 0 | } |
2761 | | /* Consume converted code point */ |
2762 | 0 | src += n; |
2763 | 0 | len -= n; |
2764 | | /* Convert and append new UTF-8 sequence. */ |
2765 | 0 | w = unicode_to_utf8(t, sizeof(t), uc); |
2766 | 0 | if (archive_string_append(as, t, w) == NULL) |
2767 | 0 | return (-1); |
2768 | 0 | } |
2769 | 0 | } |
2770 | 0 | } |
2771 | | |
2772 | | static int |
2773 | | archive_string_append_unicode(struct archive_string *as, const void *_p, |
2774 | | size_t len, struct archive_string_conv *sc) |
2775 | 0 | { |
2776 | 0 | const char *s; |
2777 | 0 | char *p, *endp; |
2778 | 0 | uint32_t uc; |
2779 | 0 | size_t w; |
2780 | 0 | size_t ts, tm; |
2781 | 0 | int n, ret = 0; |
2782 | 0 | int (*parse)(uint32_t *, const char *, size_t); |
2783 | 0 | size_t (*unparse)(char *, size_t, uint32_t); |
2784 | |
|
2785 | 0 | if (sc->flag & SCONV_TO_UTF16BE) { |
2786 | 0 | unparse = unicode_to_utf16be; |
2787 | 0 | ts = 2; |
2788 | 0 | } else if (sc->flag & SCONV_TO_UTF16LE) { |
2789 | 0 | unparse = unicode_to_utf16le; |
2790 | 0 | ts = 2; |
2791 | 0 | } else if (sc->flag & SCONV_TO_UTF8) { |
2792 | 0 | unparse = unicode_to_utf8; |
2793 | 0 | ts = 1; |
2794 | 0 | } else { |
2795 | | /* |
2796 | | * This case is going to be converted to another |
2797 | | * character-set through iconv. |
2798 | | */ |
2799 | 0 | if (sc->flag & SCONV_FROM_UTF16BE) { |
2800 | 0 | unparse = unicode_to_utf16be; |
2801 | 0 | ts = 2; |
2802 | 0 | } else if (sc->flag & SCONV_FROM_UTF16LE) { |
2803 | 0 | unparse = unicode_to_utf16le; |
2804 | 0 | ts = 2; |
2805 | 0 | } else { |
2806 | 0 | unparse = unicode_to_utf8; |
2807 | 0 | ts = 1; |
2808 | 0 | } |
2809 | 0 | } |
2810 | |
|
2811 | 0 | if (sc->flag & SCONV_FROM_UTF16BE) { |
2812 | 0 | parse = utf16be_to_unicode; |
2813 | 0 | tm = 1; |
2814 | 0 | } else if (sc->flag & SCONV_FROM_UTF16LE) { |
2815 | 0 | parse = utf16le_to_unicode; |
2816 | 0 | tm = 1; |
2817 | 0 | } else { |
2818 | 0 | parse = cesu8_to_unicode; |
2819 | 0 | tm = ts; |
2820 | 0 | } |
2821 | |
|
2822 | 0 | if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) |
2823 | 0 | return (-1); |
2824 | | |
2825 | 0 | s = (const char *)_p; |
2826 | 0 | p = as->s + as->length; |
2827 | 0 | endp = as->s + as->buffer_length - ts; |
2828 | 0 | while ((n = parse(&uc, s, len)) != 0) { |
2829 | 0 | if (n < 0) { |
2830 | | /* Use a replaced unicode character. */ |
2831 | 0 | n *= -1; |
2832 | 0 | ret = -1; |
2833 | 0 | } |
2834 | 0 | s += n; |
2835 | 0 | len -= n; |
2836 | 0 | while ((w = unparse(p, endp - p, uc)) == 0) { |
2837 | | /* There is not enough output buffer so |
2838 | | * we have to expand it. */ |
2839 | 0 | as->length = p - as->s; |
2840 | 0 | if (archive_string_ensure(as, |
2841 | 0 | as->buffer_length + len * tm + ts) == NULL) |
2842 | 0 | return (-1); |
2843 | 0 | p = as->s + as->length; |
2844 | 0 | endp = as->s + as->buffer_length - ts; |
2845 | 0 | } |
2846 | 0 | p += w; |
2847 | 0 | } |
2848 | 0 | as->length = p - as->s; |
2849 | 0 | as->s[as->length] = '\0'; |
2850 | 0 | if (ts == 2) |
2851 | 0 | as->s[as->length+1] = '\0'; |
2852 | 0 | return (ret); |
2853 | 0 | } |
2854 | | |
2855 | | /* |
2856 | | * Constants for Hangul composition and decomposition; the values come from |
2857 | | * Unicode Standard Annex #15 http://unicode.org/reports/tr15/ |
| | * |
| | * HC_LBASE, HC_VBASE and HC_TBASE are the code points that the L, V and T |
| | * indexes computed by the normalization functions below are measured from, |
| | * HC_SBASE is the code point that the syllable index is measured from, and |
| | * the HC_xCOUNT values are the exclusive upper bounds of those indexes. |
2858 | | */ |
2859 | 1.33k | #define HC_SBASE 0xAC00 |
2860 | 1.33k | #define HC_LBASE 0x1100 |
2861 | 0 | #define HC_VBASE 0x1161 |
2862 | 28 | #define HC_TBASE 0x11A7 |
2863 | 274 | #define HC_LCOUNT 19 |
2864 | 112 | #define HC_VCOUNT 21 |
2865 | 154 | #define HC_TCOUNT 28 |
2866 | 112 | #define HC_NCOUNT (HC_VCOUNT * HC_TCOUNT) |
2867 | 1.44k | #define HC_SCOUNT (HC_LCOUNT * HC_NCOUNT) |
2868 | | |
2869 | | static uint32_t |
2870 | | get_nfc(uint32_t uc, uint32_t uc2) |
2871 | 3.10k | { |
2872 | 3.10k | int t, b; |
2873 | | |
2874 | 3.10k | t = 0; |
2875 | 3.10k | b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1; |
2876 | 33.0k | while (b >= t) { |
2877 | 30.0k | int m = (t + b) / 2; |
2878 | 30.0k | if (u_composition_table[m].cp1 < uc) |
2879 | 10.0k | t = m + 1; |
2880 | 20.0k | else if (u_composition_table[m].cp1 > uc) |
2881 | 16.3k | b = m - 1; |
2882 | 3.64k | else if (u_composition_table[m].cp2 < uc2) |
2883 | 2.85k | t = m + 1; |
2884 | 788 | else if (u_composition_table[m].cp2 > uc2) |
2885 | 588 | b = m - 1; |
2886 | 200 | else |
2887 | 200 | return (u_composition_table[m].nfc); |
2888 | 30.0k | } |
2889 | 2.90k | return (0); |
2890 | 3.10k | } |
2891 | | |
2892 | 3.35k | #define FDC_MAX 10 /* The maximum number of Following Decomposable |
2893 | | * Characters. */ |
2894 | | |
2895 | | /* |
2896 | | * Update first code point. |
| | * Stores new_uc in the local `uc` and clears `ucptr`, so that WRITE_UC() |
| | * encodes the value with unparse() instead of copying input bytes. |
2897 | | */ |
2898 | 200 | #define UPDATE_UC(new_uc) do { \ |
2899 | 200 | uc = new_uc; \ |
2900 | 200 | ucptr = NULL; \ |
2901 | 200 | } while (0) |
2902 | | |
2903 | | /* |
2904 | | * Replace first code point with second code point. |
| | * Copies the locals uc2, uc2ptr and n2 into uc, ucptr and n. |
2905 | | */ |
2906 | 10.4k | #define REPLACE_UC_WITH_UC2() do { \ |
2907 | 10.4k | uc = uc2; \ |
2908 | 10.4k | ucptr = uc2ptr; \ |
2909 | 10.4k | n = n2; \ |
2910 | 10.4k | } while (0) |
2911 | | |
/*
 * Grow the output buffer of `as`.
 * Uses the locals as, p, endp, len, tm and ts of the enclosing function:
 * records the bytes written so far in as->length, asks
 * archive_string_ensure() for more room, then recomputes p and endp
 * from as->s.  Makes the enclosing function return -1 if
 * archive_string_ensure() returns NULL.
 */
2912 | 572 | #define EXPAND_BUFFER() do { \ |
2913 | 572 | as->length = p - as->s; \ |
2914 | 572 | if (archive_string_ensure(as, \ |
2915 | 572 | as->buffer_length + len * tm + ts) == NULL)\ |
2916 | 572 | return (-1); \ |
2917 | 572 | p = as->s + as->length; \ |
2918 | 572 | endp = as->s + as->buffer_length - ts; \ |
2919 | 572 | } while (0) |
2920 | | |
/*
 * Encode `uc` at `p` with the local function pointer unparse() and
 * advance `p` by the number of bytes written (kept in the local `w`).
 * While unparse() returns 0, EXPAND_BUFFER() is run and the call is
 * retried.
 */
2921 | 10.5k | #define UNPARSE(p, endp, uc) do { \ |
2922 | 10.1k | while ((w = unparse(p, (endp) - (p), uc)) == 0) {\ |
2923 | 414 | EXPAND_BUFFER(); \ |
2924 | 414 | } \ |
2925 | 9.69k | p += w; \ |
2926 | 9.69k | } while (0) |
2927 | | |
2928 | | /* |
2929 | | * Write first code point. |
2930 | | * If the code point has not been changed from its original code |
| | * (ucptr is not NULL), this just copies its n bytes from the input |
2931 | | * buffer, calling EXPAND_BUFFER() first when p + n would pass endp. |
2932 | | * If not, this encodes it with unparse() through UNPARSE(). |
| | * The copy only handles n from 1 to 4; ucptr is NULL afterwards. |
2933 | | */ |
2934 | 15.4k | #define WRITE_UC() do { \ |
2935 | 15.4k | if (ucptr) { \ |
2936 | 15.2k | if (p + n > endp) \ |
2937 | 15.2k | EXPAND_BUFFER(); \ |
2938 | 15.2k | switch (n) { \ |
2939 | 114 | case 4: \ |
2940 | 114 | *p++ = *ucptr++; \ |
2941 | 114 | /* FALL THROUGH */ \ |
2942 | 378 | case 3: \ |
2943 | 378 | *p++ = *ucptr++; \ |
2944 | 378 | /* FALL THROUGH */ \ |
2945 | 1.18k | case 2: \ |
2946 | 1.18k | *p++ = *ucptr++; \ |
2947 | 1.18k | /* FALL THROUGH */ \ |
2948 | 15.2k | case 1: \ |
2949 | 15.2k | *p++ = *ucptr; \ |
2950 | 15.2k | break; \ |
2951 | 15.2k | } \ |
2952 | 15.2k | ucptr = NULL; \ |
2953 | 15.2k | } else { \ |
2954 | 220 | UNPARSE(p, endp, uc); \ |
2955 | 220 | } \ |
2956 | 15.4k | } while (0) |
2957 | | |
2958 | | /* |
2959 | | * Collect following decomposable code points. |
| | * |
| | * Starting at slot `start`, parses code points from s into ucx[] and |
| | * their CCC() values into ccx[]. Collection stops when parse() returns |
| | * a value <= 0, when FDC_MAX slots are in use, or when the new value cx |
| | * is not greater than the previous value cl (unless either one is 228); |
| | * the code point that stopped the collection is not consumed. |
| | * For every accepted code point s and len are advanced and cl is updated. |
| | * On exit ucx_size holds the number of slots in use; if all FDC_MAX |
| | * slots were filled, ret is also set to -1. |
| | * Uses the locals parse, s, len, nx, cx, cl, ucx, ccx, ucx_size and ret |
| | * of the enclosing function. |
2960 | | */ |
2961 | 900 | #define COLLECT_CPS(start) do { \ |
2962 | 900 | int _i; \ |
2963 | 2.44k | for (_i = start; _i < FDC_MAX ; _i++) { \ |
2964 | 2.42k | nx = parse(&ucx[_i], s, len); \ |
2965 | 2.42k | if (nx <= 0) \ |
2966 | 2.42k | break; \ |
2967 | 2.42k | cx = CCC(ucx[_i]); \ |
2968 | 2.09k | if (cl >= cx && cl != 228 && cx != 228)\ |
2969 | 2.09k | break; \ |
2970 | 2.09k | s += nx; \ |
2971 | 1.54k | len -= nx; \ |
2972 | 1.54k | cl = cx; \ |
2973 | 1.54k | ccx[_i] = cx; \ |
2974 | 1.54k | } \ |
2975 | 900 | if (_i >= FDC_MAX) { \ |
2976 | 14 | ret = -1; \ |
2977 | 14 | ucx_size = FDC_MAX; \ |
2978 | 14 | } else \ |
2979 | 900 | ucx_size = _i; \ |
2980 | 900 | } while (0) |
2981 | | |
2982 | | /* |
2983 | | * Normalize UTF-8/UTF-16BE characters to Form C and copy the result. |
2984 | | * |
| | * The code points parsed from the `len` bytes at `_p` are composed and |
| | * appended to `as`, starting at as->s + as->length. |
| | * |
| | * sc->flag selects the codecs: SCONV_FROM_UTF16BE/SCONV_FROM_UTF16LE pick |
| | * the UTF-16 parsers (any other input is parsed with cesu8_to_unicode()), |
| | * and SCONV_TO_UTF16BE/SCONV_TO_UTF16LE/SCONV_TO_UTF8 pick the encoder. |
| | * When no SCONV_TO_* flag is set, the output keeps the form of the input. |
| | * always_replace is cleared only when input and output use the same form |
| | * (or no SCONV_TO_* flag is set); otherwise every code point is |
| | * re-encoded with unparse() rather than copied from the input bytes. |
| | * |
| | * Returns -1 if the output buffer cannot be grown. Also returns -1, after |
| | * still appending the converted text, if parse() reported an invalid |
| | * sequence or COLLECT_CPS() filled all FDC_MAX slots. Otherwise returns 0. |
| | * |
2985 | | * TODO: Convert composition exclusions, which are never converted |
2986 | | * from NFC,NFD,NFKC and NFKD, to Form C. |
2987 | | */ |
2988 | | static int |
2989 | | archive_string_normalize_C(struct archive_string *as, const void *_p, |
2990 | | size_t len, struct archive_string_conv *sc) |
2991 | 2.39k | { |
2992 | 2.39k | const char *s = (const char *)_p; |
2993 | 2.39k | char *p, *endp; |
2994 | 2.39k | uint32_t uc, uc2; |
2995 | 2.39k | size_t w; |
2996 | 2.39k | int always_replace, n, n2, ret = 0, spair, ts, tm; |
2997 | 2.39k | int (*parse)(uint32_t *, const char *, size_t); |
2998 | 2.39k | size_t (*unparse)(char *, size_t, uint32_t); |
2999 | | |
3000 | 2.39k | always_replace = 1; |
3001 | 2.39k | ts = 1;/* text size: bytes reserved for the terminator (2 for UTF-16). */ |
3002 | 2.39k | if (sc->flag & SCONV_TO_UTF16BE) { |
3003 | 0 | unparse = unicode_to_utf16be; |
3004 | 0 | ts = 2; |
3005 | 0 | if (sc->flag & SCONV_FROM_UTF16BE) |
3006 | 0 | always_replace = 0; |
3007 | 2.39k | } else if (sc->flag & SCONV_TO_UTF16LE) { |
3008 | 0 | unparse = unicode_to_utf16le; |
3009 | 0 | ts = 2; |
3010 | 0 | if (sc->flag & SCONV_FROM_UTF16LE) |
3011 | 0 | always_replace = 0; |
3012 | 2.39k | } else if (sc->flag & SCONV_TO_UTF8) { |
3013 | 0 | unparse = unicode_to_utf8; |
3014 | 0 | if (sc->flag & SCONV_FROM_UTF8) |
3015 | 0 | always_replace = 0; |
3016 | 2.39k | } else { |
3017 | | /* |
3018 | | * This case is going to be converted to another |
3019 | | * character-set through iconv. |
3020 | | */ |
3021 | 2.39k | always_replace = 0; |
3022 | 2.39k | if (sc->flag & SCONV_FROM_UTF16BE) { |
3023 | 0 | unparse = unicode_to_utf16be; |
3024 | 0 | ts = 2; |
3025 | 2.39k | } else if (sc->flag & SCONV_FROM_UTF16LE) { |
3026 | 0 | unparse = unicode_to_utf16le; |
3027 | 0 | ts = 2; |
3028 | 2.39k | } else { |
3029 | 2.39k | unparse = unicode_to_utf8; |
3030 | 2.39k | } |
3031 | 2.39k | } |
3032 | | |
3033 | 2.39k | if (sc->flag & SCONV_FROM_UTF16BE) { |
3034 | 0 | parse = utf16be_to_unicode; |
3035 | 0 | tm = 1; |
3036 | 0 | spair = 4;/* surrogate pair size in UTF-16. */ |
3037 | 2.39k | } else if (sc->flag & SCONV_FROM_UTF16LE) { |
3038 | 0 | parse = utf16le_to_unicode; |
3039 | 0 | tm = 1; |
3040 | 0 | spair = 4;/* surrogate pair size in UTF-16. */ |
3041 | 2.39k | } else { |
3042 | 2.39k | parse = cesu8_to_unicode; |
3043 | 2.39k | tm = ts; |
3044 | 2.39k | spair = 6;/* surrogate pair size in UTF-8. */ |
3045 | 2.39k | } |
3046 | | |
3047 | 2.39k | if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) |
3048 | 0 | return (-1); |
3049 | | |
3050 | 2.39k | p = as->s + as->length; |
3051 | 2.39k | endp = as->s + as->buffer_length - ts; |
3052 | 10.0k | while ((n = parse(&uc, s, len)) != 0) { |
3053 | 9.67k | const char *ucptr, *uc2ptr; |
3054 | | |
3055 | 9.67k | if (n < 0) { |
3056 | | /* Use a replaced unicode character. */ |
3057 | 4.64k | UNPARSE(p, endp, uc); |
3058 | 4.64k | s += n*-1; |
3059 | 4.64k | len -= n*-1; |
3060 | 4.64k | ret = -1; |
3061 | 4.64k | continue; |
3062 | 5.03k | } else if (n == spair || always_replace) |
3063 | | /* uc was converted from a surrogate pair (or must |
3064 | | * always be re-encoded); treat it as a changed code. */ |
3065 | 12 | ucptr = NULL; |
3066 | 5.01k | else |
3067 | 5.01k | ucptr = s; |
3068 | 5.03k | s += n; |
3069 | 5.03k | len -= n; |
3070 | | |
3071 | | /* Read second code point. */ |
3072 | 15.5k | while ((n2 = parse(&uc2, s, len)) > 0) { |
3073 | 11.3k | uint32_t ucx[FDC_MAX]; |
3074 | 11.3k | int ccx[FDC_MAX]; |
3075 | 11.3k | int cl, cx, i, nx, ucx_size; |
3076 | 11.3k | int LIndex,SIndex; |
3077 | 11.3k | uint32_t nfc; |
3078 | | |
3079 | 11.3k | if (n2 == spair || always_replace) |
3080 | | /* uc2 was converted from a surrogate pair (or must |
3081 | | * always be re-encoded); treat it as a changed code. */ |
3082 | 8 | uc2ptr = NULL; |
3083 | 11.3k | else |
3084 | 11.3k | uc2ptr = s; |
3085 | 11.3k | s += n2; |
3086 | 11.3k | len -= n2; |
3087 | | |
3088 | | /* |
3089 | | * If current second code point is out of decomposable |
3090 | | * code points, finding compositions is unneeded. |
3091 | | */ |
3092 | 11.3k | if (!IS_DECOMPOSABLE_BLOCK(uc2)) { |
3093 | 9.99k | WRITE_UC(); |
3094 | 9.99k | REPLACE_UC_WITH_UC2(); |
3095 | 9.99k | continue; |
3096 | 9.99k | } |
3097 | | |
3098 | | /* |
3099 | | * Try to combine current code points. |
3100 | | */ |
3101 | | /* |
3102 | | * We have to combine Hangul characters according to |
3103 | | * http://unicode.org/reports/tr15/#Hangul |
3104 | | */ |
3105 | 1.33k | if (0 <= (LIndex = uc - HC_LBASE) && |
3106 | 162 | LIndex < HC_LCOUNT) { |
3107 | | /* |
3108 | | * Hangul Composition. |
3109 | | * 1. Two current code points are L and V. |
3110 | | */ |
3111 | 0 | int VIndex = uc2 - HC_VBASE; |
3112 | 0 | if (0 <= VIndex && VIndex < HC_VCOUNT) { |
3113 | | /* Make syllable of form LV. */ |
3114 | 0 | UPDATE_UC(HC_SBASE + |
3115 | 0 | (LIndex * HC_VCOUNT + VIndex) * |
3116 | 0 | HC_TCOUNT); |
3117 | 0 | } else { |
3118 | 0 | WRITE_UC(); |
3119 | 0 | REPLACE_UC_WITH_UC2(); |
3120 | 0 | } |
3121 | 0 | continue; |
3122 | 1.33k | } else if (0 <= (SIndex = uc - HC_SBASE) && |
3123 | 112 | SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) { |
3124 | | /* |
3125 | | * Hangul Composition. |
3126 | | * 2. Two current code points are LV and T. |
3127 | | */ |
3128 | 28 | int TIndex = uc2 - HC_TBASE; |
3129 | 28 | if (0 < TIndex && TIndex < HC_TCOUNT) { |
3130 | | /* Make syllable of form LVT. */ |
3131 | 0 | UPDATE_UC(uc + TIndex); |
3132 | 28 | } else { |
3133 | 28 | WRITE_UC(); |
3134 | 28 | REPLACE_UC_WITH_UC2(); |
3135 | 28 | } |
3136 | 28 | continue; |
3137 | 1.30k | } else if ((nfc = get_nfc(uc, uc2)) != 0) { |
3138 | | /* A composition to current code points |
3139 | | * is found. */ |
3140 | 68 | UPDATE_UC(nfc); |
3141 | 68 | continue; |
3142 | 1.24k | } else if ((cl = CCC(uc2)) == 0) { |
3143 | | /* CCC(uc2) is 0, so the second code point |
3144 | | * cannot combine with anything collected later. */ |
3145 | 388 | WRITE_UC(); |
3146 | 388 | REPLACE_UC_WITH_UC2(); |
3147 | 388 | continue; |
3148 | 388 | } |
3149 | | |
3150 | | /* |
3151 | | * Collect following decomposable code points. |
3152 | | */ |
3153 | 852 | cx = 0; |
3154 | 852 | ucx[0] = uc2; |
3155 | 852 | ccx[0] = cl; |
3156 | 852 | COLLECT_CPS(1); |
3157 | | |
3158 | | /* |
3159 | | * Find a composed code in the collected code points. |
3160 | | */ |
3161 | 852 | i = 1; |
3162 | 2.65k | while (i < ucx_size) { |
3163 | 1.79k | int j; |
3164 | | |
3165 | 1.79k | if ((nfc = get_nfc(uc, ucx[i])) == 0) { |
3166 | 1.66k | i++; |
3167 | 1.66k | continue; |
3168 | 1.66k | } |
3169 | | |
3170 | | /* |
3171 | | * nfc is composed of uc and ucx[i]. |
3172 | | */ |
3173 | 132 | UPDATE_UC(nfc); |
3174 | | |
3175 | | /* |
3176 | | * Remove ucx[i] by shifting |
3177 | | * following code points. |
3178 | | */ |
3179 | 184 | for (j = i; j+1 < ucx_size; j++) { |
3180 | 52 | ucx[j] = ucx[j+1]; |
3181 | 52 | ccx[j] = ccx[j+1]; |
3182 | 52 | } |
3183 | 132 | ucx_size --; |
3184 | | |
3185 | | /* |
3186 | | * Collect following code points blocked |
3187 | | * by ucx[i] the removed code point. |
3188 | | */ |
3189 | 132 | if (ucx_size > 0 && i == ucx_size && |
3190 | 102 | nx > 0 && cx == cl) { |
3191 | 48 | cl = ccx[ucx_size-1]; |
3192 | 48 | COLLECT_CPS(ucx_size); |
3193 | 48 | } |
3194 | | /* |
3195 | | * Restart finding a composed code with |
3196 | | * the updated uc from the top of the |
3197 | | * collected code points. |
3198 | | */ |
3199 | 132 | i = 0; |
3200 | 132 | } |
3201 | | |
3202 | | /* |
3203 | | * Apparently the current code points are not |
3204 | | * decomposed characters or already composed. |
3205 | | */ |
3206 | 852 | WRITE_UC(); |
3207 | 3.11k | for (i = 0; i < ucx_size; i++) |
3208 | 2.26k | UNPARSE(p, endp, ucx[i]); |
3209 | | |
3210 | | /* |
3211 | | * Flush out remaining canonical combining characters. |
3212 | | */ |
3213 | 852 | if (nx > 0 && cx == cl && len > 0) { |
3214 | 542 | while ((nx = parse(&ucx[0], s, len)) |
3215 | 542 | > 0) { |
3216 | 472 | cx = CCC(ucx[0]); |
3217 | 472 | if (cl > cx) |
3218 | 58 | break; |
3219 | 414 | s += nx; |
3220 | 414 | len -= nx; |
3221 | 414 | cl = cx; |
3222 | 414 | UNPARSE(p, endp, ucx[0]); |
3223 | 414 | } |
3224 | 128 | } |
3225 | 852 | break; |
3226 | 852 | } |
3227 | 5.03k | if (n2 < 0) { |
3228 | 2.15k | WRITE_UC(); |
3229 | | /* Use a replaced unicode character. */ |
3230 | 2.15k | UNPARSE(p, endp, uc2); |
3231 | 2.15k | s += n2*-1; |
3232 | 2.15k | len -= n2*-1; |
3233 | 2.15k | ret = -1; |
3234 | 2.15k | continue; |
3235 | 2.88k | } else if (n2 == 0) { |
3236 | 2.02k | WRITE_UC(); |
3237 | 2.02k | break; |
3238 | 2.02k | } |
3239 | 5.03k | } |
3240 | 2.39k | as->length = p - as->s; |
3241 | 2.39k | as->s[as->length] = '\0'; |
3242 | 2.39k | if (ts == 2) |
3243 | 0 | as->s[as->length+1] = '\0'; |
3244 | 2.39k | return (ret); |
3245 | 2.39k | } |
3246 | | |
3247 | | static int |
3248 | | get_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc) |
3249 | 0 | { |
3250 | 0 | int t, b; |
3251 | | |
3252 | | /* |
3253 | | * These are not converted to NFD on Mac OS. |
3254 | | */ |
3255 | 0 | if ((uc >= 0x2000 && uc <= 0x2FFF) || |
3256 | 0 | (uc >= 0xF900 && uc <= 0xFAFF) || |
3257 | 0 | (uc >= 0x2F800 && uc <= 0x2FAFF)) |
3258 | 0 | return (0); |
3259 | | /* |
3260 | | * Those code points are not converted to NFD on Mac OS. |
3261 | | * I do not know the reason because it is undocumented. |
3262 | | * NFC NFD |
3263 | | * 1109A ==> 11099 110BA |
3264 | | * 1109C ==> 1109B 110BA |
3265 | | * 110AB ==> 110A5 110BA |
3266 | | */ |
3267 | 0 | if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB) |
3268 | 0 | return (0); |
3269 | | |
3270 | 0 | t = 0; |
3271 | 0 | b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1; |
3272 | 0 | while (b >= t) { |
3273 | 0 | int m = (t + b) / 2; |
3274 | 0 | if (u_decomposition_table[m].nfc < uc) |
3275 | 0 | t = m + 1; |
3276 | 0 | else if (u_decomposition_table[m].nfc > uc) |
3277 | 0 | b = m - 1; |
3278 | 0 | else { |
3279 | 0 | *cp1 = u_decomposition_table[m].cp1; |
3280 | 0 | *cp2 = u_decomposition_table[m].cp2; |
3281 | 0 | return (1); |
3282 | 0 | } |
3283 | 0 | } |
3284 | 0 | return (0); |
3285 | 0 | } |
3286 | | |
/*
 * Replace the first code point with `cp`.
 * Stores cp in the local `uc` and clears `ucptr`, so that WRITE_UC()
 * encodes the value with unparse() instead of copying input bytes.
 */
3287 | 0 | #define REPLACE_UC_WITH(cp) do { \ |
3288 | 0 | uc = cp; \ |
3289 | 0 | ucptr = NULL; \ |
3290 | 0 | } while (0) |
3291 | | |
3292 | | /* |
3293 | | * Normalize UTF-8/UTF-16 characters to Form D and copy the result. |
| | * |
| | * The code points parsed from the `len` bytes at `_p` are decomposed and |
| | * appended to `as`, starting at as->s + as->length. sc->flag selects the |
| | * parser and the encoder in the same way as in |
| | * archive_string_normalize_C(). |
| | * |
| | * For each starting code point: |
| | * - a Hangul syllable (index within HC_SCOUNT of HC_SBASE) is written |
| | * as its L and V parts, plus a T part when there is one; |
| | * - a code point in a decomposable block with a non-zero CCC() value is |
| | * written unchanged; |
| | * - anything else is decomposed repeatedly with get_nfd(); the second |
| | * parts are queued in fdc[], followed by the next input code points |
| | * with a non-zero CCC() value, each inserted before the first queued |
| | * entry with a greater value. At most FDC_MAX entries are queued. |
| | * |
| | * Returns -1 if the output buffer cannot be grown. Also returns -1, after |
| | * still appending the converted text, if parse() reported an invalid |
| | * sequence. Otherwise returns 0. |
3294 | | */ |
3295 | | static int |
3296 | | archive_string_normalize_D(struct archive_string *as, const void *_p, |
3297 | | size_t len, struct archive_string_conv *sc) |
3298 | 0 | { |
3299 | 0 | const char *s = (const char *)_p; |
3300 | 0 | char *p, *endp; |
3301 | 0 | uint32_t uc, uc2; |
3302 | 0 | size_t w; |
3303 | 0 | int always_replace, n, n2, ret = 0, spair, ts, tm; |
3304 | 0 | int (*parse)(uint32_t *, const char *, size_t); |
3305 | 0 | size_t (*unparse)(char *, size_t, uint32_t); |
3306 | |
|
3307 | 0 | always_replace = 1; |
3308 | 0 | ts = 1;/* text size: bytes reserved for the terminator (2 for UTF-16). */ |
3309 | 0 | if (sc->flag & SCONV_TO_UTF16BE) { |
3310 | 0 | unparse = unicode_to_utf16be; |
3311 | 0 | ts = 2; |
3312 | 0 | if (sc->flag & SCONV_FROM_UTF16BE) |
3313 | 0 | always_replace = 0; |
3314 | 0 | } else if (sc->flag & SCONV_TO_UTF16LE) { |
3315 | 0 | unparse = unicode_to_utf16le; |
3316 | 0 | ts = 2; |
3317 | 0 | if (sc->flag & SCONV_FROM_UTF16LE) |
3318 | 0 | always_replace = 0; |
3319 | 0 | } else if (sc->flag & SCONV_TO_UTF8) { |
3320 | 0 | unparse = unicode_to_utf8; |
3321 | 0 | if (sc->flag & SCONV_FROM_UTF8) |
3322 | 0 | always_replace = 0; |
3323 | 0 | } else { |
3324 | | /* |
3325 | | * This case is going to be converted to another |
3326 | | * character-set through iconv. |
3327 | | */ |
3328 | 0 | always_replace = 0; |
3329 | 0 | if (sc->flag & SCONV_FROM_UTF16BE) { |
3330 | 0 | unparse = unicode_to_utf16be; |
3331 | 0 | ts = 2; |
3332 | 0 | } else if (sc->flag & SCONV_FROM_UTF16LE) { |
3333 | 0 | unparse = unicode_to_utf16le; |
3334 | 0 | ts = 2; |
3335 | 0 | } else { |
3336 | 0 | unparse = unicode_to_utf8; |
3337 | 0 | } |
3338 | 0 | } |
3339 | |
|
3340 | 0 | if (sc->flag & SCONV_FROM_UTF16BE) { |
3341 | 0 | parse = utf16be_to_unicode; |
3342 | 0 | tm = 1; |
3343 | 0 | spair = 4;/* surrogate pair size in UTF-16. */ |
3344 | 0 | } else if (sc->flag & SCONV_FROM_UTF16LE) { |
3345 | 0 | parse = utf16le_to_unicode; |
3346 | 0 | tm = 1; |
3347 | 0 | spair = 4;/* surrogate pair size in UTF-16. */ |
3348 | 0 | } else { |
3349 | 0 | parse = cesu8_to_unicode; |
3350 | 0 | tm = ts; |
3351 | 0 | spair = 6;/* surrogate pair size in UTF-8. */ |
3352 | 0 | } |
3353 | |
|
3354 | 0 | if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) |
3355 | 0 | return (-1); |
3356 | | |
3357 | 0 | p = as->s + as->length; |
3358 | 0 | endp = as->s + as->buffer_length - ts; |
3359 | 0 | while ((n = parse(&uc, s, len)) != 0) { |
3360 | 0 | const char *ucptr; |
3361 | 0 | uint32_t cp1, cp2; |
3362 | 0 | int SIndex; |
3363 | 0 | struct { |
3364 | 0 | uint32_t uc; |
3365 | 0 | int ccc; |
3366 | 0 | } fdc[FDC_MAX]; |
3367 | 0 | int fdi, fdj; |
3368 | 0 | int ccc; |
3369 | |
|
3370 | 0 | check_first_code: |
3371 | 0 | if (n < 0) { |
3372 | | /* Use a replaced unicode character. */ |
3373 | 0 | UNPARSE(p, endp, uc); |
3374 | 0 | s += n*-1; |
3375 | 0 | len -= n*-1; |
3376 | 0 | ret = -1; |
3377 | 0 | continue; |
3378 | 0 | } else if (n == spair || always_replace) |
3379 | | /* uc was converted from a surrogate pair (or must |
3380 | | * always be re-encoded); treat it as a changed code. */ |
3381 | 0 | ucptr = NULL; |
3382 | 0 | else |
3383 | 0 | ucptr = s; |
3384 | 0 | s += n; |
3385 | 0 | len -= n; |
3386 | | |
3387 | | /* Hangul Decomposition. */ |
3388 | 0 | if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) { |
3389 | 0 | int L = HC_LBASE + SIndex / HC_NCOUNT; |
3390 | 0 | int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT; |
3391 | 0 | int T = HC_TBASE + SIndex % HC_TCOUNT; |
3392 | |
|
3393 | 0 | REPLACE_UC_WITH(L); |
3394 | 0 | WRITE_UC(); |
3395 | 0 | REPLACE_UC_WITH(V); |
3396 | 0 | WRITE_UC(); |
3397 | 0 | if (T != HC_TBASE) { |
3398 | 0 | REPLACE_UC_WITH(T); |
3399 | 0 | WRITE_UC(); |
3400 | 0 | } |
3401 | 0 | continue; |
3402 | 0 | } |
3403 | 0 | if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) { |
3404 | 0 | WRITE_UC(); |
3405 | 0 | continue; |
3406 | 0 | } |
3407 | | |
3408 | 0 | fdi = 0; |
3409 | 0 | while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) { |
3410 | 0 | int k; |
3411 | |
|
3412 | 0 | for (k = fdi; k > 0; k--) |
3413 | 0 | fdc[k] = fdc[k-1]; |
3414 | 0 | fdc[0].ccc = CCC(cp2); |
3415 | 0 | fdc[0].uc = cp2; |
3416 | 0 | fdi++; |
3417 | 0 | REPLACE_UC_WITH(cp1); |
3418 | 0 | } |
3419 | | |
3420 | | /* Read following code points. */ |
3421 | 0 | while ((n2 = parse(&uc2, s, len)) > 0 && |
3422 | 0 | (ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) { |
3423 | 0 | int j, k; |
3424 | |
|
3425 | 0 | s += n2; |
3426 | 0 | len -= n2; |
3427 | 0 | for (j = 0; j < fdi; j++) { |
3428 | 0 | if (fdc[j].ccc > ccc) |
3429 | 0 | break; |
3430 | 0 | } |
3431 | 0 | if (j < fdi) { |
3432 | 0 | for (k = fdi; k > j; k--) |
3433 | 0 | fdc[k] = fdc[k-1]; |
3434 | 0 | fdc[j].ccc = ccc; |
3435 | 0 | fdc[j].uc = uc2; |
3436 | 0 | } else { |
3437 | 0 | fdc[fdi].ccc = ccc; |
3438 | 0 | fdc[fdi].uc = uc2; |
3439 | 0 | } |
3440 | 0 | fdi++; |
3441 | 0 | } |
3442 | |
|
3443 | 0 | WRITE_UC(); |
3444 | 0 | for (fdj = 0; fdj < fdi; fdj++) { |
3445 | 0 | REPLACE_UC_WITH(fdc[fdj].uc); |
3446 | 0 | WRITE_UC(); |
3447 | 0 | } |
3448 | | |
3449 | 0 | if (n2 == 0) |
3450 | 0 | break; |
3451 | 0 | REPLACE_UC_WITH(uc2); |
3452 | 0 | n = n2; |
3453 | 0 | goto check_first_code; |
3454 | 0 | } |
3455 | 0 | as->length = p - as->s; |
3456 | 0 | as->s[as->length] = '\0'; |
3457 | 0 | if (ts == 2) |
3458 | 0 | as->s[as->length+1] = '\0'; |
3459 | 0 | return (ret); |
3460 | 0 | } |
3461 | | |
3462 | | /* |
3463 | | * libarchive 2.x made incorrect UTF-8 strings in the wrong assumption |
3464 | | * that WCS is Unicode. It is true for several platforms but some are false. |
3465 | | * And then people who did not use UTF-8 locale on the non Unicode WCS |
3466 | | * platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those |
3467 | | * now cannot get right filename from libarchive 3.x and later since we |
3468 | | * fixed the wrong assumption and it is incompatible to older its versions. |
3469 | | * So we provide special option, "compat-2x.x", for resolving it. |
3470 | | * That option enable the string conversion of libarchive 2.x. |
3471 | | * |
3472 | | * Translates the wrong UTF-8 string made by libarchive 2.x into current |
3473 | | * locale character set and appends to the archive_string. |
3474 | | * Note: returns -1 if conversion fails. |
3475 | | */ |
3476 | | static int |
3477 | | strncat_from_utf8_libarchive2(struct archive_string *as, |
3478 | | const void *_p, size_t len, struct archive_string_conv *sc) |
3479 | 0 | { |
3480 | 0 | const char *s; |
3481 | 0 | int n; |
3482 | 0 | char *p; |
3483 | 0 | char *end; |
3484 | 0 | uint32_t unicode; |
3485 | 0 | #if HAVE_WCRTOMB |
3486 | 0 | mbstate_t shift_state; |
3487 | |
|
3488 | 0 | memset(&shift_state, 0, sizeof(shift_state)); |
3489 | | #else |
3490 | | /* Clear the shift state before starting. */ |
3491 | | wctomb(NULL, L'\0'); |
3492 | | #endif |
3493 | 0 | (void)sc; /* UNUSED */ |
3494 | | /* |
3495 | | * Allocate buffer for MBS. |
3496 | | * We need this allocation here since it is possible that |
3497 | | * as->s is still NULL. |
3498 | | */ |
3499 | 0 | if (archive_string_ensure(as, as->length + len + 1) == NULL) |
3500 | 0 | return (-1); |
3501 | | |
3502 | 0 | s = (const char *)_p; |
3503 | 0 | p = as->s + as->length; |
3504 | 0 | end = as->s + as->buffer_length - MB_CUR_MAX -1; |
3505 | 0 | while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) { |
3506 | 0 | wchar_t wc; |
3507 | |
|
3508 | 0 | if (p >= end) { |
3509 | 0 | as->length = p - as->s; |
3510 | | /* Re-allocate buffer for MBS. */ |
3511 | 0 | if (archive_string_ensure(as, |
3512 | 0 | as->length + max(len * 2, |
3513 | 0 | (size_t)MB_CUR_MAX) + 1) == NULL) |
3514 | 0 | return (-1); |
3515 | 0 | p = as->s + as->length; |
3516 | 0 | end = as->s + as->buffer_length - MB_CUR_MAX -1; |
3517 | 0 | } |
3518 | | |
3519 | | /* |
3520 | | * As libarchive 2.x, translates the UTF-8 characters into |
3521 | | * wide-characters in the assumption that WCS is Unicode. |
3522 | | */ |
3523 | 0 | if (n < 0) { |
3524 | 0 | n *= -1; |
3525 | 0 | wc = L'?'; |
3526 | 0 | } else |
3527 | 0 | wc = (wchar_t)unicode; |
3528 | |
|
3529 | 0 | s += n; |
3530 | 0 | len -= n; |
3531 | | /* |
3532 | | * Translates the wide-character into the current locale MBS. |
3533 | | */ |
3534 | 0 | #if HAVE_WCRTOMB |
3535 | 0 | n = (int)wcrtomb(p, wc, &shift_state); |
3536 | | #else |
3537 | | n = (int)wctomb(p, wc); |
3538 | | #endif |
3539 | 0 | if (n == -1) |
3540 | 0 | return (-1); |
3541 | 0 | p += n; |
3542 | 0 | } |
3543 | 0 | as->length = p - as->s; |
3544 | 0 | as->s[as->length] = '\0'; |
3545 | 0 | return (0); |
3546 | 0 | } |
3547 | | |
3548 | | |
3549 | | /* |
3550 | | * Conversion functions between current locale dependent MBS and UTF-16BE. |
3551 | | * strncat_from_utf16be() : UTF-16BE --> MBS |
3552 | | * strncat_to_utf16be() : MBS --> UTF-16BE |
3553 | | */ |
3554 | | |
3555 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
3556 | | |
3557 | | /* |
3558 | | * Convert a UTF-16BE/LE string to current locale and copy the result. |
3559 | | * Return -1 if conversion fails. |
3560 | | */ |
3561 | | static int |
3562 | | win_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes, |
3563 | | struct archive_string_conv *sc, int be) |
3564 | | { |
3565 | | struct archive_string tmp; |
3566 | | const char *u16; |
3567 | | BOOL defchar; |
3568 | | char *mbs; |
3569 | | size_t mbs_size, b, ll; |
3570 | | int ret = 0; |
3571 | | |
3572 | | bytes &= ~1; |
3573 | | if (archive_string_ensure(as, as->length + bytes +1) == NULL) |
3574 | | return (-1); |
3575 | | |
3576 | | mbs = as->s + as->length; |
3577 | | mbs_size = as->buffer_length - as->length -1; |
3578 | | |
3579 | | if (sc->to_cp == CP_C_LOCALE) { |
3580 | | /* |
3581 | | * "C" locale special processing. |
3582 | | */ |
3583 | | u16 = _p; |
3584 | | ll = 0; |
3585 | | for (b = 0; b < bytes; b += 2) { |
3586 | | uint16_t val; |
3587 | | if (be) |
3588 | | val = archive_be16dec(u16+b); |
3589 | | else |
3590 | | val = archive_le16dec(u16+b); |
3591 | | if (val > 255) { |
3592 | | *mbs++ = '?'; |
3593 | | ret = -1; |
3594 | | } else |
3595 | | *mbs++ = (char)(val&0xff); |
3596 | | ll++; |
3597 | | } |
3598 | | as->length += ll; |
3599 | | as->s[as->length] = '\0'; |
3600 | | return (ret); |
3601 | | } |
3602 | | |
3603 | | archive_string_init(&tmp); |
3604 | | if (be) { |
3605 | | if (IS_BIG_ENDIAN) { |
3606 | | u16 = _p; |
3607 | | } else { |
3608 | | if (archive_string_ensure(&tmp, bytes+2) == NULL) |
3609 | | return (-1); |
3610 | | memcpy(tmp.s, _p, bytes); |
3611 | | for (b = 0; b < bytes; b += 2) { |
3612 | | uint16_t val = archive_be16dec(tmp.s+b); |
3613 | | archive_le16enc(tmp.s+b, val); |
3614 | | } |
3615 | | u16 = tmp.s; |
3616 | | } |
3617 | | } else { |
3618 | | if (!IS_BIG_ENDIAN) { |
3619 | | u16 = _p; |
3620 | | } else { |
3621 | | if (archive_string_ensure(&tmp, bytes+2) == NULL) |
3622 | | return (-1); |
3623 | | memcpy(tmp.s, _p, bytes); |
3624 | | for (b = 0; b < bytes; b += 2) { |
3625 | | uint16_t val = archive_le16dec(tmp.s+b); |
3626 | | archive_be16enc(tmp.s+b, val); |
3627 | | } |
3628 | | u16 = tmp.s; |
3629 | | } |
3630 | | } |
3631 | | |
3632 | | do { |
3633 | | int r; |
3634 | | defchar = 0; |
3635 | | /* WideCharToMultiByte is limited to int. */ |
3636 | | if (bytes > (size_t)INT_MAX || mbs_size > (size_t)INT_MAX) |
3637 | | return (-1); |
3638 | | r = WideCharToMultiByte(sc->to_cp, 0, |
3639 | | (LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size, |
3640 | | NULL, &defchar); |
3641 | | /* Exit loop if we succeeded */ |
3642 | | if (r != 0 || |
3643 | | GetLastError() != ERROR_INSUFFICIENT_BUFFER) { |
3644 | | ll = (size_t)r; |
3645 | | break; |
3646 | | } |
3647 | | /* Else expand buffer and loop to try again. */ |
3648 | | r = WideCharToMultiByte(sc->to_cp, 0, |
3649 | | (LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL); |
3650 | | ll = (size_t)r; |
3651 | | if (archive_string_ensure(as, ll +1) == NULL) |
3652 | | return (-1); |
3653 | | mbs = as->s + as->length; |
3654 | | mbs_size = as->buffer_length - as->length -1; |
3655 | | } while (1); |
3656 | | archive_string_free(&tmp); |
3657 | | as->length += ll; |
3658 | | as->s[as->length] = '\0'; |
3659 | | if (ll == 0 || defchar) |
3660 | | ret = -1; |
3661 | | return (ret); |
3662 | | } |
3663 | | |
3664 | | static int |
3665 | | win_strncat_from_utf16be(struct archive_string *as, const void *_p, |
3666 | | size_t bytes, struct archive_string_conv *sc) |
3667 | | { |
3668 | | return (win_strncat_from_utf16(as, _p, bytes, sc, 1)); |
3669 | | } |
3670 | | |
3671 | | static int |
3672 | | win_strncat_from_utf16le(struct archive_string *as, const void *_p, |
3673 | | size_t bytes, struct archive_string_conv *sc) |
3674 | | { |
3675 | | return (win_strncat_from_utf16(as, _p, bytes, sc, 0)); |
3676 | | } |
3677 | | |
3678 | | /* |
3679 | | * Convert a current locale string to UTF-16BE/LE and copy the result. |
3680 | | * Return -1 if conversion fails. |
3681 | | */ |
3682 | | static int |
3683 | | win_strncat_to_utf16(struct archive_string *as16, const void *_p, |
3684 | | size_t length, struct archive_string_conv *sc, int bigendian) |
3685 | | { |
3686 | | const char *s = (const char *)_p; |
3687 | | char *u16; |
3688 | | size_t count, avail; |
3689 | | |
3690 | | if (archive_string_ensure(as16, |
3691 | | as16->length + (length + 1) * 2) == NULL) |
3692 | | return (-1); |
3693 | | |
3694 | | u16 = as16->s + as16->length; |
3695 | | avail = as16->buffer_length - 2; |
3696 | | if (sc->from_cp == CP_C_LOCALE) { |
3697 | | /* |
3698 | | * "C" locale special processing. |
3699 | | */ |
3700 | | count = 0; |
3701 | | while (count < length && *s) { |
3702 | | if (bigendian) |
3703 | | archive_be16enc(u16, *s); |
3704 | | else |
3705 | | archive_le16enc(u16, *s); |
3706 | | u16 += 2; |
3707 | | s++; |
3708 | | count++; |
3709 | | } |
3710 | | as16->length += count << 1; |
3711 | | as16->s[as16->length] = 0; |
3712 | | as16->s[as16->length+1] = 0; |
3713 | | return (0); |
3714 | | } |
3715 | | do { |
3716 | | int r; |
3717 | | if (length > (size_t)INT_MAX || (avail >> 1) > (size_t)INT_MAX) |
3718 | | return (-1); |
3719 | | r = MultiByteToWideChar(sc->from_cp, |
3720 | | MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1); |
3721 | | /* Exit loop if we succeeded */ |
3722 | | if (r != 0 || |
3723 | | GetLastError() != ERROR_INSUFFICIENT_BUFFER) { |
3724 | | count = (size_t)r; |
3725 | | break; |
3726 | | } |
3727 | | /* Expand buffer and try again */ |
3728 | | r = MultiByteToWideChar(sc->from_cp, |
3729 | | MB_PRECOMPOSED, s, (int)length, NULL, 0); |
3730 | | count = (size_t)r; |
3731 | | if (archive_string_ensure(as16, (count +1) * 2) |
3732 | | == NULL) |
3733 | | return (-1); |
3734 | | u16 = as16->s + as16->length; |
3735 | | avail = as16->buffer_length - 2; |
3736 | | } while (1); |
3737 | | as16->length += count * 2; |
3738 | | as16->s[as16->length] = 0; |
3739 | | as16->s[as16->length+1] = 0; |
3740 | | if (count == 0) |
3741 | | return (-1); |
3742 | | |
3743 | | if (IS_BIG_ENDIAN) { |
3744 | | if (!bigendian) { |
3745 | | while (count > 0) { |
3746 | | uint16_t v = archive_be16dec(u16); |
3747 | | archive_le16enc(u16, v); |
3748 | | u16 += 2; |
3749 | | count--; |
3750 | | } |
3751 | | } |
3752 | | } else { |
3753 | | if (bigendian) { |
3754 | | while (count > 0) { |
3755 | | uint16_t v = archive_le16dec(u16); |
3756 | | archive_be16enc(u16, v); |
3757 | | u16 += 2; |
3758 | | count--; |
3759 | | } |
3760 | | } |
3761 | | } |
3762 | | return (0); |
3763 | | } |
3764 | | |
3765 | | static int |
3766 | | win_strncat_to_utf16be(struct archive_string *as16, const void *_p, |
3767 | | size_t length, struct archive_string_conv *sc) |
3768 | | { |
3769 | | return (win_strncat_to_utf16(as16, _p, length, sc, 1)); |
3770 | | } |
3771 | | |
3772 | | static int |
3773 | | win_strncat_to_utf16le(struct archive_string *as16, const void *_p, |
3774 | | size_t length, struct archive_string_conv *sc) |
3775 | | { |
3776 | | return (win_strncat_to_utf16(as16, _p, length, sc, 0)); |
3777 | | } |
3778 | | |
3779 | | #endif /* _WIN32 && !__CYGWIN__ */ |
3780 | | |
3781 | | /* |
3782 | | * Best-effort conversions. |
3783 | | * Without iconv we cannot handle the UTF-16BE character set in general, |
3784 | | * but the conversion can still succeed if the string consists only of |
3785 | | * ASCII characters or the current locale is UTF-8. |
3786 | | */ |
3787 | | |
3788 | | /* |
3789 | | * Convert a UTF-16BE string to current locale and copy the result. |
3790 | | * Return -1 if conversion fails. |
3791 | | */ |
3792 | | static int |
3793 | | best_effort_strncat_from_utf16(struct archive_string *as, const void *_p, |
3794 | | size_t bytes, struct archive_string_conv *sc, int be) |
3795 | 0 | { |
3796 | 0 | const char *utf16 = (const char *)_p; |
3797 | 0 | char *mbs; |
3798 | 0 | uint32_t uc; |
3799 | 0 | int n, ret; |
3800 | |
|
3801 | 0 | (void)sc; /* UNUSED */ |
3802 | | /* |
3803 | | * Other case, we should do the best effort. |
3804 | | * If all character are ASCII(<0x7f), we can convert it. |
3805 | | * if not , we set a alternative character and return -1. |
3806 | | */ |
3807 | 0 | ret = 0; |
3808 | 0 | if (archive_string_ensure(as, as->length + bytes +1) == NULL) |
3809 | 0 | return (-1); |
3810 | 0 | mbs = as->s + as->length; |
3811 | |
|
3812 | 0 | while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) { |
3813 | 0 | if (n < 0) { |
3814 | 0 | n *= -1; |
3815 | 0 | ret = -1; |
3816 | 0 | } |
3817 | 0 | bytes -= n; |
3818 | 0 | utf16 += n; |
3819 | |
|
3820 | 0 | if (uc > 127) { |
3821 | | /* We cannot handle it. */ |
3822 | 0 | *mbs++ = '?'; |
3823 | 0 | ret = -1; |
3824 | 0 | } else |
3825 | 0 | *mbs++ = (char)uc; |
3826 | 0 | } |
3827 | 0 | as->length = mbs - as->s; |
3828 | 0 | as->s[as->length] = '\0'; |
3829 | 0 | return (ret); |
3830 | 0 | } |
3831 | | |
3832 | | static int |
3833 | | best_effort_strncat_from_utf16be(struct archive_string *as, const void *_p, |
3834 | | size_t bytes, struct archive_string_conv *sc) |
3835 | 0 | { |
3836 | 0 | return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 1)); |
3837 | 0 | } |
3838 | | |
3839 | | static int |
3840 | | best_effort_strncat_from_utf16le(struct archive_string *as, const void *_p, |
3841 | | size_t bytes, struct archive_string_conv *sc) |
3842 | 0 | { |
3843 | 0 | return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 0)); |
3844 | 0 | } |
3845 | | |
3846 | | /* |
3847 | | * Convert a current locale string to UTF-16BE/LE and copy the result. |
3848 | | * Return -1 if conversion fails. |
3849 | | */ |
3850 | | static int |
3851 | | best_effort_strncat_to_utf16(struct archive_string *as16, const void *_p, |
3852 | | size_t length, struct archive_string_conv *sc, int bigendian) |
3853 | 0 | { |
3854 | 0 | const char *s = (const char *)_p; |
3855 | 0 | char *utf16; |
3856 | 0 | size_t remaining; |
3857 | 0 | int ret; |
3858 | |
|
3859 | 0 | (void)sc; /* UNUSED */ |
3860 | | /* |
3861 | | * Other case, we should do the best effort. |
3862 | | * If all character are ASCII(<0x7f), we can convert it. |
3863 | | * if not , we set a alternative character and return -1. |
3864 | | */ |
3865 | 0 | ret = 0; |
3866 | 0 | remaining = length; |
3867 | |
|
3868 | 0 | if (archive_string_ensure(as16, |
3869 | 0 | as16->length + (length + 1) * 2) == NULL) |
3870 | 0 | return (-1); |
3871 | | |
3872 | 0 | utf16 = as16->s + as16->length; |
3873 | 0 | while (remaining--) { |
3874 | 0 | unsigned c = *s++; |
3875 | 0 | if (c > 127) { |
3876 | | /* We cannot handle it. */ |
3877 | 0 | c = UNICODE_R_CHAR; |
3878 | 0 | ret = -1; |
3879 | 0 | } |
3880 | 0 | if (bigendian) |
3881 | 0 | archive_be16enc(utf16, (uint16_t)c); |
3882 | 0 | else |
3883 | 0 | archive_le16enc(utf16, (uint16_t)c); |
3884 | 0 | utf16 += 2; |
3885 | 0 | } |
3886 | 0 | as16->length = utf16 - as16->s; |
3887 | 0 | as16->s[as16->length] = 0; |
3888 | 0 | as16->s[as16->length+1] = 0; |
3889 | 0 | return (ret); |
3890 | 0 | } |
3891 | | |
3892 | | static int |
3893 | | best_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p, |
3894 | | size_t length, struct archive_string_conv *sc) |
3895 | 0 | { |
3896 | 0 | return (best_effort_strncat_to_utf16(as16, _p, length, sc, 1)); |
3897 | 0 | } |
3898 | | |
3899 | | static int |
3900 | | best_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p, |
3901 | | size_t length, struct archive_string_conv *sc) |
3902 | 0 | { |
3903 | 0 | return (best_effort_strncat_to_utf16(as16, _p, length, sc, 0)); |
3904 | 0 | } |
3905 | | |
3906 | | |
3907 | | /* |
3908 | | * Multistring operations. |
3909 | | */ |
3910 | | |
3911 | | void |
3912 | | archive_mstring_clean(struct archive_mstring *aes) |
3913 | 272k | { |
3914 | 272k | archive_wstring_free(&(aes->aes_wcs)); |
3915 | 272k | archive_string_free(&(aes->aes_mbs)); |
3916 | 272k | archive_string_free(&(aes->aes_utf8)); |
3917 | 272k | archive_string_free(&(aes->aes_mbs_in_locale)); |
3918 | 272k | aes->aes_set = 0; |
3919 | 272k | } |
3920 | | |
3921 | | void |
3922 | | archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src) |
3923 | 27.9k | { |
3924 | 27.9k | dest->aes_set = src->aes_set; |
3925 | 27.9k | archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs)); |
3926 | 27.9k | archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8)); |
3927 | 27.9k | archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs)); |
3928 | 27.9k | } |
3929 | | |
3930 | | int |
3931 | | archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes, |
3932 | | const char **p) |
3933 | 0 | { |
3934 | 0 | struct archive_string_conv *sc; |
3935 | 0 | int r; |
3936 | | |
3937 | | /* If we already have a UTF8 form, return that immediately. */ |
3938 | 0 | if (aes->aes_set & AES_SET_UTF8) { |
3939 | 0 | *p = aes->aes_utf8.s; |
3940 | 0 | return (0); |
3941 | 0 | } |
3942 | | |
3943 | 0 | *p = NULL; |
3944 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
3945 | | /* |
3946 | | * On Windows, first try converting from WCS because (1) there's no |
3947 | | * guarantee that the conversion to MBS will succeed, e.g. when using |
3948 | | * CP_ACP, and (2) that's more efficient than converting to MBS, just to |
3949 | | * convert back to WCS again before finally converting to UTF-8 |
3950 | | */ |
3951 | | if ((aes->aes_set & AES_SET_WCS) != 0) { |
3952 | | sc = archive_string_conversion_to_charset(a, "UTF-8", 1); |
3953 | | if (sc == NULL) |
3954 | | return (-1);/* Couldn't allocate memory for sc. */ |
3955 | | archive_string_empty(&(aes->aes_utf8)); |
3956 | | r = archive_string_append_from_wcs_in_codepage(&(aes->aes_utf8), |
3957 | | aes->aes_wcs.s, aes->aes_wcs.length, sc); |
3958 | | if (a == NULL) |
3959 | | free_sconv_object(sc); |
3960 | | if (r == 0) { |
3961 | | aes->aes_set |= AES_SET_UTF8; |
3962 | | *p = aes->aes_utf8.s; |
3963 | | return (0);/* success. */ |
3964 | | } else |
3965 | | return (-1);/* failure. */ |
3966 | | } |
3967 | | #endif |
3968 | | /* Try converting WCS to MBS first if MBS does not exist yet. */ |
3969 | 0 | if ((aes->aes_set & AES_SET_MBS) == 0) { |
3970 | 0 | const char *pm; /* unused */ |
3971 | 0 | archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */ |
3972 | 0 | } |
3973 | 0 | if (aes->aes_set & AES_SET_MBS) { |
3974 | 0 | sc = archive_string_conversion_to_charset(a, "UTF-8", 1); |
3975 | 0 | if (sc == NULL) |
3976 | 0 | return (-1);/* Couldn't allocate memory for sc. */ |
3977 | 0 | r = archive_strncpy_l(&(aes->aes_utf8), aes->aes_mbs.s, |
3978 | 0 | aes->aes_mbs.length, sc); |
3979 | 0 | if (a == NULL) |
3980 | 0 | free_sconv_object(sc); |
3981 | 0 | if (r == 0) { |
3982 | 0 | aes->aes_set |= AES_SET_UTF8; |
3983 | 0 | *p = aes->aes_utf8.s; |
3984 | 0 | return (0);/* success. */ |
3985 | 0 | } else |
3986 | 0 | return (-1);/* failure. */ |
3987 | 0 | } |
3988 | 0 | return (0);/* success. */ |
3989 | 0 | } |
3990 | | |
3991 | | int |
3992 | | archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes, |
3993 | | const char **p) |
3994 | 18.6k | { |
3995 | 18.6k | struct archive_string_conv *sc; |
3996 | 18.6k | int r, ret = 0; |
3997 | | |
3998 | | /* If we already have an MBS form, return that immediately. */ |
3999 | 18.6k | if (aes->aes_set & AES_SET_MBS) { |
4000 | 17.8k | *p = aes->aes_mbs.s; |
4001 | 17.8k | return (ret); |
4002 | 17.8k | } |
4003 | | |
4004 | 746 | *p = NULL; |
4005 | | /* If there's a WCS form, try converting with the native locale. */ |
4006 | 746 | if (aes->aes_set & AES_SET_WCS) { |
4007 | 230 | archive_string_empty(&(aes->aes_mbs)); |
4008 | 230 | r = archive_string_append_from_wcs(&(aes->aes_mbs), |
4009 | 230 | aes->aes_wcs.s, aes->aes_wcs.length); |
4010 | 230 | *p = aes->aes_mbs.s; |
4011 | 230 | if (r == 0) { |
4012 | 230 | aes->aes_set |= AES_SET_MBS; |
4013 | 230 | return (ret); |
4014 | 230 | } else |
4015 | 0 | ret = -1; |
4016 | 230 | } |
4017 | | |
4018 | | /* If there's a UTF-8 form, try converting with the native locale. */ |
4019 | 516 | if (aes->aes_set & AES_SET_UTF8) { |
4020 | 0 | archive_string_empty(&(aes->aes_mbs)); |
4021 | 0 | sc = archive_string_conversion_from_charset(a, "UTF-8", 1); |
4022 | 0 | if (sc == NULL) |
4023 | 0 | return (-1);/* Couldn't allocate memory for sc. */ |
4024 | 0 | r = archive_strncpy_l(&(aes->aes_mbs), |
4025 | 0 | aes->aes_utf8.s, aes->aes_utf8.length, sc); |
4026 | 0 | if (a == NULL) |
4027 | 0 | free_sconv_object(sc); |
4028 | 0 | *p = aes->aes_mbs.s; |
4029 | 0 | if (r == 0) { |
4030 | 0 | aes->aes_set |= AES_SET_MBS; |
4031 | 0 | ret = 0;/* success; overwrite previous error. */ |
4032 | 0 | } else |
4033 | 0 | ret = -1;/* failure. */ |
4034 | 0 | } |
4035 | 516 | return (ret); |
4036 | 516 | } |
4037 | | |
4038 | | int |
4039 | | archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes, |
4040 | | const wchar_t **wp) |
4041 | 5.71k | { |
4042 | 5.71k | int r, ret = 0; |
4043 | | |
4044 | 5.71k | (void)a;/* UNUSED */ |
4045 | | /* Return WCS form if we already have it. */ |
4046 | 5.71k | if (aes->aes_set & AES_SET_WCS) { |
4047 | 1.34k | *wp = aes->aes_wcs.s; |
4048 | 1.34k | return (ret); |
4049 | 1.34k | } |
4050 | | |
4051 | 4.37k | *wp = NULL; |
4052 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
4053 | | /* |
4054 | | * On Windows, prefer converting from UTF-8 directly to WCS because: |
4055 | | * (1) there's no guarantee that the string can be represented in MBS (e.g. |
4056 | | * with CP_ACP), and (2) in order to convert from UTF-8 to MBS, we're going |
4057 | | * to need to convert from UTF-8 to WCS anyway and its wasteful to throw |
4058 | | * away that intermediate result |
4059 | | */ |
4060 | | if (aes->aes_set & AES_SET_UTF8) { |
4061 | | struct archive_string_conv *sc; |
4062 | | |
4063 | | sc = archive_string_conversion_from_charset(a, "UTF-8", 1); |
4064 | | if (sc != NULL) { |
4065 | | archive_wstring_empty((&aes->aes_wcs)); |
4066 | | r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), |
4067 | | aes->aes_utf8.s, aes->aes_utf8.length, sc); |
4068 | | if (a == NULL) |
4069 | | free_sconv_object(sc); |
4070 | | if (r == 0) { |
4071 | | aes->aes_set |= AES_SET_WCS; |
4072 | | *wp = aes->aes_wcs.s; |
4073 | | return (0); |
4074 | | } |
4075 | | } |
4076 | | } |
4077 | | #endif |
4078 | | /* Try converting UTF8 to MBS first if MBS does not exist yet. */ |
4079 | 4.37k | if ((aes->aes_set & AES_SET_MBS) == 0) { |
4080 | 292 | const char *p; /* unused */ |
4081 | 292 | archive_mstring_get_mbs(a, aes, &p); /* ignore errors, we'll handle it later */ |
4082 | 292 | } |
4083 | | /* Try converting MBS to WCS using native locale. */ |
4084 | 4.37k | if (aes->aes_set & AES_SET_MBS) { |
4085 | 4.08k | archive_wstring_empty(&(aes->aes_wcs)); |
4086 | 4.08k | r = archive_wstring_append_from_mbs(&(aes->aes_wcs), |
4087 | 4.08k | aes->aes_mbs.s, aes->aes_mbs.length); |
4088 | 4.08k | if (r == 0) { |
4089 | 3.08k | aes->aes_set |= AES_SET_WCS; |
4090 | 3.08k | *wp = aes->aes_wcs.s; |
4091 | 3.08k | } else |
4092 | 996 | ret = -1;/* failure. */ |
4093 | 4.08k | } |
4094 | 4.37k | return (ret); |
4095 | 5.71k | } |
4096 | | |
4097 | | int |
4098 | | archive_mstring_get_mbs_l(struct archive *a, struct archive_mstring *aes, |
4099 | | const char **p, size_t *length, struct archive_string_conv *sc) |
4100 | 0 | { |
4101 | 0 | int ret = 0; |
4102 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
4103 | | int r; |
4104 | | |
4105 | | /* |
4106 | | * Internationalization programming on Windows must use Wide |
4107 | | * characters because Windows platform cannot make locale UTF-8. |
4108 | | */ |
4109 | | if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) { |
4110 | | archive_string_empty(&(aes->aes_mbs_in_locale)); |
4111 | | r = archive_string_append_from_wcs_in_codepage( |
4112 | | &(aes->aes_mbs_in_locale), aes->aes_wcs.s, |
4113 | | aes->aes_wcs.length, sc); |
4114 | | if (r == 0) { |
4115 | | *p = aes->aes_mbs_in_locale.s; |
4116 | | if (length != NULL) |
4117 | | *length = aes->aes_mbs_in_locale.length; |
4118 | | return (0); |
4119 | | } else if (errno == ENOMEM) |
4120 | | return (-1); |
4121 | | else |
4122 | | ret = -1; |
4123 | | } |
4124 | | #endif |
4125 | | |
4126 | | /* If there is not an MBS form but there is a WCS or UTF8 form, try converting |
4127 | | * with the native locale to be used for translating it to specified |
4128 | | * character-set. */ |
4129 | 0 | if ((aes->aes_set & AES_SET_MBS) == 0) { |
4130 | 0 | const char *pm; /* unused */ |
4131 | 0 | archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */ |
4132 | 0 | } |
4133 | | /* If we already have an MBS form, use it to be translated to |
4134 | | * specified character-set. */ |
4135 | 0 | if (aes->aes_set & AES_SET_MBS) { |
4136 | 0 | if (sc == NULL) { |
4137 | | /* Conversion is unneeded. */ |
4138 | 0 | *p = aes->aes_mbs.s; |
4139 | 0 | if (length != NULL) |
4140 | 0 | *length = aes->aes_mbs.length; |
4141 | 0 | return (0); |
4142 | 0 | } |
4143 | 0 | ret = archive_strncpy_l(&(aes->aes_mbs_in_locale), |
4144 | 0 | aes->aes_mbs.s, aes->aes_mbs.length, sc); |
4145 | 0 | *p = aes->aes_mbs_in_locale.s; |
4146 | 0 | if (length != NULL) |
4147 | 0 | *length = aes->aes_mbs_in_locale.length; |
4148 | 0 | } else { |
4149 | 0 | *p = NULL; |
4150 | 0 | if (length != NULL) |
4151 | 0 | *length = 0; |
4152 | 0 | } |
4153 | 0 | return (ret); |
4154 | 0 | } |
4155 | | |
4156 | | int |
4157 | | archive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs) |
4158 | 2.68k | { |
4159 | 2.68k | if (mbs == NULL) { |
4160 | 22 | aes->aes_set = 0; |
4161 | 22 | return (0); |
4162 | 22 | } |
4163 | 2.65k | return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs))); |
4164 | 2.68k | } |
4165 | | |
4166 | | int |
4167 | | archive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs, |
4168 | | size_t len) |
4169 | 2.65k | { |
4170 | 2.65k | if (mbs == NULL) { |
4171 | 0 | aes->aes_set = 0; |
4172 | 0 | return (0); |
4173 | 0 | } |
4174 | 2.65k | aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */ |
4175 | 2.65k | archive_strncpy(&(aes->aes_mbs), mbs, len); |
4176 | 2.65k | archive_string_empty(&(aes->aes_utf8)); |
4177 | 2.65k | archive_wstring_empty(&(aes->aes_wcs)); |
4178 | 2.65k | return (0); |
4179 | 2.65k | } |
4180 | | |
/*
 * Store the NUL-terminated wide string `wcs' in `aes' by handing it to
 * archive_mstring_copy_wcs_len().  A NULL `wcs' is passed on with a
 * length of 0.
 */
int
archive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs)
{
	size_t len = 0;

	if (wcs != NULL)
		len = wcslen(wcs);
	return archive_mstring_copy_wcs_len(aes, wcs, len);
}
4187 | | |
4188 | | int |
4189 | | archive_mstring_copy_utf8(struct archive_mstring *aes, const char *utf8) |
4190 | 0 | { |
4191 | 0 | if (utf8 == NULL) { |
4192 | 0 | aes->aes_set = 0; |
4193 | 0 | return (0); |
4194 | 0 | } |
4195 | 0 | aes->aes_set = AES_SET_UTF8; |
4196 | 0 | archive_string_empty(&(aes->aes_mbs)); |
4197 | 0 | archive_string_empty(&(aes->aes_wcs)); |
4198 | 0 | archive_strncpy(&(aes->aes_utf8), utf8, strlen(utf8)); |
4199 | 0 | return (int)strlen(utf8); |
4200 | 0 | } |
4201 | | |
4202 | | int |
4203 | | archive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs, |
4204 | | size_t len) |
4205 | 210 | { |
4206 | 210 | if (wcs == NULL) { |
4207 | 0 | aes->aes_set = 0; |
4208 | 0 | return (0); |
4209 | 0 | } |
4210 | 210 | aes->aes_set = AES_SET_WCS; /* Only WCS form set. */ |
4211 | 210 | archive_string_empty(&(aes->aes_mbs)); |
4212 | 210 | archive_string_empty(&(aes->aes_utf8)); |
4213 | 210 | archive_wstrncpy(&(aes->aes_wcs), wcs, len); |
4214 | 210 | return (0); |
4215 | 210 | } |
4216 | | |
4217 | | int |
4218 | | archive_mstring_copy_mbs_len_l(struct archive_mstring *aes, |
4219 | | const char *mbs, size_t len, struct archive_string_conv *sc) |
4220 | 5.47k | { |
4221 | 5.47k | int r; |
4222 | | |
4223 | 5.47k | if (mbs == NULL) { |
4224 | 24 | aes->aes_set = 0; |
4225 | 24 | return (0); |
4226 | 24 | } |
4227 | 5.45k | archive_string_empty(&(aes->aes_mbs)); |
4228 | 5.45k | archive_wstring_empty(&(aes->aes_wcs)); |
4229 | 5.45k | archive_string_empty(&(aes->aes_utf8)); |
4230 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
4231 | | /* |
4232 | | * Internationalization programming on Windows must use Wide |
4233 | | * characters because Windows platform cannot make locale UTF-8. |
4234 | | */ |
4235 | | if (sc == NULL) { |
4236 | | if (archive_string_append(&(aes->aes_mbs), |
4237 | | mbs, mbsnbytes(mbs, len)) == NULL) { |
4238 | | aes->aes_set = 0; |
4239 | | r = -1; |
4240 | | } else { |
4241 | | aes->aes_set = AES_SET_MBS; |
4242 | | r = 0; |
4243 | | } |
4244 | | #if defined(HAVE_ICONV) |
4245 | | } else if (sc != NULL && sc->cd_w != (iconv_t)-1) { |
4246 | | /* |
4247 | | * This case happens only when MultiByteToWideChar() cannot |
4248 | | * handle sc->from_cp, and we have to iconv in order to |
4249 | | * translate character-set to wchar_t,UTF-16. |
4250 | | */ |
4251 | | iconv_t cd = sc->cd; |
4252 | | unsigned from_cp; |
4253 | | int flag; |
4254 | | |
4255 | | /* |
4256 | | * Translate multi-bytes from some character-set to UTF-8. |
4257 | | */ |
4258 | | sc->cd = sc->cd_w; |
4259 | | r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc); |
4260 | | sc->cd = cd; |
4261 | | if (r != 0) { |
4262 | | aes->aes_set = 0; |
4263 | | return (r); |
4264 | | } |
4265 | | aes->aes_set = AES_SET_UTF8; |
4266 | | |
4267 | | /* |
4268 | | * Append the UTF-8 string into wstring. |
4269 | | */ |
4270 | | flag = sc->flag; |
4271 | | sc->flag &= ~(SCONV_NORMALIZATION_C |
4272 | | | SCONV_TO_UTF16| SCONV_FROM_UTF16); |
4273 | | from_cp = sc->from_cp; |
4274 | | sc->from_cp = CP_UTF8; |
4275 | | r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), |
4276 | | aes->aes_utf8.s, aes->aes_utf8.length, sc); |
4277 | | sc->flag = flag; |
4278 | | sc->from_cp = from_cp; |
4279 | | if (r == 0) |
4280 | | aes->aes_set |= AES_SET_WCS; |
4281 | | #endif |
4282 | | } else { |
4283 | | r = archive_wstring_append_from_mbs_in_codepage( |
4284 | | &(aes->aes_wcs), mbs, len, sc); |
4285 | | if (r == 0) |
4286 | | aes->aes_set = AES_SET_WCS; |
4287 | | else |
4288 | | aes->aes_set = 0; |
4289 | | } |
4290 | | #else |
4291 | 5.45k | r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc); |
4292 | 5.45k | if (r == 0) |
4293 | 4.59k | aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */ |
4294 | 856 | else |
4295 | 856 | aes->aes_set = 0; |
4296 | 5.45k | #endif |
4297 | 5.45k | return (r); |
4298 | 5.47k | } |
4299 | | |
4300 | | /* |
4301 | | * The 'update' form tries to proactively update all forms of |
4302 | | * this string (WCS and MBS) and returns an error if any of |
4303 | | * them fail. This is used by the 'pax' handler, for instance, |
4304 | | * to detect and report character-conversion failures early while |
4305 | | * still allowing clients to get potentially useful values from |
4306 | | * the more tolerant lazy conversions. (get_mbs and get_wcs will |
4307 | | * strive to give the user something useful, so you can get hopefully |
4308 | | * usable values even if some of the character conversions are failing.) |
4309 | | */ |
4310 | | int |
4311 | | archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes, |
4312 | | const char *utf8) |
4313 | 0 | { |
4314 | 0 | struct archive_string_conv *sc; |
4315 | 0 | int r; |
4316 | |
|
4317 | 0 | if (utf8 == NULL) { |
4318 | 0 | aes->aes_set = 0; |
4319 | 0 | return (0); /* Succeeded in clearing everything. */ |
4320 | 0 | } |
4321 | | |
4322 | | /* Save the UTF8 string. */ |
4323 | 0 | archive_strcpy(&(aes->aes_utf8), utf8); |
4324 | | |
4325 | | /* Empty the mbs and wcs strings. */ |
4326 | 0 | archive_string_empty(&(aes->aes_mbs)); |
4327 | 0 | archive_wstring_empty(&(aes->aes_wcs)); |
4328 | |
|
4329 | 0 | aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */ |
4330 | |
|
4331 | 0 | sc = archive_string_conversion_from_charset(a, "UTF-8", 1); |
4332 | 0 | if (sc == NULL) |
4333 | 0 | return (-1);/* Couldn't allocate memory for sc. */ |
4334 | | |
4335 | | #if defined(_WIN32) && !defined(__CYGWIN__) |
4336 | | /* On Windows, there's no good way to convert from UTF8 -> MBS directly, so |
4337 | | * prefer to first convert to WCS as (1) it's wasteful to throw away the |
4338 | | * intermediate result, and (2) WCS will still be set even if we fail to |
4339 | | * convert to MBS (e.g. with ACP that can't represent the characters) */ |
4340 | | r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), |
4341 | | aes->aes_utf8.s, aes->aes_utf8.length, sc); |
4342 | | |
4343 | | if (a == NULL) |
4344 | | free_sconv_object(sc); |
4345 | | if (r != 0) |
4346 | | return (-1); /* This will guarantee we can't convert to MBS */ |
4347 | | aes->aes_set = AES_SET_UTF8 | AES_SET_WCS; /* Both UTF8 and WCS set. */ |
4348 | | |
4349 | | /* Try converting WCS to MBS, return false on failure. */ |
4350 | | if (archive_string_append_from_wcs(&(aes->aes_mbs), aes->aes_wcs.s, |
4351 | | aes->aes_wcs.length)) |
4352 | | return (-1); |
4353 | | #else |
4354 | | /* Try converting UTF-8 to MBS, return false on failure. */ |
4355 | 0 | r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc); |
4356 | |
|
4357 | 0 | if (a == NULL) |
4358 | 0 | free_sconv_object(sc); |
4359 | 0 | if (r != 0) |
4360 | 0 | return (-1); |
4361 | 0 | aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */ |
4362 | | |
4363 | | /* Try converting MBS to WCS, return false on failure. */ |
4364 | 0 | if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s, |
4365 | 0 | aes->aes_mbs.length)) |
4366 | 0 | return (-1); |
4367 | 0 | #endif |
4368 | | |
4369 | | /* All conversions succeeded. */ |
4370 | 0 | aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS; |
4371 | |
|
4372 | 0 | return (0); |
4373 | 0 | } |