/src/gnupg/common/utf8conv.c
Line | Count | Source |
1 | | /* utf8conv.c - UTF8 character set conversion |
2 | | * Copyright (C) 1994, 1998, 1999, 2000, 2001, 2003, 2006, |
3 | | * 2008, 2010 Free Software Foundation, Inc. |
4 | | * |
5 | | * This file is part of GnuPG. |
6 | | * |
7 | | * GnuPG is free software; you can redistribute and/or modify this |
8 | | * part of GnuPG under the terms of either |
9 | | * |
10 | | * - the GNU Lesser General Public License as published by the Free |
11 | | * Software Foundation; either version 3 of the License, or (at |
12 | | * your option) any later version. |
13 | | * |
14 | | * or |
15 | | * |
16 | | * - the GNU General Public License as published by the Free |
17 | | * Software Foundation; either version 2 of the License, or (at |
18 | | * your option) any later version. |
19 | | * |
20 | | * or both in parallel, as here. |
21 | | * |
22 | | * GnuPG is distributed in the hope that it will be useful, but |
23 | | * WITHOUT ANY WARRANTY; without even the implied warranty of |
24 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
25 | | * General Public License for more details. |
26 | | * |
27 | | * You should have received a copies of the GNU General Public License |
28 | | * and the GNU Lesser General Public License along with this program; |
29 | | * if not, see <https://www.gnu.org/licenses/>. |
30 | | */ |
31 | | |
32 | | #include <config.h> |
33 | | #include <stdlib.h> |
34 | | #include <string.h> |
35 | | #include <stdarg.h> |
36 | | #include <ctype.h> |
37 | | #ifdef HAVE_LANGINFO_CODESET |
38 | | #include <langinfo.h> |
39 | | #endif |
40 | | #include <errno.h> |
41 | | |
42 | | #if HAVE_W32_SYSTEM |
43 | | # /* Tell libgpg-error to provide the iconv macros. */ |
44 | | # define GPGRT_ENABLE_W32_ICONV_MACROS 1 |
45 | | #elif HAVE_ANDROID_SYSTEM |
46 | | # /* No iconv support. */ |
47 | | #else |
48 | | # include <iconv.h> |
49 | | #endif |
50 | | |
51 | | |
52 | | #include "util.h" |
53 | | #include "common-defs.h" |
54 | | #include "i18n.h" |
55 | | #include "stringhelp.h" |
56 | | #include "utf8conv.h" |
57 | | |
58 | | #ifdef HAVE_W32_SYSTEM |
59 | | #include <windows.h> |
60 | | #endif |
61 | | |
62 | | #ifndef MB_LEN_MAX |
63 | 0 | #define MB_LEN_MAX 16 |
64 | | #endif |
65 | | |
66 | | static const char *active_charset_name = "iso-8859-1"; |
67 | | static int no_translation; /* Set to true if we let simply pass through. */ |
68 | | static int use_iconv; /* iconv conversion functions required. */ |
69 | | |
70 | | |
71 | | #ifdef HAVE_ANDROID_SYSTEM |
/* Android has no iconv.  The definitions below are placeholders with
   the usual iconv names so that the rest of this file compiles; none
   of them performs a conversion.  */
typedef void *iconv_t;
#define ICONV_CONST

/* Placeholder for iconv_open: both arguments are ignored and the
   failure value (iconv_t)(-1) is always returned.  */
static iconv_t
iconv_open (const char *tocode, const char *fromcode)
{
  const iconv_t no_descriptor = (iconv_t)(-1);

  (void)fromcode;
  (void)tocode;
  return no_descriptor;
}

/* Placeholder for iconv: all arguments are ignored, nothing is read
   or written, and 0 is returned.  */
static size_t
iconv (iconv_t cd, char **inbuf, size_t *inbytesleft,
       char **outbuf, size_t *outbytesleft)
{
  const size_t nothing_converted = 0;

  (void)outbytesleft;
  (void)outbuf;
  (void)inbytesleft;
  (void)inbuf;
  (void)cd;
  return nothing_converted;
}

/* Placeholder for iconv_close: CD is ignored and 0 is returned.  */
static int
iconv_close (iconv_t cd)
{
  const int ok = 0;

  (void)cd;
  return ok;
}
102 | | #endif /*HAVE_ANDROID_SYSTEM*/ |
103 | | |
104 | | |
105 | | /* Error handler for iconv failures. This is needed to not clutter the |
106 | | output with repeated diagnostics about a missing conversion. */ |
107 | | static void |
108 | | handle_iconv_error (const char *to, const char *from, int use_fallback) |
109 | 0 | { |
110 | 0 | if (errno == EINVAL) |
111 | 0 | { |
112 | 0 | static int shown1, shown2; |
113 | 0 | int x; |
114 | |
|
115 | 0 | if (to && !strcmp (to, "utf-8")) |
116 | 0 | { |
117 | 0 | x = shown1; |
118 | 0 | shown1 = 1; |
119 | 0 | } |
120 | 0 | else |
121 | 0 | { |
122 | 0 | x = shown2; |
123 | 0 | shown2 = 1; |
124 | 0 | } |
125 | |
|
126 | 0 | if (!x) |
127 | 0 | log_info (_("conversion from '%s' to '%s' not available\n"), |
128 | 0 | from, to); |
129 | 0 | } |
130 | 0 | else |
131 | 0 | { |
132 | 0 | static int shown; |
133 | |
|
134 | 0 | if (!shown) |
135 | 0 | log_info (_("iconv_open failed: %s\n"), strerror (errno)); |
136 | 0 | shown = 1; |
137 | 0 | } |
138 | |
|
139 | 0 | if (use_fallback) |
140 | 0 | { |
141 | | /* To avoid further error messages we fallback to UTF-8 for the |
142 | | native encoding. Nowadays this seems to be the best bet in |
143 | | case of errors from iconv or nl_langinfo. */ |
144 | 0 | active_charset_name = "utf-8"; |
145 | 0 | no_translation = 1; |
146 | 0 | use_iconv = 0; |
147 | 0 | } |
148 | 0 | } |
149 | | |
150 | | |
151 | | |
152 | | int |
153 | | set_native_charset (const char *newset) |
154 | 0 | { |
155 | 0 | const char *full_newset; |
156 | |
|
157 | 0 | if (!newset) |
158 | 0 | { |
159 | | #ifdef HAVE_ANDROID_SYSTEM |
160 | | newset = "utf-8"; |
161 | | #elif defined HAVE_W32_SYSTEM |
162 | | static char codepage[30]; |
163 | | unsigned int cpno; |
164 | | const char *aliases; |
165 | | |
166 | | /* We are a console program thus we need to use the |
167 | | GetConsoleOutputCP function and not the GetACP which |
168 | | would give the codepage for a GUI program. Note this is not |
169 | | a bulletproof detection because GetConsoleCP might return a |
170 | | different one for console input. Not sure how to cope with |
171 | | that. If the console Code page is not known we fall back to |
172 | | the system code page. */ |
173 | | cpno = GetConsoleOutputCP (); |
174 | | if (!cpno) |
175 | | cpno = GetACP (); |
176 | | sprintf (codepage, "CP%u", cpno ); |
177 | | /* Resolve alias. We use a long string string and not the usual |
178 | | array to optimize if the code is taken to a DSO. Taken from |
179 | | libiconv 1.9.2. */ |
180 | | newset = codepage; |
181 | | for (aliases = ("CP936" "\0" "GBK" "\0" |
182 | | "CP1361" "\0" "JOHAB" "\0" |
183 | | "CP20127" "\0" "ASCII" "\0" |
184 | | "CP20866" "\0" "KOI8-R" "\0" |
185 | | "CP21866" "\0" "KOI8-RU" "\0" |
186 | | "CP28591" "\0" "ISO-8859-1" "\0" |
187 | | "CP28592" "\0" "ISO-8859-2" "\0" |
188 | | "CP28593" "\0" "ISO-8859-3" "\0" |
189 | | "CP28594" "\0" "ISO-8859-4" "\0" |
190 | | "CP28595" "\0" "ISO-8859-5" "\0" |
191 | | "CP28596" "\0" "ISO-8859-6" "\0" |
192 | | "CP28597" "\0" "ISO-8859-7" "\0" |
193 | | "CP28598" "\0" "ISO-8859-8" "\0" |
194 | | "CP28599" "\0" "ISO-8859-9" "\0" |
195 | | "CP28605" "\0" "ISO-8859-15" "\0" |
196 | | "CP65001" "\0" "UTF-8" "\0"); |
197 | | *aliases; |
198 | | aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) |
199 | | { |
200 | | if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1])) |
201 | | { |
202 | | newset = aliases + strlen (aliases) + 1; |
203 | | break; |
204 | | } |
205 | | } |
206 | | |
207 | | #else /*!HAVE_W32_SYSTEM && !HAVE_ANDROID_SYSTEM*/ |
208 | |
|
209 | 0 | #ifdef HAVE_LANGINFO_CODESET |
210 | 0 | newset = nl_langinfo (CODESET); |
211 | | #else /*!HAVE_LANGINFO_CODESET*/ |
212 | | /* Try to get the used charset from environment variables. */ |
213 | | static char codepage[30]; |
214 | | const char *lc, *dot, *mod; |
215 | | |
216 | | strcpy (codepage, "iso-8859-1"); |
217 | | lc = getenv ("LC_ALL"); |
218 | | if (!lc || !*lc) |
219 | | { |
220 | | lc = getenv ("LC_CTYPE"); |
221 | | if (!lc || !*lc) |
222 | | lc = getenv ("LANG"); |
223 | | } |
224 | | if (lc && *lc) |
225 | | { |
226 | | dot = strchr (lc, '.'); |
227 | | if (dot) |
228 | | { |
229 | | mod = strchr (++dot, '@'); |
230 | | if (!mod) |
231 | | mod = dot + strlen (dot); |
232 | | if (mod - dot < sizeof codepage && dot != mod) |
233 | | { |
234 | | memcpy (codepage, dot, mod - dot); |
235 | | codepage [mod - dot] = 0; |
236 | | } |
237 | | } |
238 | | } |
239 | | newset = codepage; |
240 | | #endif /*!HAVE_LANGINFO_CODESET*/ |
241 | 0 | #endif /*!HAVE_W32_SYSTEM && !HAVE_ANDROID_SYSTEM*/ |
242 | 0 | } |
243 | |
|
244 | 0 | full_newset = newset; |
245 | 0 | if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3)) |
246 | 0 | { |
247 | 0 | newset += 3; |
248 | 0 | if (*newset == '-' || *newset == '_') |
249 | 0 | newset++; |
250 | 0 | } |
251 | | |
252 | | /* Note that we silently assume that plain ASCII is actually meant |
253 | | as Latin-1. This makes sense because many Unix system don't have |
254 | | their locale set up properly and thus would get annoying error |
255 | | messages and we have to handle all the "bug" reports. Latin-1 has |
256 | | traditionally been the character set used for 8 bit characters on |
257 | | Unix systems. */ |
258 | 0 | if ( !*newset |
259 | 0 | || !ascii_strcasecmp (newset, "8859-1" ) |
260 | 0 | || !ascii_strcasecmp (newset, "646" ) |
261 | 0 | || !ascii_strcasecmp (newset, "ASCII" ) |
262 | 0 | || !ascii_strcasecmp (newset, "ANSI_X3.4-1968" ) |
263 | 0 | ) |
264 | 0 | { |
265 | 0 | active_charset_name = "iso-8859-1"; |
266 | 0 | no_translation = 0; |
267 | 0 | use_iconv = 0; |
268 | 0 | } |
269 | 0 | else if ( !ascii_strcasecmp (newset, "utf8" ) |
270 | 0 | || !ascii_strcasecmp(newset, "utf-8") ) |
271 | 0 | { |
272 | 0 | active_charset_name = "utf-8"; |
273 | 0 | no_translation = 1; |
274 | 0 | use_iconv = 0; |
275 | 0 | } |
276 | 0 | else |
277 | 0 | { |
278 | 0 | iconv_t cd; |
279 | |
|
280 | 0 | cd = iconv_open (full_newset, "utf-8"); |
281 | 0 | if (cd == (iconv_t)-1) |
282 | 0 | { |
283 | 0 | handle_iconv_error (full_newset, "utf-8", 0); |
284 | 0 | return -1; |
285 | 0 | } |
286 | 0 | iconv_close (cd); |
287 | 0 | cd = iconv_open ("utf-8", full_newset); |
288 | 0 | if (cd == (iconv_t)-1) |
289 | 0 | { |
290 | 0 | handle_iconv_error ("utf-8", full_newset, 0); |
291 | 0 | return -1; |
292 | 0 | } |
293 | 0 | iconv_close (cd); |
294 | 0 | active_charset_name = full_newset; |
295 | 0 | no_translation = 0; |
296 | 0 | use_iconv = 1; |
297 | 0 | } |
298 | 0 | return 0; |
299 | 0 | } |
300 | | |
301 | | const char * |
302 | | get_native_charset (void) |
303 | 0 | { |
304 | 0 | return active_charset_name; |
305 | 0 | } |
306 | | |
307 | | /* Return true if the native charset is utf-8. */ |
308 | | int |
309 | | is_native_utf8 (void) |
310 | 0 | { |
311 | 0 | return no_translation; |
312 | 0 | } |
313 | | |
314 | | |
315 | | /* Convert string, which is in native encoding to UTF8 and return a |
316 | | new allocated UTF-8 string. This function terminates the process |
317 | | on memory shortage. */ |
318 | | char * |
319 | | native_to_utf8 (const char *orig_string) |
320 | 0 | { |
321 | 0 | const unsigned char *string = (const unsigned char *)orig_string; |
322 | 0 | const unsigned char *s; |
323 | 0 | char *buffer; |
324 | 0 | unsigned char *p; |
325 | 0 | size_t length = 0; |
326 | |
|
327 | 0 | if (no_translation) |
328 | 0 | { |
329 | | /* Already utf-8 encoded. */ |
330 | 0 | buffer = xstrdup (orig_string); |
331 | 0 | } |
332 | 0 | else if (!use_iconv) |
333 | 0 | { |
334 | | /* For Latin-1 we can avoid the iconv overhead. */ |
335 | 0 | for (s = string; *s; s++) |
336 | 0 | { |
337 | 0 | length++; |
338 | 0 | if (*s & 0x80) |
339 | 0 | length++; |
340 | 0 | } |
341 | 0 | buffer = xmalloc (length + 1); |
342 | 0 | for (p = (unsigned char *)buffer, s = string; *s; s++) |
343 | 0 | { |
344 | 0 | if ( (*s & 0x80 )) |
345 | 0 | { |
346 | 0 | *p++ = 0xc0 | ((*s >> 6) & 3); |
347 | 0 | *p++ = 0x80 | (*s & 0x3f); |
348 | 0 | } |
349 | 0 | else |
350 | 0 | *p++ = *s; |
351 | 0 | } |
352 | 0 | *p = 0; |
353 | 0 | } |
354 | 0 | else |
355 | 0 | { |
356 | | /* Need to use iconv. */ |
357 | 0 | iconv_t cd; |
358 | 0 | const char *inptr; |
359 | 0 | char *outptr; |
360 | 0 | size_t inbytes, outbytes; |
361 | |
|
362 | 0 | cd = iconv_open ("utf-8", active_charset_name); |
363 | 0 | if (cd == (iconv_t)-1) |
364 | 0 | { |
365 | 0 | handle_iconv_error ("utf-8", active_charset_name, 1); |
366 | 0 | return native_to_utf8 (string); |
367 | 0 | } |
368 | | |
369 | 0 | for (s=string; *s; s++ ) |
370 | 0 | { |
371 | 0 | length++; |
372 | 0 | if ((*s & 0x80)) |
373 | 0 | length += 5; /* We may need up to 6 bytes for the utf8 output. */ |
374 | 0 | } |
375 | 0 | buffer = xmalloc (length + 1); |
376 | |
|
377 | 0 | inptr = string; |
378 | 0 | inbytes = strlen (string); |
379 | 0 | outptr = buffer; |
380 | 0 | outbytes = length; |
381 | 0 | if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes, |
382 | 0 | &outptr, &outbytes) == (size_t)-1) |
383 | 0 | { |
384 | 0 | static int shown; |
385 | |
|
386 | 0 | if (!shown) |
387 | 0 | log_info (_("conversion from '%s' to '%s' failed: %s\n"), |
388 | 0 | active_charset_name, "utf-8", strerror (errno)); |
389 | 0 | shown = 1; |
390 | | /* We don't do any conversion at all but use the strings as is. */ |
391 | 0 | strcpy (buffer, string); |
392 | 0 | } |
393 | 0 | else /* Success. */ |
394 | 0 | { |
395 | 0 | *outptr = 0; |
396 | | /* We could realloc the buffer now but I doubt that it makes |
397 | | much sense given that it will get freed anyway soon |
398 | | after. */ |
399 | 0 | } |
400 | 0 | iconv_close (cd); |
401 | 0 | } |
402 | 0 | return buffer; |
403 | 0 | } |
404 | | |
405 | | |
406 | | |
407 | | static char * |
408 | | do_utf8_to_native (const char *string, size_t length, int delim, |
409 | | int with_iconv) |
410 | 5.66k | { |
411 | 5.66k | int nleft; |
412 | 5.66k | int i; |
413 | 5.66k | unsigned char encbuf[8]; |
414 | 5.66k | int encidx; |
415 | 5.66k | const unsigned char *s; |
416 | 5.66k | size_t n; |
417 | 5.66k | char *buffer = NULL; |
418 | 5.66k | char *p = NULL; |
419 | 5.66k | unsigned long val = 0; |
420 | 5.66k | size_t slen; |
421 | 5.66k | int resync = 0; |
422 | | |
423 | | /* First pass (p==NULL): count the extended utf-8 characters. */ |
424 | | /* Second pass (p!=NULL): create string. */ |
425 | 5.66k | for (;;) |
426 | 11.3k | { |
427 | 11.3k | for (slen = length, nleft = encidx = 0, n = 0, |
428 | 11.3k | s = (const unsigned char *)string; |
429 | 358k | slen; |
430 | 347k | s++, slen--) |
431 | 347k | { |
432 | 347k | if (resync) |
433 | 176k | { |
434 | 176k | if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd))) |
435 | 146k | { |
436 | | /* Still invalid. */ |
437 | 146k | if (p) |
438 | 73.7k | { |
439 | 73.7k | sprintf (p, "\\x%02x", *s); |
440 | 73.7k | p += 4; |
441 | 73.7k | } |
442 | 146k | n += 4; |
443 | 146k | continue; |
444 | 146k | } |
445 | 29.8k | resync = 0; |
446 | 29.8k | } |
447 | 200k | if (!nleft) |
448 | 156k | { |
449 | 156k | if (!(*s & 0x80)) |
450 | 116k | { |
451 | | /* Plain ascii. */ |
452 | 116k | if ( delim != -1 |
453 | 116k | && (*s < 0x20 || *s == 0x7f || *s == delim |
454 | 88.4k | || (delim && *s == '\\'))) |
455 | 28.0k | { |
456 | 28.0k | n++; |
457 | 28.0k | if (p) |
458 | 14.0k | *p++ = '\\'; |
459 | 28.0k | switch (*s) |
460 | 28.0k | { |
461 | 1.49k | case '\n': n++; if ( p ) *p++ = 'n'; break; |
462 | 1.63k | case '\r': n++; if ( p ) *p++ = 'r'; break; |
463 | 1.24k | case '\f': n++; if ( p ) *p++ = 'f'; break; |
464 | 2.41k | case '\v': n++; if ( p ) *p++ = 'v'; break; |
465 | 830 | case '\b': n++; if ( p ) *p++ = 'b'; break; |
466 | 7.09k | case 0: n++; if ( p ) *p++ = '0'; break; |
467 | 13.3k | default: |
468 | 13.3k | n += 3; |
469 | 13.3k | if (p) |
470 | 6.68k | { |
471 | 6.68k | sprintf (p, "x%02x", *s); |
472 | 6.68k | p += 3; |
473 | 6.68k | } |
474 | 13.3k | break; |
475 | 28.0k | } |
476 | 28.0k | } |
477 | 88.4k | else |
478 | 88.4k | { |
479 | 88.4k | if (p) |
480 | 44.2k | *p++ = *s; |
481 | 88.4k | n++; |
482 | 88.4k | } |
483 | 116k | } |
484 | 39.6k | else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */ |
485 | 15.5k | { |
486 | 15.5k | val = *s & 0x1f; |
487 | 15.5k | nleft = 1; |
488 | 15.5k | encidx = 0; |
489 | 15.5k | encbuf[encidx++] = *s; |
490 | 15.5k | } |
491 | 24.0k | else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */ |
492 | 4.31k | { |
493 | 4.31k | val = *s & 0x0f; |
494 | 4.31k | nleft = 2; |
495 | 4.31k | encidx = 0; |
496 | 4.31k | encbuf[encidx++] = *s; |
497 | 4.31k | } |
498 | 19.7k | else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */ |
499 | 3.94k | { |
500 | 3.94k | val = *s & 0x07; |
501 | 3.94k | nleft = 3; |
502 | 3.94k | encidx = 0; |
503 | 3.94k | encbuf[encidx++] = *s; |
504 | 3.94k | } |
505 | 15.8k | else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */ |
506 | 2.21k | { |
507 | 2.21k | val = *s & 0x03; |
508 | 2.21k | nleft = 4; |
509 | 2.21k | encidx = 0; |
510 | 2.21k | encbuf[encidx++] = *s; |
511 | 2.21k | } |
512 | 13.6k | else if ((*s & 0xfe) == 0xfc) /* 1111 110x */ |
513 | 2.80k | { |
514 | 2.80k | val = *s & 0x01; |
515 | 2.80k | nleft = 5; |
516 | 2.80k | encidx = 0; |
517 | 2.80k | encbuf[encidx++] = *s; |
518 | 2.80k | } |
519 | 10.8k | else /* Invalid encoding: print as \xNN. */ |
520 | 10.8k | { |
521 | 10.8k | if (p) |
522 | 5.08k | { |
523 | 5.08k | sprintf (p, "\\x%02x", *s); |
524 | 5.08k | p += 4; |
525 | 5.08k | } |
526 | 10.8k | n += 4; |
527 | 10.8k | resync = 1; |
528 | 10.8k | } |
529 | 156k | } |
530 | 44.1k | else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */ |
531 | 21.4k | { |
532 | 21.4k | if (p) |
533 | 10.7k | { |
534 | 25.4k | for (i = 0; i < encidx; i++) |
535 | 14.7k | { |
536 | 14.7k | sprintf (p, "\\x%02x", encbuf[i]); |
537 | 14.7k | p += 4; |
538 | 14.7k | } |
539 | 10.7k | sprintf (p, "\\x%02x", *s); |
540 | 10.7k | p += 4; |
541 | 10.7k | } |
542 | 21.4k | n += 4 + 4 * encidx; |
543 | 21.4k | nleft = 0; |
544 | 21.4k | encidx = 0; |
545 | 21.4k | resync = 1; |
546 | 21.4k | } |
547 | 22.6k | else |
548 | 22.6k | { |
549 | 22.6k | encbuf[encidx++] = *s; |
550 | 22.6k | val <<= 6; |
551 | 22.6k | val |= *s & 0x3f; |
552 | 22.6k | if (!--nleft) /* Ready. */ |
553 | 7.23k | { |
554 | 7.23k | if (no_translation) |
555 | 0 | { |
556 | 0 | if (p) |
557 | 0 | { |
558 | 0 | for (i = 0; i < encidx; i++) |
559 | 0 | *p++ = encbuf[i]; |
560 | 0 | } |
561 | 0 | n += encidx; |
562 | 0 | encidx = 0; |
563 | 0 | } |
564 | 7.23k | else if (with_iconv) |
565 | 0 | { |
566 | | /* Our strategy for using iconv is a bit strange |
567 | | but it better keeps compatibility with |
568 | | previous versions in regard to how invalid |
569 | | encodings are displayed. What we do is to |
570 | | keep the utf-8 as is and have the real |
571 | | translation step then at the end. Yes, I |
572 | | know that this is ugly. However we are short |
573 | | of the 1.4 release and for this branch we |
574 | | should not mess too much around with iconv |
575 | | things. One reason for this is that we don't |
576 | | know enough about non-GNU iconv |
577 | | implementation and want to minimize the risk |
578 | | of breaking the code on too many platforms. */ |
579 | 0 | if ( p ) |
580 | 0 | { |
581 | 0 | for (i=0; i < encidx; i++ ) |
582 | 0 | *p++ = encbuf[i]; |
583 | 0 | } |
584 | 0 | n += encidx; |
585 | 0 | encidx = 0; |
586 | 0 | } |
587 | 7.23k | else /* Latin-1 case. */ |
588 | 7.23k | { |
589 | 7.23k | if (val >= 0x80 && val < 256) |
590 | 1.16k | { |
591 | | /* We can simply print this character */ |
592 | 1.16k | n++; |
593 | 1.16k | if (p) |
594 | 580 | *p++ = val; |
595 | 1.16k | } |
596 | 6.07k | else |
597 | 6.07k | { |
598 | | /* We do not have a translation: print utf8. */ |
599 | 6.07k | if (p) |
600 | 3.03k | { |
601 | 12.8k | for (i = 0; i < encidx; i++) |
602 | 9.77k | { |
603 | 9.77k | sprintf (p, "\\x%02x", encbuf[i]); |
604 | 9.77k | p += 4; |
605 | 9.77k | } |
606 | 3.03k | } |
607 | 6.07k | n += encidx * 4; |
608 | 6.07k | encidx = 0; |
609 | 6.07k | } |
610 | 7.23k | } |
611 | 7.23k | } |
612 | | |
613 | 22.6k | } |
614 | 200k | } |
615 | 11.3k | if (!buffer) |
616 | 5.66k | { |
617 | | /* Allocate the buffer after the first pass. */ |
618 | 5.66k | buffer = p = xmalloc (n + 1); |
619 | 5.66k | } |
620 | 5.66k | else if (with_iconv) |
621 | 0 | { |
622 | | /* Note: See above for comments. */ |
623 | 0 | iconv_t cd; |
624 | 0 | const char *inptr; |
625 | 0 | char *outbuf, *outptr; |
626 | 0 | size_t inbytes, outbytes; |
627 | |
|
628 | 0 | *p = 0; /* Terminate the buffer. */ |
629 | |
|
630 | 0 | cd = iconv_open (active_charset_name, "utf-8"); |
631 | 0 | if (cd == (iconv_t)-1) |
632 | 0 | { |
633 | 0 | handle_iconv_error (active_charset_name, "utf-8", 1); |
634 | 0 | xfree (buffer); |
635 | 0 | return utf8_to_native (string, length, delim); |
636 | 0 | } |
637 | | |
638 | | /* Allocate a new buffer large enough to hold all possible |
639 | | encodings. */ |
640 | 0 | n = p - buffer + 1; |
641 | 0 | inbytes = n - 1;; |
642 | 0 | inptr = buffer; |
643 | 0 | outbytes = n * MB_LEN_MAX; |
644 | 0 | if (outbytes / MB_LEN_MAX != n) |
645 | 0 | BUG (); /* Actually an overflow. */ |
646 | 0 | outbuf = outptr = xmalloc (outbytes); |
647 | 0 | if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes, |
648 | 0 | &outptr, &outbytes) == (size_t)-1) |
649 | 0 | { |
650 | 0 | static int shown; |
651 | |
|
652 | 0 | if (!shown) |
653 | 0 | log_info (_("conversion from '%s' to '%s' failed: %s\n"), |
654 | 0 | "utf-8", active_charset_name, strerror (errno)); |
655 | 0 | shown = 1; |
656 | | /* Didn't worked out. Try again but without iconv. */ |
657 | 0 | xfree (buffer); |
658 | 0 | buffer = NULL; |
659 | 0 | xfree (outbuf); |
660 | 0 | outbuf = do_utf8_to_native (string, length, delim, 0); |
661 | 0 | } |
662 | 0 | else /* Success. */ |
663 | 0 | { |
664 | 0 | *outptr = 0; /* Make sure it is a string. */ |
665 | | /* We could realloc the buffer now but I doubt that it |
666 | | makes much sense given that it will get freed |
667 | | anyway soon after. */ |
668 | 0 | xfree (buffer); |
669 | 0 | } |
670 | 0 | iconv_close (cd); |
671 | 0 | return outbuf; |
672 | 0 | } |
673 | 5.66k | else /* Not using iconv. */ |
674 | 5.66k | { |
675 | 5.66k | *p = 0; /* Make sure it is a string. */ |
676 | 5.66k | return buffer; |
677 | 5.66k | } |
678 | 11.3k | } |
679 | 5.66k | } |
680 | | |
681 | | /* Convert string, which is in UTF-8 to native encoding. Replace |
682 | | illegal encodings by some "\xnn" and quote all control |
683 | | characters. A character with value DELIM will always be quoted, it |
684 | | must be a vanilla ASCII character. A DELIM value of -1 is special: |
685 | | it disables all quoting of control characters. This function |
686 | | terminates the process on memory shortage. */ |
687 | | char * |
688 | | utf8_to_native (const char *string, size_t length, int delim) |
689 | 5.66k | { |
690 | 5.66k | return do_utf8_to_native (string, length, delim, use_iconv); |
691 | 5.66k | } |
692 | | |
693 | | |
694 | | |
695 | | |
696 | | /* Wrapper function for iconv_open, required for W32 as we dlopen that |
697 | | library on that system. */ |
698 | | jnlib_iconv_t |
699 | | jnlib_iconv_open (const char *tocode, const char *fromcode) |
700 | 0 | { |
701 | 0 | return (jnlib_iconv_t)iconv_open (tocode, fromcode); |
702 | 0 | } |
703 | | |
704 | | |
705 | | /* Wrapper function for iconv, required for W32 as we dlopen that |
706 | | library on that system. */ |
707 | | size_t |
708 | | jnlib_iconv (jnlib_iconv_t cd, |
709 | | const char **inbuf, size_t *inbytesleft, |
710 | | char **outbuf, size_t *outbytesleft) |
711 | 0 | { |
712 | 0 | return iconv ((iconv_t)cd, (ICONV_CONST char**)inbuf, inbytesleft, |
713 | 0 | outbuf, outbytesleft); |
714 | 0 | } |
715 | | |
716 | | /* Wrapper function for iconv_close, required for W32 as we dlopen that |
717 | | library on that system. */ |
718 | | int |
719 | | jnlib_iconv_close (jnlib_iconv_t cd) |
720 | 0 | { |
721 | 0 | return iconv_close ((iconv_t)cd); |
722 | 0 | } |
723 | | |
724 | | |
725 | | #ifdef HAVE_W32_SYSTEM |
/* Return a malloced string encoded for CODEPAGE from the wide char input
   string STRING.  Caller must free this value.  Returns NULL and sets
   ERRNO on failure (EINVAL if the conversion is not possible).
   Calling this function with STRING set to NULL is not defined.  */
static char *
wchar_to_cp (const wchar_t *string, unsigned int codepage)
{
  int n;
  char *result;

  /* First call: ask for the required buffer size.  The input length
     of -1 makes the function include the terminating NUL, thus any
     successful call returns at least 1.  WideCharToMultiByte signals
     an error by returning 0 and not by a negative value.  */
  n = WideCharToMultiByte (codepage, 0, string, -1, NULL, 0, NULL, NULL);
  if (n <= 0)
    {
      gpg_err_set_errno (EINVAL);
      return NULL;
    }

  result = xtrymalloc (n+1);
  if (!result)
    return NULL;

  /* Second call: do the actual conversion.  */
  n = WideCharToMultiByte (codepage, 0, string, -1, result, n, NULL, NULL);
  if (n <= 0)
    {
      xfree (result);
      gpg_err_set_errno (EINVAL);
      result = NULL;
    }
  return result;
}
756 | | |
757 | | |
/* Return a malloced wide char string from a CODEPAGE encoded input
   string STRING.  Caller must free this value.  Returns NULL and sets
   ERRNO on failure (EINVAL if the conversion is not possible, ENOMEM
   if the size computation overflows).  Calling this function with
   STRING set to NULL is not defined.  */
static wchar_t *
cp_to_wchar (const char *string, unsigned int codepage)
{
  int n;
  size_t nbytes;
  wchar_t *result;

  /* First call: ask for the required number of wide characters.  The
     input length of -1 makes the function include the terminating
     NUL, thus any successful call returns at least 1.
     MultiByteToWideChar signals an error by returning 0 and not by a
     negative value.  */
  n = MultiByteToWideChar (codepage, 0, string, -1, NULL, 0);
  if (n <= 0)
    {
      gpg_err_set_errno (EINVAL);
      return NULL;
    }

  /* Compute the allocation size and guard against an overflow.  */
  nbytes = (size_t)(n+1) * sizeof(*result);
  if (nbytes / sizeof(*result) != (n+1))
    {
      gpg_err_set_errno (ENOMEM);
      return NULL;
    }
  result = xtrymalloc (nbytes);
  if (!result)
    return NULL;

  /* Second call: do the actual conversion.  */
  n = MultiByteToWideChar (codepage, 0, string, -1, result, n);
  if (n <= 0)
    {
      xfree (result);
      gpg_err_set_errno (EINVAL);
      result = NULL;
    }
  return result;
}
795 | | |
796 | | |
/* Return the code page used by wchar_to_native and native_to_wchar.
 * The console output code page is preferred; if that is not known
 * the system code page is used.  The value is determined on first
 * use and then cached.  Note that these functions intentionally do
 * not use the iconv based conversion machinery.  */
static unsigned int
get_w32_codepage (void)
{
  static unsigned int cached_cp;

  if (cached_cp)
    return cached_cp;

  cached_cp = GetConsoleOutputCP ();
  if (!cached_cp)
    cached_cp = GetACP ();
  return cached_cp;
}
813 | | |
/* Convert the wide char string STRING to the active code page and
 * return the result as a malloced string which the caller must
 * free.  On failure NULL is returned and ERRNO is set.  Calling this
 * function with STRING set to NULL is not defined.  */
char *
wchar_to_native (const wchar_t *string)
{
  const unsigned int codepage = get_w32_codepage ();

  return wchar_to_cp (string, codepage);
}
823 | | |
824 | | |
/* Convert the native encoded string STRING to a wide char string and
 * return the result as a malloced string which the caller must
 * free.  On failure NULL is returned and ERRNO is set.  Calling this
 * function with STRING set to NULL is not defined.  */
wchar_t *
native_to_wchar (const char *string)
{
  const unsigned int codepage = get_w32_codepage ();

  return cp_to_wchar (string, codepage);
}
834 | | |
835 | | |
836 | | /* Return a malloced string encoded in UTF-8 from the wide char input |
837 | | * string STRING. Caller must free this value. Returns NULL and sets |
838 | | * ERRNO on failure. Calling this function with STRING set to NULL is |
839 | | * not defined. */ |
840 | | char * |
841 | | wchar_to_utf8 (const wchar_t *string) |
842 | | { |
843 | | return wchar_to_cp (string, CP_UTF8); |
844 | | } |
845 | | |
846 | | |
847 | | /* Return a malloced wide char string from an UTF-8 encoded input |
848 | | * string STRING. Caller must free this value. Returns NULL and sets |
849 | | * ERRNO on failure. Calling this function with STRING set to NULL is |
850 | | * not defined. */ |
851 | | wchar_t * |
852 | | utf8_to_wchar (const char *string) |
853 | | { |
854 | | return cp_to_wchar (string, CP_UTF8); |
855 | | } |
856 | | |
857 | | #endif /*HAVE_W32_SYSTEM*/ |