/src/irssi/subprojects/glib-2.74.3/glib/gconvert.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* GLIB - Library of useful routines for C programming |
2 | | * |
3 | | * gconvert.c: Convert between character sets using iconv |
4 | | * Copyright Red Hat Inc., 2000 |
5 | | * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com> |
6 | | * |
7 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
8 | | * |
9 | | * This library is free software; you can redistribute it and/or |
10 | | * modify it under the terms of the GNU Lesser General Public |
11 | | * License as published by the Free Software Foundation; either |
12 | | * version 2.1 of the License, or (at your option) any later version. |
13 | | * |
14 | | * This library is distributed in the hope that it will be useful, |
15 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 | | * Lesser General Public License for more details. |
18 | | * |
19 | | * You should have received a copy of the GNU Lesser General Public |
20 | | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
21 | | */ |
22 | | |
23 | | #include "config.h" |
24 | | #include "glibconfig.h" |
25 | | |
26 | | #ifndef G_OS_WIN32 |
27 | | #include <iconv.h> |
28 | | #endif |
29 | | #include <errno.h> |
30 | | #include <stdio.h> |
31 | | #include <string.h> |
32 | | #include <stdlib.h> |
33 | | |
34 | | #ifdef G_OS_WIN32 |
35 | | #include "win_iconv.c" |
36 | | #endif |
37 | | |
38 | | #ifdef G_PLATFORM_WIN32 |
39 | | #define STRICT |
40 | | #include <windows.h> |
41 | | #undef STRICT |
42 | | #endif |
43 | | |
44 | | #include "gconvert.h" |
45 | | #include "gconvertprivate.h" |
46 | | |
47 | | #include "gcharsetprivate.h" |
48 | | #include "gslist.h" |
49 | | #include "gstrfuncs.h" |
50 | | #include "gtestutils.h" |
51 | | #include "gthread.h" |
52 | | #include "gthreadprivate.h" |
53 | | #include "gunicode.h" |
54 | | #include "gfileutils.h" |
55 | | #include "genviron.h" |
56 | | |
57 | | #include "glibintl.h" |
58 | | |
59 | | |
60 | | /** |
61 | | * SECTION:conversions |
62 | | * @title: Character Set Conversion |
63 | | * @short_description: convert strings between different character sets |
64 | | * |
65 | | * The g_convert() family of functions wraps the functionality of iconv(). |
66 | | * In addition to pure character set conversions, GLib has functions to |
67 | | * deal with the extra complications of encodings for file names. |
68 | | * |
69 | | * ## File Name Encodings |
70 | | * |
71 | | * Historically, UNIX has not had a defined encoding for file names: |
72 | | * a file name is valid as long as it does not have path separators |
73 | | * in it ("/"). However, displaying file names may require conversion: |
74 | | * from the character set in which they were created, to the character |
75 | | * set in which the application operates. Consider the Spanish file name |
76 | | * "Presentación.sxi". If the application which created it uses |
77 | | * ISO-8859-1 for its encoding, the file name on disk would look like this: |
78 | | * |[ |
79 | | * Character: P r e s e n t a c i ó n . s x i |
80 | | * Hex code: 50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69 |
81 | | * ]| |
82 | | * However, if the application uses UTF-8, the actual file name on |
83 | | * disk would look like this: |
84 | | * |[ |
85 | | * Character: P r e s e n t a c i ó n . s x i |
86 | | * Hex code: 50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69 |
87 | | * ]| |
88 | | * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use |
89 | | * GLib do the same thing. If you get a file name from the file system, |
90 | | * for example, from readdir() or from g_dir_read_name(), and you wish |
91 | | * to display the file name to the user, you will need to convert it |
92 | | * into UTF-8. The opposite case is when the user types the name of a |
93 | | * file they wish to save: the toolkit will give you that string in |
94 | | * UTF-8 encoding, and you will need to convert it to the character |
95 | | * set used for file names before you can create the file with open() |
96 | | * or fopen(). |
97 | | * |
98 | | * By default, GLib assumes that file names on disk are in UTF-8 |
99 | | * encoding. This is a valid assumption for file systems which |
100 | | * were created relatively recently: most applications use UTF-8 |
101 | | * encoding for their strings, and that is also what they use for |
102 | | * the file names they create. However, older file systems may |
103 | | * still contain file names created in "older" encodings, such as |
104 | | * ISO-8859-1. In this case, for compatibility reasons, you may want |
105 | | * to instruct GLib to use that particular encoding for file names |
106 | | * rather than UTF-8. You can do this by specifying the encoding for |
107 | | * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING] |
108 | | * environment variable. For example, if your installation uses |
109 | | * ISO-8859-1 for file names, you can put this in your `~/.profile`: |
110 | | * |[ |
111 | | * export G_FILENAME_ENCODING=ISO-8859-1 |
112 | | * ]| |
113 | | * GLib provides the functions g_filename_to_utf8() and |
114 | | * g_filename_from_utf8() to perform the necessary conversions. |
115 | | * These functions convert file names from the encoding specified |
116 | | * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This |
117 | | * [diagram][file-name-encodings-diagram] illustrates how |
118 | | * these functions are used to convert between UTF-8 and the |
119 | | * encoding for file names in the file system. |
120 | | * |
121 | | * ## Conversion between file name encodings # {#file-name-encodings-diagram} |
122 | | * |
123 | | *  |
124 | | * |
125 | | * ## Checklist for Application Writers |
126 | | * |
127 | | * This section is a practical summary of the detailed |
128 | | * things to do to make sure your applications process file |
129 | | * name encodings correctly. |
130 | | * |
131 | | * 1. If you get a file name from the file system from a function |
132 | | * such as readdir() or gtk_file_chooser_get_filename(), you do |
133 | | * not need to do any conversion to pass that file name to |
134 | | * functions like open(), rename(), or fopen() -- those are "raw" |
135 | | * file names which the file system understands. |
136 | | * |
137 | | * 2. If you need to display a file name, convert it to UTF-8 first |
138 | | * by using g_filename_to_utf8(). If conversion fails, display a |
139 | | * string like "Unknown file name". Do not convert this string back |
140 | | * into the encoding used for file names if you wish to pass it to |
141 | | * the file system; use the original file name instead. |
142 | | * |
143 | | * For example, the document window of a word processor could display |
144 | | * "Unknown file name" in its title bar but still let the user save |
145 | | * the file, as it would keep the raw file name internally. This |
146 | | * can happen if the user has not set the `G_FILENAME_ENCODING` |
147 | | * environment variable even though they have files whose names are |
148 | | * not encoded in UTF-8. |
149 | | * |
150 | | * 3. If your user interface lets the user type a file name for saving |
151 | | * or renaming, convert it to the encoding used for file names in |
152 | | * the file system by using g_filename_from_utf8(). Pass the converted |
153 | | * file name to functions like fopen(). If conversion fails, ask the |
154 | | * user to enter a different file name. This can happen if the user |
155 | | * types Japanese characters when `G_FILENAME_ENCODING` is set to |
156 | | * `ISO-8859-1`, for example. |
157 | | */ |
158 | | |
159 | | /* Number of zero bytes written after converted output. The target charset |
160 | | * may be unknown and use multibyte code units, so one '\0' is not enough to |
161 | | * guarantee termination; g_convert() and friends reserve this many bytes at the end of each output buffer and memset() them to zero. |
162 | | */ |
163 | 476k | #define NUL_TERMINATOR_LENGTH 4 |
164 | | |
165 | | G_DEFINE_QUARK (g_convert_error, g_convert_error) |
166 | | |
167 | | static gboolean |
168 | | try_conversion (const char *to_codeset, |
169 | | const char *from_codeset, |
170 | | iconv_t *cd) |
171 | 158k | { |
172 | 158k | *cd = iconv_open (to_codeset, from_codeset); |
173 | | |
174 | 158k | if (*cd == (iconv_t)-1 && errno == EINVAL) |
175 | 0 | return FALSE; |
176 | 158k | else |
177 | 158k | return TRUE; |
178 | 158k | } |
179 | | |
180 | | static gboolean |
181 | | try_to_aliases (const char **to_aliases, |
182 | | const char *from_codeset, |
183 | | iconv_t *cd) |
184 | 0 | { |
185 | 0 | if (to_aliases) |
186 | 0 | { |
187 | 0 | const char **p = to_aliases; |
188 | 0 | while (*p) |
189 | 0 | { |
190 | 0 | if (try_conversion (*p, from_codeset, cd)) |
191 | 0 | return TRUE; |
192 | | |
193 | 0 | p++; |
194 | 0 | } |
195 | 0 | } |
196 | | |
197 | 0 | return FALSE; |
198 | 0 | } |
199 | | |
200 | | /** |
201 | | * g_iconv_open: (skip) |
202 | | * @to_codeset: destination codeset |
203 | | * @from_codeset: source codeset |
204 | | * |
205 | | * Same as the standard UNIX routine iconv_open(), but |
206 | | * may be implemented via libiconv on UNIX flavors that lack |
207 | | * a native implementation. |
208 | | * |
209 | | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
210 | | * more convenient than the raw iconv wrappers. |
211 | | * |
212 | | * Returns: a "conversion descriptor", or (GIConv)-1 if |
213 | | * opening the converter failed. |
214 | | **/ |
215 | | GIConv |
216 | | g_iconv_open (const gchar *to_codeset, |
217 | | const gchar *from_codeset) |
218 | 158k | { |
219 | 158k | iconv_t cd; |
220 | | |
221 | 158k | if (!try_conversion (to_codeset, from_codeset, &cd)) |
222 | 0 | { |
223 | 0 | const char **to_aliases = _g_charset_get_aliases (to_codeset); |
224 | 0 | const char **from_aliases = _g_charset_get_aliases (from_codeset); |
225 | |
|
226 | 0 | if (from_aliases) |
227 | 0 | { |
228 | 0 | const char **p = from_aliases; |
229 | 0 | while (*p) |
230 | 0 | { |
231 | 0 | if (try_conversion (to_codeset, *p, &cd)) |
232 | 0 | goto out; |
233 | | |
234 | 0 | if (try_to_aliases (to_aliases, *p, &cd)) |
235 | 0 | goto out; |
236 | | |
237 | 0 | p++; |
238 | 0 | } |
239 | 0 | } |
240 | | |
241 | 0 | if (try_to_aliases (to_aliases, from_codeset, &cd)) |
242 | 0 | goto out; |
243 | 0 | } |
244 | | |
245 | 158k | out: |
246 | 158k | return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd; |
247 | 158k | } |
248 | | |
249 | | /** |
250 | | * g_iconv: (skip) |
251 | | * @converter: conversion descriptor from g_iconv_open() |
252 | | * @inbuf: bytes to convert |
253 | | * @inbytes_left: (inout): inout parameter, bytes remaining to convert in @inbuf |
254 | | * @outbuf: converted output bytes |
255 | | * @outbytes_left: (inout): inout parameter, bytes available to fill in @outbuf |
256 | | * |
257 | | * Same as the standard UNIX routine iconv(), but |
258 | | * may be implemented via libiconv on UNIX flavors that lack |
259 | | * a native implementation. |
260 | | * |
261 | | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
262 | | * more convenient than the raw iconv wrappers. |
263 | | * |
264 | | * Note that the behaviour of iconv() for characters which are valid in the |
265 | | * input character set, but which have no representation in the output character |
266 | | * set, is implementation defined. This function may return success (with a |
267 | | * positive number of non-reversible conversions as replacement characters were |
268 | | * used), or it may return -1 and set an error such as %EILSEQ, in such a |
269 | | * situation. |
270 | | * |
271 | | * Returns: count of non-reversible conversions, or -1 on error |
272 | | **/ |
273 | | gsize |
274 | | g_iconv (GIConv converter, |
275 | | gchar **inbuf, |
276 | | gsize *inbytes_left, |
277 | | gchar **outbuf, |
278 | | gsize *outbytes_left) |
279 | 316k | { |
280 | 316k | iconv_t cd = (iconv_t)converter; |
281 | | |
282 | 316k | return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left); |
283 | 316k | } |
284 | | |
285 | | /** |
286 | | * g_iconv_close: (skip) |
287 | | * @converter: a conversion descriptor from g_iconv_open() |
288 | | * |
289 | | * Same as the standard UNIX routine iconv_close(), but |
290 | | * may be implemented via libiconv on UNIX flavors that lack |
291 | | * a native implementation. Should be called to clean up |
292 | | * the conversion descriptor from g_iconv_open() when |
293 | | * you are done converting things. |
294 | | * |
295 | | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
296 | | * more convenient than the raw iconv wrappers. |
297 | | * |
298 | | * Returns: -1 on error, 0 on success |
299 | | **/ |
300 | | gint |
301 | | g_iconv_close (GIConv converter) |
302 | 158k | { |
303 | 158k | iconv_t cd = (iconv_t)converter; |
304 | | |
305 | 158k | return iconv_close (cd); |
306 | 158k | } |
307 | | |
308 | | static GIConv |
309 | | open_converter (const gchar *to_codeset, |
310 | | const gchar *from_codeset, |
311 | | GError **error) |
312 | 158k | { |
313 | 158k | GIConv cd; |
314 | | |
315 | 158k | cd = g_iconv_open (to_codeset, from_codeset); |
316 | | |
317 | 158k | if (cd == (GIConv) -1) |
318 | 0 | { |
319 | | /* Something went wrong. */ |
320 | 0 | if (error) |
321 | 0 | { |
322 | 0 | if (errno == EINVAL) |
323 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION, |
324 | 0 | _("Conversion from character set “%s” to “%s” is not supported"), |
325 | 0 | from_codeset, to_codeset); |
326 | 0 | else |
327 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
328 | 0 | _("Could not open converter from “%s” to “%s”"), |
329 | 0 | from_codeset, to_codeset); |
330 | 0 | } |
331 | 0 | } |
332 | | |
333 | 158k | return cd; |
334 | 158k | } |
335 | | |
336 | | static int |
337 | | close_converter (GIConv cd) |
338 | 158k | { |
339 | 158k | if (cd == (GIConv) -1) |
340 | 0 | return 0; |
341 | | |
342 | 158k | return g_iconv_close (cd); |
343 | 158k | } |
344 | | |
345 | | /** |
346 | | * g_convert_with_iconv: (skip) |
347 | | * @str: (array length=len) (element-type guint8): |
348 | | * the string to convert. |
349 | | * @len: the length of the string in bytes, or -1 if the string is |
350 | | * nul-terminated (Note that some encodings may allow nul |
351 | | * bytes to occur inside strings. In that case, using -1 |
352 | | * for the @len parameter is unsafe) |
353 | | * @converter: conversion descriptor from g_iconv_open() |
354 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
355 | | * the input string that were successfully converted, or %NULL. |
356 | | * Even if the conversion was successful, this may be |
357 | | * less than @len if there were partial characters |
358 | | * at the end of the input. If the error |
359 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
360 | | * stored will be the byte offset after the last valid |
361 | | * input sequence. |
362 | | * @bytes_written: (out) (optional): the number of bytes stored in |
363 | | * the output buffer (not including the terminating nul). |
364 | | * @error: location to store the error occurring, or %NULL to ignore |
365 | | * errors. Any of the errors in #GConvertError may occur. |
366 | | * |
367 | | * Converts a string from one character set to another. |
368 | | * |
369 | | * Note that you should use g_iconv() for streaming conversions. |
370 | | * Despite the fact that @bytes_read can return information about partial |
371 | | * characters, the g_convert_... functions are not generally suitable |
372 | | * for streaming. If the underlying converter maintains internal state, |
373 | | * then this won't be preserved across successive calls to g_convert(), |
374 | | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
375 | | * this is the GNU C converter for CP1255 which does not emit a base |
376 | | * character until it knows that the next character is not a mark that |
377 | | * could combine with the base character.) |
378 | | * |
379 | | * Characters which are valid in the input character set, but which have no |
380 | | * representation in the output character set will result in a |
381 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv() |
382 | | * specification, which leaves this behaviour implementation defined. Note that |
383 | | * this is the same error code as is returned for an invalid byte sequence in |
384 | | * the input character set. To get defined behaviour for conversion of |
385 | | * unrepresentable characters, use g_convert_with_fallback(). |
386 | | * |
387 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
388 | | * If the conversion was successful, a newly allocated buffer |
389 | | * containing the converted string, which must be freed with |
390 | | * g_free(). Otherwise %NULL and @error will be set. |
391 | | **/ |
392 | | gchar* |
393 | | g_convert_with_iconv (const gchar *str, |
394 | | gssize len, |
395 | | GIConv converter, |
396 | | gsize *bytes_read, |
397 | | gsize *bytes_written, |
398 | | GError **error) |
399 | 156k | { |
400 | 156k | gchar *dest; |
401 | 156k | gchar *outp; |
402 | 156k | const gchar *p; |
403 | 156k | gsize inbytes_remaining; |
404 | 156k | gsize outbytes_remaining; |
405 | 156k | gsize err; |
406 | 156k | gsize outbuf_size; |
407 | 156k | gboolean have_error = FALSE; |
408 | 156k | gboolean done = FALSE; |
409 | 156k | gboolean reset = FALSE; |
410 | | |
411 | 156k | g_return_val_if_fail (converter != (GIConv) -1, NULL); |
412 | | |
413 | 156k | if (len < 0) |
414 | 4 | len = strlen (str); |
415 | | |
416 | 156k | p = str; |
417 | 156k | inbytes_remaining = len; |
418 | 156k | outbuf_size = len + NUL_TERMINATOR_LENGTH; |
419 | | |
420 | 156k | outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH; |
421 | 156k | outp = dest = g_malloc (outbuf_size); |
422 | | |
423 | 467k | while (!done && !have_error) |
424 | 311k | { |
425 | 311k | if (reset) |
426 | 154k | err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining); |
427 | 157k | else |
428 | 157k | err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining); |
429 | | |
430 | 311k | if (err == (gsize) -1) |
431 | 711 | { |
432 | 711 | switch (errno) |
433 | 711 | { |
434 | 0 | case EINVAL: |
435 | | /* Incomplete text, do not report an error */ |
436 | 0 | done = TRUE; |
437 | 0 | break; |
438 | 711 | case E2BIG: |
439 | 711 | { |
440 | 711 | gsize used = outp - dest; |
441 | | |
442 | 711 | outbuf_size *= 2; |
443 | 711 | dest = g_realloc (dest, outbuf_size); |
444 | | |
445 | 711 | outp = dest + used; |
446 | 711 | outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH; |
447 | 711 | } |
448 | 711 | break; |
449 | 0 | case EILSEQ: |
450 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
451 | 0 | _("Invalid byte sequence in conversion input")); |
452 | 0 | have_error = TRUE; |
453 | 0 | break; |
454 | 0 | default: |
455 | 0 | { |
456 | 0 | int errsv = errno; |
457 | |
|
458 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
459 | 0 | _("Error during conversion: %s"), |
460 | 0 | g_strerror (errsv)); |
461 | 0 | } |
462 | 0 | have_error = TRUE; |
463 | 0 | break; |
464 | 711 | } |
465 | 711 | } |
466 | 310k | else if (err > 0) |
467 | 1.98k | { |
468 | | /* @err gives the number of replacement characters used. */ |
469 | 1.98k | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
470 | 1.98k | _("Unrepresentable character in conversion input")); |
471 | 1.98k | have_error = TRUE; |
472 | 1.98k | } |
473 | 308k | else |
474 | 308k | { |
475 | 308k | if (!reset) |
476 | 154k | { |
477 | | /* call g_iconv with NULL inbuf to cleanup shift state */ |
478 | 154k | reset = TRUE; |
479 | 154k | inbytes_remaining = 0; |
480 | 154k | } |
481 | 154k | else |
482 | 154k | done = TRUE; |
483 | 308k | } |
484 | 311k | } |
485 | | |
486 | 156k | memset (outp, 0, NUL_TERMINATOR_LENGTH); |
487 | | |
488 | 156k | if (bytes_read) |
489 | 0 | *bytes_read = p - str; |
490 | 156k | else |
491 | 156k | { |
492 | 156k | if ((p - str) != len) |
493 | 0 | { |
494 | 0 | if (!have_error) |
495 | 0 | { |
496 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
497 | 0 | _("Partial character sequence at end of input")); |
498 | 0 | have_error = TRUE; |
499 | 0 | } |
500 | 0 | } |
501 | 156k | } |
502 | | |
503 | 156k | if (bytes_written) |
504 | 1.98k | *bytes_written = outp - dest; /* Doesn't include '\0' */ |
505 | | |
506 | 156k | if (have_error) |
507 | 1.98k | { |
508 | 1.98k | g_free (dest); |
509 | 1.98k | return NULL; |
510 | 1.98k | } |
511 | 154k | else |
512 | 154k | return dest; |
513 | 156k | } |
514 | | |
515 | | /** |
516 | | * g_convert: |
517 | | * @str: (array length=len) (element-type guint8): |
518 | | * the string to convert. |
519 | | * @len: the length of the string in bytes, or -1 if the string is |
520 | | * nul-terminated (Note that some encodings may allow nul |
521 | | * bytes to occur inside strings. In that case, using -1 |
522 | | * for the @len parameter is unsafe) |
523 | | * @to_codeset: name of character set into which to convert @str |
524 | | * @from_codeset: character set of @str. |
525 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
526 | | * the input string that were successfully converted, or %NULL. |
527 | | * Even if the conversion was successful, this may be |
528 | | * less than @len if there were partial characters |
529 | | * at the end of the input. If the error |
530 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
531 | | * stored will be the byte offset after the last valid |
532 | | * input sequence. |
533 | | * @bytes_written: (out) (optional): the number of bytes stored in |
534 | | * the output buffer (not including the terminating nul). |
535 | | * @error: location to store the error occurring, or %NULL to ignore |
536 | | * errors. Any of the errors in #GConvertError may occur. |
537 | | * |
538 | | * Converts a string from one character set to another. |
539 | | * |
540 | | * Note that you should use g_iconv() for streaming conversions. |
541 | | * Despite the fact that @bytes_read can return information about partial |
542 | | * characters, the g_convert_... functions are not generally suitable |
543 | | * for streaming. If the underlying converter maintains internal state, |
544 | | * then this won't be preserved across successive calls to g_convert(), |
545 | | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
546 | | * this is the GNU C converter for CP1255 which does not emit a base |
547 | | * character until it knows that the next character is not a mark that |
548 | | * could combine with the base character.) |
549 | | * |
550 | | * Using extensions such as "//TRANSLIT" may not work (or may not work |
551 | | * well) on many platforms. Consider using g_str_to_ascii() instead. |
552 | | * |
553 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
554 | | * If the conversion was successful, a newly allocated buffer |
555 | | * containing the converted string, which must be freed with g_free(). |
556 | | * Otherwise %NULL and @error will be set. |
557 | | **/ |
558 | | gchar* |
559 | | g_convert (const gchar *str, |
560 | | gssize len, |
561 | | const gchar *to_codeset, |
562 | | const gchar *from_codeset, |
563 | | gsize *bytes_read, |
564 | | gsize *bytes_written, |
565 | | GError **error) |
566 | 156k | { |
567 | 156k | gchar *res; |
568 | 156k | GIConv cd; |
569 | | |
570 | 156k | g_return_val_if_fail (str != NULL, NULL); |
571 | 156k | g_return_val_if_fail (to_codeset != NULL, NULL); |
572 | 156k | g_return_val_if_fail (from_codeset != NULL, NULL); |
573 | | |
574 | 156k | cd = open_converter (to_codeset, from_codeset, error); |
575 | | |
576 | 156k | if (cd == (GIConv) -1) |
577 | 0 | { |
578 | 0 | if (bytes_read) |
579 | 0 | *bytes_read = 0; |
580 | | |
581 | 0 | if (bytes_written) |
582 | 0 | *bytes_written = 0; |
583 | | |
584 | 0 | return NULL; |
585 | 0 | } |
586 | | |
587 | 156k | res = g_convert_with_iconv (str, len, cd, |
588 | 156k | bytes_read, bytes_written, |
589 | 156k | error); |
590 | | |
591 | 156k | close_converter (cd); |
592 | | |
593 | 156k | return res; |
594 | 156k | } |
595 | | |
596 | | /** |
597 | | * g_convert_with_fallback: |
598 | | * @str: (array length=len) (element-type guint8): |
599 | | * the string to convert. |
600 | | * @len: the length of the string in bytes, or -1 if the string is |
601 | | * nul-terminated (Note that some encodings may allow nul |
602 | | * bytes to occur inside strings. In that case, using -1 |
603 | | * for the @len parameter is unsafe) |
604 | | * @to_codeset: name of character set into which to convert @str |
605 | | * @from_codeset: character set of @str. |
606 | | * @fallback: UTF-8 string to use in place of characters not |
607 | | * present in the target encoding. (The string must be |
608 | | * representable in the target encoding). |
609 | | * If %NULL, characters not in the target encoding will |
610 | | * be represented as Unicode escapes \uxxxx or \Uxxxxyyyy. |
611 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
612 | | * the input string that were successfully converted, or %NULL. |
613 | | * Even if the conversion was successful, this may be |
614 | | * less than @len if there were partial characters |
615 | | * at the end of the input. |
616 | | * @bytes_written: (out) (optional): the number of bytes stored in |
617 | | * the output buffer (not including the terminating nul). |
618 | | * @error: location to store the error occurring, or %NULL to ignore |
619 | | * errors. Any of the errors in #GConvertError may occur. |
620 | | * |
621 | | * Converts a string from one character set to another, possibly |
622 | | * including fallback sequences for characters not representable |
623 | | * in the output. Note that it is not guaranteed that the specification |
624 | | * for the fallback sequences in @fallback will be honored. Some |
625 | | * systems may do an approximate conversion from @from_codeset |
626 | | * to @to_codeset in their iconv() functions, |
627 | | * in which case GLib will simply return that approximate conversion. |
628 | | * |
629 | | * Note that you should use g_iconv() for streaming conversions. |
630 | | * Despite the fact that @bytes_read can return information about partial |
631 | | * characters, the g_convert_... functions are not generally suitable |
632 | | * for streaming. If the underlying converter maintains internal state, |
633 | | * then this won't be preserved across successive calls to g_convert(), |
634 | | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
635 | | * this is the GNU C converter for CP1255 which does not emit a base |
636 | | * character until it knows that the next character is not a mark that |
637 | | * could combine with the base character.) |
638 | | * |
639 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
640 | | * If the conversion was successful, a newly allocated buffer |
641 | | * containing the converted string, which must be freed with g_free(). |
642 | | * Otherwise %NULL and @error will be set. |
643 | | **/ |
644 | | gchar* |
645 | | g_convert_with_fallback (const gchar *str, |
646 | | gssize len, |
647 | | const gchar *to_codeset, |
648 | | const gchar *from_codeset, |
649 | | const gchar *fallback, |
650 | | gsize *bytes_read, |
651 | | gsize *bytes_written, |
652 | | GError **error) |
653 | 154k | { |
654 | 154k | gchar *utf8; |
655 | 154k | gchar *dest; |
656 | 154k | gchar *outp; |
657 | 154k | const gchar *insert_str = NULL; |
658 | 154k | const gchar *p; |
659 | 154k | gsize inbytes_remaining; |
660 | 154k | const gchar *save_p = NULL; |
661 | 154k | gsize save_inbytes = 0; |
662 | 154k | gsize outbytes_remaining; |
663 | 154k | gsize err; |
664 | 154k | GIConv cd; |
665 | 154k | gsize outbuf_size; |
666 | 154k | gboolean have_error = FALSE; |
667 | 154k | gboolean done = FALSE; |
668 | | |
669 | 154k | GError *local_error = NULL; |
670 | | |
671 | 154k | g_return_val_if_fail (str != NULL, NULL); |
672 | 154k | g_return_val_if_fail (to_codeset != NULL, NULL); |
673 | 154k | g_return_val_if_fail (from_codeset != NULL, NULL); |
674 | | |
675 | 154k | if (len < 0) |
676 | 0 | len = strlen (str); |
677 | | |
678 | | /* Try an exact conversion; we only proceed if this fails |
679 | | * due to an illegal sequence in the input string. |
680 | | */ |
681 | 154k | dest = g_convert (str, len, to_codeset, from_codeset, |
682 | 154k | bytes_read, bytes_written, &local_error); |
683 | 154k | if (!local_error) |
684 | 152k | return dest; |
685 | | |
686 | 154k | g_assert (dest == NULL); |
687 | | |
688 | 1.98k | if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE)) |
689 | 0 | { |
690 | 0 | g_propagate_error (error, local_error); |
691 | 0 | return NULL; |
692 | 0 | } |
693 | 1.98k | else |
694 | 1.98k | g_error_free (local_error); |
695 | | |
696 | 1.98k | local_error = NULL; |
697 | | |
698 | | /* No go; to proceed, we need a converter from "UTF-8" to |
699 | | * to_codeset, and the string as UTF-8. |
700 | | */ |
701 | 1.98k | cd = open_converter (to_codeset, "UTF-8", error); |
702 | 1.98k | if (cd == (GIConv) -1) |
703 | 0 | { |
704 | 0 | if (bytes_read) |
705 | 0 | *bytes_read = 0; |
706 | | |
707 | 0 | if (bytes_written) |
708 | 0 | *bytes_written = 0; |
709 | | |
710 | 0 | return NULL; |
711 | 0 | } |
712 | | |
713 | 1.98k | utf8 = g_convert (str, len, "UTF-8", from_codeset, |
714 | 1.98k | bytes_read, &inbytes_remaining, error); |
715 | 1.98k | if (!utf8) |
716 | 0 | { |
717 | 0 | close_converter (cd); |
718 | 0 | if (bytes_written) |
719 | 0 | *bytes_written = 0; |
720 | 0 | return NULL; |
721 | 0 | } |
722 | | |
723 | | /* Now the heart of the code. We loop through the UTF-8 string, and |
724 | | * whenever we hit an offending character, we form fallback, convert |
725 | | * the fallback to the target codeset, and then go back to |
726 | | * converting the original string after finishing with the fallback. |
727 | | * |
728 | | * The variables save_p and save_inbytes store the input state |
729 | | * for the original string while we are converting the fallback |
730 | | */ |
731 | 1.98k | p = utf8; |
732 | | |
733 | 1.98k | outbuf_size = len + NUL_TERMINATOR_LENGTH; |
734 | 1.98k | outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH; |
735 | 1.98k | outp = dest = g_malloc (outbuf_size); |
736 | | |
737 | 6.65k | while (!done && !have_error) |
738 | 4.66k | { |
739 | 4.66k | gsize inbytes_tmp = inbytes_remaining; |
740 | 4.66k | err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining); |
741 | 4.66k | inbytes_remaining = inbytes_tmp; |
742 | | |
743 | 4.66k | if (err == (gsize) -1) |
744 | 698 | { |
745 | 698 | switch (errno) |
746 | 698 | { |
747 | 0 | case EINVAL: |
748 | 0 | g_assert_not_reached(); |
749 | 0 | break; |
750 | 698 | case E2BIG: |
751 | 698 | { |
752 | 698 | gsize used = outp - dest; |
753 | | |
754 | 698 | outbuf_size *= 2; |
755 | 698 | dest = g_realloc (dest, outbuf_size); |
756 | | |
757 | 698 | outp = dest + used; |
758 | 698 | outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH; |
759 | | |
760 | 698 | break; |
761 | 0 | } |
762 | 0 | case EILSEQ: |
763 | 0 | if (save_p) |
764 | 0 | { |
765 | | /* Error converting fallback string - fatal |
766 | | */ |
767 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
768 | 0 | _("Cannot convert fallback “%s” to codeset “%s”"), |
769 | 0 | insert_str, to_codeset); |
770 | 0 | have_error = TRUE; |
771 | 0 | break; |
772 | 0 | } |
773 | 0 | else if (p) |
774 | 0 | { |
775 | 0 | if (!fallback) |
776 | 0 | { |
777 | 0 | gunichar ch = g_utf8_get_char (p); |
778 | 0 | insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x", |
779 | 0 | ch); |
780 | 0 | } |
781 | 0 | else |
782 | 0 | insert_str = fallback; |
783 | | |
784 | 0 | save_p = g_utf8_next_char (p); |
785 | 0 | save_inbytes = inbytes_remaining - (save_p - p); |
786 | 0 | p = insert_str; |
787 | 0 | inbytes_remaining = strlen (p); |
788 | 0 | break; |
789 | 0 | } |
790 | | /* if p is null */ |
791 | 0 | G_GNUC_FALLTHROUGH; |
792 | 0 | default: |
793 | 0 | { |
794 | 0 | int errsv = errno; |
795 | |
|
796 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
797 | 0 | _("Error during conversion: %s"), |
798 | 0 | g_strerror (errsv)); |
799 | 0 | } |
800 | |
|
801 | 0 | have_error = TRUE; |
802 | 0 | break; |
803 | 698 | } |
804 | 698 | } |
805 | 3.97k | else |
806 | 3.97k | { |
807 | 3.97k | if (save_p) |
808 | 0 | { |
809 | 0 | if (!fallback) |
810 | 0 | g_free ((gchar *)insert_str); |
811 | 0 | p = save_p; |
812 | 0 | inbytes_remaining = save_inbytes; |
813 | 0 | save_p = NULL; |
814 | 0 | } |
815 | 3.97k | else if (p) |
816 | 1.98k | { |
817 | | /* call g_iconv with NULL inbuf to cleanup shift state */ |
818 | 1.98k | p = NULL; |
819 | 1.98k | inbytes_remaining = 0; |
820 | 1.98k | } |
821 | 1.98k | else |
822 | 1.98k | done = TRUE; |
823 | 3.97k | } |
824 | 4.66k | } |
825 | | |
826 | | /* Cleanup |
827 | | */ |
828 | 1.98k | memset (outp, 0, NUL_TERMINATOR_LENGTH); |
829 | | |
830 | 1.98k | close_converter (cd); |
831 | | |
832 | 1.98k | if (bytes_written) |
833 | 0 | *bytes_written = outp - dest; /* Doesn't include '\0' */ |
834 | | |
835 | 1.98k | g_free (utf8); |
836 | | |
837 | 1.98k | if (have_error) |
838 | 0 | { |
839 | 0 | if (save_p && !fallback) |
840 | 0 | g_free ((gchar *)insert_str); |
841 | 0 | g_free (dest); |
842 | 0 | return NULL; |
843 | 0 | } |
844 | 1.98k | else |
845 | 1.98k | return dest; |
846 | 1.98k | } |
847 | | |
848 | | /* |
849 | | * g_locale_to_utf8 |
850 | | * |
851 | | * |
852 | | */ |
853 | | |
854 | | /* |
855 | | * Validate @string as UTF-8. @len can be negative if @string is |
856 | | * nul-terminated, or a non-negative value in bytes. If @string ends in an |
857 | | * incomplete sequence, or contains any illegal sequences or nul codepoints, |
858 | | * %NULL will be returned and the error set to |
859 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. |
860 | | * On success, @bytes_read and @bytes_written, if provided, will be set to |
861 | | * the number of bytes in @string up to @len or the terminating nul byte. |
862 | | * On error, @bytes_read will be set to the byte offset after the last valid |
863 | | * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0. |
864 | | */ |
865 | | static gchar * |
866 | | strdup_len (const gchar *string, |
867 | | gssize len, |
868 | | gsize *bytes_read, |
869 | | gsize *bytes_written, |
870 | | GError **error) |
871 | 0 | { |
872 | 0 | gsize real_len; |
873 | 0 | const gchar *end_valid; |
874 | |
|
875 | 0 | if (!g_utf8_validate (string, len, &end_valid)) |
876 | 0 | { |
877 | 0 | if (bytes_read) |
878 | 0 | *bytes_read = end_valid - string; |
879 | 0 | if (bytes_written) |
880 | 0 | *bytes_written = 0; |
881 | |
|
882 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
883 | 0 | _("Invalid byte sequence in conversion input")); |
884 | 0 | return NULL; |
885 | 0 | } |
886 | | |
887 | 0 | real_len = end_valid - string; |
888 | |
|
889 | 0 | if (bytes_read) |
890 | 0 | *bytes_read = real_len; |
891 | 0 | if (bytes_written) |
892 | 0 | *bytes_written = real_len; |
893 | |
|
894 | 0 | return g_strndup (string, real_len); |
895 | 0 | } |
896 | | |
/* Bit flags selecting which embedded-nul checks convert_checked() performs. */
typedef enum
{
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 0x1,  /* reject nul bytes in the input */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 0x2   /* reject nul bytes in the output */
} ConvertCheckFlags;
902 | | |
903 | | /* |
904 | | * Convert from @string in the encoding identified by @from_codeset, |
905 | | * returning a string in the encoding identifed by @to_codeset. |
906 | | * @len can be negative if @string is nul-terminated, or a non-negative |
907 | | * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags |
908 | | * to check the input, the output, or both, for embedded nul bytes. |
909 | | * On success, @bytes_read, if provided, will be set to the number of bytes |
910 | | * in @string up to @len or the terminating nul byte, and @bytes_written, if |
911 | | * provided, will be set to the number of output bytes written into the |
912 | | * returned buffer, excluding the terminating nul sequence. |
913 | | * On error, @bytes_read will be set to the byte offset after the last valid |
914 | | * sequence in @string, and @bytes_written will be set to 0. |
915 | | */ |
916 | | static gchar * |
917 | | convert_checked (const gchar *string, |
918 | | gssize len, |
919 | | const gchar *to_codeset, |
920 | | const gchar *from_codeset, |
921 | | ConvertCheckFlags flags, |
922 | | gsize *bytes_read, |
923 | | gsize *bytes_written, |
924 | | GError **error) |
925 | 4 | { |
926 | 4 | gchar *out; |
927 | 4 | gsize outbytes; |
928 | | |
929 | 4 | if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0) |
930 | 0 | { |
931 | 0 | const gchar *early_nul = memchr (string, '\0', len); |
932 | 0 | if (early_nul != NULL) |
933 | 0 | { |
934 | 0 | if (bytes_read) |
935 | 0 | *bytes_read = early_nul - string; |
936 | 0 | if (bytes_written) |
937 | 0 | *bytes_written = 0; |
938 | |
|
939 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
940 | 0 | _("Embedded NUL byte in conversion input")); |
941 | 0 | return NULL; |
942 | 0 | } |
943 | 0 | } |
944 | | |
945 | 4 | out = g_convert (string, len, to_codeset, from_codeset, |
946 | 4 | bytes_read, &outbytes, error); |
947 | 4 | if (out == NULL) |
948 | 0 | { |
949 | 0 | if (bytes_written) |
950 | 0 | *bytes_written = 0; |
951 | 0 | return NULL; |
952 | 0 | } |
953 | | |
954 | 4 | if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT) |
955 | 4 | && memchr (out, '\0', outbytes) != NULL) |
956 | 0 | { |
957 | 0 | g_free (out); |
958 | 0 | if (bytes_written) |
959 | 0 | *bytes_written = 0; |
960 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL, |
961 | 0 | _("Embedded NUL byte in conversion output")); |
962 | 0 | return NULL; |
963 | 0 | } |
964 | | |
965 | 4 | if (bytes_written) |
966 | 0 | *bytes_written = outbytes; |
967 | 4 | return out; |
968 | 4 | } |
969 | | |
970 | | /** |
971 | | * g_locale_to_utf8: |
972 | | * @opsysstring: (array length=len) (element-type guint8): a string in the |
973 | | * encoding of the current locale. On Windows |
974 | | * this means the system codepage. |
975 | | * @len: the length of the string, or -1 if the string is |
976 | | * nul-terminated (Note that some encodings may allow nul |
977 | | * bytes to occur inside strings. In that case, using -1 |
978 | | * for the @len parameter is unsafe) |
979 | | * @bytes_read: (out) (optional): location to store the number of bytes in the |
980 | | * input string that were successfully converted, or %NULL. |
981 | | * Even if the conversion was successful, this may be |
982 | | * less than @len if there were partial characters |
983 | | * at the end of the input. If the error |
984 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
985 | | * stored will be the byte offset after the last valid |
986 | | * input sequence. |
987 | | * @bytes_written: (out) (optional): the number of bytes stored in the output |
988 | | * buffer (not including the terminating nul). |
989 | | * @error: location to store the error occurring, or %NULL to ignore |
990 | | * errors. Any of the errors in #GConvertError may occur. |
991 | | * |
992 | | * Converts a string which is in the encoding used for strings by |
993 | | * the C runtime (usually the same as that used by the operating |
994 | | * system) in the [current locale][setlocale] into a UTF-8 string. |
995 | | * |
996 | | * If the source encoding is not UTF-8 and the conversion output contains a |
997 | | * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the |
998 | | * function returns %NULL. |
999 | | * If the source encoding is UTF-8, an embedded nul character is treated with |
1000 | | * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with |
1001 | | * earlier versions of this library. Use g_convert() to produce output that |
1002 | | * may contain embedded nul characters. |
1003 | | * |
1004 | | * Returns: (type utf8): The converted string, or %NULL on an error. |
1005 | | **/ |
1006 | | gchar * |
1007 | | g_locale_to_utf8 (const gchar *opsysstring, |
1008 | | gssize len, |
1009 | | gsize *bytes_read, |
1010 | | gsize *bytes_written, |
1011 | | GError **error) |
1012 | 4 | { |
1013 | 4 | const char *charset; |
1014 | | |
1015 | 4 | if (g_get_charset (&charset)) |
1016 | 0 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
1017 | 4 | else |
1018 | 4 | return convert_checked (opsysstring, len, "UTF-8", charset, |
1019 | 4 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1020 | 4 | bytes_read, bytes_written, error); |
1021 | 4 | } |
1022 | | |
1023 | | /* |
1024 | | * Do the exact same as g_locale_to_utf8 except that the charset would |
1025 | | * be retrieved from _g_get_time_charset (which uses LC_TIME) |
1026 | | * |
1027 | | * Returns: The converted string, or %NULL on an error. |
1028 | | */ |
1029 | | gchar * |
1030 | | _g_time_locale_to_utf8 (const gchar *opsysstring, |
1031 | | gssize len, |
1032 | | gsize *bytes_read, |
1033 | | gsize *bytes_written, |
1034 | | GError **error) |
1035 | 0 | { |
1036 | 0 | const char *charset; |
1037 | |
|
1038 | 0 | if (_g_get_time_charset (&charset)) |
1039 | 0 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
1040 | 0 | else |
1041 | 0 | return convert_checked (opsysstring, len, "UTF-8", charset, |
1042 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1043 | 0 | bytes_read, bytes_written, error); |
1044 | 0 | } |
1045 | | |
1046 | | /* |
1047 | | * Do the exact same as g_locale_to_utf8 except that the charset would |
1048 | | * be retrieved from _g_get_ctype_charset (which uses LC_CTYPE) |
1049 | | * |
1050 | | * Returns: The converted string, or %NULL on an error. |
1051 | | */ |
1052 | | gchar * |
1053 | | _g_ctype_locale_to_utf8 (const gchar *opsysstring, |
1054 | | gssize len, |
1055 | | gsize *bytes_read, |
1056 | | gsize *bytes_written, |
1057 | | GError **error) |
1058 | 0 | { |
1059 | 0 | const char *charset; |
1060 | |
|
1061 | 0 | if (_g_get_ctype_charset (&charset)) |
1062 | 0 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
1063 | 0 | else |
1064 | 0 | return convert_checked (opsysstring, len, "UTF-8", charset, |
1065 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1066 | 0 | bytes_read, bytes_written, error); |
1067 | 0 | } |
1068 | | |
1069 | | /** |
1070 | | * g_locale_from_utf8: |
1071 | | * @utf8string: a UTF-8 encoded string |
1072 | | * @len: the length of the string, or -1 if the string is |
1073 | | * nul-terminated. |
1074 | | * @bytes_read: (out) (optional): location to store the number of bytes in the |
1075 | | * input string that were successfully converted, or %NULL. |
1076 | | * Even if the conversion was successful, this may be |
1077 | | * less than @len if there were partial characters |
1078 | | * at the end of the input. If the error |
1079 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1080 | | * stored will be the byte offset after the last valid |
1081 | | * input sequence. |
1082 | | * @bytes_written: (out) (optional): the number of bytes stored in the output |
1083 | | * buffer (not including the terminating nul). |
1084 | | * @error: location to store the error occurring, or %NULL to ignore |
1085 | | * errors. Any of the errors in #GConvertError may occur. |
1086 | | * |
1087 | | * Converts a string from UTF-8 to the encoding used for strings by |
1088 | | * the C runtime (usually the same as that used by the operating |
1089 | | * system) in the [current locale][setlocale]. On Windows this means |
1090 | | * the system codepage. |
1091 | | * |
1092 | | * The input string shall not contain nul characters even if the @len |
1093 | | * argument is positive. A nul character found inside the string will result |
1094 | | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert |
1095 | | * input that may contain embedded nul characters. |
1096 | | * |
1097 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
1098 | | * A newly-allocated buffer containing the converted string, |
1099 | | * or %NULL on an error, and error will be set. |
1100 | | **/ |
1101 | | gchar * |
1102 | | g_locale_from_utf8 (const gchar *utf8string, |
1103 | | gssize len, |
1104 | | gsize *bytes_read, |
1105 | | gsize *bytes_written, |
1106 | | GError **error) |
1107 | 0 | { |
1108 | 0 | const gchar *charset; |
1109 | |
|
1110 | 0 | if (g_get_charset (&charset)) |
1111 | 0 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
1112 | 0 | else |
1113 | 0 | return convert_checked (utf8string, len, charset, "UTF-8", |
1114 | 0 | CONVERT_CHECK_NO_NULS_IN_INPUT, |
1115 | 0 | bytes_read, bytes_written, error); |
1116 | 0 | } |
1117 | | |
1118 | | #ifndef G_PLATFORM_WIN32 |
1119 | | |
1120 | | typedef struct _GFilenameCharsetCache GFilenameCharsetCache; |
1121 | | |
1122 | | struct _GFilenameCharsetCache { |
1123 | | gboolean is_utf8; |
1124 | | gchar *charset; |
1125 | | gchar **filename_charsets; |
1126 | | }; |
1127 | | |
1128 | | static void |
1129 | | filename_charset_cache_free (gpointer data) |
1130 | 0 | { |
1131 | 0 | GFilenameCharsetCache *cache = data; |
1132 | 0 | g_free (cache->charset); |
1133 | 0 | g_strfreev (cache->filename_charsets); |
1134 | 0 | g_free (cache); |
1135 | 0 | } |
1136 | | |
1137 | | /** |
1138 | | * g_get_filename_charsets: |
1139 | | * @filename_charsets: (out) (transfer none) (array zero-terminated=1): |
1140 | | * return location for the %NULL-terminated list of encoding names |
1141 | | * |
1142 | | * Determines the preferred character sets used for filenames. |
 1143 |       |  * The first character set from the @filename_charsets is the filename encoding, the |
1144 | | * subsequent character sets are used when trying to generate a displayable |
1145 | | * representation of a filename, see g_filename_display_name(). |
1146 | | * |
1147 | | * On Unix, the character sets are determined by consulting the |
1148 | | * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`. |
1149 | | * On Windows, the character set used in the GLib API is always UTF-8 |
1150 | | * and said environment variables have no effect. |
1151 | | * |
1152 | | * `G_FILENAME_ENCODING` may be set to a comma-separated list of |
1153 | | * character set names. The special token "\@locale" is taken |
1154 | | * to mean the character set for the [current locale][setlocale]. |
1155 | | * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is, |
1156 | | * the character set of the current locale is taken as the filename |
1157 | | * encoding. If neither environment variable is set, UTF-8 is taken |
1158 | | * as the filename encoding, but the character set of the current locale |
1159 | | * is also put in the list of encodings. |
1160 | | * |
 1161 |       |  * The returned @filename_charsets belong to GLib and must not be freed. |
1162 | | * |
1163 | | * Note that on Unix, regardless of the locale character set or |
1164 | | * `G_FILENAME_ENCODING` value, the actual file names present |
1165 | | * on a system might be in any random encoding or just gibberish. |
1166 | | * |
1167 | | * Returns: %TRUE if the filename encoding is UTF-8. |
1168 | | * |
1169 | | * Since: 2.6 |
1170 | | */ |
1171 | | gboolean |
1172 | | g_get_filename_charsets (const gchar ***filename_charsets) |
1173 | 1.05k | { |
1174 | 1.05k | static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free); |
1175 | 1.05k | GFilenameCharsetCache *cache = g_private_get (&cache_private); |
1176 | 1.05k | const gchar *charset; |
1177 | | |
1178 | 1.05k | if (!cache) |
1179 | 1 | cache = g_private_set_alloc0 (&cache_private, sizeof (GFilenameCharsetCache)); |
1180 | | |
1181 | 1.05k | g_get_charset (&charset); |
1182 | | |
1183 | 1.05k | if (!(cache->charset && strcmp (cache->charset, charset) == 0)) |
1184 | 1 | { |
1185 | 1 | const gchar *new_charset; |
1186 | 1 | const gchar *p; |
1187 | 1 | gint i; |
1188 | | |
1189 | 1 | g_free (cache->charset); |
1190 | 1 | g_strfreev (cache->filename_charsets); |
1191 | 1 | cache->charset = g_strdup (charset); |
1192 | | |
1193 | 1 | p = g_getenv ("G_FILENAME_ENCODING"); |
1194 | 1 | if (p != NULL && p[0] != '\0') |
1195 | 0 | { |
1196 | 0 | cache->filename_charsets = g_strsplit (p, ",", 0); |
1197 | 0 | cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0); |
1198 | |
|
1199 | 0 | for (i = 0; cache->filename_charsets[i]; i++) |
1200 | 0 | { |
1201 | 0 | if (strcmp ("@locale", cache->filename_charsets[i]) == 0) |
1202 | 0 | { |
1203 | 0 | g_get_charset (&new_charset); |
1204 | 0 | g_free (cache->filename_charsets[i]); |
1205 | 0 | cache->filename_charsets[i] = g_strdup (new_charset); |
1206 | 0 | } |
1207 | 0 | } |
1208 | 0 | } |
1209 | 1 | else if (g_getenv ("G_BROKEN_FILENAMES") != NULL) |
1210 | 0 | { |
1211 | 0 | cache->filename_charsets = g_new0 (gchar *, 2); |
1212 | 0 | cache->is_utf8 = g_get_charset (&new_charset); |
1213 | 0 | cache->filename_charsets[0] = g_strdup (new_charset); |
1214 | 0 | } |
1215 | 1 | else |
1216 | 1 | { |
1217 | 1 | cache->filename_charsets = g_new0 (gchar *, 3); |
1218 | 1 | cache->is_utf8 = TRUE; |
1219 | 1 | cache->filename_charsets[0] = g_strdup ("UTF-8"); |
1220 | 1 | if (!g_get_charset (&new_charset)) |
1221 | 1 | cache->filename_charsets[1] = g_strdup (new_charset); |
1222 | 1 | } |
1223 | 1 | } |
1224 | | |
1225 | 1.05k | if (filename_charsets) |
1226 | 1.05k | *filename_charsets = (const gchar **)cache->filename_charsets; |
1227 | | |
1228 | 1.05k | return cache->is_utf8; |
1229 | 1.05k | } |
1230 | | |
1231 | | #else /* G_PLATFORM_WIN32 */ |
1232 | | |
1233 | | gboolean |
1234 | | g_get_filename_charsets (const gchar ***filename_charsets) |
1235 | | { |
1236 | | static const gchar *charsets[] = { |
1237 | | "UTF-8", |
1238 | | NULL |
1239 | | }; |
1240 | | |
1241 | | #ifdef G_OS_WIN32 |
1242 | | /* On Windows GLib pretends that the filename charset is UTF-8 */ |
1243 | | if (filename_charsets) |
1244 | | *filename_charsets = charsets; |
1245 | | |
1246 | | return TRUE; |
1247 | | #else |
1248 | | gboolean result; |
1249 | | |
1250 | | /* Cygwin works like before */ |
1251 | | result = g_get_charset (&(charsets[0])); |
1252 | | |
1253 | | if (filename_charsets) |
1254 | | *filename_charsets = charsets; |
1255 | | |
1256 | | return result; |
1257 | | #endif |
1258 | | } |
1259 | | |
1260 | | #endif /* G_PLATFORM_WIN32 */ |
1261 | | |
1262 | | static gboolean |
1263 | | get_filename_charset (const gchar **filename_charset) |
1264 | 0 | { |
1265 | 0 | const gchar **charsets; |
1266 | 0 | gboolean is_utf8; |
1267 | | |
1268 | 0 | is_utf8 = g_get_filename_charsets (&charsets); |
1269 | |
|
1270 | 0 | if (filename_charset) |
1271 | 0 | *filename_charset = charsets[0]; |
1272 | | |
1273 | 0 | return is_utf8; |
1274 | 0 | } |
1275 | | |
1276 | | /** |
1277 | | * g_filename_to_utf8: |
1278 | | * @opsysstring: (type filename): a string in the encoding for filenames |
1279 | | * @len: the length of the string, or -1 if the string is |
1280 | | * nul-terminated (Note that some encodings may allow nul |
1281 | | * bytes to occur inside strings. In that case, using -1 |
1282 | | * for the @len parameter is unsafe) |
1283 | | * @bytes_read: (out) (optional): location to store the number of bytes in the |
1284 | | * input string that were successfully converted, or %NULL. |
1285 | | * Even if the conversion was successful, this may be |
1286 | | * less than @len if there were partial characters |
1287 | | * at the end of the input. If the error |
1288 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1289 | | * stored will be the byte offset after the last valid |
1290 | | * input sequence. |
1291 | | * @bytes_written: (out) (optional): the number of bytes stored in the output |
1292 | | * buffer (not including the terminating nul). |
1293 | | * @error: location to store the error occurring, or %NULL to ignore |
1294 | | * errors. Any of the errors in #GConvertError may occur. |
1295 | | * |
1296 | | * Converts a string which is in the encoding used by GLib for |
1297 | | * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8 |
1298 | | * for filenames; on other platforms, this function indirectly depends on |
1299 | | * the [current locale][setlocale]. |
1300 | | * |
1301 | | * The input string shall not contain nul characters even if the @len |
1302 | | * argument is positive. A nul character found inside the string will result |
1303 | | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. |
1304 | | * If the source encoding is not UTF-8 and the conversion output contains a |
1305 | | * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the |
1306 | | * function returns %NULL. Use g_convert() to produce output that |
1307 | | * may contain embedded nul characters. |
1308 | | * |
1309 | | * Returns: (type utf8): The converted string, or %NULL on an error. |
1310 | | **/ |
1311 | | gchar* |
1312 | | g_filename_to_utf8 (const gchar *opsysstring, |
1313 | | gssize len, |
1314 | | gsize *bytes_read, |
1315 | | gsize *bytes_written, |
1316 | | GError **error) |
1317 | 0 | { |
1318 | 0 | const gchar *charset; |
1319 | |
|
1320 | 0 | g_return_val_if_fail (opsysstring != NULL, NULL); |
1321 | | |
1322 | 0 | if (get_filename_charset (&charset)) |
1323 | 0 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
1324 | 0 | else |
1325 | 0 | return convert_checked (opsysstring, len, "UTF-8", charset, |
1326 | 0 | CONVERT_CHECK_NO_NULS_IN_INPUT | |
1327 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1328 | 0 | bytes_read, bytes_written, error); |
1329 | 0 | } |
1330 | | |
1331 | | /** |
1332 | | * g_filename_from_utf8: |
1333 | | * @utf8string: (type utf8): a UTF-8 encoded string. |
1334 | | * @len: the length of the string, or -1 if the string is |
1335 | | * nul-terminated. |
1336 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
1337 | | * the input string that were successfully converted, or %NULL. |
1338 | | * Even if the conversion was successful, this may be |
1339 | | * less than @len if there were partial characters |
1340 | | * at the end of the input. If the error |
1341 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1342 | | * stored will be the byte offset after the last valid |
1343 | | * input sequence. |
1344 | | * @bytes_written: (out) (optional): the number of bytes stored in |
1345 | | * the output buffer (not including the terminating nul). |
1346 | | * @error: location to store the error occurring, or %NULL to ignore |
1347 | | * errors. Any of the errors in #GConvertError may occur. |
1348 | | * |
1349 | | * Converts a string from UTF-8 to the encoding GLib uses for |
1350 | | * filenames. Note that on Windows GLib uses UTF-8 for filenames; |
1351 | | * on other platforms, this function indirectly depends on the |
1352 | | * [current locale][setlocale]. |
1353 | | * |
1354 | | * The input string shall not contain nul characters even if the @len |
1355 | | * argument is positive. A nul character found inside the string will result |
1356 | | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is |
1357 | | * not UTF-8 and the conversion output contains a nul character, the error |
1358 | | * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL. |
1359 | | * |
1360 | | * Returns: (type filename): |
1361 | | * The converted string, or %NULL on an error. |
1362 | | **/ |
1363 | | gchar* |
1364 | | g_filename_from_utf8 (const gchar *utf8string, |
1365 | | gssize len, |
1366 | | gsize *bytes_read, |
1367 | | gsize *bytes_written, |
1368 | | GError **error) |
1369 | 0 | { |
1370 | 0 | const gchar *charset; |
1371 | |
|
1372 | 0 | if (get_filename_charset (&charset)) |
1373 | 0 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
1374 | 0 | else |
1375 | 0 | return convert_checked (utf8string, len, charset, "UTF-8", |
1376 | 0 | CONVERT_CHECK_NO_NULS_IN_INPUT | |
1377 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1378 | 0 | bytes_read, bytes_written, error); |
1379 | 0 | } |
1380 | | |
1381 | | /* Test of haystack has the needle prefix, comparing case |
1382 | | * insensitive. haystack may be UTF-8, but needle must |
1383 | | * contain only ascii. */ |
1384 | | static gboolean |
1385 | | has_case_prefix (const gchar *haystack, const gchar *needle) |
1386 | 0 | { |
1387 | 0 | const gchar *h, *n; |
1388 | | |
1389 | | /* Eat one character at a time. */ |
1390 | 0 | h = haystack; |
1391 | 0 | n = needle; |
1392 | |
|
1393 | 0 | while (*n && *h && |
1394 | 0 | g_ascii_tolower (*n) == g_ascii_tolower (*h)) |
1395 | 0 | { |
1396 | 0 | n++; |
1397 | 0 | h++; |
1398 | 0 | } |
1399 | | |
1400 | 0 | return *n == '\0'; |
1401 | 0 | } |
1402 | | |
/* Escaping policies understood by g_escape_uri_string(). Each value is a
 * single bit that is tested against the entries of the acceptable[] table. */
typedef enum {
  UNSAFE_ALL        = 1 << 0,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 1 << 1,  /* Allows '+'  */
  UNSAFE_PATH       = 1 << 3,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 1 << 4,  /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 1 << 5   /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
1410 | | |
1411 | | static const guchar acceptable[96] = { |
1412 | | /* A table of the ASCII chars from space (32) to DEL (127) */ |
1413 | | /* ! " # $ % & ' ( ) * + , - . / */ |
1414 | | 0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C, |
1415 | | /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ |
1416 | | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20, |
1417 | | /* @ A B C D E F G H I J K L M N O */ |
1418 | | 0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, |
1419 | | /* P Q R S T U V W X Y Z [ \ ] ^ _ */ |
1420 | | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F, |
1421 | | /* ` a b c d e f g h i j k l m n o */ |
1422 | | 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, |
1423 | | /* p q r s t u v w x y z { | } ~ DEL */ |
1424 | | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20 |
1425 | | }; |
1426 | | |
1427 | | static const gchar hex[] = "0123456789ABCDEF"; |
1428 | | |
1429 | | /* Note: This escape function works on file: URIs, but if you want to |
1430 | | * escape something else, please read RFC-2396 */ |
1431 | | static gchar * |
1432 | | g_escape_uri_string (const gchar *string, |
1433 | | UnsafeCharacterSet mask) |
1434 | 0 | { |
1435 | 0 | #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask)) |
1436 | |
|
1437 | 0 | const gchar *p; |
1438 | 0 | gchar *q; |
1439 | 0 | gchar *result; |
1440 | 0 | int c; |
1441 | 0 | gint unacceptable; |
1442 | 0 | UnsafeCharacterSet use_mask; |
1443 | | |
1444 | 0 | g_return_val_if_fail (mask == UNSAFE_ALL |
1445 | 0 | || mask == UNSAFE_ALLOW_PLUS |
1446 | 0 | || mask == UNSAFE_PATH |
1447 | 0 | || mask == UNSAFE_HOST |
1448 | 0 | || mask == UNSAFE_SLASHES, NULL); |
1449 | | |
1450 | 0 | unacceptable = 0; |
1451 | 0 | use_mask = mask; |
1452 | 0 | for (p = string; *p != '\0'; p++) |
1453 | 0 | { |
1454 | 0 | c = (guchar) *p; |
1455 | 0 | if (!ACCEPTABLE (c)) |
1456 | 0 | unacceptable++; |
1457 | 0 | } |
1458 | | |
1459 | 0 | result = g_malloc (p - string + unacceptable * 2 + 1); |
1460 | | |
1461 | 0 | use_mask = mask; |
1462 | 0 | for (q = result, p = string; *p != '\0'; p++) |
1463 | 0 | { |
1464 | 0 | c = (guchar) *p; |
1465 | | |
1466 | 0 | if (!ACCEPTABLE (c)) |
1467 | 0 | { |
1468 | 0 | *q++ = '%'; /* means hex coming */ |
1469 | 0 | *q++ = hex[c >> 4]; |
1470 | 0 | *q++ = hex[c & 15]; |
1471 | 0 | } |
1472 | 0 | else |
1473 | 0 | *q++ = *p; |
1474 | 0 | } |
1475 | | |
1476 | 0 | *q = '\0'; |
1477 | | |
1478 | 0 | return result; |
1479 | 0 | } |
1480 | | |
1481 | | |
1482 | | static gchar * |
1483 | | g_escape_file_uri (const gchar *hostname, |
1484 | | const gchar *pathname) |
1485 | 0 | { |
1486 | 0 | char *escaped_hostname = NULL; |
1487 | 0 | char *escaped_path; |
1488 | 0 | char *res; |
1489 | |
|
1490 | | #ifdef G_OS_WIN32 |
1491 | | char *p, *backslash; |
1492 | | |
1493 | | /* Turn backslashes into forward slashes. That's what Netscape |
1494 | | * does, and they are actually more or less equivalent in Windows. |
1495 | | */ |
1496 | | |
1497 | | pathname = g_strdup (pathname); |
1498 | | p = (char *) pathname; |
1499 | | |
1500 | | while ((backslash = strchr (p, '\\')) != NULL) |
1501 | | { |
1502 | | *backslash = '/'; |
1503 | | p = backslash + 1; |
1504 | | } |
1505 | | #endif |
1506 | |
|
1507 | 0 | if (hostname && *hostname != '\0') |
1508 | 0 | { |
1509 | 0 | escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST); |
1510 | 0 | } |
1511 | |
|
1512 | 0 | escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH); |
1513 | |
|
1514 | 0 | res = g_strconcat ("file://", |
1515 | 0 | (escaped_hostname) ? escaped_hostname : "", |
1516 | 0 | (*escaped_path != '/') ? "/" : "", |
1517 | 0 | escaped_path, |
1518 | 0 | NULL); |
1519 | |
|
1520 | | #ifdef G_OS_WIN32 |
1521 | | g_free ((char *) pathname); |
1522 | | #endif |
1523 | |
|
1524 | 0 | g_free (escaped_hostname); |
1525 | 0 | g_free (escaped_path); |
1526 | | |
1527 | 0 | return res; |
1528 | 0 | } |
1529 | | |
/* Decode the two hexadecimal digits at scanner[0] and scanner[1] into a
 * byte value. Returns -1 when either character is not a hex digit;
 * scanner[1] is not examined if scanner[0] is invalid. */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);

  return (low_nibble < 0) ? -1 : ((high_nibble << 4) | low_nibble);
}
1546 | | |
1547 | | static gchar * |
1548 | | g_unescape_uri_string (const char *escaped, |
1549 | | int len, |
1550 | | const char *illegal_escaped_characters, |
1551 | | gboolean ascii_must_not_be_escaped) |
1552 | 0 | { |
1553 | 0 | const gchar *in, *in_end; |
1554 | 0 | gchar *out, *result; |
1555 | 0 | int c; |
1556 | | |
1557 | 0 | if (escaped == NULL) |
1558 | 0 | return NULL; |
1559 | | |
1560 | 0 | if (len < 0) |
1561 | 0 | len = strlen (escaped); |
1562 | |
|
1563 | 0 | result = g_malloc (len + 1); |
1564 | | |
1565 | 0 | out = result; |
1566 | 0 | for (in = escaped, in_end = escaped + len; in < in_end; in++) |
1567 | 0 | { |
1568 | 0 | c = *in; |
1569 | |
|
1570 | 0 | if (c == '%') |
1571 | 0 | { |
1572 | | /* catch partial escape sequences past the end of the substring */ |
1573 | 0 | if (in + 3 > in_end) |
1574 | 0 | break; |
1575 | | |
1576 | 0 | c = unescape_character (in + 1); |
1577 | | |
1578 | | /* catch bad escape sequences and NUL characters */ |
1579 | 0 | if (c <= 0) |
1580 | 0 | break; |
1581 | | |
1582 | | /* catch escaped ASCII */ |
1583 | 0 | if (ascii_must_not_be_escaped && c <= 0x7F) |
1584 | 0 | break; |
1585 | | |
1586 | | /* catch other illegal escaped characters */ |
1587 | 0 | if (strchr (illegal_escaped_characters, c) != NULL) |
1588 | 0 | break; |
1589 | | |
1590 | 0 | in += 2; |
1591 | 0 | } |
1592 | | |
1593 | 0 | *out++ = c; |
1594 | 0 | } |
1595 | | |
1596 | 0 | g_assert (out - result <= len); |
1597 | 0 | *out = '\0'; |
1598 | |
|
1599 | 0 | if (in != in_end) |
1600 | 0 | { |
1601 | 0 | g_free (result); |
1602 | 0 | return NULL; |
1603 | 0 | } |
1604 | | |
1605 | 0 | return result; |
1606 | 0 | } |
1607 | | |
1608 | | static gboolean |
1609 | | is_asciialphanum (gunichar c) |
1610 | 0 | { |
1611 | 0 | return c <= 0x7F && g_ascii_isalnum (c); |
1612 | 0 | } |
1613 | | |
1614 | | static gboolean |
1615 | | is_asciialpha (gunichar c) |
1616 | 0 | { |
1617 | 0 | return c <= 0x7F && g_ascii_isalpha (c); |
1618 | 0 | } |
1619 | | |
1620 | | /* allows an empty string */ |
1621 | | static gboolean |
1622 | | hostname_validate (const char *hostname) |
1623 | 0 | { |
1624 | 0 | const char *p; |
1625 | 0 | gunichar c, first_char, last_char; |
1626 | |
|
1627 | 0 | p = hostname; |
1628 | 0 | if (*p == '\0') |
1629 | 0 | return TRUE; |
1630 | 0 | do |
1631 | 0 | { |
1632 | | /* read in a label */ |
1633 | 0 | c = g_utf8_get_char (p); |
1634 | 0 | p = g_utf8_next_char (p); |
1635 | 0 | if (!is_asciialphanum (c)) |
1636 | 0 | return FALSE; |
1637 | 0 | first_char = c; |
1638 | 0 | do |
1639 | 0 | { |
1640 | 0 | last_char = c; |
1641 | 0 | c = g_utf8_get_char (p); |
1642 | 0 | p = g_utf8_next_char (p); |
1643 | 0 | } |
1644 | 0 | while (is_asciialphanum (c) || c == '-'); |
1645 | 0 | if (last_char == '-') |
1646 | 0 | return FALSE; |
1647 | | |
1648 | | /* if that was the last label, check that it was a toplabel */ |
1649 | 0 | if (c == '\0' || (c == '.' && *p == '\0')) |
1650 | 0 | return is_asciialpha (first_char); |
1651 | 0 | } |
1652 | 0 | while (c == '.'); |
1653 | 0 | return FALSE; |
1654 | 0 | } |
1655 | | |
1656 | | /** |
1657 | | * g_filename_from_uri: |
1658 | | * @uri: a uri describing a filename (escaped, encoded in ASCII). |
1659 | | * @hostname: (out) (optional) (nullable): Location to store hostname for the URI. |
1660 | | * If there is no hostname in the URI, %NULL will be |
1661 | | * stored in this location. |
1662 | | * @error: location to store the error occurring, or %NULL to ignore |
1663 | | * errors. Any of the errors in #GConvertError may occur. |
1664 | | * |
1665 | | * Converts an escaped ASCII-encoded URI to a local filename in the |
1666 | | * encoding used for filenames. |
1667 | | * |
1668 | | * Returns: (type filename): a newly-allocated string holding |
1669 | | * the resulting filename, or %NULL on an error. |
1670 | | **/ |
1671 | | gchar * |
1672 | | g_filename_from_uri (const gchar *uri, |
1673 | | gchar **hostname, |
1674 | | GError **error) |
1675 | 0 | { |
1676 | 0 | const char *path_part; |
1677 | 0 | const char *host_part; |
1678 | 0 | char *unescaped_hostname; |
1679 | 0 | char *result; |
1680 | 0 | char *filename; |
1681 | 0 | int offs; |
1682 | | #ifdef G_OS_WIN32 |
1683 | | char *p, *slash; |
1684 | | #endif |
1685 | |
|
1686 | 0 | if (hostname) |
1687 | 0 | *hostname = NULL; |
1688 | |
|
1689 | 0 | if (!has_case_prefix (uri, "file:/")) |
1690 | 0 | { |
1691 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1692 | 0 | _("The URI “%s” is not an absolute URI using the “file” scheme"), |
1693 | 0 | uri); |
1694 | 0 | return NULL; |
1695 | 0 | } |
1696 | | |
1697 | 0 | path_part = uri + strlen ("file:"); |
1698 | | |
1699 | 0 | if (strchr (path_part, '#') != NULL) |
1700 | 0 | { |
1701 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1702 | 0 | _("The local file URI “%s” may not include a “#”"), |
1703 | 0 | uri); |
1704 | 0 | return NULL; |
1705 | 0 | } |
1706 | | |
1707 | 0 | if (has_case_prefix (path_part, "///")) |
1708 | 0 | path_part += 2; |
1709 | 0 | else if (has_case_prefix (path_part, "//")) |
1710 | 0 | { |
1711 | 0 | path_part += 2; |
1712 | 0 | host_part = path_part; |
1713 | |
|
1714 | 0 | path_part = strchr (path_part, '/'); |
1715 | |
|
1716 | 0 | if (path_part == NULL) |
1717 | 0 | { |
1718 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1719 | 0 | _("The URI “%s” is invalid"), |
1720 | 0 | uri); |
1721 | 0 | return NULL; |
1722 | 0 | } |
1723 | | |
1724 | 0 | unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE); |
1725 | |
|
1726 | 0 | if (unescaped_hostname == NULL || |
1727 | 0 | !hostname_validate (unescaped_hostname)) |
1728 | 0 | { |
1729 | 0 | g_free (unescaped_hostname); |
1730 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1731 | 0 | _("The hostname of the URI “%s” is invalid"), |
1732 | 0 | uri); |
1733 | 0 | return NULL; |
1734 | 0 | } |
1735 | | |
1736 | 0 | if (hostname) |
1737 | 0 | *hostname = unescaped_hostname; |
1738 | 0 | else |
1739 | 0 | g_free (unescaped_hostname); |
1740 | 0 | } |
1741 | | |
1742 | 0 | filename = g_unescape_uri_string (path_part, -1, "/", FALSE); |
1743 | |
|
1744 | 0 | if (filename == NULL) |
1745 | 0 | { |
1746 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1747 | 0 | _("The URI “%s” contains invalidly escaped characters"), |
1748 | 0 | uri); |
1749 | 0 | return NULL; |
1750 | 0 | } |
1751 | | |
1752 | 0 | offs = 0; |
1753 | | #ifdef G_OS_WIN32 |
1754 | | /* Drop localhost */ |
1755 | | if (hostname && *hostname != NULL && |
1756 | | g_ascii_strcasecmp (*hostname, "localhost") == 0) |
1757 | | { |
1758 | | g_free (*hostname); |
1759 | | *hostname = NULL; |
1760 | | } |
1761 | | |
1762 | | /* Turn slashes into backslashes, because that's the canonical spelling */ |
1763 | | p = filename; |
1764 | | while ((slash = strchr (p, '/')) != NULL) |
1765 | | { |
1766 | | *slash = '\\'; |
1767 | | p = slash + 1; |
1768 | | } |
1769 | | |
1770 | | /* Windows URIs with a drive letter can be like "file://host/c:/foo" |
1771 | | * or "file://host/c|/foo" (some Netscape versions). In those cases, start |
1772 | | * the filename from the drive letter. |
1773 | | */ |
1774 | | if (g_ascii_isalpha (filename[1])) |
1775 | | { |
1776 | | if (filename[2] == ':') |
1777 | | offs = 1; |
1778 | | else if (filename[2] == '|') |
1779 | | { |
1780 | | filename[2] = ':'; |
1781 | | offs = 1; |
1782 | | } |
1783 | | } |
1784 | | #endif |
1785 | |
|
1786 | 0 | result = g_strdup (filename + offs); |
1787 | 0 | g_free (filename); |
1788 | |
|
1789 | 0 | return result; |
1790 | 0 | } |
1791 | | |
1792 | | /** |
1793 | | * g_filename_to_uri: |
1794 | | * @filename: (type filename): an absolute filename specified in the GLib file |
1795 | | * name encoding, which is the on-disk file name bytes on Unix, and UTF-8 |
1796 | | * on Windows |
1797 | | * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none. |
1798 | | * @error: location to store the error occurring, or %NULL to ignore |
1799 | | * errors. Any of the errors in #GConvertError may occur. |
1800 | | * |
1801 | | * Converts an absolute filename to an escaped ASCII-encoded URI, with the path |
1802 | | * component following Section 3.3. of RFC 2396. |
1803 | | * |
1804 | | * Returns: a newly-allocated string holding the resulting |
1805 | | * URI, or %NULL on an error. |
1806 | | **/ |
1807 | | gchar * |
1808 | | g_filename_to_uri (const gchar *filename, |
1809 | | const gchar *hostname, |
1810 | | GError **error) |
1811 | 0 | { |
1812 | 0 | char *escaped_uri; |
1813 | |
|
1814 | 0 | g_return_val_if_fail (filename != NULL, NULL); |
1815 | | |
1816 | 0 | if (!g_path_is_absolute (filename)) |
1817 | 0 | { |
1818 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH, |
1819 | 0 | _("The pathname “%s” is not an absolute path"), |
1820 | 0 | filename); |
1821 | 0 | return NULL; |
1822 | 0 | } |
1823 | | |
1824 | 0 | if (hostname && |
1825 | 0 | !(g_utf8_validate (hostname, -1, NULL) |
1826 | 0 | && hostname_validate (hostname))) |
1827 | 0 | { |
1828 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
1829 | 0 | _("Invalid hostname")); |
1830 | 0 | return NULL; |
1831 | 0 | } |
1832 | | |
1833 | | #ifdef G_OS_WIN32 |
1834 | | /* Don't use localhost unnecessarily */ |
1835 | | if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0) |
1836 | | hostname = NULL; |
1837 | | #endif |
1838 | | |
1839 | 0 | escaped_uri = g_escape_file_uri (hostname, filename); |
1840 | |
|
1841 | 0 | return escaped_uri; |
1842 | 0 | } |
1843 | | |
1844 | | /** |
1845 | | * g_uri_list_extract_uris: |
1846 | | * @uri_list: an URI list |
1847 | | * |
1848 | | * Splits an URI list conforming to the text/uri-list |
1849 | | * mime type defined in RFC 2483 into individual URIs, |
1850 | | * discarding any comments. The URIs are not validated. |
1851 | | * |
1852 | | * Returns: (transfer full): a newly allocated %NULL-terminated list |
1853 | | * of strings holding the individual URIs. The array should be freed |
1854 | | * with g_strfreev(). |
1855 | | * |
1856 | | * Since: 2.6 |
1857 | | */ |
1858 | | gchar ** |
1859 | | g_uri_list_extract_uris (const gchar *uri_list) |
1860 | 0 | { |
1861 | 0 | GPtrArray *uris; |
1862 | 0 | const gchar *p, *q; |
1863 | |
|
1864 | 0 | uris = g_ptr_array_new (); |
1865 | |
|
1866 | 0 | p = uri_list; |
1867 | | |
1868 | | /* We don't actually try to validate the URI according to RFC |
1869 | | * 2396, or even check for allowed characters - we just ignore |
1870 | | * comments and trim whitespace off the ends. We also |
1871 | | * allow LF delimination as well as the specified CRLF. |
1872 | | * |
1873 | | * We do allow comments like specified in RFC 2483. |
1874 | | */ |
1875 | 0 | while (p) |
1876 | 0 | { |
1877 | 0 | if (*p != '#') |
1878 | 0 | { |
1879 | 0 | while (g_ascii_isspace (*p)) |
1880 | 0 | p++; |
1881 | |
|
1882 | 0 | q = p; |
1883 | 0 | while (*q && (*q != '\n') && (*q != '\r')) |
1884 | 0 | q++; |
1885 | |
|
1886 | 0 | if (q > p) |
1887 | 0 | { |
1888 | 0 | q--; |
1889 | 0 | while (q > p && g_ascii_isspace (*q)) |
1890 | 0 | q--; |
1891 | |
|
1892 | 0 | if (q > p) |
1893 | 0 | g_ptr_array_add (uris, g_strndup (p, q - p + 1)); |
1894 | 0 | } |
1895 | 0 | } |
1896 | 0 | p = strchr (p, '\n'); |
1897 | 0 | if (p) |
1898 | 0 | p++; |
1899 | 0 | } |
1900 | |
|
1901 | 0 | g_ptr_array_add (uris, NULL); |
1902 | |
|
1903 | 0 | return (gchar **) g_ptr_array_free (uris, FALSE); |
1904 | 0 | } |
1905 | | |
1906 | | /** |
1907 | | * g_filename_display_basename: |
1908 | | * @filename: (type filename): an absolute pathname in the |
1909 | | * GLib file name encoding |
1910 | | * |
1911 | | * Returns the display basename for the particular filename, guaranteed |
1912 | | * to be valid UTF-8. The display name might not be identical to the filename, |
1913 | | * for instance there might be problems converting it to UTF-8, and some files |
1914 | | * can be translated in the display. |
1915 | | * |
1916 | | * If GLib cannot make sense of the encoding of @filename, as a last resort it |
1917 | | * replaces unknown characters with U+FFFD, the Unicode replacement character. |
1918 | | * You can search the result for the UTF-8 encoding of this character (which is |
1919 | | * "\357\277\275" in octal notation) to find out if @filename was in an invalid |
1920 | | * encoding. |
1921 | | * |
1922 | | * You must pass the whole absolute pathname to this functions so that |
1923 | | * translation of well known locations can be done. |
1924 | | * |
1925 | | * This function is preferred over g_filename_display_name() if you know the |
1926 | | * whole path, as it allows translation. |
1927 | | * |
1928 | | * Returns: a newly allocated string containing |
1929 | | * a rendition of the basename of the filename in valid UTF-8 |
1930 | | * |
1931 | | * Since: 2.6 |
1932 | | **/ |
1933 | | gchar * |
1934 | | g_filename_display_basename (const gchar *filename) |
1935 | 0 | { |
1936 | 0 | char *basename; |
1937 | 0 | char *display_name; |
1938 | |
|
1939 | 0 | g_return_val_if_fail (filename != NULL, NULL); |
1940 | | |
1941 | 0 | basename = g_path_get_basename (filename); |
1942 | 0 | display_name = g_filename_display_name (basename); |
1943 | 0 | g_free (basename); |
1944 | 0 | return display_name; |
1945 | 0 | } |
1946 | | |
1947 | | /** |
1948 | | * g_filename_display_name: |
1949 | | * @filename: (type filename): a pathname hopefully in the |
1950 | | * GLib file name encoding |
1951 | | * |
1952 | | * Converts a filename into a valid UTF-8 string. The conversion is |
1953 | | * not necessarily reversible, so you should keep the original around |
1954 | | * and use the return value of this function only for display purposes. |
1955 | | * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL |
1956 | | * even if the filename actually isn't in the GLib file name encoding. |
1957 | | * |
1958 | | * If GLib cannot make sense of the encoding of @filename, as a last resort it |
1959 | | * replaces unknown characters with U+FFFD, the Unicode replacement character. |
1960 | | * You can search the result for the UTF-8 encoding of this character (which is |
1961 | | * "\357\277\275" in octal notation) to find out if @filename was in an invalid |
1962 | | * encoding. |
1963 | | * |
1964 | | * If you know the whole pathname of the file you should use |
1965 | | * g_filename_display_basename(), since that allows location-based |
1966 | | * translation of filenames. |
1967 | | * |
1968 | | * Returns: a newly allocated string containing |
1969 | | * a rendition of the filename in valid UTF-8 |
1970 | | * |
1971 | | * Since: 2.6 |
1972 | | **/ |
1973 | | gchar * |
1974 | | g_filename_display_name (const gchar *filename) |
1975 | 1.05k | { |
1976 | 1.05k | gint i; |
1977 | 1.05k | const gchar **charsets; |
1978 | 1.05k | gchar *display_name = NULL; |
1979 | 1.05k | gboolean is_utf8; |
1980 | | |
1981 | 1.05k | is_utf8 = g_get_filename_charsets (&charsets); |
1982 | | |
1983 | 1.05k | if (is_utf8) |
1984 | 1.05k | { |
1985 | 1.05k | if (g_utf8_validate (filename, -1, NULL)) |
1986 | 1.05k | display_name = g_strdup (filename); |
1987 | 1.05k | } |
1988 | | |
1989 | 1.05k | if (!display_name) |
1990 | 0 | { |
1991 | | /* Try to convert from the filename charsets to UTF-8. |
1992 | | * Skip the first charset if it is UTF-8. |
1993 | | */ |
1994 | 0 | for (i = is_utf8 ? 1 : 0; charsets[i]; i++) |
1995 | 0 | { |
1996 | 0 | display_name = g_convert (filename, -1, "UTF-8", charsets[i], |
1997 | 0 | NULL, NULL, NULL); |
1998 | |
|
1999 | 0 | if (display_name) |
2000 | 0 | break; |
2001 | 0 | } |
2002 | 0 | } |
2003 | | |
2004 | | /* if all conversions failed, we replace invalid UTF-8 |
2005 | | * by a question mark |
2006 | | */ |
2007 | 1.05k | if (!display_name) |
2008 | 0 | display_name = g_utf8_make_valid (filename, -1); |
2009 | | |
2010 | 1.05k | return display_name; |
2011 | 1.05k | } |
2012 | | |
#ifdef G_OS_WIN32

/* Binary compatibility versions. Not for newly compiled code.
 *
 * Each *_utf8 symbol below simply forwards its arguments, unchanged, to
 * the function of the same name without the suffix.  Every wrapper is
 * declared immediately before its definition.
 */

_GLIB_EXTERN gchar *g_filename_to_utf8_utf8   (const gchar  *opsysstring,
                                               gssize        len,
                                               gsize        *bytes_read,
                                               gsize        *bytes_written,
                                               GError      **error) G_GNUC_MALLOC;

gchar *
g_filename_to_utf8_utf8 (const gchar *opsysstring,
                         gssize       len,
                         gsize       *bytes_read,
                         gsize       *bytes_written,
                         GError     **error)
{
  gchar *converted;

  converted = g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error);

  return converted;
}

_GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar  *utf8string,
                                               gssize        len,
                                               gsize        *bytes_read,
                                               gsize        *bytes_written,
                                               GError      **error) G_GNUC_MALLOC;

gchar *
g_filename_from_utf8_utf8 (const gchar *utf8string,
                           gssize       len,
                           gsize       *bytes_read,
                           gsize       *bytes_written,
                           GError     **error)
{
  gchar *converted;

  converted = g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error);

  return converted;
}

_GLIB_EXTERN gchar *g_filename_from_uri_utf8  (const gchar  *uri,
                                               gchar       **hostname,
                                               GError      **error) G_GNUC_MALLOC;

gchar *
g_filename_from_uri_utf8 (const gchar *uri,
                          gchar      **hostname,
                          GError     **error)
{
  gchar *local_path;

  local_path = g_filename_from_uri (uri, hostname, error);

  return local_path;
}

_GLIB_EXTERN gchar *g_filename_to_uri_utf8    (const gchar  *filename,
                                               const gchar  *hostname,
                                               GError      **error) G_GNUC_MALLOC;

gchar *
g_filename_to_uri_utf8 (const gchar *filename,
                        const gchar *hostname,
                        GError     **error)
{
  gchar *file_uri;

  file_uri = g_filename_to_uri (filename, hostname, error);

  return file_uri;
}

#endif