/src/glib-2.80.0/glib/gconvert.c
Line | Count | Source |
1 | | /* GLIB - Library of useful routines for C programming |
2 | | * |
3 | | * gconvert.c: Convert between character sets using iconv |
4 | | * Copyright Red Hat Inc., 2000 |
5 | | * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com> |
6 | | * |
7 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
8 | | * |
9 | | * This library is free software; you can redistribute it and/or |
10 | | * modify it under the terms of the GNU Lesser General Public |
11 | | * License as published by the Free Software Foundation; either |
12 | | * version 2.1 of the License, or (at your option) any later version. |
13 | | * |
14 | | * This library is distributed in the hope that it will be useful, |
15 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 | | * Lesser General Public License for more details. |
18 | | * |
19 | | * You should have received a copy of the GNU Lesser General Public |
20 | | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
21 | | */ |
22 | | |
23 | | #include "config.h" |
24 | | #include "glibconfig.h" |
25 | | |
26 | | #ifndef G_OS_WIN32 |
27 | | #include <iconv.h> |
28 | | #endif |
29 | | #include <errno.h> |
30 | | #include <stdio.h> |
31 | | #include <string.h> |
32 | | #include <stdlib.h> |
33 | | |
34 | | #ifdef G_OS_WIN32 |
35 | | #include "win_iconv.c" |
36 | | #endif |
37 | | |
38 | | #ifdef G_PLATFORM_WIN32 |
39 | | #define STRICT |
40 | | #include <windows.h> |
41 | | #undef STRICT |
42 | | #endif |
43 | | |
44 | | #include "gconvert.h" |
45 | | #include "gconvertprivate.h" |
46 | | |
47 | | #include "gcharsetprivate.h" |
48 | | #include "gslist.h" |
49 | | #include "gstrfuncs.h" |
50 | | #include "gtestutils.h" |
51 | | #include "gthread.h" |
52 | | #include "gthreadprivate.h" |
53 | | #include "gunicode.h" |
54 | | #include "gfileutils.h" |
55 | | #include "genviron.h" |
56 | | |
57 | | #include "glibintl.h" |
58 | | |
59 | | /* We try to terminate strings in unknown charsets with this many zero bytes |
60 | | * to ensure that multibyte strings really are nul-terminated when we return |
61 | | * them from g_convert() and friends. |
62 | | */ |
63 | 59.7k | #define NUL_TERMINATOR_LENGTH 4 |
64 | | |
65 | | G_DEFINE_QUARK (g_convert_error, g_convert_error) |
66 | | |
67 | | static gboolean |
68 | | try_conversion (const char *to_codeset, |
69 | | const char *from_codeset, |
70 | | iconv_t *cd) |
71 | 16.5k | { |
72 | 16.5k | *cd = iconv_open (to_codeset, from_codeset); |
73 | | |
74 | 16.5k | if (*cd == (iconv_t)-1 && errno == EINVAL) |
75 | 0 | return FALSE; |
76 | | |
77 | | #if defined(__FreeBSD__) && defined(ICONV_SET_ILSEQ_INVALID) |
78 | | /* On FreeBSD request GNU iconv compatible handling of characters that cannot |
79 | | * be repesented in the destination character set. |
80 | | * See https://cgit.freebsd.org/src/commit/?id=7c5b23111c5fd1992047922d4247c4a1ce1bb6c3 |
81 | | */ |
82 | | int value = 1; |
83 | | if (iconvctl (*cd, ICONV_SET_ILSEQ_INVALID, &value) != 0) |
84 | | return FALSE; |
85 | | #endif |
86 | 16.5k | return TRUE; |
87 | 16.5k | } |
88 | | |
89 | | static gboolean |
90 | | try_to_aliases (const char **to_aliases, |
91 | | const char *from_codeset, |
92 | | iconv_t *cd) |
93 | 0 | { |
94 | 0 | if (to_aliases) |
95 | 0 | { |
96 | 0 | const char **p = to_aliases; |
97 | 0 | while (*p) |
98 | 0 | { |
99 | 0 | if (try_conversion (*p, from_codeset, cd)) |
100 | 0 | return TRUE; |
101 | | |
102 | 0 | p++; |
103 | 0 | } |
104 | 0 | } |
105 | | |
106 | 0 | return FALSE; |
107 | 0 | } |
108 | | |
109 | | /** |
110 | | * g_iconv_open: (skip) |
111 | | * @to_codeset: destination codeset |
112 | | * @from_codeset: source codeset |
113 | | * |
114 | | * Same as the standard UNIX routine iconv_open(), but |
115 | | * may be implemented via libiconv on UNIX flavors that lack |
116 | | * a native implementation. |
117 | | * |
118 | | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
119 | | * more convenient than the raw iconv wrappers. |
120 | | * |
121 | | * Returns: a "conversion descriptor", or (GIConv)-1 if |
122 | | * opening the converter failed. |
123 | | **/ |
124 | | GIConv |
125 | | g_iconv_open (const gchar *to_codeset, |
126 | | const gchar *from_codeset) |
127 | 16.5k | { |
128 | 16.5k | iconv_t cd; |
129 | | |
130 | 16.5k | if (!try_conversion (to_codeset, from_codeset, &cd)) |
131 | 0 | { |
132 | 0 | const char **to_aliases = _g_charset_get_aliases (to_codeset); |
133 | 0 | const char **from_aliases = _g_charset_get_aliases (from_codeset); |
134 | |
|
135 | 0 | if (from_aliases) |
136 | 0 | { |
137 | 0 | const char **p = from_aliases; |
138 | 0 | while (*p) |
139 | 0 | { |
140 | 0 | if (try_conversion (to_codeset, *p, &cd)) |
141 | 0 | goto out; |
142 | | |
143 | 0 | if (try_to_aliases (to_aliases, *p, &cd)) |
144 | 0 | goto out; |
145 | | |
146 | 0 | p++; |
147 | 0 | } |
148 | 0 | } |
149 | | |
150 | 0 | if (try_to_aliases (to_aliases, from_codeset, &cd)) |
151 | 0 | goto out; |
152 | 0 | } |
153 | | |
154 | 16.5k | out: |
155 | 16.5k | return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd; |
156 | 16.5k | } |
157 | | |
158 | | /** |
159 | | * g_iconv: (skip) |
160 | | * @converter: conversion descriptor from g_iconv_open() |
161 | | * @inbuf: bytes to convert |
162 | | * @inbytes_left: (inout): inout parameter, bytes remaining to convert in @inbuf |
163 | | * @outbuf: converted output bytes |
164 | | * @outbytes_left: (inout): inout parameter, bytes available to fill in @outbuf |
165 | | * |
166 | | * Same as the standard UNIX routine iconv(), but |
167 | | * may be implemented via libiconv on UNIX flavors that lack |
168 | | * a native implementation. |
169 | | * |
170 | | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
171 | | * more convenient than the raw iconv wrappers. |
172 | | * |
173 | | * Note that the behaviour of iconv() for characters which are valid in the |
174 | | * input character set, but which have no representation in the output character |
175 | | * set, is implementation defined. This function may return success (with a |
176 | | * positive number of non-reversible conversions as replacement characters were |
177 | | * used), or it may return -1 and set an error such as %EILSEQ, in such a |
178 | | * situation. |
179 | | * |
180 | | * Returns: count of non-reversible conversions, or -1 on error |
181 | | **/ |
182 | | gsize |
183 | | g_iconv (GIConv converter, |
184 | | gchar **inbuf, |
185 | | gsize *inbytes_left, |
186 | | gchar **outbuf, |
187 | | gsize *outbytes_left) |
188 | 36.2k | { |
189 | 36.2k | iconv_t cd = (iconv_t)converter; |
190 | | |
191 | 36.2k | return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left); |
192 | 36.2k | } |
193 | | |
194 | | /** |
195 | | * g_iconv_close: (skip) |
196 | | * @converter: a conversion descriptor from g_iconv_open() |
197 | | * |
198 | | * Same as the standard UNIX routine iconv_close(), but |
199 | | * may be implemented via libiconv on UNIX flavors that lack |
200 | | * a native implementation. Should be called to clean up |
201 | | * the conversion descriptor from g_iconv_open() when |
202 | | * you are done converting things. |
203 | | * |
204 | | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
205 | | * more convenient than the raw iconv wrappers. |
206 | | * |
207 | | * Returns: -1 on error, 0 on success |
208 | | **/ |
209 | | gint |
210 | | g_iconv_close (GIConv converter) |
211 | 16.5k | { |
212 | 16.5k | iconv_t cd = (iconv_t)converter; |
213 | | |
214 | 16.5k | return iconv_close (cd); |
215 | 16.5k | } |
216 | | |
217 | | static GIConv |
218 | | open_converter (const gchar *to_codeset, |
219 | | const gchar *from_codeset, |
220 | | GError **error) |
221 | 16.5k | { |
222 | 16.5k | GIConv cd; |
223 | | |
224 | 16.5k | cd = g_iconv_open (to_codeset, from_codeset); |
225 | | |
226 | 16.5k | if (cd == (GIConv) -1) |
227 | 0 | { |
228 | | /* Something went wrong. */ |
229 | 0 | if (error) |
230 | 0 | { |
231 | 0 | if (errno == EINVAL) |
232 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION, |
233 | 0 | _("Conversion from character set “%s” to “%s” is not supported"), |
234 | 0 | from_codeset, to_codeset); |
235 | 0 | else |
236 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
237 | 0 | _("Could not open converter from “%s” to “%s”"), |
238 | 0 | from_codeset, to_codeset); |
239 | 0 | } |
240 | 0 | } |
241 | | |
242 | 16.5k | return cd; |
243 | 16.5k | } |
244 | | |
245 | | static int |
246 | | close_converter (GIConv cd) |
247 | 16.5k | { |
248 | 16.5k | if (cd == (GIConv) -1) |
249 | 0 | return 0; |
250 | | |
251 | 16.5k | return g_iconv_close (cd); |
252 | 16.5k | } |
253 | | |
254 | | /** |
255 | | * g_convert_with_iconv: (skip) |
256 | | * @str: (array length=len) (element-type guint8): |
257 | | * the string to convert. |
258 | | * @len: the length of the string in bytes, or -1 if the string is |
259 | | * nul-terminated (Note that some encodings may allow nul |
260 | | * bytes to occur inside strings. In that case, using -1 |
261 | | * for the @len parameter is unsafe) |
262 | | * @converter: conversion descriptor from g_iconv_open() |
263 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
264 | | * the input string that were successfully converted, or %NULL. |
265 | | * Even if the conversion was successful, this may be |
266 | | * less than @len if there were partial characters |
267 | | * at the end of the input. If the error |
268 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
269 | | * stored will be the byte offset after the last valid |
270 | | * input sequence. |
271 | | * @bytes_written: (out) (optional): the number of bytes stored in |
272 | | * the output buffer (not including the terminating nul). |
273 | | * @error: location to store the error occurring, or %NULL to ignore |
274 | | * errors. Any of the errors in #GConvertError may occur. |
275 | | * |
276 | | * Converts a string from one character set to another. |
277 | | * |
278 | | * Note that you should use g_iconv() for streaming conversions. |
279 | | * Despite the fact that @bytes_read can return information about partial |
280 | | * characters, the g_convert_... functions are not generally suitable |
281 | | * for streaming. If the underlying converter maintains internal state, |
282 | | * then this won't be preserved across successive calls to g_convert(), |
283 | | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
284 | | * this is the GNU C converter for CP1255 which does not emit a base |
285 | | * character until it knows that the next character is not a mark that |
286 | | * could combine with the base character.) |
287 | | * |
288 | | * Characters which are valid in the input character set, but which have no |
289 | | * representation in the output character set will result in a |
290 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv() |
291 | | * specification, which leaves this behaviour implementation defined. Note that |
292 | | * this is the same error code as is returned for an invalid byte sequence in |
293 | | * the input character set. To get defined behaviour for conversion of |
294 | | * unrepresentable characters, use g_convert_with_fallback(). |
295 | | * |
296 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
297 | | * If the conversion was successful, a newly allocated buffer |
298 | | * containing the converted string, which must be freed with |
299 | | * g_free(). Otherwise %NULL and @error will be set. |
300 | | **/ |
gchar*
g_convert_with_iconv (const gchar *str,
                      gssize       len,
                      GIConv       converter,
                      gsize       *bytes_read,
                      gsize       *bytes_written,
                      GError     **error)
{
  gchar *dest;                 /* start of the output buffer */
  gchar *outp;                 /* current write position inside dest */
  const gchar *p;              /* current read position inside str */
  gsize inbytes_remaining;
  gsize outbytes_remaining;
  gsize err;
  gsize outbuf_size;
  gboolean have_error = FALSE;
  gboolean done = FALSE;
  gboolean reset = FALSE;      /* TRUE once all input has been consumed */

  g_return_val_if_fail (converter != (GIConv) -1, NULL);

  if (len < 0)
    len = strlen (str);

  p = str;
  inbytes_remaining = len;
  outbuf_size = len + NUL_TERMINATOR_LENGTH;

  /* The last NUL_TERMINATOR_LENGTH bytes of the buffer are never offered to
   * g_iconv(); they are kept free for the terminator written after the loop.
   */
  outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
  outp = dest = g_malloc (outbuf_size);

  /* Each pass either converts input (reset == FALSE) or makes the final
   * NULL-inbuf call (reset == TRUE). An E2BIG failure grows the buffer and
   * repeats the same kind of call on the next pass.
   */
  while (!done && !have_error)
    {
      if (reset)
        err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
      else
        err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);

      if (err == (gsize) -1)
        {
          switch (errno)
            {
            case EINVAL:
              /* Incomplete text, do not report an error */
              done = TRUE;
              break;
            case E2BIG:
              {
                /* Output buffer full: double it and re-derive the write
                 * pointer, since g_realloc() may have moved the buffer.
                 */
                gsize used = outp - dest;

                outbuf_size *= 2;
                dest = g_realloc (dest, outbuf_size);

                outp = dest + used;
                outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
              }
              break;
            case EILSEQ:
              g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                                   _("Invalid byte sequence in conversion input"));
              have_error = TRUE;
              break;
            default:
              {
                /* Copy errno before making further calls. */
                int errsv = errno;

                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
                             _("Error during conversion: %s"),
                             g_strerror (errsv));
              }
              have_error = TRUE;
              break;
            }
        }
      else if (err > 0)
        {
          /* @err gives the number of replacement characters used. */
          g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                               _("Unrepresentable character in conversion input"));
          have_error = TRUE;
        }
      else
        {
          if (!reset)
            {
              /* call g_iconv with NULL inbuf to cleanup shift state */
              reset = TRUE;
              inbytes_remaining = 0;
            }
          else
            done = TRUE;
        }
    }

  /* Terminate with NUL_TERMINATOR_LENGTH zero bytes, using the space that was
   * held back from outbytes_remaining.
   */
  memset (outp, 0, NUL_TERMINATOR_LENGTH);

  if (bytes_read)
    *bytes_read = p - str;
  else
    {
      /* The caller cannot learn how much input was consumed, so leftover
       * input is turned into an error here (unless one was already set).
       */
      if ((p - str) != len)
        {
          if (!have_error)
            {
              g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
                                   _("Partial character sequence at end of input"));
              have_error = TRUE;
            }
        }
    }

  /* Note: this is stored on the error path as well. */
  if (bytes_written)
    *bytes_written = outp - dest;	/* Doesn't include '\0' */

  if (have_error)
    {
      g_free (dest);
      return NULL;
    }
  else
    return dest;
}
423 | | |
424 | | /** |
425 | | * g_convert: |
426 | | * @str: (array length=len) (element-type guint8): |
427 | | * the string to convert. |
428 | | * @len: the length of the string in bytes, or -1 if the string is |
429 | | * nul-terminated (Note that some encodings may allow nul |
430 | | * bytes to occur inside strings. In that case, using -1 |
431 | | * for the @len parameter is unsafe) |
432 | | * @to_codeset: name of character set into which to convert @str |
433 | | * @from_codeset: character set of @str. |
434 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
435 | | * the input string that were successfully converted, or %NULL. |
436 | | * Even if the conversion was successful, this may be |
437 | | * less than @len if there were partial characters |
438 | | * at the end of the input. If the error |
439 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
440 | | * stored will be the byte offset after the last valid |
441 | | * input sequence. |
442 | | * @bytes_written: (out) (optional): the number of bytes stored in |
443 | | * the output buffer (not including the terminating nul). |
444 | | * @error: location to store the error occurring, or %NULL to ignore |
445 | | * errors. Any of the errors in #GConvertError may occur. |
446 | | * |
447 | | * Converts a string from one character set to another. |
448 | | * |
449 | | * Note that you should use g_iconv() for streaming conversions. |
450 | | * Despite the fact that @bytes_read can return information about partial |
451 | | * characters, the g_convert_... functions are not generally suitable |
452 | | * for streaming. If the underlying converter maintains internal state, |
453 | | * then this won't be preserved across successive calls to g_convert(), |
454 | | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
455 | | * this is the GNU C converter for CP1255 which does not emit a base |
456 | | * character until it knows that the next character is not a mark that |
457 | | * could combine with the base character.) |
458 | | * |
459 | | * Using extensions such as "//TRANSLIT" may not work (or may not work |
460 | | * well) on many platforms. Consider using g_str_to_ascii() instead. |
461 | | * |
462 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
463 | | * If the conversion was successful, a newly allocated buffer |
464 | | * containing the converted string, which must be freed with g_free(). |
465 | | * Otherwise %NULL and @error will be set. |
466 | | **/ |
467 | | gchar* |
468 | | g_convert (const gchar *str, |
469 | | gssize len, |
470 | | const gchar *to_codeset, |
471 | | const gchar *from_codeset, |
472 | | gsize *bytes_read, |
473 | | gsize *bytes_written, |
474 | | GError **error) |
475 | 16.5k | { |
476 | 16.5k | gchar *res; |
477 | 16.5k | GIConv cd; |
478 | | |
479 | 16.5k | g_return_val_if_fail (str != NULL, NULL); |
480 | 16.5k | g_return_val_if_fail (to_codeset != NULL, NULL); |
481 | 16.5k | g_return_val_if_fail (from_codeset != NULL, NULL); |
482 | | |
483 | 16.5k | cd = open_converter (to_codeset, from_codeset, error); |
484 | | |
485 | 16.5k | if (cd == (GIConv) -1) |
486 | 0 | { |
487 | 0 | if (bytes_read) |
488 | 0 | *bytes_read = 0; |
489 | | |
490 | 0 | if (bytes_written) |
491 | 0 | *bytes_written = 0; |
492 | | |
493 | 0 | return NULL; |
494 | 0 | } |
495 | | |
496 | 16.5k | res = g_convert_with_iconv (str, len, cd, |
497 | 16.5k | bytes_read, bytes_written, |
498 | 16.5k | error); |
499 | | |
500 | 16.5k | close_converter (cd); |
501 | | |
502 | 16.5k | return res; |
503 | 16.5k | } |
504 | | |
505 | | /** |
506 | | * g_convert_with_fallback: |
507 | | * @str: (array length=len) (element-type guint8): |
508 | | * the string to convert. |
509 | | * @len: the length of the string in bytes, or -1 if the string is |
510 | | * nul-terminated (Note that some encodings may allow nul |
511 | | * bytes to occur inside strings. In that case, using -1 |
512 | | * for the @len parameter is unsafe) |
513 | | * @to_codeset: name of character set into which to convert @str |
514 | | * @from_codeset: character set of @str. |
515 | | * @fallback: UTF-8 string to use in place of characters not |
516 | | * present in the target encoding. (The string must be |
517 | | * representable in the target encoding). |
518 | | * If %NULL, characters not in the target encoding will |
519 | | * be represented as Unicode escapes \uxxxx or \Uxxxxyyyy. |
520 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
521 | | * the input string that were successfully converted, or %NULL. |
522 | | * Even if the conversion was successful, this may be |
523 | | * less than @len if there were partial characters |
524 | | * at the end of the input. |
525 | | * @bytes_written: (out) (optional): the number of bytes stored in |
526 | | * the output buffer (not including the terminating nul). |
527 | | * @error: location to store the error occurring, or %NULL to ignore |
528 | | * errors. Any of the errors in #GConvertError may occur. |
529 | | * |
530 | | * Converts a string from one character set to another, possibly |
531 | | * including fallback sequences for characters not representable |
532 | | * in the output. Note that it is not guaranteed that the specification |
533 | | * for the fallback sequences in @fallback will be honored. Some |
534 | | * systems may do an approximate conversion from @from_codeset |
535 | | * to @to_codeset in their iconv() functions, |
536 | | * in which case GLib will simply return that approximate conversion. |
537 | | * |
538 | | * Note that you should use g_iconv() for streaming conversions. |
539 | | * Despite the fact that @bytes_read can return information about partial |
540 | | * characters, the g_convert_... functions are not generally suitable |
541 | | * for streaming. If the underlying converter maintains internal state, |
542 | | * then this won't be preserved across successive calls to g_convert(), |
543 | | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
544 | | * this is the GNU C converter for CP1255 which does not emit a base |
545 | | * character until it knows that the next character is not a mark that |
546 | | * could combine with the base character.) |
547 | | * |
548 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
549 | | * If the conversion was successful, a newly allocated buffer |
550 | | * containing the converted string, which must be freed with g_free(). |
551 | | * Otherwise %NULL and @error will be set. |
552 | | **/ |
gchar*
g_convert_with_fallback (const gchar *str,
                         gssize       len,
                         const gchar *to_codeset,
                         const gchar *from_codeset,
                         const gchar *fallback,
                         gsize       *bytes_read,
                         gsize       *bytes_written,
                         GError     **error)
{
  gchar *utf8;                     /* @str converted to UTF-8 */
  gchar *dest;                     /* start of the output buffer */
  gchar *outp;                     /* current write position inside dest */
  const gchar *insert_str = NULL;  /* fallback text currently being converted */
  const gchar *p;                  /* current read position (utf8 or insert_str) */
  gsize inbytes_remaining;
  const gchar *save_p = NULL;      /* non-NULL while a fallback is being converted */
  gsize save_inbytes = 0;
  gsize outbytes_remaining;
  gsize err;
  GIConv cd;
  gsize outbuf_size;
  gboolean have_error = FALSE;
  gboolean done = FALSE;

  GError *local_error = NULL;

  g_return_val_if_fail (str != NULL, NULL);
  g_return_val_if_fail (to_codeset != NULL, NULL);
  g_return_val_if_fail (from_codeset != NULL, NULL);

  if (len < 0)
    len = strlen (str);

  /* Try an exact conversion; we only proceed if this fails
   * due to an illegal sequence in the input string.
   */
  dest = g_convert (str, len, to_codeset, from_codeset,
                    bytes_read, bytes_written, &local_error);
  if (!local_error)
    return dest;

  g_assert (dest == NULL);

  if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
    {
      g_propagate_error (error, local_error);
      return NULL;
    }
  else
    g_error_free (local_error);

  local_error = NULL;

  /* No go; to proceed, we need a converter from "UTF-8" to
   * to_codeset, and the string as UTF-8.
   */
  cd = open_converter (to_codeset, "UTF-8", error);
  if (cd == (GIConv) -1)
    {
      if (bytes_read)
        *bytes_read = 0;

      if (bytes_written)
        *bytes_written = 0;

      return NULL;
    }

  /* The number of UTF-8 bytes produced here becomes the input length for the
   * loop below.
   */
  utf8 = g_convert (str, len, "UTF-8", from_codeset,
                    bytes_read, &inbytes_remaining, error);
  if (!utf8)
    {
      close_converter (cd);
      if (bytes_written)
        *bytes_written = 0;
      return NULL;
    }

  /* Now the heart of the code. We loop through the UTF-8 string, and
   * whenever we hit an offending character, we form fallback, convert
   * the fallback to the target codeset, and then go back to
   * converting the original string after finishing with the fallback.
   *
   * The variables save_p and save_inbytes store the input state
   * for the original string while we are converting the fallback
   */
  p = utf8;

  /* As in g_convert_with_iconv(), NUL_TERMINATOR_LENGTH bytes are held back
   * from the converter for the terminator written after the loop.
   */
  outbuf_size = len + NUL_TERMINATOR_LENGTH;
  outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
  outp = dest = g_malloc (outbuf_size);

  while (!done && !have_error)
    {
      gsize inbytes_tmp = inbytes_remaining;
      err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
      inbytes_remaining = inbytes_tmp;

      if (err == (gsize) -1)
        {
          switch (errno)
            {
            case EINVAL:
              g_assert_not_reached();
              break;
            case E2BIG:
              {
                /* Output buffer full: double it and re-derive the write
                 * pointer, since g_realloc() may have moved the buffer.
                 */
                gsize used = outp - dest;

                outbuf_size *= 2;
                dest = g_realloc (dest, outbuf_size);

                outp = dest + used;
                outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;

                break;
              }
            case EILSEQ:
              if (save_p)
                {
                  /* Error converting fallback string - fatal
                   */
                  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                               _("Cannot convert fallback “%s” to codeset “%s”"),
                               insert_str, to_codeset);
                  have_error = TRUE;
                  break;
                }
              else if (p)
                {
                  if (!fallback)
                    {
                      /* No caller-supplied fallback: build an escape for the
                       * offending character. This string is heap-allocated
                       * and is freed once converted, or during cleanup.
                       */
                      gunichar ch = g_utf8_get_char (p);
                      insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
                                                    ch);
                    }
                  else
                    insert_str = fallback;

                  /* Remember where to resume in the UTF-8 string (just past
                   * the offending character), then switch the input over to
                   * the fallback text.
                   */
                  save_p = g_utf8_next_char (p);
                  save_inbytes = inbytes_remaining - (save_p - p);
                  p = insert_str;
                  inbytes_remaining = strlen (p);
                  break;
                }
              /* if p is null */
              G_GNUC_FALLTHROUGH;
            default:
              {
                /* Copy errno before making further calls. */
                int errsv = errno;

                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
                             _("Error during conversion: %s"),
                             g_strerror (errsv));
              }

              have_error = TRUE;
              break;
            }
        }
      else
        {
          if (save_p)
            {
              /* The fallback has been fully converted: release it if it was
               * generated here and resume the original UTF-8 input.
               */
              if (!fallback)
                g_free ((gchar *)insert_str);
              p = save_p;
              inbytes_remaining = save_inbytes;
              save_p = NULL;
            }
          else if (p)
            {
              /* call g_iconv with NULL inbuf to cleanup shift state */
              p = NULL;
              inbytes_remaining = 0;
            }
          else
            done = TRUE;
        }
    }

  /* Cleanup
   */
  memset (outp, 0, NUL_TERMINATOR_LENGTH);

  close_converter (cd);

  /* Note: this is stored on the error path as well. */
  if (bytes_written)
    *bytes_written = outp - dest;	/* Doesn't include '\0' */

  g_free (utf8);

  if (have_error)
    {
      /* A generated fallback is still owned here if the loop stopped while
       * it was being converted.
       */
      if (save_p && !fallback)
        g_free ((gchar *)insert_str);
      g_free (dest);
      return NULL;
    }
  else
    return dest;
}
756 | | |
757 | | /* |
758 | | * g_locale_to_utf8 |
759 | | * |
760 | | * |
761 | | */ |
762 | | |
763 | | /* |
764 | | * Validate @string as UTF-8. @len can be negative if @string is |
765 | | * nul-terminated, or a non-negative value in bytes. If @string ends in an |
766 | | * incomplete sequence, or contains any illegal sequences or nul codepoints, |
767 | | * %NULL will be returned and the error set to |
768 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. |
769 | | * On success, @bytes_read and @bytes_written, if provided, will be set to |
770 | | * the number of bytes in @string up to @len or the terminating nul byte. |
771 | | * On error, @bytes_read will be set to the byte offset after the last valid |
772 | | * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0. |
773 | | */ |
774 | | static gchar * |
775 | | strdup_len (const gchar *string, |
776 | | gssize len, |
777 | | gsize *bytes_read, |
778 | | gsize *bytes_written, |
779 | | GError **error) |
780 | 0 | { |
781 | 0 | gsize real_len; |
782 | 0 | const gchar *end_valid; |
783 | |
|
784 | 0 | if (!g_utf8_validate (string, len, &end_valid)) |
785 | 0 | { |
786 | 0 | if (bytes_read) |
787 | 0 | *bytes_read = end_valid - string; |
788 | 0 | if (bytes_written) |
789 | 0 | *bytes_written = 0; |
790 | |
|
791 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
792 | 0 | _("Invalid byte sequence in conversion input")); |
793 | 0 | return NULL; |
794 | 0 | } |
795 | | |
796 | 0 | real_len = end_valid - string; |
797 | |
|
798 | 0 | if (bytes_read) |
799 | 0 | *bytes_read = real_len; |
800 | 0 | if (bytes_written) |
801 | 0 | *bytes_written = real_len; |
802 | |
|
803 | 0 | return g_strndup (string, real_len); |
804 | 0 | } |
805 | | |
/* Bit flags selecting which embedded-nul checks convert_checked() performs. */
typedef enum
{
  /* Reject input that contains a nul byte within its given length. */
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 0x01,
  /* Reject converted output that contains a nul byte. */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 0x02
} ConvertCheckFlags;
811 | | |
812 | | /* |
813 | | * Convert from @string in the encoding identified by @from_codeset, |
814 | | * returning a string in the encoding identified by @to_codeset. |
815 | | * @len can be negative if @string is nul-terminated, or a non-negative |
816 | | * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags |
817 | | * to check the input, the output, or both, for embedded nul bytes. |
818 | | * On success, @bytes_read, if provided, will be set to the number of bytes |
819 | | * in @string up to @len or the terminating nul byte, and @bytes_written, if |
820 | | * provided, will be set to the number of output bytes written into the |
821 | | * returned buffer, excluding the terminating nul sequence. |
822 | | * On error, @bytes_read will be set to the byte offset after the last valid |
823 | | * sequence in @string, and @bytes_written will be set to 0. |
824 | | */ |
825 | | static gchar * |
826 | | convert_checked (const gchar *string, |
827 | | gssize len, |
828 | | const gchar *to_codeset, |
829 | | const gchar *from_codeset, |
830 | | ConvertCheckFlags flags, |
831 | | gsize *bytes_read, |
832 | | gsize *bytes_written, |
833 | | GError **error) |
834 | 0 | { |
835 | 0 | gchar *out; |
836 | 0 | gsize outbytes; |
837 | |
|
838 | 0 | if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0) |
839 | 0 | { |
840 | 0 | const gchar *early_nul = memchr (string, '\0', len); |
841 | 0 | if (early_nul != NULL) |
842 | 0 | { |
843 | 0 | if (bytes_read) |
844 | 0 | *bytes_read = early_nul - string; |
845 | 0 | if (bytes_written) |
846 | 0 | *bytes_written = 0; |
847 | |
|
848 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
849 | 0 | _("Embedded NUL byte in conversion input")); |
850 | 0 | return NULL; |
851 | 0 | } |
852 | 0 | } |
853 | | |
854 | 0 | out = g_convert (string, len, to_codeset, from_codeset, |
855 | 0 | bytes_read, &outbytes, error); |
856 | 0 | if (out == NULL) |
857 | 0 | { |
858 | 0 | if (bytes_written) |
859 | 0 | *bytes_written = 0; |
860 | 0 | return NULL; |
861 | 0 | } |
862 | | |
863 | 0 | if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT) |
864 | 0 | && memchr (out, '\0', outbytes) != NULL) |
865 | 0 | { |
866 | 0 | g_free (out); |
867 | 0 | if (bytes_written) |
868 | 0 | *bytes_written = 0; |
869 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL, |
870 | 0 | _("Embedded NUL byte in conversion output")); |
871 | 0 | return NULL; |
872 | 0 | } |
873 | | |
874 | 0 | if (bytes_written) |
875 | 0 | *bytes_written = outbytes; |
876 | 0 | return out; |
877 | 0 | } |
878 | | |
879 | | /** |
880 | | * g_locale_to_utf8: |
881 | | * @opsysstring: (array length=len) (element-type guint8): a string in the |
882 | | * encoding of the current locale. On Windows |
883 | | * this means the system codepage. |
884 | | * @len: the length of the string, or -1 if the string is |
885 | | * nul-terminated (Note that some encodings may allow nul |
886 | | * bytes to occur inside strings. In that case, using -1 |
887 | | * for the @len parameter is unsafe) |
888 | | * @bytes_read: (out) (optional): location to store the number of bytes in the |
889 | | * input string that were successfully converted, or %NULL. |
890 | | * Even if the conversion was successful, this may be |
891 | | * less than @len if there were partial characters |
892 | | * at the end of the input. If the error |
893 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
894 | | * stored will be the byte offset after the last valid |
895 | | * input sequence. |
896 | | * @bytes_written: (out) (optional): the number of bytes stored in the output |
897 | | * buffer (not including the terminating nul). |
898 | | * @error: location to store the error occurring, or %NULL to ignore |
899 | | * errors. Any of the errors in #GConvertError may occur. |
900 | | * |
901 | | * Converts a string which is in the encoding used for strings by |
902 | | * the C runtime (usually the same as that used by the operating |
903 | | * system) in the [current locale][setlocale] into a UTF-8 string. |
904 | | * |
905 | | * If the source encoding is not UTF-8 and the conversion output contains a |
906 | | * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the |
907 | | * function returns %NULL. |
908 | | * If the source encoding is UTF-8, an embedded nul character is treated with |
909 | | * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with |
910 | | * earlier versions of this library. Use g_convert() to produce output that |
911 | | * may contain embedded nul characters. |
912 | | * |
913 | | * Returns: (type utf8): The converted string, or %NULL on an error. |
914 | | **/ |
915 | | gchar * |
916 | | g_locale_to_utf8 (const gchar *opsysstring, |
917 | | gssize len, |
918 | | gsize *bytes_read, |
919 | | gsize *bytes_written, |
920 | | GError **error) |
921 | 0 | { |
922 | 0 | const char *charset; |
923 | |
|
924 | 0 | if (g_get_charset (&charset)) |
925 | 0 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
926 | 0 | else |
927 | 0 | return convert_checked (opsysstring, len, "UTF-8", charset, |
928 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
929 | 0 | bytes_read, bytes_written, error); |
930 | 0 | } |
931 | | |
932 | | /* |
933 | | * Do the exact same as g_locale_to_utf8 except that the charset would |
934 | | * be retrieved from _g_get_time_charset (which uses LC_TIME) |
935 | | * |
936 | | * Returns: The converted string, or %NULL on an error. |
937 | | */ |
938 | | gchar * |
939 | | _g_time_locale_to_utf8 (const gchar *opsysstring, |
940 | | gssize len, |
941 | | gsize *bytes_read, |
942 | | gsize *bytes_written, |
943 | | GError **error) |
944 | 0 | { |
945 | 0 | const char *charset; |
946 | |
|
947 | 0 | if (_g_get_time_charset (&charset)) |
948 | 0 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
949 | 0 | else |
950 | 0 | return convert_checked (opsysstring, len, "UTF-8", charset, |
951 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
952 | 0 | bytes_read, bytes_written, error); |
953 | 0 | } |
954 | | |
955 | | /* |
956 | | * Do the exact same as g_locale_to_utf8 except that the charset would |
957 | | * be retrieved from _g_get_ctype_charset (which uses LC_CTYPE) |
958 | | * |
959 | | * Returns: The converted string, or %NULL on an error. |
960 | | */ |
961 | | gchar * |
962 | | _g_ctype_locale_to_utf8 (const gchar *opsysstring, |
963 | | gssize len, |
964 | | gsize *bytes_read, |
965 | | gsize *bytes_written, |
966 | | GError **error) |
967 | 0 | { |
968 | 0 | const char *charset; |
969 | |
|
970 | 0 | if (_g_get_ctype_charset (&charset)) |
971 | 0 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
972 | 0 | else |
973 | 0 | return convert_checked (opsysstring, len, "UTF-8", charset, |
974 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
975 | 0 | bytes_read, bytes_written, error); |
976 | 0 | } |
977 | | |
978 | | /** |
979 | | * g_locale_from_utf8: |
980 | | * @utf8string: a UTF-8 encoded string |
981 | | * @len: the length of the string, or -1 if the string is |
982 | | * nul-terminated. |
983 | | * @bytes_read: (out) (optional): location to store the number of bytes in the |
984 | | * input string that were successfully converted, or %NULL. |
985 | | * Even if the conversion was successful, this may be |
986 | | * less than @len if there were partial characters |
987 | | * at the end of the input. If the error |
988 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
989 | | * stored will be the byte offset after the last valid |
990 | | * input sequence. |
991 | | * @bytes_written: (out) (optional): the number of bytes stored in the output |
992 | | * buffer (not including the terminating nul). |
993 | | * @error: location to store the error occurring, or %NULL to ignore |
994 | | * errors. Any of the errors in #GConvertError may occur. |
995 | | * |
996 | | * Converts a string from UTF-8 to the encoding used for strings by |
997 | | * the C runtime (usually the same as that used by the operating |
998 | | * system) in the [current locale][setlocale]. On Windows this means |
999 | | * the system codepage. |
1000 | | * |
1001 | | * The input string shall not contain nul characters even if the @len |
1002 | | * argument is positive. A nul character found inside the string will result |
1003 | | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert |
1004 | | * input that may contain embedded nul characters. |
1005 | | * |
1006 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
1007 | | * A newly-allocated buffer containing the converted string, |
1008 | | * or %NULL on an error, and error will be set. |
1009 | | **/ |
1010 | | gchar * |
1011 | | g_locale_from_utf8 (const gchar *utf8string, |
1012 | | gssize len, |
1013 | | gsize *bytes_read, |
1014 | | gsize *bytes_written, |
1015 | | GError **error) |
1016 | 0 | { |
1017 | 0 | const gchar *charset; |
1018 | |
|
1019 | 0 | if (g_get_charset (&charset)) |
1020 | 0 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
1021 | 0 | else |
1022 | 0 | return convert_checked (utf8string, len, charset, "UTF-8", |
1023 | 0 | CONVERT_CHECK_NO_NULS_IN_INPUT, |
1024 | 0 | bytes_read, bytes_written, error); |
1025 | 0 | } |
1026 | | |
1027 | | #ifndef G_PLATFORM_WIN32 |
1028 | | |
1029 | | typedef struct _GFilenameCharsetCache GFilenameCharsetCache; |
1030 | | |
1031 | | struct _GFilenameCharsetCache { |
1032 | | gboolean is_utf8; |
1033 | | gchar *charset; |
1034 | | gchar **filename_charsets; |
1035 | | }; |
1036 | | |
1037 | | static void |
1038 | | filename_charset_cache_free (gpointer data) |
1039 | 0 | { |
1040 | 0 | GFilenameCharsetCache *cache = data; |
1041 | 0 | g_free (cache->charset); |
1042 | 0 | g_strfreev (cache->filename_charsets); |
1043 | 0 | g_free (cache); |
1044 | 0 | } |
1045 | | |
1046 | | /** |
1047 | | * g_get_filename_charsets: |
1048 | | * @filename_charsets: (out) (transfer none) (array zero-terminated=1): |
1049 | | * return location for the %NULL-terminated list of encoding names |
1050 | | * |
1051 | | * Determines the preferred character sets used for filenames. |
1052 | | * The first character set from the @filename_charsets is the filename encoding, the |
1053 | | * subsequent character sets are used when trying to generate a displayable |
1054 | | * representation of a filename, see g_filename_display_name(). |
1055 | | * |
1056 | | * On Unix, the character sets are determined by consulting the |
1057 | | * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`. |
1058 | | * On Windows, the character set used in the GLib API is always UTF-8 |
1059 | | * and said environment variables have no effect. |
1060 | | * |
1061 | | * `G_FILENAME_ENCODING` may be set to a comma-separated list of |
1062 | | * character set names. The special token "\@locale" is taken |
1063 | | * to mean the character set for the [current locale][setlocale]. |
1064 | | * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is, |
1065 | | * the character set of the current locale is taken as the filename |
1066 | | * encoding. If neither environment variable is set, UTF-8 is taken |
1067 | | * as the filename encoding, but the character set of the current locale |
1068 | | * is also put in the list of encodings. |
1069 | | * |
1070 | | * The returned @filename_charsets belong to GLib and must not be freed. |
1071 | | * |
1072 | | * Note that on Unix, regardless of the locale character set or |
1073 | | * `G_FILENAME_ENCODING` value, the actual file names present |
1074 | | * on a system might be in any random encoding or just gibberish. |
1075 | | * |
1076 | | * Returns: %TRUE if the filename encoding is UTF-8. |
1077 | | * |
1078 | | * Since: 2.6 |
1079 | | */ |
1080 | | gboolean |
1081 | | g_get_filename_charsets (const gchar ***filename_charsets) |
1082 | 0 | { |
1083 | 0 | static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free); |
1084 | 0 | GFilenameCharsetCache *cache = g_private_get (&cache_private); |
1085 | 0 | const gchar *charset; |
1086 | |
|
1087 | 0 | if (!cache) |
1088 | 0 | cache = g_private_set_alloc0 (&cache_private, sizeof (GFilenameCharsetCache)); |
1089 | |
|
1090 | 0 | g_get_charset (&charset); |
1091 | |
|
1092 | 0 | if (!(cache->charset && strcmp (cache->charset, charset) == 0)) |
1093 | 0 | { |
1094 | 0 | const gchar *new_charset; |
1095 | 0 | const gchar *p; |
1096 | 0 | gint i; |
1097 | |
|
1098 | 0 | g_free (cache->charset); |
1099 | 0 | g_strfreev (cache->filename_charsets); |
1100 | 0 | cache->charset = g_strdup (charset); |
1101 | | |
1102 | 0 | p = g_getenv ("G_FILENAME_ENCODING"); |
1103 | 0 | if (p != NULL && p[0] != '\0') |
1104 | 0 | { |
1105 | 0 | cache->filename_charsets = g_strsplit (p, ",", 0); |
1106 | 0 | cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0); |
1107 | |
|
1108 | 0 | for (i = 0; cache->filename_charsets[i]; i++) |
1109 | 0 | { |
1110 | 0 | if (strcmp ("@locale", cache->filename_charsets[i]) == 0) |
1111 | 0 | { |
1112 | 0 | g_get_charset (&new_charset); |
1113 | 0 | g_free (cache->filename_charsets[i]); |
1114 | 0 | cache->filename_charsets[i] = g_strdup (new_charset); |
1115 | 0 | } |
1116 | 0 | } |
1117 | 0 | } |
1118 | 0 | else if (g_getenv ("G_BROKEN_FILENAMES") != NULL) |
1119 | 0 | { |
1120 | 0 | cache->filename_charsets = g_new0 (gchar *, 2); |
1121 | 0 | cache->is_utf8 = g_get_charset (&new_charset); |
1122 | 0 | cache->filename_charsets[0] = g_strdup (new_charset); |
1123 | 0 | } |
1124 | 0 | else |
1125 | 0 | { |
1126 | 0 | cache->filename_charsets = g_new0 (gchar *, 3); |
1127 | 0 | cache->is_utf8 = TRUE; |
1128 | 0 | cache->filename_charsets[0] = g_strdup ("UTF-8"); |
1129 | 0 | if (!g_get_charset (&new_charset)) |
1130 | 0 | cache->filename_charsets[1] = g_strdup (new_charset); |
1131 | 0 | } |
1132 | 0 | } |
1133 | |
|
1134 | 0 | if (filename_charsets) |
1135 | 0 | *filename_charsets = (const gchar **)cache->filename_charsets; |
1136 | |
|
1137 | 0 | return cache->is_utf8; |
1138 | 0 | } |
1139 | | |
1140 | | #else /* G_PLATFORM_WIN32 */ |
1141 | | |
1142 | | gboolean |
1143 | | g_get_filename_charsets (const gchar ***filename_charsets) |
1144 | | { |
1145 | | static const gchar *charsets[] = { |
1146 | | "UTF-8", |
1147 | | NULL |
1148 | | }; |
1149 | | |
1150 | | #ifdef G_OS_WIN32 |
1151 | | /* On Windows GLib pretends that the filename charset is UTF-8 */ |
1152 | | if (filename_charsets) |
1153 | | *filename_charsets = charsets; |
1154 | | |
1155 | | return TRUE; |
1156 | | #else |
1157 | | gboolean result; |
1158 | | |
1159 | | /* Cygwin works like before */ |
1160 | | result = g_get_charset (&(charsets[0])); |
1161 | | |
1162 | | if (filename_charsets) |
1163 | | *filename_charsets = charsets; |
1164 | | |
1165 | | return result; |
1166 | | #endif |
1167 | | } |
1168 | | |
1169 | | #endif /* G_PLATFORM_WIN32 */ |
1170 | | |
1171 | | static gboolean |
1172 | | get_filename_charset (const gchar **filename_charset) |
1173 | 0 | { |
1174 | 0 | const gchar **charsets; |
1175 | 0 | gboolean is_utf8; |
1176 | | |
1177 | 0 | is_utf8 = g_get_filename_charsets (&charsets); |
1178 | |
|
1179 | 0 | if (filename_charset) |
1180 | 0 | *filename_charset = charsets[0]; |
1181 | | |
1182 | 0 | return is_utf8; |
1183 | 0 | } |
1184 | | |
1185 | | /** |
1186 | | * g_filename_to_utf8: |
1187 | | * @opsysstring: (type filename): a string in the encoding for filenames |
1188 | | * @len: the length of the string, or -1 if the string is |
1189 | | * nul-terminated (Note that some encodings may allow nul |
1190 | | * bytes to occur inside strings. In that case, using -1 |
1191 | | * for the @len parameter is unsafe) |
1192 | | * @bytes_read: (out) (optional): location to store the number of bytes in the |
1193 | | * input string that were successfully converted, or %NULL. |
1194 | | * Even if the conversion was successful, this may be |
1195 | | * less than @len if there were partial characters |
1196 | | * at the end of the input. If the error |
1197 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1198 | | * stored will be the byte offset after the last valid |
1199 | | * input sequence. |
1200 | | * @bytes_written: (out) (optional): the number of bytes stored in the output |
1201 | | * buffer (not including the terminating nul). |
1202 | | * @error: location to store the error occurring, or %NULL to ignore |
1203 | | * errors. Any of the errors in #GConvertError may occur. |
1204 | | * |
1205 | | * Converts a string which is in the encoding used by GLib for |
1206 | | * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8 |
1207 | | * for filenames; on other platforms, this function indirectly depends on |
1208 | | * the [current locale][setlocale]. |
1209 | | * |
1210 | | * The input string shall not contain nul characters even if the @len |
1211 | | * argument is positive. A nul character found inside the string will result |
1212 | | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. |
1213 | | * If the source encoding is not UTF-8 and the conversion output contains a |
1214 | | * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the |
1215 | | * function returns %NULL. Use g_convert() to produce output that |
1216 | | * may contain embedded nul characters. |
1217 | | * |
1218 | | * Returns: (type utf8): The converted string, or %NULL on an error. |
1219 | | **/ |
1220 | | gchar* |
1221 | | g_filename_to_utf8 (const gchar *opsysstring, |
1222 | | gssize len, |
1223 | | gsize *bytes_read, |
1224 | | gsize *bytes_written, |
1225 | | GError **error) |
1226 | 0 | { |
1227 | 0 | const gchar *charset; |
1228 | |
|
1229 | 0 | g_return_val_if_fail (opsysstring != NULL, NULL); |
1230 | | |
1231 | 0 | if (get_filename_charset (&charset)) |
1232 | 0 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
1233 | 0 | else |
1234 | 0 | return convert_checked (opsysstring, len, "UTF-8", charset, |
1235 | 0 | CONVERT_CHECK_NO_NULS_IN_INPUT | |
1236 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1237 | 0 | bytes_read, bytes_written, error); |
1238 | 0 | } |
1239 | | |
1240 | | /** |
1241 | | * g_filename_from_utf8: |
1242 | | * @utf8string: (type utf8): a UTF-8 encoded string. |
1243 | | * @len: the length of the string, or -1 if the string is |
1244 | | * nul-terminated. |
1245 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
1246 | | * the input string that were successfully converted, or %NULL. |
1247 | | * Even if the conversion was successful, this may be |
1248 | | * less than @len if there were partial characters |
1249 | | * at the end of the input. If the error |
1250 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1251 | | * stored will be the byte offset after the last valid |
1252 | | * input sequence. |
1253 | | * @bytes_written: (out) (optional): the number of bytes stored in |
1254 | | * the output buffer (not including the terminating nul). |
1255 | | * @error: location to store the error occurring, or %NULL to ignore |
1256 | | * errors. Any of the errors in #GConvertError may occur. |
1257 | | * |
1258 | | * Converts a string from UTF-8 to the encoding GLib uses for |
1259 | | * filenames. Note that on Windows GLib uses UTF-8 for filenames; |
1260 | | * on other platforms, this function indirectly depends on the |
1261 | | * [current locale][setlocale]. |
1262 | | * |
1263 | | * The input string shall not contain nul characters even if the @len |
1264 | | * argument is positive. A nul character found inside the string will result |
1265 | | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is |
1266 | | * not UTF-8 and the conversion output contains a nul character, the error |
1267 | | * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL. |
1268 | | * |
1269 | | * Returns: (type filename): |
1270 | | * The converted string, or %NULL on an error. |
1271 | | **/ |
1272 | | gchar* |
1273 | | g_filename_from_utf8 (const gchar *utf8string, |
1274 | | gssize len, |
1275 | | gsize *bytes_read, |
1276 | | gsize *bytes_written, |
1277 | | GError **error) |
1278 | 0 | { |
1279 | 0 | const gchar *charset; |
1280 | |
|
1281 | 0 | if (get_filename_charset (&charset)) |
1282 | 0 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
1283 | 0 | else |
1284 | 0 | return convert_checked (utf8string, len, charset, "UTF-8", |
1285 | 0 | CONVERT_CHECK_NO_NULS_IN_INPUT | |
1286 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1287 | 0 | bytes_read, bytes_written, error); |
1288 | 0 | } |
1289 | | |
1290 | | /* Test if haystack has the needle prefix, comparing case |
1291 | | * insensitive. haystack may be UTF-8, but needle must |
1292 | | * contain only ascii. */ |
1293 | | static gboolean |
1294 | | has_case_prefix (const gchar *haystack, const gchar *needle) |
1295 | 0 | { |
1296 | 0 | const gchar *h, *n; |
1297 | | |
1298 | | /* Eat one character at a time. */ |
1299 | 0 | h = haystack; |
1300 | 0 | n = needle; |
1301 | |
|
1302 | 0 | while (*n && *h && |
1303 | 0 | g_ascii_tolower (*n) == g_ascii_tolower (*h)) |
1304 | 0 | { |
1305 | 0 | n++; |
1306 | 0 | h++; |
1307 | 0 | } |
1308 | | |
1309 | 0 | return *n == '\0'; |
1310 | 0 | } |
1311 | | |
/* Escaping profiles.  Each value is a bit that g_escape_uri_string()
 * tests against the entries of the acceptable[] table. */
typedef enum
{
  /* Escape all unsafe characters */
  UNSAFE_ALL        = 0x1,
  /* Allows '+' */
  UNSAFE_ALLOW_PLUS = 0x2,
  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_PATH       = 0x8,
  /* Allows '/' and ':' and '@' */
  UNSAFE_HOST       = 0x10,
  /* Allows all characters except for '/' and '%' */
  UNSAFE_SLASHES    = 0x20
} UnsafeCharacterSet;
1319 | | |
1320 | | static const guchar acceptable[96] = { |
1321 | | /* A table of the ASCII chars from space (32) to DEL (127) */ |
1322 | | /* ! " # $ % & ' ( ) * + , - . / */ |
1323 | | 0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C, |
1324 | | /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ |
1325 | | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20, |
1326 | | /* @ A B C D E F G H I J K L M N O */ |
1327 | | 0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, |
1328 | | /* P Q R S T U V W X Y Z [ \ ] ^ _ */ |
1329 | | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F, |
1330 | | /* ` a b c d e f g h i j k l m n o */ |
1331 | | 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, |
1332 | | /* p q r s t u v w x y z { | } ~ DEL */ |
1333 | | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20 |
1334 | | }; |
1335 | | |
1336 | | static const gchar hex[] = "0123456789ABCDEF"; |
1337 | | |
1338 | | /* Note: This escape function works on file: URIs, but if you want to |
1339 | | * escape something else, please read RFC-2396 */ |
1340 | | static gchar * |
1341 | | g_escape_uri_string (const gchar *string, |
1342 | | UnsafeCharacterSet mask) |
1343 | 0 | { |
1344 | 0 | #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask)) |
1345 | |
|
1346 | 0 | const gchar *p; |
1347 | 0 | gchar *q; |
1348 | 0 | gchar *result; |
1349 | 0 | int c; |
1350 | 0 | gint unacceptable; |
1351 | 0 | UnsafeCharacterSet use_mask; |
1352 | | |
1353 | 0 | g_return_val_if_fail (mask == UNSAFE_ALL |
1354 | 0 | || mask == UNSAFE_ALLOW_PLUS |
1355 | 0 | || mask == UNSAFE_PATH |
1356 | 0 | || mask == UNSAFE_HOST |
1357 | 0 | || mask == UNSAFE_SLASHES, NULL); |
1358 | | |
1359 | 0 | unacceptable = 0; |
1360 | 0 | use_mask = mask; |
1361 | 0 | for (p = string; *p != '\0'; p++) |
1362 | 0 | { |
1363 | 0 | c = (guchar) *p; |
1364 | 0 | if (!ACCEPTABLE (c)) |
1365 | 0 | unacceptable++; |
1366 | 0 | } |
1367 | | |
1368 | 0 | result = g_malloc (p - string + unacceptable * 2 + 1); |
1369 | | |
1370 | 0 | use_mask = mask; |
1371 | 0 | for (q = result, p = string; *p != '\0'; p++) |
1372 | 0 | { |
1373 | 0 | c = (guchar) *p; |
1374 | | |
1375 | 0 | if (!ACCEPTABLE (c)) |
1376 | 0 | { |
1377 | 0 | *q++ = '%'; /* means hex coming */ |
1378 | 0 | *q++ = hex[c >> 4]; |
1379 | 0 | *q++ = hex[c & 15]; |
1380 | 0 | } |
1381 | 0 | else |
1382 | 0 | *q++ = *p; |
1383 | 0 | } |
1384 | | |
1385 | 0 | *q = '\0'; |
1386 | | |
1387 | 0 | return result; |
1388 | 0 | } |
1389 | | |
1390 | | |
1391 | | static gchar * |
1392 | | g_escape_file_uri (const gchar *hostname, |
1393 | | const gchar *pathname) |
1394 | 0 | { |
1395 | 0 | char *escaped_hostname = NULL; |
1396 | 0 | char *escaped_path; |
1397 | 0 | char *res; |
1398 | |
|
1399 | | #ifdef G_OS_WIN32 |
1400 | | char *p, *backslash; |
1401 | | |
1402 | | /* Turn backslashes into forward slashes. That's what Netscape |
1403 | | * does, and they are actually more or less equivalent in Windows. |
1404 | | */ |
1405 | | |
1406 | | pathname = g_strdup (pathname); |
1407 | | p = (char *) pathname; |
1408 | | |
1409 | | while ((backslash = strchr (p, '\\')) != NULL) |
1410 | | { |
1411 | | *backslash = '/'; |
1412 | | p = backslash + 1; |
1413 | | } |
1414 | | #endif |
1415 | |
|
1416 | 0 | if (hostname && *hostname != '\0') |
1417 | 0 | { |
1418 | 0 | escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST); |
1419 | 0 | } |
1420 | |
|
1421 | 0 | escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH); |
1422 | |
|
1423 | 0 | res = g_strconcat ("file://", |
1424 | 0 | (escaped_hostname) ? escaped_hostname : "", |
1425 | 0 | (*escaped_path != '/') ? "/" : "", |
1426 | 0 | escaped_path, |
1427 | 0 | NULL); |
1428 | |
|
1429 | | #ifdef G_OS_WIN32 |
1430 | | g_free ((char *) pathname); |
1431 | | #endif |
1432 | |
|
1433 | 0 | g_free (escaped_hostname); |
1434 | 0 | g_free (escaped_path); |
1435 | | |
1436 | 0 | return res; |
1437 | 0 | } |
1438 | | |
/* Decodes the two hex digits at scanner[0] and scanner[1] into a byte
 * value.  Returns -1 if either character is not a hex digit; the second
 * character is not examined when the first one is invalid. */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return (high_nibble << 4) | low_nibble;
}
1455 | | |
1456 | | static gchar * |
1457 | | g_unescape_uri_string (const char *escaped, |
1458 | | int len, |
1459 | | const char *illegal_escaped_characters, |
1460 | | gboolean ascii_must_not_be_escaped) |
1461 | 0 | { |
1462 | 0 | const gchar *in, *in_end; |
1463 | 0 | gchar *out, *result; |
1464 | 0 | int c; |
1465 | | |
1466 | 0 | if (escaped == NULL) |
1467 | 0 | return NULL; |
1468 | | |
1469 | 0 | if (len < 0) |
1470 | 0 | len = strlen (escaped); |
1471 | |
|
1472 | 0 | result = g_malloc (len + 1); |
1473 | | |
1474 | 0 | out = result; |
1475 | 0 | for (in = escaped, in_end = escaped + len; in < in_end; in++) |
1476 | 0 | { |
1477 | 0 | c = *in; |
1478 | |
|
1479 | 0 | if (c == '%') |
1480 | 0 | { |
1481 | | /* catch partial escape sequences past the end of the substring */ |
1482 | 0 | if (in + 3 > in_end) |
1483 | 0 | break; |
1484 | | |
1485 | 0 | c = unescape_character (in + 1); |
1486 | | |
1487 | | /* catch bad escape sequences and NUL characters */ |
1488 | 0 | if (c <= 0) |
1489 | 0 | break; |
1490 | | |
1491 | | /* catch escaped ASCII */ |
1492 | 0 | if (ascii_must_not_be_escaped && c <= 0x7F) |
1493 | 0 | break; |
1494 | | |
1495 | | /* catch other illegal escaped characters */ |
1496 | 0 | if (strchr (illegal_escaped_characters, c) != NULL) |
1497 | 0 | break; |
1498 | | |
1499 | 0 | in += 2; |
1500 | 0 | } |
1501 | | |
1502 | 0 | *out++ = c; |
1503 | 0 | } |
1504 | | |
1505 | 0 | g_assert (out - result <= len); |
1506 | 0 | *out = '\0'; |
1507 | |
|
1508 | 0 | if (in != in_end) |
1509 | 0 | { |
1510 | 0 | g_free (result); |
1511 | 0 | return NULL; |
1512 | 0 | } |
1513 | | |
1514 | 0 | return result; |
1515 | 0 | } |
1516 | | |
1517 | | static gboolean |
1518 | | is_asciialphanum (gunichar c) |
1519 | 0 | { |
1520 | 0 | return c <= 0x7F && g_ascii_isalnum (c); |
1521 | 0 | } |
1522 | | |
1523 | | static gboolean |
1524 | | is_asciialpha (gunichar c) |
1525 | 0 | { |
1526 | 0 | return c <= 0x7F && g_ascii_isalpha (c); |
1527 | 0 | } |
1528 | | |
1529 | | /* allows an empty string */ |
1530 | | static gboolean |
1531 | | hostname_validate (const char *hostname) |
1532 | 0 | { |
1533 | 0 | const char *p; |
1534 | 0 | gunichar c, first_char, last_char; |
1535 | |
|
1536 | 0 | p = hostname; |
1537 | 0 | if (*p == '\0') |
1538 | 0 | return TRUE; |
1539 | 0 | do |
1540 | 0 | { |
1541 | | /* read in a label */ |
1542 | 0 | c = g_utf8_get_char (p); |
1543 | 0 | p = g_utf8_next_char (p); |
1544 | 0 | if (!is_asciialphanum (c)) |
1545 | 0 | return FALSE; |
1546 | 0 | first_char = c; |
1547 | 0 | do |
1548 | 0 | { |
1549 | 0 | last_char = c; |
1550 | 0 | c = g_utf8_get_char (p); |
1551 | 0 | p = g_utf8_next_char (p); |
1552 | 0 | } |
1553 | 0 | while (is_asciialphanum (c) || c == '-'); |
1554 | 0 | if (last_char == '-') |
1555 | 0 | return FALSE; |
1556 | | |
1557 | | /* if that was the last label, check that it was a toplabel */ |
1558 | 0 | if (c == '\0' || (c == '.' && *p == '\0')) |
1559 | 0 | return is_asciialpha (first_char); |
1560 | 0 | } |
1561 | 0 | while (c == '.'); |
1562 | 0 | return FALSE; |
1563 | 0 | } |
1564 | | |
1565 | | /** |
1566 | | * g_filename_from_uri: |
1567 | | * @uri: a uri describing a filename (escaped, encoded in ASCII). |
1568 | | * @hostname: (out) (optional) (nullable): Location to store hostname for the URI. |
1569 | | * If there is no hostname in the URI, %NULL will be |
1570 | | * stored in this location. |
1571 | | * @error: location to store the error occurring, or %NULL to ignore |
1572 | | * errors. Any of the errors in #GConvertError may occur. |
1573 | | * |
1574 | | * Converts an escaped ASCII-encoded URI to a local filename in the |
1575 | | * encoding used for filenames. |
1576 | | * |
1577 | | * Since GLib 2.78, the query string and fragment can be present in the URI, |
1578 | | * but are not part of the resulting filename. |
1579 | | * We take inspiration from https://url.spec.whatwg.org/#file-state, |
1580 | | * but we don't support the entire standard. |
1581 | | * |
1582 | | * Returns: (type filename): a newly-allocated string holding |
1583 | | * the resulting filename, or %NULL on an error. |
1584 | | **/ |
1585 | | gchar * |
1586 | | g_filename_from_uri (const gchar *uri, |
1587 | | gchar **hostname, |
1588 | | GError **error) |
1589 | 0 | { |
1590 | 0 | const char *past_scheme; |
1591 | 0 | const char *host_part; |
1592 | 0 | char *unescaped_hostname; |
1593 | 0 | char *result; |
1594 | 0 | char *filename; |
1595 | 0 | char *past_path; |
1596 | 0 | char *temp_uri; |
1597 | 0 | int offs; |
1598 | | #ifdef G_OS_WIN32 |
1599 | | char *p, *slash; |
1600 | | #endif |
1601 | |
|
1602 | 0 | if (hostname) |
1603 | 0 | *hostname = NULL; |
1604 | |
|
1605 | 0 | if (!has_case_prefix (uri, "file:/")) |
1606 | 0 | { |
1607 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1608 | 0 | _("The URI “%s” is not an absolute URI using the “file” scheme"), |
1609 | 0 | uri); |
1610 | 0 | return NULL; |
1611 | 0 | } |
1612 | | |
1613 | 0 | temp_uri = g_strdup (uri); |
1614 | |
|
1615 | 0 | past_scheme = temp_uri + strlen ("file:"); |
1616 | | |
1617 | 0 | past_path = strchr (past_scheme, '?'); |
1618 | 0 | if (past_path != NULL) |
1619 | 0 | *past_path = '\0'; |
1620 | |
|
1621 | 0 | past_path = strchr (past_scheme, '#'); |
1622 | 0 | if (past_path != NULL) |
1623 | 0 | *past_path = '\0'; |
1624 | |
|
1625 | 0 | if (has_case_prefix (past_scheme, "///")) |
1626 | 0 | past_scheme += 2; |
1627 | 0 | else if (has_case_prefix (past_scheme, "//")) |
1628 | 0 | { |
1629 | 0 | past_scheme += 2; |
1630 | 0 | host_part = past_scheme; |
1631 | |
|
1632 | 0 | past_scheme = strchr (past_scheme, '/'); |
1633 | |
|
1634 | 0 | if (past_scheme == NULL) |
1635 | 0 | { |
1636 | 0 | g_free (temp_uri); |
1637 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1638 | 0 | _("The URI “%s” is invalid"), |
1639 | 0 | uri); |
1640 | 0 | return NULL; |
1641 | 0 | } |
1642 | | |
1643 | 0 | unescaped_hostname = g_unescape_uri_string (host_part, past_scheme - host_part, "", TRUE); |
1644 | |
|
1645 | 0 | if (unescaped_hostname == NULL || |
1646 | 0 | !hostname_validate (unescaped_hostname)) |
1647 | 0 | { |
1648 | 0 | g_free (unescaped_hostname); |
1649 | 0 | g_free (temp_uri); |
1650 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1651 | 0 | _("The hostname of the URI “%s” is invalid"), |
1652 | 0 | uri); |
1653 | 0 | return NULL; |
1654 | 0 | } |
1655 | | |
1656 | 0 | if (hostname) |
1657 | 0 | *hostname = unescaped_hostname; |
1658 | 0 | else |
1659 | 0 | g_free (unescaped_hostname); |
1660 | 0 | } |
1661 | | |
1662 | 0 | filename = g_unescape_uri_string (past_scheme, -1, "/", FALSE); |
1663 | |
|
1664 | 0 | if (filename == NULL) |
1665 | 0 | { |
1666 | 0 | g_free (temp_uri); |
1667 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1668 | 0 | _("The URI “%s” contains invalidly escaped characters"), |
1669 | 0 | uri); |
1670 | 0 | return NULL; |
1671 | 0 | } |
1672 | | |
1673 | 0 | offs = 0; |
1674 | | #ifdef G_OS_WIN32 |
1675 | | /* Drop localhost */ |
1676 | | if (hostname && *hostname != NULL && |
1677 | | g_ascii_strcasecmp (*hostname, "localhost") == 0) |
1678 | | { |
1679 | | g_free (*hostname); |
1680 | | *hostname = NULL; |
1681 | | } |
1682 | | |
1683 | | /* Turn slashes into backslashes, because that's the canonical spelling */ |
1684 | | p = filename; |
1685 | | while ((slash = strchr (p, '/')) != NULL) |
1686 | | { |
1687 | | *slash = '\\'; |
1688 | | p = slash + 1; |
1689 | | } |
1690 | | |
1691 | | /* Windows URIs with a drive letter can be like "file://host/c:/foo" |
1692 | | * or "file://host/c|/foo" (some Netscape versions). In those cases, start |
1693 | | * the filename from the drive letter. |
1694 | | */ |
1695 | | if (g_ascii_isalpha (filename[1])) |
1696 | | { |
1697 | | if (filename[2] == ':') |
1698 | | offs = 1; |
1699 | | else if (filename[2] == '|') |
1700 | | { |
1701 | | filename[2] = ':'; |
1702 | | offs = 1; |
1703 | | } |
1704 | | } |
1705 | | #endif |
1706 | |
|
1707 | 0 | result = g_strdup (filename + offs); |
1708 | 0 | g_free (filename); |
1709 | |
|
1710 | 0 | g_free (temp_uri); |
1711 | |
|
1712 | 0 | return result; |
1713 | 0 | } |
1714 | | |
1715 | | /** |
1716 | | * g_filename_to_uri: |
1717 | | * @filename: (type filename): an absolute filename specified in the GLib file |
1718 | | * name encoding, which is the on-disk file name bytes on Unix, and UTF-8 |
1719 | | * on Windows |
1720 | | * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none. |
1721 | | * @error: location to store the error occurring, or %NULL to ignore |
1722 | | * errors. Any of the errors in #GConvertError may occur. |
1723 | | * |
1724 | | * Converts an absolute filename to an escaped ASCII-encoded URI, with the path |
1725 | | * component following Section 3.3. of RFC 2396. |
1726 | | * |
1727 | | * Returns: a newly-allocated string holding the resulting |
1728 | | * URI, or %NULL on an error. |
1729 | | **/ |
1730 | | gchar * |
1731 | | g_filename_to_uri (const gchar *filename, |
1732 | | const gchar *hostname, |
1733 | | GError **error) |
1734 | 0 | { |
1735 | 0 | char *escaped_uri; |
1736 | |
|
1737 | 0 | g_return_val_if_fail (filename != NULL, NULL); |
1738 | | |
1739 | 0 | if (!g_path_is_absolute (filename)) |
1740 | 0 | { |
1741 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH, |
1742 | 0 | _("The pathname “%s” is not an absolute path"), |
1743 | 0 | filename); |
1744 | 0 | return NULL; |
1745 | 0 | } |
1746 | | |
1747 | 0 | if (hostname && |
1748 | 0 | !(g_utf8_validate (hostname, -1, NULL) |
1749 | 0 | && hostname_validate (hostname))) |
1750 | 0 | { |
1751 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
1752 | 0 | _("Invalid hostname")); |
1753 | 0 | return NULL; |
1754 | 0 | } |
1755 | | |
1756 | | #ifdef G_OS_WIN32 |
1757 | | /* Don't use localhost unnecessarily */ |
1758 | | if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0) |
1759 | | hostname = NULL; |
1760 | | #endif |
1761 | | |
1762 | 0 | escaped_uri = g_escape_file_uri (hostname, filename); |
1763 | |
|
1764 | 0 | return escaped_uri; |
1765 | 0 | } |
1766 | | |
1767 | | /** |
1768 | | * g_uri_list_extract_uris: |
1769 | | * @uri_list: an URI list |
1770 | | * |
1771 | | * Splits an URI list conforming to the text/uri-list |
1772 | | * mime type defined in RFC 2483 into individual URIs, |
1773 | | * discarding any comments. The URIs are not validated. |
1774 | | * |
1775 | | * Returns: (transfer full): a newly allocated %NULL-terminated list |
1776 | | * of strings holding the individual URIs. The array should be freed |
1777 | | * with g_strfreev(). |
1778 | | * |
1779 | | * Since: 2.6 |
1780 | | */ |
1781 | | gchar ** |
1782 | | g_uri_list_extract_uris (const gchar *uri_list) |
1783 | 0 | { |
1784 | 0 | GPtrArray *uris; |
1785 | 0 | const gchar *p, *q; |
1786 | |
|
1787 | 0 | uris = g_ptr_array_new (); |
1788 | |
|
1789 | 0 | p = uri_list; |
1790 | | |
1791 | | /* We don't actually try to validate the URI according to RFC |
1792 | | * 2396, or even check for allowed characters - we just ignore |
1793 | | * comments and trim whitespace off the ends. We also |
1794 | | * allow LF delimination as well as the specified CRLF. |
1795 | | * |
1796 | | * We do allow comments like specified in RFC 2483. |
1797 | | */ |
1798 | 0 | while (p) |
1799 | 0 | { |
1800 | 0 | if (*p != '#') |
1801 | 0 | { |
1802 | 0 | while (g_ascii_isspace (*p)) |
1803 | 0 | p++; |
1804 | |
|
1805 | 0 | q = p; |
1806 | 0 | while (*q && (*q != '\n') && (*q != '\r')) |
1807 | 0 | q++; |
1808 | |
|
1809 | 0 | if (q > p) |
1810 | 0 | { |
1811 | 0 | q--; |
1812 | 0 | while (q > p && g_ascii_isspace (*q)) |
1813 | 0 | q--; |
1814 | |
|
1815 | 0 | if (q > p) |
1816 | 0 | g_ptr_array_add (uris, g_strndup (p, q - p + 1)); |
1817 | 0 | } |
1818 | 0 | } |
1819 | 0 | p = strchr (p, '\n'); |
1820 | 0 | if (p) |
1821 | 0 | p++; |
1822 | 0 | } |
1823 | |
|
1824 | 0 | g_ptr_array_add (uris, NULL); |
1825 | |
|
1826 | 0 | return (gchar **) g_ptr_array_free (uris, FALSE); |
1827 | 0 | } |
1828 | | |
1829 | | /** |
1830 | | * g_filename_display_basename: |
1831 | | * @filename: (type filename): an absolute pathname in the |
1832 | | * GLib file name encoding |
1833 | | * |
1834 | | * Returns the display basename for the particular filename, guaranteed |
1835 | | * to be valid UTF-8. The display name might not be identical to the filename, |
1836 | | * for instance there might be problems converting it to UTF-8, and some files |
1837 | | * can be translated in the display. |
1838 | | * |
1839 | | * If GLib cannot make sense of the encoding of @filename, as a last resort it |
1840 | | * replaces unknown characters with U+FFFD, the Unicode replacement character. |
1841 | | * You can search the result for the UTF-8 encoding of this character (which is |
1842 | | * "\357\277\275" in octal notation) to find out if @filename was in an invalid |
1843 | | * encoding. |
1844 | | * |
1845 | | * You must pass the whole absolute pathname to this functions so that |
1846 | | * translation of well known locations can be done. |
1847 | | * |
1848 | | * This function is preferred over g_filename_display_name() if you know the |
1849 | | * whole path, as it allows translation. |
1850 | | * |
1851 | | * Returns: a newly allocated string containing |
1852 | | * a rendition of the basename of the filename in valid UTF-8 |
1853 | | * |
1854 | | * Since: 2.6 |
1855 | | **/ |
1856 | | gchar * |
1857 | | g_filename_display_basename (const gchar *filename) |
1858 | 0 | { |
1859 | 0 | char *basename; |
1860 | 0 | char *display_name; |
1861 | |
|
1862 | 0 | g_return_val_if_fail (filename != NULL, NULL); |
1863 | | |
1864 | 0 | basename = g_path_get_basename (filename); |
1865 | 0 | display_name = g_filename_display_name (basename); |
1866 | 0 | g_free (basename); |
1867 | 0 | return display_name; |
1868 | 0 | } |
1869 | | |
1870 | | /** |
1871 | | * g_filename_display_name: |
1872 | | * @filename: (type filename): a pathname hopefully in the |
1873 | | * GLib file name encoding |
1874 | | * |
1875 | | * Converts a filename into a valid UTF-8 string. The conversion is |
1876 | | * not necessarily reversible, so you should keep the original around |
1877 | | * and use the return value of this function only for display purposes. |
1878 | | * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL |
1879 | | * even if the filename actually isn't in the GLib file name encoding. |
1880 | | * |
1881 | | * If GLib cannot make sense of the encoding of @filename, as a last resort it |
1882 | | * replaces unknown characters with U+FFFD, the Unicode replacement character. |
1883 | | * You can search the result for the UTF-8 encoding of this character (which is |
1884 | | * "\357\277\275" in octal notation) to find out if @filename was in an invalid |
1885 | | * encoding. |
1886 | | * |
1887 | | * If you know the whole pathname of the file you should use |
1888 | | * g_filename_display_basename(), since that allows location-based |
1889 | | * translation of filenames. |
1890 | | * |
1891 | | * Returns: a newly allocated string containing |
1892 | | * a rendition of the filename in valid UTF-8 |
1893 | | * |
1894 | | * Since: 2.6 |
1895 | | **/ |
1896 | | gchar * |
1897 | | g_filename_display_name (const gchar *filename) |
1898 | 0 | { |
1899 | 0 | gint i; |
1900 | 0 | const gchar **charsets; |
1901 | 0 | gchar *display_name = NULL; |
1902 | 0 | gboolean is_utf8; |
1903 | | |
1904 | 0 | is_utf8 = g_get_filename_charsets (&charsets); |
1905 | |
|
1906 | 0 | if (is_utf8) |
1907 | 0 | { |
1908 | 0 | if (g_utf8_validate (filename, -1, NULL)) |
1909 | 0 | display_name = g_strdup (filename); |
1910 | 0 | } |
1911 | | |
1912 | 0 | if (!display_name) |
1913 | 0 | { |
1914 | | /* Try to convert from the filename charsets to UTF-8. |
1915 | | * Skip the first charset if it is UTF-8. |
1916 | | */ |
1917 | 0 | for (i = is_utf8 ? 1 : 0; charsets[i]; i++) |
1918 | 0 | { |
1919 | 0 | display_name = g_convert (filename, -1, "UTF-8", charsets[i], |
1920 | 0 | NULL, NULL, NULL); |
1921 | |
|
1922 | 0 | if (display_name) |
1923 | 0 | break; |
1924 | 0 | } |
1925 | 0 | } |
1926 | | |
1927 | | /* if all conversions failed, we replace invalid UTF-8 |
1928 | | * by a question mark |
1929 | | */ |
1930 | 0 | if (!display_name) |
1931 | 0 | display_name = g_utf8_make_valid (filename, -1); |
1932 | |
|
1933 | 0 | return display_name; |
1934 | 0 | } |
1935 | | |
1936 | | #ifdef G_OS_WIN32 |
1937 | | |
1938 | | /* Binary compatibility versions. Not for newly compiled code. */ |
1939 | | |
1940 | | _GLIB_EXTERN gchar *g_filename_to_utf8_utf8 (const gchar *opsysstring, |
1941 | | gssize len, |
1942 | | gsize *bytes_read, |
1943 | | gsize *bytes_written, |
1944 | | GError **error) G_GNUC_MALLOC; |
1945 | | _GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar *utf8string, |
1946 | | gssize len, |
1947 | | gsize *bytes_read, |
1948 | | gsize *bytes_written, |
1949 | | GError **error) G_GNUC_MALLOC; |
1950 | | _GLIB_EXTERN gchar *g_filename_from_uri_utf8 (const gchar *uri, |
1951 | | gchar **hostname, |
1952 | | GError **error) G_GNUC_MALLOC; |
1953 | | _GLIB_EXTERN gchar *g_filename_to_uri_utf8 (const gchar *filename, |
1954 | | const gchar *hostname, |
1955 | | GError **error) G_GNUC_MALLOC; |
1956 | | |
1957 | | gchar * |
1958 | | g_filename_to_utf8_utf8 (const gchar *opsysstring, |
1959 | | gssize len, |
1960 | | gsize *bytes_read, |
1961 | | gsize *bytes_written, |
1962 | | GError **error) |
1963 | | { |
1964 | | return g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error); |
1965 | | } |
1966 | | |
1967 | | gchar * |
1968 | | g_filename_from_utf8_utf8 (const gchar *utf8string, |
1969 | | gssize len, |
1970 | | gsize *bytes_read, |
1971 | | gsize *bytes_written, |
1972 | | GError **error) |
1973 | | { |
1974 | | return g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error); |
1975 | | } |
1976 | | |
1977 | | gchar * |
1978 | | g_filename_from_uri_utf8 (const gchar *uri, |
1979 | | gchar **hostname, |
1980 | | GError **error) |
1981 | | { |
1982 | | return g_filename_from_uri (uri, hostname, error); |
1983 | | } |
1984 | | |
1985 | | gchar * |
1986 | | g_filename_to_uri_utf8 (const gchar *filename, |
1987 | | const gchar *hostname, |
1988 | | GError **error) |
1989 | | { |
1990 | | return g_filename_to_uri (filename, hostname, error); |
1991 | | } |
1992 | | |
1993 | | #endif |