/src/glib/glib/gconvert.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* GLIB - Library of useful routines for C programming |
2 | | * |
3 | | * gconvert.c: Convert between character sets using iconv |
4 | | * Copyright Red Hat Inc., 2000 |
5 | | * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com> |
6 | | * |
7 | | * This library is free software; you can redistribute it and/or |
8 | | * modify it under the terms of the GNU Lesser General Public |
9 | | * License as published by the Free Software Foundation; either |
10 | | * version 2.1 of the License, or (at your option) any later version. |
11 | | * |
12 | | * This library is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | | * Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public |
18 | | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | #include "config.h" |
22 | | #include "glibconfig.h" |
23 | | |
24 | | #ifndef G_OS_WIN32 |
25 | | #include <iconv.h> |
26 | | #endif |
27 | | #include <errno.h> |
28 | | #include <stdio.h> |
29 | | #include <string.h> |
30 | | #include <stdlib.h> |
31 | | |
32 | | #ifdef G_OS_WIN32 |
33 | | #include "win_iconv.c" |
34 | | #endif |
35 | | |
36 | | #ifdef G_PLATFORM_WIN32 |
37 | | #define STRICT |
38 | | #include <windows.h> |
39 | | #undef STRICT |
40 | | #endif |
41 | | |
42 | | #include "gconvert.h" |
43 | | |
44 | | #include "gcharsetprivate.h" |
45 | | #include "gslist.h" |
46 | | #include "gstrfuncs.h" |
47 | | #include "gtestutils.h" |
48 | | #include "gthread.h" |
49 | | #include "gthreadprivate.h" |
50 | | #include "gunicode.h" |
51 | | #include "gfileutils.h" |
52 | | #include "genviron.h" |
53 | | |
54 | | #include "glibintl.h" |
55 | | |
56 | | |
57 | | /** |
58 | | * SECTION:conversions |
59 | | * @title: Character Set Conversion |
60 | | * @short_description: convert strings between different character sets |
61 | | * |
62 | | * The g_convert() family of functions wraps the functionality of iconv(). |
63 | | * In addition to pure character set conversions, GLib has functions to |
64 | | * deal with the extra complications of encodings for file names. |
65 | | * |
66 | | * ## File Name Encodings |
67 | | * |
68 | | * Historically, UNIX has not had a defined encoding for file names: |
69 | | * a file name is valid as long as it does not have path separators |
70 | | * in it ("/"). However, displaying file names may require conversion: |
71 | | * from the character set in which they were created, to the character |
72 | | * set in which the application operates. Consider the Spanish file name |
73 | | * "Presentación.sxi". If the application which created it uses |
74 | | * ISO-8859-1 for its encoding, |
75 | | * |[ |
76 | | * Character: P r e s e n t a c i ó n . s x i |
77 | | * Hex code: 50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69 |
78 | | * ]| |
79 | | * However, if the application uses UTF-8, the actual file name on |
80 | | * disk would look like this: |
81 | | * |[ |
82 | | * Character: P r e s e n t a c i ó n . s x i |
83 | | * Hex code: 50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69 |
84 | | * ]| |
85 | | * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use |
86 | | * GLib do the same thing. If you get a file name from the file system, |
87 | | * for example, from readdir() or from g_dir_read_name(), and you wish |
88 | | * to display the file name to the user, you will need to convert it |
89 | | * into UTF-8. The opposite case is when the user types the name of a |
90 | | * file they wish to save: the toolkit will give you that string in |
91 | | * UTF-8 encoding, and you will need to convert it to the character |
92 | | * set used for file names before you can create the file with open() |
93 | | * or fopen(). |
94 | | * |
95 | | * By default, GLib assumes that file names on disk are in UTF-8 |
96 | | * encoding. This is a valid assumption for file systems which |
97 | | * were created relatively recently: most applications use UTF-8 |
98 | | * encoding for their strings, and that is also what they use for |
99 | | * the file names they create. However, older file systems may |
100 | | * still contain file names created in "older" encodings, such as |
101 | | * ISO-8859-1. In this case, for compatibility reasons, you may want |
102 | | * to instruct GLib to use that particular encoding for file names |
103 | | * rather than UTF-8. You can do this by specifying the encoding for |
104 | | * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING] |
105 | | * environment variable. For example, if your installation uses |
106 | | * ISO-8859-1 for file names, you can put this in your `~/.profile`: |
107 | | * |[ |
108 | | * export G_FILENAME_ENCODING=ISO-8859-1 |
109 | | * ]| |
110 | | * GLib provides the functions g_filename_to_utf8() and |
111 | | * g_filename_from_utf8() to perform the necessary conversions. |
112 | | * These functions convert file names from the encoding specified |
113 | | * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This |
114 | | * [diagram][file-name-encodings-diagram] illustrates how |
115 | | * these functions are used to convert between UTF-8 and the |
116 | | * encoding for file names in the file system. |
117 | | * |
118 | | * ## Conversion between file name encodings # {#file-name-encodings-diagram} |
119 | | * |
120 | | * ![](file-name-encodings.png) |
121 | | * |
122 | | * ## Checklist for Application Writers |
123 | | * |
124 | | * This section is a practical summary of the detailed |
125 | | * things to do to make sure your applications process file |
126 | | * name encodings correctly. |
127 | | * |
128 | | * 1. If you get a file name from the file system from a function |
129 | | * such as readdir() or gtk_file_chooser_get_filename(), you do |
130 | | * not need to do any conversion to pass that file name to |
131 | | * functions like open(), rename(), or fopen() -- those are "raw" |
132 | | * file names which the file system understands. |
133 | | * |
134 | | * 2. If you need to display a file name, convert it to UTF-8 first |
135 | | * by using g_filename_to_utf8(). If conversion fails, display a |
136 | | * string like "Unknown file name". Do not convert this string back |
137 | | * into the encoding used for file names if you wish to pass it to |
138 | | * the file system; use the original file name instead. |
139 | | * |
140 | | * For example, the document window of a word processor could display |
141 | | * "Unknown file name" in its title bar but still let the user save |
142 | | * the file, as it would keep the raw file name internally. This |
143 | | * can happen if the user has not set the `G_FILENAME_ENCODING` |
144 | | * environment variable even though they have files whose names are |
145 | | * not encoded in UTF-8. |
146 | | * |
147 | | * 3. If your user interface lets the user type a file name for saving |
148 | | * or renaming, convert it to the encoding used for file names in |
149 | | * the file system by using g_filename_from_utf8(). Pass the converted |
150 | | * file name to functions like fopen(). If conversion fails, ask the |
151 | | * user to enter a different file name. This can happen if the user |
152 | | * types Japanese characters when `G_FILENAME_ENCODING` is set to |
153 | | * `ISO-8859-1`, for example. |
154 | | */ |
155 | | |
/* We try to terminate strings in unknown charsets with this many zero bytes
 * to ensure that multibyte strings really are nul-terminated when we return
 * them from g_convert() and friends.
 *
 * The conversion routines in this file allocate this many bytes beyond the
 * space they offer to iconv() and zero them with memset() once conversion
 * has finished.
 */
#define NUL_TERMINATOR_LENGTH 4

/* NOTE(review): presumably this defines the quark accessor behind the
 * G_CONVERT_ERROR domain used in the g_set_error() calls of this file;
 * confirm against the G_DEFINE_QUARK and G_CONVERT_ERROR definitions.
 */
G_DEFINE_QUARK (g_convert_error, g_convert_error)
163 | | |
164 | | static gboolean |
165 | | try_conversion (const char *to_codeset, |
166 | | const char *from_codeset, |
167 | | iconv_t *cd) |
168 | 34.8k | { |
169 | 34.8k | *cd = iconv_open (to_codeset, from_codeset); |
170 | | |
171 | 34.8k | if (*cd == (iconv_t)-1 && errno == EINVAL) |
172 | 0 | return FALSE; |
173 | 34.8k | else |
174 | 34.8k | return TRUE; |
175 | 34.8k | } |
176 | | |
177 | | static gboolean |
178 | | try_to_aliases (const char **to_aliases, |
179 | | const char *from_codeset, |
180 | | iconv_t *cd) |
181 | 0 | { |
182 | 0 | if (to_aliases) |
183 | 0 | { |
184 | 0 | const char **p = to_aliases; |
185 | 0 | while (*p) |
186 | 0 | { |
187 | 0 | if (try_conversion (*p, from_codeset, cd)) |
188 | 0 | return TRUE; |
189 | | |
190 | 0 | p++; |
191 | 0 | } |
192 | 0 | } |
193 | | |
194 | 0 | return FALSE; |
195 | 0 | } |
196 | | |
197 | | /** |
198 | | * g_iconv_open: (skip) |
199 | | * @to_codeset: destination codeset |
200 | | * @from_codeset: source codeset |
201 | | * |
202 | | * Same as the standard UNIX routine iconv_open(), but |
203 | | * may be implemented via libiconv on UNIX flavors that lack |
204 | | * a native implementation. |
205 | | * |
206 | | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
207 | | * more convenient than the raw iconv wrappers. |
208 | | * |
209 | | * Returns: a "conversion descriptor", or (GIConv)-1 if |
210 | | * opening the converter failed. |
211 | | **/ |
212 | | GIConv |
213 | | g_iconv_open (const gchar *to_codeset, |
214 | | const gchar *from_codeset) |
215 | 34.8k | { |
216 | 34.8k | iconv_t cd; |
217 | | |
218 | 34.8k | if (!try_conversion (to_codeset, from_codeset, &cd)) |
219 | 0 | { |
220 | 0 | const char **to_aliases = _g_charset_get_aliases (to_codeset); |
221 | 0 | const char **from_aliases = _g_charset_get_aliases (from_codeset); |
222 | |
|
223 | 0 | if (from_aliases) |
224 | 0 | { |
225 | 0 | const char **p = from_aliases; |
226 | 0 | while (*p) |
227 | 0 | { |
228 | 0 | if (try_conversion (to_codeset, *p, &cd)) |
229 | 0 | goto out; |
230 | | |
231 | 0 | if (try_to_aliases (to_aliases, *p, &cd)) |
232 | 0 | goto out; |
233 | | |
234 | 0 | p++; |
235 | 0 | } |
236 | 0 | } |
237 | | |
238 | 0 | if (try_to_aliases (to_aliases, from_codeset, &cd)) |
239 | 0 | goto out; |
240 | 0 | } |
241 | | |
242 | 34.8k | out: |
243 | 34.8k | return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd; |
244 | 34.8k | } |
245 | | |
246 | | /** |
247 | | * g_iconv: (skip) |
248 | | * @converter: conversion descriptor from g_iconv_open() |
249 | | * @inbuf: bytes to convert |
250 | | * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf |
251 | | * @outbuf: converted output bytes |
252 | | * @outbytes_left: inout parameter, bytes available to fill in @outbuf |
253 | | * |
254 | | * Same as the standard UNIX routine iconv(), but |
255 | | * may be implemented via libiconv on UNIX flavors that lack |
256 | | * a native implementation. |
257 | | * |
258 | | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
259 | | * more convenient than the raw iconv wrappers. |
260 | | * |
261 | | * Note that the behaviour of iconv() for characters which are valid in the |
262 | | * input character set, but which have no representation in the output character |
263 | | * set, is implementation defined. This function may return success (with a |
264 | | * positive number of non-reversible conversions as replacement characters were |
265 | | * used), or it may return -1 and set an error such as %EILSEQ, in such a |
266 | | * situation. |
267 | | * |
268 | | * Returns: count of non-reversible conversions, or -1 on error |
269 | | **/ |
270 | | gsize |
271 | | g_iconv (GIConv converter, |
272 | | gchar **inbuf, |
273 | | gsize *inbytes_left, |
274 | | gchar **outbuf, |
275 | | gsize *outbytes_left) |
276 | 69.6k | { |
277 | 69.6k | iconv_t cd = (iconv_t)converter; |
278 | | |
279 | 69.6k | return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left); |
280 | 69.6k | } |
281 | | |
282 | | /** |
283 | | * g_iconv_close: (skip) |
284 | | * @converter: a conversion descriptor from g_iconv_open() |
285 | | * |
286 | | * Same as the standard UNIX routine iconv_close(), but |
287 | | * may be implemented via libiconv on UNIX flavors that lack |
288 | | * a native implementation. Should be called to clean up |
289 | | * the conversion descriptor from g_iconv_open() when |
290 | | * you are done converting things. |
291 | | * |
292 | | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
293 | | * more convenient than the raw iconv wrappers. |
294 | | * |
295 | | * Returns: -1 on error, 0 on success |
296 | | **/ |
297 | | gint |
298 | | g_iconv_close (GIConv converter) |
299 | 34.8k | { |
300 | 34.8k | iconv_t cd = (iconv_t)converter; |
301 | | |
302 | 34.8k | return iconv_close (cd); |
303 | 34.8k | } |
304 | | |
305 | | static GIConv |
306 | | open_converter (const gchar *to_codeset, |
307 | | const gchar *from_codeset, |
308 | | GError **error) |
309 | 34.8k | { |
310 | 34.8k | GIConv cd; |
311 | | |
312 | 34.8k | cd = g_iconv_open (to_codeset, from_codeset); |
313 | | |
314 | 34.8k | if (cd == (GIConv) -1) |
315 | 0 | { |
316 | | /* Something went wrong. */ |
317 | 0 | if (error) |
318 | 0 | { |
319 | 0 | if (errno == EINVAL) |
320 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION, |
321 | 0 | _("Conversion from character set “%s” to “%s” is not supported"), |
322 | 0 | from_codeset, to_codeset); |
323 | 0 | else |
324 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
325 | 0 | _("Could not open converter from “%s” to “%s”"), |
326 | 0 | from_codeset, to_codeset); |
327 | 0 | } |
328 | 0 | } |
329 | | |
330 | 34.8k | return cd; |
331 | 34.8k | } |
332 | | |
333 | | static int |
334 | | close_converter (GIConv cd) |
335 | 34.8k | { |
336 | 34.8k | if (cd == (GIConv) -1) |
337 | 0 | return 0; |
338 | | |
339 | 34.8k | return g_iconv_close (cd); |
340 | 34.8k | } |
341 | | |
342 | | /** |
343 | | * g_convert_with_iconv: (skip) |
344 | | * @str: (array length=len) (element-type guint8): |
345 | | * the string to convert. |
346 | | * @len: the length of the string in bytes, or -1 if the string is |
347 | | * nul-terminated (Note that some encodings may allow nul |
348 | | * bytes to occur inside strings. In that case, using -1 |
349 | | * for the @len parameter is unsafe) |
350 | | * @converter: conversion descriptor from g_iconv_open() |
351 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
352 | | * the input string that were successfully converted, or %NULL. |
353 | | * Even if the conversion was successful, this may be |
354 | | * less than @len if there were partial characters |
355 | | * at the end of the input. If the error |
356 | | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
357 | | * stored will be the byte offset after the last valid |
358 | | * input sequence. |
359 | | * @bytes_written: (out) (optional): the number of bytes stored in |
360 | | * the output buffer (not including the terminating nul). |
361 | | * @error: location to store the error occurring, or %NULL to ignore |
362 | | * errors. Any of the errors in #GConvertError may occur. |
363 | | * |
364 | | * Converts a string from one character set to another. |
365 | | * |
366 | | * Note that you should use g_iconv() for streaming conversions. |
367 | | * Despite the fact that @bytes_read can return information about partial |
368 | | * characters, the g_convert_... functions are not generally suitable |
369 | | * for streaming. If the underlying converter maintains internal state, |
370 | | * then this won't be preserved across successive calls to g_convert(), |
371 | | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
372 | | * this is the GNU C converter for CP1255 which does not emit a base |
373 | | * character until it knows that the next character is not a mark that |
374 | | * could combine with the base character.) |
375 | | * |
376 | | * Characters which are valid in the input character set, but which have no |
377 | | * representation in the output character set will result in a |
378 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv() |
379 | | * specification, which leaves this behaviour implementation defined. Note that |
380 | | * this is the same error code as is returned for an invalid byte sequence in |
381 | | * the input character set. To get defined behaviour for conversion of |
382 | | * unrepresentable characters, use g_convert_with_fallback(). |
383 | | * |
384 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
385 | | * If the conversion was successful, a newly allocated buffer |
386 | | * containing the converted string, which must be freed with |
387 | | * g_free(). Otherwise %NULL and @error will be set. |
388 | | **/ |
389 | | gchar* |
390 | | g_convert_with_iconv (const gchar *str, |
391 | | gssize len, |
392 | | GIConv converter, |
393 | | gsize *bytes_read, |
394 | | gsize *bytes_written, |
395 | | GError **error) |
396 | 34.8k | { |
397 | 34.8k | gchar *dest; |
398 | 34.8k | gchar *outp; |
399 | 34.8k | const gchar *p; |
400 | 34.8k | gsize inbytes_remaining; |
401 | 34.8k | gsize outbytes_remaining; |
402 | 34.8k | gsize err; |
403 | 34.8k | gsize outbuf_size; |
404 | 34.8k | gboolean have_error = FALSE; |
405 | 34.8k | gboolean done = FALSE; |
406 | 34.8k | gboolean reset = FALSE; |
407 | | |
408 | 34.8k | g_return_val_if_fail (converter != (GIConv) -1, NULL); |
409 | | |
410 | 34.8k | if (len < 0) |
411 | 0 | len = strlen (str); |
412 | | |
413 | 34.8k | p = str; |
414 | 34.8k | inbytes_remaining = len; |
415 | 34.8k | outbuf_size = len + NUL_TERMINATOR_LENGTH; |
416 | | |
417 | 34.8k | outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH; |
418 | 34.8k | outp = dest = g_malloc (outbuf_size); |
419 | | |
420 | 104k | while (!done && !have_error) |
421 | 69.6k | { |
422 | 69.6k | if (reset) |
423 | 34.8k | err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining); |
424 | 34.8k | else |
425 | 34.8k | err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining); |
426 | | |
427 | 69.6k | if (err == (gsize) -1) |
428 | 0 | { |
429 | 0 | switch (errno) |
430 | 0 | { |
431 | 0 | case EINVAL: |
432 | | /* Incomplete text, do not report an error */ |
433 | 0 | done = TRUE; |
434 | 0 | break; |
435 | 0 | case E2BIG: |
436 | 0 | { |
437 | 0 | gsize used = outp - dest; |
438 | | |
439 | 0 | outbuf_size *= 2; |
440 | 0 | dest = g_realloc (dest, outbuf_size); |
441 | | |
442 | 0 | outp = dest + used; |
443 | 0 | outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH; |
444 | 0 | } |
445 | 0 | break; |
446 | 0 | case EILSEQ: |
447 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
448 | 0 | _("Invalid byte sequence in conversion input")); |
449 | 0 | have_error = TRUE; |
450 | 0 | break; |
451 | 0 | default: |
452 | 0 | { |
453 | 0 | int errsv = errno; |
454 | |
|
455 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
456 | 0 | _("Error during conversion: %s"), |
457 | 0 | g_strerror (errsv)); |
458 | 0 | } |
459 | 0 | have_error = TRUE; |
460 | 0 | break; |
461 | 0 | } |
462 | 0 | } |
463 | 69.6k | else if (err > 0) |
464 | 0 | { |
465 | | /* @err gives the number of replacement characters used. */ |
466 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
467 | 0 | _("Unrepresentable character in conversion input")); |
468 | 0 | have_error = TRUE; |
469 | 0 | } |
470 | 69.6k | else |
471 | 69.6k | { |
472 | 69.6k | if (!reset) |
473 | 34.8k | { |
474 | | /* call g_iconv with NULL inbuf to cleanup shift state */ |
475 | 34.8k | reset = TRUE; |
476 | 34.8k | inbytes_remaining = 0; |
477 | 34.8k | } |
478 | 34.8k | else |
479 | 34.8k | done = TRUE; |
480 | 69.6k | } |
481 | 69.6k | } |
482 | | |
483 | 34.8k | memset (outp, 0, NUL_TERMINATOR_LENGTH); |
484 | | |
485 | 34.8k | if (bytes_read) |
486 | 0 | *bytes_read = p - str; |
487 | 34.8k | else |
488 | 34.8k | { |
489 | 34.8k | if ((p - str) != len) |
490 | 0 | { |
491 | 0 | if (!have_error) |
492 | 0 | { |
493 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
494 | 0 | _("Partial character sequence at end of input")); |
495 | 0 | have_error = TRUE; |
496 | 0 | } |
497 | 0 | } |
498 | 34.8k | } |
499 | | |
500 | 34.8k | if (bytes_written) |
501 | 0 | *bytes_written = outp - dest; /* Doesn't include '\0' */ |
502 | | |
503 | 34.8k | if (have_error) |
504 | 0 | { |
505 | 0 | g_free (dest); |
506 | 0 | return NULL; |
507 | 0 | } |
508 | 34.8k | else |
509 | 34.8k | return dest; |
510 | 34.8k | } |
511 | | |
512 | | /** |
513 | | * g_convert: |
514 | | * @str: (array length=len) (element-type guint8): |
515 | | * the string to convert. |
516 | | * @len: the length of the string in bytes, or -1 if the string is |
517 | | * nul-terminated (Note that some encodings may allow nul |
518 | | * bytes to occur inside strings. In that case, using -1 |
519 | | * for the @len parameter is unsafe) |
520 | | * @to_codeset: name of character set into which to convert @str |
521 | | * @from_codeset: character set of @str. |
522 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
523 | | * the input string that were successfully converted, or %NULL. |
524 | | * Even if the conversion was successful, this may be |
525 | | * less than @len if there were partial characters |
526 | | * at the end of the input. If the error |
527 | | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
528 | | * stored will be the byte offset after the last valid |
529 | | * input sequence. |
530 | | * @bytes_written: (out) (optional): the number of bytes stored in |
531 | | * the output buffer (not including the terminating nul). |
532 | | * @error: location to store the error occurring, or %NULL to ignore |
533 | | * errors. Any of the errors in #GConvertError may occur. |
534 | | * |
535 | | * Converts a string from one character set to another. |
536 | | * |
537 | | * Note that you should use g_iconv() for streaming conversions. |
538 | | * Despite the fact that @bytes_read can return information about partial |
539 | | * characters, the g_convert_... functions are not generally suitable |
540 | | * for streaming. If the underlying converter maintains internal state, |
541 | | * then this won't be preserved across successive calls to g_convert(), |
542 | | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
543 | | * this is the GNU C converter for CP1255 which does not emit a base |
544 | | * character until it knows that the next character is not a mark that |
545 | | * could combine with the base character.) |
546 | | * |
547 | | * Using extensions such as "//TRANSLIT" may not work (or may not work |
548 | | * well) on many platforms. Consider using g_str_to_ascii() instead. |
549 | | * |
550 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
551 | | * If the conversion was successful, a newly allocated buffer |
552 | | * containing the converted string, which must be freed with g_free(). |
553 | | * Otherwise %NULL and @error will be set. |
554 | | **/ |
555 | | gchar* |
556 | | g_convert (const gchar *str, |
557 | | gssize len, |
558 | | const gchar *to_codeset, |
559 | | const gchar *from_codeset, |
560 | | gsize *bytes_read, |
561 | | gsize *bytes_written, |
562 | | GError **error) |
563 | 34.8k | { |
564 | 34.8k | gchar *res; |
565 | 34.8k | GIConv cd; |
566 | | |
567 | 34.8k | g_return_val_if_fail (str != NULL, NULL); |
568 | 34.8k | g_return_val_if_fail (to_codeset != NULL, NULL); |
569 | 34.8k | g_return_val_if_fail (from_codeset != NULL, NULL); |
570 | | |
571 | 34.8k | cd = open_converter (to_codeset, from_codeset, error); |
572 | | |
573 | 34.8k | if (cd == (GIConv) -1) |
574 | 0 | { |
575 | 0 | if (bytes_read) |
576 | 0 | *bytes_read = 0; |
577 | | |
578 | 0 | if (bytes_written) |
579 | 0 | *bytes_written = 0; |
580 | | |
581 | 0 | return NULL; |
582 | 0 | } |
583 | | |
584 | 34.8k | res = g_convert_with_iconv (str, len, cd, |
585 | 34.8k | bytes_read, bytes_written, |
586 | 34.8k | error); |
587 | | |
588 | 34.8k | close_converter (cd); |
589 | | |
590 | 34.8k | return res; |
591 | 34.8k | } |
592 | | |
593 | | /** |
594 | | * g_convert_with_fallback: |
595 | | * @str: (array length=len) (element-type guint8): |
596 | | * the string to convert. |
597 | | * @len: the length of the string in bytes, or -1 if the string is |
598 | | * nul-terminated (Note that some encodings may allow nul |
599 | | * bytes to occur inside strings. In that case, using -1 |
600 | | * for the @len parameter is unsafe) |
601 | | * @to_codeset: name of character set into which to convert @str |
602 | | * @from_codeset: character set of @str. |
603 | | * @fallback: UTF-8 string to use in place of characters not |
604 | | * present in the target encoding. (The string must be |
605 | | * representable in the target encoding). |
606 | | * If %NULL, characters not in the target encoding will |
607 | | * be represented as Unicode escapes \uxxxx or \Uxxxxyyyy. |
608 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
609 | | * the input string that were successfully converted, or %NULL. |
610 | | * Even if the conversion was successful, this may be |
611 | | * less than @len if there were partial characters |
612 | | * at the end of the input. |
613 | | * @bytes_written: (out) (optional): the number of bytes stored in |
614 | | * the output buffer (not including the terminating nul). |
615 | | * @error: location to store the error occurring, or %NULL to ignore |
616 | | * errors. Any of the errors in #GConvertError may occur. |
617 | | * |
618 | | * Converts a string from one character set to another, possibly |
619 | | * including fallback sequences for characters not representable |
620 | | * in the output. Note that it is not guaranteed that the specification |
621 | | * for the fallback sequences in @fallback will be honored. Some |
622 | | * systems may do an approximate conversion from @from_codeset |
623 | | * to @to_codeset in their iconv() functions, |
624 | | * in which case GLib will simply return that approximate conversion. |
625 | | * |
626 | | * Note that you should use g_iconv() for streaming conversions. |
627 | | * Despite the fact that @bytes_read can return information about partial |
628 | | * characters, the g_convert_... functions are not generally suitable |
629 | | * for streaming. If the underlying converter maintains internal state, |
630 | | * then this won't be preserved across successive calls to g_convert(), |
631 | | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
632 | | * this is the GNU C converter for CP1255 which does not emit a base |
633 | | * character until it knows that the next character is not a mark that |
634 | | * could combine with the base character.) |
635 | | * |
636 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
637 | | * If the conversion was successful, a newly allocated buffer |
638 | | * containing the converted string, which must be freed with g_free(). |
639 | | * Otherwise %NULL and @error will be set. |
640 | | **/ |
641 | | gchar* |
642 | | g_convert_with_fallback (const gchar *str, |
643 | | gssize len, |
644 | | const gchar *to_codeset, |
645 | | const gchar *from_codeset, |
646 | | const gchar *fallback, |
647 | | gsize *bytes_read, |
648 | | gsize *bytes_written, |
649 | | GError **error) |
650 | 34.8k | { |
651 | 34.8k | gchar *utf8; |
652 | 34.8k | gchar *dest; |
653 | 34.8k | gchar *outp; |
654 | 34.8k | const gchar *insert_str = NULL; |
655 | 34.8k | const gchar *p; |
656 | 34.8k | gsize inbytes_remaining; |
657 | 34.8k | const gchar *save_p = NULL; |
658 | 34.8k | gsize save_inbytes = 0; |
659 | 34.8k | gsize outbytes_remaining; |
660 | 34.8k | gsize err; |
661 | 34.8k | GIConv cd; |
662 | 34.8k | gsize outbuf_size; |
663 | 34.8k | gboolean have_error = FALSE; |
664 | 34.8k | gboolean done = FALSE; |
665 | | |
666 | 34.8k | GError *local_error = NULL; |
667 | | |
668 | 34.8k | g_return_val_if_fail (str != NULL, NULL); |
669 | 34.8k | g_return_val_if_fail (to_codeset != NULL, NULL); |
670 | 34.8k | g_return_val_if_fail (from_codeset != NULL, NULL); |
671 | | |
672 | 34.8k | if (len < 0) |
673 | 34.8k | len = strlen (str); |
674 | | |
675 | | /* Try an exact conversion; we only proceed if this fails |
676 | | * due to an illegal sequence in the input string. |
677 | | */ |
678 | 34.8k | dest = g_convert (str, len, to_codeset, from_codeset, |
679 | 34.8k | bytes_read, bytes_written, &local_error); |
680 | 34.8k | if (!local_error) |
681 | 34.8k | return dest; |
682 | | |
683 | 0 | if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE)) |
684 | 0 | { |
685 | 0 | g_propagate_error (error, local_error); |
686 | 0 | return NULL; |
687 | 0 | } |
688 | 0 | else |
689 | 0 | g_error_free (local_error); |
690 | | |
691 | 0 | local_error = NULL; |
692 | | |
693 | | /* No go; to proceed, we need a converter from "UTF-8" to |
694 | | * to_codeset, and the string as UTF-8. |
695 | | */ |
696 | 0 | cd = open_converter (to_codeset, "UTF-8", error); |
697 | 0 | if (cd == (GIConv) -1) |
698 | 0 | { |
699 | 0 | if (bytes_read) |
700 | 0 | *bytes_read = 0; |
701 | | |
702 | 0 | if (bytes_written) |
703 | 0 | *bytes_written = 0; |
704 | | |
705 | 0 | return NULL; |
706 | 0 | } |
707 | | |
708 | 0 | utf8 = g_convert (str, len, "UTF-8", from_codeset, |
709 | 0 | bytes_read, &inbytes_remaining, error); |
710 | 0 | if (!utf8) |
711 | 0 | { |
712 | 0 | close_converter (cd); |
713 | 0 | if (bytes_written) |
714 | 0 | *bytes_written = 0; |
715 | 0 | return NULL; |
716 | 0 | } |
717 | | |
718 | | /* Now the heart of the code. We loop through the UTF-8 string, and |
719 | | * whenever we hit an offending character, we form fallback, convert |
720 | | * the fallback to the target codeset, and then go back to |
721 | | * converting the original string after finishing with the fallback. |
722 | | * |
723 | | * The variables save_p and save_inbytes store the input state |
724 | | * for the original string while we are converting the fallback |
725 | | */ |
726 | 0 | p = utf8; |
727 | |
|
728 | 0 | outbuf_size = len + NUL_TERMINATOR_LENGTH; |
729 | 0 | outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH; |
730 | 0 | outp = dest = g_malloc (outbuf_size); |
731 | |
|
732 | 0 | while (!done && !have_error) |
733 | 0 | { |
734 | 0 | gsize inbytes_tmp = inbytes_remaining; |
735 | 0 | err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining); |
736 | 0 | inbytes_remaining = inbytes_tmp; |
737 | |
|
738 | 0 | if (err == (gsize) -1) |
739 | 0 | { |
740 | 0 | switch (errno) |
741 | 0 | { |
742 | 0 | case EINVAL: |
743 | 0 | g_assert_not_reached(); |
744 | 0 | break; |
745 | 0 | case E2BIG: |
746 | 0 | { |
747 | 0 | gsize used = outp - dest; |
748 | |
|
749 | 0 | outbuf_size *= 2; |
750 | 0 | dest = g_realloc (dest, outbuf_size); |
751 | | |
752 | 0 | outp = dest + used; |
753 | 0 | outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH; |
754 | | |
755 | 0 | break; |
756 | 0 | } |
757 | 0 | case EILSEQ: |
758 | 0 | if (save_p) |
759 | 0 | { |
760 | | /* Error converting fallback string - fatal |
761 | | */ |
762 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
763 | 0 | _("Cannot convert fallback “%s” to codeset “%s”"), |
764 | 0 | insert_str, to_codeset); |
765 | 0 | have_error = TRUE; |
766 | 0 | break; |
767 | 0 | } |
768 | 0 | else if (p) |
769 | 0 | { |
770 | 0 | if (!fallback) |
771 | 0 | { |
772 | 0 | gunichar ch = g_utf8_get_char (p); |
773 | 0 | insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x", |
774 | 0 | ch); |
775 | 0 | } |
776 | 0 | else |
777 | 0 | insert_str = fallback; |
778 | | |
779 | 0 | save_p = g_utf8_next_char (p); |
780 | 0 | save_inbytes = inbytes_remaining - (save_p - p); |
781 | 0 | p = insert_str; |
782 | 0 | inbytes_remaining = strlen (p); |
783 | 0 | break; |
784 | 0 | } |
785 | | /* if p is null */ |
786 | 0 | G_GNUC_FALLTHROUGH; |
787 | 0 | default: |
788 | 0 | { |
789 | 0 | int errsv = errno; |
790 | |
|
791 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
792 | 0 | _("Error during conversion: %s"), |
793 | 0 | g_strerror (errsv)); |
794 | 0 | } |
795 | |
|
796 | 0 | have_error = TRUE; |
797 | 0 | break; |
798 | 0 | } |
799 | 0 | } |
800 | 0 | else |
801 | 0 | { |
802 | 0 | if (save_p) |
803 | 0 | { |
804 | 0 | if (!fallback) |
805 | 0 | g_free ((gchar *)insert_str); |
806 | 0 | p = save_p; |
807 | 0 | inbytes_remaining = save_inbytes; |
808 | 0 | save_p = NULL; |
809 | 0 | } |
810 | 0 | else if (p) |
811 | 0 | { |
812 | | /* call g_iconv with NULL inbuf to cleanup shift state */ |
813 | 0 | p = NULL; |
814 | 0 | inbytes_remaining = 0; |
815 | 0 | } |
816 | 0 | else |
817 | 0 | done = TRUE; |
818 | 0 | } |
819 | 0 | } |
820 | | |
821 | | /* Cleanup |
822 | | */ |
823 | 0 | memset (outp, 0, NUL_TERMINATOR_LENGTH); |
824 | | |
825 | 0 | close_converter (cd); |
826 | |
|
827 | 0 | if (bytes_written) |
828 | 0 | *bytes_written = outp - dest; /* Doesn't include '\0' */ |
829 | |
|
830 | 0 | g_free (utf8); |
831 | |
|
832 | 0 | if (have_error) |
833 | 0 | { |
834 | 0 | if (save_p && !fallback) |
835 | 0 | g_free ((gchar *)insert_str); |
836 | 0 | g_free (dest); |
837 | 0 | return NULL; |
838 | 0 | } |
839 | 0 | else |
840 | 0 | return dest; |
841 | 0 | } |
842 | | |
843 | | /* |
844 | | * g_locale_to_utf8 |
845 | | * |
846 | | * |
847 | | */ |
848 | | |
849 | | /* |
850 | | * Validate @string as UTF-8. @len can be negative if @string is |
851 | | * nul-terminated, or a non-negative value in bytes. If @string ends in an |
852 | | * incomplete sequence, or contains any illegal sequences or nul codepoints, |
853 | | * %NULL will be returned and the error set to |
854 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. |
855 | | * On success, @bytes_read and @bytes_written, if provided, will be set to |
856 | | * the number of bytes in @string up to @len or the terminating nul byte. |
857 | | * On error, @bytes_read will be set to the byte offset after the last valid |
858 | | * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0. |
859 | | */ |
860 | | static gchar * |
861 | | strdup_len (const gchar *string, |
862 | | gssize len, |
863 | | gsize *bytes_read, |
864 | | gsize *bytes_written, |
865 | | GError **error) |
866 | 0 | { |
867 | 0 | gsize real_len; |
868 | 0 | const gchar *end_valid; |
869 | |
|
870 | 0 | if (!g_utf8_validate (string, len, &end_valid)) |
871 | 0 | { |
872 | 0 | if (bytes_read) |
873 | 0 | *bytes_read = end_valid - string; |
874 | 0 | if (bytes_written) |
875 | 0 | *bytes_written = 0; |
876 | |
|
877 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
878 | 0 | _("Invalid byte sequence in conversion input")); |
879 | 0 | return NULL; |
880 | 0 | } |
881 | | |
882 | 0 | real_len = end_valid - string; |
883 | |
|
884 | 0 | if (bytes_read) |
885 | 0 | *bytes_read = real_len; |
886 | 0 | if (bytes_written) |
887 | 0 | *bytes_written = real_len; |
888 | |
|
889 | 0 | return g_strndup (string, real_len); |
890 | 0 | } |
891 | | |
/* Bit flags selecting which embedded-nul checks convert_checked() applies. */
typedef enum
{
  /* Fail if a nul byte occurs within the first @len bytes of the input. */
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 1 << 0,
  /* Fail if a nul byte occurs within the converted output. */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 1 << 1
} ConvertCheckFlags;
897 | | |
898 | | /* |
899 | | * Convert from @string in the encoding identified by @from_codeset, |
900 | | * returning a string in the encoding identifed by @to_codeset. |
901 | | * @len can be negative if @string is nul-terminated, or a non-negative |
902 | | * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags |
903 | | * to check the input, the output, or both, for embedded nul bytes. |
904 | | * On success, @bytes_read, if provided, will be set to the number of bytes |
905 | | * in @string up to @len or the terminating nul byte, and @bytes_written, if |
906 | | * provided, will be set to the number of output bytes written into the |
907 | | * returned buffer, excluding the terminating nul sequence. |
908 | | * On error, @bytes_read will be set to the byte offset after the last valid |
909 | | * sequence in @string, and @bytes_written will be set to 0. |
910 | | */ |
911 | | static gchar * |
912 | | convert_checked (const gchar *string, |
913 | | gssize len, |
914 | | const gchar *to_codeset, |
915 | | const gchar *from_codeset, |
916 | | ConvertCheckFlags flags, |
917 | | gsize *bytes_read, |
918 | | gsize *bytes_written, |
919 | | GError **error) |
920 | 0 | { |
921 | 0 | gchar *out; |
922 | 0 | gsize outbytes; |
923 | |
|
924 | 0 | if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0) |
925 | 0 | { |
926 | 0 | const gchar *early_nul = memchr (string, '\0', len); |
927 | 0 | if (early_nul != NULL) |
928 | 0 | { |
929 | 0 | if (bytes_read) |
930 | 0 | *bytes_read = early_nul - string; |
931 | 0 | if (bytes_written) |
932 | 0 | *bytes_written = 0; |
933 | |
|
934 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
935 | 0 | _("Embedded NUL byte in conversion input")); |
936 | 0 | return NULL; |
937 | 0 | } |
938 | 0 | } |
939 | | |
940 | 0 | out = g_convert (string, len, to_codeset, from_codeset, |
941 | 0 | bytes_read, &outbytes, error); |
942 | 0 | if (out == NULL) |
943 | 0 | { |
944 | 0 | if (bytes_written) |
945 | 0 | *bytes_written = 0; |
946 | 0 | return NULL; |
947 | 0 | } |
948 | | |
949 | 0 | if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT) |
950 | 0 | && memchr (out, '\0', outbytes) != NULL) |
951 | 0 | { |
952 | 0 | g_free (out); |
953 | 0 | if (bytes_written) |
954 | 0 | *bytes_written = 0; |
955 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL, |
956 | 0 | _("Embedded NUL byte in conversion output")); |
957 | 0 | return NULL; |
958 | 0 | } |
959 | | |
960 | 0 | if (bytes_written) |
961 | 0 | *bytes_written = outbytes; |
962 | 0 | return out; |
963 | 0 | } |
964 | | |
965 | | /** |
966 | | * g_locale_to_utf8: |
967 | | * @opsysstring: (array length=len) (element-type guint8): a string in the |
968 | | * encoding of the current locale. On Windows |
969 | | * this means the system codepage. |
970 | | * @len: the length of the string, or -1 if the string is |
971 | | * nul-terminated (Note that some encodings may allow nul |
972 | | * bytes to occur inside strings. In that case, using -1 |
973 | | * for the @len parameter is unsafe) |
974 | | * @bytes_read: (out) (optional): location to store the number of bytes in the |
975 | | * input string that were successfully converted, or %NULL. |
976 | | * Even if the conversion was successful, this may be |
977 | | * less than @len if there were partial characters |
978 | | * at the end of the input. If the error |
979 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
980 | | * stored will be the byte offset after the last valid |
981 | | * input sequence. |
982 | | * @bytes_written: (out) (optional): the number of bytes stored in the output |
983 | | * buffer (not including the terminating nul). |
984 | | * @error: location to store the error occurring, or %NULL to ignore |
985 | | * errors. Any of the errors in #GConvertError may occur. |
986 | | * |
987 | | * Converts a string which is in the encoding used for strings by |
988 | | * the C runtime (usually the same as that used by the operating |
989 | | * system) in the [current locale][setlocale] into a UTF-8 string. |
990 | | * |
991 | | * If the source encoding is not UTF-8 and the conversion output contains a |
992 | | * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the |
993 | | * function returns %NULL. |
994 | | * If the source encoding is UTF-8, an embedded nul character is treated with |
995 | | * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with |
996 | | * earlier versions of this library. Use g_convert() to produce output that |
997 | | * may contain embedded nul characters. |
998 | | * |
999 | | * Returns: (type utf8): The converted string, or %NULL on an error. |
1000 | | **/ |
1001 | | gchar * |
1002 | | g_locale_to_utf8 (const gchar *opsysstring, |
1003 | | gssize len, |
1004 | | gsize *bytes_read, |
1005 | | gsize *bytes_written, |
1006 | | GError **error) |
1007 | 0 | { |
1008 | 0 | const char *charset; |
1009 | |
|
1010 | 0 | if (g_get_charset (&charset)) |
1011 | 0 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
1012 | 0 | else |
1013 | 0 | return convert_checked (opsysstring, len, "UTF-8", charset, |
1014 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1015 | 0 | bytes_read, bytes_written, error); |
1016 | 0 | } |
1017 | | |
1018 | | /** |
1019 | | * g_locale_from_utf8: |
1020 | | * @utf8string: a UTF-8 encoded string |
1021 | | * @len: the length of the string, or -1 if the string is |
1022 | | * nul-terminated. |
1023 | | * @bytes_read: (out) (optional): location to store the number of bytes in the |
1024 | | * input string that were successfully converted, or %NULL. |
1025 | | * Even if the conversion was successful, this may be |
1026 | | * less than @len if there were partial characters |
1027 | | * at the end of the input. If the error |
1028 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1029 | | * stored will be the byte offset after the last valid |
1030 | | * input sequence. |
1031 | | * @bytes_written: (out) (optional): the number of bytes stored in the output |
1032 | | * buffer (not including the terminating nul). |
1033 | | * @error: location to store the error occurring, or %NULL to ignore |
1034 | | * errors. Any of the errors in #GConvertError may occur. |
1035 | | * |
1036 | | * Converts a string from UTF-8 to the encoding used for strings by |
1037 | | * the C runtime (usually the same as that used by the operating |
1038 | | * system) in the [current locale][setlocale]. On Windows this means |
1039 | | * the system codepage. |
1040 | | * |
1041 | | * The input string shall not contain nul characters even if the @len |
1042 | | * argument is positive. A nul character found inside the string will result |
1043 | | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert |
1044 | | * input that may contain embedded nul characters. |
1045 | | * |
1046 | | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
1047 | | * A newly-allocated buffer containing the converted string, |
1048 | | * or %NULL on an error, and error will be set. |
1049 | | **/ |
1050 | | gchar * |
1051 | | g_locale_from_utf8 (const gchar *utf8string, |
1052 | | gssize len, |
1053 | | gsize *bytes_read, |
1054 | | gsize *bytes_written, |
1055 | | GError **error) |
1056 | 0 | { |
1057 | 0 | const gchar *charset; |
1058 | |
|
1059 | 0 | if (g_get_charset (&charset)) |
1060 | 0 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
1061 | 0 | else |
1062 | 0 | return convert_checked (utf8string, len, charset, "UTF-8", |
1063 | 0 | CONVERT_CHECK_NO_NULS_IN_INPUT, |
1064 | 0 | bytes_read, bytes_written, error); |
1065 | 0 | } |
1066 | | |
1067 | | #ifndef G_PLATFORM_WIN32 |
1068 | | |
1069 | | typedef struct _GFilenameCharsetCache GFilenameCharsetCache; |
1070 | | |
1071 | | struct _GFilenameCharsetCache { |
1072 | | gboolean is_utf8; |
1073 | | gchar *charset; |
1074 | | gchar **filename_charsets; |
1075 | | }; |
1076 | | |
1077 | | static void |
1078 | | filename_charset_cache_free (gpointer data) |
1079 | 0 | { |
1080 | 0 | GFilenameCharsetCache *cache = data; |
1081 | 0 | g_free (cache->charset); |
1082 | 0 | g_strfreev (cache->filename_charsets); |
1083 | 0 | g_free (cache); |
1084 | 0 | } |
1085 | | |
1086 | | /** |
1087 | | * g_get_filename_charsets: |
1088 | | * @filename_charsets: (out) (transfer none) (array zero-terminated=1): |
1089 | | * return location for the %NULL-terminated list of encoding names |
1090 | | * |
1091 | | * Determines the preferred character sets used for filenames. |
1092 | | * The first character set from the @filename_charsets is the filename encoding, the |
1093 | | * subsequent character sets are used when trying to generate a displayable |
1094 | | * representation of a filename, see g_filename_display_name(). |
1095 | | * |
1096 | | * On Unix, the character sets are determined by consulting the |
1097 | | * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`. |
1098 | | * On Windows, the character set used in the GLib API is always UTF-8 |
1099 | | * and said environment variables have no effect. |
1100 | | * |
1101 | | * `G_FILENAME_ENCODING` may be set to a comma-separated list of |
1102 | | * character set names. The special token "\@locale" is taken |
1103 | | * to mean the character set for the [current locale][setlocale]. |
1104 | | * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is, |
1105 | | * the character set of the current locale is taken as the filename |
1106 | | * encoding. If neither environment variable is set, UTF-8 is taken |
1107 | | * as the filename encoding, but the character set of the current locale |
1108 | | * is also put in the list of encodings. |
1109 | | * |
1110 | | * The returned @filename_charsets belong to GLib and must not be freed. |
1111 | | * |
1112 | | * Note that on Unix, regardless of the locale character set or |
1113 | | * `G_FILENAME_ENCODING` value, the actual file names present |
1114 | | * on a system might be in any random encoding or just gibberish. |
1115 | | * |
1116 | | * Returns: %TRUE if the filename encoding is UTF-8. |
1117 | | * |
1118 | | * Since: 2.6 |
1119 | | */ |
1120 | | gboolean |
1121 | | g_get_filename_charsets (const gchar ***filename_charsets) |
1122 | 0 | { |
1123 | 0 | static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free); |
1124 | 0 | GFilenameCharsetCache *cache = g_private_get (&cache_private); |
1125 | 0 | const gchar *charset; |
1126 | |
|
1127 | 0 | if (!cache) |
1128 | 0 | cache = g_private_set_alloc0 (&cache_private, sizeof (GFilenameCharsetCache)); |
1129 | |
|
1130 | 0 | g_get_charset (&charset); |
1131 | |
|
1132 | 0 | if (!(cache->charset && strcmp (cache->charset, charset) == 0)) |
1133 | 0 | { |
1134 | 0 | const gchar *new_charset; |
1135 | 0 | const gchar *p; |
1136 | 0 | gint i; |
1137 | |
|
1138 | 0 | g_free (cache->charset); |
1139 | 0 | g_strfreev (cache->filename_charsets); |
1140 | 0 | cache->charset = g_strdup (charset); |
1141 | | |
1142 | 0 | p = g_getenv ("G_FILENAME_ENCODING"); |
1143 | 0 | if (p != NULL && p[0] != '\0') |
1144 | 0 | { |
1145 | 0 | cache->filename_charsets = g_strsplit (p, ",", 0); |
1146 | 0 | cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0); |
1147 | |
|
1148 | 0 | for (i = 0; cache->filename_charsets[i]; i++) |
1149 | 0 | { |
1150 | 0 | if (strcmp ("@locale", cache->filename_charsets[i]) == 0) |
1151 | 0 | { |
1152 | 0 | g_get_charset (&new_charset); |
1153 | 0 | g_free (cache->filename_charsets[i]); |
1154 | 0 | cache->filename_charsets[i] = g_strdup (new_charset); |
1155 | 0 | } |
1156 | 0 | } |
1157 | 0 | } |
1158 | 0 | else if (g_getenv ("G_BROKEN_FILENAMES") != NULL) |
1159 | 0 | { |
1160 | 0 | cache->filename_charsets = g_new0 (gchar *, 2); |
1161 | 0 | cache->is_utf8 = g_get_charset (&new_charset); |
1162 | 0 | cache->filename_charsets[0] = g_strdup (new_charset); |
1163 | 0 | } |
1164 | 0 | else |
1165 | 0 | { |
1166 | 0 | cache->filename_charsets = g_new0 (gchar *, 3); |
1167 | 0 | cache->is_utf8 = TRUE; |
1168 | 0 | cache->filename_charsets[0] = g_strdup ("UTF-8"); |
1169 | 0 | if (!g_get_charset (&new_charset)) |
1170 | 0 | cache->filename_charsets[1] = g_strdup (new_charset); |
1171 | 0 | } |
1172 | 0 | } |
1173 | |
|
1174 | 0 | if (filename_charsets) |
1175 | 0 | *filename_charsets = (const gchar **)cache->filename_charsets; |
1176 | |
|
1177 | 0 | return cache->is_utf8; |
1178 | 0 | } |
1179 | | |
1180 | | #else /* G_PLATFORM_WIN32 */ |
1181 | | |
1182 | | gboolean |
1183 | | g_get_filename_charsets (const gchar ***filename_charsets) |
1184 | | { |
1185 | | static const gchar *charsets[] = { |
1186 | | "UTF-8", |
1187 | | NULL |
1188 | | }; |
1189 | | |
1190 | | #ifdef G_OS_WIN32 |
1191 | | /* On Windows GLib pretends that the filename charset is UTF-8 */ |
1192 | | if (filename_charsets) |
1193 | | *filename_charsets = charsets; |
1194 | | |
1195 | | return TRUE; |
1196 | | #else |
1197 | | gboolean result; |
1198 | | |
1199 | | /* Cygwin works like before */ |
1200 | | result = g_get_charset (&(charsets[0])); |
1201 | | |
1202 | | if (filename_charsets) |
1203 | | *filename_charsets = charsets; |
1204 | | |
1205 | | return result; |
1206 | | #endif |
1207 | | } |
1208 | | |
1209 | | #endif /* G_PLATFORM_WIN32 */ |
1210 | | |
1211 | | static gboolean |
1212 | | get_filename_charset (const gchar **filename_charset) |
1213 | 0 | { |
1214 | 0 | const gchar **charsets; |
1215 | 0 | gboolean is_utf8; |
1216 | | |
1217 | 0 | is_utf8 = g_get_filename_charsets (&charsets); |
1218 | |
|
1219 | 0 | if (filename_charset) |
1220 | 0 | *filename_charset = charsets[0]; |
1221 | | |
1222 | 0 | return is_utf8; |
1223 | 0 | } |
1224 | | |
1225 | | /** |
1226 | | * g_filename_to_utf8: |
1227 | | * @opsysstring: (type filename): a string in the encoding for filenames |
1228 | | * @len: the length of the string, or -1 if the string is |
1229 | | * nul-terminated (Note that some encodings may allow nul |
1230 | | * bytes to occur inside strings. In that case, using -1 |
1231 | | * for the @len parameter is unsafe) |
1232 | | * @bytes_read: (out) (optional): location to store the number of bytes in the |
1233 | | * input string that were successfully converted, or %NULL. |
1234 | | * Even if the conversion was successful, this may be |
1235 | | * less than @len if there were partial characters |
1236 | | * at the end of the input. If the error |
1237 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1238 | | * stored will be the byte offset after the last valid |
1239 | | * input sequence. |
1240 | | * @bytes_written: (out) (optional): the number of bytes stored in the output |
1241 | | * buffer (not including the terminating nul). |
1242 | | * @error: location to store the error occurring, or %NULL to ignore |
1243 | | * errors. Any of the errors in #GConvertError may occur. |
1244 | | * |
1245 | | * Converts a string which is in the encoding used by GLib for |
1246 | | * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8 |
1247 | | * for filenames; on other platforms, this function indirectly depends on |
1248 | | * the [current locale][setlocale]. |
1249 | | * |
1250 | | * The input string shall not contain nul characters even if the @len |
1251 | | * argument is positive. A nul character found inside the string will result |
1252 | | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. |
1253 | | * If the source encoding is not UTF-8 and the conversion output contains a |
1254 | | * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the |
1255 | | * function returns %NULL. Use g_convert() to produce output that |
1256 | | * may contain embedded nul characters. |
1257 | | * |
1258 | | * Returns: (type utf8): The converted string, or %NULL on an error. |
1259 | | **/ |
1260 | | gchar* |
1261 | | g_filename_to_utf8 (const gchar *opsysstring, |
1262 | | gssize len, |
1263 | | gsize *bytes_read, |
1264 | | gsize *bytes_written, |
1265 | | GError **error) |
1266 | 0 | { |
1267 | 0 | const gchar *charset; |
1268 | |
|
1269 | 0 | g_return_val_if_fail (opsysstring != NULL, NULL); |
1270 | | |
1271 | 0 | if (get_filename_charset (&charset)) |
1272 | 0 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
1273 | 0 | else |
1274 | 0 | return convert_checked (opsysstring, len, "UTF-8", charset, |
1275 | 0 | CONVERT_CHECK_NO_NULS_IN_INPUT | |
1276 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1277 | 0 | bytes_read, bytes_written, error); |
1278 | 0 | } |
1279 | | |
1280 | | /** |
1281 | | * g_filename_from_utf8: |
1282 | | * @utf8string: (type utf8): a UTF-8 encoded string. |
1283 | | * @len: the length of the string, or -1 if the string is |
1284 | | * nul-terminated. |
1285 | | * @bytes_read: (out) (optional): location to store the number of bytes in |
1286 | | * the input string that were successfully converted, or %NULL. |
1287 | | * Even if the conversion was successful, this may be |
1288 | | * less than @len if there were partial characters |
1289 | | * at the end of the input. If the error |
1290 | | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1291 | | * stored will be the byte offset after the last valid |
1292 | | * input sequence. |
1293 | | * @bytes_written: (out) (optional): the number of bytes stored in |
1294 | | * the output buffer (not including the terminating nul). |
1295 | | * @error: location to store the error occurring, or %NULL to ignore |
1296 | | * errors. Any of the errors in #GConvertError may occur. |
1297 | | * |
1298 | | * Converts a string from UTF-8 to the encoding GLib uses for |
1299 | | * filenames. Note that on Windows GLib uses UTF-8 for filenames; |
1300 | | * on other platforms, this function indirectly depends on the |
1301 | | * [current locale][setlocale]. |
1302 | | * |
1303 | | * The input string shall not contain nul characters even if the @len |
1304 | | * argument is positive. A nul character found inside the string will result |
1305 | | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is |
1306 | | * not UTF-8 and the conversion output contains a nul character, the error |
1307 | | * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL. |
1308 | | * |
1309 | | * Returns: (type filename): |
1310 | | * The converted string, or %NULL on an error. |
1311 | | **/ |
1312 | | gchar* |
1313 | | g_filename_from_utf8 (const gchar *utf8string, |
1314 | | gssize len, |
1315 | | gsize *bytes_read, |
1316 | | gsize *bytes_written, |
1317 | | GError **error) |
1318 | 0 | { |
1319 | 0 | const gchar *charset; |
1320 | |
|
1321 | 0 | if (get_filename_charset (&charset)) |
1322 | 0 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
1323 | 0 | else |
1324 | 0 | return convert_checked (utf8string, len, charset, "UTF-8", |
1325 | 0 | CONVERT_CHECK_NO_NULS_IN_INPUT | |
1326 | 0 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1327 | 0 | bytes_read, bytes_written, error); |
1328 | 0 | } |
1329 | | |
1330 | | /* Test of haystack has the needle prefix, comparing case |
1331 | | * insensitive. haystack may be UTF-8, but needle must |
1332 | | * contain only ascii. */ |
1333 | | static gboolean |
1334 | | has_case_prefix (const gchar *haystack, const gchar *needle) |
1335 | 0 | { |
1336 | 0 | const gchar *h, *n; |
1337 | | |
1338 | | /* Eat one character at a time. */ |
1339 | 0 | h = haystack; |
1340 | 0 | n = needle; |
1341 | |
|
1342 | 0 | while (*n && *h && |
1343 | 0 | g_ascii_tolower (*n) == g_ascii_tolower (*h)) |
1344 | 0 | { |
1345 | 0 | n++; |
1346 | 0 | h++; |
1347 | 0 | } |
1348 | | |
1349 | 0 | return *n == '\0'; |
1350 | 0 | } |
1351 | | |
/* Escaping policies; each value is a single bit that is tested against the
 * per-character masks in the acceptable[] table. */
typedef enum {
  /* Escape all unsafe characters */
  UNSAFE_ALL        = 0x1,
  /* Allows '+' */
  UNSAFE_ALLOW_PLUS = 0x2,
  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_PATH       = 0x8,
  /* Allows '/' and ':' and '@' */
  UNSAFE_HOST       = 0x10,
  /* Allows all characters except for '/' and '%' */
  UNSAFE_SLASHES    = 0x20
} UnsafeCharacterSet;
1359 | | |
1360 | | static const guchar acceptable[96] = { |
1361 | | /* A table of the ASCII chars from space (32) to DEL (127) */ |
1362 | | /* ! " # $ % & ' ( ) * + , - . / */ |
1363 | | 0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C, |
1364 | | /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ |
1365 | | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20, |
1366 | | /* @ A B C D E F G H I J K L M N O */ |
1367 | | 0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, |
1368 | | /* P Q R S T U V W X Y Z [ \ ] ^ _ */ |
1369 | | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F, |
1370 | | /* ` a b c d e f g h i j k l m n o */ |
1371 | | 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, |
1372 | | /* p q r s t u v w x y z { | } ~ DEL */ |
1373 | | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20 |
1374 | | }; |
1375 | | |
1376 | | static const gchar hex[16] = "0123456789ABCDEF"; |
1377 | | |
1378 | | /* Note: This escape function works on file: URIs, but if you want to |
1379 | | * escape something else, please read RFC-2396 */ |
1380 | | static gchar * |
1381 | | g_escape_uri_string (const gchar *string, |
1382 | | UnsafeCharacterSet mask) |
1383 | 0 | { |
1384 | 0 | #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask)) |
1385 | |
|
1386 | 0 | const gchar *p; |
1387 | 0 | gchar *q; |
1388 | 0 | gchar *result; |
1389 | 0 | int c; |
1390 | 0 | gint unacceptable; |
1391 | 0 | UnsafeCharacterSet use_mask; |
1392 | | |
1393 | 0 | g_return_val_if_fail (mask == UNSAFE_ALL |
1394 | 0 | || mask == UNSAFE_ALLOW_PLUS |
1395 | 0 | || mask == UNSAFE_PATH |
1396 | 0 | || mask == UNSAFE_HOST |
1397 | 0 | || mask == UNSAFE_SLASHES, NULL); |
1398 | | |
1399 | 0 | unacceptable = 0; |
1400 | 0 | use_mask = mask; |
1401 | 0 | for (p = string; *p != '\0'; p++) |
1402 | 0 | { |
1403 | 0 | c = (guchar) *p; |
1404 | 0 | if (!ACCEPTABLE (c)) |
1405 | 0 | unacceptable++; |
1406 | 0 | } |
1407 | | |
1408 | 0 | result = g_malloc (p - string + unacceptable * 2 + 1); |
1409 | | |
1410 | 0 | use_mask = mask; |
1411 | 0 | for (q = result, p = string; *p != '\0'; p++) |
1412 | 0 | { |
1413 | 0 | c = (guchar) *p; |
1414 | | |
1415 | 0 | if (!ACCEPTABLE (c)) |
1416 | 0 | { |
1417 | 0 | *q++ = '%'; /* means hex coming */ |
1418 | 0 | *q++ = hex[c >> 4]; |
1419 | 0 | *q++ = hex[c & 15]; |
1420 | 0 | } |
1421 | 0 | else |
1422 | 0 | *q++ = *p; |
1423 | 0 | } |
1424 | | |
1425 | 0 | *q = '\0'; |
1426 | | |
1427 | 0 | return result; |
1428 | 0 | } |
1429 | | |
1430 | | |
1431 | | static gchar * |
1432 | | g_escape_file_uri (const gchar *hostname, |
1433 | | const gchar *pathname) |
1434 | 0 | { |
1435 | 0 | char *escaped_hostname = NULL; |
1436 | 0 | char *escaped_path; |
1437 | 0 | char *res; |
1438 | |
|
1439 | | #ifdef G_OS_WIN32 |
1440 | | char *p, *backslash; |
1441 | | |
1442 | | /* Turn backslashes into forward slashes. That's what Netscape |
1443 | | * does, and they are actually more or less equivalent in Windows. |
1444 | | */ |
1445 | | |
1446 | | pathname = g_strdup (pathname); |
1447 | | p = (char *) pathname; |
1448 | | |
1449 | | while ((backslash = strchr (p, '\\')) != NULL) |
1450 | | { |
1451 | | *backslash = '/'; |
1452 | | p = backslash + 1; |
1453 | | } |
1454 | | #endif |
1455 | |
|
1456 | 0 | if (hostname && *hostname != '\0') |
1457 | 0 | { |
1458 | 0 | escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST); |
1459 | 0 | } |
1460 | |
|
1461 | 0 | escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH); |
1462 | |
|
1463 | 0 | res = g_strconcat ("file://", |
1464 | 0 | (escaped_hostname) ? escaped_hostname : "", |
1465 | 0 | (*escaped_path != '/') ? "/" : "", |
1466 | 0 | escaped_path, |
1467 | 0 | NULL); |
1468 | |
|
1469 | | #ifdef G_OS_WIN32 |
1470 | | g_free ((char *) pathname); |
1471 | | #endif |
1472 | |
|
1473 | 0 | g_free (escaped_hostname); |
1474 | 0 | g_free (escaped_path); |
1475 | | |
1476 | 0 | return res; |
1477 | 0 | } |
1478 | | |
/*
 * Decode the two hexadecimal digits at scanner[0] and scanner[1].
 *
 * Returns the decoded byte value, or -1 if either character is not a
 * hexadecimal digit.  scanner[1] is not read when scanner[0] is invalid.
 */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return (high_nibble << 4) | low_nibble;
}
1495 | | |
1496 | | static gchar * |
1497 | | g_unescape_uri_string (const char *escaped, |
1498 | | int len, |
1499 | | const char *illegal_escaped_characters, |
1500 | | gboolean ascii_must_not_be_escaped) |
1501 | 0 | { |
1502 | 0 | const gchar *in, *in_end; |
1503 | 0 | gchar *out, *result; |
1504 | 0 | int c; |
1505 | | |
1506 | 0 | if (escaped == NULL) |
1507 | 0 | return NULL; |
1508 | | |
1509 | 0 | if (len < 0) |
1510 | 0 | len = strlen (escaped); |
1511 | |
|
1512 | 0 | result = g_malloc (len + 1); |
1513 | | |
1514 | 0 | out = result; |
1515 | 0 | for (in = escaped, in_end = escaped + len; in < in_end; in++) |
1516 | 0 | { |
1517 | 0 | c = *in; |
1518 | |
|
1519 | 0 | if (c == '%') |
1520 | 0 | { |
1521 | | /* catch partial escape sequences past the end of the substring */ |
1522 | 0 | if (in + 3 > in_end) |
1523 | 0 | break; |
1524 | | |
1525 | 0 | c = unescape_character (in + 1); |
1526 | | |
1527 | | /* catch bad escape sequences and NUL characters */ |
1528 | 0 | if (c <= 0) |
1529 | 0 | break; |
1530 | | |
1531 | | /* catch escaped ASCII */ |
1532 | 0 | if (ascii_must_not_be_escaped && c <= 0x7F) |
1533 | 0 | break; |
1534 | | |
1535 | | /* catch other illegal escaped characters */ |
1536 | 0 | if (strchr (illegal_escaped_characters, c) != NULL) |
1537 | 0 | break; |
1538 | | |
1539 | 0 | in += 2; |
1540 | 0 | } |
1541 | | |
1542 | 0 | *out++ = c; |
1543 | 0 | } |
1544 | | |
1545 | 0 | g_assert (out - result <= len); |
1546 | 0 | *out = '\0'; |
1547 | |
|
1548 | 0 | if (in != in_end) |
1549 | 0 | { |
1550 | 0 | g_free (result); |
1551 | 0 | return NULL; |
1552 | 0 | } |
1553 | | |
1554 | 0 | return result; |
1555 | 0 | } |
1556 | | |
1557 | | static gboolean |
1558 | | is_asciialphanum (gunichar c) |
1559 | 0 | { |
1560 | 0 | return c <= 0x7F && g_ascii_isalnum (c); |
1561 | 0 | } |
1562 | | |
1563 | | static gboolean |
1564 | | is_asciialpha (gunichar c) |
1565 | 0 | { |
1566 | 0 | return c <= 0x7F && g_ascii_isalpha (c); |
1567 | 0 | } |
1568 | | |
1569 | | /* allows an empty string */ |
1570 | | static gboolean |
1571 | | hostname_validate (const char *hostname) |
1572 | 0 | { |
1573 | 0 | const char *p; |
1574 | 0 | gunichar c, first_char, last_char; |
1575 | |
|
1576 | 0 | p = hostname; |
1577 | 0 | if (*p == '\0') |
1578 | 0 | return TRUE; |
1579 | 0 | do |
1580 | 0 | { |
1581 | | /* read in a label */ |
1582 | 0 | c = g_utf8_get_char (p); |
1583 | 0 | p = g_utf8_next_char (p); |
1584 | 0 | if (!is_asciialphanum (c)) |
1585 | 0 | return FALSE; |
1586 | 0 | first_char = c; |
1587 | 0 | do |
1588 | 0 | { |
1589 | 0 | last_char = c; |
1590 | 0 | c = g_utf8_get_char (p); |
1591 | 0 | p = g_utf8_next_char (p); |
1592 | 0 | } |
1593 | 0 | while (is_asciialphanum (c) || c == '-'); |
1594 | 0 | if (last_char == '-') |
1595 | 0 | return FALSE; |
1596 | | |
1597 | | /* if that was the last label, check that it was a toplabel */ |
1598 | 0 | if (c == '\0' || (c == '.' && *p == '\0')) |
1599 | 0 | return is_asciialpha (first_char); |
1600 | 0 | } |
1601 | 0 | while (c == '.'); |
1602 | 0 | return FALSE; |
1603 | 0 | } |
1604 | | |
1605 | | /** |
1606 | | * g_filename_from_uri: |
1607 | | * @uri: a uri describing a filename (escaped, encoded in ASCII). |
1608 | | * @hostname: (out) (optional) (nullable): Location to store hostname for the URI. |
1609 | | * If there is no hostname in the URI, %NULL will be |
1610 | | * stored in this location. |
1611 | | * @error: location to store the error occurring, or %NULL to ignore |
1612 | | * errors. Any of the errors in #GConvertError may occur. |
1613 | | * |
1614 | | * Converts an escaped ASCII-encoded URI to a local filename in the |
1615 | | * encoding used for filenames. |
1616 | | * |
1617 | | * Returns: (type filename): a newly-allocated string holding |
1618 | | * the resulting filename, or %NULL on an error. |
1619 | | **/ |
1620 | | gchar * |
1621 | | g_filename_from_uri (const gchar *uri, |
1622 | | gchar **hostname, |
1623 | | GError **error) |
1624 | 0 | { |
1625 | 0 | const char *path_part; |
1626 | 0 | const char *host_part; |
1627 | 0 | char *unescaped_hostname; |
1628 | 0 | char *result; |
1629 | 0 | char *filename; |
1630 | 0 | int offs; |
1631 | | #ifdef G_OS_WIN32 |
1632 | | char *p, *slash; |
1633 | | #endif |
1634 | |
|
1635 | 0 | if (hostname) |
1636 | 0 | *hostname = NULL; |
1637 | |
|
1638 | 0 | if (!has_case_prefix (uri, "file:/")) |
1639 | 0 | { |
1640 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1641 | 0 | _("The URI “%s” is not an absolute URI using the “file” scheme"), |
1642 | 0 | uri); |
1643 | 0 | return NULL; |
1644 | 0 | } |
1645 | | |
1646 | 0 | path_part = uri + strlen ("file:"); |
1647 | | |
1648 | 0 | if (strchr (path_part, '#') != NULL) |
1649 | 0 | { |
1650 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1651 | 0 | _("The local file URI “%s” may not include a “#”"), |
1652 | 0 | uri); |
1653 | 0 | return NULL; |
1654 | 0 | } |
1655 | | |
1656 | 0 | if (has_case_prefix (path_part, "///")) |
1657 | 0 | path_part += 2; |
1658 | 0 | else if (has_case_prefix (path_part, "//")) |
1659 | 0 | { |
1660 | 0 | path_part += 2; |
1661 | 0 | host_part = path_part; |
1662 | |
|
1663 | 0 | path_part = strchr (path_part, '/'); |
1664 | |
|
1665 | 0 | if (path_part == NULL) |
1666 | 0 | { |
1667 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1668 | 0 | _("The URI “%s” is invalid"), |
1669 | 0 | uri); |
1670 | 0 | return NULL; |
1671 | 0 | } |
1672 | | |
1673 | 0 | unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE); |
1674 | |
|
1675 | 0 | if (unescaped_hostname == NULL || |
1676 | 0 | !hostname_validate (unescaped_hostname)) |
1677 | 0 | { |
1678 | 0 | g_free (unescaped_hostname); |
1679 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1680 | 0 | _("The hostname of the URI “%s” is invalid"), |
1681 | 0 | uri); |
1682 | 0 | return NULL; |
1683 | 0 | } |
1684 | | |
1685 | 0 | if (hostname) |
1686 | 0 | *hostname = unescaped_hostname; |
1687 | 0 | else |
1688 | 0 | g_free (unescaped_hostname); |
1689 | 0 | } |
1690 | | |
1691 | 0 | filename = g_unescape_uri_string (path_part, -1, "/", FALSE); |
1692 | |
|
1693 | 0 | if (filename == NULL) |
1694 | 0 | { |
1695 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
1696 | 0 | _("The URI “%s” contains invalidly escaped characters"), |
1697 | 0 | uri); |
1698 | 0 | return NULL; |
1699 | 0 | } |
1700 | | |
1701 | 0 | offs = 0; |
1702 | | #ifdef G_OS_WIN32 |
1703 | | /* Drop localhost */ |
1704 | | if (hostname && *hostname != NULL && |
1705 | | g_ascii_strcasecmp (*hostname, "localhost") == 0) |
1706 | | { |
1707 | | g_free (*hostname); |
1708 | | *hostname = NULL; |
1709 | | } |
1710 | | |
1711 | | /* Turn slashes into backslashes, because that's the canonical spelling */ |
1712 | | p = filename; |
1713 | | while ((slash = strchr (p, '/')) != NULL) |
1714 | | { |
1715 | | *slash = '\\'; |
1716 | | p = slash + 1; |
1717 | | } |
1718 | | |
1719 | | /* Windows URIs with a drive letter can be like "file://host/c:/foo" |
1720 | | * or "file://host/c|/foo" (some Netscape versions). In those cases, start |
1721 | | * the filename from the drive letter. |
1722 | | */ |
1723 | | if (g_ascii_isalpha (filename[1])) |
1724 | | { |
1725 | | if (filename[2] == ':') |
1726 | | offs = 1; |
1727 | | else if (filename[2] == '|') |
1728 | | { |
1729 | | filename[2] = ':'; |
1730 | | offs = 1; |
1731 | | } |
1732 | | } |
1733 | | #endif |
1734 | |
|
1735 | 0 | result = g_strdup (filename + offs); |
1736 | 0 | g_free (filename); |
1737 | |
|
1738 | 0 | return result; |
1739 | 0 | } |
1740 | | |
1741 | | /** |
1742 | | * g_filename_to_uri: |
1743 | | * @filename: (type filename): an absolute filename specified in the GLib file |
1744 | | * name encoding, which is the on-disk file name bytes on Unix, and UTF-8 |
1745 | | * on Windows |
1746 | | * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none. |
1747 | | * @error: location to store the error occurring, or %NULL to ignore |
1748 | | * errors. Any of the errors in #GConvertError may occur. |
1749 | | * |
1750 | | * Converts an absolute filename to an escaped ASCII-encoded URI, with the path |
1751 | | * component following Section 3.3. of RFC 2396. |
1752 | | * |
1753 | | * Returns: a newly-allocated string holding the resulting |
1754 | | * URI, or %NULL on an error. |
1755 | | **/ |
1756 | | gchar * |
1757 | | g_filename_to_uri (const gchar *filename, |
1758 | | const gchar *hostname, |
1759 | | GError **error) |
1760 | 0 | { |
1761 | 0 | char *escaped_uri; |
1762 | |
|
1763 | 0 | g_return_val_if_fail (filename != NULL, NULL); |
1764 | | |
1765 | 0 | if (!g_path_is_absolute (filename)) |
1766 | 0 | { |
1767 | 0 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH, |
1768 | 0 | _("The pathname “%s” is not an absolute path"), |
1769 | 0 | filename); |
1770 | 0 | return NULL; |
1771 | 0 | } |
1772 | | |
1773 | 0 | if (hostname && |
1774 | 0 | !(g_utf8_validate (hostname, -1, NULL) |
1775 | 0 | && hostname_validate (hostname))) |
1776 | 0 | { |
1777 | 0 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
1778 | 0 | _("Invalid hostname")); |
1779 | 0 | return NULL; |
1780 | 0 | } |
1781 | | |
1782 | | #ifdef G_OS_WIN32 |
1783 | | /* Don't use localhost unnecessarily */ |
1784 | | if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0) |
1785 | | hostname = NULL; |
1786 | | #endif |
1787 | | |
1788 | 0 | escaped_uri = g_escape_file_uri (hostname, filename); |
1789 | |
|
1790 | 0 | return escaped_uri; |
1791 | 0 | } |
1792 | | |
1793 | | /** |
1794 | | * g_uri_list_extract_uris: |
1795 | | * @uri_list: an URI list |
1796 | | * |
1797 | | * Splits an URI list conforming to the text/uri-list |
1798 | | * mime type defined in RFC 2483 into individual URIs, |
1799 | | * discarding any comments. The URIs are not validated. |
1800 | | * |
1801 | | * Returns: (transfer full): a newly allocated %NULL-terminated list |
1802 | | * of strings holding the individual URIs. The array should be freed |
1803 | | * with g_strfreev(). |
1804 | | * |
1805 | | * Since: 2.6 |
1806 | | */ |
1807 | | gchar ** |
1808 | | g_uri_list_extract_uris (const gchar *uri_list) |
1809 | 0 | { |
1810 | 0 | GPtrArray *uris; |
1811 | 0 | const gchar *p, *q; |
1812 | |
|
1813 | 0 | uris = g_ptr_array_new (); |
1814 | |
|
1815 | 0 | p = uri_list; |
1816 | | |
1817 | | /* We don't actually try to validate the URI according to RFC |
1818 | | * 2396, or even check for allowed characters - we just ignore |
1819 | | * comments and trim whitespace off the ends. We also |
1820 | | * allow LF delimination as well as the specified CRLF. |
1821 | | * |
1822 | | * We do allow comments like specified in RFC 2483. |
1823 | | */ |
1824 | 0 | while (p) |
1825 | 0 | { |
1826 | 0 | if (*p != '#') |
1827 | 0 | { |
1828 | 0 | while (g_ascii_isspace (*p)) |
1829 | 0 | p++; |
1830 | |
|
1831 | 0 | q = p; |
1832 | 0 | while (*q && (*q != '\n') && (*q != '\r')) |
1833 | 0 | q++; |
1834 | |
|
1835 | 0 | if (q > p) |
1836 | 0 | { |
1837 | 0 | q--; |
1838 | 0 | while (q > p && g_ascii_isspace (*q)) |
1839 | 0 | q--; |
1840 | |
|
1841 | 0 | if (q > p) |
1842 | 0 | g_ptr_array_add (uris, g_strndup (p, q - p + 1)); |
1843 | 0 | } |
1844 | 0 | } |
1845 | 0 | p = strchr (p, '\n'); |
1846 | 0 | if (p) |
1847 | 0 | p++; |
1848 | 0 | } |
1849 | |
|
1850 | 0 | g_ptr_array_add (uris, NULL); |
1851 | |
|
1852 | 0 | return (gchar **) g_ptr_array_free (uris, FALSE); |
1853 | 0 | } |
1854 | | |
1855 | | /** |
1856 | | * g_filename_display_basename: |
1857 | | * @filename: (type filename): an absolute pathname in the |
1858 | | * GLib file name encoding |
1859 | | * |
1860 | | * Returns the display basename for the particular filename, guaranteed |
1861 | | * to be valid UTF-8. The display name might not be identical to the filename, |
1862 | | * for instance there might be problems converting it to UTF-8, and some files |
1863 | | * can be translated in the display. |
1864 | | * |
1865 | | * If GLib cannot make sense of the encoding of @filename, as a last resort it |
1866 | | * replaces unknown characters with U+FFFD, the Unicode replacement character. |
1867 | | * You can search the result for the UTF-8 encoding of this character (which is |
1868 | | * "\357\277\275" in octal notation) to find out if @filename was in an invalid |
1869 | | * encoding. |
1870 | | * |
1871 | | * You must pass the whole absolute pathname to this functions so that |
1872 | | * translation of well known locations can be done. |
1873 | | * |
1874 | | * This function is preferred over g_filename_display_name() if you know the |
1875 | | * whole path, as it allows translation. |
1876 | | * |
1877 | | * Returns: a newly allocated string containing |
1878 | | * a rendition of the basename of the filename in valid UTF-8 |
1879 | | * |
1880 | | * Since: 2.6 |
1881 | | **/ |
1882 | | gchar * |
1883 | | g_filename_display_basename (const gchar *filename) |
1884 | 0 | { |
1885 | 0 | char *basename; |
1886 | 0 | char *display_name; |
1887 | |
|
1888 | 0 | g_return_val_if_fail (filename != NULL, NULL); |
1889 | | |
1890 | 0 | basename = g_path_get_basename (filename); |
1891 | 0 | display_name = g_filename_display_name (basename); |
1892 | 0 | g_free (basename); |
1893 | 0 | return display_name; |
1894 | 0 | } |
1895 | | |
1896 | | /** |
1897 | | * g_filename_display_name: |
1898 | | * @filename: (type filename): a pathname hopefully in the |
1899 | | * GLib file name encoding |
1900 | | * |
1901 | | * Converts a filename into a valid UTF-8 string. The conversion is |
1902 | | * not necessarily reversible, so you should keep the original around |
1903 | | * and use the return value of this function only for display purposes. |
1904 | | * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL |
1905 | | * even if the filename actually isn't in the GLib file name encoding. |
1906 | | * |
1907 | | * If GLib cannot make sense of the encoding of @filename, as a last resort it |
1908 | | * replaces unknown characters with U+FFFD, the Unicode replacement character. |
1909 | | * You can search the result for the UTF-8 encoding of this character (which is |
1910 | | * "\357\277\275" in octal notation) to find out if @filename was in an invalid |
1911 | | * encoding. |
1912 | | * |
1913 | | * If you know the whole pathname of the file you should use |
1914 | | * g_filename_display_basename(), since that allows location-based |
1915 | | * translation of filenames. |
1916 | | * |
1917 | | * Returns: a newly allocated string containing |
1918 | | * a rendition of the filename in valid UTF-8 |
1919 | | * |
1920 | | * Since: 2.6 |
1921 | | **/ |
1922 | | gchar * |
1923 | | g_filename_display_name (const gchar *filename) |
1924 | 0 | { |
1925 | 0 | gint i; |
1926 | 0 | const gchar **charsets; |
1927 | 0 | gchar *display_name = NULL; |
1928 | 0 | gboolean is_utf8; |
1929 | | |
1930 | 0 | is_utf8 = g_get_filename_charsets (&charsets); |
1931 | |
|
1932 | 0 | if (is_utf8) |
1933 | 0 | { |
1934 | 0 | if (g_utf8_validate (filename, -1, NULL)) |
1935 | 0 | display_name = g_strdup (filename); |
1936 | 0 | } |
1937 | | |
1938 | 0 | if (!display_name) |
1939 | 0 | { |
1940 | | /* Try to convert from the filename charsets to UTF-8. |
1941 | | * Skip the first charset if it is UTF-8. |
1942 | | */ |
1943 | 0 | for (i = is_utf8 ? 1 : 0; charsets[i]; i++) |
1944 | 0 | { |
1945 | 0 | display_name = g_convert (filename, -1, "UTF-8", charsets[i], |
1946 | 0 | NULL, NULL, NULL); |
1947 | |
|
1948 | 0 | if (display_name) |
1949 | 0 | break; |
1950 | 0 | } |
1951 | 0 | } |
1952 | | |
1953 | | /* if all conversions failed, we replace invalid UTF-8 |
1954 | | * by a question mark |
1955 | | */ |
1956 | 0 | if (!display_name) |
1957 | 0 | display_name = g_utf8_make_valid (filename, -1); |
1958 | |
|
1959 | 0 | return display_name; |
1960 | 0 | } |
1961 | | |
1962 | | #ifdef G_OS_WIN32 |
1963 | | |
1964 | | /* Binary compatibility versions. Not for newly compiled code. */ |
1965 | | |
1966 | | _GLIB_EXTERN gchar *g_filename_to_utf8_utf8 (const gchar *opsysstring, |
1967 | | gssize len, |
1968 | | gsize *bytes_read, |
1969 | | gsize *bytes_written, |
1970 | | GError **error) G_GNUC_MALLOC; |
1971 | | _GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar *utf8string, |
1972 | | gssize len, |
1973 | | gsize *bytes_read, |
1974 | | gsize *bytes_written, |
1975 | | GError **error) G_GNUC_MALLOC; |
1976 | | _GLIB_EXTERN gchar *g_filename_from_uri_utf8 (const gchar *uri, |
1977 | | gchar **hostname, |
1978 | | GError **error) G_GNUC_MALLOC; |
1979 | | _GLIB_EXTERN gchar *g_filename_to_uri_utf8 (const gchar *filename, |
1980 | | const gchar *hostname, |
1981 | | GError **error) G_GNUC_MALLOC; |
1982 | | |
1983 | | gchar * |
1984 | | g_filename_to_utf8_utf8 (const gchar *opsysstring, |
1985 | | gssize len, |
1986 | | gsize *bytes_read, |
1987 | | gsize *bytes_written, |
1988 | | GError **error) |
1989 | | { |
1990 | | return g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error); |
1991 | | } |
1992 | | |
1993 | | gchar * |
1994 | | g_filename_from_utf8_utf8 (const gchar *utf8string, |
1995 | | gssize len, |
1996 | | gsize *bytes_read, |
1997 | | gsize *bytes_written, |
1998 | | GError **error) |
1999 | | { |
2000 | | return g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error); |
2001 | | } |
2002 | | |
2003 | | gchar * |
2004 | | g_filename_from_uri_utf8 (const gchar *uri, |
2005 | | gchar **hostname, |
2006 | | GError **error) |
2007 | | { |
2008 | | return g_filename_from_uri (uri, hostname, error); |
2009 | | } |
2010 | | |
2011 | | gchar * |
2012 | | g_filename_to_uri_utf8 (const gchar *filename, |
2013 | | const gchar *hostname, |
2014 | | GError **error) |
2015 | | { |
2016 | | return g_filename_to_uri (filename, hostname, error); |
2017 | | } |
2018 | | |
2019 | | #endif |