Coverage Report

Created: 2025-12-28 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gstreamer/subprojects/glib-2.86.3/glib/gconvert.c
Line
Count
Source
1
/* GLIB - Library of useful routines for C programming
2
 *
3
 * gconvert.c: Convert between character sets using iconv
4
 * Copyright Red Hat Inc., 2000
5
 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
6
 *
7
 * SPDX-License-Identifier: LGPL-2.1-or-later
8
 *
9
 * This library is free software; you can redistribute it and/or
10
 * modify it under the terms of the GNU Lesser General Public
11
 * License as published by the Free Software Foundation; either
12
 * version 2.1 of the License, or (at your option) any later version.
13
 *
14
 * This library is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 * Lesser General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU Lesser General Public
20
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21
 */
22
23
#include "config.h"
24
#include "glibconfig.h"
25
26
#ifndef G_OS_WIN32
27
#include <iconv.h>
28
#endif
29
#include <errno.h>
30
#include <stdio.h>
31
#include <string.h>
32
#include <stdlib.h>
33
34
#ifdef G_OS_WIN32
35
#include <windows.h>
36
#include "win_iconv.c"
37
#endif
38
39
#include "gconvert.h"
40
#include "gconvertprivate.h"
41
42
#include "gcharsetprivate.h"
43
#include "gslist.h"
44
#include "gstrfuncs.h"
45
#include "gtestutils.h"
46
#include "gthread.h"
47
#include "gthreadprivate.h"
48
#include "gunicode.h"
49
#include "gfileutils.h"
50
#include "genviron.h"
51
52
#include "glibintl.h"
53
54
/* We try to terminate strings in unknown charsets with this many zero bytes
55
 * to ensure that multibyte strings really are nul-terminated when we return
56
 * them from g_convert() and friends.
57
 */
58
83.8k
#define NUL_TERMINATOR_LENGTH 4
59
60
G_DEFINE_QUARK (g_convert_error, g_convert_error)
61
62
static gboolean
63
try_conversion (const char *to_codeset,
64
    const char *from_codeset,
65
    iconv_t    *cd)
66
40.8k
{
67
40.8k
  *cd = iconv_open (to_codeset, from_codeset);
68
69
40.8k
  if (*cd == (iconv_t)-1 && errno == EINVAL)
70
0
    return FALSE;
71
72
#if defined(__FreeBSD__) && defined(ICONV_SET_ILSEQ_INVALID)
73
  /* On FreeBSD request GNU iconv compatible handling of characters that cannot
74
   * be represented in the destination character set.
75
   * See https://cgit.freebsd.org/src/commit/?id=7c5b23111c5fd1992047922d4247c4a1ce1bb6c3
76
   */
77
  int value = 1;
78
  if (iconvctl (*cd, ICONV_SET_ILSEQ_INVALID, &value) != 0)
79
    return FALSE;
80
#endif
81
40.8k
  return TRUE;
82
40.8k
}
83
84
static gboolean
85
try_to_aliases (const char **to_aliases,
86
    const char  *from_codeset,
87
    iconv_t     *cd)
88
0
{
89
0
  if (to_aliases)
90
0
    {
91
0
      const char **p = to_aliases;
92
0
      while (*p)
93
0
  {
94
0
    if (try_conversion (*p, from_codeset, cd))
95
0
      return TRUE;
96
97
0
    p++;
98
0
  }
99
0
    }
100
101
0
  return FALSE;
102
0
}
103
104
/**
105
 * g_iconv_open: (skip)
106
 * @to_codeset: destination codeset
107
 * @from_codeset: source codeset
108
 * 
109
 * Same as the standard UNIX routine iconv_open(), but
110
 * may be implemented via libiconv on UNIX flavors that lack
111
 * a native implementation.
112
 * 
113
 * GLib provides g_convert() and g_locale_to_utf8() which are likely
114
 * more convenient than the raw iconv wrappers.
115
 * 
116
 * Returns: a "conversion descriptor", or (GIConv)-1 if
117
 *  opening the converter failed.
118
 **/
119
GIConv
120
g_iconv_open (const gchar  *to_codeset,
121
        const gchar  *from_codeset)
122
40.8k
{
123
40.8k
  iconv_t cd;
124
  
125
40.8k
  if (!try_conversion (to_codeset, from_codeset, &cd))
126
0
    {
127
0
      const char **to_aliases = _g_charset_get_aliases (to_codeset);
128
0
      const char **from_aliases = _g_charset_get_aliases (from_codeset);
129
130
0
      if (from_aliases)
131
0
  {
132
0
    const char **p = from_aliases;
133
0
    while (*p)
134
0
      {
135
0
        if (try_conversion (to_codeset, *p, &cd))
136
0
    goto out;
137
138
0
        if (try_to_aliases (to_aliases, *p, &cd))
139
0
    goto out;
140
141
0
        p++;
142
0
      }
143
0
  }
144
145
0
      if (try_to_aliases (to_aliases, from_codeset, &cd))
146
0
  goto out;
147
0
    }
148
149
40.8k
 out:
150
40.8k
  return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
151
40.8k
}
152
153
/**
154
 * g_iconv: (skip)
155
 * @converter: conversion descriptor from g_iconv_open()
156
 * @inbuf: bytes to convert
157
 * @inbytes_left: (inout): inout parameter, bytes remaining to convert in @inbuf
158
 * @outbuf: converted output bytes
159
 * @outbytes_left: (inout): inout parameter, bytes available to fill in @outbuf
160
 * 
161
 * Same as the standard UNIX routine iconv(), but
162
 * may be implemented via libiconv on UNIX flavors that lack
163
 * a native implementation.
164
 *
165
 * GLib provides g_convert() and g_locale_to_utf8() which are likely
166
 * more convenient than the raw iconv wrappers.
167
 * 
168
 * Note that the behaviour of iconv() for characters which are valid in the
169
 * input character set, but which have no representation in the output character
170
 * set, is implementation defined. This function may return success (with a
171
 * positive number of non-reversible conversions as replacement characters were
172
 * used), or it may return -1 and set an error such as %EILSEQ, in such a
173
 * situation.
174
 *
175
 * See [`iconv(3posix)`](man:iconv(3posix)) and [`iconv(3)`](man:iconv(3)) for more details about behavior when an
176
 * error occurs.
177
 *
178
 * Returns: count of non-reversible conversions, or -1 on error
179
 **/
180
gsize 
181
g_iconv (GIConv   converter,
182
   gchar  **inbuf,
183
   gsize   *inbytes_left,
184
   gchar  **outbuf,
185
   gsize   *outbytes_left)
186
29.7k
{
187
29.7k
  iconv_t cd = (iconv_t)converter;
188
189
29.7k
  return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
190
29.7k
}
191
192
/**
193
 * g_iconv_close: (skip)
194
 * @converter: a conversion descriptor from g_iconv_open()
195
 *
196
 * Same as the standard UNIX routine iconv_close(), but
197
 * may be implemented via libiconv on UNIX flavors that lack
198
 * a native implementation. Should be called to clean up
199
 * the conversion descriptor from g_iconv_open() when
200
 * you are done converting things.
201
 *
202
 * GLib provides g_convert() and g_locale_to_utf8() which are likely
203
 * more convenient than the raw iconv wrappers.
204
 * 
205
 * Returns: -1 on error, 0 on success
206
 **/
207
gint
208
g_iconv_close (GIConv converter)
209
40.8k
{
210
40.8k
  iconv_t cd = (iconv_t)converter;
211
212
40.8k
  return iconv_close (cd);
213
40.8k
}
214
215
static GIConv
216
open_converter (const gchar *to_codeset,
217
    const gchar *from_codeset,
218
    GError     **error)
219
40.8k
{
220
40.8k
  GIConv cd;
221
222
40.8k
  cd = g_iconv_open (to_codeset, from_codeset);
223
224
40.8k
  if (cd == (GIConv) -1)
225
0
    {
226
      /* Something went wrong.  */
227
0
      if (error)
228
0
  {
229
0
    if (errno == EINVAL)
230
0
      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
231
0
       _("Conversion from character set “%s” to “%s” is not supported"),
232
0
       from_codeset, to_codeset);
233
0
    else
234
0
      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
235
0
       _("Could not open converter from “%s” to “%s”"),
236
0
       from_codeset, to_codeset);
237
0
  }
238
0
    }
239
  
240
40.8k
  return cd;
241
40.8k
}
242
243
static int
244
close_converter (GIConv cd)
245
40.8k
{
246
40.8k
  if (cd == (GIConv) -1)
247
0
    return 0;
248
  
249
40.8k
  return g_iconv_close (cd);  
250
40.8k
}
251
252
/**
253
 * g_convert_with_iconv: (skip)
254
 * @str:           (array length=len) (element-type guint8):
255
 *                 the string to convert.
256
 * @len:           the length of the string in bytes, or -1 if the string is
257
 *                 nul-terminated (Note that some encodings may allow nul
258
 *                 bytes to occur inside strings. In that case, using -1
259
 *                 for the @len parameter is unsafe)
260
 * @converter:     conversion descriptor from g_iconv_open()
261
 * @bytes_read:    (out) (optional): location to store the number of bytes in
262
 *                 the input string that were successfully converted, or %NULL.
263
 *                 Even if the conversion was successful, this may be 
264
 *                 less than @len if there were partial characters
265
 *                 at the end of the input. If the error
266
 *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
267
 *                 stored will be the byte offset after the last valid
268
 *                 input sequence.
269
 * @bytes_written: (out) (optional): the number of bytes stored in
270
 *                 the output buffer (not including the terminating nul).
271
 * @error:         location to store the error occurring, or %NULL to ignore
272
 *                 errors. Any of the errors in #GConvertError may occur.
273
 *
274
 * Converts a string from one character set to another. 
275
 * 
276
 * Note that you should use g_iconv() for streaming conversions. 
277
 * Despite the fact that @bytes_read can return information about partial
278
 * characters, the g_convert_... functions are not generally suitable
279
 * for streaming. If the underlying converter maintains internal state,
280
 * then this won't be preserved across successive calls to g_convert(),
281
 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
282
 * this is the GNU C converter for CP1255 which does not emit a base
283
 * character until it knows that the next character is not a mark that
284
 * could combine with the base character.)
285
 *
286
 * Characters which are valid in the input character set, but which have no
287
 * representation in the output character set will result in a
288
 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv()
289
 * specification, which leaves this behaviour implementation defined. Note that
290
 * this is the same error code as is returned for an invalid byte sequence in
291
 * the input character set. To get defined behaviour for conversion of
292
 * unrepresentable characters, use g_convert_with_fallback().
293
 *
294
 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
295
 *               If the conversion was successful, a newly allocated buffer
296
 *               containing the converted string, which must be freed with
297
 *               g_free(). Otherwise %NULL and @error will be set.
298
 **/
299
gchar*
300
g_convert_with_iconv (const gchar *str,
301
          gssize       len,
302
          GIConv       converter,
303
          gsize       *bytes_read, 
304
          gsize       *bytes_written, 
305
          GError     **error)
306
27.7k
{
307
27.7k
  gchar *dest;
308
27.7k
  gchar *outp;
309
27.7k
  const gchar *p;
310
27.7k
  gsize inbytes_remaining;
311
27.7k
  gsize outbytes_remaining;
312
27.7k
  gsize err;
313
27.7k
  gsize outbuf_size;
314
27.7k
  gboolean have_error = FALSE;
315
27.7k
  gboolean done = FALSE;
316
27.7k
  gboolean reset = FALSE;
317
  
318
27.7k
  g_return_val_if_fail (converter != (GIConv) -1, NULL);
319
     
320
27.7k
  if (len < 0)
321
2
    len = strlen (str);
322
323
27.7k
  p = str;
324
27.7k
  inbytes_remaining = len;
325
27.7k
  outbuf_size = len + NUL_TERMINATOR_LENGTH;
326
  
327
27.7k
  outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
328
27.7k
  outp = dest = g_malloc (outbuf_size);
329
330
57.5k
  while (!done && !have_error)
331
29.7k
    {
332
29.7k
      if (reset)
333
1.49k
        err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
334
28.2k
      else
335
28.2k
        err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
336
337
29.7k
      if (err == (gsize) -1)
338
26.7k
  {
339
26.7k
    switch (errno)
340
26.7k
      {
341
111
      case EINVAL:
342
        /* Incomplete text, do not report an error */
343
111
        done = TRUE;
344
111
        break;
345
524
      case E2BIG:
346
524
        {
347
524
    gsize used = outp - dest;
348
    
349
524
    outbuf_size *= 2;
350
524
    dest = g_realloc (dest, outbuf_size);
351
    
352
524
    outp = dest + used;
353
524
    outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
354
524
        }
355
524
        break;
356
26.1k
      case EILSEQ:
357
26.1k
              g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
358
26.1k
                                   _("Invalid byte sequence in conversion input"));
359
26.1k
        have_error = TRUE;
360
26.1k
        break;
361
0
      default:
362
0
              {
363
0
                int errsv = errno;
364
365
0
                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
366
0
                             _("Error during conversion: %s"),
367
0
                             g_strerror (errsv));
368
0
              }
369
0
        have_error = TRUE;
370
0
        break;
371
26.7k
      }
372
26.7k
  }
373
2.99k
      else if (err > 0)
374
0
        {
375
          /* @err gives the number of replacement characters used. */
376
0
          g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
377
0
                               _("Unrepresentable character in conversion input"));
378
0
          have_error = TRUE;
379
0
        }
380
2.99k
      else 
381
2.99k
  {
382
2.99k
    if (!reset)
383
1.49k
      {
384
        /* call g_iconv with NULL inbuf to cleanup shift state */
385
1.49k
        reset = TRUE;
386
1.49k
        inbytes_remaining = 0;
387
1.49k
      }
388
1.49k
    else
389
1.49k
      done = TRUE;
390
2.99k
  }
391
29.7k
    }
392
393
27.7k
  memset (outp, 0, NUL_TERMINATOR_LENGTH);
394
  
395
27.7k
  if (bytes_read)
396
27.7k
    *bytes_read = p - str;
397
2
  else
398
2
    {
399
2
      if ((p - str) != len) 
400
0
  {
401
0
          if (!have_error)
402
0
            {
403
0
              g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
404
0
                                   _("Partial character sequence at end of input"));
405
0
              have_error = TRUE;
406
0
            }
407
0
  }
408
2
    }
409
410
27.7k
  if (bytes_written)
411
13.0k
    *bytes_written = outp - dest; /* Doesn't include '\0' */
412
413
27.7k
  if (have_error)
414
26.1k
    {
415
26.1k
      g_free (dest);
416
26.1k
      return NULL;
417
26.1k
    }
418
1.60k
  else
419
1.60k
    return dest;
420
27.7k
}
421
422
/**
423
 * g_convert:
424
 * @str:           (array length=len) (element-type guint8):
425
 *                 the string to convert.
426
 * @len:           the length of the string in bytes, or -1 if the string is
427
 *                 nul-terminated (Note that some encodings may allow nul
428
 *                 bytes to occur inside strings. In that case, using -1
429
 *                 for the @len parameter is unsafe)
430
 * @to_codeset:    name of character set into which to convert @str
431
 * @from_codeset:  character set of @str.
432
 * @bytes_read:    (out) (optional): location to store the number of bytes in
433
 *                 the input string that were successfully converted, or %NULL.
434
 *                 Even if the conversion was successful, this may be 
435
 *                 less than @len if there were partial characters
436
 *                 at the end of the input. If the error
437
 *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
438
 *                 stored will be the byte offset after the last valid
439
 *                 input sequence.
440
 * @bytes_written: (out) (optional): the number of bytes stored in
441
 *                 the output buffer (not including the terminating nul).
442
 * @error:         location to store the error occurring, or %NULL to ignore
443
 *                 errors. Any of the errors in #GConvertError may occur.
444
 *
445
 * Converts a string from one character set to another.
446
 *
447
 * Note that you should use g_iconv() for streaming conversions. 
448
 * Despite the fact that @bytes_read can return information about partial
449
 * characters, the g_convert_... functions are not generally suitable
450
 * for streaming. If the underlying converter maintains internal state,
451
 * then this won't be preserved across successive calls to g_convert(),
452
 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
453
 * this is the GNU C converter for CP1255 which does not emit a base
454
 * character until it knows that the next character is not a mark that
455
 * could combine with the base character.)
456
 *
457
 * Using extensions such as "//TRANSLIT" may not work (or may not work
458
 * well) on many platforms.  Consider using g_str_to_ascii() instead.
459
 *
460
 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
461
 *          If the conversion was successful, a newly allocated buffer
462
 *          containing the converted string, which must be freed with g_free().
463
 *          Otherwise %NULL and @error will be set.
464
 **/
465
gchar*
466
g_convert (const gchar *str,
467
           gssize       len,  
468
           const gchar *to_codeset,
469
           const gchar *from_codeset,
470
           gsize       *bytes_read, 
471
     gsize       *bytes_written, 
472
     GError     **error)
473
27.7k
{
474
27.7k
  gchar *res;
475
27.7k
  GIConv cd;
476
477
27.7k
  g_return_val_if_fail (str != NULL, NULL);
478
27.7k
  g_return_val_if_fail (to_codeset != NULL, NULL);
479
27.7k
  g_return_val_if_fail (from_codeset != NULL, NULL);
480
  
481
27.7k
  cd = open_converter (to_codeset, from_codeset, error);
482
483
27.7k
  if (cd == (GIConv) -1)
484
0
    {
485
0
      if (bytes_read)
486
0
        *bytes_read = 0;
487
      
488
0
      if (bytes_written)
489
0
        *bytes_written = 0;
490
      
491
0
      return NULL;
492
0
    }
493
494
27.7k
  res = g_convert_with_iconv (str, len, cd,
495
27.7k
            bytes_read, bytes_written,
496
27.7k
            error);
497
498
27.7k
  close_converter (cd);
499
500
27.7k
  return res;
501
27.7k
}
502
503
/**
504
 * g_convert_with_fallback:
505
 * @str:          (array length=len) (element-type guint8):
506
 *                the string to convert.
507
 * @len:          the length of the string in bytes, or -1 if the string is
508
 *                 nul-terminated (Note that some encodings may allow nul
509
 *                 bytes to occur inside strings. In that case, using -1
510
 *                 for the @len parameter is unsafe)
511
 * @to_codeset:   name of character set into which to convert @str
512
 * @from_codeset: character set of @str.
513
 * @fallback:     UTF-8 string to use in place of characters not
514
 *                present in the target encoding. (The string must be
515
 *                representable in the target encoding). 
516
 *                If %NULL, characters not in the target encoding will 
517
 *                be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
518
 * @bytes_read:   (out) (optional): location to store the number of bytes in
519
 *                the input string that were successfully converted, or %NULL.
520
 *                Even if the conversion was successful, this may be 
521
 *                less than @len if there were partial characters
522
 *                at the end of the input.
523
 * @bytes_written: (out) (optional): the number of bytes stored in
524
 *                 the output buffer (not including the terminating nul).
525
 * @error:        location to store the error occurring, or %NULL to ignore
526
 *                errors. Any of the errors in #GConvertError may occur.
527
 *
528
 * Converts a string from one character set to another, possibly
529
 * including fallback sequences for characters not representable
530
 * in the output. Note that it is not guaranteed that the specification
531
 * for the fallback sequences in @fallback will be honored. Some
532
 * systems may do an approximate conversion from @from_codeset
533
 * to @to_codeset in their iconv() functions, 
534
 * in which case GLib will simply return that approximate conversion.
535
 *
536
 * Note that you should use g_iconv() for streaming conversions. 
537
 * Despite the fact that @bytes_read can return information about partial
538
 * characters, the g_convert_... functions are not generally suitable
539
 * for streaming. If the underlying converter maintains internal state,
540
 * then this won't be preserved across successive calls to g_convert(),
541
 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
542
 * this is the GNU C converter for CP1255 which does not emit a base
543
 * character until it knows that the next character is not a mark that
544
 * could combine with the base character.)
545
 *
546
 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
547
 *          If the conversion was successful, a newly allocated buffer
548
 *          containing the converted string, which must be freed with g_free().
549
 *          Otherwise %NULL and @error will be set.
550
 **/
551
gchar*
552
g_convert_with_fallback (const gchar *str,
553
       gssize       len,    
554
       const gchar *to_codeset,
555
       const gchar *from_codeset,
556
       const gchar *fallback,
557
       gsize       *bytes_read,
558
       gsize       *bytes_written,
559
       GError     **error)
560
14.6k
{
561
14.6k
  gchar *utf8;
562
14.6k
  gchar *dest;
563
14.6k
  gchar *outp;
564
14.6k
  const gchar *insert_str = NULL;
565
14.6k
  const gchar *p;
566
14.6k
  gsize inbytes_remaining;   
567
14.6k
  const gchar *save_p = NULL;
568
14.6k
  gsize save_inbytes = 0;
569
14.6k
  gsize outbytes_remaining; 
570
14.6k
  gsize err;
571
14.6k
  GIConv cd;
572
14.6k
  gsize outbuf_size;
573
14.6k
  gboolean have_error = FALSE;
574
14.6k
  gboolean done = FALSE;
575
576
14.6k
  GError *local_error = NULL;
577
  
578
14.6k
  g_return_val_if_fail (str != NULL, NULL);
579
14.6k
  g_return_val_if_fail (to_codeset != NULL, NULL);
580
14.6k
  g_return_val_if_fail (from_codeset != NULL, NULL);
581
     
582
14.6k
  if (len < 0)
583
0
    len = strlen (str);
584
  
585
  /* Try an exact conversion; we only proceed if this fails
586
   * due to an illegal sequence in the input string.
587
   */
588
14.6k
  dest = g_convert (str, len, to_codeset, from_codeset, 
589
14.6k
        bytes_read, bytes_written, &local_error);
590
14.6k
  if (!local_error)
591
1.60k
    return dest;
592
593
14.6k
  g_assert (dest == NULL);
594
595
13.0k
  if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
596
0
    {
597
0
      g_propagate_error (error, local_error);
598
0
      return NULL;
599
0
    }
600
13.0k
  else
601
13.0k
    g_error_free (local_error);
602
603
13.0k
  local_error = NULL;
604
  
605
  /* No go; to proceed, we need a converter from "UTF-8" to
606
   * to_codeset, and the string as UTF-8.
607
   */
608
13.0k
  cd = open_converter (to_codeset, "UTF-8", error);
609
13.0k
  if (cd == (GIConv) -1)
610
0
    {
611
0
      if (bytes_read)
612
0
        *bytes_read = 0;
613
      
614
0
      if (bytes_written)
615
0
        *bytes_written = 0;
616
      
617
0
      return NULL;
618
0
    }
619
620
13.0k
  utf8 = g_convert (str, len, "UTF-8", from_codeset, 
621
13.0k
        bytes_read, &inbytes_remaining, error);
622
13.0k
  if (!utf8)
623
13.0k
    {
624
13.0k
      close_converter (cd);
625
13.0k
      if (bytes_written)
626
0
        *bytes_written = 0;
627
13.0k
      return NULL;
628
13.0k
    }
629
630
  /* Now the heart of the code. We loop through the UTF-8 string, and
631
   * whenever we hit an offending character, we form fallback, convert
632
   * the fallback to the target codeset, and then go back to
633
   * converting the original string after finishing with the fallback.
634
   *
635
   * The variables save_p and save_inbytes store the input state
636
   * for the original string while we are converting the fallback
637
   */
638
0
  p = utf8;
639
640
0
  outbuf_size = len + NUL_TERMINATOR_LENGTH;
641
0
  outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
642
0
  outp = dest = g_malloc (outbuf_size);
643
644
0
  while (!done && !have_error)
645
0
    {
646
0
      gsize inbytes_tmp = inbytes_remaining;
647
0
      err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
648
0
      inbytes_remaining = inbytes_tmp;
649
650
0
      if (err == (gsize) -1)
651
0
  {
652
0
    switch (errno)
653
0
      {
654
0
      case EINVAL:
655
0
        g_assert_not_reached();
656
0
        break;
657
0
      case E2BIG:
658
0
        {
659
0
    gsize used = outp - dest;
660
661
0
    outbuf_size *= 2;
662
0
    dest = g_realloc (dest, outbuf_size);
663
    
664
0
    outp = dest + used;
665
0
    outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
666
    
667
0
    break;
668
0
        }
669
0
      case EILSEQ:
670
0
        if (save_p)
671
0
    {
672
      /* Error converting fallback string - fatal
673
       */
674
0
      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
675
0
             _("Cannot convert fallback “%s” to codeset “%s”"),
676
0
             insert_str, to_codeset);
677
0
      have_error = TRUE;
678
0
      break;
679
0
    }
680
0
        else if (p)
681
0
    {
682
0
      if (!fallback)
683
0
        { 
684
0
          gunichar ch = g_utf8_get_char (p);
685
0
          insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
686
0
                ch);
687
0
        }
688
0
      else
689
0
        insert_str = fallback;
690
      
691
0
      save_p = g_utf8_next_char (p);
692
0
      save_inbytes = inbytes_remaining - (save_p - p);
693
0
      p = insert_str;
694
0
      inbytes_remaining = strlen (p);
695
0
      break;
696
0
    }
697
              /* if p is null */
698
0
              G_GNUC_FALLTHROUGH;
699
0
      default:
700
0
              {
701
0
                int errsv = errno;
702
703
0
                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
704
0
                             _("Error during conversion: %s"),
705
0
                             g_strerror (errsv));
706
0
              }
707
708
0
        have_error = TRUE;
709
0
        break;
710
0
      }
711
0
  }
712
0
      else
713
0
  {
714
0
    if (save_p)
715
0
      {
716
0
        if (!fallback)
717
0
    g_free ((gchar *)insert_str);
718
0
        p = save_p;
719
0
        inbytes_remaining = save_inbytes;
720
0
        save_p = NULL;
721
0
      }
722
0
    else if (p)
723
0
      {
724
        /* call g_iconv with NULL inbuf to cleanup shift state */
725
0
        p = NULL;
726
0
        inbytes_remaining = 0;
727
0
      }
728
0
    else
729
0
      done = TRUE;
730
0
  }
731
0
    }
732
733
  /* Cleanup
734
   */
735
0
  memset (outp, 0, NUL_TERMINATOR_LENGTH);
736
  
737
0
  close_converter (cd);
738
739
0
  if (bytes_written)
740
0
    *bytes_written = outp - dest; /* Doesn't include '\0' */
741
742
0
  g_free (utf8);
743
744
0
  if (have_error)
745
0
    {
746
0
      if (save_p && !fallback)
747
0
  g_free ((gchar *)insert_str);
748
0
      g_free (dest);
749
0
      return NULL;
750
0
    }
751
0
  else
752
0
    return dest;
753
0
}
754
755
/*
756
 * g_locale_to_utf8
757
 *
758
 * 
759
 */
760
761
/*
762
 * Validate @string as UTF-8. @len can be negative if @string is
763
 * nul-terminated, or a non-negative value in bytes. If @string ends in an
764
 * incomplete sequence, or contains any illegal sequences or nul codepoints,
765
 * %NULL will be returned and the error set to
766
 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
767
 * On success, @bytes_read and @bytes_written, if provided, will be set to
768
 * the number of bytes in @string up to @len or the terminating nul byte.
769
 * On error, @bytes_read will be set to the byte offset after the last valid
770
 * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0.
771
 */
772
static gchar *
773
strdup_len (const gchar *string,
774
      gssize       len,
775
      gsize       *bytes_read,
776
      gsize       *bytes_written,
777
      GError     **error)
778
0
{
779
0
  gsize real_len;
780
0
  const gchar *end_valid;
781
782
0
  if (!g_utf8_validate (string, len, &end_valid))
783
0
    {
784
0
      if (bytes_read)
785
0
  *bytes_read = end_valid - string;
786
0
      if (bytes_written)
787
0
  *bytes_written = 0;
788
789
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
790
0
                           _("Invalid byte sequence in conversion input"));
791
0
      return NULL;
792
0
    }
793
794
0
  real_len = end_valid - string;
795
796
0
  if (bytes_read)
797
0
    *bytes_read = real_len;
798
0
  if (bytes_written)
799
0
    *bytes_written = real_len;
800
801
0
  return g_strndup (string, real_len);
802
0
}
803
804
typedef enum
805
{
806
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 1 << 0,
807
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 1 << 1
808
} ConvertCheckFlags;
809
810
/*
811
 * Convert from @string in the encoding identified by @from_codeset,
812
 * returning a string in the encoding identified by @to_codeset.
813
 * @len can be negative if @string is nul-terminated, or a non-negative
814
 * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags
815
 * to check the input, the output, or both, for embedded nul bytes.
816
 * On success, @bytes_read, if provided, will be set to the number of bytes
817
 * in @string up to @len or the terminating nul byte, and @bytes_written, if
818
 * provided, will be set to the number of output bytes written into the
819
 * returned buffer, excluding the terminating nul sequence.
820
 * On error, @bytes_read will be set to the byte offset after the last valid
821
 * sequence in @string, and @bytes_written will be set to 0.
822
 */
823
static gchar *
824
convert_checked (const gchar      *string,
825
                 gssize            len,
826
                 const gchar      *to_codeset,
827
                 const gchar      *from_codeset,
828
                 ConvertCheckFlags flags,
829
                 gsize            *bytes_read,
830
                 gsize            *bytes_written,
831
                 GError          **error)
832
2
{
833
2
  gchar *out;
834
2
  gsize outbytes;
835
836
2
  if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0)
837
0
    {
838
0
      const gchar *early_nul = memchr (string, '\0', len);
839
0
      if (early_nul != NULL)
840
0
        {
841
0
          if (bytes_read)
842
0
            *bytes_read = early_nul - string;
843
0
          if (bytes_written)
844
0
            *bytes_written = 0;
845
846
0
          g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
847
0
                               _("Embedded NUL byte in conversion input"));
848
0
          return NULL;
849
0
        }
850
0
    }
851
852
2
  out = g_convert (string, len, to_codeset, from_codeset,
853
2
                   bytes_read, &outbytes, error);
854
2
  if (out == NULL)
855
0
    {
856
0
      if (bytes_written)
857
0
        *bytes_written = 0;
858
0
      return NULL;
859
0
    }
860
861
2
  if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT)
862
2
      && memchr (out, '\0', outbytes) != NULL)
863
0
    {
864
0
      g_free (out);
865
0
      if (bytes_written)
866
0
        *bytes_written = 0;
867
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
868
0
                           _("Embedded NUL byte in conversion output"));
869
0
      return NULL;
870
0
    }
871
872
2
  if (bytes_written)
873
0
    *bytes_written = outbytes;
874
2
  return out;
875
2
}
876
877
/**
878
 * g_locale_to_utf8:
879
 * @opsysstring:   (array length=len) (element-type guint8): a string in the
880
 *                 encoding of the current locale. On Windows
881
 *                 this means the system codepage.
882
 * @len:           the length of the string, or -1 if the string is
883
 *                 nul-terminated (Note that some encodings may allow nul
884
 *                 bytes to occur inside strings. In that case, using -1
885
 *                 for the @len parameter is unsafe)
886
 * @bytes_read: (out) (optional): location to store the number of bytes in the
887
 *                 input string that were successfully converted, or %NULL.
888
 *                 Even if the conversion was successful, this may be 
889
 *                 less than @len if there were partial characters
890
 *                 at the end of the input. If the error
891
 *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
892
 *                 stored will be the byte offset after the last valid
893
 *                 input sequence.
894
 * @bytes_written: (out) (optional): the number of bytes stored in the output
895
 *                 buffer (not including the terminating nul).
896
 * @error:         location to store the error occurring, or %NULL to ignore
897
 *                 errors. Any of the errors in #GConvertError may occur.
898
 * 
899
 * Converts a string which is in the encoding used for strings by
900
 * the C runtime (usually the same as that used by the operating
901
 * system) in the [current locale](running.html#locale) into a UTF-8 string.
902
 *
903
 * If the source encoding is not UTF-8 and the conversion output contains a
904
 * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
905
 * function returns %NULL.
906
 * If the source encoding is UTF-8, an embedded nul character is treated with
907
 * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
908
 * earlier versions of this library. Use g_convert() to produce output that
909
 * may contain embedded nul characters.
910
 * 
911
 * Returns: (type utf8): The converted string, or %NULL on an error.
912
 **/
913
gchar *
914
g_locale_to_utf8 (const gchar  *opsysstring,
915
      gssize        len,            
916
      gsize        *bytes_read,    
917
      gsize        *bytes_written,
918
      GError      **error)
919
2
{
920
2
  const char *charset;
921
922
2
  if (g_get_charset (&charset))
923
0
    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
924
2
  else
925
2
    return convert_checked (opsysstring, len, "UTF-8", charset,
926
2
                            CONVERT_CHECK_NO_NULS_IN_OUTPUT,
927
2
                            bytes_read, bytes_written, error);
928
2
}
929
930
/*
931
 * Do the exact same as g_locale_to_utf8 except that the charset would
932
 * be retrieved from _g_get_time_charset (which uses LC_TIME)
933
 *
934
 * Returns: The converted string, or %NULL on an error.
935
 */
936
gchar *
937
_g_time_locale_to_utf8 (const gchar *opsysstring,
938
                        gssize       len,
939
                        gsize       *bytes_read,
940
                        gsize       *bytes_written,
941
                        GError     **error)
942
0
{
943
0
  const char *charset;
944
945
0
  if (_g_get_time_charset (&charset))
946
0
    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
947
0
  else
948
0
    return convert_checked (opsysstring, len, "UTF-8", charset,
949
0
                            CONVERT_CHECK_NO_NULS_IN_OUTPUT,
950
0
                            bytes_read, bytes_written, error);
951
0
}
952
953
/*
954
 * Do the exact same as g_locale_to_utf8 except that the charset would
955
 * be retrieved from _g_get_ctype_charset (which uses LC_CTYPE)
956
 *
957
 * Returns: The converted string, or %NULL on an error.
958
 */
959
gchar *
960
_g_ctype_locale_to_utf8 (const gchar *opsysstring,
961
                         gssize       len,
962
                         gsize       *bytes_read,
963
                         gsize       *bytes_written,
964
                         GError     **error)
965
0
{
966
0
  const char *charset;
967
968
0
  if (_g_get_ctype_charset (&charset))
969
0
    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
970
0
  else
971
0
    return convert_checked (opsysstring, len, "UTF-8", charset,
972
0
                            CONVERT_CHECK_NO_NULS_IN_OUTPUT,
973
0
                            bytes_read, bytes_written, error);
974
0
}
975
976
/**
977
 * g_locale_from_utf8:
978
 * @utf8string:    a UTF-8 encoded string 
979
 * @len:           the length of the string, or -1 if the string is
980
 *                 nul-terminated.
981
 * @bytes_read: (out) (optional): location to store the number of bytes in the
982
 *                 input string that were successfully converted, or %NULL.
983
 *                 Even if the conversion was successful, this may be 
984
 *                 less than @len if there were partial characters
985
 *                 at the end of the input. If the error
986
 *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
987
 *                 stored will be the byte offset after the last valid
988
 *                 input sequence.
989
 * @bytes_written: (out) (optional): the number of bytes stored in the output
990
 *                 buffer (not including the terminating nul).
991
 * @error:         location to store the error occurring, or %NULL to ignore
992
 *                 errors. Any of the errors in #GConvertError may occur.
993
 * 
994
 * Converts a string from UTF-8 to the encoding used for strings by
995
 * the C runtime (usually the same as that used by the operating
996
 * system) in the [current locale](running.html#locale).
997
 * On Windows this means the system codepage.
998
 *
999
 * The input string shall not contain nul characters even if the @len
1000
 * argument is positive. A nul character found inside the string will result
1001
 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
1002
 * input that may contain embedded nul characters.
1003
 *
1004
 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
1005
 *          A newly-allocated buffer containing the converted string,
1006
 *          or %NULL on an error, and error will be set.
1007
 **/
1008
gchar *
1009
g_locale_from_utf8 (const gchar *utf8string,
1010
        gssize       len,            
1011
        gsize       *bytes_read,    
1012
        gsize       *bytes_written,
1013
        GError     **error)
1014
0
{
1015
0
  const gchar *charset;
1016
1017
0
  if (g_get_charset (&charset))
1018
0
    return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1019
0
  else
1020
0
    return convert_checked (utf8string, len, charset, "UTF-8",
1021
0
                            CONVERT_CHECK_NO_NULS_IN_INPUT,
1022
0
                            bytes_read, bytes_written, error);
1023
0
}
1024
1025
#ifndef G_PLATFORM_WIN32
1026
1027
typedef struct _GFilenameCharsetCache GFilenameCharsetCache;
1028
1029
struct _GFilenameCharsetCache {
1030
  gboolean is_utf8;
1031
  gchar *charset;
1032
  gchar **filename_charsets;
1033
};
1034
1035
static void
1036
filename_charset_cache_free (gpointer data)
1037
0
{
1038
0
  GFilenameCharsetCache *cache = data;
1039
0
  g_free (cache->charset);
1040
0
  g_strfreev (cache->filename_charsets);
1041
0
  g_free (cache);
1042
0
}
1043
1044
/**
1045
 * g_get_filename_charsets:
1046
 * @filename_charsets: (out) (transfer none) (array zero-terminated=1):
1047
 *    return location for the %NULL-terminated list of encoding names
1048
 *
1049
 * Determines the preferred character sets used for filenames.
1050
 * The first character set from @filename_charsets is the filename encoding, the
1051
 * subsequent character sets are used when trying to generate a displayable
1052
 * representation of a filename, see g_filename_display_name().
1053
 *
1054
 * On Unix, the character sets are determined by consulting the
1055
 * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`.
1056
 * On Windows, the character set used in the GLib API is always UTF-8
1057
 * and said environment variables have no effect.
1058
 *
1059
 * `G_FILENAME_ENCODING` may be set to a comma-separated list of
1060
 * character set names. The special token `@locale` is taken to mean the
1061
 * character set for the [current locale](running.html#locale).
1062
 * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is,
1063
 * the character set of the current locale is taken as the filename
1064
 * encoding. If neither environment variable is set, UTF-8 is taken
1065
 * as the filename encoding, but the character set of the current locale
1066
 * is also put in the list of encodings.
1067
 *
1068
 * The returned @filename_charsets belong to GLib and must not be freed.
1069
 *
1070
 * Note that on Unix, regardless of the locale character set or
1071
 * `G_FILENAME_ENCODING` value, the actual file names present 
1072
 * on a system might be in any random encoding or just gibberish.
1073
 *
1074
 * Returns: %TRUE if the filename encoding is UTF-8.
1075
 * 
1076
 * Since: 2.6
1077
 */
1078
gboolean
1079
g_get_filename_charsets (const gchar ***filename_charsets)
1080
4
{
1081
4
  static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free);
1082
4
  GFilenameCharsetCache *cache = g_private_get (&cache_private);
1083
4
  const gchar *charset;
1084
1085
4
  if (!cache)
1086
2
    cache = g_private_set_alloc0 (&cache_private, sizeof (GFilenameCharsetCache));
1087
1088
4
  g_get_charset (&charset);
1089
1090
4
  if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1091
2
    {
1092
2
      const gchar *new_charset;
1093
2
      const gchar *p;
1094
2
      gint i;
1095
1096
2
      g_free (cache->charset);
1097
2
      g_strfreev (cache->filename_charsets);
1098
2
      cache->charset = g_strdup (charset);
1099
      
1100
2
      p = g_getenv ("G_FILENAME_ENCODING");
1101
2
      if (p != NULL && p[0] != '\0') 
1102
0
  {
1103
0
    cache->filename_charsets = g_strsplit (p, ",", 0);
1104
0
    cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1105
1106
0
    for (i = 0; cache->filename_charsets[i]; i++)
1107
0
      {
1108
0
        if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1109
0
    {
1110
0
      g_get_charset (&new_charset);
1111
0
      g_free (cache->filename_charsets[i]);
1112
0
      cache->filename_charsets[i] = g_strdup (new_charset);
1113
0
    }
1114
0
      }
1115
0
  }
1116
2
      else if (g_getenv ("G_BROKEN_FILENAMES") != NULL)
1117
0
  {
1118
0
    cache->filename_charsets = g_new0 (gchar *, 2);
1119
0
    cache->is_utf8 = g_get_charset (&new_charset);
1120
0
    cache->filename_charsets[0] = g_strdup (new_charset);
1121
0
  }
1122
2
      else 
1123
2
  {
1124
2
    cache->filename_charsets = g_new0 (gchar *, 3);
1125
2
    cache->is_utf8 = TRUE;
1126
2
    cache->filename_charsets[0] = g_strdup ("UTF-8");
1127
2
    if (!g_get_charset (&new_charset))
1128
2
      cache->filename_charsets[1] = g_strdup (new_charset);
1129
2
  }
1130
2
    }
1131
1132
4
  if (filename_charsets)
1133
4
    *filename_charsets = (const gchar **)cache->filename_charsets;
1134
1135
4
  return cache->is_utf8;
1136
4
}
1137
1138
#else /* G_PLATFORM_WIN32 */
1139
1140
gboolean
1141
g_get_filename_charsets (const gchar ***filename_charsets) 
1142
{
1143
  static const gchar *charsets[] = {
1144
    "UTF-8",
1145
    NULL
1146
  };
1147
1148
#ifdef G_OS_WIN32
1149
  /* On Windows GLib pretends that the filename charset is UTF-8 */
1150
  if (filename_charsets)
1151
    *filename_charsets = charsets;
1152
1153
  return TRUE;
1154
#else
1155
  gboolean result;
1156
1157
  /* Cygwin works like before */
1158
  result = g_get_charset (&(charsets[0]));
1159
1160
  if (filename_charsets)
1161
    *filename_charsets = charsets;
1162
1163
  return result;
1164
#endif
1165
}
1166
1167
#endif /* G_PLATFORM_WIN32 */
1168
1169
static gboolean
1170
get_filename_charset (const gchar **filename_charset)
1171
0
{
1172
0
  const gchar **charsets;
1173
0
  gboolean is_utf8;
1174
  
1175
0
  is_utf8 = g_get_filename_charsets (&charsets);
1176
1177
0
  if (filename_charset)
1178
0
    *filename_charset = charsets[0];
1179
  
1180
0
  return is_utf8;
1181
0
}
1182
1183
/**
1184
 * g_filename_to_utf8:
1185
 * @opsysstring: (type filename): a string in the encoding for filenames
1186
 * @len:           the length of the string, or -1 if the string is
1187
 *                 nul-terminated (Note that some encodings may allow nul
1188
 *                 bytes to occur inside strings. In that case, using -1
1189
 *                 for the @len parameter is unsafe)
1190
 * @bytes_read: (out) (optional): location to store the number of bytes in the
1191
 *                 input string that were successfully converted, or %NULL.
1192
 *                 Even if the conversion was successful, this may be 
1193
 *                 less than @len if there were partial characters
1194
 *                 at the end of the input. If the error
1195
 *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1196
 *                 stored will be the byte offset after the last valid
1197
 *                 input sequence.
1198
 * @bytes_written: (out) (optional): the number of bytes stored in the output
1199
 *                 buffer (not including the terminating nul).
1200
 * @error:         location to store the error occurring, or %NULL to ignore
1201
 *                 errors. Any of the errors in #GConvertError may occur.
1202
 * 
1203
 * Converts a string which is in the encoding used by GLib for
1204
 * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1205
 * for filenames; on other platforms, this function indirectly depends on 
1206
 * the [current locale](running.html#locale).
1207
 *
1208
 * The input string shall not contain nul characters even if the @len
1209
 * argument is positive. A nul character found inside the string will result
1210
 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
1211
 * If the source encoding is not UTF-8 and the conversion output contains a
1212
 * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
1213
 * function returns %NULL. Use g_convert() to produce output that
1214
 * may contain embedded nul characters.
1215
 * 
1216
 * Returns: (type utf8): The converted string, or %NULL on an error.
1217
 **/
1218
gchar*
1219
g_filename_to_utf8 (const gchar *opsysstring, 
1220
        gssize       len,           
1221
        gsize       *bytes_read,   
1222
        gsize       *bytes_written,
1223
        GError     **error)
1224
0
{
1225
0
  const gchar *charset;
1226
1227
0
  g_return_val_if_fail (opsysstring != NULL, NULL);
1228
1229
0
  if (get_filename_charset (&charset))
1230
0
    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1231
0
  else
1232
0
    return convert_checked (opsysstring, len, "UTF-8", charset,
1233
0
                            CONVERT_CHECK_NO_NULS_IN_INPUT |
1234
0
                            CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1235
0
                            bytes_read, bytes_written, error);
1236
0
}
1237
1238
/**
1239
 * g_filename_from_utf8:
1240
 * @utf8string:    (type utf8): a UTF-8 encoded string.
1241
 * @len:           the length of the string, or -1 if the string is
1242
 *                 nul-terminated.
1243
 * @bytes_read:    (out) (optional): location to store the number of bytes in
1244
 *                 the input string that were successfully converted, or %NULL.
1245
 *                 Even if the conversion was successful, this may be 
1246
 *                 less than @len if there were partial characters
1247
 *                 at the end of the input. If the error
1248
 *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1249
 *                 stored will be the byte offset after the last valid
1250
 *                 input sequence.
1251
 * @bytes_written: (out) (optional): the number of bytes stored in
1252
 *                 the output buffer (not including the terminating nul).
1253
 * @error:         location to store the error occurring, or %NULL to ignore
1254
 *                 errors. Any of the errors in #GConvertError may occur.
1255
 * 
1256
 * Converts a string from UTF-8 to the encoding GLib uses for
1257
 * filenames. Note that on Windows GLib uses UTF-8 for filenames;
1258
 * on other platforms, this function indirectly depends on the 
1259
 * [current locale](running.html#locale).
1260
 *
1261
 * The input string shall not contain nul characters even if the @len
1262
 * argument is positive. A nul character found inside the string will result
1263
 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is
1264
 * not UTF-8 and the conversion output contains a nul character, the error
1265
 * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL.
1266
 *
1267
 * Returns: (type filename):
1268
 *               The converted string, or %NULL on an error.
1269
 **/
1270
gchar*
1271
g_filename_from_utf8 (const gchar *utf8string,
1272
          gssize       len,            
1273
          gsize       *bytes_read,    
1274
          gsize       *bytes_written,
1275
          GError     **error)
1276
0
{
1277
0
  const gchar *charset;
1278
1279
0
  if (get_filename_charset (&charset))
1280
0
    return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1281
0
  else
1282
0
    return convert_checked (utf8string, len, charset, "UTF-8",
1283
0
                            CONVERT_CHECK_NO_NULS_IN_INPUT |
1284
0
                            CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1285
0
                            bytes_read, bytes_written, error);
1286
0
}
1287
1288
/* Test if haystack has the needle prefix, comparing case
1289
 * insensitive. haystack may be UTF-8, but needle must
1290
 * contain only ascii. */
1291
static gboolean
1292
has_case_prefix (const gchar *haystack, const gchar *needle)
1293
0
{
1294
0
  const gchar *h, *n;
1295
  
1296
  /* Eat one character at a time. */
1297
0
  h = haystack;
1298
0
  n = needle;
1299
1300
0
  while (*n && *h &&
1301
0
   g_ascii_tolower (*n) == g_ascii_tolower (*h))
1302
0
    {
1303
0
      n++;
1304
0
      h++;
1305
0
    }
1306
  
1307
0
  return *n == '\0';
1308
0
}
1309
1310
typedef enum {
1311
  UNSAFE_ALL        = 0x1,  /* Escape all unsafe characters   */
1312
  UNSAFE_ALLOW_PLUS = 0x2,  /* Allows '+'  */
1313
  UNSAFE_PATH       = 0x8,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
1314
  UNSAFE_HOST       = 0x10, /* Allows '/' and ':' and '@' */
1315
  UNSAFE_SLASHES    = 0x20  /* Allows all characters except for '/' and '%' */
1316
} UnsafeCharacterSet;
1317
1318
static const guchar acceptable[96] = {
1319
  /* A table of the ASCII chars from space (32) to DEL (127) */
1320
  /*      !    "    #    $    %    &    '    (    )    *    +    ,    -    .    / */ 
1321
  0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
1322
  /* 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ? */
1323
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
1324
  /* @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O */
1325
  0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1326
  /* P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _ */
1327
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
1328
  /* `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o */
1329
  0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1330
  /* p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL */
1331
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
1332
};
1333
1334
static const gchar hex[] = "0123456789ABCDEF";
1335
1336
/* Note: This escape function works on file: URIs, but if you want to
1337
 * escape something else, please read RFC-2396 */
1338
static gchar *
1339
g_escape_uri_string (const gchar         *string,
1340
                     UnsafeCharacterSet   mask,
1341
                     GError             **error)
1342
0
{
1343
0
#define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1344
1345
0
  const gchar *p;
1346
0
  gchar *q;
1347
0
  gchar *result;
1348
0
  int c;
1349
0
  size_t unacceptable;
1350
0
  UnsafeCharacterSet use_mask;
1351
  
1352
0
  g_return_val_if_fail (mask == UNSAFE_ALL
1353
0
      || mask == UNSAFE_ALLOW_PLUS
1354
0
      || mask == UNSAFE_PATH
1355
0
      || mask == UNSAFE_HOST
1356
0
      || mask == UNSAFE_SLASHES, NULL);
1357
  
1358
0
  unacceptable = 0;
1359
0
  use_mask = mask;
1360
0
  for (p = string; *p != '\0'; p++)
1361
0
    {
1362
0
      c = (guchar) *p;
1363
0
      if (!ACCEPTABLE (c)) 
1364
0
  unacceptable++;
1365
0
    }
1366
1367
0
  if (unacceptable >= (G_MAXSIZE - (p - string)) / 2)
1368
0
    {
1369
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1370
0
                           _("Invalid hostname"));
1371
0
      return NULL;
1372
0
    }
1373
1374
0
  result = g_malloc (p - string + unacceptable * 2 + 1);
1375
  
1376
0
  use_mask = mask;
1377
0
  for (q = result, p = string; *p != '\0'; p++)
1378
0
    {
1379
0
      c = (guchar) *p;
1380
      
1381
0
      if (!ACCEPTABLE (c))
1382
0
  {
1383
0
    *q++ = '%'; /* means hex coming */
1384
0
    *q++ = hex[c >> 4];
1385
0
    *q++ = hex[c & 15];
1386
0
  }
1387
0
      else
1388
0
  *q++ = *p;
1389
0
    }
1390
  
1391
0
  *q = '\0';
1392
  
1393
0
  return result;
1394
0
}
1395
1396
1397
static gchar *
1398
g_escape_file_uri (const gchar  *hostname,
1399
                   const gchar  *pathname,
1400
                   GError      **error)
1401
0
{
1402
0
  char *escaped_hostname = NULL;
1403
0
  char *escaped_path = NULL;
1404
0
  char *res = NULL;
1405
1406
#ifdef G_OS_WIN32
1407
  char *p, *backslash;
1408
1409
  /* Turn backslashes into forward slashes. That's what Netscape
1410
   * does, and they are actually more or less equivalent in Windows.
1411
   */
1412
  
1413
  pathname = g_strdup (pathname);
1414
  p = (char *) pathname;
1415
  
1416
  while ((backslash = strchr (p, '\\')) != NULL)
1417
    {
1418
      *backslash = '/';
1419
      p = backslash + 1;
1420
    }
1421
#endif
1422
1423
0
  if (hostname && *hostname != '\0')
1424
0
    {
1425
0
      escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST, error);
1426
0
      if (escaped_hostname == NULL)
1427
0
        goto out;
1428
0
    }
1429
1430
0
  escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH, error);
1431
0
  if (escaped_path == NULL)
1432
0
    goto out;
1433
1434
0
  res = g_strconcat ("file://",
1435
0
         (escaped_hostname) ? escaped_hostname : "",
1436
0
         (*escaped_path != '/') ? "/" : "",
1437
0
         escaped_path,
1438
0
         NULL);
1439
1440
0
out:
1441
#ifdef G_OS_WIN32
1442
  g_free ((char *) pathname);
1443
#endif
1444
1445
0
  g_free (escaped_hostname);
1446
0
  g_free (escaped_path);
1447
  
1448
0
  return res;
1449
0
}
1450
1451
/* Decode the two hex digits at scanner[0] and scanner[1] into a byte value.
 * Returns -1 if either character is not a hex digit.
 */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  /* The second character is only examined once the first is valid */
  low_nibble = g_ascii_xdigit_value (scanner[1]);

  return (low_nibble < 0) ? -1 : ((high_nibble << 4) | low_nibble);
}
1467
1468
static gchar *
1469
g_unescape_uri_string (const char *escaped,
1470
           int         len,
1471
           const char *illegal_escaped_characters,
1472
           gboolean    ascii_must_not_be_escaped)
1473
0
{
1474
0
  const gchar *in, *in_end;
1475
0
  gchar *out, *result;
1476
0
  int c;
1477
  
1478
0
  if (escaped == NULL)
1479
0
    return NULL;
1480
1481
0
  if (len < 0)
1482
0
    len = strlen (escaped);
1483
1484
0
  result = g_malloc (len + 1);
1485
  
1486
0
  out = result;
1487
0
  for (in = escaped, in_end = escaped + len; in < in_end; in++)
1488
0
    {
1489
0
      c = *in;
1490
1491
0
      if (c == '%')
1492
0
  {
1493
    /* catch partial escape sequences past the end of the substring */
1494
0
    if (in + 3 > in_end)
1495
0
      break;
1496
1497
0
    c = unescape_character (in + 1);
1498
1499
    /* catch bad escape sequences and NUL characters */
1500
0
    if (c <= 0)
1501
0
      break;
1502
1503
    /* catch escaped ASCII */
1504
0
    if (ascii_must_not_be_escaped && c <= 0x7F)
1505
0
      break;
1506
1507
    /* catch other illegal escaped characters */
1508
0
    if (strchr (illegal_escaped_characters, c) != NULL)
1509
0
      break;
1510
1511
0
    in += 2;
1512
0
  }
1513
1514
0
      *out++ = c;
1515
0
    }
1516
  
1517
0
  g_assert (out - result <= len);
1518
0
  *out = '\0';
1519
1520
0
  if (in != in_end)
1521
0
    {
1522
0
      g_free (result);
1523
0
      return NULL;
1524
0
    }
1525
1526
0
  return result;
1527
0
}
1528
1529
static gboolean
1530
is_asciialphanum (gunichar c)
1531
0
{
1532
0
  return c <= 0x7F && g_ascii_isalnum (c);
1533
0
}
1534
1535
static gboolean
1536
is_asciialpha (gunichar c)
1537
0
{
1538
0
  return c <= 0x7F && g_ascii_isalpha (c);
1539
0
}
1540
1541
/* allows an empty string */
1542
static gboolean
1543
hostname_validate (const char *hostname)
1544
0
{
1545
0
  const char *p;
1546
0
  gunichar c, first_char, last_char;
1547
1548
0
  p = hostname;
1549
0
  if (*p == '\0')
1550
0
    return TRUE;
1551
0
  do
1552
0
    {
1553
      /* read in a label */
1554
0
      c = g_utf8_get_char (p);
1555
0
      p = g_utf8_next_char (p);
1556
0
      if (!is_asciialphanum (c))
1557
0
  return FALSE;
1558
0
      first_char = c;
1559
0
      do
1560
0
  {
1561
0
    last_char = c;
1562
0
    c = g_utf8_get_char (p);
1563
0
    p = g_utf8_next_char (p);
1564
0
  }
1565
0
      while (is_asciialphanum (c) || c == '-');
1566
0
      if (last_char == '-')
1567
0
  return FALSE;
1568
      
1569
      /* if that was the last label, check that it was a toplabel */
1570
0
      if (c == '\0' || (c == '.' && *p == '\0'))
1571
0
  return is_asciialpha (first_char);
1572
0
    }
1573
0
  while (c == '.');
1574
0
  return FALSE;
1575
0
}
1576
1577
/**
1578
 * g_filename_from_uri:
1579
 * @uri: a uri describing a filename (escaped, encoded in ASCII).
1580
 * @hostname: (out) (optional) (nullable): Location to store hostname for the URI.
1581
 *            If there is no hostname in the URI, %NULL will be
1582
 *            stored in this location.
1583
 * @error: location to store the error occurring, or %NULL to ignore
1584
 *         errors. Any of the errors in #GConvertError may occur.
1585
 * 
1586
 * Converts an escaped ASCII-encoded URI to a local filename in the
1587
 * encoding used for filenames.
1588
 *
1589
 * Since GLib 2.78, the query string and fragment can be present in the URI,
1590
 * but are not part of the resulting filename.
1591
 * We take inspiration from https://url.spec.whatwg.org/#file-state,
1592
 * but we don't support the entire standard.
1593
 * 
1594
 * Returns: (type filename): a newly-allocated string holding
1595
 *               the resulting filename, or %NULL on an error.
1596
 **/
1597
gchar *
1598
g_filename_from_uri (const gchar *uri,
1599
         gchar      **hostname,
1600
         GError     **error)
1601
0
{
1602
0
  const char *past_scheme;
1603
0
  const char *host_part;
1604
0
  char *unescaped_hostname;
1605
0
  char *result;
1606
0
  char *filename;
1607
0
  char *past_path;
1608
0
  char *temp_uri;
1609
0
  int offs;
1610
#ifdef G_OS_WIN32
1611
  char *p, *slash;
1612
#endif
1613
1614
0
  if (hostname)
1615
0
    *hostname = NULL;
1616
1617
0
  if (!has_case_prefix (uri, "file:/"))
1618
0
    {
1619
0
      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1620
0
       _("The URI “%s” is not an absolute URI using the “file” scheme"),
1621
0
       uri);
1622
0
      return NULL;
1623
0
    }
1624
1625
0
  temp_uri = g_strdup (uri);
1626
1627
0
  past_scheme = temp_uri + strlen ("file:");
1628
  
1629
0
  past_path = strchr (past_scheme, '?');
1630
0
  if (past_path != NULL)
1631
0
    *past_path = '\0';
1632
1633
0
  past_path = strchr (past_scheme, '#');
1634
0
  if (past_path != NULL)
1635
0
    *past_path = '\0';
1636
1637
0
  if (has_case_prefix (past_scheme, "///"))
1638
0
    past_scheme += 2;
1639
0
  else if (has_case_prefix (past_scheme, "//"))
1640
0
    {
1641
0
      past_scheme += 2;
1642
0
      host_part = past_scheme;
1643
1644
0
      past_scheme = strchr (past_scheme, '/');
1645
1646
0
      if (past_scheme == NULL)
1647
0
  {
1648
0
          g_free (temp_uri);
1649
0
    g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1650
0
           _("The URI “%s” is invalid"),
1651
0
           uri);
1652
0
    return NULL;
1653
0
  }
1654
1655
0
      unescaped_hostname = g_unescape_uri_string (host_part, past_scheme - host_part, "", TRUE);
1656
1657
0
      if (unescaped_hostname == NULL ||
1658
0
    !hostname_validate (unescaped_hostname))
1659
0
  {
1660
0
    g_free (unescaped_hostname);
1661
0
          g_free (temp_uri);
1662
0
    g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1663
0
           _("The hostname of the URI “%s” is invalid"),
1664
0
           uri);
1665
0
    return NULL;
1666
0
  }
1667
      
1668
0
      if (hostname)
1669
0
  *hostname = unescaped_hostname;
1670
0
      else
1671
0
  g_free (unescaped_hostname);
1672
0
    }
1673
1674
0
  filename = g_unescape_uri_string (past_scheme, -1, "/", FALSE);
1675
1676
0
  if (filename == NULL)
1677
0
    {
1678
0
      g_free (temp_uri);
1679
0
      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1680
0
       _("The URI “%s” contains invalidly escaped characters"),
1681
0
       uri);
1682
0
      return NULL;
1683
0
    }
1684
1685
0
  offs = 0;
1686
#ifdef G_OS_WIN32
1687
  /* Drop localhost */
1688
  if (hostname && *hostname != NULL &&
1689
      g_ascii_strcasecmp (*hostname, "localhost") == 0)
1690
    {
1691
      g_free (*hostname);
1692
      *hostname = NULL;
1693
    }
1694
1695
  /* Turn slashes into backslashes, because that's the canonical spelling */
1696
  p = filename;
1697
  while ((slash = strchr (p, '/')) != NULL)
1698
    {
1699
      *slash = '\\';
1700
      p = slash + 1;
1701
    }
1702
1703
  /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1704
   * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1705
   * the filename from the drive letter.
1706
   */
1707
  if (g_ascii_isalpha (filename[1]))
1708
    {
1709
      if (filename[2] == ':')
1710
  offs = 1;
1711
      else if (filename[2] == '|')
1712
  {
1713
    filename[2] = ':';
1714
    offs = 1;
1715
  }
1716
    }
1717
#endif
1718
1719
0
  result = g_strdup (filename + offs);
1720
0
  g_free (filename);
1721
1722
0
  g_free (temp_uri);
1723
1724
0
  return result;
1725
0
}
1726
1727
/**
1728
 * g_filename_to_uri:
1729
 * @filename: (type filename): an absolute filename specified in the GLib file
1730
 *     name encoding, which is the on-disk file name bytes on Unix, and UTF-8
1731
 *     on Windows
1732
 * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none.
1733
 * @error: location to store the error occurring, or %NULL to ignore
1734
 *         errors. Any of the errors in #GConvertError may occur.
1735
 * 
1736
 * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1737
 * component following Section 3.3. of RFC 2396.
1738
 * 
1739
 * Returns: a newly-allocated string holding the resulting
1740
 *               URI, or %NULL on an error.
1741
 **/
1742
gchar *
1743
g_filename_to_uri (const gchar *filename,
1744
       const gchar *hostname,
1745
       GError     **error)
1746
0
{
1747
0
  char *escaped_uri;
1748
1749
0
  g_return_val_if_fail (filename != NULL, NULL);
1750
1751
0
  if (!g_path_is_absolute (filename))
1752
0
    {
1753
0
      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1754
0
       _("The pathname “%s” is not an absolute path"),
1755
0
       filename);
1756
0
      return NULL;
1757
0
    }
1758
1759
0
  if (hostname &&
1760
0
      !(g_utf8_validate (hostname, -1, NULL)
1761
0
  && hostname_validate (hostname)))
1762
0
    {
1763
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1764
0
                           _("Invalid hostname"));
1765
0
      return NULL;
1766
0
    }
1767
  
1768
#ifdef G_OS_WIN32
1769
  /* Don't use localhost unnecessarily */
1770
  if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
1771
    hostname = NULL;
1772
#endif
1773
1774
0
  escaped_uri = g_escape_file_uri (hostname, filename, error);
1775
1776
0
  return escaped_uri;
1777
0
}
1778
1779
/**
1780
 * g_uri_list_extract_uris:
1781
 * @uri_list: an URI list 
1782
 *
1783
 * Splits an URI list conforming to the text/uri-list
1784
 * mime type defined in RFC 2483 into individual URIs,
1785
 * discarding any comments. The URIs are not validated.
1786
 *
1787
 * Returns: (transfer full): a newly allocated %NULL-terminated list
1788
 *   of strings holding the individual URIs. The array should be freed
1789
 *   with g_strfreev().
1790
 *
1791
 * Since: 2.6
1792
 */
1793
gchar **
1794
g_uri_list_extract_uris (const gchar *uri_list)
1795
0
{
1796
0
  GPtrArray *uris;
1797
0
  const gchar *p, *q;
1798
1799
0
  uris = g_ptr_array_new ();
1800
1801
0
  p = uri_list;
1802
1803
  /* We don't actually try to validate the URI according to RFC
1804
   * 2396, or even check for allowed characters - we just ignore
1805
   * comments and trim whitespace off the ends.  We also
1806
   * allow LF delimination as well as the specified CRLF.
1807
   *
1808
   * We do allow comments like specified in RFC 2483.
1809
   */
1810
0
  while (p)
1811
0
    {
1812
0
      if (*p != '#')
1813
0
  {
1814
0
    while (g_ascii_isspace (*p))
1815
0
      p++;
1816
1817
0
    q = p;
1818
0
    while (*q && (*q != '\n') && (*q != '\r'))
1819
0
      q++;
1820
1821
0
    if (q > p)
1822
0
      {
1823
0
        q--;
1824
0
        while (q > p && g_ascii_isspace (*q))
1825
0
    q--;
1826
1827
0
        if (q > p)
1828
0
                g_ptr_array_add (uris, g_strndup (p, q - p + 1));
1829
0
            }
1830
0
        }
1831
0
      p = strchr (p, '\n');
1832
0
      if (p)
1833
0
  p++;
1834
0
    }
1835
1836
0
  g_ptr_array_add (uris, NULL);
1837
1838
0
  return (gchar **) g_ptr_array_free (uris, FALSE);
1839
0
}
1840
1841
/**
1842
 * g_filename_display_basename:
1843
 * @filename: (type filename): an absolute pathname in the
1844
 *     GLib file name encoding
1845
 *
1846
 * Returns the display basename for the particular filename, guaranteed
1847
 * to be valid UTF-8. The display name might not be identical to the filename,
1848
 * for instance there might be problems converting it to UTF-8, and some files
1849
 * can be translated in the display.
1850
 *
1851
 * If GLib cannot make sense of the encoding of @filename, as a last resort it 
1852
 * replaces unknown characters with U+FFFD, the Unicode replacement character.
1853
 * You can search the result for the UTF-8 encoding of this character (which is
1854
 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1855
 * encoding.
1856
 *
1857
 * You must pass the whole absolute pathname to this functions so that
1858
 * translation of well known locations can be done.
1859
 *
1860
 * This function is preferred over g_filename_display_name() if you know the
1861
 * whole path, as it allows translation.
1862
 *
1863
 * Returns: a newly allocated string containing
1864
 *   a rendition of the basename of the filename in valid UTF-8
1865
 *
1866
 * Since: 2.6
1867
 **/
1868
gchar *
1869
g_filename_display_basename (const gchar *filename)
1870
0
{
1871
0
  char *basename;
1872
0
  char *display_name;
1873
1874
0
  g_return_val_if_fail (filename != NULL, NULL);
1875
  
1876
0
  basename = g_path_get_basename (filename);
1877
0
  display_name = g_filename_display_name (basename);
1878
0
  g_free (basename);
1879
0
  return display_name;
1880
0
}
1881
1882
/**
1883
 * g_filename_display_name:
1884
 * @filename: (type filename): a pathname hopefully in the
1885
 *     GLib file name encoding
1886
 * 
1887
 * Converts a filename into a valid UTF-8 string. The conversion is 
1888
 * not necessarily reversible, so you should keep the original around 
1889
 * and use the return value of this function only for display purposes.
1890
 * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL 
1891
 * even if the filename actually isn't in the GLib file name encoding.
1892
 *
1893
 * If GLib cannot make sense of the encoding of @filename, as a last resort it 
1894
 * replaces unknown characters with U+FFFD, the Unicode replacement character.
1895
 * You can search the result for the UTF-8 encoding of this character (which is
1896
 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1897
 * encoding.
1898
 *
1899
 * If you know the whole pathname of the file you should use
1900
 * g_filename_display_basename(), since that allows location-based
1901
 * translation of filenames.
1902
 *
1903
 * Returns: a newly allocated string containing
1904
 *   a rendition of the filename in valid UTF-8
1905
 *
1906
 * Since: 2.6
1907
 **/
1908
gchar *
1909
g_filename_display_name (const gchar *filename)
1910
4
{
1911
4
  gint i;
1912
4
  const gchar **charsets;
1913
4
  gchar *display_name = NULL;
1914
4
  gboolean is_utf8;
1915
 
1916
4
  is_utf8 = g_get_filename_charsets (&charsets);
1917
1918
4
  if (is_utf8)
1919
4
    {
1920
4
      if (g_utf8_validate (filename, -1, NULL))
1921
4
  display_name = g_strdup (filename);
1922
4
    }
1923
  
1924
4
  if (!display_name)
1925
0
    {
1926
      /* Try to convert from the filename charsets to UTF-8.
1927
       * Skip the first charset if it is UTF-8.
1928
       */
1929
0
      for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
1930
0
  {
1931
0
    display_name = g_convert (filename, -1, "UTF-8", charsets[i], 
1932
0
            NULL, NULL, NULL);
1933
1934
0
    if (display_name)
1935
0
      break;
1936
0
  }
1937
0
    }
1938
  
1939
  /* if all conversions failed, we replace invalid UTF-8
1940
   * by a question mark
1941
   */
1942
4
  if (!display_name) 
1943
0
    display_name = g_utf8_make_valid (filename, -1);
1944
1945
4
  return display_name;
1946
4
}
1947
1948
#ifdef G_OS_WIN32

/* Binary compatibility versions. Not for newly compiled code.
 *
 * Each *_utf8 entry point below simply forwards its arguments, unchanged,
 * to the function of the same name without the suffix and hands back
 * whatever that function returned.
 */

_GLIB_EXTERN gchar *g_filename_to_utf8_utf8   (const gchar  *opsysstring,
                                               gssize        len,
                                               gsize        *bytes_read,
                                               gsize        *bytes_written,
                                               GError      **error) G_GNUC_MALLOC;

gchar *
g_filename_to_utf8_utf8 (const gchar *opsysstring,
                         gssize       len,
                         gsize       *bytes_read,
                         gsize       *bytes_written,
                         GError     **error)
{
  gchar *converted;

  converted = g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error);

  return converted;
}

_GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar  *utf8string,
                                               gssize        len,
                                               gsize        *bytes_read,
                                               gsize        *bytes_written,
                                               GError      **error) G_GNUC_MALLOC;

gchar *
g_filename_from_utf8_utf8 (const gchar *utf8string,
                           gssize       len,
                           gsize       *bytes_read,
                           gsize       *bytes_written,
                           GError     **error)
{
  gchar *converted;

  converted = g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error);

  return converted;
}

_GLIB_EXTERN gchar *g_filename_from_uri_utf8  (const gchar  *uri,
                                               gchar       **hostname,
                                               GError      **error) G_GNUC_MALLOC;

gchar *
g_filename_from_uri_utf8 (const gchar *uri,
                          gchar      **hostname,
                          GError     **error)
{
  gchar *path;

  path = g_filename_from_uri (uri, hostname, error);

  return path;
}

_GLIB_EXTERN gchar *g_filename_to_uri_utf8    (const gchar  *filename,
                                               const gchar  *hostname,
                                               GError      **error) G_GNUC_MALLOC;

gchar *
g_filename_to_uri_utf8 (const gchar *filename,
                        const gchar *hostname,
                        GError     **error)
{
  gchar *file_uri;

  file_uri = g_filename_to_uri (filename, hostname, error);

  return file_uri;
}

#endif /* G_OS_WIN32 */