Coverage Report

Created: 2025-07-23 06:42

/src/irssi/subprojects/glib-2.74.3/glib/gutf8.c
Line
Count
Source (jump to first uncovered line)
1
/* gutf8.c - Operations on UTF-8 strings.
2
 *
3
 * Copyright (C) 1999 Tom Tromey
4
 * Copyright (C) 2000 Red Hat, Inc.
5
 *
6
 * SPDX-License-Identifier: LGPL-2.1-or-later
7
 *
8
 * This library is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * This library is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20
 */
21
22
#include "config.h"
23
24
#include <stdlib.h>
25
#ifdef HAVE_CODESET
26
#include <langinfo.h>
27
#endif
28
#include <string.h>
29
30
#ifdef G_PLATFORM_WIN32
31
#include <stdio.h>
32
#define STRICT
33
#include <windows.h>
34
#undef STRICT
35
#endif
36
37
#include "gconvert.h"
38
#include "ghash.h"
39
#include "gstrfuncs.h"
40
#include "gtestutils.h"
41
#include "gtypes.h"
42
#include "gthread.h"
43
#include "glibintl.h"
44
45
/* Classify the UTF-8 lead byte Char.
 *
 * Sets Len to the length (1-6) of the sequence the byte announces and Mask
 * to the bits of the lead byte that carry payload.  When Char cannot start
 * a sequence, Len is set to -1 and Mask is left untouched.
 *
 * Every parameter is parenthesized so that expression arguments are not
 * re-associated by operator precedence.  The expansion is still a bare
 * if/else chain that ends in a complete statement, so the macro has to be
 * used as a statement of its own (not as the body of an unbraced if/else).
 * Char is evaluated several times.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                                         \
  if ((Char) < 128)                                                           \
    {                                                                         \
      (Len) = 1;                                                              \
      (Mask) = 0x7f;                                                          \
    }                                                                         \
  else if (((Char) & 0xe0) == 0xc0)                                           \
    {                                                                         \
      (Len) = 2;                                                              \
      (Mask) = 0x1f;                                                          \
    }                                                                         \
  else if (((Char) & 0xf0) == 0xe0)                                           \
    {                                                                         \
      (Len) = 3;                                                              \
      (Mask) = 0x0f;                                                          \
    }                                                                         \
  else if (((Char) & 0xf8) == 0xf0)                                           \
    {                                                                         \
      (Len) = 4;                                                              \
      (Mask) = 0x07;                                                          \
    }                                                                         \
  else if (((Char) & 0xfc) == 0xf8)                                           \
    {                                                                         \
      (Len) = 5;                                                              \
      (Mask) = 0x03;                                                          \
    }                                                                         \
  else if (((Char) & 0xfe) == 0xfc)                                           \
    {                                                                         \
      (Len) = 6;                                                              \
      (Mask) = 0x01;                                                          \
    }                                                                         \
  else                                                                        \
    (Len) = -1;
78
79
/* Number of bytes (1-6) used to encode the code point Char in the extended
 * UTF-8 scheme handled by this file.  Char is evaluated more than once. */
#define UTF8_LENGTH(Char)                    \
  ((Char) < 0x80      ? 1 :                  \
   (Char) < 0x800     ? 2 :                  \
   (Char) < 0x10000   ? 3 :                  \
   (Char) < 0x200000  ? 4 :                  \
   (Char) < 0x4000000 ? 5 : 6)
85
   
86
87
/* Decode the Len-byte sequence at Chars into Result.
 *
 * Mask selects the payload bits of the lead byte; Count is a caller-supplied
 * loop variable.  If a byte that is not of the form 10xxxxxx is met, Result
 * is set to -1 and decoding stops with Count indexing the offending byte.
 * Expands to two statements, so it must be used as a statement of its own.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  (Result) = (Chars)[0] & (Mask);                                             \
  for ((Count) = 1; (Count) < (Len); ++(Count))                               \
    {                                                                         \
      if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
        {                                                                     \
          (Result) = -1;                                                      \
          break;                                                              \
        }                                                                     \
      (Result) = ((Result) << 6) | ((Chars)[(Count)] & 0x3f);                 \
    }
99
    
100
/*
101
 * Check whether a Unicode (5.2) char is in a valid range.
102
 *
103
 * The first check comes from the Unicode guarantee to never encode
104
 * a point above 0x0010ffff, since UTF-16 couldn't represent it.
105
 * 
106
 * The second check covers surrogate pairs (category Cs).
107
 *
108
 * @param Char the character
109
 */
110
/* Non-zero when Char is at most U+10FFFF and is not one of the surrogate
 * code points U+D800..U+DFFF.  Char is evaluated up to twice. */
#define UNICODE_VALID(Char)                \
  ((Char) < 0x110000 &&                    \
   (((Char) & 0xFFFFF800) != 0xD800))
113
114
    
115
/* Number of bytes to advance for a character whose first byte is the index.
 * Bytes that cannot start a sequence advance by one. */
static const gchar utf8_skip_data[256] = {
  /* 0x00 - 0xbf */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xc0 - 0xdf */
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xe0 - 0xef */
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  /* 0xf0 - 0xf7 */
  4,4,4,4,4,4,4,4,
  /* 0xf8 - 0xfb */
  5,5,5,5,
  /* 0xfc - 0xfd */
  6,6,
  /* 0xfe - 0xff */
  1,1
};

/* Exported view of the skip table. */
const gchar * const g_utf8_skip = utf8_skip_data;
127
128
/**
129
 * g_utf8_find_prev_char:
130
 * @str: pointer to the beginning of a UTF-8 encoded string
131
 * @p: pointer to some position within @str
132
 * 
133
 * Given a position @p within a UTF-8 encoded string @str, find the start
134
 * of the previous UTF-8 character starting before @p. Returns %NULL if no
135
 * UTF-8 characters are present in @str before @p.
136
 *
137
 * @p does not have to be at the beginning of a UTF-8 character. No check
138
 * is made to see if the character found is actually valid other than
139
 * it starts with an appropriate byte.
140
 *
141
 * Returns: (transfer none) (nullable): a pointer to the found character or %NULL.
142
 */
143
gchar *
144
g_utf8_find_prev_char (const gchar *str,
145
           const gchar *p)
146
0
{
147
0
  while (p > str)
148
0
    {
149
0
      --p;
150
0
      if ((*p & 0xc0) != 0x80)
151
0
  return (gchar *)p;
152
0
    }
153
0
  return NULL;
154
0
}
155
156
/**
157
 * g_utf8_find_next_char:
158
 * @p: a pointer to a position within a UTF-8 encoded string
159
 * @end: (nullable): a pointer to the byte following the end of the string,
160
 *     or %NULL to indicate that the string is nul-terminated
161
 *
162
 * Finds the start of the next UTF-8 character in the string after @p.
163
 *
164
 * @p does not have to be at the beginning of a UTF-8 character. No check
165
 * is made to see if the character found is actually valid other than
166
 * it starts with an appropriate byte.
167
 * 
168
 * If @end is %NULL, the return value will never be %NULL: if the end of the
169
 * string is reached, a pointer to the terminating nul byte is returned. If
170
 * @end is non-%NULL, the return value will be %NULL if the end of the string
171
 * is reached.
172
 *
173
 * Returns: (transfer none) (nullable): a pointer to the found character or %NULL if @end is
174
 *    set and is reached
175
 */
176
gchar *
177
g_utf8_find_next_char (const gchar *p,
178
           const gchar *end)
179
0
{
180
0
  if (end)
181
0
    {
182
0
      for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
183
0
        ;
184
0
      return (p >= end) ? NULL : (gchar *)p;
185
0
    }
186
0
  else
187
0
    {
188
0
      for (++p; (*p & 0xc0) == 0x80; ++p)
189
0
        ;
190
0
      return (gchar *)p;
191
0
    }
192
0
}
193
194
/**
195
 * g_utf8_prev_char:
196
 * @p: a pointer to a position within a UTF-8 encoded string
197
 *
198
 * Finds the previous UTF-8 character in the string before @p.
199
 *
200
 * @p does not have to be at the beginning of a UTF-8 character. No check
201
 * is made to see if the character found is actually valid other than
202
 * it starts with an appropriate byte. If @p might be the first
203
 * character of the string, you must use g_utf8_find_prev_char() instead.
204
 * 
205
 * Returns: (transfer none) (not nullable): a pointer to the found character
206
 */
207
gchar *
208
g_utf8_prev_char (const gchar *p)
209
0
{
210
0
  while (TRUE)
211
0
    {
212
0
      p--;
213
0
      if ((*p & 0xc0) != 0x80)
214
0
  return (gchar *)p;
215
0
    }
216
0
}
217
 
218
/**
219
 * g_utf8_strlen:
220
 * @p: pointer to the start of a UTF-8 encoded string
221
 * @max: the maximum number of bytes to examine. If @max
222
 *       is less than 0, then the string is assumed to be
223
 *       nul-terminated. If @max is 0, @p will not be examined and
224
 *       may be %NULL. If @max is greater than 0, up to @max
225
 *       bytes are examined
226
 *
227
 * Computes the length of the string in characters, not including
228
 * the terminating nul character. If the @max'th byte falls in the
229
 * middle of a character, the last (partial) character is not counted.
230
 *
231
 * Returns: the length of the string in characters
232
 */
233
glong
234
g_utf8_strlen (const gchar *p,
235
               gssize       max)
236
0
{
237
0
  glong len = 0;
238
0
  const gchar *start = p;
239
0
  g_return_val_if_fail (p != NULL || max == 0, 0);
240
241
0
  if (max < 0)
242
0
    {
243
0
      while (*p)
244
0
        {
245
0
          p = g_utf8_next_char (p);
246
0
          ++len;
247
0
        }
248
0
    }
249
0
  else
250
0
    {
251
0
      if (max == 0 || !*p)
252
0
        return 0;
253
254
0
      p = g_utf8_next_char (p);
255
256
0
      while (p - start < max && *p)
257
0
        {
258
0
          ++len;
259
0
          p = g_utf8_next_char (p);
260
0
        }
261
262
      /* only do the last len increment if we got a complete
263
       * char (don't count partial chars)
264
       */
265
0
      if (p - start <= max)
266
0
        ++len;
267
0
    }
268
269
0
  return len;
270
0
}
271
272
/**
273
 * g_utf8_substring:
274
 * @str: a UTF-8 encoded string
275
 * @start_pos: a character offset within @str
276
 * @end_pos: another character offset within @str,
277
 *   or `-1` to indicate the end of the string
278
 *
279
 * Copies a substring out of a UTF-8 encoded string.
280
 * The substring will contain @end_pos - @start_pos characters.
281
 *
282
 * Since GLib 2.72, `-1` can be passed to @end_pos to indicate the
283
 * end of the string.
284
 *
285
 * Returns: (transfer full): a newly allocated copy of the requested
286
 *     substring. Free with g_free() when no longer needed.
287
 *
288
 * Since: 2.30
289
 */
290
gchar *
291
g_utf8_substring (const gchar *str,
292
                  glong        start_pos,
293
                  glong        end_pos)
294
0
{
295
0
  gchar *start, *end, *out;
296
297
0
  g_return_val_if_fail (end_pos >= start_pos || end_pos == -1, NULL);
298
299
0
  start = g_utf8_offset_to_pointer (str, start_pos);
300
301
0
  if (end_pos == -1)
302
0
    {
303
0
      glong length = g_utf8_strlen (start, -1);
304
0
      end = g_utf8_offset_to_pointer (start, length);
305
0
    }
306
0
  else
307
0
    {
308
0
      end = g_utf8_offset_to_pointer (start, end_pos - start_pos);
309
0
    }
310
311
0
  out = g_malloc (end - start + 1);
312
0
  memcpy (out, start, end - start);
313
0
  out[end - start] = 0;
314
315
0
  return out;
316
0
}
317
318
/**
319
 * g_utf8_get_char:
320
 * @p: a pointer to Unicode character encoded as UTF-8
321
 * 
322
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
323
 *
324
 * If @p does not point to a valid UTF-8 encoded character, results
325
 * are undefined. If you are not sure that the bytes are complete
326
 * valid Unicode characters, you should use g_utf8_get_char_validated()
327
 * instead.
328
 * 
329
 * Returns: the resulting character
330
 */
331
gunichar
332
g_utf8_get_char (const gchar *p)
333
0
{
334
0
  int i, mask = 0, len;
335
0
  gunichar result;
336
0
  unsigned char c = (unsigned char) *p;
337
338
0
  UTF8_COMPUTE (c, mask, len);
339
0
  if (len == -1)
340
0
    return (gunichar)-1;
341
0
  UTF8_GET (result, p, i, mask, len);
342
343
0
  return result;
344
0
}
345
346
/**
347
 * g_utf8_offset_to_pointer:
348
 * @str: a UTF-8 encoded string
349
 * @offset: a character offset within @str
350
 *
351
 * Converts from an integer character offset to a pointer to a position
352
 * within the string.
353
 *
354
 * Since 2.10, this function allows passing a negative @offset to
355
 * step backwards. It is usually worth stepping backwards from the end
356
 * instead of forwards if @offset is in the last fourth of the string,
357
 * since moving forward is about 3 times faster than moving backward.
358
 *
359
 * Note that this function doesn't abort when reaching the end of @str.
360
 * Therefore you should be sure that @offset is within string boundaries
361
 * before calling that function. Call g_utf8_strlen() when unsure.
362
 * This limitation exists as this function is called frequently during
363
 * text rendering and therefore has to be as fast as possible.
364
 *
365
 * Returns: (transfer none): the resulting pointer
366
 */
367
gchar *
368
g_utf8_offset_to_pointer  (const gchar *str,
369
         glong        offset)
370
0
{
371
0
  const gchar *s = str;
372
373
0
  if (offset > 0) 
374
0
    while (offset--)
375
0
      s = g_utf8_next_char (s);
376
0
  else
377
0
    {
378
0
      const char *s1;
379
380
      /* This nice technique for fast backwards stepping 
381
       * through a UTF-8 string was dubbed "stutter stepping" 
382
       * by its inventor, Larry Ewing.
383
       */
384
0
      while (offset)
385
0
  {
386
0
    s1 = s;
387
0
    s += offset;
388
0
    while ((*s & 0xc0) == 0x80)
389
0
      s--;
390
391
0
    offset += g_utf8_pointer_to_offset (s, s1);
392
0
  }
393
0
    }
394
395
0
  return (gchar *)s;
396
0
}
397
398
/**
399
 * g_utf8_pointer_to_offset:
400
 * @str: a UTF-8 encoded string
401
 * @pos: a pointer to a position within @str
402
 * 
403
 * Converts from a pointer to position within a string to an integer
404
 * character offset.
405
 *
406
 * Since 2.10, this function allows @pos to be before @str, and returns
407
 * a negative offset in this case.
408
 * 
409
 * Returns: the resulting character offset
410
 */
411
glong    
412
g_utf8_pointer_to_offset (const gchar *str,
413
        const gchar *pos)
414
0
{
415
0
  const gchar *s = str;
416
0
  glong offset = 0;    
417
418
0
  if (pos < str) 
419
0
    offset = - g_utf8_pointer_to_offset (pos, str);
420
0
  else
421
0
    while (s < pos)
422
0
      {
423
0
  s = g_utf8_next_char (s);
424
0
  offset++;
425
0
      }
426
  
427
0
  return offset;
428
0
}
429
430
431
/**
432
 * g_utf8_strncpy:
433
 * @dest: (transfer none): buffer to fill with characters from @src
434
 * @src: UTF-8 encoded string
435
 * @n: character count
436
 * 
437
 * Like the standard C strncpy() function, but copies a given number
438
 * of characters instead of a given number of bytes. The @src string
439
 * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
440
 * text before trying to use UTF-8 utility functions with it.)
441
 * 
442
 * Note you must ensure @dest is at least 4 * @n + 1 to fit the
443
 * largest possible UTF-8 characters
444
 *
445
 * Returns: (transfer none): @dest
446
 */
447
gchar *
448
g_utf8_strncpy (gchar       *dest,
449
    const gchar *src,
450
    gsize        n)
451
0
{
452
0
  const gchar *s = src;
453
0
  while (n && *s)
454
0
    {
455
0
      s = g_utf8_next_char(s);
456
0
      n--;
457
0
    }
458
0
  strncpy(dest, src, s - src);
459
0
  dest[s - src] = 0;
460
0
  return dest;
461
0
}
462
463
/* unicode_strchr */
464
465
/**
466
 * g_unichar_to_utf8:
467
 * @c: a Unicode character code
468
 * @outbuf: (out caller-allocates) (optional): output buffer, must have at
469
 *       least 6 bytes of space. If %NULL, the length will be computed and
470
 *       returned and nothing will be written to @outbuf.
471
 * 
472
 * Converts a single character to UTF-8.
473
 * 
474
 * Returns: number of bytes written
475
 */
476
int
477
g_unichar_to_utf8 (gunichar c,
478
       gchar   *outbuf)
479
0
{
480
  /* If this gets modified, also update the copy in g_string_insert_unichar() */
481
0
  guint len = 0;    
482
0
  int first;
483
0
  int i;
484
485
0
  if (c < 0x80)
486
0
    {
487
0
      first = 0;
488
0
      len = 1;
489
0
    }
490
0
  else if (c < 0x800)
491
0
    {
492
0
      first = 0xc0;
493
0
      len = 2;
494
0
    }
495
0
  else if (c < 0x10000)
496
0
    {
497
0
      first = 0xe0;
498
0
      len = 3;
499
0
    }
500
0
   else if (c < 0x200000)
501
0
    {
502
0
      first = 0xf0;
503
0
      len = 4;
504
0
    }
505
0
  else if (c < 0x4000000)
506
0
    {
507
0
      first = 0xf8;
508
0
      len = 5;
509
0
    }
510
0
  else
511
0
    {
512
0
      first = 0xfc;
513
0
      len = 6;
514
0
    }
515
516
0
  if (outbuf)
517
0
    {
518
0
      for (i = len - 1; i > 0; --i)
519
0
  {
520
0
    outbuf[i] = (c & 0x3f) | 0x80;
521
0
    c >>= 6;
522
0
  }
523
0
      outbuf[0] = c | first;
524
0
    }
525
526
0
  return len;
527
0
}
528
529
/**
530
 * g_utf8_strchr:
531
 * @p: a nul-terminated UTF-8 encoded string
532
 * @len: the maximum length of @p
533
 * @c: a Unicode character
534
 * 
535
 * Finds the leftmost occurrence of the given Unicode character
536
 * in a UTF-8 encoded string, while limiting the search to @len bytes.
537
 * If @len is -1, allow unbounded search.
538
 * 
539
 * Returns: (transfer none) (nullable): %NULL if the string does not contain the character,
540
 *     otherwise, a pointer to the start of the leftmost occurrence
541
 *     of the character in the string.
542
 */
543
gchar *
544
g_utf8_strchr (const char *p,
545
         gssize      len,
546
         gunichar    c)
547
0
{
548
0
  gchar ch[10];
549
550
0
  gint charlen = g_unichar_to_utf8 (c, ch);
551
0
  ch[charlen] = '\0';
552
  
553
0
  return g_strstr_len (p, len, ch);
554
0
}
555
556
557
/**
558
 * g_utf8_strrchr:
559
 * @p: a nul-terminated UTF-8 encoded string
560
 * @len: the maximum length of @p
561
 * @c: a Unicode character
562
 * 
563
 * Find the rightmost occurrence of the given Unicode character
564
 * in a UTF-8 encoded string, while limiting the search to @len bytes.
565
 * If @len is -1, allow unbounded search.
566
 * 
567
 * Returns: (transfer none) (nullable): %NULL if the string does not contain the character,
568
 *     otherwise, a pointer to the start of the rightmost occurrence
569
 *     of the character in the string.
570
 */
571
gchar *
572
g_utf8_strrchr (const char *p,
573
    gssize      len,
574
    gunichar    c)
575
0
{
576
0
  gchar ch[10];
577
578
0
  gint charlen = g_unichar_to_utf8 (c, ch);
579
0
  ch[charlen] = '\0';
580
  
581
0
  return g_strrstr_len (p, len, ch);
582
0
}
583
584
585
/* Like g_utf8_get_char, but take a maximum length
586
 * and return (gunichar)-2 on incomplete trailing character;
587
 * also check for malformed or overlong sequences
588
 * and return (gunichar)-1 in this case.
589
 */
590
static inline gunichar
591
g_utf8_get_char_extended (const  gchar *p,
592
        gssize max_len)
593
0
{
594
0
  gsize i, len;
595
0
  gunichar min_code;
596
0
  gunichar wc = (guchar) *p;
597
0
  const gunichar partial_sequence = (gunichar) -2;
598
0
  const gunichar malformed_sequence = (gunichar) -1;
599
600
0
  if (wc < 0x80)
601
0
    {
602
0
      return wc;
603
0
    }
604
0
  else if (G_UNLIKELY (wc < 0xc0))
605
0
    {
606
0
      return malformed_sequence;
607
0
    }
608
0
  else if (wc < 0xe0)
609
0
    {
610
0
      len = 2;
611
0
      wc &= 0x1f;
612
0
      min_code = 1 << 7;
613
0
    }
614
0
  else if (wc < 0xf0)
615
0
    {
616
0
      len = 3;
617
0
      wc &= 0x0f;
618
0
      min_code = 1 << 11;
619
0
    }
620
0
  else if (wc < 0xf8)
621
0
    {
622
0
      len = 4;
623
0
      wc &= 0x07;
624
0
      min_code = 1 << 16;
625
0
    }
626
0
  else if (wc < 0xfc)
627
0
    {
628
0
      len = 5;
629
0
      wc &= 0x03;
630
0
      min_code = 1 << 21;
631
0
    }
632
0
  else if (wc < 0xfe)
633
0
    {
634
0
      len = 6;
635
0
      wc &= 0x01;
636
0
      min_code = 1 << 26;
637
0
    }
638
0
  else
639
0
    {
640
0
      return malformed_sequence;
641
0
    }
642
643
0
  if (G_UNLIKELY (max_len >= 0 && len > (gsize) max_len))
644
0
    {
645
0
      for (i = 1; i < (gsize) max_len; i++)
646
0
  {
647
0
    if ((((guchar *)p)[i] & 0xc0) != 0x80)
648
0
      return malformed_sequence;
649
0
  }
650
0
      return partial_sequence;
651
0
    }
652
653
0
  for (i = 1; i < len; ++i)
654
0
    {
655
0
      gunichar ch = ((guchar *)p)[i];
656
657
0
      if (G_UNLIKELY ((ch & 0xc0) != 0x80))
658
0
  {
659
0
    if (ch)
660
0
      return malformed_sequence;
661
0
    else
662
0
      return partial_sequence;
663
0
  }
664
665
0
      wc <<= 6;
666
0
      wc |= (ch & 0x3f);
667
0
    }
668
669
0
  if (G_UNLIKELY (wc < min_code))
670
0
    return malformed_sequence;
671
672
0
  return wc;
673
0
}
674
675
/**
676
 * g_utf8_get_char_validated:
677
 * @p: a pointer to Unicode character encoded as UTF-8
678
 * @max_len: the maximum number of bytes to read, or -1 if @p is nul-terminated
679
 *
680
 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
681
 * This function checks for incomplete characters, for invalid characters
682
 * such as characters that are out of the range of Unicode, and for
683
 * overlong encodings of valid characters.
684
 *
685
 * Note that g_utf8_get_char_validated() returns (gunichar)-2 if
686
 * @max_len is positive and any of the bytes in the first UTF-8 character
687
 * sequence are nul.
688
 * 
689
 * Returns: the resulting character. If @p points to a partial
690
 *     sequence at the end of a string that could begin a valid 
691
 *     character (or if @max_len is zero), returns (gunichar)-2; 
692
 *     otherwise, if @p does not point to a valid UTF-8 encoded 
693
 *     Unicode character, returns (gunichar)-1.
694
 */
695
gunichar
696
g_utf8_get_char_validated (const gchar *p,
697
         gssize       max_len)
698
0
{
699
0
  gunichar result;
700
701
0
  if (max_len == 0)
702
0
    return (gunichar)-2;
703
704
0
  result = g_utf8_get_char_extended (p, max_len);
705
706
  /* Disallow codepoint U+0000 as it’s a nul byte,
707
   * and all string handling in GLib is nul-terminated */
708
0
  if (result == 0 && max_len > 0)
709
0
    return (gunichar) -2;
710
711
0
  if (result & 0x80000000)
712
0
    return result;
713
0
  else if (!UNICODE_VALID (result))
714
0
    return (gunichar)-1;
715
0
  else
716
0
    return result;
717
0
}
718
719
0
/* Yield the low six bits of the byte at p and advance p by one byte.
 * p must be a modifiable pointer lvalue; no validation is performed. */
#define CONT_BYTE_FAST(p) ((guchar)*(p)++ & 0x3f)
720
721
/**
722
 * g_utf8_to_ucs4_fast:
723
 * @str: a UTF-8 encoded string
724
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
725
 *     then the string is nul-terminated.
726
 * @items_written: (out) (optional): location to store the
727
 *     number of characters in the result, or %NULL.
728
 *
729
 * Convert a string from UTF-8 to a 32-bit fixed width
730
 * representation as UCS-4, assuming valid UTF-8 input.
731
 * This function is roughly twice as fast as g_utf8_to_ucs4()
732
 * but does no error checking on the input. A trailing 0 character
733
 * will be added to the string after the converted text.
734
 * 
735
 * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
736
 *     This value must be freed with g_free().
737
 */
738
gunichar *
739
g_utf8_to_ucs4_fast (const gchar *str,
740
         glong        len,              
741
         glong       *items_written)    
742
0
{
743
0
  gunichar *result;
744
0
  gint n_chars, i;
745
0
  const gchar *p;
746
747
0
  g_return_val_if_fail (str != NULL, NULL);
748
749
0
  p = str;
750
0
  n_chars = 0;
751
0
  if (len < 0)
752
0
    {
753
0
      while (*p)
754
0
  {
755
0
    p = g_utf8_next_char (p);
756
0
    ++n_chars;
757
0
  }
758
0
    }
759
0
  else
760
0
    {
761
0
      while (p < str + len && *p)
762
0
  {
763
0
    p = g_utf8_next_char (p);
764
0
    ++n_chars;
765
0
  }
766
0
    }
767
  
768
0
  result = g_new (gunichar, n_chars + 1);
769
  
770
0
  p = str;
771
0
  for (i=0; i < n_chars; i++)
772
0
    {
773
0
      guchar first = (guchar)*p++;
774
0
      gunichar wc;
775
776
0
      if (first < 0xc0)
777
0
  {
778
          /* We really hope first < 0x80, but we don't want to test an
779
           * extra branch for invalid input, which this function
780
           * does not care about. Handling unexpected continuation bytes
781
           * here will do the least damage. */
782
0
    wc = first;
783
0
  }
784
0
      else
785
0
  {
786
0
          gunichar c1 = CONT_BYTE_FAST(p);
787
0
          if (first < 0xe0)
788
0
            {
789
0
              wc = ((first & 0x1f) << 6) | c1;
790
0
            }
791
0
          else
792
0
            {
793
0
              gunichar c2 = CONT_BYTE_FAST(p);
794
0
              if (first < 0xf0)
795
0
                {
796
0
                  wc = ((first & 0x0f) << 12) | (c1 << 6) | c2;
797
0
                }
798
0
              else
799
0
                {
800
0
                  gunichar c3 = CONT_BYTE_FAST(p);
801
0
                  wc = ((first & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
802
0
                  if (G_UNLIKELY (first >= 0xf8))
803
0
                    {
804
                      /* This can't be valid UTF-8, but g_utf8_next_char()
805
                       * and company allow out-of-range sequences */
806
0
                      gunichar mask = 1 << 20;
807
0
                      while ((wc & mask) != 0)
808
0
                        {
809
0
                          wc <<= 6;
810
0
                          wc |= CONT_BYTE_FAST(p);
811
0
                          mask <<= 5;
812
0
                        }
813
0
                      wc &= mask - 1;
814
0
                    }
815
0
                }
816
0
            }
817
0
  }
818
0
      result[i] = wc;
819
0
    }
820
0
  result[i] = 0;
821
822
0
  if (items_written)
823
0
    *items_written = i;
824
825
0
  return result;
826
0
}
827
828
static gpointer
829
try_malloc_n (gsize n_blocks, gsize n_block_bytes, GError **error)
830
0
{
831
0
    gpointer ptr = g_try_malloc_n (n_blocks, n_block_bytes);
832
0
    if (ptr == NULL)
833
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY,
834
0
                           _("Failed to allocate memory"));
835
0
    return ptr;
836
0
}
837
838
/**
839
 * g_utf8_to_ucs4:
840
 * @str: a UTF-8 encoded string
841
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
842
 *     then the string is nul-terminated.
843
 * @items_read: (out) (optional): location to store number of
844
  *    bytes read, or %NULL.
845
 *     If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
846
 *     returned in case @str contains a trailing partial
847
 *     character. If an error occurs then the index of the
848
 *     invalid input is stored here.
849
 * @items_written: (out) (optional): location to store number
850
 *     of characters written or %NULL. The value here stored does not include
851
 *     the trailing 0 character.
852
 * @error: location to store the error occurring, or %NULL to ignore
853
 *     errors. Any of the errors in #GConvertError other than
854
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
855
 *
856
 * Convert a string from UTF-8 to a 32-bit fixed width
857
 * representation as UCS-4. A trailing 0 character will be added to the
858
 * string after the converted text.
859
 * 
860
 * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
861
 *     This value must be freed with g_free(). If an error occurs,
862
 *     %NULL will be returned and @error set.
863
 */
864
gunichar *
865
g_utf8_to_ucs4 (const gchar *str,
866
    glong        len,             
867
    glong       *items_read,      
868
    glong       *items_written,   
869
    GError     **error)
870
0
{
871
0
  gunichar *result = NULL;
872
0
  gint n_chars, i;
873
0
  const gchar *in;
874
  
875
0
  in = str;
876
0
  n_chars = 0;
877
0
  while ((len < 0 || str + len - in > 0) && *in)
878
0
    {
879
0
      gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
880
0
      if (wc & 0x80000000)
881
0
  {
882
0
    if (wc == (gunichar)-2)
883
0
      {
884
0
        if (items_read)
885
0
    break;
886
0
        else
887
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
888
0
                                     _("Partial character sequence at end of input"));
889
0
      }
890
0
    else
891
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
892
0
                                 _("Invalid byte sequence in conversion input"));
893
894
0
    goto err_out;
895
0
  }
896
897
0
      n_chars++;
898
899
0
      in = g_utf8_next_char (in);
900
0
    }
901
902
0
  result = try_malloc_n (n_chars + 1, sizeof (gunichar), error);
903
0
  if (result == NULL)
904
0
      goto err_out;
905
906
0
  in = str;
907
0
  for (i=0; i < n_chars; i++)
908
0
    {
909
0
      result[i] = g_utf8_get_char (in);
910
0
      in = g_utf8_next_char (in);
911
0
    }
912
0
  result[i] = 0;
913
914
0
  if (items_written)
915
0
    *items_written = n_chars;
916
917
0
 err_out:
918
0
  if (items_read)
919
0
    *items_read = in - str;
920
921
0
  return result;
922
0
}
923
924
/**
925
 * g_ucs4_to_utf8:
926
 * @str: a UCS-4 encoded string
927
 * @len: the maximum length (number of characters) of @str to use. 
928
 *     If @len < 0, then the string is nul-terminated.
929
 * @items_read: (out) (optional): location to store number of
930
 *     characters read, or %NULL.
931
 * @items_written: (out) (optional): location to store number
932
 *     of bytes written or %NULL. The value here stored does not include the
933
 *     trailing 0 byte.
934
 * @error: location to store the error occurring, or %NULL to ignore
935
 *         errors. Any of the errors in #GConvertError other than
936
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
937
 *
938
 * Convert a string from a 32-bit fixed width representation as UCS-4
939
 * to UTF-8. The result will be terminated with a 0 byte.
940
 * 
941
 * Returns: (transfer full): a pointer to a newly allocated UTF-8 string.
942
 *     This value must be freed with g_free(). If an error occurs,
943
 *     %NULL will be returned and @error set. In that case, @items_read
944
 *     will be set to the position of the first invalid input character.
945
 */
946
gchar *
947
g_ucs4_to_utf8 (const gunichar *str,
948
    glong           len,              
949
    glong          *items_read,       
950
    glong          *items_written,    
951
    GError        **error)
952
0
{
953
0
  gint result_length;
954
0
  gchar *result = NULL;
955
0
  gchar *p;
956
0
  gint i;
957
958
0
  result_length = 0;
959
0
  for (i = 0; len < 0 || i < len ; i++)
960
0
    {
961
0
      if (!str[i])
962
0
  break;
963
964
0
      if (str[i] >= 0x80000000)
965
0
  {
966
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
967
0
                               _("Character out of range for UTF-8"));
968
0
    goto err_out;
969
0
  }
970
      
971
0
      result_length += UTF8_LENGTH (str[i]);
972
0
    }
973
974
0
  result = try_malloc_n (result_length + 1, 1, error);
975
0
  if (result == NULL)
976
0
      goto err_out;
977
978
0
  p = result;
979
980
0
  i = 0;
981
0
  while (p < result + result_length)
982
0
    p += g_unichar_to_utf8 (str[i++], p);
983
  
984
0
  *p = '\0';
985
986
0
  if (items_written)
987
0
    *items_written = p - result;
988
989
0
 err_out:
990
0
  if (items_read)
991
0
    *items_read = i;
992
993
0
  return result;
994
0
}
995
996
0
/* Combine a UTF-16 high surrogate (h) and low surrogate (l) into the
 * code point the pair encodes.
 */
#define SURROGATE_VALUE(h,l) \
  ((((h) - 0xd800) * 0x400) + ((l) - 0xdc00) + 0x10000)
997
998
/**
999
 * g_utf16_to_utf8:
1000
 * @str: a UTF-16 encoded string
1001
 * @len: the maximum length (number of #gunichar2) of @str to use. 
1002
 *     If @len < 0, then the string is nul-terminated.
1003
 * @items_read: (out) (optional): location to store number of
1004
 *     words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
1005
 *     be returned in case @str contains a trailing partial character. If
1006
 *     an error occurs then the index of the invalid input is stored here.
1007
 *     It’s guaranteed to be non-negative.
1008
 * @items_written: (out) (optional): location to store number
1009
 *     of bytes written, or %NULL. The value stored here does not include the
1010
 *     trailing 0 byte. It’s guaranteed to be non-negative.
1011
 * @error: location to store the error occurring, or %NULL to ignore
1012
 *     errors. Any of the errors in #GConvertError other than
1013
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
1014
 *
1015
 * Convert a string from UTF-16 to UTF-8. The result will be
1016
 * terminated with a 0 byte.
1017
 *
1018
 * Note that the input is expected to be already in native endianness,
1019
 * an initial byte-order-mark character is not handled specially.
1020
 * g_convert() can be used to convert a byte buffer of UTF-16 data of
1021
 * ambiguous endianness.
1022
 *
1023
 * Further note that this function does not validate the result
1024
 * string; it may e.g. include embedded NUL characters. The only
1025
 * validation done by this function is to ensure that the input can
1026
 * be correctly interpreted as UTF-16, i.e. it doesn't contain
1027
 * unpaired surrogates or partial character sequences.
1028
 *
1029
 * Returns: (transfer full): a pointer to a newly allocated UTF-8 string.
1030
 *     This value must be freed with g_free(). If an error occurs,
1031
 *     %NULL will be returned and @error set.
1032
 **/
1033
gchar *
1034
g_utf16_to_utf8 (const gunichar2  *str,
1035
     glong             len,
1036
     glong            *items_read,
1037
     glong            *items_written,
1038
     GError          **error)
1039
0
{
1040
  /* This function and g_utf16_to_ucs4 are almost exactly identical -
1041
   * The lines that differ are marked.
1042
   */
1043
0
  const gunichar2 *in;
1044
0
  gchar *out;
1045
0
  gchar *result = NULL;
1046
0
  gint n_bytes;
1047
0
  gunichar high_surrogate;
1048
1049
0
  g_return_val_if_fail (str != NULL, NULL);
1050
1051
0
  n_bytes = 0;
1052
0
  in = str;
1053
0
  high_surrogate = 0;
1054
0
  while ((len < 0 || in - str < len) && *in)
1055
0
    {
1056
0
      gunichar2 c = *in;
1057
0
      gunichar wc;
1058
1059
0
      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1060
0
  {
1061
0
    if (high_surrogate)
1062
0
      {
1063
0
        wc = SURROGATE_VALUE (high_surrogate, c);
1064
0
        high_surrogate = 0;
1065
0
      }
1066
0
    else
1067
0
      {
1068
0
        g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1069
0
                                   _("Invalid sequence in conversion input"));
1070
0
        goto err_out;
1071
0
      }
1072
0
  }
1073
0
      else
1074
0
  {
1075
0
    if (high_surrogate)
1076
0
      {
1077
0
        g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1078
0
                                   _("Invalid sequence in conversion input"));
1079
0
        goto err_out;
1080
0
      }
1081
1082
0
    if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1083
0
      {
1084
0
        high_surrogate = c;
1085
0
        goto next1;
1086
0
      }
1087
0
    else
1088
0
      wc = c;
1089
0
  }
1090
1091
      /********** DIFFERENT for UTF8/UCS4 **********/
1092
0
      n_bytes += UTF8_LENGTH (wc);
1093
1094
0
    next1:
1095
0
      in++;
1096
0
    }
1097
1098
0
  if (high_surrogate && !items_read)
1099
0
    {
1100
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1101
0
                           _("Partial character sequence at end of input"));
1102
0
      goto err_out;
1103
0
    }
1104
  
1105
  /* At this point, everything is valid, and we just need to convert
1106
   */
1107
  /********** DIFFERENT for UTF8/UCS4 **********/
1108
0
  result = try_malloc_n (n_bytes + 1, 1, error);
1109
0
  if (result == NULL)
1110
0
      goto err_out;
1111
1112
0
  high_surrogate = 0;
1113
0
  out = result;
1114
0
  in = str;
1115
0
  while (out < result + n_bytes)
1116
0
    {
1117
0
      gunichar2 c = *in;
1118
0
      gunichar wc;
1119
1120
0
      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1121
0
  {
1122
0
    wc = SURROGATE_VALUE (high_surrogate, c);
1123
0
    high_surrogate = 0;
1124
0
  }
1125
0
      else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1126
0
  {
1127
0
    high_surrogate = c;
1128
0
    goto next2;
1129
0
  }
1130
0
      else
1131
0
  wc = c;
1132
1133
      /********** DIFFERENT for UTF8/UCS4 **********/
1134
0
      out += g_unichar_to_utf8 (wc, out);
1135
1136
0
    next2:
1137
0
      in++;
1138
0
    }
1139
  
1140
  /********** DIFFERENT for UTF8/UCS4 **********/
1141
0
  *out = '\0';
1142
1143
0
  if (items_written)
1144
    /********** DIFFERENT for UTF8/UCS4 **********/
1145
0
    *items_written = out - result;
1146
1147
0
 err_out:
1148
0
  if (items_read)
1149
0
    *items_read = in - str;
1150
1151
0
  return result;
1152
0
}
1153
1154
/**
1155
 * g_utf16_to_ucs4:
1156
 * @str: a UTF-16 encoded string
1157
 * @len: the maximum length (number of #gunichar2) of @str to use. 
1158
 *     If @len < 0, then the string is nul-terminated.
1159
 * @items_read: (out) (optional): location to store number of
1160
 *     words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
1161
 *     be returned in case @str contains a trailing partial character. If
1162
 *     an error occurs then the index of the invalid input is stored here.
1163
 * @items_written: (out) (optional): location to store number
1164
 *     of characters written, or %NULL. The value stored here does not include
1165
 *     the trailing 0 character.
1166
 * @error: location to store the error occurring, or %NULL to ignore
1167
 *     errors. Any of the errors in #GConvertError other than
1168
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
1169
 *
1170
 * Convert a string from UTF-16 to UCS-4. The result will be
1171
 * nul-terminated.
1172
 * 
1173
 * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
1174
 *     This value must be freed with g_free(). If an error occurs,
1175
 *     %NULL will be returned and @error set.
1176
 */
1177
gunichar *
1178
g_utf16_to_ucs4 (const gunichar2  *str,
1179
     glong             len,              
1180
     glong            *items_read,       
1181
     glong            *items_written,    
1182
     GError          **error)
1183
0
{
1184
0
  const gunichar2 *in;
1185
0
  gchar *out;
1186
0
  gchar *result = NULL;
1187
0
  gint n_bytes;
1188
0
  gunichar high_surrogate;
1189
1190
0
  g_return_val_if_fail (str != NULL, NULL);
1191
1192
0
  n_bytes = 0;
1193
0
  in = str;
1194
0
  high_surrogate = 0;
1195
0
  while ((len < 0 || in - str < len) && *in)
1196
0
    {
1197
0
      gunichar2 c = *in;
1198
1199
0
      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1200
0
  {
1201
0
    if (high_surrogate)
1202
0
      {
1203
0
        high_surrogate = 0;
1204
0
      }
1205
0
    else
1206
0
      {
1207
0
        g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1208
0
                                   _("Invalid sequence in conversion input"));
1209
0
        goto err_out;
1210
0
      }
1211
0
  }
1212
0
      else
1213
0
  {
1214
0
    if (high_surrogate)
1215
0
      {
1216
0
        g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1217
0
                                   _("Invalid sequence in conversion input"));
1218
0
        goto err_out;
1219
0
      }
1220
1221
0
    if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1222
0
      {
1223
0
        high_surrogate = c;
1224
0
        goto next1;
1225
0
      }
1226
0
  }
1227
1228
      /********** DIFFERENT for UTF8/UCS4 **********/
1229
0
      n_bytes += sizeof (gunichar);
1230
1231
0
    next1:
1232
0
      in++;
1233
0
    }
1234
1235
0
  if (high_surrogate && !items_read)
1236
0
    {
1237
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1238
0
                           _("Partial character sequence at end of input"));
1239
0
      goto err_out;
1240
0
    }
1241
  
1242
  /* At this point, everything is valid, and we just need to convert
1243
   */
1244
  /********** DIFFERENT for UTF8/UCS4 **********/
1245
0
  result = try_malloc_n (n_bytes + 4, 1, error);
1246
0
  if (result == NULL)
1247
0
      goto err_out;
1248
1249
0
  high_surrogate = 0;
1250
0
  out = result;
1251
0
  in = str;
1252
0
  while (out < result + n_bytes)
1253
0
    {
1254
0
      gunichar2 c = *in;
1255
0
      gunichar wc;
1256
1257
0
      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1258
0
  {
1259
0
    wc = SURROGATE_VALUE (high_surrogate, c);
1260
0
    high_surrogate = 0;
1261
0
  }
1262
0
      else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1263
0
  {
1264
0
    high_surrogate = c;
1265
0
    goto next2;
1266
0
  }
1267
0
      else
1268
0
  wc = c;
1269
1270
      /********** DIFFERENT for UTF8/UCS4 **********/
1271
0
      *(gunichar *)out = wc;
1272
0
      out += sizeof (gunichar);
1273
1274
0
    next2:
1275
0
      in++;
1276
0
    }
1277
1278
  /********** DIFFERENT for UTF8/UCS4 **********/
1279
0
  *(gunichar *)out = 0;
1280
1281
0
  if (items_written)
1282
    /********** DIFFERENT for UTF8/UCS4 **********/
1283
0
    *items_written = (out - result) / sizeof (gunichar);
1284
1285
0
 err_out:
1286
0
  if (items_read)
1287
0
    *items_read = in - str;
1288
1289
0
  return (gunichar *)result;
1290
0
}
1291
1292
/**
1293
 * g_utf8_to_utf16:
1294
 * @str: a UTF-8 encoded string
1295
 * @len: the maximum length (number of bytes) of @str to use.
1296
 *     If @len < 0, then the string is nul-terminated.
1297
 * @items_read: (out) (optional): location to store number of
1298
 *     bytes read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
1299
 *     be returned in case @str contains a trailing partial character. If
1300
 *     an error occurs then the index of the invalid input is stored here.
1301
 * @items_written: (out) (optional): location to store number
1302
 *     of #gunichar2 written, or %NULL. The value stored here does not include
1303
 *     the trailing 0.
1304
 * @error: location to store the error occurring, or %NULL to ignore
1305
 *     errors. Any of the errors in #GConvertError other than
1306
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
1307
 *
1308
 * Convert a string from UTF-8 to UTF-16. A 0 character will be
1309
 * added to the result after the converted text.
1310
 *
1311
 * Returns: (transfer full): a pointer to a newly allocated UTF-16 string.
1312
 *     This value must be freed with g_free(). If an error occurs,
1313
 *     %NULL will be returned and @error set.
1314
 */
1315
gunichar2 *
1316
g_utf8_to_utf16 (const gchar *str,
1317
     glong        len,
1318
     glong       *items_read,
1319
     glong       *items_written,
1320
     GError     **error)
1321
0
{
1322
0
  gunichar2 *result = NULL;
1323
0
  gint n16;
1324
0
  const gchar *in;
1325
0
  gint i;
1326
1327
0
  g_return_val_if_fail (str != NULL, NULL);
1328
1329
0
  in = str;
1330
0
  n16 = 0;
1331
0
  while ((len < 0 || str + len - in > 0) && *in)
1332
0
    {
1333
0
      gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
1334
0
      if (wc & 0x80000000)
1335
0
  {
1336
0
    if (wc == (gunichar)-2)
1337
0
      {
1338
0
        if (items_read)
1339
0
    break;
1340
0
        else
1341
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1342
0
                                     _("Partial character sequence at end of input"));
1343
0
      }
1344
0
    else
1345
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1346
0
                                 _("Invalid byte sequence in conversion input"));
1347
1348
0
    goto err_out;
1349
0
  }
1350
1351
0
      if (wc < 0xd800)
1352
0
  n16 += 1;
1353
0
      else if (wc < 0xe000)
1354
0
  {
1355
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1356
0
                               _("Invalid sequence in conversion input"));
1357
1358
0
    goto err_out;
1359
0
  }
1360
0
      else if (wc < 0x10000)
1361
0
  n16 += 1;
1362
0
      else if (wc < 0x110000)
1363
0
  n16 += 2;
1364
0
      else
1365
0
  {
1366
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1367
0
                               _("Character out of range for UTF-16"));
1368
1369
0
    goto err_out;
1370
0
  }
1371
      
1372
0
      in = g_utf8_next_char (in);
1373
0
    }
1374
1375
0
  result = try_malloc_n (n16 + 1, sizeof (gunichar2), error);
1376
0
  if (result == NULL)
1377
0
      goto err_out;
1378
1379
0
  in = str;
1380
0
  for (i = 0; i < n16;)
1381
0
    {
1382
0
      gunichar wc = g_utf8_get_char (in);
1383
1384
0
      if (wc < 0x10000)
1385
0
  {
1386
0
    result[i++] = wc;
1387
0
  }
1388
0
      else
1389
0
  {
1390
0
    result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
1391
0
    result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
1392
0
  }
1393
      
1394
0
      in = g_utf8_next_char (in);
1395
0
    }
1396
1397
0
  result[i] = 0;
1398
1399
0
  if (items_written)
1400
0
    *items_written = n16;
1401
1402
0
 err_out:
1403
0
  if (items_read)
1404
0
    *items_read = in - str;
1405
  
1406
0
  return result;
1407
0
}
1408
1409
/**
1410
 * g_ucs4_to_utf16:
1411
 * @str: a UCS-4 encoded string
1412
 * @len: the maximum length (number of characters) of @str to use. 
1413
 *     If @len < 0, then the string is nul-terminated.
1414
 * @items_read: (out) (optional): location to store number of
1415
 *     characters read, or %NULL. If an error occurs then the index of the invalid
1416
 *     input is stored here.
1417
 * @items_written: (out) (optional): location to store number
1418
 *     of #gunichar2 written, or %NULL. The value stored here does not include
1419
 *     the trailing 0.
1420
 * @error: location to store the error occurring, or %NULL to ignore
1421
 *     errors. Any of the errors in #GConvertError other than
1422
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
1423
 *
1424
 * Convert a string from UCS-4 to UTF-16. A 0 character will be
1425
 * added to the result after the converted text.
1426
 * 
1427
 * Returns: (transfer full): a pointer to a newly allocated UTF-16 string.
1428
 *     This value must be freed with g_free(). If an error occurs,
1429
 *     %NULL will be returned and @error set.
1430
 */
1431
gunichar2 *
1432
g_ucs4_to_utf16 (const gunichar  *str,
1433
     glong            len,              
1434
     glong           *items_read,       
1435
     glong           *items_written,    
1436
     GError         **error)
1437
0
{
1438
0
  gunichar2 *result = NULL;
1439
0
  gint n16;
1440
0
  gint i, j;
1441
1442
0
  n16 = 0;
1443
0
  i = 0;
1444
0
  while ((len < 0 || i < len) && str[i])
1445
0
    {
1446
0
      gunichar wc = str[i];
1447
1448
0
      if (wc < 0xd800)
1449
0
  n16 += 1;
1450
0
      else if (wc < 0xe000)
1451
0
  {
1452
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1453
0
                               _("Invalid sequence in conversion input"));
1454
1455
0
    goto err_out;
1456
0
  }
1457
0
      else if (wc < 0x10000)
1458
0
  n16 += 1;
1459
0
      else if (wc < 0x110000)
1460
0
  n16 += 2;
1461
0
      else
1462
0
  {
1463
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1464
0
                               _("Character out of range for UTF-16"));
1465
1466
0
    goto err_out;
1467
0
  }
1468
1469
0
      i++;
1470
0
    }
1471
1472
0
  result = try_malloc_n (n16 + 1, sizeof (gunichar2), error);
1473
0
  if (result == NULL)
1474
0
      goto err_out;
1475
1476
0
  for (i = 0, j = 0; j < n16; i++)
1477
0
    {
1478
0
      gunichar wc = str[i];
1479
1480
0
      if (wc < 0x10000)
1481
0
  {
1482
0
    result[j++] = wc;
1483
0
  }
1484
0
      else
1485
0
  {
1486
0
    result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
1487
0
    result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
1488
0
  }
1489
0
    }
1490
0
  result[j] = 0;
1491
1492
0
  if (items_written)
1493
0
    *items_written = n16;
1494
  
1495
0
 err_out:
1496
0
  if (items_read)
1497
0
    *items_read = i;
1498
  
1499
0
  return result;
1500
0
}
1501
1502
/* Check the byte at the cursor: after masking with `mask` it must equal
 * `expect`, otherwise control jumps to the `error` label. The expanding
 * function must therefore provide both a `p` cursor and an `error` label.
 */
#define VALIDATE_BYTE(mask, expect)                                  \
  G_STMT_START {                                                     \
    if (G_UNLIKELY ((*(const guchar *) p & (mask)) != (expect)))     \
      goto error;                                                    \
  } G_STMT_END
1507
1508
/* see IETF RFC 3629 Section 4 */
1509
1510
static const gchar *
1511
fast_validate (const char *str)
1512
1513
1.33k
{
1514
1.33k
  const gchar *p;
1515
1516
34.7k
  for (p = str; *p; p++)
1517
33.4k
    {
1518
33.4k
      if (*(guchar *)p < 128)
1519
33.4k
  /* done */;
1520
0
      else 
1521
0
  {
1522
0
    const gchar *last;
1523
1524
0
    last = p;
1525
0
    if (*(guchar *)p < 0xe0) /* 110xxxxx */
1526
0
      {
1527
0
        if (G_UNLIKELY (*(guchar *)p < 0xc2))
1528
0
    goto error;
1529
0
      }
1530
0
    else
1531
0
      {
1532
0
        if (*(guchar *)p < 0xf0) /* 1110xxxx */
1533
0
    {
1534
0
      switch (*(guchar *)p++ & 0x0f)
1535
0
        {
1536
0
        case 0:
1537
0
          VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
1538
0
          break;
1539
0
        case 0x0d:
1540
0
          VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
1541
0
          break;
1542
0
        default:
1543
0
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1544
0
        }
1545
0
    }
1546
0
        else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
1547
0
    {
1548
0
      switch (*(guchar *)p++ & 0x07)
1549
0
        {
1550
0
        case 0:
1551
0
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1552
0
          if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
1553
0
      goto error;
1554
0
          break;
1555
0
        case 4:
1556
0
          VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
1557
0
          break;
1558
0
        default:
1559
0
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1560
0
        }
1561
0
      p++;
1562
0
      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1563
0
    }
1564
0
        else
1565
0
    goto error;
1566
0
      }
1567
1568
0
    p++;
1569
0
    VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1570
1571
0
    continue;
1572
1573
0
  error:
1574
0
    return last;
1575
0
  }
1576
33.4k
    }
1577
1578
1.33k
  return p;
1579
1.33k
}
1580
1581
static const gchar *
1582
fast_validate_len (const char *str,
1583
       gssize      max_len)
1584
1585
102k
{
1586
102k
  const gchar *p;
1587
1588
102k
  g_assert (max_len >= 0);
1589
1590
3.68M
  for (p = str; ((p - str) < max_len) && *p; p++)
1591
3.68M
    {
1592
3.68M
      if (*(guchar *)p < 128)
1593
3.52M
  /* done */;
1594
155k
      else 
1595
155k
  {
1596
155k
    const gchar *last;
1597
1598
155k
    last = p;
1599
155k
    if (*(guchar *)p < 0xe0) /* 110xxxxx */
1600
52.6k
      {
1601
52.6k
        if (G_UNLIKELY (max_len - (p - str) < 2))
1602
351
    goto error;
1603
        
1604
52.3k
        if (G_UNLIKELY (*(guchar *)p < 0xc2))
1605
21.5k
    goto error;
1606
52.3k
      }
1607
103k
    else
1608
103k
      {
1609
103k
        if (*(guchar *)p < 0xf0) /* 1110xxxx */
1610
64.7k
    {
1611
64.7k
      if (G_UNLIKELY (max_len - (p - str) < 3))
1612
1.88k
        goto error;
1613
1614
62.8k
      switch (*(guchar *)p++ & 0x0f)
1615
62.8k
        {
1616
49.2k
        case 0:
1617
49.2k
          VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
1618
48.2k
          break;
1619
48.2k
        case 0x0d:
1620
1.83k
          VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
1621
1.46k
          break;
1622
11.7k
        default:
1623
11.7k
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1624
62.8k
        }
1625
62.8k
    }
1626
38.3k
        else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
1627
10.4k
    {
1628
10.4k
      if (G_UNLIKELY (max_len - (p - str) < 4))
1629
1.39k
        goto error;
1630
1631
9.09k
      switch (*(guchar *)p++ & 0x07)
1632
9.09k
        {
1633
2.30k
        case 0:
1634
2.30k
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1635
1.39k
          if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
1636
990
      goto error;
1637
404
          break;
1638
473
        case 4:
1639
473
          VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
1640
156
          break;
1641
6.32k
        default:
1642
6.32k
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1643
9.09k
        }
1644
3.26k
      p++;
1645
3.26k
      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1646
3.26k
    }
1647
27.8k
        else
1648
27.8k
    goto error;
1649
103k
      }
1650
1651
87.4k
    p++;
1652
87.4k
    VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1653
1654
57.1k
    continue;
1655
1656
98.6k
  error:
1657
98.6k
    return last;
1658
87.4k
  }
1659
3.68M
    }
1660
1661
3.40k
  return p;
1662
102k
}
1663
1664
/**
1665
 * g_utf8_validate:
1666
 * @str: (array length=max_len) (element-type guint8): a pointer to character data
1667
 * @max_len: max bytes to validate, or -1 to go until NUL
1668
 * @end: (out) (optional) (transfer none): return location for end of valid data
1669
 * 
1670
 * Validates UTF-8 encoded text. @str is the text to validate;
1671
 * if @str is nul-terminated, then @max_len can be -1, otherwise
1672
 * @max_len should be the number of bytes to validate.
1673
 * If @end is non-%NULL, then the end of the valid range
1674
 * will be stored there (i.e. the start of the first invalid 
1675
 * character if some bytes were invalid, or the end of the text 
1676
 * being validated otherwise).
1677
 *
1678
 * Note that g_utf8_validate() returns %FALSE if @max_len is 
1679
 * positive and any of the @max_len bytes are nul.
1680
 *
1681
 * Returns %TRUE if all of @str was valid. Many GLib and GTK+
1682
 * routines require valid UTF-8 as input; so data read from a file
1683
 * or the network should be checked with g_utf8_validate() before
1684
 * doing anything else with it.
1685
 * 
1686
 * Returns: %TRUE if the text was valid UTF-8
1687
 */
1688
gboolean
1689
g_utf8_validate (const char   *str,
1690
     gssize        max_len,    
1691
     const gchar **end)
1692
1693
103k
{
1694
103k
  const gchar *p;
1695
1696
103k
  if (max_len >= 0)
1697
102k
    return g_utf8_validate_len (str, max_len, end);
1698
1699
1.33k
  p = fast_validate (str);
1700
1701
1.33k
  if (end)
1702
0
    *end = p;
1703
1704
1.33k
  if (*p != '\0')
1705
0
    return FALSE;
1706
1.33k
  else
1707
1.33k
    return TRUE;
1708
1.33k
}
1709
1710
/**
1711
 * g_utf8_validate_len:
1712
 * @str: (array length=max_len) (element-type guint8): a pointer to character data
1713
 * @max_len: max bytes to validate
1714
 * @end: (out) (optional) (transfer none): return location for end of valid data
1715
 *
1716
 * Validates UTF-8 encoded text.
1717
 *
1718
 * As with g_utf8_validate(), but @max_len must be set, and hence this function
1719
 * will always return %FALSE if any of the bytes of @str are nul.
1720
 *
1721
 * Returns: %TRUE if the text was valid UTF-8
1722
 * Since: 2.60
1723
 */
1724
gboolean
1725
g_utf8_validate_len (const char   *str,
1726
                     gsize         max_len,
1727
                     const gchar **end)
1728
1729
102k
{
1730
102k
  const gchar *p;
1731
1732
102k
  p = fast_validate_len (str, max_len);
1733
1734
102k
  if (end)
1735
0
    *end = p;
1736
1737
102k
  if (p != str + max_len)
1738
98.6k
    return FALSE;
1739
3.40k
  else
1740
3.40k
    return TRUE;
1741
102k
}
1742
1743
/**
1744
 * g_unichar_validate:
1745
 * @ch: a Unicode character
1746
 * 
1747
 * Checks whether @ch is a valid Unicode character. Some possible
1748
 * integer values of @ch will not be valid. 0 is considered a valid
1749
 * character, though it's normally a string terminator.
1750
 * 
1751
 * Returns: %TRUE if @ch is a valid Unicode character
1752
 **/
1753
gboolean
1754
g_unichar_validate (gunichar ch)
1755
0
{
1756
0
  return UNICODE_VALID (ch);
1757
0
}
1758
1759
/**
1760
 * g_utf8_strreverse:
1761
 * @str: a UTF-8 encoded string
1762
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
1763
 *     then the string is nul-terminated.
1764
 *
1765
 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text. 
1766
 * (Use g_utf8_validate() on all text before trying to use UTF-8 
1767
 * utility functions with it.)
1768
 *
1769
 * This function is intended for programmatic uses of reversed strings.
1770
 * It pays no attention to decomposed characters, combining marks, byte 
1771
 * order marks, directional indicators (LRM, LRO, etc) and similar 
1772
 * characters which might need special handling when reversing a string 
1773
 * for display purposes.
1774
 *
1775
 * Note that unlike g_strreverse(), this function returns
1776
 * newly-allocated memory, which should be freed with g_free() when
1777
 * no longer needed. 
1778
 *
1779
 * Returns: (transfer full): a newly-allocated string which is the reverse of @str
1780
 *
1781
 * Since: 2.2
1782
 */
1783
gchar *
1784
g_utf8_strreverse (const gchar *str,
1785
       gssize       len)
1786
0
{
1787
0
  gchar *r, *result;
1788
0
  const gchar *p;
1789
1790
0
  if (len < 0)
1791
0
    len = strlen (str);
1792
1793
0
  result = g_new (gchar, len + 1);
1794
0
  r = result + len;
1795
0
  p = str;
1796
0
  while (r > result)
1797
0
    {
1798
0
      gchar *m, skip = g_utf8_skip[*(guchar*) p];
1799
0
      r -= skip;
1800
0
      g_assert (r >= result);
1801
0
      for (m = r; skip; skip--)
1802
0
        *m++ = *p++;
1803
0
    }
1804
0
  result[len] = 0;
1805
1806
0
  return result;
1807
0
}
1808
1809
/**
1810
 * g_utf8_make_valid:
1811
 * @str: string to coerce into UTF-8
1812
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
1813
 *     then the string is nul-terminated.
1814
 *
1815
 * If the provided string is valid UTF-8, return a copy of it. If not,
1816
 * return a copy in which bytes that could not be interpreted as valid Unicode
1817
 * are replaced with the Unicode replacement character (U+FFFD).
1818
 *
1819
 * For example, this is an appropriate function to use if you have received
1820
 * a string that was incorrectly declared to be UTF-8, and you need a valid
1821
 * UTF-8 version of it that can be logged or displayed to the user, with the
1822
 * assumption that it is close enough to ASCII or UTF-8 to be mostly
1823
 * readable as-is.
1824
 *
1825
 * Returns: (transfer full): a valid UTF-8 string whose content resembles @str
1826
 *
1827
 * Since: 2.52
1828
 */
1829
gchar *
1830
g_utf8_make_valid (const gchar *str,
1831
                   gssize       len)
1832
0
{
1833
0
  GString *string;
1834
0
  const gchar *remainder, *invalid;
1835
0
  gsize remaining_bytes, valid_bytes;
1836
1837
0
  g_return_val_if_fail (str != NULL, NULL);
1838
1839
0
  if (len < 0)
1840
0
    len = strlen (str);
1841
1842
0
  string = NULL;
1843
0
  remainder = str;
1844
0
  remaining_bytes = len;
1845
1846
0
  while (remaining_bytes != 0) 
1847
0
    {
1848
0
      if (g_utf8_validate (remainder, remaining_bytes, &invalid)) 
1849
0
  break;
1850
0
      valid_bytes = invalid - remainder;
1851
    
1852
0
      if (string == NULL) 
1853
0
  string = g_string_sized_new (remaining_bytes);
1854
1855
0
      g_string_append_len (string, remainder, valid_bytes);
1856
      /* append U+FFFD REPLACEMENT CHARACTER */
1857
0
      g_string_append (string, "\357\277\275");
1858
      
1859
0
      remaining_bytes -= valid_bytes + 1;
1860
0
      remainder = invalid + 1;
1861
0
    }
1862
  
1863
0
  if (string == NULL)
1864
0
    return g_strndup (str, len);
1865
  
1866
0
  g_string_append_len (string, remainder, remaining_bytes);
1867
0
  g_string_append_c (string, '\0');
1868
1869
0
  g_assert (g_utf8_validate (string->str, -1, NULL));
1870
1871
0
  return g_string_free (string, FALSE);
1872
0
}