Coverage Report

Created: 2025-08-26 06:56

/src/tinysparql/subprojects/glib-2.80.3/glib/gutf8.c
Line
Count
Source (jump to first uncovered line)
1
/* gutf8.c - Operations on UTF-8 strings.
2
 *
3
 * Copyright (C) 1999 Tom Tromey
4
 * Copyright (C) 2000 Red Hat, Inc.
5
 *
6
 * SPDX-License-Identifier: LGPL-2.1-or-later
7
 *
8
 * This library is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * This library is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20
 */
21
22
#include "config.h"
23
24
#include <stdlib.h>
25
#ifdef HAVE_CODESET
26
#include <langinfo.h>
27
#endif
28
#include <string.h>
29
30
#ifdef G_PLATFORM_WIN32
31
#include <stdio.h>
32
#define STRICT
33
#include <windows.h>
34
#undef STRICT
35
#endif
36
37
#include "gconvert.h"
38
#include "ghash.h"
39
#include "gstrfuncs.h"
40
#include "gtestutils.h"
41
#include "gtypes.h"
42
#include "gthread.h"
43
#include "glibintl.h"
44
45
#define UTF8_COMPUTE(Char, Mask, Len)               \
46
65.9k
  if (Char < 128)                   \
47
65.9k
    {                       \
48
65.9k
      Len = 1;                      \
49
65.9k
      Mask = 0x7f;                    \
50
65.9k
    }                        \
51
65.9k
  else if ((Char & 0xe0) == 0xc0)               \
52
0
    {                       \
53
0
      Len = 2;                      \
54
0
      Mask = 0x1f;                    \
55
0
    }                        \
56
0
  else if ((Char & 0xf0) == 0xe0)               \
57
0
    {                       \
58
0
      Len = 3;                      \
59
0
      Mask = 0x0f;                    \
60
0
    }                        \
61
0
  else if ((Char & 0xf8) == 0xf0)               \
62
0
    {                       \
63
0
      Len = 4;                      \
64
0
      Mask = 0x07;                    \
65
0
    }                        \
66
0
  else if ((Char & 0xfc) == 0xf8)               \
67
0
    {                       \
68
0
      Len = 5;                      \
69
0
      Mask = 0x03;                    \
70
0
    }                        \
71
0
  else if ((Char & 0xfe) == 0xfc)               \
72
0
    {                       \
73
0
      Len = 6;                      \
74
0
      Mask = 0x01;                    \
75
0
    }                        \
76
0
  else                        \
77
0
    Len = -1;
78
79
#define UTF8_LENGTH(Char)              \
80
0
  ((Char) < 0x80 ? 1 :                 \
81
0
   ((Char) < 0x800 ? 2 :               \
82
0
    ((Char) < 0x10000 ? 3 :            \
83
0
     ((Char) < 0x200000 ? 4 :          \
84
0
      ((Char) < 0x4000000 ? 5 : 6)))))
85
   
86
87
#define UTF8_GET(Result, Chars, Count, Mask, Len)           \
88
65.9k
  (Result) = (Chars)[0] & (Mask);               \
89
65.9k
  for ((Count) = 1; (Count) < (Len); ++(Count))             \
90
65.9k
    {                       \
91
0
      if (((Chars)[(Count)] & 0xc0) != 0x80)             \
92
0
  {                     \
93
0
    (Result) = -1;                  \
94
0
    break;                    \
95
0
  }                      \
96
0
      (Result) <<= 6;                   \
97
0
      (Result) |= ((Chars)[(Count)] & 0x3f);              \
98
0
    }
99
    
100
/*
101
 * Check whether a Unicode (5.2) char is in a valid range.
102
 *
103
 * The first check comes from the Unicode guarantee to never encode
104
 * a point above 0x0010ffff, since UTF-16 couldn't represent it.
105
 * 
106
 * The second check covers surrogate pairs (category Cs).
107
 *
108
 * @param Char the character
109
 */
110
#define UNICODE_VALID(Char)                   \
111
384M
    ((Char) < 0x110000 &&                     \
112
384M
     (((Char) & 0xFFFFF800) != 0xD800))
113
114
    
115
static const gchar utf8_skip_data[256] = {
116
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
117
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
118
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
119
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
120
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
121
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
122
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
123
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
124
};
125
126
const gchar * const g_utf8_skip = utf8_skip_data;
127
128
/**
129
 * g_utf8_find_prev_char:
130
 * @str: pointer to the beginning of a UTF-8 encoded string
131
 * @p: pointer to some position within @str
132
 * 
133
 * Given a position @p with a UTF-8 encoded string @str, find the start
134
 * of the previous UTF-8 character starting before @p. Returns %NULL if no
135
 * UTF-8 characters are present in @str before @p.
136
 *
137
 * @p does not have to be at the beginning of a UTF-8 character. No check
138
 * is made to see if the character found is actually valid other than
139
 * it starts with an appropriate byte.
140
 *
141
 * Returns: (transfer none) (nullable): a pointer to the found character or %NULL.
142
 */
143
gchar *
144
g_utf8_find_prev_char (const gchar *str,
145
           const gchar *p)
146
0
{
147
0
  while (p > str)
148
0
    {
149
0
      --p;
150
0
      if ((*p & 0xc0) != 0x80)
151
0
  return (gchar *)p;
152
0
    }
153
0
  return NULL;
154
0
}
155
156
/**
157
 * g_utf8_find_next_char:
158
 * @p: a pointer to a position within a UTF-8 encoded string
159
 * @end: (nullable): a pointer to the byte following the end of the string,
160
 *     or %NULL to indicate that the string is nul-terminated
161
 *
162
 * Finds the start of the next UTF-8 character in the string after @p.
163
 *
164
 * @p does not have to be at the beginning of a UTF-8 character. No check
165
 * is made to see if the character found is actually valid other than
166
 * it starts with an appropriate byte.
167
 * 
168
 * If @end is %NULL, the return value will never be %NULL: if the end of the
169
 * string is reached, a pointer to the terminating nul byte is returned. If
170
 * @end is non-%NULL, the return value will be %NULL if the end of the string
171
 * is reached.
172
 *
173
 * Returns: (transfer none) (nullable): a pointer to the found character or %NULL if @end is
174
 *    set and is reached
175
 */
176
gchar *
177
g_utf8_find_next_char (const gchar *p,
178
           const gchar *end)
179
0
{
180
0
  if (end)
181
0
    {
182
0
      for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
183
0
        ;
184
0
      return (p >= end) ? NULL : (gchar *)p;
185
0
    }
186
0
  else
187
0
    {
188
0
      for (++p; (*p & 0xc0) == 0x80; ++p)
189
0
        ;
190
0
      return (gchar *)p;
191
0
    }
192
0
}
193
194
/**
195
 * g_utf8_prev_char:
196
 * @p: a pointer to a position within a UTF-8 encoded string
197
 *
198
 * Finds the previous UTF-8 character in the string before @p.
199
 *
200
 * @p does not have to be at the beginning of a UTF-8 character. No check
201
 * is made to see if the character found is actually valid other than
202
 * it starts with an appropriate byte. If @p might be the first
203
 * character of the string, you must use g_utf8_find_prev_char() instead.
204
 * 
205
 * Returns: (transfer none) (not nullable): a pointer to the found character
206
 */
207
gchar *
208
g_utf8_prev_char (const gchar *p)
209
0
{
210
0
  while (TRUE)
211
0
    {
212
0
      p--;
213
0
      if ((*p & 0xc0) != 0x80)
214
0
  return (gchar *)p;
215
0
    }
216
0
}
217
 
218
/**
219
 * g_utf8_strlen:
220
 * @p: pointer to the start of a UTF-8 encoded string
221
 * @max: the maximum number of bytes to examine. If @max
222
 *       is less than 0, then the string is assumed to be
223
 *       nul-terminated. If @max is 0, @p will not be examined and
224
 *       may be %NULL. If @max is greater than 0, up to @max
225
 *       bytes are examined
226
 *
227
 * Computes the length of the string in characters, not including
228
 * the terminating nul character. If the @max'th byte falls in the
229
 * middle of a character, the last (partial) character is not counted.
230
 *
231
 * Returns: the length of the string in characters
232
 */
233
glong
234
g_utf8_strlen (const gchar *p,
235
               gssize       max)
236
0
{
237
0
  glong len = 0;
238
0
  const gchar *start = p;
239
0
  g_return_val_if_fail (p != NULL || max == 0, 0);
240
241
0
  if (max < 0)
242
0
    {
243
0
      while (*p)
244
0
        {
245
0
          p = g_utf8_next_char (p);
246
0
          ++len;
247
0
        }
248
0
    }
249
0
  else
250
0
    {
251
0
      if (max == 0 || !*p)
252
0
        return 0;
253
254
0
      p = g_utf8_next_char (p);
255
256
0
      while (p - start < max && *p)
257
0
        {
258
0
          ++len;
259
0
          p = g_utf8_next_char (p);
260
0
        }
261
262
      /* only do the last len increment if we got a complete
263
       * char (don't count partial chars)
264
       */
265
0
      if (p - start <= max)
266
0
        ++len;
267
0
    }
268
269
0
  return len;
270
0
}
271
272
/**
273
 * g_utf8_substring:
274
 * @str: a UTF-8 encoded string
275
 * @start_pos: a character offset within @str
276
 * @end_pos: another character offset within @str,
277
 *   or `-1` to indicate the end of the string
278
 *
279
 * Copies a substring out of a UTF-8 encoded string.
280
 * The substring will contain @end_pos - @start_pos characters.
281
 *
282
 * Since GLib 2.72, `-1` can be passed to @end_pos to indicate the
283
 * end of the string.
284
 *
285
 * Returns: (transfer full): a newly allocated copy of the requested
286
 *     substring. Free with g_free() when no longer needed.
287
 *
288
 * Since: 2.30
289
 */
290
gchar *
291
g_utf8_substring (const gchar *str,
292
                  glong        start_pos,
293
                  glong        end_pos)
294
0
{
295
0
  gchar *start, *end, *out;
296
297
0
  g_return_val_if_fail (end_pos >= start_pos || end_pos == -1, NULL);
298
299
0
  start = g_utf8_offset_to_pointer (str, start_pos);
300
301
0
  if (end_pos == -1)
302
0
    {
303
0
      glong length = g_utf8_strlen (start, -1);
304
0
      end = g_utf8_offset_to_pointer (start, length);
305
0
    }
306
0
  else
307
0
    {
308
0
      end = g_utf8_offset_to_pointer (start, end_pos - start_pos);
309
0
    }
310
311
0
  out = g_malloc (end - start + 1);
312
0
  memcpy (out, start, end - start);
313
0
  out[end - start] = 0;
314
315
0
  return out;
316
0
}
317
318
/**
319
 * g_utf8_get_char:
320
 * @p: a pointer to Unicode character encoded as UTF-8
321
 * 
322
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
323
 *
324
 * If @p does not point to a valid UTF-8 encoded character, results
325
 * are undefined. If you are not sure that the bytes are complete
326
 * valid Unicode characters, you should use g_utf8_get_char_validated()
327
 * instead.
328
 * 
329
 * Returns: the resulting character
330
 */
331
gunichar
332
g_utf8_get_char (const gchar *p)
333
65.9k
{
334
65.9k
  int i, mask = 0, len;
335
65.9k
  gunichar result;
336
65.9k
  unsigned char c = (unsigned char) *p;
337
338
65.9k
  UTF8_COMPUTE (c, mask, len);
339
65.9k
  if (len == -1)
340
0
    return (gunichar)-1;
341
65.9k
  UTF8_GET (result, p, i, mask, len);
342
343
65.9k
  return result;
344
65.9k
}
345
346
/**
347
 * g_utf8_offset_to_pointer:
348
 * @str: a UTF-8 encoded string
349
 * @offset: a character offset within @str
350
 *
351
 * Converts from an integer character offset to a pointer to a position
352
 * within the string.
353
 *
354
 * Since 2.10, this function allows to pass a negative @offset to
355
 * step backwards. It is usually worth stepping backwards from the end
356
 * instead of forwards if @offset is in the last fourth of the string,
357
 * since moving forward is about 3 times faster than moving backward.
358
 *
359
 * Note that this function doesn't abort when reaching the end of @str.
360
 * Therefore you should be sure that @offset is within string boundaries
361
 * before calling that function. Call g_utf8_strlen() when unsure.
362
 * This limitation exists as this function is called frequently during
363
 * text rendering and therefore has to be as fast as possible.
364
 *
365
 * Returns: (transfer none): the resulting pointer
366
 */
367
gchar *
368
g_utf8_offset_to_pointer  (const gchar *str,
369
         glong        offset)
370
0
{
371
0
  const gchar *s = str;
372
373
0
  if (offset > 0) 
374
0
    while (offset--)
375
0
      s = g_utf8_next_char (s);
376
0
  else
377
0
    {
378
0
      const char *s1;
379
380
      /* This nice technique for fast backwards stepping 
381
       * through a UTF-8 string was dubbed "stutter stepping" 
382
       * by its inventor, Larry Ewing.
383
       */
384
0
      while (offset)
385
0
  {
386
0
    s1 = s;
387
0
    s += offset;
388
0
    while ((*s & 0xc0) == 0x80)
389
0
      s--;
390
391
0
    offset += g_utf8_pointer_to_offset (s, s1);
392
0
  }
393
0
    }
394
395
0
  return (gchar *)s;
396
0
}
397
398
/**
399
 * g_utf8_pointer_to_offset:
400
 * @str: a UTF-8 encoded string
401
 * @pos: a pointer to a position within @str
402
 * 
403
 * Converts from a pointer to position within a string to an integer
404
 * character offset.
405
 *
406
 * Since 2.10, this function allows @pos to be before @str, and returns
407
 * a negative offset in this case.
408
 * 
409
 * Returns: the resulting character offset
410
 */
411
glong    
412
g_utf8_pointer_to_offset (const gchar *str,
413
        const gchar *pos)
414
0
{
415
0
  const gchar *s = str;
416
0
  glong offset = 0;    
417
418
0
  if (pos < str) 
419
0
    offset = - g_utf8_pointer_to_offset (pos, str);
420
0
  else
421
0
    while (s < pos)
422
0
      {
423
0
  s = g_utf8_next_char (s);
424
0
  offset++;
425
0
      }
426
  
427
0
  return offset;
428
0
}
429
430
431
/**
432
 * g_utf8_strncpy:
433
 * @dest: (transfer none): buffer to fill with characters from @src
434
 * @src: UTF-8 encoded string
435
 * @n: character count
436
 * 
437
 * Like the standard C strncpy() function, but copies a given number
438
 * of characters instead of a given number of bytes. The @src string
439
 * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
440
 * text before trying to use UTF-8 utility functions with it.)
441
 * 
442
 * Note you must ensure @dest is at least 4 * @n + 1 to fit the
443
 * largest possible UTF-8 characters
444
 *
445
 * Returns: (transfer none): @dest
446
 */
447
gchar *
448
g_utf8_strncpy (gchar       *dest,
449
    const gchar *src,
450
    gsize        n)
451
0
{
452
0
  const gchar *s = src;
453
0
  while (n && *s)
454
0
    {
455
0
      s = g_utf8_next_char(s);
456
0
      n--;
457
0
    }
458
0
  strncpy(dest, src, s - src);
459
0
  dest[s - src] = 0;
460
0
  return dest;
461
0
}
462
463
/**
464
 * g_utf8_truncate_middle:
465
 * @string: (transfer none): a nul-terminated UTF-8 encoded string
466
 * @truncate_length: the new size of @string, in characters, including the ellipsis character
467
 *
468
 * Cuts off the middle of the string, preserving half of @truncate_length
469
 * characters at the beginning and half at the end.
470
 * 
471
 * If @string is already short enough, this returns a copy of @string.
472
 * If @truncate_length is `0`, an empty string is returned.
473
 *
474
 * Returns: (transfer full): a newly-allocated copy of @string ellipsized in the middle
475
 *
476
 * Since: 2.78
477
 */
478
gchar *
479
g_utf8_truncate_middle (const gchar *string,
480
                        gsize        truncate_length)
481
0
{
482
0
  const gchar *ellipsis = "…";
483
0
  const gsize ellipsis_bytes = strlen (ellipsis);
484
485
0
  gsize length;
486
0
  gsize left_substring_length;
487
0
  gchar *left_substring_end;
488
0
  gchar *right_substring_begin;
489
0
  gchar *right_substring_end;
490
0
  gsize left_bytes;
491
0
  gsize right_bytes;
492
0
  gchar *result;
493
494
0
  g_return_val_if_fail (string != NULL, NULL);
495
496
0
  length = g_utf8_strlen (string, -1);
497
  /* Current string already smaller than requested length */
498
0
  if (length <= truncate_length)
499
0
    return g_strdup (string);
500
0
  if (truncate_length == 0)
501
0
    return g_strdup ("");
502
503
  /* Find substrings to keep, ignore ellipsis character for that */
504
0
  truncate_length -= 1;
505
506
0
  left_substring_length = truncate_length / 2;
507
508
0
  left_substring_end = g_utf8_offset_to_pointer (string, left_substring_length);
509
0
  right_substring_begin = g_utf8_offset_to_pointer (left_substring_end,
510
0
                                                    length - truncate_length);
511
0
  right_substring_end = g_utf8_offset_to_pointer (right_substring_begin,
512
0
                                                  truncate_length - left_substring_length);
513
514
0
  g_assert (*right_substring_end == '\0');
515
516
0
  left_bytes = left_substring_end - string;
517
0
  right_bytes = right_substring_end - right_substring_begin;
518
519
0
  result = g_malloc (left_bytes + ellipsis_bytes + right_bytes + 1);
520
521
0
  strncpy (result, string, left_bytes);
522
0
  memcpy (result + left_bytes, ellipsis, ellipsis_bytes);
523
0
  strncpy (result + left_bytes + ellipsis_bytes, right_substring_begin, right_bytes);
524
0
  result[left_bytes + ellipsis_bytes + right_bytes] = '\0';
525
526
0
  return result;
527
0
}
528
529
/* unicode_strchr */
530
531
/**
532
 * g_unichar_to_utf8:
533
 * @c: a Unicode character code
534
 * @outbuf: (out caller-allocates) (optional): output buffer, must have at
535
 *       least 6 bytes of space. If %NULL, the length will be computed and
536
 *       returned and nothing will be written to @outbuf.
537
 * 
538
 * Converts a single character to UTF-8.
539
 * 
540
 * Returns: number of bytes written
541
 */
542
int
543
g_unichar_to_utf8 (gunichar c,
544
       gchar   *outbuf)
545
0
{
546
  /* If this gets modified, also update the copy in g_string_insert_unichar() */
547
0
  guint len = 0;    
548
0
  int first;
549
0
  int i;
550
551
0
  if (c < 0x80)
552
0
    {
553
0
      first = 0;
554
0
      len = 1;
555
0
    }
556
0
  else if (c < 0x800)
557
0
    {
558
0
      first = 0xc0;
559
0
      len = 2;
560
0
    }
561
0
  else if (c < 0x10000)
562
0
    {
563
0
      first = 0xe0;
564
0
      len = 3;
565
0
    }
566
0
   else if (c < 0x200000)
567
0
    {
568
0
      first = 0xf0;
569
0
      len = 4;
570
0
    }
571
0
  else if (c < 0x4000000)
572
0
    {
573
0
      first = 0xf8;
574
0
      len = 5;
575
0
    }
576
0
  else
577
0
    {
578
0
      first = 0xfc;
579
0
      len = 6;
580
0
    }
581
582
0
  if (outbuf)
583
0
    {
584
0
      for (i = len - 1; i > 0; --i)
585
0
  {
586
0
    outbuf[i] = (c & 0x3f) | 0x80;
587
0
    c >>= 6;
588
0
  }
589
0
      outbuf[0] = c | first;
590
0
    }
591
592
0
  return len;
593
0
}
594
595
/**
596
 * g_utf8_strchr:
597
 * @p: a nul-terminated UTF-8 encoded string
598
 * @len: the maximum length of @p
599
 * @c: a Unicode character
600
 * 
601
 * Finds the leftmost occurrence of the given Unicode character
602
 * in a UTF-8 encoded string, while limiting the search to @len bytes.
603
 * If @len is -1, allow unbounded search.
604
 * 
605
 * Returns: (transfer none) (nullable): %NULL if the string does not contain the character,
606
 *     otherwise, a pointer to the start of the leftmost occurrence
607
 *     of the character in the string.
608
 */
609
gchar *
610
g_utf8_strchr (const char *p,
611
         gssize      len,
612
         gunichar    c)
613
0
{
614
0
  gchar ch[10];
615
616
0
  gint charlen = g_unichar_to_utf8 (c, ch);
617
0
  ch[charlen] = '\0';
618
  
619
0
  return g_strstr_len (p, len, ch);
620
0
}
621
622
623
/**
624
 * g_utf8_strrchr:
625
 * @p: a nul-terminated UTF-8 encoded string
626
 * @len: the maximum length of @p
627
 * @c: a Unicode character
628
 * 
629
 * Find the rightmost occurrence of the given Unicode character
630
 * in a UTF-8 encoded string, while limiting the search to @len bytes.
631
 * If @len is -1, allow unbounded search.
632
 * 
633
 * Returns: (transfer none) (nullable): %NULL if the string does not contain the character,
634
 *     otherwise, a pointer to the start of the rightmost occurrence
635
 *     of the character in the string.
636
 */
637
gchar *
638
g_utf8_strrchr (const char *p,
639
    gssize      len,
640
    gunichar    c)
641
0
{
642
0
  gchar ch[10];
643
644
0
  gint charlen = g_unichar_to_utf8 (c, ch);
645
0
  ch[charlen] = '\0';
646
  
647
0
  return g_strrstr_len (p, len, ch);
648
0
}
649
650
651
/* Like g_utf8_get_char, but take a maximum length
652
 * and return (gunichar)-2 on incomplete trailing character;
653
 * also check for malformed or overlong sequences
654
 * and return (gunichar)-1 in this case.
655
 */
656
static inline gunichar
657
g_utf8_get_char_extended (const  gchar *p,
658
        gssize max_len)
659
423M
{
660
423M
  gsize i, len;
661
423M
  gunichar min_code;
662
423M
  gunichar wc = (guchar) *p;
663
423M
  const gunichar partial_sequence = (gunichar) -2;
664
423M
  const gunichar malformed_sequence = (gunichar) -1;
665
666
423M
  if (wc < 0x80)
667
397M
    {
668
397M
      return wc;
669
397M
    }
670
25.4M
  else if (G_UNLIKELY (wc < 0xc0))
671
11.2M
    {
672
11.2M
      return malformed_sequence;
673
11.2M
    }
674
14.1M
  else if (wc < 0xe0)
675
1.88M
    {
676
1.88M
      len = 2;
677
1.88M
      wc &= 0x1f;
678
1.88M
      min_code = 1 << 7;
679
1.88M
    }
680
12.2M
  else if (wc < 0xf0)
681
3.95M
    {
682
3.95M
      len = 3;
683
3.95M
      wc &= 0x0f;
684
3.95M
      min_code = 1 << 11;
685
3.95M
    }
686
8.34M
  else if (wc < 0xf8)
687
395k
    {
688
395k
      len = 4;
689
395k
      wc &= 0x07;
690
395k
      min_code = 1 << 16;
691
395k
    }
692
7.94M
  else if (wc < 0xfc)
693
206k
    {
694
206k
      len = 5;
695
206k
      wc &= 0x03;
696
206k
      min_code = 1 << 21;
697
206k
    }
698
7.74M
  else if (wc < 0xfe)
699
173k
    {
700
173k
      len = 6;
701
173k
      wc &= 0x01;
702
173k
      min_code = 1 << 26;
703
173k
    }
704
7.56M
  else
705
7.56M
    {
706
7.56M
      return malformed_sequence;
707
7.56M
    }
708
709
6.61M
  if (G_UNLIKELY (max_len >= 0 && len > (gsize) max_len))
710
7.40k
    {
711
8.10k
      for (i = 1; i < (gsize) max_len; i++)
712
1.12k
  {
713
1.12k
    if ((((guchar *)p)[i] & 0xc0) != 0x80)
714
428
      return malformed_sequence;
715
1.12k
  }
716
6.98k
      return partial_sequence;
717
7.40k
    }
718
719
11.7M
  for (i = 1; i < len; ++i)
720
9.06M
    {
721
9.06M
      gunichar ch = ((guchar *)p)[i];
722
723
9.06M
      if (G_UNLIKELY ((ch & 0xc0) != 0x80))
724
3.89M
  {
725
3.89M
    if (ch)
726
3.84M
      return malformed_sequence;
727
48.7k
    else
728
48.7k
      return partial_sequence;
729
3.89M
  }
730
731
5.17M
      wc <<= 6;
732
5.17M
      wc |= (ch & 0x3f);
733
5.17M
    }
734
735
2.71M
  if (G_UNLIKELY (wc < min_code))
736
7.14k
    return malformed_sequence;
737
738
2.70M
  return wc;
739
2.71M
}
740
741
/**
742
 * g_utf8_get_char_validated:
743
 * @p: a pointer to Unicode character encoded as UTF-8
744
 * @max_len: the maximum number of bytes to read, or -1 if @p is nul-terminated
745
 *
746
 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
747
 * This function checks for incomplete characters, for invalid characters
748
 * such as characters that are out of the range of Unicode, and for
749
 * overlong encodings of valid characters.
750
 *
751
 * Note that g_utf8_get_char_validated() returns (gunichar)-2 if
752
 * @max_len is positive and any of the bytes in the first UTF-8 character
753
 * sequence are nul.
754
 * 
755
 * Returns: the resulting character. If @p points to a partial
756
 *     sequence at the end of a string that could begin a valid 
757
 *     character (or if @max_len is zero), returns (gunichar)-2; 
758
 *     otherwise, if @p does not point to a valid UTF-8 encoded 
759
 *     Unicode character, returns (gunichar)-1.
760
 */
761
gunichar
762
g_utf8_get_char_validated (const gchar *p,
763
         gssize       max_len)
764
423M
{
765
423M
  gunichar result;
766
767
423M
  if (max_len == 0)
768
17.8k
    return (gunichar)-2;
769
770
423M
  result = g_utf8_get_char_extended (p, max_len);
771
772
  /* Disallow codepoint U+0000 as it’s a nul byte,
773
   * and all string handling in GLib is nul-terminated */
774
423M
  if (result == 0 && max_len > 0)
775
15.5M
    return (gunichar) -2;
776
777
407M
  if (result & 0x80000000)
778
22.6M
    return result;
779
384M
  else if (!UNICODE_VALID (result))
780
35.3k
    return (gunichar)-1;
781
384M
  else
782
384M
    return result;
783
407M
}
784
785
0
#define CONT_BYTE_FAST(p) ((guchar)*p++ & 0x3f)
786
787
/**
788
 * g_utf8_to_ucs4_fast:
789
 * @str: a UTF-8 encoded string
790
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
791
 *     then the string is nul-terminated.
792
 * @items_written: (out) (optional): location to store the
793
 *     number of characters in the result, or %NULL.
794
 *
795
 * Convert a string from UTF-8 to a 32-bit fixed width
796
 * representation as UCS-4, assuming valid UTF-8 input.
797
 * This function is roughly twice as fast as g_utf8_to_ucs4()
798
 * but does no error checking on the input. A trailing 0 character
799
 * will be added to the string after the converted text.
800
 * 
801
 * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
802
 *     This value must be freed with g_free().
803
 */
804
gunichar *
805
g_utf8_to_ucs4_fast (const gchar *str,
806
         glong        len,              
807
         glong       *items_written)    
808
0
{
809
0
  gunichar *result;
810
0
  gint n_chars, i;
811
0
  const gchar *p;
812
813
0
  g_return_val_if_fail (str != NULL, NULL);
814
815
0
  p = str;
816
0
  n_chars = 0;
817
0
  if (len < 0)
818
0
    {
819
0
      while (*p)
820
0
  {
821
0
    p = g_utf8_next_char (p);
822
0
    ++n_chars;
823
0
  }
824
0
    }
825
0
  else
826
0
    {
827
0
      while (p < str + len && *p)
828
0
  {
829
0
    p = g_utf8_next_char (p);
830
0
    ++n_chars;
831
0
  }
832
0
    }
833
  
834
0
  result = g_new (gunichar, n_chars + 1);
835
  
836
0
  p = str;
837
0
  for (i=0; i < n_chars; i++)
838
0
    {
839
0
      guchar first = (guchar)*p++;
840
0
      gunichar wc;
841
842
0
      if (first < 0xc0)
843
0
  {
844
          /* We really hope first < 0x80, but we don't want to test an
845
           * extra branch for invalid input, which this function
846
           * does not care about. Handling unexpected continuation bytes
847
           * here will do the least damage. */
848
0
    wc = first;
849
0
  }
850
0
      else
851
0
  {
852
0
          gunichar c1 = CONT_BYTE_FAST(p);
853
0
          if (first < 0xe0)
854
0
            {
855
0
              wc = ((first & 0x1f) << 6) | c1;
856
0
            }
857
0
          else
858
0
            {
859
0
              gunichar c2 = CONT_BYTE_FAST(p);
860
0
              if (first < 0xf0)
861
0
                {
862
0
                  wc = ((first & 0x0f) << 12) | (c1 << 6) | c2;
863
0
                }
864
0
              else
865
0
                {
866
0
                  gunichar c3 = CONT_BYTE_FAST(p);
867
0
                  wc = ((first & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
868
0
                  if (G_UNLIKELY (first >= 0xf8))
869
0
                    {
870
                      /* This can't be valid UTF-8, but g_utf8_next_char()
871
                       * and company allow out-of-range sequences */
872
0
                      gunichar mask = 1 << 20;
873
0
                      while ((wc & mask) != 0)
874
0
                        {
875
0
                          wc <<= 6;
876
0
                          wc |= CONT_BYTE_FAST(p);
877
0
                          mask <<= 5;
878
0
                        }
879
0
                      wc &= mask - 1;
880
0
                    }
881
0
                }
882
0
            }
883
0
  }
884
0
      result[i] = wc;
885
0
    }
886
0
  result[i] = 0;
887
888
0
  if (items_written)
889
0
    *items_written = i;
890
891
0
  return result;
892
0
}
893
894
static gpointer
895
try_malloc_n (gsize n_blocks, gsize n_block_bytes, GError **error)
896
0
{
897
0
    gpointer ptr = g_try_malloc_n (n_blocks, n_block_bytes);
898
0
    if (ptr == NULL)
899
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY,
900
0
                           _("Failed to allocate memory"));
901
0
    return ptr;
902
0
}
903
904
/**
905
 * g_utf8_to_ucs4:
906
 * @str: a UTF-8 encoded string
907
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
908
 *     then the string is nul-terminated.
909
 * @items_read: (out) (optional): location to store number of
910
  *    bytes read, or %NULL.
911
 *     If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
912
 *     returned in case @str contains a trailing partial
913
 *     character. If an error occurs then the index of the
914
 *     invalid input is stored here.
915
 * @items_written: (out) (optional): location to store number
916
 *     of characters written or %NULL. The value here stored does not include
917
 *     the trailing 0 character.
918
 * @error: location to store the error occurring, or %NULL to ignore
919
 *     errors. Any of the errors in #GConvertError other than
920
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
921
 *
922
 * Convert a string from UTF-8 to a 32-bit fixed width
923
 * representation as UCS-4. A trailing 0 character will be added to the
924
 * string after the converted text.
925
 * 
926
 * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
927
 *     This value must be freed with g_free(). If an error occurs,
928
 *     %NULL will be returned and @error set.
929
 */
930
gunichar *
931
g_utf8_to_ucs4 (const gchar *str,
932
    glong        len,             
933
    glong       *items_read,      
934
    glong       *items_written,   
935
    GError     **error)
936
0
{
937
0
  gunichar *result = NULL;
938
0
  gint n_chars, i;
939
0
  const gchar *in;
940
  
941
0
  in = str;
942
0
  n_chars = 0;
943
0
  while ((len < 0 || str + len - in > 0) && *in)
944
0
    {
945
0
      gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
946
0
      if (wc & 0x80000000)
947
0
  {
948
0
    if (wc == (gunichar)-2)
949
0
      {
950
0
        if (items_read)
951
0
    break;
952
0
        else
953
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
954
0
                                     _("Partial character sequence at end of input"));
955
0
      }
956
0
    else
957
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
958
0
                                 _("Invalid byte sequence in conversion input"));
959
960
0
    goto err_out;
961
0
  }
962
963
0
      n_chars++;
964
965
0
      in = g_utf8_next_char (in);
966
0
    }
967
968
0
  result = try_malloc_n (n_chars + 1, sizeof (gunichar), error);
969
0
  if (result == NULL)
970
0
      goto err_out;
971
972
0
  in = str;
973
0
  for (i=0; i < n_chars; i++)
974
0
    {
975
0
      result[i] = g_utf8_get_char (in);
976
0
      in = g_utf8_next_char (in);
977
0
    }
978
0
  result[i] = 0;
979
980
0
  if (items_written)
981
0
    *items_written = n_chars;
982
983
0
 err_out:
984
0
  if (items_read)
985
0
    *items_read = in - str;
986
987
0
  return result;
988
0
}
989
990
/**
991
 * g_ucs4_to_utf8:
992
 * @str: (array length=len) (element-type gunichar): a UCS-4 encoded string
993
 * @len: the maximum length (number of characters) of @str to use. 
994
 *     If @len < 0, then the string is nul-terminated.
995
 * @items_read: (out) (optional): location to store number of
996
 *     characters read, or %NULL.
997
 * @items_written: (out) (optional): location to store number
998
 *     of bytes written or %NULL. The value here stored does not include the
999
 *     trailing 0 byte.
1000
 * @error: location to store the error occurring, or %NULL to ignore
1001
 *         errors. Any of the errors in #GConvertError other than
1002
 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1003
 *
1004
 * Convert a string from a 32-bit fixed width representation as UCS-4.
1005
 * to UTF-8. The result will be terminated with a 0 byte.
1006
 * 
1007
 * Returns: (transfer full): a pointer to a newly allocated UTF-8 string.
1008
 *     This value must be freed with g_free(). If an error occurs,
1009
 *     %NULL will be returned and @error set. In that case, @items_read
1010
 *     will be set to the position of the first invalid input character.
1011
 */
1012
gchar *
1013
g_ucs4_to_utf8 (const gunichar *str,
1014
    glong           len,              
1015
    glong          *items_read,       
1016
    glong          *items_written,    
1017
    GError        **error)
1018
0
{
1019
0
  gint result_length;
1020
0
  gchar *result = NULL;
1021
0
  gchar *p;
1022
0
  gint i;
1023
1024
0
  result_length = 0;
1025
0
  for (i = 0; len < 0 || i < len ; i++)
1026
0
    {
1027
0
      if (!str[i])
1028
0
  break;
1029
1030
0
      if (str[i] >= 0x80000000)
1031
0
  {
1032
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1033
0
                               _("Character out of range for UTF-8"));
1034
0
    goto err_out;
1035
0
  }
1036
      
1037
0
      result_length += UTF8_LENGTH (str[i]);
1038
0
    }
1039
1040
0
  result = try_malloc_n (result_length + 1, 1, error);
1041
0
  if (result == NULL)
1042
0
      goto err_out;
1043
1044
0
  p = result;
1045
1046
0
  i = 0;
1047
0
  while (p < result + result_length)
1048
0
    p += g_unichar_to_utf8 (str[i++], p);
1049
  
1050
0
  *p = '\0';
1051
1052
0
  if (items_written)
1053
0
    *items_written = p - result;
1054
1055
0
 err_out:
1056
0
  if (items_read)
1057
0
    *items_read = i;
1058
1059
0
  return result;
1060
0
}
1061
1062
0
/* Combine a UTF-16 high surrogate @h and low surrogate @l into the code
 * point they encode.
 */
#define SURROGATE_VALUE(h,l) \
  (0x10000 + ((h) - 0xd800) * 0x400 + ((l) - 0xdc00))
1063
1064
/**
1065
 * g_utf16_to_utf8:
1066
 * @str: (array length=len) (element-type guint16): a UTF-16 encoded string
1067
 * @len: the maximum length (number of #gunichar2) of @str to use. 
1068
 *     If @len < 0, then the string is nul-terminated.
1069
 * @items_read: (out) (optional): location to store number of
1070
 *     words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
1071
 *     be returned in case @str contains a trailing partial character. If
1072
 *     an error occurs then the index of the invalid input is stored here.
1073
 *     It’s guaranteed to be non-negative.
1074
 * @items_written: (out) (optional): location to store number
1075
 *     of bytes written, or %NULL. The value stored here does not include the
1076
 *     trailing 0 byte. It’s guaranteed to be non-negative.
1077
 * @error: location to store the error occurring, or %NULL to ignore
1078
 *     errors. Any of the errors in #GConvertError other than
1079
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
1080
 *
1081
 * Convert a string from UTF-16 to UTF-8. The result will be
1082
 * terminated with a 0 byte.
1083
 *
1084
 * Note that the input is expected to be already in native endianness,
1085
 * an initial byte-order-mark character is not handled specially.
1086
 * g_convert() can be used to convert a byte buffer of UTF-16 data of
1087
 * ambiguous endianness.
1088
 *
1089
 * Further note that this function does not validate the result
1090
 * string; it may e.g. include embedded NUL characters. The only
1091
 * validation done by this function is to ensure that the input can
1092
 * be correctly interpreted as UTF-16, i.e. it doesn't contain
1093
 * unpaired surrogates or partial character sequences.
1094
 *
1095
 * Returns: (transfer full): a pointer to a newly allocated UTF-8 string.
1096
 *     This value must be freed with g_free(). If an error occurs,
1097
 *     %NULL will be returned and @error set.
1098
 **/
1099
gchar *
1100
g_utf16_to_utf8 (const gunichar2  *str,
1101
     glong             len,
1102
     glong            *items_read,
1103
     glong            *items_written,
1104
     GError          **error)
1105
0
{
1106
  /* This function and g_utf16_to_ucs4 are almost exactly identical -
1107
   * The lines that differ are marked.
1108
   */
1109
0
  const gunichar2 *in;
1110
0
  gchar *out;
1111
0
  gchar *result = NULL;
1112
0
  gint n_bytes;
1113
0
  gunichar high_surrogate;
1114
1115
0
  g_return_val_if_fail (str != NULL, NULL);
1116
1117
0
  n_bytes = 0;
1118
0
  in = str;
1119
0
  high_surrogate = 0;
1120
0
  while ((len < 0 || in - str < len) && *in)
1121
0
    {
1122
0
      gunichar2 c = *in;
1123
0
      gunichar wc;
1124
1125
0
      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1126
0
  {
1127
0
    if (high_surrogate)
1128
0
      {
1129
0
        wc = SURROGATE_VALUE (high_surrogate, c);
1130
0
        high_surrogate = 0;
1131
0
      }
1132
0
    else
1133
0
      {
1134
0
        g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1135
0
                                   _("Invalid sequence in conversion input"));
1136
0
        goto err_out;
1137
0
      }
1138
0
  }
1139
0
      else
1140
0
  {
1141
0
    if (high_surrogate)
1142
0
      {
1143
0
        g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1144
0
                                   _("Invalid sequence in conversion input"));
1145
0
        goto err_out;
1146
0
      }
1147
1148
0
    if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1149
0
      {
1150
0
        high_surrogate = c;
1151
0
        goto next1;
1152
0
      }
1153
0
    else
1154
0
      wc = c;
1155
0
  }
1156
1157
      /********** DIFFERENT for UTF8/UCS4 **********/
1158
0
      n_bytes += UTF8_LENGTH (wc);
1159
1160
0
    next1:
1161
0
      in++;
1162
0
    }
1163
1164
0
  if (high_surrogate && !items_read)
1165
0
    {
1166
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1167
0
                           _("Partial character sequence at end of input"));
1168
0
      goto err_out;
1169
0
    }
1170
  
1171
  /* At this point, everything is valid, and we just need to convert
1172
   */
1173
  /********** DIFFERENT for UTF8/UCS4 **********/
1174
0
  result = try_malloc_n (n_bytes + 1, 1, error);
1175
0
  if (result == NULL)
1176
0
      goto err_out;
1177
1178
0
  high_surrogate = 0;
1179
0
  out = result;
1180
0
  in = str;
1181
0
  while (out < result + n_bytes)
1182
0
    {
1183
0
      gunichar2 c = *in;
1184
0
      gunichar wc;
1185
1186
0
      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1187
0
  {
1188
0
    wc = SURROGATE_VALUE (high_surrogate, c);
1189
0
    high_surrogate = 0;
1190
0
  }
1191
0
      else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1192
0
  {
1193
0
    high_surrogate = c;
1194
0
    goto next2;
1195
0
  }
1196
0
      else
1197
0
  wc = c;
1198
1199
      /********** DIFFERENT for UTF8/UCS4 **********/
1200
0
      out += g_unichar_to_utf8 (wc, out);
1201
1202
0
    next2:
1203
0
      in++;
1204
0
    }
1205
  
1206
  /********** DIFFERENT for UTF8/UCS4 **********/
1207
0
  *out = '\0';
1208
1209
0
  if (items_written)
1210
    /********** DIFFERENT for UTF8/UCS4 **********/
1211
0
    *items_written = out - result;
1212
1213
0
 err_out:
1214
0
  if (items_read)
1215
0
    *items_read = in - str;
1216
1217
0
  return result;
1218
0
}
1219
1220
/**
1221
 * g_utf16_to_ucs4:
1222
 * @str: (array length=len) (element-type guint16): a UTF-16 encoded string
1223
 * @len: the maximum length (number of #gunichar2) of @str to use. 
1224
 *     If @len < 0, then the string is nul-terminated.
1225
 * @items_read: (out) (optional): location to store number of
1226
 *     words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
1227
 *     be returned in case @str contains a trailing partial character. If
1228
 *     an error occurs then the index of the invalid input is stored here.
1229
 * @items_written: (out) (optional): location to store number
1230
 *     of characters written, or %NULL. The value stored here does not include
1231
 *     the trailing 0 character.
1232
 * @error: location to store the error occurring, or %NULL to ignore
1233
 *     errors. Any of the errors in #GConvertError other than
1234
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
1235
 *
1236
 * Convert a string from UTF-16 to UCS-4. The result will be
1237
 * nul-terminated.
1238
 * 
1239
 * Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
1240
 *     This value must be freed with g_free(). If an error occurs,
1241
 *     %NULL will be returned and @error set.
1242
 */
1243
gunichar *
1244
g_utf16_to_ucs4 (const gunichar2  *str,
1245
     glong             len,              
1246
     glong            *items_read,       
1247
     glong            *items_written,    
1248
     GError          **error)
1249
0
{
1250
0
  const gunichar2 *in;
1251
0
  gchar *out;
1252
0
  gchar *result = NULL;
1253
0
  gint n_bytes;
1254
0
  gunichar high_surrogate;
1255
1256
0
  g_return_val_if_fail (str != NULL, NULL);
1257
1258
0
  n_bytes = 0;
1259
0
  in = str;
1260
0
  high_surrogate = 0;
1261
0
  while ((len < 0 || in - str < len) && *in)
1262
0
    {
1263
0
      gunichar2 c = *in;
1264
1265
0
      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1266
0
  {
1267
0
    if (high_surrogate)
1268
0
      {
1269
0
        high_surrogate = 0;
1270
0
      }
1271
0
    else
1272
0
      {
1273
0
        g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1274
0
                                   _("Invalid sequence in conversion input"));
1275
0
        goto err_out;
1276
0
      }
1277
0
  }
1278
0
      else
1279
0
  {
1280
0
    if (high_surrogate)
1281
0
      {
1282
0
        g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1283
0
                                   _("Invalid sequence in conversion input"));
1284
0
        goto err_out;
1285
0
      }
1286
1287
0
    if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1288
0
      {
1289
0
        high_surrogate = c;
1290
0
        goto next1;
1291
0
      }
1292
0
  }
1293
1294
      /********** DIFFERENT for UTF8/UCS4 **********/
1295
0
      n_bytes += sizeof (gunichar);
1296
1297
0
    next1:
1298
0
      in++;
1299
0
    }
1300
1301
0
  if (high_surrogate && !items_read)
1302
0
    {
1303
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1304
0
                           _("Partial character sequence at end of input"));
1305
0
      goto err_out;
1306
0
    }
1307
  
1308
  /* At this point, everything is valid, and we just need to convert
1309
   */
1310
  /********** DIFFERENT for UTF8/UCS4 **********/
1311
0
  result = try_malloc_n (n_bytes + 4, 1, error);
1312
0
  if (result == NULL)
1313
0
      goto err_out;
1314
1315
0
  high_surrogate = 0;
1316
0
  out = result;
1317
0
  in = str;
1318
0
  while (out < result + n_bytes)
1319
0
    {
1320
0
      gunichar2 c = *in;
1321
0
      gunichar wc;
1322
1323
0
      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1324
0
  {
1325
0
    wc = SURROGATE_VALUE (high_surrogate, c);
1326
0
    high_surrogate = 0;
1327
0
  }
1328
0
      else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1329
0
  {
1330
0
    high_surrogate = c;
1331
0
    goto next2;
1332
0
  }
1333
0
      else
1334
0
  wc = c;
1335
1336
      /********** DIFFERENT for UTF8/UCS4 **********/
1337
0
      *(gunichar *)out = wc;
1338
0
      out += sizeof (gunichar);
1339
1340
0
    next2:
1341
0
      in++;
1342
0
    }
1343
1344
  /********** DIFFERENT for UTF8/UCS4 **********/
1345
0
  *(gunichar *)out = 0;
1346
1347
0
  if (items_written)
1348
    /********** DIFFERENT for UTF8/UCS4 **********/
1349
0
    *items_written = (out - result) / sizeof (gunichar);
1350
1351
0
 err_out:
1352
0
  if (items_read)
1353
0
    *items_read = in - str;
1354
1355
0
  return (gunichar *)result;
1356
0
}
1357
1358
/**
1359
 * g_utf8_to_utf16:
1360
 * @str: a UTF-8 encoded string
1361
 * @len: the maximum length (number of bytes) of @str to use.
1362
 *     If @len < 0, then the string is nul-terminated.
1363
 * @items_read: (out) (optional): location to store number of
1364
 *     bytes read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
1365
 *     be returned in case @str contains a trailing partial character. If
1366
 *     an error occurs then the index of the invalid input is stored here.
1367
 * @items_written: (out) (optional): location to store number
1368
 *     of #gunichar2 written, or %NULL. The value stored here does not include
1369
 *     the trailing 0.
1370
 * @error: location to store the error occurring, or %NULL to ignore
1371
 *     errors. Any of the errors in #GConvertError other than
1372
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
1373
 *
1374
 * Convert a string from UTF-8 to UTF-16. A 0 character will be
1375
 * added to the result after the converted text.
1376
 *
1377
 * Returns: (transfer full): a pointer to a newly allocated UTF-16 string.
1378
 *     This value must be freed with g_free(). If an error occurs,
1379
 *     %NULL will be returned and @error set.
1380
 */
1381
gunichar2 *
1382
g_utf8_to_utf16 (const gchar *str,
1383
     glong        len,
1384
     glong       *items_read,
1385
     glong       *items_written,
1386
     GError     **error)
1387
0
{
1388
0
  gunichar2 *result = NULL;
1389
0
  gint n16;
1390
0
  const gchar *in;
1391
0
  gint i;
1392
1393
0
  g_return_val_if_fail (str != NULL, NULL);
1394
1395
0
  in = str;
1396
0
  n16 = 0;
1397
0
  while ((len < 0 || str + len - in > 0) && *in)
1398
0
    {
1399
0
      gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
1400
0
      if (wc & 0x80000000)
1401
0
  {
1402
0
    if (wc == (gunichar)-2)
1403
0
      {
1404
0
        if (items_read)
1405
0
    break;
1406
0
        else
1407
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1408
0
                                     _("Partial character sequence at end of input"));
1409
0
      }
1410
0
    else
1411
0
      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1412
0
                                 _("Invalid byte sequence in conversion input"));
1413
1414
0
    goto err_out;
1415
0
  }
1416
1417
0
      if (wc < 0xd800)
1418
0
  n16 += 1;
1419
0
      else if (wc < 0xe000)
1420
0
  {
1421
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1422
0
                               _("Invalid sequence in conversion input"));
1423
1424
0
    goto err_out;
1425
0
  }
1426
0
      else if (wc < 0x10000)
1427
0
  n16 += 1;
1428
0
      else if (wc < 0x110000)
1429
0
  n16 += 2;
1430
0
      else
1431
0
  {
1432
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1433
0
                               _("Character out of range for UTF-16"));
1434
1435
0
    goto err_out;
1436
0
  }
1437
      
1438
0
      in = g_utf8_next_char (in);
1439
0
    }
1440
1441
0
  result = try_malloc_n (n16 + 1, sizeof (gunichar2), error);
1442
0
  if (result == NULL)
1443
0
      goto err_out;
1444
1445
0
  in = str;
1446
0
  for (i = 0; i < n16;)
1447
0
    {
1448
0
      gunichar wc = g_utf8_get_char (in);
1449
1450
0
      if (wc < 0x10000)
1451
0
  {
1452
0
    result[i++] = wc;
1453
0
  }
1454
0
      else
1455
0
  {
1456
0
    result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
1457
0
    result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
1458
0
  }
1459
      
1460
0
      in = g_utf8_next_char (in);
1461
0
    }
1462
1463
0
  result[i] = 0;
1464
1465
0
  if (items_written)
1466
0
    *items_written = n16;
1467
1468
0
 err_out:
1469
0
  if (items_read)
1470
0
    *items_read = in - str;
1471
  
1472
0
  return result;
1473
0
}
1474
1475
/**
1476
 * g_ucs4_to_utf16:
1477
 * @str: (array length=len) (element-type gunichar): a UCS-4 encoded string
1478
 * @len: the maximum length (number of characters) of @str to use. 
1479
 *     If @len < 0, then the string is nul-terminated.
1480
 * @items_read: (out) (optional): location to store number of
1481
 *     characters read, or %NULL. If an error occurs then the index of the invalid
1482
 *     input is stored here.
1483
 * @items_written: (out) (optional): location to store number
1484
 *     of #gunichar2  written, or %NULL. The value stored here does not include
1485
 *     the trailing 0.
1486
 * @error: location to store the error occurring, or %NULL to ignore
1487
 *     errors. Any of the errors in #GConvertError other than
1488
 *     %G_CONVERT_ERROR_NO_CONVERSION may occur.
1489
 *
1490
 * Convert a string from UCS-4 to UTF-16. A 0 character will be
1491
 * added to the result after the converted text.
1492
 * 
1493
 * Returns: (transfer full): a pointer to a newly allocated UTF-16 string.
1494
 *     This value must be freed with g_free(). If an error occurs,
1495
 *     %NULL will be returned and @error set.
1496
 */
1497
gunichar2 *
1498
g_ucs4_to_utf16 (const gunichar  *str,
1499
     glong            len,              
1500
     glong           *items_read,       
1501
     glong           *items_written,    
1502
     GError         **error)
1503
0
{
1504
0
  gunichar2 *result = NULL;
1505
0
  gint n16;
1506
0
  gint i, j;
1507
1508
0
  n16 = 0;
1509
0
  i = 0;
1510
0
  while ((len < 0 || i < len) && str[i])
1511
0
    {
1512
0
      gunichar wc = str[i];
1513
1514
0
      if (wc < 0xd800)
1515
0
  n16 += 1;
1516
0
      else if (wc < 0xe000)
1517
0
  {
1518
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1519
0
                               _("Invalid sequence in conversion input"));
1520
1521
0
    goto err_out;
1522
0
  }
1523
0
      else if (wc < 0x10000)
1524
0
  n16 += 1;
1525
0
      else if (wc < 0x110000)
1526
0
  n16 += 2;
1527
0
      else
1528
0
  {
1529
0
    g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1530
0
                               _("Character out of range for UTF-16"));
1531
1532
0
    goto err_out;
1533
0
  }
1534
1535
0
      i++;
1536
0
    }
1537
1538
0
  result = try_malloc_n (n16 + 1, sizeof (gunichar2), error);
1539
0
  if (result == NULL)
1540
0
      goto err_out;
1541
1542
0
  for (i = 0, j = 0; j < n16; i++)
1543
0
    {
1544
0
      gunichar wc = str[i];
1545
1546
0
      if (wc < 0x10000)
1547
0
  {
1548
0
    result[j++] = wc;
1549
0
  }
1550
0
      else
1551
0
  {
1552
0
    result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
1553
0
    result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
1554
0
  }
1555
0
    }
1556
0
  result[j] = 0;
1557
1558
0
  if (items_written)
1559
0
    *items_written = n16;
1560
  
1561
0
 err_out:
1562
0
  if (items_read)
1563
0
    *items_read = i;
1564
  
1565
0
  return result;
1566
0
}
1567
1568
/* Checks the byte under the enclosing function's cursor `p`: unless
 * (byte & mask) == expect, jumps to the enclosing function's `error` label.
 */
#define VALIDATE_BYTE(mask, expect)                              \
  G_STMT_START {                                                 \
    const guchar validate_byte_ = *(const guchar *) p;           \
    if (G_UNLIKELY ((validate_byte_ & (mask)) != (expect)))      \
      goto error;                                                \
  } G_STMT_END
1573
1574
/* see IETF RFC 3629 Section 4 */
1575
1576
static const gchar *
1577
fast_validate (const char *str)
1578
1579
9.97k
{
1580
9.97k
  const gchar *p;
1581
1582
214k
  for (p = str; *p; p++)
1583
204k
    {
1584
204k
      if (*(guchar *)p < 128)
1585
204k
  /* done */;
1586
0
      else 
1587
0
  {
1588
0
    const gchar *last;
1589
1590
0
    last = p;
1591
0
    if (*(guchar *)p < 0xe0) /* 110xxxxx */
1592
0
      {
1593
0
        if (G_UNLIKELY (*(guchar *)p < 0xc2))
1594
0
    goto error;
1595
0
      }
1596
0
    else
1597
0
      {
1598
0
        if (*(guchar *)p < 0xf0) /* 1110xxxx */
1599
0
    {
1600
0
      switch (*(guchar *)p++ & 0x0f)
1601
0
        {
1602
0
        case 0:
1603
0
          VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
1604
0
          break;
1605
0
        case 0x0d:
1606
0
          VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
1607
0
          break;
1608
0
        default:
1609
0
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1610
0
        }
1611
0
    }
1612
0
        else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
1613
0
    {
1614
0
      switch (*(guchar *)p++ & 0x07)
1615
0
        {
1616
0
        case 0:
1617
0
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1618
0
          if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
1619
0
      goto error;
1620
0
          break;
1621
0
        case 4:
1622
0
          VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
1623
0
          break;
1624
0
        default:
1625
0
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1626
0
        }
1627
0
      p++;
1628
0
      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1629
0
    }
1630
0
        else
1631
0
    goto error;
1632
0
      }
1633
1634
0
    p++;
1635
0
    VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1636
1637
0
    continue;
1638
1639
0
  error:
1640
0
    return last;
1641
0
  }
1642
204k
    }
1643
1644
9.97k
  return p;
1645
9.97k
}
1646
1647
static const gchar *
1648
fast_validate_len (const char *str,
1649
       gssize      max_len)
1650
1651
72.2k
{
1652
72.2k
  const gchar *p;
1653
1654
72.2k
  g_assert (max_len >= 0);
1655
1656
149M
  for (p = str; ((p - str) < max_len) && *p; p++)
1657
149M
    {
1658
149M
      if (*(guchar *)p < 128)
1659
149M
  /* done */;
1660
86.1k
      else 
1661
86.1k
  {
1662
86.1k
    const gchar *last;
1663
1664
86.1k
    last = p;
1665
86.1k
    if (*(guchar *)p < 0xe0) /* 110xxxxx */
1666
14.5k
      {
1667
14.5k
        if (G_UNLIKELY (max_len - (p - str) < 2))
1668
12
    goto error;
1669
        
1670
14.5k
        if (G_UNLIKELY (*(guchar *)p < 0xc2))
1671
82
    goto error;
1672
14.5k
      }
1673
71.5k
    else
1674
71.5k
      {
1675
71.5k
        if (*(guchar *)p < 0xf0) /* 1110xxxx */
1676
38.7k
    {
1677
38.7k
      if (G_UNLIKELY (max_len - (p - str) < 3))
1678
6
        goto error;
1679
1680
38.7k
      switch (*(guchar *)p++ & 0x0f)
1681
38.7k
        {
1682
7.84k
        case 0:
1683
7.84k
          VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
1684
7.83k
          break;
1685
7.83k
        case 0x0d:
1686
329
          VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
1687
320
          break;
1688
30.5k
        default:
1689
30.5k
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1690
38.7k
        }
1691
38.7k
    }
1692
32.8k
        else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */
1693
32.8k
    {
1694
32.8k
      if (G_UNLIKELY (max_len - (p - str) < 4))
1695
8
        goto error;
1696
1697
32.8k
      switch (*(guchar *)p++ & 0x07)
1698
32.8k
        {
1699
2.31k
        case 0:
1700
2.31k
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1701
2.30k
          if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))
1702
2
      goto error;
1703
2.30k
          break;
1704
2.30k
        case 4:
1705
296
          VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
1706
290
          break;
1707
30.2k
        default:
1708
30.2k
          VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1709
32.8k
        }
1710
32.7k
      p++;
1711
32.7k
      VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1712
32.7k
    }
1713
49
        else
1714
49
    goto error;
1715
71.5k
      }
1716
1717
85.8k
    p++;
1718
85.8k
    VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
1719
1720
85.8k
    continue;
1721
1722
85.8k
  error:
1723
284
    return last;
1724
85.8k
  }
1725
149M
    }
1726
1727
71.9k
  return p;
1728
72.2k
}
1729
1730
/**
1731
 * g_utf8_validate:
1732
 * @str: (array length=max_len) (element-type guint8): a pointer to character data
1733
 * @max_len: max bytes to validate, or -1 to go until NUL
1734
 * @end: (out) (optional) (transfer none): return location for end of valid data
1735
 * 
1736
 * Validates UTF-8 encoded text. @str is the text to validate;
1737
 * if @str is nul-terminated, then @max_len can be -1, otherwise
1738
 * @max_len should be the number of bytes to validate.
1739
 * If @end is non-%NULL, then the end of the valid range
1740
 * will be stored there (i.e. the start of the first invalid 
1741
 * character if some bytes were invalid, or the end of the text 
1742
 * being validated otherwise).
1743
 *
1744
 * Note that g_utf8_validate() returns %FALSE if @max_len is 
1745
 * positive and any of the @max_len bytes are nul.
1746
 *
1747
 * Returns %TRUE if all of @str was valid. Many GLib and GTK
1748
 * routines require valid UTF-8 as input; so data read from a file
1749
 * or the network should be checked with g_utf8_validate() before
1750
 * doing anything else with it.
1751
 * 
1752
 * Returns: %TRUE if the text was valid UTF-8
1753
 */
1754
gboolean
1755
g_utf8_validate (const char   *str,
1756
     gssize        max_len,    
1757
     const gchar **end)
1758
1759
82.1k
{
1760
82.1k
  const gchar *p;
1761
1762
82.1k
  if (max_len >= 0)
1763
72.2k
    return g_utf8_validate_len (str, max_len, end);
1764
1765
9.97k
  p = fast_validate (str);
1766
1767
9.97k
  if (end)
1768
4
    *end = p;
1769
1770
9.97k
  if (*p != '\0')
1771
0
    return FALSE;
1772
9.97k
  else
1773
9.97k
    return TRUE;
1774
9.97k
}
1775
1776
/**
1777
 * g_utf8_validate_len:
1778
 * @str: (array length=max_len) (element-type guint8): a pointer to character data
1779
 * @max_len: max bytes to validate
1780
 * @end: (out) (optional) (transfer none): return location for end of valid data
1781
 *
1782
 * Validates UTF-8 encoded text.
1783
 *
1784
 * As with g_utf8_validate(), but @max_len must be set, and hence this function
1785
 * will always return %FALSE if any of the bytes of @str are nul.
1786
 *
1787
 * Returns: %TRUE if the text was valid UTF-8
1788
 * Since: 2.60
1789
 */
1790
gboolean
1791
g_utf8_validate_len (const char   *str,
1792
                     gsize         max_len,
1793
                     const gchar **end)
1794
1795
72.2k
{
1796
72.2k
  const gchar *p;
1797
1798
72.2k
  p = fast_validate_len (str, max_len);
1799
1800
72.2k
  if (end)
1801
64.0k
    *end = p;
1802
1803
72.2k
  if (p != str + max_len)
1804
290
    return FALSE;
1805
71.9k
  else
1806
71.9k
    return TRUE;
1807
72.2k
}
1808
1809
/**
1810
 * g_unichar_validate:
1811
 * @ch: a Unicode character
1812
 * 
1813
 * Checks whether @ch is a valid Unicode character. Some possible
1814
 * integer values of @ch will not be valid. 0 is considered a valid
1815
 * character, though it's normally a string terminator.
1816
 * 
1817
 * Returns: %TRUE if @ch is a valid Unicode character
1818
 **/
1819
gboolean
1820
g_unichar_validate (gunichar ch)
1821
13.8k
{
1822
13.8k
  return UNICODE_VALID (ch);
1823
13.8k
}
1824
1825
/**
1826
 * g_utf8_strreverse:
1827
 * @str: a UTF-8 encoded string
1828
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
1829
 *     then the string is nul-terminated.
1830
 *
1831
 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text. 
1832
 * (Use g_utf8_validate() on all text before trying to use UTF-8 
1833
 * utility functions with it.)
1834
 *
1835
 * This function is intended for programmatic uses of reversed strings.
1836
 * It pays no attention to decomposed characters, combining marks, byte 
1837
 * order marks, directional indicators (LRM, LRO, etc) and similar 
1838
 * characters which might need special handling when reversing a string 
1839
 * for display purposes.
1840
 *
1841
 * Note that unlike g_strreverse(), this function returns
1842
 * newly-allocated memory, which should be freed with g_free() when
1843
 * no longer needed. 
1844
 *
1845
 * Returns: (transfer full): a newly-allocated string which is the reverse of @str
1846
 *
1847
 * Since: 2.2
1848
 */
1849
gchar *
1850
g_utf8_strreverse (const gchar *str,
1851
       gssize       len)
1852
0
{
1853
0
  gchar *r, *result;
1854
0
  const gchar *p;
1855
1856
0
  if (len < 0)
1857
0
    len = strlen (str);
1858
1859
0
  result = g_new (gchar, len + 1);
1860
0
  r = result + len;
1861
0
  p = str;
1862
0
  while (r > result)
1863
0
    {
1864
0
      gchar *m, skip = g_utf8_skip[*(guchar*) p];
1865
0
      r -= skip;
1866
0
      g_assert (r >= result);
1867
0
      for (m = r; skip; skip--)
1868
0
        *m++ = *p++;
1869
0
    }
1870
0
  result[len] = 0;
1871
1872
0
  return result;
1873
0
}
1874
1875
/**
1876
 * g_utf8_make_valid:
1877
 * @str: string to coerce into UTF-8
1878
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
1879
 *     then the string is nul-terminated.
1880
 *
1881
 * If the provided string is valid UTF-8, return a copy of it. If not,
1882
 * return a copy in which bytes that could not be interpreted as valid Unicode
1883
 * are replaced with the Unicode replacement character (U+FFFD).
1884
 *
1885
 * For example, this is an appropriate function to use if you have received
1886
 * a string that was incorrectly declared to be UTF-8, and you need a valid
1887
 * UTF-8 version of it that can be logged or displayed to the user, with the
1888
 * assumption that it is close enough to ASCII or UTF-8 to be mostly
1889
 * readable as-is.
1890
 *
1891
 * Returns: (transfer full): a valid UTF-8 string whose content resembles @str
1892
 *
1893
 * Since: 2.52
1894
 */
1895
gchar *
1896
g_utf8_make_valid (const gchar *str,
1897
                   gssize       len)
1898
0
{
1899
0
  GString *string;
1900
0
  const gchar *remainder, *invalid;
1901
0
  gsize remaining_bytes, valid_bytes;
1902
1903
0
  g_return_val_if_fail (str != NULL, NULL);
1904
1905
0
  if (len < 0)
1906
0
    len = strlen (str);
1907
1908
0
  string = NULL;
1909
0
  remainder = str;
1910
0
  remaining_bytes = len;
1911
1912
0
  while (remaining_bytes != 0) 
1913
0
    {
1914
0
      if (g_utf8_validate (remainder, remaining_bytes, &invalid)) 
1915
0
  break;
1916
0
      valid_bytes = invalid - remainder;
1917
    
1918
0
      if (string == NULL) 
1919
0
  string = g_string_sized_new (remaining_bytes);
1920
1921
0
      g_string_append_len (string, remainder, valid_bytes);
1922
      /* append U+FFFD REPLACEMENT CHARACTER */
1923
0
      g_string_append (string, "\357\277\275");
1924
      
1925
0
      remaining_bytes -= valid_bytes + 1;
1926
0
      remainder = invalid + 1;
1927
0
    }
1928
  
1929
0
  if (string == NULL)
1930
0
    return g_strndup (str, len);
1931
  
1932
0
  g_string_append_len (string, remainder, remaining_bytes);
1933
0
  g_string_append_c (string, '\0');
1934
1935
0
  g_assert (g_utf8_validate (string->str, -1, NULL));
1936
1937
0
  return g_string_free (string, FALSE);
1938
0
}