Coverage Report

Created: 2025-07-11 07:03

/src/rauc/subprojects/glib-2.76.5/glib/gunicollate.c
Line
Count
Source (jump to first uncovered line)
1
/* gunicollate.c - Collation
2
 *
3
 *  Copyright 2001,2005 Red Hat, Inc.
4
 *
5
 * SPDX-License-Identifier: LGPL-2.1-or-later
6
 *
7
 * This library is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * This library is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with this library; if not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#include "config.h"
22
23
#include <locale.h>
24
#include <string.h>
25
#ifdef HAVE_WCHAR_H
26
#include <wchar.h>
27
#endif
28
29
#ifdef HAVE_CARBON
30
#include <CoreServices/CoreServices.h>
31
#endif
32
33
#include "gmem.h"
34
#include "gunicode.h"
35
#include "gunicodeprivate.h"
36
#include "gstring.h"
37
#include "gstrfuncs.h"
38
#include "gtestutils.h"
39
#include "gcharset.h"
40
#include "gconvert.h"
41
42
#if SIZEOF_WCHAR_T == 4 && defined(__STDC_ISO_10646__)
43
#define GUNICHAR_EQUALS_WCHAR_T 1
44
#endif
45
46
#ifdef _MSC_VER
47
/* Workaround for bug in MSVCR80.DLL */
48
static gsize
49
msc_strxfrm_wrapper (char       *string1,
50
         const char *string2,
51
         gsize       count)
52
{
53
  if (!string1 || count <= 0)
54
    {
55
      char tmp;
56
57
      return strxfrm (&tmp, string2, 1);
58
    }
59
  return strxfrm (string1, string2, count);
60
}
61
#define strxfrm msc_strxfrm_wrapper
62
#endif
63
64
/**
65
 * g_utf8_collate:
66
 * @str1: a UTF-8 encoded string
67
 * @str2: a UTF-8 encoded string
68
 * 
69
 * Compares two strings for ordering using the linguistically
70
 * correct rules for the [current locale][setlocale].
71
 * When sorting a large number of strings, it will be significantly 
72
 * faster to obtain collation keys with g_utf8_collate_key() and 
73
 * compare the keys with strcmp() when sorting instead of sorting 
74
 * the original strings.
75
 * 
76
 * If the two strings are not comparable due to being in different collation
77
 * sequences, the result is undefined. This can happen if the strings are in
78
 * different language scripts, for example.
79
 *
80
 * Returns: < 0 if @str1 compares before @str2, 
81
 *   0 if they compare equal, > 0 if @str1 compares after @str2.
82
 **/
83
gint
84
g_utf8_collate (const gchar *str1,
85
    const gchar *str2)
86
0
{
87
0
  gint result;
88
89
#ifdef HAVE_CARBON
90
91
  UniChar *str1_utf16;
92
  UniChar *str2_utf16;
93
  glong len1;
94
  glong len2;
95
  SInt32 retval = 0;
96
97
  g_return_val_if_fail (str1 != NULL, 0);
98
  g_return_val_if_fail (str2 != NULL, 0);
99
100
  str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
101
  str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
102
103
  UCCompareTextDefault (kUCCollateStandardOptions,
104
                        str1_utf16, len1, str2_utf16, len2,
105
                        NULL, &retval);
106
  result = retval;
107
108
  g_free (str2_utf16);
109
  g_free (str1_utf16);
110
111
#elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
112
113
0
  gunichar *str1_norm;
114
0
  gunichar *str2_norm;
115
116
0
  g_return_val_if_fail (str1 != NULL, 0);
117
0
  g_return_val_if_fail (str2 != NULL, 0);
118
119
0
  str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
120
0
  str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
121
122
0
  result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
123
124
0
  g_free (str1_norm);
125
0
  g_free (str2_norm);
126
127
#else
128
129
  const gchar *charset;
130
  gchar *str1_norm;
131
  gchar *str2_norm;
132
133
  g_return_val_if_fail (str1 != NULL, 0);
134
  g_return_val_if_fail (str2 != NULL, 0);
135
136
  str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
137
  str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
138
139
  if (g_get_charset (&charset))
140
    {
141
      result = strcoll (str1_norm, str2_norm);
142
    }
143
  else
144
    {
145
      gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
146
      gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
147
148
      if (str1_locale && str2_locale)
149
  result =  strcoll (str1_locale, str2_locale);
150
      else if (str1_locale)
151
  result = -1;
152
      else if (str2_locale)
153
  result = 1;
154
      else
155
  result = strcmp (str1_norm, str2_norm);
156
157
      g_free (str1_locale);
158
      g_free (str2_locale);
159
    }
160
161
  g_free (str1_norm);
162
  g_free (str2_norm);
163
164
#endif
165
166
0
  return result;
167
0
}
168
169
#if defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
170
/* We need UTF-8 encoding of numbers to encode the weights if
171
 * we are using wcsxfrm. However, we aren't encoding Unicode
172
 * characters, so we can't simply use g_unichar_to_utf8.
173
 *
174
 * The following routine is taken (with modification) from GNU
175
 * libc's strxfrm routine:
176
 *
177
 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
178
 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
179
 */
180
static inline int
181
utf8_encode (char *buf, wchar_t val)
182
0
{
183
0
  int retval;
184
185
0
  if (val < 0x80)
186
0
    {
187
0
      if (buf)
188
0
  *buf++ = (char) val;
189
0
      retval = 1;
190
0
    }
191
0
  else
192
0
    {
193
0
      int step;
194
195
0
      for (step = 2; step < 6; ++step)
196
0
        if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
197
0
          break;
198
0
      retval = step;
199
200
0
      if (buf)
201
0
  {
202
0
    *buf = (unsigned char) (~0xff >> step);
203
0
    --step;
204
0
    do
205
0
      {
206
0
        buf[step] = 0x80 | (val & 0x3f);
207
0
        val >>= 6;
208
0
      }
209
0
    while (--step > 0);
210
0
    *buf |= val;
211
0
  }
212
0
    }
213
214
0
  return retval;
215
0
}
216
#endif
217
218
#ifdef HAVE_CARBON
219
220
static gchar *
221
collate_key_to_string (UCCollationValue *key,
222
                       gsize             key_len)
223
{
224
  gchar *result;
225
  gsize result_len;
226
  long *lkey = (long *) key;
227
228
  /* UCCollationValue format:
229
   *
230
   * UCCollateOptions (32/64 bits)
231
   * SizeInBytes      (32/64 bits)
232
   * Value            (8 bits arrey)
233
   *
234
   * UCCollateOptions: ordering option mask of the collator
235
   * used to create the key. Size changes on 32-bit / 64-bit
236
   * hosts. On 64-bits also the extra half-word seems to have
237
   * some extra (unknown) meaning.
238
   * SizeInBytes: size of the whole structure, in bytes
239
   * (including UCCollateOptions and SizeInBytes fields). Size
240
   * changes on 32-bit & 64-bit hosts.
241
   * Value: array of bytes containing the comparison weights.
242
   * Seems to have several sub-strings separated by \001 and \002
243
   * chars. Also, experience shows this is directly strcmp-able.
244
   */
245
246
  result_len = lkey[1];
247
  result = g_malloc (result_len + 1);
248
  memcpy (result, &lkey[2], result_len);
249
  result[result_len] = '\0';
250
251
  return result;
252
}
253
254
static gchar *
255
carbon_collate_key_with_collator (const gchar *str,
256
                                  gssize       len,
257
                                  CollatorRef  collator)
258
{
259
  UniChar *str_utf16 = NULL;
260
  glong len_utf16;
261
  OSStatus ret;
262
  UCCollationValue staticbuf[512];
263
  UCCollationValue *freeme = NULL;
264
  UCCollationValue *buf;
265
  ItemCount buf_len;
266
  ItemCount key_len;
267
  ItemCount try_len;
268
  gchar *result = NULL;
269
270
  str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
271
  try_len = len_utf16 * 5 + 2;
272
273
  if (try_len <= sizeof staticbuf)
274
    {
275
      buf = staticbuf;
276
      buf_len = sizeof staticbuf;
277
    }
278
  else
279
    {
280
      freeme = g_new (UCCollationValue, try_len);
281
      buf = freeme;
282
      buf_len = try_len;
283
    }
284
285
  ret = UCGetCollationKey (collator, str_utf16, len_utf16,
286
                           buf_len, &key_len, buf);
287
288
  if (ret == kCollateBufferTooSmall)
289
    {
290
      freeme = g_renew (UCCollationValue, freeme, try_len * 2);
291
      buf = freeme;
292
      buf_len = try_len * 2;
293
      ret = UCGetCollationKey (collator, str_utf16, len_utf16,
294
                               buf_len, &key_len, buf);
295
    }
296
297
  if (ret == 0)
298
    result = collate_key_to_string (buf, key_len);
299
  else
300
    result = g_strdup ("");
301
302
  g_free (freeme);
303
  g_free (str_utf16);
304
  return result;
305
}
306
307
static gchar *
308
carbon_collate_key (const gchar *str,
309
                    gssize       len)
310
{
311
  static CollatorRef collator;
312
313
  if (G_UNLIKELY (!collator))
314
    {
315
      UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
316
317
      if (!collator)
318
        {
319
          static gboolean been_here;
320
          if (!been_here)
321
            g_warning ("%s: UCCreateCollator failed", G_STRLOC);
322
          been_here = TRUE;
323
          return g_strdup ("");
324
        }
325
    }
326
327
  return carbon_collate_key_with_collator (str, len, collator);
328
}
329
330
static gchar *
331
carbon_collate_key_for_filename (const gchar *str,
332
                                 gssize       len)
333
{
334
  static CollatorRef collator;
335
336
  if (G_UNLIKELY (!collator))
337
    {
338
      /* http://developer.apple.com/qa/qa2004/qa1159.html */
339
      UCCreateCollator (NULL, 0,
340
                        kUCCollateComposeInsensitiveMask
341
                         | kUCCollateWidthInsensitiveMask
342
                         | kUCCollateCaseInsensitiveMask
343
                         | kUCCollateDigitsOverrideMask
344
                         | kUCCollateDigitsAsNumberMask
345
                         | kUCCollatePunctuationSignificantMask, 
346
                        &collator);
347
348
      if (!collator)
349
        {
350
          static gboolean been_here;
351
          if (!been_here)
352
            g_warning ("%s: UCCreateCollator failed", G_STRLOC);
353
          been_here = TRUE;
354
          return g_strdup ("");
355
        }
356
    }
357
358
  return carbon_collate_key_with_collator (str, len, collator);
359
}
360
361
#endif /* HAVE_CARBON */
362
363
/**
364
 * g_utf8_collate_key:
365
 * @str: a UTF-8 encoded string.
366
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
367
 *
368
 * Converts a string into a collation key that can be compared
369
 * with other collation keys produced by the same function using 
370
 * strcmp(). 
371
 *
372
 * The results of comparing the collation keys of two strings 
373
 * with strcmp() will always be the same as comparing the two 
374
 * original keys with g_utf8_collate().
375
 * 
376
 * Note that this function depends on the [current locale][setlocale].
377
 * 
378
 * Returns: a newly allocated string. This string should
379
 *   be freed with g_free() when you are done with it.
380
 **/
381
gchar *
382
g_utf8_collate_key (const gchar *str,
383
        gssize       len)
384
0
{
385
0
  gchar *result;
386
387
#ifdef HAVE_CARBON
388
389
  g_return_val_if_fail (str != NULL, NULL);
390
  result = carbon_collate_key (str, len);
391
392
#elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
393
394
0
  gsize xfrm_len;
395
0
  gunichar *str_norm;
396
0
  wchar_t *result_wc;
397
0
  gsize i;
398
0
  gsize result_len = 0;
399
400
0
  g_return_val_if_fail (str != NULL, NULL);
401
402
0
  str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
403
404
0
  xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
405
0
  result_wc = g_new (wchar_t, xfrm_len + 1);
406
0
  wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
407
408
0
  for (i = 0; i < xfrm_len; i++)
409
0
    result_len += utf8_encode (NULL, result_wc[i]);
410
411
0
  result = g_malloc (result_len + 1);
412
0
  result_len = 0;
413
0
  for (i = 0; i < xfrm_len; i++)
414
0
    result_len += utf8_encode (result + result_len, result_wc[i]);
415
416
0
  result[result_len] = '\0';
417
418
0
  g_free (result_wc);
419
0
  g_free (str_norm);
420
421
0
  return result;
422
#else
423
424
  gsize xfrm_len = 0;
425
  const gchar *charset;
426
  gchar *str_norm;
427
428
  g_return_val_if_fail (str != NULL, NULL);
429
430
  str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
431
432
  result = NULL;
433
434
  if (g_get_charset (&charset))
435
    {
436
      xfrm_len = strxfrm (NULL, str_norm, 0);
437
      if (xfrm_len < G_MAXINT - 2)
438
        {
439
          result = g_malloc (xfrm_len + 1);
440
          strxfrm (result, str_norm, xfrm_len + 1);
441
        }
442
    }
443
  else
444
    {
445
      gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
446
447
      if (str_locale)
448
  {
449
    xfrm_len = strxfrm (NULL, str_locale, 0);
450
    if (xfrm_len >= G_MAXINT - 2)
451
      {
452
        g_free (str_locale);
453
        str_locale = NULL;
454
      }
455
  }
456
      if (str_locale)
457
  {
458
    result = g_malloc (xfrm_len + 2);
459
    result[0] = 'A';
460
    strxfrm (result + 1, str_locale, xfrm_len + 1);
461
    
462
    g_free (str_locale);
463
  }
464
    }
465
    
466
  if (!result) 
467
    {
468
      xfrm_len = strlen (str_norm);
469
      result = g_malloc (xfrm_len + 2);
470
      result[0] = 'B';
471
      memcpy (result + 1, str_norm, xfrm_len);
472
      result[xfrm_len+1] = '\0';
473
    }
474
475
  g_free (str_norm);
476
#endif
477
478
0
  return result;
479
0
}
480
481
/* This is a collation key that is very very likely to sort before any
482
 * collation key that libc strxfrm generates. We use this before any
483
 * special case (dot or number) to make sure that its sorted before
484
 * anything else.
485
 */
486
#define COLLATION_SENTINEL "\1\1\1"
487
488
/**
489
 * g_utf8_collate_key_for_filename:
490
 * @str: a UTF-8 encoded string.
491
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
492
 *
493
 * Converts a string into a collation key that can be compared
494
 * with other collation keys produced by the same function using strcmp(). 
495
 * 
496
 * In order to sort filenames correctly, this function treats the dot '.' 
497
 * as a special case. Most dictionary orderings seem to consider it
498
 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
499
 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
500
 * would like to treat numbers intelligently so that "file1" "file10" "file5"
501
 * is sorted as "file1" "file5" "file10".
502
 * 
503
 * Note that this function depends on the [current locale][setlocale].
504
 *
505
 * Returns: a newly allocated string. This string should
506
 *   be freed with g_free() when you are done with it.
507
 *
508
 * Since: 2.8
509
 */
510
gchar *
511
g_utf8_collate_key_for_filename (const gchar *str,
512
         gssize       len)
513
0
{
514
0
#ifndef HAVE_CARBON
515
0
  GString *result;
516
0
  GString *append;
517
0
  const gchar *p;
518
0
  const gchar *prev;
519
0
  const gchar *end;
520
0
  gchar *collate_key;
521
0
  gint digits;
522
0
  gint leading_zeros;
523
524
  /*
525
   * How it works:
526
   *
527
   * Split the filename into collatable substrings which do
528
   * not contain [.0-9] and special-cased substrings. The collatable 
529
   * substrings are run through the normal g_utf8_collate_key() and the 
530
   * resulting keys are concatenated with keys generated from the 
531
   * special-cased substrings.
532
   *
533
   * Special cases: Dots are handled by replacing them with '\1' which 
534
   * implies that short dot-delimited substrings are before long ones, 
535
   * e.g.
536
   * 
537
   *   a\1a   (a.a)
538
   *   a-\1a  (a-.a)
539
   *   aa\1a  (aa.a)
540
   * 
541
   * Numbers are handled by prepending to each number d-1 superdigits 
542
   * where d = number of digits in the number and SUPERDIGIT is a 
543
   * character with an integer value higher than any digit (for instance 
544
   * ':'). This ensures that single-digit numbers are sorted before 
545
   * double-digit numbers which in turn are sorted separately from 
546
   * triple-digit numbers, etc. To avoid strange side-effects when 
547
   * sorting strings that already contain SUPERDIGITs, a '\2'
548
   * is also prepended, like this
549
   *
550
   *   file\21      (file1)
551
   *   file\25      (file5)
552
   *   file\2:10    (file10)
553
   *   file\2:26    (file26)
554
   *   file\2::100  (file100)
555
   *   file:foo     (file:foo)
556
   * 
557
   * This has the side-effect of sorting numbers before everything else (except
558
   * dots), but this is probably OK.
559
   *
560
   * Leading digits are ignored when doing the above. To discriminate
561
   * numbers which differ only in the number of leading digits, we append
562
   * the number of leading digits as a byte at the very end of the collation
563
   * key.
564
   *
565
   * To try avoid conflict with any collation key sequence generated by libc we
566
   * start each switch to a special cased part with a sentinel that hopefully
567
   * will sort before anything libc will generate.
568
   */
569
570
0
  if (len < 0)
571
0
    len = strlen (str);
572
573
0
  result = g_string_sized_new (len * 2);
574
0
  append = g_string_sized_new (0);
575
576
0
  end = str + len;
577
578
  /* No need to use utf8 functions, since we're only looking for ascii chars */
579
0
  for (prev = p = str; p < end; p++)
580
0
    {
581
0
      switch (*p)
582
0
  {
583
0
  case '.':
584
0
    if (prev != p) 
585
0
      {
586
0
        collate_key = g_utf8_collate_key (prev, p - prev);
587
0
        g_string_append (result, collate_key);
588
0
        g_free (collate_key);
589
0
      }
590
    
591
0
    g_string_append (result, COLLATION_SENTINEL "\1");
592
    
593
    /* skip the dot */
594
0
    prev = p + 1;
595
0
    break;
596
    
597
0
  case '0':
598
0
  case '1':
599
0
  case '2':
600
0
  case '3':
601
0
  case '4':
602
0
  case '5':
603
0
  case '6':
604
0
  case '7':
605
0
  case '8':
606
0
  case '9':
607
0
    if (prev != p) 
608
0
      {
609
0
        collate_key = g_utf8_collate_key (prev, p - prev);
610
0
        g_string_append (result, collate_key);
611
0
        g_free (collate_key);
612
0
      }
613
    
614
0
    g_string_append (result, COLLATION_SENTINEL "\2");
615
    
616
0
    prev = p;
617
    
618
    /* write d-1 colons */
619
0
    if (*p == '0')
620
0
      {
621
0
        leading_zeros = 1;
622
0
        digits = 0;
623
0
      }
624
0
    else
625
0
      {
626
0
        leading_zeros = 0;
627
0
        digits = 1;
628
0
      }
629
    
630
0
    while (++p < end)
631
0
      {
632
0
        if (*p == '0' && !digits)
633
0
    ++leading_zeros;
634
0
        else if (g_ascii_isdigit(*p))
635
0
    ++digits;
636
0
        else
637
0
                {
638
      /* count an all-zero sequence as
639
                   * one digit plus leading zeros
640
                   */
641
0
              if (!digits)
642
0
                    {
643
0
                      ++digits;
644
0
                      --leading_zeros;
645
0
                    }        
646
0
      break;
647
0
                }
648
0
      }
649
650
0
    while (digits > 1)
651
0
      {
652
0
        g_string_append_c (result, ':');
653
0
        --digits;
654
0
      }
655
656
0
    if (leading_zeros > 0)
657
0
      {
658
0
        g_string_append_c (append, (char)leading_zeros);
659
0
        prev += leading_zeros;
660
0
      }
661
    
662
    /* write the number itself */
663
0
    g_string_append_len (result, prev, p - prev);
664
    
665
0
    prev = p;
666
0
    --p;    /* go one step back to avoid disturbing outer loop */
667
0
    break;
668
    
669
0
  default:
670
    /* other characters just accumulate */
671
0
    break;
672
0
  }
673
0
    }
674
  
675
0
  if (prev != p) 
676
0
    {
677
0
      collate_key = g_utf8_collate_key (prev, p - prev);
678
0
      g_string_append (result, collate_key);
679
0
      g_free (collate_key);
680
0
    }
681
  
682
0
  g_string_append (result, append->str);
683
0
  g_string_free (append, TRUE);
684
685
0
  return g_string_free (result, FALSE);
686
#else /* HAVE_CARBON */
687
  return carbon_collate_key_for_filename (str, len);
688
#endif
689
0
}