Coverage Report

Created: 2025-11-16 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/glib-2.80.0/glib/gunicollate.c
Line
Count
Source
1
/* gunicollate.c - Collation
2
 *
3
 *  Copyright 2001,2005 Red Hat, Inc.
4
 *
5
 * SPDX-License-Identifier: LGPL-2.1-or-later
6
 *
7
 * This library is free software; you can redistribute it and/or
8
 * modify it under the terms of the GNU Lesser General Public
9
 * License as published by the Free Software Foundation; either
10
 * version 2.1 of the License, or (at your option) any later version.
11
 *
12
 * This library is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
 * Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with this library; if not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#include "config.h"
22
23
#include <locale.h>
24
#include <string.h>
25
#ifdef HAVE_WCHAR_H
26
#include <wchar.h>
27
#endif
28
29
#ifdef HAVE_CARBON
30
#include <CoreServices/CoreServices.h>
31
#endif
32
33
#include "gmem.h"
34
#include "gunicode.h"
35
#include "gunicodeprivate.h"
36
#include "gstring.h"
37
#include "gstrfuncs.h"
38
#include "gtestutils.h"
39
#include "gcharset.h"
40
#include "gconvert.h"
41
42
#if SIZEOF_WCHAR_T == 4 && defined(__STDC_ISO_10646__)
43
#define GUNICHAR_EQUALS_WCHAR_T 1
44
#endif
45
46
#ifdef _MSC_VER
47
/* Workaround for bug in MSVCR80.DLL */
48
static gsize
49
msc_strxfrm_wrapper (char       *string1,
50
         const char *string2,
51
         gsize       count)
52
{
53
  if (!string1 || count <= 0)
54
    {
55
      char tmp;
56
57
      return strxfrm (&tmp, string2, 1);
58
    }
59
  return strxfrm (string1, string2, count);
60
}
61
#define strxfrm msc_strxfrm_wrapper
62
#endif
63
64
/**
65
 * g_utf8_collate:
66
 * @str1: a UTF-8 encoded string
67
 * @str2: a UTF-8 encoded string
68
 * 
69
 * Compares two strings for ordering using the linguistically
70
 * correct rules for the [current locale][setlocale].
71
 * When sorting a large number of strings, it will be significantly 
72
 * faster to obtain collation keys with g_utf8_collate_key() and 
73
 * compare the keys with strcmp() when sorting instead of sorting 
74
 * the original strings.
75
 * 
76
 * If the two strings are not comparable due to being in different collation
77
 * sequences, the result is undefined. This can happen if the strings are in
78
 * different language scripts, for example.
79
 *
80
 * Returns: < 0 if @str1 compares before @str2, 
81
 *   0 if they compare equal, > 0 if @str1 compares after @str2.
82
 **/
83
gint
84
g_utf8_collate (const gchar *str1,
85
    const gchar *str2)
86
0
{
87
0
  gint result;
88
89
#ifdef HAVE_CARBON
90
91
  UniChar *str1_utf16;
92
  UniChar *str2_utf16;
93
  glong len1;
94
  glong len2;
95
  SInt32 retval = 0;
96
97
  g_return_val_if_fail (str1 != NULL, 0);
98
  g_return_val_if_fail (str2 != NULL, 0);
99
100
  str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
101
  str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
102
103
  UCCompareTextDefault (kUCCollateStandardOptions,
104
                        str1_utf16, len1, str2_utf16, len2,
105
                        NULL, &retval);
106
  result = retval;
107
108
  g_free (str2_utf16);
109
  g_free (str1_utf16);
110
111
#elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
112
113
0
  gunichar *str1_norm;
114
0
  gunichar *str2_norm;
115
116
0
  g_return_val_if_fail (str1 != NULL, 0);
117
0
  g_return_val_if_fail (str2 != NULL, 0);
118
119
0
  str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
120
0
  str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
121
122
0
  result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
123
124
0
  g_free (str1_norm);
125
0
  g_free (str2_norm);
126
127
#else
128
129
  const gchar *charset;
130
  gchar *str1_norm;
131
  gchar *str2_norm;
132
133
  g_return_val_if_fail (str1 != NULL, 0);
134
  g_return_val_if_fail (str2 != NULL, 0);
135
136
  str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
137
  str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
138
139
  if (g_get_charset (&charset))
140
    {
141
      result = strcoll (str1_norm, str2_norm);
142
    }
143
  else
144
    {
145
      gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
146
      gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
147
148
      if (str1_locale && str2_locale)
149
  result =  strcoll (str1_locale, str2_locale);
150
      else if (str1_locale)
151
  result = -1;
152
      else if (str2_locale)
153
  result = 1;
154
      else
155
  result = strcmp (str1_norm, str2_norm);
156
157
      g_free (str1_locale);
158
      g_free (str2_locale);
159
    }
160
161
  g_free (str1_norm);
162
  g_free (str2_norm);
163
164
#endif
165
166
0
  return result;
167
0
}
168
169
#if defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
170
/* We need UTF-8 encoding of numbers to encode the weights if
171
 * we are using wcsxfrm. However, we aren't encoding Unicode
172
 * characters, so we can't simply use g_unichar_to_utf8.
173
 *
174
 * The following routine is taken (with modification) from GNU
175
 * libc's strxfrm routine:
176
 *
177
 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
178
 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
179
 */
180
static inline int
181
utf8_encode (char *buf, wchar_t val)
182
0
{
183
0
  int retval;
184
185
0
  if (val < 0x80)
186
0
    {
187
0
      if (buf)
188
0
  *buf++ = (char) val;
189
0
      retval = 1;
190
0
    }
191
0
  else
192
0
    {
193
0
      int step;
194
195
0
      for (step = 2; step < 6; ++step)
196
0
        if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
197
0
          break;
198
0
      retval = step;
199
200
0
      if (buf)
201
0
  {
202
0
    *buf = (unsigned char) (~0xff >> step);
203
0
    --step;
204
0
    do
205
0
      {
206
0
        buf[step] = 0x80 | (val & 0x3f);
207
0
        val >>= 6;
208
0
      }
209
0
    while (--step > 0);
210
0
    *buf |= val;
211
0
  }
212
0
    }
213
214
0
  return retval;
215
0
}
216
#endif
217
218
#ifdef HAVE_CARBON
219
220
static gchar *
221
collate_key_to_string (UCCollationValue *key,
222
                       gsize             key_len)
223
{
224
  gchar *result;
225
  gsize result_len;
226
  long *lkey = (long *) key;
227
228
  /* UCCollationValue format:
229
   *
230
   * UCCollateOptions (32/64 bits)
231
   * SizeInBytes      (32/64 bits)
232
   * Value            (8 bits arrey)
233
   *
234
   * UCCollateOptions: ordering option mask of the collator
235
   * used to create the key. Size changes on 32-bit / 64-bit
236
   * hosts. On 64-bits also the extra half-word seems to have
237
   * some extra (unknown) meaning.
238
   * SizeInBytes: size of the whole structure, in bytes
239
   * (including UCCollateOptions and SizeInBytes fields). Size
240
   * changes on 32-bit & 64-bit hosts.
241
   * Value: array of bytes containing the comparison weights.
242
   * Seems to have several sub-strings separated by \001 and \002
243
   * chars. Also, experience shows this is directly strcmp-able.
244
   */
245
246
  result_len = lkey[1];
247
  result = g_malloc (result_len + 1);
248
  memcpy (result, &lkey[2], result_len);
249
  result[result_len] = '\0';
250
251
  return result;
252
}
253
254
static gchar *
255
carbon_collate_key_with_collator (const gchar *str,
256
                                  gssize       len,
257
                                  CollatorRef  collator)
258
{
259
  UniChar *str_utf16 = NULL;
260
  glong len_utf16;
261
  OSStatus ret;
262
  UCCollationValue staticbuf[512];
263
  UCCollationValue *freeme = NULL;
264
  UCCollationValue *buf;
265
  ItemCount buf_len;
266
  ItemCount key_len;
267
  ItemCount try_len;
268
  gchar *result = NULL;
269
270
  str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
271
  try_len = len_utf16 * 5 + 2;
272
273
  if (try_len <= sizeof staticbuf)
274
    {
275
      buf = staticbuf;
276
      buf_len = sizeof staticbuf;
277
    }
278
  else
279
    {
280
      freeme = g_new (UCCollationValue, try_len);
281
      buf = freeme;
282
      buf_len = try_len;
283
    }
284
285
  ret = UCGetCollationKey (collator, str_utf16, len_utf16,
286
                           buf_len, &key_len, buf);
287
288
  if (ret == kCollateBufferTooSmall)
289
    {
290
      freeme = g_renew (UCCollationValue, freeme, try_len * 2);
291
      buf = freeme;
292
      buf_len = try_len * 2;
293
      ret = UCGetCollationKey (collator, str_utf16, len_utf16,
294
                               buf_len, &key_len, buf);
295
    }
296
297
  if (ret == 0)
298
    result = collate_key_to_string (buf, key_len);
299
  else
300
    result = g_strdup ("");
301
302
  g_free (freeme);
303
  g_free (str_utf16);
304
  return result;
305
}
306
307
static gchar *
308
carbon_collate_key (const gchar *str,
309
                    gssize       len)
310
{
311
  static CollatorRef collator;
312
313
  if (G_UNLIKELY (!collator))
314
    {
315
      UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
316
317
      if (!collator)
318
        {
319
          static gboolean been_here;
320
          if (!been_here)
321
            g_warning ("%s: UCCreateCollator failed", G_STRLOC);
322
          been_here = TRUE;
323
          return g_strdup ("");
324
        }
325
    }
326
327
  return carbon_collate_key_with_collator (str, len, collator);
328
}
329
330
static gchar *
331
carbon_collate_key_for_filename (const gchar *str,
332
                                 gssize       len)
333
{
334
  static CollatorRef collator;
335
336
  if (G_UNLIKELY (!collator))
337
    {
338
      /* http://developer.apple.com/qa/qa2004/qa1159.html */
339
      UCCreateCollator (NULL, 0,
340
                        kUCCollateComposeInsensitiveMask
341
                         | kUCCollateWidthInsensitiveMask
342
                         | kUCCollateCaseInsensitiveMask
343
                         | kUCCollateDigitsOverrideMask
344
                         | kUCCollateDigitsAsNumberMask
345
                         | kUCCollatePunctuationSignificantMask, 
346
                        &collator);
347
348
      if (!collator)
349
        {
350
          static gboolean been_here;
351
          if (!been_here)
352
            g_warning ("%s: UCCreateCollator failed", G_STRLOC);
353
          been_here = TRUE;
354
          return g_strdup ("");
355
        }
356
    }
357
358
  return carbon_collate_key_with_collator (str, len, collator);
359
}
360
361
#endif /* HAVE_CARBON */
362
363
/**
364
 * g_utf8_collate_key:
365
 * @str: a UTF-8 encoded string.
366
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
367
 *
368
 * Converts a string into a collation key that can be compared
369
 * with other collation keys produced by the same function using 
370
 * strcmp(). 
371
 *
372
 * The results of comparing the collation keys of two strings 
373
 * with strcmp() will always be the same as comparing the two 
374
 * original keys with g_utf8_collate().
375
 * 
376
 * Note that this function depends on the [current locale][setlocale].
377
 * 
378
 * Returns: a newly allocated string. This string should
379
 *   be freed with g_free() when you are done with it.
380
 **/
381
gchar *
382
g_utf8_collate_key (const gchar *str,
383
        gssize       len)
384
0
{
385
0
  gchar *result;
386
387
#ifdef HAVE_CARBON
388
389
  g_return_val_if_fail (str != NULL, NULL);
390
  result = carbon_collate_key (str, len);
391
392
#elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
393
394
0
  gsize xfrm_len;
395
0
  gunichar *str_norm;
396
0
  wchar_t *result_wc;
397
0
  gsize i;
398
0
  gsize result_len = 0;
399
400
0
  g_return_val_if_fail (str != NULL, NULL);
401
402
0
  str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
403
404
0
  g_return_val_if_fail (str_norm != NULL, NULL);
405
406
0
  xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
407
0
  result_wc = g_new (wchar_t, xfrm_len + 1);
408
0
  wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
409
410
0
  for (i = 0; i < xfrm_len; i++)
411
0
    result_len += utf8_encode (NULL, result_wc[i]);
412
413
0
  result = g_malloc (result_len + 1);
414
0
  result_len = 0;
415
0
  for (i = 0; i < xfrm_len; i++)
416
0
    result_len += utf8_encode (result + result_len, result_wc[i]);
417
418
0
  result[result_len] = '\0';
419
420
0
  g_free (result_wc);
421
0
  g_free (str_norm);
422
423
0
  return result;
424
#else
425
426
  gsize xfrm_len = 0;
427
  const gchar *charset;
428
  gchar *str_norm;
429
430
  g_return_val_if_fail (str != NULL, NULL);
431
432
  str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
433
434
  result = NULL;
435
436
  if (g_get_charset (&charset))
437
    {
438
      xfrm_len = strxfrm (NULL, str_norm, 0);
439
      if (xfrm_len < G_MAXINT - 2)
440
        {
441
          result = g_malloc (xfrm_len + 1);
442
          strxfrm (result, str_norm, xfrm_len + 1);
443
        }
444
    }
445
  else
446
    {
447
      gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
448
449
      if (str_locale)
450
  {
451
    xfrm_len = strxfrm (NULL, str_locale, 0);
452
    if (xfrm_len >= G_MAXINT - 2)
453
      {
454
        g_free (str_locale);
455
        str_locale = NULL;
456
      }
457
  }
458
      if (str_locale)
459
  {
460
    result = g_malloc (xfrm_len + 2);
461
    result[0] = 'A';
462
    strxfrm (result + 1, str_locale, xfrm_len + 1);
463
    
464
    g_free (str_locale);
465
  }
466
    }
467
    
468
  if (!result) 
469
    {
470
      xfrm_len = strlen (str_norm);
471
      result = g_malloc (xfrm_len + 2);
472
      result[0] = 'B';
473
      memcpy (result + 1, str_norm, xfrm_len);
474
      result[xfrm_len+1] = '\0';
475
    }
476
477
  g_free (str_norm);
478
#endif
479
480
0
  return result;
481
0
}
482
483
/* This is a collation key that is very very likely to sort before any
484
 * collation key that libc strxfrm generates. We use this before any
485
 * special case (dot or number) to make sure that its sorted before
486
 * anything else.
487
 */
488
#define COLLATION_SENTINEL "\1\1\1"
489
490
/**
491
 * g_utf8_collate_key_for_filename:
492
 * @str: a UTF-8 encoded string.
493
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
494
 *
495
 * Converts a string into a collation key that can be compared
496
 * with other collation keys produced by the same function using strcmp(). 
497
 * 
498
 * In order to sort filenames correctly, this function treats the dot '.' 
499
 * as a special case. Most dictionary orderings seem to consider it
500
 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
501
 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
502
 * would like to treat numbers intelligently so that "file1" "file10" "file5"
503
 * is sorted as "file1" "file5" "file10".
504
 * 
505
 * Note that this function depends on the [current locale][setlocale].
506
 *
507
 * Returns: a newly allocated string. This string should
508
 *   be freed with g_free() when you are done with it.
509
 *
510
 * Since: 2.8
511
 */
512
gchar *
513
g_utf8_collate_key_for_filename (const gchar *str,
514
         gssize       len)
515
0
{
516
0
#ifndef HAVE_CARBON
517
0
  GString *result;
518
0
  GString *append;
519
0
  const gchar *p;
520
0
  const gchar *prev;
521
0
  const gchar *end;
522
0
  gchar *collate_key;
523
0
  gint digits;
524
0
  gint leading_zeros;
525
526
  /*
527
   * How it works:
528
   *
529
   * Split the filename into collatable substrings which do
530
   * not contain [.0-9] and special-cased substrings. The collatable 
531
   * substrings are run through the normal g_utf8_collate_key() and the 
532
   * resulting keys are concatenated with keys generated from the 
533
   * special-cased substrings.
534
   *
535
   * Special cases: Dots are handled by replacing them with '\1' which 
536
   * implies that short dot-delimited substrings are before long ones, 
537
   * e.g.
538
   * 
539
   *   a\1a   (a.a)
540
   *   a-\1a  (a-.a)
541
   *   aa\1a  (aa.a)
542
   * 
543
   * Numbers are handled by prepending to each number d-1 superdigits 
544
   * where d = number of digits in the number and SUPERDIGIT is a 
545
   * character with an integer value higher than any digit (for instance 
546
   * ':'). This ensures that single-digit numbers are sorted before 
547
   * double-digit numbers which in turn are sorted separately from 
548
   * triple-digit numbers, etc. To avoid strange side-effects when 
549
   * sorting strings that already contain SUPERDIGITs, a '\2'
550
   * is also prepended, like this
551
   *
552
   *   file\21      (file1)
553
   *   file\25      (file5)
554
   *   file\2:10    (file10)
555
   *   file\2:26    (file26)
556
   *   file\2::100  (file100)
557
   *   file:foo     (file:foo)
558
   * 
559
   * This has the side-effect of sorting numbers before everything else (except
560
   * dots), but this is probably OK.
561
   *
562
   * Leading digits are ignored when doing the above. To discriminate
563
   * numbers which differ only in the number of leading digits, we append
564
   * the number of leading digits as a byte at the very end of the collation
565
   * key.
566
   *
567
   * To try avoid conflict with any collation key sequence generated by libc we
568
   * start each switch to a special cased part with a sentinel that hopefully
569
   * will sort before anything libc will generate.
570
   */
571
572
0
  if (len < 0)
573
0
    len = strlen (str);
574
575
0
  result = g_string_sized_new (len * 2);
576
0
  append = g_string_sized_new (0);
577
578
0
  end = str + len;
579
580
  /* No need to use utf8 functions, since we're only looking for ascii chars */
581
0
  for (prev = p = str; p < end; p++)
582
0
    {
583
0
      switch (*p)
584
0
  {
585
0
  case '.':
586
0
    if (prev != p) 
587
0
      {
588
0
        collate_key = g_utf8_collate_key (prev, p - prev);
589
0
        g_string_append (result, collate_key);
590
0
        g_free (collate_key);
591
0
      }
592
    
593
0
    g_string_append (result, COLLATION_SENTINEL "\1");
594
    
595
    /* skip the dot */
596
0
    prev = p + 1;
597
0
    break;
598
    
599
0
  case '0':
600
0
  case '1':
601
0
  case '2':
602
0
  case '3':
603
0
  case '4':
604
0
  case '5':
605
0
  case '6':
606
0
  case '7':
607
0
  case '8':
608
0
  case '9':
609
0
    if (prev != p) 
610
0
      {
611
0
        collate_key = g_utf8_collate_key (prev, p - prev);
612
0
        g_string_append (result, collate_key);
613
0
        g_free (collate_key);
614
0
      }
615
    
616
0
    g_string_append (result, COLLATION_SENTINEL "\2");
617
    
618
0
    prev = p;
619
    
620
    /* write d-1 colons */
621
0
    if (*p == '0')
622
0
      {
623
0
        leading_zeros = 1;
624
0
        digits = 0;
625
0
      }
626
0
    else
627
0
      {
628
0
        leading_zeros = 0;
629
0
        digits = 1;
630
0
      }
631
    
632
0
    while (++p < end)
633
0
      {
634
0
        if (*p == '0' && !digits)
635
0
    ++leading_zeros;
636
0
        else if (g_ascii_isdigit(*p))
637
0
    ++digits;
638
0
        else
639
0
                {
640
      /* count an all-zero sequence as
641
                   * one digit plus leading zeros
642
                   */
643
0
              if (!digits)
644
0
                    {
645
0
                      ++digits;
646
0
                      --leading_zeros;
647
0
                    }        
648
0
      break;
649
0
                }
650
0
      }
651
652
0
    while (digits > 1)
653
0
      {
654
0
        g_string_append_c (result, ':');
655
0
        --digits;
656
0
      }
657
658
0
    if (leading_zeros > 0)
659
0
      {
660
0
        g_string_append_c (append, (char)leading_zeros);
661
0
        prev += leading_zeros;
662
0
      }
663
    
664
    /* write the number itself */
665
0
    g_string_append_len (result, prev, p - prev);
666
    
667
0
    prev = p;
668
0
    --p;    /* go one step back to avoid disturbing outer loop */
669
0
    break;
670
    
671
0
  default:
672
    /* other characters just accumulate */
673
0
    break;
674
0
  }
675
0
    }
676
  
677
0
  if (prev != p) 
678
0
    {
679
0
      collate_key = g_utf8_collate_key (prev, p - prev);
680
0
      g_string_append (result, collate_key);
681
0
      g_free (collate_key);
682
0
    }
683
  
684
0
  g_string_append (result, append->str);
685
0
  g_string_free (append, TRUE);
686
687
0
  return g_string_free (result, FALSE);
688
#else /* HAVE_CARBON */
689
  return carbon_collate_key_for_filename (str, len);
690
#endif
691
0
}