Coverage Report

Created: 2025-06-22 06:29

/src/glib/glib/gunicollate.c
Line
Count
Source (jump to first uncovered line)
1
/* gunicollate.c - Collation
2
 *
3
 *  Copyright 2001,2005 Red Hat, Inc.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2.1 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public License
16
 * along with this library; if not, see <http://www.gnu.org/licenses/>.
17
 */
18
19
#include "config.h"
20
21
#include <locale.h>
22
#include <string.h>
23
#ifdef HAVE_WCHAR_H
24
#include <wchar.h>
25
#endif
26
27
#ifdef HAVE_CARBON
28
#include <CoreServices/CoreServices.h>
29
#endif
30
31
#include "gmem.h"
32
#include "gunicode.h"
33
#include "gunicodeprivate.h"
34
#include "gstring.h"
35
#include "gstrfuncs.h"
36
#include "gtestutils.h"
37
#include "gcharset.h"
38
#include "gconvert.h"
39
40
#if SIZEOF_WCHAR_T == 4 && defined(__STDC_ISO_10646__)
41
#define GUNICHAR_EQUALS_WCHAR_T 1
42
#endif
43
44
#ifdef _MSC_VER
45
/* Workaround for bug in MSVCR80.DLL */
46
static gsize
47
msc_strxfrm_wrapper (char       *string1,
48
         const char *string2,
49
         gsize       count)
50
{
51
  if (!string1 || count <= 0)
52
    {
53
      char tmp;
54
55
      return strxfrm (&tmp, string2, 1);
56
    }
57
  return strxfrm (string1, string2, count);
58
}
59
#define strxfrm msc_strxfrm_wrapper
60
#endif
61
62
/**
63
 * g_utf8_collate:
64
 * @str1: a UTF-8 encoded string
65
 * @str2: a UTF-8 encoded string
66
 * 
67
 * Compares two strings for ordering using the linguistically
68
 * correct rules for the [current locale][setlocale].
69
 * When sorting a large number of strings, it will be significantly 
70
 * faster to obtain collation keys with g_utf8_collate_key() and 
71
 * compare the keys with strcmp() when sorting instead of sorting 
72
 * the original strings.
73
 * 
74
 * Returns: < 0 if @str1 compares before @str2, 
75
 *   0 if they compare equal, > 0 if @str1 compares after @str2.
76
 **/
77
gint
78
g_utf8_collate (const gchar *str1,
79
    const gchar *str2)
80
0
{
81
0
  gint result;
82
83
#ifdef HAVE_CARBON
84
85
  UniChar *str1_utf16;
86
  UniChar *str2_utf16;
87
  glong len1;
88
  glong len2;
89
  SInt32 retval = 0;
90
91
  g_return_val_if_fail (str1 != NULL, 0);
92
  g_return_val_if_fail (str2 != NULL, 0);
93
94
  str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
95
  str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
96
97
  UCCompareTextDefault (kUCCollateStandardOptions,
98
                        str1_utf16, len1, str2_utf16, len2,
99
                        NULL, &retval);
100
  result = retval;
101
102
  g_free (str2_utf16);
103
  g_free (str1_utf16);
104
105
#elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
106
107
0
  gunichar *str1_norm;
108
0
  gunichar *str2_norm;
109
110
0
  g_return_val_if_fail (str1 != NULL, 0);
111
0
  g_return_val_if_fail (str2 != NULL, 0);
112
113
0
  str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
114
0
  str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
115
116
0
  result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
117
118
0
  g_free (str1_norm);
119
0
  g_free (str2_norm);
120
121
#else
122
123
  const gchar *charset;
124
  gchar *str1_norm;
125
  gchar *str2_norm;
126
127
  g_return_val_if_fail (str1 != NULL, 0);
128
  g_return_val_if_fail (str2 != NULL, 0);
129
130
  str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
131
  str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
132
133
  if (g_get_charset (&charset))
134
    {
135
      result = strcoll (str1_norm, str2_norm);
136
    }
137
  else
138
    {
139
      gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
140
      gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
141
142
      if (str1_locale && str2_locale)
143
  result =  strcoll (str1_locale, str2_locale);
144
      else if (str1_locale)
145
  result = -1;
146
      else if (str2_locale)
147
  result = 1;
148
      else
149
  result = strcmp (str1_norm, str2_norm);
150
151
      g_free (str1_locale);
152
      g_free (str2_locale);
153
    }
154
155
  g_free (str1_norm);
156
  g_free (str2_norm);
157
158
#endif
159
160
0
  return result;
161
0
}
162
163
#if defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
164
/* We need UTF-8 encoding of numbers to encode the weights if
165
 * we are using wcsxfrm. However, we aren't encoding Unicode
166
 * characters, so we can't simply use g_unichar_to_utf8.
167
 *
168
 * The following routine is taken (with modification) from GNU
169
 * libc's strxfrm routine:
170
 *
171
 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
172
 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
173
 */
174
static inline int
175
utf8_encode (char *buf, wchar_t val)
176
0
{
177
0
  int retval;
178
179
0
  if (val < 0x80)
180
0
    {
181
0
      if (buf)
182
0
  *buf++ = (char) val;
183
0
      retval = 1;
184
0
    }
185
0
  else
186
0
    {
187
0
      int step;
188
189
0
      for (step = 2; step < 6; ++step)
190
0
        if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
191
0
          break;
192
0
      retval = step;
193
194
0
      if (buf)
195
0
  {
196
0
    *buf = (unsigned char) (~0xff >> step);
197
0
    --step;
198
0
    do
199
0
      {
200
0
        buf[step] = 0x80 | (val & 0x3f);
201
0
        val >>= 6;
202
0
      }
203
0
    while (--step > 0);
204
0
    *buf |= val;
205
0
  }
206
0
    }
207
208
0
  return retval;
209
0
}
210
#endif
211
212
#ifdef HAVE_CARBON
213
214
static gchar *
215
collate_key_to_string (UCCollationValue *key,
216
                       gsize             key_len)
217
{
218
  gchar *result;
219
  gsize result_len;
220
  long *lkey = (long *) key;
221
222
  /* UCCollationValue format:
223
   *
224
   * UCCollateOptions (32/64 bits)
225
   * SizeInBytes      (32/64 bits)
226
   * Value            (8 bits arrey)
227
   *
228
   * UCCollateOptions: ordering option mask of the collator
229
   * used to create the key. Size changes on 32-bit / 64-bit
230
   * hosts. On 64-bits also the extra half-word seems to have
231
   * some extra (unknown) meaning.
232
   * SizeInBytes: size of the whole structure, in bytes
233
   * (including UCCollateOptions and SizeInBytes fields). Size
234
   * changes on 32-bit & 64-bit hosts.
235
   * Value: array of bytes containing the comparison weights.
236
   * Seems to have several sub-strings separated by \001 and \002
237
   * chars. Also, experience shows this is directly strcmp-able.
238
   */
239
240
  result_len = lkey[1];
241
  result = g_malloc (result_len + 1);
242
  memcpy (result, &lkey[2], result_len);
243
  result[result_len] = '\0';
244
245
  return result;
246
}
247
248
static gchar *
249
carbon_collate_key_with_collator (const gchar *str,
250
                                  gssize       len,
251
                                  CollatorRef  collator)
252
{
253
  UniChar *str_utf16 = NULL;
254
  glong len_utf16;
255
  OSStatus ret;
256
  UCCollationValue staticbuf[512];
257
  UCCollationValue *freeme = NULL;
258
  UCCollationValue *buf;
259
  ItemCount buf_len;
260
  ItemCount key_len;
261
  ItemCount try_len;
262
  gchar *result = NULL;
263
264
  str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
265
  try_len = len_utf16 * 5 + 2;
266
267
  if (try_len <= sizeof staticbuf)
268
    {
269
      buf = staticbuf;
270
      buf_len = sizeof staticbuf;
271
    }
272
  else
273
    {
274
      freeme = g_new (UCCollationValue, try_len);
275
      buf = freeme;
276
      buf_len = try_len;
277
    }
278
279
  ret = UCGetCollationKey (collator, str_utf16, len_utf16,
280
                           buf_len, &key_len, buf);
281
282
  if (ret == kCollateBufferTooSmall)
283
    {
284
      freeme = g_renew (UCCollationValue, freeme, try_len * 2);
285
      buf = freeme;
286
      buf_len = try_len * 2;
287
      ret = UCGetCollationKey (collator, str_utf16, len_utf16,
288
                               buf_len, &key_len, buf);
289
    }
290
291
  if (ret == 0)
292
    result = collate_key_to_string (buf, key_len);
293
  else
294
    result = g_strdup ("");
295
296
  g_free (freeme);
297
  g_free (str_utf16);
298
  return result;
299
}
300
301
static gchar *
302
carbon_collate_key (const gchar *str,
303
                    gssize       len)
304
{
305
  static CollatorRef collator;
306
307
  if (G_UNLIKELY (!collator))
308
    {
309
      UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
310
311
      if (!collator)
312
        {
313
          static gboolean been_here;
314
          if (!been_here)
315
            g_warning ("%s: UCCreateCollator failed", G_STRLOC);
316
          been_here = TRUE;
317
          return g_strdup ("");
318
        }
319
    }
320
321
  return carbon_collate_key_with_collator (str, len, collator);
322
}
323
324
static gchar *
325
carbon_collate_key_for_filename (const gchar *str,
326
                                 gssize       len)
327
{
328
  static CollatorRef collator;
329
330
  if (G_UNLIKELY (!collator))
331
    {
332
      /* http://developer.apple.com/qa/qa2004/qa1159.html */
333
      UCCreateCollator (NULL, 0,
334
                        kUCCollateComposeInsensitiveMask
335
                         | kUCCollateWidthInsensitiveMask
336
                         | kUCCollateCaseInsensitiveMask
337
                         | kUCCollateDigitsOverrideMask
338
                         | kUCCollateDigitsAsNumberMask
339
                         | kUCCollatePunctuationSignificantMask, 
340
                        &collator);
341
342
      if (!collator)
343
        {
344
          static gboolean been_here;
345
          if (!been_here)
346
            g_warning ("%s: UCCreateCollator failed", G_STRLOC);
347
          been_here = TRUE;
348
          return g_strdup ("");
349
        }
350
    }
351
352
  return carbon_collate_key_with_collator (str, len, collator);
353
}
354
355
#endif /* HAVE_CARBON */
356
357
/**
358
 * g_utf8_collate_key:
359
 * @str: a UTF-8 encoded string.
360
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
361
 *
362
 * Converts a string into a collation key that can be compared
363
 * with other collation keys produced by the same function using 
364
 * strcmp(). 
365
 *
366
 * The results of comparing the collation keys of two strings 
367
 * with strcmp() will always be the same as comparing the two 
368
 * original keys with g_utf8_collate().
369
 * 
370
 * Note that this function depends on the [current locale][setlocale].
371
 * 
372
 * Returns: a newly allocated string. This string should
373
 *   be freed with g_free() when you are done with it.
374
 **/
375
gchar *
376
g_utf8_collate_key (const gchar *str,
377
        gssize       len)
378
0
{
379
0
  gchar *result;
380
381
#ifdef HAVE_CARBON
382
383
  g_return_val_if_fail (str != NULL, NULL);
384
  result = carbon_collate_key (str, len);
385
386
#elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
387
388
0
  gsize xfrm_len;
389
0
  gunichar *str_norm;
390
0
  wchar_t *result_wc;
391
0
  gsize i;
392
0
  gsize result_len = 0;
393
394
0
  g_return_val_if_fail (str != NULL, NULL);
395
396
0
  str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
397
398
0
  xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
399
0
  result_wc = g_new (wchar_t, xfrm_len + 1);
400
0
  wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
401
402
0
  for (i = 0; i < xfrm_len; i++)
403
0
    result_len += utf8_encode (NULL, result_wc[i]);
404
405
0
  result = g_malloc (result_len + 1);
406
0
  result_len = 0;
407
0
  for (i = 0; i < xfrm_len; i++)
408
0
    result_len += utf8_encode (result + result_len, result_wc[i]);
409
410
0
  result[result_len] = '\0';
411
412
0
  g_free (result_wc);
413
0
  g_free (str_norm);
414
415
0
  return result;
416
#else
417
418
  gsize xfrm_len;
419
  const gchar *charset;
420
  gchar *str_norm;
421
422
  g_return_val_if_fail (str != NULL, NULL);
423
424
  str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
425
426
  result = NULL;
427
428
  if (g_get_charset (&charset))
429
    {
430
      xfrm_len = strxfrm (NULL, str_norm, 0);
431
      if (xfrm_len < G_MAXINT - 2)
432
        {
433
          result = g_malloc (xfrm_len + 1);
434
          strxfrm (result, str_norm, xfrm_len + 1);
435
        }
436
    }
437
  else
438
    {
439
      gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
440
441
      if (str_locale)
442
  {
443
    xfrm_len = strxfrm (NULL, str_locale, 0);
444
    if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
445
      {
446
        g_free (str_locale);
447
        str_locale = NULL;
448
      }
449
  }
450
      if (str_locale)
451
  {
452
    result = g_malloc (xfrm_len + 2);
453
    result[0] = 'A';
454
    strxfrm (result + 1, str_locale, xfrm_len + 1);
455
    
456
    g_free (str_locale);
457
  }
458
    }
459
    
460
  if (!result) 
461
    {
462
      xfrm_len = strlen (str_norm);
463
      result = g_malloc (xfrm_len + 2);
464
      result[0] = 'B';
465
      memcpy (result + 1, str_norm, xfrm_len);
466
      result[xfrm_len+1] = '\0';
467
    }
468
469
  g_free (str_norm);
470
#endif
471
472
0
  return result;
473
0
}
474
475
/* This is a collation key that is very very likely to sort before any
476
 * collation key that libc strxfrm generates. We use this before any
477
 * special case (dot or number) to make sure that its sorted before
478
 * anything else.
479
 */
480
0
#define COLLATION_SENTINEL "\1\1\1"
481
482
/**
483
 * g_utf8_collate_key_for_filename:
484
 * @str: a UTF-8 encoded string.
485
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
486
 *
487
 * Converts a string into a collation key that can be compared
488
 * with other collation keys produced by the same function using strcmp(). 
489
 * 
490
 * In order to sort filenames correctly, this function treats the dot '.' 
491
 * as a special case. Most dictionary orderings seem to consider it
492
 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
493
 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
494
 * would like to treat numbers intelligently so that "file1" "file10" "file5"
495
 * is sorted as "file1" "file5" "file10".
496
 * 
497
 * Note that this function depends on the [current locale][setlocale].
498
 *
499
 * Returns: a newly allocated string. This string should
500
 *   be freed with g_free() when you are done with it.
501
 *
502
 * Since: 2.8
503
 */
504
gchar *
505
g_utf8_collate_key_for_filename (const gchar *str,
506
         gssize       len)
507
0
{
508
0
#ifndef HAVE_CARBON
509
0
  GString *result;
510
0
  GString *append;
511
0
  const gchar *p;
512
0
  const gchar *prev;
513
0
  const gchar *end;
514
0
  gchar *collate_key;
515
0
  gint digits;
516
0
  gint leading_zeros;
517
518
  /*
519
   * How it works:
520
   *
521
   * Split the filename into collatable substrings which do
522
   * not contain [.0-9] and special-cased substrings. The collatable 
523
   * substrings are run through the normal g_utf8_collate_key() and the 
524
   * resulting keys are concatenated with keys generated from the 
525
   * special-cased substrings.
526
   *
527
   * Special cases: Dots are handled by replacing them with '\1' which 
528
   * implies that short dot-delimited substrings are before long ones, 
529
   * e.g.
530
   * 
531
   *   a\1a   (a.a)
532
   *   a-\1a  (a-.a)
533
   *   aa\1a  (aa.a)
534
   * 
535
   * Numbers are handled by prepending to each number d-1 superdigits 
536
   * where d = number of digits in the number and SUPERDIGIT is a 
537
   * character with an integer value higher than any digit (for instance 
538
   * ':'). This ensures that single-digit numbers are sorted before 
539
   * double-digit numbers which in turn are sorted separately from 
540
   * triple-digit numbers, etc. To avoid strange side-effects when 
541
   * sorting strings that already contain SUPERDIGITs, a '\2'
542
   * is also prepended, like this
543
   *
544
   *   file\21      (file1)
545
   *   file\25      (file5)
546
   *   file\2:10    (file10)
547
   *   file\2:26    (file26)
548
   *   file\2::100  (file100)
549
   *   file:foo     (file:foo)
550
   * 
551
   * This has the side-effect of sorting numbers before everything else (except
552
   * dots), but this is probably OK.
553
   *
554
   * Leading digits are ignored when doing the above. To discriminate
555
   * numbers which differ only in the number of leading digits, we append
556
   * the number of leading digits as a byte at the very end of the collation
557
   * key.
558
   *
559
   * To try avoid conflict with any collation key sequence generated by libc we
560
   * start each switch to a special cased part with a sentinel that hopefully
561
   * will sort before anything libc will generate.
562
   */
563
564
0
  if (len < 0)
565
0
    len = strlen (str);
566
567
0
  result = g_string_sized_new (len * 2);
568
0
  append = g_string_sized_new (0);
569
570
0
  end = str + len;
571
572
  /* No need to use utf8 functions, since we're only looking for ascii chars */
573
0
  for (prev = p = str; p < end; p++)
574
0
    {
575
0
      switch (*p)
576
0
  {
577
0
  case '.':
578
0
    if (prev != p) 
579
0
      {
580
0
        collate_key = g_utf8_collate_key (prev, p - prev);
581
0
        g_string_append (result, collate_key);
582
0
        g_free (collate_key);
583
0
      }
584
    
585
0
    g_string_append (result, COLLATION_SENTINEL "\1");
586
    
587
    /* skip the dot */
588
0
    prev = p + 1;
589
0
    break;
590
    
591
0
  case '0':
592
0
  case '1':
593
0
  case '2':
594
0
  case '3':
595
0
  case '4':
596
0
  case '5':
597
0
  case '6':
598
0
  case '7':
599
0
  case '8':
600
0
  case '9':
601
0
    if (prev != p) 
602
0
      {
603
0
        collate_key = g_utf8_collate_key (prev, p - prev);
604
0
        g_string_append (result, collate_key);
605
0
        g_free (collate_key);
606
0
      }
607
    
608
0
    g_string_append (result, COLLATION_SENTINEL "\2");
609
    
610
0
    prev = p;
611
    
612
    /* write d-1 colons */
613
0
    if (*p == '0')
614
0
      {
615
0
        leading_zeros = 1;
616
0
        digits = 0;
617
0
      }
618
0
    else
619
0
      {
620
0
        leading_zeros = 0;
621
0
        digits = 1;
622
0
      }
623
    
624
0
    while (++p < end)
625
0
      {
626
0
        if (*p == '0' && !digits)
627
0
    ++leading_zeros;
628
0
        else if (g_ascii_isdigit(*p))
629
0
    ++digits;
630
0
        else
631
0
                {
632
      /* count an all-zero sequence as
633
                   * one digit plus leading zeros
634
                   */
635
0
              if (!digits)
636
0
                    {
637
0
                      ++digits;
638
0
                      --leading_zeros;
639
0
                    }        
640
0
      break;
641
0
                }
642
0
      }
643
644
0
    while (digits > 1)
645
0
      {
646
0
        g_string_append_c (result, ':');
647
0
        --digits;
648
0
      }
649
650
0
    if (leading_zeros > 0)
651
0
      {
652
0
        g_string_append_c (append, (char)leading_zeros);
653
0
        prev += leading_zeros;
654
0
      }
655
    
656
    /* write the number itself */
657
0
    g_string_append_len (result, prev, p - prev);
658
    
659
0
    prev = p;
660
0
    --p;    /* go one step back to avoid disturbing outer loop */
661
0
    break;
662
    
663
0
  default:
664
    /* other characters just accumulate */
665
0
    break;
666
0
  }
667
0
    }
668
  
669
0
  if (prev != p) 
670
0
    {
671
0
      collate_key = g_utf8_collate_key (prev, p - prev);
672
0
      g_string_append (result, collate_key);
673
0
      g_free (collate_key);
674
0
    }
675
  
676
0
  g_string_append (result, append->str);
677
0
  g_string_free (append, TRUE);
678
679
0
  return g_string_free (result, FALSE);
680
#else /* HAVE_CARBON */
681
  return carbon_collate_key_for_filename (str, len);
682
#endif
683
0
}