Coverage Report

Created: 2025-07-03 06:49

/src/postgres/src/common/unicode_norm.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 * unicode_norm.c
3
 *    Normalize a Unicode string
4
 *
5
 * This implements Unicode normalization, per the documentation at
6
 * https://www.unicode.org/reports/tr15/.
7
 *
8
 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
9
 *
10
 * IDENTIFICATION
11
 *    src/common/unicode_norm.c
12
 *
13
 *-------------------------------------------------------------------------
14
 */
15
#ifndef FRONTEND
16
#include "postgres.h"
17
#else
18
#include "postgres_fe.h"
19
#endif
20
21
#include "common/unicode_norm.h"
22
#ifndef FRONTEND
23
#include "common/unicode_norm_hashfunc.h"
24
#include "common/unicode_normprops_table.h"
25
#include "port/pg_bswap.h"
26
#else
27
#include "common/unicode_norm_table.h"
28
#endif
29
30
#ifndef FRONTEND
31
0
#define ALLOC(size) palloc(size)
32
0
#define FREE(size) pfree(size)
33
#else
34
#define ALLOC(size) malloc(size)
35
#define FREE(size) free(size)
36
#endif
37
38
/* Constants for calculations with Hangul characters */
39
0
#define SBASE   0xAC00    /* U+AC00 */
40
0
#define LBASE   0x1100    /* U+1100 */
41
0
#define VBASE   0x1161    /* U+1161 */
42
0
#define TBASE   0x11A7    /* U+11A7 */
43
0
#define LCOUNT    19
44
0
#define VCOUNT    21
45
0
#define TCOUNT    28
46
0
#define NCOUNT    VCOUNT * TCOUNT
47
0
#define SCOUNT    LCOUNT * NCOUNT
48
49
#ifdef FRONTEND
50
/* comparison routine for bsearch() of decomposition lookup table. */
51
static int
52
conv_compare(const void *p1, const void *p2)
53
{
54
  uint32    v1,
55
        v2;
56
57
  v1 = *(const uint32 *) p1;
58
  v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
59
  return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
60
}
61
62
#endif
63
64
/*
65
 * get_code_entry
66
 *
67
 * Get the entry corresponding to code in the decomposition lookup table.
68
 * The backend version of this code uses a perfect hash function for the
69
 * lookup, while the frontend version uses a binary search.
70
 */
71
static const pg_unicode_decomposition *
72
get_code_entry(pg_wchar code)
73
0
{
74
0
#ifndef FRONTEND
75
0
  int     h;
76
0
  uint32    hashkey;
77
0
  pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
78
79
  /*
80
   * Compute the hash function. The hash key is the codepoint with the bytes
81
   * in network order.
82
   */
83
0
  hashkey = pg_hton32(code);
84
0
  h = decompinfo.hash(&hashkey);
85
86
  /* An out-of-range result implies no match */
87
0
  if (h < 0 || h >= decompinfo.num_decomps)
88
0
    return NULL;
89
90
  /*
91
   * Since it's a perfect hash, we need only match to the specific codepoint
92
   * it identifies.
93
   */
94
0
  if (code != decompinfo.decomps[h].codepoint)
95
0
    return NULL;
96
97
  /* Success! */
98
0
  return &decompinfo.decomps[h];
99
#else
100
  return bsearch(&(code),
101
           UnicodeDecompMain,
102
           lengthof(UnicodeDecompMain),
103
           sizeof(pg_unicode_decomposition),
104
           conv_compare);
105
#endif
106
0
}
107
108
/*
109
 * Get the combining class of the given codepoint.
110
 */
111
static uint8
112
get_canonical_class(pg_wchar code)
113
0
{
114
0
  const pg_unicode_decomposition *entry = get_code_entry(code);
115
116
  /*
117
   * If no entries are found, the character used is either an Hangul
118
   * character or a character with a class of 0 and no decompositions.
119
   */
120
0
  if (!entry)
121
0
    return 0;
122
0
  else
123
0
    return entry->comb_class;
124
0
}
125
126
/*
127
 * Given a decomposition entry looked up earlier, get the decomposed
128
 * characters.
129
 *
130
 * Note: the returned pointer can point to statically allocated buffer, and
131
 * is only valid until next call to this function!
132
 */
133
static const pg_wchar *
134
get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
135
0
{
136
0
  static pg_wchar x;
137
138
0
  if (DECOMPOSITION_IS_INLINE(entry))
139
0
  {
140
0
    Assert(DECOMPOSITION_SIZE(entry) == 1);
141
0
    x = (pg_wchar) entry->dec_index;
142
0
    *dec_size = 1;
143
0
    return &x;
144
0
  }
145
0
  else
146
0
  {
147
0
    *dec_size = DECOMPOSITION_SIZE(entry);
148
0
    return &UnicodeDecomp_codepoints[entry->dec_index];
149
0
  }
150
0
}
151
152
/*
153
 * Calculate how many characters a given character will decompose to.
154
 *
155
 * This needs to recurse, if the character decomposes into characters that
156
 * are, in turn, decomposable.
157
 */
158
static int
159
get_decomposed_size(pg_wchar code, bool compat)
160
0
{
161
0
  const pg_unicode_decomposition *entry;
162
0
  int     size = 0;
163
0
  int     i;
164
0
  const uint32 *decomp;
165
0
  int     dec_size;
166
167
  /*
168
   * Fast path for Hangul characters not stored in tables to save memory as
169
   * decomposition is algorithmic. See
170
   * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
171
   * on the matter.
172
   */
173
0
  if (code >= SBASE && code < SBASE + SCOUNT)
174
0
  {
175
0
    uint32    tindex,
176
0
          sindex;
177
178
0
    sindex = code - SBASE;
179
0
    tindex = sindex % TCOUNT;
180
181
0
    if (tindex != 0)
182
0
      return 3;
183
0
    return 2;
184
0
  }
185
186
0
  entry = get_code_entry(code);
187
188
  /*
189
   * Just count current code if no other decompositions.  A NULL entry is
190
   * equivalent to a character with class 0 and no decompositions.
191
   */
192
0
  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
193
0
    (!compat && DECOMPOSITION_IS_COMPAT(entry)))
194
0
    return 1;
195
196
  /*
197
   * If this entry has other decomposition codes look at them as well. First
198
   * get its decomposition in the list of tables available.
199
   */
200
0
  decomp = get_code_decomposition(entry, &dec_size);
201
0
  for (i = 0; i < dec_size; i++)
202
0
  {
203
0
    uint32    lcode = decomp[i];
204
205
0
    size += get_decomposed_size(lcode, compat);
206
0
  }
207
208
0
  return size;
209
0
}
210
211
/*
212
 * Recompose a set of characters. For hangul characters, the calculation
213
 * is algorithmic. For others, an inverse lookup at the decomposition
214
 * table is necessary. Returns true if a recomposition can be done, and
215
 * false otherwise.
216
 */
217
static bool
218
recompose_code(uint32 start, uint32 code, uint32 *result)
219
0
{
220
  /*
221
   * Handle Hangul characters algorithmically, per the Unicode spec.
222
   *
223
   * Check if two current characters are L and V.
224
   */
225
0
  if (start >= LBASE && start < LBASE + LCOUNT &&
226
0
    code >= VBASE && code < VBASE + VCOUNT)
227
0
  {
228
    /* make syllable of form LV */
229
0
    uint32    lindex = start - LBASE;
230
0
    uint32    vindex = code - VBASE;
231
232
0
    *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
233
0
    return true;
234
0
  }
235
  /* Check if two current characters are LV and T */
236
0
  else if (start >= SBASE && start < (SBASE + SCOUNT) &&
237
0
       ((start - SBASE) % TCOUNT) == 0 &&
238
0
       code >= TBASE && code < (TBASE + TCOUNT))
239
0
  {
240
    /* make syllable of form LVT */
241
0
    uint32    tindex = code - TBASE;
242
243
0
    *result = start + tindex;
244
0
    return true;
245
0
  }
246
0
  else
247
0
  {
248
0
    const pg_unicode_decomposition *entry;
249
250
    /*
251
     * Do an inverse lookup of the decomposition tables to see if anything
252
     * matches. The comparison just needs to be a perfect match on the
253
     * sub-table of size two, because the start character has already been
254
     * recomposed partially.  This lookup uses a perfect hash function for
255
     * the backend code.
256
     */
257
0
#ifndef FRONTEND
258
259
0
    int     h,
260
0
          inv_lookup_index;
261
0
    uint64    hashkey;
262
0
    pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
263
264
    /*
265
     * Compute the hash function. The hash key is formed by concatenating
266
     * bytes of the two codepoints in network order. See also
267
     * src/common/unicode/generate-unicode_norm_table.pl.
268
     */
269
0
    hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
270
0
    h = recompinfo.hash(&hashkey);
271
272
    /* An out-of-range result implies no match */
273
0
    if (h < 0 || h >= recompinfo.num_recomps)
274
0
      return false;
275
276
0
    inv_lookup_index = recompinfo.inverse_lookup[h];
277
0
    entry = &UnicodeDecompMain[inv_lookup_index];
278
279
0
    if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
280
0
      code == UnicodeDecomp_codepoints[entry->dec_index + 1])
281
0
    {
282
0
      *result = entry->codepoint;
283
0
      return true;
284
0
    }
285
286
#else
287
288
    int     i;
289
290
    for (i = 0; i < lengthof(UnicodeDecompMain); i++)
291
    {
292
      entry = &UnicodeDecompMain[i];
293
294
      if (DECOMPOSITION_SIZE(entry) != 2)
295
        continue;
296
297
      if (DECOMPOSITION_NO_COMPOSE(entry))
298
        continue;
299
300
      if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
301
        code == UnicodeDecomp_codepoints[entry->dec_index + 1])
302
      {
303
        *result = entry->codepoint;
304
        return true;
305
      }
306
    }
307
#endif              /* !FRONTEND */
308
0
  }
309
310
0
  return false;
311
0
}
312
313
/*
314
 * Decompose the given code into the array given by caller. The
315
 * decomposition begins at the position given by caller, saving one
316
 * lookup on the decomposition table. The current position needs to be
317
 * updated here to let the caller know from where to continue filling
318
 * in the array result.
319
 */
320
static void
321
decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
322
0
{
323
0
  const pg_unicode_decomposition *entry;
324
0
  int     i;
325
0
  const uint32 *decomp;
326
0
  int     dec_size;
327
328
  /*
329
   * Fast path for Hangul characters not stored in tables to save memory as
330
   * decomposition is algorithmic. See
331
   * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
332
   * on the matter.
333
   */
334
0
  if (code >= SBASE && code < SBASE + SCOUNT)
335
0
  {
336
0
    uint32    l,
337
0
          v,
338
0
          tindex,
339
0
          sindex;
340
0
    pg_wchar   *res = *result;
341
342
0
    sindex = code - SBASE;
343
0
    l = LBASE + sindex / (VCOUNT * TCOUNT);
344
0
    v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
345
0
    tindex = sindex % TCOUNT;
346
347
0
    res[*current] = l;
348
0
    (*current)++;
349
0
    res[*current] = v;
350
0
    (*current)++;
351
352
0
    if (tindex != 0)
353
0
    {
354
0
      res[*current] = TBASE + tindex;
355
0
      (*current)++;
356
0
    }
357
358
0
    return;
359
0
  }
360
361
0
  entry = get_code_entry(code);
362
363
  /*
364
   * Just fill in with the current decomposition if there are no
365
   * decomposition codes to recurse to.  A NULL entry is equivalent to a
366
   * character with class 0 and no decompositions, so just leave also in
367
   * this case.
368
   */
369
0
  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
370
0
    (!compat && DECOMPOSITION_IS_COMPAT(entry)))
371
0
  {
372
0
    pg_wchar   *res = *result;
373
374
0
    res[*current] = code;
375
0
    (*current)++;
376
0
    return;
377
0
  }
378
379
  /*
380
   * If this entry has other decomposition codes look at them as well.
381
   */
382
0
  decomp = get_code_decomposition(entry, &dec_size);
383
0
  for (i = 0; i < dec_size; i++)
384
0
  {
385
0
    pg_wchar  lcode = (pg_wchar) decomp[i];
386
387
    /* Leave if no more decompositions */
388
0
    decompose_code(lcode, compat, result, current);
389
0
  }
390
0
}
391
392
/*
393
 * unicode_normalize - Normalize a Unicode string to the specified form.
394
 *
395
 * The input is a 0-terminated array of codepoints.
396
 *
397
 * In frontend, returns a 0-terminated array of codepoints, allocated with
398
 * malloc. Or NULL if we run out of memory. In backend, the returned
399
 * string is palloc'd instead, and OOM is reported with ereport().
400
 */
401
pg_wchar *
402
unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
403
0
{
404
0
  bool    compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
405
0
  bool    recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
406
0
  pg_wchar   *decomp_chars;
407
0
  pg_wchar   *recomp_chars;
408
0
  int     decomp_size,
409
0
        current_size;
410
0
  int     count;
411
0
  const pg_wchar *p;
412
413
  /* variables for recomposition */
414
0
  int     last_class;
415
0
  int     starter_pos;
416
0
  int     target_pos;
417
0
  uint32    starter_ch;
418
419
  /* First, do character decomposition */
420
421
  /*
422
   * Calculate how many characters long the decomposed version will be.
423
   */
424
0
  decomp_size = 0;
425
0
  for (p = input; *p; p++)
426
0
    decomp_size += get_decomposed_size(*p, compat);
427
428
0
  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
429
0
  if (decomp_chars == NULL)
430
0
    return NULL;
431
432
  /*
433
   * Now fill in each entry recursively. This needs a second pass on the
434
   * decomposition table.
435
   */
436
0
  current_size = 0;
437
0
  for (p = input; *p; p++)
438
0
    decompose_code(*p, compat, &decomp_chars, &current_size);
439
0
  decomp_chars[decomp_size] = '\0';
440
0
  Assert(decomp_size == current_size);
441
442
  /* Leave if there is nothing to decompose */
443
0
  if (decomp_size == 0)
444
0
    return decomp_chars;
445
446
  /*
447
   * Now apply canonical ordering.
448
   */
449
0
  for (count = 1; count < decomp_size; count++)
450
0
  {
451
0
    pg_wchar  prev = decomp_chars[count - 1];
452
0
    pg_wchar  next = decomp_chars[count];
453
0
    pg_wchar  tmp;
454
0
    const uint8 prevClass = get_canonical_class(prev);
455
0
    const uint8 nextClass = get_canonical_class(next);
456
457
    /*
458
     * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
459
     * annex 4, a sequence of two adjacent characters in a string is an
460
     * exchangeable pair if the combining class (from the Unicode
461
     * Character Database) for the first character is greater than the
462
     * combining class for the second, and the second is not a starter.  A
463
     * character is a starter if its combining class is 0.
464
     */
465
0
    if (prevClass == 0 || nextClass == 0)
466
0
      continue;
467
468
0
    if (prevClass <= nextClass)
469
0
      continue;
470
471
    /* exchange can happen */
472
0
    tmp = decomp_chars[count - 1];
473
0
    decomp_chars[count - 1] = decomp_chars[count];
474
0
    decomp_chars[count] = tmp;
475
476
    /* backtrack to check again */
477
0
    if (count > 1)
478
0
      count -= 2;
479
0
  }
480
481
0
  if (!recompose)
482
0
    return decomp_chars;
483
484
  /*
485
   * The last phase of NFC and NFKC is the recomposition of the reordered
486
   * Unicode string using combining classes. The recomposed string cannot be
487
   * longer than the decomposed one, so make the allocation of the output
488
   * string based on that assumption.
489
   */
490
0
  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
491
0
  if (!recomp_chars)
492
0
  {
493
0
    FREE(decomp_chars);
494
0
    return NULL;
495
0
  }
496
497
0
  last_class = -1;      /* this eliminates a special check */
498
0
  starter_pos = 0;
499
0
  target_pos = 1;
500
0
  starter_ch = recomp_chars[0] = decomp_chars[0];
501
502
0
  for (count = 1; count < decomp_size; count++)
503
0
  {
504
0
    pg_wchar  ch = decomp_chars[count];
505
0
    int     ch_class = get_canonical_class(ch);
506
0
    pg_wchar  composite;
507
508
0
    if (last_class < ch_class &&
509
0
      recompose_code(starter_ch, ch, &composite))
510
0
    {
511
0
      recomp_chars[starter_pos] = composite;
512
0
      starter_ch = composite;
513
0
    }
514
0
    else if (ch_class == 0)
515
0
    {
516
0
      starter_pos = target_pos;
517
0
      starter_ch = ch;
518
0
      last_class = -1;
519
0
      recomp_chars[target_pos++] = ch;
520
0
    }
521
0
    else
522
0
    {
523
0
      last_class = ch_class;
524
0
      recomp_chars[target_pos++] = ch;
525
0
    }
526
0
  }
527
0
  recomp_chars[target_pos] = (pg_wchar) '\0';
528
529
0
  FREE(decomp_chars);
530
531
0
  return recomp_chars;
532
0
}
533
534
/*
535
 * Normalization "quick check" algorithm; see
536
 * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
537
 */
538
539
/* We only need this in the backend. */
540
#ifndef FRONTEND
541
542
static const pg_unicode_normprops *
543
qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
544
0
{
545
0
  int     h;
546
0
  uint32    hashkey;
547
548
  /*
549
   * Compute the hash function. The hash key is the codepoint with the bytes
550
   * in network order.
551
   */
552
0
  hashkey = pg_hton32(ch);
553
0
  h = norminfo->hash(&hashkey);
554
555
  /* An out-of-range result implies no match */
556
0
  if (h < 0 || h >= norminfo->num_normprops)
557
0
    return NULL;
558
559
  /*
560
   * Since it's a perfect hash, we need only match to the specific codepoint
561
   * it identifies.
562
   */
563
0
  if (ch != norminfo->normprops[h].codepoint)
564
0
    return NULL;
565
566
  /* Success! */
567
0
  return &norminfo->normprops[h];
568
0
}
569
570
/*
571
 * Look up the normalization quick check character property
572
 */
573
static UnicodeNormalizationQC
574
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
575
0
{
576
0
  const pg_unicode_normprops *found = NULL;
577
578
0
  switch (form)
579
0
  {
580
0
    case UNICODE_NFC:
581
0
      found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
582
0
      break;
583
0
    case UNICODE_NFKC:
584
0
      found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
585
0
      break;
586
0
    default:
587
0
      Assert(false);
588
0
      break;
589
0
  }
590
591
0
  if (found)
592
0
    return found->quickcheck;
593
0
  else
594
0
    return UNICODE_NORM_QC_YES;
595
0
}
596
597
UnicodeNormalizationQC
598
unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
599
0
{
600
0
  uint8   lastCanonicalClass = 0;
601
0
  UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
602
603
  /*
604
   * For the "D" forms, we don't run the quickcheck.  We don't include the
605
   * lookup tables for those because they are huge, checking for these
606
   * particular forms is less common, and running the slow path is faster
607
   * for the "D" forms than the "C" forms because you don't need to
608
   * recompose, which is slow.
609
   */
610
0
  if (form == UNICODE_NFD || form == UNICODE_NFKD)
611
0
    return UNICODE_NORM_QC_MAYBE;
612
613
0
  for (const pg_wchar *p = input; *p; p++)
614
0
  {
615
0
    pg_wchar  ch = *p;
616
0
    uint8   canonicalClass;
617
0
    UnicodeNormalizationQC check;
618
619
0
    canonicalClass = get_canonical_class(ch);
620
0
    if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
621
0
      return UNICODE_NORM_QC_NO;
622
623
0
    check = qc_is_allowed(form, ch);
624
0
    if (check == UNICODE_NORM_QC_NO)
625
0
      return UNICODE_NORM_QC_NO;
626
0
    else if (check == UNICODE_NORM_QC_MAYBE)
627
0
      result = UNICODE_NORM_QC_MAYBE;
628
629
0
    lastCanonicalClass = canonicalClass;
630
0
  }
631
0
  return result;
632
0
}
633
634
#endif              /* !FRONTEND */