/src/postgres/src/common/unicode_norm.c

Source (jump to first uncovered line)
/*-------------------------------------------------------------------------
 * unicode_norm.c
 *    Normalize a Unicode string
 *
 * This implements Unicode normalization, per the documentation at
 * https://www.unicode.org/reports/tr15/.
 *
 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *    src/common/unicode_norm.c
 *
 *-------------------------------------------------------------------------
 */
#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#include "common/unicode_norm.h"
#ifndef FRONTEND
#include "common/unicode_norm_hashfunc.h"
#include "common/unicode_normprops_table.h"
#include "port/pg_bswap.h"
#else
#include "common/unicode_norm_table.h"
#endif

/*
 * Allocation wrappers used by this file.  Without FRONTEND defined they map
 * to palloc()/pfree(); with FRONTEND defined they map to malloc()/free().
 * ALLOC takes a byte count; FREE takes the pointer to release.
 */
#ifndef FRONTEND
#define ALLOC(size) palloc(size)
#define FREE(ptr) pfree(ptr)
#else
#define ALLOC(size) malloc(size)
#define FREE(ptr) free(ptr)
#endif

/*
 * Constants for calculations with Hangul characters.
 *
 * SBASE is the first codepoint of the algorithmically handled syllable
 * range, which is SCOUNT codepoints long.  LBASE/VBASE/TBASE are the bases
 * added to the L, V and T indexes; LCOUNT/VCOUNT/TCOUNT are the sizes of
 * those index ranges (a T index of 0 means "no trailing part").
 *
 * NCOUNT and SCOUNT are products of other constants, so their replacement
 * lists are parenthesized: that way they expand as a single value even when
 * used as an operand of '/', '%' or another '*'.
 */
#define SBASE   0xAC00    /* U+AC00 */
#define LBASE   0x1100    /* U+1100 */
#define VBASE   0x1161    /* U+1161 */
#define TBASE   0x11A7    /* U+11A7 */
#define LCOUNT    19
#define VCOUNT    21
#define TCOUNT    28
#define NCOUNT    (VCOUNT * TCOUNT)
#define SCOUNT    (LCOUNT * NCOUNT)

#ifdef FRONTEND
/*
 * bsearch() comparator for the decomposition lookup table.
 *
 * p1 points at the codepoint being searched for (a uint32); p2 points at a
 * pg_unicode_decomposition table entry.  Returns 1, 0 or -1 depending on
 * whether the key is greater than, equal to, or less than the entry's
 * codepoint.
 */
static int
conv_compare(const void *p1, const void *p2)
{
  const uint32 key = *(const uint32 *) p1;
  const uint32 entry_code = ((const pg_unicode_decomposition *) p2)->codepoint;

  if (key > entry_code)
    return 1;
  if (key == entry_code)
    return 0;
  return -1;
}

#endif

/*
 * get_code_entry
 *
 * Look up "code" in the decomposition table and return a pointer to its
 * entry, or NULL if the table has no entry for that codepoint.
 *
 * Backend builds probe a perfect hash function; frontend builds run
 * bsearch() over UnicodeDecompMain using conv_compare().
 */
static const pg_unicode_decomposition *
get_code_entry(pg_wchar code)
{
#ifndef FRONTEND
  pg_unicode_decompinfo info = UnicodeDecompInfo;
  uint32    key;
  int     slot;

  /* The hash function is fed the codepoint with its bytes in network order. */
  key = pg_hton32(code);
  slot = info.hash(&key);

  /*
   * A slot outside the table means "no match".  For an in-range slot, the
   * hash being perfect means only that one entry can possibly match, so a
   * single codepoint comparison settles the question.
   */
  if (slot < 0 || slot >= info.num_decomps ||
    info.decomps[slot].codepoint != code)
    return NULL;

  return &info.decomps[slot];
#else
  return bsearch(&code,
           UnicodeDecompMain,
           lengthof(UnicodeDecompMain),
           sizeof(pg_unicode_decomposition),
           conv_compare);
#endif
}

/*
 * Return the canonical combining class of the given codepoint.
 *
 * Codepoints without a decomposition-table entry are reported as class 0;
 * that covers Hangul characters as well as characters that have class 0
 * and no decomposition.
 */
static uint8
get_canonical_class(pg_wchar code)
{
  const pg_unicode_decomposition *entry = get_code_entry(code);

  if (entry != NULL)
    return entry->comb_class;

  return 0;
}

/*
 * Fetch the decomposed codepoints for a previously looked-up table entry.
 *
 * On return *dec_size holds the number of codepoints in the returned array.
 *
 * Note: for entries whose decomposition is stored inline, the result lives
 * in a function-static buffer and is therefore only valid until the next
 * call to this function!
 */
static const pg_wchar *
get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
{
  static pg_wchar inline_code;

  /* Common case: the decomposition is a slice of the shared codepoint array */
  if (!DECOMPOSITION_IS_INLINE(entry))
  {
    *dec_size = DECOMPOSITION_SIZE(entry);
    return &UnicodeDecomp_codepoints[entry->dec_index];
  }

  /* Inline case: dec_index itself is the single decomposed codepoint */
  Assert(DECOMPOSITION_SIZE(entry) == 1);
  inline_code = (pg_wchar) entry->dec_index;
  *dec_size = 1;
  return &inline_code;
}

/*
 * Count the codepoints that "code" expands to once fully decomposed.
 *
 * "compat" selects whether compatibility decompositions are followed.  The
 * function recurses, because the pieces of a decomposition may themselves be
 * decomposable.
 */
static int
get_decomposed_size(pg_wchar code, bool compat)
{
  const pg_unicode_decomposition *entry;
  const uint32 *parts;
  int     nparts;
  int     total;
  int     k;

  /*
   * Hangul characters are not stored in the tables (to save memory); their
   * decomposition is algorithmic.  See
   * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10.  Such a
   * character yields three codepoints when it has a trailing part, and two
   * otherwise.
   */
  if (code >= SBASE && code < SBASE + SCOUNT)
  {
    uint32    sindex = code - SBASE;

    return (sindex % TCOUNT != 0) ? 3 : 2;
  }

  entry = get_code_entry(code);

  /*
   * The character stands for itself if it has no table entry (equivalent
   * to class 0 with no decomposition), if its entry has an empty
   * decomposition, or if the decomposition is a compatibility one and
   * compatibility decomposition was not requested.
   */
  if (entry == NULL)
    return 1;
  if (DECOMPOSITION_SIZE(entry) == 0)
    return 1;
  if (!compat && DECOMPOSITION_IS_COMPAT(entry))
    return 1;

  /* Otherwise add up the decomposed sizes of every piece. */
  total = 0;
  parts = get_code_decomposition(entry, &nparts);
  for (k = 0; k < nparts; k++)
  {
    uint32    piece = parts[k];

    total += get_decomposed_size(piece, compat);
  }

  return total;
}

/*
 * Try to combine the pair (start, code) into a single composed codepoint.
 *
 * Hangul pairs are combined arithmetically; any other pair requires an
 * inverse lookup in the decomposition table.  On success the composed
 * codepoint is stored in *result and true is returned.  Otherwise false is
 * returned and *result is left untouched.
 */
static bool
recompose_code(uint32 start, uint32 code, uint32 *result)
{
  const pg_unicode_decomposition *entry;
#ifndef FRONTEND
  pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
  uint64    key;
  int     slot;
#else
  int     idx;
#endif

  /*
   * Hangul is handled algorithmically, per the Unicode spec.
   *
   * Case 1: the pair is L + V, producing a syllable of form LV.
   */
  if (start >= LBASE && start < LBASE + LCOUNT &&
    code >= VBASE && code < VBASE + VCOUNT)
  {
    uint32    lindex = start - LBASE;
    uint32    vindex = code - VBASE;

    *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
    return true;
  }

  /* Case 2: the pair is LV + T, producing a syllable of form LVT. */
  if (start >= SBASE && start < (SBASE + SCOUNT) &&
    ((start - SBASE) % TCOUNT) == 0 &&
    code >= TBASE && code < (TBASE + TCOUNT))
  {
    *result = start + (code - TBASE);
    return true;
  }

  /*
   * Everything else: search the decomposition tables in reverse for an
   * entry whose two-codepoint decomposition is exactly (start, code).  An
   * exact match on a size-two sub-table suffices because the start
   * character has already been partially recomposed.
   */
#ifndef FRONTEND

  /*
   * Backend: perfect hash lookup.  The key is the two codepoints
   * concatenated, with bytes in network order; see also
   * src/common/unicode/generate-unicode_norm_table.pl.
   */
  key = pg_hton64(((uint64) start << 32) | (uint64) code);
  slot = recompinfo.hash(&key);

  /* A slot outside the table means there is no match */
  if (slot < 0 || slot >= recompinfo.num_recomps)
    return false;

  entry = &UnicodeDecompMain[recompinfo.inverse_lookup[slot]];

  if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
    code == UnicodeDecomp_codepoints[entry->dec_index + 1])
  {
    *result = entry->codepoint;
    return true;
  }

#else

  /* Frontend: linear scan over the composable two-codepoint entries. */
  for (idx = 0; idx < lengthof(UnicodeDecompMain); idx++)
  {
    entry = &UnicodeDecompMain[idx];

    if (DECOMPOSITION_SIZE(entry) != 2 || DECOMPOSITION_NO_COMPOSE(entry))
      continue;

    if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
      code == UnicodeDecomp_codepoints[entry->dec_index + 1])
    {
      *result = entry->codepoint;
      return true;
    }
  }
#endif              /* !FRONTEND */

  return false;
}

/*
 * Write the full decomposition of "code" into the caller's array.
 *
 * *result is the output array and *current the index of the next free slot.
 * Each codepoint produced is stored at *current, which is then advanced, so
 * on return the caller knows where to continue filling the array.  "compat"
 * selects whether compatibility decompositions are followed.
 */
static void
decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
{
  pg_wchar   *out = *result;
  const pg_unicode_decomposition *entry;
  const uint32 *parts;
  int     nparts;
  int     k;

  /*
   * Hangul characters are not stored in the tables (to save memory); their
   * decomposition is algorithmic.  See
   * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10.
   */
  if (code >= SBASE && code < SBASE + SCOUNT)
  {
    uint32    sindex = code - SBASE;
    uint32    tindex = sindex % TCOUNT;

    /* leading and vowel parts are always emitted */
    out[(*current)++] = LBASE + sindex / (VCOUNT * TCOUNT);
    out[(*current)++] = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;

    /* the trailing part exists only for a nonzero T index */
    if (tindex != 0)
      out[(*current)++] = TBASE + tindex;

    return;
  }

  entry = get_code_entry(code);

  /*
   * Emit the character unchanged when there is nothing to recurse into: no
   * table entry (equivalent to class 0 with no decomposition), an empty
   * decomposition, or a compatibility decomposition that was not asked
   * for.
   */
  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
    (!compat && DECOMPOSITION_IS_COMPAT(entry)))
  {
    out[(*current)++] = code;
    return;
  }

  /* Otherwise decompose each piece of the decomposition in turn. */
  parts = get_code_decomposition(entry, &nparts);
  for (k = 0; k < nparts; k++)
  {
    pg_wchar  piece = (pg_wchar) parts[k];

    decompose_code(piece, compat, result, current);
  }
}

/*
 * unicode_normalize - Normalize a Unicode string to the specified form.
 *
 * The input is a 0-terminated array of codepoints.
 *
 * In frontend, returns a 0-terminated array of codepoints, allocated with
 * malloc. Or NULL if we run out of memory. In backend, the returned
 * string is palloc'd instead, and OOM is reported with ereport().
 *
 * The work happens in up to three phases: full decomposition, canonical
 * reordering of combining characters, and (for NFC/NFKC only)
 * recomposition.
 */
pg_wchar *
unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
{
  const bool  compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
  const bool  recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
  pg_wchar   *decomp_chars;
  pg_wchar   *recomp_chars;
  const pg_wchar *src;
  int     decomp_size = 0;
  int     filled = 0;
  int     pos;

  /* state for the recomposition phase */
  int     last_class;
  int     starter_pos;
  int     out_len;
  uint32    starter_ch;

  /*
   * Phase 1: decomposition.  A first pass over the input measures the
   * decomposed length so the output can be allocated in one go.
   */
  for (src = input; *src; src++)
    decomp_size += get_decomposed_size(*src, compat);

  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
  if (decomp_chars == NULL)
    return NULL;

  /* A second pass over the decomposition table fills the array in. */
  for (src = input; *src; src++)
    decompose_code(*src, compat, &decomp_chars, &filled);
  decomp_chars[decomp_size] = '\0';
  Assert(decomp_size == filled);

  /* Empty input: nothing to reorder or recompose */
  if (decomp_size == 0)
    return decomp_chars;

  /*
   * Phase 2: canonical ordering.
   *
   * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) annex
   * 4, two adjacent characters form an exchangeable pair if the combining
   * class of the first is greater than that of the second and the second
   * is not a starter (a starter being a character of combining class 0).
   * A first character of class 0 can never be greater, so the test below
   * simply requires both classes to be nonzero.
   */
  for (pos = 1; pos < decomp_size; pos++)
  {
    const uint8 left_class = get_canonical_class(decomp_chars[pos - 1]);
    const uint8 right_class = get_canonical_class(decomp_chars[pos]);

    if (left_class != 0 && right_class != 0 && left_class > right_class)
    {
      pg_wchar  swap = decomp_chars[pos - 1];

      decomp_chars[pos - 1] = decomp_chars[pos];
      decomp_chars[pos] = swap;

      /*
       * Step back so that, after the loop increment, the pair ending
       * at the character just moved left is examined again.
       */
      if (pos > 1)
        pos -= 2;
    }
  }

  /* The "D" forms are finished at this point */
  if (!recompose)
    return decomp_chars;

  /*
   * Phase 3 (NFC and NFKC only): recompose the reordered string using
   * combining classes.  The recomposed string cannot be longer than the
   * decomposed one, so the same allocation size is sufficient.
   */
  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
  if (!recomp_chars)
  {
    FREE(decomp_chars);
    return NULL;
  }

  /* Starting last_class at -1 avoids special-casing the first character */
  last_class = -1;
  starter_pos = 0;
  starter_ch = decomp_chars[0];
  recomp_chars[0] = starter_ch;
  out_len = 1;

  for (pos = 1; pos < decomp_size; pos++)
  {
    pg_wchar  ch = decomp_chars[pos];
    int     ch_class = get_canonical_class(ch);
    pg_wchar  composite;

    /*
     * If the class of this character is higher than that of the last
     * character kept since the current starter, try to merge it into
     * the starter; on success the starter is replaced in place and
     * nothing new is appended.
     */
    if (last_class < ch_class &&
      recompose_code(starter_ch, ch, &composite))
    {
      recomp_chars[starter_pos] = composite;
      starter_ch = composite;
      continue;
    }

    if (ch_class == 0)
    {
      /* a new starter begins at the slot about to be written */
      starter_pos = out_len;
      starter_ch = ch;
      last_class = -1;
    }
    else
      last_class = ch_class;

    recomp_chars[out_len++] = ch;
  }
  recomp_chars[out_len] = (pg_wchar) '\0';

  FREE(decomp_chars);

  return recomp_chars;
}

/*
 * Normalization "quick check" algorithm; see
 * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
 */

/* We only need this in the backend. */
#ifndef FRONTEND

/*
 * Find the quick-check properties of codepoint "ch" in the table described
 * by "norminfo".  Returns a pointer to the matching entry, or NULL when the
 * table has no entry for that codepoint.
 */
static const pg_unicode_normprops *
qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
{
  uint32    key;
  int     slot;

  /* The hash function is fed the codepoint with its bytes in network order. */
  key = pg_hton32(ch);
  slot = norminfo->hash(&key);

  /*
   * A slot outside the table means "no match".  For an in-range slot, the
   * hash being perfect means only that one entry can possibly match, so a
   * single codepoint comparison settles the question.
   */
  if (slot < 0 || slot >= norminfo->num_normprops ||
    norminfo->normprops[slot].codepoint != ch)
    return NULL;

  return &norminfo->normprops[slot];
}

/*
 * Look up the normalization quick check property of codepoint "ch" for the
 * given form.
 *
 * Only UNICODE_NFC and UNICODE_NFKC have lookup tables here; any other form
 * trips an assertion.  A codepoint with no table entry is reported as
 * UNICODE_NORM_QC_YES.
 */
static UnicodeNormalizationQC
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
{
  const pg_unicode_normprops *props;

  switch (form)
  {
    case UNICODE_NFC:
      props = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
      break;
    case UNICODE_NFKC:
      props = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
      break;
    default:
      Assert(false);
      props = NULL;
      break;
  }

  if (props == NULL)
    return UNICODE_NORM_QC_YES;

  return props->quickcheck;
}

/*
 * Run the normalization quick check over a 0-terminated codepoint array.
 *
 * Returns UNICODE_NORM_QC_NO as soon as a character rules the form out,
 * UNICODE_NORM_QC_MAYBE if any character was inconclusive (or if the form is
 * one of the "D" forms, which are never quick-checked), and
 * UNICODE_NORM_QC_YES otherwise.
 */
UnicodeNormalizationQC
unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
{
  UnicodeNormalizationQC verdict = UNICODE_NORM_QC_YES;
  uint8   prev_class = 0;
  const pg_wchar *cursor;

  /*
   * For the "D" forms, we don't run the quickcheck.  We don't include the
   * lookup tables for those because they are huge, checking for these
   * particular forms is less common, and running the slow path is faster
   * for the "D" forms than the "C" forms because you don't need to
   * recompose, which is slow.
   */
  if (form == UNICODE_NFD || form == UNICODE_NFKD)
    return UNICODE_NORM_QC_MAYBE;

  for (cursor = input; *cursor; cursor++)
  {
    const pg_wchar ch = *cursor;
    const uint8 cur_class = get_canonical_class(ch);
    UnicodeNormalizationQC status;

    /* A non-starter with a lower class than its predecessor is out of order */
    if (cur_class != 0 && prev_class > cur_class)
      return UNICODE_NORM_QC_NO;

    status = qc_is_allowed(form, ch);
    if (status == UNICODE_NORM_QC_NO)
      return UNICODE_NORM_QC_NO;
    if (status == UNICODE_NORM_QC_MAYBE)
      verdict = UNICODE_NORM_QC_MAYBE;

    prev_class = cur_class;
  }

  return verdict;
}

#endif              /* !FRONTEND */

Coverage Report

Created: 2025-07-03 06:49

Line	Count	Source (jump to first uncovered line)
1		/*-------------------------------------------------------------------------
2		* unicode_norm.c
3		* Normalize a Unicode string
4		*
5		* This implements Unicode normalization, per the documentation at
6		* https://www.unicode.org/reports/tr15/.
7		*
8		* Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
9		*
10		* IDENTIFICATION
11		* src/common/unicode_norm.c
12		*
13		*-------------------------------------------------------------------------
14		*/
15		#ifndef FRONTEND
16		#include "postgres.h"
17		#else
18		#include "postgres_fe.h"
19		#endif
20
21		#include "common/unicode_norm.h"
22		#ifndef FRONTEND
23		#include "common/unicode_norm_hashfunc.h"
24		#include "common/unicode_normprops_table.h"
25		#include "port/pg_bswap.h"
26		#else
27		#include "common/unicode_norm_table.h"
28		#endif
29
30		#ifndef FRONTEND
31	0	#define ALLOC(size) palloc(size)
32	0	#define FREE(size) pfree(size)
33		#else
34		#define ALLOC(size) malloc(size)
35		#define FREE(size) free(size)
36		#endif
37
38		/* Constants for calculations with Hangul characters */
39	0	#define SBASE 0xAC00 /* U+AC00 */
40	0	#define LBASE 0x1100 /* U+1100 */
41	0	#define VBASE 0x1161 /* U+1161 */
42	0	#define TBASE 0x11A7 /* U+11A7 */
43	0	#define LCOUNT 19
44	0	#define VCOUNT 21
45	0	#define TCOUNT 28
46	0	#define NCOUNT VCOUNT * TCOUNT
47	0	#define SCOUNT LCOUNT * NCOUNT
48
49		#ifdef FRONTEND
50		/* comparison routine for bsearch() of decomposition lookup table. */
51		static int
52		conv_compare(const void p1, const void p2)
53		{
54		uint32 v1,
55		v2;
56
57		v1 = (const uint32 ) p1;
58		v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
59		return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
60		}
61
62		#endif
63
64		/*
65		* get_code_entry
66		*
67		* Get the entry corresponding to code in the decomposition lookup table.
68		* The backend version of this code uses a perfect hash function for the
69		* lookup, while the frontend version uses a binary search.
70		*/
71		static const pg_unicode_decomposition *
72		get_code_entry(pg_wchar code)
73	0	{
74	0	#ifndef FRONTEND
75	0	int h;
76	0	uint32 hashkey;
77	0	pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
78
79		/*
80		* Compute the hash function. The hash key is the codepoint with the bytes
81		* in network order.
82		*/
83	0	hashkey = pg_hton32(code);
84	0	h = decompinfo.hash(&hashkey);
85
86		/* An out-of-range result implies no match */
87	0	if (h < 0 \|\| h >= decompinfo.num_decomps)
88	0	return NULL;
89
90		/*
91		* Since it's a perfect hash, we need only match to the specific codepoint
92		* it identifies.
93		*/
94	0	if (code != decompinfo.decomps[h].codepoint)
95	0	return NULL;
96
97		/* Success! */
98	0	return &decompinfo.decomps[h];
99		#else
100		return bsearch(&(code),
101		UnicodeDecompMain,
102		lengthof(UnicodeDecompMain),
103		sizeof(pg_unicode_decomposition),
104		conv_compare);
105		#endif
106	0	}
107
108		/*
109		* Get the combining class of the given codepoint.
110		*/
111		static uint8
112		get_canonical_class(pg_wchar code)
113	0	{
114	0	const pg_unicode_decomposition *entry = get_code_entry(code);
115
116		/*
117		* If no entries are found, the character used is either an Hangul
118		* character or a character with a class of 0 and no decompositions.
119		*/
120	0	if (!entry)
121	0	return 0;
122	0	else
123	0	return entry->comb_class;
124	0	}
125
126		/*
127		* Given a decomposition entry looked up earlier, get the decomposed
128		* characters.
129		*
130		* Note: the returned pointer can point to statically allocated buffer, and
131		* is only valid until next call to this function!
132		*/
133		static const pg_wchar *
134		get_code_decomposition(const pg_unicode_decomposition entry, int dec_size)
135	0	{
136	0	static pg_wchar x;
137
138	0	if (DECOMPOSITION_IS_INLINE(entry))
139	0	{
140	0	Assert(DECOMPOSITION_SIZE(entry) == 1);
141	0	x = (pg_wchar) entry->dec_index;
142	0	*dec_size = 1;
143	0	return &x;
144	0	}
145	0	else
146	0	{
147	0	*dec_size = DECOMPOSITION_SIZE(entry);
148	0	return &UnicodeDecomp_codepoints[entry->dec_index];
149	0	}
150	0	}
151
152		/*
153		* Calculate how many characters a given character will decompose to.
154		*
155		* This needs to recurse, if the character decomposes into characters that
156		* are, in turn, decomposable.
157		*/
158		static int
159		get_decomposed_size(pg_wchar code, bool compat)
160	0	{
161	0	const pg_unicode_decomposition *entry;
162	0	int size = 0;
163	0	int i;
164	0	const uint32 *decomp;
165	0	int dec_size;
166
167		/*
168		* Fast path for Hangul characters not stored in tables to save memory as
169		* decomposition is algorithmic. See
170		* https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
171		* on the matter.
172		*/
173	0	if (code >= SBASE && code < SBASE + SCOUNT)
174	0	{
175	0	uint32 tindex,
176	0	sindex;
177
178	0	sindex = code - SBASE;
179	0	tindex = sindex % TCOUNT;
180
181	0	if (tindex != 0)
182	0	return 3;
183	0	return 2;
184	0	}
185
186	0	entry = get_code_entry(code);
187
188		/*
189		* Just count current code if no other decompositions. A NULL entry is
190		* equivalent to a character with class 0 and no decompositions.
191		*/
192	0	if (entry == NULL \|\| DECOMPOSITION_SIZE(entry) == 0 \|\|
193	0	(!compat && DECOMPOSITION_IS_COMPAT(entry)))
194	0	return 1;
195
196		/*
197		* If this entry has other decomposition codes look at them as well. First
198		* get its decomposition in the list of tables available.
199		*/
200	0	decomp = get_code_decomposition(entry, &dec_size);
201	0	for (i = 0; i < dec_size; i++)
202	0	{
203	0	uint32 lcode = decomp[i];
204
205	0	size += get_decomposed_size(lcode, compat);
206	0	}
207
208	0	return size;
209	0	}
210
211		/*
212		* Recompose a set of characters. For hangul characters, the calculation
213		* is algorithmic. For others, an inverse lookup at the decomposition
214		* table is necessary. Returns true if a recomposition can be done, and
215		* false otherwise.
216		*/
217		static bool
218		recompose_code(uint32 start, uint32 code, uint32 *result)
219	0	{
220		/*
221		* Handle Hangul characters algorithmically, per the Unicode spec.
222		*
223		* Check if two current characters are L and V.
224		*/
225	0	if (start >= LBASE && start < LBASE + LCOUNT &&
226	0	code >= VBASE && code < VBASE + VCOUNT)
227	0	{
228		/* make syllable of form LV */
229	0	uint32 lindex = start - LBASE;
230	0	uint32 vindex = code - VBASE;
231
232	0	result = SBASE + (lindex VCOUNT + vindex) * TCOUNT;
233	0	return true;
234	0	}
235		/* Check if two current characters are LV and T */
236	0	else if (start >= SBASE && start < (SBASE + SCOUNT) &&
237	0	((start - SBASE) % TCOUNT) == 0 &&
238	0	code >= TBASE && code < (TBASE + TCOUNT))
239	0	{
240		/* make syllable of form LVT */
241	0	uint32 tindex = code - TBASE;
242
243	0	*result = start + tindex;
244	0	return true;
245	0	}
246	0	else
247	0	{
248	0	const pg_unicode_decomposition *entry;
249
250		/*
251		* Do an inverse lookup of the decomposition tables to see if anything
252		* matches. The comparison just needs to be a perfect match on the
253		* sub-table of size two, because the start character has already been
254		* recomposed partially. This lookup uses a perfect hash function for
255		* the backend code.
256		*/
257	0	#ifndef FRONTEND
258
259	0	int h,
260	0	inv_lookup_index;
261	0	uint64 hashkey;
262	0	pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
263
264		/*
265		* Compute the hash function. The hash key is formed by concatenating
266		* bytes of the two codepoints in network order. See also
267		* src/common/unicode/generate-unicode_norm_table.pl.
268		*/
269	0	hashkey = pg_hton64(((uint64) start << 32) \| (uint64) code);
270	0	h = recompinfo.hash(&hashkey);
271
272		/* An out-of-range result implies no match */
273	0	if (h < 0 \|\| h >= recompinfo.num_recomps)
274	0	return false;
275
276	0	inv_lookup_index = recompinfo.inverse_lookup[h];
277	0	entry = &UnicodeDecompMain[inv_lookup_index];
278
279	0	if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
280	0	code == UnicodeDecomp_codepoints[entry->dec_index + 1])
281	0	{
282	0	*result = entry->codepoint;
283	0	return true;
284	0	}
285
286		#else
287
288		int i;
289
290		for (i = 0; i < lengthof(UnicodeDecompMain); i++)
291		{
292		entry = &UnicodeDecompMain[i];
293
294		if (DECOMPOSITION_SIZE(entry) != 2)
295		continue;
296
297		if (DECOMPOSITION_NO_COMPOSE(entry))
298		continue;
299
300		if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
301		code == UnicodeDecomp_codepoints[entry->dec_index + 1])
302		{
303		*result = entry->codepoint;
304		return true;
305		}
306		}
307		#endif /* !FRONTEND */
308	0	}
309
310	0	return false;
311	0	}
312
313		/*
314		* Decompose the given code into the array given by caller. The
315		* decomposition begins at the position given by caller, saving one
316		* lookup on the decomposition table. The current position needs to be
317		* updated here to let the caller know from where to continue filling
318		* in the array result.
319		*/
320		static void
321		decompose_code(pg_wchar code, bool compat, pg_wchar *result, int current)
322	0	{
323	0	const pg_unicode_decomposition *entry;
324	0	int i;
325	0	const uint32 *decomp;
326	0	int dec_size;
327
328		/*
329		* Fast path for Hangul characters not stored in tables to save memory as
330		* decomposition is algorithmic. See
331		* https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
332		* on the matter.
333		*/
334	0	if (code >= SBASE && code < SBASE + SCOUNT)
335	0	{
336	0	uint32 l,
337	0	v,
338	0	tindex,
339	0	sindex;
340	0	pg_wchar res = result;
341
342	0	sindex = code - SBASE;
343	0	l = LBASE + sindex / (VCOUNT * TCOUNT);
344	0	v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
345	0	tindex = sindex % TCOUNT;
346
347	0	res[*current] = l;
348	0	(*current)++;
349	0	res[*current] = v;
350	0	(*current)++;
351
352	0	if (tindex != 0)
353	0	{
354	0	res[*current] = TBASE + tindex;
355	0	(*current)++;
356	0	}
357
358	0	return;
359	0	}
360
361	0	entry = get_code_entry(code);
362
363		/*
364		* Just fill in with the current decomposition if there are no
365		* decomposition codes to recurse to. A NULL entry is equivalent to a
366		* character with class 0 and no decompositions, so just leave also in
367		* this case.
368		*/
369	0	if (entry == NULL \|\| DECOMPOSITION_SIZE(entry) == 0 \|\|
370	0	(!compat && DECOMPOSITION_IS_COMPAT(entry)))
371	0	{
372	0	pg_wchar res = result;
373
374	0	res[*current] = code;
375	0	(*current)++;
376	0	return;
377	0	}
378
379		/*
380		* If this entry has other decomposition codes look at them as well.
381		*/
382	0	decomp = get_code_decomposition(entry, &dec_size);
383	0	for (i = 0; i < dec_size; i++)
384	0	{
385	0	pg_wchar lcode = (pg_wchar) decomp[i];
386
387		/* Leave if no more decompositions */
388	0	decompose_code(lcode, compat, result, current);
389	0	}
390	0	}
391
392		/*
393		* unicode_normalize - Normalize a Unicode string to the specified form.
394		*
395		* The input is a 0-terminated array of codepoints.
396		*
397		* In frontend, returns a 0-terminated array of codepoints, allocated with
398		* malloc. Or NULL if we run out of memory. In backend, the returned
399		* string is palloc'd instead, and OOM is reported with ereport().
400		*/
401		pg_wchar *
402		unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
403	0	{
404	0	bool compat = (form == UNICODE_NFKC \|\| form == UNICODE_NFKD);
405	0	bool recompose = (form == UNICODE_NFC \|\| form == UNICODE_NFKC);
406	0	pg_wchar *decomp_chars;
407	0	pg_wchar *recomp_chars;
408	0	int decomp_size,
409	0	current_size;
410	0	int count;
411	0	const pg_wchar *p;
412
413		/* variables for recomposition */
414	0	int last_class;
415	0	int starter_pos;
416	0	int target_pos;
417	0	uint32 starter_ch;
418
419		/* First, do character decomposition */
420
421		/*
422		* Calculate how many characters long the decomposed version will be.
423		*/
424	0	decomp_size = 0;
425	0	for (p = input; *p; p++)
426	0	decomp_size += get_decomposed_size(*p, compat);
427
428	0	decomp_chars = (pg_wchar ) ALLOC((decomp_size + 1) sizeof(pg_wchar));
429	0	if (decomp_chars == NULL)
430	0	return NULL;
431
432		/*
433		* Now fill in each entry recursively. This needs a second pass on the
434		* decomposition table.
435		*/
436	0	current_size = 0;
437	0	for (p = input; *p; p++)
438	0	decompose_code(*p, compat, &decomp_chars, &current_size);
439	0	decomp_chars[decomp_size] = '\0';
440	0	Assert(decomp_size == current_size);
441
442		/* Leave if there is nothing to decompose */
443	0	if (decomp_size == 0)
444	0	return decomp_chars;
445
446		/*
447		* Now apply canonical ordering.
448		*/
449	0	for (count = 1; count < decomp_size; count++)
450	0	{
451	0	pg_wchar prev = decomp_chars[count - 1];
452	0	pg_wchar next = decomp_chars[count];
453	0	pg_wchar tmp;
454	0	const uint8 prevClass = get_canonical_class(prev);
455	0	const uint8 nextClass = get_canonical_class(next);
456
457		/*
458		* Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
459		* annex 4, a sequence of two adjacent characters in a string is an
460		* exchangeable pair if the combining class (from the Unicode
461		* Character Database) for the first character is greater than the
462		* combining class for the second, and the second is not a starter. A
463		* character is a starter if its combining class is 0.
464		*/
465	0	if (prevClass == 0 \|\| nextClass == 0)
466	0	continue;
467
468	0	if (prevClass <= nextClass)
469	0	continue;
470
471		/* exchange can happen */
472	0	tmp = decomp_chars[count - 1];
473	0	decomp_chars[count - 1] = decomp_chars[count];
474	0	decomp_chars[count] = tmp;
475
476		/* backtrack to check again */
477	0	if (count > 1)
478	0	count -= 2;
479	0	}
480
481	0	if (!recompose)
482	0	return decomp_chars;
483
484		/*
485		* The last phase of NFC and NFKC is the recomposition of the reordered
486		* Unicode string using combining classes. The recomposed string cannot be
487		* longer than the decomposed one, so make the allocation of the output
488		* string based on that assumption.
489		*/
490	0	recomp_chars = (pg_wchar ) ALLOC((decomp_size + 1) sizeof(pg_wchar));
491	0	if (!recomp_chars)
492	0	{
493	0	FREE(decomp_chars);
494	0	return NULL;
495	0	}
496
497	0	last_class = -1; /* this eliminates a special check */
498	0	starter_pos = 0;
499	0	target_pos = 1;
500	0	starter_ch = recomp_chars[0] = decomp_chars[0];
501
502	0	for (count = 1; count < decomp_size; count++)
503	0	{
504	0	pg_wchar ch = decomp_chars[count];
505	0	int ch_class = get_canonical_class(ch);
506	0	pg_wchar composite;
507
508	0	if (last_class < ch_class &&
509	0	recompose_code(starter_ch, ch, &composite))
510	0	{
511	0	recomp_chars[starter_pos] = composite;
512	0	starter_ch = composite;
513	0	}
514	0	else if (ch_class == 0)
515	0	{
516	0	starter_pos = target_pos;
517	0	starter_ch = ch;
518	0	last_class = -1;
519	0	recomp_chars[target_pos++] = ch;
520	0	}
521	0	else
522	0	{
523	0	last_class = ch_class;
524	0	recomp_chars[target_pos++] = ch;
525	0	}
526	0	}
527	0	recomp_chars[target_pos] = (pg_wchar) '\0';
528
529	0	FREE(decomp_chars);
530
531	0	return recomp_chars;
532	0	}
533
534		/*
535		* Normalization "quick check" algorithm; see
536		* <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
537		*/
538
539		/* We only need this in the backend. */
540		#ifndef FRONTEND
541
542		static const pg_unicode_normprops *
543		qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
544	0	{
545	0	int h;
546	0	uint32 hashkey;
547
548		/*
549		* Compute the hash function. The hash key is the codepoint with the bytes
550		* in network order.
551		*/
552	0	hashkey = pg_hton32(ch);
553	0	h = norminfo->hash(&hashkey);
554
555		/* An out-of-range result implies no match */
556	0	if (h < 0 \|\| h >= norminfo->num_normprops)
557	0	return NULL;
558
559		/*
560		* Since it's a perfect hash, we need only match to the specific codepoint
561		* it identifies.
562		*/
563	0	if (ch != norminfo->normprops[h].codepoint)
564	0	return NULL;
565
566		/* Success! */
567	0	return &norminfo->normprops[h];
568	0	}
569
570		/*
571		* Look up the normalization quick check character property
572		*/
573		static UnicodeNormalizationQC
574		qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
575	0	{
576	0	const pg_unicode_normprops *found = NULL;
577
578	0	switch (form)
579	0	{
580	0	case UNICODE_NFC:
581	0	found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
582	0	break;
583	0	case UNICODE_NFKC:
584	0	found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
585	0	break;
586	0	default:
587	0	Assert(false);
588	0	break;
589	0	}
590
591	0	if (found)
592	0	return found->quickcheck;
593	0	else
594	0	return UNICODE_NORM_QC_YES;
595	0	}
596
597		UnicodeNormalizationQC
598		unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
599	0	{
600	0	uint8 lastCanonicalClass = 0;
601	0	UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
602
603		/*
604		* For the "D" forms, we don't run the quickcheck. We don't include the
605		* lookup tables for those because they are huge, checking for these
606		* particular forms is less common, and running the slow path is faster
607		* for the "D" forms than the "C" forms because you don't need to
608		* recompose, which is slow.
609		*/
610	0	if (form == UNICODE_NFD \|\| form == UNICODE_NFKD)
611	0	return UNICODE_NORM_QC_MAYBE;
612
613	0	for (const pg_wchar p = input; p; p++)
614	0	{
615	0	pg_wchar ch = *p;
616	0	uint8 canonicalClass;
617	0	UnicodeNormalizationQC check;
618
619	0	canonicalClass = get_canonical_class(ch);
620	0	if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
621	0	return UNICODE_NORM_QC_NO;
622
623	0	check = qc_is_allowed(form, ch);
624	0	if (check == UNICODE_NORM_QC_NO)
625	0	return UNICODE_NORM_QC_NO;
626	0	else if (check == UNICODE_NORM_QC_MAYBE)
627	0	result = UNICODE_NORM_QC_MAYBE;
628
629	0	lastCanonicalClass = canonicalClass;
630	0	}
631	0	return result;
632	0	}
633
634		#endif /* !FRONTEND */