/src/postgres/src/common/unicode_case.c

Source (jump to first uncovered line)
/*-------------------------------------------------------------------------
 * unicode_case.c
 *    Unicode case mapping and case conversion.
 *
 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *    src/common/unicode_case.c
 *
 *-------------------------------------------------------------------------
 */
#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#include "common/unicode_case.h"
#include "common/unicode_case_table.h"
#include "common/unicode_category.h"
#include "mb/pg_wchar.h"

/*
 * Outcome of mapping a single code point with casemap(). It tells the caller
 * which of casemap()'s output arguments, if any, holds the replacement.
 */
enum CaseMapResult
{
  CASEMAP_SELF,     /* no mapping found; copy the character unchanged */
  CASEMAP_SIMPLE,   /* 'simple' holds a single replacement code point */
  CASEMAP_SPECIAL,  /* 'special' points to an array of replacement code
                     * points (up to MAX_CASE_EXPANSION) */
};

/*
 * Simple case mapping table for each case kind, indexed by CaseKind.
 *
 * Each table is itself indexed either by (code point + 1) for code points
 * below 0x80, or by the value returned by case_index() for other code
 * points; see casemap() and find_case_map().
 */
static const pg_wchar *const casekind_map[NCaseKind] =
{
  [CaseLower] = case_map_lower,
  [CaseTitle] = case_map_title,
  [CaseUpper] = case_map_upper,
  [CaseFold] = case_map_fold,
};

/*
 * Forward declarations of file-local helpers: the simple-table lookup, the
 * shared string conversion loop, and the per-character mapper.
 */
static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
               CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
               void *wbstate);
static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
                  const char *src, size_t srclen, size_t srcoff,
                  pg_wchar *simple, const pg_wchar **special);

/*
 * unicode_lowercase_simple()
 *
 * Return the simple lowercase mapping of code. If the lowercase table has no
 * entry for it (the lookup yields zero), code itself is returned.
 */
pg_wchar
unicode_lowercase_simple(pg_wchar code)
{
  pg_wchar  mapped;

  mapped = find_case_map(code, case_map_lower);
  if (mapped == 0)
    return code;

  return mapped;
}

/*
 * unicode_titlecase_simple()
 *
 * Return the simple titlecase mapping of code. If the titlecase table has no
 * entry for it (the lookup yields zero), code itself is returned.
 */
pg_wchar
unicode_titlecase_simple(pg_wchar code)
{
  pg_wchar  mapped;

  mapped = find_case_map(code, case_map_title);
  if (mapped == 0)
    return code;

  return mapped;
}

/*
 * unicode_uppercase_simple()
 *
 * Return the simple uppercase mapping of code. If the uppercase table has no
 * entry for it (the lookup yields zero), code itself is returned.
 */
pg_wchar
unicode_uppercase_simple(pg_wchar code)
{
  pg_wchar  mapped;

  mapped = find_case_map(code, case_map_upper);
  if (mapped == 0)
    return code;

  return mapped;
}

/*
 * unicode_casefold_simple()
 *
 * Return the simple case folding of code. If the case-fold table has no
 * entry for it (the lookup yields zero), code itself is returned.
 */
pg_wchar
unicode_casefold_simple(pg_wchar code)
{
  pg_wchar  mapped;

  mapped = find_case_map(code, case_map_fold);
  if (mapped == 0)
    return code;

  return mapped;
}

/*
 * unicode_strlower()
 *
 * Lowercase the UTF-8 string src and report how many bytes the complete
 * result occupies, not counting a terminating NUL.
 *
 * src:     UTF-8 input. When srclen is negative, src must be NUL-terminated.
 * dst:     output buffer of dstsize bytes. Output that does not fit is
 *          dropped. A terminating NUL is written only when dstsize exceeds
 *          the result length. dst may be NULL when dstsize is zero, which
 *          lets a caller measure the required size before allocating.
 * full:    when true, special case mappings are applied where one exists
 *          and its conditions hold.
 */
size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
         bool full)
{
  size_t    needed;

  /* no word-boundary callback: only titlecasing uses one */
  needed = convert_case(dst, dstsize, src, srclen, CaseLower, full,
              NULL, NULL);

  return needed;
}

/*
 * unicode_strtitle()
 *
 * Titlecase the UTF-8 string src and report how many bytes the complete
 * result occupies, not counting a terminating NUL.
 *
 * src:     UTF-8 input. When srclen is negative, src must be NUL-terminated.
 * dst:     output buffer of dstsize bytes. Output that does not fit is
 *          dropped. A terminating NUL is written only when dstsize exceeds
 *          the result length. dst may be NULL when dstsize is zero, which
 *          lets a caller measure the required size before allocating.
 * full:    when true, special case mappings are applied where one exists
 *          and its conditions hold. When false, only simple mappings are
 *          used, and characters at word boundaries are uppercased rather
 *          than titlecased.
 * wbnext:  callback that reports word boundaries. A word boundary is the
 *          offset where a word starts, or the offset of the character right
 *          after a word. The first call should return 0, later calls each
 *          following boundary in turn, and finally the total length of the
 *          string as the last boundary.
 * wbstate: opaque state handed to wbnext; the caller sets it up and
 *          releases it.
 */
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
         bool full, WordBoundaryNext wbnext, void *wbstate)
{
  size_t    needed;

  needed = convert_case(dst, dstsize, src, srclen, CaseTitle, full,
              wbnext, wbstate);

  return needed;
}

/*
 * unicode_strupper()
 *
 * Uppercase the UTF-8 string src and report how many bytes the complete
 * result occupies, not counting a terminating NUL.
 *
 * src:     UTF-8 input. When srclen is negative, src must be NUL-terminated.
 * dst:     output buffer of dstsize bytes. Output that does not fit is
 *          dropped. A terminating NUL is written only when dstsize exceeds
 *          the result length. dst may be NULL when dstsize is zero, which
 *          lets a caller measure the required size before allocating.
 * full:    when true, special case mappings are applied where one exists
 *          and its conditions hold.
 */
size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
         bool full)
{
  size_t    needed;

  /* no word-boundary callback: only titlecasing uses one */
  needed = convert_case(dst, dstsize, src, srclen, CaseUpper, full,
              NULL, NULL);

  return needed;
}

/*
 * unicode_strfold()
 *
 * Case fold the UTF-8 string src and report how many bytes the complete
 * result occupies, not counting a terminating NUL.
 *
 * src:     UTF-8 input. When srclen is negative, src must be NUL-terminated.
 * dst:     output buffer of dstsize bytes. Output that does not fit is
 *          dropped. A terminating NUL is written only when dstsize exceeds
 *          the result length. dst may be NULL when dstsize is zero, which
 *          lets a caller measure the required size before allocating.
 * full:    passed through to the shared conversion routine, which applies
 *          special case mappings when it is true.
 */
size_t
unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
        bool full)
{
  size_t    needed;

  /* no word-boundary callback: only titlecasing uses one */
  needed = convert_case(dst, dstsize, src, srclen, CaseFold, full,
              NULL, NULL);

  return needed;
}

/*
 * Shared worker that implements the Unicode Default Case Conversion
 * algorithm for all of the unicode_str*() entry points.
 *
 * For any str_casekind other than CaseTitle, every character is mapped with
 * that same case kind.
 *
 * For CaseTitle, a character sitting exactly on a word boundary (as reported
 * by wbnext) is mapped to titlecase, or to uppercase when full is false;
 * every other character is mapped to lowercase. NB: the Unicode behavior of
 * moving the word boundary forward to the next Cased character is not
 * implemented. It could be offered as an option, but it matches neither the
 * default behavior of ICU nor the documented behavior of INITCAP().
 *
 * When full is true, special mappings are considered; these may expand one
 * code point into several, or apply only under certain conditions.
 *
 * Returns the byte length of the complete result. Bytes are only stored into
 * dst while they fit within dstsize, and a terminating NUL is added only if
 * the result is shorter than dstsize.
 */
static size_t
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
       CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
       void *wbstate)
{
  size_t    boundary = 0; /* next word boundary (titlecasing only) */
  size_t    in_pos = 0;   /* byte offset of current character in src */
  size_t    out_len = 0;  /* length of the complete result so far */
  CaseKind  cur_kind = str_casekind;  /* varies while titlecasing */

  Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
       (str_casekind != CaseTitle && !wbnext && !wbstate));

  if (str_casekind == CaseTitle)
  {
    boundary = wbnext(wbstate);
    Assert(boundary == 0);  /* start of text is always a boundary */
  }

  for (;;)
  {
    pg_wchar  in_cp;
    int     in_bytes;
    pg_wchar  simple = 0;
    const pg_wchar *special = NULL;
    enum CaseMapResult outcome;

    /* stop at the explicit length, if one was given, or at a NUL byte */
    if (srclen >= 0 && in_pos >= (size_t) srclen)
      break;
    if (src[in_pos] == '\0')
      break;

    in_cp = utf8_to_unicode((unsigned char *) src + in_pos);
    in_bytes = unicode_utf8len(in_cp);

    if (str_casekind == CaseTitle)
    {
      if (in_pos != boundary)
        cur_kind = CaseLower;
      else
      {
        /* on a boundary: capitalize, then fetch the next boundary */
        cur_kind = full ? CaseTitle : CaseUpper;
        boundary = wbnext(wbstate);
      }
    }

    outcome = casemap(in_cp, cur_kind, full, src, srclen, in_pos,
              &simple, &special);

    if (outcome == CASEMAP_SELF)
    {
      /* unmapped: the source bytes are carried over verbatim */
      Assert(simple == 0);
      Assert(special == NULL);
      if (out_len + in_bytes <= dstsize)
        memcpy(dst + out_len, src + in_pos, in_bytes);

      out_len += in_bytes;
    }
    else if (outcome == CASEMAP_SIMPLE)
    {
      /* one code point in, one code point out */
      size_t    out_bytes = unicode_utf8len(simple);

      Assert(special == NULL);
      if (out_len + out_bytes <= dstsize)
        unicode_to_utf8(simple, (unsigned char *) dst + out_len);

      out_len += out_bytes;
    }
    else if (outcome == CASEMAP_SPECIAL)
    {
      /* zero-terminated expansion of up to MAX_CASE_EXPANSION entries */
      int     k = 0;

      Assert(simple == 0);
      while (k < MAX_CASE_EXPANSION && special[k])
      {
        pg_wchar  out_cp = special[k];
        size_t    out_bytes = unicode_utf8len(out_cp);

        if (out_len + out_bytes <= dstsize)
          unicode_to_utf8(out_cp, (unsigned char *) dst + out_len);

        out_len += out_bytes;
        k++;
      }
    }

    in_pos += in_bytes;
  }

  /* terminate only when there is room left after the result */
  if (out_len < dstsize)
    dst[out_len] = '\0';

  return out_len;
}

/*
 * Check that the condition matches Final_Sigma, described in Unicode Table
 * 3-17: the character at the given offset must be preceded by a Cased
 * character and must not be followed by a Cased character, where any run of
 * Case_Ignorable characters in between is skipped over. NB: some characters
 * are both Cased and Case_Ignorable; those are treated as Case_Ignorable and
 * skipped.
 *
 * str:    UTF-8 text (assumed valid).
 * len:    length of str in bytes. The forward scan also stops at a NUL
 *         byte, so a caller working on a NUL-terminated string may pass a
 *         value larger than the real length.
 * offset: byte offset of the character being tested.
 *
 * Returns true if the Final_Sigma condition holds at offset.
 *
 * A Cased character must actually be found before the offset: if everything
 * in front of it is Case_Ignorable (e.g. ".Σ"), the condition is not met.
 */
static bool
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
  bool    found_cased = false;
  size_t    pos;

  /* the start of the string is not preceded by a Cased character */
  if (offset == 0)
    return false;

  /* iterate backwards, looking for Cased character */
  pos = offset;
  while (pos > 0)
  {
    pg_wchar  curr;

    pos--;

    /* continuation byte (10xxxxxx): keep going to the lead byte */
    if ((str[pos] & 0xC0) == 0x80)
      continue;

    /* anything else is an ASCII byte or the lead byte of a sequence */
    curr = utf8_to_unicode(str + pos);

    if (pg_u_prop_case_ignorable(curr))
      continue;

    if (!pg_u_prop_cased(curr))
      return false;

    found_cased = true;
    break;
  }

  /* ran off the start of the string seeing only Case_Ignorable characters */
  if (!found_cased)
    return false;

  /* end of string is not followed by a Cased character */
  if (offset == len)
    return true;

  /*
   * Iterate forwards, looking for Cased character. Starting at offset + 1
   * is fine even when the character at offset is multibyte, because its
   * remaining bytes are continuation bytes and get skipped.
   */
  for (pos = offset + 1; pos < len && str[pos] != '\0'; pos++)
  {
    pg_wchar  curr;

    if ((str[pos] & 0xC0) == 0x80)
      continue;

    curr = utf8_to_unicode(str + pos);

    if (pg_u_prop_case_ignorable(curr))
      continue;

    /* first character that is not ignorable decides the outcome */
    return !pg_u_prop_cased(curr);
  }

  return true;
}

/*
 * Decide whether a special case mapping's conditions are satisfied for the
 * character at the given byte offset of str.
 *
 * Unicode lets a special casing rule apply only in certain contexts. A
 * conditions value of zero means the rule is unconditional; the only other
 * value handled here is PG_U_FINAL_SIGMA. Anything else trips an assertion
 * and is reported as not satisfied.
 */
static bool
check_special_conditions(int conditions, const char *str, size_t len,
             size_t offset)
{
  bool    satisfied;

  if (conditions == 0)
  {
    /* unconditional mapping */
    satisfied = true;
  }
  else if (conditions == PG_U_FINAL_SIGMA)
  {
    satisfied = check_final_sigma((const unsigned char *) str, len,
                    offset);
  }
  else
  {
    /* no other conditions supported */
    Assert(false);
    satisfied = false;
  }

  return satisfied;
}

/*
 * Map one code point, u1, to the requested case kind.
 *
 * The result tells the caller where to find the replacement:
 *
 * CASEMAP_SPECIAL: full is true, u1 has a special case mapping, and that
 * mapping's conditions hold at srcoff within src. '*special' is set to the
 * mapping, an array of up to MAX_CASE_EXPANSION code points.
 *
 * CASEMAP_SIMPLE: '*simple' is set to the entry taken from the simple
 * mapping table for casekind. Code points below 0x80 always take this route.
 *
 * CASEMAP_SELF: u1 has no case table index; neither output is touched and
 * the caller should copy the character unchanged.
 */
static enum CaseMapResult
casemap(pg_wchar u1, CaseKind casekind, bool full,
    const char *src, size_t srclen, size_t srcoff,
    pg_wchar *simple, const pg_wchar **special)
{
  const pg_wchar *table = casekind_map[casekind];
  uint16    slot;

  /*
   * Fast path for codepoints < 0x80. Slot 0 of every table is reserved as
   * zero (as NULL), so the data for code point N lives at slot N + 1.
   */
  if (u1 < 0x80)
  {
    *simple = table[u1 + 1];
    return CASEMAP_SIMPLE;
  }

  slot = case_index(u1);
  if (slot == 0)
    return CASEMAP_SELF;

  /* a special mapping is preferred, but only if its conditions are met */
  if (full && case_map_special[slot])
  {
    if (check_special_conditions(special_case[case_map_special[slot]].conditions,
                   src, srclen, srcoff))
    {
      *special = special_case[case_map_special[slot]].map[casekind];
      return CASEMAP_SPECIAL;
    }
  }

  *simple = table[slot];
  return CASEMAP_SIMPLE;
}

/*
 * Look up ucs in one of the simple case mapping tables.
 *
 * Returns the table entry, which is 0 when no mapping exists for ucs.
 */
static pg_wchar
find_case_map(pg_wchar ucs, const pg_wchar *map)
{
  size_t    slot;

  /*
   * Fast path for codepoints < 0x80: slot 0 of every table is reserved as
   * zero (as NULL), so these code points are stored at ucs + 1.
   */
  if (ucs < 0x80)
    slot = (size_t) ucs + 1;
  else
    slot = case_index(ucs);

  return map[slot];
}

Coverage Report

Created: 2025-06-13 06:06

Line	Count	Source (jump to first uncovered line)
1		/*-------------------------------------------------------------------------
2		* unicode_case.c
3		* Unicode case mapping and case conversion.
4		*
5		* Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
6		*
7		* IDENTIFICATION
8		* src/common/unicode_case.c
9		*
10		*-------------------------------------------------------------------------
11		*/
12		#ifndef FRONTEND
13		#include "postgres.h"
14		#else
15		#include "postgres_fe.h"
16		#endif
17
18		#include "common/unicode_case.h"
19		#include "common/unicode_case_table.h"
20		#include "common/unicode_category.h"
21		#include "mb/pg_wchar.h"
22
23		enum CaseMapResult
24		{
25		CASEMAP_SELF,
26		CASEMAP_SIMPLE,
27		CASEMAP_SPECIAL,
28		};
29
30		/*
31		* Map for each case kind.
32		*/
33		static const pg_wchar *const casekind_map[NCaseKind] =
34		{
35		[CaseLower] = case_map_lower,
36		[CaseTitle] = case_map_title,
37		[CaseUpper] = case_map_upper,
38		[CaseFold] = case_map_fold,
39		};
40
41		static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
42		static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
43		CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
44		void *wbstate);
45		static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
46		const char *src, size_t srclen, size_t srcoff,
47		pg_wchar *simple, const pg_wchar **special);
48
49		pg_wchar
50		unicode_lowercase_simple(pg_wchar code)
51	0	{
52	0	pg_wchar cp = find_case_map(code, case_map_lower);
53
54	0	return cp != 0 ? cp : code;
55	0	}
56
57		pg_wchar
58		unicode_titlecase_simple(pg_wchar code)
59	0	{
60	0	pg_wchar cp = find_case_map(code, case_map_title);
61
62	0	return cp != 0 ? cp : code;
63	0	}
64
65		pg_wchar
66		unicode_uppercase_simple(pg_wchar code)
67	0	{
68	0	pg_wchar cp = find_case_map(code, case_map_upper);
69
70	0	return cp != 0 ? cp : code;
71	0	}
72
73		pg_wchar
74		unicode_casefold_simple(pg_wchar code)
75	0	{
76	0	pg_wchar cp = find_case_map(code, case_map_fold);
77
78	0	return cp != 0 ? cp : code;
79	0	}
80
81		/*
82		* unicode_strlower()
83		*
84		* Convert src to lowercase, and return the result length (not including
85		* terminating NUL).
86		*
87		* String src must be encoded in UTF-8. If srclen < 0, src must be
88		* NUL-terminated.
89		*
90		* Result string is stored in dst, truncating if larger than dstsize. If
91		* dstsize is greater than the result length, dst will be NUL-terminated;
92		* otherwise not.
93		*
94		* If dstsize is zero, dst may be NULL. This is useful for calculating the
95		* required buffer size before allocating.
96		*
97		* If full is true, use special case mappings if available and if the
98		* conditions are satisfied.
99		*/
100		size_t
101		unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
102		bool full)
103	0	{
104	0	return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
105	0	NULL);
106	0	}
107
108		/*
109		* unicode_strtitle()
110		*
111		* Convert src to titlecase, and return the result length (not including
112		* terminating NUL).
113		*
114		* String src must be encoded in UTF-8. If srclen < 0, src must be
115		* NUL-terminated.
116		*
117		* Result string is stored in dst, truncating if larger than dstsize. If
118		* dstsize is greater than the result length, dst will be NUL-terminated;
119		* otherwise not.
120		*
121		* If dstsize is zero, dst may be NULL. This is useful for calculating the
122		* required buffer size before allocating.
123		*
124		* If full is true, use special case mappings if available and if the
125		* conditions are satisfied. Otherwise, use only simple mappings and use
126		* uppercase instead of titlecase.
127		*
128		* Titlecasing requires knowledge about word boundaries, which is provided by
129		* the callback wbnext. A word boundary is the offset of the start of a word
130		* or the offset of the character immediately following a word.
131		*
132		* The caller is expected to initialize and free the callback state
133		* wbstate. The callback should first return offset 0 for the first boundary;
134		* then the offset of each subsequent word boundary; then the total length of
135		* the string to indicate the final boundary.
136		*/
137		size_t
138		unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
139		bool full, WordBoundaryNext wbnext, void *wbstate)
140	0	{
141	0	return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
142	0	wbstate);
143	0	}
144
145		/*
146		* unicode_strupper()
147		*
148		* Convert src to uppercase, and return the result length (not including
149		* terminating NUL).
150		*
151		* String src must be encoded in UTF-8. If srclen < 0, src must be
152		* NUL-terminated.
153		*
154		* Result string is stored in dst, truncating if larger than dstsize. If
155		* dstsize is greater than the result length, dst will be NUL-terminated;
156		* otherwise not.
157		*
158		* If dstsize is zero, dst may be NULL. This is useful for calculating the
159		* required buffer size before allocating.
160		*
161		* If full is true, use special case mappings if available and if the
162		* conditions are satisfied.
163		*/
164		size_t
165		unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
166		bool full)
167	0	{
168	0	return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
169	0	NULL);
170	0	}
171
172		/*
173		* unicode_strfold()
174		*
175		* Case fold src, and return the result length (not including terminating
176		* NUL).
177		*
178		* String src must be encoded in UTF-8. If srclen < 0, src must be
179		* NUL-terminated.
180		*
181		* Result string is stored in dst, truncating if larger than dstsize. If
182		* dstsize is greater than the result length, dst will be NUL-terminated;
183		* otherwise not.
184		*
185		* If dstsize is zero, dst may be NULL. This is useful for calculating the
186		* required buffer size before allocating.
187		*/
188		size_t
189		unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
190		bool full)
191	0	{
192	0	return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
193	0	NULL);
194	0	}
195
196		/*
197		* Implement Unicode Default Case Conversion algorithm.
198		*
199		* If str_casekind is CaseLower or CaseUpper, map each character in the string
200		* for which a mapping is available.
201		*
202		* If str_casekind is CaseTitle, maps characters found on a word boundary to
203		* titlecase (or uppercase if full is false) and other characters to
204		* lowercase. NB: does not currently implement the Unicode behavior in which
205		* the word boundary is adjusted to the next Cased character. That behavior
206		* could be implemented as an option, but it doesn't match the default
207		* behavior of ICU, nor does it match the documented behavior of INITCAP().
208		*
209		* If full is true, use special mappings for relevant characters, which can
210		* map a single codepoint to multiple codepoints, or depend on conditions.
211		*/
212		static size_t
213		convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
214		CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
215		void *wbstate)
216	0	{
217		/* character CaseKind varies while titlecasing */
218	0	CaseKind chr_casekind = str_casekind;
219	0	size_t srcoff = 0;
220	0	size_t result_len = 0;
221	0	size_t boundary = 0;
222
223	0	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
224	0	(str_casekind != CaseTitle && !wbnext && !wbstate));
225
226	0	if (str_casekind == CaseTitle)
227	0	{
228	0	boundary = wbnext(wbstate);
229	0	Assert(boundary == 0); /* start of text is always a boundary */
230	0	}
231
232	0	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
233	0	{
234	0	pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
235	0	int u1len = unicode_utf8len(u1);
236	0	pg_wchar simple = 0;
237	0	const pg_wchar *special = NULL;
238	0	enum CaseMapResult casemap_result;
239
240	0	if (str_casekind == CaseTitle)
241	0	{
242	0	if (srcoff == boundary)
243	0	{
244	0	chr_casekind = full ? CaseTitle : CaseUpper;
245	0	boundary = wbnext(wbstate);
246	0	}
247	0	else
248	0	chr_casekind = CaseLower;
249	0	}
250
251	0	casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
252	0	&simple, &special);
253
254	0	switch (casemap_result)
255	0	{
256	0	case CASEMAP_SELF:
257		/* no mapping; copy bytes from src */
258	0	Assert(simple == 0);
259	0	Assert(special == NULL);
260	0	if (result_len + u1len <= dstsize)
261	0	memcpy(dst + result_len, src + srcoff, u1len);
262
263	0	result_len += u1len;
264	0	break;
265	0	case CASEMAP_SIMPLE:
266	0	{
267		/* replace with single character */
268	0	pg_wchar u2 = simple;
269	0	pg_wchar u2len = unicode_utf8len(u2);
270
271	0	Assert(special == NULL);
272	0	if (result_len + u2len <= dstsize)
273	0	unicode_to_utf8(u2, (unsigned char *) dst + result_len);
274
275	0	result_len += u2len;
276	0	}
277	0	break;
278	0	case CASEMAP_SPECIAL:
279		/* replace with up to MAX_CASE_EXPANSION characters */
280	0	Assert(simple == 0);
281	0	for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
282	0	{
283	0	pg_wchar u2 = special[i];
284	0	size_t u2len = unicode_utf8len(u2);
285
286	0	if (result_len + u2len <= dstsize)
287	0	unicode_to_utf8(u2, (unsigned char *) dst + result_len);
288
289	0	result_len += u2len;
290	0	}
291	0	break;
292	0	}
293
294	0	srcoff += u1len;
295	0	}
296
297	0	if (result_len < dstsize)
298	0	dst[result_len] = '\0';
299
300	0	return result_len;
301	0	}
302
303		/*
304		* Check that the condition matches Final_Sigma, described in Unicode Table
305		* 3-17. The character at the given offset must be directly preceded by a
306		* Cased character, and must not be directly followed by a Cased character.
307		*
308		* Case_Ignorable characters are ignored. NB: some characters may be both
309		* Cased and Case_Ignorable, in which case they are ignored.
310		*/
311		static bool
312		check_final_sigma(const unsigned char *str, size_t len, size_t offset)
313	0	{
314		/* the start of the string is not preceded by a Cased character */
315	0	if (offset == 0)
316	0	return false;
317
318		/* iterate backwards, looking for Cased character */
319	0	for (int i = offset - 1; i >= 0; i--)
320	0	{
321	0	if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
322	0	{
323	0	pg_wchar curr = utf8_to_unicode(str + i);
324
325	0	if (pg_u_prop_case_ignorable(curr))
326	0	continue;
327	0	else if (pg_u_prop_cased(curr))
328	0	break;
329	0	else
330	0	return false;
331	0	}
332	0	else if ((str[i] & 0xC0) == 0x80)
333	0	continue;
334
335	0	Assert(false); /* invalid UTF-8 */
336	0	}
337
338		/* end of string is not followed by a Cased character */
339	0	if (offset == len)
340	0	return true;
341
342		/* iterate forwards, looking for Cased character */
343	0	for (int i = offset + 1; i < len && str[i] != '\0'; i++)
344	0	{
345	0	if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
346	0	{
347	0	pg_wchar curr = utf8_to_unicode(str + i);
348
349	0	if (pg_u_prop_case_ignorable(curr))
350	0	continue;
351	0	else if (pg_u_prop_cased(curr))
352	0	return false;
353	0	else
354	0	break;
355	0	}
356	0	else if ((str[i] & 0xC0) == 0x80)
357	0	continue;
358
359	0	Assert(false); /* invalid UTF-8 */
360	0	}
361
362	0	return true;
363	0	}
364
365		/*
366		* Unicode allows for special casing to be applied only under certain
367		* circumstances. The only currently-supported condition is Final_Sigma.
368		*/
369		static bool
370		check_special_conditions(int conditions, const char *str, size_t len,
371		size_t offset)
372	0	{
373	0	if (conditions == 0)
374	0	return true;
375	0	else if (conditions == PG_U_FINAL_SIGMA)
376	0	return check_final_sigma((unsigned char *) str, len, offset);
377
378		/* no other conditions supported */
379	0	Assert(false);
380	0	return false;
381	0	}
382
383		/*
384		* Map the given character to the requested case.
385		*
386		* If full is true, and a special case mapping is found and the conditions are
387		* met, 'special' is set to the mapping result (which is an array of up to
388		* MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
389		*
390		* Otherwise, search for a simple mapping, and if found, set 'simple' to the
391		* result and return CASEMAP_SIMPLE.
392		*
393		* If no mapping is found, return CASEMAP_SELF, and the caller should copy the
394		* character without modification.
395		*/
396		static enum CaseMapResult
397		casemap(pg_wchar u1, CaseKind casekind, bool full,
398		const char *src, size_t srclen, size_t srcoff,
399		pg_wchar *simple, const pg_wchar **special)
400	0	{
401	0	uint16 idx;
402
403		/* Fast path for codepoints < 0x80 */
404	0	if (u1 < 0x80)
405	0	{
406		/*
407		* The first elements in all tables are reserved as 0 (as NULL). The
408		* data starts at index 1, not 0.
409		*/
410	0	*simple = casekind_map[casekind][u1 + 1];
411
412	0	return CASEMAP_SIMPLE;
413	0	}
414
415	0	idx = case_index(u1);
416
417	0	if (idx == 0)
418	0	return CASEMAP_SELF;
419
420	0	if (full && case_map_special[idx] &&
421	0	check_special_conditions(special_case[case_map_special[idx]].conditions,
422	0	src, srclen, srcoff))
423	0	{
424	0	*special = special_case[case_map_special[idx]].map[casekind];
425	0	return CASEMAP_SPECIAL;
426	0	}
427
428	0	*simple = casekind_map[casekind][idx];
429
430	0	return CASEMAP_SIMPLE;
431	0	}
432
433		/*
434		* Find entry in simple case map.
435		* If the entry does not exist, 0 will be returned.
436		*/
437		static pg_wchar
438		find_case_map(pg_wchar ucs, const pg_wchar *map)
439	0	{
440		/* Fast path for codepoints < 0x80 */
441	0	if (ucs < 0x80)
442		/* The first elements in all tables are reserved as 0 (as NULL). */
443	0	return map[ucs + 1];
444	0	return map[case_index(ucs)];
445	0	}