Coverage Report

Created: 2025-06-13 06:06

/src/postgres/src/common/unicode_case.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 * unicode_case.c
3
 *    Unicode case mapping and case conversion.
4
 *
5
 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
6
 *
7
 * IDENTIFICATION
8
 *    src/common/unicode_case.c
9
 *
10
 *-------------------------------------------------------------------------
11
 */
12
#ifndef FRONTEND
13
#include "postgres.h"
14
#else
15
#include "postgres_fe.h"
16
#endif
17
18
#include "common/unicode_case.h"
19
#include "common/unicode_case_table.h"
20
#include "common/unicode_category.h"
21
#include "mb/pg_wchar.h"
22
23
/*
 * Outcome of a case-mapping lookup for a single code point.
 */
enum CaseMapResult
{
  CASEMAP_SELF = 0,     /* no mapping found; copy the character as-is */
  CASEMAP_SIMPLE = 1,   /* mapped to a single code point */
  CASEMAP_SPECIAL = 2,  /* mapped through a special-casing entry */
};
29
30
/*
31
 * Map for each case kind.
32
 */
33
static const pg_wchar *const casekind_map[NCaseKind] =
34
{
35
  [CaseLower] = case_map_lower,
36
  [CaseTitle] = case_map_title,
37
  [CaseUpper] = case_map_upper,
38
  [CaseFold] = case_map_fold,
39
};
40
41
static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
42
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
43
               CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
44
               void *wbstate);
45
static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
46
                  const char *src, size_t srclen, size_t srcoff,
47
                  pg_wchar *simple, const pg_wchar **special);
48
49
pg_wchar
50
unicode_lowercase_simple(pg_wchar code)
51
0
{
52
0
  pg_wchar  cp = find_case_map(code, case_map_lower);
53
54
0
  return cp != 0 ? cp : code;
55
0
}
56
57
pg_wchar
58
unicode_titlecase_simple(pg_wchar code)
59
0
{
60
0
  pg_wchar  cp = find_case_map(code, case_map_title);
61
62
0
  return cp != 0 ? cp : code;
63
0
}
64
65
pg_wchar
66
unicode_uppercase_simple(pg_wchar code)
67
0
{
68
0
  pg_wchar  cp = find_case_map(code, case_map_upper);
69
70
0
  return cp != 0 ? cp : code;
71
0
}
72
73
pg_wchar
74
unicode_casefold_simple(pg_wchar code)
75
0
{
76
0
  pg_wchar  cp = find_case_map(code, case_map_fold);
77
78
0
  return cp != 0 ? cp : code;
79
0
}
80
81
/*
82
 * unicode_strlower()
83
 *
84
 * Convert src to lowercase, and return the result length (not including
85
 * terminating NUL).
86
 *
87
 * String src must be encoded in UTF-8. If srclen < 0, src must be
88
 * NUL-terminated.
89
 *
90
 * Result string is stored in dst, truncating if larger than dstsize. If
91
 * dstsize is greater than the result length, dst will be NUL-terminated;
92
 * otherwise not.
93
 *
94
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
95
 * required buffer size before allocating.
96
 *
97
 * If full is true, use special case mappings if available and if the
98
 * conditions are satisfied.
99
 */
100
size_t
101
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
102
         bool full)
103
0
{
104
0
  return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
105
0
            NULL);
106
0
}
107
108
/*
109
 * unicode_strtitle()
110
 *
111
 * Convert src to titlecase, and return the result length (not including
112
 * terminating NUL).
113
 *
114
 * String src must be encoded in UTF-8. If srclen < 0, src must be
115
 * NUL-terminated.
116
 *
117
 * Result string is stored in dst, truncating if larger than dstsize. If
118
 * dstsize is greater than the result length, dst will be NUL-terminated;
119
 * otherwise not.
120
 *
121
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
122
 * required buffer size before allocating.
123
 *
124
 * If full is true, use special case mappings if available and if the
125
 * conditions are satisfied. Otherwise, use only simple mappings and use
126
 * uppercase instead of titlecase.
127
 *
128
 * Titlecasing requires knowledge about word boundaries, which is provided by
129
 * the callback wbnext. A word boundary is the offset of the start of a word
130
 * or the offset of the character immediately following a word.
131
 *
132
 * The caller is expected to initialize and free the callback state
133
 * wbstate. The callback should first return offset 0 for the first boundary;
134
 * then the offset of each subsequent word boundary; then the total length of
135
 * the string to indicate the final boundary.
136
 */
137
size_t
138
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
139
         bool full, WordBoundaryNext wbnext, void *wbstate)
140
0
{
141
0
  return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
142
0
            wbstate);
143
0
}
144
145
/*
146
 * unicode_strupper()
147
 *
148
 * Convert src to uppercase, and return the result length (not including
149
 * terminating NUL).
150
 *
151
 * String src must be encoded in UTF-8. If srclen < 0, src must be
152
 * NUL-terminated.
153
 *
154
 * Result string is stored in dst, truncating if larger than dstsize. If
155
 * dstsize is greater than the result length, dst will be NUL-terminated;
156
 * otherwise not.
157
 *
158
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
159
 * required buffer size before allocating.
160
 *
161
 * If full is true, use special case mappings if available and if the
162
 * conditions are satisfied.
163
 */
164
size_t
165
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
166
         bool full)
167
0
{
168
0
  return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
169
0
            NULL);
170
0
}
171
172
/*
173
 * unicode_strfold()
174
 *
175
 * Case fold src, and return the result length (not including terminating
176
 * NUL).
177
 *
178
 * String src must be encoded in UTF-8. If srclen < 0, src must be
179
 * NUL-terminated.
180
 *
181
 * Result string is stored in dst, truncating if larger than dstsize. If
182
 * dstsize is greater than the result length, dst will be NUL-terminated;
183
 * otherwise not.
184
 *
185
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
186
 * required buffer size before allocating.
187
 */
188
size_t
189
unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
190
        bool full)
191
0
{
192
0
  return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
193
0
            NULL);
194
0
}
195
196
/*
197
 * Implement Unicode Default Case Conversion algorithm.
198
 *
199
 * If str_casekind is CaseLower or CaseUpper, map each character in the string
200
 * for which a mapping is available.
201
 *
202
 * If str_casekind is CaseTitle, maps characters found on a word boundary to
203
 * titlecase (or uppercase if full is false) and other characters to
204
 * lowercase. NB: does not currently implement the Unicode behavior in which
205
 * the word boundary is adjusted to the next Cased character. That behavior
206
 * could be implemented as an option, but it doesn't match the default
207
 * behavior of ICU, nor does it match the documented behavior of INITCAP().
208
 *
209
 * If full is true, use special mappings for relevant characters, which can
210
 * map a single codepoint to multiple codepoints, or depend on conditions.
211
 */
212
static size_t
213
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
214
       CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
215
       void *wbstate)
216
0
{
217
  /* character CaseKind varies while titlecasing */
218
0
  CaseKind  chr_casekind = str_casekind;
219
0
  size_t    srcoff = 0;
220
0
  size_t    result_len = 0;
221
0
  size_t    boundary = 0;
222
223
0
  Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
224
0
       (str_casekind != CaseTitle && !wbnext && !wbstate));
225
226
0
  if (str_casekind == CaseTitle)
227
0
  {
228
0
    boundary = wbnext(wbstate);
229
0
    Assert(boundary == 0);  /* start of text is always a boundary */
230
0
  }
231
232
0
  while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
233
0
  {
234
0
    pg_wchar  u1 = utf8_to_unicode((unsigned char *) src + srcoff);
235
0
    int     u1len = unicode_utf8len(u1);
236
0
    pg_wchar  simple = 0;
237
0
    const pg_wchar *special = NULL;
238
0
    enum CaseMapResult casemap_result;
239
240
0
    if (str_casekind == CaseTitle)
241
0
    {
242
0
      if (srcoff == boundary)
243
0
      {
244
0
        chr_casekind = full ? CaseTitle : CaseUpper;
245
0
        boundary = wbnext(wbstate);
246
0
      }
247
0
      else
248
0
        chr_casekind = CaseLower;
249
0
    }
250
251
0
    casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
252
0
                 &simple, &special);
253
254
0
    switch (casemap_result)
255
0
    {
256
0
      case CASEMAP_SELF:
257
        /* no mapping; copy bytes from src */
258
0
        Assert(simple == 0);
259
0
        Assert(special == NULL);
260
0
        if (result_len + u1len <= dstsize)
261
0
          memcpy(dst + result_len, src + srcoff, u1len);
262
263
0
        result_len += u1len;
264
0
        break;
265
0
      case CASEMAP_SIMPLE:
266
0
        {
267
          /* replace with single character */
268
0
          pg_wchar  u2 = simple;
269
0
          pg_wchar  u2len = unicode_utf8len(u2);
270
271
0
          Assert(special == NULL);
272
0
          if (result_len + u2len <= dstsize)
273
0
            unicode_to_utf8(u2, (unsigned char *) dst + result_len);
274
275
0
          result_len += u2len;
276
0
        }
277
0
        break;
278
0
      case CASEMAP_SPECIAL:
279
        /* replace with up to MAX_CASE_EXPANSION characters */
280
0
        Assert(simple == 0);
281
0
        for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
282
0
        {
283
0
          pg_wchar  u2 = special[i];
284
0
          size_t    u2len = unicode_utf8len(u2);
285
286
0
          if (result_len + u2len <= dstsize)
287
0
            unicode_to_utf8(u2, (unsigned char *) dst + result_len);
288
289
0
          result_len += u2len;
290
0
        }
291
0
        break;
292
0
    }
293
294
0
    srcoff += u1len;
295
0
  }
296
297
0
  if (result_len < dstsize)
298
0
    dst[result_len] = '\0';
299
300
0
  return result_len;
301
0
}
302
303
/*
304
 * Check that the condition matches Final_Sigma, described in Unicode Table
305
 * 3-17. The character at the given offset must be directly preceded by a
306
 * Cased character, and must not be directly followed by a Cased character.
307
 *
308
 * Case_Ignorable characters are ignored. NB: some characters may be both
309
 * Cased and Case_Ignorable, in which case they are ignored.
310
 */
311
static bool
312
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
313
0
{
314
  /* the start of the string is not preceded by a Cased character */
315
0
  if (offset == 0)
316
0
    return false;
317
318
  /* iterate backwards, looking for Cased character */
319
0
  for (int i = offset - 1; i >= 0; i--)
320
0
  {
321
0
    if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
322
0
    {
323
0
      pg_wchar  curr = utf8_to_unicode(str + i);
324
325
0
      if (pg_u_prop_case_ignorable(curr))
326
0
        continue;
327
0
      else if (pg_u_prop_cased(curr))
328
0
        break;
329
0
      else
330
0
        return false;
331
0
    }
332
0
    else if ((str[i] & 0xC0) == 0x80)
333
0
      continue;
334
335
0
    Assert(false);      /* invalid UTF-8 */
336
0
  }
337
338
  /* end of string is not followed by a Cased character */
339
0
  if (offset == len)
340
0
    return true;
341
342
  /* iterate forwards, looking for Cased character */
343
0
  for (int i = offset + 1; i < len && str[i] != '\0'; i++)
344
0
  {
345
0
    if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
346
0
    {
347
0
      pg_wchar  curr = utf8_to_unicode(str + i);
348
349
0
      if (pg_u_prop_case_ignorable(curr))
350
0
        continue;
351
0
      else if (pg_u_prop_cased(curr))
352
0
        return false;
353
0
      else
354
0
        break;
355
0
    }
356
0
    else if ((str[i] & 0xC0) == 0x80)
357
0
      continue;
358
359
0
    Assert(false);      /* invalid UTF-8 */
360
0
  }
361
362
0
  return true;
363
0
}
364
365
/*
366
 * Unicode allows for special casing to be applied only under certain
367
 * circumstances. The only currently-supported condition is Final_Sigma.
368
 */
369
static bool
370
check_special_conditions(int conditions, const char *str, size_t len,
371
             size_t offset)
372
0
{
373
0
  if (conditions == 0)
374
0
    return true;
375
0
  else if (conditions == PG_U_FINAL_SIGMA)
376
0
    return check_final_sigma((unsigned char *) str, len, offset);
377
378
  /* no other conditions supported */
379
0
  Assert(false);
380
0
  return false;
381
0
}
382
383
/*
384
 * Map the given character to the requested case.
385
 *
386
 * If full is true, and a special case mapping is found and the conditions are
387
 * met, 'special' is set to the mapping result (which is an array of up to
388
 * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
389
 *
390
 * Otherwise, search for a simple mapping, and if found, set 'simple' to the
391
 * result and return CASEMAP_SIMPLE.
392
 *
393
 * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
394
 * character without modification.
395
 */
396
static enum CaseMapResult
397
casemap(pg_wchar u1, CaseKind casekind, bool full,
398
    const char *src, size_t srclen, size_t srcoff,
399
    pg_wchar *simple, const pg_wchar **special)
400
0
{
401
0
  uint16    idx;
402
403
  /* Fast path for codepoints < 0x80 */
404
0
  if (u1 < 0x80)
405
0
  {
406
    /*
407
     * The first elements in all tables are reserved as 0 (as NULL). The
408
     * data starts at index 1, not 0.
409
     */
410
0
    *simple = casekind_map[casekind][u1 + 1];
411
412
0
    return CASEMAP_SIMPLE;
413
0
  }
414
415
0
  idx = case_index(u1);
416
417
0
  if (idx == 0)
418
0
    return CASEMAP_SELF;
419
420
0
  if (full && case_map_special[idx] &&
421
0
    check_special_conditions(special_case[case_map_special[idx]].conditions,
422
0
                 src, srclen, srcoff))
423
0
  {
424
0
    *special = special_case[case_map_special[idx]].map[casekind];
425
0
    return CASEMAP_SPECIAL;
426
0
  }
427
428
0
  *simple = casekind_map[casekind][idx];
429
430
0
  return CASEMAP_SIMPLE;
431
0
}
432
433
/*
434
 * Find entry in simple case map.
435
 * If the entry does not exist, 0 will be returned.
436
 */
437
static pg_wchar
438
find_case_map(pg_wchar ucs, const pg_wchar *map)
439
0
{
440
  /* Fast path for codepoints < 0x80 */
441
0
  if (ucs < 0x80)
442
    /* The first elements in all tables are reserved as 0 (as NULL). */
443
0
    return map[ucs + 1];
444
0
  return map[case_index(ucs)];
445
0
}