/src/postgres/src/common/unicode_case.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * unicode_case.c |
3 | | * Unicode case mapping and case conversion. |
4 | | * |
5 | | * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group |
6 | | * |
7 | | * IDENTIFICATION |
8 | | * src/common/unicode_case.c |
9 | | * |
10 | | *------------------------------------------------------------------------- |
11 | | */ |
12 | | #ifndef FRONTEND |
13 | | #include "postgres.h" |
14 | | #else |
15 | | #include "postgres_fe.h" |
16 | | #endif |
17 | | |
18 | | #include "common/unicode_case.h" |
19 | | #include "common/unicode_case_table.h" |
20 | | #include "common/unicode_category.h" |
21 | | #include "mb/pg_wchar.h" |
22 | | |
/*
 * Outcome of looking up a case mapping for a single code point.
 */
enum CaseMapResult
{
	CASEMAP_SELF = 0,			/* no mapping; character is kept as is */
	CASEMAP_SIMPLE = 1,			/* maps to exactly one code point */
	CASEMAP_SPECIAL = 2			/* maps to a sequence of code points */
};
29 | | |
30 | | /* |
31 | | * Map for each case kind. |
32 | | */ |
33 | | static const pg_wchar *const casekind_map[NCaseKind] = |
34 | | { |
35 | | [CaseLower] = case_map_lower, |
36 | | [CaseTitle] = case_map_title, |
37 | | [CaseUpper] = case_map_upper, |
38 | | [CaseFold] = case_map_fold, |
39 | | }; |
40 | | |
41 | | static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map); |
42 | | static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, |
43 | | CaseKind str_casekind, bool full, WordBoundaryNext wbnext, |
44 | | void *wbstate); |
45 | | static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full, |
46 | | const char *src, size_t srclen, size_t srcoff, |
47 | | pg_wchar *simple, const pg_wchar **special); |
48 | | |
49 | | pg_wchar |
50 | | unicode_lowercase_simple(pg_wchar code) |
51 | 0 | { |
52 | 0 | pg_wchar cp = find_case_map(code, case_map_lower); |
53 | |
|
54 | 0 | return cp != 0 ? cp : code; |
55 | 0 | } |
56 | | |
57 | | pg_wchar |
58 | | unicode_titlecase_simple(pg_wchar code) |
59 | 0 | { |
60 | 0 | pg_wchar cp = find_case_map(code, case_map_title); |
61 | |
|
62 | 0 | return cp != 0 ? cp : code; |
63 | 0 | } |
64 | | |
65 | | pg_wchar |
66 | | unicode_uppercase_simple(pg_wchar code) |
67 | 0 | { |
68 | 0 | pg_wchar cp = find_case_map(code, case_map_upper); |
69 | |
|
70 | 0 | return cp != 0 ? cp : code; |
71 | 0 | } |
72 | | |
73 | | pg_wchar |
74 | | unicode_casefold_simple(pg_wchar code) |
75 | 0 | { |
76 | 0 | pg_wchar cp = find_case_map(code, case_map_fold); |
77 | |
|
78 | 0 | return cp != 0 ? cp : code; |
79 | 0 | } |
80 | | |
81 | | /* |
82 | | * unicode_strlower() |
83 | | * |
84 | | * Convert src to lowercase, and return the result length (not including |
85 | | * terminating NUL). |
86 | | * |
87 | | * String src must be encoded in UTF-8. If srclen < 0, src must be |
88 | | * NUL-terminated. |
89 | | * |
90 | | * Result string is stored in dst, truncating if larger than dstsize. If |
91 | | * dstsize is greater than the result length, dst will be NUL-terminated; |
92 | | * otherwise not. |
93 | | * |
94 | | * If dstsize is zero, dst may be NULL. This is useful for calculating the |
95 | | * required buffer size before allocating. |
96 | | * |
97 | | * If full is true, use special case mappings if available and if the |
98 | | * conditions are satisfied. |
99 | | */ |
100 | | size_t |
101 | | unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, |
102 | | bool full) |
103 | 0 | { |
104 | 0 | return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL, |
105 | 0 | NULL); |
106 | 0 | } |
107 | | |
108 | | /* |
109 | | * unicode_strtitle() |
110 | | * |
111 | | * Convert src to titlecase, and return the result length (not including |
112 | | * terminating NUL). |
113 | | * |
114 | | * String src must be encoded in UTF-8. If srclen < 0, src must be |
115 | | * NUL-terminated. |
116 | | * |
117 | | * Result string is stored in dst, truncating if larger than dstsize. If |
118 | | * dstsize is greater than the result length, dst will be NUL-terminated; |
119 | | * otherwise not. |
120 | | * |
121 | | * If dstsize is zero, dst may be NULL. This is useful for calculating the |
122 | | * required buffer size before allocating. |
123 | | * |
124 | | * If full is true, use special case mappings if available and if the |
125 | | * conditions are satisfied. Otherwise, use only simple mappings and use |
126 | | * uppercase instead of titlecase. |
127 | | * |
128 | | * Titlecasing requires knowledge about word boundaries, which is provided by |
129 | | * the callback wbnext. A word boundary is the offset of the start of a word |
130 | | * or the offset of the character immediately following a word. |
131 | | * |
132 | | * The caller is expected to initialize and free the callback state |
133 | | * wbstate. The callback should first return offset 0 for the first boundary; |
134 | | * then the offset of each subsequent word boundary; then the total length of |
135 | | * the string to indicate the final boundary. |
136 | | */ |
137 | | size_t |
138 | | unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, |
139 | | bool full, WordBoundaryNext wbnext, void *wbstate) |
140 | 0 | { |
141 | 0 | return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext, |
142 | 0 | wbstate); |
143 | 0 | } |
144 | | |
145 | | /* |
146 | | * unicode_strupper() |
147 | | * |
148 | | * Convert src to uppercase, and return the result length (not including |
149 | | * terminating NUL). |
150 | | * |
151 | | * String src must be encoded in UTF-8. If srclen < 0, src must be |
152 | | * NUL-terminated. |
153 | | * |
154 | | * Result string is stored in dst, truncating if larger than dstsize. If |
155 | | * dstsize is greater than the result length, dst will be NUL-terminated; |
156 | | * otherwise not. |
157 | | * |
158 | | * If dstsize is zero, dst may be NULL. This is useful for calculating the |
159 | | * required buffer size before allocating. |
160 | | * |
161 | | * If full is true, use special case mappings if available and if the |
162 | | * conditions are satisfied. |
163 | | */ |
164 | | size_t |
165 | | unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, |
166 | | bool full) |
167 | 0 | { |
168 | 0 | return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL, |
169 | 0 | NULL); |
170 | 0 | } |
171 | | |
172 | | /* |
173 | | * unicode_strfold() |
174 | | * |
175 | | * Case fold src, and return the result length (not including terminating |
176 | | * NUL). |
177 | | * |
178 | | * String src must be encoded in UTF-8. If srclen < 0, src must be |
179 | | * NUL-terminated. |
180 | | * |
181 | | * Result string is stored in dst, truncating if larger than dstsize. If |
182 | | * dstsize is greater than the result length, dst will be NUL-terminated; |
183 | | * otherwise not. |
184 | | * |
185 | | * If dstsize is zero, dst may be NULL. This is useful for calculating the |
186 | | * required buffer size before allocating. |
187 | | */ |
188 | | size_t |
189 | | unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, |
190 | | bool full) |
191 | 0 | { |
192 | 0 | return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL, |
193 | 0 | NULL); |
194 | 0 | } |
195 | | |
196 | | /* |
197 | | * Implement Unicode Default Case Conversion algorithm. |
198 | | * |
199 | | * If str_casekind is CaseLower or CaseUpper, map each character in the string |
200 | | * for which a mapping is available. |
201 | | * |
202 | | * If str_casekind is CaseTitle, maps characters found on a word boundary to |
203 | | * titlecase (or uppercase if full is false) and other characters to |
204 | | * lowercase. NB: does not currently implement the Unicode behavior in which |
205 | | * the word boundary is adjusted to the next Cased character. That behavior |
206 | | * could be implemented as an option, but it doesn't match the default |
207 | | * behavior of ICU, nor does it match the documented behavior of INITCAP(). |
208 | | * |
209 | | * If full is true, use special mappings for relevant characters, which can |
210 | | * map a single codepoint to multiple codepoints, or depend on conditions. |
211 | | */ |
212 | | static size_t |
213 | | convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, |
214 | | CaseKind str_casekind, bool full, WordBoundaryNext wbnext, |
215 | | void *wbstate) |
216 | 0 | { |
217 | | /* character CaseKind varies while titlecasing */ |
218 | 0 | CaseKind chr_casekind = str_casekind; |
219 | 0 | size_t srcoff = 0; |
220 | 0 | size_t result_len = 0; |
221 | 0 | size_t boundary = 0; |
222 | |
|
223 | 0 | Assert((str_casekind == CaseTitle && wbnext && wbstate) || |
224 | 0 | (str_casekind != CaseTitle && !wbnext && !wbstate)); |
225 | |
|
226 | 0 | if (str_casekind == CaseTitle) |
227 | 0 | { |
228 | 0 | boundary = wbnext(wbstate); |
229 | 0 | Assert(boundary == 0); /* start of text is always a boundary */ |
230 | 0 | } |
231 | |
|
232 | 0 | while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0') |
233 | 0 | { |
234 | 0 | pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff); |
235 | 0 | int u1len = unicode_utf8len(u1); |
236 | 0 | pg_wchar simple = 0; |
237 | 0 | const pg_wchar *special = NULL; |
238 | 0 | enum CaseMapResult casemap_result; |
239 | |
|
240 | 0 | if (str_casekind == CaseTitle) |
241 | 0 | { |
242 | 0 | if (srcoff == boundary) |
243 | 0 | { |
244 | 0 | chr_casekind = full ? CaseTitle : CaseUpper; |
245 | 0 | boundary = wbnext(wbstate); |
246 | 0 | } |
247 | 0 | else |
248 | 0 | chr_casekind = CaseLower; |
249 | 0 | } |
250 | |
|
251 | 0 | casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff, |
252 | 0 | &simple, &special); |
253 | |
|
254 | 0 | switch (casemap_result) |
255 | 0 | { |
256 | 0 | case CASEMAP_SELF: |
257 | | /* no mapping; copy bytes from src */ |
258 | 0 | Assert(simple == 0); |
259 | 0 | Assert(special == NULL); |
260 | 0 | if (result_len + u1len <= dstsize) |
261 | 0 | memcpy(dst + result_len, src + srcoff, u1len); |
262 | |
|
263 | 0 | result_len += u1len; |
264 | 0 | break; |
265 | 0 | case CASEMAP_SIMPLE: |
266 | 0 | { |
267 | | /* replace with single character */ |
268 | 0 | pg_wchar u2 = simple; |
269 | 0 | pg_wchar u2len = unicode_utf8len(u2); |
270 | |
|
271 | 0 | Assert(special == NULL); |
272 | 0 | if (result_len + u2len <= dstsize) |
273 | 0 | unicode_to_utf8(u2, (unsigned char *) dst + result_len); |
274 | |
|
275 | 0 | result_len += u2len; |
276 | 0 | } |
277 | 0 | break; |
278 | 0 | case CASEMAP_SPECIAL: |
279 | | /* replace with up to MAX_CASE_EXPANSION characters */ |
280 | 0 | Assert(simple == 0); |
281 | 0 | for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++) |
282 | 0 | { |
283 | 0 | pg_wchar u2 = special[i]; |
284 | 0 | size_t u2len = unicode_utf8len(u2); |
285 | |
|
286 | 0 | if (result_len + u2len <= dstsize) |
287 | 0 | unicode_to_utf8(u2, (unsigned char *) dst + result_len); |
288 | |
|
289 | 0 | result_len += u2len; |
290 | 0 | } |
291 | 0 | break; |
292 | 0 | } |
293 | | |
294 | 0 | srcoff += u1len; |
295 | 0 | } |
296 | | |
297 | 0 | if (result_len < dstsize) |
298 | 0 | dst[result_len] = '\0'; |
299 | |
|
300 | 0 | return result_len; |
301 | 0 | } |
302 | | |
303 | | /* |
304 | | * Check that the condition matches Final_Sigma, described in Unicode Table |
305 | | * 3-17. The character at the given offset must be directly preceded by a |
306 | | * Cased character, and must not be directly followed by a Cased character. |
307 | | * |
308 | | * Case_Ignorable characters are ignored. NB: some characters may be both |
309 | | * Cased and Case_Ignorable, in which case they are ignored. |
310 | | */ |
311 | | static bool |
312 | | check_final_sigma(const unsigned char *str, size_t len, size_t offset) |
313 | 0 | { |
314 | | /* the start of the string is not preceded by a Cased character */ |
315 | 0 | if (offset == 0) |
316 | 0 | return false; |
317 | | |
318 | | /* iterate backwards, looking for Cased character */ |
319 | 0 | for (int i = offset - 1; i >= 0; i--) |
320 | 0 | { |
321 | 0 | if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0) |
322 | 0 | { |
323 | 0 | pg_wchar curr = utf8_to_unicode(str + i); |
324 | |
|
325 | 0 | if (pg_u_prop_case_ignorable(curr)) |
326 | 0 | continue; |
327 | 0 | else if (pg_u_prop_cased(curr)) |
328 | 0 | break; |
329 | 0 | else |
330 | 0 | return false; |
331 | 0 | } |
332 | 0 | else if ((str[i] & 0xC0) == 0x80) |
333 | 0 | continue; |
334 | | |
335 | 0 | Assert(false); /* invalid UTF-8 */ |
336 | 0 | } |
337 | | |
338 | | /* end of string is not followed by a Cased character */ |
339 | 0 | if (offset == len) |
340 | 0 | return true; |
341 | | |
342 | | /* iterate forwards, looking for Cased character */ |
343 | 0 | for (int i = offset + 1; i < len && str[i] != '\0'; i++) |
344 | 0 | { |
345 | 0 | if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0) |
346 | 0 | { |
347 | 0 | pg_wchar curr = utf8_to_unicode(str + i); |
348 | |
|
349 | 0 | if (pg_u_prop_case_ignorable(curr)) |
350 | 0 | continue; |
351 | 0 | else if (pg_u_prop_cased(curr)) |
352 | 0 | return false; |
353 | 0 | else |
354 | 0 | break; |
355 | 0 | } |
356 | 0 | else if ((str[i] & 0xC0) == 0x80) |
357 | 0 | continue; |
358 | | |
359 | 0 | Assert(false); /* invalid UTF-8 */ |
360 | 0 | } |
361 | | |
362 | 0 | return true; |
363 | 0 | } |
364 | | |
365 | | /* |
366 | | * Unicode allows for special casing to be applied only under certain |
367 | | * circumstances. The only currently-supported condition is Final_Sigma. |
368 | | */ |
369 | | static bool |
370 | | check_special_conditions(int conditions, const char *str, size_t len, |
371 | | size_t offset) |
372 | 0 | { |
373 | 0 | if (conditions == 0) |
374 | 0 | return true; |
375 | 0 | else if (conditions == PG_U_FINAL_SIGMA) |
376 | 0 | return check_final_sigma((unsigned char *) str, len, offset); |
377 | | |
378 | | /* no other conditions supported */ |
379 | 0 | Assert(false); |
380 | 0 | return false; |
381 | 0 | } |
382 | | |
383 | | /* |
384 | | * Map the given character to the requested case. |
385 | | * |
386 | | * If full is true, and a special case mapping is found and the conditions are |
387 | | * met, 'special' is set to the mapping result (which is an array of up to |
388 | | * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned. |
389 | | * |
390 | | * Otherwise, search for a simple mapping, and if found, set 'simple' to the |
391 | | * result and return CASEMAP_SIMPLE. |
392 | | * |
393 | | * If no mapping is found, return CASEMAP_SELF, and the caller should copy the |
394 | | * character without modification. |
395 | | */ |
396 | | static enum CaseMapResult |
397 | | casemap(pg_wchar u1, CaseKind casekind, bool full, |
398 | | const char *src, size_t srclen, size_t srcoff, |
399 | | pg_wchar *simple, const pg_wchar **special) |
400 | 0 | { |
401 | 0 | uint16 idx; |
402 | | |
403 | | /* Fast path for codepoints < 0x80 */ |
404 | 0 | if (u1 < 0x80) |
405 | 0 | { |
406 | | /* |
407 | | * The first elements in all tables are reserved as 0 (as NULL). The |
408 | | * data starts at index 1, not 0. |
409 | | */ |
410 | 0 | *simple = casekind_map[casekind][u1 + 1]; |
411 | |
|
412 | 0 | return CASEMAP_SIMPLE; |
413 | 0 | } |
414 | | |
415 | 0 | idx = case_index(u1); |
416 | |
|
417 | 0 | if (idx == 0) |
418 | 0 | return CASEMAP_SELF; |
419 | | |
420 | 0 | if (full && case_map_special[idx] && |
421 | 0 | check_special_conditions(special_case[case_map_special[idx]].conditions, |
422 | 0 | src, srclen, srcoff)) |
423 | 0 | { |
424 | 0 | *special = special_case[case_map_special[idx]].map[casekind]; |
425 | 0 | return CASEMAP_SPECIAL; |
426 | 0 | } |
427 | | |
428 | 0 | *simple = casekind_map[casekind][idx]; |
429 | |
|
430 | 0 | return CASEMAP_SIMPLE; |
431 | 0 | } |
432 | | |
433 | | /* |
434 | | * Find entry in simple case map. |
435 | | * If the entry does not exist, 0 will be returned. |
436 | | */ |
437 | | static pg_wchar |
438 | | find_case_map(pg_wchar ucs, const pg_wchar *map) |
439 | 0 | { |
440 | | /* Fast path for codepoints < 0x80 */ |
441 | 0 | if (ucs < 0x80) |
442 | | /* The first elements in all tables are reserved as 0 (as NULL). */ |
443 | 0 | return map[ucs + 1]; |
444 | 0 | return map[case_index(ucs)]; |
445 | 0 | } |