/src/postgres/src/common/unicode_norm.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * unicode_norm.c |
3 | | * Normalize a Unicode string |
4 | | * |
5 | | * This implements Unicode normalization, per the documentation at |
6 | | * https://www.unicode.org/reports/tr15/. |
7 | | * |
8 | | * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/common/unicode_norm.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | #ifndef FRONTEND |
16 | | #include "postgres.h" |
17 | | #else |
18 | | #include "postgres_fe.h" |
19 | | #endif |
20 | | |
21 | | #include "common/unicode_norm.h" |
22 | | #ifndef FRONTEND |
23 | | #include "common/unicode_norm_hashfunc.h" |
24 | | #include "common/unicode_normprops_table.h" |
25 | | #include "port/pg_bswap.h" |
26 | | #else |
27 | | #include "common/unicode_norm_table.h" |
28 | | #endif |
29 | | |
30 | | #ifndef FRONTEND |
31 | 0 | #define ALLOC(size) palloc(size) |
32 | 0 | #define FREE(size) pfree(size) |
33 | | #else |
34 | | #define ALLOC(size) malloc(size) |
35 | | #define FREE(size) free(size) |
36 | | #endif |
37 | | |
38 | | /* Constants for calculations with Hangul characters */ |
39 | 0 | #define SBASE 0xAC00 /* U+AC00 */ |
40 | 0 | #define LBASE 0x1100 /* U+1100 */ |
41 | 0 | #define VBASE 0x1161 /* U+1161 */ |
42 | 0 | #define TBASE 0x11A7 /* U+11A7 */ |
43 | 0 | #define LCOUNT 19 |
44 | 0 | #define VCOUNT 21 |
45 | 0 | #define TCOUNT 28 |
46 | 0 | #define NCOUNT VCOUNT * TCOUNT |
47 | 0 | #define SCOUNT LCOUNT * NCOUNT |
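/*
 * Editorial sketch (not part of the original file): the constants above
 * drive purely arithmetic Hangul (de)composition.  For example, for
 * U+D4DB: sindex = 0xD4DB - SBASE = 10459, so L = LBASE + 10459/588 =
 * U+1111, V = VBASE + (10459 % 588)/28 = U+1171, and T = TBASE +
 * 10459 % 28 = U+11B6.  The helper below is a hypothetical illustration
 * of that arithmetic; the real code inlines it in decompose_code() and
 * get_decomposed_size().
 */
#ifdef UNICODE_NORM_EDITORIAL_DEMO	/* hypothetical guard; never built */
static inline int
hangul_decompose_sketch(uint32 code, uint32 out[3])
{
	uint32		sindex = code - SBASE;
	uint32		l = LBASE + sindex / (VCOUNT * TCOUNT);
	uint32		v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
	uint32		tindex = sindex % TCOUNT;

	out[0] = l;
	out[1] = v;
	if (tindex == 0)
		return 2;				/* LV syllable */
	out[2] = TBASE + tindex;
	return 3;					/* LVT syllable */
}
#endif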
48 | | |
49 | | #ifdef FRONTEND |
50 | | /* comparison routine for bsearch() of decomposition lookup table. */ |
51 | | static int |
52 | | conv_compare(const void *p1, const void *p2) |
53 | | { |
54 | | uint32 v1, |
55 | | v2; |
56 | | |
57 | | v1 = *(const uint32 *) p1; |
58 | | v2 = ((const pg_unicode_decomposition *) p2)->codepoint; |
59 | | return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); |
60 | | } |
61 | | |
62 | | #endif |
63 | | |
64 | | /* |
65 | | * get_code_entry |
66 | | * |
67 | | * Get the entry corresponding to code in the decomposition lookup table. |
68 | | * The backend version of this code uses a perfect hash function for the |
69 | | * lookup, while the frontend version uses a binary search. |
70 | | */ |
71 | | static const pg_unicode_decomposition * |
72 | | get_code_entry(pg_wchar code) |
73 | 0 | { |
74 | 0 | #ifndef FRONTEND |
75 | 0 | int h; |
76 | 0 | uint32 hashkey; |
77 | 0 | pg_unicode_decompinfo decompinfo = UnicodeDecompInfo; |
78 | | |
79 | | /* |
80 | | * Compute the hash function. The hash key is the codepoint with the bytes |
81 | | * in network order. |
82 | | */ |
83 | 0 | hashkey = pg_hton32(code); |
84 | 0 | h = decompinfo.hash(&hashkey); |
85 | | |
86 | | /* An out-of-range result implies no match */ |
87 | 0 | if (h < 0 || h >= decompinfo.num_decomps) |
88 | 0 | return NULL; |
89 | | |
90 | | /* |
91 | | * Since it's a perfect hash, we need only match to the specific codepoint |
92 | | * it identifies. |
93 | | */ |
94 | 0 | if (code != decompinfo.decomps[h].codepoint) |
95 | 0 | return NULL; |
96 | | |
97 | | /* Success! */ |
98 | 0 | return &decompinfo.decomps[h]; |
99 | | #else |
100 | | return bsearch(&(code), |
101 | | UnicodeDecompMain, |
102 | | lengthof(UnicodeDecompMain), |
103 | | sizeof(pg_unicode_decomposition), |
104 | | conv_compare); |
105 | | #endif |
106 | 0 | } |
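/*
 * Editorial sketch (hypothetical, not part of the original file): the
 * backend branch above relies on a perfect hash, i.e. every key present
 * in the table lands in its own slot, so one probe plus an equality check
 * decides membership.  The same pattern over a two-entry toy table, with
 * an intentionally trivial "hash":
 */
#ifdef UNICODE_NORM_EDITORIAL_DEMO	/* hypothetical guard; never built */
typedef struct
{
	uint32		key;
	int			value;
} demo_entry;

static const demo_entry demo_table[] = {
	{0x030A, 1},				/* slot 0: key & 1 == 0 */
	{0x0041, 2},				/* slot 1: key & 1 == 1 */
};

static const demo_entry *
demo_lookup(uint32 key)
{
	int			h = key & 1;	/* perfect for exactly these two keys */

	if (demo_table[h].key != key)
		return NULL;			/* slot holds a different key: no match */
	return &demo_table[h];
}
#endif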
107 | | |
108 | | /* |
109 | | * Get the combining class of the given codepoint. |
110 | | */ |
111 | | static uint8 |
112 | | get_canonical_class(pg_wchar code) |
113 | 0 | { |
114 | 0 | const pg_unicode_decomposition *entry = get_code_entry(code); |
115 | | |
116 | | /* |
117 | | * If no entries are found, the character used is either a Hangul |
118 | | * character or a character with a class of 0 and no decompositions. |
119 | | */ |
120 | 0 | if (!entry) |
121 | 0 | return 0; |
122 | 0 | else |
123 | 0 | return entry->comb_class; |
124 | 0 | } |
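/*
 * Editorial sketch (hypothetical, not part of the original file):
 * combining classes are what canonical ordering sorts on.  Per the
 * Unicode Character Database, U+0041 'A' is a starter (class 0),
 * U+0323 COMBINING DOT BELOW has class 220, and U+0301 COMBINING ACUTE
 * ACCENT has class 230, so <A, U+0301, U+0323> reorders to
 * <A, U+0323, U+0301> during normalization.
 */
#ifdef UNICODE_NORM_EDITORIAL_DEMO	/* hypothetical guard; never built */
static void
canonical_class_sketch(void)
{
	Assert(get_canonical_class(0x0041) == 0);	/* LATIN CAPITAL LETTER A */
	Assert(get_canonical_class(0x0323) == 220); /* COMBINING DOT BELOW */
	Assert(get_canonical_class(0x0301) == 230); /* COMBINING ACUTE ACCENT */
}
#endif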
125 | | |
126 | | /* |
127 | | * Given a decomposition entry looked up earlier, get the decomposed |
128 | | * characters. |
129 | | * |
130 | | * Note: the returned pointer can point to a statically allocated buffer, |
131 | | * and is only valid until the next call to this function! |
132 | | */ |
133 | | static const pg_wchar * |
134 | | get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size) |
135 | 0 | { |
136 | 0 | static pg_wchar x; |
137 | |
138 | 0 | if (DECOMPOSITION_IS_INLINE(entry)) |
139 | 0 | { |
140 | 0 | Assert(DECOMPOSITION_SIZE(entry) == 1); |
141 | 0 | x = (pg_wchar) entry->dec_index; |
142 | 0 | *dec_size = 1; |
143 | 0 | return &x; |
144 | 0 | } |
145 | 0 | else |
146 | 0 | { |
147 | 0 | *dec_size = DECOMPOSITION_SIZE(entry); |
148 | 0 | return &UnicodeDecomp_codepoints[entry->dec_index]; |
149 | 0 | } |
150 | 0 | } |
151 | | |
152 | | /* |
153 | | * Calculate how many characters a given character will decompose to. |
154 | | * |
155 | | * This needs to recurse, if the character decomposes into characters that |
156 | | * are, in turn, decomposable. |
157 | | */ |
158 | | static int |
159 | | get_decomposed_size(pg_wchar code, bool compat) |
160 | 0 | { |
161 | 0 | const pg_unicode_decomposition *entry; |
162 | 0 | int size = 0; |
163 | 0 | int i; |
164 | 0 | const uint32 *decomp; |
165 | 0 | int dec_size; |
166 | | |
167 | | /* |
168 | | * Fast path for Hangul characters, which are not stored in the tables |
169 | | * (to save memory) because their decomposition is algorithmic. See |
170 | | * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details |
171 | | * on the matter. |
172 | | */ |
173 | 0 | if (code >= SBASE && code < SBASE + SCOUNT) |
174 | 0 | { |
175 | 0 | uint32 tindex, |
176 | 0 | sindex; |
177 | |
178 | 0 | sindex = code - SBASE; |
179 | 0 | tindex = sindex % TCOUNT; |
180 | |
181 | 0 | if (tindex != 0) |
182 | 0 | return 3; |
183 | 0 | return 2; |
184 | 0 | } |
185 | | |
186 | 0 | entry = get_code_entry(code); |
187 | | |
188 | | /* |
189 | | * Just count current code if no other decompositions. A NULL entry is |
190 | | * equivalent to a character with class 0 and no decompositions. |
191 | | */ |
192 | 0 | if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || |
193 | 0 | (!compat && DECOMPOSITION_IS_COMPAT(entry))) |
194 | 0 | return 1; |
195 | | |
196 | | /* |
197 | | * If this entry has other decomposition codes, look at them as well. First |
198 | | * get its decomposition in the list of tables available. |
199 | | */ |
200 | 0 | decomp = get_code_decomposition(entry, &dec_size); |
201 | 0 | for (i = 0; i < dec_size; i++) |
202 | 0 | { |
203 | 0 | uint32 lcode = decomp[i]; |
204 | |
205 | 0 | size += get_decomposed_size(lcode, compat); |
206 | 0 | } |
207 | |
208 | 0 | return size; |
209 | 0 | } |
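/*
 * Editorial sketch (hypothetical, not part of the original file): the
 * recursion above matters because canonical decompositions can chain.
 * U+212B ANGSTROM SIGN decomposes to U+00C5, which in turn decomposes
 * to U+0041 U+030A, giving a fully decomposed size of 2.
 */
#ifdef UNICODE_NORM_EDITORIAL_DEMO	/* hypothetical guard; never built */
static void
decomposed_size_sketch(void)
{
	Assert(get_decomposed_size(0x212B, false) == 2);	/* -> U+0041 U+030A */
	Assert(get_decomposed_size(0xAC01, false) == 3);	/* Hangul LVT syllable */
	Assert(get_decomposed_size(0x0041, false) == 1);	/* no decomposition */
}
#endif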
210 | | |
211 | | /* |
212 | | * Recompose a set of characters. For hangul characters, the calculation |
213 | | * is algorithmic. For others, an inverse lookup at the decomposition |
214 | | * table is necessary. Returns true if a recomposition can be done, and |
215 | | * false otherwise. |
216 | | */ |
217 | | static bool |
218 | | recompose_code(uint32 start, uint32 code, uint32 *result) |
219 | 0 | { |
220 | | /* |
221 | | * Handle Hangul characters algorithmically, per the Unicode spec. |
222 | | * |
223 | | * Check if two current characters are L and V. |
224 | | */ |
225 | 0 | if (start >= LBASE && start < LBASE + LCOUNT && |
226 | 0 | code >= VBASE && code < VBASE + VCOUNT) |
227 | 0 | { |
228 | | /* make syllable of form LV */ |
229 | 0 | uint32 lindex = start - LBASE; |
230 | 0 | uint32 vindex = code - VBASE; |
231 | |
232 | 0 | *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT; |
233 | 0 | return true; |
234 | 0 | } |
235 | | /* Check if two current characters are LV and T */ |
236 | 0 | else if (start >= SBASE && start < (SBASE + SCOUNT) && |
237 | 0 | ((start - SBASE) % TCOUNT) == 0 && |
238 | 0 | code >= TBASE && code < (TBASE + TCOUNT)) |
239 | 0 | { |
240 | | /* make syllable of form LVT */ |
241 | 0 | uint32 tindex = code - TBASE; |
242 | |
243 | 0 | *result = start + tindex; |
244 | 0 | return true; |
245 | 0 | } |
246 | 0 | else |
247 | 0 | { |
248 | 0 | const pg_unicode_decomposition *entry; |
249 | | |
250 | | /* |
251 | | * Do an inverse lookup of the decomposition tables to see if anything |
252 | | * matches. The comparison just needs to be a perfect match on the |
253 | | * sub-table of size two, because the start character has already been |
254 | | * recomposed partially. This lookup uses a perfect hash function for |
255 | | * the backend code. |
256 | | */ |
257 | 0 | #ifndef FRONTEND |
258 | |
259 | 0 | int h, |
260 | 0 | inv_lookup_index; |
261 | 0 | uint64 hashkey; |
262 | 0 | pg_unicode_recompinfo recompinfo = UnicodeRecompInfo; |
263 | | |
264 | | /* |
265 | | * Compute the hash function. The hash key is formed by concatenating |
266 | | * bytes of the two codepoints in network order. See also |
267 | | * src/common/unicode/generate-unicode_norm_table.pl. |
268 | | */ |
269 | 0 | hashkey = pg_hton64(((uint64) start << 32) | (uint64) code); |
270 | 0 | h = recompinfo.hash(&hashkey); |
271 | | |
272 | | /* An out-of-range result implies no match */ |
273 | 0 | if (h < 0 || h >= recompinfo.num_recomps) |
274 | 0 | return false; |
275 | | |
276 | 0 | inv_lookup_index = recompinfo.inverse_lookup[h]; |
277 | 0 | entry = &UnicodeDecompMain[inv_lookup_index]; |
278 | |
279 | 0 | if (start == UnicodeDecomp_codepoints[entry->dec_index] && |
280 | 0 | code == UnicodeDecomp_codepoints[entry->dec_index + 1]) |
281 | 0 | { |
282 | 0 | *result = entry->codepoint; |
283 | 0 | return true; |
284 | 0 | } |
285 | |
286 | | #else |
287 | | |
288 | | int i; |
289 | | |
290 | | for (i = 0; i < lengthof(UnicodeDecompMain); i++) |
291 | | { |
292 | | entry = &UnicodeDecompMain[i]; |
293 | | |
294 | | if (DECOMPOSITION_SIZE(entry) != 2) |
295 | | continue; |
296 | | |
297 | | if (DECOMPOSITION_NO_COMPOSE(entry)) |
298 | | continue; |
299 | | |
300 | | if (start == UnicodeDecomp_codepoints[entry->dec_index] && |
301 | | code == UnicodeDecomp_codepoints[entry->dec_index + 1]) |
302 | | { |
303 | | *result = entry->codepoint; |
304 | | return true; |
305 | | } |
306 | | } |
307 | | #endif /* !FRONTEND */ |
308 | 0 | } |
309 | | |
310 | 0 | return false; |
311 | 0 | } |
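/*
 * Editorial sketch (hypothetical, not part of the original file): a
 * worked example of the algorithmic Hangul branches above.  U+1111
 * (lindex 17) plus U+1171 (vindex 16) compose to
 * SBASE + (17 * 21 + 16) * 28 = U+D4CC, an LV syllable; U+D4CC plus
 * U+11B6 (tindex 15) then composes to U+D4DB.
 */
#ifdef UNICODE_NORM_EDITORIAL_DEMO	/* hypothetical guard; never built */
static void
recompose_sketch(void)
{
	uint32		lv;
	uint32		lvt;

	Assert(recompose_code(0x1111, 0x1171, &lv) && lv == 0xD4CC);
	Assert(recompose_code(lv, 0x11B6, &lvt) && lvt == 0xD4DB);
}
#endif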
312 | | |
313 | | /* |
314 | | * Decompose the given code into the array given by caller. The |
315 | | * decomposition begins at the position given by caller, saving one |
316 | | * lookup on the decomposition table. The current position needs to be |
317 | | * updated here to let the caller know from where to continue filling |
318 | | * in the array result. |
319 | | */ |
320 | | static void |
321 | | decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) |
322 | 0 | { |
323 | 0 | const pg_unicode_decomposition *entry; |
324 | 0 | int i; |
325 | 0 | const uint32 *decomp; |
326 | 0 | int dec_size; |
327 | | |
328 | | /* |
329 | | * Fast path for Hangul characters not stored in tables to save memory as |
330 | | * decomposition is algorithmic. See |
331 | | * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details |
332 | | * on the matter. |
333 | | */ |
334 | 0 | if (code >= SBASE && code < SBASE + SCOUNT) |
335 | 0 | { |
336 | 0 | uint32 l, |
337 | 0 | v, |
338 | 0 | tindex, |
339 | 0 | sindex; |
340 | 0 | pg_wchar *res = *result; |
341 | |
342 | 0 | sindex = code - SBASE; |
343 | 0 | l = LBASE + sindex / (VCOUNT * TCOUNT); |
344 | 0 | v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT; |
345 | 0 | tindex = sindex % TCOUNT; |
346 | |
347 | 0 | res[*current] = l; |
348 | 0 | (*current)++; |
349 | 0 | res[*current] = v; |
350 | 0 | (*current)++; |
351 | |
|
352 | 0 | if (tindex != 0) |
353 | 0 | { |
354 | 0 | res[*current] = TBASE + tindex; |
355 | 0 | (*current)++; |
356 | 0 | } |
357 | |
358 | 0 | return; |
359 | 0 | } |
360 | | |
361 | 0 | entry = get_code_entry(code); |
362 | | |
363 | | /* |
364 | | * Just fill in with the current decomposition if there are no |
365 | | * decomposition codes to recurse to. A NULL entry is equivalent to a |
366 | | * character with class 0 and no decompositions, so just leave also in |
367 | | * this case. |
368 | | */ |
369 | 0 | if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || |
370 | 0 | (!compat && DECOMPOSITION_IS_COMPAT(entry))) |
371 | 0 | { |
372 | 0 | pg_wchar *res = *result; |
373 | |
374 | 0 | res[*current] = code; |
375 | 0 | (*current)++; |
376 | 0 | return; |
377 | 0 | } |
378 | | |
379 | | /* |
380 | | * If this entry has other decomposition codes, look at them as well. |
381 | | */ |
382 | 0 | decomp = get_code_decomposition(entry, &dec_size); |
383 | 0 | for (i = 0; i < dec_size; i++) |
384 | 0 | { |
385 | 0 | pg_wchar lcode = (pg_wchar) decomp[i]; |
386 | | |
387 | | /* Leave if no more decompositions */ |
388 | 0 | decompose_code(lcode, compat, result, current); |
389 | 0 | } |
390 | 0 | } |
391 | | |
392 | | /* |
393 | | * unicode_normalize - Normalize a Unicode string to the specified form. |
394 | | * |
395 | | * The input is a 0-terminated array of codepoints. |
396 | | * |
397 | | * In frontend, returns a 0-terminated array of codepoints, allocated with |
398 | | * malloc. Or NULL if we run out of memory. In backend, the returned |
399 | | * string is palloc'd instead, and OOM is reported with ereport(). |
400 | | */ |
401 | | pg_wchar * |
402 | | unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) |
403 | 0 | { |
404 | 0 | bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD); |
405 | 0 | bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC); |
406 | 0 | pg_wchar *decomp_chars; |
407 | 0 | pg_wchar *recomp_chars; |
408 | 0 | int decomp_size, |
409 | 0 | current_size; |
410 | 0 | int count; |
411 | 0 | const pg_wchar *p; |
412 | | |
413 | | /* variables for recomposition */ |
414 | 0 | int last_class; |
415 | 0 | int starter_pos; |
416 | 0 | int target_pos; |
417 | 0 | uint32 starter_ch; |
418 | | |
419 | | /* First, do character decomposition */ |
420 | | |
421 | | /* |
422 | | * Calculate how many characters long the decomposed version will be. |
423 | | */ |
424 | 0 | decomp_size = 0; |
425 | 0 | for (p = input; *p; p++) |
426 | 0 | decomp_size += get_decomposed_size(*p, compat); |
427 | |
428 | 0 | decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); |
429 | 0 | if (decomp_chars == NULL) |
430 | 0 | return NULL; |
431 | | |
432 | | /* |
433 | | * Now fill in each entry recursively. This needs a second pass on the |
434 | | * decomposition table. |
435 | | */ |
436 | 0 | current_size = 0; |
437 | 0 | for (p = input; *p; p++) |
438 | 0 | decompose_code(*p, compat, &decomp_chars, ¤t_size); |
439 | 0 | decomp_chars[decomp_size] = '\0'; |
440 | 0 | Assert(decomp_size == current_size); |
441 | | |
442 | | /* Leave if there is nothing to decompose */ |
443 | 0 | if (decomp_size == 0) |
444 | 0 | return decomp_chars; |
445 | | |
446 | | /* |
447 | | * Now apply canonical ordering. |
448 | | */ |
449 | 0 | for (count = 1; count < decomp_size; count++) |
450 | 0 | { |
451 | 0 | pg_wchar prev = decomp_chars[count - 1]; |
452 | 0 | pg_wchar next = decomp_chars[count]; |
453 | 0 | pg_wchar tmp; |
454 | 0 | const uint8 prevClass = get_canonical_class(prev); |
455 | 0 | const uint8 nextClass = get_canonical_class(next); |
456 | | |
457 | | /* |
458 | | * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) |
459 | | * annex 4, a sequence of two adjacent characters in a string is an |
460 | | * exchangeable pair if the combining class (from the Unicode |
461 | | * Character Database) for the first character is greater than the |
462 | | * combining class for the second, and the second is not a starter. A |
463 | | * character is a starter if its combining class is 0. |
464 | | */ |
465 | 0 | if (prevClass == 0 || nextClass == 0) |
466 | 0 | continue; |
467 | | |
468 | 0 | if (prevClass <= nextClass) |
469 | 0 | continue; |
470 | | |
471 | | /* exchange can happen */ |
472 | 0 | tmp = decomp_chars[count - 1]; |
473 | 0 | decomp_chars[count - 1] = decomp_chars[count]; |
474 | 0 | decomp_chars[count] = tmp; |
475 | | |
476 | | /* backtrack to check again */ |
477 | 0 | if (count > 1) |
478 | 0 | count -= 2; |
479 | 0 | } |
480 | |
481 | 0 | if (!recompose) |
482 | 0 | return decomp_chars; |
483 | | |
484 | | /* |
485 | | * The last phase of NFC and NFKC is the recomposition of the reordered |
486 | | * Unicode string using combining classes. The recomposed string cannot be |
487 | | * longer than the decomposed one, so make the allocation of the output |
488 | | * string based on that assumption. |
489 | | */ |
490 | 0 | recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); |
491 | 0 | if (!recomp_chars) |
492 | 0 | { |
493 | 0 | FREE(decomp_chars); |
494 | 0 | return NULL; |
495 | 0 | } |
496 | | |
497 | 0 | last_class = -1; /* this eliminates a special check */ |
498 | 0 | starter_pos = 0; |
499 | 0 | target_pos = 1; |
500 | 0 | starter_ch = recomp_chars[0] = decomp_chars[0]; |
501 | |
502 | 0 | for (count = 1; count < decomp_size; count++) |
503 | 0 | { |
504 | 0 | pg_wchar ch = decomp_chars[count]; |
505 | 0 | int ch_class = get_canonical_class(ch); |
506 | 0 | pg_wchar composite; |
507 | |
508 | 0 | if (last_class < ch_class && |
509 | 0 | recompose_code(starter_ch, ch, &composite)) |
510 | 0 | { |
511 | 0 | recomp_chars[starter_pos] = composite; |
512 | 0 | starter_ch = composite; |
513 | 0 | } |
514 | 0 | else if (ch_class == 0) |
515 | 0 | { |
516 | 0 | starter_pos = target_pos; |
517 | 0 | starter_ch = ch; |
518 | 0 | last_class = -1; |
519 | 0 | recomp_chars[target_pos++] = ch; |
520 | 0 | } |
521 | 0 | else |
522 | 0 | { |
523 | 0 | last_class = ch_class; |
524 | 0 | recomp_chars[target_pos++] = ch; |
525 | 0 | } |
526 | 0 | } |
527 | 0 | recomp_chars[target_pos] = (pg_wchar) '\0'; |
528 | |
529 | 0 | FREE(decomp_chars); |
530 | |
531 | 0 | return recomp_chars; |
532 | 0 | } |
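/*
 * Editorial sketch (hypothetical, not part of the original file): a
 * minimal caller of unicode_normalize().  Per standard Unicode data,
 * U+212B ANGSTROM SIGN is U+00C5 under NFC and U+0041 U+030A under NFD.
 * In the frontend the result must be checked for NULL and freed; in the
 * backend, allocation failure is reported with ereport() instead.
 */
#ifdef UNICODE_NORM_EDITORIAL_DEMO	/* hypothetical guard; never built */
static void
normalize_sketch(void)
{
	const pg_wchar input[] = {0x212B, 0};
	pg_wchar   *nfc = unicode_normalize(UNICODE_NFC, input);
	pg_wchar   *nfd = unicode_normalize(UNICODE_NFD, input);

	if (nfc == NULL || nfd == NULL)
		return;					/* possible in frontend builds only */

	Assert(nfc[0] == 0x00C5 && nfc[1] == 0);
	Assert(nfd[0] == 0x0041 && nfd[1] == 0x030A && nfd[2] == 0);

	FREE(nfc);
	FREE(nfd);
}
#endif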
533 | | |
534 | | /* |
535 | | * Normalization "quick check" algorithm; see |
536 | | * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms> |
537 | | */ |
538 | | |
539 | | /* We only need this in the backend. */ |
540 | | #ifndef FRONTEND |
541 | | |
542 | | static const pg_unicode_normprops * |
543 | | qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo) |
544 | 0 | { |
545 | 0 | int h; |
546 | 0 | uint32 hashkey; |
547 | | |
548 | | /* |
549 | | * Compute the hash function. The hash key is the codepoint with the bytes |
550 | | * in network order. |
551 | | */ |
552 | 0 | hashkey = pg_hton32(ch); |
553 | 0 | h = norminfo->hash(&hashkey); |
554 | | |
555 | | /* An out-of-range result implies no match */ |
556 | 0 | if (h < 0 || h >= norminfo->num_normprops) |
557 | 0 | return NULL; |
558 | | |
559 | | /* |
560 | | * Since it's a perfect hash, we need only match to the specific codepoint |
561 | | * it identifies. |
562 | | */ |
563 | 0 | if (ch != norminfo->normprops[h].codepoint) |
564 | 0 | return NULL; |
565 | | |
566 | | /* Success! */ |
567 | 0 | return &norminfo->normprops[h]; |
568 | 0 | } |
569 | | |
570 | | /* |
571 | | * Look up the normalization quick check character property |
572 | | */ |
573 | | static UnicodeNormalizationQC |
574 | | qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch) |
575 | 0 | { |
576 | 0 | const pg_unicode_normprops *found = NULL; |
577 | |
578 | 0 | switch (form) |
579 | 0 | { |
580 | 0 | case UNICODE_NFC: |
581 | 0 | found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC); |
582 | 0 | break; |
583 | 0 | case UNICODE_NFKC: |
584 | 0 | found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC); |
585 | 0 | break; |
586 | 0 | default: |
587 | 0 | Assert(false); |
588 | 0 | break; |
589 | 0 | } |
590 | | |
591 | 0 | if (found) |
592 | 0 | return found->quickcheck; |
593 | 0 | else |
594 | 0 | return UNICODE_NORM_QC_YES; |
595 | 0 | } |
596 | | |
597 | | UnicodeNormalizationQC |
598 | | unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input) |
599 | 0 | { |
600 | 0 | uint8 lastCanonicalClass = 0; |
601 | 0 | UnicodeNormalizationQC result = UNICODE_NORM_QC_YES; |
602 | | |
603 | | /* |
604 | | * For the "D" forms, we don't run the quickcheck. We don't include the |
605 | | * lookup tables for those because they are huge, checking for these |
606 | | * particular forms is less common, and running the slow path is faster |
607 | | * for the "D" forms than the "C" forms because you don't need to |
608 | | * recompose, which is slow. |
609 | | */ |
610 | 0 | if (form == UNICODE_NFD || form == UNICODE_NFKD) |
611 | 0 | return UNICODE_NORM_QC_MAYBE; |
612 | | |
613 | 0 | for (const pg_wchar *p = input; *p; p++) |
614 | 0 | { |
615 | 0 | pg_wchar ch = *p; |
616 | 0 | uint8 canonicalClass; |
617 | 0 | UnicodeNormalizationQC check; |
618 | |
619 | 0 | canonicalClass = get_canonical_class(ch); |
620 | 0 | if (lastCanonicalClass > canonicalClass && canonicalClass != 0) |
621 | 0 | return UNICODE_NORM_QC_NO; |
622 | | |
623 | 0 | check = qc_is_allowed(form, ch); |
624 | 0 | if (check == UNICODE_NORM_QC_NO) |
625 | 0 | return UNICODE_NORM_QC_NO; |
626 | 0 | else if (check == UNICODE_NORM_QC_MAYBE) |
627 | 0 | result = UNICODE_NORM_QC_MAYBE; |
628 | | |
629 | 0 | lastCanonicalClass = canonicalClass; |
630 | 0 | } |
631 | 0 | return result; |
632 | 0 | } |
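/*
 * Editorial sketch (hypothetical, not part of the original file): the
 * quick check is an optimization, not a complete decision procedure.
 * QC_YES and QC_NO are definitive, but QC_MAYBE requires normalizing
 * and comparing, as in the backend-style helper below.
 */
#ifdef UNICODE_NORM_EDITORIAL_DEMO	/* hypothetical guard; never built */
static bool
is_normalized_sketch(UnicodeNormalizationForm form, const pg_wchar *input)
{
	UnicodeNormalizationQC qc = unicode_is_normalized_quickcheck(form, input);
	pg_wchar   *output;
	int			i;
	bool		result;

	if (qc == UNICODE_NORM_QC_YES)
		return true;
	if (qc == UNICODE_NORM_QC_NO)
		return false;

	/* UNICODE_NORM_QC_MAYBE: normalize and compare against the input */
	output = unicode_normalize(form, input);
	for (i = 0; input[i] != 0 && input[i] == output[i]; i++)
		;
	result = (input[i] == 0 && output[i] == 0);
	FREE(output);
	return result;
}
#endif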
633 | | |
634 | | #endif /* !FRONTEND */ |