/src/dovecot/src/lib/unicode-transform.c
Line | Count | Source |
1 | | /* Copyright (c) 2025 Dovecot authors, see the included COPYING file */ |
2 | | |
3 | | #include "lib.h" |
4 | | #include "unichar.h" |
5 | | #include "unicode-data.h" |
6 | | #include "unicode-transform.h" |
7 | | |
/* Inclusive code point range of the precomposed Hangul syllables block */
#define HANGUL_FIRST 0xAC00
#define HANGUL_LAST 0xD7A3
10 | | |
11 | | /* |
12 | | * Transform |
13 | | */ |
14 | | |
15 | | ssize_t uniform_transform_forward( |
16 | | struct unicode_transform *trans, const uint32_t *out, |
17 | | const struct unicode_code_point_data *const *out_data, size_t out_len, |
18 | | const char **error_r) |
19 | 0 | { |
20 | 0 | struct unicode_transform_buffer buf_next; |
21 | 0 | ssize_t sret; |
22 | |
|
23 | 0 | i_zero(&buf_next); |
24 | 0 | buf_next.cp = out; |
25 | 0 | buf_next.cp_data = out_data; |
26 | 0 | buf_next.cp_count = out_len; |
27 | |
|
28 | 0 | i_assert(trans->next != NULL); |
29 | 0 | i_assert(trans->next->def != NULL); |
30 | 0 | i_assert(trans->next->def->input != NULL); |
31 | 0 | sret = trans->next->def->input(trans->next, &buf_next, error_r); |
32 | |
|
33 | 0 | i_assert(sret >= 0 || *error_r != NULL); |
34 | 0 | i_assert(sret <= (ssize_t)out_len); |
35 | 0 | return sret; |
36 | 0 | } |
37 | | |
38 | | ssize_t unicode_transform_input_buf(struct unicode_transform *trans, |
39 | | const struct unicode_transform_buffer *buf, |
40 | | const char **error_r) |
41 | 0 | { |
42 | 0 | struct unicode_transform_buffer in_buf; |
43 | 0 | size_t input_total = 0; |
44 | 0 | ssize_t sret; |
45 | 0 | bool flushed = FALSE; |
46 | 0 | int ret; |
47 | |
|
48 | 0 | *error_r = NULL; |
49 | |
|
50 | 0 | in_buf = *buf; |
51 | |
|
52 | 0 | while (in_buf.cp_count > 0) { |
53 | 0 | if (in_buf.cp_count > 0) { |
54 | 0 | i_assert(trans->def->input != NULL); |
55 | 0 | sret = trans->def->input(trans, &in_buf, error_r); |
56 | 0 | if (sret < 0) { |
57 | 0 | i_assert(*error_r != NULL); |
58 | 0 | return -1; |
59 | 0 | } |
60 | 0 | if (sret > 0) { |
61 | 0 | i_assert((size_t)sret <= in_buf.cp_count); |
62 | 0 | in_buf.cp += sret; |
63 | 0 | in_buf.cp_count -= sret; |
64 | 0 | input_total += sret; |
65 | 0 | flushed = FALSE; |
66 | 0 | continue; |
67 | 0 | } |
68 | 0 | if (sret == 0 && flushed) |
69 | 0 | break; |
70 | 0 | } |
71 | | |
72 | 0 | struct unicode_transform *tp = trans; |
73 | |
|
74 | 0 | while (tp->next != NULL) { |
75 | 0 | if (tp->def->flush != NULL) { |
76 | 0 | ret = tp->def->flush(tp, FALSE, error_r); |
77 | 0 | if (ret < 0) { |
78 | 0 | i_assert(*error_r != NULL); |
79 | 0 | return -1; |
80 | 0 | } |
81 | 0 | } |
82 | 0 | tp = tp->next; |
83 | 0 | } |
84 | | |
85 | 0 | flushed = TRUE; |
86 | 0 | } |
87 | | |
88 | 0 | return input_total; |
89 | 0 | } |
90 | | |
91 | | int unicode_transform_flush(struct unicode_transform *trans, |
92 | | const char **error_r) |
93 | 0 | { |
94 | 0 | int ret; |
95 | |
|
96 | 0 | *error_r = NULL; |
97 | |
|
98 | 0 | while (trans != NULL) { |
99 | 0 | struct unicode_transform *tp = trans; |
100 | 0 | bool progress = FALSE; |
101 | |
|
102 | 0 | while (tp != NULL) { |
103 | 0 | if (tp->def->flush == NULL) { |
104 | 0 | progress = TRUE; |
105 | 0 | if (tp == trans) |
106 | 0 | trans = trans->next; |
107 | 0 | } else { |
108 | 0 | ret = tp->def->flush(tp, (tp == trans), error_r); |
109 | 0 | if (ret < 0) { |
110 | 0 | i_assert(*error_r != NULL); |
111 | 0 | return -1; |
112 | 0 | } |
113 | 0 | if (ret > 0) { |
114 | 0 | progress = TRUE; |
115 | 0 | if (tp == trans) |
116 | 0 | trans = trans->next; |
117 | 0 | } |
118 | 0 | } |
119 | 0 | tp = tp->next; |
120 | 0 | } |
121 | 0 | if (!progress) |
122 | 0 | return 0; |
123 | 0 | } |
124 | 0 | return 1; |
125 | 0 | } |
126 | | |
127 | | /* Buffer Sink */ |
128 | | |
129 | | static ssize_t |
130 | | unicode_buffer_sink_input(struct unicode_transform *trans, |
131 | | const struct unicode_transform_buffer *buf, |
132 | | const char **error_r); |
133 | | |
134 | | static const struct unicode_transform_def unicode_buffer_sink_def = { |
135 | | .input = unicode_buffer_sink_input, |
136 | | }; |
137 | | |
138 | | void unicode_buffer_sink_init(struct unicode_buffer_sink *sink, |
139 | | buffer_t *buffer) |
140 | 0 | { |
141 | 0 | i_zero(sink); |
142 | 0 | unicode_transform_init(&sink->transform, &unicode_buffer_sink_def); |
143 | 0 | sink->buffer = buffer; |
144 | 0 | } |
145 | | |
146 | | static ssize_t |
147 | | unicode_buffer_sink_input(struct unicode_transform *trans, |
148 | | const struct unicode_transform_buffer *buf, |
149 | | const char **error_r ATTR_UNUSED) |
150 | 0 | { |
151 | 0 | struct unicode_buffer_sink *sink = |
152 | 0 | container_of(trans, struct unicode_buffer_sink, transform); |
153 | |
|
154 | 0 | uni_ucs4_to_utf8(buf->cp, buf->cp_count, sink->buffer); |
155 | 0 | return buf->cp_count; |
156 | 0 | } |
157 | | |
158 | | /* Static Array Sink */ |
159 | | |
160 | | static ssize_t |
161 | | unicode_static_array_sink_input(struct unicode_transform *trans, |
162 | | const struct unicode_transform_buffer *buf, |
163 | | const char **error_r); |
164 | | |
165 | | static const struct unicode_transform_def unicode_static_array_sink_def = { |
166 | | .input = unicode_static_array_sink_input, |
167 | | }; |
168 | | |
169 | | void unicode_static_array_sink_init(struct unicode_static_array_sink *sink, |
170 | | uint32_t *array, size_t array_size, |
171 | | size_t *array_pos) |
172 | 0 | { |
173 | 0 | i_zero(sink); |
174 | 0 | unicode_transform_init(&sink->transform, |
175 | 0 | &unicode_static_array_sink_def); |
176 | 0 | sink->array = array; |
177 | 0 | sink->array_size = array_size; |
178 | 0 | sink->array_pos = array_pos; |
179 | 0 | } |
180 | | |
181 | | static ssize_t |
182 | | unicode_static_array_sink_input(struct unicode_transform *trans, |
183 | | const struct unicode_transform_buffer *buf, |
184 | | const char **error_r) |
185 | 0 | { |
186 | 0 | struct unicode_static_array_sink *sink = |
187 | 0 | container_of(trans, struct unicode_static_array_sink, |
188 | 0 | transform); |
189 | |
|
190 | 0 | if (*sink->array_pos + buf->cp_count > sink->array_size) { |
191 | 0 | *error_r = "Output overflow"; |
192 | 0 | return -1; |
193 | 0 | } |
194 | 0 | memcpy(sink->array + *sink->array_pos, buf->cp, |
195 | 0 | buf->cp_count * sizeof(*buf->cp)); |
196 | 0 | *sink->array_pos += buf->cp_count; |
197 | 0 | return buf->cp_count; |
198 | 0 | } |
199 | | |
200 | | /* |
201 | | * Hangul syllable (de)composition |
202 | | */ |
203 | | |
/* Constants of the arithmetic Hangul syllable (de)composition. The *_END
   values are exclusive upper bounds. */
#define UNI_HANGUL_S_BASE 0xac00
#define UNI_HANGUL_L_BASE 0x1100
#define UNI_HANGUL_V_BASE 0x1161
#define UNI_HANGUL_T_BASE 0x11a7
#define UNI_HANGUL_L_COUNT 19
#define UNI_HANGUL_V_COUNT 21
#define UNI_HANGUL_T_COUNT 28
#define UNI_HANGUL_N_COUNT (UNI_HANGUL_V_COUNT * UNI_HANGUL_T_COUNT)
#define UNI_HANGUL_L_END (UNI_HANGUL_L_BASE + UNI_HANGUL_L_COUNT)
#define UNI_HANGUL_V_END (UNI_HANGUL_V_BASE + UNI_HANGUL_V_COUNT)
#define UNI_HANGUL_T_END (UNI_HANGUL_T_BASE + UNI_HANGUL_T_COUNT)
/* 0xac00 + 19 * 588 == 0xD7A4 */
#define UNI_HANGUL_S_END \
	(UNI_HANGUL_S_BASE + UNI_HANGUL_L_COUNT * UNI_HANGUL_N_COUNT)

/* Decompose the precomposed Hangul syllable cp into its jamo. Writes the
   leading consonant and vowel to buf[0] and buf[1], and the trailing
   consonant (only when the syllable has one) to buf[2]. Returns the number of
   code points written: 2 or 3. */
static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
{
	/* The Unicode Standard, Section 3.12.2:
	   Hangul Syllable Decomposition
	 */

	size_t s_index = cp - UNI_HANGUL_S_BASE;
	size_t t_index = s_index % UNI_HANGUL_T_COUNT;
	size_t v_index = (s_index % UNI_HANGUL_N_COUNT) / UNI_HANGUL_T_COUNT;
	size_t l_index = s_index / UNI_HANGUL_N_COUNT;
	size_t written = 0;

	buf[written++] = UNI_HANGUL_L_BASE + l_index;
	buf[written++] = UNI_HANGUL_V_BASE + v_index;
	if (t_index != 0)
		buf[written++] = UNI_HANGUL_T_BASE + t_index;
	return written;
}

/* Compose the pair <l, r> into a single Hangul syllable when it is either
   <leading consonant, vowel> or <LV syllable, trailing consonant>. Returns
   the composed syllable, or 0x0000 when the pair does not compose. */
static uint32_t unicode_hangul_compose_pair(uint32_t l, uint32_t r)
{
	/* The Unicode Standard, Section 3.12.3:
	   Hangul Syllable Composition
	 */

	if (l >= UNI_HANGUL_L_BASE && l < UNI_HANGUL_L_END) {
		/* <LPart, VPart>; a leading consonant can never also be a
		   syllable, so there is nothing else to try when r is not a
		   vowel. */
		if (r < UNI_HANGUL_V_BASE || r >= UNI_HANGUL_V_END)
			return 0x0000;

		size_t l_index = l - UNI_HANGUL_L_BASE;
		size_t v_index = r - UNI_HANGUL_V_BASE;
		size_t lv_index = l_index * UNI_HANGUL_N_COUNT +
			v_index * UNI_HANGUL_T_COUNT;

		return UNI_HANGUL_S_BASE + lv_index;
	}

	/* <LVPart, TPart>: l must be a syllable without trailing consonant
	   and r a real trailing consonant (T_BASE itself means "none"). */
	if (l < UNI_HANGUL_S_BASE || l >= UNI_HANGUL_S_END)
		return 0x0000;
	if (r < (UNI_HANGUL_T_BASE + 1u) || r >= UNI_HANGUL_T_END)
		return 0x0000;
	if (((l - UNI_HANGUL_S_BASE) % UNI_HANGUL_T_COUNT) != 0)
		return 0x0000;

	size_t t_index = r - UNI_HANGUL_T_BASE;

	return l + t_index;
}
272 | | |
273 | | /* |
274 | | * Normalization transform: NFD, NFKD, NFC, NFKC |
275 | | */ |
276 | | |
277 | | static ssize_t |
278 | | unicode_nf_input(struct unicode_transform *trans, |
279 | | const struct unicode_transform_buffer *buf, |
280 | | const char **error_r); |
281 | | static int |
282 | | unicode_nf_flush(struct unicode_transform *trans, bool finished, |
283 | | const char **error_r); |
284 | | |
285 | | static const struct unicode_transform_def unicode_nf_def = { |
286 | | .input = unicode_nf_input, |
287 | | .flush = unicode_nf_flush, |
288 | | }; |
289 | | |
290 | | void unicode_nf_init(struct unicode_nf_context *ctx_r, |
291 | | enum unicode_nf_type type) |
292 | 0 | { |
293 | 0 | i_zero(ctx_r); |
294 | 0 | unicode_transform_init(&ctx_r->transform, &unicode_nf_def); |
295 | |
|
296 | 0 | switch (type) { |
297 | 0 | case UNICODE_NFD: |
298 | 0 | ctx_r->canonical = TRUE; |
299 | 0 | ctx_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK; |
300 | 0 | break; |
301 | 0 | case UNICODE_NFKD: |
302 | 0 | ctx_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK; |
303 | 0 | break; |
304 | 0 | case UNICODE_NFC: |
305 | 0 | ctx_r->compose = TRUE; |
306 | 0 | ctx_r->canonical = TRUE; |
307 | 0 | ctx_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK; |
308 | 0 | break; |
309 | 0 | case UNICODE_NFKC: |
310 | 0 | ctx_r->compose = TRUE; |
311 | 0 | ctx_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK; |
312 | 0 | break; |
313 | 0 | } |
314 | 0 | } |
315 | | |
316 | | void unicode_nf_reset(struct unicode_nf_context *ctx) |
317 | 0 | { |
318 | 0 | enum unicode_nf_type type = |
319 | 0 | (ctx->compose ? (ctx->canonical ? UNICODE_NFC : UNICODE_NFKC) : |
320 | 0 | (ctx->canonical ? UNICODE_NFD : UNICODE_NFKD)); |
321 | 0 | struct unicode_transform *next = ctx->transform.next; |
322 | |
|
323 | 0 | unicode_nf_init(ctx, type); |
324 | 0 | unicode_transform_chain(&ctx->transform, next); |
325 | 0 | } |
326 | | |
327 | | static void |
328 | | unicode_nf_buffer_delete(struct unicode_nf_context *ctx, size_t offset, |
329 | | size_t count) |
330 | 0 | { |
331 | 0 | if (count == 0) |
332 | 0 | return; |
333 | | |
334 | 0 | i_assert(offset < ctx->buffer_len); |
335 | 0 | i_assert(count <= ctx->buffer_len); |
336 | 0 | i_assert(offset <= (ctx->buffer_len - count)); |
337 | | |
338 | 0 | if (count == ctx->buffer_len) { |
339 | 0 | ctx->buffer_len = 0; |
340 | 0 | return; |
341 | 0 | } |
342 | | |
343 | 0 | size_t trailer = ctx->buffer_len - (offset + count); |
344 | 0 | if (trailer > 0) { |
345 | 0 | memmove(&ctx->cp_buffer[offset], |
346 | 0 | &ctx->cp_buffer[offset + count], |
347 | 0 | trailer * sizeof(ctx->cp_buffer[0])); |
348 | 0 | memmove(&ctx->cpd_buffer[offset], |
349 | 0 | &ctx->cpd_buffer[offset + count], |
350 | 0 | trailer * sizeof(ctx->cpd_buffer[0])); |
351 | 0 | } |
352 | 0 | ctx->buffer_len -= count; |
353 | 0 | } |
354 | | |
355 | | static void |
356 | | unicode_nf_buffer_swap(struct unicode_nf_context *ctx, |
357 | | size_t idx1, size_t idx2) |
358 | 0 | { |
359 | 0 | uint32_t tmp_cp = ctx->cp_buffer[idx2]; |
360 | 0 | const struct unicode_code_point_data *tmp_cpd = ctx->cpd_buffer[idx2]; |
361 | |
|
362 | 0 | ctx->cp_buffer[idx2] = ctx->cp_buffer[idx1]; |
363 | 0 | ctx->cpd_buffer[idx2] = ctx->cpd_buffer[idx1]; |
364 | 0 | ctx->cp_buffer[idx1] = tmp_cp; |
365 | 0 | ctx->cpd_buffer[idx1] = tmp_cpd; |
366 | 0 | } |
367 | | |
368 | | static void |
369 | | unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp, |
370 | | const struct unicode_code_point_data *cpd) |
371 | 0 | { |
372 | 0 | static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE; |
373 | 0 | uint8_t nf_qc_mask = ctx->nf_qc_mask; |
374 | 0 | size_t i; |
375 | | |
376 | | /* |
377 | | * Decompose the code point |
378 | | */ |
379 | |
|
380 | 0 | const uint32_t *decomp, *decomp_k; |
381 | 0 | uint32_t decomp_hangul[3]; |
382 | 0 | size_t len, len_k; |
383 | |
|
384 | 0 | if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) { |
385 | 0 | len = len_k = unicode_hangul_decompose(cp, decomp_hangul); |
386 | 0 | decomp = decomp_k = decomp_hangul; |
387 | 0 | } else { |
388 | 0 | if (cpd == NULL) |
389 | 0 | cpd = unicode_code_point_get_data(cp); |
390 | 0 | len = unicode_code_point_data_get_full_decomposition( |
391 | 0 | cpd, ctx->canonical, &decomp); |
392 | 0 | if (len == 0) { |
393 | 0 | decomp = &cp; |
394 | 0 | len = 1; |
395 | 0 | } |
396 | 0 | len_k = len; |
397 | 0 | decomp_k = decomp; |
398 | 0 | if (ctx->canonical) { |
399 | 0 | len_k = unicode_code_point_data_get_full_decomposition( |
400 | 0 | cpd, ctx->canonical, &decomp_k); |
401 | 0 | if (len_k == 0) { |
402 | 0 | decomp_k = decomp; |
403 | 0 | len_k = len; |
404 | 0 | } |
405 | 0 | } |
406 | 0 | if (len > 0) |
407 | 0 | cpd = NULL; |
408 | 0 | } |
409 | |
|
410 | 0 | i_assert(len <= UNICODE_DECOMPOSITION_MAX_LENGTH); |
411 | 0 | i_assert(len_k <= UNICODE_DECOMPOSITION_MAX_LENGTH); |
412 | | |
413 | 0 | if ((ctx->buffer_len + len) > buffer_size) { |
414 | | /* Decomposition overflows the buffer. Record and mark it as |
415 | | pending and come back to it once the buffer is sufficiently |
416 | | drained. */ |
417 | 0 | i_assert(ctx->pending_decomp == 0); |
418 | 0 | ctx->pending_decomp = len; |
419 | 0 | ctx->pending_cp = cp; |
420 | 0 | ctx->pending_cpd = cpd; |
421 | 0 | return; |
422 | 0 | } |
423 | | |
424 | | /* UAX15-D4: Stream-Safe Text Process is the process of producing a |
425 | | Unicode string in Stream-Safe Text Format by processing that string |
426 | | from start to finish, inserting U+034F COMBINING GRAPHEME JOINER |
427 | | (CGJ) within long sequences of non-starters. The exact position o |
428 | | the inserted CGJs are determined according to the following |
429 | | algorithm, which describes the generation of an output string from an |
430 | | input string: |
431 | | |
432 | | 1. If the input string is empty, return an empty output string. |
433 | | 2. Set nonStarterCount to zero. |
434 | | 3. For each code point C in the input string: |
435 | | a. Produce the NFKD decomposition S. |
436 | | b. If nonStarterCount plus the number of initial non-starters in |
437 | | S is greater than 30, append a CGJ to the output string and |
438 | | set the nonStarterCount to zero. |
439 | | c. Append C to the output string. |
440 | | d. If there are no starters in S, increment nonStarterCount by |
441 | | the number of code points in S; otherwise, set |
442 | | nonStarterCount to the number of trailing non-starters in S |
443 | | (which may be zero). |
444 | | 4. Return the output string. |
445 | | */ |
446 | | |
447 | | /* Determine number of leading and trailing non-starters in full NFKD |
448 | | decomposition. */ |
449 | 0 | const struct unicode_code_point_data * |
450 | 0 | decomp_cpd[UNICODE_DECOMPOSITION_MAX_LENGTH]; |
451 | 0 | size_t ns_lead = 0, ns_trail = 0; |
452 | 0 | bool seen_starter = FALSE; |
453 | 0 | for (i = 0; i < len_k; i++) { |
454 | 0 | if (cpd == NULL) |
455 | 0 | cpd = unicode_code_point_get_data(decomp[i]); |
456 | |
|
457 | 0 | uint8_t ccc = cpd->canonical_combining_class; |
458 | |
|
459 | 0 | if (decomp == decomp_k) { |
460 | 0 | decomp_cpd[i] = cpd; |
461 | 0 | cpd = NULL; |
462 | 0 | } |
463 | |
|
464 | 0 | if (ccc == 0) |
465 | 0 | seen_starter = TRUE; |
466 | 0 | else if (!seen_starter) |
467 | 0 | ns_lead++; |
468 | 0 | else |
469 | 0 | ns_trail++; |
470 | 0 | } |
471 | | |
472 | | /* Lookup canonical decomposed code points if necessary (avoid double |
473 | | lookups). */ |
474 | 0 | if (decomp != decomp_k) { |
475 | 0 | for (i = 0; i < len; i++) { |
476 | 0 | if (cpd == NULL) |
477 | 0 | cpd = unicode_code_point_get_data(decomp[i]); |
478 | 0 | decomp_cpd[i] = cpd; |
479 | 0 | cpd = NULL; |
480 | 0 | } |
481 | 0 | } |
482 | |
|
483 | 0 | ctx->nonstarter_count += ns_lead; |
484 | 0 | if (ctx->nonstarter_count > 30) { |
485 | 0 | ctx->nonstarter_count = ns_trail; |
486 | | |
487 | | /* Write U+034F COMBINING GRAPHEME JOINER (CGJ) |
488 | | */ |
489 | 0 | ctx->cp_buffer[ctx->buffer_len] = 0x034F; |
490 | 0 | ctx->cpd_buffer[ctx->buffer_len] = |
491 | 0 | unicode_code_point_get_data(0x034F); |
492 | 0 | ctx->buffer_len++; |
493 | 0 | } |
494 | | |
495 | | /* |
496 | | * Buffer the requested decomposition for COA sorting |
497 | | */ |
498 | |
|
499 | 0 | i_assert(ctx->buffer_len <= buffer_size); |
500 | 0 | if ((ctx->buffer_len + len) > buffer_size) { |
501 | | /* Decomposition now overflows the buffer. Record and mark it as |
502 | | pending and come back to it once the buffer is sufficiently |
503 | | drained. */ |
504 | 0 | i_assert(ctx->pending_decomp == 0); |
505 | 0 | ctx->pending_decomp = len; |
506 | 0 | ctx->pending_cp = cp; |
507 | 0 | ctx->pending_cpd = cpd; |
508 | 0 | } else { |
509 | 0 | for (i = 0; i < len; i++) { |
510 | 0 | ctx->cp_buffer[ctx->buffer_len] = decomp[i]; |
511 | 0 | ctx->cpd_buffer[ctx->buffer_len] = decomp_cpd[i]; |
512 | 0 | ctx->buffer_len++; |
513 | 0 | } |
514 | 0 | i_assert(ctx->buffer_len <= buffer_size); |
515 | 0 | } |
516 | | |
517 | | /* |
518 | | * Apply the Canonical Ordering Algorithm (COA) |
519 | | */ |
520 | | |
521 | 0 | bool changed = TRUE; |
522 | 0 | size_t last_qc_y; |
523 | 0 | size_t last_starter; |
524 | |
|
525 | 0 | while (changed) { |
526 | 0 | changed = FALSE; |
527 | 0 | last_qc_y = 0; |
528 | 0 | last_starter = 0; |
529 | |
|
530 | 0 | for (i = I_MAX(1, ctx->buffer_output_max); |
531 | 0 | i < ctx->buffer_len; i++) { |
532 | 0 | const struct unicode_code_point_data |
533 | 0 | *cpd_i = ctx->cpd_buffer[i], |
534 | 0 | *cpd_im1 = ctx->cpd_buffer[i - 1]; |
535 | 0 | uint8_t ccc_i = cpd_i->canonical_combining_class; |
536 | 0 | uint8_t ccc_im1 = cpd_im1->canonical_combining_class; |
537 | 0 | bool nqc = ((cpd_i->nf_quick_check & nf_qc_mask) == 0); |
538 | |
|
539 | 0 | if (ccc_i == 0) { |
540 | 0 | last_starter = i; |
541 | 0 | if (nqc) |
542 | 0 | last_qc_y = i; |
543 | 0 | } else if (ccc_im1 > ccc_i) { |
544 | 0 | unicode_nf_buffer_swap(ctx, i - 1, i); |
545 | 0 | changed = TRUE; |
546 | 0 | } |
547 | 0 | } |
548 | 0 | } |
549 | 0 | ctx->buffer_output_max = I_MIN(last_qc_y, last_starter); |
550 | 0 | } |
551 | | |
552 | | static bool |
553 | | unicode_nf_input_cp(struct unicode_nf_context *ctx, uint32_t cp, |
554 | | const struct unicode_code_point_data *cpd) |
555 | 0 | { |
556 | 0 | static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE; |
557 | |
|
558 | 0 | i_assert(ctx->buffer_len <= buffer_size); |
559 | 0 | if (ctx->buffer_len == buffer_size || |
560 | 0 | (ctx->pending_decomp > 0 && |
561 | 0 | ctx->buffer_len > (buffer_size - ctx->pending_decomp))) { |
562 | | /* Buffer is (still too) full. */ |
563 | 0 | return FALSE; |
564 | 0 | } |
565 | | |
566 | 0 | if (ctx->pending_decomp > 0) { |
567 | | /* Earlier, the buffer was too full for the next decomposition |
568 | | and it was recorded and marked as pending. Now, we have the |
569 | | opportunity to continue. */ |
570 | 0 | unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd); |
571 | 0 | ctx->pending_decomp = 0; |
572 | |
|
573 | 0 | i_assert(ctx->buffer_len <= buffer_size); |
574 | 0 | if (ctx->buffer_output_max > 0 && |
575 | 0 | ctx->buffer_len == buffer_size) { |
576 | | /* Pending decomposition filled the buffer completely. |
577 | | */ |
578 | 0 | return FALSE; |
579 | 0 | } |
580 | 0 | } |
581 | | |
582 | | /* Normal input of next code point */ |
583 | 0 | unicode_nf_cp(ctx, cp, cpd); |
584 | 0 | return TRUE; |
585 | 0 | } |
586 | | |
587 | | static ssize_t |
588 | | unicode_nf_input(struct unicode_transform *trans, |
589 | | const struct unicode_transform_buffer *buf, |
590 | | const char **error_r ATTR_UNUSED) |
591 | 0 | { |
592 | 0 | struct unicode_nf_context *ctx = |
593 | 0 | container_of(trans, struct unicode_nf_context, transform); |
594 | 0 | size_t n; |
595 | |
|
596 | 0 | for (n = 0; n < buf->cp_count; n++) { |
597 | 0 | if (!unicode_nf_input_cp(ctx, buf->cp[n], |
598 | 0 | (buf->cp_data == NULL ? |
599 | 0 | NULL : buf->cp_data[n]))) |
600 | 0 | break; |
601 | 0 | } |
602 | 0 | return n; |
603 | 0 | } |
604 | | |
/* Find the composition of the pair <l, r>. Hangul composition is tried
   first; otherwise the composition is looked up in the code point data of l.
   *l_data may be NULL on entry, in which case it is filled in with the data
   for l when the lookup is needed. Returns the composed code point, or
   whatever the data lookup yields for a pair without Hangul composition. */
static uint32_t
unicode_nf_compose_pair(uint32_t l, uint32_t r,
			const struct unicode_code_point_data **l_data)
{
	uint32_t hangul = unicode_hangul_compose_pair(l, r);

	if (hangul == 0x0000) {
		if (*l_data == NULL)
			*l_data = unicode_code_point_get_data(l);
		return unicode_code_point_data_find_composition(*l_data, r);
	}
	return hangul;
}
618 | | |
619 | | static int |
620 | | unicode_nf_flush_more(struct unicode_nf_context *ctx, bool finished, |
621 | | const char **error_r) |
622 | 0 | { |
623 | 0 | struct unicode_transform *trans = &ctx->transform; |
624 | |
|
625 | 0 | ctx->finished = finished; |
626 | |
|
627 | 0 | if (ctx->buffer_len == 0) |
628 | 0 | return 1; |
629 | 0 | if (!finished && ctx->buffer_output_max == 0) |
630 | 0 | return 0; |
631 | | |
632 | | /* |
633 | | * Apply the Canonical Composition Algorithm |
634 | | */ |
635 | | |
636 | 0 | if (ctx->finished) |
637 | 0 | ctx->buffer_output_max = ctx->buffer_len; |
638 | 0 | i_assert(ctx->buffer_processed <= ctx->buffer_output_max); |
639 | 0 | if (ctx->compose && ctx->buffer_len > 1) { |
640 | 0 | size_t in_pos, out_pos, starter; |
641 | 0 | int last_ccc; |
642 | |
|
643 | 0 | out_pos = 1; |
644 | 0 | last_ccc = -1; |
645 | 0 | starter = 0; |
646 | 0 | for (in_pos = I_MAX(1, ctx->buffer_processed); |
647 | 0 | in_pos < ctx->buffer_output_max; in_pos++) { |
648 | 0 | uint32_t cp = ctx->cp_buffer[in_pos]; |
649 | 0 | const struct unicode_code_point_data *cpd = |
650 | 0 | ctx->cpd_buffer[in_pos]; |
651 | |
|
652 | 0 | if (cpd == NULL) { |
653 | 0 | ctx->cpd_buffer[in_pos] = cpd = |
654 | 0 | unicode_code_point_get_data(cp); |
655 | 0 | } |
656 | |
|
657 | 0 | uint8_t ccc = cpd->canonical_combining_class; |
658 | 0 | uint32_t comp = 0x0000; |
659 | 0 | if (last_ccc < (int)ccc) { |
660 | 0 | comp = unicode_nf_compose_pair( |
661 | 0 | ctx->cp_buffer[starter], cp, |
662 | 0 | &ctx->cpd_buffer[starter]); |
663 | 0 | } |
664 | 0 | if (comp > 0x0000) { |
665 | 0 | ctx->cp_buffer[starter] = comp; |
666 | 0 | ctx->cpd_buffer[starter] = NULL; |
667 | 0 | } else if (ccc == 0) { |
668 | 0 | starter = out_pos; |
669 | 0 | last_ccc = -1; |
670 | 0 | ctx->cp_buffer[out_pos] = cp; |
671 | 0 | ctx->cpd_buffer[out_pos] = cpd; |
672 | 0 | out_pos++; |
673 | 0 | } else { |
674 | 0 | last_ccc = ccc; |
675 | 0 | ctx->cp_buffer[out_pos] = cp; |
676 | 0 | ctx->cpd_buffer[out_pos] = cpd; |
677 | 0 | out_pos++; |
678 | 0 | } |
679 | 0 | } |
680 | 0 | if (finished) { |
681 | 0 | ctx->buffer_len = ctx->buffer_output_max = out_pos; |
682 | 0 | } else if (in_pos > out_pos) { |
683 | 0 | unicode_nf_buffer_delete(ctx, out_pos, |
684 | 0 | (in_pos - out_pos)); |
685 | 0 | ctx->buffer_output_max = out_pos; |
686 | 0 | } |
687 | 0 | } |
688 | 0 | ctx->buffer_processed = ctx->buffer_output_max; |
689 | | |
690 | | /* |
691 | | * Forward output |
692 | | */ |
693 | |
|
694 | 0 | size_t output_len = ctx->buffer_processed; |
695 | 0 | ssize_t sret; |
696 | |
|
697 | 0 | sret = uniform_transform_forward(trans, ctx->cp_buffer, ctx->cpd_buffer, |
698 | 0 | output_len, error_r); |
699 | 0 | if (sret < 0) |
700 | 0 | return -1; |
701 | | |
702 | 0 | i_assert((size_t)sret <= ctx->buffer_processed); |
703 | 0 | unicode_nf_buffer_delete(ctx, 0, sret); |
704 | 0 | ctx->buffer_processed -= sret; |
705 | 0 | ctx->buffer_output_max -= sret; |
706 | 0 | if ((size_t)sret < output_len) |
707 | 0 | return 0; |
708 | 0 | return 1; |
709 | 0 | } |
710 | | |
711 | | static int |
712 | | unicode_nf_flush(struct unicode_transform *trans, bool finished, |
713 | | const char **error_r) |
714 | 0 | { |
715 | 0 | struct unicode_nf_context *ctx = |
716 | 0 | container_of(trans, struct unicode_nf_context, transform); |
717 | 0 | int ret; |
718 | |
|
719 | 0 | ret = unicode_nf_flush_more(ctx, finished, error_r); |
720 | 0 | if (ret <= 0) |
721 | 0 | return ret; |
722 | | |
723 | 0 | if (finished && ctx->pending_decomp > 0) { |
724 | 0 | unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd); |
725 | 0 | ctx->pending_decomp = 0; |
726 | 0 | } |
727 | |
|
728 | 0 | return unicode_nf_flush_more(ctx, finished, error_r); |
729 | 0 | } |
730 | | |
731 | | /* |
732 | | * Normalization check |
733 | | */ |
734 | | |
735 | | static ssize_t |
736 | | unicode_nf_check_sink_input(struct unicode_transform *trans, |
737 | | const struct unicode_transform_buffer *buf, |
738 | | const char **error_r); |
739 | | |
740 | | static const struct unicode_transform_def unicode_nf_check_sink_def = { |
741 | | .input = unicode_nf_check_sink_input, |
742 | | }; |
743 | | |
744 | | void unicode_nf_checker_init(struct unicode_nf_checker *unc_r, |
745 | | enum unicode_nf_type type) |
746 | 0 | { |
747 | 0 | i_zero(unc_r); |
748 | |
|
749 | 0 | switch (type) { |
750 | 0 | case UNICODE_NFD: |
751 | 0 | unc_r->canonical = TRUE; |
752 | 0 | unc_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK; |
753 | 0 | unc_r->nf_qc_yes = UNICODE_NFD_QUICK_CHECK_YES; |
754 | 0 | unc_r->nf_qc_no = UNICODE_NFD_QUICK_CHECK_NO; |
755 | 0 | break; |
756 | 0 | case UNICODE_NFKD: |
757 | 0 | unc_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK; |
758 | 0 | unc_r->nf_qc_yes = UNICODE_NFKD_QUICK_CHECK_YES; |
759 | 0 | unc_r->nf_qc_no = UNICODE_NFKD_QUICK_CHECK_NO; |
760 | 0 | break; |
761 | 0 | case UNICODE_NFC: |
762 | 0 | unc_r->compose = TRUE; |
763 | 0 | unc_r->canonical = TRUE; |
764 | 0 | unc_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK; |
765 | 0 | unc_r->nf_qc_yes = UNICODE_NFC_QUICK_CHECK_YES; |
766 | 0 | unc_r->nf_qc_no = UNICODE_NFC_QUICK_CHECK_NO; |
767 | 0 | break; |
768 | 0 | case UNICODE_NFKC: |
769 | 0 | unc_r->compose = TRUE; |
770 | 0 | unc_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK; |
771 | 0 | unc_r->nf_qc_yes = UNICODE_NFKC_QUICK_CHECK_YES; |
772 | 0 | unc_r->nf_qc_no = UNICODE_NFKC_QUICK_CHECK_NO; |
773 | 0 | break; |
774 | 0 | } |
775 | | |
776 | 0 | unicode_nf_init(&unc_r->nf, type); |
777 | 0 | unicode_transform_init(&unc_r->sink, &unicode_nf_check_sink_def); |
778 | 0 | unicode_transform_chain(&unc_r->nf.transform, &unc_r->sink); |
779 | 0 | } |
780 | | |
781 | | void unicode_nf_checker_reset(struct unicode_nf_checker *unc) |
782 | 0 | { |
783 | 0 | enum unicode_nf_type type = |
784 | 0 | (unc->compose ? (unc->canonical ? UNICODE_NFC : UNICODE_NFKC) : |
785 | 0 | (unc->canonical ? UNICODE_NFD : UNICODE_NFKD)); |
786 | |
|
787 | 0 | unicode_nf_checker_init(unc, type); |
788 | 0 | } |
789 | | |
790 | | static ssize_t |
791 | | unicode_nf_check_sink_input(struct unicode_transform *trans, |
792 | | const struct unicode_transform_buffer *buf, |
793 | | const char **error_r) |
794 | 0 | { |
795 | 0 | struct unicode_nf_checker *unc = |
796 | 0 | container_of(trans, struct unicode_nf_checker, sink); |
797 | 0 | size_t n; |
798 | |
|
799 | 0 | i_assert(unc->buffer_len > 0); |
800 | 0 | i_assert(buf->cp_count <= unc->buffer_len); |
801 | 0 | for (n = 0; n < buf->cp_count; n++) { |
802 | 0 | if (buf->cp[n] != unc->cp_buffer[n]) { |
803 | 0 | *error_r = "Not normalized"; |
804 | 0 | return -1; |
805 | 0 | } |
806 | 0 | } |
807 | 0 | if (buf->cp_count == unc->buffer_len) |
808 | 0 | unc->buffer_len = 0; |
809 | 0 | else { |
810 | 0 | unc->buffer_len -= buf->cp_count; |
811 | 0 | memmove(&unc->cp_buffer[0], &unc->cp_buffer[buf->cp_count], |
812 | 0 | unc->buffer_len); |
813 | 0 | } |
814 | 0 | return buf->cp_count; |
815 | 0 | } |
816 | | |
817 | | int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp, |
818 | | const struct unicode_code_point_data **_cp_data) |
819 | 0 | { |
820 | 0 | const struct unicode_code_point_data *cpd_last = unc->cpd_last; |
821 | |
|
822 | 0 | if (*_cp_data == NULL) |
823 | 0 | *_cp_data = unicode_code_point_get_data(cp); |
824 | |
|
825 | 0 | const struct unicode_code_point_data *cp_data = *_cp_data; |
826 | 0 | const char *error; |
827 | 0 | int ret; |
828 | |
|
829 | 0 | unc->cpd_last = cp_data; |
830 | |
|
831 | 0 | if (cp_data->general_category == UNICODE_GENERAL_CATEGORY_INVALID) |
832 | 0 | return -1; |
833 | 0 | if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_no) |
834 | 0 | return 0; |
835 | 0 | if (cpd_last != NULL && cp_data->canonical_combining_class != 0 && |
836 | 0 | cpd_last->canonical_combining_class > |
837 | 0 | cp_data->canonical_combining_class) |
838 | 0 | return 0; |
839 | 0 | if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_yes && |
840 | 0 | cp_data->canonical_combining_class == 0) { |
841 | 0 | if (unc->buffer_len > 0) { |
842 | 0 | ret = unicode_transform_flush(&unc->nf.transform, |
843 | 0 | &error); |
844 | 0 | i_assert(ret != 0); |
845 | 0 | if (ret < 0) |
846 | 0 | return 0; |
847 | 0 | unicode_nf_reset(&unc->nf); |
848 | 0 | } |
849 | 0 | i_assert(unc->buffer_len == 0); |
850 | 0 | unc->cp_buffer[0] = cp; |
851 | 0 | return 1; |
852 | 0 | } |
853 | | |
854 | 0 | struct unicode_transform_buffer buf; |
855 | 0 | ssize_t sret; |
856 | |
|
857 | 0 | if (unc->buffer_len == 0 && cpd_last != NULL) { |
858 | 0 | i_zero(&buf); |
859 | 0 | buf.cp = &unc->cp_buffer[0]; |
860 | 0 | buf.cp_data = &cpd_last; |
861 | 0 | buf.cp_count = 1; |
862 | |
|
863 | 0 | unc->buffer_len++; |
864 | 0 | sret = unicode_transform_input_buf(&unc->nf.transform, &buf, |
865 | 0 | &error); |
866 | 0 | i_assert(sret != 0); |
867 | 0 | if (sret < 0) |
868 | 0 | return 0; |
869 | 0 | } |
870 | | |
871 | 0 | i_assert(unc->buffer_len < UNICODE_NF_BUFFER_SIZE); |
872 | 0 | unc->cp_buffer[unc->buffer_len] = cp; |
873 | 0 | unc->buffer_len++; |
874 | |
|
875 | 0 | i_zero(&buf); |
876 | 0 | buf.cp = &cp; |
877 | 0 | buf.cp_data = &cp_data; |
878 | 0 | buf.cp_count = 1; |
879 | 0 | sret = unicode_transform_input_buf(&unc->nf.transform, &buf, &error); |
880 | 0 | i_assert(sret != 0); |
881 | 0 | if (sret < 0) |
882 | 0 | return 0; |
883 | 0 | return 1; |
884 | 0 | } |
885 | | |
886 | | int unicode_nf_checker_finish(struct unicode_nf_checker *unc) |
887 | 0 | { |
888 | 0 | if (unc->buffer_len == 0) |
889 | 0 | return 1; |
890 | | |
891 | 0 | const char *error; |
892 | 0 | int ret; |
893 | |
|
894 | 0 | ret = unicode_transform_flush(&unc->nf.transform, &error); |
895 | 0 | i_assert(ret != 0); |
896 | 0 | return (ret > 0 ? 1 : 0); |
897 | 0 | } |
898 | | |
899 | | /* |
900 | | * Casemap Transform |
901 | | */ |
902 | | |
/* Per-code-point mapping callbacks; one of these is stored in the casemap's
   map field by the matching unicode_casemap_init_*() function. */
static size_t unicode_casemap_uppercase_cp(
	const struct unicode_code_point_data *cp_data, const uint32_t **map_r);
static size_t unicode_casemap_lowercase_cp(
	const struct unicode_code_point_data *cp_data, const uint32_t **map_r);
static size_t unicode_casemap_casefold_cp(
	const struct unicode_code_point_data *cp_data, const uint32_t **map_r);

/* Transform callbacks referenced by unicode_casemap_def. */
static ssize_t unicode_casemap_input(
	struct unicode_transform *trans,
	const struct unicode_transform_buffer *buf, const char **error_r);
static int unicode_casemap_flush(
	struct unicode_transform *trans, bool finished, const char **error_r);
920 | | |
921 | | static const struct unicode_transform_def unicode_casemap_def = { |
922 | | .input = unicode_casemap_input, |
923 | | .flush = unicode_casemap_flush, |
924 | | }; |
925 | | |
926 | | void unicode_casemap_init_uppercase(struct unicode_casemap *map_r) |
927 | 0 | { |
928 | 0 | i_zero(map_r); |
929 | 0 | unicode_transform_init(&map_r->transform, &unicode_casemap_def); |
930 | 0 | map_r->map = unicode_casemap_uppercase_cp; |
931 | 0 | } |
932 | | |
933 | | void unicode_casemap_init_lowercase(struct unicode_casemap *map_r) |
934 | 0 | { |
935 | 0 | i_zero(map_r); |
936 | 0 | unicode_transform_init(&map_r->transform, &unicode_casemap_def); |
937 | 0 | map_r->map = unicode_casemap_lowercase_cp; |
938 | 0 | } |
939 | | |
940 | | void unicode_casemap_init_casefold(struct unicode_casemap *map_r) |
941 | 0 | { |
942 | 0 | i_zero(map_r); |
943 | 0 | unicode_transform_init(&map_r->transform, &unicode_casemap_def); |
944 | 0 | map_r->map = unicode_casemap_casefold_cp; |
945 | 0 | } |
946 | | |
/* Mapping callback for the uppercase variant: delegates to
   unicode_code_point_data_get_uppercase_mapping() and returns its result,
   with the mapped sequence reported through map_r. */
static size_t
unicode_casemap_uppercase_cp(const struct unicode_code_point_data *cp_data,
			     const uint32_t **map_r)
{
	size_t mapping_len;

	mapping_len =
		unicode_code_point_data_get_uppercase_mapping(cp_data, map_r);
	return mapping_len;
}
953 | | |
/* Mapping callback for the lowercase variant: delegates to
   unicode_code_point_data_get_lowercase_mapping() and returns its result,
   with the mapped sequence reported through map_r. */
static size_t
unicode_casemap_lowercase_cp(const struct unicode_code_point_data *cp_data,
			     const uint32_t **map_r)
{
	size_t mapping_len;

	mapping_len =
		unicode_code_point_data_get_lowercase_mapping(cp_data, map_r);
	return mapping_len;
}
960 | | |
/* Mapping callback for the casefold variant: delegates to
   unicode_code_point_data_get_casefold_mapping() and returns its result,
   with the mapped sequence reported through map_r. */
static size_t
unicode_casemap_casefold_cp(const struct unicode_code_point_data *cp_data,
			    const uint32_t **map_r)
{
	size_t mapping_len;

	mapping_len =
		unicode_code_point_data_get_casefold_mapping(cp_data, map_r);
	return mapping_len;
}
967 | | |
968 | | static ssize_t |
969 | | unicode_casemap_input_cp(struct unicode_casemap *map, uint32_t cp, |
970 | | const struct unicode_code_point_data *cp_data, |
971 | | const char **error_r) |
972 | 0 | { |
973 | 0 | bool was_buffered = map->cp_buffered; |
974 | 0 | ssize_t sret; |
975 | |
|
976 | 0 | if (cp_data == NULL) |
977 | 0 | cp_data = unicode_code_point_get_data(cp); |
978 | |
|
979 | 0 | const uint32_t *map_cps; |
980 | 0 | const struct unicode_code_point_data *const *map_cps_data = NULL; |
981 | 0 | size_t map_cps_len; |
982 | |
|
983 | 0 | map_cps_len = map->map(cp_data, &map_cps); |
984 | 0 | if (map_cps_len == 0) { |
985 | 0 | map_cps = &cp; |
986 | 0 | map_cps_data = &cp_data; |
987 | 0 | map_cps_len = 1; |
988 | 0 | } |
989 | 0 | i_assert(map_cps_len > map->cp_map_pos); |
990 | | |
991 | 0 | map_cps += map->cp_map_pos; |
992 | 0 | map_cps_len -= map->cp_map_pos; |
993 | 0 | sret = uniform_transform_forward(&map->transform, |
994 | 0 | map_cps, map_cps_data, map_cps_len, |
995 | 0 | error_r); |
996 | 0 | if (sret < 0) { |
997 | 0 | i_assert(*error_r != NULL); |
998 | 0 | return -1; |
999 | 0 | } |
1000 | 0 | if ((size_t)sret < map_cps_len) { |
1001 | 0 | map->cp_buffered = TRUE; |
1002 | 0 | map->cp = cp; |
1003 | 0 | map->cp_data = cp_data; |
1004 | 0 | map->cp_map_pos += sret; |
1005 | 0 | return (was_buffered ? 0 : 1); |
1006 | 0 | } |
1007 | | |
1008 | 0 | map->cp_buffered = FALSE; |
1009 | 0 | map->cp_data = NULL; |
1010 | 0 | map->cp_map_pos = 0; |
1011 | 0 | return 1; |
1012 | 0 | } |
1013 | | |
1014 | | static ssize_t |
1015 | | unicode_casemap_input(struct unicode_transform *trans, |
1016 | | const struct unicode_transform_buffer *buf, |
1017 | | const char **error_r) |
1018 | 0 | { |
1019 | 0 | struct unicode_casemap *map = |
1020 | 0 | container_of(trans, struct unicode_casemap, transform); |
1021 | 0 | int ret; |
1022 | |
|
1023 | 0 | ret = unicode_casemap_flush(trans, TRUE, error_r); |
1024 | 0 | if (ret < 0) { |
1025 | 0 | i_assert(*error_r != NULL); |
1026 | 0 | return -1; |
1027 | 0 | } |
1028 | 0 | if (map->cp_buffered) |
1029 | 0 | return 0; |
1030 | | |
1031 | 0 | size_t n; |
1032 | 0 | for (n = 0; n < buf->cp_count; n++) { |
1033 | 0 | if (map->cp_buffered) |
1034 | 0 | break; |
1035 | 0 | ret = unicode_casemap_input_cp(map, buf->cp[n], |
1036 | 0 | (buf->cp_data != NULL ? |
1037 | 0 | buf->cp_data[n] : NULL), |
1038 | 0 | error_r); |
1039 | 0 | if (ret < 0) { |
1040 | 0 | i_assert(*error_r != NULL); |
1041 | 0 | return -1; |
1042 | 0 | } |
1043 | 0 | if (ret == 0) |
1044 | 0 | break; |
1045 | 0 | } |
1046 | 0 | return n; |
1047 | 0 | } |
1048 | | |
1049 | | static int |
1050 | | unicode_casemap_flush(struct unicode_transform *trans, |
1051 | | bool finished ATTR_UNUSED, const char **error_r) |
1052 | 0 | { |
1053 | 0 | struct unicode_casemap *map = |
1054 | 0 | container_of(trans, struct unicode_casemap, transform); |
1055 | 0 | int ret; |
1056 | |
|
1057 | 0 | if (!map->cp_buffered) |
1058 | 0 | return 1; |
1059 | | |
1060 | 0 | ret = unicode_casemap_input_cp(map, map->cp, map->cp_data, error_r); |
1061 | 0 | i_assert(ret >= 0 || *error_r != NULL); |
1062 | 0 | return ret; |
1063 | 0 | } |
1064 | | |
1065 | | /* |
1066 | | * RFC 5051 - Simple Unicode Collation Algorithm |
1067 | | */ |
1068 | | |
/* Reset an RFC 5051 normalization context by zeroing the whole structure. */
void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx)
{
	i_zero(ctx);
}
1073 | | |
1074 | | size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx, |
1075 | | uint32_t cp, const uint32_t **norm_r) |
1076 | 0 | { |
1077 | 0 | const struct unicode_code_point_data *cpd; |
1078 | 0 | size_t len; |
1079 | |
|
1080 | 0 | cpd = unicode_code_point_get_data(cp); |
1081 | 0 | if (cpd->simple_titlecase_mapping != 0x0000) |
1082 | 0 | cp = cpd->simple_titlecase_mapping; |
1083 | |
|
1084 | 0 | if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) { |
1085 | 0 | *norm_r = ctx->buffer; |
1086 | 0 | return unicode_hangul_decompose(cp, ctx->buffer); |
1087 | 0 | } |
1088 | | |
1089 | 0 | len = unicode_code_point_get_full_decomposition(cp, FALSE, norm_r); |
1090 | 0 | if (len == 0) { |
1091 | 0 | ctx->buffer[0] = cp; |
1092 | 0 | *norm_r = ctx->buffer; |
1093 | 0 | return 1; |
1094 | 0 | } |
1095 | 0 | return len; |
1096 | 0 | } |