/src/dovecot/src/lib/unicode-transform.c
Line | Count | Source |
1 | | /* Copyright (c) 2025 Dovecot authors, see the included COPYING file */ |
2 | | |
3 | | #include "lib.h" |
4 | | #include "unichar.h" |
5 | | #include "unicode-data.h" |
6 | | #include "unicode-transform.h" |
7 | | |
/* Inclusive code point bounds used by this file to detect precomposed Hangul
   syllables before decomposing them algorithmically. */
#define HANGUL_FIRST 0xAC00
#define HANGUL_LAST 0xD7A3
10 | | |
11 | | /* |
12 | | * Transform |
13 | | */ |
14 | | |
15 | | ssize_t uniform_transform_forward( |
16 | | struct unicode_transform *trans, const uint32_t *out, |
17 | | const struct unicode_code_point_data *const *out_data, size_t out_len, |
18 | | const char **error_r) |
19 | 0 | { |
20 | 0 | struct unicode_transform_buffer buf_next; |
21 | 0 | ssize_t sret; |
22 | |
|
23 | 0 | i_zero(&buf_next); |
24 | 0 | buf_next.cp = out; |
25 | 0 | buf_next.cp_data = out_data; |
26 | 0 | buf_next.cp_count = out_len; |
27 | |
|
28 | 0 | i_assert(trans->next != NULL); |
29 | 0 | i_assert(trans->next->def != NULL); |
30 | 0 | i_assert(trans->next->def->input != NULL); |
31 | 0 | sret = trans->next->def->input(trans->next, &buf_next, error_r); |
32 | |
|
33 | 0 | i_assert(sret >= 0 || *error_r != NULL); |
34 | 0 | i_assert(sret <= (ssize_t)out_len); |
35 | 0 | return sret; |
36 | 0 | } |
37 | | |
38 | | ssize_t unicode_transform_input_buf(struct unicode_transform *trans, |
39 | | const struct unicode_transform_buffer *buf, |
40 | | const char **error_r) |
41 | 0 | { |
42 | 0 | struct unicode_transform_buffer in_buf; |
43 | 0 | size_t input_total = 0; |
44 | 0 | ssize_t sret; |
45 | 0 | bool flushed = FALSE; |
46 | 0 | int ret; |
47 | |
|
48 | 0 | *error_r = NULL; |
49 | |
|
50 | 0 | in_buf = *buf; |
51 | |
|
52 | 0 | while (in_buf.cp_count > 0) { |
53 | 0 | if (in_buf.cp_count > 0) { |
54 | 0 | i_assert(trans->def->input != NULL); |
55 | 0 | sret = trans->def->input(trans, &in_buf, error_r); |
56 | 0 | if (sret < 0) { |
57 | 0 | i_assert(*error_r != NULL); |
58 | 0 | return -1; |
59 | 0 | } |
60 | 0 | if (sret > 0) { |
61 | 0 | i_assert((size_t)sret <= in_buf.cp_count); |
62 | 0 | in_buf.cp += sret; |
63 | 0 | in_buf.cp_count -= sret; |
64 | 0 | input_total += sret; |
65 | 0 | flushed = FALSE; |
66 | 0 | continue; |
67 | 0 | } |
68 | 0 | if (sret == 0 && flushed) |
69 | 0 | break; |
70 | 0 | } |
71 | | |
72 | 0 | struct unicode_transform *tp = trans; |
73 | |
|
74 | 0 | while (tp->next != NULL) { |
75 | 0 | if (tp->def->flush != NULL) { |
76 | 0 | ret = tp->def->flush(tp, FALSE, error_r); |
77 | 0 | if (ret < 0) { |
78 | 0 | i_assert(*error_r != NULL); |
79 | 0 | return -1; |
80 | 0 | } |
81 | 0 | } |
82 | 0 | tp = tp->next; |
83 | 0 | } |
84 | | |
85 | 0 | flushed = TRUE; |
86 | 0 | } |
87 | | |
88 | 0 | return input_total; |
89 | 0 | } |
90 | | |
91 | | int unicode_transform_flush(struct unicode_transform *trans, |
92 | | const char **error_r) |
93 | 0 | { |
94 | 0 | int ret; |
95 | |
|
96 | 0 | *error_r = NULL; |
97 | |
|
98 | 0 | while (trans != NULL) { |
99 | 0 | struct unicode_transform *tp = trans; |
100 | 0 | bool progress = FALSE; |
101 | |
|
102 | 0 | while (tp != NULL) { |
103 | 0 | if (tp->def->flush == NULL) { |
104 | 0 | progress = TRUE; |
105 | 0 | if (tp == trans) |
106 | 0 | trans = trans->next; |
107 | 0 | } else { |
108 | 0 | ret = tp->def->flush(tp, (tp == trans), error_r); |
109 | 0 | if (ret < 0) { |
110 | 0 | i_assert(*error_r != NULL); |
111 | 0 | return -1; |
112 | 0 | } |
113 | 0 | if (ret > 0) { |
114 | 0 | progress = TRUE; |
115 | 0 | if (tp == trans) |
116 | 0 | trans = trans->next; |
117 | 0 | } |
118 | 0 | } |
119 | 0 | tp = tp->next; |
120 | 0 | } |
121 | 0 | if (!progress) |
122 | 0 | return 0; |
123 | 0 | } |
124 | 0 | return 1; |
125 | 0 | } |
126 | | |
127 | | /* Buffer Sink */ |
128 | | |
129 | | static ssize_t |
130 | | unicode_buffer_sink_input(struct unicode_transform *trans, |
131 | | const struct unicode_transform_buffer *buf, |
132 | | const char **error_r); |
133 | | |
134 | | static const struct unicode_transform_def unicode_buffer_sink_def = { |
135 | | .input = unicode_buffer_sink_input, |
136 | | }; |
137 | | |
138 | | void unicode_buffer_sink_init(struct unicode_buffer_sink *sink, |
139 | | buffer_t *buffer) |
140 | 0 | { |
141 | 0 | i_zero(sink); |
142 | 0 | unicode_transform_init(&sink->transform, &unicode_buffer_sink_def); |
143 | 0 | sink->buffer = buffer; |
144 | 0 | } |
145 | | |
146 | | static ssize_t |
147 | | unicode_buffer_sink_input(struct unicode_transform *trans, |
148 | | const struct unicode_transform_buffer *buf, |
149 | | const char **error_r ATTR_UNUSED) |
150 | 0 | { |
151 | 0 | struct unicode_buffer_sink *sink = |
152 | 0 | container_of(trans, struct unicode_buffer_sink, transform); |
153 | |
|
154 | 0 | uni_ucs4_to_utf8(buf->cp, buf->cp_count, sink->buffer); |
155 | 0 | return buf->cp_count; |
156 | 0 | } |
157 | | |
158 | | /* Static Array Sink */ |
159 | | |
160 | | static ssize_t |
161 | | unicode_static_array_sink_input(struct unicode_transform *trans, |
162 | | const struct unicode_transform_buffer *buf, |
163 | | const char **error_r); |
164 | | |
165 | | static const struct unicode_transform_def unicode_static_array_sink_def = { |
166 | | .input = unicode_static_array_sink_input, |
167 | | }; |
168 | | |
169 | | void unicode_static_array_sink_init(struct unicode_static_array_sink *sink, |
170 | | uint32_t *array, size_t array_size, |
171 | | size_t *array_pos) |
172 | 0 | { |
173 | 0 | i_zero(sink); |
174 | 0 | unicode_transform_init(&sink->transform, |
175 | 0 | &unicode_static_array_sink_def); |
176 | 0 | sink->array = array; |
177 | 0 | sink->array_size = array_size; |
178 | 0 | sink->array_pos = array_pos; |
179 | 0 | } |
180 | | |
181 | | static ssize_t |
182 | | unicode_static_array_sink_input(struct unicode_transform *trans, |
183 | | const struct unicode_transform_buffer *buf, |
184 | | const char **error_r) |
185 | 0 | { |
186 | 0 | struct unicode_static_array_sink *sink = |
187 | 0 | container_of(trans, struct unicode_static_array_sink, |
188 | 0 | transform); |
189 | |
|
190 | 0 | if (*sink->array_pos + buf->cp_count > sink->array_size) { |
191 | 0 | *error_r = "Output overflow"; |
192 | 0 | return -1; |
193 | 0 | } |
194 | 0 | memcpy(sink->array + *sink->array_pos, buf->cp, |
195 | 0 | buf->cp_count * sizeof(*buf->cp)); |
196 | 0 | *sink->array_pos += buf->cp_count; |
197 | 0 | return buf->cp_count; |
198 | 0 | } |
199 | | |
200 | | /* |
201 | | * Hangul syllable (de)composition |
202 | | */ |
203 | | |
/* Constants of the arithmetic Hangul syllable (de)composition. The *_END
   values are exclusive upper bounds. */
#define UNI_HANGUL_S_BASE 0xAC00
#define UNI_HANGUL_L_BASE 0x1100
#define UNI_HANGUL_V_BASE 0x1161
#define UNI_HANGUL_T_BASE 0x11A7
#define UNI_HANGUL_L_COUNT 19
#define UNI_HANGUL_V_COUNT 21
#define UNI_HANGUL_T_COUNT 28
#define UNI_HANGUL_N_COUNT (UNI_HANGUL_V_COUNT * UNI_HANGUL_T_COUNT)
#define UNI_HANGUL_L_END (UNI_HANGUL_L_BASE + UNI_HANGUL_L_COUNT)
#define UNI_HANGUL_V_END (UNI_HANGUL_V_BASE + UNI_HANGUL_V_COUNT)
#define UNI_HANGUL_T_END (UNI_HANGUL_T_BASE + UNI_HANGUL_T_COUNT)
/* 0xAC00 + 19 * 588 == 0xD7A4 */
#define UNI_HANGUL_S_END \
	(UNI_HANGUL_S_BASE + UNI_HANGUL_L_COUNT * UNI_HANGUL_N_COUNT)

/* Decompose the precomposed Hangul syllable `cp` into its jamo parts
   (The Unicode Standard, Section 3.12.2: Hangul Syllable Decomposition).

   Writes <L, V> or <L, V, T> to `buf` and returns the number of code points
   written (2 or 3). No range check is performed on `cp` here. */
static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
{
	const uint32_t s_index = cp - UNI_HANGUL_S_BASE;
	const uint32_t t_index = s_index % UNI_HANGUL_T_COUNT;
	size_t written = 2;

	buf[0] = UNI_HANGUL_L_BASE + s_index / UNI_HANGUL_N_COUNT;
	buf[1] = UNI_HANGUL_V_BASE +
		(s_index % UNI_HANGUL_N_COUNT) / UNI_HANGUL_T_COUNT;
	if (t_index != 0) {
		/* Only syllables with a trailing consonant have a T part. */
		buf[written++] = UNI_HANGUL_T_BASE + t_index;
	}
	return written;
}
243 | | |
244 | | static uint32_t unicode_hangul_compose_pair(uint32_t l, uint32_t r) |
245 | 0 | { |
246 | | /* The Unicode Standard, Section 3.12.3: |
247 | | Hangul Syllable Composition |
248 | | */ |
249 | | |
250 | | /* <LPart, VPart> */ |
251 | 0 | if (l >= UNI_HANGUL_L_BASE && l < UNI_HANGUL_L_END && |
252 | 0 | r >= UNI_HANGUL_V_BASE && r < UNI_HANGUL_V_END) { |
253 | 0 | uint32_t l_part = l, v_part = r; |
254 | |
|
255 | 0 | size_t l_index = l_part - UNI_HANGUL_L_BASE; |
256 | 0 | size_t v_index = v_part - UNI_HANGUL_V_BASE; |
257 | 0 | size_t lv_index = l_index * UNI_HANGUL_N_COUNT + |
258 | 0 | v_index * UNI_HANGUL_T_COUNT; |
259 | 0 | return UNI_HANGUL_S_BASE + lv_index; |
260 | 0 | } |
261 | | /* A sequence <LVPart, TPart> */ |
262 | 0 | if (l >= UNI_HANGUL_S_BASE && l < UNI_HANGUL_S_END && |
263 | 0 | r >= (UNI_HANGUL_T_BASE + 1u) && r < UNI_HANGUL_T_END && |
264 | 0 | ((l - UNI_HANGUL_S_BASE) % UNI_HANGUL_T_COUNT) == 0) { |
265 | 0 | uint32_t lv_part = l, t_part = r; |
266 | |
|
267 | 0 | size_t t_index = t_part - UNI_HANGUL_T_BASE; |
268 | 0 | return lv_part + t_index; |
269 | 0 | } |
270 | 0 | return 0x0000; |
271 | 0 | } |
272 | | |
273 | | /* |
274 | | * Normalization transform: NFD, NFKD, NFC, NFKC |
275 | | */ |
276 | | |
277 | | static ssize_t |
278 | | unicode_nf_input(struct unicode_transform *trans, |
279 | | const struct unicode_transform_buffer *buf, |
280 | | const char **error_r); |
281 | | static int |
282 | | unicode_nf_flush(struct unicode_transform *trans, bool finished, |
283 | | const char **error_r); |
284 | | |
285 | | static const struct unicode_transform_def unicode_nf_def = { |
286 | | .input = unicode_nf_input, |
287 | | .flush = unicode_nf_flush, |
288 | | }; |
289 | | |
290 | | void unicode_nf_init(struct unicode_nf_context *ctx_r, |
291 | | enum unicode_nf_type type) |
292 | 0 | { |
293 | 0 | i_zero(ctx_r); |
294 | 0 | unicode_transform_init(&ctx_r->transform, &unicode_nf_def); |
295 | |
|
296 | 0 | switch (type) { |
297 | 0 | case UNICODE_NFD: |
298 | 0 | ctx_r->canonical = TRUE; |
299 | 0 | ctx_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK; |
300 | 0 | break; |
301 | 0 | case UNICODE_NFKD: |
302 | 0 | ctx_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK; |
303 | 0 | break; |
304 | 0 | case UNICODE_NFC: |
305 | 0 | ctx_r->compose = TRUE; |
306 | 0 | ctx_r->canonical = TRUE; |
307 | 0 | ctx_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK; |
308 | 0 | break; |
309 | 0 | case UNICODE_NFKC: |
310 | 0 | ctx_r->compose = TRUE; |
311 | 0 | ctx_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK; |
312 | 0 | break; |
313 | 0 | } |
314 | 0 | } |
315 | | |
316 | | void unicode_nf_reset(struct unicode_nf_context *ctx) |
317 | 0 | { |
318 | 0 | enum unicode_nf_type type = |
319 | 0 | (ctx->compose ? (ctx->canonical ? UNICODE_NFC : UNICODE_NFKC) : |
320 | 0 | (ctx->canonical ? UNICODE_NFD : UNICODE_NFKD)); |
321 | 0 | struct unicode_transform *next = ctx->transform.next; |
322 | |
|
323 | 0 | unicode_nf_init(ctx, type); |
324 | 0 | unicode_transform_chain(&ctx->transform, next); |
325 | 0 | } |
326 | | |
327 | | static void |
328 | | unicode_nf_buffer_delete(struct unicode_nf_context *ctx, size_t offset, |
329 | | size_t count) |
330 | 0 | { |
331 | 0 | if (count == 0) |
332 | 0 | return; |
333 | | |
334 | 0 | i_assert(offset < ctx->buffer_len); |
335 | 0 | i_assert(count <= ctx->buffer_len); |
336 | 0 | i_assert(offset <= (ctx->buffer_len - count)); |
337 | | |
338 | 0 | if (count == ctx->buffer_len) { |
339 | 0 | ctx->buffer_len = 0; |
340 | 0 | return; |
341 | 0 | } |
342 | | |
343 | 0 | size_t trailer = ctx->buffer_len - (offset + count); |
344 | 0 | if (trailer > 0) { |
345 | 0 | memmove(&ctx->cp_buffer[offset], |
346 | 0 | &ctx->cp_buffer[offset + count], |
347 | 0 | trailer * sizeof(ctx->cp_buffer[0])); |
348 | 0 | memmove(&ctx->cpd_buffer[offset], |
349 | 0 | &ctx->cpd_buffer[offset + count], |
350 | 0 | trailer * sizeof(ctx->cpd_buffer[0])); |
351 | 0 | } |
352 | 0 | ctx->buffer_len -= count; |
353 | 0 | } |
354 | | |
355 | | static void |
356 | | unicode_nf_buffer_swap(struct unicode_nf_context *ctx, |
357 | | size_t idx1, size_t idx2) |
358 | 0 | { |
359 | 0 | uint32_t tmp_cp = ctx->cp_buffer[idx2]; |
360 | 0 | const struct unicode_code_point_data *tmp_cpd = ctx->cpd_buffer[idx2]; |
361 | |
|
362 | 0 | ctx->cp_buffer[idx2] = ctx->cp_buffer[idx1]; |
363 | 0 | ctx->cpd_buffer[idx2] = ctx->cpd_buffer[idx1]; |
364 | 0 | ctx->cp_buffer[idx1] = tmp_cp; |
365 | 0 | ctx->cpd_buffer[idx1] = tmp_cpd; |
366 | 0 | } |
367 | | |
368 | | static bool |
369 | | unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp, |
370 | | const struct unicode_code_point_data *cpd) |
371 | 0 | { |
372 | 0 | static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE; |
373 | 0 | uint8_t nf_qc_mask = ctx->nf_qc_mask; |
374 | 0 | size_t i; |
375 | |
|
376 | 0 | i_assert(ctx->buffer_len <= buffer_size); |
377 | 0 | if (ctx->buffer_len == buffer_size) { |
378 | | /* Buffer already full */ |
379 | 0 | return FALSE; |
380 | 0 | } |
381 | | |
382 | | /* |
383 | | * Decompose the code point |
384 | | */ |
385 | | |
386 | 0 | const uint32_t *decomp, *decomp_k; |
387 | 0 | uint32_t decomp_hangul[3]; |
388 | 0 | size_t len, len_k; |
389 | |
|
390 | 0 | if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) { |
391 | 0 | len = len_k = unicode_hangul_decompose(cp, decomp_hangul); |
392 | 0 | decomp = decomp_k = decomp_hangul; |
393 | 0 | } else { |
394 | 0 | if (cpd == NULL) |
395 | 0 | cpd = unicode_code_point_get_data(cp); |
396 | 0 | len = unicode_code_point_data_get_full_decomposition( |
397 | 0 | cpd, ctx->canonical, &decomp); |
398 | 0 | if (len == 0) { |
399 | 0 | decomp = &cp; |
400 | 0 | len = 1; |
401 | 0 | } |
402 | 0 | len_k = len; |
403 | 0 | decomp_k = decomp; |
404 | 0 | if (ctx->canonical) { |
405 | 0 | len_k = unicode_code_point_data_get_full_decomposition( |
406 | 0 | cpd, ctx->canonical, &decomp_k); |
407 | 0 | if (len_k == 0) { |
408 | 0 | decomp_k = decomp; |
409 | 0 | len_k = len; |
410 | 0 | } |
411 | 0 | } |
412 | 0 | if (len > 0) |
413 | 0 | cpd = NULL; |
414 | 0 | } |
415 | |
|
416 | 0 | i_assert(len <= UNICODE_DECOMPOSITION_MAX_LENGTH); |
417 | 0 | i_assert(len_k <= UNICODE_DECOMPOSITION_MAX_LENGTH); |
418 | | |
419 | 0 | if ((ctx->buffer_len + len) > buffer_size && |
420 | 0 | (ctx->nonstarter_count + len) <= |
421 | 0 | UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN) { |
422 | | /* Decomposition overflows the buffer. Record and mark it as |
423 | | pending and come back to it once the buffer is sufficiently |
424 | | drained. */ |
425 | 0 | i_assert(ctx->pending_decomp == 0 || ctx->pending_cp == cp); |
426 | 0 | ctx->pending_decomp = len; |
427 | 0 | ctx->pending_cp = cp; |
428 | 0 | ctx->pending_cpd = cpd; |
429 | 0 | return FALSE; |
430 | 0 | } |
431 | | |
432 | | /* UAX15-D4: Stream-Safe Text Process is the process of producing a |
433 | | Unicode string in Stream-Safe Text Format by processing that string |
434 | | from start to finish, inserting U+034F COMBINING GRAPHEME JOINER |
435 | | (CGJ) within long sequences of non-starters. The exact position o |
436 | | the inserted CGJs are determined according to the following |
437 | | algorithm, which describes the generation of an output string from an |
438 | | input string: |
439 | | |
440 | | 1. If the input string is empty, return an empty output string. |
441 | | 2. Set nonStarterCount to zero. |
442 | | 3. For each code point C in the input string: |
443 | | a. Produce the NFKD decomposition S. |
444 | | b. If nonStarterCount plus the number of initial non-starters in |
445 | | S is greater than 30, append a CGJ to the output string and |
446 | | set the nonStarterCount to zero. |
447 | | c. Append C to the output string. |
448 | | d. If there are no starters in S, increment nonStarterCount by |
449 | | the number of code points in S; otherwise, set |
450 | | nonStarterCount to the number of trailing non-starters in S |
451 | | (which may be zero). |
452 | | 4. Return the output string. |
453 | | */ |
454 | | |
455 | | /* Determine number of leading and trailing non-starters in full NFKD |
456 | | decomposition. */ |
457 | 0 | const struct unicode_code_point_data * |
458 | 0 | decomp_cpd[UNICODE_DECOMPOSITION_MAX_LENGTH]; |
459 | 0 | size_t ns_lead = 0, ns_trail = 0; |
460 | 0 | bool seen_starter = FALSE; |
461 | 0 | for (i = 0; i < len_k; i++) { |
462 | 0 | if (cpd == NULL) |
463 | 0 | cpd = unicode_code_point_get_data(decomp[i]); |
464 | |
|
465 | 0 | uint8_t ccc = cpd->canonical_combining_class; |
466 | |
|
467 | 0 | if (decomp == decomp_k) { |
468 | 0 | decomp_cpd[i] = cpd; |
469 | 0 | cpd = NULL; |
470 | 0 | } |
471 | |
|
472 | 0 | if (ccc == 0) |
473 | 0 | seen_starter = TRUE; |
474 | 0 | else if (!seen_starter) |
475 | 0 | ns_lead++; |
476 | 0 | else |
477 | 0 | ns_trail++; |
478 | 0 | } |
479 | | |
480 | | /* Lookup canonical decomposed code points if necessary (avoid double |
481 | | lookups). */ |
482 | 0 | if (decomp != decomp_k) { |
483 | 0 | for (i = 0; i < len; i++) { |
484 | 0 | if (cpd == NULL) |
485 | 0 | cpd = unicode_code_point_get_data(decomp[i]); |
486 | 0 | decomp_cpd[i] = cpd; |
487 | 0 | cpd = NULL; |
488 | 0 | } |
489 | 0 | } |
490 | |
|
491 | 0 | ctx->nonstarter_count += ns_lead; |
492 | 0 | if (ctx->nonstarter_count > UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN) { |
493 | 0 | ctx->nonstarter_count = 0; |
494 | | /* Write U+034F COMBINING GRAPHEME JOINER (CGJ) |
495 | | */ |
496 | 0 | ctx->cp_buffer[ctx->buffer_len] = 0x034F; |
497 | 0 | ctx->cpd_buffer[ctx->buffer_len] = |
498 | 0 | unicode_code_point_get_data(0x034F); |
499 | 0 | ctx->buffer_len++; |
500 | 0 | } else if (seen_starter) { |
501 | 0 | ctx->nonstarter_count = ns_trail; |
502 | 0 | } |
503 | | |
504 | | /* |
505 | | * Buffer the requested decomposition for COA sorting |
506 | | */ |
507 | |
|
508 | 0 | bool pending_decomp = FALSE; |
509 | |
|
510 | 0 | i_assert(ctx->buffer_len <= buffer_size); |
511 | 0 | if ((ctx->buffer_len + len) > buffer_size) { |
512 | | /* Decomposition now overflows the buffer. Record and mark it as |
513 | | pending and come back to it once the buffer is sufficiently |
514 | | drained. */ |
515 | 0 | i_assert(ctx->pending_decomp == 0 || ctx->pending_cp == cp); |
516 | 0 | ctx->pending_decomp = len; |
517 | 0 | ctx->pending_cp = cp; |
518 | 0 | ctx->pending_cpd = cpd; |
519 | 0 | pending_decomp = TRUE; |
520 | 0 | } else { |
521 | 0 | for (i = 0; i < len; i++) { |
522 | 0 | ctx->cp_buffer[ctx->buffer_len] = decomp[i]; |
523 | 0 | ctx->cpd_buffer[ctx->buffer_len] = decomp_cpd[i]; |
524 | 0 | ctx->buffer_len++; |
525 | 0 | } |
526 | 0 | i_assert(ctx->buffer_len <= buffer_size); |
527 | 0 | } |
528 | | |
529 | | /* |
530 | | * Apply the Canonical Ordering Algorithm (COA) |
531 | | */ |
532 | | |
533 | 0 | bool changed = TRUE; |
534 | 0 | size_t last_qc_y; |
535 | 0 | size_t last_starter; |
536 | |
|
537 | 0 | while (changed) { |
538 | 0 | changed = FALSE; |
539 | 0 | last_qc_y = 0; |
540 | 0 | last_starter = 0; |
541 | |
|
542 | 0 | for (i = I_MAX(1, ctx->buffer_output_max); |
543 | 0 | i < ctx->buffer_len; i++) { |
544 | 0 | const struct unicode_code_point_data |
545 | 0 | *cpd_i = ctx->cpd_buffer[i], |
546 | 0 | *cpd_im1 = ctx->cpd_buffer[i - 1]; |
547 | 0 | uint8_t ccc_i = cpd_i->canonical_combining_class; |
548 | 0 | uint8_t ccc_im1 = cpd_im1->canonical_combining_class; |
549 | 0 | bool nqc = ((cpd_i->nf_quick_check & nf_qc_mask) == 0); |
550 | |
|
551 | 0 | if (ccc_i == 0) { |
552 | 0 | last_starter = i; |
553 | 0 | if (nqc) |
554 | 0 | last_qc_y = i; |
555 | 0 | } else if (ccc_im1 > ccc_i) { |
556 | 0 | unicode_nf_buffer_swap(ctx, i - 1, i); |
557 | 0 | changed = TRUE; |
558 | 0 | } |
559 | 0 | } |
560 | 0 | } |
561 | 0 | ctx->buffer_output_max = I_MIN(last_qc_y, last_starter); |
562 | 0 | return !pending_decomp; |
563 | 0 | } |
564 | | |
565 | | static bool |
566 | | unicode_nf_input_cp(struct unicode_nf_context *ctx, uint32_t cp, |
567 | | const struct unicode_code_point_data *cpd) |
568 | 0 | { |
569 | 0 | static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE; |
570 | |
|
571 | 0 | i_assert(ctx->buffer_len <= buffer_size); |
572 | 0 | if (ctx->buffer_len == buffer_size || |
573 | 0 | (ctx->pending_decomp > 0 && |
574 | 0 | ctx->buffer_len > (buffer_size - ctx->pending_decomp))) { |
575 | | /* Buffer is (still too) full. */ |
576 | 0 | return FALSE; |
577 | 0 | } |
578 | | |
579 | 0 | if (ctx->pending_decomp > 0) { |
580 | | /* Earlier, the buffer was too full for the next decomposition |
581 | | and it was recorded and marked as pending. Now, we have the |
582 | | opportunity to continue. */ |
583 | 0 | if (!unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd)) |
584 | 0 | return FALSE; |
585 | 0 | ctx->pending_decomp = 0; |
586 | |
|
587 | 0 | i_assert(ctx->buffer_len <= buffer_size); |
588 | 0 | if (ctx->buffer_output_max > 0 && |
589 | 0 | ctx->buffer_len == buffer_size) { |
590 | | /* Pending decomposition filled the buffer completely. |
591 | | */ |
592 | 0 | return FALSE; |
593 | 0 | } |
594 | 0 | } |
595 | | |
596 | | /* Normal input of next code point */ |
597 | 0 | (void)unicode_nf_cp(ctx, cp, cpd); |
598 | 0 | return TRUE; |
599 | 0 | } |
600 | | |
601 | | static ssize_t |
602 | | unicode_nf_input(struct unicode_transform *trans, |
603 | | const struct unicode_transform_buffer *buf, |
604 | | const char **error_r ATTR_UNUSED) |
605 | 0 | { |
606 | 0 | struct unicode_nf_context *ctx = |
607 | 0 | container_of(trans, struct unicode_nf_context, transform); |
608 | 0 | size_t n; |
609 | |
|
610 | 0 | for (n = 0; n < buf->cp_count; n++) { |
611 | 0 | if (!unicode_nf_input_cp(ctx, buf->cp[n], |
612 | 0 | (buf->cp_data == NULL ? |
613 | 0 | NULL : buf->cp_data[n]))) |
614 | 0 | break; |
615 | 0 | } |
616 | 0 | return n; |
617 | 0 | } |
618 | | |
/* Compose the pair <l, r>. Arithmetic Hangul composition is tried first;
   otherwise the composition is looked up through the code point data of `l`.
   `*l_data` caches that data: when it is NULL it is looked up and stored.

   Returns the value produced by whichever step was used; a Hangul result of
   0x0000 means "no Hangul composition" and falls through to the lookup. */
static uint32_t
unicode_nf_compose_pair(uint32_t l, uint32_t r,
			const struct unicode_code_point_data **l_data)
{
	uint32_t hangul = unicode_hangul_compose_pair(l, r);

	if (hangul != 0x0000)
		return hangul;

	const struct unicode_code_point_data *left = *l_data;

	if (left == NULL) {
		left = unicode_code_point_get_data(l);
		*l_data = left;
	}
	return unicode_code_point_data_find_composition(left, r);
}
632 | | |
633 | | static int |
634 | | unicode_nf_flush_more(struct unicode_nf_context *ctx, bool finished, |
635 | | const char **error_r) |
636 | 0 | { |
637 | 0 | struct unicode_transform *trans = &ctx->transform; |
638 | |
|
639 | 0 | ctx->finished = finished; |
640 | |
|
641 | 0 | if (ctx->buffer_len == 0) |
642 | 0 | return 1; |
643 | 0 | if (!finished && ctx->buffer_output_max == 0) |
644 | 0 | return 0; |
645 | | |
646 | | /* |
647 | | * Apply the Canonical Composition Algorithm |
648 | | */ |
649 | | |
650 | 0 | if (ctx->finished) |
651 | 0 | ctx->buffer_output_max = ctx->buffer_len; |
652 | 0 | i_assert(ctx->buffer_processed <= ctx->buffer_output_max); |
653 | 0 | if (ctx->compose && ctx->buffer_len > 1) { |
654 | 0 | size_t in_pos, out_pos, starter; |
655 | 0 | int last_ccc; |
656 | |
|
657 | 0 | out_pos = 1; |
658 | 0 | last_ccc = -1; |
659 | 0 | starter = 0; |
660 | 0 | for (in_pos = I_MAX(1, ctx->buffer_processed); |
661 | 0 | in_pos < ctx->buffer_output_max; in_pos++) { |
662 | 0 | uint32_t cp = ctx->cp_buffer[in_pos]; |
663 | 0 | const struct unicode_code_point_data *cpd = |
664 | 0 | ctx->cpd_buffer[in_pos]; |
665 | |
|
666 | 0 | if (cpd == NULL) { |
667 | 0 | ctx->cpd_buffer[in_pos] = cpd = |
668 | 0 | unicode_code_point_get_data(cp); |
669 | 0 | } |
670 | |
|
671 | 0 | uint8_t ccc = cpd->canonical_combining_class; |
672 | 0 | uint32_t comp = 0x0000; |
673 | 0 | if (last_ccc < (int)ccc) { |
674 | 0 | comp = unicode_nf_compose_pair( |
675 | 0 | ctx->cp_buffer[starter], cp, |
676 | 0 | &ctx->cpd_buffer[starter]); |
677 | 0 | } |
678 | 0 | if (comp > 0x0000) { |
679 | 0 | ctx->cp_buffer[starter] = comp; |
680 | 0 | ctx->cpd_buffer[starter] = NULL; |
681 | 0 | } else if (ccc == 0) { |
682 | 0 | starter = out_pos; |
683 | 0 | last_ccc = -1; |
684 | 0 | ctx->cp_buffer[out_pos] = cp; |
685 | 0 | ctx->cpd_buffer[out_pos] = cpd; |
686 | 0 | out_pos++; |
687 | 0 | } else { |
688 | 0 | last_ccc = ccc; |
689 | 0 | ctx->cp_buffer[out_pos] = cp; |
690 | 0 | ctx->cpd_buffer[out_pos] = cpd; |
691 | 0 | out_pos++; |
692 | 0 | } |
693 | 0 | } |
694 | 0 | if (finished) { |
695 | 0 | ctx->buffer_len = ctx->buffer_output_max = out_pos; |
696 | 0 | } else if (in_pos > out_pos) { |
697 | 0 | unicode_nf_buffer_delete(ctx, out_pos, |
698 | 0 | (in_pos - out_pos)); |
699 | 0 | ctx->buffer_output_max = out_pos; |
700 | 0 | } |
701 | 0 | } |
702 | 0 | ctx->buffer_processed = ctx->buffer_output_max; |
703 | | |
704 | | /* |
705 | | * Forward output |
706 | | */ |
707 | |
|
708 | 0 | size_t output_len = ctx->buffer_processed; |
709 | 0 | ssize_t sret; |
710 | |
|
711 | 0 | sret = uniform_transform_forward(trans, ctx->cp_buffer, ctx->cpd_buffer, |
712 | 0 | output_len, error_r); |
713 | 0 | if (sret < 0) |
714 | 0 | return -1; |
715 | | |
716 | 0 | i_assert((size_t)sret <= ctx->buffer_processed); |
717 | 0 | unicode_nf_buffer_delete(ctx, 0, sret); |
718 | 0 | ctx->buffer_processed -= sret; |
719 | 0 | ctx->buffer_output_max -= sret; |
720 | 0 | if ((size_t)sret < output_len) |
721 | 0 | return 0; |
722 | 0 | return 1; |
723 | 0 | } |
724 | | |
725 | | static int |
726 | | unicode_nf_flush(struct unicode_transform *trans, bool finished, |
727 | | const char **error_r) |
728 | 0 | { |
729 | 0 | struct unicode_nf_context *ctx = |
730 | 0 | container_of(trans, struct unicode_nf_context, transform); |
731 | 0 | int ret; |
732 | |
|
733 | 0 | ret = unicode_nf_flush_more(ctx, finished, error_r); |
734 | 0 | if (ret <= 0) |
735 | 0 | return ret; |
736 | | |
737 | 0 | if (finished && ctx->pending_decomp > 0) { |
738 | 0 | if (unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd)) |
739 | 0 | ctx->pending_decomp = 0; |
740 | 0 | } |
741 | |
|
742 | 0 | return unicode_nf_flush_more(ctx, finished, error_r); |
743 | 0 | } |
744 | | |
745 | | /* |
746 | | * Normalization check |
747 | | */ |
748 | | |
749 | | static ssize_t |
750 | | unicode_nf_check_sink_input(struct unicode_transform *trans, |
751 | | const struct unicode_transform_buffer *buf, |
752 | | const char **error_r); |
753 | | |
754 | | static const struct unicode_transform_def unicode_nf_check_sink_def = { |
755 | | .input = unicode_nf_check_sink_input, |
756 | | }; |
757 | | |
758 | | void unicode_nf_checker_init(struct unicode_nf_checker *unc_r, |
759 | | enum unicode_nf_type type) |
760 | 0 | { |
761 | 0 | i_zero(unc_r); |
762 | |
|
763 | 0 | switch (type) { |
764 | 0 | case UNICODE_NFD: |
765 | 0 | unc_r->canonical = TRUE; |
766 | 0 | unc_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK; |
767 | 0 | unc_r->nf_qc_yes = UNICODE_NFD_QUICK_CHECK_YES; |
768 | 0 | unc_r->nf_qc_no = UNICODE_NFD_QUICK_CHECK_NO; |
769 | 0 | break; |
770 | 0 | case UNICODE_NFKD: |
771 | 0 | unc_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK; |
772 | 0 | unc_r->nf_qc_yes = UNICODE_NFKD_QUICK_CHECK_YES; |
773 | 0 | unc_r->nf_qc_no = UNICODE_NFKD_QUICK_CHECK_NO; |
774 | 0 | break; |
775 | 0 | case UNICODE_NFC: |
776 | 0 | unc_r->compose = TRUE; |
777 | 0 | unc_r->canonical = TRUE; |
778 | 0 | unc_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK; |
779 | 0 | unc_r->nf_qc_yes = UNICODE_NFC_QUICK_CHECK_YES; |
780 | 0 | unc_r->nf_qc_no = UNICODE_NFC_QUICK_CHECK_NO; |
781 | 0 | break; |
782 | 0 | case UNICODE_NFKC: |
783 | 0 | unc_r->compose = TRUE; |
784 | 0 | unc_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK; |
785 | 0 | unc_r->nf_qc_yes = UNICODE_NFKC_QUICK_CHECK_YES; |
786 | 0 | unc_r->nf_qc_no = UNICODE_NFKC_QUICK_CHECK_NO; |
787 | 0 | break; |
788 | 0 | } |
789 | | |
790 | 0 | unicode_nf_init(&unc_r->nf, type); |
791 | 0 | unicode_transform_init(&unc_r->sink, &unicode_nf_check_sink_def); |
792 | 0 | unicode_transform_chain(&unc_r->nf.transform, &unc_r->sink); |
793 | 0 | } |
794 | | |
795 | | void unicode_nf_checker_reset(struct unicode_nf_checker *unc) |
796 | 0 | { |
797 | 0 | enum unicode_nf_type type = |
798 | 0 | (unc->compose ? (unc->canonical ? UNICODE_NFC : UNICODE_NFKC) : |
799 | 0 | (unc->canonical ? UNICODE_NFD : UNICODE_NFKD)); |
800 | |
|
801 | 0 | unicode_nf_checker_init(unc, type); |
802 | 0 | } |
803 | | |
804 | | static ssize_t |
805 | | unicode_nf_check_sink_input(struct unicode_transform *trans, |
806 | | const struct unicode_transform_buffer *buf, |
807 | | const char **error_r) |
808 | 0 | { |
809 | 0 | struct unicode_nf_checker *unc = |
810 | 0 | container_of(trans, struct unicode_nf_checker, sink); |
811 | 0 | size_t n; |
812 | |
|
813 | 0 | i_assert(unc->buffer_len > 0); |
814 | 0 | i_assert(buf->cp_count <= unc->buffer_len); |
815 | 0 | for (n = 0; n < buf->cp_count; n++) { |
816 | 0 | if (buf->cp[n] != unc->cp_buffer[n]) { |
817 | 0 | *error_r = "Not normalized"; |
818 | 0 | return -1; |
819 | 0 | } |
820 | 0 | } |
821 | 0 | if (buf->cp_count == unc->buffer_len) |
822 | 0 | unc->buffer_len = 0; |
823 | 0 | else { |
824 | 0 | unc->buffer_len -= buf->cp_count; |
825 | 0 | memmove(&unc->cp_buffer[0], &unc->cp_buffer[buf->cp_count], |
826 | 0 | unc->buffer_len); |
827 | 0 | } |
828 | 0 | return buf->cp_count; |
829 | 0 | } |
830 | | |
831 | | int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp, |
832 | | const struct unicode_code_point_data **_cp_data) |
833 | 0 | { |
834 | 0 | const struct unicode_code_point_data *cpd_last = unc->cpd_last; |
835 | |
|
836 | 0 | if (*_cp_data == NULL) |
837 | 0 | *_cp_data = unicode_code_point_get_data(cp); |
838 | |
|
839 | 0 | const struct unicode_code_point_data *cp_data = *_cp_data; |
840 | 0 | const char *error; |
841 | 0 | int ret; |
842 | |
|
843 | 0 | unc->cpd_last = cp_data; |
844 | |
|
845 | 0 | if (cp_data->general_category == UNICODE_GENERAL_CATEGORY_INVALID) |
846 | 0 | return -1; |
847 | 0 | if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_no) |
848 | 0 | return 0; |
849 | 0 | if (cpd_last != NULL && cp_data->canonical_combining_class != 0 && |
850 | 0 | cpd_last->canonical_combining_class > |
851 | 0 | cp_data->canonical_combining_class) |
852 | 0 | return 0; |
853 | 0 | if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_yes && |
854 | 0 | cp_data->canonical_combining_class == 0) { |
855 | 0 | if (unc->buffer_len > 0) { |
856 | 0 | ret = unicode_transform_flush(&unc->nf.transform, |
857 | 0 | &error); |
858 | 0 | i_assert(ret != 0); |
859 | 0 | if (ret < 0) |
860 | 0 | return 0; |
861 | 0 | unicode_nf_reset(&unc->nf); |
862 | 0 | } |
863 | 0 | i_assert(unc->buffer_len == 0); |
864 | 0 | unc->cp_buffer[0] = cp; |
865 | 0 | return 1; |
866 | 0 | } |
867 | | |
868 | 0 | struct unicode_transform_buffer buf; |
869 | 0 | ssize_t sret; |
870 | |
|
871 | 0 | if (unc->buffer_len == 0 && cpd_last != NULL) { |
872 | 0 | i_zero(&buf); |
873 | 0 | buf.cp = &unc->cp_buffer[0]; |
874 | 0 | buf.cp_data = &cpd_last; |
875 | 0 | buf.cp_count = 1; |
876 | |
|
877 | 0 | unc->buffer_len++; |
878 | 0 | sret = unicode_transform_input_buf(&unc->nf.transform, &buf, |
879 | 0 | &error); |
880 | 0 | i_assert(sret != 0); |
881 | 0 | if (sret < 0) |
882 | 0 | return 0; |
883 | 0 | } |
884 | | |
885 | 0 | i_assert(unc->buffer_len < UNICODE_NF_BUFFER_SIZE); |
886 | 0 | unc->cp_buffer[unc->buffer_len] = cp; |
887 | 0 | unc->buffer_len++; |
888 | |
|
889 | 0 | i_zero(&buf); |
890 | 0 | buf.cp = &cp; |
891 | 0 | buf.cp_data = &cp_data; |
892 | 0 | buf.cp_count = 1; |
893 | 0 | sret = unicode_transform_input_buf(&unc->nf.transform, &buf, &error); |
894 | 0 | i_assert(sret != 0); |
895 | 0 | if (sret < 0) |
896 | 0 | return 0; |
897 | 0 | return 1; |
898 | 0 | } |
899 | | |
900 | | int unicode_nf_checker_finish(struct unicode_nf_checker *unc) |
901 | 0 | { |
902 | 0 | if (unc->buffer_len == 0) |
903 | 0 | return 1; |
904 | | |
905 | 0 | const char *error; |
906 | 0 | int ret; |
907 | |
|
908 | 0 | ret = unicode_transform_flush(&unc->nf.transform, &error); |
909 | 0 | i_assert(ret != 0); |
910 | 0 | return (ret > 0 ? 1 : 0); |
911 | 0 | } |
912 | | |
913 | | /* |
914 | | * Casemap Transform |
915 | | */ |
916 | | |
917 | | static size_t |
918 | | unicode_casemap_uppercase_cp(const struct unicode_code_point_data *cp_data, |
919 | | const uint32_t **map_r); |
920 | | static size_t |
921 | | unicode_casemap_lowercase_cp(const struct unicode_code_point_data *cp_data, |
922 | | const uint32_t **map_r); |
923 | | static size_t |
924 | | unicode_casemap_casefold_cp(const struct unicode_code_point_data *cp_data, |
925 | | const uint32_t **map_r); |
926 | | |
927 | | static ssize_t |
928 | | unicode_casemap_input(struct unicode_transform *trans, |
929 | | const struct unicode_transform_buffer *buf, |
930 | | const char **error_r); |
931 | | static int |
932 | | unicode_casemap_flush(struct unicode_transform *trans, bool finished, |
933 | | const char **error_r); |
934 | | |
935 | | static const struct unicode_transform_def unicode_casemap_def = { |
936 | | .input = unicode_casemap_input, |
937 | | .flush = unicode_casemap_flush, |
938 | | }; |
939 | | |
940 | | void unicode_casemap_init_uppercase(struct unicode_casemap *map_r) |
941 | 0 | { |
942 | 0 | i_zero(map_r); |
943 | 0 | unicode_transform_init(&map_r->transform, &unicode_casemap_def); |
944 | 0 | map_r->map = unicode_casemap_uppercase_cp; |
945 | 0 | } |
946 | | |
947 | | void unicode_casemap_init_lowercase(struct unicode_casemap *map_r) |
948 | 0 | { |
949 | 0 | i_zero(map_r); |
950 | 0 | unicode_transform_init(&map_r->transform, &unicode_casemap_def); |
951 | 0 | map_r->map = unicode_casemap_lowercase_cp; |
952 | 0 | } |
953 | | |
954 | | void unicode_casemap_init_casefold(struct unicode_casemap *map_r) |
955 | 0 | { |
956 | 0 | i_zero(map_r); |
957 | 0 | unicode_transform_init(&map_r->transform, &unicode_casemap_def); |
958 | 0 | map_r->map = unicode_casemap_casefold_cp; |
959 | 0 | } |
960 | | |
/* Map callback for the uppercase variant: looks up the uppercase mapping of
   the code point described by cp_data. The mapping is returned through map_r
   and its length is the return value. */
static size_t
unicode_casemap_uppercase_cp(const struct unicode_code_point_data *cp_data,
			     const uint32_t **map_r)
{
	size_t map_len;

	map_len = unicode_code_point_data_get_uppercase_mapping(cp_data, map_r);
	return map_len;
}
967 | | |
/* Map callback for the lowercase variant: looks up the lowercase mapping of
   the code point described by cp_data. The mapping is returned through map_r
   and its length is the return value. */
static size_t
unicode_casemap_lowercase_cp(const struct unicode_code_point_data *cp_data,
			     const uint32_t **map_r)
{
	size_t map_len;

	map_len = unicode_code_point_data_get_lowercase_mapping(cp_data, map_r);
	return map_len;
}
974 | | |
/* Map callback for the case-folding variant: looks up the casefold mapping of
   the code point described by cp_data. The mapping is returned through map_r
   and its length is the return value. */
static size_t
unicode_casemap_casefold_cp(const struct unicode_code_point_data *cp_data,
			    const uint32_t **map_r)
{
	size_t map_len;

	map_len = unicode_code_point_data_get_casefold_mapping(cp_data, map_r);
	return map_len;
}
981 | | |
982 | | static ssize_t |
983 | | unicode_casemap_input_cp(struct unicode_casemap *map, uint32_t cp, |
984 | | const struct unicode_code_point_data *cp_data, |
985 | | const char **error_r) |
986 | 0 | { |
987 | 0 | bool was_buffered = map->cp_buffered; |
988 | 0 | ssize_t sret; |
989 | |
|
990 | 0 | if (cp_data == NULL) |
991 | 0 | cp_data = unicode_code_point_get_data(cp); |
992 | |
|
993 | 0 | const uint32_t *map_cps; |
994 | 0 | const struct unicode_code_point_data *const *map_cps_data = NULL; |
995 | 0 | size_t map_cps_len; |
996 | |
|
997 | 0 | map_cps_len = map->map(cp_data, &map_cps); |
998 | 0 | if (map_cps_len == 0) { |
999 | 0 | map_cps = &cp; |
1000 | 0 | map_cps_data = &cp_data; |
1001 | 0 | map_cps_len = 1; |
1002 | 0 | } |
1003 | 0 | i_assert(map_cps_len > map->cp_map_pos); |
1004 | | |
1005 | 0 | map_cps += map->cp_map_pos; |
1006 | 0 | map_cps_len -= map->cp_map_pos; |
1007 | 0 | sret = uniform_transform_forward(&map->transform, |
1008 | 0 | map_cps, map_cps_data, map_cps_len, |
1009 | 0 | error_r); |
1010 | 0 | if (sret < 0) { |
1011 | 0 | i_assert(*error_r != NULL); |
1012 | 0 | return -1; |
1013 | 0 | } |
1014 | 0 | if ((size_t)sret < map_cps_len) { |
1015 | 0 | map->cp_buffered = TRUE; |
1016 | 0 | map->cp = cp; |
1017 | 0 | map->cp_data = cp_data; |
1018 | 0 | map->cp_map_pos += sret; |
1019 | 0 | return (was_buffered ? 0 : 1); |
1020 | 0 | } |
1021 | | |
1022 | 0 | map->cp_buffered = FALSE; |
1023 | 0 | map->cp_data = NULL; |
1024 | 0 | map->cp_map_pos = 0; |
1025 | 0 | return 1; |
1026 | 0 | } |
1027 | | |
1028 | | static ssize_t |
1029 | | unicode_casemap_input(struct unicode_transform *trans, |
1030 | | const struct unicode_transform_buffer *buf, |
1031 | | const char **error_r) |
1032 | 0 | { |
1033 | 0 | struct unicode_casemap *map = |
1034 | 0 | container_of(trans, struct unicode_casemap, transform); |
1035 | 0 | int ret; |
1036 | |
|
1037 | 0 | ret = unicode_casemap_flush(trans, TRUE, error_r); |
1038 | 0 | if (ret < 0) { |
1039 | 0 | i_assert(*error_r != NULL); |
1040 | 0 | return -1; |
1041 | 0 | } |
1042 | 0 | if (map->cp_buffered) |
1043 | 0 | return 0; |
1044 | | |
1045 | 0 | size_t n; |
1046 | 0 | for (n = 0; n < buf->cp_count; n++) { |
1047 | 0 | if (map->cp_buffered) |
1048 | 0 | break; |
1049 | 0 | ret = unicode_casemap_input_cp(map, buf->cp[n], |
1050 | 0 | (buf->cp_data != NULL ? |
1051 | 0 | buf->cp_data[n] : NULL), |
1052 | 0 | error_r); |
1053 | 0 | if (ret < 0) { |
1054 | 0 | i_assert(*error_r != NULL); |
1055 | 0 | return -1; |
1056 | 0 | } |
1057 | 0 | if (ret == 0) |
1058 | 0 | break; |
1059 | 0 | } |
1060 | 0 | return n; |
1061 | 0 | } |
1062 | | |
1063 | | static int |
1064 | | unicode_casemap_flush(struct unicode_transform *trans, |
1065 | | bool finished ATTR_UNUSED, const char **error_r) |
1066 | 0 | { |
1067 | 0 | struct unicode_casemap *map = |
1068 | 0 | container_of(trans, struct unicode_casemap, transform); |
1069 | 0 | int ret; |
1070 | |
|
1071 | 0 | if (!map->cp_buffered) |
1072 | 0 | return 1; |
1073 | | |
1074 | 0 | ret = unicode_casemap_input_cp(map, map->cp, map->cp_data, error_r); |
1075 | 0 | i_assert(ret >= 0 || *error_r != NULL); |
1076 | 0 | return ret; |
1077 | 0 | } |
1078 | | |
1079 | | /* |
1080 | | * RFC 5051 - Simple Unicode Collation Algorithm |
1081 | | */ |
1082 | | |
/* Reset the RFC 5051 normalization context to an all-zero state. */
void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx)
{
	i_zero(ctx);
}
1087 | | |
1088 | | size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx, |
1089 | | uint32_t cp, const uint32_t **norm_r) |
1090 | 0 | { |
1091 | 0 | const struct unicode_code_point_data *cpd; |
1092 | 0 | size_t len; |
1093 | |
|
1094 | 0 | cpd = unicode_code_point_get_data(cp); |
1095 | 0 | if (cpd->simple_titlecase_mapping != 0x0000) |
1096 | 0 | cp = cpd->simple_titlecase_mapping; |
1097 | |
|
1098 | 0 | if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) { |
1099 | 0 | *norm_r = ctx->buffer; |
1100 | 0 | return unicode_hangul_decompose(cp, ctx->buffer); |
1101 | 0 | } |
1102 | | |
1103 | 0 | len = unicode_code_point_get_full_decomposition(cp, FALSE, norm_r); |
1104 | 0 | if (len == 0) { |
1105 | 0 | ctx->buffer[0] = cp; |
1106 | 0 | *norm_r = ctx->buffer; |
1107 | 0 | return 1; |
1108 | 0 | } |
1109 | 0 | return len; |
1110 | 0 | } |