Coverage Report

Created: 2026-05-16 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/dovecot/src/lib/unicode-transform.c
Line
Count
Source
1
/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
2
3
#include "lib.h"
4
#include "unichar.h"
5
#include "unicode-data.h"
6
#include "unicode-transform.h"
7
8
0
/* First and last code point (inclusive) that this file treats as a
   precomposed Hangul syllable. */
#define HANGUL_FIRST 0xAC00
#define HANGUL_LAST 0xD7A3
10
11
/*
12
 * Transform
13
 */
14
15
ssize_t uniform_transform_forward(
16
  struct unicode_transform *trans, const uint32_t *out,
17
  const struct unicode_code_point_data *const *out_data, size_t out_len,
18
  const char **error_r)
19
0
{
20
0
  struct unicode_transform_buffer buf_next;
21
0
  ssize_t sret;
22
23
0
  i_zero(&buf_next);
24
0
  buf_next.cp = out;
25
0
  buf_next.cp_data = out_data;
26
0
  buf_next.cp_count = out_len;
27
28
0
  i_assert(trans->next != NULL);
29
0
  i_assert(trans->next->def != NULL);
30
0
  i_assert(trans->next->def->input != NULL);
31
0
  sret = trans->next->def->input(trans->next, &buf_next, error_r);
32
33
0
  i_assert(sret >= 0 || *error_r != NULL);
34
0
  i_assert(sret <= (ssize_t)out_len);
35
0
  return sret;
36
0
}
37
38
ssize_t unicode_transform_input_buf(struct unicode_transform *trans,
39
            const struct unicode_transform_buffer *buf,
40
            const char **error_r)
41
0
{
42
0
  struct unicode_transform_buffer in_buf;
43
0
  size_t input_total = 0;
44
0
  ssize_t sret;
45
0
  bool flushed = FALSE;
46
0
  int ret;
47
48
0
  *error_r = NULL;
49
50
0
  in_buf = *buf;
51
52
0
  while (in_buf.cp_count > 0) {
53
0
    if (in_buf.cp_count > 0) {
54
0
      i_assert(trans->def->input != NULL);
55
0
      sret = trans->def->input(trans, &in_buf, error_r);
56
0
      if (sret < 0) {
57
0
        i_assert(*error_r != NULL);
58
0
        return -1;
59
0
      }
60
0
      if (sret > 0) {
61
0
        i_assert((size_t)sret <= in_buf.cp_count);
62
0
        in_buf.cp += sret;
63
0
        in_buf.cp_count -= sret;
64
0
        input_total += sret;
65
0
        flushed = FALSE;
66
0
        continue;
67
0
      }
68
0
      if (sret == 0 && flushed)
69
0
        break;
70
0
    }
71
72
0
    struct unicode_transform *tp = trans;
73
74
0
    while (tp->next != NULL) {
75
0
      if (tp->def->flush != NULL) {
76
0
        ret = tp->def->flush(tp, FALSE, error_r);
77
0
        if (ret < 0) {
78
0
          i_assert(*error_r != NULL);
79
0
          return -1;
80
0
        }
81
0
      }
82
0
      tp = tp->next;
83
0
    }
84
85
0
    flushed = TRUE;
86
0
  }
87
88
0
  return input_total;
89
0
}
90
91
int unicode_transform_flush(struct unicode_transform *trans,
92
          const char **error_r)
93
0
{
94
0
  int ret;
95
96
0
  *error_r = NULL;
97
98
0
  while (trans != NULL) {
99
0
    struct unicode_transform *tp = trans;
100
0
    bool progress = FALSE;
101
102
0
    while (tp != NULL) {
103
0
      if (tp->def->flush == NULL) {
104
0
        progress = TRUE;
105
0
        if (tp == trans)
106
0
          trans = trans->next;
107
0
      } else {
108
0
        ret = tp->def->flush(tp, (tp == trans), error_r);
109
0
        if (ret < 0) {
110
0
          i_assert(*error_r != NULL);
111
0
          return -1;
112
0
        }
113
0
        if (ret > 0) {
114
0
          progress = TRUE;
115
0
          if (tp == trans)
116
0
            trans = trans->next;
117
0
        }
118
0
      }
119
0
      tp = tp->next;
120
0
    }
121
0
    if (!progress)
122
0
      return 0;
123
0
  }
124
0
  return 1;
125
0
}
126
127
/* Buffer Sink */
128
129
static ssize_t
130
unicode_buffer_sink_input(struct unicode_transform *trans,
131
        const struct unicode_transform_buffer *buf,
132
        const char **error_r);
133
134
static const struct unicode_transform_def unicode_buffer_sink_def = {
135
  .input = unicode_buffer_sink_input,
136
};
137
138
void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
139
            buffer_t *buffer)
140
0
{
141
0
  i_zero(sink);
142
0
  unicode_transform_init(&sink->transform, &unicode_buffer_sink_def);
143
0
  sink->buffer = buffer;
144
0
}
145
146
static ssize_t
147
unicode_buffer_sink_input(struct unicode_transform *trans,
148
        const struct unicode_transform_buffer *buf,
149
        const char **error_r ATTR_UNUSED)
150
0
{
151
0
  struct unicode_buffer_sink *sink =
152
0
    container_of(trans, struct unicode_buffer_sink, transform);
153
154
0
  uni_ucs4_to_utf8(buf->cp, buf->cp_count, sink->buffer);
155
0
  return buf->cp_count;
156
0
}
157
158
/* Static Array Sink */
159
160
static ssize_t
161
unicode_static_array_sink_input(struct unicode_transform *trans,
162
        const struct unicode_transform_buffer *buf,
163
        const char **error_r);
164
165
static const struct unicode_transform_def unicode_static_array_sink_def = {
166
  .input = unicode_static_array_sink_input,
167
};
168
169
void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
170
            uint32_t *array, size_t array_size,
171
            size_t *array_pos)
172
0
{
173
0
  i_zero(sink);
174
0
  unicode_transform_init(&sink->transform,
175
0
             &unicode_static_array_sink_def);
176
0
  sink->array = array;
177
0
  sink->array_size = array_size;
178
0
  sink->array_pos = array_pos;
179
0
}
180
181
static ssize_t
182
unicode_static_array_sink_input(struct unicode_transform *trans,
183
        const struct unicode_transform_buffer *buf,
184
        const char **error_r)
185
0
{
186
0
  struct unicode_static_array_sink *sink =
187
0
    container_of(trans, struct unicode_static_array_sink,
188
0
           transform);
189
190
0
  if (*sink->array_pos + buf->cp_count > sink->array_size) {
191
0
    *error_r = "Output overflow";
192
0
    return -1;
193
0
  }
194
0
  memcpy(sink->array + *sink->array_pos, buf->cp,
195
0
         buf->cp_count * sizeof(*buf->cp));
196
0
  *sink->array_pos += buf->cp_count;
197
0
  return buf->cp_count;
198
0
}
199
200
/*
201
 * Hangul syllable (de)composition
202
 */
203
204
0
/* Constants used by the Hangul syllable (de)composition arithmetic below:
   base code points of the syllable (S), leading (L), vowel (V) and
   trailing (T) ranges, the number of values in each, and the exclusive
   end of each range. T_BASE is one below the first real trailing
   consonant, so trailing index 0 means "no trailing consonant". */
#define UNI_HANGUL_S_BASE 0xAC00
#define UNI_HANGUL_L_BASE 0x1100
#define UNI_HANGUL_V_BASE 0x1161
#define UNI_HANGUL_T_BASE 0x11A7

#define UNI_HANGUL_L_COUNT 19
#define UNI_HANGUL_V_COUNT 21
#define UNI_HANGUL_T_COUNT 28
/* Number of syllables per leading consonant */
#define UNI_HANGUL_N_COUNT (UNI_HANGUL_V_COUNT * UNI_HANGUL_T_COUNT)

#define UNI_HANGUL_L_END (UNI_HANGUL_L_BASE + UNI_HANGUL_L_COUNT)
#define UNI_HANGUL_V_END (UNI_HANGUL_V_BASE + UNI_HANGUL_V_COUNT)
#define UNI_HANGUL_T_END (UNI_HANGUL_T_BASE + UNI_HANGUL_T_COUNT)
/* 0xAC00 + 19 * 588 == 0xD7A4 */
#define UNI_HANGUL_S_END \
  (UNI_HANGUL_S_BASE + UNI_HANGUL_L_COUNT * UNI_HANGUL_N_COUNT)
216
217
static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
218
0
{
219
  /* The Unicode Standard, Section 3.12.2:
220
     Hangul Syllable Decomposition
221
   */
222
223
0
  size_t s_index = cp - UNI_HANGUL_S_BASE;
224
0
  size_t l_index = s_index / UNI_HANGUL_N_COUNT;
225
0
  size_t v_index = ((s_index % UNI_HANGUL_N_COUNT) / UNI_HANGUL_T_COUNT);
226
0
  size_t t_index = s_index % UNI_HANGUL_T_COUNT;
227
0
  uint32_t l_part = UNI_HANGUL_L_BASE + l_index;
228
0
  uint32_t v_part = UNI_HANGUL_V_BASE + v_index;
229
230
0
  if (t_index == 0) {
231
0
    buf[0] = l_part;
232
0
    buf[1] = v_part;
233
0
    return 2;
234
0
  }
235
236
0
  uint32_t t_part = UNI_HANGUL_T_BASE + t_index;
237
238
0
  buf[0] = l_part;
239
0
  buf[1] = v_part;
240
0
  buf[2] = t_part;
241
0
  return 3;
242
0
}
243
244
static uint32_t unicode_hangul_compose_pair(uint32_t l, uint32_t r)
245
0
{
246
  /* The Unicode Standard, Section 3.12.3:
247
     Hangul Syllable Composition
248
   */
249
250
  /* <LPart, VPart> */
251
0
  if (l >= UNI_HANGUL_L_BASE && l < UNI_HANGUL_L_END &&
252
0
      r >= UNI_HANGUL_V_BASE && r < UNI_HANGUL_V_END) {
253
0
    uint32_t l_part = l, v_part = r;
254
255
0
    size_t l_index = l_part - UNI_HANGUL_L_BASE;
256
0
    size_t v_index = v_part - UNI_HANGUL_V_BASE;
257
0
    size_t lv_index = l_index * UNI_HANGUL_N_COUNT +
258
0
          v_index * UNI_HANGUL_T_COUNT;
259
0
    return UNI_HANGUL_S_BASE + lv_index;
260
0
  }
261
  /* A sequence <LVPart, TPart> */
262
0
  if (l >= UNI_HANGUL_S_BASE && l < UNI_HANGUL_S_END &&
263
0
      r >= (UNI_HANGUL_T_BASE + 1u) && r < UNI_HANGUL_T_END &&
264
0
      ((l - UNI_HANGUL_S_BASE) % UNI_HANGUL_T_COUNT) == 0) {
265
0
    uint32_t lv_part = l, t_part = r;
266
267
0
    size_t t_index = t_part - UNI_HANGUL_T_BASE;
268
0
    return lv_part + t_index;
269
0
  }
270
0
  return 0x0000;
271
0
}
272
273
/*
274
 * Normalization transform: NFD, NFKD, NFC, NFKC
275
 */
276
277
static ssize_t
278
unicode_nf_input(struct unicode_transform *trans,
279
     const struct unicode_transform_buffer *buf,
280
     const char **error_r);
281
static int
282
unicode_nf_flush(struct unicode_transform *trans, bool finished,
283
     const char **error_r);
284
285
static const struct unicode_transform_def unicode_nf_def = {
286
  .input = unicode_nf_input,
287
  .flush = unicode_nf_flush,
288
};
289
290
void unicode_nf_init(struct unicode_nf_context *ctx_r,
291
         enum unicode_nf_type type)
292
0
{
293
0
  i_zero(ctx_r);
294
0
  unicode_transform_init(&ctx_r->transform, &unicode_nf_def);
295
296
0
  switch (type) {
297
0
  case UNICODE_NFD:
298
0
    ctx_r->canonical = TRUE;
299
0
    ctx_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK;
300
0
    break;
301
0
  case UNICODE_NFKD:
302
0
    ctx_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK;
303
0
    break;
304
0
  case UNICODE_NFC:
305
0
    ctx_r->compose = TRUE;
306
0
    ctx_r->canonical = TRUE;
307
0
    ctx_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK;
308
0
    break;
309
0
  case UNICODE_NFKC:
310
0
    ctx_r->compose = TRUE;
311
0
    ctx_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK;
312
0
    break;
313
0
  }
314
0
}
315
316
void unicode_nf_reset(struct unicode_nf_context *ctx)
317
0
{
318
0
  enum unicode_nf_type type =
319
0
    (ctx->compose ? (ctx->canonical ? UNICODE_NFC : UNICODE_NFKC) :
320
0
        (ctx->canonical ? UNICODE_NFD : UNICODE_NFKD));
321
0
  struct unicode_transform *next = ctx->transform.next;
322
323
0
  unicode_nf_init(ctx, type);
324
0
  unicode_transform_chain(&ctx->transform, next);
325
0
}
326
327
static void
328
unicode_nf_buffer_delete(struct unicode_nf_context *ctx, size_t offset,
329
       size_t count)
330
0
{
331
0
  if (count == 0)
332
0
    return;
333
334
0
  i_assert(offset < ctx->buffer_len);
335
0
  i_assert(count <= ctx->buffer_len);
336
0
  i_assert(offset <= (ctx->buffer_len - count));
337
338
0
  if (count == ctx->buffer_len) {
339
0
    ctx->buffer_len = 0;
340
0
    return;
341
0
  }
342
343
0
  size_t trailer = ctx->buffer_len - (offset + count);
344
0
  if (trailer > 0) {
345
0
    memmove(&ctx->cp_buffer[offset],
346
0
      &ctx->cp_buffer[offset + count],
347
0
      trailer * sizeof(ctx->cp_buffer[0]));
348
0
    memmove(&ctx->cpd_buffer[offset],
349
0
      &ctx->cpd_buffer[offset + count],
350
0
      trailer * sizeof(ctx->cpd_buffer[0]));
351
0
  }
352
0
  ctx->buffer_len -= count;
353
0
}
354
355
static void
356
unicode_nf_buffer_swap(struct unicode_nf_context *ctx,
357
           size_t idx1, size_t idx2)
358
0
{
359
0
  uint32_t tmp_cp = ctx->cp_buffer[idx2];
360
0
  const struct unicode_code_point_data *tmp_cpd = ctx->cpd_buffer[idx2];
361
362
0
  ctx->cp_buffer[idx2] = ctx->cp_buffer[idx1];
363
0
  ctx->cpd_buffer[idx2] = ctx->cpd_buffer[idx1];
364
0
  ctx->cp_buffer[idx1] = tmp_cp;
365
0
  ctx->cpd_buffer[idx1] = tmp_cpd;
366
0
}
367
368
static bool
369
unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp,
370
        const struct unicode_code_point_data *cpd)
371
0
{
372
0
  static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE;
373
0
  uint8_t nf_qc_mask = ctx->nf_qc_mask;
374
0
  size_t i;
375
376
0
  i_assert(ctx->buffer_len <= buffer_size);
377
0
  if (ctx->buffer_len == buffer_size) {
378
    /* Buffer already full */
379
0
    return FALSE;
380
0
  }
381
382
  /*
383
   * Decompose the code point
384
   */
385
386
0
  const uint32_t *decomp, *decomp_k;
387
0
  uint32_t decomp_hangul[3];
388
0
  size_t len, len_k;
389
390
0
  if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) {
391
0
    len = len_k = unicode_hangul_decompose(cp, decomp_hangul);
392
0
    decomp = decomp_k = decomp_hangul;
393
0
  } else {
394
0
    if (cpd == NULL)
395
0
      cpd = unicode_code_point_get_data(cp);
396
0
    len = unicode_code_point_data_get_full_decomposition(
397
0
      cpd, ctx->canonical, &decomp);
398
0
    if (len == 0) {
399
0
      decomp = &cp;
400
0
      len = 1;
401
0
    }
402
0
    len_k = len;
403
0
    decomp_k = decomp;
404
0
    if (ctx->canonical) {
405
0
      len_k = unicode_code_point_data_get_full_decomposition(
406
0
        cpd, ctx->canonical, &decomp_k);
407
0
      if (len_k == 0) {
408
0
        decomp_k = decomp;
409
0
        len_k = len;
410
0
      }
411
0
    }
412
0
    if (len > 0)
413
0
      cpd = NULL;
414
0
  }
415
416
0
  i_assert(len <= UNICODE_DECOMPOSITION_MAX_LENGTH);
417
0
  i_assert(len_k <= UNICODE_DECOMPOSITION_MAX_LENGTH);
418
419
0
  if ((ctx->buffer_len + len) > buffer_size &&
420
0
      (ctx->nonstarter_count + len) <=
421
0
    UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN) {
422
    /* Decomposition overflows the buffer. Record and mark it as
423
       pending and come back to it once the buffer is sufficiently
424
       drained. */
425
0
    i_assert(ctx->pending_decomp == 0 || ctx->pending_cp == cp);
426
0
    ctx->pending_decomp = len;
427
0
    ctx->pending_cp = cp;
428
0
    ctx->pending_cpd = cpd;
429
0
    return FALSE;
430
0
  }
431
432
  /* UAX15-D4: Stream-Safe Text Process is the process of producing a
433
     Unicode string in Stream-Safe Text Format by processing that string
434
     from start to finish, inserting U+034F COMBINING GRAPHEME JOINER
435
     (CGJ) within long sequences of non-starters. The exact position o
436
     the inserted CGJs are determined according to the following
437
     algorithm, which describes the generation of an output string from an
438
     input string:
439
440
     1. If the input string is empty, return an empty output string.
441
     2. Set nonStarterCount to zero.
442
     3. For each code point C in the input string:
443
    a. Produce the NFKD decomposition S.
444
    b. If nonStarterCount plus the number of initial non-starters in
445
       S is greater than 30, append a CGJ to the output string and
446
       set the nonStarterCount to zero.
447
    c. Append C to the output string.
448
    d. If there are no starters in S, increment nonStarterCount by
449
       the number of code points in S; otherwise, set
450
       nonStarterCount to the number of trailing non-starters in S
451
       (which may be zero).
452
     4. Return the output string.
453
   */
454
455
  /* Determine number of leading and trailing non-starters in full NFKD
456
     decomposition. */
457
0
  const struct unicode_code_point_data *
458
0
    decomp_cpd[UNICODE_DECOMPOSITION_MAX_LENGTH];
459
0
  size_t ns_lead = 0, ns_trail = 0;
460
0
  bool seen_starter = FALSE;
461
0
  for (i = 0; i < len_k; i++) {
462
0
    if (cpd == NULL)
463
0
      cpd = unicode_code_point_get_data(decomp[i]);
464
465
0
    uint8_t ccc = cpd->canonical_combining_class;
466
467
0
    if (decomp == decomp_k) {
468
0
      decomp_cpd[i] = cpd;
469
0
      cpd = NULL;
470
0
    }
471
472
0
    if (ccc == 0)
473
0
      seen_starter = TRUE;
474
0
    else if (!seen_starter)
475
0
      ns_lead++;
476
0
    else
477
0
      ns_trail++;
478
0
  }
479
480
  /* Lookup canonical decomposed code points if necessary (avoid double
481
     lookups). */
482
0
  if (decomp != decomp_k) {
483
0
    for (i = 0; i < len; i++) {
484
0
      if (cpd == NULL)
485
0
        cpd = unicode_code_point_get_data(decomp[i]);
486
0
      decomp_cpd[i] = cpd;
487
0
      cpd = NULL;
488
0
    }
489
0
  }
490
491
0
  ctx->nonstarter_count += ns_lead;
492
0
  if (ctx->nonstarter_count > UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN) {
493
0
    ctx->nonstarter_count = 0;
494
    /* Write U+034F COMBINING GRAPHEME JOINER (CGJ)
495
     */
496
0
    ctx->cp_buffer[ctx->buffer_len] = 0x034F;
497
0
    ctx->cpd_buffer[ctx->buffer_len] =
498
0
      unicode_code_point_get_data(0x034F);
499
0
    ctx->buffer_len++;
500
0
  } else if (seen_starter) {
501
0
    ctx->nonstarter_count = ns_trail;
502
0
  }
503
504
  /*
505
   * Buffer the requested decomposition for COA sorting
506
   */
507
508
0
  bool pending_decomp = FALSE;
509
510
0
  i_assert(ctx->buffer_len <= buffer_size);
511
0
  if ((ctx->buffer_len + len) > buffer_size) {
512
    /* Decomposition now overflows the buffer. Record and mark it as
513
       pending and come back to it once the buffer is sufficiently
514
       drained. */
515
0
    i_assert(ctx->pending_decomp == 0 || ctx->pending_cp == cp);
516
0
    ctx->pending_decomp = len;
517
0
    ctx->pending_cp = cp;
518
0
    ctx->pending_cpd = cpd;
519
0
    pending_decomp = TRUE;
520
0
  } else {
521
0
    for (i = 0; i < len; i++) {
522
0
      ctx->cp_buffer[ctx->buffer_len] = decomp[i];
523
0
      ctx->cpd_buffer[ctx->buffer_len] = decomp_cpd[i];
524
0
      ctx->buffer_len++;
525
0
    }
526
0
    i_assert(ctx->buffer_len <= buffer_size);
527
0
  }
528
529
  /*
530
   * Apply the Canonical Ordering Algorithm (COA)
531
   */
532
533
0
  bool changed = TRUE;
534
0
  size_t last_qc_y;
535
0
  size_t last_starter;
536
537
0
  while (changed) {
538
0
    changed = FALSE;
539
0
    last_qc_y = 0;
540
0
    last_starter = 0;
541
542
0
    for (i = I_MAX(1, ctx->buffer_output_max);
543
0
         i < ctx->buffer_len; i++) {
544
0
      const struct unicode_code_point_data
545
0
        *cpd_i = ctx->cpd_buffer[i],
546
0
        *cpd_im1 = ctx->cpd_buffer[i - 1];
547
0
      uint8_t ccc_i = cpd_i->canonical_combining_class;
548
0
      uint8_t ccc_im1 = cpd_im1->canonical_combining_class;
549
0
      bool nqc = ((cpd_i->nf_quick_check & nf_qc_mask) == 0);
550
551
0
      if (ccc_i == 0) {
552
0
        last_starter = i;
553
0
        if (nqc)
554
0
          last_qc_y = i;
555
0
      } else if (ccc_im1 > ccc_i) {
556
0
        unicode_nf_buffer_swap(ctx, i - 1, i);
557
0
        changed = TRUE;
558
0
      }
559
0
    }
560
0
  }
561
0
  ctx->buffer_output_max = I_MIN(last_qc_y, last_starter);
562
0
  return !pending_decomp;
563
0
}
564
565
static bool
566
unicode_nf_input_cp(struct unicode_nf_context *ctx, uint32_t cp,
567
        const struct unicode_code_point_data *cpd)
568
0
{
569
0
  static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE;
570
571
0
  i_assert(ctx->buffer_len <= buffer_size);
572
0
  if (ctx->buffer_len == buffer_size ||
573
0
      (ctx->pending_decomp > 0 &&
574
0
       ctx->buffer_len > (buffer_size - ctx->pending_decomp))) {
575
    /* Buffer is (still too) full. */
576
0
    return FALSE;
577
0
  }
578
579
0
  if (ctx->pending_decomp > 0) {
580
    /* Earlier, the buffer was too full for the next decomposition
581
       and it was recorded and marked as pending. Now, we have the
582
       opportunity to continue. */
583
0
    if (!unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd))
584
0
      return FALSE;
585
0
    ctx->pending_decomp = 0;
586
587
0
    i_assert(ctx->buffer_len <= buffer_size);
588
0
    if (ctx->buffer_output_max > 0 &&
589
0
        ctx->buffer_len == buffer_size) {
590
      /* Pending decomposition filled the buffer completely.
591
       */
592
0
      return FALSE;
593
0
    }
594
0
  }
595
596
  /* Normal input of next code point */
597
0
  (void)unicode_nf_cp(ctx, cp, cpd);
598
0
  return TRUE;
599
0
}
600
601
static ssize_t
602
unicode_nf_input(struct unicode_transform *trans,
603
     const struct unicode_transform_buffer *buf,
604
     const char **error_r ATTR_UNUSED)
605
0
{
606
0
  struct unicode_nf_context *ctx =
607
0
    container_of(trans, struct unicode_nf_context, transform);
608
0
  size_t n;
609
610
0
  for (n = 0; n < buf->cp_count; n++) {
611
0
    if (!unicode_nf_input_cp(ctx, buf->cp[n],
612
0
           (buf->cp_data == NULL ?
613
0
            NULL : buf->cp_data[n])))
614
0
      break;
615
0
  }
616
0
  return n;
617
0
}
618
619
/* Try to compose the pair <l, r>. Hangul composition is tried first; if
   that yields nothing, the composition is looked up via the code point
   data of l. *l_data caches that data: it is filled in here when it is
   NULL and a lookup is needed. Returns the result of whichever step
   answered; the caller treats 0x0000 as "no composition". */
static uint32_t
unicode_nf_compose_pair(uint32_t l, uint32_t r,
                        const struct unicode_code_point_data **l_data)
{
  const uint32_t hangul = unicode_hangul_compose_pair(l, r);

  if (hangul > 0x0000)
    return hangul;

  const struct unicode_code_point_data *data = *l_data;

  if (data == NULL) {
    data = unicode_code_point_get_data(l);
    *l_data = data;
  }
  return unicode_code_point_data_find_composition(data, r);
}
632
633
static int
634
unicode_nf_flush_more(struct unicode_nf_context *ctx, bool finished,
635
          const char **error_r)
636
0
{
637
0
  struct unicode_transform *trans = &ctx->transform;
638
639
0
  ctx->finished = finished;
640
641
0
  if (ctx->buffer_len == 0)
642
0
    return 1;
643
0
  if (!finished && ctx->buffer_output_max == 0)
644
0
    return 0;
645
646
  /*
647
   * Apply the Canonical Composition Algorithm
648
   */
649
650
0
  if (ctx->finished)
651
0
    ctx->buffer_output_max = ctx->buffer_len;
652
0
  i_assert(ctx->buffer_processed <= ctx->buffer_output_max);
653
0
  if (ctx->compose && ctx->buffer_len > 1) {
654
0
    size_t in_pos, out_pos, starter;
655
0
    int last_ccc;
656
657
0
    out_pos = 1;
658
0
    last_ccc = -1;
659
0
    starter = 0;
660
0
    for (in_pos = I_MAX(1, ctx->buffer_processed);
661
0
         in_pos < ctx->buffer_output_max; in_pos++) {
662
0
      uint32_t cp = ctx->cp_buffer[in_pos];
663
0
      const struct unicode_code_point_data *cpd =
664
0
        ctx->cpd_buffer[in_pos];
665
666
0
      if (cpd == NULL) {
667
0
        ctx->cpd_buffer[in_pos] = cpd =
668
0
          unicode_code_point_get_data(cp);
669
0
      }
670
671
0
      uint8_t ccc = cpd->canonical_combining_class;
672
0
      uint32_t comp = 0x0000;
673
0
      if (last_ccc < (int)ccc) {
674
0
        comp = unicode_nf_compose_pair(
675
0
          ctx->cp_buffer[starter], cp,
676
0
          &ctx->cpd_buffer[starter]);
677
0
      }
678
0
      if (comp > 0x0000) {
679
0
        ctx->cp_buffer[starter] = comp;
680
0
        ctx->cpd_buffer[starter] = NULL;
681
0
      } else if (ccc == 0) {
682
0
        starter = out_pos;
683
0
        last_ccc = -1;
684
0
        ctx->cp_buffer[out_pos] = cp;
685
0
        ctx->cpd_buffer[out_pos] = cpd;
686
0
        out_pos++;
687
0
      } else {
688
0
        last_ccc = ccc;
689
0
        ctx->cp_buffer[out_pos] = cp;
690
0
        ctx->cpd_buffer[out_pos] = cpd;
691
0
        out_pos++;
692
0
      }
693
0
    }
694
0
    if (finished) {
695
0
      ctx->buffer_len = ctx->buffer_output_max = out_pos;
696
0
    } else if (in_pos > out_pos) {
697
0
      unicode_nf_buffer_delete(ctx, out_pos,
698
0
             (in_pos - out_pos));
699
0
      ctx->buffer_output_max = out_pos;
700
0
    }
701
0
  }
702
0
  ctx->buffer_processed = ctx->buffer_output_max;
703
704
  /*
705
   * Forward output
706
   */
707
708
0
  size_t output_len = ctx->buffer_processed;
709
0
  ssize_t sret;
710
711
0
  sret = uniform_transform_forward(trans, ctx->cp_buffer, ctx->cpd_buffer,
712
0
           output_len, error_r);
713
0
  if (sret < 0)
714
0
    return -1;
715
716
0
  i_assert((size_t)sret <= ctx->buffer_processed);
717
0
  unicode_nf_buffer_delete(ctx, 0, sret);
718
0
  ctx->buffer_processed -= sret;
719
0
  ctx->buffer_output_max -= sret;
720
0
  if ((size_t)sret < output_len)
721
0
    return 0;
722
0
  return 1;
723
0
}
724
725
static int
726
unicode_nf_flush(struct unicode_transform *trans, bool finished,
727
     const char **error_r)
728
0
{
729
0
  struct unicode_nf_context *ctx =
730
0
    container_of(trans, struct unicode_nf_context, transform);
731
0
  int ret;
732
733
0
  ret = unicode_nf_flush_more(ctx, finished, error_r);
734
0
  if (ret <= 0)
735
0
    return ret;
736
737
0
  if (finished && ctx->pending_decomp > 0) {
738
0
    if (unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd))
739
0
      ctx->pending_decomp = 0;
740
0
  }
741
742
0
  return unicode_nf_flush_more(ctx, finished, error_r);
743
0
}
744
745
/*
746
 * Normalization check
747
 */
748
749
static ssize_t
750
unicode_nf_check_sink_input(struct unicode_transform *trans,
751
          const struct unicode_transform_buffer *buf,
752
          const char **error_r);
753
754
static const struct unicode_transform_def unicode_nf_check_sink_def = {
755
  .input = unicode_nf_check_sink_input,
756
};
757
758
void unicode_nf_checker_init(struct unicode_nf_checker *unc_r,
759
           enum unicode_nf_type type)
760
0
{
761
0
  i_zero(unc_r);
762
763
0
  switch (type) {
764
0
  case UNICODE_NFD:
765
0
    unc_r->canonical = TRUE;
766
0
    unc_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK;
767
0
    unc_r->nf_qc_yes = UNICODE_NFD_QUICK_CHECK_YES;
768
0
    unc_r->nf_qc_no = UNICODE_NFD_QUICK_CHECK_NO;
769
0
    break;
770
0
  case UNICODE_NFKD:
771
0
    unc_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK;
772
0
    unc_r->nf_qc_yes = UNICODE_NFKD_QUICK_CHECK_YES;
773
0
    unc_r->nf_qc_no = UNICODE_NFKD_QUICK_CHECK_NO;
774
0
    break;
775
0
  case UNICODE_NFC:
776
0
    unc_r->compose = TRUE;
777
0
    unc_r->canonical = TRUE;
778
0
    unc_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK;
779
0
    unc_r->nf_qc_yes = UNICODE_NFC_QUICK_CHECK_YES;
780
0
    unc_r->nf_qc_no = UNICODE_NFC_QUICK_CHECK_NO;
781
0
    break;
782
0
  case UNICODE_NFKC:
783
0
    unc_r->compose = TRUE;
784
0
    unc_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK;
785
0
    unc_r->nf_qc_yes = UNICODE_NFKC_QUICK_CHECK_YES;
786
0
    unc_r->nf_qc_no = UNICODE_NFKC_QUICK_CHECK_NO;
787
0
    break;
788
0
  }
789
790
0
  unicode_nf_init(&unc_r->nf, type);
791
0
  unicode_transform_init(&unc_r->sink, &unicode_nf_check_sink_def);
792
0
  unicode_transform_chain(&unc_r->nf.transform, &unc_r->sink);
793
0
}
794
795
void unicode_nf_checker_reset(struct unicode_nf_checker *unc)
796
0
{
797
0
  enum unicode_nf_type type =
798
0
    (unc->compose ? (unc->canonical ? UNICODE_NFC : UNICODE_NFKC) :
799
0
        (unc->canonical ? UNICODE_NFD : UNICODE_NFKD));
800
801
0
  unicode_nf_checker_init(unc, type);
802
0
}
803
804
static ssize_t
805
unicode_nf_check_sink_input(struct unicode_transform *trans,
806
          const struct unicode_transform_buffer *buf,
807
          const char **error_r)
808
0
{
809
0
  struct unicode_nf_checker *unc =
810
0
    container_of(trans, struct unicode_nf_checker, sink);
811
0
  size_t n;
812
813
0
  i_assert(unc->buffer_len > 0);
814
0
  i_assert(buf->cp_count <= unc->buffer_len);
815
0
  for (n = 0; n < buf->cp_count; n++) {
816
0
    if (buf->cp[n] != unc->cp_buffer[n]) {
817
0
      *error_r = "Not normalized";
818
0
      return -1;
819
0
    }
820
0
  }
821
0
  if (buf->cp_count == unc->buffer_len)
822
0
    unc->buffer_len = 0;
823
0
  else {
824
0
    unc->buffer_len -= buf->cp_count;
825
0
    memmove(&unc->cp_buffer[0], &unc->cp_buffer[buf->cp_count],
826
0
      unc->buffer_len);
827
0
  }
828
0
  return buf->cp_count;
829
0
}
830
831
int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp,
832
           const struct unicode_code_point_data **_cp_data)
833
0
{
834
0
  const struct unicode_code_point_data *cpd_last = unc->cpd_last;
835
836
0
  if (*_cp_data == NULL)
837
0
    *_cp_data = unicode_code_point_get_data(cp);
838
839
0
  const struct unicode_code_point_data *cp_data = *_cp_data;
840
0
  const char *error;
841
0
  int ret;
842
843
0
  unc->cpd_last = cp_data;
844
845
0
  if (cp_data->general_category == UNICODE_GENERAL_CATEGORY_INVALID)
846
0
    return -1;
847
0
  if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_no)
848
0
    return 0;
849
0
  if (cpd_last != NULL && cp_data->canonical_combining_class != 0 &&
850
0
      cpd_last->canonical_combining_class >
851
0
    cp_data->canonical_combining_class)
852
0
    return 0;
853
0
  if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_yes &&
854
0
      cp_data->canonical_combining_class == 0) {
855
0
    if (unc->buffer_len > 0) {
856
0
      ret = unicode_transform_flush(&unc->nf.transform,
857
0
                  &error);
858
0
      i_assert(ret != 0);
859
0
      if (ret < 0)
860
0
        return 0;
861
0
      unicode_nf_reset(&unc->nf);
862
0
    }
863
0
    i_assert(unc->buffer_len == 0);
864
0
    unc->cp_buffer[0] = cp;
865
0
    return 1;
866
0
  }
867
868
0
  struct unicode_transform_buffer buf;
869
0
  ssize_t sret;
870
871
0
  if (unc->buffer_len == 0 && cpd_last != NULL) {
872
0
    i_zero(&buf);
873
0
    buf.cp = &unc->cp_buffer[0];
874
0
    buf.cp_data = &cpd_last;
875
0
    buf.cp_count = 1;
876
877
0
    unc->buffer_len++;
878
0
    sret = unicode_transform_input_buf(&unc->nf.transform, &buf,
879
0
               &error);
880
0
    i_assert(sret != 0);
881
0
    if (sret < 0)
882
0
      return 0;
883
0
  }
884
885
0
  i_assert(unc->buffer_len < UNICODE_NF_BUFFER_SIZE);
886
0
  unc->cp_buffer[unc->buffer_len] = cp;
887
0
  unc->buffer_len++;
888
889
0
  i_zero(&buf);
890
0
  buf.cp = &cp;
891
0
  buf.cp_data = &cp_data;
892
0
  buf.cp_count = 1;
893
0
  sret = unicode_transform_input_buf(&unc->nf.transform, &buf, &error);
894
0
  i_assert(sret != 0);
895
0
  if (sret < 0)
896
0
    return 0;
897
0
  return 1;
898
0
}
899
900
int unicode_nf_checker_finish(struct unicode_nf_checker *unc)
901
0
{
902
0
  if (unc->buffer_len == 0)
903
0
    return 1;
904
905
0
  const char *error;
906
0
  int ret;
907
908
0
  ret = unicode_transform_flush(&unc->nf.transform, &error);
909
0
  i_assert(ret != 0);
910
0
  return (ret > 0 ? 1 : 0);
911
0
}
912
913
/*
914
 * Casemap Transform
915
 */
916
917
static size_t
918
unicode_casemap_uppercase_cp(const struct unicode_code_point_data *cp_data,
919
           const uint32_t **map_r);
920
static size_t
921
unicode_casemap_lowercase_cp(const struct unicode_code_point_data *cp_data,
922
           const uint32_t **map_r);
923
static size_t
924
unicode_casemap_casefold_cp(const struct unicode_code_point_data *cp_data,
925
          const uint32_t **map_r);
926
927
static ssize_t
928
unicode_casemap_input(struct unicode_transform *trans,
929
          const struct unicode_transform_buffer *buf,
930
          const char **error_r);
931
static int
932
unicode_casemap_flush(struct unicode_transform *trans, bool finished,
933
          const char **error_r);
934
935
static const struct unicode_transform_def unicode_casemap_def = {
936
  .input = unicode_casemap_input,
937
  .flush = unicode_casemap_flush,
938
};
939
940
void unicode_casemap_init_uppercase(struct unicode_casemap *map_r)
941
0
{
942
0
  i_zero(map_r);
943
0
  unicode_transform_init(&map_r->transform, &unicode_casemap_def);
944
0
  map_r->map = unicode_casemap_uppercase_cp;
945
0
}
946
947
void unicode_casemap_init_lowercase(struct unicode_casemap *map_r)
948
0
{
949
0
  i_zero(map_r);
950
0
  unicode_transform_init(&map_r->transform, &unicode_casemap_def);
951
0
  map_r->map = unicode_casemap_lowercase_cp;
952
0
}
953
954
void unicode_casemap_init_casefold(struct unicode_casemap *map_r)
955
0
{
956
0
  i_zero(map_r);
957
0
  unicode_transform_init(&map_r->transform, &unicode_casemap_def);
958
0
  map_r->map = unicode_casemap_casefold_cp;
959
0
}
960
961
/* Mapping callback for uppercase: delegates to the uppercase mapping
   lookup of the code point data and returns its result unchanged. */
static size_t
unicode_casemap_uppercase_cp(const struct unicode_code_point_data *cp_data,
                             const uint32_t **map_r)
{
  const size_t map_len =
    unicode_code_point_data_get_uppercase_mapping(cp_data, map_r);

  return map_len;
}
967
968
/* Mapping callback for lowercase: delegates to the lowercase mapping
   lookup of the code point data and returns its result unchanged. */
static size_t
unicode_casemap_lowercase_cp(const struct unicode_code_point_data *cp_data,
                             const uint32_t **map_r)
{
  const size_t map_len =
    unicode_code_point_data_get_lowercase_mapping(cp_data, map_r);

  return map_len;
}
974
975
/* Mapping callback for case folding: delegates to the casefold mapping
   lookup of the code point data and returns its result unchanged. */
static size_t
unicode_casemap_casefold_cp(const struct unicode_code_point_data *cp_data,
                            const uint32_t **map_r)
{
  const size_t map_len =
    unicode_code_point_data_get_casefold_mapping(cp_data, map_r);

  return map_len;
}
981
982
static ssize_t
983
unicode_casemap_input_cp(struct unicode_casemap *map, uint32_t cp,
984
       const struct unicode_code_point_data *cp_data,
985
       const char **error_r)
986
0
{
987
0
  bool was_buffered = map->cp_buffered;
988
0
  ssize_t sret;
989
990
0
  if (cp_data == NULL)
991
0
    cp_data = unicode_code_point_get_data(cp);
992
993
0
  const uint32_t *map_cps;
994
0
  const struct unicode_code_point_data *const *map_cps_data = NULL;
995
0
  size_t map_cps_len;
996
997
0
  map_cps_len = map->map(cp_data, &map_cps);
998
0
  if (map_cps_len == 0) {
999
0
    map_cps = &cp;
1000
0
    map_cps_data = &cp_data;
1001
0
    map_cps_len = 1;
1002
0
  }
1003
0
  i_assert(map_cps_len > map->cp_map_pos);
1004
1005
0
  map_cps += map->cp_map_pos;
1006
0
  map_cps_len -= map->cp_map_pos;
1007
0
  sret = uniform_transform_forward(&map->transform,
1008
0
           map_cps, map_cps_data, map_cps_len,
1009
0
           error_r);
1010
0
  if (sret < 0) {
1011
0
    i_assert(*error_r != NULL);
1012
0
    return -1;
1013
0
  }
1014
0
  if ((size_t)sret < map_cps_len) {
1015
0
    map->cp_buffered = TRUE;
1016
0
    map->cp = cp;
1017
0
    map->cp_data = cp_data;
1018
0
    map->cp_map_pos += sret;
1019
0
    return (was_buffered ? 0 : 1);
1020
0
  }
1021
1022
0
  map->cp_buffered = FALSE;
1023
0
  map->cp_data = NULL;
1024
0
  map->cp_map_pos = 0;
1025
0
  return 1;
1026
0
}
1027
1028
static ssize_t
1029
unicode_casemap_input(struct unicode_transform *trans,
1030
          const struct unicode_transform_buffer *buf,
1031
          const char **error_r)
1032
0
{
1033
0
  struct unicode_casemap *map =
1034
0
    container_of(trans, struct unicode_casemap, transform);
1035
0
  int ret;
1036
1037
0
  ret = unicode_casemap_flush(trans, TRUE, error_r);
1038
0
  if (ret < 0) {
1039
0
    i_assert(*error_r != NULL);
1040
0
    return -1;
1041
0
  }
1042
0
  if (map->cp_buffered)
1043
0
    return 0;
1044
1045
0
  size_t n;
1046
0
  for (n = 0; n < buf->cp_count; n++) {
1047
0
    if (map->cp_buffered)
1048
0
      break;
1049
0
    ret = unicode_casemap_input_cp(map, buf->cp[n],
1050
0
                 (buf->cp_data != NULL ?
1051
0
                  buf->cp_data[n] : NULL),
1052
0
                 error_r);
1053
0
    if (ret < 0) {
1054
0
      i_assert(*error_r != NULL);
1055
0
      return -1;
1056
0
    }
1057
0
    if (ret == 0)
1058
0
      break;
1059
0
  }
1060
0
  return n;
1061
0
}
1062
1063
static int
1064
unicode_casemap_flush(struct unicode_transform *trans,
1065
          bool finished ATTR_UNUSED, const char **error_r)
1066
0
{
1067
0
  struct unicode_casemap *map =
1068
0
    container_of(trans, struct unicode_casemap, transform);
1069
0
  int ret;
1070
1071
0
  if (!map->cp_buffered)
1072
0
    return 1;
1073
1074
0
  ret = unicode_casemap_input_cp(map, map->cp, map->cp_data, error_r);
1075
0
  i_assert(ret >= 0 || *error_r != NULL);
1076
0
  return ret;
1077
0
}
1078
1079
/*
1080
 * RFC 5051 - Simple Unicode Collation Algorithm
1081
 */
1082
1083
/* Prepare ctx for use with unicode_rfc5051_normalize() by zeroing it. */
void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx)
{
  i_zero(ctx);
}
1087
1088
size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
1089
         uint32_t cp, const uint32_t **norm_r)
1090
0
{
1091
0
  const struct unicode_code_point_data *cpd;
1092
0
  size_t len;
1093
1094
0
  cpd = unicode_code_point_get_data(cp);
1095
0
  if (cpd->simple_titlecase_mapping != 0x0000)
1096
0
    cp = cpd->simple_titlecase_mapping;
1097
1098
0
  if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) {
1099
0
    *norm_r = ctx->buffer;
1100
0
    return unicode_hangul_decompose(cp, ctx->buffer);
1101
0
  }
1102
1103
0
  len = unicode_code_point_get_full_decomposition(cp, FALSE, norm_r);
1104
0
  if (len == 0) {
1105
0
    ctx->buffer[0] = cp;
1106
0
    *norm_r = ctx->buffer;
1107
0
    return 1;
1108
0
  }
1109
0
  return len;
1110
0
}