Coverage Report

Created: 2025-11-15 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/dovecot/src/lib/unicode-transform.c
Line
Count
Source
1
/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
2
3
#include "lib.h"
4
#include "unichar.h"
5
#include "unicode-data.h"
6
#include "unicode-transform.h"
7
8
0
#define HANGUL_FIRST 0xac00
9
0
#define HANGUL_LAST 0xd7a3
10
11
/*
12
 * Transform
13
 */
14
15
ssize_t uniform_transform_forward(
16
  struct unicode_transform *trans, const uint32_t *out,
17
  const struct unicode_code_point_data *const *out_data, size_t out_len,
18
  const char **error_r)
19
0
{
20
0
  struct unicode_transform_buffer buf_next;
21
0
  ssize_t sret;
22
23
0
  i_zero(&buf_next);
24
0
  buf_next.cp = out;
25
0
  buf_next.cp_data = out_data;
26
0
  buf_next.cp_count = out_len;
27
28
0
  i_assert(trans->next != NULL);
29
0
  i_assert(trans->next->def != NULL);
30
0
  i_assert(trans->next->def->input != NULL);
31
0
  sret = trans->next->def->input(trans->next, &buf_next, error_r);
32
33
0
  i_assert(sret >= 0 || *error_r != NULL);
34
0
  i_assert(sret <= (ssize_t)out_len);
35
0
  return sret;
36
0
}
37
38
ssize_t unicode_transform_input_buf(struct unicode_transform *trans,
39
            const struct unicode_transform_buffer *buf,
40
            const char **error_r)
41
0
{
42
0
  struct unicode_transform_buffer in_buf;
43
0
  size_t input_total = 0;
44
0
  ssize_t sret;
45
0
  bool flushed = FALSE;
46
0
  int ret;
47
48
0
  *error_r = NULL;
49
50
0
  in_buf = *buf;
51
52
0
  while (in_buf.cp_count > 0) {
53
0
    if (in_buf.cp_count > 0) {
54
0
      i_assert(trans->def->input != NULL);
55
0
      sret = trans->def->input(trans, &in_buf, error_r);
56
0
      if (sret < 0) {
57
0
        i_assert(*error_r != NULL);
58
0
        return -1;
59
0
      }
60
0
      if (sret > 0) {
61
0
        i_assert((size_t)sret <= in_buf.cp_count);
62
0
        in_buf.cp += sret;
63
0
        in_buf.cp_count -= sret;
64
0
        input_total += sret;
65
0
        flushed = FALSE;
66
0
        continue;
67
0
      }
68
0
      if (sret == 0 && flushed)
69
0
        break;
70
0
    }
71
72
0
    struct unicode_transform *tp = trans;
73
74
0
    while (tp->next != NULL) {
75
0
      if (tp->def->flush != NULL) {
76
0
        ret = tp->def->flush(tp, FALSE, error_r);
77
0
        if (ret < 0) {
78
0
          i_assert(*error_r != NULL);
79
0
          return -1;
80
0
        }
81
0
      }
82
0
      tp = tp->next;
83
0
    }
84
85
0
    flushed = TRUE;
86
0
  }
87
88
0
  return input_total;
89
0
}
90
91
int unicode_transform_flush(struct unicode_transform *trans,
92
          const char **error_r)
93
0
{
94
0
  int ret;
95
96
0
  *error_r = NULL;
97
98
0
  while (trans != NULL) {
99
0
    struct unicode_transform *tp = trans;
100
0
    bool progress = FALSE;
101
102
0
    while (tp != NULL) {
103
0
      if (tp->def->flush == NULL) {
104
0
        progress = TRUE;
105
0
        if (tp == trans)
106
0
          trans = trans->next;
107
0
      } else {
108
0
        ret = tp->def->flush(tp, (tp == trans), error_r);
109
0
        if (ret < 0) {
110
0
          i_assert(*error_r != NULL);
111
0
          return -1;
112
0
        }
113
0
        if (ret > 0) {
114
0
          progress = TRUE;
115
0
          if (tp == trans)
116
0
            trans = trans->next;
117
0
        }
118
0
      }
119
0
      tp = tp->next;
120
0
    }
121
0
    if (!progress)
122
0
      return 0;
123
0
  }
124
0
  return 1;
125
0
}
126
127
/* Buffer Sink */
128
129
static ssize_t
130
unicode_buffer_sink_input(struct unicode_transform *trans,
131
        const struct unicode_transform_buffer *buf,
132
        const char **error_r);
133
134
static const struct unicode_transform_def unicode_buffer_sink_def = {
135
  .input = unicode_buffer_sink_input,
136
};
137
138
void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
139
            buffer_t *buffer)
140
0
{
141
0
  i_zero(sink);
142
0
  unicode_transform_init(&sink->transform, &unicode_buffer_sink_def);
143
0
  sink->buffer = buffer;
144
0
}
145
146
static ssize_t
147
unicode_buffer_sink_input(struct unicode_transform *trans,
148
        const struct unicode_transform_buffer *buf,
149
        const char **error_r ATTR_UNUSED)
150
0
{
151
0
  struct unicode_buffer_sink *sink =
152
0
    container_of(trans, struct unicode_buffer_sink, transform);
153
154
0
  uni_ucs4_to_utf8(buf->cp, buf->cp_count, sink->buffer);
155
0
  return buf->cp_count;
156
0
}
157
158
/* Static Array Sink */
159
160
static ssize_t
161
unicode_static_array_sink_input(struct unicode_transform *trans,
162
        const struct unicode_transform_buffer *buf,
163
        const char **error_r);
164
165
static const struct unicode_transform_def unicode_static_array_sink_def = {
166
  .input = unicode_static_array_sink_input,
167
};
168
169
void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
170
            uint32_t *array, size_t array_size,
171
            size_t *array_pos)
172
0
{
173
0
  i_zero(sink);
174
0
  unicode_transform_init(&sink->transform,
175
0
             &unicode_static_array_sink_def);
176
0
  sink->array = array;
177
0
  sink->array_size = array_size;
178
0
  sink->array_pos = array_pos;
179
0
}
180
181
static ssize_t
182
unicode_static_array_sink_input(struct unicode_transform *trans,
183
        const struct unicode_transform_buffer *buf,
184
        const char **error_r)
185
0
{
186
0
  struct unicode_static_array_sink *sink =
187
0
    container_of(trans, struct unicode_static_array_sink,
188
0
           transform);
189
190
0
  if (*sink->array_pos + buf->cp_count > sink->array_size) {
191
0
    *error_r = "Output overflow";
192
0
    return -1;
193
0
  }
194
0
  memcpy(sink->array + *sink->array_pos, buf->cp,
195
0
         buf->cp_count * sizeof(*buf->cp));
196
0
  *sink->array_pos += buf->cp_count;
197
0
  return buf->cp_count;
198
0
}
199
200
/*
201
 * Hangul syllable (de)composition
202
 */
203
204
0
/* Constants of the Hangul syllable (de)composition arithmetic: the first
   code point of the syllable block and of the L, V and T parts, the
   number of code points in each part range, and the derived exclusive
   end of each range. */
#define UNI_HANGUL_S_BASE 0xac00
#define UNI_HANGUL_L_BASE 0x1100
#define UNI_HANGUL_V_BASE 0x1161
#define UNI_HANGUL_T_BASE 0x11a7
#define UNI_HANGUL_L_COUNT 19
#define UNI_HANGUL_V_COUNT 21
#define UNI_HANGUL_T_COUNT 28
#define UNI_HANGUL_N_COUNT (UNI_HANGUL_V_COUNT * UNI_HANGUL_T_COUNT)
#define UNI_HANGUL_L_END (UNI_HANGUL_L_BASE + UNI_HANGUL_L_COUNT)
#define UNI_HANGUL_V_END (UNI_HANGUL_V_BASE + UNI_HANGUL_V_COUNT)
#define UNI_HANGUL_T_END (UNI_HANGUL_T_BASE + UNI_HANGUL_T_COUNT)
#define UNI_HANGUL_S_END 0xD7A4

/* Decompose the Hangul syllable `cp` into its parts (The Unicode
   Standard, Section 3.12.2: Hangul Syllable Decomposition).

   Writes <LPart, VPart> or <LPart, VPart, TPart> to `buf` and returns
   the number of code points written (2 or 3). The range of `cp` is not
   checked here. */
static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
{
	const size_t syllable = cp - UNI_HANGUL_S_BASE;
	const size_t trailing = syllable % UNI_HANGUL_T_COUNT;

	buf[0] = UNI_HANGUL_L_BASE + syllable / UNI_HANGUL_N_COUNT;
	buf[1] = UNI_HANGUL_V_BASE +
		(syllable % UNI_HANGUL_N_COUNT) / UNI_HANGUL_T_COUNT;
	if (trailing == 0) {
		/* No TPart: the syllable is an LV syllable */
		return 2;
	}
	buf[2] = UNI_HANGUL_T_BASE + trailing;
	return 3;
}

/* Compose the pair <l, r> into a Hangul syllable (The Unicode Standard,
   Section 3.12.3: Hangul Syllable Composition).

   Handles <LPart, VPart> and <LVPart, TPart>. Returns the composed
   syllable, or 0x0000 when the pair does not compose. */
static uint32_t unicode_hangul_compose_pair(uint32_t l, uint32_t r)
{
	const int l_is_lpart =
		(l >= UNI_HANGUL_L_BASE && l < UNI_HANGUL_L_END);
	const int r_is_vpart =
		(r >= UNI_HANGUL_V_BASE && r < UNI_HANGUL_V_END);

	if (l_is_lpart && r_is_vpart) {
		/* <LPart, VPart> */
		size_t lead = l - UNI_HANGUL_L_BASE;
		size_t vowel = r - UNI_HANGUL_V_BASE;

		return UNI_HANGUL_S_BASE + (lead * UNI_HANGUL_N_COUNT +
					    vowel * UNI_HANGUL_T_COUNT);
	}

	const int l_is_syllable =
		(l >= UNI_HANGUL_S_BASE && l < UNI_HANGUL_S_END);
	const int r_is_tpart =
		(r >= (UNI_HANGUL_T_BASE + 1u) && r < UNI_HANGUL_T_END);

	/* <LVPart, TPart>: the left side must be a syllable without a
	   trailing part. */
	if (l_is_syllable && r_is_tpart &&
	    ((l - UNI_HANGUL_S_BASE) % UNI_HANGUL_T_COUNT) == 0) {
		size_t trailing = r - UNI_HANGUL_T_BASE;

		return l + trailing;
	}
	return 0x0000;
}
272
273
/*
274
 * Normalization transform: NFD, NFKD, NFC, NFKC
275
 */
276
277
static ssize_t
278
unicode_nf_input(struct unicode_transform *trans,
279
     const struct unicode_transform_buffer *buf,
280
     const char **error_r);
281
static int
282
unicode_nf_flush(struct unicode_transform *trans, bool finished,
283
     const char **error_r);
284
285
static const struct unicode_transform_def unicode_nf_def = {
286
  .input = unicode_nf_input,
287
  .flush = unicode_nf_flush,
288
};
289
290
void unicode_nf_init(struct unicode_nf_context *ctx_r,
291
         enum unicode_nf_type type)
292
0
{
293
0
  i_zero(ctx_r);
294
0
  unicode_transform_init(&ctx_r->transform, &unicode_nf_def);
295
296
0
  switch (type) {
297
0
  case UNICODE_NFD:
298
0
    ctx_r->canonical = TRUE;
299
0
    ctx_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK;
300
0
    break;
301
0
  case UNICODE_NFKD:
302
0
    ctx_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK;
303
0
    break;
304
0
  case UNICODE_NFC:
305
0
    ctx_r->compose = TRUE;
306
0
    ctx_r->canonical = TRUE;
307
0
    ctx_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK;
308
0
    break;
309
0
  case UNICODE_NFKC:
310
0
    ctx_r->compose = TRUE;
311
0
    ctx_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK;
312
0
    break;
313
0
  }
314
0
}
315
316
void unicode_nf_reset(struct unicode_nf_context *ctx)
317
0
{
318
0
  enum unicode_nf_type type =
319
0
    (ctx->compose ? (ctx->canonical ? UNICODE_NFC : UNICODE_NFKC) :
320
0
        (ctx->canonical ? UNICODE_NFD : UNICODE_NFKD));
321
0
  struct unicode_transform *next = ctx->transform.next;
322
323
0
  unicode_nf_init(ctx, type);
324
0
  unicode_transform_chain(&ctx->transform, next);
325
0
}
326
327
static void
328
unicode_nf_buffer_delete(struct unicode_nf_context *ctx, size_t offset,
329
       size_t count)
330
0
{
331
0
  if (count == 0)
332
0
    return;
333
334
0
  i_assert(offset < ctx->buffer_len);
335
0
  i_assert(count <= ctx->buffer_len);
336
0
  i_assert(offset <= (ctx->buffer_len - count));
337
338
0
  if (count == ctx->buffer_len) {
339
0
    ctx->buffer_len = 0;
340
0
    return;
341
0
  }
342
343
0
  size_t trailer = ctx->buffer_len - (offset + count);
344
0
  if (trailer > 0) {
345
0
    memmove(&ctx->cp_buffer[offset],
346
0
      &ctx->cp_buffer[offset + count],
347
0
      trailer * sizeof(ctx->cp_buffer[0]));
348
0
    memmove(&ctx->cpd_buffer[offset],
349
0
      &ctx->cpd_buffer[offset + count],
350
0
      trailer * sizeof(ctx->cpd_buffer[0]));
351
0
  }
352
0
  ctx->buffer_len -= count;
353
0
}
354
355
static void
356
unicode_nf_buffer_swap(struct unicode_nf_context *ctx,
357
           size_t idx1, size_t idx2)
358
0
{
359
0
  uint32_t tmp_cp = ctx->cp_buffer[idx2];
360
0
  const struct unicode_code_point_data *tmp_cpd = ctx->cpd_buffer[idx2];
361
362
0
  ctx->cp_buffer[idx2] = ctx->cp_buffer[idx1];
363
0
  ctx->cpd_buffer[idx2] = ctx->cpd_buffer[idx1];
364
0
  ctx->cp_buffer[idx1] = tmp_cp;
365
0
  ctx->cpd_buffer[idx1] = tmp_cpd;
366
0
}
367
368
static void
369
unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp,
370
        const struct unicode_code_point_data *cpd)
371
0
{
372
0
  static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE;
373
0
  uint8_t nf_qc_mask = ctx->nf_qc_mask;
374
0
  size_t i;
375
376
  /*
377
   * Decompose the code point
378
   */
379
380
0
  const uint32_t *decomp, *decomp_k;
381
0
  uint32_t decomp_hangul[3];
382
0
  size_t len, len_k;
383
384
0
  if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) {
385
0
    len = len_k = unicode_hangul_decompose(cp, decomp_hangul);
386
0
    decomp = decomp_k = decomp_hangul;
387
0
  } else {
388
0
    if (cpd == NULL)
389
0
      cpd = unicode_code_point_get_data(cp);
390
0
    len = unicode_code_point_data_get_full_decomposition(
391
0
      cpd, ctx->canonical, &decomp);
392
0
    if (len == 0) {
393
0
      decomp = &cp;
394
0
      len = 1;
395
0
    }
396
0
    len_k = len;
397
0
    decomp_k = decomp;
398
0
    if (ctx->canonical) {
399
0
      len_k = unicode_code_point_data_get_full_decomposition(
400
0
        cpd, ctx->canonical, &decomp_k);
401
0
      if (len_k == 0) {
402
0
        decomp_k = decomp;
403
0
        len_k = len;
404
0
      }
405
0
    }
406
0
    if (len > 0)
407
0
      cpd = NULL;
408
0
  }
409
410
0
  i_assert(len <= UNICODE_DECOMPOSITION_MAX_LENGTH);
411
0
  i_assert(len_k <= UNICODE_DECOMPOSITION_MAX_LENGTH);
412
413
0
  if ((ctx->buffer_len + len) > buffer_size) {
414
    /* Decomposition overflows the buffer. Record and mark it as
415
       pending and come back to it once the buffer is sufficiently
416
       drained. */
417
0
    i_assert(ctx->pending_decomp == 0);
418
0
    ctx->pending_decomp = len;
419
0
    ctx->pending_cp = cp;
420
0
    ctx->pending_cpd = cpd;
421
0
    return;
422
0
  }
423
424
  /* UAX15-D4: Stream-Safe Text Process is the process of producing a
425
     Unicode string in Stream-Safe Text Format by processing that string
426
     from start to finish, inserting U+034F COMBINING GRAPHEME JOINER
427
     (CGJ) within long sequences of non-starters. The exact position o
428
     the inserted CGJs are determined according to the following
429
     algorithm, which describes the generation of an output string from an
430
     input string:
431
432
     1. If the input string is empty, return an empty output string.
433
     2. Set nonStarterCount to zero.
434
     3. For each code point C in the input string:
435
    a. Produce the NFKD decomposition S.
436
    b. If nonStarterCount plus the number of initial non-starters in
437
       S is greater than 30, append a CGJ to the output string and
438
       set the nonStarterCount to zero.
439
    c. Append C to the output string.
440
    d. If there are no starters in S, increment nonStarterCount by
441
       the number of code points in S; otherwise, set
442
       nonStarterCount to the number of trailing non-starters in S
443
       (which may be zero).
444
     4. Return the output string.
445
   */
446
447
  /* Determine number of leading and trailing non-starters in full NFKD
448
     decomposition. */
449
0
  const struct unicode_code_point_data *
450
0
    decomp_cpd[UNICODE_DECOMPOSITION_MAX_LENGTH];
451
0
  size_t ns_lead = 0, ns_trail = 0;
452
0
  bool seen_starter = FALSE;
453
0
  for (i = 0; i < len_k; i++) {
454
0
    if (cpd == NULL)
455
0
      cpd = unicode_code_point_get_data(decomp[i]);
456
457
0
    uint8_t ccc = cpd->canonical_combining_class;
458
459
0
    if (decomp == decomp_k) {
460
0
      decomp_cpd[i] = cpd;
461
0
      cpd = NULL;
462
0
    }
463
464
0
    if (ccc == 0)
465
0
      seen_starter = TRUE;
466
0
    else if (!seen_starter)
467
0
      ns_lead++;
468
0
    else
469
0
      ns_trail++;
470
0
  }
471
472
  /* Lookup canonical decomposed code points if necessary (avoid double
473
     lookups). */
474
0
  if (decomp != decomp_k) {
475
0
    for (i = 0; i < len; i++) {
476
0
      if (cpd == NULL)
477
0
        cpd = unicode_code_point_get_data(decomp[i]);
478
0
      decomp_cpd[i] = cpd;
479
0
      cpd = NULL;
480
0
    }
481
0
  }
482
483
0
  ctx->nonstarter_count += ns_lead;
484
0
  if (ctx->nonstarter_count > 30) {
485
0
    ctx->nonstarter_count = ns_trail;
486
487
    /* Write U+034F COMBINING GRAPHEME JOINER (CGJ)
488
     */
489
0
    ctx->cp_buffer[ctx->buffer_len] = 0x034F;
490
0
    ctx->cpd_buffer[ctx->buffer_len] =
491
0
      unicode_code_point_get_data(0x034F);
492
0
    ctx->buffer_len++;
493
0
  }
494
495
  /*
496
   * Buffer the requested decomposition for COA sorting
497
   */
498
499
0
  i_assert(ctx->buffer_len <= buffer_size);
500
0
  if ((ctx->buffer_len + len) > buffer_size) {
501
    /* Decomposition now overflows the buffer. Record and mark it as
502
       pending and come back to it once the buffer is sufficiently
503
       drained. */
504
0
    i_assert(ctx->pending_decomp == 0);
505
0
    ctx->pending_decomp = len;
506
0
    ctx->pending_cp = cp;
507
0
    ctx->pending_cpd = cpd;
508
0
  } else {
509
0
    for (i = 0; i < len; i++) {
510
0
      ctx->cp_buffer[ctx->buffer_len] = decomp[i];
511
0
      ctx->cpd_buffer[ctx->buffer_len] = decomp_cpd[i];
512
0
      ctx->buffer_len++;
513
0
    }
514
0
    i_assert(ctx->buffer_len <= buffer_size);
515
0
  }
516
517
  /*
518
   * Apply the Canonical Ordering Algorithm (COA)
519
   */
520
521
0
  bool changed = TRUE;
522
0
  size_t last_qc_y;
523
0
  size_t last_starter;
524
525
0
  while (changed) {
526
0
    changed = FALSE;
527
0
    last_qc_y = 0;
528
0
    last_starter = 0;
529
530
0
    for (i = I_MAX(1, ctx->buffer_output_max);
531
0
         i < ctx->buffer_len; i++) {
532
0
      const struct unicode_code_point_data
533
0
        *cpd_i = ctx->cpd_buffer[i],
534
0
        *cpd_im1 = ctx->cpd_buffer[i - 1];
535
0
      uint8_t ccc_i = cpd_i->canonical_combining_class;
536
0
      uint8_t ccc_im1 = cpd_im1->canonical_combining_class;
537
0
      bool nqc = ((cpd_i->nf_quick_check & nf_qc_mask) == 0);
538
539
0
      if (ccc_i == 0) {
540
0
        last_starter = i;
541
0
        if (nqc)
542
0
          last_qc_y = i;
543
0
      } else if (ccc_im1 > ccc_i) {
544
0
        unicode_nf_buffer_swap(ctx, i - 1, i);
545
0
        changed = TRUE;
546
0
      }
547
0
    }
548
0
  }
549
0
  ctx->buffer_output_max = I_MIN(last_qc_y, last_starter);
550
0
}
551
552
static bool
553
unicode_nf_input_cp(struct unicode_nf_context *ctx, uint32_t cp,
554
        const struct unicode_code_point_data *cpd)
555
0
{
556
0
  static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE;
557
558
0
  i_assert(ctx->buffer_len <= buffer_size);
559
0
  if (ctx->buffer_len == buffer_size ||
560
0
      (ctx->pending_decomp > 0 &&
561
0
       ctx->buffer_len > (buffer_size - ctx->pending_decomp))) {
562
    /* Buffer is (still too) full. */
563
0
    return FALSE;
564
0
  }
565
566
0
  if (ctx->pending_decomp > 0) {
567
    /* Earlier, the buffer was too full for the next decomposition
568
       and it was recorded and marked as pending. Now, we have the
569
       opportunity to continue. */
570
0
    unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd);
571
0
    ctx->pending_decomp = 0;
572
573
0
    i_assert(ctx->buffer_len <= buffer_size);
574
0
    if (ctx->buffer_output_max > 0 &&
575
0
        ctx->buffer_len == buffer_size) {
576
      /* Pending decomposition filled the buffer completely.
577
       */
578
0
      return FALSE;
579
0
    }
580
0
  }
581
582
  /* Normal input of next code point */
583
0
  unicode_nf_cp(ctx, cp, cpd);
584
0
  return TRUE;
585
0
}
586
587
static ssize_t
588
unicode_nf_input(struct unicode_transform *trans,
589
     const struct unicode_transform_buffer *buf,
590
     const char **error_r ATTR_UNUSED)
591
0
{
592
0
  struct unicode_nf_context *ctx =
593
0
    container_of(trans, struct unicode_nf_context, transform);
594
0
  size_t n;
595
596
0
  for (n = 0; n < buf->cp_count; n++) {
597
0
    if (!unicode_nf_input_cp(ctx, buf->cp[n],
598
0
           (buf->cp_data == NULL ?
599
0
            NULL : buf->cp_data[n])))
600
0
      break;
601
0
  }
602
0
  return n;
603
0
}
604
605
/* Compose the pair <l, r>. Hangul composition is tried first; otherwise
   the composition is looked up in the code point data of `l`. `*l_data`
   caches that data: it is looked up and stored when it is still NULL.
   Returns the result of the successful Hangul composition, or else
   whatever unicode_code_point_data_find_composition() returns. */
static uint32_t
unicode_nf_compose_pair(uint32_t l, uint32_t r,
			const struct unicode_code_point_data **l_data)
{
	const uint32_t hangul = unicode_hangul_compose_pair(l, r);

	if (hangul == 0x0000) {
		if (*l_data == NULL)
			*l_data = unicode_code_point_get_data(l);
		return unicode_code_point_data_find_composition(*l_data, r);
	}
	return hangul;
}
618
619
static int
620
unicode_nf_flush_more(struct unicode_nf_context *ctx, bool finished,
621
          const char **error_r)
622
0
{
623
0
  struct unicode_transform *trans = &ctx->transform;
624
625
0
  ctx->finished = finished;
626
627
0
  if (ctx->buffer_len == 0)
628
0
    return 1;
629
0
  if (!finished && ctx->buffer_output_max == 0)
630
0
    return 0;
631
632
  /*
633
   * Apply the Canonical Composition Algorithm
634
   */
635
636
0
  if (ctx->finished)
637
0
    ctx->buffer_output_max = ctx->buffer_len;
638
0
  i_assert(ctx->buffer_processed <= ctx->buffer_output_max);
639
0
  if (ctx->compose && ctx->buffer_len > 1) {
640
0
    size_t in_pos, out_pos, starter;
641
0
    int last_ccc;
642
643
0
    out_pos = 1;
644
0
    last_ccc = -1;
645
0
    starter = 0;
646
0
    for (in_pos = I_MAX(1, ctx->buffer_processed);
647
0
         in_pos < ctx->buffer_output_max; in_pos++) {
648
0
      uint32_t cp = ctx->cp_buffer[in_pos];
649
0
      const struct unicode_code_point_data *cpd =
650
0
        ctx->cpd_buffer[in_pos];
651
652
0
      if (cpd == NULL) {
653
0
        ctx->cpd_buffer[in_pos] = cpd =
654
0
          unicode_code_point_get_data(cp);
655
0
      }
656
657
0
      uint8_t ccc = cpd->canonical_combining_class;
658
0
      uint32_t comp = 0x0000;
659
0
      if (last_ccc < (int)ccc) {
660
0
        comp = unicode_nf_compose_pair(
661
0
          ctx->cp_buffer[starter], cp,
662
0
          &ctx->cpd_buffer[starter]);
663
0
      }
664
0
      if (comp > 0x0000) {
665
0
        ctx->cp_buffer[starter] = comp;
666
0
        ctx->cpd_buffer[starter] = NULL;
667
0
      } else if (ccc == 0) {
668
0
        starter = out_pos;
669
0
        last_ccc = -1;
670
0
        ctx->cp_buffer[out_pos] = cp;
671
0
        ctx->cpd_buffer[out_pos] = cpd;
672
0
        out_pos++;
673
0
      } else {
674
0
        last_ccc = ccc;
675
0
        ctx->cp_buffer[out_pos] = cp;
676
0
        ctx->cpd_buffer[out_pos] = cpd;
677
0
        out_pos++;
678
0
      }
679
0
    }
680
0
    if (finished) {
681
0
      ctx->buffer_len = ctx->buffer_output_max = out_pos;
682
0
    } else if (in_pos > out_pos) {
683
0
      unicode_nf_buffer_delete(ctx, out_pos,
684
0
             (in_pos - out_pos));
685
0
      ctx->buffer_output_max = out_pos;
686
0
    }
687
0
  }
688
0
  ctx->buffer_processed = ctx->buffer_output_max;
689
690
  /*
691
   * Forward output
692
   */
693
694
0
  size_t output_len = ctx->buffer_processed;
695
0
  ssize_t sret;
696
697
0
  sret = uniform_transform_forward(trans, ctx->cp_buffer, ctx->cpd_buffer,
698
0
           output_len, error_r);
699
0
  if (sret < 0)
700
0
    return -1;
701
702
0
  i_assert((size_t)sret <= ctx->buffer_processed);
703
0
  unicode_nf_buffer_delete(ctx, 0, sret);
704
0
  ctx->buffer_processed -= sret;
705
0
  ctx->buffer_output_max -= sret;
706
0
  if ((size_t)sret < output_len)
707
0
    return 0;
708
0
  return 1;
709
0
}
710
711
static int
712
unicode_nf_flush(struct unicode_transform *trans, bool finished,
713
     const char **error_r)
714
0
{
715
0
  struct unicode_nf_context *ctx =
716
0
    container_of(trans, struct unicode_nf_context, transform);
717
0
  int ret;
718
719
0
  ret = unicode_nf_flush_more(ctx, finished, error_r);
720
0
  if (ret <= 0)
721
0
    return ret;
722
723
0
  if (finished && ctx->pending_decomp > 0) {
724
0
    unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd);
725
0
    ctx->pending_decomp = 0;
726
0
  }
727
728
0
  return unicode_nf_flush_more(ctx, finished, error_r);
729
0
}
730
731
/*
732
 * Normalization check
733
 */
734
735
static ssize_t
736
unicode_nf_check_sink_input(struct unicode_transform *trans,
737
          const struct unicode_transform_buffer *buf,
738
          const char **error_r);
739
740
static const struct unicode_transform_def unicode_nf_check_sink_def = {
741
  .input = unicode_nf_check_sink_input,
742
};
743
744
void unicode_nf_checker_init(struct unicode_nf_checker *unc_r,
745
           enum unicode_nf_type type)
746
0
{
747
0
  i_zero(unc_r);
748
749
0
  switch (type) {
750
0
  case UNICODE_NFD:
751
0
    unc_r->canonical = TRUE;
752
0
    unc_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK;
753
0
    unc_r->nf_qc_yes = UNICODE_NFD_QUICK_CHECK_YES;
754
0
    unc_r->nf_qc_no = UNICODE_NFD_QUICK_CHECK_NO;
755
0
    break;
756
0
  case UNICODE_NFKD:
757
0
    unc_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK;
758
0
    unc_r->nf_qc_yes = UNICODE_NFKD_QUICK_CHECK_YES;
759
0
    unc_r->nf_qc_no = UNICODE_NFKD_QUICK_CHECK_NO;
760
0
    break;
761
0
  case UNICODE_NFC:
762
0
    unc_r->compose = TRUE;
763
0
    unc_r->canonical = TRUE;
764
0
    unc_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK;
765
0
    unc_r->nf_qc_yes = UNICODE_NFC_QUICK_CHECK_YES;
766
0
    unc_r->nf_qc_no = UNICODE_NFC_QUICK_CHECK_NO;
767
0
    break;
768
0
  case UNICODE_NFKC:
769
0
    unc_r->compose = TRUE;
770
0
    unc_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK;
771
0
    unc_r->nf_qc_yes = UNICODE_NFKC_QUICK_CHECK_YES;
772
0
    unc_r->nf_qc_no = UNICODE_NFKC_QUICK_CHECK_NO;
773
0
    break;
774
0
  }
775
776
0
  unicode_nf_init(&unc_r->nf, type);
777
0
  unicode_transform_init(&unc_r->sink, &unicode_nf_check_sink_def);
778
0
  unicode_transform_chain(&unc_r->nf.transform, &unc_r->sink);
779
0
}
780
781
void unicode_nf_checker_reset(struct unicode_nf_checker *unc)
782
0
{
783
0
  enum unicode_nf_type type =
784
0
    (unc->compose ? (unc->canonical ? UNICODE_NFC : UNICODE_NFKC) :
785
0
        (unc->canonical ? UNICODE_NFD : UNICODE_NFKD));
786
787
0
  unicode_nf_checker_init(unc, type);
788
0
}
789
790
static ssize_t
791
unicode_nf_check_sink_input(struct unicode_transform *trans,
792
          const struct unicode_transform_buffer *buf,
793
          const char **error_r)
794
0
{
795
0
  struct unicode_nf_checker *unc =
796
0
    container_of(trans, struct unicode_nf_checker, sink);
797
0
  size_t n;
798
799
0
  i_assert(unc->buffer_len > 0);
800
0
  i_assert(buf->cp_count <= unc->buffer_len);
801
0
  for (n = 0; n < buf->cp_count; n++) {
802
0
    if (buf->cp[n] != unc->cp_buffer[n]) {
803
0
      *error_r = "Not normalized";
804
0
      return -1;
805
0
    }
806
0
  }
807
0
  if (buf->cp_count == unc->buffer_len)
808
0
    unc->buffer_len = 0;
809
0
  else {
810
0
    unc->buffer_len -= buf->cp_count;
811
0
    memmove(&unc->cp_buffer[0], &unc->cp_buffer[buf->cp_count],
812
0
      unc->buffer_len);
813
0
  }
814
0
  return buf->cp_count;
815
0
}
816
817
int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp,
818
           const struct unicode_code_point_data **_cp_data)
819
0
{
820
0
  const struct unicode_code_point_data *cpd_last = unc->cpd_last;
821
822
0
  if (*_cp_data == NULL)
823
0
    *_cp_data = unicode_code_point_get_data(cp);
824
825
0
  const struct unicode_code_point_data *cp_data = *_cp_data;
826
0
  const char *error;
827
0
  int ret;
828
829
0
  unc->cpd_last = cp_data;
830
831
0
  if (cp_data->general_category == UNICODE_GENERAL_CATEGORY_INVALID)
832
0
    return -1;
833
0
  if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_no)
834
0
    return 0;
835
0
  if (cpd_last != NULL && cp_data->canonical_combining_class != 0 &&
836
0
      cpd_last->canonical_combining_class >
837
0
    cp_data->canonical_combining_class)
838
0
    return 0;
839
0
  if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_yes &&
840
0
      cp_data->canonical_combining_class == 0) {
841
0
    if (unc->buffer_len > 0) {
842
0
      ret = unicode_transform_flush(&unc->nf.transform,
843
0
                  &error);
844
0
      i_assert(ret != 0);
845
0
      if (ret < 0)
846
0
        return 0;
847
0
      unicode_nf_reset(&unc->nf);
848
0
    }
849
0
    i_assert(unc->buffer_len == 0);
850
0
    unc->cp_buffer[0] = cp;
851
0
    return 1;
852
0
  }
853
854
0
  struct unicode_transform_buffer buf;
855
0
  ssize_t sret;
856
857
0
  if (unc->buffer_len == 0 && cpd_last != NULL) {
858
0
    i_zero(&buf);
859
0
    buf.cp = &unc->cp_buffer[0];
860
0
    buf.cp_data = &cpd_last;
861
0
    buf.cp_count = 1;
862
863
0
    unc->buffer_len++;
864
0
    sret = unicode_transform_input_buf(&unc->nf.transform, &buf,
865
0
               &error);
866
0
    i_assert(sret != 0);
867
0
    if (sret < 0)
868
0
      return 0;
869
0
  }
870
871
0
  i_assert(unc->buffer_len < UNICODE_NF_BUFFER_SIZE);
872
0
  unc->cp_buffer[unc->buffer_len] = cp;
873
0
  unc->buffer_len++;
874
875
0
  i_zero(&buf);
876
0
  buf.cp = &cp;
877
0
  buf.cp_data = &cp_data;
878
0
  buf.cp_count = 1;
879
0
  sret = unicode_transform_input_buf(&unc->nf.transform, &buf, &error);
880
0
  i_assert(sret != 0);
881
0
  if (sret < 0)
882
0
    return 0;
883
0
  return 1;
884
0
}
885
886
int unicode_nf_checker_finish(struct unicode_nf_checker *unc)
887
0
{
888
0
  if (unc->buffer_len == 0)
889
0
    return 1;
890
891
0
  const char *error;
892
0
  int ret;
893
894
0
  ret = unicode_transform_flush(&unc->nf.transform, &error);
895
0
  i_assert(ret != 0);
896
0
  return (ret > 0 ? 1 : 0);
897
0
}
898
899
/*
900
 * Casemap Transform
901
 */
902
903
static size_t
904
unicode_casemap_uppercase_cp(const struct unicode_code_point_data *cp_data,
905
           const uint32_t **map_r);
906
static size_t
907
unicode_casemap_lowercase_cp(const struct unicode_code_point_data *cp_data,
908
           const uint32_t **map_r);
909
static size_t
910
unicode_casemap_casefold_cp(const struct unicode_code_point_data *cp_data,
911
          const uint32_t **map_r);
912
913
static ssize_t
914
unicode_casemap_input(struct unicode_transform *trans,
915
          const struct unicode_transform_buffer *buf,
916
          const char **error_r);
917
static int
918
unicode_casemap_flush(struct unicode_transform *trans, bool finished,
919
          const char **error_r);
920
921
static const struct unicode_transform_def unicode_casemap_def = {
922
  .input = unicode_casemap_input,
923
  .flush = unicode_casemap_flush,
924
};
925
926
void unicode_casemap_init_uppercase(struct unicode_casemap *map_r)
927
0
{
928
0
  i_zero(map_r);
929
0
  unicode_transform_init(&map_r->transform, &unicode_casemap_def);
930
0
  map_r->map = unicode_casemap_uppercase_cp;
931
0
}
932
933
void unicode_casemap_init_lowercase(struct unicode_casemap *map_r)
934
0
{
935
0
  i_zero(map_r);
936
0
  unicode_transform_init(&map_r->transform, &unicode_casemap_def);
937
0
  map_r->map = unicode_casemap_lowercase_cp;
938
0
}
939
940
void unicode_casemap_init_casefold(struct unicode_casemap *map_r)
941
0
{
942
0
  i_zero(map_r);
943
0
  unicode_transform_init(&map_r->transform, &unicode_casemap_def);
944
0
  map_r->map = unicode_casemap_casefold_cp;
945
0
}
946
947
/* Mapping callback for uppercase: returns the length of the uppercase
   mapping of `cp_data` as reported by the code point data API and
   stores the mapping in *map_r. */
static size_t
unicode_casemap_uppercase_cp(const struct unicode_code_point_data *cp_data,
			     const uint32_t **map_r)
{
	size_t map_len;

	map_len = unicode_code_point_data_get_uppercase_mapping(cp_data,
								map_r);
	return map_len;
}
953
954
/* Mapping callback for lowercase: returns the length of the lowercase
   mapping of `cp_data` as reported by the code point data API and
   stores the mapping in *map_r. */
static size_t
unicode_casemap_lowercase_cp(const struct unicode_code_point_data *cp_data,
			     const uint32_t **map_r)
{
	size_t map_len;

	map_len = unicode_code_point_data_get_lowercase_mapping(cp_data,
								map_r);
	return map_len;
}
960
961
/* Mapping callback for case folding: returns the length of the case
   folding mapping of `cp_data` as reported by the code point data API
   and stores the mapping in *map_r. */
static size_t
unicode_casemap_casefold_cp(const struct unicode_code_point_data *cp_data,
			    const uint32_t **map_r)
{
	size_t map_len;

	map_len = unicode_code_point_data_get_casefold_mapping(cp_data,
							       map_r);
	return map_len;
}
967
968
static ssize_t
969
unicode_casemap_input_cp(struct unicode_casemap *map, uint32_t cp,
970
       const struct unicode_code_point_data *cp_data,
971
       const char **error_r)
972
0
{
973
0
  bool was_buffered = map->cp_buffered;
974
0
  ssize_t sret;
975
976
0
  if (cp_data == NULL)
977
0
    cp_data = unicode_code_point_get_data(cp);
978
979
0
  const uint32_t *map_cps;
980
0
  const struct unicode_code_point_data *const *map_cps_data = NULL;
981
0
  size_t map_cps_len;
982
983
0
  map_cps_len = map->map(cp_data, &map_cps);
984
0
  if (map_cps_len == 0) {
985
0
    map_cps = &cp;
986
0
    map_cps_data = &cp_data;
987
0
    map_cps_len = 1;
988
0
  }
989
0
  i_assert(map_cps_len > map->cp_map_pos);
990
991
0
  map_cps += map->cp_map_pos;
992
0
  map_cps_len -= map->cp_map_pos;
993
0
  sret = uniform_transform_forward(&map->transform,
994
0
           map_cps, map_cps_data, map_cps_len,
995
0
           error_r);
996
0
  if (sret < 0) {
997
0
    i_assert(*error_r != NULL);
998
0
    return -1;
999
0
  }
1000
0
  if ((size_t)sret < map_cps_len) {
1001
0
    map->cp_buffered = TRUE;
1002
0
    map->cp = cp;
1003
0
    map->cp_data = cp_data;
1004
0
    map->cp_map_pos += sret;
1005
0
    return (was_buffered ? 0 : 1);
1006
0
  }
1007
1008
0
  map->cp_buffered = FALSE;
1009
0
  map->cp_data = NULL;
1010
0
  map->cp_map_pos = 0;
1011
0
  return 1;
1012
0
}
1013
1014
static ssize_t
1015
unicode_casemap_input(struct unicode_transform *trans,
1016
          const struct unicode_transform_buffer *buf,
1017
          const char **error_r)
1018
0
{
1019
0
  struct unicode_casemap *map =
1020
0
    container_of(trans, struct unicode_casemap, transform);
1021
0
  int ret;
1022
1023
0
  ret = unicode_casemap_flush(trans, TRUE, error_r);
1024
0
  if (ret < 0) {
1025
0
    i_assert(*error_r != NULL);
1026
0
    return -1;
1027
0
  }
1028
0
  if (map->cp_buffered)
1029
0
    return 0;
1030
1031
0
  size_t n;
1032
0
  for (n = 0; n < buf->cp_count; n++) {
1033
0
    if (map->cp_buffered)
1034
0
      break;
1035
0
    ret = unicode_casemap_input_cp(map, buf->cp[n],
1036
0
                 (buf->cp_data != NULL ?
1037
0
                  buf->cp_data[n] : NULL),
1038
0
                 error_r);
1039
0
    if (ret < 0) {
1040
0
      i_assert(*error_r != NULL);
1041
0
      return -1;
1042
0
    }
1043
0
    if (ret == 0)
1044
0
      break;
1045
0
  }
1046
0
  return n;
1047
0
}
1048
1049
static int
1050
unicode_casemap_flush(struct unicode_transform *trans,
1051
          bool finished ATTR_UNUSED, const char **error_r)
1052
0
{
1053
0
  struct unicode_casemap *map =
1054
0
    container_of(trans, struct unicode_casemap, transform);
1055
0
  int ret;
1056
1057
0
  if (!map->cp_buffered)
1058
0
    return 1;
1059
1060
0
  ret = unicode_casemap_input_cp(map, map->cp, map->cp_data, error_r);
1061
0
  i_assert(ret >= 0 || *error_r != NULL);
1062
0
  return ret;
1063
0
}
1064
1065
/*
1066
 * RFC 5051 - Simple Unicode Collation Algorithm
1067
 */
1068
1069
/* Prepare `ctx` for use with unicode_rfc5051_normalize() by zeroing
   it. */
void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx)
{
	i_zero(ctx);
}
1073
1074
size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
1075
         uint32_t cp, const uint32_t **norm_r)
1076
0
{
1077
0
  const struct unicode_code_point_data *cpd;
1078
0
  size_t len;
1079
1080
0
  cpd = unicode_code_point_get_data(cp);
1081
0
  if (cpd->simple_titlecase_mapping != 0x0000)
1082
0
    cp = cpd->simple_titlecase_mapping;
1083
1084
0
  if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) {
1085
0
    *norm_r = ctx->buffer;
1086
0
    return unicode_hangul_decompose(cp, ctx->buffer);
1087
0
  }
1088
1089
0
  len = unicode_code_point_get_full_decomposition(cp, FALSE, norm_r);
1090
0
  if (len == 0) {
1091
0
    ctx->buffer[0] = cp;
1092
0
    *norm_r = ctx->buffer;
1093
0
    return 1;
1094
0
  }
1095
0
  return len;
1096
0
}