Coverage Report

Created: 2026-03-11 06:16

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/dovecot/src/lib/unicode-transform.h
Line
Count
Source
1
#ifndef UNICODE_NF_H
2
#define UNICODE_NF_H
3
4
0
/* Limit on the length of a run of non-starter code points.
   NOTE(review): presumably the 30 non-starter limit of the UAX #15
   Stream-Safe Text Format - confirm against the implementation. */
#define UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN 30
5
0
/* Element count of the fixed code point buffers declared in this header
   (cp_buffer/cpd_buffer in struct unicode_nf_context, cp_buffer in
   struct unicode_nf_checker): the non-starter limit plus two extra slots.
   NOTE(review): what the two extra slots hold is not visible here. */
#define UNICODE_NF_BUFFER_SIZE (UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN + 2)
6
7
/* Forward declaration: this header only ever uses pointers to this type, so
   its full definition is not needed here. */
struct unicode_code_point_data;
8
9
/*
10
 * Transform API
11
 */
12
13
/* Forward declaration so that callback signatures can take a
   struct unicode_transform pointer before the struct itself is defined. */
struct unicode_transform;
14
15
/* Read-only view of a run of code points handed to a transform's input. */
struct unicode_transform_buffer {
16
  /* The code points themselves */
  const uint32_t *cp;
17
  /* Pointers to per-code-point data. May be NULL: unicode_transform_input()
     leaves it unset. NOTE(review): presumably parallel to cp[] when
     provided - confirm. */
  const struct unicode_code_point_data *const *cp_data;
18
  /* Number of code points (unicode_transform_input() stores its in_len
     here) */
  size_t cp_count;
19
};
20
21
/* Callback table that defines the behavior of a transform. A transform
   refers to its table through unicode_transform.def. */
struct unicode_transform_def {
22
  /* Called with a buffer of input code points for the transform `trans'.
     NOTE(review): presumably returns the number of code points consumed, or
     a negative value with *error_r set on failure - confirm against the
     implementation. */
  ssize_t (*input)(struct unicode_transform *trans,
23
       const struct unicode_transform_buffer *buf,
24
       const char **error_r);
25
  /* Called to flush the transform `trans'; `finished' is supplied by the
     caller. NOTE(review): return convention and the exact meaning of
     `finished' are not visible in this header. */
  int (*flush)(struct unicode_transform *trans, bool finished,
26
         const char **error_r);
27
};
28
29
/* Base object of a transform. The concrete transform and sink structs in this
   header embed it as a member. Transforms are linked into a singly-linked
   chain through `next'. */
struct unicode_transform {
30
  /* Callback table; set by unicode_transform_init() */
  const struct unicode_transform_def *def;
31
  /* Following transform in the chain, or NULL when this is the last one;
     set by unicode_transform_chain() */
  struct unicode_transform *next;
32
};
33
34
/* Initialize the transform base object `trans' and bind it to the callback
   table `def'. The object is passed through i_zero() first (presumably
   zero-filling it, which leaves `next' NULL), so the statement order matters:
   the `def' assignment must come after it. */
static inline void
35
unicode_transform_init(struct unicode_transform *trans,
36
           const struct unicode_transform_def *def)
37
0
{
38
0
  i_zero(trans);
39
0
  trans->def = def;
40
0
}
Unexecuted instantiation: unichar.c:unicode_transform_init
Unexecuted instantiation: unicode-transform.c:unicode_transform_init
41
42
/* Link `next' directly after `trans' in the transform chain. `trans' must not
   already have a successor; this is enforced with i_assert(). To append to
   the end of a longer chain, pass the result of
   unicode_transform_get_last() as `trans'. */
static inline void
43
unicode_transform_chain(struct unicode_transform *trans,
44
      struct unicode_transform *next)
45
0
{
46
0
  /* Refuse to silently overwrite (and thereby drop) an existing link */
  i_assert(trans->next == NULL);
47
0
  trans->next = next;
48
0
}
Unexecuted instantiation: unichar.c:unicode_transform_chain
Unexecuted instantiation: unicode-transform.c:unicode_transform_chain
49
50
static inline struct unicode_transform *
51
unicode_transform_get_last(struct unicode_transform *trans)
52
0
{
53
0
  while (trans->next != NULL)
54
0
    trans = trans->next;
55
0
  return trans;
56
0
}
Unexecuted instantiation: unichar.c:unicode_transform_get_last
Unexecuted instantiation: unicode-transform.c:unicode_transform_get_last
57
58
/* Takes the code points out[0..out_len-1], with optional per-code-point data
   in out_data, on behalf of the transform `trans'.
   NOTE(review): judging by the name this forwards a transform's output to
   the next transform in the chain; the return convention (presumably count
   forwarded, negative on error with *error_r set) must be confirmed against
   the implementation.
   NOTE(review): the name starts with "uniform_" while every other function
   in this header uses "unicode_" - looks like a typo. Renaming it would
   require changing the definition and all callers as well. */
ssize_t uniform_transform_forward(
59
  struct unicode_transform *trans, const uint32_t *out,
60
  const struct unicode_code_point_data *const *out_data, size_t out_len,
61
  const char **error_r);
62
63
/* Feed the code points described by `buf' into the transform `trans'.
   unicode_transform_input() is a wrapper for callers that only have a plain
   code point array.
   NOTE(review): presumably returns the number of code points accepted, or a
   negative value with *error_r set on failure - confirm against the
   implementation. */
ssize_t unicode_transform_input_buf(struct unicode_transform *trans,
64
            const struct unicode_transform_buffer *buf,
65
            const char **error_r);
66
static inline ssize_t
67
unicode_transform_input(struct unicode_transform *trans,
68
      const uint32_t *in, size_t in_len, const char **error_r)
69
0
{
70
0
  struct unicode_transform_buffer buf = {
71
0
    .cp = in,
72
0
    .cp_count = in_len,
73
0
  };
74
75
0
  return unicode_transform_input_buf(trans, &buf, error_r);
76
0
}
Unexecuted instantiation: unichar.c:unicode_transform_input
Unexecuted instantiation: unicode-transform.c:unicode_transform_input
77
78
/* Flush the transform `trans'.
   NOTE(review): presumably this also flushes the transforms chained after
   it and returns a negative value with *error_r set on failure - confirm
   against the implementation. */
int unicode_transform_flush(struct unicode_transform *trans,
79
          const char **error_r);
80
81
/* Buffer Sink */
82
83
/* Transform that is backed by a buffer_t.
   NOTE(review): judging by the name, this is a chain end point that writes
   what it receives into `buffer'; the stored encoding is not visible here. */
struct unicode_buffer_sink {
84
  /* Embedded transform base object */
  struct unicode_transform transform;
85
  /* Buffer passed to unicode_buffer_sink_init() */
  buffer_t *buffer;
86
};
87
88
/* Initialize `sink' for use with `buffer'.
   NOTE(review): ownership of `buffer' is not visible here; presumably it
   remains with the caller and must outlive the sink - confirm. */
void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
89
            buffer_t *buffer);
90
91
/* Static Array Sink */
92
93
/* Transform that is backed by a caller-provided uint32_t array.
   NOTE(review): judging by the name, this is a chain end point that stores
   the code points it receives into `array'; the behavior when the array is
   full is not visible here. */
struct unicode_static_array_sink {
94
  /* Embedded transform base object */
  struct unicode_transform transform;
95
  /* Destination array */
  uint32_t *array;
96
  /* Size of `array' (presumably counted in elements - confirm) */
  size_t array_size;
97
  /* Points to a position variable outside this struct (presumably the
     current fill position in `array', kept visible to the caller -
     confirm) */
  size_t *array_pos;
98
};
99
100
/* Initialize `sink' for use with the array `array' of size `array_size' and
   the position variable `array_pos'.
   NOTE(review): whether *array_pos is reset by this call or must be
   initialized by the caller is not visible here. */
void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
101
            uint32_t *array, size_t array_size,
102
            size_t *array_pos);
103
104
/*
105
 * NFD, NFKD, NFC, NFKC
106
 */
107
108
/* Unicode Standard Annex #15, Section 1.2:
109
110
   Unicode Normalization Forms are formally defined normalizations of Unicode
111
   strings which make it possible to determine whether any two Unicode strings
112
   are equivalent to each other. Depending on the particular Unicode
113
   Normalization Form, that equivalence can either be a canonical equivalence or
114
   a compatibility equivalence.
115
116
   Essentially, the Unicode Normalization Algorithm puts all combining marks in
117
   a specified order, and uses rules for decomposition and composition to
118
   transform each string into one of the Unicode Normalization Forms. A binary
119
   comparison of the transformed strings will then determine equivalence.
120
121
   The four Unicode Normalization Forms are summarized as follows:
122
123
     Normalization Form D  (NFD)   - Canonical Decomposition
124
     Normalization Form KD (NFKD)  - Compatibility Decomposition
125
     Normalization Form C  (NFC)   - Canonical Decomposition, followed by
126
                                     Canonical Composition
127
     Normalization Form KC (NFKC)  - Compatibility Decomposition, followed by
128
                                     Canonical Composition
129
130
   There are two forms of normalization that convert to composite characters:
131
   Normalization Form C and Normalization Form KC. The difference between these
132
   depends on whether the resulting text is to be a canonical equivalent to the
133
   original unnormalized text or a compatibility equivalent to the original
134
   unnormalized text. (In NFKC and NFKD, a K is used to stand for compatibility
135
   to avoid confusion with the C standing for composition.) Both types of
136
   normalization can be useful in different circumstances.
137
 */
138
139
/* Selects one of the four Unicode Normalization Forms (UAX #15). */
enum unicode_nf_type {
140
  /* Normalization Form D: canonical decomposition */
  UNICODE_NFD,
141
  /* Normalization Form KD: compatibility decomposition */
  UNICODE_NFKD,
142
  /* Normalization Form C: canonical decomposition followed by canonical
     composition */
  UNICODE_NFC,
143
  /* Normalization Form KC: compatibility decomposition followed by
     canonical composition */
  UNICODE_NFKC,
144
};
145
146
/* State of a normalization transform. Set up with unicode_nf_init().
   The field descriptions marked "presumably" are inferred from the names and
   need to be confirmed against the implementation. */
struct unicode_nf_context {
147
  /* Embedded transform base object */
  struct unicode_transform transform;
148
149
  /* Presumably the length of the current run of non-starters (cf.
     UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN) */
  size_t nonstarter_count;
150
  /* Fixed-size buffer of code points */
  uint32_t cp_buffer[UNICODE_NF_BUFFER_SIZE];
151
  /* Fixed-size buffer of code point data pointers; same element count as
     cp_buffer (presumably entry i describes cp_buffer[i]) */
  const struct unicode_code_point_data *
152
    cpd_buffer[UNICODE_NF_BUFFER_SIZE];
153
  /* Bookkeeping for the buffers above (presumably: number of entries used,
     number already processed, and the limit up to which output is
     allowed) */
  size_t buffer_len, buffer_processed, buffer_output_max;
154
155
  /* Presumably a code point (with its data) whose decomposition has not
     been fully handled yet, and the progress made within it */
  size_t pending_decomp;
156
  uint32_t pending_cp;
157
  const struct unicode_code_point_data *pending_cpd;
158
159
  /* Presumably a bit mask selecting the quick-check bits relevant to the
     chosen normalization form */
  uint8_t nf_qc_mask;
160
161
  /* Single-bit flags. Presumably: composing form (NFC/NFKC), canonical
     rather than compatibility mapping, and input finished */
  bool compose:1;
162
  bool canonical:1;
163
  bool finished:1;
164
};
165
166
/* Initialize the normalization context `ctx_r' for the normalization form
   selected by `type'. */
void unicode_nf_init(struct unicode_nf_context *ctx_r,
167
         enum unicode_nf_type type);
168
/* Reset the normalization context `ctx'.
   NOTE(review): presumably returns it to the state right after
   unicode_nf_init() while keeping the configured form - confirm. */
void unicode_nf_reset(struct unicode_nf_context *ctx);
169
170
/*
171
 * Normalization check
172
 */
173
174
/* State of a normalization checker. Set up with unicode_nf_checker_init() and
   fed one code point at a time through unicode_nf_checker_input().
   The field descriptions marked "presumably" are inferred from the names and
   need to be confirmed against the implementation. */
struct unicode_nf_checker {
175
  /* Presumably the data of the most recently seen code point */
  const struct unicode_code_point_data *cpd_last;
176
177
  /* Presumably quick-check bit mask and the values within it that mean
     "yes" and "no" for the chosen normalization form */
  uint8_t nf_qc_mask;
178
  uint8_t nf_qc_yes;
179
  uint8_t nf_qc_no;
180
181
  /* Fixed-size buffer of code points and the number of entries in use */
  uint32_t cp_buffer[UNICODE_NF_BUFFER_SIZE];
182
  size_t buffer_len;
183
  /* Embedded normalization context and transform (presumably used to
     normalize buffered input and capture the result for comparison) */
  struct unicode_nf_context nf;
184
  struct unicode_transform sink;
185
186
  /* NOTE(review): unlike the two flags below this is a plain bool rather
     than a single-bit field - confirm whether that is intentional */
  bool not_first_cp;
187
  bool compose:1;
188
  bool canonical:1;
189
};
190
191
/* Initialize the normalization checker `unc_r' for the normalization form
   selected by `type'. */
void unicode_nf_checker_init(struct unicode_nf_checker *unc_r,
192
           enum unicode_nf_type type);
193
/* Reset the normalization checker `unc'.
   NOTE(review): presumably makes it ready to check a new string while
   keeping the configured form - confirm. */
void unicode_nf_checker_reset(struct unicode_nf_checker *unc);
194
195
/* Feed the next code point `cp' into the checker `unc'. `cp_data' is a
   pointer to a code point data pointer.
   NOTE(review): whether *cp_data is an input, an output, or a lazily filled
   cache is not visible here, nor is the meaning of the return value
   (presumably it distinguishes "normalized so far" from "not normalized") -
   confirm against the implementation. */
int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp,
196
           const struct unicode_code_point_data **cp_data);
197
/* Finish a check on `unc' after all code points were passed to
   unicode_nf_checker_input().
   NOTE(review): return value semantics are not visible here. */
int unicode_nf_checker_finish(struct unicode_nf_checker *unc);
198
199
/*
200
 * Casemap Transform
201
 */
202
203
/* State of a case mapping transform. Set up with one of the
   unicode_casemap_init_*() functions. */
struct unicode_casemap {
204
  /* Embedded transform base object */
  struct unicode_transform transform;
205
206
  /* Mapping function for this transform. It receives the data of a code
     point and returns a size_t, handing back a code point array through
     *map_r (presumably the mapped sequence and its length - confirm). */
  size_t (*map)(const struct unicode_code_point_data *cp_data,
207
          const uint32_t **map_r);
208
209
  /* A code point with its data, plus a position (presumably the code point
     currently being mapped and how much of its mapping was already
     emitted - confirm) */
  uint32_t cp;
210
  const struct unicode_code_point_data *cp_data;
211
  unsigned int cp_map_pos;
212
213
  /* Single-bit flag (presumably set while the fields above hold a code
     point that is not fully handled yet - confirm) */
  bool cp_buffered:1;
214
};
215
216
/* Initialize `map' as a case mapping transform. The three variants select
   the mapping by name: uppercase, lowercase and case folding. */
void unicode_casemap_init_uppercase(struct unicode_casemap *map);
217
void unicode_casemap_init_lowercase(struct unicode_casemap *map);
218
void unicode_casemap_init_casefold(struct unicode_casemap *map);
219
220
/*
221
 * RFC 5051 - Simple Unicode Collation Algorithm
222
 */
223
224
/* Context for unicode_rfc5051_normalize(). */
struct unicode_rfc5051_context {
225
  /* Storage for up to three code points (presumably holds the normalized
     output that unicode_rfc5051_normalize() hands back through norm_r -
     confirm) */
  uint32_t buffer[3];
226
};
227
228
/* Initialize the RFC 5051 context `ctx'. */
void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx);
229
/* Normalize the single code point `cp' using `ctx'. A code point array is
   handed back through *norm_r and a size_t is returned.
   NOTE(review): presumably the return value is the number of code points at
   *norm_r, and that array may live inside `ctx' (making it valid only until
   the next call) - confirm against the implementation. */
size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
230
         uint32_t cp, const uint32_t **norm_r);
231
232
#endif