/src/dovecot/src/lib/unicode-transform.h
Line | Count | Source |
1 | | #ifndef UNICODE_NF_H |
2 | | #define UNICODE_NF_H |
3 | | |
/* Maximum number of consecutive non-starter code points handled as one run.
   NOTE(review): 30 appears to be the Stream-Safe Text Format limit from
   UAX #15; this is inferred from the macro name - confirm. */
#define UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN 30
/* Element count of the fixed-size code point buffers declared in this header:
   the non-starter limit plus two additional slots. */
#define UNICODE_NF_BUFFER_SIZE (UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN + 2)
6 | | |
7 | | struct unicode_code_point_data; |
8 | | |
9 | | /* |
10 | | * Transform API |
11 | | */ |
12 | | |
13 | | struct unicode_transform; |
14 | | |
/* A run of code points as handed to a transform's input callback. */
struct unicode_transform_buffer {
	/* Code point values */
	const uint32_t *cp;
	/* Per-code-point data pointers. unicode_transform_input() leaves this
	   NULL, so consumers presumably must cope with it being absent
	   (TODO confirm against the implementation). */
	const struct unicode_code_point_data *const *cp_data;
	/* Number of code points in this run */
	size_t cp_count;
};
20 | | |
/* Callback table describing one kind of transform. */
struct unicode_transform_def {
	/* Consumes code points from buf.
	   NOTE(review): the meaning of the ssize_t result is not visible in
	   this header; presumably a count on success and negative on failure
	   with *error_r set - confirm against the implementation. */
	ssize_t (*input)(struct unicode_transform *trans,
			 const struct unicode_transform_buffer *buf,
			 const char **error_r);
	/* Flushes buffered state; finished tells the callback whether the
	   input has ended (presumably - confirm). *error_r is the error
	   output. */
	int (*flush)(struct unicode_transform *trans, bool finished,
		     const char **error_r);
};
28 | | |
/* Generic transform header. The concrete transforms and sinks declared in
   this header embed it as a member named 'transform'. */
struct unicode_transform {
	/* Callback table for this transform */
	const struct unicode_transform_def *def;
	/* Next transform in the chain, or NULL when this is the last one */
	struct unicode_transform *next;
};
33 | | |
34 | | static inline void |
35 | | unicode_transform_init(struct unicode_transform *trans, |
36 | | const struct unicode_transform_def *def) |
37 | 0 | { |
38 | 0 | i_zero(trans); |
39 | 0 | trans->def = def; |
40 | 0 | } Unexecuted instantiation: unichar.c:unicode_transform_init Unexecuted instantiation: unicode-transform.c:unicode_transform_init |
41 | | |
42 | | static inline void |
43 | | unicode_transform_chain(struct unicode_transform *trans, |
44 | | struct unicode_transform *next) |
45 | 0 | { |
46 | 0 | i_assert(trans->next == NULL); |
47 | 0 | trans->next = next; |
48 | 0 | } Unexecuted instantiation: unichar.c:unicode_transform_chain Unexecuted instantiation: unicode-transform.c:unicode_transform_chain |
49 | | |
50 | | static inline struct unicode_transform * |
51 | | unicode_transform_get_last(struct unicode_transform *trans) |
52 | 0 | { |
53 | 0 | while (trans->next != NULL) |
54 | 0 | trans = trans->next; |
55 | 0 | return trans; |
56 | 0 | } Unexecuted instantiation: unichar.c:unicode_transform_get_last Unexecuted instantiation: unicode-transform.c:unicode_transform_get_last |
57 | | |
/* Pass out_len code points (values in out, data pointers in out_data) on from
   trans.
   NOTE(review): the implementation is not visible here; presumably this
   feeds trans->next and returns a count, or a negative value with *error_r
   set - confirm.
   NOTE(review): the name is spelled "uniform_" while the rest of this API
   uses "unicode_"; it is kept as-is because it is a public symbol. */
ssize_t
uniform_transform_forward(struct unicode_transform *trans,
			  const uint32_t *out,
			  const struct unicode_code_point_data *const *out_data,
			  size_t out_len, const char **error_r);
62 | | |
63 | | ssize_t unicode_transform_input_buf(struct unicode_transform *trans, |
64 | | const struct unicode_transform_buffer *buf, |
65 | | const char **error_r); |
66 | | static inline ssize_t |
67 | | unicode_transform_input(struct unicode_transform *trans, |
68 | | const uint32_t *in, size_t in_len, const char **error_r) |
69 | 0 | { |
70 | 0 | struct unicode_transform_buffer buf = { |
71 | 0 | .cp = in, |
72 | 0 | .cp_count = in_len, |
73 | 0 | }; |
74 | |
|
75 | 0 | return unicode_transform_input_buf(trans, &buf, error_r); |
76 | 0 | } Unexecuted instantiation: unichar.c:unicode_transform_input Unexecuted instantiation: unicode-transform.c:unicode_transform_input |
77 | | |
/* Flush trans; *error_r is the error output.
   NOTE(review): whether this also flushes the transforms chained after trans,
   and the exact return convention, are not visible in this header -
   confirm against the implementation. */
int
unicode_transform_flush(struct unicode_transform *trans, const char **error_r);
80 | | |
81 | | /* Buffer Sink */ |
82 | | |
83 | | struct unicode_buffer_sink { |
84 | | struct unicode_transform transform; |
85 | | buffer_t *buffer; |
86 | | }; |
87 | | |
88 | | void unicode_buffer_sink_init(struct unicode_buffer_sink *sink, |
89 | | buffer_t *buffer); |
90 | | |
91 | | /* Static Array Sink */ |
92 | | |
93 | | struct unicode_static_array_sink { |
94 | | struct unicode_transform transform; |
95 | | uint32_t *array; |
96 | | size_t array_size; |
97 | | size_t *array_pos; |
98 | | }; |
99 | | |
100 | | void unicode_static_array_sink_init(struct unicode_static_array_sink *sink, |
101 | | uint32_t *array, size_t array_size, |
102 | | size_t *array_pos); |
103 | | |
104 | | /* |
105 | | * NFD, NFKD, NFC, NFKC |
106 | | */ |
107 | | |
108 | | /* Unicode Standard Annex #15, Section 1.2: |
109 | | |
110 | | Unicode Normalization Forms are formally defined normalizations of Unicode |
111 | | strings which make it possible to determine whether any two Unicode strings |
112 | | are equivalent to each other. Depending on the particular Unicode |
113 | | Normalization Form, that equivalence can either be a canonical equivalence or |
114 | | a compatibility equivalence. |
115 | | |
116 | | Essentially, the Unicode Normalization Algorithm puts all combining marks in |
117 | | a specified order, and uses rules for decomposition and composition to |
118 | | transform each string into one of the Unicode Normalization Forms. A binary |
119 | | comparison of the transformed strings will then determine equivalence. |
120 | | |
121 | | The four Unicode Normalization Forms are summarized as follows: |
122 | | |
123 | | Normalization Form D (NFD) - Canonical Decomposition |
124 | | Normalization Form KD (NFKD) - Compatibility Decomposition |
125 | | Normalization Form C (NFC) - Canonical Decomposition, followed by |
126 | | Canonical Composition |
127 | | Normalization Form KC (NFKC) - Compatibility Decomposition, followed by |
128 | | Canonical Composition |
129 | | |
130 | | There are two forms of normalization that convert to composite characters: |
131 | | Normalization Form C and Normalization Form KC. The difference between these |
132 | | depends on whether the resulting text is to be a canonical equivalent to the |
133 | | original unnormalized text or a compatibility equivalent to the original |
134 | | unnormalized text. (In NFKC and NFKD, a K is used to stand for compatibility |
135 | | to avoid confusion with the C standing for composition.) Both types of |
136 | | normalization can be useful in different circumstances. |
137 | | */ |
138 | | |
/* The four Unicode Normalization Forms (see the UAX #15 summary above). */
enum unicode_nf_type {
	/* Canonical Decomposition */
	UNICODE_NFD = 0,
	/* Compatibility Decomposition */
	UNICODE_NFKD = 1,
	/* Canonical Decomposition, followed by Canonical Composition */
	UNICODE_NFC = 2,
	/* Compatibility Decomposition, followed by Canonical Composition */
	UNICODE_NFKC = 3,
};
145 | | |
146 | | struct unicode_nf_context { |
147 | | struct unicode_transform transform; |
148 | | |
149 | | size_t nonstarter_count; |
150 | | uint32_t cp_buffer[UNICODE_NF_BUFFER_SIZE]; |
151 | | const struct unicode_code_point_data * |
152 | | cpd_buffer[UNICODE_NF_BUFFER_SIZE]; |
153 | | size_t buffer_len, buffer_processed, buffer_output_max; |
154 | | |
155 | | size_t pending_decomp; |
156 | | uint32_t pending_cp; |
157 | | const struct unicode_code_point_data *pending_cpd; |
158 | | |
159 | | uint8_t nf_qc_mask; |
160 | | |
161 | | bool compose:1; |
162 | | bool canonical:1; |
163 | | bool finished:1; |
164 | | }; |
165 | | |
166 | | void unicode_nf_init(struct unicode_nf_context *ctx_r, |
167 | | enum unicode_nf_type type); |
168 | | void unicode_nf_reset(struct unicode_nf_context *ctx); |
169 | | |
170 | | /* |
171 | | * Normalization check |
172 | | */ |
173 | | |
174 | | struct unicode_nf_checker { |
175 | | const struct unicode_code_point_data *cpd_last; |
176 | | |
177 | | uint8_t nf_qc_mask; |
178 | | uint8_t nf_qc_yes; |
179 | | uint8_t nf_qc_no; |
180 | | |
181 | | uint32_t cp_buffer[UNICODE_NF_BUFFER_SIZE]; |
182 | | size_t buffer_len; |
183 | | struct unicode_nf_context nf; |
184 | | struct unicode_transform sink; |
185 | | |
186 | | bool not_first_cp; |
187 | | bool compose:1; |
188 | | bool canonical:1; |
189 | | }; |
190 | | |
191 | | void unicode_nf_checker_init(struct unicode_nf_checker *unc_r, |
192 | | enum unicode_nf_type type); |
193 | | void unicode_nf_checker_reset(struct unicode_nf_checker *unc); |
194 | | |
195 | | int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp, |
196 | | const struct unicode_code_point_data **cp_data); |
197 | | int unicode_nf_checker_finish(struct unicode_nf_checker *unc); |
198 | | |
199 | | /* |
200 | | * Casemap Transform |
201 | | */ |
202 | | |
203 | | struct unicode_casemap { |
204 | | struct unicode_transform transform; |
205 | | |
206 | | size_t (*map)(const struct unicode_code_point_data *cp_data, |
207 | | const uint32_t **map_r); |
208 | | |
209 | | uint32_t cp; |
210 | | const struct unicode_code_point_data *cp_data; |
211 | | unsigned int cp_map_pos; |
212 | | |
213 | | bool cp_buffered:1; |
214 | | }; |
215 | | |
216 | | void unicode_casemap_init_uppercase(struct unicode_casemap *map); |
217 | | void unicode_casemap_init_lowercase(struct unicode_casemap *map); |
218 | | void unicode_casemap_init_casefold(struct unicode_casemap *map); |
219 | | |
220 | | /* |
221 | | * RFC 5051 - Simple Unicode Collation Algorithm |
222 | | */ |
223 | | |
/* Scratch state for RFC 5051 normalization: room for three code points. */
struct unicode_rfc5051_context {
	uint32_t buffer[3];
};

/* Initialize ctx. */
void
unicode_rfc5051_init(struct unicode_rfc5051_context *ctx);
/* Normalize the single code point cp. Returns a size_t and stores a pointer
   to uint32_t values in *norm_r.
   NOTE(review): presumably the result length and a pointer into ctx->buffer
   or static data - confirm against the implementation. */
size_t
unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx, uint32_t cp,
			  const uint32_t **norm_r);
231 | | |
232 | | #endif |