Coverage Report

Created: 2026-01-25 07:08

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/dovecot/src/lib/unichar.h
Line
Count
Source
1
#ifndef UNICHAR_H
2
#define UNICHAR_H
3
4
#include "unicode-break.h"
5
6
/* Character used to replace invalid input. */
7
0
#define UNICODE_REPLACEMENT_CHAR 0xfffd
#define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
/* Length of the UTF-8 encoded replacement character in bytes, excluding the
   terminating NUL. The expansion deliberately has no trailing semicolon, so
   the macro can be used inside expressions, function arguments and array
   bounds. */
#define UNICODE_REPLACEMENT_CHAR_UTF8_LEN \
  (sizeof(UNICODE_REPLACEMENT_CHAR_UTF8) - 1)
/* Horizontal ellipsis character ('...') */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR 0x2026
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8 "\xE2\x80\xA6"
/* Length of the UTF-8 encoded ellipsis in bytes, excluding the terminating
   NUL. No trailing semicolon, for the same reason as above. */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8_LEN \
  (sizeof(UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8) - 1)
16
17
/* Characters >= base require surrogates */
18
#define UTF16_SURROGATE_BASE 0x10000

#define UTF16_SURROGATE_SHIFT 10
#define UTF16_SURROGATE_MASK 0x03ff
#define UTF16_SURROGATE_HIGH_FIRST 0xd800
#define UTF16_SURROGATE_HIGH_LAST 0xdbff
#define UTF16_SURROGATE_HIGH_MAX 0xdfff
#define UTF16_SURROGATE_LOW_FIRST 0xdc00
#define UTF16_SURROGATE_LOW_LAST 0xdfff

/* High (leading) surrogate for a code point >= UTF16_SURROGATE_BASE: the bits
   of (chr - base) above the low UTF16_SURROGATE_SHIFT bits, offset into the
   high surrogate range. */
#define UTF16_SURROGATE_HIGH(chr) \
  (UTF16_SURROGATE_HIGH_FIRST + \
   (((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT))
/* Low (trailing) surrogate for a code point >= UTF16_SURROGATE_BASE: the low
   UTF16_SURROGATE_SHIFT bits of (chr - base), offset into the low surrogate
   range. */
#define UTF16_SURROGATE_LOW(chr) \
  (UTF16_SURROGATE_LOW_FIRST + \
   (((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK))

/* Returns TRUE if given byte is ASCII character or the beginning of a
   multibyte UTF-8 sequence */
#define UTF8_IS_START_SEQ(b) \
  (((b) & 0x80) == 0 || ((b) & 0xC0) == 0xC0)

#define UTF8_REPLACEMENT_CHAR_LEN 3

#define UNICHAR_T_MAX 0x10ffff

/* TRUE if chr lies in the high/low surrogate range. Every bit above the low
   UTF16_SURROGATE_SHIFT bits is compared, so a 32-bit value that merely
   carries a surrogate pattern in its lower bits (e.g. 0x0100d800) is not
   misclassified as a surrogate. */
#define UTF16_VALID_HIGH_SURROGATE(chr) \
  (((chr) >> UTF16_SURROGATE_SHIFT) == \
   (UTF16_SURROGATE_HIGH_FIRST >> UTF16_SURROGATE_SHIFT))
#define UTF16_VALID_LOW_SURROGATE(chr) \
  (((chr) >> UTF16_SURROGATE_SHIFT) == \
   (UTF16_SURROGATE_LOW_FIRST >> UTF16_SURROGATE_SHIFT))
46
47
struct unicode_transform;
48
49
typedef uint32_t unichar_t;
50
ARRAY_DEFINE_TYPE(unichars, unichar_t);
51
52
/* Normalize UTF8 input and append it to output buffer.
53
   Returns 0 if ok, -1 if input was invalid. Even if input was invalid,
54
   as much as possible should be added to output. */
55
typedef int normalizer_func_t(const void *input, size_t size,
56
            buffer_t *output);
57
58
extern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN];
59
extern const uint8_t *const uni_utf8_non1_bytes;
60
61
/* Returns TRUE if chr is usable as a UCS-4 code point: it is neither a UTF-16
   high nor low surrogate and does not exceed UNICHAR_T_MAX. */
static inline bool ATTR_PURE uni_is_valid_ucs4(unichar_t chr)
{
  return (!UTF16_VALID_HIGH_SURROGATE(chr) &&
    !UTF16_VALID_LOW_SURROGATE(chr) &&
    chr <= UNICHAR_T_MAX);
}
Unexecuted instantiation: imap-bodystructure.c:uni_is_valid_ucs4
Unexecuted instantiation: str.c:uni_is_valid_ucs4
Unexecuted instantiation: unichar.c:uni_is_valid_ucs4
Unexecuted instantiation: unicode-transform.c:uni_is_valid_ucs4
67
68
/* Returns number of characters in a NUL-terminated unicode string */
69
unsigned int uni_strlen(const unichar_t *str) ATTR_PURE;
70
/* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was
71
   invalid */
72
int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output);
73
int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
74
           ARRAY_TYPE(unichars) *output);
75
/* Translates UCS-4 input to UTF-8 output. */
76
void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
77
void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);
78
79
/* Return number of octets needed to encode this codepoint in UTF-8. */
80
static inline unsigned int uni_ucs4_to_utf8_len(unichar_t chr)
{
  i_assert(uni_is_valid_ucs4(chr));

  /* Check the encoding length boundaries from the shortest form upwards. */
  if (chr <= 0x007f)
    return 1;
  if (chr <= 0x07FF)
    return 2;
  if (chr <= 0xFFFF)
    return 3;
  return 4;
}
Unexecuted instantiation: imap-bodystructure.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: str.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: unichar.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: unicode-transform.c:uni_ucs4_to_utf8_len
91
92
/* Returns char_bytes (>0) if *chr_r is set, 0 for incomplete trailing character,
93
   -1 for invalid input. */
94
int uni_utf8_get_char(const char *input, unichar_t *chr_r);
95
int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
96
int uni_utf8_get_char_buf(const void *buffer, size_t size, unichar_t *chr_r);
97
/* Returns number of characters in UTF-8 string. */
98
unsigned int uni_utf8_strlen(const char *input) ATTR_PURE;
99
/* Returns number of characters in UTF-8 input of specified size. */
100
unsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE;
101
/* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8
102
   character, don't include it in the return value and set partial_pos_r to
103
   where the character begins. Otherwise partial_pos_r is set to the end
104
   of the input. */
105
unsigned int uni_utf8_partial_strlen_n(const void *input, size_t size,
106
               size_t *partial_pos_r);
107
108
/* Returns the number of bytes belonging to this UTF-8 character. The given
109
   parameter is the first byte of the UTF-8 sequence. Invalid input is
110
   returned with length 1. */
111
static inline unsigned int ATTR_CONST
112
uni_utf8_char_bytes(unsigned char chr)
113
0
{
114
  /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
115
0
  if (chr < (192 + 2))
116
0
    return 1;
117
0
  return uni_utf8_non1_bytes[chr - (192 + 2)];
118
0
}
Unexecuted instantiation: imap-bodystructure.c:uni_utf8_char_bytes
Unexecuted instantiation: str.c:uni_utf8_char_bytes
Unexecuted instantiation: unichar.c:uni_utf8_char_bytes
Unexecuted instantiation: unicode-transform.c:uni_utf8_char_bytes
119
120
/* Return given character in titlecase. */
121
unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
122
123
/* Run the UTF8 string through the provided Unicode transform and write the
124
   result into the buffer again encoded in UTF8. */
125
int uni_utf8_run_transform(const void *_input, size_t size,
126
         struct unicode_transform *trans, buffer_t *output,
127
         const char **error_r);
128
129
/* Normalize the UTF-8 input in Unicode NFD, NFKD, NFC or NFKC form and write
130
   the result to the output buffer.
131
132
   Refer to Unicode Standard Annex #15, Section 1.2 for more information. An
133
   excerpt can be found in unicode-nf.h.
134
135
   NOTE: Do not blindly use this function to write and append several values
136
   together expecting the result to be NF* normalized as well. This function
137
   does not check whether concatenation preserves the desired normalization nor
138
   does it endeavour to achieve this result. Blind concatenation works only in
139
   very specific cases, so make sure you know what you are doing.
140
 */
141
int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output);
142
int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output);
143
int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output);
144
int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output);
145
146
/* Same as the write variants, but return the normalized input in the
147
   output_r argument as a C string.
148
 */
149
int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r);
150
int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r);
151
int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r);
152
int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r);
153
154
/* Check whether the input is normalized in the indicated form. Returns -1 if
155
   the input is not even valid UTF8 or contains invalid code points. Returns 1
156
   if the input adheres to the requested normalization form and 0 otherwise. */
157
int uni_utf8_is_nfd(const void *input, size_t size);
158
int uni_utf8_is_nfkd(const void *input, size_t size);
159
int uni_utf8_is_nfc(const void *input, size_t size);
160
int uni_utf8_is_nfkc(const void *input, size_t size);
161
162
/* Write the input UTF8 string to the provided buffer after mapping it to the
163
   requested case. */
164
int uni_utf8_write_uppercase(const void *_input, size_t size, buffer_t *output);
165
int uni_utf8_write_lowercase(const void *_input, size_t size, buffer_t *output);
166
int uni_utf8_write_casefold(const void *_input, size_t size, buffer_t *output);
167
168
int uni_utf8_to_uppercase(const void *input, size_t size, const char **output_r);
169
int uni_utf8_to_lowercase(const void *input, size_t size, const char **output_r);
170
int uni_utf8_to_casefold(const void *input, size_t size, const char **output_r);
171
172
/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
173
   output buffer. Returns 0 if ok, -1 if input was invalid. This generates
174
   output that's compatible with i;unicode-casemap comparator. Invalid input
175
   is replaced with unicode replacement character (0xfffd). */
176
int uni_utf8_to_decomposed_titlecase(const void *input, size_t size,
177
             buffer_t *output);
178
179
/* If input contains only valid UTF-8 characters, return TRUE without updating
180
   buf. If input contains invalid UTF-8 characters, replace them with unicode
181
   replacement character (0xfffd), write the output to buf and return FALSE. */
182
bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
183
           buffer_t *buf) ATTR_WARN_UNUSED_RESULT;
184
/* Returns TRUE if string is valid UTF-8 input. */
185
bool uni_utf8_str_is_valid(const char *str);
186
/* Returns TRUE if data contains only valid UTF-8 input. */
187
bool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
188
/* Returns the size of the data when truncated to be less than or equal to
189
   max_new_size, making sure UTF-8 character boundaries are respected. This only
190
   looks at the last character at the new boundary. */
191
size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
192
            size_t max_new_size);
193
194
/* surrogate handling */
195
/* Combine a UTF-16 high/low surrogate pair into the code point it encodes.
   Both arguments are asserted to be surrogates of the matching kind. */
static inline unichar_t uni_join_surrogate(unichar_t high, unichar_t low)
{
  unichar_t upper_bits, lower_bits;

  i_assert(UTF16_VALID_HIGH_SURROGATE(high) &&
     UTF16_VALID_LOW_SURROGATE(low));

  /* The high surrogate carries the bits above the low 10 bits, the low
     surrogate carries the low 10 bits of (code point - base). */
  upper_bits = (high - UTF16_SURROGATE_HIGH_FIRST) << UTF16_SURROGATE_SHIFT;
  lower_bits = low - UTF16_SURROGATE_LOW_FIRST;
  return upper_bits + lower_bits + UTF16_SURROGATE_BASE;
}
Unexecuted instantiation: imap-bodystructure.c:uni_join_surrogate
Unexecuted instantiation: str.c:uni_join_surrogate
Unexecuted instantiation: unichar.c:uni_join_surrogate
Unexecuted instantiation: unicode-transform.c:uni_join_surrogate
204
205
/* Split a code point in the range UTF16_SURROGATE_BASE .. UNICHAR_T_MAX into
   its UTF-16 surrogate pair, stored in *high_r and *low_r (both must be
   non-NULL). */
static inline void uni_split_surrogate(unichar_t chr, unichar_t *high_r, unichar_t *low_r)
{
  unichar_t offset;

  i_assert(chr >= UTF16_SURROGATE_BASE && chr <= UNICHAR_T_MAX);
  i_assert(high_r != NULL && low_r != NULL);

  /* Distance from the first code point that needs surrogates. */
  offset = chr - UTF16_SURROGATE_BASE;
  *high_r = UTF16_SURROGATE_HIGH_FIRST + (offset >> UTF16_SURROGATE_SHIFT);
  *low_r = UTF16_SURROGATE_LOW_FIRST + (offset & UTF16_SURROGATE_MASK);
}
Unexecuted instantiation: imap-bodystructure.c:uni_split_surrogate
Unexecuted instantiation: str.c:uni_split_surrogate
Unexecuted instantiation: unichar.c:uni_split_surrogate
Unexecuted instantiation: unicode-transform.c:uni_split_surrogate
212
213
/*
214
 * Grapheme clusters
215
 */
216
217
/* The grapheme cluster scanner is used to split a Unicode string into a
218
   sequence of grapheme clusters, which are in essence the Unicode characters as
219
   perceived by the user. These can be longer than a single code point and by
220
   consequence longer than a single octet. The Unicode standard defines what
221
   constitutes a grapheme cluster in Annex #29. */
222
223
struct uni_gc_scanner {
224
  pool_t pool;
225
  struct unicode_gc_break gcbrk;
226
227
  const unsigned char *poffset, *p, *pend;
228
229
  unichar_t cp;
230
  int cp_size;
231
};
232
233
/* Initialize the scanner. */
234
void uni_gc_scanner_init(struct uni_gc_scanner *gcsc,
235
       const void *input, size_t size);
236
/* Shift scanner position to next grapheme cluster. Returns TRUE when scanner
237
   points to a valid grapheme cluster and has not reached the end. */
238
bool uni_gc_scan_shift(struct uni_gc_scanner *gcsc) ATTR_NOWARN_UNUSED_RESULT;
239
240
241
/* Obtain a pointer to the current grapheme cluster the scanner points to.
242
   Returns the size of the cluster in octets in size_r. */
243
static inline const unsigned char *
244
uni_gc_scan_get(struct uni_gc_scanner *gcsc, size_t *size_r)
245
0
{
246
0
  if (gcsc->poffset == NULL)
247
0
    uni_gc_scan_shift(gcsc);
248
0
  if (size_r != NULL)
249
0
    *size_r = gcsc->p - gcsc->poffset;
250
0
  return gcsc->poffset;
251
0
}
Unexecuted instantiation: imap-bodystructure.c:uni_gc_scan_get
Unexecuted instantiation: str.c:uni_gc_scan_get
Unexecuted instantiation: unichar.c:uni_gc_scan_get
Unexecuted instantiation: unicode-transform.c:uni_gc_scan_get
252
253
/* Convenience function for checking whether current grapheme cluster is a
254
   particular (single-octet) ASCII character.  */
255
static inline bool
uni_gc_scan_ascii_equals(struct uni_gc_scanner *gcsc, unsigned int c)
{
  size_t cluster_size;
  const unsigned char *cluster = uni_gc_scan_get(gcsc, &cluster_size);

  /* Only a cluster of exactly one octet can match; the octet is read only
     after the size check has passed. */
  return (cluster_size == 1 && *cluster == (unsigned char)c);
}
Unexecuted instantiation: imap-bodystructure.c:uni_gc_scan_ascii_equals
Unexecuted instantiation: str.c:uni_gc_scan_ascii_equals
Unexecuted instantiation: unichar.c:uni_gc_scan_ascii_equals
Unexecuted instantiation: unicode-transform.c:uni_gc_scan_ascii_equals
266
267
/* Returns TRUE when the scanner has reached the end of input. */
268
static inline bool uni_gc_scan_at_end(struct uni_gc_scanner *gcsc)
{
  size_t cluster_size;

  /* Only the size matters here; an empty current cluster is treated as the
     end of input. */
  (void)uni_gc_scan_get(gcsc, &cluster_size);
  return (cluster_size == 0);
}
Unexecuted instantiation: imap-bodystructure.c:uni_gc_scan_at_end
Unexecuted instantiation: str.c:uni_gc_scan_at_end
Unexecuted instantiation: unichar.c:uni_gc_scan_at_end
Unexecuted instantiation: unicode-transform.c:uni_gc_scan_at_end
274
275
#endif