Coverage Report

Created: 2025-11-11 07:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/dovecot/src/lib/unichar.h
Line
Count
Source
1
#ifndef UNICHAR_H
2
#define UNICHAR_H
3
4
/* Character used to replace invalid input. */
5
0
#define UNICODE_REPLACEMENT_CHAR 0xfffd
6
86.5k
#define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
7
#define UNICODE_REPLACEMENT_CHAR_UTF8_LEN \
8
  (sizeof(UNICODE_REPLACEMENT_CHAR_UTF8) - 1);
9
/* Horizontal ellipsis character ('...') */
10
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR 0x2026
11
0
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8 "\xE2\x80\xA6"
12
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8_LEN \
13
  (sizeof(UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8) - 1);
14
15
/* Characters >= base require surrogates */
16
23.2M
#define UTF16_SURROGATE_BASE 0x10000
17
18
4.92k
#define UTF16_SURROGATE_SHIFT 10
19
5.97k
#define UTF16_SURROGATE_MASK 0x03ff
20
116M
#define UTF16_SURROGATE_HIGH_FIRST 0xd800
21
1.08k
#define UTF16_SURROGATE_HIGH_LAST 0xdbff
22
3.36k
#define UTF16_SURROGATE_HIGH_MAX 0xdfff
23
116M
#define UTF16_SURROGATE_LOW_FIRST 0xdc00
24
1.06k
#define UTF16_SURROGATE_LOW_LAST 0xdfff
25
26
#define UTF16_SURROGATE_HIGH(chr) \
27
3.86k
  (UTF16_SURROGATE_HIGH_FIRST + \
28
3.86k
   (((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT))
29
#define UTF16_SURROGATE_LOW(chr) \
30
3.86k
  (UTF16_SURROGATE_LOW_FIRST + \
31
3.86k
   (((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK))
32
33
/* Returns TRUE if given byte is ASCII character or the beginning of a
34
   multibyte UTF-8 sequence */
35
#define UTF8_IS_START_SEQ(b) \
36
  (((b) & 0x80) == 0 || ((b) & 0xC0) == 0xC0)
37
38
241k
#define UTF8_REPLACEMENT_CHAR_LEN 3
39
40
116M
#define UNICHAR_T_MAX 0x10ffff
41
42
232M
#define UTF16_VALID_HIGH_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_HIGH_FIRST)
43
232M
#define UTF16_VALID_LOW_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_LOW_FIRST)
44
45
struct unicode_transform;
46
47
typedef uint32_t unichar_t;
48
ARRAY_DEFINE_TYPE(unichars, unichar_t);
49
50
/* Normalize UTF8 input and append it to output buffer.
51
   Returns 0 if ok, -1 if input was invalid. Even if input was invalid,
52
   as much as possible should be added to output. */
53
typedef int normalizer_func_t(const void *input, size_t size,
54
            buffer_t *output);
55
56
extern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN];
57
extern const uint8_t *const uni_utf8_non1_bytes;
58
59
static inline bool ATTR_PURE uni_is_valid_ucs4(unichar_t chr)
60
116M
{
61
116M
  return (!UTF16_VALID_HIGH_SURROGATE(chr) &&
62
116M
    !UTF16_VALID_LOW_SURROGATE(chr) &&
63
116M
    chr <= UNICHAR_T_MAX);
64
116M
};
json-parser.c:uni_is_valid_ucs4
Line
Count
Source
60
156k
{
61
156k
  return (!UTF16_VALID_HIGH_SURROGATE(chr) &&
62
156k
    !UTF16_VALID_LOW_SURROGATE(chr) &&
63
156k
    chr <= UNICHAR_T_MAX);
64
156k
};
json-generator.c:uni_is_valid_ucs4
Line
Count
Source
60
55.0k
{
61
55.0k
  return (!UTF16_VALID_HIGH_SURROGATE(chr) &&
62
55.0k
    !UTF16_VALID_LOW_SURROGATE(chr) &&
63
55.0k
    chr <= UNICHAR_T_MAX);
64
55.0k
};
Unexecuted instantiation: json-syntax.c:uni_is_valid_ucs4
Unexecuted instantiation: str.c:uni_is_valid_ucs4
Unexecuted instantiation: str-sanitize.c:uni_is_valid_ucs4
unichar.c:uni_is_valid_ucs4
Line
Count
Source
60
116M
{
61
116M
  return (!UTF16_VALID_HIGH_SURROGATE(chr) &&
62
115M
    !UTF16_VALID_LOW_SURROGATE(chr) &&
63
115M
    chr <= UNICHAR_T_MAX);
64
116M
};
Unexecuted instantiation: unicode-transform.c:uni_is_valid_ucs4
Unexecuted instantiation: smtp-command-parser.c:uni_is_valid_ucs4
Unexecuted instantiation: message-address.c:uni_is_valid_ucs4
Unexecuted instantiation: message-date.c:uni_is_valid_ucs4
Unexecuted instantiation: rfc822-parser.c:uni_is_valid_ucs4
Unexecuted instantiation: var-expand-lexer.c:uni_is_valid_ucs4
Unexecuted instantiation: punycode.c:uni_is_valid_ucs4
Unexecuted instantiation: imap-bodystructure.c:uni_is_valid_ucs4
Unexecuted instantiation: imap-utf7.c:uni_is_valid_ucs4
Unexecuted instantiation: message-parser.c:uni_is_valid_ucs4
Unexecuted instantiation: rfc2231-parser.c:uni_is_valid_ucs4
Unexecuted instantiation: message-header-parser.c:uni_is_valid_ucs4
Unexecuted instantiation: charset-utf8.c:uni_is_valid_ucs4
Unexecuted instantiation: charset-iconv.c:uni_is_valid_ucs4
65
66
/* Returns number of characters in a NUL-terminated unicode string */
67
unsigned int uni_strlen(const unichar_t *str) ATTR_PURE;
68
/* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was
69
   invalid */
70
int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output);
71
int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
72
           ARRAY_TYPE(unichars) *output);
73
/* Translates UCS-4 input to UTF-8 output. */
74
void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
75
void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);
76
77
/* Return number of octets needed to encode this codepoint in UTF-8. */
78
static inline unsigned int uni_ucs4_to_utf8_len(unichar_t chr)
79
78.0k
{
80
78.0k
  i_assert(uni_is_valid_ucs4(chr));
81
78.0k
  if (chr > 0xFFFF)
82
379
    return 4;
83
77.6k
  if (chr > 0x07FF)
84
73.3k
    return 3;
85
4.29k
  if (chr > 0x007f)
86
3.21k
    return 2;
87
1.08k
  return 1;
88
4.29k
}
json-parser.c:uni_ucs4_to_utf8_len
Line
Count
Source
79
78.0k
{
80
78.0k
  i_assert(uni_is_valid_ucs4(chr));
81
78.0k
  if (chr > 0xFFFF)
82
379
    return 4;
83
77.6k
  if (chr > 0x07FF)
84
73.3k
    return 3;
85
4.29k
  if (chr > 0x007f)
86
3.21k
    return 2;
87
1.08k
  return 1;
88
4.29k
}
Unexecuted instantiation: json-generator.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: json-syntax.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: str.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: str-sanitize.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: unichar.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: unicode-transform.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: smtp-command-parser.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: message-address.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: message-date.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: rfc822-parser.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: var-expand-lexer.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: punycode.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: imap-bodystructure.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: imap-utf7.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: message-parser.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: rfc2231-parser.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: message-header-parser.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: charset-utf8.c:uni_ucs4_to_utf8_len
Unexecuted instantiation: charset-iconv.c:uni_ucs4_to_utf8_len
89
90
/* Returns char_bytes (>0) if *chr_r is set, 0 for incomplete trailing character,
91
   -1 for invalid input. */
92
int uni_utf8_get_char(const char *input, unichar_t *chr_r);
93
int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
94
int uni_utf8_get_char_buf(const void *buffer, size_t size, unichar_t *chr_r);
95
/* Returns number of characters in UTF-8 string. */
96
unsigned int uni_utf8_strlen(const char *input) ATTR_PURE;
97
/* Returns number of characters in UTF-8 input of specified size. */
98
unsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE;
99
/* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8
100
   character, don't include it in the return value and set partial_pos_r to
101
   where the character begins. Otherwise partial_pos_r is set to the end
102
   of the input. */
103
unsigned int uni_utf8_partial_strlen_n(const void *input, size_t size,
104
               size_t *partial_pos_r);
105
106
/* Returns the number of bytes belonging to this UTF-8 character. The given
107
   parameter is the first byte of the UTF-8 sequence. Invalid input is
108
   returned with length 1. */
109
static inline unsigned int ATTR_CONST
110
uni_utf8_char_bytes(unsigned char chr)
111
255M
{
112
  /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
113
255M
  if (chr < (192 + 2))
114
23.8M
    return 1;
115
231M
  return uni_utf8_non1_bytes[chr - (192 + 2)];
116
255M
}
Unexecuted instantiation: json-parser.c:uni_utf8_char_bytes
Unexecuted instantiation: json-generator.c:uni_utf8_char_bytes
Unexecuted instantiation: json-syntax.c:uni_utf8_char_bytes
Unexecuted instantiation: str.c:uni_utf8_char_bytes
Unexecuted instantiation: str-sanitize.c:uni_utf8_char_bytes
unichar.c:uni_utf8_char_bytes
Line
Count
Source
111
232M
{
112
  /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
113
232M
  if (chr < (192 + 2))
114
581k
    return 1;
115
231M
  return uni_utf8_non1_bytes[chr - (192 + 2)];
116
232M
}
Unexecuted instantiation: unicode-transform.c:uni_utf8_char_bytes
smtp-command-parser.c:uni_utf8_char_bytes
Line
Count
Source
111
94
{
112
  /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
113
94
  if (chr < (192 + 2))
114
0
    return 1;
115
94
  return uni_utf8_non1_bytes[chr - (192 + 2)];
116
94
}
Unexecuted instantiation: message-address.c:uni_utf8_char_bytes
Unexecuted instantiation: message-date.c:uni_utf8_char_bytes
Unexecuted instantiation: rfc822-parser.c:uni_utf8_char_bytes
Unexecuted instantiation: var-expand-lexer.c:uni_utf8_char_bytes
Unexecuted instantiation: punycode.c:uni_utf8_char_bytes
Unexecuted instantiation: imap-bodystructure.c:uni_utf8_char_bytes
imap-utf7.c:uni_utf8_char_bytes
Line
Count
Source
111
23.2M
{
112
  /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
113
23.2M
  if (chr < (192 + 2))
114
23.2M
    return 1;
115
4.79k
  return uni_utf8_non1_bytes[chr - (192 + 2)];
116
23.2M
}
Unexecuted instantiation: message-parser.c:uni_utf8_char_bytes
Unexecuted instantiation: rfc2231-parser.c:uni_utf8_char_bytes
Unexecuted instantiation: message-header-parser.c:uni_utf8_char_bytes
Unexecuted instantiation: charset-utf8.c:uni_utf8_char_bytes
Unexecuted instantiation: charset-iconv.c:uni_utf8_char_bytes
117
118
/* Return given character in titlecase. */
119
unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
120
121
/* Run the UTF8 string through the provided Unicode transform and write the
122
   result into the buffer again encoded in UTF8. */
123
int uni_utf8_run_transform(const void *_input, size_t size,
124
         struct unicode_transform *trans, buffer_t *output,
125
         const char **error_r);
126
127
/* Normalize the UTF-8 input in Unicode NFD, NFKD, NFC or NFKC form and write
128
   the result to the output buffer.
129
130
   Refer to Unicode Standard Annex #15, Section 1.2 for more information. An
131
   excerpt can be found in unicode-nf.h.
132
133
   NOTE: Do not blindly use this function to write and append several values
134
   together expecting the result to be NF* normalized as well. This function
135
   does not check whether concatenation preserves the desired normalization nor
136
   does it endeavour to achieve this result. Blind concatenation works only in
137
   very specific cases, so make sure you know what you are doing.
138
 */
139
int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output);
140
int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output);
141
int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output);
142
int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output);
143
144
/* Same as the write variants, but return the normalized input in the
145
   output_r argument as a C string.
146
 */
147
int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r);
148
int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r);
149
int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r);
150
int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r);
151
152
/* Check whether the input is normalized in the indicated form. Returns -1 if
153
   the input is not even valid UTF8 or contains invalid code points. Returns 1
154
   if the input adheres to the requested normalization form and 0 otherwise. */
155
int uni_utf8_is_nfd(const void *input, size_t size);
156
int uni_utf8_is_nfkd(const void *input, size_t size);
157
int uni_utf8_is_nfc(const void *input, size_t size);
158
int uni_utf8_is_nfkc(const void *input, size_t size);
159
160
/* Write the input UTF8 string to the provided buffer after mapping it to the
161
   requested case. */
162
int uni_utf8_write_uppercase(const void *_input, size_t size, buffer_t *output);
163
int uni_utf8_write_lowercase(const void *_input, size_t size, buffer_t *output);
164
int uni_utf8_write_casefold(const void *_input, size_t size, buffer_t *output);
165
166
int uni_utf8_to_uppercase(const void *input, size_t size, const char **output_r);
167
int uni_utf8_to_lowercase(const void *input, size_t size, const char **output_r);
168
int uni_utf8_to_casefold(const void *input, size_t size, const char **output_r);
169
170
/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
171
   output buffer. Returns 0 if ok, -1 if input was invalid. This generates
172
   output that's compatible with i;unicode-casemap comparator. Invalid input
173
   is replaced with unicode replacement character (0xfffd). */
174
int uni_utf8_to_decomposed_titlecase(const void *input, size_t size,
175
             buffer_t *output);
176
177
/* If input contains only valid UTF-8 characters, return TRUE without updating
178
   buf. If input contains invalid UTF-8 characters, replace them with unicode
179
   replacement character (0xfffd), write the output to buf and return FALSE. */
180
bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
181
           buffer_t *buf) ATTR_WARN_UNUSED_RESULT;
182
/* Returns TRUE if string is valid UTF-8 input. */
183
bool uni_utf8_str_is_valid(const char *str);
184
/* Returns TRUE if data contains only valid UTF-8 input. */
185
bool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
186
/* Returns the size of the data when truncated to be less than or equal to
187
   max_new_size, making sure UTF-8 character boundaries are respected. This only
188
   looks at the last character at the new boundary. */
189
size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
190
            size_t max_new_size);
191
192
/* surrogate handling */
193
static inline unichar_t uni_join_surrogate(unichar_t high, unichar_t low)
194
0
{
195
0
  i_assert(UTF16_VALID_HIGH_SURROGATE(high) &&
196
0
     UTF16_VALID_LOW_SURROGATE(low));
197
0
198
0
  return ((high - UTF16_SURROGATE_HIGH_FIRST)<<10) +
199
0
    (low - UTF16_SURROGATE_LOW_FIRST) +
200
0
    UTF16_SURROGATE_BASE;
201
0
}
Unexecuted instantiation: json-parser.c:uni_join_surrogate
Unexecuted instantiation: json-generator.c:uni_join_surrogate
Unexecuted instantiation: json-syntax.c:uni_join_surrogate
Unexecuted instantiation: str.c:uni_join_surrogate
Unexecuted instantiation: str-sanitize.c:uni_join_surrogate
Unexecuted instantiation: unichar.c:uni_join_surrogate
Unexecuted instantiation: unicode-transform.c:uni_join_surrogate
Unexecuted instantiation: smtp-command-parser.c:uni_join_surrogate
Unexecuted instantiation: message-address.c:uni_join_surrogate
Unexecuted instantiation: message-date.c:uni_join_surrogate
Unexecuted instantiation: rfc822-parser.c:uni_join_surrogate
Unexecuted instantiation: var-expand-lexer.c:uni_join_surrogate
Unexecuted instantiation: punycode.c:uni_join_surrogate
Unexecuted instantiation: imap-bodystructure.c:uni_join_surrogate
Unexecuted instantiation: imap-utf7.c:uni_join_surrogate
Unexecuted instantiation: message-parser.c:uni_join_surrogate
Unexecuted instantiation: rfc2231-parser.c:uni_join_surrogate
Unexecuted instantiation: message-header-parser.c:uni_join_surrogate
Unexecuted instantiation: charset-utf8.c:uni_join_surrogate
Unexecuted instantiation: charset-iconv.c:uni_join_surrogate
202
203
static inline void uni_split_surrogate(unichar_t chr, unichar_t *high_r, unichar_t *low_r)
204
0
{
205
0
  i_assert(chr >= UTF16_SURROGATE_BASE && chr <= UNICHAR_T_MAX);
206
0
  i_assert(high_r != NULL && low_r != NULL);
207
0
  *high_r = UTF16_SURROGATE_HIGH(chr);
208
0
  *low_r = UTF16_SURROGATE_LOW(chr);
209
0
}
Unexecuted instantiation: json-parser.c:uni_split_surrogate
Unexecuted instantiation: json-generator.c:uni_split_surrogate
Unexecuted instantiation: json-syntax.c:uni_split_surrogate
Unexecuted instantiation: str.c:uni_split_surrogate
Unexecuted instantiation: str-sanitize.c:uni_split_surrogate
Unexecuted instantiation: unichar.c:uni_split_surrogate
Unexecuted instantiation: unicode-transform.c:uni_split_surrogate
Unexecuted instantiation: smtp-command-parser.c:uni_split_surrogate
Unexecuted instantiation: message-address.c:uni_split_surrogate
Unexecuted instantiation: message-date.c:uni_split_surrogate
Unexecuted instantiation: rfc822-parser.c:uni_split_surrogate
Unexecuted instantiation: var-expand-lexer.c:uni_split_surrogate
Unexecuted instantiation: punycode.c:uni_split_surrogate
Unexecuted instantiation: imap-bodystructure.c:uni_split_surrogate
Unexecuted instantiation: imap-utf7.c:uni_split_surrogate
Unexecuted instantiation: message-parser.c:uni_split_surrogate
Unexecuted instantiation: rfc2231-parser.c:uni_split_surrogate
Unexecuted instantiation: message-header-parser.c:uni_split_surrogate
Unexecuted instantiation: charset-utf8.c:uni_split_surrogate
Unexecuted instantiation: charset-iconv.c:uni_split_surrogate
210
#endif