/src/dovecot/src/lib/unichar.h
Line | Count | Source |
1 | | #ifndef UNICHAR_H |
2 | | #define UNICHAR_H |
3 | | |
/* Character used to replace invalid input. */
#define UNICODE_REPLACEMENT_CHAR 0xfffd
#define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
/* Length in bytes of the UTF-8 encoded replacement character, not counting
   the string literal's terminating NUL. The expansion intentionally has no
   trailing semicolon so that it can be used inside expressions. */
#define UNICODE_REPLACEMENT_CHAR_UTF8_LEN \
	(sizeof(UNICODE_REPLACEMENT_CHAR_UTF8) - 1)
/* Horizontal ellipsis character ('...') */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR 0x2026
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8 "\xE2\x80\xA6"
/* Length in bytes of the UTF-8 encoded ellipsis, not counting the string
   literal's terminating NUL. No trailing semicolon (see above). */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8_LEN \
	(sizeof(UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8) - 1)

/* Characters >= base require surrogates */
#define UTF16_SURROGATE_BASE 0x10000

/* Each surrogate of a pair carries UTF16_SURROGATE_SHIFT payload bits,
   selected by UTF16_SURROGATE_MASK. */
#define UTF16_SURROGATE_SHIFT 10
#define UTF16_SURROGATE_MASK 0x03ff
#define UTF16_SURROGATE_HIGH_FIRST 0xd800
#define UTF16_SURROGATE_HIGH_LAST 0xdbff
#define UTF16_SURROGATE_HIGH_MAX 0xdfff
#define UTF16_SURROGATE_LOW_FIRST 0xdc00
#define UTF16_SURROGATE_LOW_LAST 0xdfff

/* High surrogate for chr: the bits of (chr - UTF16_SURROGATE_BASE) above the
   low UTF16_SURROGATE_SHIFT bits, offset by UTF16_SURROGATE_HIGH_FIRST.
   chr is evaluated once. */
#define UTF16_SURROGATE_HIGH(chr) \
	(UTF16_SURROGATE_HIGH_FIRST + \
	 (((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT))
/* Low surrogate for chr: the low UTF16_SURROGATE_SHIFT bits of
   (chr - UTF16_SURROGATE_BASE), offset by UTF16_SURROGATE_LOW_FIRST.
   chr is evaluated once. */
#define UTF16_SURROGATE_LOW(chr) \
	(UTF16_SURROGATE_LOW_FIRST + \
	 (((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK))

/* Returns TRUE if given byte is ASCII character or the beginning of a
   multibyte UTF-8 sequence. Note that b is evaluated twice. */
#define UTF8_IS_START_SEQ(b) \
	(((b) & 0x80) == 0 || ((b) & 0xC0) == 0xC0)

/* Same value as UNICODE_REPLACEMENT_CHAR_UTF8_LEN, but as a plain integer
   literal. */
#define UTF8_REPLACEMENT_CHAR_LEN 3

/* Largest code point accepted as a unichar_t. */
#define UNICHAR_T_MAX 0x10ffff

/* TRUE if chr lies in the high (0xd800..0xdbff) or low (0xdc00..0xdfff)
   surrogate range, respectively. Only bits 10..23 of chr are compared
   against the range start. */
#define UTF16_VALID_HIGH_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_HIGH_FIRST)
#define UTF16_VALID_LOW_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_LOW_FIRST)
44 | | |
45 | | struct unicode_transform; |
46 | | |
47 | | typedef uint32_t unichar_t; |
48 | | ARRAY_DEFINE_TYPE(unichars, unichar_t); |
49 | | |
50 | | /* Normalize UTF8 input and append it to output buffer. |
51 | | Returns 0 if ok, -1 if input was invalid. Even if input was invalid, |
52 | | as much as possible should be added to output. */ |
53 | | typedef int normalizer_func_t(const void *input, size_t size, |
54 | | buffer_t *output); |
55 | | |
56 | | extern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN]; |
57 | | extern const uint8_t *const uni_utf8_non1_bytes; |
58 | | |
59 | | static inline bool ATTR_PURE uni_is_valid_ucs4(unichar_t chr) |
60 | 116M | { |
61 | 116M | return (!UTF16_VALID_HIGH_SURROGATE(chr) && |
62 | 116M | !UTF16_VALID_LOW_SURROGATE(chr) && |
63 | 116M | chr <= UNICHAR_T_MAX); |
64 | 116M | }; json-parser.c:uni_is_valid_ucs4 Line | Count | Source | 60 | 156k | { | 61 | 156k | return (!UTF16_VALID_HIGH_SURROGATE(chr) && | 62 | 156k | !UTF16_VALID_LOW_SURROGATE(chr) && | 63 | 156k | chr <= UNICHAR_T_MAX); | 64 | 156k | }; |
json-generator.c:uni_is_valid_ucs4 Line | Count | Source | 60 | 55.0k | { | 61 | 55.0k | return (!UTF16_VALID_HIGH_SURROGATE(chr) && | 62 | 55.0k | !UTF16_VALID_LOW_SURROGATE(chr) && | 63 | 55.0k | chr <= UNICHAR_T_MAX); | 64 | 55.0k | }; |
Unexecuted instantiation: json-syntax.c:uni_is_valid_ucs4 Unexecuted instantiation: str.c:uni_is_valid_ucs4 Unexecuted instantiation: str-sanitize.c:uni_is_valid_ucs4 unichar.c:uni_is_valid_ucs4 Line | Count | Source | 60 | 116M | { | 61 | 116M | return (!UTF16_VALID_HIGH_SURROGATE(chr) && | 62 | 115M | !UTF16_VALID_LOW_SURROGATE(chr) && | 63 | 115M | chr <= UNICHAR_T_MAX); | 64 | 116M | }; |
Unexecuted instantiation: unicode-transform.c:uni_is_valid_ucs4 Unexecuted instantiation: smtp-command-parser.c:uni_is_valid_ucs4 Unexecuted instantiation: message-address.c:uni_is_valid_ucs4 Unexecuted instantiation: message-date.c:uni_is_valid_ucs4 Unexecuted instantiation: rfc822-parser.c:uni_is_valid_ucs4 Unexecuted instantiation: var-expand-lexer.c:uni_is_valid_ucs4 Unexecuted instantiation: punycode.c:uni_is_valid_ucs4 Unexecuted instantiation: imap-bodystructure.c:uni_is_valid_ucs4 Unexecuted instantiation: imap-utf7.c:uni_is_valid_ucs4 Unexecuted instantiation: message-parser.c:uni_is_valid_ucs4 Unexecuted instantiation: rfc2231-parser.c:uni_is_valid_ucs4 Unexecuted instantiation: message-header-parser.c:uni_is_valid_ucs4 Unexecuted instantiation: charset-utf8.c:uni_is_valid_ucs4 Unexecuted instantiation: charset-iconv.c:uni_is_valid_ucs4 |
65 | | |
66 | | /* Returns number of characters in a NUL-terminated unicode string */ |
67 | | unsigned int uni_strlen(const unichar_t *str) ATTR_PURE; |
68 | | /* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was |
69 | | invalid */ |
70 | | int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output); |
71 | | int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size, |
72 | | ARRAY_TYPE(unichars) *output); |
73 | | /* Translates UCS-4 input to UTF-8 output. */ |
74 | | void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output); |
75 | | void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output); |
76 | | |
77 | | /* Return number of octets needed to encode this codepoint in UTF-8. */ |
78 | | static inline unsigned int uni_ucs4_to_utf8_len(unichar_t chr) |
79 | 78.0k | { |
80 | 78.0k | i_assert(uni_is_valid_ucs4(chr)); |
81 | 78.0k | if (chr > 0xFFFF) |
82 | 379 | return 4; |
83 | 77.6k | if (chr > 0x07FF) |
84 | 73.3k | return 3; |
85 | 4.29k | if (chr > 0x007f) |
86 | 3.21k | return 2; |
87 | 1.08k | return 1; |
88 | 4.29k | } json-parser.c:uni_ucs4_to_utf8_len Line | Count | Source | 79 | 78.0k | { | 80 | 78.0k | i_assert(uni_is_valid_ucs4(chr)); | 81 | 78.0k | if (chr > 0xFFFF) | 82 | 379 | return 4; | 83 | 77.6k | if (chr > 0x07FF) | 84 | 73.3k | return 3; | 85 | 4.29k | if (chr > 0x007f) | 86 | 3.21k | return 2; | 87 | 1.08k | return 1; | 88 | 4.29k | } |
Unexecuted instantiation: json-generator.c:uni_ucs4_to_utf8_len Unexecuted instantiation: json-syntax.c:uni_ucs4_to_utf8_len Unexecuted instantiation: str.c:uni_ucs4_to_utf8_len Unexecuted instantiation: str-sanitize.c:uni_ucs4_to_utf8_len Unexecuted instantiation: unichar.c:uni_ucs4_to_utf8_len Unexecuted instantiation: unicode-transform.c:uni_ucs4_to_utf8_len Unexecuted instantiation: smtp-command-parser.c:uni_ucs4_to_utf8_len Unexecuted instantiation: message-address.c:uni_ucs4_to_utf8_len Unexecuted instantiation: message-date.c:uni_ucs4_to_utf8_len Unexecuted instantiation: rfc822-parser.c:uni_ucs4_to_utf8_len Unexecuted instantiation: var-expand-lexer.c:uni_ucs4_to_utf8_len Unexecuted instantiation: punycode.c:uni_ucs4_to_utf8_len Unexecuted instantiation: imap-bodystructure.c:uni_ucs4_to_utf8_len Unexecuted instantiation: imap-utf7.c:uni_ucs4_to_utf8_len Unexecuted instantiation: message-parser.c:uni_ucs4_to_utf8_len Unexecuted instantiation: rfc2231-parser.c:uni_ucs4_to_utf8_len Unexecuted instantiation: message-header-parser.c:uni_ucs4_to_utf8_len Unexecuted instantiation: charset-utf8.c:uni_ucs4_to_utf8_len Unexecuted instantiation: charset-iconv.c:uni_ucs4_to_utf8_len |
89 | | |
90 | | /* Returns char_bytes (>0) if *chr_r is set, 0 for incomplete trailing character, |
91 | | -1 for invalid input. */ |
92 | | int uni_utf8_get_char(const char *input, unichar_t *chr_r); |
93 | | int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r); |
94 | | int uni_utf8_get_char_buf(const void *buffer, size_t size, unichar_t *chr_r); |
95 | | /* Returns number of characters in UTF-8 string. */ |
96 | | unsigned int uni_utf8_strlen(const char *input) ATTR_PURE; |
97 | | /* Returns number of characters in UTF-8 input of specified size. */ |
98 | | unsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE; |
99 | | /* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8 |
100 | | character, don't include it in the return value and set partial_pos_r to |
101 | | where the character begins. Otherwise partial_pos_r is set to the end |
102 | | of the input. */ |
103 | | unsigned int uni_utf8_partial_strlen_n(const void *input, size_t size, |
104 | | size_t *partial_pos_r); |
105 | | |
106 | | /* Returns the number of bytes belonging to this UTF-8 character. The given |
107 | | parameter is the first byte of the UTF-8 sequence. Invalid input is |
108 | | returned with length 1. */ |
109 | | static inline unsigned int ATTR_CONST |
110 | | uni_utf8_char_bytes(unsigned char chr) |
111 | 255M | { |
112 | | /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */ |
113 | 255M | if (chr < (192 + 2)) |
114 | 23.8M | return 1; |
115 | 231M | return uni_utf8_non1_bytes[chr - (192 + 2)]; |
116 | 255M | } Unexecuted instantiation: json-parser.c:uni_utf8_char_bytes Unexecuted instantiation: json-generator.c:uni_utf8_char_bytes Unexecuted instantiation: json-syntax.c:uni_utf8_char_bytes Unexecuted instantiation: str.c:uni_utf8_char_bytes Unexecuted instantiation: str-sanitize.c:uni_utf8_char_bytes unichar.c:uni_utf8_char_bytes Line | Count | Source | 111 | 232M | { | 112 | | /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */ | 113 | 232M | if (chr < (192 + 2)) | 114 | 581k | return 1; | 115 | 231M | return uni_utf8_non1_bytes[chr - (192 + 2)]; | 116 | 232M | } |
Unexecuted instantiation: unicode-transform.c:uni_utf8_char_bytes smtp-command-parser.c:uni_utf8_char_bytes Line | Count | Source | 111 | 94 | { | 112 | | /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */ | 113 | 94 | if (chr < (192 + 2)) | 114 | 0 | return 1; | 115 | 94 | return uni_utf8_non1_bytes[chr - (192 + 2)]; | 116 | 94 | } |
Unexecuted instantiation: message-address.c:uni_utf8_char_bytes Unexecuted instantiation: message-date.c:uni_utf8_char_bytes Unexecuted instantiation: rfc822-parser.c:uni_utf8_char_bytes Unexecuted instantiation: var-expand-lexer.c:uni_utf8_char_bytes Unexecuted instantiation: punycode.c:uni_utf8_char_bytes Unexecuted instantiation: imap-bodystructure.c:uni_utf8_char_bytes imap-utf7.c:uni_utf8_char_bytes Line | Count | Source | 111 | 23.2M | { | 112 | | /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */ | 113 | 23.2M | if (chr < (192 + 2)) | 114 | 23.2M | return 1; | 115 | 4.79k | return uni_utf8_non1_bytes[chr - (192 + 2)]; | 116 | 23.2M | } |
Unexecuted instantiation: message-parser.c:uni_utf8_char_bytes Unexecuted instantiation: rfc2231-parser.c:uni_utf8_char_bytes Unexecuted instantiation: message-header-parser.c:uni_utf8_char_bytes Unexecuted instantiation: charset-utf8.c:uni_utf8_char_bytes Unexecuted instantiation: charset-iconv.c:uni_utf8_char_bytes |
117 | | |
118 | | /* Return given character in titlecase. */ |
119 | | unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST; |
120 | | |
121 | | /* Run the UTF8 string through the provided Unicode transform and write the |
122 | | result into the buffer again encoded in UTF8. */ |
123 | | int uni_utf8_run_transform(const void *_input, size_t size, |
124 | | struct unicode_transform *trans, buffer_t *output, |
125 | | const char **error_r); |
126 | | |
127 | | /* Normalize the UTF-8 input in Unicode NFD, NFKD, NFC or NFKC form and write |
128 | | the result to the output buffer. |
129 | | |
130 | | Refer to Unicode Standard Annex #15, Section 1.2 for more information. An |
131 | | excerpt can be found in unicode-nf.h. |
132 | | |
133 | | NOTE: Do not blindly use this function to write and append several values |
134 | | together expecting the result to be NF* normalized as well. This function |
135 | | does not check whether concatenation preserves the desired normalization nor |
135 | | does it endeavour to achieve this result. Blind concatenation works only in |
137 | | very specific cases, so make sure you know what you are doing. |
138 | | */ |
139 | | int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output); |
140 | | int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output); |
141 | | int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output); |
142 | | int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output); |
143 | | |
144 | | /* Same as the write variants, but return the normalized input in the |
145 | | output_r argument as a C string. |
146 | | */ |
147 | | int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r); |
148 | | int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r); |
149 | | int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r); |
150 | | int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r); |
151 | | |
152 | | /* Check whether the input is normalized in the indicated form. Returns -1 if |
153 | | the input is not even valid UTF8 or contains invalid code points. Returns 1 |
154 | | if the input adheres to the requested normalization form and 0 otherwise. */ |
155 | | int uni_utf8_is_nfd(const void *input, size_t size); |
156 | | int uni_utf8_is_nfkd(const void *input, size_t size); |
157 | | int uni_utf8_is_nfc(const void *input, size_t size); |
158 | | int uni_utf8_is_nfkc(const void *input, size_t size); |
159 | | |
160 | | /* Write the input UTF8 string to the provided buffer after mapping it to the |
161 | | requested case. */ |
162 | | int uni_utf8_write_uppercase(const void *_input, size_t size, buffer_t *output); |
163 | | int uni_utf8_write_lowercase(const void *_input, size_t size, buffer_t *output); |
164 | | int uni_utf8_write_casefold(const void *_input, size_t size, buffer_t *output); |
165 | | |
166 | | int uni_utf8_to_uppercase(const void *input, size_t size, const char **output_r); |
167 | | int uni_utf8_to_lowercase(const void *input, size_t size, const char **output_r); |
168 | | int uni_utf8_to_casefold(const void *input, size_t size, const char **output_r); |
169 | | |
170 | | /* Convert UTF-8 input to titlecase and decompose the titlecase characters to |
171 | | output buffer. Returns 0 if ok, -1 if input was invalid. This generates |
172 | | output that's compatible with i;unicode-casemap comparator. Invalid input |
173 | | is replaced with unicode replacement character (0xfffd). */ |
174 | | int uni_utf8_to_decomposed_titlecase(const void *input, size_t size, |
175 | | buffer_t *output); |
176 | | |
177 | | /* If input contains only valid UTF-8 characters, return TRUE without updating |
178 | | buf. If input contains invalid UTF-8 characters, replace them with unicode |
179 | | replacement character (0xfffd), write the output to buf and return FALSE. */ |
180 | | bool uni_utf8_get_valid_data(const unsigned char *input, size_t size, |
181 | | buffer_t *buf) ATTR_WARN_UNUSED_RESULT; |
182 | | /* Returns TRUE if string is valid UTF-8 input. */ |
183 | | bool uni_utf8_str_is_valid(const char *str); |
184 | | /* Returns TRUE if data contains only valid UTF-8 input. */ |
185 | | bool uni_utf8_data_is_valid(const unsigned char *data, size_t size); |
186 | | /* Returns the size of the data when truncated to be less than or equal to |
187 | | max_new_size, making sure UTF-8 character boundaries are respected. This only |
188 | | looks at the last character at the new boundary. */ |
189 | | size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size, |
190 | | size_t max_new_size); |
191 | | |
192 | | /* surrogate handling */ |
193 | | static inline unichar_t uni_join_surrogate(unichar_t high, unichar_t low) |
194 | 0 | { |
195 | 0 | i_assert(UTF16_VALID_HIGH_SURROGATE(high) && |
196 | 0 | UTF16_VALID_LOW_SURROGATE(low)); |
197 | 0 |
|
198 | 0 | return ((high - UTF16_SURROGATE_HIGH_FIRST)<<10) + |
199 | 0 | (low - UTF16_SURROGATE_LOW_FIRST) + |
200 | 0 | UTF16_SURROGATE_BASE; |
201 | 0 | } Unexecuted instantiation: json-parser.c:uni_join_surrogate Unexecuted instantiation: json-generator.c:uni_join_surrogate Unexecuted instantiation: json-syntax.c:uni_join_surrogate Unexecuted instantiation: str.c:uni_join_surrogate Unexecuted instantiation: str-sanitize.c:uni_join_surrogate Unexecuted instantiation: unichar.c:uni_join_surrogate Unexecuted instantiation: unicode-transform.c:uni_join_surrogate Unexecuted instantiation: smtp-command-parser.c:uni_join_surrogate Unexecuted instantiation: message-address.c:uni_join_surrogate Unexecuted instantiation: message-date.c:uni_join_surrogate Unexecuted instantiation: rfc822-parser.c:uni_join_surrogate Unexecuted instantiation: var-expand-lexer.c:uni_join_surrogate Unexecuted instantiation: punycode.c:uni_join_surrogate Unexecuted instantiation: imap-bodystructure.c:uni_join_surrogate Unexecuted instantiation: imap-utf7.c:uni_join_surrogate Unexecuted instantiation: message-parser.c:uni_join_surrogate Unexecuted instantiation: rfc2231-parser.c:uni_join_surrogate Unexecuted instantiation: message-header-parser.c:uni_join_surrogate Unexecuted instantiation: charset-utf8.c:uni_join_surrogate Unexecuted instantiation: charset-iconv.c:uni_join_surrogate |
202 | | |
203 | | static inline void uni_split_surrogate(unichar_t chr, unichar_t *high_r, unichar_t *low_r) |
204 | 0 | { |
205 | 0 | i_assert(chr >= UTF16_SURROGATE_BASE && chr <= UNICHAR_T_MAX); |
206 | 0 | i_assert(high_r != NULL && low_r != NULL); |
207 | 0 | *high_r = UTF16_SURROGATE_HIGH(chr); |
208 | 0 | *low_r = UTF16_SURROGATE_LOW(chr); |
209 | 0 | } Unexecuted instantiation: json-parser.c:uni_split_surrogate Unexecuted instantiation: json-generator.c:uni_split_surrogate Unexecuted instantiation: json-syntax.c:uni_split_surrogate Unexecuted instantiation: str.c:uni_split_surrogate Unexecuted instantiation: str-sanitize.c:uni_split_surrogate Unexecuted instantiation: unichar.c:uni_split_surrogate Unexecuted instantiation: unicode-transform.c:uni_split_surrogate Unexecuted instantiation: smtp-command-parser.c:uni_split_surrogate Unexecuted instantiation: message-address.c:uni_split_surrogate Unexecuted instantiation: message-date.c:uni_split_surrogate Unexecuted instantiation: rfc822-parser.c:uni_split_surrogate Unexecuted instantiation: var-expand-lexer.c:uni_split_surrogate Unexecuted instantiation: punycode.c:uni_split_surrogate Unexecuted instantiation: imap-bodystructure.c:uni_split_surrogate Unexecuted instantiation: imap-utf7.c:uni_split_surrogate Unexecuted instantiation: message-parser.c:uni_split_surrogate Unexecuted instantiation: rfc2231-parser.c:uni_split_surrogate Unexecuted instantiation: message-header-parser.c:uni_split_surrogate Unexecuted instantiation: charset-utf8.c:uni_split_surrogate Unexecuted instantiation: charset-iconv.c:uni_split_surrogate |
210 | | #endif |