/src/dovecot/src/lib/unichar.h

Source
#ifndef UNICHAR_H
#define UNICHAR_H

#include "unicode-break.h"

/* Character used to replace invalid input. */
#define UNICODE_REPLACEMENT_CHAR 0xfffd
/* The same character as a UTF-8 encoded string literal. */
#define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
/* Length of the UTF-8 encoding in octets, not counting the terminating NUL.
   The expansion must not end in a semicolon, otherwise the macro cannot be
   used as part of a larger expression. */
#define UNICODE_REPLACEMENT_CHAR_UTF8_LEN \
  (sizeof(UNICODE_REPLACEMENT_CHAR_UTF8) - 1)
/* Horizontal ellipsis character ('...') */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR 0x2026
/* The same character as a UTF-8 encoded string literal. */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8 "\xE2\x80\xA6"
/* Length of the UTF-8 encoding in octets, not counting the terminating NUL.
   The expansion must not end in a semicolon, otherwise the macro cannot be
   used as part of a larger expression. */
#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8_LEN \
  (sizeof(UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8) - 1)

/* Code points at or above this value need a UTF-16 surrogate pair. */
#define UTF16_SURROGATE_BASE 0x10000

/* Each half of a pair carries 10 bits of (code point - base): the high
   surrogate takes the bits above the shift, the low surrogate the bits
   selected by the mask. */
#define UTF16_SURROGATE_SHIFT 10
#define UTF16_SURROGATE_MASK 0x03ff

/* Boundaries of the surrogate ranges.
   NOTE(review): UTF16_SURROGATE_HIGH_MAX has the same value as
   UTF16_SURROGATE_LOW_LAST; presumably it marks the end of the whole
   surrogate area - confirm against its users. */
#define UTF16_SURROGATE_HIGH_FIRST 0xd800
#define UTF16_SURROGATE_HIGH_LAST 0xdbff
#define UTF16_SURROGATE_HIGH_MAX 0xdfff
#define UTF16_SURROGATE_LOW_FIRST 0xdc00
#define UTF16_SURROGATE_LOW_LAST 0xdfff

/* High (first) surrogate for a code point >= UTF16_SURROGATE_BASE. */
#define UTF16_SURROGATE_HIGH(cp) \
  ((((cp) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT) + \
   UTF16_SURROGATE_HIGH_FIRST)
/* Low (second) surrogate for a code point >= UTF16_SURROGATE_BASE. */
#define UTF16_SURROGATE_LOW(cp) \
  ((((cp) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK) + \
   UTF16_SURROGATE_LOW_FIRST)

/* Returns TRUE if given byte is ASCII character or the beginning of a
   multibyte UTF-8 sequence. The only bytes that are neither are the ones
   whose two highest bits are 10, so a single test on those two bits is
   enough, and the argument is evaluated exactly once (safe to pass an
   expression with side effects such as *p++). */
#define UTF8_IS_START_SEQ(b) \
  (((b) & 0xC0) != 0x80)

/* Number of elements in the utf8_replacement_char array. */
#define UTF8_REPLACEMENT_CHAR_LEN 3

/* Largest value accepted by uni_is_valid_ucs4(). */
#define UNICHAR_T_MAX 0x10ffff

/* TRUE if chr lies in the 0x400-wide block starting at the first high / low
   surrogate. The lowest 10 bits are ignored by the mask.
   NOTE(review): the mask only covers bits 10..23, so bits 24 and above of
   chr do not influence the result. */
#define UTF16_VALID_HIGH_SURROGATE(chr) \
  (UTF16_SURROGATE_HIGH_FIRST == ((chr) & 0xfffc00))
#define UTF16_VALID_LOW_SURROGATE(chr) \
  (UTF16_SURROGATE_LOW_FIRST == ((chr) & 0xfffc00))

/* Forward declaration; this header only refers to the transform through a
   pointer (see uni_utf8_run_transform()). */
struct unicode_transform;

/* A single UCS-4 character value, stored as an unsigned 32-bit integer. */
typedef uint32_t unichar_t;
/* Declares the array type that is referred to as ARRAY_TYPE(unichars)
   in the prototypes below. */
ARRAY_DEFINE_TYPE(unichars, unichar_t);

/* Normalize UTF8 input and append it to output buffer.
   Returns 0 if ok, -1 if input was invalid. Even if input was invalid,
   as much as possible should be added to output.
   input points to size bytes of data. */
typedef int normalizer_func_t(const void *input, size_t size,
            buffer_t *output);

/* Defined elsewhere. NOTE(review): presumably holds the UTF-8 octets of
   UNICODE_REPLACEMENT_CHAR - confirm against the definition. */
extern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN];
/* Lookup table used by uni_utf8_char_bytes(): indexed by
   (first byte - (192 + 2)), it yields the number of bytes in the UTF-8
   sequence starting with that byte. */
extern const uint8_t *const uni_utf8_non1_bytes;

/* Returns TRUE if chr is acceptable as a UCS-4 character: it must not exceed
   UNICHAR_T_MAX and must not be matched by either of the UTF-16 surrogate
   checks. */
static inline bool ATTR_PURE uni_is_valid_ucs4(unichar_t chr)
{
  return (chr <= UNICHAR_T_MAX &&
    !UTF16_VALID_HIGH_SURROGATE(chr) &&
    !UTF16_VALID_LOW_SURROGATE(chr));
}

/* Returns number of characters in a NUL-terminated unicode string */
unsigned int uni_strlen(const unichar_t *str) ATTR_PURE;
/* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was
   invalid. The _n variant takes the input length in size; the plain variant
   takes no length and presumably expects a NUL-terminated string (TODO
   confirm). NOTE(review): whether output is appended to or overwritten is not
   visible from this header. */
int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output);
int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
           ARRAY_TYPE(unichars) *output);
/* Translates UCS-4 input to UTF-8 output. The first form takes an array of
   len characters, the _c form a single character. */
void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);

/* Number of octets (1..4) that the UTF-8 encoding of chr occupies. chr is
   asserted to pass uni_is_valid_ucs4(). */
static inline unsigned int uni_ucs4_to_utf8_len(unichar_t chr)
{
  i_assert(uni_is_valid_ucs4(chr));
  /* Walk the encoding ranges from the shortest to the longest. */
  if (chr <= 0x007f)
    return 1;
  if (chr <= 0x07FF)
    return 2;
  if (chr <= 0xFFFF)
    return 3;
  return 4;
}

/* Returns char_bytes (>0) if *chr_r is set, 0 for incomplete trailing character,
   -1 for invalid input.
   The first variant takes no length for its input, the other two take one
   (max_len / size). NOTE(review): how the _n and _buf variants differ is not
   visible from this header - confirm against the implementation. */
int uni_utf8_get_char(const char *input, unichar_t *chr_r);
int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
int uni_utf8_get_char_buf(const void *buffer, size_t size, unichar_t *chr_r);
/* Returns number of characters in UTF-8 string. */
unsigned int uni_utf8_strlen(const char *input) ATTR_PURE;
/* Returns number of characters in UTF-8 input of specified size. */
unsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE;
/* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8
   character, don't include it in the return value and set partial_pos_r to
   where the character begins. Otherwise partial_pos_r is set to the end
   of the input. */
unsigned int uni_utf8_partial_strlen_n(const void *input, size_t size,
               size_t *partial_pos_r);

/* Given the first byte of a UTF-8 sequence, returns how many bytes the whole
   sequence consists of. Bytes that cannot start a multibyte sequence are
   reported as length 1. */
static inline unsigned int ATTR_CONST
uni_utf8_char_bytes(unsigned char chr)
{
  /* 0xC2 (192 + 2) is the first byte looked up in the table:
     0x00 .. 0x7f are ASCII and 0x80 .. 0xC1 are invalid, all counted
     as a single byte. */
  if (chr >= 0xC2)
    return uni_utf8_non1_bytes[chr - 0xC2];
  return 1;
}

/* Return given character in titlecase. */
unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;

/* Run the UTF8 string through the provided Unicode transform and write the
   result into the buffer again encoded in UTF8.
   _input points to size bytes of data. NOTE(review): the return value
   convention and the meaning of error_r are not visible here; presumably
   0 on success and -1 with *error_r set on failure - confirm. */
int uni_utf8_run_transform(const void *_input, size_t size,
         struct unicode_transform *trans, buffer_t *output,
         const char **error_r);

/* Normalize the UTF-8 input in Unicode NFD, NFKD, NFC or NFKC form and write
   the result to the output buffer.

   Refer to Unicode Standard Annex #15, Section 1.2 for more information. An
   excerpt can be found in unicode-nf.h.

   NOTE: Do not blindly use this function to write and append several values
   together expecting the result to be NF* normalized as well. This function
   does not check whether concatenation preserves the desired normalization nor
   does it endeavour to achieve this result. Blind concatenation works only in
   very specific cases, so make sure you know what you are doing.

   NOTE(review): the return value is not documented here; presumably 0 if ok
   and -1 if the input was invalid, like normalizer_func_t - confirm.
 */
int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output);
int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output);
int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output);
int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output);

/* Same as the write variants, but return the normalized input in the
   output_r argument as a C string.
   NOTE(review): who owns the returned string is not visible from this
   header - confirm before storing the pointer.
 */
int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r);
int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r);
int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r);
int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r);

/* Check whether the input is normalized in the indicated form. Returns -1 if
   the input is not even valid UTF8 or contains invalid code points. Returns 1
   if the input adheres to the requested normalization form and 0 otherwise.
   Note that the result is three-valued, so it must not be tested as a plain
   boolean. */
int uni_utf8_is_nfd(const void *input, size_t size);
int uni_utf8_is_nfkd(const void *input, size_t size);
int uni_utf8_is_nfc(const void *input, size_t size);
int uni_utf8_is_nfkc(const void *input, size_t size);

/* Write the input UTF8 string to the provided buffer after mapping it to the
   requested case.
   NOTE(review): the return value is not documented here; presumably 0 if ok
   and -1 if the input was invalid - confirm. */
int uni_utf8_write_uppercase(const void *_input, size_t size, buffer_t *output);
int uni_utf8_write_lowercase(const void *_input, size_t size, buffer_t *output);
int uni_utf8_write_casefold(const void *_input, size_t size, buffer_t *output);

/* Counterparts of the write variants above that hand the result back through
   output_r instead of a buffer. NOTE(review): presumably a C string, as with
   the uni_utf8_to_nf*() functions - confirm. */
int uni_utf8_to_uppercase(const void *input, size_t size, const char **output_r);
int uni_utf8_to_lowercase(const void *input, size_t size, const char **output_r);
int uni_utf8_to_casefold(const void *input, size_t size, const char **output_r);

/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
   output buffer. Returns 0 if ok, -1 if input was invalid. This generates
   output that's compatible with i;unicode-casemap comparator. Invalid input
   is replaced with unicode replacement character (0xfffd). */
int uni_utf8_to_decomposed_titlecase(const void *input, size_t size,
             buffer_t *output);

/* If input contains only valid UTF-8 characters, return TRUE without updating
   buf. If input contains invalid UTF-8 characters, replace them with unicode
   replacement character (0xfffd), write the output to buf and return FALSE.
   The result must be checked: buf holds usable data only when FALSE was
   returned. */
bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
           buffer_t *buf) ATTR_WARN_UNUSED_RESULT;
/* Returns TRUE if string is valid UTF-8 input. */
bool uni_utf8_str_is_valid(const char *str);
/* Returns TRUE if data contains only valid UTF-8 input. */
bool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
/* Returns the size of the data when truncated to be less than or equal to
   max_new_size, making sure UTF-8 character boundaries are respected. This only
   looks at the last character at the new boundary. The data itself is not
   modified (it is passed as const); only the new size is returned. */
size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
            size_t max_new_size);

/* surrogate handling */

/* Combine a UTF-16 surrogate pair into a single character value. Both halves
   are asserted to pass the corresponding UTF16_VALID_*_SURROGATE() check. */
static inline unichar_t uni_join_surrogate(unichar_t high, unichar_t low)
{
  unichar_t upper_bits, lower_bits;

  i_assert(UTF16_VALID_HIGH_SURROGATE(high) &&
     UTF16_VALID_LOW_SURROGATE(low));

  /* The high surrogate supplies the bits above the lowest 10, the low
     surrogate the lowest 10 bits; the result is offset by the base. */
  upper_bits = (high - UTF16_SURROGATE_HIGH_FIRST) << 10;
  lower_bits = low - UTF16_SURROGATE_LOW_FIRST;
  return upper_bits + lower_bits + UTF16_SURROGATE_BASE;
}

/* Split a character in the range UTF16_SURROGATE_BASE..UNICHAR_T_MAX into its
   UTF-16 surrogate pair. Both output pointers are required (asserted). */
static inline void uni_split_surrogate(unichar_t chr, unichar_t *high_r, unichar_t *low_r)
{
  unichar_t offset;

  i_assert(chr >= UTF16_SURROGATE_BASE && chr <= UNICHAR_T_MAX);
  i_assert(high_r != NULL && low_r != NULL);

  /* Distance from the base; its upper part goes into the high surrogate
     and its masked lower part into the low surrogate. */
  offset = chr - UTF16_SURROGATE_BASE;
  *high_r = UTF16_SURROGATE_HIGH_FIRST + (offset >> UTF16_SURROGATE_SHIFT);
  *low_r = UTF16_SURROGATE_LOW_FIRST + (offset & UTF16_SURROGATE_MASK);
}

/*
 * Grapheme clusters
 */

/* A grapheme cluster is, in essence, what the user perceives as one
   character. It may consist of more than one code point and therefore of
   more than one octet. The grapheme cluster scanner splits a Unicode string
   into a sequence of such clusters; what constitutes a cluster is defined by
   the Unicode standard in Annex #29. */

struct uni_gc_scanner {
  /* NOTE(review): pool and gcbrk are not touched by the inline helpers in
     this header; presumably the scanner's memory pool and the cluster
     break state - confirm against the implementation. */
  pool_t pool;
  struct unicode_gc_break gcbrk;

  /* Start of the current cluster, as returned by uni_gc_scan_get(). */
  const unsigned char *poffset;
  /* End of the current cluster; its size is p - poffset. */
  const unsigned char *p;
  /* NOTE(review): presumably the end of the input - confirm. */
  const unsigned char *pend;

  /* NOTE(review): presumably the most recently decoded code point and
     its size in octets - confirm. */
  unichar_t cp;
  int cp_size;
};

/* Initialize the scanner. input points to size bytes of data that is to be
   scanned. */
void uni_gc_scanner_init(struct uni_gc_scanner *gcsc,
       const void *input, size_t size);
/* Shift scanner position to next grapheme cluster. Returns TRUE when scanner
   points to a valid grapheme cluster and has not reached the end. The result
   may be ignored; uni_gc_scan_get() calls this without checking it. */
bool uni_gc_scan_shift(struct uni_gc_scanner *gcsc) ATTR_NOWARN_UNUSED_RESULT;


/* Returns a pointer to the grapheme cluster the scanner currently points to.
   If size_r is not NULL, the size of that cluster in octets is stored in it. */
static inline const unsigned char *
uni_gc_scan_get(struct uni_gc_scanner *gcsc, size_t *size_r)
{
  const unsigned char *cluster;

  if (gcsc->poffset == NULL) {
    /* No current cluster yet (presumably the state right after
       uni_gc_scanner_init() - confirm), so advance to the first one. */
    uni_gc_scan_shift(gcsc);
  }
  cluster = gcsc->poffset;
  if (size_r != NULL)
    *size_r = gcsc->p - cluster;
  return cluster;
}

/* Convenience check: TRUE if the current grapheme cluster consists of exactly
   one octet and that octet equals c (i.e. a particular ASCII character). */
static inline bool
uni_gc_scan_ascii_equals(struct uni_gc_scanner *gcsc, unsigned int c)
{
  size_t cluster_size;
  const unsigned char *cluster = uni_gc_scan_get(gcsc, &cluster_size);

  /* The octet is only read when the cluster is exactly one octet long. */
  return (cluster_size == 1 && *cluster == (unsigned char)c);
}

/* TRUE once the scanner has reached the end of input, which shows up as the
   current grapheme cluster having a size of zero. */
static inline bool uni_gc_scan_at_end(struct uni_gc_scanner *gcsc)
{
  size_t cluster_size;

  /* Only the size is of interest here, not the cluster pointer. */
  (void)uni_gc_scan_get(gcsc, &cluster_size);
  return (cluster_size == 0);
}

#endif

Coverage Report

Created: 2026-01-25 07:08

Line	Count	Source
1		#ifndef UNICHAR_H
2		#define UNICHAR_H
3
4		#include "unicode-break.h"
5
6		/* Character used to replace invalid input. */
7	0	#define UNICODE_REPLACEMENT_CHAR 0xfffd
8		#define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
9		#define UNICODE_REPLACEMENT_CHAR_UTF8_LEN \
10		(sizeof(UNICODE_REPLACEMENT_CHAR_UTF8) - 1);
11		/* Horizontal ellipsis character ('...') */
12		#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR 0x2026
13		#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8 "\xE2\x80\xA6"
14		#define UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8_LEN \
15		(sizeof(UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8) - 1);
16
17		/* Characters >= base require surrogates */
18		#define UTF16_SURROGATE_BASE 0x10000
19
20		#define UTF16_SURROGATE_SHIFT 10
21		#define UTF16_SURROGATE_MASK 0x03ff
22	0	#define UTF16_SURROGATE_HIGH_FIRST 0xd800
23		#define UTF16_SURROGATE_HIGH_LAST 0xdbff
24		#define UTF16_SURROGATE_HIGH_MAX 0xdfff
25	0	#define UTF16_SURROGATE_LOW_FIRST 0xdc00
26		#define UTF16_SURROGATE_LOW_LAST 0xdfff
27
28		#define UTF16_SURROGATE_HIGH(chr) \
29		(UTF16_SURROGATE_HIGH_FIRST + \
30		(((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT))
31		#define UTF16_SURROGATE_LOW(chr) \
32		(UTF16_SURROGATE_LOW_FIRST + \
33		(((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK))
34
35		/* Returns TRUE if given byte is ASCII character or the beginning of a
36		multibyte UTF-8 sequence */
37		#define UTF8_IS_START_SEQ(b) \
38		(((b) & 0x80) == 0 || ((b) & 0xC0) == 0xC0)
39
40	0	#define UTF8_REPLACEMENT_CHAR_LEN 3
41
42	0	#define UNICHAR_T_MAX 0x10ffff
43
44	0	#define UTF16_VALID_HIGH_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_HIGH_FIRST)
45	0	#define UTF16_VALID_LOW_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_LOW_FIRST)
46
47		struct unicode_transform;
48
49		typedef uint32_t unichar_t;
50		ARRAY_DEFINE_TYPE(unichars, unichar_t);
51
52		/* Normalize UTF8 input and append it to output buffer.
53		Returns 0 if ok, -1 if input was invalid. Even if input was invalid,
54		as much as possible should be added to output. */
55		typedef int normalizer_func_t(const void *input, size_t size,
56		buffer_t *output);
57
58		extern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN];
59		extern const uint8_t *const uni_utf8_non1_bytes;
60
61		static inline bool ATTR_PURE uni_is_valid_ucs4(unichar_t chr)
62	0	{
63	0	return (!UTF16_VALID_HIGH_SURROGATE(chr) &&
64	0	!UTF16_VALID_LOW_SURROGATE(chr) &&
65	0	chr <= UNICHAR_T_MAX);
66	0	}; Unexecuted instantiation: imap-bodystructure.c:uni_is_valid_ucs4 Unexecuted instantiation: str.c:uni_is_valid_ucs4 Unexecuted instantiation: unichar.c:uni_is_valid_ucs4 Unexecuted instantiation: unicode-transform.c:uni_is_valid_ucs4
67
68		/* Returns number of characters in a NUL-terminated unicode string */
69		unsigned int uni_strlen(const unichar_t *str) ATTR_PURE;
70		/* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was
71		invalid */
72		int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output);
73		int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
74		ARRAY_TYPE(unichars) *output);
75		/* Translates UCS-4 input to UTF-8 output. */
76		void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
77		void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);
78
79		/* Return number of octets needed to encode this codepoint in UTF-8. */
80		static inline unsigned int uni_ucs4_to_utf8_len(unichar_t chr)
81	0	{
82	0	i_assert(uni_is_valid_ucs4(chr));
83	0	if (chr > 0xFFFF)
84	0	return 4;
85	0	if (chr > 0x07FF)
86	0	return 3;
87	0	if (chr > 0x007f)
88	0	return 2;
89	0	return 1;
90	0	} Unexecuted instantiation: imap-bodystructure.c:uni_ucs4_to_utf8_len Unexecuted instantiation: str.c:uni_ucs4_to_utf8_len Unexecuted instantiation: unichar.c:uni_ucs4_to_utf8_len Unexecuted instantiation: unicode-transform.c:uni_ucs4_to_utf8_len
91
92		/* Returns char_bytes (>0) if *chr_r is set, 0 for incomplete trailing character,
93		-1 for invalid input. */
94		int uni_utf8_get_char(const char *input, unichar_t *chr_r);
95		int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
96		int uni_utf8_get_char_buf(const void *buffer, size_t size, unichar_t *chr_r);
97		/* Returns number of characters in UTF-8 string. */
98		unsigned int uni_utf8_strlen(const char *input) ATTR_PURE;
99		/* Returns number of characters in UTF-8 input of specified size. */
100		unsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE;
101		/* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8
102		character, don't include it in the return value and set partial_pos_r to
103		where the character begins. Otherwise partial_pos_r is set to the end
104		of the input. */
105		unsigned int uni_utf8_partial_strlen_n(const void *input, size_t size,
106		size_t *partial_pos_r);
107
108		/* Returns the number of bytes belonging to this UTF-8 character. The given
109		parameter is the first byte of the UTF-8 sequence. Invalid input is
110		returned with length 1. */
111		static inline unsigned int ATTR_CONST
112		uni_utf8_char_bytes(unsigned char chr)
113	0	{
114		/* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */
115	0	if (chr < (192 + 2))
116	0	return 1;
117	0	return uni_utf8_non1_bytes[chr - (192 + 2)];
118	0	} Unexecuted instantiation: imap-bodystructure.c:uni_utf8_char_bytes Unexecuted instantiation: str.c:uni_utf8_char_bytes Unexecuted instantiation: unichar.c:uni_utf8_char_bytes Unexecuted instantiation: unicode-transform.c:uni_utf8_char_bytes
119
120		/* Return given character in titlecase. */
121		unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
122
123		/* Run the UTF8 string through the provided Unicode transform and write the
124		result into the buffer again encoded in UTF8. */
125		int uni_utf8_run_transform(const void *_input, size_t size,
126		struct unicode_transform *trans, buffer_t *output,
127		const char **error_r);
128
129		/* Normalize the UTF-8 input in Unicode NFD, NFKD, NFC or NFKC form and write
130		the result to the output buffer.
131
132		Refer to Unicode Standard Annex #15, Section 1.2 for more information. An
133		excerpt can be found in unicode-nf.h.
134
135		NOTE: Do not blindly use this function to write and append several values
136		together expecting the result to be NF* normalized as well. This function
137		does not check whether concatenation preserves the desired normalization nor
138		does it endeavour to achieve this result. Blind concatination works only in
139		very specific cases, so make sure you know what you are doing.
140		*/
141		int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output);
142		int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output);
143		int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output);
144		int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output);
145
146		/* Same as the write variants, but return the normalized input in the
147		output_r argument as a C string.
148		*/
149		int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r);
150		int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r);
151		int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r);
152		int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r);
153
154		/* Check whether the input is normalized in the indicated form. Returns -1 if
155		the input is not even valid UTF8 or contains invalid code points. Returns 1
156		if the input adheres to the requested normalization form and 0 otherwise. */
157		int uni_utf8_is_nfd(const void *input, size_t size);
158		int uni_utf8_is_nfkd(const void *input, size_t size);
159		int uni_utf8_is_nfc(const void *input, size_t size);
160		int uni_utf8_is_nfkc(const void *input, size_t size);
161
162		/* Write the input UTF8 string to the provided buffer after mapping it to the
163		requested case. */
164		int uni_utf8_write_uppercase(const void *_input, size_t size, buffer_t *output);
165		int uni_utf8_write_lowercase(const void *_input, size_t size, buffer_t *output);
166		int uni_utf8_write_casefold(const void *_input, size_t size, buffer_t *output);
167
168		int uni_utf8_to_uppercase(const void *input, size_t size, const char **output_r);
169		int uni_utf8_to_lowercase(const void *input, size_t size, const char **output_r);
170		int uni_utf8_to_casefold(const void *input, size_t size, const char **output_r);
171
172		/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
173		output buffer. Returns 0 if ok, -1 if input was invalid. This generates
174		output that's compatible with i;unicode-casemap comparator. Invalid input
175		is replaced with unicode replacement character (0xfffd). */
176		int uni_utf8_to_decomposed_titlecase(const void *input, size_t size,
177		buffer_t *output);
178
179		/* If input contains only valid UTF-8 characters, return TRUE without updating
180		buf. If input contains invalid UTF-8 characters, replace them with unicode
181		replacement character (0xfffd), write the output to buf and return FALSE. */
182		bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
183		buffer_t *buf) ATTR_WARN_UNUSED_RESULT;
184		/* Returns TRUE if string is valid UTF-8 input. */
185		bool uni_utf8_str_is_valid(const char *str);
186		/* Returns TRUE if data contains only valid UTF-8 input. */
187		bool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
188		/* Returns the size of the data when truncated to be less than or equal to
189		max_new_size, making sure UTF-8 character boundaries are respected. This only
190		looks at the last character at the new boundary. */
191		size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
192		size_t max_new_size);
193
194		/* surrogate handling */
195		static inline unichar_t uni_join_surrogate(unichar_t high, unichar_t low)
196	0	{
197	0	i_assert(UTF16_VALID_HIGH_SURROGATE(high) &&
198	0	UTF16_VALID_LOW_SURROGATE(low));
199	0
200	0	return ((high - UTF16_SURROGATE_HIGH_FIRST)<<10) +
201	0	(low - UTF16_SURROGATE_LOW_FIRST) +
202	0	UTF16_SURROGATE_BASE;
203	0	} Unexecuted instantiation: imap-bodystructure.c:uni_join_surrogate Unexecuted instantiation: str.c:uni_join_surrogate Unexecuted instantiation: unichar.c:uni_join_surrogate Unexecuted instantiation: unicode-transform.c:uni_join_surrogate
204
205		static inline void uni_split_surrogate(unichar_t chr, unichar_t *high_r, unichar_t *low_r)
206	0	{
207	0	i_assert(chr >= UTF16_SURROGATE_BASE && chr <= UNICHAR_T_MAX);
208	0	i_assert(high_r != NULL && low_r != NULL);
209	0	*high_r = UTF16_SURROGATE_HIGH(chr);
210	0	*low_r = UTF16_SURROGATE_LOW(chr);
211	0	} Unexecuted instantiation: imap-bodystructure.c:uni_split_surrogate Unexecuted instantiation: str.c:uni_split_surrogate Unexecuted instantiation: unichar.c:uni_split_surrogate Unexecuted instantiation: unicode-transform.c:uni_split_surrogate
212
213		/*
214		* Grapheme clusters
215		*/
216
217		/* The grapheme cluster scanner is used to split a Unicode string into a
218		sequence of grapheme clusters, which are in essence the Unicode characters as
219		perceived by the user. These can be longer than a single code point and by
220		consequence longer than a single octet. The Unicode standard defines what
221		constitutes a grapheme cluster in Annex #29. */
222
223		struct uni_gc_scanner {
224		pool_t pool;
225		struct unicode_gc_break gcbrk;
226
227		const unsigned char *poffset, *p, *pend;
228
229		unichar_t cp;
230		int cp_size;
231		};
232
233		/* Initialize the scanner. */
234		void uni_gc_scanner_init(struct uni_gc_scanner *gcsc,
235		const void *input, size_t size);
236		/* Shift scanner position to next grapheme cluster. Returns TRUE when scanner
237		points to a valid grapheme cluster and has not reached the end. */
238		bool uni_gc_scan_shift(struct uni_gc_scanner *gcsc) ATTR_NOWARN_UNUSED_RESULT;
239
240
241		/* Obtain a pointer to the current grapheme cluster the scanner points to.
242		Returns the size of the cluster in octets in size_r. */
243		static inline const unsigned char *
244		uni_gc_scan_get(struct uni_gc_scanner *gcsc, size_t *size_r)
245	0	{
246	0	if (gcsc->poffset == NULL)
247	0	uni_gc_scan_shift(gcsc);
248	0	if (size_r != NULL)
249	0	*size_r = gcsc->p - gcsc->poffset;
250	0	return gcsc->poffset;
251	0	} Unexecuted instantiation: imap-bodystructure.c:uni_gc_scan_get Unexecuted instantiation: str.c:uni_gc_scan_get Unexecuted instantiation: unichar.c:uni_gc_scan_get Unexecuted instantiation: unicode-transform.c:uni_gc_scan_get
252
253		/* Convenience function for checking whether current grapheme cluster is a
254		particular (single-octet) ASCII character. */
255		static inline bool
256		uni_gc_scan_ascii_equals(struct uni_gc_scanner *gcsc, unsigned int c)
257	0	{
258	0	size_t gc_size;
259	0	const unsigned char *gc = uni_gc_scan_get(gcsc, &gc_size);
260	0
261	0	if (gc_size != 1)
262	0	return FALSE;
263	0
264	0	return (*gc == (unsigned char)c);
265	0	} Unexecuted instantiation: imap-bodystructure.c:uni_gc_scan_ascii_equals Unexecuted instantiation: str.c:uni_gc_scan_ascii_equals Unexecuted instantiation: unichar.c:uni_gc_scan_ascii_equals Unexecuted instantiation: unicode-transform.c:uni_gc_scan_ascii_equals
266
267		/* Returns TRUE when the scanner has reached the end of input. */
268		static inline bool uni_gc_scan_at_end(struct uni_gc_scanner *gcsc)
269	0	{
270	0	size_t gc_size;
271	0	(void)uni_gc_scan_get(gcsc, &gc_size);
272	0	return (gc_size == 0);
273	0	} Unexecuted instantiation: imap-bodystructure.c:uni_gc_scan_at_end Unexecuted instantiation: str.c:uni_gc_scan_at_end Unexecuted instantiation: unichar.c:uni_gc_scan_at_end Unexecuted instantiation: unicode-transform.c:uni_gc_scan_at_end
274
275		#endif