/src/icu/source/common/utrie2.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | ******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 2001-2014, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | ******************************************************************************  | 
10  |  | *   file name:  utrie2.h  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:4  | 
14  |  | *  | 
15  |  | *   created on: 2008aug16 (starting from a copy of utrie.h)  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | */  | 
18  |  |  | 
19  |  | #ifndef __UTRIE2_H__  | 
20  |  | #define __UTRIE2_H__  | 
21  |  |  | 
22  |  | #include "unicode/utypes.h"  | 
23  |  | #include "unicode/utf8.h"  | 
24  |  | #include "putilimp.h"  | 
25  |  |  | 
26  |  | U_CDECL_BEGIN  | 
27  |  |  | 
28  |  | struct UTrie;  /* forward declaration */  | 
29  |  | #ifndef __UTRIE_H__  | 
30  |  | typedef struct UTrie UTrie;  | 
31  |  | #endif  | 
32  |  |  | 
33  |  | /**  | 
34  |  |  * \file  | 
35  |  |  *  | 
36  |  |  * This is a common implementation of a Unicode trie.  | 
37  |  |  * It is a kind of compressed, serializable table of 16- or 32-bit values associated with  | 
38  |  |  * Unicode code points (0..0x10ffff). (A map from code points to integers.)  | 
39  |  |  *  | 
40  |  |  * This is the second common version of a Unicode trie (hence the name UTrie2).  | 
41  |  |  * Compared with UTrie version 1:  | 
42  |  |  * - Still splitting BMP code points 11:5 bits for index and data table lookups.  | 
43  |  |  * - Still separate data for lead surrogate code _units_ vs. code _points_,  | 
44  |  |  *   but the lead surrogate code unit values are not required any more  | 
45  |  |  *   for data lookup for supplementary code points.  | 
46  |  |  * - The "folding" mechanism is removed. In UTrie version 1, this somewhat  | 
47  |  |  *   hard-to-explain mechanism was meant to be used for optimized UTF-16  | 
48  |  |  *   processing, with application-specific encoding of indexing bits  | 
49  |  |  *   in the lead surrogate data for the associated supplementary code points.  | 
50  |  |  * - For the last single-value code point range (ending with U+10ffff),  | 
51  |  |  *   the starting code point ("highStart") and the value are stored. | 
52  |  |  * - For supplementary code points U+10000..highStart-1 a three-table lookup  | 
53  |  |  *   (two index tables and one data table) is used. The first index  | 
54  |  |  *   is truncated, omitting both the BMP portion and the high range.  | 
55  |  |  * - There is a special small index for 2-byte UTF-8, and the initial data  | 
56  |  |  *   entries are designed for fast 1/2-byte UTF-8 lookup.  | 
57  |  |  *   Starting with ICU 60, C0 and C1 are not recognized as UTF-8 lead bytes any more at all,  | 
58  |  |  *   and the associated 2-byte indexes are unused.  | 
59  |  |  */  | 
60  |  |  | 
61  |  | /**  | 
62  |  |  * Trie structure.  | 
63  |  |  * Use only with public API macros and functions.  | 
64  |  |  */  | 
65  |  | struct UTrie2;  | 
66  |  | typedef struct UTrie2 UTrie2;  | 
67  |  |  | 
68  |  | /* Public UTrie2 API functions: read-only access ---------------------------- */  | 
69  |  |  | 
70  |  | /**  | 
71  |  |  * Selectors for the width of a UTrie2 data value.  | 
72  |  |  */  | 
73  |  | enum UTrie2ValueBits { | 
74  |  |     /** 16 bits per UTrie2 data value. */  | 
75  |  |     UTRIE2_16_VALUE_BITS,  | 
76  |  |     /** 32 bits per UTrie2 data value. */  | 
77  |  |     UTRIE2_32_VALUE_BITS,  | 
78  |  |     /** Number of selectors for the width of UTrie2 data values. */  | 
79  |  |     UTRIE2_COUNT_VALUE_BITS  | 
80  |  | };  | 
81  |  | typedef enum UTrie2ValueBits UTrie2ValueBits;  | 
82  |  |  | 
83  |  | /**  | 
84  |  |  * Open a frozen trie from its serialized from, stored in 32-bit-aligned memory.  | 
85  |  |  * Inverse of utrie2_serialize().  | 
86  |  |  * The memory must remain valid and unchanged as long as the trie is used.  | 
87  |  |  * You must utrie2_close() the trie once you are done using it.  | 
88  |  |  *  | 
89  |  |  * @param valueBits selects the data entry size; results in an  | 
90  |  |  *                  U_INVALID_FORMAT_ERROR if it does not match the serialized form  | 
91  |  |  * @param data a pointer to 32-bit-aligned memory containing the serialized form of a UTrie2  | 
92  |  |  * @param length the number of bytes available at data;  | 
93  |  |  *               can be more than necessary  | 
94  |  |  * @param pActualLength receives the actual number of bytes at data taken up by the trie data;  | 
95  |  |  *                      can be NULL  | 
96  |  |  * @param pErrorCode an in/out ICU UErrorCode  | 
97  |  |  * @return the unserialized trie  | 
98  |  |  *  | 
99  |  |  * @see utrie2_open  | 
100  |  |  * @see utrie2_serialize  | 
101  |  |  */  | 
102  |  | U_CAPI UTrie2 * U_EXPORT2  | 
103  |  | utrie2_openFromSerialized(UTrie2ValueBits valueBits,  | 
104  |  |                           const void *data, int32_t length, int32_t *pActualLength,  | 
105  |  |                           UErrorCode *pErrorCode);  | 
106  |  |  | 
107  |  | /**  | 
108  |  |  * Open a frozen, empty "dummy" trie.  | 
109  |  |  * A dummy trie is an empty trie, used when a real data trie cannot  | 
110  |  |  * be loaded. Equivalent to calling utrie2_open() and utrie2_freeze(),  | 
111  |  |  * but without internally creating and compacting/serializing the  | 
112  |  |  * builder data structure.  | 
113  |  |  *  | 
114  |  |  * The trie always returns the initialValue,  | 
115  |  |  * or the errorValue for out-of-range code points and illegal UTF-8.  | 
116  |  |  *  | 
117  |  |  * You must utrie2_close() the trie once you are done using it.  | 
118  |  |  *  | 
119  |  |  * @param valueBits selects the data entry size  | 
120  |  |  * @param initialValue the initial value that is set for all code points  | 
121  |  |  * @param errorValue the value for out-of-range code points and illegal UTF-8  | 
122  |  |  * @param pErrorCode an in/out ICU UErrorCode  | 
123  |  |  * @return the dummy trie  | 
124  |  |  *  | 
125  |  |  * @see utrie2_openFromSerialized  | 
126  |  |  * @see utrie2_open  | 
127  |  |  */  | 
128  |  | U_CAPI UTrie2 * U_EXPORT2  | 
129  |  | utrie2_openDummy(UTrie2ValueBits valueBits,  | 
130  |  |                  uint32_t initialValue, uint32_t errorValue,  | 
131  |  |                  UErrorCode *pErrorCode);  | 
132  |  |  | 
133  |  | /**  | 
134  |  |  * Get a value from a code point as stored in the trie.  | 
135  |  |  * Easier to use than UTRIE2_GET16() and UTRIE2_GET32() but slower.  | 
136  |  |  * Easier to use because, unlike the macros, this function works on all UTrie2  | 
137  |  |  * objects, frozen or not, holding 16-bit or 32-bit data values.  | 
138  |  |  *  | 
139  |  |  * @param trie the trie  | 
140  |  |  * @param c the code point  | 
141  |  |  * @return the value  | 
142  |  |  */  | 
143  |  | U_CAPI uint32_t U_EXPORT2  | 
144  |  | utrie2_get32(const UTrie2 *trie, UChar32 c);  | 
145  |  |  | 
146  |  | /* enumeration callback types */  | 
147  |  |  | 
148  |  | /**  | 
149  |  |  * Callback from utrie2_enum(), extracts a uint32_t value from a  | 
150  |  |  * trie value. This value will be passed on to the UTrie2EnumRange function.  | 
151  |  |  *  | 
152  |  |  * @param context an opaque pointer, as passed into utrie2_enum()  | 
153  |  |  * @param value a value from the trie  | 
154  |  |  * @return the value that is to be passed on to the UTrie2EnumRange function  | 
155  |  |  */  | 
156  |  | typedef uint32_t U_CALLCONV  | 
157  |  | UTrie2EnumValue(const void *context, uint32_t value);  | 
158  |  |  | 
159  |  | /**  | 
160  |  |  * Callback from utrie2_enum(), is called for each contiguous range  | 
161  |  |  * of code points with the same value as retrieved from the trie and  | 
162  |  |  * transformed by the UTrie2EnumValue function.  | 
163  |  |  *  | 
164  |  |  * The callback function can stop the enumeration by returning false.  | 
165  |  |  *  | 
166  |  |  * @param context an opaque pointer, as passed into utrie2_enum()  | 
167  |  |  * @param start the first code point in a contiguous range with value  | 
168  |  |  * @param end the last code point in a contiguous range with value (inclusive)  | 
169  |  |  * @param value the value that is set for all code points in [start..end]  | 
170  |  |  * @return false to stop the enumeration  | 
171  |  |  */  | 
172  |  | typedef UBool U_CALLCONV  | 
173  |  | UTrie2EnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value);  | 
174  |  |  | 
175  |  | /**  | 
176  |  |  * Enumerate efficiently all values in a trie.  | 
177  |  |  * Do not modify the trie during the enumeration.  | 
178  |  |  *  | 
179  |  |  * For each entry in the trie, the value to be delivered is passed through  | 
180  |  |  * the UTrie2EnumValue function.  | 
181  |  |  * The value is unchanged if that function pointer is NULL.  | 
182  |  |  *  | 
183  |  |  * For each contiguous range of code points with a given (transformed) value,  | 
184  |  |  * the UTrie2EnumRange function is called.  | 
185  |  |  *  | 
186  |  |  * @param trie a pointer to the trie  | 
187  |  |  * @param enumValue a pointer to a function that may transform the trie entry value,  | 
188  |  |  *                  or NULL if the values from the trie are to be used directly  | 
189  |  |  * @param enumRange a pointer to a function that is called for each contiguous range  | 
190  |  |  *                  of code points with the same (transformed) value  | 
191  |  |  * @param context an opaque pointer that is passed on to the callback functions  | 
192  |  |  */  | 
193  |  | U_CAPI void U_EXPORT2  | 
194  |  | utrie2_enum(const UTrie2 *trie,  | 
195  |  |             UTrie2EnumValue *enumValue, UTrie2EnumRange *enumRange, const void *context);  | 
196  |  |  | 
197  |  | /* Building a trie ---------------------------------------------------------- */  | 
198  |  |  | 
199  |  | /**  | 
200  |  |  * Open an empty, writable trie. At build time, 32-bit data values are used.  | 
201  |  |  * utrie2_freeze() takes a valueBits parameter  | 
202  |  |  * which determines the data value width in the serialized and frozen forms.  | 
203  |  |  * You must utrie2_close() the trie once you are done using it.  | 
204  |  |  *  | 
205  |  |  * @param initialValue the initial value that is set for all code points  | 
206  |  |  * @param errorValue the value for out-of-range code points and illegal UTF-8  | 
207  |  |  * @param pErrorCode an in/out ICU UErrorCode  | 
208  |  |  * @return a pointer to the allocated and initialized new trie  | 
209  |  |  */  | 
210  |  | U_CAPI UTrie2 * U_EXPORT2  | 
211  |  | utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode);  | 
212  |  |  | 
213  |  | /**  | 
214  |  |  * Clone a trie.  | 
215  |  |  * You must utrie2_close() the clone once you are done using it.  | 
216  |  |  *  | 
217  |  |  * @param other the trie to clone  | 
218  |  |  * @param pErrorCode an in/out ICU UErrorCode  | 
219  |  |  * @return a pointer to the new trie clone  | 
220  |  |  */  | 
221  |  | U_CAPI UTrie2 * U_EXPORT2  | 
222  |  | utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode);  | 
223  |  |  | 
224  |  | /**  | 
225  |  |  * Clone a trie. The clone will be mutable/writable even if the other trie  | 
226  |  |  * is frozen. (See utrie2_freeze().)  | 
227  |  |  * You must utrie2_close() the clone once you are done using it.  | 
228  |  |  *  | 
229  |  |  * @param other the trie to clone  | 
230  |  |  * @param pErrorCode an in/out ICU UErrorCode  | 
231  |  |  * @return a pointer to the new trie clone  | 
232  |  |  */  | 
233  |  | U_CAPI UTrie2 * U_EXPORT2  | 
234  |  | utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode);  | 
235  |  |  | 
236  |  | /**  | 
237  |  |  * Close a trie and release associated memory.  | 
238  |  |  *  | 
239  |  |  * @param trie the trie  | 
240  |  |  */  | 
241  |  | U_CAPI void U_EXPORT2  | 
242  |  | utrie2_close(UTrie2 *trie);  | 
243  |  |  | 
244  |  | /**  | 
245  |  |  * Set a value for a code point.  | 
246  |  |  *  | 
247  |  |  * @param trie the unfrozen trie  | 
248  |  |  * @param c the code point  | 
249  |  |  * @param value the value  | 
250  |  |  * @param pErrorCode an in/out ICU UErrorCode; among other possible error codes:  | 
251  |  |  * - U_NO_WRITE_PERMISSION if the trie is frozen  | 
252  |  |  */  | 
253  |  | U_CAPI void U_EXPORT2  | 
254  |  | utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode);  | 
255  |  |  | 
256  |  | /**  | 
257  |  |  * Set a value in a range of code points [start..end].  | 
258  |  |  * All code points c with start<=c<=end will get the value if  | 
259  |  |  * overwrite is true or if the old value is the initial value.  | 
260  |  |  *  | 
261  |  |  * @param trie the unfrozen trie  | 
262  |  |  * @param start the first code point to get the value  | 
263  |  |  * @param end the last code point to get the value (inclusive)  | 
264  |  |  * @param value the value  | 
265  |  |  * @param overwrite flag for whether old non-initial values are to be overwritten  | 
266  |  |  * @param pErrorCode an in/out ICU UErrorCode; among other possible error codes:  | 
267  |  |  * - U_NO_WRITE_PERMISSION if the trie is frozen  | 
268  |  |  */  | 
269  |  | U_CAPI void U_EXPORT2  | 
270  |  | utrie2_setRange32(UTrie2 *trie,  | 
271  |  |                   UChar32 start, UChar32 end,  | 
272  |  |                   uint32_t value, UBool overwrite,  | 
273  |  |                   UErrorCode *pErrorCode);  | 
274  |  |  | 
275  |  | /**  | 
276  |  |  * Freeze a trie. Make it immutable (read-only) and compact it,  | 
277  |  |  * ready for serialization and for use with fast macros.  | 
278  |  |  * Functions to set values will fail after serializing.  | 
279  |  |  *  | 
280  |  |  * A trie can be frozen only once. If this function is called again with different  | 
281  |  |  * valueBits then it will set a U_ILLEGAL_ARGUMENT_ERROR.  | 
282  |  |  *  | 
283  |  |  * @param trie the trie  | 
284  |  |  * @param valueBits selects the data entry size; if smaller than 32 bits, then  | 
285  |  |  *                  the values stored in the trie will be truncated  | 
286  |  |  * @param pErrorCode an in/out ICU UErrorCode; among other possible error codes:  | 
287  |  |  * - U_INDEX_OUTOFBOUNDS_ERROR if the compacted index or data arrays are too long  | 
288  |  |  *                             for serialization  | 
289  |  |  *                             (the trie will be immutable and usable,  | 
290  |  |  *                             but not frozen and not usable with the fast macros)  | 
291  |  |  *  | 
292  |  |  * @see utrie2_cloneAsThawed  | 
293  |  |  */  | 
294  |  | U_CAPI void U_EXPORT2  | 
295  |  | utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode);  | 
296  |  |  | 
297  |  | /**  | 
298  |  |  * Test if the trie is frozen. (See utrie2_freeze().)  | 
299  |  |  *  | 
300  |  |  * @param trie the trie  | 
301  |  |  * @return true if the trie is frozen, that is, immutable, ready for serialization  | 
302  |  |  *         and for use with fast macros  | 
303  |  |  */  | 
304  |  | U_CAPI UBool U_EXPORT2  | 
305  |  | utrie2_isFrozen(const UTrie2 *trie);  | 
306  |  |  | 
307  |  | /**  | 
308  |  |  * Serialize a frozen trie into 32-bit aligned memory.  | 
309  |  |  * If the trie is not frozen, then the function returns with a U_ILLEGAL_ARGUMENT_ERROR.  | 
310  |  |  * A trie can be serialized multiple times.  | 
311  |  |  *  | 
312  |  |  * @param trie the frozen trie  | 
313  |  |  * @param data a pointer to 32-bit-aligned memory to be filled with the trie data,  | 
314  |  |  *             can be NULL if capacity==0  | 
315  |  |  * @param capacity the number of bytes available at data,  | 
316  |  |  *                 or 0 for preflighting  | 
317  |  |  * @param pErrorCode an in/out ICU UErrorCode; among other possible error codes:  | 
318  |  |  * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization  | 
319  |  |  * - U_ILLEGAL_ARGUMENT_ERROR if the trie is not frozen or the data and capacity  | 
320  |  |  *                            parameters are bad  | 
321  |  |  * @return the number of bytes written or needed for the trie  | 
322  |  |  *  | 
323  |  |  * @see utrie2_openFromSerialized()  | 
324  |  |  */  | 
325  |  | U_CAPI int32_t U_EXPORT2  | 
326  |  | utrie2_serialize(const UTrie2 *trie,  | 
327  |  |                  void *data, int32_t capacity,  | 
328  |  |                  UErrorCode *pErrorCode);  | 
329  |  |  | 
330  |  | /* Public UTrie2 API: miscellaneous functions ------------------------------- */  | 
331  |  |  | 
332  |  | /**  | 
333  |  |  * Build a UTrie2 (version 2) from a UTrie (version 1).  | 
334  |  |  * Enumerates all values in the UTrie and builds a UTrie2 with the same values.  | 
335  |  |  * The resulting UTrie2 will be frozen.  | 
336  |  |  *  | 
337  |  |  * @param trie1 the runtime UTrie structure to be enumerated  | 
338  |  |  * @param errorValue the value for out-of-range code points and illegal UTF-8  | 
339  |  |  * @param pErrorCode an in/out ICU UErrorCode  | 
340  |  |  * @return The frozen UTrie2 with the same values as the UTrie.  | 
341  |  |  */  | 
342  |  | U_CAPI UTrie2 * U_EXPORT2  | 
343  |  | utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode);  | 
344  |  |  | 
345  |  | /* Public UTrie2 API macros ------------------------------------------------- */  | 
346  |  |  | 
347  |  | /*  | 
348  |  |  * These macros provide fast data lookup from a frozen trie.  | 
349  |  |  * They will crash when used on an unfrozen trie.  | 
350  |  |  */  | 
351  |  |  | 
352  |  | /**  | 
353  |  |  * Return a 16-bit trie value from a code point, with range checking.  | 
354  |  |  * Returns trie->errorValue if c is not in the range 0..U+10ffff.  | 
355  |  |  *  | 
356  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
357  |  |  * @param c (UChar32, in) the input code point  | 
358  |  |  * @return (uint16_t) The code point's trie value.  | 
359  |  |  */  | 
360  | 0  | #define UTRIE2_GET16(trie, c) _UTRIE2_GET((trie), index, (trie)->indexLength, (c))  | 
361  |  |  | 
362  |  | /**  | 
363  |  |  * Return a 32-bit trie value from a code point, with range checking.  | 
364  |  |  * Returns trie->errorValue if c is not in the range 0..U+10ffff.  | 
365  |  |  *  | 
366  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
367  |  |  * @param c (UChar32, in) the input code point  | 
368  |  |  * @return (uint32_t) The code point's trie value.  | 
369  |  |  */  | 
370  | 0  | #define UTRIE2_GET32(trie, c) _UTRIE2_GET((trie), data32, 0, (c))  | 
371  |  |  | 
372  |  | /**  | 
373  |  |  * UTF-16: Get the next code point (UChar32 c, out), post-increment src,  | 
374  |  |  * and get a 16-bit value from the trie.  | 
375  |  |  *  | 
376  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
377  |  |  * @param src (const UChar *, in/out) the source text pointer  | 
378  |  |  * @param limit (const UChar *, in) the limit pointer for the text, or NULL if NUL-terminated  | 
379  |  |  * @param c (UChar32, out) variable for the code point  | 
380  |  |  * @param result (uint16_t, out) uint16_t variable for the trie lookup result  | 
381  |  |  */  | 
382  | 0  | #define UTRIE2_U16_NEXT16(trie, src, limit, c, result) _UTRIE2_U16_NEXT(trie, index, src, limit, c, result)  | 
383  |  |  | 
384  |  | /**  | 
385  |  |  * UTF-16: Get the next code point (UChar32 c, out), post-increment src,  | 
386  |  |  * and get a 32-bit value from the trie.  | 
387  |  |  *  | 
388  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
389  |  |  * @param src (const UChar *, in/out) the source text pointer  | 
390  |  |  * @param limit (const UChar *, in) the limit pointer for the text, or NULL if NUL-terminated  | 
391  |  |  * @param c (UChar32, out) variable for the code point  | 
392  |  |  * @param result (uint32_t, out) uint32_t variable for the trie lookup result  | 
393  |  |  */  | 
394  |  | #define UTRIE2_U16_NEXT32(trie, src, limit, c, result) _UTRIE2_U16_NEXT(trie, data32, src, limit, c, result)  | 
395  |  |  | 
396  |  | /**  | 
397  |  |  * UTF-16: Get the previous code point (UChar32 c, out), pre-decrement src,  | 
398  |  |  * and get a 16-bit value from the trie.  | 
399  |  |  *  | 
400  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
401  |  |  * @param start (const UChar *, in) the start pointer for the text  | 
402  |  |  * @param src (const UChar *, in/out) the source text pointer  | 
403  |  |  * @param c (UChar32, out) variable for the code point  | 
404  |  |  * @param result (uint16_t, out) uint16_t variable for the trie lookup result  | 
405  |  |  */  | 
406  | 0  | #define UTRIE2_U16_PREV16(trie, start, src, c, result) _UTRIE2_U16_PREV(trie, index, start, src, c, result)  | 
407  |  |  | 
408  |  | /**  | 
409  |  |  * UTF-16: Get the previous code point (UChar32 c, out), pre-decrement src,  | 
410  |  |  * and get a 32-bit value from the trie.  | 
411  |  |  *  | 
412  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
413  |  |  * @param start (const UChar *, in) the start pointer for the text  | 
414  |  |  * @param src (const UChar *, in/out) the source text pointer  | 
415  |  |  * @param c (UChar32, out) variable for the code point  | 
416  |  |  * @param result (uint32_t, out) uint32_t variable for the trie lookup result  | 
417  |  |  */  | 
418  |  | #define UTRIE2_U16_PREV32(trie, start, src, c, result) _UTRIE2_U16_PREV(trie, data32, start, src, c, result)  | 
419  |  |  | 
420  |  | /**  | 
421  |  |  * UTF-8: Post-increment src and get a 16-bit value from the trie.  | 
422  |  |  *  | 
423  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
424  |  |  * @param src (const char *, in/out) the source text pointer  | 
425  |  |  * @param limit (const char *, in) the limit pointer for the text (must not be NULL)  | 
426  |  |  * @param result (uint16_t, out) uint16_t variable for the trie lookup result  | 
427  |  |  */  | 
428  |  | #define UTRIE2_U8_NEXT16(trie, src, limit, result)\  | 
429  |  |     _UTRIE2_U8_NEXT(trie, data16, index, src, limit, result)  | 
430  |  |  | 
431  |  | /**  | 
432  |  |  * UTF-8: Post-increment src and get a 32-bit value from the trie.  | 
433  |  |  *  | 
434  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
435  |  |  * @param src (const char *, in/out) the source text pointer  | 
436  |  |  * @param limit (const char *, in) the limit pointer for the text (must not be NULL)  | 
437  |  |  * @param result (uint16_t, out) uint32_t variable for the trie lookup result  | 
438  |  |  */  | 
439  |  | #define UTRIE2_U8_NEXT32(trie, src, limit, result) \  | 
440  |  |     _UTRIE2_U8_NEXT(trie, data32, data32, src, limit, result)  | 
441  |  |  | 
442  |  | /**  | 
443  |  |  * UTF-8: Pre-decrement src and get a 16-bit value from the trie.  | 
444  |  |  *  | 
445  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
446  |  |  * @param start (const char *, in) the start pointer for the text  | 
447  |  |  * @param src (const char *, in/out) the source text pointer  | 
448  |  |  * @param result (uint16_t, out) uint16_t variable for the trie lookup result  | 
449  |  |  */  | 
450  |  | #define UTRIE2_U8_PREV16(trie, start, src, result) \  | 
451  |  |     _UTRIE2_U8_PREV(trie, data16, index, start, src, result)  | 
452  |  |  | 
453  |  | /**  | 
454  |  |  * UTF-8: Pre-decrement src and get a 32-bit value from the trie.  | 
455  |  |  *  | 
456  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
457  |  |  * @param start (const char *, in) the start pointer for the text  | 
458  |  |  * @param src (const char *, in/out) the source text pointer  | 
459  |  |  * @param result (uint16_t, out) uint32_t variable for the trie lookup result  | 
460  |  |  */  | 
461  |  | #define UTRIE2_U8_PREV32(trie, start, src, result) \  | 
462  |  |     _UTRIE2_U8_PREV(trie, data32, data32, start, src, result)  | 
463  |  |  | 
464  |  | /* Public UTrie2 API: optimized UTF-16 access ------------------------------- */  | 
465  |  |  | 
466  |  | /*  | 
467  |  |  * The following functions and macros are used for highly optimized UTF-16  | 
468  |  |  * text processing. The UTRIE2_U16_NEXTxy() macros do not depend on these.  | 
469  |  |  *  | 
470  |  |  * A UTrie2 stores separate values for lead surrogate code _units_ vs. code _points_.  | 
471  |  |  * UTF-16 text processing can be optimized by detecting surrogate pairs and  | 
472  |  |  * assembling supplementary code points only when there is non-trivial data  | 
473  |  |  * available.  | 
474  |  |  *  | 
475  |  |  * At build-time, use utrie2_enumForLeadSurrogate() to see if there  | 
476  |  |  * is non-trivial (non-initialValue) data for any of the supplementary  | 
477  |  |  * code points associated with a lead surrogate.  | 
478  |  |  * If so, then set a special (application-specific) value for the  | 
479  |  |  * lead surrogate code _unit_, with utrie2_set32ForLeadSurrogateCodeUnit().  | 
480  |  |  *  | 
481  |  |  * At runtime, use UTRIE2_GET16_FROM_U16_SINGLE_LEAD() or  | 
482  |  |  * UTRIE2_GET32_FROM_U16_SINGLE_LEAD() per code unit. If there is non-trivial  | 
483  |  |  * data and the code unit is a lead surrogate, then check if a trail surrogate  | 
484  |  |  * follows. If so, assemble the supplementary code point with  | 
485  |  |  * U16_GET_SUPPLEMENTARY() and look up its value with UTRIE2_GET16_FROM_SUPP()  | 
486  |  |  * or UTRIE2_GET32_FROM_SUPP(); otherwise reset the lead  | 
487  |  |  * surrogate's value or do a code point lookup for it.  | 
488  |  |  *  | 
489  |  |  * If there is only trivial data for lead and trail surrogates, then processing  | 
490  |  |  * can often skip them. For example, in normalization or case mapping  | 
491  |  |  * all characters that do not have any mappings are simply copied as is.  | 
492  |  |  */  | 
493  |  |  | 
494  |  | /**  | 
495  |  |  * Get a value from a lead surrogate code unit as stored in the trie.  | 
496  |  |  *  | 
497  |  |  * @param trie the trie  | 
498  |  |  * @param c the code unit (U+D800..U+DBFF)  | 
499  |  |  * @return the value  | 
500  |  |  */  | 
501  |  | U_CAPI uint32_t U_EXPORT2  | 
502  |  | utrie2_get32FromLeadSurrogateCodeUnit(const UTrie2 *trie, UChar32 c);  | 
503  |  |  | 
504  |  | /**  | 
505  |  |  * Enumerate the trie values for the 1024=0x400 code points  | 
506  |  |  * corresponding to a given lead surrogate.  | 
507  |  |  * For example, for the lead surrogate U+D87E it will enumerate the values  | 
508  |  |  * for [U+2F800..U+2FC00[.  | 
509  |  |  * Used by data builder code that sets special lead surrogate code unit values  | 
510  |  |  * for optimized UTF-16 string processing.  | 
511  |  |  *  | 
512  |  |  * Do not modify the trie during the enumeration.  | 
513  |  |  *  | 
514  |  |  * Except for the limited code point range, this functions just like utrie2_enum():  | 
515  |  |  * For each entry in the trie, the value to be delivered is passed through  | 
516  |  |  * the UTrie2EnumValue function.  | 
517  |  |  * The value is unchanged if that function pointer is NULL.  | 
518  |  |  *  | 
519  |  |  * For each contiguous range of code points with a given (transformed) value,  | 
520  |  |  * the UTrie2EnumRange function is called.  | 
521  |  |  *  | 
522  |  |  * @param trie a pointer to the trie  | 
523  |  |  * @param enumValue a pointer to a function that may transform the trie entry value,  | 
524  |  |  *                  or NULL if the values from the trie are to be used directly  | 
525  |  |  * @param enumRange a pointer to a function that is called for each contiguous range  | 
526  |  |  *                  of code points with the same (transformed) value  | 
527  |  |  * @param context an opaque pointer that is passed on to the callback functions  | 
528  |  |  */  | 
529  |  | U_CAPI void U_EXPORT2  | 
530  |  | utrie2_enumForLeadSurrogate(const UTrie2 *trie, UChar32 lead,  | 
531  |  |                             UTrie2EnumValue *enumValue, UTrie2EnumRange *enumRange,  | 
532  |  |                             const void *context);  | 
533  |  |  | 
534  |  | /**  | 
535  |  |  * Set a value for a lead surrogate code unit.  | 
536  |  |  *  | 
537  |  |  * @param trie the unfrozen trie  | 
538  |  |  * @param lead the lead surrogate code unit (U+D800..U+DBFF)  | 
539  |  |  * @param value the value  | 
540  |  |  * @param pErrorCode an in/out ICU UErrorCode; among other possible error codes:  | 
541  |  |  * - U_NO_WRITE_PERMISSION if the trie is frozen  | 
542  |  |  */  | 
543  |  | U_CAPI void U_EXPORT2  | 
544  |  | utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,  | 
545  |  |                                      UChar32 lead, uint32_t value,  | 
546  |  |                                      UErrorCode *pErrorCode);  | 
547  |  |  | 
548  |  | /**  | 
549  |  |  * Return a 16-bit trie value from a UTF-16 single/lead code unit (<=U+ffff).  | 
550  |  |  * Same as UTRIE2_GET16() if c is a BMP code point except for lead surrogates,  | 
551  |  |  * but smaller and faster.  | 
552  |  |  *  | 
553  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
554  |  |  * @param c (UChar32, in) the input code unit, must be 0<=c<=U+ffff  | 
555  |  |  * @return (uint16_t) The code unit's trie value.  | 
556  |  |  */  | 
557  | 0  | #define UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, c) _UTRIE2_GET_FROM_U16_SINGLE_LEAD((trie), index, c)  | 
558  |  |  | 
559  |  | /**  | 
560  |  |  * Return a 32-bit trie value from a UTF-16 single/lead code unit (<=U+ffff).  | 
561  |  |  * Same as UTRIE2_GET32() if c is a BMP code point except for lead surrogates,  | 
562  |  |  * but smaller and faster.  | 
563  |  |  *  | 
564  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
565  |  |  * @param c (UChar32, in) the input code unit, must be 0<=c<=U+ffff  | 
566  |  |  * @return (uint32_t) The code unit's trie value.  | 
567  |  |  */  | 
568  | 0  | #define UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c) _UTRIE2_GET_FROM_U16_SINGLE_LEAD((trie), data32, c)  | 
569  |  |  | 
570  |  | /**  | 
571  |  |  * Return a 16-bit trie value from a supplementary code point (U+10000..U+10ffff).  | 
572  |  |  *  | 
573  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
574  |  |  * @param c (UChar32, in) the input code point, must be U+10000<=c<=U+10ffff  | 
575  |  |  * @return (uint16_t) The code point's trie value.  | 
576  |  |  */  | 
577  |  | #define UTRIE2_GET16_FROM_SUPP(trie, c) _UTRIE2_GET_FROM_SUPP((trie), index, c)  | 
578  |  |  | 
579  |  | /**  | 
580  |  |  * Return a 32-bit trie value from a supplementary code point (U+10000..U+10ffff).  | 
581  |  |  *  | 
582  |  |  * @param trie (const UTrie2 *, in) a frozen trie  | 
583  |  |  * @param c (UChar32, in) the input code point, must be U+10000<=c<=U+10ffff  | 
584  |  |  * @return (uint32_t) The code point's trie value.  | 
585  |  |  */  | 
586  | 0  | #define UTRIE2_GET32_FROM_SUPP(trie, c) _UTRIE2_GET_FROM_SUPP((trie), data32, c)  | 
587  |  |  | 
588  |  | U_CDECL_END  | 
589  |  |  | 
590  |  | /* C++ convenience wrappers ------------------------------------------------- */  | 
591  |  |  | 
592  |  | #ifdef __cplusplus  | 
593  |  |  | 
594  |  | #include "unicode/utf.h"  | 
595  |  | #include "mutex.h"  | 
596  |  |  | 
597  |  | U_NAMESPACE_BEGIN  | 
598  |  |  | 
599  |  | // Use the Forward/Backward subclasses below.  | 
600  |  | class UTrie2StringIterator : public UMemory { | 
601  |  | public:  | 
602  |  |     UTrie2StringIterator(const UTrie2 *t, const UChar *p) :  | 
603  | 0  |         trie(t), codePointStart(p), codePointLimit(p), codePoint(U_SENTINEL) {} | 
604  |  |  | 
605  |  |     const UTrie2 *trie;  | 
606  |  |     const UChar *codePointStart, *codePointLimit;  | 
607  |  |     UChar32 codePoint;  | 
608  |  | };  | 
609  |  |  | 
610  |  | class BackwardUTrie2StringIterator : public UTrie2StringIterator { | 
611  |  | public:  | 
612  |  |     BackwardUTrie2StringIterator(const UTrie2 *t, const UChar *s, const UChar *p) :  | 
613  | 0  |         UTrie2StringIterator(t, p), start(s) {} | 
614  |  |  | 
615  |  |     uint16_t previous16();  | 
616  |  |  | 
617  |  |     const UChar *start;  | 
618  |  | };  | 
619  |  |  | 
620  |  | class ForwardUTrie2StringIterator : public UTrie2StringIterator { | 
621  |  | public:  | 
622  |  |     // Iteration limit l can be NULL.  | 
623  |  |     // In that case, the caller must detect c==0 and stop.  | 
624  |  |     ForwardUTrie2StringIterator(const UTrie2 *t, const UChar *p, const UChar *l) :  | 
625  | 0  |         UTrie2StringIterator(t, p), limit(l) {} | 
626  |  |  | 
627  |  |     uint16_t next16();  | 
628  |  |  | 
629  |  |     const UChar *limit;  | 
630  |  | };  | 
631  |  |  | 
632  |  | U_NAMESPACE_END  | 
633  |  |  | 
634  |  | #endif  | 
635  |  |  | 
636  |  | /* Internal definitions ----------------------------------------------------- */  | 
637  |  |  | 
638  |  | U_CDECL_BEGIN  | 
639  |  |  | 
640  |  | /** Build-time trie structure. */  | 
641  |  | struct UNewTrie2;  | 
642  |  | typedef struct UNewTrie2 UNewTrie2;  | 
643  |  |  | 
644  |  | /*  | 
645  |  |  * Trie structure definition.  | 
646  |  |  *  | 
647  |  |  * Either the data table is 16 bits wide and accessed via the index  | 
648  |  |  * pointer, with each index item increased by indexLength;  | 
649  |  |  * in this case, data32==NULL, and data16 is used for direct ASCII access.  | 
650  |  |  *  | 
651  |  |  * Or the data table is 32 bits wide and accessed via the data32 pointer.  | 
652  |  |  */  | 
653  |  | struct UTrie2 { | 
654  |  |     /* protected: used by macros and functions for reading values */  | 
655  |  |     const uint16_t *index;  | 
656  |  |     const uint16_t *data16;     /* for fast UTF-8 ASCII access, if 16b data */  | 
657  |  |     const uint32_t *data32;     /* NULL if 16b data is used via index */  | 
658  |  |  | 
659  |  |     int32_t indexLength, dataLength;  | 
660  |  |     uint16_t index2NullOffset;  /* 0xffff if there is no dedicated index-2 null block */  | 
661  |  |     uint16_t dataNullOffset;  | 
662  |  |     uint32_t initialValue;  | 
663  |  |     /** Value returned for out-of-range code points and illegal UTF-8. */  | 
664  |  |     uint32_t errorValue;  | 
665  |  |  | 
666  |  |     /* Start of the last range which ends at U+10ffff, and its value. */  | 
667  |  |     UChar32 highStart;  | 
668  |  |     int32_t highValueIndex;  | 
669  |  |  | 
670  |  |     /* private: used by builder and unserialization functions */  | 
671  |  |     void *memory;           /* serialized bytes; NULL if not frozen yet */  | 
672  |  |     int32_t length;         /* number of serialized bytes at memory; 0 if not frozen yet */  | 
673  |  |     UBool isMemoryOwned;    /* true if the trie owns the memory */  | 
674  |  |     UBool padding1;  | 
675  |  |     int16_t padding2;  | 
676  |  |     UNewTrie2 *newTrie;     /* builder object; NULL when frozen */  | 
677  |  |  | 
678  |  | #ifdef UTRIE2_DEBUG  | 
679  |  |     const char *name;  | 
680  |  | #endif  | 
681  |  | };  | 
682  |  |  | 
683  |  | /**  | 
684  |  |  * Trie constants, defining shift widths, index array lengths, etc.  | 
685  |  |  *  | 
686  |  |  * These are needed for the runtime macros but users can treat these as  | 
687  |  |  * implementation details and skip to the actual public API further below.  | 
688  |  |  */  | 
689  |  | enum { | 
690  |  |     /** Shift size for getting the index-1 table offset. */  | 
691  |  |     UTRIE2_SHIFT_1=6+5,  | 
692  |  |  | 
693  |  |     /** Shift size for getting the index-2 table offset. */  | 
694  |  |     UTRIE2_SHIFT_2=5,  | 
695  |  |  | 
696  |  |     /**  | 
697  |  |      * Difference between the two shift sizes,  | 
698  |  |      * for getting an index-1 offset from an index-2 offset. 6=11-5  | 
699  |  |      */  | 
700  |  |     UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2,  | 
701  |  |  | 
702  |  |     /**  | 
703  |  |      * Number of index-1 entries for the BMP. 32=0x20  | 
704  |  |      * This part of the index-1 table is omitted from the serialized form.  | 
705  |  |      */  | 
706  |  |     UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1,  | 
707  |  |  | 
708  |  |     /** Number of code points per index-1 table entry. 2048=0x800 */  | 
709  |  |     UTRIE2_CP_PER_INDEX_1_ENTRY=1<<UTRIE2_SHIFT_1,  | 
710  |  |  | 
711  |  |     /** Number of entries in an index-2 block. 64=0x40 */  | 
712  |  |     UTRIE2_INDEX_2_BLOCK_LENGTH=1<<UTRIE2_SHIFT_1_2,  | 
713  |  |  | 
714  |  |     /** Mask for getting the lower bits for the in-index-2-block offset. */  | 
715  |  |     UTRIE2_INDEX_2_MASK=UTRIE2_INDEX_2_BLOCK_LENGTH-1,  | 
716  |  |  | 
717  |  |     /** Number of entries in a data block. 32=0x20 */  | 
718  |  |     UTRIE2_DATA_BLOCK_LENGTH=1<<UTRIE2_SHIFT_2,  | 
719  |  |  | 
720  |  |     /** Mask for getting the lower bits for the in-data-block offset. */  | 
721  |  |     UTRIE2_DATA_MASK=UTRIE2_DATA_BLOCK_LENGTH-1,  | 
722  |  |  | 
723  |  |     /**  | 
724  |  |      * Shift size for shifting left the index array values.  | 
725  |  |      * Increases possible data size with 16-bit index values at the cost  | 
726  |  |      * of compactability.  | 
727  |  |      * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY.  | 
728  |  |      */  | 
729  |  |     UTRIE2_INDEX_SHIFT=2,  | 
730  |  |  | 
731  |  |     /** The alignment size of a data block. Also the granularity for compaction. */  | 
732  |  |     UTRIE2_DATA_GRANULARITY=1<<UTRIE2_INDEX_SHIFT,  | 
733  |  |  | 
734  |  |     /* Fixed layout of the first part of the index array. ------------------- */  | 
735  |  |  | 
736  |  |     /**  | 
737  |  |      * The BMP part of the index-2 table is fixed and linear and starts at offset 0.  | 
738  |  |      * Length=2048=0x800=0x10000>>UTRIE2_SHIFT_2.  | 
739  |  |      */  | 
740  |  |     UTRIE2_INDEX_2_OFFSET=0,  | 
741  |  |  | 
742  |  |     /**  | 
743  |  |      * The part of the index-2 table for U+D800..U+DBFF stores values for  | 
744  |  |      * lead surrogate code _units_ not code _points_.  | 
745  |  |      * Values for lead surrogate code _points_ are indexed with this portion of the table.  | 
746  |  |      * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.)  | 
747  |  |      */  | 
748  |  |     UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2,  | 
749  |  |     UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2,  | 
750  |  |  | 
751  |  |     /** Count the lengths of both BMP pieces. 2080=0x820 */  | 
752  |  |     UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH,  | 
753  |  |  | 
754  |  |     /**  | 
755  |  |      * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.  | 
756  |  |      * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2.  | 
757  |  |      */  | 
758  |  |     UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH,  | 
759  |  |     UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6,  /* U+0800 is the first code point after 2-byte UTF-8 */  | 
760  |  |  | 
761  |  |     /**  | 
762  |  |      * The index-1 table, only used for supplementary code points, at offset 2112=0x840.  | 
763  |  |      * Variable length, for code points up to highStart, where the last single-value range starts.  | 
764  |  |      * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1.  | 
765  |  |      * (For 0x100000 supplementary code points U+10000..U+10ffff.)  | 
766  |  |      *  | 
767  |  |      * The part of the index-2 table for supplementary code points starts  | 
768  |  |      * after this index-1 table.  | 
769  |  |      *  | 
770  |  |      * Both the index-1 table and the following part of the index-2 table  | 
771  |  |      * are omitted completely if there is only BMP data.  | 
772  |  |      */  | 
773  |  |     UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH,  | 
774  |  |     UTRIE2_MAX_INDEX_1_LENGTH=0x100000>>UTRIE2_SHIFT_1,  | 
775  |  |  | 
776  |  |     /*  | 
777  |  |      * Fixed layout of the first part of the data array. -----------------------  | 
778  |  |      * Starts with 4 blocks (128=0x80 entries) for ASCII.  | 
779  |  |      */  | 
780  |  |  | 
781  |  |     /**  | 
782  |  |      * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80.  | 
783  |  |      * Used with linear access for single bytes 0..0xbf for simple error handling.  | 
784  |  |      * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH.  | 
785  |  |      */  | 
786  |  |     UTRIE2_BAD_UTF8_DATA_OFFSET=0x80,  | 
787  |  |  | 
788  |  |     /** The start of non-linear-ASCII data blocks, at offset 192=0xc0. */  | 
789  |  |     UTRIE2_DATA_START_OFFSET=0xc0  | 
790  |  | };  | 
791  |  |  | 
792  |  | /* Internal functions and macros -------------------------------------------- */  | 
793  |  |  | 
794  |  | /**  | 
795  |  |  * Internal function for part of the UTRIE2_U8_NEXTxx() macro implementations.  | 
796  |  |  * Do not call directly.  | 
797  |  |  * @internal  | 
798  |  |  */  | 
799  |  | U_CAPI int32_t U_EXPORT2  | 
800  |  | utrie2_internalU8NextIndex(const UTrie2 *trie, UChar32 c,  | 
801  |  |                            const uint8_t *src, const uint8_t *limit);  | 
802  |  |  | 
803  |  | /**  | 
804  |  |  * Internal function for part of the UTRIE2_U8_PREVxx() macro implementations.  | 
805  |  |  * Do not call directly.  | 
806  |  |  * @internal  | 
807  |  |  */  | 
808  |  | U_CAPI int32_t U_EXPORT2  | 
809  |  | utrie2_internalU8PrevIndex(const UTrie2 *trie, UChar32 c,  | 
810  |  |                            const uint8_t *start, const uint8_t *src);  | 
811  |  |  | 
812  |  |  | 
813  |  | /** Internal low-level trie getter. Returns a data index. */  | 
814  |  | #define _UTRIE2_INDEX_RAW(offset, trieIndex, c) \  | 
815  | 0  |     (((int32_t)((trieIndex)[(offset)+((c)>>UTRIE2_SHIFT_2)]) \  | 
816  | 0  |     <<UTRIE2_INDEX_SHIFT)+ \  | 
817  | 0  |     ((c)&UTRIE2_DATA_MASK))  | 
818  |  |  | 
819  |  | /** Internal trie getter from a UTF-16 single/lead code unit. Returns the data index. */  | 
820  | 0  | #define _UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(trieIndex, c) _UTRIE2_INDEX_RAW(0, trieIndex, c)  | 
821  |  |  | 
822  |  | /** Internal trie getter from a lead surrogate code point (D800..DBFF). Returns the data index. */  | 
823  |  | #define _UTRIE2_INDEX_FROM_LSCP(trieIndex, c) \  | 
824  | 0  |     _UTRIE2_INDEX_RAW(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2), trieIndex, c)  | 
825  |  |  | 
826  |  | /** Internal trie getter from a BMP code point. Returns the data index. */  | 
827  |  | #define _UTRIE2_INDEX_FROM_BMP(trieIndex, c) \  | 
828  | 0  |     _UTRIE2_INDEX_RAW(U_IS_LEAD(c) ? UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2) : 0, \  | 
829  | 0  |                       trieIndex, c)  | 
830  |  |  | 
831  |  | /** Internal trie getter from a supplementary code point below highStart. Returns the data index. */  | 
832  |  | #define _UTRIE2_INDEX_FROM_SUPP(trieIndex, c) \  | 
833  | 0  |     (((int32_t)((trieIndex)[ \  | 
834  | 0  |         (trieIndex)[(UTRIE2_INDEX_1_OFFSET-UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)+ \  | 
835  | 0  |                       ((c)>>UTRIE2_SHIFT_1)]+ \  | 
836  | 0  |         (((c)>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK)]) \  | 
837  | 0  |     <<UTRIE2_INDEX_SHIFT)+ \  | 
838  | 0  |     ((c)&UTRIE2_DATA_MASK))  | 
839  |  |  | 
840  |  | /**  | 
841  |  |  * Internal trie getter from a code point, with checking that c is in 0..10FFFF.  | 
842  |  |  * Returns the data index.  | 
843  |  |  */  | 
844  |  | #define _UTRIE2_INDEX_FROM_CP(trie, asciiOffset, c) \  | 
845  | 0  |     ((uint32_t)(c)<0xd800 ? \  | 
846  | 0  |         _UTRIE2_INDEX_RAW(0, (trie)->index, c) : \  | 
847  | 0  |         (uint32_t)(c)<=0xffff ? \  | 
848  | 0  |             _UTRIE2_INDEX_RAW( \  | 
849  | 0  |                 (c)<=0xdbff ? UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2) : 0, \  | 
850  | 0  |                 (trie)->index, c) : \  | 
851  | 0  |             (uint32_t)(c)>0x10ffff ? \  | 
852  | 0  |                 (asciiOffset)+UTRIE2_BAD_UTF8_DATA_OFFSET : \  | 
853  | 0  |                 (c)>=(trie)->highStart ? \  | 
854  | 0  |                     (trie)->highValueIndex : \  | 
855  | 0  |                     _UTRIE2_INDEX_FROM_SUPP((trie)->index, c))  | 
856  |  |  | 
857  |  | /** Internal trie getter from a UTF-16 single/lead code unit. Returns the data. */  | 
858  |  | #define _UTRIE2_GET_FROM_U16_SINGLE_LEAD(trie, data, c) \  | 
859  | 0  |     (trie)->data[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD((trie)->index, c)]  | 
860  |  |  | 
861  |  | /** Internal trie getter from a supplementary code point. Returns the data. */  | 
862  |  | #define _UTRIE2_GET_FROM_SUPP(trie, data, c) \  | 
863  | 0  |     (trie)->data[(c)>=(trie)->highStart ? (trie)->highValueIndex : \  | 
864  | 0  |                  _UTRIE2_INDEX_FROM_SUPP((trie)->index, c)]  | 
865  |  |  | 
866  |  | /**  | 
867  |  |  * Internal trie getter from a code point, with checking that c is in 0..10FFFF.  | 
868  |  |  * Returns the data.  | 
869  |  |  */  | 
870  |  | #define _UTRIE2_GET(trie, data, asciiOffset, c) \  | 
871  | 0  |     (trie)->data[_UTRIE2_INDEX_FROM_CP(trie, asciiOffset, c)]  | 
872  |  |  | 
873  |  | /** Internal next-post-increment: get the next code point (c) and its data. */  | 
874  | 0  | #define _UTRIE2_U16_NEXT(trie, data, src, limit, c, result) UPRV_BLOCK_MACRO_BEGIN { \ | 
875  | 0  |     { \ | 
876  | 0  |         uint16_t __c2; \  | 
877  | 0  |         (c)=*(src)++; \  | 
878  | 0  |         if(!U16_IS_LEAD(c)) { \ | 
879  | 0  |             (result)=_UTRIE2_GET_FROM_U16_SINGLE_LEAD(trie, data, c); \  | 
880  | 0  |         } else if((src)==(limit) || !U16_IS_TRAIL(__c2=*(src))) { \ | 
881  | 0  |             (result)=(trie)->data[_UTRIE2_INDEX_FROM_LSCP((trie)->index, c)]; \  | 
882  | 0  |         } else { \ | 
883  | 0  |             ++(src); \  | 
884  | 0  |             (c)=U16_GET_SUPPLEMENTARY((c), __c2); \  | 
885  | 0  |             (result)=_UTRIE2_GET_FROM_SUPP((trie), data, (c)); \  | 
886  | 0  |         } \  | 
887  | 0  |     } \  | 
888  | 0  | } UPRV_BLOCK_MACRO_END  | 
889  |  |  | 
890  |  | /** Internal pre-decrement-previous: get the previous code point (c) and its data */  | 
891  | 0  | #define _UTRIE2_U16_PREV(trie, data, start, src, c, result) UPRV_BLOCK_MACRO_BEGIN { \ | 
892  | 0  |     { \ | 
893  | 0  |         uint16_t __c2; \  | 
894  | 0  |         (c)=*--(src); \  | 
895  | 0  |         if(!U16_IS_TRAIL(c) || (src)==(start) || !U16_IS_LEAD(__c2=*((src)-1))) { \ | 
896  | 0  |             (result)=(trie)->data[_UTRIE2_INDEX_FROM_BMP((trie)->index, c)]; \  | 
897  | 0  |         } else { \ | 
898  | 0  |             --(src); \  | 
899  | 0  |             (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \  | 
900  | 0  |             (result)=_UTRIE2_GET_FROM_SUPP((trie), data, (c)); \  | 
901  | 0  |         } \  | 
902  | 0  |     } \  | 
903  | 0  | } UPRV_BLOCK_MACRO_END  | 
904  |  |  | 
905  |  | /** Internal UTF-8 next-post-increment: get the next code point's data. */  | 
906  |  | #define _UTRIE2_U8_NEXT(trie, ascii, data, src, limit, result) UPRV_BLOCK_MACRO_BEGIN { \ | 
907  |  |     uint8_t __lead=(uint8_t)*(src)++; \  | 
908  |  |     if(U8_IS_SINGLE(__lead)) { \ | 
909  |  |         (result)=(trie)->ascii[__lead]; \  | 
910  |  |     } else { \ | 
911  |  |         uint8_t __t1, __t2; \  | 
912  |  |         if( /* handle U+0800..U+FFFF inline */ \  | 
913  |  |             0xe0<=__lead && __lead<0xf0 && ((src)+1)<(limit) && \  | 
914  |  |             U8_IS_VALID_LEAD3_AND_T1(__lead, __t1=(uint8_t)*(src)) && \  | 
915  |  |             (__t2=(uint8_t)(*((src)+1)-0x80))<= 0x3f \  | 
916  |  |         ) { \ | 
917  |  |             (src)+=2; \  | 
918  |  |             (result)=(trie)->data[ \  | 
919  |  |                 ((int32_t)((trie)->index[((__lead-0xe0)<<(12-UTRIE2_SHIFT_2))+ \  | 
920  |  |                                          ((__t1&0x3f)<<(6-UTRIE2_SHIFT_2))+(__t2>>UTRIE2_SHIFT_2)]) \  | 
921  |  |                 <<UTRIE2_INDEX_SHIFT)+ \  | 
922  |  |                 (__t2&UTRIE2_DATA_MASK)]; \  | 
923  |  |         } else if( /* handle U+0080..U+07FF inline */ \  | 
924  |  |             __lead<0xe0 && __lead>=0xc2 && (src)<(limit) && \  | 
925  |  |             (__t1=(uint8_t)(*(src)-0x80))<=0x3f \  | 
926  |  |         ) { \ | 
927  |  |             ++(src); \  | 
928  |  |             (result)=(trie)->data[ \  | 
929  |  |                 (trie)->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET-0xc0)+__lead]+ \  | 
930  |  |                 __t1]; \  | 
931  |  |         } else { \ | 
932  |  |             int32_t __index=utrie2_internalU8NextIndex((trie), __lead, (const uint8_t *)(src), \  | 
933  |  |                                                                        (const uint8_t *)(limit)); \  | 
934  |  |             (src)+=__index&7; \  | 
935  |  |             (result)=(trie)->data[__index>>3]; \  | 
936  |  |         } \  | 
937  |  |     } \  | 
938  |  | } UPRV_BLOCK_MACRO_END  | 
939  |  |  | 
940  |  | /** Internal UTF-8 pre-decrement-previous: get the previous code point's data. */  | 
941  |  | #define _UTRIE2_U8_PREV(trie, ascii, data, start, src, result) UPRV_BLOCK_MACRO_BEGIN { \ | 
942  |  |     uint8_t __b=(uint8_t)*--(src); \  | 
943  |  |     if(U8_IS_SINGLE(__b)) { \ | 
944  |  |         (result)=(trie)->ascii[__b]; \  | 
945  |  |     } else { \ | 
946  |  |         int32_t __index=utrie2_internalU8PrevIndex((trie), __b, (const uint8_t *)(start), \  | 
947  |  |                                                                 (const uint8_t *)(src)); \  | 
948  |  |         (src)-=__index&7; \  | 
949  |  |         (result)=(trie)->data[__index>>3]; \  | 
950  |  |     } \  | 
951  |  | } UPRV_BLOCK_MACRO_END  | 
952  |  |  | 
953  |  | U_CDECL_END  | 
954  |  |  | 
955  |  | #endif  |