/src/icu/source/common/utrie.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | ******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 2001-2012, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | ******************************************************************************  | 
10  |  | *   file name:  utrie.cpp  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:4  | 
14  |  | *  | 
15  |  | *   created on: 2001oct20  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | *  | 
18  |  | *   This is a common implementation of a "folded" trie.  | 
19  |  | *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with  | 
20  |  | *   Unicode code points (0..0x10ffff).  | 
21  |  | */  | 
22  |  |  | 
23  |  | #ifdef UTRIE_DEBUG  | 
24  |  | #   include <stdio.h>  | 
25  |  | #endif  | 
26  |  |  | 
27  |  | #include "unicode/utypes.h"  | 
28  |  | #include "cmemory.h"  | 
29  |  | #include "utrie.h"  | 
30  |  |  | 
31  |  | /* miscellaneous ------------------------------------------------------------ */  | 
32  |  |  | 
33  |  | #undef ABS  | 
34  | 0  | #define ABS(x) ((x)>=0 ? (x) : -(x))  | 
35  |  |  | 
36  |  | static inline UBool  | 
37  | 0  | equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) { | 
38  | 0  |     while(length>0 && *s==*t) { | 
39  | 0  |         ++s;  | 
40  | 0  |         ++t;  | 
41  | 0  |         --length;  | 
42  | 0  |     }  | 
43  | 0  |     return (UBool)(length==0);  | 
44  | 0  | }  | 
45  |  |  | 
46  |  | /* Building a trie ----------------------------------------------------------*/  | 
47  |  |  | 
48  |  | U_CAPI UNewTrie * U_EXPORT2  | 
49  |  | utrie_open(UNewTrie *fillIn,  | 
50  |  |            uint32_t *aliasData, int32_t maxDataLength,  | 
51  |  |            uint32_t initialValue, uint32_t leadUnitValue,  | 
52  | 0  |            UBool latin1Linear) { | 
53  | 0  |     UNewTrie *trie;  | 
54  | 0  |     int32_t i, j;  | 
55  |  | 
  | 
56  | 0  |     if( maxDataLength<UTRIE_DATA_BLOCK_LENGTH ||  | 
57  | 0  |         (latin1Linear && maxDataLength<1024)  | 
58  | 0  |     ) { | 
59  | 0  |         return NULL;  | 
60  | 0  |     }  | 
61  |  |  | 
62  | 0  |     if(fillIn!=NULL) { | 
63  | 0  |         trie=fillIn;  | 
64  | 0  |     } else { | 
65  | 0  |         trie=(UNewTrie *)uprv_malloc(sizeof(UNewTrie));  | 
66  | 0  |         if(trie==NULL) { | 
67  | 0  |             return NULL;  | 
68  | 0  |         }  | 
69  | 0  |     }  | 
70  | 0  |     uprv_memset(trie, 0, sizeof(UNewTrie));  | 
71  | 0  |     trie->isAllocated= (UBool)(fillIn==NULL);  | 
72  |  | 
  | 
73  | 0  |     if(aliasData!=NULL) { | 
74  | 0  |         trie->data=aliasData;  | 
75  | 0  |         trie->isDataAllocated=FALSE;  | 
76  | 0  |     } else { | 
77  | 0  |         trie->data=(uint32_t *)uprv_malloc(maxDataLength*4);  | 
78  | 0  |         if(trie->data==NULL) { | 
79  | 0  |             uprv_free(trie);  | 
80  | 0  |             return NULL;  | 
81  | 0  |         }  | 
82  | 0  |         trie->isDataAllocated=TRUE;  | 
83  | 0  |     }  | 
84  |  |  | 
85  |  |     /* preallocate and reset the first data block (block index 0) */  | 
86  | 0  |     j=UTRIE_DATA_BLOCK_LENGTH;  | 
87  |  | 
  | 
88  | 0  |     if(latin1Linear) { | 
89  |  |         /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */  | 
90  |  |         /* made sure above that maxDataLength>=1024 */  | 
91  |  |  | 
92  |  |         /* set indexes to point to consecutive data blocks */  | 
93  | 0  |         i=0;  | 
94  | 0  |         do { | 
95  |  |             /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */  | 
96  | 0  |             trie->index[i++]=j;  | 
97  | 0  |             j+=UTRIE_DATA_BLOCK_LENGTH;  | 
98  | 0  |         } while(i<(256>>UTRIE_SHIFT));  | 
99  | 0  |     }  | 
100  |  |  | 
101  |  |     /* reset the initially allocated blocks to the initial value */  | 
102  | 0  |     trie->dataLength=j;  | 
103  | 0  |     while(j>0) { | 
104  | 0  |         trie->data[--j]=initialValue;  | 
105  | 0  |     }  | 
106  |  | 
  | 
107  | 0  |     trie->leadUnitValue=leadUnitValue;  | 
108  | 0  |     trie->indexLength=UTRIE_MAX_INDEX_LENGTH;  | 
109  | 0  |     trie->dataCapacity=maxDataLength;  | 
110  | 0  |     trie->isLatin1Linear=latin1Linear;  | 
111  | 0  |     trie->isCompacted=FALSE;  | 
112  | 0  |     return trie;  | 
113  | 0  | }  | 
114  |  |  | 
115  |  | U_CAPI UNewTrie * U_EXPORT2  | 
116  | 0  | utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) { | 
117  | 0  |     UNewTrie *trie;  | 
118  | 0  |     UBool isDataAllocated;  | 
119  |  |  | 
120  |  |     /* do not clone if other is not valid or already compacted */  | 
121  | 0  |     if(other==NULL || other->data==NULL || other->isCompacted) { | 
122  | 0  |         return NULL;  | 
123  | 0  |     }  | 
124  |  |  | 
125  |  |     /* clone data */  | 
126  | 0  |     if(aliasData!=NULL && aliasDataCapacity>=other->dataCapacity) { | 
127  | 0  |         isDataAllocated=FALSE;  | 
128  | 0  |     } else { | 
129  | 0  |         aliasDataCapacity=other->dataCapacity;  | 
130  | 0  |         aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4);  | 
131  | 0  |         if(aliasData==NULL) { | 
132  | 0  |             return NULL;  | 
133  | 0  |         }  | 
134  | 0  |         isDataAllocated=TRUE;  | 
135  | 0  |     }  | 
136  |  |  | 
137  | 0  |     trie=utrie_open(fillIn, aliasData, aliasDataCapacity,  | 
138  | 0  |                     other->data[0], other->leadUnitValue,  | 
139  | 0  |                     other->isLatin1Linear);  | 
140  | 0  |     if(trie==NULL) { | 
141  | 0  |         uprv_free(aliasData);  | 
142  | 0  |     } else { | 
143  | 0  |         uprv_memcpy(trie->index, other->index, sizeof(trie->index));  | 
144  | 0  |         uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);  | 
145  | 0  |         trie->dataLength=other->dataLength;  | 
146  | 0  |         trie->isDataAllocated=isDataAllocated;  | 
147  | 0  |     }  | 
148  |  | 
  | 
149  | 0  |     return trie;  | 
150  | 0  | }  | 
151  |  |  | 
152  |  | U_CAPI void U_EXPORT2  | 
153  | 0  | utrie_close(UNewTrie *trie) { | 
154  | 0  |     if(trie!=NULL) { | 
155  | 0  |         if(trie->isDataAllocated) { | 
156  | 0  |             uprv_free(trie->data);  | 
157  | 0  |             trie->data=NULL;  | 
158  | 0  |         }  | 
159  | 0  |         if(trie->isAllocated) { | 
160  | 0  |             uprv_free(trie);  | 
161  | 0  |         }  | 
162  | 0  |     }  | 
163  | 0  | }  | 
164  |  |  | 
165  |  | U_CAPI uint32_t * U_EXPORT2  | 
166  | 0  | utrie_getData(UNewTrie *trie, int32_t *pLength) { | 
167  | 0  |     if(trie==NULL || pLength==NULL) { | 
168  | 0  |         return NULL;  | 
169  | 0  |     }  | 
170  |  |  | 
171  | 0  |     *pLength=trie->dataLength;  | 
172  | 0  |     return trie->data;  | 
173  | 0  | }  | 
174  |  |  | 
175  |  | static int32_t  | 
176  | 0  | utrie_allocDataBlock(UNewTrie *trie) { | 
177  | 0  |     int32_t newBlock, newTop;  | 
178  |  | 
  | 
179  | 0  |     newBlock=trie->dataLength;  | 
180  | 0  |     newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;  | 
181  | 0  |     if(newTop>trie->dataCapacity) { | 
182  |  |         /* out of memory in the data array */  | 
183  | 0  |         return -1;  | 
184  | 0  |     }  | 
185  | 0  |     trie->dataLength=newTop;  | 
186  | 0  |     return newBlock;  | 
187  | 0  | }  | 
188  |  |  | 
189  |  | /**  | 
190  |  |  * No error checking for illegal arguments.  | 
191  |  |  *  | 
192  |  |  * @return -1 if no new data block available (out of memory in data array)  | 
193  |  |  * @internal  | 
194  |  |  */  | 
195  |  | static int32_t  | 
196  | 0  | utrie_getDataBlock(UNewTrie *trie, UChar32 c) { | 
197  | 0  |     int32_t indexValue, newBlock;  | 
198  |  | 
  | 
199  | 0  |     c>>=UTRIE_SHIFT;  | 
200  | 0  |     indexValue=trie->index[c];  | 
201  | 0  |     if(indexValue>0) { | 
202  | 0  |         return indexValue;  | 
203  | 0  |     }  | 
204  |  |  | 
205  |  |     /* allocate a new data block */  | 
206  | 0  |     newBlock=utrie_allocDataBlock(trie);  | 
207  | 0  |     if(newBlock<0) { | 
208  |  |         /* out of memory in the data array */  | 
209  | 0  |         return -1;  | 
210  | 0  |     }  | 
211  | 0  |     trie->index[c]=newBlock;  | 
212  |  |  | 
213  |  |     /* copy-on-write for a block from a setRange() */  | 
214  | 0  |     uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH);  | 
215  | 0  |     return newBlock;  | 
216  | 0  | }  | 
217  |  |  | 
218  |  | /**  | 
219  |  |  * @return TRUE if the value was successfully set  | 
220  |  |  */  | 
221  |  | U_CAPI UBool U_EXPORT2  | 
222  | 0  | utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) { | 
223  | 0  |     int32_t block;  | 
224  |  |  | 
225  |  |     /* valid, uncompacted trie and valid c? */  | 
226  | 0  |     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) { | 
227  | 0  |         return FALSE;  | 
228  | 0  |     }  | 
229  |  |  | 
230  | 0  |     block=utrie_getDataBlock(trie, c);  | 
231  | 0  |     if(block<0) { | 
232  | 0  |         return FALSE;  | 
233  | 0  |     }  | 
234  |  |  | 
235  | 0  |     trie->data[block+(c&UTRIE_MASK)]=value;  | 
236  | 0  |     return TRUE;  | 
237  | 0  | }  | 
238  |  |  | 
239  |  | U_CAPI uint32_t U_EXPORT2  | 
240  | 0  | utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) { | 
241  | 0  |     int32_t block;  | 
242  |  |  | 
243  |  |     /* valid, uncompacted trie and valid c? */  | 
244  | 0  |     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) { | 
245  | 0  |         if(pInBlockZero!=NULL) { | 
246  | 0  |             *pInBlockZero=TRUE;  | 
247  | 0  |         }  | 
248  | 0  |         return 0;  | 
249  | 0  |     }  | 
250  |  |  | 
251  | 0  |     block=trie->index[c>>UTRIE_SHIFT];  | 
252  | 0  |     if(pInBlockZero!=NULL) { | 
253  | 0  |         *pInBlockZero= (UBool)(block==0);  | 
254  | 0  |     }  | 
255  |  | 
  | 
256  | 0  |     return trie->data[ABS(block)+(c&UTRIE_MASK)];  | 
257  | 0  | }  | 
258  |  |  | 
259  |  | /**  | 
260  |  |  * @internal  | 
261  |  |  */  | 
262  |  | static void  | 
263  |  | utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit,  | 
264  | 0  |                 uint32_t value, uint32_t initialValue, UBool overwrite) { | 
265  | 0  |     uint32_t *pLimit;  | 
266  |  | 
  | 
267  | 0  |     pLimit=block+limit;  | 
268  | 0  |     block+=start;  | 
269  | 0  |     if(overwrite) { | 
270  | 0  |         while(block<pLimit) { | 
271  | 0  |             *block++=value;  | 
272  | 0  |         }  | 
273  | 0  |     } else { | 
274  | 0  |         while(block<pLimit) { | 
275  | 0  |             if(*block==initialValue) { | 
276  | 0  |                 *block=value;  | 
277  | 0  |             }  | 
278  | 0  |             ++block;  | 
279  | 0  |         }  | 
280  | 0  |     }  | 
281  | 0  | }  | 
282  |  |  | 
283  |  | U_CAPI UBool U_EXPORT2  | 
284  | 0  | utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite) { | 
285  |  |     /*  | 
286  |  |      * repeat value in [start..limit[  | 
287  |  |      * mark index values for repeat-data blocks by setting bit 31 of the index values  | 
288  |  |      * fill around existing values if any, if(overwrite)  | 
289  |  |      */  | 
290  | 0  |     uint32_t initialValue;  | 
291  | 0  |     int32_t block, rest, repeatBlock;  | 
292  |  |  | 
293  |  |     /* valid, uncompacted trie and valid indexes? */  | 
294  | 0  |     if( trie==NULL || trie->isCompacted ||  | 
295  | 0  |         (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit  | 
296  | 0  |     ) { | 
297  | 0  |         return FALSE;  | 
298  | 0  |     }  | 
299  | 0  |     if(start==limit) { | 
300  | 0  |         return TRUE; /* nothing to do */  | 
301  | 0  |     }  | 
302  |  |  | 
303  | 0  |     initialValue=trie->data[0];  | 
304  | 0  |     if(start&UTRIE_MASK) { | 
305  | 0  |         UChar32 nextStart;  | 
306  |  |  | 
307  |  |         /* set partial block at [start..following block boundary[ */  | 
308  | 0  |         block=utrie_getDataBlock(trie, start);  | 
309  | 0  |         if(block<0) { | 
310  | 0  |             return FALSE;  | 
311  | 0  |         }  | 
312  |  |  | 
313  | 0  |         nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK;  | 
314  | 0  |         if(nextStart<=limit) { | 
315  | 0  |             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH,  | 
316  | 0  |                             value, initialValue, overwrite);  | 
317  | 0  |             start=nextStart;  | 
318  | 0  |         } else { | 
319  | 0  |             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK,  | 
320  | 0  |                             value, initialValue, overwrite);  | 
321  | 0  |             return TRUE;  | 
322  | 0  |         }  | 
323  | 0  |     }  | 
324  |  |  | 
325  |  |     /* number of positions in the last, partial block */  | 
326  | 0  |     rest=limit&UTRIE_MASK;  | 
327  |  |  | 
328  |  |     /* round down limit to a block boundary */  | 
329  | 0  |     limit&=~UTRIE_MASK;  | 
330  |  |  | 
331  |  |     /* iterate over all-value blocks */  | 
332  | 0  |     if(value==initialValue) { | 
333  | 0  |         repeatBlock=0;  | 
334  | 0  |     } else { | 
335  | 0  |         repeatBlock=-1;  | 
336  | 0  |     }  | 
337  | 0  |     while(start<limit) { | 
338  |  |         /* get index value */  | 
339  | 0  |         block=trie->index[start>>UTRIE_SHIFT];  | 
340  | 0  |         if(block>0) { | 
341  |  |             /* already allocated, fill in value */  | 
342  | 0  |             utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite);  | 
343  | 0  |         } else if(trie->data[-block]!=value && (block==0 || overwrite)) { | 
344  |  |             /* set the repeatBlock instead of the current block 0 or range block */  | 
345  | 0  |             if(repeatBlock>=0) { | 
346  | 0  |                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;  | 
347  | 0  |             } else { | 
348  |  |                 /* create and set and fill the repeatBlock */  | 
349  | 0  |                 repeatBlock=utrie_getDataBlock(trie, start);  | 
350  | 0  |                 if(repeatBlock<0) { | 
351  | 0  |                     return FALSE;  | 
352  | 0  |                 }  | 
353  |  |  | 
354  |  |                 /* set the negative block number to indicate that it is a repeat block */  | 
355  | 0  |                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;  | 
356  | 0  |                 utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, TRUE);  | 
357  | 0  |             }  | 
358  | 0  |         }  | 
359  |  |  | 
360  | 0  |         start+=UTRIE_DATA_BLOCK_LENGTH;  | 
361  | 0  |     }  | 
362  |  |  | 
363  | 0  |     if(rest>0) { | 
364  |  |         /* set partial block at [last block boundary..limit[ */  | 
365  | 0  |         block=utrie_getDataBlock(trie, start);  | 
366  | 0  |         if(block<0) { | 
367  | 0  |             return FALSE;  | 
368  | 0  |         }  | 
369  |  |  | 
370  | 0  |         utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite);  | 
371  | 0  |     }  | 
372  |  |  | 
373  | 0  |     return TRUE;  | 
374  | 0  | }  | 
375  |  |  | 
376  |  | static int32_t  | 
377  |  | _findSameIndexBlock(const int32_t *idx, int32_t indexLength,  | 
378  | 0  |                     int32_t otherBlock) { | 
379  | 0  |     int32_t block, i;  | 
380  |  | 
  | 
381  | 0  |     for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) { | 
382  | 0  |         for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) { | 
383  | 0  |             if(idx[block+i]!=idx[otherBlock+i]) { | 
384  | 0  |                 break;  | 
385  | 0  |             }  | 
386  | 0  |         }  | 
387  | 0  |         if(i==UTRIE_SURROGATE_BLOCK_COUNT) { | 
388  | 0  |             return block;  | 
389  | 0  |         }  | 
390  | 0  |     }  | 
391  | 0  |     return indexLength;  | 
392  | 0  | }  | 
393  |  |  | 
394  |  | /*  | 
395  |  |  * Fold the normalization data for supplementary code points into  | 
396  |  |  * a compact area on top of the BMP-part of the trie index,  | 
397  |  |  * with the lead surrogates indexing this compact area.  | 
398  |  |  *  | 
399  |  |  * Duplicate the index values for lead surrogates:  | 
400  |  |  * From inside the BMP area, where some may be overridden with folded values,  | 
401  |  |  * to just after the BMP area, where they can be retrieved for  | 
402  |  |  * code point lookups.  | 
403  |  |  */  | 
404  |  | static void  | 
405  | 0  | utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) { | 
406  | 0  |     int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT];  | 
407  | 0  |     int32_t *idx;  | 
408  | 0  |     uint32_t value;  | 
409  | 0  |     UChar32 c;  | 
410  | 0  |     int32_t indexLength, block;  | 
411  |  | #ifdef UTRIE_DEBUG  | 
412  |  |     int countLeadCUWithData=0;  | 
413  |  | #endif  | 
414  |  | 
  | 
415  | 0  |     idx=trie->index;  | 
416  |  |  | 
417  |  |     /* copy the lead surrogate indexes into a temporary array */  | 
418  | 0  |     uprv_memcpy(leadIndexes, idx+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);  | 
419  |  |  | 
420  |  |     /*  | 
421  |  |      * set all values for lead surrogate code *units* to leadUnitValue  | 
422  |  |      * so that, by default, runtime lookups will find no data for associated  | 
423  |  |      * supplementary code points, unless there is data for such code points  | 
424  |  |      * which will result in a non-zero folding value below that is set for  | 
425  |  |      * the respective lead units  | 
426  |  |      *  | 
427  |  |      * the above saved the indexes for surrogate code *points*  | 
428  |  |      * fill the indexes with simplified code from utrie_setRange32()  | 
429  |  |      */  | 
430  | 0  |     if(trie->leadUnitValue==trie->data[0]) { | 
431  | 0  |         block=0; /* leadUnitValue==initialValue, use all-initial-value block */  | 
432  | 0  |     } else { | 
433  |  |         /* create and fill the repeatBlock */  | 
434  | 0  |         block=utrie_allocDataBlock(trie);  | 
435  | 0  |         if(block<0) { | 
436  |  |             /* data table overflow */  | 
437  | 0  |             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;  | 
438  | 0  |             return;  | 
439  | 0  |         }  | 
440  | 0  |         utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE);  | 
441  | 0  |         block=-block; /* negative block number to indicate that it is a repeat block */  | 
442  | 0  |     }  | 
443  | 0  |     for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) { | 
444  | 0  |         trie->index[c]=block;  | 
445  | 0  |     }  | 
446  |  |  | 
447  |  |     /*  | 
448  |  |      * Fold significant index values into the area just after the BMP indexes.  | 
449  |  |      * In case the first lead surrogate has significant data,  | 
450  |  |      * its index block must be used first (in which case the folding is a no-op).  | 
451  |  |      * Later all folded index blocks are moved up one to insert the copied  | 
452  |  |      * lead surrogate indexes.  | 
453  |  |      */  | 
454  | 0  |     indexLength=UTRIE_BMP_INDEX_LENGTH;  | 
455  |  |  | 
456  |  |     /* search for any index (stage 1) entries for supplementary code points */  | 
457  | 0  |     for(c=0x10000; c<0x110000;) { | 
458  | 0  |         if(idx[c>>UTRIE_SHIFT]!=0) { | 
459  |  |             /* there is data, treat the full block for a lead surrogate */  | 
460  | 0  |             c&=~0x3ff;  | 
461  |  | 
  | 
462  |  | #ifdef UTRIE_DEBUG  | 
463  |  |             ++countLeadCUWithData;  | 
464  |  |             /* printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10))); */ | 
465  |  | #endif  | 
466  |  |  | 
467  |  |             /* is there an identical index block? */  | 
468  | 0  |             block=_findSameIndexBlock(idx, indexLength, c>>UTRIE_SHIFT);  | 
469  |  |  | 
470  |  |             /*  | 
471  |  |              * get a folded value for [c..c+0x400[ and,  | 
472  |  |              * if different from the value for the lead surrogate code point,  | 
473  |  |              * set it for the lead surrogate code unit  | 
474  |  |              */  | 
475  | 0  |             value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT);  | 
476  | 0  |             if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) { | 
477  | 0  |                 if(!utrie_set32(trie, U16_LEAD(c), value)) { | 
478  |  |                     /* data table overflow */  | 
479  | 0  |                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;  | 
480  | 0  |                     return;  | 
481  | 0  |                 }  | 
482  |  |  | 
483  |  |                 /* if we did not find an identical index block... */  | 
484  | 0  |                 if(block==indexLength) { | 
485  |  |                     /* move the actual index (stage 1) entries from the supplementary position to the new one */  | 
486  | 0  |                     uprv_memmove(idx+indexLength,  | 
487  | 0  |                                  idx+(c>>UTRIE_SHIFT),  | 
488  | 0  |                                  4*UTRIE_SURROGATE_BLOCK_COUNT);  | 
489  | 0  |                     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;  | 
490  | 0  |                 }  | 
491  | 0  |             }  | 
492  | 0  |             c+=0x400;  | 
493  | 0  |         } else { | 
494  | 0  |             c+=UTRIE_DATA_BLOCK_LENGTH;  | 
495  | 0  |         }  | 
496  | 0  |     }  | 
497  |  | #ifdef UTRIE_DEBUG  | 
498  |  |     if(countLeadCUWithData>0) { | 
499  |  |         printf("supplementary data for %d lead surrogates\n", countLeadCUWithData); | 
500  |  |     }  | 
501  |  | #endif  | 
502  |  |  | 
503  |  |     /*  | 
504  |  |      * index array overflow?  | 
505  |  |      * This is to guarantee that a folding offset is of the form  | 
506  |  |      * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.  | 
507  |  |      * If the index is too large, then n>=1024 and more than 10 bits are necessary.  | 
508  |  |      *  | 
509  |  |      * In fact, it can only ever become n==1024 with completely unfoldable data and  | 
510  |  |      * the additional block of duplicated values for lead surrogates.  | 
511  |  |      */  | 
512  | 0  |     if(indexLength>=UTRIE_MAX_INDEX_LENGTH) { | 
513  | 0  |         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;  | 
514  | 0  |         return;  | 
515  | 0  |     }  | 
516  |  |  | 
517  |  |     /*  | 
518  |  |      * make space for the lead surrogate index block and  | 
519  |  |      * insert it between the BMP indexes and the folded ones  | 
520  |  |      */  | 
521  | 0  |     uprv_memmove(idx+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,  | 
522  | 0  |                  idx+UTRIE_BMP_INDEX_LENGTH,  | 
523  | 0  |                  4*(indexLength-UTRIE_BMP_INDEX_LENGTH));  | 
524  | 0  |     uprv_memcpy(idx+UTRIE_BMP_INDEX_LENGTH,  | 
525  | 0  |                 leadIndexes,  | 
526  | 0  |                 4*UTRIE_SURROGATE_BLOCK_COUNT);  | 
527  | 0  |     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;  | 
528  |  | 
  | 
529  |  | #ifdef UTRIE_DEBUG  | 
530  |  |     printf("trie index count: BMP %ld  all Unicode %ld  folded %ld\n", | 
531  |  |            UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength);  | 
532  |  | #endif  | 
533  |  | 
  | 
534  | 0  |     trie->indexLength=indexLength;  | 
535  | 0  | }  | 
536  |  |  | 
537  |  | /*  | 
538  |  |  * Set a value in the trie index map to indicate which data block  | 
539  |  |  * is referenced and which one is not.  | 
540  |  |  * utrie_compact() will remove data blocks that are not used at all.  | 
541  |  |  * Set  | 
542  |  |  * - 0 if it is used  | 
543  |  |  * - -1 if it is not used  | 
544  |  |  */  | 
545  |  | static void  | 
546  | 0  | _findUnusedBlocks(UNewTrie *trie) { | 
547  | 0  |     int32_t i;  | 
548  |  |  | 
549  |  |     /* fill the entire map with "not used" */  | 
550  | 0  |     uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4);  | 
551  |  |  | 
552  |  |     /* mark each block that _is_ used with 0 */  | 
553  | 0  |     for(i=0; i<trie->indexLength; ++i) { | 
554  | 0  |         trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0;  | 
555  | 0  |     }  | 
556  |  |  | 
557  |  |     /* never move the all-initial-value block 0 */  | 
558  | 0  |     trie->map[0]=0;  | 
559  | 0  | }  | 
560  |  |  | 
561  |  | static int32_t  | 
562  |  | _findSameDataBlock(const uint32_t *data, int32_t dataLength,  | 
563  | 0  |                    int32_t otherBlock, int32_t step) { | 
564  | 0  |     int32_t block;  | 
565  |  |  | 
566  |  |     /* ensure that we do not even partially get past dataLength */  | 
567  | 0  |     dataLength-=UTRIE_DATA_BLOCK_LENGTH;  | 
568  |  | 
  | 
569  | 0  |     for(block=0; block<=dataLength; block+=step) { | 
570  | 0  |         if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) { | 
571  | 0  |             return block;  | 
572  | 0  |         }  | 
573  | 0  |     }  | 
574  | 0  |     return -1;  | 
575  | 0  | }  | 
576  |  |  | 
577  |  | /*  | 
578  |  |  * Compact a folded build-time trie.  | 
579  |  |  *  | 
580  |  |  * The compaction  | 
581  |  |  * - removes blocks that are identical with earlier ones  | 
582  |  |  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)  | 
583  |  |  * - moves blocks in steps of the data granularity  | 
584  |  |  * - moves and overlaps blocks that overlap with multiple values in the overlap region  | 
585  |  |  *  | 
586  |  |  * It does not  | 
587  |  |  * - try to move and overlap blocks that are not already adjacent  | 
588  |  |  */  | 
589  |  | static void  | 
590  | 0  | utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) { | 
591  | 0  |     int32_t i, start, newStart, overlapStart;  | 
592  |  | 
  | 
593  | 0  |     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | 
594  | 0  |         return;  | 
595  | 0  |     }  | 
596  |  |  | 
597  |  |     /* valid, uncompacted trie? */  | 
598  | 0  |     if(trie==NULL) { | 
599  | 0  |         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;  | 
600  | 0  |         return;  | 
601  | 0  |     }  | 
602  | 0  |     if(trie->isCompacted) { | 
603  | 0  |         return; /* nothing left to do */  | 
604  | 0  |     }  | 
605  |  |  | 
606  |  |     /* compaction */  | 
607  |  |  | 
608  |  |     /* initialize the index map with "block is used/unused" flags */  | 
609  | 0  |     _findUnusedBlocks(trie);  | 
610  |  |  | 
611  |  |     /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */  | 
612  | 0  |     if(trie->isLatin1Linear && UTRIE_SHIFT<=8) { | 
613  | 0  |         overlapStart=UTRIE_DATA_BLOCK_LENGTH+256;  | 
614  | 0  |     } else { | 
615  | 0  |         overlapStart=UTRIE_DATA_BLOCK_LENGTH;  | 
616  | 0  |     }  | 
617  |  | 
  | 
618  | 0  |     newStart=UTRIE_DATA_BLOCK_LENGTH;  | 
619  | 0  |     for(start=newStart; start<trie->dataLength;) { | 
620  |  |         /*  | 
621  |  |          * start: index of first entry of current block  | 
622  |  |          * newStart: index where the current block is to be moved  | 
623  |  |          *           (right after current end of already-compacted data)  | 
624  |  |          */  | 
625  |  |  | 
626  |  |         /* skip blocks that are not used */  | 
627  | 0  |         if(trie->map[start>>UTRIE_SHIFT]<0) { | 
628  |  |             /* advance start to the next block */  | 
629  | 0  |             start+=UTRIE_DATA_BLOCK_LENGTH;  | 
630  |  |  | 
631  |  |             /* leave newStart with the previous block! */  | 
632  | 0  |             continue;  | 
633  | 0  |         }  | 
634  |  |  | 
635  |  |         /* search for an identical block */  | 
636  | 0  |         if( start>=overlapStart &&  | 
637  | 0  |             (i=_findSameDataBlock(trie->data, newStart, start,  | 
638  | 0  |                             overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH))  | 
639  | 0  |              >=0  | 
640  | 0  |         ) { | 
641  |  |             /* found an identical block, set the other block's index value for the current block */  | 
642  | 0  |             trie->map[start>>UTRIE_SHIFT]=i;  | 
643  |  |  | 
644  |  |             /* advance start to the next block */  | 
645  | 0  |             start+=UTRIE_DATA_BLOCK_LENGTH;  | 
646  |  |  | 
647  |  |             /* leave newStart with the previous block! */  | 
648  | 0  |             continue;  | 
649  | 0  |         }  | 
650  |  |  | 
651  |  |         /* see if the beginning of this block can be overlapped with the end of the previous block */  | 
652  | 0  |         if(overlap && start>=overlapStart) { | 
653  |  |             /* look for maximum overlap (modulo granularity) with the previous, adjacent block */  | 
654  | 0  |             for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY;  | 
655  | 0  |                 i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i);  | 
656  | 0  |                 i-=UTRIE_DATA_GRANULARITY) {} | 
657  | 0  |         } else { | 
658  | 0  |             i=0;  | 
659  | 0  |         }  | 
660  |  | 
  | 
661  | 0  |         if(i>0) { | 
662  |  |             /* some overlap */  | 
663  | 0  |             trie->map[start>>UTRIE_SHIFT]=newStart-i;  | 
664  |  |  | 
665  |  |             /* move the non-overlapping indexes to their new positions */  | 
666  | 0  |             start+=i;  | 
667  | 0  |             for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) { | 
668  | 0  |                 trie->data[newStart++]=trie->data[start++];  | 
669  | 0  |             }  | 
670  | 0  |         } else if(newStart<start) { | 
671  |  |             /* no overlap, just move the indexes to their new positions */  | 
672  | 0  |             trie->map[start>>UTRIE_SHIFT]=newStart;  | 
673  | 0  |             for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) { | 
674  | 0  |                 trie->data[newStart++]=trie->data[start++];  | 
675  | 0  |             }  | 
676  | 0  |         } else /* no overlap && newStart==start */ { | 
677  | 0  |             trie->map[start>>UTRIE_SHIFT]=start;  | 
678  | 0  |             newStart+=UTRIE_DATA_BLOCK_LENGTH;  | 
679  | 0  |             start=newStart;  | 
680  | 0  |         }  | 
681  | 0  |     }  | 
682  |  |  | 
683  |  |     /* now adjust the index (stage 1) table */  | 
684  | 0  |     for(i=0; i<trie->indexLength; ++i) { | 
685  | 0  |         trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT];  | 
686  | 0  |     }  | 
687  |  | 
  | 
688  |  | #ifdef UTRIE_DEBUG  | 
689  |  |     /* we saved some space */  | 
690  |  |     printf("compacting trie: count of 32-bit words %lu->%lu\n", | 
691  |  |             (long)trie->dataLength, (long)newStart);  | 
692  |  | #endif  | 
693  |  | 
  | 
694  | 0  |     trie->dataLength=newStart;  | 
695  | 0  | }  | 
696  |  |  | 
697  |  | /* serialization ------------------------------------------------------------ */  | 
698  |  |  | 
699  |  | /*  | 
700  |  |  * Default function for the folding value:  | 
701  |  |  * Just store the offset (16 bits) if there is any non-initial-value entry.  | 
702  |  |  *  | 
703  |  |  * The offset parameter is never 0.  | 
704  |  |  * Returning the offset itself is safe for UTRIE_SHIFT>=5 because  | 
705  |  |  * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800  | 
706  |  |  * which fits into 16-bit trie values;  | 
707  |  |  * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.  | 
708  |  |  *  | 
709  |  |  * Theoretically, it would be safer for all possible UTRIE_SHIFT including  | 
710  |  |  * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS  | 
711  |  |  * which would always result in a value of 0x40..0x43f  | 
712  |  |  * (start/end 1k blocks of supplementary Unicode code points).  | 
713  |  |  * However, this would be uglier, and would not work for some existing  | 
714  |  |  * binary data file formats.  | 
715  |  |  *  | 
716  |  |  * Also, we do not plan to change UTRIE_SHIFT because it would change binary  | 
717  |  |  * data file formats, and we would probably not make it smaller because of  | 
718  |  |  * the then even larger BMP index length even for empty tries.  | 
719  |  |  */  | 
720  |  | static uint32_t U_CALLCONV  | 
721  | 0  | defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { | 
722  | 0  |     uint32_t value, initialValue;  | 
723  | 0  |     UChar32 limit;  | 
724  | 0  |     UBool inBlockZero;  | 
725  |  | 
  | 
726  | 0  |     initialValue=trie->data[0];  | 
727  | 0  |     limit=start+0x400;  | 
728  | 0  |     while(start<limit) { | 
729  | 0  |         value=utrie_get32(trie, start, &inBlockZero);  | 
730  | 0  |         if(inBlockZero) { | 
731  | 0  |             start+=UTRIE_DATA_BLOCK_LENGTH;  | 
732  | 0  |         } else if(value!=initialValue) { | 
733  | 0  |             return (uint32_t)offset;  | 
734  | 0  |         } else { | 
735  | 0  |             ++start;  | 
736  | 0  |         }  | 
737  | 0  |     }  | 
738  | 0  |     return 0;  | 
739  | 0  | }  | 
740  |  |  | 
741  |  | U_CAPI int32_t U_EXPORT2  | 
742  |  | utrie_serialize(UNewTrie *trie, void *dt, int32_t capacity,  | 
743  |  |                 UNewTrieGetFoldedValue *getFoldedValue,  | 
744  |  |                 UBool reduceTo16Bits,  | 
745  | 0  |                 UErrorCode *pErrorCode) { | 
746  | 0  |     UTrieHeader *header;  | 
747  | 0  |     uint32_t *p;  | 
748  | 0  |     uint16_t *dest16;  | 
749  | 0  |     int32_t i, length;  | 
750  | 0  |     uint8_t* data = NULL;  | 
751  |  |  | 
752  |  |     /* argument check */  | 
753  | 0  |     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | 
754  | 0  |         return 0;  | 
755  | 0  |     }  | 
756  |  |  | 
757  | 0  |     if(trie==NULL || capacity<0 || (capacity>0 && dt==NULL)) { | 
758  | 0  |         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;  | 
759  | 0  |         return 0;  | 
760  | 0  |     }  | 
761  | 0  |     if(getFoldedValue==NULL) { | 
762  | 0  |         getFoldedValue=defaultGetFoldedValue;  | 
763  | 0  |     }  | 
764  |  | 
  | 
765  | 0  |     data = (uint8_t*)dt;  | 
766  |  |     /* fold and compact if necessary, also checks that indexLength is within limits */  | 
767  | 0  |     if(!trie->isCompacted) { | 
768  |  |         /* compact once without overlap to improve folding */  | 
769  | 0  |         utrie_compact(trie, FALSE, pErrorCode);  | 
770  |  |  | 
771  |  |         /* fold the supplementary part of the index array */  | 
772  | 0  |         utrie_fold(trie, getFoldedValue, pErrorCode);  | 
773  |  |  | 
774  |  |         /* compact again with overlap for minimum data array length */  | 
775  | 0  |         utrie_compact(trie, TRUE, pErrorCode);  | 
776  |  | 
  | 
777  | 0  |         trie->isCompacted=TRUE;  | 
778  | 0  |         if(U_FAILURE(*pErrorCode)) { | 
779  | 0  |             return 0;  | 
780  | 0  |         }  | 
781  | 0  |     }  | 
782  |  |  | 
783  |  |     /* is dataLength within limits? */  | 
784  | 0  |     if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) { | 
785  | 0  |         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;  | 
786  | 0  |     }  | 
787  |  | 
  | 
788  | 0  |     length=sizeof(UTrieHeader)+2*trie->indexLength;  | 
789  | 0  |     if(reduceTo16Bits) { | 
790  | 0  |         length+=2*trie->dataLength;  | 
791  | 0  |     } else { | 
792  | 0  |         length+=4*trie->dataLength;  | 
793  | 0  |     }  | 
794  |  | 
  | 
795  | 0  |     if(length>capacity) { | 
796  | 0  |         return length; /* preflighting */  | 
797  | 0  |     }  | 
798  |  |  | 
799  |  | #ifdef UTRIE_DEBUG  | 
800  |  |     printf("**UTrieLengths(serialize)** index:%6ld  data:%6ld  serialized:%6ld\n", | 
801  |  |            (long)trie->indexLength, (long)trie->dataLength, (long)length);  | 
802  |  | #endif  | 
803  |  |  | 
804  |  |     /* set the header fields */  | 
805  | 0  |     header=(UTrieHeader *)data;  | 
806  | 0  |     data+=sizeof(UTrieHeader);  | 
807  |  | 
  | 
808  | 0  |     header->signature=0x54726965; /* "Trie" */  | 
809  | 0  |     header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<<UTRIE_OPTIONS_INDEX_SHIFT);  | 
810  |  | 
  | 
811  | 0  |     if(!reduceTo16Bits) { | 
812  | 0  |         header->options|=UTRIE_OPTIONS_DATA_IS_32_BIT;  | 
813  | 0  |     }  | 
814  | 0  |     if(trie->isLatin1Linear) { | 
815  | 0  |         header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR;  | 
816  | 0  |     }  | 
817  |  | 
  | 
818  | 0  |     header->indexLength=trie->indexLength;  | 
819  | 0  |     header->dataLength=trie->dataLength;  | 
820  |  |  | 
821  |  |     /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */  | 
822  | 0  |     if(reduceTo16Bits) { | 
823  |  |         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */  | 
824  | 0  |         p=(uint32_t *)trie->index;  | 
825  | 0  |         dest16=(uint16_t *)data;  | 
826  | 0  |         for(i=trie->indexLength; i>0; --i) { | 
827  | 0  |             *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT);  | 
828  | 0  |         }  | 
829  |  |  | 
830  |  |         /* write 16-bit data values */  | 
831  | 0  |         p=trie->data;  | 
832  | 0  |         for(i=trie->dataLength; i>0; --i) { | 
833  | 0  |             *dest16++=(uint16_t)*p++;  | 
834  | 0  |         }  | 
835  | 0  |     } else { | 
836  |  |         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */  | 
837  | 0  |         p=(uint32_t *)trie->index;  | 
838  | 0  |         dest16=(uint16_t *)data;  | 
839  | 0  |         for(i=trie->indexLength; i>0; --i) { | 
840  | 0  |             *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT);  | 
841  | 0  |         }  | 
842  |  |  | 
843  |  |         /* write 32-bit data values */  | 
844  | 0  |         uprv_memcpy(dest16, trie->data, 4*(size_t)trie->dataLength);  | 
845  | 0  |     }  | 
846  |  | 
  | 
847  | 0  |     return length;  | 
848  | 0  | }  | 
849  |  |  | 
850  |  | /* inverse to defaultGetFoldedValue() */  | 
851  |  | U_CAPI int32_t U_EXPORT2  | 
852  | 0  | utrie_defaultGetFoldingOffset(uint32_t data) { | 
853  | 0  |     return (int32_t)data;  | 
854  | 0  | }  | 
855  |  |  | 
856  |  | U_CAPI int32_t U_EXPORT2  | 
857  | 0  | utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) { | 
858  | 0  |     const UTrieHeader *header;  | 
859  | 0  |     const uint16_t *p16;  | 
860  | 0  |     uint32_t options;  | 
861  |  | 
  | 
862  | 0  |     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | 
863  | 0  |         return -1;  | 
864  | 0  |     }  | 
865  |  |  | 
866  |  |     /* enough data for a trie header? */  | 
867  | 0  |     if(length<(int32_t)sizeof(UTrieHeader)) { | 
868  | 0  |         *pErrorCode=U_INVALID_FORMAT_ERROR;  | 
869  | 0  |         return -1;  | 
870  | 0  |     }  | 
871  |  |  | 
872  |  |     /* check the signature */  | 
873  | 0  |     header=(const UTrieHeader *)data;  | 
874  | 0  |     if(header->signature!=0x54726965) { | 
875  | 0  |         *pErrorCode=U_INVALID_FORMAT_ERROR;  | 
876  | 0  |         return -1;  | 
877  | 0  |     }  | 
878  |  |  | 
879  |  |     /* get the options and check the shift values */  | 
880  | 0  |     options=header->options;  | 
881  | 0  |     if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||  | 
882  | 0  |         ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT  | 
883  | 0  |     ) { | 
884  | 0  |         *pErrorCode=U_INVALID_FORMAT_ERROR;  | 
885  | 0  |         return -1;  | 
886  | 0  |     }  | 
887  | 0  |     trie->isLatin1Linear= (UBool)((options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0);  | 
888  |  |  | 
889  |  |     /* get the length values */  | 
890  | 0  |     trie->indexLength=header->indexLength;  | 
891  | 0  |     trie->dataLength=header->dataLength;  | 
892  |  | 
  | 
893  | 0  |     length-=(int32_t)sizeof(UTrieHeader);  | 
894  |  |  | 
895  |  |     /* enough data for the index? */  | 
896  | 0  |     if(length<2*trie->indexLength) { | 
897  | 0  |         *pErrorCode=U_INVALID_FORMAT_ERROR;  | 
898  | 0  |         return -1;  | 
899  | 0  |     }  | 
900  | 0  |     p16=(const uint16_t *)(header+1);  | 
901  | 0  |     trie->index=p16;  | 
902  | 0  |     p16+=trie->indexLength;  | 
903  | 0  |     length-=2*trie->indexLength;  | 
904  |  |  | 
905  |  |     /* get the data */  | 
906  | 0  |     if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) { | 
907  | 0  |         if(length<4*trie->dataLength) { | 
908  | 0  |             *pErrorCode=U_INVALID_FORMAT_ERROR;  | 
909  | 0  |             return -1;  | 
910  | 0  |         }  | 
911  | 0  |         trie->data32=(const uint32_t *)p16;  | 
912  | 0  |         trie->initialValue=trie->data32[0];  | 
913  | 0  |         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength;  | 
914  | 0  |     } else { | 
915  | 0  |         if(length<2*trie->dataLength) { | 
916  | 0  |             *pErrorCode=U_INVALID_FORMAT_ERROR;  | 
917  | 0  |             return -1;  | 
918  | 0  |         }  | 
919  |  |  | 
920  |  |         /* the "data16" data is used via the index pointer */  | 
921  | 0  |         trie->data32=NULL;  | 
922  | 0  |         trie->initialValue=trie->index[trie->indexLength];  | 
923  | 0  |         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength;  | 
924  | 0  |     }  | 
925  |  |  | 
926  | 0  |     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;  | 
927  |  | 
  | 
928  | 0  |     return length;  | 
929  | 0  | }  | 
930  |  |  | 
931  |  | U_CAPI int32_t U_EXPORT2  | 
932  |  | utrie_unserializeDummy(UTrie *trie,  | 
933  |  |                        void *data, int32_t length,  | 
934  |  |                        uint32_t initialValue, uint32_t leadUnitValue,  | 
935  |  |                        UBool make16BitTrie,  | 
936  | 0  |                        UErrorCode *pErrorCode) { | 
937  | 0  |     uint16_t *p16;  | 
938  | 0  |     int32_t actualLength, latin1Length, i, limit;  | 
939  | 0  |     uint16_t block;  | 
940  |  | 
  | 
941  | 0  |     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | 
942  | 0  |         return -1;  | 
943  | 0  |     }  | 
944  |  |  | 
945  |  |     /* calculate the actual size of the dummy trie data */  | 
946  |  |  | 
947  |  |     /* max(Latin-1, block 0) */  | 
948  | 0  |     latin1Length= 256; /*UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;*/  | 
949  |  | 
  | 
950  | 0  |     trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT;  | 
951  | 0  |     trie->dataLength=latin1Length;  | 
952  | 0  |     if(leadUnitValue!=initialValue) { | 
953  | 0  |         trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH;  | 
954  | 0  |     }  | 
955  |  | 
  | 
956  | 0  |     actualLength=trie->indexLength*2;  | 
957  | 0  |     if(make16BitTrie) { | 
958  | 0  |         actualLength+=trie->dataLength*2;  | 
959  | 0  |     } else { | 
960  | 0  |         actualLength+=trie->dataLength*4;  | 
961  | 0  |     }  | 
962  |  |  | 
963  |  |     /* enough space for the dummy trie? */  | 
964  | 0  |     if(length<actualLength) { | 
965  | 0  |         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;  | 
966  | 0  |         return actualLength;  | 
967  | 0  |     }  | 
968  |  |  | 
969  | 0  |     trie->isLatin1Linear=TRUE;  | 
970  | 0  |     trie->initialValue=initialValue;  | 
971  |  |  | 
972  |  |     /* fill the index and data arrays */  | 
973  | 0  |     p16=(uint16_t *)data;  | 
974  | 0  |     trie->index=p16;  | 
975  |  | 
  | 
976  | 0  |     if(make16BitTrie) { | 
977  |  |         /* indexes to block 0 */  | 
978  | 0  |         block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT);  | 
979  | 0  |         limit=trie->indexLength;  | 
980  | 0  |         for(i=0; i<limit; ++i) { | 
981  | 0  |             p16[i]=block;  | 
982  | 0  |         }  | 
983  |  | 
  | 
984  | 0  |         if(leadUnitValue!=initialValue) { | 
985  |  |             /* indexes for lead surrogate code units to the block after Latin-1 */  | 
986  | 0  |             block+=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);  | 
987  | 0  |             i=0xd800>>UTRIE_SHIFT;  | 
988  | 0  |             limit=0xdc00>>UTRIE_SHIFT;  | 
989  | 0  |             for(; i<limit; ++i) { | 
990  | 0  |                 p16[i]=block;  | 
991  | 0  |             }  | 
992  | 0  |         }  | 
993  |  | 
  | 
994  | 0  |         trie->data32=NULL;  | 
995  |  |  | 
996  |  |         /* Latin-1 data */  | 
997  | 0  |         p16+=trie->indexLength;  | 
998  | 0  |         for(i=0; i<latin1Length; ++i) { | 
999  | 0  |             p16[i]=(uint16_t)initialValue;  | 
1000  | 0  |         }  | 
1001  |  |  | 
1002  |  |         /* data for lead surrogate code units */  | 
1003  | 0  |         if(leadUnitValue!=initialValue) { | 
1004  | 0  |             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;  | 
1005  | 0  |             for(/* i=latin1Length */; i<limit; ++i) { | 
1006  | 0  |                 p16[i]=(uint16_t)leadUnitValue;  | 
1007  | 0  |             }  | 
1008  | 0  |         }  | 
1009  | 0  |     } else { | 
1010  | 0  |         uint32_t *p32;  | 
1011  |  |  | 
1012  |  |         /* indexes to block 0 */  | 
1013  | 0  |         uprv_memset(p16, 0, trie->indexLength*2);  | 
1014  |  | 
  | 
1015  | 0  |         if(leadUnitValue!=initialValue) { | 
1016  |  |             /* indexes for lead surrogate code units to the block after Latin-1 */  | 
1017  | 0  |             block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);  | 
1018  | 0  |             i=0xd800>>UTRIE_SHIFT;  | 
1019  | 0  |             limit=0xdc00>>UTRIE_SHIFT;  | 
1020  | 0  |             for(; i<limit; ++i) { | 
1021  | 0  |                 p16[i]=block;  | 
1022  | 0  |             }  | 
1023  | 0  |         }  | 
1024  |  | 
  | 
1025  | 0  |         trie->data32=p32=(uint32_t *)(p16+trie->indexLength);  | 
1026  |  |  | 
1027  |  |         /* Latin-1 data */  | 
1028  | 0  |         for(i=0; i<latin1Length; ++i) { | 
1029  | 0  |             p32[i]=initialValue;  | 
1030  | 0  |         }  | 
1031  |  |  | 
1032  |  |         /* data for lead surrogate code units */  | 
1033  | 0  |         if(leadUnitValue!=initialValue) { | 
1034  | 0  |             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;  | 
1035  | 0  |             for(/* i=latin1Length */; i<limit; ++i) { | 
1036  | 0  |                 p32[i]=leadUnitValue;  | 
1037  | 0  |             }  | 
1038  | 0  |         }  | 
1039  | 0  |     }  | 
1040  |  | 
  | 
1041  | 0  |     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;  | 
1042  |  | 
  | 
1043  | 0  |     return actualLength;  | 
1044  | 0  | }  | 
1045  |  |  | 
1046  |  | /* enumeration -------------------------------------------------------------- */  | 
1047  |  |  | 
1048  |  | /* default UTrieEnumValue() returns the input value itself */  | 
1049  |  | static uint32_t U_CALLCONV  | 
1050  | 0  | enumSameValue(const void * /*context*/, uint32_t value) { | 
1051  | 0  |     return value;  | 
1052  | 0  | }  | 
1053  |  |  | 
1054  |  | /**  | 
1055  |  |  * Enumerate all ranges of code points with the same relevant values.  | 
1056  |  |  * The values are transformed from the raw trie entries by the enumValue function.  | 
1057  |  |  */  | 
1058  |  | U_CAPI void U_EXPORT2  | 
1059  |  | utrie_enum(const UTrie *trie,  | 
1060  | 0  |            UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) { | 
1061  | 0  |     const uint32_t *data32;  | 
1062  | 0  |     const uint16_t *idx;  | 
1063  |  | 
  | 
1064  | 0  |     uint32_t value, prevValue, initialValue;  | 
1065  | 0  |     UChar32 c, prev;  | 
1066  | 0  |     int32_t l, i, j, block, prevBlock, nullBlock, offset;  | 
1067  |  |  | 
1068  |  |     /* check arguments */  | 
1069  | 0  |     if(trie==NULL || trie->index==NULL || enumRange==NULL) { | 
1070  | 0  |         return;  | 
1071  | 0  |     }  | 
1072  | 0  |     if(enumValue==NULL) { | 
1073  | 0  |         enumValue=enumSameValue;  | 
1074  | 0  |     }  | 
1075  |  | 
  | 
1076  | 0  |     idx=trie->index;  | 
1077  | 0  |     data32=trie->data32;  | 
1078  |  |  | 
1079  |  |     /* get the enumeration value that corresponds to an initial-value trie data entry */  | 
1080  | 0  |     initialValue=enumValue(context, trie->initialValue);  | 
1081  |  | 
  | 
1082  | 0  |     if(data32==NULL) { | 
1083  | 0  |         nullBlock=trie->indexLength;  | 
1084  | 0  |     } else { | 
1085  | 0  |         nullBlock=0;  | 
1086  | 0  |     }  | 
1087  |  |  | 
1088  |  |     /* set variables for previous range */  | 
1089  | 0  |     prevBlock=nullBlock;  | 
1090  | 0  |     prev=0;  | 
1091  | 0  |     prevValue=initialValue;  | 
1092  |  |  | 
1093  |  |     /* enumerate BMP - the main loop enumerates data blocks */  | 
1094  | 0  |     for(i=0, c=0; c<=0xffff; ++i) { | 
1095  | 0  |         if(c==0xd800) { | 
1096  |  |             /* skip lead surrogate code _units_, go to lead surr. code _points_ */  | 
1097  | 0  |             i=UTRIE_BMP_INDEX_LENGTH;  | 
1098  | 0  |         } else if(c==0xdc00) { | 
1099  |  |             /* go back to regular BMP code points */  | 
1100  | 0  |             i=c>>UTRIE_SHIFT;  | 
1101  | 0  |         }  | 
1102  |  | 
  | 
1103  | 0  |         block=idx[i]<<UTRIE_INDEX_SHIFT;  | 
1104  | 0  |         if(block==prevBlock) { | 
1105  |  |             /* the block is the same as the previous one, and filled with value */  | 
1106  | 0  |             c+=UTRIE_DATA_BLOCK_LENGTH;  | 
1107  | 0  |         } else if(block==nullBlock) { | 
1108  |  |             /* this is the all-initial-value block */  | 
1109  | 0  |             if(prevValue!=initialValue) { | 
1110  | 0  |                 if(prev<c) { | 
1111  | 0  |                     if(!enumRange(context, prev, c, prevValue)) { | 
1112  | 0  |                         return;  | 
1113  | 0  |                     }  | 
1114  | 0  |                 }  | 
1115  | 0  |                 prevBlock=nullBlock;  | 
1116  | 0  |                 prev=c;  | 
1117  | 0  |                 prevValue=initialValue;  | 
1118  | 0  |             }  | 
1119  | 0  |             c+=UTRIE_DATA_BLOCK_LENGTH;  | 
1120  | 0  |         } else { | 
1121  | 0  |             prevBlock=block;  | 
1122  | 0  |             for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) { | 
1123  | 0  |                 value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);  | 
1124  | 0  |                 if(value!=prevValue) { | 
1125  | 0  |                     if(prev<c) { | 
1126  | 0  |                         if(!enumRange(context, prev, c, prevValue)) { | 
1127  | 0  |                             return;  | 
1128  | 0  |                         }  | 
1129  | 0  |                     }  | 
1130  | 0  |                     if(j>0) { | 
1131  |  |                         /* the block is not filled with all the same value */  | 
1132  | 0  |                         prevBlock=-1;  | 
1133  | 0  |                     }  | 
1134  | 0  |                     prev=c;  | 
1135  | 0  |                     prevValue=value;  | 
1136  | 0  |                 }  | 
1137  | 0  |                 ++c;  | 
1138  | 0  |             }  | 
1139  | 0  |         }  | 
1140  | 0  |     }  | 
1141  |  |  | 
1142  |  |     /* enumerate supplementary code points */  | 
1143  | 0  |     for(l=0xd800; l<0xdc00;) { | 
1144  |  |         /* lead surrogate access */  | 
1145  | 0  |         offset=idx[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;  | 
1146  | 0  |         if(offset==nullBlock) { | 
1147  |  |             /* no entries for a whole block of lead surrogates */  | 
1148  | 0  |             if(prevValue!=initialValue) { | 
1149  | 0  |                 if(prev<c) { | 
1150  | 0  |                     if(!enumRange(context, prev, c, prevValue)) { | 
1151  | 0  |                         return;  | 
1152  | 0  |                     }  | 
1153  | 0  |                 }  | 
1154  | 0  |                 prevBlock=nullBlock;  | 
1155  | 0  |                 prev=c;  | 
1156  | 0  |                 prevValue=initialValue;  | 
1157  | 0  |             }  | 
1158  |  |  | 
1159  | 0  |             l+=UTRIE_DATA_BLOCK_LENGTH;  | 
1160  | 0  |             c+=UTRIE_DATA_BLOCK_LENGTH<<10;  | 
1161  | 0  |             continue;  | 
1162  | 0  |         }  | 
1163  |  |  | 
1164  | 0  |         value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : idx[offset+(l&UTRIE_MASK)];  | 
1165  |  |  | 
1166  |  |         /* enumerate trail surrogates for this lead surrogate */  | 
1167  | 0  |         offset=trie->getFoldingOffset(value);  | 
1168  | 0  |         if(offset<=0) { | 
1169  |  |             /* no data for this lead surrogate */  | 
1170  | 0  |             if(prevValue!=initialValue) { | 
1171  | 0  |                 if(prev<c) { | 
1172  | 0  |                     if(!enumRange(context, prev, c, prevValue)) { | 
1173  | 0  |                         return;  | 
1174  | 0  |                     }  | 
1175  | 0  |                 }  | 
1176  | 0  |                 prevBlock=nullBlock;  | 
1177  | 0  |                 prev=c;  | 
1178  | 0  |                 prevValue=initialValue;  | 
1179  | 0  |             }  | 
1180  |  |  | 
1181  |  |             /* nothing else to do for the supplementary code points for this lead surrogate */  | 
1182  | 0  |             c+=0x400;  | 
1183  | 0  |         } else { | 
1184  |  |             /* enumerate code points for this lead surrogate */  | 
1185  | 0  |             i=offset;  | 
1186  | 0  |             offset+=UTRIE_SURROGATE_BLOCK_COUNT;  | 
1187  | 0  |             do { | 
1188  |  |                 /* copy of most of the body of the BMP loop */  | 
1189  | 0  |                 block=idx[i]<<UTRIE_INDEX_SHIFT;  | 
1190  | 0  |                 if(block==prevBlock) { | 
1191  |  |                     /* the block is the same as the previous one, and filled with value */  | 
1192  | 0  |                     c+=UTRIE_DATA_BLOCK_LENGTH;  | 
1193  | 0  |                 } else if(block==nullBlock) { | 
1194  |  |                     /* this is the all-initial-value block */  | 
1195  | 0  |                     if(prevValue!=initialValue) { | 
1196  | 0  |                         if(prev<c) { | 
1197  | 0  |                             if(!enumRange(context, prev, c, prevValue)) { | 
1198  | 0  |                                 return;  | 
1199  | 0  |                             }  | 
1200  | 0  |                         }  | 
1201  | 0  |                         prevBlock=nullBlock;  | 
1202  | 0  |                         prev=c;  | 
1203  | 0  |                         prevValue=initialValue;  | 
1204  | 0  |                     }  | 
1205  | 0  |                     c+=UTRIE_DATA_BLOCK_LENGTH;  | 
1206  | 0  |                 } else { | 
1207  | 0  |                     prevBlock=block;  | 
1208  | 0  |                     for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) { | 
1209  | 0  |                         value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);  | 
1210  | 0  |                         if(value!=prevValue) { | 
1211  | 0  |                             if(prev<c) { | 
1212  | 0  |                                 if(!enumRange(context, prev, c, prevValue)) { | 
1213  | 0  |                                     return;  | 
1214  | 0  |                                 }  | 
1215  | 0  |                             }  | 
1216  | 0  |                             if(j>0) { | 
1217  |  |                                 /* the block is not filled with all the same value */  | 
1218  | 0  |                                 prevBlock=-1;  | 
1219  | 0  |                             }  | 
1220  | 0  |                             prev=c;  | 
1221  | 0  |                             prevValue=value;  | 
1222  | 0  |                         }  | 
1223  | 0  |                         ++c;  | 
1224  | 0  |                     }  | 
1225  | 0  |                 }  | 
1226  | 0  |             } while(++i<offset);  | 
1227  | 0  |         }  | 
1228  |  |  | 
1229  | 0  |         ++l;  | 
1230  | 0  |     }  | 
1231  |  |  | 
1232  |  |     /* deliver last range */  | 
1233  | 0  |     enumRange(context, prev, c, prevValue);  | 
1234  | 0  | }  |