/src/icu/source/i18n/collationfastlatinbuilder.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | * Copyright (C) 2013-2015, International Business Machines  | 
6  |  | * Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | * collationfastlatinbuilder.cpp  | 
9  |  | *  | 
10  |  | * created on: 2013aug09  | 
11  |  | * created by: Markus W. Scherer  | 
12  |  | */  | 
13  |  |  | 
14  |  | #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0  // 0 or 1 or 2  | 
15  |  | #if DEBUG_COLLATION_FAST_LATIN_BUILDER  | 
16  |  | #include <stdio.h>  | 
17  |  | #include <string>  | 
18  |  | #endif  | 
19  |  |  | 
20  |  | #include "unicode/utypes.h"  | 
21  |  |  | 
22  |  | #if !UCONFIG_NO_COLLATION  | 
23  |  |  | 
24  |  | #include "unicode/ucol.h"  | 
25  |  | #include "unicode/ucharstrie.h"  | 
26  |  | #include "unicode/unistr.h"  | 
27  |  | #include "unicode/uobject.h"  | 
28  |  | #include "unicode/uscript.h"  | 
29  |  | #include "cmemory.h"  | 
30  |  | #include "collation.h"  | 
31  |  | #include "collationdata.h"  | 
32  |  | #include "collationfastlatin.h"  | 
33  |  | #include "collationfastlatinbuilder.h"  | 
34  |  | #include "uassert.h"  | 
35  |  | #include "uvectr64.h"  | 
36  |  |  | 
37  |  | U_NAMESPACE_BEGIN  | 
38  |  |  | 
39  |  | struct CollationData;  | 
40  |  |  | 
namespace {

/**
 * Compares two signed int64_t values as if they were unsigned.
 *
 * @param a the left-hand value
 * @param b the right-hand value
 * @return -1 if a<b, +1 if a>b, 0 if they are equal, all in unsigned order
 */
int32_t
compareInt64AsUnsigned(int64_t a, int64_t b) {
    const uint64_t ua = static_cast<uint64_t>(a);
    const uint64_t ub = static_cast<uint64_t>(b);
    if(ua < ub) {
        return -1;
    } else if(ua > ub) {
        return 1;
    } else {
        return 0;
    }
}

// TODO: Merge this with the near-identical version in collationbasedatabuilder.cpp
/**
 * Like Java Collections.binarySearch(List, String, Comparator).
 * The list must be sorted in unsigned order (see compareInt64AsUnsigned()).
 *
 * @param list  the sorted array to search
 * @param limit the number of elements in list; an empty or negative length
 *              yields the "insert at 0" result
 * @param ce    the value to look for
 * @return the index>=0 where the item was found,
 *         or the index<0 for inserting the string at ~index in sorted order
 */
int32_t
binarySearch(const int64_t list[], int32_t limit, int64_t ce) {
    // Nothing to search: report "insert at index 0".
    // A negative length is treated like an empty list rather than indexing before the array.
    if (limit <= 0) { return ~0; }
    int32_t start = 0;
    for (;;) {
        // Compute the midpoint without forming start + limit,
        // which could overflow int32_t for very large lists.
        int32_t i = start + (limit - start) / 2;
        int32_t cmp = compareInt64AsUnsigned(ce, list[i]);
        if (cmp == 0) {
            return i;
        } else if (cmp < 0) {
            if (i == start) {
                return ~start;  // insert ce before i
            }
            limit = i;
        } else {
            if (i == start) {
                return ~(start + 1);  // insert ce after i
            }
            start = i;
        }
    }
}

}  // namespace
88  |  |  | 
89  |  | CollationFastLatinBuilder::CollationFastLatinBuilder(UErrorCode &errorCode)  | 
90  | 0  |         : ce0(0), ce1(0),  | 
91  | 0  |           contractionCEs(errorCode), uniqueCEs(errorCode),  | 
92  |  |           miniCEs(NULL),  | 
93  | 0  |           firstDigitPrimary(0), firstLatinPrimary(0), lastLatinPrimary(0),  | 
94  | 0  |           firstShortPrimary(0), shortPrimaryOverflow(FALSE),  | 
95  | 0  |           headerLength(0) { | 
96  | 0  | }  | 
97  |  |  | 
98  | 0  | CollationFastLatinBuilder::~CollationFastLatinBuilder() { | 
99  | 0  |     uprv_free(miniCEs);  | 
100  | 0  | }  | 
101  |  |  | 
102  |  | UBool  | 
103  | 0  | CollationFastLatinBuilder::forData(const CollationData &data, UErrorCode &errorCode) { | 
104  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
105  | 0  |     if(!result.isEmpty()) {  // This builder is not reusable. | 
106  | 0  |         errorCode = U_INVALID_STATE_ERROR;  | 
107  | 0  |         return FALSE;  | 
108  | 0  |     }  | 
109  | 0  |     if(!loadGroups(data, errorCode)) { return FALSE; } | 
110  |  |  | 
111  |  |     // Fast handling of digits.  | 
112  | 0  |     firstShortPrimary = firstDigitPrimary;  | 
113  | 0  |     getCEs(data, errorCode);  | 
114  | 0  |     if(!encodeUniqueCEs(errorCode)) { return FALSE; } | 
115  | 0  |     if(shortPrimaryOverflow) { | 
116  |  |         // Give digits long mini primaries,  | 
117  |  |         // so that there are more short primaries for letters.  | 
118  | 0  |         firstShortPrimary = firstLatinPrimary;  | 
119  | 0  |         resetCEs();  | 
120  | 0  |         getCEs(data, errorCode);  | 
121  | 0  |         if(!encodeUniqueCEs(errorCode)) { return FALSE; } | 
122  | 0  |     }  | 
123  |  |     // Note: If we still have a short-primary overflow but not a long-primary overflow,  | 
124  |  |     // then we could calculate how many more long primaries would fit,  | 
125  |  |     // and set the firstShortPrimary to that many after the current firstShortPrimary,  | 
126  |  |     // and try again.  | 
127  |  |     // However, this might only benefit the en_US_POSIX tailoring,  | 
128  |  |     // and it is simpler to suppress building fast Latin data for it in genrb,  | 
129  |  |     // or by returning FALSE here if shortPrimaryOverflow.  | 
130  |  |  | 
131  | 0  |     UBool ok = !shortPrimaryOverflow &&  | 
132  | 0  |             encodeCharCEs(errorCode) && encodeContractions(errorCode);  | 
133  | 0  |     contractionCEs.removeAllElements();  // might reduce heap memory usage  | 
134  | 0  |     uniqueCEs.removeAllElements();  | 
135  | 0  |     return ok;  | 
136  | 0  | }  | 
137  |  |  | 
138  |  | UBool  | 
139  | 0  | CollationFastLatinBuilder::loadGroups(const CollationData &data, UErrorCode &errorCode) { | 
140  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
141  | 0  |     headerLength = 1 + NUM_SPECIAL_GROUPS;  | 
142  | 0  |     uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength;  | 
143  | 0  |     result.append((UChar)r0);  | 
144  |  |     // The first few reordering groups should be special groups  | 
145  |  |     // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.  | 
146  | 0  |     for(int32_t i = 0; i < NUM_SPECIAL_GROUPS; ++i) { | 
147  | 0  |         lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i);  | 
148  | 0  |         if(lastSpecialPrimaries[i] == 0) { | 
149  |  |             // missing data  | 
150  | 0  |             return FALSE;  | 
151  | 0  |         }  | 
152  | 0  |         result.append((UChar)0);  // reserve a slot for this group  | 
153  | 0  |     }  | 
154  |  |  | 
155  | 0  |     firstDigitPrimary = data.getFirstPrimaryForGroup(UCOL_REORDER_CODE_DIGIT);  | 
156  | 0  |     firstLatinPrimary = data.getFirstPrimaryForGroup(USCRIPT_LATIN);  | 
157  | 0  |     lastLatinPrimary = data.getLastPrimaryForGroup(USCRIPT_LATIN);  | 
158  | 0  |     if(firstDigitPrimary == 0 || firstLatinPrimary == 0) { | 
159  |  |         // missing data  | 
160  | 0  |         return FALSE;  | 
161  | 0  |     }  | 
162  | 0  |     return TRUE;  | 
163  | 0  | }  | 
164  |  |  | 
165  |  | UBool  | 
166  | 0  | CollationFastLatinBuilder::inSameGroup(uint32_t p, uint32_t q) const { | 
167  |  |     // Both or neither need to be encoded as short primaries,  | 
168  |  |     // so that we can test only one and use the same bit mask.  | 
169  | 0  |     if(p >= firstShortPrimary) { | 
170  | 0  |         return q >= firstShortPrimary;  | 
171  | 0  |     } else if(q >= firstShortPrimary) { | 
172  | 0  |         return FALSE;  | 
173  | 0  |     }  | 
174  |  |     // Both or neither must be potentially-variable,  | 
175  |  |     // so that we can test only one and determine if both are variable.  | 
176  | 0  |     uint32_t lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];  | 
177  | 0  |     if(p > lastVariablePrimary) { | 
178  | 0  |         return q > lastVariablePrimary;  | 
179  | 0  |     } else if(q > lastVariablePrimary) { | 
180  | 0  |         return FALSE;  | 
181  | 0  |     }  | 
182  |  |     // Both will be encoded with long mini primaries.  | 
183  |  |     // They must be in the same special reordering group,  | 
184  |  |     // so that we can test only one and determine if both are variable.  | 
185  | 0  |     U_ASSERT(p != 0 && q != 0);  | 
186  | 0  |     for(int32_t i = 0;; ++i) {  // will terminate | 
187  | 0  |         uint32_t lastPrimary = lastSpecialPrimaries[i];  | 
188  | 0  |         if(p <= lastPrimary) { | 
189  | 0  |             return q <= lastPrimary;  | 
190  | 0  |         } else if(q <= lastPrimary) { | 
191  | 0  |             return FALSE;  | 
192  | 0  |         }  | 
193  | 0  |     }  | 
194  | 0  | }  | 
195  |  |  | 
196  |  | void  | 
197  | 0  | CollationFastLatinBuilder::resetCEs() { | 
198  | 0  |     contractionCEs.removeAllElements();  | 
199  | 0  |     uniqueCEs.removeAllElements();  | 
200  | 0  |     shortPrimaryOverflow = FALSE;  | 
201  | 0  |     result.truncate(headerLength);  | 
202  | 0  | }  | 
203  |  |  | 
204  |  | void  | 
205  | 0  | CollationFastLatinBuilder::getCEs(const CollationData &data, UErrorCode &errorCode) { | 
206  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
207  | 0  |     int32_t i = 0;  | 
208  | 0  |     for(UChar c = 0;; ++i, ++c) { | 
209  | 0  |         if(c == CollationFastLatin::LATIN_LIMIT) { | 
210  | 0  |             c = CollationFastLatin::PUNCT_START;  | 
211  | 0  |         } else if(c == CollationFastLatin::PUNCT_LIMIT) { | 
212  | 0  |             break;  | 
213  | 0  |         }  | 
214  | 0  |         const CollationData *d;  | 
215  | 0  |         uint32_t ce32 = data.getCE32(c);  | 
216  | 0  |         if(ce32 == Collation::FALLBACK_CE32) { | 
217  | 0  |             d = data.base;  | 
218  | 0  |             ce32 = d->getCE32(c);  | 
219  | 0  |         } else { | 
220  | 0  |             d = &data;  | 
221  | 0  |         }  | 
222  | 0  |         if(getCEsFromCE32(*d, c, ce32, errorCode)) { | 
223  | 0  |             charCEs[i][0] = ce0;  | 
224  | 0  |             charCEs[i][1] = ce1;  | 
225  | 0  |             addUniqueCE(ce0, errorCode);  | 
226  | 0  |             addUniqueCE(ce1, errorCode);  | 
227  | 0  |         } else { | 
228  |  |             // bail out for c  | 
229  | 0  |             charCEs[i][0] = ce0 = Collation::NO_CE;  | 
230  | 0  |             charCEs[i][1] = ce1 = 0;  | 
231  | 0  |         }  | 
232  | 0  |         if(c == 0 && !isContractionCharCE(ce0)) { | 
233  |  |             // Always map U+0000 to a contraction.  | 
234  |  |             // Write a contraction list with only a default value if there is no real contraction.  | 
235  | 0  |             U_ASSERT(contractionCEs.isEmpty());  | 
236  | 0  |             addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode);  | 
237  | 0  |             charCEs[0][0] = ((int64_t)Collation::NO_CE_PRIMARY << 32) | CONTRACTION_FLAG;  | 
238  | 0  |             charCEs[0][1] = 0;  | 
239  | 0  |         }  | 
240  | 0  |     }  | 
241  |  |     // Terminate the last contraction list.  | 
242  | 0  |     contractionCEs.addElement(CollationFastLatin::CONTR_CHAR_MASK, errorCode);  | 
243  | 0  | }  | 
244  |  |  | 
245  |  | UBool  | 
246  |  | CollationFastLatinBuilder::getCEsFromCE32(const CollationData &data, UChar32 c, uint32_t ce32,  | 
247  | 0  |                                           UErrorCode &errorCode) { | 
248  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
249  | 0  |     ce32 = data.getFinalCE32(ce32);  | 
250  | 0  |     ce1 = 0;  | 
251  | 0  |     if(Collation::isSimpleOrLongCE32(ce32)) { | 
252  | 0  |         ce0 = Collation::ceFromCE32(ce32);  | 
253  | 0  |     } else { | 
254  | 0  |         switch(Collation::tagFromCE32(ce32)) { | 
255  | 0  |         case Collation::LATIN_EXPANSION_TAG:  | 
256  | 0  |             ce0 = Collation::latinCE0FromCE32(ce32);  | 
257  | 0  |             ce1 = Collation::latinCE1FromCE32(ce32);  | 
258  | 0  |             break;  | 
259  | 0  |         case Collation::EXPANSION32_TAG: { | 
260  | 0  |             const uint32_t *ce32s = data.ce32s + Collation::indexFromCE32(ce32);  | 
261  | 0  |             int32_t length = Collation::lengthFromCE32(ce32);  | 
262  | 0  |             if(length <= 2) { | 
263  | 0  |                 ce0 = Collation::ceFromCE32(ce32s[0]);  | 
264  | 0  |                 if(length == 2) { | 
265  | 0  |                     ce1 = Collation::ceFromCE32(ce32s[1]);  | 
266  | 0  |                 }  | 
267  | 0  |                 break;  | 
268  | 0  |             } else { | 
269  | 0  |                 return FALSE;  | 
270  | 0  |             }  | 
271  | 0  |         }  | 
272  | 0  |         case Collation::EXPANSION_TAG: { | 
273  | 0  |             const int64_t *ces = data.ces + Collation::indexFromCE32(ce32);  | 
274  | 0  |             int32_t length = Collation::lengthFromCE32(ce32);  | 
275  | 0  |             if(length <= 2) { | 
276  | 0  |                 ce0 = ces[0];  | 
277  | 0  |                 if(length == 2) { | 
278  | 0  |                     ce1 = ces[1];  | 
279  | 0  |                 }  | 
280  | 0  |                 break;  | 
281  | 0  |             } else { | 
282  | 0  |                 return FALSE;  | 
283  | 0  |             }  | 
284  | 0  |         }  | 
285  |  |         // Note: We could support PREFIX_TAG (assert c>=0)  | 
286  |  |         // by recursing on its default CE32 and checking that none of the prefixes starts  | 
287  |  |         // with a fast Latin character.  | 
288  |  |         // However, currently (2013) there are only the L-before-middle-dot  | 
289  |  |         // prefix mappings in the Latin range, and those would be rejected anyway.  | 
290  | 0  |         case Collation::CONTRACTION_TAG:  | 
291  | 0  |             U_ASSERT(c >= 0);  | 
292  | 0  |             return getCEsFromContractionCE32(data, ce32, errorCode);  | 
293  | 0  |         case Collation::OFFSET_TAG:  | 
294  | 0  |             U_ASSERT(c >= 0);  | 
295  | 0  |             ce0 = data.getCEFromOffsetCE32(c, ce32);  | 
296  | 0  |             break;  | 
297  | 0  |         default:  | 
298  | 0  |             return FALSE;  | 
299  | 0  |         }  | 
300  | 0  |     }  | 
301  |  |     // A mapping can be completely ignorable.  | 
302  | 0  |     if(ce0 == 0) { return ce1 == 0; } | 
303  |  |     // We do not support an ignorable ce0 unless it is completely ignorable.  | 
304  | 0  |     uint32_t p0 = (uint32_t)(ce0 >> 32);  | 
305  | 0  |     if(p0 == 0) { return FALSE; } | 
306  |  |     // We only support primaries up to the Latin script.  | 
307  | 0  |     if(p0 > lastLatinPrimary) { return FALSE; } | 
308  |  |     // We support non-common secondary and case weights only together with short primaries.  | 
309  | 0  |     uint32_t lower32_0 = (uint32_t)ce0;  | 
310  | 0  |     if(p0 < firstShortPrimary) { | 
311  | 0  |         uint32_t sc0 = lower32_0 & Collation::SECONDARY_AND_CASE_MASK;  | 
312  | 0  |         if(sc0 != Collation::COMMON_SECONDARY_CE) { return FALSE; } | 
313  | 0  |     }  | 
314  |  |     // No below-common tertiary weights.  | 
315  | 0  |     if((lower32_0 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; } | 
316  | 0  |     if(ce1 != 0) { | 
317  |  |         // Both primaries must be in the same group,  | 
318  |  |         // or both must get short mini primaries,  | 
319  |  |         // or a short-primary CE is followed by a secondary CE.  | 
320  |  |         // This is so that we can test the first primary and use the same mask for both,  | 
321  |  |         // and determine for both whether they are variable.  | 
322  | 0  |         uint32_t p1 = (uint32_t)(ce1 >> 32);  | 
323  | 0  |         if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return FALSE; } | 
324  | 0  |         uint32_t lower32_1 = (uint32_t)ce1;  | 
325  |  |         // No tertiary CEs.  | 
326  | 0  |         if((lower32_1 >> 16) == 0) { return FALSE; } | 
327  |  |         // We support non-common secondary and case weights  | 
328  |  |         // only for secondary CEs or together with short primaries.  | 
329  | 0  |         if(p1 != 0 && p1 < firstShortPrimary) { | 
330  | 0  |             uint32_t sc1 = lower32_1 & Collation::SECONDARY_AND_CASE_MASK;  | 
331  | 0  |             if(sc1 != Collation::COMMON_SECONDARY_CE) { return FALSE; } | 
332  | 0  |         }  | 
333  |  |         // No below-common tertiary weights.  | 
334  | 0  |         if((lower32_1 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; } | 
335  | 0  |     }  | 
336  |  |     // No quaternary weights.  | 
337  | 0  |     if(((ce0 | ce1) & Collation::QUATERNARY_MASK) != 0) { return FALSE; } | 
338  | 0  |     return TRUE;  | 
339  | 0  | }  | 
340  |  |  | 
341  |  | UBool  | 
342  |  | CollationFastLatinBuilder::getCEsFromContractionCE32(const CollationData &data, uint32_t ce32,  | 
343  | 0  |                                                      UErrorCode &errorCode) { | 
344  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
345  | 0  |     const UChar *p = data.contexts + Collation::indexFromCE32(ce32);  | 
346  | 0  |     ce32 = CollationData::readCE32(p);  // Default if no suffix match.  | 
347  |  |     // Since the original ce32 is not a prefix mapping,  | 
348  |  |     // the default ce32 must not be another contraction.  | 
349  | 0  |     U_ASSERT(!Collation::isContractionCE32(ce32));  | 
350  | 0  |     int32_t contractionIndex = contractionCEs.size();  | 
351  | 0  |     if(getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) { | 
352  | 0  |         addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode);  | 
353  | 0  |     } else { | 
354  |  |         // Bail out for c-without-contraction.  | 
355  | 0  |         addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, Collation::NO_CE, 0, errorCode);  | 
356  | 0  |     }  | 
357  |  |     // Handle an encodable contraction unless the next contraction is too long  | 
358  |  |     // and starts with the same character.  | 
359  | 0  |     int32_t prevX = -1;  | 
360  | 0  |     UBool addContraction = FALSE;  | 
361  | 0  |     UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);  | 
362  | 0  |     while(suffixes.next(errorCode)) { | 
363  | 0  |         const UnicodeString &suffix = suffixes.getString();  | 
364  | 0  |         int32_t x = CollationFastLatin::getCharIndex(suffix.charAt(0));  | 
365  | 0  |         if(x < 0) { continue; }  // ignore anything but fast Latin text | 
366  | 0  |         if(x == prevX) { | 
367  | 0  |             if(addContraction) { | 
368  |  |                 // Bail out for all contractions starting with this character.  | 
369  | 0  |                 addContractionEntry(x, Collation::NO_CE, 0, errorCode);  | 
370  | 0  |                 addContraction = FALSE;  | 
371  | 0  |             }  | 
372  | 0  |             continue;  | 
373  | 0  |         }  | 
374  | 0  |         if(addContraction) { | 
375  | 0  |             addContractionEntry(prevX, ce0, ce1, errorCode);  | 
376  | 0  |         }  | 
377  | 0  |         ce32 = (uint32_t)suffixes.getValue();  | 
378  | 0  |         if(suffix.length() == 1 && getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) { | 
379  | 0  |             addContraction = TRUE;  | 
380  | 0  |         } else { | 
381  | 0  |             addContractionEntry(x, Collation::NO_CE, 0, errorCode);  | 
382  | 0  |             addContraction = FALSE;  | 
383  | 0  |         }  | 
384  | 0  |         prevX = x;  | 
385  | 0  |     }  | 
386  | 0  |     if(addContraction) { | 
387  | 0  |         addContractionEntry(prevX, ce0, ce1, errorCode);  | 
388  | 0  |     }  | 
389  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
390  |  |     // Note: There might not be any fast Latin contractions, but  | 
391  |  |     // we need to enter contraction handling anyway so that we can bail out  | 
392  |  |     // when there is a non-fast-Latin character following.  | 
393  |  |     // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the  | 
394  |  |     // following umlaut and bail out, rather than return the difference of Y vs. u.  | 
395  | 0  |     ce0 = ((int64_t)Collation::NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex;  | 
396  | 0  |     ce1 = 0;  | 
397  | 0  |     return TRUE;  | 
398  | 0  | }  | 
399  |  |  | 
400  |  | void  | 
401  |  | CollationFastLatinBuilder::addContractionEntry(int32_t x, int64_t cce0, int64_t cce1,  | 
402  | 0  |                                                UErrorCode &errorCode) { | 
403  | 0  |     contractionCEs.addElement(x, errorCode);  | 
404  | 0  |     contractionCEs.addElement(cce0, errorCode);  | 
405  | 0  |     contractionCEs.addElement(cce1, errorCode);  | 
406  | 0  |     addUniqueCE(cce0, errorCode);  | 
407  | 0  |     addUniqueCE(cce1, errorCode);  | 
408  | 0  | }  | 
409  |  |  | 
410  |  | void  | 
411  | 0  | CollationFastLatinBuilder::addUniqueCE(int64_t ce, UErrorCode &errorCode) { | 
412  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
413  | 0  |     if(ce == 0 || (uint32_t)(ce >> 32) == Collation::NO_CE_PRIMARY) { return; } | 
414  | 0  |     ce &= ~(int64_t)Collation::CASE_MASK;  // blank out case bits  | 
415  | 0  |     int32_t i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);  | 
416  | 0  |     if(i < 0) { | 
417  | 0  |         uniqueCEs.insertElementAt(ce, ~i, errorCode);  | 
418  | 0  |     }  | 
419  | 0  | }  | 
420  |  |  | 
421  |  | uint32_t  | 
422  | 0  | CollationFastLatinBuilder::getMiniCE(int64_t ce) const { | 
423  | 0  |     ce &= ~(int64_t)Collation::CASE_MASK;  // blank out case bits  | 
424  | 0  |     int32_t index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);  | 
425  | 0  |     U_ASSERT(index >= 0);  | 
426  | 0  |     return miniCEs[index];  | 
427  | 0  | }  | 
428  |  |  | 
429  |  | UBool  | 
430  | 0  | CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) { | 
431  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
432  | 0  |     uprv_free(miniCEs);  | 
433  | 0  |     miniCEs = (uint16_t *)uprv_malloc(uniqueCEs.size() * 2);  | 
434  | 0  |     if(miniCEs == NULL) { | 
435  | 0  |         errorCode = U_MEMORY_ALLOCATION_ERROR;  | 
436  | 0  |         return FALSE;  | 
437  | 0  |     }  | 
438  | 0  |     int32_t group = 0;  | 
439  | 0  |     uint32_t lastGroupPrimary = lastSpecialPrimaries[group];  | 
440  |  |     // The lowest unique CE must be at least a secondary CE.  | 
441  | 0  |     U_ASSERT(((uint32_t)uniqueCEs.elementAti(0) >> 16) != 0);  | 
442  | 0  |     uint32_t prevPrimary = 0;  | 
443  | 0  |     uint32_t prevSecondary = 0;  | 
444  | 0  |     uint32_t pri = 0;  | 
445  | 0  |     uint32_t sec = 0;  | 
446  | 0  |     uint32_t ter = CollationFastLatin::COMMON_TER;  | 
447  | 0  |     for(int32_t i = 0; i < uniqueCEs.size(); ++i) { | 
448  | 0  |         int64_t ce = uniqueCEs.elementAti(i);  | 
449  |  |         // Note: At least one of the p/s/t weights changes from one unique CE to the next.  | 
450  |  |         // (uniqueCEs does not store case bits.)  | 
451  | 0  |         uint32_t p = (uint32_t)(ce >> 32);  | 
452  | 0  |         if(p != prevPrimary) { | 
453  | 0  |             while(p > lastGroupPrimary) { | 
454  | 0  |                 U_ASSERT(pri <= CollationFastLatin::MAX_LONG);  | 
455  |  |                 // Set the group's header entry to the  | 
456  |  |                 // last "long primary" in or before the group.  | 
457  | 0  |                 result.setCharAt(1 + group, (UChar)pri);  | 
458  | 0  |                 if(++group < NUM_SPECIAL_GROUPS) { | 
459  | 0  |                     lastGroupPrimary = lastSpecialPrimaries[group];  | 
460  | 0  |                 } else { | 
461  | 0  |                     lastGroupPrimary = 0xffffffff;  | 
462  | 0  |                     break;  | 
463  | 0  |                 }  | 
464  | 0  |             }  | 
465  | 0  |             if(p < firstShortPrimary) { | 
466  | 0  |                 if(pri == 0) { | 
467  | 0  |                     pri = CollationFastLatin::MIN_LONG;  | 
468  | 0  |                 } else if(pri < CollationFastLatin::MAX_LONG) { | 
469  | 0  |                     pri += CollationFastLatin::LONG_INC;  | 
470  | 0  |                 } else { | 
471  |  | #if DEBUG_COLLATION_FAST_LATIN_BUILDER  | 
472  |  |                     printf("long-primary overflow for %08x\n", p); | 
473  |  | #endif  | 
474  | 0  |                     miniCEs[i] = CollationFastLatin::BAIL_OUT;  | 
475  | 0  |                     continue;  | 
476  | 0  |                 }  | 
477  | 0  |             } else { | 
478  | 0  |                 if(pri < CollationFastLatin::MIN_SHORT) { | 
479  | 0  |                     pri = CollationFastLatin::MIN_SHORT;  | 
480  | 0  |                 } else if(pri < (CollationFastLatin::MAX_SHORT - CollationFastLatin::SHORT_INC)) { | 
481  |  |                     // Reserve the highest primary weight for U+FFFF.  | 
482  | 0  |                     pri += CollationFastLatin::SHORT_INC;  | 
483  | 0  |                 } else { | 
484  |  | #if DEBUG_COLLATION_FAST_LATIN_BUILDER  | 
485  |  |                     printf("short-primary overflow for %08x\n", p); | 
486  |  | #endif  | 
487  | 0  |                     shortPrimaryOverflow = TRUE;  | 
488  | 0  |                     miniCEs[i] = CollationFastLatin::BAIL_OUT;  | 
489  | 0  |                     continue;  | 
490  | 0  |                 }  | 
491  | 0  |             }  | 
492  | 0  |             prevPrimary = p;  | 
493  | 0  |             prevSecondary = Collation::COMMON_WEIGHT16;  | 
494  | 0  |             sec = CollationFastLatin::COMMON_SEC;  | 
495  | 0  |             ter = CollationFastLatin::COMMON_TER;  | 
496  | 0  |         }  | 
497  | 0  |         uint32_t lower32 = (uint32_t)ce;  | 
498  | 0  |         uint32_t s = lower32 >> 16;  | 
499  | 0  |         if(s != prevSecondary) { | 
500  | 0  |             if(pri == 0) { | 
501  | 0  |                 if(sec == 0) { | 
502  | 0  |                     sec = CollationFastLatin::MIN_SEC_HIGH;  | 
503  | 0  |                 } else if(sec < CollationFastLatin::MAX_SEC_HIGH) { | 
504  | 0  |                     sec += CollationFastLatin::SEC_INC;  | 
505  | 0  |                 } else { | 
506  | 0  |                     miniCEs[i] = CollationFastLatin::BAIL_OUT;  | 
507  | 0  |                     continue;  | 
508  | 0  |                 }  | 
509  | 0  |                 prevSecondary = s;  | 
510  | 0  |                 ter = CollationFastLatin::COMMON_TER;  | 
511  | 0  |             } else if(s < Collation::COMMON_WEIGHT16) { | 
512  | 0  |                 if(sec == CollationFastLatin::COMMON_SEC) { | 
513  | 0  |                     sec = CollationFastLatin::MIN_SEC_BEFORE;  | 
514  | 0  |                 } else if(sec < CollationFastLatin::MAX_SEC_BEFORE) { | 
515  | 0  |                     sec += CollationFastLatin::SEC_INC;  | 
516  | 0  |                 } else { | 
517  | 0  |                     miniCEs[i] = CollationFastLatin::BAIL_OUT;  | 
518  | 0  |                     continue;  | 
519  | 0  |                 }  | 
520  | 0  |             } else if(s == Collation::COMMON_WEIGHT16) { | 
521  | 0  |                 sec = CollationFastLatin::COMMON_SEC;  | 
522  | 0  |             } else { | 
523  | 0  |                 if(sec < CollationFastLatin::MIN_SEC_AFTER) { | 
524  | 0  |                     sec = CollationFastLatin::MIN_SEC_AFTER;  | 
525  | 0  |                 } else if(sec < CollationFastLatin::MAX_SEC_AFTER) { | 
526  | 0  |                     sec += CollationFastLatin::SEC_INC;  | 
527  | 0  |                 } else { | 
528  | 0  |                     miniCEs[i] = CollationFastLatin::BAIL_OUT;  | 
529  | 0  |                     continue;  | 
530  | 0  |                 }  | 
531  | 0  |             }  | 
532  | 0  |             prevSecondary = s;  | 
533  | 0  |             ter = CollationFastLatin::COMMON_TER;  | 
534  | 0  |         }  | 
535  | 0  |         U_ASSERT((lower32 & Collation::CASE_MASK) == 0);  // blanked out in uniqueCEs  | 
536  | 0  |         uint32_t t = lower32 & Collation::ONLY_TERTIARY_MASK;  | 
537  | 0  |         if(t > Collation::COMMON_WEIGHT16) { | 
538  | 0  |             if(ter < CollationFastLatin::MAX_TER_AFTER) { | 
539  | 0  |                 ++ter;  | 
540  | 0  |             } else { | 
541  | 0  |                 miniCEs[i] = CollationFastLatin::BAIL_OUT;  | 
542  | 0  |                 continue;  | 
543  | 0  |             }  | 
544  | 0  |         }  | 
545  | 0  |         if(CollationFastLatin::MIN_LONG <= pri && pri <= CollationFastLatin::MAX_LONG) { | 
546  | 0  |             U_ASSERT(sec == CollationFastLatin::COMMON_SEC);  | 
547  | 0  |             miniCEs[i] = (uint16_t)(pri | ter);  | 
548  | 0  |         } else { | 
549  | 0  |             miniCEs[i] = (uint16_t)(pri | sec | ter);  | 
550  | 0  |         }  | 
551  | 0  |     }  | 
552  |  | #if DEBUG_COLLATION_FAST_LATIN_BUILDER  | 
553  |  |     printf("last mini primary: %04x\n", pri); | 
554  |  | #endif  | 
555  |  | #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2  | 
556  |  |     for(int32_t i = 0; i < uniqueCEs.size(); ++i) { | 
557  |  |         int64_t ce = uniqueCEs.elementAti(i);  | 
558  |  |         printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]); | 
559  |  |     }  | 
560  |  | #endif  | 
561  | 0  |     return U_SUCCESS(errorCode);  | 
562  | 0  | }  | 
563  |  |  | 
564  |  | UBool  | 
565  | 0  | CollationFastLatinBuilder::encodeCharCEs(UErrorCode &errorCode) { | 
566  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
567  | 0  |     int32_t miniCEsStart = result.length();  | 
568  | 0  |     for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) { | 
569  | 0  |         result.append((UChar)0);  // initialize to completely ignorable  | 
570  | 0  |     }  | 
571  | 0  |     int32_t indexBase = result.length();  | 
572  | 0  |     for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) { | 
573  | 0  |         int64_t ce = charCEs[i][0];  | 
574  | 0  |         if(isContractionCharCE(ce)) { continue; }  // defer contraction | 
575  | 0  |         uint32_t miniCE = encodeTwoCEs(ce, charCEs[i][1]);  | 
576  | 0  |         if(miniCE > 0xffff) { | 
577  |  |             // Note: There is a chance that this new expansion is the same as a previous one,  | 
578  |  |             // and if so, then we could reuse the other expansion.  | 
579  |  |             // However, that seems unlikely.  | 
580  | 0  |             int32_t expansionIndex = result.length() - indexBase;  | 
581  | 0  |             if(expansionIndex > (int32_t)CollationFastLatin::INDEX_MASK) { | 
582  | 0  |                 miniCE = CollationFastLatin::BAIL_OUT;  | 
583  | 0  |             } else { | 
584  | 0  |                 result.append((UChar)(miniCE >> 16)).append((UChar)miniCE);  | 
585  | 0  |                 miniCE = CollationFastLatin::EXPANSION | expansionIndex;  | 
586  | 0  |             }  | 
587  | 0  |         }  | 
588  | 0  |         result.setCharAt(miniCEsStart + i, (UChar)miniCE);  | 
589  | 0  |     }  | 
590  | 0  |     return U_SUCCESS(errorCode);  | 
591  | 0  | }  | 
592  |  |  | 
593  |  | UBool  | 
594  | 0  | CollationFastLatinBuilder::encodeContractions(UErrorCode &errorCode) { | 
595  |  |     // We encode all contraction lists so that the first word of a list  | 
596  |  |     // terminates the previous list, and we only need one additional terminator at the end.  | 
597  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
598  | 0  |     int32_t indexBase = headerLength + CollationFastLatin::NUM_FAST_CHARS;  | 
599  | 0  |     int32_t firstContractionIndex = result.length();  | 
600  | 0  |     for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) { | 
601  | 0  |         int64_t ce = charCEs[i][0];  | 
602  | 0  |         if(!isContractionCharCE(ce)) { continue; } | 
603  | 0  |         int32_t contractionIndex = result.length() - indexBase;  | 
604  | 0  |         if(contractionIndex > (int32_t)CollationFastLatin::INDEX_MASK) { | 
605  | 0  |             result.setCharAt(headerLength + i, CollationFastLatin::BAIL_OUT);  | 
606  | 0  |             continue;  | 
607  | 0  |         }  | 
608  | 0  |         UBool firstTriple = TRUE;  | 
609  | 0  |         for(int32_t index = (int32_t)ce & 0x7fffffff;; index += 3) { | 
610  | 0  |             int32_t x = static_cast<int32_t>(contractionCEs.elementAti(index));  | 
611  | 0  |             if((uint32_t)x == CollationFastLatin::CONTR_CHAR_MASK && !firstTriple) { break; } | 
612  | 0  |             int64_t cce0 = contractionCEs.elementAti(index + 1);  | 
613  | 0  |             int64_t cce1 = contractionCEs.elementAti(index + 2);  | 
614  | 0  |             uint32_t miniCE = encodeTwoCEs(cce0, cce1);  | 
615  | 0  |             if(miniCE == CollationFastLatin::BAIL_OUT) { | 
616  | 0  |                 result.append((UChar)(x | (1 << CollationFastLatin::CONTR_LENGTH_SHIFT)));  | 
617  | 0  |             } else if(miniCE <= 0xffff) { | 
618  | 0  |                 result.append((UChar)(x | (2 << CollationFastLatin::CONTR_LENGTH_SHIFT)));  | 
619  | 0  |                 result.append((UChar)miniCE);  | 
620  | 0  |             } else { | 
621  | 0  |                 result.append((UChar)(x | (3 << CollationFastLatin::CONTR_LENGTH_SHIFT)));  | 
622  | 0  |                 result.append((UChar)(miniCE >> 16)).append((UChar)miniCE);  | 
623  | 0  |             }  | 
624  | 0  |             firstTriple = FALSE;  | 
625  | 0  |         }  | 
626  |  |         // Note: There is a chance that this new contraction list is the same as a previous one,  | 
627  |  |         // and if so, then we could truncate the result and reuse the other list.  | 
628  |  |         // However, that seems unlikely.  | 
629  | 0  |         result.setCharAt(headerLength + i,  | 
630  | 0  |                          (UChar)(CollationFastLatin::CONTRACTION | contractionIndex));  | 
631  | 0  |     }  | 
632  | 0  |     if(result.length() > firstContractionIndex) { | 
633  |  |         // Terminate the last contraction list.  | 
634  | 0  |         result.append((UChar)CollationFastLatin::CONTR_CHAR_MASK);  | 
635  | 0  |     }  | 
636  | 0  |     if(result.isBogus()) { | 
637  | 0  |         errorCode = U_MEMORY_ALLOCATION_ERROR;  | 
638  | 0  |         return FALSE;  | 
639  | 0  |     }  | 
640  |  | #if DEBUG_COLLATION_FAST_LATIN_BUILDER  | 
641  |  |     printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2); | 
642  |  |     puts("   header & below-digit groups map"); | 
643  |  |     int32_t i = 0;  | 
644  |  |     for(; i < headerLength; ++i) { | 
645  |  |         printf(" %04x", result[i]); | 
646  |  |     }  | 
647  |  |     printf("\n   char mini CEs"); | 
648  |  |     U_ASSERT(CollationFastLatin::NUM_FAST_CHARS % 16 == 0);  | 
649  |  |     for(; i < indexBase; i += 16) { | 
650  |  |         UChar32 c = i - headerLength;  | 
651  |  |         if(c >= CollationFastLatin::LATIN_LIMIT) { | 
652  |  |             c = CollationFastLatin::PUNCT_START + c - CollationFastLatin::LATIN_LIMIT;  | 
653  |  |         }  | 
654  |  |         printf("\n %04x:", c); | 
655  |  |         for(int32_t j = 0; j < 16; ++j) { | 
656  |  |             printf(" %04x", result[i + j]); | 
657  |  |         }  | 
658  |  |     }  | 
659  |  |     printf("\n   expansions & contractions"); | 
660  |  |     for(; i < result.length(); ++i) { | 
661  |  |         if((i - indexBase) % 16 == 0) { puts(""); } | 
662  |  |         printf(" %04x", result[i]); | 
663  |  |     }  | 
664  |  |     puts(""); | 
665  |  | #endif  | 
666  | 0  |     return TRUE;  | 
667  | 0  | }  | 
668  |  |  | 
669  |  | uint32_t  | 
670  | 0  | CollationFastLatinBuilder::encodeTwoCEs(int64_t first, int64_t second) const { | 
671  | 0  |     if(first == 0) { | 
672  | 0  |         return 0;  // completely ignorable  | 
673  | 0  |     }  | 
674  | 0  |     if(first == Collation::NO_CE) { | 
675  | 0  |         return CollationFastLatin::BAIL_OUT;  | 
676  | 0  |     }  | 
677  | 0  |     U_ASSERT((uint32_t)(first >> 32) != Collation::NO_CE_PRIMARY);  | 
678  |  | 
  | 
679  | 0  |     uint32_t miniCE = getMiniCE(first);  | 
680  | 0  |     if(miniCE == CollationFastLatin::BAIL_OUT) { return miniCE; } | 
681  | 0  |     if(miniCE >= CollationFastLatin::MIN_SHORT) { | 
682  |  |         // Extract & copy the case bits.  | 
683  |  |         // Shift them from normal CE bits 15..14 to mini CE bits 4..3.  | 
684  | 0  |         uint32_t c = (((uint32_t)first & Collation::CASE_MASK) >> (14 - 3));  | 
685  |  |         // Only in mini CEs: Ignorable case bits = 0, lowercase = 1.  | 
686  | 0  |         c += CollationFastLatin::LOWER_CASE;  | 
687  | 0  |         miniCE |= c;  | 
688  | 0  |     }  | 
689  | 0  |     if(second == 0) { return miniCE; } | 
690  |  |  | 
691  | 0  |     uint32_t miniCE1 = getMiniCE(second);  | 
692  | 0  |     if(miniCE1 == CollationFastLatin::BAIL_OUT) { return miniCE1; } | 
693  |  |  | 
694  | 0  |     uint32_t case1 = (uint32_t)second & Collation::CASE_MASK;  | 
695  | 0  |     if(miniCE >= CollationFastLatin::MIN_SHORT &&  | 
696  | 0  |             (miniCE & CollationFastLatin::SECONDARY_MASK) == CollationFastLatin::COMMON_SEC) { | 
697  |  |         // Try to combine the two mini CEs into one.  | 
698  | 0  |         uint32_t sec1 = miniCE1 & CollationFastLatin::SECONDARY_MASK;  | 
699  | 0  |         uint32_t ter1 = miniCE1 & CollationFastLatin::TERTIARY_MASK;  | 
700  | 0  |         if(sec1 >= CollationFastLatin::MIN_SEC_HIGH && case1 == 0 &&  | 
701  | 0  |                 ter1 == CollationFastLatin::COMMON_TER) { | 
702  |  |             // sec1>=sec_high implies pri1==0.  | 
703  | 0  |             return (miniCE & ~CollationFastLatin::SECONDARY_MASK) | sec1;  | 
704  | 0  |         }  | 
705  | 0  |     }  | 
706  |  |  | 
707  | 0  |     if(miniCE1 <= CollationFastLatin::SECONDARY_MASK || CollationFastLatin::MIN_SHORT <= miniCE1) { | 
708  |  |         // Secondary CE, or a CE with a short primary, copy the case bits.  | 
709  | 0  |         case1 = (case1 >> (14 - 3)) + CollationFastLatin::LOWER_CASE;  | 
710  | 0  |         miniCE1 |= case1;  | 
711  | 0  |     }  | 
712  | 0  |     return (miniCE << 16) | miniCE1;  | 
713  | 0  | }  | 
714  |  |  | 
715  |  | U_NAMESPACE_END  | 
716  |  |  | 
717  |  | #endif  // !UCONFIG_NO_COLLATION  |