/src/icu/source/i18n/collationbuilder.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | * Copyright (C) 2013-2014, International Business Machines  | 
6  |  | * Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | * collationbuilder.cpp  | 
9  |  | *  | 
10  |  | * (replaced the former ucol_bld.cpp)  | 
11  |  | *  | 
12  |  | * created on: 2013may06  | 
13  |  | * created by: Markus W. Scherer  | 
14  |  | */  | 
15  |  |  | 
16  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
17  |  | #include <stdio.h>  | 
18  |  | #endif  | 
19  |  |  | 
20  |  | #include "unicode/utypes.h"  | 
21  |  |  | 
22  |  | #if !UCONFIG_NO_COLLATION  | 
23  |  |  | 
24  |  | #include "unicode/caniter.h"  | 
25  |  | #include "unicode/normalizer2.h"  | 
26  |  | #include "unicode/tblcoll.h"  | 
27  |  | #include "unicode/parseerr.h"  | 
28  |  | #include "unicode/uchar.h"  | 
29  |  | #include "unicode/ucol.h"  | 
30  |  | #include "unicode/unistr.h"  | 
31  |  | #include "unicode/usetiter.h"  | 
32  |  | #include "unicode/utf16.h"  | 
33  |  | #include "unicode/uversion.h"  | 
34  |  | #include "cmemory.h"  | 
35  |  | #include "collation.h"  | 
36  |  | #include "collationbuilder.h"  | 
37  |  | #include "collationdata.h"  | 
38  |  | #include "collationdatabuilder.h"  | 
39  |  | #include "collationfastlatin.h"  | 
40  |  | #include "collationroot.h"  | 
41  |  | #include "collationrootelements.h"  | 
42  |  | #include "collationruleparser.h"  | 
43  |  | #include "collationsettings.h"  | 
44  |  | #include "collationtailoring.h"  | 
45  |  | #include "collationweights.h"  | 
46  |  | #include "normalizer2impl.h"  | 
47  |  | #include "uassert.h"  | 
48  |  | #include "ucol_imp.h"  | 
49  |  | #include "utf16collationiterator.h"  | 
50  |  |  | 
51  |  | U_NAMESPACE_BEGIN  | 
52  |  |  | 
53  |  | namespace { | 
54  |  |  | 
55  |  | class BundleImporter : public CollationRuleParser::Importer { | 
56  |  | public:  | 
57  | 0  |     BundleImporter() {} | 
58  |  |     virtual ~BundleImporter();  | 
59  |  |     virtual void getRules(  | 
60  |  |             const char *localeID, const char *collationType,  | 
61  |  |             UnicodeString &rules,  | 
62  |  |             const char *&errorReason, UErrorCode &errorCode);  | 
63  |  | };  | 
64  |  |  | 
65  |  | BundleImporter::~BundleImporter() {} | 
66  |  |  | 
67  |  | void  | 
68  |  | BundleImporter::getRules(  | 
69  |  |         const char *localeID, const char *collationType,  | 
70  |  |         UnicodeString &rules,  | 
71  | 0  |         const char *& /*errorReason*/, UErrorCode &errorCode) { | 
72  | 0  |     CollationLoader::loadRules(localeID, collationType, rules, errorCode);  | 
73  | 0  | }  | 
74  |  |  | 
75  |  | }  // namespace  | 
76  |  |  | 
77  |  | // RuleBasedCollator implementation ---------------------------------------- ***  | 
78  |  |  | 
79  |  | // These methods are here, rather than in rulebasedcollator.cpp,  | 
80  |  | // for modularization:  | 
81  |  | // Most code using Collator does not need to build a Collator from rules.  | 
82  |  | // By moving these constructors and helper methods to a separate file,  | 
83  |  | // most code will not have a static dependency on the builder code.  | 
84  |  |  | 
85  |  | RuleBasedCollator::RuleBasedCollator()  | 
86  |  |         : data(NULL),  | 
87  |  |           settings(NULL),  | 
88  |  |           tailoring(NULL),  | 
89  |  |           cacheEntry(NULL),  | 
90  | 0  |           validLocale(""), | 
91  | 0  |           explicitlySetAttributes(0),  | 
92  | 0  |           actualLocaleIsSameAsValid(FALSE) { | 
93  | 0  | }  | 
94  |  |  | 
95  |  | RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode)  | 
96  |  |         : data(NULL),  | 
97  |  |           settings(NULL),  | 
98  |  |           tailoring(NULL),  | 
99  |  |           cacheEntry(NULL),  | 
100  | 0  |           validLocale(""), | 
101  | 0  |           explicitlySetAttributes(0),  | 
102  | 0  |           actualLocaleIsSameAsValid(FALSE) { | 
103  | 0  |     internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode);  | 
104  | 0  | }  | 
105  |  |  | 
106  |  | RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength,  | 
107  |  |                                      UErrorCode &errorCode)  | 
108  |  |         : data(NULL),  | 
109  |  |           settings(NULL),  | 
110  |  |           tailoring(NULL),  | 
111  |  |           cacheEntry(NULL),  | 
112  | 0  |           validLocale(""), | 
113  | 0  |           explicitlySetAttributes(0),  | 
114  | 0  |           actualLocaleIsSameAsValid(FALSE) { | 
115  | 0  |     internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode);  | 
116  | 0  | }  | 
117  |  |  | 
118  |  | RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,  | 
119  |  |                                      UColAttributeValue decompositionMode,  | 
120  |  |                                      UErrorCode &errorCode)  | 
121  |  |         : data(NULL),  | 
122  |  |           settings(NULL),  | 
123  |  |           tailoring(NULL),  | 
124  |  |           cacheEntry(NULL),  | 
125  | 0  |           validLocale(""), | 
126  | 0  |           explicitlySetAttributes(0),  | 
127  | 0  |           actualLocaleIsSameAsValid(FALSE) { | 
128  | 0  |     internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode);  | 
129  | 0  | }  | 
130  |  |  | 
131  |  | RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,  | 
132  |  |                                      ECollationStrength strength,  | 
133  |  |                                      UColAttributeValue decompositionMode,  | 
134  |  |                                      UErrorCode &errorCode)  | 
135  |  |         : data(NULL),  | 
136  |  |           settings(NULL),  | 
137  |  |           tailoring(NULL),  | 
138  |  |           cacheEntry(NULL),  | 
139  | 0  |           validLocale(""), | 
140  | 0  |           explicitlySetAttributes(0),  | 
141  | 0  |           actualLocaleIsSameAsValid(FALSE) { | 
142  | 0  |     internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode);  | 
143  | 0  | }  | 
144  |  |  | 
145  |  | RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,  | 
146  |  |                                      UParseError &parseError, UnicodeString &reason,  | 
147  |  |                                      UErrorCode &errorCode)  | 
148  |  |         : data(NULL),  | 
149  |  |           settings(NULL),  | 
150  |  |           tailoring(NULL),  | 
151  |  |           cacheEntry(NULL),  | 
152  | 0  |           validLocale(""), | 
153  | 0  |           explicitlySetAttributes(0),  | 
154  | 0  |           actualLocaleIsSameAsValid(FALSE) { | 
155  | 0  |     internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode);  | 
156  | 0  | }  | 
157  |  |  | 
158  |  | void  | 
159  |  | RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,  | 
160  |  |                                           int32_t strength,  | 
161  |  |                                           UColAttributeValue decompositionMode,  | 
162  |  |                                           UParseError *outParseError, UnicodeString *outReason,  | 
163  | 0  |                                           UErrorCode &errorCode) { | 
164  | 0  |     const CollationTailoring *base = CollationRoot::getRoot(errorCode);  | 
165  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
166  | 0  |     if(outReason != NULL) { outReason->remove(); } | 
167  | 0  |     CollationBuilder builder(base, errorCode);  | 
168  | 0  |     UVersionInfo noVersion = { 0, 0, 0, 0 }; | 
169  | 0  |     BundleImporter importer;  | 
170  | 0  |     LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion,  | 
171  | 0  |                                                              &importer,  | 
172  | 0  |                                                              outParseError, errorCode));  | 
173  | 0  |     if(U_FAILURE(errorCode)) { | 
174  | 0  |         const char *reason = builder.getErrorReason();  | 
175  | 0  |         if(reason != NULL && outReason != NULL) { | 
176  | 0  |             *outReason = UnicodeString(reason, -1, US_INV);  | 
177  | 0  |         }  | 
178  | 0  |         return;  | 
179  | 0  |     }  | 
180  | 0  |     t->actualLocale.setToBogus();  | 
181  | 0  |     adoptTailoring(t.orphan(), errorCode);  | 
182  |  |     // Set attributes after building the collator,  | 
183  |  |     // to keep the default settings consistent with the rule string.  | 
184  | 0  |     if(strength != UCOL_DEFAULT) { | 
185  | 0  |         setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode);  | 
186  | 0  |     }  | 
187  | 0  |     if(decompositionMode != UCOL_DEFAULT) { | 
188  | 0  |         setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);  | 
189  | 0  |     }  | 
190  | 0  | }  | 
191  |  |  | 
192  |  | // CollationBuilder implementation ----------------------------------------- ***  | 
193  |  |  | 
194  |  | // Some compilers don't care if constants are defined in the .cpp file.  | 
195  |  | // MS Visual C++ does not like it, but gcc requires it. clang does not care.  | 
196  |  | #ifndef _MSC_VER  | 
197  |  | const int32_t CollationBuilder::HAS_BEFORE2;  | 
198  |  | const int32_t CollationBuilder::HAS_BEFORE3;  | 
199  |  | #endif  | 
200  |  |  | 
201  |  | CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)  | 
202  | 0  |         : nfd(*Normalizer2::getNFDInstance(errorCode)),  | 
203  | 0  |           fcd(*Normalizer2Factory::getFCDInstance(errorCode)),  | 
204  | 0  |           nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),  | 
205  | 0  |           base(b),  | 
206  | 0  |           baseData(b->data),  | 
207  | 0  |           rootElements(b->data->rootElements, b->data->rootElementsLength),  | 
208  | 0  |           variableTop(0),  | 
209  | 0  |           dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE),  | 
210  |  |           errorReason(NULL),  | 
211  | 0  |           cesLength(0),  | 
212  | 0  |           rootPrimaryIndexes(errorCode), nodes(errorCode) { | 
213  | 0  |     nfcImpl.ensureCanonIterData(errorCode);  | 
214  | 0  |     if(U_FAILURE(errorCode)) { | 
215  | 0  |         errorReason = "CollationBuilder fields initialization failed";  | 
216  | 0  |         return;  | 
217  | 0  |     }  | 
218  | 0  |     if(dataBuilder == NULL) { | 
219  | 0  |         errorCode = U_MEMORY_ALLOCATION_ERROR;  | 
220  | 0  |         return;  | 
221  | 0  |     }  | 
222  | 0  |     dataBuilder->initForTailoring(baseData, errorCode);  | 
223  | 0  |     if(U_FAILURE(errorCode)) { | 
224  | 0  |         errorReason = "CollationBuilder initialization failed";  | 
225  | 0  |     }  | 
226  | 0  | }  | 
227  |  |  | 
228  | 0  | CollationBuilder::~CollationBuilder() { | 
229  | 0  |     delete dataBuilder;  | 
230  | 0  | }  | 
231  |  |  | 
232  |  | CollationTailoring *  | 
233  |  | CollationBuilder::parseAndBuild(const UnicodeString &ruleString,  | 
234  |  |                                 const UVersionInfo rulesVersion,  | 
235  |  |                                 CollationRuleParser::Importer *importer,  | 
236  |  |                                 UParseError *outParseError,  | 
237  | 0  |                                 UErrorCode &errorCode) { | 
238  | 0  |     if(U_FAILURE(errorCode)) { return NULL; } | 
239  | 0  |     if(baseData->rootElements == NULL) { | 
240  | 0  |         errorCode = U_MISSING_RESOURCE_ERROR;  | 
241  | 0  |         errorReason = "missing root elements data, tailoring not supported";  | 
242  | 0  |         return NULL;  | 
243  | 0  |     }  | 
244  | 0  |     LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings));  | 
245  | 0  |     if(tailoring.isNull() || tailoring->isBogus()) { | 
246  | 0  |         errorCode = U_MEMORY_ALLOCATION_ERROR;  | 
247  | 0  |         return NULL;  | 
248  | 0  |     }  | 
249  | 0  |     CollationRuleParser parser(baseData, errorCode);  | 
250  | 0  |     if(U_FAILURE(errorCode)) { return NULL; } | 
251  |  |     // Note: This always bases &[last variable] and &[first regular]  | 
252  |  |     // on the root collator's maxVariable/variableTop.  | 
253  |  |     // If we wanted this to change after [maxVariable x], then we would keep  | 
254  |  |     // the tailoring.settings pointer here and read its variableTop when we need it.  | 
255  |  |     // See http://unicode.org/cldr/trac/ticket/6070  | 
256  | 0  |     variableTop = base->settings->variableTop;  | 
257  | 0  |     parser.setSink(this);  | 
258  | 0  |     parser.setImporter(importer);  | 
259  | 0  |     CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings);  | 
260  | 0  |     parser.parse(ruleString, ownedSettings, outParseError, errorCode);  | 
261  | 0  |     errorReason = parser.getErrorReason();  | 
262  | 0  |     if(U_FAILURE(errorCode)) { return NULL; } | 
263  | 0  |     if(dataBuilder->hasMappings()) { | 
264  | 0  |         makeTailoredCEs(errorCode);  | 
265  | 0  |         closeOverComposites(errorCode);  | 
266  | 0  |         finalizeCEs(errorCode);  | 
267  |  |         // Copy all of ASCII, and Latin-1 letters, into each tailoring.  | 
268  | 0  |         optimizeSet.add(0, 0x7f);  | 
269  | 0  |         optimizeSet.add(0xc0, 0xff);  | 
270  |  |         // Hangul is decomposed on the fly during collation,  | 
271  |  |         // and the tailoring data is always built with HANGUL_TAG specials.  | 
272  | 0  |         optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);  | 
273  | 0  |         dataBuilder->optimize(optimizeSet, errorCode);  | 
274  | 0  |         tailoring->ensureOwnedData(errorCode);  | 
275  | 0  |         if(U_FAILURE(errorCode)) { return NULL; } | 
276  | 0  |         if(fastLatinEnabled) { dataBuilder->enableFastLatin(); } | 
277  | 0  |         dataBuilder->build(*tailoring->ownedData, errorCode);  | 
278  | 0  |         tailoring->builder = dataBuilder;  | 
279  | 0  |         dataBuilder = NULL;  | 
280  | 0  |     } else { | 
281  | 0  |         tailoring->data = baseData;  | 
282  | 0  |     }  | 
283  | 0  |     if(U_FAILURE(errorCode)) { return NULL; } | 
284  | 0  |     ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(  | 
285  | 0  |         tailoring->data, ownedSettings,  | 
286  | 0  |         ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));  | 
287  | 0  |     tailoring->rules = ruleString;  | 
288  | 0  |     tailoring->rules.getTerminatedBuffer();  // ensure NUL-termination  | 
289  | 0  |     tailoring->setVersion(base->version, rulesVersion);  | 
290  | 0  |     return tailoring.orphan();  | 
291  | 0  | }  | 
292  |  |  | 
293  |  | void  | 
294  |  | CollationBuilder::addReset(int32_t strength, const UnicodeString &str,  | 
295  | 0  |                            const char *&parserErrorReason, UErrorCode &errorCode) { | 
296  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
297  | 0  |     U_ASSERT(!str.isEmpty());  | 
298  | 0  |     if(str.charAt(0) == CollationRuleParser::POS_LEAD) { | 
299  | 0  |         ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode);  | 
300  | 0  |         cesLength = 1;  | 
301  | 0  |         if(U_FAILURE(errorCode)) { return; } | 
302  | 0  |         U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0);  | 
303  | 0  |     } else { | 
304  |  |         // normal reset to a character or string  | 
305  | 0  |         UnicodeString nfdString = nfd.normalize(str, errorCode);  | 
306  | 0  |         if(U_FAILURE(errorCode)) { | 
307  | 0  |             parserErrorReason = "normalizing the reset position";  | 
308  | 0  |             return;  | 
309  | 0  |         }  | 
310  | 0  |         cesLength = dataBuilder->getCEs(nfdString, ces, 0);  | 
311  | 0  |         if(cesLength > Collation::MAX_EXPANSION_LENGTH) { | 
312  | 0  |             errorCode = U_ILLEGAL_ARGUMENT_ERROR;  | 
313  | 0  |             parserErrorReason = "reset position maps to too many collation elements (more than 31)";  | 
314  | 0  |             return;  | 
315  | 0  |         }  | 
316  | 0  |     }  | 
317  | 0  |     if(strength == UCOL_IDENTICAL) { return; }  // simple reset-at-position | 
318  |  |  | 
319  |  |     // &[before strength]position  | 
320  | 0  |     U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY);  | 
321  | 0  |     int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);  | 
322  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
323  |  |  | 
324  | 0  |     int64_t node = nodes.elementAti(index);  | 
325  |  |     // If the index is for a "weaker" node,  | 
326  |  |     // then skip backwards over this and further "weaker" nodes.  | 
327  | 0  |     while(strengthFromNode(node) > strength) { | 
328  | 0  |         index = previousIndexFromNode(node);  | 
329  | 0  |         node = nodes.elementAti(index);  | 
330  | 0  |     }  | 
331  |  |  | 
332  |  |     // Find or insert a node whose index we will put into a temporary CE.  | 
333  | 0  |     if(strengthFromNode(node) == strength && isTailoredNode(node)) { | 
334  |  |         // Reset to just before this same-strength tailored node.  | 
335  | 0  |         index = previousIndexFromNode(node);  | 
336  | 0  |     } else if(strength == UCOL_PRIMARY) { | 
337  |  |         // root primary node (has no previous index)  | 
338  | 0  |         uint32_t p = weight32FromNode(node);  | 
339  | 0  |         if(p == 0) { | 
340  | 0  |             errorCode = U_UNSUPPORTED_ERROR;  | 
341  | 0  |             parserErrorReason = "reset primary-before ignorable not possible";  | 
342  | 0  |             return;  | 
343  | 0  |         }  | 
344  | 0  |         if(p <= rootElements.getFirstPrimary()) { | 
345  |  |             // There is no primary gap between ignorables and the space-first-primary.  | 
346  | 0  |             errorCode = U_UNSUPPORTED_ERROR;  | 
347  | 0  |             parserErrorReason = "reset primary-before first non-ignorable not supported";  | 
348  | 0  |             return;  | 
349  | 0  |         }  | 
350  | 0  |         if(p == Collation::FIRST_TRAILING_PRIMARY) { | 
351  |  |             // We do not support tailoring to an unassigned-implicit CE.  | 
352  | 0  |             errorCode = U_UNSUPPORTED_ERROR;  | 
353  | 0  |             parserErrorReason = "reset primary-before [first trailing] not supported";  | 
354  | 0  |             return;  | 
355  | 0  |         }  | 
356  | 0  |         p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p));  | 
357  | 0  |         index = findOrInsertNodeForPrimary(p, errorCode);  | 
358  |  |         // Go to the last node in this list:  | 
359  |  |         // Tailor after the last node between adjacent root nodes.  | 
360  | 0  |         for(;;) { | 
361  | 0  |             node = nodes.elementAti(index);  | 
362  | 0  |             int32_t nextIndex = nextIndexFromNode(node);  | 
363  | 0  |             if(nextIndex == 0) { break; } | 
364  | 0  |             index = nextIndex;  | 
365  | 0  |         }  | 
366  | 0  |     } else { | 
367  |  |         // &[before 2] or &[before 3]  | 
368  | 0  |         index = findCommonNode(index, UCOL_SECONDARY);  | 
369  | 0  |         if(strength >= UCOL_TERTIARY) { | 
370  | 0  |             index = findCommonNode(index, UCOL_TERTIARY);  | 
371  | 0  |         }  | 
372  |  |         // findCommonNode() stayed on the stronger node or moved to  | 
373  |  |         // an explicit common-weight node of the reset-before strength.  | 
374  | 0  |         node = nodes.elementAti(index);  | 
375  | 0  |         if(strengthFromNode(node) == strength) { | 
376  |  |             // Found a same-strength node with an explicit weight.  | 
377  | 0  |             uint32_t weight16 = weight16FromNode(node);  | 
378  | 0  |             if(weight16 == 0) { | 
379  | 0  |                 errorCode = U_UNSUPPORTED_ERROR;  | 
380  | 0  |                 if(strength == UCOL_SECONDARY) { | 
381  | 0  |                     parserErrorReason = "reset secondary-before secondary ignorable not possible";  | 
382  | 0  |                 } else { | 
383  | 0  |                     parserErrorReason = "reset tertiary-before completely ignorable not possible";  | 
384  | 0  |                 }  | 
385  | 0  |                 return;  | 
386  | 0  |             }  | 
387  | 0  |             U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16);  | 
388  |  |             // Reset to just before this node.  | 
389  |  |             // Insert the preceding same-level explicit weight if it is not there already.  | 
390  |  |             // Which explicit weight immediately precedes this one?  | 
391  | 0  |             weight16 = getWeight16Before(index, node, strength);  | 
392  |  |             // Does this preceding weight have a node?  | 
393  | 0  |             uint32_t previousWeight16;  | 
394  | 0  |             int32_t previousIndex = previousIndexFromNode(node);  | 
395  | 0  |             for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) { | 
396  | 0  |                 node = nodes.elementAti(i);  | 
397  | 0  |                 int32_t previousStrength = strengthFromNode(node);  | 
398  | 0  |                 if(previousStrength < strength) { | 
399  | 0  |                     U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex);  | 
400  |  |                     // Either the reset element has an above-common weight and  | 
401  |  |                     // the parent node provides the implied common weight,  | 
402  |  |                     // or the reset element has a weight<=common in the node  | 
403  |  |                     // right after the parent, and we need to insert the preceding weight.  | 
404  | 0  |                     previousWeight16 = Collation::COMMON_WEIGHT16;  | 
405  | 0  |                     break;  | 
406  | 0  |                 } else if(previousStrength == strength && !isTailoredNode(node)) { | 
407  | 0  |                     previousWeight16 = weight16FromNode(node);  | 
408  | 0  |                     break;  | 
409  | 0  |                 }  | 
410  |  |                 // Skip weaker nodes and same-level tailored nodes.  | 
411  | 0  |             }  | 
412  | 0  |             if(previousWeight16 == weight16) { | 
413  |  |                 // The preceding weight has a node,  | 
414  |  |                 // maybe with following weaker or tailored nodes.  | 
415  |  |                 // Reset to the last of them.  | 
416  | 0  |                 index = previousIndex;  | 
417  | 0  |             } else { | 
418  |  |                 // Insert a node with the preceding weight, reset to that.  | 
419  | 0  |                 node = nodeFromWeight16(weight16) | nodeFromStrength(strength);  | 
420  | 0  |                 index = insertNodeBetween(previousIndex, index, node, errorCode);  | 
421  | 0  |             }  | 
422  | 0  |         } else { | 
423  |  |             // Found a stronger node with implied strength-common weight.  | 
424  | 0  |             uint32_t weight16 = getWeight16Before(index, node, strength);  | 
425  | 0  |             index = findOrInsertWeakNode(index, weight16, strength, errorCode);  | 
426  | 0  |         }  | 
427  |  |         // Strength of the temporary CE = strength of its reset position.  | 
428  |  |         // Code above raises an error if the before-strength is stronger.  | 
429  | 0  |         strength = ceStrength(ces[cesLength - 1]);  | 
430  | 0  |     }  | 
431  | 0  |     if(U_FAILURE(errorCode)) { | 
432  | 0  |         parserErrorReason = "inserting reset position for &[before n]";  | 
433  | 0  |         return;  | 
434  | 0  |     }  | 
435  | 0  |     ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);  | 
436  | 0  | }  | 
437  |  |  | 
438  |  | uint32_t  | 
439  | 0  | CollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) { | 
440  | 0  |     U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node));  | 
441  |  |     // Collect the root CE weights if this node is for a root CE.  | 
442  |  |     // If it is not, then return the low non-primary boundary for a tailored CE.  | 
443  | 0  |     uint32_t t;  | 
444  | 0  |     if(strengthFromNode(node) == UCOL_TERTIARY) { | 
445  | 0  |         t = weight16FromNode(node);  | 
446  | 0  |     } else { | 
447  | 0  |         t = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.  | 
448  | 0  |     }  | 
449  | 0  |     while(strengthFromNode(node) > UCOL_SECONDARY) { | 
450  | 0  |         index = previousIndexFromNode(node);  | 
451  | 0  |         node = nodes.elementAti(index);  | 
452  | 0  |     }  | 
453  | 0  |     if(isTailoredNode(node)) { | 
454  | 0  |         return Collation::BEFORE_WEIGHT16;  | 
455  | 0  |     }  | 
456  | 0  |     uint32_t s;  | 
457  | 0  |     if(strengthFromNode(node) == UCOL_SECONDARY) { | 
458  | 0  |         s = weight16FromNode(node);  | 
459  | 0  |     } else { | 
460  | 0  |         s = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.  | 
461  | 0  |     }  | 
462  | 0  |     while(strengthFromNode(node) > UCOL_PRIMARY) { | 
463  | 0  |         index = previousIndexFromNode(node);  | 
464  | 0  |         node = nodes.elementAti(index);  | 
465  | 0  |     }  | 
466  | 0  |     if(isTailoredNode(node)) { | 
467  | 0  |         return Collation::BEFORE_WEIGHT16;  | 
468  | 0  |     }  | 
469  |  |     // [p, s, t] is a root CE. Return the preceding weight for the requested level.  | 
470  | 0  |     uint32_t p = weight32FromNode(node);  | 
471  | 0  |     uint32_t weight16;  | 
472  | 0  |     if(level == UCOL_SECONDARY) { | 
473  | 0  |         weight16 = rootElements.getSecondaryBefore(p, s);  | 
474  | 0  |     } else { | 
475  | 0  |         weight16 = rootElements.getTertiaryBefore(p, s, t);  | 
476  | 0  |         U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0);  | 
477  | 0  |     }  | 
478  | 0  |     return weight16;  | 
479  | 0  | }  | 
480  |  |  | 
481  |  | int64_t  | 
482  |  | CollationBuilder::getSpecialResetPosition(const UnicodeString &str,  | 
483  | 0  |                                           const char *&parserErrorReason, UErrorCode &errorCode) { | 
484  | 0  |     U_ASSERT(str.length() == 2);  | 
485  | 0  |     int64_t ce;  | 
486  | 0  |     int32_t strength = UCOL_PRIMARY;  | 
487  | 0  |     UBool isBoundary = FALSE;  | 
488  | 0  |     UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE;  | 
489  | 0  |     U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING);  | 
490  | 0  |     switch(pos) { | 
491  | 0  |     case CollationRuleParser::FIRST_TERTIARY_IGNORABLE:  | 
492  |  |         // Quaternary CEs are not supported.  | 
493  |  |         // Non-zero quaternary weights are possible only on tertiary or stronger CEs.  | 
494  | 0  |         return 0;  | 
495  | 0  |     case CollationRuleParser::LAST_TERTIARY_IGNORABLE:  | 
496  | 0  |         return 0;  | 
497  | 0  |     case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: { | 
498  |  |         // Look for a tailored tertiary node after [0, 0, 0].  | 
499  | 0  |         int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode);  | 
500  | 0  |         if(U_FAILURE(errorCode)) { return 0; } | 
501  | 0  |         int64_t node = nodes.elementAti(index);  | 
502  | 0  |         if((index = nextIndexFromNode(node)) != 0) { | 
503  | 0  |             node = nodes.elementAti(index);  | 
504  | 0  |             U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY);  | 
505  | 0  |             if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) { | 
506  | 0  |                 return tempCEFromIndexAndStrength(index, UCOL_TERTIARY);  | 
507  | 0  |             }  | 
508  | 0  |         }  | 
509  | 0  |         return rootElements.getFirstTertiaryCE();  | 
510  |  |         // No need to look for nodeHasAnyBefore() on a tertiary node.  | 
511  | 0  |     }  | 
512  | 0  |     case CollationRuleParser::LAST_SECONDARY_IGNORABLE:  | 
513  | 0  |         ce = rootElements.getLastTertiaryCE();  | 
514  | 0  |         strength = UCOL_TERTIARY;  | 
515  | 0  |         break;  | 
516  | 0  |     case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: { | 
517  |  |         // Look for a tailored secondary node after [0, 0, *].  | 
518  | 0  |         int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode);  | 
519  | 0  |         if(U_FAILURE(errorCode)) { return 0; } | 
520  | 0  |         int64_t node = nodes.elementAti(index);  | 
521  | 0  |         while((index = nextIndexFromNode(node)) != 0) { | 
522  | 0  |             node = nodes.elementAti(index);  | 
523  | 0  |             strength = strengthFromNode(node);  | 
524  | 0  |             if(strength < UCOL_SECONDARY) { break; } | 
525  | 0  |             if(strength == UCOL_SECONDARY) { | 
526  | 0  |                 if(isTailoredNode(node)) { | 
527  | 0  |                     if(nodeHasBefore3(node)) { | 
528  | 0  |                         index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));  | 
529  | 0  |                         U_ASSERT(isTailoredNode(nodes.elementAti(index)));  | 
530  | 0  |                     }  | 
531  | 0  |                     return tempCEFromIndexAndStrength(index, UCOL_SECONDARY);  | 
532  | 0  |                 } else { | 
533  | 0  |                     break;  | 
534  | 0  |                 }  | 
535  | 0  |             }  | 
536  | 0  |         }  | 
537  | 0  |         ce = rootElements.getFirstSecondaryCE();  | 
538  | 0  |         strength = UCOL_SECONDARY;  | 
539  | 0  |         break;  | 
540  | 0  |     }  | 
541  | 0  |     case CollationRuleParser::LAST_PRIMARY_IGNORABLE:  | 
542  | 0  |         ce = rootElements.getLastSecondaryCE();  | 
543  | 0  |         strength = UCOL_SECONDARY;  | 
544  | 0  |         break;  | 
545  | 0  |     case CollationRuleParser::FIRST_VARIABLE:  | 
546  | 0  |         ce = rootElements.getFirstPrimaryCE();  | 
547  | 0  |         isBoundary = TRUE;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary  | 
548  | 0  |         break;  | 
549  | 0  |     case CollationRuleParser::LAST_VARIABLE:  | 
550  | 0  |         ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);  | 
551  | 0  |         break;  | 
552  | 0  |     case CollationRuleParser::FIRST_REGULAR:  | 
553  | 0  |         ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);  | 
554  | 0  |         isBoundary = TRUE;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary  | 
555  | 0  |         break;  | 
556  | 0  |     case CollationRuleParser::LAST_REGULAR:  | 
557  |  |         // Use the Hani-first-primary rather than the actual last "regular" CE before it,  | 
558  |  |         // for backward compatibility with behavior before the introduction of  | 
559  |  |         // script-first-primary CEs in the root collator.  | 
560  | 0  |         ce = rootElements.firstCEWithPrimaryAtLeast(  | 
561  | 0  |             baseData->getFirstPrimaryForGroup(USCRIPT_HAN));  | 
562  | 0  |         break;  | 
563  | 0  |     case CollationRuleParser::FIRST_IMPLICIT:  | 
564  | 0  |         ce = baseData->getSingleCE(0x4e00, errorCode);  | 
565  | 0  |         break;  | 
566  | 0  |     case CollationRuleParser::LAST_IMPLICIT:  | 
567  |  |         // We do not support tailoring to an unassigned-implicit CE.  | 
568  | 0  |         errorCode = U_UNSUPPORTED_ERROR;  | 
569  | 0  |         parserErrorReason = "reset to [last implicit] not supported";  | 
570  | 0  |         return 0;  | 
571  | 0  |     case CollationRuleParser::FIRST_TRAILING:  | 
572  | 0  |         ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);  | 
573  | 0  |         isBoundary = TRUE;  // trailing first primary (there is no mapping for it)  | 
574  | 0  |         break;  | 
575  | 0  |     case CollationRuleParser::LAST_TRAILING:  | 
576  | 0  |         errorCode = U_ILLEGAL_ARGUMENT_ERROR;  | 
577  | 0  |         parserErrorReason = "LDML forbids tailoring to U+FFFF";  | 
578  | 0  |         return 0;  | 
579  | 0  |     default:  | 
580  | 0  |         UPRV_UNREACHABLE;  | 
581  | 0  |     }  | 
582  |  |  | 
583  | 0  |     int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode);  | 
584  | 0  |     if(U_FAILURE(errorCode)) { return 0; } | 
585  | 0  |     int64_t node = nodes.elementAti(index);  | 
586  | 0  |     if((pos & 1) == 0) { | 
587  |  |         // even pos = [first xyz]  | 
588  | 0  |         if(!nodeHasAnyBefore(node) && isBoundary) { | 
589  |  |             // A <group> first primary boundary is artificially added to FractionalUCA.txt.  | 
590  |  |             // It is reachable via its special contraction, but is not normally used.  | 
591  |  |             // Find the first character tailored after the boundary CE,  | 
592  |  |             // or the first real root CE after it.  | 
593  | 0  |             if((index = nextIndexFromNode(node)) != 0) { | 
594  |  |                 // If there is a following node, then it must be tailored  | 
595  |  |                 // because there are no root CEs with a boundary primary  | 
596  |  |                 // and non-common secondary/tertiary weights.  | 
597  | 0  |                 node = nodes.elementAti(index);  | 
598  | 0  |                 U_ASSERT(isTailoredNode(node));  | 
599  | 0  |                 ce = tempCEFromIndexAndStrength(index, strength);  | 
600  | 0  |             } else { | 
601  | 0  |                 U_ASSERT(strength == UCOL_PRIMARY);  | 
602  | 0  |                 uint32_t p = (uint32_t)(ce >> 32);  | 
603  | 0  |                 int32_t pIndex = rootElements.findPrimary(p);  | 
604  | 0  |                 UBool isCompressible = baseData->isCompressiblePrimary(p);  | 
605  | 0  |                 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);  | 
606  | 0  |                 ce = Collation::makeCE(p);  | 
607  | 0  |                 index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode);  | 
608  | 0  |                 if(U_FAILURE(errorCode)) { return 0; } | 
609  | 0  |                 node = nodes.elementAti(index);  | 
610  | 0  |             }  | 
611  | 0  |         }  | 
612  | 0  |         if(nodeHasAnyBefore(node)) { | 
613  |  |             // Get the first node that was tailored before this one at a weaker strength.  | 
614  | 0  |             if(nodeHasBefore2(node)) { | 
615  | 0  |                 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));  | 
616  | 0  |                 node = nodes.elementAti(index);  | 
617  | 0  |             }  | 
618  | 0  |             if(nodeHasBefore3(node)) { | 
619  | 0  |                 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));  | 
620  | 0  |             }  | 
621  | 0  |             U_ASSERT(isTailoredNode(nodes.elementAti(index)));  | 
622  | 0  |             ce = tempCEFromIndexAndStrength(index, strength);  | 
623  | 0  |         }  | 
624  | 0  |     } else { | 
625  |  |         // odd pos = [last xyz]  | 
626  |  |         // Find the last node that was tailored after the [last xyz]  | 
627  |  |         // at a strength no greater than the position's strength.  | 
628  | 0  |         for(;;) { | 
629  | 0  |             int32_t nextIndex = nextIndexFromNode(node);  | 
630  | 0  |             if(nextIndex == 0) { break; } | 
631  | 0  |             int64_t nextNode = nodes.elementAti(nextIndex);  | 
632  | 0  |             if(strengthFromNode(nextNode) < strength) { break; } | 
633  | 0  |             index = nextIndex;  | 
634  | 0  |             node = nextNode;  | 
635  | 0  |         }  | 
636  |  |         // Do not make a temporary CE for a root node.  | 
637  |  |         // This last node might be the node for the root CE itself,  | 
638  |  |         // or a node with a common secondary or tertiary weight.  | 
639  | 0  |         if(isTailoredNode(node)) { | 
640  | 0  |             ce = tempCEFromIndexAndStrength(index, strength);  | 
641  | 0  |         }  | 
642  | 0  |     }  | 
643  | 0  |     return ce;  | 
644  | 0  | }  | 
645  |  |  | 
646  |  | void  | 
647  |  | CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,  | 
648  |  |                               const UnicodeString &str, const UnicodeString &extension,  | 
649  | 0  |                               const char *&parserErrorReason, UErrorCode &errorCode) { | 
650  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
651  | 0  |     UnicodeString nfdPrefix;  | 
652  | 0  |     if(!prefix.isEmpty()) { | 
653  | 0  |         nfd.normalize(prefix, nfdPrefix, errorCode);  | 
654  | 0  |         if(U_FAILURE(errorCode)) { | 
655  | 0  |             parserErrorReason = "normalizing the relation prefix";  | 
656  | 0  |             return;  | 
657  | 0  |         }  | 
658  | 0  |     }  | 
659  | 0  |     UnicodeString nfdString = nfd.normalize(str, errorCode);  | 
660  | 0  |     if(U_FAILURE(errorCode)) { | 
661  | 0  |         parserErrorReason = "normalizing the relation string";  | 
662  | 0  |         return;  | 
663  | 0  |     }  | 
664  |  |  | 
665  |  |     // The runtime code decomposes Hangul syllables on the fly,  | 
666  |  |     // with recursive processing but without making the Jamo pieces visible for matching.  | 
667  |  |     // It does not work with certain types of contextual mappings.  | 
668  | 0  |     int32_t nfdLength = nfdString.length();  | 
669  | 0  |     if(nfdLength >= 2) { | 
670  | 0  |         UChar c = nfdString.charAt(0);  | 
671  | 0  |         if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) { | 
672  |  |             // While handling a Hangul syllable, contractions starting with Jamo L or V  | 
673  |  |             // would not see the following Jamo of that syllable.  | 
674  | 0  |             errorCode = U_UNSUPPORTED_ERROR;  | 
675  | 0  |             parserErrorReason = "contractions starting with conjoining Jamo L or V not supported";  | 
676  | 0  |             return;  | 
677  | 0  |         }  | 
678  | 0  |         c = nfdString.charAt(nfdLength - 1);  | 
679  | 0  |         if(Hangul::isJamoL(c) ||  | 
680  | 0  |                 (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) { | 
681  |  |             // A contraction ending with Jamo L or L+V would require  | 
682  |  |             // generating Hangul syllables in addTailComposites() (588 for a Jamo L),  | 
683  |  |             // or decomposing a following Hangul syllable on the fly, during contraction matching.  | 
684  | 0  |             errorCode = U_UNSUPPORTED_ERROR;  | 
685  | 0  |             parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported";  | 
686  | 0  |             return;  | 
687  | 0  |         }  | 
688  |  |         // A Hangul syllable completely inside a contraction is ok.  | 
689  | 0  |     }  | 
690  |  |     // Note: If there is a prefix, then the parser checked that  | 
691  |  |     // both the prefix and the string begin with NFC boundaries (not Jamo V or T).  | 
692  |  |     // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))  | 
693  |  |     // (While handling a Hangul syllable, prefixes on Jamo V or T  | 
694  |  |     // would not see the previous Jamo of that syllable.)  | 
695  |  |  | 
696  | 0  |     if(strength != UCOL_IDENTICAL) { | 
697  |  |         // Find the node index after which we insert the new tailored node.  | 
698  | 0  |         int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);  | 
699  | 0  |         U_ASSERT(cesLength > 0);  | 
700  | 0  |         int64_t ce = ces[cesLength - 1];  | 
701  | 0  |         if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) { | 
702  |  |             // There is no primary gap between ignorables and the space-first-primary.  | 
703  | 0  |             errorCode = U_UNSUPPORTED_ERROR;  | 
704  | 0  |             parserErrorReason = "tailoring primary after ignorables not supported";  | 
705  | 0  |             return;  | 
706  | 0  |         }  | 
707  | 0  |         if(strength == UCOL_QUATERNARY && ce == 0) { | 
708  |  |             // The CE data structure does not support non-zero quaternary weights  | 
709  |  |             // on tertiary ignorables.  | 
710  | 0  |             errorCode = U_UNSUPPORTED_ERROR;  | 
711  | 0  |             parserErrorReason = "tailoring quaternary after tertiary ignorables not supported";  | 
712  | 0  |             return;  | 
713  | 0  |         }  | 
714  |  |         // Insert the new tailored node.  | 
715  | 0  |         index = insertTailoredNodeAfter(index, strength, errorCode);  | 
716  | 0  |         if(U_FAILURE(errorCode)) { | 
717  | 0  |             parserErrorReason = "modifying collation elements";  | 
718  | 0  |             return;  | 
719  | 0  |         }  | 
720  |  |         // Strength of the temporary CE:  | 
721  |  |         // The new relation may yield a stronger CE but not a weaker one.  | 
722  | 0  |         int32_t tempStrength = ceStrength(ce);  | 
723  | 0  |         if(strength < tempStrength) { tempStrength = strength; } | 
724  | 0  |         ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);  | 
725  | 0  |     }  | 
726  |  |  | 
727  | 0  |     setCaseBits(nfdString, parserErrorReason, errorCode);  | 
728  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
729  |  |  | 
730  | 0  |     int32_t cesLengthBeforeExtension = cesLength;  | 
731  | 0  |     if(!extension.isEmpty()) { | 
732  | 0  |         UnicodeString nfdExtension = nfd.normalize(extension, errorCode);  | 
733  | 0  |         if(U_FAILURE(errorCode)) { | 
734  | 0  |             parserErrorReason = "normalizing the relation extension";  | 
735  | 0  |             return;  | 
736  | 0  |         }  | 
737  | 0  |         cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength);  | 
738  | 0  |         if(cesLength > Collation::MAX_EXPANSION_LENGTH) { | 
739  | 0  |             errorCode = U_ILLEGAL_ARGUMENT_ERROR;  | 
740  | 0  |             parserErrorReason =  | 
741  | 0  |                 "extension string adds too many collation elements (more than 31 total)";  | 
742  | 0  |             return;  | 
743  | 0  |         }  | 
744  | 0  |     }  | 
745  | 0  |     uint32_t ce32 = Collation::UNASSIGNED_CE32;  | 
746  | 0  |     if((prefix != nfdPrefix || str != nfdString) &&  | 
747  | 0  |             !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) { | 
748  |  |         // Map from the original input to the CEs.  | 
749  |  |         // We do this in case the canonical closure is incomplete,  | 
750  |  |         // so that it is possible to explicitly provide the missing mappings.  | 
751  | 0  |         ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);  | 
752  | 0  |     }  | 
753  | 0  |     addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);  | 
754  | 0  |     if(U_FAILURE(errorCode)) { | 
755  | 0  |         parserErrorReason = "writing collation elements";  | 
756  | 0  |         return;  | 
757  | 0  |     }  | 
758  | 0  |     cesLength = cesLengthBeforeExtension;  | 
759  | 0  | }  | 
760  |  |  | 
761  |  | int32_t  | 
762  |  | CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,  | 
763  | 0  |                                          UErrorCode &errorCode) { | 
764  | 0  |     if(U_FAILURE(errorCode)) { return 0; } | 
765  | 0  |     U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY);  | 
766  |  |  | 
767  |  |     // Find the last CE that is at least as "strong" as the requested difference.  | 
768  |  |     // Note: Stronger is smaller (UCOL_PRIMARY=0).  | 
769  | 0  |     int64_t ce;  | 
770  | 0  |     for(;; --cesLength) { | 
771  | 0  |         if(cesLength == 0) { | 
772  | 0  |             ce = ces[0] = 0;  | 
773  | 0  |             cesLength = 1;  | 
774  | 0  |             break;  | 
775  | 0  |         } else { | 
776  | 0  |             ce = ces[cesLength - 1];  | 
777  | 0  |         }  | 
778  | 0  |         if(ceStrength(ce) <= strength) { break; } | 
779  | 0  |     }  | 
780  |  | 
  | 
781  | 0  |     if(isTempCE(ce)) { | 
782  |  |         // No need to findCommonNode() here for lower levels  | 
783  |  |         // because insertTailoredNodeAfter() will do that anyway.  | 
784  | 0  |         return indexFromTempCE(ce);  | 
785  | 0  |     }  | 
786  |  |  | 
787  |  |     // root CE  | 
788  | 0  |     if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) { | 
789  | 0  |         errorCode = U_UNSUPPORTED_ERROR;  | 
790  | 0  |         parserErrorReason = "tailoring relative to an unassigned code point not supported";  | 
791  | 0  |         return 0;  | 
792  | 0  |     }  | 
793  | 0  |     return findOrInsertNodeForRootCE(ce, strength, errorCode);  | 
794  | 0  | }  | 
795  |  |  | 
796  |  | int32_t  | 
797  | 0  | CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) { | 
798  | 0  |     if(U_FAILURE(errorCode)) { return 0; } | 
799  | 0  |     U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE);  | 
800  |  |  | 
801  |  |     // Find or insert the node for each of the root CE's weights,  | 
802  |  |     // down to the requested level/strength.  | 
803  |  |     // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).  | 
804  | 0  |     U_ASSERT((ce & 0xc0) == 0);  | 
805  | 0  |     int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32), errorCode);  | 
806  | 0  |     if(strength >= UCOL_SECONDARY) { | 
807  | 0  |         uint32_t lower32 = (uint32_t)ce;  | 
808  | 0  |         index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);  | 
809  | 0  |         if(strength >= UCOL_TERTIARY) { | 
810  | 0  |             index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK,  | 
811  | 0  |                                          UCOL_TERTIARY, errorCode);  | 
812  | 0  |         }  | 
813  | 0  |     }  | 
814  | 0  |     return index;  | 
815  | 0  | }  | 
816  |  |  | 
817  |  | namespace { | 
818  |  |  | 
819  |  | /**  | 
820  |  |  * Like Java Collections.binarySearch(List, key, Comparator).  | 
821  |  |  *  | 
822  |  |  * @return the index>=0 where the item was found,  | 
823  |  |  *         or the index<0 for inserting the string at ~index in sorted order  | 
824  |  |  *         (index into rootPrimaryIndexes)  | 
825  |  |  */  | 
826  |  | int32_t  | 
827  |  | binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,  | 
828  | 0  |                                const int64_t *nodes, uint32_t p) { | 
829  | 0  |     if(length == 0) { return ~0; } | 
830  | 0  |     int32_t start = 0;  | 
831  | 0  |     int32_t limit = length;  | 
832  | 0  |     for (;;) { | 
833  | 0  |         int32_t i = (start + limit) / 2;  | 
834  | 0  |         int64_t node = nodes[rootPrimaryIndexes[i]];  | 
835  | 0  |         uint32_t nodePrimary = (uint32_t)(node >> 32);  // weight32FromNode(node)  | 
836  | 0  |         if (p == nodePrimary) { | 
837  | 0  |             return i;  | 
838  | 0  |         } else if (p < nodePrimary) { | 
839  | 0  |             if (i == start) { | 
840  | 0  |                 return ~start;  // insert s before i  | 
841  | 0  |             }  | 
842  | 0  |             limit = i;  | 
843  | 0  |         } else { | 
844  | 0  |             if (i == start) { | 
845  | 0  |                 return ~(start + 1);  // insert s after i  | 
846  | 0  |             }  | 
847  | 0  |             start = i;  | 
848  | 0  |         }  | 
849  | 0  |     }  | 
850  | 0  | }  | 
851  |  |  | 
852  |  | }  // namespace  | 
853  |  |  | 
854  |  | int32_t  | 
855  | 0  | CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) { | 
856  | 0  |     if(U_FAILURE(errorCode)) { return 0; } | 
857  |  |  | 
858  | 0  |     int32_t rootIndex = binarySearchForRootPrimaryNode(  | 
859  | 0  |         rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);  | 
860  | 0  |     if(rootIndex >= 0) { | 
861  | 0  |         return rootPrimaryIndexes.elementAti(rootIndex);  | 
862  | 0  |     } else { | 
863  |  |         // Start a new list of nodes with this primary.  | 
864  | 0  |         int32_t index = nodes.size();  | 
865  | 0  |         nodes.addElement(nodeFromWeight32(p), errorCode);  | 
866  | 0  |         rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode);  | 
867  | 0  |         return index;  | 
868  | 0  |     }  | 
869  | 0  | }  | 
870  |  |  | 
871  |  | int32_t  | 
872  | 0  | CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) { | 
873  | 0  |     if(U_FAILURE(errorCode)) { return 0; } | 
874  | 0  |     U_ASSERT(0 <= index && index < nodes.size());  | 
875  | 0  |     U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY);  | 
876  |  | 
  | 
877  | 0  |     if(weight16 == Collation::COMMON_WEIGHT16) { | 
878  | 0  |         return findCommonNode(index, level);  | 
879  | 0  |     }  | 
880  |  |  | 
881  |  |     // If this will be the first below-common weight for the parent node,  | 
882  |  |     // then we will also need to insert a common weight after it.  | 
883  | 0  |     int64_t node = nodes.elementAti(index);  | 
884  | 0  |     U_ASSERT(strengthFromNode(node) < level);  // parent node is stronger  | 
885  | 0  |     if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) { | 
886  | 0  |         int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;  | 
887  | 0  |         if((node & hasThisLevelBefore) == 0) { | 
888  |  |             // The parent node has an implied level-common weight.  | 
889  | 0  |             int64_t commonNode =  | 
890  | 0  |                 nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level);  | 
891  | 0  |             if(level == UCOL_SECONDARY) { | 
892  |  |                 // Move the HAS_BEFORE3 flag from the parent node  | 
893  |  |                 // to the new secondary common node.  | 
894  | 0  |                 commonNode |= node & HAS_BEFORE3;  | 
895  | 0  |                 node &= ~(int64_t)HAS_BEFORE3;  | 
896  | 0  |             }  | 
897  | 0  |             nodes.setElementAt(node | hasThisLevelBefore, index);  | 
898  |  |             // Insert below-common-weight node.  | 
899  | 0  |             int32_t nextIndex = nextIndexFromNode(node);  | 
900  | 0  |             node = nodeFromWeight16(weight16) | nodeFromStrength(level);  | 
901  | 0  |             index = insertNodeBetween(index, nextIndex, node, errorCode);  | 
902  |  |             // Insert common-weight node.  | 
903  | 0  |             insertNodeBetween(index, nextIndex, commonNode, errorCode);  | 
904  |  |             // Return index of below-common-weight node.  | 
905  | 0  |             return index;  | 
906  | 0  |         }  | 
907  | 0  |     }  | 
908  |  |  | 
909  |  |     // Find the root CE's weight for this level.  | 
910  |  |     // Postpone insertion if not found:  | 
911  |  |     // Insert the new root node before the next stronger node,  | 
912  |  |     // or before the next root node with the same strength and a larger weight.  | 
913  | 0  |     int32_t nextIndex;  | 
914  | 0  |     while((nextIndex = nextIndexFromNode(node)) != 0) { | 
915  | 0  |         node = nodes.elementAti(nextIndex);  | 
916  | 0  |         int32_t nextStrength = strengthFromNode(node);  | 
917  | 0  |         if(nextStrength <= level) { | 
918  |  |             // Insert before a stronger node.  | 
919  | 0  |             if(nextStrength < level) { break; } | 
920  |  |             // nextStrength == level  | 
921  | 0  |             if(!isTailoredNode(node)) { | 
922  | 0  |                 uint32_t nextWeight16 = weight16FromNode(node);  | 
923  | 0  |                 if(nextWeight16 == weight16) { | 
924  |  |                     // Found the node for the root CE up to this level.  | 
925  | 0  |                     return nextIndex;  | 
926  | 0  |                 }  | 
927  |  |                 // Insert before a node with a larger same-strength weight.  | 
928  | 0  |                 if(nextWeight16 > weight16) { break; } | 
929  | 0  |             }  | 
930  | 0  |         }  | 
931  |  |         // Skip the next node.  | 
932  | 0  |         index = nextIndex;  | 
933  | 0  |     }  | 
934  | 0  |     node = nodeFromWeight16(weight16) | nodeFromStrength(level);  | 
935  | 0  |     return insertNodeBetween(index, nextIndex, node, errorCode);  | 
936  | 0  | }  | 
937  |  |  | 
938  |  | int32_t  | 
939  | 0  | CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) { | 
940  | 0  |     if(U_FAILURE(errorCode)) { return 0; } | 
941  | 0  |     U_ASSERT(0 <= index && index < nodes.size());  | 
942  | 0  |     if(strength >= UCOL_SECONDARY) { | 
943  | 0  |         index = findCommonNode(index, UCOL_SECONDARY);  | 
944  | 0  |         if(strength >= UCOL_TERTIARY) { | 
945  | 0  |             index = findCommonNode(index, UCOL_TERTIARY);  | 
946  | 0  |         }  | 
947  | 0  |     }  | 
948  |  |     // Postpone insertion:  | 
949  |  |     // Insert the new node before the next one with a strength at least as strong.  | 
950  | 0  |     int64_t node = nodes.elementAti(index);  | 
951  | 0  |     int32_t nextIndex;  | 
952  | 0  |     while((nextIndex = nextIndexFromNode(node)) != 0) { | 
953  | 0  |         node = nodes.elementAti(nextIndex);  | 
954  | 0  |         if(strengthFromNode(node) <= strength) { break; } | 
955  |  |         // Skip the next node which has a weaker (larger) strength than the new one.  | 
956  | 0  |         index = nextIndex;  | 
957  | 0  |     }  | 
958  | 0  |     node = IS_TAILORED | nodeFromStrength(strength);  | 
959  | 0  |     return insertNodeBetween(index, nextIndex, node, errorCode);  | 
960  | 0  | }  | 
961  |  |  | 
962  |  | int32_t  | 
963  |  | CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,  | 
964  | 0  |                                     UErrorCode &errorCode) { | 
965  | 0  |     if(U_FAILURE(errorCode)) { return 0; } | 
966  | 0  |     U_ASSERT(previousIndexFromNode(node) == 0);  | 
967  | 0  |     U_ASSERT(nextIndexFromNode(node) == 0);  | 
968  | 0  |     U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);  | 
969  |  |     // Append the new node and link it to the existing nodes.  | 
970  | 0  |     int32_t newIndex = nodes.size();  | 
971  | 0  |     node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);  | 
972  | 0  |     nodes.addElement(node, errorCode);  | 
973  | 0  |     if(U_FAILURE(errorCode)) { return 0; } | 
974  |  |     // nodes[index].nextIndex = newIndex  | 
975  | 0  |     node = nodes.elementAti(index);  | 
976  | 0  |     nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);  | 
977  |  |     // nodes[nextIndex].previousIndex = newIndex  | 
978  | 0  |     if(nextIndex != 0) { | 
979  | 0  |         node = nodes.elementAti(nextIndex);  | 
980  | 0  |         nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);  | 
981  | 0  |     }  | 
982  | 0  |     return newIndex;  | 
983  | 0  | }  | 
984  |  |  | 
985  |  | int32_t  | 
986  | 0  | CollationBuilder::findCommonNode(int32_t index, int32_t strength) const { | 
987  | 0  |     U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY);  | 
988  | 0  |     int64_t node = nodes.elementAti(index);  | 
989  | 0  |     if(strengthFromNode(node) >= strength) { | 
990  |  |         // The current node is no stronger.  | 
991  | 0  |         return index;  | 
992  | 0  |     }  | 
993  | 0  |     if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { | 
994  |  |         // The current node implies the strength-common weight.  | 
995  | 0  |         return index;  | 
996  | 0  |     }  | 
997  | 0  |     index = nextIndexFromNode(node);  | 
998  | 0  |     node = nodes.elementAti(index);  | 
999  | 0  |     U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&  | 
1000  | 0  |             weight16FromNode(node) < Collation::COMMON_WEIGHT16);  | 
1001  |  |     // Skip to the explicit common node.  | 
1002  | 0  |     do { | 
1003  | 0  |         index = nextIndexFromNode(node);  | 
1004  | 0  |         node = nodes.elementAti(index);  | 
1005  | 0  |         U_ASSERT(strengthFromNode(node) >= strength);  | 
1006  | 0  |     } while(isTailoredNode(node) || strengthFromNode(node) > strength ||  | 
1007  | 0  |             weight16FromNode(node) < Collation::COMMON_WEIGHT16);  | 
1008  | 0  |     U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);  | 
1009  | 0  |     return index;  | 
1010  | 0  | }  | 
1011  |  |  | 
1012  |  | void  | 
1013  |  | CollationBuilder::setCaseBits(const UnicodeString &nfdString,  | 
1014  | 0  |                               const char *&parserErrorReason, UErrorCode &errorCode) { | 
1015  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
1016  | 0  |     int32_t numTailoredPrimaries = 0;  | 
1017  | 0  |     for(int32_t i = 0; i < cesLength; ++i) { | 
1018  | 0  |         if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; } | 
1019  | 0  |     }  | 
1020  |  |     // We should not be able to get too many case bits because  | 
1021  |  |     // cesLength<=31==MAX_EXPANSION_LENGTH.  | 
1022  |  |     // 31 pairs of case bits fit into an int64_t without setting its sign bit.  | 
1023  | 0  |     U_ASSERT(numTailoredPrimaries <= 31);  | 
1024  |  | 
  | 
1025  | 0  |     int64_t cases = 0;  | 
1026  | 0  |     if(numTailoredPrimaries > 0) { | 
1027  | 0  |         const UChar *s = nfdString.getBuffer();  | 
1028  | 0  |         UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length());  | 
1029  | 0  |         int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1;  | 
1030  | 0  |         if(U_FAILURE(errorCode)) { | 
1031  | 0  |             parserErrorReason = "fetching root CEs for tailored string";  | 
1032  | 0  |             return;  | 
1033  | 0  |         }  | 
1034  | 0  |         U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE);  | 
1035  |  | 
  | 
1036  | 0  |         uint32_t lastCase = 0;  | 
1037  | 0  |         int32_t numBasePrimaries = 0;  | 
1038  | 0  |         for(int32_t i = 0; i < baseCEsLength; ++i) { | 
1039  | 0  |             int64_t ce = baseCEs.getCE(i);  | 
1040  | 0  |             if((ce >> 32) != 0) { | 
1041  | 0  |                 ++numBasePrimaries;  | 
1042  | 0  |                 uint32_t c = ((uint32_t)ce >> 14) & 3;  | 
1043  | 0  |                 U_ASSERT(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE  | 
1044  | 0  |                 if(numBasePrimaries < numTailoredPrimaries) { | 
1045  | 0  |                     cases |= (int64_t)c << ((numBasePrimaries - 1) * 2);  | 
1046  | 0  |                 } else if(numBasePrimaries == numTailoredPrimaries) { | 
1047  | 0  |                     lastCase = c;  | 
1048  | 0  |                 } else if(c != lastCase) { | 
1049  |  |                     // There are more base primary CEs than tailored primaries.  | 
1050  |  |                     // Set mixed case if the case bits of the remainder differ.  | 
1051  | 0  |                     lastCase = 1;  | 
1052  |  |                     // Nothing more can change.  | 
1053  | 0  |                     break;  | 
1054  | 0  |                 }  | 
1055  | 0  |             }  | 
1056  | 0  |         }  | 
1057  | 0  |         if(numBasePrimaries >= numTailoredPrimaries) { | 
1058  | 0  |             cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2);  | 
1059  | 0  |         }  | 
1060  | 0  |     }  | 
1061  |  |  | 
1062  | 0  |     for(int32_t i = 0; i < cesLength; ++i) { | 
1063  | 0  |         int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff);  // clear old case bits  | 
1064  | 0  |         int32_t strength = ceStrength(ce);  | 
1065  | 0  |         if(strength == UCOL_PRIMARY) { | 
1066  | 0  |             ce |= (cases & 3) << 14;  | 
1067  | 0  |             cases >>= 2;  | 
1068  | 0  |         } else if(strength == UCOL_TERTIARY) { | 
1069  |  |             // Tertiary CEs must have uppercase bits.  | 
1070  |  |             // See the LDML spec, and comments in class CollationCompare.  | 
1071  | 0  |             ce |= 0x8000;  | 
1072  | 0  |         }  | 
1073  |  |         // Tertiary ignorable CEs must have 0 case bits.  | 
1074  |  |         // We set 0 case bits for secondary CEs too  | 
1075  |  |         // since currently only U+0345 is cased and maps to a secondary CE,  | 
1076  |  |         // and it is lowercase. Other secondaries are uncased.  | 
1077  |  |         // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.  | 
1078  | 0  |         ces[i] = ce;  | 
1079  | 0  |     }  | 
1080  | 0  | }  | 
1081  |  |  | 
1082  |  | void  | 
1083  |  | CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,  | 
1084  | 0  |                                        UErrorCode &errorCode) { | 
1085  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
1086  | 0  |     dataBuilder->suppressContractions(set, errorCode);  | 
1087  | 0  |     if(U_FAILURE(errorCode)) { | 
1088  | 0  |         parserErrorReason = "application of [suppressContractions [set]] failed";  | 
1089  | 0  |     }  | 
1090  | 0  | }  | 
1091  |  |  | 
1092  |  | void  | 
1093  |  | CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */,  | 
1094  | 0  |                            UErrorCode &errorCode) { | 
1095  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
1096  | 0  |     optimizeSet.addAll(set);  | 
1097  | 0  | }  | 
1098  |  |  | 
1099  |  | uint32_t  | 
1100  |  | CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,  | 
1101  |  |                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,  | 
1102  | 0  |                                  UErrorCode &errorCode) { | 
1103  |  |     // Map from the NFD input to the CEs.  | 
1104  | 0  |     ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);  | 
1105  | 0  |     ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);  | 
1106  | 0  |     addTailComposites(nfdPrefix, nfdString, errorCode);  | 
1107  | 0  |     return ce32;  | 
1108  | 0  | }  | 
1109  |  |  | 
1110  |  | uint32_t  | 
1111  |  | CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,  | 
1112  |  |                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,  | 
1113  | 0  |                                  UErrorCode &errorCode) { | 
1114  | 0  |     if(U_FAILURE(errorCode)) { return ce32; } | 
1115  |  |  | 
1116  |  |     // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)  | 
1117  | 0  |     if(nfdPrefix.isEmpty()) { | 
1118  | 0  |         CanonicalIterator stringIter(nfdString, errorCode);  | 
1119  | 0  |         if(U_FAILURE(errorCode)) { return ce32; } | 
1120  | 0  |         UnicodeString prefix;  | 
1121  | 0  |         for(;;) { | 
1122  | 0  |             UnicodeString str = stringIter.next();  | 
1123  | 0  |             if(str.isBogus()) { break; } | 
1124  | 0  |             if(ignoreString(str, errorCode) || str == nfdString) { continue; } | 
1125  | 0  |             ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);  | 
1126  | 0  |             if(U_FAILURE(errorCode)) { return ce32; } | 
1127  | 0  |         }  | 
1128  | 0  |     } else { | 
1129  | 0  |         CanonicalIterator prefixIter(nfdPrefix, errorCode);  | 
1130  | 0  |         CanonicalIterator stringIter(nfdString, errorCode);  | 
1131  | 0  |         if(U_FAILURE(errorCode)) { return ce32; } | 
1132  | 0  |         for(;;) { | 
1133  | 0  |             UnicodeString prefix = prefixIter.next();  | 
1134  | 0  |             if(prefix.isBogus()) { break; } | 
1135  | 0  |             if(ignorePrefix(prefix, errorCode)) { continue; } | 
1136  | 0  |             UBool samePrefix = prefix == nfdPrefix;  | 
1137  | 0  |             for(;;) { | 
1138  | 0  |                 UnicodeString str = stringIter.next();  | 
1139  | 0  |                 if(str.isBogus()) { break; } | 
1140  | 0  |                 if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; } | 
1141  | 0  |                 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);  | 
1142  | 0  |                 if(U_FAILURE(errorCode)) { return ce32; } | 
1143  | 0  |             }  | 
1144  | 0  |             stringIter.reset();  | 
1145  | 0  |         }  | 
1146  | 0  |     }  | 
1147  | 0  |     return ce32;  | 
1148  | 0  | }  | 
1149  |  |  | 
1150  |  | void  | 
1151  |  | CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,  | 
1152  | 0  |                                     UErrorCode &errorCode) { | 
1153  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
1154  |  |  | 
1155  |  |     // Look for the last starter in the NFD string.  | 
1156  | 0  |     UChar32 lastStarter;  | 
1157  | 0  |     int32_t indexAfterLastStarter = nfdString.length();  | 
1158  | 0  |     for(;;) { | 
1159  | 0  |         if(indexAfterLastStarter == 0) { return; }  // no starter at all | 
1160  | 0  |         lastStarter = nfdString.char32At(indexAfterLastStarter - 1);  | 
1161  | 0  |         if(nfd.getCombiningClass(lastStarter) == 0) { break; } | 
1162  | 0  |         indexAfterLastStarter -= U16_LENGTH(lastStarter);  | 
1163  | 0  |     }  | 
1164  |  |     // No closure to Hangul syllables since we decompose them on the fly.  | 
1165  | 0  |     if(Hangul::isJamoL(lastStarter)) { return; } | 
1166  |  |  | 
1167  |  |     // Are there any composites whose decomposition starts with the lastStarter?  | 
1168  |  |     // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.  | 
1169  |  |     // We might find some more equivalent mappings here if it did.  | 
1170  | 0  |     UnicodeSet composites;  | 
1171  | 0  |     if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } | 
1172  |  |  | 
1173  | 0  |     UnicodeString decomp;  | 
1174  | 0  |     UnicodeString newNFDString, newString;  | 
1175  | 0  |     int64_t newCEs[Collation::MAX_EXPANSION_LENGTH];  | 
1176  | 0  |     UnicodeSetIterator iter(composites);  | 
1177  | 0  |     while(iter.next()) { | 
1178  | 0  |         U_ASSERT(!iter.isString());  | 
1179  | 0  |         UChar32 composite = iter.getCodepoint();  | 
1180  | 0  |         nfd.getDecomposition(composite, decomp);  | 
1181  | 0  |         if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,  | 
1182  | 0  |                                      newNFDString, newString, errorCode)) { | 
1183  | 0  |             continue;  | 
1184  | 0  |         }  | 
1185  | 0  |         int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0);  | 
1186  | 0  |         if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) { | 
1187  |  |             // Ignore mappings that we cannot store.  | 
1188  | 0  |             continue;  | 
1189  | 0  |         }  | 
1190  |  |         // Note: It is possible that the newCEs do not make use of the mapping  | 
1191  |  |         // for which we are adding the tail composites, in which case we might be adding  | 
1192  |  |         // unnecessary mappings.  | 
1193  |  |         // For example, when we add tail composites for ae^ (^=combining circumflex),  | 
1194  |  |         // UCA discontiguous-contraction matching does not find any matches  | 
1195  |  |         // for ae_^ (_=any combining diacritic below) *unless* there is also  | 
1196  |  |         // a contraction mapping for ae.  | 
1197  |  |         // Thus, if there is no ae contraction, then the ae^ mapping is ignored  | 
1198  |  |         // while fetching the newCEs for ae_^.  | 
1199  |  |         // TODO: Try to detect this effectively.  | 
1200  |  |         // (Alternatively, print a warning when prefix contractions are missing.)  | 
1201  |  |  | 
1202  |  |         // We do not need an explicit mapping for the NFD strings.  | 
1203  |  |         // It is fine if the NFD input collates like this via a sequence of mappings.  | 
1204  |  |         // It also saves a little bit of space, and may reduce the set of characters with contractions.  | 
1205  | 0  |         uint32_t ce32 = addIfDifferent(nfdPrefix, newString,  | 
1206  | 0  |                                        newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode);  | 
1207  | 0  |         if(ce32 != Collation::UNASSIGNED_CE32) { | 
1208  |  |             // was different, was added  | 
1209  | 0  |             addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode);  | 
1210  | 0  |         }  | 
1211  | 0  |     }  | 
1212  | 0  | }  | 
1213  |  |  | 
1214  |  | UBool  | 
1215  |  | CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString,  | 
1216  |  |                                            int32_t indexAfterLastStarter,  | 
1217  |  |                                            UChar32 composite, const UnicodeString &decomp,  | 
1218  |  |                                            UnicodeString &newNFDString, UnicodeString &newString,  | 
1219  | 0  |                                            UErrorCode &errorCode) const { | 
1220  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
1221  | 0  |     U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0));  | 
1222  | 0  |     int32_t lastStarterLength = decomp.moveIndex32(0, 1);  | 
1223  | 0  |     if(lastStarterLength == decomp.length()) { | 
1224  |  |         // Singleton decompositions should be found by addWithClosure()  | 
1225  |  |         // and the CanonicalIterator, so we can ignore them here.  | 
1226  | 0  |         return FALSE;  | 
1227  | 0  |     }  | 
1228  | 0  |     if(nfdString.compare(indexAfterLastStarter, 0x7fffffff,  | 
1229  | 0  |                          decomp, lastStarterLength, 0x7fffffff) == 0) { | 
1230  |  |         // same strings, nothing new to be found here  | 
1231  | 0  |         return FALSE;  | 
1232  | 0  |     }  | 
1233  |  |  | 
1234  |  |     // Make new FCD strings that combine a composite, or its decomposition,  | 
1235  |  |     // into the nfdString's last starter and the combining marks following it.  | 
1236  |  |     // Make an NFD version, and a version with the composite.  | 
1237  | 0  |     newNFDString.setTo(nfdString, 0, indexAfterLastStarter);  | 
1238  | 0  |     newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite);  | 
1239  |  |  | 
1240  |  |     // The following is related to discontiguous contraction matching,  | 
1241  |  |     // but builds only FCD strings (or else returns FALSE).  | 
1242  | 0  |     int32_t sourceIndex = indexAfterLastStarter;  | 
1243  | 0  |     int32_t decompIndex = lastStarterLength;  | 
1244  |  |     // Small optimization: We keep the source character across loop iterations  | 
1245  |  |     // because we do not always consume it,  | 
1246  |  |     // and then need not fetch it again nor look up its combining class again.  | 
1247  | 0  |     UChar32 sourceChar = U_SENTINEL;  | 
1248  |  |     // The cc variables need to be declared before the loop so that at the end  | 
1249  |  |     // they are set to the last combining classes seen.  | 
1250  | 0  |     uint8_t sourceCC = 0;  | 
1251  | 0  |     uint8_t decompCC = 0;  | 
1252  | 0  |     for(;;) { | 
1253  | 0  |         if(sourceChar < 0) { | 
1254  | 0  |             if(sourceIndex >= nfdString.length()) { break; } | 
1255  | 0  |             sourceChar = nfdString.char32At(sourceIndex);  | 
1256  | 0  |             sourceCC = nfd.getCombiningClass(sourceChar);  | 
1257  | 0  |             U_ASSERT(sourceCC != 0);  | 
1258  | 0  |         }  | 
1259  |  |         // We consume a decomposition character in each iteration.  | 
1260  | 0  |         if(decompIndex >= decomp.length()) { break; } | 
1261  | 0  |         UChar32 decompChar = decomp.char32At(decompIndex);  | 
1262  | 0  |         decompCC = nfd.getCombiningClass(decompChar);  | 
1263  |  |         // Compare the two characters and their combining classes.  | 
1264  | 0  |         if(decompCC == 0) { | 
1265  |  |             // Unable to merge because the source contains a non-zero combining mark  | 
1266  |  |             // but the composite's decomposition contains another starter.  | 
1267  |  |             // The strings would not be equivalent.  | 
1268  | 0  |             return FALSE;  | 
1269  | 0  |         } else if(sourceCC < decompCC) { | 
1270  |  |             // Composite + sourceChar would not be FCD.  | 
1271  | 0  |             return FALSE;  | 
1272  | 0  |         } else if(decompCC < sourceCC) { | 
1273  | 0  |             newNFDString.append(decompChar);  | 
1274  | 0  |             decompIndex += U16_LENGTH(decompChar);  | 
1275  | 0  |         } else if(decompChar != sourceChar) { | 
1276  |  |             // Blocked because same combining class.  | 
1277  | 0  |             return FALSE;  | 
1278  | 0  |         } else {  // match: decompChar == sourceChar | 
1279  | 0  |             newNFDString.append(decompChar);  | 
1280  | 0  |             decompIndex += U16_LENGTH(decompChar);  | 
1281  | 0  |             sourceIndex += U16_LENGTH(decompChar);  | 
1282  | 0  |             sourceChar = U_SENTINEL;  | 
1283  | 0  |         }  | 
1284  | 0  |     }  | 
1285  |  |     // We are at the end of at least one of the two inputs.  | 
1286  | 0  |     if(sourceChar >= 0) {  // more characters from nfdString but not from decomp | 
1287  | 0  |         if(sourceCC < decompCC) { | 
1288  |  |             // Appending the next source character to the composite would not be FCD.  | 
1289  | 0  |             return FALSE;  | 
1290  | 0  |         }  | 
1291  | 0  |         newNFDString.append(nfdString, sourceIndex, 0x7fffffff);  | 
1292  | 0  |         newString.append(nfdString, sourceIndex, 0x7fffffff);  | 
1293  | 0  |     } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString | 
1294  | 0  |         newNFDString.append(decomp, decompIndex, 0x7fffffff);  | 
1295  | 0  |     }  | 
1296  | 0  |     U_ASSERT(nfd.isNormalized(newNFDString, errorCode));  | 
1297  | 0  |     U_ASSERT(fcd.isNormalized(newString, errorCode));  | 
1298  | 0  |     U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString);  // canonically equivalent  | 
1299  | 0  |     return TRUE;  | 
1300  | 0  | }  | 
1301  |  |  | 
1302  |  | UBool  | 
1303  | 0  | CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const { | 
1304  |  |     // Do not map non-FCD prefixes.  | 
1305  | 0  |     return !isFCD(s, errorCode);  | 
1306  | 0  | }  | 
1307  |  |  | 
1308  |  | UBool  | 
1309  | 0  | CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const { | 
1310  |  |     // Do not map non-FCD strings.  | 
1311  |  |     // Do not map strings that start with Hangul syllables: We decompose those on the fly.  | 
1312  | 0  |     return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0));  | 
1313  | 0  | }  | 
1314  |  |  | 
1315  |  | UBool  | 
1316  | 0  | CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const { | 
1317  | 0  |     return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode);  | 
1318  | 0  | }  | 
1319  |  |  | 
1320  |  | void  | 
1321  | 0  | CollationBuilder::closeOverComposites(UErrorCode &errorCode) { | 
1322  | 0  |     UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode);  // Java: static final | 
1323  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
1324  |  |     // Hangul is decomposed on the fly during collation.  | 
1325  | 0  |     composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);  | 
1326  | 0  |     UnicodeString prefix;  // empty  | 
1327  | 0  |     UnicodeString nfdString;  | 
1328  | 0  |     UnicodeSetIterator iter(composites);  | 
1329  | 0  |     while(iter.next()) { | 
1330  | 0  |         U_ASSERT(!iter.isString());  | 
1331  | 0  |         nfd.getDecomposition(iter.getCodepoint(), nfdString);  | 
1332  | 0  |         cesLength = dataBuilder->getCEs(nfdString, ces, 0);  | 
1333  | 0  |         if(cesLength > Collation::MAX_EXPANSION_LENGTH) { | 
1334  |  |             // Too many CEs from the decomposition (unusual), ignore this composite.  | 
1335  |  |             // We could add a capacity parameter to getCEs() and reallocate if necessary.  | 
1336  |  |             // However, this can only really happen in contrived cases.  | 
1337  | 0  |             continue;  | 
1338  | 0  |         }  | 
1339  | 0  |         const UnicodeString &composite(iter.getString());  | 
1340  | 0  |         addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode);  | 
1341  | 0  |     }  | 
1342  | 0  | }  | 
1343  |  |  | 
1344  |  | uint32_t  | 
1345  |  | CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,  | 
1346  |  |                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,  | 
1347  | 0  |                                  UErrorCode &errorCode) { | 
1348  | 0  |     if(U_FAILURE(errorCode)) { return ce32; } | 
1349  | 0  |     int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH];  | 
1350  | 0  |     int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0);  | 
1351  | 0  |     if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { | 
1352  | 0  |         if(ce32 == Collation::UNASSIGNED_CE32) { | 
1353  | 0  |             ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode);  | 
1354  | 0  |         }  | 
1355  | 0  |         dataBuilder->addCE32(prefix, str, ce32, errorCode);  | 
1356  | 0  |     }  | 
1357  | 0  |     return ce32;  | 
1358  | 0  | }  | 
1359  |  |  | 
1360  |  | UBool  | 
1361  |  | CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length,  | 
1362  | 0  |                           const int64_t ces2[], int32_t ces2Length) { | 
1363  | 0  |     if(ces1Length != ces2Length) { | 
1364  | 0  |         return FALSE;  | 
1365  | 0  |     }  | 
1366  | 0  |     U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH);  | 
1367  | 0  |     for(int32_t i = 0; i < ces1Length; ++i) { | 
1368  | 0  |         if(ces1[i] != ces2[i]) { return FALSE; } | 
1369  | 0  |     }  | 
1370  | 0  |     return TRUE;  | 
1371  | 0  | }  | 
1372  |  |  | 
1373  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1374  |  |  | 
1375  |  | uint32_t  | 
1376  |  | alignWeightRight(uint32_t w) { | 
1377  |  |     if(w != 0) { | 
1378  |  |         while((w & 0xff) == 0) { w >>= 8; } | 
1379  |  |     }  | 
1380  |  |     return w;  | 
1381  |  | }  | 
1382  |  |  | 
1383  |  | #endif  | 
1384  |  |  | 
1385  |  | void  | 
1386  | 0  | CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) { | 
1387  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
1388  |  |  | 
1389  | 0  |     CollationWeights primaries, secondaries, tertiaries;  | 
1390  | 0  |     int64_t *nodesArray = nodes.getBuffer();  | 
1391  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1392  |  |         puts("\nCollationBuilder::makeTailoredCEs()"); | 
1393  |  | #endif  | 
1394  |  | 
  | 
1395  | 0  |     for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { | 
1396  | 0  |         int32_t i = rootPrimaryIndexes.elementAti(rpi);  | 
1397  | 0  |         int64_t node = nodesArray[i];  | 
1398  | 0  |         uint32_t p = weight32FromNode(node);  | 
1399  | 0  |         uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16;  | 
1400  | 0  |         uint32_t t = s;  | 
1401  | 0  |         uint32_t q = 0;  | 
1402  | 0  |         UBool pIsTailored = FALSE;  | 
1403  | 0  |         UBool sIsTailored = FALSE;  | 
1404  | 0  |         UBool tIsTailored = FALSE;  | 
1405  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1406  |  |         printf("\nprimary     %lx\n", (long)alignWeightRight(p)); | 
1407  |  | #endif  | 
1408  | 0  |         int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p);  | 
1409  | 0  |         int32_t nextIndex = nextIndexFromNode(node);  | 
1410  | 0  |         while(nextIndex != 0) { | 
1411  | 0  |             i = nextIndex;  | 
1412  | 0  |             node = nodesArray[i];  | 
1413  | 0  |             nextIndex = nextIndexFromNode(node);  | 
1414  | 0  |             int32_t strength = strengthFromNode(node);  | 
1415  | 0  |             if(strength == UCOL_QUATERNARY) { | 
1416  | 0  |                 U_ASSERT(isTailoredNode(node));  | 
1417  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1418  |  |                 printf("      quat+     "); | 
1419  |  | #endif  | 
1420  | 0  |                 if(q == 3) { | 
1421  | 0  |                     errorCode = U_BUFFER_OVERFLOW_ERROR;  | 
1422  | 0  |                     errorReason = "quaternary tailoring gap too small";  | 
1423  | 0  |                     return;  | 
1424  | 0  |                 }  | 
1425  | 0  |                 ++q;  | 
1426  | 0  |             } else { | 
1427  | 0  |                 if(strength == UCOL_TERTIARY) { | 
1428  | 0  |                     if(isTailoredNode(node)) { | 
1429  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1430  |  |                         printf("    ter+        "); | 
1431  |  | #endif  | 
1432  | 0  |                         if(!tIsTailored) { | 
1433  |  |                             // First tailored tertiary node for [p, s].  | 
1434  | 0  |                             int32_t tCount = countTailoredNodes(nodesArray, nextIndex,  | 
1435  | 0  |                                                                 UCOL_TERTIARY) + 1;  | 
1436  | 0  |                             uint32_t tLimit;  | 
1437  | 0  |                             if(t == 0) { | 
1438  |  |                                 // Gap at the beginning of the tertiary CE range.  | 
1439  | 0  |                                 t = rootElements.getTertiaryBoundary() - 0x100;  | 
1440  | 0  |                                 tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;  | 
1441  | 0  |                             } else if(!pIsTailored && !sIsTailored) { | 
1442  |  |                                 // p and s are root weights.  | 
1443  | 0  |                                 tLimit = rootElements.getTertiaryAfter(pIndex, s, t);  | 
1444  | 0  |                             } else if(t == Collation::BEFORE_WEIGHT16) { | 
1445  | 0  |                                 tLimit = Collation::COMMON_WEIGHT16;  | 
1446  | 0  |                             } else { | 
1447  |  |                                 // [p, s] is tailored.  | 
1448  | 0  |                                 U_ASSERT(t == Collation::COMMON_WEIGHT16);  | 
1449  | 0  |                                 tLimit = rootElements.getTertiaryBoundary();  | 
1450  | 0  |                             }  | 
1451  | 0  |                             U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0);  | 
1452  | 0  |                             tertiaries.initForTertiary();  | 
1453  | 0  |                             if(!tertiaries.allocWeights(t, tLimit, tCount)) { | 
1454  | 0  |                                 errorCode = U_BUFFER_OVERFLOW_ERROR;  | 
1455  | 0  |                                 errorReason = "tertiary tailoring gap too small";  | 
1456  | 0  |                                 return;  | 
1457  | 0  |                             }  | 
1458  | 0  |                             tIsTailored = TRUE;  | 
1459  | 0  |                         }  | 
1460  | 0  |                         t = tertiaries.nextWeight();  | 
1461  | 0  |                         U_ASSERT(t != 0xffffffff);  | 
1462  | 0  |                     } else { | 
1463  | 0  |                         t = weight16FromNode(node);  | 
1464  | 0  |                         tIsTailored = FALSE;  | 
1465  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1466  |  |                         printf("    ter     %lx\n", (long)alignWeightRight(t)); | 
1467  |  | #endif  | 
1468  | 0  |                     }  | 
1469  | 0  |                 } else { | 
1470  | 0  |                     if(strength == UCOL_SECONDARY) { | 
1471  | 0  |                         if(isTailoredNode(node)) { | 
1472  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1473  |  |                             printf("  sec+          "); | 
1474  |  | #endif  | 
1475  | 0  |                             if(!sIsTailored) { | 
1476  |  |                                 // First tailored secondary node for p.  | 
1477  | 0  |                                 int32_t sCount = countTailoredNodes(nodesArray, nextIndex,  | 
1478  | 0  |                                                                     UCOL_SECONDARY) + 1;  | 
1479  | 0  |                                 uint32_t sLimit;  | 
1480  | 0  |                                 if(s == 0) { | 
1481  |  |                                     // Gap at the beginning of the secondary CE range.  | 
1482  | 0  |                                     s = rootElements.getSecondaryBoundary() - 0x100;  | 
1483  | 0  |                                     sLimit = rootElements.getFirstSecondaryCE() >> 16;  | 
1484  | 0  |                                 } else if(!pIsTailored) { | 
1485  |  |                                     // p is a root primary.  | 
1486  | 0  |                                     sLimit = rootElements.getSecondaryAfter(pIndex, s);  | 
1487  | 0  |                                 } else if(s == Collation::BEFORE_WEIGHT16) { | 
1488  | 0  |                                     sLimit = Collation::COMMON_WEIGHT16;  | 
1489  | 0  |                                 } else { | 
1490  |  |                                     // p is a tailored primary.  | 
1491  | 0  |                                     U_ASSERT(s == Collation::COMMON_WEIGHT16);  | 
1492  | 0  |                                     sLimit = rootElements.getSecondaryBoundary();  | 
1493  | 0  |                                 }  | 
1494  | 0  |                                 if(s == Collation::COMMON_WEIGHT16) { | 
1495  |  |                                     // Do not tailor into the getSortKey() range of  | 
1496  |  |                                     // compressed common secondaries.  | 
1497  | 0  |                                     s = rootElements.getLastCommonSecondary();  | 
1498  | 0  |                                 }  | 
1499  | 0  |                                 secondaries.initForSecondary();  | 
1500  | 0  |                                 if(!secondaries.allocWeights(s, sLimit, sCount)) { | 
1501  | 0  |                                     errorCode = U_BUFFER_OVERFLOW_ERROR;  | 
1502  | 0  |                                     errorReason = "secondary tailoring gap too small";  | 
1503  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1504  |  |                                     printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n", | 
1505  |  |                                            (long)alignWeightRight(s), (long)alignWeightRight(sLimit),  | 
1506  |  |                                            (long)alignWeightRight(sCount));  | 
1507  |  | #endif  | 
1508  | 0  |                                     return;  | 
1509  | 0  |                                 }  | 
1510  | 0  |                                 sIsTailored = TRUE;  | 
1511  | 0  |                             }  | 
1512  | 0  |                             s = secondaries.nextWeight();  | 
1513  | 0  |                             U_ASSERT(s != 0xffffffff);  | 
1514  | 0  |                         } else { | 
1515  | 0  |                             s = weight16FromNode(node);  | 
1516  | 0  |                             sIsTailored = FALSE;  | 
1517  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1518  |  |                             printf("  sec       %lx\n", (long)alignWeightRight(s)); | 
1519  |  | #endif  | 
1520  | 0  |                         }  | 
1521  | 0  |                     } else /* UCOL_PRIMARY */ { | 
1522  | 0  |                         U_ASSERT(isTailoredNode(node));  | 
1523  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1524  |  |                         printf("pri+            "); | 
1525  |  | #endif  | 
1526  | 0  |                         if(!pIsTailored) { | 
1527  |  |                             // First tailored primary node in this list.  | 
1528  | 0  |                             int32_t pCount = countTailoredNodes(nodesArray, nextIndex,  | 
1529  | 0  |                                                                 UCOL_PRIMARY) + 1;  | 
1530  | 0  |                             UBool isCompressible = baseData->isCompressiblePrimary(p);  | 
1531  | 0  |                             uint32_t pLimit =  | 
1532  | 0  |                                 rootElements.getPrimaryAfter(p, pIndex, isCompressible);  | 
1533  | 0  |                             primaries.initForPrimary(isCompressible);  | 
1534  | 0  |                             if(!primaries.allocWeights(p, pLimit, pCount)) { | 
1535  | 0  |                                 errorCode = U_BUFFER_OVERFLOW_ERROR;  // TODO: introduce a more specific UErrorCode?  | 
1536  | 0  |                                 errorReason = "primary tailoring gap too small";  | 
1537  | 0  |                                 return;  | 
1538  | 0  |                             }  | 
1539  | 0  |                             pIsTailored = TRUE;  | 
1540  | 0  |                         }  | 
1541  | 0  |                         p = primaries.nextWeight();  | 
1542  | 0  |                         U_ASSERT(p != 0xffffffff);  | 
1543  | 0  |                         s = Collation::COMMON_WEIGHT16;  | 
1544  | 0  |                         sIsTailored = FALSE;  | 
1545  | 0  |                     }  | 
1546  | 0  |                     t = s == 0 ? 0 : Collation::COMMON_WEIGHT16;  | 
1547  | 0  |                     tIsTailored = FALSE;  | 
1548  | 0  |                 }  | 
1549  | 0  |                 q = 0;  | 
1550  | 0  |             }  | 
1551  | 0  |             if(isTailoredNode(node)) { | 
1552  | 0  |                 nodesArray[i] = Collation::makeCE(p, s, t, q);  | 
1553  |  | #ifdef DEBUG_COLLATION_BUILDER  | 
1554  |  |                 printf("%016llx\n", (long long)nodesArray[i]); | 
1555  |  | #endif  | 
1556  | 0  |             }  | 
1557  | 0  |         }  | 
1558  | 0  |     }  | 
1559  | 0  | }  | 
1560  |  |  | 
1561  |  | int32_t  | 
1562  | 0  | CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) { | 
1563  | 0  |     int32_t count = 0;  | 
1564  | 0  |     for(;;) { | 
1565  | 0  |         if(i == 0) { break; } | 
1566  | 0  |         int64_t node = nodesArray[i];  | 
1567  | 0  |         if(strengthFromNode(node) < strength) { break; } | 
1568  | 0  |         if(strengthFromNode(node) == strength) { | 
1569  | 0  |             if(isTailoredNode(node)) { | 
1570  | 0  |                 ++count;  | 
1571  | 0  |             } else { | 
1572  | 0  |                 break;  | 
1573  | 0  |             }  | 
1574  | 0  |         }  | 
1575  | 0  |         i = nextIndexFromNode(node);  | 
1576  | 0  |     }  | 
1577  | 0  |     return count;  | 
1578  | 0  | }  | 
1579  |  |  | 
1580  |  | class CEFinalizer : public CollationDataBuilder::CEModifier { | 
1581  |  | public:  | 
1582  | 0  |     CEFinalizer(const int64_t *ces) : finalCEs(ces) {} | 
1583  |  |     virtual ~CEFinalizer();  | 
1584  | 0  |     virtual int64_t modifyCE32(uint32_t ce32) const { | 
1585  | 0  |         U_ASSERT(!Collation::isSpecialCE32(ce32));  | 
1586  | 0  |         if(CollationBuilder::isTempCE32(ce32)) { | 
1587  |  |             // retain case bits  | 
1588  | 0  |             return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);  | 
1589  | 0  |         } else { | 
1590  | 0  |             return Collation::NO_CE;  | 
1591  | 0  |         }  | 
1592  | 0  |     }  | 
1593  | 0  |     virtual int64_t modifyCE(int64_t ce) const { | 
1594  | 0  |         if(CollationBuilder::isTempCE(ce)) { | 
1595  |  |             // retain case bits  | 
1596  | 0  |             return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000);  | 
1597  | 0  |         } else { | 
1598  | 0  |             return Collation::NO_CE;  | 
1599  | 0  |         }  | 
1600  | 0  |     }  | 
1601  |  |  | 
1602  |  | private:  | 
1603  |  |     const int64_t *finalCEs;  | 
1604  |  | };  | 
1605  |  |  | 
1606  | 0  | CEFinalizer::~CEFinalizer() {} | 
1607  |  |  | 
1608  |  | void  | 
1609  | 0  | CollationBuilder::finalizeCEs(UErrorCode &errorCode) { | 
1610  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
1611  | 0  |     LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode), errorCode);  | 
1612  | 0  |     if(U_FAILURE(errorCode)) { | 
1613  | 0  |         return;  | 
1614  | 0  |     }  | 
1615  | 0  |     newBuilder->initForTailoring(baseData, errorCode);  | 
1616  | 0  |     CEFinalizer finalizer(nodes.getBuffer());  | 
1617  | 0  |     newBuilder->copyFrom(*dataBuilder, finalizer, errorCode);  | 
1618  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
1619  | 0  |     delete dataBuilder;  | 
1620  | 0  |     dataBuilder = newBuilder.orphan();  | 
1621  | 0  | }  | 
1622  |  |  | 
1623  |  | int32_t  | 
1624  | 0  | CollationBuilder::ceStrength(int64_t ce) { | 
1625  | 0  |     return  | 
1626  | 0  |         isTempCE(ce) ? strengthFromTempCE(ce) :  | 
1627  | 0  |         (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY :  | 
1628  | 0  |         ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY :  | 
1629  | 0  |         ce != 0 ? UCOL_TERTIARY :  | 
1630  | 0  |         UCOL_IDENTICAL;  | 
1631  | 0  | }  | 
1632  |  |  | 
1633  |  | U_NAMESPACE_END  | 
1634  |  |  | 
1635  |  | U_NAMESPACE_USE  | 
1636  |  |  | 
1637  |  | U_CAPI UCollator * U_EXPORT2  | 
1638  |  | ucol_openRules(const UChar *rules, int32_t rulesLength,  | 
1639  |  |                UColAttributeValue normalizationMode, UCollationStrength strength,  | 
1640  | 0  |                UParseError *parseError, UErrorCode *pErrorCode) { | 
1641  | 0  |     if(U_FAILURE(*pErrorCode)) { return NULL; } | 
1642  | 0  |     if(rules == NULL && rulesLength != 0) { | 
1643  | 0  |         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;  | 
1644  | 0  |         return NULL;  | 
1645  | 0  |     }  | 
1646  | 0  |     RuleBasedCollator *coll = new RuleBasedCollator();  | 
1647  | 0  |     if(coll == NULL) { | 
1648  | 0  |         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;  | 
1649  | 0  |         return NULL;  | 
1650  | 0  |     }  | 
1651  | 0  |     UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength);  | 
1652  | 0  |     coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode);  | 
1653  | 0  |     if(U_FAILURE(*pErrorCode)) { | 
1654  | 0  |         delete coll;  | 
1655  | 0  |         return NULL;  | 
1656  | 0  |     }  | 
1657  | 0  |     return coll->toUCollator();  | 
1658  | 0  | }  | 
1659  |  |  | 
1660  |  | static const int32_t internalBufferSize = 512;  | 
1661  |  |  | 
1662  |  | // The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp  | 
1663  |  | // because it calls UnicodeSet "builder" code that depends on all Unicode properties,  | 
1664  |  | // and the rest of the collation "runtime" code only depends on normalization.  | 
1665  |  | // This function is not related to the collation builder,  | 
1666  |  | // but it did not seem worth moving it into its own .cpp file,  | 
1667  |  | // nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods.  | 
1668  |  | U_CAPI int32_t U_EXPORT2  | 
1669  |  | ucol_getUnsafeSet( const UCollator *coll,  | 
1670  |  |                   USet *unsafe,  | 
1671  |  |                   UErrorCode *status)  | 
1672  | 0  | { | 
1673  | 0  |     UChar buffer[internalBufferSize];  | 
1674  | 0  |     int32_t len = 0;  | 
1675  |  | 
  | 
1676  | 0  |     uset_clear(unsafe);  | 
1677  |  |  | 
1678  |  |     // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant  | 
1679  | 0  |     static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, | 
1680  | 0  |                                     0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };  | 
1681  |  |  | 
1682  |  |     // add chars that fail the fcd check  | 
1683  | 0  |     uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);  | 
1684  |  |  | 
1685  |  |     // add lead/trail surrogates  | 
1686  |  |     // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*,  | 
1687  |  |     // not when testing code *points*)  | 
1688  | 0  |     uset_addRange(unsafe, 0xd800, 0xdfff);  | 
1689  |  | 
  | 
1690  | 0  |     USet *contractions = uset_open(0,0);  | 
1691  |  | 
  | 
1692  | 0  |     int32_t i = 0, j = 0;  | 
1693  | 0  |     ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);  | 
1694  | 0  |     int32_t contsSize = uset_size(contractions);  | 
1695  | 0  |     UChar32 c = 0;  | 
1696  |  |     // Contraction set consists only of strings  | 
1697  |  |     // to get unsafe code points, we need to  | 
1698  |  |     // break the strings apart and add them to the unsafe set  | 
1699  | 0  |     for(i = 0; i < contsSize; i++) { | 
1700  | 0  |         len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);  | 
1701  | 0  |         if(len > 0) { | 
1702  | 0  |             j = 0;  | 
1703  | 0  |             while(j < len) { | 
1704  | 0  |                 U16_NEXT(buffer, j, len, c);  | 
1705  | 0  |                 if(j < len) { | 
1706  | 0  |                     uset_add(unsafe, c);  | 
1707  | 0  |                 }  | 
1708  | 0  |             }  | 
1709  | 0  |         }  | 
1710  | 0  |     }  | 
1711  |  | 
  | 
1712  | 0  |     uset_close(contractions);  | 
1713  |  | 
  | 
1714  | 0  |     return uset_size(unsafe);  | 
1715  | 0  | }  | 
1716  |  |  | 
1717  |  | #endif  // !UCONFIG_NO_COLLATION  |