/src/icu/source/i18n/collationiterator.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | * Copyright (C) 2010-2014, International Business Machines  | 
6  |  | * Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | * collationiterator.cpp  | 
9  |  | *  | 
10  |  | * created on: 2010oct27  | 
11  |  | * created by: Markus W. Scherer  | 
12  |  | */  | 
13  |  |  | 
14  |  | #include "utypeinfo.h"  // for 'typeid' to work  | 
15  |  |  | 
16  |  | #include "unicode/utypes.h"  | 
17  |  |  | 
18  |  | #if !UCONFIG_NO_COLLATION  | 
19  |  |  | 
20  |  | #include "unicode/ucharstrie.h"  | 
21  |  | #include "unicode/ustringtrie.h"  | 
22  |  | #include "charstr.h"  | 
23  |  | #include "cmemory.h"  | 
24  |  | #include "collation.h"  | 
25  |  | #include "collationdata.h"  | 
26  |  | #include "collationfcd.h"  | 
27  |  | #include "collationiterator.h"  | 
28  |  | #include "normalizer2impl.h"  | 
29  |  | #include "uassert.h"  | 
30  |  | #include "uvectr32.h"  | 
31  |  |  | 
32  |  | U_NAMESPACE_BEGIN  | 
33  |  |  | 
34  | 0  | CollationIterator::CEBuffer::~CEBuffer() {} | 
35  |  |  | 
36  |  | UBool  | 
37  | 0  | CollationIterator::CEBuffer::ensureAppendCapacity(int32_t appCap, UErrorCode &errorCode) { | 
38  | 0  |     int32_t capacity = buffer.getCapacity();  | 
39  | 0  |     if((length + appCap) <= capacity) { return TRUE; } | 
40  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
41  | 0  |     do { | 
42  | 0  |         if(capacity < 1000) { | 
43  | 0  |             capacity *= 4;  | 
44  | 0  |         } else { | 
45  | 0  |             capacity *= 2;  | 
46  | 0  |         }  | 
47  | 0  |     } while(capacity < (length + appCap));  | 
48  | 0  |     int64_t *p = buffer.resize(capacity, length);  | 
49  | 0  |     if(p == NULL) { | 
50  | 0  |         errorCode = U_MEMORY_ALLOCATION_ERROR;  | 
51  | 0  |         return FALSE;  | 
52  | 0  |     }  | 
53  | 0  |     return TRUE;  | 
54  | 0  | }  | 
55  |  |  | 
56  |  | // State of combining marks skipped in discontiguous contraction.  | 
57  |  | // We create a state object on first use and keep it around deactivated between uses.  | 
58  |  | class SkippedState : public UMemory { | 
59  |  | public:  | 
60  |  |     // Born active but empty.  | 
61  | 0  |     SkippedState() : pos(0), skipLengthAtMatch(0) {} | 
62  | 0  |     void clear() { | 
63  | 0  |         oldBuffer.remove();  | 
64  | 0  |         pos = 0;  | 
65  |  |         // The newBuffer is reset by setFirstSkipped().  | 
66  | 0  |     }  | 
67  |  |  | 
68  | 0  |     UBool isEmpty() const { return oldBuffer.isEmpty(); } | 
69  |  |  | 
70  | 0  |     UBool hasNext() const { return pos < oldBuffer.length(); } | 
71  |  |  | 
72  |  |     // Requires hasNext().  | 
73  | 0  |     UChar32 next() { | 
74  | 0  |         UChar32 c = oldBuffer.char32At(pos);  | 
75  | 0  |         pos += U16_LENGTH(c);  | 
76  | 0  |         return c;  | 
77  | 0  |     }  | 
78  |  |  | 
79  |  |     // Accounts for one more input code point read beyond the end of the marks buffer.  | 
80  | 0  |     void incBeyond() { | 
81  | 0  |         U_ASSERT(!hasNext());  | 
82  | 0  |         ++pos;  | 
83  | 0  |     }  | 
84  |  |  | 
85  |  |     // Goes backward through the skipped-marks buffer.  | 
86  |  |     // Returns the number of code points read beyond the skipped marks  | 
87  |  |     // that need to be backtracked through normal input.  | 
88  | 0  |     int32_t backwardNumCodePoints(int32_t n) { | 
89  | 0  |         int32_t length = oldBuffer.length();  | 
90  | 0  |         int32_t beyond = pos - length;  | 
91  | 0  |         if(beyond > 0) { | 
92  | 0  |             if(beyond >= n) { | 
93  |  |                 // Not back far enough to re-enter the oldBuffer.  | 
94  | 0  |                 pos -= n;  | 
95  | 0  |                 return n;  | 
96  | 0  |             } else { | 
97  |  |                 // Back out all beyond-oldBuffer code points and re-enter the buffer.  | 
98  | 0  |                 pos = oldBuffer.moveIndex32(length, beyond - n);  | 
99  | 0  |                 return beyond;  | 
100  | 0  |             }  | 
101  | 0  |         } else { | 
102  |  |             // Go backwards from inside the oldBuffer.  | 
103  | 0  |             pos = oldBuffer.moveIndex32(pos, -n);  | 
104  | 0  |             return 0;  | 
105  | 0  |         }  | 
106  | 0  |     }  | 
107  |  |  | 
108  | 0  |     void setFirstSkipped(UChar32 c) { | 
109  | 0  |         skipLengthAtMatch = 0;  | 
110  | 0  |         newBuffer.setTo(c);  | 
111  | 0  |     }  | 
112  |  |  | 
113  | 0  |     void skip(UChar32 c) { | 
114  | 0  |         newBuffer.append(c);  | 
115  | 0  |     }  | 
116  |  |  | 
117  | 0  |     void recordMatch() { skipLengthAtMatch = newBuffer.length(); } | 
118  |  |  | 
119  |  |     // Replaces the characters we consumed with the newly skipped ones.  | 
120  | 0  |     void replaceMatch() { | 
121  |  |         // Note: UnicodeString.replace() pins pos to at most length().  | 
122  | 0  |         oldBuffer.replace(0, pos, newBuffer, 0, skipLengthAtMatch);  | 
123  | 0  |         pos = 0;  | 
124  | 0  |     }  | 
125  |  |  | 
126  | 0  |     void saveTrieState(const UCharsTrie &trie) { trie.saveState(state); } | 
127  | 0  |     void resetToTrieState(UCharsTrie &trie) const { trie.resetToState(state); } | 
128  |  |  | 
129  |  | private:  | 
130  |  |     // Combining marks skipped in previous discontiguous-contraction matching.  | 
131  |  |     // After that discontiguous contraction was completed, we start reading them from here.  | 
132  |  |     UnicodeString oldBuffer;  | 
133  |  |     // Combining marks newly skipped in current discontiguous-contraction matching.  | 
134  |  |     // These might have been read from the normal text or from the oldBuffer.  | 
135  |  |     UnicodeString newBuffer;  | 
136  |  |     // Reading index in oldBuffer,  | 
137  |  |     // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).  | 
138  |  |     int32_t pos;  | 
139  |  |     // newBuffer.length() at the time of the last matching character.  | 
140  |  |     // When a partial match fails, we back out skipped and partial-matching input characters.  | 
141  |  |     int32_t skipLengthAtMatch;  | 
142  |  |     // We save the trie state before we attempt to match a character,  | 
143  |  |     // so that we can skip it and try the next one.  | 
144  |  |     UCharsTrie::State state;  | 
145  |  | };  | 
146  |  |  | 
147  |  | CollationIterator::CollationIterator(const CollationIterator &other)  | 
148  | 0  |         : UObject(other),  | 
149  | 0  |           trie(other.trie),  | 
150  | 0  |           data(other.data),  | 
151  | 0  |           cesIndex(other.cesIndex),  | 
152  |  |           skipped(NULL),  | 
153  | 0  |           numCpFwd(other.numCpFwd),  | 
154  | 0  |           isNumeric(other.isNumeric) { | 
155  | 0  |     UErrorCode errorCode = U_ZERO_ERROR;  | 
156  | 0  |     int32_t length = other.ceBuffer.length;  | 
157  | 0  |     if(length > 0 && ceBuffer.ensureAppendCapacity(length, errorCode)) { | 
158  | 0  |         for(int32_t i = 0; i < length; ++i) { | 
159  | 0  |             ceBuffer.set(i, other.ceBuffer.get(i));  | 
160  | 0  |         }  | 
161  | 0  |         ceBuffer.length = length;  | 
162  | 0  |     } else { | 
163  | 0  |         cesIndex = 0;  | 
164  | 0  |     }  | 
165  | 0  | }  | 
166  |  |  | 
167  | 0  | CollationIterator::~CollationIterator() { | 
168  | 0  |     delete skipped;  | 
169  | 0  | }  | 
170  |  |  | 
171  |  | bool  | 
172  | 0  | CollationIterator::operator==(const CollationIterator &other) const { | 
173  |  |     // Subclasses: Call this method and then add more specific checks.  | 
174  |  |     // Compare the iterator state but not the collation data (trie & data fields):  | 
175  |  |     // Assume that the caller compares the data.  | 
176  |  |     // Ignore skipped since that should be unused between calls to nextCE().  | 
177  |  |     // (It only stays around to avoid another memory allocation.)  | 
178  | 0  |     if(!(typeid(*this) == typeid(other) &&  | 
179  | 0  |             ceBuffer.length == other.ceBuffer.length &&  | 
180  | 0  |             cesIndex == other.cesIndex &&  | 
181  | 0  |             numCpFwd == other.numCpFwd &&  | 
182  | 0  |             isNumeric == other.isNumeric)) { | 
183  | 0  |         return FALSE;  | 
184  | 0  |     }  | 
185  | 0  |     for(int32_t i = 0; i < ceBuffer.length; ++i) { | 
186  | 0  |         if(ceBuffer.get(i) != other.ceBuffer.get(i)) { return FALSE; } | 
187  | 0  |     }  | 
188  | 0  |     return TRUE;  | 
189  | 0  | }  | 
190  |  |  | 
191  |  | void  | 
192  | 0  | CollationIterator::reset() { | 
193  | 0  |     cesIndex = ceBuffer.length = 0;  | 
194  | 0  |     if(skipped != NULL) { skipped->clear(); } | 
195  | 0  | }  | 
196  |  |  | 
197  |  | int32_t  | 
198  | 0  | CollationIterator::fetchCEs(UErrorCode &errorCode) { | 
199  | 0  |     while(U_SUCCESS(errorCode) && nextCE(errorCode) != Collation::NO_CE) { | 
200  |  |         // No need to loop for each expansion CE.  | 
201  | 0  |         cesIndex = ceBuffer.length;  | 
202  | 0  |     }  | 
203  | 0  |     return ceBuffer.length;  | 
204  | 0  | }  | 
205  |  |  | 
206  |  | uint32_t  | 
207  | 0  | CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { | 
208  | 0  |     c = nextCodePoint(errorCode);  | 
209  | 0  |     return (c < 0) ? Collation::FALLBACK_CE32 : data->getCE32(c);  | 
210  | 0  | }  | 
211  |  |  | 
212  |  | UChar  | 
213  | 0  | CollationIterator::handleGetTrailSurrogate() { | 
214  | 0  |     return 0;  | 
215  | 0  | }  | 
216  |  |  | 
217  |  | UBool  | 
218  | 0  | CollationIterator::foundNULTerminator() { | 
219  | 0  |     return FALSE;  | 
220  | 0  | }  | 
221  |  |  | 
222  |  | UBool  | 
223  | 0  | CollationIterator::forbidSurrogateCodePoints() const { | 
224  | 0  |     return FALSE;  | 
225  | 0  | }  | 
226  |  |  | 
227  |  | uint32_t  | 
228  | 0  | CollationIterator::getDataCE32(UChar32 c) const { | 
229  | 0  |     return data->getCE32(c);  | 
230  | 0  | }  | 
231  |  |  | 
232  |  | uint32_t  | 
233  | 0  | CollationIterator::getCE32FromBuilderData(uint32_t /*ce32*/, UErrorCode &errorCode) { | 
234  | 0  |     if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; } | 
235  | 0  |     return 0;  | 
236  | 0  | }  | 
237  |  |  | 
238  |  | int64_t  | 
239  |  | CollationIterator::nextCEFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,  | 
240  | 0  |                                   UErrorCode &errorCode) { | 
241  | 0  |     --ceBuffer.length;  // Undo ceBuffer.incLength().  | 
242  | 0  |     appendCEsFromCE32(d, c, ce32, TRUE, errorCode);  | 
243  | 0  |     if(U_SUCCESS(errorCode)) { | 
244  | 0  |         return ceBuffer.get(cesIndex++);  | 
245  | 0  |     } else { | 
246  | 0  |         return Collation::NO_CE_PRIMARY;  | 
247  | 0  |     }  | 
248  | 0  | }  | 
249  |  |  | 
250  |  | void  | 
251  |  | CollationIterator::appendCEsFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,  | 
252  | 0  |                                      UBool forward, UErrorCode &errorCode) { | 
253  | 0  |     while(Collation::isSpecialCE32(ce32)) { | 
254  | 0  |         switch(Collation::tagFromCE32(ce32)) { | 
255  | 0  |         case Collation::FALLBACK_TAG:  | 
256  | 0  |         case Collation::RESERVED_TAG_3:  | 
257  | 0  |             if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; } | 
258  | 0  |             return;  | 
259  | 0  |         case Collation::LONG_PRIMARY_TAG:  | 
260  | 0  |             ceBuffer.append(Collation::ceFromLongPrimaryCE32(ce32), errorCode);  | 
261  | 0  |             return;  | 
262  | 0  |         case Collation::LONG_SECONDARY_TAG:  | 
263  | 0  |             ceBuffer.append(Collation::ceFromLongSecondaryCE32(ce32), errorCode);  | 
264  | 0  |             return;  | 
265  | 0  |         case Collation::LATIN_EXPANSION_TAG:  | 
266  | 0  |             if(ceBuffer.ensureAppendCapacity(2, errorCode)) { | 
267  | 0  |                 ceBuffer.set(ceBuffer.length, Collation::latinCE0FromCE32(ce32));  | 
268  | 0  |                 ceBuffer.set(ceBuffer.length + 1, Collation::latinCE1FromCE32(ce32));  | 
269  | 0  |                 ceBuffer.length += 2;  | 
270  | 0  |             }  | 
271  | 0  |             return;  | 
272  | 0  |         case Collation::EXPANSION32_TAG: { | 
273  | 0  |             const uint32_t *ce32s = d->ce32s + Collation::indexFromCE32(ce32);  | 
274  | 0  |             int32_t length = Collation::lengthFromCE32(ce32);  | 
275  | 0  |             if(ceBuffer.ensureAppendCapacity(length, errorCode)) { | 
276  | 0  |                 do { | 
277  | 0  |                     ceBuffer.appendUnsafe(Collation::ceFromCE32(*ce32s++));  | 
278  | 0  |                 } while(--length > 0);  | 
279  | 0  |             }  | 
280  | 0  |             return;  | 
281  | 0  |         }  | 
282  | 0  |         case Collation::EXPANSION_TAG: { | 
283  | 0  |             const int64_t *ces = d->ces + Collation::indexFromCE32(ce32);  | 
284  | 0  |             int32_t length = Collation::lengthFromCE32(ce32);  | 
285  | 0  |             if(ceBuffer.ensureAppendCapacity(length, errorCode)) { | 
286  | 0  |                 do { | 
287  | 0  |                     ceBuffer.appendUnsafe(*ces++);  | 
288  | 0  |                 } while(--length > 0);  | 
289  | 0  |             }  | 
290  | 0  |             return;  | 
291  | 0  |         }  | 
292  | 0  |         case Collation::BUILDER_DATA_TAG:  | 
293  | 0  |             ce32 = getCE32FromBuilderData(ce32, errorCode);  | 
294  | 0  |             if(U_FAILURE(errorCode)) { return; } | 
295  | 0  |             if(ce32 == Collation::FALLBACK_CE32) { | 
296  | 0  |                 d = data->base;  | 
297  | 0  |                 ce32 = d->getCE32(c);  | 
298  | 0  |             }  | 
299  | 0  |             break;  | 
300  | 0  |         case Collation::PREFIX_TAG:  | 
301  | 0  |             if(forward) { backwardNumCodePoints(1, errorCode); } | 
302  | 0  |             ce32 = getCE32FromPrefix(d, ce32, errorCode);  | 
303  | 0  |             if(forward) { forwardNumCodePoints(1, errorCode); } | 
304  | 0  |             break;  | 
305  | 0  |         case Collation::CONTRACTION_TAG: { | 
306  | 0  |             const UChar *p = d->contexts + Collation::indexFromCE32(ce32);  | 
307  | 0  |             uint32_t defaultCE32 = CollationData::readCE32(p);  // Default if no suffix match.  | 
308  | 0  |             if(!forward) { | 
309  |  |                 // Backward contractions are handled by previousCEUnsafe().  | 
310  |  |                 // c has contractions but they were not found.  | 
311  | 0  |                 ce32 = defaultCE32;  | 
312  | 0  |                 break;  | 
313  | 0  |             }  | 
314  | 0  |             UChar32 nextCp;  | 
315  | 0  |             if(skipped == NULL && numCpFwd < 0) { | 
316  |  |                 // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,  | 
317  |  |                 // avoiding the function call and the nextSkippedCodePoint() overhead.  | 
318  | 0  |                 nextCp = nextCodePoint(errorCode);  | 
319  | 0  |                 if(nextCp < 0) { | 
320  |  |                     // No more text.  | 
321  | 0  |                     ce32 = defaultCE32;  | 
322  | 0  |                     break;  | 
323  | 0  |                 } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&  | 
324  | 0  |                         !CollationFCD::mayHaveLccc(nextCp)) { | 
325  |  |                     // All contraction suffixes start with characters with lccc!=0  | 
326  |  |                     // but the next code point has lccc==0.  | 
327  | 0  |                     backwardNumCodePoints(1, errorCode);  | 
328  | 0  |                     ce32 = defaultCE32;  | 
329  | 0  |                     break;  | 
330  | 0  |                 }  | 
331  | 0  |             } else { | 
332  | 0  |                 nextCp = nextSkippedCodePoint(errorCode);  | 
333  | 0  |                 if(nextCp < 0) { | 
334  |  |                     // No more text.  | 
335  | 0  |                     ce32 = defaultCE32;  | 
336  | 0  |                     break;  | 
337  | 0  |                 } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&  | 
338  | 0  |                         !CollationFCD::mayHaveLccc(nextCp)) { | 
339  |  |                     // All contraction suffixes start with characters with lccc!=0  | 
340  |  |                     // but the next code point has lccc==0.  | 
341  | 0  |                     backwardNumSkipped(1, errorCode);  | 
342  | 0  |                     ce32 = defaultCE32;  | 
343  | 0  |                     break;  | 
344  | 0  |                 }  | 
345  | 0  |             }  | 
346  | 0  |             ce32 = nextCE32FromContraction(d, ce32, p + 2, defaultCE32, nextCp, errorCode);  | 
347  | 0  |             if(ce32 == Collation::NO_CE32) { | 
348  |  |                 // CEs from a discontiguous contraction plus the skipped combining marks  | 
349  |  |                 // have been appended already.  | 
350  | 0  |                 return;  | 
351  | 0  |             }  | 
352  | 0  |             break;  | 
353  | 0  |         }  | 
354  | 0  |         case Collation::DIGIT_TAG:  | 
355  | 0  |             if(isNumeric) { | 
356  | 0  |                 appendNumericCEs(ce32, forward, errorCode);  | 
357  | 0  |                 return;  | 
358  | 0  |             } else { | 
359  |  |                 // Fetch the non-numeric-collation CE32 and continue.  | 
360  | 0  |                 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];  | 
361  | 0  |                 break;  | 
362  | 0  |             }  | 
363  | 0  |         case Collation::U0000_TAG:  | 
364  | 0  |             U_ASSERT(c == 0);  | 
365  | 0  |             if(forward && foundNULTerminator()) { | 
366  |  |                 // Handle NUL-termination. (Not needed in Java.)  | 
367  | 0  |                 ceBuffer.append(Collation::NO_CE, errorCode);  | 
368  | 0  |                 return;  | 
369  | 0  |             } else { | 
370  |  |                 // Fetch the normal ce32 for U+0000 and continue.  | 
371  | 0  |                 ce32 = d->ce32s[0];  | 
372  | 0  |                 break;  | 
373  | 0  |             }  | 
374  | 0  |         case Collation::HANGUL_TAG: { | 
375  | 0  |             const uint32_t *jamoCE32s = d->jamoCE32s;  | 
376  | 0  |             c -= Hangul::HANGUL_BASE;  | 
377  | 0  |             UChar32 t = c % Hangul::JAMO_T_COUNT;  | 
378  | 0  |             c /= Hangul::JAMO_T_COUNT;  | 
379  | 0  |             UChar32 v = c % Hangul::JAMO_V_COUNT;  | 
380  | 0  |             c /= Hangul::JAMO_V_COUNT;  | 
381  | 0  |             if((ce32 & Collation::HANGUL_NO_SPECIAL_JAMO) != 0) { | 
382  |  |                 // None of the Jamo CE32s are isSpecialCE32().  | 
383  |  |                 // Avoid recursive function calls and per-Jamo tests.  | 
384  | 0  |                 if(ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3, errorCode)) { | 
385  | 0  |                     ceBuffer.set(ceBuffer.length, Collation::ceFromCE32(jamoCE32s[c]));  | 
386  | 0  |                     ceBuffer.set(ceBuffer.length + 1, Collation::ceFromCE32(jamoCE32s[19 + v]));  | 
387  | 0  |                     ceBuffer.length += 2;  | 
388  | 0  |                     if(t != 0) { | 
389  | 0  |                         ceBuffer.appendUnsafe(Collation::ceFromCE32(jamoCE32s[39 + t]));  | 
390  | 0  |                     }  | 
391  | 0  |                 }  | 
392  | 0  |                 return;  | 
393  | 0  |             } else { | 
394  |  |                 // We should not need to compute each Jamo code point.  | 
395  |  |                 // In particular, there should be no offset or implicit ce32.  | 
396  | 0  |                 appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[c], forward, errorCode);  | 
397  | 0  |                 appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[19 + v], forward, errorCode);  | 
398  | 0  |                 if(t == 0) { return; } | 
399  |  |                 // offset 39 = 19 + 21 - 1:  | 
400  |  |                 // 19 = JAMO_L_COUNT  | 
401  |  |                 // 21 = JAMO_T_COUNT  | 
402  |  |                 // -1 = omit t==0  | 
403  | 0  |                 ce32 = jamoCE32s[39 + t];  | 
404  | 0  |                 c = U_SENTINEL;  | 
405  | 0  |                 break;  | 
406  | 0  |             }  | 
407  | 0  |         }  | 
408  | 0  |         case Collation::LEAD_SURROGATE_TAG: { | 
409  | 0  |             U_ASSERT(forward);  // Backward iteration should never see lead surrogate code _unit_ data.  | 
410  | 0  |             U_ASSERT(U16_IS_LEAD(c));  | 
411  | 0  |             UChar trail;  | 
412  | 0  |             if(U16_IS_TRAIL(trail = handleGetTrailSurrogate())) { | 
413  | 0  |                 c = U16_GET_SUPPLEMENTARY(c, trail);  | 
414  | 0  |                 ce32 &= Collation::LEAD_TYPE_MASK;  | 
415  | 0  |                 if(ce32 == Collation::LEAD_ALL_UNASSIGNED) { | 
416  | 0  |                     ce32 = Collation::UNASSIGNED_CE32;  // unassigned-implicit  | 
417  | 0  |                 } else if(ce32 == Collation::LEAD_ALL_FALLBACK ||  | 
418  | 0  |                         (ce32 = d->getCE32FromSupplementary(c)) == Collation::FALLBACK_CE32) { | 
419  |  |                     // fall back to the base data  | 
420  | 0  |                     d = d->base;  | 
421  | 0  |                     ce32 = d->getCE32FromSupplementary(c);  | 
422  | 0  |                 }  | 
423  | 0  |             } else { | 
424  |  |                 // c is an unpaired surrogate.  | 
425  | 0  |                 ce32 = Collation::UNASSIGNED_CE32;  | 
426  | 0  |             }  | 
427  | 0  |             break;  | 
428  | 0  |         }  | 
429  | 0  |         case Collation::OFFSET_TAG:  | 
430  | 0  |             U_ASSERT(c >= 0);  | 
431  | 0  |             ceBuffer.append(d->getCEFromOffsetCE32(c, ce32), errorCode);  | 
432  | 0  |             return;  | 
433  | 0  |         case Collation::IMPLICIT_TAG:  | 
434  | 0  |             U_ASSERT(c >= 0);  | 
435  | 0  |             if(U_IS_SURROGATE(c) && forbidSurrogateCodePoints()) { | 
436  | 0  |                 ce32 = Collation::FFFD_CE32;  | 
437  | 0  |                 break;  | 
438  | 0  |             } else { | 
439  | 0  |                 ceBuffer.append(Collation::unassignedCEFromCodePoint(c), errorCode);  | 
440  | 0  |                 return;  | 
441  | 0  |             }  | 
442  | 0  |         }  | 
443  | 0  |     }  | 
444  | 0  |     ceBuffer.append(Collation::ceFromSimpleCE32(ce32), errorCode);  | 
445  | 0  | }  | 
446  |  |  | 
447  |  | uint32_t  | 
448  |  | CollationIterator::getCE32FromPrefix(const CollationData *d, uint32_t ce32,  | 
449  | 0  |                                      UErrorCode &errorCode) { | 
450  | 0  |     const UChar *p = d->contexts + Collation::indexFromCE32(ce32);  | 
451  | 0  |     ce32 = CollationData::readCE32(p);  // Default if no prefix match.  | 
452  | 0  |     p += 2;  | 
453  |  |     // Number of code points read before the original code point.  | 
454  | 0  |     int32_t lookBehind = 0;  | 
455  | 0  |     UCharsTrie prefixes(p);  | 
456  | 0  |     for(;;) { | 
457  | 0  |         UChar32 c = previousCodePoint(errorCode);  | 
458  | 0  |         if(c < 0) { break; } | 
459  | 0  |         ++lookBehind;  | 
460  | 0  |         UStringTrieResult match = prefixes.nextForCodePoint(c);  | 
461  | 0  |         if(USTRINGTRIE_HAS_VALUE(match)) { | 
462  | 0  |             ce32 = (uint32_t)prefixes.getValue();  | 
463  | 0  |         }  | 
464  | 0  |         if(!USTRINGTRIE_HAS_NEXT(match)) { break; } | 
465  | 0  |     }  | 
466  | 0  |     forwardNumCodePoints(lookBehind, errorCode);  | 
467  | 0  |     return ce32;  | 
468  | 0  | }  | 
469  |  |  | 
470  |  | UChar32  | 
471  | 0  | CollationIterator::nextSkippedCodePoint(UErrorCode &errorCode) { | 
472  | 0  |     if(skipped != NULL && skipped->hasNext()) { return skipped->next(); } | 
473  | 0  |     if(numCpFwd == 0) { return U_SENTINEL; } | 
474  | 0  |     UChar32 c = nextCodePoint(errorCode);  | 
475  | 0  |     if(skipped != NULL && !skipped->isEmpty() && c >= 0) { skipped->incBeyond(); } | 
476  | 0  |     if(numCpFwd > 0 && c >= 0) { --numCpFwd; } | 
477  | 0  |     return c;  | 
478  | 0  | }  | 
479  |  |  | 
480  |  | void  | 
481  | 0  | CollationIterator::backwardNumSkipped(int32_t n, UErrorCode &errorCode) { | 
482  | 0  |     if(skipped != NULL && !skipped->isEmpty()) { | 
483  | 0  |         n = skipped->backwardNumCodePoints(n);  | 
484  | 0  |     }  | 
485  | 0  |     backwardNumCodePoints(n, errorCode);  | 
486  | 0  |     if(numCpFwd >= 0) { numCpFwd += n; } | 
487  | 0  | }  | 
488  |  |  | 
489  |  | uint32_t  | 
490  |  | CollationIterator::nextCE32FromContraction(const CollationData *d, uint32_t contractionCE32,  | 
491  |  |                                            const UChar *p, uint32_t ce32, UChar32 c,  | 
492  | 0  |                                            UErrorCode &errorCode) { | 
493  |  |     // c: next code point after the original one  | 
494  |  |  | 
495  |  |     // Number of code points read beyond the original code point.  | 
496  |  |     // Needed for discontiguous contraction matching.  | 
497  | 0  |     int32_t lookAhead = 1;  | 
498  |  |     // Number of code points read since the last match (initially only c).  | 
499  | 0  |     int32_t sinceMatch = 1;  | 
500  |  |     // Normally we only need a contiguous match,  | 
501  |  |     // and therefore need not remember the suffixes state from before a mismatch for retrying.  | 
502  |  |     // If we are already processing skipped combining marks, then we do track the state.  | 
503  | 0  |     UCharsTrie suffixes(p);  | 
504  | 0  |     if(skipped != NULL && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); } | 
505  | 0  |     UStringTrieResult match = suffixes.firstForCodePoint(c);  | 
506  | 0  |     for(;;) { | 
507  | 0  |         UChar32 nextCp;  | 
508  | 0  |         if(USTRINGTRIE_HAS_VALUE(match)) { | 
509  | 0  |             ce32 = (uint32_t)suffixes.getValue();  | 
510  | 0  |             if(!USTRINGTRIE_HAS_NEXT(match) || (c = nextSkippedCodePoint(errorCode)) < 0) { | 
511  | 0  |                 return ce32;  | 
512  | 0  |             }  | 
513  | 0  |             if(skipped != NULL && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); } | 
514  | 0  |             sinceMatch = 1;  | 
515  | 0  |         } else if(match == USTRINGTRIE_NO_MATCH || (nextCp = nextSkippedCodePoint(errorCode)) < 0) { | 
516  |  |             // No match for c, or partial match (USTRINGTRIE_NO_VALUE) and no further text.  | 
517  |  |             // Back up if necessary, and try a discontiguous contraction.  | 
518  | 0  |             if((contractionCE32 & Collation::CONTRACT_TRAILING_CCC) != 0 &&  | 
519  |  |                     // Discontiguous contraction matching extends an existing match.  | 
520  |  |                     // If there is no match yet, then there is nothing to do.  | 
521  | 0  |                     ((contractionCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||  | 
522  | 0  |                         sinceMatch < lookAhead)) { | 
523  |  |                 // The last character of at least one suffix has lccc!=0,  | 
524  |  |                 // allowing for discontiguous contractions.  | 
525  |  |                 // UCA S2.1.1 only processes non-starters immediately following  | 
526  |  |                 // "a match in the table" (sinceMatch=1).  | 
527  | 0  |                 if(sinceMatch > 1) { | 
528  |  |                     // Return to the state after the last match.  | 
529  |  |                     // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)  | 
530  | 0  |                     backwardNumSkipped(sinceMatch, errorCode);  | 
531  | 0  |                     c = nextSkippedCodePoint(errorCode);  | 
532  | 0  |                     lookAhead -= sinceMatch - 1;  | 
533  | 0  |                     sinceMatch = 1;  | 
534  | 0  |                 }  | 
535  | 0  |                 if(d->getFCD16(c) > 0xff) { | 
536  | 0  |                     return nextCE32FromDiscontiguousContraction(  | 
537  | 0  |                         d, suffixes, ce32, lookAhead, c, errorCode);  | 
538  | 0  |                 }  | 
539  | 0  |             }  | 
540  | 0  |             break;  | 
541  | 0  |         } else { | 
542  |  |             // Continue after partial match (USTRINGTRIE_NO_VALUE) for c.  | 
543  |  |             // It does not have a result value, therefore it is not itself "a match in the table".  | 
544  |  |             // If a partially-matched c has ccc!=0 then  | 
545  |  |             // it might be skipped in discontiguous contraction.  | 
546  | 0  |             c = nextCp;  | 
547  | 0  |             ++sinceMatch;  | 
548  | 0  |         }  | 
549  | 0  |         ++lookAhead;  | 
550  | 0  |         match = suffixes.nextForCodePoint(c);  | 
551  | 0  |     }  | 
552  | 0  |     backwardNumSkipped(sinceMatch, errorCode);  | 
553  | 0  |     return ce32;  | 
554  | 0  | }  | 
555  |  |  | 
556  |  | uint32_t  | 
557  |  | CollationIterator::nextCE32FromDiscontiguousContraction(  | 
558  |  |         const CollationData *d, UCharsTrie &suffixes, uint32_t ce32,  | 
559  |  |         int32_t lookAhead, UChar32 c,  | 
560  | 0  |         UErrorCode &errorCode) { | 
561  | 0  |     if(U_FAILURE(errorCode)) { return 0; } | 
562  |  |  | 
563  |  |     // UCA section 3.3.2 Contractions:  | 
564  |  |     // Contractions that end with non-starter characters  | 
565  |  |     // are known as discontiguous contractions.  | 
566  |  |     // ... discontiguous contractions must be detected in input text  | 
567  |  |     // whenever the final sequence of non-starter characters could be rearranged  | 
568  |  |     // so as to make a contiguous matching sequence that is canonically equivalent.  | 
569  |  |  | 
570  |  |     // UCA: http://www.unicode.org/reports/tr10/#S2.1  | 
571  |  |     // S2.1 Find the longest initial substring S at each point that has a match in the table.  | 
572  |  |     // S2.1.1 If there are any non-starters following S, process each non-starter C.  | 
573  |  |     // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.  | 
574  |  |     //     Note: A non-starter in a string is called blocked  | 
575  |  |     //     if there is another non-starter of the same canonical combining class or zero  | 
576  |  |     //     between it and the last character of canonical combining class 0.  | 
577  |  |     // S2.1.3 If there is a match, replace S by S + C, and remove C.  | 
578  |  |  | 
579  |  |     // First: Is a discontiguous contraction even possible?  | 
580  | 0  |     uint16_t fcd16 = d->getFCD16(c);  | 
581  | 0  |     U_ASSERT(fcd16 > 0xff);  // The caller checked this already, as a shortcut.  | 
582  | 0  |     UChar32 nextCp = nextSkippedCodePoint(errorCode);  | 
583  | 0  |     if(nextCp < 0) { | 
584  |  |         // No further text.  | 
585  | 0  |         backwardNumSkipped(1, errorCode);  | 
586  | 0  |         return ce32;  | 
587  | 0  |     }  | 
588  | 0  |     ++lookAhead;  | 
589  | 0  |     uint8_t prevCC = (uint8_t)fcd16;  | 
590  | 0  |     fcd16 = d->getFCD16(nextCp);  | 
591  | 0  |     if(fcd16 <= 0xff) { | 
592  |  |         // The next code point after c is a starter (S2.1.1 "process each non-starter").  | 
593  | 0  |         backwardNumSkipped(2, errorCode);  | 
594  | 0  |         return ce32;  | 
595  | 0  |     }  | 
596  |  |  | 
597  |  |     // We have read and matched (lookAhead-2) code points,  | 
598  |  |     // read non-matching c and peeked ahead at nextCp.  | 
599  |  |     // Return to the state before the mismatch and continue matching with nextCp.  | 
600  | 0  |     if(skipped == NULL || skipped->isEmpty()) { | 
601  | 0  |         if(skipped == NULL) { | 
602  | 0  |             skipped = new SkippedState();  | 
603  | 0  |             if(skipped == NULL) { | 
604  | 0  |                 errorCode = U_MEMORY_ALLOCATION_ERROR;  | 
605  | 0  |                 return 0;  | 
606  | 0  |             }  | 
607  | 0  |         }  | 
608  | 0  |         suffixes.reset();  | 
609  | 0  |         if(lookAhead > 2) { | 
610  |  |             // Replay the partial match so far.  | 
611  | 0  |             backwardNumCodePoints(lookAhead, errorCode);  | 
612  | 0  |             suffixes.firstForCodePoint(nextCodePoint(errorCode));  | 
613  | 0  |             for(int32_t i = 3; i < lookAhead; ++i) { | 
614  | 0  |                 suffixes.nextForCodePoint(nextCodePoint(errorCode));  | 
615  | 0  |             }  | 
616  |  |             // Skip c (which did not match) and nextCp (which we will try now).  | 
617  | 0  |             forwardNumCodePoints(2, errorCode);  | 
618  | 0  |         }  | 
619  | 0  |         skipped->saveTrieState(suffixes);  | 
620  | 0  |     } else { | 
621  |  |         // Reset to the trie state before the failed match of c.  | 
622  | 0  |         skipped->resetToTrieState(suffixes);  | 
623  | 0  |     }  | 
624  |  |  | 
625  | 0  |     skipped->setFirstSkipped(c);  | 
626  |  |     // Number of code points read since the last match (at this point: c and nextCp).  | 
627  | 0  |     int32_t sinceMatch = 2;  | 
628  | 0  |     c = nextCp;  | 
629  | 0  |     for(;;) { | 
630  | 0  |         UStringTrieResult match;  | 
631  |  |         // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)  | 
632  | 0  |         if(prevCC < (fcd16 >> 8) && USTRINGTRIE_HAS_VALUE(match = suffixes.nextForCodePoint(c))) { | 
633  |  |             // "If there is a match, replace S by S + C, and remove C." (S2.1.3)  | 
634  |  |             // Keep prevCC unchanged.  | 
635  | 0  |             ce32 = (uint32_t)suffixes.getValue();  | 
636  | 0  |             sinceMatch = 0;  | 
637  | 0  |             skipped->recordMatch();  | 
638  | 0  |             if(!USTRINGTRIE_HAS_NEXT(match)) { break; } | 
639  | 0  |             skipped->saveTrieState(suffixes);  | 
640  | 0  |         } else { | 
641  |  |             // No match for "S + C", skip C.  | 
642  | 0  |             skipped->skip(c);  | 
643  | 0  |             skipped->resetToTrieState(suffixes);  | 
644  | 0  |             prevCC = (uint8_t)fcd16;  | 
645  | 0  |         }  | 
646  | 0  |         if((c = nextSkippedCodePoint(errorCode)) < 0) { break; } | 
647  | 0  |         ++sinceMatch;  | 
648  | 0  |         fcd16 = d->getFCD16(c);  | 
649  | 0  |         if(fcd16 <= 0xff) { | 
650  |  |             // The next code point after c is a starter (S2.1.1 "process each non-starter").  | 
651  | 0  |             break;  | 
652  | 0  |         }  | 
653  | 0  |     }  | 
654  | 0  |     backwardNumSkipped(sinceMatch, errorCode);  | 
655  | 0  |     UBool isTopDiscontiguous = skipped->isEmpty();  | 
656  | 0  |     skipped->replaceMatch();  | 
657  | 0  |     if(isTopDiscontiguous && !skipped->isEmpty()) { | 
658  |  |         // We did get a match after skipping one or more combining marks,  | 
659  |  |         // and we are not in a recursive discontiguous contraction.  | 
660  |  |         // Append CEs from the contraction ce32  | 
661  |  |         // and then from the combining marks that we skipped before the match.  | 
662  | 0  |         c = U_SENTINEL;  | 
663  | 0  |         for(;;) { | 
664  | 0  |             appendCEsFromCE32(d, c, ce32, TRUE, errorCode);  | 
665  |  |             // Fetch CE32s for skipped combining marks from the normal data, with fallback,  | 
666  |  |             // rather than from the CollationData where we found the contraction.  | 
667  | 0  |             if(!skipped->hasNext()) { break; } | 
668  | 0  |             c = skipped->next();  | 
669  | 0  |             ce32 = getDataCE32(c);  | 
670  | 0  |             if(ce32 == Collation::FALLBACK_CE32) { | 
671  | 0  |                 d = data->base;  | 
672  | 0  |                 ce32 = d->getCE32(c);  | 
673  | 0  |             } else { | 
674  | 0  |                 d = data;  | 
675  | 0  |             }  | 
676  |  |             // Note: A nested discontiguous-contraction match  | 
677  |  |             // replaces consumed combining marks with newly skipped ones  | 
678  |  |             // and resets the reading position to the beginning.  | 
679  | 0  |         }  | 
680  | 0  |         skipped->clear();  | 
681  | 0  |         ce32 = Collation::NO_CE32;  // Signal to the caller that the result is in the ceBuffer.  | 
682  | 0  |     }  | 
683  | 0  |     return ce32;  | 
684  | 0  | }  | 
685  |  |  | 
686  |  | void  | 
687  | 0  | CollationIterator::appendNumericCEs(uint32_t ce32, UBool forward, UErrorCode &errorCode) { | 
688  |  |     // Collect digits.  | 
689  | 0  |     CharString digits;  | 
690  | 0  |     if(forward) { | 
691  | 0  |         for(;;) { | 
692  | 0  |             char digit = Collation::digitFromCE32(ce32);  | 
693  | 0  |             digits.append(digit, errorCode);  | 
694  | 0  |             if(numCpFwd == 0) { break; } | 
695  | 0  |             UChar32 c = nextCodePoint(errorCode);  | 
696  | 0  |             if(c < 0) { break; } | 
697  | 0  |             ce32 = data->getCE32(c);  | 
698  | 0  |             if(ce32 == Collation::FALLBACK_CE32) { | 
699  | 0  |                 ce32 = data->base->getCE32(c);  | 
700  | 0  |             }  | 
701  | 0  |             if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) { | 
702  | 0  |                 backwardNumCodePoints(1, errorCode);  | 
703  | 0  |                 break;  | 
704  | 0  |             }  | 
705  | 0  |             if(numCpFwd > 0) { --numCpFwd; } | 
706  | 0  |         }  | 
707  | 0  |     } else { | 
708  | 0  |         for(;;) { | 
709  | 0  |             char digit = Collation::digitFromCE32(ce32);  | 
710  | 0  |             digits.append(digit, errorCode);  | 
711  | 0  |             UChar32 c = previousCodePoint(errorCode);  | 
712  | 0  |             if(c < 0) { break; } | 
713  | 0  |             ce32 = data->getCE32(c);  | 
714  | 0  |             if(ce32 == Collation::FALLBACK_CE32) { | 
715  | 0  |                 ce32 = data->base->getCE32(c);  | 
716  | 0  |             }  | 
717  | 0  |             if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) { | 
718  | 0  |                 forwardNumCodePoints(1, errorCode);  | 
719  | 0  |                 break;  | 
720  | 0  |             }  | 
721  | 0  |         }  | 
722  |  |         // Reverse the digit string.  | 
723  | 0  |         char *p = digits.data();  | 
724  | 0  |         char *q = p + digits.length() - 1;  | 
725  | 0  |         while(p < q) { | 
726  | 0  |             char digit = *p;  | 
727  | 0  |             *p++ = *q;  | 
728  | 0  |             *q-- = digit;  | 
729  | 0  |         }  | 
730  | 0  |     }  | 
731  | 0  |     if(U_FAILURE(errorCode)) { return; } | 
732  | 0  |     int32_t pos = 0;  | 
733  | 0  |     do { | 
734  |  |         // Skip leading zeros.  | 
735  | 0  |         while(pos < (digits.length() - 1) && digits[pos] == 0) { ++pos; } | 
736  |  |         // Write a sequence of CEs for at most 254 digits at a time.  | 
737  | 0  |         int32_t segmentLength = digits.length() - pos;  | 
738  | 0  |         if(segmentLength > 254) { segmentLength = 254; } | 
739  | 0  |         appendNumericSegmentCEs(digits.data() + pos, segmentLength, errorCode);  | 
740  | 0  |         pos += segmentLength;  | 
741  | 0  |     } while(U_SUCCESS(errorCode) && pos < digits.length());  | 
742  | 0  | }  | 
743  |  |  | 
744  |  | void  | 
745  | 0  | CollationIterator::appendNumericSegmentCEs(const char *digits, int32_t length, UErrorCode &errorCode) { | 
746  | 0  |     U_ASSERT(1 <= length && length <= 254);  | 
747  | 0  |     U_ASSERT(length == 1 || digits[0] != 0);  | 
748  | 0  |     uint32_t numericPrimary = data->numericPrimary;  | 
749  |  |     // Note: We use primary byte values 2..255: digits are not compressible.  | 
750  | 0  |     if(length <= 7) { | 
751  |  |         // Very dense encoding for small numbers.  | 
752  | 0  |         int32_t value = digits[0];  | 
753  | 0  |         for(int32_t i = 1; i < length; ++i) { | 
754  | 0  |             value = value * 10 + digits[i];  | 
755  | 0  |         }  | 
756  |  |         // Primary weight second byte values:  | 
757  |  |         //     74 byte values   2.. 75 for small numbers in two-byte primary weights.  | 
758  |  |         //     40 byte values  76..115 for medium numbers in three-byte primary weights.  | 
759  |  |         //     16 byte values 116..131 for large numbers in four-byte primary weights.  | 
760  |  |         //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.  | 
761  | 0  |         int32_t firstByte = 2;  | 
762  | 0  |         int32_t numBytes = 74;  | 
763  | 0  |         if(value < numBytes) { | 
764  |  |             // Two-byte primary for 0..73, good for day & month numbers etc.  | 
765  | 0  |             uint32_t primary = numericPrimary | ((firstByte + value) << 16);  | 
766  | 0  |             ceBuffer.append(Collation::makeCE(primary), errorCode);  | 
767  | 0  |             return;  | 
768  | 0  |         }  | 
769  | 0  |         value -= numBytes;  | 
770  | 0  |         firstByte += numBytes;  | 
771  | 0  |         numBytes = 40;  | 
772  | 0  |         if(value < numBytes * 254) { | 
773  |  |             // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.  | 
774  | 0  |             uint32_t primary = numericPrimary |  | 
775  | 0  |                 ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);  | 
776  | 0  |             ceBuffer.append(Collation::makeCE(primary), errorCode);  | 
777  | 0  |             return;  | 
778  | 0  |         }  | 
779  | 0  |         value -= numBytes * 254;  | 
780  | 0  |         firstByte += numBytes;  | 
781  | 0  |         numBytes = 16;  | 
782  | 0  |         if(value < numBytes * 254 * 254) { | 
783  |  |             // Four-byte primary for 10234..1042489=10234+16*254*254-1.  | 
784  | 0  |             uint32_t primary = numericPrimary | (2 + value % 254);  | 
785  | 0  |             value /= 254;  | 
786  | 0  |             primary |= (2 + value % 254) << 8;  | 
787  | 0  |             value /= 254;  | 
788  | 0  |             primary |= (firstByte + value % 254) << 16;  | 
789  | 0  |             ceBuffer.append(Collation::makeCE(primary), errorCode);  | 
790  | 0  |             return;  | 
791  | 0  |         }  | 
792  |  |         // original value > 1042489  | 
793  | 0  |     }  | 
794  | 0  |     U_ASSERT(length >= 7);  | 
795  |  |  | 
796  |  |     // The second primary byte value 132..255 indicates the number of digit pairs (4..127),  | 
797  |  |     // then we generate primary bytes with those pairs.  | 
798  |  |     // Omit trailing 00 pairs.  | 
799  |  |     // Decrement the value for the last pair.  | 
800  |  |  | 
801  |  |     // Set the exponent. 4 pairs->132, 5 pairs->133, ..., 127 pairs->255.  | 
802  | 0  |     int32_t numPairs = (length + 1) / 2;  | 
803  | 0  |     uint32_t primary = numericPrimary | ((132 - 4 + numPairs) << 16);  | 
804  |  |     // Find the length without trailing 00 pairs.  | 
805  | 0  |     while(digits[length - 1] == 0 && digits[length - 2] == 0) { | 
806  | 0  |         length -= 2;  | 
807  | 0  |     }  | 
808  |  |     // Read the first pair.  | 
809  | 0  |     uint32_t pair;  | 
810  | 0  |     int32_t pos;  | 
811  | 0  |     if(length & 1) { | 
812  |  |         // Only "half a pair" if we have an odd number of digits.  | 
813  | 0  |         pair = digits[0];  | 
814  | 0  |         pos = 1;  | 
815  | 0  |     } else { | 
816  | 0  |         pair = digits[0] * 10 + digits[1];  | 
817  | 0  |         pos = 2;  | 
818  | 0  |     }  | 
819  | 0  |     pair = 11 + 2 * pair;  | 
820  |  |     // Add the pairs of digits between pos and length.  | 
821  | 0  |     int32_t shift = 8;  | 
822  | 0  |     while(pos < length) { | 
823  | 0  |         if(shift == 0) { | 
824  |  |             // Every three pairs/bytes we need to store a 4-byte-primary CE  | 
825  |  |             // and start with a new CE with the '0' primary lead byte.  | 
826  | 0  |             primary |= pair;  | 
827  | 0  |             ceBuffer.append(Collation::makeCE(primary), errorCode);  | 
828  | 0  |             primary = numericPrimary;  | 
829  | 0  |             shift = 16;  | 
830  | 0  |         } else { | 
831  | 0  |             primary |= pair << shift;  | 
832  | 0  |             shift -= 8;  | 
833  | 0  |         }  | 
834  | 0  |         pair = 11 + 2 * (digits[pos] * 10 + digits[pos + 1]);  | 
835  | 0  |         pos += 2;  | 
836  | 0  |     }  | 
837  | 0  |     primary |= (pair - 1) << shift;  | 
838  | 0  |     ceBuffer.append(Collation::makeCE(primary), errorCode);  | 
839  | 0  | }  | 
840  |  |  | 
841  |  | int64_t  | 
842  | 0  | CollationIterator::previousCE(UVector32 &offsets, UErrorCode &errorCode) { | 
843  | 0  |     if(ceBuffer.length > 0) { | 
844  |  |         // Return the previous buffered CE.  | 
845  | 0  |         return ceBuffer.get(--ceBuffer.length);  | 
846  | 0  |     }  | 
847  | 0  |     offsets.removeAllElements();  | 
848  | 0  |     int32_t limitOffset = getOffset();  | 
849  | 0  |     UChar32 c = previousCodePoint(errorCode);  | 
850  | 0  |     if(c < 0) { return Collation::NO_CE; } | 
851  | 0  |     if(data->isUnsafeBackward(c, isNumeric)) { | 
852  | 0  |         return previousCEUnsafe(c, offsets, errorCode);  | 
853  | 0  |     }  | 
854  |  |     // Simple, safe-backwards iteration:  | 
855  |  |     // Get a CE going backwards, handle prefixes but no contractions.  | 
856  | 0  |     uint32_t ce32 = data->getCE32(c);  | 
857  | 0  |     const CollationData *d;  | 
858  | 0  |     if(ce32 == Collation::FALLBACK_CE32) { | 
859  | 0  |         d = data->base;  | 
860  | 0  |         ce32 = d->getCE32(c);  | 
861  | 0  |     } else { | 
862  | 0  |         d = data;  | 
863  | 0  |     }  | 
864  | 0  |     if(Collation::isSimpleOrLongCE32(ce32)) { | 
865  | 0  |         return Collation::ceFromCE32(ce32);  | 
866  | 0  |     }  | 
867  | 0  |     appendCEsFromCE32(d, c, ce32, FALSE, errorCode);  | 
868  | 0  |     if(U_SUCCESS(errorCode)) { | 
869  | 0  |         if(ceBuffer.length > 1) { | 
870  | 0  |             offsets.addElement(getOffset(), errorCode);  | 
871  |  |             // For an expansion, the offset of each non-initial CE is the limit offset,  | 
872  |  |             // consistent with forward iteration.  | 
873  | 0  |             while(offsets.size() <= ceBuffer.length) { | 
874  | 0  |                 offsets.addElement(limitOffset, errorCode);  | 
875  | 0  |             }  | 
876  | 0  |         }  | 
877  | 0  |         return ceBuffer.get(--ceBuffer.length);  | 
878  | 0  |     } else { | 
879  | 0  |         return Collation::NO_CE_PRIMARY;  | 
880  | 0  |     }  | 
881  | 0  | }  | 
882  |  |  | 
883  |  | int64_t  | 
884  | 0  | CollationIterator::previousCEUnsafe(UChar32 c, UVector32 &offsets, UErrorCode &errorCode) { | 
885  |  |     // We just move through the input counting safe and unsafe code points  | 
886  |  |     // without collecting the unsafe-backward substring into a buffer and  | 
887  |  |     // switching to it.  | 
888  |  |     // This is to keep the logic simple. Otherwise we would have to handle  | 
889  |  |     // prefix matching going before the backward buffer, switching  | 
890  |  |     // to iteration and back, etc.  | 
891  |  |     // In the most important case of iterating over a normal string,  | 
892  |  |     // reading from the string itself is already maximally fast.  | 
893  |  |     // The only drawback there is that after getting the CEs we always  | 
894  |  |     // skip backward to the safe character rather than switching out  | 
895  |  |     // of a backwardBuffer.  | 
896  |  |     // But this should not be the common case for previousCE(),  | 
897  |  |     // and correctness and maintainability are more important than  | 
898  |  |     // complex optimizations.  | 
899  |  |     // Find the first safe character before c.  | 
900  | 0  |     int32_t numBackward = 1;  | 
901  | 0  |     while((c = previousCodePoint(errorCode)) >= 0) { | 
902  | 0  |         ++numBackward;  | 
903  | 0  |         if(!data->isUnsafeBackward(c, isNumeric)) { | 
904  | 0  |             break;  | 
905  | 0  |         }  | 
906  | 0  |     }  | 
907  |  |     // Set the forward iteration limit.  | 
908  |  |     // Note: This counts code points.  | 
909  |  |     // We cannot enforce a limit in the middle of a surrogate pair or similar.  | 
910  | 0  |     numCpFwd = numBackward;  | 
911  |  |     // Reset the forward iterator.  | 
912  | 0  |     cesIndex = 0;  | 
913  | 0  |     U_ASSERT(ceBuffer.length == 0);  | 
914  |  |     // Go forward and collect the CEs.  | 
915  | 0  |     int32_t offset = getOffset();  | 
916  | 0  |     while(numCpFwd > 0) { | 
917  |  |         // nextCE() normally reads one code point.  | 
918  |  |         // Contraction matching and digit specials read more and check numCpFwd.  | 
919  | 0  |         --numCpFwd;  | 
920  |  |         // Append one or more CEs to the ceBuffer.  | 
921  | 0  |         (void)nextCE(errorCode);  | 
922  | 0  |         U_ASSERT(U_FAILURE(errorCode) || ceBuffer.get(ceBuffer.length - 1) != Collation::NO_CE);  | 
923  |  |         // No need to loop for getting each expansion CE from nextCE().  | 
924  | 0  |         cesIndex = ceBuffer.length;  | 
925  |  |         // However, we need to write an offset for each CE.  | 
926  |  |         // This is for CollationElementIterator::getOffset() to return  | 
927  |  |         // intermediate offsets from the unsafe-backwards segment.  | 
928  | 0  |         U_ASSERT(offsets.size() < ceBuffer.length);  | 
929  | 0  |         offsets.addElement(offset, errorCode);  | 
930  |  |         // For an expansion, the offset of each non-initial CE is the limit offset,  | 
931  |  |         // consistent with forward iteration.  | 
932  | 0  |         offset = getOffset();  | 
933  | 0  |         while(offsets.size() < ceBuffer.length) { | 
934  | 0  |             offsets.addElement(offset, errorCode);  | 
935  | 0  |         }  | 
936  | 0  |     }  | 
937  | 0  |     U_ASSERT(offsets.size() == ceBuffer.length);  | 
938  |  |     // End offset corresponding to just after the unsafe-backwards segment.  | 
939  | 0  |     offsets.addElement(offset, errorCode);  | 
940  |  |     // Reset the forward iteration limit  | 
941  |  |     // and move backward to before the segment for which we fetched CEs.  | 
942  | 0  |     numCpFwd = -1;  | 
943  | 0  |     backwardNumCodePoints(numBackward, errorCode);  | 
944  |  |     // Use the collected CEs and return the last one.  | 
945  | 0  |     cesIndex = 0;  // Avoid cesIndex > ceBuffer.length when that gets decremented.  | 
946  | 0  |     if(U_SUCCESS(errorCode)) { | 
947  | 0  |         return ceBuffer.get(--ceBuffer.length);  | 
948  | 0  |     } else { | 
949  | 0  |         return Collation::NO_CE_PRIMARY;  | 
950  | 0  |     }  | 
951  | 0  | }  | 
952  |  |  | 
953  |  | U_NAMESPACE_END  | 
954  |  |  | 
955  |  | #endif  // !UCONFIG_NO_COLLATION  |