/src/icu/source/i18n/utf8collationiterator.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | * Copyright (C) 2012-2014, International Business Machines  | 
6  |  | * Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | * utf8collationiterator.cpp  | 
9  |  | *  | 
10  |  | * created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)  | 
11  |  | * created by: Markus W. Scherer  | 
12  |  | */  | 
13  |  |  | 
14  |  | #include "unicode/utypes.h"  | 
15  |  |  | 
16  |  | #if !UCONFIG_NO_COLLATION  | 
17  |  |  | 
18  |  | #include "unicode/utf8.h"  | 
19  |  | #include "charstr.h"  | 
20  |  | #include "cmemory.h"  | 
21  |  | #include "collation.h"  | 
22  |  | #include "collationdata.h"  | 
23  |  | #include "collationfcd.h"  | 
24  |  | #include "collationiterator.h"  | 
25  |  | #include "normalizer2impl.h"  | 
26  |  | #include "uassert.h"  | 
27  |  | #include "utf8collationiterator.h"  | 
28  |  |  | 
29  |  | U_NAMESPACE_BEGIN  | 
30  |  |  | 
// Destructor. The body is empty: this function itself releases nothing.
31  | 0  | UTF8CollationIterator::~UTF8CollationIterator() {} | 
32  |  |  | 
// Repositions the iterator: calls reset() (defined outside this listing) and
// then makes newOffset the current index into u8.
//
// newOffset  byte index stored into pos; it is not validated here.
33  |  | void  | 
34  | 0  | UTF8CollationIterator::resetToOffset(int32_t newOffset) { | 
35  | 0  |     reset();  | 
36  | 0  |     pos = newOffset;  | 
37  | 0  | }  | 
38  |  |  | 
// Returns the current index into u8 (the value of pos). Does not modify state.
39  |  | int32_t  | 
40  | 0  | UTF8CollationIterator::getOffset() const { | 
41  | 0  |     return pos;  | 
42  | 0  | }  | 
43  |  |  | 
// Fetches the next code point from the UTF-8 text u8 and looks up its 32-bit
// collation element (CE32).
//
// c          [out] the decoded code point, or U_SENTINEL when pos == length.
// errorCode  unused in this function (the parameter name is commented out).
// Returns    Collation::FALLBACK_CE32 when pos == length; otherwise the value
//            read from trie / data for c.
//
// pos is advanced past the bytes that were consumed. Decoding paths, in order:
//   1. single byte (U8_IS_SINGLE): direct trie->data32[c] lookup;
//   2. lead byte E0..EF followed by two accepted trail bytes (3-byte form);
//   3. lead byte C2..DF followed by one accepted trail byte (2-byte form);
//   4. everything else: utf8_nextCharSafeBody() and data->getCE32().
44  |  | uint32_t  | 
45  | 0  | UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) { | 
46  | 0  |     if(pos == length) { | 
47  | 0  |         c = U_SENTINEL;  | 
48  | 0  |         return Collation::FALLBACK_CE32;  | 
49  | 0  |     }  | 
50  |  |     // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().  | 
51  | 0  |     c = u8[pos++];  | 
52  | 0  |     if(U8_IS_SINGLE(c)) { | 
53  |  |         // ASCII 00..7F  | 
54  | 0  |         return trie->data32[c];  | 
55  | 0  |     }  | 
56  | 0  |     uint8_t t1, t2;  | 
    // A negative length bypasses the bounds test here; elsewhere in this file a
    // negative length is treated as "end not yet known" (see foundNULTerminator()).
    // t2 is a uint8_t, so (byte - 0x80) <= 0x3f holds only for bytes 80..BF.
57  | 0  |     if(0xe0 <= c && c < 0xf0 &&  | 
58  | 0  |             ((pos + 1) < length || length < 0) &&  | 
59  | 0  |             U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&  | 
60  | 0  |             (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) { | 
61  |  |         // U+0800..U+FFFF except surrogates  | 
62  | 0  |         c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);  | 
63  | 0  |         pos += 2;  | 
64  | 0  |         return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);  | 
65  | 0  |     } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { | 
66  |  |         // U+0080..U+07FF  | 
        // The lookup uses the lead byte still held in c, so it must happen
        // before c is overwritten with the assembled code point.
67  | 0  |         uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];  | 
68  | 0  |         c = ((c & 0x1f) << 6) | t1;  | 
69  | 0  |         ++pos;  | 
70  | 0  |         return ce32;  | 
71  | 0  |     } else { | 
72  |  |         // Function call for supplementary code points and error cases.  | 
73  |  |         // Illegal byte sequences yield U+FFFD.  | 
74  | 0  |         c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);  | 
75  | 0  |         return data->getCE32(c);  | 
76  | 0  |     }  | 
77  | 0  | }  | 
78  |  |  | 
// Records the end of the text when the length was not known in advance.
//
// If length is negative, pos is decremented by one and that new position is
// stored as length; returns TRUE. Otherwise nothing changes and FALSE is
// returned.
// NOTE(review): the decrement presumably undoes the read of the NUL byte the
// caller has just consumed - confirm against the caller.
79  |  | UBool  | 
80  | 0  | UTF8CollationIterator::foundNULTerminator() { | 
81  | 0  |     if(length < 0) { | 
82  | 0  |         length = --pos;  | 
83  | 0  |         return TRUE;  | 
84  | 0  |     } else { | 
85  | 0  |         return FALSE;  | 
86  | 0  |     }  | 
87  | 0  | }  | 
88  |  |  | 
// Always returns TRUE for this iterator; reads and modifies no state.
89  |  | UBool  | 
90  | 0  | UTF8CollationIterator::forbidSurrogateCodePoints() const { | 
91  | 0  |     return TRUE;  | 
92  | 0  | }  | 
93  |  |  | 
// Decodes and returns the code point at pos, advancing pos past it.
//
// errorCode  unused in this function (the parameter name is commented out).
// Returns    U_SENTINEL when pos == length, or when length is negative and
//            the byte at pos is 0 (in that case length is set to pos, so the
//            end is known from then on); otherwise the code point produced by
//            U8_NEXT_OR_FFFD.
94  |  | UChar32  | 
95  | 0  | UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { | 
96  | 0  |     if(pos == length) { | 
97  | 0  |         return U_SENTINEL;  | 
98  | 0  |     }  | 
99  | 0  |     if(u8[pos] == 0 && length < 0) { | 
100  | 0  |         length = pos;  | 
101  | 0  |         return U_SENTINEL;  | 
102  | 0  |     }  | 
103  | 0  |     UChar32 c;  | 
104  | 0  |     U8_NEXT_OR_FFFD(u8, pos, length, c);  | 
105  | 0  |     return c;  | 
106  | 0  | }  | 
107  |  |  | 
// Decodes and returns the code point that ends at pos, moving pos back to its
// first byte.
//
// errorCode  unused in this function (the parameter name is commented out).
// Returns    U_SENTINEL when pos == 0; otherwise the code point produced by
//            U8_PREV_OR_FFFD with 0 as the lower bound.
108  |  | UChar32  | 
109  | 0  | UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { | 
110  | 0  |     if(pos == 0) { | 
111  | 0  |         return U_SENTINEL;  | 
112  | 0  |     }  | 
113  | 0  |     UChar32 c;  | 
114  | 0  |     U8_PREV_OR_FFFD(u8, 0, pos, c);  | 
115  | 0  |     return c;  | 
116  | 0  | }  | 
117  |  |  | 
// Moves pos forward over num code points by delegating to U8_FWD_N with
// length as the limit.
//
// num        number of code points to skip.
// errorCode  unused in this function (the parameter name is commented out).
118  |  | void  | 
119  | 0  | UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { | 
120  | 0  |     U8_FWD_N(u8, pos, length, num);  | 
121  | 0  | }  | 
122  |  |  | 
// Moves pos backward over num code points by delegating to U8_BACK_N with 0
// as the start bound.
//
// num        number of code points to step back.
// errorCode  unused in this function (the parameter name is commented out).
123  |  | void  | 
124  | 0  | UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { | 
125  | 0  |     U8_BACK_N(u8, 0, pos, num);  | 
126  | 0  | }  | 
127  |  |  | 
128  |  | // FCDUTF8CollationIterator ------------------------------------------------ ***  | 
129  |  |  | 
// Destructor. The body is empty: this function itself releases nothing.
130  | 0  | FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {} | 
131  |  |  | 
// Repositions the iterator: calls reset() (defined outside this listing), sets
// both start and pos to newOffset, and returns to the CHECK_FWD state.
//
// newOffset  byte index into u8; it is not validated here.
132  |  | void  | 
133  | 0  | FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) { | 
134  | 0  |     reset();  | 
135  | 0  |     start = pos = newOffset;  | 
136  | 0  |     state = CHECK_FWD;  | 
137  | 0  | }  | 
138  |  |  | 
// Returns the current offset expressed as an index into the input text u8.
//
// Outside the IN_NORMALIZED state pos already is such an index. In
// IN_NORMALIZED, pos indexes the `normalized` string instead (nextSegment()
// and previousSegment() set it to 0 or normalized.length()), so the input
// offset is reported as start when pos == 0 and as limit for any other pos.
139  |  | int32_t  | 
140  | 0  | FCDUTF8CollationIterator::getOffset() const { | 
141  | 0  |     if(state != IN_NORMALIZED) { | 
142  | 0  |         return pos;  | 
143  | 0  |     } else if(pos == 0) { | 
144  | 0  |         return start;  | 
145  | 0  |     } else { | 
146  | 0  |         return limit;  | 
147  | 0  |     }  | 
148  | 0  | }  | 
149  |  |  | 
// Fetches the next code point and its CE32, normalizing input on the fly when
// the incremental FCD check fails.
//
// c          [out] the code point (or UTF-16 code unit when reading from
//            `normalized`), or U_SENTINEL at the end of the text or when
//            nextSegment() fails.
// errorCode  passed to nextSegment() and to the base-class implementation.
// Returns    Collation::FALLBACK_CE32 together with c == U_SENTINEL;
//            Collation::FFFD_CE32 when the slow path decodes U+FFFD;
//            otherwise the CE32 looked up for c.
//
// The loop dispatches on state:
//   CHECK_FWD       decode one code point from u8 the same way
//                   UTF8CollationIterator::handleNextCE32() does; if the
//                   CollationFCD tests say it may need checking, rewind pos to
//                   the start of that code point and call nextSegment().
//   IN_FCD_SEGMENT  (pos != limit) delegate to the base-class implementation.
//   IN_NORMALIZED   (pos != normalized.length()) read the next code unit from
//                   `normalized`.
//   otherwise       switchToForward() and loop again.
// `break` leaves the loop for the shared BMP trie lookup on the last line.
150  |  | uint32_t  | 
151  | 0  | FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { | 
152  | 0  |     for(;;) { | 
153  | 0  |         if(state == CHECK_FWD) { | 
154  |  |             // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.  | 
155  | 0  |             if(pos == length) { | 
156  | 0  |                 c = U_SENTINEL;  | 
157  | 0  |                 return Collation::FALLBACK_CE32;  | 
158  | 0  |             }  | 
159  | 0  |             c = u8[pos++];  | 
160  | 0  |             if(U8_IS_SINGLE(c)) { | 
161  |  |                 // ASCII 00..7F  | 
162  | 0  |                 return trie->data32[c];  | 
163  | 0  |             }  | 
164  | 0  |             uint8_t t1, t2;  | 
165  | 0  |             if(0xe0 <= c && c < 0xf0 &&  | 
166  | 0  |                     ((pos + 1) < length || length < 0) &&  | 
167  | 0  |                     U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&  | 
168  | 0  |                     (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) { | 
169  |  |                 // U+0800..U+FFFF except surrogates  | 
170  | 0  |                 c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);  | 
171  | 0  |                 pos += 2;  | 
172  | 0  |                 if(CollationFCD::hasTccc(c) &&  | 
173  | 0  |                         (CollationFCD::maybeTibetanCompositeVowel(c) ||  | 
174  | 0  |                             (pos != length && nextHasLccc()))) { | 
                    // Rewind over the 3 bytes just consumed so that
                    // nextSegment() starts at this code point.
175  | 0  |                     pos -= 3;  | 
176  | 0  |                 } else { | 
177  | 0  |                     break;  // return CE32(BMP)  | 
178  | 0  |                 }  | 
179  | 0  |             } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { | 
180  |  |                 // U+0080..U+07FF  | 
181  | 0  |                 uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];  | 
182  | 0  |                 c = ((c & 0x1f) << 6) | t1;  | 
183  | 0  |                 ++pos;  | 
184  | 0  |                 if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) { | 
                    // Rewind over the 2 bytes just consumed.
185  | 0  |                     pos -= 2;  | 
186  | 0  |                 } else { | 
187  | 0  |                     return ce32;  | 
188  | 0  |                 }  | 
189  | 0  |             } else { | 
190  |  |                 // Function call for supplementary code points and error cases.  | 
191  |  |                 // Illegal byte sequences yield U+FFFD.  | 
192  | 0  |                 c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);  | 
193  | 0  |                 if(c == 0xfffd) { | 
194  | 0  |                     return Collation::FFFD_CE32;  | 
195  | 0  |                 } else { | 
196  | 0  |                     U_ASSERT(c > 0xffff);  | 
                    // The FCD lookup is keyed by the lead surrogate of the
                    // supplementary code point.
197  | 0  |                     if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) { | 
                        // Rewind over the 4-byte sequence just consumed.
198  | 0  |                         pos -= 4;  | 
199  | 0  |                     } else { | 
200  | 0  |                         return data->getCE32FromSupplementary(c);  | 
201  | 0  |                     }  | 
202  | 0  |                 }  | 
203  | 0  |             }  | 
            // Reached only via one of the rewinds above: check the segment
            // starting at pos (and normalize it if needed), then loop again in
            // the state that nextSegment() selected.
204  | 0  |             if(!nextSegment(errorCode)) { | 
205  | 0  |                 c = U_SENTINEL;  | 
206  | 0  |                 return Collation::FALLBACK_CE32;  | 
207  | 0  |             }  | 
208  | 0  |             continue;  | 
209  | 0  |         } else if(state == IN_FCD_SEGMENT && pos != limit) { | 
210  | 0  |             return UTF8CollationIterator::handleNextCE32(c, errorCode);  | 
211  | 0  |         } else if(state == IN_NORMALIZED && pos != normalized.length()) { | 
212  | 0  |             c = normalized[pos++];  | 
213  | 0  |             break;  | 
214  | 0  |         } else { | 
215  | 0  |             switchToForward();  | 
216  | 0  |         }  | 
217  | 0  |     }  | 
218  | 0  |     return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);  | 
219  | 0  | }  | 
220  |  |  | 
// Reports whether the code point starting at u8[pos] passes
// CollationFCD::hasLccc(), without moving pos (decoding uses a local copy).
//
// Precondition (asserted): state == CHECK_FWD and pos != length.
// Returns FALSE immediately for lead bytes that the comments below describe
// as FCD-inert; supplementary code points are tested via their lead surrogate.
221  |  | UBool  | 
222  | 0  | FCDUTF8CollationIterator::nextHasLccc() const { | 
223  | 0  |     U_ASSERT(state == CHECK_FWD && pos != length);  | 
224  |  |     // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.  | 
225  |  |     // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)  | 
226  | 0  |     UChar32 c = u8[pos];  | 
227  | 0  |     if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; } | 
228  | 0  |     int32_t i = pos;  | 
229  | 0  |     U8_NEXT_OR_FFFD(u8, i, length, c);  | 
230  | 0  |     if(c > 0xffff) { c = U16_LEAD(c); } | 
231  | 0  |     return CollationFCD::hasLccc(c);  | 
232  | 0  | }  | 
233  |  |  | 
// Reports whether the code point ending just before pos passes
// CollationFCD::hasTccc(), without moving pos (decoding uses a local copy).
//
// Precondition (asserted): state == CHECK_BWD and pos != 0.
// Returns FALSE immediately when the preceding byte is a single-byte
// character; supplementary code points are tested via their lead surrogate.
234  |  | UBool  | 
235  | 0  | FCDUTF8CollationIterator::previousHasTccc() const { | 
236  | 0  |     U_ASSERT(state == CHECK_BWD && pos != 0);  | 
237  | 0  |     UChar32 c = u8[pos - 1];  | 
238  | 0  |     if(U8_IS_SINGLE(c)) { return FALSE; } | 
239  | 0  |     int32_t i = pos;  | 
240  | 0  |     U8_PREV_OR_FFFD(u8, 0, i, c);  | 
241  | 0  |     if(c > 0xffff) { c = U16_LEAD(c); } | 
242  | 0  |     return CollationFCD::hasTccc(c);  | 
243  | 0  | }  | 
244  |  |  | 
// Reads the code unit at pos in `normalized` and consumes it only if it is a
// trail surrogate.
//
// Returns 0 when state is not IN_NORMALIZED. Otherwise returns normalized[pos]
// whether or not it is a trail surrogate; pos is incremented only when
// U16_IS_TRAIL() accepts it.
// Precondition (asserted): pos < normalized.length() in IN_NORMALIZED.
245  |  | UChar  | 
246  | 0  | FCDUTF8CollationIterator::handleGetTrailSurrogate() { | 
247  | 0  |     if(state != IN_NORMALIZED) { return 0; } | 
248  | 0  |     U_ASSERT(pos < normalized.length());  | 
249  | 0  |     UChar trail;  | 
250  | 0  |     if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; } | 
251  | 0  |     return trail;  | 
252  | 0  | }  | 
253  |  |  | 
// Records the end of the text when the length was not known in advance.
//
// Only acts in the CHECK_FWD state with a negative length: pos is decremented
// by one, that position is stored as length, and TRUE is returned. In every
// other case nothing changes and FALSE is returned.
// NOTE(review): the decrement presumably undoes the read of the NUL byte the
// caller has just consumed - confirm against the caller.
254  |  | UBool  | 
255  | 0  | FCDUTF8CollationIterator::foundNULTerminator() { | 
256  | 0  |     if(state == CHECK_FWD && length < 0) { | 
257  | 0  |         length = --pos;  | 
258  | 0  |         return TRUE;  | 
259  | 0  |     } else { | 
260  | 0  |         return FALSE;  | 
261  | 0  |     }  | 
262  | 0  | }  | 
263  |  |  | 
// Returns the next code point, switching to normalized text when the
// incremental FCD check requires it.
//
// errorCode  passed to nextSegment().
// Returns    U_SENTINEL at the end of the text (pos == length, or a 0 byte
//            while length is negative) or when nextSegment() fails; otherwise
//            the next code point.
//
// The loop dispatches on state:
//   CHECK_FWD       single-byte characters are returned directly. Other code
//                   points are decoded; if the CollationFCD tests say the
//                   character may need checking, pos is rewound and
//                   nextSegment() is called before looping again.
//   IN_FCD_SEGMENT  (pos != limit) decode straight from u8.
//   IN_NORMALIZED   (pos != normalized.length()) read from `normalized`,
//                   advancing by the UTF-16 length of the code point.
//   otherwise       switchToForward() and loop again.
// Note: unlike UTF8CollationIterator::nextCodePoint(), the NUL check here does
// not store pos into length.
264  |  | UChar32  | 
265  | 0  | FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) { | 
266  | 0  |     UChar32 c;  | 
267  | 0  |     for(;;) { | 
268  | 0  |         if(state == CHECK_FWD) { | 
269  | 0  |             if(pos == length || ((c = u8[pos]) == 0 && length < 0)) { | 
270  | 0  |                 return U_SENTINEL;  | 
271  | 0  |             }  | 
272  | 0  |             if(U8_IS_SINGLE(c)) { | 
273  | 0  |                 ++pos;  | 
274  | 0  |                 return c;  | 
275  | 0  |             }  | 
276  | 0  |             U8_NEXT_OR_FFFD(u8, pos, length, c);  | 
277  | 0  |             if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) &&  | 
278  | 0  |                     (CollationFCD::maybeTibetanCompositeVowel(c) ||  | 
279  | 0  |                         (pos != length && nextHasLccc()))) { | 
280  |  |                 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence  | 
281  |  |                 // and we can use U8_LENGTH() rather than a previous-position variable.  | 
282  | 0  |                 pos -= U8_LENGTH(c);  | 
283  | 0  |                 if(!nextSegment(errorCode)) { | 
284  | 0  |                     return U_SENTINEL;  | 
285  | 0  |                 }  | 
286  | 0  |                 continue;  | 
287  | 0  |             }  | 
288  | 0  |             return c;  | 
289  | 0  |         } else if(state == IN_FCD_SEGMENT && pos != limit) { | 
290  | 0  |             U8_NEXT_OR_FFFD(u8, pos, length, c);  | 
291  | 0  |             return c;  | 
292  | 0  |         } else if(state == IN_NORMALIZED && pos != normalized.length()) { | 
293  | 0  |             c = normalized.char32At(pos);  | 
294  | 0  |             pos += U16_LENGTH(c);  | 
295  | 0  |             return c;  | 
296  | 0  |         } else { | 
297  | 0  |             switchToForward();  | 
298  | 0  |         }  | 
299  | 0  |     }  | 
300  | 0  | }  | 
301  |  |  | 
// Returns the code point before the current position, switching to normalized
// text when the incremental FCD check requires it. Mirror image of
// nextCodePoint().
//
// errorCode  passed to previousSegment().
// Returns    U_SENTINEL when pos == 0 in the CHECK_BWD state or when
//            previousSegment() fails; otherwise the previous code point.
//
// The loop dispatches on state:
//   CHECK_BWD       single-byte characters are returned directly. Other code
//                   points are decoded backward; if the CollationFCD tests say
//                   the character may need checking, pos is restored and
//                   previousSegment() is called before looping again.
//   IN_FCD_SEGMENT  (pos != start) decode backward straight from u8.
//   state >= IN_NORMALIZED  (pos != 0) read backward from `normalized`.
//   otherwise       switchToBackward() and loop again.
302  |  | UChar32  | 
303  | 0  | FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) { | 
304  | 0  |     UChar32 c;  | 
305  | 0  |     for(;;) { | 
306  | 0  |         if(state == CHECK_BWD) { | 
307  | 0  |             if(pos == 0) { | 
308  | 0  |                 return U_SENTINEL;  | 
309  | 0  |             }  | 
310  | 0  |             if(U8_IS_SINGLE(c = u8[pos - 1])) { | 
311  | 0  |                 --pos;  | 
312  | 0  |                 return c;  | 
313  | 0  |             }  | 
314  | 0  |             U8_PREV_OR_FFFD(u8, 0, pos, c);  | 
315  | 0  |             if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) &&  | 
316  | 0  |                     (CollationFCD::maybeTibetanCompositeVowel(c) ||  | 
317  | 0  |                         (pos != 0 && previousHasTccc()))) { | 
318  |  |                 // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence  | 
319  |  |                 // and we can use U8_LENGTH() rather than a previous-position variable.  | 
320  | 0  |                 pos += U8_LENGTH(c);  | 
321  | 0  |                 if(!previousSegment(errorCode)) { | 
322  | 0  |                     return U_SENTINEL;  | 
323  | 0  |                 }  | 
324  | 0  |                 continue;  | 
325  | 0  |             }  | 
326  | 0  |             return c;  | 
327  | 0  |         } else if(state == IN_FCD_SEGMENT && pos != start) { | 
328  | 0  |             U8_PREV_OR_FFFD(u8, 0, pos, c);  | 
329  | 0  |             return c;  | 
330  | 0  |         } else if(state >= IN_NORMALIZED && pos != 0) { | 
331  | 0  |             c = normalized.char32At(pos - 1);  | 
332  | 0  |             pos -= U16_LENGTH(c);  | 
333  | 0  |             return c;  | 
334  | 0  |         } else { | 
335  | 0  |             switchToBackward();  | 
336  | 0  |         }  | 
337  | 0  |     }  | 
338  | 0  | }  | 
339  |  |  | 
// Skips forward over up to num code points by calling nextCodePoint()
// repeatedly. Stops early as soon as nextCodePoint() returns a negative value.
//
// num        maximum number of code points to skip; values <= 0 do nothing.
// errorCode  passed through to nextCodePoint().
340  |  | void  | 
341  | 0  | FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) { | 
342  |  |     // Specify the class to avoid a virtual-function indirection.  | 
343  |  |     // In Java, we would declare this class final.  | 
344  | 0  |     while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) { | 
345  | 0  |         --num;  | 
346  | 0  |     }  | 
347  | 0  | }  | 
348  |  |  | 
// Steps backward over up to num code points by calling previousCodePoint()
// repeatedly. Stops early as soon as previousCodePoint() returns a negative
// value.
//
// num        maximum number of code points to step back; values <= 0 do nothing.
// errorCode  passed through to previousCodePoint().
349  |  | void  | 
350  | 0  | FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) { | 
351  |  |     // Specify the class to avoid a virtual-function indirection.  | 
352  |  |     // In Java, we would declare this class final.  | 
353  | 0  |     while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) { | 
354  | 0  |         --num;  | 
355  | 0  |     }  | 
356  | 0  | }  | 
357  |  |  | 
// Prepares the state machine for forward iteration.
//
// Precondition (asserted): state is CHECK_BWD, or IN_FCD_SEGMENT with
// pos == limit, or IN_NORMALIZED with pos == normalized.length().
//
// Transitions:
//   CHECK_BWD       start = pos; becomes CHECK_FWD when pos == limit,
//                   otherwise IN_FCD_SEGMENT.
//   IN_FCD_SEGMENT  becomes CHECK_FWD; pos, start and limit are unchanged.
//   IN_NORMALIZED   start and pos are both set to limit, then CHECK_FWD.
358  |  | void  | 
359  | 0  | FCDUTF8CollationIterator::switchToForward() { | 
360  | 0  |     U_ASSERT(state == CHECK_BWD ||  | 
361  | 0  |              (state == IN_FCD_SEGMENT && pos == limit) ||  | 
362  | 0  |              (state == IN_NORMALIZED && pos == normalized.length()));  | 
363  | 0  |     if(state == CHECK_BWD) { | 
364  |  |         // Turn around from backward checking.  | 
365  | 0  |         start = pos;  | 
366  | 0  |         if(pos == limit) { | 
367  | 0  |             state = CHECK_FWD;  // Check forward.  | 
368  | 0  |         } else {  // pos < limit | 
369  | 0  |             state = IN_FCD_SEGMENT;  // Stay in FCD segment.  | 
370  | 0  |         }  | 
371  | 0  |     } else { | 
372  |  |         // Reached the end of the FCD segment.  | 
373  | 0  |         if(state == IN_FCD_SEGMENT) { | 
374  |  |             // The input text segment is FCD, extend it forward.  | 
375  | 0  |         } else { | 
376  |  |             // The input text segment needed to be normalized.  | 
377  |  |             // Switch to checking forward from it.  | 
378  | 0  |             start = pos = limit;  | 
379  | 0  |         }  | 
380  | 0  |         state = CHECK_FWD;  | 
381  | 0  |     }  | 
382  | 0  | }  | 
383  |  |  | 
// Scans forward from pos to delimit the next text segment and decides whether
// it can be read as-is or must be normalized first.
//
// errorCode  in/out; FALSE is returned immediately if it already indicates
//            failure, and normalize() may set it.
// Returns    TRUE when a segment was set up, FALSE on error.
// Precondition (asserted): state == CHECK_FWD and pos != length.
//
// For each code point, nfcImpl.getFCD16() supplies a 16-bit value whose high
// byte is used as the lead combining class and whose low byte as the trail
// combining class. Outcomes:
//   - Segment passes the check: limit = end of segment, pos = segmentStart,
//     state = IN_FCD_SEGMENT.
//   - A lead class is lower than the preceding trail class, or the fcd16 value
//     is that of a Tibetan composite vowel: the segment is extended up to the
//     next code point whose fcd16 <= 0xff, decomposed into `normalized`, and
//     state = IN_NORMALIZED with start/limit bracketing the input segment and
//     pos = 0 (an index into `normalized`).
384  |  | UBool  | 
385  | 0  | FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) { | 
386  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
387  | 0  |     U_ASSERT(state == CHECK_FWD && pos != length);  | 
388  |  |     // The input text [start..pos[ passes the FCD check.  | 
389  | 0  |     int32_t segmentStart = pos;  | 
390  |  |     // Collect the characters being checked, in case they need to be normalized.  | 
391  | 0  |     UnicodeString s;  | 
392  | 0  |     uint8_t prevCC = 0;  | 
393  | 0  |     for(;;) { | 
394  |  |         // Fetch the next character and its fcd16 value.  | 
395  | 0  |         int32_t cpStart = pos;  | 
396  | 0  |         UChar32 c;  | 
397  | 0  |         U8_NEXT_OR_FFFD(u8, pos, length, c);  | 
398  | 0  |         uint16_t fcd16 = nfcImpl.getFCD16(c);  | 
399  | 0  |         uint8_t leadCC = (uint8_t)(fcd16 >> 8);  | 
        // The first code point of the segment is always taken, even when its
        // lead class is 0 (cpStart == segmentStart).
400  | 0  |         if(leadCC == 0 && cpStart != segmentStart) { | 
401  |  |             // FCD boundary before this character.  | 
402  | 0  |             pos = cpStart;  | 
403  | 0  |             break;  | 
404  | 0  |         }  | 
405  | 0  |         s.append(c);  | 
406  | 0  |         if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { | 
407  |  |             // Fails FCD check. Find the next FCD boundary and normalize.  | 
408  | 0  |             while(pos != length) { | 
409  | 0  |                 cpStart = pos;  | 
410  | 0  |                 U8_NEXT_OR_FFFD(u8, pos, length, c);  | 
                // fcd16 <= 0xff means the high (lead) byte is 0.
411  | 0  |                 if(nfcImpl.getFCD16(c) <= 0xff) { | 
412  | 0  |                     pos = cpStart;  | 
413  | 0  |                     break;  | 
414  | 0  |                 }  | 
415  | 0  |                 s.append(c);  | 
416  | 0  |             }  | 
417  | 0  |             if(!normalize(s, errorCode)) { return FALSE; } | 
418  | 0  |             start = segmentStart;  | 
419  | 0  |             limit = pos;  | 
420  | 0  |             state = IN_NORMALIZED;  | 
421  | 0  |             pos = 0;  | 
422  | 0  |             return TRUE;  | 
423  | 0  |         }  | 
424  | 0  |         prevCC = (uint8_t)fcd16;  | 
425  | 0  |         if(pos == length || prevCC == 0) { | 
426  |  |             // FCD boundary after the last character.  | 
427  | 0  |             break;  | 
428  | 0  |         }  | 
429  | 0  |     }  | 
430  | 0  |     limit = pos;  | 
431  | 0  |     pos = segmentStart;  | 
432  | 0  |     U_ASSERT(pos != limit);  | 
433  | 0  |     state = IN_FCD_SEGMENT;  | 
434  | 0  |     return TRUE;  | 
435  | 0  | }  | 
436  |  |  | 
// Prepares the state machine for backward iteration. Mirror image of
// switchToForward().
//
// Precondition (asserted): state is CHECK_FWD, or IN_FCD_SEGMENT with
// pos == start, or state >= IN_NORMALIZED with pos == 0.
//
// Transitions:
//   CHECK_FWD       limit = pos; becomes CHECK_BWD when pos == start,
//                   otherwise IN_FCD_SEGMENT.
//   IN_FCD_SEGMENT  becomes CHECK_BWD; pos, start and limit are unchanged.
//   otherwise       limit and pos are both set to start, then CHECK_BWD.
437  |  | void  | 
438  | 0  | FCDUTF8CollationIterator::switchToBackward() { | 
439  | 0  |     U_ASSERT(state == CHECK_FWD ||  | 
440  | 0  |              (state == IN_FCD_SEGMENT && pos == start) ||  | 
441  | 0  |              (state >= IN_NORMALIZED && pos == 0));  | 
442  | 0  |     if(state == CHECK_FWD) { | 
443  |  |         // Turn around from forward checking.  | 
444  | 0  |         limit = pos;  | 
445  | 0  |         if(pos == start) { | 
446  | 0  |             state = CHECK_BWD;  // Check backward.  | 
447  | 0  |         } else {  // pos > start | 
448  | 0  |             state = IN_FCD_SEGMENT;  // Stay in FCD segment.  | 
449  | 0  |         }  | 
450  | 0  |     } else { | 
451  |  |         // Reached the start of the FCD segment.  | 
452  | 0  |         if(state == IN_FCD_SEGMENT) { | 
453  |  |             // The input text segment is FCD, extend it backward.  | 
454  | 0  |         } else { | 
455  |  |             // The input text segment needed to be normalized.  | 
456  |  |             // Switch to checking backward from it.  | 
457  | 0  |             limit = pos = start;  | 
458  | 0  |         }  | 
459  | 0  |         state = CHECK_BWD;  | 
460  | 0  |     }  | 
461  | 0  | }  | 
462  |  |  | 
// Scans backward from pos to delimit the previous text segment and decides
// whether it can be read as-is or must be normalized first. Mirror image of
// nextSegment().
//
// errorCode  in/out; FALSE is returned immediately if it already indicates
//            failure, and normalize() may set it.
// Returns    TRUE when a segment was set up, FALSE on error.
// Precondition (asserted): state == CHECK_BWD and pos != 0.
//
// For each code point, the low byte of nfcImpl.getFCD16() is used as the trail
// combining class and the high byte as the lead combining class. Outcomes:
//   - Segment passes the check: start = beginning of segment,
//     pos = segmentLimit, state = IN_FCD_SEGMENT.
//   - A trail class is higher than the following nonzero lead class, or the
//     fcd16 value is that of a Tibetan composite vowel: the segment is
//     extended backward, decomposed into `normalized`, and
//     state = IN_NORMALIZED with start/limit bracketing the input segment and
//     pos = normalized.length() (an index into `normalized`).
463  |  | UBool  | 
464  | 0  | FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) { | 
465  | 0  |     if(U_FAILURE(errorCode)) { return FALSE; } | 
466  | 0  |     U_ASSERT(state == CHECK_BWD && pos != 0);  | 
467  |  |     // The input text [pos..limit[ passes the FCD check.  | 
468  | 0  |     int32_t segmentLimit = pos;  | 
469  |  |     // Collect the characters being checked, in case they need to be normalized.  | 
470  | 0  |     UnicodeString s;  | 
471  | 0  |     uint8_t nextCC = 0;  | 
472  | 0  |     for(;;) { | 
473  |  |         // Fetch the previous character and its fcd16 value.  | 
474  | 0  |         int32_t cpLimit = pos;  | 
475  | 0  |         UChar32 c;  | 
476  | 0  |         U8_PREV_OR_FFFD(u8, 0, pos, c);  | 
477  | 0  |         uint16_t fcd16 = nfcImpl.getFCD16(c);  | 
478  | 0  |         uint8_t trailCC = (uint8_t)fcd16;  | 
        // The last code point of the segment is always taken, even when its
        // trail class is 0 (cpLimit == segmentLimit).
479  | 0  |         if(trailCC == 0 && cpLimit != segmentLimit) { | 
480  |  |             // FCD boundary after this character.  | 
481  | 0  |             pos = cpLimit;  | 
482  | 0  |             break;  | 
483  | 0  |         }  | 
484  | 0  |         s.append(c);  | 
485  | 0  |         if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||  | 
486  | 0  |                             CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) { | 
487  |  |             // Fails FCD check. Find the previous FCD boundary and normalize.  | 
            // Continue only while the last character read has a nonzero lead
            // class (fcd16 > 0xff); a character with fcd16 == 0 is left out of
            // the segment.
488  | 0  |             while(fcd16 > 0xff && pos != 0) { | 
489  | 0  |                 cpLimit = pos;  | 
490  | 0  |                 U8_PREV_OR_FFFD(u8, 0, pos, c);  | 
491  | 0  |                 fcd16 = nfcImpl.getFCD16(c);  | 
492  | 0  |                 if(fcd16 == 0) { | 
493  | 0  |                     pos = cpLimit;  | 
494  | 0  |                     break;  | 
495  | 0  |                 }  | 
496  | 0  |                 s.append(c);  | 
497  | 0  |             }  | 
            // s was collected back to front; restore text order before
            // normalizing.
498  | 0  |             s.reverse();  | 
499  | 0  |             if(!normalize(s, errorCode)) { return FALSE; } | 
500  | 0  |             limit = segmentLimit;  | 
501  | 0  |             start = pos;  | 
502  | 0  |             state = IN_NORMALIZED;  | 
503  | 0  |             pos = normalized.length();  | 
504  | 0  |             return TRUE;  | 
505  | 0  |         }  | 
506  | 0  |         nextCC = (uint8_t)(fcd16 >> 8);  | 
507  | 0  |         if(pos == 0 || nextCC == 0) { | 
508  |  |             // FCD boundary before the following character.  | 
509  | 0  |             break;  | 
510  | 0  |         }  | 
511  | 0  |     }  | 
512  | 0  |     start = pos;  | 
513  | 0  |     pos = segmentLimit;  | 
514  | 0  |     U_ASSERT(pos != start);  | 
515  | 0  |     state = IN_FCD_SEGMENT;  | 
516  | 0  |     return TRUE;  | 
517  | 0  | }  | 
518  |  |  | 
// Decomposes s into the member string `normalized` via nfcImpl.decompose().
//
// s          text segment to decompose.
// errorCode  must indicate success on entry (asserted); set by decompose()
//            on failure.
// Returns    TRUE if errorCode still indicates success afterwards.
519  |  | UBool  | 
520  | 0  | FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) { | 
521  |  |     // NFD without argument checking.  | 
522  | 0  |     U_ASSERT(U_SUCCESS(errorCode));  | 
523  | 0  |     nfcImpl.decompose(s, normalized, errorCode);  | 
524  | 0  |     return U_SUCCESS(errorCode);  | 
525  | 0  | }  | 
526  |  |  | 
527  |  | U_NAMESPACE_END  | 
528  |  |  | 
529  |  | #endif  // !UCONFIG_NO_COLLATION  |