Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/i18n/uitercollationiterator.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2012-2014, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* uitercollationiterator.cpp
9
*
10
* created on: 2012sep23 (from utf16collationiterator.cpp)
11
* created by: Markus W. Scherer
12
*/
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_COLLATION
17
18
#include "unicode/uiter.h"
19
#include "charstr.h"
20
#include "cmemory.h"
21
#include "collation.h"
22
#include "collationdata.h"
23
#include "collationfcd.h"
24
#include "collationiterator.h"
25
#include "normalizer2impl.h"
26
#include "uassert.h"
27
#include "uitercollationiterator.h"
28
29
U_NAMESPACE_BEGIN
30
31
0
// Out-of-line definition of the destructor; its body is empty.
UIterCollationIterator::~UIterCollationIterator() {}
32
33
void
34
0
UIterCollationIterator::resetToOffset(int32_t newOffset) {
35
0
    reset();
36
0
    iter.move(&iter, newOffset, UITER_START);
37
0
}
38
39
int32_t
40
0
UIterCollationIterator::getOffset() const {
41
0
    return iter.getIndex(&iter, UITER_CURRENT);
42
0
}
43
44
uint32_t
45
0
UIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
46
0
    c = iter.next(&iter);
47
0
    if(c < 0) {
48
0
        return Collation::FALLBACK_CE32;
49
0
    }
50
0
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
51
0
}
52
53
// Reads one more value from `iter`. The read is kept when the value is
// a trail surrogate or negative; any other value is un-read by stepping
// `iter` back once. The value read is returned, cast to UChar.
UChar UIterCollationIterator::handleGetTrailSurrogate() {
    const UChar32 unit = iter.next(&iter);
    const bool keepPosition = U16_IS_TRAIL(unit) || unit < 0;
    if(!keepPosition) {
        iter.previous(&iter);
    }
    return static_cast<UChar>(unit);
}
59
60
// Returns the result of uiter_next32() on `iter`.
// The error code parameter is not used.
UChar32 UIterCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
    const UChar32 codePoint = uiter_next32(&iter);
    return codePoint;
}
64
65
// Returns the result of uiter_previous32() on `iter`.
// The error code parameter is not used.
UChar32 UIterCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    const UChar32 codePoint = uiter_previous32(&iter);
    return codePoint;
}
69
70
void
71
0
UIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
72
0
    while(num > 0 && (uiter_next32(&iter)) >= 0) {
73
0
        --num;
74
0
    }
75
0
}
76
77
void
78
0
UIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
79
0
    while(num > 0 && (uiter_previous32(&iter)) >= 0) {
80
0
        --num;
81
0
    }
82
0
}
83
84
// FCDUIterCollationIterator ----------------------------------------------- ***
85
86
0
// Out-of-line definition of the destructor; its body is empty.
FCDUIterCollationIterator::~FCDUIterCollationIterator() {}
87
88
void
89
0
FCDUIterCollationIterator::resetToOffset(int32_t newOffset) {
90
0
    UIterCollationIterator::resetToOffset(newOffset);
91
0
    start = newOffset;
92
0
    state = ITER_CHECK_FWD;
93
0
}
94
95
int32_t
96
0
FCDUIterCollationIterator::getOffset() const {
97
0
    if(state <= ITER_CHECK_BWD) {
98
0
        return iter.getIndex(&iter, UITER_CURRENT);
99
0
    } else if(state == ITER_IN_FCD_SEGMENT) {
100
0
        return pos;
101
0
    } else if(pos == 0) {
102
0
        return start;
103
0
    } else {
104
0
        return limit;
105
0
    }
106
0
}
107
108
uint32_t
109
0
FCDUIterCollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
110
0
    for(;;) {
111
0
        if(state == ITER_CHECK_FWD) {
112
0
            c = iter.next(&iter);
113
0
            if(c < 0) {
114
0
                return Collation::FALLBACK_CE32;
115
0
            }
116
0
            if(CollationFCD::hasTccc(c)) {
117
0
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
118
0
                        CollationFCD::hasLccc(iter.current(&iter))) {
119
0
                    iter.previous(&iter);
120
0
                    if(!nextSegment(errorCode)) {
121
0
                        c = U_SENTINEL;
122
0
                        return Collation::FALLBACK_CE32;
123
0
                    }
124
0
                    continue;
125
0
                }
126
0
            }
127
0
            break;
128
0
        } else if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
129
0
            c = iter.next(&iter);
130
0
            ++pos;
131
0
            U_ASSERT(c >= 0);
132
0
            break;
133
0
        } else if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
134
0
            c = normalized[pos++];
135
0
            break;
136
0
        } else {
137
0
            switchToForward();
138
0
        }
139
0
    }
140
0
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
141
0
}
142
143
// Looks at the unit after the current position and consumes it only when
// it is a trail surrogate.
// In states after ITER_IN_FCD_SEGMENT the unit comes from `normalized`;
// otherwise it is read from `iter`, and a non-trail, non-negative value
// is un-read again. Returns the unit that was examined.
UChar FCDUIterCollationIterator::handleGetTrailSurrogate() {
    if(state > ITER_IN_FCD_SEGMENT) {
        U_ASSERT(pos < normalized.length());
        const UChar unit = normalized[pos];
        if(U16_IS_TRAIL(unit)) {
            ++pos;
        }
        return unit;
    }

    const UChar32 unit = iter.next(&iter);
    if(U16_IS_TRAIL(unit)) {
        // `pos` is only advanced while inside an FCD segment.
        if(state == ITER_IN_FCD_SEGMENT) {
            ++pos;
        }
    } else if(unit >= 0) {
        iter.previous(&iter);
    }
    return static_cast<UChar>(unit);
}
160
161
// Returns the next code point, or a negative value when `iter` is exhausted.
// Returns U_SENTINEL when nextSegment() fails.
// Each state branch either returns or restarts the loop; when no branch
// applies, switchToForward() is called and the loop runs again.
UChar32 FCDUIterCollationIterator::nextCodePoint(UErrorCode &errorCode) {
    for(;;) {
        if(state == ITER_CHECK_FWD) {
            const UChar32 lead = iter.next(&iter);
            if(lead < 0) {
                return lead;
            }
            // Short-circuit order matters: iter.current() is read last.
            if(CollationFCD::hasTccc(lead) &&
                    (CollationFCD::maybeTibetanCompositeVowel(lead) ||
                     CollationFCD::hasLccc(iter.current(&iter)))) {
                // Un-read the unit so that nextSegment() starts with it.
                iter.previous(&iter);
                if(!nextSegment(errorCode)) {
                    return U_SENTINEL;
                }
                continue;
            }
            if(!U16_IS_LEAD(lead)) {
                return lead;
            }
            // Try to complete a surrogate pair.
            const UChar32 trail = iter.next(&iter);
            if(U16_IS_TRAIL(trail)) {
                return U16_GET_SUPPLEMENTARY(lead, trail);
            }
            if(trail >= 0) {
                iter.previous(&iter);
            }
            return lead;
        }

        if(state == ITER_IN_FCD_SEGMENT && pos != limit) {
            const UChar32 cp = uiter_next32(&iter);
            pos += U16_LENGTH(cp);
            U_ASSERT(cp >= 0);
            return cp;
        }

        if(state >= IN_NORM_ITER_AT_LIMIT && pos != normalized.length()) {
            const UChar32 cp = normalized.char32At(pos);
            pos += U16_LENGTH(cp);
            return cp;
        }

        switchToForward();
    }
}
203
204
// Returns the previous code point, or U_SENTINEL when `iter` has nothing
// before the current position or when previousSegment() fails.
// Each state branch either returns or restarts the loop; when no branch
// applies, switchToBackward() is called and the loop runs again.
UChar32 FCDUIterCollationIterator::previousCodePoint(UErrorCode &errorCode) {
    for(;;) {
        if(state == ITER_CHECK_BWD) {
            const UChar32 c = iter.previous(&iter);
            if(c < 0) {
                pos = 0;
                start = 0;
                state = ITER_IN_FCD_SEGMENT;
                return U_SENTINEL;
            }
            if(!CollationFCD::hasLccc(c)) {
                return c;
            }

            // `before` stays U_SENTINEL unless the unit preceding c was read.
            UChar32 before = U_SENTINEL;
            bool needsSegment = CollationFCD::maybeTibetanCompositeVowel(c);
            if(!needsSegment) {
                before = iter.previous(&iter);
                needsSegment = CollationFCD::hasTccc(before);
            }
            if(needsSegment) {
                // Step forward over c again, and over `before` if it was read.
                iter.next(&iter);
                if(before >= 0) {
                    iter.next(&iter);
                }
                if(!previousSegment(errorCode)) {
                    return U_SENTINEL;
                }
                continue;
            }

            // hasLccc(trail)=true for all trail surrogates
            if(U16_IS_TRAIL(c)) {
                if(before < 0) {
                    before = iter.previous(&iter);
                }
                if(U16_IS_LEAD(before)) {
                    return U16_GET_SUPPLEMENTARY(before, c);
                }
            }
            // Un-read `before` when it was read but not used.
            if(before >= 0) {
                iter.next(&iter);
            }
            return c;
        }

        if(state == ITER_IN_FCD_SEGMENT && pos != start) {
            const UChar32 cp = uiter_previous32(&iter);
            pos -= U16_LENGTH(cp);
            U_ASSERT(cp >= 0);
            return cp;
        }

        if(state >= IN_NORM_ITER_AT_LIMIT && pos != 0) {
            const UChar32 cp = normalized.char32At(pos - 1);
            pos -= U16_LENGTH(cp);
            return cp;
        }

        switchToBackward();
    }
}
256
257
void
258
0
FCDUIterCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
259
    // Specify the class to avoid a virtual-function indirection.
260
    // In Java, we would declare this class final.
261
0
    while(num > 0 && FCDUIterCollationIterator::nextCodePoint(errorCode) >= 0) {
262
0
        --num;
263
0
    }
264
0
}
265
266
void
267
0
FCDUIterCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
268
    // Specify the class to avoid a virtual-function indirection.
269
    // In Java, we would declare this class final.
270
0
    while(num > 0 && FCDUIterCollationIterator::previousCodePoint(errorCode) >= 0) {
271
0
        --num;
272
0
    }
273
0
}
274
275
void
276
0
FCDUIterCollationIterator::switchToForward() {
277
0
    U_ASSERT(state == ITER_CHECK_BWD ||
278
0
             (state == ITER_IN_FCD_SEGMENT && pos == limit) ||
279
0
             (state >= IN_NORM_ITER_AT_LIMIT && pos == normalized.length()));
280
0
    if(state == ITER_CHECK_BWD) {
281
        // Turn around from backward checking.
282
0
        start = pos = iter.getIndex(&iter, UITER_CURRENT);
283
0
        if(pos == limit) {
284
0
            state = ITER_CHECK_FWD;  // Check forward.
285
0
        } else {  // pos < limit
286
0
            state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
287
0
        }
288
0
    } else {
289
        // Reached the end of the FCD segment.
290
0
        if(state == ITER_IN_FCD_SEGMENT) {
291
            // The input text segment is FCD, extend it forward.
292
0
        } else {
293
            // The input text segment needed to be normalized.
294
            // Switch to checking forward from it.
295
0
            if(state == IN_NORM_ITER_AT_START) {
296
0
                iter.move(&iter, limit - start, UITER_CURRENT);
297
0
            }
298
0
            start = limit;
299
0
        }
300
0
        state = ITER_CHECK_FWD;
301
0
    }
302
0
}
303
304
// Scans forward from the current index of `iter` and sets up the next segment.
// If the scanned text passes the FCD check, `iter` is moved back to the
// segment start and the state becomes ITER_IN_FCD_SEGMENT.
// If the check fails, the text up to the next FCD boundary is normalized
// and the state becomes IN_NORM_ITER_AT_LIMIT with pos reset to 0.
// Returns FALSE if errorCode already indicates failure on entry or if
// normalize() fails; TRUE otherwise.
UBool FCDUIterCollationIterator::nextSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    U_ASSERT(state == ITER_CHECK_FWD);
    // The input text [start..(iter index)[ passes the FCD check.
    pos = iter.getIndex(&iter, UITER_CURRENT);
    // Collect the characters being checked, in case they need to be normalized.
    UnicodeString segment;
    uint8_t lastTrailCC = 0;
    do {
        // Fetch the next character and its fcd16 value.
        const UChar32 cp = uiter_next32(&iter);
        if(cp < 0) {
            break;
        }
        const uint16_t fcd16 = nfcImpl.getFCD16(cp);
        const uint8_t leadCC = (uint8_t)(fcd16 >> 8);
        if(leadCC == 0 && !segment.isEmpty()) {
            // FCD boundary before this character.
            uiter_previous32(&iter);
            break;
        }
        segment.append(cp);
        if(leadCC != 0 &&
                (lastTrailCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
            // Fails FCD check. Find the next FCD boundary and normalize.
            UChar32 more;
            while((more = uiter_next32(&iter)) >= 0) {
                if(nfcImpl.getFCD16(more) <= 0xff) {
                    uiter_previous32(&iter);
                    break;
                }
                segment.append(more);
            }
            if(!normalize(segment, errorCode)) {
                return FALSE;
            }
            start = pos;
            limit = pos + segment.length();
            state = IN_NORM_ITER_AT_LIMIT;
            pos = 0;
            return TRUE;
        }
        lastTrailCC = (uint8_t)fcd16;
        // A zero trail value means an FCD boundary after the last character.
    } while(lastTrailCC != 0);

    limit = pos + segment.length();
    U_ASSERT(pos != limit);
    iter.move(&iter, -segment.length(), UITER_CURRENT);
    state = ITER_IN_FCD_SEGMENT;
    return TRUE;
}
355
356
void
357
0
FCDUIterCollationIterator::switchToBackward() {
358
0
    U_ASSERT(state == ITER_CHECK_FWD ||
359
0
             (state == ITER_IN_FCD_SEGMENT && pos == start) ||
360
0
             (state >= IN_NORM_ITER_AT_LIMIT && pos == 0));
361
0
    if(state == ITER_CHECK_FWD) {
362
        // Turn around from forward checking.
363
0
        limit = pos = iter.getIndex(&iter, UITER_CURRENT);
364
0
        if(pos == start) {
365
0
            state = ITER_CHECK_BWD;  // Check backward.
366
0
        } else {  // pos > start
367
0
            state = ITER_IN_FCD_SEGMENT;  // Stay in FCD segment.
368
0
        }
369
0
    } else {
370
        // Reached the start of the FCD segment.
371
0
        if(state == ITER_IN_FCD_SEGMENT) {
372
            // The input text segment is FCD, extend it backward.
373
0
        } else {
374
            // The input text segment needed to be normalized.
375
            // Switch to checking backward from it.
376
0
            if(state == IN_NORM_ITER_AT_LIMIT) {
377
0
                iter.move(&iter, start - limit, UITER_CURRENT);
378
0
            }
379
0
            limit = start;
380
0
        }
381
0
        state = ITER_CHECK_BWD;
382
0
    }
383
0
}
384
385
// Scans backward from the current index of `iter` and sets up the previous segment.
// If the scanned text passes the FCD check, `iter` is moved forward to the
// segment limit and the state becomes ITER_IN_FCD_SEGMENT.
// If the check fails, the text back to the previous FCD boundary is
// normalized and the state becomes IN_NORM_ITER_AT_START with pos set to
// the length of the normalized text.
// Returns FALSE if errorCode already indicates failure on entry or if
// normalize() fails; TRUE otherwise.
UBool FCDUIterCollationIterator::previousSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    U_ASSERT(state == ITER_CHECK_BWD);
    // The input text [(iter index)..limit[ passes the FCD check.
    pos = iter.getIndex(&iter, UITER_CURRENT);
    // Collect the characters being checked, in case they need to be normalized.
    // They are appended in reverse text order.
    UnicodeString segment;
    uint8_t followingLeadCC = 0;
    do {
        // Fetch the previous character and its fcd16 value.
        const UChar32 cp = uiter_previous32(&iter);
        if(cp < 0) {
            break;
        }
        const uint16_t fcd16 = nfcImpl.getFCD16(cp);
        const uint8_t trailCC = (uint8_t)fcd16;
        if(trailCC == 0 && !segment.isEmpty()) {
            // FCD boundary after this character.
            uiter_next32(&iter);
            break;
        }
        segment.append(cp);
        if(trailCC != 0 &&
                ((followingLeadCC != 0 && trailCC > followingLeadCC) ||
                 CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
            // Fails FCD check. Find the previous FCD boundary and normalize.
            uint16_t scanFcd16 = fcd16;
            while(scanFcd16 > 0xff) {
                const UChar32 earlier = uiter_previous32(&iter);
                if(earlier < 0) {
                    break;
                }
                scanFcd16 = nfcImpl.getFCD16(earlier);
                if(scanFcd16 == 0) {
                    (void)uiter_next32(&iter);
                    break;
                }
                segment.append(earlier);
            }
            // Restore text order before normalizing.
            segment.reverse();
            if(!normalize(segment, errorCode)) {
                return FALSE;
            }
            limit = pos;
            start = pos - segment.length();
            state = IN_NORM_ITER_AT_START;
            pos = normalized.length();
            return TRUE;
        }
        followingLeadCC = (uint8_t)(fcd16 >> 8);
        // A zero lead value means an FCD boundary before the following character.
    } while(followingLeadCC != 0);

    start = pos - segment.length();
    U_ASSERT(pos != start);
    iter.move(&iter, segment.length(), UITER_CURRENT);
    state = ITER_IN_FCD_SEGMENT;
    return TRUE;
}
439
440
// NFD without argument checking.
// Decomposes s into `normalized` via nfcImpl.decompose().
// Asserts that errorCode indicates success on entry; returns whether it
// still indicates success afterwards.
UBool FCDUIterCollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
    U_ASSERT(U_SUCCESS(errorCode));
    nfcImpl.decompose(s, normalized, errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    return TRUE;
}
447
448
U_NAMESPACE_END
449
450
#endif  // !UCONFIG_NO_COLLATION