Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/i18n/utf8collationiterator.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2012-2014, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* utf8collationiterator.cpp
9
*
10
* created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)
11
* created by: Markus W. Scherer
12
*/
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_COLLATION
17
18
#include "unicode/utf8.h"
19
#include "charstr.h"
20
#include "cmemory.h"
21
#include "collation.h"
22
#include "collationdata.h"
23
#include "collationfcd.h"
24
#include "collationiterator.h"
25
#include "normalizer2impl.h"
26
#include "uassert.h"
27
#include "utf8collationiterator.h"
28
29
U_NAMESPACE_BEGIN
30
31
0
// Out-of-line destructor definition; the body is intentionally empty.
UTF8CollationIterator::~UTF8CollationIterator() {
}
32
33
void
34
0
UTF8CollationIterator::resetToOffset(int32_t newOffset) {
35
0
    reset();
36
0
    pos = newOffset;
37
0
}
38
39
// Reports the current byte offset (pos) into the UTF-8 text.
int32_t
UTF8CollationIterator::getOffset() const {
    return this->pos;
}
43
44
// Decodes the next code point from u8 at pos, stores it in c, advances pos,
// and returns the CE32 looked up for it.
// At the end of the text (pos == length), c becomes U_SENTINEL and
// Collation::FALLBACK_CE32 is returned.
// The error code parameter is unused.
uint32_t
UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
    if(pos == length) {
        c = U_SENTINEL;
        return Collation::FALLBACK_CE32;
    }

    // Hand-inlined blend of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
    c = u8[pos++];
    if(U8_IS_SINGLE(c)) {
        // Single byte 00..7F: direct lookup in the 32-bit trie data.
        return trie->data32[c];
    }

    uint8_t trail1, trail2;

    // Three-byte sequence: needs two more bytes available (or NUL-terminated input),
    // a valid lead/first-trail combination, and a valid second trail byte.
    const bool isLead3 = 0xe0 <= c && c < 0xf0;
    if(isLead3 &&
            (length < 0 || (pos + 1) < length) &&
            U8_IS_VALID_LEAD3_AND_T1(c, trail1 = u8[pos]) &&
            (trail2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
        // U+0800..U+FFFF, surrogates excluded.
        c = ((c & 0xf) << 12) | ((trail1 & 0x3f) << 6) | trail2;
        pos += 2;
        return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    }

    // Two-byte sequence: lead C2..DF followed by one valid trail byte.
    const bool isLead2 = 0xc2 <= c && c < 0xe0;
    if(isLead2 && pos != length && (trail1 = (u8[pos] - 0x80)) <= 0x3f) {
        // U+0080..U+07FF; the lookup must use the lead byte before c is overwritten.
        const uint32_t ce32 =
            trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + trail1];
        c = ((c & 0x1f) << 6) | trail1;
        ++pos;
        return ce32;
    }

    // Everything else (supplementary code points and malformed input) goes
    // through the out-of-line decoder; illegal byte sequences yield U+FFFD.
    c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
    return data->getCE32(c);
}
78
79
// When the text length is still unknown (length < 0), steps pos back by one,
// records that position as the length, and returns TRUE.
// Returns FALSE without changes when the length is already known.
UBool
UTF8CollationIterator::foundNULTerminator() {
    if(length >= 0) {
        return FALSE;
    }
    --pos;
    length = pos;
    return TRUE;
}
88
89
// Always answers TRUE for this iterator.
UBool
UTF8CollationIterator::forbidSurrogateCodePoints() const {
    return TRUE;
}
93
94
// Returns the code point at pos and advances pos past it, or U_SENTINEL at the
// end of the text. For NUL-terminated input (length < 0), reaching the NUL
// byte records its position as the length. Malformed sequences decode to U+FFFD.
// The error code parameter is unused.
UChar32
UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
    if(pos == length) {
        return U_SENTINEL;
    }
    if(length < 0 && u8[pos] == 0) {
        // Found the terminator: remember where the text ends.
        length = pos;
        return U_SENTINEL;
    }
    UChar32 cp;
    U8_NEXT_OR_FFFD(u8, pos, length, cp);
    return cp;
}
107
108
// Returns the code point that ends just before pos and moves pos back to its
// start, or U_SENTINEL when pos is already at offset 0.
// Malformed sequences decode to U+FFFD. The error code parameter is unused.
UChar32
UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    if(pos == 0) {
        return U_SENTINEL;
    }
    UChar32 cp;
    U8_PREV_OR_FFFD(u8, 0, pos, cp);
    return cp;
}
117
118
void
119
0
UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
120
0
    U8_FWD_N(u8, pos, length, num);
121
0
}
122
123
void
124
0
UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
125
0
    U8_BACK_N(u8, 0, pos, num);
126
0
}
127
128
// FCDUTF8CollationIterator ------------------------------------------------ ***
129
130
0
// Out-of-line destructor definition; the body is intentionally empty.
FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {
}
131
132
void
133
0
FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) {
134
0
    reset();
135
0
    start = pos = newOffset;
136
0
    state = CHECK_FWD;
137
0
}
138
139
// Reports the current offset.
// Outside of IN_NORMALIZED, pos is returned directly. In IN_NORMALIZED,
// start is returned when pos is 0 and limit is returned otherwise.
int32_t
FCDUTF8CollationIterator::getOffset() const {
    if(state != IN_NORMALIZED) {
        return pos;
    }
    return (pos == 0) ? start : limit;
}
149
150
// Returns the CE32 for the next code point and stores that code point in c.
// Loops over the iterator states until a result is available:
// - CHECK_FWD: decodes one UTF-8 sequence inline. If CollationFCD::hasTccc() is true
//   for the code point and nextHasLccc() is true for what follows (or, for three-byte
//   sequences, CollationFCD::maybeTibetanCompositeVowel() is true), pos is rewound to
//   the start of the sequence and nextSegment() is called before retrying.
// - IN_FCD_SEGMENT with pos != limit: delegates to UTF8CollationIterator::handleNextCE32().
// - IN_NORMALIZED with pos != normalized.length(): reads the next unit from normalized.
// - otherwise: calls switchToForward() and retries.
// At the end of the input, or when nextSegment() fails, c is set to U_SENTINEL and
// Collation::FALLBACK_CE32 is returned.
uint32_t
151
0
FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
152
0
    for(;;) {
153
0
        if(state == CHECK_FWD) {
154
            // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.
155
0
            if(pos == length) {
156
0
                c = U_SENTINEL;
157
0
                return Collation::FALLBACK_CE32;
158
0
            }
159
0
            c = u8[pos++];
160
0
            if(U8_IS_SINGLE(c)) {
161
                // ASCII 00..7F
162
0
                return trie->data32[c];
163
0
            }
164
0
            uint8_t t1, t2;
165
0
            if(0xe0 <= c && c < 0xf0 &&
166
0
                    ((pos + 1) < length || length < 0) &&
167
0
                    U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
168
0
                    (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
169
                // U+0800..U+FFFF except surrogates
170
0
                c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
171
0
                pos += 2;
172
0
                if(CollationFCD::hasTccc(c) &&
173
0
                        (CollationFCD::maybeTibetanCompositeVowel(c) ||
174
0
                            (pos != length && nextHasLccc()))) {
175
0
                    pos -= 3;  // Rewind over the three bytes just consumed; nextSegment() below starts here.
176
0
                } else {
177
0
                    break;  // return CE32(BMP)
178
0
                }
179
0
            } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
180
                // U+0080..U+07FF
181
                // The lookup uses the lead byte still held in c, so it must precede the assignment to c.
0
                uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
182
0
                c = ((c & 0x1f) << 6) | t1;
183
0
                ++pos;
184
0
                if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
185
0
                    pos -= 2;  // Rewind over the two bytes just consumed; nextSegment() below starts here.
186
0
                } else {
187
0
                    return ce32;
188
0
                }
189
0
            } else {
190
                // Function call for supplementary code points and error cases.
191
                // Illegal byte sequences yield U+FFFD.
192
0
                c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
193
0
                if(c == 0xfffd) {
194
0
                    return Collation::FFFD_CE32;
195
0
                } else {
196
0
                    U_ASSERT(c > 0xffff);
197
0
                    if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) {
198
0
                        pos -= 4;  // Rewind over the four-byte supplementary sequence; nextSegment() below starts here.
199
0
                    } else {
200
0
                        return data->getCE32FromSupplementary(c);
201
0
                    }
202
0
                }
203
0
            }
204
            // Reached only after one of the rewinds above: the fast path could not accept the character.
0
            if(!nextSegment(errorCode)) {
205
0
                c = U_SENTINEL;
206
0
                return Collation::FALLBACK_CE32;
207
0
            }
208
0
            continue;  // Retry in the state chosen by nextSegment().
209
0
        } else if(state == IN_FCD_SEGMENT && pos != limit) {
210
0
            return UTF8CollationIterator::handleNextCE32(c, errorCode);
211
0
        } else if(state == IN_NORMALIZED && pos != normalized.length()) {
212
0
            c = normalized[pos++];  // In this state pos indexes the normalized string.
213
0
            break;
214
0
        } else {
215
0
            switchToForward();
216
0
        }
217
0
    }
218
    // Both break statements above leave a value in c that is looked up here.
0
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
219
0
}
220
221
// Reports whether CollationFCD::hasLccc() is true for the code point that
// starts at pos, without moving pos.
// Requires the CHECK_FWD state and pos != length (asserted).
UBool
FCDUTF8CollationIterator::nextHasLccc() const {
    U_ASSERT(state == CHECK_FWD && pos != length);
    // Quick rejection by lead byte:
    // the lowest code point with ccc!=0 is U+0300, which is CC 80 in UTF-8, and
    // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert (lead bytes E4..ED except EA).
    const uint8_t lead = u8[pos];
    if(lead < 0xcc || (lead != 0xea && 0xe4 <= lead && lead <= 0xed)) {
        return FALSE;
    }
    // Decode on a scratch index so that pos stays untouched.
    int32_t scan = pos;
    UChar32 cp;
    U8_NEXT_OR_FFFD(u8, scan, length, cp);
    // Supplementary code points are tested via their lead surrogate.
    return CollationFCD::hasLccc(cp > 0xffff ? U16_LEAD(cp) : cp);
}
233
234
// Reports whether CollationFCD::hasTccc() is true for the code point that
// ends just before pos, without moving pos.
// Requires the CHECK_BWD state and pos != 0 (asserted).
UBool
FCDUTF8CollationIterator::previousHasTccc() const {
    U_ASSERT(state == CHECK_BWD && pos != 0);
    // A single-byte (ASCII) character is rejected without decoding.
    const uint8_t last = u8[pos - 1];
    if(U8_IS_SINGLE(last)) {
        return FALSE;
    }
    // Decode on a scratch index so that pos stays untouched.
    int32_t scan = pos;
    UChar32 cp;
    U8_PREV_OR_FFFD(u8, 0, scan, cp);
    // Supplementary code points are tested via their lead surrogate.
    return CollationFCD::hasTccc(cp > 0xffff ? U16_LEAD(cp) : cp);
}
244
245
// Outside of IN_NORMALIZED this returns 0.
// In IN_NORMALIZED it returns the UTF-16 unit at normalized[pos]; pos is
// advanced past that unit only when it is a trail surrogate.
UChar
FCDUTF8CollationIterator::handleGetTrailSurrogate() {
    if(state != IN_NORMALIZED) {
        return 0;
    }
    U_ASSERT(pos < normalized.length());
    const UChar unit = normalized[pos];
    if(U16_IS_TRAIL(unit)) {
        ++pos;
    }
    return unit;
}
253
254
// Only acts in the CHECK_FWD state while the text length is unknown (length < 0):
// steps pos back by one, records that position as the length, and returns TRUE.
// Returns FALSE without changes in every other situation.
UBool
FCDUTF8CollationIterator::foundNULTerminator() {
    if(state != CHECK_FWD || length >= 0) {
        return FALSE;
    }
    --pos;
    length = pos;
    return TRUE;
}
263
264
// Returns the next code point and advances past it, or U_SENTINEL at the end of
// the text or when nextSegment() fails.
// In CHECK_FWD, a character for which the hasTccc()/nextHasLccc() (or Tibetan
// composite vowel) test fires is un-read and handed to nextSegment(); the loop
// then retries in whatever state nextSegment() selected.
UChar32
FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    for(;;) {
        if(state == CHECK_FWD) {
            if(pos == length) {
                return U_SENTINEL;
            }
            UChar32 cp = u8[pos];
            if(cp == 0 && length < 0) {
                // NUL terminator of text with unknown length.
                return U_SENTINEL;
            }
            if(U8_IS_SINGLE(cp)) {
                ++pos;
                return cp;
            }
            U8_NEXT_OR_FFFD(u8, pos, length, cp);
            // Supplementary code points are tested via their lead surrogate.
            const UChar32 fcdKey = cp <= 0xffff ? cp : U16_LEAD(cp);
            const bool needsSegment = CollationFCD::hasTccc(fcdKey) &&
                    (CollationFCD::maybeTibetanCompositeVowel(cp) ||
                        (pos != length && nextHasLccc()));
            if(!needsSegment) {
                return cp;
            }
            // cp is not FCD-inert, therefore it is not U+FFFD and it has a valid byte
            // sequence, so U8_LENGTH() can stand in for a saved previous position.
            pos -= U8_LENGTH(cp);
            if(!nextSegment(errorCode)) {
                return U_SENTINEL;
            }
            continue;
        }
        if(state == IN_FCD_SEGMENT && pos != limit) {
            UChar32 cp;
            U8_NEXT_OR_FFFD(u8, pos, length, cp);
            return cp;
        }
        if(state == IN_NORMALIZED && pos != normalized.length()) {
            // In this state pos indexes the normalized string.
            const UChar32 cp = normalized.char32At(pos);
            pos += U16_LENGTH(cp);
            return cp;
        }
        switchToForward();
    }
}
301
302
// Returns the code point before the current position and moves back over it,
// or U_SENTINEL at the start of the text or when previousSegment() fails.
// In CHECK_BWD, a character for which the hasLccc()/previousHasTccc() (or
// Tibetan composite vowel) test fires is un-read and handed to previousSegment();
// the loop then retries in whatever state previousSegment() selected.
UChar32
FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    for(;;) {
        if(state == CHECK_BWD) {
            if(pos == 0) {
                return U_SENTINEL;
            }
            UChar32 cp = u8[pos - 1];
            if(U8_IS_SINGLE(cp)) {
                --pos;
                return cp;
            }
            U8_PREV_OR_FFFD(u8, 0, pos, cp);
            // Supplementary code points are tested via their lead surrogate.
            const UChar32 fcdKey = cp <= 0xffff ? cp : U16_LEAD(cp);
            const bool needsSegment = CollationFCD::hasLccc(fcdKey) &&
                    (CollationFCD::maybeTibetanCompositeVowel(cp) ||
                        (pos != 0 && previousHasTccc()));
            if(!needsSegment) {
                return cp;
            }
            // cp is not FCD-inert, therefore it is not U+FFFD and it has a valid byte
            // sequence, so U8_LENGTH() can stand in for a saved previous position.
            pos += U8_LENGTH(cp);
            if(!previousSegment(errorCode)) {
                return U_SENTINEL;
            }
            continue;
        }
        if(state == IN_FCD_SEGMENT && pos != start) {
            UChar32 cp;
            U8_PREV_OR_FFFD(u8, 0, pos, cp);
            return cp;
        }
        if(state >= IN_NORMALIZED && pos != 0) {
            // In this state pos indexes the normalized string.
            const UChar32 cp = normalized.char32At(pos - 1);
            pos -= U16_LENGTH(cp);
            return cp;
        }
        switchToBackward();
    }
}
339
340
void
341
0
FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
342
    // Specify the class to avoid a virtual-function indirection.
343
    // In Java, we would declare this class final.
344
0
    while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) {
345
0
        --num;
346
0
    }
347
0
}
348
349
void
350
0
FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
351
    // Specify the class to avoid a virtual-function indirection.
352
    // In Java, we would declare this class final.
353
0
    while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) {
354
0
        --num;
355
0
    }
356
0
}
357
358
void
359
0
FCDUTF8CollationIterator::switchToForward() {
360
0
    U_ASSERT(state == CHECK_BWD ||
361
0
             (state == IN_FCD_SEGMENT && pos == limit) ||
362
0
             (state == IN_NORMALIZED && pos == normalized.length()));
363
0
    if(state == CHECK_BWD) {
364
        // Turn around from backward checking.
365
0
        start = pos;
366
0
        if(pos == limit) {
367
0
            state = CHECK_FWD;  // Check forward.
368
0
        } else {  // pos < limit
369
0
            state = IN_FCD_SEGMENT;  // Stay in FCD segment.
370
0
        }
371
0
    } else {
372
        // Reached the end of the FCD segment.
373
0
        if(state == IN_FCD_SEGMENT) {
374
            // The input text segment is FCD, extend it forward.
375
0
        } else {
376
            // The input text segment needed to be normalized.
377
            // Switch to checking forward from it.
378
0
            start = pos = limit;
379
0
        }
380
0
        state = CHECK_FWD;
381
0
    }
382
0
}
383
384
// Scans forward from pos (in the CHECK_FWD state) to delimit the next text segment.
// - If the scanned code points pass the FCD check, limit is set to the end of the scan,
//   pos is restored to the segment start, and the state becomes IN_FCD_SEGMENT.
// - If the check fails, the scan continues until a code point whose getFCD16() value
//   is <= 0xff (or the end of the text); the collected text is passed to normalize(),
//   start/limit are set to the segment bounds, pos is set to 0, and the state becomes
//   IN_NORMALIZED.
// Returns FALSE if errorCode already indicates a failure or if normalize() fails;
// returns TRUE otherwise.
UBool
385
0
FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) {
386
0
    if(U_FAILURE(errorCode)) { return FALSE; }
387
0
    U_ASSERT(state == CHECK_FWD && pos != length);
388
    // The input text [start..pos[ passes the FCD check.
389
0
    int32_t segmentStart = pos;
390
    // Collect the characters being checked, in case they need to be normalized.
391
0
    UnicodeString s;
392
0
    uint8_t prevCC = 0;
393
0
    for(;;) {
394
        // Fetch the next character and its fcd16 value.
395
0
        int32_t cpStart = pos;
396
0
        UChar32 c;
397
0
        U8_NEXT_OR_FFFD(u8, pos, length, c);
398
0
        uint16_t fcd16 = nfcImpl.getFCD16(c);
399
0
        uint8_t leadCC = (uint8_t)(fcd16 >> 8);  // Upper byte of the fcd16 value.
400
0
        if(leadCC == 0 && cpStart != segmentStart) {
401
            // FCD boundary before this character.
402
0
            pos = cpStart;  // Un-read this character; it is not part of the segment.
403
0
            break;
404
0
        }
405
0
        s.append(c);
406
0
        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
407
            // Fails FCD check. Find the next FCD boundary and normalize.
408
0
            while(pos != length) {
409
0
                cpStart = pos;
410
0
                U8_NEXT_OR_FFFD(u8, pos, length, c);
411
0
                if(nfcImpl.getFCD16(c) <= 0xff) {  // Upper byte of the fcd16 value is 0.
412
0
                    pos = cpStart;
413
0
                    break;
414
0
                }
415
0
                s.append(c);
416
0
            }
417
0
            if(!normalize(s, errorCode)) { return FALSE; }
418
0
            start = segmentStart;
419
0
            limit = pos;
420
0
            state = IN_NORMALIZED;
421
0
            pos = 0;  // From here on pos is used as an index into the normalized string.
422
0
            return TRUE;
423
0
        }
424
0
        prevCC = (uint8_t)fcd16;  // Lower byte of the fcd16 value.
425
0
        if(pos == length || prevCC == 0) {
426
            // FCD boundary after the last character.
427
0
            break;
428
0
        }
429
0
    }
430
    // The scanned text [segmentStart..pos[ passed the check; iterate over it in place.
0
    limit = pos;
431
0
    pos = segmentStart;
432
0
    U_ASSERT(pos != limit);
433
0
    state = IN_FCD_SEGMENT;
434
0
    return TRUE;
435
0
}
436
437
void
438
0
FCDUTF8CollationIterator::switchToBackward() {
439
0
    U_ASSERT(state == CHECK_FWD ||
440
0
             (state == IN_FCD_SEGMENT && pos == start) ||
441
0
             (state >= IN_NORMALIZED && pos == 0));
442
0
    if(state == CHECK_FWD) {
443
        // Turn around from forward checking.
444
0
        limit = pos;
445
0
        if(pos == start) {
446
0
            state = CHECK_BWD;  // Check backward.
447
0
        } else {  // pos > start
448
0
            state = IN_FCD_SEGMENT;  // Stay in FCD segment.
449
0
        }
450
0
    } else {
451
        // Reached the start of the FCD segment.
452
0
        if(state == IN_FCD_SEGMENT) {
453
            // The input text segment is FCD, extend it backward.
454
0
        } else {
455
            // The input text segment needed to be normalized.
456
            // Switch to checking backward from it.
457
0
            limit = pos = start;
458
0
        }
459
0
        state = CHECK_BWD;
460
0
    }
461
0
}
462
463
// Scans backward from pos (in the CHECK_BWD state) to delimit the previous text segment.
// - If the scanned code points pass the FCD check, start is set to the beginning of the
//   scan, pos is restored to the segment limit, and the state becomes IN_FCD_SEGMENT.
// - If the check fails, the scan continues backward while the fcd16 value is > 0xff,
//   stopping before a code point whose getFCD16() value is 0 (or at offset 0); the
//   collected text is reversed and passed to normalize(), start/limit are set to the
//   segment bounds, pos is set to normalized.length(), and the state becomes IN_NORMALIZED.
// Returns FALSE if errorCode already indicates a failure or if normalize() fails;
// returns TRUE otherwise.
UBool
464
0
FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) {
465
0
    if(U_FAILURE(errorCode)) { return FALSE; }
466
0
    U_ASSERT(state == CHECK_BWD && pos != 0);
467
    // The input text [pos..limit[ passes the FCD check.
468
0
    int32_t segmentLimit = pos;
469
    // Collect the characters being checked, in case they need to be normalized.
470
0
    UnicodeString s;
471
0
    uint8_t nextCC = 0;
472
0
    for(;;) {
473
        // Fetch the previous character and its fcd16 value.
474
0
        int32_t cpLimit = pos;
475
0
        UChar32 c;
476
0
        U8_PREV_OR_FFFD(u8, 0, pos, c);
477
0
        uint16_t fcd16 = nfcImpl.getFCD16(c);
478
0
        uint8_t trailCC = (uint8_t)fcd16;  // Lower byte of the fcd16 value.
479
0
        if(trailCC == 0 && cpLimit != segmentLimit) {
480
            // FCD boundary after this character.
481
0
            pos = cpLimit;  // Un-read this character; it is not part of the segment.
482
0
            break;
483
0
        }
484
0
        s.append(c);
485
0
        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
486
0
                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
487
            // Fails FCD check. Find the previous FCD boundary and normalize.
488
0
            while(fcd16 > 0xff && pos != 0) {
489
0
                cpLimit = pos;
490
0
                U8_PREV_OR_FFFD(u8, 0, pos, c);
491
0
                fcd16 = nfcImpl.getFCD16(c);
492
0
                if(fcd16 == 0) {
493
0
                    pos = cpLimit;
494
0
                    break;
495
0
                }
496
0
                s.append(c);
497
0
            }
498
0
            s.reverse();  // Characters were appended while scanning backward.
499
0
            if(!normalize(s, errorCode)) { return FALSE; }
500
0
            limit = segmentLimit;
501
0
            start = pos;
502
0
            state = IN_NORMALIZED;
503
0
            pos = normalized.length();  // From here on pos is used as an index into the normalized string.
504
0
            return TRUE;
505
0
        }
506
0
        nextCC = (uint8_t)(fcd16 >> 8);  // Upper byte of the fcd16 value.
507
0
        if(pos == 0 || nextCC == 0) {
508
            // FCD boundary before the following character.
509
0
            break;
510
0
        }
511
0
    }
512
    // The scanned text [pos..segmentLimit[ passed the check; iterate over it in place.
0
    start = pos;
513
0
    pos = segmentLimit;
514
0
    U_ASSERT(pos != start);
515
0
    state = IN_FCD_SEGMENT;
516
0
    return TRUE;
517
0
}
518
519
// Decomposes s into the member string `normalized` via nfcImpl.decompose()
// (NFD, without argument checking).
// errorCode must indicate success on entry (asserted).
// Returns TRUE if errorCode still indicates success afterwards, FALSE otherwise.
UBool
FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
    U_ASSERT(U_SUCCESS(errorCode));
    nfcImpl.decompose(s, normalized, errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    return TRUE;
}
526
527
U_NAMESPACE_END
528
529
#endif  // !UCONFIG_NO_COLLATION