Coverage Report

Created: 2025-11-07 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/utf16collationiterator.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2010-2014, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* utf16collationiterator.cpp
9
*
10
* created on: 2010oct27
11
* created by: Markus W. Scherer
12
*/
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_COLLATION
17
18
#include "charstr.h"
19
#include "cmemory.h"
20
#include "collation.h"
21
#include "collationdata.h"
22
#include "collationfcd.h"
23
#include "collationiterator.h"
24
#include "normalizer2impl.h"
25
#include "uassert.h"
26
#include "utf16collationiterator.h"
27
28
U_NAMESPACE_BEGIN
29
30
UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
31
                                               const char16_t *newText)
32
0
        : CollationIterator(other),
33
0
          start(newText),
34
0
          pos(newText + (other.pos - other.start)),
35
0
          limit(other.limit == nullptr ? nullptr : newText + (other.limit - other.start)) {
36
0
}
37
38
4.81M
UTF16CollationIterator::~UTF16CollationIterator() {}
39
40
bool
41
0
UTF16CollationIterator::operator==(const CollationIterator &other) const {
42
0
    if(!CollationIterator::operator==(other)) { return false; }
43
0
    const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
44
    // Compare the iterator state but not the text: Assume that the caller does that.
45
0
    return (pos - start) == (o.pos - o.start);
46
0
}
47
48
void
49
0
UTF16CollationIterator::resetToOffset(int32_t newOffset) {
50
0
    reset();
51
0
    pos = start + newOffset;
52
0
}
53
54
int32_t
55
0
UTF16CollationIterator::getOffset() const {
56
0
    return static_cast<int32_t>(pos - start);
57
0
}
58
59
uint32_t
60
10.2M
UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
61
10.2M
    if(pos == limit) {
62
4.79M
        c = U_SENTINEL;
63
4.79M
        return Collation::FALLBACK_CE32;
64
4.79M
    }
65
5.47M
    c = *pos++;
66
5.47M
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
67
10.2M
}
68
69
char16_t
70
949k
UTF16CollationIterator::handleGetTrailSurrogate() {
71
949k
    if(pos == limit) { return 0; }
72
947k
    char16_t trail;
73
947k
    if(U16_IS_TRAIL(trail = *pos)) { ++pos; }
74
947k
    return trail;
75
949k
}
76
77
UBool
78
58.2k
UTF16CollationIterator::foundNULTerminator() {
79
58.2k
    if(limit == nullptr) {
80
0
        limit = --pos;
81
0
        return true;
82
58.2k
    } else {
83
58.2k
        return false;
84
58.2k
    }
85
58.2k
}
86
87
UChar32
88
1.32M
UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
89
1.32M
    if(pos == limit) {
90
7.88k
        return U_SENTINEL;
91
7.88k
    }
92
1.31M
    UChar32 c = *pos;
93
1.31M
    if(c == 0 && limit == nullptr) {
94
0
        limit = pos;
95
0
        return U_SENTINEL;
96
0
    }
97
1.31M
    ++pos;
98
1.31M
    char16_t trail;
99
1.31M
    if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
100
5.94k
        ++pos;
101
5.94k
        return U16_GET_SUPPLEMENTARY(c, trail);
102
1.30M
    } else {
103
1.30M
        return c;
104
1.30M
    }
105
1.31M
}
106
107
UChar32
108
20.5k
UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
109
20.5k
    if(pos == start) {
110
663
        return U_SENTINEL;
111
663
    }
112
19.8k
    UChar32 c = *--pos;
113
19.8k
    char16_t lead;
114
19.8k
    if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
115
327
        --pos;
116
327
        return U16_GET_SUPPLEMENTARY(lead, c);
117
19.5k
    } else {
118
19.5k
        return c;
119
19.5k
    }
120
19.8k
}
121
122
void
123
30.0k
UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
124
65.0k
    while(num > 0 && pos != limit) {
125
34.9k
        UChar32 c = *pos;
126
34.9k
        if(c == 0 && limit == nullptr) {
127
0
            limit = pos;
128
0
            break;
129
0
        }
130
34.9k
        ++pos;
131
34.9k
        --num;
132
34.9k
        if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
133
531
            ++pos;
134
531
        }
135
34.9k
    }
136
30.0k
}
137
138
void
139
63.7k
UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
140
1.24M
    while(num > 0 && pos != start) {
141
1.17M
        UChar32 c = *--pos;
142
1.17M
        --num;
143
1.17M
        if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
144
4.31k
            --pos;
145
4.31k
        }
146
1.17M
    }
147
63.7k
}
148
149
// FCDUTF16CollationIterator ----------------------------------------------- ***
150
151
FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
152
                                                     const char16_t *newText)
153
0
        : UTF16CollationIterator(other),
154
0
          rawStart(newText),
155
0
          segmentStart(newText + (other.segmentStart - other.rawStart)),
156
0
          segmentLimit(other.segmentLimit == nullptr ? nullptr : newText + (other.segmentLimit - other.rawStart)),
157
0
          rawLimit(other.rawLimit == nullptr ? nullptr : newText + (other.rawLimit - other.rawStart)),
158
0
          nfcImpl(other.nfcImpl),
159
0
          normalized(other.normalized),
160
0
          checkDir(other.checkDir) {
161
0
    if(checkDir != 0 || other.start == other.segmentStart) {
162
0
        start = newText + (other.start - other.rawStart);
163
0
        pos = newText + (other.pos - other.rawStart);
164
0
        limit = other.limit == nullptr ? nullptr : newText + (other.limit - other.rawStart);
165
0
    } else {
166
0
        start = normalized.getBuffer();
167
0
        pos = start + (other.pos - other.start);
168
0
        limit = start + normalized.length();
169
0
    }
170
0
}
171
172
5.69k
FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {}
173
174
bool
175
0
FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
176
    // Skip the UTF16CollationIterator and call its parent.
177
0
    if(!CollationIterator::operator==(other)) { return false; }
178
0
    const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
179
    // Compare the iterator state but not the text: Assume that the caller does that.
180
0
    if(checkDir != o.checkDir) { return false; }
181
0
    if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return false; }
182
0
    if(checkDir != 0 || start == segmentStart) {
183
0
        return (pos - rawStart) == (o.pos - o.rawStart);
184
0
    } else {
185
0
        return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
186
0
                (pos - start) == (o.pos - o.start);
187
0
    }
188
0
}
189
190
void
191
0
FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
192
0
    reset();
193
0
    start = segmentStart = pos = rawStart + newOffset;
194
0
    limit = rawLimit;
195
0
    checkDir = 1;
196
0
}
197
198
int32_t
199
0
FCDUTF16CollationIterator::getOffset() const {
200
0
    if(checkDir != 0 || start == segmentStart) {
201
0
        return static_cast<int32_t>(pos - rawStart);
202
0
    } else if(pos == start) {
203
0
        return static_cast<int32_t>(segmentStart - rawStart);
204
0
    } else {
205
0
        return static_cast<int32_t>(segmentLimit - rawStart);
206
0
    }
207
0
}
208
209
uint32_t
210
297k
FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
211
321k
    for(;;) {
212
321k
        if(checkDir > 0) {
213
252k
            if(pos == limit) {
214
2.57k
                c = U_SENTINEL;
215
2.57k
                return Collation::FALLBACK_CE32;
216
2.57k
            }
217
249k
            c = *pos++;
218
249k
            if(CollationFCD::hasTccc(c)) {
219
30.9k
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
220
22.8k
                        (pos != limit && CollationFCD::hasLccc(*pos))) {
221
16.5k
                    --pos;
222
16.5k
                    if(!nextSegment(errorCode)) {
223
0
                        c = U_SENTINEL;
224
0
                        return Collation::FALLBACK_CE32;
225
0
                    }
226
16.5k
                    c = *pos++;
227
16.5k
                }
228
30.9k
            }
229
249k
            break;
230
249k
        } else if(checkDir == 0 && pos != limit) {
231
45.4k
            c = *pos++;
232
45.4k
            break;
233
45.4k
        } else {
234
23.7k
            switchToForward();
235
23.7k
        }
236
321k
    }
237
294k
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
238
297k
}
239
240
UBool
241
35.4k
FCDUTF16CollationIterator::foundNULTerminator() {
242
35.4k
    if(limit == nullptr) {
243
0
        limit = rawLimit = --pos;
244
0
        return true;
245
35.4k
    } else {
246
35.4k
        return false;
247
35.4k
    }
248
35.4k
}
249
250
UChar32
251
1.71M
FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
252
1.71M
    UChar32 c;
253
1.76M
    for(;;) {
254
1.76M
        if(checkDir > 0) {
255
133k
            if(pos == limit) {
256
43.6k
                return U_SENTINEL;
257
43.6k
            }
258
89.5k
            c = *pos++;
259
89.5k
            if(CollationFCD::hasTccc(c)) {
260
25.1k
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
261
19.3k
                        (pos != limit && CollationFCD::hasLccc(*pos))) {
262
17.4k
                    --pos;
263
17.4k
                    if(!nextSegment(errorCode)) {
264
0
                        return U_SENTINEL;
265
0
                    }
266
17.4k
                    c = *pos++;
267
17.4k
                }
268
64.3k
            } else if(c == 0 && limit == nullptr) {
269
0
                limit = rawLimit = --pos;
270
0
                return U_SENTINEL;
271
0
            }
272
89.5k
            break;
273
1.63M
        } else if(checkDir == 0 && pos != limit) {
274
1.58M
            c = *pos++;
275
1.58M
            break;
276
1.58M
        } else {
277
48.1k
            switchToForward();
278
48.1k
        }
279
1.76M
    }
280
1.67M
    char16_t trail;
281
1.67M
    if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
282
15.9k
        ++pos;
283
15.9k
        return U16_GET_SUPPLEMENTARY(c, trail);
284
1.65M
    } else {
285
1.65M
        return c;
286
1.65M
    }
287
1.67M
}
288
289
UChar32
290
1.36M
FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
291
1.36M
    UChar32 c;
292
1.43M
    for(;;) {
293
1.43M
        if(checkDir < 0) {
294
42.2k
            if(pos == start) {
295
36
                return U_SENTINEL;
296
36
            }
297
42.2k
            c = *--pos;
298
42.2k
            if(CollationFCD::hasLccc(c)) {
299
35.9k
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
300
33.9k
                        (pos != start && CollationFCD::hasTccc(*(pos - 1)))) {
301
33.9k
                    ++pos;
302
33.9k
                    if(!previousSegment(errorCode)) {
303
0
                        return U_SENTINEL;
304
0
                    }
305
33.9k
                    c = *--pos;
306
33.9k
                }
307
35.9k
            }
308
42.2k
            break;
309
1.38M
        } else if(checkDir == 0 && pos != start) {
310
1.32M
            c = *--pos;
311
1.32M
            break;
312
1.32M
        } else {
313
65.4k
            switchToBackward();
314
65.4k
        }
315
1.43M
    }
316
1.36M
    char16_t lead;
317
1.36M
    if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
318
15.4k
        --pos;
319
15.4k
        return U16_GET_SUPPLEMENTARY(lead, c);
320
1.35M
    } else {
321
1.35M
        return c;
322
1.35M
    }
323
1.36M
}
324
325
void
326
10.4k
FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
327
    // Specify the class to avoid a virtual-function indirection.
328
    // In Java, we would declare this class final.
329
23.0k
    while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
330
12.6k
        --num;
331
12.6k
    }
332
10.4k
}
333
334
void
335
182k
FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
336
    // Specify the class to avoid a virtual-function indirection.
337
    // In Java, we would declare this class final.
338
1.54M
    while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
339
1.36M
        --num;
340
1.36M
    }
341
182k
}
342
343
void
344
71.9k
FCDUTF16CollationIterator::switchToForward() {
345
71.9k
    U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
346
71.9k
    if(checkDir < 0) {
347
        // Turn around from backward checking.
348
1.40k
        start = segmentStart = pos;
349
1.40k
        if(pos == segmentLimit) {
350
0
            limit = rawLimit;
351
0
            checkDir = 1;  // Check forward.
352
1.40k
        } else {  // pos < segmentLimit
353
1.40k
            checkDir = 0;  // Stay in FCD segment.
354
1.40k
        }
355
70.5k
    } else {
356
        // Reached the end of the FCD segment.
357
70.5k
        if(start == segmentStart) {
358
            // The input text segment is FCD, extend it forward.
359
45.7k
        } else {
360
            // The input text segment needed to be normalized.
361
            // Switch to checking forward from it.
362
45.7k
            pos = start = segmentStart = segmentLimit;
363
            // Note: If this segment is at the end of the input text,
364
            // then it might help to return false to indicate that, so that
365
            // we do not have to re-check and normalize when we turn around and go backwards.
366
            // However, that would complicate the call sites for an optimization of an unusual case.
367
45.7k
        }
368
70.5k
        limit = rawLimit;
369
70.5k
        checkDir = 1;
370
70.5k
    }
371
71.9k
}
372
373
UBool
374
34.0k
FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
375
34.0k
    if(U_FAILURE(errorCode)) { return false; }
376
34.0k
    U_ASSERT(checkDir > 0 && pos != limit);
377
    // The input text [segmentStart..pos[ passes the FCD check.
378
34.0k
    const char16_t *p = pos;
379
34.0k
    uint8_t prevCC = 0;
380
62.5k
    for(;;) {
381
        // Fetch the next character's fcd16 value.
382
62.5k
        const char16_t *q = p;
383
62.5k
        uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit);
384
62.5k
        uint8_t leadCC = static_cast<uint8_t>(fcd16 >> 8);
385
62.5k
        if(leadCC == 0 && q != pos) {
386
            // FCD boundary before the [q, p[ character.
387
4.14k
            limit = segmentLimit = q;
388
4.14k
            break;
389
4.14k
        }
390
58.4k
        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
391
            // Fails FCD check. Find the next FCD boundary and normalize.
392
345k
            do {
393
345k
                q = p;
394
345k
            } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff);
395
25.7k
            if(!normalize(pos, q, errorCode)) { return false; }
396
25.7k
            pos = start;
397
25.7k
            break;
398
25.7k
        }
399
32.6k
        prevCC = static_cast<uint8_t>(fcd16);
400
32.6k
        if(p == rawLimit || prevCC == 0) {
401
            // FCD boundary after the last character.
402
4.16k
            limit = segmentLimit = p;
403
4.16k
            break;
404
4.16k
        }
405
32.6k
    }
406
34.0k
    U_ASSERT(pos != limit);
407
34.0k
    checkDir = 0;
408
34.0k
    return true;
409
34.0k
}
410
411
void
412
65.4k
FCDUTF16CollationIterator::switchToBackward() {
413
65.4k
    U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
414
65.4k
    if(checkDir > 0) {
415
        // Turn around from forward checking.
416
37.8k
        limit = segmentLimit = pos;
417
37.8k
        if(pos == segmentStart) {
418
7.81k
            start = rawStart;
419
7.81k
            checkDir = -1;  // Check backward.
420
30.0k
        } else {  // pos > segmentStart
421
30.0k
            checkDir = 0;  // Stay in FCD segment.
422
30.0k
        }
423
37.8k
    } else {
424
        // Reached the start of the FCD segment.
425
27.5k
        if(start == segmentStart) {
426
            // The input text segment is FCD, extend it backward.
427
16.6k
        } else {
428
            // The input text segment needed to be normalized.
429
            // Switch to checking backward from it.
430
10.8k
            pos = limit = segmentLimit = segmentStart;
431
10.8k
        }
432
27.5k
        start = rawStart;
433
27.5k
        checkDir = -1;
434
27.5k
    }
435
65.4k
}
436
437
UBool
438
33.9k
FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
439
33.9k
    if(U_FAILURE(errorCode)) { return false; }
440
33.9k
    U_ASSERT(checkDir < 0 && pos != start);
441
    // The input text [pos..segmentLimit[ passes the FCD check.
442
33.9k
    const char16_t *p = pos;
443
33.9k
    uint8_t nextCC = 0;
444
233k
    for(;;) {
445
        // Fetch the previous character's fcd16 value.
446
233k
        const char16_t *q = p;
447
233k
        uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p);
448
233k
        uint8_t trailCC = static_cast<uint8_t>(fcd16);
449
233k
        if(trailCC == 0 && q != pos) {
450
            // FCD boundary after the [p, q[ character.
451
924
            start = segmentStart = q;
452
924
            break;
453
924
        }
454
232k
        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
455
222k
                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
456
            // Fails FCD check. Find the previous FCD boundary and normalize.
457
3.74M
            do {
458
3.74M
                q = p;
459
3.74M
            } while(fcd16 > 0xff && p != rawStart &&
460
3.73M
                    (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0);
461
31.7k
            if(!normalize(q, pos, errorCode)) { return false; }
462
31.7k
            pos = limit;
463
31.7k
            break;
464
31.7k
        }
465
200k
        nextCC = static_cast<uint8_t>(fcd16 >> 8);
466
200k
        if(p == rawStart || nextCC == 0) {
467
            // FCD boundary before the following character.
468
1.27k
            start = segmentStart = p;
469
1.27k
            break;
470
1.27k
        }
471
200k
    }
472
33.9k
    U_ASSERT(pos != start);
473
33.9k
    checkDir = 0;
474
33.9k
    return true;
475
33.9k
}
476
477
UBool
478
57.4k
FCDUTF16CollationIterator::normalize(const char16_t *from, const char16_t *to, UErrorCode &errorCode) {
479
    // NFD without argument checking.
480
57.4k
    U_ASSERT(U_SUCCESS(errorCode));
481
57.4k
    nfcImpl.decompose(from, to, normalized, static_cast<int32_t>(to - from), errorCode);
482
57.4k
    if(U_FAILURE(errorCode)) { return false; }
483
    // Switch collation processing into the FCD buffer
484
    // with the result of normalizing [segmentStart, segmentLimit[.
485
57.4k
    segmentStart = from;
486
57.4k
    segmentLimit = to;
487
57.4k
    start = normalized.getBuffer();
488
57.4k
    limit = start + normalized.length();
489
57.4k
    return true;
490
57.4k
}
491
492
U_NAMESPACE_END
493
494
#endif  // !UCONFIG_NO_COLLATION