Coverage Report

Created: 2023-02-22 06:51

/src/icu/source/i18n/utf16collationiterator.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2010-2014, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* utf16collationiterator.cpp
9
*
10
* created on: 2010oct27
11
* created by: Markus W. Scherer
12
*/
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_COLLATION
17
18
#include "charstr.h"
19
#include "cmemory.h"
20
#include "collation.h"
21
#include "collationdata.h"
22
#include "collationfcd.h"
23
#include "collationiterator.h"
24
#include "normalizer2impl.h"
25
#include "uassert.h"
26
#include "utf16collationiterator.h"
27
28
U_NAMESPACE_BEGIN
29
30
// Copies the state of `other` but points the new iterator at `newText`.
// pos and limit keep the same distance from the text start that they had in `other`;
// a null limit in `other` stays null here.
UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
                                               const UChar *newText)
        : CollationIterator(other),
          start(newText),
          pos(newText + (other.pos - other.start)),
          limit(other.limit != nullptr ? newText + (other.limit - other.start) : nullptr) {
}
37
38
0
// Nothing to release here: the destructor body is intentionally empty.
UTF16CollationIterator::~UTF16CollationIterator() {
}
39
40
bool
41
0
UTF16CollationIterator::operator==(const CollationIterator &other) const {
42
0
    if(!CollationIterator::operator==(other)) { return FALSE; }
43
0
    const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
44
    // Compare the iterator state but not the text: Assume that the caller does that.
45
0
    return (pos - start) == (o.pos - o.start);
46
0
}
47
48
void
49
0
UTF16CollationIterator::resetToOffset(int32_t newOffset) {
50
0
    reset();
51
0
    pos = start + newOffset;
52
0
}
53
54
// Returns the distance, in code units, from start to pos.
int32_t
UTF16CollationIterator::getOffset() const {
    return static_cast<int32_t>(pos - start);
}
58
59
// Reads the code unit at pos into `c`, advances pos, and returns the trie value for it.
// At the limit, sets `c` to U_SENTINEL and returns Collation::FALLBACK_CE32 instead.
uint32_t
UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
    if(pos != limit) {
        c = *pos;
        ++pos;
        return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    }
    c = U_SENTINEL;
    return Collation::FALLBACK_CE32;
}
68
69
// Returns the code unit at pos, consuming it only if it is a trail surrogate.
// Returns 0 when pos is already at the limit.
UChar
UTF16CollationIterator::handleGetTrailSurrogate() {
    if(pos == limit) {
        return 0;
    }
    const UChar unit = *pos;
    if(U16_IS_TRAIL(unit)) {
        ++pos;
    }
    return unit;
}
76
77
// When no limit is known yet (limit is null), steps pos back by one code unit,
// records that position as the limit, and returns true. Otherwise returns false
// and leaves the state unchanged.
UBool
UTF16CollationIterator::foundNULTerminator() {
    if(limit != nullptr) {
        return false;
    }
    --pos;
    limit = pos;
    return true;
}
86
87
// Returns the next code point and advances pos past it, or U_SENTINEL at the end.
// With a null limit, a 0 code unit marks the end: limit is set to pos and pos is not advanced.
// A lead surrogate followed by a trail surrogate is combined into one supplementary code point.
UChar32
UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
    if(pos == limit) {
        return U_SENTINEL;
    }
    const UChar32 unit = *pos;
    if(unit == 0 && limit == nullptr) {
        limit = pos;
        return U_SENTINEL;
    }
    ++pos;
    if(!U16_IS_LEAD(unit) || pos == limit) {
        return unit;
    }
    const UChar next = *pos;
    if(!U16_IS_TRAIL(next)) {
        return unit;
    }
    ++pos;
    return U16_GET_SUPPLEMENTARY(unit, next);
}
106
107
// Returns the code point before pos and moves pos back over it, or U_SENTINEL at start.
// A trail surrogate preceded by a lead surrogate is combined into one supplementary code point.
UChar32
UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    if(pos == start) {
        return U_SENTINEL;
    }
    --pos;
    const UChar32 unit = *pos;
    if(U16_IS_TRAIL(unit) && pos != start) {
        const UChar before = *(pos - 1);
        if(U16_IS_LEAD(before)) {
            --pos;
            return U16_GET_SUPPLEMENTARY(before, unit);
        }
    }
    return unit;
}
121
122
void
123
0
UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
124
0
    while(num > 0 && pos != limit) {
125
0
        UChar32 c = *pos;
126
0
        if(c == 0 && limit == NULL) {
127
0
            limit = pos;
128
0
            break;
129
0
        }
130
0
        ++pos;
131
0
        --num;
132
0
        if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
133
0
            ++pos;
134
0
        }
135
0
    }
136
0
}
137
138
void
139
0
UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
140
0
    while(num > 0 && pos != start) {
141
0
        UChar32 c = *--pos;
142
0
        --num;
143
0
        if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
144
0
            --pos;
145
0
        }
146
0
    }
147
0
}
148
149
// FCDUTF16CollationIterator ----------------------------------------------- ***
150
151
// Copies the state of `other` but points the new iterator at `newText`.
// The raw and segment pointers keep their offsets relative to the raw text start.
// start/pos/limit either map into newText the same way, or, when `other` was
// iterating over its normalized buffer (checkDir == 0 and start != segmentStart),
// they point into this object's own copy of that buffer.
FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
                                                     const UChar *newText)
        : UTF16CollationIterator(other),
          rawStart(newText),
          segmentStart(newText + (other.segmentStart - other.rawStart)),
          segmentLimit(other.segmentLimit != nullptr ?
                       newText + (other.segmentLimit - other.rawStart) : nullptr),
          rawLimit(other.rawLimit != nullptr ?
                   newText + (other.rawLimit - other.rawStart) : nullptr),
          nfcImpl(other.nfcImpl),
          normalized(other.normalized),
          checkDir(other.checkDir) {
    if(checkDir == 0 && other.start != other.segmentStart) {
        // `other` was reading from its normalized buffer: use our copy of it.
        start = normalized.getBuffer();
        pos = start + (other.pos - other.start);
        limit = start + normalized.length();
    } else {
        // `other` was reading from the input text: translate the pointers into newText.
        start = newText + (other.start - other.rawStart);
        pos = newText + (other.pos - other.rawStart);
        if(other.limit != nullptr) {
            limit = newText + (other.limit - other.rawStart);
        } else {
            limit = nullptr;
        }
    }
}
171
172
0
// Nothing to release here: the destructor body is intentionally empty.
FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {
}
173
174
bool
175
0
FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
176
    // Skip the UTF16CollationIterator and call its parent.
177
0
    if(!CollationIterator::operator==(other)) { return FALSE; }
178
0
    const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
179
    // Compare the iterator state but not the text: Assume that the caller does that.
180
0
    if(checkDir != o.checkDir) { return FALSE; }
181
0
    if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; }
182
0
    if(checkDir != 0 || start == segmentStart) {
183
0
        return (pos - rawStart) == (o.pos - o.rawStart);
184
0
    } else {
185
0
        return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
186
0
                (pos - start) == (o.pos - o.start);
187
0
    }
188
0
}
189
190
void
191
0
FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
192
0
    reset();
193
0
    start = segmentStart = pos = rawStart + newOffset;
194
0
    limit = rawLimit;
195
0
    checkDir = 1;
196
0
}
197
198
// Returns an offset relative to rawStart.
// While reading the input text (checkDir != 0 or start == segmentStart) this is pos itself.
// Otherwise it is segmentStart when pos is at start, and segmentLimit for any other pos.
int32_t
FCDUTF16CollationIterator::getOffset() const {
    const UChar *where;
    if(checkDir != 0 || start == segmentStart) {
        where = pos;
    } else if(pos == start) {
        where = segmentStart;
    } else {
        where = segmentLimit;
    }
    return static_cast<int32_t>(where - rawStart);
}
208
209
// Reads the next code unit into `c` and returns its trie value.
// At the end of the text, or if nextSegment() fails, sets `c` to U_SENTINEL and
// returns Collation::FALLBACK_CE32.
uint32_t
FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    // Turn around, or leave an exhausted segment, until we can read forward.
    while(checkDir < 0 || (checkDir == 0 && pos == limit)) {
        switchToForward();
    }
    if(checkDir > 0) {
        if(pos == limit) {
            c = U_SENTINEL;
            return Collation::FALLBACK_CE32;
        }
        c = *pos++;
        if(CollationFCD::hasTccc(c) &&
                (CollationFCD::maybeTibetanCompositeVowel(c) ||
                 (pos != limit && CollationFCD::hasLccc(*pos)))) {
            // Un-read the unit and let nextSegment() set up the segment to read from.
            --pos;
            if(!nextSegment(errorCode)) {
                c = U_SENTINEL;
                return Collation::FALLBACK_CE32;
            }
            c = *pos++;
        }
    } else {
        // checkDir == 0 and pos != limit: read straight from the current segment.
        c = *pos++;
    }
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
}
239
240
// When no limit is known yet (limit is null), steps pos back by one code unit,
// records that position as both rawLimit and limit, and returns true.
// Otherwise returns false and leaves the state unchanged.
UBool
FCDUTF16CollationIterator::foundNULTerminator() {
    if(limit != nullptr) {
        return false;
    }
    --pos;
    rawLimit = pos;
    limit = pos;
    return true;
}
249
250
// Returns the next code point and advances past it.
// Returns U_SENTINEL at the end of the text or if nextSegment() fails.
UChar32
FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    // Turn around, or leave an exhausted segment, until we can read forward.
    while(checkDir < 0 || (checkDir == 0 && pos == limit)) {
        switchToForward();
    }
    UChar32 c;
    if(checkDir > 0) {
        if(pos == limit) {
            return U_SENTINEL;
        }
        c = *pos++;
        if(CollationFCD::hasTccc(c)) {
            if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                    (pos != limit && CollationFCD::hasLccc(*pos))) {
                // Un-read the unit and let nextSegment() set up the segment to read from.
                --pos;
                if(!nextSegment(errorCode)) {
                    return U_SENTINEL;
                }
                c = *pos++;
            }
        } else if(c == 0 && limit == nullptr) {
            // A 0 unit with no known limit ends the text: record the limit here.
            limit = rawLimit = --pos;
            return U_SENTINEL;
        }
    } else {
        // checkDir == 0 and pos != limit: read straight from the current segment.
        c = *pos++;
    }
    // Combine a lead surrogate with a following trail surrogate.
    if(!U16_IS_LEAD(c) || pos == limit) {
        return c;
    }
    const UChar trail = *pos;
    if(!U16_IS_TRAIL(trail)) {
        return c;
    }
    ++pos;
    return U16_GET_SUPPLEMENTARY(c, trail);
}
288
289
// Returns the code point before the current position and moves back over it.
// Returns U_SENTINEL at the start of the text or if previousSegment() fails.
UChar32
FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    // Turn around, or leave an exhausted segment, until we can read backward.
    while(checkDir > 0 || (checkDir == 0 && pos == start)) {
        switchToBackward();
    }
    UChar32 c;
    if(checkDir < 0) {
        if(pos == start) {
            return U_SENTINEL;
        }
        c = *--pos;
        if(CollationFCD::hasLccc(c) &&
                (CollationFCD::maybeTibetanCompositeVowel(c) ||
                 (pos != start && CollationFCD::hasTccc(*(pos - 1))))) {
            // Un-read the unit and let previousSegment() set up the segment to read from.
            ++pos;
            if(!previousSegment(errorCode)) {
                return U_SENTINEL;
            }
            c = *--pos;
        }
    } else {
        // checkDir == 0 and pos != start: read straight from the current segment.
        c = *--pos;
    }
    // Combine a trail surrogate with a preceding lead surrogate.
    if(U16_IS_TRAIL(c) && pos != start) {
        const UChar lead = *(pos - 1);
        if(U16_IS_LEAD(lead)) {
            --pos;
            return U16_GET_SUPPLEMENTARY(lead, c);
        }
    }
    return c;
}
324
325
void
326
0
FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
327
    // Specify the class to avoid a virtual-function indirection.
328
    // In Java, we would declare this class final.
329
0
    while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
330
0
        --num;
331
0
    }
332
0
}
333
334
void
335
0
FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
336
    // Specify the class to avoid a virtual-function indirection.
337
    // In Java, we would declare this class final.
338
0
    while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
339
0
        --num;
340
0
    }
341
0
}
342
343
void
344
0
FCDUTF16CollationIterator::switchToForward() {
345
0
    U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
346
0
    if(checkDir < 0) {
347
        // Turn around from backward checking.
348
0
        start = segmentStart = pos;
349
0
        if(pos == segmentLimit) {
350
0
            limit = rawLimit;
351
0
            checkDir = 1;  // Check forward.
352
0
        } else {  // pos < segmentLimit
353
0
            checkDir = 0;  // Stay in FCD segment.
354
0
        }
355
0
    } else {
356
        // Reached the end of the FCD segment.
357
0
        if(start == segmentStart) {
358
            // The input text segment is FCD, extend it forward.
359
0
        } else {
360
            // The input text segment needed to be normalized.
361
            // Switch to checking forward from it.
362
0
            pos = start = segmentStart = segmentLimit;
363
            // Note: If this segment is at the end of the input text,
364
            // then it might help to return FALSE to indicate that, so that
365
            // we do not have to re-check and normalize when we turn around and go backwards.
366
            // However, that would complicate the call sites for an optimization of an unusual case.
367
0
        }
368
0
        limit = rawLimit;
369
0
        checkDir = 1;
370
0
    }
371
0
}
372
373
// Scans forward from pos to delimit the next segment to read from.
// If the scanned text passes the FCD check, limit/segmentLimit are set to the
// boundary found. If it fails, the text up to the next boundary is passed to
// normalize() and pos is moved to the new start.
// On success sets checkDir to 0 and returns true; returns false if errorCode
// already indicates failure or normalize() fails.
UBool
FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return false;
    }
    U_ASSERT(checkDir > 0 && pos != limit);
    // The input text [segmentStart..pos[ passes the FCD check.
    const UChar *scan = pos;
    uint8_t prevCC = 0;
    for(;;) {
        // Fetch the next character's fcd16 value.
        const UChar *charStart = scan;
        const uint16_t fcd16 = nfcImpl.nextFCD16(scan, rawLimit);
        const uint8_t leadCC = static_cast<uint8_t>(fcd16 >> 8);
        if(leadCC == 0) {
            if(charStart != pos) {
                // FCD boundary before the [charStart, scan[ character.
                limit = segmentLimit = charStart;
                break;
            }
        } else if(prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16)) {
            // Fails FCD check. Find the next FCD boundary and normalize.
            do {
                charStart = scan;
            } while(scan != rawLimit && nfcImpl.nextFCD16(scan, rawLimit) > 0xff);
            if(!normalize(pos, charStart, errorCode)) {
                return false;
            }
            pos = start;
            break;
        }
        prevCC = static_cast<uint8_t>(fcd16);
        if(scan == rawLimit || prevCC == 0) {
            // FCD boundary after the last character.
            limit = segmentLimit = scan;
            break;
        }
    }
    U_ASSERT(pos != limit);
    checkDir = 0;
    return true;
}
410
411
void
412
0
FCDUTF16CollationIterator::switchToBackward() {
413
0
    U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
414
0
    if(checkDir > 0) {
415
        // Turn around from forward checking.
416
0
        limit = segmentLimit = pos;
417
0
        if(pos == segmentStart) {
418
0
            start = rawStart;
419
0
            checkDir = -1;  // Check backward.
420
0
        } else {  // pos > segmentStart
421
0
            checkDir = 0;  // Stay in FCD segment.
422
0
        }
423
0
    } else {
424
        // Reached the start of the FCD segment.
425
0
        if(start == segmentStart) {
426
            // The input text segment is FCD, extend it backward.
427
0
        } else {
428
            // The input text segment needed to be normalized.
429
            // Switch to checking backward from it.
430
0
            pos = limit = segmentLimit = segmentStart;
431
0
        }
432
0
        start = rawStart;
433
0
        checkDir = -1;
434
0
    }
435
0
}
436
437
// Scans backward from pos to delimit the previous segment to read from.
// If the scanned text passes the FCD check, start/segmentStart are set to the
// boundary found. If it fails, the text back to the previous boundary is passed
// to normalize() and pos is moved to the new limit.
// On success sets checkDir to 0 and returns true; returns false if errorCode
// already indicates failure or normalize() fails.
UBool
FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return false;
    }
    U_ASSERT(checkDir < 0 && pos != start);
    // The input text [pos..segmentLimit[ passes the FCD check.
    const UChar *scan = pos;
    uint8_t nextCC = 0;
    for(;;) {
        // Fetch the previous character's fcd16 value.
        const UChar *charLimit = scan;
        uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, scan);
        const uint8_t trailCC = static_cast<uint8_t>(fcd16);
        if(trailCC == 0) {
            if(charLimit != pos) {
                // FCD boundary after the [scan, charLimit[ character.
                start = segmentStart = charLimit;
                break;
            }
        } else if((nextCC != 0 && trailCC > nextCC) ||
                  CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16)) {
            // Fails FCD check. Find the previous FCD boundary and normalize.
            do {
                charLimit = scan;
            } while(fcd16 > 0xff && scan != rawStart &&
                    (fcd16 = nfcImpl.previousFCD16(rawStart, scan)) != 0);
            if(!normalize(charLimit, pos, errorCode)) {
                return false;
            }
            pos = limit;
            break;
        }
        nextCC = static_cast<uint8_t>(fcd16 >> 8);
        if(scan == rawStart || nextCC == 0) {
            // FCD boundary before the following character.
            start = segmentStart = scan;
            break;
        }
    }
    U_ASSERT(pos != start);
    checkDir = 0;
    return true;
}
476
477
// Decomposes the input text [from, to[ into the `normalized` buffer.
// On success records [from, to[ as the current segment, points start/limit at
// the buffer contents and returns true. Returns false, leaving those pointers
// untouched, if decomposition reports a failure in errorCode.
UBool
FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) {
    // NFD without argument checking.
    U_ASSERT(U_SUCCESS(errorCode));
    const int32_t inputLength = static_cast<int32_t>(to - from);
    nfcImpl.decompose(from, to, normalized, inputLength, errorCode);
    if(U_SUCCESS(errorCode)) {
        // Switch collation processing into the FCD buffer
        // with the result of normalizing [segmentStart, segmentLimit[.
        segmentStart = from;
        segmentLimit = to;
        start = normalized.getBuffer();
        limit = start + normalized.length();
        return true;
    }
    return false;
}
491
492
U_NAMESPACE_END
493
494
#endif  // !UCONFIG_NO_COLLATION