Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/i18n/coleitr.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 1996-2014, International Business Machines Corporation and
6
* others. All Rights Reserved.
7
*******************************************************************************
8
*/
9
10
/*
11
* File coleitr.cpp
12
*
13
* Created by: Helena Shih
14
*
15
* Modification History:
16
*
17
*  Date      Name        Description
18
*
19
*  6/23/97   helena      Adding comments to make code more readable.
20
* 08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
21
* 12/10/99   aliu        Ported Thai collation support from Java.
22
* 01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
23
* 02/19/01   swquek      Removed CollationElementIterator() since it is 
24
*                        private constructor and no calls are made to it
25
* 2012-2014  markus      Rewritten in C++ again.
26
*/
27
28
#include "unicode/utypes.h"
29
30
#if !UCONFIG_NO_COLLATION
31
32
#include "unicode/chariter.h"
33
#include "unicode/coleitr.h"
34
#include "unicode/tblcoll.h"
35
#include "unicode/ustring.h"
36
#include "cmemory.h"
37
#include "collation.h"
38
#include "collationdata.h"
39
#include "collationiterator.h"
40
#include "collationsets.h"
41
#include "collationtailoring.h"
42
#include "uassert.h"
43
#include "uhash.h"
44
#include "utf16collationiterator.h"
45
#include "uvectr32.h"
46
47
/* Constants --------------------------------------------------------------- */
48
49
U_NAMESPACE_BEGIN
50
51
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
52
53
/* CollationElementIterator public constructor/destructor ------------------ */
54
55
/**
 * Copy constructor.
 * Starts from an empty state (no iterator, no collator, no pending half,
 * neutral direction, no offsets) and then lets operator= do the copying.
 */
CollationElementIterator::CollationElementIterator(const CollationElementIterator& other)
        : UObject(other),
          iter_(NULL),
          rbc_(NULL),
          otherHalf_(0),
          dir_(0),
          offsets_(NULL) {
    operator=(other);
}
60
61
/** Destructor: deletes the owned collation iterator and the offsets vector. */
CollationElementIterator::~CollationElementIterator() {
    delete iter_;
    delete offsets_;
}
66
67
/* CollationElementIterator public methods --------------------------------- */
68
69
namespace {
70
71
0
/**
 * Builds the first old-style 32-bit CE from the two halves of a 64-bit CE.
 *
 * @param p        upper 32 bits of the 64-bit CE
 * @param lower32  lower 32 bits of the 64-bit CE
 * @return bits 31..16 of p, followed by bits 31..24 of lower32 (placed in
 *         bits 15..8) and bits 15..8 of lower32 (placed in bits 7..0)
 */
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    const uint32_t upper16 = p & 0xffff0000;
    const uint32_t middle8 = (lower32 >> 16) & 0xff00;
    const uint32_t low8 = (lower32 >> 8) & 0xff;
    return upper16 | middle8 | low8;
}
74
0
/**
 * Builds the second old-style 32-bit CE from the two halves of a 64-bit CE.
 * The result is 0 when none of the contributing bits are set.
 *
 * @param p        upper 32 bits of the 64-bit CE
 * @param lower32  lower 32 bits of the 64-bit CE
 * @return bits 15..0 of p (placed in bits 31..16), followed by bits 23..16 of
 *         lower32 (placed in bits 15..8) and bits 5..0 of lower32
 */
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    const uint32_t upper16 = p << 16;
    const uint32_t middle8 = (lower32 >> 8) & 0xff00;
    const uint32_t low6 = lower32 & 0x3f;
    return upper16 | middle8 | low6;
}
77
0
/**
 * Tells whether a 64-bit CE has any of the bits set that getSecondHalf() reads
 * (bits 47..32, 23..16 and 5..0).
 */
UBool ceNeedsTwoParts(int64_t ce) {
    const int64_t secondHalfBits = INT64_C(0xffff00ff003f);
    return (ce & secondHalfBits) != 0;
}
80
81
}  // namespace
82
83
int32_t CollationElementIterator::getOffset() const
84
0
{
85
0
    if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
86
        // CollationIterator::previousCE() decrements the CEs length
87
        // while it pops CEs from its internal buffer.
88
0
        int32_t i = iter_->getCEsLength();
89
0
        if (otherHalf_ != 0) {
90
            // Return the trailing CE offset while we are in the middle of a 64-bit CE.
91
0
            ++i;
92
0
        }
93
0
        U_ASSERT(i < offsets_->size());
94
0
        return offsets_->elementAti(i);
95
0
    }
96
0
    return iter_->getOffset();
97
0
}
98
99
/**
100
* Get the ordering priority of the next character in the string.
101
* @return the next character's ordering. Returns NULLORDER if an error has 
102
*         occurred or if the end of string has been reached
103
*/
104
int32_t CollationElementIterator::next(UErrorCode& status)
105
0
{
106
0
    if (U_FAILURE(status)) { return NULLORDER; }
107
0
    if (dir_ > 1) {
108
        // Continue forward iteration. Test this first.
109
0
        if (otherHalf_ != 0) {
110
0
            uint32_t oh = otherHalf_;
111
0
            otherHalf_ = 0;
112
0
            return oh;
113
0
        }
114
0
    } else if (dir_ == 1) {
115
        // next() after setOffset()
116
0
        dir_ = 2;
117
0
    } else if (dir_ == 0) {
118
        // The iter_ is already reset to the start of the text.
119
0
        dir_ = 2;
120
0
    } else /* dir_ < 0 */ {
121
        // illegal change of direction
122
0
        status = U_INVALID_STATE_ERROR;
123
0
        return NULLORDER;
124
0
    }
125
    // No need to keep all CEs in the buffer when we iterate.
126
0
    iter_->clearCEsIfNoneRemaining();
127
0
    int64_t ce = iter_->nextCE(status);
128
0
    if (ce == Collation::NO_CE) { return NULLORDER; }
129
    // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
130
0
    uint32_t p = (uint32_t)(ce >> 32);
131
0
    uint32_t lower32 = (uint32_t)ce;
132
0
    uint32_t firstHalf = getFirstHalf(p, lower32);
133
0
    uint32_t secondHalf = getSecondHalf(p, lower32);
134
0
    if (secondHalf != 0) {
135
0
        otherHalf_ = secondHalf | 0xc0;  // continuation CE
136
0
    }
137
0
    return firstHalf;
138
0
}
139
140
bool CollationElementIterator::operator!=(
141
                                  const CollationElementIterator& other) const
142
0
{
143
0
    return !(*this == other);
144
0
}
145
146
bool CollationElementIterator::operator==(
147
                                    const CollationElementIterator& that) const
148
0
{
149
0
    if (this == &that) {
150
0
        return TRUE;
151
0
    }
152
153
0
    return
154
0
        (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
155
0
        otherHalf_ == that.otherHalf_ &&
156
0
        normalizeDir() == that.normalizeDir() &&
157
0
        string_ == that.string_ &&
158
0
        *iter_ == *that.iter_;
159
0
}
160
161
/**
162
* Get the ordering priority of the previous collation element in the string.
163
* @param status the error code status.
164
* @return the previous element's ordering. Returns NULLORDER if an error has 
165
*         occurred or if the start of string has been reached.
166
*/
167
int32_t CollationElementIterator::previous(UErrorCode& status)
168
0
{
169
0
    if (U_FAILURE(status)) { return NULLORDER; }
170
0
    if (dir_ < 0) {
171
        // Continue backwards iteration. Test this first.
172
0
        if (otherHalf_ != 0) {
173
0
            uint32_t oh = otherHalf_;
174
0
            otherHalf_ = 0;
175
0
            return oh;
176
0
        }
177
0
    } else if (dir_ == 0) {
178
0
        iter_->resetToOffset(string_.length());
179
0
        dir_ = -1;
180
0
    } else if (dir_ == 1) {
181
        // previous() after setOffset()
182
0
        dir_ = -1;
183
0
    } else /* dir_ > 1 */ {
184
        // illegal change of direction
185
0
        status = U_INVALID_STATE_ERROR;
186
0
        return NULLORDER;
187
0
    }
188
0
    if (offsets_ == NULL) {
189
0
        offsets_ = new UVector32(status);
190
0
        if (offsets_ == NULL) {
191
0
            status = U_MEMORY_ALLOCATION_ERROR;
192
0
            return NULLORDER;
193
0
        }
194
0
    }
195
    // If we already have expansion CEs, then we also have offsets.
196
    // Otherwise remember the trailing offset in case we need to
197
    // write offsets for an artificial expansion.
198
0
    int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
199
0
    int64_t ce = iter_->previousCE(*offsets_, status);
200
0
    if (ce == Collation::NO_CE) { return NULLORDER; }
201
    // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
202
0
    uint32_t p = (uint32_t)(ce >> 32);
203
0
    uint32_t lower32 = (uint32_t)ce;
204
0
    uint32_t firstHalf = getFirstHalf(p, lower32);
205
0
    uint32_t secondHalf = getSecondHalf(p, lower32);
206
0
    if (secondHalf != 0) {
207
0
        if (offsets_->isEmpty()) {
208
            // When we convert a single 64-bit CE into two 32-bit CEs,
209
            // we need to make this artificial expansion behave like a normal expansion.
210
            // See CollationIterator::previousCE().
211
0
            offsets_->addElement(iter_->getOffset(), status);
212
0
            offsets_->addElement(limitOffset, status);
213
0
        }
214
0
        otherHalf_ = firstHalf;
215
0
        return secondHalf | 0xc0;  // continuation CE
216
0
    }
217
0
    return firstHalf;
218
0
}
219
220
/**
221
* Resets the cursor to the beginning of the string.
222
*/
223
void CollationElementIterator::reset()
224
0
{
225
0
    iter_ ->resetToOffset(0);
226
0
    otherHalf_ = 0;
227
0
    dir_ = 0;
228
0
}
229
230
void CollationElementIterator::setOffset(int32_t newOffset, 
231
                                         UErrorCode& status)
232
0
{
233
0
    if (U_FAILURE(status)) { return; }
234
0
    if (0 < newOffset && newOffset < string_.length()) {
235
0
        int32_t offset = newOffset;
236
0
        do {
237
0
            UChar c = string_.charAt(offset);
238
0
            if (!rbc_->isUnsafe(c) ||
239
0
                    (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
240
0
                break;
241
0
            }
242
            // Back up to before this unsafe character.
243
0
            --offset;
244
0
        } while (offset > 0);
245
0
        if (offset < newOffset) {
246
            // We might have backed up more than necessary.
247
            // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
248
            // but for text "chu" setOffset(2) should remain at 2
249
            // although we initially back up to offset 0.
250
            // Find the last safe offset no greater than newOffset by iterating forward.
251
0
            int32_t lastSafeOffset = offset;
252
0
            do {
253
0
                iter_->resetToOffset(lastSafeOffset);
254
0
                do {
255
0
                    iter_->nextCE(status);
256
0
                    if (U_FAILURE(status)) { return; }
257
0
                } while ((offset = iter_->getOffset()) == lastSafeOffset);
258
0
                if (offset <= newOffset) {
259
0
                    lastSafeOffset = offset;
260
0
                }
261
0
            } while (offset < newOffset);
262
0
            newOffset = lastSafeOffset;
263
0
        }
264
0
    }
265
0
    iter_->resetToOffset(newOffset);
266
0
    otherHalf_ = 0;
267
0
    dir_ = 1;
268
0
}
269
270
/**
271
* Sets the source to the new source string.
272
*/
273
void CollationElementIterator::setText(const UnicodeString& source,
274
                                       UErrorCode& status)
275
0
{
276
0
    if (U_FAILURE(status)) {
277
0
        return;
278
0
    }
279
280
0
    string_ = source;
281
0
    const UChar *s = string_.getBuffer();
282
0
    CollationIterator *newIter;
283
0
    UBool numeric = rbc_->settings->isNumeric();
284
0
    if (rbc_->settings->dontCheckFCD()) {
285
0
        newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
286
0
    } else {
287
0
        newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
288
0
    }
289
0
    if (newIter == NULL) {
290
0
        status = U_MEMORY_ALLOCATION_ERROR;
291
0
        return;
292
0
    }
293
0
    delete iter_;
294
0
    iter_ = newIter;
295
0
    otherHalf_ = 0;
296
0
    dir_ = 0;
297
0
}
298
299
// Sets the source to the new character iterator.
300
void CollationElementIterator::setText(CharacterIterator& source, 
301
                                       UErrorCode& status)
302
0
{
303
0
    if (U_FAILURE(status)) 
304
0
        return;
305
306
0
    source.getText(string_);
307
0
    setText(string_, status);
308
0
}
309
310
int32_t CollationElementIterator::strengthOrder(int32_t order) const
311
0
{
312
0
    UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
313
    // Mask off the unwanted differences.
314
0
    if (s == UCOL_PRIMARY) {
315
0
        order &= 0xffff0000;
316
0
    }
317
0
    else if (s == UCOL_SECONDARY) {
318
0
        order &= 0xffffff00;
319
0
    }
320
321
0
    return order;
322
0
}
323
324
/* CollationElementIterator private constructors/destructors --------------- */
325
326
/** 
327
* This is the "real" constructor for this class; it constructs an iterator
328
* over the source text using the specified collator
329
*/
330
CollationElementIterator::CollationElementIterator(
331
                                               const UnicodeString &source,
332
                                               const RuleBasedCollator *coll,
333
                                               UErrorCode &status)
334
0
        : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
335
0
    setText(source, status);
336
0
}
337
338
/** 
339
* This is the "real" constructor for this class; it constructs an iterator over 
340
* the source text using the specified collator
341
*/
342
CollationElementIterator::CollationElementIterator(
343
                                           const CharacterIterator &source,
344
                                           const RuleBasedCollator *coll,
345
                                           UErrorCode &status)
346
0
        : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
347
    // We only call source.getText() which should be const anyway.
348
0
    setText(const_cast<CharacterIterator &>(source), status);
349
0
}
350
351
/* CollationElementIterator private methods -------------------------------- */
352
353
/**
 * Assignment operator: makes this iterator a copy of other, including its text,
 * collator, pending half CE, direction and backward-iteration offsets.
 *
 * If the underlying iterator of other cannot be cloned (it is NULL, it is of
 * neither known concrete type, or the allocation fails), this object is left
 * unchanged.
 *
 * @param other the iterator to copy from
 * @return *this
 */
const CollationElementIterator& CollationElementIterator::operator=(
                                         const CollationElementIterator& other)
{
    if (this == &other) {
        return *this;
    }

    // The cloned iterator is handed the buffer of string_ as its text, so string_
    // must already hold other's text when that buffer is taken. Taking the buffer
    // first and assigning string_ afterwards would bind the clone to a buffer that
    // the assignment may release or overwrite.
    // Keep the previous text so that a failed clone can be rolled back.
    // NOTE(review): the rollback assumes that re-assigning oldString gives string_
    // the same buffer the current iter_ was built on - confirm against UnicodeString.
    UnicodeString oldString(string_);
    string_ = other.string_;
    const UChar *text = string_.getBuffer();

    // Clone as the most derived known type: check the FCD iterator type first.
    CollationIterator *newIter = NULL;
    const FCDUTF16CollationIterator *otherFCDIter =
            dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
    if (otherFCDIter != NULL) {
        newIter = new FCDUTF16CollationIterator(*otherFCDIter, text);
    } else {
        const UTF16CollationIterator *otherIter =
                dynamic_cast<const UTF16CollationIterator *>(other.iter_);
        if (otherIter != NULL) {
            newIter = new UTF16CollationIterator(*otherIter, text);
        }
    }
    if (newIter == NULL) {
        // Nothing was cloned: restore the text and keep the old state as a whole.
        string_ = oldString;
        return *this;
    }

    delete iter_;
    iter_ = newIter;
    rbc_ = other.rbc_;
    otherHalf_ = other.otherHalf_;
    dir_ = other.dir_;

    if (other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
        // Failures while copying the offsets are not reported; there is no status parameter.
        UErrorCode errorCode = U_ZERO_ERROR;
        if (offsets_ == NULL) {
            offsets_ = new UVector32(other.offsets_->size(), errorCode);
        }
        if (offsets_ != NULL) {
            offsets_->assign(*other.offsets_, errorCode);
        }
    } else if (offsets_ != NULL) {
        // other has no offsets in use: do not let getOffset() see our stale ones.
        offsets_->removeAllElements();
    }
    return *this;
}
394
395
namespace {
396
397
class MaxExpSink : public ContractionsAndExpansions::CESink {
398
public:
399
0
    MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
400
    virtual ~MaxExpSink();
401
0
    virtual void handleCE(int64_t /*ce*/) {}
402
0
    virtual void handleExpansion(const int64_t ces[], int32_t length) {
403
0
        if (length <= 1) {
404
            // We do not need to add single CEs into the map.
405
0
            return;
406
0
        }
407
0
        int32_t count = 0;  // number of CE "halves"
408
0
        for (int32_t i = 0; i < length; ++i) {
409
0
            count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
410
0
        }
411
        // last "half" of the last CE
412
0
        int64_t ce = ces[length - 1];
413
0
        uint32_t p = (uint32_t)(ce >> 32);
414
0
        uint32_t lower32 = (uint32_t)ce;
415
0
        uint32_t lastHalf = getSecondHalf(p, lower32);
416
0
        if (lastHalf == 0) {
417
0
            lastHalf = getFirstHalf(p, lower32);
418
0
            U_ASSERT(lastHalf != 0);
419
0
        } else {
420
0
            lastHalf |= 0xc0;  // old-style continuation CE
421
0
        }
422
0
        if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
423
0
            uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
424
0
        }
425
0
    }
426
427
private:
428
    UHashtable *maxExpansions;
429
    UErrorCode &errorCode;
430
};
431
432
MaxExpSink::~MaxExpSink() {}
433
434
}  // namespace
435
436
/**
 * Builds the hash table of maximum expansion sizes for the given collation data.
 * @param data the collation data to enumerate
 * @param errorCode the error code status
 * @return the new hash table, or NULL if errorCode was or became a failure
 */
UHashtable *
CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return NULL; }

    UHashtable *table = uhash_open(uhash_hashLong, uhash_compareLong,
                                   uhash_compareLong, &errorCode);
    if (U_FAILURE(errorCode)) { return NULL; }

    // Let the sink collect the expansions while the data is enumerated.
    MaxExpSink sink(table, errorCode);
    ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
    if (!U_FAILURE(errorCode)) {
        return table;
    }
    uhash_close(table);
    return NULL;
}
450
451
int32_t
452
0
CollationElementIterator::getMaxExpansion(int32_t order) const {
453
0
    return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
454
0
}
455
456
int32_t
457
0
CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
458
0
    if (order == 0) { return 1; }
459
0
    int32_t max;
460
0
    if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
461
0
        return max;
462
0
    }
463
0
    if ((order & 0xc0) == 0xc0) {
464
        // old-style continuation CE
465
0
        return 2;
466
0
    } else {
467
0
        return 1;
468
0
    }
469
0
}
470
471
U_NAMESPACE_END
472
473
#endif /* #if !UCONFIG_NO_COLLATION */