Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/i18n/collationcompare.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 1996-2015, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationcompare.cpp
9
*
10
* created on: 2012feb14 with new and old collation code
11
* created by: Markus W. Scherer
12
*/
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_COLLATION
17
18
#include "unicode/ucol.h"
19
#include "cmemory.h"
20
#include "collation.h"
21
#include "collationcompare.h"
22
#include "collationiterator.h"
23
#include "collationsettings.h"
24
#include "uassert.h"
25
26
U_NAMESPACE_BEGIN
27
28
UCollationResult
29
CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
30
                                        const CollationSettings &settings,
31
0
                                        UErrorCode &errorCode) {
32
0
    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
33
34
0
    int32_t options = settings.options;
35
0
    uint32_t variableTop;
36
0
    if((options & CollationSettings::ALTERNATE_MASK) == 0) {
37
0
        variableTop = 0;
38
0
    } else {
39
        // +1 so that we can use "<" and primary ignorables test out early.
40
0
        variableTop = settings.variableTop + 1;
41
0
    }
42
0
    UBool anyVariable = FALSE;
43
44
    // Fetch CEs, compare primaries, store secondary & tertiary weights.
45
0
    for(;;) {
46
        // We fetch CEs until we get a non-ignorable primary or reach the end.
47
0
        uint32_t leftPrimary;
48
0
        do {
49
0
            int64_t ce = left.nextCE(errorCode);
50
0
            leftPrimary = (uint32_t)(ce >> 32);
51
0
            if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
52
                // Variable CE, shift it to quaternary level.
53
                // Ignore all following primary ignorables, and shift further variable CEs.
54
0
                anyVariable = TRUE;
55
0
                do {
56
                    // Store only the primary of the variable CE.
57
0
                    left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
58
0
                    for(;;) {
59
0
                        ce = left.nextCE(errorCode);
60
0
                        leftPrimary = (uint32_t)(ce >> 32);
61
0
                        if(leftPrimary == 0) {
62
0
                            left.setCurrentCE(0);
63
0
                        } else {
64
0
                            break;
65
0
                        }
66
0
                    }
67
0
                } while(leftPrimary < variableTop &&
68
0
                        leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
69
0
            }
70
0
        } while(leftPrimary == 0);
71
72
0
        uint32_t rightPrimary;
73
0
        do {
74
0
            int64_t ce = right.nextCE(errorCode);
75
0
            rightPrimary = (uint32_t)(ce >> 32);
76
0
            if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
77
                // Variable CE, shift it to quaternary level.
78
                // Ignore all following primary ignorables, and shift further variable CEs.
79
0
                anyVariable = TRUE;
80
0
                do {
81
                    // Store only the primary of the variable CE.
82
0
                    right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
83
0
                    for(;;) {
84
0
                        ce = right.nextCE(errorCode);
85
0
                        rightPrimary = (uint32_t)(ce >> 32);
86
0
                        if(rightPrimary == 0) {
87
0
                            right.setCurrentCE(0);
88
0
                        } else {
89
0
                            break;
90
0
                        }
91
0
                    }
92
0
                } while(rightPrimary < variableTop &&
93
0
                        rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
94
0
            }
95
0
        } while(rightPrimary == 0);
96
97
0
        if(leftPrimary != rightPrimary) {
98
            // Return the primary difference, with script reordering.
99
0
            if(settings.hasReordering()) {
100
0
                leftPrimary = settings.reorder(leftPrimary);
101
0
                rightPrimary = settings.reorder(rightPrimary);
102
0
            }
103
0
            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
104
0
        }
105
0
        if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
106
0
    }
107
0
    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
108
109
    // Compare the buffered secondary & tertiary weights.
110
    // We might skip the secondary level but continue with the case level
111
    // which is turned on separately.
112
0
    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
113
0
        if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
114
0
            int32_t leftIndex = 0;
115
0
            int32_t rightIndex = 0;
116
0
            for(;;) {
117
0
                uint32_t leftSecondary;
118
0
                do {
119
0
                    leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
120
0
                } while(leftSecondary == 0);
121
122
0
                uint32_t rightSecondary;
123
0
                do {
124
0
                    rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
125
0
                } while(rightSecondary == 0);
126
127
0
                if(leftSecondary != rightSecondary) {
128
0
                    return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
129
0
                }
130
0
                if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
131
0
            }
132
0
        } else {
133
            // The backwards secondary level compares secondary weights backwards
134
            // within segments separated by the merge separator (U+FFFE, weight 02).
135
0
            int32_t leftStart = 0;
136
0
            int32_t rightStart = 0;
137
0
            for(;;) {
138
                // Find the merge separator or the NO_CE terminator.
139
0
                uint32_t p;
140
0
                int32_t leftLimit = leftStart;
141
0
                while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
142
0
                            Collation::MERGE_SEPARATOR_PRIMARY ||
143
0
                        p == 0) {
144
0
                    ++leftLimit;
145
0
                }
146
0
                int32_t rightLimit = rightStart;
147
0
                while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
148
0
                            Collation::MERGE_SEPARATOR_PRIMARY ||
149
0
                        p == 0) {
150
0
                    ++rightLimit;
151
0
                }
152
153
                // Compare the segments.
154
0
                int32_t leftIndex = leftLimit;
155
0
                int32_t rightIndex = rightLimit;
156
0
                for(;;) {
157
0
                    int32_t leftSecondary = 0;
158
0
                    while(leftSecondary == 0 && leftIndex > leftStart) {
159
0
                        leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
160
0
                    }
161
162
0
                    int32_t rightSecondary = 0;
163
0
                    while(rightSecondary == 0 && rightIndex > rightStart) {
164
0
                        rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
165
0
                    }
166
167
0
                    if(leftSecondary != rightSecondary) {
168
0
                        return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
169
0
                    }
170
0
                    if(leftSecondary == 0) { break; }
171
0
                }
172
173
                // Did we reach the end of either string?
174
                // Both strings have the same number of merge separators,
175
                // or else there would have been a primary-level difference.
176
0
                U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
177
0
                if(p == Collation::NO_CE_PRIMARY) { break; }
178
                // Skip both merge separators and continue.
179
0
                leftStart = leftLimit + 1;
180
0
                rightStart = rightLimit + 1;
181
0
            }
182
0
        }
183
0
    }
184
185
0
    if((options & CollationSettings::CASE_LEVEL) != 0) {
186
0
        int32_t strength = CollationSettings::getStrength(options);
187
0
        int32_t leftIndex = 0;
188
0
        int32_t rightIndex = 0;
189
0
        for(;;) {
190
0
            uint32_t leftCase, leftLower32, rightCase;
191
0
            if(strength == UCOL_PRIMARY) {
192
                // Primary+caseLevel: Ignore case level weights of primary ignorables.
193
                // Otherwise we would get a-umlaut > a
194
                // which is not desirable for accent-insensitive sorting.
195
                // Check for (lower 32 bits) == 0 as well because variable CEs are stored
196
                // with only primary weights.
197
0
                int64_t ce;
198
0
                do {
199
0
                    ce = left.getCE(leftIndex++);
200
0
                    leftCase = (uint32_t)ce;
201
0
                } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
202
0
                leftLower32 = leftCase;
203
0
                leftCase &= 0xc000;
204
205
0
                do {
206
0
                    ce = right.getCE(rightIndex++);
207
0
                    rightCase = (uint32_t)ce;
208
0
                } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
209
0
                rightCase &= 0xc000;
210
0
            } else {
211
                // Secondary+caseLevel: By analogy with the above,
212
                // ignore case level weights of secondary ignorables.
213
                //
214
                // Note: A tertiary CE has uppercase case bits (0.0.ut)
215
                // to keep tertiary+caseFirst well-formed.
216
                //
217
                // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
218
                // Otherwise a tertiary CE's uppercase would be no greater than
219
                // a primary/secondary CE's uppercase.
220
                // (See UCA well-formedness condition 2.)
221
                // We could construct a special case weight higher than uppercase,
222
                // but it's simpler to always ignore case weights of secondary ignorables,
223
                // turning 0.0.ut into 0.0.0.t.
224
                // (See LDML Collation, Case Parameters.)
225
0
                do {
226
0
                    leftCase = (uint32_t)left.getCE(leftIndex++);
227
0
                } while(leftCase <= 0xffff);
228
0
                leftLower32 = leftCase;
229
0
                leftCase &= 0xc000;
230
231
0
                do {
232
0
                    rightCase = (uint32_t)right.getCE(rightIndex++);
233
0
                } while(rightCase <= 0xffff);
234
0
                rightCase &= 0xc000;
235
0
            }
236
237
            // No need to handle NO_CE and MERGE_SEPARATOR specially:
238
            // There is one case weight for each previous-level weight,
239
            // so level length differences were handled there.
240
0
            if(leftCase != rightCase) {
241
0
                if((options & CollationSettings::UPPER_FIRST) == 0) {
242
0
                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
243
0
                } else {
244
0
                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
245
0
                }
246
0
            }
247
0
            if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
248
0
        }
249
0
    }
250
0
    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
251
252
0
    uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
253
254
0
    int32_t leftIndex = 0;
255
0
    int32_t rightIndex = 0;
256
0
    uint32_t anyQuaternaries = 0;
257
0
    for(;;) {
258
0
        uint32_t leftLower32, leftTertiary;
259
0
        do {
260
0
            leftLower32 = (uint32_t)left.getCE(leftIndex++);
261
0
            anyQuaternaries |= leftLower32;
262
0
            U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
263
0
                     (leftLower32 & 0xc0c0) == 0);
264
0
            leftTertiary = leftLower32 & tertiaryMask;
265
0
        } while(leftTertiary == 0);
266
267
0
        uint32_t rightLower32, rightTertiary;
268
0
        do {
269
0
            rightLower32 = (uint32_t)right.getCE(rightIndex++);
270
0
            anyQuaternaries |= rightLower32;
271
0
            U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
272
0
                     (rightLower32 & 0xc0c0) == 0);
273
0
            rightTertiary = rightLower32 & tertiaryMask;
274
0
        } while(rightTertiary == 0);
275
276
0
        if(leftTertiary != rightTertiary) {
277
0
            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
278
                // Pass through NO_CE and keep real tertiary weights larger than that.
279
                // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
280
                // to keep tertiary CEs well-formed.
281
                // Their case+tertiary weights must be greater than those of
282
                // primary and secondary CEs.
283
0
                if(leftTertiary > Collation::NO_CE_WEIGHT16) {
284
0
                    if(leftLower32 > 0xffff) {
285
0
                        leftTertiary ^= 0xc000;
286
0
                    } else {
287
0
                        leftTertiary += 0x4000;
288
0
                    }
289
0
                }
290
0
                if(rightTertiary > Collation::NO_CE_WEIGHT16) {
291
0
                    if(rightLower32 > 0xffff) {
292
0
                        rightTertiary ^= 0xc000;
293
0
                    } else {
294
0
                        rightTertiary += 0x4000;
295
0
                    }
296
0
                }
297
0
            }
298
0
            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
299
0
        }
300
0
        if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
301
0
    }
302
0
    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
303
304
0
    if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
305
        // If there are no "variable" CEs and no non-zero quaternary weights,
306
        // then there are no quaternary differences.
307
0
        return UCOL_EQUAL;
308
0
    }
309
310
0
    leftIndex = 0;
311
0
    rightIndex = 0;
312
0
    for(;;) {
313
0
        uint32_t leftQuaternary;
314
0
        do {
315
0
            int64_t ce = left.getCE(leftIndex++);
316
0
            leftQuaternary = (uint32_t)ce & 0xffff;
317
0
            if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
318
                // Variable primary or completely ignorable or NO_CE.
319
0
                leftQuaternary = (uint32_t)(ce >> 32);
320
0
            } else {
321
                // Regular CE, not tertiary ignorable.
322
                // Preserve the quaternary weight in bits 7..6.
323
0
                leftQuaternary |= 0xffffff3f;
324
0
            }
325
0
        } while(leftQuaternary == 0);
326
327
0
        uint32_t rightQuaternary;
328
0
        do {
329
0
            int64_t ce = right.getCE(rightIndex++);
330
0
            rightQuaternary = (uint32_t)ce & 0xffff;
331
0
            if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
332
                // Variable primary or completely ignorable or NO_CE.
333
0
                rightQuaternary = (uint32_t)(ce >> 32);
334
0
            } else {
335
                // Regular CE, not tertiary ignorable.
336
                // Preserve the quaternary weight in bits 7..6.
337
0
                rightQuaternary |= 0xffffff3f;
338
0
            }
339
0
        } while(rightQuaternary == 0);
340
341
0
        if(leftQuaternary != rightQuaternary) {
342
            // Return the difference, with script reordering.
343
0
            if(settings.hasReordering()) {
344
0
                leftQuaternary = settings.reorder(leftQuaternary);
345
0
                rightQuaternary = settings.reorder(rightQuaternary);
346
0
            }
347
0
            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
348
0
        }
349
0
        if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
350
0
    }
351
0
    return UCOL_EQUAL;
352
0
}
353
354
U_NAMESPACE_END
355
356
#endif  // !UCONFIG_NO_COLLATION