Coverage Report

Created: 2023-02-22 06:51

/src/icu/source/i18n/collationdata.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2012-2015, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationdata.cpp
9
*
10
* created on: 2012jul28
11
* created by: Markus W. Scherer
12
*/
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_COLLATION
17
18
#include "unicode/ucol.h"
19
#include "unicode/udata.h"
20
#include "unicode/uscript.h"
21
#include "cmemory.h"
22
#include "collation.h"
23
#include "collationdata.h"
24
#include "uassert.h"
25
#include "utrie2.h"
26
#include "uvectr32.h"
27
28
U_NAMESPACE_BEGIN
29
30
uint32_t
31
0
CollationData::getIndirectCE32(uint32_t ce32) const {
32
0
    U_ASSERT(Collation::isSpecialCE32(ce32));
33
0
    int32_t tag = Collation::tagFromCE32(ce32);
34
0
    if(tag == Collation::DIGIT_TAG) {
35
        // Fetch the non-numeric-collation CE32.
36
0
        ce32 = ce32s[Collation::indexFromCE32(ce32)];
37
0
    } else if(tag == Collation::LEAD_SURROGATE_TAG) {
38
0
        ce32 = Collation::UNASSIGNED_CE32;
39
0
    } else if(tag == Collation::U0000_TAG) {
40
        // Fetch the normal ce32 for U+0000.
41
0
        ce32 = ce32s[0];
42
0
    }
43
0
    return ce32;
44
0
}
45
46
uint32_t
47
0
CollationData::getFinalCE32(uint32_t ce32) const {
48
0
    if(Collation::isSpecialCE32(ce32)) {
49
0
        ce32 = getIndirectCE32(ce32);
50
0
    }
51
0
    return ce32;
52
0
}
53
54
int64_t
55
0
CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
56
0
    if(U_FAILURE(errorCode)) { return 0; }
57
    // Keep parallel with CollationDataBuilder::getSingleCE().
58
0
    const CollationData *d;
59
0
    uint32_t ce32 = getCE32(c);
60
0
    if(ce32 == Collation::FALLBACK_CE32) {
61
0
        d = base;
62
0
        ce32 = base->getCE32(c);
63
0
    } else {
64
0
        d = this;
65
0
    }
66
0
    while(Collation::isSpecialCE32(ce32)) {
67
0
        switch(Collation::tagFromCE32(ce32)) {
68
0
        case Collation::LATIN_EXPANSION_TAG:
69
0
        case Collation::BUILDER_DATA_TAG:
70
0
        case Collation::PREFIX_TAG:
71
0
        case Collation::CONTRACTION_TAG:
72
0
        case Collation::HANGUL_TAG:
73
0
        case Collation::LEAD_SURROGATE_TAG:
74
0
            errorCode = U_UNSUPPORTED_ERROR;
75
0
            return 0;
76
0
        case Collation::FALLBACK_TAG:
77
0
        case Collation::RESERVED_TAG_3:
78
0
            errorCode = U_INTERNAL_PROGRAM_ERROR;
79
0
            return 0;
80
0
        case Collation::LONG_PRIMARY_TAG:
81
0
            return Collation::ceFromLongPrimaryCE32(ce32);
82
0
        case Collation::LONG_SECONDARY_TAG:
83
0
            return Collation::ceFromLongSecondaryCE32(ce32);
84
0
        case Collation::EXPANSION32_TAG:
85
0
            if(Collation::lengthFromCE32(ce32) == 1) {
86
0
                ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
87
0
                break;
88
0
            } else {
89
0
                errorCode = U_UNSUPPORTED_ERROR;
90
0
                return 0;
91
0
            }
92
0
        case Collation::EXPANSION_TAG: {
93
0
            if(Collation::lengthFromCE32(ce32) == 1) {
94
0
                return d->ces[Collation::indexFromCE32(ce32)];
95
0
            } else {
96
0
                errorCode = U_UNSUPPORTED_ERROR;
97
0
                return 0;
98
0
            }
99
0
        }
100
0
        case Collation::DIGIT_TAG:
101
            // Fetch the non-numeric-collation CE32 and continue.
102
0
            ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
103
0
            break;
104
0
        case Collation::U0000_TAG:
105
0
            U_ASSERT(c == 0);
106
            // Fetch the normal ce32 for U+0000 and continue.
107
0
            ce32 = d->ce32s[0];
108
0
            break;
109
0
        case Collation::OFFSET_TAG:
110
0
            return d->getCEFromOffsetCE32(c, ce32);
111
0
        case Collation::IMPLICIT_TAG:
112
0
            return Collation::unassignedCEFromCodePoint(c);
113
0
        }
114
0
    }
115
0
    return Collation::ceFromSimpleCE32(ce32);
116
0
}
117
118
uint32_t
119
0
CollationData::getFirstPrimaryForGroup(int32_t script) const {
120
0
    int32_t index = getScriptIndex(script);
121
0
    return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
122
0
}
123
124
uint32_t
125
0
CollationData::getLastPrimaryForGroup(int32_t script) const {
126
0
    int32_t index = getScriptIndex(script);
127
0
    if(index == 0) {
128
0
        return 0;
129
0
    }
130
0
    uint32_t limit = scriptStarts[index + 1];
131
0
    return (limit << 16) - 1;
132
0
}
133
134
int32_t
135
0
CollationData::getGroupForPrimary(uint32_t p) const {
136
0
    p >>= 16;
137
0
    if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
138
0
        return -1;
139
0
    }
140
0
    int32_t index = 1;
141
0
    while(p >= scriptStarts[index + 1]) { ++index; }
142
0
    for(int32_t i = 0; i < numScripts; ++i) {
143
0
        if(scriptsIndex[i] == index) {
144
0
            return i;
145
0
        }
146
0
    }
147
0
    for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
148
0
        if(scriptsIndex[numScripts + i] == index) {
149
0
            return UCOL_REORDER_CODE_FIRST + i;
150
0
        }
151
0
    }
152
0
    return -1;
153
0
}
154
155
int32_t
156
0
CollationData::getScriptIndex(int32_t script) const {
157
0
    if(script < 0) {
158
0
        return 0;
159
0
    } else if(script < numScripts) {
160
0
        return scriptsIndex[script];
161
0
    } else if(script < UCOL_REORDER_CODE_FIRST) {
162
0
        return 0;
163
0
    } else {
164
0
        script -= UCOL_REORDER_CODE_FIRST;
165
0
        if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
166
0
            return scriptsIndex[numScripts + script];
167
0
        } else {
168
0
            return 0;
169
0
        }
170
0
    }
171
0
}
172
173
int32_t
174
CollationData::getEquivalentScripts(int32_t script,
175
                                    int32_t dest[], int32_t capacity,
176
0
                                    UErrorCode &errorCode) const {
177
0
    if(U_FAILURE(errorCode)) { return 0; }
178
0
    int32_t index = getScriptIndex(script);
179
0
    if(index == 0) { return 0; }
180
0
    if(script >= UCOL_REORDER_CODE_FIRST) {
181
        // Special groups have no aliases.
182
0
        if(capacity > 0) {
183
0
            dest[0] = script;
184
0
        } else {
185
0
            errorCode = U_BUFFER_OVERFLOW_ERROR;
186
0
        }
187
0
        return 1;
188
0
    }
189
190
0
    int32_t length = 0;
191
0
    for(int32_t i = 0; i < numScripts; ++i) {
192
0
        if(scriptsIndex[i] == index) {
193
0
            if(length < capacity) {
194
0
                dest[length] = i;
195
0
            }
196
0
            ++length;
197
0
        }
198
0
    }
199
0
    if(length > capacity) {
200
0
        errorCode = U_BUFFER_OVERFLOW_ERROR;
201
0
    }
202
0
    return length;
203
0
}
204
205
void
206
CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
207
0
                                 UVector32 &ranges, UErrorCode &errorCode) const {
208
0
    makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
209
0
}
210
211
void
212
CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
213
                                 UBool latinMustMove,
214
0
                                 UVector32 &ranges, UErrorCode &errorCode) const {
215
0
    if(U_FAILURE(errorCode)) { return; }
216
0
    ranges.removeAllElements();
217
0
    if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
218
0
        return;
219
0
    }
220
221
    // Maps each script-or-group range to a new lead byte.
222
0
    uint8_t table[MAX_NUM_SCRIPT_RANGES];
223
0
    uprv_memset(table, 0, sizeof(table));
224
225
0
    {
226
        // Set "don't care" values for reserved ranges.
227
0
        int32_t index = scriptsIndex[
228
0
                numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
229
0
        if(index != 0) {
230
0
            table[index] = 0xff;
231
0
        }
232
0
        index = scriptsIndex[
233
0
                numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
234
0
        if(index != 0) {
235
0
            table[index] = 0xff;
236
0
        }
237
0
    }
238
239
    // Never reorder special low and high primary lead bytes.
240
0
    U_ASSERT(scriptStartsLength >= 2);
241
0
    U_ASSERT(scriptStarts[0] == 0);
242
0
    int32_t lowStart = scriptStarts[1];
243
0
    U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
244
0
    int32_t highLimit = scriptStarts[scriptStartsLength - 1];
245
0
    U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
246
247
    // Get the set of special reorder codes in the input list.
248
    // This supports a fixed number of special reorder codes;
249
    // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
250
0
    uint32_t specials = 0;
251
0
    for(int32_t i = 0; i < length; ++i) {
252
0
        int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
253
0
        if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
254
0
            specials |= (uint32_t)1 << reorderCode;
255
0
        }
256
0
    }
257
258
    // Start the reordering with the special low reorder codes that do not occur in the input.
259
0
    for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
260
0
        int32_t index = scriptsIndex[numScripts + i];
261
0
        if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
262
0
            lowStart = addLowScriptRange(table, index, lowStart);
263
0
        }
264
0
    }
265
266
    // Skip the reserved range before Latin if Latin is the first script,
267
    // so that we do not move it unnecessarily.
268
0
    int32_t skippedReserved = 0;
269
0
    if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
270
0
        int32_t index = scriptsIndex[USCRIPT_LATIN];
271
0
        U_ASSERT(index != 0);
272
0
        int32_t start = scriptStarts[index];
273
0
        U_ASSERT(lowStart <= start);
274
0
        skippedReserved = start - lowStart;
275
0
        lowStart = start;
276
0
    }
277
278
    // Reorder according to the input scripts, continuing from the bottom of the primary range.
279
0
    int32_t originalLength = length;  // length will be decremented if "others" is in the list.
280
0
    UBool hasReorderToEnd = FALSE;
281
0
    for(int32_t i = 0; i < length;) {
282
0
        int32_t script = reorder[i++];
283
0
        if(script == USCRIPT_UNKNOWN) {
284
            // Put the remaining scripts at the top.
285
0
            hasReorderToEnd = TRUE;
286
0
            while(i < length) {
287
0
                script = reorder[--length];
288
0
                if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
289
0
                        script == UCOL_REORDER_CODE_DEFAULT) {
290
0
                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
291
0
                    return;
292
0
                }
293
0
                int32_t index = getScriptIndex(script);
294
0
                if(index == 0) { continue; }
295
0
                if(table[index] != 0) {  // Duplicate or equivalent script.
296
0
                    errorCode = U_ILLEGAL_ARGUMENT_ERROR;
297
0
                    return;
298
0
                }
299
0
                highLimit = addHighScriptRange(table, index, highLimit);
300
0
            }
301
0
            break;
302
0
        }
303
0
        if(script == UCOL_REORDER_CODE_DEFAULT) {
304
            // The default code must be the only one in the list, and that is handled by the caller.
305
            // Otherwise it must not be used.
306
0
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
307
0
            return;
308
0
        }
309
0
        int32_t index = getScriptIndex(script);
310
0
        if(index == 0) { continue; }
311
0
        if(table[index] != 0) {  // Duplicate or equivalent script.
312
0
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
313
0
            return;
314
0
        }
315
0
        lowStart = addLowScriptRange(table, index, lowStart);
316
0
    }
317
318
    // Put all remaining scripts into the middle.
319
0
    for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
320
0
        int32_t leadByte = table[i];
321
0
        if(leadByte != 0) { continue; }
322
0
        int32_t start = scriptStarts[i];
323
0
        if(!hasReorderToEnd && start > lowStart) {
324
            // No need to move this script.
325
0
            lowStart = start;
326
0
        }
327
0
        lowStart = addLowScriptRange(table, i, lowStart);
328
0
    }
329
0
    if(lowStart > highLimit) {
330
0
        if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
331
            // Try not skipping the before-Latin reserved range.
332
0
            makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
333
0
            return;
334
0
        }
335
        // We need more primary lead bytes than available, despite the reserved ranges.
336
0
        errorCode = U_BUFFER_OVERFLOW_ERROR;
337
0
        return;
338
0
    }
339
340
    // Turn lead bytes into a list of (limit, offset) pairs.
341
    // Encode each pair in one list element:
342
    // Upper 16 bits = limit, lower 16 = signed lead byte offset.
343
0
    int32_t offset = 0;
344
0
    for(int32_t i = 1;; ++i) {
345
0
        int32_t nextOffset = offset;
346
0
        while(i < scriptStartsLength - 1) {
347
0
            int32_t newLeadByte = table[i];
348
0
            if(newLeadByte == 0xff) {
349
                // "Don't care" lead byte for reserved range, continue with current offset.
350
0
            } else {
351
0
                nextOffset = newLeadByte - (scriptStarts[i] >> 8);
352
0
                if(nextOffset != offset) { break; }
353
0
            }
354
0
            ++i;
355
0
        }
356
0
        if(offset != 0 || i < scriptStartsLength - 1) {
357
0
            ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
358
0
        }
359
0
        if(i == scriptStartsLength - 1) { break; }
360
0
        offset = nextOffset;
361
0
    }
362
0
}
363
364
int32_t
365
0
CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
366
0
    int32_t start = scriptStarts[index];
367
0
    if((start & 0xff) < (lowStart & 0xff)) {
368
0
        lowStart += 0x100;
369
0
    }
370
0
    table[index] = (uint8_t)(lowStart >> 8);
371
0
    int32_t limit = scriptStarts[index + 1];
372
0
    lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
373
0
    return lowStart;
374
0
}
375
376
int32_t
377
0
CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
378
0
    int32_t limit = scriptStarts[index + 1];
379
0
    if((limit & 0xff) > (highLimit & 0xff)) {
380
0
        highLimit -= 0x100;
381
0
    }
382
0
    int32_t start = scriptStarts[index];
383
0
    highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
384
0
    table[index] = (uint8_t)(highLimit >> 8);
385
0
    return highLimit;
386
0
}
387
388
U_NAMESPACE_END
389
390
#endif  // !UCONFIG_NO_COLLATION