Coverage Report

Created: 2025-06-13 06:38

/src/icu/icu4c/source/i18n/collationdata.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2010-2015, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationdata.h
9
*
10
* created on: 2010oct27
11
* created by: Markus W. Scherer
12
*/
13
14
#ifndef __COLLATIONDATA_H__
15
#define __COLLATIONDATA_H__
16
17
#include "unicode/utypes.h"
18
19
#if !UCONFIG_NO_COLLATION
20
21
#include "unicode/ucol.h"
22
#include "unicode/uniset.h"
23
#include "collation.h"
24
#include "normalizer2impl.h"
25
#include "utrie2.h"
26
27
struct UDataMemory;
28
29
U_NAMESPACE_BEGIN
30
31
class UVector32;
32
33
/**
34
 * Collation data container.
35
 * Immutable data created by a CollationDataBuilder, or loaded from a file,
36
 * or deserialized from API-provided binary data.
37
 *
38
 * Includes data for the collation base (root/default), aliased if this is not the base.
39
 */
40
struct U_I18N_API CollationData : public UMemory {
41
    // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
42
    // parallel with the ranges, and resetting ranges that are indexed.
43
    // The reordering builder code could clone the resulting template array.
44
    static constexpr int32_t REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14;
45
    static constexpr int32_t REORDER_RESERVED_AFTER_LATIN = REORDER_RESERVED_BEFORE_LATIN + 1;
46
47
    static constexpr int32_t MAX_NUM_SPECIAL_REORDER_CODES = 8;
48
    /** C++ only; used by the data reader to check scriptStartsLength. */
49
    static constexpr int32_t MAX_NUM_SCRIPT_RANGES = 256;
50
51
    /**
     * Creates an empty data container: every array pointer starts out as nullptr,
     * every length and numScripts as 0, and numericPrimary as 0x12000000.
     * The object is unusable for lookups until those members have been filled in.
     *
     * @param nfc stored by reference in nfcImpl (not copied), so it must stay
     *            valid for as long as this object is used.
     *
     * NOTE(review): this single-argument constructor is not marked explicit,
     * so a Normalizer2Impl converts implicitly to a CollationData.
     */
    CollationData(const Normalizer2Impl &nfc)
52
9.72k
            : trie(nullptr),
53
9.72k
              ce32s(nullptr), ces(nullptr), contexts(nullptr), base(nullptr),
54
9.72k
              jamoCE32s(nullptr),
55
9.72k
              nfcImpl(nfc),  // reference member: binds to the caller's object
56
9.72k
              numericPrimary(0x12000000),
57
9.72k
              ce32sLength(0), cesLength(0), contextsLength(0),
58
9.72k
              compressibleBytes(nullptr),
59
9.72k
              unsafeBackwardSet(nullptr),
60
9.72k
              fastLatinTable(nullptr), fastLatinTableLength(0),
61
9.72k
              numScripts(0), scriptsIndex(nullptr), scriptStarts(nullptr), scriptStartsLength(0),
62
9.72k
              rootElements(nullptr), rootElementsLength(0) {}
63
64
96.3M
    /**
     * Returns the 32-bit value that the main lookup trie holds for code point c
     * (a UTRIE2_GET32 lookup on the trie member).
     * NOTE(review): the constructor leaves trie as nullptr; presumably it must
     * be set before the first lookup -- there is no check here.
     */
    uint32_t getCE32(UChar32 c) const {
65
96.3M
        return UTRIE2_GET32(trie, c);
66
96.3M
    }
67
68
302k
    /**
     * Returns the 32-bit value that the main lookup trie holds for code point c,
     * using the UTRIE2_GET32_FROM_SUPP lookup instead of the general UTRIE2_GET32.
     * NOTE(review): presumably c must be a supplementary code point
     * (U+10000..U+10FFFF) -- confirm against the macro's contract in utrie2.h.
     */
    uint32_t getCE32FromSupplementary(UChar32 c) const {
69
302k
        return UTRIE2_GET32_FROM_SUPP(trie, c);
70
302k
    }
71
72
0
    /**
     * Returns whether c counts as a digit.
     * Below U+0660 the answer is computed without a trie lookup: only the
     * range 0x30..0x39 qualifies. From U+0660 upwards, c is a digit if its
     * CE32 carries Collation::DIGIT_TAG.
     */
    UBool isDigit(UChar32 c) const {
73
0
        return c < 0x660 ? c <= 0x39 && 0x30 <= c :  // fast path, no getCE32() call
74
0
                Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);
75
0
    }
76
77
8.36k
    /**
     * Returns true if c is in unsafeBackwardSet, or if numeric is true and
     * isDigit(c) is true.
     * unsafeBackwardSet is dereferenced without a null check; the constructor
     * initializes it to nullptr, so it must have been set before this is called.
     */
    UBool isUnsafeBackward(UChar32 c, UBool numeric) const {
78
8.36k
        return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));
79
8.36k
    }
80
81
12.2k
    /**
     * Returns the compressibleBytes flag for the primary-weight lead byte b.
     * b indexes the array directly with no range or null check; compressibleBytes
     * is documented as holding 256 flags, so b is expected to be 0..0xff.
     */
    UBool isCompressibleLeadByte(uint32_t b) const {
82
12.2k
        return compressibleBytes[b];
83
12.2k
    }
84
85
12.2k
    /**
     * Returns isCompressibleLeadByte() for the lead byte of the primary weight p,
     * that is, for the most significant 8 bits of p.
     */
    inline UBool isCompressiblePrimary(uint32_t p) const {
86
12.2k
        return isCompressibleLeadByte(p >> 24);  // p >> 24 is at most 0xff for a uint32_t
87
12.2k
    }
88
89
    /**
90
     * Returns the CE32 assembled from two consecutive contexts words
     * (p[0] supplies the upper 16 bits, p[1] the lower 16 bits).
91
     * Access to the defaultCE32 for contraction and prefix matching.
92
     */
93
3.26M
    static uint32_t readCE32(const char16_t *p) {
94
3.26M
        return (static_cast<uint32_t>(p[0]) << 16) | p[1];  // p[0] = upper 16 bits, p[1] = lower 16 bits; reads exactly two units
95
3.26M
    }
96
97
    /**
98
     * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
99
     * Requires that ce32 is special.
100
     */
101
    uint32_t getIndirectCE32(uint32_t ce32) const;
102
    /**
103
     * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
104
     * if ce32 is special.
105
     */
106
    uint32_t getFinalCE32(uint32_t ce32) const;
107
108
    /**
109
     * Computes a CE from c's ce32 which has the OFFSET_TAG.
110
     */
111
6.50M
    int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
112
6.50M
        int64_t dataCE = ces[Collation::indexFromCE32(ce32)];  // ce32 carries an index into ces; no bounds check against cesLength
113
6.50M
        return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));  // primary derived from c plus the offset data, then wrapped into a CE
114
6.50M
    }
115
116
    /**
117
     * Returns the single CE that c maps to.
118
     * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
119
     */
120
    int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
121
122
    /**
123
     * Returns the FCD16 value for code point c. c must be >= 0.
124
     */
125
74.1M
    uint16_t getFCD16(UChar32 c) const {
126
74.1M
        return nfcImpl.getFCD16(c);  // pure delegation to the Normalizer2Impl given to the constructor
127
74.1M
    }
128
129
    /**
130
     * Returns the first primary for the script's reordering group.
131
     * @return the primary with only the first primary lead byte of the group
132
     *         (not necessarily an actual root collator primary weight),
133
     *         or 0 if the script is unknown
134
     */
135
    uint32_t getFirstPrimaryForGroup(int32_t script) const;
136
137
    /**
138
     * Returns the last primary for the script's reordering group.
139
     * @return the last primary of the group
140
     *         (not an actual root collator primary weight),
141
     *         or 0 if the script is unknown
142
     */
143
    uint32_t getLastPrimaryForGroup(int32_t script) const;
144
145
    /**
146
     * Finds the reordering group which contains the primary weight.
147
     * @return the first script of the group, or -1 if the weight is beyond the last group
148
     */
149
    int32_t getGroupForPrimary(uint32_t p) const;
150
151
    /**
     * NOTE(review): undocumented in this header and implemented elsewhere.
     * Presumably writes the scripts that share script's scriptStarts range
     * (see scriptsIndex: "Multiple scripts may share a range and index")
     * into dest[0..capacity) and returns how many there are -- confirm the
     * return value, overflow behavior and error codes against the implementation.
     */
    int32_t getEquivalentScripts(int32_t script,
152
                                 int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
153
154
    /**
155
     * Writes the permutation of primary-weight ranges
156
     * for the given reordering of scripts and groups.
157
     * The caller checks for illegal arguments and
158
     * takes care of [DEFAULT] and memory allocation.
159
     *
160
     * Each list element will be a (limit, offset) pair as described
161
     * for the CollationSettings::reorderRanges.
162
     * The list will be empty if no ranges are reordered.
163
     */
164
    void makeReorderRanges(const int32_t *reorder, int32_t length,
165
                           UVector32 &ranges, UErrorCode &errorCode) const;
166
167
    /** @see jamoCE32s */
168
    static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
169
170
    /** Main lookup trie. */
171
    const UTrie2 *trie;
172
    /**
173
     * Array of CE32 values.
174
     * At index 0 there must be CE32(U+0000)
175
     * to support U+0000's special-tag for NUL-termination handling.
176
     */
177
    const uint32_t *ce32s;
178
    /** Array of CE values for expansions and OFFSET_TAG. */
179
    const int64_t *ces;
180
    /** Array of prefix and contraction-suffix matching data. */
181
    const char16_t *contexts;
182
    /** Base collation data, or nullptr if this data itself is a base. */
183
    const CollationData *base;
184
    /**
185
     * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
186
     * They are normally simple CE32s, rarely expansions.
187
     * For fast handling of HANGUL_TAG.
188
     */
189
    const uint32_t *jamoCE32s;
190
    const Normalizer2Impl &nfcImpl;
191
    /** The single-byte primary weight (xx000000) for numeric collation. */
192
    uint32_t numericPrimary;
193
194
    int32_t ce32sLength;
195
    int32_t cesLength;
196
    int32_t contextsLength;
197
198
    /** 256 flags for which primary-weight lead bytes are compressible. */
199
    const UBool *compressibleBytes;
200
    /**
201
     * Set of code points that are unsafe for starting string comparison after an identical prefix,
202
     * or in backwards CE iteration.
203
     */
204
    const UnicodeSet *unsafeBackwardSet;
205
206
    /**
207
     * Fast Latin table for common-Latin-text string comparisons.
208
     * Data structure see class CollationFastLatin.
209
     */
210
    const uint16_t *fastLatinTable;
211
    int32_t fastLatinTableLength;
212
213
    /**
214
     * Data for scripts and reordering groups.
215
     * Uses include building a reordering permutation table and
216
     * providing script boundaries to AlphabeticIndex.
217
     */
218
    int32_t numScripts;
219
    /**
220
     * The length of scriptsIndex is numScripts+16.
221
     * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
222
     * 16 special reorder codes (not all used) are mapped starting at numScripts.
223
     * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
224
     * There are special codes at the end for reorder-reserved primary ranges.
225
     *
226
     * Multiple scripts may share a range and index, for example Hira & Kana.
227
     */
228
    const uint16_t *scriptsIndex;
229
    /**
230
     * Start primary weight (top 16 bits only) for a group/script/reserved range
231
     * indexed by scriptsIndex.
232
     * The first range (separators & terminators) and the last range (trailing weights)
233
     * are not reorderable, and no scriptsIndex entry points to them.
234
     */
235
    const uint16_t *scriptStarts;
236
    int32_t scriptStartsLength;
237
238
    /**
239
     * Collation elements in the root collator.
240
     * Used by the CollationRootElements class. The data structure is described there.
241
     * nullptr in a tailoring.
242
     */
243
    const uint32_t *rootElements;
244
    int32_t rootElementsLength;
245
246
private:
247
    int32_t getScriptIndex(int32_t script) const;
248
    void makeReorderRanges(const int32_t *reorder, int32_t length,
249
                           UBool latinMustMove,
250
                           UVector32 &ranges, UErrorCode &errorCode) const;
251
    int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
252
    int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
253
};
254
255
U_NAMESPACE_END
256
257
#endif  // !UCONFIG_NO_COLLATION
258
#endif  // __COLLATIONDATA_H__