Coverage Report

Created: 2025-01-28 06:38

/src/icu/source/i18n/collationsettings.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2013-2015, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationsettings.h
9
*
10
* created on: 2013feb07
11
* created by: Markus W. Scherer
12
*/
13
14
#ifndef __COLLATIONSETTINGS_H__
15
#define __COLLATIONSETTINGS_H__
16
17
#include "unicode/utypes.h"
18
19
#if !UCONFIG_NO_COLLATION
20
21
#include "unicode/ucol.h"
22
#include "collation.h"
23
#include "sharedobject.h"
24
#include "umutex.h"
25
26
U_NAMESPACE_BEGIN
27
28
struct CollationData;
29
30
/**
31
 * Collation settings/options/attributes.
32
 * These are the values that can be changed via API.
33
 */
34
struct U_I18N_API CollationSettings : public SharedObject {
35
    /**
36
     * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
37
     */
38
    static const int32_t CHECK_FCD = 1;
39
    /**
40
     * Options bit 1: Numeric collation.
41
     * Also known as CODAN = COllate Digits As Numbers.
42
     *
43
     * Treat digit sequences as numbers with CE sequences in numeric order,
44
     * rather than returning a normal CE for each digit.
45
     */
46
    static const int32_t NUMERIC = 2;
47
    /**
48
     * "Shifted" alternate handling, see ALTERNATE_MASK.
49
     */
50
    static const int32_t SHIFTED = 4;
51
    /**
52
     * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
53
     * Reserve values 8 and 0xc for shift-trimmed and blanked.
54
     */
55
    static const int32_t ALTERNATE_MASK = 0xc;
56
    /**
57
     * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
58
     */
59
    static const int32_t MAX_VARIABLE_SHIFT = 4;
60
    /** maxVariable options bit mask before shifting. */
61
    static const int32_t MAX_VARIABLE_MASK = 0x70;
62
    /** Options bit 7: Reserved/unused/0. */
63
    /**
64
     * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
65
     */
66
    static const int32_t UPPER_FIRST = 0x100;
67
    /**
68
     * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
69
     * unless case level is on (when they are *moved* into the separate case level).
70
     * By default, the case bits are removed from the tertiary weight (ignored).
71
     *
72
     * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
73
     * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
74
     */
75
    static const int32_t CASE_FIRST = 0x200;
76
    /**
77
     * Options bit mask for caseFirst and upperFirst, before shifting.
78
     * Same value as caseFirst==upperFirst.
79
     */
80
    static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
81
    /**
82
     * Options bit 10: Insert the case level between the secondary and tertiary levels.
83
     */
84
    static const int32_t CASE_LEVEL = 0x400;
85
    /**
86
     * Options bit 11: Compare secondary weights backwards. ("French secondary")
87
     */
88
    static const int32_t BACKWARD_SECONDARY = 0x800;
89
    /**
90
     * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
91
     * It is the top used bit field in the options. (No need to mask after shifting.)
92
     */
93
    static const int32_t STRENGTH_SHIFT = 12;
94
    /** Strength options bit mask before shifting. */
95
    static const int32_t STRENGTH_MASK = 0xf000;
96
97
    /** maxVariable values */
98
    enum MaxVariable {
99
        MAX_VAR_SPACE,
100
        MAX_VAR_PUNCT,
101
        MAX_VAR_SYMBOL,
102
        MAX_VAR_CURRENCY
103
    };
104
105
    CollationSettings()
106
0
            : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) |
107
0
                      (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)),
108
0
              variableTop(0),
109
              reorderTable(NULL),
110
0
              minHighNoReorder(0),
111
0
              reorderRanges(NULL), reorderRangesLength(0),
112
0
              reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
113
0
              fastLatinOptions(-1) {}
114
115
    CollationSettings(const CollationSettings &other);
116
    virtual ~CollationSettings();
117
118
    bool operator==(const CollationSettings &other) const;
119
120
0
    inline bool operator!=(const CollationSettings &other) const {
121
0
        return !operator==(other);
122
0
    }
123
124
    int32_t hashCode() const;
125
126
    void resetReordering();
127
    void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
128
                         const uint32_t *ranges, int32_t rangesLength,
129
                         const uint8_t *table, UErrorCode &errorCode);
130
    void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength,
131
                       UErrorCode &errorCode);
132
    void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode);
133
134
0
    inline UBool hasReordering() const { return reorderTable != NULL; }
135
    static UBool reorderTableHasSplitBytes(const uint8_t table[256]);
136
0
    inline uint32_t reorder(uint32_t p) const {
137
0
        uint8_t b = reorderTable[p >> 24];
138
0
        if(b != 0 || p <= Collation::NO_CE_PRIMARY) {
139
0
            return ((uint32_t)b << 24) | (p & 0xffffff);
140
0
        } else {
141
0
            return reorderEx(p);
142
0
        }
143
0
    }
144
145
    void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
146
147
0
    static int32_t getStrength(int32_t options) {
148
0
        return options >> STRENGTH_SHIFT;
149
0
    }
150
151
0
    int32_t getStrength() const {
152
0
        return getStrength(options);
153
0
    }
154
155
    /** Sets the options bit for an on/off attribute. */
156
    void setFlag(int32_t bit, UColAttributeValue value,
157
                 int32_t defaultOptions, UErrorCode &errorCode);
158
159
0
    UColAttributeValue getFlag(int32_t bit) const {
160
0
        return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF;
161
0
    }
162
163
    void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode);
164
165
0
    UColAttributeValue getCaseFirst() const {
166
0
        int32_t option = options & CASE_FIRST_AND_UPPER_MASK;
167
0
        return (option == 0) ? UCOL_OFF :
168
0
                (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST;
169
0
    }
170
171
    void setAlternateHandling(UColAttributeValue value,
172
                              int32_t defaultOptions, UErrorCode &errorCode);
173
174
0
    UColAttributeValue getAlternateHandling() const {
175
0
        return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED;
176
0
    }
177
178
    void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
179
180
0
    MaxVariable getMaxVariable() const {
181
0
        return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT);
182
0
    }
183
184
    /**
185
     * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
186
     */
187
0
    static inline UBool isTertiaryWithCaseBits(int32_t options) {
188
0
        return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
189
0
    }
190
0
    static uint32_t getTertiaryMask(int32_t options) {
191
        // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
192
0
        return isTertiaryWithCaseBits(options) ?
193
0
                Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK;
194
0
    }
195
196
0
    static UBool sortsTertiaryUpperCaseFirst(int32_t options) {
197
        // On tertiary level, consider case bits and sort uppercase first
198
        // if caseLevel is off and caseFirst==upperFirst.
199
0
        return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
200
0
    }
201
202
0
    inline UBool dontCheckFCD() const {
203
0
        return (options & CHECK_FCD) == 0;
204
0
    }
205
206
0
    inline UBool hasBackwardSecondary() const {
207
0
        return (options & BACKWARD_SECONDARY) != 0;
208
0
    }
209
210
0
    inline UBool isNumeric() const {
211
0
        return (options & NUMERIC) != 0;
212
0
    }
213
214
    /** CHECK_FCD etc. */
215
    int32_t options;
216
    /** Variable-top primary weight. */
217
    uint32_t variableTop;
218
    /**
219
     * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering.
220
     * A 0 entry at a non-zero index means that the primary lead byte is "split"
221
     * (there are different offsets for primaries that share that lead byte)
222
     * and the reordering offset must be determined via the reorderRanges.
223
     */
224
    const uint8_t *reorderTable;
225
    /** Limit of last reordered range. 0 if no reordering or no split bytes. */
226
    uint32_t minHighNoReorder;
227
    /**
228
     * Primary-weight ranges for script reordering,
229
     * to be used by reorder(p) for split-reordered primary lead bytes.
230
     *
231
     * Each entry is a (limit, offset) pair.
232
     * The upper 16 bits of the entry are the upper 16 bits of the
233
     * exclusive primary limit of a range.
234
     * Primaries between the previous limit and this one have their lead bytes
235
     * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
236
     *
237
     * CollationData::makeReorderRanges() writes a full list where the first range
238
     * (at least for terminators and separators) has a 0 offset.
239
     * The last range has a non-zero offset.
240
     * minHighNoReorder is set to the limit of that last range.
241
     *
242
     * In the settings object, the initial ranges before the first split lead byte
243
     * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
244
     * If there are no split-reordered lead bytes, then no ranges are needed.
245
     */
246
    const uint32_t *reorderRanges;
247
    int32_t reorderRangesLength;
248
    /** Array of reorder codes; ignored if reorderCodesLength == 0. */
249
    const int32_t *reorderCodes;
250
    /** Number of reorder codes; 0 if no reordering. */
251
    int32_t reorderCodesLength;
252
    /**
253
     * Capacity of reorderCodes.
254
     * If 0, then the codes, the ranges, and the table are aliases.
255
     * Otherwise, this object owns the memory via the reorderCodes pointer;
256
     * the codes, the ranges, and the table are in the same memory block, in that order.
257
     */
258
    int32_t reorderCodesCapacity;
259
260
    /** Options for CollationFastLatin. Negative if disabled. */
261
    int32_t fastLatinOptions;
262
    uint16_t fastLatinPrimaries[0x180];
263
264
private:
265
    void setReorderArrays(const int32_t *codes, int32_t codesLength,
266
                          const uint32_t *ranges, int32_t rangesLength,
267
                          const uint8_t *table, UErrorCode &errorCode);
268
    uint32_t reorderEx(uint32_t p) const;
269
};
270
271
U_NAMESPACE_END
272
273
#endif  // !UCONFIG_NO_COLLATION
274
#endif  // __COLLATIONSETTINGS_H__