/src/icu/source/i18n/collationsettings.h
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * Copyright (C) 2013-2015, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ******************************************************************************* |
8 | | * collationsettings.h |
9 | | * |
10 | | * created on: 2013feb07 |
11 | | * created by: Markus W. Scherer |
12 | | */ |
13 | | |
14 | | #ifndef __COLLATIONSETTINGS_H__ |
15 | | #define __COLLATIONSETTINGS_H__ |
16 | | |
17 | | #include "unicode/utypes.h" |
18 | | |
19 | | #if !UCONFIG_NO_COLLATION |
20 | | |
21 | | #include "unicode/ucol.h" |
22 | | #include "collation.h" |
23 | | #include "sharedobject.h" |
24 | | #include "umutex.h" |
25 | | |
26 | | U_NAMESPACE_BEGIN |
27 | | |
28 | | struct CollationData; |
29 | | |
30 | | /** |
31 | | * Collation settings/options/attributes. |
32 | | * These are the values that can be changed via API. |
33 | | */ |
34 | | struct U_I18N_API CollationSettings : public SharedObject { |
35 | | /** |
36 | | * Options bit 0: Perform the FCD check on the input text and deliver normalized text. |
37 | | */ |
38 | | static const int32_t CHECK_FCD = 1; |
39 | | /** |
40 | | * Options bit 1: Numeric collation. |
41 | | * Also known as CODAN = COllate Digits As Numbers. |
42 | | * |
43 | | * Treat digit sequences as numbers with CE sequences in numeric order, |
44 | | * rather than returning a normal CE for each digit. |
45 | | */ |
46 | | static const int32_t NUMERIC = 2; |
47 | | /** |
48 | | * "Shifted" alternate handling, see ALTERNATE_MASK. |
49 | | */ |
50 | | static const int32_t SHIFTED = 4; |
51 | | /** |
52 | | * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable. |
53 | | * Reserve values 8 and 0xc for shift-trimmed and blanked. |
54 | | */ |
55 | | static const int32_t ALTERNATE_MASK = 0xc; |
56 | | /** |
57 | | * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value. |
58 | | */ |
59 | | static const int32_t MAX_VARIABLE_SHIFT = 4; |
60 | | /** maxVariable options bit mask before shifting. */ |
61 | | static const int32_t MAX_VARIABLE_MASK = 0x70; |
62 | | /** Options bit 7: Reserved/unused/0. */ |
63 | | /** |
64 | | * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on. |
65 | | */ |
66 | | static const int32_t UPPER_FIRST = 0x100; |
67 | | /** |
68 | | * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values) |
69 | | * unless case level is on (when they are *moved* into the separate case level). |
70 | | * By default, the case bits are removed from the tertiary weight (ignored). |
71 | | * |
72 | | * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to |
73 | | * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST. |
74 | | */ |
75 | | static const int32_t CASE_FIRST = 0x200; |
76 | | /** |
77 | | * Options bit mask for caseFirst and upperFirst, before shifting. |
78 | | * Same value as caseFirst==upperFirst. |
79 | | */ |
80 | | static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST; |
81 | | /** |
82 | | * Options bit 10: Insert the case level between the secondary and tertiary levels. |
83 | | */ |
84 | | static const int32_t CASE_LEVEL = 0x400; |
85 | | /** |
86 | | * Options bit 11: Compare secondary weights backwards. ("French secondary") |
87 | | */ |
88 | | static const int32_t BACKWARD_SECONDARY = 0x800; |
89 | | /** |
90 | | * Options bits 15..12: The 4-bit strength value bit field is shifted by this value. |
91 | | * It is the top used bit field in the options. (No need to mask after shifting.) |
92 | | */ |
93 | | static const int32_t STRENGTH_SHIFT = 12; |
94 | | /** Strength options bit mask before shifting. */ |
95 | | static const int32_t STRENGTH_MASK = 0xf000; |
96 | | |
97 | | /** maxVariable values */ |
98 | | enum MaxVariable { |
99 | | MAX_VAR_SPACE, |
100 | | MAX_VAR_PUNCT, |
101 | | MAX_VAR_SYMBOL, |
102 | | MAX_VAR_CURRENCY |
103 | | }; |
104 | | |
105 | | CollationSettings() |
106 | 0 | : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) | |
107 | 0 | (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)), |
108 | 0 | variableTop(0), |
109 | | reorderTable(NULL), |
110 | 0 | minHighNoReorder(0), |
111 | 0 | reorderRanges(NULL), reorderRangesLength(0), |
112 | 0 | reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0), |
113 | 0 | fastLatinOptions(-1) {} |
114 | | |
115 | | CollationSettings(const CollationSettings &other); |
116 | | virtual ~CollationSettings(); |
117 | | |
118 | | bool operator==(const CollationSettings &other) const; |
119 | | |
120 | 0 | inline bool operator!=(const CollationSettings &other) const { |
121 | 0 | return !operator==(other); |
122 | 0 | } |
123 | | |
124 | | int32_t hashCode() const; |
125 | | |
126 | | void resetReordering(); |
127 | | void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length, |
128 | | const uint32_t *ranges, int32_t rangesLength, |
129 | | const uint8_t *table, UErrorCode &errorCode); |
130 | | void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength, |
131 | | UErrorCode &errorCode); |
132 | | void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode); |
133 | | |
134 | 0 | inline UBool hasReordering() const { return reorderTable != NULL; } |
135 | | static UBool reorderTableHasSplitBytes(const uint8_t table[256]); |
136 | 0 | inline uint32_t reorder(uint32_t p) const { |
137 | 0 | uint8_t b = reorderTable[p >> 24]; |
138 | 0 | if(b != 0 || p <= Collation::NO_CE_PRIMARY) { |
139 | 0 | return ((uint32_t)b << 24) | (p & 0xffffff); |
140 | 0 | } else { |
141 | 0 | return reorderEx(p); |
142 | 0 | } |
143 | 0 | } |
144 | | |
145 | | void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); |
146 | | |
147 | 0 | static int32_t getStrength(int32_t options) { |
148 | 0 | return options >> STRENGTH_SHIFT; |
149 | 0 | } |
150 | | |
151 | 0 | int32_t getStrength() const { |
152 | 0 | return getStrength(options); |
153 | 0 | } |
154 | | |
155 | | /** Sets the options bit for an on/off attribute. */ |
156 | | void setFlag(int32_t bit, UColAttributeValue value, |
157 | | int32_t defaultOptions, UErrorCode &errorCode); |
158 | | |
159 | 0 | UColAttributeValue getFlag(int32_t bit) const { |
160 | 0 | return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF; |
161 | 0 | } |
162 | | |
163 | | void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode); |
164 | | |
165 | 0 | UColAttributeValue getCaseFirst() const { |
166 | 0 | int32_t option = options & CASE_FIRST_AND_UPPER_MASK; |
167 | 0 | return (option == 0) ? UCOL_OFF : |
168 | 0 | (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST; |
169 | 0 | } |
170 | | |
171 | | void setAlternateHandling(UColAttributeValue value, |
172 | | int32_t defaultOptions, UErrorCode &errorCode); |
173 | | |
174 | 0 | UColAttributeValue getAlternateHandling() const { |
175 | 0 | return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED; |
176 | 0 | } |
177 | | |
178 | | void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); |
179 | | |
180 | 0 | MaxVariable getMaxVariable() const { |
181 | 0 | return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT); |
182 | 0 | } |
183 | | |
184 | | /** |
185 | | * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off. |
186 | | */ |
187 | 0 | static inline UBool isTertiaryWithCaseBits(int32_t options) { |
188 | 0 | return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST; |
189 | 0 | } |
190 | 0 | static uint32_t getTertiaryMask(int32_t options) { |
191 | | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. |
192 | 0 | return isTertiaryWithCaseBits(options) ? |
193 | 0 | Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK; |
194 | 0 | } |
195 | | |
196 | 0 | static UBool sortsTertiaryUpperCaseFirst(int32_t options) { |
197 | | // On tertiary level, consider case bits and sort uppercase first |
198 | | // if caseLevel is off and caseFirst==upperFirst. |
199 | 0 | return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK; |
200 | 0 | } |
201 | | |
202 | 0 | inline UBool dontCheckFCD() const { |
203 | 0 | return (options & CHECK_FCD) == 0; |
204 | 0 | } |
205 | | |
206 | 0 | inline UBool hasBackwardSecondary() const { |
207 | 0 | return (options & BACKWARD_SECONDARY) != 0; |
208 | 0 | } |
209 | | |
210 | 0 | inline UBool isNumeric() const { |
211 | 0 | return (options & NUMERIC) != 0; |
212 | 0 | } |
213 | | |
214 | | /** CHECK_FCD etc. */ |
215 | | int32_t options; |
216 | | /** Variable-top primary weight. */ |
217 | | uint32_t variableTop; |
218 | | /** |
219 | | * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering. |
220 | | * A 0 entry at a non-zero index means that the primary lead byte is "split" |
221 | | * (there are different offsets for primaries that share that lead byte) |
222 | | * and the reordering offset must be determined via the reorderRanges. |
223 | | */ |
224 | | const uint8_t *reorderTable; |
225 | | /** Limit of last reordered range. 0 if no reordering or no split bytes. */ |
226 | | uint32_t minHighNoReorder; |
227 | | /** |
228 | | * Primary-weight ranges for script reordering, |
229 | | * to be used by reorder(p) for split-reordered primary lead bytes. |
230 | | * |
231 | | * Each entry is a (limit, offset) pair. |
232 | | * The upper 16 bits of the entry are the upper 16 bits of the |
233 | | * exclusive primary limit of a range. |
234 | | * Primaries between the previous limit and this one have their lead bytes |
235 | | * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits. |
236 | | * |
237 | | * CollationData::makeReorderRanges() writes a full list where the first range |
238 | | * (at least for terminators and separators) has a 0 offset. |
239 | | * The last range has a non-zero offset. |
240 | | * minHighNoReorder is set to the limit of that last range. |
241 | | * |
242 | | * In the settings object, the initial ranges before the first split lead byte |
243 | | * are omitted for efficiency; they are handled by reorder(p) via the reorderTable. |
244 | | * If there are no split-reordered lead bytes, then no ranges are needed. |
245 | | */ |
246 | | const uint32_t *reorderRanges; |
247 | | int32_t reorderRangesLength; |
248 | | /** Array of reorder codes; ignored if reorderCodesLength == 0. */ |
249 | | const int32_t *reorderCodes; |
250 | | /** Number of reorder codes; 0 if no reordering. */ |
251 | | int32_t reorderCodesLength; |
252 | | /** |
253 | | * Capacity of reorderCodes. |
254 | | * If 0, then the codes, the ranges, and the table are aliases. |
255 | | * Otherwise, this object owns the memory via the reorderCodes pointer; |
256 | | * the codes, the ranges, and the table are in the same memory block, in that order. |
257 | | */ |
258 | | int32_t reorderCodesCapacity; |
259 | | |
260 | | /** Options for CollationFastLatin. Negative if disabled. */ |
261 | | int32_t fastLatinOptions; |
262 | | uint16_t fastLatinPrimaries[0x180]; |
263 | | |
264 | | private: |
265 | | void setReorderArrays(const int32_t *codes, int32_t codesLength, |
266 | | const uint32_t *ranges, int32_t rangesLength, |
267 | | const uint8_t *table, UErrorCode &errorCode); |
268 | | uint32_t reorderEx(uint32_t p) const; |
269 | | }; |
270 | | |
271 | | U_NAMESPACE_END |
272 | | |
273 | | #endif // !UCONFIG_NO_COLLATION |
274 | | #endif // __COLLATIONSETTINGS_H__ |