/src/icu/source/i18n/collationdatabuilder.h
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * Copyright (C) 2012-2014, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ******************************************************************************* |
8 | | * collationdatabuilder.h |
9 | | * |
10 | | * created on: 2012apr01 |
11 | | * created by: Markus W. Scherer |
12 | | */ |
13 | | |
14 | | #ifndef __COLLATIONDATABUILDER_H__ |
15 | | #define __COLLATIONDATABUILDER_H__ |
16 | | |
17 | | #include "unicode/utypes.h" |
18 | | |
19 | | #if !UCONFIG_NO_COLLATION |
20 | | |
21 | | #include "unicode/uniset.h" |
22 | | #include "unicode/unistr.h" |
23 | | #include "unicode/uversion.h" |
24 | | #include "collation.h" |
25 | | #include "collationdata.h" |
26 | | #include "collationsettings.h" |
27 | | #include "normalizer2impl.h" |
28 | | #include "utrie2.h" |
29 | | #include "uvectr32.h" |
30 | | #include "uvectr64.h" |
31 | | #include "uvector.h" |
32 | | |
33 | | U_NAMESPACE_BEGIN |
34 | | |
// Forward declarations. This header refers to these types only through
// pointers, references, or friend declarations, so their full definitions
// are not required here.
class CollationFastLatinBuilder;
class CopyHelper;
class DataBuilderCollationIterator;
class UCharsTrieBuilder;

struct ConditionalCE32;
41 | | |
42 | | /** |
43 | | * Low-level CollationData builder. |
44 | | * Takes (character, CE) pairs and builds them into runtime data structures. |
45 | | * Supports characters with context prefixes and contraction suffixes. |
46 | | */ |
47 | | class U_I18N_API CollationDataBuilder : public UObject { |
48 | | public: |
49 | | /** |
50 | | * Collation element modifier. Interface class for a modifier |
51 | | * that changes a tailoring builder's temporary CEs to final CEs. |
52 | | * Called for every non-special CE32 and every expansion CE. |
53 | | */ |
54 | | class CEModifier : public UObject { |
55 | | public: |
56 | | virtual ~CEModifier(); |
57 | | /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */ |
58 | | virtual int64_t modifyCE32(uint32_t ce32) const = 0; |
59 | | /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */ |
60 | | virtual int64_t modifyCE(int64_t ce) const = 0; |
61 | | }; |
62 | | |
63 | | CollationDataBuilder(UErrorCode &errorCode); |
64 | | |
65 | | virtual ~CollationDataBuilder(); |
66 | | |
67 | | void initForTailoring(const CollationData *b, UErrorCode &errorCode); |
68 | | |
69 | | virtual UBool isCompressibleLeadByte(uint32_t b) const; |
70 | | |
71 | 0 | inline UBool isCompressiblePrimary(uint32_t p) const { |
72 | 0 | return isCompressibleLeadByte(p >> 24); |
73 | 0 | } |
74 | | |
75 | | /** |
76 | | * @return true if this builder has mappings (e.g., add() has been called) |
77 | | */ |
78 | 0 | UBool hasMappings() const { return modified; } |
79 | | |
80 | | /** |
81 | | * @return true if c has CEs in this builder |
82 | | */ |
83 | | UBool isAssigned(UChar32 c) const; |
84 | | |
85 | | /** |
86 | | * @return the three-byte primary if c maps to a single such CE and has no context data, |
87 | | * otherwise returns 0. |
88 | | */ |
89 | | uint32_t getLongPrimaryIfSingleCE(UChar32 c) const; |
90 | | |
91 | | /** |
92 | | * @return the single CE for c. |
93 | | * Sets an error code if c does not have a single CE. |
94 | | */ |
95 | | int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const; |
96 | | |
97 | | void add(const UnicodeString &prefix, const UnicodeString &s, |
98 | | const int64_t ces[], int32_t cesLength, |
99 | | UErrorCode &errorCode); |
100 | | |
101 | | /** |
102 | | * Encodes the ces as either the returned ce32 by itself, |
103 | | * or by storing an expansion, with the returned ce32 referring to that. |
104 | | * |
105 | | * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) |
106 | | */ |
107 | | virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode); |
108 | | void addCE32(const UnicodeString &prefix, const UnicodeString &s, |
109 | | uint32_t ce32, UErrorCode &errorCode); |
110 | | |
111 | | /** |
112 | | * Sets three-byte-primary CEs for a range of code points in code point order, |
113 | | * if it is worth doing; otherwise no change is made. |
114 | | * None of the code points in the range should have complex mappings so far |
115 | | * (expansions/contractions/prefixes). |
116 | | * @param start first code point |
117 | | * @param end last code point (inclusive) |
118 | | * @param primary primary weight for 'start' |
119 | | * @param step per-code point primary-weight increment |
120 | | * @param errorCode ICU in/out error code |
121 | | * @return true if an OFFSET_TAG range was used for start..end |
122 | | */ |
123 | | UBool maybeSetPrimaryRange(UChar32 start, UChar32 end, |
124 | | uint32_t primary, int32_t step, |
125 | | UErrorCode &errorCode); |
126 | | |
127 | | /** |
128 | | * Sets three-byte-primary CEs for a range of code points in code point order. |
129 | | * Sets range values if that is worth doing, or else individual values. |
130 | | * None of the code points in the range should have complex mappings so far |
131 | | * (expansions/contractions/prefixes). |
132 | | * @param start first code point |
133 | | * @param end last code point (inclusive) |
134 | | * @param primary primary weight for 'start' |
135 | | * @param step per-code point primary-weight increment |
136 | | * @param errorCode ICU in/out error code |
137 | | * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step |
138 | | */ |
139 | | uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, |
140 | | uint32_t primary, int32_t step, |
141 | | UErrorCode &errorCode); |
142 | | |
143 | | /** |
144 | | * Copies all mappings from the src builder, with modifications. |
145 | | * This builder here must not be built yet, and should be empty. |
146 | | */ |
147 | | void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, |
148 | | UErrorCode &errorCode); |
149 | | |
150 | | void optimize(const UnicodeSet &set, UErrorCode &errorCode); |
151 | | void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode); |
152 | | |
153 | 0 | void enableFastLatin() { fastLatinEnabled = true; } |
154 | | virtual void build(CollationData &data, UErrorCode &errorCode); |
155 | | |
156 | | /** |
157 | | * Looks up CEs for s and appends them to the ces array. |
158 | | * Does not handle normalization: s should be in FCD form. |
159 | | * |
160 | | * Does not write completely ignorable CEs. |
161 | | * Does not write beyond Collation::MAX_EXPANSION_LENGTH. |
162 | | * |
163 | | * @return incremented cesLength |
164 | | */ |
165 | | int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength); |
166 | | int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s, |
167 | | int64_t ces[], int32_t cesLength); |
168 | | |
169 | | protected: |
170 | | friend class CopyHelper; |
171 | | friend class DataBuilderCollationIterator; |
172 | | |
173 | | uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const; |
174 | | |
175 | | int32_t addCE(int64_t ce, UErrorCode &errorCode); |
176 | | int32_t addCE32(uint32_t ce32, UErrorCode &errorCode); |
177 | | int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode); |
178 | | |
179 | 0 | inline ConditionalCE32 *getConditionalCE32(int32_t index) const { |
180 | 0 | return static_cast<ConditionalCE32 *>(conditionalCE32s[index]); |
181 | 0 | } |
182 | 0 | inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const { |
183 | 0 | return getConditionalCE32(Collation::indexFromCE32(ce32)); |
184 | 0 | } |
185 | | |
186 | 0 | static uint32_t makeBuilderContextCE32(int32_t index) { |
187 | 0 | return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index); |
188 | 0 | } |
189 | 0 | static inline UBool isBuilderContextCE32(uint32_t ce32) { |
190 | 0 | return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG); |
191 | 0 | } |
192 | | |
193 | | static uint32_t encodeOneCEAsCE32(int64_t ce); |
194 | | uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode); |
195 | | uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode); |
196 | | uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode); |
197 | | |
198 | | uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode); |
199 | | /** |
200 | | * Copies base contractions to a list of ConditionalCE32. |
201 | | * Sets cond->next to the index of the first new item |
202 | | * and returns the index of the last new item. |
203 | | */ |
204 | | int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, |
205 | | ConditionalCE32 *cond, UErrorCode &errorCode); |
206 | | |
207 | | UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode); |
208 | | void setDigitTags(UErrorCode &errorCode); |
209 | | void setLeadSurrogates(UErrorCode &errorCode); |
210 | | |
211 | | void buildMappings(CollationData &data, UErrorCode &errorCode); |
212 | | |
213 | | void clearContexts(); |
214 | | void buildContexts(UErrorCode &errorCode); |
215 | | uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode); |
216 | | int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, |
217 | | UErrorCode &errorCode); |
218 | | |
219 | | void buildFastLatinTable(CollationData &data, UErrorCode &errorCode); |
220 | | |
221 | | int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength); |
222 | | |
223 | 0 | static UChar32 jamoCpFromIndex(int32_t i) { |
224 | | // 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27 |
225 | 0 | if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; } |
226 | 0 | i -= Hangul::JAMO_L_COUNT; |
227 | 0 | if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; } |
228 | 0 | i -= Hangul::JAMO_V_COUNT; |
229 | | // i < 27 |
230 | 0 | return Hangul::JAMO_T_BASE + 1 + i; |
231 | 0 | } |
232 | | |
233 | | /** @see Collation::BUILDER_DATA_TAG */ |
234 | | static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100; |
235 | | |
236 | | const Normalizer2Impl &nfcImpl; |
237 | | const CollationData *base; |
238 | | const CollationSettings *baseSettings; |
239 | | UTrie2 *trie; |
240 | | UVector32 ce32s; |
241 | | UVector64 ce64s; |
242 | | UVector conditionalCE32s; // vector of ConditionalCE32 |
243 | | // Characters that have context (prefixes or contraction suffixes). |
244 | | UnicodeSet contextChars; |
245 | | // Serialized UCharsTrie structures for finalized contexts. |
246 | | UnicodeString contexts; |
247 | | UnicodeSet unsafeBackwardSet; |
248 | | UBool modified; |
249 | | |
250 | | UBool fastLatinEnabled; |
251 | | CollationFastLatinBuilder *fastLatinBuilder; |
252 | | |
253 | | DataBuilderCollationIterator *collIter; |
254 | | }; |
255 | | |
256 | | U_NAMESPACE_END |
257 | | |
258 | | #endif // !UCONFIG_NO_COLLATION |
259 | | #endif // __COLLATIONDATABUILDER_H__ |