/src/icu/icu4c/source/common/ucharstriebuilder.cpp
Line | Count | Source |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * Copyright (C) 2010-2012, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ******************************************************************************* |
8 | | * file name: ucharstriebuilder.h |
9 | | * encoding: UTF-8 |
10 | | * tab size: 8 (not used) |
11 | | * indentation:4 |
12 | | * |
13 | | * created on: 2010nov14 |
14 | | * created by: Markus W. Scherer |
15 | | */ |
16 | | |
17 | | #include "unicode/utypes.h" |
18 | | #include "unicode/ucharstrie.h" |
19 | | #include "unicode/ucharstriebuilder.h" |
20 | | #include "unicode/unistr.h" |
21 | | #include "unicode/ustring.h" |
22 | | #include "cmemory.h" |
23 | | #include "uarrsort.h" |
24 | | #include "uassert.h" |
25 | | #include "uhash.h" |
26 | | #include "ustr_imp.h" |
27 | | |
28 | | U_NAMESPACE_BEGIN |
29 | | |
30 | | /* |
31 | | * Note: This builder implementation stores (string, value) pairs with full copies |
32 | | * of the 16-bit-unit sequences, until the UCharsTrie is built. |
33 | | * It might(!) take less memory if we collected the data in a temporary, dynamic trie. |
34 | | */ |
35 | | |
36 | | class UCharsTrieElement : public UMemory { |
37 | | public: |
38 | | // Use compiler's default constructor, initializes nothing. |
39 | | |
40 | | void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode); |
41 | | |
42 | 721M | UnicodeString getString(const UnicodeString &strings) const { |
43 | 721M | int32_t length=strings[stringOffset]; |
44 | 721M | return strings.tempSubString(stringOffset+1, length); |
45 | 721M | } |
46 | 153M | int32_t getStringLength(const UnicodeString &strings) const { |
47 | 153M | return strings[stringOffset]; |
48 | 153M | } |
49 | | |
50 | 1.51G | char16_t charAt(int32_t index, const UnicodeString &strings) const { |
51 | 1.51G | return strings[stringOffset+1+index]; |
52 | 1.51G | } |
53 | | |
54 | 42.2M | int32_t getValue() const { return value; } |
55 | | |
56 | | int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const; |
57 | | |
58 | | private: |
59 | | // The first strings unit contains the string length. |
60 | | // (Compared with a stringLength field here, this saves 2 bytes per string.) |
61 | | int32_t stringOffset; |
62 | | int32_t value; |
63 | | }; |
64 | | |
65 | | void |
66 | | UCharsTrieElement::setTo(const UnicodeString &s, int32_t val, |
67 | 42.2M | UnicodeString &strings, UErrorCode &errorCode) { |
68 | 42.2M | if(U_FAILURE(errorCode)) { |
69 | 0 | return; |
70 | 0 | } |
71 | 42.2M | int32_t length=s.length(); |
72 | 42.2M | if(length>0xffff) { |
73 | | // Too long: We store the length in 1 unit. |
74 | 0 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
75 | 0 | return; |
76 | 0 | } |
77 | 42.2M | stringOffset=strings.length(); |
78 | 42.2M | strings.append(static_cast<char16_t>(length)); |
79 | 42.2M | value=val; |
80 | 42.2M | strings.append(s); |
81 | 42.2M | } |
82 | | |
83 | | int32_t |
84 | 312M | UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const { |
85 | 312M | return getString(strings).compare(other.getString(strings)); |
86 | 312M | } |
87 | | |
88 | | UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/) |
89 | 871k | : elements(nullptr), elementsCapacity(0), elementsLength(0), |
90 | 871k | uchars(nullptr), ucharsCapacity(0), ucharsLength(0) {} |
91 | | |
92 | 871k | UCharsTrieBuilder::~UCharsTrieBuilder() { |
93 | 871k | delete[] elements; |
94 | 871k | uprv_free(uchars); |
95 | 871k | } |
96 | | |
97 | | UCharsTrieBuilder & |
98 | 42.2M | UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) { |
99 | 42.2M | if(U_FAILURE(errorCode)) { |
100 | 0 | return *this; |
101 | 0 | } |
102 | 42.2M | if(ucharsLength>0) { |
103 | | // Cannot add elements after building. |
104 | 0 | errorCode=U_NO_WRITE_PERMISSION; |
105 | 0 | return *this; |
106 | 0 | } |
107 | 42.2M | if(elementsLength==elementsCapacity) { |
108 | 483k | int32_t newCapacity; |
109 | 483k | if(elementsCapacity==0) { |
110 | 481k | newCapacity=1024; |
111 | 481k | } else { |
112 | 1.99k | newCapacity=4*elementsCapacity; |
113 | 1.99k | } |
114 | 483k | UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity]; |
115 | 483k | if(newElements==nullptr) { |
116 | 0 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
117 | 0 | return *this; |
118 | 0 | } |
119 | 483k | if(elementsLength>0) { |
120 | 1.99k | uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(UCharsTrieElement)); |
121 | 1.99k | } |
122 | 483k | delete[] elements; |
123 | 483k | elements=newElements; |
124 | 483k | elementsCapacity=newCapacity; |
125 | 483k | } |
126 | 42.2M | elements[elementsLength++].setTo(s, value, strings, errorCode); |
127 | 42.2M | if(U_SUCCESS(errorCode) && strings.isBogus()) { |
128 | 0 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
129 | 0 | } |
130 | 42.2M | return *this; |
131 | 42.2M | } |
132 | | |
133 | | U_CDECL_BEGIN |
134 | | |
135 | | static int32_t U_CALLCONV |
136 | 312M | compareElementStrings(const void *context, const void *left, const void *right) { |
137 | 312M | const UnicodeString *strings=static_cast<const UnicodeString *>(context); |
138 | 312M | const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left); |
139 | 312M | const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right); |
140 | 312M | return leftElement->compareStringTo(*rightElement, *strings); |
141 | 312M | } |
142 | | |
143 | | U_CDECL_END |
144 | | |
145 | | UCharsTrie * |
146 | 80 | UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { |
147 | 80 | buildUChars(buildOption, errorCode); |
148 | 80 | UCharsTrie *newTrie=nullptr; |
149 | 80 | if(U_SUCCESS(errorCode)) { |
150 | 80 | newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength)); |
151 | 80 | if(newTrie==nullptr) { |
152 | 0 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
153 | 80 | } else { |
154 | 80 | uchars=nullptr; // The new trie now owns the array. |
155 | 80 | ucharsCapacity=0; |
156 | 80 | } |
157 | 80 | } |
158 | 80 | return newTrie; |
159 | 80 | } |
160 | | |
161 | | UnicodeString & |
162 | | UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result, |
163 | 1.23M | UErrorCode &errorCode) { |
164 | 1.23M | buildUChars(buildOption, errorCode); |
165 | 1.23M | if(U_SUCCESS(errorCode)) { |
166 | 1.23M | result.setTo(false, uchars+(ucharsCapacity-ucharsLength), ucharsLength); |
167 | 1.23M | } |
168 | 1.23M | return result; |
169 | 1.23M | } |
170 | | |
171 | | void |
172 | 1.23M | UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { |
173 | 1.23M | if(U_FAILURE(errorCode)) { |
174 | 0 | return; |
175 | 0 | } |
176 | 1.23M | if(uchars!=nullptr && ucharsLength>0) { |
177 | | // Already built. |
178 | 0 | return; |
179 | 0 | } |
180 | 1.23M | if(ucharsLength==0) { |
181 | 1.23M | if(elementsLength==0) { |
182 | 0 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
183 | 0 | return; |
184 | 0 | } |
185 | 1.23M | if(strings.isBogus()) { |
186 | 0 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
187 | 0 | return; |
188 | 0 | } |
189 | 1.23M | uprv_sortArray(elements, elementsLength, static_cast<int32_t>(sizeof(UCharsTrieElement)), |
190 | 1.23M | compareElementStrings, &strings, |
191 | 1.23M | false, // need not be a stable sort |
192 | 1.23M | &errorCode); |
193 | 1.23M | if(U_FAILURE(errorCode)) { |
194 | 0 | return; |
195 | 0 | } |
196 | | // Duplicate strings are not allowed. |
197 | 1.23M | UnicodeString prev=elements[0].getString(strings); |
198 | 42.2M | for(int32_t i=1; i<elementsLength; ++i) { |
199 | 40.9M | UnicodeString current=elements[i].getString(strings); |
200 | 40.9M | if(prev==current) { |
201 | 0 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
202 | 0 | return; |
203 | 0 | } |
204 | 40.9M | prev.fastCopyFrom(current); |
205 | 40.9M | } |
206 | 1.23M | } |
207 | | // Create and char16_t-serialize the trie for the elements. |
208 | 1.23M | ucharsLength=0; |
209 | 1.23M | int32_t capacity=strings.length(); |
210 | 1.23M | if(capacity<1024) { |
211 | 1.09M | capacity=1024; |
212 | 1.09M | } |
213 | 1.23M | if(ucharsCapacity<capacity) { |
214 | 487k | uprv_free(uchars); |
215 | 487k | uchars=static_cast<char16_t *>(uprv_malloc(capacity*2)); |
216 | 487k | if(uchars==nullptr) { |
217 | 0 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
218 | 0 | ucharsCapacity=0; |
219 | 0 | return; |
220 | 0 | } |
221 | 487k | ucharsCapacity=capacity; |
222 | 487k | } |
223 | 1.23M | StringTrieBuilder::build(buildOption, elementsLength, errorCode); |
224 | 1.23M | if(uchars==nullptr) { |
225 | 0 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
226 | 0 | } |
227 | 1.23M | } |
228 | | |
229 | | int32_t |
230 | 118M | UCharsTrieBuilder::getElementStringLength(int32_t i) const { |
231 | 118M | return elements[i].getStringLength(strings); |
232 | 118M | } |
233 | | |
234 | | char16_t |
235 | 175M | UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const { |
236 | 175M | return elements[i].charAt(unitIndex, strings); |
237 | 175M | } |
238 | | |
239 | | int32_t |
240 | 42.2M | UCharsTrieBuilder::getElementValue(int32_t i) const { |
241 | 42.2M | return elements[i].getValue(); |
242 | 42.2M | } |
243 | | |
244 | | int32_t |
245 | 34.8M | UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const { |
246 | 34.8M | const UCharsTrieElement &firstElement=elements[first]; |
247 | 34.8M | const UCharsTrieElement &lastElement=elements[last]; |
248 | 34.8M | int32_t minStringLength=firstElement.getStringLength(strings); |
249 | 461M | while(++unitIndex<minStringLength && |
250 | 440M | firstElement.charAt(unitIndex, strings)== |
251 | 440M | lastElement.charAt(unitIndex, strings)) {} |
252 | 34.8M | return unitIndex; |
253 | 34.8M | } |
254 | | |
255 | | int32_t |
256 | 21.0M | UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const { |
257 | 21.0M | int32_t length=0; // Number of different units at unitIndex. |
258 | 21.0M | int32_t i=start; |
259 | 61.6M | do { |
260 | 61.6M | char16_t unit=elements[i++].charAt(unitIndex, strings); |
261 | 240M | while(i<limit && unit==elements[i].charAt(unitIndex, strings)) { |
262 | 178M | ++i; |
263 | 178M | } |
264 | 61.6M | ++length; |
265 | 61.6M | } while(i<limit); |
266 | 21.0M | return length; |
267 | 21.0M | } |
268 | | |
269 | | int32_t |
270 | 2.45M | UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const { |
271 | 13.4M | do { |
272 | 13.4M | char16_t unit=elements[i++].charAt(unitIndex, strings); |
273 | 19.2M | while(unit==elements[i].charAt(unitIndex, strings)) { |
274 | 5.76M | ++i; |
275 | 5.76M | } |
276 | 13.4M | } while(--count>0); |
277 | 2.45M | return i; |
278 | 2.45M | } |
279 | | |
280 | | int32_t |
281 | 38.1M | UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const { |
282 | 142M | while(unit==elements[i].charAt(unitIndex, strings)) { |
283 | 103M | ++i; |
284 | 103M | } |
285 | 38.1M | return i; |
286 | 38.1M | } |
287 | | |
288 | | UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const char16_t *units, int32_t len, Node *nextNode) |
289 | 54.2M | : LinearMatchNode(len, nextNode), s(units) { |
290 | 54.2M | hash=hash*37u+ustr_hashUCharsN(units, len); |
291 | 54.2M | } |
292 | | |
293 | | bool |
294 | 48.2M | UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const { |
295 | 48.2M | if(this==&other) { |
296 | 0 | return true; |
297 | 0 | } |
298 | 48.2M | if(!LinearMatchNode::operator==(other)) { |
299 | 13.1k | return false; |
300 | 13.1k | } |
301 | 48.2M | const UCTLinearMatchNode &o=static_cast<const UCTLinearMatchNode &>(other); |
302 | 48.2M | return 0==u_memcmp(s, o.s, length); |
303 | 48.2M | } |
304 | | |
305 | | void |
306 | 10.7M | UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) { |
307 | 10.7M | UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder; |
308 | 10.7M | next->write(builder); |
309 | 10.7M | b.write(s, length); |
310 | 10.7M | offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1); |
311 | 10.7M | } |
312 | | |
313 | | StringTrieBuilder::Node * |
314 | | UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, |
315 | 54.2M | Node *nextNode) const { |
316 | 54.2M | return new UCTLinearMatchNode( |
317 | 54.2M | elements[i].getString(strings).getBuffer()+unitIndex, |
318 | 54.2M | length, |
319 | 54.2M | nextNode); |
320 | 54.2M | } |
321 | | |
322 | | UBool |
323 | 66.5M | UCharsTrieBuilder::ensureCapacity(int32_t length) { |
324 | 66.5M | if(uchars==nullptr) { |
325 | 0 | return false; // previous memory allocation had failed |
326 | 0 | } |
327 | 66.5M | if(length>ucharsCapacity) { |
328 | 1.30k | int32_t newCapacity=ucharsCapacity; |
329 | 1.30k | do { |
330 | 1.30k | newCapacity*=2; |
331 | 1.30k | } while(newCapacity<=length); |
332 | 1.30k | char16_t *newUChars=static_cast<char16_t *>(uprv_malloc(newCapacity*2)); |
333 | 1.30k | if(newUChars==nullptr) { |
334 | | // unable to allocate memory |
335 | 0 | uprv_free(uchars); |
336 | 0 | uchars=nullptr; |
337 | 0 | ucharsCapacity=0; |
338 | 0 | return false; |
339 | 0 | } |
340 | 1.30k | u_memcpy(newUChars+(newCapacity-ucharsLength), |
341 | 1.30k | uchars+(ucharsCapacity-ucharsLength), ucharsLength); |
342 | 1.30k | uprv_free(uchars); |
343 | 1.30k | uchars=newUChars; |
344 | 1.30k | ucharsCapacity=newCapacity; |
345 | 1.30k | } |
346 | 66.5M | return true; |
347 | 66.5M | } |
348 | | |
349 | | int32_t |
350 | 46.8M | UCharsTrieBuilder::write(int32_t unit) { |
351 | 46.8M | int32_t newLength=ucharsLength+1; |
352 | 46.8M | if(ensureCapacity(newLength)) { |
353 | 46.8M | ucharsLength=newLength; |
354 | 46.8M | uchars[ucharsCapacity - ucharsLength] = static_cast<char16_t>(unit); |
355 | 46.8M | } |
356 | 46.8M | return ucharsLength; |
357 | 46.8M | } |
358 | | |
359 | | int32_t |
360 | 19.6M | UCharsTrieBuilder::write(const char16_t *s, int32_t length) { |
361 | 19.6M | int32_t newLength=ucharsLength+length; |
362 | 19.6M | if(ensureCapacity(newLength)) { |
363 | 19.6M | ucharsLength=newLength; |
364 | 19.6M | u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length); |
365 | 19.6M | } |
366 | 19.6M | return ucharsLength; |
367 | 19.6M | } |
368 | | |
369 | | int32_t |
370 | 4.43k | UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) { |
371 | 4.43k | return write(elements[i].getString(strings).getBuffer()+unitIndex, length); |
372 | 4.43k | } |
373 | | |
374 | | int32_t |
375 | 17.2M | UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { |
376 | 17.2M | if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) { |
377 | 8.42M | return write(i|(isFinal<<15)); |
378 | 8.42M | } |
379 | 8.78M | char16_t intUnits[3]; |
380 | 8.78M | int32_t length; |
381 | 8.78M | if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) { |
382 | 3.82M | intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitValueLead); |
383 | 3.82M | intUnits[1] = static_cast<char16_t>(static_cast<uint32_t>(i) >> 16); |
384 | 3.82M | intUnits[2] = static_cast<char16_t>(i); |
385 | 3.82M | length=3; |
386 | | // } else if(i<=UCharsTrie::kMaxOneUnitValue) { |
387 | | // intUnits[0]=(char16_t)(i); |
388 | | // length=1; |
389 | 4.95M | } else { |
390 | 4.95M | intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitValueLead + (i >> 16)); |
391 | 4.95M | intUnits[1] = static_cast<char16_t>(i); |
392 | 4.95M | length=2; |
393 | 4.95M | } |
394 | 8.78M | intUnits[0] = static_cast<char16_t>(intUnits[0] | (isFinal << 15)); |
395 | 8.78M | return write(intUnits, length); |
396 | 17.2M | } |
397 | | |
398 | | int32_t |
399 | 16.7M | UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { |
400 | 16.7M | if(!hasValue) { |
401 | 16.6M | return write(node); |
402 | 16.6M | } |
403 | 124k | char16_t intUnits[3]; |
404 | 124k | int32_t length; |
405 | 124k | if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) { |
406 | 59.6k | intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitNodeValueLead); |
407 | 59.6k | intUnits[1] = static_cast<char16_t>(static_cast<uint32_t>(value) >> 16); |
408 | 59.6k | intUnits[2] = static_cast<char16_t>(value); |
409 | 59.6k | length=3; |
410 | 65.3k | } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) { |
411 | 4.98k | intUnits[0] = static_cast<char16_t>((value + 1) << 6); |
412 | 4.98k | length=1; |
413 | 60.3k | } else { |
414 | 60.3k | intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitNodeValueLead + ((value >> 10) & 0x7fc0)); |
415 | 60.3k | intUnits[1] = static_cast<char16_t>(value); |
416 | 60.3k | length=2; |
417 | 60.3k | } |
418 | 124k | intUnits[0] |= static_cast<char16_t>(node); |
419 | 124k | return write(intUnits, length); |
420 | 16.7M | } |
421 | | |
422 | | int32_t |
423 | 1.33M | UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) { |
424 | 1.33M | int32_t i=ucharsLength-jumpTarget; |
425 | 1.33M | U_ASSERT(i>=0); |
426 | 1.33M | if(i<=UCharsTrie::kMaxOneUnitDelta) { |
427 | 1.33M | return write(i); |
428 | 1.33M | } |
429 | 0 | char16_t intUnits[3]; |
430 | 0 | int32_t length; |
431 | 0 | if(i<=UCharsTrie::kMaxTwoUnitDelta) { |
432 | 0 | intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitDeltaLead + (i >> 16)); |
433 | 0 | length=1; |
434 | 0 | } else { |
435 | 0 | intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitDeltaLead); |
436 | 0 | intUnits[1] = static_cast<char16_t>(i >> 16); |
437 | 0 | length=2; |
438 | 0 | } |
439 | 0 | intUnits[length++] = static_cast<char16_t>(i); |
440 | 0 | return write(intUnits, length); |
441 | 1.33M | } |
442 | | |
443 | | U_NAMESPACE_END |