/src/icu/source/i18n/coleitr.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * Copyright (C) 1996-2014, International Business Machines Corporation and |
6 | | * others. All Rights Reserved. |
7 | | ******************************************************************************* |
8 | | */ |
9 | | |
10 | | /* |
11 | | * File coleitr.cpp |
12 | | * |
13 | | * Created by: Helena Shih |
14 | | * |
15 | | * Modification History: |
16 | | * |
17 | | * Date Name Description |
18 | | * |
19 | | * 6/23/97 helena Adding comments to make code more readable. |
20 | | * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java |
21 | | * 12/10/99 aliu Ported Thai collation support from Java. |
22 | | * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) |
23 | | * 02/19/01 swquek Removed CollationElementIterator() since it is |
24 | | * private constructor and no calls are made to it |
25 | | * 2012-2014 markus Rewritten in C++ again. |
26 | | */ |
27 | | |
28 | | #include "unicode/utypes.h" |
29 | | |
30 | | #if !UCONFIG_NO_COLLATION |
31 | | |
32 | | #include "unicode/chariter.h" |
33 | | #include "unicode/coleitr.h" |
34 | | #include "unicode/tblcoll.h" |
35 | | #include "unicode/ustring.h" |
36 | | #include "cmemory.h" |
37 | | #include "collation.h" |
38 | | #include "collationdata.h" |
39 | | #include "collationiterator.h" |
40 | | #include "collationsets.h" |
41 | | #include "collationtailoring.h" |
42 | | #include "uassert.h" |
43 | | #include "uhash.h" |
44 | | #include "utf16collationiterator.h" |
45 | | #include "uvectr32.h" |
46 | | |
47 | | /* Constants --------------------------------------------------------------- */ |
48 | | |
49 | | U_NAMESPACE_BEGIN |
50 | | |
51 | | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator) |
52 | | |
53 | | /* CollationElementIterator public constructor/destructor ------------------ */ |
54 | | |
55 | | CollationElementIterator::CollationElementIterator( |
56 | | const CollationElementIterator& other) |
57 | 0 | : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) { |
58 | 0 | *this = other; |
59 | 0 | } |
60 | | |
61 | | CollationElementIterator::~CollationElementIterator() |
62 | 0 | { |
63 | 0 | delete iter_; |
64 | 0 | delete offsets_; |
65 | 0 | } |
66 | | |
67 | | /* CollationElementIterator public methods --------------------------------- */ |
68 | | |
69 | | namespace { |
70 | | |
// Builds the first 32-bit half from the two 32-bit words of a 64-bit CE:
// the upper 16 bits of p, bits 31..24 of lower32 moved to bits 15..8,
// and bits 15..8 of lower32 moved to bits 7..0.
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    const uint32_t upper16 = p & 0xffff0000;
    const uint32_t middleByte = (lower32 >> 16) & 0xff00;
    const uint32_t lowByte = (lower32 >> 8) & 0xff;
    return upper16 | middleByte | lowByte;
}
// Builds the second 32-bit half from the two 32-bit words of a 64-bit CE:
// the lower 16 bits of p moved to bits 31..16, bits 23..16 of lower32 moved
// to bits 15..8, and the lowest 6 bits of lower32 kept in place.
// A result of 0 means the CE has nothing to put in a second half.
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    const uint32_t upper16 = p << 16;
    const uint32_t middleByte = (lower32 >> 8) & 0xff00;
    const uint32_t lowBits = lower32 & 0x3f;
    return upper16 | middleByte | lowBits;
}
77 | 0 | UBool ceNeedsTwoParts(int64_t ce) { |
78 | 0 | return (ce & INT64_C(0xffff00ff003f)) != 0; |
79 | 0 | } |
80 | | |
81 | | } // namespace |
82 | | |
83 | | int32_t CollationElementIterator::getOffset() const |
84 | 0 | { |
85 | 0 | if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) { |
86 | | // CollationIterator::previousCE() decrements the CEs length |
87 | | // while it pops CEs from its internal buffer. |
88 | 0 | int32_t i = iter_->getCEsLength(); |
89 | 0 | if (otherHalf_ != 0) { |
90 | | // Return the trailing CE offset while we are in the middle of a 64-bit CE. |
91 | 0 | ++i; |
92 | 0 | } |
93 | 0 | U_ASSERT(i < offsets_->size()); |
94 | 0 | return offsets_->elementAti(i); |
95 | 0 | } |
96 | 0 | return iter_->getOffset(); |
97 | 0 | } |
98 | | |
99 | | /** |
100 | | * Get the ordering priority of the next character in the string. |
101 | | * @return the next character's ordering. Returns NULLORDER if an error has |
102 | | * occurred or if the end of string has been reached |
103 | | */ |
104 | | int32_t CollationElementIterator::next(UErrorCode& status) |
105 | 0 | { |
106 | 0 | if (U_FAILURE(status)) { return NULLORDER; } |
107 | 0 | if (dir_ > 1) { |
108 | | // Continue forward iteration. Test this first. |
109 | 0 | if (otherHalf_ != 0) { |
110 | 0 | uint32_t oh = otherHalf_; |
111 | 0 | otherHalf_ = 0; |
112 | 0 | return oh; |
113 | 0 | } |
114 | 0 | } else if (dir_ == 1) { |
115 | | // next() after setOffset() |
116 | 0 | dir_ = 2; |
117 | 0 | } else if (dir_ == 0) { |
118 | | // The iter_ is already reset to the start of the text. |
119 | 0 | dir_ = 2; |
120 | 0 | } else /* dir_ < 0 */ { |
121 | | // illegal change of direction |
122 | 0 | status = U_INVALID_STATE_ERROR; |
123 | 0 | return NULLORDER; |
124 | 0 | } |
125 | | // No need to keep all CEs in the buffer when we iterate. |
126 | 0 | iter_->clearCEsIfNoneRemaining(); |
127 | 0 | int64_t ce = iter_->nextCE(status); |
128 | 0 | if (ce == Collation::NO_CE) { return NULLORDER; } |
129 | | // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. |
130 | 0 | uint32_t p = (uint32_t)(ce >> 32); |
131 | 0 | uint32_t lower32 = (uint32_t)ce; |
132 | 0 | uint32_t firstHalf = getFirstHalf(p, lower32); |
133 | 0 | uint32_t secondHalf = getSecondHalf(p, lower32); |
134 | 0 | if (secondHalf != 0) { |
135 | 0 | otherHalf_ = secondHalf | 0xc0; // continuation CE |
136 | 0 | } |
137 | 0 | return firstHalf; |
138 | 0 | } |
139 | | |
140 | | bool CollationElementIterator::operator!=( |
141 | | const CollationElementIterator& other) const |
142 | 0 | { |
143 | 0 | return !(*this == other); |
144 | 0 | } |
145 | | |
146 | | bool CollationElementIterator::operator==( |
147 | | const CollationElementIterator& that) const |
148 | 0 | { |
149 | 0 | if (this == &that) { |
150 | 0 | return TRUE; |
151 | 0 | } |
152 | | |
153 | 0 | return |
154 | 0 | (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) && |
155 | 0 | otherHalf_ == that.otherHalf_ && |
156 | 0 | normalizeDir() == that.normalizeDir() && |
157 | 0 | string_ == that.string_ && |
158 | 0 | *iter_ == *that.iter_; |
159 | 0 | } |
160 | | |
161 | | /** |
162 | | * Get the ordering priority of the previous collation element in the string. |
163 | | * @param status the error code status. |
164 | | * @return the previous element's ordering. Returns NULLORDER if an error has |
165 | | * occurred or if the start of string has been reached. |
166 | | */ |
167 | | int32_t CollationElementIterator::previous(UErrorCode& status) |
168 | 0 | { |
169 | 0 | if (U_FAILURE(status)) { return NULLORDER; } |
170 | 0 | if (dir_ < 0) { |
171 | | // Continue backwards iteration. Test this first. |
172 | 0 | if (otherHalf_ != 0) { |
173 | 0 | uint32_t oh = otherHalf_; |
174 | 0 | otherHalf_ = 0; |
175 | 0 | return oh; |
176 | 0 | } |
177 | 0 | } else if (dir_ == 0) { |
178 | 0 | iter_->resetToOffset(string_.length()); |
179 | 0 | dir_ = -1; |
180 | 0 | } else if (dir_ == 1) { |
181 | | // previous() after setOffset() |
182 | 0 | dir_ = -1; |
183 | 0 | } else /* dir_ > 1 */ { |
184 | | // illegal change of direction |
185 | 0 | status = U_INVALID_STATE_ERROR; |
186 | 0 | return NULLORDER; |
187 | 0 | } |
188 | 0 | if (offsets_ == NULL) { |
189 | 0 | offsets_ = new UVector32(status); |
190 | 0 | if (offsets_ == NULL) { |
191 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
192 | 0 | return NULLORDER; |
193 | 0 | } |
194 | 0 | } |
195 | | // If we already have expansion CEs, then we also have offsets. |
196 | | // Otherwise remember the trailing offset in case we need to |
197 | | // write offsets for an artificial expansion. |
198 | 0 | int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0; |
199 | 0 | int64_t ce = iter_->previousCE(*offsets_, status); |
200 | 0 | if (ce == Collation::NO_CE) { return NULLORDER; } |
201 | | // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits. |
202 | 0 | uint32_t p = (uint32_t)(ce >> 32); |
203 | 0 | uint32_t lower32 = (uint32_t)ce; |
204 | 0 | uint32_t firstHalf = getFirstHalf(p, lower32); |
205 | 0 | uint32_t secondHalf = getSecondHalf(p, lower32); |
206 | 0 | if (secondHalf != 0) { |
207 | 0 | if (offsets_->isEmpty()) { |
208 | | // When we convert a single 64-bit CE into two 32-bit CEs, |
209 | | // we need to make this artificial expansion behave like a normal expansion. |
210 | | // See CollationIterator::previousCE(). |
211 | 0 | offsets_->addElement(iter_->getOffset(), status); |
212 | 0 | offsets_->addElement(limitOffset, status); |
213 | 0 | } |
214 | 0 | otherHalf_ = firstHalf; |
215 | 0 | return secondHalf | 0xc0; // continuation CE |
216 | 0 | } |
217 | 0 | return firstHalf; |
218 | 0 | } |
219 | | |
220 | | /** |
221 | | * Resets the cursor to the beginning of the string. |
222 | | */ |
223 | | void CollationElementIterator::reset() |
224 | 0 | { |
225 | 0 | iter_ ->resetToOffset(0); |
226 | 0 | otherHalf_ = 0; |
227 | 0 | dir_ = 0; |
228 | 0 | } |
229 | | |
230 | | void CollationElementIterator::setOffset(int32_t newOffset, |
231 | | UErrorCode& status) |
232 | 0 | { |
233 | 0 | if (U_FAILURE(status)) { return; } |
234 | 0 | if (0 < newOffset && newOffset < string_.length()) { |
235 | 0 | int32_t offset = newOffset; |
236 | 0 | do { |
237 | 0 | UChar c = string_.charAt(offset); |
238 | 0 | if (!rbc_->isUnsafe(c) || |
239 | 0 | (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) { |
240 | 0 | break; |
241 | 0 | } |
242 | | // Back up to before this unsafe character. |
243 | 0 | --offset; |
244 | 0 | } while (offset > 0); |
245 | 0 | if (offset < newOffset) { |
246 | | // We might have backed up more than necessary. |
247 | | // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe, |
248 | | // but for text "chu" setOffset(2) should remain at 2 |
249 | | // although we initially back up to offset 0. |
250 | | // Find the last safe offset no greater than newOffset by iterating forward. |
251 | 0 | int32_t lastSafeOffset = offset; |
252 | 0 | do { |
253 | 0 | iter_->resetToOffset(lastSafeOffset); |
254 | 0 | do { |
255 | 0 | iter_->nextCE(status); |
256 | 0 | if (U_FAILURE(status)) { return; } |
257 | 0 | } while ((offset = iter_->getOffset()) == lastSafeOffset); |
258 | 0 | if (offset <= newOffset) { |
259 | 0 | lastSafeOffset = offset; |
260 | 0 | } |
261 | 0 | } while (offset < newOffset); |
262 | 0 | newOffset = lastSafeOffset; |
263 | 0 | } |
264 | 0 | } |
265 | 0 | iter_->resetToOffset(newOffset); |
266 | 0 | otherHalf_ = 0; |
267 | 0 | dir_ = 1; |
268 | 0 | } |
269 | | |
270 | | /** |
271 | | * Sets the source to the new source string. |
272 | | */ |
273 | | void CollationElementIterator::setText(const UnicodeString& source, |
274 | | UErrorCode& status) |
275 | 0 | { |
276 | 0 | if (U_FAILURE(status)) { |
277 | 0 | return; |
278 | 0 | } |
279 | | |
280 | 0 | string_ = source; |
281 | 0 | const UChar *s = string_.getBuffer(); |
282 | 0 | CollationIterator *newIter; |
283 | 0 | UBool numeric = rbc_->settings->isNumeric(); |
284 | 0 | if (rbc_->settings->dontCheckFCD()) { |
285 | 0 | newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); |
286 | 0 | } else { |
287 | 0 | newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length()); |
288 | 0 | } |
289 | 0 | if (newIter == NULL) { |
290 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
291 | 0 | return; |
292 | 0 | } |
293 | 0 | delete iter_; |
294 | 0 | iter_ = newIter; |
295 | 0 | otherHalf_ = 0; |
296 | 0 | dir_ = 0; |
297 | 0 | } |
298 | | |
299 | | // Sets the source to the new character iterator. |
300 | | void CollationElementIterator::setText(CharacterIterator& source, |
301 | | UErrorCode& status) |
302 | 0 | { |
303 | 0 | if (U_FAILURE(status)) |
304 | 0 | return; |
305 | | |
306 | 0 | source.getText(string_); |
307 | 0 | setText(string_, status); |
308 | 0 | } |
309 | | |
310 | | int32_t CollationElementIterator::strengthOrder(int32_t order) const |
311 | 0 | { |
312 | 0 | UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength(); |
313 | | // Mask off the unwanted differences. |
314 | 0 | if (s == UCOL_PRIMARY) { |
315 | 0 | order &= 0xffff0000; |
316 | 0 | } |
317 | 0 | else if (s == UCOL_SECONDARY) { |
318 | 0 | order &= 0xffffff00; |
319 | 0 | } |
320 | |
|
321 | 0 | return order; |
322 | 0 | } |
323 | | |
324 | | /* CollationElementIterator private constructors/destructors --------------- */ |
325 | | |
326 | | /** |
327 | | * This is the "real" constructor for this class; it constructs an iterator |
328 | | * over the source text using the specified collator |
329 | | */ |
330 | | CollationElementIterator::CollationElementIterator( |
331 | | const UnicodeString &source, |
332 | | const RuleBasedCollator *coll, |
333 | | UErrorCode &status) |
334 | 0 | : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { |
335 | 0 | setText(source, status); |
336 | 0 | } |
337 | | |
338 | | /** |
339 | | * This is the "real" constructor for this class; it constructs an iterator over |
340 | | * the source text using the specified collator |
341 | | */ |
342 | | CollationElementIterator::CollationElementIterator( |
343 | | const CharacterIterator &source, |
344 | | const RuleBasedCollator *coll, |
345 | | UErrorCode &status) |
346 | 0 | : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) { |
347 | | // We only call source.getText() which should be const anyway. |
348 | 0 | setText(const_cast<CharacterIterator &>(source), status); |
349 | 0 | } |
350 | | |
351 | | /* CollationElementIterator private methods -------------------------------- */ |
352 | | |
353 | | const CollationElementIterator& CollationElementIterator::operator=( |
354 | | const CollationElementIterator& other) |
355 | 0 | { |
356 | 0 | if (this == &other) { |
357 | 0 | return *this; |
358 | 0 | } |
359 | | |
360 | 0 | CollationIterator *newIter; |
361 | 0 | const FCDUTF16CollationIterator *otherFCDIter = |
362 | 0 | dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_); |
363 | 0 | if(otherFCDIter != NULL) { |
364 | 0 | newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer()); |
365 | 0 | } else { |
366 | 0 | const UTF16CollationIterator *otherIter = |
367 | 0 | dynamic_cast<const UTF16CollationIterator *>(other.iter_); |
368 | 0 | if(otherIter != NULL) { |
369 | 0 | newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer()); |
370 | 0 | } else { |
371 | 0 | newIter = NULL; |
372 | 0 | } |
373 | 0 | } |
374 | 0 | if(newIter != NULL) { |
375 | 0 | delete iter_; |
376 | 0 | iter_ = newIter; |
377 | 0 | rbc_ = other.rbc_; |
378 | 0 | otherHalf_ = other.otherHalf_; |
379 | 0 | dir_ = other.dir_; |
380 | |
|
381 | 0 | string_ = other.string_; |
382 | 0 | } |
383 | 0 | if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) { |
384 | 0 | UErrorCode errorCode = U_ZERO_ERROR; |
385 | 0 | if(offsets_ == NULL) { |
386 | 0 | offsets_ = new UVector32(other.offsets_->size(), errorCode); |
387 | 0 | } |
388 | 0 | if(offsets_ != NULL) { |
389 | 0 | offsets_->assign(*other.offsets_, errorCode); |
390 | 0 | } |
391 | 0 | } |
392 | 0 | return *this; |
393 | 0 | } |
394 | | |
395 | | namespace { |
396 | | |
397 | | class MaxExpSink : public ContractionsAndExpansions::CESink { |
398 | | public: |
399 | 0 | MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {} |
400 | | virtual ~MaxExpSink(); |
401 | 0 | virtual void handleCE(int64_t /*ce*/) {} |
402 | 0 | virtual void handleExpansion(const int64_t ces[], int32_t length) { |
403 | 0 | if (length <= 1) { |
404 | | // We do not need to add single CEs into the map. |
405 | 0 | return; |
406 | 0 | } |
407 | 0 | int32_t count = 0; // number of CE "halves" |
408 | 0 | for (int32_t i = 0; i < length; ++i) { |
409 | 0 | count += ceNeedsTwoParts(ces[i]) ? 2 : 1; |
410 | 0 | } |
411 | | // last "half" of the last CE |
412 | 0 | int64_t ce = ces[length - 1]; |
413 | 0 | uint32_t p = (uint32_t)(ce >> 32); |
414 | 0 | uint32_t lower32 = (uint32_t)ce; |
415 | 0 | uint32_t lastHalf = getSecondHalf(p, lower32); |
416 | 0 | if (lastHalf == 0) { |
417 | 0 | lastHalf = getFirstHalf(p, lower32); |
418 | 0 | U_ASSERT(lastHalf != 0); |
419 | 0 | } else { |
420 | 0 | lastHalf |= 0xc0; // old-style continuation CE |
421 | 0 | } |
422 | 0 | if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) { |
423 | 0 | uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode); |
424 | 0 | } |
425 | 0 | } |
426 | | |
427 | | private: |
428 | | UHashtable *maxExpansions; |
429 | | UErrorCode &errorCode; |
430 | | }; |
431 | | |
432 | | MaxExpSink::~MaxExpSink() {} |
433 | | |
434 | | } // namespace |
435 | | |
436 | | UHashtable * |
437 | 0 | CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) { |
438 | 0 | if (U_FAILURE(errorCode)) { return NULL; } |
439 | 0 | UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong, |
440 | 0 | uhash_compareLong, &errorCode); |
441 | 0 | if (U_FAILURE(errorCode)) { return NULL; } |
442 | 0 | MaxExpSink sink(maxExpansions, errorCode); |
443 | 0 | ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode); |
444 | 0 | if (U_FAILURE(errorCode)) { |
445 | 0 | uhash_close(maxExpansions); |
446 | 0 | return NULL; |
447 | 0 | } |
448 | 0 | return maxExpansions; |
449 | 0 | } |
450 | | |
451 | | int32_t |
452 | 0 | CollationElementIterator::getMaxExpansion(int32_t order) const { |
453 | 0 | return getMaxExpansion(rbc_->tailoring->maxExpansions, order); |
454 | 0 | } |
455 | | |
456 | | int32_t |
457 | 0 | CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) { |
458 | 0 | if (order == 0) { return 1; } |
459 | 0 | int32_t max; |
460 | 0 | if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) { |
461 | 0 | return max; |
462 | 0 | } |
463 | 0 | if ((order & 0xc0) == 0xc0) { |
464 | | // old-style continuation CE |
465 | 0 | return 2; |
466 | 0 | } else { |
467 | 0 | return 1; |
468 | 0 | } |
469 | 0 | } |
470 | | |
471 | | U_NAMESPACE_END |
472 | | |
473 | | #endif /* #if !UCONFIG_NO_COLLATION */ |