/src/icu/source/i18n/collationcompare.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * Copyright (C) 1996-2015, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ******************************************************************************* |
8 | | * collationcompare.cpp |
9 | | * |
10 | | * created on: 2012feb14 with new and old collation code |
11 | | * created by: Markus W. Scherer |
12 | | */ |
13 | | |
14 | | #include "unicode/utypes.h" |
15 | | |
16 | | #if !UCONFIG_NO_COLLATION |
17 | | |
18 | | #include "unicode/ucol.h" |
19 | | #include "cmemory.h" |
20 | | #include "collation.h" |
21 | | #include "collationcompare.h" |
22 | | #include "collationiterator.h" |
23 | | #include "collationsettings.h" |
24 | | #include "uassert.h" |
25 | | |
26 | | U_NAMESPACE_BEGIN |
27 | | |
28 | | UCollationResult |
29 | | CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right, |
30 | | const CollationSettings &settings, |
31 | 0 | UErrorCode &errorCode) { |
32 | 0 | if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } |
33 | | |
34 | 0 | int32_t options = settings.options; |
35 | 0 | uint32_t variableTop; |
36 | 0 | if((options & CollationSettings::ALTERNATE_MASK) == 0) { |
37 | 0 | variableTop = 0; |
38 | 0 | } else { |
39 | | // +1 so that we can use "<" and primary ignorables test out early. |
40 | 0 | variableTop = settings.variableTop + 1; |
41 | 0 | } |
42 | 0 | UBool anyVariable = FALSE; |
43 | | |
44 | | // Fetch CEs, compare primaries, store secondary & tertiary weights. |
45 | 0 | for(;;) { |
46 | | // We fetch CEs until we get a non-ignorable primary or reach the end. |
47 | 0 | uint32_t leftPrimary; |
48 | 0 | do { |
49 | 0 | int64_t ce = left.nextCE(errorCode); |
50 | 0 | leftPrimary = (uint32_t)(ce >> 32); |
51 | 0 | if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { |
52 | | // Variable CE, shift it to quaternary level. |
53 | | // Ignore all following primary ignorables, and shift further variable CEs. |
54 | 0 | anyVariable = TRUE; |
55 | 0 | do { |
56 | | // Store only the primary of the variable CE. |
57 | 0 | left.setCurrentCE(ce & INT64_C(0xffffffff00000000)); |
58 | 0 | for(;;) { |
59 | 0 | ce = left.nextCE(errorCode); |
60 | 0 | leftPrimary = (uint32_t)(ce >> 32); |
61 | 0 | if(leftPrimary == 0) { |
62 | 0 | left.setCurrentCE(0); |
63 | 0 | } else { |
64 | 0 | break; |
65 | 0 | } |
66 | 0 | } |
67 | 0 | } while(leftPrimary < variableTop && |
68 | 0 | leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY); |
69 | 0 | } |
70 | 0 | } while(leftPrimary == 0); |
71 | |
|
72 | 0 | uint32_t rightPrimary; |
73 | 0 | do { |
74 | 0 | int64_t ce = right.nextCE(errorCode); |
75 | 0 | rightPrimary = (uint32_t)(ce >> 32); |
76 | 0 | if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { |
77 | | // Variable CE, shift it to quaternary level. |
78 | | // Ignore all following primary ignorables, and shift further variable CEs. |
79 | 0 | anyVariable = TRUE; |
80 | 0 | do { |
81 | | // Store only the primary of the variable CE. |
82 | 0 | right.setCurrentCE(ce & INT64_C(0xffffffff00000000)); |
83 | 0 | for(;;) { |
84 | 0 | ce = right.nextCE(errorCode); |
85 | 0 | rightPrimary = (uint32_t)(ce >> 32); |
86 | 0 | if(rightPrimary == 0) { |
87 | 0 | right.setCurrentCE(0); |
88 | 0 | } else { |
89 | 0 | break; |
90 | 0 | } |
91 | 0 | } |
92 | 0 | } while(rightPrimary < variableTop && |
93 | 0 | rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY); |
94 | 0 | } |
95 | 0 | } while(rightPrimary == 0); |
96 | |
|
97 | 0 | if(leftPrimary != rightPrimary) { |
98 | | // Return the primary difference, with script reordering. |
99 | 0 | if(settings.hasReordering()) { |
100 | 0 | leftPrimary = settings.reorder(leftPrimary); |
101 | 0 | rightPrimary = settings.reorder(rightPrimary); |
102 | 0 | } |
103 | 0 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; |
104 | 0 | } |
105 | 0 | if(leftPrimary == Collation::NO_CE_PRIMARY) { break; } |
106 | 0 | } |
107 | 0 | if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } |
108 | | |
109 | | // Compare the buffered secondary & tertiary weights. |
110 | | // We might skip the secondary level but continue with the case level |
111 | | // which is turned on separately. |
112 | 0 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { |
113 | 0 | if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { |
114 | 0 | int32_t leftIndex = 0; |
115 | 0 | int32_t rightIndex = 0; |
116 | 0 | for(;;) { |
117 | 0 | uint32_t leftSecondary; |
118 | 0 | do { |
119 | 0 | leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16; |
120 | 0 | } while(leftSecondary == 0); |
121 | |
|
122 | 0 | uint32_t rightSecondary; |
123 | 0 | do { |
124 | 0 | rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16; |
125 | 0 | } while(rightSecondary == 0); |
126 | |
|
127 | 0 | if(leftSecondary != rightSecondary) { |
128 | 0 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; |
129 | 0 | } |
130 | 0 | if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; } |
131 | 0 | } |
132 | 0 | } else { |
133 | | // The backwards secondary level compares secondary weights backwards |
134 | | // within segments separated by the merge separator (U+FFFE, weight 02). |
135 | 0 | int32_t leftStart = 0; |
136 | 0 | int32_t rightStart = 0; |
137 | 0 | for(;;) { |
138 | | // Find the merge separator or the NO_CE terminator. |
139 | 0 | uint32_t p; |
140 | 0 | int32_t leftLimit = leftStart; |
141 | 0 | while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) > |
142 | 0 | Collation::MERGE_SEPARATOR_PRIMARY || |
143 | 0 | p == 0) { |
144 | 0 | ++leftLimit; |
145 | 0 | } |
146 | 0 | int32_t rightLimit = rightStart; |
147 | 0 | while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) > |
148 | 0 | Collation::MERGE_SEPARATOR_PRIMARY || |
149 | 0 | p == 0) { |
150 | 0 | ++rightLimit; |
151 | 0 | } |
152 | | |
153 | | // Compare the segments. |
154 | 0 | int32_t leftIndex = leftLimit; |
155 | 0 | int32_t rightIndex = rightLimit; |
156 | 0 | for(;;) { |
157 | 0 | int32_t leftSecondary = 0; |
158 | 0 | while(leftSecondary == 0 && leftIndex > leftStart) { |
159 | 0 | leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16; |
160 | 0 | } |
161 | |
|
162 | 0 | int32_t rightSecondary = 0; |
163 | 0 | while(rightSecondary == 0 && rightIndex > rightStart) { |
164 | 0 | rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16; |
165 | 0 | } |
166 | |
|
167 | 0 | if(leftSecondary != rightSecondary) { |
168 | 0 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; |
169 | 0 | } |
170 | 0 | if(leftSecondary == 0) { break; } |
171 | 0 | } |
172 | | |
173 | | // Did we reach the end of either string? |
174 | | // Both strings have the same number of merge separators, |
175 | | // or else there would have been a primary-level difference. |
176 | 0 | U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit)); |
177 | 0 | if(p == Collation::NO_CE_PRIMARY) { break; } |
178 | | // Skip both merge separators and continue. |
179 | 0 | leftStart = leftLimit + 1; |
180 | 0 | rightStart = rightLimit + 1; |
181 | 0 | } |
182 | 0 | } |
183 | 0 | } |
184 | | |
185 | 0 | if((options & CollationSettings::CASE_LEVEL) != 0) { |
186 | 0 | int32_t strength = CollationSettings::getStrength(options); |
187 | 0 | int32_t leftIndex = 0; |
188 | 0 | int32_t rightIndex = 0; |
189 | 0 | for(;;) { |
190 | 0 | uint32_t leftCase, leftLower32, rightCase; |
191 | 0 | if(strength == UCOL_PRIMARY) { |
192 | | // Primary+caseLevel: Ignore case level weights of primary ignorables. |
193 | | // Otherwise we would get a-umlaut > a |
194 | | // which is not desirable for accent-insensitive sorting. |
195 | | // Check for (lower 32 bits) == 0 as well because variable CEs are stored |
196 | | // with only primary weights. |
197 | 0 | int64_t ce; |
198 | 0 | do { |
199 | 0 | ce = left.getCE(leftIndex++); |
200 | 0 | leftCase = (uint32_t)ce; |
201 | 0 | } while((uint32_t)(ce >> 32) == 0 || leftCase == 0); |
202 | 0 | leftLower32 = leftCase; |
203 | 0 | leftCase &= 0xc000; |
204 | |
|
205 | 0 | do { |
206 | 0 | ce = right.getCE(rightIndex++); |
207 | 0 | rightCase = (uint32_t)ce; |
208 | 0 | } while((uint32_t)(ce >> 32) == 0 || rightCase == 0); |
209 | 0 | rightCase &= 0xc000; |
210 | 0 | } else { |
211 | | // Secondary+caseLevel: By analogy with the above, |
212 | | // ignore case level weights of secondary ignorables. |
213 | | // |
214 | | // Note: A tertiary CE has uppercase case bits (0.0.ut) |
215 | | // to keep tertiary+caseFirst well-formed. |
216 | | // |
217 | | // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables. |
218 | | // Otherwise a tertiary CE's uppercase would be no greater than |
219 | | // a primary/secondary CE's uppercase. |
220 | | // (See UCA well-formedness condition 2.) |
221 | | // We could construct a special case weight higher than uppercase, |
222 | | // but it's simpler to always ignore case weights of secondary ignorables, |
223 | | // turning 0.0.ut into 0.0.0.t. |
224 | | // (See LDML Collation, Case Parameters.) |
225 | 0 | do { |
226 | 0 | leftCase = (uint32_t)left.getCE(leftIndex++); |
227 | 0 | } while(leftCase <= 0xffff); |
228 | 0 | leftLower32 = leftCase; |
229 | 0 | leftCase &= 0xc000; |
230 | |
|
231 | 0 | do { |
232 | 0 | rightCase = (uint32_t)right.getCE(rightIndex++); |
233 | 0 | } while(rightCase <= 0xffff); |
234 | 0 | rightCase &= 0xc000; |
235 | 0 | } |
236 | | |
237 | | // No need to handle NO_CE and MERGE_SEPARATOR specially: |
238 | | // There is one case weight for each previous-level weight, |
239 | | // so level length differences were handled there. |
240 | 0 | if(leftCase != rightCase) { |
241 | 0 | if((options & CollationSettings::UPPER_FIRST) == 0) { |
242 | 0 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; |
243 | 0 | } else { |
244 | 0 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; |
245 | 0 | } |
246 | 0 | } |
247 | 0 | if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; } |
248 | 0 | } |
249 | 0 | } |
250 | 0 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } |
251 | | |
252 | 0 | uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); |
253 | |
|
254 | 0 | int32_t leftIndex = 0; |
255 | 0 | int32_t rightIndex = 0; |
256 | 0 | uint32_t anyQuaternaries = 0; |
257 | 0 | for(;;) { |
258 | 0 | uint32_t leftLower32, leftTertiary; |
259 | 0 | do { |
260 | 0 | leftLower32 = (uint32_t)left.getCE(leftIndex++); |
261 | 0 | anyQuaternaries |= leftLower32; |
262 | 0 | U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || |
263 | 0 | (leftLower32 & 0xc0c0) == 0); |
264 | 0 | leftTertiary = leftLower32 & tertiaryMask; |
265 | 0 | } while(leftTertiary == 0); |
266 | |
|
267 | 0 | uint32_t rightLower32, rightTertiary; |
268 | 0 | do { |
269 | 0 | rightLower32 = (uint32_t)right.getCE(rightIndex++); |
270 | 0 | anyQuaternaries |= rightLower32; |
271 | 0 | U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || |
272 | 0 | (rightLower32 & 0xc0c0) == 0); |
273 | 0 | rightTertiary = rightLower32 & tertiaryMask; |
274 | 0 | } while(rightTertiary == 0); |
275 | |
|
276 | 0 | if(leftTertiary != rightTertiary) { |
277 | 0 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { |
278 | | // Pass through NO_CE and keep real tertiary weights larger than that. |
279 | | // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), |
280 | | // to keep tertiary CEs well-formed. |
281 | | // Their case+tertiary weights must be greater than those of |
282 | | // primary and secondary CEs. |
283 | 0 | if(leftTertiary > Collation::NO_CE_WEIGHT16) { |
284 | 0 | if(leftLower32 > 0xffff) { |
285 | 0 | leftTertiary ^= 0xc000; |
286 | 0 | } else { |
287 | 0 | leftTertiary += 0x4000; |
288 | 0 | } |
289 | 0 | } |
290 | 0 | if(rightTertiary > Collation::NO_CE_WEIGHT16) { |
291 | 0 | if(rightLower32 > 0xffff) { |
292 | 0 | rightTertiary ^= 0xc000; |
293 | 0 | } else { |
294 | 0 | rightTertiary += 0x4000; |
295 | 0 | } |
296 | 0 | } |
297 | 0 | } |
298 | 0 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; |
299 | 0 | } |
300 | 0 | if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; } |
301 | 0 | } |
302 | 0 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } |
303 | | |
304 | 0 | if(!anyVariable && (anyQuaternaries & 0xc0) == 0) { |
305 | | // If there are no "variable" CEs and no non-zero quaternary weights, |
306 | | // then there are no quaternary differences. |
307 | 0 | return UCOL_EQUAL; |
308 | 0 | } |
309 | | |
310 | 0 | leftIndex = 0; |
311 | 0 | rightIndex = 0; |
312 | 0 | for(;;) { |
313 | 0 | uint32_t leftQuaternary; |
314 | 0 | do { |
315 | 0 | int64_t ce = left.getCE(leftIndex++); |
316 | 0 | leftQuaternary = (uint32_t)ce & 0xffff; |
317 | 0 | if(leftQuaternary <= Collation::NO_CE_WEIGHT16) { |
318 | | // Variable primary or completely ignorable or NO_CE. |
319 | 0 | leftQuaternary = (uint32_t)(ce >> 32); |
320 | 0 | } else { |
321 | | // Regular CE, not tertiary ignorable. |
322 | | // Preserve the quaternary weight in bits 7..6. |
323 | 0 | leftQuaternary |= 0xffffff3f; |
324 | 0 | } |
325 | 0 | } while(leftQuaternary == 0); |
326 | |
|
327 | 0 | uint32_t rightQuaternary; |
328 | 0 | do { |
329 | 0 | int64_t ce = right.getCE(rightIndex++); |
330 | 0 | rightQuaternary = (uint32_t)ce & 0xffff; |
331 | 0 | if(rightQuaternary <= Collation::NO_CE_WEIGHT16) { |
332 | | // Variable primary or completely ignorable or NO_CE. |
333 | 0 | rightQuaternary = (uint32_t)(ce >> 32); |
334 | 0 | } else { |
335 | | // Regular CE, not tertiary ignorable. |
336 | | // Preserve the quaternary weight in bits 7..6. |
337 | 0 | rightQuaternary |= 0xffffff3f; |
338 | 0 | } |
339 | 0 | } while(rightQuaternary == 0); |
340 | |
|
341 | 0 | if(leftQuaternary != rightQuaternary) { |
342 | | // Return the difference, with script reordering. |
343 | 0 | if(settings.hasReordering()) { |
344 | 0 | leftQuaternary = settings.reorder(leftQuaternary); |
345 | 0 | rightQuaternary = settings.reorder(rightQuaternary); |
346 | 0 | } |
347 | 0 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; |
348 | 0 | } |
349 | 0 | if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; } |
350 | 0 | } |
351 | 0 | return UCOL_EQUAL; |
352 | 0 | } |
353 | | |
354 | | U_NAMESPACE_END |
355 | | |
356 | | #endif // !UCONFIG_NO_COLLATION |