/src/icu/source/i18n/collationfastlatin.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * Copyright (C) 2013-2015, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ******************************************************************************* |
8 | | * collationfastlatin.cpp |
9 | | * |
10 | | * created on: 2013aug18 |
11 | | * created by: Markus W. Scherer |
12 | | */ |
13 | | |
14 | | #include "unicode/utypes.h" |
15 | | |
16 | | #if !UCONFIG_NO_COLLATION |
17 | | |
18 | | #include "unicode/ucol.h" |
19 | | #include "collationdata.h" |
20 | | #include "collationfastlatin.h" |
21 | | #include "collationsettings.h" |
22 | | #include "uassert.h" |
23 | | |
24 | | U_NAMESPACE_BEGIN |
25 | | |
26 | | int32_t |
27 | | CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, |
28 | 0 | uint16_t *primaries, int32_t capacity) { |
29 | 0 | const uint16_t *table = data->fastLatinTable; |
30 | 0 | if(table == NULL) { return -1; } |
31 | 0 | U_ASSERT(capacity == LATIN_LIMIT); |
32 | 0 | if(capacity != LATIN_LIMIT) { return -1; } |
33 | | |
34 | 0 | uint32_t miniVarTop; |
35 | 0 | if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { |
36 | | // No mini primaries are variable, set a variableTop just below the |
37 | | // lowest long mini primary. |
38 | 0 | miniVarTop = MIN_LONG - 1; |
39 | 0 | } else { |
40 | 0 | int32_t headerLength = *table & 0xff; |
41 | 0 | int32_t i = 1 + settings.getMaxVariable(); |
42 | 0 | if(i >= headerLength) { |
43 | 0 | return -1; // variableTop >= digits, should not occur |
44 | 0 | } |
45 | 0 | miniVarTop = table[i]; |
46 | 0 | } |
47 | | |
48 | 0 | UBool digitsAreReordered = FALSE; |
49 | 0 | if(settings.hasReordering()) { |
50 | 0 | uint32_t prevStart = 0; |
51 | 0 | uint32_t beforeDigitStart = 0; |
52 | 0 | uint32_t digitStart = 0; |
53 | 0 | uint32_t afterDigitStart = 0; |
54 | 0 | for(int32_t group = UCOL_REORDER_CODE_FIRST; |
55 | 0 | group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES; |
56 | 0 | ++group) { |
57 | 0 | uint32_t start = data->getFirstPrimaryForGroup(group); |
58 | 0 | start = settings.reorder(start); |
59 | 0 | if(group == UCOL_REORDER_CODE_DIGIT) { |
60 | 0 | beforeDigitStart = prevStart; |
61 | 0 | digitStart = start; |
62 | 0 | } else if(start != 0) { |
63 | 0 | if(start < prevStart) { |
64 | | // The permutation affects the groups up to Latin. |
65 | 0 | return -1; |
66 | 0 | } |
67 | | // In the future, there might be a special group between digits & Latin. |
68 | 0 | if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { |
69 | 0 | afterDigitStart = start; |
70 | 0 | } |
71 | 0 | prevStart = start; |
72 | 0 | } |
73 | 0 | } |
74 | 0 | uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); |
75 | 0 | latinStart = settings.reorder(latinStart); |
76 | 0 | if(latinStart < prevStart) { |
77 | 0 | return -1; |
78 | 0 | } |
79 | 0 | if(afterDigitStart == 0) { |
80 | 0 | afterDigitStart = latinStart; |
81 | 0 | } |
82 | 0 | if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { |
83 | 0 | digitsAreReordered = TRUE; |
84 | 0 | } |
85 | 0 | } |
86 | | |
87 | 0 | table += (table[0] & 0xff); // skip the header |
88 | 0 | for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { |
89 | 0 | uint32_t p = table[c]; |
90 | 0 | if(p >= MIN_SHORT) { |
91 | 0 | p &= SHORT_PRIMARY_MASK; |
92 | 0 | } else if(p > miniVarTop) { |
93 | 0 | p &= LONG_PRIMARY_MASK; |
94 | 0 | } else { |
95 | 0 | p = 0; |
96 | 0 | } |
97 | 0 | primaries[c] = (uint16_t)p; |
98 | 0 | } |
99 | 0 | if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) { |
100 | | // Bail out for digits. |
101 | 0 | for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } |
102 | 0 | } |
103 | | |
104 | | // Shift the miniVarTop above other options. |
105 | 0 | return ((int32_t)miniVarTop << 16) | settings.options; |
106 | 0 | } |
107 | | |
// Compares two UTF-16 strings with the fast-Latin mini CEs, one collation level per pass.
//
// table:      fast-Latin table including its header; the header is skipped below.
// primaries:  primary lookup indexed by code unit for c <= LATIN_MAX;
//             a zero entry makes the primary pass fall back to table[c].
// options:    upper 16 bits carry the mini variableTop, lower 16 bits the settings options.
// left/right: the strings; a side is exhausted when its index equals its length.
//
// Returns UCOL_LESS, UCOL_GREATER or UCOL_EQUAL, or BAIL_OUT_RESULT when this code
// gives up: a digit whose primaries[] entry is 0 while NUMERIC is set, nextPair()
// returning BAIL_OUT in the primary pass, or a secondary difference while
// BACKWARD_SECONDARY is set.
108 | | int32_t |
109 | | CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, |
110 | | const UChar *left, int32_t leftLength, |
111 | 0 | const UChar *right, int32_t rightLength) { |
112 | | // This is a modified copy of CollationCompare::compareUpToQuaternary(), |
113 | | // optimized for common Latin text. |
114 | | // Keep them in sync! |
115 | | // Keep compareUTF16() and compareUTF8() in sync very closely! |
116 | |

117 | 0 | U_ASSERT((table[0] >> 8) == VERSION); |
118 | 0 | table += (table[0] & 0xff); // skip the header |
119 | 0 | uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() |
120 | 0 | options &= 0xffff; // needed for CollationSettings::getStrength() to work |
121 | | |
122 | | // Check for supported characters, fetch mini CEs, and compare primaries. |
123 | 0 | int32_t leftIndex = 0, rightIndex = 0; |
124 | | /** |
125 | | * Single mini CE or a pair. |
126 | | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. |
127 | | * If there is only one, then it is in the lower bits, and the upper bits are 0. |
128 | | */ |
129 | 0 | uint32_t leftPair = 0, rightPair = 0; |
130 | 0 | for(;;) { |
131 | | // We fetch CEs until we get a non-ignorable primary or reach the end. |
132 | 0 | while(leftPair == 0) { |
133 | 0 | if(leftIndex == leftLength) { |
134 | 0 | leftPair = EOS; |
135 | 0 | break; |
136 | 0 | } |
137 | 0 | UChar32 c = left[leftIndex++]; |
138 | 0 | if(c <= LATIN_MAX) { |
// Fast path: a non-zero primaries[] entry is used directly as the primary.
139 | 0 | leftPair = primaries[c]; |
140 | 0 | if(leftPair != 0) { break; } |
141 | 0 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { |
142 | 0 | return BAIL_OUT_RESULT; |
143 | 0 | } |
144 | 0 | leftPair = table[c]; |
145 | 0 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
146 | 0 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; |
147 | 0 | } else { |
148 | 0 | leftPair = lookup(table, c); |
149 | 0 | } |
// Values >= MIN_SHORT and values above variableTop are masked down to their primary bits;
// anything else is handed to nextPair().
150 | 0 | if(leftPair >= MIN_SHORT) { |
151 | 0 | leftPair &= SHORT_PRIMARY_MASK; |
152 | 0 | break; |
153 | 0 | } else if(leftPair > variableTop) { |
154 | 0 | leftPair &= LONG_PRIMARY_MASK; |
155 | 0 | break; |
156 | 0 | } else { |
157 | 0 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); |
158 | 0 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
159 | 0 | leftPair = getPrimaries(variableTop, leftPair); |
160 | 0 | } |
161 | 0 | } |
162 | | |
// Same fetch logic for the right string.
163 | 0 | while(rightPair == 0) { |
164 | 0 | if(rightIndex == rightLength) { |
165 | 0 | rightPair = EOS; |
166 | 0 | break; |
167 | 0 | } |
168 | 0 | UChar32 c = right[rightIndex++]; |
169 | 0 | if(c <= LATIN_MAX) { |
170 | 0 | rightPair = primaries[c]; |
171 | 0 | if(rightPair != 0) { break; } |
172 | 0 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { |
173 | 0 | return BAIL_OUT_RESULT; |
174 | 0 | } |
175 | 0 | rightPair = table[c]; |
176 | 0 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
177 | 0 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; |
178 | 0 | } else { |
179 | 0 | rightPair = lookup(table, c); |
180 | 0 | } |
181 | 0 | if(rightPair >= MIN_SHORT) { |
182 | 0 | rightPair &= SHORT_PRIMARY_MASK; |
183 | 0 | break; |
184 | 0 | } else if(rightPair > variableTop) { |
185 | 0 | rightPair &= LONG_PRIMARY_MASK; |
186 | 0 | break; |
187 | 0 | } else { |
188 | 0 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); |
189 | 0 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
190 | 0 | rightPair = getPrimaries(variableTop, rightPair); |
191 | 0 | } |
192 | 0 | } |
193 | | |
// Identical pairs: finished if both sides hit EOS, otherwise fetch fresh pairs.
// Otherwise compare the low 16 bits; if those match, shift the upper halves down
// and compare them on the next iteration.
194 | 0 | if(leftPair == rightPair) { |
195 | 0 | if(leftPair == EOS) { break; } |
196 | 0 | leftPair = rightPair = 0; |
197 | 0 | continue; |
198 | 0 | } |
199 | 0 | uint32_t leftPrimary = leftPair & 0xffff; |
200 | 0 | uint32_t rightPrimary = rightPair & 0xffff; |
201 | 0 | if(leftPrimary != rightPrimary) { |
202 | | // Return the primary difference. |
203 | 0 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; |
204 | 0 | } |
205 | 0 | if(leftPair == EOS) { break; } |
206 | 0 | leftPair >>= 16; |
207 | 0 | rightPair >>= 16; |
208 | 0 | } |
209 | | // In the following, we need to re-fetch each character because we did not buffer the CEs, |
210 | | // but we know that the string is well-formed and |
211 | | // only contains supported characters and mappings. |
212 | | |
213 | | // We might skip the secondary level but continue with the case level |
214 | | // which is turned on separately. |
// Secondary pass: both strings are rescanned from index 0; no BAIL_OUT checks are repeated here.
215 | 0 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { |
216 | 0 | leftIndex = rightIndex = 0; |
217 | 0 | leftPair = rightPair = 0; |
218 | 0 | for(;;) { |
219 | 0 | while(leftPair == 0) { |
220 | 0 | if(leftIndex == leftLength) { |
221 | 0 | leftPair = EOS; |
222 | 0 | break; |
223 | 0 | } |
224 | 0 | UChar32 c = left[leftIndex++]; |
225 | 0 | if(c <= LATIN_MAX) { |
226 | 0 | leftPair = table[c]; |
227 | 0 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
228 | 0 | leftPair = table[c - PUNCT_START + LATIN_LIMIT]; |
229 | 0 | } else { |
230 | 0 | leftPair = lookup(table, c); |
231 | 0 | } |
232 | 0 | if(leftPair >= MIN_SHORT) { |
233 | 0 | leftPair = getSecondariesFromOneShortCE(leftPair); |
234 | 0 | break; |
235 | 0 | } else if(leftPair > variableTop) { |
236 | 0 | leftPair = COMMON_SEC_PLUS_OFFSET; |
237 | 0 | break; |
238 | 0 | } else { |
239 | 0 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); |
240 | 0 | leftPair = getSecondaries(variableTop, leftPair); |
241 | 0 | } |
242 | 0 | } |
243 | |

244 | 0 | while(rightPair == 0) { |
245 | 0 | if(rightIndex == rightLength) { |
246 | 0 | rightPair = EOS; |
247 | 0 | break; |
248 | 0 | } |
249 | 0 | UChar32 c = right[rightIndex++]; |
250 | 0 | if(c <= LATIN_MAX) { |
251 | 0 | rightPair = table[c]; |
252 | 0 | } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
253 | 0 | rightPair = table[c - PUNCT_START + LATIN_LIMIT]; |
254 | 0 | } else { |
255 | 0 | rightPair = lookup(table, c); |
256 | 0 | } |
257 | 0 | if(rightPair >= MIN_SHORT) { |
258 | 0 | rightPair = getSecondariesFromOneShortCE(rightPair); |
259 | 0 | break; |
260 | 0 | } else if(rightPair > variableTop) { |
261 | 0 | rightPair = COMMON_SEC_PLUS_OFFSET; |
262 | 0 | break; |
263 | 0 | } else { |
264 | 0 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); |
265 | 0 | rightPair = getSecondaries(variableTop, rightPair); |
266 | 0 | } |
267 | 0 | } |
268 | |

269 | 0 | if(leftPair == rightPair) { |
270 | 0 | if(leftPair == EOS) { break; } |
271 | 0 | leftPair = rightPair = 0; |
272 | 0 | continue; |
273 | 0 | } |
274 | 0 | uint32_t leftSecondary = leftPair & 0xffff; |
275 | 0 | uint32_t rightSecondary = rightPair & 0xffff; |
276 | 0 | if(leftSecondary != rightSecondary) { |
277 | 0 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { |
278 | | // Full support for backwards secondary requires backwards contraction matching |
279 | | // and moving backwards between merge separators. |
280 | 0 | return BAIL_OUT_RESULT; |
281 | 0 | } |
282 | 0 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; |
283 | 0 | } |
284 | 0 | if(leftPair == EOS) { break; } |
285 | 0 | leftPair >>= 16; |
286 | 0 | rightPair >>= 16; |
287 | 0 | } |
288 | 0 | } |
289 | | |
// Case-level pass: runs only when CASE_LEVEL is set, independent of the strength.
// UPPER_FIRST inverts the result of a case difference.
290 | 0 | if((options & CollationSettings::CASE_LEVEL) != 0) { |
291 | 0 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; |
292 | 0 | leftIndex = rightIndex = 0; |
293 | 0 | leftPair = rightPair = 0; |
294 | 0 | for(;;) { |
295 | 0 | while(leftPair == 0) { |
296 | 0 | if(leftIndex == leftLength) { |
297 | 0 | leftPair = EOS; |
298 | 0 | break; |
299 | 0 | } |
300 | 0 | UChar32 c = left[leftIndex++]; |
301 | 0 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
302 | 0 | if(leftPair < MIN_LONG) { |
303 | 0 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); |
304 | 0 | } |
305 | 0 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); |
306 | 0 | } |
307 | |

308 | 0 | while(rightPair == 0) { |
309 | 0 | if(rightIndex == rightLength) { |
310 | 0 | rightPair = EOS; |
311 | 0 | break; |
312 | 0 | } |
313 | 0 | UChar32 c = right[rightIndex++]; |
314 | 0 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
315 | 0 | if(rightPair < MIN_LONG) { |
316 | 0 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); |
317 | 0 | } |
318 | 0 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); |
319 | 0 | } |
320 | |

321 | 0 | if(leftPair == rightPair) { |
322 | 0 | if(leftPair == EOS) { break; } |
323 | 0 | leftPair = rightPair = 0; |
324 | 0 | continue; |
325 | 0 | } |
326 | 0 | uint32_t leftCase = leftPair & 0xffff; |
327 | 0 | uint32_t rightCase = rightPair & 0xffff; |
328 | 0 | if(leftCase != rightCase) { |
329 | 0 | if((options & CollationSettings::UPPER_FIRST) == 0) { |
330 | 0 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; |
331 | 0 | } else { |
332 | 0 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; |
333 | 0 | } |
334 | 0 | } |
335 | 0 | if(leftPair == EOS) { break; } |
336 | 0 | leftPair >>= 16; |
337 | 0 | rightPair >>= 16; |
338 | 0 | } |
339 | 0 | } |
340 | 0 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } |
341 | | |
342 | | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. |
343 | 0 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); |
344 | |

// Tertiary pass.
345 | 0 | leftIndex = rightIndex = 0; |
346 | 0 | leftPair = rightPair = 0; |
347 | 0 | for(;;) { |
348 | 0 | while(leftPair == 0) { |
349 | 0 | if(leftIndex == leftLength) { |
350 | 0 | leftPair = EOS; |
351 | 0 | break; |
352 | 0 | } |
353 | 0 | UChar32 c = left[leftIndex++]; |
354 | 0 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
355 | 0 | if(leftPair < MIN_LONG) { |
356 | 0 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); |
357 | 0 | } |
358 | 0 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); |
359 | 0 | } |
360 | |

361 | 0 | while(rightPair == 0) { |
362 | 0 | if(rightIndex == rightLength) { |
363 | 0 | rightPair = EOS; |
364 | 0 | break; |
365 | 0 | } |
366 | 0 | UChar32 c = right[rightIndex++]; |
367 | 0 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
368 | 0 | if(rightPair < MIN_LONG) { |
369 | 0 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); |
370 | 0 | } |
371 | 0 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); |
372 | 0 | } |
373 | |

374 | 0 | if(leftPair == rightPair) { |
375 | 0 | if(leftPair == EOS) { break; } |
376 | 0 | leftPair = rightPair = 0; |
377 | 0 | continue; |
378 | 0 | } |
379 | 0 | uint32_t leftTertiary = leftPair & 0xffff; |
380 | 0 | uint32_t rightTertiary = rightPair & 0xffff; |
381 | 0 | if(leftTertiary != rightTertiary) { |
382 | 0 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { |
383 | | // Pass through EOS and MERGE_WEIGHT |
384 | | // and keep real tertiary weights larger than the MERGE_WEIGHT. |
385 | | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. |
386 | 0 | if(leftTertiary > MERGE_WEIGHT) { |
387 | 0 | leftTertiary ^= CASE_MASK; |
388 | 0 | } |
389 | 0 | if(rightTertiary > MERGE_WEIGHT) { |
390 | 0 | rightTertiary ^= CASE_MASK; |
391 | 0 | } |
392 | 0 | } |
393 | 0 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; |
394 | 0 | } |
395 | 0 | if(leftPair == EOS) { break; } |
396 | 0 | leftPair >>= 16; |
397 | 0 | rightPair >>= 16; |
398 | 0 | } |
399 | 0 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } |
400 | | |
// Quaternary pass: reached only when the strength is above UCOL_TERTIARY.
401 | 0 | leftIndex = rightIndex = 0; |
402 | 0 | leftPair = rightPair = 0; |
403 | 0 | for(;;) { |
404 | 0 | while(leftPair == 0) { |
405 | 0 | if(leftIndex == leftLength) { |
406 | 0 | leftPair = EOS; |
407 | 0 | break; |
408 | 0 | } |
409 | 0 | UChar32 c = left[leftIndex++]; |
410 | 0 | leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
411 | 0 | if(leftPair < MIN_LONG) { |
412 | 0 | leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); |
413 | 0 | } |
414 | 0 | leftPair = getQuaternaries(variableTop, leftPair); |
415 | 0 | } |
416 | |

417 | 0 | while(rightPair == 0) { |
418 | 0 | if(rightIndex == rightLength) { |
419 | 0 | rightPair = EOS; |
420 | 0 | break; |
421 | 0 | } |
422 | 0 | UChar32 c = right[rightIndex++]; |
423 | 0 | rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); |
424 | 0 | if(rightPair < MIN_LONG) { |
425 | 0 | rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); |
426 | 0 | } |
427 | 0 | rightPair = getQuaternaries(variableTop, rightPair); |
428 | 0 | } |
429 | |

430 | 0 | if(leftPair == rightPair) { |
431 | 0 | if(leftPair == EOS) { break; } |
432 | 0 | leftPair = rightPair = 0; |
433 | 0 | continue; |
434 | 0 | } |
435 | 0 | uint32_t leftQuaternary = leftPair & 0xffff; |
436 | 0 | uint32_t rightQuaternary = rightPair & 0xffff; |
437 | 0 | if(leftQuaternary != rightQuaternary) { |
438 | 0 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; |
439 | 0 | } |
440 | 0 | if(leftPair == EOS) { break; } |
441 | 0 | leftPair >>= 16; |
442 | 0 | rightPair >>= 16; |
443 | 0 | } |
444 | 0 | return UCOL_EQUAL; |
445 | 0 | } |
446 | | |
// Compares two UTF-8 strings with the fast-Latin mini CEs, one collation level per pass.
// Mirrors compareUTF16(); only the way table entries are located from the bytes differs.
//
// table:      fast-Latin table including its header; the header is skipped below.
// primaries:  primary lookup indexed by ASCII byte, or by the index computed from a
//             two-byte sequence; a zero entry makes the primary pass fall back to table[c].
// options:    upper 16 bits carry the mini variableTop, lower 16 bits the settings options.
// left/right: the byte strings; a side is exhausted when its index equals its length.
//             NOTE(review): lookupUTF8() also accepts a negative length, presumably for
//             NUL-terminated input - confirm against the callers.
//
// Returns UCOL_LESS, UCOL_GREATER or UCOL_EQUAL, or BAIL_OUT_RESULT when this code
// gives up: an ASCII digit whose primaries[] entry is 0 while NUMERIC is set, nextPair()
// returning BAIL_OUT in the primary pass, or a secondary difference while
// BACKWARD_SECONDARY is set.
447 | | int32_t |
448 | | CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, |
449 | | const uint8_t *left, int32_t leftLength, |
450 | 0 | const uint8_t *right, int32_t rightLength) { |
451 | | // Keep compareUTF16() and compareUTF8() in sync very closely! |
452 | |

453 | 0 | U_ASSERT((table[0] >> 8) == VERSION); |
454 | 0 | table += (table[0] & 0xff); // skip the header |
455 | 0 | uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() |
456 | 0 | options &= 0xffff; // needed for CollationSettings::getStrength() to work |
457 | | |
458 | | // Check for supported characters, fetch mini CEs, and compare primaries. |
459 | 0 | int32_t leftIndex = 0, rightIndex = 0; |
460 | | /** |
461 | | * Single mini CE or a pair. |
462 | | * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. |
463 | | * If there is only one, then it is in the lower bits, and the upper bits are 0. |
464 | | */ |
465 | 0 | uint32_t leftPair = 0, rightPair = 0; |
466 | | // Note: There is no need to assemble the code point. |
467 | | // We only need to look up the table entry for the character, |
468 | | // and nextPair() looks for whether c==0. |
469 | 0 | for(;;) { |
470 | | // We fetch CEs until we get a non-ignorable primary or reach the end. |
471 | 0 | while(leftPair == 0) { |
472 | 0 | if(leftIndex == leftLength) { |
473 | 0 | leftPair = EOS; |
474 | 0 | break; |
475 | 0 | } |
476 | 0 | UChar32 c = left[leftIndex++]; |
477 | 0 | uint8_t t; |
478 | 0 | if(c <= 0x7f) { |
479 | 0 | leftPair = primaries[c]; |
480 | 0 | if(leftPair != 0) { break; } |
481 | 0 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { |
482 | 0 | return BAIL_OUT_RESULT; |
483 | 0 | } |
484 | 0 | leftPair = table[c]; |
// Two-byte sequence: lead byte in 0xc2..LATIN_MAX_UTF8_LEAD followed by a trail byte 0x80..0xbf.
// (lead - 0xc2) << 6 plus the raw trail byte is the table index (for example C2 80 -> 0x80).
485 | 0 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && |
486 | 0 | 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { |
487 | 0 | ++leftIndex; |
488 | 0 | c = ((c - 0xc2) << 6) + t; |
489 | 0 | leftPair = primaries[c]; |
490 | 0 | if(leftPair != 0) { break; } |
491 | 0 | leftPair = table[c]; |
492 | 0 | } else { |
493 | 0 | leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); |
494 | 0 | } |
495 | 0 | if(leftPair >= MIN_SHORT) { |
496 | 0 | leftPair &= SHORT_PRIMARY_MASK; |
497 | 0 | break; |
498 | 0 | } else if(leftPair > variableTop) { |
499 | 0 | leftPair &= LONG_PRIMARY_MASK; |
500 | 0 | break; |
501 | 0 | } else { |
502 | 0 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); |
503 | 0 | if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
504 | 0 | leftPair = getPrimaries(variableTop, leftPair); |
505 | 0 | } |
506 | 0 | } |
507 | | |
// Same fetch logic for the right string.
508 | 0 | while(rightPair == 0) { |
509 | 0 | if(rightIndex == rightLength) { |
510 | 0 | rightPair = EOS; |
511 | 0 | break; |
512 | 0 | } |
513 | 0 | UChar32 c = right[rightIndex++]; |
514 | 0 | uint8_t t; |
515 | 0 | if(c <= 0x7f) { |
516 | 0 | rightPair = primaries[c]; |
517 | 0 | if(rightPair != 0) { break; } |
518 | 0 | if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { |
519 | 0 | return BAIL_OUT_RESULT; |
520 | 0 | } |
521 | 0 | rightPair = table[c]; |
522 | 0 | } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && |
523 | 0 | 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { |
524 | 0 | ++rightIndex; |
525 | 0 | c = ((c - 0xc2) << 6) + t; |
526 | 0 | rightPair = primaries[c]; |
527 | 0 | if(rightPair != 0) { break; } |
528 | 0 | rightPair = table[c]; |
529 | 0 | } else { |
530 | 0 | rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); |
531 | 0 | } |
532 | 0 | if(rightPair >= MIN_SHORT) { |
533 | 0 | rightPair &= SHORT_PRIMARY_MASK; |
534 | 0 | break; |
535 | 0 | } else if(rightPair > variableTop) { |
536 | 0 | rightPair &= LONG_PRIMARY_MASK; |
537 | 0 | break; |
538 | 0 | } else { |
539 | 0 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); |
540 | 0 | if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } |
541 | 0 | rightPair = getPrimaries(variableTop, rightPair); |
542 | 0 | } |
543 | 0 | } |
544 | | |
// Identical pairs: finished if both sides hit EOS, otherwise fetch fresh pairs.
// Otherwise compare the low 16 bits; if those match, shift the upper halves down
// and compare them on the next iteration.
545 | 0 | if(leftPair == rightPair) { |
546 | 0 | if(leftPair == EOS) { break; } |
547 | 0 | leftPair = rightPair = 0; |
548 | 0 | continue; |
549 | 0 | } |
550 | 0 | uint32_t leftPrimary = leftPair & 0xffff; |
551 | 0 | uint32_t rightPrimary = rightPair & 0xffff; |
552 | 0 | if(leftPrimary != rightPrimary) { |
553 | | // Return the primary difference. |
554 | 0 | return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; |
555 | 0 | } |
556 | 0 | if(leftPair == EOS) { break; } |
557 | 0 | leftPair >>= 16; |
558 | 0 | rightPair >>= 16; |
559 | 0 | } |
560 | | // In the following, we need to re-fetch each character because we did not buffer the CEs, |
561 | | // but we know that the string is well-formed and |
562 | | // only contains supported characters and mappings. |
563 | | |
564 | | // We might skip the secondary level but continue with the case level |
565 | | // which is turned on separately. |
// Secondary pass: both strings are rescanned from index 0. Two-byte sequences are decoded
// without re-checking the trail byte, and lookupUTF8Unsafe() replaces lookupUTF8().
566 | 0 | if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { |
567 | 0 | leftIndex = rightIndex = 0; |
568 | 0 | leftPair = rightPair = 0; |
569 | 0 | for(;;) { |
570 | 0 | while(leftPair == 0) { |
571 | 0 | if(leftIndex == leftLength) { |
572 | 0 | leftPair = EOS; |
573 | 0 | break; |
574 | 0 | } |
575 | 0 | UChar32 c = left[leftIndex++]; |
576 | 0 | if(c <= 0x7f) { |
577 | 0 | leftPair = table[c]; |
578 | 0 | } else if(c <= LATIN_MAX_UTF8_LEAD) { |
579 | 0 | leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; |
580 | 0 | } else { |
581 | 0 | leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); |
582 | 0 | } |
583 | 0 | if(leftPair >= MIN_SHORT) { |
584 | 0 | leftPair = getSecondariesFromOneShortCE(leftPair); |
585 | 0 | break; |
586 | 0 | } else if(leftPair > variableTop) { |
587 | 0 | leftPair = COMMON_SEC_PLUS_OFFSET; |
588 | 0 | break; |
589 | 0 | } else { |
590 | 0 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); |
591 | 0 | leftPair = getSecondaries(variableTop, leftPair); |
592 | 0 | } |
593 | 0 | } |
594 | |

595 | 0 | while(rightPair == 0) { |
596 | 0 | if(rightIndex == rightLength) { |
597 | 0 | rightPair = EOS; |
598 | 0 | break; |
599 | 0 | } |
600 | 0 | UChar32 c = right[rightIndex++]; |
601 | 0 | if(c <= 0x7f) { |
602 | 0 | rightPair = table[c]; |
603 | 0 | } else if(c <= LATIN_MAX_UTF8_LEAD) { |
604 | 0 | rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; |
605 | 0 | } else { |
606 | 0 | rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); |
607 | 0 | } |
608 | 0 | if(rightPair >= MIN_SHORT) { |
609 | 0 | rightPair = getSecondariesFromOneShortCE(rightPair); |
610 | 0 | break; |
611 | 0 | } else if(rightPair > variableTop) { |
612 | 0 | rightPair = COMMON_SEC_PLUS_OFFSET; |
613 | 0 | break; |
614 | 0 | } else { |
615 | 0 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); |
616 | 0 | rightPair = getSecondaries(variableTop, rightPair); |
617 | 0 | } |
618 | 0 | } |
619 | |

620 | 0 | if(leftPair == rightPair) { |
621 | 0 | if(leftPair == EOS) { break; } |
622 | 0 | leftPair = rightPair = 0; |
623 | 0 | continue; |
624 | 0 | } |
625 | 0 | uint32_t leftSecondary = leftPair & 0xffff; |
626 | 0 | uint32_t rightSecondary = rightPair & 0xffff; |
627 | 0 | if(leftSecondary != rightSecondary) { |
628 | 0 | if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { |
629 | | // Full support for backwards secondary requires backwards contraction matching |
630 | | // and moving backwards between merge separators. |
631 | 0 | return BAIL_OUT_RESULT; |
632 | 0 | } |
633 | 0 | return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; |
634 | 0 | } |
635 | 0 | if(leftPair == EOS) { break; } |
636 | 0 | leftPair >>= 16; |
637 | 0 | rightPair >>= 16; |
638 | 0 | } |
639 | 0 | } |
640 | | |
// Case-level pass: runs only when CASE_LEVEL is set, independent of the strength.
// UPPER_FIRST inverts the result of a case difference.
641 | 0 | if((options & CollationSettings::CASE_LEVEL) != 0) { |
642 | 0 | UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; |
643 | 0 | leftIndex = rightIndex = 0; |
644 | 0 | leftPair = rightPair = 0; |
645 | 0 | for(;;) { |
646 | 0 | while(leftPair == 0) { |
647 | 0 | if(leftIndex == leftLength) { |
648 | 0 | leftPair = EOS; |
649 | 0 | break; |
650 | 0 | } |
651 | 0 | UChar32 c = left[leftIndex++]; |
652 | 0 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); |
653 | 0 | if(leftPair < MIN_LONG) { |
654 | 0 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); |
655 | 0 | } |
656 | 0 | leftPair = getCases(variableTop, strengthIsPrimary, leftPair); |
657 | 0 | } |
658 | |

659 | 0 | while(rightPair == 0) { |
660 | 0 | if(rightIndex == rightLength) { |
661 | 0 | rightPair = EOS; |
662 | 0 | break; |
663 | 0 | } |
664 | 0 | UChar32 c = right[rightIndex++]; |
665 | 0 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); |
666 | 0 | if(rightPair < MIN_LONG) { |
667 | 0 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); |
668 | 0 | } |
669 | 0 | rightPair = getCases(variableTop, strengthIsPrimary, rightPair); |
670 | 0 | } |
671 | |

672 | 0 | if(leftPair == rightPair) { |
673 | 0 | if(leftPair == EOS) { break; } |
674 | 0 | leftPair = rightPair = 0; |
675 | 0 | continue; |
676 | 0 | } |
677 | 0 | uint32_t leftCase = leftPair & 0xffff; |
678 | 0 | uint32_t rightCase = rightPair & 0xffff; |
679 | 0 | if(leftCase != rightCase) { |
680 | 0 | if((options & CollationSettings::UPPER_FIRST) == 0) { |
681 | 0 | return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; |
682 | 0 | } else { |
683 | 0 | return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; |
684 | 0 | } |
685 | 0 | } |
686 | 0 | if(leftPair == EOS) { break; } |
687 | 0 | leftPair >>= 16; |
688 | 0 | rightPair >>= 16; |
689 | 0 | } |
690 | 0 | } |
691 | 0 | if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } |
692 | | |
693 | | // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. |
694 | 0 | UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); |
695 | |

// Tertiary pass.
696 | 0 | leftIndex = rightIndex = 0; |
697 | 0 | leftPair = rightPair = 0; |
698 | 0 | for(;;) { |
699 | 0 | while(leftPair == 0) { |
700 | 0 | if(leftIndex == leftLength) { |
701 | 0 | leftPair = EOS; |
702 | 0 | break; |
703 | 0 | } |
704 | 0 | UChar32 c = left[leftIndex++]; |
705 | 0 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); |
706 | 0 | if(leftPair < MIN_LONG) { |
707 | 0 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); |
708 | 0 | } |
709 | 0 | leftPair = getTertiaries(variableTop, withCaseBits, leftPair); |
710 | 0 | } |
711 | |

712 | 0 | while(rightPair == 0) { |
713 | 0 | if(rightIndex == rightLength) { |
714 | 0 | rightPair = EOS; |
715 | 0 | break; |
716 | 0 | } |
717 | 0 | UChar32 c = right[rightIndex++]; |
718 | 0 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); |
719 | 0 | if(rightPair < MIN_LONG) { |
720 | 0 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); |
721 | 0 | } |
722 | 0 | rightPair = getTertiaries(variableTop, withCaseBits, rightPair); |
723 | 0 | } |
724 | |

725 | 0 | if(leftPair == rightPair) { |
726 | 0 | if(leftPair == EOS) { break; } |
727 | 0 | leftPair = rightPair = 0; |
728 | 0 | continue; |
729 | 0 | } |
730 | 0 | uint32_t leftTertiary = leftPair & 0xffff; |
731 | 0 | uint32_t rightTertiary = rightPair & 0xffff; |
732 | 0 | if(leftTertiary != rightTertiary) { |
733 | 0 | if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { |
734 | | // Pass through EOS and MERGE_WEIGHT |
735 | | // and keep real tertiary weights larger than the MERGE_WEIGHT. |
736 | | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. |
737 | 0 | if(leftTertiary > MERGE_WEIGHT) { |
738 | 0 | leftTertiary ^= CASE_MASK; |
739 | 0 | } |
740 | 0 | if(rightTertiary > MERGE_WEIGHT) { |
741 | 0 | rightTertiary ^= CASE_MASK; |
742 | 0 | } |
743 | 0 | } |
744 | 0 | return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; |
745 | 0 | } |
746 | 0 | if(leftPair == EOS) { break; } |
747 | 0 | leftPair >>= 16; |
748 | 0 | rightPair >>= 16; |
749 | 0 | } |
750 | 0 | if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } |
751 | | |
// Quaternary pass: reached only when the strength is above UCOL_TERTIARY.
752 | 0 | leftIndex = rightIndex = 0; |
753 | 0 | leftPair = rightPair = 0; |
754 | 0 | for(;;) { |
755 | 0 | while(leftPair == 0) { |
756 | 0 | if(leftIndex == leftLength) { |
757 | 0 | leftPair = EOS; |
758 | 0 | break; |
759 | 0 | } |
760 | 0 | UChar32 c = left[leftIndex++]; |
761 | 0 | leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); |
762 | 0 | if(leftPair < MIN_LONG) { |
763 | 0 | leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); |
764 | 0 | } |
765 | 0 | leftPair = getQuaternaries(variableTop, leftPair); |
766 | 0 | } |
767 | |

768 | 0 | while(rightPair == 0) { |
769 | 0 | if(rightIndex == rightLength) { |
770 | 0 | rightPair = EOS; |
771 | 0 | break; |
772 | 0 | } |
773 | 0 | UChar32 c = right[rightIndex++]; |
774 | 0 | rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); |
775 | 0 | if(rightPair < MIN_LONG) { |
776 | 0 | rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); |
777 | 0 | } |
778 | 0 | rightPair = getQuaternaries(variableTop, rightPair); |
779 | 0 | } |
780 | |

781 | 0 | if(leftPair == rightPair) { |
782 | 0 | if(leftPair == EOS) { break; } |
783 | 0 | leftPair = rightPair = 0; |
784 | 0 | continue; |
785 | 0 | } |
786 | 0 | uint32_t leftQuaternary = leftPair & 0xffff; |
787 | 0 | uint32_t rightQuaternary = rightPair & 0xffff; |
788 | 0 | if(leftQuaternary != rightQuaternary) { |
789 | 0 | return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; |
790 | 0 | } |
791 | 0 | if(leftPair == EOS) { break; } |
792 | 0 | leftPair >>= 16; |
793 | 0 | rightPair >>= 16; |
794 | 0 | } |
795 | 0 | return UCOL_EQUAL; |
796 | 0 | } |
797 | | |
798 | | uint32_t |
799 | 0 | CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { |
800 | 0 | U_ASSERT(c > LATIN_MAX); |
801 | 0 | if(PUNCT_START <= c && c < PUNCT_LIMIT) { |
802 | 0 | return table[c - PUNCT_START + LATIN_LIMIT]; |
803 | 0 | } else if(c == 0xfffe) { |
804 | 0 | return MERGE_WEIGHT; |
805 | 0 | } else if(c == 0xffff) { |
806 | 0 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; |
807 | 0 | } else { |
808 | 0 | return BAIL_OUT; |
809 | 0 | } |
810 | 0 | } |
811 | | |
812 | | uint32_t |
813 | | CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, |
814 | 0 | const uint8_t *s8, int32_t &sIndex, int32_t sLength) { |
815 | | // The caller handled ASCII and valid/supported Latin. |
816 | 0 | U_ASSERT(c > 0x7f); |
817 | 0 | int32_t i2 = sIndex + 1; |
818 | 0 | if(i2 < sLength || sLength < 0) { |
819 | 0 | uint8_t t1 = s8[sIndex]; |
820 | 0 | uint8_t t2 = s8[i2]; |
821 | 0 | sIndex += 2; |
822 | 0 | if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { |
823 | 0 | return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF |
824 | 0 | } else if(c == 0xef && t1 == 0xbf) { |
825 | 0 | if(t2 == 0xbe) { |
826 | 0 | return MERGE_WEIGHT; // U+FFFE |
827 | 0 | } else if(t2 == 0xbf) { |
828 | 0 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF |
829 | 0 | } |
830 | 0 | } |
831 | 0 | } |
832 | 0 | return BAIL_OUT; |
833 | 0 | } |
834 | | |
835 | | uint32_t |
836 | | CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, |
837 | 0 | const uint8_t *s8, int32_t &sIndex) { |
838 | | // The caller handled ASCII. |
839 | | // The string is well-formed and contains only supported characters. |
840 | 0 | U_ASSERT(c > 0x7f); |
841 | 0 | if(c <= LATIN_MAX_UTF8_LEAD) { |
842 | 0 | return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F |
843 | 0 | } |
844 | 0 | uint8_t t2 = s8[sIndex + 1]; |
845 | 0 | sIndex += 2; |
846 | 0 | if(c == 0xe2) { |
847 | 0 | return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF |
848 | 0 | } else if(t2 == 0xbe) { |
849 | 0 | return MERGE_WEIGHT; // U+FFFE |
850 | 0 | } else { |
851 | 0 | return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF |
852 | 0 | } |
853 | 0 | } |
854 | | |
855 | | uint32_t |
856 | | CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, |
857 | 0 | const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { |
858 | 0 | if(ce >= MIN_LONG || ce < CONTRACTION) { |
859 | 0 | return ce; // simple or special mini CE |
860 | 0 | } else if(ce >= EXPANSION) { |
861 | 0 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); |
862 | 0 | return ((uint32_t)table[index + 1] << 16) | table[index]; |
863 | 0 | } else /* ce >= CONTRACTION */ { |
864 | 0 | if(c == 0 && sLength < 0) { |
865 | 0 | sLength = sIndex - 1; |
866 | 0 | return EOS; |
867 | 0 | } |
868 | | // Contraction list: Default mapping followed by |
869 | | // 0 or more single-character contraction suffix mappings. |
870 | 0 | int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); |
871 | 0 | if(sIndex != sLength) { |
872 | | // Read the next character. |
873 | 0 | int32_t c2; |
874 | 0 | int32_t nextIndex = sIndex; |
875 | 0 | if(s16 != NULL) { |
876 | 0 | c2 = s16[nextIndex++]; |
877 | 0 | if(c2 > LATIN_MAX) { |
878 | 0 | if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { |
879 | 0 | c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF |
880 | 0 | } else if(c2 == 0xfffe || c2 == 0xffff) { |
881 | 0 | c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. |
882 | 0 | } else { |
883 | 0 | return BAIL_OUT; |
884 | 0 | } |
885 | 0 | } |
886 | 0 | } else { |
887 | 0 | c2 = s8[nextIndex++]; |
888 | 0 | if(c2 > 0x7f) { |
889 | 0 | uint8_t t; |
890 | 0 | if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && |
891 | 0 | 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { |
892 | 0 | c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F |
893 | 0 | ++nextIndex; |
894 | 0 | } else { |
895 | 0 | int32_t i2 = nextIndex + 1; |
896 | 0 | if(i2 < sLength || sLength < 0) { |
897 | 0 | if(c2 == 0xe2 && s8[nextIndex] == 0x80 && |
898 | 0 | 0x80 <= (t = s8[i2]) && t <= 0xbf) { |
899 | 0 | c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF |
900 | 0 | } else if(c2 == 0xef && s8[nextIndex] == 0xbf && |
901 | 0 | ((t = s8[i2]) == 0xbe || t == 0xbf)) { |
902 | 0 | c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. |
903 | 0 | } else { |
904 | 0 | return BAIL_OUT; |
905 | 0 | } |
906 | 0 | } else { |
907 | 0 | return BAIL_OUT; |
908 | 0 | } |
909 | 0 | nextIndex += 2; |
910 | 0 | } |
911 | 0 | } |
912 | 0 | } |
913 | 0 | if(c2 == 0 && sLength < 0) { |
914 | 0 | sLength = sIndex; |
915 | 0 | c2 = -1; |
916 | 0 | } |
917 | | // Look for the next character in the contraction suffix list, |
918 | | // which is in ascending order of single suffix characters. |
919 | 0 | int32_t i = index; |
920 | 0 | int32_t head = table[i]; // first skip the default mapping |
921 | 0 | int32_t x; |
922 | 0 | do { |
923 | 0 | i += head >> CONTR_LENGTH_SHIFT; |
924 | 0 | head = table[i]; |
925 | 0 | x = head & CONTR_CHAR_MASK; |
926 | 0 | } while(x < c2); |
927 | 0 | if(x == c2) { |
928 | 0 | index = i; |
929 | 0 | sIndex = nextIndex; |
930 | 0 | } |
931 | 0 | } |
932 | | // Return the CE or CEs for the default or contraction mapping. |
933 | 0 | int32_t length = table[index] >> CONTR_LENGTH_SHIFT; |
934 | 0 | if(length == 1) { |
935 | 0 | return BAIL_OUT; |
936 | 0 | } |
937 | 0 | ce = table[index + 1]; |
938 | 0 | if(length == 2) { |
939 | 0 | return ce; |
940 | 0 | } else { |
941 | 0 | return ((uint32_t)table[index + 2] << 16) | ce; |
942 | 0 | } |
943 | 0 | } |
944 | 0 | } |
945 | | |
946 | | uint32_t |
947 | 0 | CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { |
948 | 0 | if(pair <= 0xffff) { |
949 | | // one mini CE |
950 | 0 | if(pair >= MIN_SHORT) { |
951 | 0 | pair = getSecondariesFromOneShortCE(pair); |
952 | 0 | } else if(pair > variableTop) { |
953 | 0 | pair = COMMON_SEC_PLUS_OFFSET; |
954 | 0 | } else if(pair >= MIN_LONG) { |
955 | 0 | pair = 0; // variable |
956 | 0 | } |
957 | | // else special mini CE |
958 | 0 | } else { |
959 | 0 | uint32_t ce = pair & 0xffff; |
960 | 0 | if(ce >= MIN_SHORT) { |
961 | 0 | pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; |
962 | 0 | } else if(ce > variableTop) { |
963 | 0 | pair = TWO_COMMON_SEC_PLUS_OFFSET; |
964 | 0 | } else { |
965 | 0 | U_ASSERT(ce >= MIN_LONG); |
966 | 0 | pair = 0; // variable |
967 | 0 | } |
968 | 0 | } |
969 | 0 | return pair; |
970 | 0 | } |
971 | | |
972 | | uint32_t |
973 | 0 | CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { |
974 | | // Primary+caseLevel: Ignore case level weights of primary ignorables. |
975 | | // Otherwise: Ignore case level weights of secondary ignorables. |
976 | | // For details see the comments in the CollationCompare class. |
977 | | // Tertiary CEs (secondary ignorables) are not supported in fast Latin. |
978 | 0 | if(pair <= 0xffff) { |
979 | | // one mini CE |
980 | 0 | if(pair >= MIN_SHORT) { |
981 | | // A high secondary weight means we really have two CEs, |
982 | | // a primary CE and a secondary CE. |
983 | 0 | uint32_t ce = pair; |
984 | 0 | pair &= CASE_MASK; // explicit weight of primary CE |
985 | 0 | if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
986 | 0 | pair |= LOWER_CASE << 16; // implied weight of secondary CE |
987 | 0 | } |
988 | 0 | } else if(pair > variableTop) { |
989 | 0 | pair = LOWER_CASE; |
990 | 0 | } else if(pair >= MIN_LONG) { |
991 | 0 | pair = 0; // variable |
992 | 0 | } |
993 | | // else special mini CE |
994 | 0 | } else { |
995 | | // two mini CEs, same primary groups, neither expands like above |
996 | 0 | uint32_t ce = pair & 0xffff; |
997 | 0 | if(ce >= MIN_SHORT) { |
998 | 0 | if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { |
999 | 0 | pair &= CASE_MASK; |
1000 | 0 | } else { |
1001 | 0 | pair &= TWO_CASES_MASK; |
1002 | 0 | } |
1003 | 0 | } else if(ce > variableTop) { |
1004 | 0 | pair = TWO_LOWER_CASES; |
1005 | 0 | } else { |
1006 | 0 | U_ASSERT(ce >= MIN_LONG); |
1007 | 0 | pair = 0; // variable |
1008 | 0 | } |
1009 | 0 | } |
1010 | 0 | return pair; |
1011 | 0 | } |
1012 | | |
1013 | | uint32_t |
1014 | 0 | CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { |
1015 | 0 | if(pair <= 0xffff) { |
1016 | | // one mini CE |
1017 | 0 | if(pair >= MIN_SHORT) { |
1018 | | // A high secondary weight means we really have two CEs, |
1019 | | // a primary CE and a secondary CE. |
1020 | 0 | uint32_t ce = pair; |
1021 | 0 | if(withCaseBits) { |
1022 | 0 | pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; |
1023 | 0 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
1024 | 0 | pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; |
1025 | 0 | } |
1026 | 0 | } else { |
1027 | 0 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; |
1028 | 0 | if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
1029 | 0 | pair |= COMMON_TER_PLUS_OFFSET << 16; |
1030 | 0 | } |
1031 | 0 | } |
1032 | 0 | } else if(pair > variableTop) { |
1033 | 0 | pair = (pair & TERTIARY_MASK) + TER_OFFSET; |
1034 | 0 | if(withCaseBits) { |
1035 | 0 | pair |= LOWER_CASE; |
1036 | 0 | } |
1037 | 0 | } else if(pair >= MIN_LONG) { |
1038 | 0 | pair = 0; // variable |
1039 | 0 | } |
1040 | | // else special mini CE |
1041 | 0 | } else { |
1042 | | // two mini CEs, same primary groups, neither expands like above |
1043 | 0 | uint32_t ce = pair & 0xffff; |
1044 | 0 | if(ce >= MIN_SHORT) { |
1045 | 0 | if(withCaseBits) { |
1046 | 0 | pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; |
1047 | 0 | } else { |
1048 | 0 | pair &= TWO_TERTIARIES_MASK; |
1049 | 0 | } |
1050 | 0 | pair += TWO_TER_OFFSETS; |
1051 | 0 | } else if(ce > variableTop) { |
1052 | 0 | pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; |
1053 | 0 | if(withCaseBits) { |
1054 | 0 | pair |= TWO_LOWER_CASES; |
1055 | 0 | } |
1056 | 0 | } else { |
1057 | 0 | U_ASSERT(ce >= MIN_LONG); |
1058 | 0 | pair = 0; // variable |
1059 | 0 | } |
1060 | 0 | } |
1061 | 0 | return pair; |
1062 | 0 | } |
1063 | | |
1064 | | uint32_t |
1065 | 0 | CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { |
1066 | | // Return the primary weight of a variable CE, |
1067 | | // or the maximum primary weight for a non-variable, not-completely-ignorable CE. |
1068 | 0 | if(pair <= 0xffff) { |
1069 | | // one mini CE |
1070 | 0 | if(pair >= MIN_SHORT) { |
1071 | | // A high secondary weight means we really have two CEs, |
1072 | | // a primary CE and a secondary CE. |
1073 | 0 | if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { |
1074 | 0 | pair = TWO_SHORT_PRIMARIES_MASK; |
1075 | 0 | } else { |
1076 | 0 | pair = SHORT_PRIMARY_MASK; |
1077 | 0 | } |
1078 | 0 | } else if(pair > variableTop) { |
1079 | 0 | pair = SHORT_PRIMARY_MASK; |
1080 | 0 | } else if(pair >= MIN_LONG) { |
1081 | 0 | pair &= LONG_PRIMARY_MASK; // variable |
1082 | 0 | } |
1083 | | // else special mini CE |
1084 | 0 | } else { |
1085 | | // two mini CEs, same primary groups, neither expands like above |
1086 | 0 | uint32_t ce = pair & 0xffff; |
1087 | 0 | if(ce > variableTop) { |
1088 | 0 | pair = TWO_SHORT_PRIMARIES_MASK; |
1089 | 0 | } else { |
1090 | 0 | U_ASSERT(ce >= MIN_LONG); |
1091 | 0 | pair &= TWO_LONG_PRIMARIES_MASK; // variable |
1092 | 0 | } |
1093 | 0 | } |
1094 | 0 | return pair; |
1095 | 0 | } |
1096 | | |
1097 | | U_NAMESPACE_END |
1098 | | |
1099 | | #endif // !UCONFIG_NO_COLLATION |