/src/icu/source/common/ustrcase.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * |
6 | | * Copyright (C) 2001-2015, International Business Machines |
7 | | * Corporation and others. All Rights Reserved. |
8 | | * |
9 | | ******************************************************************************* |
10 | | * file name: ustrcase.cpp |
11 | | * encoding: UTF-8 |
12 | | * tab size: 8 (not used) |
13 | | * indentation:4 |
14 | | * |
15 | | * created on: 2002feb20 |
16 | | * created by: Markus W. Scherer |
17 | | * |
18 | | * Implementation file for string casing C API functions. |
19 | | * Uses functions from uchar.c for basic functionality that requires access |
20 | | * to the Unicode Character Database (uprops.dat). |
21 | | */ |
22 | | |
23 | | #include "unicode/utypes.h" |
24 | | #include "unicode/brkiter.h" |
25 | | #include "unicode/casemap.h" |
26 | | #include "unicode/edits.h" |
27 | | #include "unicode/ustring.h" |
28 | | #include "unicode/ucasemap.h" |
29 | | #include "unicode/ubrk.h" |
30 | | #include "unicode/utf.h" |
31 | | #include "unicode/utf16.h" |
32 | | #include "cmemory.h" |
33 | | #include "ucase.h" |
34 | | #include "ucasemap_imp.h" |
35 | | #include "ustr_imp.h" |
36 | | #include "uassert.h" |
37 | | |
38 | | U_NAMESPACE_BEGIN |
39 | | |
40 | | namespace { |
41 | | |
42 | | int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity, |
43 | 3.57k | Edits *edits, UErrorCode &errorCode) { |
44 | 3.57k | if (U_SUCCESS(errorCode)) { |
45 | 3.57k | if (destIndex > destCapacity) { |
46 | 0 | errorCode = U_BUFFER_OVERFLOW_ERROR; |
47 | 3.57k | } else if (edits != NULL) { |
48 | 0 | edits->copyErrorTo(errorCode); |
49 | 0 | } |
50 | 3.57k | } |
51 | 3.57k | return destIndex; |
52 | 3.57k | } |
53 | | |
54 | | } // namespace |
55 | | |
56 | | U_NAMESPACE_END |
57 | | |
58 | | U_NAMESPACE_USE |
59 | | |
60 | | /* string casing ------------------------------------------------------------ */ |
61 | | |
62 | | /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ |
63 | | static inline int32_t |
64 | | appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, |
65 | | int32_t result, const UChar *s, |
66 | 7.31M | int32_t cpLength, uint32_t options, icu::Edits *edits) { |
67 | 7.31M | UChar32 c; |
68 | 7.31M | int32_t length; |
69 | | |
70 | | /* decode the result */ |
71 | 7.31M | if(result<0) { |
72 | | /* (not) original code point */ |
73 | 6.79M | if(edits!=NULL) { |
74 | 0 | edits->addUnchanged(cpLength); |
75 | 0 | if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) { |
76 | 0 | return destIndex; |
77 | 0 | } |
78 | 0 | } |
79 | 6.79M | c=~result; |
80 | 6.79M | if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath |
81 | 6.77M | dest[destIndex++]=(UChar)c; |
82 | 6.77M | return destIndex; |
83 | 6.77M | } |
84 | 19.4k | length=cpLength; |
85 | 516k | } else { |
86 | 516k | if(result<=UCASE_MAX_STRING_LENGTH) { |
87 | 128k | c=U_SENTINEL; |
88 | 128k | length=result; |
89 | 388k | } else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath |
90 | 383k | dest[destIndex++]=(UChar)result; |
91 | 383k | if(edits!=NULL) { |
92 | 0 | edits->addReplace(cpLength, 1); |
93 | 0 | } |
94 | 383k | return destIndex; |
95 | 383k | } else { |
96 | 5.29k | c=result; |
97 | 5.29k | length=U16_LENGTH(c); |
98 | 5.29k | } |
99 | 133k | if(edits!=NULL) { |
100 | 0 | edits->addReplace(cpLength, length); |
101 | 0 | } |
102 | 133k | } |
103 | 152k | if(length>(INT32_MAX-destIndex)) { |
104 | 0 | return -1; // integer overflow |
105 | 0 | } |
106 | | |
107 | 152k | if(destIndex<destCapacity) { |
108 | | /* append the result */ |
109 | 152k | if(c>=0) { |
110 | | /* code point */ |
111 | 24.7k | UBool isError=FALSE; |
112 | 24.7k | U16_APPEND(dest, destIndex, destCapacity, c, isError); |
113 | 24.7k | if(isError) { |
114 | | /* overflow, nothing written */ |
115 | 0 | destIndex+=length; |
116 | 0 | } |
117 | 128k | } else { |
118 | | /* string */ |
119 | 128k | if((destIndex+length)<=destCapacity) { |
120 | 384k | while(length>0) { |
121 | 256k | dest[destIndex++]=*s++; |
122 | 256k | --length; |
123 | 256k | } |
124 | 128k | } else { |
125 | | /* overflow */ |
126 | 0 | destIndex+=length; |
127 | 0 | } |
128 | 128k | } |
129 | 152k | } else { |
130 | | /* preflight */ |
131 | 0 | destIndex+=length; |
132 | 0 | } |
133 | 152k | return destIndex; |
134 | 152k | } |
135 | | |
136 | | static inline int32_t |
137 | 0 | appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) { |
138 | 0 | if(destIndex<destCapacity) { |
139 | 0 | dest[destIndex]=c; |
140 | 0 | } else if(destIndex==INT32_MAX) { |
141 | 0 | return -1; // integer overflow |
142 | 0 | } |
143 | 0 | return destIndex+1; |
144 | 0 | } |
145 | | |
146 | | static inline int32_t |
147 | | appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity, |
148 | 0 | const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) { |
149 | 0 | if(length>0) { |
150 | 0 | if(edits!=NULL) { |
151 | 0 | edits->addUnchanged(length); |
152 | 0 | if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) { |
153 | 0 | return destIndex; |
154 | 0 | } |
155 | 0 | } |
156 | 0 | if(length>(INT32_MAX-destIndex)) { |
157 | 0 | return -1; // integer overflow |
158 | 0 | } |
159 | 0 | if((destIndex+length)<=destCapacity) { |
160 | 0 | u_memcpy(dest+destIndex, s, length); |
161 | 0 | } |
162 | 0 | destIndex+=length; |
163 | 0 | } |
164 | 0 | return destIndex; |
165 | 0 | } |
166 | | |
167 | | static UChar32 U_CALLCONV |
168 | 58.2k | utf16_caseContextIterator(void *context, int8_t dir) { |
169 | 58.2k | UCaseContext *csc=(UCaseContext *)context; |
170 | 58.2k | UChar32 c; |
171 | | |
172 | 58.2k | if(dir<0) { |
173 | | /* reset for backward iteration */ |
174 | 25.6k | csc->index=csc->cpStart; |
175 | 25.6k | csc->dir=dir; |
176 | 32.6k | } else if(dir>0) { |
177 | | /* reset for forward iteration */ |
178 | 28.7k | csc->index=csc->cpLimit; |
179 | 28.7k | csc->dir=dir; |
180 | 28.7k | } else { |
181 | | /* continue current iteration direction */ |
182 | 3.90k | dir=csc->dir; |
183 | 3.90k | } |
184 | | |
185 | 58.2k | if(dir<0) { |
186 | 26.6k | if(csc->start<csc->index) { |
187 | 26.5k | U16_PREV((const UChar *)csc->p, csc->start, csc->index, c); |
188 | 26.5k | return c; |
189 | 26.5k | } |
190 | 31.5k | } else { |
191 | 31.5k | if(csc->index<csc->limit) { |
192 | 31.4k | U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c); |
193 | 31.4k | return c; |
194 | 31.4k | } |
195 | 31.5k | } |
196 | 260 | return U_SENTINEL; |
197 | 58.2k | } |
198 | | |
199 | | /* |
200 | | * Case-maps [srcStart..srcLimit[ but takes |
201 | | * context [0..srcLength[ into account. |
202 | | */ |
203 | | static int32_t |
204 | | _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map, |
205 | | UChar *dest, int32_t destCapacity, |
206 | | const UChar *src, UCaseContext *csc, |
207 | | int32_t srcStart, int32_t srcLimit, |
208 | | icu::Edits *edits, |
209 | 3.57k | UErrorCode &errorCode) { |
210 | | /* case mapping loop */ |
211 | 3.57k | int32_t srcIndex=srcStart; |
212 | 3.57k | int32_t destIndex=0; |
213 | 7.31M | while(srcIndex<srcLimit) { |
214 | 7.31M | int32_t cpStart; |
215 | 7.31M | csc->cpStart=cpStart=srcIndex; |
216 | 7.31M | UChar32 c; |
217 | 7.31M | U16_NEXT(src, srcIndex, srcLimit, c); |
218 | 7.31M | csc->cpLimit=srcIndex; |
219 | 7.31M | const UChar *s; |
220 | 7.31M | c=map(c, utf16_caseContextIterator, csc, &s, caseLocale); |
221 | 7.31M | destIndex = appendResult(dest, destIndex, destCapacity, c, s, |
222 | 7.31M | srcIndex - cpStart, options, edits); |
223 | 7.31M | if (destIndex < 0) { |
224 | 0 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
225 | 0 | return 0; |
226 | 0 | } |
227 | 7.31M | } |
228 | | |
229 | 3.57k | return destIndex; |
230 | 3.57k | } |
231 | | |
232 | | #if !UCONFIG_NO_BREAK_ITERATION |
233 | | |
234 | | U_CFUNC int32_t U_CALLCONV |
235 | | ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter, |
236 | | UChar *dest, int32_t destCapacity, |
237 | | const UChar *src, int32_t srcLength, |
238 | | icu::Edits *edits, |
239 | 0 | UErrorCode &errorCode) { |
240 | 0 | if(U_FAILURE(errorCode)) { |
241 | 0 | return 0; |
242 | 0 | } |
243 | | |
244 | | /* set up local variables */ |
245 | 0 | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
246 | 0 | csc.p=(void *)src; |
247 | 0 | csc.limit=srcLength; |
248 | 0 | int32_t destIndex=0; |
249 | 0 | int32_t prev=0; |
250 | 0 | UBool isFirstIndex=TRUE; |
251 | | |
252 | | /* titlecasing loop */ |
253 | 0 | while(prev<srcLength) { |
254 | | /* find next index where to titlecase */ |
255 | 0 | int32_t index; |
256 | 0 | if(isFirstIndex) { |
257 | 0 | isFirstIndex=FALSE; |
258 | 0 | index=iter->first(); |
259 | 0 | } else { |
260 | 0 | index=iter->next(); |
261 | 0 | } |
262 | 0 | if(index==UBRK_DONE || index>srcLength) { |
263 | 0 | index=srcLength; |
264 | 0 | } |
265 | | |
266 | | /* |
267 | | * Unicode 4 & 5 section 3.13 Default Case Operations: |
268 | | * |
269 | | * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex |
270 | | * #29, "Text Boundaries." Between each pair of word boundaries, find the first |
271 | | * cased character F. If F exists, map F to default_title(F); then map each |
272 | | * subsequent character C to default_lower(C). |
273 | | * |
274 | | * In this implementation, segment [prev..index[ into 3 parts: |
275 | | * a) uncased characters (copy as-is) [prev..titleStart[ |
276 | | * b) first case letter (titlecase) [titleStart..titleLimit[ |
277 | | * c) subsequent characters (lowercase) [titleLimit..index[ |
278 | | */ |
279 | 0 | if(prev<index) { |
280 | | /* find and copy uncased characters [prev..titleStart[ */ |
281 | 0 | int32_t titleStart=prev; |
282 | 0 | int32_t titleLimit=prev; |
283 | 0 | UChar32 c; |
284 | 0 | U16_NEXT(src, titleLimit, index, c); |
285 | 0 | if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) { |
286 | | /* Adjust the titlecasing index (titleStart) to the next cased character. */ |
287 | 0 | for(;;) { |
288 | 0 | titleStart=titleLimit; |
289 | 0 | if(titleLimit==index) { |
290 | | /* |
291 | | * only uncased characters in [prev..index[ |
292 | | * stop with titleStart==titleLimit==index |
293 | | */ |
294 | 0 | break; |
295 | 0 | } |
296 | 0 | U16_NEXT(src, titleLimit, index, c); |
297 | 0 | if(UCASE_NONE!=ucase_getType(c)) { |
298 | 0 | break; /* cased letter at [titleStart..titleLimit[ */ |
299 | 0 | } |
300 | 0 | } |
301 | 0 | destIndex=appendUnchanged(dest, destIndex, destCapacity, |
302 | 0 | src+prev, titleStart-prev, options, edits); |
303 | 0 | if(destIndex<0) { |
304 | 0 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
305 | 0 | return 0; |
306 | 0 | } |
307 | 0 | } |
308 | | |
309 | 0 | if(titleStart<titleLimit) { |
310 | | /* titlecase c which is from [titleStart..titleLimit[ */ |
311 | 0 | csc.cpStart=titleStart; |
312 | 0 | csc.cpLimit=titleLimit; |
313 | 0 | const UChar *s; |
314 | 0 | c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale); |
315 | 0 | destIndex=appendResult(dest, destIndex, destCapacity, c, s, |
316 | 0 | titleLimit-titleStart, options, edits); |
317 | 0 | if(destIndex<0) { |
318 | 0 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
319 | 0 | return 0; |
320 | 0 | } |
321 | | |
322 | | /* Special case Dutch IJ titlecasing */ |
323 | 0 | if (titleStart+1 < index && |
324 | 0 | caseLocale == UCASE_LOC_DUTCH && |
325 | 0 | (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { |
326 | 0 | if (src[titleStart+1] == 0x006A) { |
327 | 0 | destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A); |
328 | 0 | if(destIndex<0) { |
329 | 0 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
330 | 0 | return 0; |
331 | 0 | } |
332 | 0 | if(edits!=NULL) { |
333 | 0 | edits->addReplace(1, 1); |
334 | 0 | } |
335 | 0 | titleLimit++; |
336 | 0 | } else if (src[titleStart+1] == 0x004A) { |
337 | | // Keep the capital J from getting lowercased. |
338 | 0 | destIndex=appendUnchanged(dest, destIndex, destCapacity, |
339 | 0 | src+titleStart+1, 1, options, edits); |
340 | 0 | if(destIndex<0) { |
341 | 0 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
342 | 0 | return 0; |
343 | 0 | } |
344 | 0 | titleLimit++; |
345 | 0 | } |
346 | 0 | } |
347 | | |
348 | | /* lowercase [titleLimit..index[ */ |
349 | 0 | if(titleLimit<index) { |
350 | 0 | if((options&U_TITLECASE_NO_LOWERCASE)==0) { |
351 | | /* Normal operation: Lowercase the rest of the word. */ |
352 | 0 | destIndex+= |
353 | 0 | _caseMap( |
354 | 0 | caseLocale, options, ucase_toFullLower, |
355 | 0 | dest+destIndex, destCapacity-destIndex, |
356 | 0 | src, &csc, |
357 | 0 | titleLimit, index, |
358 | 0 | edits, errorCode); |
359 | 0 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { |
360 | 0 | errorCode=U_ZERO_ERROR; |
361 | 0 | } |
362 | 0 | if(U_FAILURE(errorCode)) { |
363 | 0 | return destIndex; |
364 | 0 | } |
365 | 0 | } else { |
366 | | /* Optionally just copy the rest of the word unchanged. */ |
367 | 0 | destIndex=appendUnchanged(dest, destIndex, destCapacity, |
368 | 0 | src+titleLimit, index-titleLimit, options, edits); |
369 | 0 | if(destIndex<0) { |
370 | 0 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
371 | 0 | return 0; |
372 | 0 | } |
373 | 0 | } |
374 | 0 | } |
375 | 0 | } |
376 | 0 | } |
377 | | |
378 | 0 | prev=index; |
379 | 0 | } |
380 | | |
381 | 0 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); |
382 | 0 | } |
383 | | |
384 | | #endif // !UCONFIG_NO_BREAK_ITERATION |
385 | | |
386 | | U_NAMESPACE_BEGIN |
387 | | namespace GreekUpper { |
388 | | |
389 | | // Data generated by prototype code, see |
390 | | // http://site.icu-project.org/design/case/greek-upper |
391 | | // TODO: Move this data into ucase.icu. |
392 | | static const uint16_t data0370[] = { |
393 | | // U+0370..03FF |
394 | | 0x0370, |
395 | | 0x0370, |
396 | | 0x0372, |
397 | | 0x0372, |
398 | | 0, |
399 | | 0, |
400 | | 0x0376, |
401 | | 0x0376, |
402 | | 0, |
403 | | 0, |
404 | | 0x037A, |
405 | | 0x03FD, |
406 | | 0x03FE, |
407 | | 0x03FF, |
408 | | 0, |
409 | | 0x037F, |
410 | | 0, |
411 | | 0, |
412 | | 0, |
413 | | 0, |
414 | | 0, |
415 | | 0, |
416 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
417 | | 0, |
418 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
419 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
420 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
421 | | 0, |
422 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
423 | | 0, |
424 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
425 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
426 | | 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, |
427 | | 0x0391 | HAS_VOWEL, |
428 | | 0x0392, |
429 | | 0x0393, |
430 | | 0x0394, |
431 | | 0x0395 | HAS_VOWEL, |
432 | | 0x0396, |
433 | | 0x0397 | HAS_VOWEL, |
434 | | 0x0398, |
435 | | 0x0399 | HAS_VOWEL, |
436 | | 0x039A, |
437 | | 0x039B, |
438 | | 0x039C, |
439 | | 0x039D, |
440 | | 0x039E, |
441 | | 0x039F | HAS_VOWEL, |
442 | | 0x03A0, |
443 | | 0x03A1, |
444 | | 0, |
445 | | 0x03A3, |
446 | | 0x03A4, |
447 | | 0x03A5 | HAS_VOWEL, |
448 | | 0x03A6, |
449 | | 0x03A7, |
450 | | 0x03A8, |
451 | | 0x03A9 | HAS_VOWEL, |
452 | | 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, |
453 | | 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, |
454 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
455 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
456 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
457 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
458 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, |
459 | | 0x0391 | HAS_VOWEL, |
460 | | 0x0392, |
461 | | 0x0393, |
462 | | 0x0394, |
463 | | 0x0395 | HAS_VOWEL, |
464 | | 0x0396, |
465 | | 0x0397 | HAS_VOWEL, |
466 | | 0x0398, |
467 | | 0x0399 | HAS_VOWEL, |
468 | | 0x039A, |
469 | | 0x039B, |
470 | | 0x039C, |
471 | | 0x039D, |
472 | | 0x039E, |
473 | | 0x039F | HAS_VOWEL, |
474 | | 0x03A0, |
475 | | 0x03A1, |
476 | | 0x03A3, |
477 | | 0x03A3, |
478 | | 0x03A4, |
479 | | 0x03A5 | HAS_VOWEL, |
480 | | 0x03A6, |
481 | | 0x03A7, |
482 | | 0x03A8, |
483 | | 0x03A9 | HAS_VOWEL, |
484 | | 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, |
485 | | 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, |
486 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
487 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
488 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
489 | | 0x03CF, |
490 | | 0x0392, |
491 | | 0x0398, |
492 | | 0x03D2, |
493 | | 0x03D2 | HAS_ACCENT, |
494 | | 0x03D2 | HAS_DIALYTIKA, |
495 | | 0x03A6, |
496 | | 0x03A0, |
497 | | 0x03CF, |
498 | | 0x03D8, |
499 | | 0x03D8, |
500 | | 0x03DA, |
501 | | 0x03DA, |
502 | | 0x03DC, |
503 | | 0x03DC, |
504 | | 0x03DE, |
505 | | 0x03DE, |
506 | | 0x03E0, |
507 | | 0x03E0, |
508 | | 0, |
509 | | 0, |
510 | | 0, |
511 | | 0, |
512 | | 0, |
513 | | 0, |
514 | | 0, |
515 | | 0, |
516 | | 0, |
517 | | 0, |
518 | | 0, |
519 | | 0, |
520 | | 0, |
521 | | 0, |
522 | | 0x039A, |
523 | | 0x03A1, |
524 | | 0x03F9, |
525 | | 0x037F, |
526 | | 0x03F4, |
527 | | 0x0395 | HAS_VOWEL, |
528 | | 0, |
529 | | 0x03F7, |
530 | | 0x03F7, |
531 | | 0x03F9, |
532 | | 0x03FA, |
533 | | 0x03FA, |
534 | | 0x03FC, |
535 | | 0x03FD, |
536 | | 0x03FE, |
537 | | 0x03FF, |
538 | | }; |
539 | | |
540 | | static const uint16_t data1F00[] = { |
541 | | // U+1F00..1FFF |
542 | | 0x0391 | HAS_VOWEL, |
543 | | 0x0391 | HAS_VOWEL, |
544 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
545 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
546 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
547 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
548 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
549 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
550 | | 0x0391 | HAS_VOWEL, |
551 | | 0x0391 | HAS_VOWEL, |
552 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
553 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
554 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
555 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
556 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
557 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
558 | | 0x0395 | HAS_VOWEL, |
559 | | 0x0395 | HAS_VOWEL, |
560 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
561 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
562 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
563 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
564 | | 0, |
565 | | 0, |
566 | | 0x0395 | HAS_VOWEL, |
567 | | 0x0395 | HAS_VOWEL, |
568 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
569 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
570 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
571 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
572 | | 0, |
573 | | 0, |
574 | | 0x0397 | HAS_VOWEL, |
575 | | 0x0397 | HAS_VOWEL, |
576 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
577 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
578 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
579 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
580 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
581 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
582 | | 0x0397 | HAS_VOWEL, |
583 | | 0x0397 | HAS_VOWEL, |
584 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
585 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
586 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
587 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
588 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
589 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
590 | | 0x0399 | HAS_VOWEL, |
591 | | 0x0399 | HAS_VOWEL, |
592 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
593 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
594 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
595 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
596 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
597 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
598 | | 0x0399 | HAS_VOWEL, |
599 | | 0x0399 | HAS_VOWEL, |
600 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
601 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
602 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
603 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
604 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
605 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
606 | | 0x039F | HAS_VOWEL, |
607 | | 0x039F | HAS_VOWEL, |
608 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
609 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
610 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
611 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
612 | | 0, |
613 | | 0, |
614 | | 0x039F | HAS_VOWEL, |
615 | | 0x039F | HAS_VOWEL, |
616 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
617 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
618 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
619 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
620 | | 0, |
621 | | 0, |
622 | | 0x03A5 | HAS_VOWEL, |
623 | | 0x03A5 | HAS_VOWEL, |
624 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
625 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
626 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
627 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
628 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
629 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
630 | | 0, |
631 | | 0x03A5 | HAS_VOWEL, |
632 | | 0, |
633 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
634 | | 0, |
635 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
636 | | 0, |
637 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
638 | | 0x03A9 | HAS_VOWEL, |
639 | | 0x03A9 | HAS_VOWEL, |
640 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
641 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
642 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
643 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
644 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
645 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
646 | | 0x03A9 | HAS_VOWEL, |
647 | | 0x03A9 | HAS_VOWEL, |
648 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
649 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
650 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
651 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
652 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
653 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
654 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
655 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
656 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
657 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
658 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
659 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
660 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
661 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
662 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
663 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
664 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
665 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
666 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
667 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
668 | | 0, |
669 | | 0, |
670 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
671 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
672 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
673 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
674 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
675 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
676 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
677 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
678 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
679 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
680 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
681 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
682 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
683 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
684 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
685 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
686 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
687 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
688 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
689 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
690 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
691 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
692 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
693 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
694 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
695 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
696 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
697 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
698 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
699 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
700 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
701 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
702 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
703 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
704 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
705 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
706 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
707 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
708 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
709 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
710 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
711 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
712 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
713 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
714 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
715 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
716 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
717 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
718 | | 0x0391 | HAS_VOWEL, |
719 | | 0x0391 | HAS_VOWEL, |
720 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
721 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
722 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
723 | | 0, |
724 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
725 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
726 | | 0x0391 | HAS_VOWEL, |
727 | | 0x0391 | HAS_VOWEL, |
728 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
729 | | 0x0391 | HAS_VOWEL | HAS_ACCENT, |
730 | | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
731 | | 0, |
732 | | 0x0399 | HAS_VOWEL, |
733 | | 0, |
734 | | 0, |
735 | | 0, |
736 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
737 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
738 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
739 | | 0, |
740 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
741 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
742 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
743 | | 0x0395 | HAS_VOWEL | HAS_ACCENT, |
744 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
745 | | 0x0397 | HAS_VOWEL | HAS_ACCENT, |
746 | | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
747 | | 0, |
748 | | 0, |
749 | | 0, |
750 | | 0x0399 | HAS_VOWEL, |
751 | | 0x0399 | HAS_VOWEL, |
752 | | 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, |
753 | | 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, |
754 | | 0, |
755 | | 0, |
756 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
757 | | 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, |
758 | | 0x0399 | HAS_VOWEL, |
759 | | 0x0399 | HAS_VOWEL, |
760 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
761 | | 0x0399 | HAS_VOWEL | HAS_ACCENT, |
762 | | 0, |
763 | | 0, |
764 | | 0, |
765 | | 0, |
766 | | 0x03A5 | HAS_VOWEL, |
767 | | 0x03A5 | HAS_VOWEL, |
768 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, |
769 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, |
770 | | 0x03A1, |
771 | | 0x03A1, |
772 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
773 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, |
774 | | 0x03A5 | HAS_VOWEL, |
775 | | 0x03A5 | HAS_VOWEL, |
776 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
777 | | 0x03A5 | HAS_VOWEL | HAS_ACCENT, |
778 | | 0x03A1, |
779 | | 0, |
780 | | 0, |
781 | | 0, |
782 | | 0, |
783 | | 0, |
784 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
785 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
786 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
787 | | 0, |
788 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
789 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, |
790 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
791 | | 0x039F | HAS_VOWEL | HAS_ACCENT, |
792 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
793 | | 0x03A9 | HAS_VOWEL | HAS_ACCENT, |
794 | | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, |
795 | | 0, |
796 | | 0, |
797 | | 0, |
798 | | }; |
799 | | |
800 | | // U+2126 Ohm sign |
801 | | static const uint16_t data2126 = 0x03A9 | HAS_VOWEL; |
802 | | |
803 | 0 | uint32_t getLetterData(UChar32 c) { |
804 | 0 | if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) { |
805 | 0 | return 0; |
806 | 0 | } else if (c <= 0x3ff) { |
807 | 0 | return data0370[c - 0x370]; |
808 | 0 | } else if (c <= 0x1fff) { |
809 | 0 | return data1F00[c - 0x1f00]; |
810 | 0 | } else if (c == 0x2126) { |
811 | 0 | return data2126; |
812 | 0 | } else { |
813 | 0 | return 0; |
814 | 0 | } |
815 | 0 | } |
816 | | |
817 | 0 | uint32_t getDiacriticData(UChar32 c) { |
818 | 0 | switch (c) { |
819 | 0 | case 0x0300: // varia |
820 | 0 | case 0x0301: // tonos = oxia |
821 | 0 | case 0x0342: // perispomeni |
822 | 0 | case 0x0302: // circumflex can look like perispomeni |
823 | 0 | case 0x0303: // tilde can look like perispomeni |
824 | 0 | case 0x0311: // inverted breve can look like perispomeni |
825 | 0 | return HAS_ACCENT; |
826 | 0 | case 0x0308: // dialytika = diaeresis |
827 | 0 | return HAS_COMBINING_DIALYTIKA; |
828 | 0 | case 0x0344: // dialytika tonos |
829 | 0 | return HAS_COMBINING_DIALYTIKA | HAS_ACCENT; |
830 | 0 | case 0x0345: // ypogegrammeni = iota subscript |
831 | 0 | return HAS_YPOGEGRAMMENI; |
832 | 0 | case 0x0304: // macron |
833 | 0 | case 0x0306: // breve |
834 | 0 | case 0x0313: // comma above |
835 | 0 | case 0x0314: // reversed comma above |
836 | 0 | case 0x0343: // koronis |
837 | 0 | return HAS_OTHER_GREEK_DIACRITIC; |
838 | 0 | default: |
839 | 0 | return 0; |
840 | 0 | } |
841 | 0 | } |
842 | | |
843 | 0 | UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) { |
844 | 0 | while (i < length) { |
845 | 0 | UChar32 c; |
846 | 0 | U16_NEXT(s, i, length, c); |
847 | 0 | int32_t type = ucase_getTypeOrIgnorable(c); |
848 | 0 | if ((type & UCASE_IGNORABLE) != 0) { |
849 | | // Case-ignorable, continue with the loop. |
850 | 0 | } else if (type != UCASE_NONE) { |
851 | 0 | return TRUE; // Followed by cased letter. |
852 | 0 | } else { |
853 | 0 | return FALSE; // Uncased and not case-ignorable. |
854 | 0 | } |
855 | 0 | } |
856 | 0 | return FALSE; // Not followed by cased letter. |
857 | 0 | } |
858 | | |
859 | | /** |
860 | | * Greek string uppercasing with a state machine. |
861 | | * Probably simpler than a stateless function that has to figure out complex context-before |
862 | | * for each character. |
863 | | * TODO: Try to re-consolidate one way or another with the non-Greek function. |
864 | | */ |
865 | | int32_t toUpper(uint32_t options, |
866 | | UChar *dest, int32_t destCapacity, |
867 | | const UChar *src, int32_t srcLength, |
868 | | Edits *edits, |
869 | 0 | UErrorCode &errorCode) { |
870 | 0 | int32_t destIndex=0; |
871 | 0 | uint32_t state = 0; |
872 | 0 | for (int32_t i = 0; i < srcLength;) { |
873 | 0 | int32_t nextIndex = i; |
874 | 0 | UChar32 c; |
875 | 0 | U16_NEXT(src, nextIndex, srcLength, c); |
876 | 0 | uint32_t nextState = 0; |
877 | 0 | int32_t type = ucase_getTypeOrIgnorable(c); |
878 | 0 | if ((type & UCASE_IGNORABLE) != 0) { |
879 | | // c is case-ignorable |
880 | 0 | nextState |= (state & AFTER_CASED); |
881 | 0 | } else if (type != UCASE_NONE) { |
882 | | // c is cased |
883 | 0 | nextState |= AFTER_CASED; |
884 | 0 | } |
885 | 0 | uint32_t data = getLetterData(c); |
886 | 0 | if (data > 0) { |
887 | 0 | uint32_t upper = data & UPPER_MASK; |
888 | | // Add a dialytika to this iota or ypsilon vowel |
889 | | // if we removed a tonos from the previous vowel, |
890 | | // and that previous vowel did not also have (or gain) a dialytika. |
891 | | // Adding one only to the final vowel in a longer sequence |
892 | | // (which does not occur in normal writing) would require lookahead. |
893 | | // Set the same flag as for preserving an existing dialytika. |
894 | 0 | if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 && |
895 | 0 | (upper == 0x399 || upper == 0x3A5)) { |
896 | 0 | data |= HAS_DIALYTIKA; |
897 | 0 | } |
898 | 0 | int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. |
899 | 0 | if ((data & HAS_YPOGEGRAMMENI) != 0) { |
900 | 0 | numYpogegrammeni = 1; |
901 | 0 | } |
902 | | // Skip combining diacritics after this Greek letter. |
903 | 0 | while (nextIndex < srcLength) { |
904 | 0 | uint32_t diacriticData = getDiacriticData(src[nextIndex]); |
905 | 0 | if (diacriticData != 0) { |
906 | 0 | data |= diacriticData; |
907 | 0 | if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { |
908 | 0 | ++numYpogegrammeni; |
909 | 0 | } |
910 | 0 | ++nextIndex; |
911 | 0 | } else { |
912 | 0 | break; // not a Greek diacritic |
913 | 0 | } |
914 | 0 | } |
915 | 0 | if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { |
916 | 0 | nextState |= AFTER_VOWEL_WITH_ACCENT; |
917 | 0 | } |
918 | | // Map according to Greek rules. |
919 | 0 | UBool addTonos = FALSE; |
920 | 0 | if (upper == 0x397 && |
921 | 0 | (data & HAS_ACCENT) != 0 && |
922 | 0 | numYpogegrammeni == 0 && |
923 | 0 | (state & AFTER_CASED) == 0 && |
924 | 0 | !isFollowedByCasedLetter(src, nextIndex, srcLength)) { |
925 | | // Keep disjunctive "or" with (only) a tonos. |
926 | | // We use the same "word boundary" conditions as for the Final_Sigma test. |
927 | 0 | if (i == nextIndex) { |
928 | 0 | upper = 0x389; // Preserve the precomposed form. |
929 | 0 | } else { |
930 | 0 | addTonos = TRUE; |
931 | 0 | } |
932 | 0 | } else if ((data & HAS_DIALYTIKA) != 0) { |
933 | | // Preserve a vowel with dialytika in precomposed form if it exists. |
934 | 0 | if (upper == 0x399) { |
935 | 0 | upper = 0x3AA; |
936 | 0 | data &= ~HAS_EITHER_DIALYTIKA; |
937 | 0 | } else if (upper == 0x3A5) { |
938 | 0 | upper = 0x3AB; |
939 | 0 | data &= ~HAS_EITHER_DIALYTIKA; |
940 | 0 | } |
941 | 0 | } |
942 | |
|
943 | 0 | UBool change = TRUE; |
944 | 0 | if (edits != NULL) { |
945 | | // Find out first whether we are changing the text. |
946 | 0 | change = src[i] != upper || numYpogegrammeni > 0; |
947 | 0 | int32_t i2 = i + 1; |
948 | 0 | if ((data & HAS_EITHER_DIALYTIKA) != 0) { |
949 | 0 | change |= i2 >= nextIndex || src[i2] != 0x308; |
950 | 0 | ++i2; |
951 | 0 | } |
952 | 0 | if (addTonos) { |
953 | 0 | change |= i2 >= nextIndex || src[i2] != 0x301; |
954 | 0 | ++i2; |
955 | 0 | } |
956 | 0 | int32_t oldLength = nextIndex - i; |
957 | 0 | int32_t newLength = (i2 - i) + numYpogegrammeni; |
958 | 0 | change |= oldLength != newLength; |
959 | 0 | if (change) { |
960 | 0 | if (edits != NULL) { |
961 | 0 | edits->addReplace(oldLength, newLength); |
962 | 0 | } |
963 | 0 | } else { |
964 | 0 | if (edits != NULL) { |
965 | 0 | edits->addUnchanged(oldLength); |
966 | 0 | } |
967 | | // Write unchanged text? |
968 | 0 | change = (options & UCASEMAP_OMIT_UNCHANGED_TEXT) == 0; |
969 | 0 | } |
970 | 0 | } |
971 | |
|
972 | 0 | if (change) { |
973 | 0 | destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper); |
974 | 0 | if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) { |
975 | 0 | destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika |
976 | 0 | } |
977 | 0 | if (destIndex >= 0 && addTonos) { |
978 | 0 | destIndex=appendUChar(dest, destIndex, destCapacity, 0x301); |
979 | 0 | } |
980 | 0 | while (destIndex >= 0 && numYpogegrammeni > 0) { |
981 | 0 | destIndex=appendUChar(dest, destIndex, destCapacity, 0x399); |
982 | 0 | --numYpogegrammeni; |
983 | 0 | } |
984 | 0 | if(destIndex<0) { |
985 | 0 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
986 | 0 | return 0; |
987 | 0 | } |
988 | 0 | } |
989 | 0 | } else { |
990 | 0 | const UChar *s; |
991 | 0 | c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK); |
992 | 0 | destIndex = appendResult(dest, destIndex, destCapacity, c, s, |
993 | 0 | nextIndex - i, options, edits); |
994 | 0 | if (destIndex < 0) { |
995 | 0 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
996 | 0 | return 0; |
997 | 0 | } |
998 | 0 | } |
999 | 0 | i = nextIndex; |
1000 | 0 | state = nextState; |
1001 | 0 | } |
1002 | | |
1003 | 0 | return destIndex; |
1004 | 0 | } |
1005 | | |
1006 | | } // namespace GreekUpper |
1007 | | U_NAMESPACE_END |
1008 | | |
1009 | | /* functions available in the common library (for unistr_case.cpp) */ |
1010 | | |
1011 | | U_CFUNC int32_t U_CALLCONV |
1012 | | ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
1013 | | UChar *dest, int32_t destCapacity, |
1014 | | const UChar *src, int32_t srcLength, |
1015 | | icu::Edits *edits, |
1016 | 3.57k | UErrorCode &errorCode) { |
1017 | 3.57k | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
1018 | 3.57k | csc.p=(void *)src; |
1019 | 3.57k | csc.limit=srcLength; |
1020 | 3.57k | int32_t destIndex = _caseMap( |
1021 | 3.57k | caseLocale, options, ucase_toFullLower, |
1022 | 3.57k | dest, destCapacity, |
1023 | 3.57k | src, &csc, 0, srcLength, |
1024 | 3.57k | edits, errorCode); |
1025 | 3.57k | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); |
1026 | 3.57k | } |
1027 | | |
1028 | | U_CFUNC int32_t U_CALLCONV |
1029 | | ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
1030 | | UChar *dest, int32_t destCapacity, |
1031 | | const UChar *src, int32_t srcLength, |
1032 | | icu::Edits *edits, |
1033 | 0 | UErrorCode &errorCode) { |
1034 | 0 | int32_t destIndex; |
1035 | 0 | if (caseLocale == UCASE_LOC_GREEK) { |
1036 | 0 | destIndex = GreekUpper::toUpper(options, dest, destCapacity, |
1037 | 0 | src, srcLength, edits, errorCode); |
1038 | 0 | } else { |
1039 | 0 | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
1040 | 0 | csc.p=(void *)src; |
1041 | 0 | csc.limit=srcLength; |
1042 | 0 | destIndex = _caseMap( |
1043 | 0 | caseLocale, options, ucase_toFullUpper, |
1044 | 0 | dest, destCapacity, |
1045 | 0 | src, &csc, 0, srcLength, |
1046 | 0 | edits, errorCode); |
1047 | 0 | } |
1048 | 0 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); |
1049 | 0 | } |
1050 | | |
1051 | | U_CFUNC int32_t U_CALLCONV |
1052 | | ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
1053 | | UChar *dest, int32_t destCapacity, |
1054 | | const UChar *src, int32_t srcLength, |
1055 | | icu::Edits *edits, |
1056 | 0 | UErrorCode &errorCode) { |
1057 | | /* case mapping loop */ |
1058 | 0 | int32_t srcIndex = 0; |
1059 | 0 | int32_t destIndex = 0; |
1060 | 0 | while (srcIndex < srcLength) { |
1061 | 0 | int32_t cpStart = srcIndex; |
1062 | 0 | UChar32 c; |
1063 | 0 | U16_NEXT(src, srcIndex, srcLength, c); |
1064 | 0 | const UChar *s; |
1065 | 0 | c = ucase_toFullFolding(c, &s, options); |
1066 | 0 | destIndex = appendResult(dest, destIndex, destCapacity, c, s, |
1067 | 0 | srcIndex - cpStart, options, edits); |
1068 | 0 | if (destIndex < 0) { |
1069 | 0 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
1070 | 0 | return 0; |
1071 | 0 | } |
1072 | 0 | } |
1073 | | |
1074 | 0 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); |
1075 | 0 | } |
1076 | | |
1077 | | U_CFUNC int32_t |
1078 | | ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM |
1079 | | UChar *dest, int32_t destCapacity, |
1080 | | const UChar *src, int32_t srcLength, |
1081 | | UStringCaseMapper *stringCaseMapper, |
1082 | | icu::Edits *edits, |
1083 | 0 | UErrorCode &errorCode) { |
1084 | 0 | int32_t destLength; |
1085 | | |
1086 | | /* check argument values */ |
1087 | 0 | if(U_FAILURE(errorCode)) { |
1088 | 0 | return 0; |
1089 | 0 | } |
1090 | 0 | if( destCapacity<0 || |
1091 | 0 | (dest==NULL && destCapacity>0) || |
1092 | 0 | src==NULL || |
1093 | 0 | srcLength<-1 |
1094 | 0 | ) { |
1095 | 0 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
1096 | 0 | return 0; |
1097 | 0 | } |
1098 | | |
1099 | | /* get the string length */ |
1100 | 0 | if(srcLength==-1) { |
1101 | 0 | srcLength=u_strlen(src); |
1102 | 0 | } |
1103 | | |
1104 | | /* check for overlapping source and destination */ |
1105 | 0 | if( dest!=NULL && |
1106 | 0 | ((src>=dest && src<(dest+destCapacity)) || |
1107 | 0 | (dest>=src && dest<(src+srcLength))) |
1108 | 0 | ) { |
1109 | 0 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
1110 | 0 | return 0; |
1111 | 0 | } |
1112 | | |
1113 | 0 | if(edits!=NULL) { |
1114 | 0 | edits->reset(); |
1115 | 0 | } |
1116 | 0 | destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR |
1117 | 0 | dest, destCapacity, src, srcLength, edits, errorCode); |
1118 | 0 | return u_terminateUChars(dest, destCapacity, destLength, &errorCode); |
1119 | 0 | } |
1120 | | |
1121 | | U_CFUNC int32_t |
1122 | | ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM |
1123 | | UChar *dest, int32_t destCapacity, |
1124 | | const UChar *src, int32_t srcLength, |
1125 | | UStringCaseMapper *stringCaseMapper, |
1126 | 3.57k | UErrorCode &errorCode) { |
1127 | 3.57k | UChar buffer[300]; |
1128 | 3.57k | UChar *temp; |
1129 | | |
1130 | 3.57k | int32_t destLength; |
1131 | | |
1132 | | /* check argument values */ |
1133 | 3.57k | if(U_FAILURE(errorCode)) { |
1134 | 0 | return 0; |
1135 | 0 | } |
1136 | 3.57k | if( destCapacity<0 || |
1137 | 3.57k | (dest==NULL && destCapacity>0) || |
1138 | 3.57k | src==NULL || |
1139 | 3.57k | srcLength<-1 |
1140 | 3.57k | ) { |
1141 | 0 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
1142 | 0 | return 0; |
1143 | 0 | } |
1144 | | |
1145 | | /* get the string length */ |
1146 | 3.57k | if(srcLength==-1) { |
1147 | 0 | srcLength=u_strlen(src); |
1148 | 0 | } |
1149 | | |
1150 | | /* check for overlapping source and destination */ |
1151 | 3.57k | if( dest!=NULL && |
1152 | 3.57k | ((src>=dest && src<(dest+destCapacity)) || |
1153 | 3.57k | (dest>=src && dest<(src+srcLength))) |
1154 | 3.57k | ) { |
1155 | | /* overlap: provide a temporary destination buffer and later copy the result */ |
1156 | 0 | if(destCapacity<=UPRV_LENGTHOF(buffer)) { |
1157 | | /* the stack buffer is large enough */ |
1158 | 0 | temp=buffer; |
1159 | 0 | } else { |
1160 | | /* allocate a buffer */ |
1161 | 0 | temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); |
1162 | 0 | if(temp==NULL) { |
1163 | 0 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
1164 | 0 | return 0; |
1165 | 0 | } |
1166 | 0 | } |
1167 | 3.57k | } else { |
1168 | 3.57k | temp=dest; |
1169 | 3.57k | } |
1170 | | |
1171 | 3.57k | destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR |
1172 | 3.57k | temp, destCapacity, src, srcLength, NULL, errorCode); |
1173 | 3.57k | if(temp!=dest) { |
1174 | | /* copy the result string to the destination buffer */ |
1175 | 0 | if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) { |
1176 | 0 | u_memmove(dest, temp, destLength); |
1177 | 0 | } |
1178 | 0 | if(temp!=buffer) { |
1179 | 0 | uprv_free(temp); |
1180 | 0 | } |
1181 | 0 | } |
1182 | | |
1183 | 3.57k | return u_terminateUChars(dest, destCapacity, destLength, &errorCode); |
1184 | 3.57k | } |
1185 | | |
1186 | | /* public API functions */ |
1187 | | |
1188 | | U_CAPI int32_t U_EXPORT2 |
1189 | | u_strFoldCase(UChar *dest, int32_t destCapacity, |
1190 | | const UChar *src, int32_t srcLength, |
1191 | | uint32_t options, |
1192 | 0 | UErrorCode *pErrorCode) { |
1193 | 0 | return ustrcase_mapWithOverlap( |
1194 | 0 | UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL |
1195 | 0 | dest, destCapacity, |
1196 | 0 | src, srcLength, |
1197 | 0 | ustrcase_internalFold, *pErrorCode); |
1198 | 0 | } |
1199 | | |
1200 | | U_NAMESPACE_BEGIN |
1201 | | |
1202 | | int32_t CaseMap::fold( |
1203 | | uint32_t options, |
1204 | | const UChar *src, int32_t srcLength, |
1205 | | UChar *dest, int32_t destCapacity, Edits *edits, |
1206 | 0 | UErrorCode &errorCode) { |
1207 | 0 | return ustrcase_map( |
1208 | 0 | UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL |
1209 | 0 | dest, destCapacity, |
1210 | 0 | src, srcLength, |
1211 | 0 | ustrcase_internalFold, edits, errorCode); |
1212 | 0 | } |
1213 | | |
1214 | | U_NAMESPACE_END |
1215 | | |
1216 | | /* case-insensitive string comparisons -------------------------------------- */ |
1217 | | |
1218 | | /* |
1219 | | * This function is a copy of unorm_cmpEquivFold() minus the parts for |
1220 | | * canonical equivalence. |
1221 | | * Keep the functions in sync, and see there for how this works. |
1222 | | * The duplication is for modularization: |
1223 | | * It makes caseless (but not canonical caseless) matches independent of |
1224 | | * the normalization code. |
1225 | | */ |
1226 | | |
1227 | | /* stack element for previous-level source/decomposition pointers */ |
1228 | | struct CmpEquivLevel { |
1229 | | const UChar *start, *s, *limit; |
1230 | | }; |
1231 | | typedef struct CmpEquivLevel CmpEquivLevel; |
1232 | | |
/**
 * Internal implementation code comparing strings with case folding.
 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
 *
 * The match lengths are written only when matchLen1 is not NULL; in that case
 * matchLen2 must not be NULL either.
 *
 * @param s1            input string 1
 * @param length1       length of string 1, or -1 (NUL-terminated)
 * @param s2            input string 2
 * @param length2       length of string 2, or -1 (NUL-terminated)
 * @param options       compare options
 * @param matchLen1     (output) length of partial prefix match in s1
 * @param matchLen2     (output) length of partial prefix match in s2
 * @param pErrorCode    receives error status; if it already indicates a failure,
 *                      the function returns 0 immediately
 * @return The result of comparison: 0 if both strings end together,
 *         negative if string 1 ends first or its differing code unit is smaller,
 *         positive in the opposite cases
 */
static int32_t _cmpFold(
            const UChar *s1, int32_t length1,
            const UChar *s2, int32_t length2,
            uint32_t options,
            int32_t *matchLen1, int32_t *matchLen2,
            UErrorCode *pErrorCode) {
    int32_t cmpRes = 0;

    /* current-level start/limit - s1/s2 as current */
    const UChar *start1, *start2, *limit1, *limit2;

    /* points to the original start address */
    const UChar *org1, *org2;

    /* points to the end of match + 1 */
    const UChar *m1, *m2;

    /* case folding variables */
    const UChar *p;
    int32_t length;

    /* stacks of previous-level start/current/limit */
    CmpEquivLevel stack1[2], stack2[2];

    /* case folding buffers, only use current-level start/limit */
    UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];

    /* track which is the current level per string */
    int32_t level1, level2;

    /* current code units, and code points for lookups */
    UChar32 c1, c2, cp1, cp2;

    /* no argument error checking because this itself is not an API */

    /*
     * assume that at least the option U_COMPARE_IGNORE_CASE is set
     * otherwise this function would have to behave exactly as uprv_strCompare()
     */
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* initialize */
    if(matchLen1) {
        U_ASSERT(matchLen2 !=NULL);
        *matchLen1=0;
        *matchLen2=0;
    }

    /* a NULL limit marks a NUL-terminated string */
    start1=m1=org1=s1;
    if(length1==-1) {
        limit1=NULL;
    } else {
        limit1=s1+length1;
    }

    start2=m2=org2=s2;
    if(length2==-1) {
        limit2=NULL;
    } else {
        limit2=s2+length2;
    }

    level1=level2=0;
    c1=c2=-1;

    /* comparison loop */
    for(;;) {
        /*
         * here a code unit value of -1 means "get another code unit"
         * below it will mean "this source is finished"
         */

        if(c1<0) {
            /* get next code unit from string 1, post-increment */
            for(;;) {
                if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
                    /* end of the current level; at level 0 the whole string is finished */
                    if(level1==0) {
                        c1=-1;
                        break;
                    }
                } else {
                    ++s1;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level1;
                    start1=stack1[level1].start;    /*Not uninitialized*/
                } while(start1==NULL);
                s1=stack1[level1].s;                /*Not uninitialized*/
                limit1=stack1[level1].limit;        /*Not uninitialized*/
            }
        }

        if(c2<0) {
            /* get next code unit from string 2, post-increment */
            for(;;) {
                if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
                    /* end of the current level; at level 0 the whole string is finished */
                    if(level2==0) {
                        c2=-1;
                        break;
                    }
                } else {
                    ++s2;
                    break;
                }

                /* reached end of level buffer, pop one level */
                do {
                    --level2;
                    start2=stack2[level2].start;    /*Not uninitialized*/
                } while(start2==NULL);
                s2=stack2[level2].s;                /*Not uninitialized*/
                limit2=stack2[level2].limit;        /*Not uninitialized*/
            }
        }

        /*
         * compare c1 and c2
         * either variable c1, c2 is -1 only if the corresponding string is finished
         */
        if(c1==c2) {
            const UChar *next1, *next2;

            if(c1<0) {
                cmpRes=0;   /* c1==c2==-1 indicating end of strings */
                break;
            }

            /*
             * Note: Move the match positions in both strings at the same time
             *       only when corresponding code point(s) in the original strings
             *       are fully consumed. For example, when comparing s1="Fust" and
             *       s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
             *       the first code point in the case-folded data. But the second "s"
             *       has no matching code point in s1, so this implementation returns
             *       2 as the prefix match length ("Fu").
             */
            next1=next2=NULL;
            if(level1==0) {
                next1=s1;
            } else if(s1==limit1) {
                /* Note: This implementation only uses a single level of stack.
                 *       If this code needs to be changed to use multiple levels
                 *       of stacks, the code above should check if the current
                 *       code is at the end of all stacks.
                 */
                U_ASSERT(level1==1);

                /* is s1 at the end of the current stack? */
                next1=stack1[0].s;
            }

            /* the match positions advance only if both sides are at a boundary */
            if (next1!=NULL) {
                if(level2==0) {
                    next2=s2;
                } else if(s2==limit2) {
                    U_ASSERT(level2==1);

                    /* is s2 at the end of the current stack? */
                    next2=stack2[0].s;
                }
                if(next2!=NULL) {
                    m1=next1;
                    m2=next2;
                }
            }
            c1=c2=-1;       /* make us fetch new code units */
            continue;
        } else if(c1<0) {
            cmpRes=-1;      /* string 1 ends before string 2 */
            break;
        } else if(c2<0) {
            cmpRes=1;       /* string 2 ends before string 1 */
            break;
        }
        /* c1!=c2 && c1>=0 && c2>=0 */

        /* get complete code points for c1, c2 for lookups if either is a surrogate */
        cp1=c1;
        if(U_IS_SURROGATE(c1)) {
            UChar c;

            if(U_IS_SURROGATE_LEAD(c1)) {
                if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
                    /* advance ++s1; only below if cp1 decomposes/case-folds */
                    cp1=U16_GET_SUPPLEMENTARY(c1, c);
                }
            } else /* isTrail(c1) */ {
                /* s1 was post-incremented, so the preceding code unit is at s1-2 */
                if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
                    cp1=U16_GET_SUPPLEMENTARY(c, c1);
                }
            }
        }

        cp2=c2;
        if(U_IS_SURROGATE(c2)) {
            UChar c;

            if(U_IS_SURROGATE_LEAD(c2)) {
                if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
                    /* advance ++s2; only below if cp2 decomposes/case-folds */
                    cp2=U16_GET_SUPPLEMENTARY(c2, c);
                }
            } else /* isTrail(c2) */ {
                /* s2 was post-incremented, so the preceding code unit is at s2-2 */
                if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
                    cp2=U16_GET_SUPPLEMENTARY(c, c2);
                }
            }
        }

        /*
         * go down one level for each string
         * continue with the main loop as soon as there is a real change
         */

        if( level1==0 &&
            (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
        ) {
            /* cp1 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c1)) {
                if(U_IS_SURROGATE_LEAD(c1)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s1;
                } else /* isTrail(c1) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s2;
                    --m2;
                    c2=*(s2-1);
                }
            }

            /* push current level pointers */
            stack1[0].start=start1;
            stack1[0].s=s1;
            stack1[0].limit=limit1;
            ++level1;

            /* copy the folding result to fold1[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                u_memcpy(fold1, p, length);
            } else {
                /* the result is a single code point: encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold1, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start1=s1=fold1;
            limit1=fold1+length;

            /* get ready to read from decomposition, continue with loop */
            c1=-1;
            continue;
        }

        if( level2==0 &&
            (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
        ) {
            /* cp2 case-folds to the code point "length" or to p[length] */
            if(U_IS_SURROGATE(c2)) {
                if(U_IS_SURROGATE_LEAD(c2)) {
                    /* advance beyond source surrogate pair if it case-folds */
                    ++s2;
                } else /* isTrail(c2) */ {
                    /*
                     * we got a supplementary code point when hitting its trail surrogate,
                     * therefore the lead surrogate must have been the same as in the other string;
                     * compare this decomposition with the lead surrogate in the other string
                     * remember that this simulates bulk text replacement:
                     * the decomposition would replace the entire code point
                     */
                    --s1;
                    /*
                     * NOTE(review): the mirrored string-1 branch above steps back s2 and m2;
                     * here s1 is stepped back but m2 (not m1) is adjusted.
                     * Confirm whether --m1 was intended.
                     */
                    --m2;
                    c1=*(s1-1);
                }
            }

            /* push current level pointers */
            stack2[0].start=start2;
            stack2[0].s=s2;
            stack2[0].limit=limit2;
            ++level2;

            /* copy the folding result to fold2[] */
            if(length<=UCASE_MAX_STRING_LENGTH) {
                u_memcpy(fold2, p, length);
            } else {
                /* the result is a single code point: encode it as UTF-16 */
                int32_t i=0;
                U16_APPEND_UNSAFE(fold2, i, length);
                length=i;
            }

            /* set next level pointers to case folding */
            start2=s2=fold2;
            limit2=fold2+length;

            /* get ready to read from decomposition, continue with loop */
            c2=-1;
            continue;
        }

        /*
         * no decomposition/case folding, max level for both sides:
         * return difference result
         *
         * code point order comparison must not just return cp1-cp2
         * because when single surrogates are present then the surrogate pairs
         * that formed cp1 and cp2 may be from different string indexes
         *
         * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
         * c1=d800 cp1=10001 c2=dc00 cp2=10000
         * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
         *
         * therefore, use same fix-up as in ustring.c/uprv_strCompare()
         * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
         * so we have slightly different pointer/start/limit comparisons here
         */

        if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
            /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
            if(
                (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
                (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c1-=0x2800;
            }

            if(
                (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
                (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
            ) {
                /* part of a surrogate pair, leave >=d800 */
            } else {
                /* BMP code point - may be surrogate code point - make <d800 */
                c2-=0x2800;
            }
        }

        cmpRes=c1-c2;
        break;
    }

    /* report how far the original strings matched, in code units from their starts */
    if(matchLen1) {
        *matchLen1=m1-org1;
        *matchLen2=m2-org2;
    }
    return cmpRes;
}
1608 | | |
1609 | | /* internal function */ |
1610 | | U_CFUNC int32_t |
1611 | | u_strcmpFold(const UChar *s1, int32_t length1, |
1612 | | const UChar *s2, int32_t length2, |
1613 | | uint32_t options, |
1614 | 0 | UErrorCode *pErrorCode) { |
1615 | 0 | return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode); |
1616 | 0 | } |
1617 | | |
1618 | | /* public API functions */ |
1619 | | |
1620 | | U_CAPI int32_t U_EXPORT2 |
1621 | | u_strCaseCompare(const UChar *s1, int32_t length1, |
1622 | | const UChar *s2, int32_t length2, |
1623 | | uint32_t options, |
1624 | 0 | UErrorCode *pErrorCode) { |
1625 | | /* argument checking */ |
1626 | 0 | if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { |
1627 | 0 | return 0; |
1628 | 0 | } |
1629 | 0 | if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { |
1630 | 0 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
1631 | 0 | return 0; |
1632 | 0 | } |
1633 | 0 | return u_strcmpFold(s1, length1, s2, length2, |
1634 | 0 | options|U_COMPARE_IGNORE_CASE, |
1635 | 0 | pErrorCode); |
1636 | 0 | } |
1637 | | |
1638 | | U_CAPI int32_t U_EXPORT2 |
1639 | 0 | u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) { |
1640 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
1641 | 0 | return u_strcmpFold(s1, -1, s2, -1, |
1642 | 0 | options|U_COMPARE_IGNORE_CASE, |
1643 | 0 | &errorCode); |
1644 | 0 | } |
1645 | | |
1646 | | U_CAPI int32_t U_EXPORT2 |
1647 | 0 | u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) { |
1648 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
1649 | 0 | return u_strcmpFold(s1, length, s2, length, |
1650 | 0 | options|U_COMPARE_IGNORE_CASE, |
1651 | 0 | &errorCode); |
1652 | 0 | } |
1653 | | |
1654 | | U_CAPI int32_t U_EXPORT2 |
1655 | 0 | u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { |
1656 | 0 | UErrorCode errorCode=U_ZERO_ERROR; |
1657 | 0 | return u_strcmpFold(s1, n, s2, n, |
1658 | 0 | options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), |
1659 | 0 | &errorCode); |
1660 | 0 | } |
1661 | | |
1662 | | /* internal API - detect length of shared prefix */ |
1663 | | U_CAPI void |
1664 | | u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1, |
1665 | | const UChar *s2, int32_t length2, |
1666 | | uint32_t options, |
1667 | | int32_t *matchLen1, int32_t *matchLen2, |
1668 | 0 | UErrorCode *pErrorCode) { |
1669 | 0 | _cmpFold(s1, length1, s2, length2, options, |
1670 | 0 | matchLen1, matchLen2, pErrorCode); |
1671 | 0 | } |