Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/common/ucasemap.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2005-2016, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  ucasemap.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2005may06
16
*   created by: Markus W. Scherer
17
*
18
*   Case mapping service object and functions using it.
19
*/
20
21
#include "unicode/utypes.h"
22
#include "unicode/brkiter.h"
23
#include "unicode/bytestream.h"
24
#include "unicode/casemap.h"
25
#include "unicode/edits.h"
26
#include "unicode/stringoptions.h"
27
#include "unicode/stringpiece.h"
28
#include "unicode/ubrk.h"
29
#include "unicode/uloc.h"
30
#include "unicode/ustring.h"
31
#include "unicode/ucasemap.h"
32
#if !UCONFIG_NO_BREAK_ITERATION
33
#include "unicode/utext.h"
34
#endif
35
#include "unicode/utf.h"
36
#include "unicode/utf8.h"
37
#include "unicode/utf16.h"
38
#include "bytesinkutil.h"
39
#include "cmemory.h"
40
#include "cstring.h"
41
#include "uassert.h"
42
#include "ucase.h"
43
#include "ucasemap_imp.h"
44
#include "ustr_imp.h"
45
46
U_NAMESPACE_USE
47
48
/* UCaseMap service object -------------------------------------------------- */
49
50
/**
 * Builds a case-mapping service object.
 *
 * The case locale starts out as UCASE_LOC_UNKNOWN and the options are stored
 * as given; the locale fields are then filled in by ucasemap_setLocale(),
 * which also reports any failure through pErrorCode.
 *
 * @param localeID   locale ID handed to ucasemap_setLocale()
 * @param opts       option bit set stored in this object
 * @param pErrorCode in/out ICU error code
 */
UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode)
#if !UCONFIG_NO_BREAK_ITERATION
        : iter(nullptr),
          caseLocale(UCASE_LOC_UNKNOWN),
          options(opts)
#else
        : caseLocale(UCASE_LOC_UNKNOWN),
          options(opts)
#endif
{
    ucasemap_setLocale(this, localeID, pErrorCode);
}
57
58
0
/** Destroys the service object, deleting the break iterator when break iteration is compiled in. */
UCaseMap::~UCaseMap() {
#if !UCONFIG_NO_BREAK_ITERATION
    // Deleting a null iterator is a no-op, so no check is needed.
    delete iter;
#endif
}
63
64
/**
 * Allocates and initializes a UCaseMap.
 *
 * @param locale     locale ID passed on to the UCaseMap constructor
 * @param options    option bit set passed on to the UCaseMap constructor
 * @param pErrorCode in/out ICU error code; nothing is done if it already
 *                   indicates a failure
 * @return the new object, or a null pointer if the incoming error code was a
 *         failure, the allocation failed (U_MEMORY_ALLOCATION_ERROR is set),
 *         or the constructor reported a failure (the object is deleted)
 */
U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }

    UCaseMap *result = new UCaseMap(locale, options, pErrorCode);
    if (result == nullptr) {
        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
    } else if (U_FAILURE(*pErrorCode)) {
        // Construction failed part-way: do not hand out a half-initialized object.
        delete result;
        result = nullptr;
    }
    return result;
}
79
80
/**
 * Deletes a UCaseMap.
 *
 * @param csm the object to delete; a null pointer is accepted because
 *            deleting a null pointer has no effect
 */
U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap *csm) {
    delete csm;
}
84
85
/**
 * Returns the locale ID stored in the case map.
 *
 * @param csm the case map; dereferenced without a null check
 * @return pointer to the object's internal locale buffer
 */
U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap *csm) {
    const char *storedLocale = csm->locale;
    return storedLocale;
}
89
90
/**
 * Returns the option bit set stored in the case map.
 *
 * @param csm the case map; dereferenced without a null check
 * @return the stored options value
 */
U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap *csm) {
    const uint32_t storedOptions = csm->options;
    return storedOptions;
}
94
95
/**
 * Sets the locale of a case map and derives its case-mapping locale code.
 *
 * An empty (but non-null) locale string selects the root behavior directly.
 * Otherwise the locale name is written into the object's fixed-size buffer by
 * uloc_getName(). If the full name does not fit, only the language code is
 * stored, via uloc_getLanguage(). If even that fills the buffer completely,
 * U_BUFFER_OVERFLOW_ERROR is reported.
 *
 * On any failure the stored locale is reset to the empty string and the case
 * locale to UCASE_LOC_ROOT.
 *
 * @param csm        the case map to modify
 * @param locale     locale ID; passed unchanged to uloc_getName() when it is
 *                   null or non-empty
 * @param pErrorCode in/out ICU error code; nothing is done if it already
 *                   indicates a failure
 */
U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return;
    }
    if (locale != NULL && *locale == 0) {
        csm->locale[0] = 0;
        csm->caseLocale = UCASE_LOC_ROOT;
        return;
    }

    // Use one signed capacity value everywhere so that the length comparisons
    // below do not mix int32_t with size_t.
    const int32_t capacity = (int32_t)sizeof(csm->locale);

    int32_t length=uloc_getName(locale, csm->locale, capacity, pErrorCode);
    if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==capacity) {
        *pErrorCode=U_ZERO_ERROR;
        /* we only really need the language code for case mappings */
        length=uloc_getLanguage(locale, csm->locale, capacity, pErrorCode);
    }
    if(length==capacity) {
        // The buffer was filled completely: treat it as an overflow.
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    if(U_SUCCESS(*pErrorCode)) {
        csm->caseLocale = ucase_getCaseLocale(csm->locale);
    } else {
        csm->locale[0]=0;
        csm->caseLocale = UCASE_LOC_ROOT;
    }
}
123
124
/**
 * Stores a new option bit set in the case map.
 *
 * @param csm        the case map to modify
 * @param options    the new options value
 * @param pErrorCode in/out ICU error code; the options are left untouched if
 *                   it already indicates a failure
 */
U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
    if (U_SUCCESS(*pErrorCode)) {
        csm->options = options;
    }
}
131
132
/* UTF-8 string case mappings ----------------------------------------------- */
133
134
/* TODO(markus): Move to a new, separate utf8case.cpp file. */
135
136
namespace {
137
138
/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
139
inline UBool
140
appendResult(int32_t cpLength, int32_t result, const UChar *s,
141
0
             ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
142
0
    U_ASSERT(U_SUCCESS(errorCode));
143
0
144
0
    /* decode the result */
145
0
    if(result<0) {
146
0
        /* (not) original code point */
147
0
        if(edits!=NULL) {
148
0
            edits->addUnchanged(cpLength);
149
0
        }
150
0
        if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
151
0
            ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
152
0
        }
153
0
    } else {
154
0
        if(result<=UCASE_MAX_STRING_LENGTH) {
155
0
            // string: "result" is the UTF-16 length
156
0
            return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
157
0
        } else {
158
0
            ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
159
0
        }
160
0
    }
161
0
    return TRUE;
162
0
}
163
164
// See unicode/utf8.h U8_APPEND_UNSAFE().
165
0
inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
166
0
inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
167
168
/**
 * Context iterator over UTF-8 text, passed as a callback to the ucase_toFull...() functions.
 *
 * @param context pointer to a UCaseContext describing the text (p, start,
 *                limit) and the current code point (cpStart, cpLimit)
 * @param dir     <0: restart iterating backward from cpStart;
 *                >0: restart iterating forward from cpLimit;
 *                 0: continue in the direction stored in the context
 * @return the next code point in the selected direction, or U_SENTINEL when
 *         the text boundary has been reached
 */
UChar32 U_CALLCONV
utf8_caseContextIterator(void *context, int8_t dir) {
    UCaseContext *ctx = (UCaseContext *)context;

    if (dir == 0) {
        /* continue current iteration direction */
        dir = ctx->dir;
    } else {
        /* reset: backward starts before the code point, forward starts after it */
        ctx->index = (dir < 0) ? ctx->cpStart : ctx->cpLimit;
        ctx->dir = dir;
    }

    UChar32 cp;
    if (dir < 0) {
        if (ctx->start < ctx->index) {
            U8_PREV((const uint8_t *)ctx->p, ctx->start, ctx->index, cp);
            return cp;
        }
    } else if (ctx->index < ctx->limit) {
        U8_NEXT((const uint8_t *)ctx->p, ctx->index, ctx->limit, cp);
        return cp;
    }
    return U_SENTINEL;
}
199
200
/**
 * Lowercases or case-folds UTF-8 text and appends the result to a sink.
 *
 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
 *
 * @param caseLocale case locale code, or a negative value for case folding
 * @param options    option bit set; passed to the append helpers and, for
 *                   case folding, to ucase_toFullFolding()
 * @param src        the UTF-8 text
 * @param csc        case context; only dereferenced when caseLocale >= 0
 * @param srcStart   index of the first byte to map
 * @param srcLimit   index just past the last byte to map
 * @param sink       output sink
 * @param edits      optional edits recorder, may be null
 * @param errorCode  in/out ICU error code; the loop stops once it is a failure
 */
204
void toLower(int32_t caseLocale, uint32_t options,
205
             const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
206
0
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
207
0
    // Select the per-code-point delta table used by the U+0000..U+017F fast path.
    const int8_t *latinToLower;
208
0
    if (caseLocale == UCASE_LOC_ROOT ||
209
0
            (caseLocale >= 0 ?
210
0
                !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
211
0
                (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
212
0
        latinToLower = LatinCase::TO_LOWER_NORMAL;
213
0
    } else {
214
0
        latinToLower = LatinCase::TO_LOWER_TR_LT;
215
0
    }
216
0
    const UTrie2 *trie = ucase_getTrie();
217
0
    // prev is the start of the source text that has not been passed to the sink yet;
    // unchanged runs are accumulated and flushed only right before a change.
    int32_t prev = srcStart;
218
0
    int32_t srcIndex = srcStart;
219
0
    for (;;) {
220
0
        // fast path for simple cases
221
0
        int32_t cpStart;
222
0
        UChar32 c;
223
0
        for (;;) {
224
0
            if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
225
0
                // End of input or error: a negative c terminates the outer loop as well.
                c = U_SENTINEL;
226
0
                break;
227
0
            }
228
0
            uint8_t lead = src[srcIndex++];
229
0
            if (lead <= 0x7f) {
                // ASCII: d is the delta to add, 0 for "no change", or EXC for "use the slow path".
230
0
                int8_t d = latinToLower[lead];
231
0
                if (d == LatinCase::EXC) {
232
0
                    cpStart = srcIndex - 1;
233
0
                    c = lead;
234
0
                    break;
235
0
                }
236
0
                if (d == 0) { continue; }
237
0
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
238
0
                                              sink, options, edits, errorCode);
239
0
                char ascii = (char)(lead + d);
240
0
                sink.Append(&ascii, 1);
241
0
                if (edits != nullptr) {
242
0
                    edits->addReplace(1, 1);
243
0
                }
244
0
                prev = srcIndex;
245
0
                continue;
246
0
            } else if (lead < 0xe3) {
247
0
                uint8_t t;
                // Two-byte sequence with lead 0xc2..0xc5 and a valid trail byte:
                // decode inline and use the same delta table.
                // Other leads below 0xe3 fall through to the generic code below.
248
0
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
249
0
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
250
0
                    // U+0080..U+017F
251
0
                    ++srcIndex;
252
0
                    c = ((lead - 0xc0) << 6) | t;
253
0
                    int8_t d = latinToLower[c];
254
0
                    if (d == LatinCase::EXC) {
255
0
                        cpStart = srcIndex - 2;
256
0
                        break;
257
0
                    }
258
0
                    if (d == 0) { continue; }
259
0
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
260
0
                                                  sink, options, edits, errorCode);
261
0
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
262
0
                    if (edits != nullptr) {
263
0
                        edits->addReplace(2, 2);
264
0
                    }
265
0
                    prev = srcIndex;
266
0
                    continue;
267
0
                }
268
0
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
269
0
                    (srcIndex + 2) <= srcLimit &&
270
0
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
271
0
                // most of CJK: no case mappings
272
0
                srcIndex += 2;
273
0
                continue;
274
0
            }
275
0
            // Generic path: step back to the lead byte and decode the whole code point.
            cpStart = --srcIndex;
276
0
            U8_NEXT(src, srcIndex, srcLimit, c);
277
0
            if (c < 0) {
278
0
                // ill-formed UTF-8
                // (prev is not advanced, so these bytes stay part of the pending unchanged run)
279
0
                continue;
280
0
            }
281
0
            uint16_t props = UTRIE2_GET16(trie, c);
282
0
            if (UCASE_HAS_EXCEPTION(props)) { break; }
283
0
            int32_t delta;
284
0
            if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
285
0
                continue;
286
0
            }
287
0
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
288
0
                                          sink, options, edits, errorCode);
289
0
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
290
0
            prev = srcIndex;
291
0
        }
292
0
        if (c < 0) {
293
0
            break;
294
0
        }
295
0
        // slow path
        // c occupies [cpStart..srcIndex[ and needs a full (possibly context-sensitive) mapping.
296
0
        const UChar *s;
297
0
        if (caseLocale >= 0) {
298
0
            csc->cpStart = cpStart;
299
0
            csc->cpLimit = srcIndex;
300
0
            c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
301
0
        } else {
302
0
            c = ucase_toFullFolding(c, &s, options);
303
0
        }
304
0
        // A negative result leaves the code point in the pending unchanged run.
        if (c >= 0) {
305
0
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
306
0
                                          sink, options, edits, errorCode);
307
0
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
308
0
            prev = srcIndex;
309
0
        }
310
0
    }
311
0
    // Flush the trailing unchanged run.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
312
0
                                  sink, options, edits, errorCode);
313
0
}
314
315
/**
 * Uppercases the UTF-8 text [0..srcLength[ and appends the result to a sink.
 *
 * @param caseLocale case locale code; UCASE_LOC_TURKISH selects a different
 *                   Latin fast-path table
 * @param options    option bit set passed to the append helpers
 * @param src        the UTF-8 text
 * @param csc        case context handed to ucase_toFullUpper() on the slow path
 * @param srcLength  number of bytes in src
 * @param sink       output sink
 * @param edits      optional edits recorder, may be null
 * @param errorCode  in/out ICU error code; the loop stops once it is a failure
 */
void toUpper(int32_t caseLocale, uint32_t options,
316
             const uint8_t *src, UCaseContext *csc, int32_t srcLength,
317
0
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
318
0
    // Select the per-code-point delta table used by the U+0000..U+017F fast path.
    const int8_t *latinToUpper;
319
0
    if (caseLocale == UCASE_LOC_TURKISH) {
320
0
        latinToUpper = LatinCase::TO_UPPER_TR;
321
0
    } else {
322
0
        latinToUpper = LatinCase::TO_UPPER_NORMAL;
323
0
    }
324
0
    const UTrie2 *trie = ucase_getTrie();
325
0
    // prev is the start of the source text that has not been passed to the sink yet.
    int32_t prev = 0;
326
0
    int32_t srcIndex = 0;
327
0
    for (;;) {
328
0
        // fast path for simple cases
329
0
        int32_t cpStart;
330
0
        UChar32 c;
331
0
        for (;;) {
332
0
            if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
333
0
                // End of input or error: a negative c terminates the outer loop as well.
                c = U_SENTINEL;
334
0
                break;
335
0
            }
336
0
            uint8_t lead = src[srcIndex++];
337
0
            if (lead <= 0x7f) {
                // ASCII: d is the delta to add, 0 for "no change", or EXC for "use the slow path".
338
0
                int8_t d = latinToUpper[lead];
339
0
                if (d == LatinCase::EXC) {
340
0
                    cpStart = srcIndex - 1;
341
0
                    c = lead;
342
0
                    break;
343
0
                }
344
0
                if (d == 0) { continue; }
345
0
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
346
0
                                              sink, options, edits, errorCode);
347
0
                char ascii = (char)(lead + d);
348
0
                sink.Append(&ascii, 1);
349
0
                if (edits != nullptr) {
350
0
                    edits->addReplace(1, 1);
351
0
                }
352
0
                prev = srcIndex;
353
0
                continue;
354
0
            } else if (lead < 0xe3) {
355
0
                uint8_t t;
                // Two-byte sequence with lead 0xc2..0xc5 and a valid trail byte:
                // decode inline and use the same delta table.
                // Other leads below 0xe3 fall through to the generic code below.
356
0
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
357
0
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
358
0
                    // U+0080..U+017F
359
0
                    ++srcIndex;
360
0
                    c = ((lead - 0xc0) << 6) | t;
361
0
                    int8_t d = latinToUpper[c];
362
0
                    if (d == LatinCase::EXC) {
363
0
                        cpStart = srcIndex - 2;
364
0
                        break;
365
0
                    }
366
0
                    if (d == 0) { continue; }
367
0
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
368
0
                                                  sink, options, edits, errorCode);
369
0
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
370
0
                    if (edits != nullptr) {
371
0
                        edits->addReplace(2, 2);
372
0
                    }
373
0
                    prev = srcIndex;
374
0
                    continue;
375
0
                }
376
0
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
377
0
                    (srcIndex + 2) <= srcLength &&
378
0
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
379
0
                // most of CJK: no case mappings
380
0
                srcIndex += 2;
381
0
                continue;
382
0
            }
383
0
            // Generic path: step back to the lead byte and decode the whole code point.
            cpStart = --srcIndex;
384
0
            U8_NEXT(src, srcIndex, srcLength, c);
385
0
            if (c < 0) {
386
0
                // ill-formed UTF-8
                // (prev is not advanced, so these bytes stay part of the pending unchanged run)
387
0
                continue;
388
0
            }
389
0
            uint16_t props = UTRIE2_GET16(trie, c);
390
0
            if (UCASE_HAS_EXCEPTION(props)) { break; }
391
0
            int32_t delta;
392
0
            if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
393
0
                continue;
394
0
            }
395
0
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
396
0
                                          sink, options, edits, errorCode);
397
0
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
398
0
            prev = srcIndex;
399
0
        }
400
0
        if (c < 0) {
401
0
            break;
402
0
        }
403
0
        // slow path
        // c occupies [cpStart..srcIndex[ and needs a full (possibly context-sensitive) mapping.
404
0
        csc->cpStart = cpStart;
405
0
        csc->cpLimit = srcIndex;
406
0
        const UChar *s;
407
0
        c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
408
0
        // A negative result leaves the code point in the pending unchanged run.
        if (c >= 0) {
409
0
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
410
0
                                          sink, options, edits, errorCode);
411
0
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
412
0
            prev = srcIndex;
413
0
        }
414
0
    }
415
0
    // Flush the trailing unchanged run.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
416
0
                                  sink, options, edits, errorCode);
417
0
}
418
419
}  // namespace
420
421
#if !UCONFIG_NO_BREAK_ITERATION
422
423
/**
 * Titlecases the UTF-8 text [0..srcLength[ and appends the result to a sink.
 *
 * The text is processed in segments delimited by the break iterator. In each
 * segment the first suitable character is titlecased and the rest is either
 * lowercased or copied, depending on the options.
 *
 * @param caseLocale case locale code used for the title and lower mappings
 * @param options    titlecasing option bit set; validated by
 *                   ustrcase_checkTitleAdjustmentOptions()
 * @param iter       break iterator supplying segment boundaries via first()/next()
 * @param src        the UTF-8 text
 * @param srcLength  number of bytes in src
 * @param sink       output sink
 * @param edits      optional edits recorder, may be null
 * @param errorCode  in/out ICU error code
 */
U_CFUNC void U_CALLCONV
424
ucasemap_internalUTF8ToTitle(
425
        int32_t caseLocale, uint32_t options, BreakIterator *iter,
426
        const uint8_t *src, int32_t srcLength,
427
        ByteSink &sink, icu::Edits *edits,
428
0
        UErrorCode &errorCode) {
429
0
    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
430
0
        return;
431
0
    }
432
0
433
0
    /* set up local variables */
434
0
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
435
0
    csc.p=(void *)src;
436
0
    csc.limit=srcLength;
437
0
    int32_t prev=0;
438
0
    UBool isFirstIndex=TRUE;
439
0
440
0
    /* titlecasing loop */
441
0
    while(prev<srcLength) {
442
0
        /* find next index where to titlecase */
443
0
        int32_t index;
444
0
        if(isFirstIndex) {
445
0
            isFirstIndex=FALSE;
446
0
            index=iter->first();
447
0
        } else {
448
0
            index=iter->next();
449
0
        }
450
0
        // Clamp: no more boundaries, or a boundary past the text, ends the segment at srcLength.
        if(index==UBRK_DONE || index>srcLength) {
451
0
            index=srcLength;
452
0
        }
453
0
454
0
        /*
455
0
         * Segment [prev..index[ into 3 parts:
456
0
         * a) skipped characters (copy as-is) [prev..titleStart[
457
0
         * b) first letter (titlecase)              [titleStart..titleLimit[
458
0
         * c) subsequent characters (lowercase)                 [titleLimit..index[
459
0
         */
460
0
        if(prev<index) {
461
0
            /* find and copy skipped characters [prev..titleStart[ */
462
0
            int32_t titleStart=prev;
463
0
            int32_t titleLimit=prev;
464
0
            UChar32 c;
465
0
            U8_NEXT(src, titleLimit, index, c);
466
0
            if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
467
0
                // Adjust the titlecasing index to the next cased character,
468
0
                // or to the next letter/number/symbol/private use.
469
0
                // Stop with titleStart<titleLimit<=index
470
0
                // if there is a character to be titlecased,
471
0
                // or else stop with titleStart==titleLimit==index.
472
0
                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
473
0
                while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
474
0
                    titleStart=titleLimit;
475
0
                    if(titleLimit==index) {
476
0
                        break;
477
0
                    }
478
0
                    U8_NEXT(src, titleLimit, index, c);
479
0
                }
480
0
                if (prev < titleStart) {
481
0
                    if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
482
0
                                                       sink, options, edits, errorCode)) {
483
0
                        return;
484
0
                    }
485
0
                }
486
0
            }
487
0
488
0
            if(titleStart<titleLimit) {
489
0
                /* titlecase c which is from [titleStart..titleLimit[ */
490
0
                if(c>=0) {
491
0
                    csc.cpStart=titleStart;
492
0
                    csc.cpLimit=titleLimit;
493
0
                    const UChar *s;
494
0
                    c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
495
0
                    if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
496
0
                        return;
497
0
                    }
498
0
                } else {
499
0
                    // Malformed UTF-8.
500
0
                    if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
501
0
                                                       sink, options, edits, errorCode)) {
502
0
                        return;
503
0
                    }
504
0
                }
505
0
506
0
                /* Special case Dutch IJ titlecasing */
                // After an initial 'I' or 'i' (0x49/0x69), a following 'j' (0x6A) is written as 'J'
                // and an existing 'J' (0x4A) is copied; either way it is consumed by advancing titleLimit.
507
0
                if (titleStart+1 < index &&
508
0
                        caseLocale == UCASE_LOC_DUTCH &&
509
0
                        (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
510
0
                    if (src[titleStart+1] == 0x006A) {
511
0
                        ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
512
0
                        titleLimit++;
513
0
                    } else if (src[titleStart+1] == 0x004A) {
514
0
                        // Keep the capital J from getting lowercased.
515
0
                        if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
516
0
                                                           sink, options, edits, errorCode)) {
517
0
                            return;
518
0
                        }
519
0
                        titleLimit++;
520
0
                    }
521
0
                }
522
0
523
0
                /* lowercase [titleLimit..index[ */
524
0
                if(titleLimit<index) {
525
0
                    if((options&U_TITLECASE_NO_LOWERCASE)==0) {
526
0
                        /* Normal operation: Lowercase the rest of the word. */
527
0
                        toLower(caseLocale, options,
528
0
                                src, &csc, titleLimit, index,
529
0
                                sink, edits, errorCode);
530
0
                        if(U_FAILURE(errorCode)) {
531
0
                            return;
532
0
                        }
533
0
                    } else {
534
0
                        /* Optionally just copy the rest of the word unchanged. */
535
0
                        if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
536
0
                                                           sink, options, edits, errorCode)) {
537
0
                            return;
538
0
                        }
539
0
                    }
540
0
                }
541
0
            }
542
0
        }
543
0
544
0
        // The next segment starts where this one ended.
        prev=index;
545
0
    }
546
0
}
547
548
#endif
549
550
U_NAMESPACE_BEGIN
551
namespace GreekUpper {
552
553
0
/**
 * Checks whether the text from index i onward reaches a cased letter before
 * any character that is neither cased nor case-ignorable.
 *
 * @param s      the UTF-8 text
 * @param i      index at which to start looking
 * @param length number of bytes in s
 * @return TRUE if, after skipping case-ignorable characters, a cased letter
 *         follows; FALSE if an uncased, non-ignorable character follows or the
 *         text ends first
 */
UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
    while (i < length) {
        UChar32 c;
        U8_NEXT(s, i, length, c);
        const int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) == 0) {
            // First non-ignorable character decides: cased or not.
            return (UBool)(type != UCASE_NONE);
        }
        // Case-ignorable: keep scanning.
    }
    return FALSE;  // Reached the end without finding a cased letter.
}
568
569
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
//
// Greek-specific uppercasing of the UTF-8 text [0..srcLength[ into a sink.
// Each Greek letter is handled together with the combining diacritics that follow it.
// Code points without Greek letter data go through ucase_toFullUpper() with UCASE_LOC_GREEK.
//
// options:   option bit set (U_OMIT_UNCHANGED_TEXT is consulted here)
// src:       the UTF-8 text
// srcLength: number of bytes in src
// sink:      output sink
// edits:     optional edits recorder, may be null
// errorCode: in/out ICU error code, passed to the append helpers
570
void toUpper(uint32_t options,
571
             const uint8_t *src, int32_t srcLength,
572
             ByteSink &sink, Edits *edits,
573
0
             UErrorCode &errorCode) {
574
0
    // state carries flags about the preceding text (AFTER_CASED, AFTER_VOWEL_WITH_ACCENT)
    // from one iteration to the next.
    uint32_t state = 0;
575
0
    for (int32_t i = 0; i < srcLength;) {
576
0
        int32_t nextIndex = i;
577
0
        UChar32 c;
578
0
        U8_NEXT(src, nextIndex, srcLength, c);
579
0
        uint32_t nextState = 0;
580
0
        int32_t type = ucase_getTypeOrIgnorable(c);
581
0
        if ((type & UCASE_IGNORABLE) != 0) {
582
0
            // c is case-ignorable
583
0
            nextState |= (state & AFTER_CASED);
584
0
        } else if (type != UCASE_NONE) {
585
0
            // c is cased
586
0
            nextState |= AFTER_CASED;
587
0
        }
588
0
        uint32_t data = getLetterData(c);
589
0
        if (data > 0) {
590
0
            uint32_t upper = data & UPPER_MASK;
591
0
            // Add a dialytika to this iota or ypsilon vowel
592
0
            // if we removed a tonos from the previous vowel,
593
0
            // and that previous vowel did not also have (or gain) a dialytika.
594
0
            // Adding one only to the final vowel in a longer sequence
595
0
            // (which does not occur in normal writing) would require lookahead.
596
0
            // Set the same flag as for preserving an existing dialytika.
597
0
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
598
0
                    (upper == 0x399 || upper == 0x3A5)) {
599
0
                data |= HAS_DIALYTIKA;
600
0
            }
601
0
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
602
0
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
603
0
                numYpogegrammeni = 1;
604
0
            }
605
0
            // Skip combining diacritics after this Greek letter.
            // nextNextIndex looks one code point ahead; nextIndex only advances
            // once that code point is confirmed to be a Greek diacritic.
606
0
            int32_t nextNextIndex = nextIndex;
607
0
            while (nextIndex < srcLength) {
608
0
                UChar32 c2;
609
0
                U8_NEXT(src, nextNextIndex, srcLength, c2);
610
0
                uint32_t diacriticData = getDiacriticData(c2);
611
0
                if (diacriticData != 0) {
612
0
                    data |= diacriticData;
613
0
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
614
0
                        ++numYpogegrammeni;
615
0
                    }
616
0
                    nextIndex = nextNextIndex;
617
0
                } else {
618
0
                    break;  // not a Greek diacritic
619
0
                }
620
0
            }
621
0
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
622
0
                nextState |= AFTER_VOWEL_WITH_ACCENT;
623
0
            }
624
0
            // Map according to Greek rules.
625
0
            UBool addTonos = FALSE;
626
0
            if (upper == 0x397 &&
627
0
                    (data & HAS_ACCENT) != 0 &&
628
0
                    numYpogegrammeni == 0 &&
629
0
                    (state & AFTER_CASED) == 0 &&
630
0
                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
631
0
                // Keep disjunctive "or" with (only) a tonos.
632
0
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                // NOTE(review): nextIndex was already advanced past c by U8_NEXT above, so
                // i == nextIndex looks like it can never be true here and addTonos is always
                // chosen - confirm whether the precomposed-form branch is meant to be reachable.
633
0
                if (i == nextIndex) {
634
0
                    upper = 0x389;  // Preserve the precomposed form.
635
0
                } else {
636
0
                    addTonos = TRUE;
637
0
                }
638
0
            } else if ((data & HAS_DIALYTIKA) != 0) {
639
0
                // Preserve a vowel with dialytika in precomposed form if it exists.
640
0
                if (upper == 0x399) {
641
0
                    upper = 0x3AA;
642
0
                    data &= ~HAS_EITHER_DIALYTIKA;
643
0
                } else if (upper == 0x3A5) {
644
0
                    upper = 0x3AB;
645
0
                    data &= ~HAS_EITHER_DIALYTIKA;
646
0
                }
647
0
            }
648
0
649
0
            UBool change;
650
0
            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
651
0
                change = TRUE;  // common, simple usage
652
0
            } else {
653
0
                // Find out first whether we are changing the text.
                // The output is compared with the source bytes piece by piece:
                // the letter, then an optional dialytika, then an optional tonos.
654
0
                U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
655
0
                change = (i + 2) > nextIndex ||
656
0
                        src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
657
0
                        numYpogegrammeni > 0;
658
0
                int32_t i2 = i + 2;
659
0
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
660
0
                    change |= (i2 + 2) > nextIndex ||
661
0
                            src[i2] != (uint8_t)u8"\u0308"[0] ||
662
0
                            src[i2 + 1] != (uint8_t)u8"\u0308"[1];
663
0
                    i2 += 2;
664
0
                }
665
0
                if (addTonos) {
666
0
                    change |= (i2 + 2) > nextIndex ||
667
0
                            src[i2] != (uint8_t)u8"\u0301"[0] ||
668
0
                            src[i2 + 1] != (uint8_t)u8"\u0301"[1];
669
0
                    i2 += 2;
670
0
                }
671
0
                int32_t oldLength = nextIndex - i;
672
0
                int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
673
0
                change |= oldLength != newLength;
674
0
                if (change) {
675
0
                    if (edits != NULL) {
676
0
                        edits->addReplace(oldLength, newLength);
677
0
                    }
678
0
                } else {
679
0
                    if (edits != NULL) {
680
0
                        edits->addUnchanged(oldLength);
681
0
                    }
682
0
                    // Write unchanged text?
683
0
                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
684
0
                }
685
0
            }
686
0
687
0
            // From here on, change means "write the output bytes for this letter".
            if (change) {
688
0
                ByteSinkUtil::appendTwoBytes(upper, sink);
689
0
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
690
0
                    sink.Append(u8"\u0308", 2);  // restore or add a dialytika
691
0
                }
692
0
                if (addTonos) {
693
0
                    sink.Append(u8"\u0301", 2);
694
0
                }
695
0
                while (numYpogegrammeni > 0) {
696
0
                    sink.Append(u8"\u0399", 2);
697
0
                    --numYpogegrammeni;
698
0
                }
699
0
            }
700
0
        } else if(c>=0) {
            // No Greek letter data for c: use the generic full uppercase mapping without context.
701
0
            const UChar *s;
702
0
            c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
703
0
            if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
704
0
                return;
705
0
            }
706
0
        } else {
707
0
            // Malformed UTF-8.
708
0
            if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
709
0
                                               sink, options, edits, errorCode)) {
710
0
                return;
711
0
            }
712
0
        }
713
0
        i = nextIndex;
714
0
        state = nextState;
715
0
    }
716
0
}
717
718
}  // namespace GreekUpper
719
U_NAMESPACE_END
720
721
static void U_CALLCONV
722
ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
723
                             const uint8_t *src, int32_t srcLength,
724
                             icu::ByteSink &sink, icu::Edits *edits,
725
0
                             UErrorCode &errorCode) {
726
0
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
727
0
    csc.p=(void *)src;
728
0
    csc.limit=srcLength;
729
0
    toLower(
730
0
        caseLocale, options,
731
0
        src, &csc, 0, srcLength,
732
0
        sink, edits, errorCode);
733
0
}
734
735
static void U_CALLCONV
736
ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
737
                             const uint8_t *src, int32_t srcLength,
738
                             icu::ByteSink &sink, icu::Edits *edits,
739
0
                             UErrorCode &errorCode) {
740
0
    if (caseLocale == UCASE_LOC_GREEK) {
741
0
        GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
742
0
    } else {
743
0
        UCaseContext csc=UCASECONTEXT_INITIALIZER;
744
0
        csc.p=(void *)src;
745
0
        csc.limit=srcLength;
746
0
        toUpper(
747
0
            caseLocale, options,
748
0
            src, &csc, srcLength,
749
0
            sink, edits, errorCode);
750
0
    }
751
0
}
752
753
static void U_CALLCONV
754
ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
755
                          const uint8_t *src, int32_t srcLength,
756
                          icu::ByteSink &sink, icu::Edits *edits,
757
0
                          UErrorCode &errorCode) {
758
0
    toLower(
759
0
        -1, options,
760
0
        src, nullptr, 0, srcLength,
761
0
        sink, edits, errorCode);
762
0
}
763
764
void
765
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
766
                 const char *src, int32_t srcLength,
767
                 UTF8CaseMapper *stringCaseMapper,
768
                 icu::ByteSink &sink, icu::Edits *edits,
769
0
                 UErrorCode &errorCode) {
770
0
    /* check argument values */
771
0
    if (U_FAILURE(errorCode)) {
772
0
        return;
773
0
    }
774
0
    if ((src == nullptr && srcLength != 0) || srcLength < -1) {
775
0
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
776
0
        return;
777
0
    }
778
0
779
0
    // Get the string length.
780
0
    if (srcLength == -1) {
781
0
        srcLength = (int32_t)uprv_strlen((const char *)src);
782
0
    }
783
0
784
0
    if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
785
0
        edits->reset();
786
0
    }
787
0
    stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
788
0
                     (const uint8_t *)src, srcLength, sink, edits, errorCode);
789
0
    sink.Flush();
790
0
    if (U_SUCCESS(errorCode)) {
791
0
        if (edits != nullptr) {
792
0
            edits->copyErrorTo(errorCode);
793
0
        }
794
0
    }
795
0
}
796
797
int32_t
798
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
799
                 char *dest, int32_t destCapacity,
800
                 const char *src, int32_t srcLength,
801
                 UTF8CaseMapper *stringCaseMapper,
802
                 icu::Edits *edits,
803
0
                 UErrorCode &errorCode) {
804
0
    /* check argument values */
805
0
    if(U_FAILURE(errorCode)) {
806
0
        return 0;
807
0
    }
808
0
    if( destCapacity<0 ||
809
0
        (dest==NULL && destCapacity>0) ||
810
0
        (src==NULL && srcLength!=0) || srcLength<-1
811
0
    ) {
812
0
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
813
0
        return 0;
814
0
    }
815
0
816
0
    /* get the string length */
817
0
    if(srcLength==-1) {
818
0
        srcLength=(int32_t)uprv_strlen((const char *)src);
819
0
    }
820
0
821
0
    /* check for overlapping source and destination */
822
0
    if( dest!=NULL &&
823
0
        ((src>=dest && src<(dest+destCapacity)) ||
824
0
         (dest>=src && dest<(src+srcLength)))
825
0
    ) {
826
0
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
827
0
        return 0;
828
0
    }
829
0
830
0
    CheckedArrayByteSink sink(dest, destCapacity);
831
0
    if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
832
0
        edits->reset();
833
0
    }
834
0
    stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
835
0
                     (const uint8_t *)src, srcLength, sink, edits, errorCode);
836
0
    sink.Flush();
837
0
    if (U_SUCCESS(errorCode)) {
838
0
        if (sink.Overflowed()) {
839
0
            errorCode = U_BUFFER_OVERFLOW_ERROR;
840
0
        } else if (edits != nullptr) {
841
0
            edits->copyErrorTo(errorCode);
842
0
        }
843
0
    }
844
0
    return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
845
0
}
846
847
/* public API functions */
848
849
/*
 * C API: maps src through ucasemap_internalUTF8ToLower into dest, using the
 * case locale and options stored in csm. No Edits object is supplied.
 * Returns the result of the buffer-based ucasemap_mapUTF8().
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    UErrorCode &status = *pErrorCode;
    return ucasemap_mapUTF8(
        csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity, src, srcLength,
        ucasemap_internalUTF8ToLower, NULL, status);
}
860
861
/*
 * C API: maps src through ucasemap_internalUTF8ToUpper into dest, using the
 * case locale and options stored in csm. No Edits object is supplied.
 * Returns the result of the buffer-based ucasemap_mapUTF8().
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    UErrorCode &status = *pErrorCode;
    return ucasemap_mapUTF8(
        csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity, src, srcLength,
        ucasemap_internalUTF8ToUpper, NULL, status);
}
872
873
/*
 * C API: maps src through ucasemap_internalUTF8Fold into dest.
 * Only the options are taken from csm; the case locale passed on is always
 * UCASE_LOC_ROOT. No Edits object is supplied.
 * Returns the result of the buffer-based ucasemap_mapUTF8().
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap *csm,
                      char *dest, int32_t destCapacity,
                      const char *src, int32_t srcLength,
                      UErrorCode *pErrorCode) {
    UErrorCode &status = *pErrorCode;
    return ucasemap_mapUTF8(
        UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
        dest, destCapacity, src, srcLength,
        ucasemap_internalUTF8Fold, NULL, status);
}
884
885
U_NAMESPACE_BEGIN
886
887
void CaseMap::utf8ToLower(
888
        const char *locale, uint32_t options,
889
        StringPiece src, ByteSink &sink, Edits *edits,
890
0
        UErrorCode &errorCode) {
891
0
    ucasemap_mapUTF8(
892
0
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
893
0
        src.data(), src.length(),
894
0
        ucasemap_internalUTF8ToLower, sink, edits, errorCode);
895
0
}
896
897
void CaseMap::utf8ToUpper(
898
        const char *locale, uint32_t options,
899
        StringPiece src, ByteSink &sink, Edits *edits,
900
0
        UErrorCode &errorCode) {
901
0
    ucasemap_mapUTF8(
902
0
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
903
0
        src.data(), src.length(),
904
0
        ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
905
0
}
906
907
void CaseMap::utf8Fold(
908
        uint32_t options,
909
        StringPiece src, ByteSink &sink, Edits *edits,
910
0
        UErrorCode &errorCode) {
911
0
    ucasemap_mapUTF8(
912
0
        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
913
0
        src.data(), src.length(),
914
0
        ucasemap_internalUTF8Fold, sink, edits, errorCode);
915
0
}
916
917
int32_t CaseMap::utf8ToLower(
918
        const char *locale, uint32_t options,
919
        const char *src, int32_t srcLength,
920
        char *dest, int32_t destCapacity, Edits *edits,
921
0
        UErrorCode &errorCode) {
922
0
    return ucasemap_mapUTF8(
923
0
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
924
0
        dest, destCapacity,
925
0
        src, srcLength,
926
0
        ucasemap_internalUTF8ToLower, edits, errorCode);
927
0
}
928
929
int32_t CaseMap::utf8ToUpper(
930
        const char *locale, uint32_t options,
931
        const char *src, int32_t srcLength,
932
        char *dest, int32_t destCapacity, Edits *edits,
933
0
        UErrorCode &errorCode) {
934
0
    return ucasemap_mapUTF8(
935
0
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
936
0
        dest, destCapacity,
937
0
        src, srcLength,
938
0
        ucasemap_internalUTF8ToUpper, edits, errorCode);
939
0
}
940
941
int32_t CaseMap::utf8Fold(
942
        uint32_t options,
943
        const char *src, int32_t srcLength,
944
        char *dest, int32_t destCapacity, Edits *edits,
945
0
        UErrorCode &errorCode) {
946
0
    return ucasemap_mapUTF8(
947
0
        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
948
0
        dest, destCapacity,
949
0
        src, srcLength,
950
0
        ucasemap_internalUTF8Fold, edits, errorCode);
951
0
}
952
953
U_NAMESPACE_END