Coverage Report

Created: 2024-04-24 06:23

/src/icu/source/common/ucasemap.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2005-2016, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  ucasemap.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2005may06
16
*   created by: Markus W. Scherer
17
*
18
*   Case mapping service object and functions using it.
19
*/
20
21
#include "unicode/utypes.h"
22
#include "unicode/brkiter.h"
23
#include "unicode/bytestream.h"
24
#include "unicode/casemap.h"
25
#include "unicode/edits.h"
26
#include "unicode/stringoptions.h"
27
#include "unicode/stringpiece.h"
28
#include "unicode/ubrk.h"
29
#include "unicode/uloc.h"
30
#include "unicode/ustring.h"
31
#include "unicode/ucasemap.h"
32
#if !UCONFIG_NO_BREAK_ITERATION
33
#include "unicode/utext.h"
34
#endif
35
#include "unicode/utf.h"
36
#include "unicode/utf8.h"
37
#include "unicode/utf16.h"
38
#include "bytesinkutil.h"
39
#include "cmemory.h"
40
#include "cstring.h"
41
#include "uassert.h"
42
#include "ucase.h"
43
#include "ucasemap_imp.h"
44
#include "ustr_imp.h"
45
46
U_NAMESPACE_USE
47
48
/* UCaseMap service object -------------------------------------------------- */
49
50
/**
 * Builds a case-mapping service object.
 *
 * The option bits are taken from opts, the case locale starts out as
 * UCASE_LOC_UNKNOWN and, when break iteration is compiled in, the break
 * iterator pointer starts out null. The locale is then resolved through
 * ucasemap_setLocale(), which reports problems via pErrorCode.
 */
UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode)
#if !UCONFIG_NO_BREAK_ITERATION
        : iter(nullptr),
          caseLocale(UCASE_LOC_UNKNOWN),
          options(opts)
#else
        : caseLocale(UCASE_LOC_UNKNOWN),
          options(opts)
#endif
{
    ucasemap_setLocale(this, localeID, pErrorCode);
}
57
58
0
/**
 * Destroys the service object.
 * When break iteration is compiled in, this deletes the break iterator
 * held in iter (deleting a null pointer is a no-op).
 */
UCaseMap::~UCaseMap()
{
#if !UCONFIG_NO_BREAK_ITERATION
    delete iter;
#endif
}
63
64
/**
 * Allocates a UCaseMap for the given locale and options.
 *
 * @param locale     locale ID handed to the UCaseMap constructor
 * @param options    option bits stored in the new object
 * @param pErrorCode in/out error code; nothing is done if it already
 *                   indicates a failure
 * @return the new object, or nullptr if *pErrorCode indicated a failure on
 *         entry, if allocation failed (U_MEMORY_ALLOCATION_ERROR is set),
 *         or if the constructor reported a failure
 */
U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return nullptr;
    }

    UCaseMap *newMap = new UCaseMap(locale, options, pErrorCode);
    if (newMap == nullptr) {
        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
        return nullptr;
    }
    if (U_FAILURE(*pErrorCode)) {
        // Construction reported an error: do not hand out a half-set-up object.
        delete newMap;
        return nullptr;
    }
    return newMap;
}
79
80
/**
 * Destroys a UCaseMap by deleting it.
 * Passing a null pointer is harmless because deleting nullptr does nothing.
 */
U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap *csm)
{
    delete csm;
}
84
85
/**
 * Returns the locale ID string stored in csm.
 * The pointer refers to storage inside csm; csm must not be null.
 */
U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap *csm)
{
    const char *storedLocale = csm->locale;
    return storedLocale;
}
89
90
/**
 * Returns the option bits stored in csm.
 * csm must not be null.
 */
U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap *csm)
{
    const uint32_t storedOptions = csm->options;
    return storedOptions;
}
94
95
/**
 * Sets the locale of csm and derives the matching case locale.
 *
 * - An empty string selects the root locale directly.
 * - Otherwise the ID is run through uloc_getName(); if the result does not
 *   fit into csm->locale (with its terminator), only the language code is
 *   stored, since that is all that case mapping really needs.
 * - If even that fails, csm is reset to the root locale and *pErrorCode
 *   carries the failure.
 *
 * Nothing is done if *pErrorCode already indicates a failure on entry.
 */
U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return;
    }
    if (locale != nullptr && locale[0] == 0) {
        // Empty locale ID: root.
        csm->locale[0] = 0;
        csm->caseLocale = UCASE_LOC_ROOT;
        return;
    }

    const int32_t capacity = (int32_t)sizeof(csm->locale);
    int32_t length = uloc_getName(locale, csm->locale, capacity, pErrorCode);
    if (*pErrorCode == U_BUFFER_OVERFLOW_ERROR || length == capacity) {
        // Full name does not fit together with its NUL terminator:
        // retry with just the language code.
        *pErrorCode = U_ZERO_ERROR;
        length = uloc_getLanguage(locale, csm->locale, capacity, pErrorCode);
    }
    if (length == capacity) {
        // Buffer is full with no room for the terminator.
        *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
    }

    if (U_FAILURE(*pErrorCode)) {
        csm->locale[0] = 0;
        csm->caseLocale = UCASE_LOC_ROOT;
    } else {
        csm->caseLocale = ucase_getCaseLocale(csm->locale);
    }
}
122
123
/**
 * Replaces the option bits stored in csm.
 * Leaves csm untouched if *pErrorCode already indicates a failure.
 */
U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
    if (!U_FAILURE(*pErrorCode)) {
        csm->options = options;
    }
}
130
131
/* UTF-8 string case mappings ----------------------------------------------- */
132
133
/* TODO(markus): Move to a new, separate utf8case.cpp file. */
134
135
namespace {
136
137
/* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
138
inline UBool
139
appendResult(int32_t cpLength, int32_t result, const UChar *s,
140
0
             ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
141
0
    U_ASSERT(U_SUCCESS(errorCode));
142
143
    /* decode the result */
144
0
    if(result<0) {
145
        /* (not) original code point */
146
0
        if(edits!=NULL) {
147
0
            edits->addUnchanged(cpLength);
148
0
        }
149
0
        if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
150
0
            ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
151
0
        }
152
0
    } else {
153
0
        if(result<=UCASE_MAX_STRING_LENGTH) {
154
            // string: "result" is the UTF-16 length
155
0
            return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
156
0
        } else {
157
0
            ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
158
0
        }
159
0
    }
160
0
    return TRUE;
161
0
}
162
163
// See unicode/utf8.h U8_APPEND_UNSAFE().
164
0
inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
165
0
inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
166
167
/**
 * Context iterator callback over UTF-8 text described by a UCaseContext.
 *
 * @param context pointer to the UCaseContext; its p member is read as UTF-8 bytes
 * @param dir     <0: restart at cpStart and iterate backward;
 *                >0: restart at cpLimit and iterate forward;
 *                 0: keep going in the direction remembered in the context
 * @return the next code point in that direction (as produced by
 *         U8_PREV/U8_NEXT), or U_SENTINEL once start/limit has been reached
 */
UChar32 U_CALLCONV
utf8_caseContextIterator(void *context, int8_t dir) {
    UCaseContext *ctx = static_cast<UCaseContext *>(context);

    if (dir == 0) {
        /* continue current iteration direction */
        dir = ctx->dir;
    } else {
        /* reset for backward (dir<0) or forward (dir>0) iteration */
        ctx->index = (dir < 0) ? ctx->cpStart : ctx->cpLimit;
        ctx->dir = dir;
    }

    const uint8_t *text = (const uint8_t *)ctx->p;
    UChar32 c;
    if (dir < 0) {
        if (ctx->start < ctx->index) {
            U8_PREV(text, ctx->start, ctx->index, c);
            return c;
        }
    } else if (ctx->index < ctx->limit) {
        U8_NEXT(text, ctx->index, ctx->limit, c);
        return c;
    }
    return U_SENTINEL;
}
198
199
/**
200
 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
201
 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
202
 */
203
void toLower(int32_t caseLocale, uint32_t options,
204
             const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
205
0
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
206
0
    const int8_t *latinToLower;
207
0
    if (caseLocale == UCASE_LOC_ROOT ||
208
0
            (caseLocale >= 0 ?
209
0
                !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
210
0
                (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
211
0
        latinToLower = LatinCase::TO_LOWER_NORMAL;
212
0
    } else {
213
0
        latinToLower = LatinCase::TO_LOWER_TR_LT;
214
0
    }
215
0
    const UTrie2 *trie = ucase_getTrie();
216
0
    int32_t prev = srcStart;
217
0
    int32_t srcIndex = srcStart;
218
0
    for (;;) {
219
        // fast path for simple cases
220
0
        int32_t cpStart;
221
0
        UChar32 c;
222
0
        for (;;) {
223
0
            if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
224
0
                c = U_SENTINEL;
225
0
                break;
226
0
            }
227
0
            uint8_t lead = src[srcIndex++];
228
0
            if (lead <= 0x7f) {
229
0
                int8_t d = latinToLower[lead];
230
0
                if (d == LatinCase::EXC) {
231
0
                    cpStart = srcIndex - 1;
232
0
                    c = lead;
233
0
                    break;
234
0
                }
235
0
                if (d == 0) { continue; }
236
0
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
237
0
                                              sink, options, edits, errorCode);
238
0
                char ascii = (char)(lead + d);
239
0
                sink.Append(&ascii, 1);
240
0
                if (edits != nullptr) {
241
0
                    edits->addReplace(1, 1);
242
0
                }
243
0
                prev = srcIndex;
244
0
                continue;
245
0
            } else if (lead < 0xe3) {
246
0
                uint8_t t;
247
0
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
248
0
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
249
                    // U+0080..U+017F
250
0
                    ++srcIndex;
251
0
                    c = ((lead - 0xc0) << 6) | t;
252
0
                    int8_t d = latinToLower[c];
253
0
                    if (d == LatinCase::EXC) {
254
0
                        cpStart = srcIndex - 2;
255
0
                        break;
256
0
                    }
257
0
                    if (d == 0) { continue; }
258
0
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
259
0
                                                  sink, options, edits, errorCode);
260
0
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
261
0
                    if (edits != nullptr) {
262
0
                        edits->addReplace(2, 2);
263
0
                    }
264
0
                    prev = srcIndex;
265
0
                    continue;
266
0
                }
267
0
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
268
0
                    (srcIndex + 2) <= srcLimit &&
269
0
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
270
                // most of CJK: no case mappings
271
0
                srcIndex += 2;
272
0
                continue;
273
0
            }
274
0
            cpStart = --srcIndex;
275
0
            U8_NEXT(src, srcIndex, srcLimit, c);
276
0
            if (c < 0) {
277
                // ill-formed UTF-8
278
0
                continue;
279
0
            }
280
0
            uint16_t props = UTRIE2_GET16(trie, c);
281
0
            if (UCASE_HAS_EXCEPTION(props)) { break; }
282
0
            int32_t delta;
283
0
            if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
284
0
                continue;
285
0
            }
286
0
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
287
0
                                          sink, options, edits, errorCode);
288
0
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
289
0
            prev = srcIndex;
290
0
        }
291
0
        if (c < 0) {
292
0
            break;
293
0
        }
294
        // slow path
295
0
        const UChar *s;
296
0
        if (caseLocale >= 0) {
297
0
            csc->cpStart = cpStart;
298
0
            csc->cpLimit = srcIndex;
299
0
            c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
300
0
        } else {
301
0
            c = ucase_toFullFolding(c, &s, options);
302
0
        }
303
0
        if (c >= 0) {
304
0
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
305
0
                                          sink, options, edits, errorCode);
306
0
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
307
0
            prev = srcIndex;
308
0
        }
309
0
    }
310
0
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
311
0
                                  sink, options, edits, errorCode);
312
0
}
313
314
void toUpper(int32_t caseLocale, uint32_t options,
315
             const uint8_t *src, UCaseContext *csc, int32_t srcLength,
316
19.1k
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
317
19.1k
    const int8_t *latinToUpper;
318
19.1k
    if (caseLocale == UCASE_LOC_TURKISH) {
319
0
        latinToUpper = LatinCase::TO_UPPER_TR;
320
19.1k
    } else {
321
19.1k
        latinToUpper = LatinCase::TO_UPPER_NORMAL;
322
19.1k
    }
323
19.1k
    const UTrie2 *trie = ucase_getTrie();
324
19.1k
    int32_t prev = 0;
325
19.1k
    int32_t srcIndex = 0;
326
19.1k
    for (;;) {
327
        // fast path for simple cases
328
19.1k
        int32_t cpStart;
329
19.1k
        UChar32 c;
330
146k
        for (;;) {
331
146k
            if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
332
19.1k
                c = U_SENTINEL;
333
19.1k
                break;
334
19.1k
            }
335
127k
            uint8_t lead = src[srcIndex++];
336
127k
            if (lead <= 0x7f) {
337
127k
                int8_t d = latinToUpper[lead];
338
127k
                if (d == LatinCase::EXC) {
339
0
                    cpStart = srcIndex - 1;
340
0
                    c = lead;
341
0
                    break;
342
0
                }
343
127k
                if (d == 0) { continue; }
344
107k
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
345
107k
                                              sink, options, edits, errorCode);
346
107k
                char ascii = (char)(lead + d);
347
107k
                sink.Append(&ascii, 1);
348
107k
                if (edits != nullptr) {
349
0
                    edits->addReplace(1, 1);
350
0
                }
351
107k
                prev = srcIndex;
352
107k
                continue;
353
127k
            } else if (lead < 0xe3) {
354
0
                uint8_t t;
355
0
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
356
0
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
357
                    // U+0080..U+017F
358
0
                    ++srcIndex;
359
0
                    c = ((lead - 0xc0) << 6) | t;
360
0
                    int8_t d = latinToUpper[c];
361
0
                    if (d == LatinCase::EXC) {
362
0
                        cpStart = srcIndex - 2;
363
0
                        break;
364
0
                    }
365
0
                    if (d == 0) { continue; }
366
0
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
367
0
                                                  sink, options, edits, errorCode);
368
0
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
369
0
                    if (edits != nullptr) {
370
0
                        edits->addReplace(2, 2);
371
0
                    }
372
0
                    prev = srcIndex;
373
0
                    continue;
374
0
                }
375
0
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
376
0
                    (srcIndex + 2) <= srcLength &&
377
0
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
378
                // most of CJK: no case mappings
379
0
                srcIndex += 2;
380
0
                continue;
381
0
            }
382
0
            cpStart = --srcIndex;
383
0
            U8_NEXT(src, srcIndex, srcLength, c);
384
0
            if (c < 0) {
385
                // ill-formed UTF-8
386
0
                continue;
387
0
            }
388
0
            uint16_t props = UTRIE2_GET16(trie, c);
389
0
            if (UCASE_HAS_EXCEPTION(props)) { break; }
390
0
            int32_t delta;
391
0
            if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
392
0
                continue;
393
0
            }
394
0
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
395
0
                                          sink, options, edits, errorCode);
396
0
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
397
0
            prev = srcIndex;
398
0
        }
399
19.1k
        if (c < 0) {
400
19.1k
            break;
401
19.1k
        }
402
        // slow path
403
0
        csc->cpStart = cpStart;
404
0
        csc->cpLimit = srcIndex;
405
0
        const UChar *s;
406
0
        c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
407
0
        if (c >= 0) {
408
0
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
409
0
                                          sink, options, edits, errorCode);
410
0
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
411
0
            prev = srcIndex;
412
0
        }
413
0
    }
414
19.1k
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
415
19.1k
                                  sink, options, edits, errorCode);
416
19.1k
}
417
418
}  // namespace
419
420
#if !UCONFIG_NO_BREAK_ITERATION
421
422
namespace {
423
424
// The two UTF-8 bytes of U+0301 (combining acute accent), taken from the
// encoded string literal so that the values are not hard-coded.
constexpr uint8_t ACUTE_BYTE0 = static_cast<uint8_t>(u8"\u0301"[0]);
constexpr uint8_t ACUTE_BYTE1 = static_cast<uint8_t>(u8"\u0301"[1]);
427
428
/**
429
 * Input: c is a letter I with or without acute accent.
430
 * start is the index in src after c, and is less than segmentLimit.
431
 * If a plain i/I is followed by a plain j/J,
432
 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
433
 * then we output accordingly.
434
 *
435
 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
436
 */
437
int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
438
0
                          ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
439
0
    U_ASSERT(start < segmentLimit);
440
441
0
    int32_t index = start;
442
0
    bool withAcute = false;
443
444
    // If the conditions are met, then the following variables tell us what to output.
445
0
    int32_t unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
446
0
    bool doTitleJ = false;  // true if the j needs to be titlecased
447
0
    int32_t unchanged2 = 0;  // after the j (0 or 1)
448
449
    // next character after the first letter
450
0
    UChar32 c2;
451
0
    c2 = src[index++];
452
453
    // Is the first letter an i/I with accent?
454
0
    if (c == u'I') {
455
0
        if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
456
0
            withAcute = true;
457
0
            unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
458
0
            if (index == segmentLimit) { return start; }
459
0
            c2 = src[index++];
460
0
        }
461
0
    } else {  // Í
462
0
        withAcute = true;
463
0
    }
464
465
    // Is the next character a j/J?
466
0
    if (c2 == u'j') {
467
0
        doTitleJ = true;
468
0
    } else if (c2 == u'J') {
469
0
        ++unchanged1;
470
0
    } else {
471
0
        return start;
472
0
    }
473
474
    // A plain i/I must be followed by a plain j/J.
475
    // An i/I with acute must be followed by a j/J with acute.
476
0
    if (withAcute) {
477
0
        if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
478
0
            return start;
479
0
        }
480
0
        if (doTitleJ) {
481
0
            unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
482
0
        } else {
483
0
            unchanged1 = unchanged1 + 2;    // ACUTE is 2 code units in UTF-8
484
0
        }
485
0
    }
486
487
    // There must not be another combining mark.
488
0
    if (index < segmentLimit) {
489
0
        int32_t cp;
490
0
        int32_t i = index;
491
0
        U8_NEXT(src, i, segmentLimit, cp);
492
0
        uint32_t typeMask = U_GET_GC_MASK(cp);
493
0
        if ((typeMask & U_GC_M_MASK) != 0) {
494
0
            return start;
495
0
        }
496
0
    }
497
498
    // Output the rest of the Dutch IJ.
499
0
    ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
500
0
    start += unchanged1;
501
0
    if (doTitleJ) {
502
0
        ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
503
0
        ++start;
504
0
    }
505
0
    ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
506
507
0
    U_ASSERT(start + unchanged2 == index);
508
0
    return index;
509
0
}
510
511
}  // namespace
512
513
/**
 * Titlecases UTF-8 text into sink.
 *
 * iter supplies the segment boundaries (first(), then next()). Within each
 * segment the first suitable character is titlecased and the remainder is
 * lowercased, or copied as-is with U_TITLECASE_NO_LOWERCASE.
 * For UCASE_LOC_DUTCH an initial "IJ" digraph is titlecased as a unit.
 *
 * Returns early if the title adjustment options are rejected by
 * ustrcase_checkTitleAdjustmentOptions() or if writing fails.
 */
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(
        int32_t caseLocale, uint32_t options, BreakIterator *iter,
        const uint8_t *src, int32_t srcLength,
        ByteSink &sink, icu::Edits *edits,
        UErrorCode &errorCode) {
    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
        return;
    }

    /* context for the context-sensitive mapping functions */
    UCaseContext csc = UCASECONTEXT_INITIALIZER;
    csc.p = (void *)src;
    csc.limit = srcLength;

    int32_t segStart = 0;
    UBool firstSegment = TRUE;

    /* one iteration per segment [segStart..segLimit[ */
    while (segStart < srcLength) {
        /* find the end of this segment */
        int32_t segLimit;
        if (firstSegment) {
            firstSegment = FALSE;
            segLimit = iter->first();
        } else {
            segLimit = iter->next();
        }
        if (segLimit == UBRK_DONE || segLimit > srcLength) {
            segLimit = srcLength;
        }

        /*
         * The segment is split into 3 parts:
         * a) skipped characters (copy as-is)     [segStart..titleStart[
         * b) first letter (titlecase)            [titleStart..titleLimit[
         * c) subsequent characters (lowercase)   [titleLimit..segLimit[
         */
        if (segStart < segLimit) {
            int32_t titleStart = segStart;
            int32_t titleLimit = segStart;
            UChar32 c;
            U8_NEXT(src, titleLimit, segLimit, c);

            if ((options & U_TITLECASE_NO_BREAK_ADJUSTMENT) == 0) {
                // Move forward to the next cased character, or to the next
                // letter/number/symbol/private use character.
                // Ends with titleStart<titleLimit<=segLimit if something is
                // to be titlecased, else with titleStart==titleLimit==segLimit.
                const UBool toCased = (options & U_TITLECASE_ADJUST_TO_CASED) != 0;
                while (toCased ? UCASE_NONE == ucase_getType(c) : !ustrcase_isLNS(c)) {
                    titleStart = titleLimit;
                    if (titleLimit == segLimit) {
                        break;
                    }
                    U8_NEXT(src, titleLimit, segLimit, c);
                }
                /* a) copy the skipped characters */
                if (segStart < titleStart &&
                        !ByteSinkUtil::appendUnchanged(src + segStart, titleStart - segStart,
                                                       sink, options, edits, errorCode)) {
                    return;
                }
            }

            if (titleStart < titleLimit) {
                /* b) titlecase c which is from [titleStart..titleLimit[ */
                if (c < 0) {
                    // Malformed UTF-8.
                    if (!ByteSinkUtil::appendUnchanged(src + titleStart, titleLimit - titleStart,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                } else {
                    csc.cpStart = titleStart;
                    csc.cpLimit = titleLimit;
                    const UChar *s;
                    c = ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
                    if (!appendResult(titleLimit - titleStart, c, s, sink, options, edits, errorCode)) {
                        return;
                    }
                }

                /* Special case Dutch IJ titlecasing */
                if (titleLimit < segLimit && caseLocale == UCASE_LOC_DUTCH) {
                    // A negative mapping result is the bitwise complement of
                    // the unchanged code point; undo that before comparing.
                    if (c < 0) {
                        c = ~c;
                    }
                    if (c == u'I' || c == u'Í') {
                        titleLimit = maybeTitleDutchIJ(src, c, titleLimit, segLimit,
                                                       sink, options, edits, errorCode);
                    }
                }

                /* c) lowercase or copy [titleLimit..segLimit[ */
                if (titleLimit < segLimit) {
                    if ((options & U_TITLECASE_NO_LOWERCASE) != 0) {
                        /* Optionally just copy the rest of the word unchanged. */
                        if (!ByteSinkUtil::appendUnchanged(src + titleLimit, segLimit - titleLimit,
                                                           sink, options, edits, errorCode)) {
                            return;
                        }
                    } else {
                        /* Normal operation: Lowercase the rest of the word. */
                        toLower(caseLocale, options,
                                src, &csc, titleLimit, segLimit,
                                sink, edits, errorCode);
                        if (U_FAILURE(errorCode)) {
                            return;
                        }
                    }
                }
            }
        }

        segStart = segLimit;
    }
}
632
633
#endif
634
635
U_NAMESPACE_BEGIN
636
namespace GreekUpper {
637
638
0
/**
 * Scans the UTF-8 text s[i..length[ and reports whether the first code
 * point that is not case-ignorable is a cased letter.
 *
 * @return TRUE if such a cased letter is found; FALSE if an uncased,
 *         non-ignorable code point comes first or the text ends
 */
UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
    while (i < length) {
        UChar32 c;
        U8_NEXT(s, i, length, c);
        const int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            continue;  // Case-ignorable, keep looking.
        }
        // Cased: followed by cased letter. Otherwise uncased and not case-ignorable.
        return (type != UCASE_NONE) ? TRUE : FALSE;
    }
    return FALSE;  // Not followed by cased letter.
}
653
654
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
655
void toUpper(uint32_t options,
656
             const uint8_t *src, int32_t srcLength,
657
             ByteSink &sink, Edits *edits,
658
0
             UErrorCode &errorCode) {
659
0
    uint32_t state = 0;
660
0
    for (int32_t i = 0; i < srcLength;) {
661
0
        int32_t nextIndex = i;
662
0
        UChar32 c;
663
0
        U8_NEXT(src, nextIndex, srcLength, c);
664
0
        uint32_t nextState = 0;
665
0
        int32_t type = ucase_getTypeOrIgnorable(c);
666
0
        if ((type & UCASE_IGNORABLE) != 0) {
667
            // c is case-ignorable
668
0
            nextState |= (state & AFTER_CASED);
669
0
        } else if (type != UCASE_NONE) {
670
            // c is cased
671
0
            nextState |= AFTER_CASED;
672
0
        }
673
0
        uint32_t data = getLetterData(c);
674
0
        if (data > 0) {
675
0
            uint32_t upper = data & UPPER_MASK;
676
            // Add a dialytika to this iota or ypsilon vowel
677
            // if we removed a tonos from the previous vowel,
678
            // and that previous vowel did not also have (or gain) a dialytika.
679
            // Adding one only to the final vowel in a longer sequence
680
            // (which does not occur in normal writing) would require lookahead.
681
            // Set the same flag as for preserving an existing dialytika.
682
0
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
683
0
                    (upper == 0x399 || upper == 0x3A5)) {
684
0
                data |= HAS_DIALYTIKA;
685
0
            }
686
0
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
687
0
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
688
0
                numYpogegrammeni = 1;
689
0
            }
690
            // Skip combining diacritics after this Greek letter.
691
0
            int32_t nextNextIndex = nextIndex;
692
0
            while (nextIndex < srcLength) {
693
0
                UChar32 c2;
694
0
                U8_NEXT(src, nextNextIndex, srcLength, c2);
695
0
                uint32_t diacriticData = getDiacriticData(c2);
696
0
                if (diacriticData != 0) {
697
0
                    data |= diacriticData;
698
0
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
699
0
                        ++numYpogegrammeni;
700
0
                    }
701
0
                    nextIndex = nextNextIndex;
702
0
                } else {
703
0
                    break;  // not a Greek diacritic
704
0
                }
705
0
            }
706
0
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
707
0
                nextState |= AFTER_VOWEL_WITH_ACCENT;
708
0
            }
709
            // Map according to Greek rules.
710
0
            UBool addTonos = FALSE;
711
0
            if (upper == 0x397 &&
712
0
                    (data & HAS_ACCENT) != 0 &&
713
0
                    numYpogegrammeni == 0 &&
714
0
                    (state & AFTER_CASED) == 0 &&
715
0
                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
716
                // Keep disjunctive "or" with (only) a tonos.
717
                // We use the same "word boundary" conditions as for the Final_Sigma test.
718
0
                if (i == nextIndex) {
719
0
                    upper = 0x389;  // Preserve the precomposed form.
720
0
                } else {
721
0
                    addTonos = TRUE;
722
0
                }
723
0
            } else if ((data & HAS_DIALYTIKA) != 0) {
724
                // Preserve a vowel with dialytika in precomposed form if it exists.
725
0
                if (upper == 0x399) {
726
0
                    upper = 0x3AA;
727
0
                    data &= ~HAS_EITHER_DIALYTIKA;
728
0
                } else if (upper == 0x3A5) {
729
0
                    upper = 0x3AB;
730
0
                    data &= ~HAS_EITHER_DIALYTIKA;
731
0
                }
732
0
            }
733
734
0
            UBool change;
735
0
            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
736
0
                change = TRUE;  // common, simple usage
737
0
            } else {
738
                // Find out first whether we are changing the text.
739
0
                U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
740
0
                change = (i + 2) > nextIndex ||
741
0
                        src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
742
0
                        numYpogegrammeni > 0;
743
0
                int32_t i2 = i + 2;
744
0
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
745
0
                    change |= (i2 + 2) > nextIndex ||
746
0
                            src[i2] != (uint8_t)u8"\u0308"[0] ||
747
0
                            src[i2 + 1] != (uint8_t)u8"\u0308"[1];
748
0
                    i2 += 2;
749
0
                }
750
0
                if (addTonos) {
751
0
                    change |= (i2 + 2) > nextIndex ||
752
0
                            src[i2] != (uint8_t)u8"\u0301"[0] ||
753
0
                            src[i2 + 1] != (uint8_t)u8"\u0301"[1];
754
0
                    i2 += 2;
755
0
                }
756
0
                int32_t oldLength = nextIndex - i;
757
0
                int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
758
0
                change |= oldLength != newLength;
759
0
                if (change) {
760
0
                    if (edits != NULL) {
761
0
                        edits->addReplace(oldLength, newLength);
762
0
                    }
763
0
                } else {
764
0
                    if (edits != NULL) {
765
0
                        edits->addUnchanged(oldLength);
766
0
                    }
767
                    // Write unchanged text?
768
0
                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
769
0
                }
770
0
            }
771
772
0
            if (change) {
773
0
                ByteSinkUtil::appendTwoBytes(upper, sink);
774
0
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
775
0
                    sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
776
0
                }
777
0
                if (addTonos) {
778
0
                    sink.AppendU8(u8"\u0301", 2);
779
0
                }
780
0
                while (numYpogegrammeni > 0) {
781
0
                    sink.AppendU8(u8"\u0399", 2);
782
0
                    --numYpogegrammeni;
783
0
                }
784
0
            }
785
0
        } else if(c>=0) {
786
0
            const UChar *s;
787
0
            c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
788
0
            if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
789
0
                return;
790
0
            }
791
0
        } else {
792
            // Malformed UTF-8.
793
0
            if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
794
0
                                               sink, options, edits, errorCode)) {
795
0
                return;
796
0
            }
797
0
        }
798
0
        i = nextIndex;
799
0
        state = nextState;
800
0
    }
801
0
}
802
803
}  // namespace GreekUpper
804
U_NAMESPACE_END
805
806
static void U_CALLCONV
807
ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
808
                             const uint8_t *src, int32_t srcLength,
809
                             icu::ByteSink &sink, icu::Edits *edits,
810
0
                             UErrorCode &errorCode) {
811
0
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
812
0
    csc.p=(void *)src;
813
0
    csc.limit=srcLength;
814
0
    toLower(
815
0
        caseLocale, options,
816
0
        src, &csc, 0, srcLength,
817
0
        sink, edits, errorCode);
818
0
}
819
820
static void U_CALLCONV
821
ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
822
                             const uint8_t *src, int32_t srcLength,
823
                             icu::ByteSink &sink, icu::Edits *edits,
824
19.1k
                             UErrorCode &errorCode) {
825
19.1k
    if (caseLocale == UCASE_LOC_GREEK) {
826
0
        GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
827
19.1k
    } else {
828
19.1k
        UCaseContext csc=UCASECONTEXT_INITIALIZER;
829
19.1k
        csc.p=(void *)src;
830
19.1k
        csc.limit=srcLength;
831
19.1k
        toUpper(
832
19.1k
            caseLocale, options,
833
19.1k
            src, &csc, srcLength,
834
19.1k
            sink, edits, errorCode);
835
19.1k
    }
836
19.1k
}
837
838
static void U_CALLCONV
839
ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
840
                          const uint8_t *src, int32_t srcLength,
841
                          icu::ByteSink &sink, icu::Edits *edits,
842
0
                          UErrorCode &errorCode) {
843
0
    toLower(
844
0
        -1, options,
845
0
        src, nullptr, 0, srcLength,
846
0
        sink, edits, errorCode);
847
0
}
848
849
void
850
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
851
                 const char *src, int32_t srcLength,
852
                 UTF8CaseMapper *stringCaseMapper,
853
                 icu::ByteSink &sink, icu::Edits *edits,
854
0
                 UErrorCode &errorCode) {
855
    /* check argument values */
856
0
    if (U_FAILURE(errorCode)) {
857
0
        return;
858
0
    }
859
0
    if ((src == nullptr && srcLength != 0) || srcLength < -1) {
860
0
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
861
0
        return;
862
0
    }
863
864
    // Get the string length.
865
0
    if (srcLength == -1) {
866
0
        srcLength = (int32_t)uprv_strlen((const char *)src);
867
0
    }
868
869
0
    if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
870
0
        edits->reset();
871
0
    }
872
0
    stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
873
0
                     (const uint8_t *)src, srcLength, sink, edits, errorCode);
874
0
    sink.Flush();
875
0
    if (U_SUCCESS(errorCode)) {
876
0
        if (edits != nullptr) {
877
0
            edits->copyErrorTo(errorCode);
878
0
        }
879
0
    }
880
0
}
881
882
int32_t
883
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
884
                 char *dest, int32_t destCapacity,
885
                 const char *src, int32_t srcLength,
886
                 UTF8CaseMapper *stringCaseMapper,
887
                 icu::Edits *edits,
888
19.1k
                 UErrorCode &errorCode) {
889
    /* check argument values */
890
19.1k
    if(U_FAILURE(errorCode)) {
891
0
        return 0;
892
0
    }
893
19.1k
    if( destCapacity<0 ||
894
19.1k
        (dest==NULL && destCapacity>0) ||
895
19.1k
        (src==NULL && srcLength!=0) || srcLength<-1
896
19.1k
    ) {
897
0
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
898
0
        return 0;
899
0
    }
900
901
    /* get the string length */
902
19.1k
    if(srcLength==-1) {
903
0
        srcLength=(int32_t)uprv_strlen((const char *)src);
904
0
    }
905
906
    /* check for overlapping source and destination */
907
19.1k
    if( dest!=NULL &&
908
19.1k
        ((src>=dest && src<(dest+destCapacity)) ||
909
19.1k
         (dest>=src && dest<(src+srcLength)))
910
19.1k
    ) {
911
0
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
912
0
        return 0;
913
0
    }
914
915
19.1k
    CheckedArrayByteSink sink(dest, destCapacity);
916
19.1k
    if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
917
0
        edits->reset();
918
0
    }
919
19.1k
    stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
920
19.1k
                     (const uint8_t *)src, srcLength, sink, edits, errorCode);
921
19.1k
    sink.Flush();
922
19.1k
    if (U_SUCCESS(errorCode)) {
923
19.1k
        if (sink.Overflowed()) {
924
0
            errorCode = U_BUFFER_OVERFLOW_ERROR;
925
19.1k
        } else if (edits != nullptr) {
926
0
            edits->copyErrorTo(errorCode);
927
0
        }
928
19.1k
    }
929
19.1k
    return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
930
19.1k
}
931
932
/* public API functions */
933
934
/**
 * C API: lowercases UTF-8 text using the locale and options stored in csm.
 *
 * Forwards to the buffer-based ucasemap_mapUTF8() with
 * ucasemap_internalUTF8ToLower as the mapper and no Edits object.
 * Both csm and pErrorCode are dereferenced unconditionally.
 *
 * @return Whatever ucasemap_mapUTF8() returns.
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    UErrorCode &status = *pErrorCode;
    icu::Edits *const noEdits = nullptr;
    return ucasemap_mapUTF8(csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
                            dest, destCapacity,
                            src, srcLength,
                            ucasemap_internalUTF8ToLower, noEdits, status);
}
945
946
/**
 * C API: uppercases UTF-8 text using the locale and options stored in csm.
 *
 * Forwards to the buffer-based ucasemap_mapUTF8() with
 * ucasemap_internalUTF8ToUpper as the mapper and no Edits object.
 * Both csm and pErrorCode are dereferenced unconditionally.
 *
 * @return Whatever ucasemap_mapUTF8() returns.
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    UErrorCode &status = *pErrorCode;
    icu::Edits *const noEdits = nullptr;
    return ucasemap_mapUTF8(csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
                            dest, destCapacity,
                            src, srcLength,
                            ucasemap_internalUTF8ToUpper, noEdits, status);
}
957
958
/**
 * C API: case-folds UTF-8 text using the options stored in csm.
 *
 * Forwards to the buffer-based ucasemap_mapUTF8() with UCASE_LOC_ROOT as the
 * case locale (the locale stored in csm is not consulted),
 * ucasemap_internalUTF8Fold as the mapper, and no Edits object.
 * Both csm and pErrorCode are dereferenced unconditionally.
 *
 * @return Whatever ucasemap_mapUTF8() returns.
 */
U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap *csm,
                      char *dest, int32_t destCapacity,
                      const char *src, int32_t srcLength,
                      UErrorCode *pErrorCode) {
    UErrorCode &status = *pErrorCode;
    icu::Edits *const noEdits = nullptr;
    return ucasemap_mapUTF8(UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
                            dest, destCapacity,
                            src, srcLength,
                            ucasemap_internalUTF8Fold, noEdits, status);
}
969
970
U_NAMESPACE_BEGIN
971
972
void CaseMap::utf8ToLower(
973
        const char *locale, uint32_t options,
974
        StringPiece src, ByteSink &sink, Edits *edits,
975
0
        UErrorCode &errorCode) {
976
0
    ucasemap_mapUTF8(
977
0
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
978
0
        src.data(), src.length(),
979
0
        ucasemap_internalUTF8ToLower, sink, edits, errorCode);
980
0
}
981
982
void CaseMap::utf8ToUpper(
983
        const char *locale, uint32_t options,
984
        StringPiece src, ByteSink &sink, Edits *edits,
985
0
        UErrorCode &errorCode) {
986
0
    ucasemap_mapUTF8(
987
0
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
988
0
        src.data(), src.length(),
989
0
        ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
990
0
}
991
992
void CaseMap::utf8Fold(
993
        uint32_t options,
994
        StringPiece src, ByteSink &sink, Edits *edits,
995
0
        UErrorCode &errorCode) {
996
0
    ucasemap_mapUTF8(
997
0
        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
998
0
        src.data(), src.length(),
999
0
        ucasemap_internalUTF8Fold, sink, edits, errorCode);
1000
0
}
1001
1002
int32_t CaseMap::utf8ToLower(
1003
        const char *locale, uint32_t options,
1004
        const char *src, int32_t srcLength,
1005
        char *dest, int32_t destCapacity, Edits *edits,
1006
0
        UErrorCode &errorCode) {
1007
0
    return ucasemap_mapUTF8(
1008
0
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1009
0
        dest, destCapacity,
1010
0
        src, srcLength,
1011
0
        ucasemap_internalUTF8ToLower, edits, errorCode);
1012
0
}
1013
1014
int32_t CaseMap::utf8ToUpper(
1015
        const char *locale, uint32_t options,
1016
        const char *src, int32_t srcLength,
1017
        char *dest, int32_t destCapacity, Edits *edits,
1018
0
        UErrorCode &errorCode) {
1019
0
    return ucasemap_mapUTF8(
1020
0
        ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1021
0
        dest, destCapacity,
1022
0
        src, srcLength,
1023
0
        ucasemap_internalUTF8ToUpper, edits, errorCode);
1024
0
}
1025
1026
int32_t CaseMap::utf8Fold(
1027
        uint32_t options,
1028
        const char *src, int32_t srcLength,
1029
        char *dest, int32_t destCapacity, Edits *edits,
1030
0
        UErrorCode &errorCode) {
1031
0
    return ucasemap_mapUTF8(
1032
0
        UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1033
0
        dest, destCapacity,
1034
0
        src, srcLength,
1035
0
        ucasemap_internalUTF8Fold, edits, errorCode);
1036
0
}
1037
1038
U_NAMESPACE_END