Coverage Report

Created: 2023-03-29 06:08

/src/icu/icu4c/source/common/ucase.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2004-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  ucase.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2004aug30
16
*   created by: Markus W. Scherer
17
*
18
*   Low-level Unicode character/string case mapping code.
19
*   Much code moved here (and modified) from uchar.c.
20
*/
21
22
#include "unicode/utypes.h"
23
#include "unicode/unistr.h"
24
#include "unicode/uset.h"
25
#include "unicode/utf16.h"
26
#include "cmemory.h"
27
#include "uassert.h"
28
#include "ucase.h"
29
#include "umutex.h"
30
#include "utrie2.h"
31
32
/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
33
#define INCLUDED_FROM_UCASE_CPP
34
#include "ucase_props_data.h"
35
36
/* set of property starts for UnicodeSet ------------------------------------ */
37
38
static UBool U_CALLCONV
39
0
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
40
    /* add the start code point to the USet */
41
0
    const USetAdder *sa=(const USetAdder *)context;
42
0
    sa->add(sa->set, start);
43
0
    return true;
44
0
}
45
46
U_CFUNC void U_EXPORT2
47
0
ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
48
0
    if(U_FAILURE(*pErrorCode)) {
49
0
        return;
50
0
    }
51
52
    /* add the start code point of each same-value range of the trie */
53
0
    utrie2_enum(&ucase_props_singleton.trie, nullptr, _enumPropertyStartsRange, sa);
54
55
    /* add code points with hardcoded properties, plus the ones following them */
56
57
    /* (none right now, see comment below) */
58
59
    /*
60
     * Omit code points with hardcoded specialcasing properties
61
     * because we do not build property UnicodeSets for them right now.
62
     */
63
0
}
64
65
/* data access primitives --------------------------------------------------- */
66
67
U_CAPI const struct UCaseProps * U_EXPORT2
68
0
ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
69
0
    *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
70
0
    *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
71
0
    return &ucase_props_singleton;
72
0
}
73
74
U_CFUNC const UTrie2 * U_EXPORT2
75
955k
ucase_getTrie() {
76
955k
    return &ucase_props_singleton.trie;
77
955k
}
78
79
1.32M
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
80
81
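/* flagsOffset[b] is the number of set (1) bits in the 8-bit value b; SLOT_OFFSET() uses it to count the slots present below a given slot index */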
82
static const uint8_t flagsOffset[256]={
83
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
84
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
85
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
86
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
87
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
88
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
89
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
90
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
91
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
92
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
93
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
94
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
95
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
96
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
97
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
98
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
99
};
100
101
4.08M
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
102
805k
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
103
104
/*
105
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
106
 *
107
 * @param excWord (in) initial exceptions word
108
 * @param idx (in) desired slot index
109
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
110
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
111
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
112
 */
113
805k
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
114
805k
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
115
805k
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
116
805k
        (value)=*pExc16; \
117
805k
    } else { \
118
0
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
119
0
        (value)=*pExc16++; \
120
0
        (value)=((value)<<16)|*pExc16; \
121
0
    } \
122
1.23M
} UPRV_BLOCK_MACRO_END
123
124
/* simple case mappings ----------------------------------------------------- */
125
126
U_CAPI UChar32 U_EXPORT2
127
0
ucase_tolower(UChar32 c) {
128
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
129
0
    if(!UCASE_HAS_EXCEPTION(props)) {
130
0
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
131
0
            c+=UCASE_GET_DELTA(props);
132
0
        }
133
0
    } else {
134
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
135
0
        uint16_t excWord=*pe++;
136
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
137
0
            int32_t delta;
138
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
139
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
140
0
        }
141
0
        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
142
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
143
0
        }
144
0
    }
145
0
    return c;
146
0
}
147
148
U_CAPI UChar32 U_EXPORT2
149
0
ucase_toupper(UChar32 c) {
150
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
151
0
    if(!UCASE_HAS_EXCEPTION(props)) {
152
0
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
153
0
            c+=UCASE_GET_DELTA(props);
154
0
        }
155
0
    } else {
156
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
157
0
        uint16_t excWord=*pe++;
158
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
159
0
            int32_t delta;
160
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
161
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
162
0
        }
163
0
        if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
164
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
165
0
        }
166
0
    }
167
0
    return c;
168
0
}
169
170
U_CAPI UChar32 U_EXPORT2
171
0
ucase_totitle(UChar32 c) {
172
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
173
0
    if(!UCASE_HAS_EXCEPTION(props)) {
174
0
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
175
0
            c+=UCASE_GET_DELTA(props);
176
0
        }
177
0
    } else {
178
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
179
0
        uint16_t excWord=*pe++;
180
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
181
0
            int32_t delta;
182
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
183
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
184
0
        }
185
0
        int32_t idx;
186
0
        if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
187
0
            idx=UCASE_EXC_TITLE;
188
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
189
0
            idx=UCASE_EXC_UPPER;
190
0
        } else {
191
0
            return c;
192
0
        }
193
0
        GET_SLOT_VALUE(excWord, idx, pe, c);
194
0
    }
195
0
    return c;
196
0
}
197
198
static const char16_t iDot[2] = { 0x69, 0x307 };
199
static const char16_t jDot[2] = { 0x6a, 0x307 };
200
static const char16_t iOgonekDot[3] = { 0x12f, 0x307 };  /* NOTE(review): declared with 3 elements but only 2 initializers (the third is implicitly 0), whereas iDot/jDot use [2] - confirm whether [2] was intended */
201
static const char16_t iDotGrave[3] = { 0x69, 0x307, 0x300 };
202
static const char16_t iDotAcute[3] = { 0x69, 0x307, 0x301 };
203
static const char16_t iDotTilde[3] = { 0x69, 0x307, 0x303 };
204
205
206
U_CFUNC void U_EXPORT2
207
0
ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
208
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
209
0
    if(!UCASE_HAS_EXCEPTION(props)) {
210
0
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
211
            /* add the one simple case mapping, no matter what type it is */
212
0
            int32_t delta=UCASE_GET_DELTA(props);
213
0
            if(delta!=0) {
214
0
                sa->add(sa->set, c+delta);
215
0
            }
216
0
        }
217
0
    } else {
218
        /*
219
         * c has exceptions, so there may be multiple simple and/or
220
         * full case mappings. Add them all.
221
         */
222
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
223
0
        uint16_t excWord=*pe++;
224
0
        const uint16_t *pe0=pe;
225
226
        // Hardcode the case closure of i and its relatives and ignore the
227
        // data file data for these characters.
228
        // The Turkic dotless i and dotted I with their case mapping conditions
229
        // and case folding option make the related characters behave specially.
230
        // This code matches their closure behavior to their case folding behavior.
231
0
        if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
232
            // These characters have Turkic case foldings. Hardcode their closure.
233
0
            if (c == 0x49) {
234
                // Regular i and I are in one equivalence class.
235
0
                sa->add(sa->set, 0x69);
236
0
                return;
237
0
            } else if (c == 0x130) {
238
                // Dotted I is in a class with <0069 0307>
239
                // (for canonical equivalence with <0049 0307>).
240
0
                sa->addString(sa->set, iDot, 2);
241
0
                return;
242
0
            }
243
0
        } else if (c == 0x69) {
244
0
            sa->add(sa->set, 0x49);
245
0
            return;
246
0
        } else if (c == 0x131) {
247
            // Dotless i is in a class by itself.
248
0
            return;
249
0
        }
250
251
        /* add all simple case mappings */
252
0
        for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
253
0
            if(HAS_SLOT(excWord, idx)) {
254
0
                pe=pe0;
255
0
                UChar32 mapping;
256
0
                GET_SLOT_VALUE(excWord, idx, pe, mapping);
257
0
                sa->add(sa->set, mapping);
258
0
            }
259
0
        }
260
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
261
0
            pe=pe0;
262
0
            int32_t delta;
263
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
264
0
            sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
265
0
        }
266
267
        /* get the closure string pointer & length */
268
0
        const char16_t *closure;
269
0
        int32_t closureLength;
270
0
        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
271
0
            pe=pe0;
272
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
273
0
            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
274
0
            closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
275
0
        } else {
276
0
            closureLength=0;
277
0
            closure=nullptr;
278
0
        }
279
280
        /* add the full case folding */
281
0
        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
282
0
            pe=pe0;
283
0
            int32_t fullLength;
284
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
285
286
            /* start of full case mapping strings */
287
0
            ++pe;
288
289
0
            fullLength&=0xffff; /* bits 16 and higher are reserved */
290
291
            /* skip the lowercase result string */
292
0
            pe+=fullLength&UCASE_FULL_LOWER;
293
0
            fullLength>>=4;
294
295
            /* add the full case folding string */
296
0
            int32_t length=fullLength&0xf;
297
0
            if(length!=0) {
298
0
                sa->addString(sa->set, (const char16_t *)pe, length);
299
0
                pe+=length;
300
0
            }
301
302
            /* skip the uppercase and titlecase strings */
303
0
            fullLength>>=4;
304
0
            pe+=fullLength&0xf;
305
0
            fullLength>>=4;
306
0
            pe+=fullLength;
307
308
0
            closure=(const char16_t *)pe; /* behind full case mappings */
309
0
        }
310
311
        /* add each code point in the closure string */
312
0
        for(int32_t idx=0; idx<closureLength;) {
313
0
            UChar32 mapping;
314
0
            U16_NEXT_UNSAFE(closure, idx, mapping);
315
0
            sa->add(sa->set, mapping);
316
0
        }
317
0
    }
318
0
}
319
320
namespace {
321
322
/**
323
 * Add the simple case closure mapping,
324
 * except if there is not actually an scf relationship between the two characters.
325
 * TODO: Unicode should probably add the corresponding scf mappings.
326
 * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
327
 * If & when those scf mappings are added, we should be able to remove all of these exceptions.
328
 */
329
0
void addOneSimpleCaseClosure(UChar32 c, UChar32 t, const USetAdder *sa) {
330
0
    switch (c) {
331
0
    case 0x0390:
332
0
        if (t == 0x1FD3) { return; }
333
0
        break;
334
0
    case 0x03B0:
335
0
        if (t == 0x1FE3) { return; }
336
0
        break;
337
0
    case 0x1FD3:
338
0
        if (t == 0x0390) { return; }
339
0
        break;
340
0
    case 0x1FE3:
341
0
        if (t == 0x03B0) { return; }
342
0
        break;
343
0
    case 0xFB05:
344
0
        if (t == 0xFB06) { return; }
345
0
        break;
346
0
    case 0xFB06:
347
0
        if (t == 0xFB05) { return; }
348
0
        break;
349
0
    default:
350
0
        break;
351
0
    }
352
0
    sa->add(sa->set, t);
353
0
}
354
355
}  // namespace
356
357
U_CFUNC void U_EXPORT2
358
0
ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
359
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
360
0
    if(!UCASE_HAS_EXCEPTION(props)) {
361
0
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
362
            /* add the one simple case mapping, no matter what type it is */
363
0
            int32_t delta=UCASE_GET_DELTA(props);
364
0
            if(delta!=0) {
365
0
                sa->add(sa->set, c+delta);
366
0
            }
367
0
        }
368
0
    } else {
369
        // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
370
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
371
0
        uint16_t excWord=*pe++;
372
0
        const uint16_t *pe0=pe;
373
374
        // Hardcode the case closure of i and its relatives and ignore the
375
        // data file data for these characters, like in ucase_addCaseClosure().
376
0
        if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
377
            // These characters have Turkic case foldings. Hardcode their closure.
378
0
            if (c == 0x49) {
379
                // Regular i and I are in one equivalence class.
380
0
                sa->add(sa->set, 0x69);
381
0
                return;
382
0
            } else if (c == 0x130) {
383
                // For scf=Simple_Case_Folding, dotted I is in a class by itself.
384
0
                return;
385
0
            }
386
0
        } else if (c == 0x69) {
387
0
            sa->add(sa->set, 0x49);
388
0
            return;
389
0
        } else if (c == 0x131) {
390
            // Dotless i is in a class by itself.
391
0
            return;
392
0
        }
393
394
        // Add all simple case mappings.
395
0
        for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
396
0
            if(HAS_SLOT(excWord, idx)) {
397
0
                pe=pe0;
398
0
                UChar32 mapping;
399
0
                GET_SLOT_VALUE(excWord, idx, pe, mapping);
400
0
                addOneSimpleCaseClosure(c, mapping, sa);
401
0
            }
402
0
        }
403
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
404
0
            pe=pe0;
405
0
            int32_t delta;
406
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
407
0
            UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
408
0
            addOneSimpleCaseClosure(c, mapping, sa);
409
0
        }
410
411
        /* get the closure string pointer & length */
412
0
        const char16_t *closure;
413
0
        int32_t closureLength;
414
0
        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
415
0
            pe=pe0;
416
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
417
0
            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
418
0
            closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
419
0
        } else {
420
0
            closureLength=0;
421
0
            closure=nullptr;
422
0
        }
423
424
        // Skip the full case mappings.
425
0
        if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
426
0
            pe=pe0;
427
0
            int32_t fullLength;
428
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
429
430
            /* start of full case mapping strings */
431
0
            ++pe;
432
433
0
            fullLength&=0xffff; /* bits 16 and higher are reserved */
434
435
            // Skip all 4 full case mappings.
436
0
            pe+=fullLength&UCASE_FULL_LOWER;
437
0
            fullLength>>=4;
438
0
            pe+=fullLength&0xf;
439
0
            fullLength>>=4;
440
0
            pe+=fullLength&0xf;
441
0
            fullLength>>=4;
442
0
            pe+=fullLength;
443
444
0
            closure=(const char16_t *)pe; /* behind full case mappings */
445
0
        }
446
447
        // Add each code point in the closure string whose scf maps back to c.
448
0
        for(int32_t idx=0; idx<closureLength;) {
449
0
            UChar32 mapping;
450
0
            U16_NEXT_UNSAFE(closure, idx, mapping);
451
0
            addOneSimpleCaseClosure(c, mapping, sa);
452
0
        }
453
0
    }
454
0
}
455
456
/*
457
 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
458
 * Preconditions: length>0, max>0, and length<=max.
459
 */
460
/*
 * Return value:
 *   0            if the first "length" units of s equal t and t ends there
 *                (either t is exactly "length" units wide or its next unit is NUL);
 *   1            if t hits a NUL before s is exhausted;
 *   s[i]-t[i]    the code unit difference at the first mismatching position;
 *   length-max   (negative) if all of s matched but t continues beyond it.
 */
static inline int32_t
461
0
strcmpMax(const char16_t *s, int32_t length, const char16_t *t, int32_t max) {
462
0
    int32_t c1, c2;
463
464
0
    max-=length; /* we require length<=max, so no need to decrement max in the loop */
465
0
    do {
466
0
        c1=*s++;
467
0
        c2=*t++;
468
0
        if(c2==0) {
469
0
            return 1; /* reached the end of t but not of s */
470
0
        }
471
0
        c1-=c2;
472
0
        if(c1!=0) {
473
0
            return c1; /* return difference result */
474
0
        }
475
0
    } while(--length>0);
476
    /* ends with length==0 */
477
478
0
    if(max==0 || *t==0) { /* max==0 is tested first, so *t is not read when t has no units left beyond the compared ones */
479
0
        return 0; /* equal to length of both strings */
480
0
    } else {
481
0
        return -max; /* return length difference */
482
0
    }
483
0
}
484
485
U_CFUNC UBool U_EXPORT2
486
0
ucase_addStringCaseClosure(const char16_t *s, int32_t length, const USetAdder *sa) {
487
0
    int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
488
489
0
    if(ucase_props_singleton.unfold==nullptr || s==nullptr) {
490
0
        return false; /* no reverse case folding data, or no string */
491
0
    }
492
0
    if(length<=1) {
493
        /* the string is too short to find any match */
494
        /*
495
         * more precise would be:
496
         * if(!u_strHasMoreChar32Than(s, length, 1))
497
         * but this does not make much practical difference because
498
         * a single supplementary code point would just not be found
499
         */
500
0
        return false;
501
0
    }
502
503
0
    const uint16_t *unfold=ucase_props_singleton.unfold;
504
0
    unfoldRows=unfold[UCASE_UNFOLD_ROWS];
505
0
    unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
506
0
    unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
507
0
    unfold+=unfoldRowWidth;
508
509
0
    if(length>unfoldStringWidth) {
510
        /* the string is too long to find any match */
511
0
        return false;
512
0
    }
513
514
    /* do a binary search for the string */
515
0
    start=0;
516
0
    limit=unfoldRows;
517
0
    while(start<limit) {
518
0
        i=(start+limit)/2;
519
0
        const char16_t *p=reinterpret_cast<const char16_t *>(unfold+(i*unfoldRowWidth));
520
0
        result=strcmpMax(s, length, p, unfoldStringWidth);
521
522
0
        if(result==0) {
523
            /* found the string: add each code point, and its case closure */
524
0
            UChar32 c;
525
526
0
            for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
527
0
                U16_NEXT_UNSAFE(p, i, c);
528
0
                sa->add(sa->set, c);
529
0
                ucase_addCaseClosure(c, sa);
530
0
            }
531
0
            return true;
532
0
        } else if(result<0) {
533
0
            limit=i;
534
0
        } else /* result>0 */ {
535
0
            start=i+1;
536
0
        }
537
0
    }
538
539
0
    return false; /* string not found */
540
0
}
541
542
U_NAMESPACE_BEGIN
543
544
/*
 * Initializes the iterator from ucase_props_singleton.unfold: reads the row
 * count, row width and string width from the table entries indexed by
 * UCASE_UNFOLD_ROWS, UCASE_UNFOLD_ROW_WIDTH and UCASE_UNFOLD_STRING_WIDTH,
 * starts at row 0 with rowCpIndex at the string width, and then advances
 * "unfold" by one row width (presumably to skip a header row, as
 * ucase_addStringCaseClosure() does the same - confirm).
 * NOTE(review): "unfold" is dereferenced here without a null check, whereas
 * ucase_addStringCaseClosure() returns early when the unfold pointer is
 * nullptr - confirm the table is always present when this is constructed.
 * NOTE(review): the initializers read "unfold" and "unfoldStringWidth", so
 * they rely on those members being declared before the members that use
 * them - verify against the class declaration.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
545
        : unfold(reinterpret_cast<const char16_t *>(ucase_props_singleton.unfold)),
546
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
547
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
548
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
549
          currentRow(0),
550
0
          rowCpIndex(unfoldStringWidth) {
551
0
    unfold+=unfoldRowWidth;
552
0
}
553
554
/*
 * Delivers the next code point of the current unfold-table row and sets
 * "full" to that row's string column with trailing NULs trimmed.
 * Moves on to the next row when the current row's code points are used up
 * (rowCpIndex reached the row width, or the next unit is NUL).
 * Returns U_SENTINEL once currentRow has reached unfoldRows.
 * NOTE(review): p[rowCpIndex] is read before the currentRow>=unfoldRows
 * check, so calling next() again after it has returned U_SENTINEL (or on a
 * table with zero rows) appears to read past the last row - confirm that
 * callers stop at the first U_SENTINEL.
 */
UChar32
555
0
FullCaseFoldingIterator::next(UnicodeString &full) {
556
    // Advance past the last-delivered code point.
557
0
    const char16_t *p=unfold+(currentRow*unfoldRowWidth);
558
0
    if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
559
0
        ++currentRow;
560
0
        p+=unfoldRowWidth;
561
0
        rowCpIndex=unfoldStringWidth;
562
0
    }
563
0
    if(currentRow>=unfoldRows) { return U_SENTINEL; }
564
    // Set "full" to the NUL-terminated string in the first unfold column.
565
0
    int32_t length=unfoldStringWidth;
566
0
    while(length>0 && p[length-1]==0) { --length; }
567
0
    full.setTo(false, p, length);
568
    // Return the code point.
569
0
    UChar32 c;
570
0
    U16_NEXT_UNSAFE(p, rowCpIndex, c);  // also advances rowCpIndex past the delivered code point
571
0
    return c;
572
0
}
573
574
namespace LatinCase {
575
576
const int8_t TO_LOWER_NORMAL[LIMIT] = {
577
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581
582
    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
583
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
584
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
585
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
586
587
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
588
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
589
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
590
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
591
592
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
593
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
594
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
595
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
596
597
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
598
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
599
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
600
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
601
602
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
603
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
604
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
605
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
606
};
607
608
const int8_t TO_LOWER_TR_LT[LIMIT] = {
609
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
610
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
611
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
612
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
613
614
    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
615
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
616
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
617
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
618
619
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
620
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
621
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
622
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
623
624
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
625
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
626
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
627
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
628
629
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
630
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
631
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
632
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
633
634
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
635
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
636
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
637
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
638
};
639
640
const int8_t TO_UPPER_NORMAL[LIMIT] = {
641
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
642
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
643
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
644
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
645
646
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
647
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
648
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
649
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
650
651
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
652
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
653
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
654
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
655
656
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
657
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
658
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
659
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
660
661
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
662
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
663
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
664
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
665
666
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
667
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
668
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
669
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
670
};
671
672
const int8_t TO_UPPER_TR[LIMIT] = {
673
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
674
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
675
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
676
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
677
678
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
679
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
680
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
681
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
682
683
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
684
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
685
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
686
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
687
688
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
689
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
690
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
691
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
692
693
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
694
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
695
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
696
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
697
698
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
699
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
700
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
701
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
702
};
703
704
}  // namespace LatinCase
705
706
U_NAMESPACE_END
707
708
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
709
U_CAPI int32_t U_EXPORT2
710
12.3M
ucase_getType(UChar32 c) {
711
12.3M
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
712
12.3M
    return UCASE_GET_TYPE(props);
713
12.3M
}
714
715
/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
716
U_CAPI int32_t U_EXPORT2
717
4.21M
ucase_getTypeOrIgnorable(UChar32 c) {
718
4.21M
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
719
4.21M
    return UCASE_GET_TYPE_AND_IGNORABLE(props);
720
4.21M
}
721
722
/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
723
static inline int32_t
724
25.7k
getDotType(UChar32 c) {
725
25.7k
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
726
25.7k
    if(!UCASE_HAS_EXCEPTION(props)) {
727
17.3k
        return props&UCASE_DOT_MASK;
728
17.3k
    } else {
729
8.41k
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
730
8.41k
        return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
731
8.41k
    }
732
25.7k
}
733
734
U_CAPI UBool U_EXPORT2
735
0
ucase_isSoftDotted(UChar32 c) {
736
0
    return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
737
0
}
738
739
U_CAPI UBool U_EXPORT2
740
0
ucase_isCaseSensitive(UChar32 c) {
741
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
742
0
    if(!UCASE_HAS_EXCEPTION(props)) {
743
0
        return (UBool)((props&UCASE_SENSITIVE)!=0);
744
0
    } else {
745
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
746
0
        return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
747
0
    }
748
0
}
749
750
/* string casing ------------------------------------------------------------ */
751
752
/*
753
 * These internal functions form the core of string case mappings.
754
 * They map single code points to result code points or strings and take
755
 * all necessary conditions (context, locale ID, options) into account.
756
 *
757
 * They do not iterate over the source or write to the destination
758
 * so that the same functions are useful for non-standard string storage,
759
 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
760
 * For the same reason, the "surrounding text" context is passed in as a
761
 * UCaseContextIterator which does not make any assumptions about
762
 * the underlying storage.
763
 *
764
 * This section contains helper functions that check for conditions
765
 * in the input text surrounding the current code point
766
 * according to SpecialCasing.txt.
767
 *
768
 * Each helper function gets the index
769
 * - after the current code point if it looks at following text
770
 * - before the current code point if it looks at preceding text
771
 *
772
 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
773
 *
774
 * Final_Sigma
775
 *   C is preceded by a sequence consisting of
776
 *     a cased letter and a case-ignorable sequence,
777
 *   and C is not followed by a sequence consisting of
778
 *     an ignorable sequence and then a cased letter.
779
 *
780
 * More_Above
781
 *   C is followed by one or more characters of combining class 230 (ABOVE)
782
 *   in the combining character sequence.
783
 *
784
 * After_Soft_Dotted
785
 *   The last preceding character with combining class of zero before C
786
 *   was Soft_Dotted,
787
 *   and there is no intervening combining character class 230 (ABOVE).
788
 *
789
 * Before_Dot
790
 *   C is followed by combining dot above (U+0307).
791
 *   Any sequence of characters with a combining class that is neither 0 nor 230
792
 *   may intervene between the current character and the combining dot above.
793
 *
794
 * The erratum from 2002-10-31 adds the condition
795
 *
796
 * After_I
797
 *   The last preceding base character was an uppercase I, and there is no
798
 *   intervening combining character class 230 (ABOVE).
799
 *
800
 *   (See Jitterbug 2344 and the comments on After_I below.)
801
 *
802
 * Helper definitions in Unicode 3.2 UAX 21:
803
 *
804
 * D1. A character C is defined to be cased
805
 *     if it meets any of the following criteria:
806
 *
807
 *   - The general category of C is Titlecase Letter (Lt)
808
 *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
809
 *   - Given D = NFD(C), then it is not the case that:
810
 *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
811
 *     (This third criterion does not add any characters to the list
812
 *      for Unicode 3.2. Ignored.)
813
 *
814
 * D2. A character C is defined to be case-ignorable
815
 *     if it meets either of the following criteria:
816
 *
817
 *   - The general category of C is
818
 *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
819
 *     Letter Modifier (Lm), or Symbol Modifier (Sk)
820
 *   - C is one of the following characters
821
 *     U+0027 APOSTROPHE
822
 *     U+00AD SOFT HYPHEN (SHY)
823
 *     U+2019 RIGHT SINGLE QUOTATION MARK
824
 *            (the preferred character for apostrophe)
825
 *
826
 * D3. A case-ignorable sequence is a sequence of
827
 *     zero or more case-ignorable characters.
828
 */
829
830
414
/*
 * Case-insensitive tests for single letters of a language code.
 * Every macro evaluates its argument more than once, so the argument
 * must not have side effects.
 */
#define is_either(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_d(c) is_either((c), 'd', 'D')
#define is_e(c) is_either((c), 'e', 'E')
#define is_i(c) is_either((c), 'i', 'I')
#define is_l(c) is_either((c), 'l', 'L')
#define is_r(c) is_either((c), 'r', 'R')
#define is_t(c) is_either((c), 't', 'T')
#define is_u(c) is_either((c), 'u', 'U')
#define is_y(c) is_either((c), 'y', 'Y')
#define is_z(c) is_either((c), 'z', 'Z')

/* separator? True for '_', '-' and the terminating NUL. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
842
843
/**
844
 * Requires non-nullptr locale ID but otherwise does the equivalent of
845
 * checking for language codes as if uloc_getLanguage() were called:
846
 * Accepts both 2- and 3-letter codes and accepts case variants.
847
 */
848
U_CFUNC int32_t
849
7.86k
ucase_getCaseLocale(const char *locale) {
850
    /*
851
     * This function used to use uloc_getLanguage(), but the current code
852
     * removes the dependency of this low-level code on uloc implementation code
853
     * and is faster because not the whole locale ID has to be
854
     * examined and copied/transformed.
855
     *
856
     * Because this code does not want to depend on uloc, the caller must
857
     * pass in a non-nullptr locale, i.e., may need to call uloc_getDefault().
858
     */
859
7.86k
    char c=*locale++;
860
    // Fastpath for English "en" which is often used for default (=root locale) case mappings,
861
    // and for Chinese "zh": Very common but no special case mapping behavior.
862
    // Then check lowercase vs. uppercase to reduce the number of comparisons
863
    // for other locales without special behavior.
864
7.86k
    if(c=='e') {
865
        /* el or ell? */
866
1.92k
        c=*locale++;
867
1.92k
        if(is_l(c)) {
868
917
            c=*locale++;
869
917
            if(is_l(c)) {
870
0
                c=*locale;
871
0
            }
872
917
            if(is_sep(c)) {
873
917
                return UCASE_LOC_GREEK;
874
917
            }
875
917
        }
876
        // en, es, ... -> root
877
5.94k
    } else if(c=='z') {
878
187
        return UCASE_LOC_ROOT;
879
187
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
880
5.75k
    } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
881
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
882
    } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
883
#else
884
#   error Unknown charset family!
885
#endif
886
        // lowercase c
887
5.75k
        if(c=='t') {
888
            /* tr or tur? */
889
182
            c=*locale++;
890
182
            if(is_u(c)) {
891
0
                c=*locale++;
892
0
            }
893
182
            if(is_r(c)) {
894
66
                c=*locale;
895
66
                if(is_sep(c)) {
896
66
                    return UCASE_LOC_TURKISH;
897
66
                }
898
66
            }
899
5.57k
        } else if(c=='a') {
900
            /* az or aze? */
901
1.51k
            c=*locale++;
902
1.51k
            if(is_z(c)) {
903
578
                c=*locale++;
904
578
                if(is_e(c)) {
905
0
                    c=*locale;
906
0
                }
907
578
                if(is_sep(c)) {
908
578
                    return UCASE_LOC_TURKISH;
909
578
                }
910
578
            }
911
4.06k
        } else if(c=='l') {
912
            /* lt or lit? */
913
570
            c=*locale++;
914
570
            if(is_i(c)) {
915
0
                c=*locale++;
916
0
            }
917
570
            if(is_t(c)) {
918
472
                c=*locale;
919
472
                if(is_sep(c)) {
920
472
                    return UCASE_LOC_LITHUANIAN;
921
472
                }
922
472
            }
923
3.49k
        } else if(c=='n') {
924
            /* nl or nld? */
925
530
            c=*locale++;
926
530
            if(is_l(c)) {
927
414
                c=*locale++;
928
414
                if(is_d(c)) {
929
0
                    c=*locale;
930
0
                }
931
414
                if(is_sep(c)) {
932
414
                    return UCASE_LOC_DUTCH;
933
414
                }
934
414
            }
935
2.96k
        } else if(c=='h') {
936
            /* hy or hye? *not* hyw */
937
130
            c=*locale++;
938
130
            if(is_y(c)) {
939
30
                c=*locale++;
940
30
                if(is_e(c)) {
941
0
                    c=*locale;
942
0
                }
943
30
                if(is_sep(c)) {
944
30
                    return UCASE_LOC_ARMENIAN;
945
30
                }
946
30
            }
947
130
        }
948
5.75k
    } else {
949
        // uppercase c
950
        // Same code as for lowercase c but also check for 'E'.
951
0
        if(c=='T') {
952
            /* tr or tur? */
953
0
            c=*locale++;
954
0
            if(is_u(c)) {
955
0
                c=*locale++;
956
0
            }
957
0
            if(is_r(c)) {
958
0
                c=*locale;
959
0
                if(is_sep(c)) {
960
0
                    return UCASE_LOC_TURKISH;
961
0
                }
962
0
            }
963
0
        } else if(c=='A') {
964
            /* az or aze? */
965
0
            c=*locale++;
966
0
            if(is_z(c)) {
967
0
                c=*locale++;
968
0
                if(is_e(c)) {
969
0
                    c=*locale;
970
0
                }
971
0
                if(is_sep(c)) {
972
0
                    return UCASE_LOC_TURKISH;
973
0
                }
974
0
            }
975
0
        } else if(c=='L') {
976
            /* lt or lit? */
977
0
            c=*locale++;
978
0
            if(is_i(c)) {
979
0
                c=*locale++;
980
0
            }
981
0
            if(is_t(c)) {
982
0
                c=*locale;
983
0
                if(is_sep(c)) {
984
0
                    return UCASE_LOC_LITHUANIAN;
985
0
                }
986
0
            }
987
0
        } else if(c=='E') {
988
            /* el or ell? */
989
0
            c=*locale++;
990
0
            if(is_l(c)) {
991
0
                c=*locale++;
992
0
                if(is_l(c)) {
993
0
                    c=*locale;
994
0
                }
995
0
                if(is_sep(c)) {
996
0
                    return UCASE_LOC_GREEK;
997
0
                }
998
0
            }
999
0
        } else if(c=='N') {
1000
            /* nl or nld? */
1001
0
            c=*locale++;
1002
0
            if(is_l(c)) {
1003
0
                c=*locale++;
1004
0
                if(is_d(c)) {
1005
0
                    c=*locale;
1006
0
                }
1007
0
                if(is_sep(c)) {
1008
0
                    return UCASE_LOC_DUTCH;
1009
0
                }
1010
0
            }
1011
0
        } else if(c=='H') {
1012
            /* hy or hye? *not* hyw */
1013
0
            c=*locale++;
1014
0
            if(is_y(c)) {
1015
0
                c=*locale++;
1016
0
                if(is_e(c)) {
1017
0
                    c=*locale;
1018
0
                }
1019
0
                if(is_sep(c)) {
1020
0
                    return UCASE_LOC_ARMENIAN;
1021
0
                }
1022
0
            }
1023
0
        }
1024
0
    }
1025
5.20k
    return UCASE_LOC_ROOT;
1026
7.86k
}
1027
1028
/*
1029
 * Is followed by
1030
 *   {case-ignorable}* cased
1031
 * ?
1032
 * (dir determines looking forward/backward)
1033
 * If a character is case-ignorable, it is skipped regardless of whether
1034
 * it is also cased or not.
1035
 */
1036
static UBool
1037
393k
isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
1038
393k
    UChar32 c;
1039
1040
393k
    if(iter==nullptr) {
1041
0
        return false;
1042
0
    }
1043
1044
416k
    for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
1045
55.4k
        int32_t type=ucase_getTypeOrIgnorable(c);
1046
55.4k
        if(type&4) {
1047
            /* case-ignorable, continue with the loop */
1048
32.1k
        } else if(type!=UCASE_NONE) {
1049
13.7k
            return true; /* followed by cased letter */
1050
18.3k
        } else {
1051
18.3k
            return false; /* uncased and not case-ignorable */
1052
18.3k
        }
1053
55.4k
    }
1054
1055
361k
    return false; /* not followed by cased letter */
1056
393k
}
1057
1058
/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
1059
static UBool
1060
2.14k
isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
1061
2.14k
    UChar32 c;
1062
2.14k
    int32_t dotType;
1063
2.14k
    int8_t dir;
1064
1065
2.14k
    if(iter==nullptr) {
1066
0
        return false;
1067
0
    }
1068
1069
2.58k
    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1070
1.92k
        dotType=getDotType(c);
1071
1.92k
        if(dotType==UCASE_SOFT_DOTTED) {
1072
338
            return true; /* preceded by TYPE_i */
1073
1.58k
        } else if(dotType!=UCASE_OTHER_ACCENT) {
1074
1.14k
            return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
1075
1.14k
        }
1076
1.92k
    }
1077
1078
659
    return false; /* not preceded by TYPE_i */
1079
2.14k
}
1080
1081
/*
1082
 * See Jitterbug 2344:
1083
 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
1084
 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
1085
 * we made those releases compatible with Unicode 3.2 which had not fixed
1086
 * a related bug in SpecialCasing.txt.
1087
 *
1088
 * From the Jitterbug 2344 text:
1089
 * ... this bug is listed as a Unicode erratum
1090
 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
1091
 * <quote>
1092
 * There are two errors in SpecialCasing.txt.
1093
 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
1094
 * 2. An incorrect context definition. Correct as follows:
1095
 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
1096
 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
1097
 * ---
1098
 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1099
 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1100
 * where the context After_I is defined as:
1101
 * The last preceding base character was an uppercase I, and there is no
1102
 * intervening combining character class 230 (ABOVE).
1103
 * </quote>
1104
 *
1105
 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
1106
 *
1107
 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1108
 * # This matches the behavior of the canonically equivalent I-dot_above
1109
 *
1110
 * See also the description in this place in older versions of uchar.c (revision 1.100).
1111
 *
1112
 * Markus W. Scherer 2003-feb-15
1113
 */
1114
1115
/* Is preceded by base character 'I' with no intervening cc=230 ? */
1116
static UBool
1117
12.1k
isPrecededBy_I(UCaseContextIterator *iter, void *context) {
1118
12.1k
    UChar32 c;
1119
12.1k
    int32_t dotType;
1120
12.1k
    int8_t dir;
1121
1122
12.1k
    if(iter==nullptr) {
1123
0
        return false;
1124
0
    }
1125
1126
12.7k
    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1127
8.69k
        if(c==0x49) {
1128
1.11k
            return true; /* preceded by I */
1129
1.11k
        }
1130
7.57k
        dotType=getDotType(c);
1131
7.57k
        if(dotType!=UCASE_OTHER_ACCENT) {
1132
6.97k
            return false; /* preceded by different base character (not I), or intervening cc==230 */
1133
6.97k
        }
1134
7.57k
    }
1135
1136
4.01k
    return false; /* not preceded by I */
1137
12.1k
}
1138
1139
/* Is followed by one or more cc==230 ? */
1140
static UBool
1141
7.37k
isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
1142
7.37k
    UChar32 c;
1143
7.37k
    int32_t dotType;
1144
7.37k
    int8_t dir;
1145
1146
7.37k
    if(iter==nullptr) {
1147
0
        return false;
1148
0
    }
1149
1150
7.86k
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1151
6.56k
        dotType=getDotType(c);
1152
6.56k
        if(dotType==UCASE_ABOVE) {
1153
1.11k
            return true; /* at least one cc==230 following */
1154
5.45k
        } else if(dotType!=UCASE_OTHER_ACCENT) {
1155
4.95k
            return false; /* next base character, no more cc==230 following */
1156
4.95k
        }
1157
6.56k
    }
1158
1159
1.30k
    return false; /* no more cc==230 following */
1160
7.37k
}
1161
1162
/* Is followed by a dot above (without cc==230 in between) ? */
1163
static UBool
1164
12.0k
isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1165
12.0k
    UChar32 c;
1166
12.0k
    int32_t dotType;
1167
12.0k
    int8_t dir;
1168
1169
12.0k
    if(iter==nullptr) {
1170
0
        return false;
1171
0
    }
1172
1173
14.5k
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1174
10.8k
        if(c==0x307) {
1175
1.11k
            return true;
1176
1.11k
        }
1177
9.73k
        dotType=getDotType(c);
1178
9.73k
        if(dotType!=UCASE_OTHER_ACCENT) {
1179
7.27k
            return false; /* next base character or cc==230 in between */
1180
7.27k
        }
1181
9.73k
    }
1182
1183
3.68k
    return false; /* no dot above following */
1184
12.0k
}
1185
1186
/**
 * Full lowercase mapping of c, including the hardcoded locale- and
 * context-sensitive mappings from SpecialCasing.txt.
 *
 * @param c       code point to map; must be non-negative
 * @param iter    callback used to look at the characters around c;
 *                may be nullptr, in which case the context helpers
 *                report that no matching neighbor was found
 * @param context opaque pointer passed through to iter
 * @param pString output pointer; always reset to nullptr first, and set to
 *                the mapping string when a string length is returned
 * @param loc     case locale constant (UCASE_LOC_*)
 * @return ~c if c maps to itself; 0 if c is to be removed;
 *         a string length with *pString set; or a single result code point
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const char16_t **pString,
                  int32_t loc) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    // Reset the output pointer in case it was uninitialized.
    *pString=nullptr;

    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* no exception data: uppercase/titlecase letters map via the delta in props */
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
            const UChar32 lower=c+UCASE_GET_DELTA(props);
            return (lower==c) ? ~lower : lower;
        }
        return ~c;
    }

    const uint16_t *excPtr=GET_EXCEPTIONS(&ucase_props_singleton, props);
    const uint16_t excWord=*excPtr++;
    /*
     * Second pointer to the start of the slots, for the simple mappings
     * further down (presumably because GET_SLOT_VALUE repositions the
     * pointer it is given -- confirm against the macro definition).
     */
    const uint16_t *slots=excPtr;

    if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
        /*
         * Use hardcoded conditions and mappings.
         * Test for conditional mappings first
         *   (otherwise the unconditional default mappings are always taken),
         * then test for characters that have unconditional mappings in SpecialCasing.txt,
         * then get the UnicodeData.txt mappings.
         */
        if(loc==UCASE_LOC_LITHUANIAN) {
            /*
                # Lithuanian retains the dot in a lowercase i when followed by accents.
                # Introduce an explicit dot above when lowercasing capital I's and J's
                # whenever there are more accents above.
                # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)

                0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
                004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
                012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
                00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
                00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
                0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
             */
            switch(c) {
            /* base characters: only when accents above follow */
            case 0x49:  /* LATIN CAPITAL LETTER I */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=iDot;
                    return 2;
                }
                break;
            case 0x4a:  /* LATIN CAPITAL LETTER J */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=jDot;
                    return 2;
                }
                break;
            case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=iOgonekDot;
                    return 2;
                }
                break;
            /* precomposed with accent above, no need to find one */
            case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
                *pString=iDotGrave;
                return 3;
            case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
                *pString=iDotAcute;
                return 3;
            case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
                *pString=iDotTilde;
                return 3;
            default:
                break;
            }
        } else if(loc==UCASE_LOC_TURKISH) {
            /* # Turkish and Azeri */
            if(c==0x130) {
                /*
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri

                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                return 0x69;
            }
            if(c==0x307 && isPrecededBy_I(iter, context)) {
                /*
                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
                    # This matches the behavior of the canonically equivalent I-dot_above

                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                 */
                return 0; /* remove the dot (continue without output) */
            }
            if(c==0x49 && !isFollowedByDotAbove(iter, context)) {
                /*
                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
                 */
                return 0x131;
            }
        }

        /* locale-independent special cases */
        if(c==0x130) {
            /*
                # Preserve canonical equivalence for I with dot.
                # (The Turkic mapping was already handled above.)

                0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
             */
            *pString=iDot;
            return 2;
        }
        if( c==0x3a3 &&
            !isFollowedByCasedLetter(iter, context, 1) &&
            isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
        ) {
            /*
                greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt)

                03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
             */
            return 0x3c2; /* greek small final sigma */
        }
        /* no known conditional special case mapping, use a normal mapping */
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        int32_t lowerLength;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, excPtr, lowerLength);
        lowerLength&=UCASE_FULL_LOWER;
        if(lowerLength!=0) {
            /* the lowercase mapping string starts right after the slot; return its length */
            *pString=reinterpret_cast<const char16_t *>(excPtr+1);
            return lowerLength;
        }
    }

    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slots, delta);
        return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
    }

    UChar32 result=c;
    if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, slots, result);
    }
    return (result==c) ? ~result : result;
}
1335
1336
/* internal */
1337
static int32_t
1338
toUpperOrTitle(UChar32 c,
1339
               UCaseContextIterator *iter, void *context,
1340
               const char16_t **pString,
1341
               int32_t loc,
1342
19.9M
               UBool upperNotTitle) {
1343
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1344
19.9M
    U_ASSERT(c >= 0);
1345
19.9M
    UChar32 result=c;
1346
    // Reset the output pointer in case it was uninitialized.
1347
19.9M
    *pString=nullptr;
1348
19.9M
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1349
19.9M
    if(!UCASE_HAS_EXCEPTION(props)) {
1350
19.2M
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1351
784k
            result=c+UCASE_GET_DELTA(props);
1352
784k
        }
1353
19.2M
    } else {
1354
734k
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1355
734k
        uint16_t excWord=*pe++;
1356
734k
        int32_t full, idx;
1357
1358
734k
        pe2=pe;
1359
1360
734k
        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1361
            /* use hardcoded conditions and mappings */
1362
476k
            if(loc==UCASE_LOC_TURKISH && c==0x69) {
1363
                /*
1364
                    # Turkish and Azeri
1365
1366
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1367
                    # The following rules handle those cases.
1368
1369
                    # When uppercasing, i turns into a dotted capital I
1370
1371
                    0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1372
                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1373
                */
1374
1.20k
                return 0x130;
1375
475k
            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1376
                /*
1377
                    # Lithuanian
1378
1379
                    # Lithuanian retains the dot in a lowercase i when followed by accents.
1380
1381
                    # Remove DOT ABOVE after "i" with upper or titlecase
1382
1383
                    0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1384
                 */
1385
338
                return 0; /* remove the dot (continue without output) */
1386
474k
            } else if(c==0x0587) {
1387
                // See ICU-13416:
1388
                // և ligature ech-yiwn
1389
                // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1390
                // but to ԵՎ=ech+vew in Eastern Armenian.
1391
3.08k
                if(loc==UCASE_LOC_ARMENIAN) {
1392
391
                    *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1393
2.69k
                } else {
1394
2.69k
                    *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1395
2.69k
                }
1396
3.08k
                return 2;
1397
471k
            } else {
1398
                /* no known conditional special case mapping, use a normal mapping */
1399
471k
            }
1400
476k
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1401
134k
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1402
1403
            /* start of full case mapping strings */
1404
134k
            ++pe;
1405
1406
            /* skip the lowercase and case-folding result strings */
1407
134k
            pe+=full&UCASE_FULL_LOWER;
1408
134k
            full>>=4;
1409
134k
            pe+=full&0xf;
1410
134k
            full>>=4;
1411
1412
134k
            if(upperNotTitle) {
1413
109k
                full&=0xf;
1414
109k
            } else {
1415
                /* skip the uppercase result string */
1416
24.9k
                pe+=full&0xf;
1417
24.9k
                full=(full>>4)&0xf;
1418
24.9k
            }
1419
1420
134k
            if(full!=0) {
1421
                /* set the output pointer to the result string */
1422
127k
                *pString=reinterpret_cast<const char16_t *>(pe);
1423
1424
                /* return the string length */
1425
127k
                return full;
1426
127k
            }
1427
134k
        }
1428
1429
602k
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1430
100k
            int32_t delta;
1431
100k
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1432
100k
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1433
100k
        }
1434
501k
        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1435
1.31k
            idx=UCASE_EXC_TITLE;
1436
500k
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1437
            /* here, titlecase is same as uppercase */
1438
67.5k
            idx=UCASE_EXC_UPPER;
1439
432k
        } else {
1440
432k
            return ~c;
1441
432k
        }
1442
501k
        GET_SLOT_VALUE(excWord, idx, pe2, result);
1443
68.8k
    }
1444
1445
19.2M
    return (result==c) ? ~result : result;
1446
19.9M
}
1447
1448
/**
 * Full uppercase mapping of c.
 * Forwards all arguments to toUpperOrTitle() with upperNotTitle=true
 * and returns its result unchanged.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const char16_t **pString,
                  int32_t caseLocale) {
    const UBool upperNotTitle=true;
    return toUpperOrTitle(c, iter, context, pString, caseLocale, upperNotTitle);
}
1455
1456
/**
 * Full titlecase mapping of c.
 * Forwards all arguments to toUpperOrTitle() with upperNotTitle=false
 * and returns its result unchanged.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const char16_t **pString,
                  int32_t caseLocale) {
    const UBool upperNotTitle=false;
    return toUpperOrTitle(c, iter, context, pString, caseLocale, upperNotTitle);
}
1463
1464
/* case folding ------------------------------------------------------------- */
1465
1466
/*
1467
 * Case folding is similar to lowercasing.
1468
 * The result may be a simple mapping, i.e., a single code point, or
1469
 * a full mapping, i.e., a string.
1470
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1471
 * then only the lowercase mapping is stored.
1472
 *
1473
 * Some special cases are hardcoded because their conditions cannot be
1474
 * parsed and processed from CaseFolding.txt.
1475
 *
1476
 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1477
1478
# C: common case folding, common mappings shared by both simple and full mappings.
1479
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1480
# S: simple case folding, mappings to single characters where different from F.
1481
# T: special case for uppercase I and dotted uppercase I
1482
#    - For non-Turkic languages, this mapping is normally not used.
1483
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1484
#
1485
# Usage:
1486
#  A. To do a simple case folding, use the mappings with status C + S.
1487
#  B. To do a full case folding, use the mappings with status C + F.
1488
#
1489
#    The mappings with status T can be used or omitted depending on the desired case-folding
1490
#    behavior. (The default option is to exclude them.)
1491
1492
 * Unicode 3.2 has 'T' mappings as follows:
1493
1494
0049; T; 0131; # LATIN CAPITAL LETTER I
1495
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1496
1497
 * while the default mappings for these code points are:
1498
1499
0049; C; 0069; # LATIN CAPITAL LETTER I
1500
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1501
1502
 * U+0130 has no simple case folding (simple-case-folds to itself).
1503
 */
1504
1505
/* return the simple case folding mapping for c */
1506
U_CAPI UChar32 U_EXPORT2
1507
0
ucase_fold(UChar32 c, uint32_t options) {
1508
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1509
0
    if(!UCASE_HAS_EXCEPTION(props)) {
1510
0
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
1511
0
            c+=UCASE_GET_DELTA(props);
1512
0
        }
1513
0
    } else {
1514
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1515
0
        uint16_t excWord=*pe++;
1516
0
        int32_t idx;
1517
0
        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1518
            /* special case folding mappings, hardcoded */
1519
0
            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1520
                /* default mappings */
1521
0
                if(c==0x49) {
1522
                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1523
0
                    return 0x69;
1524
0
                } else if(c==0x130) {
1525
                    /* no simple case folding for U+0130 */
1526
0
                    return c;
1527
0
                }
1528
0
            } else {
1529
                /* Turkic mappings */
1530
0
                if(c==0x49) {
1531
                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1532
0
                    return 0x131;
1533
0
                } else if(c==0x130) {
1534
                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1535
0
                    return 0x69;
1536
0
                }
1537
0
            }
1538
0
        }
1539
0
        if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1540
0
            return c;
1541
0
        }
1542
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1543
0
            int32_t delta;
1544
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1545
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1546
0
        }
1547
0
        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1548
0
            idx=UCASE_EXC_FOLD;
1549
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1550
0
            idx=UCASE_EXC_LOWER;
1551
0
        } else {
1552
0
            return c;
1553
0
        }
1554
0
        GET_SLOT_VALUE(excWord, idx, pe, c);
1555
0
    }
1556
0
    return c;
1557
0
}
1558
1559
/*
1560
 * Issue for canonical caseless match (UAX #21):
1561
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1562
 * canonical equivalence, unlike default-option casefolding.
1563
 * For example, I-grave and I + grave fold to strings that are not canonically
1564
 * equivalent.
1565
 * For more details, see the comment in unorm_compare() in unorm.cpp
1566
 * and the intermediate prototype changes for Jitterbug 2021.
1567
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1568
 *
1569
 * This did not get fixed because it appears that it is not possible to fix
1570
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1571
 * together in a way that they still fold to common result strings.
1572
 */
1573
1574
/**
 * Full case folding for one code point.
 *
 * @param c       Code point to fold; must be non-negative (asserted).
 * @param pString Output pointer. Always reset to nullptr on entry; set to the
 *                folded string only on the paths that return a string length.
 * @param options Folding options; only the bits in _FOLD_CASE_OPTIONS_MASK are
 *                examined, and only for code points with a conditional fold.
 * @return ~c (negative) when c has no folding different from itself;
 *         the length of the string stored in *pString for a string folding;
 *         otherwise the single code point that c folds to.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,
                    const char16_t **pString,
                    uint32_t options) {
    // "Unchanged" is reported as ~c, so a negative input could not be told apart from it.
    U_ASSERT(c >= 0);
    // The caller's pointer may be uninitialized; give it a defined value on every path.
    *pString=nullptr;

    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(!UCASE_HAS_EXCEPTION(props)) {
        /* no exception data: apply the delta stored in the properties word */
        UChar32 folded=c;
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
            folded=c+UCASE_GET_DELTA(props);
        }
        return (folded==c) ? ~folded : folded;
    }

    const uint16_t *fullCursor=GET_EXCEPTIONS(&ucase_props_singleton, props);
    uint16_t excWord=*fullCursor++;
    /* independent cursor for the simple-mapping slots; fullCursor is moved below */
    const uint16_t *slotCursor=fullCursor;

    if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
        /* hardcoded conditions and mappings for I and I-with-dot-above */
        const bool useDefault=(options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT;
        if(c==0x49) {
            /* 0049; C; 0069; # LATIN CAPITAL LETTER I (default) */
            /* 0049; T; 0131; # LATIN CAPITAL LETTER I (Turkic) */
            return useDefault ? 0x69 : 0x131;
        }
        if(c==0x130) {
            if(!useDefault) {
                /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                return 0x69;
            }
            /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
            *pString=iDot;
            return 2;
        }
        /* any other code point continues with the simple mappings below */
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        int32_t full;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, fullCursor, full);

        /* step to the start of the full case mapping strings */
        ++fullCursor;

        /* the lowercase result string comes first; skip over it */
        fullCursor+=full&UCASE_FULL_LOWER;

        const int32_t foldLength=(full>>4)&0xf;
        if(foldLength!=0) {
            /* hand back the folding string and its length */
            *pString=reinterpret_cast<const char16_t *>(fullCursor);
            return foldLength;
        }
    }

    if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
        return ~c;
    }

    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slotCursor, delta);
        if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0) {
            return c+delta;
        }
        return c-delta;
    }

    /* prefer the dedicated fold slot, then fall back to the lowercase slot */
    int32_t idx;
    if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
        idx=UCASE_EXC_FOLD;
    } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        idx=UCASE_EXC_LOWER;
    } else {
        return ~c;
    }

    UChar32 mapped=c;
    GET_SLOT_VALUE(excWord, idx, slotCursor, mapped);
    return (mapped==c) ? ~mapped : mapped;
}
1656
1657
/* case mapping properties API ---------------------------------------------- */
1658
1659
/* public API (see uchar.h) */
1660
1661
/* True exactly when ucase_getType(c) reports UCASE_LOWER. */
U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c) {
    const bool isLower=(ucase_getType(c)==UCASE_LOWER);
    return static_cast<UBool>(isLower);
}
1665
1666
/* True exactly when ucase_getType(c) reports UCASE_UPPER. */
U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c) {
    const bool isUpper=(ucase_getType(c)==UCASE_UPPER);
    return static_cast<UBool>(isUpper);
}
1670
1671
/* Transforms the Unicode character to its lower case equivalent.*/
1672
U_CAPI UChar32 U_EXPORT2
1673
0
u_tolower(UChar32 c) {
1674
0
    return ucase_tolower(c);
1675
0
}
1676
1677
/* Transforms the Unicode character to its upper case equivalent.*/
1678
U_CAPI UChar32 U_EXPORT2
1679
0
u_toupper(UChar32 c) {
1680
0
    return ucase_toupper(c);
1681
0
}
1682
1683
/* Transforms the Unicode character to its title case equivalent.*/
1684
U_CAPI UChar32 U_EXPORT2
1685
0
u_totitle(UChar32 c) {
1686
0
    return ucase_totitle(c);
1687
0
}
1688
1689
/* return the simple case folding mapping for c */
1690
U_CAPI UChar32 U_EXPORT2
1691
0
u_foldCase(UChar32 c, uint32_t options) {
1692
0
    return ucase_fold(c, options);
1693
0
}
1694
1695
/**
 * Evaluates one of the case-related binary properties for a code point.
 *
 * @param c     Code point to test.
 * @param which Property selector; values not handled here yield false.
 * @return Nonzero if c has the property, 0 otherwise.
 */
U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c, UProperty which) {
    /* receives the mapping string from ucase_toFullXyz(); its contents are not inspected */
    const char16_t *mappingString;

    switch(which) {
    /* properties answered from the case type alone */
    case UCHAR_LOWERCASE:
    case UCHAR_UPPERCASE:
    case UCHAR_CASED: {
        const int32_t type=ucase_getType(c);
        if(which==UCHAR_LOWERCASE) {
            return static_cast<UBool>(type==UCASE_LOWER);
        }
        if(which==UCHAR_UPPERCASE) {
            return static_cast<UBool>(type==UCASE_UPPER);
        }
        return static_cast<UBool>(type!=UCASE_NONE);
    }
    case UCHAR_SOFT_DOTTED:
        return ucase_isSoftDotted(c);
    case UCHAR_CASE_SENSITIVE:
        return ucase_isCaseSensitive(c);
    case UCHAR_CASE_IGNORABLE:
        return static_cast<UBool>(ucase_getTypeOrIgnorable(c)>>2);
    /*
     * Note: The Changes_When_Xyz properties below are defined in terms of
     * whether the NFD form of the input changes when it is Xyz-case-mapped.
     * This implementation is simpler and ignores NFD, yet passes the tests.
     * If the tests ever start failing, the implementation has to change.
     * In that case, optimizations should keep using the
     * per-single-code point ucase_toFullXyz() functions unless
     * the NFD form has more than one code point,
     * and the property starts set must become the union of the
     * start sets for normalization and case mappings.
     */
    case UCHAR_CHANGES_WHEN_LOWERCASED:
        return static_cast<UBool>(
            ucase_toFullLower(c, nullptr, nullptr, &mappingString, UCASE_LOC_ROOT)>=0);
    case UCHAR_CHANGES_WHEN_UPPERCASED:
        return static_cast<UBool>(
            ucase_toFullUpper(c, nullptr, nullptr, &mappingString, UCASE_LOC_ROOT)>=0);
    case UCHAR_CHANGES_WHEN_TITLECASED:
        return static_cast<UBool>(
            ucase_toFullTitle(c, nullptr, nullptr, &mappingString, UCASE_LOC_ROOT)>=0);
    /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
    case UCHAR_CHANGES_WHEN_CASEMAPPED:
        /* stop at the first mapping that reports a change: lower, then upper, then title */
        if(ucase_toFullLower(c, nullptr, nullptr, &mappingString, UCASE_LOC_ROOT)>=0) {
            return true;
        }
        if(ucase_toFullUpper(c, nullptr, nullptr, &mappingString, UCASE_LOC_ROOT)>=0) {
            return true;
        }
        return static_cast<UBool>(
            ucase_toFullTitle(c, nullptr, nullptr, &mappingString, UCASE_LOC_ROOT)>=0);
    default:
        return false;
    }
}