Coverage Report

Created: 2022-11-20 06:14

/src/icu/icu4c/source/common/ucase.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2004-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  ucase.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2004aug30
16
*   created by: Markus W. Scherer
17
*
18
*   Low-level Unicode character/string case mapping code.
19
*   Much code moved here (and modified) from uchar.c.
20
*/
21
22
#include "unicode/utypes.h"
23
#include "unicode/unistr.h"
24
#include "unicode/uset.h"
25
#include "unicode/utf16.h"
26
#include "cmemory.h"
27
#include "uassert.h"
28
#include "ucase.h"
29
#include "umutex.h"
30
#include "utrie2.h"
31
32
/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
33
#define INCLUDED_FROM_UCASE_CPP
34
#include "ucase_props_data.h"
35
36
/* set of property starts for UnicodeSet ------------------------------------ */
37
38
/*
 * Range callback handed to utrie2_enum() by ucase_addPropertyStarts():
 * adds the start code point of the reported range to the set.
 * The range end and the range value are ignored.
 */
static UBool U_CALLCONV
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    /* the context pointer is the USetAdder that was passed to utrie2_enum() */
    const USetAdder *adder=static_cast<const USetAdder *>(context);
    adder->add(adder->set, start);
    return true;
}
45
46
/*
 * Adds to sa the start code point of each same-value range of the
 * case properties trie.
 * Does nothing when *pErrorCode already indicates a failure.
 */
U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
    if(!U_FAILURE(*pErrorCode)) {
        /* one callback per same-value range; the callback adds the range start */
        utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
    }

    /*
     * Code points with hardcoded properties (plus the ones following them)
     * would be added here, but there are none right now:
     * code points with hardcoded specialcasing properties are omitted
     * because we do not build property UnicodeSets for them.
     */
}
64
65
/* data access primitives --------------------------------------------------- */
66
67
/*
 * Returns the built-in case properties singleton and reports the lengths
 * (numbers of array elements) of its exceptions and unfold arrays.
 *
 * @param pExceptionsLength (out) receives UPRV_LENGTHOF(ucase_props_exceptions)
 * @param pUnfoldLength (out) receives UPRV_LENGTHOF(ucase_props_unfold)
 */
U_CAPI const struct UCaseProps * U_EXPORT2
ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
    const struct UCaseProps *singleton=&ucase_props_singleton;
    *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
    *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
    return singleton;
}
73
74
/* Returns a pointer to the trie embedded in the case properties singleton. */
U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie() {
    const UTrie2 *trie=&ucase_props_singleton.trie;
    return trie;
}
78
79
1.64M
/* Address of the exceptions word for props: csp->exceptions indexed by props>>UCASE_EXC_SHIFT. */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])
80
81
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 *
 * The 256 entries are generated rather than spelled out:
 * UCASE_BITS2(n) yields the counts for a 2-bit field on top of n bits already set,
 * UCASE_BITS4 and UCASE_BITS6 extend that to 4 and 6 bits.
 */
#define UCASE_BITS2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BITS4(n) UCASE_BITS2(n), UCASE_BITS2((n)+1), UCASE_BITS2((n)+1), UCASE_BITS2((n)+2)
#define UCASE_BITS6(n) UCASE_BITS4(n), UCASE_BITS4((n)+1), UCASE_BITS4((n)+1), UCASE_BITS4((n)+2)
static const uint8_t flagsOffset[256]={
    UCASE_BITS6(0), UCASE_BITS6(1), UCASE_BITS6(1), UCASE_BITS6(2)
};
#undef UCASE_BITS6
#undef UCASE_BITS4
#undef UCASE_BITS2
100
101
5.26M
/* Non-zero if bit idx is set in flags (the exceptions word), i.e. the optional slot idx is present. */
#define HAS_SLOT(flags, idx) ((1<<(idx))&(flags))
/* Number of bits set in flags below bit idx, looked up in flagsOffset[]. */
#define SLOT_OFFSET(flags, idx) (flagsOffset[((1<<(idx))-1)&(flags)])
103
104
/*
 * Reads the value of the optional-value slot idx; the caller must have
 * checked HAS_SLOT(excWord, idx).
 *
 * Slots are one uint16_t wide, or two (high half first) when
 * UCASE_EXC_DOUBLE_SLOTS is set in excWord.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * as left by excWord=*pExc16++;
 *               on exit it points to the last uint16_t of the value,
 *               use +1 for the beginning of the next slot
 * @param value (out) int32_t or uint32_t variable that receives the slot value
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)!=0) { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16++; \
        (value)=((value)<<16)|*pExc16; \
    } else { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16; \
    } \
} UPRV_BLOCK_MACRO_END
123
124
/* simple case mappings ----------------------------------------------------- */
125
126
/*
 * Simple (single code point) lowercase mapping.
 * Returns c unchanged when the data provides no applicable mapping.
 */
U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* no exception data: only uppercase/titlecase entries apply their delta */
        return UCASE_IS_UPPER_OR_TITLE(props) ? c+UCASE_GET_DELTA(props) : c;
    }

    const uint16_t *slots=GET_EXCEPTIONS(&ucase_props_singleton, props);
    const uint16_t excWord=*slots++;
    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
        /* the delta slot stores a magnitude; its sign is a flag in the exceptions word */
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slots, delta);
        if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)!=0) {
            return c-delta;
        }
        return c+delta;
    }
    if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, slots, c);
    }
    return c;
}
147
148
/*
 * Simple (single code point) uppercase mapping.
 * Returns c unchanged when the data provides no applicable mapping.
 */
U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* no exception data: only lowercase entries apply their delta */
        return UCASE_GET_TYPE(props)==UCASE_LOWER ? c+UCASE_GET_DELTA(props) : c;
    }

    const uint16_t *slots=GET_EXCEPTIONS(&ucase_props_singleton, props);
    const uint16_t excWord=*slots++;
    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
        /* the delta slot stores a magnitude; its sign is a flag in the exceptions word */
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slots, delta);
        if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)!=0) {
            return c-delta;
        }
        return c+delta;
    }
    if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
        GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, slots, c);
    }
    return c;
}
169
170
/*
 * Simple (single code point) titlecase mapping.
 * In the exceptions data the title slot is used if present,
 * otherwise the upper slot; c is returned unchanged if neither exists.
 */
U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* no exception data: only lowercase entries apply their delta */
        return UCASE_GET_TYPE(props)==UCASE_LOWER ? c+UCASE_GET_DELTA(props) : c;
    }

    const uint16_t *slots=GET_EXCEPTIONS(&ucase_props_singleton, props);
    const uint16_t excWord=*slots++;
    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
        /* the delta slot stores a magnitude; its sign is a flag in the exceptions word */
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slots, delta);
        if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)!=0) {
            return c-delta;
        }
        return c+delta;
    }
    if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
        GET_SLOT_VALUE(excWord, UCASE_EXC_TITLE, slots, c);
    } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
        GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, slots, c);
    }
    return c;
}
197
198
/*
 * Constant UTF-16 sequences built from a base letter (i = U+0069, j = U+006A,
 * i with ogonek = U+012F) followed by U+0307 (combining dot above) and,
 * for the last three, one more combining accent (U+0300, U+0301, U+0303).
 */
static const UChar iDot[2] = { 0x69, 0x307 };
199
static const UChar jDot[2] = { 0x6a, 0x307 };
200
/* NOTE(review): declared with 3 elements but only 2 initializers, so the third unit is 0 — confirm [3] is intended. */
static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
201
static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
202
static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
203
static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
204
205
206
/*
 * Adds to the set behind sa the case mappings stored for c:
 * its simple case mappings, its full case folding string (if any),
 * and every code point of its closure string (if any).
 *
 * @param c  code point whose case closure is requested
 * @param sa set adder; sa->add() receives code points, sa->addString() strings
 */
U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
    /*
     * Hardcode the case closure of i and its relatives and ignore the
     * data file data for these characters.
     * The Turkic dotless i and dotted I with their case mapping conditions
     * and case folding option make the related characters behave specially.
     * This code matches their closure behavior to their case folding behavior.
     */
    switch(c) {
    case 0x49:
        /* regular i and I are in one equivalence class */
        sa->add(sa->set, 0x69);
        return;
    case 0x69:
        sa->add(sa->set, 0x49);
        return;
    case 0x130:
        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
        sa->addString(sa->set, iDot, 2);
        return;
    case 0x131:
        /* dotless i is in a class by itself */
        return;
    default:
        /* otherwise use the data file data */
        break;
    }

    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
            /* add the one simple case mapping, no matter what type it is */
            int32_t delta=UCASE_GET_DELTA(props);
            if(delta!=0) {
                sa->add(sa->set, c+delta);
            }
        }
    } else {
        /*
         * c has exceptions, so there may be multiple simple and/or
         * full case mappings. Add them all.
         */
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
        uint16_t excWord=*pe++;
        /* first slot; every GET_SLOT_VALUE below restarts from here because the macro advances pe */
        const uint16_t *pe0=pe;

        /*
         * Add all simple case mappings.
         * Each mapping goes into its own variable: c must stay intact
         * because the delta mapping below is relative to the input code point.
         */
        for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
            if(HAS_SLOT(excWord, idx)) {
                pe=pe0;
                UChar32 mapping;
                GET_SLOT_VALUE(excWord, idx, pe, mapping);
                sa->add(sa->set, mapping);
            }
        }
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
            pe=pe0;
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
            sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
        }

        /* get the closure string pointer & length */
        const UChar *closure;
        int32_t closureLength;
        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
            pe=pe0;
            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
            closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
        } else {
            closureLength=0;
            closure=NULL;
        }

        /* add the full case folding */
        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            int32_t fullLength;
            pe=pe0;
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);

            /* start of full case mapping strings */
            ++pe;

            fullLength&=0xffff; /* bits 16 and higher are reserved */

            /* skip the lowercase result string */
            pe+=fullLength&UCASE_FULL_LOWER;
            fullLength>>=4;

            /* add the full case folding string */
            int32_t length=fullLength&0xf;
            if(length!=0) {
                sa->addString(sa->set, (const UChar *)pe, length);
                pe+=length;
            }

            /* skip the uppercase and titlecase strings */
            fullLength>>=4;
            pe+=fullLength&0xf;
            fullLength>>=4;
            pe+=fullLength;

            closure=(const UChar *)pe; /* behind full case mappings */
        }

        /* add each code point in the closure string */
        for(int32_t idx=0; idx<closureLength;) {
            UChar32 closureChar;
            U16_NEXT_UNSAFE(closure, idx, closureChar);
            sa->add(sa->set, closureChar);
        }
    }
}
322
323
/*
324
 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
325
 * must be length>0 and max>0 and length<=max
326
 */
327
static inline int32_t
328
0
strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
329
0
    int32_t c1, c2;
330
331
0
    max-=length; /* we require length<=max, so no need to decrement max in the loop */
332
0
    do {
333
0
        c1=*s++;
334
0
        c2=*t++;
335
0
        if(c2==0) {
336
0
            return 1; /* reached the end of t but not of s */
337
0
        }
338
0
        c1-=c2;
339
0
        if(c1!=0) {
340
0
            return c1; /* return difference result */
341
0
        }
342
0
    } while(--length>0);
343
    /* ends with length==0 */
344
345
0
    if(max==0 || *t==0) {
346
0
        return 0; /* equal to length of both strings */
347
0
    } else {
348
0
        return -max; /* return length difference */
349
0
    }
350
0
}
351
352
/*
 * Looks up the string s in the unfold (reverse case folding) table and,
 * if a row matches, adds every code point listed in that row, plus the
 * case closure of each of them, to the set behind sa.
 *
 * @param s      string to look up; may be NULL
 * @param length number of UChars in s
 * @param sa     set adder that receives the results
 * @return true if s was found in the table, false otherwise
 */
U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
    const uint16_t *unfold=ucase_props_singleton.unfold;
    if(unfold==NULL || s==NULL) {
        return false; /* no reverse case folding data, or no string */
    }
    if(length<=1) {
        /*
         * The string is too short to find any match.
         * More precise would be:
         * if(!u_strHasMoreChar32Than(s, length, 1))
         * but this does not make much practical difference because
         * a single supplementary code point would just not be found.
         */
        return false;
    }

    /* the first row of the table is a header with the table dimensions */
    const int32_t rowCount=unfold[UCASE_UNFOLD_ROWS];
    const int32_t rowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
    const int32_t stringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
    unfold+=rowWidth;

    if(length>stringWidth) {
        /* the string is too long to find any match */
        return false;
    }

    /* do a binary search for the string */
    int32_t low=0;
    int32_t high=rowCount;
    while(low<high) {
        const int32_t mid=(low+high)/2;
        const UChar *row=reinterpret_cast<const UChar *>(unfold+(mid*rowWidth));
        const int32_t cmp=strcmpMax(s, length, row, stringWidth);

        if(cmp<0) {
            high=mid;
        } else if(cmp>0) {
            low=mid+1;
        } else {
            /* found the string: add each code point, and its case closure */
            for(int32_t col=stringWidth; col<rowWidth && row[col]!=0;) {
                UChar32 c;
                U16_NEXT_UNSAFE(row, col, c);
                sa->add(sa->set, c);
                ucase_addCaseClosure(c, sa);
            }
            return true;
        }
    }

    return false; /* string not found */
}
408
409
U_NAMESPACE_BEGIN
410
411
// Sets up iteration over the unfold table of the case properties singleton:
// reads the row count, row width and string width from the table's first row,
// starts at row 0 with the code point index just behind the string column,
// and finally moves `unfold` past that first (header) row.
// NOTE(review): the initializers below read through the member `unfold`, so they
// rely on `unfold` being declared before the other members — confirm in the class declaration.
// NOTE(review): unlike ucase_addStringCaseClosure(), there is no NULL check on
// ucase_props_singleton.unfold here — confirm the data always provides the table.
FullCaseFoldingIterator::FullCaseFoldingIterator()
412
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
413
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
414
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
415
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
416
          currentRow(0),
417
0
          rowCpIndex(unfoldStringWidth) {
418
0
    // skip the header row so that row 0 is the first data row
    unfold+=unfoldRowWidth;
419
0
}
420
421
// Delivers the next (code point, full case folding string) pair from the unfold table.
//
// @param full (out) set to the string in the first column of the current row,
//             without its trailing NUL padding; aliases the table data
//             (setTo() is called with false as its first argument)
// @return the next code point listed for that string,
//         or U_SENTINEL when all rows have been consumed
UChar32
FullCaseFoldingIterator::next(UnicodeString &full) {
    // Nothing left (or an empty table): return before touching memory behind the last row.
    // This also makes repeated calls after the end safe.
    if(currentRow>=unfoldRows) { return U_SENTINEL; }
    // Advance past the last-delivered code point.
    const UChar *p=unfold+(currentRow*unfoldRowWidth);
    if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
        // The current row has no more code points: continue with the next row.
        ++currentRow;
        p+=unfoldRowWidth;
        rowCpIndex=unfoldStringWidth;
    }
    if(currentRow>=unfoldRows) { return U_SENTINEL; }
    // Set "full" to the NUL-terminated string in the first unfold column.
    int32_t length=unfoldStringWidth;
    while(length>0 && p[length-1]==0) { --length; }
    full.setTo(false, p, length);
    // Return the code point.
    UChar32 c;
    U16_NEXT_UNSAFE(p, rowCpIndex, c);
    return c;
}
440
441
// Case mapping tables for the code points below LIMIT.
// Each initializer list below has 24 rows of 16 entries (0x180 values), one entry per code point.
// An entry appears to be the signed delta to add to the code point to get the mapped
// code point (for example 32 at U+0041..U+005A in the lowercasing tables, -32 at
// U+0061..U+007A in the uppercasing tables), and EXC marks code points that are not
// handled by a plain delta.
// NOTE(review): LIMIT and EXC are declared outside this part of the file — confirm their
// exact values and the EXC semantics at the declaration.
namespace LatinCase {
442
443
// Lowercasing deltas without language-specific behavior.
const int8_t TO_LOWER_NORMAL[LIMIT] = {
444
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
445
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
446
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
447
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
448
449
    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
450
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
451
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
452
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
453
454
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
455
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
456
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458
459
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
460
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
461
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
462
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
463
464
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
465
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
466
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
467
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
468
469
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
470
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
471
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
472
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
473
};
474
475
// Lowercasing variant; judging by the name, for Turkic (TR) and Lithuanian (LT) processing.
// Differs from TO_LOWER_NORMAL only in additional EXC entries at
// U+0049, U+004A, U+00CC, U+00CD, U+0128 and U+012E.
const int8_t TO_LOWER_TR_LT[LIMIT] = {
476
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
477
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
478
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
479
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
480
481
    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
482
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
483
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485
486
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
487
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
488
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490
491
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
492
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
493
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
494
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
495
496
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
497
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
498
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
499
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
500
501
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
502
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
503
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
504
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
505
};
506
507
// Uppercasing deltas without language-specific behavior.
const int8_t TO_UPPER_NORMAL[LIMIT] = {
508
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
509
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
510
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
511
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
512
513
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
514
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
515
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
516
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
517
518
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
519
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
520
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
521
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
522
523
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
524
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
525
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
526
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
527
528
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
529
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
530
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
531
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
532
533
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
534
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
535
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
536
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
537
};
538
539
// Uppercasing variant; judging by the name, for Turkic (TR) processing.
// Differs from TO_UPPER_NORMAL only in the EXC entry at U+0069.
const int8_t TO_UPPER_TR[LIMIT] = {
540
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
541
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
542
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
543
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
544
545
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
546
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
548
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
549
550
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
553
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
554
555
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
556
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
557
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
558
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
559
560
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
561
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
562
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
563
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
564
565
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
566
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
567
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
568
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
569
};
570
571
}  // namespace LatinCase
572
573
U_NAMESPACE_END
574
575
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
576
U_CAPI int32_t U_EXPORT2
577
9.73M
ucase_getType(UChar32 c) {
578
9.73M
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
579
9.73M
    return UCASE_GET_TYPE(props);
580
9.73M
}
581
582
/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
583
U_CAPI int32_t U_EXPORT2
584
6.17M
ucase_getTypeOrIgnorable(UChar32 c) {
585
6.17M
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
586
6.17M
    return UCASE_GET_TYPE_AND_IGNORABLE(props);
587
6.17M
}
588
589
/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
590
static inline int32_t
591
20.7k
getDotType(UChar32 c) {
592
20.7k
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
593
20.7k
    if(!UCASE_HAS_EXCEPTION(props)) {
594
15.7k
        return props&UCASE_DOT_MASK;
595
15.7k
    } else {
596
5.06k
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
597
5.06k
        return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
598
5.06k
    }
599
20.7k
}
600
601
/** @return true if the dot type of c is UCASE_SOFT_DOTTED */
U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(UChar32 c) {
    const int32_t dotType=getDotType(c);
    return (UBool)(dotType==UCASE_SOFT_DOTTED);
}
605
606
/**
 * Tests the case-sensitive flag of c: UCASE_SENSITIVE in the trie word, or
 * UCASE_EXC_SENSITIVE in the exceptions word for code points with exception data.
 * @return true if the flag is set
 */
U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(UCASE_HAS_EXCEPTION(props)) {
        const uint16_t excWord=*GET_EXCEPTIONS(&ucase_props_singleton, props);
        return (UBool)((excWord&UCASE_EXC_SENSITIVE)!=0);
    }
    return (UBool)((props&UCASE_SENSITIVE)!=0);
}
616
617
/* string casing ------------------------------------------------------------ */
618
619
/*
620
 * These internal functions form the core of string case mappings.
621
 * They map single code points to result code points or strings and take
622
 * all necessary conditions (context, locale ID, options) into account.
623
 *
624
 * They do not iterate over the source or write to the destination
625
 * so that the same functions are useful for non-standard string storage,
626
 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
627
 * For the same reason, the "surrounding text" context is passed in as a
628
 * UCaseContextIterator which does not make any assumptions about
629
 * the underlying storage.
630
 *
631
 * This section contains helper functions that check for conditions
632
 * in the input text surrounding the current code point
633
 * according to SpecialCasing.txt.
634
 *
635
 * Each helper function gets the index
636
 * - after the current code point if it looks at following text
637
 * - before the current code point if it looks at preceding text
638
 *
639
 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
640
 *
641
 * Final_Sigma
642
 *   C is preceded by a sequence consisting of
643
 *     a cased letter and a case-ignorable sequence,
644
 *   and C is not followed by a sequence consisting of
645
 *     an ignorable sequence and then a cased letter.
646
 *
647
 * More_Above
648
 *   C is followed by one or more characters of combining class 230 (ABOVE)
649
 *   in the combining character sequence.
650
 *
651
 * After_Soft_Dotted
652
 *   The last preceding character with combining class of zero before C
653
 *   was Soft_Dotted,
654
 *   and there is no intervening combining character class 230 (ABOVE).
655
 *
656
 * Before_Dot
657
 *   C is followed by combining dot above (U+0307).
658
 *   Any sequence of characters with a combining class that is neither 0 nor 230
659
 *   may intervene between the current character and the combining dot above.
660
 *
661
 * The erratum from 2002-10-31 adds the condition
662
 *
663
 * After_I
664
 *   The last preceding base character was an uppercase I, and there is no
665
 *   intervening combining character class 230 (ABOVE).
666
 *
667
 *   (See Jitterbug 2344 and the comments on After_I below.)
668
 *
669
 * Helper definitions in Unicode 3.2 UAX 21:
670
 *
671
 * D1. A character C is defined to be cased
672
 *     if it meets any of the following criteria:
673
 *
674
 *   - The general category of C is Titlecase Letter (Lt)
675
 *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
676
 *   - Given D = NFD(C), then it is not the case that:
677
 *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
678
 *     (This third criterion does not add any characters to the list
679
 *      for Unicode 3.2. Ignored.)
680
 *
681
 * D2. A character C is defined to be case-ignorable
682
 *     if it meets either of the following criteria:
683
 *
684
 *   - The general category of C is
685
 *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
686
 *     Letter Modifier (Lm), or Symbol Modifier (Sk)
687
 *   - C is one of the following characters
688
 *     U+0027 APOSTROPHE
689
 *     U+00AD SOFT HYPHEN (SHY)
690
 *     U+2019 RIGHT SINGLE QUOTATION MARK
691
 *            (the preferred character for apostrophe)
692
 *
693
 * D3. A case-ignorable sequence is a sequence of
694
 *     zero or more case-ignorable characters.
695
 */
696
697
431
/*
 * Case-insensitive tests for single letters, used for parsing locale IDs
 * in ucase_getCaseLocale().
 * UCASE_IS_EITHER(c, lower, upper) is the shared helper; as before,
 * the argument c may be evaluated more than once.
 */
#define UCASE_IS_EITHER(c, lower, upper) ((c)==(lower) || (c)==(upper))
#define is_d(c) UCASE_IS_EITHER(c, 'd', 'D')
#define is_e(c) UCASE_IS_EITHER(c, 'e', 'E')
#define is_i(c) UCASE_IS_EITHER(c, 'i', 'I')
#define is_l(c) UCASE_IS_EITHER(c, 'l', 'L')
#define is_r(c) UCASE_IS_EITHER(c, 'r', 'R')
#define is_t(c) UCASE_IS_EITHER(c, 't', 'T')
#define is_u(c) UCASE_IS_EITHER(c, 'u', 'U')
#define is_y(c) UCASE_IS_EITHER(c, 'y', 'Y')
#define is_z(c) UCASE_IS_EITHER(c, 'z', 'Z')

/* separator? ('_', '-', or the terminating NUL) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
709
710
/**
711
 * Requires non-NULL locale ID but otherwise does the equivalent of
712
 * checking for language codes as if uloc_getLanguage() were called:
713
 * Accepts both 2- and 3-letter codes and accepts case variants.
714
 */
715
U_CFUNC int32_t
716
7.74k
ucase_getCaseLocale(const char *locale) {
717
    /*
718
     * This function used to use uloc_getLanguage(), but the current code
719
     * removes the dependency of this low-level code on uloc implementation code
720
     * and is faster because not the whole locale ID has to be
721
     * examined and copied/transformed.
722
     *
723
     * Because this code does not want to depend on uloc, the caller must
724
     * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
725
     */
726
7.74k
    char c=*locale++;
727
    // Fastpath for English "en" which is often used for default (=root locale) case mappings,
728
    // and for Chinese "zh": Very common but no special case mapping behavior.
729
    // Then check lowercase vs. uppercase to reduce the number of comparisons
730
    // for other locales without special behavior.
731
7.74k
    if(c=='e') {
732
        /* el or ell? */
733
1.84k
        c=*locale++;
734
1.84k
        if(is_l(c)) {
735
926
            c=*locale++;
736
926
            if(is_l(c)) {
737
0
                c=*locale;
738
0
            }
739
926
            if(is_sep(c)) {
740
926
                return UCASE_LOC_GREEK;
741
926
            }
742
926
        }
743
        // en, es, ... -> root
744
5.89k
    } else if(c=='z') {
745
155
        return UCASE_LOC_ROOT;
746
155
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
747
5.74k
    } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
748
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
749
    } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
750
#else
751
#   error Unknown charset family!
752
#endif
753
        // lowercase c
754
5.74k
        if(c=='t') {
755
            /* tr or tur? */
756
195
            c=*locale++;
757
195
            if(is_u(c)) {
758
0
                c=*locale++;
759
0
            }
760
195
            if(is_r(c)) {
761
52
                c=*locale;
762
52
                if(is_sep(c)) {
763
52
                    return UCASE_LOC_TURKISH;
764
52
                }
765
52
            }
766
5.54k
        } else if(c=='a') {
767
            /* az or aze? */
768
1.50k
            c=*locale++;
769
1.50k
            if(is_z(c)) {
770
593
                c=*locale++;
771
593
                if(is_e(c)) {
772
0
                    c=*locale;
773
0
                }
774
593
                if(is_sep(c)) {
775
593
                    return UCASE_LOC_TURKISH;
776
593
                }
777
593
            }
778
4.04k
        } else if(c=='l') {
779
            /* lt or lit? */
780
590
            c=*locale++;
781
590
            if(is_i(c)) {
782
0
                c=*locale++;
783
0
            }
784
590
            if(is_t(c)) {
785
473
                c=*locale;
786
473
                if(is_sep(c)) {
787
473
                    return UCASE_LOC_LITHUANIAN;
788
473
                }
789
473
            }
790
3.45k
        } else if(c=='n') {
791
            /* nl or nld? */
792
566
            c=*locale++;
793
566
            if(is_l(c)) {
794
431
                c=*locale++;
795
431
                if(is_d(c)) {
796
0
                    c=*locale;
797
0
                }
798
431
                if(is_sep(c)) {
799
431
                    return UCASE_LOC_DUTCH;
800
431
                }
801
431
            }
802
2.88k
        } else if(c=='h') {
803
            /* hy or hye? *not* hyw */
804
110
            c=*locale++;
805
110
            if(is_y(c)) {
806
25
                c=*locale++;
807
25
                if(is_e(c)) {
808
0
                    c=*locale;
809
0
                }
810
25
                if(is_sep(c)) {
811
25
                    return UCASE_LOC_ARMENIAN;
812
25
                }
813
25
            }
814
110
        }
815
5.74k
    } else {
816
        // uppercase c
817
        // Same code as for lowercase c but also check for 'E'.
818
0
        if(c=='T') {
819
            /* tr or tur? */
820
0
            c=*locale++;
821
0
            if(is_u(c)) {
822
0
                c=*locale++;
823
0
            }
824
0
            if(is_r(c)) {
825
0
                c=*locale;
826
0
                if(is_sep(c)) {
827
0
                    return UCASE_LOC_TURKISH;
828
0
                }
829
0
            }
830
0
        } else if(c=='A') {
831
            /* az or aze? */
832
0
            c=*locale++;
833
0
            if(is_z(c)) {
834
0
                c=*locale++;
835
0
                if(is_e(c)) {
836
0
                    c=*locale;
837
0
                }
838
0
                if(is_sep(c)) {
839
0
                    return UCASE_LOC_TURKISH;
840
0
                }
841
0
            }
842
0
        } else if(c=='L') {
843
            /* lt or lit? */
844
0
            c=*locale++;
845
0
            if(is_i(c)) {
846
0
                c=*locale++;
847
0
            }
848
0
            if(is_t(c)) {
849
0
                c=*locale;
850
0
                if(is_sep(c)) {
851
0
                    return UCASE_LOC_LITHUANIAN;
852
0
                }
853
0
            }
854
0
        } else if(c=='E') {
855
            /* el or ell? */
856
0
            c=*locale++;
857
0
            if(is_l(c)) {
858
0
                c=*locale++;
859
0
                if(is_l(c)) {
860
0
                    c=*locale;
861
0
                }
862
0
                if(is_sep(c)) {
863
0
                    return UCASE_LOC_GREEK;
864
0
                }
865
0
            }
866
0
        } else if(c=='N') {
867
            /* nl or nld? */
868
0
            c=*locale++;
869
0
            if(is_l(c)) {
870
0
                c=*locale++;
871
0
                if(is_d(c)) {
872
0
                    c=*locale;
873
0
                }
874
0
                if(is_sep(c)) {
875
0
                    return UCASE_LOC_DUTCH;
876
0
                }
877
0
            }
878
0
        } else if(c=='H') {
879
            /* hy or hye? *not* hyw */
880
0
            c=*locale++;
881
0
            if(is_y(c)) {
882
0
                c=*locale++;
883
0
                if(is_e(c)) {
884
0
                    c=*locale;
885
0
                }
886
0
                if(is_sep(c)) {
887
0
                    return UCASE_LOC_ARMENIAN;
888
0
                }
889
0
            }
890
0
        }
891
0
    }
892
5.09k
    return UCASE_LOC_ROOT;
893
7.74k
}
894
895
/*
896
 * Is followed by
897
 *   {case-ignorable}* cased
898
 * ?
899
 * (dir determines looking forward/backward)
900
 * If a character is case-ignorable, it is skipped regardless of whether
901
 * it is also cased or not.
902
 */
903
static UBool
904
320k
isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
905
320k
    UChar32 c;
906
907
320k
    if(iter==NULL) {
908
0
        return false;
909
0
    }
910
911
337k
    for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
912
46.5k
        int32_t type=ucase_getTypeOrIgnorable(c);
913
46.5k
        if(type&4) {
914
            /* case-ignorable, continue with the loop */
915
29.6k
        } else if(type!=UCASE_NONE) {
916
14.6k
            return true; /* followed by cased letter */
917
14.9k
        } else {
918
14.9k
            return false; /* uncased and not case-ignorable */
919
14.9k
        }
920
46.5k
    }
921
922
290k
    return false; /* not followed by cased letter */
923
320k
}
924
925
/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
926
static UBool
927
2.91k
isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
928
2.91k
    UChar32 c;
929
2.91k
    int32_t dotType;
930
2.91k
    int8_t dir;
931
932
2.91k
    if(iter==NULL) {
933
0
        return false;
934
0
    }
935
936
3.34k
    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
937
2.67k
        dotType=getDotType(c);
938
2.67k
        if(dotType==UCASE_SOFT_DOTTED) {
939
197
            return true; /* preceded by TYPE_i */
940
2.47k
        } else if(dotType!=UCASE_OTHER_ACCENT) {
941
2.03k
            return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
942
2.03k
        }
943
2.67k
    }
944
945
678
    return false; /* not preceded by TYPE_i */
946
2.91k
}
947
948
/*
949
 * See Jitterbug 2344:
950
 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
951
 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
952
 * we made those releases compatible with Unicode 3.2 which had not fixed
953
 * a related bug in SpecialCasing.txt.
954
 *
955
 * From the Jitterbug 2344 text:
956
 * ... this bug is listed as a Unicode erratum
957
 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
958
 * <quote>
959
 * There are two errors in SpecialCasing.txt.
960
 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
961
 * 2. An incorrect context definition. Correct as follows:
962
 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
963
 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
964
 * ---
965
 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
966
 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
967
 * where the context After_I is defined as:
968
 * The last preceding base character was an uppercase I, and there is no
969
 * intervening combining character class 230 (ABOVE).
970
 * </quote>
971
 *
972
 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
973
 *
974
 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
975
 * # This matches the behavior of the canonically equivalent I-dot_above
976
 *
977
 * See also the description in this place in older versions of uchar.c (revision 1.100).
978
 *
979
 * Markus W. Scherer 2003-feb-15
980
 */
981
982
/* Is preceded by base character 'I' with no intervening cc=230 ? */
983
static UBool
984
13.0k
isPrecededBy_I(UCaseContextIterator *iter, void *context) {
985
13.0k
    UChar32 c;
986
13.0k
    int32_t dotType;
987
13.0k
    int8_t dir;
988
989
13.0k
    if(iter==NULL) {
990
0
        return false;
991
0
    }
992
993
13.8k
    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
994
6.82k
        if(c==0x49) {
995
216
            return true; /* preceded by I */
996
216
        }
997
6.61k
        dotType=getDotType(c);
998
6.61k
        if(dotType!=UCASE_OTHER_ACCENT) {
999
5.81k
            return false; /* preceded by different base character (not I), or intervening cc==230 */
1000
5.81k
        }
1001
6.61k
    }
1002
1003
7.00k
    return false; /* not preceded by I */
1004
13.0k
}
1005
1006
/* Is followed by one or more cc==230 ? */
1007
static UBool
1008
7.55k
isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
1009
7.55k
    UChar32 c;
1010
7.55k
    int32_t dotType;
1011
7.55k
    int8_t dir;
1012
1013
7.55k
    if(iter==NULL) {
1014
0
        return false;
1015
0
    }
1016
1017
8.46k
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1018
6.89k
        dotType=getDotType(c);
1019
6.89k
        if(dotType==UCASE_ABOVE) {
1020
2.43k
            return true; /* at least one cc==230 following */
1021
4.46k
        } else if(dotType!=UCASE_OTHER_ACCENT) {
1022
3.55k
            return false; /* next base character, no more cc==230 following */
1023
3.55k
        }
1024
6.89k
    }
1025
1026
1.56k
    return false; /* no more cc==230 following */
1027
7.55k
}
1028
1029
/* Is followed by a dot above (without cc==230 in between) ? */
1030
static UBool
1031
6.54k
isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1032
6.54k
    UChar32 c;
1033
6.54k
    int32_t dotType;
1034
6.54k
    int8_t dir;
1035
1036
6.54k
    if(iter==NULL) {
1037
0
        return false;
1038
0
    }
1039
1040
7.28k
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1041
4.80k
        if(c==0x307) {
1042
216
            return true;
1043
216
        }
1044
4.59k
        dotType=getDotType(c);
1045
4.59k
        if(dotType!=UCASE_OTHER_ACCENT) {
1046
3.84k
            return false; /* next base character or cc==230 in between */
1047
3.84k
        }
1048
4.59k
    }
1049
1050
2.47k
    return false; /* no dot above following */
1051
6.54k
}
1052
1053
/*
 * Full (possibly one-to-many, context- and locale-sensitive) lowercase mapping.
 *
 * c        non-negative code point to map
 * iter     context iterator used by the conditional mappings; may be NULL,
 *          in which case the context tests below answer "false"
 * context  opaque pointer handed to iter
 * pString  always reset to nullptr first; set to the result string when the
 *          mapping is a string
 * loc      a UCASE_LOC_* case locale constant
 *
 * Return value, as produced by the code below:
 *   ~c              c maps to itself
 *   a code point    single-code point mapping
 *   a string length with *pString pointing at that many UChars
 *   0               c is removed from the output
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t loc) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    UChar32 mapped=c;
    // Reset the output pointer in case it was uninitialized.
    *pString=nullptr;

    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* simple case: the delta is stored directly in the properties word */
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
            mapped=c+UCASE_GET_DELTA(props);
        }
        return (mapped==c) ? ~mapped : mapped;
    }

    const uint16_t *excData=GET_EXCEPTIONS(&ucase_props_singleton, props);
    uint16_t excWord=*excData++;
    /*
     * Two independent cursors over the slot data, as in every function of this
     * family: one for the full-mappings slot and one for the simple slots
     * (GET_SLOT_VALUE presumably repositions the pointer it is given).
     */
    const uint16_t *fullCursor=excData;
    const uint16_t *slotCursor=excData;

    if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
        /*
         * Hardcoded conditions and mappings from SpecialCasing.txt.
         * The conditional mappings are tested first (otherwise the unconditional
         * default mappings would always be taken); if none applies, the
         * UnicodeData.txt mappings further down are used.
         */
        if(loc==UCASE_LOC_LITHUANIAN) {
            /*
                # Lithuanian retains the dot in a lowercase i when followed by accents.
                # Introduce an explicit dot above when lowercasing capital I's and J's
                # whenever there are more accents above.
                # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)

                0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
                004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
                012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
                00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
                00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
                0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
             */
            switch(c) {
            /* base characters: only when accents above follow */
            case 0x49:  /* LATIN CAPITAL LETTER I */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=iDot;
                    return 2;
                }
                break;
            case 0x4a:  /* LATIN CAPITAL LETTER J */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=jDot;
                    return 2;
                }
                break;
            case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=iOgonekDot;
                    return 2;
                }
                break;
            /* precomposed with accent above, no need to find one */
            case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
                *pString=iDotGrave;
                return 3;
            case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
                *pString=iDotAcute;
                return 3;
            case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
                *pString=iDotTilde;
                return 3;
            default:
                break;
            }
        }

        if(loc==UCASE_LOC_TURKISH) {
            /* # Turkish and Azeri */
            if(c==0x130) {
                /*
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri

                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                return 0x69;
            }
            if(c==0x307 && isPrecededBy_I(iter, context)) {
                /*
                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
                    # This matches the behavior of the canonically equivalent I-dot_above

                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                 */
                return 0; /* remove the dot (continue without output) */
            }
            if(c==0x49 && !isFollowedByDotAbove(iter, context)) {
                /*
                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
                 */
                return 0x131;
            }
        }

        if(c==0x130) {
            /*
                # Preserve canonical equivalence for I with dot. Turkic is handled below.

                0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
             */
            *pString=iDot;
            return 2;
        }

        if( c==0x3a3 &&
            !isFollowedByCasedLetter(iter, context, 1) &&
            isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
        ) {
            /*
                greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt)

                03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
             */
            return 0x3c2; /* greek small final sigma */
        }

        /* no known conditional special case mapping, use a normal mapping */
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        int32_t lengths;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, fullCursor, lengths);
        const int32_t lowerLength=lengths&UCASE_FULL_LOWER;
        if(lowerLength!=0) {
            /* the lowercase string is the first one after the lengths word */
            *pString=reinterpret_cast<const UChar *>(fullCursor+1);
            return lowerLength;
        }
    }

    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slotCursor, delta);
        if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)!=0) {
            return c-delta;
        }
        return c+delta;
    }
    if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, slotCursor, mapped);
    }

    return (mapped==c) ? ~mapped : mapped;
}
1202
1203
/* internal */
1204
static int32_t
1205
toUpperOrTitle(UChar32 c,
1206
               UCaseContextIterator *iter, void *context,
1207
               const UChar **pString,
1208
               int32_t loc,
1209
20.5M
               UBool upperNotTitle) {
1210
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1211
20.5M
    U_ASSERT(c >= 0);
1212
20.5M
    UChar32 result=c;
1213
    // Reset the output pointer in case it was uninitialized.
1214
20.5M
    *pString=nullptr;
1215
20.5M
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1216
20.5M
    if(!UCASE_HAS_EXCEPTION(props)) {
1217
19.6M
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1218
867k
            result=c+UCASE_GET_DELTA(props);
1219
867k
        }
1220
19.6M
    } else {
1221
862k
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1222
862k
        uint16_t excWord=*pe++;
1223
862k
        int32_t full, idx;
1224
1225
862k
        pe2=pe;
1226
1227
862k
        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1228
            /* use hardcoded conditions and mappings */
1229
623k
            if(loc==UCASE_LOC_TURKISH && c==0x69) {
1230
                /*
1231
                    # Turkish and Azeri
1232
1233
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1234
                    # The following rules handle those cases.
1235
1236
                    # When uppercasing, i turns into a dotted capital I
1237
1238
                    0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1239
                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1240
                */
1241
2.21k
                return 0x130;
1242
621k
            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1243
                /*
1244
                    # Lithuanian
1245
1246
                    # Lithuanian retains the dot in a lowercase i when followed by accents.
1247
1248
                    # Remove DOT ABOVE after "i" with upper or titlecase
1249
1250
                    0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1251
                 */
1252
197
                return 0; /* remove the dot (continue without output) */
1253
621k
            } else if(c==0x0587) {
1254
                // See ICU-13416:
1255
                // և ligature ech-yiwn
1256
                // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1257
                // but to ԵՎ=ech+vew in Eastern Armenian.
1258
6.86k
                if(loc==UCASE_LOC_ARMENIAN) {
1259
515
                    *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1260
6.34k
                } else {
1261
6.34k
                    *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1262
6.34k
                }
1263
6.86k
                return 2;
1264
614k
            } else {
1265
                /* no known conditional special case mapping, use a normal mapping */
1266
614k
            }
1267
623k
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1268
84.1k
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1269
1270
            /* start of full case mapping strings */
1271
84.1k
            ++pe;
1272
1273
            /* skip the lowercase and case-folding result strings */
1274
84.1k
            pe+=full&UCASE_FULL_LOWER;
1275
84.1k
            full>>=4;
1276
84.1k
            pe+=full&0xf;
1277
84.1k
            full>>=4;
1278
1279
84.1k
            if(upperNotTitle) {
1280
61.3k
                full&=0xf;
1281
61.3k
            } else {
1282
                /* skip the uppercase result string */
1283
22.7k
                pe+=full&0xf;
1284
22.7k
                full=(full>>4)&0xf;
1285
22.7k
            }
1286
1287
84.1k
            if(full!=0) {
1288
                /* set the output pointer to the result string */
1289
76.3k
                *pString=reinterpret_cast<const UChar *>(pe);
1290
1291
                /* return the string length */
1292
76.3k
                return full;
1293
76.3k
            }
1294
84.1k
        }
1295
1296
777k
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1297
150k
            int32_t delta;
1298
150k
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1299
150k
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1300
150k
        }
1301
627k
        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1302
1.38k
            idx=UCASE_EXC_TITLE;
1303
625k
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1304
            /* here, titlecase is same as uppercase */
1305
98.0k
            idx=UCASE_EXC_UPPER;
1306
527k
        } else {
1307
527k
            return ~c;
1308
527k
        }
1309
627k
        GET_SLOT_VALUE(excWord, idx, pe2, result);
1310
99.3k
    }
1311
1312
19.7M
    return (result==c) ? ~result : result;
1313
20.5M
}
1314
1315
/* Full uppercase mapping: toUpperOrTitle() in uppercase mode. */
U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t caseLocale) {
    const UBool upperNotTitle=true;
    return toUpperOrTitle(c, iter, context, pString, caseLocale, upperNotTitle);
}
1322
1323
/* Full titlecase mapping: toUpperOrTitle() in titlecase mode. */
U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t caseLocale) {
    const UBool upperNotTitle=false;
    return toUpperOrTitle(c, iter, context, pString, caseLocale, upperNotTitle);
}
1330
1331
/* case folding ------------------------------------------------------------- */
1332
1333
/*
1334
 * Case folding is similar to lowercasing.
1335
 * The result may be a simple mapping, i.e., a single code point, or
1336
 * a full mapping, i.e., a string.
1337
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1338
 * then only the lowercase mapping is stored.
1339
 *
1340
 * Some special cases are hardcoded because their conditions cannot be
1341
 * parsed and processed from CaseFolding.txt.
1342
 *
1343
 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1344
1345
# C: common case folding, common mappings shared by both simple and full mappings.
1346
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1347
# S: simple case folding, mappings to single characters where different from F.
1348
# T: special case for uppercase I and dotted uppercase I
1349
#    - For non-Turkic languages, this mapping is normally not used.
1350
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1351
#
1352
# Usage:
1353
#  A. To do a simple case folding, use the mappings with status C + S.
1354
#  B. To do a full case folding, use the mappings with status C + F.
1355
#
1356
#    The mappings with status T can be used or omitted depending on the desired case-folding
1357
#    behavior. (The default option is to exclude them.)
1358
1359
 * Unicode 3.2 has 'T' mappings as follows:
1360
1361
0049; T; 0131; # LATIN CAPITAL LETTER I
1362
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1363
1364
 * while the default mappings for these code points are:
1365
1366
0049; C; 0069; # LATIN CAPITAL LETTER I
1367
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1368
1369
 * U+0130 has no simple case folding (simple-case-folds to itself).
1370
 */
1371
1372
/* return the simple case folding mapping for c */
1373
U_CAPI UChar32 U_EXPORT2
1374
0
ucase_fold(UChar32 c, uint32_t options) {
1375
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1376
0
    if(!UCASE_HAS_EXCEPTION(props)) {
1377
0
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
1378
0
            c+=UCASE_GET_DELTA(props);
1379
0
        }
1380
0
    } else {
1381
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1382
0
        uint16_t excWord=*pe++;
1383
0
        int32_t idx;
1384
0
        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1385
            /* special case folding mappings, hardcoded */
1386
0
            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1387
                /* default mappings */
1388
0
                if(c==0x49) {
1389
                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1390
0
                    return 0x69;
1391
0
                } else if(c==0x130) {
1392
                    /* no simple case folding for U+0130 */
1393
0
                    return c;
1394
0
                }
1395
0
            } else {
1396
                /* Turkic mappings */
1397
0
                if(c==0x49) {
1398
                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1399
0
                    return 0x131;
1400
0
                } else if(c==0x130) {
1401
                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1402
0
                    return 0x69;
1403
0
                }
1404
0
            }
1405
0
        }
1406
0
        if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1407
0
            return c;
1408
0
        }
1409
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1410
0
            int32_t delta;
1411
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1412
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1413
0
        }
1414
0
        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1415
0
            idx=UCASE_EXC_FOLD;
1416
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1417
0
            idx=UCASE_EXC_LOWER;
1418
0
        } else {
1419
0
            return c;
1420
0
        }
1421
0
        GET_SLOT_VALUE(excWord, idx, pe, c);
1422
0
    }
1423
0
    return c;
1424
0
}
1425
1426
/*
1427
 * Issue for canonical caseless match (UAX #21):
1428
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1429
 * canonical equivalence, unlike default-option casefolding.
1430
 * For example, I-grave and I + grave fold to strings that are not canonically
1431
 * equivalent.
1432
 * For more details, see the comment in unorm_compare() in unorm.cpp
1433
 * and the intermediate prototype changes for Jitterbug 2021.
1434
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1435
 *
1436
 * This did not get fixed because it appears that it is not possible to fix
1437
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1438
 * together in a way that they still fold to common result strings.
1439
 */
1440
1441
/*
 * Full (possibly one-to-many) case folding mapping.
 *
 * c        non-negative code point to fold
 * pString  always reset to nullptr first; set to the result string when the
 *          folding is a string
 * options  only the bits in _FOLD_CASE_OPTIONS_MASK are examined; the value
 *          U_FOLD_CASE_DEFAULT selects the default mappings, anything else the
 *          Turkic ('T') mappings for U+0049 and U+0130
 *
 * Return value, as produced by the code below:
 *   ~c              c folds to itself
 *   a code point    single-code point folding
 *   a string length with *pString pointing at that many UChars
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,
                    const UChar **pString,
                    uint32_t options) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    UChar32 folded=c;
    // Reset the output pointer in case it was uninitialized.
    *pString=nullptr;

    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        /* simple case: the delta is stored directly in the properties word */
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
            folded=c+UCASE_GET_DELTA(props);
        }
        return (folded==c) ? ~folded : folded;
    }

    const uint16_t *excData=GET_EXCEPTIONS(&ucase_props_singleton, props);
    uint16_t excWord=*excData++;
    /*
     * Separate cursors for the full-mappings slot and for the simple slots
     * (GET_SLOT_VALUE presumably repositions the pointer it is given).
     */
    const uint16_t *fullCursor=excData;
    const uint16_t *slotCursor=excData;

    if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
        /* use hardcoded conditions and mappings */
        const bool turkic=(options&_FOLD_CASE_OPTIONS_MASK)!=U_FOLD_CASE_DEFAULT;
        if(c==0x49) {
            /*
             * default: 0049; C; 0069; # LATIN CAPITAL LETTER I
             * Turkic:  0049; T; 0131; # LATIN CAPITAL LETTER I
             */
            return turkic ? 0x131 : 0x69;
        }
        if(c==0x130) {
            if(turkic) {
                /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                return 0x69;
            }
            /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
            *pString=iDot;
            return 2;
        }
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        int32_t lengths;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, fullCursor, lengths);

        /*
         * The lowest 4 bits hold the lowercase string length, the next 4 bits
         * the case folding string length; the strings follow the slot in that order.
         */
        const int32_t lowerLength=lengths&UCASE_FULL_LOWER;
        const int32_t foldLength=(lengths>>4)&0xf;

        if(foldLength!=0) {
            /* start of the strings, then skip the lowercase result string */
            *pString=reinterpret_cast<const UChar *>(fullCursor+1+lowerLength);

            /* return the string length */
            return foldLength;
        }
    }

    if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
        return ~c;
    }
    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slotCursor, delta);
        if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)!=0) {
            return c-delta;
        }
        return c+delta;
    }

    /* prefer a dedicated folding slot, fall back to the lowercase slot */
    int32_t slotIndex;
    if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
        slotIndex=UCASE_EXC_FOLD;
    } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        slotIndex=UCASE_EXC_LOWER;
    } else {
        return ~c;
    }
    GET_SLOT_VALUE(excWord, slotIndex, slotCursor, folded);

    return (folded==c) ? ~folded : folded;
}
1523
1524
/* case mapping properties API ---------------------------------------------- */
1525
1526
/* public API (see uchar.h) */
1527
1528
/*
 * Returns a true value exactly when ucase_getType() reports
 * UCASE_LOWER for the code point c.
 */
U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c) {
    return static_cast<UBool>(ucase_getType(c)==UCASE_LOWER);
}
1532
1533
/*
 * Returns a true value exactly when ucase_getType() reports
 * UCASE_UPPER for the code point c.
 */
U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c) {
    return static_cast<UBool>(ucase_getType(c)==UCASE_UPPER);
}
1537
1538
/* Transforms the Unicode character to its lower case equivalent.*/
1539
U_CAPI UChar32 U_EXPORT2
1540
0
u_tolower(UChar32 c) {
1541
0
    return ucase_tolower(c);
1542
0
}
1543
1544
/* Transforms the Unicode character to its upper case equivalent.*/
1545
U_CAPI UChar32 U_EXPORT2
1546
0
u_toupper(UChar32 c) {
1547
0
    return ucase_toupper(c);
1548
0
}
1549
1550
/* Transforms the Unicode character to its title case equivalent.*/
1551
U_CAPI UChar32 U_EXPORT2
1552
0
u_totitle(UChar32 c) {
1553
0
    return ucase_totitle(c);
1554
0
}
1555
1556
/* return the simple case folding mapping for c */
1557
U_CAPI UChar32 U_EXPORT2
1558
0
u_foldCase(UChar32 c, uint32_t options) {
1559
0
    return ucase_fold(c, options);
1560
0
}
1561
1562
/*
 * Evaluates one of the case-related binary properties for code point c.
 *
 * `which` selects the property. Properties not listed in the switch
 * yield false. The result is a boolean value widened to int32_t.
 */
U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c, UProperty which) {
    /* Receives the full-mapping string pointer; its contents are never read here. */
    const UChar *ignoredMapping;

    switch(which) {
    /* properties answered directly from the case type / property bits */
    case UCHAR_LOWERCASE:
        return static_cast<UBool>(ucase_getType(c)==UCASE_LOWER);
    case UCHAR_UPPERCASE:
        return static_cast<UBool>(ucase_getType(c)==UCASE_UPPER);
    case UCHAR_SOFT_DOTTED:
        return ucase_isSoftDotted(c);
    case UCHAR_CASE_SENSITIVE:
        return ucase_isCaseSensitive(c);
    case UCHAR_CASED:
        return static_cast<UBool>(ucase_getType(c)!=UCASE_NONE);
    case UCHAR_CASE_IGNORABLE:
        /* the bits above the two low bits carry the case-ignorable flag */
        return static_cast<UBool>(ucase_getTypeOrIgnorable(c)>>2);

    /*
     * The Changes_When_Xyz properties are formally defined in terms of
     * whether the NFD form of the input changes under Xyz case mapping.
     * This implementation skips NFD and looks only at the code point
     * itself, which currently passes the tests.
     * Should the tests start failing, this needs to be reworked:
     * keep using the per-code point ucase_toFullXyz() functions as an
     * optimization unless the NFD form has more than one code point,
     * and make the property starts set the union of the start sets
     * for normalization and for case mappings.
     *
     * A non-negative result from ucase_toFullXyz() is treated as
     * "the code point changes".
     */
    case UCHAR_CHANGES_WHEN_LOWERCASED:
        return static_cast<UBool>(
            0<=ucase_toFullLower(c, NULL, NULL, &ignoredMapping, UCASE_LOC_ROOT));
    case UCHAR_CHANGES_WHEN_UPPERCASED:
        return static_cast<UBool>(
            0<=ucase_toFullUpper(c, NULL, NULL, &ignoredMapping, UCASE_LOC_ROOT));
    case UCHAR_CHANGES_WHEN_TITLECASED:
        return static_cast<UBool>(
            0<=ucase_toFullTitle(c, NULL, NULL, &ignoredMapping, UCASE_LOC_ROOT));
    /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
    case UCHAR_CHANGES_WHEN_CASEMAPPED:
        /* try lower, then upper, then title; stop at the first that changes c */
        if(0<=ucase_toFullLower(c, NULL, NULL, &ignoredMapping, UCASE_LOC_ROOT)) {
            return true;
        }
        if(0<=ucase_toFullUpper(c, NULL, NULL, &ignoredMapping, UCASE_LOC_ROOT)) {
            return true;
        }
        if(0<=ucase_toFullTitle(c, NULL, NULL, &ignoredMapping, UCASE_LOC_ROOT)) {
            return true;
        }
        return false;
    default:
        break;
    }
    return false;
}