Coverage Report

Created: 2021-08-22 09:07

/src/skia/third_party/externals/icu/source/common/ucase.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2004-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  ucase.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2004aug30
16
*   created by: Markus W. Scherer
17
*
18
*   Low-level Unicode character/string case mapping code.
19
*   Much code moved here (and modified) from uchar.c.
20
*/
21
22
#include "unicode/utypes.h"
23
#include "unicode/unistr.h"
24
#include "unicode/uset.h"
25
#include "unicode/udata.h" /* UDataInfo */
26
#include "unicode/utf16.h"
27
#include "ucmndata.h" /* DataHeader */
28
#include "udatamem.h"
29
#include "umutex.h"
30
#include "uassert.h"
31
#include "cmemory.h"
32
#include "utrie2.h"
33
#include "ucase.h"
34
35
struct UCaseProps {
36
    UDataMemory *mem;
37
    const int32_t *indexes;
38
    const uint16_t *exceptions;
39
    const uint16_t *unfold;
40
41
    UTrie2 trie;
42
    uint8_t formatVersion[4];
43
};
44
45
/* ucase_props_data.h is machine-generated by gencase --csource */
46
#define INCLUDED_FROM_UCASE_CPP
47
#include "ucase_props_data.h"
48
49
/* set of property starts for UnicodeSet ------------------------------------ */
50
51
/*
 * Range callback handed to utrie2_enum() by ucase_addPropertyStarts().
 * context is the caller's USetAdder; only the start of the range is used,
 * the range end and the range value are ignored.
 * Always returns TRUE.
 */
static UBool U_CALLCONV
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    const USetAdder *adder=static_cast<const USetAdder *>(context);
    /* record where this same-value range begins */
    adder->add(adder->set, start);
    return TRUE;
}
58
59
/*
 * Adds the start code point of every same-value range of the case
 * properties trie to the set behind sa.
 * Does nothing when *pErrorCode already indicates a failure.
 */
U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
    if(!U_FAILURE(*pErrorCode)) {
        /* one callback per same-value range of the trie */
        utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
    }

    /*
     * Code points with hardcoded properties (and the ones following them)
     * would be added here; there are none right now.
     *
     * Code points with hardcoded specialcasing properties are omitted
     * because no property UnicodeSets are built for them right now.
     */
}
77
78
/* data access primitives --------------------------------------------------- */
79
80
/* Returns a pointer to the trie of the built-in case properties singleton. */
U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie() {
    const UTrie2 *trie=&ucase_props_singleton.trie;
    return trie;
}
84
85
0
/* Pointer to the exceptions words for a properties word that has an exception. */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])
86
87
/* number of bits in an 8-bit integer value */
88
static const uint8_t flagsOffset[256]={
89
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
90
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
91
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
92
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
93
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
94
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
95
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
96
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
97
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
98
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
99
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
100
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
101
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
102
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
103
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
104
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
105
};
106
107
0
/* Non-zero if bit idx is set in flags, i.e., the optional slot idx is present. */
#define HAS_SLOT(flags, idx) ((1<<(idx))&(flags))
/* Number of flag bits set below bit idx (flagsOffset[] is a bit-count table). */
#define SLOT_OFFSET(flags, idx) flagsOffset[((1<<(idx))-1)&(flags)]
109
110
/*
111
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
112
 *
113
 * @param excWord (in) initial exceptions word
114
 * @param idx (in) desired slot index
115
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
116
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
117
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
118
 */
119
0
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)!=0) { \
        /* two uint16_t per slot: high half first, then low half */ \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16++; \
        (value)=((value)<<16)|*pExc16; \
    } else { \
        /* one uint16_t per slot */ \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16; \
    } \
} UPRV_BLOCK_MACRO_END
129
130
/* simple case mappings ----------------------------------------------------- */
131
132
/*
 * Simple (single code point) lowercase mapping.
 * Returns c itself when the data holds no lowercase mapping for it.
 */
U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(UCASE_HAS_EXCEPTION(props)) {
        const uint16_t *excPtr=GET_EXCEPTIONS(&ucase_props_singleton, props);
        const uint16_t excWord=*excPtr++;

        /* a delta slot applies only to uppercase/titlecase characters */
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, excPtr, delta);
            if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)!=0) {
                return c-delta;
            }
            return c+delta;
        }
        /* otherwise use the explicit lowercase slot, if there is one */
        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, excPtr, c);
        }
        return c;
    }

    /* no exception: the delta is stored directly in the properties word */
    if(UCASE_IS_UPPER_OR_TITLE(props)) {
        c+=UCASE_GET_DELTA(props);
    }
    return c;
}
153
154
/*
 * Simple (single code point) uppercase mapping.
 * Returns c itself when the data holds no uppercase mapping for it.
 */
U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(UCASE_HAS_EXCEPTION(props)) {
        const uint16_t *excPtr=GET_EXCEPTIONS(&ucase_props_singleton, props);
        const uint16_t excWord=*excPtr++;

        /* a delta slot applies only to lowercase characters */
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, excPtr, delta);
            if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)!=0) {
                return c-delta;
            }
            return c+delta;
        }
        /* otherwise use the explicit uppercase slot, if there is one */
        if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, excPtr, c);
        }
        return c;
    }

    /* no exception: the delta is stored directly in the properties word */
    if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
        c+=UCASE_GET_DELTA(props);
    }
    return c;
}
175
176
/*
 * Simple (single code point) titlecase mapping.
 * With exception data, an explicit titlecase slot wins over the uppercase
 * slot. Returns c itself when neither mapping is available.
 */
U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(UCASE_HAS_EXCEPTION(props)) {
        const uint16_t *excPtr=GET_EXCEPTIONS(&ucase_props_singleton, props);
        const uint16_t excWord=*excPtr++;

        /* a delta slot applies only to lowercase characters */
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, excPtr, delta);
            if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)!=0) {
                return c-delta;
            }
            return c+delta;
        }
        /* prefer the titlecase slot, fall back to the uppercase slot */
        if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_TITLE, excPtr, c);
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, excPtr, c);
        }
        return c;
    }

    /* no exception: the delta is stored directly in the properties word */
    if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
        c+=UCASE_GET_DELTA(props);
    }
    return c;
}
203
204
static const UChar iDot[2] = { 0x69, 0x307 };
205
static const UChar jDot[2] = { 0x6a, 0x307 };
206
static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
207
static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
208
static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
209
static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
210
211
212
/*
 * Adds to the set behind sa the code points and strings that are
 * case-equivalent to c: its simple case mappings, its delta mapping,
 * its full case folding string, and the code points of its closure string.
 * c itself is not added here.
 *
 * @param c  the code point whose case closure is added
 * @param sa set adder; its add() and addString() callbacks receive the results
 */
U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
    /*
     * Hardcode the case closure of i and its relatives and ignore the
     * data file data for these characters.
     * The Turkic dotless i and dotted I with their case mapping conditions
     * and case folding option make the related characters behave specially.
     * This code matches their closure behavior to their case folding behavior.
     */
    switch(c) {
    case 0x49:
        /* regular i and I are in one equivalence class */
        sa->add(sa->set, 0x69);
        return;
    case 0x69:
        sa->add(sa->set, 0x49);
        return;
    case 0x130:
        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
        sa->addString(sa->set, iDot, 2);
        return;
    case 0x131:
        /* dotless i is in a class by itself */
        return;
    default:
        /* otherwise use the data file data */
        break;
    }

    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!UCASE_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
            /* add the one simple case mapping, no matter what type it is */
            int32_t delta=UCASE_GET_DELTA(props);
            if(delta!=0) {
                sa->add(sa->set, c+delta);
            }
        }
    } else {
        /*
         * c has exceptions, so there may be multiple simple and/or
         * full case mappings. Add them all.
         */
        const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
        const UChar *closure;
        uint16_t excWord=*pe++;
        int32_t idx, closureLength, fullLength, length;

        /* every slot lookup below restarts from the first slot */
        pe0=pe;

        /*
         * Add all simple case mappings.
         * Each slot value goes into its own variable: c must stay intact
         * because the delta mapping below is relative to the original c.
         */
        for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
            if(HAS_SLOT(excWord, idx)) {
                UChar32 mapping;
                pe=pe0;
                GET_SLOT_VALUE(excWord, idx, pe, mapping);
                sa->add(sa->set, mapping);
            }
        }
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
            pe=pe0;
            int32_t delta;
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
            sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
        }

        /* get the closure string pointer & length */
        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
            pe=pe0;
            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
            closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
        } else {
            closureLength=0;
            closure=NULL;
        }

        /* add the full case folding */
        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            pe=pe0;
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);

            /* start of full case mapping strings */
            ++pe;

            fullLength&=0xffff; /* bits 16 and higher are reserved */

            /* skip the lowercase result string */
            pe+=fullLength&UCASE_FULL_LOWER;
            fullLength>>=4;

            /* add the full case folding string */
            length=fullLength&0xf;
            if(length!=0) {
                sa->addString(sa->set, (const UChar *)pe, length);
                pe+=length;
            }

            /* skip the uppercase and titlecase strings */
            fullLength>>=4;
            pe+=fullLength&0xf;
            fullLength>>=4;
            pe+=fullLength;

            closure=(const UChar *)pe; /* behind full case mappings */
        }

        /* add each code point in the closure string */
        for(idx=0; idx<closureLength;) {
            UChar32 closureCp;
            U16_NEXT_UNSAFE(closure, idx, closureCp);
            sa->add(sa->set, closureCp);
        }
    }
}
328
329
/*
330
 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
331
 * must be length>0 and max>0 and length<=max
332
 */
333
static inline int32_t
334
0
strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
335
0
    int32_t c1, c2;
336
337
0
    max-=length; /* we require length<=max, so no need to decrement max in the loop */
338
0
    do {
339
0
        c1=*s++;
340
0
        c2=*t++;
341
0
        if(c2==0) {
342
0
            return 1; /* reached the end of t but not of s */
343
0
        }
344
0
        c1-=c2;
345
0
        if(c1!=0) {
346
0
            return c1; /* return difference result */
347
0
        }
348
0
    } while(--length>0);
349
    /* ends with length==0 */
350
351
0
    if(max==0 || *t==0) {
352
0
        return 0; /* equal to length of both strings */
353
0
    } else {
354
0
        return -max; /* return lengh difference */
355
0
    }
356
0
}
357
358
/*
 * Looks up the string s[0..length[ in the reverse case folding ("unfold")
 * table. On a match, adds every code point listed in the matching row,
 * plus the case closure of each of them, to the set behind sa.
 *
 * @return TRUE if the string was found in the table, otherwise FALSE
 *         (also FALSE when there is no table, s is NULL, or the length
 *         rules out a match)
 */
U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
    const uint16_t *table=ucase_props_singleton.unfold;
    if(table==NULL || s==NULL) {
        return FALSE; /* no reverse case folding data, or no string */
    }
    if(length<=1) {
        /*
         * The string is too short to find any match.
         * More precise would be
         *   if(!u_strHasMoreChar32Than(s, length, 1))
         * but this does not make much practical difference because
         * a single supplementary code point would just not be found.
         */
        return FALSE;
    }

    /* the header values come first, the rows start one row width in */
    const int32_t rowCount=table[UCASE_UNFOLD_ROWS];
    const int32_t rowWidth=table[UCASE_UNFOLD_ROW_WIDTH];
    const int32_t stringWidth=table[UCASE_UNFOLD_STRING_WIDTH];
    table+=rowWidth;

    if(length>stringWidth) {
        return FALSE; /* the string is too long to find any match */
    }

    /* binary search over the rows, keyed by the string column */
    int32_t low=0;
    int32_t high=rowCount;
    while(low<high) {
        const int32_t mid=(low+high)/2;
        const UChar *row=reinterpret_cast<const UChar *>(table+(mid*rowWidth));
        const int32_t cmp=strcmpMax(s, length, row, stringWidth);

        if(cmp<0) {
            high=mid;
        } else if(cmp>0) {
            low=mid+1;
        } else {
            /* found the string: add each code point, and its case closure */
            int32_t pos=stringWidth;
            while(pos<rowWidth && row[pos]!=0) {
                UChar32 c;
                U16_NEXT_UNSAFE(row, pos, c);
                sa->add(sa->set, c);
                ucase_addCaseClosure(c, sa);
            }
            return TRUE;
        }
    }

    return FALSE; /* string not found */
}
414
415
U_NAMESPACE_BEGIN
416
417
/*
 * Starts the iteration at the first row of the reverse case folding table.
 * The header values are read through the unfold member, so this relies on
 * unfold being initialized before the other members
 * (NOTE(review): member declaration order is not visible here -- confirm).
 * No NULL check is done on ucase_props_singleton.unfold.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    /* step over the header so that unfold points at the first row */
    unfold=unfold+unfoldRowWidth;
}
426
427
/*
 * Delivers the next (code point, full case folding string) pair.
 *
 * @param full set to the string in the first column of the current row;
 *             setTo() is called with the table memory, no copy is requested here
 * @return the next code point of the current row, or U_SENTINEL once all
 *         rows have been delivered; further calls keep returning U_SENTINEL
 */
UChar32
FullCaseFoldingIterator::next(UnicodeString &full) {
    // Already finished (or empty table): do not look at memory behind the last row.
    if(currentRow>=unfoldRows) { return U_SENTINEL; }
    // Advance past the last-delivered code point.
    const UChar *p=unfold+(currentRow*unfoldRowWidth);
    if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
        ++currentRow;
        p+=unfoldRowWidth;
        rowCpIndex=unfoldStringWidth;
    }
    if(currentRow>=unfoldRows) { return U_SENTINEL; }
    // Set "full" to the NUL-terminated string in the first unfold column.
    int32_t length=unfoldStringWidth;
    while(length>0 && p[length-1]==0) { --length; }
    full.setTo(FALSE, p, length);
    // Return the code point.
    UChar32 c;
    U16_NEXT_UNSAFE(p, rowCpIndex, c);
    return c;
}
446
447
namespace LatinCase {
448
449
const int8_t TO_LOWER_NORMAL[LIMIT] = {
450
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
451
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
452
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
453
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
454
455
    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
456
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
459
460
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
461
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
462
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
463
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
464
465
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
466
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
467
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
468
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
469
470
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
471
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
472
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
473
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
474
475
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
476
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
477
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
478
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
479
};
480
481
const int8_t TO_LOWER_TR_LT[LIMIT] = {
482
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
483
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
486
487
    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
488
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
489
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
491
492
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
494
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
495
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
496
497
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
498
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
499
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
500
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
501
502
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
503
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
504
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
505
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
506
507
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
508
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
509
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
510
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
511
};
512
513
const int8_t TO_UPPER_NORMAL[LIMIT] = {
514
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
515
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
516
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
517
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
518
519
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
520
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
521
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
522
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
523
524
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
525
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
526
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
527
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
528
529
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
530
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
531
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
532
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
533
534
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
535
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
536
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
537
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
538
539
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
540
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
541
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
542
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
543
};
544
545
const int8_t TO_UPPER_TR[LIMIT] = {
546
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
548
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
549
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
550
551
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
553
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
554
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
555
556
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
557
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
560
561
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
562
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
563
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
564
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
565
566
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
567
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
568
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
569
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
570
571
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
572
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
573
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
574
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
575
};
576
577
}  // namespace LatinCase
578
579
U_NAMESPACE_END
580
581
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
582
U_CAPI int32_t U_EXPORT2
583
0
ucase_getType(UChar32 c) {
584
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
585
0
    return UCASE_GET_TYPE(props);
586
0
}
587
588
/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
589
U_CAPI int32_t U_EXPORT2
590
0
ucase_getTypeOrIgnorable(UChar32 c) {
591
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
592
0
    return UCASE_GET_TYPE_AND_IGNORABLE(props);
593
0
}
594
595
/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
596
static inline int32_t
597
0
getDotType(UChar32 c) {
598
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
599
0
    if(!UCASE_HAS_EXCEPTION(props)) {
600
0
        return props&UCASE_DOT_MASK;
601
0
    } else {
602
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
603
0
        return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
604
0
    }
605
0
}
606
607
/* Returns whether the dot type of c is UCASE_SOFT_DOTTED. */
U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(UChar32 c) {
    const int32_t dotType=getDotType(c);
    return (UBool)(dotType==UCASE_SOFT_DOTTED);
}
611
612
/*
 * Returns whether c is flagged as case-sensitive: UCASE_EXC_SENSITIVE in its
 * exceptions word, or UCASE_SENSITIVE in its properties word when c has no
 * exception.
 */
U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(UChar32 c) {
    const uint16_t propsWord=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(UCASE_HAS_EXCEPTION(propsWord)) {
        const uint16_t *excPtr=GET_EXCEPTIONS(&ucase_props_singleton, propsWord);
        return (UBool)((*excPtr&UCASE_EXC_SENSITIVE)!=0);
    }
    return (UBool)((propsWord&UCASE_SENSITIVE)!=0);
}
622
623
/* string casing ------------------------------------------------------------ */
624
625
/*
626
 * These internal functions form the core of string case mappings.
627
 * They map single code points to result code points or strings and take
628
 * all necessary conditions (context, locale ID, options) into account.
629
 *
630
 * They do not iterate over the source or write to the destination
631
 * so that the same functions are useful for non-standard string storage,
632
 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
633
 * For the same reason, the "surrounding text" context is passed in as a
634
 * UCaseContextIterator which does not make any assumptions about
635
 * the underlying storage.
636
 *
637
 * This section contains helper functions that check for conditions
638
 * in the input text surrounding the current code point
639
 * according to SpecialCasing.txt.
640
 *
641
 * Each helper function gets the index
642
 * - after the current code point if it looks at following text
643
 * - before the current code point if it looks at preceding text
644
 *
645
 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
646
 *
647
 * Final_Sigma
648
 *   C is preceded by a sequence consisting of
649
 *     a cased letter and a case-ignorable sequence,
650
 *   and C is not followed by a sequence consisting of
651
 *     an ignorable sequence and then a cased letter.
652
 *
653
 * More_Above
654
 *   C is followed by one or more characters of combining class 230 (ABOVE)
655
 *   in the combining character sequence.
656
 *
657
 * After_Soft_Dotted
658
 *   The last preceding character with combining class of zero before C
659
 *   was Soft_Dotted,
660
 *   and there is no intervening combining character class 230 (ABOVE).
661
 *
662
 * Before_Dot
663
 *   C is followed by combining dot above (U+0307).
664
 *   Any sequence of characters with a combining class that is neither 0 nor 230
665
 *   may intervene between the current character and the combining dot above.
666
 *
667
 * The erratum from 2002-10-31 adds the condition
668
 *
669
 * After_I
670
 *   The last preceding base character was an uppercase I, and there is no
671
 *   intervening combining character class 230 (ABOVE).
672
 *
673
 *   (See Jitterbug 2344 and the comments on After_I below.)
674
 *
675
 * Helper definitions in Unicode 3.2 UAX 21:
676
 *
677
 * D1. A character C is defined to be cased
678
 *     if it meets any of the following criteria:
679
 *
680
 *   - The general category of C is Titlecase Letter (Lt)
681
 *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
682
 *   - Given D = NFD(C), then it is not the case that:
683
 *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
684
 *     (This third criterion does not add any characters to the list
685
 *      for Unicode 3.2. Ignored.)
686
 *
687
 * D2. A character C is defined to be case-ignorable
688
 *     if it meets either of the following criteria:
689
 *
690
 *   - The general category of C is
691
 *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
692
 *     Letter Modifier (Lm), or Symbol Modifier (Sk)
693
 *   - C is one of the following characters 
694
 *     U+0027 APOSTROPHE
695
 *     U+00AD SOFT HYPHEN (SHY)
696
 *     U+2019 RIGHT SINGLE QUOTATION MARK
697
 *            (the preferred character for apostrophe)
698
 *
699
 * D3. A case-ignorable sequence is a sequence of
700
 *     zero or more case-ignorable characters.
701
 */
702
703
0
/* Is c one of the two given character constants (lowercase/uppercase form of a letter)? */
#define UCASE_IS_EITHER_CASE(c, lower, upper) ((c)==(lower) || (c)==(upper))

/* case-insensitive single-letter tests used for matching language codes */
#define is_d(c) UCASE_IS_EITHER_CASE(c, 'd', 'D')
#define is_e(c) UCASE_IS_EITHER_CASE(c, 'e', 'E')
#define is_i(c) UCASE_IS_EITHER_CASE(c, 'i', 'I')
#define is_l(c) UCASE_IS_EITHER_CASE(c, 'l', 'L')
#define is_r(c) UCASE_IS_EITHER_CASE(c, 'r', 'R')
#define is_t(c) UCASE_IS_EITHER_CASE(c, 't', 'T')
#define is_u(c) UCASE_IS_EITHER_CASE(c, 'u', 'U')
#define is_y(c) UCASE_IS_EITHER_CASE(c, 'y', 'Y')
#define is_z(c) UCASE_IS_EITHER_CASE(c, 'z', 'Z')

/* separator: underscore, hyphen, or the terminating NUL */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
715
716
/**
717
 * Requires non-NULL locale ID but otherwise does the equivalent of
718
 * checking for language codes as if uloc_getLanguage() were called:
719
 * Accepts both 2- and 3-letter codes and accepts case variants.
720
 */
721
U_CFUNC int32_t
722
0
ucase_getCaseLocale(const char *locale) {
723
    /*
724
     * This function used to use uloc_getLanguage(), but the current code
725
     * removes the dependency of this low-level code on uloc implementation code
726
     * and is faster because not the whole locale ID has to be
727
     * examined and copied/transformed.
728
     *
729
     * Because this code does not want to depend on uloc, the caller must
730
     * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
731
     */
732
0
    char c=*locale++;
733
    // Fastpath for English "en" which is often used for default (=root locale) case mappings,
734
    // and for Chinese "zh": Very common but no special case mapping behavior.
735
    // Then check lowercase vs. uppercase to reduce the number of comparisons
736
    // for other locales without special behavior.
737
0
    if(c=='e') {
738
        /* el or ell? */
739
0
        c=*locale++;
740
0
        if(is_l(c)) {
741
0
            c=*locale++;
742
0
            if(is_l(c)) {
743
0
                c=*locale;
744
0
            }
745
0
            if(is_sep(c)) {
746
0
                return UCASE_LOC_GREEK;
747
0
            }
748
0
        }
749
        // en, es, ... -> root
750
0
    } else if(c=='z') {
751
0
        return UCASE_LOC_ROOT;
752
0
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
753
0
    } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
754
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
755
    } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
756
#else
757
#   error Unknown charset family!
758
#endif
759
        // lowercase c
760
0
        if(c=='t') {
761
            /* tr or tur? */
762
0
            c=*locale++;
763
0
            if(is_u(c)) {
764
0
                c=*locale++;
765
0
            }
766
0
            if(is_r(c)) {
767
0
                c=*locale;
768
0
                if(is_sep(c)) {
769
0
                    return UCASE_LOC_TURKISH;
770
0
                }
771
0
            }
772
0
        } else if(c=='a') {
773
            /* az or aze? */
774
0
            c=*locale++;
775
0
            if(is_z(c)) {
776
0
                c=*locale++;
777
0
                if(is_e(c)) {
778
0
                    c=*locale;
779
0
                }
780
0
                if(is_sep(c)) {
781
0
                    return UCASE_LOC_TURKISH;
782
0
                }
783
0
            }
784
0
        } else if(c=='l') {
785
            /* lt or lit? */
786
0
            c=*locale++;
787
0
            if(is_i(c)) {
788
0
                c=*locale++;
789
0
            }
790
0
            if(is_t(c)) {
791
0
                c=*locale;
792
0
                if(is_sep(c)) {
793
0
                    return UCASE_LOC_LITHUANIAN;
794
0
                }
795
0
            }
796
0
        } else if(c=='n') {
797
            /* nl or nld? */
798
0
            c=*locale++;
799
0
            if(is_l(c)) {
800
0
                c=*locale++;
801
0
                if(is_d(c)) {
802
0
                    c=*locale;
803
0
                }
804
0
                if(is_sep(c)) {
805
0
                    return UCASE_LOC_DUTCH;
806
0
                }
807
0
            }
808
0
        } else if(c=='h') {
809
            /* hy or hye? *not* hyw */
810
0
            c=*locale++;
811
0
            if(is_y(c)) {
812
0
                c=*locale++;
813
0
                if(is_e(c)) {
814
0
                    c=*locale;
815
0
                }
816
0
                if(is_sep(c)) {
817
0
                    return UCASE_LOC_ARMENIAN;
818
0
                }
819
0
            }
820
0
        }
821
0
    } else {
822
        // uppercase c
823
        // Same code as for lowercase c but also check for 'E'.
824
0
        if(c=='T') {
825
            /* tr or tur? */
826
0
            c=*locale++;
827
0
            if(is_u(c)) {
828
0
                c=*locale++;
829
0
            }
830
0
            if(is_r(c)) {
831
0
                c=*locale;
832
0
                if(is_sep(c)) {
833
0
                    return UCASE_LOC_TURKISH;
834
0
                }
835
0
            }
836
0
        } else if(c=='A') {
837
            /* az or aze? */
838
0
            c=*locale++;
839
0
            if(is_z(c)) {
840
0
                c=*locale++;
841
0
                if(is_e(c)) {
842
0
                    c=*locale;
843
0
                }
844
0
                if(is_sep(c)) {
845
0
                    return UCASE_LOC_TURKISH;
846
0
                }
847
0
            }
848
0
        } else if(c=='L') {
849
            /* lt or lit? */
850
0
            c=*locale++;
851
0
            if(is_i(c)) {
852
0
                c=*locale++;
853
0
            }
854
0
            if(is_t(c)) {
855
0
                c=*locale;
856
0
                if(is_sep(c)) {
857
0
                    return UCASE_LOC_LITHUANIAN;
858
0
                }
859
0
            }
860
0
        } else if(c=='E') {
861
            /* el or ell? */
862
0
            c=*locale++;
863
0
            if(is_l(c)) {
864
0
                c=*locale++;
865
0
                if(is_l(c)) {
866
0
                    c=*locale;
867
0
                }
868
0
                if(is_sep(c)) {
869
0
                    return UCASE_LOC_GREEK;
870
0
                }
871
0
            }
872
0
        } else if(c=='N') {
873
            /* nl or nld? */
874
0
            c=*locale++;
875
0
            if(is_l(c)) {
876
0
                c=*locale++;
877
0
                if(is_d(c)) {
878
0
                    c=*locale;
879
0
                }
880
0
                if(is_sep(c)) {
881
0
                    return UCASE_LOC_DUTCH;
882
0
                }
883
0
            }
884
0
        } else if(c=='H') {
885
            /* hy or hye? *not* hyw */
886
0
            c=*locale++;
887
0
            if(is_y(c)) {
888
0
                c=*locale++;
889
0
                if(is_e(c)) {
890
0
                    c=*locale;
891
0
                }
892
0
                if(is_sep(c)) {
893
0
                    return UCASE_LOC_ARMENIAN;
894
0
                }
895
0
            }
896
0
        }
897
0
    }
898
0
    return UCASE_LOC_ROOT;
899
0
}
900
901
/*
902
 * Is followed by
903
 *   {case-ignorable}* cased
904
 * ?
905
 * (dir determines looking forward/backward)
906
 * If a character is case-ignorable, it is skipped regardless of whether
907
 * it is also cased or not.
908
 */
909
static UBool
910
0
isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
911
0
    UChar32 c;
912
913
0
    if(iter==NULL) {
914
0
        return FALSE;
915
0
    }
916
917
0
    for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
918
0
        int32_t type=ucase_getTypeOrIgnorable(c);
919
0
        if(type&4) {
920
            /* case-ignorable, continue with the loop */
921
0
        } else if(type!=UCASE_NONE) {
922
0
            return TRUE; /* followed by cased letter */
923
0
        } else {
924
0
            return FALSE; /* uncased and not case-ignorable */
925
0
        }
926
0
    }
927
928
0
    return FALSE; /* not followed by cased letter */
929
0
}
930
931
/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
932
static UBool
933
0
isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
934
0
    UChar32 c;
935
0
    int32_t dotType;
936
0
    int8_t dir;
937
938
0
    if(iter==NULL) {
939
0
        return FALSE;
940
0
    }
941
942
0
    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
943
0
        dotType=getDotType(c);
944
0
        if(dotType==UCASE_SOFT_DOTTED) {
945
0
            return TRUE; /* preceded by TYPE_i */
946
0
        } else if(dotType!=UCASE_OTHER_ACCENT) {
947
0
            return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
948
0
        }
949
0
    }
950
951
0
    return FALSE; /* not preceded by TYPE_i */
952
0
}
953
954
/*
955
 * See Jitterbug 2344:
956
 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
957
 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
958
 * we made those releases compatible with Unicode 3.2 which had not fixed
959
 * a related bug in SpecialCasing.txt.
960
 *
961
 * From the Jitterbug 2344 text:
962
 * ... this bug is listed as a Unicode erratum
963
 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
964
 * <quote>
965
 * There are two errors in SpecialCasing.txt.
966
 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
967
 * 2. An incorrect context definition. Correct as follows:
968
 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
969
 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
970
 * ---
971
 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
972
 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
973
 * where the context After_I is defined as:
974
 * The last preceding base character was an uppercase I, and there is no
975
 * intervening combining character class 230 (ABOVE).
976
 * </quote>
977
 *
978
 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
979
 *
980
 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
981
 * # This matches the behavior of the canonically equivalent I-dot_above
982
 *
983
 * See also the description in this place in older versions of uchar.c (revision 1.100).
984
 *
985
 * Markus W. Scherer 2003-feb-15
986
 */
987
988
/* Is preceded by base character 'I' with no intervening cc=230 ? */
989
static UBool
990
0
isPrecededBy_I(UCaseContextIterator *iter, void *context) {
991
0
    UChar32 c;
992
0
    int32_t dotType;
993
0
    int8_t dir;
994
995
0
    if(iter==NULL) {
996
0
        return FALSE;
997
0
    }
998
999
0
    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1000
0
        if(c==0x49) {
1001
0
            return TRUE; /* preceded by I */
1002
0
        }
1003
0
        dotType=getDotType(c);
1004
0
        if(dotType!=UCASE_OTHER_ACCENT) {
1005
0
            return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
1006
0
        }
1007
0
    }
1008
1009
0
    return FALSE; /* not preceded by I */
1010
0
}
1011
1012
/* Is followed by one or more cc==230 ? */
1013
static UBool
1014
0
isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
1015
0
    UChar32 c;
1016
0
    int32_t dotType;
1017
0
    int8_t dir;
1018
1019
0
    if(iter==NULL) {
1020
0
        return FALSE;
1021
0
    }
1022
1023
0
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1024
0
        dotType=getDotType(c);
1025
0
        if(dotType==UCASE_ABOVE) {
1026
0
            return TRUE; /* at least one cc==230 following */
1027
0
        } else if(dotType!=UCASE_OTHER_ACCENT) {
1028
0
            return FALSE; /* next base character, no more cc==230 following */
1029
0
        }
1030
0
    }
1031
1032
0
    return FALSE; /* no more cc==230 following */
1033
0
}
1034
1035
/* Is followed by a dot above (without cc==230 in between) ? */
1036
static UBool
1037
0
isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1038
0
    UChar32 c;
1039
0
    int32_t dotType;
1040
0
    int8_t dir;
1041
1042
0
    if(iter==NULL) {
1043
0
        return FALSE;
1044
0
    }
1045
1046
0
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1047
0
        if(c==0x307) {
1048
0
            return TRUE;
1049
0
        }
1050
0
        dotType=getDotType(c);
1051
0
        if(dotType!=UCASE_OTHER_ACCENT) {
1052
0
            return FALSE; /* next base character or cc==230 in between */
1053
0
        }
1054
0
    }
1055
1056
0
    return FALSE; /* no dot above following */
1057
0
}
1058
1059
/*
 * Full (possibly context- and locale-sensitive) lowercase mapping of c.
 *
 * c        code point to map; must be non-negative (asserted)
 * iter     context iterator for the conditional mappings; the context tests
 *          used here return FALSE when it is NULL
 * context  opaque pointer handed to iter
 * pString  output: set to the result string when a string mapping is returned;
 *          set to nullptr when the character is removed (return value 0)
 * loc      one of the UCASE_LOC_ constants (see ucase_getCaseLocale())
 *
 * Returns ~c when the mapping is c itself, otherwise either the single
 * mapped code point, or the length of the string that *pString was set to.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t loc) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(!UCASE_HAS_EXCEPTION(props)) {
        /* no exception data: at most a simple delta stored in props */
        UChar32 mapped=c;
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
            mapped=c+UCASE_GET_DELTA(props);
        }
        return (mapped==c) ? ~mapped : mapped;
    }

    /*
     * Two independent cursors into the exception data, both starting right
     * after the exception word: the slot macro advances the cursor it is given.
     */
    const uint16_t *fullCursor=GET_EXCEPTIONS(&ucase_props_singleton, props);
    uint16_t excWord=*fullCursor++;
    const uint16_t *slotCursor=fullCursor;

    if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
        /*
         * Hardcoded conditions and mappings.
         * Conditional mappings are tested first
         *   (otherwise the unconditional default mappings would always be taken),
         * then characters with unconditional mappings in SpecialCasing.txt;
         * if nothing applies, the UnicodeData.txt mappings below are used.
         */
        if(loc==UCASE_LOC_LITHUANIAN) {
            /*
                # Lithuanian

                # Lithuanian retains the dot in a lowercase i when followed by accents.

                # Introduce an explicit dot above when lowercasing capital I's and J's
                # whenever there are more accents above.
                # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)

                0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
                004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
                012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
                00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
                00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
                0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
             */
            switch(c) {
            /* base characters: map only if accents above follow */
            case 0x49:  /* LATIN CAPITAL LETTER I */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=iDot;
                    return 2;
                }
                break;
            case 0x4a:  /* LATIN CAPITAL LETTER J */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=jDot;
                    return 2;
                }
                break;
            case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=iOgonekDot;
                    return 2;
                }
                break;
            /* precomposed with accent above, no need to find one */
            case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
                *pString=iDotGrave;
                return 3;
            case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
                *pString=iDotAcute;
                return 3;
            case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
                *pString=iDotTilde;
                return 3;
            default:
                break;
            }
        }

        /* # Turkish and Azeri */
        if(loc==UCASE_LOC_TURKISH) {
            if(c==0x130) {
                /*
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                    # The following rules handle those cases.

                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                return 0x69;
            }
            if(c==0x307 && isPrecededBy_I(iter, context)) {
                /*
                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
                    # This matches the behavior of the canonically equivalent I-dot_above

                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                 */
                *pString=nullptr;
                return 0; /* remove the dot (continue without output) */
            }
            if(c==0x49 && !isFollowedByDotAbove(iter, context)) {
                /*
                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
                 */
                return 0x131;
            }
        }

        if(c==0x130) {
            /*
                # Preserve canonical equivalence for I with dot.
                # (The Turkic case was already handled above.)

                0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
             */
            *pString=iDot;
            return 2;
        }

        if( c==0x3a3 &&
            !isFollowedByCasedLetter(iter, context, 1) &&
            isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
        ) {
            /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
            /*
                # Special case for final form of sigma

                03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
             */
            return 0x3c2; /* greek small final sigma */
        }

        /* no known conditional special case mapping, use a normal mapping */
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        int32_t lengths;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, fullCursor, lengths);
        const int32_t lowerLength=lengths&UCASE_FULL_LOWER;
        if(lowerLength!=0) {
            /* the lowercase string immediately follows the full-mappings slot */
            *pString=reinterpret_cast<const UChar *>(fullCursor+1);

            /* return the string length */
            return lowerLength;
        }
    }

    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slotCursor, delta);
        return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
    }

    UChar32 result=c;
    if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, slotCursor, result);
    }
    return (result==c) ? ~result : result;
}
1207
1208
/* internal */
1209
static int32_t
1210
toUpperOrTitle(UChar32 c,
1211
               UCaseContextIterator *iter, void *context,
1212
               const UChar **pString,
1213
               int32_t loc,
1214
0
               UBool upperNotTitle) {
1215
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1216
0
    U_ASSERT(c >= 0);
1217
0
    UChar32 result=c;
1218
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1219
0
    if(!UCASE_HAS_EXCEPTION(props)) {
1220
0
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1221
0
            result=c+UCASE_GET_DELTA(props);
1222
0
        }
1223
0
    } else {
1224
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1225
0
        uint16_t excWord=*pe++;
1226
0
        int32_t full, idx;
1227
1228
0
        pe2=pe;
1229
1230
0
        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1231
            /* use hardcoded conditions and mappings */
1232
0
            if(loc==UCASE_LOC_TURKISH && c==0x69) {
1233
                /*
1234
                    # Turkish and Azeri
1235
1236
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1237
                    # The following rules handle those cases.
1238
1239
                    # When uppercasing, i turns into a dotted capital I
1240
1241
                    0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1242
                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1243
                */
1244
0
                return 0x130;
1245
0
            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1246
                /*
1247
                    # Lithuanian
1248
1249
                    # Lithuanian retains the dot in a lowercase i when followed by accents.
1250
1251
                    # Remove DOT ABOVE after "i" with upper or titlecase
1252
1253
                    0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1254
                 */
1255
0
                *pString=nullptr;
1256
0
                return 0; /* remove the dot (continue without output) */
1257
0
            } else if(c==0x0587) {
1258
                // See ICU-13416:
1259
                // և ligature ech-yiwn
1260
                // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1261
                // but to ԵՎ=ech+vew in Eastern Armenian.
1262
0
                if(loc==UCASE_LOC_ARMENIAN) {
1263
0
                    *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1264
0
                } else {
1265
0
                    *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1266
0
                }
1267
0
                return 2;
1268
0
            } else {
1269
                /* no known conditional special case mapping, use a normal mapping */
1270
0
            }
1271
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1272
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1273
1274
            /* start of full case mapping strings */
1275
0
            ++pe;
1276
1277
            /* skip the lowercase and case-folding result strings */
1278
0
            pe+=full&UCASE_FULL_LOWER;
1279
0
            full>>=4;
1280
0
            pe+=full&0xf;
1281
0
            full>>=4;
1282
1283
0
            if(upperNotTitle) {
1284
0
                full&=0xf;
1285
0
            } else {
1286
                /* skip the uppercase result string */
1287
0
                pe+=full&0xf;
1288
0
                full=(full>>4)&0xf;
1289
0
            }
1290
1291
0
            if(full!=0) {
1292
                /* set the output pointer to the result string */
1293
0
                *pString=reinterpret_cast<const UChar *>(pe);
1294
1295
                /* return the string length */
1296
0
                return full;
1297
0
            }
1298
0
        }
1299
1300
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1301
0
            int32_t delta;
1302
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1303
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1304
0
        }
1305
0
        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1306
0
            idx=UCASE_EXC_TITLE;
1307
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1308
            /* here, titlecase is same as uppercase */
1309
0
            idx=UCASE_EXC_UPPER;
1310
0
        } else {
1311
0
            return ~c;
1312
0
        }
1313
0
        GET_SLOT_VALUE(excWord, idx, pe2, result);
1314
0
    }
1315
1316
0
    return (result==c) ? ~result : result;
1317
0
}
1318
1319
/*
 * Full uppercase mapping of c: forwards to toUpperOrTitle() with
 * upperNotTitle=TRUE. Parameters and return value are passed through unchanged.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t caseLocale) {
    const UBool upperNotTitle=TRUE;
    return toUpperOrTitle(c, iter, context, pString, caseLocale, upperNotTitle);
}
1326
1327
/*
 * Full titlecase mapping of c: forwards to toUpperOrTitle() with
 * upperNotTitle=FALSE. Parameters and return value are passed through unchanged.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t caseLocale) {
    const UBool upperNotTitle=FALSE;
    return toUpperOrTitle(c, iter, context, pString, caseLocale, upperNotTitle);
}
1334
1335
/* case folding ------------------------------------------------------------- */
1336
1337
/*
1338
 * Case folding is similar to lowercasing.
1339
 * The result may be a simple mapping, i.e., a single code point, or
1340
 * a full mapping, i.e., a string.
1341
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1342
 * then only the lowercase mapping is stored.
1343
 *
1344
 * Some special cases are hardcoded because their conditions cannot be
1345
 * parsed and processed from CaseFolding.txt.
1346
 *
1347
 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1348
1349
# C: common case folding, common mappings shared by both simple and full mappings.
1350
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1351
# S: simple case folding, mappings to single characters where different from F.
1352
# T: special case for uppercase I and dotted uppercase I
1353
#    - For non-Turkic languages, this mapping is normally not used.
1354
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1355
#
1356
# Usage:
1357
#  A. To do a simple case folding, use the mappings with status C + S.
1358
#  B. To do a full case folding, use the mappings with status C + F.
1359
#
1360
#    The mappings with status T can be used or omitted depending on the desired case-folding
1361
#    behavior. (The default option is to exclude them.)
1362
1363
 * Unicode 3.2 has 'T' mappings as follows:
1364
1365
0049; T; 0131; # LATIN CAPITAL LETTER I
1366
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1367
1368
 * while the default mappings for these code points are:
1369
1370
0049; C; 0069; # LATIN CAPITAL LETTER I
1371
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1372
1373
 * U+0130 has no simple case folding (simple-case-folds to itself).
1374
 */
1375
1376
/* return the simple case folding mapping for c */
1377
U_CAPI UChar32 U_EXPORT2
1378
0
ucase_fold(UChar32 c, uint32_t options) {
1379
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1380
0
    if(!UCASE_HAS_EXCEPTION(props)) {
1381
0
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
1382
0
            c+=UCASE_GET_DELTA(props);
1383
0
        }
1384
0
    } else {
1385
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1386
0
        uint16_t excWord=*pe++;
1387
0
        int32_t idx;
1388
0
        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1389
            /* special case folding mappings, hardcoded */
1390
0
            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1391
                /* default mappings */
1392
0
                if(c==0x49) {
1393
                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1394
0
                    return 0x69;
1395
0
                } else if(c==0x130) {
1396
                    /* no simple case folding for U+0130 */
1397
0
                    return c;
1398
0
                }
1399
0
            } else {
1400
                /* Turkic mappings */
1401
0
                if(c==0x49) {
1402
                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1403
0
                    return 0x131;
1404
0
                } else if(c==0x130) {
1405
                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1406
0
                    return 0x69;
1407
0
                }
1408
0
            }
1409
0
        }
1410
0
        if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1411
0
            return c;
1412
0
        }
1413
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1414
0
            int32_t delta;
1415
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1416
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1417
0
        }
1418
0
        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1419
0
            idx=UCASE_EXC_FOLD;
1420
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1421
0
            idx=UCASE_EXC_LOWER;
1422
0
        } else {
1423
0
            return c;
1424
0
        }
1425
0
        GET_SLOT_VALUE(excWord, idx, pe, c);
1426
0
    }
1427
0
    return c;
1428
0
}
1429
1430
/*
1431
 * Issue for canonical caseless match (UAX #21):
1432
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1433
 * canonical equivalence, unlike default-option casefolding.
1434
 * For example, I-grave and I + grave fold to strings that are not canonically
1435
 * equivalent.
1436
 * For more details, see the comment in unorm_compare() in unorm.cpp
1437
 * and the intermediate prototype changes for Jitterbug 2021.
1438
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1439
 *
1440
 * This did not get fixed because it appears that it is not possible to fix
1441
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1442
 * together in a way that they still fold to common result strings.
1443
 */
1444
1445
/*
 * Full case folding of c.
 *
 * c        code point to fold; must be non-negative (asserted)
 * pString  output: set to the result string when a string mapping is returned
 * options  masked with _FOLD_CASE_OPTIONS_MASK; U_FOLD_CASE_DEFAULT selects the
 *          default mappings for U+0049 and U+0130, any other value the Turkic ones
 *
 * Returns ~c when the folding is c itself, otherwise either the single
 * folded code point, or the length of the string that *pString was set to.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,
                    const UChar **pString,
                    uint32_t options) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(!UCASE_HAS_EXCEPTION(props)) {
        /* no exception data: at most a simple delta stored in props */
        UChar32 mapped=c;
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
            mapped=c+UCASE_GET_DELTA(props);
        }
        return (mapped==c) ? ~mapped : mapped;
    }

    /*
     * Two independent cursors into the exception data, both starting right
     * after the exception word: the slot macro advances the cursor it is given.
     */
    const uint16_t *fullCursor=GET_EXCEPTIONS(&ucase_props_singleton, props);
    uint16_t excWord=*fullCursor++;
    const uint16_t *slotCursor=fullCursor;

    if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
        /* use hardcoded conditions and mappings */
        const bool turkic=(options&_FOLD_CASE_OPTIONS_MASK)!=U_FOLD_CASE_DEFAULT;
        if(c==0x49) {
            /*
             * 0049; C; 0069; # LATIN CAPITAL LETTER I  (default)
             * 0049; T; 0131; # LATIN CAPITAL LETTER I  (Turkic)
             */
            return turkic ? 0x131 : 0x69;
        }
        if(c==0x130) {
            if(turkic) {
                /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                return 0x69;
            }
            /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
            *pString=iDot;
            return 2;
        }
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        int32_t lengths;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, fullCursor, lengths);

        /*
         * The full case mapping strings start right after the slot;
         * the case folding string comes after the lowercase result string.
         */
        const int32_t lowerLength=lengths&UCASE_FULL_LOWER;
        const int32_t foldLength=(lengths>>4)&0xf;
        const uint16_t *foldString=fullCursor+1+lowerLength;

        if(foldLength!=0) {
            /* set the output pointer to the result string */
            *pString=reinterpret_cast<const UChar *>(foldString);

            /* return the string length */
            return foldLength;
        }
    }

    if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
        return ~c;
    }

    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slotCursor, delta);
        return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
    }

    /* prefer a dedicated case folding slot, else fall back to the lowercase slot */
    int32_t idx;
    if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
        idx=UCASE_EXC_FOLD;
    } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        idx=UCASE_EXC_LOWER;
    } else {
        return ~c;
    }

    UChar32 result=c;
    GET_SLOT_VALUE(excWord, idx, slotCursor, result);
    return (result==c) ? ~result : result;
}
1525
1526
/* case mapping properties API ---------------------------------------------- */
1527
1528
/* public API (see uchar.h) */
1529
1530
/* Returns whether ucase_getType(c) reports UCASE_LOWER. */
U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c) {
    const bool isLower=(ucase_getType(c)==UCASE_LOWER);
    return (UBool)isLower;
}
1534
1535
/* Returns whether ucase_getType(c) reports UCASE_UPPER. */
U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c) {
    const bool isUpper=(ucase_getType(c)==UCASE_UPPER);
    return (UBool)isUpper;
}
1539
1540
/* Transforms the Unicode character to its lower case equivalent.*/
1541
U_CAPI UChar32 U_EXPORT2
1542
0
u_tolower(UChar32 c) {
1543
0
    return ucase_tolower(c);
1544
0
}
1545
    
1546
/* Transforms the Unicode character to its upper case equivalent.*/
1547
U_CAPI UChar32 U_EXPORT2
1548
0
u_toupper(UChar32 c) {
1549
0
    return ucase_toupper(c);
1550
0
}
1551
1552
/* Transforms the Unicode character to its title case equivalent.*/
1553
U_CAPI UChar32 U_EXPORT2
1554
0
u_totitle(UChar32 c) {
1555
0
    return ucase_totitle(c);
1556
0
}
1557
1558
/* return the simple case folding mapping for c */
1559
U_CAPI UChar32 U_EXPORT2
1560
0
u_foldCase(UChar32 c, uint32_t options) {
1561
0
    return ucase_fold(c, options);
1562
0
}
1563
1564
/*
 * Evaluates one of the case mapping binary properties for code point c.
 *
 * c      the code point to test
 * which  the property selector; only the case-related selectors listed
 *        in the switch below are handled here
 *
 * Returns the result of the property test, or FALSE for any selector
 * that is not handled by this function.
 */
U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c, UProperty which) {
    /* output slot required by the ucase_toFullXyz() calls; never read here */
    const UChar *mapping;
    int32_t hasProperty=FALSE;

    switch(which) {
    case UCHAR_LOWERCASE:
        hasProperty=(UBool)(ucase_getType(c)==UCASE_LOWER);
        break;
    case UCHAR_UPPERCASE:
        hasProperty=(UBool)(ucase_getType(c)==UCASE_UPPER);
        break;
    case UCHAR_SOFT_DOTTED:
        hasProperty=ucase_isSoftDotted(c);
        break;
    case UCHAR_CASE_SENSITIVE:
        hasProperty=ucase_isCaseSensitive(c);
        break;
    case UCHAR_CASED:
        hasProperty=(UBool)(ucase_getType(c)!=UCASE_NONE);
        break;
    case UCHAR_CASE_IGNORABLE:
        hasProperty=(UBool)(ucase_getTypeOrIgnorable(c)>>2);
        break;
    /*
     * Note on the Changes_When_Xyz properties below:
     * They are defined as testing whether the NFD form of the input
     * changes when it is Xyz-case-mapped. This implementation is simpler
     * and ignores NFD, which still passes the tests.
     * If the tests start failing, the implementation has to change.
     * In that case, optimizations should keep using the
     * per-single-code point ucase_toFullXyz() functions unless the
     * NFD form has more than one code point, and the property starts set
     * has to become the union of the start sets for normalization and
     * case mappings.
     */
    case UCHAR_CHANGES_WHEN_LOWERCASED:
        hasProperty=(UBool)(ucase_toFullLower(c, NULL, NULL, &mapping, UCASE_LOC_ROOT)>=0);
        break;
    case UCHAR_CHANGES_WHEN_UPPERCASED:
        hasProperty=(UBool)(ucase_toFullUpper(c, NULL, NULL, &mapping, UCASE_LOC_ROOT)>=0);
        break;
    case UCHAR_CHANGES_WHEN_TITLECASED:
        hasProperty=(UBool)(ucase_toFullTitle(c, NULL, NULL, &mapping, UCASE_LOC_ROOT)>=0);
        break;
    /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
    case UCHAR_CHANGES_WHEN_CASEMAPPED:
        /* true as soon as any one of the three full mappings reports a change */
        hasProperty=(UBool)(
            ucase_toFullLower(c, NULL, NULL, &mapping, UCASE_LOC_ROOT)>=0 ||
            ucase_toFullUpper(c, NULL, NULL, &mapping, UCASE_LOC_ROOT)>=0 ||
            ucase_toFullTitle(c, NULL, NULL, &mapping, UCASE_LOC_ROOT)>=0);
        break;
    default:
        hasProperty=FALSE;
        break;
    }

    return hasProperty;
}