Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/common/ucase.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2004-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  ucase.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2004aug30
16
*   created by: Markus W. Scherer
17
*
18
*   Low-level Unicode character/string case mapping code.
19
*   Much code moved here (and modified) from uchar.c.
20
*/
21
22
#include "unicode/utypes.h"
23
#include "unicode/unistr.h"
24
#include "unicode/uset.h"
25
#include "unicode/udata.h" /* UDataInfo */
26
#include "unicode/utf16.h"
27
#include "ucmndata.h" /* DataHeader */
28
#include "udatamem.h"
29
#include "umutex.h"
30
#include "uassert.h"
31
#include "cmemory.h"
32
#include "utrie2.h"
33
#include "ucase.h"
34
35
struct UCaseProps {
36
    UDataMemory *mem;
37
    const int32_t *indexes;
38
    const uint16_t *exceptions;
39
    const uint16_t *unfold;
40
41
    UTrie2 trie;
42
    uint8_t formatVersion[4];
43
};
44
45
/* ucase_props_data.h is machine-generated by gencase --csource */
46
#define INCLUDED_FROM_UCASE_CPP
47
#include "ucase_props_data.h"
48
49
/* set of property starts for UnicodeSet ------------------------------------ */
50
51
static UBool U_CALLCONV
52
0
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
53
0
    /* add the start code point to the USet */
54
0
    const USetAdder *sa=(const USetAdder *)context;
55
0
    sa->add(sa->set, start);
56
0
    return TRUE;
57
0
}
58
59
U_CFUNC void U_EXPORT2
60
0
ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
61
0
    if(U_FAILURE(*pErrorCode)) {
62
0
        return;
63
0
    }
64
0
65
0
    /* add the start code point of each same-value range of the trie */
66
0
    utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
67
0
68
0
    /* add code points with hardcoded properties, plus the ones following them */
69
0
70
0
    /* (none right now, see comment below) */
71
0
72
0
    /*
73
0
     * Omit code points with hardcoded specialcasing properties
74
0
     * because we do not build property UnicodeSets for them right now.
75
0
     */
76
0
}
77
78
/* data access primitives --------------------------------------------------- */
79
80
U_CFUNC const UTrie2 * U_EXPORT2
81
0
ucase_getTrie() {
82
0
    return &ucase_props_singleton.trie;
83
0
}
84
85
0
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
86
87
/* number of set bits (population count) in an 8-bit integer value */
88
static const uint8_t flagsOffset[256]={
89
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
90
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
91
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
92
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
93
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
94
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
95
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
96
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
97
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
98
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
99
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
100
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
101
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
102
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
103
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
104
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
105
};
106
107
0
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
108
0
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
109
110
/*
111
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
112
 *
113
 * @param excWord (in) initial exceptions word
114
 * @param idx (in) desired slot index
115
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
116
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
117
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
118
 */
119
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
120
0
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
121
0
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
122
0
        (value)=*pExc16; \
123
0
    } else { \
124
0
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
125
0
        (value)=*pExc16++; \
126
0
        (value)=((value)<<16)|*pExc16; \
127
0
    }
128
129
/* simple case mappings ----------------------------------------------------- */
130
131
U_CAPI UChar32 U_EXPORT2
132
18.2k
ucase_tolower(UChar32 c) {
133
18.2k
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
134
18.2k
    if(!UCASE_HAS_EXCEPTION(props)) {
135
18.2k
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
136
0
            c+=UCASE_GET_DELTA(props);
137
0
        }
138
18.2k
    } else {
139
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
140
0
        uint16_t excWord=*pe++;
141
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
142
0
            int32_t delta;
143
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
144
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
145
0
        }
146
0
        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
147
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
148
0
        }
149
0
    }
150
18.2k
    return c;
151
18.2k
}
152
153
U_CAPI UChar32 U_EXPORT2
154
18.2k
ucase_toupper(UChar32 c) {
155
18.2k
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
156
18.2k
    if(!UCASE_HAS_EXCEPTION(props)) {
157
18.2k
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
158
0
            c+=UCASE_GET_DELTA(props);
159
0
        }
160
18.2k
    } else {
161
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
162
0
        uint16_t excWord=*pe++;
163
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
164
0
            int32_t delta;
165
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
166
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
167
0
        }
168
0
        if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
169
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
170
0
        }
171
0
    }
172
18.2k
    return c;
173
18.2k
}
174
175
U_CAPI UChar32 U_EXPORT2
176
0
ucase_totitle(UChar32 c) {
177
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
178
0
    if(!UCASE_HAS_EXCEPTION(props)) {
179
0
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
180
0
            c+=UCASE_GET_DELTA(props);
181
0
        }
182
0
    } else {
183
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
184
0
        uint16_t excWord=*pe++;
185
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
186
0
            int32_t delta;
187
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
188
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
189
0
        }
190
0
        int32_t idx;
191
0
        if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
192
0
            idx=UCASE_EXC_TITLE;
193
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
194
0
            idx=UCASE_EXC_UPPER;
195
0
        } else {
196
0
            return c;
197
0
        }
198
0
        GET_SLOT_VALUE(excWord, idx, pe, c);
199
0
    }
200
0
    return c;
201
0
}
202
203
static const UChar iDot[2] = { 0x69, 0x307 };
204
static const UChar jDot[2] = { 0x6a, 0x307 };
205
static const UChar iOgonekDot[2] = { 0x12f, 0x307 };  /* two code units, sized like iDot/jDot (was [3] with an implicit trailing 0) */
206
static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
207
static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
208
static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
209
210
211
U_CFUNC void U_EXPORT2
212
0
ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
213
0
    uint16_t props;
214
0
    /* Adds to sa (via sa->add / sa->addString) the simple case mappings, the full case folding string and the closure code points stored for c. */
215
0
    /*
216
0
     * Hardcode the case closure of i and its relatives and ignore the
217
0
     * data file data for these characters.
218
0
     * The Turkic dotless i and dotted I with their case mapping conditions
219
0
     * and case folding option make the related characters behave specially.
220
0
     * This code matches their closure behavior to their case folding behavior.
221
0
     */
222
0
223
0
    switch(c) {
224
0
    case 0x49:
225
0
        /* regular i and I are in one equivalence class */
226
0
        sa->add(sa->set, 0x69);
227
0
        return;
228
0
    case 0x69:
229
0
        sa->add(sa->set, 0x49);
230
0
        return;
231
0
    case 0x130:
232
0
        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
233
0
        sa->addString(sa->set, iDot, 2);
234
0
        return;
235
0
    case 0x131:
236
0
        /* dotless i is in a class by itself */
237
0
        return;
238
0
    default:
239
0
        /* otherwise use the data file data */
240
0
        break;
241
0
    }
242
0
243
0
    props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
244
0
    if(!UCASE_HAS_EXCEPTION(props)) {
245
0
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
246
0
            /* add the one simple case mapping, no matter what type it is */
247
0
            int32_t delta=UCASE_GET_DELTA(props);
248
0
            if(delta!=0) {
249
0
                sa->add(sa->set, c+delta);
250
0
            }
251
0
        }
252
0
    } else {
253
0
        /*
254
0
         * c has exceptions, so there may be multiple simple and/or
255
0
         * full case mappings. Add them all.
256
0
         */
257
0
        const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
258
0
        const UChar *closure;
259
0
        uint16_t excWord=*pe++;
260
0
        int32_t idx, closureLength, fullLength, length;
261
0
        UChar32 mapping; /* receives slot and closure values so that c itself is never overwritten */
262
0
        pe0=pe;
263
0
264
0
        /* add all simple case mappings; read them into mapping because c is still needed for the delta slot below */
265
0
        for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
266
0
            if(HAS_SLOT(excWord, idx)) {
267
0
                pe=pe0;
268
0
                GET_SLOT_VALUE(excWord, idx, pe, mapping);
269
0
                sa->add(sa->set, mapping);
270
0
            }
271
0
        }
272
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
273
0
            pe=pe0;
274
0
            int32_t delta; /* offset applied to the original c, in the direction given by UCASE_EXC_DELTA_IS_NEGATIVE */
275
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
276
0
            sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
277
0
        }
278
0
279
0
        /* get the closure string pointer & length */
280
0
        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
281
0
            pe=pe0;
282
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
283
0
            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
284
0
            closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
285
0
        } else {
286
0
            closureLength=0;
287
0
            closure=NULL;
288
0
        }
289
0
290
0
        /* add the full case folding */
291
0
        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
292
0
            pe=pe0;
293
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
294
0
295
0
            /* start of full case mapping strings */
296
0
            ++pe;
297
0
298
0
            fullLength&=0xffff; /* bits 16 and higher are reserved */
299
0
300
0
            /* skip the lowercase result string */
301
0
            pe+=fullLength&UCASE_FULL_LOWER;
302
0
            fullLength>>=4;
303
0
304
0
            /* add the full case folding string */
305
0
            length=fullLength&0xf;
306
0
            if(length!=0) {
307
0
                sa->addString(sa->set, (const UChar *)pe, length);
308
0
                pe+=length;
309
0
            }
310
0
311
0
            /* skip the uppercase and titlecase strings */
312
0
            fullLength>>=4;
313
0
            pe+=fullLength&0xf;
314
0
            fullLength>>=4;
315
0
            pe+=fullLength;
316
0
317
0
            closure=(const UChar *)pe; /* behind full case mappings */
318
0
        }
319
0
320
0
        /* add each code point in the closure string */
321
0
        for(idx=0; idx<closureLength;) {
322
0
            U16_NEXT_UNSAFE(closure, idx, mapping);
323
0
            sa->add(sa->set, mapping);
324
0
        }
325
0
    }
326
0
}
327
328
/*
329
 * Compare s, of which exactly length code units are read, with t, which has at most max code units and is NUL-terminated if shorter.
330
 * Requires length>0 and max>0 and length<=max. Returns 0 if equal, >0 if t ends first or s has the greater unit at the first difference, <0 if s has the smaller unit or t is longer.
331
 */
332
static inline int32_t
333
0
strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
334
0
    int32_t c1, c2;
335
0
336
0
    max-=length; /* we require length<=max, so no need to decrement max in the loop; max is now the room left in t beyond length */
337
0
    do {
338
0
        c1=*s++;
339
0
        c2=*t++;
340
0
        if(c2==0) {
341
0
            return 1; /* reached the end of t but not of s */
342
0
        }
343
0
        c1-=c2;
344
0
        if(c1!=0) {
345
0
            return c1; /* return the code unit difference (s minus t) */
346
0
        }
347
0
    } while(--length>0);
348
0
    /* ends with length==0 */
349
0
350
0
    if(max==0 || *t==0) {
351
0
        return 0; /* equal to length of both strings */
352
0
    } else {
353
0
        return -max; /* t is longer than s: return the negative length difference */
354
0
    }
355
0
}
356
357
U_CFUNC UBool U_EXPORT2
358
0
ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
359
0
    int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
360
0
361
0
    if(ucase_props_singleton.unfold==NULL || s==NULL) {
362
0
        return FALSE; /* no reverse case folding data, or no string */
363
0
    }
364
0
    if(length<=1) {
365
0
        /* the string is too short to find any match */
366
0
        /*
367
0
         * more precise would be:
368
0
         * if(!u_strHasMoreChar32Than(s, length, 1))
369
0
         * but this does not make much practical difference because
370
0
         * a single supplementary code point would just not be found
371
0
         */
372
0
        return FALSE;
373
0
    }
374
0
375
0
    const uint16_t *unfold=ucase_props_singleton.unfold;
376
0
    unfoldRows=unfold[UCASE_UNFOLD_ROWS];
377
0
    unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
378
0
    unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
379
0
    unfold+=unfoldRowWidth;
380
0
381
0
    if(length>unfoldStringWidth) {
382
0
        /* the string is too long to find any match */
383
0
        return FALSE;
384
0
    }
385
0
386
0
    /* do a binary search for the string */
387
0
    start=0;
388
0
    limit=unfoldRows;
389
0
    while(start<limit) {
390
0
        i=(start+limit)/2;
391
0
        const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
392
0
        result=strcmpMax(s, length, p, unfoldStringWidth);
393
0
394
0
        if(result==0) {
395
0
            /* found the string: add each code point, and its case closure */
396
0
            UChar32 c;
397
0
398
0
            for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
399
0
                U16_NEXT_UNSAFE(p, i, c);
400
0
                sa->add(sa->set, c);
401
0
                ucase_addCaseClosure(c, sa);
402
0
            }
403
0
            return TRUE;
404
0
        } else if(result<0) {
405
0
            limit=i;
406
0
        } else /* result>0 */ {
407
0
            start=i+1;
408
0
        }
409
0
    }
410
0
411
0
    return FALSE; /* string not found */
412
0
}
413
414
U_NAMESPACE_BEGIN
415
416
FullCaseFoldingIterator::FullCaseFoldingIterator()
417
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
418
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
419
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
420
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
421
          currentRow(0),
422
0
          rowCpIndex(unfoldStringWidth) {
423
0
    unfold+=unfoldRowWidth;
424
0
}
425
426
UChar32
427
0
FullCaseFoldingIterator::next(UnicodeString &full) {
428
0
    // Advance past the last-delivered code point.
429
0
    const UChar *p=unfold+(currentRow*unfoldRowWidth);
430
0
    if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
431
0
        ++currentRow;
432
0
        p+=unfoldRowWidth;
433
0
        rowCpIndex=unfoldStringWidth;
434
0
    }
435
0
    if(currentRow>=unfoldRows) { return U_SENTINEL; }
436
0
    // Set "full" to the NUL-terminated string in the first unfold column.
437
0
    int32_t length=unfoldStringWidth;
438
0
    while(length>0 && p[length-1]==0) { --length; }
439
0
    full.setTo(FALSE, p, length);
440
0
    // Return the code point.
441
0
    UChar32 c;
442
0
    U16_NEXT_UNSAFE(p, rowCpIndex, c);
443
0
    return c;
444
0
}
445
446
namespace LatinCase {
447
448
const int8_t TO_LOWER_NORMAL[LIMIT] = {
449
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
450
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
451
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
452
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
453
454
    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
455
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
456
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458
459
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
460
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
461
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
462
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
463
464
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
465
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
466
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
467
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
468
469
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
470
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
471
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
472
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
473
474
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
475
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
476
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
477
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
478
};
479
480
const int8_t TO_LOWER_TR_LT[LIMIT] = {
481
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
482
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
483
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485
486
    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
487
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
488
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490
491
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
492
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
494
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
495
496
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
497
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
498
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
499
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
500
501
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
502
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
503
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
504
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
505
506
    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
507
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
508
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
509
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
510
};
511
512
const int8_t TO_UPPER_NORMAL[LIMIT] = {
513
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
514
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
515
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
516
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
517
518
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
519
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
520
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
521
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
522
523
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
524
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
525
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
526
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
527
528
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
529
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
530
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
531
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
532
533
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
534
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
535
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
536
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
537
538
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
539
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
540
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
541
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
542
};
543
544
const int8_t TO_UPPER_TR[LIMIT] = {
545
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
546
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
548
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
549
550
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
553
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
554
555
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
556
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
557
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
558
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
559
560
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
561
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
562
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
563
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
564
565
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
566
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
567
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
568
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
569
570
    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
571
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
572
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
573
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
574
};
575
576
}  // namespace LatinCase
577
578
U_NAMESPACE_END
579
580
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
581
U_CAPI int32_t U_EXPORT2
582
0
ucase_getType(UChar32 c) {
583
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
584
0
    return UCASE_GET_TYPE(props);
585
0
}
586
587
/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
588
U_CAPI int32_t U_EXPORT2
589
0
ucase_getTypeOrIgnorable(UChar32 c) {
590
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
591
0
    return UCASE_GET_TYPE_AND_IGNORABLE(props);
592
0
}
593
594
/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
595
static inline int32_t
596
0
getDotType(UChar32 c) {
597
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
598
0
    if(!UCASE_HAS_EXCEPTION(props)) {
599
0
        return props&UCASE_DOT_MASK;
600
0
    } else {
601
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
602
0
        return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
603
0
    }
604
0
}
605
606
U_CAPI UBool U_EXPORT2
607
0
ucase_isSoftDotted(UChar32 c) {
608
0
    return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
609
0
}
610
611
U_CAPI UBool U_EXPORT2
612
0
ucase_isCaseSensitive(UChar32 c) {
613
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
614
0
    if(!UCASE_HAS_EXCEPTION(props)) {
615
0
        return (UBool)((props&UCASE_SENSITIVE)!=0);
616
0
    } else {
617
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
618
0
        return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
619
0
    }
620
0
}
621
622
/* string casing ------------------------------------------------------------ */
623
624
/*
625
 * These internal functions form the core of string case mappings.
626
 * They map single code points to result code points or strings and take
627
 * all necessary conditions (context, locale ID, options) into account.
628
 *
629
 * They do not iterate over the source or write to the destination
630
 * so that the same functions are useful for non-standard string storage,
631
 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
632
 * For the same reason, the "surrounding text" context is passed in as a
633
 * UCaseContextIterator which does not make any assumptions about
634
 * the underlying storage.
635
 *
636
 * This section contains helper functions that check for conditions
637
 * in the input text surrounding the current code point
638
 * according to SpecialCasing.txt.
639
 *
640
 * Each helper function gets the index
641
 * - after the current code point if it looks at following text
642
 * - before the current code point if it looks at preceding text
643
 *
644
 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
645
 *
646
 * Final_Sigma
647
 *   C is preceded by a sequence consisting of
648
 *     a cased letter and a case-ignorable sequence,
649
 *   and C is not followed by a sequence consisting of
650
 *     an ignorable sequence and then a cased letter.
651
 *
652
 * More_Above
653
 *   C is followed by one or more characters of combining class 230 (ABOVE)
654
 *   in the combining character sequence.
655
 *
656
 * After_Soft_Dotted
657
 *   The last preceding character with combining class of zero before C
658
 *   was Soft_Dotted,
659
 *   and there is no intervening combining character class 230 (ABOVE).
660
 *
661
 * Before_Dot
662
 *   C is followed by combining dot above (U+0307).
663
 *   Any sequence of characters with a combining class that is neither 0 nor 230
664
 *   may intervene between the current character and the combining dot above.
665
 *
666
 * The erratum from 2002-10-31 adds the condition
667
 *
668
 * After_I
669
 *   The last preceding base character was an uppercase I, and there is no
670
 *   intervening combining character class 230 (ABOVE).
671
 *
672
 *   (See Jitterbug 2344 and the comments on After_I below.)
673
 *
674
 * Helper definitions in Unicode 3.2 UAX 21:
675
 *
676
 * D1. A character C is defined to be cased
677
 *     if it meets any of the following criteria:
678
 *
679
 *   - The general category of C is Titlecase Letter (Lt)
680
 *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
681
 *   - Given D = NFD(C), then it is not the case that:
682
 *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
683
 *     (This third criterion does not add any characters to the list
684
 *      for Unicode 3.2. Ignored.)
685
 *
686
 * D2. A character C is defined to be case-ignorable
687
 *     if it meets either of the following criteria:
688
 *
689
 *   - The general category of C is
690
 *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
691
 *     Letter Modifier (Lm), or Symbol Modifier (Sk)
692
 *   - C is one of the following characters 
693
 *     U+0027 APOSTROPHE
694
 *     U+00AD SOFT HYPHEN (SHY)
695
 *     U+2019 RIGHT SINGLE QUOTATION MARK
696
 *            (the preferred character for apostrophe)
697
 *
698
 * D3. A case-ignorable sequence is a sequence of
699
 *     zero or more case-ignorable characters.
700
 */
701
702
0
#define is_d(c) ((c)=='d' || (c)=='D')
703
0
#define is_e(c) ((c)=='e' || (c)=='E')
704
0
#define is_i(c) ((c)=='i' || (c)=='I')
705
0
#define is_l(c) ((c)=='l' || (c)=='L')
706
0
#define is_r(c) ((c)=='r' || (c)=='R')
707
0
#define is_t(c) ((c)=='t' || (c)=='T')
708
0
#define is_u(c) ((c)=='u' || (c)=='U')
709
0
#define is_z(c) ((c)=='z' || (c)=='Z')
710
711
/* separator? */
712
0
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
713
714
/**
715
 * Requires non-NULL locale ID but otherwise does the equivalent of
716
 * checking for language codes as if uloc_getLanguage() were called:
717
 * Accepts both 2- and 3-letter codes and accepts case variants.
718
 */
719
U_CFUNC int32_t
720
0
ucase_getCaseLocale(const char *locale) {
721
0
    /*
722
0
     * This function used to use uloc_getLanguage(), but the current code
723
0
     * removes the dependency of this low-level code on uloc implementation code
724
0
     * and is faster because not the whole locale ID has to be
725
0
     * examined and copied/transformed.
726
0
     *
727
0
     * Because this code does not want to depend on uloc, the caller must
728
0
     * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
729
0
     */
730
0
    char c=*locale++;
731
0
    // Fastpath for English "en" which is often used for default (=root locale) case mappings,
732
0
    // and for Chinese "zh": Very common but no special case mapping behavior.
733
0
    // Then check lowercase vs. uppercase to reduce the number of comparisons
734
0
    // for other locales without special behavior.
735
0
    if(c=='e') {
736
0
        /* el or ell? */
737
0
        c=*locale++;
738
0
        if(is_l(c)) {
739
0
            c=*locale++;
740
0
            if(is_l(c)) {
741
0
                c=*locale;
742
0
            }
743
0
            if(is_sep(c)) {
744
0
                return UCASE_LOC_GREEK;
745
0
            }
746
0
        }
747
0
        // en, es, ... -> root
748
0
    } else if(c=='z') {
749
0
        return UCASE_LOC_ROOT;
750
0
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
751
0
    } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
752
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
753
    } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
754
#else
755
#   error Unknown charset family!
756
#endif
757
        // lowercase c
758
0
        if(c=='t') {
759
0
            /* tr or tur? */
760
0
            c=*locale++;
761
0
            if(is_u(c)) {
762
0
                c=*locale++;
763
0
            }
764
0
            if(is_r(c)) {
765
0
                c=*locale;
766
0
                if(is_sep(c)) {
767
0
                    return UCASE_LOC_TURKISH;
768
0
                }
769
0
            }
770
0
        } else if(c=='a') {
771
0
            /* az or aze? */
772
0
            c=*locale++;
773
0
            if(is_z(c)) {
774
0
                c=*locale++;
775
0
                if(is_e(c)) {
776
0
                    c=*locale;
777
0
                }
778
0
                if(is_sep(c)) {
779
0
                    return UCASE_LOC_TURKISH;
780
0
                }
781
0
            }
782
0
        } else if(c=='l') {
783
0
            /* lt or lit? */
784
0
            c=*locale++;
785
0
            if(is_i(c)) {
786
0
                c=*locale++;
787
0
            }
788
0
            if(is_t(c)) {
789
0
                c=*locale;
790
0
                if(is_sep(c)) {
791
0
                    return UCASE_LOC_LITHUANIAN;
792
0
                }
793
0
            }
794
0
        } else if(c=='n') {
795
0
            /* nl or nld? */
796
0
            c=*locale++;
797
0
            if(is_l(c)) {
798
0
                c=*locale++;
799
0
                if(is_d(c)) {
800
0
                    c=*locale;
801
0
                }
802
0
                if(is_sep(c)) {
803
0
                    return UCASE_LOC_DUTCH;
804
0
                }
805
0
            }
806
0
        }
807
0
    } else {
808
0
        // uppercase c
809
0
        // Same code as for lowercase c but also check for 'E'.
810
0
        if(c=='T') {
811
0
            /* tr or tur? */
812
0
            c=*locale++;
813
0
            if(is_u(c)) {
814
0
                c=*locale++;
815
0
            }
816
0
            if(is_r(c)) {
817
0
                c=*locale;
818
0
                if(is_sep(c)) {
819
0
                    return UCASE_LOC_TURKISH;
820
0
                }
821
0
            }
822
0
        } else if(c=='A') {
823
0
            /* az or aze? */
824
0
            c=*locale++;
825
0
            if(is_z(c)) {
826
0
                c=*locale++;
827
0
                if(is_e(c)) {
828
0
                    c=*locale;
829
0
                }
830
0
                if(is_sep(c)) {
831
0
                    return UCASE_LOC_TURKISH;
832
0
                }
833
0
            }
834
0
        } else if(c=='L') {
835
0
            /* lt or lit? */
836
0
            c=*locale++;
837
0
            if(is_i(c)) {
838
0
                c=*locale++;
839
0
            }
840
0
            if(is_t(c)) {
841
0
                c=*locale;
842
0
                if(is_sep(c)) {
843
0
                    return UCASE_LOC_LITHUANIAN;
844
0
                }
845
0
            }
846
0
        } else if(c=='E') {
847
0
            /* el or ell? */
848
0
            c=*locale++;
849
0
            if(is_l(c)) {
850
0
                c=*locale++;
851
0
                if(is_l(c)) {
852
0
                    c=*locale;
853
0
                }
854
0
                if(is_sep(c)) {
855
0
                    return UCASE_LOC_GREEK;
856
0
                }
857
0
            }
858
0
        } else if(c=='N') {
859
0
            /* nl or nld? */
860
0
            c=*locale++;
861
0
            if(is_l(c)) {
862
0
                c=*locale++;
863
0
                if(is_d(c)) {
864
0
                    c=*locale;
865
0
                }
866
0
                if(is_sep(c)) {
867
0
                    return UCASE_LOC_DUTCH;
868
0
                }
869
0
            }
870
0
        }
871
0
    }
872
0
    return UCASE_LOC_ROOT;
873
0
}
874
875
/*
876
 * Is followed by
877
 *   {case-ignorable}* cased
878
 * ?
879
 * (dir determines looking forward/backward)
880
 * If a character is case-ignorable, it is skipped regardless of whether
881
 * it is also cased or not.
882
 */
883
static UBool
884
0
isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
885
0
    UChar32 c;
886
0
887
0
    if(iter==NULL) {
888
0
        return FALSE;
889
0
    }
890
0
891
0
    for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
892
0
        int32_t type=ucase_getTypeOrIgnorable(c);
893
0
        if(type&4) {
894
0
            /* case-ignorable, continue with the loop */
895
0
        } else if(type!=UCASE_NONE) {
896
0
            return TRUE; /* followed by cased letter */
897
0
        } else {
898
0
            return FALSE; /* uncased and not case-ignorable */
899
0
        }
900
0
    }
901
0
902
0
    return FALSE; /* not followed by cased letter */
903
0
}
904
905
/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
906
static UBool
907
0
isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
908
0
    UChar32 c;
909
0
    int32_t dotType;
910
0
    int8_t dir;
911
0
912
0
    if(iter==NULL) {
913
0
        return FALSE;
914
0
    }
915
0
916
0
    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
917
0
        dotType=getDotType(c);
918
0
        if(dotType==UCASE_SOFT_DOTTED) {
919
0
            return TRUE; /* preceded by TYPE_i */
920
0
        } else if(dotType!=UCASE_OTHER_ACCENT) {
921
0
            return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
922
0
        }
923
0
    }
924
0
925
0
    return FALSE; /* not preceded by TYPE_i */
926
0
}
927
928
/*
929
 * See Jitterbug 2344:
930
 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
931
 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
932
 * we made those releases compatible with Unicode 3.2 which had not fixed
933
 * a related bug in SpecialCasing.txt.
934
 *
935
 * From the Jitterbug 2344 text:
936
 * ... this bug is listed as a Unicode erratum
937
 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
938
 * <quote>
939
 * There are two errors in SpecialCasing.txt.
940
 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
941
 * 2. An incorrect context definition. Correct as follows:
942
 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
943
 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
944
 * ---
945
 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
946
 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
947
 * where the context After_I is defined as:
948
 * The last preceding base character was an uppercase I, and there is no
949
 * intervening combining character class 230 (ABOVE).
950
 * </quote>
951
 *
952
 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
953
 *
954
 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
955
 * # This matches the behavior of the canonically equivalent I-dot_above
956
 *
957
 * See also the description in this place in older versions of uchar.c (revision 1.100).
958
 *
959
 * Markus W. Scherer 2003-feb-15
960
 */
961
962
/* Is preceded by base character 'I' with no intervening cc=230 ? */
963
static UBool
964
0
isPrecededBy_I(UCaseContextIterator *iter, void *context) {
965
0
    UChar32 c;
966
0
    int32_t dotType;
967
0
    int8_t dir;
968
0
969
0
    if(iter==NULL) {
970
0
        return FALSE;
971
0
    }
972
0
973
0
    for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
974
0
        if(c==0x49) {
975
0
            return TRUE; /* preceded by I */
976
0
        }
977
0
        dotType=getDotType(c);
978
0
        if(dotType!=UCASE_OTHER_ACCENT) {
979
0
            return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
980
0
        }
981
0
    }
982
0
983
0
    return FALSE; /* not preceded by I */
984
0
}
985
986
/* Is followed by one or more cc==230 ? */
987
static UBool
988
0
isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
989
0
    UChar32 c;
990
0
    int32_t dotType;
991
0
    int8_t dir;
992
0
993
0
    if(iter==NULL) {
994
0
        return FALSE;
995
0
    }
996
0
997
0
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
998
0
        dotType=getDotType(c);
999
0
        if(dotType==UCASE_ABOVE) {
1000
0
            return TRUE; /* at least one cc==230 following */
1001
0
        } else if(dotType!=UCASE_OTHER_ACCENT) {
1002
0
            return FALSE; /* next base character, no more cc==230 following */
1003
0
        }
1004
0
    }
1005
0
1006
0
    return FALSE; /* no more cc==230 following */
1007
0
}
1008
1009
/* Is followed by a dot above (without cc==230 in between) ? */
1010
static UBool
1011
0
isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1012
0
    UChar32 c;
1013
0
    int32_t dotType;
1014
0
    int8_t dir;
1015
0
1016
0
    if(iter==NULL) {
1017
0
        return FALSE;
1018
0
    }
1019
0
1020
0
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1021
0
        if(c==0x307) {
1022
0
            return TRUE;
1023
0
        }
1024
0
        dotType=getDotType(c);
1025
0
        if(dotType!=UCASE_OTHER_ACCENT) {
1026
0
            return FALSE; /* next base character or cc==230 in between */
1027
0
        }
1028
0
    }
1029
0
1030
0
    return FALSE; /* no dot above following */
1031
0
}
1032
1033
/*
 * Full lowercase mapping of c, including the hardcoded conditional
 * mappings from SpecialCasing.txt.
 *
 * iter/context give access to the surrounding text for the context-sensitive
 * conditions; the context helpers treat a NULL iter as "condition not met".
 * loc is one of the UCASE_LOC_ constants.
 *
 * Results, as produced by the code below:
 * - ~c when the mapping result is c itself
 * - the mapped code point for a single-code point mapping
 * - a string length with *pString set to the mapping string
 * - 0 with *pString=nullptr when c is removed (Turkic dot above after I)
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t loc) {
    // The sign of the result has meaning, so the input must be non-negative
    // in order to be returned as is.
    U_ASSERT(c >= 0);
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(!UCASE_HAS_EXCEPTION(props)) {
        /* no exception data: at most a delta from the trie value */
        if(!UCASE_IS_UPPER_OR_TITLE(props)) {
            return ~c;
        }
        const UChar32 mapped=c+UCASE_GET_DELTA(props);
        return (mapped==c) ? ~mapped : mapped;
    }

    const uint16_t *fullPtr=GET_EXCEPTIONS(&ucase_props_singleton, props);
    const uint16_t excWord=*fullPtr++;
    /* independent cursor for the simple-mapping slots; GET_SLOT_VALUE() advances its pointer argument */
    const uint16_t *slotPtr=fullPtr;

    if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
        /*
         * Hardcoded conditions and mappings.
         * Conditional mappings are tested first (otherwise the unconditional
         * default mappings would always be taken), then characters with
         * unconditional mappings in SpecialCasing.txt;
         * anything not handled here falls through to the UnicodeData.txt mappings.
         */
        switch(c) {
        case 0x49:  /* LATIN CAPITAL LETTER I */
            if(loc==UCASE_LOC_LITHUANIAN) {
                /* 0049; 0069 0307; 0049; 0049; lt More_Above; */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=iDot;
                    return 2;
                }
            } else if(loc==UCASE_LOC_TURKISH && !isFollowedByDotAbove(iter, context)) {
                /*
                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
                 */
                return 0x131;
            }
            break;
        case 0x4a:  /* LATIN CAPITAL LETTER J */
            /* 004A; 006A 0307; 004A; 004A; lt More_Above; */
            if(loc==UCASE_LOC_LITHUANIAN && isFollowedByMoreAbove(iter, context)) {
                *pString=jDot;
                return 2;
            }
            break;
        case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
            /* 012E; 012F 0307; 012E; 012E; lt More_Above; */
            if(loc==UCASE_LOC_LITHUANIAN && isFollowedByMoreAbove(iter, context)) {
                *pString=iOgonekDot;
                return 2;
            }
            break;
        /*
         * Lithuanian retains the dot in a lowercase i when followed by accents.
         * The following three are precomposed with an accent above,
         * so there is no need to look for one in the context.
         */
        case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
            /* 00CC; 0069 0307 0300; 00CC; 00CC; lt; */
            if(loc==UCASE_LOC_LITHUANIAN) {
                *pString=iDotGrave;
                return 3;
            }
            break;
        case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
            /* 00CD; 0069 0307 0301; 00CD; 00CD; lt; */
            if(loc==UCASE_LOC_LITHUANIAN) {
                *pString=iDotAcute;
                return 3;
            }
            break;
        case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
            /* 0128; 0069 0307 0303; 0128; 0128; lt; */
            if(loc==UCASE_LOC_LITHUANIAN) {
                *pString=iDotTilde;
                return 3;
            }
            break;
        case 0x130: /* LATIN CAPITAL LETTER I WITH DOT ABOVE */
            if(loc==UCASE_LOC_TURKISH) {
                /*
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri

                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                return 0x69;
            }
            /*
                # Preserve canonical equivalence for I with dot.

                0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
             */
            *pString=iDot;
            return 2;
        case 0x307: /* COMBINING DOT ABOVE */
            if(loc==UCASE_LOC_TURKISH && isPrecededBy_I(iter, context)) {
                /*
                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
                    # This matches the behavior of the canonically equivalent I-dot_above

                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                 */
                *pString=nullptr;
                return 0; /* remove the dot (continue without output) */
            }
            break;
        case 0x3a3: /* GREEK CAPITAL LETTER SIGMA */
            /*
                # Special case for final form of sigma

                03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
             */
            if( !isFollowedByCasedLetter(iter, context, 1) &&
                isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
            ) {
                return 0x3c2; /* greek small final sigma */
            }
            break;
        default:
            break;
        }
        /* no known conditional special case mapping applies, use a normal mapping */
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        int32_t full;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, fullPtr, full);
        const int32_t lowerLength=full&UCASE_FULL_LOWER;
        if(lowerLength!=0) {
            /* the lowercase mapping string starts right after the slot value */
            *pString=reinterpret_cast<const UChar *>(fullPtr+1);
            return lowerLength;
        }
    }

    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slotPtr, delta);
        if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)!=0) {
            return c-delta;
        }
        return c+delta;
    }

    UChar32 result=c;
    if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, slotPtr, result);
    }
    return (result==c) ? ~result : result;
}
1181
1182
/* internal */
1183
static int32_t
1184
toUpperOrTitle(UChar32 c,
1185
               UCaseContextIterator *iter, void *context,
1186
               const UChar **pString,
1187
               int32_t loc,
1188
0
               UBool upperNotTitle) {
1189
0
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1190
0
    U_ASSERT(c >= 0);
1191
0
    UChar32 result=c;
1192
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1193
0
    if(!UCASE_HAS_EXCEPTION(props)) {
1194
0
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1195
0
            result=c+UCASE_GET_DELTA(props);
1196
0
        }
1197
0
    } else {
1198
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1199
0
        uint16_t excWord=*pe++;
1200
0
        int32_t full, idx;
1201
0
1202
0
        pe2=pe;
1203
0
1204
0
        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1205
0
            /* use hardcoded conditions and mappings */
1206
0
            if(loc==UCASE_LOC_TURKISH && c==0x69) {
1207
0
                /*
1208
0
                    # Turkish and Azeri
1209
0
1210
0
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1211
0
                    # The following rules handle those cases.
1212
0
1213
0
                    # When uppercasing, i turns into a dotted capital I
1214
0
1215
0
                    0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1216
0
                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1217
0
                */
1218
0
                return 0x130;
1219
0
            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1220
0
                /*
1221
0
                    # Lithuanian
1222
0
1223
0
                    # Lithuanian retains the dot in a lowercase i when followed by accents.
1224
0
1225
0
                    # Remove DOT ABOVE after "i" with upper or titlecase
1226
0
1227
0
                    0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1228
0
                 */
1229
0
                *pString=nullptr;
1230
0
                return 0; /* remove the dot (continue without output) */
1231
0
            } else {
1232
0
                /* no known conditional special case mapping, use a normal mapping */
1233
0
            }
1234
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1235
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1236
0
1237
0
            /* start of full case mapping strings */
1238
0
            ++pe;
1239
0
1240
0
            /* skip the lowercase and case-folding result strings */
1241
0
            pe+=full&UCASE_FULL_LOWER;
1242
0
            full>>=4;
1243
0
            pe+=full&0xf;
1244
0
            full>>=4;
1245
0
1246
0
            if(upperNotTitle) {
1247
0
                full&=0xf;
1248
0
            } else {
1249
0
                /* skip the uppercase result string */
1250
0
                pe+=full&0xf;
1251
0
                full=(full>>4)&0xf;
1252
0
            }
1253
0
1254
0
            if(full!=0) {
1255
0
                /* set the output pointer to the result string */
1256
0
                *pString=reinterpret_cast<const UChar *>(pe);
1257
0
1258
0
                /* return the string length */
1259
0
                return full;
1260
0
            }
1261
0
        }
1262
0
1263
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1264
0
            int32_t delta;
1265
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1266
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1267
0
        }
1268
0
        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1269
0
            idx=UCASE_EXC_TITLE;
1270
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1271
0
            /* here, titlecase is same as uppercase */
1272
0
            idx=UCASE_EXC_UPPER;
1273
0
        } else {
1274
0
            return ~c;
1275
0
        }
1276
0
        GET_SLOT_VALUE(excWord, idx, pe2, result);
1277
0
    }
1278
0
1279
0
    return (result==c) ? ~result : result;
1280
0
}
1281
1282
/* Full uppercase mapping: toUpperOrTitle() with upperNotTitle=TRUE. */
U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t caseLocale) {
    const UBool upperNotTitle=TRUE;
    return toUpperOrTitle(c, iter, context, pString, caseLocale, upperNotTitle);
}
1289
1290
/* Full titlecase mapping: toUpperOrTitle() with upperNotTitle=FALSE. */
U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t caseLocale) {
    const UBool upperNotTitle=FALSE;
    return toUpperOrTitle(c, iter, context, pString, caseLocale, upperNotTitle);
}
1297
1298
/* case folding ------------------------------------------------------------- */
1299
1300
/*
1301
 * Case folding is similar to lowercasing.
1302
 * The result may be a simple mapping, i.e., a single code point, or
1303
 * a full mapping, i.e., a string.
1304
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1305
 * then only the lowercase mapping is stored.
1306
 *
1307
 * Some special cases are hardcoded because their conditions cannot be
1308
 * parsed and processed from CaseFolding.txt.
1309
 *
1310
 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1311
1312
# C: common case folding, common mappings shared by both simple and full mappings.
1313
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1314
# S: simple case folding, mappings to single characters where different from F.
1315
# T: special case for uppercase I and dotted uppercase I
1316
#    - For non-Turkic languages, this mapping is normally not used.
1317
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1318
#
1319
# Usage:
1320
#  A. To do a simple case folding, use the mappings with status C + S.
1321
#  B. To do a full case folding, use the mappings with status C + F.
1322
#
1323
#    The mappings with status T can be used or omitted depending on the desired case-folding
1324
#    behavior. (The default option is to exclude them.)
1325
1326
 * Unicode 3.2 has 'T' mappings as follows:
1327
1328
0049; T; 0131; # LATIN CAPITAL LETTER I
1329
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1330
1331
 * while the default mappings for these code points are:
1332
1333
0049; C; 0069; # LATIN CAPITAL LETTER I
1334
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1335
1336
 * U+0130 has no simple case folding (simple-case-folds to itself).
1337
 */
1338
1339
/* return the simple case folding mapping for c */
1340
U_CAPI UChar32 U_EXPORT2
1341
0
ucase_fold(UChar32 c, uint32_t options) {
1342
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1343
0
    if(!UCASE_HAS_EXCEPTION(props)) {
1344
0
        if(UCASE_IS_UPPER_OR_TITLE(props)) {
1345
0
            c+=UCASE_GET_DELTA(props);
1346
0
        }
1347
0
    } else {
1348
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1349
0
        uint16_t excWord=*pe++;
1350
0
        int32_t idx;
1351
0
        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1352
0
            /* special case folding mappings, hardcoded */
1353
0
            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1354
0
                /* default mappings */
1355
0
                if(c==0x49) {
1356
0
                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1357
0
                    return 0x69;
1358
0
                } else if(c==0x130) {
1359
0
                    /* no simple case folding for U+0130 */
1360
0
                    return c;
1361
0
                }
1362
0
            } else {
1363
0
                /* Turkic mappings */
1364
0
                if(c==0x49) {
1365
0
                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1366
0
                    return 0x131;
1367
0
                } else if(c==0x130) {
1368
0
                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1369
0
                    return 0x69;
1370
0
                }
1371
0
            }
1372
0
        }
1373
0
        if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1374
0
            return c;
1375
0
        }
1376
0
        if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1377
0
            int32_t delta;
1378
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1379
0
            return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1380
0
        }
1381
0
        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1382
0
            idx=UCASE_EXC_FOLD;
1383
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1384
0
            idx=UCASE_EXC_LOWER;
1385
0
        } else {
1386
0
            return c;
1387
0
        }
1388
0
        GET_SLOT_VALUE(excWord, idx, pe, c);
1389
0
    }
1390
0
    return c;
1391
0
}
1392
1393
/*
1394
 * Issue for canonical caseless match (UAX #21):
1395
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1396
 * canonical equivalence, unlike default-option casefolding.
1397
 * For example, I-grave and I + grave fold to strings that are not canonically
1398
 * equivalent.
1399
 * For more details, see the comment in unorm_compare() in unorm.cpp
1400
 * and the intermediate prototype changes for Jitterbug 2021.
1401
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1402
 *
1403
 * This did not get fixed because it appears that it is not possible to fix
1404
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1405
 * together in a way that they still fold to common result strings.
1406
 */
1407
1408
/*
 * Full case folding mapping of c.
 * The option bits selected by _FOLD_CASE_OPTIONS_MASK choose between the
 * default mappings (U_FOLD_CASE_DEFAULT) and the Turkic 'T' mappings
 * for U+0049 and U+0130.
 *
 * Results, as produced by the code below:
 * - ~c when the mapping result is c itself or there is no mapping
 * - the mapped code point for a single-code point mapping
 * - a string length with *pString set to the mapping string
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,
                    const UChar **pString,
                    uint32_t options) {
    // The sign of the result has meaning, so the input must be non-negative
    // in order to be returned as is.
    U_ASSERT(c >= 0);
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(!UCASE_HAS_EXCEPTION(props)) {
        /* no exception data: at most a delta from the trie value */
        if(!UCASE_IS_UPPER_OR_TITLE(props)) {
            return ~c;
        }
        const UChar32 mapped=c+UCASE_GET_DELTA(props);
        return (mapped==c) ? ~mapped : mapped;
    }

    const uint16_t *fullPtr=GET_EXCEPTIONS(&ucase_props_singleton, props);
    const uint16_t excWord=*fullPtr++;
    /* independent cursor for the simple-mapping slots; GET_SLOT_VALUE() advances its pointer argument */
    const uint16_t *slotPtr=fullPtr;

    if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
        /* use hardcoded conditions and mappings */
        const bool useDefault=(options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT;
        if(c==0x49) {
            /*
             * 0049; C; 0069; # LATIN CAPITAL LETTER I
             * 0049; T; 0131; # LATIN CAPITAL LETTER I
             */
            return useDefault ? 0x69 : 0x131;
        }
        if(c==0x130) {
            if(useDefault) {
                /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                *pString=iDot;
                return 2;
            }
            /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
            return 0x69;
        }
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        int32_t full;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, fullPtr, full);

        /* the strings follow the slot value; skip the lowercase result string */
        const int32_t lowerLength=full&UCASE_FULL_LOWER;
        const int32_t foldLength=(full>>4)&0xf;
        const uint16_t *mapping=fullPtr+1+lowerLength;

        if(foldLength!=0) {
            /* set the output pointer to the result string and return its length */
            *pString=reinterpret_cast<const UChar *>(mapping);
            return foldLength;
        }
    }

    if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
        return ~c;
    }

    if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
        int32_t delta;
        GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, slotPtr, delta);
        if((excWord&UCASE_EXC_DELTA_IS_NEGATIVE)!=0) {
            return c-delta;
        }
        return c+delta;
    }

    /* prefer an explicit case folding slot, otherwise use the lowercase slot */
    int32_t idx;
    if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
        idx=UCASE_EXC_FOLD;
    } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        idx=UCASE_EXC_LOWER;
    } else {
        return ~c;
    }

    UChar32 result;
    GET_SLOT_VALUE(excWord, idx, slotPtr, result);
    return (result==c) ? ~result : result;
}
1488
1489
/* case mapping properties API ---------------------------------------------- */
1490
1491
/* public API (see uchar.h) */
1492
1493
/* TRUE if the case type of c is UCASE_LOWER. */
U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c) {
    const int32_t type=ucase_getType(c);
    return (UBool)(type==UCASE_LOWER);
}
1497
1498
/* TRUE if the case type of c is UCASE_UPPER. */
U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c) {
    const int32_t type=ucase_getType(c);
    return (UBool)(type==UCASE_UPPER);
}
1502
1503
/* Transforms the Unicode character to its lower case equivalent.*/
1504
U_CAPI UChar32 U_EXPORT2
1505
18.2k
u_tolower(UChar32 c) {
1506
18.2k
    return ucase_tolower(c);
1507
18.2k
}
1508
    
1509
/* Transforms the Unicode character to its upper case equivalent.*/
1510
U_CAPI UChar32 U_EXPORT2
1511
18.2k
u_toupper(UChar32 c) {
1512
18.2k
    return ucase_toupper(c);
1513
18.2k
}
1514
1515
/* Transforms the Unicode character to its title case equivalent.*/
1516
U_CAPI UChar32 U_EXPORT2
1517
0
u_totitle(UChar32 c) {
1518
0
    return ucase_totitle(c);
1519
0
}
1520
1521
/* return the simple case folding mapping for c */
1522
U_CAPI UChar32 U_EXPORT2
1523
0
u_foldCase(UChar32 c, uint32_t options) {
1524
0
    return ucase_fold(c, options);
1525
0
}
1526
1527
/*
 * Evaluates one of the case-related binary properties for c.
 * Returns FALSE for any property that is not handled here.
 */
U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c, UProperty which) {
    /* receives the full-mapping string pointers; only the sign of each mapping result is used */
    const UChar *ignoredString;

    switch(which) {
    case UCHAR_LOWERCASE:
        return (UBool)(ucase_getType(c)==UCASE_LOWER);
    case UCHAR_UPPERCASE:
        return (UBool)(ucase_getType(c)==UCASE_UPPER);
    case UCHAR_SOFT_DOTTED:
        return ucase_isSoftDotted(c);
    case UCHAR_CASE_SENSITIVE:
        return ucase_isCaseSensitive(c);
    case UCHAR_CASED:
        return (UBool)(ucase_getType(c)!=UCASE_NONE);
    case UCHAR_CASE_IGNORABLE:
        return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
    /*
     * Note: The Changes_When_Xyz properties below are defined as testing
     * whether the NFD form of the input changes when Xyz-case-mapped.
     * This simpler implementation ignores NFD and still passes the tests;
     * it needs to be changed if the tests start failing.
     * When that happens, optimizations should be used to work with the
     * per-single-code point ucase_toFullXyz() functions unless
     * the NFD form has more than one code point,
     * and the property starts set needs to be the union of the
     * start sets for normalization and case mappings.
     */
    case UCHAR_CHANGES_WHEN_LOWERCASED:
        return (UBool)(ucase_toFullLower(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0);
    case UCHAR_CHANGES_WHEN_UPPERCASED:
        return (UBool)(ucase_toFullUpper(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0);
    case UCHAR_CHANGES_WHEN_TITLECASED:
        return (UBool)(ucase_toFullTitle(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0);
    /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
    case UCHAR_CHANGES_WHEN_CASEMAPPED:
        /* a non-negative result from any of the three full mappings means "changes" */
        if(ucase_toFullLower(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0) {
            return (UBool)true;
        }
        if(ucase_toFullUpper(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0) {
            return (UBool)true;
        }
        return (UBool)(ucase_toFullTitle(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0);
    default:
        return FALSE;
    }
}