Coverage Report

Created: 2023-06-07 07:17

/src/icu/source/common/ucase.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*
6
*   Copyright (C) 2004-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  ucase.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2004aug30
16
*   created by: Markus W. Scherer
17
*
18
*   Low-level Unicode character/string case mapping code.
19
*   Much code moved here (and modified) from uchar.c.
20
*/
21
22
#include "unicode/utypes.h"
23
#include "unicode/unistr.h"
24
#include "unicode/uset.h"
25
#include "unicode/udata.h" /* UDataInfo */
26
#include "unicode/utf16.h"
27
#include "ucmndata.h" /* DataHeader */
28
#include "udatamem.h"
29
#include "umutex.h"
30
#include "uassert.h"
31
#include "cmemory.h"
32
#include "utrie2.h"
33
#include "ucase.h"
34
35
struct UCaseProps {
36
    UDataMemory *mem;
37
    const int32_t *indexes;
38
    const uint16_t *exceptions;
39
    const uint16_t *unfold;
40
41
    UTrie2 trie;
42
    uint8_t formatVersion[4];
43
};
44
45
/* ucase_props_data.h is machine-generated by gencase --csource */
46
#define INCLUDED_FROM_UCASE_CPP
47
#include "ucase_props_data.h"
48
49
/* set of property starts for UnicodeSet ------------------------------------ */
50
51
/*
 * Range callback handed to utrie2_enum(): adds the start code point of
 * the reported range to the USet behind the USetAdder passed as context.
 * The range end and value are ignored. Always returns TRUE.
 */
static UBool U_CALLCONV
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
    const USetAdder *adder=static_cast<const USetAdder *>(context);
    adder->add(adder->set, start);
    return TRUE;
}
58
59
/*
 * Adds to sa the start code point of every same-value range of the
 * case properties trie. Does nothing if *pErrorCode already indicates
 * a failure.
 */
U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
    if(!U_FAILURE(*pErrorCode)) {
        /* one callback per same-value range of the trie */
        utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
    }

    /*
     * Code points with hardcoded properties (and the ones following them)
     * would be added here; there are none right now.
     *
     * Code points with hardcoded specialcasing properties are omitted
     * because we do not build property UnicodeSets for them right now.
     */
}
77
78
/* data access primitives --------------------------------------------------- */
79
80
213k
/* Pointer to the exceptions entry that a properties word refers to. */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* Non-zero if the properties word has its exception bit set. */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)

/* number of bits in an 8-bit integer value */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

/* Non-zero if the flags in the low bits of the exceptions word include slot idx. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))

/*
 * Number of slots present before slot idx:
 * the bit count of the flag bits below bit idx (only the low 8 bits are looked up).
 */
#define SLOT_OFFSET(flags, idx) (flagsOffset[(flags)&((1<<(idx))-1)])

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * Expands to a single statement (do { ... } while(0)), so it must be followed
 * by a semicolon and is safe as the body of an unbraced if/else.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            /* single-width slots: one uint16_t per slot */ \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            /* double-width slots: two uint16_t per slot, high half first */ \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
125
126
/* simple case mappings ----------------------------------------------------- */
127
128
/*
 * Simple lowercase mapping: returns the mapped code point,
 * or c itself if the data has no lowercase mapping for it.
 */
U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(PROPS_HAS_EXCEPTION(props)) {
        /* the mapping, if any, is stored in the optional "lower" slot */
        const uint16_t *slots=GET_EXCEPTIONS(&ucase_props_singleton, props);
        const uint16_t excWord=*slots++;
        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, slots, c);
        }
        return c;
    }

    /* no exception: types at or above UCASE_UPPER carry a signed delta to the lowercase form */
    return UCASE_GET_TYPE(props)>=UCASE_UPPER ? c+UCASE_GET_DELTA(props) : c;
}
144
145
/*
 * Simple uppercase mapping: returns the mapped code point,
 * or c itself if the data has no uppercase mapping for it.
 */
U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(PROPS_HAS_EXCEPTION(props)) {
        /* the mapping, if any, is stored in the optional "upper" slot */
        const uint16_t *slots=GET_EXCEPTIONS(&ucase_props_singleton, props);
        const uint16_t excWord=*slots++;
        if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, slots, c);
        }
        return c;
    }

    /* no exception: only lowercase-type code points carry a delta to apply here */
    return UCASE_GET_TYPE(props)==UCASE_LOWER ? c+UCASE_GET_DELTA(props) : c;
}
161
162
/*
 * Simple titlecase mapping: returns the mapped code point,
 * or c itself if the data has neither a titlecase nor an uppercase mapping.
 */
U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(PROPS_HAS_EXCEPTION(props)) {
        const uint16_t *slots=GET_EXCEPTIONS(&ucase_props_singleton, props);
        const uint16_t excWord=*slots++;
        if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
            /* explicit titlecase mapping */
            GET_SLOT_VALUE(excWord, UCASE_EXC_TITLE, slots, c);
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
            /* no title slot: fall back to the uppercase mapping */
            GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, slots, c);
        }
        return c;
    }

    /* no exception: same delta as for uppercasing */
    return UCASE_GET_TYPE(props)==UCASE_LOWER ? c+UCASE_GET_DELTA(props) : c;
}
184
185
/*
 * Hardcoded result strings for special case mappings.
 * Each array is exactly as long as its string; none is NUL-terminated.
 */
static const UChar iDot[2] = { 0x69, 0x307 };
static const UChar jDot[2] = { 0x6a, 0x307 };
/* two code units, like iDot and jDot (was declared [3] with only two initializers) */
static const UChar iOgonekDot[2] = { 0x12f, 0x307 };
static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
191
192
193
/*
 * Adds to sa the code points and strings that are case-equivalent to c:
 * its simple mappings, its full case folding string, and the code points
 * of its closure string. c itself is not added here.
 */
U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
    /*
     * Hardcode the case closure of i and its relatives and ignore the
     * data file data for these characters.
     * The Turkic dotless i and dotted I with their case mapping conditions
     * and case folding option make the related characters behave specially.
     * This code matches their closure behavior to their case folding behavior.
     */
    if(c==0x49) {
        /* regular i and I are in one equivalence class */
        sa->add(sa->set, 0x69);
        return;
    }
    if(c==0x69) {
        sa->add(sa->set, 0x49);
        return;
    }
    if(c==0x130) {
        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
        sa->addString(sa->set, iDot, 2);
        return;
    }
    if(c==0x131) {
        /* dotless i is in a class by itself */
        return;
    }

    /* everything else uses the data file data */
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!PROPS_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
            /* add the one simple case mapping, no matter what type it is */
            const int32_t delta=UCASE_GET_DELTA(props);
            if(delta!=0) {
                sa->add(sa->set, c+delta);
            }
        }
        return;
    }

    /*
     * c has exceptions, so there may be multiple simple and/or
     * full case mappings. Add them all.
     */
    const uint16_t *slots=GET_EXCEPTIONS(&ucase_props_singleton, props);
    const uint16_t excWord=*slots++;
    const uint16_t *const firstSlot=slots; /* every slot lookup restarts from here */

    /* add all simple case mappings */
    for(int32_t slot=UCASE_EXC_LOWER; slot<=UCASE_EXC_TITLE; ++slot) {
        if(HAS_SLOT(excWord, slot)) {
            slots=firstSlot;
            GET_SLOT_VALUE(excWord, slot, slots, c);
            sa->add(sa->set, c);
        }
    }

    /* get the closure string pointer & length */
    const UChar *closure=NULL;
    int32_t closureLength=0;
    if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
        slots=firstSlot;
        GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, slots, closureLength);
        closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
        closure=(const UChar *)slots+1; /* behind this slot, unless there are full case mappings */
    }

    /* add the full case folding */
    if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        int32_t fullLength;
        slots=firstSlot;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, slots, fullLength);

        /* start of full case mapping strings */
        ++slots;

        fullLength&=0xffff; /* bits 16 and higher are reserved */

        /* skip the lowercase result string */
        slots+=fullLength&UCASE_FULL_LOWER;
        fullLength>>=4;

        /* add the full case folding string */
        const int32_t foldLength=fullLength&0xf;
        if(foldLength!=0) {
            sa->addString(sa->set, (const UChar *)slots, foldLength);
            slots+=foldLength;
        }

        /* skip the uppercase and titlecase strings */
        fullLength>>=4;
        slots+=fullLength&0xf;
        fullLength>>=4;
        slots+=fullLength;

        closure=(const UChar *)slots; /* behind full case mappings */
    }

    /* add each code point in the closure string */
    for(int32_t i=0; i<closureLength;) {
        U16_NEXT_UNSAFE(closure, i, c);
        sa->add(sa->set, c);
    }
}
303
304
/*
305
 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
306
 * must be length>0 and max>0 and length<=max
307
 */
308
static inline int32_t
strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
    /* we require length<=max, so the surplus of t's capacity over s is fixed up front */
    const int32_t surplus=max-length;

    for(;;) {
        const int32_t unitS=*s++;
        const int32_t unitT=*t++;
        if(unitT==0) {
            return 1; /* reached the end of t but not of s */
        }
        const int32_t diff=unitS-unitT;
        if(diff!=0) {
            return diff; /* return difference result */
        }
        if(--length<=0) {
            break; /* all of s has been compared */
        }
    }

    /* s is exhausted: equal only if t ends here too (by capacity or by NUL) */
    if(surplus==0 || *t==0) {
        return 0;
    }
    return -surplus; /* negative: s is shorter than t */
}
332
333
/*
 * Looks up the string s[0..length[ in the reverse case folding ("unfold")
 * table. If found, adds each code point listed for it, plus that code
 * point's own case closure, to sa.
 *
 * @return TRUE if the string was found, FALSE otherwise
 */
U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
    const uint16_t *unfold=ucase_props_singleton.unfold;
    if(unfold==NULL || s==NULL) {
        return FALSE; /* no reverse case folding data, or no string */
    }

    /*
     * The string is too short to find any match.
     * More precise would be:
     * if(!u_strHasMoreChar32Than(s, length, 1))
     * but this does not make much practical difference because
     * a single supplementary code point would just not be found.
     */
    if(length<=1) {
        return FALSE;
    }

    /* the first row of the table is a header with its dimensions */
    const int32_t unfoldRows=unfold[UCASE_UNFOLD_ROWS];
    const int32_t unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
    const int32_t unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
    unfold+=unfoldRowWidth;

    if(length>unfoldStringWidth) {
        /* the string is too long to find any match */
        return FALSE;
    }

    /* do a binary search for the string */
    int32_t start=0;
    int32_t limit=unfoldRows;
    while(start<limit) {
        const int32_t mid=(start+limit)/2;
        const UChar *row=reinterpret_cast<const UChar *>(unfold+(mid*unfoldRowWidth));
        const int32_t result=strcmpMax(s, length, row, unfoldStringWidth);

        if(result<0) {
            limit=mid;
        } else if(result>0) {
            start=mid+1;
        } else {
            /* found the string: add each code point, and its case closure */
            for(int32_t i=unfoldStringWidth; i<unfoldRowWidth && row[i]!=0;) {
                UChar32 c;
                U16_NEXT_UNSAFE(row, i, c);
                sa->add(sa->set, c);
                ucase_addCaseClosure(c, sa);
            }
            return TRUE;
        }
    }

    return FALSE; /* string not found */
}
389
390
U_NAMESPACE_BEGIN
391
392
/*
 * Reads the table dimensions from the header row of the unfold data and
 * positions the iterator on the first code point of the first data row.
 * The initializers read through the unfold member, so they rely on unfold
 * being initialized before the dimension members.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    /* skip the header row so that row 0 is the first data row */
    unfold+=unfoldRowWidth;
}
401
402
/*
 * Delivers the next (code point, full folding string) pair.
 *
 * @param full set to the string in the first column of the current row
 * @return the next code point of that row, or U_SENTINEL when all rows are done
 */
UChar32
FullCaseFoldingIterator::next(UnicodeString &full) {
    const UChar *row=unfold+(currentRow*unfoldRowWidth);

    // Advance past the last-delivered code point:
    // move on to the next row once this row's code points are used up.
    if(rowCpIndex>=unfoldRowWidth || row[rowCpIndex]==0) {
        ++currentRow;
        row+=unfoldRowWidth;
        rowCpIndex=unfoldStringWidth;
    }
    if(currentRow>=unfoldRows) {
        return U_SENTINEL;
    }

    // Set "full" to the NUL-terminated string in the first unfold column:
    // trim the trailing NUL padding to find its length.
    int32_t length=unfoldStringWidth;
    for(; length>0 && row[length-1]==0; --length) {}
    full.setTo(FALSE, row, length);

    // Return the code point.
    UChar32 c;
    U16_NEXT_UNSAFE(row, rowCpIndex, c);
    return c;
}
421
422
U_NAMESPACE_END
423
424
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
425
U_CAPI int32_t U_EXPORT2
ucase_getType(UChar32 c) {
    /* the type is stored directly in the trie's properties word */
    const uint16_t propsWord=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    return UCASE_GET_TYPE(propsWord);
}
430
431
/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
432
U_CAPI int32_t U_EXPORT2
ucase_getTypeOrIgnorable(UChar32 c) {
    /* type bits plus the case-ignorable flag, both taken from the properties word */
    const uint16_t propsWord=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    return UCASE_GET_TYPE_AND_IGNORABLE(propsWord);
}
437
438
/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
439
static inline int32_t
getDotType(UChar32 c) {
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(PROPS_HAS_EXCEPTION(props)) {
        /* with an exception, the dot type lives in the exceptions word instead */
        const uint16_t excWord=*GET_EXCEPTIONS(&ucase_props_singleton, props);
        return (excWord>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
    }
    return props&UCASE_DOT_MASK;
}
449
450
/* @return TRUE if the dot type of c is UCASE_SOFT_DOTTED */
U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(UChar32 c) {
    const int32_t dotType=getDotType(c);
    return (UBool)(dotType==UCASE_SOFT_DOTTED);
}
454
455
/* @return TRUE if the UCASE_SENSITIVE bit is set in the properties word of c */
U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(UChar32 c) {
    const uint16_t propsWord=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    const int32_t sensitiveBit=propsWord&UCASE_SENSITIVE;
    return (UBool)(sensitiveBit!=0);
}
460
461
/* string casing ------------------------------------------------------------ */
462
463
/*
464
 * These internal functions form the core of string case mappings.
465
 * They map single code points to result code points or strings and take
466
 * all necessary conditions (context, locale ID, options) into account.
467
 *
468
 * They do not iterate over the source or write to the destination
469
 * so that the same functions are useful for non-standard string storage,
470
 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
471
 * For the same reason, the "surrounding text" context is passed in as a
472
 * UCaseContextIterator which does not make any assumptions about
473
 * the underlying storage.
474
 *
475
 * This section contains helper functions that check for conditions
476
 * in the input text surrounding the current code point
477
 * according to SpecialCasing.txt.
478
 *
479
 * Each helper function gets the index
480
 * - after the current code point if it looks at following text
481
 * - before the current code point if it looks at preceding text
482
 *
483
 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
484
 *
485
 * Final_Sigma
486
 *   C is preceded by a sequence consisting of
487
 *     a cased letter and a case-ignorable sequence,
488
 *   and C is not followed by a sequence consisting of
489
 *     an ignorable sequence and then a cased letter.
490
 *
491
 * More_Above
492
 *   C is followed by one or more characters of combining class 230 (ABOVE)
493
 *   in the combining character sequence.
494
 *
495
 * After_Soft_Dotted
496
 *   The last preceding character with combining class of zero before C
497
 *   was Soft_Dotted,
498
 *   and there is no intervening combining character class 230 (ABOVE).
499
 *
500
 * Before_Dot
501
 *   C is followed by combining dot above (U+0307).
502
 *   Any sequence of characters with a combining class that is neither 0 nor 230
503
 *   may intervene between the current character and the combining dot above.
504
 *
505
 * The erratum from 2002-10-31 adds the condition
506
 *
507
 * After_I
508
 *   The last preceding base character was an uppercase I, and there is no
509
 *   intervening combining character class 230 (ABOVE).
510
 *
511
 *   (See Jitterbug 2344 and the comments on After_I below.)
512
 *
513
 * Helper definitions in Unicode 3.2 UAX 21:
514
 *
515
 * D1. A character C is defined to be cased
516
 *     if it meets any of the following criteria:
517
 *
518
 *   - The general category of C is Titlecase Letter (Lt)
519
 *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
520
 *   - Given D = NFD(C), then it is not the case that:
521
 *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
522
 *     (This third criterion does not add any characters to the list
523
 *      for Unicode 3.2. Ignored.)
524
 *
525
 * D2. A character C is defined to be case-ignorable
526
 *     if it meets either of the following criteria:
527
 *
528
 *   - The general category of C is
529
 *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
530
 *     Letter Modifier (Lm), or Symbol Modifier (Sk)
531
 *   - C is one of the following characters 
532
 *     U+0027 APOSTROPHE
533
 *     U+00AD SOFT HYPHEN (SHY)
534
 *     U+2019 RIGHT SINGLE QUOTATION MARK
535
 *            (the preferred character for apostrophe)
536
 *
537
 * D3. A case-ignorable sequence is a sequence of
538
 *     zero or more case-ignorable characters.
539
 */
540
541
0
/*
 * Case-insensitive tests for single invariant letters, used for matching
 * language codes. Each macro evaluates its argument twice.
 */
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? (underscore, hyphen, or the terminating NUL) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
552
553
/**
554
 * Requires non-NULL locale ID but otherwise does the equivalent of
555
 * checking for language codes as if uloc_getLanguage() were called:
556
 * Accepts both 2- and 3-letter codes and accepts case variants.
557
 */
558
U_CFUNC int32_t
559
3.57k
ucase_getCaseLocale(const char *locale) {
560
    /*
561
     * This function used to use uloc_getLanguage(), but the current code
562
     * removes the dependency of this low-level code on uloc implementation code
563
     * and is faster because not the whole locale ID has to be
564
     * examined and copied/transformed.
565
     *
566
     * Because this code does not want to depend on uloc, the caller must
567
     * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
568
     */
569
3.57k
    char c=*locale++;
570
    // Fastpath for English "en" which is often used for default (=root locale) case mappings,
571
    // and for Chinese "zh": Very common but no special case mapping behavior.
572
    // Then check lowercase vs. uppercase to reduce the number of comparisons
573
    // for other locales without special behavior.
574
3.57k
    if(c=='e') {
575
        /* el or ell? */
576
3.57k
        c=*locale++;
577
3.57k
        if(is_l(c)) {
578
0
            c=*locale++;
579
0
            if(is_l(c)) {
580
0
                c=*locale;
581
0
            }
582
0
            if(is_sep(c)) {
583
0
                return UCASE_LOC_GREEK;
584
0
            }
585
0
        }
586
        // en, es, ... -> root
587
3.57k
    } else if(c=='z') {
588
0
        return UCASE_LOC_ROOT;
589
0
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
590
0
    } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
591
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
592
    } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
593
#else
594
#   error Unknown charset family!
595
#endif
596
        // lowercase c
597
0
        if(c=='t') {
598
            /* tr or tur? */
599
0
            c=*locale++;
600
0
            if(is_u(c)) {
601
0
                c=*locale++;
602
0
            }
603
0
            if(is_r(c)) {
604
0
                c=*locale;
605
0
                if(is_sep(c)) {
606
0
                    return UCASE_LOC_TURKISH;
607
0
                }
608
0
            }
609
0
        } else if(c=='a') {
610
            /* az or aze? */
611
0
            c=*locale++;
612
0
            if(is_z(c)) {
613
0
                c=*locale++;
614
0
                if(is_e(c)) {
615
0
                    c=*locale;
616
0
                }
617
0
                if(is_sep(c)) {
618
0
                    return UCASE_LOC_TURKISH;
619
0
                }
620
0
            }
621
0
        } else if(c=='l') {
622
            /* lt or lit? */
623
0
            c=*locale++;
624
0
            if(is_i(c)) {
625
0
                c=*locale++;
626
0
            }
627
0
            if(is_t(c)) {
628
0
                c=*locale;
629
0
                if(is_sep(c)) {
630
0
                    return UCASE_LOC_LITHUANIAN;
631
0
                }
632
0
            }
633
0
        } else if(c=='n') {
634
            /* nl or nld? */
635
0
            c=*locale++;
636
0
            if(is_l(c)) {
637
0
                c=*locale++;
638
0
                if(is_d(c)) {
639
0
                    c=*locale;
640
0
                }
641
0
                if(is_sep(c)) {
642
0
                    return UCASE_LOC_DUTCH;
643
0
                }
644
0
            }
645
0
        }
646
0
    } else {
647
        // uppercase c
648
        // Same code as for lowercase c but also check for 'E'.
649
0
        if(c=='T') {
650
            /* tr or tur? */
651
0
            c=*locale++;
652
0
            if(is_u(c)) {
653
0
                c=*locale++;
654
0
            }
655
0
            if(is_r(c)) {
656
0
                c=*locale;
657
0
                if(is_sep(c)) {
658
0
                    return UCASE_LOC_TURKISH;
659
0
                }
660
0
            }
661
0
        } else if(c=='A') {
662
            /* az or aze? */
663
0
            c=*locale++;
664
0
            if(is_z(c)) {
665
0
                c=*locale++;
666
0
                if(is_e(c)) {
667
0
                    c=*locale;
668
0
                }
669
0
                if(is_sep(c)) {
670
0
                    return UCASE_LOC_TURKISH;
671
0
                }
672
0
            }
673
0
        } else if(c=='L') {
674
            /* lt or lit? */
675
0
            c=*locale++;
676
0
            if(is_i(c)) {
677
0
                c=*locale++;
678
0
            }
679
0
            if(is_t(c)) {
680
0
                c=*locale;
681
0
                if(is_sep(c)) {
682
0
                    return UCASE_LOC_LITHUANIAN;
683
0
                }
684
0
            }
685
0
        } else if(c=='E') {
686
            /* el or ell? */
687
0
            c=*locale++;
688
0
            if(is_l(c)) {
689
0
                c=*locale++;
690
0
                if(is_l(c)) {
691
0
                    c=*locale;
692
0
                }
693
0
                if(is_sep(c)) {
694
0
                    return UCASE_LOC_GREEK;
695
0
                }
696
0
            }
697
0
        } else if(c=='N') {
698
            /* nl or nld? */
699
0
            c=*locale++;
700
0
            if(is_l(c)) {
701
0
                c=*locale++;
702
0
                if(is_d(c)) {
703
0
                    c=*locale;
704
0
                }
705
0
                if(is_sep(c)) {
706
0
                    return UCASE_LOC_DUTCH;
707
0
                }
708
0
            }
709
0
        }
710
0
    }
711
3.57k
    return UCASE_LOC_ROOT;
712
3.57k
}
713
714
/*
715
 * Is followed by
716
 *   {case-ignorable}* cased
717
 * ?
718
 * (dir determines looking forward/backward)
719
 * If a character is case-ignorable, it is skipped regardless of whether
720
 * it is also cased or not.
721
 */
722
static UBool
isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
    if(iter==NULL) {
        return FALSE;
    }

    UChar32 c;
    /* the first call passes dir!=0 to set the direction, later calls pass 0 */
    while((c=iter(context, dir))>=0) {
        dir=0;
        const int32_t type=ucase_getTypeOrIgnorable(c);
        if((type&4)==0) {
            /* not case-ignorable: this character decides the answer */
            return (UBool)(type!=UCASE_NONE);
        }
        /* case-ignorable (bit 2 set), keep looking */
    }

    return FALSE; /* ran out of text: not followed by cased letter */
}
743
744
/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
745
static UBool
isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
    if(iter==NULL) {
        return FALSE;
    }

    UChar32 c;
    int8_t dir=-1; /* first call: start iterating backward; later calls pass 0 */
    while((c=iter(context, dir))>=0) {
        dir=0;
        const int32_t dotType=getDotType(c);
        if(dotType==UCASE_SOFT_DOTTED) {
            return TRUE; /* preceded by TYPE_i */
        }
        if(dotType!=UCASE_OTHER_ACCENT) {
            return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
        }
        /* other accent: skip it and look further back */
    }

    return FALSE; /* not preceded by TYPE_i */
}
766
767
/*
768
 * See Jitterbug 2344:
769
 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
770
 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
771
 * we made those releases compatible with Unicode 3.2 which had not fixed
772
 * a related bug in SpecialCasing.txt.
773
 *
774
 * From the Jitterbug 2344 text:
775
 * ... this bug is listed as a Unicode erratum
776
 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
777
 * <quote>
778
 * There are two errors in SpecialCasing.txt.
779
 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
780
 * 2. An incorrect context definition. Correct as follows:
781
 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
782
 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
783
 * ---
784
 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
785
 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
786
 * where the context After_I is defined as:
787
 * The last preceding base character was an uppercase I, and there is no
788
 * intervening combining character class 230 (ABOVE).
789
 * </quote>
790
 *
791
 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
792
 *
793
 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
794
 * # This matches the behavior of the canonically equivalent I-dot_above
795
 *
796
 * See also the description in this place in older versions of uchar.c (revision 1.100).
797
 *
798
 * Markus W. Scherer 2003-feb-15
799
 */
800
801
/* Is preceded by base character 'I' with no intervening cc=230 ? */
802
static UBool
isPrecededBy_I(UCaseContextIterator *iter, void *context) {
    if(iter==NULL) {
        return FALSE;
    }

    UChar32 c;
    int8_t dir=-1; /* first call: start iterating backward; later calls pass 0 */
    while((c=iter(context, dir))>=0) {
        dir=0;
        if(c==0x49) {
            return TRUE; /* preceded by I */
        }
        if(getDotType(c)!=UCASE_OTHER_ACCENT) {
            return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
        }
        /* other accent: skip it and look further back */
    }

    return FALSE; /* not preceded by I */
}
824
825
/* Is followed by one or more cc==230 ? */
826
static UBool
isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
    if(iter==NULL) {
        return FALSE;
    }

    UChar32 c;
    int8_t dir=1; /* first call: start iterating forward; later calls pass 0 */
    while((c=iter(context, dir))>=0) {
        dir=0;
        const int32_t dotType=getDotType(c);
        if(dotType==UCASE_ABOVE) {
            return TRUE; /* at least one cc==230 following */
        }
        if(dotType!=UCASE_OTHER_ACCENT) {
            return FALSE; /* next base character, no more cc==230 following */
        }
        /* other accent: skip it and look further ahead */
    }

    return FALSE; /* no more cc==230 following */
}
847
848
/* Is followed by a dot above (without cc==230 in between) ? */
849
static UBool
850
0
isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
851
0
    UChar32 c;
852
0
    int32_t dotType;
853
0
    int8_t dir;
854
855
0
    if(iter==NULL) {
856
0
        return FALSE;
857
0
    }
858
859
0
    for(dir=1; (c=iter(context, dir))>=0; dir=0) {
860
0
        if(c==0x307) {
861
0
            return TRUE;
862
0
        }
863
0
        dotType=getDotType(c);
864
0
        if(dotType!=UCASE_OTHER_ACCENT) {
865
0
            return FALSE; /* next base character or cc==230 in between */
866
0
        }
867
0
    }
868
869
0
    return FALSE; /* no dot above following */
870
0
}
871
872
/*
 * Full (possibly context- and locale-sensitive) lowercase mapping of c.
 *
 * c        code point to map; must be non-negative
 * iter     context iterator used by the conditional mappings (may be NULL)
 * context  opaque pointer handed to iter
 * pString  receives a pointer to the result string when the mapping is a string
 * loc      case locale (UCASE_LOC_...)
 *
 * Return value, as produced by the code below:
 *   ~c           c maps to itself
 *   0            c is removed (Turkic: U+0307 after I)
 *   a length     *pString has been set to a result string of that length
 *   otherwise    the single code point that c maps to
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t loc) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(!PROPS_HAS_EXCEPTION(props)) {
        /* simple case: only uppercase/titlecase characters carry a lowercase delta */
        if(UCASE_GET_TYPE(props)<UCASE_UPPER) {
            return ~c;
        }
        const UChar32 mapped=c+UCASE_GET_DELTA(props);
        return (mapped==c) ? ~mapped : mapped;
    }

    /* excSlots points just past the exception word, at the first optional slot */
    const uint16_t *excSlots=GET_EXCEPTIONS(&ucase_props_singleton, props);
    const uint16_t excWord=*excSlots++;

    if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
        /*
         * Hardcoded conditions and mappings.
         * Conditional mappings are tested first (otherwise the unconditional
         * default mappings would always be taken), then the unconditional
         * SpecialCasing.txt mappings; the UnicodeData.txt mapping is the fallback.
         */
        if(loc==UCASE_LOC_LITHUANIAN) {
            /*
             * Lithuanian retains the dot in a lowercase i when followed by accents:
             * introduce an explicit dot above when lowercasing capital I's and J's
             * whenever there are more accents above.
             *
             *   0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
             *   004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
             *   012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
             *   00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
             *   00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
             *   0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
             */
            switch(c) {
            /* base characters: the mapping applies only if accents above follow */
            case 0x49:  /* LATIN CAPITAL LETTER I */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=iDot;
                    return 2;
                }
                break;
            case 0x4a:  /* LATIN CAPITAL LETTER J */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=jDot;
                    return 2;
                }
                break;
            case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
                if(isFollowedByMoreAbove(iter, context)) {
                    *pString=iOgonekDot;
                    return 2;
                }
                break;
            /* precomposed with an accent above: no need to look for one */
            case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
                *pString=iDotGrave;
                return 3;
            case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
                *pString=iDotAcute;
                return 3;
            case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
                *pString=iDotTilde;
                return 3;
            default:
                break;
            }
        } else if(loc==UCASE_LOC_TURKISH) {
            /* Turkish and Azeri: I and i-dotless; I-dot and i are case pairs */
            if(c==0x130) {
                /*
                 *   0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 *   0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                return 0x69;
            }
            if(c==0x307 && isPrecededBy_I(iter, context)) {
                /*
                 * When lowercasing, remove dot_above in the sequence I + dot_above,
                 * which will turn into i. This matches the behavior of the
                 * canonically equivalent I-dot_above.
                 *
                 *   0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                 *   0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                 */
                return 0; /* remove the dot (continue without output) */
            }
            if(c==0x49 && !isFollowedByDotAbove(iter, context)) {
                /*
                 * When lowercasing, unless an I is before a dot_above,
                 * it turns into a dotless i.
                 *
                 *   0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
                 *   0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
                 */
                return 0x131;
            }
        }

        /* locale-independent special cases */
        if(c==0x130) {
            /*
             * Preserve canonical equivalence for I with dot.
             * (The Turkic mapping was handled above.)
             *
             *   0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
             */
            *pString=iDot;
            return 2;
        }
        if( c==0x3a3 &&
            !isFollowedByCasedLetter(iter, context, 1) &&
            isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
        ) {
            /*
             * Greek capital sigma maps depending on surrounding cased letters
             * (see SpecialCasing.txt).
             *
             *   03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
             */
            return 0x3c2; /* greek small final sigma */
        }
        /* no known conditional special case mapping, use a normal mapping */
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        /* work on a copy so that excSlots stays at the first slot for the lookup below */
        const uint16_t *fullSlot=excSlots;
        int32_t lengths;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, fullSlot, lengths);
        const int32_t lowerLength=lengths&UCASE_FULL_LOWER;
        if(lowerLength!=0) {
            /* the lowercase string starts right after the lengths word */
            *pString=reinterpret_cast<const UChar *>(fullSlot+1);
            return lowerLength;
        }
    }

    /* fall back to the simple lowercase slot, if there is one */
    UChar32 result=c;
    if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, excSlots, result);
    }
    return (result==c) ? ~result : result;
}
1014
1015
/* internal */
1016
static int32_t
1017
toUpperOrTitle(UChar32 c,
1018
               UCaseContextIterator *iter, void *context,
1019
               const UChar **pString,
1020
               int32_t loc,
1021
0
               UBool upperNotTitle) {
1022
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1023
0
    U_ASSERT(c >= 0);
1024
0
    UChar32 result=c;
1025
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1026
0
    if(!PROPS_HAS_EXCEPTION(props)) {
1027
0
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1028
0
            result=c+UCASE_GET_DELTA(props);
1029
0
        }
1030
0
    } else {
1031
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1032
0
        uint16_t excWord=*pe++;
1033
0
        int32_t full, idx;
1034
1035
0
        pe2=pe;
1036
1037
0
        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1038
            /* use hardcoded conditions and mappings */
1039
0
            if(loc==UCASE_LOC_TURKISH && c==0x69) {
1040
                /*
1041
                    # Turkish and Azeri
1042
1043
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1044
                    # The following rules handle those cases.
1045
1046
                    # When uppercasing, i turns into a dotted capital I
1047
1048
                    0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1049
                    0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1050
                */
1051
0
                return 0x130;
1052
0
            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1053
                /*
1054
                    # Lithuanian
1055
1056
                    # Lithuanian retains the dot in a lowercase i when followed by accents.
1057
1058
                    # Remove DOT ABOVE after "i" with upper or titlecase
1059
1060
                    0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1061
                 */
1062
0
                return 0; /* remove the dot (continue without output) */
1063
0
            } else {
1064
                /* no known conditional special case mapping, use a normal mapping */
1065
0
            }
1066
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1067
0
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1068
1069
            /* start of full case mapping strings */
1070
0
            ++pe;
1071
1072
            /* skip the lowercase and case-folding result strings */
1073
0
            pe+=full&UCASE_FULL_LOWER;
1074
0
            full>>=4;
1075
0
            pe+=full&0xf;
1076
0
            full>>=4;
1077
1078
0
            if(upperNotTitle) {
1079
0
                full&=0xf;
1080
0
            } else {
1081
                /* skip the uppercase result string */
1082
0
                pe+=full&0xf;
1083
0
                full=(full>>4)&0xf;
1084
0
            }
1085
1086
0
            if(full!=0) {
1087
                /* set the output pointer to the result string */
1088
0
                *pString=reinterpret_cast<const UChar *>(pe);
1089
1090
                /* return the string length */
1091
0
                return full;
1092
0
            }
1093
0
        }
1094
1095
0
        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1096
0
            idx=UCASE_EXC_TITLE;
1097
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1098
            /* here, titlecase is same as uppercase */
1099
0
            idx=UCASE_EXC_UPPER;
1100
0
        } else {
1101
0
            return ~c;
1102
0
        }
1103
0
        GET_SLOT_VALUE(excWord, idx, pe2, result);
1104
0
    }
1105
1106
0
    return (result==c) ? ~result : result;
1107
0
}
1108
1109
/* Full uppercase mapping of c: toUpperOrTitle() with the uppercase variant selected. */
U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t caseLocale) {
    const UBool upperNotTitle=TRUE;
    return toUpperOrTitle(c, iter, context, pString, caseLocale, upperNotTitle);
}
1116
1117
/* Full titlecase mapping of c: toUpperOrTitle() with the titlecase variant selected. */
U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t caseLocale) {
    const UBool upperNotTitle=FALSE;
    return toUpperOrTitle(c, iter, context, pString, caseLocale, upperNotTitle);
}
1124
1125
/* case folding ------------------------------------------------------------- */
1126
1127
/*
1128
 * Case folding is similar to lowercasing.
1129
 * The result may be a simple mapping, i.e., a single code point, or
1130
 * a full mapping, i.e., a string.
1131
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1132
 * then only the lowercase mapping is stored.
1133
 *
1134
 * Some special cases are hardcoded because their conditions cannot be
1135
 * parsed and processed from CaseFolding.txt.
1136
 *
1137
 * Unicode 3.2 CaseFolding.txt specifies for its status field:
1138
1139
# C: common case folding, common mappings shared by both simple and full mappings.
1140
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1141
# S: simple case folding, mappings to single characters where different from F.
1142
# T: special case for uppercase I and dotted uppercase I
1143
#    - For non-Turkic languages, this mapping is normally not used.
1144
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1145
#
1146
# Usage:
1147
#  A. To do a simple case folding, use the mappings with status C + S.
1148
#  B. To do a full case folding, use the mappings with status C + F.
1149
#
1150
#    The mappings with status T can be used or omitted depending on the desired case-folding
1151
#    behavior. (The default option is to exclude them.)
1152
1153
 * Unicode 3.2 has 'T' mappings as follows:
1154
1155
0049; T; 0131; # LATIN CAPITAL LETTER I
1156
0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1157
1158
 * while the default mappings for these code points are:
1159
1160
0049; C; 0069; # LATIN CAPITAL LETTER I
1161
0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1162
1163
 * U+0130 has no simple case folding (simple-case-folds to itself).
1164
 */
1165
1166
/* return the simple case folding mapping for c */
1167
U_CAPI UChar32 U_EXPORT2
1168
0
ucase_fold(UChar32 c, uint32_t options) {
1169
0
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1170
0
    if(!PROPS_HAS_EXCEPTION(props)) {
1171
0
        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1172
0
            c+=UCASE_GET_DELTA(props);
1173
0
        }
1174
0
    } else {
1175
0
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1176
0
        uint16_t excWord=*pe++;
1177
0
        int32_t idx;
1178
0
        if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1179
            /* special case folding mappings, hardcoded */
1180
0
            if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1181
                /* default mappings */
1182
0
                if(c==0x49) {
1183
                    /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1184
0
                    return 0x69;
1185
0
                } else if(c==0x130) {
1186
                    /* no simple case folding for U+0130 */
1187
0
                    return c;
1188
0
                }
1189
0
            } else {
1190
                /* Turkic mappings */
1191
0
                if(c==0x49) {
1192
                    /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1193
0
                    return 0x131;
1194
0
                } else if(c==0x130) {
1195
                    /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1196
0
                    return 0x69;
1197
0
                }
1198
0
            }
1199
0
        }
1200
0
        if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1201
0
            idx=UCASE_EXC_FOLD;
1202
0
        } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1203
0
            idx=UCASE_EXC_LOWER;
1204
0
        } else {
1205
0
            return c;
1206
0
        }
1207
0
        GET_SLOT_VALUE(excWord, idx, pe, c);
1208
0
    }
1209
0
    return c;
1210
0
}
1211
1212
/*
1213
 * Issue for canonical caseless match (UAX #21):
1214
 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1215
 * canonical equivalence, unlike default-option casefolding.
1216
 * For example, I-grave and I + grave fold to strings that are not canonically
1217
 * equivalent.
1218
 * For more details, see the comment in unorm_compare() in unorm.cpp
1219
 * and the intermediate prototype changes for Jitterbug 2021.
1220
 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1221
 *
1222
 * This did not get fixed because it appears that it is not possible to fix
1223
 * it for uppercase and lowercase characters (I-grave vs. i-grave)
1224
 * together in a way that they still fold to common result strings.
1225
 */
1226
1227
/*
 * Full case folding of c.
 *
 * c        code point to fold; must be non-negative
 * pString  receives a pointer to the result string when the folding is a string
 * options  selects the default mappings or the Turkic 'T' mappings for U+0049 and U+0130
 *
 * Return value, as produced by the code below:
 *   ~c           c folds to itself
 *   a length     *pString has been set to a result string of that length
 *   otherwise    the single code point that c folds to
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,
                    const UChar **pString,
                    uint32_t options) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    const uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);

    if(!PROPS_HAS_EXCEPTION(props)) {
        /* simple case: uppercase/titlecase characters fold via their lowercase delta */
        if(UCASE_GET_TYPE(props)<UCASE_UPPER) {
            return ~c;
        }
        const UChar32 mapped=c+UCASE_GET_DELTA(props);
        return (mapped==c) ? ~mapped : mapped;
    }

    /* excSlots points just past the exception word, at the first optional slot */
    const uint16_t *excSlots=GET_EXCEPTIONS(&ucase_props_singleton, props);
    const uint16_t excWord=*excSlots++;

    if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
        /* use hardcoded conditions and mappings */
        const UBool turkic=(UBool)((options&_FOLD_CASE_OPTIONS_MASK)!=U_FOLD_CASE_DEFAULT);
        switch(c) {
        case 0x49:
            /*
             * default: 0049; C; 0069; # LATIN CAPITAL LETTER I
             * Turkic:  0049; T; 0131; # LATIN CAPITAL LETTER I
             */
            return turkic ? 0x131 : 0x69;
        case 0x130:
            if(turkic) {
                /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
                return 0x69;
            }
            /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
            *pString=iDot;
            return 2;
        default:
            break;
        }
    } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        /* work on a copy so that excSlots stays at the first slot for the lookup below */
        const uint16_t *strings=excSlots;
        int32_t lengths;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, strings, lengths);

        /* step over the lengths word, then skip the lowercase result string */
        strings+=1+(lengths&UCASE_FULL_LOWER);

        const int32_t foldLength=(lengths>>4)&0xf;
        if(foldLength!=0) {
            /* hand back the result string and its length */
            *pString=reinterpret_cast<const UChar *>(strings);
            return foldLength;
        }
    }

    /* prefer the dedicated folding slot, otherwise reuse the lowercase slot */
    int32_t slot;
    if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
        slot=UCASE_EXC_FOLD;
    } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
        slot=UCASE_EXC_LOWER;
    } else {
        return ~c;
    }

    UChar32 result=c;
    GET_SLOT_VALUE(excWord, slot, excSlots, result);
    return (result==c) ? ~result : result;
}
1299
1300
/* case mapping properties API ---------------------------------------------- */
1301
1302
/* public API (see uchar.h) */
1303
1304
/* TRUE if ucase_getType() classifies c as UCASE_LOWER. */
U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c) {
    const int32_t caseType=ucase_getType(c);
    return (UBool)(caseType==UCASE_LOWER);
}
1308
1309
/* TRUE if ucase_getType() classifies c as UCASE_UPPER. */
U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c) {
    const int32_t caseType=ucase_getType(c);
    return (UBool)(caseType==UCASE_UPPER);
}
1313
1314
/* Transforms the Unicode character to its lower case equivalent.*/
1315
U_CAPI UChar32 U_EXPORT2
1316
0
u_tolower(UChar32 c) {
1317
0
    return ucase_tolower(c);
1318
0
}
1319
    
1320
/* Transforms the Unicode character to its upper case equivalent.*/
1321
U_CAPI UChar32 U_EXPORT2
1322
0
u_toupper(UChar32 c) {
1323
0
    return ucase_toupper(c);
1324
0
}
1325
1326
/* Transforms the Unicode character to its title case equivalent.*/
1327
U_CAPI UChar32 U_EXPORT2
1328
0
u_totitle(UChar32 c) {
1329
0
    return ucase_totitle(c);
1330
0
}
1331
1332
/* return the simple case folding mapping for c */
1333
U_CAPI UChar32 U_EXPORT2
1334
0
u_foldCase(UChar32 c, uint32_t options) {
1335
0
    return ucase_fold(c, options);
1336
0
}
1337
1338
/*
 * Evaluate one of the case-related binary properties for c.
 * Returns a boolean value (as int32_t); FALSE for any property not handled here.
 */
U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c, UProperty which) {
    /* receives full-mapping strings; only the mapping functions' return codes are inspected */
    const UChar *ignoredString;

    switch(which) {
    /* properties derived from the case type */
    case UCHAR_LOWERCASE:
        return (UBool)(ucase_getType(c)==UCASE_LOWER);
    case UCHAR_UPPERCASE:
        return (UBool)(ucase_getType(c)==UCASE_UPPER);
    case UCHAR_CASED:
        return (UBool)(ucase_getType(c)!=UCASE_NONE);

    /* properties with their own lookup functions */
    case UCHAR_SOFT_DOTTED:
        return ucase_isSoftDotted(c);
    case UCHAR_CASE_SENSITIVE:
        return ucase_isCaseSensitive(c);
    case UCHAR_CASE_IGNORABLE:
        return (UBool)(ucase_getTypeOrIgnorable(c)>>2);

    /*
     * Note: The following Changes_When_Xyz are defined as testing whether
     * the NFD form of the input changes when Xyz-case-mapped.
     * However, this simpler implementation of these properties,
     * ignoring NFD, passes the tests.
     * The implementation needs to be changed if the tests start failing.
     * When that happens, optimizations should be used to work with the
     * per-single-code point ucase_toFullXyz() functions unless
     * the NFD form has more than one code point,
     * and the property starts set needs to be the union of the
     * start sets for normalization and case mappings.
     *
     * A non-negative result from ucase_toFullXyz() means that c changes.
     */
    case UCHAR_CHANGES_WHEN_LOWERCASED:
        return (UBool)(ucase_toFullLower(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0);
    case UCHAR_CHANGES_WHEN_UPPERCASED:
        return (UBool)(ucase_toFullUpper(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0);
    case UCHAR_CHANGES_WHEN_TITLECASED:
        return (UBool)(ucase_toFullTitle(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0);
    /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
    case UCHAR_CHANGES_WHEN_CASEMAPPED:
        /* try lower, then upper, then title; stop at the first mapping that changes c */
        if(ucase_toFullLower(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0) {
            return TRUE;
        }
        if(ucase_toFullUpper(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0) {
            return TRUE;
        }
        return (UBool)(ucase_toFullTitle(c, NULL, NULL, &ignoredString, UCASE_LOC_ROOT)>=0);

    default:
        return FALSE;
    }
}