Coverage Report

Created: 2025-06-13 06:35

/src/icu/icu4c/source/common/unames.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 1999-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  unames.c
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 1999oct04
16
*   created by: Markus W. Scherer
17
*/
18
19
#include "unicode/utypes.h"
20
#include "unicode/putil.h"
21
#include "unicode/uchar.h"
22
#include "unicode/udata.h"
23
#include "unicode/utf.h"
24
#include "unicode/utf16.h"
25
#include "uassert.h"
26
#include "ustr_imp.h"
27
#include "umutex.h"
28
#include "cmemory.h"
29
#include "cstring.h"
30
#include "ucln_cmn.h"
31
#include "udataswp.h"
32
#include "uprops.h"
33
34
U_NAMESPACE_BEGIN
35
36
/* prototypes ------------------------------------------------------------- */
37
38
static const char DATA_NAME[] = "unames";
39
static const char DATA_TYPE[] = "icu";
40
41
338M
#define GROUP_SHIFT 5
42
333M
#define LINES_PER_GROUP (1L<<GROUP_SHIFT)
43
257M
#define GROUP_MASK (LINES_PER_GROUP-1)
44
45
/*
46
 * This struct was replaced by explicitly accessing equivalent
47
 * fields from triples of uint16_t.
48
 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
49
 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
50
 * would advance by 6 bytes (3 uint16_t).
51
 *
52
 * We can't just change the data structure because it's loaded from a data file,
53
 * and we don't want to make it less compact, so we changed the access code.
54
 *
55
 * For details see ICU tickets 6331 and 6008.
56
typedef struct {
57
    uint16_t groupMSB,
58
             offsetHigh, offsetLow; / * avoid padding * /
59
} Group;
60
 */
61
enum {
62
    GROUP_MSB,
63
    GROUP_OFFSET_HIGH,
64
    GROUP_OFFSET_LOW,
65
    GROUP_LENGTH
66
};
67
68
/*
69
 * Get the 32-bit group offset.
70
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
71
 * @return group offset (int32_t)
72
 */
73
4.02M
#define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW])
74
75
4.02M
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
76
240
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
77
78
typedef struct {
79
    uint32_t start, end;
80
    uint8_t type, variant;
81
    uint16_t size;
82
} AlgorithmicRange;
83
84
typedef struct {
85
    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
86
} UCharNames;
87
88
/*
89
 * Get the groups table from a UCharNames struct.
90
 * The groups table consists of one uint16_t groupCount followed by
91
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
92
 * and the comment for the old struct Group above.
93
 *
94
 * @param names (const UCharNames *) pointer to the UCharNames indexes
95
 * @return (const uint16_t *) pointer to the groups table
96
 */
97
6.30k
/* The argument and the whole expansion are parenthesized so that the macro is safe with any pointer expression. */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
98
99
typedef struct {
100
    const char *otherName;
101
    UChar32 code;
102
} FindName;
103
104
4.22M
#define DO_FIND_NAME nullptr
105
106
static UDataMemory *uCharNamesData=nullptr;
107
static UCharNames *uCharNames=nullptr;
108
static icu::UInitOnce gCharNamesInitOnce {};
109
110
/*
111
 * Maximum length of character names (regular & 1.0).
112
 */
113
static int32_t gMaxNameLength=0;
114
115
/*
116
 * Set of chars used in character names (regular & 1.0).
117
 * Chars are platform-dependent (can be EBCDIC).
118
 */
119
static uint32_t gNameSet[8]={ 0 };
120
121
10
/*
 * Extra category values used only in this file, numbered directly after the
 * regular categories (U_CHAR_CATEGORY_COUNT). They index the last three
 * entries of charCatNames[].
 * Each expansion is parenthesized so that it can be used safely inside
 * larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
126
127
static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
128
    "unassigned",
129
    "uppercase letter",
130
    "lowercase letter",
131
    "titlecase letter",
132
    "modifier letter",
133
    "other letter",
134
    "non spacing mark",
135
    "enclosing mark",
136
    "combining spacing mark",
137
    "decimal digit number",
138
    "letter number",
139
    "other number",
140
    "space separator",
141
    "line separator",
142
    "paragraph separator",
143
    "control",
144
    "format",
145
    "private use area",
146
    "surrogate",
147
    "dash punctuation",   
148
    "start punctuation",
149
    "end punctuation",
150
    "connector punctuation",
151
    "other punctuation",
152
    "math symbol",
153
    "currency symbol",
154
    "modifier symbol",
155
    "other symbol",
156
    "initial punctuation",
157
    "final punctuation",
158
    "noncharacter",
159
    "lead surrogate",
160
    "trail surrogate"
161
};
162
163
/* implementation ----------------------------------------------------------- */
164
165
/*
 * Cleanup callback (registered in loadCharNames()):
 * closes the names data if it is open and resets the file-level state
 * so that the data can be loaded again.
 * @return always true
 */
static UBool U_CALLCONV unames_cleanup()
{
    if(uCharNamesData != nullptr) {
        udata_close(uCharNamesData);
        uCharNamesData = nullptr;
    }
    /* uCharNames was obtained from uCharNamesData via udata_getMemory() */
    uCharNames = nullptr;

    gCharNamesInitOnce.reset();
    gMaxNameLength = 0;
    return true;
}
178
179
/*
 * udata_openChoice() callback: accept only data whose header is at least
 * 20 bytes, matches this platform's endianness and charset family,
 * has dataFormat "unam" and formatVersion 1.x.
 */
static UBool U_CALLCONV
isAcceptable(void * /*context*/,
             const char * /*type*/, const char * /*name*/,
             const UDataInfo *pInfo) {
    /* dataFormat="unam" */
    static const uint8_t expectedFormat[4] = { 0x75, 0x6e, 0x61, 0x6d };

    /* check the size first so that no later field is read from a short header */
    if(pInfo->size < 20) {
        return false;
    }
    if(pInfo->isBigEndian != U_IS_BIG_ENDIAN) {
        return false;
    }
    if(pInfo->charsetFamily != U_CHARSET_FAMILY) {
        return false;
    }
    for(int32_t i = 0; i < 4; ++i) {
        if(pInfo->dataFormat[i] != expectedFormat[i]) {
            return false;
        }
    }
    return pInfo->formatVersion[0] == 1;
}
193
194
static void U_CALLCONV
195
1
loadCharNames(UErrorCode &status) {
196
1
    U_ASSERT(uCharNamesData == nullptr);
197
1
    U_ASSERT(uCharNames == nullptr);
198
199
1
    uCharNamesData = udata_openChoice(nullptr, DATA_TYPE, DATA_NAME, isAcceptable, nullptr, &status);
200
1
    if(U_FAILURE(status)) {
201
0
        uCharNamesData = nullptr;
202
1
    } else {
203
1
        uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
204
1
    }
205
1
    ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
206
1
}
207
208
209
/*
 * Make sure that loadCharNames() has been run once.
 * @param pErrorCode in/out error code, passed through to umtx_initOnce()
 * @return true if *pErrorCode indicates success afterwards
 */
static UBool
isDataLoaded(UErrorCode *pErrorCode) {
    umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return false;
    }
    return true;
}
214
215
0
/*
 * Append one char to an output buffer with preflighting:
 * c is stored (and buffer/bufferLength are adjusted) only while there is
 * room left, but bufferPos is always incremented so that it ends up as the
 * full output length.
 * c is evaluated at most once, and only if there is room.
 * All arguments are parenthesized so that any expression can be passed.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) UPRV_BLOCK_MACRO_BEGIN { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} UPRV_BLOCK_MACRO_END
222
223
0
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
224
225
/*
226
 * Important: expandName() and compareName() are almost the same -
227
 * apply fixes to both.
228
 *
229
 * UnicodeData.txt uses ';' as a field separator, so no
230
 * field can contain ';' as part of its contents.
231
 * In unames.dat, it is marked as token[';']==-1 only if the
232
 * semicolon is used in the data file - which is iff we
233
 * have Unicode 1.0 names or ISO comments or aliases.
234
 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
235
 * although we know that it will never be part of a name.
236
 */
237
static uint16_t
238
expandName(UCharNames *names,
239
           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
240
0
           char *buffer, uint16_t bufferLength) {
241
0
    uint16_t* tokens = reinterpret_cast<uint16_t*>(names) + 8;
242
0
    uint16_t token, tokenCount=*tokens++, bufferPos=0;
243
0
    uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(names) + names->tokenStringOffset;
244
0
    uint8_t c;
245
246
0
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
247
        /*
248
         * skip the modern name if it is not requested _and_
249
         * if the semicolon byte value is a character, not a token number
250
         */
251
0
        if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
252
0
            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
253
0
            do {
254
0
                while(nameLength>0) {
255
0
                    --nameLength;
256
0
                    if(*name++==';') {
257
0
                        break;
258
0
                    }
259
0
                }
260
0
            } while(--fieldIndex>0);
261
0
        } else {
262
            /*
263
             * the semicolon byte value is a token number, therefore
264
             * only modern names are stored in unames.dat and there is no
265
             * such requested alternate name here
266
             */
267
0
            nameLength=0;
268
0
        }
269
0
    }
270
271
    /* write each letter directly, and write a token word per token */
272
0
    while(nameLength>0) {
273
0
        --nameLength;
274
0
        c=*name++;
275
276
0
        if(c>=tokenCount) {
277
0
            if(c!=';') {
278
                /* implicit letter */
279
0
                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
280
0
            } else {
281
                /* finished */
282
0
                break;
283
0
            }
284
0
        } else {
285
0
            token=tokens[c];
286
0
            if (token == static_cast<uint16_t>(-2)) {
287
                /* this is a lead byte for a double-byte token */
288
0
                token=tokens[c<<8|*name++];
289
0
                --nameLength;
290
0
            }
291
0
            if (token == static_cast<uint16_t>(-1)) {
292
0
                if(c!=';') {
293
                    /* explicit letter */
294
0
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
295
0
                } else {
296
                    /* stop, but skip the semicolon if we are seeking
297
                       extended names and there was no 2.0 name but there
298
                       is a 1.0 name. */
299
0
                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
300
0
                        if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
301
0
                            continue;
302
0
                        }
303
0
                    }
304
                    /* finished */
305
0
                    break;
306
0
                }
307
0
            } else {
308
                /* write token word */
309
0
                uint8_t *tokenString=tokenStrings+token;
310
0
                while((c=*tokenString++)!=0) {
311
0
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
312
0
                }
313
0
            }
314
0
        }
315
0
    }
316
317
    /* zero-terminate */
318
0
    if(bufferLength>0) {
319
0
        *buffer=0;
320
0
    }
321
322
0
    return bufferPos;
323
0
}
324
325
/*
326
 * compareName() is almost the same as expandName() except that it compares
327
 * the currently expanded name to an input name.
328
 * It returns the match/no match result as soon as possible.
329
 */
330
static UBool
331
compareName(UCharNames *names,
332
            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
333
128M
            const char *otherName) {
334
128M
    uint16_t* tokens = reinterpret_cast<uint16_t*>(names) + 8;
335
128M
    uint16_t token, tokenCount=*tokens++;
336
128M
    uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(names) + names->tokenStringOffset;
337
128M
    uint8_t c;
338
128M
    const char *origOtherName = otherName;
339
340
128M
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
341
        /*
342
         * skip the modern name if it is not requested _and_
343
         * if the semicolon byte value is a character, not a token number
344
         */
345
0
        if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
346
0
            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
347
0
            do {
348
0
                while(nameLength>0) {
349
0
                    --nameLength;
350
0
                    if(*name++==';') {
351
0
                        break;
352
0
                    }
353
0
                }
354
0
            } while(--fieldIndex>0);
355
0
        } else {
356
            /*
357
             * the semicolon byte value is a token number, therefore
358
             * only modern names are stored in unames.dat and there is no
359
             * such requested alternate name here
360
             */
361
0
            nameLength=0;
362
0
        }
363
0
    }
364
365
    /* compare each letter directly, and compare a token word per token */
366
128M
    while(nameLength>0) {
367
117M
        --nameLength;
368
117M
        c=*name++;
369
370
117M
        if(c>=tokenCount) {
371
0
            if(c!=';') {
372
                /* implicit letter */
373
0
                if (static_cast<char>(c) != *otherName++) {
374
0
                    return false;
375
0
                }
376
0
            } else {
377
                /* finished */
378
0
                break;
379
0
            }
380
117M
        } else {
381
117M
            token=tokens[c];
382
117M
            if (token == static_cast<uint16_t>(-2)) {
383
                /* this is a lead byte for a double-byte token */
384
22.4M
                token=tokens[c<<8|*name++];
385
22.4M
                --nameLength;
386
22.4M
            }
387
117M
            if (token == static_cast<uint16_t>(-1)) {
388
1.90M
                if(c!=';') {
389
                    /* explicit letter */
390
1.90M
                    if (static_cast<char>(c) != *otherName++) {
391
1.85M
                        return false;
392
1.85M
                    }
393
1.90M
                } else {
394
                    /* stop, but skip the semicolon if we are seeking
395
                       extended names and there was no 2.0 name but there
396
                       is a 1.0 name. */
397
0
                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
398
0
                        if (static_cast<uint8_t>(';') >= tokenCount || tokens[static_cast<uint8_t>(';')] == static_cast<uint16_t>(-1)) {
399
0
                            continue;
400
0
                        }
401
0
                    }
402
                    /* finished */
403
0
                    break;
404
0
                }
405
115M
            } else {
406
                /* write token word */
407
115M
                uint8_t *tokenString=tokenStrings+token;
408
118M
                while((c=*tokenString++)!=0) {
409
118M
                    if (static_cast<char>(c) != *otherName++) {
410
115M
                        return false;
411
115M
                    }
412
118M
                }
413
115M
            }
414
117M
        }
415
117M
    }
416
417
    /* complete match? */
418
11.3M
    return *otherName == 0;
419
128M
}
420
421
374
/*
 * Get the category of cp for extended names: the u_charType() value, except
 * that noncharacters and lead/trail surrogates get the extra values
 * U_NONCHARACTER_CODE_POINT, U_LEAD_SURROGATE and U_TRAIL_SURROGATE.
 */
static uint8_t getCharCat(UChar32 cp) {
    /* noncharacters are checked first, before looking at the general category */
    if (U_IS_UNICODE_NONCHAR(cp)) {
        return U_NONCHARACTER_CODE_POINT;
    }

    const uint8_t category = u_charType(cp);
    if (category != U_SURROGATE) {
        return category;
    }

    if (U_IS_LEAD(cp)) {
        return U_LEAD_SURROGATE;
    }
    return U_TRAIL_SURROGATE;
}
434
435
0
/*
 * Get the charCatNames[] string for the extended category of cp,
 * see getCharCat().
 */
static const char *getCharCatName(UChar32 cp) {
    const uint8_t cat = getCharCat(cp);

    /* Return unknown if the table of names above is not up to
       date. */
    return cat < UPRV_LENGTHOF(charCatNames) ? charCatNames[cat] : "unknown";
}
447
448
0
static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
449
0
    const char *catname = getCharCatName(code);
450
0
    uint16_t length = 0;
451
452
0
    UChar32 cp;
453
0
    int ndigits, i;
454
    
455
0
    WRITE_CHAR(buffer, bufferLength, length, '<');
456
0
    while (catname[length - 1]) {
457
0
        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
458
0
    }
459
0
    WRITE_CHAR(buffer, bufferLength, length, '-');
460
0
    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
461
0
        ;
462
0
    if (ndigits < 4)
463
0
        ndigits = 4;
464
0
    for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
465
0
        uint8_t v = static_cast<uint8_t>(cp & 0xf);
466
0
        buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
467
0
    }
468
0
    buffer += ndigits;
469
0
    length += static_cast<uint16_t>(ndigits);
470
0
    WRITE_CHAR(buffer, bufferLength, length, '>');
471
472
0
    return length;
473
0
}
474
475
/*
476
 * getGroup() does a binary search for the group that contains the
477
 * Unicode code point "code".
478
 * The return value is always a valid Group* that may contain "code"
479
 * or else is the highest group before "code".
480
 * If the lowest group is after "code", then that one is returned.
481
 */
482
static const uint16_t *
483
3.15k
getGroup(UCharNames *names, uint32_t code) {
484
3.15k
    const uint16_t *groups=GET_GROUPS(names);
485
3.15k
    uint16_t groupMSB = static_cast<uint16_t>(code >> GROUP_SHIFT),
486
3.15k
             start=0,
487
3.15k
             limit=*groups++,
488
3.15k
             number;
489
490
    /* binary search for the group of names that contains the one for code */
491
34.6k
    while(start<limit-1) {
492
31.5k
        number = static_cast<uint16_t>((start + limit) / 2);
493
31.5k
        if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
494
31.5k
            limit=number;
495
31.5k
        } else {
496
0
            start=number;
497
0
        }
498
31.5k
    }
499
500
    /* return this regardless of whether it is an exact match */
501
3.15k
    return groups+start*GROUP_LENGTH;
502
3.15k
}
503
504
/*
505
 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
506
 * expands them into offsets and lengths for each string.
507
 * Lengths are stored with a variable-width encoding in consecutive nibbles:
508
 * If a nibble<0xc, then it is the length itself (0=empty string).
509
 * If a nibble>=0xc, then it forms a length value with the following nibble.
510
 * Calculation see below.
511
 * The offsets and lengths arrays must be at least 33 (one more) long because
512
 * there is no check here at the end if the last nibble is still used.
513
 */
514
static const uint8_t *
515
expandGroupLengths(const uint8_t *s,
516
4.02M
                   uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
517
    /* read the lengths of the 32 strings in this group and get each string's offset */
518
4.02M
    uint16_t i=0, offset=0, length=0;
519
4.02M
    uint8_t lengthByte;
520
521
    /* all 32 lengths must be read to get the offset of the first group string */
522
72.0M
    while(i<LINES_PER_GROUP) {
523
68.0M
        lengthByte=*s++;
524
525
        /* read even nibble - MSBs of lengthByte */
526
68.0M
        if(length>=12) {
527
            /* double-nibble length spread across two bytes */
528
2.73M
            length = static_cast<uint16_t>(((length & 0x3) << 4 | lengthByte >> 4) + 12);
529
2.73M
            lengthByte&=0xf;
530
65.3M
        } else if((lengthByte /* &0xf0 */)>=0xc0) {
531
            /* double-nibble length spread across this one byte */
532
3.76M
            length = static_cast<uint16_t>((lengthByte & 0x3f) + 12);
533
61.5M
        } else {
534
            /* single-nibble length in MSBs */
535
61.5M
            length = static_cast<uint16_t>(lengthByte >> 4);
536
61.5M
            lengthByte&=0xf;
537
61.5M
        }
538
539
68.0M
        *offsets++=offset;
540
68.0M
        *lengths++=length;
541
542
68.0M
        offset+=length;
543
68.0M
        ++i;
544
545
        /* read odd nibble - LSBs of lengthByte */
546
68.0M
        if((lengthByte&0xf0)==0) {
547
            /* this nibble was not consumed for a double-nibble length above */
548
64.2M
            length=lengthByte;
549
64.2M
            if(length<12) {
550
                /* single-nibble length in LSBs */
551
61.5M
                *offsets++=offset;
552
61.5M
                *lengths++=length;
553
554
61.5M
                offset+=length;
555
61.5M
                ++i;
556
61.5M
            }
557
64.2M
        } else {
558
3.76M
            length=0;   /* prevent double-nibble detection in the next iteration */
559
3.76M
        }
560
68.0M
    }
561
562
    /* now, s is at the first group string */
563
4.02M
    return s;
564
4.02M
}
565
566
static uint16_t
567
expandGroupName(UCharNames *names, const uint16_t *group,
568
                uint16_t lineNumber, UCharNameChoice nameChoice,
569
0
                char *buffer, uint16_t bufferLength) {
570
0
    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
571
0
    const uint8_t* s = reinterpret_cast<uint8_t*>(names) + names->groupStringOffset + GET_GROUP_OFFSET(group);
572
0
    s=expandGroupLengths(s, offsets, lengths);
573
0
    return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
574
0
                      buffer, bufferLength);
575
0
}
576
577
static uint16_t
578
getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
579
0
        char *buffer, uint16_t bufferLength) {
580
0
    const uint16_t *group=getGroup(names, code);
581
0
    if (static_cast<uint16_t>(code >> GROUP_SHIFT) == group[GROUP_MSB]) {
582
0
        return expandGroupName(names, group, static_cast<uint16_t>(code & GROUP_MASK), nameChoice,
583
0
                               buffer, bufferLength);
584
0
    } else {
585
        /* group not found */
586
        /* zero-terminate */
587
0
        if(bufferLength>0) {
588
0
            *buffer=0;
589
0
        }
590
0
        return 0;
591
0
    }
592
0
}
593
594
/*
595
 * enumGroupNames() enumerates all the names in a 32-group
596
 * and either calls the enumerator function or finds a given input name.
597
 */
598
static UBool
599
enumGroupNames(UCharNames *names, const uint16_t *group,
600
               UChar32 start, UChar32 end,
601
               UEnumCharNamesFn *fn, void *context,
602
4.02M
               UCharNameChoice nameChoice) {
603
4.02M
    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
604
4.02M
    const uint8_t* s = reinterpret_cast<uint8_t*>(names) + names->groupStringOffset + GET_GROUP_OFFSET(group);
605
606
4.02M
    s=expandGroupLengths(s, offsets, lengths);
607
4.02M
    if(fn!=DO_FIND_NAME) {
608
0
        char buffer[200];
609
0
        uint16_t length;
610
611
0
        while(start<=end) {
612
0
            length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
613
0
            if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
614
0
                buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
615
0
            }
616
            /* here, we assume that the buffer is large enough */
617
0
            if(length>0) {
618
0
                if(!fn(context, start, nameChoice, buffer, length)) {
619
0
                    return false;
620
0
                }
621
0
            }
622
0
            ++start;
623
0
        }
624
4.02M
    } else {
625
4.02M
        const char* otherName = static_cast<FindName*>(context)->otherName;
626
132M
        while(start<=end) {
627
128M
            if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
628
2.91k
                static_cast<FindName*>(context)->code = start;
629
2.91k
                return false;
630
2.91k
            }
631
128M
            ++start;
632
128M
        }
633
4.02M
    }
634
4.02M
    return true;
635
4.02M
}
636
637
/*
638
 * enumExtNames enumerate extended names.
639
 * It only needs to do it if it is called with a real function and not
640
 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
641
 * for extended names by itself.
642
 */ 
643
static UBool
644
enumExtNames(UChar32 start, UChar32 end,
645
             UEnumCharNamesFn *fn, void *context)
646
198k
{
647
198k
    if(fn!=DO_FIND_NAME) {
648
0
        char buffer[200];
649
0
        uint16_t length;
650
        
651
0
        while(start<=end) {
652
0
            buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
653
            /* here, we assume that the buffer is large enough */
654
0
            if(length>0) {
655
0
                if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
656
0
                    return false;
657
0
                }
658
0
            }
659
0
            ++start;
660
0
        }
661
0
    }
662
663
198k
    return true;
664
198k
}
665
666
/*
 * Enumerate the names for the code points start..limit-1.
 * Stored names are enumerated group by group with enumGroupNames().
 * For nameChoice==U_EXTENDED_CHAR_NAME, the ranges before, between and
 * after the groups are enumerated with enumExtNames().
 *
 * @return false as soon as enumGroupNames() or enumExtNames() returns
 *         false, otherwise true
 */
static UBool
enumNames(UCharNames *names,
          UChar32 start, UChar32 limit,
          UEnumCharNamesFn *fn, void *context,
          UCharNameChoice nameChoice) {
    const bool extended = nameChoice == U_EXTENDED_CHAR_NAME;

    /* group numbers of the first and last code points; firstMSB stays fixed even when start moves */
    const uint16_t firstMSB = static_cast<uint16_t>(start >> GROUP_SHIFT);
    const uint16_t lastMSB = static_cast<uint16_t>((limit - 1) >> GROUP_SHIFT);

    /* find the group that contains start, or the highest before it */
    const uint16_t *group = getGroup(names, start);

    if(extended && firstMSB < group[GROUP_MSB]) {
        /* enumerate synthetic names between start and the group start */
        UChar32 gapLimit = static_cast<UChar32>(group[GROUP_MSB]) << GROUP_SHIFT;
        if(gapLimit > limit) {
            gapLimit = limit;
        }
        if(!enumExtNames(start, gapLimit - 1, fn, context)) {
            return false;
        }
        start = gapLimit;
    }

    if(firstMSB == lastMSB) {
        if(firstMSB == group[GROUP_MSB]) {
            /* if start and limit-1 are in the same group, then enumerate only in that one */
            return enumGroupNames(names, group, start, limit - 1, fn, context, nameChoice);
        }
    } else {
        const uint16_t *groups = GET_GROUPS(names);
        const uint16_t groupCount = *groups++;
        const uint16_t *const groupLimit = groups + groupCount * GROUP_LENGTH;

        if(firstMSB == group[GROUP_MSB]) {
            /* enumerate characters in the partial start group */
            if((start & GROUP_MASK) != 0) {
                const UChar32 groupEnd = (static_cast<UChar32>(firstMSB) << GROUP_SHIFT) + LINES_PER_GROUP - 1;
                if(!enumGroupNames(names, group, start, groupEnd, fn, context, nameChoice)) {
                    return false;
                }
                group = NEXT_GROUP(group); /* continue with the next group */
            }
        } else if(firstMSB > group[GROUP_MSB]) {
            /* make sure that we start enumerating with the first group after start */
            const uint16_t *following = NEXT_GROUP(group);
            if(extended && following < groupLimit && following[GROUP_MSB] > firstMSB) {
                /* synthetic names between start and the following group */
                UChar32 gapLimit = following[GROUP_MSB] << GROUP_SHIFT;
                if(gapLimit > limit) {
                    gapLimit = limit;
                }
                if(!enumExtNames(start, gapLimit - 1, fn, context)) {
                    return false;
                }
            }
            group = following;
        }

        /* enumerate entire groups between the start- and end-groups */
        while(group < groupLimit && group[GROUP_MSB] < lastMSB) {
            start = static_cast<UChar32>(group[GROUP_MSB]) << GROUP_SHIFT;
            if(!enumGroupNames(names, group, start, start + LINES_PER_GROUP - 1, fn, context, nameChoice)) {
                return false;
            }

            const uint16_t *following = NEXT_GROUP(group);
            if(extended && following < groupLimit && following[GROUP_MSB] > group[GROUP_MSB] + 1) {
                /* synthetic names in the gap between this group and the following one */
                UChar32 gapLimit = following[GROUP_MSB] << GROUP_SHIFT;
                if(gapLimit > limit) {
                    gapLimit = limit;
                }
                if(!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, gapLimit - 1, fn, context)) {
                    return false;
                }
            }
            group = following;
        }

        /* enumerate within the end group (group[GROUP_MSB]==lastMSB) */
        if(group < groupLimit && group[GROUP_MSB] == lastMSB) {
            return enumGroupNames(names, group, (limit - 1) & ~GROUP_MASK, limit - 1, fn, context, nameChoice);
        }
        if(!(extended && group == groupLimit)) {
            return true;
        }

        /* we ran off the end of the groups: continue after the last group */
        const UChar32 afterLastGroup = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
        if(afterLastGroup > start) {
            start = afterLastGroup;
        }
    }

    /* we have not found a group, which means everything is made of
       extended names. */
    if(!extended) {
        return true;
    }
    if(limit > UCHAR_MAX_VALUE + 1) {
        limit = UCHAR_MAX_VALUE + 1;
    }
    return enumExtNames(start, limit - 1, fn, context);
}
771
772
static uint16_t
773
writeFactorSuffix(const uint16_t *factors, uint16_t count,
774
                  const char *s, /* suffix elements */
775
                  uint32_t code,
776
                  uint16_t indexes[8], /* output fields from here */
777
                  const char *elementBases[8], const char *elements[8],
778
0
                  char *buffer, uint16_t bufferLength) {
779
0
    uint16_t i, factor, bufferPos=0;
780
0
    char c;
781
782
    /* write elements according to the factors */
783
784
    /*
785
     * the factorized elements are determined by modulo arithmetic
786
     * with the factors of this algorithm
787
     *
788
     * note that for fewer operations, count is decremented here
789
     */
790
0
    --count;
791
0
    for(i=count; i>0; --i) {
792
0
        factor=factors[i];
793
0
        indexes[i] = static_cast<uint16_t>(code % factor);
794
0
        code/=factor;
795
0
    }
796
    /*
797
     * we don't need to calculate the last modulus because start<=code<=end
798
     * guarantees here that code<=factors[0]
799
     */
800
0
    indexes[0] = static_cast<uint16_t>(code);
801
802
    /* write each element */
803
0
    for(;;) {
804
0
        if(elementBases!=nullptr) {
805
0
            *elementBases++=s;
806
0
        }
807
808
        /* skip indexes[i] strings */
809
0
        factor=indexes[i];
810
0
        while(factor>0) {
811
0
            while(*s++!=0) {}
812
0
            --factor;
813
0
        }
814
0
        if(elements!=nullptr) {
815
0
            *elements++=s;
816
0
        }
817
818
        /* write element */
819
0
        while((c=*s++)!=0) {
820
0
            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
821
0
        }
822
823
        /* we do not need to perform the rest of this loop for i==count - break here */
824
0
        if(i>=count) {
825
0
            break;
826
0
        }
827
828
        /* skip the rest of the strings for this factors[i] */
829
0
        factor = static_cast<uint16_t>(factors[i] - indexes[i] - 1);
830
0
        while(factor>0) {
831
0
            while(*s++!=0) {}
832
0
            --factor;
833
0
        }
834
835
0
        ++i;
836
0
    }
837
838
    /* zero-terminate */
839
0
    if(bufferLength>0) {
840
0
        *buffer=0;
841
0
    }
842
843
0
    return bufferPos;
844
0
}
845
846
/*
847
 * Important:
848
 * Parts of findAlgName() are almost the same as some of getAlgName().
849
 * Fixes must be applied to both.
850
 */
851
static uint16_t
852
getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
853
0
        char *buffer, uint16_t bufferLength) {
854
0
    uint16_t bufferPos=0;
855
856
    /* Only the normative character name can be algorithmic. */
857
0
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
858
        /* zero-terminate */
859
0
        if(bufferLength>0) {
860
0
            *buffer=0;
861
0
        }
862
0
        return 0;
863
0
    }
864
865
0
    switch(range->type) {
866
0
    case 0: {
867
        /* name = prefix hex-digits */
868
0
        const char* s = reinterpret_cast<const char*>(range + 1);
869
0
        char c;
870
871
0
        uint16_t i, count;
872
873
        /* copy prefix */
874
0
        while((c=*s++)!=0) {
875
0
            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
876
0
        }
877
878
        /* write hexadecimal code point value */
879
0
        count=range->variant;
880
881
        /* zero-terminate */
882
0
        if(count<bufferLength) {
883
0
            buffer[count]=0;
884
0
        }
885
886
0
        for(i=count; i>0;) {
887
0
            if(--i<bufferLength) {
888
0
                c = static_cast<char>(code & 0xf);
889
0
                if(c<10) {
890
0
                    c+='0';
891
0
                } else {
892
0
                    c+='A'-10;
893
0
                }
894
0
                buffer[i]=c;
895
0
            }
896
0
            code>>=4;
897
0
        }
898
899
0
        bufferPos+=count;
900
0
        break;
901
0
    }
902
0
    case 1: {
903
        /* name = prefix factorized-elements */
904
0
        uint16_t indexes[8];
905
0
        const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
906
0
        uint16_t count=range->variant;
907
0
        const char* s = reinterpret_cast<const char*>(factors + count);
908
0
        char c;
909
910
        /* copy prefix */
911
0
        while((c=*s++)!=0) {
912
0
            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
913
0
        }
914
915
0
        bufferPos+=writeFactorSuffix(factors, count,
916
0
                                     s, code-range->start, indexes, nullptr, nullptr, buffer, bufferLength);
917
0
        break;
918
0
    }
919
0
    default:
920
        /* undefined type */
921
        /* zero-terminate */
922
0
        if(bufferLength>0) {
923
0
            *buffer=0;
924
0
        }
925
0
        break;
926
0
    }
927
928
0
    return bufferPos;
929
0
}
930
931
/*
932
 * Important: enumAlgNames() and findAlgName() are almost the same.
933
 * Any fix must be applied to both.
934
 */
935
static UBool
936
enumAlgNames(AlgorithmicRange *range,
937
             UChar32 start, UChar32 limit,
938
             UEnumCharNamesFn *fn, void *context,
939
0
             UCharNameChoice nameChoice) {
940
0
    char buffer[200];
941
0
    uint16_t length;
942
943
0
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
944
0
        return true;
945
0
    }
946
947
0
    switch(range->type) {
948
0
    case 0: {
949
0
        char *s, *end;
950
0
        char c;
951
952
        /* get the full name of the start character */
953
0
        length = getAlgName(range, static_cast<uint32_t>(start), nameChoice, buffer, sizeof(buffer));
954
0
        if(length<=0) {
955
0
            return true;
956
0
        }
957
958
        /* call the enumerator function with this first character */
959
0
        if(!fn(context, start, nameChoice, buffer, length)) {
960
0
            return false;
961
0
        }
962
963
        /* go to the end of the name; all these names have the same length */
964
0
        end=buffer;
965
0
        while(*end!=0) {
966
0
            ++end;
967
0
        }
968
969
        /* enumerate the rest of the names */
970
0
        while(++start<limit) {
971
            /* increment the hexadecimal number on a character-basis */
972
0
            s=end;
973
0
            for (;;) {
974
0
                c=*--s;
975
0
                if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
976
0
                    *s = static_cast<char>(c + 1);
977
0
                    break;
978
0
                } else if(c=='9') {
979
0
                    *s='A';
980
0
                    break;
981
0
                } else if(c=='F') {
982
0
                    *s='0';
983
0
                }
984
0
            }
985
986
0
            if(!fn(context, start, nameChoice, buffer, length)) {
987
0
                return false;
988
0
            }
989
0
        }
990
0
        break;
991
0
    }
992
0
    case 1: {
993
0
        uint16_t indexes[8];
994
0
        const char *elementBases[8], *elements[8];
995
0
        const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
996
0
        uint16_t count=range->variant;
997
0
        const char* s = reinterpret_cast<const char*>(factors + count);
998
0
        char *suffix, *t;
999
0
        uint16_t prefixLength, i, idx;
1000
1001
0
        char c;
1002
1003
        /* name = prefix factorized-elements */
1004
1005
        /* copy prefix */
1006
0
        suffix=buffer;
1007
0
        prefixLength=0;
1008
0
        while((c=*s++)!=0) {
1009
0
            *suffix++=c;
1010
0
            ++prefixLength;
1011
0
        }
1012
1013
        /* append the suffix of the start character */
1014
0
        length = static_cast<uint16_t>(prefixLength + writeFactorSuffix(factors, count,
1015
0
                                              s, static_cast<uint32_t>(start) - range->start,
1016
0
                                              indexes, elementBases, elements,
1017
0
                                              suffix, static_cast<uint16_t>(sizeof(buffer) - prefixLength)));
1018
1019
        /* call the enumerator function with this first character */
1020
0
        if(!fn(context, start, nameChoice, buffer, length)) {
1021
0
            return false;
1022
0
        }
1023
1024
        /* enumerate the rest of the names */
1025
0
        while(++start<limit) {
1026
            /* increment the indexes in lexical order bound by the factors */
1027
0
            i=count;
1028
0
            for (;;) {
1029
0
                idx = static_cast<uint16_t>(indexes[--i] + 1);
1030
0
                if(idx<factors[i]) {
1031
                    /* skip one index and its element string */
1032
0
                    indexes[i]=idx;
1033
0
                    s=elements[i];
1034
0
                    while(*s++!=0) {
1035
0
                    }
1036
0
                    elements[i]=s;
1037
0
                    break;
1038
0
                } else {
1039
                    /* reset this index to 0 and its element string to the first one */
1040
0
                    indexes[i]=0;
1041
0
                    elements[i]=elementBases[i];
1042
0
                }
1043
0
            }
1044
1045
            /* to make matters a little easier, just append all elements to the suffix */
1046
0
            t=suffix;
1047
0
            length=prefixLength;
1048
0
            for(i=0; i<count; ++i) {
1049
0
                s=elements[i];
1050
0
                while((c=*s++)!=0) {
1051
0
                    *t++=c;
1052
0
                    ++length;
1053
0
                }
1054
0
            }
1055
            /* zero-terminate */
1056
0
            *t=0;
1057
1058
0
            if(!fn(context, start, nameChoice, buffer, length)) {
1059
0
                return false;
1060
0
            }
1061
0
        }
1062
0
        break;
1063
0
    }
1064
0
    default:
1065
        /* undefined type */
1066
0
        break;
1067
0
    }
1068
1069
0
    return true;
1070
0
}
1071
1072
/*
1073
 * findAlgName() is almost the same as enumAlgNames() except that it
1074
 * returns the code point for a name if it fits into the range.
1075
 * It returns 0xffff otherwise.
1076
 */
1077
static UChar32
1078
41.0k
findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1079
41.0k
    UChar32 code;
1080
1081
41.0k
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1082
0
        return 0xffff;
1083
0
    }
1084
1085
41.0k
    switch(range->type) {
1086
37.8k
    case 0: {
1087
        /* name = prefix hex-digits */
1088
37.8k
        const char* s = reinterpret_cast<const char*>(range + 1);
1089
37.8k
        char c;
1090
1091
37.8k
        uint16_t i, count;
1092
1093
        /* compare prefix */
1094
38.1k
        while((c=*s++)!=0) {
1095
38.1k
            if (c != *otherName++) {
1096
37.8k
                return 0xffff;
1097
37.8k
            }
1098
38.1k
        }
1099
1100
        /* read hexadecimal code point value */
1101
0
        count=range->variant;
1102
0
        code=0;
1103
0
        for(i=0; i<count; ++i) {
1104
0
            c=*otherName++;
1105
0
            if('0'<=c && c<='9') {
1106
0
                code=(code<<4)|(c-'0');
1107
0
            } else if('A'<=c && c<='F') {
1108
0
                code=(code<<4)|(c-'A'+10);
1109
0
            } else {
1110
0
                return 0xffff;
1111
0
            }
1112
0
        }
1113
1114
        /* does it fit into the range? */
1115
0
        if (*otherName == 0 && range->start <= static_cast<uint32_t>(code) && static_cast<uint32_t>(code) <= range->end) {
1116
0
            return code;
1117
0
        }
1118
0
        break;
1119
0
    }
1120
3.15k
    case 1: {
1121
3.15k
        char buffer[64];
1122
3.15k
        uint16_t indexes[8];
1123
3.15k
        const char *elementBases[8], *elements[8];
1124
3.15k
        const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
1125
3.15k
        uint16_t count=range->variant;
1126
3.15k
        const char *s = reinterpret_cast<const char*>(factors + count), *t;
1127
3.15k
        UChar32 start, limit;
1128
3.15k
        uint16_t i, idx;
1129
1130
3.15k
        char c;
1131
1132
        /* name = prefix factorized-elements */
1133
1134
        /* compare prefix */
1135
3.16k
        while((c=*s++)!=0) {
1136
3.16k
            if (c != *otherName++) {
1137
3.15k
                return 0xffff;
1138
3.15k
            }
1139
3.16k
        }
1140
1141
0
        start = static_cast<UChar32>(range->start);
1142
0
        limit = static_cast<UChar32>(range->end + 1);
1143
1144
        /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1145
0
        writeFactorSuffix(factors, count, s, 0,
1146
0
                          indexes, elementBases, elements, buffer, sizeof(buffer));
1147
1148
        /* compare the first suffix */
1149
0
        if(0==uprv_strcmp(otherName, buffer)) {
1150
0
            return start;
1151
0
        }
1152
1153
        /* enumerate and compare the rest of the suffixes */
1154
0
        while(++start<limit) {
1155
            /* increment the indexes in lexical order bound by the factors */
1156
0
            i=count;
1157
0
            for (;;) {
1158
0
                idx = static_cast<uint16_t>(indexes[--i] + 1);
1159
0
                if(idx<factors[i]) {
1160
                    /* skip one index and its element string */
1161
0
                    indexes[i]=idx;
1162
0
                    s=elements[i];
1163
0
                    while(*s++!=0) {}
1164
0
                    elements[i]=s;
1165
0
                    break;
1166
0
                } else {
1167
                    /* reset this index to 0 and its element string to the first one */
1168
0
                    indexes[i]=0;
1169
0
                    elements[i]=elementBases[i];
1170
0
                }
1171
0
            }
1172
1173
            /* to make matters a little easier, just compare all elements of the suffix */
1174
0
            t=otherName;
1175
0
            for(i=0; i<count; ++i) {
1176
0
                s=elements[i];
1177
0
                while((c=*s++)!=0) {
1178
0
                    if(c!=*t++) {
1179
0
                        s=""; /* does not match */
1180
0
                        i=99;
1181
0
                    }
1182
0
                }
1183
0
            }
1184
0
            if(i<99 && *t==0) {
1185
0
                return start;
1186
0
            }
1187
0
        }
1188
0
        break;
1189
0
    }
1190
0
    default:
1191
        /* undefined type */
1192
0
        break;
1193
41.0k
    }
1194
1195
0
    return 0xffff;
1196
41.0k
}
1197
1198
/* sets of name characters, maximum name lengths ---------------------------- */
1199
/*
 * Bit-set helpers for a set of 256 byte values stored as uint32_t[8]:
 * word index = byte value / 32, bit index = byte value % 32.
 * The argument c is converted to uint8_t first; it is fully parenthesized so
 * that expression arguments (e.g. a+b, cond ? x : y) are converted as a whole
 * and can never produce a word index outside 0..7.
 * Note that c is evaluated twice - do not pass expressions with side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1202
1203
static int32_t
1204
0
calcStringSetLength(uint32_t set[8], const char *s) {
1205
0
    int32_t length=0;
1206
0
    char c;
1207
1208
0
    while((c=*s++)!=0) {
1209
0
        SET_ADD(set, c);
1210
0
        ++length;
1211
0
    }
1212
0
    return length;
1213
0
}
1214
1215
static int32_t
1216
0
calcAlgNameSetsLengths(int32_t maxNameLength) {
1217
0
    AlgorithmicRange *range;
1218
0
    uint32_t *p;
1219
0
    uint32_t rangeCount;
1220
0
    int32_t length;
1221
1222
    /* enumerate algorithmic ranges */
1223
0
    p = reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(uCharNames) + uCharNames->algNamesOffset);
1224
0
    rangeCount=*p;
1225
0
    range = reinterpret_cast<AlgorithmicRange*>(p + 1);
1226
0
    while(rangeCount>0) {
1227
0
        switch(range->type) {
1228
0
        case 0:
1229
            /* name = prefix + (range->variant times) hex-digits */
1230
            /* prefix */
1231
0
            length = calcStringSetLength(gNameSet, reinterpret_cast<const char*>(range + 1)) + range->variant;
1232
0
            if(length>maxNameLength) {
1233
0
                maxNameLength=length;
1234
0
            }
1235
0
            break;
1236
0
        case 1: {
1237
            /* name = prefix factorized-elements */
1238
0
            const uint16_t* factors = reinterpret_cast<const uint16_t*>(range + 1);
1239
0
            const char *s;
1240
0
            int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1241
1242
            /* prefix length */
1243
0
            s = reinterpret_cast<const char*>(factors + count);
1244
0
            length=calcStringSetLength(gNameSet, s);
1245
0
            s+=length+1; /* start of factor suffixes */
1246
1247
            /* get the set and maximum factor suffix length for each factor */
1248
0
            for(i=0; i<count; ++i) {
1249
0
                maxFactorLength=0;
1250
0
                for(factor=factors[i]; factor>0; --factor) {
1251
0
                    factorLength=calcStringSetLength(gNameSet, s);
1252
0
                    s+=factorLength+1;
1253
0
                    if(factorLength>maxFactorLength) {
1254
0
                        maxFactorLength=factorLength;
1255
0
                    }
1256
0
                }
1257
0
                length+=maxFactorLength;
1258
0
            }
1259
1260
0
            if(length>maxNameLength) {
1261
0
                maxNameLength=length;
1262
0
            }
1263
0
            break;
1264
0
        }
1265
0
        default:
1266
            /* unknown type */
1267
0
            break;
1268
0
        }
1269
1270
0
        range = reinterpret_cast<AlgorithmicRange*>(reinterpret_cast<uint8_t*>(range) + range->size);
1271
0
        --rangeCount;
1272
0
    }
1273
0
    return maxNameLength;
1274
0
}
1275
1276
static int32_t
1277
0
calcExtNameSetsLengths(int32_t maxNameLength) {
1278
0
    int32_t i, length;
1279
1280
0
    for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) {
1281
        /*
1282
         * for each category, count the length of the category name
1283
         * plus 9=
1284
         * 2 for <>
1285
         * 1 for -
1286
         * 6 for most hex digits per code point
1287
         */
1288
0
        length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1289
0
        if(length>maxNameLength) {
1290
0
            maxNameLength=length;
1291
0
        }
1292
0
    }
1293
0
    return maxNameLength;
1294
0
}
1295
1296
static int32_t
1297
calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
1298
                  uint32_t set[8],
1299
0
                  const uint8_t **pLine, const uint8_t *lineLimit) {
1300
0
    const uint8_t *line=*pLine;
1301
0
    int32_t length=0, tokenLength;
1302
0
    uint16_t c, token;
1303
1304
0
    while (line != lineLimit && (c = *line++) != static_cast<uint8_t>(';')) {
1305
0
        if(c>=tokenCount) {
1306
            /* implicit letter */
1307
0
            SET_ADD(set, c);
1308
0
            ++length;
1309
0
        } else {
1310
0
            token=tokens[c];
1311
0
            if (token == static_cast<uint16_t>(-2)) {
1312
                /* this is a lead byte for a double-byte token */
1313
0
                c=c<<8|*line++;
1314
0
                token=tokens[c];
1315
0
            }
1316
0
            if (token == static_cast<uint16_t>(-1)) {
1317
                /* explicit letter */
1318
0
                SET_ADD(set, c);
1319
0
                ++length;
1320
0
            } else {
1321
                /* count token word */
1322
0
                if(tokenLengths!=nullptr) {
1323
                    /* use cached token length */
1324
0
                    tokenLength=tokenLengths[c];
1325
0
                    if(tokenLength==0) {
1326
0
                        tokenLength = calcStringSetLength(set, reinterpret_cast<const char*>(tokenStrings) + token);
1327
0
                        tokenLengths[c] = static_cast<int8_t>(tokenLength);
1328
0
                    }
1329
0
                } else {
1330
0
                    tokenLength = calcStringSetLength(set, reinterpret_cast<const char*>(tokenStrings) + token);
1331
0
                }
1332
0
                length+=tokenLength;
1333
0
            }
1334
0
        }
1335
0
    }
1336
1337
0
    *pLine=line;
1338
0
    return length;
1339
0
}
1340
1341
static void
1342
0
calcGroupNameSetsLengths(int32_t maxNameLength) {
1343
0
    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1344
1345
0
    uint16_t* tokens = reinterpret_cast<uint16_t*>(uCharNames) + 8;
1346
0
    uint16_t tokenCount=*tokens++;
1347
0
    uint8_t* tokenStrings = reinterpret_cast<uint8_t*>(uCharNames) + uCharNames->tokenStringOffset;
1348
1349
0
    int8_t *tokenLengths;
1350
1351
0
    const uint16_t *group;
1352
0
    const uint8_t *s, *line, *lineLimit;
1353
1354
0
    int32_t groupCount, lineNumber, length;
1355
1356
0
    tokenLengths = static_cast<int8_t*>(uprv_malloc(tokenCount));
1357
0
    if(tokenLengths!=nullptr) {
1358
0
        uprv_memset(tokenLengths, 0, tokenCount);
1359
0
    }
1360
1361
0
    group=GET_GROUPS(uCharNames);
1362
0
    groupCount=*group++;
1363
1364
    /* enumerate all groups */
1365
0
    while(groupCount>0) {
1366
0
        s = reinterpret_cast<uint8_t*>(uCharNames) + uCharNames->groupStringOffset + GET_GROUP_OFFSET(group);
1367
0
        s=expandGroupLengths(s, offsets, lengths);
1368
1369
        /* enumerate all lines in each group */
1370
0
        for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1371
0
            line=s+offsets[lineNumber];
1372
0
            length=lengths[lineNumber];
1373
0
            if(length==0) {
1374
0
                continue;
1375
0
            }
1376
1377
0
            lineLimit=line+length;
1378
1379
            /* read regular name */
1380
0
            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1381
0
            if(length>maxNameLength) {
1382
0
                maxNameLength=length;
1383
0
            }
1384
0
            if(line==lineLimit) {
1385
0
                continue;
1386
0
            }
1387
1388
            /* read Unicode 1.0 name */
1389
0
            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390
0
            if(length>maxNameLength) {
1391
0
                maxNameLength=length;
1392
0
            }
1393
0
            if(line==lineLimit) {
1394
0
                continue;
1395
0
            }
1396
1397
            /* read ISO comment */
1398
            /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1399
0
        }
1400
1401
0
        group=NEXT_GROUP(group);
1402
0
        --groupCount;
1403
0
    }
1404
1405
0
    if(tokenLengths!=nullptr) {
1406
0
        uprv_free(tokenLengths);
1407
0
    }
1408
1409
    /* set gMax... - name length last for threading */
1410
0
    gMaxNameLength=maxNameLength;
1411
0
}
1412
1413
static UBool
1414
0
calcNameSetsLengths(UErrorCode *pErrorCode) {
1415
0
    static const char extChars[]="0123456789ABCDEF<>-";
1416
0
    int32_t i, maxNameLength;
1417
1418
0
    if(gMaxNameLength!=0) {
1419
0
        return true;
1420
0
    }
1421
1422
0
    if(!isDataLoaded(pErrorCode)) {
1423
0
        return false;
1424
0
    }
1425
1426
    /* set hex digits, used in various names, and <>-, used in extended names */
1427
0
    for (i = 0; i < static_cast<int32_t>(sizeof(extChars)) - 1; ++i) {
1428
0
        SET_ADD(gNameSet, extChars[i]);
1429
0
    }
1430
1431
    /* set sets and lengths from algorithmic names */
1432
0
    maxNameLength=calcAlgNameSetsLengths(0);
1433
1434
    /* set sets and lengths from extended names */
1435
0
    maxNameLength=calcExtNameSetsLengths(maxNameLength);
1436
1437
    /* set sets and lengths from group names, set global maximum values */
1438
0
    calcGroupNameSetsLengths(maxNameLength);
1439
1440
0
    return true;
1441
0
}
1442
1443
U_NAMESPACE_END
1444
1445
/* public API --------------------------------------------------------------- */
1446
1447
U_NAMESPACE_USE
1448
1449
U_CAPI int32_t U_EXPORT2
1450
u_charName(UChar32 code, UCharNameChoice nameChoice,
1451
           char *buffer, int32_t bufferLength,
1452
0
           UErrorCode *pErrorCode) {
1453
0
     AlgorithmicRange *algRange;
1454
0
    uint32_t *p;
1455
0
    uint32_t i;
1456
0
    int32_t length;
1457
1458
    /* check the argument values */
1459
0
    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
1460
0
        return 0;
1461
0
    } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1462
0
              bufferLength<0 || (bufferLength>0 && buffer==nullptr)
1463
0
    ) {
1464
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1465
0
        return 0;
1466
0
    }
1467
1468
0
    if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1469
0
        return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1470
0
    }
1471
1472
0
    length=0;
1473
1474
    /* try algorithmic names first */
1475
0
    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1476
0
    i=*p;
1477
0
    algRange=(AlgorithmicRange *)(p+1);
1478
0
    while(i>0) {
1479
0
        if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1480
0
            length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1481
0
            break;
1482
0
        }
1483
0
        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1484
0
        --i;
1485
0
    }
1486
1487
0
    if(i==0) {
1488
0
        if (nameChoice == U_EXTENDED_CHAR_NAME) {
1489
0
            length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1490
0
            if (!length) {
1491
                /* extended character name */
1492
0
                length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1493
0
            }
1494
0
        } else {
1495
            /* normal character name */
1496
0
            length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1497
0
        }
1498
0
    }
1499
1500
0
    return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1501
0
}
1502
1503
U_CAPI int32_t U_EXPORT2
1504
u_getISOComment(UChar32 /*c*/,
1505
                char *dest, int32_t destCapacity,
1506
0
                UErrorCode *pErrorCode) {
1507
    /* check the argument values */
1508
0
    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
1509
0
        return 0;
1510
0
    } else if(destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
1511
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1512
0
        return 0;
1513
0
    }
1514
1515
0
    return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1516
0
}
1517
1518
U_CAPI UChar32 U_EXPORT2
1519
u_charFromName(UCharNameChoice nameChoice,
1520
               const char *name,
1521
3.67k
               UErrorCode *pErrorCode) {
1522
3.67k
    char upper[120] = {0};
1523
3.67k
    char lower[120] = {0};
1524
3.67k
    FindName findName;
1525
3.67k
    AlgorithmicRange *algRange;
1526
3.67k
    uint32_t *p;
1527
3.67k
    uint32_t i;
1528
3.67k
    UChar32 cp = 0;
1529
3.67k
    char c0;
1530
3.67k
    static constexpr UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
1531
1532
3.67k
    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
1533
0
        return error;
1534
0
    }
1535
1536
3.67k
    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==nullptr || *name==0) {
1537
38
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1538
38
        return error;
1539
38
    }
1540
1541
3.63k
    if(!isDataLoaded(pErrorCode)) {
1542
0
        return error;
1543
0
    }
1544
1545
    /* construct the uppercase and lowercase of the name first */
1546
18.6k
    for(i=0; i<sizeof(upper); ++i) {
1547
18.6k
        if((c0=*name++)!=0) {
1548
15.0k
            upper[i]=uprv_toupper(c0);
1549
15.0k
            lower[i]=uprv_tolower(c0);
1550
15.0k
        } else {
1551
3.62k
            upper[i]=lower[i]=0;
1552
3.62k
            break;
1553
3.62k
        }
1554
18.6k
    }
1555
3.63k
    if(i==sizeof(upper)) {
1556
        /* name too long, there is no such character */
1557
8
        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1558
8
        return error;
1559
8
    }
1560
    // i==strlen(name)==strlen(lower)==strlen(upper)
1561
1562
    /* try extended names first */
1563
3.62k
    if (lower[0] == '<') {
1564
470
        if (nameChoice == U_EXTENDED_CHAR_NAME && lower[--i] == '>') {
1565
            // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
1566
444
            uint32_t limit = i;
1567
1.84k
            while (i >= 3 && lower[--i] != '-') {}
1568
1569
            // There should be 1 to 8 hex digits.
1570
444
            int32_t hexLength = limit - (i + 1);
1571
444
            if (i >= 2 && lower[i] == '-' && 1 <= hexLength && hexLength <= 8) {
1572
398
                uint32_t cIdx;
1573
1574
398
                lower[i] = 0;
1575
1576
1.27k
                for (++i; i < limit; ++i) {
1577
900
                    if (lower[i] >= '0' && lower[i] <= '9') {
1578
286
                        cp = (cp << 4) + lower[i] - '0';
1579
614
                    } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1580
596
                        cp = (cp << 4) + lower[i] - 'a' + 10;
1581
596
                    } else {
1582
18
                        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1583
18
                        return error;
1584
18
                    }
1585
                    // Prevent signed-integer overflow and out-of-range code points.
1586
882
                    if (cp > UCHAR_MAX_VALUE) {
1587
6
                        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1588
6
                        return error;
1589
6
                    }
1590
882
                }
1591
1592
                /* Now validate the category name.
1593
                   We could use a binary search, or a trie, if
1594
                   we really wanted to. */
1595
374
                uint8_t cat = getCharCat(cp);
1596
12.7k
                for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {
1597
1598
12.3k
                    if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1599
0
                        if (cat == cIdx) {
1600
0
                            return cp;
1601
0
                        }
1602
0
                        break;
1603
0
                    }
1604
12.3k
                }
1605
374
            }
1606
444
        }
1607
1608
446
        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1609
446
        return error;
1610
470
    }
1611
1612
    /* try algorithmic names now */
1613
3.15k
    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1614
3.15k
    i=*p;
1615
3.15k
    algRange=(AlgorithmicRange *)(p+1);
1616
44.1k
    while(i>0) {
1617
41.0k
        if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1618
0
            return cp;
1619
0
        }
1620
41.0k
        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1621
41.0k
        --i;
1622
41.0k
    }
1623
1624
    /* normal character name */
1625
3.15k
    findName.otherName=upper;
1626
3.15k
    findName.code=error;
1627
3.15k
    enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1628
3.15k
    if (findName.code == error) {
1629
240
         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1630
240
    }
1631
3.15k
    return findName.code;
1632
3.15k
}
1633
1634
U_CAPI void U_EXPORT2
1635
u_enumCharNames(UChar32 start, UChar32 limit,
1636
                UEnumCharNamesFn *fn,
1637
                void *context,
1638
                UCharNameChoice nameChoice,
1639
0
                UErrorCode *pErrorCode) {
1640
0
    AlgorithmicRange *algRange;
1641
0
    uint32_t *p;
1642
0
    uint32_t i;
1643
1644
0
    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
1645
0
        return;
1646
0
    }
1647
1648
0
    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==nullptr) {
1649
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1650
0
        return;
1651
0
    }
1652
1653
0
    if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1654
0
        limit = UCHAR_MAX_VALUE + 1;
1655
0
    }
1656
0
    if((uint32_t)start>=(uint32_t)limit) {
1657
0
        return;
1658
0
    }
1659
1660
0
    if(!isDataLoaded(pErrorCode)) {
1661
0
        return;
1662
0
    }
1663
1664
    /* interleave the data-driven ones with the algorithmic ones */
1665
    /* iterate over all algorithmic ranges; assume that they are in ascending order */
1666
0
    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1667
0
    i=*p;
1668
0
    algRange=(AlgorithmicRange *)(p+1);
1669
0
    while(i>0) {
1670
        /* enumerate the character names before the current algorithmic range */
1671
        /* here: start<limit */
1672
0
        if((uint32_t)start<algRange->start) {
1673
0
            if((uint32_t)limit<=algRange->start) {
1674
0
                enumNames(uCharNames, start, limit, fn, context, nameChoice);
1675
0
                return;
1676
0
            }
1677
0
            if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1678
0
                return;
1679
0
            }
1680
0
            start=(UChar32)algRange->start;
1681
0
        }
1682
        /* enumerate the character names in the current algorithmic range */
1683
        /* here: algRange->start<=start<limit */
1684
0
        if((uint32_t)start<=algRange->end) {
1685
0
            if((uint32_t)limit<=(algRange->end+1)) {
1686
0
                enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1687
0
                return;
1688
0
            }
1689
0
            if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1690
0
                return;
1691
0
            }
1692
0
            start=(UChar32)algRange->end+1;
1693
0
        }
1694
        /* continue to the next algorithmic range (here: start<limit) */
1695
0
        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1696
0
        --i;
1697
0
    }
1698
    /* enumerate the character names after the last algorithmic range */
1699
0
    enumNames(uCharNames, start, limit, fn, context, nameChoice);
1700
0
}
1701
1702
U_CAPI int32_t U_EXPORT2
1703
0
uprv_getMaxCharNameLength() {
1704
0
    UErrorCode errorCode=U_ZERO_ERROR;
1705
0
    if(calcNameSetsLengths(&errorCode)) {
1706
0
        return gMaxNameLength;
1707
0
    } else {
1708
0
        return 0;
1709
0
    }
1710
0
}
1711
1712
/**
1713
 * Converts the char set cset into a Unicode set uset.
1714
 * @param cset Set of 256 bit flags corresponding to a set of chars.
1715
 * @param uset USet to receive characters. Existing contents are deleted.
1716
 */
1717
static void
1718
0
charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1719
0
    char16_t us[256];
1720
0
    char cs[256];
1721
1722
0
    int32_t i, length;
1723
0
    UErrorCode errorCode;
1724
1725
0
    errorCode=U_ZERO_ERROR;
1726
1727
0
    if(!calcNameSetsLengths(&errorCode)) {
1728
0
        return;
1729
0
    }
1730
1731
    /* build a char string with all chars that are used in character names */
1732
0
    length=0;
1733
0
    for(i=0; i<256; ++i) {
1734
0
        if(SET_CONTAINS(cset, i)) {
1735
0
            cs[length++] = static_cast<char>(i);
1736
0
        }
1737
0
    }
1738
1739
    /* convert the char string to a char16_t string */
1740
0
    u_charsToUChars(cs, us, length);
1741
1742
    /* add each char16_t to the USet */
1743
0
    for(i=0; i<length; ++i) {
1744
0
        if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (char16_t)0 */
1745
0
            sa->add(sa->set, us[i]);
1746
0
        }
1747
0
    }
1748
0
}
1749
1750
/**
1751
 * Fills set with characters that are used in Unicode character names.
1752
 * @param set USet to receive characters.
1753
 */
1754
U_CAPI void U_EXPORT2
1755
0
uprv_getCharNameCharacters(const USetAdder *sa) {
1756
0
    charSetToUSet(gNameSet, sa);
1757
0
}
1758
1759
/* data swapping ------------------------------------------------------------ */
1760
1761
/*
1762
 * The token table contains non-negative entries for token bytes,
1763
 * and -1 for bytes that represent themselves in the data file's charset.
1764
 * -2 entries are used for lead bytes.
1765
 *
1766
 * Direct bytes (-1 entries) must be translated from the input charset family
1767
 * to the output charset family.
1768
 * makeTokenMap() writes a permutation mapping for this.
1769
 * Use it once for single-/lead-byte tokens and once more for all trail byte
1770
 * tokens. (';' is an unused trail byte marked with -1.)
1771
 */
1772
static void
1773
makeTokenMap(const UDataSwapper *ds,
1774
             int16_t tokens[], uint16_t tokenCount,
1775
             uint8_t map[256],
1776
0
             UErrorCode *pErrorCode) {
1777
0
    UBool usedOutChar[256];
1778
0
    uint16_t i, j;
1779
0
    uint8_t c1, c2;
1780
1781
0
    if(U_FAILURE(*pErrorCode)) {
1782
0
        return;
1783
0
    }
1784
1785
0
    if(ds->inCharset==ds->outCharset) {
1786
        /* Same charset family: identity permutation */
1787
0
        for(i=0; i<256; ++i) {
1788
0
            map[i] = static_cast<uint8_t>(i);
1789
0
        }
1790
0
    } else {
1791
0
        uprv_memset(map, 0, 256);
1792
0
        uprv_memset(usedOutChar, 0, 256);
1793
1794
0
        if(tokenCount>256) {
1795
0
            tokenCount=256;
1796
0
        }
1797
1798
        /* set the direct bytes (byte 0 always maps to itself) */
1799
0
        for(i=1; i<tokenCount; ++i) {
1800
0
            if(tokens[i]==-1) {
1801
                /* convert the direct byte character */
1802
0
                c1 = static_cast<uint8_t>(i);
1803
0
                ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1804
0
                if(U_FAILURE(*pErrorCode)) {
1805
0
                    udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1806
0
                                     i, ds->inCharset);
1807
0
                    return;
1808
0
                }
1809
1810
                /* enter the converted character into the map and mark it used */
1811
0
                map[c1]=c2;
1812
0
                usedOutChar[c2]=true;
1813
0
            }
1814
0
        }
1815
1816
        /* set the mappings for the rest of the permutation */
1817
0
        for(i=j=1; i<tokenCount; ++i) {
1818
            /* set mappings that were not set for direct bytes */
1819
0
            if(map[i]==0) {
1820
                /* set an output byte value that was not used as an output byte above */
1821
0
                while(usedOutChar[j]) {
1822
0
                    ++j;
1823
0
                }
1824
0
                map[i] = static_cast<uint8_t>(j++);
1825
0
            }
1826
0
        }
1827
1828
        /*
1829
         * leave mappings at tokenCount and above unset if tokenCount<256
1830
         * because they won't be used
1831
         */
1832
0
    }
1833
0
}
1834
1835
/**
 * Swaps a unames.icu data item (dataFormat "unam", format version 1)
 * between endianness/charset families.
 *
 * With length<0 the function only preflights: it walks the algorithmic
 * ranges to find the end of the data and writes nothing.
 * Otherwise it swaps, in this order: the four 32-bit offsets, the token
 * table (permuted via makeTokenMap()), the token strings, the group table,
 * the group strings (only if the charset families differ) and the
 * algorithmic ranges. inData and outData may be the same buffer.
 *
 * @param ds         swapper with the read/swap functions for this conversion
 * @param inData     input data item, starting with its data header
 * @param length     number of input bytes including the header, or <0 to preflight
 * @param outData    output buffer; may be nullptr when preflighting
 * @param pErrorCode in/out error code; set to U_UNSUPPORTED_ERROR for an
 *                   unrecognized format or range type, U_INDEX_OUTOFBOUNDS_ERROR
 *                   for truncated or inconsistent data, and
 *                   U_MEMORY_ALLOCATION_ERROR if the temporary token array
 *                   cannot be allocated
 * @return header size plus the offset just past the last algorithmic range,
 *         or 0 on failure
 */
U_CAPI int32_t U_EXPORT2
uchar_swapNames(const UDataSwapper *ds,
                const void *inData, int32_t length, void *outData,
                UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;

    const uint8_t *inBytes;
    uint8_t *outBytes;

    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
             offset, i, count, stringsCount;

    const AlgorithmicRange *inRange;
    AlgorithmicRange *outRange;

    /* udata_swapDataHeader checks the arguments */
    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* check data format and format version */
    pInfo=(const UDataInfo *)((const char *)inData+4);
    if(!(
        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
        pInfo->dataFormat[1]==0x6e &&
        pInfo->dataFormat[2]==0x61 &&
        pInfo->dataFormat[3]==0x6d &&
        pInfo->formatVersion[0]==1
    )) {
        udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    inBytes=(const uint8_t *)inData+headerSize;
    outBytes=(outData == nullptr) ? nullptr : (uint8_t *)outData+headerSize;
    if(length<0) {
        algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
    } else {
        length-=headerSize;
        /*
         * Need the 4 offsets plus the token count (20 bytes), and the
         * 32-bit range count stored at algNamesOffset must lie inside the data.
         * (length>=20 is checked first, so length-4 cannot wrap.)
         */
        if( length<20 ||
            (uint32_t)length-4<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
        ) {
            udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    if(length<0) {
        /* preflighting: iterate through algorithmic ranges */
        offset=algNamesOffset;
        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
        offset+=4;

        for(i=0; i<count; ++i) {
            inRange=(const AlgorithmicRange *)(inBytes+offset);
            offset+=ds->readUInt16(inRange->size);
        }
    } else {
        /* swap data */
        const uint16_t *p;
        uint16_t *q, *temp;

        int16_t tokens[512];
        uint16_t tokenCount;

        uint8_t map[256], trailMap[256];

        /* copy the data for inaccessible bytes */
        if(inBytes!=outBytes) {
            uprv_memcpy(outBytes, inBytes, length);
        }

        /* the initial 4 offsets first */
        tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
        groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
        groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
        ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);

        /*
         * now the tokens table
         * it needs to be permutated along with the compressed name strings
         */
        p=(const uint16_t *)(inBytes+16);
        q=(uint16_t *)(outBytes+16);

        /* read and swap the tokenCount */
        tokenCount=ds->readUInt16(*p);
        ds->swapArray16(ds, p, 2, q, pErrorCode);
        ++p;
        ++q;

        /* read the first 512 tokens and make the token maps */
        if(tokenCount<=512) {
            count=tokenCount;
        } else {
            count=512;
        }
        for(i=0; i<count; ++i) {
            tokens[i]=udata_readInt16(ds, p[i]);
        }
        for(; i<512; ++i) {
            tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
        }
        makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
        makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }

        /*
         * swap and permutate the tokens
         * go through a temporary array to support in-place swapping
         */
        temp=(uint16_t *)uprv_malloc(tokenCount*2);
        if(temp==nullptr) {
            udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
                             tokenCount);
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }

        /*
         * NOTE(review): the permuted indexes below are not checked against
         * tokenCount; confirm that map[]/trailMap[] cannot send a token
         * past the end of temp when the charset families differ.
         */

        /* swap and permutate single-/lead-byte tokens */
        for(i=0; i<tokenCount && i<256; ++i) {
            ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
        }

        /* swap and permutate trail-byte tokens */
        for(; i<tokenCount; ++i) {
            ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
        }

        /* copy the result into the output and free the temporary array */
        uprv_memcpy(q, temp, tokenCount*2);
        uprv_free(temp);

        /*
         * swap the token strings but not a possible padding byte after
         * the terminating NUL of the last string
         */
        udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
                                    outBytes+tokenStringOffset, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            udata_printError(ds, "uchar_swapNames(token strings) failed\n");
            return 0;
        }

        /* swap the group table: a 16-bit count followed by count*3 16-bit units */
        count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
        ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
                           outBytes+groupsOffset, pErrorCode);

        /*
         * swap the group strings
         * swap the string bytes but not the nibble-encoded string lengths
         */
        if(ds->inCharset!=ds->outCharset) {
            uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];

            const uint8_t *inStrings, *nextInStrings;
            uint8_t *outStrings;

            uint8_t c;

            inStrings=inBytes+groupStringOffset;
            outStrings=outBytes+groupStringOffset;

            stringsCount=algNamesOffset-groupStringOffset;

            /* iterate through string groups until only a few padding bytes are left */
            while(stringsCount>32) {
                nextInStrings=expandGroupLengths(inStrings, offsets, lengths);

                /* move past the length bytes */
                stringsCount-=(uint32_t)(nextInStrings-inStrings);
                outStrings+=nextInStrings-inStrings;
                inStrings=nextInStrings;

                count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
                stringsCount-=count;

                /* swap the string bytes using map[] and trailMap[] */
                while(count>0) {
                    c=*inStrings++;
                    *outStrings++=map[c];
                    if(tokens[c]!=-2) {
                        --count;
                    } else {
                        /* token lead byte: swap the trail byte, too */
                        *outStrings++=trailMap[*inStrings++];
                        count-=2;
                    }
                }
            }
        }

        /* swap the algorithmic ranges */
        offset=algNamesOffset;
        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
        ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
        offset+=4;

        for(i=0; i<count; ++i) {
            uint32_t rangeSize, dataSize;

            /* the fixed-size range header must lie entirely inside the data */
            if( (uint32_t)length<sizeof(AlgorithmicRange) ||
                offset>(uint32_t)length-(uint32_t)sizeof(AlgorithmicRange)
            ) {
                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
                                 length, i);
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }

            inRange=(const AlgorithmicRange *)(inBytes+offset);
            outRange=(AlgorithmicRange *)(outBytes+offset);

            /*
             * The size covers the header plus the trailing data, so it must be
             * at least the header size and must not run past the end of the
             * data. Read it before the header is (possibly in-place) swapped.
             */
            rangeSize=ds->readUInt16(inRange->size);
            if( rangeSize<sizeof(AlgorithmicRange) ||
                rangeSize>(uint32_t)length-offset
            ) {
                udata_printError(ds, "uchar_swapNames(): invalid size %u of unames.icu algorithmic range %u\n",
                                 rangeSize, i);
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            dataSize=rangeSize-(uint32_t)sizeof(AlgorithmicRange);
            offset+=rangeSize; /* now the offset just past this range */

            /* start and end (2 x 32 bits), then the 16-bit size; type and variant are single bytes */
            ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
            ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
            switch(inRange->type) {
            case 0:
                {
                    /* swap prefix string, up to but not including its NUL */
                    const char *prefix=(const char *)(inRange+1);
                    uint32_t prefixLength=0;

                    /* bounded scan: never look for the NUL beyond this range */
                    while(prefixLength<dataSize && prefix[prefixLength]!=0) {
                        ++prefixLength;
                    }
                    ds->swapInvChars(ds, prefix, (int32_t)prefixLength,
                                        outRange+1, pErrorCode);
                    if(U_FAILURE(*pErrorCode)) {
                        udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
                                         i);
                        return 0;
                    }
                }
                break;
            case 1:
                {
                    /* swap factors and the prefix and factor strings */
                    uint32_t factorsCount;

                    factorsCount=inRange->variant;
                    if(factorsCount*2>dataSize) {
                        /* the 16-bit factors would extend beyond this range */
                        udata_printError(ds, "uchar_swapNames(): too many factors (%u) for unames.icu algorithmic range %u\n",
                                         factorsCount, i);
                        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                        return 0;
                    }
                    p=(const uint16_t *)(inRange+1);
                    q=(uint16_t *)(outRange+1);
                    ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);

                    /* swap the strings, up to the last terminating NUL */
                    p+=factorsCount;
                    q+=factorsCount;
                    stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
                        --stringsCount;
                    }
                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
                }
                break;
            default:
                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
                                 inRange->type, i);
                *pErrorCode=U_UNSUPPORTED_ERROR;
                return 0;
            }
        }
    }

    return headerSize+(int32_t)offset;
}
2100
2101
/*
2102
 * Hey, Emacs, please set the following:
2103
 *
2104
 * Local Variables:
2105
 * indent-tabs-mode: nil
2106
 * End:
2107
 *
2108
 */