Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/common/unames.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 1999-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  unames.c
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 1999oct04
16
*   created by: Markus W. Scherer
17
*/
18
19
#include "unicode/utypes.h"
20
#include "unicode/putil.h"
21
#include "unicode/uchar.h"
22
#include "unicode/udata.h"
23
#include "unicode/utf.h"
24
#include "unicode/utf16.h"
25
#include "uassert.h"
26
#include "ustr_imp.h"
27
#include "umutex.h"
28
#include "cmemory.h"
29
#include "cstring.h"
30
#include "ucln_cmn.h"
31
#include "udataswp.h"
32
#include "uprops.h"
33
34
U_NAMESPACE_BEGIN
35
36
/* prototypes ------------------------------------------------------------- */
37
38
/* Type and name handed to udata_openChoice() when the names data is loaded. */
static const char DATA_TYPE[] = "icu";
static const char DATA_NAME[] = "unames";

/*
 * Names are stored in groups of LINES_PER_GROUP consecutive code points:
 * code>>GROUP_SHIFT identifies the group, code&GROUP_MASK the line inside it.
 */
#define GROUP_SHIFT     5
#define LINES_PER_GROUP (1L << GROUP_SHIFT)
#define GROUP_MASK      (LINES_PER_GROUP - 1)
44
45
/*
46
 * This struct was replaced by explicitly accessing equivalent
47
 * fields from triples of uint16_t.
48
 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
49
 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
50
 * would advance by 6 bytes (3 uint16_t).
51
 *
52
 * We can't just change the data structure because it's loaded from a data file,
53
 * and we don't want to make it less compact, so we changed the access code.
54
 *
55
 * For details see ICU tickets 6331 and 6008.
56
typedef struct {
57
    uint16_t groupMSB,
58
             offsetHigh, offsetLow; / * avoid padding * /
59
} Group;
60
 */
61
/* Indexes of the uint16_t fields inside one group record. */
enum {
    GROUP_MSB         = 0,  /* code point >> GROUP_SHIFT of the group */
    GROUP_OFFSET_HIGH = 1,  /* upper 16 bits of the group's string offset */
    GROUP_OFFSET_LOW  = 2,  /* lower 16 bits of the group's string offset */
    GROUP_LENGTH      = 3   /* number of uint16_t per group record */
};
67
68
/*
 * Get the 32-bit group offset.
 * The two halves are combined as uint32_t so that an offsetHigh value of
 * 0x8000 or above does not overflow a signed left shift; the combined
 * value is then converted to int32_t.
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)(((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16)|(group)[GROUP_OFFSET_LOW]))

/* Step from one Group triple to the following / preceding one. */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
77
78
/*
 * Header of one algorithmic-name range. getAlgName() switches on type and
 * reads variant as a count (hex digits for type 0, factors for type 1);
 * the range's payload is read from the bytes directly after this header.
 * NOTE(review): size is presumably the byte length of the whole range
 * record - it is not used in the code visible here.
 */
typedef struct {
    uint32_t start;
    uint32_t end;
    uint8_t type;
    uint8_t variant;
    uint16_t size;
} AlgorithmicRange;

/*
 * Header of the names data. tokenStringOffset, groupsOffset and
 * groupStringOffset are added to the address of this struct as byte
 * offsets; algNamesOffset is presumably used the same way (not shown here).
 */
typedef struct {
    uint32_t tokenStringOffset;
    uint32_t groupsOffset;
    uint32_t groupStringOffset;
    uint32_t algNamesOffset;
} UCharNames;
87
88
/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * can be used with any pointer expression and can be indexed or
 * dereferenced directly.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) \
    ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
98
99
typedef struct {
100
    const char *otherName;
101
    UChar32 code;
102
} FindName;
103
104
0
#define DO_FIND_NAME NULL
105
106
static UDataMemory *uCharNamesData=NULL;
107
static UCharNames *uCharNames=NULL;
108
static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;
109
110
/*
111
 * Maximum length of character names (regular & 1.0).
112
 */
113
static int32_t gMaxNameLength=0;
114
115
/*
116
 * Set of chars used in character names (regular & 1.0).
117
 * Chars are platform-dependent (can be EBCDIC).
118
 */
119
static uint32_t gNameSet[8]={ 0 };
120
121
0
/*
 * Pseudo-categories that getCharCat() returns in addition to the regular
 * general categories; they index the last three entries of charCatNames[].
 * Each expansion is parenthesized so that it is safe inside any expression.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
126
127
/*
 * Category words used for synthetic names, indexed by the value that
 * getCharCat() returns. The final three entries belong to
 * U_NONCHARACTER_CODE_POINT, U_LEAD_SURROGATE and U_TRAIL_SURROGATE.
 */
static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
    "unassigned",

    /* letters */
    "uppercase letter",
    "lowercase letter",
    "titlecase letter",
    "modifier letter",
    "other letter",

    /* marks */
    "non spacing mark",
    "enclosing mark",
    "combining spacing mark",

    /* numbers */
    "decimal digit number",
    "letter number",
    "other number",

    /* separators */
    "space separator",
    "line separator",
    "paragraph separator",

    /* others */
    "control",
    "format",
    "private use area",
    "surrogate",

    /* punctuation */
    "dash punctuation",
    "start punctuation",
    "end punctuation",
    "connector punctuation",
    "other punctuation",

    /* symbols */
    "math symbol",
    "currency symbol",
    "modifier symbol",
    "other symbol",

    /* quotation punctuation */
    "initial punctuation",
    "final punctuation",

    /* extended pseudo-categories */
    "noncharacter",
    "lead surrogate",
    "trail surrogate"
};
162
163
/* implementation ----------------------------------------------------------- */
164
165
/*
 * Cleanup callback registered by loadCharNames(): closes the names data if
 * it is open and puts the file-level state back to "not loaded".
 * Always returns TRUE.
 */
static UBool U_CALLCONV unames_cleanup(void)
{
    if(uCharNamesData!=NULL) {
        udata_close(uCharNamesData);
        uCharNamesData=NULL;
    }
    /* points into the data that was just closed */
    uCharNames=NULL;

    gCharNamesInitOnce.reset();
    gMaxNameLength=0;
    return TRUE;
}
178
179
/*
 * Acceptance callback given to udata_openChoice(): returns TRUE only for
 * data whose info block is at least 20 bytes, matches this platform's
 * endianness and charset family, has dataFormat "unam" and
 * formatVersion[0]==1.
 */
static UBool U_CALLCONV
isAcceptable(void * /*context*/,
             const char * /*type*/, const char * /*name*/,
             const UDataInfo *pInfo) {
    if(pInfo->size<20) {
        return FALSE;
    }
    if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN ||
       pInfo->charsetFamily!=U_CHARSET_FAMILY) {
        return FALSE;
    }
    /* dataFormat="unam" */
    if(pInfo->dataFormat[0]!=0x75 ||
       pInfo->dataFormat[1]!=0x6e ||
       pInfo->dataFormat[2]!=0x61 ||
       pInfo->dataFormat[3]!=0x6d) {
        return FALSE;
    }
    return (UBool)(pInfo->formatVersion[0]==1);
}
193
194
static void U_CALLCONV
195
0
loadCharNames(UErrorCode &status) {
196
0
    U_ASSERT(uCharNamesData == NULL);
197
0
    U_ASSERT(uCharNames == NULL);
198
0
199
0
    uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
200
0
    if(U_FAILURE(status)) {
201
0
        uCharNamesData = NULL;
202
0
    } else {
203
0
        uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
204
0
    }
205
0
    ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
206
0
}
207
208
209
/*
 * Makes sure the names data has been loaded, going through
 * gCharNamesInitOnce so that loadCharNames() is driven by umtx_initOnce().
 * @param pErrorCode in/out error code handed to umtx_initOnce()
 * @return whether *pErrorCode indicates success afterwards
 */
static UBool
isDataLoaded(UErrorCode *pErrorCode) {
    UBool loaded;

    umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
    loaded = U_SUCCESS(*pErrorCode);
    return loaded;
}
214
215
0
/*
 * Append one char to an output buffer while counting the full output length.
 * The char is stored, and the buffer pointer and remaining capacity are
 * updated, only while capacity is left; bufferPos is incremented every time,
 * so it ends up as the length that the complete output needs.
 *
 * Wrapped in do { } while(0) so that "WRITE_CHAR(...);" is a single
 * statement and is safe in an unbraced if/else. c is evaluated at most once
 * (not at all when the buffer is full).
 *
 * @param buffer       (char *) lvalue, advanced past each stored char
 * @param bufferLength lvalue, remaining capacity, decremented per stored char
 * @param bufferPos    lvalue, output length counter
 * @param c            the char to append
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) do { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} while(0)

/*
 * Internal name choice one past the regular UCharNameChoice values;
 * expandName() and compareName() map it to field 2 of a stored name.
 */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
224
225
/*
226
 * Important: expandName() and compareName() are almost the same -
227
 * apply fixes to both.
228
 *
229
 * UnicodeData.txt uses ';' as a field separator, so no
230
 * field can contain ';' as part of its contents.
231
 * In unames.dat, it is marked as token[';']==-1 only if the
232
 * semicolon is used in the data file - which is iff we
233
 * have Unicode 1.0 names or ISO comments or aliases.
234
 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
235
 * although we know that it will never be part of a name.
236
 */
237
static uint16_t
238
expandName(UCharNames *names,
239
           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
240
0
           char *buffer, uint16_t bufferLength) {
241
0
    uint16_t *tokens=(uint16_t *)names+8;
242
0
    uint16_t token, tokenCount=*tokens++, bufferPos=0;
243
0
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
244
0
    uint8_t c;
245
0
246
0
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
247
0
        /*
248
0
         * skip the modern name if it is not requested _and_
249
0
         * if the semicolon byte value is a character, not a token number
250
0
         */
251
0
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
252
0
            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
253
0
            do {
254
0
                while(nameLength>0) {
255
0
                    --nameLength;
256
0
                    if(*name++==';') {
257
0
                        break;
258
0
                    }
259
0
                }
260
0
            } while(--fieldIndex>0);
261
0
        } else {
262
0
            /*
263
0
             * the semicolon byte value is a token number, therefore
264
0
             * only modern names are stored in unames.dat and there is no
265
0
             * such requested alternate name here
266
0
             */
267
0
            nameLength=0;
268
0
        }
269
0
    }
270
0
271
0
    /* write each letter directly, and write a token word per token */
272
0
    while(nameLength>0) {
273
0
        --nameLength;
274
0
        c=*name++;
275
0
276
0
        if(c>=tokenCount) {
277
0
            if(c!=';') {
278
0
                /* implicit letter */
279
0
                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
280
0
            } else {
281
0
                /* finished */
282
0
                break;
283
0
            }
284
0
        } else {
285
0
            token=tokens[c];
286
0
            if(token==(uint16_t)(-2)) {
287
0
                /* this is a lead byte for a double-byte token */
288
0
                token=tokens[c<<8|*name++];
289
0
                --nameLength;
290
0
            }
291
0
            if(token==(uint16_t)(-1)) {
292
0
                if(c!=';') {
293
0
                    /* explicit letter */
294
0
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
295
0
                } else {
296
0
                    /* stop, but skip the semicolon if we are seeking
297
0
                       extended names and there was no 2.0 name but there
298
0
                       is a 1.0 name. */
299
0
                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
300
0
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
301
0
                            continue;
302
0
                        }
303
0
                    }
304
0
                    /* finished */
305
0
                    break;
306
0
                }
307
0
            } else {
308
0
                /* write token word */
309
0
                uint8_t *tokenString=tokenStrings+token;
310
0
                while((c=*tokenString++)!=0) {
311
0
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
312
0
                }
313
0
            }
314
0
        }
315
0
    }
316
0
317
0
    /* zero-terminate */
318
0
    if(bufferLength>0) {
319
0
        *buffer=0;
320
0
    }
321
0
322
0
    return bufferPos;
323
0
}
324
325
/*
326
 * compareName() is almost the same as expandName() except that it compares
327
 * the currently expanded name to an input name.
328
 * It returns the match/no match result as soon as possible.
329
 */
330
static UBool
331
compareName(UCharNames *names,
332
            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
333
0
            const char *otherName) {
334
0
    uint16_t *tokens=(uint16_t *)names+8;
335
0
    uint16_t token, tokenCount=*tokens++;
336
0
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
337
0
    uint8_t c;
338
0
    const char *origOtherName = otherName;
339
0
340
0
    if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
341
0
        /*
342
0
         * skip the modern name if it is not requested _and_
343
0
         * if the semicolon byte value is a character, not a token number
344
0
         */
345
0
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
346
0
            int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
347
0
            do {
348
0
                while(nameLength>0) {
349
0
                    --nameLength;
350
0
                    if(*name++==';') {
351
0
                        break;
352
0
                    }
353
0
                }
354
0
            } while(--fieldIndex>0);
355
0
        } else {
356
0
            /*
357
0
             * the semicolon byte value is a token number, therefore
358
0
             * only modern names are stored in unames.dat and there is no
359
0
             * such requested alternate name here
360
0
             */
361
0
            nameLength=0;
362
0
        }
363
0
    }
364
0
365
0
    /* compare each letter directly, and compare a token word per token */
366
0
    while(nameLength>0) {
367
0
        --nameLength;
368
0
        c=*name++;
369
0
370
0
        if(c>=tokenCount) {
371
0
            if(c!=';') {
372
0
                /* implicit letter */
373
0
                if((char)c!=*otherName++) {
374
0
                    return FALSE;
375
0
                }
376
0
            } else {
377
0
                /* finished */
378
0
                break;
379
0
            }
380
0
        } else {
381
0
            token=tokens[c];
382
0
            if(token==(uint16_t)(-2)) {
383
0
                /* this is a lead byte for a double-byte token */
384
0
                token=tokens[c<<8|*name++];
385
0
                --nameLength;
386
0
            }
387
0
            if(token==(uint16_t)(-1)) {
388
0
                if(c!=';') {
389
0
                    /* explicit letter */
390
0
                    if((char)c!=*otherName++) {
391
0
                        return FALSE;
392
0
                    }
393
0
                } else {
394
0
                    /* stop, but skip the semicolon if we are seeking
395
0
                       extended names and there was no 2.0 name but there
396
0
                       is a 1.0 name. */
397
0
                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
398
0
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
399
0
                            continue;
400
0
                        }
401
0
                    }
402
0
                    /* finished */
403
0
                    break;
404
0
                }
405
0
            } else {
406
0
                /* write token word */
407
0
                uint8_t *tokenString=tokenStrings+token;
408
0
                while((c=*tokenString++)!=0) {
409
0
                    if((char)c!=*otherName++) {
410
0
                        return FALSE;
411
0
                    }
412
0
                }
413
0
            }
414
0
        }
415
0
    }
416
0
417
0
    /* complete match? */
418
0
    return (UBool)(*otherName==0);
419
0
}
420
421
0
/*
 * Returns the category used for synthetic names: the u_charType() value,
 * except that noncharacters yield U_NONCHARACTER_CODE_POINT and surrogates
 * are split into U_LEAD_SURROGATE and U_TRAIL_SURROGATE.
 */
static uint8_t getCharCat(UChar32 cp) {
    uint8_t cat;

    /* checked first, before the general category is looked up */
    if (U_IS_UNICODE_NONCHAR(cp)) {
        return U_NONCHARACTER_CODE_POINT;
    }

    cat = u_charType(cp);
    if (cat != U_SURROGATE) {
        return cat;
    }

    cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
    return cat;
}
434
435
0
/*
 * Returns the category word for cp from charCatNames[], or "unknown" if
 * the table of names above is not up to date with the category values.
 */
static const char *getCharCatName(UChar32 cp) {
    uint8_t cat = getCharCat(cp);

    return cat < UPRV_LENGTHOF(charCatNames) ? charCatNames[cat] : "unknown";
}
447
448
0
static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
449
0
    const char *catname = getCharCatName(code);
450
0
    uint16_t length = 0;
451
0
452
0
    UChar32 cp;
453
0
    int ndigits, i;
454
0
    
455
0
    WRITE_CHAR(buffer, bufferLength, length, '<');
456
0
    while (catname[length - 1]) {
457
0
        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
458
0
    }
459
0
    WRITE_CHAR(buffer, bufferLength, length, '-');
460
0
    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
461
0
        ;
462
0
    if (ndigits < 4)
463
0
        ndigits = 4;
464
0
    for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
465
0
        uint8_t v = (uint8_t)(cp & 0xf);
466
0
        buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
467
0
    }
468
0
    buffer += ndigits;
469
0
    length += ndigits;
470
0
    WRITE_CHAR(buffer, bufferLength, length, '>');
471
0
472
0
    return length;
473
0
}
474
475
/*
476
 * getGroup() does a binary search for the group that contains the
477
 * Unicode code point "code".
478
 * The return value is always a valid Group* that may contain "code"
479
 * or else is the highest group before "code".
480
 * If the lowest group is after "code", then that one is returned.
481
 */
482
static const uint16_t *
483
0
getGroup(UCharNames *names, uint32_t code) {
484
0
    const uint16_t *groups=GET_GROUPS(names);
485
0
    uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
486
0
             start=0,
487
0
             limit=*groups++,
488
0
             number;
489
0
490
0
    /* binary search for the group of names that contains the one for code */
491
0
    while(start<limit-1) {
492
0
        number=(uint16_t)((start+limit)/2);
493
0
        if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
494
0
            limit=number;
495
0
        } else {
496
0
            start=number;
497
0
        }
498
0
    }
499
0
500
0
    /* return this regardless of whether it is an exact match */
501
0
    return groups+start*GROUP_LENGTH;
502
0
}
503
504
/*
505
 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
506
 * expands them into offsets and lengths for each string.
507
 * Lengths are stored with a variable-width encoding in consecutive nibbles:
508
 * If a nibble<0xc, then it is the length itself (0=empty string).
509
 * If a nibble>=0xc, then it forms a length value with the following nibble.
510
 * Calculation see below.
511
 * The offsets and lengths arrays must be at least 33 (one more) long because
512
 * there is no check here at the end if the last nibble is still used.
513
 */
514
static const uint8_t *
515
expandGroupLengths(const uint8_t *s,
516
0
                   uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
517
0
    /* read the lengths of the 32 strings in this group and get each string's offset */
518
0
    uint16_t i=0, offset=0, length=0;
519
0
    uint8_t lengthByte;
520
0
521
0
    /* all 32 lengths must be read to get the offset of the first group string */
522
0
    while(i<LINES_PER_GROUP) {
523
0
        lengthByte=*s++;
524
0
525
0
        /* read even nibble - MSBs of lengthByte */
526
0
        if(length>=12) {
527
0
            /* double-nibble length spread across two bytes */
528
0
            length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
529
0
            lengthByte&=0xf;
530
0
        } else if((lengthByte /* &0xf0 */)>=0xc0) {
531
0
            /* double-nibble length spread across this one byte */
532
0
            length=(uint16_t)((lengthByte&0x3f)+12);
533
0
        } else {
534
0
            /* single-nibble length in MSBs */
535
0
            length=(uint16_t)(lengthByte>>4);
536
0
            lengthByte&=0xf;
537
0
        }
538
0
539
0
        *offsets++=offset;
540
0
        *lengths++=length;
541
0
542
0
        offset+=length;
543
0
        ++i;
544
0
545
0
        /* read odd nibble - LSBs of lengthByte */
546
0
        if((lengthByte&0xf0)==0) {
547
0
            /* this nibble was not consumed for a double-nibble length above */
548
0
            length=lengthByte;
549
0
            if(length<12) {
550
0
                /* single-nibble length in LSBs */
551
0
                *offsets++=offset;
552
0
                *lengths++=length;
553
0
554
0
                offset+=length;
555
0
                ++i;
556
0
            }
557
0
        } else {
558
0
            length=0;   /* prevent double-nibble detection in the next iteration */
559
0
        }
560
0
    }
561
0
562
0
    /* now, s is at the first group string */
563
0
    return s;
564
0
}
565
566
static uint16_t
567
expandGroupName(UCharNames *names, const uint16_t *group,
568
                uint16_t lineNumber, UCharNameChoice nameChoice,
569
0
                char *buffer, uint16_t bufferLength) {
570
0
    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
571
0
    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
572
0
    s=expandGroupLengths(s, offsets, lengths);
573
0
    return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
574
0
                      buffer, bufferLength);
575
0
}
576
577
static uint16_t
578
getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
579
0
        char *buffer, uint16_t bufferLength) {
580
0
    const uint16_t *group=getGroup(names, code);
581
0
    if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
582
0
        return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
583
0
                               buffer, bufferLength);
584
0
    } else {
585
0
        /* group not found */
586
0
        /* zero-terminate */
587
0
        if(bufferLength>0) {
588
0
            *buffer=0;
589
0
        }
590
0
        return 0;
591
0
    }
592
0
}
593
594
/*
595
 * enumGroupNames() enumerates all the names in a 32-group
596
 * and either calls the enumerator function or finds a given input name.
597
 */
598
static UBool
599
enumGroupNames(UCharNames *names, const uint16_t *group,
600
               UChar32 start, UChar32 end,
601
               UEnumCharNamesFn *fn, void *context,
602
0
               UCharNameChoice nameChoice) {
603
0
    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
604
0
    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
605
0
606
0
    s=expandGroupLengths(s, offsets, lengths);
607
0
    if(fn!=DO_FIND_NAME) {
608
0
        char buffer[200];
609
0
        uint16_t length;
610
0
611
0
        while(start<=end) {
612
0
            length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
613
0
            if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
614
0
                buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
615
0
            }
616
0
            /* here, we assume that the buffer is large enough */
617
0
            if(length>0) {
618
0
                if(!fn(context, start, nameChoice, buffer, length)) {
619
0
                    return FALSE;
620
0
                }
621
0
            }
622
0
            ++start;
623
0
        }
624
0
    } else {
625
0
        const char *otherName=((FindName *)context)->otherName;
626
0
        while(start<=end) {
627
0
            if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
628
0
                ((FindName *)context)->code=start;
629
0
                return FALSE;
630
0
            }
631
0
            ++start;
632
0
        }
633
0
    }
634
0
    return TRUE;
635
0
}
636
637
/*
638
 * enumExtNames enumerate extended names.
639
 * It only needs to do it if it is called with a real function and not
640
 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
641
 * for extended names by itself.
642
 */ 
643
static UBool
644
enumExtNames(UChar32 start, UChar32 end,
645
             UEnumCharNamesFn *fn, void *context)
646
0
{
647
0
    if(fn!=DO_FIND_NAME) {
648
0
        char buffer[200];
649
0
        uint16_t length;
650
0
        
651
0
        while(start<=end) {
652
0
            buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
653
0
            /* here, we assume that the buffer is large enough */
654
0
            if(length>0) {
655
0
                if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
656
0
                    return FALSE;
657
0
                }
658
0
            }
659
0
            ++start;
660
0
        }
661
0
    }
662
0
663
0
    return TRUE;
664
0
}
665
666
/*
 * Enumerates the names of the code points in [start, limit).
 * Stored names come from the groups via enumGroupNames(); for
 * U_EXTENDED_CHAR_NAME the code points in the gaps before, between and
 * after the groups get synthetic names via enumExtNames().
 * fn/context are passed through to those functions (fn may be DO_FIND_NAME).
 * Returns FALSE as soon as one of them returns FALSE, otherwise TRUE.
 */
static UBool
enumNames(UCharNames *names,
          UChar32 start, UChar32 limit,
          UEnumCharNamesFn *fn, void *context,
          UCharNameChoice nameChoice) {
    /* both are computed from the original arguments; start changes below */
    const uint16_t startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
    const uint16_t endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);

    /* find the group that contains start, or the highest before it */
    const uint16_t *group=getGroup(names, start);

    if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
        /* enumerate synthetic names between start and the group start */
        UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
        if(extLimit>limit) {
            extLimit=limit;
        }
        if(!enumExtNames(start, extLimit-1, fn, context)) {
            return FALSE;
        }
        start=extLimit;
    }

    if(startGroupMSB==endGroupMSB) {
        if(startGroupMSB==group[GROUP_MSB]) {
            /* if start and limit-1 are in the same group, then enumerate only in that one */
            return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
        }
    } else {
        const uint16_t *groups=GET_GROUPS(names);
        const uint16_t groupCount=*groups++;
        const uint16_t *groupLimit=groups+groupCount*GROUP_LENGTH;

        if(startGroupMSB==group[GROUP_MSB]) {
            /* enumerate characters in the partial start group */
            if((start&GROUP_MASK)!=0) {
                if(!enumGroupNames(names, group,
                                   start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
                                   fn, context, nameChoice)) {
                    return FALSE;
                }
                group=NEXT_GROUP(group); /* continue with the next group */
            }
        } else if(startGroupMSB>group[GROUP_MSB]) {
            /* make sure that we start enumerating with the first group after start */
            const uint16_t *following=NEXT_GROUP(group);
            if (following < groupLimit && following[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
                /* synthetic names from start up to the following group */
                UChar32 gapLimit = following[GROUP_MSB] << GROUP_SHIFT;
                if (gapLimit > limit) {
                    gapLimit = limit;
                }
                if (!enumExtNames(start, gapLimit - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=following;
        }

        /* enumerate entire groups between the start- and end-groups */
        while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
            const uint16_t *following;

            start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
            if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
                return FALSE;
            }

            following=NEXT_GROUP(group);
            if (following < groupLimit && following[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
                /* synthetic names for the gap between this group and the following one */
                UChar32 gapLimit = following[GROUP_MSB] << GROUP_SHIFT;
                if (gapLimit > limit) {
                    gapLimit = limit;
                }
                if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, gapLimit - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=following;
        }

        /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
        if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
            return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
        } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
            /* ran past the last group: continue after it with synthetic names below */
            UChar32 afterLast = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
            if (afterLast > start) {
                start = afterLast;
            }
        } else {
            return TRUE;
        }
    }

    /* we have not found a group, which means everything is made of
       extended names. */
    if (nameChoice != U_EXTENDED_CHAR_NAME) {
        return TRUE;
    }
    if (limit > UCHAR_MAX_VALUE + 1) {
        limit = UCHAR_MAX_VALUE + 1;
    }
    return enumExtNames(start, limit - 1, fn, context);
}
771
772
static uint16_t
773
writeFactorSuffix(const uint16_t *factors, uint16_t count,
774
                  const char *s, /* suffix elements */
775
                  uint32_t code,
776
                  uint16_t indexes[8], /* output fields from here */
777
                  const char *elementBases[8], const char *elements[8],
778
0
                  char *buffer, uint16_t bufferLength) {
779
0
    uint16_t i, factor, bufferPos=0;
780
0
    char c;
781
0
782
0
    /* write elements according to the factors */
783
0
784
0
    /*
785
0
     * the factorized elements are determined by modulo arithmetic
786
0
     * with the factors of this algorithm
787
0
     *
788
0
     * note that for fewer operations, count is decremented here
789
0
     */
790
0
    --count;
791
0
    for(i=count; i>0; --i) {
792
0
        factor=factors[i];
793
0
        indexes[i]=(uint16_t)(code%factor);
794
0
        code/=factor;
795
0
    }
796
0
    /*
797
0
     * we don't need to calculate the last modulus because start<=code<=end
798
0
     * guarantees here that code<=factors[0]
799
0
     */
800
0
    indexes[0]=(uint16_t)code;
801
0
802
0
    /* write each element */
803
0
    for(;;) {
804
0
        if(elementBases!=NULL) {
805
0
            *elementBases++=s;
806
0
        }
807
0
808
0
        /* skip indexes[i] strings */
809
0
        factor=indexes[i];
810
0
        while(factor>0) {
811
0
            while(*s++!=0) {}
812
0
            --factor;
813
0
        }
814
0
        if(elements!=NULL) {
815
0
            *elements++=s;
816
0
        }
817
0
818
0
        /* write element */
819
0
        while((c=*s++)!=0) {
820
0
            WRITE_CHAR(buffer, bufferLength, bufferPos, c);
821
0
        }
822
0
823
0
        /* we do not need to perform the rest of this loop for i==count - break here */
824
0
        if(i>=count) {
825
0
            break;
826
0
        }
827
0
828
0
        /* skip the rest of the strings for this factors[i] */
829
0
        factor=(uint16_t)(factors[i]-indexes[i]-1);
830
0
        while(factor>0) {
831
0
            while(*s++!=0) {}
832
0
            --factor;
833
0
        }
834
0
835
0
        ++i;
836
0
    }
837
0
838
0
    /* zero-terminate */
839
0
    if(bufferLength>0) {
840
0
        *buffer=0;
841
0
    }
842
0
843
0
    return bufferPos;
844
0
}
845
846
/*
847
 * Important:
848
 * Parts of findAlgName() are almost the same as some of getAlgName().
849
 * Fixes must be applied to both.
850
 */
851
/*
 * Writes the algorithmic name of a code point that lies in the given range.
 *
 * range        algorithmic range descriptor; its type selects the scheme
 *              (0: prefix + hex digits, 1: prefix + factorized elements)
 * code         code point to name
 * nameChoice   only U_UNICODE_CHAR_NAME and U_EXTENDED_CHAR_NAME yield a name
 * buffer       output buffer
 * bufferLength capacity of buffer in chars
 *
 * Returns the accumulated name length; 0 for other name choices and for
 * unknown range types.
 */
static uint16_t
getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
        char *buffer, uint16_t bufferLength) {
    uint16_t written=0;

    /* Algorithmic names exist only for the normative/extended name choices. */
    if(!(nameChoice==U_UNICODE_CHAR_NAME || nameChoice==U_EXTENDED_CHAR_NAME)) {
        /* hand back an empty, terminated string */
        if(bufferLength>0) {
            buffer[0]=0;
        }
        return 0;
    }

    if(range->type==0) {
        /* scheme 0: name = prefix hex-digits */
        const char *prefix=(const char *)(range+1);
        uint16_t digitCount=range->variant;
        uint16_t pos;
        char ch;

        /* emit the prefix that is stored right behind the range header */
        for(ch=*prefix++; ch!=0; ch=*prefix++) {
            WRITE_CHAR(buffer, bufferLength, written, ch);
        }

        /* terminate behind the last digit if there is room for it */
        if(digitCount<bufferLength) {
            buffer[digitCount]=0;
        }

        /* fill in the digits from the least significant one backwards */
        pos=digitCount;
        while(pos>0) {
            --pos;
            if(pos<bufferLength) {
                char digit=(char)(code&0xf);
                digit=(char)(digit<10 ? digit+'0' : digit+('A'-10));
                buffer[pos]=digit;
            }
            code>>=4;
        }

        /* the digits count toward the length even where they did not fit */
        written+=digitCount;
    } else if(range->type==1) {
        /* scheme 1: name = prefix factorized-elements */
        uint16_t indexes[8];
        const uint16_t *factors=(const uint16_t *)(range+1);
        uint16_t factorCount=range->variant;
        const char *text=(const char *)(factors+factorCount);
        char ch;

        /* the prefix follows the factors array */
        for(ch=*text++; ch!=0; ch=*text++) {
            WRITE_CHAR(buffer, bufferLength, written, ch);
        }

        /* the element strings follow the prefix */
        written+=writeFactorSuffix(factors, factorCount,
                                   text, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
    } else {
        /* undefined type: empty, terminated string */
        if(bufferLength>0) {
            buffer[0]=0;
        }
    }

    return written;
}
930
931
/*
932
 * Important: enumAlgNames() and findAlgName() are almost the same.
933
 * Any fix must be applied to both.
934
 */
935
/*
 * Calls fn for every code point in [start, limit) using the algorithmic
 * names of the given range.
 *
 * The first name is generated in full; each following name is derived from
 * the previous one in place (hex increment for type 0, index "odometer"
 * for type 1).
 *
 * Returns FALSE as soon as fn returns FALSE, otherwise TRUE (also for name
 * choices without algorithmic names and for unknown range types).
 */
static UBool
enumAlgNames(AlgorithmicRange *range,
             UChar32 start, UChar32 limit,
             UEnumCharNamesFn *fn, void *context,
             UCharNameChoice nameChoice) {
    char buffer[200];
    uint16_t length;

    if(!(nameChoice==U_UNICODE_CHAR_NAME || nameChoice==U_EXTENDED_CHAR_NAME)) {
        return TRUE;
    }

    switch(range->type) {
    case 0: {
        char *nameEnd, *digit;
        char ch;

        /* build the complete name of the first code point */
        length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
        if(length==0) {
            return TRUE;
        }

        /* report the first code point */
        if(!fn(context, start, nameChoice, buffer, length)) {
            return FALSE;
        }

        /* find the terminator; every name in this range has the same length */
        for(nameEnd=buffer; *nameEnd!=0; ++nameEnd) {
        }

        /* report the remaining code points */
        for(++start; start<limit; ++start) {
            /* add one to the hexadecimal number, digit by digit from the right */
            for(digit=nameEnd-1;; --digit) {
                ch=*digit;
                if(('0'<=ch && ch<'9') || ('A'<=ch && ch<'F')) {
                    *digit=(char)(ch+1);
                    break;
                }
                if(ch=='9') {
                    *digit='A';
                    break;
                }
                if(ch=='F') {
                    /* wrap this digit and carry into the next one to the left */
                    *digit='0';
                }
            }

            if(!fn(context, start, nameChoice, buffer, length)) {
                return FALSE;
            }
        }
        break;
    }
    case 1: {
        /* name = prefix factorized-elements */
        uint16_t indexes[8];
        const char *elementBases[8], *elements[8];
        const uint16_t *factors=(const uint16_t *)(range+1);
        uint16_t factorCount=range->variant;
        const char *src=(const char *)(factors+factorCount);
        char *suffix, *dst;
        uint16_t prefixLength, pos, next;
        UBool carry;
        char ch;

        /* copy the prefix to the front of the buffer */
        suffix=buffer;
        prefixLength=0;
        for(ch=*src++; ch!=0; ch=*src++) {
            *suffix++=ch;
            ++prefixLength;
        }

        /* append the suffix of the first code point; this also fills the index arrays */
        length=(uint16_t)(prefixLength+writeFactorSuffix(factors, factorCount,
                                              src, (uint32_t)start-range->start,
                                              indexes, elementBases, elements,
                                              suffix, (uint16_t)(sizeof(buffer)-prefixLength)));

        /* report the first code point */
        if(!fn(context, start, nameChoice, buffer, length)) {
            return FALSE;
        }

        /* report the remaining code points */
        for(++start; start<limit; ++start) {
            /* advance the indexes like an odometer, rightmost factor first */
            pos=factorCount;
            do {
                --pos;
                next=(uint16_t)(indexes[pos]+1);
                carry=(UBool)(next>=factors[pos]);
                if(carry) {
                    /* wrap to the first element string and carry to the left */
                    indexes[pos]=0;
                    elements[pos]=elementBases[pos];
                } else {
                    /* step to the next element string of this factor */
                    indexes[pos]=next;
                    src=elements[pos];
                    while(*src++!=0) {
                    }
                    elements[pos]=src;
                }
            } while(carry);

            /* rebuild the whole suffix from the current element strings */
            dst=suffix;
            length=prefixLength;
            for(pos=0; pos<factorCount; ++pos) {
                src=elements[pos];
                for(ch=*src++; ch!=0; ch=*src++) {
                    *dst++=ch;
                    ++length;
                }
            }
            *dst=0;

            if(!fn(context, start, nameChoice, buffer, length)) {
                return FALSE;
            }
        }
        break;
    }
    default:
        /* undefined type */
        break;
    }

    return TRUE;
}
1071
1072
/*
1073
 * findAlgName() is almost the same as enumAlgNames() except that it
1074
 * returns the code point for a name if it fits into the range.
1075
 * It returns 0xffff otherwise.
1076
 */
1077
/*
 * Looks up otherName among the algorithmic names of the given range.
 *
 * Returns the matching code point, or 0xffff when the name choice has no
 * algorithmic names, the range type is unknown, or nothing matches.
 */
static UChar32
findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
    UChar32 code;

    if(!(nameChoice==U_UNICODE_CHAR_NAME || nameChoice==U_EXTENDED_CHAR_NAME)) {
        return 0xffff;
    }

    switch(range->type) {
    case 0: {
        /* name = prefix hex-digits */
        const char *prefix=(const char *)(range+1);
        uint16_t digitCount, n;
        char ch;

        /* the candidate must start with the range prefix */
        for(ch=*prefix++; ch!=0; ch=*prefix++) {
            if((char)ch!=*otherName++) {
                return 0xffff;
            }
        }

        /* parse exactly range->variant uppercase hexadecimal digits */
        digitCount=range->variant;
        code=0;
        for(n=0; n<digitCount; ++n) {
            ch=*otherName++;
            if('0'<=ch && ch<='9') {
                code=(code<<4)|(ch-'0');
            } else if('A'<=ch && ch<='F') {
                code=(code<<4)|(ch-'A'+10);
            } else {
                return 0xffff;
            }
        }

        /* accept only a fully consumed name whose value lies inside the range */
        if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
            return code;
        }
        break;
    }
    case 1: {
        /* name = prefix factorized-elements */
        char buffer[64];
        uint16_t indexes[8];
        const char *elementBases[8], *elements[8];
        const uint16_t *factors=(const uint16_t *)(range+1);
        uint16_t factorCount=range->variant;
        const char *src=(const char *)(factors+factorCount), *probe;
        UChar32 start, limit;
        uint16_t pos, next;
        UBool carry, matches;
        char ch;

        /* the candidate must start with the range prefix */
        for(ch=*src++; ch!=0; ch=*src++) {
            if((char)ch!=*otherName++) {
                return 0xffff;
            }
        }

        start=(UChar32)range->start;
        limit=(UChar32)(range->end+1);

        /* set up the enumeration state for offset 0 and get the first suffix */
        writeFactorSuffix(factors, factorCount, src, 0,
                          indexes, elementBases, elements, buffer, sizeof(buffer));

        /* try the first suffix */
        if(0==uprv_strcmp(otherName, buffer)) {
            return start;
        }

        /* walk through the remaining suffixes */
        for(++start; start<limit; ++start) {
            /* advance the indexes like an odometer, rightmost factor first */
            pos=factorCount;
            do {
                --pos;
                next=(uint16_t)(indexes[pos]+1);
                carry=(UBool)(next>=factors[pos]);
                if(carry) {
                    /* wrap to the first element string and carry to the left */
                    indexes[pos]=0;
                    elements[pos]=elementBases[pos];
                } else {
                    /* step to the next element string of this factor */
                    indexes[pos]=next;
                    src=elements[pos];
                    while(*src++!=0) {}
                    elements[pos]=src;
                }
            } while(carry);

            /* compare the candidate against the concatenated element strings */
            probe=otherName;
            matches=TRUE;
            for(pos=0; matches && pos<factorCount; ++pos) {
                src=elements[pos];
                for(ch=*src++; ch!=0; ch=*src++) {
                    if(ch!=*probe++) {
                        matches=FALSE;
                        break;
                    }
                }
            }
            /* every element matched and the candidate has nothing left over */
            if(matches && *probe==0) {
                return start;
            }
        }
        break;
    }
    default:
        /* undefined type */
        break;
    }

    return 0xffff;
}
1197
1198
/* sets of name characters, maximum name lengths ---------------------------- */
1199
1200
0
/*
 * Helpers for a 256-bit set stored as eight uint32_t words.
 * c is reduced to its low 8 bits: bits 7..5 pick the word, bits 4..0 pick
 * the bit inside that word.
 * The arguments are parenthesized so that arbitrary expressions (for example
 * a|b) can be passed; note that c is still evaluated twice.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1202
1203
/*
 * Adds every char of the NUL-terminated string s to the bit set and
 * returns the number of chars in s (not counting the terminator).
 */
static int32_t
calcStringSetLength(uint32_t set[8], const char *s) {
    const char *p;

    for(p=s; *p!=0; ++p) {
        SET_ADD(set, *p);
    }
    return (int32_t)(p-s);
}
1214
1215
/*
 * Walks all algorithmic ranges, adds the chars of their prefixes and element
 * strings to gNameSet, and returns the larger of maxNameLength and the
 * longest name length that any of the ranges can produce.
 */
static int32_t
calcAlgNameSetsLengths(int32_t maxNameLength) {
    uint32_t *p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
    AlgorithmicRange *range=(AlgorithmicRange *)(p+1);
    uint32_t remaining;
    int32_t length;

    /* the first word is the number of ranges; the range records follow it */
    for(remaining=*p; remaining>0; --remaining) {
        if(range->type==0) {
            /* name = prefix + (range->variant times) hex-digits */
            length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
            if(maxNameLength<length) {
                maxNameLength=length;
            }
        } else if(range->type==1) {
            /* name = prefix factorized-elements */
            const uint16_t *factors=(const uint16_t *)(range+1);
            int32_t factorCount=range->variant;
            int32_t f, k, elementLength, longestElement;
            const char *text;

            /* the prefix follows the factors array */
            text=(const char *)(factors+factorCount);
            length=calcStringSetLength(gNameSet, text);
            text+=length+1; /* first element string */

            /* for each factor, add the longest of its element strings */
            for(f=0; f<factorCount; ++f) {
                longestElement=0;
                for(k=factors[f]; k>0; --k) {
                    elementLength=calcStringSetLength(gNameSet, text);
                    text+=elementLength+1;
                    if(longestElement<elementLength) {
                        longestElement=elementLength;
                    }
                }
                length+=longestElement;
            }

            if(maxNameLength<length) {
                maxNameLength=length;
            }
        }
        /* any other type is unknown and contributes nothing */

        /* records have variable size; step by the size stored in the record */
        range=(AlgorithmicRange *)((uint8_t *)range+range->size);
    }
    return maxNameLength;
}
1275
1276
/*
 * Adds the chars of all category names to gNameSet and returns the larger
 * of maxNameLength and the longest possible extended name length.
 */
static int32_t
calcExtNameSetsLengths(int32_t maxNameLength) {
    int32_t longest=maxNameLength;
    int32_t cat;

    for(cat=0; cat<UPRV_LENGTHOF(charCatNames); ++cat) {
        /*
         * extended name = '<' category '-' hex-digits '>':
         * category name plus 9 = 2 for <>, 1 for -, 6 for most hex digits per code point
         */
        int32_t candidate=9+calcStringSetLength(gNameSet, charCatNames[cat]);
        if(longest<candidate) {
            longest=candidate;
        }
    }
    return longest;
}
1295
1296
/*
 * Measures one ';'-delimited name field of a group line and adds its chars
 * to set.
 *
 * tokens/tokenCount  token table; (uint16_t)-2 marks a lead byte and
 *                    (uint16_t)-1 an explicit letter
 * tokenStrings       base of the token word strings
 * tokenLengths       optional cache of token word lengths (may be NULL);
 *                    a cached 0 means "not measured yet"
 * pLine              in: start of the field; out: position behind the field
 *                    (behind the ';' if one was consumed)
 * lineLimit          end of the line
 *
 * Returns the expanded length of the field.
 */
static int32_t
calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
                  uint32_t set[8],
                  const uint8_t **pLine, const uint8_t *lineLimit) {
    const uint8_t *cursor=*pLine;
    int32_t total=0, wordLength;
    uint16_t c, token;

    while(cursor!=lineLimit) {
        c=*cursor++;
        if(c==(uint8_t)';') {
            /* field separator: consumed, field is complete */
            break;
        }

        if(c>=tokenCount) {
            /* implicit letter */
            SET_ADD(set, c);
            ++total;
            continue;
        }

        token=tokens[c];
        if(token==(uint16_t)(-2)) {
            /* lead byte of a double-byte token: combine it with the trail byte */
            c=c<<8|*cursor++;
            token=tokens[c];
        }

        if(token==(uint16_t)(-1)) {
            /* explicit letter */
            SET_ADD(set, c);
            ++total;
            continue;
        }

        /* token word: measure it, going through the cache if there is one */
        if(tokenLengths==NULL) {
            wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
        } else {
            wordLength=tokenLengths[c];
            if(wordLength==0) {
                wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
                tokenLengths[c]=(int8_t)wordLength;
            }
        }
        total+=wordLength;
    }

    *pLine=cursor;
    return total;
}
1340
1341
static void
1342
0
calcGroupNameSetsLengths(int32_t maxNameLength) {
1343
0
    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1344
0
1345
0
    uint16_t *tokens=(uint16_t *)uCharNames+8;
1346
0
    uint16_t tokenCount=*tokens++;
1347
0
    uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1348
0
1349
0
    int8_t *tokenLengths;
1350
0
1351
0
    const uint16_t *group;
1352
0
    const uint8_t *s, *line, *lineLimit;
1353
0
1354
0
    int32_t groupCount, lineNumber, length;
1355
0
1356
0
    tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1357
0
    if(tokenLengths!=NULL) {
1358
0
        uprv_memset(tokenLengths, 0, tokenCount);
1359
0
    }
1360
0
1361
0
    group=GET_GROUPS(uCharNames);
1362
0
    groupCount=*group++;
1363
0
1364
0
    /* enumerate all groups */
1365
0
    while(groupCount>0) {
1366
0
        s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1367
0
        s=expandGroupLengths(s, offsets, lengths);
1368
0
1369
0
        /* enumerate all lines in each group */
1370
0
        for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1371
0
            line=s+offsets[lineNumber];
1372
0
            length=lengths[lineNumber];
1373
0
            if(length==0) {
1374
0
                continue;
1375
0
            }
1376
0
1377
0
            lineLimit=line+length;
1378
0
1379
0
            /* read regular name */
1380
0
            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1381
0
            if(length>maxNameLength) {
1382
0
                maxNameLength=length;
1383
0
            }
1384
0
            if(line==lineLimit) {
1385
0
                continue;
1386
0
            }
1387
0
1388
0
            /* read Unicode 1.0 name */
1389
0
            length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390
0
            if(length>maxNameLength) {
1391
0
                maxNameLength=length;
1392
0
            }
1393
0
            if(line==lineLimit) {
1394
0
                continue;
1395
0
            }
1396
0
1397
0
            /* read ISO comment */
1398
0
            /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1399
0
        }
1400
0
1401
0
        group=NEXT_GROUP(group);
1402
0
        --groupCount;
1403
0
    }
1404
0
1405
0
    if(tokenLengths!=NULL) {
1406
0
        uprv_free(tokenLengths);
1407
0
    }
1408
0
1409
0
    /* set gMax... - name length last for threading */
1410
0
    gMaxNameLength=maxNameLength;
1411
0
}
1412
1413
/*
 * Makes sure that gNameSet and gMaxNameLength have been computed.
 *
 * Returns TRUE if the values are available (already computed, or computed
 * now), FALSE if the names data could not be loaded.
 */
static UBool
calcNameSetsLengths(UErrorCode *pErrorCode) {
    static const char extChars[]="0123456789ABCDEF<>-";
    const char *p;
    int32_t maxNameLength;

    /* a non-zero maximum means that the work has been done already */
    if(gMaxNameLength!=0) {
        return TRUE;
    }

    if(!isDataLoaded(pErrorCode)) {
        return FALSE;
    }

    /* hex digits appear in various names, and <>- in extended names */
    for(p=extChars; *p!=0; ++p) {
        SET_ADD(gNameSet, *p);
    }

    /* algorithmic names first, then extended names on top of that */
    maxNameLength=calcAlgNameSetsLengths(0);
    maxNameLength=calcExtNameSetsLengths(maxNameLength);

    /* group names last; this call also stores the global maximum */
    calcGroupNameSetsLengths(maxNameLength);

    return TRUE;
}
1442
1443
U_NAMESPACE_END
1444
1445
/* public API --------------------------------------------------------------- */
1446
1447
U_NAMESPACE_USE
1448
1449
/*
 * Gets the name of a code point.
 *
 * code         code point to look up
 * nameChoice   which kind of name to return; must be below U_CHAR_NAME_CHOICE_COUNT
 * buffer       output buffer; may be NULL only if bufferLength is 0
 * bufferLength capacity of buffer; must not be negative
 * pErrorCode   in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for bad
 *              arguments, otherwise updated by u_terminateChars()
 *
 * Returns 0 on argument or prior errors, otherwise the result of
 * u_terminateChars() for the name length (0 if the code point is out of
 * range or the data cannot be loaded).
 */
U_CAPI int32_t U_EXPORT2
u_charName(UChar32 code, UCharNameChoice nameChoice,
           char *buffer, int32_t bufferLength,
           UErrorCode *pErrorCode) {
    AlgorithmicRange *algRange;
    uint32_t *p;
    uint32_t i;
    int32_t length;
    uint16_t capacity;

    /* check the argument values */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
              bufferLength<0 || (bufferLength>0 && buffer==NULL)
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
        return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
    }

    /*
     * The internal name writers take a 16-bit capacity.
     * Clamp instead of truncating: a plain (uint16_t) cast would turn a
     * capacity such as 0x10000 into 0, so that nothing would be written
     * while the caller is still told that the name fit into the buffer.
     */
    capacity=(uint16_t)(bufferLength>0xffff ? 0xffff : bufferLength);

    length=0;

    /* try algorithmic names first */
    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
    i=*p;
    algRange=(AlgorithmicRange *)(p+1);
    while(i>0) {
        if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
            length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, capacity);
            break;
        }
        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
        --i;
    }

    /* i==0 here means that no algorithmic range contains the code point */
    if(i==0) {
        if (nameChoice == U_EXTENDED_CHAR_NAME) {
            length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, capacity);
            if (!length) {
                /* extended character name */
                length = getExtName((uint32_t) code, buffer, capacity);
            }
        } else {
            /* normal character name */
            length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, capacity);
        }
    }

    return u_terminateChars(buffer, bufferLength, length, pErrorCode);
}
1502
1503
/*
 * Always produces an empty string; the code point argument is ignored.
 *
 * Returns 0 on prior errors and for bad arguments (which also set
 * U_ILLEGAL_ARGUMENT_ERROR), otherwise the result of u_terminateChars()
 * for length 0.
 */
U_CAPI int32_t U_EXPORT2
u_getISOComment(UChar32 /*c*/,
                char *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
    /* nothing to do without a usable error code */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* the destination must be usable: non-negative capacity, and a buffer if the capacity is not 0 */
    if(destCapacity>=0 && (destCapacity==0 || dest!=NULL)) {
        return u_terminateChars(dest, destCapacity, 0, pErrorCode);
    }

    *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
}
1517
1518
/*
 * Finds the code point that has the given name.
 *
 * nameChoice   which kind of name to look for; must be below U_CHAR_NAME_CHOICE_COUNT
 * name         NUL-terminated, non-empty name; it is case-folded before matching
 * pErrorCode   in/out error code: U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 *              U_ILLEGAL_CHAR_FOUND if no code point has this name
 *
 * Returns the code point, or 0xffff if there is none or an error occurred.
 */
U_CAPI UChar32 U_EXPORT2
u_charFromName(UCharNameChoice nameChoice,
               const char *name,
               UErrorCode *pErrorCode) {
    char upper[120], lower[120];
    FindName findName;
    AlgorithmicRange *algRange;
    uint32_t *p;
    uint32_t i;
    UChar32 cp = 0;
    char c0;
    UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return error;
    }

    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return error;
    }

    if(!isDataLoaded(pErrorCode)) {
        return error;
    }

    /* construct the uppercase and lowercase of the name first */
    for(i=0; i<sizeof(upper); ++i) {
        if((c0=*name++)!=0) {
            upper[i]=uprv_toupper(c0);
            lower[i]=uprv_tolower(c0);
        } else {
            upper[i]=lower[i]=0;
            break;
        }
    }
    if(i==sizeof(upper)) {
        /* name too long, there is no such character */
        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
        return error;
    }
    // i==strlen(name)==strlen(lower)==strlen(upper)

    /* try extended names first */
    if (lower[0] == '<') {
        if (nameChoice == U_EXTENDED_CHAR_NAME && lower[--i] == '>') {
            // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
            uint32_t limit = i;     /* index of the closing '>' */
            int32_t hexLength;

            /* search backwards for the '-' between category and digits */
            while (i >= 3 && lower[--i] != '-') {}

            /*
             * Require 1 to 8 hex digits.
             * Without an upper bound, a long digit string would overflow
             * the accumulator and yield an arbitrary code point.
             */
            hexLength = (int32_t)(limit - (i + 1));
            if (i >= 2 && lower[i] == '-' && 1 <= hexLength && hexLength <= 8) {
                uint32_t value = 0;     /* unsigned: 8 digits must not overflow a signed shift */
                uint32_t cIdx;

                /* terminate the category name at the '-' */
                lower[i] = 0;

                /* every char up to the closing '>' must be a hex digit */
                for (++i; i < limit; ++i) {
                    if (lower[i] >= '0' && lower[i] <= '9') {
                        value = (value << 4) + (uint32_t)(lower[i] - '0');
                    } else if (lower[i] >= 'a' && lower[i] <= 'f') {
                        value = (value << 4) + (uint32_t)(lower[i] - 'a' + 10);
                    } else {
                        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
                        return error;
                    }
                }
                cp = (UChar32)value;

                /* Now validate the category name.
                   We could use a binary search, or a trie, if
                   we really wanted to. */
                lower[limit] = 0;
                for (cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {

                    if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
                        /* the name is valid only if the code point really has this category */
                        if (getCharCat(cp) == cIdx) {
                            return cp;
                        }
                        break;
                    }
                }
            }
        }

        *pErrorCode = U_ILLEGAL_CHAR_FOUND;
        return error;
    }

    /* try algorithmic names now */
    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
    i=*p;
    algRange=(AlgorithmicRange *)(p+1);
    while(i>0) {
        if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
            return cp;
        }
        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
        --i;
    }

    /* normal character name */
    findName.otherName=upper;
    findName.code=error;
    enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
    if (findName.code == error) {
         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
    }
    return findName.code;
}
1626
1627
/*
 * Calls fn for the character names of the code points in [start, limit).
 *
 * start, limit  code point range; limit is capped at UCHAR_MAX_VALUE+1
 * fn            callback; enumeration stops when a helper reports FALSE
 * context       passed through to fn
 * nameChoice    which kind of name to enumerate; must be below U_CHAR_NAME_CHOICE_COUNT
 * pErrorCode    in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for bad arguments
 *
 * Data-driven names and algorithmic names are interleaved so that code
 * points are visited in ascending order, assuming that the algorithmic
 * ranges are stored in ascending order.
 */
U_CAPI void U_EXPORT2
u_enumCharNames(UChar32 start, UChar32 limit,
                UEnumCharNamesFn *fn,
                void *context,
                UCharNameChoice nameChoice,
                UErrorCode *pErrorCode) {
    AlgorithmicRange *algRange;
    uint32_t *p;
    uint32_t rangesLeft;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    /* cap the limit, then bail out on an empty range */
    if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
        limit = UCHAR_MAX_VALUE + 1;
    }
    if((uint32_t)start>=(uint32_t)limit) {
        return;
    }

    if(!isDataLoaded(pErrorCode)) {
        return;
    }

    /* the first word is the number of algorithmic ranges; the records follow it */
    p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
    algRange=(AlgorithmicRange *)(p+1);
    for(rangesLeft=*p;
        rangesLeft>0;
        algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size), --rangesLeft
    ) {
        /* here: start<limit */

        /* data-driven names in front of this algorithmic range */
        if((uint32_t)start<algRange->start) {
            if((uint32_t)limit<=algRange->start) {
                /* the request ends before this range: finish with the data-driven names */
                enumNames(uCharNames, start, limit, fn, context, nameChoice);
                return;
            }
            if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
                return;
            }
            start=(UChar32)algRange->start;
        }

        /* here: algRange->start<=start<limit */
        if((uint32_t)start>algRange->end) {
            /* this range lies entirely before start */
            continue;
        }

        /* algorithmic names inside this range */
        if((uint32_t)limit<=(algRange->end+1)) {
            /* the request ends inside this range */
            enumAlgNames(algRange, start, limit, fn, context, nameChoice);
            return;
        }
        if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
            return;
        }
        start=(UChar32)algRange->end+1;
        /* on to the next algorithmic range (here: start<limit) */
    }

    /* data-driven names behind the last algorithmic range */
    enumNames(uCharNames, start, limit, fn, context, nameChoice);
}
1694
1695
/*
 * Returns the maximum character name length stored in gMaxNameLength,
 * computing it first if necessary; returns 0 if it cannot be computed.
 */
U_CAPI int32_t U_EXPORT2
uprv_getMaxCharNameLength() {
    UErrorCode errorCode=U_ZERO_ERROR;

    if(!calcNameSetsLengths(&errorCode)) {
        return 0;
    }
    return gMaxNameLength;
}
1704
1705
/**
1706
 * Converts the char set cset into a Unicode set uset.
1707
 * @param cset Set of 256 bit flags corresponding to a set of chars.
1708
 * @param uset USet to receive characters. Existing contents are deleted.
1709
 */
1710
static void
1711
0
charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1712
0
    UChar us[256];
1713
0
    char cs[256];
1714
0
1715
0
    int32_t i, length;
1716
0
    UErrorCode errorCode;
1717
0
1718
0
    errorCode=U_ZERO_ERROR;
1719
0
1720
0
    if(!calcNameSetsLengths(&errorCode)) {
1721
0
        return;
1722
0
    }
1723
0
1724
0
    /* build a char string with all chars that are used in character names */
1725
0
    length=0;
1726
0
    for(i=0; i<256; ++i) {
1727
0
        if(SET_CONTAINS(cset, i)) {
1728
0
            cs[length++]=(char)i;
1729
0
        }
1730
0
    }
1731
0
1732
0
    /* convert the char string to a UChar string */
1733
0
    u_charsToUChars(cs, us, length);
1734
0
1735
0
    /* add each UChar to the USet */
1736
0
    for(i=0; i<length; ++i) {
1737
0
        if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1738
0
            sa->add(sa->set, us[i]);
1739
0
        }
1740
0
    }
1741
0
}
1742
1743
/**
1744
 * Fills set with characters that are used in Unicode character names.
1745
 * @param set USet to receive characters.
1746
 */
1747
U_CAPI void U_EXPORT2
1748
0
uprv_getCharNameCharacters(const USetAdder *sa) {
1749
0
    charSetToUSet(gNameSet, sa);
1750
0
}
1751
1752
/* data swapping ------------------------------------------------------------ */
1753
1754
/*
1755
 * The token table contains non-negative entries for token bytes,
1756
 * and -1 for bytes that represent themselves in the data file's charset.
1757
 * -2 entries are used for lead bytes.
1758
 *
1759
 * Direct bytes (-1 entries) must be translated from the input charset family
1760
 * to the output charset family.
1761
 * makeTokenMap() writes a permutation mapping for this.
1762
 * Use it once for single-/lead-byte tokens and once more for all trail byte
1763
 * tokens. (';' is an unused trail byte marked with -1.)
1764
 */
1765
static void
1766
makeTokenMap(const UDataSwapper *ds,
1767
             int16_t tokens[], uint16_t tokenCount,
1768
             uint8_t map[256],
1769
0
             UErrorCode *pErrorCode) {
1770
0
    UBool usedOutChar[256];
1771
0
    uint16_t i, j;
1772
0
    uint8_t c1, c2;
1773
0
1774
0
    if(U_FAILURE(*pErrorCode)) {
1775
0
        return;
1776
0
    }
1777
0
1778
0
    if(ds->inCharset==ds->outCharset) {
1779
0
        /* Same charset family: identity permutation */
1780
0
        for(i=0; i<256; ++i) {
1781
0
            map[i]=(uint8_t)i;
1782
0
        }
1783
0
    } else {
1784
0
        uprv_memset(map, 0, 256);
1785
0
        uprv_memset(usedOutChar, 0, 256);
1786
0
1787
0
        if(tokenCount>256) {
1788
0
            tokenCount=256;
1789
0
        }
1790
0
1791
0
        /* set the direct bytes (byte 0 always maps to itself) */
1792
0
        for(i=1; i<tokenCount; ++i) {
1793
0
            if(tokens[i]==-1) {
1794
0
                /* convert the direct byte character */
1795
0
                c1=(uint8_t)i;
1796
0
                ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1797
0
                if(U_FAILURE(*pErrorCode)) {
1798
0
                    udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1799
0
                                     i, ds->inCharset);
1800
0
                    return;
1801
0
                }
1802
0
1803
0
                /* enter the converted character into the map and mark it used */
1804
0
                map[c1]=c2;
1805
0
                usedOutChar[c2]=TRUE;
1806
0
            }
1807
0
        }
1808
0
1809
0
        /* set the mappings for the rest of the permutation */
1810
0
        for(i=j=1; i<tokenCount; ++i) {
1811
0
            /* set mappings that were not set for direct bytes */
1812
0
            if(map[i]==0) {
1813
0
                /* set an output byte value that was not used as an output byte above */
1814
0
                while(usedOutChar[j]) {
1815
0
                    ++j;
1816
0
                }
1817
0
                map[i]=(uint8_t)j++;
1818
0
            }
1819
0
        }
1820
0
1821
0
        /*
1822
0
         * leave mappings at tokenCount and above unset if tokenCount<256
1823
0
         * because they won't be used
1824
0
         */
1825
0
    }
1826
0
}
1827
1828
/**
 * Swaps a unames.icu data block between byte orders and/or charset families.
 *
 * @param ds         Swapper that supplies the read/swap functions and the
 *                   input and output charset families.
 * @param inData     Input data, beginning with the data header.
 * @param length     Number of input bytes including the header, or a negative
 *                   value for preflighting; in that case this function only
 *                   walks the algorithmic ranges to compute the size.
 * @param outData    Output buffer; may be the same memory as inData
 *                   (in-place swapping is supported).
 * @param pErrorCode In/out error code. Set to U_UNSUPPORTED_ERROR for an
 *                   unrecognized format or range type,
 *                   U_INDEX_OUTOFBOUNDS_ERROR when the input is too short,
 *                   U_INVALID_FORMAT_ERROR for an impossible range size,
 *                   U_MEMORY_ALLOCATION_ERROR when the temporary token
 *                   array cannot be allocated.
 * @return The header size plus the offset just past the last algorithmic
 *         range, or 0 on failure.
 */
U_CAPI int32_t U_EXPORT2
uchar_swapNames(const UDataSwapper *ds,
                const void *inData, int32_t length, void *outData,
                UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;

    const uint8_t *inBytes;
    uint8_t *outBytes;

    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
             offset, i, count, stringsCount;

    const AlgorithmicRange *inRange;
    AlgorithmicRange *outRange;

    /* udata_swapDataHeader checks the arguments */
    headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* check data format and format version */
    pInfo=(const UDataInfo *)((const char *)inData+4);
    if(!(
        pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
        pInfo->dataFormat[1]==0x6e &&
        pInfo->dataFormat[2]==0x61 &&
        pInfo->dataFormat[3]==0x6d &&
        pInfo->formatVersion[0]==1
    )) {
        udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    inBytes=(const uint8_t *)inData+headerSize;
    outBytes=(uint8_t *)outData+headerSize;
    if(length<0) {
        algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
    } else {
        length-=headerSize;
        /*
         * need the 4 offsets plus the token count (20 bytes), and the
         * 4-byte range count at algNamesOffset must lie inside the input
         */
        if( length<20 ||
            (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3])) ||
            ((uint32_t)length-algNamesOffset)<4
        ) {
            udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    if(length<0) {
        /* preflighting: iterate through algorithmic ranges */
        offset=algNamesOffset;
        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
        offset+=4;

        for(i=0; i<count; ++i) {
            inRange=(const AlgorithmicRange *)(inBytes+offset);
            offset+=ds->readUInt16(inRange->size);
        }
    } else {
        /* swap data */
        const uint16_t *p;
        uint16_t *q, *temp;

        int16_t tokens[512];
        uint16_t tokenCount;
        uint32_t tempCapacity;

        uint8_t map[256], trailMap[256];

        /* copy the data for inaccessible bytes */
        if(inBytes!=outBytes) {
            uprv_memcpy(outBytes, inBytes, length);
        }

        /* the initial 4 offsets first */
        tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
        groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
        groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
        ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);

        /*
         * now the tokens table
         * it needs to be permutated along with the compressed name strings
         */
        p=(const uint16_t *)(inBytes+16);
        q=(uint16_t *)(outBytes+16);

        /* read and swap the tokenCount */
        tokenCount=ds->readUInt16(*p);
        ds->swapArray16(ds, p, 2, q, pErrorCode);
        ++p;
        ++q;

        /* read the first 512 tokens and make the token maps */
        if(tokenCount<=512) {
            count=tokenCount;
        } else {
            count=512;
        }
        for(i=0; i<count; ++i) {
            tokens[i]=udata_readInt16(ds, p[i]);
        }
        for(; i<512; ++i) {
            tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
        }
        makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
        makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }

        /*
         * swap and permutate the tokens
         * go through a temporary array to support in-place swapping
         *
         * map[] and trailMap[] can send an index to any of the 256 slots of
         * its page, so the temporary array is sized in whole 256-entry pages
         * rather than tokenCount entries; otherwise a permuted index could
         * land beyond the allocation. It is zeroed so that slots which are
         * never written do not carry uninitialized memory into the output.
         */
        tempCapacity=((uint32_t)tokenCount+0xff)&~(uint32_t)0xff;
        temp=(uint16_t *)uprv_malloc(tempCapacity*2);
        if(temp==NULL) {
            udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
                             tokenCount);
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        uprv_memset(temp, 0, tempCapacity*2);

        /* swap and permutate single-/lead-byte tokens */
        for(i=0; i<tokenCount && i<256; ++i) {
            ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
        }

        /* swap and permutate trail-byte tokens */
        for(; i<tokenCount; ++i) {
            ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
        }

        /* copy the result into the output and free the temporary array */
        uprv_memcpy(q, temp, tokenCount*2);
        uprv_free(temp);

        /*
         * swap the token strings but not a possible padding byte after
         * the terminating NUL of the last string
         */
        udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
                                    outBytes+tokenStringOffset, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            udata_printError(ds, "uchar_swapNames(token strings) failed\n");
            return 0;
        }

        /* swap the group table */
        count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
        ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
                           outBytes+groupsOffset, pErrorCode);

        /*
         * swap the group strings
         * swap the string bytes but not the nibble-encoded string lengths
         */
        if(ds->inCharset!=ds->outCharset) {
            uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];

            const uint8_t *inStrings, *nextInStrings;
            uint8_t *outStrings;

            uint8_t c;

            inStrings=inBytes+groupStringOffset;
            outStrings=outBytes+groupStringOffset;

            stringsCount=algNamesOffset-groupStringOffset;

            /* iterate through string groups until only a few padding bytes are left */
            while(stringsCount>32) {
                nextInStrings=expandGroupLengths(inStrings, offsets, lengths);

                /* move past the length bytes */
                stringsCount-=(uint32_t)(nextInStrings-inStrings);
                outStrings+=nextInStrings-inStrings;
                inStrings=nextInStrings;

                count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
                stringsCount-=count;

                /* swap the string bytes using map[] and trailMap[] */
                while(count>0) {
                    c=*inStrings++;
                    *outStrings++=map[c];
                    if(tokens[c]!=-2) {
                        --count;
                    } else {
                        /* token lead byte: swap the trail byte, too */
                        *outStrings++=trailMap[*inStrings++];
                        count-=2;
                    }
                }
            }
        }

        /* swap the algorithmic ranges */
        offset=algNamesOffset;
        count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
        ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
        offset+=4;

        for(i=0; i<count; ++i) {
            uint32_t rangeSize;

            /* the fixed-size range header must lie entirely inside the input */
            if( offset>(uint32_t)length ||
                ((uint32_t)length-offset)<(uint32_t)sizeof(AlgorithmicRange)
            ) {
                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
                                 length, i);
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }

            inRange=(const AlgorithmicRange *)(inBytes+offset);
            outRange=(AlgorithmicRange *)(outBytes+offset);

            /* read the size before swapping: inRange and outRange may alias */
            rangeSize=ds->readUInt16(inRange->size);
            if(rangeSize<(uint32_t)sizeof(AlgorithmicRange)) {
                /* the size covers the header, so anything smaller is corrupt */
                udata_printError(ds, "uchar_swapNames(): invalid size %u of algorithmic range %u\n",
                                 rangeSize, i);
                *pErrorCode=U_INVALID_FORMAT_ERROR;
                return 0;
            }
            if(rangeSize>((uint32_t)length-offset)) {
                /* the range body would extend past the end of the input */
                udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
                                 length, i);
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            offset+=rangeSize;

            ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
            ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
            switch(inRange->type) {
            case 0:
                /* swap prefix string */
                ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
                                    outRange+1, pErrorCode);
                if(U_FAILURE(*pErrorCode)) {
                    udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
                                     i);
                    return 0;
                }
                break;
            case 1:
                {
                    /* swap factors and the prefix and factor strings */
                    uint32_t factorsCount;

                    factorsCount=inRange->variant;
                    p=(const uint16_t *)(inRange+1);
                    q=(uint16_t *)(outRange+1);
                    ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);

                    /* swap the strings, up to the last terminating NUL */
                    p+=factorsCount;
                    q+=factorsCount;
                    stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
                    while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
                        --stringsCount;
                    }
                    ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
                }
                break;
            default:
                udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
                                 inRange->type, i);
                *pErrorCode=U_UNSUPPORTED_ERROR;
                return 0;
            }
        }
    }

    return headerSize+(int32_t)offset;
}
2093
2094
/*
2095
 * Hey, Emacs, please set the following:
2096
 *
2097
 * Local Variables:
2098
 * indent-tabs-mode: nil
2099
 * End:
2100
 *
2101
 */