Coverage Report

Created: 2026-01-25 06:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/utrie2_builder.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2001-2014, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  utrie2_builder.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2008sep26 (split off from utrie2.c)
16
*   created by: Markus W. Scherer
17
*
18
*   This is a common implementation of a Unicode trie.
19
*   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
20
*   Unicode code points (0..0x10ffff).
21
*   This is the second common version of a Unicode trie (hence the name UTrie2).
22
*   See utrie2.h for a comparison.
23
*
24
*   This file contains only the builder code.
25
*   See utrie2.c for the runtime and enumeration code.
26
*/
27
// #define UTRIE2_DEBUG
28
#ifdef UTRIE2_DEBUG
29
#   include <stdio.h>
30
#endif
31
// #define UCPTRIE_DEBUG
32
33
#include "unicode/utypes.h"
34
#ifdef UCPTRIE_DEBUG
35
#include "unicode/ucptrie.h"
36
#include "unicode/umutablecptrie.h"
37
#include "ucptrie_impl.h"
38
#endif
39
#include "cmemory.h"
40
#include "utrie2.h"
41
#include "utrie2_impl.h"
42
43
#include "utrie.h"  // for utrie2_fromUTrie()
44
45
/* Implementation notes ----------------------------------------------------- */
46
47
/*
48
 * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
49
 * have been chosen to minimize trie sizes overall.
50
 * Most of the code is flexible enough to work with a range of values,
51
 * within certain limits.
52
 *
53
 * Exception: Support for separate values for lead surrogate code _units_
54
 * vs. code _points_ was added after the constants were fixed,
55
 * and has not been tested nor particularly designed for different constant values.
56
 * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
57
 * part and back.)
58
 *
59
 * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
60
 * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
61
 * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
62
 * remapping) stops working.
63
 *
64
 * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
65
 * assumes that a single index-2 block is used for 0x400 code points
66
 * corresponding to one lead surrogate.
67
 *
68
 * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
69
 * more than one Unicode plane, and the split of the index-2 table into a BMP
70
 * part and a supplementary part, with a gap in between, would not work.
71
 *
72
 * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
73
 * there is data with more than 64k distinct values,
74
 * for example for Unihan collation with a separate collation weight per
75
 * Han character.
76
 */
77
78
/* Building a trie ----------------------------------------------------------*/
79
80
enum {
81
    /** The null index-2 block, following the gap in the index-2 table. */
82
    UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
83
84
    /** The start of allocated index-2 blocks. */
85
    UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
86
87
    /**
88
     * The null data block.
89
     * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
90
     * to work with 6-bit trail bytes from 2-byte UTF-8.
91
     */
92
    UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
93
94
    /** The start of allocated data blocks. */
95
    UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
96
97
    /**
98
     * The start of data blocks for U+0800 and above.
99
     * Below, compaction uses a block length of 64 for 2-byte UTF-8.
100
     * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
101
     * Data values for 0x780 code points beyond ASCII.
102
     */
103
    UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
104
};
105
106
/* Start with allocation of 16k data entries. */
107
22.2k
#define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
108
109
/* Grow about 8x each time. */
110
323
#define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
111
112
static int32_t
113
allocIndex2Block(UNewTrie2 *trie);
114
115
U_CAPI UTrie2 * U_EXPORT2
116
11.1k
utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
117
11.1k
    UTrie2 *trie;
118
11.1k
    UNewTrie2 *newTrie;
119
11.1k
    uint32_t *data;
120
11.1k
    int32_t i, j;
121
122
11.1k
    if(U_FAILURE(*pErrorCode)) {
123
0
        return nullptr;
124
0
    }
125
126
11.1k
    trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
127
11.1k
    newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
128
11.1k
    data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
129
11.1k
    if(trie==nullptr || newTrie==nullptr || data==nullptr) {
130
0
        uprv_free(trie);
131
0
        uprv_free(newTrie);
132
0
        uprv_free(data);
133
0
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
134
0
        return nullptr;
135
0
    }
136
137
11.1k
    uprv_memset(trie, 0, sizeof(UTrie2));
138
11.1k
    trie->initialValue=initialValue;
139
11.1k
    trie->errorValue=errorValue;
140
11.1k
    trie->highStart=0x110000;
141
11.1k
    trie->newTrie=newTrie;
142
#ifdef UTRIE2_DEBUG
143
    trie->name="open";
144
#endif
145
146
11.1k
    newTrie->data=data;
147
#ifdef UCPTRIE_DEBUG
148
    newTrie->t3=umutablecptrie_open(initialValue, errorValue, pErrorCode);
149
#endif
150
11.1k
    newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
151
11.1k
    newTrie->initialValue=initialValue;
152
11.1k
    newTrie->errorValue=errorValue;
153
11.1k
    newTrie->highStart=0x110000;
154
11.1k
    newTrie->firstFreeBlock=0;  /* no free block in the list */
155
11.1k
    newTrie->isCompacted=false;
156
157
    /*
158
     * preallocate and reset
159
     * - ASCII
160
     * - the bad-UTF-8-data block
161
     * - the null data block
162
     */
163
1.43M
    for(i=0; i<0x80; ++i) {
164
1.42M
        newTrie->data[i]=initialValue;
165
1.42M
    }
166
721k
    for(; i<0xc0; ++i) {
167
710k
        newTrie->data[i]=errorValue;
168
710k
    }
169
721k
    for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
170
710k
        newTrie->data[i]=initialValue;
171
710k
    }
172
11.1k
    newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
173
11.1k
    newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
174
175
    /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
176
55.5k
    for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
177
44.4k
        newTrie->index2[i]=j;
178
44.4k
        newTrie->map[i]=1;
179
44.4k
    }
180
    /* reference counts for the bad-UTF-8-data block */
181
33.3k
    for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
182
22.2k
        newTrie->map[i]=0;
183
22.2k
    }
184
    /*
185
     * Reference counts for the null data block: all blocks except for the ASCII blocks.
186
     * Plus 1 so that we don't drop this block during compaction.
187
     * Plus as many as needed for lead surrogate code points.
188
     */
189
    /* i==newTrie->dataNullOffset */
190
11.1k
    newTrie->map[i++]=
191
11.1k
        (0x110000>>UTRIE2_SHIFT_2)-
192
11.1k
        (0x80>>UTRIE2_SHIFT_2)+
193
11.1k
        1+
194
11.1k
        UTRIE2_LSCP_INDEX_2_LENGTH;
195
11.1k
    j+=UTRIE2_DATA_BLOCK_LENGTH;
196
22.2k
    for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
197
11.1k
        newTrie->map[i]=0;
198
11.1k
    }
199
200
    /*
201
     * set the remaining indexes in the BMP index-2 block
202
     * to the null data block
203
     */
204
23.0M
    for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
205
23.0M
        newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
206
23.0M
    }
207
208
    /*
209
     * Fill the index gap with impossible values so that compaction
210
     * does not overlap other index-2 blocks with the gap.
211
     */
212
6.40M
    for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
213
6.39M
        newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
214
6.39M
    }
215
216
    /* set the indexes in the null index-2 block */
217
721k
    for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
218
710k
        newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
219
710k
    }
220
11.1k
    newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
221
11.1k
    newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
222
223
    /* set the index-1 indexes for the linear index-2 block */
224
11.1k
    for(i=0, j=0;
225
366k
        i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
226
355k
        ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
227
355k
    ) {
228
355k
        newTrie->index1[i]=j;
229
355k
    }
230
231
    /* set the remaining index-1 indexes to the null index-2 block */
232
5.69M
    for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
233
5.68M
        newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
234
5.68M
    }
235
236
    /*
237
     * Preallocate and reset data for U+0080..U+07ff,
238
     * for 2-byte UTF-8 which will be compacted in 64-blocks
239
     * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
240
     */
241
677k
    for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
242
666k
        utrie2_set32(trie, i, initialValue, pErrorCode);
243
666k
    }
244
245
11.1k
    return trie;
246
11.1k
}
247
248
static UNewTrie2 *
249
0
cloneBuilder(const UNewTrie2 *other) {
250
0
    UNewTrie2 *trie;
251
252
0
    trie = static_cast<UNewTrie2*>(uprv_malloc(sizeof(UNewTrie2)));
253
0
    if(trie==nullptr) {
254
0
        return nullptr;
255
0
    }
256
257
0
    trie->data = static_cast<uint32_t*>(uprv_malloc(other->dataCapacity * 4));
258
0
    if(trie->data==nullptr) {
259
0
        uprv_free(trie);
260
0
        return nullptr;
261
0
    }
262
#ifdef UCPTRIE_DEBUG
263
    if(other->t3==nullptr) {
264
        trie->t3=nullptr;
265
    } else {
266
        UErrorCode errorCode=U_ZERO_ERROR;
267
        trie->t3=umutablecptrie_clone(other->t3, &errorCode);
268
    }
269
#endif
270
0
    trie->dataCapacity=other->dataCapacity;
271
272
    /* clone data */
273
0
    uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
274
0
    uprv_memcpy(trie->index2, other->index2, (size_t)other->index2Length*4);
275
0
    trie->index2NullOffset=other->index2NullOffset;
276
0
    trie->index2Length=other->index2Length;
277
278
0
    uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);
279
0
    trie->dataNullOffset=other->dataNullOffset;
280
0
    trie->dataLength=other->dataLength;
281
282
    /* reference counters */
283
0
    if(other->isCompacted) {
284
0
        trie->firstFreeBlock=0;
285
0
    } else {
286
0
        uprv_memcpy(trie->map, other->map, ((size_t)other->dataLength>>UTRIE2_SHIFT_2)*4);
287
0
        trie->firstFreeBlock=other->firstFreeBlock;
288
0
    }
289
290
0
    trie->initialValue=other->initialValue;
291
0
    trie->errorValue=other->errorValue;
292
0
    trie->highStart=other->highStart;
293
0
    trie->isCompacted=other->isCompacted;
294
295
0
    return trie;
296
0
}
297
298
U_CAPI UTrie2 * U_EXPORT2
299
0
utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
300
0
    UTrie2 *trie;
301
302
0
    if(U_FAILURE(*pErrorCode)) {
303
0
        return nullptr;
304
0
    }
305
0
    if(other==nullptr || (other->memory==nullptr && other->newTrie==nullptr)) {
306
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
307
0
        return nullptr;
308
0
    }
309
310
0
    trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
311
0
    if(trie==nullptr) {
312
0
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
313
0
        return nullptr;
314
0
    }
315
0
    uprv_memcpy(trie, other, sizeof(UTrie2));
316
317
0
    if(other->memory!=nullptr) {
318
0
        trie->memory=uprv_malloc(other->length);
319
0
        if(trie->memory!=nullptr) {
320
0
            trie->isMemoryOwned=true;
321
0
            uprv_memcpy(trie->memory, other->memory, other->length);
322
323
            /* make the clone's pointers point to its own memory */
324
0
            trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
325
0
            if(other->data16!=nullptr) {
326
0
                trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
327
0
            }
328
0
            if(other->data32!=nullptr) {
329
0
                trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
330
0
            }
331
0
        }
332
0
    } else /* other->newTrie!=nullptr */ {
333
0
        trie->newTrie=cloneBuilder(other->newTrie);
334
0
    }
335
336
0
    if(trie->memory==nullptr && trie->newTrie==nullptr) {
337
0
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
338
0
        uprv_free(trie);
339
0
        trie=nullptr;
340
0
    }
341
0
    return trie;
342
0
}
343
344
typedef struct NewTrieAndStatus {
345
    UTrie2 *trie;
346
    UErrorCode errorCode;
347
    UBool exclusiveLimit;  /* rather than inclusive range end */
348
} NewTrieAndStatus;
349
350
static UBool U_CALLCONV
351
0
copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
352
0
    NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
353
0
    if(value!=nt->trie->initialValue) {
354
0
        if(nt->exclusiveLimit) {
355
0
            --end;
356
0
        }
357
0
        if(start==end) {
358
0
            utrie2_set32(nt->trie, start, value, &nt->errorCode);
359
0
        } else {
360
0
            utrie2_setRange32(nt->trie, start, end, value, true, &nt->errorCode);
361
0
        }
362
0
        return U_SUCCESS(nt->errorCode);
363
0
    } else {
364
0
        return true;
365
0
    }
366
0
}
367
368
#ifdef UTRIE2_DEBUG
369
static long countInitial(const UTrie2 *trie) {
370
    uint32_t initialValue=trie->initialValue;
371
    int32_t length=trie->dataLength;
372
    long count=0;
373
    if(trie->data16!=nullptr) {
374
        for(int32_t i=0; i<length; ++i) {
375
            if(trie->data16[i]==initialValue) { ++count; }
376
        }
377
    } else {
378
        for(int32_t i=0; i<length; ++i) {
379
            if(trie->data32[i]==initialValue) { ++count; }
380
        }
381
    }
382
    return count;
383
}
384
385
static void
386
utrie_printLengths(const UTrie *trie) {
387
    long indexLength=trie->indexLength;
388
    long dataLength=(long)trie->dataLength;
389
    long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=nullptr ? 4 : 2);
390
    printf("**UTrieLengths** index:%6ld  data:%6ld  serialized:%6ld\n",
391
           indexLength, dataLength, totalLength);
392
}
393
394
static void
395
utrie2_printLengths(const UTrie2 *trie, const char *which) {
396
    long indexLength=trie->indexLength;
397
    long dataLength=(long)trie->dataLength;
398
    long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=nullptr ? 4 : 2);
399
    printf("**UTrie2Lengths(%s %s)** index:%6ld  data:%6ld  countInitial:%6ld  serialized:%6ld\n",
400
           which, trie->name, indexLength, dataLength, countInitial(trie), totalLength);
401
}
402
#endif
403
404
U_CAPI UTrie2 * U_EXPORT2
405
0
utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
406
0
    NewTrieAndStatus context;
407
0
    char16_t lead;
408
409
0
    if(U_FAILURE(*pErrorCode)) {
410
0
        return nullptr;
411
0
    }
412
0
    if(other==nullptr || (other->memory==nullptr && other->newTrie==nullptr)) {
413
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
414
0
        return nullptr;
415
0
    }
416
0
    if(other->newTrie!=nullptr && !other->newTrie->isCompacted) {
417
0
        return utrie2_clone(other, pErrorCode);  /* clone an unfrozen trie */
418
0
    }
419
420
    /* Clone the frozen trie by enumerating it and building a new one. */
421
0
    context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
422
0
    if(U_FAILURE(*pErrorCode)) {
423
0
        return nullptr;
424
0
    }
425
0
    context.exclusiveLimit=false;
426
0
    context.errorCode=*pErrorCode;
427
0
    utrie2_enum(other, nullptr, copyEnumRange, &context);
428
0
    *pErrorCode=context.errorCode;
429
0
    for(lead=0xd800; lead<0xdc00; ++lead) {
430
0
        uint32_t value;
431
0
        if(other->data32==nullptr) {
432
0
            value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
433
0
        } else {
434
0
            value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
435
0
        }
436
0
        if(value!=other->initialValue) {
437
0
            utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
438
0
        }
439
0
    }
440
0
    if(U_FAILURE(*pErrorCode)) {
441
0
        utrie2_close(context.trie);
442
0
        context.trie=nullptr;
443
0
    }
444
0
    return context.trie;
445
0
}
446
447
/* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
448
U_CAPI UTrie2 * U_EXPORT2
449
0
utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
450
0
    NewTrieAndStatus context;
451
0
    char16_t lead;
452
453
0
    if(U_FAILURE(*pErrorCode)) {
454
0
        return nullptr;
455
0
    }
456
0
    if(trie1==nullptr) {
457
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
458
0
        return nullptr;
459
0
    }
460
0
    context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
461
0
    if(U_FAILURE(*pErrorCode)) {
462
0
        return nullptr;
463
0
    }
464
0
    context.exclusiveLimit=true;
465
0
    context.errorCode=*pErrorCode;
466
0
    utrie_enum(trie1, nullptr, copyEnumRange, &context);
467
0
    *pErrorCode=context.errorCode;
468
0
    for(lead=0xd800; lead<0xdc00; ++lead) {
469
0
        uint32_t value;
470
0
        if(trie1->data32==nullptr) {
471
0
            value=UTRIE_GET16_FROM_LEAD(trie1, lead);
472
0
        } else {
473
0
            value=UTRIE_GET32_FROM_LEAD(trie1, lead);
474
0
        }
475
0
        if(value!=trie1->initialValue) {
476
0
            utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
477
0
        }
478
0
    }
479
0
    if(U_SUCCESS(*pErrorCode)) {
480
0
        utrie2_freeze(context.trie,
481
0
                      trie1->data32!=nullptr ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
482
0
                      pErrorCode);
483
0
    }
484
#ifdef UTRIE2_DEBUG
485
    if(U_SUCCESS(*pErrorCode)) {
486
        utrie_printLengths(trie1);
487
        utrie2_printLengths(context.trie, "fromUTrie");
488
    }
489
#endif
490
0
    if(U_FAILURE(*pErrorCode)) {
491
0
        utrie2_close(context.trie);
492
0
        context.trie=nullptr;
493
0
    }
494
0
    return context.trie;
495
0
}
496
497
static inline UBool
498
113M
isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
499
113M
    int32_t i2, block;
500
501
113M
    if(U_IS_LEAD(c) && forLSCP) {
502
0
        i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
503
0
            (c>>UTRIE2_SHIFT_2);
504
113M
    } else {
505
113M
        i2=trie->index1[c>>UTRIE2_SHIFT_1]+
506
113M
            ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
507
113M
    }
508
113M
    block=trie->index2[i2];
509
113M
    return block == trie->dataNullOffset;
510
113M
}
511
512
static int32_t
513
3.67k
allocIndex2Block(UNewTrie2 *trie) {
514
3.67k
    int32_t newBlock, newTop;
515
516
3.67k
    newBlock=trie->index2Length;
517
3.67k
    newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
518
3.67k
    if(newTop>UPRV_LENGTHOF(trie->index2)) {
519
        /*
520
         * Should never occur.
521
         * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
522
         * or the code writes more values than should be possible.
523
         */
524
0
        return -1;
525
0
    }
526
3.67k
    trie->index2Length=newTop;
527
3.67k
    uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
528
3.67k
    return newBlock;
529
3.67k
}
530
531
static int32_t
532
21.8M
getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
533
21.8M
    int32_t i1, i2;
534
535
21.8M
    if(U_IS_LEAD(c) && forLSCP) {
536
9.56k
        return UTRIE2_LSCP_INDEX_2_OFFSET;
537
9.56k
    }
538
539
21.8M
    i1=c>>UTRIE2_SHIFT_1;
540
21.8M
    i2=trie->index1[i1];
541
21.8M
    if(i2==trie->index2NullOffset) {
542
3.67k
        i2=allocIndex2Block(trie);
543
3.67k
        if(i2<0) {
544
0
            return -1;  /* program error */
545
0
        }
546
3.67k
        trie->index1[i1]=i2;
547
3.67k
    }
548
21.8M
    return i2;
549
21.8M
}
550
551
static int32_t
552
1.21M
allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
553
1.21M
    int32_t newBlock, newTop;
554
555
1.21M
    if(trie->firstFreeBlock!=0) {
556
        /* get the first free block */
557
3.46k
        newBlock=trie->firstFreeBlock;
558
3.46k
        trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
559
1.20M
    } else {
560
        /* get a new block from the high end */
561
1.20M
        newBlock=trie->dataLength;
562
1.20M
        newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
563
1.20M
        if(newTop>trie->dataCapacity) {
564
            /* out of memory in the data array */
565
165
            int32_t capacity;
566
165
            uint32_t *data;
567
568
165
            if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
569
158
                capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
570
158
            } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
571
7
                capacity=UNEWTRIE2_MAX_DATA_LENGTH;
572
7
            } else {
573
                /*
574
                 * Should never occur.
575
                 * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
576
                 * or the code writes more values than should be possible.
577
                 */
578
0
                return -1;
579
0
            }
580
165
            data = static_cast<uint32_t*>(uprv_malloc(capacity * 4));
581
165
            if(data==nullptr) {
582
0
                return -1;
583
0
            }
584
165
            uprv_memcpy(data, trie->data, (size_t)trie->dataLength*4);
585
165
            uprv_free(trie->data);
586
165
            trie->data=data;
587
165
            trie->dataCapacity=capacity;
588
165
        }
589
1.20M
        trie->dataLength=newTop;
590
1.20M
    }
591
1.21M
    uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
592
1.21M
    trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
593
1.21M
    return newBlock;
594
1.21M
}
595
596
/* call when the block's reference counter reaches 0 */
597
static void
598
3.46k
releaseDataBlock(UNewTrie2 *trie, int32_t block) {
599
    /* put this block at the front of the free-block chain */
600
3.46k
    trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
601
3.46k
    trie->firstFreeBlock=block;
602
3.46k
}
603
604
static inline UBool
605
21.8M
isWritableBlock(UNewTrie2 *trie, int32_t block) {
606
21.8M
    return block != trie->dataNullOffset && 1 == trie->map[block >> UTRIE2_SHIFT_2];
607
21.8M
}
608
609
static inline void
610
6.23M
setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
611
6.23M
    int32_t oldBlock;
612
6.23M
    ++trie->map[block>>UTRIE2_SHIFT_2];  /* increment first, in case block==oldBlock! */
613
6.23M
    oldBlock=trie->index2[i2];
614
6.23M
    if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
615
3.46k
        releaseDataBlock(trie, oldBlock);
616
3.46k
    }
617
6.23M
    trie->index2[i2]=block;
618
6.23M
}
619
620
/**
621
 * No error checking for illegal arguments.
622
 *
623
 * @return -1 if no new data block available (out of memory in data array)
624
 * @internal
625
 */
626
static int32_t
627
15.4M
getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
628
15.4M
    int32_t i2, oldBlock, newBlock;
629
630
15.4M
    i2=getIndex2Block(trie, c, forLSCP);
631
15.4M
    if(i2<0) {
632
0
        return -1;  /* program error */
633
0
    }
634
635
15.4M
    i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
636
15.4M
    oldBlock=trie->index2[i2];
637
15.4M
    if(isWritableBlock(trie, oldBlock)) {
638
14.2M
        return oldBlock;
639
14.2M
    }
640
641
    /* allocate a new data block */
642
1.21M
    newBlock=allocDataBlock(trie, oldBlock);
643
1.21M
    if(newBlock<0) {
644
        /* out of memory in the data array */
645
0
        return -1;
646
0
    }
647
1.21M
    setIndex2Entry(trie, i2, newBlock);
648
1.21M
    return newBlock;
649
1.21M
}
650
651
/**
652
 * @return true if the value was successfully set
653
 */
654
static void
655
set32(UNewTrie2 *trie,
656
      UChar32 c, UBool forLSCP, uint32_t value,
657
13.8M
      UErrorCode *pErrorCode) {
658
13.8M
    int32_t block;
659
660
13.8M
    if(trie==nullptr || trie->isCompacted) {
661
0
        *pErrorCode=U_NO_WRITE_PERMISSION;
662
0
        return;
663
0
    }
664
#ifdef UCPTRIE_DEBUG
665
    umutablecptrie_set(trie->t3, c, value, pErrorCode);
666
#endif
667
668
13.8M
    block=getDataBlock(trie, c, forLSCP);
669
13.8M
    if(block<0) {
670
0
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
671
0
        return;
672
0
    }
673
674
13.8M
    trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
675
13.8M
}
676
677
U_CAPI void U_EXPORT2
678
9.99M
utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
679
9.99M
    if(U_FAILURE(*pErrorCode)) {
680
0
        return;
681
0
    }
682
9.99M
    if((uint32_t)c>0x10ffff) {
683
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
684
0
        return;
685
0
    }
686
9.99M
    set32(trie->newTrie, c, true, value, pErrorCode);
687
9.99M
}
688
689
U_CAPI void U_EXPORT2
690
utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
691
                                     UChar32 c, uint32_t value,
692
3.81M
                                     UErrorCode *pErrorCode) {
693
3.81M
    if(U_FAILURE(*pErrorCode)) {
694
0
        return;
695
0
    }
696
3.81M
    if(!U_IS_LEAD(c)) {
697
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
698
0
        return;
699
0
    }
700
3.81M
    set32(trie->newTrie, c, false, value, pErrorCode);
701
3.81M
}
702
703
static void
704
80.1k
writeBlock(uint32_t *block, uint32_t value) {
705
80.1k
    uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
706
2.64M
    while(block<limit) {
707
2.56M
        *block++=value;
708
2.56M
    }
709
80.1k
}
710
711
/**
712
 * initialValue is ignored if overwrite=true
713
 * @internal
714
 */
715
static void
716
fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
717
1.52M
          uint32_t value, uint32_t initialValue, UBool overwrite) {
718
1.52M
    uint32_t *pLimit;
719
720
1.52M
    pLimit=block+limit;
721
1.52M
    block+=start;
722
1.52M
    if(overwrite) {
723
4.94M
        while(block<pLimit) {
724
3.41M
            *block++=value;
725
3.41M
        }
726
1.52M
    } else {
727
0
        while(block<pLimit) {
728
0
            if(*block==initialValue) {
729
0
                *block=value;
730
0
            }
731
0
            ++block;
732
0
        }
733
0
    }
734
1.52M
}
735
736
U_CAPI void U_EXPORT2
737
utrie2_setRange32(UTrie2 *trie,
738
                  UChar32 start, UChar32 end,
739
                  uint32_t value, UBool overwrite,
740
1.47M
                  UErrorCode *pErrorCode) {
741
    /*
742
     * repeat value in [start..end]
743
     * mark index values for repeat-data blocks by setting bit 31 of the index values
744
     * fill around existing values if any, if(overwrite)
745
     */
746
1.47M
    UNewTrie2 *newTrie;
747
1.47M
    int32_t block, rest, repeatBlock;
748
1.47M
    UChar32 limit;
749
750
1.47M
    if(U_FAILURE(*pErrorCode)) {
751
0
        return;
752
0
    }
753
1.47M
    if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
754
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
755
0
        return;
756
0
    }
757
1.47M
    newTrie=trie->newTrie;
758
1.47M
    if(newTrie==nullptr || newTrie->isCompacted) {
759
0
        *pErrorCode=U_NO_WRITE_PERMISSION;
760
0
        return;
761
0
    }
762
#ifdef UCPTRIE_DEBUG
763
    umutablecptrie_setRange(newTrie->t3, start, end, value, pErrorCode);
764
#endif
765
1.47M
    if(!overwrite && value==newTrie->initialValue) {
766
0
        return; /* nothing to do */
767
0
    }
768
769
1.47M
    limit=end+1;
770
1.47M
    if(start&UTRIE2_DATA_MASK) {
771
1.40M
        UChar32 nextStart;
772
773
        /* set partial block at [start..following block boundary[ */
774
1.40M
        block=getDataBlock(newTrie, start, true);
775
1.40M
        if(block<0) {
776
0
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
777
0
            return;
778
0
        }
779
780
1.40M
        nextStart=(start+UTRIE2_DATA_MASK)&~UTRIE2_DATA_MASK;
781
1.40M
        if(nextStart<=limit) {
782
101k
            fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
783
101k
                      value, newTrie->initialValue, overwrite);
784
101k
            start=nextStart;
785
1.29M
        } else {
786
1.29M
            fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
787
1.29M
                      value, newTrie->initialValue, overwrite);
788
1.29M
            return;
789
1.29M
        }
790
1.40M
    }
791
792
    /* number of positions in the last, partial block */
793
177k
    rest=limit&UTRIE2_DATA_MASK;
794
795
    /* round down limit to a block boundary */
796
177k
    limit&=~UTRIE2_DATA_MASK;
797
798
    /* iterate over all-value blocks */
799
177k
    if(value==newTrie->initialValue) {
800
3.65k
        repeatBlock=newTrie->dataNullOffset;
801
174k
    } else {
802
174k
        repeatBlock=-1;
803
174k
    }
804
805
119M
    while(start<limit) {
806
119M
        int32_t i2;
807
119M
        UBool setRepeatBlock=false;
808
809
119M
        if(value==newTrie->initialValue && isInNullBlock(newTrie, start, true)) {
810
113M
            start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
811
113M
            continue;
812
113M
        }
813
814
        /* get index value */
815
6.44M
        i2=getIndex2Block(newTrie, start, true);
816
6.44M
        if(i2<0) {
817
0
            *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
818
0
            return;
819
0
        }
820
6.44M
        i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
821
6.44M
        block=newTrie->index2[i2];
822
6.44M
        if(isWritableBlock(newTrie, block)) {
823
            /* already allocated */
824
3.71k
            if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
825
                /*
826
                 * We overwrite all values, and it's not a
827
                 * protected (ASCII-linear or 2-byte UTF-8) block:
828
                 * replace with the repeatBlock.
829
                 */
830
3.46k
                setRepeatBlock=true;
831
3.46k
            } else {
832
                /* !overwrite, or protected block: just write the values into this block */
833
250
                fillBlock(newTrie->data+block,
834
250
                          0, UTRIE2_DATA_BLOCK_LENGTH,
835
250
                          value, newTrie->initialValue, overwrite);
836
250
            }
837
6.43M
        } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
838
            /*
839
             * Set the repeatBlock instead of the null block or previous repeat block:
840
             *
841
             * If !isWritableBlock() then all entries in the block have the same value
842
             * because it's the null block or a range block (the repeatBlock from a previous
843
             * call to utrie2_setRange32()).
844
             * No other blocks are used multiple times before compacting.
845
             *
846
             * The null block is the only non-writable block with the initialValue because
847
             * of the repeatBlock initialization above. (If value==initialValue, then
848
             * the repeatBlock will be the null data block.)
849
             *
850
             * We set our repeatBlock if the desired value differs from the block's value,
851
             * and if we overwrite any data or if the data is all initial values
852
             * (which is the same as the block being the null block, see above).
853
             */
854
5.10M
            setRepeatBlock=true;
855
5.10M
        }
856
6.44M
        if(setRepeatBlock) {
857
5.10M
            if(repeatBlock>=0) {
858
5.02M
                setIndex2Entry(newTrie, i2, repeatBlock);
859
5.02M
            } else {
860
                /* create and set and fill the repeatBlock */
861
80.1k
                repeatBlock=getDataBlock(newTrie, start, true);
862
80.1k
                if(repeatBlock<0) {
863
0
                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
864
0
                    return;
865
0
                }
866
80.1k
                writeBlock(newTrie->data+repeatBlock, value);
867
80.1k
            }
868
5.10M
        }
869
870
6.44M
        start+=UTRIE2_DATA_BLOCK_LENGTH;
871
6.44M
    }
872
873
177k
    if(rest>0) {
874
        /* set partial block at [last block boundary..limit[ */
875
126k
        block=getDataBlock(newTrie, start, true);
876
126k
        if(block<0) {
877
0
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
878
0
            return;
879
0
        }
880
881
126k
        fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
882
126k
    }
883
177k
}
884
885
/* compaction --------------------------------------------------------------- */
886
887
static inline UBool
888
4.73M
equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
889
43.2M
    while(length>0 && *s==*t) {
890
38.5M
        ++s;
891
38.5M
        ++t;
892
38.5M
        --length;
893
38.5M
    }
894
4.73M
    return length == 0;
895
4.73M
}
896
897
static inline UBool
898
807M
equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
899
873M
    while(length>0 && *s==*t) {
900
65.5M
        ++s;
901
65.5M
        ++t;
902
65.5M
        --length;
903
65.5M
    }
904
807M
    return length == 0;
905
807M
}
906
907
static int32_t
908
2.36k
findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
909
2.36k
    int32_t block;
910
911
    /* ensure that we do not even partially get past index2Length */
912
2.36k
    index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
913
914
4.64M
    for(block=0; block<=index2Length; ++block) {
915
4.64M
        if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
916
844
            return block;
917
844
        }
918
4.64M
    }
919
1.51k
    return -1;
920
2.36k
}
921
922
static int32_t
923
492k
findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
924
492k
    int32_t block;
925
926
    /* ensure that we do not even partially get past dataLength */
927
492k
    dataLength-=blockLength;
928
929
806M
    for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
930
806M
        if(equal_uint32(data+block, data+otherBlock, blockLength)) {
931
338k
            return block;
932
338k
        }
933
806M
    }
934
153k
    return -1;
935
492k
}
936
937
/*
938
 * Find the start of the last range in the trie by enumerating backward.
939
 * Indexes for supplementary code points higher than this will be omitted.
940
 */
941
static UChar32
942
3.73k
findHighStart(UNewTrie2 *trie, uint32_t highValue) {
943
3.73k
    const uint32_t *data32;
944
945
3.73k
    uint32_t value, initialValue;
946
3.73k
    UChar32 c, prev;
947
3.73k
    int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
948
949
3.73k
    data32=trie->data;
950
3.73k
    initialValue=trie->initialValue;
951
952
3.73k
    index2NullOffset=trie->index2NullOffset;
953
3.73k
    nullBlock=trie->dataNullOffset;
954
955
    /* set variables for previous range */
956
3.73k
    if(highValue==initialValue) {
957
3.70k
        prevI2Block=index2NullOffset;
958
3.70k
        prevBlock=nullBlock;
959
3.70k
    } else {
960
24
        prevI2Block=-1;
961
24
        prevBlock=-1;
962
24
    }
963
3.73k
    prev=0x110000;
964
965
    /* enumerate index-2 blocks */
966
3.73k
    i1=UNEWTRIE2_INDEX_1_LENGTH;
967
3.73k
    c=prev;
968
1.78M
    while(c>0) {
969
1.78M
        i2Block=trie->index1[--i1];
970
1.78M
        if(i2Block==prevI2Block) {
971
            /* the index-2 block is the same as the previous one, and filled with highValue */
972
1.77M
            c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
973
1.77M
            continue;
974
1.77M
        }
975
13.2k
        prevI2Block=i2Block;
976
13.2k
        if(i2Block==index2NullOffset) {
977
            /* this is the null index-2 block */
978
0
            if(highValue!=initialValue) {
979
0
                return c;
980
0
            }
981
0
            c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
982
13.2k
        } else {
983
            /* enumerate data blocks for one index-2 block */
984
735k
            for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
985
726k
                block=trie->index2[i2Block+ --i2];
986
726k
                if(block==prevBlock) {
987
                    /* the block is the same as the previous one, and filled with highValue */
988
722k
                    c-=UTRIE2_DATA_BLOCK_LENGTH;
989
722k
                    continue;
990
722k
                }
991
3.73k
                prevBlock=block;
992
3.73k
                if(block==nullBlock) {
993
                    /* this is the null data block */
994
0
                    if(highValue!=initialValue) {
995
0
                        return c;
996
0
                    }
997
0
                    c-=UTRIE2_DATA_BLOCK_LENGTH;
998
3.73k
                } else {
999
25.1k
                    for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
1000
25.1k
                        value=data32[block+ --j];
1001
25.1k
                        if(value!=highValue) {
1002
3.73k
                            return c;
1003
3.73k
                        }
1004
21.4k
                        --c;
1005
21.4k
                    }
1006
3.73k
                }
1007
3.73k
            }
1008
13.2k
        }
1009
13.2k
    }
1010
1011
    /* deliver last range */
1012
0
    return 0;
1013
3.73k
}
1014
1015
/*
1016
 * Compact a build-time trie.
1017
 *
1018
 * The compaction
1019
 * - removes blocks that are identical with earlier ones
1020
 * - overlaps adjacent blocks as much as possible (if overlap==true)
1021
 * - moves blocks in steps of the data granularity
1022
 * - moves and overlaps blocks that overlap with multiple values in the overlap region
1023
 *
1024
 * It does not
1025
 * - try to move and overlap blocks that are not already adjacent
1026
 */
1027
static void
1028
3.73k
compactData(UNewTrie2 *trie) {
1029
#ifdef UTRIE2_DEBUG
1030
    int32_t countSame=0, sumOverlaps=0;
1031
#endif
1032
1033
3.73k
    int32_t start, newStart, movedStart;
1034
3.73k
    int32_t blockLength, overlap;
1035
3.73k
    int32_t i, mapIndex, blockCount;
1036
1037
    /* do not compact linear-ASCII data */
1038
3.73k
    newStart=UTRIE2_DATA_START_OFFSET;
1039
26.1k
    for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
1040
22.3k
        trie->map[i]=start;
1041
22.3k
    }
1042
1043
    /*
1044
     * Start with a block length of 64 for 2-byte UTF-8,
1045
     * then switch to UTRIE2_DATA_BLOCK_LENGTH.
1046
     */
1047
3.73k
    blockLength=64;
1048
3.73k
    blockCount=blockLength>>UTRIE2_SHIFT_2;
1049
496k
    for(start=newStart; start<trie->dataLength;) {
1050
        /*
1051
         * start: index of first entry of current block
1052
         * newStart: index where the current block is to be moved
1053
         *           (right after current end of already-compacted data)
1054
         */
1055
492k
        if(start==UNEWTRIE2_DATA_0800_OFFSET) {
1056
3.73k
            blockLength=UTRIE2_DATA_BLOCK_LENGTH;
1057
3.73k
            blockCount=1;
1058
3.73k
        }
1059
1060
        /* skip blocks that are not used */
1061
492k
        if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
1062
            /* advance start to the next block */
1063
0
            start+=blockLength;
1064
1065
            /* leave newStart with the previous block! */
1066
0
            continue;
1067
0
        }
1068
1069
        /* search for an identical block */
1070
492k
        if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
1071
492k
             >=0
1072
492k
        ) {
1073
#ifdef UTRIE2_DEBUG
1074
            ++countSame;
1075
#endif
1076
            /* found an identical block, set the other block's index value for the current block */
1077
776k
            for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1078
437k
                trie->map[mapIndex++]=movedStart;
1079
437k
                movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1080
437k
            }
1081
1082
            /* advance start to the next block */
1083
338k
            start+=blockLength;
1084
1085
            /* leave newStart with the previous block! */
1086
338k
            continue;
1087
338k
        }
1088
1089
        /* see if the beginning of this block can be overlapped with the end of the previous block */
1090
        /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
1091
153k
        for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
1092
1.30M
            overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
1093
1.15M
            overlap-=UTRIE2_DATA_GRANULARITY) {}
1094
1095
#ifdef UTRIE2_DEBUG
1096
            sumOverlaps+=overlap;
1097
#endif
1098
153k
        if(overlap>0 || newStart<start) {
1099
            /* some overlap, or just move the whole block */
1100
147k
            movedStart=newStart-overlap;
1101
307k
            for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1102
159k
                trie->map[mapIndex++]=movedStart;
1103
159k
                movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1104
159k
            }
1105
1106
            /* move the non-overlapping indexes to their new positions */
1107
147k
            start+=overlap;
1108
5.01M
            for(i=blockLength-overlap; i>0; --i) {
1109
4.87M
                trie->data[newStart++]=trie->data[start++];
1110
4.87M
            }
1111
147k
        } else /* no overlap && newStart==start */ {
1112
17.6k
            for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1113
11.4k
                trie->map[mapIndex++]=start;
1114
11.4k
                start+=UTRIE2_DATA_BLOCK_LENGTH;
1115
11.4k
            }
1116
6.24k
            newStart=start;
1117
6.24k
        }
1118
153k
    }
1119
1120
    /* now adjust the index-2 table */
1121
8.10M
    for(i=0; i<trie->index2Length; ++i) {
1122
8.10M
        if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
1123
            /* Gap indexes are invalid (-1). Skip over the gap. */
1124
3.73k
            i+=UNEWTRIE2_INDEX_GAP_LENGTH;
1125
3.73k
        }
1126
8.10M
        trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
1127
8.10M
    }
1128
3.73k
    trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
1129
1130
    /* ensure dataLength alignment */
1131
3.73k
    while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1132
0
        trie->data[newStart++]=trie->initialValue;
1133
0
    }
1134
1135
#ifdef UTRIE2_DEBUG
1136
    /* we saved some space */
1137
    printf("compacting UTrie2: count of 32-bit data words %lu->%lu  countSame=%ld  sumOverlaps=%ld\n",
1138
            (long)trie->dataLength, (long)newStart, (long)countSame, (long)sumOverlaps);
1139
#endif
1140
1141
3.73k
    trie->dataLength=newStart;
1142
3.73k
}
1143
1144
static void
1145
747
compactIndex2(UNewTrie2 *trie) {
1146
747
    int32_t i, start, newStart, movedStart, overlap;
1147
1148
    /* do not compact linear-BMP index-2 blocks */
1149
747
    newStart=UTRIE2_INDEX_2_BMP_LENGTH;
1150
25.3k
    for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
1151
24.6k
        trie->map[i]=start;
1152
24.6k
    }
1153
1154
    /* Reduce the index table gap to what will be needed at runtime. */
1155
747
    newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
1156
1157
3.11k
    for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
1158
        /*
1159
         * start: index of first entry of current block
1160
         * newStart: index where the current block is to be moved
1161
         *           (right after current end of already-compacted data)
1162
         */
1163
1164
        /* search for an identical block */
1165
2.36k
        if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
1166
2.36k
             >=0
1167
2.36k
        ) {
1168
            /* found an identical block, set the other block's index value for the current block */
1169
844
            trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
1170
1171
            /* advance start to the next block */
1172
844
            start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1173
1174
            /* leave newStart with the previous block! */
1175
844
            continue;
1176
844
        }
1177
1178
        /* see if the beginning of this block can be overlapped with the end of the previous block */
1179
        /* look for maximum overlap with the previous, adjacent block */
1180
1.51k
        for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
1181
88.9k
            overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
1182
87.4k
            --overlap) {}
1183
1184
1.51k
        if(overlap>0 || newStart<start) {
1185
            /* some overlap, or just move the whole block */
1186
1.51k
            trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
1187
1188
            /* move the non-overlapping indexes to their new positions */
1189
1.51k
            start+=overlap;
1190
90.4k
            for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
1191
88.9k
                trie->index2[newStart++]=trie->index2[start++];
1192
88.9k
            }
1193
1.51k
        } else /* no overlap && newStart==start */ {
1194
0
            trie->map[start>>UTRIE2_SHIFT_1_2]=start;
1195
0
            start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1196
0
            newStart=start;
1197
0
        }
1198
1.51k
    }
1199
1200
    /* now adjust the index-1 table */
1201
407k
    for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
1202
406k
        trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
1203
406k
    }
1204
747
    trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
1205
1206
    /*
1207
     * Ensure data table alignment:
1208
     * Needs to be granularity-aligned for 16-bit trie
1209
     * (so that dataMove will be down-shiftable),
1210
     * and 2-aligned for uint32_t data.
1211
     */
1212
1.34k
    while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
1213
        /* Arbitrary value: 0x3fffc not possible for real data. */
1214
599
        trie->index2[newStart++] = static_cast<int32_t>(0xffff) << UTRIE2_INDEX_SHIFT;
1215
599
    }
1216
1217
#ifdef UTRIE2_DEBUG
1218
    /* we saved some space */
1219
    printf("compacting UTrie2: count of 16-bit index words %lu->%lu\n",
1220
            (long)trie->index2Length, (long)newStart);
1221
#endif
1222
1223
747
    trie->index2Length=newStart;
1224
747
}
1225
1226
static void
1227
3.73k
compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
1228
3.73k
    UNewTrie2 *newTrie;
1229
3.73k
    UChar32 highStart, suppHighStart;
1230
3.73k
    uint32_t highValue;
1231
1232
3.73k
    newTrie=trie->newTrie;
1233
1234
    /* find highStart and round it up */
1235
3.73k
    highValue=utrie2_get32(trie, 0x10ffff);
1236
3.73k
    highStart=findHighStart(newTrie, highValue);
1237
3.73k
    highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
1238
3.73k
    if(highStart==0x110000) {
1239
71
        highValue=trie->errorValue;
1240
71
    }
1241
1242
    /*
1243
     * Set trie->highStart only after utrie2_get32(trie, highStart).
1244
     * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
1245
     */
1246
3.73k
    trie->highStart=newTrie->highStart=highStart;
1247
1248
#ifdef UTRIE2_DEBUG
1249
    printf("UTrie2: highStart U+%06lx  highValue 0x%lx  initialValue 0x%lx\n",
1250
            (long)highStart, (long)highValue, (long)trie->initialValue);
1251
#endif
1252
1253
3.73k
    if(highStart<0x110000) {
1254
        /* Blank out [highStart..10ffff] to release associated data blocks. */
1255
3.65k
        suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
1256
3.65k
        utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, true, pErrorCode);
1257
3.65k
        if(U_FAILURE(*pErrorCode)) {
1258
0
            return;
1259
0
        }
1260
3.65k
    }
1261
1262
3.73k
    compactData(newTrie);
1263
3.73k
    if(highStart>0x10000) {
1264
747
        compactIndex2(newTrie);
1265
#ifdef UTRIE2_DEBUG
1266
    } else {
1267
        printf("UTrie2: highStart U+%04lx  count of 16-bit index words %lu->%lu\n",
1268
                (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
1269
#endif
1270
747
    }
1271
1272
    /*
1273
     * Store the highValue in the data array and round up the dataLength.
1274
     * Must be done after compactData() because that assumes that dataLength
1275
     * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
1276
     */
1277
3.73k
    newTrie->data[newTrie->dataLength++]=highValue;
1278
14.9k
    while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1279
11.1k
        newTrie->data[newTrie->dataLength++]=trie->initialValue;
1280
11.1k
    }
1281
1282
3.73k
    newTrie->isCompacted=true;
1283
3.73k
}
1284
1285
/* serialization ------------------------------------------------------------ */
1286
1287
/**
1288
 * Maximum length of the runtime index array.
1289
 * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
1290
 * (The actual maximum length is lower,
1291
 * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
1292
 */
1293
7.46k
#define UTRIE2_MAX_INDEX_LENGTH 0xffff
1294
1295
/**
1296
 * Maximum length of the runtime data array.
1297
 * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
1298
 * and by uint16_t UTrie2Header.shiftedDataLength.
1299
 */
1300
3.73k
#define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
1301
1302
/* Compact and internally serialize the trie. */
1303
U_CAPI void U_EXPORT2
1304
3.73k
utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
1305
3.73k
    UNewTrie2 *newTrie;
1306
3.73k
    UTrie2Header *header;
1307
3.73k
    uint32_t *p;
1308
3.73k
    uint16_t *dest16;
1309
3.73k
    int32_t i, length;
1310
3.73k
    int32_t allIndexesLength;
1311
3.73k
    int32_t dataMove;  /* >0 if the data is moved to the end of the index array */
1312
3.73k
    UChar32 highStart;
1313
1314
    /* argument check */
1315
3.73k
    if(U_FAILURE(*pErrorCode)) {
1316
0
        return;
1317
0
    }
1318
3.73k
    if( trie==nullptr ||
1319
3.73k
        valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
1320
3.73k
    ) {
1321
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1322
0
        return;
1323
0
    }
1324
3.73k
    newTrie=trie->newTrie;
1325
3.73k
    if(newTrie==nullptr) {
1326
        /* already frozen */
1327
0
        UTrie2ValueBits frozenValueBits=
1328
0
            trie->data16!=nullptr ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
1329
0
        if(valueBits!=frozenValueBits) {
1330
0
            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1331
0
        }
1332
0
        return;
1333
0
    }
1334
1335
    /* compact if necessary */
1336
3.73k
    if(!newTrie->isCompacted) {
1337
3.73k
        compactTrie(trie, pErrorCode);
1338
3.73k
        if(U_FAILURE(*pErrorCode)) {
1339
0
            return;
1340
0
        }
1341
3.73k
    }
1342
3.73k
    highStart=trie->highStart;
1343
1344
3.73k
    if(highStart<=0x10000) {
1345
2.98k
        allIndexesLength=UTRIE2_INDEX_1_OFFSET;
1346
2.98k
    } else {
1347
747
        allIndexesLength=newTrie->index2Length;
1348
747
    }
1349
3.73k
    if(valueBits==UTRIE2_16_VALUE_BITS) {
1350
0
        dataMove=allIndexesLength;
1351
3.73k
    } else {
1352
3.73k
        dataMove=0;
1353
3.73k
    }
1354
1355
    /* are indexLength and dataLength within limits? */
1356
3.73k
    if( /* for unshifted indexLength */
1357
3.73k
        allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
1358
        /* for unshifted dataNullOffset */
1359
3.73k
        (dataMove+newTrie->dataNullOffset)>0xffff ||
1360
        /* for unshifted 2-byte UTF-8 index-2 values */
1361
3.73k
        (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
1362
        /* for shiftedDataLength */
1363
3.73k
        (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
1364
3.73k
    ) {
1365
0
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1366
0
        return;
1367
0
    }
1368
1369
    /* calculate the total serialized length */
1370
3.73k
    length=sizeof(UTrie2Header)+allIndexesLength*2;
1371
3.73k
    if(valueBits==UTRIE2_16_VALUE_BITS) {
1372
0
        length+=newTrie->dataLength*2;
1373
3.73k
    } else {
1374
3.73k
        length+=newTrie->dataLength*4;
1375
3.73k
    }
1376
1377
3.73k
    trie->memory=uprv_malloc(length);
1378
3.73k
    if(trie->memory==nullptr) {
1379
0
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1380
0
        return;
1381
0
    }
1382
3.73k
    trie->length=length;
1383
3.73k
    trie->isMemoryOwned=true;
1384
1385
3.73k
    trie->indexLength=allIndexesLength;
1386
3.73k
    trie->dataLength=newTrie->dataLength;
1387
3.73k
    if(highStart<=0x10000) {
1388
2.98k
        trie->index2NullOffset=0xffff;
1389
2.98k
    } else {
1390
747
        trie->index2NullOffset=static_cast<uint16_t>(UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset);
1391
747
    }
1392
3.73k
    trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
1393
3.73k
    trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
1394
1395
    /* set the header fields */
1396
3.73k
    header=(UTrie2Header *)trie->memory;
1397
1398
3.73k
    header->signature=UTRIE2_SIG; /* "Tri2" */
1399
3.73k
    header->options=(uint16_t)valueBits;
1400
1401
3.73k
    header->indexLength=(uint16_t)trie->indexLength;
1402
3.73k
    header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
1403
3.73k
    header->index2NullOffset=trie->index2NullOffset;
1404
3.73k
    header->dataNullOffset=trie->dataNullOffset;
1405
3.73k
    header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
1406
1407
    /* fill the index and data arrays */
1408
3.73k
    dest16=(uint16_t *)(header+1);
1409
3.73k
    trie->index=dest16;
1410
1411
    /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
1412
3.73k
    p=(uint32_t *)newTrie->index2;
1413
7.76M
    for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
1414
7.75M
        *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1415
7.75M
    }
1416
1417
    /* write UTF-8 2-byte index-2 values, not right-shifted */
1418
11.1k
    for(i=0; i<(0xc2-0xc0); ++i) {                                  /* C0..C1 */
1419
7.46k
        *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
1420
7.46k
    }
1421
115k
    for(; i<(0xe0-0xc0); ++i) {                                     /* C2..DF */
1422
111k
        *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
1423
111k
    }
1424
1425
3.73k
    if(highStart>0x10000) {
1426
747
        int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
1427
747
        int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
1428
1429
        /* write 16-bit index-1 values for supplementary code points */
1430
747
        p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
1431
139k
        for(i=index1Length; i>0; --i) {
1432
138k
            *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
1433
138k
        }
1434
1435
        /*
1436
         * write the index-2 array values for supplementary code points,
1437
         * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
1438
         */
1439
747
        p=(uint32_t *)newTrie->index2+index2Offset;
1440
90.3k
        for(i=newTrie->index2Length-index2Offset; i>0; --i) {
1441
89.5k
            *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1442
89.5k
        }
1443
747
    }
1444
1445
    /* write the 16/32-bit data array */
1446
3.73k
    switch(valueBits) {
1447
0
    case UTRIE2_16_VALUE_BITS:
1448
        /* write 16-bit data values */
1449
0
        trie->data16=dest16;
1450
0
        trie->data32=nullptr;
1451
0
        p=newTrie->data;
1452
0
        for(i=newTrie->dataLength; i>0; --i) {
1453
0
            *dest16++=(uint16_t)*p++;
1454
0
        }
1455
0
        break;
1456
3.73k
    case UTRIE2_32_VALUE_BITS:
1457
        /* write 32-bit data values */
1458
3.73k
        trie->data16=nullptr;
1459
3.73k
        trie->data32=(uint32_t *)dest16;
1460
3.73k
        uprv_memcpy(dest16, newTrie->data, (size_t)newTrie->dataLength*4);
1461
3.73k
        break;
1462
0
    default:
1463
0
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1464
0
        return;
1465
3.73k
    }
1466
1467
#ifdef UTRIE2_DEBUG
1468
    utrie2_printLengths(trie, "");
1469
#endif
1470
1471
#ifdef UCPTRIE_DEBUG
1472
    umutablecptrie_setName(newTrie->t3, trie->name);
1473
    ucptrie_close(
1474
        umutablecptrie_buildImmutable(
1475
            newTrie->t3, UCPTRIE_TYPE_FAST, (UCPTrieValueWidth)valueBits, pErrorCode));
1476
#endif
1477
    /* Delete the UNewTrie2. */
1478
3.73k
    uprv_free(newTrie->data);
1479
3.73k
    uprv_free(newTrie);
1480
3.73k
    trie->newTrie=nullptr;
1481
3.73k
}