Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/i18n/collationweights.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*  
4
*******************************************************************************
5
*
6
*   Copyright (C) 1999-2015, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
*******************************************************************************
10
*   file name:  collationweights.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2001mar08 as ucol_wgt.cpp
16
*   created by: Markus W. Scherer
17
*
18
*   This file contains code for allocating n collation element weights
19
*   between two exclusive limits.
20
*   It is used only internally by the collation tailoring builder.
21
*/
22
23
#include "unicode/utypes.h"
24
25
#if !UCONFIG_NO_COLLATION
26
27
#include "cmemory.h"
28
#include "collation.h"
29
#include "collationweights.h"
30
#include "uarrsort.h"
31
#include "uassert.h"
32
33
#ifdef UCOL_DEBUG
34
#   include <stdio.h>
35
#endif
36
37
U_NAMESPACE_BEGIN
38
39
/* collation element weight allocation -------------------------------------- */
40
41
/* helper functions for CE weights */
42
43
/**
 * Returns the byte at 1-based position `length` of a weight, counted from
 * the most significant byte (position 4 is the least significant byte).
 *
 * @param weight 32-bit weight value
 * @param length 1-based byte position; the shift amount is 8*(4-length),
 *               so the caller must keep it in the range 1..4
 * @return the selected byte, in the range 0..0xff
 */
static inline uint32_t
getWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift = 8 * (4 - length);
    const uint32_t shifted = weight >> shift;
    return shifted & 0xff;
}
47
48
/**
 * Replaces the byte at 1-based position `length` with `trail` and clears
 * every byte after it; the bytes before that position are kept.
 *
 * @param weight 32-bit weight value
 * @param length 1-based position of the byte to set (1..4)
 * @param trail  new byte value; it is shifted into place without masking
 * @return the modified weight
 */
static inline uint32_t
setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
    const int32_t shift = 8 * (4 - length);
    // Keeps only the bytes in front of the trail position.
    const uint32_t leadMask = 0xffffff00 << shift;
    const uint32_t lead = weight & leadMask;
    return (uint32_t)(lead | (trail << shift));
}
53
54
static inline uint32_t
55
0
getWeightByte(uint32_t weight, int32_t idx) {
56
0
    return getWeightTrail(weight, idx); /* same calculation */
57
0
}
58
59
/**
 * Replaces the byte at 1-based position `idx` with `byte`, leaving all
 * other bytes of the weight unchanged.
 *
 * @param weight 32-bit weight value
 * @param idx    1-based byte position; must be in 1..4 so that the shift
 *               amount 8*(4-idx) stays within 0..24 (a shift by 32 or by a
 *               negative amount is undefined behavior)
 * @param byte   new byte value; it is shifted into place without masking
 * @return the modified weight
 */
static inline uint32_t
setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
    const int32_t shift = 32 - 8 * idx;
    // 0xffffffff except for a 00 "hole" at the idx-th byte.
    const uint32_t keepMask = ~((uint32_t)0xff << shift);
    return (uint32_t)((weight & keepMask) | (byte << shift));
}
82
83
/**
 * Keeps the first `length` bytes of a weight and zeroes the rest.
 *
 * @param weight 32-bit weight value
 * @param length number of leading bytes to keep; the shift amount is
 *               8*(4-length), so the caller must keep it in the range 1..4
 * @return the truncated weight
 */
static inline uint32_t
truncateWeight(uint32_t weight, int32_t length) {
    const int32_t shift = 8 * (4 - length);
    const uint32_t leadMask = 0xffffffff << shift;
    return (uint32_t)(weight & leadMask);
}
87
88
/**
 * Adds one unit at the byte position `length` of a weight.
 * This is plain 32-bit arithmetic: it carries into the preceding bytes and
 * does not consult any per-byte minimum/maximum limits.
 *
 * @param weight 32-bit weight value
 * @param length 1-based position of the byte to increment
 * @return weight + (1 << 8*(4-length)), truncated to 32 bits
 */
static inline uint32_t
incWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift = 8 * (4 - length);
    const unsigned long unit = 1UL << shift;
    return (uint32_t)(weight + unit);
}
92
93
/**
 * Subtracts one unit at the byte position `length` of a weight.
 * This is plain 32-bit arithmetic: it borrows from the preceding bytes and
 * does not consult any per-byte minimum/maximum limits.
 *
 * @param weight 32-bit weight value
 * @param length 1-based position of the byte to decrement
 * @return weight - (1 << 8*(4-length)), truncated to 32 bits
 */
static inline uint32_t
decWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift = 8 * (4 - length);
    const unsigned long unit = 1UL << shift;
    return (uint32_t)(weight - unit);
}
97
98
/**
 * Creates an allocator with no ranges and all per-byte limits cleared.
 * One of the initFor...() functions sets the actual limits.
 */
CollationWeights::CollationWeights()
        : middleLength(0), rangeIndex(0), rangeCount(0) {
    // Zero all five slots of both per-byte limit tables.
    for(int32_t slot = 4; slot >= 0; --slot) {
        minBytes[slot] = 0;
        maxBytes[slot] = 0;
    }
}
104
105
void
106
0
CollationWeights::initForPrimary(UBool compressible) {
107
0
    middleLength=1;
108
0
    minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
109
0
    maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;
110
0
    if(compressible) {
111
0
        minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;
112
0
        maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;
113
0
    } else {
114
0
        minBytes[2] = 2;
115
0
        maxBytes[2] = 0xff;
116
0
    }
117
0
    minBytes[3] = 2;
118
0
    maxBytes[3] = 0xff;
119
0
    minBytes[4] = 2;
120
0
    maxBytes[4] = 0xff;
121
0
}
122
123
void
124
0
CollationWeights::initForSecondary() {
125
    // We use only the lower 16 bits for secondary weights.
126
0
    middleLength=3;
127
0
    minBytes[1] = 0;
128
0
    maxBytes[1] = 0;
129
0
    minBytes[2] = 0;
130
0
    maxBytes[2] = 0;
131
0
    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
132
0
    maxBytes[3] = 0xff;
133
0
    minBytes[4] = 2;
134
0
    maxBytes[4] = 0xff;
135
0
}
136
137
void
138
0
CollationWeights::initForTertiary() {
139
    // We use only the lower 16 bits for tertiary weights.
140
0
    middleLength=3;
141
0
    minBytes[1] = 0;
142
0
    maxBytes[1] = 0;
143
0
    minBytes[2] = 0;
144
0
    maxBytes[2] = 0;
145
    // We use only 6 bits per byte.
146
    // The other bits are used for case & quaternary weights.
147
0
    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
148
0
    maxBytes[3] = 0x3f;
149
0
    minBytes[4] = 2;
150
0
    maxBytes[4] = 0x3f;
151
0
}
152
153
uint32_t
154
0
CollationWeights::incWeight(uint32_t weight, int32_t length) const {
155
0
    for(;;) {
156
0
        uint32_t byte=getWeightByte(weight, length);
157
0
        if(byte<maxBytes[length]) {
158
0
            return setWeightByte(weight, length, byte+1);
159
0
        } else {
160
            // Roll over, set this byte to the minimum and increment the previous one.
161
0
            weight=setWeightByte(weight, length, minBytes[length]);
162
0
            --length;
163
0
            U_ASSERT(length > 0);
164
0
        }
165
0
    }
166
0
}
167
168
uint32_t
169
0
CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
170
0
    for(;;) {
171
0
        offset += getWeightByte(weight, length);
172
0
        if((uint32_t)offset <= maxBytes[length]) {
173
0
            return setWeightByte(weight, length, offset);
174
0
        } else {
175
            // Split the offset between this byte and the previous one.
176
0
            offset -= minBytes[length];
177
0
            weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
178
0
            offset /= countBytes(length);
179
0
            --length;
180
0
            U_ASSERT(length > 0);
181
0
        }
182
0
    }
183
0
}
184
185
void
186
0
CollationWeights::lengthenRange(WeightRange &range) const {
187
0
    int32_t length=range.length+1;
188
0
    range.start=setWeightTrail(range.start, length, minBytes[length]);
189
0
    range.end=setWeightTrail(range.end, length, maxBytes[length]);
190
0
    range.count*=countBytes(length);
191
0
    range.length=length;
192
0
}
193
194
/* for uprv_sortArray: sort ranges in weight order */
195
static int32_t U_CALLCONV
196
0
compareRanges(const void * /*context*/, const void *left, const void *right) {
197
0
    uint32_t l, r;
198
199
0
    l=((const CollationWeights::WeightRange *)left)->start;
200
0
    r=((const CollationWeights::WeightRange *)right)->start;
201
0
    if(l<r) {
202
0
        return -1;
203
0
    } else if(l>r) {
204
0
        return 1;
205
0
    } else {
206
0
        return 0;
207
0
    }
208
0
}
209
210
/**
 * Computes the ranges of weights that lie strictly between two limits and
 * stores them, shortest first, in ranges[] / rangeCount.
 *
 * Up to three kinds of ranges are computed: "lower" ranges that extend
 * lowerLimit at each length above middleLength, one "middle" range at
 * middleLength, and "upper" ranges that lead up to upperLimit. If there is
 * no middle range, colliding or adjacent lower/upper ranges of the same
 * length are intersected or merged.
 *
 * @param lowerLimit exclusive lower limit; must not be 0
 * @param upperLimit exclusive upper limit; must not be 0
 * @return FALSE if lowerLimit>=upperLimit, if lowerLimit is a prefix of
 *         upperLimit, or if no range with a positive count was found;
 *         otherwise TRUE
 */
UBool
CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
    U_ASSERT(lowerLimit != 0);
    U_ASSERT(upperLimit != 0);

    /* get the lengths of the limits */
    int32_t lowerLength=lengthOfWeight(lowerLimit);
    int32_t upperLength=lengthOfWeight(upperLimit);

#ifdef UCOL_DEBUG
    // Cast explicitly: uint32_t/int32_t are not (unsigned) long on all platforms.
    printf("length of lower limit 0x%08lx is %ld\n", (unsigned long)lowerLimit, (long)lowerLength);
    printf("length of upper limit 0x%08lx is %ld\n", (unsigned long)upperLimit, (long)upperLength);
#endif
    U_ASSERT(lowerLength>=middleLength);
    // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.

    if(lowerLimit>=upperLimit) {
#ifdef UCOL_DEBUG
        printf("error: no space between lower & upper limits\n");
#endif
        return FALSE;
    }

    /* check that neither is a prefix of the other */
    if(lowerLength<upperLength) {
        if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
#ifdef UCOL_DEBUG
            printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n",
                   (unsigned long)lowerLimit, (unsigned long)upperLimit);
#endif
            return FALSE;
        }
    }
    /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */

    WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
    uprv_memset(lower, 0, sizeof(lower));
    uprv_memset(&middle, 0, sizeof(middle));
    uprv_memset(upper, 0, sizeof(upper));

    /*
     * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
     * range     minimum length
     * lower[4]  4
     * lower[3]  3
     * lower[2]  2
     * middle    1
     * upper[2]  2
     * upper[3]  3
     * upper[4]  4
     *
     * We are now going to calculate up to 7 ranges.
     * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
     */
    uint32_t weight=lowerLimit;
    for(int32_t length=lowerLength; length>middleLength; --length) {
        uint32_t trail=getWeightTrail(weight, length);
        if(trail<maxBytes[length]) {
            lower[length].start=incWeightTrail(weight, length);
            lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
            lower[length].length=length;
            lower[length].count=maxBytes[length]-trail;
        }
        weight=truncateWeight(weight, length-1);
    }
    if(weight<0xff000000) {
        middle.start=incWeightTrail(weight, middleLength);
    } else {
        // Prevent overflow for primary lead byte FF
        // which would yield a middle range starting at 0.
        middle.start=0xffffffff;  // no middle range
    }

    weight=upperLimit;
    for(int32_t length=upperLength; length>middleLength; --length) {
        uint32_t trail=getWeightTrail(weight, length);
        if(trail>minBytes[length]) {
            upper[length].start=setWeightTrail(weight, length, minBytes[length]);
            upper[length].end=decWeightTrail(weight, length);
            upper[length].length=length;
            upper[length].count=trail-minBytes[length];
        }
        weight=truncateWeight(weight, length-1);
    }
    middle.end=decWeightTrail(weight, middleLength);

    /* set the middle range */
    middle.length=middleLength;
    if(middle.end>=middle.start) {
        middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
    } else {
        /* no middle range, eliminate overlaps */
        for(int32_t length=4; length>middleLength; --length) {
            if(lower[length].count>0 && upper[length].count>0) {
                // Note: The lowerEnd and upperStart weights are versions of
                // lowerLimit and upperLimit (which are lowerLimit<upperLimit),
                // truncated (still less-or-equal)
                // and then with their last bytes changed to the
                // maxByte (for lowerEnd) or minByte (for upperStart).
                const uint32_t lowerEnd=lower[length].end;
                const uint32_t upperStart=upper[length].start;
                UBool merged=FALSE;

                if(lowerEnd>upperStart) {
                    // These two lower and upper ranges collide.
                    // Since lowerLimit<upperLimit and lowerEnd and upperStart
                    // are versions with only their last bytes modified
                    // (and following ones removed/reset to 0),
                    // lowerEnd>upperStart is only possible
                    // if the leading bytes are equal
                    // and lastByte(lowerEnd)>lastByte(upperStart).
                    U_ASSERT(truncateWeight(lowerEnd, length-1)==
                            truncateWeight(upperStart, length-1));
                    // Intersect these two ranges.
                    lower[length].end=upper[length].end;
                    lower[length].count=
                            (int32_t)getWeightTrail(lower[length].end, length)-
                            (int32_t)getWeightTrail(lower[length].start, length)+1;
                    // count might be <=0 in which case there is no room,
                    // and the range-collecting code below will ignore this range.
                    merged=TRUE;
                } else if(lowerEnd==upperStart) {
                    // Not possible, unless minByte==maxByte which is not allowed.
                    U_ASSERT(minBytes[length]<maxBytes[length]);
                } else /* lowerEnd<upperStart */ {
                    if(incWeight(lowerEnd, length)==upperStart) {
                        // Merge adjacent ranges.
                        lower[length].end=upper[length].end;
                        lower[length].count+=upper[length].count;  // might be >countBytes
                        merged=TRUE;
                    }
                }
                if(merged) {
                    // Remove all shorter ranges.
                    // There was no room available for them between the ranges we just merged.
                    upper[length].count=0;
                    while(--length>middleLength) {
                        lower[length].count=upper[length].count=0;
                    }
                    break;
                }
            }
        }
    }

#ifdef UCOL_DEBUG
    /* print ranges */
    for(int32_t length=4; length>=2; --length) {
        if(lower[length].count>0) {
            printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n",
                   (long)length, (unsigned long)lower[length].start,
                   (unsigned long)lower[length].end, (long)lower[length].count);
        }
    }
    if(middle.count>0) {
        printf("middle   .start=0x%08lx .end=0x%08lx .count=%ld\n",
               (unsigned long)middle.start, (unsigned long)middle.end, (long)middle.count);
    }
    for(int32_t length=2; length<=4; ++length) {
        if(upper[length].count>0) {
            printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n",
                   (long)length, (unsigned long)upper[length].start,
                   (unsigned long)upper[length].end, (long)upper[length].count);
        }
    }
#endif

    /* copy the ranges, shortest first, into the result array */
    rangeCount=0;
    if(middle.count>0) {
        uprv_memcpy(ranges, &middle, sizeof(WeightRange));
        rangeCount=1;
    }
    for(int32_t length=middleLength+1; length<=4; ++length) {
        /* copy upper first so that later the middle range is more likely the first one to use */
        if(upper[length].count>0) {
            uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
            ++rangeCount;
        }
        if(lower[length].count>0) {
            uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
            ++rangeCount;
        }
    }
    return rangeCount>0;
}
390
391
/**
 * Tries to satisfy a request for n weights from the first few ranges whose
 * length is minLength or minLength+1.
 *
 * On success, rangeCount is reduced to the ranges that are used, the count
 * of the last used range is reduced to the remaining n if that range has
 * length minLength+1, and the used ranges are sorted by start weight.
 *
 * @param n         number of weights requested
 * @param minLength the shortest range length
 * @return TRUE if the short ranges hold at least n weights, otherwise FALSE
 */
UBool
CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
    // See if the first few minLength and minLength+1 ranges have enough weights.
    for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
        if(n <= ranges[i].count) {
            // Use the first few minLength and minLength+1 ranges.
            if(ranges[i].length > minLength) {
                // Reduce the number of weights from the last minLength+1 range
                // which might sort before some minLength ranges,
                // so that we use all weights in the minLength ranges.
                ranges[i].count = n;
            }
            rangeCount = i + 1;
#ifdef UCOL_DEBUG
            // Cast explicitly: int32_t is not long on all platforms.
            printf("take first %ld ranges\n", (long)rangeCount);
#endif

            if(rangeCount>1) {
                /* sort the ranges by weight values */
                UErrorCode errorCode=U_ZERO_ERROR;
                uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
                               compareRanges, NULL, FALSE, &errorCode);
                /* ignore error code: we know that the internal sort function will not fail here */
            }
            return TRUE;
        }
        n -= ranges[i].count;  // still >0
    }
    return FALSE;
}
421
422
/**
 * Tries to satisfy a request for n weights using only the leading ranges
 * of length minLength, by merging them into one span and lengthening part
 * (or all) of that span by one byte.
 *
 * On success, ranges[] holds either one lengthened range, or one minLength
 * range followed by one lengthened range, and rangeCount is 1 or 2.
 *
 * @param n         number of weights requested
 * @param minLength the shortest range length; ranges[0] is expected to
 *                  have this length
 * @return FALSE if even lengthening all minLength weights would not yield
 *         n weights; otherwise TRUE
 */
UBool
CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
    // See if the minLength ranges have enough weights
    // when we split one and lengthen the following ones.
    int32_t count = 0;
    int32_t minLengthRangeCount;
    for(minLengthRangeCount = 0;
            minLengthRangeCount < rangeCount &&
                ranges[minLengthRangeCount].length == minLength;
            ++minLengthRangeCount) {
        count += ranges[minLengthRangeCount].count;
    }

    int32_t nextCountBytes = countBytes(minLength + 1);
    if(n > count * nextCountBytes) { return FALSE; }

    // Use the minLength ranges. Merge them, and then split again as necessary.
    uint32_t start = ranges[0].start;
    uint32_t end = ranges[0].end;
    for(int32_t i = 1; i < minLengthRangeCount; ++i) {
        if(ranges[i].start < start) { start = ranges[i].start; }
        if(ranges[i].end > end) { end = ranges[i].end; }
    }

    // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
    // Goal:
    //   count1 + count2 * nextCountBytes = n
    //   count1 + count2 = count
    // These turn into
    //   (count - count2) + count2 * nextCountBytes = n
    // and then into the following count1 & count2 computations.
    int32_t count2 = (n - count) / (nextCountBytes - 1);  // number of weights to be lengthened
    int32_t count1 = count - count2;  // number of minLength weights
    if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) {
        // round up
        ++count2;
        --count1;
        U_ASSERT((count1 + count2 * nextCountBytes) >= n);
    }

    ranges[0].start = start;

    if(count1 == 0) {
        // Make one long range.
        ranges[0].end = end;
        ranges[0].count = count;
        lengthenRange(ranges[0]);
        rangeCount = 1;
    } else {
        // Split the range, lengthen the second part.
#ifdef UCOL_DEBUG
        // Report the merged span and the split; every argument is a variable
        // declared in this function, cast to match the %ld conversions.
        printf("split the merged range of %ld minLength ranges (out of %ld ranges) by %ld:%ld\n",
               (long)minLengthRangeCount, (long)rangeCount, (long)count1, (long)count2);
#endif

        // Next start = start + count1. First end = 1 before that.
        ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
        ranges[0].count = count1;

        ranges[1].start = incWeight(ranges[0].end, minLength);
        ranges[1].end = end;
        ranges[1].length = minLength;  // +1 when lengthened
        ranges[1].count = count2;  // *countBytes when lengthened
        lengthenRange(ranges[1]);
        rangeCount = 2;
    }
    return TRUE;
}
490
491
/*
492
 * call getWeightRanges and then determine heuristically
493
 * which ranges to use for a given number of weights between (excluding)
494
 * two limits
495
 */
496
UBool
497
0
CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
498
#ifdef UCOL_DEBUG
499
    puts("");
500
#endif
501
502
0
    if(!getWeightRanges(lowerLimit, upperLimit)) {
503
#ifdef UCOL_DEBUG
504
        printf("error: unable to get Weight ranges\n");
505
#endif
506
0
        return FALSE;
507
0
    }
508
509
    /* try until we find suitably large ranges */
510
0
    for(;;) {
511
        /* get the smallest number of bytes in a range */
512
0
        int32_t minLength=ranges[0].length;
513
514
0
        if(allocWeightsInShortRanges(n, minLength)) { break; }
515
516
0
        if(minLength == 4) {
517
#ifdef UCOL_DEBUG
518
            printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
519
                   minLengthCount, n);
520
#endif
521
0
            return FALSE;
522
0
        }
523
524
0
        if(allocWeightsInMinLengthRanges(n, minLength)) { break; }
525
526
        /* no good match, lengthen all minLength ranges and iterate */
527
#ifdef UCOL_DEBUG
528
        printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
529
#endif
530
0
        for(int32_t i=0; i<rangeCount && ranges[i].length==minLength; ++i) {
531
0
            lengthenRange(ranges[i]);
532
0
        }
533
0
    }
534
535
#ifdef UCOL_DEBUG
536
    puts("final ranges:");
537
    for(int32_t i=0; i<rangeCount; ++i) {
538
        printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
539
               i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count);
540
    }
541
#endif
542
543
0
    rangeIndex = 0;
544
0
    return TRUE;
545
0
}
546
547
uint32_t
548
0
CollationWeights::nextWeight() {
549
0
    if(rangeIndex >= rangeCount) {
550
0
        return 0xffffffff;
551
0
    } else {
552
        /* get the next weight */
553
0
        WeightRange &range = ranges[rangeIndex];
554
0
        uint32_t weight = range.start;
555
0
        if(--range.count == 0) {
556
            /* this range is finished */
557
0
            ++rangeIndex;
558
0
        } else {
559
            /* increment the weight for the next value */
560
0
            range.start = incWeight(weight, range.length);
561
0
            U_ASSERT(range.start <= range.end);
562
0
        }
563
564
0
        return weight;
565
0
    }
566
0
}
567
568
U_NAMESPACE_END
569
570
#endif /* #if !UCONFIG_NO_COLLATION */