/src/icu/icu4c/source/i18n/collationweights.cpp

Source (jump to first uncovered line)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*  
*******************************************************************************
*
*   Copyright (C) 1999-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  collationweights.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2001mar08 as ucol_wgt.cpp
*   created by: Markus W. Scherer
*
*   This file contains code for allocating n collation element weights
*   between two exclusive limits.
*   It is used only internally by the collation tailoring builder.
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "cmemory.h"
#include "collation.h"
#include "collationweights.h"
#include "uarrsort.h"
#include "uassert.h"

#ifdef UCOL_DEBUG
#   include <stdio.h>
#endif

U_NAMESPACE_BEGIN

/* collation element weight allocation -------------------------------------- */

/* helper functions for CE weights */

/**
 * Returns the last (trail) byte of a weight that is `length` bytes long.
 * Weights are stored left-aligned in 32 bits, so byte number `length`
 * (counted from 1 at the most significant end) sits 8*(4-length) bits up.
 */
static inline uint32_t
getWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift = 8 * (4 - length);
    const uint32_t shifted = weight >> shift;
    return shifted & 0xff;
}

/**
 * Returns `weight` cut to `length` bytes with its last byte replaced by `trail`.
 * The bytes before position `length` are kept; the byte at that position and
 * everything after it are cleared before `trail` is OR-ed in.
 */
static inline uint32_t
setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
    const int32_t shift = 8 * (4 - length);
    // Mask that keeps only the bytes above the trail position.
    const uint32_t leading = weight & (0xffffff00 << shift);
    const uint32_t placed = trail << shift;
    return static_cast<uint32_t>(leading | placed);
}

/**
 * Returns the idx-th byte of `weight`, counting from 1 at the most significant byte.
 * Reading byte idx is the same arithmetic as reading the trail byte of an
 * idx-byte weight, so this simply forwards to getWeightTrail().
 */
static inline uint32_t
getWeightByte(uint32_t weight, int32_t idx) {
    const uint32_t selected = getWeightTrail(weight, idx);
    return selected;
}

/**
 * Returns `weight` with its idx-th byte (1 = most significant) replaced by `byte`.
 * All other bytes are preserved. `byte` is shifted into place without masking.
 */
static inline uint32_t
setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
    const int32_t bitsThroughByte = 8 * idx;       // bits from the top down to and including the byte
    const int32_t shift = 32 - bitsThroughByte;    // bit position of the byte being replaced

    // Bits below the target byte.
    // A uint32_t must not be shifted right by 32: C/C++ leave a shift by the
    // full width undefined, and real hardware disagrees on the result
    // (PowerPC yields 0, x86 leaves the value unshifted). Hence the explicit 0.
    const uint32_t keepBelow =
        (bitsThroughByte < 32) ? (static_cast<uint32_t>(0xffffffff)) >> bitsThroughByte : 0;

    // Bits above the target byte.
    const uint32_t keepAbove = 0xffffff00 << shift;

    // keepBelow|keepAbove is all ones except for a 00 "hole" at the target byte.
    const uint32_t mask = keepBelow | keepAbove;
    return ((weight & mask) | (byte << shift));
}

/**
 * Returns `weight` with only its first `length` bytes kept;
 * all following bytes are set to zero.
 */
static inline uint32_t
truncateWeight(uint32_t weight, int32_t length) {
    const int32_t shift = 8 * (4 - length);
    const uint32_t keep = static_cast<uint32_t>(0xffffffff << shift);
    return static_cast<uint32_t>(weight & keep);
}

/**
 * Adds one to the byte at position `length` of `weight` by plain integer
 * addition. There is no min/max byte handling here: a carry propagates into
 * the preceding bytes, and the result wraps modulo 2^32.
 */
static inline uint32_t
incWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift = 8 * (4 - length);
    const unsigned long unit = 1UL << shift;  // value of one step at the trail byte
    return static_cast<uint32_t>(weight + unit);
}

/**
 * Subtracts one from the byte at position `length` of `weight` by plain
 * integer subtraction. There is no min/max byte handling here: a borrow
 * propagates into the preceding bytes, and the result wraps modulo 2^32.
 */
static inline uint32_t
decWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift = 8 * (4 - length);
    const unsigned long unit = 1UL << shift;  // value of one step at the trail byte
    return static_cast<uint32_t>(weight - unit);
}

/**
 * Creates an allocator with no ranges and all five min/max byte slots zeroed.
 * One of the initFor...() functions sets the actual byte limits.
 */
CollationWeights::CollationWeights()
        : middleLength(0), rangeIndex(0), rangeCount(0) {
    int32_t slot = 5;
    while(slot-- > 0) {
        maxBytes[slot] = 0;
        minBytes[slot] = 0;
    }
}

/**
 * Configures the byte limits for allocating primary weights.
 * The middle range uses single-byte weights (middleLength 1).
 *
 * @param compressible if true, the second byte is confined to the values strictly
 *        between the primary-compression low and high bytes;
 *        otherwise it may use 2..0xff like the third and fourth bytes.
 */
void
CollationWeights::initForPrimary(UBool compressible) {
    middleLength=1;

    // Lead byte: above the merge separator, up to the trail-weight byte.
    minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
    maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;

    // Second byte: depends on whether the lead byte is compressible.
    if(!compressible) {
        minBytes[2] = 2;
        maxBytes[2] = 0xff;
    } else {
        minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;
        maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;
    }

    // Third and fourth bytes share the same limits.
    for(int32_t i = 3; i <= 4; ++i) {
        minBytes[i] = 2;
        maxBytes[i] = 0xff;
    }
}

/**
 * Configures the byte limits for allocating secondary weights.
 * Secondary weights occupy only the lower 16 bits, so byte positions 1 and 2
 * are fixed at zero and the middle range starts at byte position 3.
 */
void
CollationWeights::initForSecondary() {
    middleLength=3;

    // Upper 16 bits are unused.
    for(int32_t i = 1; i <= 2; ++i) {
        minBytes[i] = 0;
        maxBytes[i] = 0;
    }

    // First real byte: above the level separator.
    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
    maxBytes[3] = 0xff;

    // Second real byte.
    minBytes[4] = 2;
    maxBytes[4] = 0xff;
}

/**
 * Configures the byte limits for allocating tertiary weights.
 * Tertiary weights occupy only the lower 16 bits, so byte positions 1 and 2
 * are fixed at zero and the middle range starts at byte position 3.
 * Each byte is limited to 6 bits (maximum 0x3f); the remaining bits are
 * used for case and quaternary weights.
 */
void
CollationWeights::initForTertiary() {
    middleLength=3;

    // Upper 16 bits are unused.
    for(int32_t i = 1; i <= 2; ++i) {
        minBytes[i] = 0;
        maxBytes[i] = 0;
    }

    // First real byte: above the level separator, at most 6 bits.
    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
    maxBytes[3] = 0x3f;

    // Second real byte, also at most 6 bits.
    minBytes[4] = 2;
    maxBytes[4] = 0x3f;
}

/**
 * Returns the weight that follows `weight` among weights of `length` bytes,
 * respecting the configured per-position byte limits.
 *
 * Works like an odometer: while the byte at the current position has already
 * reached its maximum, it is reset to the minimum and the carry moves one
 * position to the left. The first byte with room left is incremented.
 */
uint32_t
CollationWeights::incWeight(uint32_t weight, int32_t length) const {
    while(getWeightByte(weight, length) >= maxBytes[length]) {
        // This position rolls over; carry into the previous byte.
        weight=setWeightByte(weight, length, minBytes[length]);
        --length;
        U_ASSERT(length > 0);
    }
    const uint32_t current=getWeightByte(weight, length);
    return setWeightByte(weight, length, current+1);
}

/**
 * Returns `weight` advanced by `offset` steps among weights of `length` bytes,
 * respecting the configured per-position byte limits.
 *
 * The offset is added to the byte at the current position. If the sum exceeds
 * that position's maximum, the remainder (relative to the position's minimum)
 * stays in this byte and the quotient is carried to the previous position.
 */
uint32_t
CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
    offset += getWeightByte(weight, length);
    while (static_cast<uint32_t>(offset) > maxBytes[length]) {
        // Too large for this byte: keep the remainder here, carry the rest leftwards.
        const auto span = countBytes(length);
        offset -= minBytes[length];
        weight = setWeightByte(weight, length, minBytes[length] + offset % span);
        offset /= span;
        --length;
        U_ASSERT(length > 0);
        offset += getWeightByte(weight, length);
    }
    return setWeightByte(weight, length, offset);
}

/**
 * Extends every weight in `range` by one byte, in place.
 * The start gets the minimum byte and the end gets the maximum byte at the
 * new position, and the weight count is multiplied by the number of byte
 * values available at that position.
 */
void
CollationWeights::lengthenRange(WeightRange &range) const {
    const int32_t longer=range.length+1;
    range.length=longer;
    range.count*=countBytes(longer);
    range.start=setWeightTrail(range.start, longer, minBytes[longer]);
    range.end=setWeightTrail(range.end, longer, maxBytes[longer]);
}

/*
 * Comparator for uprv_sortArray: orders WeightRange items by ascending
 * start weight. Returns -1, 0 or 1; the context argument is not used.
 */
static int32_t U_CALLCONV
compareRanges(const void * /*context*/, const void *left, const void *right) {
    const uint32_t leftStart = static_cast<const CollationWeights::WeightRange*>(left)->start;
    const uint32_t rightStart = static_cast<const CollationWeights::WeightRange*>(right)->start;

    if(leftStart==rightStart) {
        return 0;
    }
    return (leftStart<rightStart) ? -1 : 1;
}

/**
 * Computes the ranges of weights that lie strictly between lowerLimit and
 * upperLimit and stores them, shortest first, in ranges[] / rangeCount.
 *
 * Up to three kinds of ranges are built:
 * - lower[length]: weights that share a (length-1)-byte prefix with lowerLimit
 *   and have a larger byte at position `length`;
 * - middle: weights of middleLength bytes between the truncated limits;
 * - upper[length]: weights that share a (length-1)-byte prefix with upperLimit
 *   and have a smaller byte at position `length`.
 * If there is no middle range, overlapping or adjacent lower/upper ranges of
 * the same length are intersected or merged, and shorter ranges are dropped.
 *
 * @param lowerLimit exclusive lower bound, must not be 0
 * @param upperLimit exclusive upper bound, must not be 0
 * @return true if at least one range with a positive count was found;
 *         false if lowerLimit>=upperLimit, if lowerLimit is a prefix of
 *         upperLimit, or if no range has room
 */
UBool
CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
    U_ASSERT(lowerLimit != 0);
    U_ASSERT(upperLimit != 0);

    /* get the lengths of the limits */
    int32_t lowerLength=lengthOfWeight(lowerLimit);
    int32_t upperLength=lengthOfWeight(upperLimit);

#ifdef UCOL_DEBUG
    // Cast explicitly: %lx/%ld expect (unsigned) long, which is wider than
    // the 32-bit types on LP64 platforms.
    printf("length of lower limit 0x%08lx is %ld\n",
           static_cast<unsigned long>(lowerLimit), static_cast<long>(lowerLength));
    printf("length of upper limit 0x%08lx is %ld\n",
           static_cast<unsigned long>(upperLimit), static_cast<long>(upperLength));
#endif
    U_ASSERT(lowerLength>=middleLength);
    // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.

    if(lowerLimit>=upperLimit) {
#ifdef UCOL_DEBUG
        printf("error: no space between lower & upper limits\n");
#endif
        return false;
    }

    /* check that neither is a prefix of the other */
    if(lowerLength<upperLength) {
        if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
#ifdef UCOL_DEBUG
            printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n",
                   static_cast<unsigned long>(lowerLimit), static_cast<unsigned long>(upperLimit));
#endif
            return false;
        }
    }
    /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */

    WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
    uprv_memset(lower, 0, sizeof(lower));
    uprv_memset(&middle, 0, sizeof(middle));
    uprv_memset(upper, 0, sizeof(upper));

    /*
     * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
     * range     minimum length
     * lower[4]  4
     * lower[3]  3
     * lower[2]  2
     * middle    1
     * upper[2]  2
     * upper[3]  3
     * upper[4]  4
     *
     * We are now going to calculate up to 7 ranges.
     * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
     */
    uint32_t weight=lowerLimit;
    for(int32_t length=lowerLength; length>middleLength; --length) {
        uint32_t trail=getWeightTrail(weight, length);
        if(trail<maxBytes[length]) {
            lower[length].start=incWeightTrail(weight, length);
            lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
            lower[length].length=length;
            lower[length].count=maxBytes[length]-trail;
        }
        weight=truncateWeight(weight, length-1);
    }
    if(weight<0xff000000) {
        middle.start=incWeightTrail(weight, middleLength);
    } else {
        // Prevent overflow for primary lead byte FF
        // which would yield a middle range starting at 0.
        middle.start=0xffffffff;  // no middle range
    }

    weight=upperLimit;
    for(int32_t length=upperLength; length>middleLength; --length) {
        uint32_t trail=getWeightTrail(weight, length);
        if(trail>minBytes[length]) {
            upper[length].start=setWeightTrail(weight, length, minBytes[length]);
            upper[length].end=decWeightTrail(weight, length);
            upper[length].length=length;
            upper[length].count=trail-minBytes[length];
        }
        weight=truncateWeight(weight, length-1);
    }
    middle.end=decWeightTrail(weight, middleLength);

    /* set the middle range */
    middle.length=middleLength;
    if(middle.end>=middle.start) {
        middle.count = static_cast<int32_t>((middle.end - middle.start) >> (8 * (4 - middleLength))) + 1;
    } else {
        /* no middle range, eliminate overlaps */
        for(int32_t length=4; length>middleLength; --length) {
            if(lower[length].count>0 && upper[length].count>0) {
                // Note: The lowerEnd and upperStart weights are versions of
                // lowerLimit and upperLimit (which are lowerLimit<upperLimit),
                // truncated (still less-or-equal)
                // and then with their last bytes changed to the
                // maxByte (for lowerEnd) or minByte (for upperStart).
                const uint32_t lowerEnd=lower[length].end;
                const uint32_t upperStart=upper[length].start;
                UBool merged=false;

                if(lowerEnd>upperStart) {
                    // These two lower and upper ranges collide.
                    // Since lowerLimit<upperLimit and lowerEnd and upperStart
                    // are versions with only their last bytes modified
                    // (and following ones removed/reset to 0),
                    // lowerEnd>upperStart is only possible
                    // if the leading bytes are equal
                    // and lastByte(lowerEnd)>lastByte(upperStart).
                    U_ASSERT(truncateWeight(lowerEnd, length-1)==
                            truncateWeight(upperStart, length-1));
                    // Intersect these two ranges.
                    lower[length].end=upper[length].end;
                    lower[length].count=
                            static_cast<int32_t>(getWeightTrail(lower[length].end, length)) -
                            static_cast<int32_t>(getWeightTrail(lower[length].start, length)) + 1;
                    // count might be <=0 in which case there is no room,
                    // and the range-collecting code below will ignore this range.
                    merged=true;
                } else if(lowerEnd==upperStart) {
                    // Not possible, unless minByte==maxByte which is not allowed.
                    U_ASSERT(minBytes[length]<maxBytes[length]);
                } else /* lowerEnd<upperStart */ {
                    if(incWeight(lowerEnd, length)==upperStart) {
                        // Merge adjacent ranges.
                        lower[length].end=upper[length].end;
                        lower[length].count+=upper[length].count;  // might be >countBytes
                        merged=true;
                    }
                }
                if(merged) {
                    // Remove all shorter ranges.
                    // There was no room available for them between the ranges we just merged.
                    upper[length].count=0;
                    while(--length>middleLength) {
                        lower[length].count=upper[length].count=0;
                    }
                    break;
                }
            }
        }
    }

#ifdef UCOL_DEBUG
    /* print ranges */
    for(int32_t length=4; length>=2; --length) {
        if(lower[length].count>0) {
            printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n",
                   static_cast<long>(length),
                   static_cast<unsigned long>(lower[length].start),
                   static_cast<unsigned long>(lower[length].end),
                   static_cast<long>(lower[length].count));
        }
    }
    if(middle.count>0) {
        printf("middle   .start=0x%08lx .end=0x%08lx .count=%ld\n",
               static_cast<unsigned long>(middle.start),
               static_cast<unsigned long>(middle.end),
               static_cast<long>(middle.count));
    }
    for(int32_t length=2; length<=4; ++length) {
        if(upper[length].count>0) {
            printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n",
                   static_cast<long>(length),
                   static_cast<unsigned long>(upper[length].start),
                   static_cast<unsigned long>(upper[length].end),
                   static_cast<long>(upper[length].count));
        }
    }
#endif

    /* copy the ranges, shortest first, into the result array */
    rangeCount=0;
    if(middle.count>0) {
        uprv_memcpy(ranges, &middle, sizeof(WeightRange));
        rangeCount=1;
    }
    for(int32_t length=middleLength+1; length<=4; ++length) {
        /* copy upper first so that later the middle range is more likely the first one to use */
        if(upper[length].count>0) {
            uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
            ++rangeCount;
        }
        if(lower[length].count>0) {
            uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
            ++rangeCount;
        }
    }
    return rangeCount>0;
}

/**
 * Tries to satisfy a request for n weights using only the leading ranges
 * whose length is minLength or minLength+1.
 *
 * Walks ranges[] from the front, subtracting each range's count from n until
 * one range can cover the remainder. On success, rangeCount is cut down to
 * the ranges actually used and those ranges are sorted by start weight.
 * On failure, ranges[] and rangeCount are left unchanged.
 *
 * @param n number of weights requested
 * @param minLength length of the shortest range, i.e. of ranges[0]
 * @return true if the short ranges suffice
 */
UBool
CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
    // See if the first few minLength and minLength+1 ranges have enough weights.
    for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
        if(n <= ranges[i].count) {
            // Use the first few minLength and minLength+1 ranges.
            if(ranges[i].length > minLength) {
                // Reduce the number of weights from the last minLength+1 range
                // which might sort before some minLength ranges,
                // so that we use all weights in the minLength ranges.
                ranges[i].count = n;
            }
            rangeCount = i + 1;
#ifdef UCOL_DEBUG
            // Cast explicitly: %ld expects a long, which is wider than int32_t on LP64 platforms.
            printf("take first %ld ranges\n", static_cast<long>(rangeCount));
#endif

            if(rangeCount>1) {
                /* sort the ranges by weight values */
                UErrorCode errorCode=U_ZERO_ERROR;
                uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
                               compareRanges, nullptr, false, &errorCode);
                /* ignore error code: we know that the internal sort function will not fail here */
            }
            return true;
        }
        n -= ranges[i].count;  // still >0
    }
    return false;
}

/**
 * Tries to satisfy a request for n weights using only the leading ranges of
 * exactly minLength bytes, by keeping some of their weights at minLength and
 * lengthening the rest to minLength+1.
 *
 * The minLength ranges are merged into one span (smallest start to largest
 * end). On success, ranges[] is rewritten to hold either one lengthened range
 * or a minLength range followed by a lengthened range, and rangeCount is set
 * to 1 or 2 accordingly. On failure nothing is modified.
 *
 * @param n number of weights requested
 * @param minLength length of the shortest range, i.e. of ranges[0]
 * @return true if the minLength ranges suffice once partly lengthened
 */
UBool
CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
    // See if the minLength ranges have enough weights
    // when we split one and lengthen the following ones.
    int32_t count = 0;
    int32_t minLengthRangeCount;
    for(minLengthRangeCount = 0;
            minLengthRangeCount < rangeCount &&
                ranges[minLengthRangeCount].length == minLength;
            ++minLengthRangeCount) {
        count += ranges[minLengthRangeCount].count;
    }

    int32_t nextCountBytes = countBytes(minLength + 1);
    if(n > count * nextCountBytes) { return false; }

    // Use the minLength ranges. Merge them, and then split again as necessary.
    uint32_t start = ranges[0].start;
    uint32_t end = ranges[0].end;
    for(int32_t i = 1; i < minLengthRangeCount; ++i) {
        if(ranges[i].start < start) { start = ranges[i].start; }
        if(ranges[i].end > end) { end = ranges[i].end; }
    }

    // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
    // Goal:
    //   count1 + count2 * nextCountBytes = n
    //   count1 + count2 = count
    // These turn into
    //   (count - count2) + count2 * nextCountBytes = n
    // and then into the following count1 & count2 computations.
    int32_t count2 = (n - count) / (nextCountBytes - 1);  // number of weights to be lengthened
    int32_t count1 = count - count2;  // number of minLength weights
    if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) {
        // round up
        ++count2;
        --count1;
        U_ASSERT((count1 + count2 * nextCountBytes) >= n);
    }

    ranges[0].start = start;

    if(count1 == 0) {
        // Make one long range.
        ranges[0].end = end;
        ranges[0].count = count;
        lengthenRange(ranges[0]);
        rangeCount = 1;
    } else {
        // Split the range, lengthen the second part.
#ifdef UCOL_DEBUG
        // Uses only variables that exist in this function, and casts to long
        // because %ld expects a long, which is wider than int32_t on LP64 platforms.
        printf("split the merged range (from %ld minLength ranges) by %ld:%ld\n",
               static_cast<long>(minLengthRangeCount),
               static_cast<long>(count1), static_cast<long>(count2));
#endif

        // Next start = start + count1. First end = 1 before that.
        ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
        ranges[0].count = count1;

        ranges[1].start = incWeight(ranges[0].end, minLength);
        ranges[1].end = end;
        ranges[1].length = minLength;  // +1 when lengthened
        ranges[1].count = count2;  // *countBytes when lengthened
        lengthenRange(ranges[1]);
        rangeCount = 2;
    }
    return true;
}

/*
 * Calls getWeightRanges() and then determines heuristically
 * which ranges to use for a given number of weights between (excluding)
 * two limits.
 *
 * Each iteration first tries the ranges of the shortest length and one byte
 * longer, then tries splitting the shortest ranges, and otherwise lengthens
 * all shortest ranges by one byte and repeats. Gives up once the shortest
 * ranges are already 4 bytes long.
 *
 * On success the chosen ranges are in ranges[0..rangeCount) and rangeIndex
 * is reset to 0 so that nextWeight() starts with the first range.
 *
 * Returns true if n weights could be allocated, otherwise false.
 */
UBool
CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
#ifdef UCOL_DEBUG
    puts("");
#endif

    if(!getWeightRanges(lowerLimit, upperLimit)) {
#ifdef UCOL_DEBUG
        printf("error: unable to get Weight ranges\n");
#endif
        return false;
    }

    /* try until we find suitably large ranges */
    for(;;) {
        /* get the smallest number of bytes in a range */
        int32_t minLength=ranges[0].length;

        if(allocWeightsInShortRanges(n, minLength)) { break; }

        if(minLength == 4) {
#ifdef UCOL_DEBUG
            // Total number of weights in the remaining ranges.
            // allocWeightsInShortRanges() leaves ranges[] untouched when it fails.
            long available = 0;
            for(int32_t i=0; i<rangeCount; ++i) {
                available += ranges[i].count;
            }
            printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
                   available, static_cast<long>(n));
#endif
            return false;
        }

        if(allocWeightsInMinLengthRanges(n, minLength)) { break; }

        /* no good match, lengthen all minLength ranges and iterate */
#ifdef UCOL_DEBUG
        // Cast explicitly: %ld expects a long, which is wider than int32_t on LP64 platforms.
        printf("lengthen the short ranges from %ld bytes to %ld and iterate\n",
               static_cast<long>(minLength), static_cast<long>(minLength+1));
#endif
        for(int32_t i=0; i<rangeCount && ranges[i].length==minLength; ++i) {
            lengthenRange(ranges[i]);
        }
    }

#ifdef UCOL_DEBUG
    puts("final ranges:");
    for(int32_t i=0; i<rangeCount; ++i) {
        printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
               static_cast<long>(i),
               static_cast<unsigned long>(ranges[i].start),
               static_cast<unsigned long>(ranges[i].end),
               static_cast<long>(ranges[i].length),
               static_cast<long>(ranges[i].count));
    }
#endif

    rangeIndex = 0;
    return true;
}

uint32_t
CollationWeights::nextWeight() {
    if(rangeIndex >= rangeCount) {
        return 0xffffffff;
    } else {
        /* get the next weight */
        WeightRange &range = ranges[rangeIndex];
        uint32_t weight = range.start;
        if(--range.count == 0) {
            /* this range is finished */
            ++rangeIndex;
        } else {
            /* increment the weight for the next value */
            range.start = incWeight(weight, range.length);
            U_ASSERT(range.start <= range.end);
        }

        return weight;
    }
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_COLLATION */

Coverage Report

Created: 2025-06-24 06:54

Line	Count	Source (jump to first uncovered line)
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		*******************************************************************************
5		*
6		* Copyright (C) 1999-2015, International Business Machines
7		* Corporation and others. All Rights Reserved.
8		*
9		*******************************************************************************
10		* file name: collationweights.cpp
11		* encoding: UTF-8
12		* tab size: 8 (not used)
13		* indentation:4
14		*
15		* created on: 2001mar08 as ucol_wgt.cpp
16		* created by: Markus W. Scherer
17		*
18		* This file contains code for allocating n collation element weights
19		* between two exclusive limits.
20		* It is used only internally by the collation tailoring builder.
21		*/
22
23		#include "unicode/utypes.h"
24
25		#if !UCONFIG_NO_COLLATION
26
27		#include "cmemory.h"
28		#include "collation.h"
29		#include "collationweights.h"
30		#include "uarrsort.h"
31		#include "uassert.h"
32
33		#ifdef UCOL_DEBUG
34		# include <stdio.h>
35		#endif
36
37		U_NAMESPACE_BEGIN
38
39		/* collation element weight allocation -------------------------------------- */
40
41		/* helper functions for CE weights */
42
43		static inline uint32_t
44	3.18M	getWeightTrail(uint32_t weight, int32_t length) {
45	3.18M	return (weight >> (8 * (4 - length))) & 0xff;
46	3.18M	}
47
48		static inline uint32_t
49	25.0k	setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
50	25.0k	length=8*(4-length);
51	25.0k	return static_cast<uint32_t>((weight & (0xffffff00 << length)) | (trail << length));
52	25.0k	}
53
54		static inline uint32_t
55	3.15M	getWeightByte(uint32_t weight, int32_t idx) {
56	3.15M	return getWeightTrail(weight, idx); /* same calculation */
57	3.15M	}
58
59		static inline uint32_t
60	3.15M	setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
61	3.15M	uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
62
63	3.15M	idx*=8;
64	3.15M	if(idx<32) {
65	1.91M	mask = (static_cast<uint32_t>(0xffffffff)) >> idx;
66	1.91M	} else {
67		// Do not use uint32_t>>32 because on some platforms that does not shift at all
68		// while we need it to become 0.
69		// PowerPC: 0xffffffff>>32 = 0 (wanted)
70		// x86: 0xffffffff>>32 = 0xffffffff (not wanted)
71		//
72		// ANSI C99 6.5.7 Bitwise shift operators:
73		// "If the value of the right operand is negative
74		// or is greater than or equal to the width of the promoted left operand,
75		// the behavior is undefined."
76	1.23M	mask=0;
77	1.23M	}
78	3.15M	idx=32-idx;
79	3.15M	mask|=0xffffff00<<idx;
80	3.15M	return ((weight & mask) | (byte << idx));
81	3.15M	}
82
83		static inline uint32_t
84	20.7k	truncateWeight(uint32_t weight, int32_t length) {
85	20.7k	return static_cast<uint32_t>(weight & (0xffffffff << (8 * (4 - length))));
86	20.7k	}
87
88		static inline uint32_t
89	30.5k	incWeightTrail(uint32_t weight, int32_t length) {
90	30.5k	return static_cast<uint32_t>(weight + (1UL << (8 * (4 - length))));
91	30.5k	}
92
93		static inline uint32_t
94	30.4k	decWeightTrail(uint32_t weight, int32_t length) {
95	30.4k	return static_cast<uint32_t>(weight - (1UL << (8 * (4 - length))));
96	30.4k	}
97
98		CollationWeights::CollationWeights()
99	12.0k	: middleLength(0), rangeIndex(0), rangeCount(0) {
100	72.3k	for(int32_t i = 0; i < 5; ++i) {
101	60.2k	minBytes[i] = maxBytes[i] = 0;
102	60.2k	}
103	12.0k	}
104
105		void
106	7.81k	CollationWeights::initForPrimary(UBool compressible) {
107	7.81k	middleLength=1;
108	7.81k	minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
109	7.81k	maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;
110	7.81k	if(compressible) {
111	3.22k	minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;
112	3.22k	maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;
113	4.59k	} else {
114	4.59k	minBytes[2] = 2;
115	4.59k	maxBytes[2] = 0xff;
116	4.59k	}
117	7.81k	minBytes[3] = 2;
118	7.81k	maxBytes[3] = 0xff;
119	7.81k	minBytes[4] = 2;
120	7.81k	maxBytes[4] = 0xff;
121	7.81k	}
122
123		void
124	5.82k	CollationWeights::initForSecondary() {
125		// We use only the lower 16 bits for secondary weights.
126	5.82k	middleLength=3;
127	5.82k	minBytes[1] = 0;
128	5.82k	maxBytes[1] = 0;
129	5.82k	minBytes[2] = 0;
130	5.82k	maxBytes[2] = 0;
131	5.82k	minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
132	5.82k	maxBytes[3] = 0xff;
133	5.82k	minBytes[4] = 2;
134	5.82k	maxBytes[4] = 0xff;
135	5.82k	}
136
137		void
138	7.52k	CollationWeights::initForTertiary() {
139		// We use only the lower 16 bits for tertiary weights.
140	7.52k	middleLength=3;
141	7.52k	minBytes[1] = 0;
142	7.52k	maxBytes[1] = 0;
143	7.52k	minBytes[2] = 0;
144	7.52k	maxBytes[2] = 0;
145		// We use only 6 bits per byte.
146		// The other bits are used for case & quaternary weights.
147	7.52k	minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
148	7.52k	maxBytes[3] = 0x3f;
149	7.52k	minBytes[4] = 2;
150	7.52k	maxBytes[4] = 0x3f;
151	7.52k	}
152
153		uint32_t
154	3.14M	CollationWeights::incWeight(uint32_t weight, int32_t length) const {
155	3.15M	for(;;) {
156	3.15M	uint32_t byte=getWeightByte(weight, length);
157	3.15M	if(byte<maxBytes[length]) {
158	3.14M	return setWeightByte(weight, length, byte+1);
159	3.14M	} else {
160		// Roll over, set this byte to the minimum and increment the previous one.
161	12.2k	weight=setWeightByte(weight, length, minBytes[length]);
162	12.2k	--length;
163	12.2k	U_ASSERT(length > 0);
164	12.2k	}
165	3.15M	}
166	3.14M	}
167
168		uint32_t
169	954	CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
170	993	for(;;) {
171	993	offset += getWeightByte(weight, length);
172	993	if (static_cast<uint32_t>(offset) <= maxBytes[length]) {
173	954	return setWeightByte(weight, length, offset);
174	954	} else {
175		// Split the offset between this byte and the previous one.
176	39	offset -= minBytes[length];
177	39	weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
178	39	offset /= countBytes(length);
179	39	--length;
180	39	U_ASSERT(length > 0);
181	39	}
182	993	}
183	954	}
184
185		void
186	3.21k	CollationWeights::lengthenRange(WeightRange &range) const {
187	3.21k	int32_t length=range.length+1;
188	3.21k	range.start=setWeightTrail(range.start, length, minBytes[length]);
189	3.21k	range.end=setWeightTrail(range.end, length, maxBytes[length]);
190	3.21k	range.count*=countBytes(length);
191	3.21k	range.length=length;
192	3.21k	}
193
194		/* for uprv_sortArray: sort ranges in weight order */
195		static int32_t U_CALLCONV
196	73	compareRanges(const void * /*context*/, const void *left, const void *right) {
197	73	uint32_t l, r;
198
199	73	l = static_cast<const CollationWeights::WeightRange*>(left)->start;
200	73	r = static_cast<const CollationWeights::WeightRange*>(right)->start;
201	73	if(l<r) {
202	73	return -1;
203	73	} else if(l>r) {
204	0	return 1;
205	0	} else {
206	0	return 0;
207	0	}
208	73	}
209
210		UBool
211	21.1k	CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
212	21.1k	U_ASSERT(lowerLimit != 0);
213	21.1k	U_ASSERT(upperLimit != 0);
214
215		/* get the lengths of the limits */
216	21.1k	int32_t lowerLength=lengthOfWeight(lowerLimit);
217	21.1k	int32_t upperLength=lengthOfWeight(upperLimit);
218
219		#ifdef UCOL_DEBUG
220		printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
221		printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
222		#endif
223	21.1k	U_ASSERT(lowerLength>=middleLength);
224		// Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
225
226	21.1k	if(lowerLimit>=upperLimit) {
227		#ifdef UCOL_DEBUG
228		printf("error: no space between lower & upper limits\n");
229		#endif
230	0	return false;
231	0	}
232
233		/* check that neither is a prefix of the other */
234	21.1k	if(lowerLength<upperLength) {
235	1.36k	if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
236		#ifdef UCOL_DEBUG
237		printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
238		#endif
239	0	return false;
240	0	}
241	1.36k	}
242		/* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
243
244	21.1k	WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
245	21.1k	uprv_memset(lower, 0, sizeof(lower));
246	21.1k	uprv_memset(&middle, 0, sizeof(middle));
247	21.1k	uprv_memset(upper, 0, sizeof(upper));
248
249		/*
250		* With the limit lengths of 1..4, there are up to 7 ranges for allocation:
251		* range minimum length
252		* lower[4] 4
253		* lower[3] 3
254		* lower[2] 2
255		* middle 1
256		* upper[2] 2
257		* upper[3] 3
258		* upper[4] 4
259		*
260		* We are now going to calculate up to 7 ranges.
261		* Some of them will typically overlap, so we will then have to merge and eliminate ranges.
262		*/
263	21.1k	uint32_t weight=lowerLimit;
264	30.5k	for(int32_t length=lowerLength; length>middleLength; --length) {
265	9.40k	uint32_t trail=getWeightTrail(weight, length);
266	9.40k	if(trail<maxBytes[length]) {
267	9.37k	lower[length].start=incWeightTrail(weight, length);
268	9.37k	lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
269	9.37k	lower[length].length=length;
270	9.37k	lower[length].count=maxBytes[length]-trail;
271	9.37k	}
272	9.40k	weight=truncateWeight(weight, length-1);
273	9.40k	}
274	21.1k	if(weight<0xff000000) {
275	21.1k	middle.start=incWeightTrail(weight, middleLength);
276	21.1k	} else {
277		// Prevent overflow for primary lead byte FF
278		// which would yield a middle range starting at 0.
279	0	middle.start=0xffffffff; // no middle range
280	0	}
281
282	21.1k	weight=upperLimit;
283	31.1k	for(int32_t length=upperLength; length>middleLength; --length) {
284	9.99k	uint32_t trail=getWeightTrail(weight, length);
285	9.99k	if(trail>minBytes[length]) {
286	9.24k	upper[length].start=setWeightTrail(weight, length, minBytes[length]);
287	9.24k	upper[length].end=decWeightTrail(weight, length);
288	9.24k	upper[length].length=length;
289	9.24k	upper[length].count=trail-minBytes[length];
290	9.24k	}
291	9.99k	weight=truncateWeight(weight, length-1);
292	9.99k	}
293	21.1k	middle.end=decWeightTrail(weight, middleLength);
294
295		/* set the middle range */
296	21.1k	middle.length=middleLength;
297	21.1k	if(middle.end>=middle.start) {
298	13.0k	middle.count = static_cast<int32_t>((middle.end - middle.start) >> (8 * (4 - middleLength))) + 1;
299	13.0k	} else {
300		/* no middle range, eliminate overlaps */
301	23.1k	for(int32_t length=4; length>middleLength; --length) {
302	21.6k	if(lower[length].count>0 && upper[length].count>0) {
303		// Note: The lowerEnd and upperStart weights are versions of
304		// lowerLimit and upperLimit (which are lowerLimit<upperLimit),
305		// truncated (still less-or-equal)
306		// and then with their last bytes changed to the
307		// maxByte (for lowerEnd) or minByte (for upperStart).
308	6.69k	const uint32_t lowerEnd=lower[length].end;
309	6.69k	const uint32_t upperStart=upper[length].start;
310	6.69k	UBool merged=false;
311
312	6.69k	if(lowerEnd>upperStart) {
313		// These two lower and upper ranges collide.
314		// Since lowerLimit<upperLimit and lowerEnd and upperStart
315		// are versions with only their last bytes modified
316		// (and following ones removed/reset to 0),
317		// lowerEnd>upperStart is only possible
318		// if the leading bytes are equal
319		// and lastByte(lowerEnd)>lastByte(upperStart).
320	6.68k	U_ASSERT(truncateWeight(lowerEnd, length-1)==
321	6.68k	truncateWeight(upperStart, length-1));
322		// Intersect these two ranges.
323	6.68k	lower[length].end=upper[length].end;
324	6.68k	lower[length].count=
325	6.68k	static_cast<int32_t>(getWeightTrail(lower[length].end, length)) -
326	6.68k	static_cast<int32_t>(getWeightTrail(lower[length].start, length)) + 1;
327		// count might be <=0 in which case there is no room,
328		// and the range-collecting code below will ignore this range.
329	6.68k	merged=true;
330	6.68k	} else if(lowerEnd==upperStart) {
331		// Not possible, unless minByte==maxByte which is not allowed.
332	0	U_ASSERT(minBytes[length]<maxBytes[length]);
333	16	} else /* lowerEnd<upperStart */ {
334	16	if(incWeight(lowerEnd, length)==upperStart) {
335		// Merge adjacent ranges.
336	16	lower[length].end=upper[length].end;
337	16	lower[length].count+=upper[length].count; // might be >countBytes
338	16	merged=true;
339	16	}
340	16	}
341	6.69k	if(merged) {
342		// Remove all shorter ranges.
343		// There was no room available for them between the ranges we just merged.
344	6.69k	upper[length].count=0;
345	8.51k	while(--length>middleLength) {
346	1.82k	lower[length].count=upper[length].count=0;
347	1.82k	}
348	6.69k	break;
349	6.69k	}
350	6.69k	}
351	21.6k	}
352	8.17k	}
353
354		#ifdef UCOL_DEBUG
355		/* print ranges */
356		for(int32_t length=4; length>=2; --length) {
357		if(lower[length].count>0) {
358		printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
359		}
360		}
361		if(middle.count>0) {
362		printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
363		}
364		for(int32_t length=2; length<=4; ++length) {
365		if(upper[length].count>0) {
366		printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
367		}
368		}
369		#endif
370
371		/* copy the ranges, shortest first, into the result array */
372	21.1k	rangeCount=0;
373	21.1k	if(middle.count>0) {
374	13.0k	uprv_memcpy(ranges, &middle, sizeof(WeightRange));
375	13.0k	rangeCount=1;
376	13.0k	}
377	57.9k	for(int32_t length=middleLength+1; length<=4; ++length) {
378		/* copy upper first so that later the middle range is more likely the first one to use */
379	36.8k	if(upper[length].count>0) {
380	782	uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
381	782	++rangeCount;
382	782	}
383	36.8k	if(lower[length].count>0) {
384	7.54k	uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
385	7.54k	++rangeCount;
386	7.54k	}
387	36.8k	}
388	21.1k	return rangeCount>0;
389	21.1k	}
390
391		UBool
392	21.4k	CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
393		// See if the first few minLength and minLength+1 ranges have enough weights.
394	24.8k	for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
395	21.5k	if(n <= ranges[i].count) {
396		// Use the first few minLength and minLength+1 ranges.
397	18.2k	if(ranges[i].length > minLength) {
398		// Reduce the number of weights from the last minLength+1 range
399		// which might sort before some minLength ranges,
400		// so that we use all weights in the minLength ranges.
401	55	ranges[i].count = n;
402	55	}
403	18.2k	rangeCount = i + 1;
404		#ifdef UCOL_DEBUG
405		printf("take first %ld ranges\n", rangeCount);
406		#endif
407
408	18.2k	if(rangeCount>1) {
409		/* sort the ranges by weight values */
410	73	UErrorCode errorCode=U_ZERO_ERROR;
411	73	uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
412	73	compareRanges, nullptr, false, &errorCode);
413		/* ignore error code: we know that the internal sort function will not fail here */
414	73	}
415	18.2k	return true;
416	18.2k	}
417	3.38k	n -= ranges[i].count; // still >0
418	3.38k	}
419	3.24k	return false;
420	21.4k	}
421
422		UBool
423	3.21k	CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
424		// See if the minLength ranges have enough weights
425		// when we split one and lengthen the following ones.
426	3.21k	int32_t count = 0;
427	3.21k	int32_t minLengthRangeCount;
428	3.21k	for(minLengthRangeCount = 0;
429	6.44k	minLengthRangeCount < rangeCount &&
430	6.44k	ranges[minLengthRangeCount].length == minLength;
431	3.22k	++minLengthRangeCount) {
432	3.22k	count += ranges[minLengthRangeCount].count;
433	3.22k	}
434
435	3.21k	int32_t nextCountBytes = countBytes(minLength + 1);
436	3.21k	if(n > count * nextCountBytes) { return false; }
437
438		// Use the minLength ranges. Merge them, and then split again as necessary.
439	2.94k	uint32_t start = ranges[0].start;
440	2.94k	uint32_t end = ranges[0].end;
441	2.94k	for(int32_t i = 1; i < minLengthRangeCount; ++i) {
442	7	if(ranges[i].start < start) { start = ranges[i].start; }
443	7	if(ranges[i].end > end) { end = ranges[i].end; }
444	7	}
445
446		// Calculate how to split the range between minLength (count1) and minLength+1 (count2).
447		// Goal:
448		// count1 + count2 * nextCountBytes = n
449		// count1 + count2 = count
450		// These turn into
451		// (count - count2) + count2 * nextCountBytes = n
452		// and then into the following count1 & count2 computations.
453	2.94k	int32_t count2 = (n - count) / (nextCountBytes - 1); // number of weights to be lengthened
454	2.94k	int32_t count1 = count - count2; // number of minLength weights
455	2.94k	if(count2 == 0 \|\| (count1 + count2 * nextCountBytes) < n) {
456		// round up
457	2.91k	++count2;
458	2.91k	--count1;
459	2.91k	U_ASSERT((count1 + count2 * nextCountBytes) >= n);
460	2.91k	}
461
462	2.94k	ranges[0].start = start;
463
464	2.94k	if(count1 == 0) {
465		// Make one long range.
466	1.98k	ranges[0].end = end;
467	1.98k	ranges[0].count = count;
468	1.98k	lengthenRange(ranges[0]);
469	1.98k	rangeCount = 1;
470	1.98k	} else {
471		// Split the range, lengthen the second part.
472		#ifdef UCOL_DEBUG
473		printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
474		splitRange, rangeCount, count1, count2);
475		#endif
476
477		// Next start = start + count1. First end = 1 before that.
478	954	ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
479	954	ranges[0].count = count1;
480
481	954	ranges[1].start = incWeight(ranges[0].end, minLength);
482	954	ranges[1].end = end;
483	954	ranges[1].length = minLength; // +1 when lengthened
484	954	ranges[1].count = count2; // *countBytes when lengthened
485	954	lengthenRange(ranges[1]);
486	954	rangeCount = 2;
487	954	}
488	2.94k	return true;
489	3.21k	}
490
491		/*
492		* call getWeightRanges and then determine heuristically
493		* which ranges to use for a given number of weights between (excluding)
494		* two limits
495		*/
496		UBool
497	21.1k	CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
498		#ifdef UCOL_DEBUG
499		puts("");
500		#endif
501
502	21.1k	if(!getWeightRanges(lowerLimit, upperLimit)) {
503		#ifdef UCOL_DEBUG
504		printf("error: unable to get Weight ranges\n");
505		#endif
506	0	return false;
507	0	}
508
509		/* try until we find suitably large ranges */
510	21.4k	for(;;) {
511		/* get the smallest number of bytes in a range */
512	21.4k	int32_t minLength=ranges[0].length;
513
514	21.4k	if(allocWeightsInShortRanges(n, minLength)) { break; }
515
516	3.24k	if(minLength == 4) {
517		#ifdef UCOL_DEBUG
518		printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
519		minLengthCount, n);
520		#endif
521	27	return false;
522	27	}
523
524	3.21k	if(allocWeightsInMinLengthRanges(n, minLength)) { break; }
525
526		/* no good match, lengthen all minLength ranges and iterate */
527		#ifdef UCOL_DEBUG
528		printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
529		#endif
530	556	for(int32_t i=0; i<rangeCount && ranges[i].length==minLength; ++i) {
531	278	lengthenRange(ranges[i]);
532	278	}
533	278	}
534
535		#ifdef UCOL_DEBUG
536		puts("final ranges:");
537		for(int32_t i=0; i<rangeCount; ++i) {
538		printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
539		i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count);
540		}
541		#endif
542
543	21.1k	rangeIndex = 0;
544	21.1k	return true;
545	21.1k	}
546
547		uint32_t
548	3.14M	CollationWeights::nextWeight() {
549	3.14M	if(rangeIndex >= rangeCount) {
550	0	return 0xffffffff;
551	3.14M	} else {
552		/* get the next weight */
553	3.14M	WeightRange &range = ranges[rangeIndex];
554	3.14M	uint32_t weight = range.start;
555	3.14M	if(--range.count == 0) {
556		/* this range is finished */
557	5.38k	++rangeIndex;
558	3.14M	} else {
559		/* increment the weight for the next value */
560	3.14M	range.start = incWeight(weight, range.length);
561	3.14M	U_ASSERT(range.start <= range.end);
562	3.14M	}
563
564	3.14M	return weight;
565	3.14M	}
566	3.14M	}
567
568		U_NAMESPACE_END
569
570		#endif /* #if !UCONFIG_NO_COLLATION */