/src/icu/source/i18n/collationweights.cpp

Source (jump to first uncovered line)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*  
*******************************************************************************
*
*   Copyright (C) 1999-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  collationweights.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2001mar08 as ucol_wgt.cpp
*   created by: Markus W. Scherer
*
*   This file contains code for allocating n collation element weights
*   between two exclusive limits.
*   It is used only internally by the collation tailoring builder.
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "cmemory.h"
#include "collation.h"
#include "collationweights.h"
#include "uarrsort.h"
#include "uassert.h"

#ifdef UCOL_DEBUG
#   include <stdio.h>
#endif

U_NAMESPACE_BEGIN

/* collation element weight allocation -------------------------------------- */

/* helper functions for CE weights */

/* Returns the byte at 1-based position `length` (the trail byte of a weight of that length). */
static inline uint32_t
getWeightTrail(uint32_t weight, int32_t length) {
    // Byte 1 is the most significant byte, byte 4 the least significant one.
    const int32_t shift=8*(4-length);
    const uint32_t shifted=weight>>shift;
    return shifted&0xff;
}

/*
 * Returns `weight` with its byte at 1-based position `length` replaced by `trail`
 * and all bytes after that position cleared.
 */
static inline uint32_t
setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
    const int32_t shift=8*(4-length);
    // Keeps only the bytes in front of the trail position.
    const uint32_t leadMask=0xffffff00<<shift;
    const uint32_t lead=weight&leadMask;
    return (uint32_t)(lead|(trail<<shift));
}

/* Returns the byte at 1-based position `idx` of `weight`. */
static inline uint32_t
getWeightByte(uint32_t weight, int32_t idx) {
    // Reading the idx-th byte is the same calculation as
    // reading the trail byte of a weight that is idx bytes long.
    const uint32_t byte=getWeightTrail(weight, idx);
    return byte;
}

/*
 * Returns `weight` with only its byte at 1-based position `idx` replaced by `byte`;
 * all other bytes are preserved.
 */
static inline uint32_t
setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
    const int32_t bitsThroughByte=8*idx;       // bits from the top through the idx-th byte
    const int32_t shift=32-bitsThroughByte;    // bit offset of the idx-th byte

    // Part of the mask that keeps the bytes after the idx-th byte.
    //
    // Do not use uint32_t>>32 because on some platforms that does not shift at all
    // while we need it to become 0.
    // PowerPC: 0xffffffff>>32 = 0           (wanted)
    // x86:     0xffffffff>>32 = 0xffffffff  (not wanted)
    //
    // ANSI C99 6.5.7 Bitwise shift operators:
    // "If the value of the right operand is negative
    // or is greater than or equal to the width of the promoted left operand,
    // the behavior is undefined."
    const uint32_t trailingMask=(bitsThroughByte<32) ? ((uint32_t)0xffffffff)>>bitsThroughByte : 0;

    // Part of the mask that keeps the bytes before the idx-th byte.
    const uint32_t leadingMask=0xffffff00<<shift;

    // Together: 0xffffffff except a 00 "hole" for the idx-th byte.
    const uint32_t mask=trailingMask|leadingMask;
    return (uint32_t)((weight&mask)|(byte<<shift));
}

/* Returns `weight` cut down to its first `length` bytes; the remaining bytes become 0. */
static inline uint32_t
truncateWeight(uint32_t weight, int32_t length) {
    const int32_t droppedBits=8*(4-length);
    const uint32_t prefixMask=0xffffffff<<droppedBits;
    return (uint32_t)(weight&prefixMask);
}

/*
 * Adds 1 to the byte at 1-based position `length` using plain integer arithmetic:
 * a carry propagates into the preceding bytes and the result wraps modulo 2^32.
 * No per-byte minimum/maximum limits are applied.
 */
static inline uint32_t
incWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift=8*(4-length);
    const unsigned long unit=1UL<<shift;  // value of 1 at the trail byte position
    return (uint32_t)(weight+unit);
}

/*
 * Subtracts 1 from the byte at 1-based position `length` using plain integer arithmetic:
 * a borrow propagates into the preceding bytes and the result wraps modulo 2^32.
 * No per-byte minimum/maximum limits are applied.
 */
static inline uint32_t
decWeightTrail(uint32_t weight, int32_t length) {
    const int32_t shift=8*(4-length);
    const unsigned long unit=1UL<<shift;  // value of 1 at the trail byte position
    return (uint32_t)(weight-unit);
}

/*
 * Creates an allocator with no ranges and all byte limits zeroed.
 * One of the initFor...() functions sets the actual limits.
 */
CollationWeights::CollationWeights()
        : middleLength(0), rangeIndex(0), rangeCount(0) {
    // Clear all five slots, including index 0 which the initFor...() functions never assign.
    for(int32_t pos = 0; pos < 5; ++pos) {
        maxBytes[pos] = 0;
        minBytes[pos] = 0;
    }
}

/*
 * Sets up the per-position byte limits for allocating primary weights.
 * The middle range uses weights that are one byte long.
 * compressible: selects the limits of the second byte, see below.
 */
void
CollationWeights::initForPrimary(UBool compressible) {
    middleLength=1;

    // Lead byte: above the merge separator, up to the trail weight byte.
    minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
    maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;

    // Second byte: strictly between the compression low/high bytes when compressible,
    // otherwise the same limits as the third and fourth bytes.
    minBytes[2] = compressible ? Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1 : 2;
    maxBytes[2] = compressible ? Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1 : 0xff;

    // Third and fourth bytes.
    for(int32_t pos = 3; pos <= 4; ++pos) {
        minBytes[pos] = 2;
        maxBytes[pos] = 0xff;
    }
}

/*
 * Sets up the per-position byte limits for allocating secondary weights.
 */
void
CollationWeights::initForSecondary() {
    // We use only the lower 16 bits for secondary weights,
    // so the first two byte positions stay fixed at 0
    // and the shortest weights end at byte position 3.
    middleLength=3;
    for(int32_t pos = 1; pos <= 2; ++pos) {
        minBytes[pos] = 0;
        maxBytes[pos] = 0;
    }

    // Byte 3 starts above the level separator.
    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
    maxBytes[3] = 0xff;

    minBytes[4] = 2;
    maxBytes[4] = 0xff;
}

/*
 * Sets up the per-position byte limits for allocating tertiary weights.
 */
void
CollationWeights::initForTertiary() {
    // We use only the lower 16 bits for tertiary weights,
    // so the first two byte positions stay fixed at 0
    // and the shortest weights end at byte position 3.
    middleLength=3;
    for(int32_t pos = 1; pos <= 2; ++pos) {
        minBytes[pos] = 0;
        maxBytes[pos] = 0;
    }

    // We use only 6 bits per byte.
    // The other bits are used for case & quaternary weights.
    const uint32_t sixBitMax = 0x3f;

    // Byte 3 starts above the level separator.
    minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
    maxBytes[3] = sixBitMax;

    minBytes[4] = 2;
    maxBytes[4] = sixBitMax;
}

/*
 * Returns the next weight after `weight` (which is `length` bytes long),
 * respecting the per-position minBytes/maxBytes limits.
 * The caller must ensure that the increment does not carry out of the first byte.
 */
uint32_t
CollationWeights::incWeight(uint32_t weight, int32_t length) const {
    uint32_t byte=getWeightByte(weight, length);
    while(byte>=maxBytes[length]) {
        // Roll over, set this byte to the minimum and increment the previous one.
        weight=setWeightByte(weight, length, minBytes[length]);
        --length;
        U_ASSERT(length > 0);
        byte=getWeightByte(weight, length);
    }
    // This byte still has room: bump it and stop carrying.
    return setWeightByte(weight, length, byte+1);
}

/*
 * Returns `weight` (which is `length` bytes long) advanced by `offset` steps,
 * respecting the per-position minBytes/maxBytes limits.
 * Works like adding a number in a mixed-radix system where byte position i
 * has countBytes(i) digits starting at minBytes[i].
 */
uint32_t
CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
    offset += getWeightByte(weight, length);
    while((uint32_t)offset > maxBytes[length]) {
        // Split the offset between this byte and the previous one.
        offset -= minBytes[length];  // now relative to the smallest digit of this position
        weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
        offset /= countBytes(length);  // the carry for the previous position
        --length;
        U_ASSERT(length > 0);
        offset += getWeightByte(weight, length);
    }
    // The sum fits into this byte position.
    return setWeightByte(weight, length, offset);
}

/*
 * Makes the weights of `range` one byte longer:
 * The new trail byte spans minBytes..maxBytes of the new length,
 * so the number of weights is multiplied by countBytes() of the new length.
 */
void
CollationWeights::lengthenRange(WeightRange &range) const {
    const int32_t newLength=range.length+1;
    range.length=newLength;
    range.count*=countBytes(newLength);
    // First weight gets the smallest new trail byte, last weight the largest one.
    range.start=setWeightTrail(range.start, newLength, minBytes[newLength]);
    range.end=setWeightTrail(range.end, newLength, maxBytes[newLength]);
}

/*
 * Comparator for uprv_sortArray: orders WeightRange items by ascending start weight.
 * The context pointer is not used.
 */
static int32_t U_CALLCONV
compareRanges(const void * /*context*/, const void *left, const void *right) {
    const uint32_t leftStart=((const CollationWeights::WeightRange *)left)->start;
    const uint32_t rightStart=((const CollationWeights::WeightRange *)right)->start;
    if(leftStart==rightStart) {
        return 0;
    }
    return leftStart<rightStart ? -1 : 1;
}

/*
 * Computes the ranges of weights that lie strictly between lowerLimit and upperLimit
 * and stores them in ranges[], shortest weights first, setting rangeCount.
 *
 * Ranges are built from the lower limit upwards (lower[]), from the upper limit
 * downwards (upper[]), plus one "middle" range of middleLength-byte weights.
 * If there is no middle range, colliding or adjacent lower/upper ranges are
 * intersected or merged.
 *
 * Returns FALSE if lowerLimit>=upperLimit, if lowerLimit is a prefix of upperLimit,
 * or if no range with any room was found; otherwise TRUE.
 */
UBool
CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
    U_ASSERT(lowerLimit != 0);
    U_ASSERT(upperLimit != 0);

    /* get the lengths of the limits */
    int32_t lowerLength=lengthOfWeight(lowerLimit);
    int32_t upperLength=lengthOfWeight(upperLimit);

#ifdef UCOL_DEBUG
    // Cast to (unsigned) long so that the arguments match the %lx/%ld conversions
    // on platforms where long is wider than 32 bits.
    printf("length of lower limit 0x%08lx is %ld\n", (unsigned long)lowerLimit, (long)lowerLength);
    printf("length of upper limit 0x%08lx is %ld\n", (unsigned long)upperLimit, (long)upperLength);
#endif
    U_ASSERT(lowerLength>=middleLength);
    // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.

    if(lowerLimit>=upperLimit) {
#ifdef UCOL_DEBUG
        printf("error: no space between lower & upper limits\n");
#endif
        return FALSE;
    }

    /* check that neither is a prefix of the other */
    if(lowerLength<upperLength) {
        if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
#ifdef UCOL_DEBUG
            printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n",
                   (unsigned long)lowerLimit, (unsigned long)upperLimit);
#endif
            return FALSE;
        }
    }
    /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */

    WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
    uprv_memset(lower, 0, sizeof(lower));
    uprv_memset(&middle, 0, sizeof(middle));
    uprv_memset(upper, 0, sizeof(upper));

    /*
     * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
     * range     minimum length
     * lower[4]  4
     * lower[3]  3
     * lower[2]  2
     * middle    1
     * upper[2]  2
     * upper[3]  3
     * upper[4]  4
     *
     * We are now going to calculate up to 7 ranges.
     * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
     */

    // Walk from the full lower limit towards shorter prefixes:
    // At each length, the weights above the limit's byte up to maxBytes are available.
    uint32_t weight=lowerLimit;
    for(int32_t length=lowerLength; length>middleLength; --length) {
        uint32_t trail=getWeightTrail(weight, length);
        if(trail<maxBytes[length]) {
            lower[length].start=incWeightTrail(weight, length);
            lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
            lower[length].length=length;
            lower[length].count=maxBytes[length]-trail;
        }
        weight=truncateWeight(weight, length-1);
    }
    if(weight<0xff000000) {
        middle.start=incWeightTrail(weight, middleLength);
    } else {
        // Prevent overflow for primary lead byte FF
        // which would yield a middle range starting at 0.
        middle.start=0xffffffff;  // no middle range
    }

    // Walk from the full upper limit towards shorter prefixes:
    // At each length, the weights from minBytes up to just below the limit's byte are available.
    weight=upperLimit;
    for(int32_t length=upperLength; length>middleLength; --length) {
        uint32_t trail=getWeightTrail(weight, length);
        if(trail>minBytes[length]) {
            upper[length].start=setWeightTrail(weight, length, minBytes[length]);
            upper[length].end=decWeightTrail(weight, length);
            upper[length].length=length;
            upper[length].count=trail-minBytes[length];
        }
        weight=truncateWeight(weight, length-1);
    }
    middle.end=decWeightTrail(weight, middleLength);

    /* set the middle range */
    middle.length=middleLength;
    if(middle.end>=middle.start) {
        middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
    } else {
        /* no middle range, eliminate overlaps */
        for(int32_t length=4; length>middleLength; --length) {
            if(lower[length].count>0 && upper[length].count>0) {
                // Note: The lowerEnd and upperStart weights are versions of
                // lowerLimit and upperLimit (which are lowerLimit<upperLimit),
                // truncated (still less-or-equal)
                // and then with their last bytes changed to the
                // maxByte (for lowerEnd) or minByte (for upperStart).
                const uint32_t lowerEnd=lower[length].end;
                const uint32_t upperStart=upper[length].start;
                UBool merged=FALSE;

                if(lowerEnd>upperStart) {
                    // These two lower and upper ranges collide.
                    // Since lowerLimit<upperLimit and lowerEnd and upperStart
                    // are versions with only their last bytes modified
                    // (and following ones removed/reset to 0),
                    // lowerEnd>upperStart is only possible
                    // if the leading bytes are equal
                    // and lastByte(lowerEnd)>lastByte(upperStart).
                    U_ASSERT(truncateWeight(lowerEnd, length-1)==
                            truncateWeight(upperStart, length-1));
                    // Intersect these two ranges.
                    lower[length].end=upper[length].end;
                    lower[length].count=
                            (int32_t)getWeightTrail(lower[length].end, length)-
                            (int32_t)getWeightTrail(lower[length].start, length)+1;
                    // count might be <=0 in which case there is no room,
                    // and the range-collecting code below will ignore this range.
                    merged=TRUE;
                } else if(lowerEnd==upperStart) {
                    // Not possible, unless minByte==maxByte which is not allowed.
                    U_ASSERT(minBytes[length]<maxBytes[length]);
                } else /* lowerEnd<upperStart */ {
                    if(incWeight(lowerEnd, length)==upperStart) {
                        // Merge adjacent ranges.
                        lower[length].end=upper[length].end;
                        lower[length].count+=upper[length].count;  // might be >countBytes
                        merged=TRUE;
                    }
                }
                if(merged) {
                    // Remove all shorter ranges.
                    // There was no room available for them between the ranges we just merged.
                    upper[length].count=0;
                    while(--length>middleLength) {
                        lower[length].count=upper[length].count=0;
                    }
                    break;
                }
            }
        }
    }

#ifdef UCOL_DEBUG
    /* print ranges */
    for(int32_t length=4; length>=2; --length) {
        if(lower[length].count>0) {
            printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n",
                   (long)length, (unsigned long)lower[length].start,
                   (unsigned long)lower[length].end, (long)lower[length].count);
        }
    }
    if(middle.count>0) {
        printf("middle   .start=0x%08lx .end=0x%08lx .count=%ld\n",
               (unsigned long)middle.start, (unsigned long)middle.end, (long)middle.count);
    }
    for(int32_t length=2; length<=4; ++length) {
        if(upper[length].count>0) {
            printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n",
                   (long)length, (unsigned long)upper[length].start,
                   (unsigned long)upper[length].end, (long)upper[length].count);
        }
    }
#endif

    /* copy the ranges, shortest first, into the result array */
    rangeCount=0;
    if(middle.count>0) {
        uprv_memcpy(ranges, &middle, sizeof(WeightRange));
        rangeCount=1;
    }
    for(int32_t length=middleLength+1; length<=4; ++length) {
        /* copy upper first so that later the middle range is more likely the first one to use */
        if(upper[length].count>0) {
            uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
            ++rangeCount;
        }
        if(lower[length].count>0) {
            uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
            ++rangeCount;
        }
    }
    return rangeCount>0;
}

/*
 * Tries to satisfy a request for n weights from the leading ranges in ranges[]
 * whose length is minLength or minLength+1, without lengthening any range.
 *
 * On success, rangeCount is reduced to the ranges that are used,
 * the last used range is trimmed if it is a minLength+1 range,
 * the used ranges are sorted by weight, and TRUE is returned.
 * On failure, ranges[] and rangeCount are left unchanged and FALSE is returned.
 */
UBool
CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
    // See if the first few minLength and minLength+1 ranges have enough weights.
    for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
        if(n <= ranges[i].count) {
            // Use the first few minLength and minLength+1 ranges.
            if(ranges[i].length > minLength) {
                // Reduce the number of weights from the last minLength+1 range
                // which might sort before some minLength ranges,
                // so that we use all weights in the minLength ranges.
                ranges[i].count = n;
            }
            rangeCount = i + 1;
#ifdef UCOL_DEBUG
            // Cast so that the argument matches the %ld conversion
            // on platforms where long is wider than 32 bits.
            printf("take first %ld ranges\n", (long)rangeCount);
#endif

            if(rangeCount>1) {
                /* sort the ranges by weight values */
                UErrorCode errorCode=U_ZERO_ERROR;
                uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
                               compareRanges, NULL, FALSE, &errorCode);
                /* ignore error code: we know that the internal sort function will not fail here */
            }
            return TRUE;
        }
        // This range is used completely; the rest must come from the following ranges.
        n -= ranges[i].count;  // still >0
    }
    return FALSE;
}

/*
 * Tries to satisfy a request for n weights using only the leading minLength ranges:
 * They are merged into one span, which is then either lengthened as a whole
 * or split into a minLength part and a lengthened (minLength+1) part.
 *
 * On success, ranges[] holds the one or two resulting ranges, rangeCount is set
 * accordingly, and TRUE is returned.
 * Returns FALSE, without modifying ranges[], if even lengthening all of the
 * minLength weights would not yield n weights.
 */
UBool
CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
    // See if the minLength ranges have enough weights
    // when we split one and lengthen the following ones.
    int32_t count = 0;
    int32_t minLengthRangeCount;
    for(minLengthRangeCount = 0;
            minLengthRangeCount < rangeCount &&
                ranges[minLengthRangeCount].length == minLength;
            ++minLengthRangeCount) {
        count += ranges[minLengthRangeCount].count;
    }

    int32_t nextCountBytes = countBytes(minLength + 1);
    if(n > count * nextCountBytes) { return FALSE; }

    // Use the minLength ranges. Merge them, and then split again as necessary.
    uint32_t start = ranges[0].start;
    uint32_t end = ranges[0].end;
    for(int32_t i = 1; i < minLengthRangeCount; ++i) {
        if(ranges[i].start < start) { start = ranges[i].start; }
        if(ranges[i].end > end) { end = ranges[i].end; }
    }

    // Calculate how to split the range between minLength (count1) and minLength+1 (count2).
    // Goal:
    //   count1 + count2 * nextCountBytes = n
    //   count1 + count2 = count
    // These turn into
    //   (count - count2) + count2 * nextCountBytes = n
    // and then into the following count1 & count2 computations.
    int32_t count2 = (n - count) / (nextCountBytes - 1);  // number of weights to be lengthened
    int32_t count1 = count - count2;  // number of minLength weights
    if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) {
        // round up
        ++count2;
        --count1;
        U_ASSERT((count1 + count2 * nextCountBytes) >= n);
    }

    ranges[0].start = start;

    if(count1 == 0) {
        // Make one long range.
        ranges[0].end = end;
        ranges[0].count = count;
        lengthenRange(ranges[0]);
        rangeCount = 1;
    } else {
        // Split the range, lengthen the second part.
#ifdef UCOL_DEBUG
        // The minLength ranges have been merged into one span above,
        // so report how many ranges were merged and how the span is split.
        // Casts make the arguments match the %ld conversions.
        printf("split the merged range (out of %ld minLength ranges) by %ld:%ld\n",
               (long)minLengthRangeCount, (long)count1, (long)count2);
#endif

        // Next start = start + count1. First end = 1 before that.
        ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
        ranges[0].count = count1;

        ranges[1].start = incWeight(ranges[0].end, minLength);
        ranges[1].end = end;
        ranges[1].length = minLength;  // +1 when lengthened
        ranges[1].count = count2;  // *countBytes when lengthened
        lengthenRange(ranges[1]);
        rangeCount = 2;
    }
    return TRUE;
}

/*
 * call getWeightRanges and then determine heuristically
 * which ranges to use for a given number of weights between (excluding)
 * two limits
 *
 * On success, ranges[]/rangeCount describe the weights to hand out,
 * rangeIndex is reset to 0 so that nextWeight() starts with the first range,
 * and TRUE is returned.
 * Returns FALSE if there is no room between the limits
 * or if even 4-byte weights do not provide n weights.
 */
UBool
CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
#ifdef UCOL_DEBUG
    puts("");
#endif

    if(!getWeightRanges(lowerLimit, upperLimit)) {
#ifdef UCOL_DEBUG
        printf("error: unable to get Weight ranges\n");
#endif
        return FALSE;
    }

    /* try until we find suitably large ranges */
    for(;;) {
        /* get the smallest number of bytes in a range */
        int32_t minLength=ranges[0].length;

        if(allocWeightsInShortRanges(n, minLength)) { break; }

        if(minLength == 4) {
#ifdef UCOL_DEBUG
            // The ranges are ordered shortest first, so with minLength==4
            // every remaining range holds 4-byte weights and cannot be lengthened.
            // Their combined count is the most we could have allocated.
            int32_t availableCount=0;
            for(int32_t i=0; i<rangeCount; ++i) {
                availableCount+=ranges[i].count;
            }
            printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
                   (long)availableCount, (long)n);
#endif
            return FALSE;
        }

        if(allocWeightsInMinLengthRanges(n, minLength)) { break; }

        /* no good match, lengthen all minLength ranges and iterate */
#ifdef UCOL_DEBUG
        printf("lengthen the short ranges from %ld bytes to %ld and iterate\n",
               (long)minLength, (long)(minLength+1));
#endif
        for(int32_t i=0; i<rangeCount && ranges[i].length==minLength; ++i) {
            lengthenRange(ranges[i]);
        }
    }

#ifdef UCOL_DEBUG
    // Casts make the arguments match the %ld/%lx conversions
    // on platforms where long is wider than 32 bits.
    puts("final ranges:");
    for(int32_t i=0; i<rangeCount; ++i) {
        printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
               (long)i, (unsigned long)ranges[i].start, (unsigned long)ranges[i].end,
               (long)ranges[i].length, (long)ranges[i].count);
    }
#endif

    rangeIndex = 0;
    return TRUE;
}

uint32_t
CollationWeights::nextWeight() {
    if(rangeIndex >= rangeCount) {
        return 0xffffffff;
    } else {
        /* get the next weight */
        WeightRange &range = ranges[rangeIndex];
        uint32_t weight = range.start;
        if(--range.count == 0) {
            /* this range is finished */
            ++rangeIndex;
        } else {
            /* increment the weight for the next value */
            range.start = incWeight(weight, range.length);
            U_ASSERT(range.start <= range.end);
        }

        return weight;
    }
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_COLLATION */

Coverage Report

Created: 2025-01-28 06:38

Line	Count	Source (jump to first uncovered line)
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		*******************************************************************************
5		*
6		* Copyright (C) 1999-2015, International Business Machines
7		* Corporation and others. All Rights Reserved.
8		*
9		*******************************************************************************
10		* file name: collationweights.cpp
11		* encoding: UTF-8
12		* tab size: 8 (not used)
13		* indentation:4
14		*
15		* created on: 2001mar08 as ucol_wgt.cpp
16		* created by: Markus W. Scherer
17		*
18		* This file contains code for allocating n collation element weights
19		* between two exclusive limits.
20		* It is used only internally by the collation tailoring builder.
21		*/
22
23		#include "unicode/utypes.h"
24
25		#if !UCONFIG_NO_COLLATION
26
27		#include "cmemory.h"
28		#include "collation.h"
29		#include "collationweights.h"
30		#include "uarrsort.h"
31		#include "uassert.h"
32
33		#ifdef UCOL_DEBUG
34		# include <stdio.h>
35		#endif
36
37		U_NAMESPACE_BEGIN
38
39		/* collation element weight allocation -------------------------------------- */
40
41		/* helper functions for CE weights */
42
43		static inline uint32_t
44	0	getWeightTrail(uint32_t weight, int32_t length) {
45	0	return (uint32_t)(weight>>(8*(4-length)))&0xff;
46	0	}
47
48		static inline uint32_t
49	0	setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
50	0	length=8*(4-length);
51	0	return (uint32_t)((weight&(0xffffff00<<length))\|(trail<<length));
52	0	}
53
54		static inline uint32_t
55	0	getWeightByte(uint32_t weight, int32_t idx) {
56	0	return getWeightTrail(weight, idx); /* same calculation */
57	0	}
58
59		static inline uint32_t
60	0	setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
61	0	uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
62
63	0	idx*=8;
64	0	if(idx<32) {
65	0	mask=((uint32_t)0xffffffff)>>idx;
66	0	} else {
67		// Do not use uint32_t>>32 because on some platforms that does not shift at all
68		// while we need it to become 0.
69		// PowerPC: 0xffffffff>>32 = 0 (wanted)
70		// x86: 0xffffffff>>32 = 0xffffffff (not wanted)
71		//
72		// ANSI C99 6.5.7 Bitwise shift operators:
73		// "If the value of the right operand is negative
74		// or is greater than or equal to the width of the promoted left operand,
75		// the behavior is undefined."
76	0	mask=0;
77	0	}
78	0	idx=32-idx;
79	0	mask\|=0xffffff00<<idx;
80	0	return (uint32_t)((weight&mask)\|(byte<<idx));
81	0	}
82
83		static inline uint32_t
84	0	truncateWeight(uint32_t weight, int32_t length) {
85	0	return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
86	0	}
87
88		static inline uint32_t
89	0	incWeightTrail(uint32_t weight, int32_t length) {
90	0	return (uint32_t)(weight+(1UL<<(8*(4-length))));
91	0	}
92
93		static inline uint32_t
94	0	decWeightTrail(uint32_t weight, int32_t length) {
95	0	return (uint32_t)(weight-(1UL<<(8*(4-length))));
96	0	}
97
98		CollationWeights::CollationWeights()
99	0	: middleLength(0), rangeIndex(0), rangeCount(0) {
100	0	for(int32_t i = 0; i < 5; ++i) {
101	0	minBytes[i] = maxBytes[i] = 0;
102	0	}
103	0	}
104
105		void
106	0	CollationWeights::initForPrimary(UBool compressible) {
107	0	middleLength=1;
108	0	minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1;
109	0	maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE;
110	0	if(compressible) {
111	0	minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1;
112	0	maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1;
113	0	} else {
114	0	minBytes[2] = 2;
115	0	maxBytes[2] = 0xff;
116	0	}
117	0	minBytes[3] = 2;
118	0	maxBytes[3] = 0xff;
119	0	minBytes[4] = 2;
120	0	maxBytes[4] = 0xff;
121	0	}
122
123		void
124	0	CollationWeights::initForSecondary() {
125		// We use only the lower 16 bits for secondary weights.
126	0	middleLength=3;
127	0	minBytes[1] = 0;
128	0	maxBytes[1] = 0;
129	0	minBytes[2] = 0;
130	0	maxBytes[2] = 0;
131	0	minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
132	0	maxBytes[3] = 0xff;
133	0	minBytes[4] = 2;
134	0	maxBytes[4] = 0xff;
135	0	}
136
137		void
138	0	CollationWeights::initForTertiary() {
139		// We use only the lower 16 bits for tertiary weights.
140	0	middleLength=3;
141	0	minBytes[1] = 0;
142	0	maxBytes[1] = 0;
143	0	minBytes[2] = 0;
144	0	maxBytes[2] = 0;
145		// We use only 6 bits per byte.
146		// The other bits are used for case & quaternary weights.
147	0	minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1;
148	0	maxBytes[3] = 0x3f;
149	0	minBytes[4] = 2;
150	0	maxBytes[4] = 0x3f;
151	0	}
152
153		uint32_t
154	0	CollationWeights::incWeight(uint32_t weight, int32_t length) const {
155	0	for(;;) {
156	0	uint32_t byte=getWeightByte(weight, length);
157	0	if(byte<maxBytes[length]) {
158	0	return setWeightByte(weight, length, byte+1);
159	0	} else {
160		// Roll over, set this byte to the minimum and increment the previous one.
161	0	weight=setWeightByte(weight, length, minBytes[length]);
162	0	--length;
163	0	U_ASSERT(length > 0);
164	0	}
165	0	}
166	0	}
167
168		uint32_t
169	0	CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const {
170	0	for(;;) {
171	0	offset += getWeightByte(weight, length);
172	0	if((uint32_t)offset <= maxBytes[length]) {
173	0	return setWeightByte(weight, length, offset);
174	0	} else {
175		// Split the offset between this byte and the previous one.
176	0	offset -= minBytes[length];
177	0	weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length));
178	0	offset /= countBytes(length);
179	0	--length;
180	0	U_ASSERT(length > 0);
181	0	}
182	0	}
183	0	}
184
185		void
186	0	CollationWeights::lengthenRange(WeightRange &range) const {
187	0	int32_t length=range.length+1;
188	0	range.start=setWeightTrail(range.start, length, minBytes[length]);
189	0	range.end=setWeightTrail(range.end, length, maxBytes[length]);
190	0	range.count*=countBytes(length);
191	0	range.length=length;
192	0	}
193
194		/* for uprv_sortArray: sort ranges in weight order */
195		static int32_t U_CALLCONV
196	0	compareRanges(const void * /context/, const void left, const void right) {
197	0	uint32_t l, r;
198
199	0	l=((const CollationWeights::WeightRange *)left)->start;
200	0	r=((const CollationWeights::WeightRange *)right)->start;
201	0	if(l<r) {
202	0	return -1;
203	0	} else if(l>r) {
204	0	return 1;
205	0	} else {
206	0	return 0;
207	0	}
208	0	}
209
210		UBool
211	0	CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) {
212	0	U_ASSERT(lowerLimit != 0);
213	0	U_ASSERT(upperLimit != 0);
214
215		/* get the lengths of the limits */
216	0	int32_t lowerLength=lengthOfWeight(lowerLimit);
217	0	int32_t upperLength=lengthOfWeight(upperLimit);
218
219		#ifdef UCOL_DEBUG
220		printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
221		printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
222		#endif
223	0	U_ASSERT(lowerLength>=middleLength);
224		// Permit upperLength<middleLength: The upper limit for secondaries is 0x10000.
225
226	0	if(lowerLimit>=upperLimit) {
227		#ifdef UCOL_DEBUG
228		printf("error: no space between lower & upper limits\n");
229		#endif
230	0	return FALSE;
231	0	}
232
233		/* check that neither is a prefix of the other */
234	0	if(lowerLength<upperLength) {
235	0	if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
236		#ifdef UCOL_DEBUG
237		printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
238		#endif
239	0	return FALSE;
240	0	}
241	0	}
242		/* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
243
244	0	WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
245	0	uprv_memset(lower, 0, sizeof(lower));
246	0	uprv_memset(&middle, 0, sizeof(middle));
247	0	uprv_memset(upper, 0, sizeof(upper));
248
249		/*
250		* With the limit lengths of 1..4, there are up to 7 ranges for allocation:
251		* range minimum length
252		* lower[4] 4
253		* lower[3] 3
254		* lower[2] 2
255		* middle 1
256		* upper[2] 2
257		* upper[3] 3
258		* upper[4] 4
259		*
260		* We are now going to calculate up to 7 ranges.
261		* Some of them will typically overlap, so we will then have to merge and eliminate ranges.
262		*/
263	0	uint32_t weight=lowerLimit;
264	0	for(int32_t length=lowerLength; length>middleLength; --length) {
265	0	uint32_t trail=getWeightTrail(weight, length);
266	0	if(trail<maxBytes[length]) {
267	0	lower[length].start=incWeightTrail(weight, length);
268	0	lower[length].end=setWeightTrail(weight, length, maxBytes[length]);
269	0	lower[length].length=length;
270	0	lower[length].count=maxBytes[length]-trail;
271	0	}
272	0	weight=truncateWeight(weight, length-1);
273	0	}
274	0	if(weight<0xff000000) {
275	0	middle.start=incWeightTrail(weight, middleLength);
276	0	} else {
277		// Prevent overflow for primary lead byte FF
278		// which would yield a middle range starting at 0.
279	0	middle.start=0xffffffff; // no middle range
280	0	}
281
282	0	weight=upperLimit;
283	0	for(int32_t length=upperLength; length>middleLength; --length) {
284	0	uint32_t trail=getWeightTrail(weight, length);
285	0	if(trail>minBytes[length]) {
286	0	upper[length].start=setWeightTrail(weight, length, minBytes[length]);
287	0	upper[length].end=decWeightTrail(weight, length);
288	0	upper[length].length=length;
289	0	upper[length].count=trail-minBytes[length];
290	0	}
291	0	weight=truncateWeight(weight, length-1);
292	0	}
293	0	middle.end=decWeightTrail(weight, middleLength);
294
295		/* set the middle range */
296	0	middle.length=middleLength;
297	0	if(middle.end>=middle.start) {
298	0	middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1;
299	0	} else {
300		/* no middle range, eliminate overlaps */
301	0	for(int32_t length=4; length>middleLength; --length) {
302	0	if(lower[length].count>0 && upper[length].count>0) {
303		// Note: The lowerEnd and upperStart weights are versions of
304		// lowerLimit and upperLimit (which are lowerLimit<upperLimit),
305		// truncated (still less-or-equal)
306		// and then with their last bytes changed to the
307		// maxByte (for lowerEnd) or minByte (for upperStart).
308	0	const uint32_t lowerEnd=lower[length].end;
309	0	const uint32_t upperStart=upper[length].start;
310	0	UBool merged=FALSE;
311
312	0	if(lowerEnd>upperStart) {
313		// These two lower and upper ranges collide.
314		// Since lowerLimit<upperLimit and lowerEnd and upperStart
315		// are versions with only their last bytes modified
316		// (and following ones removed/reset to 0),
317		// lowerEnd>upperStart is only possible
318		// if the leading bytes are equal
319		// and lastByte(lowerEnd)>lastByte(upperStart).
320	0	U_ASSERT(truncateWeight(lowerEnd, length-1)==
321	0	truncateWeight(upperStart, length-1));
322		// Intersect these two ranges.
323	0	lower[length].end=upper[length].end;
324	0	lower[length].count=
325	0	(int32_t)getWeightTrail(lower[length].end, length)-
326	0	(int32_t)getWeightTrail(lower[length].start, length)+1;
327		// count might be <=0 in which case there is no room,
328		// and the range-collecting code below will ignore this range.
329	0	merged=TRUE;
330	0	} else if(lowerEnd==upperStart) {
331		// Not possible, unless minByte==maxByte which is not allowed.
332	0	U_ASSERT(minBytes[length]<maxBytes[length]);
333	0	} else /* lowerEnd<upperStart */ {
334	0	if(incWeight(lowerEnd, length)==upperStart) {
335		// Merge adjacent ranges.
336	0	lower[length].end=upper[length].end;
337	0	lower[length].count+=upper[length].count; // might be >countBytes
338	0	merged=TRUE;
339	0	}
340	0	}
341	0	if(merged) {
342		// Remove all shorter ranges.
343		// There was no room available for them between the ranges we just merged.
344	0	upper[length].count=0;
345	0	while(--length>middleLength) {
346	0	lower[length].count=upper[length].count=0;
347	0	}
348	0	break;
349	0	}
350	0	}
351	0	}
352	0	}
353
354		#ifdef UCOL_DEBUG
355		/* print ranges */
356		for(int32_t length=4; length>=2; --length) {
357		if(lower[length].count>0) {
358		printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
359		}
360		}
361		if(middle.count>0) {
362		printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
363		}
364		for(int32_t length=2; length<=4; ++length) {
365		if(upper[length].count>0) {
366		printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
367		}
368		}
369		#endif
370
371		/* copy the ranges, shortest first, into the result array */
372	0	rangeCount=0;
373	0	if(middle.count>0) {
374	0	uprv_memcpy(ranges, &middle, sizeof(WeightRange));
375	0	rangeCount=1;
376	0	}
377	0	for(int32_t length=middleLength+1; length<=4; ++length) {
378		/* copy upper first so that later the middle range is more likely the first one to use */
379	0	if(upper[length].count>0) {
380	0	uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
381	0	++rangeCount;
382	0	}
383	0	if(lower[length].count>0) {
384	0	uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
385	0	++rangeCount;
386	0	}
387	0	}
388	0	return rangeCount>0;
389	0	}
390
391		UBool
392	0	CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) {
393		// See if the first few minLength and minLength+1 ranges have enough weights.
394	0	for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) {
395	0	if(n <= ranges[i].count) {
396		// Use the first few minLength and minLength+1 ranges.
397	0	if(ranges[i].length > minLength) {
398		// Reduce the number of weights from the last minLength+1 range
399		// which might sort before some minLength ranges,
400		// so that we use all weights in the minLength ranges.
401	0	ranges[i].count = n;
402	0	}
403	0	rangeCount = i + 1;
404		#ifdef UCOL_DEBUG
405		printf("take first %ld ranges\n", rangeCount);
406		#endif
407
408	0	if(rangeCount>1) {
409		/* sort the ranges by weight values */
410	0	UErrorCode errorCode=U_ZERO_ERROR;
411	0	uprv_sortArray(ranges, rangeCount, sizeof(WeightRange),
412	0	compareRanges, NULL, FALSE, &errorCode);
413		/* ignore error code: we know that the internal sort function will not fail here */
414	0	}
415	0	return TRUE;
416	0	}
417	0	n -= ranges[i].count; // still >0
418	0	}
419	0	return FALSE;
420	0	}
421
422		UBool
423	0	CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) {
424		// See if the minLength ranges have enough weights
425		// when we split one and lengthen the following ones.
426	0	int32_t count = 0;
427	0	int32_t minLengthRangeCount;
428	0	for(minLengthRangeCount = 0;
429	0	minLengthRangeCount < rangeCount &&
430	0	ranges[minLengthRangeCount].length == minLength;
431	0	++minLengthRangeCount) {
432	0	count += ranges[minLengthRangeCount].count;
433	0	}
434
435	0	int32_t nextCountBytes = countBytes(minLength + 1);
436	0	if(n > count * nextCountBytes) { return FALSE; }
437
438		// Use the minLength ranges. Merge them, and then split again as necessary.
439	0	uint32_t start = ranges[0].start;
440	0	uint32_t end = ranges[0].end;
441	0	for(int32_t i = 1; i < minLengthRangeCount; ++i) {
442	0	if(ranges[i].start < start) { start = ranges[i].start; }
443	0	if(ranges[i].end > end) { end = ranges[i].end; }
444	0	}
445
446		// Calculate how to split the range between minLength (count1) and minLength+1 (count2).
447		// Goal:
448		// count1 + count2 * nextCountBytes = n
449		// count1 + count2 = count
450		// These turn into
451		// (count - count2) + count2 * nextCountBytes = n
452		// and then into the following count1 & count2 computations.
453	0	int32_t count2 = (n - count) / (nextCountBytes - 1); // number of weights to be lengthened
454	0	int32_t count1 = count - count2; // number of minLength weights
455	0	if(count2 == 0 \|\| (count1 + count2 * nextCountBytes) < n) {
456		// round up
457	0	++count2;
458	0	--count1;
459	0	U_ASSERT((count1 + count2 * nextCountBytes) >= n);
460	0	}
461
462	0	ranges[0].start = start;
463
464	0	if(count1 == 0) {
465		// Make one long range.
466	0	ranges[0].end = end;
467	0	ranges[0].count = count;
468	0	lengthenRange(ranges[0]);
469	0	rangeCount = 1;
470	0	} else {
471		// Split the range, lengthen the second part.
472		#ifdef UCOL_DEBUG
473		printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n",
474		splitRange, rangeCount, count1, count2);
475		#endif
476
477		// Next start = start + count1. First end = 1 before that.
478	0	ranges[0].end = incWeightByOffset(start, minLength, count1 - 1);
479	0	ranges[0].count = count1;
480
481	0	ranges[1].start = incWeight(ranges[0].end, minLength);
482	0	ranges[1].end = end;
483	0	ranges[1].length = minLength; // +1 when lengthened
484	0	ranges[1].count = count2; // *countBytes when lengthened
485	0	lengthenRange(ranges[1]);
486	0	rangeCount = 2;
487	0	}
488	0	return TRUE;
489	0	}
490
491		/*
492		* call getWeightRanges and then determine heuristically
493		* which ranges to use for a given number of weights between (excluding)
494		* two limits
495		*/
496		UBool
497	0	CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) {
498		#ifdef UCOL_DEBUG
499		puts("");
500		#endif
501
502	0	if(!getWeightRanges(lowerLimit, upperLimit)) {
503		#ifdef UCOL_DEBUG
504		printf("error: unable to get Weight ranges\n");
505		#endif
506	0	return FALSE;
507	0	}
508
509		/* try until we find suitably large ranges */
510	0	for(;;) {
511		/* get the smallest number of bytes in a range */
512	0	int32_t minLength=ranges[0].length;
513
514	0	if(allocWeightsInShortRanges(n, minLength)) { break; }
515
516	0	if(minLength == 4) {
517		#ifdef UCOL_DEBUG
518		printf("error: the maximum number of %ld weights is insufficient for n=%ld\n",
519		minLengthCount, n);
520		#endif
521	0	return FALSE;
522	0	}
523
524	0	if(allocWeightsInMinLengthRanges(n, minLength)) { break; }
525
526		/* no good match, lengthen all minLength ranges and iterate */
527		#ifdef UCOL_DEBUG
528		printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
529		#endif
530	0	for(int32_t i=0; i<rangeCount && ranges[i].length==minLength; ++i) {
531	0	lengthenRange(ranges[i]);
532	0	}
533	0	}
534
535		#ifdef UCOL_DEBUG
536		puts("final ranges:");
537		for(int32_t i=0; i<rangeCount; ++i) {
538		printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n",
539		i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count);
540		}
541		#endif
542
543	0	rangeIndex = 0;
544	0	return TRUE;
545	0	}
546
547		uint32_t
548	0	CollationWeights::nextWeight() {
549	0	if(rangeIndex >= rangeCount) {
550	0	return 0xffffffff;
551	0	} else {
552		/* get the next weight */
553	0	WeightRange &range = ranges[rangeIndex];
554	0	uint32_t weight = range.start;
555	0	if(--range.count == 0) {
556		/* this range is finished */
557	0	++rangeIndex;
558	0	} else {
559		/* increment the weight for the next value */
560	0	range.start = incWeight(weight, range.length);
561	0	U_ASSERT(range.start <= range.end);
562	0	}
563
564	0	return weight;
565	0	}
566	0	}
567
568		U_NAMESPACE_END
569
570		#endif /* #if !UCONFIG_NO_COLLATION */