/src/icu/source/i18n/coleitr.cpp

Source (jump to first uncovered line)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/

/*
* File coleitr.cpp
*
* Created by: Helena Shih
*
* Modification History:
*
*  Date      Name        Description
*
*  6/23/97   helena      Adding comments to make code more readable.
* 08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
* 12/10/99   aliu        Ported Thai collation support from Java.
* 01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
* 02/19/01   swquek      Removed CollationElementIterator() since it is 
*                        private constructor and no calls are made to it
* 2012-2014  markus      Rewritten in C++ again.
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/chariter.h"
#include "unicode/coleitr.h"
#include "unicode/tblcoll.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationiterator.h"
#include "collationsets.h"
#include "collationtailoring.h"
#include "uassert.h"
#include "uhash.h"
#include "utf16collationiterator.h"
#include "uvectr32.h"

/* Constants --------------------------------------------------------------- */

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)

/* CollationElementIterator public constructor/destructor ------------------ */

// Copy constructor: initializes every member to its empty state and then
// lets the assignment operator do the actual copying from other.
CollationElementIterator::CollationElementIterator(
                                         const CollationElementIterator& other) 
        : UObject(other),
          iter_(NULL),
          rbc_(NULL),
          otherHalf_(0),
          dir_(0),
          offsets_(NULL) {
    operator=(other);
}

// Destructor: deletes the collation iterator and the offsets vector
// held by this object (deleting a NULL pointer is a no-op).
CollationElementIterator::~CollationElementIterator() {
    delete iter_;
    delete offsets_;
}

/* CollationElementIterator public methods --------------------------------- */

namespace {

// Assembles the first old-style 32-bit CE from the two halves of a 64-bit CE:
// the upper 16 bits of p, then bits 31..24 of lower32, then bits 15..8 of lower32.
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    const uint32_t top16 = p & 0xffff0000;
    const uint32_t middle8 = (lower32 >> 16) & 0xff00;
    const uint32_t bottom8 = (lower32 >> 8) & 0xff;
    return top16 | middle8 | bottom8;
}
// Assembles the second old-style 32-bit CE from the two halves of a 64-bit CE:
// the lower 16 bits of p, then bits 23..16 of lower32, then bits 5..0 of lower32.
// The result is 0 when none of those bits are set.
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    const uint32_t top16 = p << 16;
    const uint32_t middle8 = (lower32 >> 8) & 0xff00;
    const uint32_t bottom6 = lower32 & 0x3f;
    return top16 | middle8 | bottom6;
}
// Returns true if the 64-bit CE has any of the bits that getSecondHalf()
// extracts (bits 47..32, 23..16 and 5..0), i.e. its second half is non-zero.
UBool ceNeedsTwoParts(int64_t ce) {
    const int64_t secondHalfBits = INT64_C(0xffff00ff003f);
    return (ce & secondHalfBits) != 0;
}

}  // namespace

// Returns the current text offset. While iterating backward with recorded
// offsets, the value is looked up in offsets_; otherwise it is whatever the
// underlying collation iterator reports.
int32_t CollationElementIterator::getOffset() const
{
    const UBool useRecordedOffsets =
            dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty();
    if (!useRecordedOffsets) {
        return iter_->getOffset();
    }
    // CollationIterator::previousCE() decrements the CEs length
    // while it pops CEs from its internal buffer.
    int32_t index = iter_->getCEsLength();
    if (otherHalf_ != 0) {
        // In the middle of a 64-bit CE: report the trailing CE offset.
        index += 1;
    }
    U_ASSERT(index < offsets_->size());
    return offsets_->elementAti(index);
}

/**
* Returns the next old-style 32-bit collation element in forward direction.
* A 64-bit CE with a non-zero second half is delivered as two consecutive
* results: the first half now, the continuation half on the following call.
* @param status the error code status; set to U_INVALID_STATE_ERROR when
*        called while the iterator is moving backward.
* @return the next element's ordering, or NULLORDER if status indicates a
*         failure, the direction change is illegal, or no CE remains.
*/
int32_t CollationElementIterator::next(UErrorCode& status)
{
    if (U_FAILURE(status)) { return NULLORDER; }
    if (dir_ < 0) {
        // illegal change of direction
        status = U_INVALID_STATE_ERROR;
        return NULLORDER;
    }
    if (dir_ <= 1) {
        // First next() after construction/reset (0, iter_ is already at the
        // start of the text) or after setOffset() (1): switch to forward mode.
        dir_ = 2;
    } else if (otherHalf_ != 0) {
        // Continuing forward iteration: deliver the pending continuation CE.
        const uint32_t pending = otherHalf_;
        otherHalf_ = 0;
        return pending;
    }
    // No need to keep all CEs in the buffer when we iterate.
    iter_->clearCEsIfNoneRemaining();
    const int64_t ce = iter_->nextCE(status);
    if (ce == Collation::NO_CE) { return NULLORDER; }
    // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
    const uint32_t upper32 = (uint32_t)(ce >> 32);
    const uint32_t lower32 = (uint32_t)ce;
    const uint32_t continuation = getSecondHalf(upper32, lower32);
    if (continuation != 0) {
        otherHalf_ = continuation | 0xc0;  // continuation CE
    }
    return getFirstHalf(upper32, lower32);
}

// Inequality is defined as the negation of operator==.
bool CollationElementIterator::operator!=(
                                  const CollationElementIterator& other) const
{
    return !operator==(other);
}

// Two iterators are equal when they are the same object, or when their
// collators, pending half-CE, normalized direction, text and underlying
// collation iterators all compare equal. Checks run in that order and stop
// at the first mismatch.
bool CollationElementIterator::operator==(
                                    const CollationElementIterator& that) const
{
    if (this == &that) {
        return true;
    }
    // Same collator object, or distinct collators that compare equal.
    if (rbc_ != that.rbc_ && !(*rbc_ == *that.rbc_)) {
        return false;
    }
    if (otherHalf_ != that.otherHalf_) {
        return false;
    }
    if (normalizeDir() != that.normalizeDir()) {
        return false;
    }
    if (!(string_ == that.string_)) {
        return false;
    }
    return *iter_ == *that.iter_;
}

/**
* Returns the previous old-style 32-bit collation element in backward direction.
* A 64-bit CE with a non-zero second half is delivered as two consecutive
* results: the continuation half now, the first half on the following call.
* @param status the error code status; set to U_INVALID_STATE_ERROR when
*        called while the iterator is moving forward, or to
*        U_MEMORY_ALLOCATION_ERROR if the offsets vector cannot be allocated.
* @return the previous element's ordering, or NULLORDER if status indicates a
*         failure, the direction change is illegal, or no CE remains.
*/
int32_t CollationElementIterator::previous(UErrorCode& status)
{
    if (U_FAILURE(status)) { return NULLORDER; }
    if (dir_ > 1) {
        // illegal change of direction
        status = U_INVALID_STATE_ERROR;
        return NULLORDER;
    }
    if (dir_ < 0) {
        // Continuing backward iteration: deliver the pending first half.
        if (otherHalf_ != 0) {
            const uint32_t pending = otherHalf_;
            otherHalf_ = 0;
            return pending;
        }
    } else {
        if (dir_ == 0) {
            // Fresh or reset iterator: start from the end of the text.
            iter_->resetToOffset(string_.length());
        }
        // dir_ == 1 means previous() after setOffset(); the position is already set.
        dir_ = -1;
    }
    if (offsets_ == NULL) {
        offsets_ = new UVector32(status);
        if (offsets_ == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULLORDER;
        }
    }
    // If we already have expansion CEs, then we also have offsets.
    // Otherwise remember the trailing offset in case we need to
    // write offsets for an artificial expansion.
    int32_t limitOffset = 0;
    if (iter_->getCEsLength() == 0) {
        limitOffset = iter_->getOffset();
    }
    const int64_t ce = iter_->previousCE(*offsets_, status);
    if (ce == Collation::NO_CE) { return NULLORDER; }
    // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
    const uint32_t upper32 = (uint32_t)(ce >> 32);
    const uint32_t lower32 = (uint32_t)ce;
    const uint32_t firstHalf = getFirstHalf(upper32, lower32);
    const uint32_t secondHalf = getSecondHalf(upper32, lower32);
    if (secondHalf == 0) {
        return firstHalf;
    }
    if (offsets_->isEmpty()) {
        // When we convert a single 64-bit CE into two 32-bit CEs,
        // we need to make this artificial expansion behave like a normal expansion.
        // See CollationIterator::previousCE().
        offsets_->addElement(iter_->getOffset(), status);
        offsets_->addElement(limitOffset, status);
    }
    // Going backward, the continuation comes out first; keep the first half for later.
    otherHalf_ = firstHalf;
    return secondHalf | 0xc0;  // continuation CE
}

/**
* Rewinds the iterator to offset 0 of the text, discards any pending
* half-CE and clears the iteration direction.
*/
void CollationElementIterator::reset()
{
    otherHalf_ = 0;
    dir_ = 0;
    iter_->resetToOffset(0);
}

/**
* Repositions the iterator at newOffset in the text, or at an earlier offset
* when newOffset lies strictly inside the text and the code unit there is
* reported as unsafe by the collator.
* On success the pending half-CE is cleared and dir_ is set to 1, the state
* that next() and previous() treat as "after setOffset()".
* @param newOffset the requested text offset. Values <= 0 or >= the text
*        length are passed to the underlying iterator unchanged.
* @param status the error code status. Nothing is done if it already
*        indicates a failure. If nextCE() fails during the forward scan, the
*        function returns early without updating otherHalf_ or dir_.
*/
void CollationElementIterator::setOffset(int32_t newOffset, 
                                         UErrorCode& status)
{
    if (U_FAILURE(status)) { return; }
    // Only offsets strictly inside the text may need adjusting.
    if (0 < newOffset && newOffset < string_.length()) {
        int32_t offset = newOffset;
        // Walk backward until the code unit at offset is not unsafe
        // (for a lead surrogate, until the whole code point is not unsafe),
        // or until offset reaches 0.
        do {
            UChar c = string_.charAt(offset);
            if (!rbc_->isUnsafe(c) ||
                    (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
                break;
            }
            // Back up to before this unsafe character.
            --offset;
        } while (offset > 0);
        if (offset < newOffset) {
            // We might have backed up more than necessary.
            // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
            // but for text "chu" setOffset(2) should remain at 2
            // although we initially back up to offset 0.
            // Find the last safe offset no greater than newOffset by iterating forward.
            int32_t lastSafeOffset = offset;
            do {
                iter_->resetToOffset(lastSafeOffset);
                // Fetch CEs until the iterator's text offset moves past lastSafeOffset.
                do {
                    iter_->nextCE(status);
                    if (U_FAILURE(status)) { return; }
                } while ((offset = iter_->getOffset()) == lastSafeOffset);
                // Accept the new offset only if it does not overshoot the request.
                if (offset <= newOffset) {
                    lastSafeOffset = offset;
                }
            } while (offset < newOffset);
            newOffset = lastSafeOffset;
        }
    }
    // The forward scan above moved iter_; put it at the final position.
    iter_->resetToOffset(newOffset);
    otherHalf_ = 0;
    dir_ = 1;
}

/**
* Sets the source to the new source string.
*/
void CollationElementIterator::setText(const UnicodeString& source,
                                       UErrorCode& status)
{
    if (U_FAILURE(status)) {
        return;
    }

    string_ = source;
    const UChar *s = string_.getBuffer();
    CollationIterator *newIter;
    UBool numeric = rbc_->settings->isNumeric();
    if (rbc_->settings->dontCheckFCD()) {
        newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
    } else {
        newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
    }
    if (newIter == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    delete iter_;
    iter_ = newIter;
    otherHalf_ = 0;
    dir_ = 0;
}

// Replaces the text with the contents of a character iterator: the iterator's
// text is extracted into string_ and then handed to the UnicodeString overload.
// Does nothing if status already indicates a failure.
void CollationElementIterator::setText(CharacterIterator& source, 
                                       UErrorCode& status)
{
    if (U_SUCCESS(status)) {
        source.getText(string_);
        setText(string_, status);
    }
}

// Masks an old-style 32-bit order according to the collator's strength:
// primary strength keeps only the upper 16 bits, secondary strength keeps
// the upper 24 bits, and any other strength returns the order unchanged.
int32_t CollationElementIterator::strengthOrder(int32_t order) const
{
    switch ((UColAttributeValue)rbc_->settings->getStrength()) {
    case UCOL_PRIMARY:
        order &= 0xffff0000;
        break;
    case UCOL_SECONDARY:
        order &= 0xffffff00;
        break;
    default:
        // Nothing to mask off.
        break;
    }
    return order;
}

/* CollationElementIterator private constructors/destructors --------------- */

/**
* Constructs an iterator over the source string using the specified collator.
* All members start out empty; setText() then copies the text and creates
* the underlying collation iterator, reporting problems through status.
*/
CollationElementIterator::CollationElementIterator(
                                               const UnicodeString &source,
                                               const RuleBasedCollator *coll,
                                               UErrorCode &status)
        : iter_(NULL),
          rbc_(coll),
          otherHalf_(0),
          dir_(0),
          offsets_(NULL) {
    setText(source, status);
}

/**
* Constructs an iterator over the text of a character iterator using the
* specified collator. All members start out empty; setText() then extracts
* the text and creates the underlying collation iterator, reporting problems
* through status.
*/
CollationElementIterator::CollationElementIterator(
                                           const CharacterIterator &source,
                                           const RuleBasedCollator *coll,
                                           UErrorCode &status)
        : iter_(NULL),
          rbc_(coll),
          otherHalf_(0),
          dir_(0),
          offsets_(NULL) {
    // setText() takes a non-const reference, but we only call
    // source.getText() which should be const anyway.
    CharacterIterator &mutableSource = const_cast<CharacterIterator &>(source);
    setText(mutableSource, status);
}

/* CollationElementIterator private methods -------------------------------- */

/**
* Assignment operator: makes this iterator a copy of other (collator, text,
* pending half-CE, direction, iteration position and backward offsets).
*
* If the underlying collation iterator of other cannot be cloned (it is of an
* unexpected type, or the allocation fails), this object is left as it was.
*
* @param other the iterator to copy from.
* @return a reference to this object.
*/
const CollationElementIterator& CollationElementIterator::operator=(
                                         const CollationElementIterator& other)
{
    if (this == &other) {
        return *this;
    }

    // The clone constructors below take the text buffer the new iterator will
    // run over. That must be the buffer string_ holds AFTER it has received
    // other's text; assigning the text may switch string_ to a different
    // buffer, which would leave an iterator built over the old one dangling.
    // Keep the old text around so that it can be put back if cloning fails.
    UnicodeString previousText(string_);
    string_ = other.string_;
    const UChar *text = string_.getBuffer();

    CollationIterator *newIter;
    const FCDUTF16CollationIterator *otherFCDIter =
            dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
    if(otherFCDIter != NULL) {
        newIter = new FCDUTF16CollationIterator(*otherFCDIter, text);
    } else {
        const UTF16CollationIterator *otherIter =
                dynamic_cast<const UTF16CollationIterator *>(other.iter_);
        if(otherIter != NULL) {
            newIter = new UTF16CollationIterator(*otherIter, text);
        } else {
            newIter = NULL;
        }
    }
    if(newIter == NULL) {
        // Nothing was cloned: restore the text that the existing iter_ belongs
        // to and do not touch any other state, including offsets_.
        string_ = previousText;
        return *this;
    }

    delete iter_;
    iter_ = newIter;
    rbc_ = other.rbc_;
    otherHalf_ = other.otherHalf_;
    dir_ = other.dir_;

    if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
        // Backward iteration in progress: getOffset() relies on these offsets.
        UErrorCode errorCode = U_ZERO_ERROR;
        if(offsets_ == NULL) {
            offsets_ = new UVector32(other.offsets_->size(), errorCode);
        }
        if(offsets_ != NULL) {
            offsets_->assign(*other.offsets_, errorCode);
        }
    } else if(offsets_ != NULL) {
        // other has no offsets to copy; drop ours so that stale values from
        // the previous text are never combined with the copied state.
        offsets_->removeAllElements();
    }
    return *this;
}

namespace {

class MaxExpSink : public ContractionsAndExpansions::CESink {
public:
    MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
    virtual ~MaxExpSink();
    virtual void handleCE(int64_t /*ce*/) {}
    virtual void handleExpansion(const int64_t ces[], int32_t length) {
        if (length <= 1) {
            // We do not need to add single CEs into the map.
            return;
        }
        int32_t count = 0;  // number of CE "halves"
        for (int32_t i = 0; i < length; ++i) {
            count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
        }
        // last "half" of the last CE
        int64_t ce = ces[length - 1];
        uint32_t p = (uint32_t)(ce >> 32);
        uint32_t lower32 = (uint32_t)ce;
        uint32_t lastHalf = getSecondHalf(p, lower32);
        if (lastHalf == 0) {
            lastHalf = getFirstHalf(p, lower32);
            U_ASSERT(lastHalf != 0);
        } else {
            lastHalf |= 0xc0;  // old-style continuation CE
        }
        if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
            uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
        }
    }

private:
    UHashtable *maxExpansions;
    UErrorCode &errorCode;
};

MaxExpSink::~MaxExpSink() {}

}  // namespace

// Builds the hashtable of maximum expansion sizes for the given collation
// data by feeding every expansion through a MaxExpSink.
// Returns the new hashtable, or NULL if errorCode indicates a failure on
// entry or at any later step; a table opened here is closed before
// returning NULL.
UHashtable *
CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) { return NULL; }
    UHashtable *table = uhash_open(uhash_hashLong, uhash_compareLong,
                                   uhash_compareLong, &errorCode);
    if (U_FAILURE(errorCode)) { return NULL; }
    MaxExpSink sink(table, errorCode);
    ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
    if (U_SUCCESS(errorCode)) {
        return table;
    }
    uhash_close(table);
    return NULL;
}

// Looks up the maximum expansion size for order in the table attached to
// this iterator's collator tailoring.
int32_t
CollationElementIterator::getMaxExpansion(int32_t order) const {
    const UHashtable *table = rbc_->tailoring->maxExpansions;
    return getMaxExpansion(table, order);
}

// Returns the maximum expansion size recorded for order.
// An order of 0 always yields 1. If the table is NULL or has no non-zero
// entry for order, the result is 2 for an old-style continuation CE
// (both 0xc0 bits set) and 1 otherwise.
int32_t
CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
    if (order == 0) { return 1; }
    if (maxExpansions != NULL) {
        const int32_t recorded = uhash_igeti(maxExpansions, order);
        if (recorded != 0) {
            return recorded;
        }
    }
    // old-style continuation CE => 2, anything else => 1
    return ((order & 0xc0) == 0xc0) ? 2 : 1;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_COLLATION */

Coverage Report

Created: 2025-06-24 06:43

Line	Count	Source (jump to first uncovered line)
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		*******************************************************************************
5		* Copyright (C) 1996-2014, International Business Machines Corporation and
6		* others. All Rights Reserved.
7		*******************************************************************************
8		*/
9
10		/*
11		* File coleitr.cpp
12		*
13		* Created by: Helena Shih
14		*
15		* Modification History:
16		*
17		* Date Name Description
18		*
19		* 6/23/97 helena Adding comments to make code more readable.
20		* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
21		* 12/10/99 aliu Ported Thai collation support from Java.
22		* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
23		* 02/19/01 swquek Removed CollationElementIterator() since it is
24		* private constructor and no calls are made to it
25		* 2012-2014 markus Rewritten in C++ again.
26		*/
27
28		#include "unicode/utypes.h"
29
30		#if !UCONFIG_NO_COLLATION
31
32		#include "unicode/chariter.h"
33		#include "unicode/coleitr.h"
34		#include "unicode/tblcoll.h"
35		#include "unicode/ustring.h"
36		#include "cmemory.h"
37		#include "collation.h"
38		#include "collationdata.h"
39		#include "collationiterator.h"
40		#include "collationsets.h"
41		#include "collationtailoring.h"
42		#include "uassert.h"
43		#include "uhash.h"
44		#include "utf16collationiterator.h"
45		#include "uvectr32.h"
46
47		/* Constants --------------------------------------------------------------- */
48
49		U_NAMESPACE_BEGIN
50
51		UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
52
53		/* CollationElementIterator public constructor/destructor ------------------ */
54
55		CollationElementIterator::CollationElementIterator(
56		const CollationElementIterator& other)
57	0	: UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
58	0	*this = other;
59	0	}
60
61		CollationElementIterator::~CollationElementIterator()
62	0	{
63	0	delete iter_;
64	0	delete offsets_;
65	0	}
66
67		/* CollationElementIterator public methods --------------------------------- */
68
69		namespace {
70
71	0	uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
72	0	return (p & 0xffff0000) \| ((lower32 >> 16) & 0xff00) \| ((lower32 >> 8) & 0xff);
73	0	}
74	0	uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
75	0	return (p << 16) \| ((lower32 >> 8) & 0xff00) \| (lower32 & 0x3f);
76	0	}
77	0	UBool ceNeedsTwoParts(int64_t ce) {
78	0	return (ce & INT64_C(0xffff00ff003f)) != 0;
79	0	}
80
81		} // namespace
82
83		int32_t CollationElementIterator::getOffset() const
84	0	{
85	0	if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
86		// CollationIterator::previousCE() decrements the CEs length
87		// while it pops CEs from its internal buffer.
88	0	int32_t i = iter_->getCEsLength();
89	0	if (otherHalf_ != 0) {
90		// Return the trailing CE offset while we are in the middle of a 64-bit CE.
91	0	++i;
92	0	}
93	0	U_ASSERT(i < offsets_->size());
94	0	return offsets_->elementAti(i);
95	0	}
96	0	return iter_->getOffset();
97	0	}
98
99		/**
100		* Get the ordering priority of the next character in the string.
101		* @return the next character's ordering. Returns NULLORDER if an error has
102		* occurred or if the end of string has been reached
103		*/
104		int32_t CollationElementIterator::next(UErrorCode& status)
105	0	{
106	0	if (U_FAILURE(status)) { return NULLORDER; }
107	0	if (dir_ > 1) {
108		// Continue forward iteration. Test this first.
109	0	if (otherHalf_ != 0) {
110	0	uint32_t oh = otherHalf_;
111	0	otherHalf_ = 0;
112	0	return oh;
113	0	}
114	0	} else if (dir_ == 1) {
115		// next() after setOffset()
116	0	dir_ = 2;
117	0	} else if (dir_ == 0) {
118		// The iter_ is already reset to the start of the text.
119	0	dir_ = 2;
120	0	} else /* dir_ < 0 */ {
121		// illegal change of direction
122	0	status = U_INVALID_STATE_ERROR;
123	0	return NULLORDER;
124	0	}
125		// No need to keep all CEs in the buffer when we iterate.
126	0	iter_->clearCEsIfNoneRemaining();
127	0	int64_t ce = iter_->nextCE(status);
128	0	if (ce == Collation::NO_CE) { return NULLORDER; }
129		// Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
130	0	uint32_t p = (uint32_t)(ce >> 32);
131	0	uint32_t lower32 = (uint32_t)ce;
132	0	uint32_t firstHalf = getFirstHalf(p, lower32);
133	0	uint32_t secondHalf = getSecondHalf(p, lower32);
134	0	if (secondHalf != 0) {
135	0	otherHalf_ = secondHalf \| 0xc0; // continuation CE
136	0	}
137	0	return firstHalf;
138	0	}
139
140		bool CollationElementIterator::operator!=(
141		const CollationElementIterator& other) const
142	0	{
143	0	return !(*this == other);
144	0	}
145
146		bool CollationElementIterator::operator==(
147		const CollationElementIterator& that) const
148	0	{
149	0	if (this == &that) {
150	0	return TRUE;
151	0	}
152
153	0	return
154	0	(rbc_ == that.rbc_ \|\| *rbc_ == *that.rbc_) &&
155	0	otherHalf_ == that.otherHalf_ &&
156	0	normalizeDir() == that.normalizeDir() &&
157	0	string_ == that.string_ &&
158	0	*iter_ == *that.iter_;
159	0	}
160
161		/**
162		* Get the ordering priority of the previous collation element in the string.
163		* @param status the error code status.
164		* @return the previous element's ordering. Returns NULLORDER if an error has
165		* occurred or if the start of string has been reached.
166		*/
167		int32_t CollationElementIterator::previous(UErrorCode& status)
168	0	{
169	0	if (U_FAILURE(status)) { return NULLORDER; }
170	0	if (dir_ < 0) {
171		// Continue backwards iteration. Test this first.
172	0	if (otherHalf_ != 0) {
173	0	uint32_t oh = otherHalf_;
174	0	otherHalf_ = 0;
175	0	return oh;
176	0	}
177	0	} else if (dir_ == 0) {
178	0	iter_->resetToOffset(string_.length());
179	0	dir_ = -1;
180	0	} else if (dir_ == 1) {
181		// previous() after setOffset()
182	0	dir_ = -1;
183	0	} else /* dir_ > 1 */ {
184		// illegal change of direction
185	0	status = U_INVALID_STATE_ERROR;
186	0	return NULLORDER;
187	0	}
188	0	if (offsets_ == NULL) {
189	0	offsets_ = new UVector32(status);
190	0	if (offsets_ == NULL) {
191	0	status = U_MEMORY_ALLOCATION_ERROR;
192	0	return NULLORDER;
193	0	}
194	0	}
195		// If we already have expansion CEs, then we also have offsets.
196		// Otherwise remember the trailing offset in case we need to
197		// write offsets for an artificial expansion.
198	0	int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
199	0	int64_t ce = iter_->previousCE(*offsets_, status);
200	0	if (ce == Collation::NO_CE) { return NULLORDER; }
201		// Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
202	0	uint32_t p = (uint32_t)(ce >> 32);
203	0	uint32_t lower32 = (uint32_t)ce;
204	0	uint32_t firstHalf = getFirstHalf(p, lower32);
205	0	uint32_t secondHalf = getSecondHalf(p, lower32);
206	0	if (secondHalf != 0) {
207	0	if (offsets_->isEmpty()) {
208		// When we convert a single 64-bit CE into two 32-bit CEs,
209		// we need to make this artificial expansion behave like a normal expansion.
210		// See CollationIterator::previousCE().
211	0	offsets_->addElement(iter_->getOffset(), status);
212	0	offsets_->addElement(limitOffset, status);
213	0	}
214	0	otherHalf_ = firstHalf;
215	0	return secondHalf \| 0xc0; // continuation CE
216	0	}
217	0	return firstHalf;
218	0	}
219
220		/**
221		* Resets the cursor to the beginning of the string.
222		*/
223		void CollationElementIterator::reset()
224	0	{
225	0	iter_ ->resetToOffset(0);
226	0	otherHalf_ = 0;
227	0	dir_ = 0;
228	0	}
229
230		void CollationElementIterator::setOffset(int32_t newOffset,
231		UErrorCode& status)
232	0	{
233	0	if (U_FAILURE(status)) { return; }
234	0	if (0 < newOffset && newOffset < string_.length()) {
235	0	int32_t offset = newOffset;
236	0	do {
237	0	UChar c = string_.charAt(offset);
238	0	if (!rbc_->isUnsafe(c) \|\|
239	0	(U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
240	0	break;
241	0	}
242		// Back up to before this unsafe character.
243	0	--offset;
244	0	} while (offset > 0);
245	0	if (offset < newOffset) {
246		// We might have backed up more than necessary.
247		// For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
248		// but for text "chu" setOffset(2) should remain at 2
249		// although we initially back up to offset 0.
250		// Find the last safe offset no greater than newOffset by iterating forward.
251	0	int32_t lastSafeOffset = offset;
252	0	do {
253	0	iter_->resetToOffset(lastSafeOffset);
254	0	do {
255	0	iter_->nextCE(status);
256	0	if (U_FAILURE(status)) { return; }
257	0	} while ((offset = iter_->getOffset()) == lastSafeOffset);
258	0	if (offset <= newOffset) {
259	0	lastSafeOffset = offset;
260	0	}
261	0	} while (offset < newOffset);
262	0	newOffset = lastSafeOffset;
263	0	}
264	0	}
265	0	iter_->resetToOffset(newOffset);
266	0	otherHalf_ = 0;
267	0	dir_ = 1;
268	0	}
269
270		/**
271		* Sets the source to the new source string.
272		*/
273		void CollationElementIterator::setText(const UnicodeString& source,
274		UErrorCode& status)
275	0	{
276	0	if (U_FAILURE(status)) {
277	0	return;
278	0	}
279
280	0	string_ = source;
281	0	const UChar *s = string_.getBuffer();
282	0	CollationIterator *newIter;
283	0	UBool numeric = rbc_->settings->isNumeric();
284	0	if (rbc_->settings->dontCheckFCD()) {
285	0	newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
286	0	} else {
287	0	newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
288	0	}
289	0	if (newIter == NULL) {
290	0	status = U_MEMORY_ALLOCATION_ERROR;
291	0	return;
292	0	}
293	0	delete iter_;
294	0	iter_ = newIter;
295	0	otherHalf_ = 0;
296	0	dir_ = 0;
297	0	}
298
299		// Sets the source to the new character iterator.
300		void CollationElementIterator::setText(CharacterIterator& source,
301		UErrorCode& status)
302	0	{
303	0	if (U_FAILURE(status))
304	0	return;
305
306	0	source.getText(string_);
307	0	setText(string_, status);
308	0	}
309
310		int32_t CollationElementIterator::strengthOrder(int32_t order) const
311	0	{
312	0	UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
313		// Mask off the unwanted differences.
314	0	if (s == UCOL_PRIMARY) {
315	0	order &= 0xffff0000;
316	0	}
317	0	else if (s == UCOL_SECONDARY) {
318	0	order &= 0xffffff00;
319	0	}
320
321	0	return order;
322	0	}
323
324		/* CollationElementIterator private constructors/destructors --------------- */
325
326		/**
327		* This is the "real" constructor for this class; it constructs an iterator
328		* over the source text using the specified collator
329		*/
330		CollationElementIterator::CollationElementIterator(
331		const UnicodeString &source,
332		const RuleBasedCollator *coll,
333		UErrorCode &status)
334	0	: iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
335	0	setText(source, status);
336	0	}
337
338		/**
339		* This is the "real" constructor for this class; it constructs an iterator over
340		* the source text using the specified collator
341		*/
342		CollationElementIterator::CollationElementIterator(
343		const CharacterIterator &source,
344		const RuleBasedCollator *coll,
345		UErrorCode &status)
346	0	: iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
347		// We only call source.getText() which should be const anyway.
348	0	setText(const_cast<CharacterIterator &>(source), status);
349	0	}
350
351		/* CollationElementIterator private methods -------------------------------- */
352
353		const CollationElementIterator& CollationElementIterator::operator=(
354		const CollationElementIterator& other)
355	0	{
356	0	if (this == &other) {
357	0	return *this;
358	0	}
359
360	0	CollationIterator *newIter;
361	0	const FCDUTF16CollationIterator *otherFCDIter =
362	0	dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
363	0	if(otherFCDIter != NULL) {
364	0	newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
365	0	} else {
366	0	const UTF16CollationIterator *otherIter =
367	0	dynamic_cast<const UTF16CollationIterator *>(other.iter_);
368	0	if(otherIter != NULL) {
369	0	newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
370	0	} else {
371	0	newIter = NULL;
372	0	}
373	0	}
374	0	if(newIter != NULL) {
375	0	delete iter_;
376	0	iter_ = newIter;
377	0	rbc_ = other.rbc_;
378	0	otherHalf_ = other.otherHalf_;
379	0	dir_ = other.dir_;
380
381	0	string_ = other.string_;
382	0	}
383	0	if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
384	0	UErrorCode errorCode = U_ZERO_ERROR;
385	0	if(offsets_ == NULL) {
386	0	offsets_ = new UVector32(other.offsets_->size(), errorCode);
387	0	}
388	0	if(offsets_ != NULL) {
389	0	offsets_->assign(*other.offsets_, errorCode);
390	0	}
391	0	}
392	0	return *this;
393	0	}
394
395		namespace {
396
397		class MaxExpSink : public ContractionsAndExpansions::CESink {
398		public:
399	0	MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
400		virtual ~MaxExpSink();
401	0	virtual void handleCE(int64_t /*ce*/) {}
402	0	virtual void handleExpansion(const int64_t ces[], int32_t length) {
403	0	if (length <= 1) {
404		// We do not need to add single CEs into the map.
405	0	return;
406	0	}
407	0	int32_t count = 0; // number of CE "halves"
408	0	for (int32_t i = 0; i < length; ++i) {
409	0	count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
410	0	}
411		// last "half" of the last CE
412	0	int64_t ce = ces[length - 1];
413	0	uint32_t p = (uint32_t)(ce >> 32);
414	0	uint32_t lower32 = (uint32_t)ce;
415	0	uint32_t lastHalf = getSecondHalf(p, lower32);
416	0	if (lastHalf == 0) {
417	0	lastHalf = getFirstHalf(p, lower32);
418	0	U_ASSERT(lastHalf != 0);
419	0	} else {
420	0	lastHalf \|= 0xc0; // old-style continuation CE
421	0	}
422	0	if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
423	0	uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
424	0	}
425	0	}
426
427		private:
428		UHashtable *maxExpansions;
429		UErrorCode &errorCode;
430		};
431
432		MaxExpSink::~MaxExpSink() {}
433
434		} // namespace
435
436		UHashtable *
437	0	CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
438	0	if (U_FAILURE(errorCode)) { return NULL; }
439	0	UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
440	0	uhash_compareLong, &errorCode);
441	0	if (U_FAILURE(errorCode)) { return NULL; }
442	0	MaxExpSink sink(maxExpansions, errorCode);
443	0	ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
444	0	if (U_FAILURE(errorCode)) {
445	0	uhash_close(maxExpansions);
446	0	return NULL;
447	0	}
448	0	return maxExpansions;
449	0	}
450
451		int32_t
452	0	CollationElementIterator::getMaxExpansion(int32_t order) const {
453	0	return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
454	0	}
455
456		int32_t
457	0	CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
458	0	if (order == 0) { return 1; }
459	0	int32_t max;
460	0	if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
461	0	return max;
462	0	}
463	0	if ((order & 0xc0) == 0xc0) {
464		// old-style continuation CE
465	0	return 2;
466	0	} else {
467	0	return 1;
468	0	}
469	0	}
470
471		U_NAMESPACE_END
472
473		#endif /* #if !UCONFIG_NO_COLLATION */