/src/icu/source/i18n/utf8collationiterator.cpp

Source (jump to first uncovered line)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* utf8collationiterator.cpp
*
* created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/utf8.h"
#include "charstr.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationfcd.h"
#include "collationiterator.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "utf8collationiterator.h"

U_NAMESPACE_BEGIN

// Out-of-line destructor. The body is empty: nothing is released explicitly here.
UTF8CollationIterator::~UTF8CollationIterator() {}

// Resets the iterator and repositions it in the UTF-8 text.
// @param newOffset  index into u8 at which iteration resumes; it is stored
//                   into pos as-is, without validation in this function.
void
UTF8CollationIterator::resetToOffset(int32_t newOffset) {
    // reset() is defined outside this file; presumably it clears inherited
    // iteration state -- confirm against the base class.
    reset();
    pos = newOffset;
}

// Returns the current index into the UTF-8 text, i.e. the value of pos.
int32_t
UTF8CollationIterator::getOffset() const {
    return pos;
}

// Decodes the next code point from u8 at pos, advances pos past it, and looks
// up its 32-bit collation element (CE32).
// @param c  output: the decoded code point, or U_SENTINEL when pos == length.
// The UErrorCode parameter is unnamed and not used by this function.
// @return the CE32 for c; Collation::FALLBACK_CE32 when pos == length.
uint32_t
UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
    if(pos == length) {
        // Nothing left to read.
        c = U_SENTINEL;
        return Collation::FALLBACK_CE32;
    }
    // Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
    c = u8[pos++];
    if(U8_IS_SINGLE(c)) {
        // ASCII 00..7F
        return trie->data32[c];
    }
    uint8_t t1, t2;
    // Three-byte fast path. The && chain is order-sensitive: the bounds test
    // comes first, then t1 is fetched and validated, and only then is
    // u8[pos + 1] read. With length < 0 (length not known; see
    // foundNULTerminator()) there is no bounds test, so the code presumably
    // relies on the t1 check failing at a terminator before the second read.
    if(0xe0 <= c && c < 0xf0 &&
            ((pos + 1) < length || length < 0) &&
            U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
            (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
        // U+0800..U+FFFF except surrogates
        c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
        pos += 2;
        return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
    // Two-byte fast path. t1 is a uint8_t, so a byte below 0x80 wraps around
    // after the subtraction and fails the <= 0x3f test, just like a byte
    // above 0xbf does.
    } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
        // U+0080..U+07FF
        // The CE32 is fetched before c is overwritten because the trie index
        // is computed from the lead byte, not from the assembled code point.
        uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
        c = ((c & 0x1f) << 6) | t1;
        ++pos;
        return ce32;
    } else {
        // Function call for supplementary code points and error cases.
        // Illegal byte sequences yield U+FFFD.
        c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
        return data->getCE32(c);
    }
}

// Records the end of the text when the length was not known in advance.
// A negative length is the "unknown length" marker in this class: in that case
// pos is stepped back by one byte and that index becomes the length.
// @return TRUE if the length was unknown and has now been fixed, else FALSE.
UBool
UTF8CollationIterator::foundNULTerminator() {
    if(length >= 0) {
        // The length is already known; leave pos and length alone.
        return FALSE;
    }
    --pos;
    length = pos;
    return TRUE;
}

// Always returns TRUE for this iterator.
// NOTE(review): presumably the caller uses this to reject surrogate code points
// for UTF-8 input -- confirm against the base-class contract.
UBool
UTF8CollationIterator::forbidSurrogateCodePoints() const {
    return TRUE;
}

// Reads the code point at pos and moves pos past it.
// Returns U_SENTINEL at the end of the text. When the length is still unknown
// (negative) and the byte at pos is NUL, the length is fixed to pos and
// U_SENTINEL is returned without advancing.
// The UErrorCode parameter is unnamed and unused.
UChar32
UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
    if(pos != length) {
        if(u8[pos] != 0 || length >= 0) {
            UChar32 cp;
            U8_NEXT_OR_FFFD(u8, pos, length, cp);
            return cp;
        }
        // NUL byte with unknown length: this is the terminator.
        length = pos;
    }
    return U_SENTINEL;
}

// Reads the code point that ends just before pos and moves pos back to its
// start. Returns U_SENTINEL when pos is already 0.
// The UErrorCode parameter is unnamed and unused.
UChar32
UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    UChar32 cp = U_SENTINEL;
    if(pos != 0) {
        U8_PREV_OR_FFFD(u8, 0, pos, cp);
    }
    return cp;
}

// Moves pos forward by delegating to the U8_FWD_N macro, with length as the
// limit and num as the requested number of code points.
// The UErrorCode parameter is unnamed and unused.
void
UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    U8_FWD_N(u8, pos, length, num);
}

// Moves pos backward by delegating to the U8_BACK_N macro, with 0 as the
// start boundary and num as the requested number of code points.
// The UErrorCode parameter is unnamed and unused.
void
UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    U8_BACK_N(u8, 0, pos, num);
}

// FCDUTF8CollationIterator ------------------------------------------------ ***

// Out-of-line destructor. The body is empty: nothing is released explicitly here.
FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {}

// Resets the iterator, places both pos and start at newOffset, and returns to
// the forward-checking state.
// @param newOffset  index into u8; stored without validation in this function.
void
FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) {
    // reset() is defined outside this file and must run before the
    // repositioning below.
    reset();
    pos = newOffset;
    start = pos;
    state = CHECK_FWD;
}

// Returns the iterator position as an index into the UTF-8 text.
// In the IN_NORMALIZED state pos indexes the normalized buffer rather than u8,
// so the result is start when pos is 0 and limit otherwise.
int32_t
FCDUTF8CollationIterator::getOffset() const {
    if(state == IN_NORMALIZED) {
        return (pos == 0) ? start : limit;
    }
    return pos;
}

// Returns the CE32 of the next code point, checking on the fly whether the
// text around it needs an FCD segment check or normalization.
// The loop dispatches on state:
//  - CHECK_FWD: decode directly from u8; if a quick test suggests the character
//    may interact with its neighbor, rewind pos and let nextSegment() decide.
//  - IN_FCD_SEGMENT (pos != limit): the segment passed the check, so use the
//    plain UTF8CollationIterator decoder.
//  - IN_NORMALIZED (pos != normalized.length()): read UTF-16 units from the
//    normalized buffer.
//  - otherwise: call switchToForward() and retry.
// @param c  output: the code point (or the UTF-16 unit read from the
//           normalized buffer), or U_SENTINEL at the end / on failure.
// @param errorCode  passed on to nextSegment().
// @return the CE32 for c; Collation::FALLBACK_CE32 at the end of the text or
//         when nextSegment() fails.
uint32_t
FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    for(;;) {
        if(state == CHECK_FWD) {
            // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.
            if(pos == length) {
                c = U_SENTINEL;
                return Collation::FALLBACK_CE32;
            }
            c = u8[pos++];
            if(U8_IS_SINGLE(c)) {
                // ASCII 00..7F
                return trie->data32[c];
            }
            uint8_t t1, t2;
            // Same order-sensitive three-byte test as in
            // UTF8CollationIterator::handleNextCE32().
            if(0xe0 <= c && c < 0xf0 &&
                    ((pos + 1) < length || length < 0) &&
                    U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
                    (t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
                // U+0800..U+FFFF except surrogates
                c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
                pos += 2;
                if(CollationFCD::hasTccc(c) &&
                        (CollationFCD::maybeTibetanCompositeVowel(c) ||
                            (pos != length && nextHasLccc()))) {
                    // Rewind over the three bytes just consumed so that
                    // nextSegment() starts at this character.
                    pos -= 3;
                } else {
                    break;  // return CE32(BMP)
                }
            } else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
                // U+0080..U+07FF
                // Look up the CE32 before c is replaced by the code point:
                // the index is computed from the lead byte.
                uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
                c = ((c & 0x1f) << 6) | t1;
                ++pos;
                if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
                    // Rewind over the two bytes just consumed.
                    pos -= 2;
                } else {
                    return ce32;
                }
            } else {
                // Function call for supplementary code points and error cases.
                // Illegal byte sequences yield U+FFFD.
                c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
                if(c == 0xfffd) {
                    return Collation::FFFD_CE32;
                } else {
                    U_ASSERT(c > 0xffff);
                    // The FCD quick checks are made on the lead surrogate of
                    // the supplementary code point.
                    if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) {
                        // Rewind over the four bytes of the supplementary
                        // code point.
                        pos -= 4;
                    } else {
                        return data->getCE32FromSupplementary(c);
                    }
                }
            }
            // Only reached after one of the rewinds above: examine the
            // segment starting at pos, then loop to read from the new state.
            if(!nextSegment(errorCode)) {
                c = U_SENTINEL;
                return Collation::FALLBACK_CE32;
            }
            continue;
        } else if(state == IN_FCD_SEGMENT && pos != limit) {
            return UTF8CollationIterator::handleNextCE32(c, errorCode);
        } else if(state == IN_NORMALIZED && pos != normalized.length()) {
            // pos indexes the normalized buffer in this state.
            c = normalized[pos++];
            break;
        } else {
            switchToForward();
        }
    }
    // Reached via break: c is a BMP code point or a UTF-16 code unit.
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
}

// Tells whether the code point starting at pos passes CollationFCD::hasLccc().
// Does not move pos. Requires state == CHECK_FWD and pos != length.
UBool
FCDUTF8CollationIterator::nextHasLccc() const {
    U_ASSERT(state == CHECK_FWD && pos != length);
    // Quick rejection based on the lead byte alone:
    // The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.
    // CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)
    const UChar32 leadByte = u8[pos];
    if(leadByte < 0xcc) {
        return FALSE;
    }
    if(0xe4 <= leadByte && leadByte <= 0xed && leadByte != 0xea) {
        return FALSE;
    }
    // Decode with a scratch index so that pos stays where it is.
    int32_t scratch = pos;
    UChar32 cp;
    U8_NEXT_OR_FFFD(u8, scratch, length, cp);
    // Supplementary code points are tested via their lead surrogate.
    return CollationFCD::hasLccc(cp <= 0xffff ? cp : U16_LEAD(cp));
}

// Tells whether the code point ending just before pos passes
// CollationFCD::hasTccc(). Does not move pos.
// Requires state == CHECK_BWD and pos != 0.
UBool
FCDUTF8CollationIterator::previousHasTccc() const {
    U_ASSERT(state == CHECK_BWD && pos != 0);
    // A single-byte (ASCII) character is answered FALSE without decoding.
    if(U8_IS_SINGLE(u8[pos - 1])) {
        return FALSE;
    }
    // Decode backward with a scratch index so that pos stays where it is.
    int32_t scratch = pos;
    UChar32 cp;
    U8_PREV_OR_FFFD(u8, 0, scratch, cp);
    // Supplementary code points are tested via their lead surrogate.
    return CollationFCD::hasTccc(cp <= 0xffff ? cp : U16_LEAD(cp));
}

// Returns the UTF-16 code unit at pos in the normalized buffer and consumes it
// only if it is a trail surrogate. Outside the IN_NORMALIZED state there is
// no UTF-16 buffer to read from, so 0 is returned.
UChar
FCDUTF8CollationIterator::handleGetTrailSurrogate() {
    if(state != IN_NORMALIZED) {
        return 0;
    }
    U_ASSERT(pos < normalized.length());
    const UChar unit = normalized[pos];
    if(U16_IS_TRAIL(unit)) {
        ++pos;
    }
    return unit;
}

// Records the end of the text when the length was not known in advance.
// Acts only in the CHECK_FWD state with a negative (unknown) length: pos is
// stepped back by one byte and that index becomes the length.
// @return TRUE if the length has now been fixed, else FALSE.
UBool
FCDUTF8CollationIterator::foundNULTerminator() {
    if(state != CHECK_FWD || length >= 0) {
        return FALSE;
    }
    --pos;
    length = pos;
    return TRUE;
}

// Returns the next code point and advances past it, with the same per-state
// dispatch as handleNextCE32():
//  - CHECK_FWD: decode from u8; if the quick FCD test fires, rewind and let
//    nextSegment() classify the segment, then retry.
//  - IN_FCD_SEGMENT (pos != limit): decode from u8 without further checks.
//  - IN_NORMALIZED (pos != normalized.length()): read from the normalized
//    UTF-16 buffer.
//  - otherwise: switchToForward() and retry.
// @param errorCode  passed on to nextSegment().
// @return the code point, or U_SENTINEL at the end of the text or when
//         nextSegment() fails.
UChar32
FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(state == CHECK_FWD) {
            // The second operand also loads c with the byte at pos, which the
            // U8_IS_SINGLE() test below relies on. Unlike
            // UTF8CollationIterator::nextCodePoint(), length is not updated
            // here when the NUL terminator is seen.
            if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
                return U_SENTINEL;
            }
            if(U8_IS_SINGLE(c)) {
                ++pos;
                return c;
            }
            U8_NEXT_OR_FFFD(u8, pos, length, c);
            // Supplementary code points are tested via their lead surrogate.
            if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) &&
                    (CollationFCD::maybeTibetanCompositeVowel(c) ||
                        (pos != length && nextHasLccc()))) {
                // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
                // and we can use U8_LENGTH() rather than a previous-position variable.
                pos -= U8_LENGTH(c);
                if(!nextSegment(errorCode)) {
                    return U_SENTINEL;
                }
                continue;
            }
            return c;
        } else if(state == IN_FCD_SEGMENT && pos != limit) {
            U8_NEXT_OR_FFFD(u8, pos, length, c);
            return c;
        } else if(state == IN_NORMALIZED && pos != normalized.length()) {
            // pos indexes UTF-16 code units here, so advance by the UTF-16
            // length of c.
            c = normalized.char32At(pos);
            pos += U16_LENGTH(c);
            return c;
        } else {
            switchToForward();
        }
    }
}

// Returns the code point before the current position and moves back over it.
// Mirror image of nextCodePoint():
//  - CHECK_BWD: decode backward from u8; if the quick FCD test fires, undo the
//    step and let previousSegment() classify the segment, then retry.
//  - IN_FCD_SEGMENT (pos != start): decode backward without further checks.
//  - state >= IN_NORMALIZED (pos != 0): read backward from the normalized
//    UTF-16 buffer.
//  - otherwise: switchToBackward() and retry.
// @param errorCode  passed on to previousSegment().
// @return the code point, or U_SENTINEL at the start of the text or when
//         previousSegment() fails.
UChar32
FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(state == CHECK_BWD) {
            if(pos == 0) {
                return U_SENTINEL;
            }
            // The condition also loads c with the byte before pos.
            if(U8_IS_SINGLE(c = u8[pos - 1])) {
                --pos;
                return c;
            }
            U8_PREV_OR_FFFD(u8, 0, pos, c);
            // Supplementary code points are tested via their lead surrogate.
            if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) &&
                    (CollationFCD::maybeTibetanCompositeVowel(c) ||
                        (pos != 0 && previousHasTccc()))) {
                // c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
                // and we can use U8_LENGTH() rather than a previous-position variable.
                pos += U8_LENGTH(c);
                if(!previousSegment(errorCode)) {
                    return U_SENTINEL;
                }
                continue;
            }
            return c;
        } else if(state == IN_FCD_SEGMENT && pos != start) {
            U8_PREV_OR_FFFD(u8, 0, pos, c);
            return c;
        } else if(state >= IN_NORMALIZED && pos != 0) {
            // pos indexes UTF-16 code units here, so step back by the UTF-16
            // length of c.
            c = normalized.char32At(pos - 1);
            pos -= U16_LENGTH(c);
            return c;
        } else {
            switchToBackward();
        }
    }
}

// Advances by up to num code points, stopping early as soon as
// nextCodePoint() reports a negative value (U_SENTINEL).
void
FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    // Specify the class to avoid a virtual-function indirection.
    // In Java, we would declare this class final.
    for(; num > 0; --num) {
        if(FCDUTF8CollationIterator::nextCodePoint(errorCode) < 0) {
            break;
        }
    }
}

// Moves back by up to num code points, stopping early as soon as
// previousCodePoint() reports a negative value (U_SENTINEL).
void
FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    // Specify the class to avoid a virtual-function indirection.
    // In Java, we would declare this class final.
    for(; num > 0; --num) {
        if(FCDUTF8CollationIterator::previousCodePoint(errorCode) < 0) {
            break;
        }
    }
}

// Prepares the iterator for forward reading after a backward pass or after
// the current segment has been read to its end.
// Precondition (asserted): state is CHECK_BWD, or IN_FCD_SEGMENT with
// pos == limit, or IN_NORMALIZED with pos == normalized.length().
void
FCDUTF8CollationIterator::switchToForward() {
    U_ASSERT(state == CHECK_BWD ||
             (state == IN_FCD_SEGMENT && pos == limit) ||
             (state == IN_NORMALIZED && pos == normalized.length()));
    if(state == CHECK_BWD) {
        // Turning around from backward checking: the segment now begins at pos.
        // If pos has reached limit there is nothing checked ahead of us, so
        // check forward; otherwise [pos..limit[ is still a valid FCD segment.
        start = pos;
        state = (pos == limit) ? CHECK_FWD : IN_FCD_SEGMENT;
        return;
    }
    // The end of the current segment has been reached.
    if(state != IN_FCD_SEGMENT) {
        // The segment had been normalized: leave the normalized buffer and
        // resume in the input text right after the segment.
        pos = limit;
        start = pos;
    }
    // For an FCD segment nothing is repositioned; it is simply extended forward.
    state = CHECK_FWD;
}

// Scans forward from pos to delimit the next segment and decides how it will
// be read.
//  - If the segment passes the FCD check: sets limit to its end, restores pos
//    to its start, and enters IN_FCD_SEGMENT.
//  - If it fails: extends the segment to the next boundary, decomposes it via
//    normalize(), sets start/limit to the segment's extent in u8, sets pos to
//    0 (an index into the normalized buffer), and enters IN_NORMALIZED.
// Precondition (asserted): state == CHECK_FWD and pos != length.
// @param errorCode  in/out ICU error code; nothing is done if it already
//                   indicates a failure.
// @return FALSE if errorCode indicated a failure on entry or normalize()
//         failed; TRUE otherwise.
UBool
FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return FALSE; }
    U_ASSERT(state == CHECK_FWD && pos != length);
    // The input text [start..pos[ passes the FCD check.
    int32_t segmentStart = pos;
    // Collect the characters being checked, in case they need to be normalized.
    UnicodeString s;
    // Low byte of the previous character's fcd16 value; 0 before the first one.
    uint8_t prevCC = 0;
    for(;;) {
        // Fetch the next character and its fcd16 value.
        int32_t cpStart = pos;
        UChar32 c;
        U8_NEXT_OR_FFFD(u8, pos, length, c);
        uint16_t fcd16 = nfcImpl.getFCD16(c);
        // The high byte of fcd16 is used as the lead combining class.
        uint8_t leadCC = (uint8_t)(fcd16 >> 8);
        if(leadCC == 0 && cpStart != segmentStart) {
            // FCD boundary before this character.
            pos = cpStart;
            break;
        }
        s.append(c);
        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
            // Fails FCD check. Find the next FCD boundary and normalize.
            while(pos != length) {
                cpStart = pos;
                U8_NEXT_OR_FFFD(u8, pos, length, c);
                // fcd16 <= 0xff means the high byte (lead value) is 0:
                // this character starts the next segment, so un-read it.
                if(nfcImpl.getFCD16(c) <= 0xff) {
                    pos = cpStart;
                    break;
                }
                s.append(c);
            }
            if(!normalize(s, errorCode)) { return FALSE; }
            start = segmentStart;
            limit = pos;
            state = IN_NORMALIZED;
            // From here on pos indexes the normalized buffer.
            pos = 0;
            return TRUE;
        }
        // The low byte of fcd16 is carried over for the ordering test on the
        // next character.
        prevCC = (uint8_t)fcd16;
        if(pos == length || prevCC == 0) {
            // FCD boundary after the last character.
            break;
        }
    }
    limit = pos;
    pos = segmentStart;
    U_ASSERT(pos != limit);
    state = IN_FCD_SEGMENT;
    return TRUE;
}

// Prepares the iterator for backward reading after a forward pass or after
// the current segment has been read back to its beginning.
// Precondition (asserted): state is CHECK_FWD, or IN_FCD_SEGMENT with
// pos == start, or state >= IN_NORMALIZED with pos == 0.
void
FCDUTF8CollationIterator::switchToBackward() {
    U_ASSERT(state == CHECK_FWD ||
             (state == IN_FCD_SEGMENT && pos == start) ||
             (state >= IN_NORMALIZED && pos == 0));
    if(state == CHECK_FWD) {
        // Turning around from forward checking: the segment now ends at pos.
        // If pos is at start there is nothing checked behind us, so check
        // backward; otherwise [start..pos[ is still a valid FCD segment.
        limit = pos;
        state = (pos == start) ? CHECK_BWD : IN_FCD_SEGMENT;
        return;
    }
    // The start of the current segment has been reached.
    if(state != IN_FCD_SEGMENT) {
        // The segment had been normalized: leave the normalized buffer and
        // resume in the input text right before the segment.
        pos = start;
        limit = pos;
    }
    // For an FCD segment nothing is repositioned; it is simply extended backward.
    state = CHECK_BWD;
}

// Scans backward from pos to delimit the previous segment and decides how it
// will be read. Mirror image of nextSegment().
//  - If the segment passes the FCD check: sets start to its beginning,
//    restores pos to its end, and enters IN_FCD_SEGMENT.
//  - If it fails: extends the segment back to the previous boundary,
//    decomposes it via normalize(), sets start/limit to the segment's extent
//    in u8, sets pos to normalized.length() (an index into the normalized
//    buffer), and enters IN_NORMALIZED.
// Precondition (asserted): state == CHECK_BWD and pos != 0.
// @param errorCode  in/out ICU error code; nothing is done if it already
//                   indicates a failure.
// @return FALSE if errorCode indicated a failure on entry or normalize()
//         failed; TRUE otherwise.
UBool
FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return FALSE; }
    U_ASSERT(state == CHECK_BWD && pos != 0);
    // The input text [pos..limit[ passes the FCD check.
    int32_t segmentLimit = pos;
    // Collect the characters being checked, in case they need to be normalized.
    // They are appended in backward order and reversed before normalizing.
    UnicodeString s;
    // High byte of the following character's fcd16 value; 0 before the first one.
    uint8_t nextCC = 0;
    for(;;) {
        // Fetch the previous character and its fcd16 value.
        int32_t cpLimit = pos;
        UChar32 c;
        U8_PREV_OR_FFFD(u8, 0, pos, c);
        uint16_t fcd16 = nfcImpl.getFCD16(c);
        // The low byte of fcd16 is used as the trail combining class.
        uint8_t trailCC = (uint8_t)fcd16;
        if(trailCC == 0 && cpLimit != segmentLimit) {
            // FCD boundary after this character.
            pos = cpLimit;
            break;
        }
        s.append(c);
        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
            // Fails FCD check. Find the previous FCD boundary and normalize.
            // Keep going back while the character just read has a nonzero
            // high byte (fcd16 > 0xff).
            while(fcd16 > 0xff && pos != 0) {
                cpLimit = pos;
                U8_PREV_OR_FFFD(u8, 0, pos, c);
                fcd16 = nfcImpl.getFCD16(c);
                // A character with fcd16 == 0 belongs to the preceding
                // segment, so un-read it.
                if(fcd16 == 0) {
                    pos = cpLimit;
                    break;
                }
                s.append(c);
            }
            // Restore text order before decomposing.
            s.reverse();
            if(!normalize(s, errorCode)) { return FALSE; }
            limit = segmentLimit;
            start = pos;
            state = IN_NORMALIZED;
            // From here on pos indexes the normalized buffer, starting at its end.
            pos = normalized.length();
            return TRUE;
        }
        // The high byte of fcd16 is carried over for the ordering test on the
        // preceding character.
        nextCC = (uint8_t)(fcd16 >> 8);
        if(pos == 0 || nextCC == 0) {
            // FCD boundary before the following character.
            break;
        }
    }
    start = pos;
    pos = segmentLimit;
    U_ASSERT(pos != start);
    state = IN_FCD_SEGMENT;
    return TRUE;
}

// Decomposes s into the member buffer `normalized` using nfcImpl.
// @param s          the text segment to decompose.
// @param errorCode  in/out ICU error code; asserted to be a success on entry.
// @return TRUE if errorCode still indicates success afterwards, else FALSE.
UBool
FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
    // NFD without argument checking.
    U_ASSERT(U_SUCCESS(errorCode));
    nfcImpl.decompose(s, normalized, errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    return TRUE;
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION

Coverage Report

Created: 2025-06-24 06:43

Line	Count	Source (jump to first uncovered line)
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		*******************************************************************************
5		* Copyright (C) 2012-2014, International Business Machines
6		* Corporation and others. All Rights Reserved.
7		*******************************************************************************
8		* utf8collationiterator.cpp
9		*
10		* created on: 2012nov12 (from utf16collationiterator.cpp & uitercollationiterator.cpp)
11		* created by: Markus W. Scherer
12		*/
13
14		#include "unicode/utypes.h"
15
16		#if !UCONFIG_NO_COLLATION
17
18		#include "unicode/utf8.h"
19		#include "charstr.h"
20		#include "cmemory.h"
21		#include "collation.h"
22		#include "collationdata.h"
23		#include "collationfcd.h"
24		#include "collationiterator.h"
25		#include "normalizer2impl.h"
26		#include "uassert.h"
27		#include "utf8collationiterator.h"
28
29		U_NAMESPACE_BEGIN
30
31	0	UTF8CollationIterator::~UTF8CollationIterator() {}
32
33		void
34	0	UTF8CollationIterator::resetToOffset(int32_t newOffset) {
35	0	reset();
36	0	pos = newOffset;
37	0	}
38
39		int32_t
40	0	UTF8CollationIterator::getOffset() const {
41	0	return pos;
42	0	}
43
44		uint32_t
45	0	UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
46	0	if(pos == length) {
47	0	c = U_SENTINEL;
48	0	return Collation::FALLBACK_CE32;
49	0	}
50		// Optimized combination of U8_NEXT_OR_FFFD() and UTRIE2_U8_NEXT32().
51	0	c = u8[pos++];
52	0	if(U8_IS_SINGLE(c)) {
53		// ASCII 00..7F
54	0	return trie->data32[c];
55	0	}
56	0	uint8_t t1, t2;
57	0	if(0xe0 <= c && c < 0xf0 &&
58	0	((pos + 1) < length || length < 0) &&
59	0	U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
60	0	(t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
61		// U+0800..U+FFFF except surrogates
62	0	c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
63	0	pos += 2;
64	0	return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
65	0	} else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
66		// U+0080..U+07FF
67	0	uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
68	0	c = ((c & 0x1f) << 6) | t1;
69	0	++pos;
70	0	return ce32;
71	0	} else {
72		// Function call for supplementary code points and error cases.
73		// Illegal byte sequences yield U+FFFD.
74	0	c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
75	0	return data->getCE32(c);
76	0	}
77	0	}
78
79		UBool
80	0	UTF8CollationIterator::foundNULTerminator() {
81	0	if(length < 0) {
82	0	length = --pos;
83	0	return TRUE;
84	0	} else {
85	0	return FALSE;
86	0	}
87	0	}
88
89		UBool
90	0	UTF8CollationIterator::forbidSurrogateCodePoints() const {
91	0	return TRUE;
92	0	}
93
94		UChar32
95	0	UTF8CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
96	0	if(pos == length) {
97	0	return U_SENTINEL;
98	0	}
99	0	if(u8[pos] == 0 && length < 0) {
100	0	length = pos;
101	0	return U_SENTINEL;
102	0	}
103	0	UChar32 c;
104	0	U8_NEXT_OR_FFFD(u8, pos, length, c);
105	0	return c;
106	0	}
107
108		UChar32
109	0	UTF8CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
110	0	if(pos == 0) {
111	0	return U_SENTINEL;
112	0	}
113	0	UChar32 c;
114	0	U8_PREV_OR_FFFD(u8, 0, pos, c);
115	0	return c;
116	0	}
117
118		void
119	0	UTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
120	0	U8_FWD_N(u8, pos, length, num);
121	0	}
122
123		void
124	0	UTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
125	0	U8_BACK_N(u8, 0, pos, num);
126	0	}
127
128		// FCDUTF8CollationIterator ------------------------------------------------ ***
129
130	0	FCDUTF8CollationIterator::~FCDUTF8CollationIterator() {}
131
132		void
133	0	FCDUTF8CollationIterator::resetToOffset(int32_t newOffset) {
134	0	reset();
135	0	start = pos = newOffset;
136	0	state = CHECK_FWD;
137	0	}
138
139		int32_t
140	0	FCDUTF8CollationIterator::getOffset() const {
141	0	if(state != IN_NORMALIZED) {
142	0	return pos;
143	0	} else if(pos == 0) {
144	0	return start;
145	0	} else {
146	0	return limit;
147	0	}
148	0	}
149
150		uint32_t
151	0	FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
152	0	for(;;) {
153	0	if(state == CHECK_FWD) {
154		// Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath.
155	0	if(pos == length) {
156	0	c = U_SENTINEL;
157	0	return Collation::FALLBACK_CE32;
158	0	}
159	0	c = u8[pos++];
160	0	if(U8_IS_SINGLE(c)) {
161		// ASCII 00..7F
162	0	return trie->data32[c];
163	0	}
164	0	uint8_t t1, t2;
165	0	if(0xe0 <= c && c < 0xf0 &&
166	0	((pos + 1) < length || length < 0) &&
167	0	U8_IS_VALID_LEAD3_AND_T1(c, t1 = u8[pos]) &&
168	0	(t2 = (u8[pos + 1] - 0x80)) <= 0x3f) {
169		// U+0800..U+FFFF except surrogates
170	0	c = (((c & 0xf) << 12) | ((t1 & 0x3f) << 6) | t2);
171	0	pos += 2;
172	0	if(CollationFCD::hasTccc(c) &&
173	0	(CollationFCD::maybeTibetanCompositeVowel(c) ||
174	0	(pos != length && nextHasLccc()))) {
175	0	pos -= 3;
176	0	} else {
177	0	break; // return CE32(BMP)
178	0	}
179	0	} else if(c < 0xe0 && c >= 0xc2 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) {
180		// U+0080..U+07FF
181	0	uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1];
182	0	c = ((c & 0x1f) << 6) | t1;
183	0	++pos;
184	0	if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) {
185	0	pos -= 2;
186	0	} else {
187	0	return ce32;
188	0	}
189	0	} else {
190		// Function call for supplementary code points and error cases.
191		// Illegal byte sequences yield U+FFFD.
192	0	c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
193	0	if(c == 0xfffd) {
194	0	return Collation::FFFD_CE32;
195	0	} else {
196	0	U_ASSERT(c > 0xffff);
197	0	if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) {
198	0	pos -= 4;
199	0	} else {
200	0	return data->getCE32FromSupplementary(c);
201	0	}
202	0	}
203	0	}
204	0	if(!nextSegment(errorCode)) {
205	0	c = U_SENTINEL;
206	0	return Collation::FALLBACK_CE32;
207	0	}
208	0	continue;
209	0	} else if(state == IN_FCD_SEGMENT && pos != limit) {
210	0	return UTF8CollationIterator::handleNextCE32(c, errorCode);
211	0	} else if(state == IN_NORMALIZED && pos != normalized.length()) {
212	0	c = normalized[pos++];
213	0	break;
214	0	} else {
215	0	switchToForward();
216	0	}
217	0	}
218	0	return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
219	0	}
220
221		UBool
222	0	FCDUTF8CollationIterator::nextHasLccc() const {
223	0	U_ASSERT(state == CHECK_FWD && pos != length);
224		// The lowest code point with ccc!=0 is U+0300 which is CC 80 in UTF-8.
225		// CJK U+4000..U+DFFF except U+Axxx are also FCD-inert. (Lead bytes E4..ED except EA.)
226	0	UChar32 c = u8[pos];
227	0	if(c < 0xcc || (0xe4 <= c && c <= 0xed && c != 0xea)) { return FALSE; }
228	0	int32_t i = pos;
229	0	U8_NEXT_OR_FFFD(u8, i, length, c);
230	0	if(c > 0xffff) { c = U16_LEAD(c); }
231	0	return CollationFCD::hasLccc(c);
232	0	}
233
234		UBool
235	0	FCDUTF8CollationIterator::previousHasTccc() const {
236	0	U_ASSERT(state == CHECK_BWD && pos != 0);
237	0	UChar32 c = u8[pos - 1];
238	0	if(U8_IS_SINGLE(c)) { return FALSE; }
239	0	int32_t i = pos;
240	0	U8_PREV_OR_FFFD(u8, 0, i, c);
241	0	if(c > 0xffff) { c = U16_LEAD(c); }
242	0	return CollationFCD::hasTccc(c);
243	0	}
244
245		UChar
246	0	FCDUTF8CollationIterator::handleGetTrailSurrogate() {
247	0	if(state != IN_NORMALIZED) { return 0; }
248	0	U_ASSERT(pos < normalized.length());
249	0	UChar trail;
250	0	if(U16_IS_TRAIL(trail = normalized[pos])) { ++pos; }
251	0	return trail;
252	0	}
253
254		UBool
255	0	FCDUTF8CollationIterator::foundNULTerminator() {
256	0	if(state == CHECK_FWD && length < 0) {
257	0	length = --pos;
258	0	return TRUE;
259	0	} else {
260	0	return FALSE;
261	0	}
262	0	}
263
264		UChar32
265	0	FCDUTF8CollationIterator::nextCodePoint(UErrorCode &errorCode) {
266	0	UChar32 c;
267	0	for(;;) {
268	0	if(state == CHECK_FWD) {
269	0	if(pos == length || ((c = u8[pos]) == 0 && length < 0)) {
270	0	return U_SENTINEL;
271	0	}
272	0	if(U8_IS_SINGLE(c)) {
273	0	++pos;
274	0	return c;
275	0	}
276	0	U8_NEXT_OR_FFFD(u8, pos, length, c);
277	0	if(CollationFCD::hasTccc(c <= 0xffff ? c : U16_LEAD(c)) &&
278	0	(CollationFCD::maybeTibetanCompositeVowel(c) ||
279	0	(pos != length && nextHasLccc()))) {
280		// c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
281		// and we can use U8_LENGTH() rather than a previous-position variable.
282	0	pos -= U8_LENGTH(c);
283	0	if(!nextSegment(errorCode)) {
284	0	return U_SENTINEL;
285	0	}
286	0	continue;
287	0	}
288	0	return c;
289	0	} else if(state == IN_FCD_SEGMENT && pos != limit) {
290	0	U8_NEXT_OR_FFFD(u8, pos, length, c);
291	0	return c;
292	0	} else if(state == IN_NORMALIZED && pos != normalized.length()) {
293	0	c = normalized.char32At(pos);
294	0	pos += U16_LENGTH(c);
295	0	return c;
296	0	} else {
297	0	switchToForward();
298	0	}
299	0	}
300	0	}
301
302		UChar32
303	0	FCDUTF8CollationIterator::previousCodePoint(UErrorCode &errorCode) {
304	0	UChar32 c;
305	0	for(;;) {
306	0	if(state == CHECK_BWD) {
307	0	if(pos == 0) {
308	0	return U_SENTINEL;
309	0	}
310	0	if(U8_IS_SINGLE(c = u8[pos - 1])) {
311	0	--pos;
312	0	return c;
313	0	}
314	0	U8_PREV_OR_FFFD(u8, 0, pos, c);
315	0	if(CollationFCD::hasLccc(c <= 0xffff ? c : U16_LEAD(c)) &&
316	0	(CollationFCD::maybeTibetanCompositeVowel(c) ||
317	0	(pos != 0 && previousHasTccc()))) {
318		// c is not FCD-inert, therefore it is not U+FFFD and it has a valid byte sequence
319		// and we can use U8_LENGTH() rather than a previous-position variable.
320	0	pos += U8_LENGTH(c);
321	0	if(!previousSegment(errorCode)) {
322	0	return U_SENTINEL;
323	0	}
324	0	continue;
325	0	}
326	0	return c;
327	0	} else if(state == IN_FCD_SEGMENT && pos != start) {
328	0	U8_PREV_OR_FFFD(u8, 0, pos, c);
329	0	return c;
330	0	} else if(state >= IN_NORMALIZED && pos != 0) {
331	0	c = normalized.char32At(pos - 1);
332	0	pos -= U16_LENGTH(c);
333	0	return c;
334	0	} else {
335	0	switchToBackward();
336	0	}
337	0	}
338	0	}
339
340		void
341	0	FCDUTF8CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
342		// Specify the class to avoid a virtual-function indirection.
343		// In Java, we would declare this class final.
344	0	while(num > 0 && FCDUTF8CollationIterator::nextCodePoint(errorCode) >= 0) {
345	0	--num;
346	0	}
347	0	}
348
349		void
350	0	FCDUTF8CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
351		// Specify the class to avoid a virtual-function indirection.
352		// In Java, we would declare this class final.
353	0	while(num > 0 && FCDUTF8CollationIterator::previousCodePoint(errorCode) >= 0) {
354	0	--num;
355	0	}
356	0	}
357
358		void
359	0	FCDUTF8CollationIterator::switchToForward() {
360	0	U_ASSERT(state == CHECK_BWD ||
361	0	(state == IN_FCD_SEGMENT && pos == limit) ||
362	0	(state == IN_NORMALIZED && pos == normalized.length()));
363	0	if(state == CHECK_BWD) {
364		// Turn around from backward checking.
365	0	start = pos;
366	0	if(pos == limit) {
367	0	state = CHECK_FWD; // Check forward.
368	0	} else { // pos < limit
369	0	state = IN_FCD_SEGMENT; // Stay in FCD segment.
370	0	}
371	0	} else {
372		// Reached the end of the FCD segment.
373	0	if(state == IN_FCD_SEGMENT) {
374		// The input text segment is FCD, extend it forward.
375	0	} else {
376		// The input text segment needed to be normalized.
377		// Switch to checking forward from it.
378	0	start = pos = limit;
379	0	}
380	0	state = CHECK_FWD;
381	0	}
382	0	}
383
384		UBool
385	0	FCDUTF8CollationIterator::nextSegment(UErrorCode &errorCode) {
386	0	if(U_FAILURE(errorCode)) { return FALSE; }
387	0	U_ASSERT(state == CHECK_FWD && pos != length);
388		// The input text [start..pos[ passes the FCD check.
389	0	int32_t segmentStart = pos;
390		// Collect the characters being checked, in case they need to be normalized.
391	0	UnicodeString s;
392	0	uint8_t prevCC = 0;
393	0	for(;;) {
394		// Fetch the next character and its fcd16 value.
395	0	int32_t cpStart = pos;
396	0	UChar32 c;
397	0	U8_NEXT_OR_FFFD(u8, pos, length, c);
398	0	uint16_t fcd16 = nfcImpl.getFCD16(c);
399	0	uint8_t leadCC = (uint8_t)(fcd16 >> 8);
400	0	if(leadCC == 0 && cpStart != segmentStart) {
401		// FCD boundary before this character.
402	0	pos = cpStart;
403	0	break;
404	0	}
405	0	s.append(c);
406	0	if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
407		// Fails FCD check. Find the next FCD boundary and normalize.
408	0	while(pos != length) {
409	0	cpStart = pos;
410	0	U8_NEXT_OR_FFFD(u8, pos, length, c);
411	0	if(nfcImpl.getFCD16(c) <= 0xff) {
412	0	pos = cpStart;
413	0	break;
414	0	}
415	0	s.append(c);
416	0	}
417	0	if(!normalize(s, errorCode)) { return FALSE; }
418	0	start = segmentStart;
419	0	limit = pos;
420	0	state = IN_NORMALIZED;
421	0	pos = 0;
422	0	return TRUE;
423	0	}
424	0	prevCC = (uint8_t)fcd16;
425	0	if(pos == length || prevCC == 0) {
426		// FCD boundary after the last character.
427	0	break;
428	0	}
429	0	}
430	0	limit = pos;
431	0	pos = segmentStart;
432	0	U_ASSERT(pos != limit);
433	0	state = IN_FCD_SEGMENT;
434	0	return TRUE;
435	0	}
436
437		void
438	0	FCDUTF8CollationIterator::switchToBackward() {
439	0	U_ASSERT(state == CHECK_FWD ||
440	0	(state == IN_FCD_SEGMENT && pos == start) ||
441	0	(state >= IN_NORMALIZED && pos == 0));
442	0	if(state == CHECK_FWD) {
443		// Turn around from forward checking.
444	0	limit = pos;
445	0	if(pos == start) {
446	0	state = CHECK_BWD; // Check backward.
447	0	} else { // pos > start
448	0	state = IN_FCD_SEGMENT; // Stay in FCD segment.
449	0	}
450	0	} else {
451		// Reached the start of the FCD segment.
452	0	if(state == IN_FCD_SEGMENT) {
453		// The input text segment is FCD, extend it backward.
454	0	} else {
455		// The input text segment needed to be normalized.
456		// Switch to checking backward from it.
457	0	limit = pos = start;
458	0	}
459	0	state = CHECK_BWD;
460	0	}
461	0	}
462
463		UBool
464	0	FCDUTF8CollationIterator::previousSegment(UErrorCode &errorCode) {
465	0	if(U_FAILURE(errorCode)) { return FALSE; }
466	0	U_ASSERT(state == CHECK_BWD && pos != 0);
467		// The input text [pos..limit[ passes the FCD check.
468	0	int32_t segmentLimit = pos;
469		// Collect the characters being checked, in case they need to be normalized.
470	0	UnicodeString s;
471	0	uint8_t nextCC = 0;
472	0	for(;;) {
473		// Fetch the previous character and its fcd16 value.
474	0	int32_t cpLimit = pos;
475	0	UChar32 c;
476	0	U8_PREV_OR_FFFD(u8, 0, pos, c);
477	0	uint16_t fcd16 = nfcImpl.getFCD16(c);
478	0	uint8_t trailCC = (uint8_t)fcd16;
479	0	if(trailCC == 0 && cpLimit != segmentLimit) {
480		// FCD boundary after this character.
481	0	pos = cpLimit;
482	0	break;
483	0	}
484	0	s.append(c);
485	0	if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
486	0	CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
487		// Fails FCD check. Find the previous FCD boundary and normalize.
488	0	while(fcd16 > 0xff && pos != 0) {
489	0	cpLimit = pos;
490	0	U8_PREV_OR_FFFD(u8, 0, pos, c);
491	0	fcd16 = nfcImpl.getFCD16(c);
492	0	if(fcd16 == 0) {
493	0	pos = cpLimit;
494	0	break;
495	0	}
496	0	s.append(c);
497	0	}
498	0	s.reverse();
499	0	if(!normalize(s, errorCode)) { return FALSE; }
500	0	limit = segmentLimit;
501	0	start = pos;
502	0	state = IN_NORMALIZED;
503	0	pos = normalized.length();
504	0	return TRUE;
505	0	}
506	0	nextCC = (uint8_t)(fcd16 >> 8);
507	0	if(pos == 0 || nextCC == 0) {
508		// FCD boundary before the following character.
509	0	break;
510	0	}
511	0	}
512	0	start = pos;
513	0	pos = segmentLimit;
514	0	U_ASSERT(pos != start);
515	0	state = IN_FCD_SEGMENT;
516	0	return TRUE;
517	0	}
518
519		UBool
520	0	FCDUTF8CollationIterator::normalize(const UnicodeString &s, UErrorCode &errorCode) {
521		// NFD without argument checking.
522	0	U_ASSERT(U_SUCCESS(errorCode));
523	0	nfcImpl.decompose(s, normalized, errorCode);
524	0	return U_SUCCESS(errorCode);
525	0	}
526
527		U_NAMESPACE_END
528
529		#endif // !UCONFIG_NO_COLLATION