/src/icu/source/i18n/collationdatareader.cpp

Source (jump to first uncovered line)
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationdatareader.cpp
*
* created on: 2013feb07
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/ucol.h"
#include "unicode/udata.h"
#include "unicode/uscript.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationdatareader.h"
#include "collationfastlatin.h"
#include "collationkeys.h"
#include "collationrootelements.h"
#include "collationsettings.h"
#include "collationtailoring.h"
#include "collunsafe.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "ucmndata.h"
#include "utrie2.h"

U_NAMESPACE_BEGIN

namespace {

/**
 * Returns indexes[i] if slot i exists, otherwise -1.
 * A slot exists when i is below length (the number of available slots).
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    if(i >= length) {
        return -1;  // The slot is past the end of the indexes.
    }
    return indexes[i];
}

}  // namespace

/**
 * Reads serialized collation data and sets up the tailoring from it.
 *
 * If base is not NULL, inBytes must start with a DataHeader; the header is validated
 * (magic bytes and isAcceptable()), the UCA versions of base and tailoring must match,
 * and the header is skipped. If base is NULL, inBytes starts directly with the indexes.
 *
 * After the (optional) header, the data starts with an array of int32_t indexes.
 * Consecutive index slots hold the byte offsets of the data parts, so the length of
 * a part is the difference between its offset and the following slot's value.
 * Most parts are not copied: pointers into inBytes are stored in the CollationData.
 *
 * @param base      The base (root) tailoring, or NULL when reading data without a base.
 * @param inBytes   The serialized data.
 * @param inLength  Number of bytes at inBytes; if negative, no length checks are performed.
 * @param tailoring Receives the data; assumed to be in its initial state.
 * @param errorCode In/out ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR,
 *                  U_INVALID_FORMAT_ERROR, U_COLLATOR_VERSION_MISMATCH or
 *                  U_MEMORY_ALLOCATION_ERROR on failure (or to whatever a callee sets).
 */
void
CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
                          CollationTailoring &tailoring, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(base != NULL) {
        if(inBytes == NULL || (0 <= inLength && inLength < 24)) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);
        // isAcceptable() also copies the data version into tailoring.version.
        if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
                isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        if(base->getUCAVersion() != tailoring.getUCAVersion()) {
            errorCode = U_COLLATOR_VERSION_MISMATCH;
            return;
        }
        int32_t headerLength = header->dataHeader.headerSize;
        if(0 <= inLength && inLength < headerLength) {
            // The header claims to be longer than the input.
            // Without this check, inLength would become negative below,
            // and a negative inLength disables all of the following length checks.
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        inBytes += headerLength;
        if(inLength >= 0) {
            inLength -= headerLength;
        }
    }

    if(inBytes == NULL || (0 <= inLength && inLength < 8)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
    int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
    // Compare inLength / 4 rather than indexesLength * 4:
    // Same result for non-negative inLength, but no signed overflow
    // when the data contains a very large indexesLength.
    if(indexesLength < 2 || (0 <= inLength && inLength / 4 < indexesLength)) {
        errorCode = U_INVALID_FORMAT_ERROR;  // Not enough indexes.
        return;
    }

    // Assume that the tailoring data is in initial state,
    // with NULL pointers and 0 lengths.

    // Set pointers to non-empty data parts.
    // Do this in order of their byte offsets. (Should help porting to Java.)

    int32_t index;  // one of the indexes[] slots
    int32_t offset;  // byte offset for the index part
    int32_t length;  // number of bytes in the index part

    // Determine the total size of the data and check it against inLength.
    if(indexesLength > IX_TOTAL_SIZE) {
        length = inIndexes[IX_TOTAL_SIZE];
    } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
        length = inIndexes[indexesLength - 1];
    } else {
        length = 0;  // only indexes, and inLength was already checked for them
    }
    if(0 <= inLength && inLength < length) {
        errorCode = U_INVALID_FORMAT_ERROR;
        return;
    }

    const CollationData *baseData = base == NULL ? NULL : base->data;
    const int32_t *reorderCodes = NULL;
    int32_t reorderCodesLength = 0;
    const uint32_t *reorderRanges = NULL;
    int32_t reorderRangesLength = 0;
    index = IX_REORDER_CODES_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 4) {
        if(baseData == NULL) {
            // We assume for collation settings that
            // the base data does not have a reordering.
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
        reorderCodesLength = length / 4;

        // The reorderRanges (if any) are the trailing reorderCodes entries.
        // Split the array at the boundary.
        // Script or reorder codes do not exceed 16-bit values.
        // Range limits are stored in the upper 16 bits, and are never 0.
        while(reorderRangesLength < reorderCodesLength &&
                (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
            ++reorderRangesLength;
        }
        U_ASSERT(reorderRangesLength < reorderCodesLength);
        if(reorderRangesLength != 0) {
            reorderCodesLength -= reorderRangesLength;
            reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);
        }
    }

    // There should be a reorder table only if there are reorder codes.
    // However, when there are reorder codes the reorder table may be omitted to reduce
    // the data size.
    const uint8_t *reorderTable = NULL;
    index = IX_REORDER_TABLE_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 256) {
        if(reorderCodesLength == 0) {
            errorCode = U_INVALID_FORMAT_ERROR;  // Reordering table without reordering codes.
            return;
        }
        reorderTable = inBytes + offset;
    } else {
        // If we have reorder codes, then build the reorderTable at the end,
        // when the CollationData is otherwise complete.
    }

    // The numeric primary (top byte of the options) must agree with the base data.
    if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
        errorCode = U_INVALID_FORMAT_ERROR;
        return;
    }
    CollationData *data = NULL;  // Remains NULL if there are no mappings.

    index = IX_TRIE_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 8) {
        if(!tailoring.ensureOwnedData(errorCode)) { return; }
        data = tailoring.ownedData;
        data->base = baseData;
        data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
        data->trie = tailoring.trie = utrie2_openFromSerialized(
            UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
            &errorCode);
        if(U_FAILURE(errorCode)) { return; }
    } else if(baseData != NULL) {
        // Use the base data. Only the settings are tailored.
        tailoring.data = baseData;
    } else {
        errorCode = U_INVALID_FORMAT_ERROR;  // No mappings.
        return;
    }

    index = IX_CES_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 8) {
        if(data == NULL) {
            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ces without tailored trie.
            return;
        }
        data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
        data->cesLength = length / 8;
    }

    index = IX_CE32S_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 4) {
        if(data == NULL) {
            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored ce32s without tailored trie.
            return;
        }
        data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
        data->ce32sLength = length / 4;
    }

    // The Jamo CE32s are a sub-array of ce32s[], or else inherited from the base data.
    int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
    if(jamoCE32sStart >= 0) {
        if(data == NULL || data->ce32s == NULL) {
            errorCode = U_INVALID_FORMAT_ERROR;  // Index into non-existent ce32s[].
            return;
        }
        data->jamoCE32s = data->ce32s + jamoCE32sStart;
    } else if(data == NULL) {
        // Nothing to do.
    } else if(baseData != NULL) {
        data->jamoCE32s = baseData->jamoCE32s;
    } else {
        errorCode = U_INVALID_FORMAT_ERROR;  // No Jamo CE32s for Hangul processing.
        return;
    }

    index = IX_ROOT_ELEMENTS_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 4) {
        length /= 4;  // from here on: number of uint32_t root elements
        if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
        data->rootElementsLength = length;
        uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
        if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
        if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
            // [fixed last secondary common byte] is too low,
            // and secondary weights would collide with compressed common secondaries.
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
    }

    index = IX_CONTEXTS_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 2) {
        if(data == NULL) {
            errorCode = U_INVALID_FORMAT_ERROR;  // Tailored contexts without tailored trie.
            return;
        }
        data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
        data->contextsLength = length / 2;
    }

    index = IX_UNSAFE_BWD_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 2) {
        if(data == NULL) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        if(baseData == NULL) {
#if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE)
          // Start from the pre-built serialized set.
          tailoring.unsafeBackwardSet = new UnicodeSet(unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode);
          if(tailoring.unsafeBackwardSet == NULL) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
          } else if (U_FAILURE(errorCode)) {
            return;
          }
#else
            // Create the unsafe-backward set for the root collator.
            // Include all non-zero combining marks and trail surrogates.
            // We do this at load time, rather than at build time,
            // to simplify Unicode version bootstrapping:
            // The root data builder only needs the new FractionalUCA.txt data,
            // but it need not be built with a version of ICU already updated to
            // the corresponding new Unicode Character Database.
            //
            // The following is an optimized version of
            // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
            // It is faster and requires fewer code dependencies.
            tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff);  // trail surrogates
            if(tailoring.unsafeBackwardSet == NULL) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
#endif // !COLLUNSAFE_SERIALIZE || !COLLUNSAFE_COLL_VERSION
        } else {
            // Clone the root collator's set contents.
            tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
                baseData->unsafeBackwardSet->cloneAsThawed());
            if(tailoring.unsafeBackwardSet == NULL) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
        }
        // Add the ranges from the data file to the unsafe-backward set.
        USerializedSet sset;
        const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);
        if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        int32_t count = uset_getSerializedRangeCount(&sset);
        for(int32_t i = 0; i < count; ++i) {
            UChar32 start, end;
            uset_getSerializedRange(&sset, i, &start, &end);
            tailoring.unsafeBackwardSet->add(start, end);
        }
        // Mark each lead surrogate as "unsafe"
        // if any of its 1024 associated supplementary code points is "unsafe".
        UChar32 c = 0x10000;
        for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
            if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
                tailoring.unsafeBackwardSet->add(lead);
            }
        }
        tailoring.unsafeBackwardSet->freeze();
        data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
    } else if(data == NULL) {
        // Nothing to do.
    } else if(baseData != NULL) {
        // No tailoring-specific data: Alias the root collator's set.
        data->unsafeBackwardSet = baseData->unsafeBackwardSet;
    } else {
        errorCode = U_INVALID_FORMAT_ERROR;  // No unsafeBackwardSet.
        return;
    }

    // If the fast Latin format version is different,
    // or the version is set to 0 for "no fast Latin table",
    // then just always use the normal string comparison path.
    if(data != NULL) {
        data->fastLatinTable = NULL;
        data->fastLatinTableLength = 0;
        if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
            index = IX_FAST_LATIN_TABLE_OFFSET;
            offset = getIndex(inIndexes, indexesLength, index);
            length = getIndex(inIndexes, indexesLength, index + 1) - offset;
            if(length >= 2) {
                data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
                data->fastLatinTableLength = length / 2;
                if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
                    errorCode = U_INVALID_FORMAT_ERROR;  // header vs. table version mismatch
                    return;
                }
            } else if(baseData != NULL) {
                data->fastLatinTable = baseData->fastLatinTable;
                data->fastLatinTableLength = baseData->fastLatinTableLength;
            }
        }
    }

    // The scripts part consists of numScripts, then the scriptsIndex array
    // (numScripts + 16 entries), then the scriptStarts array.
    index = IX_SCRIPTS_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 2) {
        if(data == NULL) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
        int32_t scriptsLength = length / 2;
        data->numScripts = scripts[0];
        // There must be enough entries for both arrays, including more than two range starts.
        data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16);
        if(data->scriptStartsLength <= 2 ||
                CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        data->scriptsIndex = scripts + 1;
        data->scriptStarts = scripts + 1 + data->numScripts + 16;
        // Check the expected first, second and last range starts.
        if(!(data->scriptStarts[0] == 0 &&
                data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) &&
                data->scriptStarts[data->scriptStartsLength - 1] ==
                        (Collation::TRAIL_WEIGHT_BYTE << 8))) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
    } else if(data == NULL) {
        // Nothing to do.
    } else if(baseData != NULL) {
        data->numScripts = baseData->numScripts;
        data->scriptsIndex = baseData->scriptsIndex;
        data->scriptStarts = baseData->scriptStarts;
        data->scriptStartsLength = baseData->scriptStartsLength;
    }

    index = IX_COMPRESSIBLE_BYTES_OFFSET;
    offset = getIndex(inIndexes, indexesLength, index);
    length = getIndex(inIndexes, indexesLength, index + 1) - offset;
    if(length >= 256) {
        if(data == NULL) {
            errorCode = U_INVALID_FORMAT_ERROR;
            return;
        }
        data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
    } else if(data == NULL) {
        // Nothing to do.
    } else if(baseData != NULL) {
        data->compressibleBytes = baseData->compressibleBytes;
    } else {
        errorCode = U_INVALID_FORMAT_ERROR;  // No compressibleBytes[].
        return;
    }

    // Compare the settings from the data with the tailoring's current settings.
    // If they are all the same, then keep the current settings object as is.
    const CollationSettings &ts = *tailoring.settings;
    int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
    uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
    int32_t fastLatinOptions = CollationFastLatin::getOptions(
            tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries));
    if(options == ts.options && ts.variableTop != 0 &&
            reorderCodesLength == ts.reorderCodesLength &&
            (reorderCodesLength == 0 ||
                uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0) &&
            fastLatinOptions == ts.fastLatinOptions &&
            (fastLatinOptions < 0 ||
                uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
                            sizeof(fastLatinPrimaries)) == 0)) {
        return;
    }

    // The settings differ: Get a writable settings object and update it.
    CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
    if(settings == NULL) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    settings->options = options;
    // Set variableTop from options and scripts data.
    settings->variableTop = tailoring.data->getLastPrimaryForGroup(
            UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
    if(settings->variableTop == 0) {
        errorCode = U_INVALID_FORMAT_ERROR;
        return;
    }

    if(reorderCodesLength != 0) {
        // baseData is not NULL here: Reorder codes without base data were rejected above.
        settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,
                                  reorderRanges, reorderRangesLength,
                                  reorderTable, errorCode);
    }

    settings->fastLatinOptions = CollationFastLatin::getOptions(
        tailoring.data, *settings,
        settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));
}

/**
 * Checks whether a data item's UDataInfo describes collation data that this reader accepts:
 * large enough UDataInfo, matching endianness and charset family,
 * dataFormat "UCol" and format version 5.
 *
 * @param context NULL, or a pointer to a UVersionInfo which receives
 *                the 4-byte data version if the data is acceptable.
 * @param pInfo   The UDataInfo to check.
 * @return TRUE if the data is acceptable, otherwise FALSE.
 */
UBool U_CALLCONV
CollationDataReader::isAcceptable(void *context,
                                  const char * /* type */, const char * /*name*/,
                                  const UDataInfo *pInfo) {
    // Reject as soon as any one of the requirements is not met.
    if(
        pInfo->size < 20 ||
        pInfo->isBigEndian != U_IS_BIG_ENDIAN ||
        pInfo->charsetFamily != U_CHARSET_FAMILY ||
        pInfo->dataFormat[0] != 0x55 ||  // dataFormat="UCol"
        pInfo->dataFormat[1] != 0x43 ||
        pInfo->dataFormat[2] != 0x6f ||
        pInfo->dataFormat[3] != 0x6c ||
        pInfo->formatVersion[0] != 5
    ) {
        return FALSE;
    }

    // Acceptable: Hand the data version back to the caller if requested.
    UVersionInfo *versionOut = static_cast<UVersionInfo *>(context);
    if(versionOut != NULL) {
        uprv_memcpy(versionOut, pInfo->dataVersion, 4);
    }
    return TRUE;
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION

Coverage Report

Created: 2025-06-24 06:43

Line	Count	Source (jump to first uncovered line)
1		// © 2016 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3		/*
4		*******************************************************************************
5		* Copyright (C) 2013-2015, International Business Machines
6		* Corporation and others. All Rights Reserved.
7		*******************************************************************************
8		* collationdatareader.cpp
9		*
10		* created on: 2013feb07
11		* created by: Markus W. Scherer
12		*/
13
14		#include "unicode/utypes.h"
15
16		#if !UCONFIG_NO_COLLATION
17
18		#include "unicode/ucol.h"
19		#include "unicode/udata.h"
20		#include "unicode/uscript.h"
21		#include "cmemory.h"
22		#include "collation.h"
23		#include "collationdata.h"
24		#include "collationdatareader.h"
25		#include "collationfastlatin.h"
26		#include "collationkeys.h"
27		#include "collationrootelements.h"
28		#include "collationsettings.h"
29		#include "collationtailoring.h"
30		#include "collunsafe.h"
31		#include "normalizer2impl.h"
32		#include "uassert.h"
33		#include "ucmndata.h"
34		#include "utrie2.h"
35
36		U_NAMESPACE_BEGIN
37
38		namespace {
39
40	0	int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
41	0	return (i < length) ? indexes[i] : -1;
42	0	}
43
44		} // namespace
45
46		void
47		CollationDataReader::read(const CollationTailoring base, const uint8_t inBytes, int32_t inLength,
48	0	CollationTailoring &tailoring, UErrorCode &errorCode) {
49	0	if(U_FAILURE(errorCode)) { return; }
50	0	if(base != NULL) {
51	0	if(inBytes == NULL \|\| (0 <= inLength && inLength < 24)) {
52	0	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
53	0	return;
54	0	}
55	0	const DataHeader header = reinterpret_cast<const DataHeader >(inBytes);
56	0	if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
57	0	isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
58	0	errorCode = U_INVALID_FORMAT_ERROR;
59	0	return;
60	0	}
61	0	if(base->getUCAVersion() != tailoring.getUCAVersion()) {
62	0	errorCode = U_COLLATOR_VERSION_MISMATCH;
63	0	return;
64	0	}
65	0	int32_t headerLength = header->dataHeader.headerSize;
66	0	inBytes += headerLength;
67	0	if(inLength >= 0) {
68	0	inLength -= headerLength;
69	0	}
70	0	}
71
72	0	if(inBytes == NULL \|\| (0 <= inLength && inLength < 8)) {
73	0	errorCode = U_ILLEGAL_ARGUMENT_ERROR;
74	0	return;
75	0	}
76	0	const int32_t inIndexes = reinterpret_cast<const int32_t >(inBytes);
77	0	int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
78	0	if(indexesLength < 2 \|\| (0 <= inLength && inLength < indexesLength * 4)) {
79	0	errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
80	0	return;
81	0	}
82
83		// Assume that the tailoring data is in initial state,
84		// with NULL pointers and 0 lengths.
85
86		// Set pointers to non-empty data parts.
87		// Do this in order of their byte offsets. (Should help porting to Java.)
88
89	0	int32_t index; // one of the indexes[] slots
90	0	int32_t offset; // byte offset for the index part
91	0	int32_t length; // number of bytes in the index part
92
93	0	if(indexesLength > IX_TOTAL_SIZE) {
94	0	length = inIndexes[IX_TOTAL_SIZE];
95	0	} else if(indexesLength > IX_REORDER_CODES_OFFSET) {
96	0	length = inIndexes[indexesLength - 1];
97	0	} else {
98	0	length = 0; // only indexes, and inLength was already checked for them
99	0	}
100	0	if(0 <= inLength && inLength < length) {
101	0	errorCode = U_INVALID_FORMAT_ERROR;
102	0	return;
103	0	}
104
105	0	const CollationData *baseData = base == NULL ? NULL : base->data;
106	0	const int32_t *reorderCodes = NULL;
107	0	int32_t reorderCodesLength = 0;
108	0	const uint32_t *reorderRanges = NULL;
109	0	int32_t reorderRangesLength = 0;
110	0	index = IX_REORDER_CODES_OFFSET;
111	0	offset = getIndex(inIndexes, indexesLength, index);
112	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
113	0	if(length >= 4) {
114	0	if(baseData == NULL) {
115		// We assume for collation settings that
116		// the base data does not have a reordering.
117	0	errorCode = U_INVALID_FORMAT_ERROR;
118	0	return;
119	0	}
120	0	reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
121	0	reorderCodesLength = length / 4;
122
123		// The reorderRanges (if any) are the trailing reorderCodes entries.
124		// Split the array at the boundary.
125		// Script or reorder codes do not exceed 16-bit values.
126		// Range limits are stored in the upper 16 bits, and are never 0.
127	0	while(reorderRangesLength < reorderCodesLength &&
128	0	(reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
129	0	++reorderRangesLength;
130	0	}
131	0	U_ASSERT(reorderRangesLength < reorderCodesLength);
132	0	if(reorderRangesLength != 0) {
133	0	reorderCodesLength -= reorderRangesLength;
134	0	reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);
135	0	}
136	0	}
137
138		// There should be a reorder table only if there are reorder codes.
139		// However, when there are reorder codes the reorder table may be omitted to reduce
140		// the data size.
141	0	const uint8_t *reorderTable = NULL;
142	0	index = IX_REORDER_TABLE_OFFSET;
143	0	offset = getIndex(inIndexes, indexesLength, index);
144	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
145	0	if(length >= 256) {
146	0	if(reorderCodesLength == 0) {
147	0	errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reordering codes.
148	0	return;
149	0	}
150	0	reorderTable = inBytes + offset;
151	0	} else {
152		// If we have reorder codes, then build the reorderTable at the end,
153		// when the CollationData is otherwise complete.
154	0	}
155
156	0	if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
157	0	errorCode = U_INVALID_FORMAT_ERROR;
158	0	return;
159	0	}
160	0	CollationData *data = NULL; // Remains NULL if there are no mappings.
161
162	0	index = IX_TRIE_OFFSET;
163	0	offset = getIndex(inIndexes, indexesLength, index);
164	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
165	0	if(length >= 8) {
166	0	if(!tailoring.ensureOwnedData(errorCode)) { return; }
167	0	data = tailoring.ownedData;
168	0	data->base = baseData;
169	0	data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
170	0	data->trie = tailoring.trie = utrie2_openFromSerialized(
171	0	UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
172	0	&errorCode);
173	0	if(U_FAILURE(errorCode)) { return; }
174	0	} else if(baseData != NULL) {
175		// Use the base data. Only the settings are tailored.
176	0	tailoring.data = baseData;
177	0	} else {
178	0	errorCode = U_INVALID_FORMAT_ERROR; // No mappings.
179	0	return;
180	0	}
181
182	0	index = IX_CES_OFFSET;
183	0	offset = getIndex(inIndexes, indexesLength, index);
184	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
185	0	if(length >= 8) {
186	0	if(data == NULL) {
187	0	errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailored trie.
188	0	return;
189	0	}
190	0	data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
191	0	data->cesLength = length / 8;
192	0	}
193
194	0	index = IX_CE32S_OFFSET;
195	0	offset = getIndex(inIndexes, indexesLength, index);
196	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
197	0	if(length >= 4) {
198	0	if(data == NULL) {
199	0	errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailored trie.
200	0	return;
201	0	}
202	0	data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
203	0	data->ce32sLength = length / 4;
204	0	}
205
206	0	int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
207	0	if(jamoCE32sStart >= 0) {
208	0	if(data == NULL \|\| data->ce32s == NULL) {
209	0	errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32s[].
210	0	return;
211	0	}
212	0	data->jamoCE32s = data->ce32s + jamoCE32sStart;
213	0	} else if(data == NULL) {
214		// Nothing to do.
215	0	} else if(baseData != NULL) {
216	0	data->jamoCE32s = baseData->jamoCE32s;
217	0	} else {
218	0	errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul processing.
219	0	return;
220	0	}
221
222	0	index = IX_ROOT_ELEMENTS_OFFSET;
223	0	offset = getIndex(inIndexes, indexesLength, index);
224	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
225	0	if(length >= 4) {
226	0	length /= 4;
227	0	if(data == NULL \|\| length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
228	0	errorCode = U_INVALID_FORMAT_ERROR;
229	0	return;
230	0	}
231	0	data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
232	0	data->rootElementsLength = length;
233	0	uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
234	0	if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
235	0	errorCode = U_INVALID_FORMAT_ERROR;
236	0	return;
237	0	}
238	0	uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
239	0	if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
240		// [fixed last secondary common byte] is too low,
241		// and secondary weights would collide with compressed common secondaries.
242	0	errorCode = U_INVALID_FORMAT_ERROR;
243	0	return;
244	0	}
245	0	}
246
247	0	index = IX_CONTEXTS_OFFSET;
248	0	offset = getIndex(inIndexes, indexesLength, index);
249	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
250	0	if(length >= 2) {
251	0	if(data == NULL) {
252	0	errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without tailored trie.
253	0	return;
254	0	}
255	0	data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
256	0	data->contextsLength = length / 2;
257	0	}
258
259	0	index = IX_UNSAFE_BWD_OFFSET;
260	0	offset = getIndex(inIndexes, indexesLength, index);
261	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
262	0	if(length >= 2) {
263	0	if(data == NULL) {
264	0	errorCode = U_INVALID_FORMAT_ERROR;
265	0	return;
266	0	}
267	0	if(baseData == NULL) {
268	0	#if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE)
269	0	tailoring.unsafeBackwardSet = new UnicodeSet(unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode);
270	0	if(tailoring.unsafeBackwardSet == NULL) {
271	0	errorCode = U_MEMORY_ALLOCATION_ERROR;
272	0	return;
273	0	} else if (U_FAILURE(errorCode)) {
274	0	return;
275	0	}
276		#else
277		// Create the unsafe-backward set for the root collator.
278		// Include all non-zero combining marks and trail surrogates.
279		// We do this at load time, rather than at build time,
280		// to simplify Unicode version bootstrapping:
281		// The root data builder only needs the new FractionalUCA.txt data,
282		// but it need not be built with a version of ICU already updated to
283		// the corresponding new Unicode Character Database.
284		//
285		// The following is an optimized version of
286		// new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
287		// It is faster and requires fewer code dependencies.
288		tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
289		if(tailoring.unsafeBackwardSet == NULL) {
290		errorCode = U_MEMORY_ALLOCATION_ERROR;
291		return;
292		}
293		data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
294		#endif // !COLLUNSAFE_SERIALIZE \|\| !COLLUNSAFE_COLL_VERSION
295	0	} else {
296		// Clone the root collator's set contents.
297	0	tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
298	0	baseData->unsafeBackwardSet->cloneAsThawed());
299	0	if(tailoring.unsafeBackwardSet == NULL) {
300	0	errorCode = U_MEMORY_ALLOCATION_ERROR;
301	0	return;
302	0	}
303	0	}
304		// Add the ranges from the data file to the unsafe-backward set.
305	0	USerializedSet sset;
306	0	const uint16_t unsafeData = reinterpret_cast<const uint16_t >(inBytes + offset);
307	0	if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
308	0	errorCode = U_INVALID_FORMAT_ERROR;
309	0	return;
310	0	}
311	0	int32_t count = uset_getSerializedRangeCount(&sset);
312	0	for(int32_t i = 0; i < count; ++i) {
313	0	UChar32 start, end;
314	0	uset_getSerializedRange(&sset, i, &start, &end);
315	0	tailoring.unsafeBackwardSet->add(start, end);
316	0	}
317		// Mark each lead surrogate as "unsafe"
318		// if any of its 1024 associated supplementary code points is "unsafe".
319	0	UChar32 c = 0x10000;
320	0	for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
321	0	if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
322	0	tailoring.unsafeBackwardSet->add(lead);
323	0	}
324	0	}
325	0	tailoring.unsafeBackwardSet->freeze();
326	0	data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
327	0	} else if(data == NULL) {
328		// Nothing to do.
329	0	} else if(baseData != NULL) {
330		// No tailoring-specific data: Alias the root collator's set.
331	0	data->unsafeBackwardSet = baseData->unsafeBackwardSet;
332	0	} else {
333	0	errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet.
334	0	return;
335	0	}
336
337		// If the fast Latin format version is different,
338		// or the version is set to 0 for "no fast Latin table",
339		// then just always use the normal string comparison path.
340	0	if(data != NULL) {
341	0	data->fastLatinTable = NULL;
342	0	data->fastLatinTableLength = 0;
343	0	if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
344	0	index = IX_FAST_LATIN_TABLE_OFFSET;
345	0	offset = getIndex(inIndexes, indexesLength, index);
346	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
347	0	if(length >= 2) {
348	0	data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
349	0	data->fastLatinTableLength = length / 2;
350	0	if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
351	0	errorCode = U_INVALID_FORMAT_ERROR; // header vs. table version mismatch
352	0	return;
353	0	}
354	0	} else if(baseData != NULL) {
355	0	data->fastLatinTable = baseData->fastLatinTable;
356	0	data->fastLatinTableLength = baseData->fastLatinTableLength;
357	0	}
358	0	}
359	0	}
360
361	0	index = IX_SCRIPTS_OFFSET;
362	0	offset = getIndex(inIndexes, indexesLength, index);
363	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
364	0	if(length >= 2) {
365	0	if(data == NULL) {
366	0	errorCode = U_INVALID_FORMAT_ERROR;
367	0	return;
368	0	}
369	0	const uint16_t scripts = reinterpret_cast<const uint16_t >(inBytes + offset);
370	0	int32_t scriptsLength = length / 2;
371	0	data->numScripts = scripts[0];
372		// There must be enough entries for both arrays, including more than two range starts.
373	0	data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16);
374	0	if(data->scriptStartsLength <= 2 \|\|
375	0	CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {
376	0	errorCode = U_INVALID_FORMAT_ERROR;
377	0	return;
378	0	}
379	0	data->scriptsIndex = scripts + 1;
380	0	data->scriptStarts = scripts + 1 + data->numScripts + 16;
381	0	if(!(data->scriptStarts[0] == 0 &&
382	0	data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) &&
383	0	data->scriptStarts[data->scriptStartsLength - 1] ==
384	0	(Collation::TRAIL_WEIGHT_BYTE << 8))) {
385	0	errorCode = U_INVALID_FORMAT_ERROR;
386	0	return;
387	0	}
388	0	} else if(data == NULL) {
389		// Nothing to do.
390	0	} else if(baseData != NULL) {
391	0	data->numScripts = baseData->numScripts;
392	0	data->scriptsIndex = baseData->scriptsIndex;
393	0	data->scriptStarts = baseData->scriptStarts;
394	0	data->scriptStartsLength = baseData->scriptStartsLength;
395	0	}
396
397	0	index = IX_COMPRESSIBLE_BYTES_OFFSET;
398	0	offset = getIndex(inIndexes, indexesLength, index);
399	0	length = getIndex(inIndexes, indexesLength, index + 1) - offset;
400	0	if(length >= 256) {
401	0	if(data == NULL) {
402	0	errorCode = U_INVALID_FORMAT_ERROR;
403	0	return;
404	0	}
405	0	data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
406	0	} else if(data == NULL) {
407		// Nothing to do.
408	0	} else if(baseData != NULL) {
409	0	data->compressibleBytes = baseData->compressibleBytes;
410	0	} else {
411	0	errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[].
412	0	return;
413	0	}
414
415	0	const CollationSettings &ts = *tailoring.settings;
416	0	int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
417	0	uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
418	0	int32_t fastLatinOptions = CollationFastLatin::getOptions(
419	0	tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries));
420	0	if(options == ts.options && ts.variableTop != 0 &&
421	0	reorderCodesLength == ts.reorderCodesLength &&
422	0	(reorderCodesLength == 0 \|\|
423	0	uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0) &&
424	0	fastLatinOptions == ts.fastLatinOptions &&
425	0	(fastLatinOptions < 0 \|\|
426	0	uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
427	0	sizeof(fastLatinPrimaries)) == 0)) {
428	0	return;
429	0	}
430
431	0	CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
432	0	if(settings == NULL) {
433	0	errorCode = U_MEMORY_ALLOCATION_ERROR;
434	0	return;
435	0	}
436	0	settings->options = options;
437		// Set variableTop from options and scripts data.
438	0	settings->variableTop = tailoring.data->getLastPrimaryForGroup(
439	0	UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
440	0	if(settings->variableTop == 0) {
441	0	errorCode = U_INVALID_FORMAT_ERROR;
442	0	return;
443	0	}
444
445	0	if(reorderCodesLength != 0) {
446	0	settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,
447	0	reorderRanges, reorderRangesLength,
448	0	reorderTable, errorCode);
449	0	}
450
451	0	settings->fastLatinOptions = CollationFastLatin::getOptions(
452	0	tailoring.data, *settings,
453	0	settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));
454	0	}
455
456		UBool U_CALLCONV
457		CollationDataReader::isAcceptable(void *context,
458		const char * /* type /, const char /name/,
459	0	const UDataInfo *pInfo) {
460	0	if(
461	0	pInfo->size >= 20 &&
462	0	pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
463	0	pInfo->charsetFamily == U_CHARSET_FAMILY &&
464	0	pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol"
465	0	pInfo->dataFormat[1] == 0x43 &&
466	0	pInfo->dataFormat[2] == 0x6f &&
467	0	pInfo->dataFormat[3] == 0x6c &&
468	0	pInfo->formatVersion[0] == 5
469	0	) {
470	0	UVersionInfo version = static_cast<UVersionInfo >(context);
471	0	if(version != NULL) {
472	0	uprv_memcpy(version, pInfo->dataVersion, 4);
473	0	}
474	0	return TRUE;
475	0	} else {
476	0	return FALSE;
477	0	}
478	0	}
479
480		U_NAMESPACE_END
481
482		#endif // !UCONFIG_NO_COLLATION