Coverage Report

Created: 2026-03-31 06:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/collationdatawriter.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2013-2015, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationdatawriter.cpp
9
*
10
* created on: 2013aug06
11
* created by: Markus W. Scherer
12
*/
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_COLLATION
17
18
#include "unicode/tblcoll.h"
19
#include "unicode/udata.h"
20
#include "unicode/uniset.h"
21
#include "cmemory.h"
22
#include "collationdata.h"
23
#include "collationdatabuilder.h"
24
#include "collationdatareader.h"
25
#include "collationdatawriter.h"
26
#include "collationfastlatin.h"
27
#include "collationsettings.h"
28
#include "collationtailoring.h"
29
#include "uassert.h"
30
#include "ucmndata.h"
31
32
U_NAMESPACE_BEGIN
33
34
uint8_t *
35
0
RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
36
0
    if(U_FAILURE(errorCode)) { return nullptr; }
37
0
    LocalMemory<uint8_t> buffer(static_cast<uint8_t*>(uprv_malloc(20000)));
38
0
    if(buffer.isNull()) {
39
0
        errorCode = U_MEMORY_ALLOCATION_ERROR;
40
0
        return nullptr;
41
0
    }
42
0
    UErrorCode bufferStatus = U_ZERO_ERROR;
43
0
    length = cloneBinary(buffer.getAlias(), 20000, bufferStatus);
44
0
    if(bufferStatus == U_BUFFER_OVERFLOW_ERROR) {
45
0
        if(buffer.allocateInsteadAndCopy(length, 0) == nullptr) {
46
0
            errorCode = U_MEMORY_ALLOCATION_ERROR;
47
0
            return nullptr;
48
0
        }
49
0
        bufferStatus = U_ZERO_ERROR;
50
0
        length = cloneBinary(buffer.getAlias(), length, bufferStatus);
51
0
    }
52
0
    if(U_FAILURE(bufferStatus)) {
53
0
        errorCode = bufferStatus;
54
0
        return nullptr;
55
0
    }
56
0
    return buffer.orphan();
57
0
}
58
59
int32_t
60
0
RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
61
0
    int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
62
0
    return CollationDataWriter::writeTailoring(
63
0
            *tailoring, *settings, indexes, dest, capacity,
64
0
            errorCode);
65
0
}
66
67
// UDataInfo template for serialized collation data.
// CollationDataWriter::write() copies it into the header of a tailoring
// and then overwrites the dataVersion field with the tailoring's version.
static const UDataInfo dataInfo = {
    sizeof(UDataInfo),                  // size
    0,                                  // reservedWord

    U_IS_BIG_ENDIAN,                    // isBigEndian
    U_CHARSET_FAMILY,                   // charsetFamily
    U_SIZEOF_UCHAR,                     // sizeofUChar
    0,                                  // reservedByte

    { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
    { 5, 0, 0, 0 },                     // formatVersion
    { 6, 3, 0, 0 }                      // dataVersion
};
80
81
int32_t
82
CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
83
                               const void *rootElements, int32_t rootElementsLength,
84
                               int32_t indexes[], uint8_t *dest, int32_t capacity,
85
0
                               UErrorCode &errorCode) {
86
0
    return write(true, nullptr,
87
0
                 data, settings,
88
0
                 rootElements, rootElementsLength,
89
0
                 indexes, dest, capacity, errorCode);
90
0
}
91
92
int32_t
93
CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
94
                                    int32_t indexes[], uint8_t *dest, int32_t capacity,
95
0
                                    UErrorCode &errorCode) {
96
0
    return write(false, t.version,
97
0
                 *t.data, settings,
98
0
                 nullptr, 0,
99
0
                 indexes, dest, capacity, errorCode);
100
0
}
101
102
int32_t
103
CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
104
                           const CollationData &data, const CollationSettings &settings,
105
                           const void *rootElements, int32_t rootElementsLength,
106
                           int32_t indexes[], uint8_t *dest, int32_t capacity,
107
0
                           UErrorCode &errorCode) {
108
0
    if(U_FAILURE(errorCode)) { return 0; }
109
0
    if(capacity < 0 || (capacity > 0 && dest == nullptr)) {
110
0
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
111
0
        return 0;
112
0
    }
113
114
    // Figure out which data items to write before settling on
115
    // the indexes length and writing offsets.
116
    // For any data item, we need to write the start and limit offsets,
117
    // so the indexes length must be at least index-of-start-offset + 2.
118
0
    int32_t indexesLength;
119
0
    UBool hasMappings;
120
0
    UnicodeSet unsafeBackwardSet;
121
0
    const CollationData *baseData = data.base;
122
123
0
    int32_t fastLatinVersion;
124
0
    if(data.fastLatinTable != nullptr) {
125
0
        fastLatinVersion = static_cast<int32_t>(CollationFastLatin::VERSION) << 16;
126
0
    } else {
127
0
        fastLatinVersion = 0;
128
0
    }
129
0
    int32_t fastLatinTableLength = 0;
130
131
0
    if(isBase) {
132
        // For the root collator, we write an even number of indexes
133
        // so that we start with an 8-aligned offset.
134
0
        indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
135
0
        U_ASSERT(settings.reorderCodesLength == 0);
136
0
        hasMappings = true;
137
0
        unsafeBackwardSet = *data.unsafeBackwardSet;
138
0
        fastLatinTableLength = data.fastLatinTableLength;
139
0
    } else if(baseData == nullptr) {
140
0
        hasMappings = false;
141
0
        if(settings.reorderCodesLength == 0) {
142
            // only options
143
0
            indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
144
0
        } else {
145
            // only options, reorder codes, and the reorder table
146
0
            indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
147
0
        }
148
0
    } else {
149
0
        hasMappings = true;
150
        // Tailored mappings, and what else?
151
        // Check in ascending order of optional tailoring data items.
152
0
        indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
153
0
        if(data.contextsLength != 0) {
154
0
            indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
155
0
        }
156
0
        unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
157
0
        if(!unsafeBackwardSet.isEmpty()) {
158
0
            indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
159
0
        }
160
0
        if(data.fastLatinTable != baseData->fastLatinTable) {
161
0
            fastLatinTableLength = data.fastLatinTableLength;
162
0
            indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
163
0
        }
164
0
    }
165
166
0
    UVector32 codesAndRanges(errorCode);
167
0
    const int32_t *reorderCodes = settings.reorderCodes;
168
0
    int32_t reorderCodesLength = settings.reorderCodesLength;
169
0
    if(settings.hasReordering() &&
170
0
            CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
171
        // Rebuild the full list of reorder ranges.
172
        // The list in the settings is truncated for efficiency.
173
0
        data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
174
        // Write the codes, then the ranges.
175
0
        for(int32_t i = 0; i < reorderCodesLength; ++i) {
176
0
            codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
177
0
        }
178
0
        if(U_FAILURE(errorCode)) { return 0; }
179
0
        reorderCodes = codesAndRanges.getBuffer();
180
0
        reorderCodesLength = codesAndRanges.size();
181
0
    }
182
183
0
    int32_t headerSize;
184
0
    if(isBase) {
185
0
        headerSize = 0;  // udata_create() writes the header
186
0
    } else {
187
0
        DataHeader header;
188
0
        header.dataHeader.magic1 = 0xda;
189
0
        header.dataHeader.magic2 = 0x27;
190
0
        uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
191
0
        uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
192
0
        headerSize = static_cast<int32_t>(sizeof(header));
193
0
        U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
194
0
        if(hasMappings && data.cesLength != 0) {
195
            // Sum of the sizes of the data items which are
196
            // not automatically multiples of 8 bytes and which are placed before the CEs.
197
0
            int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
198
0
            if((sum & 7) != 0) {
199
                // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
200
                // We add to the header size here.
201
                // Alternatively, we could increment the indexesLength
202
                // or add a few bytes to the reorderTable.
203
0
                headerSize += 4;
204
0
            }
205
0
        }
206
0
        header.dataHeader.headerSize = static_cast<uint16_t>(headerSize);
207
0
        if(headerSize <= capacity) {
208
0
            uprv_memcpy(dest, &header, sizeof(header));
209
            // Write 00 bytes so that the padding is not mistaken for a copyright string.
210
0
            uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
211
0
            dest += headerSize;
212
0
            capacity -= headerSize;
213
0
        } else {
214
0
            dest = nullptr;
215
0
            capacity = 0;
216
0
        }
217
0
    }
218
219
0
    indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
220
0
    U_ASSERT((settings.options & ~0xffff) == 0);
221
0
    indexes[CollationDataReader::IX_OPTIONS] =
222
0
            data.numericPrimary | fastLatinVersion | settings.options;
223
0
    indexes[CollationDataReader::IX_RESERVED2] = 0;
224
0
    indexes[CollationDataReader::IX_RESERVED3] = 0;
225
226
    // Byte offsets of data items all start from the start of the indexes.
227
    // We add the headerSize at the very end.
228
0
    int32_t totalSize = indexesLength * 4;
229
230
0
    if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
231
0
        indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);
232
0
    } else {
233
0
        indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
234
0
    }
235
236
0
    indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
237
0
    totalSize += reorderCodesLength * 4;
238
239
0
    indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
240
0
    if(settings.reorderTable != nullptr) {
241
0
        totalSize += 256;
242
0
    }
243
244
0
    indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
245
0
    if(hasMappings) {
246
0
        UErrorCode errorCode2 = U_ZERO_ERROR;
247
0
        int32_t length;
248
0
        if(totalSize < capacity) {
249
0
            length = utrie2_serialize(data.trie, dest + totalSize,
250
0
                                      capacity - totalSize, &errorCode2);
251
0
        } else {
252
0
            length = utrie2_serialize(data.trie, nullptr, 0, &errorCode2);
253
0
        }
254
0
        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
255
0
            errorCode = errorCode2;
256
0
            return 0;
257
0
        }
258
        // The trie size should be a multiple of 8 bytes due to the way
259
        // compactIndex2(UNewTrie2 *trie) currently works.
260
0
        U_ASSERT((length & 7) == 0);
261
0
        totalSize += length;
262
0
    }
263
264
0
    indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
265
0
    indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
266
0
    if(hasMappings && data.cesLength != 0) {
267
0
        U_ASSERT(((headerSize + totalSize) & 7) == 0);
268
0
        totalSize += data.cesLength * 8;
269
0
    }
270
271
0
    indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
272
0
    indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
273
0
    if(hasMappings) {
274
0
        totalSize += data.ce32sLength * 4;
275
0
    }
276
277
0
    indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
278
0
    totalSize += rootElementsLength * 4;
279
280
0
    indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
281
0
    if(hasMappings) {
282
0
        totalSize += data.contextsLength * 2;
283
0
    }
284
285
0
    indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
286
0
    if(hasMappings && !unsafeBackwardSet.isEmpty()) {
287
0
        UErrorCode errorCode2 = U_ZERO_ERROR;
288
0
        int32_t length;
289
0
        if(totalSize < capacity) {
290
0
            uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
291
0
            length = unsafeBackwardSet.serialize(
292
0
                    p, (capacity - totalSize) / 2, errorCode2);
293
0
        } else {
294
0
            length = unsafeBackwardSet.serialize(nullptr, 0, errorCode2);
295
0
        }
296
0
        if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
297
0
            errorCode = errorCode2;
298
0
            return 0;
299
0
        }
300
0
        totalSize += length * 2;
301
0
    }
302
303
0
    indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
304
0
    totalSize += fastLatinTableLength * 2;
305
306
0
    UnicodeString scripts;
307
0
    indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
308
0
    if(isBase) {
309
0
        scripts.append(static_cast<char16_t>(data.numScripts));
310
0
        scripts.append(reinterpret_cast<const char16_t *>(data.scriptsIndex), data.numScripts + 16);
311
0
        scripts.append(reinterpret_cast<const char16_t *>(data.scriptStarts), data.scriptStartsLength);
312
0
        totalSize += scripts.length() * 2;
313
0
    }
314
315
0
    indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
316
0
    if(isBase) {
317
0
        totalSize += 256;
318
0
    }
319
320
0
    indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
321
0
    indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
322
323
0
    if(totalSize > capacity) {
324
0
        errorCode = U_BUFFER_OVERFLOW_ERROR;
325
0
        return headerSize + totalSize;
326
0
    }
327
328
0
    uprv_memcpy(dest, indexes, indexesLength * 4);
329
0
    copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
330
0
    copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
331
    // The trie has already been serialized into the dest buffer.
332
0
    copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
333
0
    copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
334
0
    copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
335
0
    copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
336
    // The unsafeBackwardSet has already been serialized into the dest buffer.
337
0
    copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
338
0
    copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
339
0
    copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
340
341
0
    return headerSize + totalSize;
342
0
}
343
344
void
345
CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
346
0
                              const void *src, uint8_t *dest) {
347
0
    int32_t start = indexes[startIndex];
348
0
    int32_t limit = indexes[startIndex + 1];
349
0
    if(start < limit) {
350
0
        uprv_memcpy(dest + start, src, limit - start);
351
0
    }
352
0
}
353
354
U_NAMESPACE_END
355
356
#endif  // !UCONFIG_NO_COLLATION