/src/icu/source/i18n/collationdatawriter.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | * Copyright (C) 2013-2015, International Business Machines  | 
6  |  | * Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | * collationdatawriter.cpp  | 
9  |  | *  | 
10  |  | * created on: 2013aug06  | 
11  |  | * created by: Markus W. Scherer  | 
12  |  | */  | 
13  |  |  | 
14  |  | #include "unicode/utypes.h"  | 
15  |  |  | 
16  |  | #if !UCONFIG_NO_COLLATION  | 
17  |  |  | 
18  |  | #include "unicode/tblcoll.h"  | 
19  |  | #include "unicode/udata.h"  | 
20  |  | #include "unicode/uniset.h"  | 
21  |  | #include "cmemory.h"  | 
22  |  | #include "collationdata.h"  | 
23  |  | #include "collationdatabuilder.h"  | 
24  |  | #include "collationdatareader.h"  | 
25  |  | #include "collationdatawriter.h"  | 
26  |  | #include "collationfastlatin.h"  | 
27  |  | #include "collationsettings.h"  | 
28  |  | #include "collationtailoring.h"  | 
29  |  | #include "uassert.h"  | 
30  |  | #include "ucmndata.h"  | 
31  |  |  | 
32  |  | U_NAMESPACE_BEGIN  | 
33  |  |  | 
34  |  | uint8_t *  | 
35  | 0  | RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const { | 
36  | 0  |     if(U_FAILURE(errorCode)) { return NULL; } | 
37  | 0  |     LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));  | 
38  | 0  |     if(buffer.isNull()) { | 
39  | 0  |         errorCode = U_MEMORY_ALLOCATION_ERROR;  | 
40  | 0  |         return NULL;  | 
41  | 0  |     }  | 
42  | 0  |     length = cloneBinary(buffer.getAlias(), 20000, errorCode);  | 
43  | 0  |     if(errorCode == U_BUFFER_OVERFLOW_ERROR) { | 
44  | 0  |         if(buffer.allocateInsteadAndCopy(length, 0) == NULL) { | 
45  | 0  |             errorCode = U_MEMORY_ALLOCATION_ERROR;  | 
46  | 0  |             return NULL;  | 
47  | 0  |         }  | 
48  | 0  |         errorCode = U_ZERO_ERROR;  | 
49  | 0  |         length = cloneBinary(buffer.getAlias(), length, errorCode);  | 
50  | 0  |     }  | 
51  | 0  |     if(U_FAILURE(errorCode)) { return NULL; } | 
52  | 0  |     return buffer.orphan();  | 
53  | 0  | }  | 
54  |  |  | 
55  |  | int32_t  | 
56  | 0  | RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const { | 
57  | 0  |     int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];  | 
58  | 0  |     return CollationDataWriter::writeTailoring(  | 
59  | 0  |             *tailoring, *settings, indexes, dest, capacity,  | 
60  | 0  |             errorCode);  | 
61  | 0  | }  | 
62  |  |  | 
63  |  | static const UDataInfo dataInfo = { | 
64  |  |     sizeof(UDataInfo),  | 
65  |  |     0,  | 
66  |  |  | 
67  |  |     U_IS_BIG_ENDIAN,  | 
68  |  |     U_CHARSET_FAMILY,  | 
69  |  |     U_SIZEOF_UCHAR,  | 
70  |  |     0,  | 
71  |  |  | 
72  |  |     { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol" | 
73  |  |     { 5, 0, 0, 0 },                     // formatVersion | 
74  |  |     { 6, 3, 0, 0 }                      // dataVersion | 
75  |  | };  | 
76  |  |  | 
77  |  | int32_t  | 
78  |  | CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,  | 
79  |  |                                const void *rootElements, int32_t rootElementsLength,  | 
80  |  |                                int32_t indexes[], uint8_t *dest, int32_t capacity,  | 
81  | 0  |                                UErrorCode &errorCode) { | 
82  | 0  |     return write(TRUE, NULL,  | 
83  | 0  |                  data, settings,  | 
84  | 0  |                  rootElements, rootElementsLength,  | 
85  | 0  |                  indexes, dest, capacity, errorCode);  | 
86  | 0  | }  | 
87  |  |  | 
88  |  | int32_t  | 
89  |  | CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,  | 
90  |  |                                     int32_t indexes[], uint8_t *dest, int32_t capacity,  | 
91  | 0  |                                     UErrorCode &errorCode) { | 
92  | 0  |     return write(FALSE, t.version,  | 
93  | 0  |                  *t.data, settings,  | 
94  | 0  |                  NULL, 0,  | 
95  | 0  |                  indexes, dest, capacity, errorCode);  | 
96  | 0  | }  | 
97  |  |  | 
98  |  | int32_t  | 
99  |  | CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,  | 
100  |  |                            const CollationData &data, const CollationSettings &settings,  | 
101  |  |                            const void *rootElements, int32_t rootElementsLength,  | 
102  |  |                            int32_t indexes[], uint8_t *dest, int32_t capacity,  | 
103  | 0  |                            UErrorCode &errorCode) { | 
104  | 0  |     if(U_FAILURE(errorCode)) { return 0; } | 
105  | 0  |     if(capacity < 0 || (capacity > 0 && dest == NULL)) { | 
106  | 0  |         errorCode = U_ILLEGAL_ARGUMENT_ERROR;  | 
107  | 0  |         return 0;  | 
108  | 0  |     }  | 
109  |  |  | 
110  |  |     // Figure out which data items to write before settling on  | 
111  |  |     // the indexes length and writing offsets.  | 
112  |  |     // For any data item, we need to write the start and limit offsets,  | 
113  |  |     // so the indexes length must be at least index-of-start-offset + 2.  | 
114  | 0  |     int32_t indexesLength;  | 
115  | 0  |     UBool hasMappings;  | 
116  | 0  |     UnicodeSet unsafeBackwardSet;  | 
117  | 0  |     const CollationData *baseData = data.base;  | 
118  |  | 
  | 
119  | 0  |     int32_t fastLatinVersion;  | 
120  | 0  |     if(data.fastLatinTable != NULL) { | 
121  | 0  |         fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;  | 
122  | 0  |     } else { | 
123  | 0  |         fastLatinVersion = 0;  | 
124  | 0  |     }  | 
125  | 0  |     int32_t fastLatinTableLength = 0;  | 
126  |  | 
  | 
127  | 0  |     if(isBase) { | 
128  |  |         // For the root collator, we write an even number of indexes  | 
129  |  |         // so that we start with an 8-aligned offset.  | 
130  | 0  |         indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;  | 
131  | 0  |         U_ASSERT(settings.reorderCodesLength == 0);  | 
132  | 0  |         hasMappings = TRUE;  | 
133  | 0  |         unsafeBackwardSet = *data.unsafeBackwardSet;  | 
134  | 0  |         fastLatinTableLength = data.fastLatinTableLength;  | 
135  | 0  |     } else if(baseData == NULL) { | 
136  | 0  |         hasMappings = FALSE;  | 
137  | 0  |         if(settings.reorderCodesLength == 0) { | 
138  |  |             // only options  | 
139  | 0  |             indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here  | 
140  | 0  |         } else { | 
141  |  |             // only options, reorder codes, and the reorder table  | 
142  | 0  |             indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;  | 
143  | 0  |         }  | 
144  | 0  |     } else { | 
145  | 0  |         hasMappings = TRUE;  | 
146  |  |         // Tailored mappings, and what else?  | 
147  |  |         // Check in ascending order of optional tailoring data items.  | 
148  | 0  |         indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;  | 
149  | 0  |         if(data.contextsLength != 0) { | 
150  | 0  |             indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;  | 
151  | 0  |         }  | 
152  | 0  |         unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);  | 
153  | 0  |         if(!unsafeBackwardSet.isEmpty()) { | 
154  | 0  |             indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;  | 
155  | 0  |         }  | 
156  | 0  |         if(data.fastLatinTable != baseData->fastLatinTable) { | 
157  | 0  |             fastLatinTableLength = data.fastLatinTableLength;  | 
158  | 0  |             indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;  | 
159  | 0  |         }  | 
160  | 0  |     }  | 
161  |  | 
  | 
162  | 0  |     UVector32 codesAndRanges(errorCode);  | 
163  | 0  |     const int32_t *reorderCodes = settings.reorderCodes;  | 
164  | 0  |     int32_t reorderCodesLength = settings.reorderCodesLength;  | 
165  | 0  |     if(settings.hasReordering() &&  | 
166  | 0  |             CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) { | 
167  |  |         // Rebuild the full list of reorder ranges.  | 
168  |  |         // The list in the settings is truncated for efficiency.  | 
169  | 0  |         data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);  | 
170  |  |         // Write the codes, then the ranges.  | 
171  | 0  |         for(int32_t i = 0; i < reorderCodesLength; ++i) { | 
172  | 0  |             codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);  | 
173  | 0  |         }  | 
174  | 0  |         if(U_FAILURE(errorCode)) { return 0; } | 
175  | 0  |         reorderCodes = codesAndRanges.getBuffer();  | 
176  | 0  |         reorderCodesLength = codesAndRanges.size();  | 
177  | 0  |     }  | 
178  |  |  | 
179  | 0  |     int32_t headerSize;  | 
180  | 0  |     if(isBase) { | 
181  | 0  |         headerSize = 0;  // udata_create() writes the header  | 
182  | 0  |     } else { | 
183  | 0  |         DataHeader header;  | 
184  | 0  |         header.dataHeader.magic1 = 0xda;  | 
185  | 0  |         header.dataHeader.magic2 = 0x27;  | 
186  | 0  |         uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));  | 
187  | 0  |         uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));  | 
188  | 0  |         headerSize = (int32_t)sizeof(header);  | 
189  | 0  |         U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes  | 
190  | 0  |         if(hasMappings && data.cesLength != 0) { | 
191  |  |             // Sum of the sizes of the data items which are  | 
192  |  |             // not automatically multiples of 8 bytes and which are placed before the CEs.  | 
193  | 0  |             int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;  | 
194  | 0  |             if((sum & 7) != 0) { | 
195  |  |                 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.  | 
196  |  |                 // We add to the header size here.  | 
197  |  |                 // Alternatively, we could increment the indexesLength  | 
198  |  |                 // or add a few bytes to the reorderTable.  | 
199  | 0  |                 headerSize += 4;  | 
200  | 0  |             }  | 
201  | 0  |         }  | 
202  | 0  |         header.dataHeader.headerSize = (uint16_t)headerSize;  | 
203  | 0  |         if(headerSize <= capacity) { | 
204  | 0  |             uprv_memcpy(dest, &header, sizeof(header));  | 
205  |  |             // Write 00 bytes so that the padding is not mistaken for a copyright string.  | 
206  | 0  |             uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));  | 
207  | 0  |             dest += headerSize;  | 
208  | 0  |             capacity -= headerSize;  | 
209  | 0  |         } else { | 
210  | 0  |             dest = NULL;  | 
211  | 0  |             capacity = 0;  | 
212  | 0  |         }  | 
213  | 0  |     }  | 
214  |  | 
  | 
215  | 0  |     indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;  | 
216  | 0  |     U_ASSERT((settings.options & ~0xffff) == 0);  | 
217  | 0  |     indexes[CollationDataReader::IX_OPTIONS] =  | 
218  | 0  |             data.numericPrimary | fastLatinVersion | settings.options;  | 
219  | 0  |     indexes[CollationDataReader::IX_RESERVED2] = 0;  | 
220  | 0  |     indexes[CollationDataReader::IX_RESERVED3] = 0;  | 
221  |  |  | 
222  |  |     // Byte offsets of data items all start from the start of the indexes.  | 
223  |  |     // We add the headerSize at the very end.  | 
224  | 0  |     int32_t totalSize = indexesLength * 4;  | 
225  |  | 
  | 
226  | 0  |     if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) { | 
227  | 0  |         indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s);  | 
228  | 0  |     } else { | 
229  | 0  |         indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;  | 
230  | 0  |     }  | 
231  |  | 
  | 
232  | 0  |     indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;  | 
233  | 0  |     totalSize += reorderCodesLength * 4;  | 
234  |  | 
  | 
235  | 0  |     indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;  | 
236  | 0  |     if(settings.reorderTable != NULL) { | 
237  | 0  |         totalSize += 256;  | 
238  | 0  |     }  | 
239  |  | 
  | 
240  | 0  |     indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;  | 
241  | 0  |     if(hasMappings) { | 
242  | 0  |         UErrorCode errorCode2 = U_ZERO_ERROR;  | 
243  | 0  |         int32_t length;  | 
244  | 0  |         if(totalSize < capacity) { | 
245  | 0  |             length = utrie2_serialize(data.trie, dest + totalSize,  | 
246  | 0  |                                       capacity - totalSize, &errorCode2);  | 
247  | 0  |         } else { | 
248  | 0  |             length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);  | 
249  | 0  |         }  | 
250  | 0  |         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { | 
251  | 0  |             errorCode = errorCode2;  | 
252  | 0  |             return 0;  | 
253  | 0  |         }  | 
254  |  |         // The trie size should be a multiple of 8 bytes due to the way  | 
255  |  |         // compactIndex2(UNewTrie2 *trie) currently works.  | 
256  | 0  |         U_ASSERT((length & 7) == 0);  | 
257  | 0  |         totalSize += length;  | 
258  | 0  |     }  | 
259  |  |  | 
260  | 0  |     indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;  | 
261  | 0  |     indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;  | 
262  | 0  |     if(hasMappings && data.cesLength != 0) { | 
263  | 0  |         U_ASSERT(((headerSize + totalSize) & 7) == 0);  | 
264  | 0  |         totalSize += data.cesLength * 8;  | 
265  | 0  |     }  | 
266  |  | 
  | 
267  | 0  |     indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;  | 
268  | 0  |     indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;  | 
269  | 0  |     if(hasMappings) { | 
270  | 0  |         totalSize += data.ce32sLength * 4;  | 
271  | 0  |     }  | 
272  |  | 
  | 
273  | 0  |     indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;  | 
274  | 0  |     totalSize += rootElementsLength * 4;  | 
275  |  | 
  | 
276  | 0  |     indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;  | 
277  | 0  |     if(hasMappings) { | 
278  | 0  |         totalSize += data.contextsLength * 2;  | 
279  | 0  |     }  | 
280  |  | 
  | 
281  | 0  |     indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;  | 
282  | 0  |     if(hasMappings && !unsafeBackwardSet.isEmpty()) { | 
283  | 0  |         UErrorCode errorCode2 = U_ZERO_ERROR;  | 
284  | 0  |         int32_t length;  | 
285  | 0  |         if(totalSize < capacity) { | 
286  | 0  |             uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);  | 
287  | 0  |             length = unsafeBackwardSet.serialize(  | 
288  | 0  |                     p, (capacity - totalSize) / 2, errorCode2);  | 
289  | 0  |         } else { | 
290  | 0  |             length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);  | 
291  | 0  |         }  | 
292  | 0  |         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { | 
293  | 0  |             errorCode = errorCode2;  | 
294  | 0  |             return 0;  | 
295  | 0  |         }  | 
296  | 0  |         totalSize += length * 2;  | 
297  | 0  |     }  | 
298  |  |  | 
299  | 0  |     indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;  | 
300  | 0  |     totalSize += fastLatinTableLength * 2;  | 
301  |  | 
  | 
302  | 0  |     UnicodeString scripts;  | 
303  | 0  |     indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;  | 
304  | 0  |     if(isBase) { | 
305  | 0  |         scripts.append((UChar)data.numScripts);  | 
306  | 0  |         scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);  | 
307  | 0  |         scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);  | 
308  | 0  |         totalSize += scripts.length() * 2;  | 
309  | 0  |     }  | 
310  |  | 
  | 
311  | 0  |     indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;  | 
312  | 0  |     if(isBase) { | 
313  | 0  |         totalSize += 256;  | 
314  | 0  |     }  | 
315  |  | 
  | 
316  | 0  |     indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;  | 
317  | 0  |     indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;  | 
318  |  | 
  | 
319  | 0  |     if(totalSize > capacity) { | 
320  | 0  |         errorCode = U_BUFFER_OVERFLOW_ERROR;  | 
321  | 0  |         return headerSize + totalSize;  | 
322  | 0  |     }  | 
323  |  |  | 
324  | 0  |     uprv_memcpy(dest, indexes, indexesLength * 4);  | 
325  | 0  |     copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);  | 
326  | 0  |     copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);  | 
327  |  |     // The trie has already been serialized into the dest buffer.  | 
328  | 0  |     copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);  | 
329  | 0  |     copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);  | 
330  | 0  |     copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);  | 
331  | 0  |     copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);  | 
332  |  |     // The unsafeBackwardSet has already been serialized into the dest buffer.  | 
333  | 0  |     copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);  | 
334  | 0  |     copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);  | 
335  | 0  |     copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);  | 
336  |  | 
  | 
337  | 0  |     return headerSize + totalSize;  | 
338  | 0  | }  | 
339  |  |  | 
340  |  | void  | 
341  |  | CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,  | 
342  | 0  |                               const void *src, uint8_t *dest) { | 
343  | 0  |     int32_t start = indexes[startIndex];  | 
344  | 0  |     int32_t limit = indexes[startIndex + 1];  | 
345  | 0  |     if(start < limit) { | 
346  | 0  |         uprv_memcpy(dest + start, src, limit - start);  | 
347  | 0  |     }  | 
348  | 0  | }  | 
349  |  |  | 
350  |  | U_NAMESPACE_END  | 
351  |  |  | 
352  |  | #endif  // !UCONFIG_NO_COLLATION  |