Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/common/bytestriebuilder.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*   Copyright (C) 2010-2012, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
*   file name:  bytestriebuilder.cpp
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2010sep25
14
*   created by: Markus W. Scherer
15
*/
16
17
#include "unicode/utypes.h"
18
#include "unicode/bytestrie.h"
19
#include "unicode/bytestriebuilder.h"
20
#include "unicode/stringpiece.h"
21
#include "charstr.h"
22
#include "cmemory.h"
23
#include "uhash.h"
24
#include "uarrsort.h"
25
#include "uassert.h"
26
#include "ustr_imp.h"
27
28
U_NAMESPACE_BEGIN
29
30
/*
31
 * Note: This builder implementation stores (bytes, value) pairs with full copies
32
 * of the byte sequences, until the BytesTrie is built.
33
 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
34
 */
35
36
class BytesTrieElement : public UMemory {
37
public:
38
    // Use compiler's default constructor, initializes nothing.
39
40
    void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
41
42
0
    StringPiece getString(const CharString &strings) const {
43
0
        int32_t offset=stringOffset;
44
0
        int32_t length;
45
0
        if(offset>=0) {
46
0
            length=(uint8_t)strings[offset++];
47
0
        } else {
48
0
            offset=~offset;
49
0
            length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
50
0
            offset+=2;
51
0
        }
52
0
        return StringPiece(strings.data()+offset, length);
53
0
    }
54
0
    int32_t getStringLength(const CharString &strings) const {
55
0
        int32_t offset=stringOffset;
56
0
        if(offset>=0) {
57
0
            return (uint8_t)strings[offset];
58
0
        } else {
59
0
            offset=~offset;
60
0
            return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
61
0
        }
62
0
    }
63
64
0
    char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
65
66
0
    int32_t getValue() const { return value; }
67
68
    int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
69
70
private:
71
0
    const char *data(const CharString &strings) const {
72
0
        int32_t offset=stringOffset;
73
0
        if(offset>=0) {
74
0
            ++offset;
75
0
        } else {
76
0
            offset=~offset+2;
77
0
        }
78
0
        return strings.data()+offset;
79
0
    }
80
81
    // If the stringOffset is non-negative, then the first strings byte contains
82
    // the string length.
83
    // If the stringOffset is negative, then the first two strings bytes contain
84
    // the string length (big-endian), and the offset needs to be bit-inverted.
85
    // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
86
    int32_t stringOffset;
87
    int32_t value;
88
};
89
90
void
91
BytesTrieElement::setTo(StringPiece s, int32_t val,
92
0
                        CharString &strings, UErrorCode &errorCode) {
93
0
    if(U_FAILURE(errorCode)) {
94
0
        return;
95
0
    }
96
0
    int32_t length=s.length();
97
0
    if(length>0xffff) {
98
        // Too long: We store the length in 1 or 2 bytes.
99
0
        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
100
0
        return;
101
0
    }
102
0
    int32_t offset=strings.length();
103
0
    if(length>0xff) {
104
0
        offset=~offset;
105
0
        strings.append((char)(length>>8), errorCode);
106
0
    }
107
0
    strings.append((char)length, errorCode);
108
0
    stringOffset=offset;
109
0
    value=val;
110
0
    strings.append(s, errorCode);
111
0
}
112
113
int32_t
114
0
BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
115
    // TODO: add StringPiece::compare(), see ticket #8187
116
0
    StringPiece thisString=getString(strings);
117
0
    StringPiece otherString=other.getString(strings);
118
0
    int32_t lengthDiff=thisString.length()-otherString.length();
119
0
    int32_t commonLength;
120
0
    if(lengthDiff<=0) {
121
0
        commonLength=thisString.length();
122
0
    } else {
123
0
        commonLength=otherString.length();
124
0
    }
125
0
    int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
126
0
    return diff!=0 ? diff : lengthDiff;
127
0
}
128
129
BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
130
0
        : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
131
0
          bytes(NULL), bytesCapacity(0), bytesLength(0) {
132
0
    if(U_FAILURE(errorCode)) {
133
0
        return;
134
0
    }
135
0
    strings=new CharString();
136
0
    if(strings==NULL) {
137
0
        errorCode=U_MEMORY_ALLOCATION_ERROR;
138
0
    }
139
0
}
140
141
0
BytesTrieBuilder::~BytesTrieBuilder() {
142
0
    delete strings;
143
0
    delete[] elements;
144
0
    uprv_free(bytes);
145
0
}
146
147
BytesTrieBuilder &
148
0
BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
149
0
    if(U_FAILURE(errorCode)) {
150
0
        return *this;
151
0
    }
152
0
    if(bytesLength>0) {
153
        // Cannot add elements after building.
154
0
        errorCode=U_NO_WRITE_PERMISSION;
155
0
        return *this;
156
0
    }
157
0
    if(elementsLength==elementsCapacity) {
158
0
        int32_t newCapacity;
159
0
        if(elementsCapacity==0) {
160
0
            newCapacity=1024;
161
0
        } else {
162
0
            newCapacity=4*elementsCapacity;
163
0
        }
164
0
        BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
165
0
        if(newElements==NULL) {
166
0
            errorCode=U_MEMORY_ALLOCATION_ERROR;
167
0
            return *this; // error instead of dereferencing null
168
0
        }
169
0
        if(elementsLength>0) {
170
0
            uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
171
0
        }
172
0
        delete[] elements;
173
0
        elements=newElements;
174
0
        elementsCapacity=newCapacity;
175
0
    }
176
0
    elements[elementsLength++].setTo(s, value, *strings, errorCode);
177
0
    return *this;
178
0
}
179
180
U_CDECL_BEGIN
181
182
static int32_t U_CALLCONV
183
0
compareElementStrings(const void *context, const void *left, const void *right) {
184
0
    const CharString *strings=static_cast<const CharString *>(context);
185
0
    const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
186
0
    const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
187
0
    return leftElement->compareStringTo(*rightElement, *strings);
188
0
}
189
190
U_CDECL_END
191
192
BytesTrie *
193
0
BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
194
0
    buildBytes(buildOption, errorCode);
195
0
    BytesTrie *newTrie=NULL;
196
0
    if(U_SUCCESS(errorCode)) {
197
0
        newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
198
0
        if(newTrie==NULL) {
199
0
            errorCode=U_MEMORY_ALLOCATION_ERROR;
200
0
        } else {
201
0
            bytes=NULL;  // The new trie now owns the array.
202
0
            bytesCapacity=0;
203
0
        }
204
0
    }
205
0
    return newTrie;
206
0
}
207
208
StringPiece
209
0
BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
210
0
    buildBytes(buildOption, errorCode);
211
0
    StringPiece result;
212
0
    if(U_SUCCESS(errorCode)) {
213
0
        result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
214
0
    }
215
0
    return result;
216
0
}
217
218
void
219
0
BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
220
0
    if(U_FAILURE(errorCode)) {
221
0
        return;
222
0
    }
223
0
    if(bytes!=NULL && bytesLength>0) {
224
        // Already built.
225
0
        return;
226
0
    }
227
0
    if(bytesLength==0) {
228
0
        if(elementsLength==0) {
229
0
            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
230
0
            return;
231
0
        }
232
0
        uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
233
0
                      compareElementStrings, strings,
234
0
                      FALSE,  // need not be a stable sort
235
0
                      &errorCode);
236
0
        if(U_FAILURE(errorCode)) {
237
0
            return;
238
0
        }
239
        // Duplicate strings are not allowed.
240
0
        StringPiece prev=elements[0].getString(*strings);
241
0
        for(int32_t i=1; i<elementsLength; ++i) {
242
0
            StringPiece current=elements[i].getString(*strings);
243
0
            if(prev==current) {
244
0
                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
245
0
                return;
246
0
            }
247
0
            prev=current;
248
0
        }
249
0
    }
250
    // Create and byte-serialize the trie for the elements.
251
0
    bytesLength=0;
252
0
    int32_t capacity=strings->length();
253
0
    if(capacity<1024) {
254
0
        capacity=1024;
255
0
    }
256
0
    if(bytesCapacity<capacity) {
257
0
        uprv_free(bytes);
258
0
        bytes=static_cast<char *>(uprv_malloc(capacity));
259
0
        if(bytes==NULL) {
260
0
            errorCode=U_MEMORY_ALLOCATION_ERROR;
261
0
            bytesCapacity=0;
262
0
            return;
263
0
        }
264
0
        bytesCapacity=capacity;
265
0
    }
266
0
    StringTrieBuilder::build(buildOption, elementsLength, errorCode);
267
0
    if(bytes==NULL) {
268
0
        errorCode=U_MEMORY_ALLOCATION_ERROR;
269
0
    }
270
0
}
271
272
BytesTrieBuilder &
273
0
BytesTrieBuilder::clear() {
274
0
    strings->clear();
275
0
    elementsLength=0;
276
0
    bytesLength=0;
277
0
    return *this;
278
0
}
279
280
int32_t
281
0
BytesTrieBuilder::getElementStringLength(int32_t i) const {
282
0
    return elements[i].getStringLength(*strings);
283
0
}
284
285
UChar
286
0
BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
287
0
    return (uint8_t)elements[i].charAt(byteIndex, *strings);
288
0
}
289
290
int32_t
291
0
BytesTrieBuilder::getElementValue(int32_t i) const {
292
0
    return elements[i].getValue();
293
0
}
294
295
int32_t
296
0
BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
297
0
    const BytesTrieElement &firstElement=elements[first];
298
0
    const BytesTrieElement &lastElement=elements[last];
299
0
    int32_t minStringLength=firstElement.getStringLength(*strings);
300
0
    while(++byteIndex<minStringLength &&
301
0
            firstElement.charAt(byteIndex, *strings)==
302
0
            lastElement.charAt(byteIndex, *strings)) {}
303
0
    return byteIndex;
304
0
}
305
306
int32_t
307
0
BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
308
0
    int32_t length=0;  // Number of different bytes at byteIndex.
309
0
    int32_t i=start;
310
0
    do {
311
0
        char byte=elements[i++].charAt(byteIndex, *strings);
312
0
        while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
313
0
            ++i;
314
0
        }
315
0
        ++length;
316
0
    } while(i<limit);
317
0
    return length;
318
0
}
319
320
int32_t
321
0
BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
322
0
    do {
323
0
        char byte=elements[i++].charAt(byteIndex, *strings);
324
0
        while(byte==elements[i].charAt(byteIndex, *strings)) {
325
0
            ++i;
326
0
        }
327
0
    } while(--count>0);
328
0
    return i;
329
0
}
330
331
int32_t
332
0
BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
333
0
    char b=(char)byte;
334
0
    while(b==elements[i].charAt(byteIndex, *strings)) {
335
0
        ++i;
336
0
    }
337
0
    return i;
338
0
}
339
340
BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
341
0
        : LinearMatchNode(len, nextNode), s(bytes) {
342
0
    hash=static_cast<int32_t>(
343
0
        static_cast<uint32_t>(hash)*37u + static_cast<uint32_t>(ustr_hashCharsN(bytes, len)));
344
0
}
345
346
bool
347
0
BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
348
0
    if(this==&other) {
349
0
        return TRUE;
350
0
    }
351
0
    if(!LinearMatchNode::operator==(other)) {
352
0
        return FALSE;
353
0
    }
354
0
    const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
355
0
    return 0==uprv_memcmp(s, o.s, length);
356
0
}
357
358
void
359
0
BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
360
0
    BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
361
0
    next->write(builder);
362
0
    b.write(s, length);
363
0
    offset=b.write(b.getMinLinearMatch()+length-1);
364
0
}
365
366
StringTrieBuilder::Node *
367
BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
368
0
                                        Node *nextNode) const {
369
0
    return new BTLinearMatchNode(
370
0
            elements[i].getString(*strings).data()+byteIndex,
371
0
            length,
372
0
            nextNode);
373
0
}
374
375
UBool
376
0
BytesTrieBuilder::ensureCapacity(int32_t length) {
377
0
    if(bytes==NULL) {
378
0
        return FALSE;  // previous memory allocation had failed
379
0
    }
380
0
    if(length>bytesCapacity) {
381
0
        int32_t newCapacity=bytesCapacity;
382
0
        do {
383
0
            newCapacity*=2;
384
0
        } while(newCapacity<=length);
385
0
        char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
386
0
        if(newBytes==NULL) {
387
            // unable to allocate memory
388
0
            uprv_free(bytes);
389
0
            bytes=NULL;
390
0
            bytesCapacity=0;
391
0
            return FALSE;
392
0
        }
393
0
        uprv_memcpy(newBytes+(newCapacity-bytesLength),
394
0
                    bytes+(bytesCapacity-bytesLength), bytesLength);
395
0
        uprv_free(bytes);
396
0
        bytes=newBytes;
397
0
        bytesCapacity=newCapacity;
398
0
    }
399
0
    return TRUE;
400
0
}
401
402
int32_t
403
0
BytesTrieBuilder::write(int32_t byte) {
404
0
    int32_t newLength=bytesLength+1;
405
0
    if(ensureCapacity(newLength)) {
406
0
        bytesLength=newLength;
407
0
        bytes[bytesCapacity-bytesLength]=(char)byte;
408
0
    }
409
0
    return bytesLength;
410
0
}
411
412
int32_t
413
0
BytesTrieBuilder::write(const char *b, int32_t length) {
414
0
    int32_t newLength=bytesLength+length;
415
0
    if(ensureCapacity(newLength)) {
416
0
        bytesLength=newLength;
417
0
        uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
418
0
    }
419
0
    return bytesLength;
420
0
}
421
422
int32_t
423
0
BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
424
0
    return write(elements[i].getString(*strings).data()+byteIndex, length);
425
0
}
426
427
int32_t
428
0
BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
429
0
    if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
430
0
        return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
431
0
    }
432
0
    char intBytes[5];
433
0
    int32_t length=1;
434
0
    if(i<0 || i>0xffffff) {
435
0
        intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
436
0
        intBytes[1]=(char)((uint32_t)i>>24);
437
0
        intBytes[2]=(char)((uint32_t)i>>16);
438
0
        intBytes[3]=(char)((uint32_t)i>>8);
439
0
        intBytes[4]=(char)i;
440
0
        length=5;
441
    // } else if(i<=BytesTrie::kMaxOneByteValue) {
442
    //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
443
0
    } else {
444
0
        if(i<=BytesTrie::kMaxTwoByteValue) {
445
0
            intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
446
0
        } else {
447
0
            if(i<=BytesTrie::kMaxThreeByteValue) {
448
0
                intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
449
0
            } else {
450
0
                intBytes[0]=(char)BytesTrie::kFourByteValueLead;
451
0
                intBytes[1]=(char)(i>>16);
452
0
                length=2;
453
0
            }
454
0
            intBytes[length++]=(char)(i>>8);
455
0
        }
456
0
        intBytes[length++]=(char)i;
457
0
    }
458
0
    intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
459
0
    return write(intBytes, length);
460
0
}
461
462
int32_t
463
0
BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
464
0
    int32_t offset=write(node);
465
0
    if(hasValue) {
466
0
        offset=writeValueAndFinal(value, FALSE);
467
0
    }
468
0
    return offset;
469
0
}
470
471
int32_t
472
0
BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
473
0
    int32_t i=bytesLength-jumpTarget;
474
0
    U_ASSERT(i>=0);
475
0
    if(i<=BytesTrie::kMaxOneByteDelta) {
476
0
        return write(i);
477
0
    } else {
478
0
        char intBytes[5];
479
0
        return write(intBytes, internalEncodeDelta(i, intBytes));
480
0
    }
481
0
}
482
483
int32_t
484
0
BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) {
485
0
    U_ASSERT(i>=0);
486
0
    if(i<=BytesTrie::kMaxOneByteDelta) {
487
0
        intBytes[0]=(char)i;
488
0
        return 1;
489
0
    }
490
0
    int32_t length=1;
491
0
    if(i<=BytesTrie::kMaxTwoByteDelta) {
492
0
        intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
493
0
    } else {
494
0
        if(i<=BytesTrie::kMaxThreeByteDelta) {
495
0
            intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
496
0
        } else {
497
0
            if(i<=0xffffff) {
498
0
                intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
499
0
            } else {
500
0
                intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
501
0
                intBytes[1]=(char)(i>>24);
502
0
                length=2;
503
0
            }
504
0
            intBytes[length++]=(char)(i>>16);
505
0
        }
506
0
        intBytes[length++]=(char)(i>>8);
507
0
    }
508
0
    intBytes[length++]=(char)i;
509
0
    return length;
510
0
}
511
512
U_NAMESPACE_END