Coverage Report

Created: 2026-01-22 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/ucharstriebuilder.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*   Copyright (C) 2010-2012, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
*   file name:  ucharstriebuilder.h
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2010nov14
14
*   created by: Markus W. Scherer
15
*/
16
17
#include "unicode/utypes.h"
18
#include "unicode/ucharstrie.h"
19
#include "unicode/ucharstriebuilder.h"
20
#include "unicode/unistr.h"
21
#include "unicode/ustring.h"
22
#include "cmemory.h"
23
#include "uarrsort.h"
24
#include "uassert.h"
25
#include "uhash.h"
26
#include "ustr_imp.h"
27
28
U_NAMESPACE_BEGIN
29
30
/*
31
 * Note: This builder implementation stores (string, value) pairs with full copies
32
 * of the 16-bit-unit sequences, until the UCharsTrie is built.
33
 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
34
 */
35
36
class UCharsTrieElement : public UMemory {
37
public:
38
    // Use compiler's default constructor, initializes nothing.
39
40
    void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
41
42
721M
    UnicodeString getString(const UnicodeString &strings) const {
43
721M
        int32_t length=strings[stringOffset];
44
721M
        return strings.tempSubString(stringOffset+1, length);
45
721M
    }
46
153M
    int32_t getStringLength(const UnicodeString &strings) const {
47
153M
        return strings[stringOffset];
48
153M
    }
49
50
1.51G
    char16_t charAt(int32_t index, const UnicodeString &strings) const {
51
1.51G
        return strings[stringOffset+1+index];
52
1.51G
    }
53
54
42.2M
    int32_t getValue() const { return value; }
55
56
    int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
57
58
private:
59
    // The first strings unit contains the string length.
60
    // (Compared with a stringLength field here, this saves 2 bytes per string.)
61
    int32_t stringOffset;
62
    int32_t value;
63
};
64
65
void
66
UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
67
42.2M
                         UnicodeString &strings, UErrorCode &errorCode) {
68
42.2M
    if(U_FAILURE(errorCode)) {
69
0
        return;
70
0
    }
71
42.2M
    int32_t length=s.length();
72
42.2M
    if(length>0xffff) {
73
        // Too long: We store the length in 1 unit.
74
0
        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
75
0
        return;
76
0
    }
77
42.2M
    stringOffset=strings.length();
78
42.2M
    strings.append(static_cast<char16_t>(length));
79
42.2M
    value=val;
80
42.2M
    strings.append(s);
81
42.2M
}
82
83
int32_t
84
312M
UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
85
312M
    return getString(strings).compare(other.getString(strings));
86
312M
}
87
88
UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
89
871k
        : elements(nullptr), elementsCapacity(0), elementsLength(0),
90
871k
          uchars(nullptr), ucharsCapacity(0), ucharsLength(0) {}
91
92
871k
UCharsTrieBuilder::~UCharsTrieBuilder() {
93
871k
    delete[] elements;
94
871k
    uprv_free(uchars);
95
871k
}
96
97
UCharsTrieBuilder &
98
42.2M
UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
99
42.2M
    if(U_FAILURE(errorCode)) {
100
0
        return *this;
101
0
    }
102
42.2M
    if(ucharsLength>0) {
103
        // Cannot add elements after building.
104
0
        errorCode=U_NO_WRITE_PERMISSION;
105
0
        return *this;
106
0
    }
107
42.2M
    if(elementsLength==elementsCapacity) {
108
483k
        int32_t newCapacity;
109
483k
        if(elementsCapacity==0) {
110
481k
            newCapacity=1024;
111
481k
        } else {
112
1.99k
            newCapacity=4*elementsCapacity;
113
1.99k
        }
114
483k
        UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
115
483k
        if(newElements==nullptr) {
116
0
            errorCode=U_MEMORY_ALLOCATION_ERROR;
117
0
            return *this;
118
0
        }
119
483k
        if(elementsLength>0) {
120
1.99k
            uprv_memcpy(newElements, elements, (size_t)elementsLength*sizeof(UCharsTrieElement));
121
1.99k
        }
122
483k
        delete[] elements;
123
483k
        elements=newElements;
124
483k
        elementsCapacity=newCapacity;
125
483k
    }
126
42.2M
    elements[elementsLength++].setTo(s, value, strings, errorCode);
127
42.2M
    if(U_SUCCESS(errorCode) && strings.isBogus()) {
128
0
        errorCode=U_MEMORY_ALLOCATION_ERROR;
129
0
    }
130
42.2M
    return *this;
131
42.2M
}
132
133
U_CDECL_BEGIN
134
135
static int32_t U_CALLCONV
136
312M
compareElementStrings(const void *context, const void *left, const void *right) {
137
312M
    const UnicodeString *strings=static_cast<const UnicodeString *>(context);
138
312M
    const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
139
312M
    const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
140
312M
    return leftElement->compareStringTo(*rightElement, *strings);
141
312M
}
142
143
U_CDECL_END
144
145
UCharsTrie *
146
80
UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
147
80
    buildUChars(buildOption, errorCode);
148
80
    UCharsTrie *newTrie=nullptr;
149
80
    if(U_SUCCESS(errorCode)) {
150
80
        newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
151
80
        if(newTrie==nullptr) {
152
0
            errorCode=U_MEMORY_ALLOCATION_ERROR;
153
80
        } else {
154
80
            uchars=nullptr;  // The new trie now owns the array.
155
80
            ucharsCapacity=0;
156
80
        }
157
80
    }
158
80
    return newTrie;
159
80
}
160
161
UnicodeString &
162
UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
163
1.23M
                                      UErrorCode &errorCode) {
164
1.23M
    buildUChars(buildOption, errorCode);
165
1.23M
    if(U_SUCCESS(errorCode)) {
166
1.23M
        result.setTo(false, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
167
1.23M
    }
168
1.23M
    return result;
169
1.23M
}
170
171
void
172
1.23M
UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
173
1.23M
    if(U_FAILURE(errorCode)) {
174
0
        return;
175
0
    }
176
1.23M
    if(uchars!=nullptr && ucharsLength>0) {
177
        // Already built.
178
0
        return;
179
0
    }
180
1.23M
    if(ucharsLength==0) {
181
1.23M
        if(elementsLength==0) {
182
0
            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
183
0
            return;
184
0
        }
185
1.23M
        if(strings.isBogus()) {
186
0
            errorCode=U_MEMORY_ALLOCATION_ERROR;
187
0
            return;
188
0
        }
189
1.23M
        uprv_sortArray(elements, elementsLength, static_cast<int32_t>(sizeof(UCharsTrieElement)),
190
1.23M
                      compareElementStrings, &strings,
191
1.23M
                      false,  // need not be a stable sort
192
1.23M
                      &errorCode);
193
1.23M
        if(U_FAILURE(errorCode)) {
194
0
            return;
195
0
        }
196
        // Duplicate strings are not allowed.
197
1.23M
        UnicodeString prev=elements[0].getString(strings);
198
42.2M
        for(int32_t i=1; i<elementsLength; ++i) {
199
40.9M
            UnicodeString current=elements[i].getString(strings);
200
40.9M
            if(prev==current) {
201
0
                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
202
0
                return;
203
0
            }
204
40.9M
            prev.fastCopyFrom(current);
205
40.9M
        }
206
1.23M
    }
207
    // Create and char16_t-serialize the trie for the elements.
208
1.23M
    ucharsLength=0;
209
1.23M
    int32_t capacity=strings.length();
210
1.23M
    if(capacity<1024) {
211
1.09M
        capacity=1024;
212
1.09M
    }
213
1.23M
    if(ucharsCapacity<capacity) {
214
487k
        uprv_free(uchars);
215
487k
        uchars=static_cast<char16_t *>(uprv_malloc(capacity*2));
216
487k
        if(uchars==nullptr) {
217
0
            errorCode=U_MEMORY_ALLOCATION_ERROR;
218
0
            ucharsCapacity=0;
219
0
            return;
220
0
        }
221
487k
        ucharsCapacity=capacity;
222
487k
    }
223
1.23M
    StringTrieBuilder::build(buildOption, elementsLength, errorCode);
224
1.23M
    if(uchars==nullptr) {
225
0
        errorCode=U_MEMORY_ALLOCATION_ERROR;
226
0
    }
227
1.23M
}
228
229
int32_t
230
118M
UCharsTrieBuilder::getElementStringLength(int32_t i) const {
231
118M
    return elements[i].getStringLength(strings);
232
118M
}
233
234
char16_t
235
175M
UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
236
175M
    return elements[i].charAt(unitIndex, strings);
237
175M
}
238
239
int32_t
240
42.2M
UCharsTrieBuilder::getElementValue(int32_t i) const {
241
42.2M
    return elements[i].getValue();
242
42.2M
}
243
244
int32_t
245
34.8M
UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
246
34.8M
    const UCharsTrieElement &firstElement=elements[first];
247
34.8M
    const UCharsTrieElement &lastElement=elements[last];
248
34.8M
    int32_t minStringLength=firstElement.getStringLength(strings);
249
461M
    while(++unitIndex<minStringLength &&
250
440M
            firstElement.charAt(unitIndex, strings)==
251
440M
            lastElement.charAt(unitIndex, strings)) {}
252
34.8M
    return unitIndex;
253
34.8M
}
254
255
int32_t
256
21.0M
UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
257
21.0M
    int32_t length=0;  // Number of different units at unitIndex.
258
21.0M
    int32_t i=start;
259
61.6M
    do {
260
61.6M
        char16_t unit=elements[i++].charAt(unitIndex, strings);
261
240M
        while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
262
178M
            ++i;
263
178M
        }
264
61.6M
        ++length;
265
61.6M
    } while(i<limit);
266
21.0M
    return length;
267
21.0M
}
268
269
int32_t
270
2.45M
UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
271
13.4M
    do {
272
13.4M
        char16_t unit=elements[i++].charAt(unitIndex, strings);
273
19.2M
        while(unit==elements[i].charAt(unitIndex, strings)) {
274
5.76M
            ++i;
275
5.76M
        }
276
13.4M
    } while(--count>0);
277
2.45M
    return i;
278
2.45M
}
279
280
int32_t
281
38.1M
UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const {
282
142M
    while(unit==elements[i].charAt(unitIndex, strings)) {
283
103M
        ++i;
284
103M
    }
285
38.1M
    return i;
286
38.1M
}
287
288
UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const char16_t *units, int32_t len, Node *nextNode)
289
54.2M
        : LinearMatchNode(len, nextNode), s(units) {
290
54.2M
    hash=hash*37u+ustr_hashUCharsN(units, len);
291
54.2M
}
292
293
bool
294
48.2M
UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
295
48.2M
    if(this==&other) {
296
0
        return true;
297
0
    }
298
48.2M
    if(!LinearMatchNode::operator==(other)) {
299
13.1k
        return false;
300
13.1k
    }
301
48.2M
    const UCTLinearMatchNode &o=static_cast<const UCTLinearMatchNode &>(other);
302
48.2M
    return 0==u_memcmp(s, o.s, length);
303
48.2M
}
304
305
void
306
10.7M
UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
307
10.7M
    UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
308
10.7M
    next->write(builder);
309
10.7M
    b.write(s, length);
310
10.7M
    offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
311
10.7M
}
312
313
StringTrieBuilder::Node *
314
UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
315
54.2M
                                         Node *nextNode) const {
316
54.2M
    return new UCTLinearMatchNode(
317
54.2M
            elements[i].getString(strings).getBuffer()+unitIndex,
318
54.2M
            length,
319
54.2M
            nextNode);
320
54.2M
}
321
322
UBool
323
66.5M
UCharsTrieBuilder::ensureCapacity(int32_t length) {
324
66.5M
    if(uchars==nullptr) {
325
0
        return false;  // previous memory allocation had failed
326
0
    }
327
66.5M
    if(length>ucharsCapacity) {
328
1.30k
        int32_t newCapacity=ucharsCapacity;
329
1.30k
        do {
330
1.30k
            newCapacity*=2;
331
1.30k
        } while(newCapacity<=length);
332
1.30k
        char16_t *newUChars=static_cast<char16_t *>(uprv_malloc(newCapacity*2));
333
1.30k
        if(newUChars==nullptr) {
334
            // unable to allocate memory
335
0
            uprv_free(uchars);
336
0
            uchars=nullptr;
337
0
            ucharsCapacity=0;
338
0
            return false;
339
0
        }
340
1.30k
        u_memcpy(newUChars+(newCapacity-ucharsLength),
341
1.30k
                 uchars+(ucharsCapacity-ucharsLength), ucharsLength);
342
1.30k
        uprv_free(uchars);
343
1.30k
        uchars=newUChars;
344
1.30k
        ucharsCapacity=newCapacity;
345
1.30k
    }
346
66.5M
    return true;
347
66.5M
}
348
349
int32_t
350
46.8M
UCharsTrieBuilder::write(int32_t unit) {
351
46.8M
    int32_t newLength=ucharsLength+1;
352
46.8M
    if(ensureCapacity(newLength)) {
353
46.8M
        ucharsLength=newLength;
354
46.8M
        uchars[ucharsCapacity - ucharsLength] = static_cast<char16_t>(unit);
355
46.8M
    }
356
46.8M
    return ucharsLength;
357
46.8M
}
358
359
int32_t
360
19.6M
UCharsTrieBuilder::write(const char16_t *s, int32_t length) {
361
19.6M
    int32_t newLength=ucharsLength+length;
362
19.6M
    if(ensureCapacity(newLength)) {
363
19.6M
        ucharsLength=newLength;
364
19.6M
        u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
365
19.6M
    }
366
19.6M
    return ucharsLength;
367
19.6M
}
368
369
int32_t
370
4.43k
UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
371
4.43k
    return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
372
4.43k
}
373
374
int32_t
375
17.2M
UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
376
17.2M
    if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
377
8.42M
        return write(i|(isFinal<<15));
378
8.42M
    }
379
8.78M
    char16_t intUnits[3];
380
8.78M
    int32_t length;
381
8.78M
    if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
382
3.82M
        intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitValueLead);
383
3.82M
        intUnits[1] = static_cast<char16_t>(static_cast<uint32_t>(i) >> 16);
384
3.82M
        intUnits[2] = static_cast<char16_t>(i);
385
3.82M
        length=3;
386
    // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
387
    //     intUnits[0]=(char16_t)(i);
388
    //     length=1;
389
4.95M
    } else {
390
4.95M
        intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitValueLead + (i >> 16));
391
4.95M
        intUnits[1] = static_cast<char16_t>(i);
392
4.95M
        length=2;
393
4.95M
    }
394
8.78M
    intUnits[0] = static_cast<char16_t>(intUnits[0] | (isFinal << 15));
395
8.78M
    return write(intUnits, length);
396
17.2M
}
397
398
int32_t
399
16.7M
UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
400
16.7M
    if(!hasValue) {
401
16.6M
        return write(node);
402
16.6M
    }
403
124k
    char16_t intUnits[3];
404
124k
    int32_t length;
405
124k
    if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
406
59.6k
        intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitNodeValueLead);
407
59.6k
        intUnits[1] = static_cast<char16_t>(static_cast<uint32_t>(value) >> 16);
408
59.6k
        intUnits[2] = static_cast<char16_t>(value);
409
59.6k
        length=3;
410
65.3k
    } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
411
4.98k
        intUnits[0] = static_cast<char16_t>((value + 1) << 6);
412
4.98k
        length=1;
413
60.3k
    } else {
414
60.3k
        intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitNodeValueLead + ((value >> 10) & 0x7fc0));
415
60.3k
        intUnits[1] = static_cast<char16_t>(value);
416
60.3k
        length=2;
417
60.3k
    }
418
124k
    intUnits[0] |= static_cast<char16_t>(node);
419
124k
    return write(intUnits, length);
420
16.7M
}
421
422
int32_t
423
1.33M
UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
424
1.33M
    int32_t i=ucharsLength-jumpTarget;
425
1.33M
    U_ASSERT(i>=0);
426
1.33M
    if(i<=UCharsTrie::kMaxOneUnitDelta) {
427
1.33M
        return write(i);
428
1.33M
    }
429
0
    char16_t intUnits[3];
430
0
    int32_t length;
431
0
    if(i<=UCharsTrie::kMaxTwoUnitDelta) {
432
0
        intUnits[0] = static_cast<char16_t>(UCharsTrie::kMinTwoUnitDeltaLead + (i >> 16));
433
0
        length=1;
434
0
    } else {
435
0
        intUnits[0] = static_cast<char16_t>(UCharsTrie::kThreeUnitDeltaLead);
436
0
        intUnits[1] = static_cast<char16_t>(i >> 16);
437
0
        length=2;
438
0
    }
439
0
    intUnits[length++] = static_cast<char16_t>(i);
440
0
    return write(intUnits, length);
441
1.33M
}
442
443
U_NAMESPACE_END