Coverage Report

Created: 2025-06-13 06:38

/src/icu/icu4c/source/common/bytestrie.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*   Copyright (C) 2010-2011, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
*   file name:  bytestrie.cpp
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2010sep25
14
*   created by: Markus W. Scherer
15
*/
16
17
#include "unicode/utypes.h"
18
#include "unicode/bytestream.h"
19
#include "unicode/bytestrie.h"
20
#include "unicode/uobject.h"
21
#include "cmemory.h"
22
#include "uassert.h"
23
24
U_NAMESPACE_BEGIN
25
26
16.1M
BytesTrie::~BytesTrie() {
27
16.1M
    uprv_free(ownedArray_);
28
16.1M
}
29
30
// lead byte already shifted right by 1.
31
int32_t
32
411k
BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {
33
411k
    int32_t value;
34
411k
    if(leadByte<kMinTwoByteValueLead) {
35
117k
        value=leadByte-kMinOneByteValueLead;
36
294k
    } else if(leadByte<kMinThreeByteValueLead) {
37
200k
        value=((leadByte-kMinTwoByteValueLead)<<8)|*pos;
38
200k
    } else if(leadByte<kFourByteValueLead) {
39
83.9k
        value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
40
83.9k
    } else if(leadByte==kFourByteValueLead) {
41
3.63k
        value=(pos[0]<<16)|(pos[1]<<8)|pos[2];
42
6.09k
    } else {
43
6.09k
        value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
44
6.09k
    }
45
411k
    return value;
46
411k
}
47
48
const uint8_t *
49
57.9M
BytesTrie::jumpByDelta(const uint8_t *pos) {
50
57.9M
    int32_t delta=*pos++;
51
57.9M
    if(delta<kMinTwoByteDeltaLead) {
52
        // nothing to do
53
54.3M
    } else if(delta<kMinThreeByteDeltaLead) {
54
28.7M
        delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++;
55
28.7M
    } else if(delta<kFourByteDeltaLead) {
56
25.6M
        delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
57
25.6M
        pos+=2;
58
25.6M
    } else if(delta==kFourByteDeltaLead) {
59
0
        delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
60
0
        pos+=3;
61
0
    } else {
62
0
        delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
63
0
        pos+=4;
64
0
    }
65
57.9M
    return pos+delta;
66
57.9M
}
67
68
UStringTrieResult
69
0
BytesTrie::current() const {
70
0
    const uint8_t *pos=pos_;
71
0
    if(pos==nullptr) {
72
0
        return USTRINGTRIE_NO_MATCH;
73
0
    } else {
74
0
        int32_t node;
75
0
        return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
76
0
                valueResult(node) : USTRINGTRIE_NO_VALUE;
77
0
    }
78
0
}
79
80
UStringTrieResult
81
43.3M
BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
82
    // Branch according to the current byte.
83
43.3M
    if(length==0) {
84
29.5M
        length=*pos++;
85
29.5M
    }
86
43.3M
    ++length;
87
    // The length of the branch is the number of bytes to select from.
88
    // The data structure encodes a binary search.
89
153M
    while(length>kMaxBranchLinearSubNodeLength) {
90
109M
        if(inByte<*pos++) {
91
57.9M
            length>>=1;
92
57.9M
            pos=jumpByDelta(pos);
93
57.9M
        } else {
94
51.9M
            length=length-(length>>1);
95
51.9M
            pos=skipDelta(pos);
96
51.9M
        }
97
109M
    }
98
    // Drop down to linear search for the last few bytes.
99
    // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
100
    // and divides length by 2.
101
91.2M
    do {
102
91.2M
        if(inByte==*pos++) {
103
17.3M
            UStringTrieResult result;
104
17.3M
            int32_t node=*pos;
105
17.3M
            U_ASSERT(node>=kMinValueLead);
106
17.3M
            if(node&kValueIsFinal) {
107
                // Leave the final value for getValue() to read.
108
614k
                result=USTRINGTRIE_FINAL_VALUE;
109
16.6M
            } else {
110
                // Use the non-final value as the jump delta.
111
16.6M
                ++pos;
112
                // int32_t delta=readValue(pos, node>>1);
113
16.6M
                node>>=1;
114
16.6M
                int32_t delta;
115
16.6M
                if(node<kMinTwoByteValueLead) {
116
1.45M
                    delta=node-kMinOneByteValueLead;
117
15.2M
                } else if(node<kMinThreeByteValueLead) {
118
12.5M
                    delta=((node-kMinTwoByteValueLead)<<8)|*pos++;
119
12.5M
                } else if(node<kFourByteValueLead) {
120
2.71M
                    delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
121
2.71M
                    pos+=2;
122
2.71M
                } else if(node==kFourByteValueLead) {
123
0
                    delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
124
0
                    pos+=3;
125
0
                } else {
126
0
                    delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
127
0
                    pos+=4;
128
0
                }
129
                // end readValue()
130
16.6M
                pos+=delta;
131
16.6M
                node=*pos;
132
16.6M
                result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
133
16.6M
            }
134
17.3M
            pos_=pos;
135
17.3M
            return result;
136
17.3M
        }
137
73.9M
        --length;
138
73.9M
        pos=skipValue(pos);
139
73.9M
    } while(length>1);
140
26.0M
    if(inByte==*pos++) {
141
12.9M
        pos_=pos;
142
12.9M
        int32_t node=*pos;
143
12.9M
        return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
144
13.0M
    } else {
145
13.0M
        stop();
146
13.0M
        return USTRINGTRIE_NO_MATCH;
147
13.0M
    }
148
26.0M
}
149
150
UStringTrieResult
151
45.3M
BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
152
57.1M
    for(;;) {
153
57.1M
        int32_t node=*pos++;
154
57.1M
        if(node<kMinLinearMatch) {
155
43.3M
            return branchNext(pos, node, inByte);
156
43.3M
        } else if(node<kMinValueLead) {
157
            // Match the first of length+1 bytes.
158
2.06M
            int32_t length=node-kMinLinearMatch;  // Actual match length minus 1.
159
2.06M
            if(inByte==*pos++) {
160
127k
                remainingMatchLength_=--length;
161
127k
                pos_=pos;
162
127k
                return (length<0 && (node=*pos)>=kMinValueLead) ?
163
94.6k
                        valueResult(node) : USTRINGTRIE_NO_VALUE;
164
1.93M
            } else {
165
                // No match.
166
1.93M
                break;
167
1.93M
            }
168
11.7M
        } else if(node&kValueIsFinal) {
169
            // No further matching bytes.
170
0
            break;
171
11.7M
        } else {
172
            // Skip intermediate value.
173
11.7M
            pos=skipValue(pos, node);
174
            // The next node must not also be a value node.
175
11.7M
            U_ASSERT(*pos<kMinValueLead);
176
11.7M
        }
177
57.1M
    }
178
1.93M
    stop();
179
1.93M
    return USTRINGTRIE_NO_MATCH;
180
45.3M
}
181
182
UStringTrieResult
183
29.8M
BytesTrie::next(int32_t inByte) {
184
29.8M
    const uint8_t *pos=pos_;
185
29.8M
    if(pos==nullptr) {
186
0
        return USTRINGTRIE_NO_MATCH;
187
0
    }
188
29.8M
    if(inByte<0) {
189
4.61k
        inByte+=0x100;
190
4.61k
    }
191
29.8M
    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
192
29.8M
    if(length>=0) {
193
        // Remaining part of a linear-match node.
194
75.7k
        if(inByte==*pos++) {
195
53.9k
            remainingMatchLength_=--length;
196
53.9k
            pos_=pos;
197
53.9k
            int32_t node;
198
53.9k
            return (length<0 && (node=*pos)>=kMinValueLead) ?
199
27.0k
                    valueResult(node) : USTRINGTRIE_NO_VALUE;
200
53.9k
        } else {
201
21.8k
            stop();
202
21.8k
            return USTRINGTRIE_NO_MATCH;
203
21.8k
        }
204
75.7k
    }
205
29.8M
    return nextImpl(pos, inByte);
206
29.8M
}
207
208
UStringTrieResult
209
0
BytesTrie::next(const char *s, int32_t sLength) {
210
0
    if(sLength<0 ? *s==0 : sLength==0) {
211
        // Empty input.
212
0
        return current();
213
0
    }
214
0
    const uint8_t *pos=pos_;
215
0
    if(pos==nullptr) {
216
0
        return USTRINGTRIE_NO_MATCH;
217
0
    }
218
0
    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
219
0
    for(;;) {
220
        // Fetch the next input byte, if there is one.
221
        // Continue a linear-match node without rechecking sLength<0.
222
0
        int32_t inByte;
223
0
        if(sLength<0) {
224
0
            for(;;) {
225
0
                if((inByte=*s++)==0) {
226
0
                    remainingMatchLength_=length;
227
0
                    pos_=pos;
228
0
                    int32_t node;
229
0
                    return (length<0 && (node=*pos)>=kMinValueLead) ?
230
0
                            valueResult(node) : USTRINGTRIE_NO_VALUE;
231
0
                }
232
0
                if(length<0) {
233
0
                    remainingMatchLength_=length;
234
0
                    break;
235
0
                }
236
0
                if(inByte!=*pos) {
237
0
                    stop();
238
0
                    return USTRINGTRIE_NO_MATCH;
239
0
                }
240
0
                ++pos;
241
0
                --length;
242
0
            }
243
0
        } else {
244
0
            for(;;) {
245
0
                if(sLength==0) {
246
0
                    remainingMatchLength_=length;
247
0
                    pos_=pos;
248
0
                    int32_t node;
249
0
                    return (length<0 && (node=*pos)>=kMinValueLead) ?
250
0
                            valueResult(node) : USTRINGTRIE_NO_VALUE;
251
0
                }
252
0
                inByte=*s++;
253
0
                --sLength;
254
0
                if(length<0) {
255
0
                    remainingMatchLength_=length;
256
0
                    break;
257
0
                }
258
0
                if(inByte!=*pos) {
259
0
                    stop();
260
0
                    return USTRINGTRIE_NO_MATCH;
261
0
                }
262
0
                ++pos;
263
0
                --length;
264
0
            }
265
0
        }
266
0
        for(;;) {
267
0
            int32_t node=*pos++;
268
0
            if(node<kMinLinearMatch) {
269
0
                UStringTrieResult result=branchNext(pos, node, inByte);
270
0
                if(result==USTRINGTRIE_NO_MATCH) {
271
0
                    return USTRINGTRIE_NO_MATCH;
272
0
                }
273
                // Fetch the next input byte, if there is one.
274
0
                if(sLength<0) {
275
0
                    if((inByte=*s++)==0) {
276
0
                        return result;
277
0
                    }
278
0
                } else {
279
0
                    if(sLength==0) {
280
0
                        return result;
281
0
                    }
282
0
                    inByte=*s++;
283
0
                    --sLength;
284
0
                }
285
0
                if(result==USTRINGTRIE_FINAL_VALUE) {
286
                    // No further matching bytes.
287
0
                    stop();
288
0
                    return USTRINGTRIE_NO_MATCH;
289
0
                }
290
0
                pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
291
0
            } else if(node<kMinValueLead) {
292
                // Match length+1 bytes.
293
0
                length=node-kMinLinearMatch;  // Actual match length minus 1.
294
0
                if(inByte!=*pos) {
295
0
                    stop();
296
0
                    return USTRINGTRIE_NO_MATCH;
297
0
                }
298
0
                ++pos;
299
0
                --length;
300
0
                break;
301
0
            } else if(node&kValueIsFinal) {
302
                // No further matching bytes.
303
0
                stop();
304
0
                return USTRINGTRIE_NO_MATCH;
305
0
            } else {
306
                // Skip intermediate value.
307
0
                pos=skipValue(pos, node);
308
                // The next node must not also be a value node.
309
0
                U_ASSERT(*pos<kMinValueLead);
310
0
            }
311
0
        }
312
0
    }
313
0
}
314
315
const uint8_t *
316
BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
317
0
                                     UBool haveUniqueValue, int32_t &uniqueValue) {
318
0
    while(length>kMaxBranchLinearSubNodeLength) {
319
0
        ++pos;  // ignore the comparison byte
320
0
        if(nullptr==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
321
0
            return nullptr;
322
0
        }
323
0
        length=length-(length>>1);
324
0
        pos=skipDelta(pos);
325
0
    }
326
0
    do {
327
0
        ++pos;  // ignore a comparison byte
328
        // handle its value
329
0
        int32_t node=*pos++;
330
0
        UBool isFinal = static_cast<UBool>(node & kValueIsFinal);
331
0
        int32_t value=readValue(pos, node>>1);
332
0
        pos=skipValue(pos, node);
333
0
        if(isFinal) {
334
0
            if(haveUniqueValue) {
335
0
                if(value!=uniqueValue) {
336
0
                    return nullptr;
337
0
                }
338
0
            } else {
339
0
                uniqueValue=value;
340
0
                haveUniqueValue=true;
341
0
            }
342
0
        } else {
343
0
            if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
344
0
                return nullptr;
345
0
            }
346
0
            haveUniqueValue=true;
347
0
        }
348
0
    } while(--length>1);
349
0
    return pos+1;  // ignore the last comparison byte
350
0
}
351
352
UBool
353
0
BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
354
0
    for(;;) {
355
0
        int32_t node=*pos++;
356
0
        if(node<kMinLinearMatch) {
357
0
            if(node==0) {
358
0
                node=*pos++;
359
0
            }
360
0
            pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
361
0
            if(pos==nullptr) {
362
0
                return false;
363
0
            }
364
0
            haveUniqueValue=true;
365
0
        } else if(node<kMinValueLead) {
366
            // linear-match node
367
0
            pos+=node-kMinLinearMatch+1;  // Ignore the match bytes.
368
0
        } else {
369
0
            UBool isFinal = static_cast<UBool>(node & kValueIsFinal);
370
0
            int32_t value=readValue(pos, node>>1);
371
0
            if(haveUniqueValue) {
372
0
                if(value!=uniqueValue) {
373
0
                    return false;
374
0
                }
375
0
            } else {
376
0
                uniqueValue=value;
377
0
                haveUniqueValue=true;
378
0
            }
379
0
            if(isFinal) {
380
0
                return true;
381
0
            }
382
0
            pos=skipValue(pos, node);
383
0
        }
384
0
    }
385
0
}
386
387
int32_t
388
0
BytesTrie::getNextBytes(ByteSink &out) const {
389
0
    const uint8_t *pos=pos_;
390
0
    if(pos==nullptr) {
391
0
        return 0;
392
0
    }
393
0
    if(remainingMatchLength_>=0) {
394
0
        append(out, *pos);  // Next byte of a pending linear-match node.
395
0
        return 1;
396
0
    }
397
0
    int32_t node=*pos++;
398
0
    if(node>=kMinValueLead) {
399
0
        if(node&kValueIsFinal) {
400
0
            return 0;
401
0
        } else {
402
0
            pos=skipValue(pos, node);
403
0
            node=*pos++;
404
0
            U_ASSERT(node<kMinValueLead);
405
0
        }
406
0
    }
407
0
    if(node<kMinLinearMatch) {
408
0
        if(node==0) {
409
0
            node=*pos++;
410
0
        }
411
0
        getNextBranchBytes(pos, ++node, out);
412
0
        return node;
413
0
    } else {
414
        // First byte of the linear-match node.
415
0
        append(out, *pos);
416
0
        return 1;
417
0
    }
418
0
}
419
420
void
421
0
BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
422
0
    while(length>kMaxBranchLinearSubNodeLength) {
423
0
        ++pos;  // ignore the comparison byte
424
0
        getNextBranchBytes(jumpByDelta(pos), length>>1, out);
425
0
        length=length-(length>>1);
426
0
        pos=skipDelta(pos);
427
0
    }
428
0
    do {
429
0
        append(out, *pos++);
430
0
        pos=skipValue(pos);
431
0
    } while(--length>1);
432
0
    append(out, *pos);
433
0
}
434
435
void
436
0
BytesTrie::append(ByteSink &out, int c) {
437
0
    char ch = static_cast<char>(c);
438
0
    out.Append(&ch, 1);
439
0
}
440
441
U_NAMESPACE_END