Coverage Report

Created: 2025-11-07 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/bmpset.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2007-2012, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  bmpset.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2007jan29
16
*   created by: Markus W. Scherer
17
*/
18
19
#include "unicode/utypes.h"
20
#include "unicode/uniset.h"
21
#include "unicode/utf8.h"
22
#include "unicode/utf16.h"
23
#include "cmemory.h"
24
#include "bmpset.h"
25
#include "uassert.h"
26
27
U_NAMESPACE_BEGIN
28
29
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
30
401k
        list(parentList), listLength(parentListLength) {
31
401k
    uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
32
401k
    uprv_memset(table7FF, 0, sizeof(table7FF));
33
401k
    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
34
35
    /*
36
     * Set the list indexes for binary searches for
37
     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
38
     * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
39
     * looked up in the bit tables.
40
     * The last pair of indexes is for finding supplementary code points.
41
     */
42
401k
    list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
43
401k
    int32_t i;
44
6.83M
    for(i=1; i<=0x10; ++i) {
45
6.43M
        list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
46
6.43M
    }
47
401k
    list4kStarts[0x11]=listLength-1;
48
401k
    containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
49
50
401k
    initBits();
51
401k
    overrideIllegal();
52
401k
}
53
54
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
55
0
        containsFFFD(otherBMPSet.containsFFFD),
56
0
        list(newParentList), listLength(newParentListLength) {
57
0
    uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
58
0
    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
59
0
    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
60
0
    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
61
0
}
62
63
401k
BMPSet::~BMPSet() {
64
401k
}
65
66
/*
67
 * Set bits in a bit rectangle in "vertical" bit organization.
68
 * start<limit<=0x800
69
 */
70
978k
static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
71
978k
    U_ASSERT(start<limit);
72
978k
    U_ASSERT(limit<=0x800);
73
74
978k
    int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
75
978k
    int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
76
77
    // Set one bit indicating an all-one block.
78
978k
    uint32_t bits = static_cast<uint32_t>(1) << lead;
79
978k
    if((start+1)==limit) {  // Single-character shortcut.
80
407k
        table[trail]|=bits;
81
407k
        return;
82
407k
    }
83
84
571k
    int32_t limitLead=limit>>6;
85
571k
    int32_t limitTrail=limit&0x3f;
86
87
571k
    if(lead==limitLead) {
88
        // Partial vertical bit column.
89
3.76M
        while(trail<limitTrail) {
90
3.36M
            table[trail++]|=bits;
91
3.36M
        }
92
398k
    } else {
93
        // Partial vertical bit column,
94
        // followed by a bit rectangle,
95
        // followed by another partial vertical bit column.
96
172k
        if(trail>0) {
97
3.85M
            do {
98
3.85M
                table[trail++]|=bits;
99
3.85M
            } while(trail<64);
100
149k
            ++lead;
101
149k
        }
102
172k
        if(lead<limitLead) {
103
93.9k
            bits = ~((static_cast<unsigned>(1) << lead) - 1);
104
93.9k
            if(limitLead<0x20) {
105
80.2k
                bits &= (static_cast<unsigned>(1) << limitLead) - 1;
106
80.2k
            }
107
6.10M
            for(trail=0; trail<64; ++trail) {
108
6.01M
                table[trail]|=bits;
109
6.01M
            }
110
93.9k
        }
111
        // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
112
        // In that case, bits=1<<limitLead is undefined but the bits value
113
        // is not used because trail<limitTrail is already false.
114
172k
        bits = static_cast<uint32_t>(1) << ((limitLead == 0x20) ? (limitLead - 1) : limitLead);
115
2.72M
        for(trail=0; trail<limitTrail; ++trail) {
116
2.55M
            table[trail]|=bits;
117
2.55M
        }
118
172k
    }
119
571k
}
120
121
401k
void BMPSet::initBits() {
122
401k
    UChar32 start, limit;
123
401k
    int32_t listIndex=0;
124
125
    // Set latin1Contains[].
126
1.22M
    do {
127
1.22M
        start=list[listIndex++];
128
1.22M
        if(listIndex<listLength) {
129
933k
            limit=list[listIndex++];
130
933k
        } else {
131
294k
            limit=0x110000;
132
294k
        }
133
1.22M
        if(start>=0x100) {
134
384k
            break;
135
384k
        }
136
5.96M
        do {
137
5.96M
            latin1Contains[start++]=1;
138
5.96M
        } while(start<limit && start<0x100);
139
843k
    } while(limit<=0x100);
140
141
    // Find the first range overlapping with (or after) 80..FF again,
142
    // to include them in table7FF as well.
143
1.13M
    for(listIndex=0;;) {
144
1.13M
        start=list[listIndex++];
145
1.13M
        if(listIndex<listLength) {
146
840k
            limit=list[listIndex++];
147
840k
        } else {
148
294k
            limit=0x110000;
149
294k
        }
150
1.13M
        if(limit>0x80) {
151
401k
            if(start<0x80) {
152
21.3k
                start=0x80;
153
21.3k
            }
154
401k
            break;
155
401k
        }
156
1.13M
    }
157
158
    // Set table7FF[].
159
1.13M
    while(start<0x800) {
160
749k
        set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
161
749k
        if(limit>0x800) {
162
17.7k
            start=0x800;
163
17.7k
            break;
164
17.7k
        }
165
166
732k
        start=list[listIndex++];
167
732k
        if(listIndex<listLength) {
168
712k
            limit=list[listIndex++];
169
712k
        } else {
170
20.1k
            limit=0x110000;
171
20.1k
        }
172
732k
    }
173
174
    // Set bmpBlockBits[].
175
401k
    int32_t minStart=0x800;
176
6.16M
    while(start<0x10000) {
177
5.78M
        if(limit>0x10000) {
178
16.5k
            limit=0x10000;
179
16.5k
        }
180
181
5.78M
        if(start<minStart) {
182
3.29M
            start=minStart;
183
3.29M
        }
184
5.78M
        if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
185
2.90M
            if(start&0x3f) {
186
                // Mixed-value block of 64 code points.
187
2.05M
                start>>=6;
188
2.05M
                bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
189
2.05M
                start=(start+1)<<6;  // Round up to the next block boundary.
190
2.05M
                minStart=start;      // Ignore further ranges in this block.
191
2.05M
            }
192
2.90M
            if(start<limit) {
193
919k
                if(start<(limit&~0x3f)) {
194
                    // Multiple all-ones blocks of 64 code points each.
195
228k
                    set32x64Bits(bmpBlockBits, start>>6, limit>>6);
196
228k
                }
197
198
919k
                if(limit&0x3f) {
199
                    // Mixed-value block of 64 code points.
200
863k
                    limit>>=6;
201
863k
                    bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
202
863k
                    limit=(limit+1)<<6;  // Round up to the next block boundary.
203
863k
                    minStart=limit;      // Ignore further ranges in this block.
204
863k
                }
205
919k
            }
206
2.90M
        }
207
208
5.78M
        if(limit==0x10000) {
209
24.7k
            break;
210
24.7k
        }
211
212
5.76M
        start=list[listIndex++];
213
5.76M
        if(listIndex<listLength) {
214
5.73M
            limit=list[listIndex++];
215
5.73M
        } else {
216
30.7k
            limit=0x110000;
217
30.7k
        }
218
5.76M
    }
219
401k
}
220
221
/*
222
 * Override some bits and bytes to the result of contains(FFFD)
223
 * for faster validity checking at runtime.
224
 * No need to set 0 values where they were reset to 0 in the constructor
225
 * and not modified by initBits().
226
 * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
227
 * Need to set 0 values for surrogates D800..DFFF.
228
 */
229
401k
void BMPSet::overrideIllegal() {
230
401k
    uint32_t bits, mask;
231
401k
    int32_t i;
232
233
401k
    if(containsFFFD) {
234
19.7k
        bits=3;                 // Lead bytes 0xC0 and 0xC1.
235
1.28M
        for(i=0; i<64; ++i) {
236
1.26M
            table7FF[i]|=bits;
237
1.26M
        }
238
239
19.7k
        bits=1;                 // Lead byte 0xE0.
240
651k
        for(i=0; i<32; ++i) {   // First half of 4k block.
241
631k
            bmpBlockBits[i]|=bits;
242
631k
        }
243
244
19.7k
        mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.
245
19.7k
        bits=1<<0xd;
246
651k
        for(i=32; i<64; ++i) {  // Second half of 4k block.
247
631k
            bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
248
631k
        }
249
382k
    } else {
250
382k
        mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.
251
12.6M
        for(i=32; i<64; ++i) {  // Second half of 4k block.
252
12.2M
            bmpBlockBits[i]&=mask;
253
12.2M
        }
254
382k
    }
255
401k
}
256
257
62.8M
int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
258
    /* Examples:
259
                                       findCodePoint(c)
260
       set              list[]         c=0 1 3 4 7 8
261
       ===              ==============   ===========
262
       []               [110000]         0 0 0 0 0 0
263
       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
264
       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
265
       [:Any:]          [0, 110000]      1 1 1 1 1 1
266
     */
267
268
    // Return the smallest i such that c < list[i].  Assume
269
    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
270
62.8M
    if (c < list[lo])
271
13.8M
        return lo;
272
    // High runner test.  c is often after the last range, so an
273
    // initial check for this condition pays off.
274
49.0M
    if (lo >= hi || c >= list[hi-1])
275
1.39M
        return hi;
276
    // invariant: c >= list[lo]
277
    // invariant: c < list[hi]
278
357M
    for (;;) {
279
357M
        int32_t i = (lo + hi) >> 1;
280
357M
        if (i == lo) {
281
47.6M
            break; // Found!
282
309M
        } else if (c < list[i]) {
283
214M
            hi = i;
284
214M
        } else {
285
95.4M
            lo = i;
286
95.4M
        }
287
357M
    }
288
47.6M
    return hi;
289
49.0M
}
290
291
UBool
292
410M
BMPSet::contains(UChar32 c) const {
293
410M
    if (static_cast<uint32_t>(c) <= 0xff) {
294
137M
        return latin1Contains[c];
295
273M
    } else if (static_cast<uint32_t>(c) <= 0x7ff) {
296
8.59M
        return (table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0;
297
264M
    } else if (static_cast<uint32_t>(c) < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
298
245M
        int lead=c>>12;
299
245M
        uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
300
245M
        if(twoBits<=1) {
301
            // All 64 code points with the same bits 15..6
302
            // are either in the set or not.
303
209M
            return twoBits;
304
209M
        } else {
305
            // Look up the code point in its 4k block of code points.
306
35.8M
            return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
307
35.8M
        }
308
245M
    } else if (static_cast<uint32_t>(c) <= 0x10ffff) {
309
        // surrogate or supplementary code point
310
19.7M
        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
311
19.7M
    } else {
312
        // Out-of-range code points get false, consistent with long-standing
313
        // behavior of UnicodeSet::contains(c).
314
11.7k
        return false;
315
11.7k
    }
316
410M
}
317
318
/*
319
 * Check for sufficient length for trail unit for each surrogate pair.
320
 * Handle single surrogates as surrogate code points as usual in ICU.
321
 */
322
const char16_t *
323
5.52k
BMPSet::span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {
324
5.52k
    char16_t c, c2;
325
326
5.52k
    if(spanCondition) {
327
        // span
328
9.03k
        do {
329
9.03k
            c=*s;
330
9.03k
            if(c<=0xff) {
331
30
                if(!latin1Contains[c]) {
332
30
                    break;
333
30
                }
334
9.00k
            } else if(c<=0x7ff) {
335
0
                if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) == 0) {
336
0
                    break;
337
0
                }
338
9.00k
            } else if(c<0xd800 || c>=0xe000) {
339
1.86k
                int lead=c>>12;
340
1.86k
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
341
1.86k
                if(twoBits<=1) {
342
                    // All 64 code points with the same bits 15..6
343
                    // are either in the set or not.
344
1.64k
                    if(twoBits==0) {
345
1.64k
                        break;
346
1.64k
                    }
347
1.64k
                } else {
348
                    // Look up the code point in its 4k block of code points.
349
217
                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
350
207
                        break;
351
207
                    }
352
217
                }
353
7.13k
            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
354
                // surrogate code point
355
0
                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
356
0
                    break;
357
0
                }
358
7.13k
            } else {
359
                // surrogate pair
360
7.13k
                if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
361
3.01k
                    break;
362
3.01k
                }
363
4.11k
                ++s;
364
4.11k
            }
365
9.03k
        } while(++s<limit);
366
5.52k
    } else {
367
        // span not
368
0
        do {
369
0
            c=*s;
370
0
            if(c<=0xff) {
371
0
                if(latin1Contains[c]) {
372
0
                    break;
373
0
                }
374
0
            } else if(c<=0x7ff) {
375
0
                if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) {
376
0
                    break;
377
0
                }
378
0
            } else if(c<0xd800 || c>=0xe000) {
379
0
                int lead=c>>12;
380
0
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
381
0
                if(twoBits<=1) {
382
                    // All 64 code points with the same bits 15..6
383
                    // are either in the set or not.
384
0
                    if(twoBits!=0) {
385
0
                        break;
386
0
                    }
387
0
                } else {
388
                    // Look up the code point in its 4k block of code points.
389
0
                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
390
0
                        break;
391
0
                    }
392
0
                }
393
0
            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
394
                // surrogate code point
395
0
                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
396
0
                    break;
397
0
                }
398
0
            } else {
399
                // surrogate pair
400
0
                if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
401
0
                    break;
402
0
                }
403
0
                ++s;
404
0
            }
405
0
        } while(++s<limit);
406
0
    }
407
5.52k
    return s;
408
5.52k
}
409
410
/* Symmetrical with span(). */
411
const char16_t *
412
4.90k
BMPSet::spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {
413
4.90k
    char16_t c, c2;
414
415
4.90k
    if(spanCondition) {
416
        // span
417
6.21k
        for(;;) {
418
6.21k
            c=*(--limit);
419
6.21k
            if(c<=0xff) {
420
2
                if(!latin1Contains[c]) {
421
2
                    break;
422
2
                }
423
6.21k
            } else if(c<=0x7ff) {
424
0
                if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) == 0) {
425
0
                    break;
426
0
                }
427
6.21k
            } else if(c<0xd800 || c>=0xe000) {
428
2.83k
                int lead=c>>12;
429
2.83k
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
430
2.83k
                if(twoBits<=1) {
431
                    // All 64 code points with the same bits 15..6
432
                    // are either in the set or not.
433
2.81k
                    if(twoBits==0) {
434
2.81k
                        break;
435
2.81k
                    }
436
2.81k
                } else {
437
                    // Look up the code point in its 4k block of code points.
438
22
                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
439
20
                        break;
440
20
                    }
441
22
                }
442
3.37k
            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
443
                // surrogate code point
444
0
                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
445
0
                    break;
446
0
                }
447
3.37k
            } else {
448
                // surrogate pair
449
3.37k
                if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
450
2.06k
                    break;
451
2.06k
                }
452
1.31k
                --limit;
453
1.31k
            }
454
1.31k
            if(s==limit) {
455
0
                return s;
456
0
            }
457
1.31k
        }
458
4.90k
    } else {
459
        // span not
460
0
        for(;;) {
461
0
            c=*(--limit);
462
0
            if(c<=0xff) {
463
0
                if(latin1Contains[c]) {
464
0
                    break;
465
0
                }
466
0
            } else if(c<=0x7ff) {
467
0
                if ((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) {
468
0
                    break;
469
0
                }
470
0
            } else if(c<0xd800 || c>=0xe000) {
471
0
                int lead=c>>12;
472
0
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
473
0
                if(twoBits<=1) {
474
                    // All 64 code points with the same bits 15..6
475
                    // are either in the set or not.
476
0
                    if(twoBits!=0) {
477
0
                        break;
478
0
                    }
479
0
                } else {
480
                    // Look up the code point in its 4k block of code points.
481
0
                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
482
0
                        break;
483
0
                    }
484
0
                }
485
0
            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
486
                // surrogate code point
487
0
                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
488
0
                    break;
489
0
                }
490
0
            } else {
491
                // surrogate pair
492
0
                if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
493
0
                    break;
494
0
                }
495
0
                --limit;
496
0
            }
497
0
            if(s==limit) {
498
0
                return s;
499
0
            }
500
0
        }
501
0
    }
502
4.90k
    return limit+1;
503
4.90k
}
504
505
/*
506
 * Precheck for sufficient trail bytes at end of string only once per span.
507
 * Check validity.
508
 */
509
const uint8_t *
510
4.90k
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
511
4.90k
    const uint8_t *limit=s+length;
512
4.90k
    uint8_t b=*s;
513
4.90k
    if(U8_IS_SINGLE(b)) {
514
        // Initial all-ASCII span.
515
26
        if(spanCondition) {
516
26
            do {
517
26
                if(!latin1Contains[b] || ++s==limit) {
518
26
                    return s;
519
26
                }
520
0
                b=*s;
521
0
            } while(U8_IS_SINGLE(b));
522
26
        } else {
523
0
            do {
524
0
                if(latin1Contains[b] || ++s==limit) {
525
0
                    return s;
526
0
                }
527
0
                b=*s;
528
0
            } while(U8_IS_SINGLE(b));
529
0
        }
530
0
        length = static_cast<int32_t>(limit - s);
531
0
    }
532
533
4.87k
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
534
4.87k
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
535
4.87k
    }
536
537
4.87k
    const uint8_t *limit0=limit;
538
539
    /*
540
     * Make sure that the last 1/2/3/4-byte sequence before limit is complete
541
     * or runs into a lead byte.
542
     * In the span loop compare s with limit only once
543
     * per multi-byte character.
544
     *
545
     * Give a trailing illegal sequence the same value as the result of contains(FFFD),
546
     * including it if that is part of the span, otherwise set limit0 to before
547
     * the truncated sequence.
548
     */
549
4.87k
    b=*(limit-1);
550
4.87k
    if (static_cast<int8_t>(b) < 0) {
551
        // b>=0x80: lead or trail byte
552
4.87k
        if(b<0xc0) {
553
            // single trail byte, check for preceding 3- or 4-byte lead byte
554
4.87k
            if(length>=2 && (b=*(limit-2))>=0xe0) {
555
0
                limit-=2;
556
0
                if(containsFFFD!=spanCondition) {
557
0
                    limit0=limit;
558
0
                }
559
4.87k
            } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
560
                // 4-byte lead byte with only two trail bytes
561
0
                limit-=3;
562
0
                if(containsFFFD!=spanCondition) {
563
0
                    limit0=limit;
564
0
                }
565
0
            }
566
4.87k
        } else {
567
            // lead byte with no trail bytes
568
0
            --limit;
569
0
            if(containsFFFD!=spanCondition) {
570
0
                limit0=limit;
571
0
            }
572
0
        }
573
4.87k
    }
574
575
4.87k
    uint8_t t1, t2, t3;
576
577
7.76k
    while(s<limit) {
578
7.76k
        b=*s;
579
7.76k
        if(U8_IS_SINGLE(b)) {
580
            // ASCII
581
0
            if(spanCondition) {
582
0
                do {
583
0
                    if(!latin1Contains[b]) {
584
0
                        return s;
585
0
                    } else if(++s==limit) {
586
0
                        return limit0;
587
0
                    }
588
0
                    b=*s;
589
0
                } while(U8_IS_SINGLE(b));
590
0
            } else {
591
0
                do {
592
0
                    if(latin1Contains[b]) {
593
0
                        return s;
594
0
                    } else if(++s==limit) {
595
0
                        return limit0;
596
0
                    }
597
0
                    b=*s;
598
0
                } while(U8_IS_SINGLE(b));
599
0
            }
600
0
        }
601
7.76k
        ++s;  // Advance past the lead byte.
602
7.76k
        if(b>=0xe0) {
603
7.76k
            if(b<0xf0) {
604
1.85k
                if( /* handle U+0000..U+FFFF inline */
605
1.85k
                    (t1 = static_cast<uint8_t>(s[0] - 0x80)) <= 0x3f &&
606
1.85k
                    (t2 = static_cast<uint8_t>(s[1] - 0x80)) <= 0x3f
607
1.85k
                ) {
608
1.85k
                    b&=0xf;
609
1.85k
                    uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
610
1.85k
                    if(twoBits<=1) {
611
                        // All 64 code points with this lead byte and middle trail byte
612
                        // are either in the set or not.
613
1.64k
                        if (twoBits != static_cast<uint32_t>(spanCondition)) {
614
1.64k
                            return s-1;
615
1.64k
                        }
616
1.64k
                    } else {
617
                        // Look up the code point in its 4k block of code points.
618
207
                        UChar32 c=(b<<12)|(t1<<6)|t2;
619
207
                        if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
620
207
                            return s-1;
621
207
                        }
622
207
                    }
623
0
                    s+=2;
624
0
                    continue;
625
1.85k
                }
626
5.90k
            } else if( /* handle U+10000..U+10FFFF inline */
627
5.90k
                (t1 = static_cast<uint8_t>(s[0] - 0x80)) <= 0x3f &&
628
5.90k
                (t2 = static_cast<uint8_t>(s[1] - 0x80)) <= 0x3f &&
629
5.90k
                (t3 = static_cast<uint8_t>(s[2] - 0x80)) <= 0x3f
630
5.90k
            ) {
631
                // Give an illegal sequence the same value as the result of contains(FFFD).
632
5.90k
                UChar32 c = (static_cast<UChar32>(b - 0xf0) << 18) | (static_cast<UChar32>(t1) << 12) | (t2 << 6) | t3;
633
5.90k
                if( (   (0x10000<=c && c<=0x10ffff) ?
634
5.90k
                            containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
635
5.90k
                            containsFFFD
636
5.90k
                    ) != spanCondition
637
5.90k
                ) {
638
3.01k
                    return s-1;
639
3.01k
                }
640
2.88k
                s+=3;
641
2.88k
                continue;
642
5.90k
            }
643
7.76k
        } else {
644
4
            if( /* handle U+0000..U+07FF inline */
645
4
                b>=0xc0 &&
646
4
                (t1 = static_cast<uint8_t>(*s - 0x80)) <= 0x3f
647
4
            ) {
648
4
                if (static_cast<USetSpanCondition>((table7FF[t1] & (static_cast<uint32_t>(1) << (b & 0x1f))) != 0) != spanCondition) {
649
4
                    return s-1;
650
4
                }
651
0
                ++s;
652
0
                continue;
653
4
            }
654
4
        }
655
656
        // Give an illegal sequence the same value as the result of contains(FFFD).
657
        // Handle each byte of an illegal sequence separately to simplify the code;
658
        // no need to optimize error handling.
659
0
        if(containsFFFD!=spanCondition) {
660
0
            return s-1;
661
0
        }
662
0
    }
663
664
0
    return limit0;
665
4.87k
}
666
667
/*
668
 * While going backwards through UTF-8 optimize only for ASCII.
669
 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
670
 * possible to tell from the last byte in a multi-byte sequence how many
671
 * preceding bytes there should be. Therefore, going backwards through UTF-8
672
 * is much harder than going forward.
673
 */
674
int32_t
675
4.90k
BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
676
4.90k
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
677
4.90k
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
678
4.90k
    }
679
680
4.90k
    uint8_t b;
681
682
6.21k
    do {
683
6.21k
        b=s[--length];
684
6.21k
        if(U8_IS_SINGLE(b)) {
685
            // ASCII sub-span
686
2
            if(spanCondition) {
687
2
                do {
688
2
                    if(!latin1Contains[b]) {
689
2
                        return length+1;
690
2
                    } else if(length==0) {
691
0
                        return 0;
692
0
                    }
693
0
                    b=s[--length];
694
0
                } while(U8_IS_SINGLE(b));
695
2
            } else {
696
0
                do {
697
0
                    if(latin1Contains[b]) {
698
0
                        return length+1;
699
0
                    } else if(length==0) {
700
0
                        return 0;
701
0
                    }
702
0
                    b=s[--length];
703
0
                } while(U8_IS_SINGLE(b));
704
0
            }
705
2
        }
706
707
6.21k
        int32_t prev=length;
708
6.21k
        UChar32 c;
709
        // trail byte: collect a multi-byte character
710
        // (or  lead byte in last-trail position)
711
6.21k
        c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
712
        // c is a valid code point, not ASCII, not a surrogate
713
6.21k
        if(c<=0x7ff) {
714
0
            if (static_cast<USetSpanCondition>((table7FF[c & 0x3f] & (static_cast<uint32_t>(1) << (c >> 6))) != 0) != spanCondition) {
715
0
                return prev+1;
716
0
            }
717
6.21k
        } else if(c<=0xffff) {
718
2.83k
            int lead=c>>12;
719
2.83k
            uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
720
2.83k
            if(twoBits<=1) {
721
                // All 64 code points with the same bits 15..6
722
                // are either in the set or not.
723
2.81k
                if (twoBits != static_cast<uint32_t>(spanCondition)) {
724
2.81k
                    return prev+1;
725
2.81k
                }
726
2.81k
            } else {
727
                // Look up the code point in its 4k block of code points.
728
22
                if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
729
20
                    return prev+1;
730
20
                }
731
22
            }
732
3.37k
        } else {
733
3.37k
            if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
734
2.06k
                return prev+1;
735
2.06k
            }
736
3.37k
        }
737
6.21k
    } while(length>0);
738
0
    return 0;
739
4.90k
}
740
741
U_NAMESPACE_END