Coverage Report

Created: 2025-07-11 06:23

/src/icu/source/common/bmpset.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2007-2012, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  bmpset.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2007jan29
16
*   created by: Markus W. Scherer
17
*/
18
19
#include "unicode/utypes.h"
20
#include "unicode/uniset.h"
21
#include "unicode/utf8.h"
22
#include "unicode/utf16.h"
23
#include "cmemory.h"
24
#include "bmpset.h"
25
#include "uassert.h"
26
27
U_NAMESPACE_BEGIN
28
29
/*
 * Build all lookup tables for the inversion list parentList[0..parentListLength).
 * Only the pointer and the length are stored; the list itself is not copied.
 */
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
        list(parentList), listLength(parentListLength) {
    // Start from all-zero tables; initBits() only ever sets bits.
    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
    uprv_memset(table7FF, 0, sizeof(table7FF));
    uprv_memset(asciiBytes, 0, sizeof(asciiBytes));

    /*
     * Record, for binary searches, the list index of each of
     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
     * Code points below U+0800 (the first 3-byte-UTF-8 code point) are
     * looked up in the bit tables instead.
     * The final pair of indexes brackets the supplementary code points.
     */
    const int32_t lastIndex=listLength-1;
    list4kStarts[0]=findCodePoint(0x800, 0, lastIndex);
    for(int32_t block=1; block<=0x10; ++block) {
        // Each search starts where the previous 4k block's search ended.
        list4kStarts[block]=findCodePoint(block<<12, list4kStarts[block-1], lastIndex);
    }
    list4kStarts[0x11]=lastIndex;

    initBits();
    overrideIllegal();
}
52
53
/*
 * Construct from an existing BMPSet: all precomputed tables are copied
 * verbatim from otherBMPSet, while list/listLength are taken from the
 * arguments.
 */
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
        list(newParentList), listLength(newParentListLength) {
    // Binary-search start indexes.
    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
    // Bit/byte lookup tables.
    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
    uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
}
60
61
0
// Nothing to release here: the destructor body is intentionally empty.
BMPSet::~BMPSet() {}
63
64
/*
 * Set the bits for the index range [start, limit) in a 32x64 bit rectangle
 * kept in "vertical" bit organization: index i is stored as
 * bit (i>>6) of table[i&0x3f].
 * The names "lead" and "trail" follow the UTF-8 2-byte form, where the lead
 * byte carries the upper 5 bits and the trail byte the lower 6 bits.
 *
 * Requires 0<=start<limit<=0x800.
 *
 * All mask arithmetic is done in uint32_t: with lead==31 the signed
 * expression (1<<lead)-1 would overflow int, which is undefined behavior.
 */
static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
    U_ASSERT(0<=start);
    U_ASSERT(start<limit);
    U_ASSERT(limit<=0x800);

    int32_t lead=start>>6;     // Bit number inside each table word.
    int32_t trail=start&0x3f;  // Index of the table word.

    // Single bit for the column that "start" falls into.
    uint32_t bits=(uint32_t)1<<lead;
    if((start+1)==limit) {  // Single-character shortcut.
        table[trail]|=bits;
        return;
    }

    int32_t limitLead=limit>>6;
    int32_t limitTrail=limit&0x3f;

    if(lead==limitLead) {
        // The whole range lies in one partial vertical bit column.
        while(trail<limitTrail) {
            table[trail++]|=bits;
        }
    } else {
        // Partial vertical bit column,
        // followed by a bit rectangle,
        // followed by another partial vertical bit column.
        if(trail>0) {
            do {
                table[trail++]|=bits;
            } while(trail<64);
            ++lead;
        }
        if(lead<limitLead) {
            // All columns lead..limitLead-1 are completely set.
            bits=~(((uint32_t)1<<lead)-1);
            if(limitLead<0x20) {
                bits&=((uint32_t)1<<limitLead)-1;
            }
            for(trail=0; trail<64; ++trail) {
                table[trail]|=bits;
            }
        }
        // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
        // A shift by 32 would be undefined, so the shift count is clamped
        // to 31; the resulting bits value is not used in that case because
        // trail<limitTrail is false right away.
        bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
        for(trail=0; trail<limitTrail; ++trail) {
            table[trail]|=bits;
        }
    }
}
118
119
0
void BMPSet::initBits() {
120
0
    UChar32 start, limit;
121
0
    int32_t listIndex=0;
122
123
    // Set asciiBytes[].
124
0
    do {
125
0
        start=list[listIndex++];
126
0
        if(listIndex<listLength) {
127
0
            limit=list[listIndex++];
128
0
        } else {
129
0
            limit=0x110000;
130
0
        }
131
0
        if(start>=0x80) {
132
0
            break;
133
0
        }
134
0
        do {
135
0
            asciiBytes[start++]=1;
136
0
        } while(start<limit && start<0x80);
137
0
    } while(limit<=0x80);
138
139
    // Set table7FF[].
140
0
    while(start<0x800) {
141
0
        set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
142
0
        if(limit>0x800) {
143
0
            start=0x800;
144
0
            break;
145
0
        }
146
147
0
        start=list[listIndex++];
148
0
        if(listIndex<listLength) {
149
0
            limit=list[listIndex++];
150
0
        } else {
151
0
            limit=0x110000;
152
0
        }
153
0
    }
154
155
    // Set bmpBlockBits[].
156
0
    int32_t minStart=0x800;
157
0
    while(start<0x10000) {
158
0
        if(limit>0x10000) {
159
0
            limit=0x10000;
160
0
        }
161
162
0
        if(start<minStart) {
163
0
            start=minStart;
164
0
        }
165
0
        if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
166
0
            if(start&0x3f) {
167
                // Mixed-value block of 64 code points.
168
0
                start>>=6;
169
0
                bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
170
0
                start=(start+1)<<6;  // Round up to the next block boundary.
171
0
                minStart=start;      // Ignore further ranges in this block.
172
0
            }
173
0
            if(start<limit) {
174
0
                if(start<(limit&~0x3f)) {
175
                    // Multiple all-ones blocks of 64 code points each.
176
0
                    set32x64Bits(bmpBlockBits, start>>6, limit>>6);
177
0
                }
178
179
0
                if(limit&0x3f) {
180
                    // Mixed-value block of 64 code points.
181
0
                    limit>>=6;
182
0
                    bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
183
0
                    limit=(limit+1)<<6;  // Round up to the next block boundary.
184
0
                    minStart=limit;      // Ignore further ranges in this block.
185
0
                }
186
0
            }
187
0
        }
188
189
0
        if(limit==0x10000) {
190
0
            break;
191
0
        }
192
193
0
        start=list[listIndex++];
194
0
        if(listIndex<listLength) {
195
0
            limit=list[listIndex++];
196
0
        } else {
197
0
            limit=0x110000;
198
0
        }
199
0
    }
200
0
}
201
202
/*
203
 * Override some bits and bytes to the result of contains(FFFD)
204
 * for faster validity checking at runtime.
205
 * No need to set 0 values where they were reset to 0 in the constructor
206
 * and not modified by initBits().
207
 * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
208
 * Need to set 0 values for surrogates D800..DFFF.
209
 */
210
0
void BMPSet::overrideIllegal() {
211
0
    uint32_t bits, mask;
212
0
    int32_t i;
213
214
0
    if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
215
        // contains(FFFD)==TRUE
216
0
        for(i=0x80; i<0xc0; ++i) {
217
0
            asciiBytes[i]=1;
218
0
        }
219
220
0
        bits=3;                 // Lead bytes 0xC0 and 0xC1.
221
0
        for(i=0; i<64; ++i) {
222
0
            table7FF[i]|=bits;
223
0
        }
224
225
0
        bits=1;                 // Lead byte 0xE0.
226
0
        for(i=0; i<32; ++i) {   // First half of 4k block.
227
0
            bmpBlockBits[i]|=bits;
228
0
        }
229
230
0
        mask=~(0x10001<<0xd);   // Lead byte 0xED.
231
0
        bits=1<<0xd;
232
0
        for(i=32; i<64; ++i) {  // Second half of 4k block.
233
0
            bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
234
0
        }
235
0
    } else {
236
        // contains(FFFD)==FALSE
237
0
        mask=~(0x10001<<0xd);   // Lead byte 0xED.
238
0
        for(i=32; i<64; ++i) {  // Second half of 4k block.
239
0
            bmpBlockBits[i]&=mask;
240
0
        }
241
0
    }
242
0
}
243
244
0
/*
 * Binary search in list[lo..hi]: return the smallest index i such that
 * c < list[i]. Assumes list[len - 1] == HIGH and that c is legal (0..HIGH-1).
 *
 * Examples:
 *                                     findCodePoint(c)
 *     set              list[]         c=0 1 3 4 7 8
 *     ===              ==============   ===========
 *     []               [110000]         0 0 0 0 0 0
 *     [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
 *     [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
 *     [:Any:]          [0, 110000]      1 1 1 1 1 1
 */
int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
    if (c < list[lo]) {
        return lo;
    }
    // High runner test.  c is often after the last range, so checking
    // for that up front pays off.
    if (lo >= hi || c >= list[hi-1]) {
        return hi;
    }
    // Loop invariants: list[lo] <= c < list[hi].
    // Stop as soon as lo and hi are adjacent (the midpoint collapses onto lo).
    int32_t mid = (lo + hi) >> 1;
    while (mid != lo) {
        if (c < list[mid]) {
            hi = mid;
        } else {
            lo = mid;
        }
        mid = (lo + hi) >> 1;
    }
    return hi;
}
277
278
/*
 * Test whether code point c is in the set, using the fastest table that
 * covers c and falling back to a list search where needed.
 */
UBool
BMPSet::contains(UChar32 c) const {
    const uint32_t uc=(uint32_t)c;

    if(uc<=0x7f) {
        // One byte per ASCII code point.
        return (UBool)asciiBytes[c];
    }
    if(uc<=0x7ff) {
        // One bit per code point in the 32x64 rectangle.
        return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
    }
    if(uc<0xd800 || (c>=0xe000 && c<=0xffff)) {
        // BMP, not a surrogate: two bits per block of 64 code points.
        int lead=c>>12;
        uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
        if(twoBits<=1) {
            // All 64 code points with the same bits 15..6
            // are either in the set or not.
            return (UBool)twoBits;
        }
        // Mixed block: look up the code point in its 4k block of code points.
        return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
    }
    if(uc<=0x10ffff) {
        // surrogate or supplementary code point
        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
    }
    // Out-of-range code points get FALSE, consistent with long-standing
    // behavior of UnicodeSet::contains(c).
    return FALSE;
}
304
305
/*
306
 * Check for sufficient length for trail unit for each surrogate pair.
307
 * Handle single surrogates as surrogate code points as usual in ICU.
308
 */
309
const UChar *
310
0
BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
311
0
    UChar c, c2;
312
313
0
    if(spanCondition) {
314
        // span
315
0
        do {
316
0
            c=*s;
317
0
            if(c<=0x7f) {
318
0
                if(!asciiBytes[c]) {
319
0
                    break;
320
0
                }
321
0
            } else if(c<=0x7ff) {
322
0
                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
323
0
                    break;
324
0
                }
325
0
            } else if(c<0xd800 || c>=0xe000) {
326
0
                int lead=c>>12;
327
0
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
328
0
                if(twoBits<=1) {
329
                    // All 64 code points with the same bits 15..6
330
                    // are either in the set or not.
331
0
                    if(twoBits==0) {
332
0
                        break;
333
0
                    }
334
0
                } else {
335
                    // Look up the code point in its 4k block of code points.
336
0
                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
337
0
                        break;
338
0
                    }
339
0
                }
340
0
            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
341
                // surrogate code point
342
0
                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
343
0
                    break;
344
0
                }
345
0
            } else {
346
                // surrogate pair
347
0
                if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
348
0
                    break;
349
0
                }
350
0
                ++s;
351
0
            }
352
0
        } while(++s<limit);
353
0
    } else {
354
        // span not
355
0
        do {
356
0
            c=*s;
357
0
            if(c<=0x7f) {
358
0
                if(asciiBytes[c]) {
359
0
                    break;
360
0
                }
361
0
            } else if(c<=0x7ff) {
362
0
                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
363
0
                    break;
364
0
                }
365
0
            } else if(c<0xd800 || c>=0xe000) {
366
0
                int lead=c>>12;
367
0
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
368
0
                if(twoBits<=1) {
369
                    // All 64 code points with the same bits 15..6
370
                    // are either in the set or not.
371
0
                    if(twoBits!=0) {
372
0
                        break;
373
0
                    }
374
0
                } else {
375
                    // Look up the code point in its 4k block of code points.
376
0
                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
377
0
                        break;
378
0
                    }
379
0
                }
380
0
            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
381
                // surrogate code point
382
0
                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
383
0
                    break;
384
0
                }
385
0
            } else {
386
                // surrogate pair
387
0
                if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
388
0
                    break;
389
0
                }
390
0
                ++s;
391
0
            }
392
0
        } while(++s<limit);
393
0
    }
394
0
    return s;
395
0
}
396
397
/* Symmetrical with span(). */
398
const UChar *
399
0
BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
400
0
    UChar c, c2;
401
402
0
    if(spanCondition) {
403
        // span
404
0
        for(;;) {
405
0
            c=*(--limit);
406
0
            if(c<=0x7f) {
407
0
                if(!asciiBytes[c]) {
408
0
                    break;
409
0
                }
410
0
            } else if(c<=0x7ff) {
411
0
                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
412
0
                    break;
413
0
                }
414
0
            } else if(c<0xd800 || c>=0xe000) {
415
0
                int lead=c>>12;
416
0
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
417
0
                if(twoBits<=1) {
418
                    // All 64 code points with the same bits 15..6
419
                    // are either in the set or not.
420
0
                    if(twoBits==0) {
421
0
                        break;
422
0
                    }
423
0
                } else {
424
                    // Look up the code point in its 4k block of code points.
425
0
                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
426
0
                        break;
427
0
                    }
428
0
                }
429
0
            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
430
                // surrogate code point
431
0
                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
432
0
                    break;
433
0
                }
434
0
            } else {
435
                // surrogate pair
436
0
                if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
437
0
                    break;
438
0
                }
439
0
                --limit;
440
0
            }
441
0
            if(s==limit) {
442
0
                return s;
443
0
            }
444
0
        }
445
0
    } else {
446
        // span not
447
0
        for(;;) {
448
0
            c=*(--limit);
449
0
            if(c<=0x7f) {
450
0
                if(asciiBytes[c]) {
451
0
                    break;
452
0
                }
453
0
            } else if(c<=0x7ff) {
454
0
                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
455
0
                    break;
456
0
                }
457
0
            } else if(c<0xd800 || c>=0xe000) {
458
0
                int lead=c>>12;
459
0
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
460
0
                if(twoBits<=1) {
461
                    // All 64 code points with the same bits 15..6
462
                    // are either in the set or not.
463
0
                    if(twoBits!=0) {
464
0
                        break;
465
0
                    }
466
0
                } else {
467
                    // Look up the code point in its 4k block of code points.
468
0
                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
469
0
                        break;
470
0
                    }
471
0
                }
472
0
            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
473
                // surrogate code point
474
0
                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
475
0
                    break;
476
0
                }
477
0
            } else {
478
                // surrogate pair
479
0
                if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
480
0
                    break;
481
0
                }
482
0
                --limit;
483
0
            }
484
0
            if(s==limit) {
485
0
                return s;
486
0
            }
487
0
        }
488
0
    }
489
0
    return limit+1;
490
0
}
491
492
/*
493
 * Precheck for sufficient trail bytes at end of string only once per span.
494
 * Check validity.
495
 */
496
const uint8_t *
497
0
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
498
0
    const uint8_t *limit=s+length;
499
0
    uint8_t b=*s;
500
0
    if((int8_t)b>=0) {
501
        // Initial all-ASCII span.
502
0
        if(spanCondition) {
503
0
            do {
504
0
                if(!asciiBytes[b] || ++s==limit) {
505
0
                    return s;
506
0
                }
507
0
                b=*s;
508
0
            } while((int8_t)b>=0);
509
0
        } else {
510
0
            do {
511
0
                if(asciiBytes[b] || ++s==limit) {
512
0
                    return s;
513
0
                }
514
0
                b=*s;
515
0
            } while((int8_t)b>=0);
516
0
        }
517
0
        length=(int32_t)(limit-s);
518
0
    }
519
520
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
521
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
522
0
    }
523
524
0
    const uint8_t *limit0=limit;
525
526
    /*
527
     * Make sure that the last 1/2/3/4-byte sequence before limit is complete
528
     * or runs into a lead byte.
529
     * In the span loop compare s with limit only once
530
     * per multi-byte character.
531
     *
532
     * Give a trailing illegal sequence the same value as the result of contains(FFFD),
533
     * including it if that is part of the span, otherwise set limit0 to before
534
     * the truncated sequence.
535
     */
536
0
    b=*(limit-1);
537
0
    if((int8_t)b<0) {
538
        // b>=0x80: lead or trail byte
539
0
        if(b<0xc0) {
540
            // single trail byte, check for preceding 3- or 4-byte lead byte
541
0
            if(length>=2 && (b=*(limit-2))>=0xe0) {
542
0
                limit-=2;
543
0
                if(asciiBytes[0x80]!=spanCondition) {
544
0
                    limit0=limit;
545
0
                }
546
0
            } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
547
                // 4-byte lead byte with only two trail bytes
548
0
                limit-=3;
549
0
                if(asciiBytes[0x80]!=spanCondition) {
550
0
                    limit0=limit;
551
0
                }
552
0
            }
553
0
        } else {
554
            // lead byte with no trail bytes
555
0
            --limit;
556
0
            if(asciiBytes[0x80]!=spanCondition) {
557
0
                limit0=limit;
558
0
            }
559
0
        }
560
0
    }
561
562
0
    uint8_t t1, t2, t3;
563
564
0
    while(s<limit) {
565
0
        b=*s;
566
0
        if(b<0xc0) {
567
            // ASCII; or trail bytes with the result of contains(FFFD).
568
0
            if(spanCondition) {
569
0
                do {
570
0
                    if(!asciiBytes[b]) {
571
0
                        return s;
572
0
                    } else if(++s==limit) {
573
0
                        return limit0;
574
0
                    }
575
0
                    b=*s;
576
0
                } while(b<0xc0);
577
0
            } else {
578
0
                do {
579
0
                    if(asciiBytes[b]) {
580
0
                        return s;
581
0
                    } else if(++s==limit) {
582
0
                        return limit0;
583
0
                    }
584
0
                    b=*s;
585
0
                } while(b<0xc0);
586
0
            }
587
0
        }
588
0
        ++s;  // Advance past the lead byte.
589
0
        if(b>=0xe0) {
590
0
            if(b<0xf0) {
591
0
                if( /* handle U+0000..U+FFFF inline */
592
0
                    (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
593
0
                    (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
594
0
                ) {
595
0
                    b&=0xf;
596
0
                    uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
597
0
                    if(twoBits<=1) {
598
                        // All 64 code points with this lead byte and middle trail byte
599
                        // are either in the set or not.
600
0
                        if(twoBits!=(uint32_t)spanCondition) {
601
0
                            return s-1;
602
0
                        }
603
0
                    } else {
604
                        // Look up the code point in its 4k block of code points.
605
0
                        UChar32 c=(b<<12)|(t1<<6)|t2;
606
0
                        if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
607
0
                            return s-1;
608
0
                        }
609
0
                    }
610
0
                    s+=2;
611
0
                    continue;
612
0
                }
613
0
            } else if( /* handle U+10000..U+10FFFF inline */
614
0
                (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
615
0
                (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
616
0
                (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
617
0
            ) {
618
                // Give an illegal sequence the same value as the result of contains(FFFD).
619
0
                UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
620
0
                if( (   (0x10000<=c && c<=0x10ffff) ?
621
0
                            containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
622
0
                            asciiBytes[0x80]
623
0
                    ) != spanCondition
624
0
                ) {
625
0
                    return s-1;
626
0
                }
627
0
                s+=3;
628
0
                continue;
629
0
            }
630
0
        } else /* 0xc0<=b<0xe0 */ {
631
0
            if( /* handle U+0000..U+07FF inline */
632
0
                (t1=(uint8_t)(*s-0x80)) <= 0x3f
633
0
            ) {
634
0
                if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
635
0
                    return s-1;
636
0
                }
637
0
                ++s;
638
0
                continue;
639
0
            }
640
0
        }
641
642
        // Give an illegal sequence the same value as the result of contains(FFFD).
643
        // Handle each byte of an illegal sequence separately to simplify the code;
644
        // no need to optimize error handling.
645
0
        if(asciiBytes[0x80]!=spanCondition) {
646
0
            return s-1;
647
0
        }
648
0
    }
649
650
0
    return limit0;
651
0
}
652
653
/*
654
 * While going backwards through UTF-8 optimize only for ASCII.
655
 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
656
 * possible to tell from the last byte in a multi-byte sequence how many
657
 * preceding bytes there should be. Therefore, going backwards through UTF-8
658
 * is much harder than going forward.
659
 */
660
int32_t
661
0
BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
662
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
663
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
664
0
    }
665
666
0
    uint8_t b;
667
668
0
    do {
669
0
        b=s[--length];
670
0
        if((int8_t)b>=0) {
671
            // ASCII sub-span
672
0
            if(spanCondition) {
673
0
                do {
674
0
                    if(!asciiBytes[b]) {
675
0
                        return length+1;
676
0
                    } else if(length==0) {
677
0
                        return 0;
678
0
                    }
679
0
                    b=s[--length];
680
0
                } while((int8_t)b>=0);
681
0
            } else {
682
0
                do {
683
0
                    if(asciiBytes[b]) {
684
0
                        return length+1;
685
0
                    } else if(length==0) {
686
0
                        return 0;
687
0
                    }
688
0
                    b=s[--length];
689
0
                } while((int8_t)b>=0);
690
0
            }
691
0
        }
692
693
0
        int32_t prev=length;
694
0
        UChar32 c;
695
        // trail byte: collect a multi-byte character
696
        // (or  lead byte in last-trail position)
697
0
        c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
698
        // c is a valid code point, not ASCII, not a surrogate
699
0
        if(c<=0x7ff) {
700
0
            if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
701
0
                return prev+1;
702
0
            }
703
0
        } else if(c<=0xffff) {
704
0
            int lead=c>>12;
705
0
            uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
706
0
            if(twoBits<=1) {
707
                // All 64 code points with the same bits 15..6
708
                // are either in the set or not.
709
0
                if(twoBits!=(uint32_t)spanCondition) {
710
0
                    return prev+1;
711
0
                }
712
0
            } else {
713
                // Look up the code point in its 4k block of code points.
714
0
                if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
715
0
                    return prev+1;
716
0
                }
717
0
            }
718
0
        } else {
719
0
            if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
720
0
                return prev+1;
721
0
            }
722
0
        }
723
0
    } while(length>0);
724
0
    return 0;
725
0
}
726
727
U_NAMESPACE_END