Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/common/bmpset.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2007-2012, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  bmpset.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2007jan29
16
*   created by: Markus W. Scherer
17
*/
18
19
#include "unicode/utypes.h"
20
#include "unicode/uniset.h"
21
#include "unicode/utf8.h"
22
#include "unicode/utf16.h"
23
#include "cmemory.h"
24
#include "bmpset.h"
25
#include "uassert.h"
26
27
U_NAMESPACE_BEGIN
28
29
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
30
0
        list(parentList), listLength(parentListLength) {
31
0
    uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
32
0
    uprv_memset(table7FF, 0, sizeof(table7FF));
33
0
    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
34
35
    /*
36
     * Set the list indexes for binary searches for
37
     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
38
     * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
39
     * looked up in the bit tables.
40
     * The last pair of indexes is for finding supplementary code points.
41
     */
42
0
    list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
43
0
    int32_t i;
44
0
    for(i=1; i<=0x10; ++i) {
45
0
        list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
46
0
    }
47
0
    list4kStarts[0x11]=listLength-1;
48
0
    containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
49
50
0
    initBits();
51
0
    overrideIllegal();
52
0
}
53
54
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
55
0
        containsFFFD(otherBMPSet.containsFFFD),
56
0
        list(newParentList), listLength(newParentListLength) {
57
0
    uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
58
0
    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
59
0
    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
60
0
    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
61
0
}
62
63
0
BMPSet::~BMPSet() {
64
0
}
65
66
/*
67
 * Set bits in a bit rectangle in "vertical" bit organization.
68
 * start<limit<=0x800
69
 */
70
0
static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
71
0
    U_ASSERT(start<limit);
72
0
    U_ASSERT(limit<=0x800);
73
74
0
    int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
75
0
    int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
76
77
    // Set one bit indicating an all-one block.
78
0
    uint32_t bits=(uint32_t)1<<lead;
79
0
    if((start+1)==limit) {  // Single-character shortcut.
80
0
        table[trail]|=bits;
81
0
        return;
82
0
    }
83
84
0
    int32_t limitLead=limit>>6;
85
0
    int32_t limitTrail=limit&0x3f;
86
87
0
    if(lead==limitLead) {
88
        // Partial vertical bit column.
89
0
        while(trail<limitTrail) {
90
0
            table[trail++]|=bits;
91
0
        }
92
0
    } else {
93
        // Partial vertical bit column,
94
        // followed by a bit rectangle,
95
        // followed by another partial vertical bit column.
96
0
        if(trail>0) {
97
0
            do {
98
0
                table[trail++]|=bits;
99
0
            } while(trail<64);
100
0
            ++lead;
101
0
        }
102
0
        if(lead<limitLead) {
103
0
            bits=~(((unsigned)1<<lead)-1);
104
0
            if(limitLead<0x20) {
105
0
                bits&=((unsigned)1<<limitLead)-1;
106
0
            }
107
0
            for(trail=0; trail<64; ++trail) {
108
0
                table[trail]|=bits;
109
0
            }
110
0
        }
111
        // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
112
        // In that case, bits=1<<limitLead is undefined but the bits value
113
        // is not used because trail<limitTrail is already false.
114
0
        bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
115
0
        for(trail=0; trail<limitTrail; ++trail) {
116
0
            table[trail]|=bits;
117
0
        }
118
0
    }
119
0
}
120
121
0
void BMPSet::initBits() {
122
0
    UChar32 start, limit;
123
0
    int32_t listIndex=0;
124
125
    // Set latin1Contains[].
126
0
    do {
127
0
        start=list[listIndex++];
128
0
        if(listIndex<listLength) {
129
0
            limit=list[listIndex++];
130
0
        } else {
131
0
            limit=0x110000;
132
0
        }
133
0
        if(start>=0x100) {
134
0
            break;
135
0
        }
136
0
        do {
137
0
            latin1Contains[start++]=1;
138
0
        } while(start<limit && start<0x100);
139
0
    } while(limit<=0x100);
140
141
    // Find the first range overlapping with (or after) 80..FF again,
142
    // to include them in table7FF as well.
143
0
    for(listIndex=0;;) {
144
0
        start=list[listIndex++];
145
0
        if(listIndex<listLength) {
146
0
            limit=list[listIndex++];
147
0
        } else {
148
0
            limit=0x110000;
149
0
        }
150
0
        if(limit>0x80) {
151
0
            if(start<0x80) {
152
0
                start=0x80;
153
0
            }
154
0
            break;
155
0
        }
156
0
    }
157
158
    // Set table7FF[].
159
0
    while(start<0x800) {
160
0
        set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
161
0
        if(limit>0x800) {
162
0
            start=0x800;
163
0
            break;
164
0
        }
165
166
0
        start=list[listIndex++];
167
0
        if(listIndex<listLength) {
168
0
            limit=list[listIndex++];
169
0
        } else {
170
0
            limit=0x110000;
171
0
        }
172
0
    }
173
174
    // Set bmpBlockBits[].
175
0
    int32_t minStart=0x800;
176
0
    while(start<0x10000) {
177
0
        if(limit>0x10000) {
178
0
            limit=0x10000;
179
0
        }
180
181
0
        if(start<minStart) {
182
0
            start=minStart;
183
0
        }
184
0
        if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
185
0
            if(start&0x3f) {
186
                // Mixed-value block of 64 code points.
187
0
                start>>=6;
188
0
                bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
189
0
                start=(start+1)<<6;  // Round up to the next block boundary.
190
0
                minStart=start;      // Ignore further ranges in this block.
191
0
            }
192
0
            if(start<limit) {
193
0
                if(start<(limit&~0x3f)) {
194
                    // Multiple all-ones blocks of 64 code points each.
195
0
                    set32x64Bits(bmpBlockBits, start>>6, limit>>6);
196
0
                }
197
198
0
                if(limit&0x3f) {
199
                    // Mixed-value block of 64 code points.
200
0
                    limit>>=6;
201
0
                    bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
202
0
                    limit=(limit+1)<<6;  // Round up to the next block boundary.
203
0
                    minStart=limit;      // Ignore further ranges in this block.
204
0
                }
205
0
            }
206
0
        }
207
208
0
        if(limit==0x10000) {
209
0
            break;
210
0
        }
211
212
0
        start=list[listIndex++];
213
0
        if(listIndex<listLength) {
214
0
            limit=list[listIndex++];
215
0
        } else {
216
0
            limit=0x110000;
217
0
        }
218
0
    }
219
0
}
220
221
/*
222
 * Override some bits and bytes to the result of contains(FFFD)
223
 * for faster validity checking at runtime.
224
 * No need to set 0 values where they were reset to 0 in the constructor
225
 * and not modified by initBits().
226
 * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
227
 * Need to set 0 values for surrogates D800..DFFF.
228
 */
229
0
void BMPSet::overrideIllegal() {
230
0
    uint32_t bits, mask;
231
0
    int32_t i;
232
233
0
    if(containsFFFD) {
234
0
        bits=3;                 // Lead bytes 0xC0 and 0xC1.
235
0
        for(i=0; i<64; ++i) {
236
0
            table7FF[i]|=bits;
237
0
        }
238
239
0
        bits=1;                 // Lead byte 0xE0.
240
0
        for(i=0; i<32; ++i) {   // First half of 4k block.
241
0
            bmpBlockBits[i]|=bits;
242
0
        }
243
244
0
        mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.
245
0
        bits=1<<0xd;
246
0
        for(i=32; i<64; ++i) {  // Second half of 4k block.
247
0
            bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
248
0
        }
249
0
    } else {
250
0
        mask= static_cast<uint32_t>(~(0x10001<<0xd));   // Lead byte 0xED.
251
0
        for(i=32; i<64; ++i) {  // Second half of 4k block.
252
0
            bmpBlockBits[i]&=mask;
253
0
        }
254
0
    }
255
0
}
256
257
0
int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
258
    /* Examples:
259
                                       findCodePoint(c)
260
       set              list[]         c=0 1 3 4 7 8
261
       ===              ==============   ===========
262
       []               [110000]         0 0 0 0 0 0
263
       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
264
       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
265
       [:Any:]          [0, 110000]      1 1 1 1 1 1
266
     */
267
268
    // Return the smallest i such that c < list[i].  Assume
269
    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
270
0
    if (c < list[lo])
271
0
        return lo;
272
    // High runner test.  c is often after the last range, so an
273
    // initial check for this condition pays off.
274
0
    if (lo >= hi || c >= list[hi-1])
275
0
        return hi;
276
    // invariant: c >= list[lo]
277
    // invariant: c < list[hi]
278
0
    for (;;) {
279
0
        int32_t i = (lo + hi) >> 1;
280
0
        if (i == lo) {
281
0
            break; // Found!
282
0
        } else if (c < list[i]) {
283
0
            hi = i;
284
0
        } else {
285
0
            lo = i;
286
0
        }
287
0
    }
288
0
    return hi;
289
0
}
290
291
UBool
292
0
BMPSet::contains(UChar32 c) const {
293
0
    if((uint32_t)c<=0xff) {
294
0
        return (UBool)latin1Contains[c];
295
0
    } else if((uint32_t)c<=0x7ff) {
296
0
        return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
297
0
    } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
298
0
        int lead=c>>12;
299
0
        uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
300
0
        if(twoBits<=1) {
301
            // All 64 code points with the same bits 15..6
302
            // are either in the set or not.
303
0
            return (UBool)twoBits;
304
0
        } else {
305
            // Look up the code point in its 4k block of code points.
306
0
            return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
307
0
        }
308
0
    } else if((uint32_t)c<=0x10ffff) {
309
        // surrogate or supplementary code point
310
0
        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
311
0
    } else {
312
        // Out-of-range code points get FALSE, consistent with long-standing
313
        // behavior of UnicodeSet::contains(c).
314
0
        return FALSE;
315
0
    }
316
0
}
317
318
/*
319
 * Check for sufficient length for trail unit for each surrogate pair.
320
 * Handle single surrogates as surrogate code points as usual in ICU.
321
 */
322
const UChar *
323
0
BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
324
0
    UChar c, c2;
325
326
0
    if(spanCondition) {
327
        // span
328
0
        do {
329
0
            c=*s;
330
0
            if(c<=0xff) {
331
0
                if(!latin1Contains[c]) {
332
0
                    break;
333
0
                }
334
0
            } else if(c<=0x7ff) {
335
0
                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
336
0
                    break;
337
0
                }
338
0
            } else if(c<0xd800 || c>=0xe000) {
339
0
                int lead=c>>12;
340
0
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
341
0
                if(twoBits<=1) {
342
                    // All 64 code points with the same bits 15..6
343
                    // are either in the set or not.
344
0
                    if(twoBits==0) {
345
0
                        break;
346
0
                    }
347
0
                } else {
348
                    // Look up the code point in its 4k block of code points.
349
0
                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
350
0
                        break;
351
0
                    }
352
0
                }
353
0
            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
354
                // surrogate code point
355
0
                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
356
0
                    break;
357
0
                }
358
0
            } else {
359
                // surrogate pair
360
0
                if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
361
0
                    break;
362
0
                }
363
0
                ++s;
364
0
            }
365
0
        } while(++s<limit);
366
0
    } else {
367
        // span not
368
0
        do {
369
0
            c=*s;
370
0
            if(c<=0xff) {
371
0
                if(latin1Contains[c]) {
372
0
                    break;
373
0
                }
374
0
            } else if(c<=0x7ff) {
375
0
                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
376
0
                    break;
377
0
                }
378
0
            } else if(c<0xd800 || c>=0xe000) {
379
0
                int lead=c>>12;
380
0
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
381
0
                if(twoBits<=1) {
382
                    // All 64 code points with the same bits 15..6
383
                    // are either in the set or not.
384
0
                    if(twoBits!=0) {
385
0
                        break;
386
0
                    }
387
0
                } else {
388
                    // Look up the code point in its 4k block of code points.
389
0
                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
390
0
                        break;
391
0
                    }
392
0
                }
393
0
            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
394
                // surrogate code point
395
0
                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
396
0
                    break;
397
0
                }
398
0
            } else {
399
                // surrogate pair
400
0
                if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
401
0
                    break;
402
0
                }
403
0
                ++s;
404
0
            }
405
0
        } while(++s<limit);
406
0
    }
407
0
    return s;
408
0
}
409
410
/* Symmetrical with span(). */
411
const UChar *
412
0
BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
413
0
    UChar c, c2;
414
415
0
    if(spanCondition) {
416
        // span
417
0
        for(;;) {
418
0
            c=*(--limit);
419
0
            if(c<=0xff) {
420
0
                if(!latin1Contains[c]) {
421
0
                    break;
422
0
                }
423
0
            } else if(c<=0x7ff) {
424
0
                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
425
0
                    break;
426
0
                }
427
0
            } else if(c<0xd800 || c>=0xe000) {
428
0
                int lead=c>>12;
429
0
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
430
0
                if(twoBits<=1) {
431
                    // All 64 code points with the same bits 15..6
432
                    // are either in the set or not.
433
0
                    if(twoBits==0) {
434
0
                        break;
435
0
                    }
436
0
                } else {
437
                    // Look up the code point in its 4k block of code points.
438
0
                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
439
0
                        break;
440
0
                    }
441
0
                }
442
0
            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
443
                // surrogate code point
444
0
                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
445
0
                    break;
446
0
                }
447
0
            } else {
448
                // surrogate pair
449
0
                if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
450
0
                    break;
451
0
                }
452
0
                --limit;
453
0
            }
454
0
            if(s==limit) {
455
0
                return s;
456
0
            }
457
0
        }
458
0
    } else {
459
        // span not
460
0
        for(;;) {
461
0
            c=*(--limit);
462
0
            if(c<=0xff) {
463
0
                if(latin1Contains[c]) {
464
0
                    break;
465
0
                }
466
0
            } else if(c<=0x7ff) {
467
0
                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
468
0
                    break;
469
0
                }
470
0
            } else if(c<0xd800 || c>=0xe000) {
471
0
                int lead=c>>12;
472
0
                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
473
0
                if(twoBits<=1) {
474
                    // All 64 code points with the same bits 15..6
475
                    // are either in the set or not.
476
0
                    if(twoBits!=0) {
477
0
                        break;
478
0
                    }
479
0
                } else {
480
                    // Look up the code point in its 4k block of code points.
481
0
                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
482
0
                        break;
483
0
                    }
484
0
                }
485
0
            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
486
                // surrogate code point
487
0
                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
488
0
                    break;
489
0
                }
490
0
            } else {
491
                // surrogate pair
492
0
                if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
493
0
                    break;
494
0
                }
495
0
                --limit;
496
0
            }
497
0
            if(s==limit) {
498
0
                return s;
499
0
            }
500
0
        }
501
0
    }
502
0
    return limit+1;
503
0
}
504
505
/*
506
 * Precheck for sufficient trail bytes at end of string only once per span.
507
 * Check validity.
508
 */
509
const uint8_t *
510
0
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
511
0
    const uint8_t *limit=s+length;
512
0
    uint8_t b=*s;
513
0
    if(U8_IS_SINGLE(b)) {
514
        // Initial all-ASCII span.
515
0
        if(spanCondition) {
516
0
            do {
517
0
                if(!latin1Contains[b] || ++s==limit) {
518
0
                    return s;
519
0
                }
520
0
                b=*s;
521
0
            } while(U8_IS_SINGLE(b));
522
0
        } else {
523
0
            do {
524
0
                if(latin1Contains[b] || ++s==limit) {
525
0
                    return s;
526
0
                }
527
0
                b=*s;
528
0
            } while(U8_IS_SINGLE(b));
529
0
        }
530
0
        length=(int32_t)(limit-s);
531
0
    }
532
533
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
534
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
535
0
    }
536
537
0
    const uint8_t *limit0=limit;
538
539
    /*
540
     * Make sure that the last 1/2/3/4-byte sequence before limit is complete
541
     * or runs into a lead byte.
542
     * In the span loop compare s with limit only once
543
     * per multi-byte character.
544
     *
545
     * Give a trailing illegal sequence the same value as the result of contains(FFFD),
546
     * including it if that is part of the span, otherwise set limit0 to before
547
     * the truncated sequence.
548
     */
549
0
    b=*(limit-1);
550
0
    if((int8_t)b<0) {
551
        // b>=0x80: lead or trail byte
552
0
        if(b<0xc0) {
553
            // single trail byte, check for preceding 3- or 4-byte lead byte
554
0
            if(length>=2 && (b=*(limit-2))>=0xe0) {
555
0
                limit-=2;
556
0
                if(containsFFFD!=spanCondition) {
557
0
                    limit0=limit;
558
0
                }
559
0
            } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
560
                // 4-byte lead byte with only two trail bytes
561
0
                limit-=3;
562
0
                if(containsFFFD!=spanCondition) {
563
0
                    limit0=limit;
564
0
                }
565
0
            }
566
0
        } else {
567
            // lead byte with no trail bytes
568
0
            --limit;
569
0
            if(containsFFFD!=spanCondition) {
570
0
                limit0=limit;
571
0
            }
572
0
        }
573
0
    }
574
575
0
    uint8_t t1, t2, t3;
576
577
0
    while(s<limit) {
578
0
        b=*s;
579
0
        if(U8_IS_SINGLE(b)) {
580
            // ASCII
581
0
            if(spanCondition) {
582
0
                do {
583
0
                    if(!latin1Contains[b]) {
584
0
                        return s;
585
0
                    } else if(++s==limit) {
586
0
                        return limit0;
587
0
                    }
588
0
                    b=*s;
589
0
                } while(U8_IS_SINGLE(b));
590
0
            } else {
591
0
                do {
592
0
                    if(latin1Contains[b]) {
593
0
                        return s;
594
0
                    } else if(++s==limit) {
595
0
                        return limit0;
596
0
                    }
597
0
                    b=*s;
598
0
                } while(U8_IS_SINGLE(b));
599
0
            }
600
0
        }
601
0
        ++s;  // Advance past the lead byte.
602
0
        if(b>=0xe0) {
603
0
            if(b<0xf0) {
604
0
                if( /* handle U+0000..U+FFFF inline */
605
0
                    (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
606
0
                    (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
607
0
                ) {
608
0
                    b&=0xf;
609
0
                    uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
610
0
                    if(twoBits<=1) {
611
                        // All 64 code points with this lead byte and middle trail byte
612
                        // are either in the set or not.
613
0
                        if(twoBits!=(uint32_t)spanCondition) {
614
0
                            return s-1;
615
0
                        }
616
0
                    } else {
617
                        // Look up the code point in its 4k block of code points.
618
0
                        UChar32 c=(b<<12)|(t1<<6)|t2;
619
0
                        if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
620
0
                            return s-1;
621
0
                        }
622
0
                    }
623
0
                    s+=2;
624
0
                    continue;
625
0
                }
626
0
            } else if( /* handle U+10000..U+10FFFF inline */
627
0
                (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
628
0
                (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
629
0
                (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
630
0
            ) {
631
                // Give an illegal sequence the same value as the result of contains(FFFD).
632
0
                UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
633
0
                if( (   (0x10000<=c && c<=0x10ffff) ?
634
0
                            containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
635
0
                            containsFFFD
636
0
                    ) != spanCondition
637
0
                ) {
638
0
                    return s-1;
639
0
                }
640
0
                s+=3;
641
0
                continue;
642
0
            }
643
0
        } else {
644
0
            if( /* handle U+0000..U+07FF inline */
645
0
                b>=0xc0 &&
646
0
                (t1=(uint8_t)(*s-0x80)) <= 0x3f
647
0
            ) {
648
0
                if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
649
0
                    return s-1;
650
0
                }
651
0
                ++s;
652
0
                continue;
653
0
            }
654
0
        }
655
656
        // Give an illegal sequence the same value as the result of contains(FFFD).
657
        // Handle each byte of an illegal sequence separately to simplify the code;
658
        // no need to optimize error handling.
659
0
        if(containsFFFD!=spanCondition) {
660
0
            return s-1;
661
0
        }
662
0
    }
663
664
0
    return limit0;
665
0
}
666
667
/*
668
 * While going backwards through UTF-8 optimize only for ASCII.
669
 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
670
 * possible to tell from the last byte in a multi-byte sequence how many
671
 * preceding bytes there should be. Therefore, going backwards through UTF-8
672
 * is much harder than going forward.
673
 */
674
int32_t
675
0
BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
676
0
    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
677
0
        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
678
0
    }
679
680
0
    uint8_t b;
681
682
0
    do {
683
0
        b=s[--length];
684
0
        if(U8_IS_SINGLE(b)) {
685
            // ASCII sub-span
686
0
            if(spanCondition) {
687
0
                do {
688
0
                    if(!latin1Contains[b]) {
689
0
                        return length+1;
690
0
                    } else if(length==0) {
691
0
                        return 0;
692
0
                    }
693
0
                    b=s[--length];
694
0
                } while(U8_IS_SINGLE(b));
695
0
            } else {
696
0
                do {
697
0
                    if(latin1Contains[b]) {
698
0
                        return length+1;
699
0
                    } else if(length==0) {
700
0
                        return 0;
701
0
                    }
702
0
                    b=s[--length];
703
0
                } while(U8_IS_SINGLE(b));
704
0
            }
705
0
        }
706
707
0
        int32_t prev=length;
708
0
        UChar32 c;
709
        // trail byte: collect a multi-byte character
710
        // (or  lead byte in last-trail position)
711
0
        c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
712
        // c is a valid code point, not ASCII, not a surrogate
713
0
        if(c<=0x7ff) {
714
0
            if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
715
0
                return prev+1;
716
0
            }
717
0
        } else if(c<=0xffff) {
718
0
            int lead=c>>12;
719
0
            uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
720
0
            if(twoBits<=1) {
721
                // All 64 code points with the same bits 15..6
722
                // are either in the set or not.
723
0
                if(twoBits!=(uint32_t)spanCondition) {
724
0
                    return prev+1;
725
0
                }
726
0
            } else {
727
                // Look up the code point in its 4k block of code points.
728
0
                if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
729
0
                    return prev+1;
730
0
                }
731
0
            }
732
0
        } else {
733
0
            if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
734
0
                return prev+1;
735
0
            }
736
0
        }
737
0
    } while(length>0);
738
0
    return 0;
739
0
}
740
741
U_NAMESPACE_END