/src/icu/source/common/bmpset.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | ******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 2007, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | ******************************************************************************  | 
10  |  | *   file name:  bmpset.h  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:4  | 
14  |  | *  | 
15  |  | *   created on: 2007jan29  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | */  | 
18  |  |  | 
19  |  | #ifndef __BMPSET_H__  | 
20  |  | #define __BMPSET_H__  | 
21  |  |  | 
22  |  | #include "unicode/utypes.h"  | 
23  |  | #include "unicode/uniset.h"  | 
24  |  |  | 
25  |  | U_NAMESPACE_BEGIN  | 
26  |  |  | 
27  |  | /*  | 
28  |  |  * Helper class for frozen UnicodeSets, implements contains() and span()  | 
29  |  |  * optimized for BMP code points. Structured to be UTF-8-friendly.  | 
30  |  |  *  | 
31  |  |  * Latin-1: Look up bytes.  | 
32  |  |  * 2-byte characters: Bits organized vertically.  | 
33  |  |  * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,  | 
34  |  |  *                    with mixed for illegal ranges.  | 
35  |  |  * Supplementary characters: Binary search over  | 
36  |  |  * the supplementary part of the parent set's inversion list.  | 
37  |  |  */  | 
38  |  | class BMPSet : public UMemory { | 
39  |  | public:  | 
40  |  |     BMPSet(const int32_t *parentList, int32_t parentListLength);  /* NOTE(review): parentList is presumably the parent set's inversion list (cf. list/listLength below) -- confirm */  | 
41  |  |     BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength);  /* NOTE(review): looks like a copy of otherBMPSet bound to a new parent list -- confirm */  | 
42  |  |     virtual ~BMPSet();  | 
43  |  |  | 
44  |  |     virtual UBool contains(UChar32 c) const;  /* Membership test for a single code point c. */  | 
45  |  |  | 
46  |  |     /*  | 
47  |  |      * Span the initial substring for which each character c has spanCondition==contains(c).  | 
48  |  |      * Precondition: s<limit, and spanCondition is 0 or 1.  | 
49  |  |      * @return The string pointer which limits the span.  | 
50  |  |      */  | 
51  |  |     const UChar *span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;  | 
52  |  |  | 
53  |  |     /*  | 
54  |  |      * Span the trailing substring for which each character c has spanCondition==contains(c).  | 
55  |  |      * Precondition: s<limit, and spanCondition is 0 or 1.  | 
56  |  |      * @return The string pointer which starts the span.  | 
57  |  |      */  | 
58  |  |     const UChar *spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;  | 
59  |  |  | 
60  |  |     /*  | 
61  |  |      * Span the initial substring for which each character c has spanCondition==contains(c).  | 
62  |  |      * Precondition: length>0, and spanCondition is 0 or 1.  | 
63  |  |      * @return The string pointer which limits the span.  | 
64  |  |      */  | 
65  |  |     const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;  | 
66  |  |  | 
67  |  |     /*  | 
68  |  |      * Span the trailing substring for which each character c has spanCondition==contains(c).  | 
69  |  |      * Precondition: length>0, and spanCondition is 0 or 1.  | 
70  |  |      * @return The start of the span, as an int32_t (not a pointer, unlike spanUTF8()).  | 
71  |  |      */  | 
72  |  |     int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;  | 
73  |  |  | 
74  |  | private:  | 
75  |  |     void initBits();  /* NOTE(review): presumably fills the bit tables declared below -- confirm in the implementation */  | 
76  |  |     void overrideIllegal();  /* NOTE(review): presumably applies the contains(FFFD) overrides described on the tables below -- confirm */  | 
77  |  |  | 
78  |  |     /**  | 
79  |  |      * Same as UnicodeSet::findCodePoint(UChar32 c) const except that the  | 
80  |  |      * binary search is restricted for finding code points in a certain range.  | 
81  |  |      *  | 
82  |  |      * For restricting the search for finding in the range start..end,  | 
83  |  |      * pass in  | 
84  |  |      *   lo=findCodePoint(start) and  | 
85  |  |      *   hi=findCodePoint(end)  | 
86  |  |      * with 0<=lo<=hi<len.  | 
87  |  |      * findCodePoint(c) defaults to lo=0 and hi=len-1.  | 
88  |  |      *  | 
89  |  |      * @param c a character in a subrange of MIN_VALUE..MAX_VALUE  | 
90  |  |      * @param lo The lowest index to be returned.  | 
91  |  |      * @param hi The highest index to be returned.  | 
92  |  |      * @return the smallest integer i in the range lo..hi,  | 
93  |  |      *         inclusive, such that c < list[i]  | 
94  |  |      */  | 
95  |  |     int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;  | 
96  |  |  | 
97  |  |     inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;  /* Defined inline after the class: the low bit of findCodePoint(c, lo, hi). */  | 
98  |  |  | 
99  |  |     /*  | 
100  |  |      * One byte 0 or 1 per Latin-1 character.  | 
101  |  |      */  | 
102  |  |     UBool latin1Contains[0x100];  | 
103  |  |  | 
104  |  |     /* true if contains(U+FFFD). */  | 
105  |  |     UBool containsFFFD;  | 
106  |  |  | 
107  |  |     /*  | 
108  |  |      * One bit per code point from U+0000..U+07FF.  | 
109  |  |      * The bits are organized vertically; consecutive code points  | 
110  |  |      * correspond to the same bit positions in consecutive table words.  | 
111  |  |      * With code point parts  | 
112  |  |      *   lead=c{10..6} | 
113  |  |      *   trail=c{5..0} | 
114  |  |      * set.contains(c)==(bit number lead of table7FF[trail])  | 
115  |  |      *  | 
116  |  |      * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)  | 
117  |  |      * for faster validity checking at runtime.  | 
118  |  |      */  | 
119  |  |     uint32_t table7FF[64];  | 
120  |  |  | 
121  |  |     /*  | 
122  |  |      * One bit per 64 BMP code points.  | 
123  |  |      * The bits are organized vertically; consecutive 64-code point blocks  | 
124  |  |      * correspond to the same bit position in consecutive table words.  | 
125  |  |      * With code point parts  | 
126  |  |      *   lead=c{15..12} | 
127  |  |      *   t1=c{11..6} | 
128  |  |      * test bit (lead+16) (the upper bit) and bit lead (the lower bit) in bmpBlockBits[t1].  | 
129  |  |      * If the upper bit is 0, then the lower bit indicates if contains(c)  | 
130  |  |      * for all code points in the 64-block.  | 
131  |  |      * If the upper bit is 1, then the block is mixed and set.contains(c)  | 
132  |  |      * must be called.  | 
133  |  |      *  | 
134  |  |      * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to  | 
135  |  |      * the result of contains(FFFD) for faster validity checking at runtime.  | 
136  |  |      */  | 
137  |  |     uint32_t bmpBlockBits[64];  | 
138  |  |  | 
139  |  |     /*  | 
140  |  |      * Inversion list indexes for restricted binary searches in  | 
141  |  |      * findCodePoint(), from  | 
142  |  |      * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).  | 
143  |  |      * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are  | 
144  |  |      * always looked up in the bit tables.  | 
145  |  |      * The last pair of indexes is for finding supplementary code points.  | 
146  |  |      */  | 
147  |  |     int32_t list4kStarts[18];  | 
148  |  |  | 
149  |  |     /*  | 
150  |  |      * The inversion list of the parent set, for the slower contains() implementation  | 
151  |  |      * for mixed BMP blocks and for supplementary code points.  | 
152  |  |      * The list is terminated with list[listLength-1]=0x110000.  | 
153  |  |      */  | 
154  |  |     const int32_t *list;  | 
155  |  |     int32_t listLength;  | 
156  |  | };  | 
157  |  |  | 
158  | 0  | inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const { | 
159  | 0  |     return static_cast<UBool>(findCodePoint(c, lo, hi) & 1);  /* keep only the low bit of the index found within lo..hi */  | 
160  | 0  | }  | 
161  |  |  | 
162  |  | U_NAMESPACE_END  | 
163  |  |  | 
164  |  | #endif  |