/src/icu/source/i18n/collationfcd.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | * Copyright (C) 2012-2014, International Business Machines  | 
6  |  | * Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | * collationfcd.h  | 
9  |  | *  | 
10  |  | * created on: 2012aug18  | 
11  |  | * created by: Markus W. Scherer  | 
12  |  | */  | 
13  |  |  | 
14  |  | #ifndef __COLLATIONFCD_H__  | 
15  |  | #define __COLLATIONFCD_H__  | 
16  |  |  | 
17  |  | #include "unicode/utypes.h"  | 
18  |  |  | 
19  |  | #if !UCONFIG_NO_COLLATION  | 
20  |  |  | 
21  |  | #include "unicode/utf16.h"  | 
22  |  |  | 
23  |  | U_NAMESPACE_BEGIN  | 
24  |  |  | 
25  |  | /**  | 
26  |  |  * Data and functions for the FCD check fast path.  | 
27  |  |  *  | 
28  |  |  * The fast path looks at a pair of 16-bit code units and checks  | 
29  |  |  * whether there is an FCD boundary between them;  | 
30  |  |  * there is if the first unit has a trailing ccc=0 (!hasTccc(first))  | 
31  |  |  * or the second unit has a leading ccc=0 (!hasLccc(second)),  | 
32  |  |  * or both.  | 
33  |  |  * When the fast path finds a possible non-boundary,  | 
34  |  |  * then the FCD check slow path looks at the actual sequence of FCD values.  | 
35  |  |  *  | 
36  |  |  * This is a pure optimization.  | 
37  |  |  * The fast path must at least find all possible non-boundaries.  | 
38  |  |  * If the fast path is too pessimistic, it costs performance.  | 
39  |  |  *  | 
40  |  |  * For a pair of BMP characters, the fast path tests are precise (1 bit per character).  | 
41  |  |  *  | 
42  |  |  * For a supplementary code point, the two units are its lead and trail surrogates.  | 
43  |  |  * We set hasTccc(lead)=true if any of its 1024 associated supplementary code points  | 
44  |  |  * has lccc!=0 or tccc!=0.  | 
45  |  |  * We set hasLccc(trail)=true for all trail surrogates.  | 
46  |  |  * As a result, we leave the fast path if the lead surrogate might start a  | 
47  |  |  * supplementary code point that is not FCD-inert.  | 
48  |  |  * (So the fast path need not detect that there is a surrogate pair,  | 
49  |  |  * nor look ahead to the next full code point.)  | 
50  |  |  *  | 
51  |  |  * hasLccc(lead)=true if any of its 1024 associated supplementary code points  | 
52  |  |  * has lccc!=0, for fast boundary checking between BMP & supplementary.  | 
53  |  |  *  | 
54  |  |  * hasTccc(trail)=false:  | 
55  |  |  * It should only be tested for unpaired trail surrogates which are FCD-inert.  | 
56  |  |  */  | 
57  |  | class U_I18N_API CollationFCD { | 
58  |  | public:  | 
59  | 0  |     static inline UBool hasLccc(UChar32 c) { | 
60  |  |         // assert c <= 0xffff  | 
61  |  |         // c can be negative, e.g., U_SENTINEL from UCharIterator;  | 
62  |  |         // that is handled in the first test.  | 
63  | 0  |         int32_t i;  | 
64  | 0  |         return  | 
65  |  |             // U+0300 is the first character with lccc!=0.  | 
66  | 0  |             c >= 0x300 &&  | 
67  | 0  |             (i = lcccIndex[c >> 5]) != 0 &&  | 
68  | 0  |             (lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;  | 
69  | 0  |     }  | 
70  |  |  | 
71  | 0  |     static inline UBool hasTccc(UChar32 c) { | 
72  |  |         // assert c <= 0xffff  | 
73  |  |         // c can be negative, e.g., U_SENTINEL from UCharIterator;  | 
74  |  |         // that is handled in the first test.  | 
75  | 0  |         int32_t i;  | 
76  | 0  |         return  | 
77  |  |             // U+00C0 is the first character with tccc!=0.  | 
78  | 0  |             c >= 0xc0 &&  | 
79  | 0  |             (i = tcccIndex[c >> 5]) != 0 &&  | 
80  | 0  |             (tcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;  | 
81  | 0  |     }  | 
82  |  |  | 
83  | 0  |     static inline UBool mayHaveLccc(UChar32 c) { | 
84  |  |         // Handles all of Unicode 0..10FFFF.  | 
85  |  |         // c can be negative, e.g., U_SENTINEL.  | 
86  |  |         // U+0300 is the first character with lccc!=0.  | 
87  | 0  |         if(c < 0x300) { return false; } | 
88  | 0  |         if(c > 0xffff) { c = U16_LEAD(c); } | 
89  | 0  |         int32_t i;  | 
90  | 0  |         return  | 
91  | 0  |             (i = lcccIndex[c >> 5]) != 0 &&  | 
92  | 0  |             (lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;  | 
93  | 0  |     }  | 
94  |  |  | 
95  |  |     /**  | 
96  |  |      * Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)  | 
97  |  |      * must be decomposed before reaching the core collation code,  | 
98  |  |      * or else some sequences including them, even ones passing the FCD check,  | 
99  |  |      * do not yield canonically equivalent results.  | 
100  |  |      *  | 
101  |  |      * This is a fast and imprecise test.  | 
102  |  |      *  | 
103  |  |      * @param c a code point  | 
104  |  |      * @return true if c is U+0F73, U+0F75 or U+0F81 or one of several other Tibetan characters  | 
105  |  |      */  | 
106  | 0  |     static inline UBool maybeTibetanCompositeVowel(UChar32 c) { | 
107  | 0  |         return (c & 0x1fff01) == 0xf01;  | 
108  | 0  |     }  | 
109  |  |  | 
110  |  |     /**  | 
111  |  |      * Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)  | 
112  |  |      * must be decomposed before reaching the core collation code,  | 
113  |  |      * or else some sequences including them, even ones passing the FCD check,  | 
114  |  |      * do not yield canonically equivalent results.  | 
115  |  |      *  | 
116  |  |      * They have distinct lccc/tccc combinations: 129/130 or 129/132.  | 
117  |  |      *  | 
118  |  |      * @param fcd16 the FCD value (lccc/tccc combination) of a code point  | 
119  |  |      * @return true if fcd16 is from U+0F73, U+0F75 or U+0F81  | 
120  |  |      */  | 
121  | 0  |     static inline UBool isFCD16OfTibetanCompositeVowel(uint16_t fcd16) { | 
122  | 0  |         return fcd16 == 0x8182 || fcd16 == 0x8184;  | 
123  | 0  |     }  | 
124  |  |  | 
125  |  | private:  | 
126  |  |     CollationFCD();  // No instantiation.  | 
127  |  |  | 
128  |  |     static const uint8_t lcccIndex[2048];  | 
129  |  |     static const uint8_t tcccIndex[2048];  | 
130  |  |     static const uint32_t lcccBits[];  | 
131  |  |     static const uint32_t tcccBits[];  | 
132  |  | };  | 
133  |  |  | 
134  |  | U_NAMESPACE_END  | 
135  |  |  | 
136  |  | #endif  // !UCONFIG_NO_COLLATION  | 
137  |  | #endif  // __COLLATIONFCD_H__  |