/src/icu/source/i18n/collationdata.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | * Copyright (C) 2010-2015, International Business Machines  | 
6  |  | * Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | * collationdata.h  | 
9  |  | *  | 
10  |  | * created on: 2010oct27  | 
11  |  | * created by: Markus W. Scherer  | 
12  |  | */  | 
13  |  |  | 
14  |  | #ifndef __COLLATIONDATA_H__  | 
15  |  | #define __COLLATIONDATA_H__  | 
16  |  |  | 
17  |  | #include "unicode/utypes.h"  | 
18  |  |  | 
19  |  | #if !UCONFIG_NO_COLLATION  | 
20  |  |  | 
21  |  | #include "unicode/ucol.h"  | 
22  |  | #include "unicode/uniset.h"  | 
23  |  | #include "collation.h"  | 
24  |  | #include "normalizer2impl.h"  | 
25  |  | #include "utrie2.h"  | 
26  |  |  | 
27  |  | struct UDataMemory;  | 
28  |  |  | 
29  |  | U_NAMESPACE_BEGIN  | 
30  |  |  | 
31  |  | class UVector32;  | 
32  |  |  | 
33  |  | /**  | 
34  |  |  * Collation data container.  | 
35  |  |  * Immutable data created by a CollationDataBuilder, or loaded from a file,  | 
36  |  |  * or deserialized from API-provided binary data.  | 
37  |  |  *  | 
38  |  |  * Includes data for the collation base (root/default), aliased if this is not the base.  | 
    |  |  *  | 
    |  |  * All data fields are public. The constructor only binds the nfcImpl reference,  | 
    |  |  * sets numericPrimary to 0x12000000, and sets every pointer to NULL and every  | 
    |  |  * length/count to 0; the real values are assigned from outside this struct.  | 
    |  |  * NOTE(review): this header does not show who owns the pointed-to arrays  | 
    |  |  * or who fills the fields in; confirm against the builder and data reader.  | 
39  |  |  */  | 
40  |  | struct U_I18N_API CollationData : public UMemory { | 
41  |  |     // Note: The ucadata.icu loader could discover the reserved ranges by setting an array  | 
42  |  |     // parallel with the ranges, and resetting ranges that are indexed.  | 
43  |  |     // The reordering builder code could clone the resulting template array.  | 
44  |  |     enum { | 
45  |  |         REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14,  | 
46  |  |         REORDER_RESERVED_AFTER_LATIN  | // no initializer: equals REORDER_RESERVED_BEFORE_LATIN + 1
47  |  |     };  | 
48  |  |  | 
49  |  |     enum { | 
50  |  |         MAX_NUM_SPECIAL_REORDER_CODES = 8,  | 
51  |  |         /** C++ only; the data reader checks scriptStartsLength against this limit. */  | 
52  |  |         MAX_NUM_SCRIPT_RANGES = 256  | 
53  |  |     };  | 
54  |  |  | // Constructor: creates an empty container. nfcImpl is a reference member, so nfc must outlive this object.
55  |  |     CollationData(const Normalizer2Impl &nfc)  | 
56  |  |             : trie(NULL),  | 
57  |  |               ce32s(NULL), ces(NULL), contexts(NULL), base(NULL),  | 
58  |  |               jamoCE32s(NULL),  | 
59  | 0  |               nfcImpl(nfc),  | 
60  | 0  |               numericPrimary(0x12000000),  | 
61  | 0  |               ce32sLength(0), cesLength(0), contextsLength(0),  | 
62  |  |               compressibleBytes(NULL),  | 
63  |  |               unsafeBackwardSet(NULL),  | 
64  | 0  |               fastLatinTable(NULL), fastLatinTableLength(0),  | 
65  | 0  |               numScripts(0), scriptsIndex(NULL), scriptStarts(NULL), scriptStartsLength(0),  | 
66  | 0  |               rootElements(NULL), rootElementsLength(0) {} | 
67  |  |  | // getCE32: returns the 32-bit value stored for c in the main lookup trie (UTRIE2_GET32).
68  | 0  |     uint32_t getCE32(UChar32 c) const { | 
69  | 0  |         return UTRIE2_GET32(trie, c);  | 
70  | 0  |     }  | 
71  |  |  | // getCE32FromSupplementary: same lookup via UTRIE2_GET32_FROM_SUPP; presumably c must be a supplementary code point (confirm in utrie2.h).
72  | 0  |     uint32_t getCE32FromSupplementary(UChar32 c) const { | 
73  | 0  |         return UTRIE2_GET32_FROM_SUPP(trie, c);  | 
74  | 0  |     }  | 
75  |  |  | // isDigit: below U+0660 only 0x30..0x39 count and no trie lookup is done; from U+0660 up, c is a digit if its CE32 has DIGIT_TAG.
76  | 0  |     UBool isDigit(UChar32 c) const { | 
77  | 0  |         return c < 0x660 ? c <= 0x39 && 0x30 <= c :  | 
78  | 0  |                 Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);  | 
79  | 0  |     }  | 
80  |  |  | // isUnsafeBackward: true if c is in unsafeBackwardSet, or if numeric is set and isDigit(c). Dereferences unsafeBackwardSet without a NULL check.
81  | 0  |     UBool isUnsafeBackward(UChar32 c, UBool numeric) const { | 
82  | 0  |         return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));  | 
83  | 0  |     }  | 
84  |  |  | // isCompressibleLeadByte: returns compressibleBytes[b] with no bounds check; the array is documented below as holding 256 flags.
85  | 0  |     UBool isCompressibleLeadByte(uint32_t b) const { | 
86  | 0  |         return compressibleBytes[b];  | 
87  | 0  |     }  | 
88  |  |  | // isCompressiblePrimary: applies isCompressibleLeadByte to the top byte (p >> 24) of primary weight p.
89  | 0  |     inline UBool isCompressiblePrimary(uint32_t p) const { | 
90  | 0  |         return isCompressibleLeadByte(p >> 24);  | 
91  | 0  |     }  | 
92  |  |  | 
93  |  |     /**  | 
94  |  |      * Returns the CE32 stored in two consecutive units of the contexts array:  | 
    |  |      * p[0] is shifted into the upper 16 bits and OR-ed with p[1].  | 
95  |  |      * Access to the defaultCE32 for contraction and prefix matching.  | 
96  |  |      */  | 
97  | 0  |     static uint32_t readCE32(const UChar *p) { | 
98  | 0  |         return ((uint32_t)p[0] << 16) | p[1];  | 
99  | 0  |     }  | 
100  |  |  | 
101  |  |     /**  | 
102  |  |      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).  | 
103  |  |      * Requires that ce32 is special.  | 
104  |  |      */  | 
105  |  |     uint32_t getIndirectCE32(uint32_t ce32) const;  | 
106  |  |     /**  | 
107  |  |      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),  | 
108  |  |      * if ce32 is special.  | 
109  |  |      */  | 
110  |  |     uint32_t getFinalCE32(uint32_t ce32) const;  | 
111  |  |  | 
112  |  |     /**  | 
113  |  |      * Computes a CE from c's ce32 which has the OFFSET_TAG.  | 
    |  |      * Reads the data CE from ces[] at the index encoded in ce32  | 
    |  |      * (Collation::indexFromCE32), then makes a CE from the primary that  | 
    |  |      * Collation::getThreeBytePrimaryForOffsetData() derives from c and that data CE.  | 
114  |  |      */  | 
115  | 0  |     int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const { | 
116  | 0  |         int64_t dataCE = ces[Collation::indexFromCE32(ce32)];  | 
117  | 0  |         return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));  | 
118  | 0  |     }  | 
119  |  |  | 
120  |  |     /**  | 
121  |  |      * Returns the single CE that c maps to.  | 
122  |  |      * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.  | 
123  |  |      */  | 
124  |  |     int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;  | 
125  |  |  | 
126  |  |     /**  | 
127  |  |      * Returns the FCD16 value for code point c. c must be >= 0.  | 
    |  |      * Delegates to nfcImpl.getFCD16(c).  | 
128  |  |      */  | 
129  | 0  |     uint16_t getFCD16(UChar32 c) const { | 
130  | 0  |         return nfcImpl.getFCD16(c);  | 
131  | 0  |     }  | 
132  |  |  | 
133  |  |     /**  | 
134  |  |      * Returns the first primary for the script's reordering group.  | 
135  |  |      * @return the primary with only the first primary lead byte of the group  | 
136  |  |      *         (not necessarily an actual root collator primary weight),  | 
137  |  |      *         or 0 if the script is unknown  | 
138  |  |      */  | 
139  |  |     uint32_t getFirstPrimaryForGroup(int32_t script) const;  | 
140  |  |  | 
141  |  |     /**  | 
142  |  |      * Returns the last primary for the script's reordering group.  | 
143  |  |      * @return the last primary of the group  | 
144  |  |      *         (not an actual root collator primary weight),  | 
145  |  |      *         or 0 if the script is unknown  | 
146  |  |      */  | 
147  |  |     uint32_t getLastPrimaryForGroup(int32_t script) const;  | 
148  |  |  | 
149  |  |     /**  | 
150  |  |      * Finds the reordering group which contains the primary weight.  | 
151  |  |      * @return the first script of the group, or -1 if the weight is beyond the last group  | 
152  |  |      */  | 
153  |  |     int32_t getGroupForPrimary(uint32_t p) const;  | 
154  |  |  | // NOTE(review): getEquivalentScripts has no documentation in this header; presumably it writes into dest the scripts that share script's range and returns their count (confirm in the implementation).
155  |  |     int32_t getEquivalentScripts(int32_t script,  | 
156  |  |                                  int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;  | 
157  |  |  | 
158  |  |     /**  | 
159  |  |      * Writes the permutation of primary-weight ranges  | 
160  |  |      * for the given reordering of scripts and groups.  | 
161  |  |      * The caller checks for illegal arguments and  | 
162  |  |      * takes care of [DEFAULT] and memory allocation.  | 
163  |  |      *  | 
164  |  |      * Each list element will be a (limit, offset) pair as described  | 
165  |  |      * for the CollationSettings::reorderRanges.  | 
166  |  |      * The list will be empty if no ranges are reordered.  | 
167  |  |      */  | 
168  |  |     void makeReorderRanges(const int32_t *reorder, int32_t length,  | 
169  |  |                            UVector32 &ranges, UErrorCode &errorCode) const;  | 
170  |  |  | 
171  |  |     /** @see jamoCE32s */  | 
172  |  |     static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;  | // = 67
173  |  |  | 
174  |  |     /** Main lookup trie. */  | 
175  |  |     const UTrie2 *trie;  | 
176  |  |     /**  | 
177  |  |      * Array of CE32 values.  | 
178  |  |      * At index 0 there must be CE32(U+0000)  | 
179  |  |      * to support U+0000's special-tag for NUL-termination handling.  | 
180  |  |      */  | 
181  |  |     const uint32_t *ce32s;  | 
182  |  |     /** Array of CE values for expansions and OFFSET_TAG. */  | 
183  |  |     const int64_t *ces;  | 
184  |  |     /** Array of prefix and contraction-suffix matching data. */  | 
185  |  |     const UChar *contexts;  | 
186  |  |     /** Base collation data, or NULL if this data itself is a base. */  | 
187  |  |     const CollationData *base;  | 
188  |  |     /**  | 
189  |  |      * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.  | 
190  |  |      * They are normally simple CE32s, rarely expansions.  | 
191  |  |      * For fast handling of HANGUL_TAG.  | 
192  |  |      */  | 
193  |  |     const uint32_t *jamoCE32s;  | 
194  |  |     const Normalizer2Impl &nfcImpl;  | // bound by the constructor; used by getFCD16()
195  |  |     /** The single-byte primary weight (xx000000) for numeric collation. */  | 
196  |  |     uint32_t numericPrimary;  | 
197  |  |  | // Element counts of the ce32s, ces and contexts arrays, judging by the names (NOTE(review): not stated in this header).
198  |  |     int32_t ce32sLength;  | 
199  |  |     int32_t cesLength;  | 
200  |  |     int32_t contextsLength;  | 
201  |  |  | 
202  |  |     /** 256 flags for which primary-weight lead bytes are compressible. */  | 
203  |  |     const UBool *compressibleBytes;  | 
204  |  |     /**  | 
205  |  |      * Set of code points that are unsafe for starting string comparison after an identical prefix,  | 
206  |  |      * or in backwards CE iteration.  | 
207  |  |      */  | 
208  |  |     const UnicodeSet *unsafeBackwardSet;  | 
209  |  |  | 
210  |  |     /**  | 
211  |  |      * Fast Latin table for common-Latin-text string comparisons.  | 
212  |  |      * Data structure see class CollationFastLatin.  | 
213  |  |      */  | 
214  |  |     const uint16_t *fastLatinTable;  | 
215  |  |     int32_t fastLatinTableLength;  | 
216  |  |  | 
217  |  |     /**  | 
218  |  |      * Data for scripts and reordering groups.  | 
219  |  |      * Uses include building a reordering permutation table and  | 
220  |  |      * providing script boundaries to AlphabeticIndex.  | 
221  |  |      */  | 
222  |  |     int32_t numScripts;  | 
223  |  |     /**  | 
224  |  |      * The length of scriptsIndex is numScripts+16.  | 
225  |  |      * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.  | 
226  |  |      * 16 special reorder codes (not all used) are mapped starting at numScripts.  | 
227  |  |      * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.  | 
228  |  |      * There are special codes at the end for reorder-reserved primary ranges.  | 
229  |  |      *  | 
230  |  |      * Multiple scripts may share a range and index, for example Hira & Kana.  | 
231  |  |      */  | 
232  |  |     const uint16_t *scriptsIndex;  | 
233  |  |     /**  | 
234  |  |      * Start primary weight (top 16 bits only) for a group/script/reserved range  | 
235  |  |      * indexed by scriptsIndex.  | 
236  |  |      * The first range (separators & terminators) and the last range (trailing weights)  | 
237  |  |      * are not reorderable, and no scriptsIndex entry points to them.  | 
238  |  |      */  | 
239  |  |     const uint16_t *scriptStarts;  | 
240  |  |     int32_t scriptStartsLength;  | 
241  |  |  | 
242  |  |     /**  | 
243  |  |      * Collation elements in the root collator.  | 
244  |  |      * Used by the CollationRootElements class. The data structure is described there.  | 
245  |  |      * NULL in a tailoring.  | 
246  |  |      */  | 
247  |  |     const uint32_t *rootElements;  | 
248  |  |     int32_t rootElementsLength;  | 
249  |  |  | 
250  |  | private:  | // Internal helpers, declared only; their bodies are not in this header. The makeReorderRanges overload adds a latinMustMove flag whose meaning is not documented here.
251  |  |     int32_t getScriptIndex(int32_t script) const;  | 
252  |  |     void makeReorderRanges(const int32_t *reorder, int32_t length,  | 
253  |  |                            UBool latinMustMove,  | 
254  |  |                            UVector32 &ranges, UErrorCode &errorCode) const;  | 
255  |  |     int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;  | 
256  |  |     int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;  | 
257  |  | };  | 
258  |  |  | 
259  |  | U_NAMESPACE_END  | 
260  |  |  | 
261  |  | #endif  // !UCONFIG_NO_COLLATION  | 
262  |  | #endif  // __COLLATIONDATA_H__  |