/src/icu/source/common/normalizer2impl.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 2009-2014, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | *******************************************************************************  | 
10  |  | *   file name:  normalizer2impl.h  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:4  | 
14  |  | *  | 
15  |  | *   created on: 2009nov22  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | */  | 
18  |  |  | 
19  |  | #ifndef __NORMALIZER2IMPL_H__  | 
20  |  | #define __NORMALIZER2IMPL_H__  | 
21  |  |  | 
22  |  | #include "unicode/utypes.h"  | 
23  |  |  | 
24  |  | #if !UCONFIG_NO_NORMALIZATION  | 
25  |  |  | 
26  |  | #include "unicode/normalizer2.h"  | 
27  |  | #include "unicode/ucptrie.h"  | 
28  |  | #include "unicode/unistr.h"  | 
29  |  | #include "unicode/unorm.h"  | 
30  |  | #include "unicode/utf.h"  | 
31  |  | #include "unicode/utf16.h"  | 
32  |  | #include "mutex.h"  | 
33  |  | #include "udataswp.h"  | 
34  |  | #include "uset_imp.h"  | 
35  |  |  | 
36  |  | // When the nfc.nrm data is *not* hardcoded into the common library  | 
37  |  | // (with this constant set to 0),  | 
38  |  | // then it needs to be built into the data package:  | 
39  |  | // Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT  | 
40  |  | #define NORM2_HARDCODE_NFC_DATA 1  | 
41  |  |  | 
42  |  | U_NAMESPACE_BEGIN  | 
43  |  |  | 
44  |  | struct CanonIterData;  | 
45  |  |  | 
46  |  | class ByteSink;  | 
47  |  | class Edits;  | 
48  |  | class InitCanonIterData;  | 
49  |  | class LcccContext;  | 
50  |  |  | 
51  |  | class U_COMMON_API Hangul { | 
52  |  | public:  | 
53  |  |     /* Korean Hangul and Jamo constants */  | 
54  |  |     enum { | 
55  |  |         JAMO_L_BASE=0x1100,     /* "lead" jamo */  | 
56  |  |         JAMO_L_END=0x1112,  | 
57  |  |         JAMO_V_BASE=0x1161,     /* "vowel" jamo */  | 
58  |  |         JAMO_V_END=0x1175,  | 
59  |  |         JAMO_T_BASE=0x11a7,     /* "trail" jamo */  | 
60  |  |         JAMO_T_END=0x11c2,  | 
61  |  |  | 
62  |  |         HANGUL_BASE=0xac00,  | 
63  |  |         HANGUL_END=0xd7a3,  | 
64  |  |  | 
65  |  |         JAMO_L_COUNT=19,  | 
66  |  |         JAMO_V_COUNT=21,  | 
67  |  |         JAMO_T_COUNT=28,  | 
68  |  |  | 
69  |  |         JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT,  | 
70  |  |  | 
71  |  |         HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,  | 
72  |  |         HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT  | 
73  |  |     };  | 
74  |  |  | 
75  | 0  |     static inline UBool isHangul(UChar32 c) { | 
76  | 0  |         return HANGUL_BASE<=c && c<HANGUL_LIMIT;  | 
77  | 0  |     }  | 
78  |  |     static inline UBool  | 
79  | 0  |     isHangulLV(UChar32 c) { | 
80  | 0  |         c-=HANGUL_BASE;  | 
81  | 0  |         return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;  | 
82  | 0  |     }  | 
83  | 0  |     static inline UBool isJamoL(UChar32 c) { | 
84  | 0  |         return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;  | 
85  | 0  |     }  | 
86  | 0  |     static inline UBool isJamoV(UChar32 c) { | 
87  | 0  |         return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;  | 
88  | 0  |     }  | 
89  | 0  |     static inline UBool isJamoT(UChar32 c) { | 
90  | 0  |         int32_t t=c-JAMO_T_BASE;  | 
91  | 0  |         return 0<t && t<JAMO_T_COUNT;  // not JAMO_T_BASE itself  | 
92  | 0  |     }  | 
93  | 0  |     static UBool isJamo(UChar32 c) { | 
94  | 0  |         return JAMO_L_BASE<=c && c<=JAMO_T_END &&  | 
95  | 0  |             (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c);  | 
96  | 0  |     }  | 
97  |  |  | 
98  |  |     /**  | 
99  |  |      * Decomposes c, which must be a Hangul syllable, into buffer  | 
100  |  |      * and returns the length of the decomposition (2 or 3).  | 
101  |  |      */  | 
102  | 0  |     static inline int32_t decompose(UChar32 c, UChar buffer[3]) { | 
103  | 0  |         c-=HANGUL_BASE;  | 
104  | 0  |         UChar32 c2=c%JAMO_T_COUNT;  | 
105  | 0  |         c/=JAMO_T_COUNT;  | 
106  | 0  |         buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);  | 
107  | 0  |         buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);  | 
108  | 0  |         if(c2==0) { | 
109  | 0  |             return 2;  | 
110  | 0  |         } else { | 
111  | 0  |             buffer[2]=(UChar)(JAMO_T_BASE+c2);  | 
112  | 0  |             return 3;  | 
113  | 0  |         }  | 
114  | 0  |     }  | 
115  |  |  | 
116  |  |     /**  | 
117  |  |      * Decomposes c, which must be a Hangul syllable, into buffer.  | 
118  |  |      * This is the raw, not recursive, decomposition. Its length is always 2.  | 
119  |  |      */  | 
120  | 0  |     static inline void getRawDecomposition(UChar32 c, UChar buffer[2]) { | 
121  | 0  |         UChar32 orig=c;  | 
122  | 0  |         c-=HANGUL_BASE;  | 
123  | 0  |         UChar32 c2=c%JAMO_T_COUNT;  | 
124  | 0  |         if(c2==0) { | 
125  | 0  |             c/=JAMO_T_COUNT;  | 
126  | 0  |             buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);  | 
127  | 0  |             buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);  | 
128  | 0  |         } else { | 
129  | 0  |             buffer[0]=(UChar)(orig-c2);  // LV syllable  | 
130  | 0  |             buffer[1]=(UChar)(JAMO_T_BASE+c2);  | 
131  | 0  |         }  | 
132  | 0  |     }  | 
133  |  | private:  | 
134  |  |     Hangul();  // no instantiation  | 
135  |  | };  | 
136  |  |  | 
137  |  | class Normalizer2Impl;  | 
138  |  |  | 
139  |  | class U_COMMON_API ReorderingBuffer : public UMemory { | 
140  |  | public:  | 
141  |  |     /** Constructs only; init() should be called. */  | 
142  |  |     ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :  | 
143  | 0  |         impl(ni), str(dest),  | 
144  |  |         start(NULL), reorderStart(NULL), limit(NULL),  | 
145  | 0  |         remainingCapacity(0), lastCC(0) {} | 
146  |  |     /** Constructs, removes the string contents, and initializes for a small initial capacity. */  | 
147  |  |     ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode);  | 
148  | 0  |     ~ReorderingBuffer() { | 
149  | 0  |         if(start!=NULL) { | 
150  | 0  |             str.releaseBuffer((int32_t)(limit-start));  | 
151  | 0  |         }  | 
152  | 0  |     }  | 
153  |  |     UBool init(int32_t destCapacity, UErrorCode &errorCode);  | 
154  |  |  | 
155  | 0  |     UBool isEmpty() const { return start==limit; } | 
156  | 0  |     int32_t length() const { return (int32_t)(limit-start); } | 
157  | 0  |     UChar *getStart() { return start; } | 
158  | 0  |     UChar *getLimit() { return limit; } | 
159  | 0  |     uint8_t getLastCC() const { return lastCC; } | 
160  |  |  | 
161  |  |     UBool equals(const UChar *start, const UChar *limit) const;  | 
162  |  |     UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const;  | 
163  |  |  | 
164  | 0  |     UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) { | 
165  | 0  |         return (c<=0xffff) ?  | 
166  | 0  |             appendBMP((UChar)c, cc, errorCode) :  | 
167  | 0  |             appendSupplementary(c, cc, errorCode);  | 
168  | 0  |     }  | 
169  |  |     UBool append(const UChar *s, int32_t length, UBool isNFD,  | 
170  |  |                  uint8_t leadCC, uint8_t trailCC,  | 
171  |  |                  UErrorCode &errorCode);  | 
172  | 0  |     UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) { | 
173  | 0  |         if(remainingCapacity==0 && !resize(1, errorCode)) { | 
174  | 0  |             return false;  | 
175  | 0  |         }  | 
176  | 0  |         if(lastCC<=cc || cc==0) { | 
177  | 0  |             *limit++=c;  | 
178  | 0  |             lastCC=cc;  | 
179  | 0  |             if(cc<=1) { | 
180  | 0  |                 reorderStart=limit;  | 
181  | 0  |             }  | 
182  | 0  |         } else { | 
183  | 0  |             insert(c, cc);  | 
184  | 0  |         }  | 
185  | 0  |         --remainingCapacity;  | 
186  | 0  |         return true;  | 
187  | 0  |     }  | 
188  |  |     UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);  | 
189  |  |     UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode);  | 
190  |  |     void remove();  | 
191  |  |     void removeSuffix(int32_t suffixLength);  | 
192  | 0  |     void setReorderingLimit(UChar *newLimit) { | 
193  | 0  |         remainingCapacity+=(int32_t)(limit-newLimit);  | 
194  | 0  |         reorderStart=limit=newLimit;  | 
195  | 0  |         lastCC=0;  | 
196  | 0  |     }  | 
197  | 0  |     void copyReorderableSuffixTo(UnicodeString &s) const { | 
198  | 0  |         s.setTo(ConstChar16Ptr(reorderStart), (int32_t)(limit-reorderStart));  | 
199  | 0  |     }  | 
200  |  | private:  | 
201  |  |     /*  | 
202  |  |      * TODO: Revisit whether it makes sense to track reorderStart.  | 
203  |  |      * It is set to after the last known character with cc<=1,  | 
204  |  |      * which stops previousCC() before it reads that character and looks up its cc.  | 
205  |  |      * previousCC() is normally only called from insert().  | 
206  |  |      * In other words, reorderStart speeds up the insertion of a combining mark  | 
207  |  |      * into a multi-combining mark sequence where it does not belong at the end.  | 
208  |  |      * This might not be worth the trouble.  | 
209  |  |      * On the other hand, it's not a huge amount of trouble.  | 
210  |  |      *  | 
211  |  |      * We probably need it for UNORM_SIMPLE_APPEND.  | 
212  |  |      */  | 
213  |  |  | 
214  |  |     UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);  | 
215  |  |     void insert(UChar32 c, uint8_t cc);  | 
216  | 0  |     static void writeCodePoint(UChar *p, UChar32 c) { | 
217  | 0  |         if(c<=0xffff) { | 
218  | 0  |             *p=(UChar)c;  | 
219  | 0  |         } else { | 
220  | 0  |             p[0]=U16_LEAD(c);  | 
221  | 0  |             p[1]=U16_TRAIL(c);  | 
222  | 0  |         }  | 
223  | 0  |     }  | 
224  |  |     UBool resize(int32_t appendLength, UErrorCode &errorCode);  | 
225  |  |  | 
226  |  |     const Normalizer2Impl &impl;  | 
227  |  |     UnicodeString &str;  | 
228  |  |     UChar *start, *reorderStart, *limit;  | 
229  |  |     int32_t remainingCapacity;  | 
230  |  |     uint8_t lastCC;  | 
231  |  |  | 
232  |  |     // private backward iterator  | 
233  | 0  |     void setIterator() { codePointStart=limit; } | 
234  |  |     void skipPrevious();  // Requires start<codePointStart.  | 
235  |  |     uint8_t previousCC();  // Returns 0 if there is no previous character.  | 
236  |  |  | 
237  |  |     UChar *codePointStart, *codePointLimit;  | 
238  |  | };  | 
239  |  |  | 
240  |  | /**  | 
241  |  |  * Low-level implementation of the Unicode Normalization Algorithm.  | 
242  |  |  * For the data structure and details see the documentation at the end of  | 
243  |  |  * this normalizer2impl.h and in the design doc at  | 
244  |  |  * http://site.icu-project.org/design/normalization/custom  | 
245  |  |  */  | 
246  |  | class U_COMMON_API Normalizer2Impl : public UObject { | 
247  |  | public:  | 
248  | 0  |     Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) { } | 
249  |  |     virtual ~Normalizer2Impl();  | 
250  |  |  | 
251  |  |     void init(const int32_t *inIndexes, const UCPTrie *inTrie,  | 
252  |  |               const uint16_t *inExtraData, const uint8_t *inSmallFCD);  | 
253  |  |  | 
254  |  |     void addLcccChars(UnicodeSet &set) const;  | 
255  |  |     void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;  | 
256  |  |     void addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;  | 
257  |  |  | 
258  |  |     // low-level properties ------------------------------------------------ ***  | 
259  |  |  | 
260  |  |     UBool ensureCanonIterData(UErrorCode &errorCode) const;  | 
261  |  |  | 
262  |  |     // The trie stores values for lead surrogate code *units*.  | 
263  |  |     // Surrogate code *points* are inert.  | 
264  | 0  |     uint16_t getNorm16(UChar32 c) const { | 
265  | 0  |         return U_IS_LEAD(c) ?  | 
266  | 0  |             static_cast<uint16_t>(INERT) :  | 
267  | 0  |             UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c);  | 
268  | 0  |     }  | 
269  | 0  |     uint16_t getRawNorm16(UChar32 c) const { return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); } | 
270  |  |  | 
271  | 0  |     UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { | 
272  | 0  |         if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { | 
273  | 0  |             return UNORM_YES;  | 
274  | 0  |         } else if(minMaybeYes<=norm16) { | 
275  | 0  |             return UNORM_MAYBE;  | 
276  | 0  |         } else { | 
277  | 0  |             return UNORM_NO;  | 
278  | 0  |         }  | 
279  | 0  |     }  | 
280  | 0  |     UBool isAlgorithmicNoNo(uint16_t norm16) const { return limitNoNo<=norm16 && norm16<minMaybeYes; } | 
281  | 0  |     UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; } | 
282  | 0  |     UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; } | 
283  |  |  | 
284  | 0  |     uint8_t getCC(uint16_t norm16) const { | 
285  | 0  |         if(norm16>=MIN_NORMAL_MAYBE_YES) { | 
286  | 0  |             return getCCFromNormalYesOrMaybe(norm16);  | 
287  | 0  |         }  | 
288  | 0  |         if(norm16<minNoNo || limitNoNo<=norm16) { | 
289  | 0  |             return 0;  | 
290  | 0  |         }  | 
291  | 0  |         return getCCFromNoNo(norm16);  | 
292  | 0  |     }  | 
293  | 0  |     static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16) { | 
294  | 0  |         return (uint8_t)(norm16 >> OFFSET_SHIFT);  | 
295  | 0  |     }  | 
296  | 0  |     static uint8_t getCCFromYesOrMaybe(uint16_t norm16) { | 
297  | 0  |         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;  | 
298  | 0  |     }  | 
299  | 0  |     uint8_t getCCFromYesOrMaybeCP(UChar32 c) const { | 
300  | 0  |         if (c < minCompNoMaybeCP) { return 0; } | 
301  | 0  |         return getCCFromYesOrMaybe(getNorm16(c));  | 
302  | 0  |     }  | 
303  |  |  | 
304  |  |     /**  | 
305  |  |      * Returns the FCD data for code point c.  | 
306  |  |      * @param c A Unicode code point.  | 
307  |  |      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.  | 
308  |  |      */  | 
309  | 0  |     uint16_t getFCD16(UChar32 c) const { | 
310  | 0  |         if(c<minDecompNoCP) { | 
311  | 0  |             return 0;  | 
312  | 0  |         } else if(c<=0xffff) { | 
313  | 0  |             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } | 
314  | 0  |         }  | 
315  | 0  |         return getFCD16FromNormData(c);  | 
316  | 0  |     }  | 
317  |  |     /**  | 
318  |  |      * Returns the FCD data for the next code point (post-increment).  | 
319  |  |      * Might skip only a lead surrogate rather than the whole surrogate pair if none of  | 
320  |  |      * the supplementary code points associated with the lead surrogate have non-zero FCD data.  | 
321  |  |      * @param s A valid pointer into a string. Requires s!=limit.  | 
322  |  |      * @param limit The end of the string, or NULL.  | 
323  |  |      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.  | 
324  |  |      */  | 
325  | 0  |     uint16_t nextFCD16(const UChar *&s, const UChar *limit) const { | 
326  | 0  |         UChar32 c=*s++;  | 
327  | 0  |         if(c<minDecompNoCP || !singleLeadMightHaveNonZeroFCD16(c)) { | 
328  | 0  |             return 0;  | 
329  | 0  |         }  | 
330  | 0  |         UChar c2;  | 
331  | 0  |         if(U16_IS_LEAD(c) && s!=limit && U16_IS_TRAIL(c2=*s)) { | 
332  | 0  |             c=U16_GET_SUPPLEMENTARY(c, c2);  | 
333  | 0  |             ++s;  | 
334  | 0  |         }  | 
335  | 0  |         return getFCD16FromNormData(c);  | 
336  | 0  |     }  | 
337  |  |     /**  | 
338  |  |      * Returns the FCD data for the previous code point (pre-decrement).  | 
339  |  |      * @param start The start of the string.  | 
340  |  |      * @param s A valid pointer into a string. Requires start<s.  | 
341  |  |      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.  | 
342  |  |      */  | 
343  | 0  |     uint16_t previousFCD16(const UChar *start, const UChar *&s) const { | 
344  | 0  |         UChar32 c=*--s;  | 
345  | 0  |         if(c<minDecompNoCP) { | 
346  | 0  |             return 0;  | 
347  | 0  |         }  | 
348  | 0  |         if(!U16_IS_TRAIL(c)) { | 
349  | 0  |             if(!singleLeadMightHaveNonZeroFCD16(c)) { | 
350  | 0  |                 return 0;  | 
351  | 0  |             }  | 
352  | 0  |         } else { | 
353  | 0  |             UChar c2;  | 
354  | 0  |             if(start<s && U16_IS_LEAD(c2=*(s-1))) { | 
355  | 0  |                 c=U16_GET_SUPPLEMENTARY(c2, c);  | 
356  | 0  |                 --s;  | 
357  | 0  |             }  | 
358  | 0  |         }  | 
359  | 0  |         return getFCD16FromNormData(c);  | 
360  | 0  |     }  | 
361  |  |  | 
362  |  |     /** Returns true if the single-or-lead code unit `lead` might have non-zero FCD data. */  | 
363  | 0  |     UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const { | 
364  |  |         // 0<=lead<=0xffff  | 
365  | 0  |         uint8_t bits=smallFCD[lead>>8];  | 
366  | 0  |         if(bits==0) { return false; } | 
367  | 0  |         return (UBool)((bits>>((lead>>5)&7))&1);  | 
368  | 0  |     }  | 
369  |  |     /** Returns the FCD value from the regular normalization data. */  | 
370  |  |     uint16_t getFCD16FromNormData(UChar32 c) const;  | 
371  |  |  | 
372  |  |     /**  | 
373  |  |      * Gets the decomposition for one code point.  | 
374  |  |      * @param c code point  | 
375  |  |      * @param buffer out-only buffer for algorithmic decompositions  | 
376  |  |      * @param length out-only, takes the length of the decomposition, if any  | 
377  |  |      * @return pointer to the decomposition, or NULL if none  | 
378  |  |      */  | 
379  |  |     const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const;  | 
380  |  |  | 
381  |  |     /**  | 
382  |  |      * Gets the raw decomposition for one code point.  | 
383  |  |      * @param c code point  | 
384  |  |      * @param buffer out-only buffer for algorithmic decompositions  | 
385  |  |      * @param length out-only, takes the length of the decomposition, if any  | 
386  |  |      * @return pointer to the decomposition, or NULL if none  | 
387  |  |      */  | 
388  |  |     const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const;  | 
389  |  |  | 
390  |  |     UChar32 composePair(UChar32 a, UChar32 b) const;  | 
391  |  |  | 
392  |  |     UBool isCanonSegmentStarter(UChar32 c) const;  | 
393  |  |     UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;  | 
394  |  |  | 
395  |  |     enum { | 
396  |  |         // Fixed norm16 values.  | 
397  |  |         MIN_YES_YES_WITH_CC=0xfe02,  | 
398  |  |         JAMO_VT=0xfe00,  | 
399  |  |         MIN_NORMAL_MAYBE_YES=0xfc00,  | 
400  |  |         JAMO_L=2,  // offset=1 hasCompBoundaryAfter=false  | 
401  |  |         INERT=1,  // offset=0 hasCompBoundaryAfter=true  | 
402  |  |  | 
403  |  |         // norm16 bit 0 is comp-boundary-after.  | 
404  |  |         HAS_COMP_BOUNDARY_AFTER=1,  | 
405  |  |         OFFSET_SHIFT=1,  | 
406  |  |  | 
407  |  |         // For algorithmic one-way mappings, norm16 bits 2..1 indicate the  | 
408  |  |         // tccc (0, 1, >1) for quick FCC boundary-after tests.  | 
409  |  |         DELTA_TCCC_0=0,  | 
410  |  |         DELTA_TCCC_1=2,  | 
411  |  |         DELTA_TCCC_GT_1=4,  | 
412  |  |         DELTA_TCCC_MASK=6,  | 
413  |  |         DELTA_SHIFT=3,  | 
414  |  |  | 
415  |  |         MAX_DELTA=0x40  | 
416  |  |     };  | 
417  |  |  | 
418  |  |     enum { | 
419  |  |         // Byte offsets from the start of the data, after the generic header.  | 
420  |  |         IX_NORM_TRIE_OFFSET,  | 
421  |  |         IX_EXTRA_DATA_OFFSET,  | 
422  |  |         IX_SMALL_FCD_OFFSET,  | 
423  |  |         IX_RESERVED3_OFFSET,  | 
424  |  |         IX_RESERVED4_OFFSET,  | 
425  |  |         IX_RESERVED5_OFFSET,  | 
426  |  |         IX_RESERVED6_OFFSET,  | 
427  |  |         IX_TOTAL_SIZE,  | 
428  |  |  | 
429  |  |         // Code point thresholds for quick check codes.  | 
430  |  |         IX_MIN_DECOMP_NO_CP,  | 
431  |  |         IX_MIN_COMP_NO_MAYBE_CP,  | 
432  |  |  | 
433  |  |         // Norm16 value thresholds for quick check combinations and types of extra data.  | 
434  |  |  | 
435  |  |         /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */  | 
436  |  |         IX_MIN_YES_NO,  | 
437  |  |         /** Mappings are comp-normalized. */  | 
438  |  |         IX_MIN_NO_NO,  | 
439  |  |         IX_LIMIT_NO_NO,  | 
440  |  |         IX_MIN_MAYBE_YES,  | 
441  |  |  | 
442  |  |         /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */  | 
443  |  |         IX_MIN_YES_NO_MAPPINGS_ONLY,  | 
444  |  |         /** Mappings are not comp-normalized but have a comp boundary before. */  | 
445  |  |         IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,  | 
446  |  |         /** Mappings do not have a comp boundary before. */  | 
447  |  |         IX_MIN_NO_NO_COMP_NO_MAYBE_CC,  | 
448  |  |         /** Mappings to the empty string. */  | 
449  |  |         IX_MIN_NO_NO_EMPTY,  | 
450  |  |  | 
451  |  |         IX_MIN_LCCC_CP,  | 
452  |  |         IX_RESERVED19,  | 
453  |  |         IX_COUNT  | 
454  |  |     };  | 
455  |  |  | 
456  |  |     enum { | 
457  |  |         MAPPING_HAS_CCC_LCCC_WORD=0x80,  | 
458  |  |         MAPPING_HAS_RAW_MAPPING=0x40,  | 
459  |  |         // unused bit 0x20,  | 
460  |  |         MAPPING_LENGTH_MASK=0x1f  | 
461  |  |     };  | 
462  |  |  | 
463  |  |     enum { | 
464  |  |         COMP_1_LAST_TUPLE=0x8000,  | 
465  |  |         COMP_1_TRIPLE=1,  | 
466  |  |         COMP_1_TRAIL_LIMIT=0x3400,  | 
467  |  |         COMP_1_TRAIL_MASK=0x7ffe,  | 
468  |  |         COMP_1_TRAIL_SHIFT=9,  // 10-1 for the "triple" bit  | 
469  |  |         COMP_2_TRAIL_SHIFT=6,  | 
470  |  |         COMP_2_TRAIL_MASK=0xffc0  | 
471  |  |     };  | 
472  |  |  | 
473  |  |     // higher-level functionality ------------------------------------------ ***  | 
474  |  |  | 
475  |  |     // NFD without an NFD Normalizer2 instance.  | 
476  |  |     UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest,  | 
477  |  |                              UErrorCode &errorCode) const;  | 
478  |  |     /**  | 
479  |  |      * Decomposes [src, limit[ and writes the result to dest.  | 
480  |  |      * limit can be NULL if src is NUL-terminated.  | 
481  |  |      * destLengthEstimate is the initial dest buffer capacity and can be -1.  | 
482  |  |      */  | 
483  |  |     void decompose(const UChar *src, const UChar *limit,  | 
484  |  |                    UnicodeString &dest, int32_t destLengthEstimate,  | 
485  |  |                    UErrorCode &errorCode) const;  | 
486  |  |  | 
487  |  |     const UChar *decompose(const UChar *src, const UChar *limit,  | 
488  |  |                            ReorderingBuffer *buffer, UErrorCode &errorCode) const;  | 
489  |  |     void decomposeAndAppend(const UChar *src, const UChar *limit,  | 
490  |  |                             UBool doDecompose,  | 
491  |  |                             UnicodeString &safeMiddle,  | 
492  |  |                             ReorderingBuffer &buffer,  | 
493  |  |                             UErrorCode &errorCode) const;  | 
494  |  |  | 
495  |  |     /** sink==nullptr: isNormalized()/spanQuickCheckYes() */  | 
496  |  |     const uint8_t *decomposeUTF8(uint32_t options,  | 
497  |  |                                  const uint8_t *src, const uint8_t *limit,  | 
498  |  |                                  ByteSink *sink, Edits *edits, UErrorCode &errorCode) const;  | 
499  |  |  | 
500  |  |     UBool compose(const UChar *src, const UChar *limit,  | 
501  |  |                   UBool onlyContiguous,  | 
502  |  |                   UBool doCompose,  | 
503  |  |                   ReorderingBuffer &buffer,  | 
504  |  |                   UErrorCode &errorCode) const;  | 
505  |  |     const UChar *composeQuickCheck(const UChar *src, const UChar *limit,  | 
506  |  |                                    UBool onlyContiguous,  | 
507  |  |                                    UNormalizationCheckResult *pQCResult) const;  | 
508  |  |     void composeAndAppend(const UChar *src, const UChar *limit,  | 
509  |  |                           UBool doCompose,  | 
510  |  |                           UBool onlyContiguous,  | 
511  |  |                           UnicodeString &safeMiddle,  | 
512  |  |                           ReorderingBuffer &buffer,  | 
513  |  |                           UErrorCode &errorCode) const;  | 
514  |  |  | 
515  |  |     /** sink==nullptr: isNormalized() */  | 
516  |  |     UBool composeUTF8(uint32_t options, UBool onlyContiguous,  | 
517  |  |                       const uint8_t *src, const uint8_t *limit,  | 
518  |  |                       ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const;  | 
519  |  |  | 
520  |  |     const UChar *makeFCD(const UChar *src, const UChar *limit,  | 
521  |  |                          ReorderingBuffer *buffer, UErrorCode &errorCode) const;  | 
522  |  |     void makeFCDAndAppend(const UChar *src, const UChar *limit,  | 
523  |  |                           UBool doMakeFCD,  | 
524  |  |                           UnicodeString &safeMiddle,  | 
525  |  |                           ReorderingBuffer &buffer,  | 
526  |  |                           UErrorCode &errorCode) const;  | 
527  |  |  | 
528  |  |     UBool hasDecompBoundaryBefore(UChar32 c) const;  | 
529  |  |     UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const;  | 
530  |  |     UBool hasDecompBoundaryAfter(UChar32 c) const;  | 
531  |  |     UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const;  | 
532  | 0  |     UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } | 
533  |  |  | 
534  | 0  |     UBool hasCompBoundaryBefore(UChar32 c) const { | 
535  | 0  |         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));  | 
536  | 0  |     }  | 
537  | 0  |     UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous) const { | 
538  | 0  |         return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);  | 
539  | 0  |     }  | 
540  | 0  |     UBool isCompInert(UChar32 c, UBool onlyContiguous) const { | 
541  | 0  |         uint16_t norm16=getNorm16(c);  | 
542  | 0  |         return isCompYesAndZeroCC(norm16) &&  | 
543  | 0  |             (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&  | 
544  | 0  |             (!onlyContiguous || isInert(norm16) || *getMapping(norm16) <= 0x1ff);  | 
545  | 0  |     }  | 
546  |  |  | 
547  | 0  |     UBool hasFCDBoundaryBefore(UChar32 c) const { return hasDecompBoundaryBefore(c); } | 
548  | 0  |     UBool hasFCDBoundaryAfter(UChar32 c) const { return hasDecompBoundaryAfter(c); } | 
549  | 0  |     UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; } | 
550  |  | private:  | 
551  |  |     friend class InitCanonIterData;  | 
552  |  |     friend class LcccContext;  | 
553  |  |  | 
554  | 0  |     UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } | 
555  | 0  |     UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; } | 
556  | 0  |     static UBool isInert(uint16_t norm16) { return norm16==INERT; } | 
557  | 0  |     static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; } | 
558  | 0  |     static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } | 
559  | 0  |     uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } | 
560  | 0  |     UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; } | 
561  | 0  |     UBool isHangulLVT(uint16_t norm16) const { | 
562  | 0  |         return norm16==hangulLVT();  | 
563  | 0  |     }  | 
564  | 0  |     UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; } | 
565  |  |     // UBool isCompYes(uint16_t norm16) const { | 
566  |  |     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;  | 
567  |  |     // }  | 
568  |  |     // UBool isCompYesOrMaybe(uint16_t norm16) const { | 
569  |  |     //     return norm16<minNoNo || minMaybeYes<=norm16;  | 
570  |  |     // }  | 
571  |  |     // UBool hasZeroCCFromDecompYes(uint16_t norm16) const { | 
572  |  |     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;  | 
573  |  |     // }  | 
574  | 0  |     UBool isDecompYesAndZeroCC(uint16_t norm16) const { | 
575  | 0  |         return norm16<minYesNo ||  | 
576  | 0  |                norm16==JAMO_VT ||  | 
577  | 0  |                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);  | 
578  | 0  |     }  | 
579  |  |     /**  | 
580  |  |      * A little faster and simpler than isDecompYesAndZeroCC() but does not include  | 
581  |  |      * the MaybeYes which combine-forward and have ccc=0.  | 
582  |  |      * (Standard Unicode 10 normalization does not have such characters.)  | 
583  |  |      */  | 
584  | 0  |     UBool isMostDecompYesAndZeroCC(uint16_t norm16) const { | 
585  | 0  |         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;  | 
586  | 0  |     }  | 
587  | 0  |     UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; } | 
588  |  |  | 
589  |  |     // For use with isCompYes().  | 
590  |  |     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.  | 
591  |  |     // static uint8_t getCCFromYes(uint16_t norm16) { | 
592  |  |     //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;  | 
593  |  |     // }  | 
594  | 0  |     uint8_t getCCFromNoNo(uint16_t norm16) const { | 
595  | 0  |         const uint16_t *mapping=getMapping(norm16);  | 
596  | 0  |         if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { | 
597  | 0  |             return (uint8_t)*(mapping-1);  | 
598  | 0  |         } else { | 
599  | 0  |             return 0;  | 
600  | 0  |         }  | 
601  | 0  |     }  | 
602  |  |     // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()  | 
603  | 0  |     uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const { | 
604  | 0  |         if(norm16<=minYesNo) { | 
605  | 0  |             return 0;  // yesYes and Hangul LV have ccc=tccc=0  | 
606  | 0  |         } else { | 
607  |  |             // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.  | 
608  | 0  |             return (uint8_t)(*getMapping(norm16)>>8);  // tccc from yesNo  | 
609  | 0  |         }  | 
610  | 0  |     }  | 
611  |  |     uint8_t getPreviousTrailCC(const UChar *start, const UChar *p) const;  | 
612  |  |     uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const;  | 
613  |  |  | 
614  |  |     // Requires algorithmic-NoNo.  | 
615  | 0  |     UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { | 
616  | 0  |         return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;  | 
617  | 0  |     }  | 
618  | 0  |     UChar32 getAlgorithmicDelta(uint16_t norm16) const { | 
619  | 0  |         return (norm16>>DELTA_SHIFT)-centerNoNoDelta;  | 
620  | 0  |     }  | 
621  |  |  | 
622  |  |     // Requires minYesNo<norm16<limitNoNo.  | 
623  | 0  |     const uint16_t *getMapping(uint16_t norm16) const { return extraData+(norm16>>OFFSET_SHIFT); } | 
624  | 0  |     const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const { | 
625  | 0  |         if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { | 
626  | 0  |             return NULL;  | 
627  | 0  |         } else if(norm16<minMaybeYes) { | 
628  | 0  |             return getMapping(norm16);  // for yesYes; if Jamo L: harmless empty list  | 
629  | 0  |         } else { | 
630  | 0  |             return maybeYesCompositions+norm16-minMaybeYes;  | 
631  | 0  |         }  | 
632  | 0  |     }  | 
633  | 0  |     const uint16_t *getCompositionsListForComposite(uint16_t norm16) const { | 
634  |  |         // A composite has both mapping & compositions list.  | 
635  | 0  |         const uint16_t *list=getMapping(norm16);  | 
636  | 0  |         return list+  // mapping pointer  | 
637  | 0  |             1+  // +1 to skip the first unit with the mapping length  | 
638  | 0  |             (*list&MAPPING_LENGTH_MASK);  // + mapping length  | 
639  | 0  |     }  | 
640  | 0  |     const uint16_t *getCompositionsListForMaybe(uint16_t norm16) const { | 
641  |  |         // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES  | 
642  | 0  |         return maybeYesCompositions+((norm16-minMaybeYes)>>OFFSET_SHIFT);  | 
643  | 0  |     }  | 
644  |  |     /**  | 
645  |  |      * @param c code point must have compositions  | 
646  |  |      * @return compositions list pointer  | 
647  |  |      */  | 
648  | 0  |     const uint16_t *getCompositionsList(uint16_t norm16) const { | 
649  | 0  |         return isDecompYes(norm16) ?  | 
650  | 0  |                 getCompositionsListForDecompYes(norm16) :  | 
651  | 0  |                 getCompositionsListForComposite(norm16);  | 
652  | 0  |     }  | 
653  |  |  | 
    // Takes a source pointer, a code point threshold, a ReorderingBuffer pointer
    // and an in/out UErrorCode; returns a pointer into UChar text.
    // NOTE(review): by name this copies the leading run of src that is below
    // minNeedDataCP from a NUL-terminated string and returns where it stopped;
    // the definition is not in this header -- confirm in the implementation file.
    const UChar *copyLowPrefixFromNulTerminated(const UChar *src,
                                                UChar32 minNeedDataCP,
                                                ReorderingBuffer *buffer,
                                                UErrorCode &errorCode) const;
658  |  |  | 
659  |  |     enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY }; | 
660  |  |  | 
    // UChar overload: works on [src, limit) and writes into buffer.
    // Note that this overload selects its stop condition with a UBool
    // (stopAtCompBoundary), while the uint8_t overload below takes a StopAt value.
    // NOTE(review): the returned pointer is presumably where processing stopped -- confirm.
    const UChar *decomposeShort(const UChar *src, const UChar *limit,
                                UBool stopAtCompBoundary, UBool onlyContiguous,
                                ReorderingBuffer &buffer, UErrorCode &errorCode) const;
    // Single code point variant: takes the code point and its norm16 value.
    // NOTE(review): the meaning of the UBool result is not visible here -- confirm.
    UBool decompose(UChar32 c, uint16_t norm16,
                    ReorderingBuffer &buffer, UErrorCode &errorCode) const;

    // uint8_t overload of decomposeShort(); the stop condition is a StopAt enum value.
    const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
                                  StopAt stopAt, UBool onlyContiguous,
                                  ReorderingBuffer &buffer, UErrorCode &errorCode) const;
670  |  |  | 
    // Composition helpers; the definitions are not in this header.
    // combine() is static and takes a compositions list plus a trail code point.
    // NOTE(review): presumably returns the encoded composite or a negative value
    // when the pair does not combine -- confirm in the implementation file.
    static int32_t combine(const uint16_t *list, UChar32 trail);
    // Takes a compositions list and a UnicodeSet to modify.
    void addComposites(const uint16_t *list, UnicodeSet &set) const;
    // Operates on buffer starting from recomposeStartIndex; onlyContiguous is a mode flag.
    void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
                   UBool onlyContiguous) const;
675  |  |  | 
676  | 0  |     UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { | 
677  | 0  |         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);  | 
678  | 0  |     }  | 
679  | 0  |     UBool norm16HasCompBoundaryBefore(uint16_t norm16) const  { | 
680  | 0  |         return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);  | 
681  | 0  |     }  | 
    // String-based overloads, each in a UChar and a uint8_t flavor.
    // The "before" tests take a [src, limit) range; the "after" tests take
    // a start pointer, a position p and the onlyContiguous flag.
    // NOTE(review): presumably these look up the norm16 of the character at src
    // (before) or preceding p (after) and apply the norm16-based tests -- confirm.
    UBool hasCompBoundaryBefore(const UChar *src, const UChar *limit) const;
    UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const;
    UBool hasCompBoundaryAfter(const UChar *start, const UChar *p,
                               UBool onlyContiguous) const;
    UBool hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
                               UBool onlyContiguous) const;
688  | 0  |     UBool norm16HasCompBoundaryAfter(uint16_t norm16, UBool onlyContiguous) const { | 
689  | 0  |         return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&  | 
690  | 0  |             (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));  | 
691  | 0  |     }  | 
692  |  |     /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */  | 
693  | 0  |     UBool isTrailCC01ForCompBoundaryAfter(uint16_t norm16) const { | 
694  | 0  |         return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?  | 
695  | 0  |             (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : *getMapping(norm16) <= 0x1ff);  | 
696  | 0  |     }  | 
697  |  |  | 
    // Boundary finders over UChar text. The "previous" variants take a start pointer
    // and a position p; the "next" variants take a position p and a limit.
    // NOTE(review): presumably they return the nearest boundary position in the
    // given direction; the definitions are not in this header -- confirm.
    const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p, UBool onlyContiguous) const;
    const UChar *findNextCompBoundary(const UChar *p, const UChar *limit, UBool onlyContiguous) const;

    const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
    const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;

    // Takes a code point range [start..end] (NOTE(review): inclusive end assumed -- confirm)
    // with its norm16 value, and a CanonIterData object to update.
    void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
                                     CanonIterData &newData, UErrorCode &errorCode) const;

    // Accessors related to the canonical-iterator data.
    // NOTE(review): the value layout presumably follows the CANON_* bit macros
    // defined after this class -- confirm in the implementation file.
    int32_t getCanonValue(UChar32 c) const;
    const UnicodeSet &getCanonStartSet(int32_t n) const;
709  |  |  | 
710  |  |     // UVersionInfo dataVersion;  | 
711  |  |  | 
    // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
    UChar minDecompNoCP;     // lowest code point with a decomposition mapping (NF*D_QC=No)
    UChar minCompNoMaybeCP;  // lowest code point with NF*C_QC=No or Maybe
    UChar minLcccCP;         // lowest code point with lccc!=0

    // Norm16 value thresholds for quick check combinations and types of extra data.
    // Listed in threshold order, except for centerNoNoDelta which is not a threshold.
    uint16_t minYesNo;
    uint16_t minYesNoMappingsOnly;
    uint16_t minNoNo;
    uint16_t minNoNoCompBoundaryBefore;
    uint16_t minNoNoCompNoMaybeCC;
    uint16_t minNoNoEmpty;
    uint16_t limitNoNo;
    uint16_t centerNoNoDelta;  // bias subtracted from norm16>>DELTA_SHIFT for algorithmic mappings
    uint16_t minMaybeYes;

    const UCPTrie *normTrie;  // maps each code point to its 16-bit norm16 value
    const uint16_t *maybeYesCompositions;  // indexed by (norm16-minMaybeYes)>>OFFSET_SHIFT
    const uint16_t *extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
    const uint8_t *smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0

    // NOTE(review): presumably the init-once guard protects lazy creation of
    // fCanonIterData; fCanonIterData has no in-class initializer here, so it must
    // be set elsewhere (e.g. in a constructor) -- confirm.
    UInitOnce       fCanonIterDataInitOnce = U_INITONCE_INITIALIZER;
    CanonIterData  *fCanonIterData;
735  |  | };  | 
736  |  |  | 
// bits in canonIterData
// Bit 31 and bit 30 are single-bit flags, bit 21 is a single-bit flag,
// and the mask covers the low 21 bits (bits 20..0).
// NOTE(review): presumably the masked value is either a code point or a set index,
// depending on CANON_HAS_SET -- confirm in the implementation file.
#define CANON_NOT_SEGMENT_STARTER 0x80000000
#define CANON_HAS_COMPOSITIONS 0x40000000
#define CANON_HAS_SET 0x200000
#define CANON_VALUE_MASK 0x1fffff
742  |  |  | 
/**
 * ICU-internal shortcut for quick access to standard Unicode normalization.
 * All members are static and the constructor is private, so this class
 * serves only as a namespace for the accessors below.
 * Each accessor that can fail reports through its UErrorCode parameter.
 */
class U_COMMON_API Normalizer2Factory {
public:
    // Normalizer2 instances for modes that have no dedicated public getter here.
    static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
    static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
    static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);

    // Selects an instance by UNormalizationMode value.
    static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);

    // Direct access to the Normalizer2Impl objects for the standard data files.
    static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
    static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
    static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);

    // Get the Impl instance of the Normalizer2.
    // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
    static const Normalizer2Impl *getImpl(const Normalizer2 *norm2);
private:
    Normalizer2Factory();  // No instantiation.
};
764  |  |  | 
765  |  | U_NAMESPACE_END  | 
766  |  |  | 
/**
 * C-linkage entry point taking a UDataSwapper, an input block of the given
 * length, an output block, and an in/out error code.
 * NOTE(review): presumably byte-swaps Normalizer2 .nrm data between platform
 * formats and returns the data size; the definition is not in this header -- confirm.
 */
U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper *ds,
            const void *inData, int32_t length, void *outData,
            UErrorCode *pErrorCode);
771  |  |  | 
/**
 * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
 * @param c the code point
 * @param mode selects which normalization form's quick check property is returned
 * @return the quick check result for c in the given mode
 * @internal
 */
U_CFUNC UNormalizationCheckResult
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
778  |  |  | 
/**
 * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
 * @param c the code point
 * @return the 16-bit FCD value for c
 * @internal
 */
U_CFUNC uint16_t
unorm_getFCD16(UChar32 c);
785  |  |  | 
786  |  | /**  | 
787  |  |  * Format of Normalizer2 .nrm data files.  | 
788  |  |  * Format version 4.0.  | 
789  |  |  *  | 
790  |  |  * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.  | 
791  |  |  * ICU ships with data files for standard Unicode Normalization Forms  | 
792  |  |  * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).  | 
793  |  |  * Custom (application-specific) data can be built into additional .nrm files  | 
794  |  |  * with the gennorm2 build tool.  | 
795  |  |  * ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.  | 
796  |  |  *  | 
797  |  |  * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been  | 
798  |  |  * cached already. Internally, Normalizer2Impl.load() reads the .nrm file.  | 
799  |  |  *  | 
800  |  |  * A .nrm file begins with a standard ICU data file header  | 
801  |  |  * (DataHeader, see ucmndata.h and unicode/udata.h).  | 
802  |  |  * The UDataInfo.dataVersion field usually contains the Unicode version  | 
803  |  |  * for which the data was generated.  | 
804  |  |  *  | 
805  |  |  * After the header, the file contains the following parts.  | 
806  |  |  * Constants are defined as enum values of the Normalizer2Impl class.  | 
807  |  |  *  | 
808  |  |  * Many details of the data structures are described in the design doc  | 
809  |  |  * which is at http://site.icu-project.org/design/normalization/custom  | 
810  |  |  *  | 
811  |  |  * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4;  | 
812  |  |  *  | 
813  |  |  *      The first eight indexes are byte offsets in ascending order.  | 
814  |  |  *      Each byte offset marks the start of the next part in the data file,  | 
815  |  |  *      and the end of the previous one.  | 
816  |  |  *      When two consecutive byte offsets are the same, then the corresponding part is empty.  | 
817  |  |  *      Byte offsets are offsets from after the header,  | 
818  |  |  *      that is, from the beginning of the indexes[].  | 
819  |  |  *      Each part starts at an offset with proper alignment for its data.  | 
820  |  |  *      If necessary, the previous part may include padding bytes to achieve this alignment.  | 
821  |  |  *  | 
822  |  |  *      minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point  | 
823  |  |  *      with a decomposition mapping, that is, with NF*D_QC=No.  | 
824  |  |  *      minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point  | 
825  |  |  *      with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).  | 
826  |  |  *      minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3)  | 
827  |  |  *      is the lowest code point with lccc!=0.  | 
828  |  |  *  | 
829  |  |  *      The next eight indexes are thresholds of 16-bit trie values for ranges of  | 
830  |  |  *      values indicating multiple normalization properties.  | 
831  |  |  *      They are listed here in threshold order, not in the order they are stored in the indexes.  | 
832  |  |  *          minYesNo=indexes[IX_MIN_YES_NO];  | 
833  |  |  *          minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];  | 
834  |  |  *          minNoNo=indexes[IX_MIN_NO_NO];  | 
835  |  |  *          minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];  | 
836  |  |  *          minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];  | 
837  |  |  *          minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];  | 
838  |  |  *          limitNoNo=indexes[IX_LIMIT_NO_NO];  | 
839  |  |  *          minMaybeYes=indexes[IX_MIN_MAYBE_YES];  | 
840  |  |  *      See the normTrie description below and the design doc for details.  | 
841  |  |  *  | 
842  |  |  * UCPTrie normTrie; -- see ucptrie_impl.h and ucptrie.h, same as Java CodePointTrie  | 
843  |  |  *  | 
844  |  |  *      The trie holds the main normalization data. Each code point is mapped to a 16-bit value.  | 
845  |  |  *      Rather than using independent bits in the value (which would require more than 16 bits),  | 
846  |  |  *      information is extracted primarily via range checks.  | 
847  |  |  *      Except, format version 3 uses bit 0 for hasCompBoundaryAfter().  | 
848  |  |  *      For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo  | 
849  |  |  *      means that the character has NF*C_QC=Yes and NF*D_QC=No properties,  | 
850  |  |  *      which means it has a two-way (round-trip) decomposition mapping.  | 
 *      Values in the range 2<=norm16<limitNoNo are also indexes into the extraData
 *      (in version 3 and later, after shifting right by OFFSET_SHIFT)
 *      pointing to mappings, compositions lists, or both.
853  |  |  *      Value norm16==INERT (0 in versions 1 & 2, 1 in version 3)  | 
854  |  |  *      means that the character is normalization-inert, that is,  | 
855  |  |  *      it does not have a mapping, does not participate in composition, has a zero  | 
856  |  |  *      canonical combining class, and forms a boundary where text before it and after it  | 
857  |  |  *      can be normalized independently.  | 
858  |  |  *      For details about how multiple properties are encoded in 16-bit values  | 
859  |  |  *      see the design doc.  | 
860  |  |  *      Note that the encoding cannot express all combinations of the properties involved;  | 
861  |  |  *      it only supports those combinations that are allowed by  | 
862  |  |  *      the Unicode Normalization algorithms. Details are in the design doc as well.  | 
863  |  |  *      The gennorm2 tool only builds .nrm files for data that conforms to the limitations.  | 
864  |  |  *  | 
865  |  |  *      The trie has a value for each lead surrogate code unit representing the "worst case"  | 
866  |  |  *      properties of the 1024 supplementary characters whose UTF-16 form starts with  | 
867  |  |  *      the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,  | 
868  |  |  *      then their lead surrogate code unit has the trie value INERT.  | 
869  |  |  *      When the lead surrogate unit's value exceeds the quick check minimum during processing,  | 
870  |  |  *      the properties for the full supplementary code point need to be looked up.  | 
871  |  |  *  | 
872  |  |  * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes];  | 
873  |  |  * uint16_t extraData[];  | 
874  |  |  *  | 
875  |  |  *      There is only one byte offset for the end of these two arrays.  | 
876  |  |  *      The split between them is given by the constant and variable mentioned above.  | 
877  |  |  *      In version 3, the difference must be shifted right by OFFSET_SHIFT.  | 
878  |  |  *  | 
879  |  |  *      The maybeYesCompositions array contains compositions lists for characters that  | 
880  |  |  *      combine both forward (as starters in composition pairs)  | 
881  |  |  *      and backward (as trailing characters in composition pairs).  | 
882  |  |  *      Such characters do not occur in Unicode 5.2 but are allowed by  | 
883  |  |  *      the Unicode Normalization algorithms.  | 
884  |  |  *      If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES  | 
885  |  |  *      and the maybeYesCompositions array is empty.  | 
 *      If there are such characters, then minMaybeYes is subtracted from their norm16 values
 *      to get the index into this array
 *      (in version 3 and later, after shifting the difference right by OFFSET_SHIFT).
888  |  |  *  | 
889  |  |  *      The extraData array contains compositions lists for "YesYes" characters,  | 
890  |  |  *      followed by mappings and optional compositions lists for "YesNo" characters,  | 
891  |  |  *      followed by only mappings for "NoNo" characters.  | 
892  |  |  *      (Referring to pairs of NFC/NFD quick check values.)  | 
893  |  |  *      The norm16 values of those characters are directly indexes into the extraData array.  | 
894  |  |  *      In version 3, the norm16 values must be shifted right by OFFSET_SHIFT  | 
895  |  |  *      for accessing extraData.  | 
896  |  |  *  | 
897  |  |  *      The data structures for compositions lists and mappings are described in the design doc.  | 
898  |  |  *  | 
899  |  |  * uint8_t smallFCD[0x100]; -- new in format version 2  | 
900  |  |  *  | 
901  |  |  *      This is a bit set to help speed up FCD value lookups in the absence of a full  | 
902  |  |  *      UTrie2 or other large data structure with the full FCD value mapping.  | 
903  |  |  *  | 
904  |  |  *      Each smallFCD bit is set if any of the corresponding 32 BMP code points  | 
905  |  |  *      has a non-zero FCD value (lccc!=0 or tccc!=0).  | 
906  |  |  *      Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF.  | 
907  |  |  *      A bit for 32 lead surrogates is set if any of the 32k corresponding  | 
908  |  |  *      _supplementary_ code points has a non-zero FCD value.  | 
909  |  |  *  | 
910  |  |  *      This bit set is most useful for the large blocks of CJK characters with FCD=0.  | 
911  |  |  *  | 
912  |  |  * Changes from format version 1 to format version 2 ---------------------------  | 
913  |  |  *  | 
914  |  |  * - Addition of data for raw (not recursively decomposed) mappings.  | 
915  |  |  *   + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when  | 
916  |  |  *     the mapping is to an empty string or when the character combines-forward.  | 
917  |  |  *     This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which  | 
918  |  |  *     is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.  | 
919  |  |  *   + For details see the design doc.  | 
920  |  |  * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into  | 
921  |  |  *   distinct ranges (combines-forward vs. not)  | 
922  |  |  *   so that a range check can be used to find out if there is a compositions list.  | 
923  |  |  *   This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.  | 
924  |  |  *   It is needed for the new (in ICU 49) composePair(), not for other normalization.  | 
925  |  |  * - Addition of the smallFCD[] bit set.  | 
926  |  |  *  | 
927  |  |  * Changes from format version 2 to format version 3 (ICU 60) ------------------  | 
928  |  |  *  | 
929  |  |  * - norm16 bit 0 indicates hasCompBoundaryAfter(),  | 
930  |  |  *   except that for contiguous composition (FCC) the tccc must be checked as well.  | 
931  |  |  *   Data indexes and ccc values are shifted left by one (OFFSET_SHIFT).  | 
932  |  |  *   Thresholds like minNoNo are tested before shifting.  | 
933  |  |  *  | 
934  |  |  * - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT),  | 
935  |  |  *   to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater.  | 
936  |  |  *   See DELTA_TCCC_MASK etc.  | 
937  |  |  *   This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter().  | 
938  |  |  *   minMaybeYes is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly.  | 
939  |  |  *  | 
940  |  |  * - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters,  | 
941  |  |  *   and ASCII characters are mapped algorithmically only to other ASCII characters.  | 
942  |  |  *   This helps with hasCompBoundaryBefore() and compose() fast paths.  | 
943  |  |  *   It is never necessary any more to loop for algorithmic mappings.  | 
944  |  |  *  | 
945  |  |  * - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE],  | 
946  |  |  *   indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY],  | 
947  |  |  *   and separation of the noNo extraData into distinct ranges.  | 
948  |  |  *   With this, the noNo norm16 value indicates whether the mapping is  | 
949  |  |  *   compose-normalized, not normalized but hasCompBoundaryBefore(),  | 
950  |  |  *   not even that, or maps to an empty string.  | 
951  |  |  *   hasCompBoundaryBefore() can be determined solely from the norm16 value.  | 
952  |  |  *  | 
953  |  |  * - The norm16 value for Hangul LVT is now different from that for Hangul LV,  | 
954  |  |  *   so that hasCompBoundaryAfter() need not check for the syllable type.  | 
955  |  |  *   For Hangul LV, minYesNo continues to be used (no comp-boundary-after).  | 
956  |  |  *   For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used.  | 
957  |  |  *   The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively,  | 
958  |  |  *   to simplify some code.  | 
959  |  |  *  | 
960  |  |  * - The extraData firstUnit bit 5 is no longer necessary  | 
961  |  |  *   (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER),  | 
962  |  |  *   is reserved again, and always set to 0.  | 
963  |  |  *  | 
964  |  |  * - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0.  | 
965  |  |  *   This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower:  | 
966  |  |  *   U+00AD Soft Hyphen maps to an empty string,  | 
967  |  |  *   which is artificially assigned "worst case" values lccc=1 and tccc=255.  | 
968  |  |  *  | 
969  |  |  * - A mapping to an empty string has explicit lccc=1 and tccc=255 values.  | 
970  |  |  *  | 
971  |  |  * Changes from format version 3 to format version 4 (ICU 63) ------------------  | 
972  |  |  *  | 
973  |  |  * Switched from UTrie2 to UCPTrie/CodePointTrie.  | 
974  |  |  *  | 
975  |  |  * The new trie no longer stores different values for surrogate code *units* vs.  | 
976  |  |  * surrogate code *points*.  | 
977  |  |  * Lead surrogates still have values for optimized UTF-16 string processing.  | 
978  |  |  * When looking up code point properties, the code now checks for lead surrogates and  | 
979  |  |  * treats them as inert.  | 
980  |  |  *  | 
981  |  |  * gennorm2 now has to reject mappings for surrogate code points.  | 
982  |  |  * UTS #46 maps unpaired surrogates to U+FFFD in code rather than via its  | 
983  |  |  * custom normalization data file.  | 
984  |  |  */  | 
985  |  |  | 
986  |  | #endif  /* !UCONFIG_NO_NORMALIZATION */  | 
987  |  | #endif  /* __NORMALIZER2IMPL_H__ */  |