/src/icu/source/common/normalizer2impl.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 2009-2014, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | *******************************************************************************  | 
10  |  | *   file name:  normalizer2impl.cpp  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:4  | 
14  |  | *  | 
15  |  | *   created on: 2009nov22  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | */  | 
18  |  |  | 
19  |  | // #define UCPTRIE_DEBUG  | 
20  |  |  | 
21  |  | #include "unicode/utypes.h"  | 
22  |  |  | 
23  |  | #if !UCONFIG_NO_NORMALIZATION  | 
24  |  |  | 
25  |  | #include "unicode/bytestream.h"  | 
26  |  | #include "unicode/edits.h"  | 
27  |  | #include "unicode/normalizer2.h"  | 
28  |  | #include "unicode/stringoptions.h"  | 
29  |  | #include "unicode/ucptrie.h"  | 
30  |  | #include "unicode/udata.h"  | 
31  |  | #include "unicode/umutablecptrie.h"  | 
32  |  | #include "unicode/ustring.h"  | 
33  |  | #include "unicode/utf16.h"  | 
34  |  | #include "unicode/utf8.h"  | 
35  |  | #include "bytesinkutil.h"  | 
36  |  | #include "cmemory.h"  | 
37  |  | #include "mutex.h"  | 
38  |  | #include "normalizer2impl.h"  | 
39  |  | #include "putilimp.h"  | 
40  |  | #include "uassert.h"  | 
41  |  | #include "ucptrie_impl.h"  | 
42  |  | #include "uset_imp.h"  | 
43  |  | #include "uvector.h"  | 
44  |  |  | 
45  |  | U_NAMESPACE_BEGIN  | 
46  |  |  | 
47  |  | namespace { | 
48  |  |  | 
49  |  | /**  | 
50  |  |  * UTF-8 lead byte for minNoMaybeCP.  | 
51  |  |  * Can be lower than the actual lead byte for c.  | 
52  |  |  * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.  | 
53  |  |  */  | 
54  | 0  | inline uint8_t leadByteForCP(UChar32 c) { | 
55  | 0  |     if (c <= 0x7f) { | 
56  | 0  |         return (uint8_t)c;  | 
57  | 0  |     } else if (c <= 0x7ff) { | 
58  | 0  |         return (uint8_t)(0xc0+(c>>6));  | 
59  | 0  |     } else { | 
60  |  |         // Should not occur because ccc(U+0300)!=0.  | 
61  | 0  |         return 0xe0;  | 
62  | 0  |     }  | 
63  | 0  | }  | 
64  |  |  | 
65  |  | /**  | 
66  |  |  * Returns the code point from one single well-formed UTF-8 byte sequence  | 
67  |  |  * between cpStart and cpLimit.  | 
68  |  |  *  | 
69  |  |  * Trie UTF-8 macros do not assemble whole code points (for efficiency).  | 
70  |  |  * When we do need the code point, we call this function.  | 
71  |  |  * We should not need it for normalization-inert data (norm16==0).  | 
72  |  |  * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.  | 
73  |  |  */  | 
74  | 0  | UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) { | 
75  |  |     // Similar to U8_NEXT_UNSAFE(s, i, c).  | 
76  | 0  |     U_ASSERT(cpStart < cpLimit);  | 
77  | 0  |     uint8_t c = *cpStart;  | 
78  | 0  |     switch(cpLimit-cpStart) { | 
79  | 0  |     case 1:  | 
80  | 0  |         return c;  | 
81  | 0  |     case 2:  | 
82  | 0  |         return ((c&0x1f)<<6) | (cpStart[1]&0x3f);  | 
83  | 0  |     case 3:  | 
84  |  |         // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar)  | 
85  | 0  |         return (UChar)((c<<12) | ((cpStart[1]&0x3f)<<6) | (cpStart[2]&0x3f));  | 
86  | 0  |     case 4:  | 
87  | 0  |         return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f);  | 
88  | 0  |     default:  | 
89  | 0  |         UPRV_UNREACHABLE;  // Should not occur.  | 
90  | 0  |     }  | 
91  | 0  | }  | 
92  |  |  | 
93  |  | /**  | 
94  |  |  * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.  | 
95  |  |  * Otherwise returns a negative value.  | 
96  |  |  */  | 
97  | 0  | UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) { | 
98  | 0  |     if ((p - start) >= 3) { | 
99  | 0  |         p -= 3;  | 
100  | 0  |         uint8_t l = *p;  | 
101  | 0  |         uint8_t t1, t2;  | 
102  | 0  |         if (0xe1 <= l && l <= 0xed &&  | 
103  | 0  |                 (t1 = (uint8_t)(p[1] - 0x80)) <= 0x3f &&  | 
104  | 0  |                 (t2 = (uint8_t)(p[2] - 0x80)) <= 0x3f &&  | 
105  | 0  |                 (l < 0xed || t1 <= 0x1f)) { | 
106  | 0  |             return ((l & 0xf) << 12) | (t1 << 6) | t2;  | 
107  | 0  |         }  | 
108  | 0  |     }  | 
109  | 0  |     return U_SENTINEL;  | 
110  | 0  | }  | 
111  |  |  | 
/**
 * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point.
 * Otherwise returns a negative value.
 *
 * @param src   start of the UTF-8 text to inspect
 * @param limit end (exclusive) of the UTF-8 text
 * @return 1..27 for U+11A8..U+11C2 (code point minus 0x11A7), or -1
 */
int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
    // Jamo T: E1 86 A8..E1 87 82
    if ((limit - src) < 3 || src[0] != 0xe1) {
        return -1;
    }
    if (src[1] == 0x86) {
        uint8_t t = src[2];
        // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
        // Offset 0 does not correspond to any conjoining Jamo.
        if (0xa8 <= t && t <= 0xbf) {
            return t - 0xa7;
        }
    } else if (src[1] == 0x87) {
        uint8_t t = src[2];
        // Explicit unsigned range check for trail bytes 80..82.
        // This avoids comparing through a narrowing cast to int8_t.
        if (0x80 <= t && t <= 0x82) {
            // The second byte 87 contributes 0x40 relative to byte 86.
            return t - (0xa7 - 0x40);
        }
    }
    return -1;
}
135  |  |  | 
136  |  | void  | 
137  |  | appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t delta,  | 
138  | 0  |                      ByteSink &sink, Edits *edits) { | 
139  | 0  |     char buffer[U8_MAX_LENGTH];  | 
140  | 0  |     int32_t length;  | 
141  | 0  |     int32_t cpLength = (int32_t)(cpLimit - cpStart);  | 
142  | 0  |     if (cpLength == 1) { | 
143  |  |         // The builder makes ASCII map to ASCII.  | 
144  | 0  |         buffer[0] = (uint8_t)(*cpStart + delta);  | 
145  | 0  |         length = 1;  | 
146  | 0  |     } else { | 
147  | 0  |         int32_t trail = *(cpLimit-1) + delta;  | 
148  | 0  |         if (0x80 <= trail && trail <= 0xbf) { | 
149  |  |             // The delta only changes the last trail byte.  | 
150  | 0  |             --cpLimit;  | 
151  | 0  |             length = 0;  | 
152  | 0  |             do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit); | 
153  | 0  |             buffer[length++] = (uint8_t)trail;  | 
154  | 0  |         } else { | 
155  |  |             // Decode the code point, add the delta, re-encode.  | 
156  | 0  |             UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta;  | 
157  | 0  |             length = 0;  | 
158  | 0  |             U8_APPEND_UNSAFE(buffer, length, c);  | 
159  | 0  |         }  | 
160  | 0  |     }  | 
161  | 0  |     if (edits != nullptr) { | 
162  | 0  |         edits->addReplace(cpLength, length);  | 
163  | 0  |     }  | 
164  | 0  |     sink.Append(buffer, length);  | 
165  | 0  | }  | 
166  |  |  | 
167  |  | }  // namespace  | 
168  |  |  | 
169  |  | // ReorderingBuffer -------------------------------------------------------- ***  | 
170  |  |  | 
171  |  | ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest,  | 
172  |  |                                    UErrorCode &errorCode) :  | 
173  | 0  |         impl(ni), str(dest),  | 
174  | 0  |         start(str.getBuffer(8)), reorderStart(start), limit(start),  | 
175  | 0  |         remainingCapacity(str.getCapacity()), lastCC(0) { | 
176  | 0  |     if (start == nullptr && U_SUCCESS(errorCode)) { | 
177  |  |         // getBuffer() already did str.setToBogus()  | 
178  | 0  |         errorCode = U_MEMORY_ALLOCATION_ERROR;  | 
179  | 0  |     }  | 
180  | 0  | }  | 
181  |  |  | 
182  | 0  | UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { | 
183  | 0  |     int32_t length=str.length();  | 
184  | 0  |     start=str.getBuffer(destCapacity);  | 
185  | 0  |     if(start==NULL) { | 
186  |  |         // getBuffer() already did str.setToBogus()  | 
187  | 0  |         errorCode=U_MEMORY_ALLOCATION_ERROR;  | 
188  | 0  |         return FALSE;  | 
189  | 0  |     }  | 
190  | 0  |     limit=start+length;  | 
191  | 0  |     remainingCapacity=str.getCapacity()-length;  | 
192  | 0  |     reorderStart=start;  | 
193  | 0  |     if(start==limit) { | 
194  | 0  |         lastCC=0;  | 
195  | 0  |     } else { | 
196  | 0  |         setIterator();  | 
197  | 0  |         lastCC=previousCC();  | 
198  |  |         // Set reorderStart after the last code point with cc<=1 if there is one.  | 
199  | 0  |         if(lastCC>1) { | 
200  | 0  |             while(previousCC()>1) {} | 
201  | 0  |         }  | 
202  | 0  |         reorderStart=codePointLimit;  | 
203  | 0  |     }  | 
204  | 0  |     return TRUE;  | 
205  | 0  | }  | 
206  |  |  | 
207  | 0  | UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const { | 
208  | 0  |     int32_t length=(int32_t)(limit-start);  | 
209  | 0  |     return  | 
210  | 0  |         length==(int32_t)(otherLimit-otherStart) &&  | 
211  | 0  |         0==u_memcmp(start, otherStart, length);  | 
212  | 0  | }  | 
213  |  |  | 
214  | 0  | UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const { | 
215  | 0  |     U_ASSERT((otherLimit - otherStart) <= INT32_MAX);  // ensured by caller  | 
216  | 0  |     int32_t length = (int32_t)(limit - start);  | 
217  | 0  |     int32_t otherLength = (int32_t)(otherLimit - otherStart);  | 
218  |  |     // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.  | 
219  | 0  |     if (otherLength < length || (otherLength / 3) > length) { | 
220  | 0  |         return FALSE;  | 
221  | 0  |     }  | 
222  |  |     // Compare valid strings from between normalization boundaries.  | 
223  |  |     // (Invalid sequences are normalization-inert.)  | 
224  | 0  |     for (int32_t i = 0, j = 0;;) { | 
225  | 0  |         if (i >= length) { | 
226  | 0  |             return j >= otherLength;  | 
227  | 0  |         } else if (j >= otherLength) { | 
228  | 0  |             return FALSE;  | 
229  | 0  |         }  | 
230  |  |         // Not at the end of either string yet.  | 
231  | 0  |         UChar32 c, other;  | 
232  | 0  |         U16_NEXT_UNSAFE(start, i, c);  | 
233  | 0  |         U8_NEXT_UNSAFE(otherStart, j, other);  | 
234  | 0  |         if (c != other) { | 
235  | 0  |             return FALSE;  | 
236  | 0  |         }  | 
237  | 0  |     }  | 
238  | 0  | }  | 
239  |  |  | 
240  | 0  | UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) { | 
241  | 0  |     if(remainingCapacity<2 && !resize(2, errorCode)) { | 
242  | 0  |         return FALSE;  | 
243  | 0  |     }  | 
244  | 0  |     if(lastCC<=cc || cc==0) { | 
245  | 0  |         limit[0]=U16_LEAD(c);  | 
246  | 0  |         limit[1]=U16_TRAIL(c);  | 
247  | 0  |         limit+=2;  | 
248  | 0  |         lastCC=cc;  | 
249  | 0  |         if(cc<=1) { | 
250  | 0  |             reorderStart=limit;  | 
251  | 0  |         }  | 
252  | 0  |     } else { | 
253  | 0  |         insert(c, cc);  | 
254  | 0  |     }  | 
255  | 0  |     remainingCapacity-=2;  | 
256  | 0  |     return TRUE;  | 
257  | 0  | }  | 
258  |  |  | 
259  |  | UBool ReorderingBuffer::append(const UChar *s, int32_t length, UBool isNFD,  | 
260  |  |                                uint8_t leadCC, uint8_t trailCC,  | 
261  | 0  |                                UErrorCode &errorCode) { | 
262  | 0  |     if(length==0) { | 
263  | 0  |         return TRUE;  | 
264  | 0  |     }  | 
265  | 0  |     if(remainingCapacity<length && !resize(length, errorCode)) { | 
266  | 0  |         return FALSE;  | 
267  | 0  |     }  | 
268  | 0  |     remainingCapacity-=length;  | 
269  | 0  |     if(lastCC<=leadCC || leadCC==0) { | 
270  | 0  |         if(trailCC<=1) { | 
271  | 0  |             reorderStart=limit+length;  | 
272  | 0  |         } else if(leadCC<=1) { | 
273  | 0  |             reorderStart=limit+1;  // Ok if not a code point boundary.  | 
274  | 0  |         }  | 
275  | 0  |         const UChar *sLimit=s+length;  | 
276  | 0  |         do { *limit++=*s++; } while(s!=sLimit); | 
277  | 0  |         lastCC=trailCC;  | 
278  | 0  |     } else { | 
279  | 0  |         int32_t i=0;  | 
280  | 0  |         UChar32 c;  | 
281  | 0  |         U16_NEXT(s, i, length, c);  | 
282  | 0  |         insert(c, leadCC);  // insert first code point  | 
283  | 0  |         while(i<length) { | 
284  | 0  |             U16_NEXT(s, i, length, c);  | 
285  | 0  |             if(i<length) { | 
286  | 0  |                 if (isNFD) { | 
287  | 0  |                     leadCC = Normalizer2Impl::getCCFromYesOrMaybe(impl.getRawNorm16(c));  | 
288  | 0  |                 } else { | 
289  | 0  |                     leadCC = impl.getCC(impl.getNorm16(c));  | 
290  | 0  |                 }  | 
291  | 0  |             } else { | 
292  | 0  |                 leadCC=trailCC;  | 
293  | 0  |             }  | 
294  | 0  |             append(c, leadCC, errorCode);  | 
295  | 0  |         }  | 
296  | 0  |     }  | 
297  | 0  |     return TRUE;  | 
298  | 0  | }  | 
299  |  |  | 
300  | 0  | UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { | 
301  | 0  |     int32_t cpLength=U16_LENGTH(c);  | 
302  | 0  |     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) { | 
303  | 0  |         return FALSE;  | 
304  | 0  |     }  | 
305  | 0  |     remainingCapacity-=cpLength;  | 
306  | 0  |     if(cpLength==1) { | 
307  | 0  |         *limit++=(UChar)c;  | 
308  | 0  |     } else { | 
309  | 0  |         limit[0]=U16_LEAD(c);  | 
310  | 0  |         limit[1]=U16_TRAIL(c);  | 
311  | 0  |         limit+=2;  | 
312  | 0  |     }  | 
313  | 0  |     lastCC=0;  | 
314  | 0  |     reorderStart=limit;  | 
315  | 0  |     return TRUE;  | 
316  | 0  | }  | 
317  |  |  | 
318  | 0  | UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) { | 
319  | 0  |     if(s==sLimit) { | 
320  | 0  |         return TRUE;  | 
321  | 0  |     }  | 
322  | 0  |     int32_t length=(int32_t)(sLimit-s);  | 
323  | 0  |     if(remainingCapacity<length && !resize(length, errorCode)) { | 
324  | 0  |         return FALSE;  | 
325  | 0  |     }  | 
326  | 0  |     u_memcpy(limit, s, length);  | 
327  | 0  |     limit+=length;  | 
328  | 0  |     remainingCapacity-=length;  | 
329  | 0  |     lastCC=0;  | 
330  | 0  |     reorderStart=limit;  | 
331  | 0  |     return TRUE;  | 
332  | 0  | }  | 
333  |  |  | 
334  | 0  | void ReorderingBuffer::remove() { | 
335  | 0  |     reorderStart=limit=start;  | 
336  | 0  |     remainingCapacity=str.getCapacity();  | 
337  | 0  |     lastCC=0;  | 
338  | 0  | }  | 
339  |  |  | 
340  | 0  | void ReorderingBuffer::removeSuffix(int32_t suffixLength) { | 
341  | 0  |     if(suffixLength<(limit-start)) { | 
342  | 0  |         limit-=suffixLength;  | 
343  | 0  |         remainingCapacity+=suffixLength;  | 
344  | 0  |     } else { | 
345  | 0  |         limit=start;  | 
346  | 0  |         remainingCapacity=str.getCapacity();  | 
347  | 0  |     }  | 
348  | 0  |     lastCC=0;  | 
349  | 0  |     reorderStart=limit;  | 
350  | 0  | }  | 
351  |  |  | 
352  | 0  | UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { | 
353  | 0  |     int32_t reorderStartIndex=(int32_t)(reorderStart-start);  | 
354  | 0  |     int32_t length=(int32_t)(limit-start);  | 
355  | 0  |     str.releaseBuffer(length);  | 
356  | 0  |     int32_t newCapacity=length+appendLength;  | 
357  | 0  |     int32_t doubleCapacity=2*str.getCapacity();  | 
358  | 0  |     if(newCapacity<doubleCapacity) { | 
359  | 0  |         newCapacity=doubleCapacity;  | 
360  | 0  |     }  | 
361  | 0  |     if(newCapacity<256) { | 
362  | 0  |         newCapacity=256;  | 
363  | 0  |     }  | 
364  | 0  |     start=str.getBuffer(newCapacity);  | 
365  | 0  |     if(start==NULL) { | 
366  |  |         // getBuffer() already did str.setToBogus()  | 
367  | 0  |         errorCode=U_MEMORY_ALLOCATION_ERROR;  | 
368  | 0  |         return FALSE;  | 
369  | 0  |     }  | 
370  | 0  |     reorderStart=start+reorderStartIndex;  | 
371  | 0  |     limit=start+length;  | 
372  | 0  |     remainingCapacity=str.getCapacity()-length;  | 
373  | 0  |     return TRUE;  | 
374  | 0  | }  | 
375  |  |  | 
376  | 0  | void ReorderingBuffer::skipPrevious() { | 
377  | 0  |     codePointLimit=codePointStart;  | 
378  | 0  |     UChar c=*--codePointStart;  | 
379  | 0  |     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) { | 
380  | 0  |         --codePointStart;  | 
381  | 0  |     }  | 
382  | 0  | }  | 
383  |  |  | 
384  | 0  | uint8_t ReorderingBuffer::previousCC() { | 
385  | 0  |     codePointLimit=codePointStart;  | 
386  | 0  |     if(reorderStart>=codePointStart) { | 
387  | 0  |         return 0;  | 
388  | 0  |     }  | 
389  | 0  |     UChar32 c=*--codePointStart;  | 
390  | 0  |     UChar c2;  | 
391  | 0  |     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) { | 
392  | 0  |         --codePointStart;  | 
393  | 0  |         c=U16_GET_SUPPLEMENTARY(c2, c);  | 
394  | 0  |     }  | 
395  | 0  |     return impl.getCCFromYesOrMaybeCP(c);  | 
396  | 0  | }  | 
397  |  |  | 
398  |  | // Inserts c somewhere before the last character.  | 
399  |  | // Requires 0<cc<lastCC which implies reorderStart<limit.  | 
400  | 0  | void ReorderingBuffer::insert(UChar32 c, uint8_t cc) { | 
401  | 0  |     for(setIterator(), skipPrevious(); previousCC()>cc;) {} | 
402  |  |     // insert c at codePointLimit, after the character with prevCC<=cc  | 
403  | 0  |     UChar *q=limit;  | 
404  | 0  |     UChar *r=limit+=U16_LENGTH(c);  | 
405  | 0  |     do { | 
406  | 0  |         *--r=*--q;  | 
407  | 0  |     } while(codePointLimit!=q);  | 
408  | 0  |     writeCodePoint(q, c);  | 
409  | 0  |     if(cc<=1) { | 
410  | 0  |         reorderStart=r;  | 
411  | 0  |     }  | 
412  | 0  | }  | 
413  |  |  | 
414  |  | // Normalizer2Impl --------------------------------------------------------- ***  | 
415  |  |  | 
// Holder for the canonical-iterator data that Normalizer2Impl keeps in fCanonIterData
// (deleted by ~Normalizer2Impl()).
// addCanonIterPropertyStarts() enumerates the ranges of `trie`.
// NOTE(review): the constructor, destructor and addToStartSet() are defined outside this
// part of the file; the relationship between mutableTrie and trie is not visible here.
416  |  | struct CanonIterData : public UMemory { | 
417  |  |     CanonIterData(UErrorCode &errorCode);  | 
418  |  |     ~CanonIterData();  | 
419  |  |     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);  | 
420  |  |     UMutableCPTrie *mutableTrie;  | 
421  |  |     UCPTrie *trie;  | 
422  |  |     UVector canonStartSets;  // contains UnicodeSet *  | 
423  |  | };  | 
424  |  |  | 
425  | 0  | Normalizer2Impl::~Normalizer2Impl() { | 
426  | 0  |     delete fCanonIterData;  | 
427  | 0  | }  | 
428  |  |  | 
429  |  | void  | 
430  |  | Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie,  | 
431  | 0  |                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) { | 
432  | 0  |     minDecompNoCP = static_cast<UChar>(inIndexes[IX_MIN_DECOMP_NO_CP]);  | 
433  | 0  |     minCompNoMaybeCP = static_cast<UChar>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]);  | 
434  | 0  |     minLcccCP = static_cast<UChar>(inIndexes[IX_MIN_LCCC_CP]);  | 
435  |  | 
  | 
436  | 0  |     minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]);  | 
437  | 0  |     minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]);  | 
438  | 0  |     minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]);  | 
439  | 0  |     minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);  | 
440  | 0  |     minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);  | 
441  | 0  |     minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]);  | 
442  | 0  |     limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]);  | 
443  | 0  |     minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]);  | 
444  | 0  |     U_ASSERT((minMaybeYes & 7) == 0);  // 8-aligned for noNoDelta bit fields  | 
445  | 0  |     centerNoNoDelta = (minMaybeYes >> DELTA_SHIFT) - MAX_DELTA - 1;  | 
446  |  | 
  | 
447  | 0  |     normTrie=inTrie;  | 
448  |  | 
  | 
449  | 0  |     maybeYesCompositions=inExtraData;  | 
450  | 0  |     extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);  | 
451  |  | 
  | 
452  | 0  |     smallFCD=inSmallFCD;  | 
453  | 0  | }  | 
454  |  |  | 
455  |  | U_CDECL_BEGIN  | 
456  |  |  | 
457  |  | static uint32_t U_CALLCONV  | 
458  | 0  | segmentStarterMapper(const void * /*context*/, uint32_t value) { | 
459  | 0  |     return value&CANON_NOT_SEGMENT_STARTER;  | 
460  | 0  | }  | 
461  |  |  | 
462  |  | U_CDECL_END  | 
463  |  |  | 
464  |  | void  | 
465  | 0  | Normalizer2Impl::addLcccChars(UnicodeSet &set) const { | 
466  | 0  |     UChar32 start = 0, end;  | 
467  | 0  |     uint32_t norm16;  | 
468  | 0  |     while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,  | 
469  | 0  |                                    nullptr, nullptr, &norm16)) >= 0) { | 
470  | 0  |         if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&  | 
471  | 0  |                 norm16 != Normalizer2Impl::JAMO_VT) { | 
472  | 0  |             set.add(start, end);  | 
473  | 0  |         } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) { | 
474  | 0  |             uint16_t fcd16 = getFCD16(start);  | 
475  | 0  |             if (fcd16 > 0xff) { set.add(start, end); } | 
476  | 0  |         }  | 
477  | 0  |         start = end + 1;  | 
478  | 0  |     }  | 
479  | 0  | }  | 
480  |  |  | 
481  |  | void  | 
482  | 0  | Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const { | 
483  |  |     // Add the start code point of each same-value range of the trie.  | 
484  | 0  |     UChar32 start = 0, end;  | 
485  | 0  |     uint32_t value;  | 
486  | 0  |     while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,  | 
487  | 0  |                                    nullptr, nullptr, &value)) >= 0) { | 
488  | 0  |         sa->add(sa->set, start);  | 
489  | 0  |         if (start != end && isAlgorithmicNoNo((uint16_t)value) &&  | 
490  | 0  |                 (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) { | 
491  |  |             // Range of code points with same-norm16-value algorithmic decompositions.  | 
492  |  |             // They might have different non-zero FCD16 values.  | 
493  | 0  |             uint16_t prevFCD16 = getFCD16(start);  | 
494  | 0  |             while (++start <= end) { | 
495  | 0  |                 uint16_t fcd16 = getFCD16(start);  | 
496  | 0  |                 if (fcd16 != prevFCD16) { | 
497  | 0  |                     sa->add(sa->set, start);  | 
498  | 0  |                     prevFCD16 = fcd16;  | 
499  | 0  |                 }  | 
500  | 0  |             }  | 
501  | 0  |         }  | 
502  | 0  |         start = end + 1;  | 
503  | 0  |     }  | 
504  |  |  | 
505  |  |     /* add Hangul LV syllables and LV+1 because of skippables */  | 
506  | 0  |     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) { | 
507  | 0  |         sa->add(sa->set, c);  | 
508  | 0  |         sa->add(sa->set, c+1);  | 
509  | 0  |     }  | 
510  | 0  |     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */  | 
511  | 0  | }  | 
512  |  |  | 
513  |  | void  | 
514  | 0  | Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const { | 
515  |  |     // Add the start code point of each same-value range of the canonical iterator data trie.  | 
516  | 0  |     if (!ensureCanonIterData(errorCode)) { return; } | 
517  |  |     // Currently only used for the SEGMENT_STARTER property.  | 
518  | 0  |     UChar32 start = 0, end;  | 
519  | 0  |     uint32_t value;  | 
520  | 0  |     while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,  | 
521  | 0  |                                    segmentStarterMapper, nullptr, &value)) >= 0) { | 
522  | 0  |         sa->add(sa->set, start);  | 
523  | 0  |         start = end + 1;  | 
524  | 0  |     }  | 
525  | 0  | }  | 
526  |  |  | 
527  |  | const UChar *  | 
528  |  | Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,  | 
529  |  |                                                 UChar32 minNeedDataCP,  | 
530  |  |                                                 ReorderingBuffer *buffer,  | 
531  | 0  |                                                 UErrorCode &errorCode) const { | 
532  |  |     // Make some effort to support NUL-terminated strings reasonably.  | 
533  |  |     // Take the part of the fast quick check loop that does not look up  | 
534  |  |     // data and check the first part of the string.  | 
535  |  |     // After this prefix, determine the string length to simplify the rest  | 
536  |  |     // of the code.  | 
537  | 0  |     const UChar *prevSrc=src;  | 
538  | 0  |     UChar c;  | 
539  | 0  |     while((c=*src++)<minNeedDataCP && c!=0) {} | 
540  |  |     // Back out the last character for full processing.  | 
541  |  |     // Copy this prefix.  | 
542  | 0  |     if(--src!=prevSrc) { | 
543  | 0  |         if(buffer!=NULL) { | 
544  | 0  |             buffer->appendZeroCC(prevSrc, src, errorCode);  | 
545  | 0  |         }  | 
546  | 0  |     }  | 
547  | 0  |     return src;  | 
548  | 0  | }  | 
549  |  |  | 
550  |  | UnicodeString &  | 
551  |  | Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,  | 
552  | 0  |                            UErrorCode &errorCode) const { | 
553  | 0  |     if(U_FAILURE(errorCode)) { | 
554  | 0  |         dest.setToBogus();  | 
555  | 0  |         return dest;  | 
556  | 0  |     }  | 
557  | 0  |     const UChar *sArray=src.getBuffer();  | 
558  | 0  |     if(&dest==&src || sArray==NULL) { | 
559  | 0  |         errorCode=U_ILLEGAL_ARGUMENT_ERROR;  | 
560  | 0  |         dest.setToBogus();  | 
561  | 0  |         return dest;  | 
562  | 0  |     }  | 
563  | 0  |     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);  | 
564  | 0  |     return dest;  | 
565  | 0  | }  | 
566  |  |  | 
567  |  | void  | 
568  |  | Normalizer2Impl::decompose(const UChar *src, const UChar *limit,  | 
569  |  |                            UnicodeString &dest,  | 
570  |  |                            int32_t destLengthEstimate,  | 
571  | 0  |                            UErrorCode &errorCode) const { | 
572  | 0  |     if(destLengthEstimate<0 && limit!=NULL) { | 
573  | 0  |         destLengthEstimate=(int32_t)(limit-src);  | 
574  | 0  |     }  | 
575  | 0  |     dest.remove();  | 
576  | 0  |     ReorderingBuffer buffer(*this, dest);  | 
577  | 0  |     if(buffer.init(destLengthEstimate, errorCode)) { | 
578  | 0  |         decompose(src, limit, &buffer, errorCode);  | 
579  | 0  |     }  | 
580  | 0  | }  | 
581  |  |  | 
582  |  | // Dual functionality:  | 
583  |  | // buffer!=NULL: normalize  | 
584  |  | // buffer==NULL: isNormalized/spanQuickCheckYes  | 
585  |  | const UChar *  | 
586  |  | Normalizer2Impl::decompose(const UChar *src, const UChar *limit,  | 
587  |  |                            ReorderingBuffer *buffer,  | 
588  | 0  |                            UErrorCode &errorCode) const { | 
589  | 0  |     UChar32 minNoCP=minDecompNoCP;  | 
590  | 0  |     if(limit==NULL) { | 
591  | 0  |         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);  | 
592  | 0  |         if(U_FAILURE(errorCode)) { | 
593  | 0  |             return src;  | 
594  | 0  |         }  | 
595  | 0  |         limit=u_strchr(src, 0);  | 
596  | 0  |     }  | 
597  |  |  | 
598  | 0  |     const UChar *prevSrc;  | 
599  | 0  |     UChar32 c=0;  | 
600  | 0  |     uint16_t norm16=0;  | 
601  |  |  | 
602  |  |     // only for quick check  | 
603  | 0  |     const UChar *prevBoundary=src;  | 
604  | 0  |     uint8_t prevCC=0;  | 
605  |  | 
  | 
606  | 0  |     for(;;) { | 
607  |  |         // count code units below the minimum or with irrelevant data for the quick check  | 
608  | 0  |         for(prevSrc=src; src!=limit;) { | 
609  | 0  |             if( (c=*src)<minNoCP ||  | 
610  | 0  |                 isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))  | 
611  | 0  |             ) { | 
612  | 0  |                 ++src;  | 
613  | 0  |             } else if(!U16_IS_LEAD(c)) { | 
614  | 0  |                 break;  | 
615  | 0  |             } else { | 
616  | 0  |                 UChar c2;  | 
617  | 0  |                 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { | 
618  | 0  |                     c=U16_GET_SUPPLEMENTARY(c, c2);  | 
619  | 0  |                     norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);  | 
620  | 0  |                     if(isMostDecompYesAndZeroCC(norm16)) { | 
621  | 0  |                         src+=2;  | 
622  | 0  |                     } else { | 
623  | 0  |                         break;  | 
624  | 0  |                     }  | 
625  | 0  |                 } else { | 
626  | 0  |                     ++src;  // unpaired lead surrogate: inert  | 
627  | 0  |                 }  | 
628  | 0  |             }  | 
629  | 0  |         }  | 
630  |  |         // copy these code units all at once  | 
631  | 0  |         if(src!=prevSrc) { | 
632  | 0  |             if(buffer!=NULL) { | 
633  | 0  |                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) { | 
634  | 0  |                     break;  | 
635  | 0  |                 }  | 
636  | 0  |             } else { | 
637  | 0  |                 prevCC=0;  | 
638  | 0  |                 prevBoundary=src;  | 
639  | 0  |             }  | 
640  | 0  |         }  | 
641  | 0  |         if(src==limit) { | 
642  | 0  |             break;  | 
643  | 0  |         }  | 
644  |  |  | 
645  |  |         // Check one above-minimum, relevant code point.  | 
646  | 0  |         src+=U16_LENGTH(c);  | 
647  | 0  |         if(buffer!=NULL) { | 
648  | 0  |             if(!decompose(c, norm16, *buffer, errorCode)) { | 
649  | 0  |                 break;  | 
650  | 0  |             }  | 
651  | 0  |         } else { | 
652  | 0  |             if(isDecompYes(norm16)) { | 
653  | 0  |                 uint8_t cc=getCCFromYesOrMaybe(norm16);  | 
654  | 0  |                 if(prevCC<=cc || cc==0) { | 
655  | 0  |                     prevCC=cc;  | 
656  | 0  |                     if(cc<=1) { | 
657  | 0  |                         prevBoundary=src;  | 
658  | 0  |                     }  | 
659  | 0  |                     continue;  | 
660  | 0  |                 }  | 
661  | 0  |             }  | 
662  | 0  |             return prevBoundary;  // "no" or cc out of order  | 
663  | 0  |         }  | 
664  | 0  |     }  | 
665  | 0  |     return src;  | 
666  | 0  | }  | 
667  |  |  | 
668  |  | // Decompose a short piece of text which is likely to contain characters that  | 
669  |  | // fail the quick check loop and/or where the quick check loop's overhead  | 
670  |  | // is unlikely to be amortized.  | 
671  |  | // Called by the compose() and makeFCD() implementations.  | 
672  |  | const UChar *  | 
673  |  | Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,  | 
674  |  |                                 UBool stopAtCompBoundary, UBool onlyContiguous,  | 
675  | 0  |                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const { | 
676  | 0  |     if (U_FAILURE(errorCode)) { | 
677  | 0  |         return nullptr;  | 
678  | 0  |     }  | 
679  | 0  |     while(src<limit) { | 
680  | 0  |         if (stopAtCompBoundary && *src < minCompNoMaybeCP) { | 
681  | 0  |             return src;  | 
682  | 0  |         }  | 
683  | 0  |         const UChar *prevSrc = src;  | 
684  | 0  |         UChar32 c;  | 
685  | 0  |         uint16_t norm16;  | 
686  | 0  |         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);  | 
687  | 0  |         if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { | 
688  | 0  |             return prevSrc;  | 
689  | 0  |         }  | 
690  | 0  |         if(!decompose(c, norm16, buffer, errorCode)) { | 
691  | 0  |             return nullptr;  | 
692  | 0  |         }  | 
693  | 0  |         if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { | 
694  | 0  |             return src;  | 
695  | 0  |         }  | 
696  | 0  |     }  | 
697  | 0  |     return src;  | 
698  | 0  | }  | 
699  |  |  | 
700  |  | UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,  | 
701  |  |                                  ReorderingBuffer &buffer,  | 
702  | 0  |                                  UErrorCode &errorCode) const { | 
703  |  |     // get the decomposition and the lead and trail cc's  | 
704  | 0  |     if (norm16 >= limitNoNo) { | 
705  | 0  |         if (isMaybeOrNonZeroCC(norm16)) { | 
706  | 0  |             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);  | 
707  | 0  |         }  | 
708  |  |         // Maps to an isCompYesAndZeroCC.  | 
709  | 0  |         c=mapAlgorithmic(c, norm16);  | 
710  | 0  |         norm16=getRawNorm16(c);  | 
711  | 0  |     }  | 
712  | 0  |     if (norm16 < minYesNo) { | 
713  |  |         // c does not decompose  | 
714  | 0  |         return buffer.append(c, 0, errorCode);  | 
715  | 0  |     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { | 
716  |  |         // Hangul syllable: decompose algorithmically  | 
717  | 0  |         UChar jamos[3];  | 
718  | 0  |         return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);  | 
719  | 0  |     }  | 
720  |  |     // c decomposes, get everything from the variable-length extra data  | 
721  | 0  |     const uint16_t *mapping=getMapping(norm16);  | 
722  | 0  |     uint16_t firstUnit=*mapping;  | 
723  | 0  |     int32_t length=firstUnit&MAPPING_LENGTH_MASK;  | 
724  | 0  |     uint8_t leadCC, trailCC;  | 
725  | 0  |     trailCC=(uint8_t)(firstUnit>>8);  | 
726  | 0  |     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { | 
727  | 0  |         leadCC=(uint8_t)(*(mapping-1)>>8);  | 
728  | 0  |     } else { | 
729  | 0  |         leadCC=0;  | 
730  | 0  |     }  | 
731  | 0  |     return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode);  | 
732  | 0  | }  | 
733  |  |  | 
734  |  | // Dual functionality:  | 
735  |  | // sink != nullptr: normalize  | 
736  |  | // sink == nullptr: isNormalized/spanQuickCheckYes  | 
737  |  | const uint8_t *  | 
738  |  | Normalizer2Impl::decomposeUTF8(uint32_t options,  | 
739  |  |                                const uint8_t *src, const uint8_t *limit,  | 
740  | 0  |                                ByteSink *sink, Edits *edits, UErrorCode &errorCode) const { | 
741  | 0  |     U_ASSERT(limit != nullptr);  | 
742  | 0  |     UnicodeString s16;  | 
743  | 0  |     uint8_t minNoLead = leadByteForCP(minDecompNoCP);  | 
744  |  | 
  | 
745  | 0  |     const uint8_t *prevBoundary = src;  | 
746  |  |     // only for quick check  | 
747  | 0  |     uint8_t prevCC = 0;  | 
748  |  | 
  | 
749  | 0  |     for (;;) { | 
750  |  |         // Fast path: Scan over a sequence of characters below the minimum "no" code point,  | 
751  |  |         // or with (decompYes && ccc==0) properties.  | 
752  | 0  |         const uint8_t *fastStart = src;  | 
753  | 0  |         const uint8_t *prevSrc;  | 
754  | 0  |         uint16_t norm16 = 0;  | 
755  |  | 
  | 
756  | 0  |         for (;;) { | 
757  | 0  |             if (src == limit) { | 
758  | 0  |                 if (prevBoundary != limit && sink != nullptr) { | 
759  | 0  |                     ByteSinkUtil::appendUnchanged(prevBoundary, limit,  | 
760  | 0  |                                                   *sink, options, edits, errorCode);  | 
761  | 0  |                 }  | 
762  | 0  |                 return src;  | 
763  | 0  |             }  | 
764  | 0  |             if (*src < minNoLead) { | 
765  | 0  |                 ++src;  | 
766  | 0  |             } else { | 
767  | 0  |                 prevSrc = src;  | 
768  | 0  |                 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);  | 
769  | 0  |                 if (!isMostDecompYesAndZeroCC(norm16)) { | 
770  | 0  |                     break;  | 
771  | 0  |                 }  | 
772  | 0  |             }  | 
773  | 0  |         }  | 
774  |  |         // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,  | 
775  |  |         // and the current character at [prevSrc..src[ is not a common case with cc=0  | 
776  |  |         // (MIN_NORMAL_MAYBE_YES or JAMO_VT).  | 
777  |  |         // It could still be a maybeYes with cc=0.  | 
778  | 0  |         if (prevSrc != fastStart) { | 
779  |  |             // The fast path looped over yes/0 characters before the current one.  | 
780  | 0  |             if (sink != nullptr &&  | 
781  | 0  |                     !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,  | 
782  | 0  |                                                    *sink, options, edits, errorCode)) { | 
783  | 0  |                 break;  | 
784  | 0  |             }  | 
785  | 0  |             prevBoundary = prevSrc;  | 
786  | 0  |             prevCC = 0;  | 
787  | 0  |         }  | 
788  |  |  | 
789  |  |         // Medium-fast path: Quick check.  | 
790  | 0  |         if (isMaybeOrNonZeroCC(norm16)) { | 
791  |  |             // Does not decompose.  | 
792  | 0  |             uint8_t cc = getCCFromYesOrMaybe(norm16);  | 
793  | 0  |             if (prevCC <= cc || cc == 0) { | 
794  | 0  |                 prevCC = cc;  | 
795  | 0  |                 if (cc <= 1) { | 
796  | 0  |                     if (sink != nullptr &&  | 
797  | 0  |                             !ByteSinkUtil::appendUnchanged(prevBoundary, src,  | 
798  | 0  |                                                            *sink, options, edits, errorCode)) { | 
799  | 0  |                         break;  | 
800  | 0  |                     }  | 
801  | 0  |                     prevBoundary = src;  | 
802  | 0  |                 }  | 
803  | 0  |                 continue;  | 
804  | 0  |             }  | 
805  | 0  |         }  | 
806  | 0  |         if (sink == nullptr) { | 
807  | 0  |             return prevBoundary;  // quick check: "no" or cc out of order  | 
808  | 0  |         }  | 
809  |  |  | 
810  |  |         // Slow path  | 
811  |  |         // Decompose up to and including the current character.  | 
812  | 0  |         if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) { | 
813  | 0  |             if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,  | 
814  | 0  |                                                *sink, options, edits, errorCode)) { | 
815  | 0  |                 break;  | 
816  | 0  |             }  | 
817  | 0  |             prevBoundary = prevSrc;  | 
818  | 0  |         }  | 
819  | 0  |         ReorderingBuffer buffer(*this, s16, errorCode);  | 
820  | 0  |         if (U_FAILURE(errorCode)) { | 
821  | 0  |             break;  | 
822  | 0  |         }  | 
823  | 0  |         decomposeShort(prevBoundary, src, STOP_AT_LIMIT, FALSE /* onlyContiguous */,  | 
824  | 0  |                        buffer, errorCode);  | 
825  |  |         // Decompose until the next boundary.  | 
826  | 0  |         if (buffer.getLastCC() > 1) { | 
827  | 0  |             src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, FALSE /* onlyContiguous */,  | 
828  | 0  |                                  buffer, errorCode);  | 
829  | 0  |         }  | 
830  | 0  |         if (U_FAILURE(errorCode)) { | 
831  | 0  |             break;  | 
832  | 0  |         }  | 
833  | 0  |         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals() | 
834  | 0  |             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;  | 
835  | 0  |             break;  | 
836  | 0  |         }  | 
837  |  |         // We already know there was a change if the original character decomposed;  | 
838  |  |         // otherwise compare.  | 
839  | 0  |         if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) { | 
840  | 0  |             if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,  | 
841  | 0  |                                                *sink, options, edits, errorCode)) { | 
842  | 0  |                 break;  | 
843  | 0  |             }  | 
844  | 0  |         } else { | 
845  | 0  |             if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),  | 
846  | 0  |                                             *sink, edits, errorCode)) { | 
847  | 0  |                 break;  | 
848  | 0  |             }  | 
849  | 0  |         }  | 
850  | 0  |         prevBoundary = src;  | 
851  | 0  |         prevCC = 0;  | 
852  | 0  |     }  | 
853  | 0  |     return src;  | 
854  | 0  | }  | 
855  |  |  | 
856  |  | const uint8_t *  | 
857  |  | Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,  | 
858  |  |                                 StopAt stopAt, UBool onlyContiguous,  | 
859  | 0  |                                 ReorderingBuffer &buffer, UErrorCode &errorCode) const { | 
860  | 0  |     if (U_FAILURE(errorCode)) { | 
861  | 0  |         return nullptr;  | 
862  | 0  |     }  | 
863  | 0  |     while (src < limit) { | 
864  | 0  |         const uint8_t *prevSrc = src;  | 
865  | 0  |         uint16_t norm16;  | 
866  | 0  |         UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);  | 
867  |  |         // Get the decomposition and the lead and trail cc's.  | 
868  | 0  |         UChar32 c = U_SENTINEL;  | 
869  | 0  |         if (norm16 >= limitNoNo) { | 
870  | 0  |             if (isMaybeOrNonZeroCC(norm16)) { | 
871  |  |                 // No comp boundaries around this character.  | 
872  | 0  |                 uint8_t cc = getCCFromYesOrMaybe(norm16);  | 
873  | 0  |                 if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) { | 
874  | 0  |                     return prevSrc;  | 
875  | 0  |                 }  | 
876  | 0  |                 c = codePointFromValidUTF8(prevSrc, src);  | 
877  | 0  |                 if (!buffer.append(c, cc, errorCode)) { | 
878  | 0  |                     return nullptr;  | 
879  | 0  |                 }  | 
880  | 0  |                 if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) { | 
881  | 0  |                     return src;  | 
882  | 0  |                 }  | 
883  | 0  |                 continue;  | 
884  | 0  |             }  | 
885  |  |             // Maps to an isCompYesAndZeroCC.  | 
886  | 0  |             if (stopAt != STOP_AT_LIMIT) { | 
887  | 0  |                 return prevSrc;  | 
888  | 0  |             }  | 
889  | 0  |             c = codePointFromValidUTF8(prevSrc, src);  | 
890  | 0  |             c = mapAlgorithmic(c, norm16);  | 
891  | 0  |             norm16 = getRawNorm16(c);  | 
892  | 0  |         } else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) { | 
893  | 0  |             return prevSrc;  | 
894  | 0  |         }  | 
895  |  |         // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.  | 
896  |  |         // We do not see invalid UTF-8 here because  | 
897  |  |         // its norm16==INERT is normalization-inert,  | 
898  |  |         // so it gets copied unchanged in the fast path,  | 
899  |  |         // and we stop the slow path where invalid UTF-8 begins.  | 
900  |  |         // c >= 0 is the result of an algorithmic mapping.  | 
901  | 0  |         U_ASSERT(c >= 0 || norm16 != INERT);  | 
902  | 0  |         if (norm16 < minYesNo) { | 
903  | 0  |             if (c < 0) { | 
904  | 0  |                 c = codePointFromValidUTF8(prevSrc, src);  | 
905  | 0  |             }  | 
906  |  |             // does not decompose  | 
907  | 0  |             if (!buffer.append(c, 0, errorCode)) { | 
908  | 0  |                 return nullptr;  | 
909  | 0  |             }  | 
910  | 0  |         } else if (isHangulLV(norm16) || isHangulLVT(norm16)) { | 
911  |  |             // Hangul syllable: decompose algorithmically  | 
912  | 0  |             if (c < 0) { | 
913  | 0  |                 c = codePointFromValidUTF8(prevSrc, src);  | 
914  | 0  |             }  | 
915  | 0  |             char16_t jamos[3];  | 
916  | 0  |             if (!buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode)) { | 
917  | 0  |                 return nullptr;  | 
918  | 0  |             }  | 
919  | 0  |         } else { | 
920  |  |             // The character decomposes, get everything from the variable-length extra data.  | 
921  | 0  |             const uint16_t *mapping = getMapping(norm16);  | 
922  | 0  |             uint16_t firstUnit = *mapping;  | 
923  | 0  |             int32_t length = firstUnit & MAPPING_LENGTH_MASK;  | 
924  | 0  |             uint8_t trailCC = (uint8_t)(firstUnit >> 8);  | 
925  | 0  |             uint8_t leadCC;  | 
926  | 0  |             if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) { | 
927  | 0  |                 leadCC = (uint8_t)(*(mapping-1) >> 8);  | 
928  | 0  |             } else { | 
929  | 0  |                 leadCC = 0;  | 
930  | 0  |             }  | 
931  | 0  |             if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) { | 
932  | 0  |                 return prevSrc;  | 
933  | 0  |             }  | 
934  | 0  |             if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) { | 
935  | 0  |                 return nullptr;  | 
936  | 0  |             }  | 
937  | 0  |         }  | 
938  | 0  |         if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||  | 
939  | 0  |                 (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) { | 
940  | 0  |             return src;  | 
941  | 0  |         }  | 
942  | 0  |     }  | 
943  | 0  |     return src;  | 
944  | 0  | }  | 
945  |  |  | 
946  |  | const UChar *  | 
947  | 0  | Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const { | 
948  | 0  |     uint16_t norm16;  | 
949  | 0  |     if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { | 
950  |  |         // c does not decompose  | 
951  | 0  |         return nullptr;  | 
952  | 0  |     }  | 
953  | 0  |     const UChar *decomp = nullptr;  | 
954  | 0  |     if(isDecompNoAlgorithmic(norm16)) { | 
955  |  |         // Maps to an isCompYesAndZeroCC.  | 
956  | 0  |         c=mapAlgorithmic(c, norm16);  | 
957  | 0  |         decomp=buffer;  | 
958  | 0  |         length=0;  | 
959  | 0  |         U16_APPEND_UNSAFE(buffer, length, c);  | 
960  |  |         // The mapping might decompose further.  | 
961  | 0  |         norm16 = getRawNorm16(c);  | 
962  | 0  |     }  | 
963  | 0  |     if (norm16 < minYesNo) { | 
964  | 0  |         return decomp;  | 
965  | 0  |     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { | 
966  |  |         // Hangul syllable: decompose algorithmically  | 
967  | 0  |         length=Hangul::decompose(c, buffer);  | 
968  | 0  |         return buffer;  | 
969  | 0  |     }  | 
970  |  |     // c decomposes, get everything from the variable-length extra data  | 
971  | 0  |     const uint16_t *mapping=getMapping(norm16);  | 
972  | 0  |     length=*mapping&MAPPING_LENGTH_MASK;  | 
973  | 0  |     return (const UChar *)mapping+1;  | 
974  | 0  | }  | 
975  |  |  | 
976  |  | // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1  | 
977  |  | // so that a raw mapping fits that consists of one unit ("rm0") | 
978  |  | // plus all but the first two code units of the normal mapping.  | 
979  |  | // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.  | 
980  |  | const UChar *  | 
981  | 0  | Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const { | 
982  | 0  |     uint16_t norm16;  | 
983  | 0  |     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { | 
984  |  |         // c does not decompose  | 
985  | 0  |         return NULL;  | 
986  | 0  |     } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { | 
987  |  |         // Hangul syllable: decompose algorithmically  | 
988  | 0  |         Hangul::getRawDecomposition(c, buffer);  | 
989  | 0  |         length=2;  | 
990  | 0  |         return buffer;  | 
991  | 0  |     } else if(isDecompNoAlgorithmic(norm16)) { | 
992  | 0  |         c=mapAlgorithmic(c, norm16);  | 
993  | 0  |         length=0;  | 
994  | 0  |         U16_APPEND_UNSAFE(buffer, length, c);  | 
995  | 0  |         return buffer;  | 
996  | 0  |     }  | 
997  |  |     // c decomposes, get everything from the variable-length extra data  | 
998  | 0  |     const uint16_t *mapping=getMapping(norm16);  | 
999  | 0  |     uint16_t firstUnit=*mapping;  | 
1000  | 0  |     int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping  | 
1001  | 0  |     if(firstUnit&MAPPING_HAS_RAW_MAPPING) { | 
1002  |  |         // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.  | 
1003  |  |         // Bit 7=MAPPING_HAS_CCC_LCCC_WORD  | 
1004  | 0  |         const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;  | 
1005  | 0  |         uint16_t rm0=*rawMapping;  | 
1006  | 0  |         if(rm0<=MAPPING_LENGTH_MASK) { | 
1007  | 0  |             length=rm0;  | 
1008  | 0  |             return (const UChar *)rawMapping-rm0;  | 
1009  | 0  |         } else { | 
1010  |  |             // Copy the normal mapping and replace its first two code units with rm0.  | 
1011  | 0  |             buffer[0]=(UChar)rm0;  | 
1012  | 0  |             u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);  | 
1013  | 0  |             length=mLength-1;  | 
1014  | 0  |             return buffer;  | 
1015  | 0  |         }  | 
1016  | 0  |     } else { | 
1017  | 0  |         length=mLength;  | 
1018  | 0  |         return (const UChar *)mapping+1;  | 
1019  | 0  |     }  | 
1020  | 0  | }  | 
1021  |  |  | 
1022  |  | void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,  | 
1023  |  |                                          UBool doDecompose,  | 
1024  |  |                                          UnicodeString &safeMiddle,  | 
1025  |  |                                          ReorderingBuffer &buffer,  | 
1026  | 0  |                                          UErrorCode &errorCode) const { | 
1027  | 0  |     buffer.copyReorderableSuffixTo(safeMiddle);  | 
1028  | 0  |     if(doDecompose) { | 
1029  | 0  |         decompose(src, limit, &buffer, errorCode);  | 
1030  | 0  |         return;  | 
1031  | 0  |     }  | 
1032  |  |     // Just merge the strings at the boundary.  | 
1033  | 0  |     bool isFirst = true;  | 
1034  | 0  |     uint8_t firstCC = 0, prevCC = 0, cc;  | 
1035  | 0  |     const UChar *p = src;  | 
1036  | 0  |     while (p != limit) { | 
1037  | 0  |         const UChar *codePointStart = p;  | 
1038  | 0  |         UChar32 c;  | 
1039  | 0  |         uint16_t norm16;  | 
1040  | 0  |         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);  | 
1041  | 0  |         if ((cc = getCC(norm16)) == 0) { | 
1042  | 0  |             p = codePointStart;  | 
1043  | 0  |             break;  | 
1044  | 0  |         }  | 
1045  | 0  |         if (isFirst) { | 
1046  | 0  |             firstCC = cc;  | 
1047  | 0  |             isFirst = false;  | 
1048  | 0  |         }  | 
1049  | 0  |         prevCC = cc;  | 
1050  | 0  |     }  | 
1051  | 0  |     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL | 
1052  | 0  |         limit=u_strchr(p, 0);  | 
1053  | 0  |     }  | 
1054  |  | 
  | 
1055  | 0  |     if (buffer.append(src, (int32_t)(p - src), FALSE, firstCC, prevCC, errorCode)) { | 
1056  | 0  |         buffer.appendZeroCC(p, limit, errorCode);  | 
1057  | 0  |     }  | 
1058  | 0  | }  | 
1059  |  |  | 
1060  | 0  | UBool Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c) const { | 
1061  | 0  |     return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||  | 
1062  | 0  |         norm16HasDecompBoundaryBefore(getNorm16(c));  | 
1063  | 0  | }  | 
1064  |  |  | 
1065  | 0  | UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const { | 
1066  | 0  |     if (norm16 < minNoNoCompNoMaybeCC) { | 
1067  | 0  |         return TRUE;  | 
1068  | 0  |     }  | 
1069  | 0  |     if (norm16 >= limitNoNo) { | 
1070  | 0  |         return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;  | 
1071  | 0  |     }  | 
1072  |  |     // c decomposes, get everything from the variable-length extra data  | 
1073  | 0  |     const uint16_t *mapping=getMapping(norm16);  | 
1074  | 0  |     uint16_t firstUnit=*mapping;  | 
1075  |  |     // TRUE if leadCC==0 (hasFCDBoundaryBefore())  | 
1076  | 0  |     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;  | 
1077  | 0  | }  | 
1078  |  |  | 
1079  | 0  | UBool Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c) const { | 
1080  | 0  |     if (c < minDecompNoCP) { | 
1081  | 0  |         return TRUE;  | 
1082  | 0  |     }  | 
1083  | 0  |     if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { | 
1084  | 0  |         return TRUE;  | 
1085  | 0  |     }  | 
1086  | 0  |     return norm16HasDecompBoundaryAfter(getNorm16(c));  | 
1087  | 0  | }  | 
1088  |  |  | 
1089  | 0  | UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const { | 
1090  | 0  |     if(norm16 <= minYesNo || isHangulLVT(norm16)) { | 
1091  | 0  |         return TRUE;  | 
1092  | 0  |     }  | 
1093  | 0  |     if (norm16 >= limitNoNo) { | 
1094  | 0  |         if (isMaybeOrNonZeroCC(norm16)) { | 
1095  | 0  |             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;  | 
1096  | 0  |         }  | 
1097  |  |         // Maps to an isCompYesAndZeroCC.  | 
1098  | 0  |         return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;  | 
1099  | 0  |     }  | 
1100  |  |     // c decomposes, get everything from the variable-length extra data  | 
1101  | 0  |     const uint16_t *mapping=getMapping(norm16);  | 
1102  | 0  |     uint16_t firstUnit=*mapping;  | 
1103  |  |     // decomp after-boundary: same as hasFCDBoundaryAfter(),  | 
1104  |  |     // fcd16<=1 || trailCC==0  | 
1105  | 0  |     if(firstUnit>0x1ff) { | 
1106  | 0  |         return FALSE;  // trailCC>1  | 
1107  | 0  |     }  | 
1108  | 0  |     if(firstUnit<=0xff) { | 
1109  | 0  |         return TRUE;  // trailCC==0  | 
1110  | 0  |     }  | 
1111  |  |     // if(trailCC==1) test leadCC==0, same as checking for before-boundary  | 
1112  |  |     // TRUE if leadCC==0 (hasFCDBoundaryBefore())  | 
1113  | 0  |     return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;  | 
1114  | 0  | }  | 
1115  |  |  | 
1116  |  | /*  | 
1117  |  |  * Finds the recomposition result for  | 
1118  |  |  * a forward-combining "lead" character,  | 
1119  |  |  * specified with a pointer to its compositions list,  | 
1120  |  |  * and a backward-combining "trail" character.  | 
1121  |  |  *  | 
1122  |  |  * If the lead and trail characters combine, then this function returns  | 
1123  |  |  * the following "compositeAndFwd" value:  | 
1124  |  |  * Bits 21..1  composite character  | 
1125  |  |  * Bit      0  set if the composite is a forward-combining starter  | 
1126  |  |  * otherwise it returns -1.  | 
1127  |  |  *  | 
1128  |  |  * The compositions list has (trail, compositeAndFwd) pair entries,  | 
1129  |  |  * encoded as either pairs or triples of 16-bit units.  | 
1130  |  |  * The last entry has the high bit of its first unit set.  | 
1131  |  |  *  | 
1132  |  |  * The list is sorted by ascending trail characters (there are no duplicates).  | 
1133  |  |  * A linear search is used.  | 
1134  |  |  *  | 
1135  |  |  * See normalizer2impl.h for a more detailed description  | 
1136  |  |  * of the compositions list format.  | 
1137  |  |  */  | 
1138  | 0  | int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { | 
1139  | 0  |     uint16_t key1, firstUnit;  | 
1140  | 0  |     if(trail<COMP_1_TRAIL_LIMIT) { | 
1141  |  |         // trail character is 0..33FF  | 
1142  |  |         // result entry may have 2 or 3 units  | 
1143  | 0  |         key1=(uint16_t)(trail<<1);  | 
1144  | 0  |         while(key1>(firstUnit=*list)) { | 
1145  | 0  |             list+=2+(firstUnit&COMP_1_TRIPLE);  | 
1146  | 0  |         }  | 
1147  | 0  |         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { | 
1148  | 0  |             if(firstUnit&COMP_1_TRIPLE) { | 
1149  | 0  |                 return ((int32_t)list[1]<<16)|list[2];  | 
1150  | 0  |             } else { | 
1151  | 0  |                 return list[1];  | 
1152  | 0  |             }  | 
1153  | 0  |         }  | 
1154  | 0  |     } else { | 
1155  |  |         // trail character is 3400..10FFFF  | 
1156  |  |         // result entry has 3 units  | 
1157  | 0  |         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+  | 
1158  | 0  |                         (((trail>>COMP_1_TRAIL_SHIFT))&  | 
1159  | 0  |                           ~COMP_1_TRIPLE));  | 
1160  | 0  |         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);  | 
1161  | 0  |         uint16_t secondUnit;  | 
1162  | 0  |         for(;;) { | 
1163  | 0  |             if(key1>(firstUnit=*list)) { | 
1164  | 0  |                 list+=2+(firstUnit&COMP_1_TRIPLE);  | 
1165  | 0  |             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { | 
1166  | 0  |                 if(key2>(secondUnit=list[1])) { | 
1167  | 0  |                     if(firstUnit&COMP_1_LAST_TUPLE) { | 
1168  | 0  |                         break;  | 
1169  | 0  |                     } else { | 
1170  | 0  |                         list+=3;  | 
1171  | 0  |                     }  | 
1172  | 0  |                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { | 
1173  | 0  |                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];  | 
1174  | 0  |                 } else { | 
1175  | 0  |                     break;  | 
1176  | 0  |                 }  | 
1177  | 0  |             } else { | 
1178  | 0  |                 break;  | 
1179  | 0  |             }  | 
1180  | 0  |         }  | 
1181  | 0  |     }  | 
1182  | 0  |     return -1;  | 
1183  | 0  | }  | 
1184  |  |  | 
1185  |  | /**  | 
1186  |  |   * @param list some character's compositions list  | 
1187  |  |   * @param set recursively receives the composites from these compositions  | 
1188  |  |   */  | 
1189  | 0  | void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const { | 
1190  | 0  |     uint16_t firstUnit;  | 
1191  | 0  |     int32_t compositeAndFwd;  | 
1192  | 0  |     do { | 
1193  | 0  |         firstUnit=*list;  | 
1194  | 0  |         if((firstUnit&COMP_1_TRIPLE)==0) { | 
1195  | 0  |             compositeAndFwd=list[1];  | 
1196  | 0  |             list+=2;  | 
1197  | 0  |         } else { | 
1198  | 0  |             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];  | 
1199  | 0  |             list+=3;  | 
1200  | 0  |         }  | 
1201  | 0  |         UChar32 composite=compositeAndFwd>>1;  | 
1202  | 0  |         if((compositeAndFwd&1)!=0) { | 
1203  | 0  |             addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);  | 
1204  | 0  |         }  | 
1205  | 0  |         set.add(composite);  | 
1206  | 0  |     } while((firstUnit&COMP_1_LAST_TUPLE)==0);  | 
1207  | 0  | }  | 
1208  |  |  | 
1209  |  | /*  | 
1210  |  |  * Recomposes the buffer text starting at recomposeStartIndex  | 
1211  |  |  * (which is in NFD - decomposed and canonically ordered),  | 
1212  |  |  * and truncates the buffer contents.  | 
1213  |  |  *  | 
1214  |  |  * Note that recomposition never lengthens the text:  | 
1215  |  |  * Any character consists of either one or two code units;  | 
1216  |  |  * a composition may contain at most one more code unit than the original starter,  | 
1217  |  |  * while the combining mark that is removed has at least one code unit.  | 
1218  |  |  */  | 
1219  |  | void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,  | 
1220  | 0  |                                 UBool onlyContiguous) const { | 
1221  | 0  |     UChar *p=buffer.getStart()+recomposeStartIndex;  | 
1222  | 0  |     UChar *limit=buffer.getLimit();  | 
1223  | 0  |     if(p==limit) { | 
1224  | 0  |         return;  | 
1225  | 0  |     }  | 
1226  |  |  | 
1227  | 0  |     UChar *starter, *pRemove, *q, *r;  | 
1228  | 0  |     const uint16_t *compositionsList;  | 
1229  | 0  |     UChar32 c, compositeAndFwd;  | 
1230  | 0  |     uint16_t norm16;  | 
1231  | 0  |     uint8_t cc, prevCC;  | 
1232  | 0  |     UBool starterIsSupplementary;  | 
1233  |  |  | 
1234  |  |     // Some of the following variables are not used until we have a forward-combining starter  | 
1235  |  |     // and are only initialized now to avoid compiler warnings.  | 
1236  | 0  |     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter  | 
1237  | 0  |     starter=NULL;  | 
1238  | 0  |     starterIsSupplementary=FALSE;  | 
1239  | 0  |     prevCC=0;  | 
1240  |  | 
  | 
1241  | 0  |     for(;;) { | 
1242  | 0  |         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);  | 
1243  | 0  |         cc=getCCFromYesOrMaybe(norm16);  | 
1244  | 0  |         if( // this character combines backward and  | 
1245  | 0  |             isMaybe(norm16) &&  | 
1246  |  |             // we have seen a starter that combines forward and  | 
1247  | 0  |             compositionsList!=NULL &&  | 
1248  |  |             // the backward-combining character is not blocked  | 
1249  | 0  |             (prevCC<cc || prevCC==0)  | 
1250  | 0  |         ) { | 
1251  | 0  |             if(isJamoVT(norm16)) { | 
1252  |  |                 // c is a Jamo V/T, see if we can compose it with the previous character.  | 
1253  | 0  |                 if(c<Hangul::JAMO_T_BASE) { | 
1254  |  |                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.  | 
1255  | 0  |                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);  | 
1256  | 0  |                     if(prev<Hangul::JAMO_L_COUNT) { | 
1257  | 0  |                         pRemove=p-1;  | 
1258  | 0  |                         UChar syllable=(UChar)  | 
1259  | 0  |                             (Hangul::HANGUL_BASE+  | 
1260  | 0  |                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*  | 
1261  | 0  |                              Hangul::JAMO_T_COUNT);  | 
1262  | 0  |                         UChar t;  | 
1263  | 0  |                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { | 
1264  | 0  |                             ++p;  | 
1265  | 0  |                             syllable+=t;  // The next character was a Jamo T.  | 
1266  | 0  |                         }  | 
1267  | 0  |                         *starter=syllable;  | 
1268  |  |                         // remove the Jamo V/T  | 
1269  | 0  |                         q=pRemove;  | 
1270  | 0  |                         r=p;  | 
1271  | 0  |                         while(r<limit) { | 
1272  | 0  |                             *q++=*r++;  | 
1273  | 0  |                         }  | 
1274  | 0  |                         limit=q;  | 
1275  | 0  |                         p=pRemove;  | 
1276  | 0  |                     }  | 
1277  | 0  |                 }  | 
1278  |  |                 /*  | 
1279  |  |                  * No "else" for Jamo T:  | 
1280  |  |                  * Since the input is in NFD, there are no Hangul LV syllables that  | 
1281  |  |                  * a Jamo T could combine with.  | 
1282  |  |                  * All Jamo Ts are combined above when handling Jamo Vs.  | 
1283  |  |                  */  | 
1284  | 0  |                 if(p==limit) { | 
1285  | 0  |                     break;  | 
1286  | 0  |                 }  | 
1287  | 0  |                 compositionsList=NULL;  | 
1288  | 0  |                 continue;  | 
1289  | 0  |             } else if((compositeAndFwd=combine(compositionsList, c))>=0) { | 
1290  |  |                 // The starter and the combining mark (c) do combine.  | 
1291  | 0  |                 UChar32 composite=compositeAndFwd>>1;  | 
1292  |  |  | 
1293  |  |                 // Replace the starter with the composite, remove the combining mark.  | 
1294  | 0  |                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark  | 
1295  | 0  |                 if(starterIsSupplementary) { | 
1296  | 0  |                     if(U_IS_SUPPLEMENTARY(composite)) { | 
1297  |  |                         // both are supplementary  | 
1298  | 0  |                         starter[0]=U16_LEAD(composite);  | 
1299  | 0  |                         starter[1]=U16_TRAIL(composite);  | 
1300  | 0  |                     } else { | 
1301  | 0  |                         *starter=(UChar)composite;  | 
1302  |  |                         // The composite is shorter than the starter,  | 
1303  |  |                         // move the intermediate characters forward one.  | 
1304  | 0  |                         starterIsSupplementary=FALSE;  | 
1305  | 0  |                         q=starter+1;  | 
1306  | 0  |                         r=q+1;  | 
1307  | 0  |                         while(r<pRemove) { | 
1308  | 0  |                             *q++=*r++;  | 
1309  | 0  |                         }  | 
1310  | 0  |                         --pRemove;  | 
1311  | 0  |                     }  | 
1312  | 0  |                 } else if(U_IS_SUPPLEMENTARY(composite)) { | 
1313  |  |                     // The composite is longer than the starter,  | 
1314  |  |                     // move the intermediate characters back one.  | 
1315  | 0  |                     starterIsSupplementary=TRUE;  | 
1316  | 0  |                     ++starter;  // temporarily increment for the loop boundary  | 
1317  | 0  |                     q=pRemove;  | 
1318  | 0  |                     r=++pRemove;  | 
1319  | 0  |                     while(starter<q) { | 
1320  | 0  |                         *--r=*--q;  | 
1321  | 0  |                     }  | 
1322  | 0  |                     *starter=U16_TRAIL(composite);  | 
1323  | 0  |                     *--starter=U16_LEAD(composite);  // undo the temporary increment  | 
1324  | 0  |                 } else { | 
1325  |  |                     // both are on the BMP  | 
1326  | 0  |                     *starter=(UChar)composite;  | 
1327  | 0  |                 }  | 
1328  |  |  | 
1329  |  |                 /* remove the combining mark by moving the following text over it */  | 
1330  | 0  |                 if(pRemove<p) { | 
1331  | 0  |                     q=pRemove;  | 
1332  | 0  |                     r=p;  | 
1333  | 0  |                     while(r<limit) { | 
1334  | 0  |                         *q++=*r++;  | 
1335  | 0  |                     }  | 
1336  | 0  |                     limit=q;  | 
1337  | 0  |                     p=pRemove;  | 
1338  | 0  |                 }  | 
1339  |  |                 // Keep prevCC because we removed the combining mark.  | 
1340  |  | 
  | 
1341  | 0  |                 if(p==limit) { | 
1342  | 0  |                     break;  | 
1343  | 0  |                 }  | 
1344  |  |                 // Is the composite a starter that combines forward?  | 
1345  | 0  |                 if(compositeAndFwd&1) { | 
1346  | 0  |                     compositionsList=  | 
1347  | 0  |                         getCompositionsListForComposite(getRawNorm16(composite));  | 
1348  | 0  |                 } else { | 
1349  | 0  |                     compositionsList=NULL;  | 
1350  | 0  |                 }  | 
1351  |  |  | 
1352  |  |                 // We combined; continue with looking for compositions.  | 
1353  | 0  |                 continue;  | 
1354  | 0  |             }  | 
1355  | 0  |         }  | 
1356  |  |  | 
1357  |  |         // no combination this time  | 
1358  | 0  |         prevCC=cc;  | 
1359  | 0  |         if(p==limit) { | 
1360  | 0  |             break;  | 
1361  | 0  |         }  | 
1362  |  |  | 
1363  |  |         // If c did not combine, then check if it is a starter.  | 
1364  | 0  |         if(cc==0) { | 
1365  |  |             // Found a new starter.  | 
1366  | 0  |             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) { | 
1367  |  |                 // It may combine with something, prepare for it.  | 
1368  | 0  |                 if(U_IS_BMP(c)) { | 
1369  | 0  |                     starterIsSupplementary=FALSE;  | 
1370  | 0  |                     starter=p-1;  | 
1371  | 0  |                 } else { | 
1372  | 0  |                     starterIsSupplementary=TRUE;  | 
1373  | 0  |                     starter=p-2;  | 
1374  | 0  |                 }  | 
1375  | 0  |             }  | 
1376  | 0  |         } else if(onlyContiguous) { | 
1377  |  |             // FCC: no discontiguous compositions; any intervening character blocks.  | 
1378  | 0  |             compositionsList=NULL;  | 
1379  | 0  |         }  | 
1380  | 0  |     }  | 
1381  | 0  |     buffer.setReorderingLimit(limit);  | 
1382  | 0  | }  | 
1383  |  |  | 
1384  |  | UChar32  | 
1385  | 0  | Normalizer2Impl::composePair(UChar32 a, UChar32 b) const { | 
1386  | 0  |     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16  | 
1387  | 0  |     const uint16_t *list;  | 
1388  | 0  |     if(isInert(norm16)) { | 
1389  | 0  |         return U_SENTINEL;  | 
1390  | 0  |     } else if(norm16<minYesNoMappingsOnly) { | 
1391  |  |         // a combines forward.  | 
1392  | 0  |         if(isJamoL(norm16)) { | 
1393  | 0  |             b-=Hangul::JAMO_V_BASE;  | 
1394  | 0  |             if(0<=b && b<Hangul::JAMO_V_COUNT) { | 
1395  | 0  |                 return  | 
1396  | 0  |                     (Hangul::HANGUL_BASE+  | 
1397  | 0  |                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*  | 
1398  | 0  |                      Hangul::JAMO_T_COUNT);  | 
1399  | 0  |             } else { | 
1400  | 0  |                 return U_SENTINEL;  | 
1401  | 0  |             }  | 
1402  | 0  |         } else if(isHangulLV(norm16)) { | 
1403  | 0  |             b-=Hangul::JAMO_T_BASE;  | 
1404  | 0  |             if(0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0! | 
1405  | 0  |                 return a+b;  | 
1406  | 0  |             } else { | 
1407  | 0  |                 return U_SENTINEL;  | 
1408  | 0  |             }  | 
1409  | 0  |         } else { | 
1410  |  |             // 'a' has a compositions list in extraData  | 
1411  | 0  |             list=getMapping(norm16);  | 
1412  | 0  |             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list | 
1413  | 0  |                 list+=  // mapping pointer  | 
1414  | 0  |                     1+  // +1 to skip the first unit with the mapping length  | 
1415  | 0  |                     (*list&MAPPING_LENGTH_MASK);  // + mapping length  | 
1416  | 0  |             }  | 
1417  | 0  |         }  | 
1418  | 0  |     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { | 
1419  | 0  |         return U_SENTINEL;  | 
1420  | 0  |     } else { | 
1421  | 0  |         list=getCompositionsListForMaybe(norm16);  | 
1422  | 0  |     }  | 
1423  | 0  |     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b | 
1424  | 0  |         return U_SENTINEL;  | 
1425  | 0  |     }  | 
1426  | 0  | #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC  | 
1427  | 0  |     return combine(list, b)>>1;  | 
1428  |  | #else  | 
1429  |  |     int32_t compositeAndFwd=combine(list, b);  | 
1430  |  |     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;  | 
1431  |  | #endif  | 
1432  | 0  | }  | 
1433  |  |  | 
1434  |  | // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.  | 
1435  |  | // doCompose: normalize  | 
1436  |  | // !doCompose: isNormalized (buffer must be empty and initialized)  | 
           //
           // src, limit: the input is [src, limit); limit==NULL means NUL-terminated text, in which
           //     case the low prefix is skipped/copied first and the limit is then found with u_strchr().
           // onlyContiguous: forwarded to the boundary tests; when set (FCC), a combining mark whose
           //     cc is lower than the trailing cc of the preceding text is sent to the slow path.
           // doCompose: TRUE appends the result to buffer; FALSE only tests the text and returns
           //     FALSE at the first place where the text would have to change.
           // Returns FALSE if the NUL-terminated prefix copy failed, or if !doCompose and a change
           // would be needed. Returns TRUE otherwise. Every "break" out of the main loop (failed
           // append or U_FAILURE(errorCode)) also ends in "return TRUE", so the return value alone
           // does not report errors; callers have to look at errorCode.
1437  |  | UBool  | 
1438  |  | Normalizer2Impl::compose(const UChar *src, const UChar *limit,  | 
1439  |  |                          UBool onlyContiguous,  | 
1440  |  |                          UBool doCompose,  | 
1441  |  |                          ReorderingBuffer &buffer,  | 
1442  | 0  |                          UErrorCode &errorCode) const { | 
1443  | 0  |     const UChar *prevBoundary=src;  | 
1444  | 0  |     UChar32 minNoMaybeCP=minCompNoMaybeCP;  | 
1445  | 0  |     if(limit==NULL) { | 
1446  | 0  |         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,  | 
1447  | 0  |                                            doCompose ? &buffer : NULL,  | 
1448  | 0  |                                            errorCode);  | 
1449  | 0  |         if(U_FAILURE(errorCode)) { | 
1450  | 0  |             return FALSE;  | 
1451  | 0  |         }  | 
1452  | 0  |         limit=u_strchr(src, 0);  | 
                     // A prefix was skipped. If its last code unit has no composition boundary after it,
                     // take that unit back (out of the buffer and out of the skipped range) so that it
                     // is processed together with the text that follows.
                     // NOTE(review): removeSuffix(1) is also reached when !doCompose, where nothing was
                     // copied into buffer; presumably harmless on an empty buffer, confirm in ReorderingBuffer.
1453  | 0  |         if (prevBoundary != src) { | 
1454  | 0  |             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) { | 
1455  | 0  |                 prevBoundary = src;  | 
1456  | 0  |             } else { | 
1457  | 0  |                 buffer.removeSuffix(1);  | 
1458  | 0  |                 prevBoundary = --src;  | 
1459  | 0  |             }  | 
1460  | 0  |         }  | 
1461  | 0  |     }  | 
1462  |  |  | 
1463  | 0  |     for (;;) { | 
1464  |  |         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,  | 
1465  |  |         // or with (compYes && ccc==0) properties.  | 
1466  | 0  |         const UChar *prevSrc;  | 
1467  | 0  |         UChar32 c = 0;  | 
1468  | 0  |         uint16_t norm16 = 0;  | 
1469  | 0  |         for (;;) { | 
                         // End of input on the fast path: flush the pending unchanged text
                         // [prevBoundary, limit). The append's return value is not checked here;
                         // TRUE is returned either way.
1470  | 0  |             if (src == limit) { | 
1471  | 0  |                 if (prevBoundary != limit && doCompose) { | 
1472  | 0  |                     buffer.appendZeroCC(prevBoundary, limit, errorCode);  | 
1473  | 0  |                 }  | 
1474  | 0  |                 return TRUE;  | 
1475  | 0  |             }  | 
1476  | 0  |             if( (c=*src)<minNoMaybeCP ||  | 
1477  | 0  |                 isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))  | 
1478  | 0  |             ) { | 
1479  | 0  |                 ++src;  | 
1480  | 0  |             } else { | 
                             // prevSrc remembers the first code unit of the character under inspection.
1481  | 0  |                 prevSrc = src++;  | 
1482  | 0  |                 if(!U16_IS_LEAD(c)) { | 
1483  | 0  |                     break;  | 
1484  | 0  |                 } else { | 
                                 // A lead surrogate that is not followed by a trail surrogate does not
                                 // leave the fast path: scanning simply continues after it.
1485  | 0  |                     UChar c2;  | 
1486  | 0  |                     if(src!=limit && U16_IS_TRAIL(c2=*src)) { | 
1487  | 0  |                         ++src;  | 
1488  | 0  |                         c=U16_GET_SUPPLEMENTARY(c, c2);  | 
1489  | 0  |                         norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);  | 
1490  | 0  |                         if(!isCompYesAndZeroCC(norm16)) { | 
1491  | 0  |                             break;  | 
1492  | 0  |                         }  | 
1493  | 0  |                     }  | 
1494  | 0  |                 }  | 
1495  | 0  |             }  | 
1496  | 0  |         }  | 
1497  |  |         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.  | 
1498  |  |         // The current character is either a "noNo" (has a mapping)  | 
1499  |  |         // or a "maybeYes" (combines backward)  | 
1500  |  |         // or a "yesYes" with ccc!=0.  | 
1501  |  |         // It is not a Hangul syllable or Jamo L because those have "yes" properties.  | 
1502  |  |  | 
1503  |  |         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.  | 
1504  | 0  |         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes | 
1505  | 0  |             if (!doCompose) { | 
1506  | 0  |                 return FALSE;  | 
1507  | 0  |             }  | 
1508  |  |             // Fast path for mapping a character that is immediately surrounded by boundaries.  | 
1509  |  |             // In this case, we need not decompose around the current character.  | 
1510  | 0  |             if (isDecompNoAlgorithmic(norm16)) { | 
1511  |  |                 // Maps to a single isCompYesAndZeroCC character  | 
1512  |  |                 // which also implies hasCompBoundaryBefore.  | 
1513  | 0  |                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||  | 
1514  | 0  |                         hasCompBoundaryBefore(src, limit)) { | 
1515  | 0  |                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { | 
1516  | 0  |                         break;  | 
1517  | 0  |                     }  | 
1518  | 0  |                     if(!buffer.append(mapAlgorithmic(c, norm16), 0, errorCode)) { | 
1519  | 0  |                         break;  | 
1520  | 0  |                     }  | 
1521  | 0  |                     prevBoundary = src;  | 
1522  | 0  |                     continue;  | 
1523  | 0  |                 }  | 
1524  | 0  |             } else if (norm16 < minNoNoCompBoundaryBefore) { | 
1525  |  |                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.  | 
1526  | 0  |                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||  | 
1527  | 0  |                         hasCompBoundaryBefore(src, limit)) { | 
1528  | 0  |                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { | 
1529  | 0  |                         break;  | 
1530  | 0  |                     }  | 
                                 // The first unit of the mapping carries its length in the
                                 // MAPPING_LENGTH_MASK bits; the mapped text follows that unit.
1531  | 0  |                     const UChar *mapping = reinterpret_cast<const UChar *>(getMapping(norm16));  | 
1532  | 0  |                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;  | 
1533  | 0  |                     if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) { | 
1534  | 0  |                         break;  | 
1535  | 0  |                     }  | 
1536  | 0  |                     prevBoundary = src;  | 
1537  | 0  |                     continue;  | 
1538  | 0  |                 }  | 
1539  | 0  |             } else if (norm16 >= minNoNoEmpty) { | 
1540  |  |                 // The current character maps to nothing.  | 
1541  |  |                 // Simply omit it from the output if there is a boundary before _or_ after it.  | 
1542  |  |                 // The character itself implies no boundaries.  | 
1543  | 0  |                 if (hasCompBoundaryBefore(src, limit) ||  | 
1544  | 0  |                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) { | 
1545  | 0  |                     if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { | 
1546  | 0  |                         break;  | 
1547  | 0  |                     }  | 
1548  | 0  |                     prevBoundary = src;  | 
1549  | 0  |                     continue;  | 
1550  | 0  |                 }  | 
1551  | 0  |             }  | 
1552  |  |             // Other "noNo" type, or need to examine more text around this character:  | 
1553  |  |             // Fall through to the slow path.  | 
1554  | 0  |         } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { | 
                         // prevBoundary != prevSrc guarantees that there is a code unit before prevSrc to read.
1555  | 0  |             UChar prev=*(prevSrc-1);  | 
1556  | 0  |             if(c<Hangul::JAMO_T_BASE) { | 
1557  |  |                 // The current character is a Jamo Vowel,  | 
1558  |  |                 // compose with previous Jamo L and following Jamo T.  | 
1559  | 0  |                 UChar l = (UChar)(prev-Hangul::JAMO_L_BASE);  | 
1560  | 0  |                 if(l<Hangul::JAMO_L_COUNT) { | 
1561  | 0  |                     if (!doCompose) { | 
1562  | 0  |                         return FALSE;  | 
1563  | 0  |                     }  | 
                                 // t>0: index of a following Jamo T; t==0: no T and a boundary follows;
                                 // t==-1: cannot decide here, use the slow path.
1564  | 0  |                     int32_t t;  | 
1565  | 0  |                     if (src != limit &&  | 
1566  | 0  |                             0 < (t = ((int32_t)*src - Hangul::JAMO_T_BASE)) &&  | 
1567  | 0  |                             t < Hangul::JAMO_T_COUNT) { | 
1568  |  |                         // The next character is a Jamo T.  | 
1569  | 0  |                         ++src;  | 
1570  | 0  |                     } else if (hasCompBoundaryBefore(src, limit)) { | 
1571  |  |                         // No Jamo T follows, not even via decomposition.  | 
1572  | 0  |                         t = 0;  | 
1573  | 0  |                     } else { | 
1574  | 0  |                         t = -1;  | 
1575  | 0  |                     }  | 
1576  | 0  |                     if (t >= 0) { | 
1577  | 0  |                         UChar32 syllable = Hangul::HANGUL_BASE +  | 
1578  | 0  |                             (l*Hangul::JAMO_V_COUNT + (c-Hangul::JAMO_V_BASE)) *  | 
1579  | 0  |                             Hangul::JAMO_T_COUNT + t;  | 
1580  | 0  |                         --prevSrc;  // Replace the Jamo L as well.  | 
1581  | 0  |                         if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { | 
1582  | 0  |                             break;  | 
1583  | 0  |                         }  | 
1584  | 0  |                         if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) { | 
1585  | 0  |                             break;  | 
1586  | 0  |                         }  | 
1587  | 0  |                         prevBoundary = src;  | 
1588  | 0  |                         continue;  | 
1589  | 0  |                     }  | 
1590  |  |                     // If we see L+V+x where x!=T then we drop to the slow path,  | 
1591  |  |                     // decompose and recompose.  | 
1592  |  |                     // This is to deal with NFKC finding normal L and V but a  | 
1593  |  |                     // compatibility variant of a T.  | 
1594  |  |                     // We need to either fully compose that combination here  | 
1595  |  |                     // (which would complicate the code and may not work with strange custom data)  | 
1596  |  |                     // or use the slow path.  | 
1597  | 0  |                 }  | 
1598  | 0  |             } else if (Hangul::isHangulLV(prev)) { | 
1599  |  |                 // The current character is a Jamo Trailing consonant,  | 
1600  |  |                 // compose with previous Hangul LV that does not contain a Jamo T.  | 
1601  | 0  |                 if (!doCompose) { | 
1602  | 0  |                     return FALSE;  | 
1603  | 0  |                 }  | 
1604  | 0  |                 UChar32 syllable = prev + c - Hangul::JAMO_T_BASE;  | 
1605  | 0  |                 --prevSrc;  // Replace the Hangul LV as well.  | 
1606  | 0  |                 if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { | 
1607  | 0  |                     break;  | 
1608  | 0  |                 }  | 
1609  | 0  |                 if(!buffer.appendBMP((UChar)syllable, 0, errorCode)) { | 
1610  | 0  |                     break;  | 
1611  | 0  |                 }  | 
1612  | 0  |                 prevBoundary = src;  | 
1613  | 0  |                 continue;  | 
1614  | 0  |             }  | 
1615  |  |             // No matching context, or may need to decompose surrounding text first:  | 
1616  |  |             // Fall through to the slow path.  | 
1617  | 0  |         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC | 
1618  |  |             // One or more combining marks that do not combine-back:  | 
1619  |  |             // Check for canonical order, copy unchanged if ok and  | 
1620  |  |             // if followed by a character with a boundary-before.  | 
1621  | 0  |             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0  | 
1622  | 0  |             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) { | 
1623  |  |                 // Fails FCD test, need to decompose and contiguously recompose.  | 
1624  | 0  |                 if (!doCompose) { | 
1625  | 0  |                     return FALSE;  | 
1626  | 0  |                 }  | 
1627  | 0  |             } else { | 
1628  |  |                 // If !onlyContiguous (not FCC), then we ignore the tccc of  | 
1629  |  |                 // the previous character which passed the quick check "yes && ccc==0" test.  | 
1630  | 0  |                 const UChar *nextSrc;  | 
1631  | 0  |                 uint16_t n16;  | 
                             // Walk over the following marks with n16>=MIN_YES_YES_WITH_CC while their cc
                             // values do not decrease. nextSrc looks one character ahead; src only
                             // advances once that character has been accepted.
1632  | 0  |                 for (;;) { | 
1633  | 0  |                     if (src == limit) { | 
1634  | 0  |                         if (doCompose) { | 
1635  | 0  |                             buffer.appendZeroCC(prevBoundary, limit, errorCode);  | 
1636  | 0  |                         }  | 
1637  | 0  |                         return TRUE;  | 
1638  | 0  |                     }  | 
1639  | 0  |                     uint8_t prevCC = cc;  | 
1640  | 0  |                     nextSrc = src;  | 
1641  | 0  |                     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16);  | 
1642  | 0  |                     if (n16 >= MIN_YES_YES_WITH_CC) { | 
1643  | 0  |                         cc = getCCFromNormalYesOrMaybe(n16);  | 
1644  | 0  |                         if (prevCC > cc) { | 
1645  | 0  |                             if (!doCompose) { | 
1646  | 0  |                                 return FALSE;  | 
1647  | 0  |                             }  | 
1648  | 0  |                             break;  | 
1649  | 0  |                         }  | 
1650  | 0  |                     } else { | 
1651  | 0  |                         break;  | 
1652  | 0  |                     }  | 
1653  | 0  |                     src = nextSrc;  | 
1654  | 0  |                 }  | 
1655  |  |                 // src is after the last in-order combining mark.  | 
1656  |  |                 // If there is a boundary here, then we continue with no change.  | 
1657  | 0  |                 if (norm16HasCompBoundaryBefore(n16)) { | 
1658  | 0  |                     if (isCompYesAndZeroCC(n16)) { | 
1659  | 0  |                         src = nextSrc;  | 
1660  | 0  |                     }  | 
1661  | 0  |                     continue;  | 
1662  | 0  |                 }  | 
1663  |  |                 // Use the slow path. There is no boundary in [prevSrc, src[.  | 
1664  | 0  |             }  | 
1665  | 0  |         }  | 
1666  |  |  | 
1667  |  |         // Slow path: Find the nearest boundaries around the current character,  | 
1668  |  |         // decompose and recompose.  | 
                     // If the current character has no boundary before it, look at the code point in front of
                     // prevSrc and pull it into the segment unless that code point has a boundary after it.
1669  | 0  |         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { | 
1670  | 0  |             const UChar *p = prevSrc;  | 
1671  | 0  |             UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16);  | 
1672  | 0  |             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { | 
1673  | 0  |                 prevSrc = p;  | 
1674  | 0  |             }  | 
1675  | 0  |         }  | 
1676  | 0  |         if (doCompose && prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { | 
1677  | 0  |             break;  | 
1678  | 0  |         }  | 
                     // Everything appended from here on is the segment that recompose() works on.
1679  | 0  |         int32_t recomposeStartIndex=buffer.length();  | 
1680  |  |         // We know there is not a boundary here.  | 
1681  | 0  |         decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,  | 
1682  | 0  |                        buffer, errorCode);  | 
1683  |  |         // Decompose until the next boundary.  | 
1684  | 0  |         src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,  | 
1685  | 0  |                              buffer, errorCode);  | 
1686  | 0  |         if (U_FAILURE(errorCode)) { | 
1687  | 0  |             break;  | 
1688  | 0  |         }  | 
1689  | 0  |         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals() | 
1690  | 0  |             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;  | 
1691  | 0  |             return TRUE;  | 
1692  | 0  |         }  | 
1693  | 0  |         recompose(buffer, recomposeStartIndex, onlyContiguous);  | 
                     // isNormalized mode: the recomposed segment must equal the original text
                     // [prevSrc, src); buffer.remove() is then called before the next segment is examined.
1694  | 0  |         if(!doCompose) { | 
1695  | 0  |             if(!buffer.equals(prevSrc, src)) { | 
1696  | 0  |                 return FALSE;  | 
1697  | 0  |             }  | 
1698  | 0  |             buffer.remove();  | 
1699  | 0  |         }  | 
1700  | 0  |         prevBoundary=src;  | 
1701  | 0  |     }  | 
                 // Only reached through one of the "break" statements in the main loop above.
1702  | 0  |     return TRUE;  | 
1703  | 0  | }  | 
1704  |  |  | 
1705  |  | // Very similar to compose(): Make the same changes in both places if relevant.  | 
1706  |  | // pQCResult==NULL: spanQuickCheckYes  | 
1707  |  | // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)  | 
           //
           // src, limit: the input is [src, limit); limit==NULL means NUL-terminated text.
           // onlyContiguous: forwarded to the boundary tests; when set (FCC), a combining mark whose
           //     cc is lower than the trailing cc of the preceding character ends the scan.
           // Returns limit when the scan reaches the end of the input, otherwise prevBoundary,
           // the last composition boundary found before the character that stopped the scan.
           // With pQCResult: UNORM_MAYBE is stored when a character with norm16<MIN_YES_YES_WITH_CC is
           // met in the combining-mark loop (scanning continues), and UNORM_NO is stored right before
           // the final "return prevBoundary". Without pQCResult the first such "maybe" character
           // ends the span.
1708  |  | const UChar *  | 
1709  |  | Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,  | 
1710  |  |                                    UBool onlyContiguous,  | 
1711  | 0  |                                    UNormalizationCheckResult *pQCResult) const { | 
1712  | 0  |     const UChar *prevBoundary=src;  | 
1713  | 0  |     UChar32 minNoMaybeCP=minCompNoMaybeCP;  | 
1714  | 0  |     if(limit==NULL) { | 
                     // NOTE(review): this local errorCode is never inspected after the call; with a NULL
                     // buffer there is presumably nothing that can fail, confirm in copyLowPrefixFromNulTerminated().
1715  | 0  |         UErrorCode errorCode=U_ZERO_ERROR;  | 
1716  | 0  |         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);  | 
1717  | 0  |         limit=u_strchr(src, 0);  | 
                     // A prefix was skipped: if its last code unit has no boundary after it,
                     // step back so that this unit is examined together with the following text.
1718  | 0  |         if (prevBoundary != src) { | 
1719  | 0  |             if (hasCompBoundaryAfter(*(src-1), onlyContiguous)) { | 
1720  | 0  |                 prevBoundary = src;  | 
1721  | 0  |             } else { | 
1722  | 0  |                 prevBoundary = --src;  | 
1723  | 0  |             }  | 
1724  | 0  |         }  | 
1725  | 0  |     }  | 
1726  |  | 
  | 
1727  | 0  |     for(;;) { | 
1728  |  |         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,  | 
1729  |  |         // or with (compYes && ccc==0) properties.  | 
1730  | 0  |         const UChar *prevSrc;  | 
1731  | 0  |         UChar32 c = 0;  | 
1732  | 0  |         uint16_t norm16 = 0;  | 
1733  | 0  |         for (;;) { | 
1734  | 0  |             if(src==limit) { | 
1735  | 0  |                 return src;  | 
1736  | 0  |             }  | 
1737  | 0  |             if( (c=*src)<minNoMaybeCP ||  | 
1738  | 0  |                 isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))  | 
1739  | 0  |             ) { | 
1740  | 0  |                 ++src;  | 
1741  | 0  |             } else { | 
                             // prevSrc remembers the first code unit of the character under inspection.
1742  | 0  |                 prevSrc = src++;  | 
1743  | 0  |                 if(!U16_IS_LEAD(c)) { | 
1744  | 0  |                     break;  | 
1745  | 0  |                 } else { | 
                                 // A lead surrogate that is not followed by a trail surrogate does not
                                 // leave the fast path: scanning simply continues after it.
1746  | 0  |                     UChar c2;  | 
1747  | 0  |                     if(src!=limit && U16_IS_TRAIL(c2=*src)) { | 
1748  | 0  |                         ++src;  | 
1749  | 0  |                         c=U16_GET_SUPPLEMENTARY(c, c2);  | 
1750  | 0  |                         norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);  | 
1751  | 0  |                         if(!isCompYesAndZeroCC(norm16)) { | 
1752  | 0  |                             break;  | 
1753  | 0  |                         }  | 
1754  | 0  |                     }  | 
1755  | 0  |                 }  | 
1756  | 0  |             }  | 
1757  | 0  |         }  | 
1758  |  |         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.  | 
1759  |  |         // The current character is either a "noNo" (has a mapping)  | 
1760  |  |         // or a "maybeYes" (combines backward)  | 
1761  |  |         // or a "yesYes" with ccc!=0.  | 
1762  |  |         // It is not a Hangul syllable or Jamo L because those have "yes" properties.  | 
1763  |  |  | 
                     // Move prevBoundary up to the nearest boundary at or before the current character.
                     // prevNorm16 stays INERT unless the preceding code point has no boundary after it;
                     // in that case prevBoundary is set to the start of that code point and its norm16 is kept.
1764  | 0  |         uint16_t prevNorm16 = INERT;  | 
1765  | 0  |         if (prevBoundary != prevSrc) { | 
1766  | 0  |             if (norm16HasCompBoundaryBefore(norm16)) { | 
1767  | 0  |                 prevBoundary = prevSrc;  | 
1768  | 0  |             } else { | 
1769  | 0  |                 const UChar *p = prevSrc;  | 
1770  | 0  |                 uint16_t n16;  | 
1771  | 0  |                 UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16);  | 
1772  | 0  |                 if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) { | 
1773  | 0  |                     prevBoundary = prevSrc;  | 
1774  | 0  |                 } else { | 
1775  | 0  |                     prevBoundary = p;  | 
1776  | 0  |                     prevNorm16 = n16;  | 
1777  | 0  |                 }  | 
1778  | 0  |             }  | 
1779  | 0  |         }  | 
1780  |  | 
  | 
1781  | 0  |         if(isMaybeOrNonZeroCC(norm16)) { | 
1782  | 0  |             uint8_t cc=getCCFromYesOrMaybe(norm16);  | 
1783  | 0  |             if (onlyContiguous /* FCC */ && cc != 0 &&  | 
1784  | 0  |                     getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { | 
1785  |  |                 // The [prevBoundary..prevSrc[ character  | 
1786  |  |                 // passed the quick check "yes && ccc==0" test  | 
1787  |  |                 // but is out of canonical order with the current combining mark.  | 
                             // Nothing is done in this branch; control falls through to the
                             // UNORM_NO / "return prevBoundary" code at the bottom of the loop.
1788  | 0  |             } else { | 
1789  |  |                 // If !onlyContiguous (not FCC), then we ignore the tccc of  | 
1790  |  |                 // the previous character which passed the quick check "yes && ccc==0" test.  | 
1791  | 0  |                 const UChar *nextSrc;  | 
                             // Walk over following characters that are "maybe" or have cc!=0 while they stay
                             // in order (cc==0 or cc not lower than the previous cc). nextSrc looks one
                             // character ahead; src only advances once that character has been accepted.
1792  | 0  |                 for (;;) { | 
1793  | 0  |                     if (norm16 < MIN_YES_YES_WITH_CC) { | 
1794  | 0  |                         if (pQCResult != nullptr) { | 
1795  | 0  |                             *pQCResult = UNORM_MAYBE;  | 
1796  | 0  |                         } else { | 
1797  | 0  |                             return prevBoundary;  | 
1798  | 0  |                         }  | 
1799  | 0  |                     }  | 
1800  | 0  |                     if (src == limit) { | 
1801  | 0  |                         return src;  | 
1802  | 0  |                     }  | 
1803  | 0  |                     uint8_t prevCC = cc;  | 
1804  | 0  |                     nextSrc = src;  | 
1805  | 0  |                     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16);  | 
1806  | 0  |                     if (isMaybeOrNonZeroCC(norm16)) { | 
1807  | 0  |                         cc = getCCFromYesOrMaybe(norm16);  | 
1808  | 0  |                         if (!(prevCC <= cc || cc == 0)) { | 
1809  | 0  |                             break;  | 
1810  | 0  |                         }  | 
1811  | 0  |                     } else { | 
1812  | 0  |                         break;  | 
1813  | 0  |                     }  | 
1814  | 0  |                     src = nextSrc;  | 
1815  | 0  |                 }  | 
1816  |  |                 // src is after the last in-order combining mark.  | 
                             // If the character that ended the loop is compYes with cc==0, a boundary lies
                             // right before it: record it, step over that character and resume the fast path.
1817  | 0  |                 if (isCompYesAndZeroCC(norm16)) { | 
1818  | 0  |                     prevBoundary = src;  | 
1819  | 0  |                     src = nextSrc;  | 
1820  | 0  |                     continue;  | 
1821  | 0  |                 }  | 
1822  | 0  |             }  | 
1823  | 0  |         }  | 
                     // Every remaining case ends the scan here. For quickCheck this overwrites
                     // any UNORM_MAYBE stored earlier.
1824  | 0  |         if(pQCResult!=NULL) { | 
1825  | 0  |             *pQCResult=UNORM_NO;  | 
1826  | 0  |         }  | 
1827  | 0  |         return prevBoundary;  | 
1828  | 0  |     }  | 
1829  | 0  | }  | 
1830  |  |  | 
1831  |  | void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,  | 
1832  |  |                                        UBool doCompose,  | 
1833  |  |                                        UBool onlyContiguous,  | 
1834  |  |                                        UnicodeString &safeMiddle,  | 
1835  |  |                                        ReorderingBuffer &buffer,  | 
1836  | 0  |                                        UErrorCode &errorCode) const { | 
1837  | 0  |     if(!buffer.isEmpty()) { | 
1838  | 0  |         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit, onlyContiguous);  | 
1839  | 0  |         if(src!=firstStarterInSrc) { | 
1840  | 0  |             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),  | 
1841  | 0  |                                                                     buffer.getLimit(), onlyContiguous);  | 
1842  | 0  |             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);  | 
1843  | 0  |             UnicodeString middle(lastStarterInDest, destSuffixLength);  | 
1844  | 0  |             buffer.removeSuffix(destSuffixLength);  | 
1845  | 0  |             safeMiddle=middle;  | 
1846  | 0  |             middle.append(src, (int32_t)(firstStarterInSrc-src));  | 
1847  | 0  |             const UChar *middleStart=middle.getBuffer();  | 
1848  | 0  |             compose(middleStart, middleStart+middle.length(), onlyContiguous,  | 
1849  | 0  |                     TRUE, buffer, errorCode);  | 
1850  | 0  |             if(U_FAILURE(errorCode)) { | 
1851  | 0  |                 return;  | 
1852  | 0  |             }  | 
1853  | 0  |             src=firstStarterInSrc;  | 
1854  | 0  |         }  | 
1855  | 0  |     }  | 
1856  | 0  |     if(doCompose) { | 
1857  | 0  |         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);  | 
1858  | 0  |     } else { | 
1859  | 0  |         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL | 
1860  | 0  |             limit=u_strchr(src, 0);  | 
1861  | 0  |         }  | 
1862  | 0  |         buffer.appendZeroCC(src, limit, errorCode);  | 
1863  | 0  |     }  | 
1864  | 0  | }  | 
1865  |  |  | 
1866  |  | UBool  | 
1867  |  | Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,  | 
1868  |  |                              const uint8_t *src, const uint8_t *limit,  | 
1869  | 0  |                              ByteSink *sink, Edits *edits, UErrorCode &errorCode) const { | 
                 // Composes the UTF-8 text [src, limit[ (contiguous-only composition, FCC,
                 // when onlyContiguous is set). limit must not be nullptr.
                 // - sink != nullptr: the result is written to *sink; changes are also
                 //   recorded in *edits when edits != nullptr.
                 // - sink == nullptr: check-only mode; FALSE is returned at the first place
                 //   where the output would differ from the input.
                 // Returns TRUE in all other cases. That includes leaving the main loop via
                 // "break" after an append helper reported failure or errorCode indicated a
                 // failure, so callers need to inspect errorCode as well.
1870  | 0  |     U_ASSERT(limit != nullptr);  | 
                 // Backing string for the ReorderingBuffer that the slow path constructs;
                 // declared once here, outside the main loop.
1871  | 0  |     UnicodeString s16;  | 
                 // Bytes below this lead byte are skipped by the fast path without a trie lookup.
1872  | 0  |     uint8_t minNoMaybeLead = leadByteForCP(minCompNoMaybeCP);  | 
                 // Start of the input that has been examined but not yet written to the sink.
                 // Unchanged text is flushed from here only when a change is emitted or at the end.
1873  | 0  |     const uint8_t *prevBoundary = src;  | 
1874  |  | 
  | 
1875  | 0  |     for (;;) { | 
1876  |  |         // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,  | 
1877  |  |         // or with (compYes && ccc==0) properties.  | 
1878  | 0  |         const uint8_t *prevSrc;  | 
1879  | 0  |         uint16_t norm16 = 0;  | 
1880  | 0  |         for (;;) { | 
1881  | 0  |             if (src == limit) { | 
                         // End of input: flush the pending unchanged tail (if any) and finish.
1882  | 0  |                 if (prevBoundary != limit && sink != nullptr) { | 
1883  | 0  |                     ByteSinkUtil::appendUnchanged(prevBoundary, limit,  | 
1884  | 0  |                                                   *sink, options, edits, errorCode);  | 
1885  | 0  |                 }  | 
1886  | 0  |                 return TRUE;  | 
1887  | 0  |             }  | 
1888  | 0  |             if (*src < minNoMaybeLead) { | 
1889  | 0  |                 ++src;  | 
1890  | 0  |             } else { | 
                         // prevSrc remembers the start of the character that src is advanced over.
1891  | 0  |                 prevSrc = src;  | 
1892  | 0  |                 UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);  | 
1893  | 0  |                 if (!isCompYesAndZeroCC(norm16)) { | 
1894  | 0  |                     break;  | 
1895  | 0  |                 }  | 
1896  | 0  |             }  | 
1897  | 0  |         }  | 
1898  |  |         // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.  | 
1899  |  |         // The current character is either a "noNo" (has a mapping)  | 
1900  |  |         // or a "maybeYes" (combines backward)  | 
1901  |  |         // or a "yesYes" with ccc!=0.  | 
1902  |  |         // It is not a Hangul syllable or Jamo L because those have "yes" properties.  | 
1903  |  |  | 
1904  |  |         // Medium-fast path: Handle cases that do not require full decomposition and recomposition.  | 
1905  | 0  |         if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes | 
                     // Check-only mode: this branch is for characters with a mapping, so report a change.
1906  | 0  |             if (sink == nullptr) { | 
1907  | 0  |                 return FALSE;  | 
1908  | 0  |             }  | 
1909  |  |             // Fast path for mapping a character that is immediately surrounded by boundaries.  | 
1910  |  |             // In this case, we need not decompose around the current character.  | 
1911  | 0  |             if (isDecompNoAlgorithmic(norm16)) { | 
1912  |  |                 // Maps to a single isCompYesAndZeroCC character  | 
1913  |  |                 // which also implies hasCompBoundaryBefore.  | 
1914  | 0  |                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||  | 
1915  | 0  |                         hasCompBoundaryBefore(src, limit)) { | 
1916  | 0  |                     if (prevBoundary != prevSrc &&  | 
1917  | 0  |                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,  | 
1918  | 0  |                                                            *sink, options, edits, errorCode)) { | 
1919  | 0  |                         break;  | 
1920  | 0  |                     }  | 
1921  | 0  |                     appendCodePointDelta(prevSrc, src, getAlgorithmicDelta(norm16), *sink, edits);  | 
1922  | 0  |                     prevBoundary = src;  | 
1923  | 0  |                     continue;  | 
1924  | 0  |                 }  | 
1925  | 0  |             } else if (norm16 < minNoNoCompBoundaryBefore) { | 
1926  |  |                 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.  | 
1927  | 0  |                 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||  | 
1928  | 0  |                         hasCompBoundaryBefore(src, limit)) { | 
1929  | 0  |                     if (prevBoundary != prevSrc &&  | 
1930  | 0  |                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,  | 
1931  | 0  |                                                            *sink, options, edits, errorCode)) { | 
1932  | 0  |                         break;  | 
1933  | 0  |                     }  | 
                             // The first unit of the mapping data carries the length in its low bits;
                             // the mapping text itself follows that unit.
1934  | 0  |                     const uint16_t *mapping = getMapping(norm16);  | 
1935  | 0  |                     int32_t length = *mapping++ & MAPPING_LENGTH_MASK;  | 
1936  | 0  |                     if (!ByteSinkUtil::appendChange(prevSrc, src, (const UChar *)mapping, length,  | 
1937  | 0  |                                                     *sink, edits, errorCode)) { | 
1938  | 0  |                         break;  | 
1939  | 0  |                     }  | 
1940  | 0  |                     prevBoundary = src;  | 
1941  | 0  |                     continue;  | 
1942  | 0  |                 }  | 
1943  | 0  |             } else if (norm16 >= minNoNoEmpty) { | 
1944  |  |                 // The current character maps to nothing.  | 
1945  |  |                 // Simply omit it from the output if there is a boundary before _or_ after it.  | 
1946  |  |                 // The character itself implies no boundaries.  | 
1947  | 0  |                 if (hasCompBoundaryBefore(src, limit) ||  | 
1948  | 0  |                         hasCompBoundaryAfter(prevBoundary, prevSrc, onlyContiguous)) { | 
1949  | 0  |                     if (prevBoundary != prevSrc &&  | 
1950  | 0  |                             !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,  | 
1951  | 0  |                                                            *sink, options, edits, errorCode)) { | 
1952  | 0  |                         break;  | 
1953  | 0  |                     }  | 
                             // Nothing is written to the sink; only the deletion is recorded.
1954  | 0  |                     if (edits != nullptr) { | 
1955  | 0  |                         edits->addReplace((int32_t)(src - prevSrc), 0);  | 
1956  | 0  |                     }  | 
1957  | 0  |                     prevBoundary = src;  | 
1958  | 0  |                     continue;  | 
1959  | 0  |                 }  | 
1960  | 0  |             }  | 
1961  |  |             // Other "noNo" type, or need to examine more text around this character:  | 
1962  |  |             // Fall through to the slow path.  | 
1963  | 0  |         } else if (isJamoVT(norm16)) { | 
1964  |  |             // Jamo L: E1 84 80..92  | 
1965  |  |             // Jamo V: E1 85 A1..B5  | 
1966  |  |             // Jamo T: E1 86 A8..E1 87 82  | 
1967  | 0  |             U_ASSERT((src - prevSrc) == 3 && *prevSrc == 0xe1);  | 
1968  | 0  |             UChar32 prev = previousHangulOrJamo(prevBoundary, prevSrc);  | 
                     // Second byte 0x85 identifies a Jamo V (see the byte ranges listed above).
1969  | 0  |             if (prevSrc[1] == 0x85) { | 
1970  |  |                 // The current character is a Jamo Vowel,  | 
1971  |  |                 // compose with previous Jamo L and following Jamo T.  | 
1972  | 0  |                 UChar32 l = prev - Hangul::JAMO_L_BASE;  | 
1973  | 0  |                 if ((uint32_t)l < Hangul::JAMO_L_COUNT) { | 
1974  | 0  |                     if (sink == nullptr) { | 
1975  | 0  |                         return FALSE;  | 
1976  | 0  |                     }  | 
1977  | 0  |                     int32_t t = getJamoTMinusBase(src, limit);  | 
1978  | 0  |                     if (t >= 0) { | 
1979  |  |                         // The next character is a Jamo T.  | 
1980  | 0  |                         src += 3;  | 
1981  | 0  |                     } else if (hasCompBoundaryBefore(src, limit)) { | 
1982  |  |                         // No Jamo T follows, not even via decomposition.  | 
1983  | 0  |                         t = 0;  | 
1984  | 0  |                     }  | 
                             // t stays negative only when a following character might still
                             // contribute a Jamo T; that case is left to the slow path below.
1985  | 0  |                     if (t >= 0) { | 
1986  | 0  |                         UChar32 syllable = Hangul::HANGUL_BASE +  | 
1987  | 0  |                             (l*Hangul::JAMO_V_COUNT + (prevSrc[2]-0xa1)) *  | 
1988  | 0  |                             Hangul::JAMO_T_COUNT + t;  | 
1989  | 0  |                         prevSrc -= 3;  // Replace the Jamo L as well.  | 
1990  | 0  |                         if (prevBoundary != prevSrc &&  | 
1991  | 0  |                                 !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,  | 
1992  | 0  |                                                                *sink, options, edits, errorCode)) { | 
1993  | 0  |                             break;  | 
1994  | 0  |                         }  | 
1995  | 0  |                         ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);  | 
1996  | 0  |                         prevBoundary = src;  | 
1997  | 0  |                         continue;  | 
1998  | 0  |                     }  | 
1999  |  |                     // If we see L+V+x where x!=T then we drop to the slow path,  | 
2000  |  |                     // decompose and recompose.  | 
2001  |  |                     // This is to deal with NFKC finding normal L and V but a  | 
2002  |  |                     // compatibility variant of a T.  | 
2003  |  |                     // We need to either fully compose that combination here  | 
2004  |  |                     // (which would complicate the code and may not work with strange custom data)  | 
2005  |  |                     // or use the slow path.  | 
2006  | 0  |                 }  | 
2007  | 0  |             } else if (Hangul::isHangulLV(prev)) { | 
2008  |  |                 // The current character is a Jamo Trailing consonant,  | 
2009  |  |                 // compose with previous Hangul LV that does not contain a Jamo T.  | 
2010  | 0  |                 if (sink == nullptr) { | 
2011  | 0  |                     return FALSE;  | 
2012  | 0  |                 }  | 
2013  | 0  |                 UChar32 syllable = prev + getJamoTMinusBase(prevSrc, src);  | 
2014  | 0  |                 prevSrc -= 3;  // Replace the Hangul LV as well.  | 
2015  | 0  |                 if (prevBoundary != prevSrc &&  | 
2016  | 0  |                         !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,  | 
2017  | 0  |                                                        *sink, options, edits, errorCode)) { | 
2018  | 0  |                     break;  | 
2019  | 0  |                 }  | 
2020  | 0  |                 ByteSinkUtil::appendCodePoint(prevSrc, src, syllable, *sink, edits);  | 
2021  | 0  |                 prevBoundary = src;  | 
2022  | 0  |                 continue;  | 
2023  | 0  |             }  | 
2024  |  |             // No matching context, or may need to decompose surrounding text first:  | 
2025  |  |             // Fall through to the slow path.  | 
2026  | 0  |         } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC | 
2027  |  |             // One or more combining marks that do not combine-back:  | 
2028  |  |             // Check for canonical order, copy unchanged if ok and  | 
2029  |  |             // if followed by a character with a boundary-before.  | 
2030  | 0  |             uint8_t cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0  | 
2031  | 0  |             if (onlyContiguous /* FCC */ && getPreviousTrailCC(prevBoundary, prevSrc) > cc) { | 
2032  |  |                 // Fails FCD test, need to decompose and contiguously recompose.  | 
2033  | 0  |                 if (sink == nullptr) { | 
2034  | 0  |                     return FALSE;  | 
2035  | 0  |                 }  | 
2036  | 0  |             } else { | 
2037  |  |                 // If !onlyContiguous (not FCC), then we ignore the tccc of  | 
2038  |  |                 // the previous character which passed the quick check "yes && ccc==0" test.  | 
2039  | 0  |                 const uint8_t *nextSrc;  | 
2040  | 0  |                 uint16_t n16;  | 
                         // Look ahead with nextSrc/n16; src only advances over marks that are in order.
2041  | 0  |                 for (;;) { | 
2042  | 0  |                     if (src == limit) { | 
2043  | 0  |                         if (sink != nullptr) { | 
2044  | 0  |                             ByteSinkUtil::appendUnchanged(prevBoundary, limit,  | 
2045  | 0  |                                                           *sink, options, edits, errorCode);  | 
2046  | 0  |                         }  | 
2047  | 0  |                         return TRUE;  | 
2048  | 0  |                     }  | 
2049  | 0  |                     uint8_t prevCC = cc;  | 
2050  | 0  |                     nextSrc = src;  | 
2051  | 0  |                     UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, n16);  | 
2052  | 0  |                     if (n16 >= MIN_YES_YES_WITH_CC) { | 
2053  | 0  |                         cc = getCCFromNormalYesOrMaybe(n16);  | 
2054  | 0  |                         if (prevCC > cc) { | 
                                     // Out of canonical order: a change is needed.
2055  | 0  |                             if (sink == nullptr) { | 
2056  | 0  |                                 return FALSE;  | 
2057  | 0  |                             }  | 
2058  | 0  |                             break;  | 
2059  | 0  |                         }  | 
2060  | 0  |                     } else { | 
2061  | 0  |                         break;  | 
2062  | 0  |                     }  | 
2063  | 0  |                     src = nextSrc;  | 
2064  | 0  |                 }  | 
2065  |  |                 // src is after the last in-order combining mark.  | 
2066  |  |                 // If there is a boundary here, then we continue with no change.  | 
2067  | 0  |                 if (norm16HasCompBoundaryBefore(n16)) { | 
                             // If the look-ahead character passes the fast-path test, step over it
                             // right away; otherwise src stays put and the main loop examines that
                             // character again.
2068  | 0  |                     if (isCompYesAndZeroCC(n16)) { | 
2069  | 0  |                         src = nextSrc;  | 
2070  | 0  |                     }  | 
2071  | 0  |                     continue;  | 
2072  | 0  |                 }  | 
2073  |  |                 // Use the slow path. There is no boundary in [prevSrc, src[.  | 
2074  | 0  |             }  | 
2075  | 0  |         }  | 
2076  |  |  | 
2077  |  |         // Slow path: Find the nearest boundaries around the current character,  | 
2078  |  |         // decompose and recompose.  | 
                 // If the code point just before prevSrc has no boundary after it, move prevSrc
                 // back over it so that it is decomposed together with the current character.
                 // Note that norm16 is overwritten with that previous code point's value here.
2079  | 0  |         if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { | 
2080  | 0  |             const uint8_t *p = prevSrc;  | 
2081  | 0  |             UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16);  | 
2082  | 0  |             if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { | 
2083  | 0  |                 prevSrc = p;  | 
2084  | 0  |             }  | 
2085  | 0  |         }  | 
2086  | 0  |         ReorderingBuffer buffer(*this, s16, errorCode);  | 
2087  | 0  |         if (U_FAILURE(errorCode)) { | 
2088  | 0  |             break;  | 
2089  | 0  |         }  | 
2090  |  |         // We know there is not a boundary here.  | 
2091  | 0  |         decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,  | 
2092  | 0  |                        buffer, errorCode);  | 
2093  |  |         // Decompose until the next boundary.  | 
2094  | 0  |         src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,  | 
2095  | 0  |                              buffer, errorCode);  | 
2096  | 0  |         if (U_FAILURE(errorCode)) { | 
2097  | 0  |             break;  | 
2098  | 0  |         }  | 
2099  | 0  |         if ((src - prevSrc) > INT32_MAX) {  // guard before buffer.equals() | 
2100  | 0  |             errorCode = U_INDEX_OUTOFBOUNDS_ERROR;  | 
2101  | 0  |             return TRUE;  | 
2102  | 0  |         }  | 
2103  | 0  |         recompose(buffer, 0, onlyContiguous);  | 
                 // If the recomposed text equals the input span, nothing is emitted and
                 // prevBoundary is left alone, so the span stays part of the pending unchanged text.
2104  | 0  |         if (!buffer.equals(prevSrc, src)) { | 
2105  | 0  |             if (sink == nullptr) { | 
2106  | 0  |                 return FALSE;  | 
2107  | 0  |             }  | 
2108  | 0  |             if (prevBoundary != prevSrc &&  | 
2109  | 0  |                     !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,  | 
2110  | 0  |                                                    *sink, options, edits, errorCode)) { | 
2111  | 0  |                 break;  | 
2112  | 0  |             }  | 
2113  | 0  |             if (!ByteSinkUtil::appendChange(prevSrc, src, buffer.getStart(), buffer.length(),  | 
2114  | 0  |                                             *sink, edits, errorCode)) { | 
2115  | 0  |                 break;  | 
2116  | 0  |             }  | 
2117  | 0  |             prevBoundary = src;  | 
2118  | 0  |         }  | 
2119  | 0  |     }  | 
             // Reached only via "break" from the main loop (failure paths above).
2120  | 0  |     return TRUE;  | 
2121  | 0  | }  | 
2122  |  |  | 
2123  | 0  | UBool Normalizer2Impl::hasCompBoundaryBefore(const UChar *src, const UChar *limit) const { | 
2124  | 0  |     if (src == limit || *src < minCompNoMaybeCP) { | 
2125  | 0  |         return TRUE;  | 
2126  | 0  |     }  | 
2127  | 0  |     UChar32 c;  | 
2128  | 0  |     uint16_t norm16;  | 
2129  | 0  |     UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);  | 
2130  | 0  |     return norm16HasCompBoundaryBefore(norm16);  | 
2131  | 0  | }  | 
2132  |  |  | 
2133  | 0  | UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const { | 
2134  | 0  |     if (src == limit) { | 
2135  | 0  |         return TRUE;  | 
2136  | 0  |     }  | 
2137  | 0  |     uint16_t norm16;  | 
2138  | 0  |     UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);  | 
2139  | 0  |     return norm16HasCompBoundaryBefore(norm16);  | 
2140  | 0  | }  | 
2141  |  |  | 
2142  |  | UBool Normalizer2Impl::hasCompBoundaryAfter(const UChar *start, const UChar *p,  | 
2143  | 0  |                                             UBool onlyContiguous) const { | 
2144  | 0  |     if (start == p) { | 
2145  | 0  |         return TRUE;  | 
2146  | 0  |     }  | 
2147  | 0  |     UChar32 c;  | 
2148  | 0  |     uint16_t norm16;  | 
2149  | 0  |     UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);  | 
2150  | 0  |     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);  | 
2151  | 0  | }  | 
2152  |  |  | 
2153  |  | UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,  | 
2154  | 0  |                                             UBool onlyContiguous) const { | 
2155  | 0  |     if (start == p) { | 
2156  | 0  |         return TRUE;  | 
2157  | 0  |     }  | 
2158  | 0  |     uint16_t norm16;  | 
2159  | 0  |     UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, norm16);  | 
2160  | 0  |     return norm16HasCompBoundaryAfter(norm16, onlyContiguous);  | 
2161  | 0  | }  | 
2162  |  |  | 
2163  |  | const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p,  | 
2164  | 0  |                                                        UBool onlyContiguous) const { | 
2165  | 0  |     while (p != start) { | 
2166  | 0  |         const UChar *codePointLimit = p;  | 
2167  | 0  |         UChar32 c;  | 
2168  | 0  |         uint16_t norm16;  | 
2169  | 0  |         UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);  | 
2170  | 0  |         if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { | 
2171  | 0  |             return codePointLimit;  | 
2172  | 0  |         }  | 
2173  | 0  |         if (hasCompBoundaryBefore(c, norm16)) { | 
2174  | 0  |             return p;  | 
2175  | 0  |         }  | 
2176  | 0  |     }  | 
2177  | 0  |     return p;  | 
2178  | 0  | }  | 
2179  |  |  | 
2180  |  | const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit,  | 
2181  | 0  |                                                    UBool onlyContiguous) const { | 
2182  | 0  |     while (p != limit) { | 
2183  | 0  |         const UChar *codePointStart = p;  | 
2184  | 0  |         UChar32 c;  | 
2185  | 0  |         uint16_t norm16;  | 
2186  | 0  |         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);  | 
2187  | 0  |         if (hasCompBoundaryBefore(c, norm16)) { | 
2188  | 0  |             return codePointStart;  | 
2189  | 0  |         }  | 
2190  | 0  |         if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { | 
2191  | 0  |             return p;  | 
2192  | 0  |         }  | 
2193  | 0  |     }  | 
2194  | 0  |     return p;  | 
2195  | 0  | }  | 
2196  |  |  | 
2197  | 0  | uint8_t Normalizer2Impl::getPreviousTrailCC(const UChar *start, const UChar *p) const { | 
2198  | 0  |     if (start == p) { | 
2199  | 0  |         return 0;  | 
2200  | 0  |     }  | 
2201  | 0  |     int32_t i = (int32_t)(p - start);  | 
2202  | 0  |     UChar32 c;  | 
2203  | 0  |     U16_PREV(start, 0, i, c);  | 
2204  | 0  |     return (uint8_t)getFCD16(c);  | 
2205  | 0  | }  | 
2206  |  |  | 
2207  | 0  | uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const { | 
2208  | 0  |     if (start == p) { | 
2209  | 0  |         return 0;  | 
2210  | 0  |     }  | 
2211  | 0  |     int32_t i = (int32_t)(p - start);  | 
2212  | 0  |     UChar32 c;  | 
2213  | 0  |     U8_PREV(start, 0, i, c);  | 
2214  | 0  |     return (uint8_t)getFCD16(c);  | 
2215  | 0  | }  | 
2216  |  |  | 
2217  |  | // Note: normalizer2impl.cpp r30982 (2011-nov-27)  | 
2218  |  | // still had getFCDTrie() which built and cached an FCD trie.  | 
2219  |  | // That provided faster access to FCD data than getFCD16FromNormData()  | 
2220  |  | // but required synchronization and consumed some 10kB of heap memory  | 
2221  |  | // in any process that uses FCD (e.g., via collation).  | 
2222  |  | // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,  | 
2223  |  | // at least for ASCII & CJK.  | 
2224  |  |  | 
2225  |  | // Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this  | 
2226  |  | // function on Windows ARM64. As a work-around, we disable optimizations for this function.  | 
2227  |  | // This work-around could/should be removed once the following versions of Visual Studio are no  | 
2228  |  | // longer supported: All versions of VS2017, and versions of VS2019 below 16.4.  | 
2229  |  | #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))  | 
2230  |  | #pragma optimize( "", off )  | 
2231  |  | #endif  | 
2232  |  | // Gets the FCD value from the regular normalization data.  | 
2233  | 0  | uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { | 
2234  | 0  |     uint16_t norm16=getNorm16(c);  | 
2235  | 0  |     if (norm16 >= limitNoNo) { | 
2236  | 0  |         if(norm16>=MIN_NORMAL_MAYBE_YES) { | 
2237  |  |             // combining mark  | 
2238  | 0  |             norm16=getCCFromNormalYesOrMaybe(norm16);  | 
2239  | 0  |             return norm16|(norm16<<8);  | 
2240  | 0  |         } else if(norm16>=minMaybeYes) { | 
2241  | 0  |             return 0;  | 
2242  | 0  |         } else {  // isDecompNoAlgorithmic(norm16) | 
2243  | 0  |             uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK;  | 
2244  | 0  |             if (deltaTrailCC <= DELTA_TCCC_1) { | 
2245  | 0  |                 return deltaTrailCC >> OFFSET_SHIFT;  | 
2246  | 0  |             }  | 
2247  |  |             // Maps to an isCompYesAndZeroCC.  | 
2248  | 0  |             c=mapAlgorithmic(c, norm16);  | 
2249  | 0  |             norm16=getRawNorm16(c);  | 
2250  | 0  |         }  | 
2251  | 0  |     }  | 
2252  | 0  |     if(norm16<=minYesNo || isHangulLVT(norm16)) { | 
2253  |  |         // no decomposition or Hangul syllable, all zeros  | 
2254  | 0  |         return 0;  | 
2255  | 0  |     }  | 
2256  |  |     // c decomposes, get everything from the variable-length extra data  | 
2257  | 0  |     const uint16_t *mapping=getMapping(norm16);  | 
2258  | 0  |     uint16_t firstUnit=*mapping;  | 
2259  | 0  |     norm16=firstUnit>>8;  // tccc  | 
2260  | 0  |     if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { | 
2261  | 0  |         norm16|=*(mapping-1)&0xff00;  // lccc  | 
2262  | 0  |     }  | 
2263  | 0  |     return norm16;  | 
2264  | 0  | }  | 
2265  |  | #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))  | 
2266  |  | #pragma optimize( "", on )  | 
2267  |  | #endif  | 
2268  |  |  | 
2269  |  | // Dual functionality:  | 
2270  |  | // buffer!=NULL: normalize  | 
2271  |  | // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes  | 
2272  |  | const UChar *  | 
2273  |  | Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,  | 
2274  |  |                          ReorderingBuffer *buffer,  | 
2275  | 0  |                          UErrorCode &errorCode) const { | 
2276  |  |     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.  | 
2277  |  |     // Similar to the prevBoundary in the compose() implementation.  | 
2278  | 0  |     const UChar *prevBoundary=src;  | 
2279  | 0  |     int32_t prevFCD16=0;  | 
2280  | 0  |     if(limit==NULL) { | 
2281  | 0  |         src=copyLowPrefixFromNulTerminated(src, minLcccCP, buffer, errorCode);  | 
2282  | 0  |         if(U_FAILURE(errorCode)) { | 
2283  | 0  |             return src;  | 
2284  | 0  |         }  | 
2285  | 0  |         if(prevBoundary<src) { | 
2286  | 0  |             prevBoundary=src;  | 
2287  |  |             // We know that the previous character's lccc==0.  | 
2288  |  |             // Fetching the fcd16 value was deferred for this below-U+0300 code point.  | 
2289  | 0  |             prevFCD16=getFCD16(*(src-1));  | 
2290  | 0  |             if(prevFCD16>1) { | 
2291  | 0  |                 --prevBoundary;  | 
2292  | 0  |             }  | 
2293  | 0  |         }  | 
2294  | 0  |         limit=u_strchr(src, 0);  | 
2295  | 0  |     }  | 
2296  |  |  | 
2297  |  |     // Note: In this function we use buffer->appendZeroCC() because we track  | 
2298  |  |     // the lead and trail combining classes here, rather than leaving it to  | 
2299  |  |     // the ReorderingBuffer.  | 
2300  |  |     // The exception is the call to decomposeShort() which uses the buffer  | 
2301  |  |     // in the normal way.  | 
2302  |  |  | 
2303  | 0  |     const UChar *prevSrc;  | 
2304  | 0  |     UChar32 c=0;  | 
2305  | 0  |     uint16_t fcd16=0;  | 
2306  |  | 
  | 
2307  | 0  |     for(;;) { | 
2308  |  |         // count code units with lccc==0  | 
2309  | 0  |         for(prevSrc=src; src!=limit;) { | 
2310  | 0  |             if((c=*src)<minLcccCP) { | 
2311  | 0  |                 prevFCD16=~c;  | 
2312  | 0  |                 ++src;  | 
2313  | 0  |             } else if(!singleLeadMightHaveNonZeroFCD16(c)) { | 
2314  | 0  |                 prevFCD16=0;  | 
2315  | 0  |                 ++src;  | 
2316  | 0  |             } else { | 
2317  | 0  |                 if(U16_IS_LEAD(c)) { | 
2318  | 0  |                     UChar c2;  | 
2319  | 0  |                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) { | 
2320  | 0  |                         c=U16_GET_SUPPLEMENTARY(c, c2);  | 
2321  | 0  |                     }  | 
2322  | 0  |                 }  | 
2323  | 0  |                 if((fcd16=getFCD16FromNormData(c))<=0xff) { | 
2324  | 0  |                     prevFCD16=fcd16;  | 
2325  | 0  |                     src+=U16_LENGTH(c);  | 
2326  | 0  |                 } else { | 
2327  | 0  |                     break;  | 
2328  | 0  |                 }  | 
2329  | 0  |             }  | 
2330  | 0  |         }  | 
2331  |  |         // copy these code units all at once  | 
2332  | 0  |         if(src!=prevSrc) { | 
2333  | 0  |             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) { | 
2334  | 0  |                 break;  | 
2335  | 0  |             }  | 
2336  | 0  |             if(src==limit) { | 
2337  | 0  |                 break;  | 
2338  | 0  |             }  | 
2339  | 0  |             prevBoundary=src;  | 
2340  |  |             // We know that the previous character's lccc==0.  | 
2341  | 0  |             if(prevFCD16<0) { | 
2342  |  |                 // Fetching the fcd16 value was deferred for this below-minLcccCP code point.  | 
2343  | 0  |                 UChar32 prev=~prevFCD16;  | 
2344  | 0  |                 if(prev<minDecompNoCP) { | 
2345  | 0  |                     prevFCD16=0;  | 
2346  | 0  |                 } else { | 
2347  | 0  |                     prevFCD16=getFCD16FromNormData(prev);  | 
2348  | 0  |                     if(prevFCD16>1) { | 
2349  | 0  |                         --prevBoundary;  | 
2350  | 0  |                     }  | 
2351  | 0  |                 }  | 
2352  | 0  |             } else { | 
2353  | 0  |                 const UChar *p=src-1;  | 
2354  | 0  |                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) { | 
2355  | 0  |                     --p;  | 
2356  |  |                     // Need to fetch the previous character's FCD value because  | 
2357  |  |                     // prevFCD16 was just for the trail surrogate code point.  | 
2358  | 0  |                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));  | 
2359  |  |                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.  | 
2360  | 0  |                 }  | 
2361  | 0  |                 if(prevFCD16>1) { | 
2362  | 0  |                     prevBoundary=p;  | 
2363  | 0  |                 }  | 
2364  | 0  |             }  | 
2365  |  |             // The start of the current character (c).  | 
2366  | 0  |             prevSrc=src;  | 
2367  | 0  |         } else if(src==limit) { | 
2368  | 0  |             break;  | 
2369  | 0  |         }  | 
2370  |  |  | 
2371  | 0  |         src+=U16_LENGTH(c);  | 
2372  |  |         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.  | 
2373  |  |         // Check for proper order, and decompose locally if necessary.  | 
2374  | 0  |         if((prevFCD16&0xff)<=(fcd16>>8)) { | 
2375  |  |             // proper order: prev tccc <= current lccc  | 
2376  | 0  |             if((fcd16&0xff)<=1) { | 
2377  | 0  |                 prevBoundary=src;  | 
2378  | 0  |             }  | 
2379  | 0  |             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) { | 
2380  | 0  |                 break;  | 
2381  | 0  |             }  | 
2382  | 0  |             prevFCD16=fcd16;  | 
2383  | 0  |             continue;  | 
2384  | 0  |         } else if(buffer==NULL) { | 
2385  | 0  |             return prevBoundary;  // quick check "no"  | 
2386  | 0  |         } else { | 
2387  |  |             /*  | 
2388  |  |              * Back out the part of the source that we copied or appended  | 
2389  |  |              * already but is now going to be decomposed.  | 
2390  |  |              * prevSrc is set to after what was copied/appended.  | 
2391  |  |              */  | 
2392  | 0  |             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));  | 
2393  |  |             /*  | 
2394  |  |              * Find the part of the source that needs to be decomposed,  | 
2395  |  |              * up to the next safe boundary.  | 
2396  |  |              */  | 
2397  | 0  |             src=findNextFCDBoundary(src, limit);  | 
2398  |  |             /*  | 
2399  |  |              * The source text does not fulfill the conditions for FCD.  | 
2400  |  |              * Decompose and reorder a limited piece of the text.  | 
2401  |  |              */  | 
2402  | 0  |             decomposeShort(prevBoundary, src, FALSE, FALSE, *buffer, errorCode);  | 
2403  | 0  |             if (U_FAILURE(errorCode)) { | 
2404  | 0  |                 break;  | 
2405  | 0  |             }  | 
2406  | 0  |             prevBoundary=src;  | 
2407  | 0  |             prevFCD16=0;  | 
2408  | 0  |         }  | 
2409  | 0  |     }  | 
2410  | 0  |     return src;  | 
2411  | 0  | }  | 
2412  |  |  | 
2413  |  | void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,  | 
2414  |  |                                        UBool doMakeFCD,  | 
2415  |  |                                        UnicodeString &safeMiddle,  | 
2416  |  |                                        ReorderingBuffer &buffer,  | 
2417  | 0  |                                        UErrorCode &errorCode) const { | 
2418  | 0  |     if(!buffer.isEmpty()) { | 
2419  | 0  |         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);  | 
2420  | 0  |         if(src!=firstBoundaryInSrc) { | 
2421  | 0  |             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),  | 
2422  | 0  |                                                                     buffer.getLimit());  | 
2423  | 0  |             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);  | 
2424  | 0  |             UnicodeString middle(lastBoundaryInDest, destSuffixLength);  | 
2425  | 0  |             buffer.removeSuffix(destSuffixLength);  | 
2426  | 0  |             safeMiddle=middle;  | 
2427  | 0  |             middle.append(src, (int32_t)(firstBoundaryInSrc-src));  | 
2428  | 0  |             const UChar *middleStart=middle.getBuffer();  | 
2429  | 0  |             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);  | 
2430  | 0  |             if(U_FAILURE(errorCode)) { | 
2431  | 0  |                 return;  | 
2432  | 0  |             }  | 
2433  | 0  |             src=firstBoundaryInSrc;  | 
2434  | 0  |         }  | 
2435  | 0  |     }  | 
2436  | 0  |     if(doMakeFCD) { | 
2437  | 0  |         makeFCD(src, limit, &buffer, errorCode);  | 
2438  | 0  |     } else { | 
2439  | 0  |         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL | 
2440  | 0  |             limit=u_strchr(src, 0);  | 
2441  | 0  |         }  | 
2442  | 0  |         buffer.appendZeroCC(src, limit, errorCode);  | 
2443  | 0  |     }  | 
2444  | 0  | }  | 
2445  |  |  | 
2446  | 0  | const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const { | 
2447  | 0  |     while(start<p) { | 
2448  | 0  |         const UChar *codePointLimit = p;  | 
2449  | 0  |         UChar32 c;  | 
2450  | 0  |         uint16_t norm16;  | 
2451  | 0  |         UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);  | 
2452  | 0  |         if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16)) { | 
2453  | 0  |             return codePointLimit;  | 
2454  | 0  |         }  | 
2455  | 0  |         if (norm16HasDecompBoundaryBefore(norm16)) { | 
2456  | 0  |             return p;  | 
2457  | 0  |         }  | 
2458  | 0  |     }  | 
2459  | 0  |     return p;  | 
2460  | 0  | }  | 
2461  |  |  | 
2462  | 0  | const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const { | 
2463  | 0  |     while(p<limit) { | 
2464  | 0  |         const UChar *codePointStart=p;  | 
2465  | 0  |         UChar32 c;  | 
2466  | 0  |         uint16_t norm16;  | 
2467  | 0  |         UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);  | 
2468  | 0  |         if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16)) { | 
2469  | 0  |             return codePointStart;  | 
2470  | 0  |         }  | 
2471  | 0  |         if (norm16HasDecompBoundaryAfter(norm16)) { | 
2472  | 0  |             return p;  | 
2473  | 0  |         }  | 
2474  | 0  |     }  | 
2475  | 0  |     return p;  | 
2476  | 0  | }  | 
2477  |  |  | 
2478  |  | // CanonicalIterator data -------------------------------------------------- ***  | 
2479  |  |  | 
2480  |  | CanonIterData::CanonIterData(UErrorCode &errorCode) :  | 
2481  | 0  |         mutableTrie(umutablecptrie_open(0, 0, &errorCode)), trie(nullptr),  | 
2482  | 0  |         canonStartSets(uprv_deleteUObject, NULL, errorCode) {} | 
2483  |  |  | 
2484  | 0  | CanonIterData::~CanonIterData() { | 
2485  | 0  |     umutablecptrie_close(mutableTrie);  | 
2486  | 0  |     ucptrie_close(trie);  | 
2487  | 0  | }  | 
2488  |  |  | 
2489  | 0  | void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) { | 
2490  | 0  |     uint32_t canonValue = umutablecptrie_get(mutableTrie, decompLead);  | 
2491  | 0  |     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { | 
2492  |  |         // origin is the first character whose decomposition starts with  | 
2493  |  |         // the character for which we are setting the value.  | 
2494  | 0  |         umutablecptrie_set(mutableTrie, decompLead, canonValue|origin, &errorCode);  | 
2495  | 0  |     } else { | 
2496  |  |         // origin is not the first character, or it is U+0000.  | 
2497  | 0  |         UnicodeSet *set;  | 
2498  | 0  |         if((canonValue&CANON_HAS_SET)==0) { | 
2499  | 0  |             set=new UnicodeSet;  | 
2500  | 0  |             if(set==NULL) { | 
2501  | 0  |                 errorCode=U_MEMORY_ALLOCATION_ERROR;  | 
2502  | 0  |                 return;  | 
2503  | 0  |             }  | 
2504  | 0  |             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);  | 
2505  | 0  |             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();  | 
2506  | 0  |             umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);  | 
2507  | 0  |             canonStartSets.addElementX(set, errorCode);  | 
2508  | 0  |             if(firstOrigin!=0) { | 
2509  | 0  |                 set->add(firstOrigin);  | 
2510  | 0  |             }  | 
2511  | 0  |         } else { | 
2512  | 0  |             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];  | 
2513  | 0  |         }  | 
2514  | 0  |         set->add(origin);  | 
2515  | 0  |     }  | 
2516  | 0  | }  | 
2517  |  |  | 
2518  |  | // C++ class for friend access to private Normalizer2Impl members.  | 
2519  |  | class InitCanonIterData { | 
2520  |  | public:  | 
2521  |  |     static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);  | 
2522  |  | };  | 
2523  |  |  | 
2524  |  | U_CDECL_BEGIN  | 
2525  |  |  | 
2526  |  | // UInitOnce instantiation function for CanonIterData  | 
2527  |  | static void U_CALLCONV  | 
2528  | 0  | initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) { | 
2529  | 0  |     InitCanonIterData::doInit(impl, errorCode);  | 
2530  | 0  | }  | 
2531  |  |  | 
2532  |  | U_CDECL_END  | 
2533  |  |  | 
2534  | 0  | void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) { | 
2535  | 0  |     U_ASSERT(impl->fCanonIterData == NULL);  | 
2536  | 0  |     impl->fCanonIterData = new CanonIterData(errorCode);  | 
2537  | 0  |     if (impl->fCanonIterData == NULL) { | 
2538  | 0  |         errorCode=U_MEMORY_ALLOCATION_ERROR;  | 
2539  | 0  |     }  | 
2540  | 0  |     if (U_SUCCESS(errorCode)) { | 
2541  | 0  |         UChar32 start = 0, end;  | 
2542  | 0  |         uint32_t value;  | 
2543  | 0  |         while ((end = ucptrie_getRange(impl->normTrie, start,  | 
2544  | 0  |                                        UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,  | 
2545  | 0  |                                        nullptr, nullptr, &value)) >= 0) { | 
2546  |  |             // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.  | 
2547  | 0  |             if (value != Normalizer2Impl::INERT) { | 
2548  | 0  |                 impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);  | 
2549  | 0  |             }  | 
2550  | 0  |             start = end + 1;  | 
2551  | 0  |         }  | 
2552  |  | #ifdef UCPTRIE_DEBUG  | 
2553  |  |         umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData");  | 
2554  |  | #endif  | 
2555  | 0  |         impl->fCanonIterData->trie = umutablecptrie_buildImmutable(  | 
2556  | 0  |             impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode);  | 
2557  | 0  |         umutablecptrie_close(impl->fCanonIterData->mutableTrie);  | 
2558  | 0  |         impl->fCanonIterData->mutableTrie = nullptr;  | 
2559  | 0  |     }  | 
2560  | 0  |     if (U_FAILURE(errorCode)) { | 
2561  | 0  |         delete impl->fCanonIterData;  | 
2562  | 0  |         impl->fCanonIterData = NULL;  | 
2563  | 0  |     }  | 
2564  | 0  | }  | 
2565  |  |  | 
2566  |  | void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,  | 
2567  |  |                                                   CanonIterData &newData,  | 
2568  | 0  |                                                   UErrorCode &errorCode) const { | 
2569  | 0  |     if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) { | 
2570  |  |         // Inert, or 2-way mapping (including Hangul syllable).  | 
2571  |  |         // We do not write a canonStartSet for any yesNo character.  | 
2572  |  |         // Composites from 2-way mappings are added at runtime from the  | 
2573  |  |         // starter's compositions list, and the other characters in  | 
2574  |  |         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are  | 
2575  |  |         // "maybe" characters.  | 
2576  | 0  |         return;  | 
2577  | 0  |     }  | 
2578  | 0  |     for(UChar32 c=start; c<=end; ++c) { | 
2579  | 0  |         uint32_t oldValue = umutablecptrie_get(newData.mutableTrie, c);  | 
2580  | 0  |         uint32_t newValue=oldValue;  | 
2581  | 0  |         if(isMaybeOrNonZeroCC(norm16)) { | 
2582  |  |             // not a segment starter if it occurs in a decomposition or has cc!=0  | 
2583  | 0  |             newValue|=CANON_NOT_SEGMENT_STARTER;  | 
2584  | 0  |             if(norm16<MIN_NORMAL_MAYBE_YES) { | 
2585  | 0  |                 newValue|=CANON_HAS_COMPOSITIONS;  | 
2586  | 0  |             }  | 
2587  | 0  |         } else if(norm16<minYesNo) { | 
2588  | 0  |             newValue|=CANON_HAS_COMPOSITIONS;  | 
2589  | 0  |         } else { | 
2590  |  |             // c has a one-way decomposition  | 
2591  | 0  |             UChar32 c2=c;  | 
2592  |  |             // Do not modify the whole-range norm16 value.  | 
2593  | 0  |             uint16_t norm16_2=norm16;  | 
2594  | 0  |             if (isDecompNoAlgorithmic(norm16_2)) { | 
2595  |  |                 // Maps to an isCompYesAndZeroCC.  | 
2596  | 0  |                 c2 = mapAlgorithmic(c2, norm16_2);  | 
2597  | 0  |                 norm16_2 = getRawNorm16(c2);  | 
2598  |  |                 // No compatibility mappings for the CanonicalIterator.  | 
2599  | 0  |                 U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));  | 
2600  | 0  |             }  | 
2601  | 0  |             if (norm16_2 > minYesNo) { | 
2602  |  |                 // c decomposes, get everything from the variable-length extra data  | 
2603  | 0  |                 const uint16_t *mapping=getMapping(norm16_2);  | 
2604  | 0  |                 uint16_t firstUnit=*mapping;  | 
2605  | 0  |                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;  | 
2606  | 0  |                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { | 
2607  | 0  |                     if(c==c2 && (*(mapping-1)&0xff)!=0) { | 
2608  | 0  |                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0  | 
2609  | 0  |                     }  | 
2610  | 0  |                 }  | 
2611  |  |                 // Skip empty mappings (no characters in the decomposition).  | 
2612  | 0  |                 if(length!=0) { | 
2613  | 0  |                     ++mapping;  // skip over the firstUnit  | 
2614  |  |                     // add c to first code point's start set  | 
2615  | 0  |                     int32_t i=0;  | 
2616  | 0  |                     U16_NEXT_UNSAFE(mapping, i, c2);  | 
2617  | 0  |                     newData.addToStartSet(c, c2, errorCode);  | 
2618  |  |                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a  | 
2619  |  |                     // one-way mapping. A 2-way mapping is possible here after  | 
2620  |  |                     // intermediate algorithmic mapping.  | 
2621  | 0  |                     if(norm16_2>=minNoNo) { | 
2622  | 0  |                         while(i<length) { | 
2623  | 0  |                             U16_NEXT_UNSAFE(mapping, i, c2);  | 
2624  | 0  |                             uint32_t c2Value = umutablecptrie_get(newData.mutableTrie, c2);  | 
2625  | 0  |                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { | 
2626  | 0  |                                 umutablecptrie_set(newData.mutableTrie, c2,  | 
2627  | 0  |                                                    c2Value|CANON_NOT_SEGMENT_STARTER, &errorCode);  | 
2628  | 0  |                             }  | 
2629  | 0  |                         }  | 
2630  | 0  |                     }  | 
2631  | 0  |                 }  | 
2632  | 0  |             } else { | 
2633  |  |                 // c decomposed to c2 algorithmically; c has cc==0  | 
2634  | 0  |                 newData.addToStartSet(c, c2, errorCode);  | 
2635  | 0  |             }  | 
2636  | 0  |         }  | 
2637  | 0  |         if(newValue!=oldValue) { | 
2638  | 0  |             umutablecptrie_set(newData.mutableTrie, c, newValue, &errorCode);  | 
2639  | 0  |         }  | 
2640  | 0  |     }  | 
2641  | 0  | }  | 
2642  |  |  | 
2643  | 0  | UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { | 
2644  |  |     // Logically const: Synchronized instantiation.  | 
2645  | 0  |     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);  | 
2646  | 0  |     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);  | 
2647  | 0  |     return U_SUCCESS(errorCode);  | 
2648  | 0  | }  | 
2649  |  |  | 
2650  | 0  | int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { | 
2651  | 0  |     return (int32_t)ucptrie_get(fCanonIterData->trie, c);  | 
2652  | 0  | }  | 
2653  |  |  | 
2654  | 0  | const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { | 
2655  | 0  |     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];  | 
2656  | 0  | }  | 
2657  |  |  | 
2658  | 0  | UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { | 
2659  | 0  |     return getCanonValue(c)>=0;  | 
2660  | 0  | }  | 
2661  |  |  | 
2662  | 0  | UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { | 
2663  | 0  |     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;  | 
2664  | 0  |     if(canonValue==0) { | 
2665  | 0  |         return FALSE;  | 
2666  | 0  |     }  | 
2667  | 0  |     set.clear();  | 
2668  | 0  |     int32_t value=canonValue&CANON_VALUE_MASK;  | 
2669  | 0  |     if((canonValue&CANON_HAS_SET)!=0) { | 
2670  | 0  |         set.addAll(getCanonStartSet(value));  | 
2671  | 0  |     } else if(value!=0) { | 
2672  | 0  |         set.add(value);  | 
2673  | 0  |     }  | 
2674  | 0  |     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { | 
2675  | 0  |         uint16_t norm16=getRawNorm16(c);  | 
2676  | 0  |         if(norm16==JAMO_L) { | 
2677  | 0  |             UChar32 syllable=  | 
2678  | 0  |                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);  | 
2679  | 0  |             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);  | 
2680  | 0  |         } else { | 
2681  | 0  |             addComposites(getCompositionsList(norm16), set);  | 
2682  | 0  |         }  | 
2683  | 0  |     }  | 
2684  | 0  |     return TRUE;  | 
2685  | 0  | }  | 
2686  |  |  | 
2687  |  | U_NAMESPACE_END  | 
2688  |  |  | 
2689  |  | // Normalizer2 data swapping ----------------------------------------------- ***  | 
2690  |  |  | 
2691  |  | U_NAMESPACE_USE  | 
2692  |  |  | 
2693  |  | U_CAPI int32_t U_EXPORT2  | 
2694  |  | unorm2_swap(const UDataSwapper *ds,  | 
2695  |  |             const void *inData, int32_t length, void *outData,  | 
2696  | 0  |             UErrorCode *pErrorCode) { | 
2697  | 0  |     const UDataInfo *pInfo;  | 
2698  | 0  |     int32_t headerSize;  | 
2699  |  | 
  | 
2700  | 0  |     const uint8_t *inBytes;  | 
2701  | 0  |     uint8_t *outBytes;  | 
2702  |  | 
  | 
2703  | 0  |     const int32_t *inIndexes;  | 
2704  | 0  |     int32_t indexes[Normalizer2Impl::IX_TOTAL_SIZE+1];  | 
2705  |  | 
  | 
2706  | 0  |     int32_t i, offset, nextOffset, size;  | 
2707  |  |  | 
2708  |  |     /* udata_swapDataHeader checks the arguments */  | 
2709  | 0  |     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);  | 
2710  | 0  |     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { | 
2711  | 0  |         return 0;  | 
2712  | 0  |     }  | 
2713  |  |  | 
2714  |  |     /* check data format and format version */  | 
2715  | 0  |     pInfo=(const UDataInfo *)((const char *)inData+4);  | 
2716  | 0  |     uint8_t formatVersion0=pInfo->formatVersion[0];  | 
2717  | 0  |     if(!(  | 
2718  | 0  |         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */  | 
2719  | 0  |         pInfo->dataFormat[1]==0x72 &&  | 
2720  | 0  |         pInfo->dataFormat[2]==0x6d &&  | 
2721  | 0  |         pInfo->dataFormat[3]==0x32 &&  | 
2722  | 0  |         (1<=formatVersion0 && formatVersion0<=4)  | 
2723  | 0  |     )) { | 
2724  | 0  |         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",  | 
2725  | 0  |                          pInfo->dataFormat[0], pInfo->dataFormat[1],  | 
2726  | 0  |                          pInfo->dataFormat[2], pInfo->dataFormat[3],  | 
2727  | 0  |                          pInfo->formatVersion[0]);  | 
2728  | 0  |         *pErrorCode=U_UNSUPPORTED_ERROR;  | 
2729  | 0  |         return 0;  | 
2730  | 0  |     }  | 
2731  |  |  | 
2732  | 0  |     inBytes=(const uint8_t *)inData+headerSize;  | 
2733  | 0  |     outBytes=(uint8_t *)outData+headerSize;  | 
2734  |  | 
  | 
2735  | 0  |     inIndexes=(const int32_t *)inBytes;  | 
2736  | 0  |     int32_t minIndexesLength;  | 
2737  | 0  |     if(formatVersion0==1) { | 
2738  | 0  |         minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1;  | 
2739  | 0  |     } else if(formatVersion0==2) { | 
2740  | 0  |         minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1;  | 
2741  | 0  |     } else { | 
2742  | 0  |         minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1;  | 
2743  | 0  |     }  | 
2744  |  | 
  | 
2745  | 0  |     if(length>=0) { | 
2746  | 0  |         length-=headerSize;  | 
2747  | 0  |         if(length<minIndexesLength*4) { | 
2748  | 0  |             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",  | 
2749  | 0  |                              length);  | 
2750  | 0  |             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;  | 
2751  | 0  |             return 0;  | 
2752  | 0  |         }  | 
2753  | 0  |     }  | 
2754  |  |  | 
2755  |  |     /* read the first few indexes */  | 
2756  | 0  |     for(i=0; i<UPRV_LENGTHOF(indexes); ++i) { | 
2757  | 0  |         indexes[i]=udata_readInt32(ds, inIndexes[i]);  | 
2758  | 0  |     }  | 
2759  |  |  | 
2760  |  |     /* get the total length of the data */  | 
2761  | 0  |     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];  | 
2762  |  | 
  | 
2763  | 0  |     if(length>=0) { | 
2764  | 0  |         if(length<size) { | 
2765  | 0  |             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",  | 
2766  | 0  |                              length);  | 
2767  | 0  |             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;  | 
2768  | 0  |             return 0;  | 
2769  | 0  |         }  | 
2770  |  |  | 
2771  |  |         /* copy the data for inaccessible bytes */  | 
2772  | 0  |         if(inBytes!=outBytes) { | 
2773  | 0  |             uprv_memcpy(outBytes, inBytes, size);  | 
2774  | 0  |         }  | 
2775  |  | 
  | 
2776  | 0  |         offset=0;  | 
2777  |  |  | 
2778  |  |         /* swap the int32_t indexes[] */  | 
2779  | 0  |         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];  | 
2780  | 0  |         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);  | 
2781  | 0  |         offset=nextOffset;  | 
2782  |  |  | 
2783  |  |         /* swap the trie */  | 
2784  | 0  |         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];  | 
2785  | 0  |         utrie_swapAnyVersion(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);  | 
2786  | 0  |         offset=nextOffset;  | 
2787  |  |  | 
2788  |  |         /* swap the uint16_t extraData[] */  | 
2789  | 0  |         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];  | 
2790  | 0  |         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);  | 
2791  | 0  |         offset=nextOffset;  | 
2792  |  |  | 
2793  |  |         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */  | 
2794  | 0  |         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];  | 
2795  | 0  |         offset=nextOffset;  | 
2796  |  | 
  | 
2797  | 0  |         U_ASSERT(offset==size);  | 
2798  | 0  |     }  | 
2799  |  |  | 
2800  | 0  |     return headerSize+size;  | 
2801  | 0  | }  | 
2802  |  |  | 
2803  |  | #endif  // !UCONFIG_NO_NORMALIZATION  |