/src/icu/source/common/caniter.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  |  *****************************************************************************  | 
5  |  |  * Copyright (C) 1996-2015, International Business Machines Corporation and  | 
6  |  |  * others. All Rights Reserved.  | 
7  |  |  *****************************************************************************  | 
8  |  |  */  | 
9  |  |  | 
10  |  | #include "unicode/utypes.h"  | 
11  |  |  | 
12  |  | #if !UCONFIG_NO_NORMALIZATION  | 
13  |  |  | 
14  |  | #include "unicode/caniter.h"  | 
15  |  | #include "unicode/normalizer2.h"  | 
16  |  | #include "unicode/uchar.h"  | 
17  |  | #include "unicode/uniset.h"  | 
18  |  | #include "unicode/usetiter.h"  | 
19  |  | #include "unicode/ustring.h"  | 
20  |  | #include "unicode/utf16.h"  | 
21  |  | #include "cmemory.h"  | 
22  |  | #include "hash.h"  | 
23  |  | #include "normalizer2impl.h"  | 
24  |  |  | 
25  |  | /**  | 
26  |  |  * This class allows one to iterate through all the strings that are canonically equivalent to a given  | 
27  |  |  * string. For example, here are some sample results:  | 
28  |  | Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} | 
29  |  | 1: \u0041\u030A\u0064\u0307\u0327  | 
30  |  |  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} | 
31  |  | 2: \u0041\u030A\u0064\u0327\u0307  | 
32  |  |  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} | 
33  |  | 3: \u0041\u030A\u1E0B\u0327  | 
34  |  |  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} | 
35  |  | 4: \u0041\u030A\u1E11\u0307  | 
36  |  |  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} | 
37  |  | 5: \u00C5\u0064\u0307\u0327  | 
38  |  |  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} | 
39  |  | 6: \u00C5\u0064\u0327\u0307  | 
40  |  |  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} | 
41  |  | 7: \u00C5\u1E0B\u0327  | 
42  |  |  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} | 
43  |  | 8: \u00C5\u1E11\u0307  | 
44  |  |  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} | 
45  |  | 9: \u212B\u0064\u0307\u0327  | 
46  |  |  = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} | 
47  |  | 10: \u212B\u0064\u0327\u0307  | 
48  |  |  = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} | 
49  |  | 11: \u212B\u1E0B\u0327  | 
50  |  |  = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} | 
51  |  | 12: \u212B\u1E11\u0307  | 
52  |  |  = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} | 
53  |  |  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,  | 
54  |  |  * since it has not been optimized for that situation.  | 
55  |  |  *@author M. Davis  | 
56  |  |  *@draft  | 
57  |  |  */  | 
58  |  |  | 
59  |  | // public  | 
60  |  |  | 
61  |  | U_NAMESPACE_BEGIN  | 
62  |  |  | 
63  |  | // TODO: add boilerplate methods.  | 
64  |  |  | 
65  |  | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)  | 
66  |  |  | 
67  |  | /**  | 
68  |  |  *@param source string to get results for  | 
69  |  |  */  | 
70  |  | CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :  | 
71  |  |     pieces(NULL),  | 
72  | 0  |     pieces_length(0),  | 
73  |  |     pieces_lengths(NULL),  | 
74  |  |     current(NULL),  | 
75  | 0  |     current_length(0),  | 
76  | 0  |     nfd(*Normalizer2::getNFDInstance(status)),  | 
77  | 0  |     nfcImpl(*Normalizer2Factory::getNFCImpl(status))  | 
78  | 0  | { | 
79  | 0  |     if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) { | 
80  | 0  |       setSource(sourceStr, status);  | 
81  | 0  |     }  | 
82  | 0  | }  | 
83  |  |  | 
84  | 0  | CanonicalIterator::~CanonicalIterator() { | 
85  | 0  |   cleanPieces();  | 
86  | 0  | }  | 
87  |  |  | 
88  | 0  | void CanonicalIterator::cleanPieces() { | 
89  | 0  |     int32_t i = 0;  | 
90  | 0  |     if(pieces != NULL) { | 
91  | 0  |         for(i = 0; i < pieces_length; i++) { | 
92  | 0  |             if(pieces[i] != NULL) { | 
93  | 0  |                 delete[] pieces[i];  | 
94  | 0  |             }  | 
95  | 0  |         }  | 
96  | 0  |         uprv_free(pieces);  | 
97  | 0  |         pieces = NULL;  | 
98  | 0  |         pieces_length = 0;  | 
99  | 0  |     }  | 
100  | 0  |     if(pieces_lengths != NULL) { | 
101  | 0  |         uprv_free(pieces_lengths);  | 
102  | 0  |         pieces_lengths = NULL;  | 
103  | 0  |     }  | 
104  | 0  |     if(current != NULL) { | 
105  | 0  |         uprv_free(current);  | 
106  | 0  |         current = NULL;  | 
107  | 0  |         current_length = 0;  | 
108  | 0  |     }  | 
109  | 0  | }  | 
110  |  |  | 
111  |  | /**  | 
112  |  |  *@return gets the source: NOTE: it is the NFD form of source  | 
113  |  |  */  | 
114  | 0  | UnicodeString CanonicalIterator::getSource() { | 
115  | 0  |   return source;  | 
116  | 0  | }  | 
117  |  |  | 
118  |  | /**  | 
119  |  |  * Resets the iterator so that one can start again from the beginning.  | 
120  |  |  */  | 
121  | 0  | void CanonicalIterator::reset() { | 
122  | 0  |     done = FALSE;  | 
123  | 0  |     for (int i = 0; i < current_length; ++i) { | 
124  | 0  |         current[i] = 0;  | 
125  | 0  |     }  | 
126  | 0  | }  | 
127  |  |  | 
128  |  | /**  | 
129  |  |  *@return the next string that is canonically equivalent. The value null is returned when  | 
130  |  |  * the iteration is done.  | 
131  |  |  */  | 
132  | 0  | UnicodeString CanonicalIterator::next() { | 
133  | 0  |     int32_t i = 0;  | 
134  |  | 
  | 
135  | 0  |     if (done) { | 
136  | 0  |       buffer.setToBogus();  | 
137  | 0  |       return buffer;  | 
138  | 0  |     }  | 
139  |  |  | 
140  |  |     // delete old contents  | 
141  | 0  |     buffer.remove();  | 
142  |  |  | 
143  |  |     // construct return value  | 
144  |  | 
  | 
145  | 0  |     for (i = 0; i < pieces_length; ++i) { | 
146  | 0  |         buffer.append(pieces[i][current[i]]);  | 
147  | 0  |     }  | 
148  |  |     //String result = buffer.toString(); // not needed  | 
149  |  |  | 
150  |  |     // find next value for next time  | 
151  |  | 
  | 
152  | 0  |     for (i = current_length - 1; ; --i) { | 
153  | 0  |         if (i < 0) { | 
154  | 0  |             done = TRUE;  | 
155  | 0  |             break;  | 
156  | 0  |         }  | 
157  | 0  |         current[i]++;  | 
158  | 0  |         if (current[i] < pieces_lengths[i]) break; // got sequence  | 
159  | 0  |         current[i] = 0;  | 
160  | 0  |     }  | 
161  | 0  |     return buffer;  | 
162  | 0  | }  | 
163  |  |  | 
164  |  | /**  | 
165  |  |  *@param set the source string to iterate against. This allows the same iterator to be used  | 
166  |  |  * while changing the source string, saving object creation.  | 
167  |  |  */  | 
168  | 0  | void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) { | 
169  | 0  |     int32_t list_length = 0;  | 
170  | 0  |     UChar32 cp = 0;  | 
171  | 0  |     int32_t start = 0;  | 
172  | 0  |     int32_t i = 0;  | 
173  | 0  |     UnicodeString *list = NULL;  | 
174  |  | 
  | 
175  | 0  |     nfd.normalize(newSource, source, status);  | 
176  | 0  |     if(U_FAILURE(status)) { | 
177  | 0  |       return;  | 
178  | 0  |     }  | 
179  | 0  |     done = FALSE;  | 
180  |  | 
  | 
181  | 0  |     cleanPieces();  | 
182  |  |  | 
183  |  |     // catch degenerate case  | 
184  | 0  |     if (newSource.length() == 0) { | 
185  | 0  |         pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *));  | 
186  | 0  |         pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t));  | 
187  | 0  |         pieces_length = 1;  | 
188  | 0  |         current = (int32_t*)uprv_malloc(1 * sizeof(int32_t));  | 
189  | 0  |         current_length = 1;  | 
190  | 0  |         if (pieces == NULL || pieces_lengths == NULL || current == NULL) { | 
191  | 0  |             status = U_MEMORY_ALLOCATION_ERROR;  | 
192  | 0  |             goto CleanPartialInitialization;  | 
193  | 0  |         }  | 
194  | 0  |         current[0] = 0;  | 
195  | 0  |         pieces[0] = new UnicodeString[1];  | 
196  | 0  |         pieces_lengths[0] = 1;  | 
197  | 0  |         if (pieces[0] == 0) { | 
198  | 0  |             status = U_MEMORY_ALLOCATION_ERROR;  | 
199  | 0  |             goto CleanPartialInitialization;  | 
200  | 0  |         }  | 
201  | 0  |         return;  | 
202  | 0  |     }  | 
203  |  |  | 
204  |  |  | 
205  | 0  |     list = new UnicodeString[source.length()];  | 
206  | 0  |     if (list == 0) { | 
207  | 0  |         status = U_MEMORY_ALLOCATION_ERROR;  | 
208  | 0  |         goto CleanPartialInitialization;  | 
209  | 0  |     }  | 
210  |  |  | 
211  |  |     // i should initially be the number of code units at the   | 
212  |  |     // start of the string  | 
213  | 0  |     i = U16_LENGTH(source.char32At(0));  | 
214  |  |     // int32_t i = 1;  | 
215  |  |     // find the segments  | 
216  |  |     // This code iterates through the source string and   | 
217  |  |     // extracts segments that end up on a codepoint that  | 
218  |  |     // doesn't start any decompositions. (Analysis is done  | 
219  |  |     // on the NFD form - see above).  | 
220  | 0  |     for (; i < source.length(); i += U16_LENGTH(cp)) { | 
221  | 0  |         cp = source.char32At(i);  | 
222  | 0  |         if (nfcImpl.isCanonSegmentStarter(cp)) { | 
223  | 0  |             source.extract(start, i-start, list[list_length++]); // add up to i  | 
224  | 0  |             start = i;  | 
225  | 0  |         }  | 
226  | 0  |     }  | 
227  | 0  |     source.extract(start, i-start, list[list_length++]); // add last one  | 
228  |  |  | 
229  |  |  | 
230  |  |     // allocate the arrays, and find the strings that are CE to each segment  | 
231  | 0  |     pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *));  | 
232  | 0  |     pieces_length = list_length;  | 
233  | 0  |     pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));  | 
234  | 0  |     current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));  | 
235  | 0  |     current_length = list_length;  | 
236  | 0  |     if (pieces == NULL || pieces_lengths == NULL || current == NULL) { | 
237  | 0  |         status = U_MEMORY_ALLOCATION_ERROR;  | 
238  | 0  |         goto CleanPartialInitialization;  | 
239  | 0  |     }  | 
240  |  |  | 
241  | 0  |     for (i = 0; i < current_length; i++) { | 
242  | 0  |         current[i] = 0;  | 
243  | 0  |     }  | 
244  |  |     // for each segment, get all the combinations that can produce   | 
245  |  |     // it after NFD normalization  | 
246  | 0  |     for (i = 0; i < pieces_length; ++i) { | 
247  |  |         //if (PROGRESS) printf("SEGMENT\n"); | 
248  | 0  |         pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);  | 
249  | 0  |     }  | 
250  |  | 
  | 
251  | 0  |     delete[] list;  | 
252  | 0  |     return;  | 
253  |  | // Common section to cleanup all local variables and reset object variables.  | 
254  | 0  | CleanPartialInitialization:  | 
255  | 0  |     if (list != NULL) { | 
256  | 0  |         delete[] list;  | 
257  | 0  |     }  | 
258  | 0  |     cleanPieces();  | 
259  | 0  | }  | 
260  |  |  | 
261  |  | /**  | 
262  |  |  * Dumb recursive implementation of permutation.  | 
263  |  |  * TODO: optimize  | 
264  |  |  * @param source the string to find permutations for  | 
265  |  |  * @return the results in a set.  | 
266  |  |  */  | 
267  | 0  | void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) { | 
268  | 0  |     if(U_FAILURE(status)) { | 
269  | 0  |         return;  | 
270  | 0  |     }  | 
271  |  |     //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source))); | 
272  | 0  |     int32_t i = 0;  | 
273  |  |  | 
274  |  |     // optimization:  | 
275  |  |     // if zero or one character, just return a set with it  | 
276  |  |     // we check for length < 2 to keep from counting code points all the time  | 
277  | 0  |     if (source.length() <= 2 && source.countChar32() <= 1) { | 
278  | 0  |         UnicodeString *toPut = new UnicodeString(source);  | 
279  |  |         /* test for NULL */  | 
280  | 0  |         if (toPut == 0) { | 
281  | 0  |             status = U_MEMORY_ALLOCATION_ERROR;  | 
282  | 0  |             return;  | 
283  | 0  |         }  | 
284  | 0  |         result->put(source, toPut, status);  | 
285  | 0  |         return;  | 
286  | 0  |     }  | 
287  |  |  | 
288  |  |     // otherwise iterate through the string, and recursively permute all the other characters  | 
289  | 0  |     UChar32 cp;  | 
290  | 0  |     Hashtable subpermute(status);  | 
291  | 0  |     if(U_FAILURE(status)) { | 
292  | 0  |         return;  | 
293  | 0  |     }  | 
294  | 0  |     subpermute.setValueDeleter(uprv_deleteUObject);  | 
295  |  | 
  | 
296  | 0  |     for (i = 0; i < source.length(); i += U16_LENGTH(cp)) { | 
297  | 0  |         cp = source.char32At(i);  | 
298  | 0  |         const UHashElement *ne = NULL;  | 
299  | 0  |         int32_t el = UHASH_FIRST;  | 
300  | 0  |         UnicodeString subPermuteString = source;  | 
301  |  |  | 
302  |  |         // optimization:  | 
303  |  |         // if the character is canonical combining class zero,  | 
304  |  |         // don't permute it  | 
305  | 0  |         if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) { | 
306  |  |             //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); | 
307  | 0  |             continue;  | 
308  | 0  |         }  | 
309  |  |  | 
310  | 0  |         subpermute.removeAll();  | 
311  |  |  | 
312  |  |         // see what the permutations of the characters before and after this one are  | 
313  |  |         //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));  | 
314  | 0  |         permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status);  | 
315  |  |         /* Test for buffer overflows */  | 
316  | 0  |         if(U_FAILURE(status)) { | 
317  | 0  |             return;  | 
318  | 0  |         }  | 
319  |  |         // The upper remove is destructive. The question is do we have to make a copy, or we don't care about the contents   | 
320  |  |         // of source at this point.  | 
321  |  |  | 
322  |  |         // prefix this character to all of them  | 
323  | 0  |         ne = subpermute.nextElement(el);  | 
324  | 0  |         while (ne != NULL) { | 
325  | 0  |             UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);  | 
326  | 0  |             UnicodeString *chStr = new UnicodeString(cp);  | 
327  |  |             //test for  NULL  | 
328  | 0  |             if (chStr == NULL) { | 
329  | 0  |                 status = U_MEMORY_ALLOCATION_ERROR;  | 
330  | 0  |                 return;  | 
331  | 0  |             }  | 
332  | 0  |             chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer));  | 
333  |  |             //if (PROGRESS) printf("  Piece: %s\n", UToS(*chStr)); | 
334  | 0  |             result->put(*chStr, chStr, status);  | 
335  | 0  |             ne = subpermute.nextElement(el);  | 
336  | 0  |         }  | 
337  | 0  |     }  | 
338  |  |     //return result;  | 
339  | 0  | }  | 
340  |  |  | 
341  |  | // privates  | 
342  |  |  | 
343  |  | // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.  | 
344  | 0  | UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) { | 
345  | 0  |     Hashtable result(status);  | 
346  | 0  |     Hashtable permutations(status);  | 
347  | 0  |     Hashtable basic(status);  | 
348  | 0  |     if (U_FAILURE(status)) { | 
349  | 0  |         return 0;  | 
350  | 0  |     }  | 
351  | 0  |     result.setValueDeleter(uprv_deleteUObject);  | 
352  | 0  |     permutations.setValueDeleter(uprv_deleteUObject);  | 
353  | 0  |     basic.setValueDeleter(uprv_deleteUObject);  | 
354  |  | 
  | 
355  | 0  |     UChar USeg[256];  | 
356  | 0  |     int32_t segLen = segment.extract(USeg, 256, status);  | 
357  | 0  |     getEquivalents2(&basic, USeg, segLen, status);  | 
358  |  |  | 
359  |  |     // now get all the permutations  | 
360  |  |     // add only the ones that are canonically equivalent  | 
361  |  |     // TODO: optimize by not permuting any class zero.  | 
362  |  | 
  | 
363  | 0  |     const UHashElement *ne = NULL;  | 
364  | 0  |     int32_t el = UHASH_FIRST;  | 
365  |  |     //Iterator it = basic.iterator();  | 
366  | 0  |     ne = basic.nextElement(el);  | 
367  |  |     //while (it.hasNext())  | 
368  | 0  |     while (ne != NULL) { | 
369  |  |         //String item = (String) it.next();  | 
370  | 0  |         UnicodeString item = *((UnicodeString *)(ne->value.pointer));  | 
371  |  | 
  | 
372  | 0  |         permutations.removeAll();  | 
373  | 0  |         permute(item, CANITER_SKIP_ZEROES, &permutations, status);  | 
374  | 0  |         const UHashElement *ne2 = NULL;  | 
375  | 0  |         int32_t el2 = UHASH_FIRST;  | 
376  |  |         //Iterator it2 = permutations.iterator();  | 
377  | 0  |         ne2 = permutations.nextElement(el2);  | 
378  |  |         //while (it2.hasNext())  | 
379  | 0  |         while (ne2 != NULL) { | 
380  |  |             //String possible = (String) it2.next();  | 
381  |  |             //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));  | 
382  | 0  |             UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));  | 
383  | 0  |             UnicodeString attempt;  | 
384  | 0  |             nfd.normalize(possible, attempt, status);  | 
385  |  |  | 
386  |  |             // TODO: check if operator == is semanticaly the same as attempt.equals(segment)  | 
387  | 0  |             if (attempt==segment) { | 
388  |  |                 //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible))); | 
389  |  |                 // TODO: use the hashtable just to catch duplicates - store strings directly (somehow).  | 
390  | 0  |                 result.put(possible, new UnicodeString(possible), status); //add(possible);  | 
391  | 0  |             } else { | 
392  |  |                 //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible))); | 
393  | 0  |             }  | 
394  |  | 
  | 
395  | 0  |             ne2 = permutations.nextElement(el2);  | 
396  | 0  |         }  | 
397  | 0  |         ne = basic.nextElement(el);  | 
398  | 0  |     }  | 
399  |  |  | 
400  |  |     /* Test for buffer overflows */  | 
401  | 0  |     if(U_FAILURE(status)) { | 
402  | 0  |         return 0;  | 
403  | 0  |     }  | 
404  |  |     // convert into a String[] to clean up storage  | 
405  |  |     //String[] finalResult = new String[result.size()];  | 
406  | 0  |     UnicodeString *finalResult = NULL;  | 
407  | 0  |     int32_t resultCount;  | 
408  | 0  |     if((resultCount = result.count()) != 0) { | 
409  | 0  |         finalResult = new UnicodeString[resultCount];  | 
410  | 0  |         if (finalResult == 0) { | 
411  | 0  |             status = U_MEMORY_ALLOCATION_ERROR;  | 
412  | 0  |             return NULL;  | 
413  | 0  |         }  | 
414  | 0  |     }  | 
415  | 0  |     else { | 
416  | 0  |         status = U_ILLEGAL_ARGUMENT_ERROR;  | 
417  | 0  |         return NULL;  | 
418  | 0  |     }  | 
419  |  |     //result.toArray(finalResult);  | 
420  | 0  |     result_len = 0;  | 
421  | 0  |     el = UHASH_FIRST;  | 
422  | 0  |     ne = result.nextElement(el);  | 
423  | 0  |     while(ne != NULL) { | 
424  | 0  |         finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer));  | 
425  | 0  |         ne = result.nextElement(el);  | 
426  | 0  |     }  | 
427  |  |  | 
428  |  | 
  | 
429  | 0  |     return finalResult;  | 
430  | 0  | }  | 
431  |  |  | 
432  | 0  | Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) { | 
433  |  | 
  | 
434  | 0  |     if (U_FAILURE(status)) { | 
435  | 0  |         return NULL;  | 
436  | 0  |     }  | 
437  |  |  | 
438  |  |     //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment))); | 
439  |  |  | 
440  | 0  |     UnicodeString toPut(segment, segLen);  | 
441  |  | 
  | 
442  | 0  |     fillinResult->put(toPut, new UnicodeString(toPut), status);  | 
443  |  | 
  | 
444  | 0  |     UnicodeSet starts;  | 
445  |  |  | 
446  |  |     // cycle through all the characters  | 
447  | 0  |     UChar32 cp;  | 
448  | 0  |     for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) { | 
449  |  |         // see if any character is at the start of some decomposition  | 
450  | 0  |         U16_GET(segment, 0, i, segLen, cp);  | 
451  | 0  |         if (!nfcImpl.getCanonStartSet(cp, starts)) { | 
452  | 0  |             continue;  | 
453  | 0  |         }  | 
454  |  |         // if so, see which decompositions match  | 
455  | 0  |         UnicodeSetIterator iter(starts);  | 
456  | 0  |         while (iter.next()) { | 
457  | 0  |             UChar32 cp2 = iter.getCodepoint();  | 
458  | 0  |             Hashtable remainder(status);  | 
459  | 0  |             remainder.setValueDeleter(uprv_deleteUObject);  | 
460  | 0  |             if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) { | 
461  | 0  |                 continue;  | 
462  | 0  |             }  | 
463  |  |  | 
464  |  |             // there were some matches, so add all the possibilities to the set.  | 
465  | 0  |             UnicodeString prefix(segment, i);  | 
466  | 0  |             prefix += cp2;  | 
467  |  | 
  | 
468  | 0  |             int32_t el = UHASH_FIRST;  | 
469  | 0  |             const UHashElement *ne = remainder.nextElement(el);  | 
470  | 0  |             while (ne != NULL) { | 
471  | 0  |                 UnicodeString item = *((UnicodeString *)(ne->value.pointer));  | 
472  | 0  |                 UnicodeString *toAdd = new UnicodeString(prefix);  | 
473  |  |                 /* test for NULL */  | 
474  | 0  |                 if (toAdd == 0) { | 
475  | 0  |                     status = U_MEMORY_ALLOCATION_ERROR;  | 
476  | 0  |                     return NULL;  | 
477  | 0  |                 }  | 
478  | 0  |                 *toAdd += item;  | 
479  | 0  |                 fillinResult->put(*toAdd, toAdd, status);  | 
480  |  |  | 
481  |  |                 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd))); | 
482  |  | 
  | 
483  | 0  |                 ne = remainder.nextElement(el);  | 
484  | 0  |             }  | 
485  | 0  |         }  | 
486  | 0  |     }  | 
487  |  |  | 
488  |  |     /* Test for buffer overflows */  | 
489  | 0  |     if(U_FAILURE(status)) { | 
490  | 0  |         return NULL;  | 
491  | 0  |     }  | 
492  | 0  |     return fillinResult;  | 
493  | 0  | }  | 
494  |  |  | 
495  |  | /**  | 
496  |  |  * See if the decomposition of cp2 is at segment starting at segmentPos   | 
497  |  |  * (with canonical rearrangement!)  | 
498  |  |  * If so, take the remainder, and return the equivalents   | 
499  |  |  */  | 
500  | 0  | Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { | 
501  |  | //Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { | 
502  |  |     //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp)))); | 
503  |  |     //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos); | 
504  |  | 
  | 
505  | 0  |     if (U_FAILURE(status)) { | 
506  | 0  |         return NULL;  | 
507  | 0  |     }  | 
508  |  |  | 
509  | 0  |     UnicodeString temp(comp);  | 
510  | 0  |     int32_t inputLen=temp.length();  | 
511  | 0  |     UnicodeString decompString;  | 
512  | 0  |     nfd.normalize(temp, decompString, status);  | 
513  | 0  |     if (U_FAILURE(status)) { | 
514  | 0  |         return NULL;  | 
515  | 0  |     }  | 
516  | 0  |     if (decompString.isBogus()) { | 
517  | 0  |         status = U_MEMORY_ALLOCATION_ERROR;  | 
518  | 0  |         return NULL;  | 
519  | 0  |     }  | 
520  | 0  |     const UChar *decomp=decompString.getBuffer();  | 
521  | 0  |     int32_t decompLen=decompString.length();  | 
522  |  |  | 
523  |  |     // See if it matches the start of segment (at segmentPos)  | 
524  | 0  |     UBool ok = FALSE;  | 
525  | 0  |     UChar32 cp;  | 
526  | 0  |     int32_t decompPos = 0;  | 
527  | 0  |     UChar32 decompCp;  | 
528  | 0  |     U16_NEXT(decomp, decompPos, decompLen, decompCp);  | 
529  |  | 
  | 
530  | 0  |     int32_t i = segmentPos;  | 
531  | 0  |     while(i < segLen) { | 
532  | 0  |         U16_NEXT(segment, i, segLen, cp);  | 
533  |  | 
  | 
534  | 0  |         if (cp == decompCp) { // if equal, eat another cp from decomp | 
535  |  |  | 
536  |  |             //if (PROGRESS) printf("  matches: %s\n", UToS(Tr(UnicodeString(cp)))); | 
537  |  | 
  | 
538  | 0  |             if (decompPos == decompLen) { // done, have all decomp characters! | 
539  | 0  |                 temp.append(segment+i, segLen-i);  | 
540  | 0  |                 ok = TRUE;  | 
541  | 0  |                 break;  | 
542  | 0  |             }  | 
543  | 0  |             U16_NEXT(decomp, decompPos, decompLen, decompCp);  | 
544  | 0  |         } else { | 
545  |  |             //if (PROGRESS) printf("  buffer: %s\n", UToS(Tr(UnicodeString(cp)))); | 
546  |  |  | 
547  |  |             // brute force approach  | 
548  | 0  |             temp.append(cp);  | 
549  |  |  | 
550  |  |             /* TODO: optimize  | 
551  |  |             // since we know that the classes are monotonically increasing, after zero  | 
552  |  |             // e.g. 0 5 7 9 0 3  | 
553  |  |             // we can do an optimization  | 
554  |  |             // there are only a few cases that work: zero, less, same, greater  | 
555  |  |             // if both classes are the same, we fail  | 
556  |  |             // if the decomp class < the segment class, we fail  | 
557  |  |  | 
558  |  |             segClass = getClass(cp);  | 
559  |  |             if (decompClass <= segClass) return null;  | 
560  |  |             */  | 
561  | 0  |         }  | 
562  | 0  |     }  | 
563  | 0  |     if (!ok)  | 
564  | 0  |         return NULL; // we failed, characters left over  | 
565  |  |  | 
566  |  |     //if (PROGRESS) printf("Matches\n"); | 
567  |  |  | 
568  | 0  |     if (inputLen == temp.length()) { | 
569  | 0  |         fillinResult->put(UnicodeString(), new UnicodeString(), status);  | 
570  | 0  |         return fillinResult; // succeed, but no remainder  | 
571  | 0  |     }  | 
572  |  |  | 
573  |  |     // brute force approach  | 
574  |  |     // check to make sure result is canonically equivalent  | 
575  | 0  |     UnicodeString trial;  | 
576  | 0  |     nfd.normalize(temp, trial, status);  | 
577  | 0  |     if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) { | 
578  | 0  |         return NULL;  | 
579  | 0  |     }  | 
580  |  |  | 
581  | 0  |     return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);  | 
582  | 0  | }  | 
583  |  |  | 
584  |  | U_NAMESPACE_END  | 
585  |  |  | 
586  |  | #endif /* #if !UCONFIG_NO_NORMALIZATION */  |