/src/icu/source/common/unisetspan.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | ******************************************************************************  | 
5  |  | *  | 
6  |  | *   Copyright (C) 2007-2012, International Business Machines  | 
7  |  | *   Corporation and others.  All Rights Reserved.  | 
8  |  | *  | 
9  |  | ******************************************************************************  | 
10  |  | *   file name:  unisetspan.cpp  | 
11  |  | *   encoding:   UTF-8  | 
12  |  | *   tab size:   8 (not used)  | 
13  |  | *   indentation:4  | 
14  |  | *  | 
15  |  | *   created on: 2007mar01  | 
16  |  | *   created by: Markus W. Scherer  | 
17  |  | */  | 
18  |  |  | 
19  |  | #include "unicode/utypes.h"  | 
20  |  | #include "unicode/uniset.h"  | 
21  |  | #include "unicode/ustring.h"  | 
22  |  | #include "unicode/utf8.h"  | 
23  |  | #include "unicode/utf16.h"  | 
24  |  | #include "cmemory.h"  | 
25  |  | #include "uvector.h"  | 
26  |  | #include "unisetspan.h"  | 
27  |  |  | 
28  |  | U_NAMESPACE_BEGIN  | 
29  |  |  | 
30  |  | /*  | 
31  |  |  * List of offsets from the current position from where to try matching  | 
32  |  |  * a code point or a string.  | 
33  |  |  * Store offsets rather than indexes to simplify the code and use the same list  | 
34  |  |  * for both increments (in span()) and decrements (in spanBack()).  | 
35  |  |  *  | 
36  |  |  * Assumption: The maximum offset is limited, and the offsets that are stored  | 
37  |  |  * at any one time are relatively dense, that is, there are normally no gaps of  | 
38  |  |  * hundreds or thousands of offset values.  | 
39  |  |  *  | 
40  |  |  * The implementation uses a circular buffer of byte flags,  | 
41  |  |  * each indicating whether the corresponding offset is in the list.  | 
42  |  |  * This avoids inserting into a sorted list of offsets (or absolute indexes) and  | 
43  |  |  * physically moving part of the list.  | 
44  |  |  *  | 
45  |  |  * Note: In principle, the caller should setMaxLength() to the maximum of the  | 
46  |  |  * max string length and U16_LENGTH/U8_LENGTH to account for  | 
47  |  |  * "long" single code points.  | 
48  |  |  * However, this implementation uses at least a staticList with more than  | 
49  |  |  * U8_LENGTH entries anyway.  | 
50  |  |  *  | 
51  |  |  * Note: If maxLength were guaranteed to be no more than 32 or 64,  | 
52  |  |  * the list could be stored as bit flags in a single integer.  | 
53  |  |  * Rather than handling a circular buffer with a start list index,  | 
54  |  |  * the integer would simply be shifted when lower offsets are removed.  | 
55  |  |  * UnicodeSet does not have a limit on the lengths of strings.  | 
56  |  |  */  | 
57  |  | class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory. | 
58  |  | public:  | 
59  | 0  |     OffsetList() : list(staticList), capacity(0), length(0), start(0) {} | 
60  |  |  | 
61  | 0  |     ~OffsetList() { | 
62  | 0  |         if(list!=staticList) { | 
63  | 0  |             uprv_free(list);  | 
64  | 0  |         }  | 
65  | 0  |     }  | 
66  |  |  | 
67  |  |     // Call exactly once if the list is to be used.  | 
68  | 0  |     void setMaxLength(int32_t maxLength) { | 
69  | 0  |         if(maxLength<=(int32_t)sizeof(staticList)) { | 
70  | 0  |             capacity=(int32_t)sizeof(staticList);  | 
71  | 0  |         } else { | 
72  | 0  |             UBool *l=(UBool *)uprv_malloc(maxLength);  | 
73  | 0  |             if(l!=NULL) { | 
74  | 0  |                 list=l;  | 
75  | 0  |                 capacity=maxLength;  | 
76  | 0  |             }  | 
77  | 0  |         }  | 
78  | 0  |         uprv_memset(list, 0, capacity);  | 
79  | 0  |     }  | 
80  |  |  | 
81  | 0  |     void clear() { | 
82  | 0  |         uprv_memset(list, 0, capacity);  | 
83  | 0  |         start=length=0;  | 
84  | 0  |     }  | 
85  |  |  | 
86  | 0  |     UBool isEmpty() const { | 
87  | 0  |         return (UBool)(length==0);  | 
88  | 0  |     }  | 
89  |  |  | 
90  |  |     // Reduce all stored offsets by delta, used when the current position  | 
91  |  |     // moves by delta.  | 
92  |  |     // There must not be any offsets lower than delta.  | 
93  |  |     // If there is an offset equal to delta, it is removed.  | 
94  |  |     // delta=[1..maxLength]  | 
95  | 0  |     void shift(int32_t delta) { | 
96  | 0  |         int32_t i=start+delta;  | 
97  | 0  |         if(i>=capacity) { | 
98  | 0  |             i-=capacity;  | 
99  | 0  |         }  | 
100  | 0  |         if(list[i]) { | 
101  | 0  |             list[i]=FALSE;  | 
102  | 0  |             --length;  | 
103  | 0  |         }  | 
104  | 0  |         start=i;  | 
105  | 0  |     }  | 
106  |  |  | 
107  |  |     // Add an offset. The list must not contain it yet.  | 
108  |  |     // offset=[1..maxLength]  | 
109  | 0  |     void addOffset(int32_t offset) { | 
110  | 0  |         int32_t i=start+offset;  | 
111  | 0  |         if(i>=capacity) { | 
112  | 0  |             i-=capacity;  | 
113  | 0  |         }  | 
114  | 0  |         list[i]=TRUE;  | 
115  | 0  |         ++length;  | 
116  | 0  |     }  | 
117  |  |  | 
118  |  |     // offset=[1..maxLength]  | 
119  | 0  |     UBool containsOffset(int32_t offset) const { | 
120  | 0  |         int32_t i=start+offset;  | 
121  | 0  |         if(i>=capacity) { | 
122  | 0  |             i-=capacity;  | 
123  | 0  |         }  | 
124  | 0  |         return list[i];  | 
125  | 0  |     }  | 
126  |  |  | 
127  |  |     // Find the lowest stored offset from a non-empty list, remove it,  | 
128  |  |     // and reduce all other offsets by this minimum.  | 
129  |  |     // Returns [1..maxLength].  | 
130  | 0  |     int32_t popMinimum() { | 
131  |  |         // Look for the next offset in list[start+1..capacity-1].  | 
132  | 0  |         int32_t i=start, result;  | 
133  | 0  |         while(++i<capacity) { | 
134  | 0  |             if(list[i]) { | 
135  | 0  |                 list[i]=FALSE;  | 
136  | 0  |                 --length;  | 
137  | 0  |                 result=i-start;  | 
138  | 0  |                 start=i;  | 
139  | 0  |                 return result;  | 
140  | 0  |             }  | 
141  | 0  |         }  | 
142  |  |         // i==capacity  | 
143  |  |  | 
144  |  |         // Wrap around and look for the next offset in list[0..start].  | 
145  |  |         // Since the list is not empty, there will be one.  | 
146  | 0  |         result=capacity-start;  | 
147  | 0  |         i=0;  | 
148  | 0  |         while(!list[i]) { | 
149  | 0  |             ++i;  | 
150  | 0  |         }  | 
151  | 0  |         list[i]=FALSE;  | 
152  | 0  |         --length;  | 
153  | 0  |         start=i;  | 
154  | 0  |         return result+=i;  | 
155  | 0  |     }  | 
156  |  |  | 
157  |  | private:  | 
158  |  |     UBool *list;  | 
159  |  |     int32_t capacity;  | 
160  |  |     int32_t length;  | 
161  |  |     int32_t start;  | 
162  |  |  | 
163  |  |     UBool staticList[16];  | 
164  |  | };  | 
165  |  |  | 
166  |  | // Get the number of UTF-8 bytes for a UTF-16 (sub)string.  | 
167  |  | static int32_t  | 
168  | 0  | getUTF8Length(const UChar *s, int32_t length) { | 
169  | 0  |     UErrorCode errorCode=U_ZERO_ERROR;  | 
170  | 0  |     int32_t length8=0;  | 
171  | 0  |     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);  | 
172  | 0  |     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { | 
173  | 0  |         return length8;  | 
174  | 0  |     } else { | 
175  |  |         // The string contains an unpaired surrogate.  | 
176  |  |         // Ignore this string.  | 
177  | 0  |         return 0;  | 
178  | 0  |     }  | 
179  | 0  | }  | 
180  |  |  | 
181  |  | // Append the UTF-8 version of the string to t and return the appended UTF-8 length.  | 
182  |  | static int32_t  | 
183  | 0  | appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { | 
184  | 0  |     UErrorCode errorCode=U_ZERO_ERROR;  | 
185  | 0  |     int32_t length8=0;  | 
186  | 0  |     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);  | 
187  | 0  |     if(U_SUCCESS(errorCode)) { | 
188  | 0  |         return length8;  | 
189  | 0  |     } else { | 
190  |  |         // The string contains an unpaired surrogate.  | 
191  |  |         // Ignore this string.  | 
192  | 0  |         return 0;  | 
193  | 0  |     }  | 
194  | 0  | }  | 
195  |  |  | 
196  |  | static inline uint8_t  | 
197  | 0  | makeSpanLengthByte(int32_t spanLength) { | 
198  |  |     // 0xfe==UnicodeSetStringSpan::LONG_SPAN  | 
199  | 0  |     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;  | 
200  | 0  | }  | 
201  |  |  | 
202  |  | // Construct for all variants of span(), or only for any one variant.  | 
203  |  | // Initialize as little as possible, for single use.  | 
204  |  | UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,  | 
205  |  |                                            const UVector &setStrings,  | 
206  |  |                                            uint32_t which)  | 
207  | 0  |         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),  | 
208  |  |           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),  | 
209  | 0  |           utf8Length(0),  | 
210  | 0  |           maxLength16(0), maxLength8(0),  | 
211  | 0  |           all((UBool)(which==ALL)) { | 
212  | 0  |     spanSet.retainAll(set);  | 
213  | 0  |     if(which&NOT_CONTAINED) { | 
214  |  |         // Default to the same sets.  | 
215  |  |         // addToSpanNotSet() will create a separate set if necessary.  | 
216  | 0  |         pSpanNotSet=&spanSet;  | 
217  | 0  |     }  | 
218  |  |  | 
219  |  |     // Determine if the strings even need to be taken into account at all for span() etc.  | 
220  |  |     // If any string is relevant, then all strings need to be used for  | 
221  |  |     // span(longest match) but only the relevant ones for span(while contained).  | 
222  |  |     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH  | 
223  |  |     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.  | 
224  |  |     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)  | 
225  |  |     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.  | 
226  | 0  |     int32_t stringsLength=strings.size();  | 
227  |  | 
  | 
228  | 0  |     int32_t i, spanLength;  | 
229  | 0  |     UBool someRelevant=FALSE;  | 
230  | 0  |     for(i=0; i<stringsLength; ++i) { | 
231  | 0  |         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);  | 
232  | 0  |         const UChar *s16=string.getBuffer();  | 
233  | 0  |         int32_t length16=string.length();  | 
234  | 0  |         if (length16==0) { | 
235  | 0  |             continue;  // skip the empty string  | 
236  | 0  |         }  | 
237  | 0  |         UBool thisRelevant;  | 
238  | 0  |         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);  | 
239  | 0  |         if(spanLength<length16) {  // Relevant string. | 
240  | 0  |             someRelevant=thisRelevant=TRUE;  | 
241  | 0  |         } else { | 
242  | 0  |             thisRelevant=FALSE;  | 
243  | 0  |         }  | 
244  | 0  |         if((which&UTF16) && length16>maxLength16) { | 
245  | 0  |             maxLength16=length16;  | 
246  | 0  |         }  | 
247  | 0  |         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { | 
248  | 0  |             int32_t length8=getUTF8Length(s16, length16);  | 
249  | 0  |             utf8Length+=length8;  | 
250  | 0  |             if(length8>maxLength8) { | 
251  | 0  |                 maxLength8=length8;  | 
252  | 0  |             }  | 
253  | 0  |         }  | 
254  | 0  |     }  | 
255  | 0  |     if(!someRelevant) { | 
256  | 0  |         maxLength16=maxLength8=0;  | 
257  | 0  |         return;  | 
258  | 0  |     }  | 
259  |  |  | 
260  |  |     // Freeze after checking for the need to use strings at all because freezing  | 
261  |  |     // a set takes some time and memory which are wasted if there are no relevant strings.  | 
262  | 0  |     if(all) { | 
263  | 0  |         spanSet.freeze();  | 
264  | 0  |     }  | 
265  |  | 
  | 
266  | 0  |     uint8_t *spanBackLengths;  | 
267  | 0  |     uint8_t *spanUTF8Lengths;  | 
268  | 0  |     uint8_t *spanBackUTF8Lengths;  | 
269  |  |  | 
270  |  |     // Allocate a block of meta data.  | 
271  | 0  |     int32_t allocSize;  | 
272  | 0  |     if(all) { | 
273  |  |         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.  | 
274  | 0  |         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;  | 
275  | 0  |     } else { | 
276  | 0  |         allocSize=stringsLength;  // One set of span lengths.  | 
277  | 0  |         if(which&UTF8) { | 
278  |  |             // UTF-8 lengths and UTF-8 strings.  | 
279  | 0  |             allocSize+=stringsLength*4+utf8Length;  | 
280  | 0  |         }  | 
281  | 0  |     }  | 
282  | 0  |     if(allocSize<=(int32_t)sizeof(staticLengths)) { | 
283  | 0  |         utf8Lengths=staticLengths;  | 
284  | 0  |     } else { | 
285  | 0  |         utf8Lengths=(int32_t *)uprv_malloc(allocSize);  | 
286  | 0  |         if(utf8Lengths==NULL) { | 
287  | 0  |             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.  | 
288  | 0  |             return;  // Out of memory.  | 
289  | 0  |         }  | 
290  | 0  |     }  | 
291  |  |  | 
292  | 0  |     if(all) { | 
293  |  |         // Store span lengths for all span() variants.  | 
294  | 0  |         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);  | 
295  | 0  |         spanBackLengths=spanLengths+stringsLength;  | 
296  | 0  |         spanUTF8Lengths=spanBackLengths+stringsLength;  | 
297  | 0  |         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;  | 
298  | 0  |         utf8=spanBackUTF8Lengths+stringsLength;  | 
299  | 0  |     } else { | 
300  |  |         // Store span lengths for only one span() variant.  | 
301  | 0  |         if(which&UTF8) { | 
302  | 0  |             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);  | 
303  | 0  |             utf8=spanLengths+stringsLength;  | 
304  | 0  |         } else { | 
305  | 0  |             spanLengths=(uint8_t *)utf8Lengths;  | 
306  | 0  |         }  | 
307  | 0  |         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;  | 
308  | 0  |     }  | 
309  |  |  | 
310  |  |     // Set the meta data and pSpanNotSet and write the UTF-8 strings.  | 
311  | 0  |     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.  | 
312  |  | 
  | 
313  | 0  |     for(i=0; i<stringsLength; ++i) { | 
314  | 0  |         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);  | 
315  | 0  |         const UChar *s16=string.getBuffer();  | 
316  | 0  |         int32_t length16=string.length();  | 
317  | 0  |         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);  | 
318  | 0  |         if(spanLength<length16 && length16>0) {  // Relevant string. | 
319  | 0  |             if(which&UTF16) { | 
320  | 0  |                 if(which&CONTAINED) { | 
321  | 0  |                     if(which&FWD) { | 
322  | 0  |                         spanLengths[i]=makeSpanLengthByte(spanLength);  | 
323  | 0  |                     }  | 
324  | 0  |                     if(which&BACK) { | 
325  | 0  |                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);  | 
326  | 0  |                         spanBackLengths[i]=makeSpanLengthByte(spanLength);  | 
327  | 0  |                     }  | 
328  | 0  |                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ { | 
329  | 0  |                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.  | 
330  | 0  |                 }  | 
331  | 0  |             }  | 
332  | 0  |             if(which&UTF8) { | 
333  | 0  |                 uint8_t *s8=utf8+utf8Count;  | 
334  | 0  |                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);  | 
335  | 0  |                 utf8Count+=utf8Lengths[i]=length8;  | 
336  | 0  |                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8. | 
337  | 0  |                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;  | 
338  | 0  |                 } else {  // Relevant for UTF-8. | 
339  | 0  |                     if(which&CONTAINED) { | 
340  | 0  |                         if(which&FWD) { | 
341  | 0  |                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);  | 
342  | 0  |                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);  | 
343  | 0  |                         }  | 
344  | 0  |                         if(which&BACK) { | 
345  | 0  |                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);  | 
346  | 0  |                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);  | 
347  | 0  |                         }  | 
348  | 0  |                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ { | 
349  | 0  |                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.  | 
350  | 0  |                     }  | 
351  | 0  |                 }  | 
352  | 0  |             }  | 
353  | 0  |             if(which&NOT_CONTAINED) { | 
354  |  |                 // Add string start and end code points to the spanNotSet so that  | 
355  |  |                 // a span(while not contained) stops before any string.  | 
356  | 0  |                 UChar32 c;  | 
357  | 0  |                 if(which&FWD) { | 
358  | 0  |                     int32_t len=0;  | 
359  | 0  |                     U16_NEXT(s16, len, length16, c);  | 
360  | 0  |                     addToSpanNotSet(c);  | 
361  | 0  |                 }  | 
362  | 0  |                 if(which&BACK) { | 
363  | 0  |                     int32_t len=length16;  | 
364  | 0  |                     U16_PREV(s16, 0, len, c);  | 
365  | 0  |                     addToSpanNotSet(c);  | 
366  | 0  |                 }  | 
367  | 0  |             }  | 
368  | 0  |         } else {  // Irrelevant string. (Also the empty string.) | 
369  | 0  |             if(which&UTF8) { | 
370  | 0  |                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH. | 
371  | 0  |                     uint8_t *s8=utf8+utf8Count;  | 
372  | 0  |                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);  | 
373  | 0  |                     utf8Count+=utf8Lengths[i]=length8;  | 
374  | 0  |                 } else { | 
375  | 0  |                     utf8Lengths[i]=0;  | 
376  | 0  |                 }  | 
377  | 0  |             }  | 
378  | 0  |             if(all) { | 
379  | 0  |                 spanLengths[i]=spanBackLengths[i]=  | 
380  | 0  |                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=  | 
381  | 0  |                         (uint8_t)ALL_CP_CONTAINED;  | 
382  | 0  |             } else { | 
383  |  |                 // All spanXYZLengths pointers contain the same address.  | 
384  | 0  |                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;  | 
385  | 0  |             }  | 
386  | 0  |         }  | 
387  | 0  |     }  | 
388  |  |  | 
389  |  |     // Finish.  | 
390  | 0  |     if(all) { | 
391  | 0  |         pSpanNotSet->freeze();  | 
392  | 0  |     }  | 
393  | 0  | }  | 
394  |  |  | 
395  |  | // Copy constructor. Assumes which==ALL for a frozen set.  | 
396  |  | UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,  | 
397  |  |                                            const UVector &newParentSetStrings)  | 
398  | 0  |         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),  | 
399  |  |           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),  | 
400  | 0  |           utf8Length(otherStringSpan.utf8Length),  | 
401  | 0  |           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),  | 
402  | 0  |           all(TRUE) { | 
403  | 0  |     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { | 
404  | 0  |         pSpanNotSet=&spanSet;  | 
405  | 0  |     } else { | 
406  | 0  |         pSpanNotSet=otherStringSpan.pSpanNotSet->clone();  | 
407  | 0  |     }  | 
408  |  |  | 
409  |  |     // Allocate a block of meta data.  | 
410  |  |     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.  | 
411  | 0  |     int32_t stringsLength=strings.size();  | 
412  | 0  |     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;  | 
413  | 0  |     if(allocSize<=(int32_t)sizeof(staticLengths)) { | 
414  | 0  |         utf8Lengths=staticLengths;  | 
415  | 0  |     } else { | 
416  | 0  |         utf8Lengths=(int32_t *)uprv_malloc(allocSize);  | 
417  | 0  |         if(utf8Lengths==NULL) { | 
418  | 0  |             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.  | 
419  | 0  |             return;  // Out of memory.  | 
420  | 0  |         }  | 
421  | 0  |     }  | 
422  |  |  | 
423  | 0  |     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);  | 
424  | 0  |     utf8=spanLengths+stringsLength*4;  | 
425  | 0  |     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);  | 
426  | 0  | }  | 
427  |  |  | 
428  | 0  | UnicodeSetStringSpan::~UnicodeSetStringSpan() { | 
429  | 0  |     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { | 
430  | 0  |         delete pSpanNotSet;  | 
431  | 0  |     }  | 
432  | 0  |     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { | 
433  | 0  |         uprv_free(utf8Lengths);  | 
434  | 0  |     }  | 
435  | 0  | }  | 
436  |  |  | 
437  | 0  | void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { | 
438  | 0  |     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { | 
439  | 0  |         if(spanSet.contains(c)) { | 
440  | 0  |             return;  // Nothing to do.  | 
441  | 0  |         }  | 
442  | 0  |         UnicodeSet *newSet=spanSet.cloneAsThawed();  | 
443  | 0  |         if(newSet==NULL) { | 
444  | 0  |             return;  // Out of memory.  | 
445  | 0  |         } else { | 
446  | 0  |             pSpanNotSet=newSet;  | 
447  | 0  |         }  | 
448  | 0  |     }  | 
449  | 0  |     pSpanNotSet->add(c);  | 
450  | 0  | }  | 
451  |  |  | 
452  |  | // Compare strings without any argument checks. Requires length>0.  | 
453  |  | static inline UBool  | 
454  | 0  | matches16(const UChar *s, const UChar *t, int32_t length) { | 
455  | 0  |     do { | 
456  | 0  |         if(*s++!=*t++) { | 
457  | 0  |             return FALSE;  | 
458  | 0  |         }  | 
459  | 0  |     } while(--length>0);  | 
460  | 0  |     return TRUE;  | 
461  | 0  | }  | 
462  |  |  | 
463  |  | static inline UBool  | 
464  | 0  | matches8(const uint8_t *s, const uint8_t *t, int32_t length) { | 
465  | 0  |     do { | 
466  | 0  |         if(*s++!=*t++) { | 
467  | 0  |             return FALSE;  | 
468  | 0  |         }  | 
469  | 0  |     } while(--length>0);  | 
470  | 0  |     return TRUE;  | 
471  | 0  | }  | 
472  |  |  | 
473  |  | // Compare 16-bit Unicode strings (which may be malformed UTF-16)  | 
474  |  | // at code point boundaries.  | 
475  |  | // That is, each edge of a match must not be in the middle of a surrogate pair.  | 
476  |  | static inline UBool  | 
477  | 0  | matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { | 
478  | 0  |     s+=start;  | 
479  | 0  |     limit-=start;  | 
480  | 0  |     return matches16(s, t, length) &&  | 
481  | 0  |            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&  | 
482  | 0  |            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));  | 
483  | 0  | }  | 
484  |  |  | 
485  |  | // Does the set contain the next code point?  | 
486  |  | // If so, return its length; otherwise return its negative length.  | 
487  |  | static inline int32_t  | 
488  | 0  | spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { | 
489  | 0  |     UChar c=*s, c2;  | 
490  | 0  |     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { | 
491  | 0  |         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;  | 
492  | 0  |     }  | 
493  | 0  |     return set.contains(c) ? 1 : -1;  | 
494  | 0  | }  | 
495  |  |  | 
496  |  | static inline int32_t  | 
497  | 0  | spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { | 
498  | 0  |     UChar c=s[length-1], c2;  | 
499  | 0  |     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { | 
500  | 0  |         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;  | 
501  | 0  |     }  | 
502  | 0  |     return set.contains(c) ? 1 : -1;  | 
503  | 0  | }  | 
504  |  |  | 
505  |  | static inline int32_t  | 
506  | 0  | spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { | 
507  | 0  |     UChar32 c=*s;  | 
508  | 0  |     if(U8_IS_SINGLE(c)) { | 
509  | 0  |         return set.contains(c) ? 1 : -1;  | 
510  | 0  |     }  | 
511  |  |     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().  | 
512  | 0  |     int32_t i=0;  | 
513  | 0  |     U8_NEXT_OR_FFFD(s, i, length, c);  | 
514  | 0  |     return set.contains(c) ? i : -i;  | 
515  | 0  | }  | 
516  |  |  | 
517  |  | static inline int32_t  | 
518  | 0  | spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { | 
519  | 0  |     UChar32 c=s[length-1];  | 
520  | 0  |     if(U8_IS_SINGLE(c)) { | 
521  | 0  |         return set.contains(c) ? 1 : -1;  | 
522  | 0  |     }  | 
523  | 0  |     int32_t i=length-1;  | 
524  | 0  |     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);  | 
525  | 0  |     length-=i;  | 
526  | 0  |     return set.contains(c) ? length : -length;  | 
527  | 0  | }  | 
528  |  |  | 
529  |  | /*  | 
530  |  |  * Note: In span() when spanLength==0 (after a string match, or at the beginning  | 
531  |  |  * after an empty code point span) and in spanNot() and spanNotUTF8(),  | 
532  |  |  * string matching could use a binary search  | 
533  |  |  * because all string matches are done from the same start index.  | 
534  |  |  *  | 
535  |  |  * For UTF-8, this would require a comparison function that returns UTF-16 order.  | 
536  |  |  *  | 
537  |  |  * This optimization should not be necessary for normal UnicodeSets because  | 
538  |  |  * most sets have no strings, and most sets with strings have  | 
539  |  |  * very few very short strings.  | 
540  |  |  * For cases with many strings, it might be better to use a different API  | 
541  |  |  * and implementation with a DFA (state machine).  | 
542  |  |  */  | 
543  |  |  | 
544  |  | /*  | 
545  |  |  * Algorithm for span(USET_SPAN_CONTAINED)  | 
546  |  |  *  | 
547  |  |  * Theoretical algorithm:  | 
548  |  |  * - Iterate through the string, and at each code point boundary:  | 
549  |  |  *   + If the code point there is in the set, then remember to continue after it.  | 
550  |  |  *   + If a set string matches at the current position, then remember to continue after it.  | 
551  |  |  *   + Either recursively span for each code point or string match,  | 
552  |  |  *     or recursively span for all but the shortest one and  | 
553  |  |  *     iteratively continue the span with the shortest local match.  | 
554  |  |  *   + Remember the longest recursive span (the farthest end point).  | 
555  |  |  *   + If there is no match at the current position, neither for the code point there  | 
556  |  |  *     nor for any set string, then stop and return the longest recursive span length.  | 
557  |  |  *  | 
558  |  |  * Optimized implementation:  | 
559  |  |  *  | 
560  |  |  * (We assume that most sets will have very few very short strings.  | 
561  |  |  * A span using a string-less set is extremely fast.)  | 
562  |  |  *  | 
563  |  |  * Create and cache a spanSet which contains all of the single code points  | 
564  |  |  * of the original set but none of its strings.  | 
565  |  |  *  | 
566  |  |  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).  | 
567  |  |  * - Loop:  | 
568  |  |  *   + Try to match each set string at the end of the spanLength.  | 
569  |  |  *     ~ Set strings that start with set-contained code points must be matched  | 
570  |  |  *       with a partial overlap because the recursive algorithm would have tried  | 
571  |  |  *       to match them at every position.  | 
572  |  |  *     ~ Set strings that entirely consist of set-contained code points  | 
573  |  |  *       are irrelevant for span(USET_SPAN_CONTAINED) because the  | 
574  |  |  *       recursive algorithm would continue after them anyway  | 
575  |  |  *       and find the longest recursive match from their end.  | 
576  |  |  *     ~ Rather than recursing, note each end point of a set string match.  | 
577  |  |  *   + If no set string matched after spanSet.span(), then return  | 
578  |  |  *     with where the spanSet.span() ended.  | 
579  |  |  *   + If at least one set string matched after spanSet.span(), then  | 
580  |  |  *     pop the shortest string match end point and continue  | 
581  |  |  *     the loop, trying to match all set strings from there.  | 
582  |  |  *   + If at least one more set string matched after a previous string match,  | 
583  |  |  *     then test if the code point after the previous string match is also  | 
584  |  |  *     contained in the set.  | 
585  |  |  *     Continue the loop with the shortest end point of either this code point  | 
586  |  |  *     or a matching set string.  | 
587  |  |  *   + If no more set string matched after a previous string match,  | 
588  |  |  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).  | 
589  |  |  *     Stop if spanLength==0, otherwise continue the loop.  | 
590  |  |  *  | 
591  |  |  * By noting each end point of a set string match,  | 
592  |  |  * the function visits each string position at most once and finishes  | 
593  |  |  * in linear time.  | 
594  |  |  *  | 
595  |  |  * The recursive algorithm may visit the same string position many times  | 
596  |  |  * if multiple paths lead to it and finishes in exponential time.  | 
597  |  |  */  | 
598  |  |  | 
599  |  | /*  | 
600  |  |  * Algorithm for span(USET_SPAN_SIMPLE)  | 
601  |  |  *  | 
602  |  |  * Theoretical algorithm:  | 
603  |  |  * - Iterate through the string, and at each code point boundary:  | 
604  |  |  *   + If the code point there is in the set, then remember to continue after it.  | 
605  |  |  *   + If a set string matches at the current position, then remember to continue after it.  | 
606  |  |  *   + Continue from the farthest match position and ignore all others.  | 
607  |  |  *   + If there is no match at the current position,  | 
608  |  |  *     then stop and return the current position.  | 
609  |  |  *  | 
610  |  |  * Optimized implementation:  | 
611  |  |  *  | 
612  |  |  * (Same assumption and spanSet as above.)  | 
613  |  |  *  | 
614  |  |  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).  | 
615  |  |  * - Loop:  | 
616  |  |  *   + Try to match each set string at the end of the spanLength.  | 
617  |  |  *     ~ Set strings that start with set-contained code points must be matched  | 
618  |  |  *       with a partial overlap because the standard algorithm would have tried  | 
619  |  |  *       to match them earlier.  | 
620  |  |  *     ~ Set strings that entirely consist of set-contained code points  | 
621  |  |  *       must be matched with a full overlap because the longest-match algorithm  | 
622  |  |  *       would hide set string matches that end earlier.  | 
623  |  |  *       Such set strings need not be matched earlier inside the code point span  | 
624  |  |  *       because the standard algorithm would then have continued after  | 
625  |  |  *       the set string match anyway.  | 
626  |  |  *     ~ Remember the longest set string match (farthest end point) from the earliest  | 
627  |  |  *       starting point.  | 
628  |  |  *   + If no set string matched after spanSet.span(), then return  | 
629  |  |  *     with where the spanSet.span() ended.  | 
630  |  |  *   + If at least one set string matched, then continue the loop after the  | 
631  |  |  *     longest match from the earliest position.  | 
632  |  |  *   + If no more set string matched after a previous string match,  | 
633  |  |  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).  | 
634  |  |  *     Stop if spanLength==0, otherwise continue the loop.  | 
635  |  |  */  | 
636  |  |  | 
637  | 0  | int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { | 
638  | 0  |     if(spanCondition==USET_SPAN_NOT_CONTAINED) { | 
639  | 0  |         return spanNot(s, length);  | 
640  | 0  |     }  | 
641  | 0  |     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);  | 
642  | 0  |     if(spanLength==length) { | 
643  | 0  |         return length;  | 
644  | 0  |     }  | 
645  |  |  | 
646  |  |     // Consider strings; they may overlap with the span.  | 
647  | 0  |     OffsetList offsets;  | 
648  | 0  |     if(spanCondition==USET_SPAN_CONTAINED) { | 
649  |  |         // Use offset list to try all possibilities.  | 
650  | 0  |         offsets.setMaxLength(maxLength16);  | 
651  | 0  |     }  | 
652  | 0  |     int32_t pos=spanLength, rest=length-pos;  | 
653  | 0  |     int32_t i, stringsLength=strings.size();  | 
654  | 0  |     for(;;) { | 
655  | 0  |         if(spanCondition==USET_SPAN_CONTAINED) { | 
656  | 0  |             for(i=0; i<stringsLength; ++i) { | 
657  | 0  |                 int32_t overlap=spanLengths[i];  | 
658  | 0  |                 if(overlap==ALL_CP_CONTAINED) { | 
659  | 0  |                     continue;  // Irrelevant string. (Also the empty string.)  | 
660  | 0  |                 }  | 
661  | 0  |                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);  | 
662  | 0  |                 const UChar *s16=string.getBuffer();  | 
663  | 0  |                 int32_t length16=string.length();  | 
664  | 0  |                 U_ASSERT(length>0);  | 
665  |  |  | 
666  |  |                 // Try to match this string at pos-overlap..pos.  | 
667  | 0  |                 if(overlap>=LONG_SPAN) { | 
668  | 0  |                     overlap=length16;  | 
669  |  |                     // While contained: No point matching fully inside the code point span.  | 
670  | 0  |                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.  | 
671  | 0  |                 }  | 
672  | 0  |                 if(overlap>spanLength) { | 
673  | 0  |                     overlap=spanLength;  | 
674  | 0  |                 }  | 
675  | 0  |                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.  | 
676  | 0  |                 for(;;) { | 
677  | 0  |                     if(inc>rest) { | 
678  | 0  |                         break;  | 
679  | 0  |                     }  | 
680  |  |                     // Try to match if the increment is not listed already.  | 
681  | 0  |                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { | 
682  | 0  |                         if(inc==rest) { | 
683  | 0  |                             return length;  // Reached the end of the string.  | 
684  | 0  |                         }  | 
685  | 0  |                         offsets.addOffset(inc);  | 
686  | 0  |                     }  | 
687  | 0  |                     if(overlap==0) { | 
688  | 0  |                         break;  | 
689  | 0  |                     }  | 
690  | 0  |                     --overlap;  | 
691  | 0  |                     ++inc;  | 
692  | 0  |                 }  | 
693  | 0  |             }  | 
694  | 0  |         } else /* USET_SPAN_SIMPLE */ { | 
695  | 0  |             int32_t maxInc=0, maxOverlap=0;  | 
696  | 0  |             for(i=0; i<stringsLength; ++i) { | 
697  | 0  |                 int32_t overlap=spanLengths[i];  | 
698  |  |                 // For longest match, we do need to try to match even an all-contained string  | 
699  |  |                 // to find the match from the earliest start.  | 
700  |  | 
  | 
701  | 0  |                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);  | 
702  | 0  |                 const UChar *s16=string.getBuffer();  | 
703  | 0  |                 int32_t length16=string.length();  | 
704  | 0  |                 if (length16==0) { | 
705  | 0  |                     continue;  // skip the empty string  | 
706  | 0  |                 }  | 
707  |  |  | 
708  |  |                 // Try to match this string at pos-overlap..pos.  | 
709  | 0  |                 if(overlap>=LONG_SPAN) { | 
710  | 0  |                     overlap=length16;  | 
711  |  |                     // Longest match: Need to match fully inside the code point span  | 
712  |  |                     // to find the match from the earliest start.  | 
713  | 0  |                 }  | 
714  | 0  |                 if(overlap>spanLength) { | 
715  | 0  |                     overlap=spanLength;  | 
716  | 0  |                 }  | 
717  | 0  |                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.  | 
718  | 0  |                 for(;;) { | 
719  | 0  |                     if(inc>rest || overlap<maxOverlap) { | 
720  | 0  |                         break;  | 
721  | 0  |                     }  | 
722  |  |                     // Try to match if the string is longer or starts earlier.  | 
723  | 0  |                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&  | 
724  | 0  |                         matches16CPB(s, pos-overlap, length, s16, length16)  | 
725  | 0  |                     ) { | 
726  | 0  |                         maxInc=inc;  // Longest match from earliest start.  | 
727  | 0  |                         maxOverlap=overlap;  | 
728  | 0  |                         break;  | 
729  | 0  |                     }  | 
730  | 0  |                     --overlap;  | 
731  | 0  |                     ++inc;  | 
732  | 0  |                 }  | 
733  | 0  |             }  | 
734  |  | 
  | 
735  | 0  |             if(maxInc!=0 || maxOverlap!=0) { | 
736  |  |                 // Longest-match algorithm, and there was a string match.  | 
737  |  |                 // Simply continue after it.  | 
738  | 0  |                 pos+=maxInc;  | 
739  | 0  |                 rest-=maxInc;  | 
740  | 0  |                 if(rest==0) { | 
741  | 0  |                     return length;  // Reached the end of the string.  | 
742  | 0  |                 }  | 
743  | 0  |                 spanLength=0;  // Match strings from after a string match.  | 
744  | 0  |                 continue;  | 
745  | 0  |             }  | 
746  | 0  |         }  | 
747  |  |         // Finished trying to match all strings at pos.  | 
748  |  |  | 
749  | 0  |         if(spanLength!=0 || pos==0) { | 
750  |  |             // The position is after an unlimited code point span (spanLength!=0),  | 
751  |  |             // not after a string match.  | 
752  |  |             // The only position where spanLength==0 after a span is pos==0.  | 
753  |  |             // Otherwise, an unlimited code point span is only tried again when no  | 
754  |  |             // strings match, and if such a non-initial span fails we stop.  | 
755  | 0  |             if(offsets.isEmpty()) { | 
756  | 0  |                 return pos;  // No strings matched after a span.  | 
757  | 0  |             }  | 
758  |  |             // Match strings from after the next string match.  | 
759  | 0  |         } else { | 
760  |  |             // The position is after a string match (or a single code point).  | 
761  | 0  |             if(offsets.isEmpty()) { | 
762  |  |                 // No more strings matched after a previous string match.  | 
763  |  |                 // Try another code point span from after the last string match.  | 
764  | 0  |                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);  | 
765  | 0  |                 if( spanLength==rest || // Reached the end of the string, or  | 
766  | 0  |                     spanLength==0       // neither strings nor span progressed.  | 
767  | 0  |                 ) { | 
768  | 0  |                     return pos+spanLength;  | 
769  | 0  |                 }  | 
770  | 0  |                 pos+=spanLength;  | 
771  | 0  |                 rest-=spanLength;  | 
772  | 0  |                 continue;  // spanLength>0: Match strings from after a span.  | 
773  | 0  |             } else { | 
774  |  |                 // Try to match only one code point from after a string match if some  | 
775  |  |                 // string matched beyond it, so that we try all possible positions  | 
776  |  |                 // and don't overshoot.  | 
777  | 0  |                 spanLength=spanOne(spanSet, s+pos, rest);  | 
778  | 0  |                 if(spanLength>0) { | 
779  | 0  |                     if(spanLength==rest) { | 
780  | 0  |                         return length;  // Reached the end of the string.  | 
781  | 0  |                     }  | 
782  |  |                     // Match strings after this code point.  | 
783  |  |                     // There cannot be any increments below it because UnicodeSet strings  | 
784  |  |                     // contain multiple code points.  | 
785  | 0  |                     pos+=spanLength;  | 
786  | 0  |                     rest-=spanLength;  | 
787  | 0  |                     offsets.shift(spanLength);  | 
788  | 0  |                     spanLength=0;  | 
789  | 0  |                     continue;  // Match strings from after a single code point.  | 
790  | 0  |                 }  | 
791  |  |                 // Match strings from after the next string match.  | 
792  | 0  |             }  | 
793  | 0  |         }  | 
794  | 0  |         int32_t minOffset=offsets.popMinimum();  | 
795  | 0  |         pos+=minOffset;  | 
796  | 0  |         rest-=minOffset;  | 
797  | 0  |         spanLength=0;  // Match strings from after a string match.  | 
798  | 0  |     }  | 
799  | 0  | }  | 
800  |  |  | 
801  | 0  | int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { | 
802  | 0  |     if(spanCondition==USET_SPAN_NOT_CONTAINED) { | 
803  | 0  |         return spanNotBack(s, length);  | 
804  | 0  |     }  | 
805  | 0  |     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);  | 
806  | 0  |     if(pos==0) { | 
807  | 0  |         return 0;  | 
808  | 0  |     }  | 
809  | 0  |     int32_t spanLength=length-pos;  | 
810  |  |  | 
811  |  |     // Consider strings; they may overlap with the span.  | 
812  | 0  |     OffsetList offsets;  | 
813  | 0  |     if(spanCondition==USET_SPAN_CONTAINED) { | 
814  |  |         // Use offset list to try all possibilities.  | 
815  | 0  |         offsets.setMaxLength(maxLength16);  | 
816  | 0  |     }  | 
817  | 0  |     int32_t i, stringsLength=strings.size();  | 
818  | 0  |     uint8_t *spanBackLengths=spanLengths;  | 
819  | 0  |     if(all) { | 
820  | 0  |         spanBackLengths+=stringsLength;  | 
821  | 0  |     }  | 
822  | 0  |     for(;;) { | 
823  | 0  |         if(spanCondition==USET_SPAN_CONTAINED) { | 
824  | 0  |             for(i=0; i<stringsLength; ++i) { | 
825  | 0  |                 int32_t overlap=spanBackLengths[i];  | 
826  | 0  |                 if(overlap==ALL_CP_CONTAINED) { | 
827  | 0  |                     continue;  // Irrelevant string. (Also the empty string.)  | 
828  | 0  |                 }  | 
829  | 0  |                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);  | 
830  | 0  |                 const UChar *s16=string.getBuffer();  | 
831  | 0  |                 int32_t length16=string.length();  | 
832  | 0  |                 U_ASSERT(length>0);  | 
833  |  |  | 
834  |  |                 // Try to match this string at pos-(length16-overlap)..pos-length16.  | 
835  | 0  |                 if(overlap>=LONG_SPAN) { | 
836  | 0  |                     overlap=length16;  | 
837  |  |                     // While contained: No point matching fully inside the code point span.  | 
838  | 0  |                     int32_t len1=0;  | 
839  | 0  |                     U16_FWD_1(s16, len1, overlap);  | 
840  | 0  |                     overlap-=len1;  // Length of the string minus the first code point.  | 
841  | 0  |                 }  | 
842  | 0  |                 if(overlap>spanLength) { | 
843  | 0  |                     overlap=spanLength;  | 
844  | 0  |                 }  | 
845  | 0  |                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.  | 
846  | 0  |                 for(;;) { | 
847  | 0  |                     if(dec>pos) { | 
848  | 0  |                         break;  | 
849  | 0  |                     }  | 
850  |  |                     // Try to match if the decrement is not listed already.  | 
851  | 0  |                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { | 
852  | 0  |                         if(dec==pos) { | 
853  | 0  |                             return 0;  // Reached the start of the string.  | 
854  | 0  |                         }  | 
855  | 0  |                         offsets.addOffset(dec);  | 
856  | 0  |                     }  | 
857  | 0  |                     if(overlap==0) { | 
858  | 0  |                         break;  | 
859  | 0  |                     }  | 
860  | 0  |                     --overlap;  | 
861  | 0  |                     ++dec;  | 
862  | 0  |                 }  | 
863  | 0  |             }  | 
864  | 0  |         } else /* USET_SPAN_SIMPLE */ { | 
865  | 0  |             int32_t maxDec=0, maxOverlap=0;  | 
866  | 0  |             for(i=0; i<stringsLength; ++i) { | 
867  | 0  |                 int32_t overlap=spanBackLengths[i];  | 
868  |  |                 // For longest match, we do need to try to match even an all-contained string  | 
869  |  |                 // to find the match from the latest end.  | 
870  |  | 
  | 
871  | 0  |                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);  | 
872  | 0  |                 const UChar *s16=string.getBuffer();  | 
873  | 0  |                 int32_t length16=string.length();  | 
874  | 0  |                 if (length16==0) { | 
875  | 0  |                     continue;  // skip the empty string  | 
876  | 0  |                 }  | 
877  |  |  | 
878  |  |                 // Try to match this string at pos-(length16-overlap)..pos-length16.  | 
879  | 0  |                 if(overlap>=LONG_SPAN) { | 
880  | 0  |                     overlap=length16;  | 
881  |  |                     // Longest match: Need to match fully inside the code point span  | 
882  |  |                     // to find the match from the latest end.  | 
883  | 0  |                 }  | 
884  | 0  |                 if(overlap>spanLength) { | 
885  | 0  |                     overlap=spanLength;  | 
886  | 0  |                 }  | 
887  | 0  |                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.  | 
888  | 0  |                 for(;;) { | 
889  | 0  |                     if(dec>pos || overlap<maxOverlap) { | 
890  | 0  |                         break;  | 
891  | 0  |                     }  | 
892  |  |                     // Try to match if the string is longer or ends later.  | 
893  | 0  |                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&  | 
894  | 0  |                         matches16CPB(s, pos-dec, length, s16, length16)  | 
895  | 0  |                     ) { | 
896  | 0  |                         maxDec=dec;  // Longest match from latest end.  | 
897  | 0  |                         maxOverlap=overlap;  | 
898  | 0  |                         break;  | 
899  | 0  |                     }  | 
900  | 0  |                     --overlap;  | 
901  | 0  |                     ++dec;  | 
902  | 0  |                 }  | 
903  | 0  |             }  | 
904  |  | 
  | 
905  | 0  |             if(maxDec!=0 || maxOverlap!=0) { | 
906  |  |                 // Longest-match algorithm, and there was a string match.  | 
907  |  |                 // Simply continue before it.  | 
908  | 0  |                 pos-=maxDec;  | 
909  | 0  |                 if(pos==0) { | 
910  | 0  |                     return 0;  // Reached the start of the string.  | 
911  | 0  |                 }  | 
912  | 0  |                 spanLength=0;  // Match strings from before a string match.  | 
913  | 0  |                 continue;  | 
914  | 0  |             }  | 
915  | 0  |         }  | 
916  |  |         // Finished trying to match all strings at pos.  | 
917  |  |  | 
918  | 0  |         if(spanLength!=0 || pos==length) { | 
919  |  |             // The position is before an unlimited code point span (spanLength!=0),  | 
920  |  |             // not before a string match.  | 
921  |  |             // The only position where spanLength==0 before a span is pos==length.  | 
922  |  |             // Otherwise, an unlimited code point span is only tried again when no  | 
923  |  |             // strings match, and if such a non-initial span fails we stop.  | 
924  | 0  |             if(offsets.isEmpty()) { | 
925  | 0  |                 return pos;  // No strings matched before a span.  | 
926  | 0  |             }  | 
927  |  |             // Match strings from before the next string match.  | 
928  | 0  |         } else { | 
929  |  |             // The position is before a string match (or a single code point).  | 
930  | 0  |             if(offsets.isEmpty()) { | 
931  |  |                 // No more strings matched before a previous string match.  | 
932  |  |                 // Try another code point span from before the last string match.  | 
933  | 0  |                 int32_t oldPos=pos;  | 
934  | 0  |                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);  | 
935  | 0  |                 spanLength=oldPos-pos;  | 
936  | 0  |                 if( pos==0 ||           // Reached the start of the string, or  | 
937  | 0  |                     spanLength==0       // neither strings nor span progressed.  | 
938  | 0  |                 ) { | 
939  | 0  |                     return pos;  | 
940  | 0  |                 }  | 
941  | 0  |                 continue;  // spanLength>0: Match strings from before a span.  | 
942  | 0  |             } else { | 
943  |  |                 // Try to match only one code point from before a string match if some  | 
944  |  |                 // string matched beyond it, so that we try all possible positions  | 
945  |  |                 // and don't overshoot.  | 
946  | 0  |                 spanLength=spanOneBack(spanSet, s, pos);  | 
947  | 0  |                 if(spanLength>0) { | 
948  | 0  |                     if(spanLength==pos) { | 
949  | 0  |                         return 0;  // Reached the start of the string.  | 
950  | 0  |                     }  | 
951  |  |                     // Match strings before this code point.  | 
952  |  |                     // There cannot be any decrements below it because UnicodeSet strings  | 
953  |  |                     // contain multiple code points.  | 
954  | 0  |                     pos-=spanLength;  | 
955  | 0  |                     offsets.shift(spanLength);  | 
956  | 0  |                     spanLength=0;  | 
957  | 0  |                     continue;  // Match strings from before a single code point.  | 
958  | 0  |                 }  | 
959  |  |                 // Match strings from before the next string match.  | 
960  | 0  |             }  | 
961  | 0  |         }  | 
962  | 0  |         pos-=offsets.popMinimum();  | 
963  | 0  |         spanLength=0;  // Match strings from before a string match.  | 
964  | 0  |     }  | 
965  | 0  | }  | 
966  |  |  | 
967  | 0  | int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { | 
968  | 0  |     if(spanCondition==USET_SPAN_NOT_CONTAINED) { | 
969  | 0  |         return spanNotUTF8(s, length);  | 
970  | 0  |     }  | 
971  | 0  |     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);  | 
972  | 0  |     if(spanLength==length) { | 
973  | 0  |         return length;  | 
974  | 0  |     }  | 
975  |  |  | 
976  |  |     // Consider strings; they may overlap with the span.  | 
977  | 0  |     OffsetList offsets;  | 
978  | 0  |     if(spanCondition==USET_SPAN_CONTAINED) { | 
979  |  |         // Use offset list to try all possibilities.  | 
980  | 0  |         offsets.setMaxLength(maxLength8);  | 
981  | 0  |     }  | 
982  | 0  |     int32_t pos=spanLength, rest=length-pos;  | 
983  | 0  |     int32_t i, stringsLength=strings.size();  | 
984  | 0  |     uint8_t *spanUTF8Lengths=spanLengths;  | 
985  | 0  |     if(all) { | 
986  | 0  |         spanUTF8Lengths+=2*stringsLength;  | 
987  | 0  |     }  | 
988  | 0  |     for(;;) { | 
989  | 0  |         const uint8_t *s8=utf8;  | 
990  | 0  |         int32_t length8;  | 
991  | 0  |         if(spanCondition==USET_SPAN_CONTAINED) { | 
992  | 0  |             for(i=0; i<stringsLength; ++i) { | 
993  | 0  |                 length8=utf8Lengths[i];  | 
994  | 0  |                 if(length8==0) { | 
995  | 0  |                     continue;  // String not representable in UTF-8.  | 
996  | 0  |                 }  | 
997  | 0  |                 int32_t overlap=spanUTF8Lengths[i];  | 
998  | 0  |                 if(overlap==ALL_CP_CONTAINED) { | 
999  | 0  |                     s8+=length8;  | 
1000  | 0  |                     continue;  // Irrelevant string.  | 
1001  | 0  |                 }  | 
1002  |  |  | 
1003  |  |                 // Try to match this string at pos-overlap..pos.  | 
1004  | 0  |                 if(overlap>=LONG_SPAN) { | 
1005  | 0  |                     overlap=length8;  | 
1006  |  |                     // While contained: No point matching fully inside the code point span.  | 
1007  | 0  |                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.  | 
1008  | 0  |                 }  | 
1009  | 0  |                 if(overlap>spanLength) { | 
1010  | 0  |                     overlap=spanLength;  | 
1011  | 0  |                 }  | 
1012  | 0  |                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.  | 
1013  | 0  |                 for(;;) { | 
1014  | 0  |                     if(inc>rest) { | 
1015  | 0  |                         break;  | 
1016  | 0  |                     }  | 
1017  |  |                     // Try to match if the increment is not listed already.  | 
1018  |  |                     // Match at code point boundaries. (The UTF-8 strings were converted  | 
1019  |  |                     // from UTF-16 and are guaranteed to be well-formed.)  | 
1020  | 0  |                     if(!U8_IS_TRAIL(s[pos-overlap]) &&  | 
1021  | 0  |                             !offsets.containsOffset(inc) &&  | 
1022  | 0  |                             matches8(s+pos-overlap, s8, length8)) { | 
1023  | 0  |                         if(inc==rest) { | 
1024  | 0  |                             return length;  // Reached the end of the string.  | 
1025  | 0  |                         }  | 
1026  | 0  |                         offsets.addOffset(inc);  | 
1027  | 0  |                     }  | 
1028  | 0  |                     if(overlap==0) { | 
1029  | 0  |                         break;  | 
1030  | 0  |                     }  | 
1031  | 0  |                     --overlap;  | 
1032  | 0  |                     ++inc;  | 
1033  | 0  |                 }  | 
1034  | 0  |                 s8+=length8;  | 
1035  | 0  |             }  | 
1036  | 0  |         } else /* USET_SPAN_SIMPLE */ { | 
1037  | 0  |             int32_t maxInc=0, maxOverlap=0;  | 
1038  | 0  |             for(i=0; i<stringsLength; ++i) { | 
1039  | 0  |                 length8=utf8Lengths[i];  | 
1040  | 0  |                 if(length8==0) { | 
1041  | 0  |                     continue;  // String not representable in UTF-8.  | 
1042  | 0  |                 }  | 
1043  | 0  |                 int32_t overlap=spanUTF8Lengths[i];  | 
1044  |  |                 // For longest match, we do need to try to match even an all-contained string  | 
1045  |  |                 // to find the match from the earliest start.  | 
1046  |  |  | 
1047  |  |                 // Try to match this string at pos-overlap..pos.  | 
1048  | 0  |                 if(overlap>=LONG_SPAN) { | 
1049  | 0  |                     overlap=length8;  | 
1050  |  |                     // Longest match: Need to match fully inside the code point span  | 
1051  |  |                     // to find the match from the earliest start.  | 
1052  | 0  |                 }  | 
1053  | 0  |                 if(overlap>spanLength) { | 
1054  | 0  |                     overlap=spanLength;  | 
1055  | 0  |                 }  | 
1056  | 0  |                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.  | 
1057  | 0  |                 for(;;) { | 
1058  | 0  |                     if(inc>rest || overlap<maxOverlap) { | 
1059  | 0  |                         break;  | 
1060  | 0  |                     }  | 
1061  |  |                     // Try to match if the string is longer or starts earlier.  | 
1062  |  |                     // Match at code point boundaries. (The UTF-8 strings were converted  | 
1063  |  |                     // from UTF-16 and are guaranteed to be well-formed.)  | 
1064  | 0  |                     if(!U8_IS_TRAIL(s[pos-overlap]) &&  | 
1065  | 0  |                             (overlap>maxOverlap ||  | 
1066  | 0  |                                 /* redundant overlap==maxOverlap && */ inc>maxInc) &&  | 
1067  | 0  |                             matches8(s+pos-overlap, s8, length8)) { | 
1068  | 0  |                         maxInc=inc;  // Longest match from earliest start.  | 
1069  | 0  |                         maxOverlap=overlap;  | 
1070  | 0  |                         break;  | 
1071  | 0  |                     }  | 
1072  | 0  |                     --overlap;  | 
1073  | 0  |                     ++inc;  | 
1074  | 0  |                 }  | 
1075  | 0  |                 s8+=length8;  | 
1076  | 0  |             }  | 
1077  |  | 
  | 
1078  | 0  |             if(maxInc!=0 || maxOverlap!=0) { | 
1079  |  |                 // Longest-match algorithm, and there was a string match.  | 
1080  |  |                 // Simply continue after it.  | 
1081  | 0  |                 pos+=maxInc;  | 
1082  | 0  |                 rest-=maxInc;  | 
1083  | 0  |                 if(rest==0) { | 
1084  | 0  |                     return length;  // Reached the end of the string.  | 
1085  | 0  |                 }  | 
1086  | 0  |                 spanLength=0;  // Match strings from after a string match.  | 
1087  | 0  |                 continue;  | 
1088  | 0  |             }  | 
1089  | 0  |         }  | 
1090  |  |         // Finished trying to match all strings at pos.  | 
1091  |  |  | 
1092  | 0  |         if(spanLength!=0 || pos==0) { | 
1093  |  |             // The position is after an unlimited code point span (spanLength!=0),  | 
1094  |  |             // not after a string match.  | 
1095  |  |             // The only position where spanLength==0 after a span is pos==0.  | 
1096  |  |             // Otherwise, an unlimited code point span is only tried again when no  | 
1097  |  |             // strings match, and if such a non-initial span fails we stop.  | 
1098  | 0  |             if(offsets.isEmpty()) { | 
1099  | 0  |                 return pos;  // No strings matched after a span.  | 
1100  | 0  |             }  | 
1101  |  |             // Match strings from after the next string match.  | 
1102  | 0  |         } else { | 
1103  |  |             // The position is after a string match (or a single code point).  | 
1104  | 0  |             if(offsets.isEmpty()) { | 
1105  |  |                 // No more strings matched after a previous string match.  | 
1106  |  |                 // Try another code point span from after the last string match.  | 
1107  | 0  |                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);  | 
1108  | 0  |                 if( spanLength==rest || // Reached the end of the string, or  | 
1109  | 0  |                     spanLength==0       // neither strings nor span progressed.  | 
1110  | 0  |                 ) { | 
1111  | 0  |                     return pos+spanLength;  | 
1112  | 0  |                 }  | 
1113  | 0  |                 pos+=spanLength;  | 
1114  | 0  |                 rest-=spanLength;  | 
1115  | 0  |                 continue;  // spanLength>0: Match strings from after a span.  | 
1116  | 0  |             } else { | 
1117  |  |                 // Try to match only one code point from after a string match if some  | 
1118  |  |                 // string matched beyond it, so that we try all possible positions  | 
1119  |  |                 // and don't overshoot.  | 
1120  | 0  |                 spanLength=spanOneUTF8(spanSet, s+pos, rest);  | 
1121  | 0  |                 if(spanLength>0) { | 
1122  | 0  |                     if(spanLength==rest) { | 
1123  | 0  |                         return length;  // Reached the end of the string.  | 
1124  | 0  |                     }  | 
1125  |  |                     // Match strings after this code point.  | 
1126  |  |                     // There cannot be any increments below it because UnicodeSet strings  | 
1127  |  |                     // contain multiple code points.  | 
1128  | 0  |                     pos+=spanLength;  | 
1129  | 0  |                     rest-=spanLength;  | 
1130  | 0  |                     offsets.shift(spanLength);  | 
1131  | 0  |                     spanLength=0;  | 
1132  | 0  |                     continue;  // Match strings from after a single code point.  | 
1133  | 0  |                 }  | 
1134  |  |                 // Match strings from after the next string match.  | 
1135  | 0  |             }  | 
1136  | 0  |         }  | 
1137  | 0  |         int32_t minOffset=offsets.popMinimum();  | 
1138  | 0  |         pos+=minOffset;  | 
1139  | 0  |         rest-=minOffset;  | 
1140  | 0  |         spanLength=0;  // Match strings from after a string match.  | 
1141  | 0  |     }  | 
1142  | 0  | }  | 
1143  |  |  | 
1144  | 0  | int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { | 
1145  | 0  |     if(spanCondition==USET_SPAN_NOT_CONTAINED) { | 
1146  | 0  |         return spanNotBackUTF8(s, length);  | 
1147  | 0  |     }  | 
1148  | 0  |     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);  | 
1149  | 0  |     if(pos==0) { | 
1150  | 0  |         return 0;  | 
1151  | 0  |     }  | 
1152  | 0  |     int32_t spanLength=length-pos;  | 
1153  |  |  | 
1154  |  |     // Consider strings; they may overlap with the span.  | 
1155  | 0  |     OffsetList offsets;  | 
1156  | 0  |     if(spanCondition==USET_SPAN_CONTAINED) { | 
1157  |  |         // Use offset list to try all possibilities.  | 
1158  | 0  |         offsets.setMaxLength(maxLength8);  | 
1159  | 0  |     }  | 
1160  | 0  |     int32_t i, stringsLength=strings.size();  | 
1161  | 0  |     uint8_t *spanBackUTF8Lengths=spanLengths;  | 
1162  | 0  |     if(all) { | 
1163  | 0  |         spanBackUTF8Lengths+=3*stringsLength;  | 
1164  | 0  |     }  | 
1165  | 0  |     for(;;) { | 
1166  | 0  |         const uint8_t *s8=utf8;  | 
1167  | 0  |         int32_t length8;  | 
1168  | 0  |         if(spanCondition==USET_SPAN_CONTAINED) { | 
1169  | 0  |             for(i=0; i<stringsLength; ++i) { | 
1170  | 0  |                 length8=utf8Lengths[i];  | 
1171  | 0  |                 if(length8==0) { | 
1172  | 0  |                     continue;  // String not representable in UTF-8.  | 
1173  | 0  |                 }  | 
1174  | 0  |                 int32_t overlap=spanBackUTF8Lengths[i];  | 
1175  | 0  |                 if(overlap==ALL_CP_CONTAINED) { | 
1176  | 0  |                     s8+=length8;  | 
1177  | 0  |                     continue;  // Irrelevant string.  | 
1178  | 0  |                 }  | 
1179  |  |  | 
1180  |  |                 // Try to match this string at pos-(length8-overlap)..pos-length8.  | 
1181  | 0  |                 if(overlap>=LONG_SPAN) { | 
1182  | 0  |                     overlap=length8;  | 
1183  |  |                     // While contained: No point matching fully inside the code point span.  | 
1184  | 0  |                     int32_t len1=0;  | 
1185  | 0  |                     U8_FWD_1(s8, len1, overlap);  | 
1186  | 0  |                     overlap-=len1;  // Length of the string minus the first code point.  | 
1187  | 0  |                 }  | 
1188  | 0  |                 if(overlap>spanLength) { | 
1189  | 0  |                     overlap=spanLength;  | 
1190  | 0  |                 }  | 
1191  | 0  |                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.  | 
1192  | 0  |                 for(;;) { | 
1193  | 0  |                     if(dec>pos) { | 
1194  | 0  |                         break;  | 
1195  | 0  |                     }  | 
1196  |  |                     // Try to match if the decrement is not listed already.  | 
1197  |  |                     // Match at code point boundaries. (The UTF-8 strings were converted  | 
1198  |  |                     // from UTF-16 and are guaranteed to be well-formed.)  | 
1199  | 0  |                     if( !U8_IS_TRAIL(s[pos-dec]) &&  | 
1200  | 0  |                         !offsets.containsOffset(dec) &&  | 
1201  | 0  |                         matches8(s+pos-dec, s8, length8)  | 
1202  | 0  |                     ) { | 
1203  | 0  |                         if(dec==pos) { | 
1204  | 0  |                             return 0;  // Reached the start of the string.  | 
1205  | 0  |                         }  | 
1206  | 0  |                         offsets.addOffset(dec);  | 
1207  | 0  |                     }  | 
1208  | 0  |                     if(overlap==0) { | 
1209  | 0  |                         break;  | 
1210  | 0  |                     }  | 
1211  | 0  |                     --overlap;  | 
1212  | 0  |                     ++dec;  | 
1213  | 0  |                 }  | 
1214  | 0  |                 s8+=length8;  | 
1215  | 0  |             }  | 
1216  | 0  |         } else /* USET_SPAN_SIMPLE */ { | 
1217  | 0  |             int32_t maxDec=0, maxOverlap=0;  | 
1218  | 0  |             for(i=0; i<stringsLength; ++i) { | 
1219  | 0  |                 length8=utf8Lengths[i];  | 
1220  | 0  |                 if(length8==0) { | 
1221  | 0  |                     continue;  // String not representable in UTF-8.  | 
1222  | 0  |                 }  | 
1223  | 0  |                 int32_t overlap=spanBackUTF8Lengths[i];  | 
1224  |  |                 // For longest match, we do need to try to match even an all-contained string  | 
1225  |  |                 // to find the match from the latest end.  | 
1226  |  |  | 
1227  |  |                 // Try to match this string at pos-(length8-overlap)..pos-length8.  | 
1228  | 0  |                 if(overlap>=LONG_SPAN) { | 
1229  | 0  |                     overlap=length8;  | 
1230  |  |                     // Longest match: Need to match fully inside the code point span  | 
1231  |  |                     // to find the match from the latest end.  | 
1232  | 0  |                 }  | 
1233  | 0  |                 if(overlap>spanLength) { | 
1234  | 0  |                     overlap=spanLength;  | 
1235  | 0  |                 }  | 
1236  | 0  |                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.  | 
1237  | 0  |                 for(;;) { | 
1238  | 0  |                     if(dec>pos || overlap<maxOverlap) { | 
1239  | 0  |                         break;  | 
1240  | 0  |                     }  | 
1241  |  |                     // Try to match if the string is longer or ends later.  | 
1242  |  |                     // Match at code point boundaries. (The UTF-8 strings were converted  | 
1243  |  |                     // from UTF-16 and are guaranteed to be well-formed.)  | 
1244  | 0  |                     if( !U8_IS_TRAIL(s[pos-dec]) &&  | 
1245  | 0  |                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&  | 
1246  | 0  |                         matches8(s+pos-dec, s8, length8)  | 
1247  | 0  |                     ) { | 
1248  | 0  |                         maxDec=dec;  // Longest match from latest end.  | 
1249  | 0  |                         maxOverlap=overlap;  | 
1250  | 0  |                         break;  | 
1251  | 0  |                     }  | 
1252  | 0  |                     --overlap;  | 
1253  | 0  |                     ++dec;  | 
1254  | 0  |                 }  | 
1255  | 0  |                 s8+=length8;  | 
1256  | 0  |             }  | 
1257  |  | 
  | 
1258  | 0  |             if(maxDec!=0 || maxOverlap!=0) { | 
1259  |  |                 // Longest-match algorithm, and there was a string match.  | 
1260  |  |                 // Simply continue before it.  | 
1261  | 0  |                 pos-=maxDec;  | 
1262  | 0  |                 if(pos==0) { | 
1263  | 0  |                     return 0;  // Reached the start of the string.  | 
1264  | 0  |                 }  | 
1265  | 0  |                 spanLength=0;  // Match strings from before a string match.  | 
1266  | 0  |                 continue;  | 
1267  | 0  |             }  | 
1268  | 0  |         }  | 
1269  |  |         // Finished trying to match all strings at pos.  | 
1270  |  |  | 
1271  | 0  |         if(spanLength!=0 || pos==length) { | 
1272  |  |             // The position is before an unlimited code point span (spanLength!=0),  | 
1273  |  |             // not before a string match.  | 
1274  |  |             // The only position where spanLength==0 before a span is pos==length.  | 
1275  |  |             // Otherwise, an unlimited code point span is only tried again when no  | 
1276  |  |             // strings match, and if such a non-initial span fails we stop.  | 
1277  | 0  |             if(offsets.isEmpty()) { | 
1278  | 0  |                 return pos;  // No strings matched before a span.  | 
1279  | 0  |             }  | 
1280  |  |             // Match strings from before the next string match.  | 
1281  | 0  |         } else { | 
1282  |  |             // The position is before a string match (or a single code point).  | 
1283  | 0  |             if(offsets.isEmpty()) { | 
1284  |  |                 // No more strings matched before a previous string match.  | 
1285  |  |                 // Try another code point span from before the last string match.  | 
1286  | 0  |                 int32_t oldPos=pos;  | 
1287  | 0  |                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);  | 
1288  | 0  |                 spanLength=oldPos-pos;  | 
1289  | 0  |                 if( pos==0 ||           // Reached the start of the string, or  | 
1290  | 0  |                     spanLength==0       // neither strings nor span progressed.  | 
1291  | 0  |                 ) { | 
1292  | 0  |                     return pos;  | 
1293  | 0  |                 }  | 
1294  | 0  |                 continue;  // spanLength>0: Match strings from before a span.  | 
1295  | 0  |             } else { | 
1296  |  |                 // Try to match only one code point from before a string match if some  | 
1297  |  |                 // string matched beyond it, so that we try all possible positions  | 
1298  |  |                 // and don't overshoot.  | 
1299  | 0  |                 spanLength=spanOneBackUTF8(spanSet, s, pos);  | 
1300  | 0  |                 if(spanLength>0) { | 
1301  | 0  |                     if(spanLength==pos) { | 
1302  | 0  |                         return 0;  // Reached the start of the string.  | 
1303  | 0  |                     }  | 
1304  |  |                     // Match strings before this code point.  | 
1305  |  |                     // There cannot be any decrements below it because UnicodeSet strings  | 
1306  |  |                     // contain multiple code points.  | 
1307  | 0  |                     pos-=spanLength;  | 
1308  | 0  |                     offsets.shift(spanLength);  | 
1309  | 0  |                     spanLength=0;  | 
1310  | 0  |                     continue;  // Match strings from before a single code point.  | 
1311  | 0  |                 }  | 
1312  |  |                 // Match strings from before the next string match.  | 
1313  | 0  |             }  | 
1314  | 0  |         }  | 
1315  | 0  |         pos-=offsets.popMinimum();  | 
1316  | 0  |         spanLength=0;  // Match strings from before a string match.  | 
1317  | 0  |     }  | 
1318  | 0  | }  | 
1319  |  |  | 
1320  |  | /*  | 
1321  |  |  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)  | 
1322  |  |  *  | 
1323  |  |  * Theoretical algorithm:  | 
1324  |  |  * - Iterate through the string, and at each code point boundary:  | 
1325  |  |  *   + If the code point there is in the set, then return with the current position.  | 
1326  |  |  *   + If a set string matches at the current position, then return with the current position.  | 
1327  |  |  *  | 
1328  |  |  * Optimized implementation:  | 
1329  |  |  *  | 
1330  |  |  * (Same assumption as for span() above.)  | 
1331  |  |  *  | 
1332  |  |  * Create and cache a spanNotSet which contains all of the single code points  | 
1333  |  |  * of the original set but none of its strings.  | 
1334  |  |  * For each set string add its initial code point to the spanNotSet.  | 
1335  |  |  * (Also add its final code point for spanNotBack().)  | 
1336  |  |  *  | 
1337  |  |  * - Loop:  | 
1338  |  |  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).  | 
1339  |  |  *   + If the current code point is in the original set, then  | 
1340  |  |  *     return the current position.  | 
1341  |  |  *   + If any set string matches at the current position, then  | 
1342  |  |  *     return the current position.  | 
1343  |  |  *   + If there is no match at the current position, neither for the code point there  | 
1344  |  |  *     nor for any set string, then skip this code point and continue the loop.  | 
1345  |  |  *     This happens for set-string-initial code points that were added to spanNotSet  | 
1346  |  |  *     when there is not actually a match for such a set string.  | 
1347  |  |  */  | 
1348  |  |  | 
1349  | 0  | int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { | 
1350  | 0  |     int32_t pos=0, rest=length;  | 
1351  | 0  |     int32_t i, stringsLength=strings.size();  | 
1352  | 0  |     do { | 
1353  |  |         // Span until we find a code point from the set,  | 
1354  |  |         // or a code point that starts or ends some string.  | 
1355  | 0  |         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);  | 
1356  | 0  |         if(i==rest) { | 
1357  | 0  |             return length;  // Reached the end of the string.  | 
1358  | 0  |         }  | 
1359  | 0  |         pos+=i;  | 
1360  | 0  |         rest-=i;  | 
1361  |  |  | 
1362  |  |         // Check whether the current code point is in the original set,  | 
1363  |  |         // without the string starts and ends.  | 
1364  | 0  |         int32_t cpLength=spanOne(spanSet, s+pos, rest);  | 
1365  | 0  |         if(cpLength>0) { | 
1366  | 0  |             return pos;  // There is a set element at pos.  | 
1367  | 0  |         }  | 
1368  |  |  | 
1369  |  |         // Try to match the strings at pos.  | 
1370  | 0  |         for(i=0; i<stringsLength; ++i) { | 
1371  | 0  |             if(spanLengths[i]==ALL_CP_CONTAINED) { | 
1372  | 0  |                 continue;  // Irrelevant string. (Also the empty string.)  | 
1373  | 0  |             }  | 
1374  | 0  |             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);  | 
1375  | 0  |             const UChar *s16=string.getBuffer();  | 
1376  | 0  |             int32_t length16=string.length();  | 
1377  | 0  |             U_ASSERT(length>0);  | 
1378  | 0  |             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { | 
1379  | 0  |                 return pos;  // There is a set element at pos.  | 
1380  | 0  |             }  | 
1381  | 0  |         }  | 
1382  |  |  | 
1383  |  |         // The span(while not contained) ended on a string start/end which is  | 
1384  |  |         // not in the original set. Skip this code point and continue.  | 
1385  |  |         // cpLength<0  | 
1386  | 0  |         pos-=cpLength;  | 
1387  | 0  |         rest+=cpLength;  | 
1388  | 0  |     } while(rest!=0);  | 
1389  | 0  |     return length;  // Reached the end of the string.  | 
1390  | 0  | }  | 
1391  |  |  | 
1392  | 0  | int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { | 
1393  | 0  |     int32_t pos=length;  | 
1394  | 0  |     int32_t i, stringsLength=strings.size();  | 
1395  | 0  |     do { | 
1396  |  |         // Span until we find a code point from the set,  | 
1397  |  |         // or a code point that starts or ends some string.  | 
1398  | 0  |         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);  | 
1399  | 0  |         if(pos==0) { | 
1400  | 0  |             return 0;  // Reached the start of the string.  | 
1401  | 0  |         }  | 
1402  |  |  | 
1403  |  |         // Check whether the current code point is in the original set,  | 
1404  |  |         // without the string starts and ends.  | 
1405  | 0  |         int32_t cpLength=spanOneBack(spanSet, s, pos);  | 
1406  | 0  |         if(cpLength>0) { | 
1407  | 0  |             return pos;  // There is a set element at pos.  | 
1408  | 0  |         }  | 
1409  |  |  | 
1410  |  |         // Try to match the strings at pos.  | 
1411  | 0  |         for(i=0; i<stringsLength; ++i) { | 
1412  |  |             // Use spanLengths rather than a spanBackLengths pointer because  | 
1413  |  |             // it is easier and we only need to know whether the string is irrelevant  | 
1414  |  |             // which is the same in either array.  | 
1415  | 0  |             if(spanLengths[i]==ALL_CP_CONTAINED) { | 
1416  | 0  |                 continue;  // Irrelevant string. (Also the empty string.)  | 
1417  | 0  |             }  | 
1418  | 0  |             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);  | 
1419  | 0  |             const UChar *s16=string.getBuffer();  | 
1420  | 0  |             int32_t length16=string.length();  | 
1421  | 0  |             U_ASSERT(length>0);  | 
1422  | 0  |             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { | 
1423  | 0  |                 return pos;  // There is a set element at pos.  | 
1424  | 0  |             }  | 
1425  | 0  |         }  | 
1426  |  |  | 
1427  |  |         // The span(while not contained) ended on a string start/end which is  | 
1428  |  |         // not in the original set. Skip this code point and continue.  | 
1429  |  |         // cpLength<0  | 
1430  | 0  |         pos+=cpLength;  | 
1431  | 0  |     } while(pos!=0);  | 
1432  | 0  |     return 0;  // Reached the start of the string.  | 
1433  | 0  | }  | 
1434  |  |  | 
1435  | 0  | int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { | 
1436  | 0  |     int32_t pos=0, rest=length;  | 
1437  | 0  |     int32_t i, stringsLength=strings.size();  | 
1438  | 0  |     uint8_t *spanUTF8Lengths=spanLengths;  | 
1439  | 0  |     if(all) { | 
1440  | 0  |         spanUTF8Lengths+=2*stringsLength;  | 
1441  | 0  |     }  | 
1442  | 0  |     do { | 
1443  |  |         // Span until we find a code point from the set,  | 
1444  |  |         // or a code point that starts or ends some string.  | 
1445  | 0  |         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);  | 
1446  | 0  |         if(i==rest) { | 
1447  | 0  |             return length;  // Reached the end of the string.  | 
1448  | 0  |         }  | 
1449  | 0  |         pos+=i;  | 
1450  | 0  |         rest-=i;  | 
1451  |  |  | 
1452  |  |         // Check whether the current code point is in the original set,  | 
1453  |  |         // without the string starts and ends.  | 
1454  | 0  |         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);  | 
1455  | 0  |         if(cpLength>0) { | 
1456  | 0  |             return pos;  // There is a set element at pos.  | 
1457  | 0  |         }  | 
1458  |  |  | 
1459  |  |         // Try to match the strings at pos.  | 
1460  | 0  |         const uint8_t *s8=utf8;  | 
1461  | 0  |         int32_t length8;  | 
1462  | 0  |         for(i=0; i<stringsLength; ++i) { | 
1463  | 0  |             length8=utf8Lengths[i];  | 
1464  |  |             // ALL_CP_CONTAINED: Irrelevant string.  | 
1465  | 0  |             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { | 
1466  | 0  |                 return pos;  // There is a set element at pos.  | 
1467  | 0  |             }  | 
1468  | 0  |             s8+=length8;  | 
1469  | 0  |         }  | 
1470  |  |  | 
1471  |  |         // The span(while not contained) ended on a string start/end which is  | 
1472  |  |         // not in the original set. Skip this code point and continue.  | 
1473  |  |         // cpLength<0  | 
1474  | 0  |         pos-=cpLength;  | 
1475  | 0  |         rest+=cpLength;  | 
1476  | 0  |     } while(rest!=0);  | 
1477  | 0  |     return length;  // Reached the end of the string.  | 
1478  | 0  | }  | 
1479  |  |  | 
1480  | 0  | int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { | 
1481  | 0  |     int32_t pos=length;  | 
1482  | 0  |     int32_t i, stringsLength=strings.size();  | 
1483  | 0  |     uint8_t *spanBackUTF8Lengths=spanLengths;  | 
1484  | 0  |     if(all) { | 
1485  | 0  |         spanBackUTF8Lengths+=3*stringsLength;  | 
1486  | 0  |     }  | 
1487  | 0  |     do { | 
1488  |  |         // Span until we find a code point from the set,  | 
1489  |  |         // or a code point that starts or ends some string.  | 
1490  | 0  |         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);  | 
1491  | 0  |         if(pos==0) { | 
1492  | 0  |             return 0;  // Reached the start of the string.  | 
1493  | 0  |         }  | 
1494  |  |  | 
1495  |  |         // Check whether the current code point is in the original set,  | 
1496  |  |         // without the string starts and ends.  | 
1497  | 0  |         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);  | 
1498  | 0  |         if(cpLength>0) { | 
1499  | 0  |             return pos;  // There is a set element at pos.  | 
1500  | 0  |         }  | 
1501  |  |  | 
1502  |  |         // Try to match the strings at pos.  | 
1503  | 0  |         const uint8_t *s8=utf8;  | 
1504  | 0  |         int32_t length8;  | 
1505  | 0  |         for(i=0; i<stringsLength; ++i) { | 
1506  | 0  |             length8=utf8Lengths[i];  | 
1507  |  |             // ALL_CP_CONTAINED: Irrelevant string.  | 
1508  | 0  |             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { | 
1509  | 0  |                 return pos;  // There is a set element at pos.  | 
1510  | 0  |             }  | 
1511  | 0  |             s8+=length8;  | 
1512  | 0  |         }  | 
1513  |  |  | 
1514  |  |         // The span(while not contained) ended on a string start/end which is  | 
1515  |  |         // not in the original set. Skip this code point and continue.  | 
1516  |  |         // cpLength<0  | 
1517  | 0  |         pos+=cpLength;  | 
1518  | 0  |     } while(pos!=0);  | 
1519  | 0  |     return 0;  // Reached the start of the string.  | 
1520  | 0  | }  | 
1521  |  |  | 
1522  |  | U_NAMESPACE_END  |