Coverage Report

Created: 2023-06-07 07:18

/src/icu/source/common/unisetspan.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2007-2012, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  unisetspan.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2007mar01
16
*   created by: Markus W. Scherer
17
*/
18
19
#include "unicode/utypes.h"
20
#include "unicode/uniset.h"
21
#include "unicode/ustring.h"
22
#include "unicode/utf8.h"
23
#include "unicode/utf16.h"
24
#include "cmemory.h"
25
#include "uvector.h"
26
#include "unisetspan.h"
27
28
U_NAMESPACE_BEGIN
29
30
/*
31
 * List of offsets from the current position from where to try matching
32
 * a code point or a string.
33
 * Store offsets rather than indexes to simplify the code and use the same list
34
 * for both increments (in span()) and decrements (in spanBack()).
35
 *
36
 * Assumption: The maximum offset is limited, and the offsets that are stored
37
 * at any one time are relatively dense, that is, there are normally no gaps of
38
 * hundreds or thousands of offset values.
39
 *
40
 * The implementation uses a circular buffer of byte flags,
41
 * each indicating whether the corresponding offset is in the list.
42
 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43
 * physically moving part of the list.
44
 *
45
 * Note: In principle, the caller should setMaxLength() to the maximum of the
46
 * max string length and U16_LENGTH/U8_LENGTH to account for
47
 * "long" single code points.
48
 * However, this implementation uses at least a staticList with more than
49
 * U8_LENGTH entries anyway.
50
 *
51
 * Note: If maxLength were guaranteed to be no more than 32 or 64,
52
 * the list could be stored as bit flags in a single integer.
53
 * Rather than handling a circular buffer with a start list index,
54
 * the integer would simply be shifted when lower offsets are removed.
55
 * UnicodeSet does not have a limit on the lengths of strings.
56
 */
57
class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
58
public:
59
0
    OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60
61
0
    ~OffsetList() {
62
0
        if(list!=staticList) {
63
0
            uprv_free(list);
64
0
        }
65
0
    }
66
67
    // Call exactly once if the list is to be used.
68
0
    void setMaxLength(int32_t maxLength) {
69
0
        if(maxLength<=(int32_t)sizeof(staticList)) {
70
0
            capacity=(int32_t)sizeof(staticList);
71
0
        } else {
72
0
            UBool *l=(UBool *)uprv_malloc(maxLength);
73
0
            if(l!=NULL) {
74
0
                list=l;
75
0
                capacity=maxLength;
76
0
            }
77
0
        }
78
0
        uprv_memset(list, 0, capacity);
79
0
    }
80
81
0
    void clear() {
82
0
        uprv_memset(list, 0, capacity);
83
0
        start=length=0;
84
0
    }
85
86
0
    UBool isEmpty() const {
87
0
        return (UBool)(length==0);
88
0
    }
89
90
    // Reduce all stored offsets by delta, used when the current position
91
    // moves by delta.
92
    // There must not be any offsets lower than delta.
93
    // If there is an offset equal to delta, it is removed.
94
    // delta=[1..maxLength]
95
0
    void shift(int32_t delta) {
96
0
        int32_t i=start+delta;
97
0
        if(i>=capacity) {
98
0
            i-=capacity;
99
0
        }
100
0
        if(list[i]) {
101
0
            list[i]=FALSE;
102
0
            --length;
103
0
        }
104
0
        start=i;
105
0
    }
106
107
    // Add an offset. The list must not contain it yet.
108
    // offset=[1..maxLength]
109
0
    void addOffset(int32_t offset) {
110
0
        int32_t i=start+offset;
111
0
        if(i>=capacity) {
112
0
            i-=capacity;
113
0
        }
114
0
        list[i]=TRUE;
115
0
        ++length;
116
0
    }
117
118
    // offset=[1..maxLength]
119
0
    UBool containsOffset(int32_t offset) const {
120
0
        int32_t i=start+offset;
121
0
        if(i>=capacity) {
122
0
            i-=capacity;
123
0
        }
124
0
        return list[i];
125
0
    }
126
127
    // Find the lowest stored offset from a non-empty list, remove it,
128
    // and reduce all other offsets by this minimum.
129
    // Returns [1..maxLength].
130
0
    int32_t popMinimum() {
131
        // Look for the next offset in list[start+1..capacity-1].
132
0
        int32_t i=start, result;
133
0
        while(++i<capacity) {
134
0
            if(list[i]) {
135
0
                list[i]=FALSE;
136
0
                --length;
137
0
                result=i-start;
138
0
                start=i;
139
0
                return result;
140
0
            }
141
0
        }
142
        // i==capacity
143
144
        // Wrap around and look for the next offset in list[0..start].
145
        // Since the list is not empty, there will be one.
146
0
        result=capacity-start;
147
0
        i=0;
148
0
        while(!list[i]) {
149
0
            ++i;
150
0
        }
151
0
        list[i]=FALSE;
152
0
        --length;
153
0
        start=i;
154
0
        return result+=i;
155
0
    }
156
157
private:
158
    UBool *list;
159
    int32_t capacity;
160
    int32_t length;
161
    int32_t start;
162
163
    UBool staticList[16];
164
};
165
166
// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167
static int32_t
168
0
getUTF8Length(const UChar *s, int32_t length) {
169
0
    UErrorCode errorCode=U_ZERO_ERROR;
170
0
    int32_t length8=0;
171
0
    u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
172
0
    if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173
0
        return length8;
174
0
    } else {
175
        // The string contains an unpaired surrogate.
176
        // Ignore this string.
177
0
        return 0;
178
0
    }
179
0
}
180
181
// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182
static int32_t
183
0
appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
184
0
    UErrorCode errorCode=U_ZERO_ERROR;
185
0
    int32_t length8=0;
186
0
    u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
187
0
    if(U_SUCCESS(errorCode)) {
188
0
        return length8;
189
0
    } else {
190
        // The string contains an unpaired surrogate.
191
        // Ignore this string.
192
0
        return 0;
193
0
    }
194
0
}
195
196
static inline uint8_t
197
0
makeSpanLengthByte(int32_t spanLength) {
198
    // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199
0
    return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200
0
}
201
202
// Construct for all variants of span(), or only for any one variant.
203
// Initialize as little as possible, for single use.
204
UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205
                                           const UVector &setStrings,
206
                                           uint32_t which)
207
        : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
208
          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
209
          utf8Length(0),
210
          maxLength16(0), maxLength8(0),
211
0
          all((UBool)(which==ALL)) {
212
0
    spanSet.retainAll(set);
213
0
    if(which&NOT_CONTAINED) {
214
        // Default to the same sets.
215
        // addToSpanNotSet() will create a separate set if necessary.
216
0
        pSpanNotSet=&spanSet;
217
0
    }
218
219
    // Determine if the strings even need to be taken into account at all for span() etc.
220
    // If any string is relevant, then all strings need to be used for
221
    // span(longest match) but only the relevant ones for span(while contained).
222
    // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223
    //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224
    //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225
    // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226
0
    int32_t stringsLength=strings.size();
227
228
0
    int32_t i, spanLength;
229
0
    UBool someRelevant=FALSE;
230
0
    for(i=0; i<stringsLength; ++i) {
231
0
        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
232
0
        const UChar *s16=string.getBuffer();
233
0
        int32_t length16=string.length();
234
0
        UBool thisRelevant;
235
0
        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
236
0
        if(spanLength<length16) {  // Relevant string.
237
0
            someRelevant=thisRelevant=TRUE;
238
0
        } else {
239
0
            thisRelevant=FALSE;
240
0
        }
241
0
        if((which&UTF16) && length16>maxLength16) {
242
0
            maxLength16=length16;
243
0
        }
244
0
        if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
245
0
            int32_t length8=getUTF8Length(s16, length16);
246
0
            utf8Length+=length8;
247
0
            if(length8>maxLength8) {
248
0
                maxLength8=length8;
249
0
            }
250
0
        }
251
0
    }
252
0
    if(!someRelevant) {
253
0
        maxLength16=maxLength8=0;
254
0
        return;
255
0
    }
256
257
    // Freeze after checking for the need to use strings at all because freezing
258
    // a set takes some time and memory which are wasted if there are no relevant strings.
259
0
    if(all) {
260
0
        spanSet.freeze();
261
0
    }
262
263
0
    uint8_t *spanBackLengths;
264
0
    uint8_t *spanUTF8Lengths;
265
0
    uint8_t *spanBackUTF8Lengths;
266
267
    // Allocate a block of meta data.
268
0
    int32_t allocSize;
269
0
    if(all) {
270
        // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
271
0
        allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
272
0
    } else {
273
0
        allocSize=stringsLength;  // One set of span lengths.
274
0
        if(which&UTF8) {
275
            // UTF-8 lengths and UTF-8 strings.
276
0
            allocSize+=stringsLength*4+utf8Length;
277
0
        }
278
0
    }
279
0
    if(allocSize<=(int32_t)sizeof(staticLengths)) {
280
0
        utf8Lengths=staticLengths;
281
0
    } else {
282
0
        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
283
0
        if(utf8Lengths==NULL) {
284
0
            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
285
0
            return;  // Out of memory.
286
0
        }
287
0
    }
288
289
0
    if(all) {
290
        // Store span lengths for all span() variants.
291
0
        spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
292
0
        spanBackLengths=spanLengths+stringsLength;
293
0
        spanUTF8Lengths=spanBackLengths+stringsLength;
294
0
        spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
295
0
        utf8=spanBackUTF8Lengths+stringsLength;
296
0
    } else {
297
        // Store span lengths for only one span() variant.
298
0
        if(which&UTF8) {
299
0
            spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
300
0
            utf8=spanLengths+stringsLength;
301
0
        } else {
302
0
            spanLengths=(uint8_t *)utf8Lengths;
303
0
        }
304
0
        spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
305
0
    }
306
307
    // Set the meta data and pSpanNotSet and write the UTF-8 strings.
308
0
    int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
309
310
0
    for(i=0; i<stringsLength; ++i) {
311
0
        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
312
0
        const UChar *s16=string.getBuffer();
313
0
        int32_t length16=string.length();
314
0
        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
315
0
        if(spanLength<length16) {  // Relevant string.
316
0
            if(which&UTF16) {
317
0
                if(which&CONTAINED) {
318
0
                    if(which&FWD) {
319
0
                        spanLengths[i]=makeSpanLengthByte(spanLength);
320
0
                    }
321
0
                    if(which&BACK) {
322
0
                        spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
323
0
                        spanBackLengths[i]=makeSpanLengthByte(spanLength);
324
0
                    }
325
0
                } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
326
0
                    spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
327
0
                }
328
0
            }
329
0
            if(which&UTF8) {
330
0
                uint8_t *s8=utf8+utf8Count;
331
0
                int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
332
0
                utf8Count+=utf8Lengths[i]=length8;
333
0
                if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
334
0
                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
335
0
                } else {  // Relevant for UTF-8.
336
0
                    if(which&CONTAINED) {
337
0
                        if(which&FWD) {
338
0
                            spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
339
0
                            spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
340
0
                        }
341
0
                        if(which&BACK) {
342
0
                            spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
343
0
                            spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
344
0
                        }
345
0
                    } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
346
0
                        spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
347
0
                    }
348
0
                }
349
0
            }
350
0
            if(which&NOT_CONTAINED) {
351
                // Add string start and end code points to the spanNotSet so that
352
                // a span(while not contained) stops before any string.
353
0
                UChar32 c;
354
0
                if(which&FWD) {
355
0
                    int32_t len=0;
356
0
                    U16_NEXT(s16, len, length16, c);
357
0
                    addToSpanNotSet(c);
358
0
                }
359
0
                if(which&BACK) {
360
0
                    int32_t len=length16;
361
0
                    U16_PREV(s16, 0, len, c);
362
0
                    addToSpanNotSet(c);
363
0
                }
364
0
            }
365
0
        } else {  // Irrelevant string.
366
0
            if(which&UTF8) {
367
0
                if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
368
0
                    uint8_t *s8=utf8+utf8Count;
369
0
                    int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
370
0
                    utf8Count+=utf8Lengths[i]=length8;
371
0
                } else {
372
0
                    utf8Lengths[i]=0;
373
0
                }
374
0
            }
375
0
            if(all) {
376
0
                spanLengths[i]=spanBackLengths[i]=
377
0
                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
378
0
                        (uint8_t)ALL_CP_CONTAINED;
379
0
            } else {
380
                // All spanXYZLengths pointers contain the same address.
381
0
                spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
382
0
            }
383
0
        }
384
0
    }
385
386
    // Finish.
387
0
    if(all) {
388
0
        pSpanNotSet->freeze();
389
0
    }
390
0
}
391
392
// Copy constructor. Assumes which==ALL for a frozen set.
393
UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
394
                                           const UVector &newParentSetStrings)
395
        : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
396
          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
397
          utf8Length(otherStringSpan.utf8Length),
398
          maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
399
0
          all(TRUE) {
400
0
    if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
401
0
        pSpanNotSet=&spanSet;
402
0
    } else {
403
0
        pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
404
0
    }
405
406
    // Allocate a block of meta data.
407
    // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
408
0
    int32_t stringsLength=strings.size();
409
0
    int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
410
0
    if(allocSize<=(int32_t)sizeof(staticLengths)) {
411
0
        utf8Lengths=staticLengths;
412
0
    } else {
413
0
        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
414
0
        if(utf8Lengths==NULL) {
415
0
            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
416
0
            return;  // Out of memory.
417
0
        }
418
0
    }
419
420
0
    spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
421
0
    utf8=spanLengths+stringsLength*4;
422
0
    uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
423
0
}
424
425
0
UnicodeSetStringSpan::~UnicodeSetStringSpan() {
426
0
    if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
427
0
        delete pSpanNotSet;
428
0
    }
429
0
    if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
430
0
        uprv_free(utf8Lengths);
431
0
    }
432
0
}
433
434
0
void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
435
0
    if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
436
0
        if(spanSet.contains(c)) {
437
0
            return;  // Nothing to do.
438
0
        }
439
0
        UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
440
0
        if(newSet==NULL) {
441
0
            return;  // Out of memory.
442
0
        } else {
443
0
            pSpanNotSet=newSet;
444
0
        }
445
0
    }
446
0
    pSpanNotSet->add(c);
447
0
}
448
449
// Compare strings without any argument checks. Requires length>0.
450
static inline UBool
451
0
matches16(const UChar *s, const UChar *t, int32_t length) {
452
0
    do {
453
0
        if(*s++!=*t++) {
454
0
            return FALSE;
455
0
        }
456
0
    } while(--length>0);
457
0
    return TRUE;
458
0
}
459
460
static inline UBool
461
0
matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
462
0
    do {
463
0
        if(*s++!=*t++) {
464
0
            return FALSE;
465
0
        }
466
0
    } while(--length>0);
467
0
    return TRUE;
468
0
}
469
470
// Compare 16-bit Unicode strings (which may be malformed UTF-16)
471
// at code point boundaries.
472
// That is, each edge of a match must not be in the middle of a surrogate pair.
473
static inline UBool
474
0
matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
475
0
    s+=start;
476
0
    limit-=start;
477
0
    return matches16(s, t, length) &&
478
0
           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
479
0
           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
480
0
}
481
482
// Does the set contain the next code point?
483
// If so, return its length; otherwise return its negative length.
484
static inline int32_t
485
0
spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
486
0
    UChar c=*s, c2;
487
0
    if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
488
0
        return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
489
0
    }
490
0
    return set.contains(c) ? 1 : -1;
491
0
}
492
493
static inline int32_t
494
0
spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
495
0
    UChar c=s[length-1], c2;
496
0
    if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
497
0
        return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
498
0
    }
499
0
    return set.contains(c) ? 1 : -1;
500
0
}
501
502
static inline int32_t
503
0
spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
504
0
    UChar32 c=*s;
505
0
    if((int8_t)c>=0) {
506
0
        return set.contains(c) ? 1 : -1;
507
0
    }
508
    // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
509
0
    int32_t i=0;
510
0
    U8_NEXT_OR_FFFD(s, i, length, c);
511
0
    return set.contains(c) ? i : -i;
512
0
}
513
514
static inline int32_t
515
0
spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
516
0
    UChar32 c=s[length-1];
517
0
    if((int8_t)c>=0) {
518
0
        return set.contains(c) ? 1 : -1;
519
0
    }
520
0
    int32_t i=length-1;
521
0
    c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
522
0
    length-=i;
523
0
    return set.contains(c) ? length : -length;
524
0
}
525
526
/*
527
 * Note: In span() when spanLength==0 (after a string match, or at the beginning
528
 * after an empty code point span) and in spanNot() and spanNotUTF8(),
529
 * string matching could use a binary search
530
 * because all string matches are done from the same start index.
531
 *
532
 * For UTF-8, this would require a comparison function that returns UTF-16 order.
533
 *
534
 * This optimization should not be necessary for normal UnicodeSets because
535
 * most sets have no strings, and most sets with strings have
536
 * very few very short strings.
537
 * For cases with many strings, it might be better to use a different API
538
 * and implementation with a DFA (state machine).
539
 */
540
541
/*
542
 * Algorithm for span(USET_SPAN_CONTAINED)
543
 *
544
 * Theoretical algorithm:
545
 * - Iterate through the string, and at each code point boundary:
546
 *   + If the code point there is in the set, then remember to continue after it.
547
 *   + If a set string matches at the current position, then remember to continue after it.
548
 *   + Either recursively span for each code point or string match,
549
 *     or recursively span for all but the shortest one and
550
 *     iteratively continue the span with the shortest local match.
551
 *   + Remember the longest recursive span (the farthest end point).
552
 *   + If there is no match at the current position, neither for the code point there
553
 *     nor for any set string, then stop and return the longest recursive span length.
554
 *
555
 * Optimized implementation:
556
 *
557
 * (We assume that most sets will have very few very short strings.
558
 * A span using a string-less set is extremely fast.)
559
 *
560
 * Create and cache a spanSet which contains all of the single code points
561
 * of the original set but none of its strings.
562
 *
563
 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
564
 * - Loop:
565
 *   + Try to match each set string at the end of the spanLength.
566
 *     ~ Set strings that start with set-contained code points must be matched
567
 *       with a partial overlap because the recursive algorithm would have tried
568
 *       to match them at every position.
569
 *     ~ Set strings that entirely consist of set-contained code points
570
 *       are irrelevant for span(USET_SPAN_CONTAINED) because the
571
 *       recursive algorithm would continue after them anyway
572
 *       and find the longest recursive match from their end.
573
 *     ~ Rather than recursing, note each end point of a set string match.
574
 *   + If no set string matched after spanSet.span(), then return
575
 *     with where the spanSet.span() ended.
576
 *   + If at least one set string matched after spanSet.span(), then
577
 *     pop the shortest string match end point and continue
578
 *     the loop, trying to match all set strings from there.
579
 *   + If at least one more set string matched after a previous string match,
580
 *     then test if the code point after the previous string match is also
581
 *     contained in the set.
582
 *     Continue the loop with the shortest end point of either this code point
583
 *     or a matching set string.
584
 *   + If no more set string matched after a previous string match,
585
 *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
586
 *     Stop if spanLength==0, otherwise continue the loop.
587
 *
588
 * By noting each end point of a set string match,
589
 * the function visits each string position at most once and finishes
590
 * in linear time.
591
 *
592
 * The recursive algorithm may visit the same string position many times
593
 * if multiple paths lead to it and finishes in exponential time.
594
 */
595
596
/*
597
 * Algorithm for span(USET_SPAN_SIMPLE)
598
 *
599
 * Theoretical algorithm:
600
 * - Iterate through the string, and at each code point boundary:
601
 *   + If the code point there is in the set, then remember to continue after it.
602
 *   + If a set string matches at the current position, then remember to continue after it.
603
 *   + Continue from the farthest match position and ignore all others.
604
 *   + If there is no match at the current position,
605
 *     then stop and return the current position.
606
 *
607
 * Optimized implementation:
608
 *
609
 * (Same assumption and spanSet as above.)
610
 *
611
 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
612
 * - Loop:
613
 *   + Try to match each set string at the end of the spanLength.
614
 *     ~ Set strings that start with set-contained code points must be matched
615
 *       with a partial overlap because the standard algorithm would have tried
616
 *       to match them earlier.
617
 *     ~ Set strings that entirely consist of set-contained code points
618
 *       must be matched with a full overlap because the longest-match algorithm
619
 *       would hide set string matches that end earlier.
620
 *       Such set strings need not be matched earlier inside the code point span
621
 *       because the standard algorithm would then have continued after
622
 *       the set string match anyway.
623
 *     ~ Remember the longest set string match (farthest end point) from the earliest
624
 *       starting point.
625
 *   + If no set string matched after spanSet.span(), then return
626
 *     with where the spanSet.span() ended.
627
 *   + If at least one set string matched, then continue the loop after the
628
 *     longest match from the earliest position.
629
 *   + If no more set string matched after a previous string match,
630
 *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
631
 *     Stop if spanLength==0, otherwise continue the loop.
632
 */
633
634
0
int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
635
0
    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
636
0
        return spanNot(s, length);
637
0
    }
638
0
    int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
639
0
    if(spanLength==length) {
640
0
        return length;
641
0
    }
642
643
    // Consider strings; they may overlap with the span.
644
0
    OffsetList offsets;
645
0
    if(spanCondition==USET_SPAN_CONTAINED) {
646
        // Use offset list to try all possibilities.
647
0
        offsets.setMaxLength(maxLength16);
648
0
    }
649
0
    int32_t pos=spanLength, rest=length-pos;
650
0
    int32_t i, stringsLength=strings.size();
651
0
    for(;;) {
652
0
        if(spanCondition==USET_SPAN_CONTAINED) {
653
0
            for(i=0; i<stringsLength; ++i) {
654
0
                int32_t overlap=spanLengths[i];
655
0
                if(overlap==ALL_CP_CONTAINED) {
656
0
                    continue;  // Irrelevant string.
657
0
                }
658
0
                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
659
0
                const UChar *s16=string.getBuffer();
660
0
                int32_t length16=string.length();
661
662
                // Try to match this string at pos-overlap..pos.
663
0
                if(overlap>=LONG_SPAN) {
664
0
                    overlap=length16;
665
                    // While contained: No point matching fully inside the code point span.
666
0
                    U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
667
0
                }
668
0
                if(overlap>spanLength) {
669
0
                    overlap=spanLength;
670
0
                }
671
0
                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
672
0
                for(;;) {
673
0
                    if(inc>rest) {
674
0
                        break;
675
0
                    }
676
                    // Try to match if the increment is not listed already.
677
0
                    if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
678
0
                        if(inc==rest) {
679
0
                            return length;  // Reached the end of the string.
680
0
                        }
681
0
                        offsets.addOffset(inc);
682
0
                    }
683
0
                    if(overlap==0) {
684
0
                        break;
685
0
                    }
686
0
                    --overlap;
687
0
                    ++inc;
688
0
                }
689
0
            }
690
0
        } else /* USET_SPAN_SIMPLE */ {
691
0
            int32_t maxInc=0, maxOverlap=0;
692
0
            for(i=0; i<stringsLength; ++i) {
693
0
                int32_t overlap=spanLengths[i];
694
                // For longest match, we do need to try to match even an all-contained string
695
                // to find the match from the earliest start.
696
697
0
                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
698
0
                const UChar *s16=string.getBuffer();
699
0
                int32_t length16=string.length();
700
701
                // Try to match this string at pos-overlap..pos.
702
0
                if(overlap>=LONG_SPAN) {
703
0
                    overlap=length16;
704
                    // Longest match: Need to match fully inside the code point span
705
                    // to find the match from the earliest start.
706
0
                }
707
0
                if(overlap>spanLength) {
708
0
                    overlap=spanLength;
709
0
                }
710
0
                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
711
0
                for(;;) {
712
0
                    if(inc>rest || overlap<maxOverlap) {
713
0
                        break;
714
0
                    }
715
                    // Try to match if the string is longer or starts earlier.
716
0
                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
717
0
                        matches16CPB(s, pos-overlap, length, s16, length16)
718
0
                    ) {
719
0
                        maxInc=inc;  // Longest match from earliest start.
720
0
                        maxOverlap=overlap;
721
0
                        break;
722
0
                    }
723
0
                    --overlap;
724
0
                    ++inc;
725
0
                }
726
0
            }
727
728
0
            if(maxInc!=0 || maxOverlap!=0) {
729
                // Longest-match algorithm, and there was a string match.
730
                // Simply continue after it.
731
0
                pos+=maxInc;
732
0
                rest-=maxInc;
733
0
                if(rest==0) {
734
0
                    return length;  // Reached the end of the string.
735
0
                }
736
0
                spanLength=0;  // Match strings from after a string match.
737
0
                continue;
738
0
            }
739
0
        }
740
        // Finished trying to match all strings at pos.
741
742
0
        if(spanLength!=0 || pos==0) {
743
            // The position is after an unlimited code point span (spanLength!=0),
744
            // not after a string match.
745
            // The only position where spanLength==0 after a span is pos==0.
746
            // Otherwise, an unlimited code point span is only tried again when no
747
            // strings match, and if such a non-initial span fails we stop.
748
0
            if(offsets.isEmpty()) {
749
0
                return pos;  // No strings matched after a span.
750
0
            }
751
            // Match strings from after the next string match.
752
0
        } else {
753
            // The position is after a string match (or a single code point).
754
0
            if(offsets.isEmpty()) {
755
                // No more strings matched after a previous string match.
756
                // Try another code point span from after the last string match.
757
0
                spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
758
0
                if( spanLength==rest || // Reached the end of the string, or
759
0
                    spanLength==0       // neither strings nor span progressed.
760
0
                ) {
761
0
                    return pos+spanLength;
762
0
                }
763
0
                pos+=spanLength;
764
0
                rest-=spanLength;
765
0
                continue;  // spanLength>0: Match strings from after a span.
766
0
            } else {
767
                // Try to match only one code point from after a string match if some
768
                // string matched beyond it, so that we try all possible positions
769
                // and don't overshoot.
770
0
                spanLength=spanOne(spanSet, s+pos, rest);
771
0
                if(spanLength>0) {
772
0
                    if(spanLength==rest) {
773
0
                        return length;  // Reached the end of the string.
774
0
                    }
775
                    // Match strings after this code point.
776
                    // There cannot be any increments below it because UnicodeSet strings
777
                    // contain multiple code points.
778
0
                    pos+=spanLength;
779
0
                    rest-=spanLength;
780
0
                    offsets.shift(spanLength);
781
0
                    spanLength=0;
782
0
                    continue;  // Match strings from after a single code point.
783
0
                }
784
                // Match strings from after the next string match.
785
0
            }
786
0
        }
787
0
        int32_t minOffset=offsets.popMinimum();
788
0
        pos+=minOffset;
789
0
        rest-=minOffset;
790
0
        spanLength=0;  // Match strings from after a string match.
791
0
    }
792
0
}
793
794
0
int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
795
0
    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
796
0
        return spanNotBack(s, length);
797
0
    }
798
0
    int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
799
0
    if(pos==0) {
800
0
        return 0;
801
0
    }
802
0
    int32_t spanLength=length-pos;
803
804
    // Consider strings; they may overlap with the span.
805
0
    OffsetList offsets;
806
0
    if(spanCondition==USET_SPAN_CONTAINED) {
807
        // Use offset list to try all possibilities.
808
0
        offsets.setMaxLength(maxLength16);
809
0
    }
810
0
    int32_t i, stringsLength=strings.size();
811
0
    uint8_t *spanBackLengths=spanLengths;
812
0
    if(all) {
813
0
        spanBackLengths+=stringsLength;
814
0
    }
815
0
    for(;;) {
816
0
        if(spanCondition==USET_SPAN_CONTAINED) {
817
0
            for(i=0; i<stringsLength; ++i) {
818
0
                int32_t overlap=spanBackLengths[i];
819
0
                if(overlap==ALL_CP_CONTAINED) {
820
0
                    continue;  // Irrelevant string.
821
0
                }
822
0
                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
823
0
                const UChar *s16=string.getBuffer();
824
0
                int32_t length16=string.length();
825
826
                // Try to match this string at pos-(length16-overlap)..pos-length16.
827
0
                if(overlap>=LONG_SPAN) {
828
0
                    overlap=length16;
829
                    // While contained: No point matching fully inside the code point span.
830
0
                    int32_t len1=0;
831
0
                    U16_FWD_1(s16, len1, overlap);
832
0
                    overlap-=len1;  // Length of the string minus the first code point.
833
0
                }
834
0
                if(overlap>spanLength) {
835
0
                    overlap=spanLength;
836
0
                }
837
0
                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
838
0
                for(;;) {
839
0
                    if(dec>pos) {
840
0
                        break;
841
0
                    }
842
                    // Try to match if the decrement is not listed already.
843
0
                    if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
844
0
                        if(dec==pos) {
845
0
                            return 0;  // Reached the start of the string.
846
0
                        }
847
0
                        offsets.addOffset(dec);
848
0
                    }
849
0
                    if(overlap==0) {
850
0
                        break;
851
0
                    }
852
0
                    --overlap;
853
0
                    ++dec;
854
0
                }
855
0
            }
856
0
        } else /* USET_SPAN_SIMPLE */ {
857
0
            int32_t maxDec=0, maxOverlap=0;
858
0
            for(i=0; i<stringsLength; ++i) {
859
0
                int32_t overlap=spanBackLengths[i];
860
                // For longest match, we do need to try to match even an all-contained string
861
                // to find the match from the latest end.
862
863
0
                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
864
0
                const UChar *s16=string.getBuffer();
865
0
                int32_t length16=string.length();
866
867
                // Try to match this string at pos-(length16-overlap)..pos-length16.
868
0
                if(overlap>=LONG_SPAN) {
869
0
                    overlap=length16;
870
                    // Longest match: Need to match fully inside the code point span
871
                    // to find the match from the latest end.
872
0
                }
873
0
                if(overlap>spanLength) {
874
0
                    overlap=spanLength;
875
0
                }
876
0
                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
877
0
                for(;;) {
878
0
                    if(dec>pos || overlap<maxOverlap) {
879
0
                        break;
880
0
                    }
881
                    // Try to match if the string is longer or ends later.
882
0
                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
883
0
                        matches16CPB(s, pos-dec, length, s16, length16)
884
0
                    ) {
885
0
                        maxDec=dec;  // Longest match from latest end.
886
0
                        maxOverlap=overlap;
887
0
                        break;
888
0
                    }
889
0
                    --overlap;
890
0
                    ++dec;
891
0
                }
892
0
            }
893
894
0
            if(maxDec!=0 || maxOverlap!=0) {
895
                // Longest-match algorithm, and there was a string match.
896
                // Simply continue before it.
897
0
                pos-=maxDec;
898
0
                if(pos==0) {
899
0
                    return 0;  // Reached the start of the string.
900
0
                }
901
0
                spanLength=0;  // Match strings from before a string match.
902
0
                continue;
903
0
            }
904
0
        }
905
        // Finished trying to match all strings at pos.
906
907
0
        if(spanLength!=0 || pos==length) {
908
            // The position is before an unlimited code point span (spanLength!=0),
909
            // not before a string match.
910
            // The only position where spanLength==0 before a span is pos==length.
911
            // Otherwise, an unlimited code point span is only tried again when no
912
            // strings match, and if such a non-initial span fails we stop.
913
0
            if(offsets.isEmpty()) {
914
0
                return pos;  // No strings matched before a span.
915
0
            }
916
            // Match strings from before the next string match.
917
0
        } else {
918
            // The position is before a string match (or a single code point).
919
0
            if(offsets.isEmpty()) {
920
                // No more strings matched before a previous string match.
921
                // Try another code point span from before the last string match.
922
0
                int32_t oldPos=pos;
923
0
                pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
924
0
                spanLength=oldPos-pos;
925
0
                if( pos==0 ||           // Reached the start of the string, or
926
0
                    spanLength==0       // neither strings nor span progressed.
927
0
                ) {
928
0
                    return pos;
929
0
                }
930
0
                continue;  // spanLength>0: Match strings from before a span.
931
0
            } else {
932
                // Try to match only one code point from before a string match if some
933
                // string matched beyond it, so that we try all possible positions
934
                // and don't overshoot.
935
0
                spanLength=spanOneBack(spanSet, s, pos);
936
0
                if(spanLength>0) {
937
0
                    if(spanLength==pos) {
938
0
                        return 0;  // Reached the start of the string.
939
0
                    }
940
                    // Match strings before this code point.
941
                    // There cannot be any decrements below it because UnicodeSet strings
942
                    // contain multiple code points.
943
0
                    pos-=spanLength;
944
0
                    offsets.shift(spanLength);
945
0
                    spanLength=0;
946
0
                    continue;  // Match strings from before a single code point.
947
0
                }
948
                // Match strings from before the next string match.
949
0
            }
950
0
        }
951
0
        pos-=offsets.popMinimum();
952
0
        spanLength=0;  // Match strings from before a string match.
953
0
    }
954
0
}
955
956
0
int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
957
0
    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
958
0
        return spanNotUTF8(s, length);
959
0
    }
960
0
    int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
961
0
    if(spanLength==length) {
962
0
        return length;
963
0
    }
964
965
    // Consider strings; they may overlap with the span.
966
0
    OffsetList offsets;
967
0
    if(spanCondition==USET_SPAN_CONTAINED) {
968
        // Use offset list to try all possibilities.
969
0
        offsets.setMaxLength(maxLength8);
970
0
    }
971
0
    int32_t pos=spanLength, rest=length-pos;
972
0
    int32_t i, stringsLength=strings.size();
973
0
    uint8_t *spanUTF8Lengths=spanLengths;
974
0
    if(all) {
975
0
        spanUTF8Lengths+=2*stringsLength;
976
0
    }
977
0
    for(;;) {
978
0
        const uint8_t *s8=utf8;
979
0
        int32_t length8;
980
0
        if(spanCondition==USET_SPAN_CONTAINED) {
981
0
            for(i=0; i<stringsLength; ++i) {
982
0
                length8=utf8Lengths[i];
983
0
                if(length8==0) {
984
0
                    continue;  // String not representable in UTF-8.
985
0
                }
986
0
                int32_t overlap=spanUTF8Lengths[i];
987
0
                if(overlap==ALL_CP_CONTAINED) {
988
0
                    s8+=length8;
989
0
                    continue;  // Irrelevant string.
990
0
                }
991
992
                // Try to match this string at pos-overlap..pos.
993
0
                if(overlap>=LONG_SPAN) {
994
0
                    overlap=length8;
995
                    // While contained: No point matching fully inside the code point span.
996
0
                    U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
997
0
                }
998
0
                if(overlap>spanLength) {
999
0
                    overlap=spanLength;
1000
0
                }
1001
0
                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1002
0
                for(;;) {
1003
0
                    if(inc>rest) {
1004
0
                        break;
1005
0
                    }
1006
                    // Try to match if the increment is not listed already.
1007
                    // Match at code point boundaries. (The UTF-8 strings were converted
1008
                    // from UTF-16 and are guaranteed to be well-formed.)
1009
0
                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1010
0
                        !offsets.containsOffset(inc) &&
1011
0
                        matches8(s+pos-overlap, s8, length8)
1012
                        
1013
0
                    ) {
1014
0
                        if(inc==rest) {
1015
0
                            return length;  // Reached the end of the string.
1016
0
                        }
1017
0
                        offsets.addOffset(inc);
1018
0
                    }
1019
0
                    if(overlap==0) {
1020
0
                        break;
1021
0
                    }
1022
0
                    --overlap;
1023
0
                    ++inc;
1024
0
                }
1025
0
                s8+=length8;
1026
0
            }
1027
0
        } else /* USET_SPAN_SIMPLE */ {
1028
0
            int32_t maxInc=0, maxOverlap=0;
1029
0
            for(i=0; i<stringsLength; ++i) {
1030
0
                length8=utf8Lengths[i];
1031
0
                if(length8==0) {
1032
0
                    continue;  // String not representable in UTF-8.
1033
0
                }
1034
0
                int32_t overlap=spanUTF8Lengths[i];
1035
                // For longest match, we do need to try to match even an all-contained string
1036
                // to find the match from the earliest start.
1037
1038
                // Try to match this string at pos-overlap..pos.
1039
0
                if(overlap>=LONG_SPAN) {
1040
0
                    overlap=length8;
1041
                    // Longest match: Need to match fully inside the code point span
1042
                    // to find the match from the earliest start.
1043
0
                }
1044
0
                if(overlap>spanLength) {
1045
0
                    overlap=spanLength;
1046
0
                }
1047
0
                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1048
0
                for(;;) {
1049
0
                    if(inc>rest || overlap<maxOverlap) {
1050
0
                        break;
1051
0
                    }
1052
                    // Try to match if the string is longer or starts earlier.
1053
                    // Match at code point boundaries. (The UTF-8 strings were converted
1054
                    // from UTF-16 and are guaranteed to be well-formed.)
1055
0
                    if( !U8_IS_TRAIL(s[pos-overlap]) &&
1056
0
                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1057
0
                        matches8(s+pos-overlap, s8, length8)
1058
                        
1059
0
                    ) {
1060
0
                        maxInc=inc;  // Longest match from earliest start.
1061
0
                        maxOverlap=overlap;
1062
0
                        break;
1063
0
                    }
1064
0
                    --overlap;
1065
0
                    ++inc;
1066
0
                }
1067
0
                s8+=length8;
1068
0
            }
1069
1070
0
            if(maxInc!=0 || maxOverlap!=0) {
1071
                // Longest-match algorithm, and there was a string match.
1072
                // Simply continue after it.
1073
0
                pos+=maxInc;
1074
0
                rest-=maxInc;
1075
0
                if(rest==0) {
1076
0
                    return length;  // Reached the end of the string.
1077
0
                }
1078
0
                spanLength=0;  // Match strings from after a string match.
1079
0
                continue;
1080
0
            }
1081
0
        }
1082
        // Finished trying to match all strings at pos.
1083
1084
0
        if(spanLength!=0 || pos==0) {
1085
            // The position is after an unlimited code point span (spanLength!=0),
1086
            // not after a string match.
1087
            // The only position where spanLength==0 after a span is pos==0.
1088
            // Otherwise, an unlimited code point span is only tried again when no
1089
            // strings match, and if such a non-initial span fails we stop.
1090
0
            if(offsets.isEmpty()) {
1091
0
                return pos;  // No strings matched after a span.
1092
0
            }
1093
            // Match strings from after the next string match.
1094
0
        } else {
1095
            // The position is after a string match (or a single code point).
1096
0
            if(offsets.isEmpty()) {
1097
                // No more strings matched after a previous string match.
1098
                // Try another code point span from after the last string match.
1099
0
                spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1100
0
                if( spanLength==rest || // Reached the end of the string, or
1101
0
                    spanLength==0       // neither strings nor span progressed.
1102
0
                ) {
1103
0
                    return pos+spanLength;
1104
0
                }
1105
0
                pos+=spanLength;
1106
0
                rest-=spanLength;
1107
0
                continue;  // spanLength>0: Match strings from after a span.
1108
0
            } else {
1109
                // Try to match only one code point from after a string match if some
1110
                // string matched beyond it, so that we try all possible positions
1111
                // and don't overshoot.
1112
0
                spanLength=spanOneUTF8(spanSet, s+pos, rest);
1113
0
                if(spanLength>0) {
1114
0
                    if(spanLength==rest) {
1115
0
                        return length;  // Reached the end of the string.
1116
0
                    }
1117
                    // Match strings after this code point.
1118
                    // There cannot be any increments below it because UnicodeSet strings
1119
                    // contain multiple code points.
1120
0
                    pos+=spanLength;
1121
0
                    rest-=spanLength;
1122
0
                    offsets.shift(spanLength);
1123
0
                    spanLength=0;
1124
0
                    continue;  // Match strings from after a single code point.
1125
0
                }
1126
                // Match strings from after the next string match.
1127
0
            }
1128
0
        }
1129
0
        int32_t minOffset=offsets.popMinimum();
1130
0
        pos+=minOffset;
1131
0
        rest-=minOffset;
1132
0
        spanLength=0;  // Match strings from after a string match.
1133
0
    }
1134
0
}
1135
1136
0
int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1137
0
    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1138
0
        return spanNotBackUTF8(s, length);
1139
0
    }
1140
0
    int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1141
0
    if(pos==0) {
1142
0
        return 0;
1143
0
    }
1144
0
    int32_t spanLength=length-pos;
1145
1146
    // Consider strings; they may overlap with the span.
1147
0
    OffsetList offsets;
1148
0
    if(spanCondition==USET_SPAN_CONTAINED) {
1149
        // Use offset list to try all possibilities.
1150
0
        offsets.setMaxLength(maxLength8);
1151
0
    }
1152
0
    int32_t i, stringsLength=strings.size();
1153
0
    uint8_t *spanBackUTF8Lengths=spanLengths;
1154
0
    if(all) {
1155
0
        spanBackUTF8Lengths+=3*stringsLength;
1156
0
    }
1157
0
    for(;;) {
1158
0
        const uint8_t *s8=utf8;
1159
0
        int32_t length8;
1160
0
        if(spanCondition==USET_SPAN_CONTAINED) {
1161
0
            for(i=0; i<stringsLength; ++i) {
1162
0
                length8=utf8Lengths[i];
1163
0
                if(length8==0) {
1164
0
                    continue;  // String not representable in UTF-8.
1165
0
                }
1166
0
                int32_t overlap=spanBackUTF8Lengths[i];
1167
0
                if(overlap==ALL_CP_CONTAINED) {
1168
0
                    s8+=length8;
1169
0
                    continue;  // Irrelevant string.
1170
0
                }
1171
1172
                // Try to match this string at pos-(length8-overlap)..pos-length8.
1173
0
                if(overlap>=LONG_SPAN) {
1174
0
                    overlap=length8;
1175
                    // While contained: No point matching fully inside the code point span.
1176
0
                    int32_t len1=0;
1177
0
                    U8_FWD_1(s8, len1, overlap);
1178
0
                    overlap-=len1;  // Length of the string minus the first code point.
1179
0
                }
1180
0
                if(overlap>spanLength) {
1181
0
                    overlap=spanLength;
1182
0
                }
1183
0
                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1184
0
                for(;;) {
1185
0
                    if(dec>pos) {
1186
0
                        break;
1187
0
                    }
1188
                    // Try to match if the decrement is not listed already.
1189
                    // Match at code point boundaries. (The UTF-8 strings were converted
1190
                    // from UTF-16 and are guaranteed to be well-formed.)
1191
0
                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1192
0
                        !offsets.containsOffset(dec) &&
1193
0
                        matches8(s+pos-dec, s8, length8)
1194
0
                    ) {
1195
0
                        if(dec==pos) {
1196
0
                            return 0;  // Reached the start of the string.
1197
0
                        }
1198
0
                        offsets.addOffset(dec);
1199
0
                    }
1200
0
                    if(overlap==0) {
1201
0
                        break;
1202
0
                    }
1203
0
                    --overlap;
1204
0
                    ++dec;
1205
0
                }
1206
0
                s8+=length8;
1207
0
            }
1208
0
        } else /* USET_SPAN_SIMPLE */ {
1209
0
            int32_t maxDec=0, maxOverlap=0;
1210
0
            for(i=0; i<stringsLength; ++i) {
1211
0
                length8=utf8Lengths[i];
1212
0
                if(length8==0) {
1213
0
                    continue;  // String not representable in UTF-8.
1214
0
                }
1215
0
                int32_t overlap=spanBackUTF8Lengths[i];
1216
                // For longest match, we do need to try to match even an all-contained string
1217
                // to find the match from the latest end.
1218
1219
                // Try to match this string at pos-(length8-overlap)..pos-length8.
1220
0
                if(overlap>=LONG_SPAN) {
1221
0
                    overlap=length8;
1222
                    // Longest match: Need to match fully inside the code point span
1223
                    // to find the match from the latest end.
1224
0
                }
1225
0
                if(overlap>spanLength) {
1226
0
                    overlap=spanLength;
1227
0
                }
1228
0
                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1229
0
                for(;;) {
1230
0
                    if(dec>pos || overlap<maxOverlap) {
1231
0
                        break;
1232
0
                    }
1233
                    // Try to match if the string is longer or ends later.
1234
                    // Match at code point boundaries. (The UTF-8 strings were converted
1235
                    // from UTF-16 and are guaranteed to be well-formed.)
1236
0
                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1237
0
                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1238
0
                        matches8(s+pos-dec, s8, length8)
1239
0
                    ) {
1240
0
                        maxDec=dec;  // Longest match from latest end.
1241
0
                        maxOverlap=overlap;
1242
0
                        break;
1243
0
                    }
1244
0
                    --overlap;
1245
0
                    ++dec;
1246
0
                }
1247
0
                s8+=length8;
1248
0
            }
1249
1250
0
            if(maxDec!=0 || maxOverlap!=0) {
1251
                // Longest-match algorithm, and there was a string match.
1252
                // Simply continue before it.
1253
0
                pos-=maxDec;
1254
0
                if(pos==0) {
1255
0
                    return 0;  // Reached the start of the string.
1256
0
                }
1257
0
                spanLength=0;  // Match strings from before a string match.
1258
0
                continue;
1259
0
            }
1260
0
        }
1261
        // Finished trying to match all strings at pos.
1262
1263
0
        if(spanLength!=0 || pos==length) {
1264
            // The position is before an unlimited code point span (spanLength!=0),
1265
            // not before a string match.
1266
            // The only position where spanLength==0 before a span is pos==length.
1267
            // Otherwise, an unlimited code point span is only tried again when no
1268
            // strings match, and if such a non-initial span fails we stop.
1269
0
            if(offsets.isEmpty()) {
1270
0
                return pos;  // No strings matched before a span.
1271
0
            }
1272
            // Match strings from before the next string match.
1273
0
        } else {
1274
            // The position is before a string match (or a single code point).
1275
0
            if(offsets.isEmpty()) {
1276
                // No more strings matched before a previous string match.
1277
                // Try another code point span from before the last string match.
1278
0
                int32_t oldPos=pos;
1279
0
                pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1280
0
                spanLength=oldPos-pos;
1281
0
                if( pos==0 ||           // Reached the start of the string, or
1282
0
                    spanLength==0       // neither strings nor span progressed.
1283
0
                ) {
1284
0
                    return pos;
1285
0
                }
1286
0
                continue;  // spanLength>0: Match strings from before a span.
1287
0
            } else {
1288
                // Try to match only one code point from before a string match if some
1289
                // string matched beyond it, so that we try all possible positions
1290
                // and don't overshoot.
1291
0
                spanLength=spanOneBackUTF8(spanSet, s, pos);
1292
0
                if(spanLength>0) {
1293
0
                    if(spanLength==pos) {
1294
0
                        return 0;  // Reached the start of the string.
1295
0
                    }
1296
                    // Match strings before this code point.
1297
                    // There cannot be any decrements below it because UnicodeSet strings
1298
                    // contain multiple code points.
1299
0
                    pos-=spanLength;
1300
0
                    offsets.shift(spanLength);
1301
0
                    spanLength=0;
1302
0
                    continue;  // Match strings from before a single code point.
1303
0
                }
1304
                // Match strings from before the next string match.
1305
0
            }
1306
0
        }
1307
0
        pos-=offsets.popMinimum();
1308
0
        spanLength=0;  // Match strings from before a string match.
1309
0
    }
1310
0
}
1311
1312
/*
1313
 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1314
 *
1315
 * Theoretical algorithm:
1316
 * - Iterate through the string, and at each code point boundary:
1317
 *   + If the code point there is in the set, then return with the current position.
1318
 *   + If a set string matches at the current position, then return with the current position.
1319
 *
1320
 * Optimized implementation:
1321
 *
1322
 * (Same assumption as for span() above.)
1323
 *
1324
 * Create and cache a spanNotSet which contains all of the single code points
1325
 * of the original set but none of its strings.
1326
 * For each set string add its initial code point to the spanNotSet.
1327
 * (Also add its final code point for spanNotBack().)
1328
 *
1329
 * - Loop:
1330
 *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1331
 *   + If the current code point is in the original set, then
1332
 *     return the current position.
1333
 *   + If any set string matches at the current position, then
1334
 *     return the current position.
1335
 *   + If there is no match at the current position, neither for the code point there
1336
 *     nor for any set string, then skip this code point and continue the loop.
1337
 *     This happens for set-string-initial code points that were added to spanNotSet
1338
 *     when there is not actually a match for such a set string.
1339
 */
1340
1341
0
int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1342
0
    int32_t pos=0, rest=length;
1343
0
    int32_t i, stringsLength=strings.size();
1344
0
    do {
1345
        // Span until we find a code point from the set,
1346
        // or a code point that starts or ends some string.
1347
0
        i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1348
0
        if(i==rest) {
1349
0
            return length;  // Reached the end of the string.
1350
0
        }
1351
0
        pos+=i;
1352
0
        rest-=i;
1353
1354
        // Check whether the current code point is in the original set,
1355
        // without the string starts and ends.
1356
0
        int32_t cpLength=spanOne(spanSet, s+pos, rest);
1357
0
        if(cpLength>0) {
1358
0
            return pos;  // There is a set element at pos.
1359
0
        }
1360
1361
        // Try to match the strings at pos.
1362
0
        for(i=0; i<stringsLength; ++i) {
1363
0
            if(spanLengths[i]==ALL_CP_CONTAINED) {
1364
0
                continue;  // Irrelevant string.
1365
0
            }
1366
0
            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1367
0
            const UChar *s16=string.getBuffer();
1368
0
            int32_t length16=string.length();
1369
0
            if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1370
0
                return pos;  // There is a set element at pos.
1371
0
            }
1372
0
        }
1373
1374
        // The span(while not contained) ended on a string start/end which is
1375
        // not in the original set. Skip this code point and continue.
1376
        // cpLength<0
1377
0
        pos-=cpLength;
1378
0
        rest+=cpLength;
1379
0
    } while(rest!=0);
1380
0
    return length;  // Reached the end of the string.
1381
0
}
1382
1383
0
int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1384
0
    int32_t pos=length;
1385
0
    int32_t i, stringsLength=strings.size();
1386
0
    do {
1387
        // Span until we find a code point from the set,
1388
        // or a code point that starts or ends some string.
1389
0
        pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1390
0
        if(pos==0) {
1391
0
            return 0;  // Reached the start of the string.
1392
0
        }
1393
1394
        // Check whether the current code point is in the original set,
1395
        // without the string starts and ends.
1396
0
        int32_t cpLength=spanOneBack(spanSet, s, pos);
1397
0
        if(cpLength>0) {
1398
0
            return pos;  // There is a set element at pos.
1399
0
        }
1400
1401
        // Try to match the strings at pos.
1402
0
        for(i=0; i<stringsLength; ++i) {
1403
            // Use spanLengths rather than a spanBackLengths pointer because
1404
            // it is easier and we only need to know whether the string is irrelevant
1405
            // which is the same in either array.
1406
0
            if(spanLengths[i]==ALL_CP_CONTAINED) {
1407
0
                continue;  // Irrelevant string.
1408
0
            }
1409
0
            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1410
0
            const UChar *s16=string.getBuffer();
1411
0
            int32_t length16=string.length();
1412
0
            if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1413
0
                return pos;  // There is a set element at pos.
1414
0
            }
1415
0
        }
1416
1417
        // The span(while not contained) ended on a string start/end which is
1418
        // not in the original set. Skip this code point and continue.
1419
        // cpLength<0
1420
0
        pos+=cpLength;
1421
0
    } while(pos!=0);
1422
0
    return 0;  // Reached the start of the string.
1423
0
}
1424
1425
0
int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1426
0
    int32_t pos=0, rest=length;
1427
0
    int32_t i, stringsLength=strings.size();
1428
0
    uint8_t *spanUTF8Lengths=spanLengths;
1429
0
    if(all) {
1430
0
        spanUTF8Lengths+=2*stringsLength;
1431
0
    }
1432
0
    do {
1433
        // Span until we find a code point from the set,
1434
        // or a code point that starts or ends some string.
1435
0
        i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1436
0
        if(i==rest) {
1437
0
            return length;  // Reached the end of the string.
1438
0
        }
1439
0
        pos+=i;
1440
0
        rest-=i;
1441
1442
        // Check whether the current code point is in the original set,
1443
        // without the string starts and ends.
1444
0
        int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1445
0
        if(cpLength>0) {
1446
0
            return pos;  // There is a set element at pos.
1447
0
        }
1448
1449
        // Try to match the strings at pos.
1450
0
        const uint8_t *s8=utf8;
1451
0
        int32_t length8;
1452
0
        for(i=0; i<stringsLength; ++i) {
1453
0
            length8=utf8Lengths[i];
1454
            // ALL_CP_CONTAINED: Irrelevant string.
1455
0
            if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1456
0
                return pos;  // There is a set element at pos.
1457
0
            }
1458
0
            s8+=length8;
1459
0
        }
1460
1461
        // The span(while not contained) ended on a string start/end which is
1462
        // not in the original set. Skip this code point and continue.
1463
        // cpLength<0
1464
0
        pos-=cpLength;
1465
0
        rest+=cpLength;
1466
0
    } while(rest!=0);
1467
0
    return length;  // Reached the end of the string.
1468
0
}
1469
1470
0
int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1471
0
    int32_t pos=length;
1472
0
    int32_t i, stringsLength=strings.size();
1473
0
    uint8_t *spanBackUTF8Lengths=spanLengths;
1474
0
    if(all) {
1475
0
        spanBackUTF8Lengths+=3*stringsLength;
1476
0
    }
1477
0
    do {
1478
        // Span until we find a code point from the set,
1479
        // or a code point that starts or ends some string.
1480
0
        pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1481
0
        if(pos==0) {
1482
0
            return 0;  // Reached the start of the string.
1483
0
        }
1484
1485
        // Check whether the current code point is in the original set,
1486
        // without the string starts and ends.
1487
0
        int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1488
0
        if(cpLength>0) {
1489
0
            return pos;  // There is a set element at pos.
1490
0
        }
1491
1492
        // Try to match the strings at pos.
1493
0
        const uint8_t *s8=utf8;
1494
0
        int32_t length8;
1495
0
        for(i=0; i<stringsLength; ++i) {
1496
0
            length8=utf8Lengths[i];
1497
            // ALL_CP_CONTAINED: Irrelevant string.
1498
0
            if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1499
0
                return pos;  // There is a set element at pos.
1500
0
            }
1501
0
            s8+=length8;
1502
0
        }
1503
1504
        // The span(while not contained) ended on a string start/end which is
1505
        // not in the original set. Skip this code point and continue.
1506
        // cpLength<0
1507
0
        pos+=cpLength;
1508
0
    } while(pos!=0);
1509
0
    return 0;  // Reached the start of the string.
1510
0
}
1511
1512
U_NAMESPACE_END