Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/common/unisetspan.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 2007-2012, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  unisetspan.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 2007mar01
16
*   created by: Markus W. Scherer
17
*/
18
19
#include "unicode/utypes.h"
20
#include "unicode/uniset.h"
21
#include "unicode/ustring.h"
22
#include "unicode/utf8.h"
23
#include "unicode/utf16.h"
24
#include "cmemory.h"
25
#include "uvector.h"
26
#include "unisetspan.h"
27
28
U_NAMESPACE_BEGIN
29
30
/*
31
 * List of offsets from the current position from where to try matching
32
 * a code point or a string.
33
 * Store offsets rather than indexes to simplify the code and use the same list
34
 * for both increments (in span()) and decrements (in spanBack()).
35
 *
36
 * Assumption: The maximum offset is limited, and the offsets that are stored
37
 * at any one time are relatively dense, that is, there are normally no gaps of
38
 * hundreds or thousands of offset values.
39
 *
40
 * The implementation uses a circular buffer of byte flags,
41
 * each indicating whether the corresponding offset is in the list.
42
 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43
 * physically moving part of the list.
44
 *
45
 * Note: In principle, the caller should setMaxLength() to the maximum of the
46
 * max string length and U16_LENGTH/U8_LENGTH to account for
47
 * "long" single code points.
48
 * However, this implementation uses at least a staticList with more than
49
 * U8_LENGTH entries anyway.
50
 *
51
 * Note: If maxLength were guaranteed to be no more than 32 or 64,
52
 * the list could be stored as bit flags in a single integer.
53
 * Rather than handling a circular buffer with a start list index,
54
 * the integer would simply be shifted when lower offsets are removed.
55
 * UnicodeSet does not have a limit on the lengths of strings.
56
 */
57
class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
58
public:
59
0
    OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60
61
0
    ~OffsetList() {
62
0
        if(list!=staticList) {
63
0
            uprv_free(list);
64
0
        }
65
0
    }
66
67
    // Call exactly once if the list is to be used.
68
0
    void setMaxLength(int32_t maxLength) {
69
0
        if(maxLength<=(int32_t)sizeof(staticList)) {
70
0
            capacity=(int32_t)sizeof(staticList);
71
0
        } else {
72
0
            UBool *l=(UBool *)uprv_malloc(maxLength);
73
0
            if(l!=NULL) {
74
0
                list=l;
75
0
                capacity=maxLength;
76
0
            }
77
0
        }
78
0
        uprv_memset(list, 0, capacity);
79
0
    }
80
81
0
    void clear() {
82
0
        uprv_memset(list, 0, capacity);
83
0
        start=length=0;
84
0
    }
85
86
0
    UBool isEmpty() const {
87
0
        return (UBool)(length==0);
88
0
    }
89
90
    // Reduce all stored offsets by delta, used when the current position
91
    // moves by delta.
92
    // There must not be any offsets lower than delta.
93
    // If there is an offset equal to delta, it is removed.
94
    // delta=[1..maxLength]
95
0
    void shift(int32_t delta) {
96
0
        int32_t i=start+delta;
97
0
        if(i>=capacity) {
98
0
            i-=capacity;
99
0
        }
100
0
        if(list[i]) {
101
0
            list[i]=FALSE;
102
0
            --length;
103
0
        }
104
0
        start=i;
105
0
    }
106
107
    // Add an offset. The list must not contain it yet.
108
    // offset=[1..maxLength]
109
0
    void addOffset(int32_t offset) {
110
0
        int32_t i=start+offset;
111
0
        if(i>=capacity) {
112
0
            i-=capacity;
113
0
        }
114
0
        list[i]=TRUE;
115
0
        ++length;
116
0
    }
117
118
    // offset=[1..maxLength]
119
0
    UBool containsOffset(int32_t offset) const {
120
0
        int32_t i=start+offset;
121
0
        if(i>=capacity) {
122
0
            i-=capacity;
123
0
        }
124
0
        return list[i];
125
0
    }
126
127
    // Find the lowest stored offset from a non-empty list, remove it,
128
    // and reduce all other offsets by this minimum.
129
    // Returns [1..maxLength].
130
0
    int32_t popMinimum() {
131
        // Look for the next offset in list[start+1..capacity-1].
132
0
        int32_t i=start, result;
133
0
        while(++i<capacity) {
134
0
            if(list[i]) {
135
0
                list[i]=FALSE;
136
0
                --length;
137
0
                result=i-start;
138
0
                start=i;
139
0
                return result;
140
0
            }
141
0
        }
142
        // i==capacity
143
144
        // Wrap around and look for the next offset in list[0..start].
145
        // Since the list is not empty, there will be one.
146
0
        result=capacity-start;
147
0
        i=0;
148
0
        while(!list[i]) {
149
0
            ++i;
150
0
        }
151
0
        list[i]=FALSE;
152
0
        --length;
153
0
        start=i;
154
0
        return result+=i;
155
0
    }
156
157
private:
158
    UBool *list;
159
    int32_t capacity;
160
    int32_t length;
161
    int32_t start;
162
163
    UBool staticList[16];
164
};
165
166
// Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167
static int32_t
168
0
getUTF8Length(const UChar *s, int32_t length) {
169
0
    UErrorCode errorCode=U_ZERO_ERROR;
170
0
    int32_t length8=0;
171
0
    u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
172
0
    if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173
0
        return length8;
174
0
    } else {
175
        // The string contains an unpaired surrogate.
176
        // Ignore this string.
177
0
        return 0;
178
0
    }
179
0
}
180
181
// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182
static int32_t
183
0
appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
184
0
    UErrorCode errorCode=U_ZERO_ERROR;
185
0
    int32_t length8=0;
186
0
    u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
187
0
    if(U_SUCCESS(errorCode)) {
188
0
        return length8;
189
0
    } else {
190
        // The string contains an unpaired surrogate.
191
        // Ignore this string.
192
0
        return 0;
193
0
    }
194
0
}
195
196
static inline uint8_t
197
0
makeSpanLengthByte(int32_t spanLength) {
198
    // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199
0
    return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200
0
}
201
202
// Construct for all variants of span(), or only for any one variant.
203
// Initialize as little as possible, for single use.
204
UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205
                                           const UVector &setStrings,
206
                                           uint32_t which)
207
0
        : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
208
          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
209
0
          utf8Length(0),
210
0
          maxLength16(0), maxLength8(0),
211
0
          all((UBool)(which==ALL)) {
212
0
    spanSet.retainAll(set);
213
0
    if(which&NOT_CONTAINED) {
214
        // Default to the same sets.
215
        // addToSpanNotSet() will create a separate set if necessary.
216
0
        pSpanNotSet=&spanSet;
217
0
    }
218
219
    // Determine if the strings even need to be taken into account at all for span() etc.
220
    // If any string is relevant, then all strings need to be used for
221
    // span(longest match) but only the relevant ones for span(while contained).
222
    // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223
    //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224
    //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225
    // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226
0
    int32_t stringsLength=strings.size();
227
228
0
    int32_t i, spanLength;
229
0
    UBool someRelevant=FALSE;
230
0
    for(i=0; i<stringsLength; ++i) {
231
0
        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
232
0
        const UChar *s16=string.getBuffer();
233
0
        int32_t length16=string.length();
234
0
        if (length16==0) {
235
0
            continue;  // skip the empty string
236
0
        }
237
0
        UBool thisRelevant;
238
0
        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
239
0
        if(spanLength<length16) {  // Relevant string.
240
0
            someRelevant=thisRelevant=TRUE;
241
0
        } else {
242
0
            thisRelevant=FALSE;
243
0
        }
244
0
        if((which&UTF16) && length16>maxLength16) {
245
0
            maxLength16=length16;
246
0
        }
247
0
        if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
248
0
            int32_t length8=getUTF8Length(s16, length16);
249
0
            utf8Length+=length8;
250
0
            if(length8>maxLength8) {
251
0
                maxLength8=length8;
252
0
            }
253
0
        }
254
0
    }
255
0
    if(!someRelevant) {
256
0
        maxLength16=maxLength8=0;
257
0
        return;
258
0
    }
259
260
    // Freeze after checking for the need to use strings at all because freezing
261
    // a set takes some time and memory which are wasted if there are no relevant strings.
262
0
    if(all) {
263
0
        spanSet.freeze();
264
0
    }
265
266
0
    uint8_t *spanBackLengths;
267
0
    uint8_t *spanUTF8Lengths;
268
0
    uint8_t *spanBackUTF8Lengths;
269
270
    // Allocate a block of meta data.
271
0
    int32_t allocSize;
272
0
    if(all) {
273
        // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
274
0
        allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
275
0
    } else {
276
0
        allocSize=stringsLength;  // One set of span lengths.
277
0
        if(which&UTF8) {
278
            // UTF-8 lengths and UTF-8 strings.
279
0
            allocSize+=stringsLength*4+utf8Length;
280
0
        }
281
0
    }
282
0
    if(allocSize<=(int32_t)sizeof(staticLengths)) {
283
0
        utf8Lengths=staticLengths;
284
0
    } else {
285
0
        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
286
0
        if(utf8Lengths==NULL) {
287
0
            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
288
0
            return;  // Out of memory.
289
0
        }
290
0
    }
291
292
0
    if(all) {
293
        // Store span lengths for all span() variants.
294
0
        spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
295
0
        spanBackLengths=spanLengths+stringsLength;
296
0
        spanUTF8Lengths=spanBackLengths+stringsLength;
297
0
        spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
298
0
        utf8=spanBackUTF8Lengths+stringsLength;
299
0
    } else {
300
        // Store span lengths for only one span() variant.
301
0
        if(which&UTF8) {
302
0
            spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
303
0
            utf8=spanLengths+stringsLength;
304
0
        } else {
305
0
            spanLengths=(uint8_t *)utf8Lengths;
306
0
        }
307
0
        spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
308
0
    }
309
310
    // Set the meta data and pSpanNotSet and write the UTF-8 strings.
311
0
    int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
312
313
0
    for(i=0; i<stringsLength; ++i) {
314
0
        const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
315
0
        const UChar *s16=string.getBuffer();
316
0
        int32_t length16=string.length();
317
0
        spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
318
0
        if(spanLength<length16 && length16>0) {  // Relevant string.
319
0
            if(which&UTF16) {
320
0
                if(which&CONTAINED) {
321
0
                    if(which&FWD) {
322
0
                        spanLengths[i]=makeSpanLengthByte(spanLength);
323
0
                    }
324
0
                    if(which&BACK) {
325
0
                        spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
326
0
                        spanBackLengths[i]=makeSpanLengthByte(spanLength);
327
0
                    }
328
0
                } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
329
0
                    spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
330
0
                }
331
0
            }
332
0
            if(which&UTF8) {
333
0
                uint8_t *s8=utf8+utf8Count;
334
0
                int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
335
0
                utf8Count+=utf8Lengths[i]=length8;
336
0
                if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
337
0
                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
338
0
                } else {  // Relevant for UTF-8.
339
0
                    if(which&CONTAINED) {
340
0
                        if(which&FWD) {
341
0
                            spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
342
0
                            spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
343
0
                        }
344
0
                        if(which&BACK) {
345
0
                            spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
346
0
                            spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
347
0
                        }
348
0
                    } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
349
0
                        spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
350
0
                    }
351
0
                }
352
0
            }
353
0
            if(which&NOT_CONTAINED) {
354
                // Add string start and end code points to the spanNotSet so that
355
                // a span(while not contained) stops before any string.
356
0
                UChar32 c;
357
0
                if(which&FWD) {
358
0
                    int32_t len=0;
359
0
                    U16_NEXT(s16, len, length16, c);
360
0
                    addToSpanNotSet(c);
361
0
                }
362
0
                if(which&BACK) {
363
0
                    int32_t len=length16;
364
0
                    U16_PREV(s16, 0, len, c);
365
0
                    addToSpanNotSet(c);
366
0
                }
367
0
            }
368
0
        } else {  // Irrelevant string. (Also the empty string.)
369
0
            if(which&UTF8) {
370
0
                if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
371
0
                    uint8_t *s8=utf8+utf8Count;
372
0
                    int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
373
0
                    utf8Count+=utf8Lengths[i]=length8;
374
0
                } else {
375
0
                    utf8Lengths[i]=0;
376
0
                }
377
0
            }
378
0
            if(all) {
379
0
                spanLengths[i]=spanBackLengths[i]=
380
0
                    spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
381
0
                        (uint8_t)ALL_CP_CONTAINED;
382
0
            } else {
383
                // All spanXYZLengths pointers contain the same address.
384
0
                spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
385
0
            }
386
0
        }
387
0
    }
388
389
    // Finish.
390
0
    if(all) {
391
0
        pSpanNotSet->freeze();
392
0
    }
393
0
}
394
395
// Copy constructor. Assumes which==ALL for a frozen set.
396
UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
397
                                           const UVector &newParentSetStrings)
398
0
        : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
399
          utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
400
0
          utf8Length(otherStringSpan.utf8Length),
401
0
          maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
402
0
          all(TRUE) {
403
0
    if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
404
0
        pSpanNotSet=&spanSet;
405
0
    } else {
406
0
        pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
407
0
    }
408
409
    // Allocate a block of meta data.
410
    // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
411
0
    int32_t stringsLength=strings.size();
412
0
    int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
413
0
    if(allocSize<=(int32_t)sizeof(staticLengths)) {
414
0
        utf8Lengths=staticLengths;
415
0
    } else {
416
0
        utf8Lengths=(int32_t *)uprv_malloc(allocSize);
417
0
        if(utf8Lengths==NULL) {
418
0
            maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
419
0
            return;  // Out of memory.
420
0
        }
421
0
    }
422
423
0
    spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
424
0
    utf8=spanLengths+stringsLength*4;
425
0
    uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
426
0
}
427
428
0
UnicodeSetStringSpan::~UnicodeSetStringSpan() {
429
0
    if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
430
0
        delete pSpanNotSet;
431
0
    }
432
0
    if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
433
0
        uprv_free(utf8Lengths);
434
0
    }
435
0
}
436
437
0
void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
438
0
    if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
439
0
        if(spanSet.contains(c)) {
440
0
            return;  // Nothing to do.
441
0
        }
442
0
        UnicodeSet *newSet=spanSet.cloneAsThawed();
443
0
        if(newSet==NULL) {
444
0
            return;  // Out of memory.
445
0
        } else {
446
0
            pSpanNotSet=newSet;
447
0
        }
448
0
    }
449
0
    pSpanNotSet->add(c);
450
0
}
451
452
// Compare strings without any argument checks. Requires length>0.
453
static inline UBool
454
0
matches16(const UChar *s, const UChar *t, int32_t length) {
455
0
    do {
456
0
        if(*s++!=*t++) {
457
0
            return FALSE;
458
0
        }
459
0
    } while(--length>0);
460
0
    return TRUE;
461
0
}
462
463
static inline UBool
464
0
matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
465
0
    do {
466
0
        if(*s++!=*t++) {
467
0
            return FALSE;
468
0
        }
469
0
    } while(--length>0);
470
0
    return TRUE;
471
0
}
472
473
// Compare 16-bit Unicode strings (which may be malformed UTF-16)
474
// at code point boundaries.
475
// That is, each edge of a match must not be in the middle of a surrogate pair.
476
static inline UBool
477
0
matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
478
0
    s+=start;
479
0
    limit-=start;
480
0
    return matches16(s, t, length) &&
481
0
           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
482
0
           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
483
0
}
484
485
// Does the set contain the next code point?
486
// If so, return its length; otherwise return its negative length.
487
static inline int32_t
488
0
spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
489
0
    UChar c=*s, c2;
490
0
    if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
491
0
        return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
492
0
    }
493
0
    return set.contains(c) ? 1 : -1;
494
0
}
495
496
static inline int32_t
497
0
spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
498
0
    UChar c=s[length-1], c2;
499
0
    if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
500
0
        return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
501
0
    }
502
0
    return set.contains(c) ? 1 : -1;
503
0
}
504
505
static inline int32_t
506
0
spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
507
0
    UChar32 c=*s;
508
0
    if(U8_IS_SINGLE(c)) {
509
0
        return set.contains(c) ? 1 : -1;
510
0
    }
511
    // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
512
0
    int32_t i=0;
513
0
    U8_NEXT_OR_FFFD(s, i, length, c);
514
0
    return set.contains(c) ? i : -i;
515
0
}
516
517
static inline int32_t
518
0
spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
519
0
    UChar32 c=s[length-1];
520
0
    if(U8_IS_SINGLE(c)) {
521
0
        return set.contains(c) ? 1 : -1;
522
0
    }
523
0
    int32_t i=length-1;
524
0
    c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
525
0
    length-=i;
526
0
    return set.contains(c) ? length : -length;
527
0
}
528
529
/*
530
 * Note: In span() when spanLength==0 (after a string match, or at the beginning
531
 * after an empty code point span) and in spanNot() and spanNotUTF8(),
532
 * string matching could use a binary search
533
 * because all string matches are done from the same start index.
534
 *
535
 * For UTF-8, this would require a comparison function that returns UTF-16 order.
536
 *
537
 * This optimization should not be necessary for normal UnicodeSets because
538
 * most sets have no strings, and most sets with strings have
539
 * very few very short strings.
540
 * For cases with many strings, it might be better to use a different API
541
 * and implementation with a DFA (state machine).
542
 */
543
544
/*
545
 * Algorithm for span(USET_SPAN_CONTAINED)
546
 *
547
 * Theoretical algorithm:
548
 * - Iterate through the string, and at each code point boundary:
549
 *   + If the code point there is in the set, then remember to continue after it.
550
 *   + If a set string matches at the current position, then remember to continue after it.
551
 *   + Either recursively span for each code point or string match,
552
 *     or recursively span for all but the shortest one and
553
 *     iteratively continue the span with the shortest local match.
554
 *   + Remember the longest recursive span (the farthest end point).
555
 *   + If there is no match at the current position, neither for the code point there
556
 *     nor for any set string, then stop and return the longest recursive span length.
557
 *
558
 * Optimized implementation:
559
 *
560
 * (We assume that most sets will have very few very short strings.
561
 * A span using a string-less set is extremely fast.)
562
 *
563
 * Create and cache a spanSet which contains all of the single code points
564
 * of the original set but none of its strings.
565
 *
566
 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
567
 * - Loop:
568
 *   + Try to match each set string at the end of the spanLength.
569
 *     ~ Set strings that start with set-contained code points must be matched
570
 *       with a partial overlap because the recursive algorithm would have tried
571
 *       to match them at every position.
572
 *     ~ Set strings that entirely consist of set-contained code points
573
 *       are irrelevant for span(USET_SPAN_CONTAINED) because the
574
 *       recursive algorithm would continue after them anyway
575
 *       and find the longest recursive match from their end.
576
 *     ~ Rather than recursing, note each end point of a set string match.
577
 *   + If no set string matched after spanSet.span(), then return
578
 *     with where the spanSet.span() ended.
579
 *   + If at least one set string matched after spanSet.span(), then
580
 *     pop the shortest string match end point and continue
581
 *     the loop, trying to match all set strings from there.
582
 *   + If at least one more set string matched after a previous string match,
583
 *     then test if the code point after the previous string match is also
584
 *     contained in the set.
585
 *     Continue the loop with the shortest end point of either this code point
586
 *     or a matching set string.
587
 *   + If no more set string matched after a previous string match,
588
 *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
589
 *     Stop if spanLength==0, otherwise continue the loop.
590
 *
591
 * By noting each end point of a set string match,
592
 * the function visits each string position at most once and finishes
593
 * in linear time.
594
 *
595
 * The recursive algorithm may visit the same string position many times
596
 * if multiple paths lead to it and finishes in exponential time.
597
 */
598
599
/*
600
 * Algorithm for span(USET_SPAN_SIMPLE)
601
 *
602
 * Theoretical algorithm:
603
 * - Iterate through the string, and at each code point boundary:
604
 *   + If the code point there is in the set, then remember to continue after it.
605
 *   + If a set string matches at the current position, then remember to continue after it.
606
 *   + Continue from the farthest match position and ignore all others.
607
 *   + If there is no match at the current position,
608
 *     then stop and return the current position.
609
 *
610
 * Optimized implementation:
611
 *
612
 * (Same assumption and spanSet as above.)
613
 *
614
 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
615
 * - Loop:
616
 *   + Try to match each set string at the end of the spanLength.
617
 *     ~ Set strings that start with set-contained code points must be matched
618
 *       with a partial overlap because the standard algorithm would have tried
619
 *       to match them earlier.
620
 *     ~ Set strings that entirely consist of set-contained code points
621
 *       must be matched with a full overlap because the longest-match algorithm
622
 *       would hide set string matches that end earlier.
623
 *       Such set strings need not be matched earlier inside the code point span
624
 *       because the standard algorithm would then have continued after
625
 *       the set string match anyway.
626
 *     ~ Remember the longest set string match (farthest end point) from the earliest
627
 *       starting point.
628
 *   + If no set string matched after spanSet.span(), then return
629
 *     with where the spanSet.span() ended.
630
 *   + If at least one set string matched, then continue the loop after the
631
 *     longest match from the earliest position.
632
 *   + If no more set string matched after a previous string match,
633
 *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
634
 *     Stop if spanLength==0, otherwise continue the loop.
635
 */
636
637
0
int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
638
0
    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
639
0
        return spanNot(s, length);
640
0
    }
641
0
    int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
642
0
    if(spanLength==length) {
643
0
        return length;
644
0
    }
645
646
    // Consider strings; they may overlap with the span.
647
0
    OffsetList offsets;
648
0
    if(spanCondition==USET_SPAN_CONTAINED) {
649
        // Use offset list to try all possibilities.
650
0
        offsets.setMaxLength(maxLength16);
651
0
    }
652
0
    int32_t pos=spanLength, rest=length-pos;
653
0
    int32_t i, stringsLength=strings.size();
654
0
    for(;;) {
655
0
        if(spanCondition==USET_SPAN_CONTAINED) {
656
0
            for(i=0; i<stringsLength; ++i) {
657
0
                int32_t overlap=spanLengths[i];
658
0
                if(overlap==ALL_CP_CONTAINED) {
659
0
                    continue;  // Irrelevant string. (Also the empty string.)
660
0
                }
661
0
                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
662
0
                const UChar *s16=string.getBuffer();
663
0
                int32_t length16=string.length();
664
0
                U_ASSERT(length>0);
665
666
                // Try to match this string at pos-overlap..pos.
667
0
                if(overlap>=LONG_SPAN) {
668
0
                    overlap=length16;
669
                    // While contained: No point matching fully inside the code point span.
670
0
                    U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
671
0
                }
672
0
                if(overlap>spanLength) {
673
0
                    overlap=spanLength;
674
0
                }
675
0
                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
676
0
                for(;;) {
677
0
                    if(inc>rest) {
678
0
                        break;
679
0
                    }
680
                    // Try to match if the increment is not listed already.
681
0
                    if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
682
0
                        if(inc==rest) {
683
0
                            return length;  // Reached the end of the string.
684
0
                        }
685
0
                        offsets.addOffset(inc);
686
0
                    }
687
0
                    if(overlap==0) {
688
0
                        break;
689
0
                    }
690
0
                    --overlap;
691
0
                    ++inc;
692
0
                }
693
0
            }
694
0
        } else /* USET_SPAN_SIMPLE */ {
695
0
            int32_t maxInc=0, maxOverlap=0;
696
0
            for(i=0; i<stringsLength; ++i) {
697
0
                int32_t overlap=spanLengths[i];
698
                // For longest match, we do need to try to match even an all-contained string
699
                // to find the match from the earliest start.
700
701
0
                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
702
0
                const UChar *s16=string.getBuffer();
703
0
                int32_t length16=string.length();
704
0
                if (length16==0) {
705
0
                    continue;  // skip the empty string
706
0
                }
707
708
                // Try to match this string at pos-overlap..pos.
709
0
                if(overlap>=LONG_SPAN) {
710
0
                    overlap=length16;
711
                    // Longest match: Need to match fully inside the code point span
712
                    // to find the match from the earliest start.
713
0
                }
714
0
                if(overlap>spanLength) {
715
0
                    overlap=spanLength;
716
0
                }
717
0
                int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
718
0
                for(;;) {
719
0
                    if(inc>rest || overlap<maxOverlap) {
720
0
                        break;
721
0
                    }
722
                    // Try to match if the string is longer or starts earlier.
723
0
                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
724
0
                        matches16CPB(s, pos-overlap, length, s16, length16)
725
0
                    ) {
726
0
                        maxInc=inc;  // Longest match from earliest start.
727
0
                        maxOverlap=overlap;
728
0
                        break;
729
0
                    }
730
0
                    --overlap;
731
0
                    ++inc;
732
0
                }
733
0
            }
734
735
0
            if(maxInc!=0 || maxOverlap!=0) {
736
                // Longest-match algorithm, and there was a string match.
737
                // Simply continue after it.
738
0
                pos+=maxInc;
739
0
                rest-=maxInc;
740
0
                if(rest==0) {
741
0
                    return length;  // Reached the end of the string.
742
0
                }
743
0
                spanLength=0;  // Match strings from after a string match.
744
0
                continue;
745
0
            }
746
0
        }
747
        // Finished trying to match all strings at pos.
748
749
0
        if(spanLength!=0 || pos==0) {
750
            // The position is after an unlimited code point span (spanLength!=0),
751
            // not after a string match.
752
            // The only position where spanLength==0 after a span is pos==0.
753
            // Otherwise, an unlimited code point span is only tried again when no
754
            // strings match, and if such a non-initial span fails we stop.
755
0
            if(offsets.isEmpty()) {
756
0
                return pos;  // No strings matched after a span.
757
0
            }
758
            // Match strings from after the next string match.
759
0
        } else {
760
            // The position is after a string match (or a single code point).
761
0
            if(offsets.isEmpty()) {
762
                // No more strings matched after a previous string match.
763
                // Try another code point span from after the last string match.
764
0
                spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
765
0
                if( spanLength==rest || // Reached the end of the string, or
766
0
                    spanLength==0       // neither strings nor span progressed.
767
0
                ) {
768
0
                    return pos+spanLength;
769
0
                }
770
0
                pos+=spanLength;
771
0
                rest-=spanLength;
772
0
                continue;  // spanLength>0: Match strings from after a span.
773
0
            } else {
774
                // Try to match only one code point from after a string match if some
775
                // string matched beyond it, so that we try all possible positions
776
                // and don't overshoot.
777
0
                spanLength=spanOne(spanSet, s+pos, rest);
778
0
                if(spanLength>0) {
779
0
                    if(spanLength==rest) {
780
0
                        return length;  // Reached the end of the string.
781
0
                    }
782
                    // Match strings after this code point.
783
                    // There cannot be any increments below it because UnicodeSet strings
784
                    // contain multiple code points.
785
0
                    pos+=spanLength;
786
0
                    rest-=spanLength;
787
0
                    offsets.shift(spanLength);
788
0
                    spanLength=0;
789
0
                    continue;  // Match strings from after a single code point.
790
0
                }
791
                // Match strings from after the next string match.
792
0
            }
793
0
        }
794
0
        int32_t minOffset=offsets.popMinimum();
795
0
        pos+=minOffset;
796
0
        rest-=minOffset;
797
0
        spanLength=0;  // Match strings from after a string match.
798
0
    }
799
0
}
800
801
0
int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
802
0
    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
803
0
        return spanNotBack(s, length);
804
0
    }
805
0
    int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
806
0
    if(pos==0) {
807
0
        return 0;
808
0
    }
809
0
    int32_t spanLength=length-pos;
810
811
    // Consider strings; they may overlap with the span.
812
0
    OffsetList offsets;
813
0
    if(spanCondition==USET_SPAN_CONTAINED) {
814
        // Use offset list to try all possibilities.
815
0
        offsets.setMaxLength(maxLength16);
816
0
    }
817
0
    int32_t i, stringsLength=strings.size();
818
0
    uint8_t *spanBackLengths=spanLengths;
819
0
    if(all) {
820
0
        spanBackLengths+=stringsLength;
821
0
    }
822
0
    for(;;) {
823
0
        if(spanCondition==USET_SPAN_CONTAINED) {
824
0
            for(i=0; i<stringsLength; ++i) {
825
0
                int32_t overlap=spanBackLengths[i];
826
0
                if(overlap==ALL_CP_CONTAINED) {
827
0
                    continue;  // Irrelevant string. (Also the empty string.)
828
0
                }
829
0
                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
830
0
                const UChar *s16=string.getBuffer();
831
0
                int32_t length16=string.length();
832
0
                U_ASSERT(length>0);
833
834
                // Try to match this string at pos-(length16-overlap)..pos-length16.
835
0
                if(overlap>=LONG_SPAN) {
836
0
                    overlap=length16;
837
                    // While contained: No point matching fully inside the code point span.
838
0
                    int32_t len1=0;
839
0
                    U16_FWD_1(s16, len1, overlap);
840
0
                    overlap-=len1;  // Length of the string minus the first code point.
841
0
                }
842
0
                if(overlap>spanLength) {
843
0
                    overlap=spanLength;
844
0
                }
845
0
                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
846
0
                for(;;) {
847
0
                    if(dec>pos) {
848
0
                        break;
849
0
                    }
850
                    // Try to match if the decrement is not listed already.
851
0
                    if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
852
0
                        if(dec==pos) {
853
0
                            return 0;  // Reached the start of the string.
854
0
                        }
855
0
                        offsets.addOffset(dec);
856
0
                    }
857
0
                    if(overlap==0) {
858
0
                        break;
859
0
                    }
860
0
                    --overlap;
861
0
                    ++dec;
862
0
                }
863
0
            }
864
0
        } else /* USET_SPAN_SIMPLE */ {
865
0
            int32_t maxDec=0, maxOverlap=0;
866
0
            for(i=0; i<stringsLength; ++i) {
867
0
                int32_t overlap=spanBackLengths[i];
868
                // For longest match, we do need to try to match even an all-contained string
869
                // to find the match from the latest end.
870
871
0
                const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
872
0
                const UChar *s16=string.getBuffer();
873
0
                int32_t length16=string.length();
874
0
                if (length16==0) {
875
0
                    continue;  // skip the empty string
876
0
                }
877
878
                // Try to match this string at pos-(length16-overlap)..pos-length16.
879
0
                if(overlap>=LONG_SPAN) {
880
0
                    overlap=length16;
881
                    // Longest match: Need to match fully inside the code point span
882
                    // to find the match from the latest end.
883
0
                }
884
0
                if(overlap>spanLength) {
885
0
                    overlap=spanLength;
886
0
                }
887
0
                int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
888
0
                for(;;) {
889
0
                    if(dec>pos || overlap<maxOverlap) {
890
0
                        break;
891
0
                    }
892
                    // Try to match if the string is longer or ends later.
893
0
                    if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
894
0
                        matches16CPB(s, pos-dec, length, s16, length16)
895
0
                    ) {
896
0
                        maxDec=dec;  // Longest match from latest end.
897
0
                        maxOverlap=overlap;
898
0
                        break;
899
0
                    }
900
0
                    --overlap;
901
0
                    ++dec;
902
0
                }
903
0
            }
904
905
0
            if(maxDec!=0 || maxOverlap!=0) {
906
                // Longest-match algorithm, and there was a string match.
907
                // Simply continue before it.
908
0
                pos-=maxDec;
909
0
                if(pos==0) {
910
0
                    return 0;  // Reached the start of the string.
911
0
                }
912
0
                spanLength=0;  // Match strings from before a string match.
913
0
                continue;
914
0
            }
915
0
        }
916
        // Finished trying to match all strings at pos.
917
918
0
        if(spanLength!=0 || pos==length) {
919
            // The position is before an unlimited code point span (spanLength!=0),
920
            // not before a string match.
921
            // The only position where spanLength==0 before a span is pos==length.
922
            // Otherwise, an unlimited code point span is only tried again when no
923
            // strings match, and if such a non-initial span fails we stop.
924
0
            if(offsets.isEmpty()) {
925
0
                return pos;  // No strings matched before a span.
926
0
            }
927
            // Match strings from before the next string match.
928
0
        } else {
929
            // The position is before a string match (or a single code point).
930
0
            if(offsets.isEmpty()) {
931
                // No more strings matched before a previous string match.
932
                // Try another code point span from before the last string match.
933
0
                int32_t oldPos=pos;
934
0
                pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
935
0
                spanLength=oldPos-pos;
936
0
                if( pos==0 ||           // Reached the start of the string, or
937
0
                    spanLength==0       // neither strings nor span progressed.
938
0
                ) {
939
0
                    return pos;
940
0
                }
941
0
                continue;  // spanLength>0: Match strings from before a span.
942
0
            } else {
943
                // Try to match only one code point from before a string match if some
944
                // string matched beyond it, so that we try all possible positions
945
                // and don't overshoot.
946
0
                spanLength=spanOneBack(spanSet, s, pos);
947
0
                if(spanLength>0) {
948
0
                    if(spanLength==pos) {
949
0
                        return 0;  // Reached the start of the string.
950
0
                    }
951
                    // Match strings before this code point.
952
                    // There cannot be any decrements below it because UnicodeSet strings
953
                    // contain multiple code points.
954
0
                    pos-=spanLength;
955
0
                    offsets.shift(spanLength);
956
0
                    spanLength=0;
957
0
                    continue;  // Match strings from before a single code point.
958
0
                }
959
                // Match strings from before the next string match.
960
0
            }
961
0
        }
962
0
        pos-=offsets.popMinimum();
963
0
        spanLength=0;  // Match strings from before a string match.
964
0
    }
965
0
}
966
967
0
int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
968
0
    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
969
0
        return spanNotUTF8(s, length);
970
0
    }
971
0
    int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
972
0
    if(spanLength==length) {
973
0
        return length;
974
0
    }
975
976
    // Consider strings; they may overlap with the span.
977
0
    OffsetList offsets;
978
0
    if(spanCondition==USET_SPAN_CONTAINED) {
979
        // Use offset list to try all possibilities.
980
0
        offsets.setMaxLength(maxLength8);
981
0
    }
982
0
    int32_t pos=spanLength, rest=length-pos;
983
0
    int32_t i, stringsLength=strings.size();
984
0
    uint8_t *spanUTF8Lengths=spanLengths;
985
0
    if(all) {
986
0
        spanUTF8Lengths+=2*stringsLength;
987
0
    }
988
0
    for(;;) {
989
0
        const uint8_t *s8=utf8;
990
0
        int32_t length8;
991
0
        if(spanCondition==USET_SPAN_CONTAINED) {
992
0
            for(i=0; i<stringsLength; ++i) {
993
0
                length8=utf8Lengths[i];
994
0
                if(length8==0) {
995
0
                    continue;  // String not representable in UTF-8.
996
0
                }
997
0
                int32_t overlap=spanUTF8Lengths[i];
998
0
                if(overlap==ALL_CP_CONTAINED) {
999
0
                    s8+=length8;
1000
0
                    continue;  // Irrelevant string.
1001
0
                }
1002
1003
                // Try to match this string at pos-overlap..pos.
1004
0
                if(overlap>=LONG_SPAN) {
1005
0
                    overlap=length8;
1006
                    // While contained: No point matching fully inside the code point span.
1007
0
                    U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
1008
0
                }
1009
0
                if(overlap>spanLength) {
1010
0
                    overlap=spanLength;
1011
0
                }
1012
0
                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1013
0
                for(;;) {
1014
0
                    if(inc>rest) {
1015
0
                        break;
1016
0
                    }
1017
                    // Try to match if the increment is not listed already.
1018
                    // Match at code point boundaries. (The UTF-8 strings were converted
1019
                    // from UTF-16 and are guaranteed to be well-formed.)
1020
0
                    if(!U8_IS_TRAIL(s[pos-overlap]) &&
1021
0
                            !offsets.containsOffset(inc) &&
1022
0
                            matches8(s+pos-overlap, s8, length8)) {
1023
0
                        if(inc==rest) {
1024
0
                            return length;  // Reached the end of the string.
1025
0
                        }
1026
0
                        offsets.addOffset(inc);
1027
0
                    }
1028
0
                    if(overlap==0) {
1029
0
                        break;
1030
0
                    }
1031
0
                    --overlap;
1032
0
                    ++inc;
1033
0
                }
1034
0
                s8+=length8;
1035
0
            }
1036
0
        } else /* USET_SPAN_SIMPLE */ {
1037
0
            int32_t maxInc=0, maxOverlap=0;
1038
0
            for(i=0; i<stringsLength; ++i) {
1039
0
                length8=utf8Lengths[i];
1040
0
                if(length8==0) {
1041
0
                    continue;  // String not representable in UTF-8.
1042
0
                }
1043
0
                int32_t overlap=spanUTF8Lengths[i];
1044
                // For longest match, we do need to try to match even an all-contained string
1045
                // to find the match from the earliest start.
1046
1047
                // Try to match this string at pos-overlap..pos.
1048
0
                if(overlap>=LONG_SPAN) {
1049
0
                    overlap=length8;
1050
                    // Longest match: Need to match fully inside the code point span
1051
                    // to find the match from the earliest start.
1052
0
                }
1053
0
                if(overlap>spanLength) {
1054
0
                    overlap=spanLength;
1055
0
                }
1056
0
                int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1057
0
                for(;;) {
1058
0
                    if(inc>rest || overlap<maxOverlap) {
1059
0
                        break;
1060
0
                    }
1061
                    // Try to match if the string is longer or starts earlier.
1062
                    // Match at code point boundaries. (The UTF-8 strings were converted
1063
                    // from UTF-16 and are guaranteed to be well-formed.)
1064
0
                    if(!U8_IS_TRAIL(s[pos-overlap]) &&
1065
0
                            (overlap>maxOverlap ||
1066
0
                                /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1067
0
                            matches8(s+pos-overlap, s8, length8)) {
1068
0
                        maxInc=inc;  // Longest match from earliest start.
1069
0
                        maxOverlap=overlap;
1070
0
                        break;
1071
0
                    }
1072
0
                    --overlap;
1073
0
                    ++inc;
1074
0
                }
1075
0
                s8+=length8;
1076
0
            }
1077
1078
0
            if(maxInc!=0 || maxOverlap!=0) {
1079
                // Longest-match algorithm, and there was a string match.
1080
                // Simply continue after it.
1081
0
                pos+=maxInc;
1082
0
                rest-=maxInc;
1083
0
                if(rest==0) {
1084
0
                    return length;  // Reached the end of the string.
1085
0
                }
1086
0
                spanLength=0;  // Match strings from after a string match.
1087
0
                continue;
1088
0
            }
1089
0
        }
1090
        // Finished trying to match all strings at pos.
1091
1092
0
        if(spanLength!=0 || pos==0) {
1093
            // The position is after an unlimited code point span (spanLength!=0),
1094
            // not after a string match.
1095
            // The only position where spanLength==0 after a span is pos==0.
1096
            // Otherwise, an unlimited code point span is only tried again when no
1097
            // strings match, and if such a non-initial span fails we stop.
1098
0
            if(offsets.isEmpty()) {
1099
0
                return pos;  // No strings matched after a span.
1100
0
            }
1101
            // Match strings from after the next string match.
1102
0
        } else {
1103
            // The position is after a string match (or a single code point).
1104
0
            if(offsets.isEmpty()) {
1105
                // No more strings matched after a previous string match.
1106
                // Try another code point span from after the last string match.
1107
0
                spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1108
0
                if( spanLength==rest || // Reached the end of the string, or
1109
0
                    spanLength==0       // neither strings nor span progressed.
1110
0
                ) {
1111
0
                    return pos+spanLength;
1112
0
                }
1113
0
                pos+=spanLength;
1114
0
                rest-=spanLength;
1115
0
                continue;  // spanLength>0: Match strings from after a span.
1116
0
            } else {
1117
                // Try to match only one code point from after a string match if some
1118
                // string matched beyond it, so that we try all possible positions
1119
                // and don't overshoot.
1120
0
                spanLength=spanOneUTF8(spanSet, s+pos, rest);
1121
0
                if(spanLength>0) {
1122
0
                    if(spanLength==rest) {
1123
0
                        return length;  // Reached the end of the string.
1124
0
                    }
1125
                    // Match strings after this code point.
1126
                    // There cannot be any increments below it because UnicodeSet strings
1127
                    // contain multiple code points.
1128
0
                    pos+=spanLength;
1129
0
                    rest-=spanLength;
1130
0
                    offsets.shift(spanLength);
1131
0
                    spanLength=0;
1132
0
                    continue;  // Match strings from after a single code point.
1133
0
                }
1134
                // Match strings from after the next string match.
1135
0
            }
1136
0
        }
1137
0
        int32_t minOffset=offsets.popMinimum();
1138
0
        pos+=minOffset;
1139
0
        rest-=minOffset;
1140
0
        spanLength=0;  // Match strings from after a string match.
1141
0
    }
1142
0
}
1143
1144
0
int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1145
0
    if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1146
0
        return spanNotBackUTF8(s, length);
1147
0
    }
1148
0
    int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1149
0
    if(pos==0) {
1150
0
        return 0;
1151
0
    }
1152
0
    int32_t spanLength=length-pos;
1153
1154
    // Consider strings; they may overlap with the span.
1155
0
    OffsetList offsets;
1156
0
    if(spanCondition==USET_SPAN_CONTAINED) {
1157
        // Use offset list to try all possibilities.
1158
0
        offsets.setMaxLength(maxLength8);
1159
0
    }
1160
0
    int32_t i, stringsLength=strings.size();
1161
0
    uint8_t *spanBackUTF8Lengths=spanLengths;
1162
0
    if(all) {
1163
0
        spanBackUTF8Lengths+=3*stringsLength;
1164
0
    }
1165
0
    for(;;) {
1166
0
        const uint8_t *s8=utf8;
1167
0
        int32_t length8;
1168
0
        if(spanCondition==USET_SPAN_CONTAINED) {
1169
0
            for(i=0; i<stringsLength; ++i) {
1170
0
                length8=utf8Lengths[i];
1171
0
                if(length8==0) {
1172
0
                    continue;  // String not representable in UTF-8.
1173
0
                }
1174
0
                int32_t overlap=spanBackUTF8Lengths[i];
1175
0
                if(overlap==ALL_CP_CONTAINED) {
1176
0
                    s8+=length8;
1177
0
                    continue;  // Irrelevant string.
1178
0
                }
1179
1180
                // Try to match this string at pos-(length8-overlap)..pos-length8.
1181
0
                if(overlap>=LONG_SPAN) {
1182
0
                    overlap=length8;
1183
                    // While contained: No point matching fully inside the code point span.
1184
0
                    int32_t len1=0;
1185
0
                    U8_FWD_1(s8, len1, overlap);
1186
0
                    overlap-=len1;  // Length of the string minus the first code point.
1187
0
                }
1188
0
                if(overlap>spanLength) {
1189
0
                    overlap=spanLength;
1190
0
                }
1191
0
                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1192
0
                for(;;) {
1193
0
                    if(dec>pos) {
1194
0
                        break;
1195
0
                    }
1196
                    // Try to match if the decrement is not listed already.
1197
                    // Match at code point boundaries. (The UTF-8 strings were converted
1198
                    // from UTF-16 and are guaranteed to be well-formed.)
1199
0
                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1200
0
                        !offsets.containsOffset(dec) &&
1201
0
                        matches8(s+pos-dec, s8, length8)
1202
0
                    ) {
1203
0
                        if(dec==pos) {
1204
0
                            return 0;  // Reached the start of the string.
1205
0
                        }
1206
0
                        offsets.addOffset(dec);
1207
0
                    }
1208
0
                    if(overlap==0) {
1209
0
                        break;
1210
0
                    }
1211
0
                    --overlap;
1212
0
                    ++dec;
1213
0
                }
1214
0
                s8+=length8;
1215
0
            }
1216
0
        } else /* USET_SPAN_SIMPLE */ {
1217
0
            int32_t maxDec=0, maxOverlap=0;
1218
0
            for(i=0; i<stringsLength; ++i) {
1219
0
                length8=utf8Lengths[i];
1220
0
                if(length8==0) {
1221
0
                    continue;  // String not representable in UTF-8.
1222
0
                }
1223
0
                int32_t overlap=spanBackUTF8Lengths[i];
1224
                // For longest match, we do need to try to match even an all-contained string
1225
                // to find the match from the latest end.
1226
1227
                // Try to match this string at pos-(length8-overlap)..pos-length8.
1228
0
                if(overlap>=LONG_SPAN) {
1229
0
                    overlap=length8;
1230
                    // Longest match: Need to match fully inside the code point span
1231
                    // to find the match from the latest end.
1232
0
                }
1233
0
                if(overlap>spanLength) {
1234
0
                    overlap=spanLength;
1235
0
                }
1236
0
                int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1237
0
                for(;;) {
1238
0
                    if(dec>pos || overlap<maxOverlap) {
1239
0
                        break;
1240
0
                    }
1241
                    // Try to match if the string is longer or ends later.
1242
                    // Match at code point boundaries. (The UTF-8 strings were converted
1243
                    // from UTF-16 and are guaranteed to be well-formed.)
1244
0
                    if( !U8_IS_TRAIL(s[pos-dec]) &&
1245
0
                        (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1246
0
                        matches8(s+pos-dec, s8, length8)
1247
0
                    ) {
1248
0
                        maxDec=dec;  // Longest match from latest end.
1249
0
                        maxOverlap=overlap;
1250
0
                        break;
1251
0
                    }
1252
0
                    --overlap;
1253
0
                    ++dec;
1254
0
                }
1255
0
                s8+=length8;
1256
0
            }
1257
1258
0
            if(maxDec!=0 || maxOverlap!=0) {
1259
                // Longest-match algorithm, and there was a string match.
1260
                // Simply continue before it.
1261
0
                pos-=maxDec;
1262
0
                if(pos==0) {
1263
0
                    return 0;  // Reached the start of the string.
1264
0
                }
1265
0
                spanLength=0;  // Match strings from before a string match.
1266
0
                continue;
1267
0
            }
1268
0
        }
1269
        // Finished trying to match all strings at pos.
1270
1271
0
        if(spanLength!=0 || pos==length) {
1272
            // The position is before an unlimited code point span (spanLength!=0),
1273
            // not before a string match.
1274
            // The only position where spanLength==0 before a span is pos==length.
1275
            // Otherwise, an unlimited code point span is only tried again when no
1276
            // strings match, and if such a non-initial span fails we stop.
1277
0
            if(offsets.isEmpty()) {
1278
0
                return pos;  // No strings matched before a span.
1279
0
            }
1280
            // Match strings from before the next string match.
1281
0
        } else {
1282
            // The position is before a string match (or a single code point).
1283
0
            if(offsets.isEmpty()) {
1284
                // No more strings matched before a previous string match.
1285
                // Try another code point span from before the last string match.
1286
0
                int32_t oldPos=pos;
1287
0
                pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1288
0
                spanLength=oldPos-pos;
1289
0
                if( pos==0 ||           // Reached the start of the string, or
1290
0
                    spanLength==0       // neither strings nor span progressed.
1291
0
                ) {
1292
0
                    return pos;
1293
0
                }
1294
0
                continue;  // spanLength>0: Match strings from before a span.
1295
0
            } else {
1296
                // Try to match only one code point from before a string match if some
1297
                // string matched beyond it, so that we try all possible positions
1298
                // and don't overshoot.
1299
0
                spanLength=spanOneBackUTF8(spanSet, s, pos);
1300
0
                if(spanLength>0) {
1301
0
                    if(spanLength==pos) {
1302
0
                        return 0;  // Reached the start of the string.
1303
0
                    }
1304
                    // Match strings before this code point.
1305
                    // There cannot be any decrements below it because UnicodeSet strings
1306
                    // contain multiple code points.
1307
0
                    pos-=spanLength;
1308
0
                    offsets.shift(spanLength);
1309
0
                    spanLength=0;
1310
0
                    continue;  // Match strings from before a single code point.
1311
0
                }
1312
                // Match strings from before the next string match.
1313
0
            }
1314
0
        }
1315
0
        pos-=offsets.popMinimum();
1316
0
        spanLength=0;  // Match strings from before a string match.
1317
0
    }
1318
0
}
1319
1320
/*
1321
 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1322
 *
1323
 * Theoretical algorithm:
1324
 * - Iterate through the string, and at each code point boundary:
1325
 *   + If the code point there is in the set, then return with the current position.
1326
 *   + If a set string matches at the current position, then return with the current position.
1327
 *
1328
 * Optimized implementation:
1329
 *
1330
 * (Same assumption as for span() above.)
1331
 *
1332
 * Create and cache a spanNotSet which contains all of the single code points
1333
 * of the original set but none of its strings.
1334
 * For each set string add its initial code point to the spanNotSet.
1335
 * (Also add its final code point for spanNotBack().)
1336
 *
1337
 * - Loop:
1338
 *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1339
 *   + If the current code point is in the original set, then
1340
 *     return the current position.
1341
 *   + If any set string matches at the current position, then
1342
 *     return the current position.
1343
 *   + If there is no match at the current position, neither for the code point there
1344
 *     nor for any set string, then skip this code point and continue the loop.
1345
 *     This happens for set-string-initial code points that were added to spanNotSet
1346
 *     when there is not actually a match for such a set string.
1347
 */
1348
1349
0
int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1350
0
    int32_t pos=0, rest=length;
1351
0
    int32_t i, stringsLength=strings.size();
1352
0
    do {
1353
        // Span until we find a code point from the set,
1354
        // or a code point that starts or ends some string.
1355
0
        i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1356
0
        if(i==rest) {
1357
0
            return length;  // Reached the end of the string.
1358
0
        }
1359
0
        pos+=i;
1360
0
        rest-=i;
1361
1362
        // Check whether the current code point is in the original set,
1363
        // without the string starts and ends.
1364
0
        int32_t cpLength=spanOne(spanSet, s+pos, rest);
1365
0
        if(cpLength>0) {
1366
0
            return pos;  // There is a set element at pos.
1367
0
        }
1368
1369
        // Try to match the strings at pos.
1370
0
        for(i=0; i<stringsLength; ++i) {
1371
0
            if(spanLengths[i]==ALL_CP_CONTAINED) {
1372
0
                continue;  // Irrelevant string. (Also the empty string.)
1373
0
            }
1374
0
            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1375
0
            const UChar *s16=string.getBuffer();
1376
0
            int32_t length16=string.length();
1377
0
            U_ASSERT(length>0);
1378
0
            if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1379
0
                return pos;  // There is a set element at pos.
1380
0
            }
1381
0
        }
1382
1383
        // The span(while not contained) ended on a string start/end which is
1384
        // not in the original set. Skip this code point and continue.
1385
        // cpLength<0
1386
0
        pos-=cpLength;
1387
0
        rest+=cpLength;
1388
0
    } while(rest!=0);
1389
0
    return length;  // Reached the end of the string.
1390
0
}
1391
1392
0
int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1393
0
    int32_t pos=length;
1394
0
    int32_t i, stringsLength=strings.size();
1395
0
    do {
1396
        // Span until we find a code point from the set,
1397
        // or a code point that starts or ends some string.
1398
0
        pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1399
0
        if(pos==0) {
1400
0
            return 0;  // Reached the start of the string.
1401
0
        }
1402
1403
        // Check whether the current code point is in the original set,
1404
        // without the string starts and ends.
1405
0
        int32_t cpLength=spanOneBack(spanSet, s, pos);
1406
0
        if(cpLength>0) {
1407
0
            return pos;  // There is a set element at pos.
1408
0
        }
1409
1410
        // Try to match the strings at pos.
1411
0
        for(i=0; i<stringsLength; ++i) {
1412
            // Use spanLengths rather than a spanBackLengths pointer because
1413
            // it is easier and we only need to know whether the string is irrelevant
1414
            // which is the same in either array.
1415
0
            if(spanLengths[i]==ALL_CP_CONTAINED) {
1416
0
                continue;  // Irrelevant string. (Also the empty string.)
1417
0
            }
1418
0
            const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1419
0
            const UChar *s16=string.getBuffer();
1420
0
            int32_t length16=string.length();
1421
0
            U_ASSERT(length>0);
1422
0
            if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1423
0
                return pos;  // There is a set element at pos.
1424
0
            }
1425
0
        }
1426
1427
        // The span(while not contained) ended on a string start/end which is
1428
        // not in the original set. Skip this code point and continue.
1429
        // cpLength<0
1430
0
        pos+=cpLength;
1431
0
    } while(pos!=0);
1432
0
    return 0;  // Reached the start of the string.
1433
0
}
1434
1435
0
int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1436
0
    int32_t pos=0, rest=length;
1437
0
    int32_t i, stringsLength=strings.size();
1438
0
    uint8_t *spanUTF8Lengths=spanLengths;
1439
0
    if(all) {
1440
0
        spanUTF8Lengths+=2*stringsLength;
1441
0
    }
1442
0
    do {
1443
        // Span until we find a code point from the set,
1444
        // or a code point that starts or ends some string.
1445
0
        i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1446
0
        if(i==rest) {
1447
0
            return length;  // Reached the end of the string.
1448
0
        }
1449
0
        pos+=i;
1450
0
        rest-=i;
1451
1452
        // Check whether the current code point is in the original set,
1453
        // without the string starts and ends.
1454
0
        int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1455
0
        if(cpLength>0) {
1456
0
            return pos;  // There is a set element at pos.
1457
0
        }
1458
1459
        // Try to match the strings at pos.
1460
0
        const uint8_t *s8=utf8;
1461
0
        int32_t length8;
1462
0
        for(i=0; i<stringsLength; ++i) {
1463
0
            length8=utf8Lengths[i];
1464
            // ALL_CP_CONTAINED: Irrelevant string.
1465
0
            if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1466
0
                return pos;  // There is a set element at pos.
1467
0
            }
1468
0
            s8+=length8;
1469
0
        }
1470
1471
        // The span(while not contained) ended on a string start/end which is
1472
        // not in the original set. Skip this code point and continue.
1473
        // cpLength<0
1474
0
        pos-=cpLength;
1475
0
        rest+=cpLength;
1476
0
    } while(rest!=0);
1477
0
    return length;  // Reached the end of the string.
1478
0
}
1479
1480
0
int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1481
0
    int32_t pos=length;
1482
0
    int32_t i, stringsLength=strings.size();
1483
0
    uint8_t *spanBackUTF8Lengths=spanLengths;
1484
0
    if(all) {
1485
0
        spanBackUTF8Lengths+=3*stringsLength;
1486
0
    }
1487
0
    do {
1488
        // Span until we find a code point from the set,
1489
        // or a code point that starts or ends some string.
1490
0
        pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1491
0
        if(pos==0) {
1492
0
            return 0;  // Reached the start of the string.
1493
0
        }
1494
1495
        // Check whether the current code point is in the original set,
1496
        // without the string starts and ends.
1497
0
        int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1498
0
        if(cpLength>0) {
1499
0
            return pos;  // There is a set element at pos.
1500
0
        }
1501
1502
        // Try to match the strings at pos.
1503
0
        const uint8_t *s8=utf8;
1504
0
        int32_t length8;
1505
0
        for(i=0; i<stringsLength; ++i) {
1506
0
            length8=utf8Lengths[i];
1507
            // ALL_CP_CONTAINED: Irrelevant string.
1508
0
            if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1509
0
                return pos;  // There is a set element at pos.
1510
0
            }
1511
0
            s8+=length8;
1512
0
        }
1513
1514
        // The span(while not contained) ended on a string start/end which is
1515
        // not in the original set. Skip this code point and continue.
1516
        // cpLength<0
1517
0
        pos+=cpLength;
1518
0
    } while(pos!=0);
1519
0
    return 0;  // Reached the start of the string.
1520
0
}
1521
1522
U_NAMESPACE_END