Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/i18n/usearch.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 2001-2015 IBM and others. All rights reserved.
6
**********************************************************************
7
*   Date        Name        Description
8
*  07/02/2001   synwee      Creation.
9
**********************************************************************
10
*/
11
12
#include "unicode/utypes.h"
13
14
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
15
16
#include "unicode/usearch.h"
17
#include "unicode/ustring.h"
18
#include "unicode/uchar.h"
19
#include "unicode/utf16.h"
20
#include "normalizer2impl.h"
21
#include "usrchimp.h"
22
#include "cmemory.h"
23
#include "ucln_in.h"
24
#include "uassert.h"
25
#include "ustr_imp.h"
26
27
U_NAMESPACE_USE
28
29
// don't use Boyer-Moore
30
// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
31
#define BOYER_MOORE 0
32
33
// internal definition ---------------------------------------------------
34
35
0
#define LAST_BYTE_MASK_          0xFF
36
0
#define SECOND_LAST_BYTE_SHIFT_  8
37
#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
38
39
static const Normalizer2Impl *g_nfcImpl = NULL;
40
41
// internal methods -------------------------------------------------
42
43
/**
44
* Fast collation element iterator setOffset.
45
* This function does not check for bounds.
46
* @param coleiter collation element iterator
47
* @param offset to set
48
*/
49
static
50
inline void setColEIterOffset(UCollationElements *elems,
51
                      int32_t             offset)
52
0
{
53
0
    // Note: Not "fast" any more after the 2013 collation rewrite.
54
0
    // We do not want to expose more internals than necessary.
55
0
    UErrorCode status = U_ZERO_ERROR;
56
0
    ucol_setOffset(elems, offset, &status);
57
0
}
58
59
/**
60
* Getting the mask for collation strength
61
* @param strength collation strength
62
* @return collation element mask
63
*/
64
static
65
inline uint32_t getMask(UCollationStrength strength)
66
0
{
67
0
    switch (strength)
68
0
    {
69
0
    case UCOL_PRIMARY:
70
0
        return UCOL_PRIMARYORDERMASK;
71
0
    case UCOL_SECONDARY:
72
0
        return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
73
0
    default:
74
0
        return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
75
0
               UCOL_PRIMARYORDERMASK;
76
0
    }
77
0
}
78
79
/**
80
* @param ce 32-bit collation element
81
* @return hash code
82
*/
83
static
84
inline int hashFromCE32(uint32_t ce)
85
0
{
86
0
    int hc = (int)(
87
0
            ((((((ce >> 24) * 37) +
88
0
            (ce >> 16)) * 37) +
89
0
            (ce >> 8)) * 37) +
90
0
            ce);
91
0
    hc %= MAX_TABLE_SIZE_;
92
0
    if (hc < 0) {
93
0
        hc += MAX_TABLE_SIZE_;
94
0
    }
95
0
    return hc;
96
0
}
97
98
U_CDECL_BEGIN
99
static UBool U_CALLCONV
100
0
usearch_cleanup(void) {
101
0
    g_nfcImpl = NULL;
102
0
    return TRUE;
103
0
}
104
U_CDECL_END
105
106
/**
107
* Initializing the fcd tables.
108
* Internal method, status assumed to be a success.
109
* @param status output error if any, caller to check status before calling
110
*               method, status assumed to be success when passed in.
111
*/
112
static
113
inline void initializeFCD(UErrorCode *status)
114
0
{
115
0
    if (g_nfcImpl == NULL) {
116
0
        g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
117
0
        ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
118
0
    }
119
0
}
120
121
/**
122
* Gets the fcd value for a character at the argument index.
123
* This method takes into accounts of the supplementary characters.
124
* @param str UTF16 string where character for fcd retrieval resides
125
* @param offset position of the character whose fcd is to be retrieved, to be
126
*               overwritten with the next character position, taking
127
*               surrogate characters into consideration.
128
* @param strlength length of the argument string
129
* @return fcd value
130
*/
131
static
132
uint16_t getFCD(const UChar   *str, int32_t *offset,
133
                             int32_t  strlength)
134
0
{
135
0
    const UChar *temp = str + *offset;
136
0
    uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
137
0
    *offset = (int32_t)(temp - str);
138
0
    return result;
139
0
}
140
141
/**
142
* Getting the modified collation elements taking into account the collation
143
* attributes
144
* @param strsrch string search data
145
* @param sourcece
146
* @return the modified collation element
147
*/
148
static
149
inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
150
0
{
151
0
    // note for tertiary we can't use the collator->tertiaryMask, that
152
0
    // is a preprocessed mask that takes into account case options. since
153
0
    // we are only concerned with exact matches, we don't need that.
154
0
    sourcece &= strsrch->ceMask;
155
0
156
0
    if (strsrch->toShift) {
157
0
        // alternate handling here, since only the 16 most significant digits
158
0
        // is only used, we can safely do a compare without masking
159
0
        // if the ce is a variable, we mask and get only the primary values
160
0
        // no shifting to quartenary is required since all primary values
161
0
        // less than variabletop will need to be masked off anyway.
162
0
        if (strsrch->variableTop > sourcece) {
163
0
            if (strsrch->strength >= UCOL_QUATERNARY) {
164
0
                sourcece &= UCOL_PRIMARYORDERMASK;
165
0
            }
166
0
            else {
167
0
                sourcece = UCOL_IGNORABLE;
168
0
            }
169
0
        }
170
0
    } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
171
0
        sourcece = 0xFFFF;
172
0
    }
173
0
174
0
    return sourcece;
175
0
}
176
177
/**
178
* Allocate a memory and returns NULL if it failed.
179
* Internal method, status assumed to be a success.
180
* @param size to allocate
181
* @param status output error if any, caller to check status before calling
182
*               method, status assumed to be success when passed in.
183
* @return newly allocated array, NULL otherwise
184
*/
185
static
186
inline void * allocateMemory(uint32_t size, UErrorCode *status)
187
0
{
188
0
    uint32_t *result = (uint32_t *)uprv_malloc(size);
189
0
    if (result == NULL) {
190
0
        *status = U_MEMORY_ALLOCATION_ERROR;
191
0
    }
192
0
    return result;
193
0
}
194
195
/**
196
* Adds a uint32_t value to a destination array.
197
* Creates a new array if we run out of space. The caller will have to
198
* manually deallocate the newly allocated array.
199
* Internal method, status assumed to be success, caller has to check status
200
* before calling this method. destination not to be NULL and has at least
201
* size destinationlength.
202
* @param destination target array
203
* @param offset destination offset to add value
204
* @param destinationlength target array size, return value for the new size
205
* @param value to be added
206
* @param increments incremental size expected
207
* @param status output error if any, caller to check status before calling
208
*               method, status assumed to be success when passed in.
209
* @return new destination array, destination if there was no new allocation
210
*/
211
static
212
inline int32_t * addTouint32_tArray(int32_t    *destination,
213
                                    uint32_t    offset,
214
                                    uint32_t   *destinationlength,
215
                                    uint32_t    value,
216
                                    uint32_t    increments,
217
                                    UErrorCode *status)
218
0
{
219
0
    uint32_t newlength = *destinationlength;
220
0
    if (offset + 1 == newlength) {
221
0
        newlength += increments;
222
0
        int32_t *temp = (int32_t *)allocateMemory(
223
0
                                         sizeof(int32_t) * newlength, status);
224
0
        if (U_FAILURE(*status)) {
225
0
            return NULL;
226
0
        }
227
0
        uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
228
0
        *destinationlength = newlength;
229
0
        destination        = temp;
230
0
    }
231
0
    destination[offset] = value;
232
0
    return destination;
233
0
}
234
235
/**
236
* Adds a uint64_t value to a destination array.
237
* Creates a new array if we run out of space. The caller will have to
238
* manually deallocate the newly allocated array.
239
* Internal method, status assumed to be success, caller has to check status
240
* before calling this method. destination not to be NULL and has at least
241
* size destinationlength.
242
* @param destination target array
243
* @param offset destination offset to add value
244
* @param destinationlength target array size, return value for the new size
245
* @param value to be added
246
* @param increments incremental size expected
247
* @param status output error if any, caller to check status before calling
248
*               method, status assumed to be success when passed in.
249
* @return new destination array, destination if there was no new allocation
250
*/
251
static
252
inline int64_t * addTouint64_tArray(int64_t    *destination,
253
                                    uint32_t    offset,
254
                                    uint32_t   *destinationlength,
255
                                    uint64_t    value,
256
                                    uint32_t    increments,
257
                                    UErrorCode *status)
258
0
{
259
0
    uint32_t newlength = *destinationlength;
260
0
    if (offset + 1 == newlength) {
261
0
        newlength += increments;
262
0
        int64_t *temp = (int64_t *)allocateMemory(
263
0
                                         sizeof(int64_t) * newlength, status);
264
0
265
0
        if (U_FAILURE(*status)) {
266
0
            return NULL;
267
0
        }
268
0
269
0
        uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
270
0
        *destinationlength = newlength;
271
0
        destination        = temp;
272
0
    }
273
0
274
0
    destination[offset] = value;
275
0
276
0
    return destination;
277
0
}
278
279
/**
280
* Initializing the ce table for a pattern.
281
* Stores non-ignorable collation keys.
282
* Table size will be estimated by the size of the pattern text. Table
283
* expansion will be perform as we go along. Adding 1 to ensure that the table
284
* size definitely increases.
285
* Internal method, status assumed to be a success.
286
* @param strsrch string search data
287
* @param status output error if any, caller to check status before calling
288
*               method, status assumed to be success when passed in.
289
* @return total number of expansions
290
*/
291
static
292
inline uint16_t initializePatternCETable(UStringSearch *strsrch,
293
                                         UErrorCode    *status)
294
0
{
295
0
    UPattern *pattern            = &(strsrch->pattern);
296
0
    uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
297
0
    int32_t  *cetable            = pattern->cesBuffer;
298
0
    uint32_t  patternlength      = pattern->textLength;
299
0
    UCollationElements *coleiter = strsrch->utilIter;
300
0
301
0
    if (coleiter == NULL) {
302
0
        coleiter = ucol_openElements(strsrch->collator, pattern->text,
303
0
                                     patternlength, status);
304
0
        // status will be checked in ucol_next(..) later and if it is an
305
0
        // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
306
0
        // returned.
307
0
        strsrch->utilIter = coleiter;
308
0
    }
309
0
    else {
310
0
        ucol_setText(coleiter, pattern->text, pattern->textLength, status);
311
0
    }
312
0
    if(U_FAILURE(*status)) {
313
0
        return 0;
314
0
    }
315
0
316
0
    if (pattern->ces != cetable && pattern->ces) {
317
0
        uprv_free(pattern->ces);
318
0
    }
319
0
320
0
    uint16_t  offset      = 0;
321
0
    uint16_t  result      = 0;
322
0
    int32_t   ce;
323
0
324
0
    while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
325
0
           U_SUCCESS(*status)) {
326
0
        uint32_t newce = getCE(strsrch, ce);
327
0
        if (newce) {
328
0
            int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
329
0
                                  newce,
330
0
                                  patternlength - ucol_getOffset(coleiter) + 1,
331
0
                                  status);
332
0
            if (U_FAILURE(*status)) {
333
0
                return 0;
334
0
            }
335
0
            offset ++;
336
0
            if (cetable != temp && cetable != pattern->cesBuffer) {
337
0
                uprv_free(cetable);
338
0
            }
339
0
            cetable = temp;
340
0
        }
341
0
        result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
342
0
    }
343
0
344
0
    cetable[offset]   = 0;
345
0
    pattern->ces       = cetable;
346
0
    pattern->cesLength = offset;
347
0
348
0
    return result;
349
0
}
350
351
/**
352
* Initializing the pce table for a pattern.
353
* Stores non-ignorable collation keys.
354
* Table size will be estimated by the size of the pattern text. Table
355
* expansion will be perform as we go along. Adding 1 to ensure that the table
356
* size definitely increases.
357
* Internal method, status assumed to be a success.
358
* @param strsrch string search data
359
* @param status output error if any, caller to check status before calling
360
*               method, status assumed to be success when passed in.
361
* @return total number of expansions
362
*/
363
static
364
inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
365
                                          UErrorCode    *status)
366
0
{
367
0
    UPattern *pattern            = &(strsrch->pattern);
368
0
    uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
369
0
    int64_t  *pcetable           = pattern->pcesBuffer;
370
0
    uint32_t  patternlength      = pattern->textLength;
371
0
    UCollationElements *coleiter = strsrch->utilIter;
372
0
373
0
    if (coleiter == NULL) {
374
0
        coleiter = ucol_openElements(strsrch->collator, pattern->text,
375
0
                                     patternlength, status);
376
0
        // status will be checked in ucol_next(..) later and if it is an
377
0
        // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
378
0
        // returned.
379
0
        strsrch->utilIter = coleiter;
380
0
    } else {
381
0
        ucol_setText(coleiter, pattern->text, pattern->textLength, status);
382
0
    }
383
0
    if(U_FAILURE(*status)) {
384
0
        return 0;
385
0
    }
386
0
387
0
    if (pattern->pces != pcetable && pattern->pces != NULL) {
388
0
        uprv_free(pattern->pces);
389
0
    }
390
0
391
0
    uint16_t  offset = 0;
392
0
    uint16_t  result = 0;
393
0
    int64_t   pce;
394
0
395
0
    icu::UCollationPCE iter(coleiter);
396
0
397
0
    // ** Should processed CEs be signed or unsigned?
398
0
    // ** (the rest of the code in this file seems to play fast-and-loose with
399
0
    // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
400
0
    while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
401
0
           U_SUCCESS(*status)) {
402
0
        int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
403
0
                              pce,
404
0
                              patternlength - ucol_getOffset(coleiter) + 1,
405
0
                              status);
406
0
407
0
        if (U_FAILURE(*status)) {
408
0
            return 0;
409
0
        }
410
0
411
0
        offset += 1;
412
0
413
0
        if (pcetable != temp && pcetable != pattern->pcesBuffer) {
414
0
            uprv_free(pcetable);
415
0
        }
416
0
417
0
        pcetable = temp;
418
0
        //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
419
0
    }
420
0
421
0
    pcetable[offset]   = 0;
422
0
    pattern->pces       = pcetable;
423
0
    pattern->pcesLength = offset;
424
0
425
0
    return result;
426
0
}
427
428
/**
429
* Initializes the pattern struct.
430
* Internal method, status assumed to be success.
431
* @param strsrch UStringSearch data storage
432
* @param status output error if any, caller to check status before calling
433
*               method, status assumed to be success when passed in.
434
* @return expansionsize the total expansion size of the pattern
435
*/
436
static
437
inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
438
0
{
439
0
    if (U_FAILURE(*status)) { return 0; }
440
0
          UPattern   *pattern     = &(strsrch->pattern);
441
0
    const UChar      *patterntext = pattern->text;
442
0
          int32_t     length      = pattern->textLength;
443
0
          int32_t index       = 0;
444
0
445
0
    // Since the strength is primary, accents are ignored in the pattern.
446
0
    if (strsrch->strength == UCOL_PRIMARY) {
447
0
        pattern->hasPrefixAccents = 0;
448
0
        pattern->hasSuffixAccents = 0;
449
0
    } else {
450
0
        pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
451
0
                                                         SECOND_LAST_BYTE_SHIFT_;
452
0
        index = length;
453
0
        U16_BACK_1(patterntext, 0, index);
454
0
        pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
455
0
                                                                 LAST_BYTE_MASK_;
456
0
    }
457
0
458
0
    // ** HACK **
459
0
    if (strsrch->pattern.pces != NULL) {
460
0
        if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
461
0
            uprv_free(strsrch->pattern.pces);
462
0
        }
463
0
464
0
        strsrch->pattern.pces = NULL;
465
0
    }
466
0
467
0
    // since intializePattern is an internal method status is a success.
468
0
    return initializePatternCETable(strsrch, status);
469
0
}
470
471
/**
472
* Initializing shift tables, with the default values.
473
* If a corresponding default value is 0, the shift table is not set.
474
* @param shift table for forwards shift
475
* @param backshift table for backwards shift
476
* @param cetable table containing pattern ce
477
* @param cesize size of the pattern ces
478
* @param expansionsize total size of the expansions
479
* @param defaultforward the default forward value
480
* @param defaultbackward the default backward value
481
*/
482
static
483
inline void setShiftTable(int16_t   shift[], int16_t backshift[],
484
                          int32_t  *cetable, int32_t cesize,
485
                          int16_t   expansionsize,
486
                          int16_t   defaultforward,
487
                          int16_t   defaultbackward)
488
0
{
489
0
    // estimate the value to shift. to do that we estimate the smallest
490
0
    // number of characters to give the relevant ces, ie approximately
491
0
    // the number of ces minus their expansion, since expansions can come
492
0
    // from a character.
493
0
    int32_t count;
494
0
    for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
495
0
        shift[count] = defaultforward;
496
0
    }
497
0
    cesize --; // down to the last index
498
0
    for (count = 0; count < cesize; count ++) {
499
0
        // number of ces from right of array to the count
500
0
        int temp = defaultforward - count - 1;
501
0
        shift[hashFromCE32(cetable[count])] = temp > 1 ? temp : 1;
502
0
    }
503
0
    shift[hashFromCE32(cetable[cesize])] = 1;
504
0
    // for ignorables we just shift by one. see test examples.
505
0
    shift[hashFromCE32(0)] = 1;
506
0
507
0
    for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
508
0
        backshift[count] = defaultbackward;
509
0
    }
510
0
    for (count = cesize; count > 0; count --) {
511
0
        // the original value count does not seem to work
512
0
        backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
513
0
                                          (int16_t)(count - expansionsize) : 1;
514
0
    }
515
0
    backshift[hashFromCE32(cetable[0])] = 1;
516
0
    backshift[hashFromCE32(0)] = 1;
517
0
}
518
519
/**
520
* Building of the pattern collation element list and the boyer moore strsrch
521
* table.
522
* The canonical match will only be performed after the default match fails.
523
* For both cases we need to remember the size of the composed and decomposed
524
* versions of the string. Since the Boyer-Moore shift calculations shifts by
525
* a number of characters in the text and tries to match the pattern from that
526
* offset, the shift value can not be too large in case we miss some
527
* characters. To choose a right shift size, we estimate the NFC form of the
528
* and use its size as a shift guide. The NFC form should be the small
529
* possible representation of the pattern. Anyways, we'll err on the smaller
530
* shift size. Hence the calculation for minlength.
531
* Canonical match will be performed slightly differently. We'll split the
532
* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
533
* the first and last base character (MS), the ending accents (EA). Matches
534
* will be done on MS first, and only when we match MS then some processing
535
* will be required for the prefix and end accents in order to determine if
536
* they match PA and EA. Hence the default shift values
537
* for the canonical match will take the size of either end's accent into
538
* consideration. Forwards search will take the end accents into consideration
539
* for the default shift values and the backwards search will take the prefix
540
* accents into consideration.
541
* If pattern has no non-ignorable ce, we return a illegal argument error.
542
* Internal method, status assumed to be success.
543
* @param strsrch UStringSearch data storage
544
* @param status  for output errors if it occurs, status is assumed to be a
545
*                success when it is passed in.
546
*/
547
static
548
inline void initialize(UStringSearch *strsrch, UErrorCode *status)
549
0
{
550
0
    int16_t expandlength  = initializePattern(strsrch, status);
551
0
    if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
552
0
        UPattern *pattern = &strsrch->pattern;
553
0
        int32_t   cesize  = pattern->cesLength;
554
0
555
0
        int16_t minlength = cesize > expandlength
556
0
                            ? (int16_t)cesize - expandlength : 1;
557
0
        pattern->defaultShiftSize    = minlength;
558
0
        setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
559
0
                      cesize, expandlength, minlength, minlength);
560
0
        return;
561
0
    }
562
0
    strsrch->pattern.defaultShiftSize = 0;
563
0
}
564
565
#if BOYER_MOORE
566
/**
567
* Check to make sure that the match length is at the end of the character by
568
* using the breakiterator.
569
* @param strsrch string search data
570
* @param start target text start offset
571
* @param end target text end offset
572
*/
573
static
574
void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
575
                               int32_t *end)
576
{
577
#if !UCONFIG_NO_BREAK_ITERATION
578
    UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
579
    if (breakiterator) {
580
        int32_t matchend = *end;
581
        //int32_t matchstart = *start;
582
583
        if (!ubrk_isBoundary(breakiterator, matchend)) {
584
            *end = ubrk_following(breakiterator, matchend);
585
        }
586
587
        /* Check the start of the matched text to make sure it doesn't have any accents
588
         * before it.  This code may not be necessary and so it is commented out */
589
        /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
590
            *start = ubrk_preceding(breakiterator, matchstart);
591
        }*/
592
    }
593
#endif
594
}
595
596
/**
597
* Determine whether the target text in UStringSearch bounded by the offset
598
* start and end is one or more whole units of text as
599
* determined by the breakiterator in UStringSearch.
600
* @param strsrch string search data
601
* @param start target text start offset
602
* @param end target text end offset
603
*/
604
static
605
UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
606
                               int32_t    end)
607
{
608
#if !UCONFIG_NO_BREAK_ITERATION
609
    UBreakIterator *breakiterator = strsrch->search->breakIter;
610
    //TODO: Add here.
611
    if (breakiterator) {
612
        int32_t startindex = ubrk_first(breakiterator);
613
        int32_t endindex   = ubrk_last(breakiterator);
614
615
        // out-of-range indexes are never boundary positions
616
        if (start < startindex || start > endindex ||
617
            end < startindex || end > endindex) {
618
            return FALSE;
619
        }
620
        // otherwise, we can use following() on the position before the
621
        // specified one and return true of the position we get back is the
622
        // one the user specified
623
        UBool result = (start == startindex ||
624
                ubrk_following(breakiterator, start - 1) == start) &&
625
               (end == endindex ||
626
                ubrk_following(breakiterator, end - 1) == end);
627
        if (result) {
628
            // iterates the individual ces
629
                  UCollationElements *coleiter  = strsrch->utilIter;
630
            const UChar              *text      = strsrch->search->text +
631
                                                                      start;
632
                  UErrorCode          status    = U_ZERO_ERROR;
633
            ucol_setText(coleiter, text, end - start, &status);
634
            for (int32_t count = 0; count < strsrch->pattern.cesLength;
635
                 count ++) {
636
                int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
637
                if (ce == UCOL_IGNORABLE) {
638
                    count --;
639
                    continue;
640
                }
641
                if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
642
                    return FALSE;
643
                }
644
            }
645
            int32_t nextce = ucol_next(coleiter, &status);
646
            while (ucol_getOffset(coleiter) == (end - start)
647
                   && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
648
                nextce = ucol_next(coleiter, &status);
649
            }
650
            if (ucol_getOffset(coleiter) == (end - start)
651
                && nextce != UCOL_NULLORDER) {
652
                // extra collation elements at the end of the match
653
                return FALSE;
654
            }
655
        }
656
        return result;
657
    }
658
#endif
659
    return TRUE;
660
}
661
662
/**
663
* Getting the next base character offset if current offset is an accent,
664
* or the current offset if the current character contains a base character.
665
* accents the following base character will be returned
666
* @param text string
667
* @param textoffset current offset
668
* @param textlength length of text string
669
* @return the next base character or the current offset
670
*         if the current character is contains a base character.
671
*/
672
static
673
inline int32_t getNextBaseOffset(const UChar       *text,
674
                                           int32_t  textoffset,
675
                                           int32_t      textlength)
676
{
677
    if (textoffset < textlength) {
678
        int32_t temp = textoffset;
679
        if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
680
            while (temp < textlength) {
681
                int32_t result = temp;
682
                if ((getFCD(text, &temp, textlength) >>
683
                     SECOND_LAST_BYTE_SHIFT_) == 0) {
684
                    return result;
685
                }
686
            }
687
            return textlength;
688
        }
689
    }
690
    return textoffset;
691
}
692
693
/**
694
* Gets the next base character offset depending on the string search pattern
695
* data
696
* @param strsrch string search data
697
* @param textoffset current offset, one offset away from the last character
698
*                   to search for.
699
* @return start index of the next base character or the current offset
700
*         if the current character is contains a base character.
701
*/
702
static
703
inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
704
                                                  int32_t    textoffset)
705
{
706
    int32_t textlength = strsrch->search->textLength;
707
    if (strsrch->pattern.hasSuffixAccents &&
708
        textoffset < textlength) {
709
              int32_t  temp       = textoffset;
710
        const UChar       *text       = strsrch->search->text;
711
        U16_BACK_1(text, 0, temp);
712
        if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
713
            return getNextBaseOffset(text, textoffset, textlength);
714
        }
715
    }
716
    return textoffset;
717
}
718
719
/**
720
* Shifting the collation element iterator position forward to prepare for
721
* a following match. If the last character is a unsafe character, we'll only
722
* shift by 1 to capture contractions, normalization etc.
723
* Internal method, status assumed to be success.
724
* @param text strsrch string search data
725
* @param textoffset start text position to do search
726
* @param ce the text ce which failed the match.
727
* @param patternceindex index of the ce within the pattern ce buffer which
728
*        failed the match
729
* @return final offset
730
*/
731
static
732
inline int32_t shiftForward(UStringSearch *strsrch,
733
                                int32_t    textoffset,
734
                                int32_t       ce,
735
                                int32_t        patternceindex)
736
{
737
    UPattern *pattern = &(strsrch->pattern);
738
    if (ce != UCOL_NULLORDER) {
739
        int32_t shift = pattern->shift[hashFromCE32(ce)];
740
        // this is to adjust for characters in the middle of the
741
        // substring for matching that failed.
742
        int32_t adjust = pattern->cesLength - patternceindex;
743
        if (adjust > 1 && shift >= adjust) {
744
            shift -= adjust - 1;
745
        }
746
        textoffset += shift;
747
    }
748
    else {
749
        textoffset += pattern->defaultShiftSize;
750
    }
751
752
    textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
753
    // check for unsafe characters
754
    // * if it is the start or middle of a contraction: to be done after
755
    //   a initial match is found
756
    // * thai or lao base consonant character: similar to contraction
757
    // * high surrogate character: similar to contraction
758
    // * next character is a accent: shift to the next base character
759
    return textoffset;
760
}
761
#endif // #if BOYER_MOORE
762
763
/**
764
* sets match not found
765
* @param strsrch string search data
766
*/
767
static
768
inline void setMatchNotFound(UStringSearch *strsrch)
769
0
{
770
0
    // this method resets the match result regardless of the error status.
771
0
    strsrch->search->matchedIndex = USEARCH_DONE;
772
0
    strsrch->search->matchedLength = 0;
773
0
    if (strsrch->search->isForwardSearching) {
774
0
        setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
775
0
    }
776
0
    else {
777
0
        setColEIterOffset(strsrch->textIter, 0);
778
0
    }
779
0
}
780
781
#if BOYER_MOORE
782
/**
783
* Gets the offset to the next safe point in text.
784
* ie. not the middle of a contraction, swappable characters or supplementary
785
* characters.
786
* @param collator collation sata
787
* @param text string to work with
788
* @param textoffset offset in string
789
* @param textlength length of text string
790
* @return offset to the next safe character
791
*/
792
static
793
inline int32_t getNextSafeOffset(const UCollator   *collator,
794
                                     const UChar       *text,
795
                                           int32_t  textoffset,
796
                                           int32_t      textlength)
797
{
798
    int32_t result = textoffset; // first contraction character
799
    while (result != textlength && ucol_unsafeCP(text[result], collator)) {
800
        result ++;
801
    }
802
    return result;
803
}
804
805
/**
806
* This checks for accents in the potential match started with a .
807
* composite character.
808
* This is really painful... we have to check that composite character do not
809
* have any extra accents. We have to normalize the potential match and find
810
* the immediate decomposed character before the match.
811
* The first composite character would have been taken care of by the fcd
812
* checks in checkForwardExactMatch.
813
* This is the slow path after the fcd of the first character and
814
* the last character has been checked by checkForwardExactMatch and we
815
* determine that the potential match has extra non-ignorable preceding
816
* ces.
817
* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
818
* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
819
* Note here that accents checking are slow and cautioned in the API docs.
820
* Internal method, status assumed to be a success, caller should check status
821
* before calling this method
822
* @param strsrch string search data
823
* @param start index of the potential unfriendly composite character
824
* @param end index of the potential unfriendly composite character
825
* @param status output error status if any.
826
* @return TRUE if there is non-ignorable accents before at the beginning
827
*              of the match, FALSE otherwise.
828
*/
829
830
static
831
UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
832
                                   int32_t    end,
833
                                   UErrorCode    *status)
834
{
835
    UBool result = FALSE;
836
    if (strsrch->pattern.hasPrefixAccents) {
837
              int32_t  length = end - start;
838
              int32_t  offset = 0;
839
        const UChar       *text   = strsrch->search->text + start;
840
841
        U16_FWD_1(text, offset, length);
842
        // we are only concerned with the first composite character
843
        if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
844
            int32_t safeoffset = getNextSafeOffset(strsrch->collator,
845
                                                       text, 0, length);
846
            if (safeoffset != length) {
847
                safeoffset ++;
848
            }
849
            UChar   *norm = NULL;
850
            UChar    buffer[INITIAL_ARRAY_SIZE_];
851
            int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
852
                                            buffer, INITIAL_ARRAY_SIZE_,
853
                                            status);
854
            if (U_FAILURE(*status)) {
855
                return FALSE;
856
            }
857
            if (size >= INITIAL_ARRAY_SIZE_) {
858
                norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
859
                                               status);
860
                // if allocation failed, status will be set to
861
                // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
862
                // checks for it.
863
                size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
864
                                       size, status);
865
                if (U_FAILURE(*status) && norm != NULL) {
866
                    uprv_free(norm);
867
                    return FALSE;
868
                }
869
            }
870
            else {
871
                norm = buffer;
872
            }
873
874
            UCollationElements *coleiter  = strsrch->utilIter;
875
            ucol_setText(coleiter, norm, size, status);
876
            uint32_t            firstce   = strsrch->pattern.ces[0];
877
            UBool               ignorable = TRUE;
878
            uint32_t            ce        = UCOL_IGNORABLE;
879
            while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
880
                offset = ucol_getOffset(coleiter);
881
                if (ce != firstce && ce != UCOL_IGNORABLE) {
882
                    ignorable = FALSE;
883
                }
884
                ce = ucol_next(coleiter, status);
885
            }
886
            UChar32 codepoint;
887
            U16_PREV(norm, 0, offset, codepoint);
888
            result = !ignorable && (u_getCombiningClass(codepoint) != 0);
889
890
            if (norm != buffer) {
891
                uprv_free(norm);
892
            }
893
        }
894
    }
895
896
    return result;
897
}
898
899
/**
900
* Used by exact matches, checks if there are accents before the match.
901
* This is really painful... we have to check that composite characters at
902
* the start of the matches have to not have any extra accents.
903
* We check the FCD of the character first, if it starts with an accent and
904
* the first pattern ce does not match the first ce of the character, we bail.
905
* Otherwise we try normalizing the first composite
906
* character and find the immediate decomposed character before the match to
907
* see if it is an non-ignorable accent.
908
* Now normalizing the first composite character is enough because we ensure
909
* that when the match is passed in here with extra beginning ces, the
910
* first or last ce that match has to occur within the first character.
911
* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
912
* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
913
* Note here that accents checking are slow and cautioned in the API docs.
914
* @param strsrch string search data
915
* @param start offset
916
* @param end offset
917
* @return TRUE if there are accents on either side of the match,
918
*         FALSE otherwise
919
*/
920
static
921
UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
922
                                  int32_t    end)
923
{
924
    if (strsrch->pattern.hasPrefixAccents) {
925
        UCollationElements *coleiter  = strsrch->textIter;
926
        UErrorCode          status    = U_ZERO_ERROR;
927
        // we have been iterating forwards previously
928
        uint32_t            ignorable = TRUE;
929
        int32_t             firstce   = strsrch->pattern.ces[0];
930
931
        setColEIterOffset(coleiter, start);
932
        int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
933
        if (U_FAILURE(status)) {
934
            return TRUE;
935
        }
936
        while (ce != firstce) {
937
            if (ce != UCOL_IGNORABLE) {
938
                ignorable = FALSE;
939
            }
940
            ce = getCE(strsrch, ucol_next(coleiter, &status));
941
            if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
942
                return TRUE;
943
            }
944
        }
945
        if (!ignorable && inNormBuf(coleiter)) {
946
            // within normalization buffer, discontiguous handled here
947
            return TRUE;
948
        }
949
950
        // within text
951
        int32_t temp = start;
952
        // original code
953
        // accent = (getFCD(strsrch->search->text, &temp,
954
        //                  strsrch->search->textLength)
955
        //            >> SECOND_LAST_BYTE_SHIFT_);
956
        // however this code does not work well with VC7 .net in release mode.
957
        // maybe the inlines for getFCD combined with shifting has bugs in
958
        // VC7. anyways this is a work around.
959
        UBool accent = getFCD(strsrch->search->text, &temp,
960
                              strsrch->search->textLength) > 0xFF;
961
        if (!accent) {
962
            return checkExtraMatchAccents(strsrch, start, end, &status);
963
        }
964
        if (!ignorable) {
965
            return TRUE;
966
        }
967
        if (start > 0) {
968
            temp = start;
969
            U16_BACK_1(strsrch->search->text, 0, temp);
970
            if (getFCD(strsrch->search->text, &temp,
971
                       strsrch->search->textLength) & LAST_BYTE_MASK_) {
972
                setColEIterOffset(coleiter, start);
973
                ce = ucol_previous(coleiter, &status);
974
                if (U_FAILURE(status) ||
975
                    (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
976
                    return TRUE;
977
                }
978
            }
979
        }
980
    }
981
982
    return FALSE;
983
}
984
985
/**
986
* Used by exact matches, checks if there are accents bounding the match.
987
* Note this is the initial boundary check. If the potential match
988
* starts or ends with composite characters, the accents in those
989
* characters will be determined later.
990
* Not doing backwards iteration here, since discontiguos contraction for
991
* backwards collation element iterator, use up too many characters.
992
* E.g. looking for \u030A ring in \u01FA A ring above and acute,
993
* should fail since there is a acute at the end of \u01FA
994
* Note here that accents checking are slow and cautioned in the API docs.
995
* @param strsrch string search data
996
* @param start offset of match
997
* @param end end offset of the match
998
* @return TRUE if there are accents on either side of the match,
999
*         FALSE otherwise
1000
*/
1001
static
1002
UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1003
                                 int32_t    end)
1004
{
1005
    if (strsrch->pattern.hasSuffixAccents) {
1006
        const UChar       *text       = strsrch->search->text;
1007
              int32_t  temp       = end;
1008
              int32_t      textlength = strsrch->search->textLength;
1009
        U16_BACK_1(text, 0, temp);
1010
        if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1011
            int32_t             firstce  = strsrch->pattern.ces[0];
1012
            UCollationElements *coleiter = strsrch->textIter;
1013
            UErrorCode          status   = U_ZERO_ERROR;
1014
            int32_t ce;
1015
            setColEIterOffset(coleiter, start);
1016
            while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1017
                if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1018
                    return TRUE;
1019
                }
1020
            }
1021
            int32_t count = 1;
1022
            while (count < strsrch->pattern.cesLength) {
1023
                if (getCE(strsrch, ucol_next(coleiter, &status))
1024
                    == UCOL_IGNORABLE) {
1025
                    // Thai can give an ignorable here.
1026
                    count --;
1027
                }
1028
                if (U_FAILURE(status)) {
1029
                    return TRUE;
1030
                }
1031
                count ++;
1032
            }
1033
1034
            ce = ucol_next(coleiter, &status);
1035
            if (U_FAILURE(status)) {
1036
                return TRUE;
1037
            }
1038
            if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1039
                ce = getCE(strsrch, ce);
1040
            }
1041
            if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1042
                if (ucol_getOffset(coleiter) <= end) {
1043
                    return TRUE;
1044
                }
1045
                if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1046
                    return TRUE;
1047
                }
1048
            }
1049
        }
1050
    }
1051
    return FALSE;
1052
}
1053
#endif // #if BOYER_MOORE
1054
1055
/**
1056
* Checks if the offset runs out of the text string
1057
* @param offset
1058
* @param textlength of the text string
1059
* @return TRUE if offset is out of bounds, FALSE otherwise
1060
*/
1061
static
1062
inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1063
0
{
1064
0
    return offset < 0 || offset > textlength;
1065
0
}
1066
1067
/**
1068
* Checks for identical match
1069
* @param strsrch string search data
1070
* @param start offset of possible match
1071
* @param end offset of possible match
1072
* @return TRUE if identical match is found
1073
*/
1074
static
1075
inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1076
                                  int32_t    end)
1077
0
{
1078
0
    if (strsrch->strength != UCOL_IDENTICAL) {
1079
0
        return TRUE;
1080
0
    }
1081
0
1082
0
    // Note: We could use Normalizer::compare() or similar, but for short strings
1083
0
    // which may not be in FCD it might be faster to just NFD them.
1084
0
    UErrorCode status = U_ZERO_ERROR;
1085
0
    UnicodeString t2, p2;
1086
0
    strsrch->nfd->normalize(
1087
0
        UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1088
0
    strsrch->nfd->normalize(
1089
0
        UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1090
0
    // return FALSE if NFD failed
1091
0
    return U_SUCCESS(status) && t2 == p2;
1092
0
}
1093
1094
#if BOYER_MOORE
1095
/**
1096
* Checks to see if the match is repeated
1097
* @param strsrch string search data
1098
* @param start new match start index
1099
* @param end new match end index
1100
* @return TRUE if the the match is repeated, FALSE otherwise
1101
*/
1102
static
1103
inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1104
                                int32_t    start,
1105
                                int32_t    end)
1106
{
1107
    int32_t lastmatchindex = strsrch->search->matchedIndex;
1108
    UBool       result;
1109
    if (lastmatchindex == USEARCH_DONE) {
1110
        return FALSE;
1111
    }
1112
    if (strsrch->search->isForwardSearching) {
1113
        result = start <= lastmatchindex;
1114
    }
1115
    else {
1116
        result = start >= lastmatchindex;
1117
    }
1118
    if (!result && !strsrch->search->isOverlap) {
1119
        if (strsrch->search->isForwardSearching) {
1120
            result = start < lastmatchindex + strsrch->search->matchedLength;
1121
        }
1122
        else {
1123
            result = end > lastmatchindex;
1124
        }
1125
    }
1126
    return result;
1127
}
1128
1129
/**
1130
* Gets the collation element iterator's current offset.
1131
* @param coleiter collation element iterator
1132
* @param forwards flag TRUE if we are moving in th forwards direction
1133
* @return current offset
1134
*/
1135
static
1136
inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1137
                                              UBool               forwards)
1138
{
1139
    int32_t result = ucol_getOffset(coleiter);
1140
    // intricacies of the the backwards collation element iterator
1141
    if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1142
        result ++;
1143
    }
1144
    return result;
1145
}
1146
1147
/**
1148
* Checks match for contraction.
1149
* If the match ends with a partial contraction we fail.
1150
* If the match starts too far off (because of backwards iteration) we try to
1151
* chip off the extra characters depending on whether a breakiterator has
1152
* been used.
1153
* Internal method, error assumed to be success, caller has to check status
1154
* before calling this method.
1155
* @param strsrch string search data
1156
* @param start offset of potential match, to be modified if necessary
1157
* @param end offset of potential match, to be modified if necessary
1158
* @param status output error status if any
1159
* @return TRUE if match passes the contraction test, FALSE otherwise
1160
*/
1161
1162
static
1163
UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1164
                                     int32_t   *start,
1165
                                     int32_t   *end, UErrorCode  *status)
1166
{
1167
          UCollationElements *coleiter   = strsrch->textIter;
1168
          int32_t             textlength = strsrch->search->textLength;
1169
          int32_t             temp       = *start;
1170
    const UCollator          *collator   = strsrch->collator;
1171
    const UChar              *text       = strsrch->search->text;
1172
    // This part checks if either ends of the match contains potential
1173
    // contraction. If so we'll have to iterate through them
1174
    // The start contraction needs to be checked since ucol_previous dumps
1175
    // all characters till the first safe character into the buffer.
1176
    // *start + 1 is used to test for the unsafe characters instead of *start
1177
    // because ucol_prev takes all unsafe characters till the first safe
1178
    // character ie *start. so by testing *start + 1, we can estimate if
1179
    // excess prefix characters has been included in the potential search
1180
    // results.
1181
    if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1182
        (*start + 1 < textlength
1183
         && ucol_unsafeCP(text[*start + 1], collator))) {
1184
        int32_t expansion  = getExpansionPrefix(coleiter);
1185
        UBool   expandflag = expansion > 0;
1186
        setColEIterOffset(coleiter, *start);
1187
        while (expansion > 0) {
1188
            // getting rid of the redundant ce, caused by setOffset.
1189
            // since backward contraction/expansion may have extra ces if we
1190
            // are in the normalization buffer, hasAccentsBeforeMatch would
1191
            // have taken care of it.
1192
            // E.g. the character \u01FA will have an expansion of 3, but if
1193
            // we are only looking for acute and ring \u030A and \u0301, we'll
1194
            // have to skip the first ce in the expansion buffer.
1195
            ucol_next(coleiter, status);
1196
            if (U_FAILURE(*status)) {
1197
                return FALSE;
1198
            }
1199
            if (ucol_getOffset(coleiter) != temp) {
1200
                *start = temp;
1201
                temp  = ucol_getOffset(coleiter);
1202
            }
1203
            expansion --;
1204
        }
1205
1206
        int32_t  *patternce       = strsrch->pattern.ces;
1207
        int32_t   patterncelength = strsrch->pattern.cesLength;
1208
        int32_t   count           = 0;
1209
        while (count < patterncelength) {
1210
            int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1211
            if (ce == UCOL_IGNORABLE) {
1212
                continue;
1213
            }
1214
            if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1215
                *start = temp;
1216
                temp   = ucol_getOffset(coleiter);
1217
            }
1218
            if (U_FAILURE(*status) || ce != patternce[count]) {
1219
                (*end) ++;
1220
                *end = getNextUStringSearchBaseOffset(strsrch, *end);
1221
                return FALSE;
1222
            }
1223
            count ++;
1224
        }
1225
    }
1226
    return TRUE;
1227
}
1228
1229
/**
1230
* Checks and sets the match information if found.
1231
* Checks
1232
* <ul>
1233
* <li> the potential match does not repeat the previous match
1234
* <li> boundaries are correct
1235
* <li> exact matches has no extra accents
1236
* <li> identical matchesb
1237
* <li> potential match does not end in the middle of a contraction
1238
* <\ul>
1239
* Otherwise the offset will be shifted to the next character.
1240
* Internal method, status assumed to be success, caller has to check status
1241
* before calling this method.
1242
* @param strsrch string search data
1243
* @param textoffset offset in the collation element text. the returned value
1244
*        will be the truncated end offset of the match or the new start
1245
*        search offset.
1246
* @param status output error status if any
1247
* @return TRUE if the match is valid, FALSE otherwise
1248
*/
1249
static
1250
inline UBool checkNextExactMatch(UStringSearch *strsrch,
1251
                                 int32_t   *textoffset, UErrorCode *status)
1252
{
1253
    UCollationElements *coleiter = strsrch->textIter;
1254
    int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1255
1256
    if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1257
        return FALSE;
1258
    }
1259
1260
    // this totally matches, however we need to check if it is repeating
1261
    if (!isBreakUnit(strsrch, start, *textoffset) ||
1262
        checkRepeatedMatch(strsrch, start, *textoffset) ||
1263
        hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1264
        !checkIdentical(strsrch, start, *textoffset) ||
1265
        hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1266
1267
        (*textoffset) ++;
1268
        *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1269
        return FALSE;
1270
    }
1271
1272
    //Add breakiterator boundary check for primary strength search.
1273
    if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1274
        checkBreakBoundary(strsrch, &start, textoffset);
1275
    }
1276
1277
    // totally match, we will get rid of the ending ignorables.
1278
    strsrch->search->matchedIndex  = start;
1279
    strsrch->search->matchedLength = *textoffset - start;
1280
    return TRUE;
1281
}
1282
1283
/**
1284
* Getting the previous base character offset, or the current offset if the
1285
* current character is a base character
1286
* @param text string
1287
* @param textoffset one offset after the current character
1288
* @return the offset of the next character after the base character or the first
1289
*         composed character with accents
1290
*/
1291
static
1292
inline int32_t getPreviousBaseOffset(const UChar       *text,
1293
                                               int32_t  textoffset)
1294
{
1295
    if (textoffset > 0) {
1296
        for (;;) {
1297
            int32_t result = textoffset;
1298
            U16_BACK_1(text, 0, textoffset);
1299
            int32_t temp = textoffset;
1300
            uint16_t fcd = getFCD(text, &temp, result);
1301
            if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1302
                if (fcd & LAST_BYTE_MASK_) {
1303
                    return textoffset;
1304
                }
1305
                return result;
1306
            }
1307
            if (textoffset == 0) {
1308
                return 0;
1309
            }
1310
        }
1311
    }
1312
    return textoffset;
1313
}
1314
1315
/**
1316
* Getting the indexes of the accents that are not blocked in the argument
1317
* accent array
1318
* @param accents array of accents in nfd terminated by a 0.
1319
* @param accentsindex array of indexes of the accents that are not blocked
1320
*/
1321
static
1322
inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1323
{
1324
    int32_t index     = 0;
1325
    int32_t     length    = u_strlen(accents);
1326
    UChar32     codepoint = 0;
1327
    int         cclass    = 0;
1328
    int         result    = 0;
1329
    int32_t temp;
1330
    while (index < length) {
1331
        temp = index;
1332
        U16_NEXT(accents, index, length, codepoint);
1333
        if (u_getCombiningClass(codepoint) != cclass) {
1334
            cclass        = u_getCombiningClass(codepoint);
1335
            accentsindex[result] = temp;
1336
            result ++;
1337
        }
1338
    }
1339
    accentsindex[result] = length;
1340
    return result;
1341
}
1342
1343
/**
1344
* Appends 3 UChar arrays to a destination array.
1345
* Creates a new array if we run out of space. The caller will have to
1346
* manually deallocate the newly allocated array.
1347
* Internal method, status assumed to be success, caller has to check status
1348
* before calling this method. destination not to be NULL and has at least
1349
* size destinationlength.
1350
* @param destination target array
1351
* @param destinationlength target array size, returning the appended length
1352
* @param source1 null-terminated first array
1353
* @param source2 second array
1354
* @param source2length length of seond array
1355
* @param source3 null-terminated third array
1356
* @param status error status if any
1357
* @return new destination array, destination if there was no new allocation
1358
*/
1359
static
1360
inline UChar * addToUCharArray(      UChar      *destination,
1361
                                     int32_t    *destinationlength,
1362
                               const UChar      *source1,
1363
                               const UChar      *source2,
1364
                                     int32_t     source2length,
1365
                               const UChar      *source3,
1366
                                     UErrorCode *status)
1367
{
1368
    int32_t source1length = source1 ? u_strlen(source1) : 0;
1369
    int32_t source3length = source3 ? u_strlen(source3) : 0;
1370
    if (*destinationlength < source1length + source2length + source3length +
1371
                                                                           1)
1372
    {
1373
        destination = (UChar *)allocateMemory(
1374
          (source1length + source2length + source3length + 1) * sizeof(UChar),
1375
          status);
1376
        // if error allocating memory, status will be
1377
        // U_MEMORY_ALLOCATION_ERROR
1378
        if (U_FAILURE(*status)) {
1379
            *destinationlength = 0;
1380
            return NULL;
1381
        }
1382
    }
1383
    if (source1length != 0) {
1384
        u_memcpy(destination, source1, source1length);
1385
    }
1386
    if (source2length != 0) {
1387
        uprv_memcpy(destination + source1length, source2,
1388
                    sizeof(UChar) * source2length);
1389
    }
1390
    if (source3length != 0) {
1391
        uprv_memcpy(destination + source1length + source2length, source3,
1392
                    sizeof(UChar) * source3length);
1393
    }
1394
    *destinationlength = source1length + source2length + source3length;
1395
    return destination;
1396
}
1397
1398
/**
1399
* Running through a collation element iterator to see if the contents matches
1400
* pattern in string search data
1401
* @param strsrch string search data
1402
* @param coleiter collation element iterator
1403
* @return TRUE if a match if found, FALSE otherwise
1404
*/
1405
static
1406
inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1407
                                       UCollationElements *coleiter)
1408
{
1409
    int         patternceindex = strsrch->pattern.cesLength;
1410
    int32_t    *patternce      = strsrch->pattern.ces;
1411
    UErrorCode  status = U_ZERO_ERROR;
1412
    while (patternceindex > 0) {
1413
        int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1414
        if (ce == UCOL_IGNORABLE) {
1415
            continue;
1416
        }
1417
        if (U_FAILURE(status) || ce != *patternce) {
1418
            return FALSE;
1419
        }
1420
        patternce ++;
1421
        patternceindex --;
1422
    }
1423
    return TRUE;
1424
}
1425
1426
/**
1427
* Rearranges the front accents to try matching.
1428
* Prefix accents in the text will be grouped according to their combining
1429
* class and the groups will be mixed and matched to try find the perfect
1430
* match with the pattern.
1431
* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1432
* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1433
*         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1434
*         "\u0301\u0325".
1435
* step 2: check if any of the generated substrings matches the pattern.
1436
* Internal method, status is assumed to be success, caller has to check status
1437
* before calling this method.
1438
* @param strsrch string search match
1439
* @param start first offset of the accents to start searching
1440
* @param end start of the last accent set
1441
* @param status output error status if any
1442
* @return USEARCH_DONE if a match is not found, otherwise return the starting
1443
*         offset of the match. Note this start includes all preceding accents.
1444
*/
1445
static
1446
int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1447
                                       int32_t    start,
1448
                                       int32_t    end,
1449
                                       UErrorCode    *status)
1450
{
1451
    const UChar       *text       = strsrch->search->text;
1452
          int32_t      textlength = strsrch->search->textLength;
1453
          int32_t  tempstart  = start;
1454
1455
    if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1456
        // die... failed at a base character
1457
        return USEARCH_DONE;
1458
    }
1459
1460
    int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1461
    start = getPreviousBaseOffset(text, tempstart);
1462
1463
    UChar       accents[INITIAL_ARRAY_SIZE_];
1464
    // normalizing the offensive string
1465
    unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1466
                    INITIAL_ARRAY_SIZE_, status);
1467
    if (U_FAILURE(*status)) {
1468
        return USEARCH_DONE;
1469
    }
1470
1471
    int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1472
    int32_t         accentsize = getUnblockedAccentIndex(accents,
1473
                                                                 accentsindex);
1474
    int32_t         count      = (2 << (accentsize - 1)) - 1;
1475
    UChar               buffer[INITIAL_ARRAY_SIZE_];
1476
    UCollationElements *coleiter   = strsrch->utilIter;
1477
    while (U_SUCCESS(*status) && count > 0) {
1478
        UChar *rearrange = strsrch->canonicalPrefixAccents;
1479
        // copy the base characters
1480
        for (int k = 0; k < accentsindex[0]; k ++) {
1481
            *rearrange ++ = accents[k];
1482
        }
1483
        // forming all possible canonical rearrangement by dropping
1484
        // sets of accents
1485
        for (int i = 0; i <= accentsize - 1; i ++) {
1486
            int32_t mask = 1 << (accentsize - i - 1);
1487
            if (count & mask) {
1488
                for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1489
                    *rearrange ++ = accents[j];
1490
                }
1491
            }
1492
        }
1493
        *rearrange = 0;
1494
        int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1495
        UChar   *match     = addToUCharArray(buffer, &matchsize,
1496
                                           strsrch->canonicalPrefixAccents,
1497
                                           strsrch->search->text + offset,
1498
                                           end - offset,
1499
                                           strsrch->canonicalSuffixAccents,
1500
                                           status);
1501
1502
        // if status is a failure, ucol_setText does nothing.
1503
        // run the collator iterator through this match
1504
        ucol_setText(coleiter, match, matchsize, status);
1505
        if (U_SUCCESS(*status)) {
1506
            if (checkCollationMatch(strsrch, coleiter)) {
1507
                if (match != buffer) {
1508
                    uprv_free(match);
1509
                }
1510
                return start;
1511
            }
1512
        }
1513
        count --;
1514
    }
1515
    return USEARCH_DONE;
1516
}
1517
1518
/**
1519
* Gets the offset to the safe point in text before textoffset.
1520
* ie. not the middle of a contraction, swappable characters or supplementary
1521
* characters.
1522
* @param collator collation sata
1523
* @param text string to work with
1524
* @param textoffset offset in string
1525
* @param textlength length of text string
1526
* @return offset to the previous safe character
1527
*/
1528
static
1529
inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1530
                                      const UChar       *text,
1531
                                            int32_t  textoffset)
1532
{
1533
    int32_t result = textoffset; // first contraction character
1534
    while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1535
        result --;
1536
    }
1537
    if (result != 0) {
1538
        // the first contraction character is consider unsafe here
1539
        result --;
1540
    }
1541
    return result;
1542
}
1543
1544
/**
1545
* Cleaning up after we passed the safe zone
1546
* @param strsrch string search data
1547
* @param safetext safe text array
1548
* @param safebuffer safe text buffer
1549
* @param coleiter collation element iterator for safe text
1550
*/
1551
static
1552
inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1553
                                  UChar         *safebuffer)
1554
{
1555
    if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1556
    {
1557
       uprv_free(safetext);
1558
    }
1559
}
1560
1561
/**
1562
* Take the rearranged end accents and tries matching. If match failed at
1563
* a seperate preceding set of accents (seperated from the rearranged on by
1564
* at least a base character) then we rearrange the preceding accents and
1565
* tries matching again.
1566
* We allow skipping of the ends of the accent set if the ces do not match.
1567
* However if the failure is found before the accent set, it fails.
1568
* Internal method, status assumed to be success, caller has to check status
1569
* before calling this method.
1570
* @param strsrch string search data
1571
* @param textoffset of the start of the rearranged accent
1572
* @param status output error status if any
1573
* @return USEARCH_DONE if a match is not found, otherwise return the starting
1574
*         offset of the match. Note this start includes all preceding accents.
1575
*/
1576
static
1577
int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1578
                                       int32_t    textoffset,
1579
                                       UErrorCode    *status)
1580
{
1581
    const UChar              *text           = strsrch->search->text;
1582
    const UCollator          *collator       = strsrch->collator;
1583
          int32_t             safelength     = 0;
1584
          UChar              *safetext;
1585
          int32_t             safetextlength;
1586
          UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1587
          UCollationElements *coleiter       = strsrch->utilIter;
1588
          int32_t         safeoffset     = textoffset;
1589
1590
    if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1591
                                         collator)) {
1592
        safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1593
        safelength     = textoffset - safeoffset;
1594
        safetextlength = INITIAL_ARRAY_SIZE_;
1595
        safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1596
                                         text + safeoffset, safelength,
1597
                                         strsrch->canonicalSuffixAccents,
1598
                                         status);
1599
    }
1600
    else {
1601
        safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1602
        safetext       = strsrch->canonicalSuffixAccents;
1603
    }
1604
1605
    // if status is a failure, ucol_setText does nothing
1606
    ucol_setText(coleiter, safetext, safetextlength, status);
1607
    // status checked in loop below
1608
1609
    int32_t  *ce        = strsrch->pattern.ces;
1610
    int32_t   celength  = strsrch->pattern.cesLength;
1611
    int       ceindex   = celength - 1;
1612
    UBool     isSafe    = TRUE; // indication flag for position in safe zone
1613
1614
    while (ceindex >= 0) {
1615
        int32_t textce = ucol_previous(coleiter, status);
1616
        if (U_FAILURE(*status)) {
1617
            if (isSafe) {
1618
                cleanUpSafeText(strsrch, safetext, safebuffer);
1619
            }
1620
            return USEARCH_DONE;
1621
        }
1622
        if (textce == UCOL_NULLORDER) {
1623
            // check if we have passed the safe buffer
1624
            if (coleiter == strsrch->textIter) {
1625
                cleanUpSafeText(strsrch, safetext, safebuffer);
1626
                return USEARCH_DONE;
1627
            }
1628
            cleanUpSafeText(strsrch, safetext, safebuffer);
1629
            safetext = safebuffer;
1630
            coleiter = strsrch->textIter;
1631
            setColEIterOffset(coleiter, safeoffset);
1632
            // status checked at the start of the loop
1633
            isSafe = FALSE;
1634
            continue;
1635
        }
1636
        textce = getCE(strsrch, textce);
1637
        if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1638
            // do the beginning stuff
1639
            int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1640
            if (isSafe && failedoffset >= safelength) {
1641
                // alas... no hope. failed at rearranged accent set
1642
                cleanUpSafeText(strsrch, safetext, safebuffer);
1643
                return USEARCH_DONE;
1644
            }
1645
            else {
1646
                if (isSafe) {
1647
                    failedoffset += safeoffset;
1648
                    cleanUpSafeText(strsrch, safetext, safebuffer);
1649
                }
1650
1651
                // try rearranging the front accents
1652
                int32_t result = doNextCanonicalPrefixMatch(strsrch,
1653
                                        failedoffset, textoffset, status);
1654
                if (result != USEARCH_DONE) {
1655
                    // if status is a failure, ucol_setOffset does nothing
1656
                    setColEIterOffset(strsrch->textIter, result);
1657
                }
1658
                if (U_FAILURE(*status)) {
1659
                    return USEARCH_DONE;
1660
                }
1661
                return result;
1662
            }
1663
        }
1664
        if (textce == ce[ceindex]) {
1665
            ceindex --;
1666
        }
1667
    }
1668
    // set offset here
1669
    if (isSafe) {
1670
        int32_t result     = getColElemIterOffset(coleiter, FALSE);
1671
        // sets the text iterator here with the correct expansion and offset
1672
        int32_t    leftoverces = getExpansionPrefix(coleiter);
1673
        cleanUpSafeText(strsrch, safetext, safebuffer);
1674
        if (result >= safelength) {
1675
            result = textoffset;
1676
        }
1677
        else {
1678
            result += safeoffset;
1679
        }
1680
        setColEIterOffset(strsrch->textIter, result);
1681
        strsrch->textIter->iteratordata_.toReturn =
1682
                       setExpansionPrefix(strsrch->textIter, leftoverces);
1683
        return result;
1684
    }
1685
1686
    return ucol_getOffset(coleiter);
1687
}
1688
1689
/**
1690
* Trying out the substring and sees if it can be a canonical match.
1691
* This will try normalizing the end accents and arranging them into canonical
1692
* equivalents and check their corresponding ces with the pattern ce.
1693
* Suffix accents in the text will be grouped according to their combining
1694
* class and the groups will be mixed and matched to try find the perfect
1695
* match with the pattern.
1696
* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1697
* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1698
*         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1699
*         "\u0301\u0325".
1700
* step 2: check if any of the generated substrings matches the pattern.
1701
* Internal method, status assumed to be success, caller has to check status
1702
* before calling this method.
1703
* @param strsrch string search data
1704
* @param textoffset end offset in the collation element text that ends with
1705
*                   the accents to be rearranged
1706
* @param status error status if any
1707
* @return TRUE if the match is valid, FALSE otherwise
1708
*/
1709
static
1710
UBool doNextCanonicalMatch(UStringSearch *strsrch,
1711
                           int32_t    textoffset,
1712
                           UErrorCode    *status)
1713
{
1714
    const UChar       *text = strsrch->search->text;
1715
          int32_t  temp = textoffset;
1716
    U16_BACK_1(text, 0, temp);
1717
    if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1718
        UCollationElements *coleiter = strsrch->textIter;
1719
        int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1720
        if (strsrch->pattern.hasPrefixAccents) {
1721
            offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1722
                                                status);
1723
            if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1724
                setColEIterOffset(coleiter, offset);
1725
                return TRUE;
1726
            }
1727
        }
1728
        return FALSE;
1729
    }
1730
1731
    if (!strsrch->pattern.hasSuffixAccents) {
1732
        return FALSE;
1733
    }
1734
1735
    UChar       accents[INITIAL_ARRAY_SIZE_];
1736
    // offset to the last base character in substring to search
1737
    int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1738
    // normalizing the offensive string
1739
    unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1740
                               0, accents, INITIAL_ARRAY_SIZE_, status);
1741
    // status checked in loop below
1742
1743
    int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1744
    int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1745
1746
    // 2 power n - 1 plus the full set of accents
1747
    int32_t  count = (2 << (size - 1)) - 1;
1748
    while (U_SUCCESS(*status) && count > 0) {
1749
        UChar *rearrange = strsrch->canonicalSuffixAccents;
1750
        // copy the base characters
1751
        for (int k = 0; k < accentsindex[0]; k ++) {
1752
            *rearrange ++ = accents[k];
1753
        }
1754
        // forming all possible canonical rearrangement by dropping
1755
        // sets of accents
1756
        for (int i = 0; i <= size - 1; i ++) {
1757
            int32_t mask = 1 << (size - i - 1);
1758
            if (count & mask) {
1759
                for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1760
                    *rearrange ++ = accents[j];
1761
                }
1762
            }
1763
        }
1764
        *rearrange = 0;
1765
        int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1766
                                                        status);
1767
        if (offset != USEARCH_DONE) {
1768
            return TRUE; // match found
1769
        }
1770
        count --;
1771
    }
1772
    return FALSE;
1773
}
1774
1775
/**
1776
* Gets the previous base character offset depending on the string search
1777
* pattern data
1778
* @param strsrch string search data
1779
* @param textoffset current offset, current character
1780
* @return the offset of the next character after this base character or itself
1781
*         if it is a composed character with accents
1782
*/
1783
static
1784
inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1785
                                                      int32_t textoffset)
1786
{
1787
    if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1788
        const UChar       *text = strsrch->search->text;
1789
              int32_t  offset = textoffset;
1790
        if (getFCD(text, &offset, strsrch->search->textLength) >>
1791
                                                   SECOND_LAST_BYTE_SHIFT_) {
1792
            return getPreviousBaseOffset(text, textoffset);
1793
        }
1794
    }
1795
    return textoffset;
1796
}
1797
1798
/**
1799
* Checks match for contraction.
1800
* If the match ends with a partial contraction we fail.
1801
* If the match starts too far off (because of backwards iteration) we try to
1802
* chip off the extra characters
1803
* Internal method, status assumed to be success, caller has to check status
1804
* before calling this method.
1805
* @param strsrch string search data
1806
* @param start offset of potential match, to be modified if necessary
1807
* @param end offset of potential match, to be modified if necessary
1808
* @param status output error status if any
1809
* @return TRUE if match passes the contraction test, FALSE otherwise
1810
*/
1811
static
1812
UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1813
                                         int32_t   *start,
1814
                                         int32_t   *end,
1815
                                         UErrorCode    *status)
1816
{
1817
          UCollationElements *coleiter   = strsrch->textIter;
1818
          int32_t             textlength = strsrch->search->textLength;
1819
          int32_t         temp       = *start;
1820
    const UCollator          *collator   = strsrch->collator;
1821
    const UChar              *text       = strsrch->search->text;
1822
    // This part checks if either ends of the match contains potential
1823
    // contraction. If so we'll have to iterate through them
1824
    if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1825
        (*start + 1 < textlength
1826
         && ucol_unsafeCP(text[*start + 1], collator))) {
1827
        int32_t expansion  = getExpansionPrefix(coleiter);
1828
        UBool   expandflag = expansion > 0;
1829
        setColEIterOffset(coleiter, *start);
1830
        while (expansion > 0) {
1831
            // getting rid of the redundant ce, caused by setOffset.
1832
            // since backward contraction/expansion may have extra ces if we
1833
            // are in the normalization buffer, hasAccentsBeforeMatch would
1834
            // have taken care of it.
1835
            // E.g. the character \u01FA will have an expansion of 3, but if
1836
            // we are only looking for acute and ring \u030A and \u0301, we'll
1837
            // have to skip the first ce in the expansion buffer.
1838
            ucol_next(coleiter, status);
1839
            if (U_FAILURE(*status)) {
1840
                return FALSE;
1841
            }
1842
            if (ucol_getOffset(coleiter) != temp) {
1843
                *start = temp;
1844
                temp  = ucol_getOffset(coleiter);
1845
            }
1846
            expansion --;
1847
        }
1848
1849
        int32_t  *patternce       = strsrch->pattern.ces;
1850
        int32_t   patterncelength = strsrch->pattern.cesLength;
1851
        int32_t   count           = 0;
1852
        int32_t   textlength      = strsrch->search->textLength;
1853
        while (count < patterncelength) {
1854
            int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1855
            // status checked below, note that if status is a failure
1856
            // ucol_next returns UCOL_NULLORDER
1857
            if (ce == UCOL_IGNORABLE) {
1858
                continue;
1859
            }
1860
            if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1861
                *start = temp;
1862
                temp   = ucol_getOffset(coleiter);
1863
            }
1864
1865
            if (count == 0 && ce != patternce[0]) {
1866
                // accents may have extra starting ces, this occurs when a
1867
                // pure accent pattern is matched without rearrangement
1868
                // text \u0325\u0300 and looking for \u0300
1869
                int32_t expected = patternce[0];
1870
                if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1871
                    ce = getCE(strsrch, ucol_next(coleiter, status));
1872
                    while (U_SUCCESS(*status) && ce != expected &&
1873
                           ce != UCOL_NULLORDER &&
1874
                           ucol_getOffset(coleiter) <= *end) {
1875
                        ce = getCE(strsrch, ucol_next(coleiter, status));
1876
                    }
1877
                }
1878
            }
1879
            if (U_FAILURE(*status) || ce != patternce[count]) {
1880
                (*end) ++;
1881
                *end = getNextUStringSearchBaseOffset(strsrch, *end);
1882
                return FALSE;
1883
            }
1884
            count ++;
1885
        }
1886
    }
1887
    return TRUE;
1888
}
1889
1890
/**
1891
* Checks and sets the match information if found.
1892
* Checks
1893
* <ul>
1894
* <li> the potential match does not repeat the previous match
1895
* <li> boundaries are correct
1896
* <li> potential match does not end in the middle of a contraction
1897
* <li> identical matches
1898
* <\ul>
1899
* Otherwise the offset will be shifted to the next character.
1900
* Internal method, status assumed to be success, caller has to check the
1901
* status before calling this method.
1902
* @param strsrch string search data
1903
* @param textoffset offset in the collation element text. the returned value
1904
*        will be the truncated end offset of the match or the new start
1905
*        search offset.
1906
* @param status output error status if any
1907
* @return TRUE if the match is valid, FALSE otherwise
1908
*/
1909
static
1910
inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1911
                                     int32_t   *textoffset,
1912
                                     UErrorCode    *status)
1913
{
1914
    // to ensure that the start and ends are not composite characters
1915
    UCollationElements *coleiter = strsrch->textIter;
1916
    // if we have a canonical accent match
1917
    if ((strsrch->pattern.hasSuffixAccents &&
1918
        strsrch->canonicalSuffixAccents[0]) ||
1919
        (strsrch->pattern.hasPrefixAccents &&
1920
        strsrch->canonicalPrefixAccents[0])) {
1921
        strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1922
                                                    strsrch,
1923
                                                    ucol_getOffset(coleiter));
1924
        strsrch->search->matchedLength = *textoffset -
1925
                                                strsrch->search->matchedIndex;
1926
        return TRUE;
1927
    }
1928
1929
    int32_t start = getColElemIterOffset(coleiter, FALSE);
1930
    if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1931
                                            status) || U_FAILURE(*status)) {
1932
        return FALSE;
1933
    }
1934
1935
    start = getPreviousUStringSearchBaseOffset(strsrch, start);
1936
    // this totally matches, however we need to check if it is repeating
1937
    if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1938
        !isBreakUnit(strsrch, start, *textoffset) ||
1939
        !checkIdentical(strsrch, start, *textoffset)) {
1940
        (*textoffset) ++;
1941
        *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1942
                                        strsrch->search->textLength);
1943
        return FALSE;
1944
    }
1945
1946
    strsrch->search->matchedIndex  = start;
1947
    strsrch->search->matchedLength = *textoffset - start;
1948
    return TRUE;
1949
}
1950
1951
/**
1952
* Shifting the collation element iterator position forward to prepare for
1953
* a preceding match. If the first character is a unsafe character, we'll only
1954
* shift by 1 to capture contractions, normalization etc.
1955
* Internal method, status assumed to be success, caller has to check status
1956
* before calling this method.
1957
* @param text strsrch string search data
1958
* @param textoffset start text position to do search
1959
* @param ce the text ce which failed the match.
1960
* @param patternceindex index of the ce within the pattern ce buffer which
1961
*        failed the match
1962
* @return final offset
1963
*/
1964
static
1965
inline int32_t reverseShift(UStringSearch *strsrch,
1966
                                int32_t    textoffset,
1967
                                int32_t       ce,
1968
                                int32_t        patternceindex)
1969
{
1970
    if (strsrch->search->isOverlap) {
1971
        if (textoffset != strsrch->search->textLength) {
1972
            textoffset --;
1973
        }
1974
        else {
1975
            textoffset -= strsrch->pattern.defaultShiftSize;
1976
        }
1977
    }
1978
    else {
1979
        if (ce != UCOL_NULLORDER) {
1980
            int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1981
1982
            // this is to adjust for characters in the middle of the substring
1983
            // for matching that failed.
1984
            int32_t adjust = patternceindex;
1985
            if (adjust > 1 && shift > adjust) {
1986
                shift -= adjust - 1;
1987
            }
1988
            textoffset -= shift;
1989
        }
1990
        else {
1991
            textoffset -= strsrch->pattern.defaultShiftSize;
1992
        }
1993
    }
1994
    textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1995
    return textoffset;
1996
}
1997
1998
/**
1999
* Checks match for contraction.
2000
* If the match starts with a partial contraction we fail.
2001
* Internal method, status assumed to be success, caller has to check status
2002
* before calling this method.
2003
* @param strsrch string search data
2004
* @param start offset of potential match, to be modified if necessary
2005
* @param end offset of potential match, to be modified if necessary
2006
* @param status output error status if any
2007
* @return TRUE if match passes the contraction test, FALSE otherwise
2008
*/
2009
static
2010
UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2011
                                     int32_t   *start,
2012
                                     int32_t   *end, UErrorCode  *status)
2013
{
2014
          UCollationElements *coleiter   = strsrch->textIter;
2015
          int32_t             textlength = strsrch->search->textLength;
2016
          int32_t             temp       = *end;
2017
    const UCollator          *collator   = strsrch->collator;
2018
    const UChar              *text       = strsrch->search->text;
2019
    // This part checks if either if the start of the match contains potential
2020
    // contraction. If so we'll have to iterate through them
2021
    // Since we used ucol_next while previously looking for the potential
2022
    // match, this guarantees that our end will not be a partial contraction,
2023
    // or a partial supplementary character.
2024
    if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2025
        int32_t expansion  = getExpansionSuffix(coleiter);
2026
        UBool   expandflag = expansion > 0;
2027
        setColEIterOffset(coleiter, *end);
2028
        while (U_SUCCESS(*status) && expansion > 0) {
2029
            // getting rid of the redundant ce
2030
            // since forward contraction/expansion may have extra ces
2031
            // if we are in the normalization buffer, hasAccentsBeforeMatch
2032
            // would have taken care of it.
2033
            // E.g. the character \u01FA will have an expansion of 3, but if
2034
            // we are only looking for A ring A\u030A, we'll have to skip the
2035
            // last ce in the expansion buffer
2036
            ucol_previous(coleiter, status);
2037
            if (U_FAILURE(*status)) {
2038
                return FALSE;
2039
            }
2040
            if (ucol_getOffset(coleiter) != temp) {
2041
                *end = temp;
2042
                temp  = ucol_getOffset(coleiter);
2043
            }
2044
            expansion --;
2045
        }
2046
2047
        int32_t  *patternce       = strsrch->pattern.ces;
2048
        int32_t   patterncelength = strsrch->pattern.cesLength;
2049
        int32_t   count           = patterncelength;
2050
        while (count > 0) {
2051
            int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2052
            // status checked below, note that if status is a failure
2053
            // ucol_previous returns UCOL_NULLORDER
2054
            if (ce == UCOL_IGNORABLE) {
2055
                continue;
2056
            }
2057
            if (expandflag && count == 0 &&
2058
                getColElemIterOffset(coleiter, FALSE) != temp) {
2059
                *end = temp;
2060
                temp  = ucol_getOffset(coleiter);
2061
            }
2062
            if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2063
                (*start) --;
2064
                *start = getPreviousBaseOffset(text, *start);
2065
                return FALSE;
2066
            }
2067
            count --;
2068
        }
2069
    }
2070
    return TRUE;
2071
}
2072
2073
/**
2074
* Checks and sets the match information if found.
2075
* Checks
2076
* <ul>
2077
* <li> the current match does not repeat the last match
2078
* <li> boundaries are correct
2079
* <li> exact matches has no extra accents
2080
* <li> identical matches
2081
* <\ul>
2082
* Otherwise the offset will be shifted to the preceding character.
2083
* Internal method, status assumed to be success, caller has to check status
2084
* before calling this method.
2085
* @param strsrch string search data
2086
* @param collator
2087
* @param coleiter collation element iterator
2088
* @param text string
2089
* @param textoffset offset in the collation element text. the returned value
2090
*        will be the truncated start offset of the match or the new start
2091
*        search offset.
2092
* @param status output error status if any
2093
* @return TRUE if the match is valid, FALSE otherwise
2094
*/
2095
static
2096
inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2097
                                     int32_t   *textoffset,
2098
                                     UErrorCode    *status)
2099
{
2100
    // to ensure that the start and ends are not composite characters
2101
    int32_t end = ucol_getOffset(strsrch->textIter);
2102
    if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2103
        || U_FAILURE(*status)) {
2104
            return FALSE;
2105
    }
2106
2107
    // this totally matches, however we need to check if it is repeating
2108
    // the old match
2109
    if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2110
        !isBreakUnit(strsrch, *textoffset, end) ||
2111
        hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2112
        !checkIdentical(strsrch, *textoffset, end) ||
2113
        hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2114
        (*textoffset) --;
2115
        *textoffset = getPreviousBaseOffset(strsrch->search->text,
2116
                                            *textoffset);
2117
        return FALSE;
2118
    }
2119
2120
    //Add breakiterator boundary check for primary strength search.
2121
    if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2122
        checkBreakBoundary(strsrch, textoffset, &end);
2123
    }
2124
2125
    strsrch->search->matchedIndex = *textoffset;
2126
    strsrch->search->matchedLength = end - *textoffset;
2127
    return TRUE;
2128
}
2129
2130
/**
2131
* Rearranges the end accents to try matching.
2132
* Suffix accents in the text will be grouped according to their combining
2133
* class and the groups will be mixed and matched to try find the perfect
2134
* match with the pattern.
2135
* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2136
* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2137
*         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2138
*         "\u0301\u0325".
2139
* step 2: check if any of the generated substrings matches the pattern.
2140
* Internal method, status assumed to be success, user has to check status
2141
* before calling this method.
2142
* @param strsrch string search match
2143
* @param start offset of the first base character
2144
* @param end start of the last accent set
2145
* @param status only error status if any
2146
* @return USEARCH_DONE if a match is not found, otherwise return the ending
2147
*         offset of the match. Note this start includes all following accents.
2148
*/
2149
static
2150
int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2151
                                           int32_t    start,
2152
                                           int32_t    end,
2153
                                           UErrorCode    *status)
2154
{
2155
    const UChar       *text       = strsrch->search->text;
2156
          int32_t  tempend    = end;
2157
2158
    U16_BACK_1(text, 0, tempend);
2159
    if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2160
                                                           LAST_BYTE_MASK_)) {
2161
        // die... failed at a base character
2162
        return USEARCH_DONE;
2163
    }
2164
    end = getNextBaseOffset(text, end, strsrch->search->textLength);
2165
2166
    if (U_SUCCESS(*status)) {
2167
        UChar       accents[INITIAL_ARRAY_SIZE_];
2168
        int32_t offset = getPreviousBaseOffset(text, end);
2169
        // normalizing the offensive string
2170
        unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2171
                        INITIAL_ARRAY_SIZE_, status);
2172
2173
        int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
2174
        int32_t         accentsize = getUnblockedAccentIndex(accents,
2175
                                                         accentsindex);
2176
        int32_t         count      = (2 << (accentsize - 1)) - 1;
2177
        UChar               buffer[INITIAL_ARRAY_SIZE_];
2178
        UCollationElements *coleiter = strsrch->utilIter;
2179
        while (U_SUCCESS(*status) && count > 0) {
2180
            UChar *rearrange = strsrch->canonicalSuffixAccents;
2181
            // copy the base characters
2182
            for (int k = 0; k < accentsindex[0]; k ++) {
2183
                *rearrange ++ = accents[k];
2184
            }
2185
            // forming all possible canonical rearrangement by dropping
2186
            // sets of accents
2187
            for (int i = 0; i <= accentsize - 1; i ++) {
2188
                int32_t mask = 1 << (accentsize - i - 1);
2189
                if (count & mask) {
2190
                    for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2191
                        *rearrange ++ = accents[j];
2192
                    }
2193
                }
2194
            }
2195
            *rearrange = 0;
2196
            int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2197
            UChar   *match     = addToUCharArray(buffer, &matchsize,
2198
                                           strsrch->canonicalPrefixAccents,
2199
                                           strsrch->search->text + start,
2200
                                           offset - start,
2201
                                           strsrch->canonicalSuffixAccents,
2202
                                           status);
2203
2204
            // run the collator iterator through this match
2205
            // if status is a failure ucol_setText does nothing
2206
            ucol_setText(coleiter, match, matchsize, status);
2207
            if (U_SUCCESS(*status)) {
2208
                if (checkCollationMatch(strsrch, coleiter)) {
2209
                    if (match != buffer) {
2210
                        uprv_free(match);
2211
                    }
2212
                    return end;
2213
                }
2214
            }
2215
            count --;
2216
        }
2217
    }
2218
    return USEARCH_DONE;
2219
}
2220
2221
/**
2222
* Take the rearranged start accents and tries matching. If match failed at
2223
* a seperate following set of accents (seperated from the rearranged on by
2224
* at least a base character) then we rearrange the preceding accents and
2225
* tries matching again.
2226
* We allow skipping of the ends of the accent set if the ces do not match.
2227
* However if the failure is found before the accent set, it fails.
2228
* Internal method, status assumed to be success, caller has to check status
2229
* before calling this method.
2230
* @param strsrch string search data
2231
* @param textoffset of the ends of the rearranged accent
2232
* @param status output error status if any
2233
* @return USEARCH_DONE if a match is not found, otherwise return the ending
2234
*         offset of the match. Note this start includes all following accents.
2235
*/
2236
static
2237
int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2238
                                           int32_t    textoffset,
2239
                                           UErrorCode    *status)
2240
{
2241
    const UChar       *text       = strsrch->search->text;
2242
    const UCollator   *collator   = strsrch->collator;
2243
          int32_t      safelength = 0;
2244
          UChar       *safetext;
2245
          int32_t      safetextlength;
2246
          UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2247
          int32_t  safeoffset = textoffset;
2248
2249
    if (textoffset &&
2250
        ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2251
                                 u_strlen(strsrch->canonicalPrefixAccents) - 1
2252
                                         ], collator)) {
2253
        safeoffset     = getNextSafeOffset(collator, text, textoffset,
2254
                                           strsrch->search->textLength);
2255
        safelength     = safeoffset - textoffset;
2256
        safetextlength = INITIAL_ARRAY_SIZE_;
2257
        safetext       = addToUCharArray(safebuffer, &safetextlength,
2258
                                         strsrch->canonicalPrefixAccents,
2259
                                         text + textoffset, safelength,
2260
                                         NULL, status);
2261
    }
2262
    else {
2263
        safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2264
        safetext       = strsrch->canonicalPrefixAccents;
2265
    }
2266
2267
    UCollationElements *coleiter = strsrch->utilIter;
2268
     // if status is a failure, ucol_setText does nothing
2269
    ucol_setText(coleiter, safetext, safetextlength, status);
2270
    // status checked in loop below
2271
2272
    int32_t  *ce           = strsrch->pattern.ces;
2273
    int32_t   celength     = strsrch->pattern.cesLength;
2274
    int       ceindex      = 0;
2275
    UBool     isSafe       = TRUE; // safe zone indication flag for position
2276
    int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2277
2278
    while (ceindex < celength) {
2279
        int32_t textce = ucol_next(coleiter, status);
2280
        if (U_FAILURE(*status)) {
2281
            if (isSafe) {
2282
                cleanUpSafeText(strsrch, safetext, safebuffer);
2283
            }
2284
            return USEARCH_DONE;
2285
        }
2286
        if (textce == UCOL_NULLORDER) {
2287
            // check if we have passed the safe buffer
2288
            if (coleiter == strsrch->textIter) {
2289
                cleanUpSafeText(strsrch, safetext, safebuffer);
2290
                return USEARCH_DONE;
2291
            }
2292
            cleanUpSafeText(strsrch, safetext, safebuffer);
2293
            safetext = safebuffer;
2294
            coleiter = strsrch->textIter;
2295
            setColEIterOffset(coleiter, safeoffset);
2296
            // status checked at the start of the loop
2297
            isSafe = FALSE;
2298
            continue;
2299
        }
2300
        textce = getCE(strsrch, textce);
2301
        if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2302
            // do the beginning stuff
2303
            int32_t failedoffset = ucol_getOffset(coleiter);
2304
            if (isSafe && failedoffset <= prefixlength) {
2305
                // alas... no hope. failed at rearranged accent set
2306
                cleanUpSafeText(strsrch, safetext, safebuffer);
2307
                return USEARCH_DONE;
2308
            }
2309
            else {
2310
                if (isSafe) {
2311
                    failedoffset = safeoffset - failedoffset;
2312
                    cleanUpSafeText(strsrch, safetext, safebuffer);
2313
                }
2314
2315
                // try rearranging the end accents
2316
                int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2317
                                        textoffset, failedoffset, status);
2318
                if (result != USEARCH_DONE) {
2319
                    // if status is a failure, ucol_setOffset does nothing
2320
                    setColEIterOffset(strsrch->textIter, result);
2321
                }
2322
                if (U_FAILURE(*status)) {
2323
                    return USEARCH_DONE;
2324
                }
2325
                return result;
2326
            }
2327
        }
2328
        if (textce == ce[ceindex]) {
2329
            ceindex ++;
2330
        }
2331
    }
2332
    // set offset here
2333
    if (isSafe) {
2334
        int32_t result      = ucol_getOffset(coleiter);
2335
        // sets the text iterator here with the correct expansion and offset
2336
        int32_t     leftoverces = getExpansionSuffix(coleiter);
2337
        cleanUpSafeText(strsrch, safetext, safebuffer);
2338
        if (result <= prefixlength) {
2339
            result = textoffset;
2340
        }
2341
        else {
2342
            result = textoffset + (safeoffset - result);
2343
        }
2344
        setColEIterOffset(strsrch->textIter, result);
2345
        setExpansionSuffix(strsrch->textIter, leftoverces);
2346
        return result;
2347
    }
2348
2349
    return ucol_getOffset(coleiter);
2350
}
2351
2352
/**
2353
* Trying out the substring and sees if it can be a canonical match.
2354
* This will try normalizing the starting accents and arranging them into
2355
* canonical equivalents and check their corresponding ces with the pattern ce.
2356
* Prefix accents in the text will be grouped according to their combining
2357
* class and the groups will be mixed and matched to try find the perfect
2358
* match with the pattern.
2359
* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2360
* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2361
*         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2362
*         "\u0301\u0325".
2363
* step 2: check if any of the generated substrings matches the pattern.
2364
* Internal method, status assumed to be success, caller has to check status
2365
* before calling this method.
2366
* @param strsrch string search data
2367
* @param textoffset start offset in the collation element text that starts
2368
*                   with the accents to be rearranged
2369
* @param status output error status if any
2370
* @return TRUE if the match is valid, FALSE otherwise
2371
*/
2372
static
2373
UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2374
                               int32_t    textoffset,
2375
                               UErrorCode    *status)
2376
{
2377
    const UChar       *text       = strsrch->search->text;
2378
          int32_t  temp       = textoffset;
2379
          int32_t      textlength = strsrch->search->textLength;
2380
    if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2381
        UCollationElements *coleiter = strsrch->textIter;
2382
        int32_t         offset   = ucol_getOffset(coleiter);
2383
        if (strsrch->pattern.hasSuffixAccents) {
2384
            offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2385
                                                    offset, status);
2386
            if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2387
                setColEIterOffset(coleiter, offset);
2388
                return TRUE;
2389
            }
2390
        }
2391
        return FALSE;
2392
    }
2393
2394
    if (!strsrch->pattern.hasPrefixAccents) {
2395
        return FALSE;
2396
    }
2397
2398
    UChar       accents[INITIAL_ARRAY_SIZE_];
2399
    // offset to the last base character in substring to search
2400
    int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2401
    // normalizing the offensive string
2402
    unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2403
                               0, accents, INITIAL_ARRAY_SIZE_, status);
2404
    // status checked in loop
2405
2406
    int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2407
    int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2408
2409
    // 2 power n - 1 plus the full set of accents
2410
    int32_t  count = (2 << (size - 1)) - 1;
2411
    while (U_SUCCESS(*status) && count > 0) {
2412
        UChar *rearrange = strsrch->canonicalPrefixAccents;
2413
        // copy the base characters
2414
        for (int k = 0; k < accentsindex[0]; k ++) {
2415
            *rearrange ++ = accents[k];
2416
        }
2417
        // forming all possible canonical rearrangement by dropping
2418
        // sets of accents
2419
        for (int i = 0; i <= size - 1; i ++) {
2420
            int32_t mask = 1 << (size - i - 1);
2421
            if (count & mask) {
2422
                for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2423
                    *rearrange ++ = accents[j];
2424
                }
2425
            }
2426
        }
2427
        *rearrange = 0;
2428
        int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2429
                                                          baseoffset, status);
2430
        if (offset != USEARCH_DONE) {
2431
            return TRUE; // match found
2432
        }
2433
        count --;
2434
    }
2435
    return FALSE;
2436
}
2437
2438
/**
2439
* Checks match for contraction.
2440
* If the match starts with a partial contraction we fail.
2441
* Internal method, status assumed to be success, caller has to check status
2442
* before calling this method.
2443
* @param strsrch string search data
2444
* @param start offset of potential match, to be modified if necessary
2445
* @param end offset of potential match, to be modified if necessary
2446
* @param status only error status if any
2447
* @return TRUE if match passes the contraction test, FALSE otherwise
2448
*/
2449
static
2450
UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2451
                                     int32_t   *start,
2452
                                     int32_t   *end, UErrorCode  *status)
2453
{
2454
          UCollationElements *coleiter   = strsrch->textIter;
2455
          int32_t             textlength = strsrch->search->textLength;
2456
          int32_t         temp       = *end;
2457
    const UCollator          *collator   = strsrch->collator;
2458
    const UChar              *text       = strsrch->search->text;
2459
    // This part checks if either if the start of the match contains potential
2460
    // contraction. If so we'll have to iterate through them
2461
    // Since we used ucol_next while previously looking for the potential
2462
    // match, this guarantees that our end will not be a partial contraction,
2463
    // or a partial supplementary character.
2464
    if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2465
        int32_t expansion  = getExpansionSuffix(coleiter);
2466
        UBool   expandflag = expansion > 0;
2467
        setColEIterOffset(coleiter, *end);
2468
        while (expansion > 0) {
2469
            // getting rid of the redundant ce
2470
            // since forward contraction/expansion may have extra ces
2471
            // if we are in the normalization buffer, hasAccentsBeforeMatch
2472
            // would have taken care of it.
2473
            // E.g. the character \u01FA will have an expansion of 3, but if
2474
            // we are only looking for A ring A\u030A, we'll have to skip the
2475
            // last ce in the expansion buffer
2476
            ucol_previous(coleiter, status);
2477
            if (U_FAILURE(*status)) {
2478
                return FALSE;
2479
            }
2480
            if (ucol_getOffset(coleiter) != temp) {
2481
                *end = temp;
2482
                temp  = ucol_getOffset(coleiter);
2483
            }
2484
            expansion --;
2485
        }
2486
2487
        int32_t  *patternce       = strsrch->pattern.ces;
2488
        int32_t   patterncelength = strsrch->pattern.cesLength;
2489
        int32_t   count           = patterncelength;
2490
        while (count > 0) {
2491
            int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2492
            // status checked below, note that if status is a failure
2493
            // ucol_previous returns UCOL_NULLORDER
2494
            if (ce == UCOL_IGNORABLE) {
2495
                continue;
2496
            }
2497
            if (expandflag && count == 0 &&
2498
                getColElemIterOffset(coleiter, FALSE) != temp) {
2499
                *end = temp;
2500
                temp  = ucol_getOffset(coleiter);
2501
            }
2502
            if (count == patterncelength &&
2503
                ce != patternce[patterncelength - 1]) {
2504
                // accents may have extra starting ces, this occurs when a
2505
                // pure accent pattern is matched without rearrangement
2506
                int32_t    expected = patternce[patterncelength - 1];
2507
                U16_BACK_1(text, 0, *end);
2508
                if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2509
                    ce = getCE(strsrch, ucol_previous(coleiter, status));
2510
                    while (U_SUCCESS(*status) && ce != expected &&
2511
                           ce != UCOL_NULLORDER &&
2512
                           ucol_getOffset(coleiter) <= *start) {
2513
                        ce = getCE(strsrch, ucol_previous(coleiter, status));
2514
                    }
2515
                }
2516
            }
2517
            if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2518
                (*start) --;
2519
                *start = getPreviousBaseOffset(text, *start);
2520
                return FALSE;
2521
            }
2522
            count --;
2523
        }
2524
    }
2525
    return TRUE;
2526
}
2527
2528
/**
2529
* Checks and sets the match information if found.
2530
* Checks
2531
* <ul>
2532
* <li> the potential match does not repeat the previous match
2533
* <li> boundaries are correct
2534
* <li> potential match does not end in the middle of a contraction
2535
* <li> identical matches
2536
* <\ul>
2537
* Otherwise the offset will be shifted to the next character.
2538
* Internal method, status assumed to be success, caller has to check status
2539
* before calling this method.
2540
* @param strsrch string search data
2541
* @param textoffset offset in the collation element text. the returned value
2542
*        will be the truncated start offset of the match or the new start
2543
*        search offset.
2544
* @param status only error status if any
2545
* @return TRUE if the match is valid, FALSE otherwise
2546
*/
2547
static
2548
inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2549
                                         int32_t   *textoffset,
2550
                                         UErrorCode    *status)
2551
{
2552
    // to ensure that the start and ends are not composite characters
2553
    UCollationElements *coleiter = strsrch->textIter;
2554
    // if we have a canonical accent match
2555
    if ((strsrch->pattern.hasSuffixAccents &&
2556
        strsrch->canonicalSuffixAccents[0]) ||
2557
        (strsrch->pattern.hasPrefixAccents &&
2558
        strsrch->canonicalPrefixAccents[0])) {
2559
        strsrch->search->matchedIndex  = *textoffset;
2560
        strsrch->search->matchedLength =
2561
            getNextUStringSearchBaseOffset(strsrch,
2562
                                      getColElemIterOffset(coleiter, FALSE))
2563
            - *textoffset;
2564
        return TRUE;
2565
    }
2566
2567
    int32_t end = ucol_getOffset(coleiter);
2568
    if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2569
                                                status) ||
2570
         U_FAILURE(*status)) {
2571
        return FALSE;
2572
    }
2573
2574
    end = getNextUStringSearchBaseOffset(strsrch, end);
2575
    // this totally matches, however we need to check if it is repeating
2576
    if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2577
        !isBreakUnit(strsrch, *textoffset, end) ||
2578
        !checkIdentical(strsrch, *textoffset, end)) {
2579
        (*textoffset) --;
2580
        *textoffset = getPreviousBaseOffset(strsrch->search->text,
2581
                                            *textoffset);
2582
        return FALSE;
2583
    }
2584
2585
    strsrch->search->matchedIndex  = *textoffset;
2586
    strsrch->search->matchedLength = end - *textoffset;
2587
    return TRUE;
2588
}
2589
#endif // #if BOYER_MOORE
2590
2591
// constructors and destructor -------------------------------------------
2592
2593
U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2594
                                          int32_t         patternlength,
2595
                                    const UChar          *text,
2596
                                          int32_t         textlength,
2597
                                    const char           *locale,
2598
                                          UBreakIterator *breakiter,
2599
                                          UErrorCode     *status)
2600
0
{
2601
0
    if (U_FAILURE(*status)) {
2602
0
        return NULL;
2603
0
    }
2604
#if UCONFIG_NO_BREAK_ITERATION
2605
    if (breakiter != NULL) {
2606
        *status = U_UNSUPPORTED_ERROR;
2607
        return NULL;
2608
    }
2609
#endif
2610
0
    if (locale) {
2611
0
        // ucol_open internally checks for status
2612
0
        UCollator     *collator = ucol_open(locale, status);
2613
0
        // pattern, text checks are done in usearch_openFromCollator
2614
0
        UStringSearch *result   = usearch_openFromCollator(pattern,
2615
0
                                              patternlength, text, textlength,
2616
0
                                              collator, breakiter, status);
2617
0
2618
0
        if (result == NULL || U_FAILURE(*status)) {
2619
0
            if (collator) {
2620
0
                ucol_close(collator);
2621
0
            }
2622
0
            return NULL;
2623
0
        }
2624
0
        else {
2625
0
            result->ownCollator = TRUE;
2626
0
        }
2627
0
        return result;
2628
0
    }
2629
0
    *status = U_ILLEGAL_ARGUMENT_ERROR;
2630
0
    return NULL;
2631
0
}
2632
2633
U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2634
                                  const UChar          *pattern,
2635
                                        int32_t         patternlength,
2636
                                  const UChar          *text,
2637
                                        int32_t         textlength,
2638
                                  const UCollator      *collator,
2639
                                        UBreakIterator *breakiter,
2640
                                        UErrorCode     *status)
2641
0
{
2642
0
    if (U_FAILURE(*status)) {
2643
0
        return NULL;
2644
0
    }
2645
#if UCONFIG_NO_BREAK_ITERATION
2646
    if (breakiter != NULL) {
2647
        *status = U_UNSUPPORTED_ERROR;
2648
        return NULL;
2649
    }
2650
#endif
2651
0
    if (pattern == NULL || text == NULL || collator == NULL) {
2652
0
        *status = U_ILLEGAL_ARGUMENT_ERROR;
2653
0
        return NULL;
2654
0
    }
2655
0
2656
0
    // string search does not really work when numeric collation is turned on
2657
0
    if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2658
0
        *status = U_UNSUPPORTED_ERROR;
2659
0
        return NULL;
2660
0
    }
2661
0
2662
0
    if (U_SUCCESS(*status)) {
2663
0
        initializeFCD(status);
2664
0
        if (U_FAILURE(*status)) {
2665
0
            return NULL;
2666
0
        }
2667
0
2668
0
        UStringSearch *result;
2669
0
        if (textlength == -1) {
2670
0
            textlength = u_strlen(text);
2671
0
        }
2672
0
        if (patternlength == -1) {
2673
0
            patternlength = u_strlen(pattern);
2674
0
        }
2675
0
        if (textlength <= 0 || patternlength <= 0) {
2676
0
            *status = U_ILLEGAL_ARGUMENT_ERROR;
2677
0
            return NULL;
2678
0
        }
2679
0
2680
0
        result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2681
0
        if (result == NULL) {
2682
0
            *status = U_MEMORY_ALLOCATION_ERROR;
2683
0
            return NULL;
2684
0
        }
2685
0
2686
0
        result->collator    = collator;
2687
0
        result->strength    = ucol_getStrength(collator);
2688
0
        result->ceMask      = getMask(result->strength);
2689
0
        result->toShift     =
2690
0
             ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2691
0
                                                            UCOL_SHIFTED;
2692
0
        result->variableTop = ucol_getVariableTop(collator, status);
2693
0
2694
0
        result->nfd         = Normalizer2::getNFDInstance(*status);
2695
0
2696
0
        if (U_FAILURE(*status)) {
2697
0
            uprv_free(result);
2698
0
            return NULL;
2699
0
        }
2700
0
2701
0
        result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2702
0
        if (result->search == NULL) {
2703
0
            *status = U_MEMORY_ALLOCATION_ERROR;
2704
0
            uprv_free(result);
2705
0
            return NULL;
2706
0
        }
2707
0
2708
0
        result->search->text       = text;
2709
0
        result->search->textLength = textlength;
2710
0
2711
0
        result->pattern.text       = pattern;
2712
0
        result->pattern.textLength = patternlength;
2713
0
        result->pattern.ces         = NULL;
2714
0
        result->pattern.pces        = NULL;
2715
0
2716
0
        result->search->breakIter  = breakiter;
2717
0
#if !UCONFIG_NO_BREAK_ITERATION
2718
0
        result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2719
0
        if (breakiter) {
2720
0
            ubrk_setText(breakiter, text, textlength, status);
2721
0
        }
2722
0
#endif
2723
0
2724
0
        result->ownCollator           = FALSE;
2725
0
        result->search->matchedLength = 0;
2726
0
        result->search->matchedIndex  = USEARCH_DONE;
2727
0
        result->utilIter              = NULL;
2728
0
        result->textIter              = ucol_openElements(collator, text,
2729
0
                                                          textlength, status);
2730
0
        result->textProcessedIter     = NULL;
2731
0
        if (U_FAILURE(*status)) {
2732
0
            usearch_close(result);
2733
0
            return NULL;
2734
0
        }
2735
0
2736
0
        result->search->isOverlap          = FALSE;
2737
0
        result->search->isCanonicalMatch   = FALSE;
2738
0
        result->search->elementComparisonType = 0;
2739
0
        result->search->isForwardSearching = TRUE;
2740
0
        result->search->reset              = TRUE;
2741
0
2742
0
        initialize(result, status);
2743
0
2744
0
        if (U_FAILURE(*status)) {
2745
0
            usearch_close(result);
2746
0
            return NULL;
2747
0
        }
2748
0
2749
0
        return result;
2750
0
    }
2751
0
    return NULL;
2752
0
}
2753
2754
U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2755
0
{
2756
0
    if (strsrch) {
2757
0
        if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2758
0
            strsrch->pattern.ces) {
2759
0
            uprv_free(strsrch->pattern.ces);
2760
0
        }
2761
0
2762
0
        if (strsrch->pattern.pces != NULL &&
2763
0
            strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2764
0
            uprv_free(strsrch->pattern.pces);
2765
0
        }
2766
0
2767
0
        delete strsrch->textProcessedIter;
2768
0
        ucol_closeElements(strsrch->textIter);
2769
0
        ucol_closeElements(strsrch->utilIter);
2770
0
2771
0
        if (strsrch->ownCollator && strsrch->collator) {
2772
0
            ucol_close((UCollator *)strsrch->collator);
2773
0
        }
2774
0
2775
0
#if !UCONFIG_NO_BREAK_ITERATION
2776
0
        if (strsrch->search->internalBreakIter) {
2777
0
            ubrk_close(strsrch->search->internalBreakIter);
2778
0
        }
2779
0
#endif
2780
0
2781
0
        uprv_free(strsrch->search);
2782
0
        uprv_free(strsrch);
2783
0
    }
2784
0
}
2785
2786
namespace {
2787
2788
0
UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2789
0
    if (U_FAILURE(*status)) { return FALSE; }
2790
0
    if (strsrch->textProcessedIter == NULL) {
2791
0
        strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2792
0
        if (strsrch->textProcessedIter == NULL) {
2793
0
            *status = U_MEMORY_ALLOCATION_ERROR;
2794
0
            return FALSE;
2795
0
        }
2796
0
    } else {
2797
0
        strsrch->textProcessedIter->init(strsrch->textIter);
2798
0
    }
2799
0
    return TRUE;
2800
0
}
2801
2802
}
2803
2804
// set and get methods --------------------------------------------------
2805
2806
U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2807
                                        int32_t    position,
2808
                                        UErrorCode    *status)
2809
0
{
2810
0
    if (U_SUCCESS(*status) && strsrch) {
2811
0
        if (isOutOfBounds(strsrch->search->textLength, position)) {
2812
0
            *status = U_INDEX_OUTOFBOUNDS_ERROR;
2813
0
        }
2814
0
        else {
2815
0
            setColEIterOffset(strsrch->textIter, position);
2816
0
        }
2817
0
        strsrch->search->matchedIndex  = USEARCH_DONE;
2818
0
        strsrch->search->matchedLength = 0;
2819
0
        strsrch->search->reset         = FALSE;
2820
0
    }
2821
0
}
2822
2823
U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2824
0
{
2825
0
    if (strsrch) {
2826
0
        int32_t result = ucol_getOffset(strsrch->textIter);
2827
0
        if (isOutOfBounds(strsrch->search->textLength, result)) {
2828
0
            return USEARCH_DONE;
2829
0
        }
2830
0
        return result;
2831
0
    }
2832
0
    return USEARCH_DONE;
2833
0
}
2834
2835
U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2836
                                 USearchAttribute attribute,
2837
                                 USearchAttributeValue value,
2838
                                 UErrorCode *status)
2839
0
{
2840
0
    if (U_SUCCESS(*status) && strsrch) {
2841
0
        switch (attribute)
2842
0
        {
2843
0
        case USEARCH_OVERLAP :
2844
0
            strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2845
0
            break;
2846
0
        case USEARCH_CANONICAL_MATCH :
2847
0
            strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2848
0
                                                                      FALSE);
2849
0
            break;
2850
0
        case USEARCH_ELEMENT_COMPARISON :
2851
0
            if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2852
0
                strsrch->search->elementComparisonType = (int16_t)value;
2853
0
            } else {
2854
0
                strsrch->search->elementComparisonType = 0;
2855
0
            }
2856
0
            break;
2857
0
        case USEARCH_ATTRIBUTE_COUNT :
2858
0
        default:
2859
0
            *status = U_ILLEGAL_ARGUMENT_ERROR;
2860
0
        }
2861
0
    }
2862
0
    if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2863
0
        *status = U_ILLEGAL_ARGUMENT_ERROR;
2864
0
    }
2865
0
}
2866
2867
U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2868
                                                const UStringSearch *strsrch,
2869
                                                USearchAttribute attribute)
2870
0
{
2871
0
    if (strsrch) {
2872
0
        switch (attribute) {
2873
0
        case USEARCH_OVERLAP :
2874
0
            return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2875
0
                                                        USEARCH_OFF);
2876
0
        case USEARCH_CANONICAL_MATCH :
2877
0
            return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2878
0
                                                               USEARCH_OFF);
2879
0
        case USEARCH_ELEMENT_COMPARISON :
2880
0
            {
2881
0
                int16_t value = strsrch->search->elementComparisonType;
2882
0
                if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2883
0
                    return (USearchAttributeValue)value;
2884
0
                } else {
2885
0
                    return USEARCH_STANDARD_ELEMENT_COMPARISON;
2886
0
                }
2887
0
            }
2888
0
        case USEARCH_ATTRIBUTE_COUNT :
2889
0
            return USEARCH_DEFAULT;
2890
0
        }
2891
0
    }
2892
0
    return USEARCH_DEFAULT;
2893
0
}
2894
2895
U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2896
                                                const UStringSearch *strsrch)
2897
0
{
2898
0
    if (strsrch == NULL) {
2899
0
        return USEARCH_DONE;
2900
0
    }
2901
0
    return strsrch->search->matchedIndex;
2902
0
}
2903
2904
2905
U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2906
                                            UChar         *result,
2907
                                            int32_t        resultCapacity,
2908
                                            UErrorCode    *status)
2909
0
{
2910
0
    if (U_FAILURE(*status)) {
2911
0
        return USEARCH_DONE;
2912
0
    }
2913
0
    if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2914
0
        result == NULL)) {
2915
0
        *status = U_ILLEGAL_ARGUMENT_ERROR;
2916
0
        return USEARCH_DONE;
2917
0
    }
2918
0
2919
0
    int32_t     copylength = strsrch->search->matchedLength;
2920
0
    int32_t copyindex  = strsrch->search->matchedIndex;
2921
0
    if (copyindex == USEARCH_DONE) {
2922
0
        u_terminateUChars(result, resultCapacity, 0, status);
2923
0
        return USEARCH_DONE;
2924
0
    }
2925
0
2926
0
    if (resultCapacity < copylength) {
2927
0
        copylength = resultCapacity;
2928
0
    }
2929
0
    if (copylength > 0) {
2930
0
        uprv_memcpy(result, strsrch->search->text + copyindex,
2931
0
                    copylength * sizeof(UChar));
2932
0
    }
2933
0
    return u_terminateUChars(result, resultCapacity,
2934
0
                             strsrch->search->matchedLength, status);
2935
0
}
2936
2937
U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2938
                                              const UStringSearch *strsrch)
2939
0
{
2940
0
    if (strsrch) {
2941
0
        return strsrch->search->matchedLength;
2942
0
    }
2943
0
    return USEARCH_DONE;
2944
0
}
2945
2946
#if !UCONFIG_NO_BREAK_ITERATION
2947
2948
U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2949
                                               UBreakIterator *breakiter,
2950
                                               UErrorCode     *status)
2951
0
{
2952
0
    if (U_SUCCESS(*status) && strsrch) {
2953
0
        strsrch->search->breakIter = breakiter;
2954
0
        if (breakiter) {
2955
0
            ubrk_setText(breakiter, strsrch->search->text,
2956
0
                         strsrch->search->textLength, status);
2957
0
        }
2958
0
    }
2959
0
}
2960
2961
U_CAPI const UBreakIterator* U_EXPORT2
2962
usearch_getBreakIterator(const UStringSearch *strsrch)
2963
0
{
2964
0
    if (strsrch) {
2965
0
        return strsrch->search->breakIter;
2966
0
    }
2967
0
    return NULL;
2968
0
}
2969
2970
#endif
2971
2972
U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2973
                                      const UChar         *text,
2974
                                            int32_t        textlength,
2975
                                            UErrorCode    *status)
2976
0
{
2977
0
    if (U_SUCCESS(*status)) {
2978
0
        if (strsrch == NULL || text == NULL || textlength < -1 ||
2979
0
            textlength == 0) {
2980
0
            *status = U_ILLEGAL_ARGUMENT_ERROR;
2981
0
        }
2982
0
        else {
2983
0
            if (textlength == -1) {
2984
0
                textlength = u_strlen(text);
2985
0
            }
2986
0
            strsrch->search->text       = text;
2987
0
            strsrch->search->textLength = textlength;
2988
0
            ucol_setText(strsrch->textIter, text, textlength, status);
2989
0
            strsrch->search->matchedIndex  = USEARCH_DONE;
2990
0
            strsrch->search->matchedLength = 0;
2991
0
            strsrch->search->reset         = TRUE;
2992
0
#if !UCONFIG_NO_BREAK_ITERATION
2993
0
            if (strsrch->search->breakIter != NULL) {
2994
0
                ubrk_setText(strsrch->search->breakIter, text,
2995
0
                             textlength, status);
2996
0
            }
2997
0
            ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2998
0
#endif
2999
0
        }
3000
0
    }
3001
0
}
3002
3003
U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3004
                                                     int32_t       *length)
3005
0
{
3006
0
    if (strsrch) {
3007
0
        *length = strsrch->search->textLength;
3008
0
        return strsrch->search->text;
3009
0
    }
3010
0
    return NULL;
3011
0
}
3012
3013
U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
3014
                                          const UCollator     *collator,
3015
                                                UErrorCode    *status)
3016
0
{
3017
0
    if (U_SUCCESS(*status)) {
3018
0
        if (collator == NULL) {
3019
0
            *status = U_ILLEGAL_ARGUMENT_ERROR;
3020
0
            return;
3021
0
        }
3022
0
3023
0
        if (strsrch) {
3024
0
            delete strsrch->textProcessedIter;
3025
0
            strsrch->textProcessedIter = NULL;
3026
0
            ucol_closeElements(strsrch->textIter);
3027
0
            ucol_closeElements(strsrch->utilIter);
3028
0
            strsrch->textIter = strsrch->utilIter = NULL;
3029
0
            if (strsrch->ownCollator && (strsrch->collator != collator)) {
3030
0
                ucol_close((UCollator *)strsrch->collator);
3031
0
                strsrch->ownCollator = FALSE;
3032
0
            }
3033
0
            strsrch->collator    = collator;
3034
0
            strsrch->strength    = ucol_getStrength(collator);
3035
0
            strsrch->ceMask      = getMask(strsrch->strength);
3036
0
#if !UCONFIG_NO_BREAK_ITERATION
3037
0
            ubrk_close(strsrch->search->internalBreakIter);
3038
0
            strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3039
0
                                                     strsrch->search->text, strsrch->search->textLength, status);
3040
0
#endif
3041
0
            // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3042
0
            strsrch->toShift     =
3043
0
               ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3044
0
                                                                UCOL_SHIFTED;
3045
0
            // if status is a failure, ucol_getVariableTop returns 0
3046
0
            strsrch->variableTop = ucol_getVariableTop(collator, status);
3047
0
            strsrch->textIter = ucol_openElements(collator,
3048
0
                                      strsrch->search->text,
3049
0
                                      strsrch->search->textLength,
3050
0
                                      status);
3051
0
            strsrch->utilIter = ucol_openElements(
3052
0
                    collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3053
0
            // initialize() _after_ setting the iterators for the new collator.
3054
0
            initialize(strsrch, status);
3055
0
        }
3056
0
3057
0
        // **** are these calls needed?
3058
0
        // **** we call uprv_init_pce in initializePatternPCETable
3059
0
        // **** and the CEIBuffer constructor...
3060
#if 0
3061
        uprv_init_pce(strsrch->textIter);
3062
        uprv_init_pce(strsrch->utilIter);
3063
#endif
3064
    }
3065
0
}
3066
3067
U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3068
0
{
3069
0
    if (strsrch) {
3070
0
        return (UCollator *)strsrch->collator;
3071
0
    }
3072
0
    return NULL;
3073
0
}
3074
3075
U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
3076
                                         const UChar         *pattern,
3077
                                               int32_t        patternlength,
3078
                                               UErrorCode    *status)
3079
0
{
3080
0
    if (U_SUCCESS(*status)) {
3081
0
        if (strsrch == NULL || pattern == NULL) {
3082
0
            *status = U_ILLEGAL_ARGUMENT_ERROR;
3083
0
        }
3084
0
        else {
3085
0
            if (patternlength == -1) {
3086
0
                patternlength = u_strlen(pattern);
3087
0
            }
3088
0
            if (patternlength == 0) {
3089
0
                *status = U_ILLEGAL_ARGUMENT_ERROR;
3090
0
                return;
3091
0
            }
3092
0
            strsrch->pattern.text       = pattern;
3093
0
            strsrch->pattern.textLength = patternlength;
3094
0
            initialize(strsrch, status);
3095
0
        }
3096
0
    }
3097
0
}
3098
3099
U_CAPI const UChar* U_EXPORT2
3100
usearch_getPattern(const UStringSearch *strsrch,
3101
                   int32_t       *length)
3102
0
{
3103
0
    if (strsrch) {
3104
0
        *length = strsrch->pattern.textLength;
3105
0
        return strsrch->pattern.text;
3106
0
    }
3107
0
    return NULL;
3108
0
}
3109
3110
// miscellanous methods --------------------------------------------------
3111
3112
U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3113
                                           UErrorCode    *status)
3114
0
{
3115
0
    if (strsrch && U_SUCCESS(*status)) {
3116
0
        strsrch->search->isForwardSearching = TRUE;
3117
0
        usearch_setOffset(strsrch, 0, status);
3118
0
        if (U_SUCCESS(*status)) {
3119
0
            return usearch_next(strsrch, status);
3120
0
        }
3121
0
    }
3122
0
    return USEARCH_DONE;
3123
0
}
3124
3125
U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3126
                                               int32_t    position,
3127
                                               UErrorCode    *status)
3128
0
{
3129
0
    if (strsrch && U_SUCCESS(*status)) {
3130
0
        strsrch->search->isForwardSearching = TRUE;
3131
0
        // position checked in usearch_setOffset
3132
0
        usearch_setOffset(strsrch, position, status);
3133
0
        if (U_SUCCESS(*status)) {
3134
0
            return usearch_next(strsrch, status);
3135
0
        }
3136
0
    }
3137
0
    return USEARCH_DONE;
3138
0
}
3139
3140
U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3141
                                          UErrorCode    *status)
3142
0
{
3143
0
    if (strsrch && U_SUCCESS(*status)) {
3144
0
        strsrch->search->isForwardSearching = FALSE;
3145
0
        usearch_setOffset(strsrch, strsrch->search->textLength, status);
3146
0
        if (U_SUCCESS(*status)) {
3147
0
            return usearch_previous(strsrch, status);
3148
0
        }
3149
0
    }
3150
0
    return USEARCH_DONE;
3151
0
}
3152
3153
U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3154
                                               int32_t    position,
3155
                                               UErrorCode    *status)
3156
0
{
3157
0
    if (strsrch && U_SUCCESS(*status)) {
3158
0
        strsrch->search->isForwardSearching = FALSE;
3159
0
        // position checked in usearch_setOffset
3160
0
        usearch_setOffset(strsrch, position, status);
3161
0
        if (U_SUCCESS(*status)) {
3162
0
            return usearch_previous(strsrch, status);
3163
0
        }
3164
0
    }
3165
0
    return USEARCH_DONE;
3166
0
}
3167
3168
/**
3169
* If a direction switch is required, we'll count the number of ces till the
3170
* beginning of the collation element iterator and iterate forwards that
3171
* number of times. This is so that we get to the correct point within the
3172
* string to continue the search in. Imagine when we are in the middle of the
3173
* normalization buffer when the change in direction is request. arrrgghh....
3174
* After searching the offset within the collation element iterator will be
3175
* shifted to the start of the match. If a match is not found, the offset would
3176
* have been set to the end of the text string in the collation element
3177
* iterator.
3178
* Okay, here's my take on normalization buffer. The only time when there can
3179
* be 2 matches within the same normalization is when the pattern is consists
3180
* of all accents. But since the offset returned is from the text string, we
3181
* should not confuse the caller by returning the second match within the
3182
* same normalization buffer. If we do, the 2 results will have the same match
3183
* offsets, and that'll be confusing. I'll return the next match that doesn't
3184
* fall within the same normalization buffer. Note this does not affect the
3185
* results of matches spanning the text and the normalization buffer.
3186
* The position to start searching is taken from the collation element
3187
* iterator. Callers of this API would have to set the offset in the collation
3188
* element iterator before using this method.
3189
*/
3190
U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3191
                                          UErrorCode    *status)
3192
0
{
3193
0
    if (U_SUCCESS(*status) && strsrch) {
3194
0
        // note offset is either equivalent to the start of the previous match
3195
0
        // or is set by the user
3196
0
        int32_t      offset       = usearch_getOffset(strsrch);
3197
0
        USearch     *search       = strsrch->search;
3198
0
        search->reset             = FALSE;
3199
0
        int32_t      textlength   = search->textLength;
3200
0
        if (search->isForwardSearching) {
3201
#if BOYER_MOORE
3202
            if (offset == textlength
3203
                || (!search->isOverlap &&
3204
                    (offset + strsrch->pattern.defaultShiftSize > textlength ||
3205
                    (search->matchedIndex != USEARCH_DONE &&
3206
                     offset + search->matchedLength >= textlength)))) {
3207
                // not enough characters to match
3208
                setMatchNotFound(strsrch);
3209
                return USEARCH_DONE;
3210
            }
3211
#else
3212
0
            if (offset == textlength ||
3213
0
                (! search->isOverlap &&
3214
0
                (search->matchedIndex != USEARCH_DONE &&
3215
0
                offset + search->matchedLength > textlength))) {
3216
0
                    // not enough characters to match
3217
0
                    setMatchNotFound(strsrch);
3218
0
                    return USEARCH_DONE;
3219
0
            }
3220
0
#endif
3221
0
        }
3222
0
        else {
3223
0
            // switching direction.
3224
0
            // if matchedIndex == USEARCH_DONE, it means that either a
3225
0
            // setOffset has been called or that previous ran off the text
3226
0
            // string. the iterator would have been set to offset 0 if a
3227
0
            // match is not found.
3228
0
            search->isForwardSearching = TRUE;
3229
0
            if (search->matchedIndex != USEARCH_DONE) {
3230
0
                // there's no need to set the collation element iterator
3231
0
                // the next call to next will set the offset.
3232
0
                return search->matchedIndex;
3233
0
            }
3234
0
        }
3235
0
3236
0
        if (U_SUCCESS(*status)) {
3237
0
            if (strsrch->pattern.cesLength == 0) {
3238
0
                if (search->matchedIndex == USEARCH_DONE) {
3239
0
                    search->matchedIndex = offset;
3240
0
                }
3241
0
                else { // moves by codepoints
3242
0
                    U16_FWD_1(search->text, search->matchedIndex, textlength);
3243
0
                }
3244
0
3245
0
                search->matchedLength = 0;
3246
0
                setColEIterOffset(strsrch->textIter, search->matchedIndex);
3247
0
                // status checked below
3248
0
                if (search->matchedIndex == textlength) {
3249
0
                    search->matchedIndex = USEARCH_DONE;
3250
0
                }
3251
0
            }
3252
0
            else {
3253
0
                if (search->matchedLength > 0) {
3254
0
                    // if matchlength is 0 we are at the start of the iteration
3255
0
                    if (search->isOverlap) {
3256
0
                        ucol_setOffset(strsrch->textIter, offset + 1, status);
3257
0
                    }
3258
0
                    else {
3259
0
                        ucol_setOffset(strsrch->textIter,
3260
0
                                       offset + search->matchedLength, status);
3261
0
                    }
3262
0
                }
3263
0
                else {
3264
0
                    // for boundary check purposes. this will ensure that the
3265
0
                    // next match will not preceed the current offset
3266
0
                    // note search->matchedIndex will always be set to something
3267
0
                    // in the code
3268
0
                    search->matchedIndex = offset - 1;
3269
0
                }
3270
0
3271
0
                if (search->isCanonicalMatch) {
3272
0
                    // can't use exact here since extra accents are allowed.
3273
0
                    usearch_handleNextCanonical(strsrch, status);
3274
0
                }
3275
0
                else {
3276
0
                    usearch_handleNextExact(strsrch, status);
3277
0
                }
3278
0
            }
3279
0
3280
0
            if (U_FAILURE(*status)) {
3281
0
                return USEARCH_DONE;
3282
0
            }
3283
0
3284
0
#if !BOYER_MOORE
3285
0
            if (search->matchedIndex == USEARCH_DONE) {
3286
0
                ucol_setOffset(strsrch->textIter, search->textLength, status);
3287
0
            } else {
3288
0
                ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3289
0
            }
3290
0
#endif
3291
0
3292
0
            return search->matchedIndex;
3293
0
        }
3294
0
    }
3295
0
    return USEARCH_DONE;
3296
0
}
3297
3298
U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3299
                                              UErrorCode *status)
3300
0
{
3301
0
    if (U_SUCCESS(*status) && strsrch) {
3302
0
        int32_t offset;
3303
0
        USearch *search = strsrch->search;
3304
0
        if (search->reset) {
3305
0
            offset                     = search->textLength;
3306
0
            search->isForwardSearching = FALSE;
3307
0
            search->reset              = FALSE;
3308
0
            setColEIterOffset(strsrch->textIter, offset);
3309
0
        }
3310
0
        else {
3311
0
            offset = usearch_getOffset(strsrch);
3312
0
        }
3313
0
3314
0
        int32_t matchedindex = search->matchedIndex;
3315
0
        if (search->isForwardSearching == TRUE) {
3316
0
            // switching direction.
3317
0
            // if matchedIndex == USEARCH_DONE, it means that either a
3318
0
            // setOffset has been called or that next ran off the text
3319
0
            // string. the iterator would have been set to offset textLength if
3320
0
            // a match is not found.
3321
0
            search->isForwardSearching = FALSE;
3322
0
            if (matchedindex != USEARCH_DONE) {
3323
0
                return matchedindex;
3324
0
            }
3325
0
        }
3326
0
        else {
3327
#if BOYER_MOORE
3328
            if (offset == 0 || matchedindex == 0 ||
3329
                (!search->isOverlap &&
3330
                    (offset < strsrch->pattern.defaultShiftSize ||
3331
                    (matchedindex != USEARCH_DONE &&
3332
                    matchedindex < strsrch->pattern.defaultShiftSize)))) {
3333
                // not enough characters to match
3334
                setMatchNotFound(strsrch);
3335
                return USEARCH_DONE;
3336
            }
3337
#else
3338
            // Could check pattern length, but the
3339
0
            // linear search will do the right thing
3340
0
            if (offset == 0 || matchedindex == 0) {
3341
0
                setMatchNotFound(strsrch);
3342
0
                return USEARCH_DONE;
3343
0
            }
3344
0
#endif
3345
0
        }
3346
0
3347
0
        if (U_SUCCESS(*status)) {
3348
0
            if (strsrch->pattern.cesLength == 0) {
3349
0
                search->matchedIndex =
3350
0
                      (matchedindex == USEARCH_DONE ? offset : matchedindex);
3351
0
                if (search->matchedIndex == 0) {
3352
0
                    setMatchNotFound(strsrch);
3353
0
                    // status checked below
3354
0
                }
3355
0
                else { // move by codepoints
3356
0
                    U16_BACK_1(search->text, 0, search->matchedIndex);
3357
0
                    setColEIterOffset(strsrch->textIter, search->matchedIndex);
3358
0
                    // status checked below
3359
0
                    search->matchedLength = 0;
3360
0
                }
3361
0
            }
3362
0
            else {
3363
0
                if (strsrch->search->isCanonicalMatch) {
3364
0
                    // can't use exact here since extra accents are allowed.
3365
0
                    usearch_handlePreviousCanonical(strsrch, status);
3366
0
                    // status checked below
3367
0
                }
3368
0
                else {
3369
0
                    usearch_handlePreviousExact(strsrch, status);
3370
0
                    // status checked below
3371
0
                }
3372
0
            }
3373
0
3374
0
            if (U_FAILURE(*status)) {
3375
0
                return USEARCH_DONE;
3376
0
            }
3377
0
3378
0
            return search->matchedIndex;
3379
0
        }
3380
0
    }
3381
0
    return USEARCH_DONE;
3382
0
}
3383
3384
3385
3386
U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3387
0
{
3388
0
    /*
3389
0
    reset is setting the attributes that are already in
3390
0
    string search, hence all attributes in the collator should
3391
0
    be retrieved without any problems
3392
0
    */
3393
0
    if (strsrch) {
3394
0
        UErrorCode status            = U_ZERO_ERROR;
3395
0
        UBool      sameCollAttribute = TRUE;
3396
0
        uint32_t   ceMask;
3397
0
        UBool      shift;
3398
0
        uint32_t   varTop;
3399
0
3400
0
        // **** hack to deal w/ how processed CEs encode quaternary ****
3401
0
        UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3402
0
        if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3403
0
            (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3404
0
                sameCollAttribute = FALSE;
3405
0
        }
3406
0
3407
0
        strsrch->strength    = ucol_getStrength(strsrch->collator);
3408
0
        ceMask = getMask(strsrch->strength);
3409
0
        if (strsrch->ceMask != ceMask) {
3410
0
            strsrch->ceMask = ceMask;
3411
0
            sameCollAttribute = FALSE;
3412
0
        }
3413
0
3414
0
        // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3415
0
        shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3416
0
                                  &status) == UCOL_SHIFTED;
3417
0
        if (strsrch->toShift != shift) {
3418
0
            strsrch->toShift  = shift;
3419
0
            sameCollAttribute = FALSE;
3420
0
        }
3421
0
3422
0
        // if status is a failure, ucol_getVariableTop returns 0
3423
0
        varTop = ucol_getVariableTop(strsrch->collator, &status);
3424
0
        if (strsrch->variableTop != varTop) {
3425
0
            strsrch->variableTop = varTop;
3426
0
            sameCollAttribute    = FALSE;
3427
0
        }
3428
0
        if (!sameCollAttribute) {
3429
0
            initialize(strsrch, &status);
3430
0
        }
3431
0
        ucol_setText(strsrch->textIter, strsrch->search->text,
3432
0
                              strsrch->search->textLength,
3433
0
                              &status);
3434
0
        strsrch->search->matchedLength      = 0;
3435
0
        strsrch->search->matchedIndex       = USEARCH_DONE;
3436
0
        strsrch->search->isOverlap          = FALSE;
3437
0
        strsrch->search->isCanonicalMatch   = FALSE;
3438
0
        strsrch->search->elementComparisonType = 0;
3439
0
        strsrch->search->isForwardSearching = TRUE;
3440
0
        strsrch->search->reset              = TRUE;
3441
0
    }
3442
0
}
3443
3444
//
3445
//  CEI  Collation Element + source text index.
3446
//       These structs are kept in the circular buffer.
3447
//
3448
struct  CEI {
3449
    int64_t ce;
3450
    int32_t lowIndex;
3451
    int32_t highIndex;
3452
};
3453
3454
U_NAMESPACE_BEGIN
3455
3456
namespace {
3457
//
3458
//  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
3459
//
3460
0
#define   DEFAULT_CEBUFFER_SIZE 96
3461
0
#define   CEBUFFER_EXTRA 32
3462
// Some typical max values to make buffer size more reasonable for asymmetric search.
3463
// #8694 is for a better long-term solution to allocation of this buffer.
3464
0
#define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3465
0
#define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3466
0
#define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3467
struct CEIBuffer {
3468
    CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
3469
    CEI                 *buf;
3470
    int32_t              bufSize;
3471
    int32_t              firstIx;
3472
    int32_t              limitIx;
3473
    UCollationElements  *ceIter;
3474
    UStringSearch       *strSearch;
3475
3476
3477
3478
               CEIBuffer(UStringSearch *ss, UErrorCode *status);
3479
               ~CEIBuffer();
3480
   const CEI   *get(int32_t index);
3481
   const CEI   *getPrevious(int32_t index);
3482
};
3483
3484
3485
0
CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3486
0
    buf = defBuf;
3487
0
    strSearch = ss;
3488
0
    bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3489
0
    if (ss->search->elementComparisonType != 0) {
3490
0
        const UChar * patText = ss->pattern.text;
3491
0
        if (patText) {
3492
0
            const UChar * patTextLimit = patText + ss->pattern.textLength;
3493
0
            while ( patText < patTextLimit ) {
3494
0
                UChar c = *patText++;
3495
0
                if (MIGHT_BE_JAMO_L(c)) {
3496
0
                    bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3497
0
                } else {
3498
0
                    // No check for surrogates, we might allocate slightly more buffer than necessary.
3499
0
                    bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3500
0
                }
3501
0
            }
3502
0
        }
3503
0
    }
3504
0
    ceIter    = ss->textIter;
3505
0
    firstIx = 0;
3506
0
    limitIx = 0;
3507
0
3508
0
    if (!initTextProcessedIter(ss, status)) { return; }
3509
0
3510
0
    if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3511
0
        buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3512
0
        if (buf == NULL) {
3513
0
            *status = U_MEMORY_ALLOCATION_ERROR;
3514
0
        }
3515
0
    }
3516
0
}
3517
3518
// TODO: add a reset or init function so that allocated
3519
//       buffers can be retained & reused.
3520
3521
0
CEIBuffer::~CEIBuffer() {
3522
0
    if (buf != defBuf) {
3523
0
        uprv_free(buf);
3524
0
    }
3525
0
}
3526
3527
3528
// Get the CE with the specified index.
3529
//   Index must be in the range
3530
//          n-history_size < index < n+1
3531
//   where n is the largest index to have been fetched by some previous call to this function.
3532
//   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3533
//
3534
0
const CEI *CEIBuffer::get(int32_t index) {
3535
0
    int i = index % bufSize;
3536
0
3537
0
    if (index>=firstIx && index<limitIx) {
3538
0
        // The request was for an entry already in our buffer.
3539
0
        //  Just return it.
3540
0
        return &buf[i];
3541
0
    }
3542
0
3543
0
    // Caller is requesting a new, never accessed before, CE.
3544
0
    //   Verify that it is the next one in sequence, which is all
3545
0
    //   that is allowed.
3546
0
    if (index != limitIx) {
3547
0
        U_ASSERT(FALSE);
3548
0
3549
0
        return NULL;
3550
0
    }
3551
0
3552
0
    // Manage the circular CE buffer indexing
3553
0
    limitIx++;
3554
0
3555
0
    if (limitIx - firstIx >= bufSize) {
3556
0
        // The buffer is full, knock out the lowest-indexed entry.
3557
0
        firstIx++;
3558
0
    }
3559
0
3560
0
    UErrorCode status = U_ZERO_ERROR;
3561
0
3562
0
    buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3563
0
3564
0
    return &buf[i];
3565
0
}
3566
3567
// Get the CE with the specified index.
3568
//   Index must be in the range
3569
//          n-history_size < index < n+1
3570
//   where n is the largest index to have been fetched by some previous call to this function.
3571
//   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3572
//
3573
0
const CEI *CEIBuffer::getPrevious(int32_t index) {
3574
0
    int i = index % bufSize;
3575
0
3576
0
    if (index>=firstIx && index<limitIx) {
3577
0
        // The request was for an entry already in our buffer.
3578
0
        //  Just return it.
3579
0
        return &buf[i];
3580
0
    }
3581
0
3582
0
    // Caller is requesting a new, never accessed before, CE.
3583
0
    //   Verify that it is the next one in sequence, which is all
3584
0
    //   that is allowed.
3585
0
    if (index != limitIx) {
3586
0
        U_ASSERT(FALSE);
3587
0
3588
0
        return NULL;
3589
0
    }
3590
0
3591
0
    // Manage the circular CE buffer indexing
3592
0
    limitIx++;
3593
0
3594
0
    if (limitIx - firstIx >= bufSize) {
3595
0
        // The buffer is full, knock out the lowest-indexed entry.
3596
0
        firstIx++;
3597
0
    }
3598
0
3599
0
    UErrorCode status = U_ZERO_ERROR;
3600
0
3601
0
    buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3602
0
3603
0
    return &buf[i];
3604
0
}
3605
3606
}
3607
3608
U_NAMESPACE_END
3609
3610
3611
// #define USEARCH_DEBUG
3612
3613
#ifdef USEARCH_DEBUG
3614
#include <stdio.h>
3615
#include <stdlib.h>
3616
#endif
3617
3618
/*
3619
 * Find the next break boundary after startIndex. If the UStringSearch object
3620
 * has an external break iterator, use that. Otherwise use the internal character
3621
 * break iterator.
3622
 */
3623
0
static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3624
#if 0
3625
    const UChar *text = strsrch->search->text;
3626
    int32_t textLen   = strsrch->search->textLength;
3627
3628
    U_ASSERT(startIndex>=0);
3629
    U_ASSERT(startIndex<=textLen);
3630
3631
    if (startIndex >= textLen) {
3632
        return startIndex;
3633
    }
3634
3635
    UChar32  c;
3636
    int32_t  i = startIndex;
3637
    U16_NEXT(text, i, textLen, c);
3638
3639
    // If we are on a control character, stop without looking for combining marks.
3640
    //    Control characters do not combine.
3641
    int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3642
    if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3643
        return i;
3644
    }
3645
3646
    // The initial character was not a control, and can thus accept trailing
3647
    //   combining characters.  Advance over however many of them there are.
3648
    int32_t  indexOfLastCharChecked;
3649
    for (;;) {
3650
        indexOfLastCharChecked = i;
3651
        if (i>=textLen) {
3652
            break;
3653
        }
3654
        U16_NEXT(text, i, textLen, c);
3655
        gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3656
        if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3657
            break;
3658
        }
3659
    }
3660
    return indexOfLastCharChecked;
3661
#elif !UCONFIG_NO_BREAK_ITERATION
3662
    UBreakIterator *breakiterator = strsrch->search->breakIter;
3663
0
3664
0
    if (breakiterator == NULL) {
3665
0
        breakiterator = strsrch->search->internalBreakIter;
3666
0
    }
3667
0
3668
0
    if (breakiterator != NULL) {
3669
0
        return ubrk_following(breakiterator, startIndex);
3670
0
    }
3671
0
3672
0
    return startIndex;
3673
#else
3674
    // **** or should we use the original code? ****
3675
    return startIndex;
3676
#endif
3677
3678
0
}
3679
3680
/*
3681
 * Returns TRUE if index is on a break boundary. If the UStringSearch
3682
 * has an external break iterator, test using that, otherwise test
3683
 * using the internal character break iterator.
3684
 */
3685
0
static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3686
#if 0
3687
    const UChar *text = strsrch->search->text;
3688
    int32_t textLen   = strsrch->search->textLength;
3689
3690
    U_ASSERT(index>=0);
3691
    U_ASSERT(index<=textLen);
3692
3693
    if (index>=textLen || index<=0) {
3694
        return TRUE;
3695
    }
3696
3697
    // If the character at the current index is not a GRAPHEME_EXTEND
3698
    //    then we can not be within a combining sequence.
3699
    UChar32  c;
3700
    U16_GET(text, 0, index, textLen, c);
3701
    int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3702
    if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3703
        return TRUE;
3704
    }
3705
3706
    // We are at a combining mark.  If the preceding character is anything
3707
    //   except a CONTROL, CR or LF, we are in a combining sequence.
3708
    U16_PREV(text, 0, index, c);
3709
    gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3710
    UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3711
    return !combining;
3712
#elif !UCONFIG_NO_BREAK_ITERATION
3713
    UBreakIterator *breakiterator = strsrch->search->breakIter;
3714
0
3715
0
    if (breakiterator == NULL) {
3716
0
        breakiterator = strsrch->search->internalBreakIter;
3717
0
    }
3718
0
3719
0
    return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3720
#else
3721
    // **** or use the original code? ****
3722
    return TRUE;
3723
#endif
3724
}
3725
3726
#if 0
3727
static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3728
{
3729
#if !UCONFIG_NO_BREAK_ITERATION
3730
    UBreakIterator *breakiterator = strsrch->search->breakIter;
3731
3732
    if (breakiterator != NULL) {
3733
        int32_t startindex = ubrk_first(breakiterator);
3734
        int32_t endindex   = ubrk_last(breakiterator);
3735
3736
        // out-of-range indexes are never boundary positions
3737
        if (start < startindex || start > endindex ||
3738
            end < startindex || end > endindex) {
3739
            return FALSE;
3740
        }
3741
3742
        return ubrk_isBoundary(breakiterator, start) &&
3743
               ubrk_isBoundary(breakiterator, end);
3744
    }
3745
#endif
3746
3747
    return TRUE;
3748
}
3749
#endif
3750
3751
typedef enum {
3752
    U_CE_MATCH = -1,
3753
    U_CE_NO_MATCH = 0,
3754
    U_CE_SKIP_TARG,
3755
    U_CE_SKIP_PATN
3756
} UCompareCEsResult;
3757
0
#define U_CE_LEVEL2_BASE 0x00000005
3758
0
#define U_CE_LEVEL3_BASE 0x00050000
3759
3760
0
static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3761
0
    if (targCE == patCE) {
3762
0
        return U_CE_MATCH;
3763
0
    }
3764
0
    if (compareType == 0) {
3765
0
        return U_CE_NO_MATCH;
3766
0
    }
3767
0
    
3768
0
    int64_t targCEshifted = targCE >> 32;
3769
0
    int64_t patCEshifted = patCE >> 32;
3770
0
    int64_t mask;
3771
0
3772
0
    mask = 0xFFFF0000;
3773
0
    int32_t targLev1 = (int32_t)(targCEshifted & mask);
3774
0
    int32_t patLev1 = (int32_t)(patCEshifted & mask);
3775
0
    if ( targLev1 != patLev1 ) {
3776
0
        if ( targLev1 == 0 ) {
3777
0
            return U_CE_SKIP_TARG;
3778
0
        }
3779
0
        if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3780
0
            return U_CE_SKIP_PATN;
3781
0
        }
3782
0
        return U_CE_NO_MATCH;
3783
0
    }
3784
0
3785
0
    mask = 0x0000FFFF;
3786
0
    int32_t targLev2 = (int32_t)(targCEshifted & mask);
3787
0
    int32_t patLev2 = (int32_t)(patCEshifted & mask);
3788
0
    if ( targLev2 != patLev2 ) {
3789
0
        if ( targLev2 == 0 ) {
3790
0
            return U_CE_SKIP_TARG;
3791
0
        }
3792
0
        if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3793
0
            return U_CE_SKIP_PATN;
3794
0
        }
3795
0
        return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3796
0
            U_CE_MATCH: U_CE_NO_MATCH;
3797
0
    }
3798
0
    
3799
0
    mask = 0xFFFF0000;
3800
0
    int32_t targLev3 = (int32_t)(targCE & mask);
3801
0
    int32_t patLev3 = (int32_t)(patCE & mask);
3802
0
    if ( targLev3 != patLev3 ) {
3803
0
        return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3804
0
            U_CE_MATCH: U_CE_NO_MATCH;
3805
0
   }
3806
0
3807
0
    return U_CE_MATCH;
3808
0
}
3809
3810
#if BOYER_MOORE
3811
// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3812
#endif
3813
3814
namespace {
3815
3816
0
UChar32 codePointAt(const USearch &search, int32_t index) {
3817
0
    if (index < search.textLength) {
3818
0
        UChar32 c;
3819
0
        U16_NEXT(search.text, index, search.textLength, c);
3820
0
        return c;
3821
0
    }
3822
0
    return U_SENTINEL;
3823
0
}
3824
3825
0
UChar32 codePointBefore(const USearch &search, int32_t index) {
3826
0
    if (0 < index) {
3827
0
        UChar32 c;
3828
0
        U16_PREV(search.text, 0, index, c);
3829
0
        return c;
3830
0
    }
3831
0
    return U_SENTINEL;
3832
0
}
3833
3834
}  // namespace
3835
3836
U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
3837
                                       int32_t        startIdx,
3838
                                       int32_t        *matchStart,
3839
                                       int32_t        *matchLimit,
3840
                                       UErrorCode     *status)
3841
0
{
3842
0
    if (U_FAILURE(*status)) {
3843
0
        return FALSE;
3844
0
    }
3845
0
3846
0
    // TODO:  reject search patterns beginning with a combining char.
3847
0
3848
#ifdef USEARCH_DEBUG
3849
    if (getenv("USEARCH_DEBUG") != NULL) {
3850
        printf("Pattern CEs\n");
3851
        for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3852
            printf(" %8x", strsrch->pattern.ces[ii]);
3853
        }
3854
        printf("\n");
3855
    }
3856
3857
#endif
3858
    // Input parameter sanity check.
3859
0
    //  TODO:  should input indicies clip to the text length
3860
0
    //         in the same way that UText does.
3861
0
    if(strsrch->pattern.cesLength == 0         ||
3862
0
       startIdx < 0                           ||
3863
0
       startIdx > strsrch->search->textLength ||
3864
0
       strsrch->pattern.ces == NULL) {
3865
0
           *status = U_ILLEGAL_ARGUMENT_ERROR;
3866
0
           return FALSE;
3867
0
    }
3868
0
3869
0
    if (strsrch->pattern.pces == NULL) {
3870
0
        initializePatternPCETable(strsrch, status);
3871
0
    }
3872
0
3873
0
    ucol_setOffset(strsrch->textIter, startIdx, status);
3874
0
    CEIBuffer ceb(strsrch, status);
3875
0
3876
0
3877
0
    int32_t    targetIx = 0;
3878
0
    const CEI *targetCEI = NULL;
3879
0
    int32_t    patIx;
3880
0
    UBool      found;
3881
0
3882
0
    int32_t  mStart = -1;
3883
0
    int32_t  mLimit = -1;
3884
0
    int32_t  minLimit;
3885
0
    int32_t  maxLimit;
3886
0
3887
0
3888
0
3889
0
    // Outer loop moves over match starting positions in the
3890
0
    //      target CE space.
3891
0
    // Here we see the target as a sequence of collation elements, resulting from the following:
3892
0
    // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3893
0
    //    (for example, digraphs such as IJ may be broken into two characters).
3894
0
    // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3895
0
    //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3896
0
    //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3897
0
    //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3898
0
    //    then the CE is deleted, so the following code sees only CEs that are relevant.
3899
0
    // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3900
0
    // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3901
0
    // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3902
0
    //
3903
0
    for(targetIx=0; ; targetIx++)
3904
0
    {
3905
0
        found = TRUE;
3906
0
        //  Inner loop checks for a match beginning at each
3907
0
        //  position from the outer loop.
3908
0
        int32_t targetIxOffset = 0;
3909
0
        int64_t patCE = 0;
3910
0
        // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3911
0
        // (compared to the last CE fetched for the previous targetIx value) as we need to go
3912
0
        // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3913
0
        const CEI *firstCEI = ceb.get(targetIx);
3914
0
        if (firstCEI == NULL) {
3915
0
            *status = U_INTERNAL_PROGRAM_ERROR;
3916
0
            found = FALSE;
3917
0
            break;
3918
0
        }
3919
0
        
3920
0
        for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3921
0
            patCE = strsrch->pattern.pces[patIx];
3922
0
            targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3923
0
            //  Compare CE from target string with CE from the pattern.
3924
0
            //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3925
0
            //    which will fail the compare, below.
3926
0
            UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3927
0
            if ( ceMatch == U_CE_NO_MATCH ) {
3928
0
                found = FALSE;
3929
0
                break;
3930
0
            } else if ( ceMatch > U_CE_NO_MATCH ) {
3931
0
                if ( ceMatch == U_CE_SKIP_TARG ) {
3932
0
                    // redo with same patCE, next targCE
3933
0
                    patIx--;
3934
0
                    targetIxOffset++;
3935
0
                } else { // ceMatch == U_CE_SKIP_PATN
3936
0
                    // redo with same targCE, next patCE
3937
0
                    targetIxOffset--;
3938
0
                }
3939
0
            }
3940
0
        }
3941
0
        targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3942
0
3943
0
        if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3944
0
            // No match at this targetIx.  Try again at the next.
3945
0
            continue;
3946
0
        }
3947
0
3948
0
        if (!found) {
3949
0
            // No match at all, we have run off the end of the target text.
3950
0
            break;
3951
0
        }
3952
0
3953
0
3954
0
        // We have found a match in CE space.
3955
0
        // Now determine the bounds in string index space.
3956
0
        //  There still is a chance of match failure if the CE range not correspond to
3957
0
        //     an acceptable character range.
3958
0
        //
3959
0
        const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
3960
0
3961
0
        mStart   = firstCEI->lowIndex;
3962
0
        minLimit = lastCEI->lowIndex;
3963
0
3964
0
        // Look at the CE following the match.  If it is UCOL_NULLORDER the match
3965
0
        //   extended to the end of input, and the match is good.
3966
0
3967
0
        // Look at the high and low indices of the CE following the match. If
3968
0
        // they are the same it means one of two things:
3969
0
        //    1. The match extended to the last CE from the target text, which is OK, or
3970
0
        //    2. The last CE that was part of the match is in an expansion that extends
3971
0
        //       to the first CE after the match. In this case, we reject the match.
3972
0
        const CEI *nextCEI = 0;
3973
0
        if (strsrch->search->elementComparisonType == 0) {
3974
0
            nextCEI  = ceb.get(targetIx + targetIxOffset);
3975
0
            maxLimit = nextCEI->lowIndex;
3976
0
            if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3977
0
                found = FALSE;
3978
0
            }
3979
0
        } else {
3980
0
            for ( ; ; ++targetIxOffset ) {
3981
0
                nextCEI = ceb.get(targetIx + targetIxOffset);
3982
0
                maxLimit = nextCEI->lowIndex;
3983
0
                // If we are at the end of the target too, match succeeds
3984
0
                if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3985
0
                    break;
3986
0
                }
3987
0
                // As long as the next CE has primary weight of 0,
3988
0
                // it is part of the last target element matched by the pattern;
3989
0
                // make sure it can be part of a match with the last patCE
3990
0
                if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3991
0
                    UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3992
0
                    if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3993
0
                        found = FALSE;
3994
0
                        break;
3995
0
                    }
3996
0
                // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3997
0
                // target element, but it has non-zero primary weight => match fails
3998
0
                } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3999
0
                    found = false;
4000
0
                    break;
4001
0
                // Else the target CE is not part of an expansion of the last matched element, match succeeds
4002
0
                } else {
4003
0
                    break;
4004
0
                }
4005
0
            }
4006
0
        }
4007
0
4008
0
4009
0
        // Check for the start of the match being within a combining sequence.
4010
0
        //   This can happen if the pattern itself begins with a combining char, and
4011
0
        //   the match found combining marks in the target text that were attached
4012
0
        //    to something else.
4013
0
        //   This type of match should be rejected for not completely consuming a
4014
0
        //   combining sequence.
4015
0
        if (!isBreakBoundary(strsrch, mStart)) {
4016
0
            found = FALSE;
4017
0
        }
4018
0
4019
0
        // Check for the start of the match being within an Collation Element Expansion,
4020
0
        //   meaning that the first char of the match is only partially matched.
4021
0
        //   With exapnsions, the first CE will report the index of the source
4022
0
        //   character, and all subsequent (expansions) CEs will report the source index of the
4023
0
        //    _following_ character.
4024
0
        int32_t secondIx = firstCEI->highIndex;
4025
0
        if (mStart == secondIx) {
4026
0
            found = FALSE;
4027
0
        }
4028
0
4029
0
        // Allow matches to end in the middle of a grapheme cluster if the following
4030
0
        // conditions are met; this is needed to make prefix search work properly in
4031
0
        // Indic, see #11750
4032
0
        // * the default breakIter is being used
4033
0
        // * the next collation element after this combining sequence
4034
0
        //   - has non-zero primary weight
4035
0
        //   - corresponds to a separate character following the one at end of the current match
4036
0
        //   (the second of these conditions, and perhaps both, may be redundant given the
4037
0
        //   subsequent check for normalization boundary; however they are likely much faster
4038
0
        //   tests in any case)
4039
0
        // * the match limit is a normalization boundary
4040
0
        UBool allowMidclusterMatch = FALSE;
4041
0
        if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4042
0
            allowMidclusterMatch =
4043
0
                    strsrch->search->breakIter == NULL &&
4044
0
                    nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4045
0
                    maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4046
0
                    (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4047
0
                        strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4048
0
        }
4049
0
        // If those conditions are met, then:
4050
0
        // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4051
0
        //   the match limit may be backed off to a previous break boundary. This handles
4052
0
        //   cases in which mLimit includes target characters that are ignorable with current
4053
0
        //   settings (such as space) and which extend beyond the pattern match.
4054
0
        // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4055
0
        // * do NOT require that match limit be on a breakIter boundary
4056
0
4057
0
        //  Advance the match end position to the first acceptable match boundary.
4058
0
        //    This advances the index over any combining charcters.
4059
0
        mLimit = maxLimit;
4060
0
        if (minLimit < maxLimit) {
4061
0
            // When the last CE's low index is same with its high index, the CE is likely
4062
0
            // a part of expansion. In this case, the index is located just after the
4063
0
            // character corresponding to the CEs compared above. If the index is right
4064
0
            // at the break boundary, move the position to the next boundary will result
4065
0
            // incorrect match length when there are ignorable characters exist between
4066
0
            // the position and the next character produces CE(s). See ticket#8482.
4067
0
            if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4068
0
                mLimit = minLimit;
4069
0
            } else {
4070
0
                int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4071
0
                // Note that we can have nba < maxLimit && nba >= minLImit, in which
4072
0
                // case we want to set mLimit to nba regardless of allowMidclusterMatch
4073
0
                // (i.e. we back off mLimit to the previous breakIterator boundary).
4074
0
                if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4075
0
                    mLimit = nba;
4076
0
                }
4077
0
            }
4078
0
        }
4079
0
4080
    #ifdef USEARCH_DEBUG
4081
        if (getenv("USEARCH_DEBUG") != NULL) {
4082
            printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4083
        }
4084
    #endif
4085
4086
0
        if (!allowMidclusterMatch) {
4087
0
            // If advancing to the end of a combining sequence in character indexing space
4088
0
            //   advanced us beyond the end of the match in CE space, reject this match.
4089
0
            if (mLimit > maxLimit) {
4090
0
                found = FALSE;
4091
0
            }
4092
0
4093
0
            if (!isBreakBoundary(strsrch, mLimit)) {
4094
0
                found = FALSE;
4095
0
            }
4096
0
        }
4097
0
4098
0
        if (! checkIdentical(strsrch, mStart, mLimit)) {
4099
0
            found = FALSE;
4100
0
        }
4101
0
4102
0
        if (found) {
4103
0
            break;
4104
0
        }
4105
0
    }
4106
0
4107
    #ifdef USEARCH_DEBUG
4108
    if (getenv("USEARCH_DEBUG") != NULL) {
4109
        printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4110
        int32_t  lastToPrint = ceb.limitIx+2;
4111
        for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4112
            printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4113
        }
4114
        printf("\n%s\n", found? "match found" : "no match");
4115
    }
4116
    #endif
4117
4118
0
    // All Done.  Store back the match bounds to the caller.
4119
0
    //
4120
0
    if (found==FALSE) {
4121
0
        mLimit = -1;
4122
0
        mStart = -1;
4123
0
    }
4124
0
4125
0
    if (matchStart != NULL) {
4126
0
        *matchStart= mStart;
4127
0
    }
4128
0
4129
0
    if (matchLimit != NULL) {
4130
0
        *matchLimit = mLimit;
4131
0
    }
4132
0
4133
0
    return found;
4134
0
}
4135
4136
U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
4137
                                                int32_t        startIdx,
4138
                                                int32_t        *matchStart,
4139
                                                int32_t        *matchLimit,
4140
                                                UErrorCode     *status)
4141
0
{
4142
0
    if (U_FAILURE(*status)) {
4143
0
        return FALSE;
4144
0
    }
4145
0
4146
0
    // TODO:  reject search patterns beginning with a combining char.
4147
0
4148
#ifdef USEARCH_DEBUG
4149
    if (getenv("USEARCH_DEBUG") != NULL) {
4150
        printf("Pattern CEs\n");
4151
        for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4152
            printf(" %8x", strsrch->pattern.ces[ii]);
4153
        }
4154
        printf("\n");
4155
    }
4156
4157
#endif
4158
    // Input parameter sanity check.
4159
0
    //  TODO:  should input indicies clip to the text length
4160
0
    //         in the same way that UText does.
4161
0
    if(strsrch->pattern.cesLength == 0         ||
4162
0
       startIdx < 0                           ||
4163
0
       startIdx > strsrch->search->textLength ||
4164
0
       strsrch->pattern.ces == NULL) {
4165
0
           *status = U_ILLEGAL_ARGUMENT_ERROR;
4166
0
           return FALSE;
4167
0
    }
4168
0
4169
0
    if (strsrch->pattern.pces == NULL) {
4170
0
        initializePatternPCETable(strsrch, status);
4171
0
    }
4172
0
4173
0
    CEIBuffer ceb(strsrch, status);
4174
0
    int32_t    targetIx = 0;
4175
0
4176
0
    /*
4177
0
     * Pre-load the buffer with the CE's for the grapheme
4178
0
     * after our starting position so that we're sure that
4179
0
     * we can look at the CE following the match when we
4180
0
     * check the match boundaries.
4181
0
     *
4182
0
     * This will also pre-fetch the first CE that we'll
4183
0
     * consider for the match.
4184
0
     */
4185
0
    if (startIdx < strsrch->search->textLength) {
4186
0
        UBreakIterator *bi = strsrch->search->internalBreakIter;
4187
0
        int32_t next = ubrk_following(bi, startIdx);
4188
0
4189
0
        ucol_setOffset(strsrch->textIter, next, status);
4190
0
4191
0
        for (targetIx = 0; ; targetIx += 1) {
4192
0
            if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4193
0
                break;
4194
0
            }
4195
0
        }
4196
0
    } else {
4197
0
        ucol_setOffset(strsrch->textIter, startIdx, status);
4198
0
    }
4199
0
4200
0
4201
0
    const CEI *targetCEI = NULL;
4202
0
    int32_t    patIx;
4203
0
    UBool      found;
4204
0
4205
0
    int32_t  limitIx = targetIx;
4206
0
    int32_t  mStart = -1;
4207
0
    int32_t  mLimit = -1;
4208
0
    int32_t  minLimit;
4209
0
    int32_t  maxLimit;
4210
0
4211
0
4212
0
4213
0
    // Outer loop moves over match starting positions in the
4214
0
    //      target CE space.
4215
0
    // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4216
0
    // But  patIx is 0 at the beginning of the pattern and increases toward the end.
4217
0
    // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4218
0
    // and the beginning of the base text.
4219
0
    for(targetIx = limitIx; ; targetIx += 1)
4220
0
    {
4221
0
        found = TRUE;
4222
0
        // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4223
0
        // (compared to the last CE fetched for the previous targetIx value) as we need to go
4224
0
        // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4225
0
        const CEI *lastCEI  = ceb.getPrevious(targetIx);
4226
0
        if (lastCEI == NULL) {
4227
0
            *status = U_INTERNAL_PROGRAM_ERROR;
4228
0
            found = FALSE;
4229
0
             break;
4230
0
        }
4231
0
        //  Inner loop checks for a match beginning at each
4232
0
        //  position from the outer loop.
4233
0
        int32_t targetIxOffset = 0;
4234
0
        for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4235
0
            int64_t patCE = strsrch->pattern.pces[patIx];
4236
0
4237
0
            targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4238
0
            //  Compare CE from target string with CE from the pattern.
4239
0
            //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4240
0
            //    which will fail the compare, below.
4241
0
            UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4242
0
            if ( ceMatch == U_CE_NO_MATCH ) {
4243
0
                found = FALSE;
4244
0
                break;
4245
0
            } else if ( ceMatch > U_CE_NO_MATCH ) {
4246
0
                if ( ceMatch == U_CE_SKIP_TARG ) {
4247
0
                    // redo with same patCE, next targCE
4248
0
                    patIx++;
4249
0
                    targetIxOffset++;
4250
0
                } else { // ceMatch == U_CE_SKIP_PATN
4251
0
                    // redo with same targCE, next patCE
4252
0
                    targetIxOffset--;
4253
0
                }
4254
0
            }
4255
0
        }
4256
0
4257
0
        if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4258
0
            // No match at this targetIx.  Try again at the next.
4259
0
            continue;
4260
0
        }
4261
0
4262
0
        if (!found) {
4263
0
            // No match at all, we have run off the end of the target text.
4264
0
            break;
4265
0
        }
4266
0
4267
0
4268
0
        // We have found a match in CE space.
4269
0
        // Now determine the bounds in string index space.
4270
0
        //  There still is a chance of match failure if the CE range not correspond to
4271
0
        //     an acceptable character range.
4272
0
        //
4273
0
        const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4274
0
        mStart   = firstCEI->lowIndex;
4275
0
4276
0
        // Check for the start of the match being within a combining sequence.
4277
0
        //   This can happen if the pattern itself begins with a combining char, and
4278
0
        //   the match found combining marks in the target text that were attached
4279
0
        //    to something else.
4280
0
        //   This type of match should be rejected for not completely consuming a
4281
0
        //   combining sequence.
4282
0
        if (!isBreakBoundary(strsrch, mStart)) {
4283
0
            found = FALSE;
4284
0
        }
4285
0
4286
0
        // Look at the high index of the first CE in the match. If it's the same as the
4287
0
        // low index, the first CE in the match is in the middle of an expansion.
4288
0
        if (mStart == firstCEI->highIndex) {
4289
0
            found = FALSE;
4290
0
        }
4291
0
4292
0
4293
0
        minLimit = lastCEI->lowIndex;
4294
0
4295
0
        if (targetIx > 0) {
4296
0
            // Look at the CE following the match.  If it is UCOL_NULLORDER the match
4297
0
            //   extended to the end of input, and the match is good.
4298
0
4299
0
            // Look at the high and low indices of the CE following the match. If
4300
0
            // they are the same it means one of two things:
4301
0
            //    1. The match extended to the last CE from the target text, which is OK, or
4302
0
            //    2. The last CE that was part of the match is in an expansion that extends
4303
0
            //       to the first CE after the match. In this case, we reject the match.
4304
0
            const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
4305
0
4306
0
            if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4307
0
                found = FALSE;
4308
0
            }
4309
0
4310
0
            mLimit = maxLimit = nextCEI->lowIndex;
4311
0
4312
0
            // Allow matches to end in the middle of a grapheme cluster if the following
4313
0
            // conditions are met; this is needed to make prefix search work properly in
4314
0
            // Indic, see #11750
4315
0
            // * the default breakIter is being used
4316
0
            // * the next collation element after this combining sequence
4317
0
            //   - has non-zero primary weight
4318
0
            //   - corresponds to a separate character following the one at end of the current match
4319
0
            //   (the second of these conditions, and perhaps both, may be redundant given the
4320
0
            //   subsequent check for normalization boundary; however they are likely much faster
4321
0
            //   tests in any case)
4322
0
            // * the match limit is a normalization boundary
4323
0
            UBool allowMidclusterMatch = FALSE;
4324
0
            if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4325
0
                allowMidclusterMatch =
4326
0
                        strsrch->search->breakIter == NULL &&
4327
0
                        nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4328
0
                        maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4329
0
                        (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4330
0
                            strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4331
0
            }
4332
0
            // If those conditions are met, then:
4333
0
            // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4334
0
            //   the match limit may be backed off to a previous break boundary. This handles
4335
0
            //   cases in which mLimit includes target characters that are ignorable with current
4336
0
            //   settings (such as space) and which extend beyond the pattern match.
4337
0
            // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4338
0
            // * do NOT require that match limit be on a breakIter boundary
4339
0
4340
0
            //  Advance the match end position to the first acceptable match boundary.
4341
0
            //    This advances the index over any combining characters.
4342
0
            if (minLimit < maxLimit) {
4343
0
                int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4344
0
                // Note that we can have nba < maxLimit && nba >= minLImit, in which
4345
0
                // case we want to set mLimit to nba regardless of allowMidclusterMatch
4346
0
                // (i.e. we back off mLimit to the previous breakIterator boundary).
4347
0
                if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4348
0
                    mLimit = nba;
4349
0
                }
4350
0
            }
4351
0
4352
0
            if (!allowMidclusterMatch) {
4353
0
                // If advancing to the end of a combining sequence in character indexing space
4354
0
                //   advanced us beyond the end of the match in CE space, reject this match.
4355
0
                if (mLimit > maxLimit) {
4356
0
                    found = FALSE;
4357
0
                }
4358
0
4359
0
                // Make sure the end of the match is on a break boundary
4360
0
                if (!isBreakBoundary(strsrch, mLimit)) {
4361
0
                    found = FALSE;
4362
0
                }
4363
0
            }
4364
0
4365
0
        } else {
4366
0
            // No non-ignorable CEs after this point.
4367
0
            // The maximum position is detected by boundary after
4368
0
            // the last non-ignorable CE. Combining sequence
4369
0
            // across the start index will be truncated.
4370
0
            int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4371
0
            mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4372
0
        }
4373
0
4374
    #ifdef USEARCH_DEBUG
4375
        if (getenv("USEARCH_DEBUG") != NULL) {
4376
            printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4377
        }
4378
    #endif
4379
4380
0
4381
0
        if (! checkIdentical(strsrch, mStart, mLimit)) {
4382
0
            found = FALSE;
4383
0
        }
4384
0
4385
0
        if (found) {
4386
0
            break;
4387
0
        }
4388
0
    }
4389
0
4390
    #ifdef USEARCH_DEBUG
4391
    if (getenv("USEARCH_DEBUG") != NULL) {
4392
        printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4393
        int32_t  lastToPrint = ceb.limitIx+2;
4394
        for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4395
            printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4396
        }
4397
        printf("\n%s\n", found? "match found" : "no match");
4398
    }
4399
    #endif
4400
4401
0
    // All Done.  Store back the match bounds to the caller.
4402
0
    //
4403
0
    if (found==FALSE) {
4404
0
        mLimit = -1;
4405
0
        mStart = -1;
4406
0
    }
4407
0
4408
0
    if (matchStart != NULL) {
4409
0
        *matchStart= mStart;
4410
0
    }
4411
0
4412
0
    if (matchLimit != NULL) {
4413
0
        *matchLimit = mLimit;
4414
0
    }
4415
0
4416
0
    return found;
4417
0
}
4418
4419
// internal use methods declared in usrchimp.h -----------------------------
4420
4421
UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4422
0
{
4423
0
    if (U_FAILURE(*status)) {
4424
0
        setMatchNotFound(strsrch);
4425
0
        return FALSE;
4426
0
    }
4427
0
4428
#if BOYER_MOORE
4429
    UCollationElements *coleiter        = strsrch->textIter;
4430
    int32_t             textlength      = strsrch->search->textLength;
4431
    int32_t            *patternce       = strsrch->pattern.ces;
4432
    int32_t             patterncelength = strsrch->pattern.cesLength;
4433
    int32_t             textoffset      = ucol_getOffset(coleiter);
4434
4435
    // status used in setting coleiter offset, since offset is checked in
4436
    // shiftForward before setting the coleiter offset, status never
4437
    // a failure
4438
    textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4439
                              patterncelength);
4440
    while (textoffset <= textlength)
4441
    {
4442
        uint32_t    patternceindex = patterncelength - 1;
4443
        int32_t     targetce;
4444
        UBool       found          = FALSE;
4445
        int32_t    lastce          = UCOL_NULLORDER;
4446
4447
        setColEIterOffset(coleiter, textoffset);
4448
4449
        for (;;) {
4450
            // finding the last pattern ce match, imagine composite characters
4451
            // for example: search for pattern A in text \u00C0
4452
            // we'll have to skip \u0300 the grave first before we get to A
4453
            targetce = ucol_previous(coleiter, status);
4454
            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4455
                found = FALSE;
4456
                break;
4457
            }
4458
            targetce = getCE(strsrch, targetce);
4459
            if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4460
                // this is for the text \u0315\u0300 that requires
4461
                // normalization and pattern \u0300, where \u0315 is ignorable
4462
                continue;
4463
            }
4464
            if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4465
                lastce = targetce;
4466
            }
4467
            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4468
            if (targetce == patternce[patternceindex]) {
4469
                // the first ce can be a contraction
4470
                found = TRUE;
4471
                break;
4472
            }
4473
            if (!hasExpansion(coleiter)) {
4474
                found = FALSE;
4475
                break;
4476
            }
4477
        }
4478
4479
        //targetce = lastce;
4480
4481
        while (found && patternceindex > 0) {
4482
            lastce = targetce;
4483
            targetce    = ucol_previous(coleiter, status);
4484
            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4485
                found = FALSE;
4486
                break;
4487
            }
4488
            targetce    = getCE(strsrch, targetce);
4489
            if (targetce == UCOL_IGNORABLE) {
4490
                continue;
4491
            }
4492
4493
            patternceindex --;
4494
            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4495
            found = found && targetce == patternce[patternceindex];
4496
        }
4497
4498
        targetce = lastce;
4499
4500
        if (!found) {
4501
            if (U_FAILURE(*status)) {
4502
                break;
4503
            }
4504
            textoffset = shiftForward(strsrch, textoffset, lastce,
4505
                                      patternceindex);
4506
            // status checked at loop.
4507
            patternceindex = patterncelength;
4508
            continue;
4509
        }
4510
4511
        if (checkNextExactMatch(strsrch, &textoffset, status)) {
4512
            // status checked in ucol_setOffset
4513
            setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4514
            return TRUE;
4515
        }
4516
    }
4517
    setMatchNotFound(strsrch);
4518
    return FALSE;
4519
#else
4520
0
    int32_t textOffset = ucol_getOffset(strsrch->textIter);
4521
0
    int32_t start = -1;
4522
0
    int32_t end = -1;
4523
0
4524
0
    if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4525
0
        strsrch->search->matchedIndex  = start;
4526
0
        strsrch->search->matchedLength = end - start;
4527
0
        return TRUE;
4528
0
    } else {
4529
0
        setMatchNotFound(strsrch);
4530
0
        return FALSE;
4531
0
    }
4532
0
#endif
4533
0
}
4534
4535
UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4536
0
{
4537
0
    if (U_FAILURE(*status)) {
4538
0
        setMatchNotFound(strsrch);
4539
0
        return FALSE;
4540
0
    }
4541
0
4542
#if BOYER_MOORE
4543
    UCollationElements *coleiter        = strsrch->textIter;
4544
    int32_t             textlength      = strsrch->search->textLength;
4545
    int32_t            *patternce       = strsrch->pattern.ces;
4546
    int32_t             patterncelength = strsrch->pattern.cesLength;
4547
    int32_t             textoffset      = ucol_getOffset(coleiter);
4548
    UBool               hasPatternAccents =
4549
       strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4550
4551
    textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4552
                              patterncelength);
4553
    strsrch->canonicalPrefixAccents[0] = 0;
4554
    strsrch->canonicalSuffixAccents[0] = 0;
4555
4556
    while (textoffset <= textlength)
4557
    {
4558
        int32_t     patternceindex = patterncelength - 1;
4559
        int32_t     targetce;
4560
        UBool       found          = FALSE;
4561
        int32_t     lastce         = UCOL_NULLORDER;
4562
4563
        setColEIterOffset(coleiter, textoffset);
4564
4565
        for (;;) {
4566
            // finding the last pattern ce match, imagine composite characters
4567
            // for example: search for pattern A in text \u00C0
4568
            // we'll have to skip \u0300 the grave first before we get to A
4569
            targetce = ucol_previous(coleiter, status);
4570
            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4571
                found = FALSE;
4572
                break;
4573
            }
4574
            targetce = getCE(strsrch, targetce);
4575
            if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4576
                lastce = targetce;
4577
            }
4578
            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4579
            if (targetce == patternce[patternceindex]) {
4580
                // the first ce can be a contraction
4581
                found = TRUE;
4582
                break;
4583
            }
4584
            if (!hasExpansion(coleiter)) {
4585
                found = FALSE;
4586
                break;
4587
            }
4588
        }
4589
4590
        while (found && patternceindex > 0) {
4591
            targetce    = ucol_previous(coleiter, status);
4592
            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4593
                found = FALSE;
4594
                break;
4595
            }
4596
            targetce    = getCE(strsrch, targetce);
4597
            if (targetce == UCOL_IGNORABLE) {
4598
                continue;
4599
            }
4600
4601
            patternceindex --;
4602
            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4603
            found = found && targetce == patternce[patternceindex];
4604
        }
4605
4606
        // initializing the rearranged accent array
4607
        if (hasPatternAccents && !found) {
4608
            strsrch->canonicalPrefixAccents[0] = 0;
4609
            strsrch->canonicalSuffixAccents[0] = 0;
4610
            if (U_FAILURE(*status)) {
4611
                break;
4612
            }
4613
            found = doNextCanonicalMatch(strsrch, textoffset, status);
4614
        }
4615
4616
        if (!found) {
4617
            if (U_FAILURE(*status)) {
4618
                break;
4619
            }
4620
            textoffset = shiftForward(strsrch, textoffset, lastce,
4621
                                      patternceindex);
4622
            // status checked at loop
4623
            patternceindex = patterncelength;
4624
            continue;
4625
        }
4626
4627
        if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4628
            setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4629
            return TRUE;
4630
        }
4631
    }
4632
    setMatchNotFound(strsrch);
4633
    return FALSE;
4634
#else
4635
0
    int32_t textOffset = ucol_getOffset(strsrch->textIter);
4636
0
    int32_t start = -1;
4637
0
    int32_t end = -1;
4638
0
4639
0
    if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4640
0
        strsrch->search->matchedIndex  = start;
4641
0
        strsrch->search->matchedLength = end - start;
4642
0
        return TRUE;
4643
0
    } else {
4644
0
        setMatchNotFound(strsrch);
4645
0
        return FALSE;
4646
0
    }
4647
0
#endif
4648
0
}
4649
4650
UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4651
0
{
4652
0
    if (U_FAILURE(*status)) {
4653
0
        setMatchNotFound(strsrch);
4654
0
        return FALSE;
4655
0
    }
4656
0
4657
#if BOYER_MOORE
4658
    UCollationElements *coleiter        = strsrch->textIter;
4659
    int32_t            *patternce       = strsrch->pattern.ces;
4660
    int32_t             patterncelength = strsrch->pattern.cesLength;
4661
    int32_t             textoffset      = ucol_getOffset(coleiter);
4662
4663
    // shifting it check for setting offset
4664
    // if setOffset is called previously or there was no previous match, we
4665
    // leave the offset as it is.
4666
    if (strsrch->search->matchedIndex != USEARCH_DONE) {
4667
        textoffset = strsrch->search->matchedIndex;
4668
    }
4669
4670
    textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4671
                              patterncelength);
4672
4673
    while (textoffset >= 0)
4674
    {
4675
        int32_t     patternceindex = 1;
4676
        int32_t     targetce;
4677
        UBool       found          = FALSE;
4678
        int32_t     firstce        = UCOL_NULLORDER;
4679
4680
        // if status is a failure, ucol_setOffset does nothing
4681
        setColEIterOffset(coleiter, textoffset);
4682
4683
        for (;;) {
4684
            // finding the first pattern ce match, imagine composite
4685
            // characters. for example: search for pattern \u0300 in text
4686
            // \u00C0, we'll have to skip A first before we get to
4687
            // \u0300 the grave accent
4688
            targetce = ucol_next(coleiter, status);
4689
            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4690
                found = FALSE;
4691
                break;
4692
            }
4693
            targetce = getCE(strsrch, targetce);
4694
            if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4695
                firstce = targetce;
4696
            }
4697
            if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4698
                continue;
4699
            }
4700
            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4701
            if (targetce == patternce[0]) {
4702
                found = TRUE;
4703
                break;
4704
            }
4705
            if (!hasExpansion(coleiter)) {
4706
                // checking for accents in composite character
4707
                found = FALSE;
4708
                break;
4709
            }
4710
        }
4711
4712
        //targetce = firstce;
4713
4714
        while (found && (patternceindex < patterncelength)) {
4715
            firstce = targetce;
4716
            targetce    = ucol_next(coleiter, status);
4717
            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4718
                found = FALSE;
4719
                break;
4720
            }
4721
            targetce    = getCE(strsrch, targetce);
4722
            if (targetce == UCOL_IGNORABLE) {
4723
                continue;
4724
            }
4725
4726
            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4727
            found = found && targetce == patternce[patternceindex];
4728
            patternceindex ++;
4729
        }
4730
4731
        targetce = firstce;
4732
4733
        if (!found) {
4734
            if (U_FAILURE(*status)) {
4735
                break;
4736
            }
4737
4738
            textoffset = reverseShift(strsrch, textoffset, targetce,
4739
                                      patternceindex);
4740
            patternceindex = 0;
4741
            continue;
4742
        }
4743
4744
        if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4745
            setColEIterOffset(coleiter, textoffset);
4746
            return TRUE;
4747
        }
4748
    }
4749
    setMatchNotFound(strsrch);
4750
    return FALSE;
4751
#else
4752
0
    int32_t textOffset;
4753
0
4754
0
    if (strsrch->search->isOverlap) {
4755
0
        if (strsrch->search->matchedIndex != USEARCH_DONE) {
4756
0
            textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4757
0
        } else {
4758
0
            // move the start position at the end of possible match
4759
0
            initializePatternPCETable(strsrch, status);
4760
0
            if (!initTextProcessedIter(strsrch, status)) {
4761
0
                setMatchNotFound(strsrch);
4762
0
                return FALSE;
4763
0
            }
4764
0
            for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4765
0
                int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4766
0
                if (pce == UCOL_PROCESSED_NULLORDER) {
4767
0
                    // at the end of the text
4768
0
                    break;
4769
0
                }
4770
0
            }
4771
0
            if (U_FAILURE(*status)) {
4772
0
                setMatchNotFound(strsrch);
4773
0
                return FALSE;
4774
0
            }
4775
0
            textOffset = ucol_getOffset(strsrch->textIter);
4776
0
        }
4777
0
    } else {
4778
0
        textOffset = ucol_getOffset(strsrch->textIter);
4779
0
    }
4780
0
4781
0
    int32_t start = -1;
4782
0
    int32_t end = -1;
4783
0
4784
0
    if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4785
0
        strsrch->search->matchedIndex = start;
4786
0
        strsrch->search->matchedLength = end - start;
4787
0
        return TRUE;
4788
0
    } else {
4789
0
        setMatchNotFound(strsrch);
4790
0
        return FALSE;
4791
0
    }
4792
0
#endif
4793
0
}
4794
4795
UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4796
                                      UErrorCode    *status)
4797
0
{
4798
0
    if (U_FAILURE(*status)) {
4799
0
        setMatchNotFound(strsrch);
4800
0
        return FALSE;
4801
0
    }
4802
0
4803
#if BOYER_MOORE
4804
    UCollationElements *coleiter        = strsrch->textIter;
4805
    int32_t            *patternce       = strsrch->pattern.ces;
4806
    int32_t             patterncelength = strsrch->pattern.cesLength;
4807
    int32_t             textoffset      = ucol_getOffset(coleiter);
4808
    UBool               hasPatternAccents =
4809
       strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4810
4811
    // shifting it check for setting offset
4812
    // if setOffset is called previously or there was no previous match, we
4813
    // leave the offset as it is.
4814
    if (strsrch->search->matchedIndex != USEARCH_DONE) {
4815
        textoffset = strsrch->search->matchedIndex;
4816
    }
4817
4818
    textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4819
                              patterncelength);
4820
    strsrch->canonicalPrefixAccents[0] = 0;
4821
    strsrch->canonicalSuffixAccents[0] = 0;
4822
4823
    while (textoffset >= 0)
4824
    {
4825
        int32_t     patternceindex = 1;
4826
        int32_t     targetce;
4827
        UBool       found          = FALSE;
4828
        int32_t     firstce        = UCOL_NULLORDER;
4829
4830
        setColEIterOffset(coleiter, textoffset);
4831
        for (;;) {
4832
            // finding the first pattern ce match, imagine composite
4833
            // characters. for example: search for pattern \u0300 in text
4834
            // \u00C0, we'll have to skip A first before we get to
4835
            // \u0300 the grave accent
4836
            targetce = ucol_next(coleiter, status);
4837
            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4838
                found = FALSE;
4839
                break;
4840
            }
4841
            targetce = getCE(strsrch, targetce);
4842
            if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4843
                firstce = targetce;
4844
            }
4845
4846
            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4847
            if (targetce == patternce[0]) {
4848
                // the first ce can be a contraction
4849
                found = TRUE;
4850
                break;
4851
            }
4852
            if (!hasExpansion(coleiter)) {
4853
                // checking for accents in composite character
4854
                found = FALSE;
4855
                break;
4856
            }
4857
        }
4858
4859
        targetce = firstce;
4860
4861
        while (found && patternceindex < patterncelength) {
4862
            targetce    = ucol_next(coleiter, status);
4863
            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4864
                found = FALSE;
4865
                break;
4866
            }
4867
            targetce = getCE(strsrch, targetce);
4868
            if (targetce == UCOL_IGNORABLE) {
4869
                continue;
4870
            }
4871
4872
            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4873
            found = found && targetce == patternce[patternceindex];
4874
            patternceindex ++;
4875
        }
4876
4877
        // initializing the rearranged accent array
4878
        if (hasPatternAccents && !found) {
4879
            strsrch->canonicalPrefixAccents[0] = 0;
4880
            strsrch->canonicalSuffixAccents[0] = 0;
4881
            if (U_FAILURE(*status)) {
4882
                break;
4883
            }
4884
            found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4885
        }
4886
4887
        if (!found) {
4888
            if (U_FAILURE(*status)) {
4889
                break;
4890
            }
4891
            textoffset = reverseShift(strsrch, textoffset, targetce,
4892
                                      patternceindex);
4893
            patternceindex = 0;
4894
            continue;
4895
        }
4896
4897
        if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4898
            setColEIterOffset(coleiter, textoffset);
4899
            return TRUE;
4900
        }
4901
    }
4902
    setMatchNotFound(strsrch);
4903
    return FALSE;
4904
#else
4905
0
    int32_t textOffset;
4906
0
4907
0
    if (strsrch->search->isOverlap) {
4908
0
        if (strsrch->search->matchedIndex != USEARCH_DONE) {
4909
0
            textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4910
0
        } else {
4911
0
            // move the start position at the end of possible match
4912
0
            initializePatternPCETable(strsrch, status);
4913
0
            if (!initTextProcessedIter(strsrch, status)) {
4914
0
                setMatchNotFound(strsrch);
4915
0
                return FALSE;
4916
0
            }
4917
0
            for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4918
0
                int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4919
0
                if (pce == UCOL_PROCESSED_NULLORDER) {
4920
0
                    // at the end of the text
4921
0
                    break;
4922
0
                }
4923
0
            }
4924
0
            if (U_FAILURE(*status)) {
4925
0
                setMatchNotFound(strsrch);
4926
0
                return FALSE;
4927
0
            }
4928
0
            textOffset = ucol_getOffset(strsrch->textIter);
4929
0
        }
4930
0
    } else {
4931
0
        textOffset = ucol_getOffset(strsrch->textIter);
4932
0
    }
4933
0
4934
0
    int32_t start = -1;
4935
0
    int32_t end = -1;
4936
0
4937
0
    if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4938
0
        strsrch->search->matchedIndex = start;
4939
0
        strsrch->search->matchedLength = end - start;
4940
0
        return TRUE;
4941
0
    } else {
4942
0
        setMatchNotFound(strsrch);
4943
0
        return FALSE;
4944
0
    }
4945
0
#endif
4946
0
}
4947
4948
#endif /* #if !UCONFIG_NO_COLLATION */