Coverage Report

Created: 2018-09-25 14:53

/work/obj-fuzz/dist/include/unicode/search.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 2001-2011 IBM and others. All rights reserved.
6
**********************************************************************
7
*   Date        Name        Description
8
*  03/22/2000   helena      Creation.
9
**********************************************************************
10
*/
11
12
#ifndef SEARCH_H
13
#define SEARCH_H
14
15
#include "unicode/utypes.h"
16
17
/**
18
 * \file 
19
 * \brief C++ API: SearchIterator object.
20
 */
21
 
22
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
23
24
#include "unicode/uobject.h"
25
#include "unicode/unistr.h"
26
#include "unicode/chariter.h"
27
#include "unicode/brkiter.h"
28
#include "unicode/usearch.h"
29
30
/**
31
* @stable ICU 2.0
32
*/
33
struct USearch;
34
/**
35
* @stable ICU 2.0
36
*/
37
typedef struct USearch USearch;
38
39
U_NAMESPACE_BEGIN
40
41
/**
42
 *
43
 * <tt>SearchIterator</tt> is an abstract base class that provides 
44
 * methods to search for a pattern within a text string. Instances of
45
 * <tt>SearchIterator</tt> maintain a current position and scan over the 
46
 * target text, returning the indices where the pattern is matched and the length 
47
 * of each match.
48
 * <p>
49
 * <tt>SearchIterator</tt> defines a protocol for text searching. 
50
 * Subclasses provide concrete implementations of various search algorithms. 
51
 * For example, <tt>StringSearch</tt> implements language-sensitive pattern 
52
 * matching based on the comparison rules defined in a 
53
 * <tt>RuleBasedCollator</tt> object. 
54
 * <p> 
55
 * Other options for searching include using a BreakIterator to restrict 
56
 * the points at which matches are detected.
57
 * <p>
58
 * <tt>SearchIterator</tt> provides an API that is similar to that of
59
 * other text iteration classes such as <tt>BreakIterator</tt>. Using 
60
 * this class, it is easy to scan through text looking for all occurrences of 
61
 * a given pattern. The following example uses a <tt>StringSearch</tt> 
62
 * object to find all instances of "fox" in the target string. Any other 
63
 * subclass of <tt>SearchIterator</tt> can be used in an identical 
64
 * manner.
65
 * <pre><code>
66
 * UnicodeString target("The quick brown fox jumped over the lazy fox");
67
 * UnicodeString pattern("fox");
68
 *
69
 * SearchIterator *iter  = new StringSearch(pattern, target);
70
 * UErrorCode      error = U_ZERO_ERROR;
71
 * for (int pos = iter->first(error); pos != USEARCH_DONE; 
72
 *                               pos = iter->next(error)) {
73
 *     printf("Found match at %d pos, length is %d\n", pos, 
74
 *                                             iter->getMatchedLength());
75
 * }
76
 * </code></pre>
77
 *
78
 * @see StringSearch
79
 * @see RuleBasedCollator
80
 */
81
class U_I18N_API SearchIterator : public UObject {
82
83
public:
84
85
    // public constructors and destructors -------------------------------
86
87
    /** 
88
    * Copy constructor that creates a SearchIterator instance with the same 
89
    * behavior, and iterating over the same text. 
90
    * @param other the SearchIterator instance to be copied.
91
    * @stable ICU 2.0
92
    */
93
    SearchIterator(const SearchIterator &other);
94
95
    /**
96
     * Destructor. Cleans up the search iterator data struct.
97
     * @stable ICU 2.0
98
     */
99
    virtual ~SearchIterator();
100
101
    // public get and set methods ----------------------------------------
102
103
    /**
104
     * Sets the index to point to the given position, and clears any state 
105
     * that's affected.
106
     * <p>
107
     * This method takes the argument index and sets the position in the text 
108
     * string accordingly without checking if the index is pointing to a 
109
     * valid starting point to begin searching. 
110
     * @param position within the text to be set. If position is less
111
     *             than or greater than the text range for searching, 
112
     *          a U_INDEX_OUTOFBOUNDS_ERROR will be returned
113
     * @param status for errors if it occurs
114
     * @stable ICU 2.0
115
     */
116
    virtual void setOffset(int32_t position, UErrorCode &status) = 0;
117
118
    /**
119
     * Return the current index in the text being searched.
120
     * If the iteration has gone past the end of the text
121
     * (or past the beginning for a backwards search), USEARCH_DONE
122
     * is returned.
123
     * @return current index in the text being searched.
124
     * @stable ICU 2.0
125
     */
126
    virtual int32_t getOffset(void) const = 0;
127
128
    /**
129
    * Sets the text searching attributes located in the enum 
130
    * USearchAttribute with values from the enum USearchAttributeValue.
131
    * USEARCH_DEFAULT can be used for all attributes for resetting.
132
    * @param attribute text attribute (enum USearchAttribute) to be set
133
    * @param value text attribute value
134
    * @param status for errors if it occurs
135
    * @stable ICU 2.0
136
    */
137
    void setAttribute(USearchAttribute       attribute,
138
                      USearchAttributeValue  value,
139
                      UErrorCode            &status);
140
141
    /**    
142
    * Gets the text searching attributes
143
    * @param attribute text attribute (enum USearchAttribute) to be retrieved
144
    * @return text attribute value
145
    * @stable ICU 2.0
146
    */
147
    USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
148
    
149
    /**
150
    * Returns the index to the match in the text string that was searched.
151
    * This call returns a valid result only after a successful call to 
152
    * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
153
    * Just after construction, or after a searching method returns 
154
    * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
155
    * <p>
156
    * Use getMatchedLength to get the matched string length.
157
    * @return index of a substring within the text string that is being 
158
    *         searched.
159
    * @see #first
160
    * @see #next
161
    * @see #previous
162
    * @see #last
163
    * @stable ICU 2.0
164
    */
165
    int32_t getMatchedStart(void) const;
166
167
    /**
168
     * Returns the length of text in the string which matches the search 
169
     * pattern. This call returns a valid result only after a successful call 
170
     * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
171
     * Just after construction, or after a searching method returns 
172
     * <tt>USEARCH_DONE</tt>, this method will return 0.
173
     * @return The length of the match in the target text, or 0 if there
174
     *         is no match currently.
175
     * @see #first
176
     * @see #next
177
     * @see #previous
178
     * @see #last
179
     * @stable ICU 2.0
180
     */
181
    int32_t getMatchedLength(void) const;
182
    
183
    /**
184
     * Returns the text that was matched by the most recent call to 
185
     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
186
     * If the iterator is not pointing at a valid match (e.g. just after 
187
     * construction or after <tt>USEARCH_DONE</tt> has been returned), 
188
     * returns an empty string. 
189
     * @param result stores the matched string or an empty string if a match
190
     *        is not found.
191
     * @see #first
192
     * @see #next
193
     * @see #previous
194
     * @see #last
195
     * @stable ICU 2.0
196
     */
197
    void getMatchedText(UnicodeString &result) const;
198
    
199
    /**
200
     * Set the BreakIterator that will be used to restrict the points
201
     * at which matches are detected. The user is responsible for deleting 
202
     * the breakiterator.
203
     * @param breakiter A BreakIterator that will be used to restrict the 
204
     *                points at which matches are detected. If a match is 
205
     *                found, but the match's start or end index is not a 
206
     *                boundary as determined by the <tt>BreakIterator</tt>, 
207
     *                the match will be rejected and another will be searched 
208
     *                for. If this parameter is <tt>NULL</tt>, no break
209
     *                detection is attempted.
210
     * @param status for errors if it occurs
211
     * @see BreakIterator
212
     * @stable ICU 2.0
213
     */
214
    void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
215
    
216
    /**
217
     * Returns the BreakIterator that is used to restrict the points at 
218
     * which matches are detected.  This will be the same object that was 
219
     * passed to the constructor or to <tt>setBreakIterator</tt>.
220
     * Note that <tt>NULL</tt> is a legal value; it means that break
221
     * detection should not be attempted.
222
     * @return BreakIterator used to restrict matchings.
223
     * @see #setBreakIterator
224
     * @stable ICU 2.0
225
     */
226
    const BreakIterator * getBreakIterator(void) const;
227
228
    /**
229
     * Set the string text to be searched. Text iteration will hence begin at 
230
     * the start of the text string. This method is useful if you want to 
231
     * re-use an iterator to search for the same pattern within a different 
232
     * body of text. The user is responsible for deleting the text.
233
     * @param text string to be searched.
234
     * @param status for errors. If the text length is 0, 
235
     *        a U_ILLEGAL_ARGUMENT_ERROR is returned.
236
     * @stable ICU 2.0
237
     */
238
    virtual void setText(const UnicodeString &text, UErrorCode &status);    
239
240
    /**
241
     * Set the string text to be searched. Text iteration will hence begin at 
242
     * the start of the text string. This method is useful if you want to 
243
     * re-use an iterator to search for the same pattern within a different 
244
     * body of text.
245
     * <p>
246
     * Note: No parsing of the text within the <tt>CharacterIterator</tt> 
247
     * will be done during searching for this version. The block of text 
248
     * in <tt>CharacterIterator</tt> will be used as it is.
249
     * The user is responsible for deleting the text.
250
     * @param text string iterator to be searched.
251
     * @param status for errors if any. If the text length is 0 then a 
252
     *        U_ILLEGAL_ARGUMENT_ERROR is returned.
253
     * @stable ICU 2.0
254
     */
255
    virtual void setText(CharacterIterator &text, UErrorCode &status);
256
    
257
    /**
258
     * Return the string text to be searched.
259
     * @return text string to be searched.
260
     * @stable ICU 2.0
261
     */
262
    const UnicodeString & getText(void) const;
263
264
    // operator overloading ----------------------------------------------
265
266
    /**
267
     * Equality operator. 
268
     * @param that SearchIterator instance to be compared.
269
     * @return TRUE if both SearchIterators are of the same class, have the 
270
     *         same behavior, iterate over the same text and have the same
271
     *         attributes. FALSE otherwise.
272
     * @stable ICU 2.0
273
     */
274
    virtual UBool operator==(const SearchIterator &that) const;
275
276
    /**
277
     * Not-equal operator. 
278
     * @param that SearchIterator instance to be compared.
279
     * @return FALSE if operator== returns TRUE, and vice versa.
280
     * @stable ICU 2.0
281
     */
282
    UBool operator!=(const SearchIterator &that) const;
283
284
    // public methods ----------------------------------------------------
285
286
    /**
287
     * Returns a copy of SearchIterator with the same behavior, and 
288
     * iterating over the same text, as this one. Note that all data will be
289
     * replicated, except for the text string to be searched.
290
     * @return cloned object
291
     * @stable ICU 2.0
292
     */
293
    virtual SearchIterator* safeClone(void) const = 0;
294
295
    /**
296
     * Returns the first index at which the string text matches the search 
297
     * pattern. The iterator is adjusted so that its current index (as 
298
     * returned by <tt>getOffset</tt>) is the match position if one 
299
     * was found.
300
     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
301
     * the iterator will be adjusted to the index USEARCH_DONE
302
     * @param  status for errors if it occurs
303
     * @return The character index of the first match, or 
304
     *         <tt>USEARCH_DONE</tt> if there are no matches.
305
     * @see #getOffset
306
     * @stable ICU 2.0
307
     */
308
    int32_t first(UErrorCode &status);
309
310
    /**
311
     * Returns the first index equal to or greater than <tt>position</tt> at which the 
312
     * string text matches the search pattern. The iterator is adjusted so 
313
     * that its current index (as returned by <tt>getOffset</tt>) is the 
314
     * match position if one was found.
315
     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
316
     * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
317
     * @param  position where search is to start from. If position is less
318
     *             than or greater than the text range for searching, 
319
     *          a U_INDEX_OUTOFBOUNDS_ERROR will be returned
320
     * @param  status for errors if it occurs
321
     * @return The character index of the first match following 
322
     *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no 
323
     *         matches.
324
     * @see #getOffset
325
     * @stable ICU 2.0
326
     */
327
    int32_t following(int32_t position, UErrorCode &status);
328
    
329
    /**
330
     * Returns the last index in the target text at which it matches the 
331
     * search pattern. The iterator is adjusted so that its current index 
332
     * (as returned by <tt>getOffset</tt>) is the match position if one was 
333
     * found.
334
     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
335
     * the iterator will be adjusted to the index USEARCH_DONE.
336
     * @param  status for errors if it occurs
337
     * @return The index of the last match, or <tt>USEARCH_DONE</tt> if 
338
     *         there are no matches.
339
     * @see #getOffset
340
     * @stable ICU 2.0
341
     */
342
    int32_t last(UErrorCode &status);
343
344
    /**
345
     * Returns the first index less than <tt>position</tt> at which the string 
346
     * text matches the search pattern. The iterator is adjusted so that its 
347
     * current index (as returned by <tt>getOffset</tt>) is the match 
348
     * position if one was found. If a match is not found, 
349
     * <tt>USEARCH_DONE</tt> will be returned and the iterator will be 
350
     * adjusted to the index USEARCH_DONE
351
     * <p>
352
     * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
353
     * result match is always less than <tt>position</tt>.
354
     * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
355
     * <tt>position</tt>.
356
     *
357
     * @param  position where search is to start from. If position is less
358
     *             than or greater than the text range for searching, 
359
     *          a U_INDEX_OUTOFBOUNDS_ERROR will be returned
360
     * @param  status for errors if it occurs
361
     * @return The character index of the first match preceding 
362
     *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are 
363
     *         no matches.
364
     * @see #getOffset
365
     * @stable ICU 2.0
366
     */
367
    int32_t preceding(int32_t position, UErrorCode &status);
368
369
    /**
370
     * Returns the index of the next point at which the text matches the
371
     * search pattern, starting from the current position.
372
     * The iterator is adjusted so that its current index (as returned by 
373
     * <tt>getOffset</tt>) is the match position if one was found.
374
     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
375
     * the iterator will be adjusted to a position after the end of the text 
376
     * string.
377
     * @param  status for errors if it occurs
378
     * @return The index of the next match after the current position,
379
     *          or <tt>USEARCH_DONE</tt> if there are no more matches.
380
     * @see #getOffset
381
     * @stable ICU 2.0
382
     */
383
     int32_t next(UErrorCode &status);
384
385
    /**
386
     * Returns the index of the previous point at which the string text 
387
     * matches the search pattern, starting at the current position.
388
     * The iterator is adjusted so that its current index (as returned by 
389
     * <tt>getOffset</tt>) is the match position if one was found.
390
     * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
391
     * the iterator will be adjusted to the index USEARCH_DONE
392
     * @param  status for errors if it occurs
393
     * @return The index of the previous match before the current position,
394
     *          or <tt>USEARCH_DONE</tt> if there are no more matches.
395
     * @see #getOffset
396
     * @stable ICU 2.0
397
     */
398
    int32_t previous(UErrorCode &status);
399
400
    /** 
401
    * Resets the iteration.
402
    * Search will begin at the start of the text string if a forward 
403
    * iteration is initiated before a backwards iteration. Otherwise if a 
404
    * backwards iteration is initiated before a forwards iteration, the 
405
    * search will begin at the end of the text string.    
406
    * @stable ICU 2.0
407
    */
408
    virtual void reset();
409
410
protected:
411
    // protected data members ---------------------------------------------
412
413
    /**
414
    * C search data struct
415
    * @stable ICU 2.0
416
    */
417
    USearch *m_search_;
418
419
    /**
420
    * Break iterator.
421
    * Currently the C++ breakiterator does not have getRules etc to reproduce
422
    * another in C. Hence we keep the original around and do the verification
423
    * at the end of the match. The user is responsible for deleting this
424
    * break iterator.
425
    * @stable ICU 2.0
426
    */
427
    BreakIterator *m_breakiterator_;
428
    
429
    /**
430
    * Unicode string version of the search text
431
    * @stable ICU 2.0
432
    */
433
    UnicodeString  m_text_;
434
435
    // protected constructors and destructors -----------------------------
436
437
    /**
438
    * Default constructor.
439
    * Initializes data to the default values.
440
    * @stable ICU 2.0
441
    */
442
    SearchIterator();
443
444
    /**
445
     * Constructor for use by subclasses.
446
     * @param text The target text to be searched.
447
     * @param breakiter A {@link BreakIterator} that is used to restrict the 
448
     *                points at which matches are detected. If 
449
     *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 
450
     *                match, but the match's start or end index is not a 
451
     *                boundary as determined by the <tt>BreakIterator</tt>, 
452
     *                the match is rejected and <tt>handleNext</tt> or 
453
     *                <tt>handlePrev</tt> is called again. If this parameter 
454
     *                is <tt>NULL</tt>, no break detection is attempted.  
455
     * @see #handleNext
456
     * @see #handlePrev
457
     * @stable ICU 2.0
458
     */
459
    SearchIterator(const UnicodeString &text, 
460
                         BreakIterator *breakiter = NULL);
461
462
    /**
463
     * Constructor for use by subclasses.
464
     * <p>
465
     * Note: No parsing of the text within the <tt>CharacterIterator</tt> 
466
     * will be done during searching for this version. The block of text 
467
     * in <tt>CharacterIterator</tt> will be used as it is.
468
     * @param text The target text to be searched.
469
     * @param breakiter A {@link BreakIterator} that is used to restrict the 
470
     *                points at which matches are detected. If 
471
     *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 
472
     *                match, but the match's start or end index is not a 
473
     *                boundary as determined by the <tt>BreakIterator</tt>, 
474
     *                the match is rejected and <tt>handleNext</tt> or 
475
     *                <tt>handlePrev</tt> is called again. If this parameter 
476
     *                is <tt>NULL</tt>, no break detection is attempted.
477
     * @see #handleNext
478
     * @see #handlePrev
479
     * @stable ICU 2.0
480
     */
481
    SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
482
483
    // protected methods --------------------------------------------------
484
485
    /**
486
     * Assignment operator. Sets this iterator to have the same behavior,
487
     * and iterate over the same text, as the one passed in.
488
     * @param that instance to be copied.
489
     * @stable ICU 2.0
490
     */
491
    SearchIterator & operator=(const SearchIterator &that);
492
493
    /**
494
     * Abstract method which subclasses override to provide the mechanism
495
     * for finding the next match in the target text. This allows different
496
     * subclasses to provide different search algorithms.
497
     * <p>
498
     * If a match is found, the implementation should return the index at
499
     * which the match starts and should call 
500
     * <tt>setMatchLength</tt> with the number of characters 
501
     * in the target text that make up the match. If no match is found, the 
502
     * method should return USEARCH_DONE.
503
     * <p>
504
     * @param position The index in the target text at which the search 
505
     *                 should start.
506
     * @param status for error codes if it occurs.
507
     * @return index at which the match starts, else if match is not found 
508
     *         USEARCH_DONE is returned
509
     * @see #setMatchLength
510
     * @stable ICU 2.0
511
     */
512
    virtual int32_t handleNext(int32_t position, UErrorCode &status) 
513
                                                                         = 0;
514
515
    /**
516
     * Abstract method which subclasses override to provide the mechanism for
517
     * finding the previous match in the target text. This allows different
518
     * subclasses to provide different search algorithms.
519
     * <p>
520
     * If a match is found, the implementation should return the index at
521
     * which the match starts and should call 
522
     * <tt>setMatchLength</tt> with the number of characters 
523
     * in the target text that make up the match. If no match is found, the 
524
     * method should return USEARCH_DONE.
525
     * <p>
526
     * @param position The index in the target text at which the search 
527
     *                 should start.
528
     * @param status for error codes if it occurs.
529
     * @return index at which the match starts, else if match is not found 
530
     *         USEARCH_DONE is returned
531
     * @see #setMatchLength
532
     * @stable ICU 2.0
533
     */
534
     virtual int32_t handlePrev(int32_t position, UErrorCode &status) 
535
                                                                         = 0;
536
537
    /**
538
     * Sets the length of the currently matched string in the text string to
539
     * be searched.
540
     * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
541
     * methods should call this when they find a match in the target text.
542
     * @param length length of the matched text.
543
     * @see #handleNext
544
     * @see #handlePrev
545
     * @stable ICU 2.0
546
     */
547
    virtual void setMatchLength(int32_t length);
548
549
    /**
550
     * Sets the offset of the currently matched string in the text string to
551
     * be searched.
552
     * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
553
     * methods should call this when they find a match in the target text.
554
     * @param position start offset of the matched text.
555
     * @see #handleNext
556
     * @see #handlePrev
557
     * @stable ICU 2.0
558
     */
559
    virtual void setMatchStart(int32_t position);
560
561
    /**
562
    * sets match not found 
563
    * @stable ICU 2.0
564
    */
565
    void setMatchNotFound();
566
};
567
568
inline UBool SearchIterator::operator!=(const SearchIterator &that) const
569
0
{
570
0
   return !operator==(that); // not-equal is defined as the negation of the virtual operator== applied to `that`
571
0
}
572
U_NAMESPACE_END
573
574
#endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */
575
576
#endif
577