Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/common/unicode/edits.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
// edits.h
5
// created: 2016dec30 Markus W. Scherer
6
7
#ifndef __EDITS_H__
8
#define __EDITS_H__
9
10
#include "unicode/utypes.h"
11
#include "unicode/uobject.h"
12
13
/**
14
 * \file
15
 * \brief C++ API: C++ class Edits for low-level string transformations on styled text.
16
 */
17
18
U_NAMESPACE_BEGIN
19
20
class UnicodeString;
21
22
/**
23
 * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions
24
 * in linear progression. Does not support moving/reordering of text.
25
 *
26
 * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to
27
 * instances of this class using {@link #addReplace(int, int)} (for change edits) and
28
 * {@link #addUnchanged(int)} (for no-change edits). Change edits are retained with full granularity,
29
 * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one
30
 * mapping between code points in the source and destination strings.
31
 *
32
 * After all edits have been added, instances of this class should be considered immutable, and an
33
 * {@link Edits::Iterator} can be used for queries.
34
 *
35
 * There are four flavors of Edits::Iterator:
36
 *
37
 * <ul>
38
 * <li>{@link #getFineIterator()} retains full granularity of change edits.
39
 * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling
40
 * next() on the iterator, skips over no-change edits (unchanged regions).
41
 * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change
42
 * edits are automatically merged during the construction phase.)
43
 * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when
44
 * calling next() on the iterator, skips over no-change edits (unchanged regions).
45
 * </ul>
46
 *
47
 * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the
48
 * following fine edits:
49
 * <ul>
50
 * <li>abc ⇨ abc (no-change)
51
 * <li>ß ⇨ ss (change)
52
 * <li>D ⇨ d (change)
53
 * <li>e ⇨ e (no-change)
54
 * <li>F ⇨ f (change)
55
 * </ul>
56
 * and the following coarse edits (note how adjacent change edits get merged together):
57
 * <ul>
58
 * <li>abc ⇨ abc (no-change)
59
 * <li>ßD ⇨ ssd (change)
60
 * <li>e ⇨ e (no-change)
61
 * <li>F ⇨ f (change)
62
 * </ul>
63
 *
64
 * The "fine changes" and "coarse changes" iterators will step through only the change edits when their
65
 * {@link Edits::Iterator#next()} methods are called. They are identical to the non-change iterators when
66
 * their {@link Edits::Iterator#findSourceIndex(int)} or {@link Edits::Iterator#findDestinationIndex(int)}
67
 * methods are used to walk through the string.
68
 *
69
 * For examples of how to use this class, see the test <code>TestCaseMapEditsIteratorDocs</code> in
70
 * UCharacterCaseTest.java.
71
 *
72
 * An Edits object tracks a separate UErrorCode, but ICU string transformation functions
73
 * (e.g., case mapping functions) merge any such errors into their API's UErrorCode.
74
 *
75
 * @stable ICU 59
76
 */
77
class U_COMMON_API Edits U_FINAL : public UMemory {
78
public:
79
    /**
80
     * Constructs an empty object.
81
     * @stable ICU 59
82
     */
83
    Edits() :
84
            array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),
85
0
            errorCode_(U_ZERO_ERROR) {}
86
    /**
87
     * Copy constructor.
88
     * @param other source edits
89
     * @draft ICU 60
90
     */
91
    Edits(const Edits &other) :
92
            array(stackArray), capacity(STACK_CAPACITY), length(other.length),
93
            delta(other.delta), numChanges(other.numChanges),
94
0
            errorCode_(other.errorCode_) {
95
0
        copyArray(other);
96
0
    }
97
    /**
98
     * Move constructor, might leave src empty.
99
     * This object will have the same contents that the source object had.
100
     * @param src source edits
101
     * @draft ICU 60
102
     */
103
    Edits(Edits &&src) U_NOEXCEPT :
104
            array(stackArray), capacity(STACK_CAPACITY), length(src.length),
105
            delta(src.delta), numChanges(src.numChanges),
106
0
            errorCode_(src.errorCode_) {
107
0
        moveArray(src);
108
0
    }
109
110
    /**
111
     * Destructor.
112
     * @stable ICU 59
113
     */
114
    ~Edits();
115
116
    /**
117
     * Assignment operator.
118
     * @param other source edits
119
     * @return *this
120
     * @draft ICU 60
121
     */
122
    Edits &operator=(const Edits &other);
123
124
    /**
125
     * Move assignment operator, might leave src empty.
126
     * This object will have the same contents that the source object had.
127
     * The behavior is undefined if *this and src are the same object.
128
     * @param src source edits
129
     * @return *this
130
     * @draft ICU 60
131
     */
132
    Edits &operator=(Edits &&src) U_NOEXCEPT;
133
134
    /**
135
     * Resets the data but may not release memory.
136
     * @stable ICU 59
137
     */
138
    void reset() U_NOEXCEPT;
139
140
    /**
141
     * Adds a no-change edit: a record for an unchanged segment of text.
142
     * Normally called from inside ICU string transformation functions, not user code.
143
     * @stable ICU 59
144
     */
145
    void addUnchanged(int32_t unchangedLength);
146
    /**
147
     * Adds a change edit: a record for a text replacement/insertion/deletion.
148
     * Normally called from inside ICU string transformation functions, not user code.
149
     * @stable ICU 59
150
     */
151
    void addReplace(int32_t oldLength, int32_t newLength);
152
    /**
153
     * Sets the UErrorCode if an error occurred while recording edits.
154
     * Preserves older error codes in the outErrorCode.
155
     * Normally called from inside ICU string transformation functions, not user code.
156
     * @param outErrorCode Set to an error code if it does not contain one already
157
     *                  and an error occurred while recording edits.
158
     *                  Otherwise unchanged.
159
     * @return TRUE if U_FAILURE(outErrorCode)
160
     * @stable ICU 59
161
     */
162
    UBool copyErrorTo(UErrorCode &outErrorCode);
163
164
    /**
165
     * How much longer is the new text compared with the old text?
166
     * @return new length minus old length
167
     * @stable ICU 59
168
     */
169
0
    int32_t lengthDelta() const { return delta; }
170
    /**
171
     * @return TRUE if there are any change edits
172
     * @stable ICU 59
173
     */
174
0
    UBool hasChanges() const { return numChanges != 0; }
175
176
#ifndef U_HIDE_DRAFT_API
177
    /**
178
     * @return the number of change edits
179
     * @draft ICU 60
180
     */
181
0
    int32_t numberOfChanges() const { return numChanges; }
182
#endif  // U_HIDE_DRAFT_API
183
184
    /**
185
     * Access to the list of edits.
186
     *
187
     * At any moment in time, an instance of this class points to a single edit: a "window" into a span
188
     * of the source string and the corresponding span of the destination string. The source string span
189
     * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string
190
     * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars.
191
     *
192
     * The iterator can be moved between edits using the {@link #next()}, {@link #findSourceIndex(int)},
193
     * and {@link #findDestinationIndex(int)} methods. Calling any of these methods mutates the iterator
194
     * to make it point to the corresponding edit.
195
     *
196
     * For more information, see the documentation for {@link Edits}.
197
     *
198
     * @see getCoarseIterator
199
     * @see getFineIterator
200
     * @stable ICU 59
201
     */
202
    struct U_COMMON_API Iterator U_FINAL : public UMemory {
203
        /**
204
         * Default constructor, empty iterator.
205
         * @draft ICU 60
206
         */
207
        Iterator() :
208
                array(nullptr), index(0), length(0),
209
                remaining(0), onlyChanges_(FALSE), coarse(FALSE),
210
                dir(0), changed(FALSE), oldLength_(0), newLength_(0),
211
0
                srcIndex(0), replIndex(0), destIndex(0) {}
212
        /**
213
         * Copy constructor.
214
         * @stable ICU 59
215
         */
216
        Iterator(const Iterator &other) = default;
217
        /**
218
         * Assignment operator.
219
         * @stable ICU 59
220
         */
221
        Iterator &operator=(const Iterator &other) = default;
222
223
        /**
224
         * Advances the iterator to the next edit.
225
         * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
226
         *                  or else the function returns immediately. Check for U_FAILURE()
227
         *                  on output or use with function chaining. (See User Guide for details.)
228
         * @return TRUE if there is another edit
229
         * @stable ICU 59
230
         */
231
0
        UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); }
232
233
        /**
234
         * Moves the iterator to the edit that contains the source index.
235
         * The source index may be found in a no-change edit
236
         * even if normal iteration would skip no-change edits.
237
         * Normal iteration can continue from a found edit.
238
         *
239
         * The iterator state before this search logically does not matter.
240
         * (It may affect the performance of the search.)
241
         *
242
         * The iterator state after this search is undefined
243
         * if the source index is out of bounds for the source string.
244
         *
245
         * @param i source index
246
         * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
247
         *                  or else the function returns immediately. Check for U_FAILURE()
248
         *                  on output or use with function chaining. (See User Guide for details.)
249
         * @return TRUE if the edit for the source index was found
250
         * @stable ICU 59
251
         */
252
0
        UBool findSourceIndex(int32_t i, UErrorCode &errorCode) {
253
0
            return findIndex(i, TRUE, errorCode) == 0;
254
0
        }
255
256
#ifndef U_HIDE_DRAFT_API
257
        /**
258
         * Moves the iterator to the edit that contains the destination index.
259
         * The destination index may be found in a no-change edit
260
         * even if normal iteration would skip no-change edits.
261
         * Normal iteration can continue from a found edit.
262
         *
263
         * The iterator state before this search logically does not matter.
264
         * (It may affect the performance of the search.)
265
         *
266
         * The iterator state after this search is undefined
267
         * if the source index is out of bounds for the source string.
268
         *
269
         * @param i destination index
270
         * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
271
         *                  or else the function returns immediately. Check for U_FAILURE()
272
         *                  on output or use with function chaining. (See User Guide for details.)
273
         * @return TRUE if the edit for the destination index was found
274
         * @draft ICU 60
275
         */
276
0
        UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) {
277
0
            return findIndex(i, FALSE, errorCode) == 0;
278
0
        }
279
280
        /**
281
         * Computes the destination index corresponding to the given source index.
282
         * If the source index is inside a change edit (not at its start),
283
         * then the destination index at the end of that edit is returned,
284
         * since there is no information about index mapping inside a change edit.
285
         *
286
         * (This means that indexes to the start and middle of an edit,
287
         * for example around a grapheme cluster, are mapped to indexes
288
         * encompassing the entire edit.
289
         * The alternative, mapping an interior index to the start,
290
         * would map such an interval to an empty one.)
291
         *
292
         * This operation will usually but not always modify this object.
293
         * The iterator state after this search is undefined.
294
         *
295
         * @param i source index
296
         * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
297
         *                  or else the function returns immediately. Check for U_FAILURE()
298
         *                  on output or use with function chaining. (See User Guide for details.)
299
         * @return destination index; undefined if i is not 0..string length
300
         * @draft ICU 60
301
         */
302
        int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode);
303
304
        /**
305
         * Computes the source index corresponding to the given destination index.
306
         * If the destination index is inside a change edit (not at its start),
307
         * then the source index at the end of that edit is returned,
308
         * since there is no information about index mapping inside a change edit.
309
         *
310
         * (This means that indexes to the start and middle of an edit,
311
         * for example around a grapheme cluster, are mapped to indexes
312
         * encompassing the entire edit.
313
         * The alternative, mapping an interior index to the start,
314
         * would map such an interval to an empty one.)
315
         *
316
         * This operation will usually but not always modify this object.
317
         * The iterator state after this search is undefined.
318
         *
319
         * @param i destination index
320
         * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
321
         *                  or else the function returns immediately. Check for U_FAILURE()
322
         *                  on output or use with function chaining. (See User Guide for details.)
323
         * @return source index; undefined if i is not 0..string length
324
         * @draft ICU 60
325
         */
326
        int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode);
327
#endif  // U_HIDE_DRAFT_API
328
329
        /**
330
         * Returns whether the edit currently represented by the iterator is a change edit.
331
         *
332
         * @return TRUE if this edit replaces oldLength() units with newLength() different ones.
333
         *         FALSE if oldLength units remain unchanged.
334
         * @stable ICU 59
335
         */
336
0
        UBool hasChange() const { return changed; }
337
338
        /**
339
         * The length of the current span in the source string, which starts at {@link #sourceIndex}.
340
         *
341
         * @return the number of units in the original string which are replaced or remain unchanged.
342
         * @stable ICU 59
343
         */
344
0
        int32_t oldLength() const { return oldLength_; }
345
346
        /**
347
         * The length of the current span in the destination string, which starts at
348
         * {@link #destinationIndex}, or in the replacement string, which starts at
349
         * {@link #replacementIndex}.
350
         *
351
         * @return the number of units in the modified string, if hasChange() is TRUE.
352
         *         Same as oldLength if hasChange() is FALSE.
353
         * @stable ICU 59
354
         */
355
0
        int32_t newLength() const { return newLength_; }
356
357
        /**
358
         * The start index of the current span in the source string; the span has length
359
         * {@link #oldLength}.
360
         *
361
         * @return the current index into the source string
362
         * @stable ICU 59
363
         */
364
0
        int32_t sourceIndex() const { return srcIndex; }
365
366
        /**
367
         * The start index of the current span in the replacement string; the span has length
368
         * {@link #newLength}. Well-defined only if the current edit is a change edit.
369
         * <p>
370
         * The <em>replacement string</em> is the concatenation of all substrings of the destination
371
         * string corresponding to change edits.
372
         * <p>
373
         * This method is intended to be used together with operations that write only replacement
374
         * characters (e.g., {@link CaseMap#omitUnchangedText()}). The source string can then be modified
375
         * in-place.
376
         *
377
         * @return the current index into the replacement-characters-only string,
378
         *         not counting unchanged spans
379
         * @stable ICU 59
380
         */
381
0
        int32_t replacementIndex() const {
382
0
            // TODO: Throw an exception if we aren't in a change edit?
383
0
            return replIndex;
384
0
        }
385
386
        /**
387
         * The start index of the current span in the destination string; the span has length
388
         * {@link #newLength}.
389
         *
390
         * @return the current index into the full destination string
391
         * @stable ICU 59
392
         */
393
0
        int32_t destinationIndex() const { return destIndex; }
394
395
#ifndef U_HIDE_INTERNAL_API
396
        /**
397
         * A string representation of the current edit represented by the iterator for debugging. You
398
         * should not depend on the contents of the return string.
399
         * @internal
400
         */
401
        UnicodeString& toString(UnicodeString& appendTo) const;
402
#endif  // U_HIDE_INTERNAL_API
403
404
    private:
405
        friend class Edits;
406
407
        Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
408
409
        int32_t readLength(int32_t head);
410
        void updateNextIndexes();
411
        void updatePreviousIndexes();
412
        UBool noNext();
413
        UBool next(UBool onlyChanges, UErrorCode &errorCode);
414
        UBool previous(UErrorCode &errorCode);
415
        /** @return -1: error or i<0; 0: found; 1: i>=string length */
416
        int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);
417
418
        const uint16_t *array;
419
        int32_t index, length;
420
        // 0 if we are not within compressed equal-length changes.
421
        // Otherwise the number of remaining changes, including the current one.
422
        int32_t remaining;
423
        UBool onlyChanges_, coarse;
424
425
        int8_t dir;  // iteration direction: back(<0), initial(0), forward(>0)
426
        UBool changed;
427
        int32_t oldLength_, newLength_;
428
        int32_t srcIndex, replIndex, destIndex;
429
    };
430
431
    /**
432
     * Returns an Iterator for coarse-grained change edits
433
     * (adjacent change edits are treated as one).
434
     * Can be used to perform simple string updates.
435
     * Skips no-change edits.
436
     * @return an Iterator that merges adjacent changes.
437
     * @stable ICU 59
438
     */
439
0
    Iterator getCoarseChangesIterator() const {
440
0
        return Iterator(array, length, TRUE, TRUE);
441
0
    }
442
443
    /**
444
     * Returns an Iterator for coarse-grained change and no-change edits
445
     * (adjacent change edits are treated as one).
446
     * Can be used to perform simple string updates.
447
     * Adjacent change edits are treated as one edit.
448
     * @return an Iterator that merges adjacent changes.
449
     * @stable ICU 59
450
     */
451
0
    Iterator getCoarseIterator() const {
452
0
        return Iterator(array, length, FALSE, TRUE);
453
0
    }
454
455
    /**
456
     * Returns an Iterator for fine-grained change edits
457
     * (full granularity of change edits is retained).
458
     * Can be used for modifying styled text.
459
     * Skips no-change edits.
460
     * @return an Iterator that separates adjacent changes.
461
     * @stable ICU 59
462
     */
463
0
    Iterator getFineChangesIterator() const {
464
0
        return Iterator(array, length, TRUE, FALSE);
465
0
    }
466
467
    /**
468
     * Returns an Iterator for fine-grained change and no-change edits
469
     * (full granularity of change edits is retained).
470
     * Can be used for modifying styled text.
471
     * @return an Iterator that separates adjacent changes.
472
     * @stable ICU 59
473
     */
474
0
    Iterator getFineIterator() const {
475
0
        return Iterator(array, length, FALSE, FALSE);
476
0
    }
477
478
#ifndef U_HIDE_DRAFT_API
479
    /**
480
     * Merges the two input Edits and appends the result to this object.
481
     *
482
     * Consider two string transformations (for example, normalization and case mapping)
483
     * where each records Edits in addition to writing an output string.<br>
484
     * Edits ab reflect how substrings of input string a
485
     * map to substrings of intermediate string b.<br>
486
     * Edits bc reflect how substrings of intermediate string b
487
     * map to substrings of output string c.<br>
488
     * This function merges ab and bc such that the additional edits
489
     * recorded in this object reflect how substrings of input string a
490
     * map to substrings of output string c.
491
     *
492
     * If unrelated Edits are passed in where the output string of the first
493
     * has a different length than the input string of the second,
494
     * then a U_ILLEGAL_ARGUMENT_ERROR is reported.
495
     *
496
     * @param ab reflects how substrings of input string a
497
     *     map to substrings of intermediate string b.
498
     * @param bc reflects how substrings of intermediate string b
499
     *     map to substrings of output string c.
500
     * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
501
     *                  or else the function returns immediately. Check for U_FAILURE()
502
     *                  on output or use with function chaining. (See User Guide for details.)
503
     * @return *this, with the merged edits appended
504
     * @draft ICU 60
505
     */
506
    Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode);
507
#endif  // U_HIDE_DRAFT_API
508
509
private:
510
    void releaseArray() U_NOEXCEPT;
511
    Edits &copyArray(const Edits &other);
512
    Edits &moveArray(Edits &src) U_NOEXCEPT;
513
514
0
    void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }
515
0
    int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }
516
517
    void append(int32_t r);
518
    UBool growArray();
519
520
    static const int32_t STACK_CAPACITY = 100;
521
    uint16_t *array;
522
    int32_t capacity;
523
    int32_t length;
524
    int32_t delta;
525
    int32_t numChanges;
526
    UErrorCode errorCode_;
527
    uint16_t stackArray[STACK_CAPACITY];
528
};
529
530
U_NAMESPACE_END
531
532
#endif  // __EDITS_H__