/src/mozilla-central/intl/icu/source/common/unicode/edits.h
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | |
4 | | // edits.h |
5 | | // created: 2016dec30 Markus W. Scherer |
6 | | |
7 | | #ifndef __EDITS_H__ |
8 | | #define __EDITS_H__ |
9 | | |
10 | | #include "unicode/utypes.h" |
11 | | #include "unicode/uobject.h" |
12 | | |
13 | | /** |
14 | | * \file |
15 | | * \brief C++ API: C++ class Edits for low-level string transformations on styled text. |
16 | | */ |
17 | | |
18 | | U_NAMESPACE_BEGIN |
19 | | |
20 | | class UnicodeString; |
21 | | |
22 | | /** |
23 | | * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions |
24 | | * in linear progression. Does not support moving/reordering of text. |
25 | | * |
26 | | * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to |
27 | | * instances of this class using {@link #addReplace(int, int)} (for change edits) and |
28 | | * {@link #addUnchanged(int)} (for no-change edits). Change edits are retained with full granularity, |
29 | | * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one |
30 | | * mapping between code points in the source and destination strings. |
31 | | * |
32 | | * After all edits have been added, instances of this class should be considered immutable, and an |
33 | | * {@link Edits::Iterator} can be used for queries. |
34 | | * |
35 | | * There are four flavors of Edits::Iterator: |
36 | | * |
37 | | * <ul> |
38 | | * <li>{@link #getFineIterator()} retains full granularity of change edits. |
39 | | * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling |
40 | | * next() on the iterator, skips over no-change edits (unchanged regions). |
41 | | * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change |
42 | | * edits are automatically merged during the construction phase.) |
43 | | * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when |
44 | | * calling next() on the iterator, skips over no-change edits (unchanged regions). |
45 | | * </ul> |
46 | | * |
47 | | * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the |
48 | | * following fine edits: |
49 | | * <ul> |
50 | | * <li>abc ⇨ abc (no-change) |
51 | | * <li>ß ⇨ ss (change) |
52 | | * <li>D ⇨ d (change) |
53 | | * <li>e ⇨ e (no-change) |
54 | | * <li>F ⇨ f (change) |
55 | | * </ul> |
56 | | * and the following coarse edits (note how adjacent change edits get merged together): |
57 | | * <ul> |
58 | | * <li>abc ⇨ abc (no-change) |
59 | | * <li>ßD ⇨ ssd (change) |
60 | | * <li>e ⇨ e (no-change) |
61 | | * <li>F ⇨ f (change) |
62 | | * </ul> |
63 | | * |
64 | | * The "fine changes" and "coarse changes" iterators will step through only the change edits when their |
65 | | * {@link Edits::Iterator#next()} methods are called. They are identical to the non-change iterators when |
66 | | * their {@link Edits::Iterator#findSourceIndex(int)} or {@link Edits::Iterator#findDestinationIndex(int)} |
67 | | * methods are used to walk through the string. |
68 | | * |
69 | | * For examples of how to use this class, see the test <code>TestCaseMapEditsIteratorDocs</code> in |
70 | | * UCharacterCaseTest.java. |
71 | | * |
72 | | * An Edits object tracks a separate UErrorCode, but ICU string transformation functions |
73 | | * (e.g., case mapping functions) merge any such errors into their API's UErrorCode. |
74 | | * |
75 | | * @stable ICU 59 |
76 | | */ |
77 | | class U_COMMON_API Edits U_FINAL : public UMemory { |
78 | | public: |
79 | | /** |
80 | | * Constructs an empty object. |
81 | | * @stable ICU 59 |
82 | | */ |
83 | | Edits() : |
84 | | array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0), |
85 | 0 | errorCode_(U_ZERO_ERROR) {} |
86 | | /** |
87 | | * Copy constructor. |
88 | | * @param other source edits |
89 | | * @draft ICU 60 |
90 | | */ |
91 | | Edits(const Edits &other) : |
92 | | array(stackArray), capacity(STACK_CAPACITY), length(other.length), |
93 | | delta(other.delta), numChanges(other.numChanges), |
94 | 0 | errorCode_(other.errorCode_) { |
95 | 0 | copyArray(other); |
96 | 0 | } |
97 | | /** |
98 | | * Move constructor, might leave src empty. |
99 | | * This object will have the same contents that the source object had. |
100 | | * @param src source edits |
101 | | * @draft ICU 60 |
102 | | */ |
103 | | Edits(Edits &&src) U_NOEXCEPT : |
104 | | array(stackArray), capacity(STACK_CAPACITY), length(src.length), |
105 | | delta(src.delta), numChanges(src.numChanges), |
106 | 0 | errorCode_(src.errorCode_) { |
107 | 0 | moveArray(src); |
108 | 0 | } |
109 | | |
110 | | /** |
111 | | * Destructor. |
112 | | * @stable ICU 59 |
113 | | */ |
114 | | ~Edits(); |
115 | | |
116 | | /** |
117 | | * Assignment operator. |
118 | | * @param other source edits |
119 | | * @return *this |
120 | | * @draft ICU 60 |
121 | | */ |
122 | | Edits &operator=(const Edits &other); |
123 | | |
124 | | /** |
125 | | * Move assignment operator, might leave src empty. |
126 | | * This object will have the same contents that the source object had. |
127 | | * The behavior is undefined if *this and src are the same object. |
128 | | * @param src source edits |
129 | | * @return *this |
130 | | * @draft ICU 60 |
131 | | */ |
132 | | Edits &operator=(Edits &&src) U_NOEXCEPT; |
133 | | |
134 | | /** |
135 | | * Resets the data but may not release memory. |
136 | | * @stable ICU 59 |
137 | | */ |
138 | | void reset() U_NOEXCEPT; |
139 | | |
140 | | /** |
141 | | * Adds a no-change edit: a record for an unchanged segment of text. |
142 | | * Normally called from inside ICU string transformation functions, not user code. |
143 | | * @stable ICU 59 |
144 | | */ |
145 | | void addUnchanged(int32_t unchangedLength); |
146 | | /** |
147 | | * Adds a change edit: a record for a text replacement/insertion/deletion. |
148 | | * Normally called from inside ICU string transformation functions, not user code. |
149 | | * @stable ICU 59 |
150 | | */ |
151 | | void addReplace(int32_t oldLength, int32_t newLength); |
152 | | /** |
153 | | * Sets the UErrorCode if an error occurred while recording edits. |
154 | | * Preserves older error codes in the outErrorCode. |
155 | | * Normally called from inside ICU string transformation functions, not user code. |
156 | | * @param outErrorCode Set to an error code if it does not contain one already |
157 | | * and an error occurred while recording edits. |
158 | | * Otherwise unchanged. |
159 | | * @return TRUE if U_FAILURE(outErrorCode) |
160 | | * @stable ICU 59 |
161 | | */ |
162 | | UBool copyErrorTo(UErrorCode &outErrorCode); |
163 | | |
164 | | /** |
165 | | * How much longer is the new text compared with the old text? |
166 | | * @return new length minus old length |
167 | | * @stable ICU 59 |
168 | | */ |
169 | 0 | int32_t lengthDelta() const { return delta; } |
170 | | /** |
171 | | * @return TRUE if there are any change edits |
172 | | * @stable ICU 59 |
173 | | */ |
174 | 0 | UBool hasChanges() const { return numChanges != 0; } |
175 | | |
176 | | #ifndef U_HIDE_DRAFT_API |
177 | | /** |
178 | | * @return the number of change edits |
179 | | * @draft ICU 60 |
180 | | */ |
181 | 0 | int32_t numberOfChanges() const { return numChanges; } |
182 | | #endif // U_HIDE_DRAFT_API |
183 | | |
184 | | /** |
185 | | * Access to the list of edits. |
186 | | * |
187 | | * At any moment in time, an instance of this class points to a single edit: a "window" into a span |
188 | | * of the source string and the corresponding span of the destination string. The source string span |
189 | | * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string |
190 | | * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars. |
191 | | * |
192 | | * The iterator can be moved between edits using the {@link #next()}, {@link #findSourceIndex(int)}, |
193 | | * and {@link #findDestinationIndex(int)} methods. Calling any of these methods mutates the iterator |
194 | | * to make it point to the corresponding edit. |
195 | | * |
196 | | * For more information, see the documentation for {@link Edits}. |
197 | | * |
198 | | * @see getCoarseIterator |
199 | | * @see getFineIterator |
200 | | * @stable ICU 59 |
201 | | */ |
202 | | struct U_COMMON_API Iterator U_FINAL : public UMemory { |
203 | | /** |
204 | | * Default constructor, empty iterator. |
205 | | * @draft ICU 60 |
206 | | */ |
207 | | Iterator() : |
208 | | array(nullptr), index(0), length(0), |
209 | | remaining(0), onlyChanges_(FALSE), coarse(FALSE), |
210 | | dir(0), changed(FALSE), oldLength_(0), newLength_(0), |
211 | 0 | srcIndex(0), replIndex(0), destIndex(0) {} |
212 | | /** |
213 | | * Copy constructor. |
214 | | * @stable ICU 59 |
215 | | */ |
216 | | Iterator(const Iterator &other) = default; |
217 | | /** |
218 | | * Assignment operator. |
219 | | * @stable ICU 59 |
220 | | */ |
221 | | Iterator &operator=(const Iterator &other) = default; |
222 | | |
223 | | /** |
224 | | * Advances the iterator to the next edit. |
225 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
226 | | * or else the function returns immediately. Check for U_FAILURE() |
227 | | * on output or use with function chaining. (See User Guide for details.) |
228 | | * @return TRUE if there is another edit |
229 | | * @stable ICU 59 |
230 | | */ |
231 | 0 | UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); } |
232 | | |
233 | | /** |
234 | | * Moves the iterator to the edit that contains the source index. |
235 | | * The source index may be found in a no-change edit |
236 | | * even if normal iteration would skip no-change edits. |
237 | | * Normal iteration can continue from a found edit. |
238 | | * |
239 | | * The iterator state before this search logically does not matter. |
240 | | * (It may affect the performance of the search.) |
241 | | * |
242 | | * The iterator state after this search is undefined |
243 | | * if the source index is out of bounds for the source string. |
244 | | * |
245 | | * @param i source index |
246 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
247 | | * or else the function returns immediately. Check for U_FAILURE() |
248 | | * on output or use with function chaining. (See User Guide for details.) |
249 | | * @return TRUE if the edit for the source index was found |
250 | | * @stable ICU 59 |
251 | | */ |
252 | 0 | UBool findSourceIndex(int32_t i, UErrorCode &errorCode) { |
253 | 0 | return findIndex(i, TRUE, errorCode) == 0; |
254 | 0 | } |
255 | | |
256 | | #ifndef U_HIDE_DRAFT_API |
257 | | /** |
258 | | * Moves the iterator to the edit that contains the destination index. |
259 | | * The destination index may be found in a no-change edit |
260 | | * even if normal iteration would skip no-change edits. |
261 | | * Normal iteration can continue from a found edit. |
262 | | * |
263 | | * The iterator state before this search logically does not matter. |
264 | | * (It may affect the performance of the search.) |
265 | | * |
266 | | * The iterator state after this search is undefined |
267 | | * if the source index is out of bounds for the source string. |
268 | | * |
269 | | * @param i destination index |
270 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
271 | | * or else the function returns immediately. Check for U_FAILURE() |
272 | | * on output or use with function chaining. (See User Guide for details.) |
273 | | * @return TRUE if the edit for the destination index was found |
274 | | * @draft ICU 60 |
275 | | */ |
276 | 0 | UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) { |
277 | 0 | return findIndex(i, FALSE, errorCode) == 0; |
278 | 0 | } |
279 | | |
280 | | /** |
281 | | * Computes the destination index corresponding to the given source index. |
282 | | * If the source index is inside a change edit (not at its start), |
283 | | * then the destination index at the end of that edit is returned, |
284 | | * since there is no information about index mapping inside a change edit. |
285 | | * |
286 | | * (This means that indexes to the start and middle of an edit, |
287 | | * for example around a grapheme cluster, are mapped to indexes |
288 | | * encompassing the entire edit. |
289 | | * The alternative, mapping an interior index to the start, |
290 | | * would map such an interval to an empty one.) |
291 | | * |
292 | | * This operation will usually but not always modify this object. |
293 | | * The iterator state after this search is undefined. |
294 | | * |
295 | | * @param i source index |
296 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
297 | | * or else the function returns immediately. Check for U_FAILURE() |
298 | | * on output or use with function chaining. (See User Guide for details.) |
299 | | * @return destination index; undefined if i is not 0..string length |
300 | | * @draft ICU 60 |
301 | | */ |
302 | | int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode); |
303 | | |
304 | | /** |
305 | | * Computes the source index corresponding to the given destination index. |
306 | | * If the destination index is inside a change edit (not at its start), |
307 | | * then the source index at the end of that edit is returned, |
308 | | * since there is no information about index mapping inside a change edit. |
309 | | * |
310 | | * (This means that indexes to the start and middle of an edit, |
311 | | * for example around a grapheme cluster, are mapped to indexes |
312 | | * encompassing the entire edit. |
313 | | * The alternative, mapping an interior index to the start, |
314 | | * would map such an interval to an empty one.) |
315 | | * |
316 | | * This operation will usually but not always modify this object. |
317 | | * The iterator state after this search is undefined. |
318 | | * |
319 | | * @param i destination index |
320 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
321 | | * or else the function returns immediately. Check for U_FAILURE() |
322 | | * on output or use with function chaining. (See User Guide for details.) |
323 | | * @return source index; undefined if i is not 0..string length |
324 | | * @draft ICU 60 |
325 | | */ |
326 | | int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode); |
327 | | #endif // U_HIDE_DRAFT_API |
328 | | |
329 | | /** |
330 | | * Returns whether the edit currently represented by the iterator is a change edit. |
331 | | * |
332 | | * @return TRUE if this edit replaces oldLength() units with newLength() different ones. |
333 | | * FALSE if oldLength units remain unchanged. |
334 | | * @stable ICU 59 |
335 | | */ |
336 | 0 | UBool hasChange() const { return changed; } |
337 | | |
338 | | /** |
339 | | * The length of the current span in the source string, which starts at {@link #sourceIndex}. |
340 | | * |
341 | | * @return the number of units in the original string which are replaced or remain unchanged. |
342 | | * @stable ICU 59 |
343 | | */ |
344 | 0 | int32_t oldLength() const { return oldLength_; } |
345 | | |
346 | | /** |
347 | | * The length of the current span in the destination string, which starts at |
348 | | * {@link #destinationIndex}, or in the replacement string, which starts at |
349 | | * {@link #replacementIndex}. |
350 | | * |
351 | | * @return the number of units in the modified string, if hasChange() is TRUE. |
352 | | * Same as oldLength if hasChange() is FALSE. |
353 | | * @stable ICU 59 |
354 | | */ |
355 | 0 | int32_t newLength() const { return newLength_; } |
356 | | |
357 | | /** |
358 | | * The start index of the current span in the source string; the span has length |
359 | | * {@link #oldLength}. |
360 | | * |
361 | | * @return the current index into the source string |
362 | | * @stable ICU 59 |
363 | | */ |
364 | 0 | int32_t sourceIndex() const { return srcIndex; } |
365 | | |
366 | | /** |
367 | | * The start index of the current span in the replacement string; the span has length |
368 | | * {@link #newLength}. Well-defined only if the current edit is a change edit. |
369 | | * <p> |
370 | | * The <em>replacement string</em> is the concatenation of all substrings of the destination |
371 | | * string corresponding to change edits. |
372 | | * <p> |
373 | | * This method is intended to be used together with operations that write only replacement |
374 | | * characters (e.g., {@link CaseMap#omitUnchangedText()}). The source string can then be modified |
375 | | * in-place. |
376 | | * |
377 | | * @return the current index into the replacement-characters-only string, |
378 | | * not counting unchanged spans |
379 | | * @stable ICU 59 |
380 | | */ |
381 | 0 | int32_t replacementIndex() const { |
382 | 0 | // TODO: Throw an exception if we aren't in a change edit? |
383 | 0 | return replIndex; |
384 | 0 | } |
385 | | |
386 | | /** |
387 | | * The start index of the current span in the destination string; the span has length |
388 | | * {@link #newLength}. |
389 | | * |
390 | | * @return the current index into the full destination string |
391 | | * @stable ICU 59 |
392 | | */ |
393 | 0 | int32_t destinationIndex() const { return destIndex; } |
394 | | |
395 | | #ifndef U_HIDE_INTERNAL_API |
396 | | /** |
397 | | * A string representation of the current edit represented by the iterator for debugging. You |
398 | | * should not depend on the contents of the return string. |
399 | | * @internal |
400 | | */ |
401 | | UnicodeString& toString(UnicodeString& appendTo) const; |
402 | | #endif // U_HIDE_INTERNAL_API |
403 | | |
404 | | private: |
405 | | friend class Edits; |
406 | | |
407 | | Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs); |
408 | | |
409 | | int32_t readLength(int32_t head); |
410 | | void updateNextIndexes(); |
411 | | void updatePreviousIndexes(); |
412 | | UBool noNext(); |
413 | | UBool next(UBool onlyChanges, UErrorCode &errorCode); |
414 | | UBool previous(UErrorCode &errorCode); |
415 | | /** @return -1: error or i<0; 0: found; 1: i>=string length */ |
416 | | int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode); |
417 | | |
418 | | const uint16_t *array; |
419 | | int32_t index, length; |
420 | | // 0 if we are not within compressed equal-length changes. |
421 | | // Otherwise the number of remaining changes, including the current one. |
422 | | int32_t remaining; |
423 | | UBool onlyChanges_, coarse; |
424 | | |
425 | | int8_t dir; // iteration direction: back(<0), initial(0), forward(>0) |
426 | | UBool changed; |
427 | | int32_t oldLength_, newLength_; |
428 | | int32_t srcIndex, replIndex, destIndex; |
429 | | }; |
430 | | |
431 | | /** |
432 | | * Returns an Iterator for coarse-grained change edits |
433 | | * (adjacent change edits are treated as one). |
434 | | * Can be used to perform simple string updates. |
435 | | * Skips no-change edits. |
436 | | * @return an Iterator that merges adjacent changes. |
437 | | * @stable ICU 59 |
438 | | */ |
439 | 0 | Iterator getCoarseChangesIterator() const { |
440 | 0 | return Iterator(array, length, TRUE, TRUE); |
441 | 0 | } |
442 | | |
443 | | /** |
444 | | * Returns an Iterator for coarse-grained change and no-change edits |
445 | | * (adjacent change edits are treated as one). |
446 | | * Can be used to perform simple string updates. |
447 | | * Adjacent change edits are treated as one edit. |
448 | | * @return an Iterator that merges adjacent changes. |
449 | | * @stable ICU 59 |
450 | | */ |
451 | 0 | Iterator getCoarseIterator() const { |
452 | 0 | return Iterator(array, length, FALSE, TRUE); |
453 | 0 | } |
454 | | |
455 | | /** |
456 | | * Returns an Iterator for fine-grained change edits |
457 | | * (full granularity of change edits is retained). |
458 | | * Can be used for modifying styled text. |
459 | | * Skips no-change edits. |
460 | | * @return an Iterator that separates adjacent changes. |
461 | | * @stable ICU 59 |
462 | | */ |
463 | 0 | Iterator getFineChangesIterator() const { |
464 | 0 | return Iterator(array, length, TRUE, FALSE); |
465 | 0 | } |
466 | | |
467 | | /** |
468 | | * Returns an Iterator for fine-grained change and no-change edits |
469 | | * (full granularity of change edits is retained). |
470 | | * Can be used for modifying styled text. |
471 | | * @return an Iterator that separates adjacent changes. |
472 | | * @stable ICU 59 |
473 | | */ |
474 | 0 | Iterator getFineIterator() const { |
475 | 0 | return Iterator(array, length, FALSE, FALSE); |
476 | 0 | } |
477 | | |
478 | | #ifndef U_HIDE_DRAFT_API |
479 | | /** |
480 | | * Merges the two input Edits and appends the result to this object. |
481 | | * |
482 | | * Consider two string transformations (for example, normalization and case mapping) |
483 | | * where each records Edits in addition to writing an output string.<br> |
484 | | * Edits ab reflect how substrings of input string a |
485 | | * map to substrings of intermediate string b.<br> |
486 | | * Edits bc reflect how substrings of intermediate string b |
487 | | * map to substrings of output string c.<br> |
488 | | * This function merges ab and bc such that the additional edits |
489 | | * recorded in this object reflect how substrings of input string a |
490 | | * map to substrings of output string c. |
491 | | * |
492 | | * If unrelated Edits are passed in where the output string of the first |
493 | | * has a different length than the input string of the second, |
494 | | * then a U_ILLEGAL_ARGUMENT_ERROR is reported. |
495 | | * |
496 | | * @param ab reflects how substrings of input string a |
497 | | * map to substrings of intermediate string b. |
498 | | * @param bc reflects how substrings of intermediate string b |
499 | | * map to substrings of output string c. |
500 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
501 | | * or else the function returns immediately. Check for U_FAILURE() |
502 | | * on output or use with function chaining. (See User Guide for details.) |
503 | | * @return *this, with the merged edits appended |
504 | | * @draft ICU 60 |
505 | | */ |
506 | | Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode); |
507 | | #endif // U_HIDE_DRAFT_API |
508 | | |
509 | | private: |
510 | | void releaseArray() U_NOEXCEPT; |
511 | | Edits ©Array(const Edits &other); |
512 | | Edits &moveArray(Edits &src) U_NOEXCEPT; |
513 | | |
514 | 0 | void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; } |
515 | 0 | int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; } |
516 | | |
517 | | void append(int32_t r); |
518 | | UBool growArray(); |
519 | | |
520 | | static const int32_t STACK_CAPACITY = 100; |
521 | | uint16_t *array; |
522 | | int32_t capacity; |
523 | | int32_t length; |
524 | | int32_t delta; |
525 | | int32_t numChanges; |
526 | | UErrorCode errorCode_; |
527 | | uint16_t stackArray[STACK_CAPACITY]; |
528 | | }; |
529 | | |
530 | | U_NAMESPACE_END |
531 | | |
532 | | #endif // __EDITS_H__ |