/src/icu/source/common/unicode/edits.h
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | |
4 | | // edits.h |
5 | | // created: 2016dec30 Markus W. Scherer |
6 | | |
7 | | #ifndef __EDITS_H__ |
8 | | #define __EDITS_H__ |
9 | | |
10 | | #include "unicode/utypes.h" |
11 | | |
12 | | #if U_SHOW_CPLUSPLUS_API |
13 | | |
14 | | #include "unicode/uobject.h" |
15 | | |
16 | | /** |
17 | | * \file |
18 | | * \brief C++ API: C++ class Edits for low-level string transformations on styled text. |
19 | | */ |
20 | | |
21 | | U_NAMESPACE_BEGIN |
22 | | |
23 | | class UnicodeString; |
24 | | |
25 | | /** |
26 | | * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions |
27 | | * in linear progression. Does not support moving/reordering of text. |
28 | | * |
29 | | * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to |
30 | | * instances of this class using {@link #addReplace(int32_t, int32_t)} (for change edits) and |
31 | | * {@link #addUnchanged(int32_t)} (for no-change edits). Change edits are retained with full granularity, |
32 | | * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one |
33 | | * mapping between code points in the source and destination strings. |
34 | | * |
35 | | * After all edits have been added, instances of this class should be considered immutable, and an |
36 | | * {@link Edits::Iterator} can be used for queries. |
37 | | * |
38 | | * There are four flavors of Edits::Iterator: |
39 | | * |
40 | | * <ul> |
41 | | * <li>{@link #getFineIterator()} retains full granularity of change edits. |
42 | | * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling |
43 | | * next() on the iterator, skips over no-change edits (unchanged regions). |
44 | | * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change |
45 | | * edits are automatically merged during the construction phase.) |
46 | | * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when |
47 | | * calling next() on the iterator, skips over no-change edits (unchanged regions). |
48 | | * </ul> |
49 | | * |
50 | | * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the |
51 | | * following fine edits: |
52 | | * <ul> |
53 | | * <li>abc ⇨ abc (no-change) |
54 | | * <li>ß ⇨ ss (change) |
55 | | * <li>D ⇨ d (change) |
56 | | * <li>e ⇨ e (no-change) |
57 | | * <li>F ⇨ f (change) |
58 | | * </ul> |
59 | | * and the following coarse edits (note how adjacent change edits get merged together): |
60 | | * <ul> |
61 | | * <li>abc ⇨ abc (no-change) |
62 | | * <li>ßD ⇨ ssd (change) |
63 | | * <li>e ⇨ e (no-change) |
64 | | * <li>F ⇨ f (change) |
65 | | * </ul> |
66 | | * |
67 | | * The "fine changes" and "coarse changes" iterators will step through only the change edits when their |
68 | | * `Edits::Iterator::next()` methods are called. They are identical to the non-change iterators when |
69 | | * their `Edits::Iterator::findSourceIndex()` or `Edits::Iterator::findDestinationIndex()` |
70 | | * methods are used to walk through the string. |
71 | | * |
72 | | * For examples of how to use this class, see the test `TestCaseMapEditsIteratorDocs` in |
73 | | * UCharacterCaseTest.java. |
74 | | * |
75 | | * An Edits object tracks a separate UErrorCode, but ICU string transformation functions |
76 | | * (e.g., case mapping functions) merge any such errors into their API's UErrorCode. |
77 | | * |
78 | | * @stable ICU 59 |
79 | | */ |
80 | | class U_COMMON_API Edits U_FINAL : public UMemory { |
81 | | public: |
82 | | /** |
83 | | * Constructs an empty object. |
84 | | * @stable ICU 59 |
85 | | */ |
86 | | Edits() : |
87 | 0 | array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0), |
88 | 0 | errorCode_(U_ZERO_ERROR) {} |
89 | | /** |
90 | | * Copy constructor. |
91 | | * @param other source edits |
92 | | * @stable ICU 60 |
93 | | */ |
94 | | Edits(const Edits &other) : |
95 | | array(stackArray), capacity(STACK_CAPACITY), length(other.length), |
96 | | delta(other.delta), numChanges(other.numChanges), |
97 | 0 | errorCode_(other.errorCode_) { |
98 | 0 | copyArray(other); |
99 | 0 | } |
100 | | /** |
101 | | * Move constructor, might leave src empty. |
102 | | * This object will have the same contents that the source object had. |
103 | | * @param src source edits |
104 | | * @stable ICU 60 |
105 | | */ |
106 | | Edits(Edits &&src) U_NOEXCEPT : |
107 | | array(stackArray), capacity(STACK_CAPACITY), length(src.length), |
108 | | delta(src.delta), numChanges(src.numChanges), |
109 | 0 | errorCode_(src.errorCode_) { |
110 | 0 | moveArray(src); |
111 | 0 | } |
112 | | |
113 | | /** |
114 | | * Destructor. |
115 | | * @stable ICU 59 |
116 | | */ |
117 | | ~Edits(); |
118 | | |
119 | | /** |
120 | | * Assignment operator. |
121 | | * @param other source edits |
122 | | * @return *this |
123 | | * @stable ICU 60 |
124 | | */ |
125 | | Edits &operator=(const Edits &other); |
126 | | |
127 | | /** |
128 | | * Move assignment operator, might leave src empty. |
129 | | * This object will have the same contents that the source object had. |
130 | | * The behavior is undefined if *this and src are the same object. |
131 | | * @param src source edits |
132 | | * @return *this |
133 | | * @stable ICU 60 |
134 | | */ |
135 | | Edits &operator=(Edits &&src) U_NOEXCEPT; |
136 | | |
137 | | /** |
138 | | * Resets the data but may not release memory. |
139 | | * @stable ICU 59 |
140 | | */ |
141 | | void reset() U_NOEXCEPT; |
142 | | |
143 | | /** |
144 | | * Adds a no-change edit: a record for an unchanged segment of text. |
145 | | * Normally called from inside ICU string transformation functions, not user code. |
146 | | * @stable ICU 59 |
147 | | */ |
148 | | void addUnchanged(int32_t unchangedLength); |
149 | | /** |
150 | | * Adds a change edit: a record for a text replacement/insertion/deletion. |
151 | | * Normally called from inside ICU string transformation functions, not user code. |
152 | | * @stable ICU 59 |
153 | | */ |
154 | | void addReplace(int32_t oldLength, int32_t newLength); |
155 | | /** |
156 | | * Sets the UErrorCode if an error occurred while recording edits. |
157 | | * Preserves older error codes in the outErrorCode. |
158 | | * Normally called from inside ICU string transformation functions, not user code. |
159 | | * @param outErrorCode Set to an error code if it does not contain one already |
160 | | * and an error occurred while recording edits. |
161 | | * Otherwise unchanged. |
162 | | * @return true if U_FAILURE(outErrorCode) |
163 | | * @stable ICU 59 |
164 | | */ |
165 | | UBool copyErrorTo(UErrorCode &outErrorCode) const; |
166 | | |
167 | | /** |
168 | | * How much longer is the new text compared with the old text? |
169 | | * @return new length minus old length |
170 | | * @stable ICU 59 |
171 | | */ |
172 | 0 | int32_t lengthDelta() const { return delta; } |
173 | | /** |
174 | | * @return true if there are any change edits |
175 | | * @stable ICU 59 |
176 | | */ |
177 | 0 | UBool hasChanges() const { return numChanges != 0; } |
178 | | |
179 | | /** |
180 | | * @return the number of change edits |
181 | | * @stable ICU 60 |
182 | | */ |
183 | 0 | int32_t numberOfChanges() const { return numChanges; } |
184 | | |
185 | | /** |
186 | | * Access to the list of edits. |
187 | | * |
188 | | * At any moment in time, an instance of this class points to a single edit: a "window" into a span |
189 | | * of the source string and the corresponding span of the destination string. The source string span |
190 | | * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string |
191 | | * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars. |
192 | | * |
193 | | * The iterator can be moved between edits using the `next()`, `findSourceIndex(int32_t, UErrorCode &)`, |
194 | | * and `findDestinationIndex(int32_t, UErrorCode &)` methods. |
195 | | * Calling any of these methods mutates the iterator to make it point to the corresponding edit. |
196 | | * |
197 | | * For more information, see the documentation for {@link Edits}. |
198 | | * |
199 | | * @see getCoarseIterator |
200 | | * @see getFineIterator |
201 | | * @stable ICU 59 |
202 | | */ |
203 | | struct U_COMMON_API Iterator U_FINAL : public UMemory { |
204 | | /** |
205 | | * Default constructor, empty iterator. |
206 | | * @stable ICU 60 |
207 | | */ |
208 | | Iterator() : |
209 | | array(nullptr), index(0), length(0), |
210 | | remaining(0), onlyChanges_(false), coarse(false), |
211 | | dir(0), changed(false), oldLength_(0), newLength_(0), |
212 | 0 | srcIndex(0), replIndex(0), destIndex(0) {} |
213 | | /** |
214 | | * Copy constructor. |
215 | | * @stable ICU 59 |
216 | | */ |
217 | | Iterator(const Iterator &other) = default; |
218 | | /** |
219 | | * Assignment operator. |
220 | | * @stable ICU 59 |
221 | | */ |
222 | | Iterator &operator=(const Iterator &other) = default; |
223 | | |
224 | | /** |
225 | | * Advances the iterator to the next edit. |
226 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
227 | | * or else the function returns immediately. Check for U_FAILURE() |
228 | | * on output or use with function chaining. (See User Guide for details.) |
229 | | * @return true if there is another edit |
230 | | * @stable ICU 59 |
231 | | */ |
232 | 0 | UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); } |
233 | | |
234 | | /** |
235 | | * Moves the iterator to the edit that contains the source index. |
236 | | * The source index may be found in a no-change edit |
237 | | * even if normal iteration would skip no-change edits. |
238 | | * Normal iteration can continue from a found edit. |
239 | | * |
240 | | * The iterator state before this search logically does not matter. |
241 | | * (It may affect the performance of the search.) |
242 | | * |
243 | | * The iterator state after this search is undefined |
244 | | * if the source index is out of bounds for the source string. |
245 | | * |
246 | | * @param i source index |
247 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
248 | | * or else the function returns immediately. Check for U_FAILURE() |
249 | | * on output or use with function chaining. (See User Guide for details.) |
250 | | * @return true if the edit for the source index was found |
251 | | * @stable ICU 59 |
252 | | */ |
253 | 0 | UBool findSourceIndex(int32_t i, UErrorCode &errorCode) { |
254 | 0 | return findIndex(i, true, errorCode) == 0; |
255 | 0 | } |
256 | | |
257 | | /** |
258 | | * Moves the iterator to the edit that contains the destination index. |
259 | | * The destination index may be found in a no-change edit |
260 | | * even if normal iteration would skip no-change edits. |
261 | | * Normal iteration can continue from a found edit. |
262 | | * |
263 | | * The iterator state before this search logically does not matter. |
264 | | * (It may affect the performance of the search.) |
265 | | * |
266 | | * The iterator state after this search is undefined |
267 | | * if the source index is out of bounds for the source string. |
268 | | * |
269 | | * @param i destination index |
270 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
271 | | * or else the function returns immediately. Check for U_FAILURE() |
272 | | * on output or use with function chaining. (See User Guide for details.) |
273 | | * @return true if the edit for the destination index was found |
274 | | * @stable ICU 60 |
275 | | */ |
276 | 0 | UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) { |
277 | 0 | return findIndex(i, false, errorCode) == 0; |
278 | 0 | } |
279 | | |
280 | | /** |
281 | | * Computes the destination index corresponding to the given source index. |
282 | | * If the source index is inside a change edit (not at its start), |
283 | | * then the destination index at the end of that edit is returned, |
284 | | * since there is no information about index mapping inside a change edit. |
285 | | * |
286 | | * (This means that indexes to the start and middle of an edit, |
287 | | * for example around a grapheme cluster, are mapped to indexes |
288 | | * encompassing the entire edit. |
289 | | * The alternative, mapping an interior index to the start, |
290 | | * would map such an interval to an empty one.) |
291 | | * |
292 | | * This operation will usually but not always modify this object. |
293 | | * The iterator state after this search is undefined. |
294 | | * |
295 | | * @param i source index |
296 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
297 | | * or else the function returns immediately. Check for U_FAILURE() |
298 | | * on output or use with function chaining. (See User Guide for details.) |
299 | | * @return destination index; undefined if i is not 0..string length |
300 | | * @stable ICU 60 |
301 | | */ |
302 | | int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode); |
303 | | |
304 | | /** |
305 | | * Computes the source index corresponding to the given destination index. |
306 | | * If the destination index is inside a change edit (not at its start), |
307 | | * then the source index at the end of that edit is returned, |
308 | | * since there is no information about index mapping inside a change edit. |
309 | | * |
310 | | * (This means that indexes to the start and middle of an edit, |
311 | | * for example around a grapheme cluster, are mapped to indexes |
312 | | * encompassing the entire edit. |
313 | | * The alternative, mapping an interior index to the start, |
314 | | * would map such an interval to an empty one.) |
315 | | * |
316 | | * This operation will usually but not always modify this object. |
317 | | * The iterator state after this search is undefined. |
318 | | * |
319 | | * @param i destination index |
320 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
321 | | * or else the function returns immediately. Check for U_FAILURE() |
322 | | * on output or use with function chaining. (See User Guide for details.) |
323 | | * @return source index; undefined if i is not 0..string length |
324 | | * @stable ICU 60 |
325 | | */ |
326 | | int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode); |
327 | | |
328 | | /** |
329 | | * Returns whether the edit currently represented by the iterator is a change edit. |
330 | | * |
331 | | * @return true if this edit replaces oldLength() units with newLength() different ones. |
332 | | * false if oldLength units remain unchanged. |
333 | | * @stable ICU 59 |
334 | | */ |
335 | 0 | UBool hasChange() const { return changed; } |
336 | | |
337 | | /** |
338 | | * The length of the current span in the source string, which starts at {@link #sourceIndex}. |
339 | | * |
340 | | * @return the number of units in the original string which are replaced or remain unchanged. |
341 | | * @stable ICU 59 |
342 | | */ |
343 | 0 | int32_t oldLength() const { return oldLength_; } |
344 | | |
345 | | /** |
346 | | * The length of the current span in the destination string, which starts at |
347 | | * {@link #destinationIndex}, or in the replacement string, which starts at |
348 | | * {@link #replacementIndex}. |
349 | | * |
350 | | * @return the number of units in the modified string, if hasChange() is true. |
351 | | * Same as oldLength if hasChange() is false. |
352 | | * @stable ICU 59 |
353 | | */ |
354 | 0 | int32_t newLength() const { return newLength_; } |
355 | | |
356 | | /** |
357 | | * The start index of the current span in the source string; the span has length |
358 | | * {@link #oldLength}. |
359 | | * |
360 | | * @return the current index into the source string |
361 | | * @stable ICU 59 |
362 | | */ |
363 | 0 | int32_t sourceIndex() const { return srcIndex; } |
364 | | |
365 | | /** |
366 | | * The start index of the current span in the replacement string; the span has length |
367 | | * {@link #newLength}. Well-defined only if the current edit is a change edit. |
368 | | * |
369 | | * The *replacement string* is the concatenation of all substrings of the destination |
370 | | * string corresponding to change edits. |
371 | | * |
372 | | * This method is intended to be used together with operations that write only replacement |
373 | | * characters (e.g. operations specifying the \ref U_OMIT_UNCHANGED_TEXT option). |
374 | | * The source string can then be modified in-place. |
375 | | * |
376 | | * @return the current index into the replacement-characters-only string, |
377 | | * not counting unchanged spans |
378 | | * @stable ICU 59 |
379 | | */ |
380 | 0 | int32_t replacementIndex() const { |
381 | | // TODO: Throw an exception if we aren't in a change edit? |
382 | 0 | return replIndex; |
383 | 0 | } |
384 | | |
385 | | /** |
386 | | * The start index of the current span in the destination string; the span has length |
387 | | * {@link #newLength}. |
388 | | * |
389 | | * @return the current index into the full destination string |
390 | | * @stable ICU 59 |
391 | | */ |
392 | 0 | int32_t destinationIndex() const { return destIndex; } |
393 | | |
394 | | #ifndef U_HIDE_INTERNAL_API |
395 | | /** |
396 | | * A string representation of the current edit represented by the iterator for debugging. You |
397 | | * should not depend on the contents of the return string. |
398 | | * @internal |
399 | | */ |
400 | | UnicodeString& toString(UnicodeString& appendTo) const; |
401 | | #endif // U_HIDE_INTERNAL_API |
402 | | |
403 | | private: |
404 | | friend class Edits; |
405 | | |
406 | | Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs); |
407 | | |
408 | | int32_t readLength(int32_t head); |
409 | | void updateNextIndexes(); |
410 | | void updatePreviousIndexes(); |
411 | | UBool noNext(); |
412 | | UBool next(UBool onlyChanges, UErrorCode &errorCode); |
413 | | UBool previous(UErrorCode &errorCode); |
414 | | /** @return -1: error or i<0; 0: found; 1: i>=string length */ |
415 | | int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode); |
416 | | |
417 | | const uint16_t *array; |
418 | | int32_t index, length; |
419 | | // 0 if we are not within compressed equal-length changes. |
420 | | // Otherwise the number of remaining changes, including the current one. |
421 | | int32_t remaining; |
422 | | UBool onlyChanges_, coarse; |
423 | | |
424 | | int8_t dir; // iteration direction: back(<0), initial(0), forward(>0) |
425 | | UBool changed; |
426 | | int32_t oldLength_, newLength_; |
427 | | int32_t srcIndex, replIndex, destIndex; |
428 | | }; |
429 | | |
430 | | /** |
431 | | * Returns an Iterator for coarse-grained change edits |
432 | | * (adjacent change edits are treated as one). |
433 | | * Can be used to perform simple string updates. |
434 | | * Skips no-change edits. |
435 | | * @return an Iterator that merges adjacent changes. |
436 | | * @stable ICU 59 |
437 | | */ |
438 | 0 | Iterator getCoarseChangesIterator() const { |
439 | 0 | return Iterator(array, length, true, true); |
440 | 0 | } |
441 | | |
442 | | /** |
443 | | * Returns an Iterator for coarse-grained change and no-change edits |
444 | | * (adjacent change edits are treated as one). |
445 | | * Can be used to perform simple string updates. |
446 | | * Adjacent change edits are treated as one edit. |
447 | | * @return an Iterator that merges adjacent changes. |
448 | | * @stable ICU 59 |
449 | | */ |
450 | 0 | Iterator getCoarseIterator() const { |
451 | 0 | return Iterator(array, length, false, true); |
452 | 0 | } |
453 | | |
454 | | /** |
455 | | * Returns an Iterator for fine-grained change edits |
456 | | * (full granularity of change edits is retained). |
457 | | * Can be used for modifying styled text. |
458 | | * Skips no-change edits. |
459 | | * @return an Iterator that separates adjacent changes. |
460 | | * @stable ICU 59 |
461 | | */ |
462 | 0 | Iterator getFineChangesIterator() const { |
463 | 0 | return Iterator(array, length, true, false); |
464 | 0 | } |
465 | | |
466 | | /** |
467 | | * Returns an Iterator for fine-grained change and no-change edits |
468 | | * (full granularity of change edits is retained). |
469 | | * Can be used for modifying styled text. |
470 | | * @return an Iterator that separates adjacent changes. |
471 | | * @stable ICU 59 |
472 | | */ |
473 | 0 | Iterator getFineIterator() const { |
474 | 0 | return Iterator(array, length, false, false); |
475 | 0 | } |
476 | | |
477 | | /** |
478 | | * Merges the two input Edits and appends the result to this object. |
479 | | * |
480 | | * Consider two string transformations (for example, normalization and case mapping) |
481 | | * where each records Edits in addition to writing an output string.<br> |
482 | | * Edits ab reflect how substrings of input string a |
483 | | * map to substrings of intermediate string b.<br> |
484 | | * Edits bc reflect how substrings of intermediate string b |
485 | | * map to substrings of output string c.<br> |
486 | | * This function merges ab and bc such that the additional edits |
487 | | * recorded in this object reflect how substrings of input string a |
488 | | * map to substrings of output string c. |
489 | | * |
490 | | * If unrelated Edits are passed in where the output string of the first |
491 | | * has a different length than the input string of the second, |
492 | | * then a U_ILLEGAL_ARGUMENT_ERROR is reported. |
493 | | * |
494 | | * @param ab reflects how substrings of input string a |
495 | | * map to substrings of intermediate string b. |
496 | | * @param bc reflects how substrings of intermediate string b |
497 | | * map to substrings of output string c. |
498 | | * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, |
499 | | * or else the function returns immediately. Check for U_FAILURE() |
500 | | * on output or use with function chaining. (See User Guide for details.) |
501 | | * @return *this, with the merged edits appended |
502 | | * @stable ICU 60 |
503 | | */ |
504 | | Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode); |
505 | | |
506 | | private: |
507 | | void releaseArray() U_NOEXCEPT; |
508 | | Edits ©Array(const Edits &other); |
509 | | Edits &moveArray(Edits &src) U_NOEXCEPT; |
510 | | |
511 | 0 | void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; } |
512 | 0 | int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; } |
513 | | |
514 | | void append(int32_t r); |
515 | | UBool growArray(); |
516 | | |
517 | | static const int32_t STACK_CAPACITY = 100; |
518 | | uint16_t *array; |
519 | | int32_t capacity; |
520 | | int32_t length; |
521 | | int32_t delta; |
522 | | int32_t numChanges; |
523 | | UErrorCode errorCode_; |
524 | | uint16_t stackArray[STACK_CAPACITY]; |
525 | | }; |
526 | | |
527 | | U_NAMESPACE_END |
528 | | |
529 | | #endif /* U_SHOW_CPLUSPLUS_API */ |
530 | | |
531 | | #endif // __EDITS_H__ |