/src/icu/source/common/edits.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2017 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  |  | 
4  |  | // edits.cpp  | 
5  |  | // created: 2017feb08 Markus W. Scherer  | 
6  |  |  | 
7  |  | #include "unicode/edits.h"  | 
8  |  | #include "unicode/unistr.h"  | 
9  |  | #include "unicode/utypes.h"  | 
10  |  | #include "cmemory.h"  | 
11  |  | #include "uassert.h"  | 
12  |  | #include "util.h"  | 
13  |  |  | 
14  |  | U_NAMESPACE_BEGIN  | 
15  |  |  | 
16  |  | namespace { | 
17  |  |  | 
18  |  | // 0000uuuuuuuuuuuu records u+1 unchanged text units.  | 
19  |  | const int32_t MAX_UNCHANGED_LENGTH = 0x1000;  | 
20  |  | const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;  | 
21  |  |  | 
22  |  | // 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units.  | 
23  |  | const int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6;  | 
24  |  | const int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7;  | 
25  |  | const int32_t SHORT_CHANGE_NUM_MASK = 0x1ff;  | 
26  |  | const int32_t MAX_SHORT_CHANGE = 0x6fff;  | 
27  |  |  | 
28  |  | // 0111mmmmmmnnnnnn records a replacement of m text units with n.  | 
29  |  | // m or n = 61: actual length follows in the next edits array unit.  | 
30  |  | // m or n = 62..63: actual length follows in the next two edits array units.  | 
31  |  | // Bit 30 of the actual length is in the head unit.  | 
32  |  | // Trailing units have bit 15 set.  | 
33  |  | const int32_t LENGTH_IN_1TRAIL = 61;  | 
34  |  | const int32_t LENGTH_IN_2TRAIL = 62;  | 
35  |  |  | 
36  |  | }  // namespace  | 
37  |  |  | 
38  | 0  | void Edits::releaseArray() U_NOEXCEPT { | 
39  | 0  |     if (array != stackArray) { | 
40  | 0  |         uprv_free(array);  | 
41  | 0  |     }  | 
42  | 0  | }  | 
43  |  |  | 
44  | 0  | Edits &Edits::copyArray(const Edits &other) { | 
45  | 0  |     if (U_FAILURE(errorCode_)) { | 
46  | 0  |         length = delta = numChanges = 0;  | 
47  | 0  |         return *this;  | 
48  | 0  |     }  | 
49  | 0  |     if (length > capacity) { | 
50  | 0  |         uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)length * 2);  | 
51  | 0  |         if (newArray == nullptr) { | 
52  | 0  |             length = delta = numChanges = 0;  | 
53  | 0  |             errorCode_ = U_MEMORY_ALLOCATION_ERROR;  | 
54  | 0  |             return *this;  | 
55  | 0  |         }  | 
56  | 0  |         releaseArray();  | 
57  | 0  |         array = newArray;  | 
58  | 0  |         capacity = length;  | 
59  | 0  |     }  | 
60  | 0  |     if (length > 0) { | 
61  | 0  |         uprv_memcpy(array, other.array, (size_t)length * 2);  | 
62  | 0  |     }  | 
63  | 0  |     return *this;  | 
64  | 0  | }  | 
65  |  |  | 
66  | 0  | Edits &Edits::moveArray(Edits &src) U_NOEXCEPT { | 
67  | 0  |     if (U_FAILURE(errorCode_)) { | 
68  | 0  |         length = delta = numChanges = 0;  | 
69  | 0  |         return *this;  | 
70  | 0  |     }  | 
71  | 0  |     releaseArray();  | 
72  | 0  |     if (length > STACK_CAPACITY) { | 
73  | 0  |         array = src.array;  | 
74  | 0  |         capacity = src.capacity;  | 
75  | 0  |         src.array = src.stackArray;  | 
76  | 0  |         src.capacity = STACK_CAPACITY;  | 
77  | 0  |         src.reset();  | 
78  | 0  |         return *this;  | 
79  | 0  |     }  | 
80  | 0  |     array = stackArray;  | 
81  | 0  |     capacity = STACK_CAPACITY;  | 
82  | 0  |     if (length > 0) { | 
83  | 0  |         uprv_memcpy(array, src.array, (size_t)length * 2);  | 
84  | 0  |     }  | 
85  | 0  |     return *this;  | 
86  | 0  | }  | 
87  |  |  | 
88  | 0  | Edits &Edits::operator=(const Edits &other) { | 
89  | 0  |     if (this == &other) { return *this; }  // self-assignment: no-op | 
90  | 0  |     length = other.length;  | 
91  | 0  |     delta = other.delta;  | 
92  | 0  |     numChanges = other.numChanges;  | 
93  | 0  |     errorCode_ = other.errorCode_;  | 
94  | 0  |     return copyArray(other);  | 
95  | 0  | }  | 
96  |  |  | 
97  | 0  | Edits &Edits::operator=(Edits &&src) U_NOEXCEPT { | 
98  | 0  |     length = src.length;  | 
99  | 0  |     delta = src.delta;  | 
100  | 0  |     numChanges = src.numChanges;  | 
101  | 0  |     errorCode_ = src.errorCode_;  | 
102  | 0  |     return moveArray(src);  | 
103  | 0  | }  | 
104  |  |  | 
105  | 0  | Edits::~Edits() { | 
106  | 0  |     releaseArray();  | 
107  | 0  | }  | 
108  |  |  | 
109  | 0  | void Edits::reset() U_NOEXCEPT { | 
110  | 0  |     length = delta = numChanges = 0;  | 
111  | 0  |     errorCode_ = U_ZERO_ERROR;  | 
112  | 0  | }  | 
113  |  |  | 
114  | 0  | void Edits::addUnchanged(int32_t unchangedLength) { | 
115  | 0  |     if(U_FAILURE(errorCode_) || unchangedLength == 0) { return; } | 
116  | 0  |     if(unchangedLength < 0) { | 
117  | 0  |         errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;  | 
118  | 0  |         return;  | 
119  | 0  |     }  | 
120  |  |     // Merge into previous unchanged-text record, if any.  | 
121  | 0  |     int32_t last = lastUnit();  | 
122  | 0  |     if(last < MAX_UNCHANGED) { | 
123  | 0  |         int32_t remaining = MAX_UNCHANGED - last;  | 
124  | 0  |         if (remaining >= unchangedLength) { | 
125  | 0  |             setLastUnit(last + unchangedLength);  | 
126  | 0  |             return;  | 
127  | 0  |         }  | 
128  | 0  |         setLastUnit(MAX_UNCHANGED);  | 
129  | 0  |         unchangedLength -= remaining;  | 
130  | 0  |     }  | 
131  |  |     // Split large lengths into multiple units.  | 
132  | 0  |     while(unchangedLength >= MAX_UNCHANGED_LENGTH) { | 
133  | 0  |         append(MAX_UNCHANGED);  | 
134  | 0  |         unchangedLength -= MAX_UNCHANGED_LENGTH;  | 
135  | 0  |     }  | 
136  |  |     // Write a small (remaining) length.  | 
137  | 0  |     if(unchangedLength > 0) { | 
138  | 0  |         append(unchangedLength - 1);  | 
139  | 0  |     }  | 
140  | 0  | }  | 
141  |  |  | 
142  | 0  | void Edits::addReplace(int32_t oldLength, int32_t newLength) { | 
143  | 0  |     if(U_FAILURE(errorCode_)) { return; } | 
144  | 0  |     if(oldLength < 0 || newLength < 0) { | 
145  | 0  |         errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;  | 
146  | 0  |         return;  | 
147  | 0  |     }  | 
148  | 0  |     if (oldLength == 0 && newLength == 0) { | 
149  | 0  |         return;  | 
150  | 0  |     }  | 
151  | 0  |     ++numChanges;  | 
152  | 0  |     int32_t newDelta = newLength - oldLength;  | 
153  | 0  |     if (newDelta != 0) { | 
154  | 0  |         if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) ||  | 
155  | 0  |                 (newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) { | 
156  |  |             // Integer overflow or underflow.  | 
157  | 0  |             errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;  | 
158  | 0  |             return;  | 
159  | 0  |         }  | 
160  | 0  |         delta += newDelta;  | 
161  | 0  |     }  | 
162  |  |  | 
163  | 0  |     if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH &&  | 
164  | 0  |             newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) { | 
165  |  |         // Merge into previous same-lengths short-replacement record, if any.  | 
166  | 0  |         int32_t u = (oldLength << 12) | (newLength << 9);  | 
167  | 0  |         int32_t last = lastUnit();  | 
168  | 0  |         if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&  | 
169  | 0  |                 (last & ~SHORT_CHANGE_NUM_MASK) == u &&  | 
170  | 0  |                 (last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) { | 
171  | 0  |             setLastUnit(last + 1);  | 
172  | 0  |             return;  | 
173  | 0  |         }  | 
174  | 0  |         append(u);  | 
175  | 0  |         return;  | 
176  | 0  |     }  | 
177  |  |  | 
178  | 0  |     int32_t head = 0x7000;  | 
179  | 0  |     if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { | 
180  | 0  |         head |= oldLength << 6;  | 
181  | 0  |         head |= newLength;  | 
182  | 0  |         append(head);  | 
183  | 0  |     } else if ((capacity - length) >= 5 || growArray()) { | 
184  | 0  |         int32_t limit = length + 1;  | 
185  | 0  |         if(oldLength < LENGTH_IN_1TRAIL) { | 
186  | 0  |             head |= oldLength << 6;  | 
187  | 0  |         } else if(oldLength <= 0x7fff) { | 
188  | 0  |             head |= LENGTH_IN_1TRAIL << 6;  | 
189  | 0  |             array[limit++] = (uint16_t)(0x8000 | oldLength);  | 
190  | 0  |         } else { | 
191  | 0  |             head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;  | 
192  | 0  |             array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15));  | 
193  | 0  |             array[limit++] = (uint16_t)(0x8000 | oldLength);  | 
194  | 0  |         }  | 
195  | 0  |         if(newLength < LENGTH_IN_1TRAIL) { | 
196  | 0  |             head |= newLength;  | 
197  | 0  |         } else if(newLength <= 0x7fff) { | 
198  | 0  |             head |= LENGTH_IN_1TRAIL;  | 
199  | 0  |             array[limit++] = (uint16_t)(0x8000 | newLength);  | 
200  | 0  |         } else { | 
201  | 0  |             head |= LENGTH_IN_2TRAIL + (newLength >> 30);  | 
202  | 0  |             array[limit++] = (uint16_t)(0x8000 | (newLength >> 15));  | 
203  | 0  |             array[limit++] = (uint16_t)(0x8000 | newLength);  | 
204  | 0  |         }  | 
205  | 0  |         array[length] = (uint16_t)head;  | 
206  | 0  |         length = limit;  | 
207  | 0  |     }  | 
208  | 0  | }  | 
209  |  |  | 
210  | 0  | void Edits::append(int32_t r) { | 
211  | 0  |     if(length < capacity || growArray()) { | 
212  | 0  |         array[length++] = (uint16_t)r;  | 
213  | 0  |     }  | 
214  | 0  | }  | 
215  |  |  | 
216  | 0  | UBool Edits::growArray() { | 
217  | 0  |     int32_t newCapacity;  | 
218  | 0  |     if (array == stackArray) { | 
219  | 0  |         newCapacity = 2000;  | 
220  | 0  |     } else if (capacity == INT32_MAX) { | 
221  |  |         // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API  | 
222  |  |         // with a result-string-buffer overflow.  | 
223  | 0  |         errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;  | 
224  | 0  |         return FALSE;  | 
225  | 0  |     } else if (capacity >= (INT32_MAX / 2)) { | 
226  | 0  |         newCapacity = INT32_MAX;  | 
227  | 0  |     } else { | 
228  | 0  |         newCapacity = 2 * capacity;  | 
229  | 0  |     }  | 
230  |  |     // Grow by at least 5 units so that a maximal change record will fit.  | 
231  | 0  |     if ((newCapacity - capacity) < 5) { | 
232  | 0  |         errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;  | 
233  | 0  |         return FALSE;  | 
234  | 0  |     }  | 
235  | 0  |     uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2);  | 
236  | 0  |     if (newArray == NULL) { | 
237  | 0  |         errorCode_ = U_MEMORY_ALLOCATION_ERROR;  | 
238  | 0  |         return FALSE;  | 
239  | 0  |     }  | 
240  | 0  |     uprv_memcpy(newArray, array, (size_t)length * 2);  | 
241  | 0  |     releaseArray();  | 
242  | 0  |     array = newArray;  | 
243  | 0  |     capacity = newCapacity;  | 
244  | 0  |     return TRUE;  | 
245  | 0  | }  | 
246  |  |  | 
247  | 0  | UBool Edits::copyErrorTo(UErrorCode &outErrorCode) const { | 
248  | 0  |     if (U_FAILURE(outErrorCode)) { return TRUE; } | 
249  | 0  |     if (U_SUCCESS(errorCode_)) { return FALSE; } | 
250  | 0  |     outErrorCode = errorCode_;  | 
251  | 0  |     return TRUE;  | 
252  | 0  | }  | 
253  |  |  | 
254  | 0  | Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode) { | 
255  | 0  |     if (copyErrorTo(errorCode)) { return *this; } | 
256  |  |     // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.  | 
257  |  |     // Parallel iteration over both Edits.  | 
258  | 0  |     Iterator abIter = ab.getFineIterator();  | 
259  | 0  |     Iterator bcIter = bc.getFineIterator();  | 
260  | 0  |     UBool abHasNext = TRUE, bcHasNext = TRUE;  | 
261  |  |     // Copy iterator state into local variables, so that we can modify and subdivide spans.  | 
262  |  |     // ab old & new length, bc old & new length  | 
263  | 0  |     int32_t aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;  | 
264  |  |     // When we have different-intermediate-length changes, we accumulate a larger change.  | 
265  | 0  |     int32_t pending_aLength = 0, pending_cLength = 0;  | 
266  | 0  |     for (;;) { | 
267  |  |         // At this point, for each of the two iterators:  | 
268  |  |         // Either we are done with the locally cached current edit,  | 
269  |  |         // and its intermediate-string length has been reset,  | 
270  |  |         // or we will continue to work with a truncated remainder of this edit.  | 
271  |  |         //  | 
272  |  |         // If the current edit is done, and the iterator has not yet reached the end,  | 
273  |  |         // then we fetch the next edit. This is true for at least one of the iterators.  | 
274  |  |         //  | 
275  |  |         // Normally it does not matter whether we fetch from ab and then bc or vice versa.  | 
276  |  |         // However, the result is observably different when  | 
277  |  |         // ab deletions meet bc insertions at the same intermediate-string index.  | 
278  |  |         // Some users expect the bc insertions to come first, so we fetch from bc first.  | 
279  | 0  |         if (bc_bLength == 0) { | 
280  | 0  |             if (bcHasNext && (bcHasNext = bcIter.next(errorCode)) != 0) { | 
281  | 0  |                 bc_bLength = bcIter.oldLength();  | 
282  | 0  |                 cLength = bcIter.newLength();  | 
283  | 0  |                 if (bc_bLength == 0) { | 
284  |  |                     // insertion  | 
285  | 0  |                     if (ab_bLength == 0 || !abIter.hasChange()) { | 
286  | 0  |                         addReplace(pending_aLength, pending_cLength + cLength);  | 
287  | 0  |                         pending_aLength = pending_cLength = 0;  | 
288  | 0  |                     } else { | 
289  | 0  |                         pending_cLength += cLength;  | 
290  | 0  |                     }  | 
291  | 0  |                     continue;  | 
292  | 0  |                 }  | 
293  | 0  |             }  | 
294  |  |             // else see if the other iterator is done, too.  | 
295  | 0  |         }  | 
296  | 0  |         if (ab_bLength == 0) { | 
297  | 0  |             if (abHasNext && (abHasNext = abIter.next(errorCode)) != 0) { | 
298  | 0  |                 aLength = abIter.oldLength();  | 
299  | 0  |                 ab_bLength = abIter.newLength();  | 
300  | 0  |                 if (ab_bLength == 0) { | 
301  |  |                     // deletion  | 
302  | 0  |                     if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) { | 
303  | 0  |                         addReplace(pending_aLength + aLength, pending_cLength);  | 
304  | 0  |                         pending_aLength = pending_cLength = 0;  | 
305  | 0  |                     } else { | 
306  | 0  |                         pending_aLength += aLength;  | 
307  | 0  |                     }  | 
308  | 0  |                     continue;  | 
309  | 0  |                 }  | 
310  | 0  |             } else if (bc_bLength == 0) { | 
311  |  |                 // Both iterators are done at the same time:  | 
312  |  |                 // The intermediate-string lengths match.  | 
313  | 0  |                 break;  | 
314  | 0  |             } else { | 
315  |  |                 // The ab output string is shorter than the bc input string.  | 
316  | 0  |                 if (!copyErrorTo(errorCode)) { | 
317  | 0  |                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;  | 
318  | 0  |                 }  | 
319  | 0  |                 return *this;  | 
320  | 0  |             }  | 
321  | 0  |         }  | 
322  | 0  |         if (bc_bLength == 0) { | 
323  |  |             // The bc input string is shorter than the ab output string.  | 
324  | 0  |             if (!copyErrorTo(errorCode)) { | 
325  | 0  |                 errorCode = U_ILLEGAL_ARGUMENT_ERROR;  | 
326  | 0  |             }  | 
327  | 0  |             return *this;  | 
328  | 0  |         }  | 
329  |  |         //  Done fetching: ab_bLength > 0 && bc_bLength > 0  | 
330  |  |  | 
331  |  |         // The current state has two parts:  | 
332  |  |         // - Past: We accumulate a longer ac edit in the "pending" variables.  | 
333  |  |         // - Current: We have copies of the current ab/bc edits in local variables.  | 
334  |  |         //   At least one side is newly fetched.  | 
335  |  |         //   One side might be a truncated remainder of an edit we fetched earlier.  | 
336  |  |  | 
337  | 0  |         if (!abIter.hasChange() && !bcIter.hasChange()) { | 
338  |  |             // An unchanged span all the way from string a to string c.  | 
339  | 0  |             if (pending_aLength != 0 || pending_cLength != 0) { | 
340  | 0  |                 addReplace(pending_aLength, pending_cLength);  | 
341  | 0  |                 pending_aLength = pending_cLength = 0;  | 
342  | 0  |             }  | 
343  | 0  |             int32_t unchangedLength = aLength <= cLength ? aLength : cLength;  | 
344  | 0  |             addUnchanged(unchangedLength);  | 
345  | 0  |             ab_bLength = aLength -= unchangedLength;  | 
346  | 0  |             bc_bLength = cLength -= unchangedLength;  | 
347  |  |             // At least one of the unchanged spans is now empty.  | 
348  | 0  |             continue;  | 
349  | 0  |         }  | 
350  | 0  |         if (!abIter.hasChange() && bcIter.hasChange()) { | 
351  |  |             // Unchanged a->b but changed b->c.  | 
352  | 0  |             if (ab_bLength >= bc_bLength) { | 
353  |  |                 // Split the longer unchanged span into change + remainder.  | 
354  | 0  |                 addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);  | 
355  | 0  |                 pending_aLength = pending_cLength = 0;  | 
356  | 0  |                 aLength = ab_bLength -= bc_bLength;  | 
357  | 0  |                 bc_bLength = 0;  | 
358  | 0  |                 continue;  | 
359  | 0  |             }  | 
360  |  |             // Handle the shorter unchanged span below like a change.  | 
361  | 0  |         } else if (abIter.hasChange() && !bcIter.hasChange()) { | 
362  |  |             // Changed a->b and then unchanged b->c.  | 
363  | 0  |             if (ab_bLength <= bc_bLength) { | 
364  |  |                 // Split the longer unchanged span into change + remainder.  | 
365  | 0  |                 addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);  | 
366  | 0  |                 pending_aLength = pending_cLength = 0;  | 
367  | 0  |                 cLength = bc_bLength -= ab_bLength;  | 
368  | 0  |                 ab_bLength = 0;  | 
369  | 0  |                 continue;  | 
370  | 0  |             }  | 
371  |  |             // Handle the shorter unchanged span below like a change.  | 
372  | 0  |         } else {  // both abIter.hasChange() && bcIter.hasChange() | 
373  | 0  |             if (ab_bLength == bc_bLength) { | 
374  |  |                 // Changes on both sides up to the same position. Emit & reset.  | 
375  | 0  |                 addReplace(pending_aLength + aLength, pending_cLength + cLength);  | 
376  | 0  |                 pending_aLength = pending_cLength = 0;  | 
377  | 0  |                 ab_bLength = bc_bLength = 0;  | 
378  | 0  |                 continue;  | 
379  | 0  |             }  | 
380  | 0  |         }  | 
381  |  |         // Accumulate the a->c change, reset the shorter side,  | 
382  |  |         // keep a remainder of the longer one.  | 
383  | 0  |         pending_aLength += aLength;  | 
384  | 0  |         pending_cLength += cLength;  | 
385  | 0  |         if (ab_bLength < bc_bLength) { | 
386  | 0  |             bc_bLength -= ab_bLength;  | 
387  | 0  |             cLength = ab_bLength = 0;  | 
388  | 0  |         } else {  // ab_bLength > bc_bLength | 
389  | 0  |             ab_bLength -= bc_bLength;  | 
390  | 0  |             aLength = bc_bLength = 0;  | 
391  | 0  |         }  | 
392  | 0  |     }  | 
393  | 0  |     if (pending_aLength != 0 || pending_cLength != 0) { | 
394  | 0  |         addReplace(pending_aLength, pending_cLength);  | 
395  | 0  |     }  | 
396  | 0  |     copyErrorTo(errorCode);  | 
397  | 0  |     return *this;  | 
398  | 0  | }  | 
399  |  |  | 
400  |  | Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :  | 
401  | 0  |         array(a), index(0), length(len), remaining(0),  | 
402  | 0  |         onlyChanges_(oc), coarse(crs),  | 
403  | 0  |         dir(0), changed(FALSE), oldLength_(0), newLength_(0),  | 
404  | 0  |         srcIndex(0), replIndex(0), destIndex(0) {} | 
405  |  |  | 
406  | 0  | int32_t Edits::Iterator::readLength(int32_t head) { | 
407  | 0  |     if (head < LENGTH_IN_1TRAIL) { | 
408  | 0  |         return head;  | 
409  | 0  |     } else if (head < LENGTH_IN_2TRAIL) { | 
410  | 0  |         U_ASSERT(index < length);  | 
411  | 0  |         U_ASSERT(array[index] >= 0x8000);  | 
412  | 0  |         return array[index++] & 0x7fff;  | 
413  | 0  |     } else { | 
414  | 0  |         U_ASSERT((index + 2) <= length);  | 
415  | 0  |         U_ASSERT(array[index] >= 0x8000);  | 
416  | 0  |         U_ASSERT(array[index + 1] >= 0x8000);  | 
417  | 0  |         int32_t len = ((head & 1) << 30) |  | 
418  | 0  |                 ((int32_t)(array[index] & 0x7fff) << 15) |  | 
419  | 0  |                 (array[index + 1] & 0x7fff);  | 
420  | 0  |         index += 2;  | 
421  | 0  |         return len;  | 
422  | 0  |     }  | 
423  | 0  | }  | 
424  |  |  | 
425  | 0  | void Edits::Iterator::updateNextIndexes() { | 
426  | 0  |     srcIndex += oldLength_;  | 
427  | 0  |     if (changed) { | 
428  | 0  |         replIndex += newLength_;  | 
429  | 0  |     }  | 
430  | 0  |     destIndex += newLength_;  | 
431  | 0  | }  | 
432  |  |  | 
433  | 0  | void Edits::Iterator::updatePreviousIndexes() { | 
434  | 0  |     srcIndex -= oldLength_;  | 
435  | 0  |     if (changed) { | 
436  | 0  |         replIndex -= newLength_;  | 
437  | 0  |     }  | 
438  | 0  |     destIndex -= newLength_;  | 
439  | 0  | }  | 
440  |  |  | 
441  | 0  | UBool Edits::Iterator::noNext() { | 
442  |  |     // No change before or beyond the string.  | 
443  | 0  |     dir = 0;  | 
444  | 0  |     changed = FALSE;  | 
445  | 0  |     oldLength_ = newLength_ = 0;  | 
446  | 0  |     return FALSE;  | 
447  | 0  | }  | 
448  |  |  | 
449  | 0  | UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) { | 
450  |  |     // Forward iteration: Update the string indexes to the limit of the current span,  | 
451  |  |     // and post-increment-read array units to assemble a new span.  | 
452  |  |     // Leaves the array index one after the last unit of that span.  | 
453  | 0  |     if (U_FAILURE(errorCode)) { return FALSE; } | 
454  |  |     // We have an errorCode in case we need to start guarding against integer overflows.  | 
455  |  |     // It is also convenient for caller loops if we bail out when an error was set elsewhere.  | 
456  | 0  |     if (dir > 0) { | 
457  | 0  |         updateNextIndexes();  | 
458  | 0  |     } else { | 
459  | 0  |         if (dir < 0) { | 
460  |  |             // Turn around from previous() to next().  | 
461  |  |             // Post-increment-read the same span again.  | 
462  | 0  |             if (remaining > 0) { | 
463  |  |                 // Fine-grained iterator:  | 
464  |  |                 // Stay on the current one of a sequence of compressed changes.  | 
465  | 0  |                 ++index;  // next() rests on the index after the sequence unit.  | 
466  | 0  |                 dir = 1;  | 
467  | 0  |                 return TRUE;  | 
468  | 0  |             }  | 
469  | 0  |         }  | 
470  | 0  |         dir = 1;  | 
471  | 0  |     }  | 
472  | 0  |     if (remaining >= 1) { | 
473  |  |         // Fine-grained iterator: Continue a sequence of compressed changes.  | 
474  | 0  |         if (remaining > 1) { | 
475  | 0  |             --remaining;  | 
476  | 0  |             return TRUE;  | 
477  | 0  |         }  | 
478  | 0  |         remaining = 0;  | 
479  | 0  |     }  | 
480  | 0  |     if (index >= length) { | 
481  | 0  |         return noNext();  | 
482  | 0  |     }  | 
483  | 0  |     int32_t u = array[index++];  | 
484  | 0  |     if (u <= MAX_UNCHANGED) { | 
485  |  |         // Combine adjacent unchanged ranges.  | 
486  | 0  |         changed = FALSE;  | 
487  | 0  |         oldLength_ = u + 1;  | 
488  | 0  |         while (index < length && (u = array[index]) <= MAX_UNCHANGED) { | 
489  | 0  |             ++index;  | 
490  | 0  |             oldLength_ += u + 1;  | 
491  | 0  |         }  | 
492  | 0  |         newLength_ = oldLength_;  | 
493  | 0  |         if (onlyChanges) { | 
494  | 0  |             updateNextIndexes();  | 
495  | 0  |             if (index >= length) { | 
496  | 0  |                 return noNext();  | 
497  | 0  |             }  | 
498  |  |             // already fetched u > MAX_UNCHANGED at index  | 
499  | 0  |             ++index;  | 
500  | 0  |         } else { | 
501  | 0  |             return TRUE;  | 
502  | 0  |         }  | 
503  | 0  |     }  | 
504  | 0  |     changed = TRUE;  | 
505  | 0  |     if (u <= MAX_SHORT_CHANGE) { | 
506  | 0  |         int32_t oldLen = u >> 12;  | 
507  | 0  |         int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;  | 
508  | 0  |         int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;  | 
509  | 0  |         if (coarse) { | 
510  | 0  |             oldLength_ = num * oldLen;  | 
511  | 0  |             newLength_ = num * newLen;  | 
512  | 0  |         } else { | 
513  |  |             // Split a sequence of changes that was compressed into one unit.  | 
514  | 0  |             oldLength_ = oldLen;  | 
515  | 0  |             newLength_ = newLen;  | 
516  | 0  |             if (num > 1) { | 
517  | 0  |                 remaining = num;  // This is the first of two or more changes.  | 
518  | 0  |             }  | 
519  | 0  |             return TRUE;  | 
520  | 0  |         }  | 
521  | 0  |     } else { | 
522  | 0  |         U_ASSERT(u <= 0x7fff);  | 
523  | 0  |         oldLength_ = readLength((u >> 6) & 0x3f);  | 
524  | 0  |         newLength_ = readLength(u & 0x3f);  | 
525  | 0  |         if (!coarse) { | 
526  | 0  |             return TRUE;  | 
527  | 0  |         }  | 
528  | 0  |     }  | 
529  |  |     // Combine adjacent changes.  | 
530  | 0  |     while (index < length && (u = array[index]) > MAX_UNCHANGED) { | 
531  | 0  |         ++index;  | 
532  | 0  |         if (u <= MAX_SHORT_CHANGE) { | 
533  | 0  |             int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;  | 
534  | 0  |             oldLength_ += (u >> 12) * num;  | 
535  | 0  |             newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;  | 
536  | 0  |         } else { | 
537  | 0  |             U_ASSERT(u <= 0x7fff);  | 
538  | 0  |             oldLength_ += readLength((u >> 6) & 0x3f);  | 
539  | 0  |             newLength_ += readLength(u & 0x3f);  | 
540  | 0  |         }  | 
541  | 0  |     }  | 
542  | 0  |     return TRUE;  | 
543  | 0  | }  | 
544  |  |  | 
545  | 0  | UBool Edits::Iterator::previous(UErrorCode &errorCode) { | 
546  |  |     // Backward iteration: Pre-decrement-read array units to assemble a new span,  | 
547  |  |     // then update the string indexes to the start of that span.  | 
548  |  |     // Leaves the array index on the head unit of that span.  | 
549  | 0  |     if (U_FAILURE(errorCode)) { return FALSE; } | 
550  |  |     // We have an errorCode in case we need to start guarding against integer overflows.  | 
551  |  |     // It is also convenient for caller loops if we bail out when an error was set elsewhere.  | 
552  | 0  |     if (dir >= 0) { | 
553  | 0  |         if (dir > 0) { | 
554  |  |             // Turn around from next() to previous().  | 
555  |  |             // Set the string indexes to the span limit and  | 
556  |  |             // pre-decrement-read the same span again.  | 
557  | 0  |             if (remaining > 0) { | 
558  |  |                 // Fine-grained iterator:  | 
559  |  |                 // Stay on the current one of a sequence of compressed changes.  | 
560  | 0  |                 --index;  // previous() rests on the sequence unit.  | 
561  | 0  |                 dir = -1;  | 
562  | 0  |                 return TRUE;  | 
563  | 0  |             }  | 
564  | 0  |             updateNextIndexes();  | 
565  | 0  |         }  | 
566  | 0  |         dir = -1;  | 
567  | 0  |     }  | 
568  | 0  |     if (remaining > 0) { | 
569  |  |         // Fine-grained iterator: Continue a sequence of compressed changes.  | 
570  | 0  |         int32_t u = array[index];  | 
571  | 0  |         U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);  | 
572  | 0  |         if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) { | 
573  | 0  |             ++remaining;  | 
574  | 0  |             updatePreviousIndexes();  | 
575  | 0  |             return TRUE;  | 
576  | 0  |         }  | 
577  | 0  |         remaining = 0;  | 
578  | 0  |     }  | 
579  | 0  |     if (index <= 0) { | 
580  | 0  |         return noNext();  | 
581  | 0  |     }  | 
582  | 0  |     int32_t u = array[--index];  | 
583  | 0  |     if (u <= MAX_UNCHANGED) { | 
584  |  |         // Combine adjacent unchanged ranges.  | 
585  | 0  |         changed = FALSE;  | 
586  | 0  |         oldLength_ = u + 1;  | 
587  | 0  |         while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) { | 
588  | 0  |             --index;  | 
589  | 0  |             oldLength_ += u + 1;  | 
590  | 0  |         }  | 
591  | 0  |         newLength_ = oldLength_;  | 
592  |  |         // No need to handle onlyChanges as long as previous() is called only from findIndex().  | 
593  | 0  |         updatePreviousIndexes();  | 
594  | 0  |         return TRUE;  | 
595  | 0  |     }  | 
596  | 0  |     changed = TRUE;  | 
597  | 0  |     if (u <= MAX_SHORT_CHANGE) { | 
598  | 0  |         int32_t oldLen = u >> 12;  | 
599  | 0  |         int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;  | 
600  | 0  |         int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;  | 
601  | 0  |         if (coarse) { | 
602  | 0  |             oldLength_ = num * oldLen;  | 
603  | 0  |             newLength_ = num * newLen;  | 
604  | 0  |         } else { | 
605  |  |             // Split a sequence of changes that was compressed into one unit.  | 
606  | 0  |             oldLength_ = oldLen;  | 
607  | 0  |             newLength_ = newLen;  | 
608  | 0  |             if (num > 1) { | 
609  | 0  |                 remaining = 1;  // This is the last of two or more changes.  | 
610  | 0  |             }  | 
611  | 0  |             updatePreviousIndexes();  | 
612  | 0  |             return TRUE;  | 
613  | 0  |         }  | 
614  | 0  |     } else { | 
615  | 0  |         if (u <= 0x7fff) { | 
616  |  |             // The change is encoded in u alone.  | 
617  | 0  |             oldLength_ = readLength((u >> 6) & 0x3f);  | 
618  | 0  |             newLength_ = readLength(u & 0x3f);  | 
619  | 0  |         } else { | 
620  |  |             // Back up to the head of the change, read the lengths,  | 
621  |  |             // and reset the index to the head again.  | 
622  | 0  |             U_ASSERT(index > 0);  | 
623  | 0  |             while ((u = array[--index]) > 0x7fff) {} | 
624  | 0  |             U_ASSERT(u > MAX_SHORT_CHANGE);  | 
625  | 0  |             int32_t headIndex = index++;  | 
626  | 0  |             oldLength_ = readLength((u >> 6) & 0x3f);  | 
627  | 0  |             newLength_ = readLength(u & 0x3f);  | 
628  | 0  |             index = headIndex;  | 
629  | 0  |         }  | 
630  | 0  |         if (!coarse) { | 
631  | 0  |             updatePreviousIndexes();  | 
632  | 0  |             return TRUE;  | 
633  | 0  |         }  | 
634  | 0  |     }  | 
635  |  |     // Combine adjacent changes.  | 
636  | 0  |     while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) { | 
637  | 0  |         --index;  | 
638  | 0  |         if (u <= MAX_SHORT_CHANGE) { | 
639  | 0  |             int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;  | 
640  | 0  |             oldLength_ += (u >> 12) * num;  | 
641  | 0  |             newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;  | 
642  | 0  |         } else if (u <= 0x7fff) { | 
643  |  |             // Read the lengths, and reset the index to the head again.  | 
644  | 0  |             int32_t headIndex = index++;  | 
645  | 0  |             oldLength_ += readLength((u >> 6) & 0x3f);  | 
646  | 0  |             newLength_ += readLength(u & 0x3f);  | 
647  | 0  |             index = headIndex;  | 
648  | 0  |         }  | 
649  | 0  |     }  | 
650  | 0  |     updatePreviousIndexes();  | 
651  | 0  |     return TRUE;  | 
652  | 0  | }  | 
653  |  |  | 
654  | 0  | int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) { | 
655  | 0  |     if (U_FAILURE(errorCode) || i < 0) { return -1; } | 
656  | 0  |     int32_t spanStart, spanLength;  | 
657  | 0  |     if (findSource) {  // find source index | 
658  | 0  |         spanStart = srcIndex;  | 
659  | 0  |         spanLength = oldLength_;  | 
660  | 0  |     } else {  // find destination index | 
661  | 0  |         spanStart = destIndex;  | 
662  | 0  |         spanLength = newLength_;  | 
663  | 0  |     }  | 
664  | 0  |     if (i < spanStart) { | 
665  | 0  |         if (i >= (spanStart / 2)) { | 
666  |  |             // Search backwards.  | 
667  | 0  |             for (;;) { | 
668  | 0  |                 UBool hasPrevious = previous(errorCode);  | 
669  | 0  |                 U_ASSERT(hasPrevious);  // because i>=0 and the first span starts at 0  | 
670  | 0  |                 (void)hasPrevious;  // avoid unused-variable warning  | 
671  | 0  |                 spanStart = findSource ? srcIndex : destIndex;  | 
672  | 0  |                 if (i >= spanStart) { | 
673  |  |                     // The index is in the current span.  | 
674  | 0  |                     return 0;  | 
675  | 0  |                 }  | 
676  | 0  |                 if (remaining > 0) { | 
677  |  |                     // Is the index in one of the remaining compressed edits?  | 
678  |  |                     // spanStart is the start of the current span, first of the remaining ones.  | 
679  | 0  |                     spanLength = findSource ? oldLength_ : newLength_;  | 
680  | 0  |                     int32_t u = array[index];  | 
681  | 0  |                     U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);  | 
682  | 0  |                     int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining;  | 
683  | 0  |                     int32_t len = num * spanLength;  | 
684  | 0  |                     if (i >= (spanStart - len)) { | 
685  | 0  |                         int32_t n = ((spanStart - i - 1) / spanLength) + 1;  | 
686  |  |                         // 1 <= n <= num  | 
687  | 0  |                         srcIndex -= n * oldLength_;  | 
688  | 0  |                         replIndex -= n * newLength_;  | 
689  | 0  |                         destIndex -= n * newLength_;  | 
690  | 0  |                         remaining += n;  | 
691  | 0  |                         return 0;  | 
692  | 0  |                     }  | 
693  |  |                     // Skip all of these edits at once.  | 
694  | 0  |                     srcIndex -= num * oldLength_;  | 
695  | 0  |                     replIndex -= num * newLength_;  | 
696  | 0  |                     destIndex -= num * newLength_;  | 
697  | 0  |                     remaining = 0;  | 
698  | 0  |                 }  | 
699  | 0  |             }  | 
700  | 0  |         }  | 
701  |  |         // Reset the iterator to the start.  | 
702  | 0  |         dir = 0;  | 
703  | 0  |         index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;  | 
704  | 0  |     } else if (i < (spanStart + spanLength)) { | 
705  |  |         // The index is in the current span.  | 
706  | 0  |         return 0;  | 
707  | 0  |     }  | 
708  | 0  |     while (next(FALSE, errorCode)) { | 
709  | 0  |         if (findSource) { | 
710  | 0  |             spanStart = srcIndex;  | 
711  | 0  |             spanLength = oldLength_;  | 
712  | 0  |         } else { | 
713  | 0  |             spanStart = destIndex;  | 
714  | 0  |             spanLength = newLength_;  | 
715  | 0  |         }  | 
716  | 0  |         if (i < (spanStart + spanLength)) { | 
717  |  |             // The index is in the current span.  | 
718  | 0  |             return 0;  | 
719  | 0  |         }  | 
720  | 0  |         if (remaining > 1) { | 
721  |  |             // Is the index in one of the remaining compressed edits?  | 
722  |  |             // spanStart is the start of the current span, first of the remaining ones.  | 
723  | 0  |             int32_t len = remaining * spanLength;  | 
724  | 0  |             if (i < (spanStart + len)) { | 
725  | 0  |                 int32_t n = (i - spanStart) / spanLength;  // 1 <= n <= remaining - 1  | 
726  | 0  |                 srcIndex += n * oldLength_;  | 
727  | 0  |                 replIndex += n * newLength_;  | 
728  | 0  |                 destIndex += n * newLength_;  | 
729  | 0  |                 remaining -= n;  | 
730  | 0  |                 return 0;  | 
731  | 0  |             }  | 
732  |  |             // Make next() skip all of these edits at once.  | 
733  | 0  |             oldLength_ *= remaining;  | 
734  | 0  |             newLength_ *= remaining;  | 
735  | 0  |             remaining = 0;  | 
736  | 0  |         }  | 
737  | 0  |     }  | 
738  | 0  |     return 1;  | 
739  | 0  | }  | 
740  |  |  | 
741  | 0  | int32_t Edits::Iterator::destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode) { | 
742  | 0  |     int32_t where = findIndex(i, TRUE, errorCode);  | 
743  | 0  |     if (where < 0) { | 
744  |  |         // Error or before the string.  | 
745  | 0  |         return 0;  | 
746  | 0  |     }  | 
747  | 0  |     if (where > 0 || i == srcIndex) { | 
748  |  |         // At or after string length, or at start of the found span.  | 
749  | 0  |         return destIndex;  | 
750  | 0  |     }  | 
751  | 0  |     if (changed) { | 
752  |  |         // In a change span, map to its end.  | 
753  | 0  |         return destIndex + newLength_;  | 
754  | 0  |     } else { | 
755  |  |         // In an unchanged span, offset 1:1 within it.  | 
756  | 0  |         return destIndex + (i - srcIndex);  | 
757  | 0  |     }  | 
758  | 0  | }  | 
759  |  |  | 
760  | 0  | int32_t Edits::Iterator::sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode) { | 
761  | 0  |     int32_t where = findIndex(i, FALSE, errorCode);  | 
762  | 0  |     if (where < 0) { | 
763  |  |         // Error or before the string.  | 
764  | 0  |         return 0;  | 
765  | 0  |     }  | 
766  | 0  |     if (where > 0 || i == destIndex) { | 
767  |  |         // At or after string length, or at start of the found span.  | 
768  | 0  |         return srcIndex;  | 
769  | 0  |     }  | 
770  | 0  |     if (changed) { | 
771  |  |         // In a change span, map to its end.  | 
772  | 0  |         return srcIndex + oldLength_;  | 
773  | 0  |     } else { | 
774  |  |         // In an unchanged span, offset within it.  | 
775  | 0  |         return srcIndex + (i - destIndex);  | 
776  | 0  |     }  | 
777  | 0  | }  | 
778  |  |  | 
779  | 0  | UnicodeString& Edits::Iterator::toString(UnicodeString& sb) const { | 
780  | 0  |     sb.append(u"{ src[", -1); | 
781  | 0  |     ICU_Utility::appendNumber(sb, srcIndex);  | 
782  | 0  |     sb.append(u"..", -1);  | 
783  | 0  |     ICU_Utility::appendNumber(sb, srcIndex + oldLength_);  | 
784  | 0  |     if (changed) { | 
785  | 0  |         sb.append(u"] ⇝ dest[", -1);  | 
786  | 0  |     } else { | 
787  | 0  |         sb.append(u"] ≡ dest[", -1);  | 
788  | 0  |     }  | 
789  | 0  |     ICU_Utility::appendNumber(sb, destIndex);  | 
790  | 0  |     sb.append(u"..", -1);  | 
791  | 0  |     ICU_Utility::appendNumber(sb, destIndex + newLength_);  | 
792  | 0  |     if (changed) { | 
793  | 0  |         sb.append(u"], repl[", -1);  | 
794  | 0  |         ICU_Utility::appendNumber(sb, replIndex);  | 
795  | 0  |         sb.append(u"..", -1);  | 
796  | 0  |         ICU_Utility::appendNumber(sb, replIndex + newLength_);  | 
797  | 0  |         sb.append(u"] }", -1);  | 
798  | 0  |     } else { | 
799  | 0  |         sb.append(u"] (no-change) }", -1);  | 
800  | 0  |     }  | 
801  | 0  |     return sb;  | 
802  | 0  | }  | 
803  |  |  | 
804  |  | U_NAMESPACE_END  |