/src/mozilla-central/intl/icu/source/common/rbbi_cache.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | |
4 | | // file: rbbi_cache.cpp |
5 | | |
6 | | #include "unicode/utypes.h" |
7 | | |
8 | | #if !UCONFIG_NO_BREAK_ITERATION |
9 | | |
10 | | #include "unicode/ubrk.h" |
11 | | #include "unicode/rbbi.h" |
12 | | |
13 | | #include "rbbi_cache.h" |
14 | | |
15 | | #include "brkeng.h" |
16 | | #include "cmemory.h" |
17 | | #include "rbbidata.h" |
18 | | #include "rbbirb.h" |
19 | | #include "uassert.h" |
20 | | #include "uvectr32.h" |
21 | | |
22 | | U_NAMESPACE_BEGIN |
23 | | |
24 | | /* |
25 | | * DictionaryCache implementation |
26 | | */ |
27 | | |
28 | | RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status) : |
29 | | fBI(bi), fBreaks(status), fPositionInCache(-1), |
30 | 0 | fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) { |
31 | 0 | } |
32 | | |
33 | 0 | RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() { |
34 | 0 | } |
35 | | |
36 | 0 | void RuleBasedBreakIterator::DictionaryCache::reset() { |
37 | 0 | fPositionInCache = -1; |
38 | 0 | fStart = 0; |
39 | 0 | fLimit = 0; |
40 | 0 | fFirstRuleStatusIndex = 0; |
41 | 0 | fOtherRuleStatusIndex = 0; |
42 | 0 | fBreaks.removeAllElements(); |
43 | 0 | } |
44 | | |
45 | 0 | UBool RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos, int32_t *result, int32_t *statusIndex) { |
46 | 0 | if (fromPos >= fLimit || fromPos < fStart) { |
47 | 0 | fPositionInCache = -1; |
48 | 0 | return FALSE; |
49 | 0 | } |
50 | 0 |
|
51 | 0 | // Sequential iteration, move from previous boundary to the following |
52 | 0 |
|
53 | 0 | int32_t r = 0; |
54 | 0 | if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { |
55 | 0 | ++fPositionInCache; |
56 | 0 | if (fPositionInCache >= fBreaks.size()) { |
57 | 0 | fPositionInCache = -1; |
58 | 0 | return FALSE; |
59 | 0 | } |
60 | 0 | r = fBreaks.elementAti(fPositionInCache); |
61 | 0 | U_ASSERT(r > fromPos); |
62 | 0 | *result = r; |
63 | 0 | *statusIndex = fOtherRuleStatusIndex; |
64 | 0 | return TRUE; |
65 | 0 | } |
66 | 0 |
|
67 | 0 | // Random indexing. Linear search for the boundary following the given position. |
68 | 0 |
|
69 | 0 | for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { |
70 | 0 | r= fBreaks.elementAti(fPositionInCache); |
71 | 0 | if (r > fromPos) { |
72 | 0 | *result = r; |
73 | 0 | *statusIndex = fOtherRuleStatusIndex; |
74 | 0 | return TRUE; |
75 | 0 | } |
76 | 0 | } |
77 | 0 | U_ASSERT(FALSE); |
78 | 0 | fPositionInCache = -1; |
79 | 0 | return FALSE; |
80 | 0 | } |
81 | | |
82 | | |
83 | 0 | UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_t *result, int32_t *statusIndex) { |
84 | 0 | if (fromPos <= fStart || fromPos > fLimit) { |
85 | 0 | fPositionInCache = -1; |
86 | 0 | return FALSE; |
87 | 0 | } |
88 | 0 |
|
89 | 0 | if (fromPos == fLimit) { |
90 | 0 | fPositionInCache = fBreaks.size() - 1; |
91 | 0 | if (fPositionInCache >= 0) { |
92 | 0 | U_ASSERT(fBreaks.elementAti(fPositionInCache) == fromPos); |
93 | 0 | } |
94 | 0 | } |
95 | 0 |
|
96 | 0 | int32_t r; |
97 | 0 | if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAti(fPositionInCache) == fromPos) { |
98 | 0 | --fPositionInCache; |
99 | 0 | r = fBreaks.elementAti(fPositionInCache); |
100 | 0 | U_ASSERT(r < fromPos); |
101 | 0 | *result = r; |
102 | 0 | *statusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; |
103 | 0 | return TRUE; |
104 | 0 | } |
105 | 0 |
|
106 | 0 | if (fPositionInCache == 0) { |
107 | 0 | fPositionInCache = -1; |
108 | 0 | return FALSE; |
109 | 0 | } |
110 | 0 |
|
111 | 0 | for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { |
112 | 0 | r = fBreaks.elementAti(fPositionInCache); |
113 | 0 | if (r < fromPos) { |
114 | 0 | *result = r; |
115 | 0 | *statusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; |
116 | 0 | return TRUE; |
117 | 0 | } |
118 | 0 | } |
119 | 0 | U_ASSERT(FALSE); |
120 | 0 | fPositionInCache = -1; |
121 | 0 | return FALSE; |
122 | 0 | } |
123 | | |
124 | | void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos, |
125 | 0 | int32_t firstRuleStatus, int32_t otherRuleStatus) { |
126 | 0 | if ((endPos - startPos) <= 1) { |
127 | 0 | return; |
128 | 0 | } |
129 | 0 | |
130 | 0 | reset(); |
131 | 0 | fFirstRuleStatusIndex = firstRuleStatus; |
132 | 0 | fOtherRuleStatusIndex = otherRuleStatus; |
133 | 0 |
|
134 | 0 | int32_t rangeStart = startPos; |
135 | 0 | int32_t rangeEnd = endPos; |
136 | 0 |
|
137 | 0 | uint16_t category; |
138 | 0 | int32_t current; |
139 | 0 | UErrorCode status = U_ZERO_ERROR; |
140 | 0 | int32_t foundBreakCount = 0; |
141 | 0 | UText *text = &fBI->fText; |
142 | 0 |
|
143 | 0 | // Loop through the text, looking for ranges of dictionary characters. |
144 | 0 | // For each span, find the appropriate break engine, and ask it to find |
145 | 0 | // any breaks within the span. |
146 | 0 |
|
147 | 0 | utext_setNativeIndex(text, rangeStart); |
148 | 0 | UChar32 c = utext_current32(text); |
149 | 0 | category = UTRIE2_GET16(fBI->fData->fTrie, c); |
150 | 0 |
|
151 | 0 | while(U_SUCCESS(status)) { |
152 | 0 | while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) { |
153 | 0 | utext_next32(text); // TODO: cleaner loop structure. |
154 | 0 | c = utext_current32(text); |
155 | 0 | category = UTRIE2_GET16(fBI->fData->fTrie, c); |
156 | 0 | } |
157 | 0 | if (current >= rangeEnd) { |
158 | 0 | break; |
159 | 0 | } |
160 | 0 | |
161 | 0 | // We now have a dictionary character. Get the appropriate language object |
162 | 0 | // to deal with it. |
163 | 0 | const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c); |
164 | 0 |
|
165 | 0 | // Ask the language object if there are any breaks. It will add them to the cache and |
166 | 0 | // leave the text pointer on the other side of its range, ready to search for the next one. |
167 | 0 | if (lbe != NULL) { |
168 | 0 | foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks); |
169 | 0 | } |
170 | 0 |
|
171 | 0 | // Reload the loop variables for the next go-round |
172 | 0 | c = utext_current32(text); |
173 | 0 | category = UTRIE2_GET16(fBI->fData->fTrie, c); |
174 | 0 | } |
175 | 0 |
|
176 | 0 | // If we found breaks, ensure that the first and last entries are |
177 | 0 | // the original starting and ending position. And initialize the |
178 | 0 | // cache iteration position to the first entry. |
179 | 0 |
|
180 | 0 | // printf("foundBreakCount = %d\n", foundBreakCount); |
181 | 0 | if (foundBreakCount > 0) { |
182 | 0 | U_ASSERT(foundBreakCount == fBreaks.size()); |
183 | 0 | if (startPos < fBreaks.elementAti(0)) { |
184 | 0 | // The dictionary did not place a boundary at the start of the segment of text. |
185 | 0 | // Add one now. This should not commonly happen, but it would be easy for interactions |
186 | 0 | // of the rules for dictionary segments and the break engine implementations to |
187 | 0 | // inadvertently cause it. Cover it here, just in case. |
188 | 0 | fBreaks.insertElementAt(startPos, 0, status); |
189 | 0 | } |
190 | 0 | if (endPos > fBreaks.peeki()) { |
191 | 0 | fBreaks.push(endPos, status); |
192 | 0 | } |
193 | 0 | fPositionInCache = 0; |
194 | 0 | // Note: Dictionary matching may extend beyond the original limit. |
195 | 0 | fStart = fBreaks.elementAti(0); |
196 | 0 | fLimit = fBreaks.peeki(); |
197 | 0 | } else { |
198 | 0 | // there were no language-based breaks, even though the segment contained |
199 | 0 | // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache |
200 | 0 | // for this range will fail, and the calling code will fall back to the rule based boundaries. |
201 | 0 | } |
202 | 0 | } |
203 | | |
204 | | |
205 | | /* |
206 | | * BreakCache implemetation |
207 | | */ |
208 | | |
209 | | RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : |
210 | 0 | fBI(bi), fSideBuffer(status) { |
211 | 0 | reset(); |
212 | 0 | } |
213 | | |
214 | | |
215 | 0 | RuleBasedBreakIterator::BreakCache::~BreakCache() { |
216 | 0 | } |
217 | | |
218 | | |
219 | 0 | void RuleBasedBreakIterator::BreakCache::reset(int32_t pos, int32_t ruleStatus) { |
220 | 0 | fStartBufIdx = 0; |
221 | 0 | fEndBufIdx = 0; |
222 | 0 | fTextIdx = pos; |
223 | 0 | fBufIdx = 0; |
224 | 0 | fBoundaries[0] = pos; |
225 | 0 | fStatuses[0] = (uint16_t)ruleStatus; |
226 | 0 | } |
227 | | |
228 | | |
229 | 0 | int32_t RuleBasedBreakIterator::BreakCache::current() { |
230 | 0 | fBI->fPosition = fTextIdx; |
231 | 0 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; |
232 | 0 | fBI->fDone = FALSE; |
233 | 0 | return fTextIdx; |
234 | 0 | } |
235 | | |
236 | | |
237 | 0 | void RuleBasedBreakIterator::BreakCache::following(int32_t startPos, UErrorCode &status) { |
238 | 0 | if (U_FAILURE(status)) { |
239 | 0 | return; |
240 | 0 | } |
241 | 0 | if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { |
242 | 0 | // startPos is in the cache. Do a next() from that position. |
243 | 0 | // TODO: an awkward set of interactions with bi->fDone |
244 | 0 | // seek() does not clear it; it can't because of interactions with populateNear(). |
245 | 0 | // next() does not clear it in the fast-path case, where everything matters. Maybe it should. |
246 | 0 | // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. |
247 | 0 | fBI->fDone = false; |
248 | 0 | next(); |
249 | 0 | } |
250 | 0 | return; |
251 | 0 | } |
252 | | |
253 | | |
254 | 0 | void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos, UErrorCode &status) { |
255 | 0 | if (U_FAILURE(status)) { |
256 | 0 | return; |
257 | 0 | } |
258 | 0 | if (startPos == fTextIdx || seek(startPos) || populateNear(startPos, status)) { |
259 | 0 | if (startPos == fTextIdx) { |
260 | 0 | previous(status); |
261 | 0 | } else { |
262 | 0 | // seek() leaves the BreakCache positioned at the preceding boundary |
263 | 0 | // if the requested position is between two bounaries. |
264 | 0 | // current() pushes the BreakCache position out to the BreakIterator itself. |
265 | 0 | U_ASSERT(startPos > fTextIdx); |
266 | 0 | current(); |
267 | 0 | } |
268 | 0 | } |
269 | 0 | return; |
270 | 0 | } |
271 | | |
272 | | |
273 | | /* |
274 | | * Out-of-line code for BreakCache::next(). |
275 | | * Cache does not already contain the boundary |
276 | | */ |
277 | 0 | void RuleBasedBreakIterator::BreakCache::nextOL() { |
278 | 0 | fBI->fDone = !populateFollowing(); |
279 | 0 | fBI->fPosition = fTextIdx; |
280 | 0 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; |
281 | 0 | return; |
282 | 0 | } |
283 | | |
284 | | |
285 | 0 | void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) { |
286 | 0 | if (U_FAILURE(status)) { |
287 | 0 | return; |
288 | 0 | } |
289 | 0 | int32_t initialBufIdx = fBufIdx; |
290 | 0 | if (fBufIdx == fStartBufIdx) { |
291 | 0 | // At start of cache. Prepend to it. |
292 | 0 | populatePreceding(status); |
293 | 0 | } else { |
294 | 0 | // Cache already holds the next boundary |
295 | 0 | fBufIdx = modChunkSize(fBufIdx - 1); |
296 | 0 | fTextIdx = fBoundaries[fBufIdx]; |
297 | 0 | } |
298 | 0 | fBI->fDone = (fBufIdx == initialBufIdx); |
299 | 0 | fBI->fPosition = fTextIdx; |
300 | 0 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; |
301 | 0 | return; |
302 | 0 | } |
303 | | |
304 | | |
305 | 0 | UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) { |
306 | 0 | if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { |
307 | 0 | return FALSE; |
308 | 0 | } |
309 | 0 | if (pos == fBoundaries[fStartBufIdx]) { |
310 | 0 | // Common case: seek(0), from BreakIterator::first() |
311 | 0 | fBufIdx = fStartBufIdx; |
312 | 0 | fTextIdx = fBoundaries[fBufIdx]; |
313 | 0 | return TRUE; |
314 | 0 | } |
315 | 0 | if (pos == fBoundaries[fEndBufIdx]) { |
316 | 0 | fBufIdx = fEndBufIdx; |
317 | 0 | fTextIdx = fBoundaries[fBufIdx]; |
318 | 0 | return TRUE; |
319 | 0 | } |
320 | 0 |
|
321 | 0 | int32_t min = fStartBufIdx; |
322 | 0 | int32_t max = fEndBufIdx; |
323 | 0 | while (min != max) { |
324 | 0 | int32_t probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; |
325 | 0 | probe = modChunkSize(probe); |
326 | 0 | if (fBoundaries[probe] > pos) { |
327 | 0 | max = probe; |
328 | 0 | } else { |
329 | 0 | min = modChunkSize(probe + 1); |
330 | 0 | } |
331 | 0 | } |
332 | 0 | U_ASSERT(fBoundaries[max] > pos); |
333 | 0 | fBufIdx = modChunkSize(max - 1); |
334 | 0 | fTextIdx = fBoundaries[fBufIdx]; |
335 | 0 | U_ASSERT(fTextIdx <= pos); |
336 | 0 | return TRUE; |
337 | 0 | } |
338 | | |
339 | | |
340 | 0 | UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorCode &status) { |
341 | 0 | if (U_FAILURE(status)) { |
342 | 0 | return FALSE; |
343 | 0 | } |
344 | 0 | U_ASSERT(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); |
345 | 0 |
|
346 | 0 | // Find a boundary somewhere in the vicinity of the requested position. |
347 | 0 | // Depending on the safe rules and the text data, it could be either before, at, or after |
348 | 0 | // the requested position. |
349 | 0 |
|
350 | 0 |
|
351 | 0 | // If the requested position is not near already cached positions, clear the existing cache, |
352 | 0 | // find a near-by boundary and begin new cache contents there. |
353 | 0 |
|
354 | 0 | if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { |
355 | 0 | int32_t aBoundary = 0; |
356 | 0 | int32_t ruleStatusIndex = 0; |
357 | 0 | if (position > 20) { |
358 | 0 | int32_t backupPos = fBI->handleSafePrevious(position); |
359 | 0 |
|
360 | 0 | if (backupPos > 0) { |
361 | 0 | // Advance to the boundary following the backup position. |
362 | 0 | // There is a complication: the safe reverse rules identify pairs of code points |
363 | 0 | // that are safe. If advancing from the safe point moves forwards by less than |
364 | 0 | // two code points, we need to advance one more time to ensure that the boundary |
365 | 0 | // is good, including a correct rules status value. |
366 | 0 | // |
367 | 0 | fBI->fPosition = backupPos; |
368 | 0 | aBoundary = fBI->handleNext(); |
369 | 0 | if (aBoundary <= backupPos + 4) { |
370 | 0 | // +4 is a quick test for possibly having advanced only one codepoint. |
371 | 0 | // Four being the length of the longest potential code point, a supplementary in UTF-8 |
372 | 0 | utext_setNativeIndex(&fBI->fText, aBoundary); |
373 | 0 | if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) { |
374 | 0 | // The initial handleNext() only advanced by a single code point. Go again. |
375 | 0 | aBoundary = fBI->handleNext(); // Safe rules identify safe pairs. |
376 | 0 | } |
377 | 0 | } |
378 | 0 | ruleStatusIndex = fBI->fRuleStatusIndex; |
379 | 0 | } |
380 | 0 | } |
381 | 0 | reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. |
382 | 0 | } |
383 | 0 |
|
384 | 0 | // Fill in boundaries between existing cache content and the new requested position. |
385 | 0 |
|
386 | 0 | if (fBoundaries[fEndBufIdx] < position) { |
387 | 0 | // The last position in the cache precedes the requested position. |
388 | 0 | // Add following position(s) to the cache. |
389 | 0 | while (fBoundaries[fEndBufIdx] < position) { |
390 | 0 | if (!populateFollowing()) { |
391 | 0 | U_ASSERT(false); |
392 | 0 | return false; |
393 | 0 | } |
394 | 0 | } |
395 | 0 | fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. |
396 | 0 | fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. |
397 | 0 | while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. |
398 | 0 | previous(status); |
399 | 0 | } |
400 | 0 | return true; |
401 | 0 | } |
402 | 0 |
|
403 | 0 | if (fBoundaries[fStartBufIdx] > position) { |
404 | 0 | // The first position in the cache is beyond the requested position. |
405 | 0 | // back up more until we get a boundary <= the requested position. |
406 | 0 | while (fBoundaries[fStartBufIdx] > position) { |
407 | 0 | populatePreceding(status); |
408 | 0 | } |
409 | 0 | fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. |
410 | 0 | fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. |
411 | 0 | while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. |
412 | 0 | next(); |
413 | 0 | } |
414 | 0 | if (fTextIdx > position) { |
415 | 0 | // If position is not itself a boundary, the next() loop above will overshoot. |
416 | 0 | // Back up one, leaving cache position at the boundary preceding the requested position. |
417 | 0 | previous(status); |
418 | 0 | } |
419 | 0 | return true; |
420 | 0 | } |
421 | 0 |
|
422 | 0 | U_ASSERT(fTextIdx == position); |
423 | 0 | return true; |
424 | 0 | } |
425 | | |
426 | | |
427 | | |
428 | 0 | UBool RuleBasedBreakIterator::BreakCache::populateFollowing() { |
429 | 0 | int32_t fromPosition = fBoundaries[fEndBufIdx]; |
430 | 0 | int32_t fromRuleStatusIdx = fStatuses[fEndBufIdx]; |
431 | 0 | int32_t pos = 0; |
432 | 0 | int32_t ruleStatusIdx = 0; |
433 | 0 |
|
434 | 0 | if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { |
435 | 0 | addFollowing(pos, ruleStatusIdx, UpdateCachePosition); |
436 | 0 | return TRUE; |
437 | 0 | } |
438 | 0 |
|
439 | 0 | fBI->fPosition = fromPosition; |
440 | 0 | pos = fBI->handleNext(); |
441 | 0 | if (pos == UBRK_DONE) { |
442 | 0 | return FALSE; |
443 | 0 | } |
444 | 0 |
|
445 | 0 | ruleStatusIdx = fBI->fRuleStatusIndex; |
446 | 0 | if (fBI->fDictionaryCharCount > 0) { |
447 | 0 | // The text segment obtained from the rules includes dictionary characters. |
448 | 0 | // Subdivide it, with subdivided results going into the dictionary cache. |
449 | 0 | fBI->fDictionaryCache->populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); |
450 | 0 | if (fBI->fDictionaryCache->following(fromPosition, &pos, &ruleStatusIdx)) { |
451 | 0 | addFollowing(pos, ruleStatusIdx, UpdateCachePosition); |
452 | 0 | return TRUE; |
453 | 0 | // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point. |
454 | 0 | // But be careful with interactions with populateNear(). |
455 | 0 | } |
456 | 0 | } |
457 | 0 |
|
458 | 0 | // Rule based segment did not include dictionary characters. |
459 | 0 | // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, |
460 | 0 | // meaning that we didn't take the return, above. |
461 | 0 | // Add its end point to the cache. |
462 | 0 | addFollowing(pos, ruleStatusIdx, UpdateCachePosition); |
463 | 0 |
|
464 | 0 | // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. |
465 | 0 | // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. |
466 | 0 | // |
467 | 0 | for (int count=0; count<6; ++count) { |
468 | 0 | pos = fBI->handleNext(); |
469 | 0 | if (pos == UBRK_DONE || fBI->fDictionaryCharCount > 0) { |
470 | 0 | break; |
471 | 0 | } |
472 | 0 | addFollowing(pos, fBI->fRuleStatusIndex, RetainCachePosition); |
473 | 0 | } |
474 | 0 |
|
475 | 0 | return TRUE; |
476 | 0 | } |
477 | | |
478 | | |
479 | 0 | UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) { |
480 | 0 | if (U_FAILURE(status)) { |
481 | 0 | return FALSE; |
482 | 0 | } |
483 | 0 |
|
484 | 0 | int32_t fromPosition = fBoundaries[fStartBufIdx]; |
485 | 0 | if (fromPosition == 0) { |
486 | 0 | return FALSE; |
487 | 0 | } |
488 | 0 |
|
489 | 0 | int32_t position = 0; |
490 | 0 | int32_t positionStatusIdx = 0; |
491 | 0 |
|
492 | 0 | if (fBI->fDictionaryCache->preceding(fromPosition, &position, &positionStatusIdx)) { |
493 | 0 | addPreceding(position, positionStatusIdx, UpdateCachePosition); |
494 | 0 | return TRUE; |
495 | 0 | } |
496 | 0 |
|
497 | 0 | int32_t backupPosition = fromPosition; |
498 | 0 |
|
499 | 0 | // Find a boundary somewhere preceding the first already-cached boundary |
500 | 0 | do { |
501 | 0 | backupPosition = backupPosition - 30; |
502 | 0 | if (backupPosition <= 0) { |
503 | 0 | backupPosition = 0; |
504 | 0 | } else { |
505 | 0 | backupPosition = fBI->handleSafePrevious(backupPosition); |
506 | 0 | } |
507 | 0 | if (backupPosition == UBRK_DONE || backupPosition == 0) { |
508 | 0 | position = 0; |
509 | 0 | positionStatusIdx = 0; |
510 | 0 | } else { |
511 | 0 | // Advance to the boundary following the backup position. |
512 | 0 | // There is a complication: the safe reverse rules identify pairs of code points |
513 | 0 | // that are safe. If advancing from the safe point moves forwards by less than |
514 | 0 | // two code points, we need to advance one more time to ensure that the boundary |
515 | 0 | // is good, including a correct rules status value. |
516 | 0 | // |
517 | 0 | fBI->fPosition = backupPosition; |
518 | 0 | position = fBI->handleNext(); |
519 | 0 | if (position <= backupPosition + 4) { |
520 | 0 | // +4 is a quick test for possibly having advanced only one codepoint. |
521 | 0 | // Four being the length of the longest potential code point, a supplementary in UTF-8 |
522 | 0 | utext_setNativeIndex(&fBI->fText, position); |
523 | 0 | if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) { |
524 | 0 | // The initial handleNext() only advanced by a single code point. Go again. |
525 | 0 | position = fBI->handleNext(); // Safe rules identify safe pairs. |
526 | 0 | } |
527 | 0 | }; |
528 | 0 | positionStatusIdx = fBI->fRuleStatusIndex; |
529 | 0 | } |
530 | 0 | } while (position >= fromPosition); |
531 | 0 |
|
532 | 0 | // Find boundaries between the one we just located and the first already-cached boundary |
533 | 0 | // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. |
534 | 0 |
|
535 | 0 | fSideBuffer.removeAllElements(); |
536 | 0 | fSideBuffer.addElement(position, status); |
537 | 0 | fSideBuffer.addElement(positionStatusIdx, status); |
538 | 0 |
|
539 | 0 | do { |
540 | 0 | int32_t prevPosition = fBI->fPosition = position; |
541 | 0 | int32_t prevStatusIdx = positionStatusIdx; |
542 | 0 | position = fBI->handleNext(); |
543 | 0 | positionStatusIdx = fBI->fRuleStatusIndex; |
544 | 0 | if (position == UBRK_DONE) { |
545 | 0 | break; |
546 | 0 | } |
547 | 0 | |
548 | 0 | UBool segmentHandledByDictionary = FALSE; |
549 | 0 | if (fBI->fDictionaryCharCount != 0) { |
550 | 0 | // Segment from the rules includes dictionary characters. |
551 | 0 | // Subdivide it, with subdivided results going into the dictionary cache. |
552 | 0 | int32_t dictSegEndPosition = position; |
553 | 0 | fBI->fDictionaryCache->populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); |
554 | 0 | while (fBI->fDictionaryCache->following(prevPosition, &position, &positionStatusIdx)) { |
555 | 0 | segmentHandledByDictionary = true; |
556 | 0 | U_ASSERT(position > prevPosition); |
557 | 0 | if (position >= fromPosition) { |
558 | 0 | break; |
559 | 0 | } |
560 | 0 | U_ASSERT(position <= dictSegEndPosition); |
561 | 0 | fSideBuffer.addElement(position, status); |
562 | 0 | fSideBuffer.addElement(positionStatusIdx, status); |
563 | 0 | prevPosition = position; |
564 | 0 | } |
565 | 0 | U_ASSERT(position==dictSegEndPosition || position>=fromPosition); |
566 | 0 | } |
567 | 0 |
|
568 | 0 | if (!segmentHandledByDictionary && position < fromPosition) { |
569 | 0 | fSideBuffer.addElement(position, status); |
570 | 0 | fSideBuffer.addElement(positionStatusIdx, status); |
571 | 0 | } |
572 | 0 | } while (position < fromPosition); |
573 | 0 |
|
574 | 0 | // Move boundaries from the side buffer to the main circular buffer. |
575 | 0 | UBool success = FALSE; |
576 | 0 | if (!fSideBuffer.isEmpty()) { |
577 | 0 | positionStatusIdx = fSideBuffer.popi(); |
578 | 0 | position = fSideBuffer.popi(); |
579 | 0 | addPreceding(position, positionStatusIdx, UpdateCachePosition); |
580 | 0 | success = TRUE; |
581 | 0 | } |
582 | 0 |
|
583 | 0 | while (!fSideBuffer.isEmpty()) { |
584 | 0 | positionStatusIdx = fSideBuffer.popi(); |
585 | 0 | position = fSideBuffer.popi(); |
586 | 0 | if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { |
587 | 0 | // No space in circular buffer to hold a new preceding result while |
588 | 0 | // also retaining the current cache (iteration) position. |
589 | 0 | // Bailing out is safe; the cache will refill again if needed. |
590 | 0 | break; |
591 | 0 | } |
592 | 0 | } |
593 | 0 |
|
594 | 0 | return success; |
595 | 0 | } |
596 | | |
597 | | |
598 | 0 | void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { |
599 | 0 | U_ASSERT(position > fBoundaries[fEndBufIdx]); |
600 | 0 | U_ASSERT(ruleStatusIdx <= UINT16_MAX); |
601 | 0 | int32_t nextIdx = modChunkSize(fEndBufIdx + 1); |
602 | 0 | if (nextIdx == fStartBufIdx) { |
603 | 0 | fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. |
604 | 0 | } |
605 | 0 | fBoundaries[nextIdx] = position; |
606 | 0 | fStatuses[nextIdx] = ruleStatusIdx; |
607 | 0 | fEndBufIdx = nextIdx; |
608 | 0 | if (update == UpdateCachePosition) { |
609 | 0 | // Set current position to the newly added boundary. |
610 | 0 | fBufIdx = nextIdx; |
611 | 0 | fTextIdx = position; |
612 | 0 | } else { |
613 | 0 | // Retaining the original cache position. |
614 | 0 | // Check if the added boundary wraps around the buffer, and would over-write the original position. |
615 | 0 | // It's the responsibility of callers of this function to not add too many. |
616 | 0 | U_ASSERT(nextIdx != fBufIdx); |
617 | 0 | } |
618 | 0 | } |
619 | | |
620 | 0 | bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update) { |
621 | 0 | U_ASSERT(position < fBoundaries[fStartBufIdx]); |
622 | 0 | U_ASSERT(ruleStatusIdx <= UINT16_MAX); |
623 | 0 | int32_t nextIdx = modChunkSize(fStartBufIdx - 1); |
624 | 0 | if (nextIdx == fEndBufIdx) { |
625 | 0 | if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { |
626 | 0 | // Failure. The insertion of the new boundary would claim the buffer position that is the |
627 | 0 | // current iteration position. And we also want to retain the current iteration position. |
628 | 0 | // (The buffer is already completely full of entries that precede the iteration position.) |
629 | 0 | return false; |
630 | 0 | } |
631 | 0 | fEndBufIdx = modChunkSize(fEndBufIdx - 1); |
632 | 0 | } |
633 | 0 | fBoundaries[nextIdx] = position; |
634 | 0 | fStatuses[nextIdx] = ruleStatusIdx; |
635 | 0 | fStartBufIdx = nextIdx; |
636 | 0 | if (update == UpdateCachePosition) { |
637 | 0 | fBufIdx = nextIdx; |
638 | 0 | fTextIdx = position; |
639 | 0 | } |
640 | 0 | return true; |
641 | 0 | } |
642 | | |
643 | | |
644 | 0 | void RuleBasedBreakIterator::BreakCache::dumpCache() { |
645 | | #ifdef RBBI_DEBUG |
646 | | RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx, fBufIdx); |
647 | | for (int32_t i=fStartBufIdx; ; i=modChunkSize(i+1)) { |
648 | | RBBIDebugPrintf("%d %d\n", i, fBoundaries[i]); |
649 | | if (i == fEndBufIdx) { |
650 | | break; |
651 | | } |
652 | | } |
653 | | #endif |
654 | | } |
655 | | |
656 | | U_NAMESPACE_END |
657 | | |
658 | | #endif // #if !UCONFIG_NO_BREAK_ITERATION |