/src/icu/source/common/rbbi_cache.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // Copyright (C) 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  |  | 
4  |  | // file: rbbi_cache.h  | 
5  |  | //  | 
6  |  | #ifndef RBBI_CACHE_H  | 
7  |  | #define RBBI_CACHE_H  | 
8  |  |  | 
9  |  | #include "unicode/utypes.h"  | 
10  |  |  | 
11  |  | #if !UCONFIG_NO_BREAK_ITERATION  | 
12  |  |  | 
13  |  | #include "unicode/rbbi.h"  | 
14  |  | #include "unicode/uobject.h"  | 
15  |  |  | 
16  |  | #include "uvectr32.h"  | 
17  |  |  | 
18  |  | U_NAMESPACE_BEGIN  | 
19  |  |  | 
20  |  | /* DictionaryCache  stores the boundaries obtained from a run of dictionary characters.  | 
21  |  |  *                 Dictionary boundaries are moved first to this cache, then from here  | 
22  |  |  *                 to the main BreakCache, where they may interleave with non-dictionary  | 
23  |  |  *                 boundaries. The public BreakIterator API always fetches directly  | 
24  |  |  *                 from the main BreakCache, not from here.  | 
25  |  |  *  | 
26  |  |  *                 In common situations, the number of boundaries in a single dictionary run  | 
27  |  |  *                 should be quite small, because the run is terminated by punctuation, spaces,  | 
28  |  |  *                 or any other non-dictionary characters. The main BreakCache may end  | 
29  |  |  *                 up with boundaries from multiple dictionary based runs.  | 
30  |  |  *  | 
31  |  |  *                 The boundaries are stored in a simple vector (UVector32), with the  | 
32  |  |  *                 assumption that they will be accessed sequentially.  | 
33  |  |  */                   | 
34  |  | class RuleBasedBreakIterator::DictionaryCache: public UMemory { | 
35  |  |   public:  | 
36  |  |      DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status);  | 
37  |  |      ~DictionaryCache();  | 
38  |  |  | 
39  |  |      void reset();  | 
40  |  |  | 
41  |  |      UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex);  | 
42  |  |      UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex);  | 
43  |  |  | 
44  |  |     /**  | 
45  |  |      * Populate the cache with the dictionary based boundaries within a region of text.  | 
46  |  |      * @param startPos  The start position of a range of text  | 
47  |  |      * @param endPos    The end position of a range of text  | 
48  |  |      * @param firstRuleStatus The rule status index that applies to the break at startPos  | 
49  |  |      * @param otherRuleStatus The rule status index that applies to boundaries other than startPos  | 
50  |  |      * @internal  | 
51  |  |      */  | 
52  |  |     void populateDictionary(int32_t startPos, int32_t endPos,  | 
53  |  |                          int32_t firstRuleStatus, int32_t otherRuleStatus);  | 
54  |  |  | 
55  |  |  | 
56  |  |  | 
57  |  |     RuleBasedBreakIterator *fBI;  | 
58  |  |       | 
59  |  |     UVector32           fBreaks;                // A vector containing the boundaries.  | 
60  |  |     int32_t             fPositionInCache;       // Index in fBreaks of last boundary returned by following()  | 
61  |  |                                                 //    or preceding(). Optimizes sequential access.  | 
62  |  |     int32_t             fStart;                 // Text position of first boundary in cache.  | 
63  |  |     int32_t             fLimit;                 // Last boundary in cache. Which is the limit of the  | 
64  |  |                                                 //    text segment being handled by the dictionary.  | 
65  |  |     int32_t             fFirstRuleStatusIndex;  // Rule status info for first boundary.  | 
66  |  |     int32_t             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries.  | 
67  |  | };  | 
68  |  |  | 
69  |  |  | 
70  |  | /*  | 
71  |  |  * class BreakCache  | 
72  |  |  *  | 
73  |  |  * Cache of break boundary positions and rule status values.  | 
74  |  |  * Break iterator API functions, next(), previous(), etc., will use cached results  | 
75  |  |  * when possible, and otherwise cache new results as they are obtained.  | 
76  |  |  *  | 
77  |  |  * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.  | 
78  |  |  *  | 
79  |  |  * The cache is implemented as a single circular buffer.  | 
80  |  |  */  | 
81  |  |  | 
82  |  | /*  | 
83  |  |  * The size of the circular cache buffer is BreakCache::CACHE_SIZE; it must be a power of two.  | 
84  |  |  */  | 
85  |  |  | 
86  |  | class RuleBasedBreakIterator::BreakCache: public UMemory { | 
87  |  |   public:  | 
88  |  |                 BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status);  | 
89  |  |     virtual     ~BreakCache();  | 
90  |  |     void        reset(int32_t pos = 0, int32_t ruleStatus = 0);  | 
91  | 0  |     void        next() {    if (fBufIdx == fEndBufIdx) { | 
92  | 0  |                                 nextOL();  | 
93  | 0  |                             } else { | 
94  | 0  |                                 fBufIdx = modChunkSize(fBufIdx + 1);  | 
95  | 0  |                                 fTextIdx = fBI->fPosition = fBoundaries[fBufIdx];  | 
96  | 0  |                                 fBI->fRuleStatusIndex = fStatuses[fBufIdx];  | 
97  | 0  |                             }  | 
98  | 0  |                 }  | 
99  |  |  | 
100  |  |  | 
101  |  |     void        nextOL();  | 
102  |  |     void        previous(UErrorCode &status);  | 
103  |  |  | 
104  |  |     // Move the iteration state to the position following the startPosition.  | 
105  |  |     // Input position must be pinned to the input length.  | 
106  |  |     void        following(int32_t startPosition, UErrorCode &status);  | 
107  |  |  | 
108  |  |     void        preceding(int32_t startPosition, UErrorCode &status);  | 
109  |  |  | 
110  |  |     /*  | 
111  |  |      * Update the state of the public BreakIterator (fBI) to reflect the  | 
112  |  |      * current state of the break iterator cache (this).  | 
113  |  |      */  | 
114  |  |     int32_t     current();  | 
115  |  |  | 
116  |  |     /**  | 
117  |  |      * Add boundaries to the cache near the specified position.  | 
118  |  |      * The given position need not be a boundary itself.  | 
119  |  |      * The input position must be within the range of the text, and  | 
120  |  |      * on a code point boundary.  | 
121  |  |      * If the requested position is a break boundary, leave the iteration  | 
122  |  |      * position on it.  | 
123  |  |      * If the requested position is not a boundary, leave the iteration  | 
124  |  |      * position on the preceding boundary and include both the  | 
125  |  |      * preceding and following boundaries in the cache.  | 
126  |  |      * Additional boundaries, either preceding or following, may be added  | 
127  |  |      * to the cache as a side effect.  | 
128  |  |      *  | 
129  |  |      * Return false if the operation failed.  | 
130  |  |      */  | 
131  |  |     UBool populateNear(int32_t position, UErrorCode &status);  | 
132  |  |  | 
133  |  |     /**  | 
134  |  |      *  Add boundary(s) to the cache following the current last boundary.  | 
135  |  |      *  Return false if at the end of the text, and no more boundaries can be added.  | 
136  |  |      *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.  | 
137  |  |      */  | 
138  |  |     UBool populateFollowing();  | 
139  |  |  | 
140  |  |     /**  | 
141  |  |      *  Add one or more boundaries to the cache preceding the first currently cached boundary.  | 
142  |  |      *  Leave the iteration position on the first added boundary.  | 
143  |  |      *  Return false if no boundaries could be added (if at the start of the text.)  | 
144  |  |      */  | 
145  |  |     UBool populatePreceding(UErrorCode &status);  | 
146  |  |  | 
147  |  |     enum UpdatePositionValues { | 
148  |  |         RetainCachePosition = 0,  | 
149  |  |         UpdateCachePosition = 1  | 
150  |  |     };  | 
151  |  |  | 
152  |  |     /*  | 
153  |  |      * Add the boundary following the current position.  | 
154  |  |      * The current position can be left as it was, or changed to the newly added boundary,  | 
155  |  |      * as specified by the update parameter.  | 
156  |  |      */  | 
157  |  |     void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);  | 
158  |  |  | 
159  |  |  | 
160  |  |     /*  | 
161  |  |      * Add the boundary preceding the current position.  | 
162  |  |      * The current position can be left as it was, or changed to the newly added boundary,  | 
163  |  |      * as specified by the update parameter.  | 
164  |  |      */  | 
165  |  |     bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update);  | 
166  |  |  | 
167  |  |     /**  | 
168  |  |      *  Set the cache position to the specified position, or, if the position  | 
169  |  |      *  falls between to cached boundaries, to the preceding boundary.  | 
170  |  |      *  Fails if the requested position is outside of the range of boundaries currently held by the cache.  | 
171  |  |      *  The startPosition must be on a code point boundary.  | 
172  |  |      *  | 
173  |  |      *  Return true if successful, false if the specified position is after  | 
174  |  |      *  the last cached boundary or before the first.  | 
175  |  |      */  | 
176  |  |     UBool                   seek(int32_t startPosition);  | 
177  |  |  | 
178  |  |     void dumpCache();  | 
179  |  |  | 
180  |  |   private:  | 
181  | 0  |     static inline int32_t   modChunkSize(int index) { return index & (CACHE_SIZE - 1); } | 
182  |  |  | 
183  |  |     static constexpr int32_t CACHE_SIZE = 128;  | 
184  |  |     static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");  | 
185  |  |  | 
186  |  |     RuleBasedBreakIterator *fBI;  | 
187  |  |     int32_t                 fStartBufIdx;  | 
188  |  |     int32_t                 fEndBufIdx;    // inclusive  | 
189  |  |  | 
190  |  |     int32_t                 fTextIdx;  | 
191  |  |     int32_t                 fBufIdx;  | 
192  |  |  | 
193  |  |     int32_t                 fBoundaries[CACHE_SIZE];  | 
194  |  |     uint16_t                fStatuses[CACHE_SIZE];  | 
195  |  |  | 
196  |  |     UVector32               fSideBuffer;  | 
197  |  | };  | 
198  |  |  | 
199  |  | U_NAMESPACE_END  | 
200  |  |  | 
201  |  | #endif // #if !UCONFIG_NO_BREAK_ITERATION  | 
202  |  |  | 
203  |  | #endif // RBBI_CACHE_H  |