/src/icu/source/i18n/unicode/coleitr.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  |  ******************************************************************************  | 
5  |  |  *   Copyright (C) 1997-2014, International Business Machines  | 
6  |  |  *   Corporation and others.  All Rights Reserved.  | 
7  |  |  ******************************************************************************  | 
8  |  |  */  | 
9  |  |  | 
10  |  | /**  | 
11  |  |  * \file   | 
12  |  |  * \brief C++ API: Collation Element Iterator.  | 
13  |  |  */  | 
14  |  |  | 
15  |  | /**  | 
16  |  | * File coleitr.h  | 
17  |  | *  | 
18  |  | * Created by: Helena Shih  | 
19  |  | *  | 
20  |  | * Modification History:  | 
21  |  | *  | 
22  |  | *  Date       Name        Description  | 
23  |  | *  | 
24  |  | *  8/18/97    helena      Added internal API documentation.  | 
25  |  | * 08/03/98    erm         Synched with 1.2 version CollationElementIterator.java  | 
26  |  | * 12/10/99    aliu        Ported Thai collation support from Java.  | 
27  |  | * 01/25/01    swquek      Modified into a C++ wrapper calling C APIs (ucoliter.h)  | 
28  |  | * 02/19/01    swquek      Removed CollationElementsIterator() since it is   | 
29  |  | *                         private constructor and no calls are made to it  | 
30  |  | * 2012-2014   markus      Rewritten in C++ again.  | 
31  |  | */  | 
32  |  |  | 
33  |  | #ifndef COLEITR_H  | 
34  |  | #define COLEITR_H  | 
35  |  |  | 
36  |  | #include "unicode/utypes.h"  | 
37  |  |  | 
38  |  | #if U_SHOW_CPLUSPLUS_API  | 
39  |  |  | 
40  |  | #if !UCONFIG_NO_COLLATION  | 
41  |  |  | 
42  |  | #include "unicode/unistr.h"  | 
43  |  | #include "unicode/uobject.h"  | 
44  |  |  | 
45  |  | struct UCollationElements;  | 
46  |  | struct UHashtable;  | 
47  |  |  | 
48  |  | U_NAMESPACE_BEGIN  | 
49  |  |  | 
50  |  | struct CollationData;  | 
51  |  |  | 
52  |  | class CharacterIterator;  | 
53  |  | class CollationIterator;  | 
54  |  | class RuleBasedCollator;  | 
55  |  | class UCollationPCE;  | 
56  |  | class UVector32;  | 
57  |  |  | 
58  |  | /**  | 
59  |  | * The CollationElementIterator class is used as an iterator to walk through       | 
60  |  | * each character of an international string. Use the iterator to return the  | 
61  |  | * ordering priority of the positioned character. The ordering priority of a   | 
62  |  | * character, which we refer to as a key, defines how a character is collated in   | 
63  |  | * the given collation object.  | 
64  |  | * For example, consider the following in Slovak and in traditional Spanish collation:  | 
65  |  | * <pre>  | 
66  |  | *        "ca" -> the first key is key('c') and second key is key('a'). | 
67  |  | *        "cha" -> the first key is key('ch') and second key is key('a').</pre> | 
68  |  | * And in German phonebook collation,  | 
69  |  | * <pre> \htmlonly       "æb"-> the first key is key('a'), the second key is key('e'), and | 
70  |  | *        the third key is key('b'). \endhtmlonly </pre> | 
71  |  | * The key of a character is an integer composed of primary order (short),  | 
72  |  | * secondary order(char), and tertiary order(char). Java strictly defines the   | 
73  |  | * size and signedness of its primitive data types. Therefore, the static  | 
74  |  | * functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return   | 
75  |  | * int32_t to ensure the correctness of the key value.  | 
76  |  | * <p>Example of the iterator usage: (without error checking)  | 
77  |  | * <pre>  | 
78  |  | * \code  | 
79  |  | *   void CollationElementIterator_Example()  | 
80  |  | *   { | 
81  |  | *       UnicodeString str = "This is a test";  | 
82  |  | *       UErrorCode success = U_ZERO_ERROR;  | 
83  |  | *       RuleBasedCollator* rbc =  | 
84  |  | *           (RuleBasedCollator*) RuleBasedCollator::createInstance(success);  | 
85  |  | *       CollationElementIterator* c =  | 
86  |  | *           rbc->createCollationElementIterator( str );  | 
87  |  | *       int32_t order = c->next(success);  | 
88  |  | *       c->reset();  | 
89  |  | *       order = c->previous(success);  | 
90  |  | *       delete c;  | 
91  |  | *       delete rbc;  | 
92  |  | *   }  | 
93  |  | * \endcode  | 
94  |  | * </pre>  | 
95  |  | * <p>  | 
96  |  | * The method next() returns the collation order of the next character based on  | 
97  |  | * the comparison level of the collator. The method previous() returns the  | 
98  |  | * collation order of the previous character based on the comparison level of  | 
99  |  | * the collator. The Collation Element Iterator moves only in one direction  | 
100  |  | * between calls to reset(), setOffset(), or setText(). That is, next()   | 
101  |  | * and previous() cannot be interleaved. Whenever previous() is to be called after   | 
102  |  | * next() or vice versa, reset(), setOffset() or setText() has to be called first  | 
103  |  | * to reset the status, shifting pointers to either the end or the start of  | 
104  |  | * the string (reset() or setText()), or the specified position (setOffset()).  | 
105  |  | * Hence at the next call of next() or previous(), the first or last collation order,  | 
106  |  | * or collation order at the specified position will be returned. If a change of  | 
107  |  | * direction is done without one of these calls, the result is undefined.  | 
108  |  | * <p>  | 
109  |  | * The result of a forward iterate (next()) and reversed result of the backward  | 
110  |  | * iterate (previous()) on the same string are equivalent, if collation orders  | 
111  |  | * with the value 0 are ignored.  | 
112  |  | * Collation orders are based on the comparison level of the collator.  A collation order   | 
113  |  | * consists of primary order, secondary order and tertiary order.  The data   | 
114  |  | * type of the collation order is <strong>int32_t</strong>.   | 
115  |  | *  | 
116  |  | * Note, CollationElementIterator should not be subclassed.  | 
117  |  | * @see     Collator  | 
118  |  | * @see     RuleBasedCollator  | 
119  |  | * @version 1.8 Jan 16 2001  | 
120  |  | */  | 
121  |  | class U_I18N_API CollationElementIterator U_FINAL : public UObject { | 
122  |  | public:   | 
123  |  |  | 
124  |  |     // CollationElementIterator public data member ------------------------------  | 
125  |  |  | 
126  |  |     enum { | 
127  |  |         /**  | 
128  |  |          * NULLORDER is the value that next() and previous() are documented to return when an error has occurred, or when the end (next()) or the start (previous()) of the string has been reached  | 
129  |  |          * @stable ICU 2.0  | 
130  |  |          */  | 
131  |  |         NULLORDER = (int32_t)0xffffffff  | 
132  |  |     };  | 
133  |  |  | 
134  |  |     // CollationElementIterator public constructor/destructor -------------------  | 
135  |  |  | 
136  |  |     /**  | 
137  |  |     * Copy constructor.  | 
138  |  |     *  | 
139  |  |     * @param other    the object to be copied from  | 
140  |  |     * @stable ICU 2.0  | 
141  |  |     */  | 
142  |  |     CollationElementIterator(const CollationElementIterator& other);  | 
143  |  |  | 
144  |  |     /**   | 
145  |  |     * Destructor  | 
146  |  |     * @stable ICU 2.0  | 
147  |  |     */  | 
148  |  |     virtual ~CollationElementIterator();  | 
149  |  |  | 
150  |  |     // CollationElementIterator public methods ----------------------------------  | 
151  |  |  | 
152  |  |     /**  | 
153  |  |     * Returns true if "other" is the same as "this"  | 
154  |  |     *  | 
155  |  |     * @param other    the object to be compared  | 
156  |  |     * @return         true if "other" is the same as "this"  | 
157  |  |     * @stable ICU 2.0  | 
158  |  |     */  | 
159  |  |     bool operator==(const CollationElementIterator& other) const;  | 
160  |  |  | 
161  |  |     /**  | 
162  |  |     * Returns true if "other" is not the same as "this".  | 
163  |  |     *  | 
164  |  |     * @param other    the object to be compared  | 
165  |  |     * @return         true if "other" is not the same as "this"  | 
166  |  |     * @stable ICU 2.0  | 
167  |  |     */  | 
168  |  |     bool operator!=(const CollationElementIterator& other) const;  | 
169  |  |  | 
170  |  |     /**  | 
171  |  |     * Resets the cursor to the beginning of the string.  | 
172  |  |     * @stable ICU 2.0  | 
173  |  |     */  | 
174  |  |     void reset(void);  | 
175  |  |  | 
176  |  |     /**  | 
177  |  |     * Gets the ordering priority of the next character in the string.  | 
178  |  |     * @param status the error code status.  | 
179  |  |     * @return the next character's ordering, or NULLORDER if an   | 
180  |  |     *         error has occurred or if the end of string has been reached  | 
181  |  |     * @stable ICU 2.0  | 
182  |  |     */  | 
183  |  |     int32_t next(UErrorCode& status);  | 
184  |  |  | 
185  |  |     /**  | 
186  |  |     * Gets the ordering priority of the previous collation element in the string.  | 
187  |  |     * @param status the error code status.  | 
188  |  |     * @return the previous element's ordering, or NULLORDER if an   | 
189  |  |     *         error has occurred or if the start of string has been reached  | 
190  |  |     * @stable ICU 2.0  | 
191  |  |     */  | 
192  |  |     int32_t previous(UErrorCode& status);  | 
193  |  |  | 
194  |  |     /**  | 
195  |  |     * Gets the primary order of a collation order.  | 
196  |  |     * @param order the collation order  | 
197  |  |     * @return the primary order of a collation order.  | 
198  |  |     * @stable ICU 2.0  | 
199  |  |     */  | 
200  |  |     static inline int32_t primaryOrder(int32_t order);  | 
201  |  |  | 
202  |  |     /**  | 
203  |  |     * Gets the secondary order of a collation order.  | 
204  |  |     * @param order the collation order  | 
205  |  |     * @return the secondary order of a collation order.  | 
206  |  |     * @stable ICU 2.0  | 
207  |  |     */  | 
208  |  |     static inline int32_t secondaryOrder(int32_t order);  | 
209  |  |  | 
210  |  |     /**  | 
211  |  |     * Gets the tertiary order of a collation order.  | 
212  |  |     * @param order the collation order  | 
213  |  |     * @return the tertiary order of a collation order.  | 
214  |  |     * @stable ICU 2.0  | 
215  |  |     */  | 
216  |  |     static inline int32_t tertiaryOrder(int32_t order);  | 
217  |  |  | 
218  |  |     /**  | 
219  |  |     * Return the maximum length of any expansion sequences that end with the   | 
220  |  |     * specified comparison order.  | 
221  |  |     * @param order a collation order returned by previous or next.  | 
222  |  |     * @return maximum size of the expansion sequences ending with the collation   | 
223  |  |     *         element or 1 if collation element does not occur at the end of any   | 
224  |  |     *         expansion sequence  | 
225  |  |     * @stable ICU 2.0  | 
226  |  |     */  | 
227  |  |     int32_t getMaxExpansion(int32_t order) const;  | 
228  |  |  | 
229  |  |     /**  | 
230  |  |     * Gets the comparison order in the desired strength, ignoring the other differences.  | 
231  |  |     * @param order The order value  | 
232  |  |     * @return the comparison order in the desired strength  | 
233  |  |     * @stable ICU 2.0  | 
234  |  |     */  | 
235  |  |     int32_t strengthOrder(int32_t order) const;  | 
236  |  |  | 
237  |  |     /**  | 
238  |  |     * Sets the source string.  | 
239  |  |     * @param str the source string.  | 
240  |  |     * @param status the error code status.  | 
241  |  |     * @stable ICU 2.0  | 
242  |  |     */  | 
243  |  |     void setText(const UnicodeString& str, UErrorCode& status);  | 
244  |  |  | 
245  |  |     /**  | 
246  |  |     * Sets the source text from a character iterator.  | 
247  |  |     * @param str the source character iterator.  | 
248  |  |     * @param status the error code status.  | 
249  |  |     * @stable ICU 2.0  | 
250  |  |     */  | 
251  |  |     void setText(CharacterIterator& str, UErrorCode& status);  | 
252  |  |  | 
253  |  |     /**  | 
254  |  |     * Checks if a comparison order is ignorable.  | 
255  |  |     * @param order the collation order.  | 
256  |  |     * @return true if a character is ignorable, false otherwise.  | 
257  |  |     * @stable ICU 2.0  | 
258  |  |     */  | 
259  |  |     static inline UBool isIgnorable(int32_t order);  | 
260  |  |  | 
261  |  |     /**  | 
262  |  |     * Gets the offset of the currently processed character in the source string.  | 
263  |  |     * @return the offset of the character.  | 
264  |  |     * @stable ICU 2.0  | 
265  |  |     */  | 
266  |  |     int32_t getOffset(void) const;  | 
267  |  |  | 
268  |  |     /**  | 
269  |  |     * Sets the offset of the currently processed character in the source string.  | 
270  |  |     * @param newOffset the new offset.  | 
271  |  |     * @param status the error code status.  | 
272  |  |     * (No value is returned: the function is declared void.)  | 
273  |  |     * @stable ICU 2.0  | 
274  |  |     */  | 
275  |  |     void setOffset(int32_t newOffset, UErrorCode& status);  | 
276  |  |  | 
277  |  |     /**  | 
278  |  |     * ICU "poor man's RTTI", returns a UClassID for the actual class.  | 
279  |  |     *  | 
280  |  |     * @stable ICU 2.2  | 
281  |  |     */  | 
282  |  |     virtual UClassID getDynamicClassID() const;  | 
283  |  |  | 
284  |  |     /**  | 
285  |  |     * ICU "poor man's RTTI", returns a UClassID for this class.  | 
286  |  |     *  | 
287  |  |     * @stable ICU 2.2  | 
288  |  |     */  | 
289  |  |     static UClassID U_EXPORT2 getStaticClassID();  | 
290  |  |  | 
291  |  | #ifndef U_HIDE_INTERNAL_API  | 
292  |  |     /** Reinterprets a UCollationElements pointer as a CollationElementIterator pointer (plain reinterpret_cast, no conversion or check). @internal */  | 
293  | 0  |     static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) { | 
294  | 0  |         return reinterpret_cast<CollationElementIterator *>(uc);  | 
295  | 0  |     }  | 
296  |  |     /** Const overload: reinterprets a const UCollationElements pointer as a const CollationElementIterator pointer. @internal */  | 
297  | 0  |     static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) { | 
298  | 0  |         return reinterpret_cast<const CollationElementIterator *>(uc);  | 
299  | 0  |     }  | 
300  |  |     /** Reinterprets this object's address as a UCollationElements pointer (plain reinterpret_cast, no conversion or check). @internal */  | 
301  | 0  |     inline UCollationElements *toUCollationElements() { | 
302  | 0  |         return reinterpret_cast<UCollationElements *>(this);  | 
303  | 0  |     }  | 
304  |  |     /** Const overload: reinterprets this object's address as a const UCollationElements pointer. @internal */  | 
305  | 0  |     inline const UCollationElements *toUCollationElements() const { | 
306  | 0  |         return reinterpret_cast<const UCollationElements *>(this);  | 
307  | 0  |     }  | 
308  |  | #endif  // U_HIDE_INTERNAL_API  | 
309  |  |  | 
310  |  | private:  | 
311  |  |     friend class RuleBasedCollator;  | 
312  |  |     friend class UCollationPCE;  | 
313  |  |  | 
314  |  |     /**  | 
315  |  |     * CollationElementIterator constructor. This takes the source string and the   | 
316  |  |     * collation object. The cursor will walk through the source string based on the   | 
317  |  |     * predefined collation rules. If the source string is empty, NULLORDER will   | 
318  |  |     * be returned on the calls to next().  | 
319  |  |     * @param sourceText    the source string.  | 
320  |  |     * @param order         the collation object.  | 
321  |  |     * @param status        the error code status.  | 
322  |  |     */  | 
323  |  |     CollationElementIterator(const UnicodeString& sourceText,  | 
324  |  |         const RuleBasedCollator* order, UErrorCode& status);  | 
325  |  |     // Note: The constructors should take settings & tailoring, not a collator,  | 
326  |  |     // to avoid circular dependencies.  | 
327  |  |     // However, for operator==() we would need to be able to compare tailoring data for equality  | 
328  |  |     // without making CollationData or CollationTailoring depend on TailoredSet.  | 
329  |  |     // (See the implementation of RuleBasedCollator::operator==().)  | 
330  |  |     // That might require creating an intermediate class that would be used  | 
331  |  |     // by both CollationElementIterator and RuleBasedCollator  | 
332  |  |     // but only contain the part of RBC== related to data and rules.  | 
333  |  |  | 
334  |  |     /**  | 
335  |  |     * CollationElementIterator constructor. This takes the source character iterator and the   | 
336  |  |     * collation object.  The cursor will walk through the source text based on the   | 
337  |  |     * predefined collation rules.  If the source text is empty, NULLORDER will   | 
338  |  |     * be returned on the calls to next().  | 
339  |  |     * @param sourceText    the source character iterator.  | 
340  |  |     * @param order         the collation object.  | 
341  |  |     * @param status        the error code status.  | 
342  |  |     */  | 
343  |  |     CollationElementIterator(const CharacterIterator& sourceText,  | 
344  |  |         const RuleBasedCollator* order, UErrorCode& status);  | 
345  |  |  | 
346  |  |     /**  | 
347  |  |     * Assignment operator (declared private, so only this class and its friends can use it)  | 
348  |  |     *  | 
349  |  |     * @param other    the object to be copied  | 
350  |  |     */  | 
351  |  |     const CollationElementIterator&  | 
352  |  |         operator=(const CollationElementIterator& other);  | 
353  |  |  | 
354  |  |     CollationElementIterator(); // default constructor not implemented  | 
355  |  |  | 
356  |  |     /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()); every other dir_ value is returned unchanged. */  | 
357  | 0  |     inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; } | 
358  |  |  | 
359  |  |     static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode);  | 
360  |  |  | 
361  |  |     static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order);  | 
362  |  |  | 
363  |  |     // CollationElementIterator private data members ----------------------------  | 
364  |  |  | 
365  |  |     CollationIterator *iter_;  // owned  | 
366  |  |     const RuleBasedCollator *rbc_;  // aliased  | 
367  |  |     uint32_t otherHalf_;  | 
368  |  |     /**  | 
369  |  |      * <0: backwards; 0: just after reset() (previous() begins from end);  | 
370  |  |      * 1: just after setOffset(); >1: forward  | 
371  |  |      */  | 
372  |  |     int8_t dir_;  | 
373  |  |     /**  | 
374  |  |      * Stores offsets from expansions and from unsafe-backwards iteration,  | 
375  |  |      * so that getOffset() returns intermediate offsets for the CEs  | 
376  |  |      * that are consistent with forward iteration.  | 
377  |  |      */  | 
378  |  |     UVector32 *offsets_;  | 
379  |  |  | 
380  |  |     UnicodeString string_;  | 
381  |  | };  | 
382  |  |  | 
383  |  | // CollationElementIterator inline method definitions --------------------------  | 
384  |  |  | 
385  |  | /* Returns the primary order: bits 16..31 of order, as a value in 0..0xffff. */ inline int32_t CollationElementIterator::primaryOrder(int32_t order)  | 
386  | 0  | { | 
387  | 0  |     return (order >> 16) & 0xffff;  /* the mask keeps only the low 16 bits of the shifted value, so the result is non-negative even for negative order */  | 
388  | 0  | }  | 
389  |  |  | 
390  |  | /* Returns the secondary order: bits 8..15 of order, as a value in 0..0xff. */ inline int32_t CollationElementIterator::secondaryOrder(int32_t order)  | 
391  | 0  | { | 
392  | 0  |     return (order >> 8) & 0xff;  /* shift the second-lowest byte down, then mask off everything above it */  | 
393  | 0  | }  | 
394  |  |  | 
395  |  | /* Returns the tertiary order: the lowest 8 bits of order, as a value in 0..0xff. */ inline int32_t CollationElementIterator::tertiaryOrder(int32_t order)  | 
396  | 0  | { | 
397  | 0  |     return order & 0xff;  /* no shift needed: the tertiary order occupies the lowest byte */  | 
398  | 0  | }  | 
399  |  |  | 
400  |  | /* Returns true when the upper 16 bits of order (the bits primaryOrder() extracts) are all zero. */ inline UBool CollationElementIterator::isIgnorable(int32_t order)  | 
401  | 0  | { | 
402  | 0  |     return (order & 0xffff0000) == 0;  /* only the upper half is tested; the low 16 bits do not affect the result */  | 
403  | 0  | }  | 
404  |  |  | 
405  |  | U_NAMESPACE_END  | 
406  |  |  | 
407  |  | #endif /* #if !UCONFIG_NO_COLLATION */  | 
408  |  |  | 
409  |  | #endif /* U_SHOW_CPLUSPLUS_API */  | 
410  |  |  | 
411  |  | #endif  |