/src/icu/source/common/unicode/stringtriebuilder.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /*  | 
4  |  | *******************************************************************************  | 
5  |  | *   Copyright (C) 2010-2012,2014, International Business Machines  | 
6  |  | *   Corporation and others.  All Rights Reserved.  | 
7  |  | *******************************************************************************  | 
8  |  | *   file name:  stringtriebuilder.h  | 
9  |  | *   encoding:   UTF-8  | 
10  |  | *   tab size:   8 (not used)  | 
11  |  | *   indentation:4  | 
12  |  | *  | 
13  |  | *   created on: 2010dec24  | 
14  |  | *   created by: Markus W. Scherer  | 
15  |  | */  | 
16  |  |  | 
17  |  | #ifndef __STRINGTRIEBUILDER_H__  | 
18  |  | #define __STRINGTRIEBUILDER_H__  | 
19  |  |  | 
20  |  | #include "unicode/utypes.h"  | 
21  |  |  | 
22  |  | #if U_SHOW_CPLUSPLUS_API  | 
23  |  |  | 
24  |  | #include "unicode/uobject.h"  | 
25  |  |  | 
26  |  | /**  | 
27  |  |  * \file  | 
28  |  |  * \brief C++ API: Builder API for trie builders  | 
29  |  |  */  | 
30  |  |  | 
31  |  | // Forward declaration.  | 
32  |  | /// \cond  | 
33  |  | struct UHashtable;  | 
34  |  | typedef struct UHashtable UHashtable;  | 
35  |  | /// \endcond  | 
36  |  |  | 
37  |  | /**  | 
38  |  |  * Build options for BytesTrieBuilder and CharsTrieBuilder.  | 
39  |  |  * @stable ICU 4.8  | 
40  |  |  */  | 
41  |  | enum UStringTrieBuildOption { | 
42  |  |     /**  | 
43  |  |      * Builds a trie quickly.  | 
44  |  |      * @stable ICU 4.8  | 
45  |  |      */  | 
46  |  |     USTRINGTRIE_BUILD_FAST,  | 
47  |  |     /**  | 
48  |  |      * Builds a trie more slowly, attempting to generate  | 
49  |  |      * a shorter but equivalent serialization.  | 
50  |  |      * This build option also uses more memory.  | 
51  |  |      *  | 
52  |  |      * This option can be effective when many integer values are the same  | 
53  |  |      * and string/byte sequence suffixes can be shared.  | 
54  |  |      * Runtime speed is not expected to improve.  | 
55  |  |      * @stable ICU 4.8  | 
56  |  |      */  | 
57  |  |     USTRINGTRIE_BUILD_SMALL  | 
58  |  | };  | 
59  |  |  | 
60  |  | U_NAMESPACE_BEGIN  | 
61  |  |  | 
62  |  | /**  | 
63  |  |  * Base class for string trie builder classes.  | 
64  |  |  *  | 
65  |  |  * This class is not intended for public subclassing.  | 
66  |  |  * @stable ICU 4.8  | 
67  |  |  */  | 
68  |  | class U_COMMON_API StringTrieBuilder : public UObject { | 
69  |  | public:  | 
70  |  | #ifndef U_HIDE_INTERNAL_API  | 
71  |  |     /** @internal */  | 
72  |  |     static int32_t hashNode(const void *node);  | 
73  |  |     /** @internal */  | 
74  |  |     static UBool equalNodes(const void *left, const void *right);  | 
75  |  | #endif  /* U_HIDE_INTERNAL_API */  | 
76  |  |  | 
77  |  | protected:  | 
78  |  |     // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API  | 
79  |  |     // or else the compiler will create a public default constructor.  | 
80  |  |     /** @internal */  | 
81  |  |     StringTrieBuilder();  | 
82  |  |     /** @internal */  | 
83  |  |     virtual ~StringTrieBuilder();  | 
84  |  |  | 
85  |  | #ifndef U_HIDE_INTERNAL_API  | 
86  |  |     /** @internal */  | 
87  |  |     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);  | 
88  |  |     /** @internal */  | 
89  |  |     void deleteCompactBuilder();  | 
90  |  |  | 
91  |  |     /** @internal */  | 
92  |  |     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);  | 
93  |  |  | 
94  |  |     /** @internal */  | 
95  |  |     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);  | 
96  |  |     /** @internal */  | 
97  |  |     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);  | 
98  |  | #endif  /* U_HIDE_INTERNAL_API */  | 
99  |  |  | 
100  |  |     class Node;  | 
101  |  |  | 
102  |  | #ifndef U_HIDE_INTERNAL_API  | 
103  |  |     /** @internal */  | 
104  |  |     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);  | 
105  |  |     /** @internal */  | 
106  |  |     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,  | 
107  |  |                             int32_t length, UErrorCode &errorCode);  | 
108  |  | #endif  /* U_HIDE_INTERNAL_API */  | 
109  |  |  | 
110  |  |     /** @internal */  | 
111  |  |     virtual int32_t getElementStringLength(int32_t i) const = 0;  | 
112  |  |     /** @internal */  | 
113  |  |     virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;  | 
114  |  |     /** @internal */  | 
115  |  |     virtual int32_t getElementValue(int32_t i) const = 0;  | 
116  |  |  | 
117  |  |     // Finds the first unit index after this one where  | 
118  |  |     // the first and last element have different units again.  | 
119  |  |     /** @internal */  | 
120  |  |     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;  | 
121  |  |  | 
122  |  |     // Number of different units at unitIndex.  | 
123  |  |     /** @internal */  | 
124  |  |     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;  | 
125  |  |     /** @internal */  | 
126  |  |     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;  | 
127  |  |     /** @internal */  | 
128  |  |     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;  | 
129  |  |  | 
130  |  |     /** @internal */  | 
131  |  |     virtual UBool matchNodesCanHaveValues() const = 0;  | 
132  |  |  | 
133  |  |     /** @internal */  | 
134  |  |     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;  | 
135  |  |     /** @internal */  | 
136  |  |     virtual int32_t getMinLinearMatch() const = 0;  | 
137  |  |     /** @internal */  | 
138  |  |     virtual int32_t getMaxLinearMatchLength() const = 0;  | 
139  |  |  | 
140  |  | #ifndef U_HIDE_INTERNAL_API  | 
141  |  |     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).  | 
142  |  |     /** @internal */  | 
143  |  |     static const int32_t kMaxBranchLinearSubNodeLength=5;  | 
144  |  |  | 
145  |  |     // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.  | 
146  |  |     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.  | 
147  |  |     /** @internal */  | 
148  |  |     static const int32_t kMaxSplitBranchLevels=14;  | 
149  |  |  | 
150  |  |     /**  | 
151  |  |      * Makes sure that there is only one unique node registered that is  | 
152  |  |      * equivalent to newNode.  | 
153  |  |      * @param newNode Input node. The builder takes ownership.  | 
154  |  |      * @param errorCode ICU in/out UErrorCode.  | 
155  |  |                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.  | 
156  |  |      * @return newNode if it is the first of its kind, or  | 
157  |  |      *         an equivalent node if newNode is a duplicate.  | 
158  |  |      * @internal  | 
159  |  |      */  | 
160  |  |     Node *registerNode(Node *newNode, UErrorCode &errorCode);  | 
161  |  |     /**  | 
162  |  |      * Makes sure that there is only one unique FinalValueNode registered  | 
163  |  |      * with this value.  | 
164  |  |      * Avoids creating a node if the value is a duplicate.  | 
165  |  |      * @param value A final value.  | 
166  |  |      * @param errorCode ICU in/out UErrorCode.  | 
167  |  |                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.  | 
168  |  |      * @return A FinalValueNode with the given value.  | 
169  |  |      * @internal  | 
170  |  |      */  | 
171  |  |     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);  | 
172  |  | #endif  /* U_HIDE_INTERNAL_API */  | 
173  |  |  | 
174  |  |     /*  | 
175  |  |      * C++ note:  | 
176  |  |      * registerNode() and registerFinalValue() take ownership of their input nodes,  | 
177  |  |      * and only return owned nodes.  | 
178  |  |      * If they see a failure UErrorCode, they will delete the input node.  | 
179  |  |      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.  | 
180  |  |      * If there is a failure, they return NULL.  | 
181  |  |      *  | 
182  |  |      * NULL Node pointers can be safely passed into other Nodes because  | 
183  |  |      * they call the static Node::hashCode() which checks for a NULL pointer first.  | 
184  |  |      *  | 
185  |  |      * Therefore, as long as builder functions register a new node,  | 
186  |  |      * they need to check for failures only before explicitly dereferencing  | 
187  |  |      * a Node pointer, or before setting a new UErrorCode.  | 
188  |  |      */  | 
189  |  |  | 
190  |  |     // Hash set of nodes, maps from nodes to integer 1.  | 
191  |  |     /** @internal */  | 
192  |  |     UHashtable *nodes;  | 
193  |  |  | 
194  |  |     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,  | 
195  |  |     // it is needed for layout of other objects.  | 
196  |  |     /**  | 
197  |  |      * @internal  | 
198  |  |      * \cond  | 
199  |  |      */  | 
200  |  |     class Node : public UObject { | 
201  |  |     public:  | 
202  | 0  |         Node(int32_t initialHash) : hash(initialHash), offset(0) {} | 
203  | 0  |         inline int32_t hashCode() const { return hash; } | 
204  |  |         // Handles node==NULL.  | 
205  | 0  |         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); } | 
206  |  |         // Base class operator==() compares the actual class types.  | 
207  |  |         virtual bool operator==(const Node &other) const;  | 
208  | 0  |         inline bool operator!=(const Node &other) const { return !operator==(other); } | 
209  |  |         /**  | 
210  |  |          * Traverses the Node graph and numbers branch edges, with rightmost edges first.  | 
211  |  |          * This is to avoid writing a duplicate node twice.  | 
212  |  |          *  | 
213  |  |          * Branch nodes in this trie data structure are not symmetric.  | 
214  |  |          * Most branch edges "jump" to other nodes but the rightmost branch edges  | 
215  |  |          * just continue without a jump.  | 
216  |  |          * Therefore, write() must write the rightmost branch edge last  | 
217  |  |          * (trie units are written backwards), and must write it at that point even if  | 
218  |  |          * it is a duplicate of a node previously written elsewhere.  | 
219  |  |          *  | 
220  |  |          * This function visits and marks right branch edges first.  | 
221  |  |          * Edges are numbered with increasingly negative values because we share the  | 
222  |  |          * offset field which gets positive values when nodes are written.  | 
223  |  |          * A branch edge also remembers the first number for any of its edges.  | 
224  |  |          *  | 
225  |  |          * When a further-left branch edge has a number in the range of the rightmost  | 
226  |  |          * edge's numbers, then it will be written as part of the required right edge  | 
227  |  |          * and we can avoid writing it first.  | 
228  |  |          *  | 
229  |  |          * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative  | 
230  |  |          * edge numbers.  | 
231  |  |          *  | 
232  |  |          * @param edgeNumber The first edge number for this node and its sub-nodes.  | 
233  |  |          * @return An edge number that is at least the maximum-negative  | 
234  |  |          *         of the input edge number and the numbers of this node and all of its sub-nodes.  | 
235  |  |          */  | 
236  |  |         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);  | 
237  |  |         // write() must set the offset to a positive value.  | 
238  |  |         virtual void write(StringTrieBuilder &builder) = 0;  | 
239  |  |         // See markRightEdgesFirst.  | 
240  |  |         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,  | 
241  | 0  |                                                StringTrieBuilder &builder) { | 
242  |  |             // Note: Edge numbers are negative, lastRight<=firstRight.  | 
243  |  |             // If offset>0 then this node and its sub-nodes have been written already  | 
244  |  |             // and we need not write them again.  | 
245  |  |             // If this node is part of the unwritten right branch edge,  | 
246  |  |             // then we wait until that is written.  | 
247  | 0  |             if(offset<0 && (offset<lastRight || firstRight<offset)) { | 
248  | 0  |                 write(builder);  | 
249  | 0  |             }  | 
250  | 0  |         }  | 
251  | 0  |         inline int32_t getOffset() const { return offset; } | 
252  |  |     protected:  | 
253  |  |         int32_t hash;  | 
254  |  |         int32_t offset;  | 
255  |  |     };  | 
256  |  |  | 
257  |  | #ifndef U_HIDE_INTERNAL_API  | 
258  |  |     // This class should not be overridden because  | 
259  |  |     // registerFinalValue() compares a stack-allocated FinalValueNode  | 
260  |  |     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)  | 
261  |  |     // with the input node, and the  | 
262  |  |     // !Node::operator==(other) used inside FinalValueNode::operator==(other)  | 
263  |  |     // will be false if the typeid's are different.  | 
264  |  |     /** @internal */  | 
265  |  |     class FinalValueNode : public Node { | 
266  |  |     public:  | 
267  | 0  |         FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {} | 
268  |  |         virtual bool operator==(const Node &other) const;  | 
269  |  |         virtual void write(StringTrieBuilder &builder);  | 
270  |  |     protected:  | 
271  |  |         int32_t value;  | 
272  |  |     };  | 
273  |  | #endif  /* U_HIDE_INTERNAL_API */  | 
274  |  |  | 
275  |  |     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,  | 
276  |  |     // it is needed for layout of other objects.  | 
277  |  |     /**  | 
278  |  |      * @internal   | 
279  |  |      */  | 
280  |  |     class ValueNode : public Node { | 
281  |  |     public:  | 
282  | 0  |         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(false), value(0) {} | 
283  |  |         virtual bool operator==(const Node &other) const;  | 
284  | 0  |         void setValue(int32_t v) { | 
285  | 0  |             hasValue=true;  | 
286  | 0  |             value=v;  | 
287  | 0  |             hash=hash*37u+v;  | 
288  | 0  |         }  | 
289  |  |     protected:  | 
290  |  |         UBool hasValue;  | 
291  |  |         int32_t value;  | 
292  |  |     };  | 
293  |  |  | 
294  |  | #ifndef U_HIDE_INTERNAL_API  | 
295  |  |     /**   | 
296  |  |      * @internal   | 
297  |  |      */  | 
298  |  |     class IntermediateValueNode : public ValueNode { | 
299  |  |     public:  | 
300  |  |         IntermediateValueNode(int32_t v, Node *nextNode)  | 
301  | 0  |                 : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); } | 
302  |  |         virtual bool operator==(const Node &other) const;  | 
303  |  |         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);  | 
304  |  |         virtual void write(StringTrieBuilder &builder);  | 
305  |  |     protected:  | 
306  |  |         Node *next;  | 
307  |  |     };  | 
308  |  | #endif  /* U_HIDE_INTERNAL_API */  | 
309  |  |  | 
310  |  |     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,  | 
311  |  |     // it is needed for layout of other objects.  | 
312  |  |     /**  | 
313  |  |      * @internal   | 
314  |  |      */  | 
315  |  |     class LinearMatchNode : public ValueNode { | 
316  |  |     public:  | 
317  |  |         LinearMatchNode(int32_t len, Node *nextNode)  | 
318  | 0  |                 : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),  | 
319  | 0  |                   length(len), next(nextNode) {} | 
320  |  |         virtual bool operator==(const Node &other) const;  | 
321  |  |         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);  | 
322  |  |     protected:  | 
323  |  |         int32_t length;  | 
324  |  |         Node *next;  | 
325  |  |     };  | 
326  |  |  | 
327  |  | #ifndef U_HIDE_INTERNAL_API  | 
328  |  |     /**  | 
329  |  |      * @internal   | 
330  |  |      */  | 
331  |  |     class BranchNode : public Node { | 
332  |  |     public:  | 
333  | 0  |         BranchNode(int32_t initialHash) : Node(initialHash) {} | 
334  |  |     protected:  | 
335  |  |         int32_t firstEdgeNumber;  | 
336  |  |     };  | 
337  |  |  | 
338  |  |     /**  | 
339  |  |      * @internal   | 
340  |  |      */  | 
341  |  |     class ListBranchNode : public BranchNode { | 
342  |  |     public:  | 
343  | 0  |         ListBranchNode() : BranchNode(0x444444), length(0) {} | 
344  |  |         virtual bool operator==(const Node &other) const;  | 
345  |  |         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);  | 
346  |  |         virtual void write(StringTrieBuilder &builder);  | 
347  |  |         // Adds a unit with a final value.  | 
348  | 0  |         void add(int32_t c, int32_t value) { | 
349  | 0  |             units[length]=(char16_t)c;  | 
350  | 0  |             equal[length]=NULL;  | 
351  | 0  |             values[length]=value;  | 
352  | 0  |             ++length;  | 
353  | 0  |             hash=(hash*37u+c)*37u+value;  | 
354  | 0  |         }  | 
355  |  |         // Adds a unit which leads to another match node.  | 
356  | 0  |         void add(int32_t c, Node *node) { | 
357  | 0  |             units[length]=(char16_t)c;  | 
358  | 0  |             equal[length]=node;  | 
359  | 0  |             values[length]=0;  | 
360  | 0  |             ++length;  | 
361  | 0  |             hash=(hash*37u+c)*37u+hashCode(node);  | 
362  | 0  |         }  | 
363  |  |     protected:  | 
364  |  |         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".  | 
365  |  |         int32_t length;  | 
366  |  |         int32_t values[kMaxBranchLinearSubNodeLength];  | 
367  |  |         char16_t units[kMaxBranchLinearSubNodeLength];  | 
368  |  |     };  | 
369  |  |  | 
370  |  |     /**  | 
371  |  |      * @internal   | 
372  |  |      */  | 
373  |  |     class SplitBranchNode : public BranchNode { | 
374  |  |     public:  | 
375  |  |         SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)  | 
376  | 0  |                 : BranchNode(((0x555555u*37u+middleUnit)*37u+  | 
377  | 0  |                               hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),  | 
378  | 0  |                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} | 
379  |  |         virtual bool operator==(const Node &other) const;  | 
380  |  |         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);  | 
381  |  |         virtual void write(StringTrieBuilder &builder);  | 
382  |  |     protected:  | 
383  |  |         char16_t unit;  | 
384  |  |         Node *lessThan;  | 
385  |  |         Node *greaterOrEqual;  | 
386  |  |     };  | 
387  |  |  | 
388  |  |     // Branch head node, for writing the actual node lead unit.  | 
389  |  |     /** @internal */  | 
390  |  |     class BranchHeadNode : public ValueNode { | 
391  |  |     public:  | 
392  |  |         BranchHeadNode(int32_t len, Node *subNode)  | 
393  | 0  |                 : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),  | 
394  | 0  |                   length(len), next(subNode) {} | 
395  |  |         virtual bool operator==(const Node &other) const;  | 
396  |  |         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);  | 
397  |  |         virtual void write(StringTrieBuilder &builder);  | 
398  |  |     protected:  | 
399  |  |         int32_t length;  | 
400  |  |         Node *next;  // A branch sub-node.  | 
401  |  |     };  | 
402  |  |  | 
403  |  | #endif  /* U_HIDE_INTERNAL_API */  | 
404  |  |     /// \endcond  | 
405  |  |  | 
406  |  |     /** @internal */  | 
407  |  |     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,  | 
408  |  |                                         Node *nextNode) const = 0;  | 
409  |  |  | 
410  |  |     /** @internal */  | 
411  |  |     virtual int32_t write(int32_t unit) = 0;  | 
412  |  |     /** @internal */  | 
413  |  |     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;  | 
414  |  |     /** @internal */  | 
415  |  |     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;  | 
416  |  |     /** @internal */  | 
417  |  |     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;  | 
418  |  |     /** @internal */  | 
419  |  |     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;  | 
420  |  | };  | 
421  |  |  | 
422  |  | U_NAMESPACE_END  | 
423  |  |  | 
424  |  | #endif /* U_SHOW_CPLUSPLUS_API */  | 
425  |  |  | 
426  |  | #endif  // __STRINGTRIEBUILDER_H__  |