Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/common/unicode/ucharstrie.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
*   Copyright (C) 2010-2012, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
*   file name:  ucharstrie.h
9
*   encoding:   UTF-8
10
*   tab size:   8 (not used)
11
*   indentation:4
12
*
13
*   created on: 2010nov14
14
*   created by: Markus W. Scherer
15
*/
16
17
#ifndef __UCHARSTRIE_H__
18
#define __UCHARSTRIE_H__
19
20
/**
21
 * \file
22
 * \brief C++ API: Trie for mapping Unicode strings (or 16-bit-unit sequences)
23
 *                 to integer values.
24
 */
25
26
#include "unicode/utypes.h"
27
#include "unicode/unistr.h"
28
#include "unicode/uobject.h"
29
#include "unicode/ustringtrie.h"
30
31
U_NAMESPACE_BEGIN
32
33
class Appendable;
34
class UCharsTrieBuilder;
35
class UVector32;
36
37
/**
38
 * Light-weight, non-const reader class for a UCharsTrie.
39
 * Traverses a char16_t-serialized data structure with minimal state,
40
 * for mapping strings (16-bit-unit sequences) to non-negative integer values.
41
 *
42
 * This class owns the serialized trie data only if it was constructed by
43
 * the builder's build() method.
44
 * The public constructor and the copy constructor only alias the data (only copy the pointer).
45
 * There is no assignment operator.
46
 *
47
 * This class is not intended for public subclassing.
48
 * @stable ICU 4.8
49
 */
50
class U_COMMON_API UCharsTrie : public UMemory {
51
public:
52
    /**
53
     * Constructs a UCharsTrie reader instance.
54
     *
55
     * The trieUChars must contain a copy of a char16_t sequence from the UCharsTrieBuilder,
56
     * starting with the first char16_t of that sequence.
57
     * The UCharsTrie object will not read more char16_ts than
58
     * the UCharsTrieBuilder generated in the corresponding build() call.
59
     *
60
     * The array is not copied/cloned and must not be modified while
61
     * the UCharsTrie object is in use.
62
     *
63
     * @param trieUChars The char16_t array that contains the serialized trie.
64
     * @stable ICU 4.8
65
     */
66
    UCharsTrie(ConstChar16Ptr trieUChars)
67
            : ownedArray_(NULL), uchars_(trieUChars),
68
0
              pos_(uchars_), remainingMatchLength_(-1) {}
69
70
    /**
71
     * Destructor.
72
     * @stable ICU 4.8
73
     */
74
    ~UCharsTrie();
75
76
    /**
77
     * Copy constructor, copies the other trie reader object and its state,
78
     * but not the char16_t array which will be shared. (Shallow copy.)
79
     * @param other Another UCharsTrie object.
80
     * @stable ICU 4.8
81
     */
82
    UCharsTrie(const UCharsTrie &other)
83
            : ownedArray_(NULL), uchars_(other.uchars_),
84
0
              pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
85
86
    /**
87
     * Resets this trie to its initial state.
88
     * @return *this
89
     * @stable ICU 4.8
90
     */
91
    UCharsTrie &reset() {
92
        pos_=uchars_;
93
        remainingMatchLength_=-1;
94
        return *this;
95
    }
96
97
    /**
98
     * UCharsTrie state object, for saving a trie's current state
99
     * and resetting the trie back to this state later.
100
     * @stable ICU 4.8
101
     */
102
    class State : public UMemory {
103
    public:
104
        /**
105
         * Constructs an empty State.
106
         * @stable ICU 4.8
107
         */
108
0
        State() { uchars=NULL; }
109
    private:
110
        friend class UCharsTrie;
111
112
        const char16_t *uchars;
113
        const char16_t *pos;
114
        int32_t remainingMatchLength;
115
    };
116
117
    /**
118
     * Saves the state of this trie.
119
     * @param state The State object to hold the trie's state.
120
     * @return *this
121
     * @see resetToState
122
     * @stable ICU 4.8
123
     */
124
0
    const UCharsTrie &saveState(State &state) const {
125
0
        state.uchars=uchars_;
126
0
        state.pos=pos_;
127
0
        state.remainingMatchLength=remainingMatchLength_;
128
0
        return *this;
129
0
    }
130
131
    /**
132
     * Resets this trie to the saved state.
133
     * If the state object contains no state, or the state of a different trie,
134
     * then this trie remains unchanged.
135
     * @param state The State object which holds a saved trie state.
136
     * @return *this
137
     * @see saveState
138
     * @see reset
139
     * @stable ICU 4.8
140
     */
141
0
    UCharsTrie &resetToState(const State &state) {
142
0
        if(uchars_==state.uchars && uchars_!=NULL) {
143
0
            pos_=state.pos;
144
0
            remainingMatchLength_=state.remainingMatchLength;
145
0
        }
146
0
        return *this;
147
0
    }
148
149
    /**
150
     * Determines whether the string so far matches, whether it has a value,
151
     * and whether another input char16_t can continue a matching string.
152
     * @return The match/value Result.
153
     * @stable ICU 4.8
154
     */
155
    UStringTrieResult current() const;
156
157
    /**
158
     * Traverses the trie from the initial state for this input char16_t.
159
     * Equivalent to reset().next(uchar).
160
     * @param uchar Input char value. Values below 0 and above 0xffff will never match.
161
     * @return The match/value Result.
162
     * @stable ICU 4.8
163
     */
164
0
    inline UStringTrieResult first(int32_t uchar) {
165
0
        remainingMatchLength_=-1;
166
0
        return nextImpl(uchars_, uchar);
167
0
    }
168
169
    /**
170
     * Traverses the trie from the initial state for the
171
     * one or two UTF-16 code units for this input code point.
172
     * Equivalent to reset().nextForCodePoint(cp).
173
     * @param cp A Unicode code point 0..0x10ffff.
174
     * @return The match/value Result.
175
     * @stable ICU 4.8
176
     */
177
    UStringTrieResult firstForCodePoint(UChar32 cp);
178
179
    /**
180
     * Traverses the trie from the current state for this input char16_t.
181
     * @param uchar Input char value. Values below 0 and above 0xffff will never match.
182
     * @return The match/value Result.
183
     * @stable ICU 4.8
184
     */
185
    UStringTrieResult next(int32_t uchar);
186
187
    /**
188
     * Traverses the trie from the current state for the
189
     * one or two UTF-16 code units for this input code point.
190
     * @param cp A Unicode code point 0..0x10ffff.
191
     * @return The match/value Result.
192
     * @stable ICU 4.8
193
     */
194
    UStringTrieResult nextForCodePoint(UChar32 cp);
195
196
    /**
197
     * Traverses the trie from the current state for this string.
198
     * Equivalent to
199
     * \code
200
     * Result result=current();
201
     * for(each c in s)
202
     *   if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
203
     *   result=next(c);
204
     * return result;
205
     * \endcode
206
     * @param s A string. Can be NULL if length is 0.
207
     * @param length The length of the string. Can be -1 if NUL-terminated.
208
     * @return The match/value Result.
209
     * @stable ICU 4.8
210
     */
211
    UStringTrieResult next(ConstChar16Ptr s, int32_t length);
212
213
    /**
214
     * Returns a matching string's value if called immediately after
215
     * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
216
     * getValue() can be called multiple times.
217
     *
218
     * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
219
     * @return The value for the string so far.
220
     * @stable ICU 4.8
221
     */
222
0
    inline int32_t getValue() const {
223
0
        const char16_t *pos=pos_;
224
0
        int32_t leadUnit=*pos++;
225
0
        // U_ASSERT(leadUnit>=kMinValueLead);
226
0
        return leadUnit&kValueIsFinal ?
227
0
            readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
228
0
    }
229
230
    /**
231
     * Determines whether all strings reachable from the current state
232
     * map to the same value.
233
     * @param uniqueValue Receives the unique value, if this function returns TRUE.
234
     *                    (output-only)
235
     * @return TRUE if all strings reachable from the current state
236
     *         map to the same value.
237
     * @stable ICU 4.8
238
     */
239
0
    inline UBool hasUniqueValue(int32_t &uniqueValue) const {
240
0
        const char16_t *pos=pos_;
241
0
        // Skip the rest of a pending linear-match node.
242
0
        return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
243
0
    }
244
245
    /**
246
     * Finds each char16_t which continues the string from the current state.
247
     * That is, each char16_t c for which it would be next(c)!=USTRINGTRIE_NO_MATCH now.
248
     * @param out Each next char16_t is appended to this object.
249
     * @return the number of char16_ts which continue the string from here
250
     * @stable ICU 4.8
251
     */
252
    int32_t getNextUChars(Appendable &out) const;
253
254
    /**
255
     * Iterator for all of the (string, value) pairs in a UCharsTrie.
256
     * @stable ICU 4.8
257
     */
258
    class U_COMMON_API Iterator : public UMemory {
259
    public:
260
        /**
261
         * Iterates from the root of a char16_t-serialized UCharsTrie.
262
         * @param trieUChars The trie char16_ts.
263
         * @param maxStringLength If 0, the iterator returns full strings.
264
         *                        Otherwise, the iterator returns strings with this maximum length.
265
         * @param errorCode Standard ICU error code. Its input value must
266
         *                  pass the U_SUCCESS() test, or else the function returns
267
         *                  immediately. Check for U_FAILURE() on output or use with
268
         *                  function chaining. (See User Guide for details.)
269
         * @stable ICU 4.8
270
         */
271
        Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
272
273
        /**
274
         * Iterates from the current state of the specified UCharsTrie.
275
         * @param trie The trie whose state will be copied for iteration.
276
         * @param maxStringLength If 0, the iterator returns full strings.
277
         *                        Otherwise, the iterator returns strings with this maximum length.
278
         * @param errorCode Standard ICU error code. Its input value must
279
         *                  pass the U_SUCCESS() test, or else the function returns
280
         *                  immediately. Check for U_FAILURE() on output or use with
281
         *                  function chaining. (See User Guide for details.)
282
         * @stable ICU 4.8
283
         */
284
        Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
285
286
        /**
287
         * Destructor.
288
         * @stable ICU 4.8
289
         */
290
        ~Iterator();
291
292
        /**
293
         * Resets this iterator to its initial state.
294
         * @return *this
295
         * @stable ICU 4.8
296
         */
297
        Iterator &reset();
298
299
        /**
300
         * @return TRUE if there are more elements.
301
         * @stable ICU 4.8
302
         */
303
        UBool hasNext() const;
304
305
        /**
306
         * Finds the next (string, value) pair if there is one.
307
         *
308
         * If the string is truncated to the maximum length and does not
309
         * have a real value, then the value is set to -1.
310
         * In this case, this "not a real value" is indistinguishable from
311
         * a real value of -1.
312
         * @param errorCode Standard ICU error code. Its input value must
313
         *                  pass the U_SUCCESS() test, or else the function returns
314
         *                  immediately. Check for U_FAILURE() on output or use with
315
         *                  function chaining. (See User Guide for details.)
316
         * @return TRUE if there is another element.
317
         * @stable ICU 4.8
318
         */
319
        UBool next(UErrorCode &errorCode);
320
321
        /**
322
         * @return The string for the last successful next().
323
         * @stable ICU 4.8
324
         */
325
0
        const UnicodeString &getString() const { return str_; }
326
        /**
327
         * @return The value for the last successful next().
328
         * @stable ICU 4.8
329
         */
330
0
        int32_t getValue() const { return value_; }
331
332
    private:
333
0
        UBool truncateAndStop() {
334
0
            pos_=NULL;
335
0
            value_=-1;  // no real value for str
336
0
            return TRUE;
337
0
        }
338
339
        const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode);
340
341
        const char16_t *uchars_;
342
        const char16_t *pos_;
343
        const char16_t *initialPos_;
344
        int32_t remainingMatchLength_;
345
        int32_t initialRemainingMatchLength_;
346
        UBool skipValue_;  // Skip intermediate value which was already delivered.
347
348
        UnicodeString str_;
349
        int32_t maxLength_;
350
        int32_t value_;
351
352
        // The stack stores pairs of integers for backtracking to another
353
        // outbound edge of a branch node.
354
        // The first integer is an offset from uchars_.
355
        // The second integer has the str_.length() from before the node in bits 15..0,
356
        // and the remaining branch length in bits 31..16.
357
        // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
358
        // but the code looks more confusing that way.)
359
        UVector32 *stack_;
360
    };
361
362
private:
363
    friend class UCharsTrieBuilder;
364
365
    /**
366
     * Constructs a UCharsTrie reader instance.
367
     * Unlike the public constructor which just aliases an array,
368
     * this constructor adopts the builder's array.
369
     * This constructor is only called by the builder.
370
     */
371
    UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars)
372
            : ownedArray_(adoptUChars), uchars_(trieUChars),
373
              pos_(uchars_), remainingMatchLength_(-1) {}
374
375
    // No assignment operator.
376
    UCharsTrie &operator=(const UCharsTrie &other);
377
378
0
    inline void stop() {
379
0
        pos_=NULL;
380
0
    }
381
382
    // Reads a compact 32-bit integer.
383
    // pos is already after the leadUnit, and the lead unit has bit 15 reset.
384
0
    static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
385
0
        int32_t value;
386
0
        if(leadUnit<kMinTwoUnitValueLead) {
387
0
            value=leadUnit;
388
0
        } else if(leadUnit<kThreeUnitValueLead) {
389
0
            value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
390
0
        } else {
391
0
            value=(pos[0]<<16)|pos[1];
392
0
        }
393
0
        return value;
394
0
    }
395
0
    static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
396
0
        if(leadUnit>=kMinTwoUnitValueLead) {
397
0
            if(leadUnit<kThreeUnitValueLead) {
398
0
                ++pos;
399
0
            } else {
400
0
                pos+=2;
401
0
            }
402
0
        }
403
0
        return pos;
404
0
    }
405
0
    static inline const char16_t *skipValue(const char16_t *pos) {
406
0
        int32_t leadUnit=*pos++;
407
0
        return skipValue(pos, leadUnit&0x7fff);
408
0
    }
409
410
0
    static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
411
0
        // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
412
0
        int32_t value;
413
0
        if(leadUnit<kMinTwoUnitNodeValueLead) {
414
0
            value=(leadUnit>>6)-1;
415
0
        } else if(leadUnit<kThreeUnitNodeValueLead) {
416
0
            value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
417
0
        } else {
418
0
            value=(pos[0]<<16)|pos[1];
419
0
        }
420
0
        return value;
421
0
    }
422
0
    static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
423
0
        // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
424
0
        if(leadUnit>=kMinTwoUnitNodeValueLead) {
425
0
            if(leadUnit<kThreeUnitNodeValueLead) {
426
0
                ++pos;
427
0
            } else {
428
0
                pos+=2;
429
0
            }
430
0
        }
431
0
        return pos;
432
0
    }
433
434
0
    static inline const char16_t *jumpByDelta(const char16_t *pos) {
435
0
        int32_t delta=*pos++;
436
0
        if(delta>=kMinTwoUnitDeltaLead) {
437
0
            if(delta==kThreeUnitDeltaLead) {
438
0
                delta=(pos[0]<<16)|pos[1];
439
0
                pos+=2;
440
0
            } else {
441
0
                delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
442
0
            }
443
0
        }
444
0
        return pos+delta;
445
0
    }
446
447
0
    static const char16_t *skipDelta(const char16_t *pos) {
448
0
        int32_t delta=*pos++;
449
0
        if(delta>=kMinTwoUnitDeltaLead) {
450
0
            if(delta==kThreeUnitDeltaLead) {
451
0
                pos+=2;
452
0
            } else {
453
0
                ++pos;
454
0
            }
455
0
        }
456
0
        return pos;
457
0
    }
458
459
0
    static inline UStringTrieResult valueResult(int32_t node) {
460
0
        return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node>>15));
461
0
    }
462
463
    // Handles a branch node for both next(uchar) and next(string).
464
    UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
465
466
    // Requires remainingLength_<0.
467
    UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
468
469
    // Helper functions for hasUniqueValue().
470
    // Recursively finds a unique value (or whether there is not a unique one)
471
    // from a branch.
472
    static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
473
                                                  UBool haveUniqueValue, int32_t &uniqueValue);
474
    // Recursively finds a unique value (or whether there is not a unique one)
475
    // starting from a position on a node lead unit.
476
    static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
477
478
    // Helper functions for getNextUChars().
479
    // getNextUChars() when pos is on a branch node.
480
    static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
481
482
    // UCharsTrie data structure
483
    //
484
    // The trie consists of a series of char16_t-serialized nodes for incremental
485
    // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
486
    // The root node is at the beginning of the trie data.
487
    //
488
    // Types of nodes are distinguished by their node lead unit ranges.
489
    // After each node, except a final-value node, another node follows to
490
    // encode match values or continue matching further units.
491
    //
492
    // Node types:
493
    //  - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
494
    //    The value is for the string/char16_t sequence so far.
495
    //  - Match node, optionally with an intermediate value in a different compact format.
496
    //    The value, if present, is for the string/char16_t sequence so far.
497
    //
498
    //  Aside from the value, which uses the node lead unit's high bits:
499
    //
500
    //  - Linear-match node: Matches a number of units.
501
    //  - Branch node: Branches to other nodes according to the current input unit.
502
    //    The node unit is the length of the branch (number of units to select from)
503
    //    minus 1. It is followed by a sub-node:
504
    //    - If the length is at most kMaxBranchLinearSubNodeLength, then
505
    //      there are length-1 (key, value) pairs and then one more comparison unit.
506
    //      If one of the key units matches, then the value is either a final value for
507
    //      the string so far, or a "jump" delta to the next node.
508
    //      If the last unit matches, then matching continues with the next node.
509
    //      (Values have the same encoding as final-value nodes.)
510
    //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
511
    //      there is one unit and one "jump" delta.
512
    //      If the input unit is less than the sub-node unit, then "jump" by delta to
513
    //      the next sub-node which will have a length of length/2.
514
    //      (The delta has its own compact encoding.)
515
    //      Otherwise, skip the "jump" delta to the next sub-node
516
    //      which will have a length of length-length/2.
517
518
    // Match-node lead unit values, after masking off intermediate-value bits:
519
520
    // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
521
    // the length is one more than the next unit.
522
523
    // For a branch sub-node with at most this many entries, we drop down
524
    // to a linear search.
525
    static const int32_t kMaxBranchLinearSubNodeLength=5;
526
527
    // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
528
    static const int32_t kMinLinearMatch=0x30;
529
    static const int32_t kMaxLinearMatchLength=0x10;
530
531
    // Match-node lead unit bits 14..6 for the optional intermediate value.
532
    // If these bits are 0, then there is no intermediate value.
533
    // Otherwise, see the *NodeValue* constants below.
534
    static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x0040
535
    static const int32_t kNodeTypeMask=kMinValueLead-1;  // 0x003f
536
537
    // A final-value node has bit 15 set.
538
    static const int32_t kValueIsFinal=0x8000;
539
540
    // Compact value: After testing and masking off bit 15, use the following thresholds.
541
    static const int32_t kMaxOneUnitValue=0x3fff;
542
543
    static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1;  // 0x4000
544
    static const int32_t kThreeUnitValueLead=0x7fff;
545
546
    static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1;  // 0x3ffeffff
547
548
    // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
549
    static const int32_t kMaxOneUnitNodeValue=0xff;
550
    static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6);  // 0x4040
551
    static const int32_t kThreeUnitNodeValueLead=0x7fc0;
552
553
    static const int32_t kMaxTwoUnitNodeValue=
554
        ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1;  // 0xfdffff
555
556
    // Compact delta integers.
557
    static const int32_t kMaxOneUnitDelta=0xfbff;
558
    static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1;  // 0xfc00
559
    static const int32_t kThreeUnitDeltaLead=0xffff;
560
561
    static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1;  // 0x03feffff
562
563
    char16_t *ownedArray_;
564
565
    // Fixed value referencing the UCharsTrie words.
566
    const char16_t *uchars_;
567
568
    // Iterator variables.
569
570
    // Pointer to next trie unit to read. NULL if no more matches.
571
    const char16_t *pos_;
572
    // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
573
    int32_t remainingMatchLength_;
574
};
575
576
U_NAMESPACE_END
577
578
#endif  // __UCHARSTRIE_H__