Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/i18n/collationdatabuilder.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2012-2014, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationdatabuilder.h
9
*
10
* created on: 2012apr01
11
* created by: Markus W. Scherer
12
*/
13
14
#ifndef __COLLATIONDATABUILDER_H__
15
#define __COLLATIONDATABUILDER_H__
16
17
#include "unicode/utypes.h"
18
19
#if !UCONFIG_NO_COLLATION
20
21
#include "unicode/uniset.h"
22
#include "unicode/unistr.h"
23
#include "unicode/uversion.h"
24
#include "collation.h"
25
#include "collationdata.h"
26
#include "collationsettings.h"
27
#include "normalizer2impl.h"
28
#include "utrie2.h"
29
#include "uvectr32.h"
30
#include "uvectr64.h"
31
#include "uvector.h"
32
33
U_NAMESPACE_BEGIN
34
35
struct ConditionalCE32;
36
37
class CollationFastLatinBuilder;
38
class CopyHelper;
39
class DataBuilderCollationIterator;
40
class UCharsTrieBuilder;
41
42
/**
43
 * Low-level CollationData builder.
44
 * Takes (character, CE) pairs and builds them into runtime data structures.
45
 * Supports characters with context prefixes and contraction suffixes (see add() and addCE32()).
46
 */
47
class U_I18N_API CollationDataBuilder : public UObject {
48
public:
49
    /**
50
     * Collation element modifier. Interface class for a modifier
51
     * that changes a tailoring builder's temporary CEs to final CEs.
52
     * Called for every non-special CE32 and every expansion CE.
53
     * Passed by const reference to copyFrom().
     */
54
    class CEModifier : public UObject {
55
    public:
56
        virtual ~CEModifier();
57
        /** Returns a new CE to replace the non-special input CE32, or else Collation::NO_CE. */
58
        virtual int64_t modifyCE32(uint32_t ce32) const = 0;
59
        /** Returns a new CE to replace the input CE, or else Collation::NO_CE. */
60
        virtual int64_t modifyCE(int64_t ce) const = 0;
61
    };
62
63
    CollationDataBuilder(UErrorCode &errorCode);  // errorCode: ICU in/out error code
64
65
    virtual ~CollationDataBuilder();
66
67
    void initForTailoring(const CollationData *b, UErrorCode &errorCode);  // NOTE(review): presumably b is the base data being tailored -- confirm in the .cpp
68
69
    virtual UBool isCompressibleLeadByte(uint32_t b) const;  // virtual hook; used by isCompressiblePrimary() below
70
71
0
    inline UBool isCompressiblePrimary(uint32_t p) const {
72
0
        return isCompressibleLeadByte(p >> 24);  // tests only the most significant byte of the 32-bit primary
73
0
    }
74
75
    /**
76
     * @return true if this builder has mappings (e.g., add() has been called)
77
     */
78
0
    UBool hasMappings() const { return modified; }
79
80
    /**
81
     * @return true if c has CEs in this builder
82
     */
83
    UBool isAssigned(UChar32 c) const;
84
85
    /**
86
     * @return the three-byte primary if c maps to a single such CE and has no context data,
87
     * otherwise returns 0.
88
     */
89
    uint32_t getLongPrimaryIfSingleCE(UChar32 c) const;
90
91
    /**
92
     * @return the single CE for c.
93
     * Sets an error code if c does not have a single CE.
94
     */
95
    int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
96
97
    void add(const UnicodeString &prefix, const UnicodeString &s,
98
             const int64_t ces[], int32_t cesLength,
99
             UErrorCode &errorCode);  // equivalent to addCE32(prefix, s, encodeCEs(ces, cesLength)); see encodeCEs()
100
101
    /**
102
     * Encodes the ces as either the returned ce32 by itself,
103
     * or by storing an expansion, with the returned ce32 referring to that.
104
     *
105
     * add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
106
     */
107
    virtual uint32_t encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode);
108
    void addCE32(const UnicodeString &prefix, const UnicodeString &s,
109
                 uint32_t ce32, UErrorCode &errorCode);  // takes an already-encoded ce32, e.g. the result of encodeCEs()
110
111
    /**
112
     * Sets three-byte-primary CEs for a range of code points in code point order,
113
     * if it is worth doing; otherwise no change is made.
114
     * None of the code points in the range should have complex mappings so far
115
     * (expansions/contractions/prefixes).
116
     * @param start first code point
117
     * @param end last code point (inclusive)
118
     * @param primary primary weight for 'start'
119
     * @param step per-code point primary-weight increment
120
     * @param errorCode ICU in/out error code
121
     * @return true if an OFFSET_TAG range was used for start..end
122
     */
123
    UBool maybeSetPrimaryRange(UChar32 start, UChar32 end,
124
                               uint32_t primary, int32_t step,
125
                               UErrorCode &errorCode);
126
127
    /**
128
     * Sets three-byte-primary CEs for a range of code points in code point order.
129
     * Sets range values if that is worth doing, or else individual values.
130
     * None of the code points in the range should have complex mappings so far
131
     * (expansions/contractions/prefixes).
132
     * @param start first code point
133
     * @param end last code point (inclusive)
134
     * @param primary primary weight for 'start'
135
     * @param step per-code point primary-weight increment
136
     * @param errorCode ICU in/out error code
137
     * @return the next primary after 'end': start primary incremented by ((end-start)+1)*step
138
     */
139
    uint32_t setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
140
                                          uint32_t primary, int32_t step,
141
                                          UErrorCode &errorCode);
142
143
    /**
144
     * Copies all mappings from the src builder, with modifications.
145
     * This builder must not have been built yet, and should be empty.
146
     */
147
    void copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
148
                  UErrorCode &errorCode);
149
150
    void optimize(const UnicodeSet &set, UErrorCode &errorCode);  // NOTE(review): semantics not visible in this header -- see the .cpp
151
    void suppressContractions(const UnicodeSet &set, UErrorCode &errorCode);  // NOTE(review): semantics not visible in this header -- see the .cpp
152
153
0
    void enableFastLatin() { fastLatinEnabled = true; }  // only sets the flag; NOTE(review): presumably consulted during build() -- confirm
154
    virtual void build(CollationData &data, UErrorCode &errorCode);  // NOTE(review): presumably fills in data from the mappings added so far -- confirm
155
156
    /**
157
     * Looks up CEs for s and appends them to the ces array.
158
     * Does not handle normalization: s should be in FCD form.
159
     *
160
     * Does not write completely ignorable CEs.
161
     * Does not write beyond Collation::MAX_EXPANSION_LENGTH.
162
     *
163
     * @return incremented cesLength
164
     */
165
    int32_t getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength);
166
    int32_t getCEs(const UnicodeString &prefix, const UnicodeString &s,
167
                   int64_t ces[], int32_t cesLength);  // overload taking a context prefix in addition to s
168
169
protected:
170
    friend class CopyHelper;
171
    friend class DataBuilderCollationIterator;
172
173
    uint32_t getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const;
174
175
    int32_t addCE(int64_t ce, UErrorCode &errorCode);
176
    int32_t addCE32(uint32_t ce32, UErrorCode &errorCode);
177
    int32_t addConditionalCE32(const UnicodeString &context, uint32_t ce32, UErrorCode &errorCode);
178
179
0
    inline ConditionalCE32 *getConditionalCE32(int32_t index) const {
180
0
        return static_cast<ConditionalCE32 *>(conditionalCE32s[index]);  // elements are stored type-erased in the UVector; cast back
181
0
    }
182
0
    inline ConditionalCE32 *getConditionalCE32ForCE32(uint32_t ce32) const {
183
0
        return getConditionalCE32(Collation::indexFromCE32(ce32));  // index into conditionalCE32s is taken from the ce32
184
0
    }
185
186
0
    static uint32_t makeBuilderContextCE32(int32_t index) {
187
0
        return Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, index);  // inverse lookup: getConditionalCE32ForCE32()
188
0
    }
189
0
    static inline UBool isBuilderContextCE32(uint32_t ce32) {
190
0
        return Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG);  // same tag that makeBuilderContextCE32() applies
191
0
    }
192
193
    static uint32_t encodeOneCEAsCE32(int64_t ce);
194
    uint32_t encodeOneCE(int64_t ce, UErrorCode &errorCode);
195
    uint32_t encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode);
196
    uint32_t encodeExpansion32(const int32_t newCE32s[], int32_t length, UErrorCode &errorCode);
197
198
    uint32_t copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, UErrorCode &errorCode);
199
    /**
200
     * Copies base contractions to a list of ConditionalCE32.
201
     * Sets cond->next to the index of the first new item
202
     * and returns the index of the last new item.
203
     */
204
    int32_t copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
205
                                         ConditionalCE32 *cond, UErrorCode &errorCode);
206
207
    UBool getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode);
208
    void setDigitTags(UErrorCode &errorCode);
209
    void setLeadSurrogates(UErrorCode &errorCode);
210
211
    void buildMappings(CollationData &data, UErrorCode &errorCode);
212
213
    void clearContexts();
214
    void buildContexts(UErrorCode &errorCode);
215
    uint32_t buildContext(ConditionalCE32 *head, UErrorCode &errorCode);
216
    int32_t addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
217
                           UErrorCode &errorCode);
218
219
    void buildFastLatinTable(CollationData &data, UErrorCode &errorCode);
220
221
    int32_t getCEs(const UnicodeString &s, int32_t start, int64_t ces[], int32_t cesLength);
222
223
0
    static UChar32 jamoCpFromIndex(int32_t i) {
224
        // Maps i to a Jamo code point: L block first, then V, then T. 0 <= i < CollationData::JAMO_CE32S_LENGTH = 19 + 21 + 27
225
0
        if(i < Hangul::JAMO_L_COUNT) { return Hangul::JAMO_L_BASE + i; }
226
0
        i -= Hangul::JAMO_L_COUNT;
227
0
        if(i < Hangul::JAMO_V_COUNT) { return Hangul::JAMO_V_BASE + i; }
228
0
        i -= Hangul::JAMO_V_COUNT;
229
        // i < 27: the remaining indexes are T Jamo; the +1 below starts at the code point after JAMO_T_BASE
230
0
        return Hangul::JAMO_T_BASE + 1 + i;
231
0
    }
232
233
    /** @see Collation::BUILDER_DATA_TAG */
234
    static const uint32_t IS_BUILDER_JAMO_CE32 = 0x100;
235
236
    const Normalizer2Impl &nfcImpl;
237
    const CollationData *base;
238
    const CollationSettings *baseSettings;
239
    UTrie2 *trie;
240
    UVector32 ce32s;
241
    UVector64 ce64s;
242
    UVector conditionalCE32s;  // vector of ConditionalCE32; read via getConditionalCE32()
243
    // Characters that have context (prefixes or contraction suffixes).
244
    UnicodeSet contextChars;
245
    // Serialized UCharsTrie structures for finalized contexts.
246
    UnicodeString contexts;
247
    UnicodeSet unsafeBackwardSet;
248
    UBool modified;  // value returned by hasMappings()
249
250
    UBool fastLatinEnabled;  // set to true by enableFastLatin()
251
    CollationFastLatinBuilder *fastLatinBuilder;
252
253
    DataBuilderCollationIterator *collIter;
254
};
255
256
U_NAMESPACE_END
257
258
#endif  // !UCONFIG_NO_COLLATION
259
#endif  // __COLLATIONDATABUILDER_H__