/src/skia/third_party/externals/icu/source/common/dictionarydata.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * Copyright (C) 2014-2016, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ******************************************************************************* |
8 | | * dictionarydata.cpp |
9 | | * |
10 | | * created on: 2012may31 |
11 | | * created by: Markus W. Scherer & Maxime Serrano |
12 | | */ |
13 | | |
14 | | #include "dictionarydata.h" |
15 | | #include "unicode/ucharstrie.h" |
16 | | #include "unicode/bytestrie.h" |
17 | | #include "unicode/udata.h" |
18 | | #include "cmemory.h" |
19 | | |
20 | | #if !UCONFIG_NO_BREAK_ITERATION |
21 | | |
22 | | U_NAMESPACE_BEGIN |
23 | | |
24 | | const int32_t DictionaryData::TRIE_TYPE_BYTES = 0; |
25 | | const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1; |
26 | | const int32_t DictionaryData::TRIE_TYPE_MASK = 7; |
27 | | const int32_t DictionaryData::TRIE_HAS_VALUES = 8; |
28 | | |
29 | | const int32_t DictionaryData::TRANSFORM_NONE = 0; |
30 | | const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; |
31 | | const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; |
32 | | const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; |
33 | | |
34 | 0 | DictionaryMatcher::~DictionaryMatcher() { |
35 | 0 | } |
36 | | |
37 | 0 | UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { |
38 | 0 | udata_close(file); |
39 | 0 | } |
40 | | |
41 | 0 | int32_t UCharsDictionaryMatcher::getType() const { |
42 | 0 | return DictionaryData::TRIE_TYPE_UCHARS; |
43 | 0 | } |
44 | | |
45 | | int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, |
46 | | int32_t *lengths, int32_t *cpLengths, int32_t *values, |
47 | 0 | int32_t *prefix) const { |
48 | |
|
49 | 0 | UCharsTrie uct(characters); |
50 | 0 | int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); |
51 | 0 | int32_t wordCount = 0; |
52 | 0 | int32_t codePointsMatched = 0; |
53 | |
|
54 | 0 | for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { |
55 | 0 | UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); |
56 | 0 | int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; |
57 | 0 | codePointsMatched += 1; |
58 | 0 | if (USTRINGTRIE_HAS_VALUE(result)) { |
59 | 0 | if (wordCount < limit) { |
60 | 0 | if (values != NULL) { |
61 | 0 | values[wordCount] = uct.getValue(); |
62 | 0 | } |
63 | 0 | if (lengths != NULL) { |
64 | 0 | lengths[wordCount] = lengthMatched; |
65 | 0 | } |
66 | 0 | if (cpLengths != NULL) { |
67 | 0 | cpLengths[wordCount] = codePointsMatched; |
68 | 0 | } |
69 | 0 | ++wordCount; |
70 | 0 | } |
71 | 0 | if (result == USTRINGTRIE_FINAL_VALUE) { |
72 | 0 | break; |
73 | 0 | } |
74 | 0 | } |
75 | 0 | else if (result == USTRINGTRIE_NO_MATCH) { |
76 | 0 | break; |
77 | 0 | } |
78 | 0 | if (lengthMatched >= maxLength) { |
79 | 0 | break; |
80 | 0 | } |
81 | 0 | } |
82 | |
|
83 | 0 | if (prefix != NULL) { |
84 | 0 | *prefix = codePointsMatched; |
85 | 0 | } |
86 | 0 | return wordCount; |
87 | 0 | } |
88 | | |
89 | 0 | BytesDictionaryMatcher::~BytesDictionaryMatcher() { |
90 | 0 | udata_close(file); |
91 | 0 | } |
92 | | |
93 | 166k | UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { |
94 | 166k | if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { |
95 | 166k | if (c == 0x200D) { |
96 | 0 | return 0xFF; |
97 | 166k | } else if (c == 0x200C) { |
98 | 0 | return 0xFE; |
99 | 0 | } |
100 | 166k | int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); |
101 | 166k | if (delta < 0 || 0xFD < delta) { |
102 | 1.01k | return U_SENTINEL; |
103 | 1.01k | } |
104 | 165k | return (UChar32)delta; |
105 | 165k | } |
106 | 0 | return c; |
107 | 0 | } |
108 | | |
109 | 0 | int32_t BytesDictionaryMatcher::getType() const { |
110 | 0 | return DictionaryData::TRIE_TYPE_BYTES; |
111 | 0 | } |
112 | | |
113 | | int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, |
114 | | int32_t *lengths, int32_t *cpLengths, int32_t *values, |
115 | 71.0k | int32_t *prefix) const { |
116 | 71.0k | BytesTrie bt(characters); |
117 | 71.0k | int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); |
118 | 71.0k | int32_t wordCount = 0; |
119 | 71.0k | int32_t codePointsMatched = 0; |
120 | | |
121 | 166k | for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { |
122 | 95.8k | UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); |
123 | 166k | int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; |
124 | 166k | codePointsMatched += 1; |
125 | 166k | if (USTRINGTRIE_HAS_VALUE(result)) { |
126 | 54.0k | if (wordCount < limit) { |
127 | 54.0k | if (values != NULL) { |
128 | 0 | values[wordCount] = bt.getValue(); |
129 | 0 | } |
130 | 54.0k | if (lengths != NULL) { |
131 | 54.0k | lengths[wordCount] = lengthMatched; |
132 | 54.0k | } |
133 | 54.0k | if (cpLengths != NULL) { |
134 | 54.0k | cpLengths[wordCount] = codePointsMatched; |
135 | 54.0k | } |
136 | 54.0k | ++wordCount; |
137 | 54.0k | } |
138 | 54.0k | if (result == USTRINGTRIE_FINAL_VALUE) { |
139 | 3.39k | break; |
140 | 3.39k | } |
141 | 112k | } |
142 | 112k | else if (result == USTRINGTRIE_NO_MATCH) { |
143 | 57.2k | break; |
144 | 57.2k | } |
145 | 106k | if (lengthMatched >= maxLength) { |
146 | 10.3k | break; |
147 | 10.3k | } |
148 | 106k | } |
149 | | |
150 | 71.0k | if (prefix != NULL) { |
151 | 71.0k | *prefix = codePointsMatched; |
152 | 71.0k | } |
153 | 71.0k | return wordCount; |
154 | 71.0k | } |
155 | | |
156 | | |
157 | | U_NAMESPACE_END |
158 | | |
159 | | U_NAMESPACE_USE |
160 | | |
161 | | U_CAPI int32_t U_EXPORT2 |
162 | | udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, |
163 | 0 | void *outData, UErrorCode *pErrorCode) { |
164 | 0 | const UDataInfo *pInfo; |
165 | 0 | int32_t headerSize; |
166 | 0 | const uint8_t *inBytes; |
167 | 0 | uint8_t *outBytes; |
168 | 0 | const int32_t *inIndexes; |
169 | 0 | int32_t indexes[DictionaryData::IX_COUNT]; |
170 | 0 | int32_t i, offset, size; |
171 | |
|
172 | 0 | headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); |
173 | 0 | if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; |
174 | 0 | pInfo = (const UDataInfo *)((const char *)inData + 4); |
175 | 0 | if (!(pInfo->dataFormat[0] == 0x44 && |
176 | 0 | pInfo->dataFormat[1] == 0x69 && |
177 | 0 | pInfo->dataFormat[2] == 0x63 && |
178 | 0 | pInfo->dataFormat[3] == 0x74 && |
179 | 0 | pInfo->formatVersion[0] == 1)) { |
180 | 0 | udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", |
181 | 0 | pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); |
182 | 0 | *pErrorCode = U_UNSUPPORTED_ERROR; |
183 | 0 | return 0; |
184 | 0 | } |
185 | | |
186 | 0 | inBytes = (const uint8_t *)inData + headerSize; |
187 | 0 | outBytes = (uint8_t *)outData + headerSize; |
188 | |
|
189 | 0 | inIndexes = (const int32_t *)inBytes; |
190 | 0 | if (length >= 0) { |
191 | 0 | length -= headerSize; |
192 | 0 | if (length < (int32_t)(sizeof(indexes))) { |
193 | 0 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); |
194 | 0 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
195 | 0 | return 0; |
196 | 0 | } |
197 | 0 | } |
198 | | |
199 | 0 | for (i = 0; i < DictionaryData::IX_COUNT; i++) { |
200 | 0 | indexes[i] = udata_readInt32(ds, inIndexes[i]); |
201 | 0 | } |
202 | |
|
203 | 0 | size = indexes[DictionaryData::IX_TOTAL_SIZE]; |
204 | |
|
205 | 0 | if (length >= 0) { |
206 | 0 | if (length < size) { |
207 | 0 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); |
208 | 0 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; |
209 | 0 | return 0; |
210 | 0 | } |
211 | | |
212 | 0 | if (inBytes != outBytes) { |
213 | 0 | uprv_memcpy(outBytes, inBytes, size); |
214 | 0 | } |
215 | |
|
216 | 0 | offset = 0; |
217 | 0 | ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); |
218 | 0 | offset = (int32_t)sizeof(indexes); |
219 | 0 | int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; |
220 | 0 | int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; |
221 | |
|
222 | 0 | if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { |
223 | 0 | ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); |
224 | 0 | } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { |
225 | | // nothing to do |
226 | 0 | } else { |
227 | 0 | udata_printError(ds, "udict_swap(): unknown trie type!\n"); |
228 | 0 | *pErrorCode = U_UNSUPPORTED_ERROR; |
229 | 0 | return 0; |
230 | 0 | } |
231 | | |
232 | | // these next two sections are empty in the current format, |
233 | | // but may be used later. |
234 | 0 | offset = nextOffset; |
235 | 0 | nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; |
236 | 0 | offset = nextOffset; |
237 | 0 | nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; |
238 | 0 | offset = nextOffset; |
239 | 0 | } |
240 | 0 | return headerSize + size; |
241 | 0 | } |
242 | | #endif |