/src/sentencepiece/third_party/darts_clone/darts.h
Line | Count | Source |
1 | | #ifndef DARTS_H_ |
2 | | #define DARTS_H_ |
3 | | |
4 | | #include <cstdio> |
5 | | #include <exception> |
6 | | #include <new> |
7 | | |
8 | | #define DARTS_VERSION "0.32" |
9 | | |
10 | | // DARTS_THROW() throws a <Darts::Exception> whose message starts with the |
11 | | // file name and the line number. For example, DARTS_THROW("error message") at |
12 | | // line 123 of "darts.h" throws a <Darts::Exception> which has a pointer to |
13 | | // "darts.h:123: exception: error message". The message is available by using |
14 | | // what() as well as that of <std::exception>. |
15 | | #define DARTS_INT_TO_STR(value) #value |
16 | | #define DARTS_LINE_TO_STR(line) DARTS_INT_TO_STR(line) |
17 | | #define DARTS_LINE_STR DARTS_LINE_TO_STR(__LINE__) |
18 | 0 | #define DARTS_THROW(msg) throw Darts::Details::Exception( \ |
19 | 0 | __FILE__ ":" DARTS_LINE_STR ": exception: " msg) |
20 | | |
21 | | namespace Darts { |
22 | | |
23 | | // The following namespace hides the internal types and classes. |
24 | | namespace Details { |
25 | | |
26 | | // This header assumes that <int> and <unsigned int> are 32-bit integer types. |
27 | | // |
28 | | // Darts-clone keeps values associated with keys. The type of the values is |
29 | | // <value_type>. Note that the values must be positive integers because the |
30 | | // most significant bit (MSB) of each value is used to represent whether the |
31 | | // corresponding unit is a leaf or not. Also, the keys are represented by |
32 | | // sequences of <char_type>s. <uchar_type> is the unsigned type of <char_type>. |
33 | | typedef char char_type; |
34 | | typedef unsigned char uchar_type; |
35 | | typedef int value_type; |
36 | | |
37 | | // The main structure of Darts-clone is an array of <DoubleArrayUnit>s, and the |
38 | | // unit type is actually a wrapper of <id_type>. |
39 | | typedef unsigned int id_type; |
40 | | |
41 | | // <progress_func_type> is the type of callback functions for reporting the |
42 | | // progress of building a dictionary. See also build() of <DoubleArray>. |
43 | | // The 1st argument receives the progress value and the 2nd argument receives |
44 | | // the maximum progress value. A usage example is to show the progress |
45 | | // percentage, 100.0 * (the 1st argument) / (the 2nd argument). |
46 | | typedef int (*progress_func_type)(std::size_t, std::size_t); |
47 | | |
48 | | // <DoubleArrayUnit> is the type of double-array units and it is a wrapper of |
49 | | // <id_type> in practice. |
50 | | class DoubleArrayUnit { |
51 | | public: |
52 | 49.8M | DoubleArrayUnit() : unit_() {} |
53 | | |
54 | | // has_leaf() returns whether a leaf unit is immediately derived from the |
55 | | // unit (true) or not (false). |
56 | 712M | bool has_leaf() const { |
57 | 712M | return ((unit_ >> 8) & 1) == 1; |
58 | 712M | } |
59 | | // value() returns the value stored in the unit, and thus value() is |
60 | | // available when and only when the unit is a leaf unit. |
61 | 262M | value_type value() const { |
62 | 262M | return static_cast<value_type>(unit_ & ((1U << 31) - 1)); |
63 | 262M | } |
64 | | |
65 | | // label() returns the label associted with the unit. Note that a leaf unit |
66 | | // always returns an invalid label. For this feature, leaf unit's label() |
67 | | // returns an <id_type> that has the MSB of 1. |
68 | 1.82G | id_type label() const { |
69 | 1.82G | return unit_ & ((1U << 31) | 0xFF); |
70 | 1.82G | } |
71 | | // offset() returns the offset from the unit to its derived units. |
72 | 1.54G | id_type offset() const { |
73 | 1.54G | return (unit_ >> 10) << ((unit_ & (1U << 9)) >> 6); |
74 | 1.54G | } |
75 | | |
76 | | private: |
77 | | id_type unit_; |
78 | | |
79 | | // Copyable. |
80 | | }; |
81 | | |
82 | | // Darts-clone throws an <Exception> for memory allocation failure, invalid |
83 | | // arguments or a too large offset. The last case means that there are too many |
84 | | // keys in the given set of keys. Note that the `msg' of <Exception> must be a |
85 | | // constant or static string because an <Exception> keeps only a pointer to |
86 | | // that string. |
87 | | class Exception : public std::exception { |
88 | | public: |
89 | 0 | explicit Exception(const char *msg = NULL) throw() : msg_(msg) {} |
90 | 0 | Exception(const Exception &rhs) throw() : msg_(rhs.msg_) {} |
91 | 0 | virtual ~Exception() throw() {} |
92 | | |
93 | | // <Exception> overrides what() of <std::exception>. |
94 | 0 | virtual const char *what() const throw() { |
95 | 0 | return (msg_ != NULL) ? msg_ : ""; |
96 | 0 | } |
97 | | |
98 | | private: |
99 | | const char *msg_; |
100 | | |
101 | | // Disallows operator=. |
102 | | Exception &operator=(const Exception &); |
103 | | }; |
104 | | |
105 | | } // namespace Details |
106 | | |
107 | | // <DoubleArrayImpl> is the interface of Darts-clone. Note that other |
108 | | // classes should not be accessed from outside. |
109 | | // |
110 | | // <DoubleArrayImpl> has 4 template arguments but only the 3rd one is used as |
111 | | // the type of values. Note that the given <T> is used only from outside, and |
112 | | // the internal value type is not changed from <Darts::Details::value_type>. |
113 | | // In build(), given values are casted from <T> to <Darts::Details::value_type> |
114 | | // by using static_cast. On the other hand, values are casted from |
115 | | // <Darts::Details::value_type> to <T> in searching dictionaries. |
116 | | template <typename, typename, typename T, typename> |
117 | | class DoubleArrayImpl { |
118 | | public: |
119 | | // Even if this <value_type> is changed, the internal value type is still |
120 | | // <Darts::Details::value_type>. Other types, such as 64-bit integer types |
121 | | // and floating-point number types, should not be used. |
122 | | typedef T value_type; |
123 | | // A key is reprenseted by a sequence of <key_type>s. For example, |
124 | | // exactMatchSearch() takes a <const key_type *>. |
125 | | typedef Details::char_type key_type; |
126 | | // In searching dictionaries, the values associated with the matched keys are |
127 | | // stored into or returned as <result_type>s. |
128 | | typedef value_type result_type; |
129 | | |
130 | | // <result_pair_type> enables applications to get the lengths of the matched |
131 | | // keys in addition to the values. |
132 | | struct result_pair_type { |
133 | | value_type value; |
134 | | std::size_t length; |
135 | | }; |
136 | | |
137 | | // The constructor initializes member variables with 0 and NULLs. |
138 | 86.4k | DoubleArrayImpl() : size_(0), array_(NULL), buf_(NULL) {} |
139 | | // The destructor frees memory allocated for units and then initializes |
140 | | // member variables with 0 and NULLs. |
141 | 86.4k | virtual ~DoubleArrayImpl() { |
142 | 86.4k | clear(); |
143 | 86.4k | } |
144 | | |
145 | | // <DoubleArrayImpl> has 2 kinds of set_result()s. The 1st set_result() is to |
146 | | // set a value to a <value_type>. The 2nd set_result() is to set a value and |
147 | | // a length to a <result_pair_type>. By using set_result()s, search methods |
148 | | // can return the 2 kinds of results in the same way. |
149 | | // Why the set_result()s are non-static? It is for compatibility. |
150 | | // |
151 | | // The 1st set_result() takes a length as the 3rd argument but it is not |
152 | | // used. If a compiler does a good job, codes for getting the length may be |
153 | | // removed. |
154 | 5.16M | void set_result(value_type *result, value_type value, std::size_t) const { |
155 | 5.16M | *result = value; |
156 | 5.16M | } |
157 | | // The 2nd set_result() uses both `value' and `length'. |
158 | | void set_result(result_pair_type *result, |
159 | 253M | value_type value, std::size_t length) const { |
160 | 253M | result->value = value; |
161 | 253M | result->length = length; |
162 | 253M | } |
163 | | |
164 | | // set_array() calls clear() in order to free memory allocated to the old |
165 | | // array and then sets a new array. This function is useful to set a memory- |
166 | | // mapped array. Note that the array set by set_array() is not freed in |
167 | | // clear() and the destructor of <DoubleArrayImpl>. |
168 | | // set_array() can also set the size of the new array but the size is not |
169 | | // used in search methods. So it works well even if the 2nd argument is 0 or |
170 | | // omitted. Remember that size() and total_size() returns 0 in such a case. |
171 | 20.2k | void set_array(const void *ptr, std::size_t size = 0) { |
172 | 20.2k | clear(); |
173 | 20.2k | array_ = static_cast<const unit_type *>(ptr); |
174 | 20.2k | size_ = size; |
175 | 20.2k | } |
176 | | // array() returns a pointer to the array of units. |
177 | 0 | const void *array() const { |
178 | 0 | return array_; |
179 | 0 | } |
180 | | |
181 | | // clear() frees memory allocated to units and then initializes member |
182 | | // variables with 0 and NULLs. Note that clear() does not free memory if the |
183 | | // array of units was set by set_array(). In such a case, `array_' is not |
184 | | // NULL and `buf_' is NULL. |
185 | 172k | void clear() { |
186 | 172k | size_ = 0; |
187 | 172k | array_ = NULL; |
188 | 172k | if (buf_ != NULL) { |
189 | 66.2k | delete[] buf_; |
190 | 66.2k | buf_ = NULL; |
191 | 66.2k | } |
192 | 172k | } |
193 | | |
194 | | // unit_size() returns the size of each unit. The size must be 4 bytes. |
195 | 20.2k | std::size_t unit_size() const { |
196 | 20.2k | return sizeof(unit_type); |
197 | 20.2k | } |
198 | | // size() returns the number of units. It can be 0 if set_array() is used. |
199 | 0 | std::size_t size() const { |
200 | 0 | return size_; |
201 | 0 | } |
202 | | // total_size() returns the number of bytes allocated to the array of units. |
203 | | // It can be 0 if set_array() is used. |
204 | | std::size_t total_size() const { |
205 | | return unit_size() * size(); |
206 | | } |
207 | | // nonzero_size() exists for compatibility. It always returns the number of |
208 | | // units because it takes long time to count the number of non-zero units. |
209 | | std::size_t nonzero_size() const { |
210 | | return size(); |
211 | | } |
212 | | |
213 | | // build() constructs a dictionary from given key-value pairs. If `lengths' |
214 | | // is NULL, `keys' is handled as an array of zero-terminated strings. If |
215 | | // `values' is NULL, the index in `keys' is associated with each key, i.e. |
216 | | // the ith key has (i - 1) as its value. |
217 | | // Note that the key-value pairs must be arranged in key order and the values |
218 | | // must not be negative. Also, if there are duplicate keys, only the first |
219 | | // pair will be stored in the resultant dictionary. |
220 | | // `progress_func' is a pointer to a callback function. If it is not NULL, |
221 | | // it will be called in build() so that the caller can check the progress of |
222 | | // dictionary construction. For details, please see the definition of |
223 | | // <Darts::Details::progress_func_type>. |
224 | | // The return value of build() is 0, and it indicates the success of the |
225 | | // operation. Otherwise, build() throws a <Darts::Exception>, which is a |
226 | | // derived class of <std::exception>. |
227 | | // build() uses another construction algorithm if `values' is not NULL. In |
228 | | // this case, Darts-clone uses a Directed Acyclic Word Graph (DAWG) instead |
229 | | // of a trie because a DAWG is likely to be more compact than a trie. |
230 | | int build(std::size_t num_keys, const key_type * const *keys, |
231 | | const std::size_t *lengths = NULL, const value_type *values = NULL, |
232 | | Details::progress_func_type progress_func = NULL); |
233 | | |
234 | | // open() reads an array of units from the specified file. And if it goes |
235 | | // well, the old array will be freed and replaced with the new array read |
236 | | // from the file. `offset' specifies the number of bytes to be skipped before |
237 | | // reading an array. `size' specifies the number of bytes to be read from the |
238 | | // file. If the `size' is 0, the whole file will be read. |
239 | | // open() returns 0 iff the operation succeeds. Otherwise, it returns a |
240 | | // non-zero value or throws a <Darts::Exception>. The exception is thrown |
241 | | // when and only when a memory allocation fails. |
242 | | int open(const char *file_name, const char *mode = "rb", |
243 | | std::size_t offset = 0, std::size_t size = 0); |
244 | | // save() writes the array of units into the specified file. `offset' |
245 | | // specifies the number of bytes to be skipped before writing the array. |
246 | | // open() returns 0 iff the operation succeeds. Otherwise, it returns a |
247 | | // non-zero value. |
248 | | int save(const char *file_name, const char *mode = "wb", |
249 | | std::size_t offset = 0) const; |
250 | | |
251 | | // The 1st exactMatchSearch() tests whether the given key exists or not, and |
252 | | // if it exists, its value and length are set to `result'. Otherwise, the |
253 | | // value and the length of `result' are set to -1 and 0 respectively. |
254 | | // Note that if `length' is 0, `key' is handled as a zero-terminated string. |
255 | | // `node_pos' specifies the start position of matching. This argument enables |
256 | | // the combination of exactMatchSearch() and traverse(). For example, if you |
257 | | // want to test "xyzA", "xyzBC", and "xyzDE", you can use traverse() to get |
258 | | // the node position corresponding to "xyz" and then you can use |
259 | | // exactMatchSearch() to test "A", "BC", and "DE" from that position. |
260 | | // Note that the length of `result' indicates the length from the `node_pos'. |
261 | | // In the above example, the lengths are { 1, 2, 2 }, not { 4, 5, 5 }. |
262 | | template <class U> |
263 | | void exactMatchSearch(const key_type *key, U &result, |
264 | 2.74M | std::size_t length = 0, std::size_t node_pos = 0) const { |
265 | 2.74M | result = exactMatchSearch<U>(key, length, node_pos); |
266 | 2.74M | } |
267 | | // The 2nd exactMatchSearch() returns a result instead of updating the 2nd |
268 | | // argument. So, the following exactMatchSearch() has only 3 arguments. |
269 | | template <class U> |
270 | | inline U exactMatchSearch(const key_type *key, std::size_t length = 0, |
271 | | std::size_t node_pos = 0) const; |
272 | | |
273 | | // commonPrefixSearch() searches for keys which match a prefix of the given |
274 | | // string. If `length' is 0, `key' is handled as a zero-terminated string. |
275 | | // The values and the lengths of at most `max_num_results' matched keys are |
276 | | // stored in `results'. commonPrefixSearch() returns the number of matched |
277 | | // keys. Note that the return value can be larger than `max_num_results' if |
278 | | // there are more than `max_num_results' matches. If you want to get all the |
279 | | // results, allocate more spaces and call commonPrefixSearch() again. |
280 | | // `node_pos' works as well as in exactMatchSearch(). |
281 | | template <class U> |
282 | | inline std::size_t commonPrefixSearch(const key_type *key, U *results, |
283 | | std::size_t max_num_results, std::size_t length = 0, |
284 | | std::size_t node_pos = 0) const; |
285 | | |
286 | | // In Darts-clone, a dictionary is a deterministic finite-state automaton |
287 | | // (DFA) and traverse() tests transitions on the DFA. The initial state is |
288 | | // `node_pos' and traverse() chooses transitions labeled key[key_pos], |
289 | | // key[key_pos + 1], ... in order. If there is not a transition labeled |
290 | | // key[key_pos + i], traverse() terminates the transitions at that state and |
291 | | // returns -2. Otherwise, traverse() ends without a termination and returns |
292 | | // -1 or a nonnegative value, -1 indicates that the final state was not an |
293 | | // accept state. When a nonnegative value is returned, it is the value |
294 | | // associated with the final accept state. That is, traverse() returns the |
295 | | // value associated with the given key if it exists. Note that traverse() |
296 | | // updates `node_pos' and `key_pos' after each transition. |
297 | | inline value_type traverse(const key_type *key, std::size_t &node_pos, |
298 | | std::size_t &key_pos, std::size_t length = 0) const; |
299 | | |
300 | | bool validate() const; |
301 | | |
302 | | private: |
303 | | typedef Details::uchar_type uchar_type; |
304 | | typedef Details::id_type id_type; |
305 | | typedef Details::DoubleArrayUnit unit_type; |
306 | | |
307 | | std::size_t size_; |
308 | | const unit_type *array_; |
309 | | unit_type *buf_; |
310 | | |
311 | | // Disallows copy and assignment. |
312 | | DoubleArrayImpl(const DoubleArrayImpl &); |
313 | | DoubleArrayImpl &operator=(const DoubleArrayImpl &); |
314 | | }; |
315 | | |
316 | | // <DoubleArray> is the typical instance of <DoubleArrayImpl>. It uses <int> |
317 | | // as the type of values and it is suitable for most cases. |
318 | | typedef DoubleArrayImpl<void, void, int, void> DoubleArray; |
319 | | |
320 | | // The interface section ends here. For using Darts-clone, there is no need |
321 | | // to read the remaining section, which gives the implementation of |
322 | | // Darts-clone. |
323 | | |
324 | | // |
325 | | // Member functions of DoubleArrayImpl (except build()). |
326 | | // |
327 | | |
328 | | template <typename A, typename B, typename T, typename C> |
329 | | int DoubleArrayImpl<A, B, T, C>::open(const char *file_name, |
330 | | const char *mode, std::size_t offset, std::size_t size) { |
331 | | #ifdef _MSC_VER |
332 | | std::FILE *file; |
333 | | if (::fopen_s(&file, file_name, mode) != 0) { |
334 | | return -1; |
335 | | } |
336 | | #else |
337 | | std::FILE *file = std::fopen(file_name, mode); |
338 | | if (file == NULL) { |
339 | | return -1; |
340 | | } |
341 | | #endif |
342 | | |
343 | | if (size == 0) { |
344 | | if (std::fseek(file, 0, SEEK_END) != 0) { |
345 | | std::fclose(file); |
346 | | return -1; |
347 | | } |
348 | | size = std::ftell(file) - offset; |
349 | | } |
350 | | |
351 | | size /= unit_size(); |
352 | | if (size < 256 || (size & 0xFF) != 0) { |
353 | | std::fclose(file); |
354 | | return -1; |
355 | | } |
356 | | |
357 | | if (std::fseek(file, offset, SEEK_SET) != 0) { |
358 | | std::fclose(file); |
359 | | return -1; |
360 | | } |
361 | | |
362 | | unit_type units[256]; |
363 | | if (std::fread(units, unit_size(), 256, file) != 256) { |
364 | | std::fclose(file); |
365 | | return -1; |
366 | | } |
367 | | |
368 | | if (units[0].label() != '\0' || units[0].has_leaf() || |
369 | | units[0].offset() == 0 || units[0].offset() >= 512) { |
370 | | std::fclose(file); |
371 | | return -1; |
372 | | } |
373 | | for (id_type i = 1; i < 256; ++i) { |
374 | | if (units[i].label() <= 0xFF && units[i].offset() >= size) { |
375 | | std::fclose(file); |
376 | | return -1; |
377 | | } |
378 | | } |
379 | | |
380 | | unit_type *buf; |
381 | | try { |
382 | | buf = new unit_type[size]; |
383 | | for (id_type i = 0; i < 256; ++i) { |
384 | | buf[i] = units[i]; |
385 | | } |
386 | | } catch (const std::bad_alloc &) { |
387 | | std::fclose(file); |
388 | | DARTS_THROW("failed to open double-array: std::bad_alloc"); |
389 | | } |
390 | | |
391 | | if (size > 256) { |
392 | | if (std::fread(buf + 256, unit_size(), size - 256, file) != size - 256) { |
393 | | std::fclose(file); |
394 | | delete[] buf; |
395 | | return -1; |
396 | | } |
397 | | } |
398 | | std::fclose(file); |
399 | | |
400 | | clear(); |
401 | | |
402 | | size_ = size; |
403 | | array_ = buf; |
404 | | buf_ = buf; |
405 | | return 0; |
406 | | } |
407 | | |
408 | | template <typename A, typename B, typename T, typename C> |
409 | | int DoubleArrayImpl<A, B, T, C>::save(const char *file_name, |
410 | | const char *mode, std::size_t) const { |
411 | | if (size() == 0) { |
412 | | return -1; |
413 | | } |
414 | | |
415 | | #ifdef _MSC_VER |
416 | | std::FILE *file; |
417 | | if (::fopen_s(&file, file_name, mode) != 0) { |
418 | | return -1; |
419 | | } |
420 | | #else |
421 | | std::FILE *file = std::fopen(file_name, mode); |
422 | | if (file == NULL) { |
423 | | return -1; |
424 | | } |
425 | | #endif |
426 | | |
427 | | if (std::fwrite(array_, unit_size(), size(), file) != size()) { |
428 | | std::fclose(file); |
429 | | return -1; |
430 | | } |
431 | | std::fclose(file); |
432 | | return 0; |
433 | | } |
434 | | |
435 | | template <typename A, typename B, typename T, typename C> |
436 | 20.2k | bool DoubleArrayImpl<A, B, T, C>::validate() const { |
437 | 20.2k | if (size_ == 0) { |
438 | 0 | return true; |
439 | 0 | } |
440 | 907M | for (std::size_t i = 0; i < size_; ++i) { |
441 | | // Leaf/value units store data in the offset field, not child pointers. |
442 | | // label() has MSB set (> 0xFF) for these units. Skip them, matching |
443 | | // the same pattern used in open(). |
444 | 907M | if (array_[i].label() > 0xFF) { |
445 | 302M | continue; |
446 | 302M | } |
447 | 605M | const id_type offset = array_[i].offset(); |
448 | 605M | if (offset == 0) { |
449 | 3.88M | continue; |
450 | 3.88M | } |
451 | 601M | const std::size_t base = i ^ offset; |
452 | 601M | if ((base | 0xFF) >= size_) { |
453 | 0 | return false; |
454 | 0 | } |
455 | 601M | } |
456 | 20.2k | return true; |
457 | 20.2k | } |
458 | | |
459 | | template <typename A, typename B, typename T, typename C> |
460 | | template <typename U> |
461 | | inline U DoubleArrayImpl<A, B, T, C>::exactMatchSearch(const key_type *key, |
462 | 2.74M | std::size_t length, std::size_t node_pos) const { |
463 | 2.74M | U result; |
464 | 2.74M | set_result(&result, static_cast<value_type>(-1), 0); |
465 | | |
466 | 2.74M | unit_type unit = array_[node_pos]; |
467 | 2.74M | if (length != 0) { |
468 | 7.08M | for (std::size_t i = 0; i < length; ++i) { |
469 | 4.66M | node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[i]); |
470 | 4.66M | unit = array_[node_pos]; |
471 | 4.66M | if (unit.label() != static_cast<uchar_type>(key[i])) { |
472 | 327k | return result; |
473 | 327k | } |
474 | 4.66M | } |
475 | 2.74M | } else { |
476 | 375 | for ( ; key[length] != '\0'; ++length) { |
477 | 2 | node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[length]); |
478 | 2 | unit = array_[node_pos]; |
479 | 2 | if (unit.label() != static_cast<uchar_type>(key[length])) { |
480 | 2 | return result; |
481 | 2 | } |
482 | 2 | } |
483 | 375 | } |
484 | | |
485 | 2.41M | if (!unit.has_leaf()) { |
486 | 871 | return result; |
487 | 871 | } |
488 | 2.41M | unit = array_[node_pos ^ unit.offset()]; |
489 | 2.41M | set_result(&result, static_cast<value_type>(unit.value()), length); |
490 | 2.41M | return result; |
491 | 2.41M | } |
492 | | |
493 | | template <typename A, typename B, typename T, typename C> |
494 | | template <typename U> |
495 | | inline std::size_t DoubleArrayImpl<A, B, T, C>::commonPrefixSearch( |
496 | | const key_type *key, U *results, std::size_t max_num_results, |
497 | 170M | std::size_t length, std::size_t node_pos) const { |
498 | 170M | std::size_t num_results = 0; |
499 | | |
500 | 170M | unit_type unit = array_[node_pos]; |
501 | 170M | node_pos ^= unit.offset(); |
502 | 170M | if (length != 0) { |
503 | 868M | for (std::size_t i = 0; i < length; ++i) { |
504 | 847M | node_pos ^= static_cast<uchar_type>(key[i]); |
505 | 847M | unit = array_[node_pos]; |
506 | 847M | if (unit.label() != static_cast<uchar_type>(key[i])) { |
507 | 149M | return num_results; |
508 | 149M | } |
509 | | |
510 | 698M | node_pos ^= unit.offset(); |
511 | 698M | if (unit.has_leaf()) { |
512 | 253M | if (num_results < max_num_results) { |
513 | 253M | set_result(&results[num_results], static_cast<value_type>( |
514 | 253M | array_[node_pos].value()), i + 1); |
515 | 253M | } |
516 | 253M | ++num_results; |
517 | 253M | } |
518 | 698M | } |
519 | 170M | } else { |
520 | 0 | for ( ; key[length] != '\0'; ++length) { |
521 | 0 | node_pos ^= static_cast<uchar_type>(key[length]); |
522 | 0 | unit = array_[node_pos]; |
523 | 0 | if (unit.label() != static_cast<uchar_type>(key[length])) { |
524 | 0 | return num_results; |
525 | 0 | } |
526 | | |
527 | 0 | node_pos ^= unit.offset(); |
528 | 0 | if (unit.has_leaf()) { |
529 | 0 | if (num_results < max_num_results) { |
530 | 0 | set_result(&results[num_results], static_cast<value_type>( |
531 | 0 | array_[node_pos].value()), length + 1); |
532 | 0 | } |
533 | 0 | ++num_results; |
534 | 0 | } |
535 | 0 | } |
536 | 0 | } |
537 | | |
538 | 21.6M | return num_results; |
539 | 170M | } |
540 | | |
541 | | template <typename A, typename B, typename T, typename C> |
542 | | inline typename DoubleArrayImpl<A, B, T, C>::value_type |
543 | | DoubleArrayImpl<A, B, T, C>::traverse(const key_type *key, |
544 | 61.2M | std::size_t &node_pos, std::size_t &key_pos, std::size_t length) const { |
545 | 61.2M | id_type id = static_cast<id_type>(node_pos); |
546 | 61.2M | unit_type unit = array_[id]; |
547 | | |
548 | 61.2M | if (length != 0) { |
549 | 73.1M | for ( ; key_pos < length; ++key_pos) { |
550 | 61.2M | id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]); |
551 | 61.2M | unit = array_[id]; |
552 | 61.2M | if (unit.label() != static_cast<uchar_type>(key[key_pos])) { |
553 | 49.3M | return static_cast<value_type>(-2); |
554 | 49.3M | } |
555 | 11.9M | node_pos = id; |
556 | 11.9M | } |
557 | 61.2M | } else { |
558 | 0 | for ( ; key[key_pos] != '\0'; ++key_pos) { |
559 | 0 | id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]); |
560 | 0 | unit = array_[id]; |
561 | 0 | if (unit.label() != static_cast<uchar_type>(key[key_pos])) { |
562 | 0 | return static_cast<value_type>(-2); |
563 | 0 | } |
564 | 0 | node_pos = id; |
565 | 0 | } |
566 | 0 | } |
567 | | |
568 | 11.9M | if (!unit.has_leaf()) { |
569 | 5.79M | return static_cast<value_type>(-1); |
570 | 5.79M | } |
571 | 6.12M | unit = array_[id ^ unit.offset()]; |
572 | 6.12M | return static_cast<value_type>(unit.value()); |
573 | 11.9M | } |
574 | | |
575 | | namespace Details { |
576 | | |
577 | | // |
578 | | // Memory management of array. |
579 | | // |
580 | | |
581 | | template <typename T> |
582 | | class AutoArray { |
583 | | public: |
584 | 6.79M | explicit AutoArray(T *array = NULL) : array_(array) {}Darts::Details::AutoArray<char>::AutoArray(char*) Line | Count | Source | 584 | 6.43M | explicit AutoArray(T *array = NULL) : array_(array) {} |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::AutoArray(Darts::Details::DoubleArrayBuilderExtraUnit*) Line | Count | Source | 584 | 132k | explicit AutoArray(T *array = NULL) : array_(array) {} |
Darts::Details::AutoArray<unsigned int>::AutoArray(unsigned int*) Line | Count | Source | 584 | 233k | explicit AutoArray(T *array = NULL) : array_(array) {} |
|
585 | 6.79M | ~AutoArray() { |
586 | 6.79M | clear(); |
587 | 6.79M | } Darts::Details::AutoArray<char>::~AutoArray() Line | Count | Source | 585 | 6.43M | ~AutoArray() { | 586 | 6.43M | clear(); | 587 | 6.43M | } |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::~AutoArray() Line | Count | Source | 585 | 132k | ~AutoArray() { | 586 | 132k | clear(); | 587 | 132k | } |
Darts::Details::AutoArray<unsigned int>::~AutoArray() Line | Count | Source | 585 | 233k | ~AutoArray() { | 586 | 233k | clear(); | 587 | 233k | } |
|
588 | | |
589 | 1.56G | const T &operator[](std::size_t id) const { |
590 | 1.56G | return array_[id]; |
591 | 1.56G | } Darts::Details::AutoArray<char>::operator[](unsigned long) const Line | Count | Source | 589 | 1.07G | const T &operator[](std::size_t id) const { | 590 | 1.07G | return array_[id]; | 591 | 1.07G | } |
Unexecuted instantiation: Darts::Details::AutoArray<unsigned int>::operator[](unsigned long) const Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::operator[](unsigned long) const Line | Count | Source | 589 | 489M | const T &operator[](std::size_t id) const { | 590 | 489M | return array_[id]; | 591 | 489M | } |
|
592 | 2.17G | T &operator[](std::size_t id) { |
593 | 2.17G | return array_[id]; |
594 | 2.17G | } Darts::Details::AutoArray<char>::operator[](unsigned long) Line | Count | Source | 592 | 1.62G | T &operator[](std::size_t id) { | 593 | 1.62G | return array_[id]; | 594 | 1.62G | } |
Darts::Details::AutoArray<unsigned int>::operator[](unsigned long) Line | Count | Source | 592 | 1.12M | T &operator[](std::size_t id) { | 593 | 1.12M | return array_[id]; | 594 | 1.12M | } |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::operator[](unsigned long) Line | Count | Source | 592 | 554M | T &operator[](std::size_t id) { | 593 | 554M | return array_[id]; | 594 | 554M | } |
|
595 | | |
596 | | bool empty() const { |
597 | | return array_ == NULL; |
598 | | } |
599 | | |
600 | 9.12M | void clear() { |
601 | 9.12M | if (array_ != NULL) { |
602 | 3.13M | delete[] array_; |
603 | 3.13M | array_ = NULL; |
604 | 3.13M | } |
605 | 9.12M | } Darts::Details::AutoArray<char>::clear() Line | Count | Source | 600 | 8.33M | void clear() { | 601 | 8.33M | if (array_ != NULL) { | 602 | 2.95M | delete[] array_; | 603 | | array_ = NULL; | 604 | 2.95M | } | 605 | 8.33M | } |
Darts::Details::AutoArray<unsigned int>::clear() Line | Count | Source | 600 | 522k | void clear() { | 601 | 522k | if (array_ != NULL) { | 602 | 111k | delete[] array_; | 603 | | array_ = NULL; | 604 | 111k | } | 605 | 522k | } |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::clear() Line | Count | Source | 600 | 264k | void clear() { | 601 | 264k | if (array_ != NULL) { | 602 | 66.2k | delete[] array_; | 603 | | array_ = NULL; | 604 | 66.2k | } | 605 | 264k | } |
|
606 | 6.08M | void swap(AutoArray *array) { |
607 | 6.08M | T *temp = array_; |
608 | 6.08M | array_ = array->array_; |
609 | 6.08M | array->array_ = temp; |
610 | 6.08M | } Darts::Details::AutoArray<char>::swap(Darts::Details::AutoArray<char>*) Line | Count | Source | 606 | 5.90M | void swap(AutoArray *array) { | 607 | 5.90M | T *temp = array_; | 608 | 5.90M | array_ = array->array_; | 609 | 5.90M | array->array_ = temp; | 610 | 5.90M | } |
Darts::Details::AutoArray<unsigned int>::swap(Darts::Details::AutoArray<unsigned int>*) Line | Count | Source | 606 | 111k | void swap(AutoArray *array) { | 607 | 111k | T *temp = array_; | 608 | 111k | array_ = array->array_; | 609 | 111k | array->array_ = temp; | 610 | 111k | } |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::swap(Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>*) Line | Count | Source | 606 | 66.2k | void swap(AutoArray *array) { | 607 | 66.2k | T *temp = array_; | 608 | 66.2k | array_ = array->array_; | 609 | 66.2k | array->array_ = temp; | 610 | 66.2k | } |
|
611 | 3.13M | void reset(T *array = NULL) { |
612 | 3.13M | AutoArray(array).swap(this); |
613 | 3.13M | } Darts::Details::AutoArray<char>::reset(char*) Line | Count | Source | 611 | 2.95M | void reset(T *array = NULL) { | 612 | 2.95M | AutoArray(array).swap(this); | 613 | 2.95M | } |
Darts::Details::AutoArray<unsigned int>::reset(unsigned int*) Line | Count | Source | 611 | 111k | void reset(T *array = NULL) { | 612 | 111k | AutoArray(array).swap(this); | 613 | 111k | } |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::reset(Darts::Details::DoubleArrayBuilderExtraUnit*) Line | Count | Source | 611 | 66.2k | void reset(T *array = NULL) { | 612 | 66.2k | AutoArray(array).swap(this); | 613 | 66.2k | } |
|
614 | | |
615 | | private: |
616 | | T *array_; |
617 | | |
618 | | // Disallows copy and assignment. |
619 | | AutoArray(const AutoArray &); |
620 | | AutoArray &operator=(const AutoArray &); |
621 | | }; |
622 | | |
623 | | // |
624 | | // Memory management of resizable array. |
625 | | // |
626 | | |
627 | | template <typename T> |
628 | | class AutoPool { |
629 | | public: |
630 | 522k | AutoPool() : buf_(), size_(0), capacity_(0) {}Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::AutoPool() Line | Count | Source | 630 | 66.2k | AutoPool() : buf_(), size_(0), capacity_(0) {} |
Darts::Details::AutoPool<unsigned char>::AutoPool() Line | Count | Source | 630 | 121k | AutoPool() : buf_(), size_(0), capacity_(0) {} |
Darts::Details::AutoPool<Darts::Details::DawgNode>::AutoPool() Line | Count | Source | 630 | 55.6k | AutoPool() : buf_(), size_(0), capacity_(0) {} |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::AutoPool() Line | Count | Source | 630 | 55.6k | AutoPool() : buf_(), size_(0), capacity_(0) {} |
Darts::Details::AutoPool<unsigned int>::AutoPool() Line | Count | Source | 630 | 222k | AutoPool() : buf_(), size_(0), capacity_(0) {} |
|
631 | 522k | ~AutoPool() { clear(); }Darts::Details::AutoPool<unsigned char>::~AutoPool() Line | Count | Source | 631 | 121k | ~AutoPool() { clear(); } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::~AutoPool() Line | Count | Source | 631 | 66.2k | ~AutoPool() { clear(); } |
Darts::Details::AutoPool<unsigned int>::~AutoPool() Line | Count | Source | 631 | 222k | ~AutoPool() { clear(); } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::~AutoPool() Line | Count | Source | 631 | 55.6k | ~AutoPool() { clear(); } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::~AutoPool() Line | Count | Source | 631 | 55.6k | ~AutoPool() { clear(); } |
|
632 | | |
633 | 1.07G | const T &operator[](std::size_t id) const { |
634 | 1.07G | return *(reinterpret_cast<const T *>(&buf_[0]) + id); |
635 | 1.07G | } Darts::Details::AutoPool<unsigned int>::operator[](unsigned long) const Line | Count | Source | 633 | 139M | const T &operator[](std::size_t id) const { | 634 | 139M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); | 635 | 139M | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::operator[](unsigned long) const Line | Count | Source | 633 | 304M | const T &operator[](std::size_t id) const { | 634 | 304M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); | 635 | 304M | } |
Darts::Details::AutoPool<unsigned char>::operator[](unsigned long) const Line | Count | Source | 633 | 413M | const T &operator[](std::size_t id) const { | 634 | 413M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); | 635 | 413M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::operator[](unsigned long) const Line | Count | Source | 633 | 170M | const T &operator[](std::size_t id) const { | 634 | 170M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); | 635 | 170M | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::operator[](unsigned long) const Line | Count | Source | 633 | 49.8M | const T &operator[](std::size_t id) const { | 634 | 49.8M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); | 635 | 49.8M | } |
|
636 | 1.61G | T &operator[](std::size_t id) { |
637 | 1.61G | return *(reinterpret_cast<T *>(&buf_[0]) + id); |
638 | 1.61G | } Darts::Details::AutoPool<unsigned int>::operator[](unsigned long) Line | Count | Source | 636 | 518M | T &operator[](std::size_t id) { | 637 | 518M | return *(reinterpret_cast<T *>(&buf_[0]) + id); | 638 | 518M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::operator[](unsigned long) Line | Count | Source | 636 | 529M | T &operator[](std::size_t id) { | 637 | 529M | return *(reinterpret_cast<T *>(&buf_[0]) + id); | 638 | 529M | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::operator[](unsigned long) Line | Count | Source | 636 | 122M | T &operator[](std::size_t id) { | 637 | 122M | return *(reinterpret_cast<T *>(&buf_[0]) + id); | 638 | 122M | } |
Darts::Details::AutoPool<unsigned char>::operator[](unsigned long) Line | Count | Source | 636 | 261M | T &operator[](std::size_t id) { | 637 | 261M | return *(reinterpret_cast<T *>(&buf_[0]) + id); | 638 | 261M | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::operator[](unsigned long) Line | Count | Source | 636 | 185M | T &operator[](std::size_t id) { | 637 | 185M | return *(reinterpret_cast<T *>(&buf_[0]) + id); | 638 | 185M | } |
|
639 | | |
640 | 35.5M | bool empty() const { |
641 | 35.5M | return size_ == 0; |
642 | 35.5M | } Darts::Details::AutoPool<unsigned int>::empty() const Line | Count | Source | 640 | 35.2M | bool empty() const { | 641 | 35.2M | return size_ == 0; | 642 | 35.2M | } |
Darts::Details::AutoPool<unsigned char>::empty() const Line | Count | Source | 640 | 279k | bool empty() const { | 641 | 279k | return size_ == 0; | 642 | 279k | } |
|
643 | 559M | std::size_t size() const { |
644 | 559M | return size_; |
645 | 559M | } Darts::Details::AutoPool<unsigned int>::size() const Line | Count | Source | 643 | 261M | std::size_t size() const { | 644 | 261M | return size_; | 645 | 261M | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::size() const Line | Count | Source | 643 | 22.7M | std::size_t size() const { | 644 | 22.7M | return size_; | 645 | 22.7M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::size() const Line | Count | Source | 643 | 3.57M | std::size_t size() const { | 644 | 3.57M | return size_; | 645 | 3.57M | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::size() const Line | Count | Source | 643 | 127M | std::size_t size() const { | 644 | 127M | return size_; | 645 | 127M | } |
Darts::Details::AutoPool<unsigned char>::size() const Line | Count | Source | 643 | 143M | std::size_t size() const { | 644 | 143M | return size_; | 645 | 143M | } |
|
646 | | |
647 | 1.90M | void clear() { |
648 | 1.90M | resize(0); |
649 | 1.90M | buf_.clear(); |
650 | 1.90M | size_ = 0; |
651 | 1.90M | capacity_ = 0; |
652 | 1.90M | } Darts::Details::AutoPool<unsigned int>::clear() Line | Count | Source | 647 | 1.01M | void clear() { | 648 | 1.01M | resize(0); | 649 | 1.01M | buf_.clear(); | 650 | 1.01M | size_ = 0; | 651 | 1.01M | capacity_ = 0; | 652 | 1.01M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::clear() Line | Count | Source | 647 | 222k | void clear() { | 648 | 222k | resize(0); | 649 | 222k | buf_.clear(); | 650 | 222k | size_ = 0; | 651 | 222k | capacity_ = 0; | 652 | 222k | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::clear() Line | Count | Source | 647 | 167k | void clear() { | 648 | 167k | resize(0); | 649 | 167k | buf_.clear(); | 650 | 167k | size_ = 0; | 651 | 167k | capacity_ = 0; | 652 | 167k | } |
Darts::Details::AutoPool<unsigned char>::clear() Line | Count | Source | 647 | 365k | void clear() { | 648 | 365k | resize(0); | 649 | 365k | buf_.clear(); | 650 | 365k | size_ = 0; | 651 | 365k | capacity_ = 0; | 652 | 365k | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::clear() Line | Count | Source | 647 | 132k | void clear() { | 648 | 132k | resize(0); | 649 | 132k | buf_.clear(); | 650 | 132k | size_ = 0; | 651 | 132k | capacity_ = 0; | 652 | 132k | } |
|
653 | | |
654 | 70.4M | void push_back(const T &value) { |
655 | 70.4M | append(value); |
656 | 70.4M | } |
657 | 66.9M | void pop_back() { |
658 | 66.9M | (*this)[--size_].~T(); |
659 | 66.9M | } |
660 | | |
661 | 74.0M | void append() { |
662 | 74.0M | if (size_ == capacity_) |
663 | 1.45M | resize_buf(size_ + 1); |
664 | 74.0M | new(&(*this)[size_++]) T; |
665 | 74.0M | } Darts::Details::AutoPool<Darts::Details::DawgUnit>::append() Line | Count | Source | 661 | 35.2M | void append() { | 662 | 35.2M | if (size_ == capacity_) | 663 | 529k | resize_buf(size_ + 1); | 664 | 35.2M | new(&(*this)[size_++]) T; | 665 | 35.2M | } |
Darts::Details::AutoPool<unsigned char>::append() Line | Count | Source | 661 | 35.2M | void append() { | 662 | 35.2M | if (size_ == capacity_) | 663 | 529k | resize_buf(size_ + 1); | 664 | 35.2M | new(&(*this)[size_++]) T; | 665 | 35.2M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::append() Line | Count | Source | 661 | 3.57M | void append() { | 662 | 3.57M | if (size_ == capacity_) | 663 | 393k | resize_buf(size_ + 1); | 664 | 3.57M | new(&(*this)[size_++]) T; | 665 | 3.57M | } |
|
666 | 106M | void append(const T &value) { |
667 | 106M | if (size_ == capacity_) |
668 | 1.32M | resize_buf(size_ + 1); |
669 | 106M | new(&(*this)[size_++]) T(value); |
670 | 106M | } Darts::Details::AutoPool<unsigned int>::append(unsigned int const&) Line | Count | Source | 666 | 71.5M | void append(const T &value) { | 667 | 71.5M | if (size_ == capacity_) | 668 | 978k | resize_buf(size_ + 1); | 669 | 71.5M | new(&(*this)[size_++]) T(value); | 670 | 71.5M | } |
Darts::Details::AutoPool<unsigned char>::append(unsigned char const&) Line | Count | Source | 666 | 35.3M | void append(const T &value) { | 667 | 35.3M | if (size_ == capacity_) | 668 | 349k | resize_buf(size_ + 1); | 669 | 35.3M | new(&(*this)[size_++]) T(value); | 670 | 35.3M | } |
|
671 | | |
672 | 29.2M | void resize(std::size_t size) { |
673 | 297M | while (size_ > size) { |
674 | 268M | (*this)[--size_].~T(); |
675 | 268M | } |
676 | 29.2M | if (size > capacity_) { |
677 | 38.2k | resize_buf(size); |
678 | 38.2k | } |
679 | 79.0M | while (size_ < size) { |
680 | 49.8M | new(&(*this)[size_++]) T; |
681 | 49.8M | } |
682 | 29.2M | } Darts::Details::AutoPool<unsigned int>::resize(unsigned long) Line | Count | Source | 672 | 1.01M | void resize(std::size_t size) { | 673 | 110M | while (size_ > size) { | 674 | 109M | (*this)[--size_].~T(); | 675 | 109M | } | 676 | 1.01M | if (size > capacity_) { | 677 | 0 | resize_buf(size); | 678 | 0 | } | 679 | 1.01M | while (size_ < size) { | 680 | 0 | new(&(*this)[size_++]) T; | 681 | 0 | } | 682 | 1.01M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::resize(unsigned long) Line | Count | Source | 672 | 222k | void resize(std::size_t size) { | 673 | 3.79M | while (size_ > size) { | 674 | 3.57M | (*this)[--size_].~T(); | 675 | 3.57M | } | 676 | 222k | if (size > capacity_) { | 677 | 0 | resize_buf(size); | 678 | 0 | } | 679 | 222k | while (size_ < size) { | 680 | 0 | new(&(*this)[size_++]) T; | 681 | 0 | } | 682 | 222k | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::resize(unsigned long) Line | Count | Source | 672 | 167k | void resize(std::size_t size) { | 673 | 35.4M | while (size_ > size) { | 674 | 35.2M | (*this)[--size_].~T(); | 675 | 35.2M | } | 676 | 167k | if (size > capacity_) { | 677 | 0 | resize_buf(size); | 678 | 0 | } | 679 | 167k | while (size_ < size) { | 680 | 0 | new(&(*this)[size_++]) T; | 681 | 0 | } | 682 | 167k | } |
Darts::Details::AutoPool<unsigned char>::resize(unsigned long) Line | Count | Source | 672 | 27.4M | void resize(std::size_t size) { | 673 | 98.0M | while (size_ > size) { | 674 | 70.6M | (*this)[--size_].~T(); | 675 | 70.6M | } | 676 | 27.4M | if (size > capacity_) { | 677 | 0 | resize_buf(size); | 678 | 0 | } | 679 | 27.4M | while (size_ < size) { | 680 | 0 | new(&(*this)[size_++]) T; | 681 | 0 | } | 682 | 27.4M | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::resize(unsigned long) Line | Count | Source | 672 | 327k | void resize(std::size_t size) { | 673 | 50.1M | while (size_ > size) { | 674 | 49.8M | (*this)[--size_].~T(); | 675 | 49.8M | } | 676 | 327k | if (size > capacity_) { | 677 | 38.2k | resize_buf(size); | 678 | 38.2k | } | 679 | 50.1M | while (size_ < size) { | 680 | 49.8M | new(&(*this)[size_++]) T; | 681 | 49.8M | } | 682 | 327k | } |
|
683 | 70.8k | void resize(std::size_t size, const T &value) { |
684 | 70.8k | while (size_ > size) { |
685 | 0 | (*this)[--size_].~T(); |
686 | 0 | } |
687 | 70.8k | if (size > capacity_) { |
688 | 70.8k | resize_buf(size); |
689 | 70.8k | } |
690 | 104M | while (size_ < size) { |
691 | 104M | new(&(*this)[size_++]) T(value); |
692 | 104M | } |
693 | 70.8k | } |
694 | | |
695 | 66.2k | void reserve(std::size_t size) { |
696 | 66.2k | if (size > capacity_) { |
697 | 66.2k | resize_buf(size); |
698 | 66.2k | } |
699 | 66.2k | } |
700 | | |
701 | | private: |
702 | | AutoArray<char> buf_; |
703 | | std::size_t size_; |
704 | | std::size_t capacity_; |
705 | | |
706 | | // Disallows copy and assignment. |
707 | | AutoPool(const AutoPool &); |
708 | | AutoPool &operator=(const AutoPool &); |
709 | | |
710 | | void resize_buf(std::size_t size); |
711 | | }; |
712 | | |
713 | | template <typename T> |
714 | 2.95M | void AutoPool<T>::resize_buf(std::size_t size) { |
715 | 2.95M | std::size_t capacity; |
716 | 2.95M | if (size >= capacity_ * 2) { |
717 | 965k | capacity = size; |
718 | 1.98M | } else { |
719 | 1.98M | capacity = 1; |
720 | 11.1M | while (capacity < size) { |
721 | 9.13M | capacity <<= 1; |
722 | 9.13M | } |
723 | 1.98M | } |
724 | | |
725 | 2.95M | AutoArray<char> buf; |
726 | 2.95M | try { |
727 | 2.95M | buf.reset(new char[sizeof(T) * capacity]); |
728 | 2.95M | } catch (const std::bad_alloc &) { |
729 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); |
730 | 0 | } |
731 | | |
732 | 2.95M | if (size_ > 0) { |
733 | 2.38M | T *src = reinterpret_cast<T *>(&buf_[0]); |
734 | 2.38M | T *dest = reinterpret_cast<T *>(&buf[0]); |
735 | 124M | for (std::size_t i = 0; i < size_; ++i) { |
736 | 122M | new(&dest[i]) T(src[i]); |
737 | 122M | src[i].~T(); |
738 | 122M | } |
739 | 2.38M | } |
740 | | |
741 | 2.95M | buf_.swap(&buf); |
742 | 2.95M | capacity_ = capacity; |
743 | 2.95M | } Darts::Details::AutoPool<unsigned int>::resize_buf(unsigned long) Line | Count | Source | 714 | 1.04M | void AutoPool<T>::resize_buf(std::size_t size) { | 715 | 1.04M | std::size_t capacity; | 716 | 1.04M | if (size >= capacity_ * 2) { | 717 | 399k | capacity = size; | 718 | 649k | } else { | 719 | 649k | capacity = 1; | 720 | 3.13M | while (capacity < size) { | 721 | 2.48M | capacity <<= 1; | 722 | 2.48M | } | 723 | 649k | } | 724 | | | 725 | 1.04M | AutoArray<char> buf; | 726 | 1.04M | try { | 727 | 1.04M | buf.reset(new char[sizeof(T) * capacity]); | 728 | 1.04M | } catch (const std::bad_alloc &) { | 729 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); | 730 | 0 | } | 731 | | | 732 | 1.04M | if (size_ > 0) { | 733 | 811k | T *src = reinterpret_cast<T *>(&buf_[0]); | 734 | 811k | T *dest = reinterpret_cast<T *>(&buf[0]); | 735 | 9.44M | for (std::size_t i = 0; i < size_; ++i) { | 736 | 8.62M | new(&dest[i]) T(src[i]); | 737 | 8.62M | src[i].~T(); | 738 | 8.62M | } | 739 | 811k | } | 740 | | | 741 | 1.04M | buf_.swap(&buf); | 742 | 1.04M | capacity_ = capacity; | 743 | 1.04M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::resize_buf(unsigned long) Line | Count | Source | 714 | 393k | void AutoPool<T>::resize_buf(std::size_t size) { | 715 | 393k | std::size_t capacity; | 716 | 393k | if (size >= capacity_ * 2) { | 717 | 111k | capacity = size; | 718 | 282k | } else { | 719 | 282k | capacity = 1; | 720 | 1.46M | while (capacity < size) { | 721 | 1.18M | capacity <<= 1; | 722 | 1.18M | } | 723 | 282k | } | 724 | | | 725 | 393k | AutoArray<char> buf; | 726 | 393k | try { | 727 | 393k | buf.reset(new char[sizeof(T) * capacity]); | 728 | 393k | } catch (const std::bad_alloc &) { | 729 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); | 730 | 0 | } | 731 | | | 732 | 393k | if (size_ > 0) { | 733 | 337k | T *src = reinterpret_cast<T *>(&buf_[0]); | 734 | 337k | T *dest = reinterpret_cast<T *>(&buf[0]); | 735 | 5.36M | for (std::size_t i = 0; i < size_; ++i) { | 736 | 5.02M | new(&dest[i]) T(src[i]); | 737 | 5.02M | src[i].~T(); | 738 | 5.02M | } | 739 | 337k | } | 740 | | | 741 | 393k | buf_.swap(&buf); | 742 | 393k | capacity_ = capacity; | 743 | 393k | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::resize_buf(unsigned long) Line | Count | Source | 714 | 529k | void AutoPool<T>::resize_buf(std::size_t size) { | 715 | 529k | std::size_t capacity; | 716 | 529k | if (size >= capacity_ * 2) { | 717 | 111k | capacity = size; | 718 | 417k | } else { | 719 | 417k | capacity = 1; | 720 | 2.73M | while (capacity < size) { | 721 | 2.32M | capacity <<= 1; | 722 | 2.32M | } | 723 | 417k | } | 724 | | | 725 | 529k | AutoArray<char> buf; | 726 | 529k | try { | 727 | 529k | buf.reset(new char[sizeof(T) * capacity]); | 728 | 529k | } catch (const std::bad_alloc &) { | 729 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); | 730 | 0 | } | 731 | | | 732 | 529k | if (size_ > 0) { | 733 | 473k | T *src = reinterpret_cast<T *>(&buf_[0]); | 734 | 473k | T *dest = reinterpret_cast<T *>(&buf[0]); | 735 | 50.9M | for (std::size_t i = 0; i < size_; ++i) { | 736 | 50.5M | new(&dest[i]) T(src[i]); | 737 | 50.5M | src[i].~T(); | 738 | 50.5M | } | 739 | 473k | } | 740 | | | 741 | 529k | buf_.swap(&buf); | 742 | 529k | capacity_ = capacity; | 743 | 529k | } |
Darts::Details::AutoPool<unsigned char>::resize_buf(unsigned long) Line | Count | Source | 714 | 878k | void AutoPool<T>::resize_buf(std::size_t size) { | 715 | 878k | std::size_t capacity; | 716 | 878k | if (size >= capacity_ * 2) { | 717 | 242k | capacity = size; | 718 | 635k | } else { | 719 | 635k | capacity = 1; | 720 | 3.73M | while (capacity < size) { | 721 | 3.10M | capacity <<= 1; | 722 | 3.10M | } | 723 | 635k | } | 724 | | | 725 | 878k | AutoArray<char> buf; | 726 | 878k | try { | 727 | 878k | buf.reset(new char[sizeof(T) * capacity]); | 728 | 878k | } catch (const std::bad_alloc &) { | 729 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); | 730 | 0 | } | 731 | | | 732 | 878k | if (size_ > 0) { | 733 | 756k | T *src = reinterpret_cast<T *>(&buf_[0]); | 734 | 756k | T *dest = reinterpret_cast<T *>(&buf[0]); | 735 | 53.5M | for (std::size_t i = 0; i < size_; ++i) { | 736 | 52.7M | new(&dest[i]) T(src[i]); | 737 | 52.7M | src[i].~T(); | 738 | 52.7M | } | 739 | 756k | } | 740 | | | 741 | 878k | buf_.swap(&buf); | 742 | 878k | capacity_ = capacity; | 743 | 878k | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::resize_buf(unsigned long) Line | Count | Source | 714 | 104k | void AutoPool<T>::resize_buf(std::size_t size) { | 715 | 104k | std::size_t capacity; | 716 | 104k | if (size >= capacity_ * 2) { | 717 | 99.9k | capacity = size; | 718 | 99.9k | } else { | 719 | 4.58k | capacity = 1; | 720 | 52.5k | while (capacity < size) { | 721 | 48.0k | capacity <<= 1; | 722 | 48.0k | } | 723 | 4.58k | } | 724 | | | 725 | 104k | AutoArray<char> buf; | 726 | 104k | try { | 727 | 104k | buf.reset(new char[sizeof(T) * capacity]); | 728 | 104k | } catch (const std::bad_alloc &) { | 729 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); | 730 | 0 | } | 731 | | | 732 | 104k | if (size_ > 0) { | 733 | 10.4k | T *src = reinterpret_cast<T *>(&buf_[0]); | 734 | 10.4k | T *dest = reinterpret_cast<T *>(&buf[0]); | 735 | 5.40M | for (std::size_t i = 0; i < size_; ++i) { | 736 | 5.39M | new(&dest[i]) T(src[i]); | 737 | 5.39M | src[i].~T(); | 738 | 5.39M | } | 739 | 10.4k | } | 740 | | | 741 | 104k | buf_.swap(&buf); | 742 | 104k | capacity_ = capacity; | 743 | 104k | } |
|
744 | | |
745 | | // |
746 | | // Memory management of stack. |
747 | | // |
748 | | |
749 | | template <typename T> |
750 | | class AutoStack { |
751 | | public: |
752 | 111k | AutoStack() : pool_() {} |
753 | 111k | ~AutoStack() { |
754 | 111k | clear(); |
755 | 111k | } |
756 | | |
757 | | const T &top() const { |
758 | | return pool_[size() - 1]; |
759 | | } |
760 | 120M | T &top() { |
761 | 120M | return pool_[size() - 1]; |
762 | 120M | } |
763 | | |
764 | 35.2M | bool empty() const { |
765 | 35.2M | return pool_.empty(); |
766 | 35.2M | } |
767 | 120M | std::size_t size() const { |
768 | 120M | return pool_.size(); |
769 | 120M | } |
770 | | |
771 | 70.4M | void push(const T &value) { |
772 | 70.4M | pool_.push_back(value); |
773 | 70.4M | } |
774 | 66.9M | void pop() { |
775 | 66.9M | pool_.pop_back(); |
776 | 66.9M | } |
777 | | |
778 | 445k | void clear() { |
779 | 445k | pool_.clear(); |
780 | 445k | } |
781 | | |
782 | | private: |
783 | | AutoPool<T> pool_; |
784 | | |
785 | | // Disallows copy and assignment. |
786 | | AutoStack(const AutoStack &); |
787 | | AutoStack &operator=(const AutoStack &); |
788 | | }; |
789 | | |
790 | | // |
791 | | // Succinct bit vector. |
792 | | // |
793 | | |
794 | | class BitVector { |
795 | | public: |
796 | 55.6k | BitVector() : units_(), ranks_(), num_ones_(0), size_(0) {} |
797 | 55.6k | ~BitVector() { |
798 | 55.6k | clear(); |
799 | 55.6k | } |
800 | | |
801 | 53.9M | bool operator[](std::size_t id) const { |
802 | 53.9M | return (units_[id / UNIT_SIZE] >> (id % UNIT_SIZE) & 1) == 1; |
803 | 53.9M | } |
804 | | |
805 | 0 | id_type rank(std::size_t id) const { |
806 | 0 | std::size_t unit_id = id / UNIT_SIZE; |
807 | 0 | return ranks_[unit_id] + pop_count(units_[unit_id] |
808 | 0 | & (~0U >> (UNIT_SIZE - (id % UNIT_SIZE) - 1))); |
809 | 0 | } |
810 | | |
811 | 0 | void set(std::size_t id, bool bit) { |
812 | 0 | if (bit) { |
813 | 0 | units_[id / UNIT_SIZE] |= 1U << (id % UNIT_SIZE); |
814 | 0 | } else { |
815 | 0 | units_[id / UNIT_SIZE] &= ~(1U << (id % UNIT_SIZE)); |
816 | 0 | } |
817 | 0 | } |
818 | | |
819 | 0 | bool empty() const { |
820 | 0 | return units_.empty(); |
821 | 0 | } |
822 | 111k | std::size_t num_ones() const { |
823 | 111k | return num_ones_; |
824 | 111k | } |
825 | 35.2M | std::size_t size() const { |
826 | 35.2M | return size_; |
827 | 35.2M | } |
828 | | |
829 | 35.2M | void append() { |
830 | 35.2M | if ((size_ % UNIT_SIZE) == 0) { |
831 | 1.12M | units_.append(0); |
832 | 1.12M | } |
833 | 35.2M | ++size_; |
834 | 35.2M | } |
835 | | void build(); |
836 | | |
837 | 167k | void clear() { |
838 | 167k | units_.clear(); |
839 | 167k | ranks_.clear(); |
840 | 167k | } |
841 | | |
842 | | private: |
843 | | enum { UNIT_SIZE = sizeof(id_type) * 8 }; |
844 | | |
845 | | AutoPool<id_type> units_; |
846 | | AutoArray<id_type> ranks_; |
847 | | std::size_t num_ones_; |
848 | | std::size_t size_; |
849 | | |
850 | | // Disallows copy and assignment. |
851 | | BitVector(const BitVector &); |
852 | | BitVector &operator=(const BitVector &); |
853 | | |
854 | 1.12M | static id_type pop_count(id_type unit) { |
855 | 1.12M | unit = ((unit & 0xAAAAAAAA) >> 1) + (unit & 0x55555555); |
856 | 1.12M | unit = ((unit & 0xCCCCCCCC) >> 2) + (unit & 0x33333333); |
857 | 1.12M | unit = ((unit >> 4) + unit) & 0x0F0F0F0F; |
858 | 1.12M | unit += unit >> 8; |
859 | 1.12M | unit += unit >> 16; |
860 | 1.12M | return unit & 0xFF; |
861 | 1.12M | } |
862 | | }; |
863 | | |
864 | 55.6k | inline void BitVector::build() { |
865 | 55.6k | try { |
866 | 55.6k | ranks_.reset(new id_type[units_.size()]); |
867 | 55.6k | } catch (const std::bad_alloc &) { |
868 | 0 | DARTS_THROW("failed to build rank index: std::bad_alloc"); |
869 | 0 | } |
870 | | |
871 | 55.6k | num_ones_ = 0; |
872 | 1.18M | for (std::size_t i = 0; i < units_.size(); ++i) { |
873 | 1.12M | ranks_[i] = num_ones_; |
874 | 1.12M | num_ones_ += pop_count(units_[i]); |
875 | 1.12M | } |
876 | 55.6k | } |
877 | | |
878 | | // |
879 | | // Keyset. |
880 | | // |
881 | | |
882 | | template <typename T> |
883 | | class Keyset { |
884 | | public: |
885 | | Keyset(std::size_t num_keys, const char_type * const *keys, |
886 | | const std::size_t *lengths, const T *values) : |
887 | 66.2k | num_keys_(num_keys), keys_(keys), lengths_(lengths), values_(values) {} |
888 | | |
889 | 8.39M | std::size_t num_keys() const { |
890 | 8.39M | return num_keys_; |
891 | 8.39M | } |
892 | 8.28M | const char_type *keys(std::size_t id) const { |
893 | 8.28M | return keys_[id]; |
894 | 8.28M | } |
895 | 699k | uchar_type keys(std::size_t key_id, std::size_t char_id) const { |
896 | 699k | if (has_lengths() && char_id >= lengths_[key_id]) |
897 | 97.6k | return '\0'; |
898 | 602k | return keys_[key_id][char_id]; |
899 | 699k | } |
900 | | |
901 | 9.08M | bool has_lengths() const { |
902 | 9.08M | return lengths_ != NULL; |
903 | 9.08M | } |
904 | 8.33M | std::size_t lengths(std::size_t id) const { |
905 | 8.33M | if (has_lengths()) { |
906 | 8.33M | return lengths_[id]; |
907 | 8.33M | } |
908 | 0 | std::size_t length = 0; |
909 | 0 | while (keys_[id][length] != '\0') { |
910 | 0 | ++length; |
911 | 0 | } |
912 | 0 | return length; |
913 | 8.33M | } |
914 | | |
915 | 8.44M | bool has_values() const { |
916 | 8.44M | return values_ != NULL; |
917 | 8.44M | } |
918 | 8.38M | const value_type values(std::size_t id) const { |
919 | 8.38M | if (has_values()) { |
920 | 8.28M | return static_cast<value_type>(values_[id]); |
921 | 8.28M | } |
922 | 97.6k | return static_cast<value_type>(id); |
923 | 8.38M | } |
924 | | |
925 | | private: |
926 | | std::size_t num_keys_; |
927 | | const char_type * const * keys_; |
928 | | const std::size_t *lengths_; |
929 | | const T *values_; |
930 | | |
931 | | // Disallows copy and assignment. |
932 | | Keyset(const Keyset &); |
933 | | Keyset &operator=(const Keyset &); |
934 | | }; |
935 | | |
936 | | // |
937 | | // Node of Directed Acyclic Word Graph (DAWG). |
938 | | // |
939 | | |
940 | | class DawgNode { |
941 | | public: |
942 | 35.2M | DawgNode() : child_(0), sibling_(0), label_('\0'), |
943 | 35.2M | is_state_(false), has_sibling_(false) {} |
944 | | |
945 | 62.1M | void set_child(id_type child) { |
946 | 62.1M | child_ = child; |
947 | 62.1M | } |
948 | 35.1M | void set_sibling(id_type sibling) { |
949 | 35.1M | sibling_ = sibling; |
950 | 35.1M | } |
951 | 8.28M | void set_value(value_type value) { |
952 | 8.28M | child_ = value; |
953 | 8.28M | } |
954 | 35.2M | void set_label(uchar_type label) { |
955 | 35.2M | label_ = label; |
956 | 35.2M | } |
957 | 26.9M | void set_is_state(bool is_state) { |
958 | 26.9M | is_state_ = is_state; |
959 | 26.9M | } |
960 | 8.22M | void set_has_sibling(bool has_sibling) { |
961 | 8.22M | has_sibling_ = has_sibling; |
962 | 8.22M | } |
963 | | |
964 | 104M | id_type child() const { |
965 | 104M | return child_; |
966 | 104M | } |
967 | 177M | id_type sibling() const { |
968 | 177M | return sibling_; |
969 | 177M | } |
970 | 0 | value_type value() const { |
971 | 0 | return static_cast<value_type>(child_); |
972 | 0 | } |
973 | 104M | uchar_type label() const { |
974 | 104M | return label_; |
975 | 104M | } |
976 | 0 | bool is_state() const { |
977 | 0 | return is_state_; |
978 | 0 | } |
979 | 0 | bool has_sibling() const { |
980 | 0 | return has_sibling_; |
981 | 0 | } |
982 | | |
983 | 98.8M | id_type unit() const { |
984 | 98.8M | if (label_ == '\0') { |
985 | 22.1M | return (child_ << 1) | (has_sibling_ ? 1 : 0); |
986 | 22.1M | } |
987 | 76.7M | return (child_ << 2) | (is_state_ ? 2 : 0) | (has_sibling_ ? 1 : 0); |
988 | 98.8M | } |
989 | | |
990 | | private: |
991 | | id_type child_; |
992 | | id_type sibling_; |
993 | | uchar_type label_; |
994 | | bool is_state_; |
995 | | bool has_sibling_; |
996 | | |
997 | | // Copyable. |
998 | | }; |
999 | | |
1000 | | // |
1001 | | // Fixed unit of Directed Acyclic Word Graph (DAWG). |
1002 | | // |
1003 | | |
1004 | | class DawgUnit { |
1005 | | public: |
1006 | 35.2M | explicit DawgUnit(id_type unit = 0) : unit_(unit) {} |
1007 | 50.5M | DawgUnit(const DawgUnit &unit) : unit_(unit.unit_) {} |
1008 | | |
1009 | 35.2M | DawgUnit &operator=(id_type unit) { |
1010 | 35.2M | unit_ = unit; |
1011 | 35.2M | return *this; |
1012 | 35.2M | } |
1013 | | |
1014 | 50.6M | id_type unit() const { |
1015 | 50.6M | return unit_; |
1016 | 50.6M | } |
1017 | | |
1018 | 80.9M | id_type child() const { |
1019 | 80.9M | return unit_ >> 2; |
1020 | 80.9M | } |
1021 | 164M | bool has_sibling() const { |
1022 | 164M | return (unit_ & 1) == 1; |
1023 | 164M | } |
1024 | 8.28M | value_type value() const { |
1025 | 8.28M | return static_cast<value_type>(unit_ >> 1); |
1026 | 8.28M | } |
1027 | 17.0M | bool is_state() const { |
1028 | 17.0M | return (unit_ & 2) == 2; |
1029 | 17.0M | } |
1030 | | |
1031 | | private: |
1032 | | id_type unit_; |
1033 | | |
1034 | | // Copyable. |
1035 | | }; |
1036 | | |
1037 | | // |
1038 | | // Directed Acyclic Word Graph (DAWG) builder. |
1039 | | // |
1040 | | |
1041 | | class DawgBuilder { |
1042 | | public: |
1043 | 55.6k | DawgBuilder() : nodes_(), units_(), labels_(), is_intersections_(), |
1044 | 55.6k | table_(), node_stack_(), recycle_bin_(), num_states_(0) {} |
1045 | 55.6k | ~DawgBuilder() { |
1046 | 55.6k | clear(); |
1047 | 55.6k | } |
1048 | | |
1049 | 111k | id_type root() const { |
1050 | 111k | return 0; |
1051 | 111k | } |
1052 | | |
1053 | 80.9M | id_type child(id_type id) const { |
1054 | 80.9M | return units_[id].child(); |
1055 | 80.9M | } |
1056 | 105M | id_type sibling(id_type id) const { |
1057 | 105M | return units_[id].has_sibling() ? (id + 1) : 0; |
1058 | 105M | } |
1059 | 8.28M | int value(id_type id) const { |
1060 | 8.28M | return units_[id].value(); |
1061 | 8.28M | } |
1062 | | |
1063 | 35.1M | bool is_leaf(id_type id) const { |
1064 | 35.1M | return label(id) == '\0'; |
1065 | 35.1M | } |
1066 | 105M | uchar_type label(id_type id) const { |
1067 | 105M | return labels_[id]; |
1068 | 105M | } |
1069 | | |
1070 | 53.9M | bool is_intersection(id_type id) const { |
1071 | 53.9M | return is_intersections_[id]; |
1072 | 53.9M | } |
1073 | 0 | id_type intersection_id(id_type id) const { |
1074 | 0 | return is_intersections_.rank(id) - 1; |
1075 | 0 | } |
1076 | | |
1077 | 111k | std::size_t num_intersections() const { |
1078 | 111k | return is_intersections_.num_ones(); |
1079 | 111k | } |
1080 | | |
1081 | 529k | std::size_t size() const { |
1082 | 529k | return units_.size(); |
1083 | 529k | } |
1084 | | |
1085 | | void init(); |
1086 | | void finish(); |
1087 | | |
1088 | | void insert(const char *key, std::size_t length, value_type value); |
1089 | | |
1090 | | void clear(); |
1091 | | |
1092 | | private: |
1093 | | enum { INITIAL_TABLE_SIZE = 1 << 10 }; |
1094 | | |
1095 | | AutoPool<DawgNode> nodes_; |
1096 | | AutoPool<DawgUnit> units_; |
1097 | | AutoPool<uchar_type> labels_; |
1098 | | BitVector is_intersections_; |
1099 | | AutoPool<id_type> table_; |
1100 | | AutoStack<id_type> node_stack_; |
1101 | | AutoStack<id_type> recycle_bin_; |
1102 | | std::size_t num_states_; |
1103 | | |
1104 | | // Disallows copy and assignment. |
1105 | | DawgBuilder(const DawgBuilder &); |
1106 | | DawgBuilder &operator=(const DawgBuilder &); |
1107 | | |
1108 | | void flush(id_type id); |
1109 | | |
1110 | | void expand_table(); |
1111 | | |
1112 | | id_type find_unit(id_type id, id_type *hash_id) const; |
1113 | | id_type find_node(id_type node_id, id_type *hash_id) const; |
1114 | | |
1115 | | bool are_equal(id_type node_id, id_type unit_id) const; |
1116 | | |
1117 | | id_type hash_unit(id_type id) const; |
1118 | | id_type hash_node(id_type id) const; |
1119 | | |
1120 | | id_type append_node(); |
1121 | | id_type append_unit(); |
1122 | | |
1123 | 35.1M | void free_node(id_type id) { |
1124 | 35.1M | recycle_bin_.push(id); |
1125 | 35.1M | } |
1126 | | |
1127 | 57.3M | static id_type hash(id_type key) { |
1128 | 57.3M | key = ~key + (key << 15); // key = (key << 15) - key - 1; |
1129 | 57.3M | key = key ^ (key >> 12); |
1130 | 57.3M | key = key + (key << 2); |
1131 | 57.3M | key = key ^ (key >> 4); |
1132 | 57.3M | key = key * 2057; // key = (key + (key << 3)) + (key << 11); |
1133 | 57.3M | key = key ^ (key >> 16); |
1134 | 57.3M | return key; |
1135 | 57.3M | } |
1136 | | }; |
1137 | | |
1138 | 55.6k | inline void DawgBuilder::init() { |
1139 | 55.6k | table_.resize(INITIAL_TABLE_SIZE, 0); |
1140 | | |
1141 | 55.6k | append_node(); |
1142 | 55.6k | append_unit(); |
1143 | | |
1144 | 55.6k | num_states_ = 1; |
1145 | | |
1146 | 55.6k | nodes_[0].set_label(0xFF); |
1147 | 55.6k | node_stack_.push(0); |
1148 | 55.6k | } |
1149 | | |
1150 | 55.6k | inline void DawgBuilder::finish() { |
1151 | 55.6k | flush(0); |
1152 | | |
1153 | 55.6k | units_[0] = nodes_[0].unit(); |
1154 | 55.6k | labels_[0] = nodes_[0].label(); |
1155 | | |
1156 | 55.6k | nodes_.clear(); |
1157 | 55.6k | table_.clear(); |
1158 | 55.6k | node_stack_.clear(); |
1159 | 55.6k | recycle_bin_.clear(); |
1160 | | |
1161 | 55.6k | is_intersections_.build(); |
1162 | 55.6k | } |
1163 | | |
1164 | | inline void DawgBuilder::insert(const char *key, std::size_t length, |
1165 | 8.28M | value_type value) { |
1166 | 8.28M | if (value < 0) { |
1167 | 0 | DARTS_THROW("failed to insert key: negative value"); |
1168 | 8.28M | } else if (length == 0) { |
1169 | 0 | DARTS_THROW("failed to insert key: zero-length key"); |
1170 | 0 | } |
1171 | | |
1172 | 8.28M | id_type id = 0; |
1173 | 8.28M | std::size_t key_pos = 0; |
1174 | | |
1175 | 34.1M | for ( ; key_pos <= length; ++key_pos) { |
1176 | 34.1M | id_type child_id = nodes_[id].child(); |
1177 | 34.1M | if (child_id == 0) { |
1178 | 55.6k | break; |
1179 | 55.6k | } |
1180 | | |
1181 | 34.0M | uchar_type key_label = static_cast<uchar_type>(key[key_pos]); |
1182 | 34.0M | if (key_pos < length && key_label == '\0') { |
1183 | 0 | DARTS_THROW("failed to insert key: invalid null character"); |
1184 | 0 | } |
1185 | | |
1186 | 34.0M | uchar_type unit_label = nodes_[child_id].label(); |
1187 | 34.0M | if (key_label < unit_label) { |
1188 | 0 | DARTS_THROW("failed to insert key: wrong key order"); |
1189 | 34.0M | } else if (key_label > unit_label) { |
1190 | 8.22M | nodes_[child_id].set_has_sibling(true); |
1191 | 8.22M | flush(child_id); |
1192 | 8.22M | break; |
1193 | 8.22M | } |
1194 | 25.8M | id = child_id; |
1195 | 25.8M | } |
1196 | | |
1197 | 8.28M | if (key_pos > length) { |
1198 | 0 | return; |
1199 | 0 | } |
1200 | | |
1201 | 43.4M | for ( ; key_pos <= length; ++key_pos) { |
1202 | 35.1M | uchar_type key_label = static_cast<uchar_type>( |
1203 | 35.1M | (key_pos < length) ? key[key_pos] : '\0'); |
1204 | 35.1M | id_type child_id = append_node(); |
1205 | | |
1206 | 35.1M | if (nodes_[id].child() == 0) { |
1207 | 26.9M | nodes_[child_id].set_is_state(true); |
1208 | 26.9M | } |
1209 | 35.1M | nodes_[child_id].set_sibling(nodes_[id].child()); |
1210 | 35.1M | nodes_[child_id].set_label(key_label); |
1211 | 35.1M | nodes_[id].set_child(child_id); |
1212 | 35.1M | node_stack_.push(child_id); |
1213 | | |
1214 | 35.1M | id = child_id; |
1215 | 35.1M | } |
1216 | 8.28M | nodes_[id].set_value(value); |
1217 | 8.28M | } |
1218 | | |
1219 | 111k | inline void DawgBuilder::clear() { |
1220 | 111k | nodes_.clear(); |
1221 | 111k | units_.clear(); |
1222 | 111k | labels_.clear(); |
1223 | 111k | is_intersections_.clear(); |
1224 | 111k | table_.clear(); |
1225 | 111k | node_stack_.clear(); |
1226 | 111k | recycle_bin_.clear(); |
1227 | 111k | num_states_ = 0; |
1228 | 111k | } |
1229 | | |
1230 | 8.28M | inline void DawgBuilder::flush(id_type id) { |
1231 | 35.2M | while (node_stack_.top() != id) { |
1232 | 26.9M | id_type node_id = node_stack_.top(); |
1233 | 26.9M | node_stack_.pop(); |
1234 | | |
1235 | 26.9M | if (num_states_ >= table_.size() - (table_.size() >> 2)) { |
1236 | 15.1k | expand_table(); |
1237 | 15.1k | } |
1238 | | |
1239 | 26.9M | id_type num_siblings = 0; |
1240 | 62.1M | for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { |
1241 | 35.1M | ++num_siblings; |
1242 | 35.1M | } |
1243 | | |
1244 | 26.9M | id_type hash_id; |
1245 | 26.9M | id_type match_id = find_node(node_id, &hash_id); |
1246 | 26.9M | if (match_id != 0) { |
1247 | 0 | is_intersections_.set(match_id, true); |
1248 | 26.9M | } else { |
1249 | 26.9M | id_type unit_id = 0; |
1250 | 62.1M | for (id_type i = 0; i < num_siblings; ++i) { |
1251 | 35.1M | unit_id = append_unit(); |
1252 | 35.1M | } |
1253 | 62.1M | for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { |
1254 | 35.1M | units_[unit_id] = nodes_[i].unit(); |
1255 | 35.1M | labels_[unit_id] = nodes_[i].label(); |
1256 | 35.1M | --unit_id; |
1257 | 35.1M | } |
1258 | 26.9M | match_id = unit_id + 1; |
1259 | 26.9M | table_[hash_id] = match_id; |
1260 | 26.9M | ++num_states_; |
1261 | 26.9M | } |
1262 | | |
1263 | 62.1M | for (id_type i = node_id, next; i != 0; i = next) { |
1264 | 35.1M | next = nodes_[i].sibling(); |
1265 | 35.1M | free_node(i); |
1266 | 35.1M | } |
1267 | | |
1268 | 26.9M | nodes_[node_stack_.top()].set_child(match_id); |
1269 | 26.9M | } |
1270 | 8.28M | node_stack_.pop(); |
1271 | 8.28M | } |
1272 | | |
1273 | 15.1k | inline void DawgBuilder::expand_table() { |
1274 | 15.1k | std::size_t table_size = table_.size() << 1; |
1275 | 15.1k | table_.clear(); |
1276 | 15.1k | table_.resize(table_size, 0); |
1277 | | |
1278 | 22.2M | for (std::size_t i = 1; i < units_.size(); ++i) { |
1279 | 22.1M | id_type id = static_cast<id_type>(i); |
1280 | 22.1M | if (labels_[id] == '\0' || units_[id].is_state()) { |
1281 | 17.7M | id_type hash_id; |
1282 | 17.7M | find_unit(id, &hash_id); |
1283 | 17.7M | table_[hash_id] = id; |
1284 | 17.7M | } |
1285 | 22.1M | } |
1286 | 15.1k | } |
1287 | | |
1288 | 17.7M | inline id_type DawgBuilder::find_unit(id_type id, id_type *hash_id) const { |
1289 | 17.7M | *hash_id = hash_unit(id) % table_.size(); |
1290 | 23.0M | for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { |
1291 | 23.0M | id_type unit_id = table_[*hash_id]; |
1292 | 23.0M | if (unit_id == 0) { |
1293 | 17.7M | break; |
1294 | 17.7M | } |
1295 | | |
1296 | | // There must not be the same unit. |
1297 | 23.0M | } |
1298 | 17.7M | return 0; |
1299 | 17.7M | } |
1300 | | |
1301 | | inline id_type DawgBuilder::find_node(id_type node_id, |
1302 | 26.9M | id_type *hash_id) const { |
1303 | 26.9M | *hash_id = hash_node(node_id) % table_.size(); |
1304 | 62.7M | for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { |
1305 | 62.7M | id_type unit_id = table_[*hash_id]; |
1306 | 62.7M | if (unit_id == 0) { |
1307 | 26.9M | break; |
1308 | 26.9M | } |
1309 | | |
1310 | 35.7M | if (are_equal(node_id, unit_id)) { |
1311 | 0 | return unit_id; |
1312 | 0 | } |
1313 | 35.7M | } |
1314 | 26.9M | return 0; |
1315 | 26.9M | } |
1316 | | |
1317 | 35.7M | inline bool DawgBuilder::are_equal(id_type node_id, id_type unit_id) const { |
1318 | 36.6M | for (id_type i = nodes_[node_id].sibling(); i != 0; |
1319 | 35.7M | i = nodes_[i].sibling()) { |
1320 | 4.48M | if (units_[unit_id].has_sibling() == false) { |
1321 | 3.62M | return false; |
1322 | 3.62M | } |
1323 | 855k | ++unit_id; |
1324 | 855k | } |
1325 | 32.1M | if (units_[unit_id].has_sibling() == true) { |
1326 | 3.70M | return false; |
1327 | 3.70M | } |
1328 | | |
1329 | 28.4M | for (id_type i = node_id; i != 0; i = nodes_[i].sibling(), --unit_id) { |
1330 | 28.4M | if (nodes_[i].unit() != units_[unit_id].unit() || |
1331 | 28.4M | nodes_[i].label() != labels_[unit_id]) { |
1332 | 28.4M | return false; |
1333 | 28.4M | } |
1334 | 28.4M | } |
1335 | 0 | return true; |
1336 | 28.4M | } |
1337 | | |
1338 | 17.7M | inline id_type DawgBuilder::hash_unit(id_type id) const { |
1339 | 17.7M | id_type hash_value = 0; |
1340 | 22.1M | for ( ; id != 0; ++id) { |
1341 | 22.1M | id_type unit = units_[id].unit(); |
1342 | 22.1M | uchar_type label = labels_[id]; |
1343 | 22.1M | hash_value ^= hash((label << 24) ^ unit); |
1344 | | |
1345 | 22.1M | if (units_[id].has_sibling() == false) { |
1346 | 17.7M | break; |
1347 | 17.7M | } |
1348 | 22.1M | } |
1349 | 17.7M | return hash_value; |
1350 | 17.7M | } |
1351 | | |
1352 | 26.9M | inline id_type DawgBuilder::hash_node(id_type id) const { |
1353 | 26.9M | id_type hash_value = 0; |
1354 | 62.1M | for ( ; id != 0; id = nodes_[id].sibling()) { |
1355 | 35.1M | id_type unit = nodes_[id].unit(); |
1356 | 35.1M | uchar_type label = nodes_[id].label(); |
1357 | 35.1M | hash_value ^= hash((label << 24) ^ unit); |
1358 | 35.1M | } |
1359 | 26.9M | return hash_value; |
1360 | 26.9M | } |
1361 | | |
1362 | 35.2M | inline id_type DawgBuilder::append_unit() { |
1363 | 35.2M | is_intersections_.append(); |
1364 | 35.2M | units_.append(); |
1365 | 35.2M | labels_.append(); |
1366 | | |
1367 | 35.2M | return static_cast<id_type>(is_intersections_.size() - 1); |
1368 | 35.2M | } |
1369 | | |
1370 | 35.2M | inline id_type DawgBuilder::append_node() { |
1371 | 35.2M | id_type id; |
1372 | 35.2M | if (recycle_bin_.empty()) { |
1373 | 3.57M | id = static_cast<id_type>(nodes_.size()); |
1374 | 3.57M | nodes_.append(); |
1375 | 31.6M | } else { |
1376 | 31.6M | id = recycle_bin_.top(); |
1377 | 31.6M | nodes_[id] = DawgNode(); |
1378 | 31.6M | recycle_bin_.pop(); |
1379 | 31.6M | } |
1380 | 35.2M | return id; |
1381 | 35.2M | } |
1382 | | |
1383 | | // |
1384 | | // Unit of double-array builder. |
1385 | | // |
1386 | | |
1387 | | class DoubleArrayBuilderUnit { |
1388 | | public: |
1389 | 49.8M | DoubleArrayBuilderUnit() : unit_(0) {} |
1390 | | |
1391 | 8.33M | void set_has_leaf(bool has_leaf) { |
1392 | 8.33M | if (has_leaf) { |
1393 | 8.33M | unit_ |= 1U << 8; |
1394 | 8.33M | } else { |
1395 | 0 | unit_ &= ~(1U << 8); |
1396 | 0 | } |
1397 | 8.33M | } |
1398 | 8.33M | void set_value(value_type value) { |
1399 | 8.33M | unit_ = value | (1U << 31); |
1400 | 8.33M | } |
1401 | 41.5M | void set_label(uchar_type label) { |
1402 | 41.5M | unit_ = (unit_ & ~0xFFU) | label; |
1403 | 41.5M | } |
1404 | 27.1M | void set_offset(id_type offset) { |
1405 | 27.1M | if (offset >= 1U << 29) { |
1406 | 0 | DARTS_THROW("failed to modify unit: too large offset"); |
1407 | 0 | } |
1408 | 27.1M | unit_ &= (1U << 31) | (1U << 8) | 0xFF; |
1409 | 27.1M | if (offset < 1U << 21) { |
1410 | 27.1M | unit_ |= (offset << 10); |
1411 | 27.1M | } else { |
1412 | 0 | unit_ |= (offset << 2) | (1U << 9); |
1413 | 0 | } |
1414 | 27.1M | } |
1415 | | |
1416 | | private: |
1417 | | id_type unit_; |
1418 | | |
1419 | | // Copyable. |
1420 | | }; |
1421 | | |
1422 | | // |
1423 | | // Extra unit of double-array builder. |
1424 | | // |
1425 | | |
1426 | | class DoubleArrayBuilderExtraUnit { |
1427 | | public: |
1428 | 271M | DoubleArrayBuilderExtraUnit() : prev_(0), next_(0), |
1429 | 271M | is_fixed_(false), is_used_(false) {} |
1430 | | |
1431 | 100M | void set_prev(id_type prev) { |
1432 | 100M | prev_ = prev; |
1433 | 100M | } |
1434 | 100M | void set_next(id_type next) { |
1435 | 100M | next_ = next; |
1436 | 100M | } |
1437 | 52.2M | void set_is_fixed(bool is_fixed) { |
1438 | 52.2M | is_fixed_ = is_fixed; |
1439 | 52.2M | } |
1440 | 29.5M | void set_is_used(bool is_used) { |
1441 | 29.5M | is_used_ = is_used; |
1442 | 29.5M | } |
1443 | | |
1444 | 100M | id_type prev() const { |
1445 | 100M | return prev_; |
1446 | 100M | } |
1447 | 324M | id_type next() const { |
1448 | 324M | return next_; |
1449 | 324M | } |
1450 | 103M | bool is_fixed() const { |
1451 | 103M | return is_fixed_; |
1452 | 103M | } |
1453 | 233M | bool is_used() const { |
1454 | 233M | return is_used_; |
1455 | 233M | } |
1456 | | |
1457 | | private: |
1458 | | id_type prev_; |
1459 | | id_type next_; |
1460 | | bool is_fixed_; |
1461 | | bool is_used_; |
1462 | | |
1463 | | // Copyable. |
1464 | | }; |
1465 | | |
1466 | | // |
1467 | | // DAWG -> double-array converter. |
1468 | | // |
1469 | | |
1470 | | class DoubleArrayBuilder { |
1471 | | public: |
1472 | | explicit DoubleArrayBuilder(progress_func_type progress_func) |
1473 | 66.2k | : progress_func_(progress_func), units_(), extras_(), labels_(), |
1474 | 66.2k | table_(), extras_head_(0) {} |
1475 | 66.2k | ~DoubleArrayBuilder() { |
1476 | 66.2k | clear(); |
1477 | 66.2k | } |
1478 | | |
1479 | | template <typename T> |
1480 | | void build(const Keyset<T> &keyset); |
1481 | | void copy(std::size_t *size_ptr, DoubleArrayUnit **buf_ptr) const; |
1482 | | |
1483 | | void clear(); |
1484 | | |
1485 | | private: |
1486 | | static constexpr int BLOCK_SIZE = 256; |
1487 | | static constexpr int NUM_EXTRA_BLOCKS = 16; |
1488 | | static constexpr int NUM_EXTRAS = BLOCK_SIZE * NUM_EXTRA_BLOCKS; |
1489 | | |
1490 | | static constexpr int UPPER_MASK = 0xFF << 21; |
1491 | | static constexpr int LOWER_MASK = 0xFF; |
1492 | | |
1493 | | typedef DoubleArrayBuilderUnit unit_type; |
1494 | | typedef DoubleArrayBuilderExtraUnit extra_type; |
1495 | | |
1496 | | progress_func_type progress_func_; |
1497 | | AutoPool<unit_type> units_; |
1498 | | AutoArray<extra_type> extras_; |
1499 | | AutoPool<uchar_type> labels_; |
1500 | | AutoArray<id_type> table_; |
1501 | | id_type extras_head_; |
1502 | | |
1503 | | // Disallows copy and assignment. |
1504 | | DoubleArrayBuilder(const DoubleArrayBuilder &); |
1505 | | DoubleArrayBuilder &operator=(const DoubleArrayBuilder &); |
1506 | | |
1507 | 328k | std::size_t num_blocks() const { |
1508 | 328k | return units_.size() / BLOCK_SIZE; |
1509 | 328k | } |
1510 | | |
1511 | 489M | const extra_type &extras(id_type id) const { |
1512 | 489M | return extras_[id % NUM_EXTRAS]; |
1513 | 489M | } |
1514 | 554M | extra_type &extras(id_type id) { |
1515 | 554M | return extras_[id % NUM_EXTRAS]; |
1516 | 554M | } |
1517 | | |
1518 | | template <typename T> |
1519 | | void build_dawg(const Keyset<T> &keyset, DawgBuilder *dawg_builder); |
1520 | | void build_from_dawg(const DawgBuilder &dawg); |
1521 | | void build_from_dawg(const DawgBuilder &dawg, |
1522 | | id_type dawg_id, id_type dic_id); |
1523 | | id_type arrange_from_dawg(const DawgBuilder &dawg, |
1524 | | id_type dawg_id, id_type dic_id); |
1525 | | |
1526 | | template <typename T> |
1527 | | void build_from_keyset(const Keyset<T> &keyset); |
1528 | | template <typename T> |
1529 | | void build_from_keyset(const Keyset<T> &keyset, std::size_t begin, |
1530 | | std::size_t end, std::size_t depth, id_type dic_id); |
1531 | | template <typename T> |
1532 | | id_type arrange_from_keyset(const Keyset<T> &keyset, std::size_t begin, |
1533 | | std::size_t end, std::size_t depth, id_type dic_id); |
1534 | | |
1535 | | id_type find_valid_offset(id_type id) const; |
1536 | | bool is_valid_offset(id_type id, id_type offset) const; |
1537 | | |
1538 | | void reserve_id(id_type id); |
1539 | | void expand_units(); |
1540 | | |
1541 | | void fix_all_blocks(); |
1542 | | void fix_block(id_type block_id); |
1543 | | }; |
1544 | | |
1545 | | template <typename T> |
1546 | 66.2k | void DoubleArrayBuilder::build(const Keyset<T> &keyset) { |
1547 | 66.2k | if (keyset.has_values()) { |
1548 | 55.6k | Details::DawgBuilder dawg_builder; |
1549 | 55.6k | build_dawg(keyset, &dawg_builder); |
1550 | 55.6k | build_from_dawg(dawg_builder); |
1551 | 55.6k | dawg_builder.clear(); |
1552 | 55.6k | } else { |
1553 | 10.5k | build_from_keyset(keyset); |
1554 | 10.5k | } |
1555 | 66.2k | } |
1556 | | |
1557 | | inline void DoubleArrayBuilder::copy(std::size_t *size_ptr, |
1558 | 66.2k | DoubleArrayUnit **buf_ptr) const { |
1559 | 66.2k | if (size_ptr != NULL) { |
1560 | 66.2k | *size_ptr = units_.size(); |
1561 | 66.2k | } |
1562 | 66.2k | if (buf_ptr != NULL) { |
1563 | 66.2k | *buf_ptr = new DoubleArrayUnit[units_.size()]; |
1564 | 66.2k | unit_type *units = reinterpret_cast<unit_type *>(*buf_ptr); |
1565 | 49.9M | for (std::size_t i = 0; i < units_.size(); ++i) { |
1566 | 49.8M | units[i] = units_[i]; |
1567 | 49.8M | } |
1568 | 66.2k | } |
1569 | 66.2k | } |
1570 | | |
1571 | 66.2k | inline void DoubleArrayBuilder::clear() { |
1572 | 66.2k | units_.clear(); |
1573 | 66.2k | extras_.clear(); |
1574 | 66.2k | labels_.clear(); |
1575 | 66.2k | table_.clear(); |
1576 | 66.2k | extras_head_ = 0; |
1577 | 66.2k | } |
1578 | | |
1579 | | template <typename T> |
1580 | | void DoubleArrayBuilder::build_dawg(const Keyset<T> &keyset, |
1581 | 55.6k | DawgBuilder *dawg_builder) { |
1582 | 55.6k | dawg_builder->init(); |
1583 | 8.33M | for (std::size_t i = 0; i < keyset.num_keys(); ++i) { |
1584 | 8.28M | dawg_builder->insert(keyset.keys(i), keyset.lengths(i), keyset.values(i)); |
1585 | 8.28M | if (progress_func_ != NULL) { |
1586 | 0 | progress_func_(i + 1, keyset.num_keys() + 1); |
1587 | 0 | } |
1588 | 8.28M | } |
1589 | 55.6k | dawg_builder->finish(); |
1590 | 55.6k | } |
1591 | | |
1592 | 55.6k | inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg) { |
1593 | 55.6k | std::size_t num_units = 1; |
1594 | 529k | while (num_units < dawg.size()) { |
1595 | 473k | num_units <<= 1; |
1596 | 473k | } |
1597 | 55.6k | units_.reserve(num_units); |
1598 | | |
1599 | 55.6k | table_.reset(new id_type[dawg.num_intersections()]); |
1600 | 55.6k | for (std::size_t i = 0; i < dawg.num_intersections(); ++i) { |
1601 | 0 | table_[i] = 0; |
1602 | 0 | } |
1603 | | |
1604 | 55.6k | extras_.reset(new extra_type[NUM_EXTRAS]); |
1605 | | |
1606 | 55.6k | reserve_id(0); |
1607 | 55.6k | extras(0).set_is_used(true); |
1608 | 55.6k | units_[0].set_offset(1); |
1609 | 55.6k | units_[0].set_label('\0'); |
1610 | | |
1611 | 55.6k | if (dawg.child(dawg.root()) != 0) { |
1612 | 55.6k | build_from_dawg(dawg, dawg.root(), 0); |
1613 | 55.6k | } |
1614 | | |
1615 | 55.6k | fix_all_blocks(); |
1616 | | |
1617 | 55.6k | extras_.clear(); |
1618 | 55.6k | labels_.clear(); |
1619 | 55.6k | table_.clear(); |
1620 | 55.6k | } |
1621 | | |
1622 | | inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg, |
1623 | 26.9M | id_type dawg_id, id_type dic_id) { |
1624 | 26.9M | id_type dawg_child_id = dawg.child(dawg_id); |
1625 | 26.9M | if (dawg.is_intersection(dawg_child_id)) { |
1626 | 0 | id_type intersection_id = dawg.intersection_id(dawg_child_id); |
1627 | 0 | id_type offset = table_[intersection_id]; |
1628 | 0 | if (offset != 0) { |
1629 | 0 | offset ^= dic_id; |
1630 | 0 | if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) { |
1631 | 0 | if (dawg.is_leaf(dawg_child_id)) { |
1632 | 0 | units_[dic_id].set_has_leaf(true); |
1633 | 0 | } |
1634 | 0 | units_[dic_id].set_offset(offset); |
1635 | 0 | return; |
1636 | 0 | } |
1637 | 0 | } |
1638 | 0 | } |
1639 | | |
1640 | 26.9M | id_type offset = arrange_from_dawg(dawg, dawg_id, dic_id); |
1641 | 26.9M | if (dawg.is_intersection(dawg_child_id)) { |
1642 | 0 | table_[dawg.intersection_id(dawg_child_id)] = offset; |
1643 | 0 | } |
1644 | | |
1645 | 35.1M | do { |
1646 | 35.1M | uchar_type child_label = dawg.label(dawg_child_id); |
1647 | 35.1M | id_type dic_child_id = offset ^ child_label; |
1648 | 35.1M | if (child_label != '\0') { |
1649 | 26.9M | build_from_dawg(dawg, dawg_child_id, dic_child_id); |
1650 | 26.9M | } |
1651 | 35.1M | dawg_child_id = dawg.sibling(dawg_child_id); |
1652 | 35.1M | } while (dawg_child_id != 0); |
1653 | 26.9M | } |
1654 | | |
1655 | | inline id_type DoubleArrayBuilder::arrange_from_dawg(const DawgBuilder &dawg, |
1656 | 26.9M | id_type dawg_id, id_type dic_id) { |
1657 | 26.9M | labels_.resize(0); |
1658 | | |
1659 | 26.9M | id_type dawg_child_id = dawg.child(dawg_id); |
1660 | 62.1M | while (dawg_child_id != 0) { |
1661 | 35.1M | labels_.append(dawg.label(dawg_child_id)); |
1662 | 35.1M | dawg_child_id = dawg.sibling(dawg_child_id); |
1663 | 35.1M | } |
1664 | | |
1665 | 26.9M | id_type offset = find_valid_offset(dic_id); |
1666 | 26.9M | units_[dic_id].set_offset(dic_id ^ offset); |
1667 | | |
1668 | 26.9M | dawg_child_id = dawg.child(dawg_id); |
1669 | 62.1M | for (std::size_t i = 0; i < labels_.size(); ++i) { |
1670 | 35.1M | id_type dic_child_id = offset ^ labels_[i]; |
1671 | 35.1M | reserve_id(dic_child_id); |
1672 | | |
1673 | 35.1M | if (dawg.is_leaf(dawg_child_id)) { |
1674 | 8.28M | units_[dic_id].set_has_leaf(true); |
1675 | 8.28M | units_[dic_child_id].set_value(dawg.value(dawg_child_id)); |
1676 | 26.9M | } else { |
1677 | 26.9M | units_[dic_child_id].set_label(labels_[i]); |
1678 | 26.9M | } |
1679 | | |
1680 | 35.1M | dawg_child_id = dawg.sibling(dawg_child_id); |
1681 | 35.1M | } |
1682 | 26.9M | extras(offset).set_is_used(true); |
1683 | | |
1684 | 26.9M | return offset; |
1685 | 26.9M | } |
1686 | | |
1687 | | template <typename T> |
1688 | 10.5k | void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset) { |
1689 | 10.5k | std::size_t num_units = 1; |
1690 | 31.3k | while (num_units < keyset.num_keys()) { |
1691 | 20.8k | num_units <<= 1; |
1692 | 20.8k | } |
1693 | 10.5k | units_.reserve(num_units); |
1694 | | |
1695 | 10.5k | extras_.reset(new extra_type[NUM_EXTRAS]); |
1696 | | |
1697 | 10.5k | reserve_id(0); |
1698 | 10.5k | extras(0).set_is_used(true); |
1699 | 10.5k | units_[0].set_offset(1); |
1700 | 10.5k | units_[0].set_label('\0'); |
1701 | | |
1702 | 10.5k | if (keyset.num_keys() > 0) { |
1703 | 10.5k | build_from_keyset(keyset, 0, keyset.num_keys(), 0, 0); |
1704 | 10.5k | } |
1705 | | |
1706 | 10.5k | fix_all_blocks(); |
1707 | | |
1708 | 10.5k | extras_.clear(); |
1709 | 10.5k | labels_.clear(); |
1710 | 10.5k | } |
1711 | | |
1712 | | template <typename T> |
1713 | | void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset, |
1714 | 150k | std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { |
1715 | 150k | id_type offset = arrange_from_keyset(keyset, begin, end, depth, dic_id); |
1716 | | |
1717 | 199k | while (begin < end) { |
1718 | 150k | if (keyset.keys(begin, depth) != '\0') { |
1719 | 101k | break; |
1720 | 101k | } |
1721 | 48.8k | ++begin; |
1722 | 48.8k | } |
1723 | 150k | if (begin == end) { |
1724 | 48.8k | return; |
1725 | 48.8k | } |
1726 | | |
1727 | 101k | std::size_t last_begin = begin; |
1728 | 101k | uchar_type last_label = keyset.keys(begin, depth); |
1729 | 230k | while (++begin < end) { |
1730 | 128k | uchar_type label = keyset.keys(begin, depth); |
1731 | 128k | if (label != last_label) { |
1732 | 38.2k | build_from_keyset(keyset, last_begin, begin, |
1733 | 38.2k | depth + 1, offset ^ last_label); |
1734 | 38.2k | last_begin = begin; |
1735 | 38.2k | last_label = keyset.keys(begin, depth); |
1736 | 38.2k | } |
1737 | 128k | } |
1738 | 101k | build_from_keyset(keyset, last_begin, end, depth + 1, offset ^ last_label); |
1739 | 101k | } |
1740 | | |
1741 | | template <typename T> |
1742 | | id_type DoubleArrayBuilder::arrange_from_keyset(const Keyset<T> &keyset, |
1743 | 150k | std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { |
1744 | 150k | labels_.resize(0); |
1745 | | |
1746 | 150k | value_type value = -1; |
1747 | 430k | for (std::size_t i = begin; i < end; ++i) { |
1748 | 279k | uchar_type label = keyset.keys(i, depth); |
1749 | 279k | if (label == '\0') { |
1750 | 48.8k | if (keyset.has_lengths() && depth < keyset.lengths(i)) { |
1751 | 0 | DARTS_THROW("failed to build double-array: " |
1752 | 0 | "invalid null character"); |
1753 | 48.8k | } else if (keyset.values(i) < 0) { |
1754 | 0 | DARTS_THROW("failed to build double-array: negative value"); |
1755 | 0 | } |
1756 | | |
1757 | 48.8k | if (value == -1) { |
1758 | 48.8k | value = keyset.values(i); |
1759 | 48.8k | } |
1760 | 48.8k | if (progress_func_ != NULL) { |
1761 | 0 | progress_func_(i + 1, keyset.num_keys() + 1); |
1762 | 0 | } |
1763 | 48.8k | } |
1764 | | |
1765 | 279k | if (labels_.empty()) { |
1766 | 150k | labels_.append(label); |
1767 | 150k | } else if (label != labels_[labels_.size() - 1]) { |
1768 | 38.2k | if (label < labels_[labels_.size() - 1]) { |
1769 | 0 | DARTS_THROW("failed to build double-array: wrong key order"); |
1770 | 0 | } |
1771 | 38.2k | labels_.append(label); |
1772 | 38.2k | } |
1773 | 279k | } |
1774 | | |
1775 | 150k | id_type offset = find_valid_offset(dic_id); |
1776 | 150k | units_[dic_id].set_offset(dic_id ^ offset); |
1777 | | |
1778 | 339k | for (std::size_t i = 0; i < labels_.size(); ++i) { |
1779 | 189k | id_type dic_child_id = offset ^ labels_[i]; |
1780 | 189k | reserve_id(dic_child_id); |
1781 | 189k | if (labels_[i] == '\0') { |
1782 | 48.8k | units_[dic_id].set_has_leaf(true); |
1783 | 48.8k | units_[dic_child_id].set_value(value); |
1784 | 140k | } else { |
1785 | 140k | units_[dic_child_id].set_label(labels_[i]); |
1786 | 140k | } |
1787 | 189k | } |
1788 | 150k | extras(offset).set_is_used(true); |
1789 | | |
1790 | 150k | return offset; |
1791 | 150k | } |
1792 | | |
1793 | 27.1M | inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const { |
1794 | 27.1M | if (extras_head_ >= units_.size()) { |
1795 | 86 | return units_.size() | (id & LOWER_MASK); |
1796 | 86 | } |
1797 | | |
1798 | 27.1M | id_type unfixed_id = extras_head_; |
1799 | 231M | do { |
1800 | 231M | id_type offset = unfixed_id ^ labels_[0]; |
1801 | 231M | if (is_valid_offset(id, offset)) { |
1802 | 26.9M | return offset; |
1803 | 26.9M | } |
1804 | 204M | unfixed_id = extras(unfixed_id).next(); |
1805 | 204M | } while (unfixed_id != extras_head_); |
1806 | | |
1807 | 128k | return units_.size() | (id & LOWER_MASK); |
1808 | 27.1M | } |
1809 | | |
1810 | | inline bool DoubleArrayBuilder::is_valid_offset(id_type id, |
1811 | 231M | id_type offset) const { |
1812 | 231M | if (extras(offset).is_used()) { |
1813 | 175M | return false; |
1814 | 175M | } |
1815 | | |
1816 | 55.4M | id_type rel_offset = id ^ offset; |
1817 | 55.4M | if ((rel_offset & LOWER_MASK) && (rel_offset & UPPER_MASK)) { |
1818 | 0 | return false; |
1819 | 0 | } |
1820 | | |
1821 | 81.0M | for (std::size_t i = 1; i < labels_.size(); ++i) { |
1822 | 54.0M | if (extras(offset ^ labels_[i]).is_fixed()) { |
1823 | 28.5M | return false; |
1824 | 28.5M | } |
1825 | 54.0M | } |
1826 | | |
1827 | 26.9M | return true; |
1828 | 55.4M | } |
1829 | | |
1830 | 49.8M | inline void DoubleArrayBuilder::reserve_id(id_type id) { |
1831 | 49.8M | if (id >= units_.size()) { |
1832 | 194k | expand_units(); |
1833 | 194k | } |
1834 | | |
1835 | 49.8M | if (id == extras_head_) { |
1836 | 20.3M | extras_head_ = extras(id).next(); |
1837 | 20.3M | if (extras_head_ == id) { |
1838 | 66.3k | extras_head_ = units_.size(); |
1839 | 66.3k | } |
1840 | 20.3M | } |
1841 | 49.8M | extras(extras(id).prev()).set_next(extras(id).next()); |
1842 | 49.8M | extras(extras(id).next()).set_prev(extras(id).prev()); |
1843 | 49.8M | extras(id).set_is_fixed(true); |
1844 | 49.8M | } |
1845 | | |
1846 | 194k | inline void DoubleArrayBuilder::expand_units() { |
1847 | 194k | id_type src_num_units = units_.size(); |
1848 | 194k | id_type src_num_blocks = num_blocks(); |
1849 | | |
1850 | 194k | id_type dest_num_units = src_num_units + BLOCK_SIZE; |
1851 | 194k | id_type dest_num_blocks = src_num_blocks + 1; |
1852 | | |
1853 | 194k | if (dest_num_blocks > NUM_EXTRA_BLOCKS) { |
1854 | 9.29k | fix_block(src_num_blocks - NUM_EXTRA_BLOCKS); |
1855 | 9.29k | } |
1856 | | |
1857 | 194k | units_.resize(dest_num_units); |
1858 | | |
1859 | 194k | if (dest_num_blocks > NUM_EXTRA_BLOCKS) { |
1860 | 2.38M | for (std::size_t id = src_num_units; id < dest_num_units; ++id) { |
1861 | 2.37M | extras(id).set_is_used(false); |
1862 | 2.37M | extras(id).set_is_fixed(false); |
1863 | 2.37M | } |
1864 | 9.29k | } |
1865 | | |
1866 | 49.8M | for (id_type i = src_num_units + 1; i < dest_num_units; ++i) { |
1867 | 49.6M | extras(i - 1).set_next(i); |
1868 | 49.6M | extras(i).set_prev(i - 1); |
1869 | 49.6M | } |
1870 | | |
1871 | 194k | extras(src_num_units).set_prev(dest_num_units - 1); |
1872 | 194k | extras(dest_num_units - 1).set_next(src_num_units); |
1873 | | |
1874 | 194k | extras(src_num_units).set_prev(extras(extras_head_).prev()); |
1875 | 194k | extras(dest_num_units - 1).set_next(extras_head_); |
1876 | | |
1877 | 194k | extras(extras(extras_head_).prev()).set_next(src_num_units); |
1878 | 194k | extras(extras_head_).set_prev(dest_num_units - 1); |
1879 | 194k | } |
1880 | | |
1881 | 66.2k | inline void DoubleArrayBuilder::fix_all_blocks() { |
1882 | 66.2k | id_type begin = 0; |
1883 | 66.2k | if (num_blocks() > NUM_EXTRA_BLOCKS) { |
1884 | 735 | begin = num_blocks() - NUM_EXTRA_BLOCKS; |
1885 | 735 | } |
1886 | 66.2k | id_type end = num_blocks(); |
1887 | | |
1888 | 251k | for (id_type block_id = begin; block_id != end; ++block_id) { |
1889 | 185k | fix_block(block_id); |
1890 | 185k | } |
1891 | 66.2k | } |
1892 | | |
1893 | 194k | inline void DoubleArrayBuilder::fix_block(id_type block_id) { |
1894 | 194k | id_type begin = block_id * BLOCK_SIZE; |
1895 | 194k | id_type end = begin + BLOCK_SIZE; |
1896 | | |
1897 | 194k | id_type unused_offset = 0; |
1898 | 2.43M | for (id_type offset = begin; offset != end; ++offset) { |
1899 | 2.43M | if (!extras(offset).is_used()) { |
1900 | 194k | unused_offset = offset; |
1901 | 194k | break; |
1902 | 194k | } |
1903 | 2.43M | } |
1904 | | |
1905 | 50.0M | for (id_type id = begin; id != end; ++id) { |
1906 | 49.8M | if (!extras(id).is_fixed()) { |
1907 | 14.4M | reserve_id(id); |
1908 | 14.4M | units_[id].set_label(static_cast<uchar_type>(id ^ unused_offset)); |
1909 | 14.4M | } |
1910 | 49.8M | } |
1911 | 194k | } |
1912 | | |
1913 | | } // namespace Details |
1914 | | |
1915 | | // |
1916 | | // Member function build() of DoubleArrayImpl. |
1917 | | // |
1918 | | |
1919 | | template <typename A, typename B, typename T, typename C> |
1920 | | int DoubleArrayImpl<A, B, T, C>::build(std::size_t num_keys, |
1921 | | const key_type * const *keys, const std::size_t *lengths, |
1922 | 66.2k | const value_type *values, Details::progress_func_type progress_func) { |
1923 | 66.2k | Details::Keyset<value_type> keyset(num_keys, keys, lengths, values); |
1924 | | |
1925 | 66.2k | Details::DoubleArrayBuilder builder(progress_func); |
1926 | 66.2k | builder.build(keyset); |
1927 | | |
1928 | 66.2k | std::size_t size = 0; |
1929 | 66.2k | unit_type *buf = NULL; |
1930 | 66.2k | builder.copy(&size, &buf); |
1931 | | |
1932 | 66.2k | clear(); |
1933 | | |
1934 | 66.2k | size_ = size; |
1935 | 66.2k | array_ = buf; |
1936 | 66.2k | buf_ = buf; |
1937 | | |
1938 | 66.2k | if (progress_func != NULL) { |
1939 | 0 | progress_func(num_keys + 1, num_keys + 1); |
1940 | 0 | } |
1941 | | |
1942 | 66.2k | return 0; |
1943 | 66.2k | } |
1944 | | |
1945 | | } // namespace Darts |
1946 | | |
1947 | | #undef DARTS_INT_TO_STR |
1948 | | #undef DARTS_LINE_TO_STR |
1949 | | #undef DARTS_LINE_STR |
1950 | | #undef DARTS_THROW |
1951 | | |
1952 | | #endif // DARTS_H_ |