/src/sentencepiece/third_party/darts_clone/darts.h
Line | Count | Source |
1 | | #ifndef DARTS_H_ |
2 | | #define DARTS_H_ |
3 | | |
4 | | #include <cstdio> |
5 | | #include <cstring> |
6 | | #include <exception> |
7 | | #include <new> |
8 | | |
9 | | #define DARTS_VERSION "0.32" |
10 | | |
11 | | // DARTS_THROW() throws a <Darts::Exception> whose message starts with the |
12 | | // file name and the line number. For example, DARTS_THROW("error message") at |
13 | | // line 123 of "darts.h" throws a <Darts::Exception> which has a pointer to |
14 | | // "darts.h:123: exception: error message". The message is available by using |
15 | | // what() as well as that of <std::exception>. |
16 | | #define DARTS_INT_TO_STR(value) #value |
17 | | #define DARTS_LINE_TO_STR(line) DARTS_INT_TO_STR(line) |
18 | | #define DARTS_LINE_STR DARTS_LINE_TO_STR(__LINE__) |
19 | 0 | #define DARTS_THROW(msg) throw Darts::Details::Exception( \ |
20 | 0 | __FILE__ ":" DARTS_LINE_STR ": exception: " msg) |
21 | | |
22 | | namespace Darts { |
23 | | |
24 | | // The following namespace hides the internal types and classes. |
25 | | namespace Details { |
26 | | |
27 | | // This header assumes that <int> and <unsigned int> are 32-bit integer types. |
28 | | // |
29 | | // Darts-clone keeps values associated with keys. The type of the values is |
30 | | // <value_type>. Note that the values must be positive integers because the |
31 | | // most significant bit (MSB) of each value is used to represent whether the |
32 | | // corresponding unit is a leaf or not. Also, the keys are represented by |
33 | | // sequences of <char_type>s. <uchar_type> is the unsigned type of <char_type>. |
34 | | typedef char char_type; |
35 | | typedef unsigned char uchar_type; |
36 | | typedef int value_type; |
37 | | |
38 | | // The main structure of Darts-clone is an array of <DoubleArrayUnit>s, and the |
39 | | // unit type is actually a wrapper of <id_type>. |
40 | | typedef unsigned int id_type; |
41 | | |
42 | | // <progress_func_type> is the type of callback functions for reporting the |
43 | | // progress of building a dictionary. See also build() of <DoubleArray>. |
44 | | // The 1st argument receives the progress value and the 2nd argument receives |
45 | | // the maximum progress value. A usage example is to show the progress |
46 | | // percentage, 100.0 * (the 1st argument) / (the 2nd argument). |
47 | | typedef int (*progress_func_type)(std::size_t, std::size_t); |
48 | | |
49 | | // <DoubleArrayUnit> is the type of double-array units and it is a wrapper of |
50 | | // <id_type> in practice. |
51 | | class DoubleArrayUnit { |
52 | | public: |
53 | 44.5M | DoubleArrayUnit() : unit_() {} |
54 | | |
55 | | // has_leaf() returns whether a leaf unit is immediately derived from the |
56 | | // unit (true) or not (false). |
57 | 652M | bool has_leaf() const { |
58 | 652M | return ((unit_ >> 8) & 1) == 1; |
59 | 652M | } |
60 | | // value() returns the value stored in the unit, and thus value() is |
61 | | // available when and only when the unit is a leaf unit. |
62 | 498M | value_type value() const { |
63 | 498M | return static_cast<value_type>(unit_ & ((1U << 31) - 1)); |
64 | 498M | } |
65 | | |
66 | | // label() returns the label associted with the unit. Note that a leaf unit |
67 | | // always returns an invalid label. For this feature, leaf unit's label() |
68 | | // returns an <id_type> that has the MSB of 1. |
69 | 1.60G | id_type label() const { |
70 | 1.60G | return unit_ & ((1U << 31) | 0xFF); |
71 | 1.60G | } |
72 | | // offset() returns the offset from the unit to its derived units. |
73 | 1.37G | id_type offset() const { |
74 | 1.37G | return (unit_ >> 10) << ((unit_ & (1U << 9)) >> 6); |
75 | 1.37G | } |
76 | | |
77 | | private: |
78 | | id_type unit_; |
79 | | |
80 | | // Copyable. |
81 | | }; |
82 | | |
83 | | // Darts-clone throws an <Exception> for memory allocation failure, invalid |
84 | | // arguments or a too large offset. The last case means that there are too many |
85 | | // keys in the given set of keys. Note that the `msg' of <Exception> must be a |
86 | | // constant or static string because an <Exception> keeps only a pointer to |
87 | | // that string. |
88 | | class Exception : public std::exception { |
89 | | public: |
90 | 0 | explicit Exception(const char *msg = NULL) throw() : msg_(msg) {} |
91 | 0 | Exception(const Exception &rhs) throw() : msg_(rhs.msg_) {} |
92 | 0 | virtual ~Exception() throw() {} |
93 | | |
94 | | // <Exception> overrides what() of <std::exception>. |
95 | 0 | virtual const char *what() const throw() { |
96 | 0 | return (msg_ != NULL) ? msg_ : ""; |
97 | 0 | } |
98 | | |
99 | | private: |
100 | | const char *msg_; |
101 | | |
102 | | // Disallows operator=. |
103 | | Exception &operator=(const Exception &); |
104 | | }; |
105 | | |
106 | | } // namespace Details |
107 | | |
108 | | // <DoubleArrayImpl> is the interface of Darts-clone. Note that other |
109 | | // classes should not be accessed from outside. |
110 | | // |
111 | | // <DoubleArrayImpl> has 4 template arguments but only the 3rd one is used as |
112 | | // the type of values. Note that the given <T> is used only from outside, and |
113 | | // the internal value type is not changed from <Darts::Details::value_type>. |
114 | | // In build(), given values are casted from <T> to <Darts::Details::value_type> |
115 | | // by using static_cast. On the other hand, values are casted from |
116 | | // <Darts::Details::value_type> to <T> in searching dictionaries. |
117 | | template <typename, typename, typename T, typename> |
118 | | class DoubleArrayImpl { |
119 | | public: |
120 | | // Even if this <value_type> is changed, the internal value type is still |
121 | | // <Darts::Details::value_type>. Other types, such as 64-bit integer types |
122 | | // and floating-point number types, should not be used. |
123 | | typedef T value_type; |
124 | | // A key is reprenseted by a sequence of <key_type>s. For example, |
125 | | // exactMatchSearch() takes a <const key_type *>. |
126 | | typedef Details::char_type key_type; |
127 | | // In searching dictionaries, the values associated with the matched keys are |
128 | | // stored into or returned as <result_type>s. |
129 | | typedef value_type result_type; |
130 | | |
131 | | // <result_pair_type> enables applications to get the lengths of the matched |
132 | | // keys in addition to the values. |
133 | | struct result_pair_type { |
134 | | value_type value; |
135 | | std::size_t length; |
136 | | }; |
137 | | |
138 | | // The constructor initializes member variables with 0 and NULLs. |
139 | 74.7k | DoubleArrayImpl() : size_(0), array_(NULL), buf_(NULL) {} |
140 | | // The destructor frees memory allocated for units and then initializes |
141 | | // member variables with 0 and NULLs. |
142 | 74.7k | virtual ~DoubleArrayImpl() { |
143 | 74.7k | clear(); |
144 | 74.7k | } |
145 | | |
146 | | // <DoubleArrayImpl> has 2 kinds of set_result()s. The 1st set_result() is to |
147 | | // set a value to a <value_type>. The 2nd set_result() is to set a value and |
148 | | // a length to a <result_pair_type>. By using set_result()s, search methods |
149 | | // can return the 2 kinds of results in the same way. |
150 | | // Why the set_result()s are non-static? It is for compatibility. |
151 | | // |
152 | | // The 1st set_result() takes a length as the 3rd argument but it is not |
153 | | // used. If a compiler does a good job, codes for getting the length may be |
154 | | // removed. |
155 | 5.16M | void set_result(value_type *result, value_type value, std::size_t) const { |
156 | 5.16M | *result = value; |
157 | 5.16M | } |
158 | | // The 2nd set_result() uses both `value' and `length'. |
159 | | void set_result(result_pair_type *result, |
160 | 234M | value_type value, std::size_t length) const { |
161 | 234M | result->value = value; |
162 | 234M | result->length = length; |
163 | 234M | } |
164 | | |
165 | | // set_array() calls clear() in order to free memory allocated to the old |
166 | | // array and then sets a new array. This function is useful to set a memory- |
167 | | // mapped array. Note that the array set by set_array() is not freed in |
168 | | // clear() and the destructor of <DoubleArrayImpl>. |
169 | | // set_array() can also set the size of the new array but the size is not |
170 | | // used in search methods. So it works well even if the 2nd argument is 0 or |
171 | | // omitted. Remember that size() and total_size() returns 0 in such a case. |
172 | 17.0k | void set_array(const void *ptr, std::size_t size = 0) { |
173 | 17.0k | clear(); |
174 | 17.0k | array_ = static_cast<const unit_type *>(ptr); |
175 | 17.0k | size_ = size; |
176 | 17.0k | } |
177 | 0 | void copy_array(const void* ptr, std::size_t num_bytes) { |
178 | 0 | clear(); |
179 | 0 | const std::size_t extra_padding = num_bytes % unit_size() == 0 ? 0 : 1; |
180 | 0 | size_ = num_bytes / unit_size() + extra_padding; |
181 | 0 | buf_ = new unit_type[size_]; |
182 | 0 | std::memcpy(buf_, ptr, num_bytes); |
183 | 0 | array_ = buf_; |
184 | 0 | } |
185 | | // array() returns a pointer to the array of units. |
186 | 0 | const void *array() const { |
187 | 0 | return array_; |
188 | 0 | } |
189 | | |
190 | | // clear() frees memory allocated to units and then initializes member |
191 | | // variables with 0 and NULLs. Note that clear() does not free memory if the |
192 | | // array of units was set by set_array(). In such a case, `array_' is not |
193 | | // NULL and `buf_' is NULL. |
194 | 149k | void clear() { |
195 | 149k | size_ = 0; |
196 | 149k | array_ = NULL; |
197 | 149k | if (buf_ != NULL) { |
198 | 57.7k | delete[] buf_; |
199 | 57.7k | buf_ = NULL; |
200 | 57.7k | } |
201 | 149k | } |
202 | | |
203 | | // unit_size() returns the size of each unit. The size must be 4 bytes. |
204 | 17.0k | std::size_t unit_size() const { |
205 | 17.0k | return sizeof(unit_type); |
206 | 17.0k | } |
207 | | // size() returns the number of units. It can be 0 if set_array() is used. |
208 | 0 | std::size_t size() const { |
209 | 0 | return size_; |
210 | 0 | } |
211 | | // total_size() returns the number of bytes allocated to the array of units. |
212 | | // It can be 0 if set_array() is used. |
213 | | std::size_t total_size() const { |
214 | | return unit_size() * size(); |
215 | | } |
216 | | // nonzero_size() exists for compatibility. It always returns the number of |
217 | | // units because it takes long time to count the number of non-zero units. |
218 | | std::size_t nonzero_size() const { |
219 | | return size(); |
220 | | } |
221 | | |
222 | | // build() constructs a dictionary from given key-value pairs. If `lengths' |
223 | | // is NULL, `keys' is handled as an array of zero-terminated strings. If |
224 | | // `values' is NULL, the index in `keys' is associated with each key, i.e. |
225 | | // the ith key has (i - 1) as its value. |
226 | | // Note that the key-value pairs must be arranged in key order and the values |
227 | | // must not be negative. Also, if there are duplicate keys, only the first |
228 | | // pair will be stored in the resultant dictionary. |
229 | | // `progress_func' is a pointer to a callback function. If it is not NULL, |
230 | | // it will be called in build() so that the caller can check the progress of |
231 | | // dictionary construction. For details, please see the definition of |
232 | | // <Darts::Details::progress_func_type>. |
233 | | // The return value of build() is 0, and it indicates the success of the |
234 | | // operation. Otherwise, build() throws a <Darts::Exception>, which is a |
235 | | // derived class of <std::exception>. |
236 | | // build() uses another construction algorithm if `values' is not NULL. In |
237 | | // this case, Darts-clone uses a Directed Acyclic Word Graph (DAWG) instead |
238 | | // of a trie because a DAWG is likely to be more compact than a trie. |
239 | | int build(std::size_t num_keys, const key_type * const *keys, |
240 | | const std::size_t *lengths = NULL, const value_type *values = NULL, |
241 | | Details::progress_func_type progress_func = NULL); |
242 | | |
243 | | // open() reads an array of units from the specified file. And if it goes |
244 | | // well, the old array will be freed and replaced with the new array read |
245 | | // from the file. `offset' specifies the number of bytes to be skipped before |
246 | | // reading an array. `size' specifies the number of bytes to be read from the |
247 | | // file. If the `size' is 0, the whole file will be read. |
248 | | // open() returns 0 iff the operation succeeds. Otherwise, it returns a |
249 | | // non-zero value or throws a <Darts::Exception>. The exception is thrown |
250 | | // when and only when a memory allocation fails. |
251 | | int open(const char *file_name, const char *mode = "rb", |
252 | | std::size_t offset = 0, std::size_t size = 0); |
253 | | // save() writes the array of units into the specified file. `offset' |
254 | | // specifies the number of bytes to be skipped before writing the array. |
255 | | // open() returns 0 iff the operation succeeds. Otherwise, it returns a |
256 | | // non-zero value. |
257 | | int save(const char *file_name, const char *mode = "wb", |
258 | | std::size_t offset = 0) const; |
259 | | |
260 | | // The 1st exactMatchSearch() tests whether the given key exists or not, and |
261 | | // if it exists, its value and length are set to `result'. Otherwise, the |
262 | | // value and the length of `result' are set to -1 and 0 respectively. |
263 | | // Note that if `length' is 0, `key' is handled as a zero-terminated string. |
264 | | // `node_pos' specifies the start position of matching. This argument enables |
265 | | // the combination of exactMatchSearch() and traverse(). For example, if you |
266 | | // want to test "xyzA", "xyzBC", and "xyzDE", you can use traverse() to get |
267 | | // the node position corresponding to "xyz" and then you can use |
268 | | // exactMatchSearch() to test "A", "BC", and "DE" from that position. |
269 | | // Note that the length of `result' indicates the length from the `node_pos'. |
270 | | // In the above example, the lengths are { 1, 2, 2 }, not { 4, 5, 5 }. |
271 | | template <class U> |
272 | | void exactMatchSearch(const key_type *key, U &result, |
273 | 2.74M | std::size_t length = 0, std::size_t node_pos = 0) const { |
274 | 2.74M | result = exactMatchSearch<U>(key, length, node_pos); |
275 | 2.74M | } |
276 | | // The 2nd exactMatchSearch() returns a result instead of updating the 2nd |
277 | | // argument. So, the following exactMatchSearch() has only 3 arguments. |
278 | | template <class U> |
279 | | inline U exactMatchSearch(const key_type *key, std::size_t length = 0, |
280 | | std::size_t node_pos = 0) const; |
281 | | |
282 | | // commonPrefixSearch() searches for keys which match a prefix of the given |
283 | | // string. If `length' is 0, `key' is handled as a zero-terminated string. |
284 | | // The values and the lengths of at most `max_num_results' matched keys are |
285 | | // stored in `results'. commonPrefixSearch() returns the number of matched |
286 | | // keys. Note that the return value can be larger than `max_num_results' if |
287 | | // there are more than `max_num_results' matches. If you want to get all the |
288 | | // results, allocate more spaces and call commonPrefixSearch() again. |
289 | | // `node_pos' works as well as in exactMatchSearch(). |
290 | | template <class U> |
291 | | inline std::size_t commonPrefixSearch(const key_type *key, U *results, |
292 | | std::size_t max_num_results, std::size_t length = 0, |
293 | | std::size_t node_pos = 0) const; |
294 | | |
295 | | // In Darts-clone, a dictionary is a deterministic finite-state automaton |
296 | | // (DFA) and traverse() tests transitions on the DFA. The initial state is |
297 | | // `node_pos' and traverse() chooses transitions labeled key[key_pos], |
298 | | // key[key_pos + 1], ... in order. If there is not a transition labeled |
299 | | // key[key_pos + i], traverse() terminates the transitions at that state and |
300 | | // returns -2. Otherwise, traverse() ends without a termination and returns |
301 | | // -1 or a nonnegative value, -1 indicates that the final state was not an |
302 | | // accept state. When a nonnegative value is returned, it is the value |
303 | | // associated with the final accept state. That is, traverse() returns the |
304 | | // value associated with the given key if it exists. Note that traverse() |
305 | | // updates `node_pos' and `key_pos' after each transition. |
306 | | inline value_type traverse(const key_type *key, std::size_t &node_pos, |
307 | | std::size_t &key_pos, std::size_t length = 0) const; |
308 | | |
309 | | bool validate(value_type max_value_limit = -1) const; |
310 | | |
311 | | private: |
312 | | typedef Details::uchar_type uchar_type; |
313 | | typedef Details::id_type id_type; |
314 | | typedef Details::DoubleArrayUnit unit_type; |
315 | | |
316 | | std::size_t size_; |
317 | | const unit_type *array_; |
318 | | unit_type *buf_; |
319 | | |
320 | | // Disallows copy and assignment. |
321 | | DoubleArrayImpl(const DoubleArrayImpl &); |
322 | | DoubleArrayImpl &operator=(const DoubleArrayImpl &); |
323 | | }; |
324 | | |
325 | | // <DoubleArray> is the typical instance of <DoubleArrayImpl>. It uses <int> |
326 | | // as the type of values and it is suitable for most cases. |
327 | | typedef DoubleArrayImpl<void, void, int, void> DoubleArray; |
328 | | |
329 | | // The interface section ends here. For using Darts-clone, there is no need |
330 | | // to read the remaining section, which gives the implementation of |
331 | | // Darts-clone. |
332 | | |
333 | | // |
334 | | // Member functions of DoubleArrayImpl (except build()). |
335 | | // |
336 | | |
337 | | template <typename A, typename B, typename T, typename C> |
338 | | int DoubleArrayImpl<A, B, T, C>::open(const char *file_name, |
339 | | const char *mode, std::size_t offset, std::size_t size) { |
340 | | #ifdef _MSC_VER |
341 | | std::FILE *file; |
342 | | if (::fopen_s(&file, file_name, mode) != 0) { |
343 | | return -1; |
344 | | } |
345 | | #else |
346 | | std::FILE *file = std::fopen(file_name, mode); |
347 | | if (file == NULL) { |
348 | | return -1; |
349 | | } |
350 | | #endif |
351 | | |
352 | | if (size == 0) { |
353 | | if (std::fseek(file, 0, SEEK_END) != 0) { |
354 | | std::fclose(file); |
355 | | return -1; |
356 | | } |
357 | | size = std::ftell(file) - offset; |
358 | | } |
359 | | |
360 | | size /= unit_size(); |
361 | | if (size < 256 || (size & 0xFF) != 0) { |
362 | | std::fclose(file); |
363 | | return -1; |
364 | | } |
365 | | |
366 | | if (std::fseek(file, offset, SEEK_SET) != 0) { |
367 | | std::fclose(file); |
368 | | return -1; |
369 | | } |
370 | | |
371 | | unit_type units[256]; |
372 | | if (std::fread(units, unit_size(), 256, file) != 256) { |
373 | | std::fclose(file); |
374 | | return -1; |
375 | | } |
376 | | |
377 | | if (units[0].label() != '\0' || units[0].has_leaf() || |
378 | | units[0].offset() == 0 || units[0].offset() >= 512) { |
379 | | std::fclose(file); |
380 | | return -1; |
381 | | } |
382 | | for (id_type i = 1; i < 256; ++i) { |
383 | | if (units[i].label() <= 0xFF && units[i].offset() >= size) { |
384 | | std::fclose(file); |
385 | | return -1; |
386 | | } |
387 | | } |
388 | | |
389 | | unit_type *buf; |
390 | | try { |
391 | | buf = new unit_type[size]; |
392 | | for (id_type i = 0; i < 256; ++i) { |
393 | | buf[i] = units[i]; |
394 | | } |
395 | | } catch (const std::bad_alloc &) { |
396 | | std::fclose(file); |
397 | | DARTS_THROW("failed to open double-array: std::bad_alloc"); |
398 | | } |
399 | | |
400 | | if (size > 256) { |
401 | | if (std::fread(buf + 256, unit_size(), size - 256, file) != size - 256) { |
402 | | std::fclose(file); |
403 | | delete[] buf; |
404 | | return -1; |
405 | | } |
406 | | } |
407 | | std::fclose(file); |
408 | | |
409 | | clear(); |
410 | | |
411 | | size_ = size; |
412 | | array_ = buf; |
413 | | buf_ = buf; |
414 | | return 0; |
415 | | } |
416 | | |
417 | | template <typename A, typename B, typename T, typename C> |
418 | | int DoubleArrayImpl<A, B, T, C>::save(const char *file_name, |
419 | | const char *mode, std::size_t) const { |
420 | | if (size() == 0) { |
421 | | return -1; |
422 | | } |
423 | | |
424 | | #ifdef _MSC_VER |
425 | | std::FILE *file; |
426 | | if (::fopen_s(&file, file_name, mode) != 0) { |
427 | | return -1; |
428 | | } |
429 | | #else |
430 | | std::FILE *file = std::fopen(file_name, mode); |
431 | | if (file == NULL) { |
432 | | return -1; |
433 | | } |
434 | | #endif |
435 | | |
436 | | if (std::fwrite(array_, unit_size(), size(), file) != size()) { |
437 | | std::fclose(file); |
438 | | return -1; |
439 | | } |
440 | | std::fclose(file); |
441 | | return 0; |
442 | | } |
443 | | |
444 | | template <typename A, typename B, typename T, typename C> |
445 | 17.0k | bool DoubleArrayImpl<A, B, T, C>::validate(value_type max_value_limit) const { |
446 | 17.0k | if (size_ == 0 || array_ == NULL) return false; |
447 | 17.0k | if (array_[0].label() != '\0' || array_[0].has_leaf() || |
448 | 17.0k | array_[0].offset() == 0 || ((0 ^ array_[0].offset()) | 0xFF) >= size_) { |
449 | 0 | return false; |
450 | 0 | } |
451 | 765M | for (std::size_t i = 1; i < size_; ++i) { |
452 | 765M | if (array_[i].label() <= 0xFF) { |
453 | 510M | if (((i ^ array_[i].offset()) | 0xFF) >= size_) { |
454 | 0 | return false; |
455 | 0 | } |
456 | 510M | } else { |
457 | 255M | if (max_value_limit >= 0) { |
458 | 255M | if (array_[i].value() >= max_value_limit) { |
459 | 0 | return false; |
460 | 0 | } |
461 | 255M | } |
462 | 255M | } |
463 | 765M | } |
464 | 17.0k | return true; |
465 | 17.0k | } |
466 | | |
467 | | template <typename A, typename B, typename T, typename C> |
468 | | template <typename U> |
469 | | inline U DoubleArrayImpl<A, B, T, C>::exactMatchSearch(const key_type *key, |
470 | 2.74M | std::size_t length, std::size_t node_pos) const { |
471 | 2.74M | U result; |
472 | 2.74M | set_result(&result, static_cast<value_type>(-1), 0); |
473 | | |
474 | 2.74M | unit_type unit = array_[node_pos]; |
475 | 2.74M | if (length != 0) { |
476 | 7.08M | for (std::size_t i = 0; i < length; ++i) { |
477 | 4.66M | node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[i]); |
478 | 4.66M | unit = array_[node_pos]; |
479 | 4.66M | if (unit.label() != static_cast<uchar_type>(key[i])) { |
480 | 327k | return result; |
481 | 327k | } |
482 | 4.66M | } |
483 | 2.74M | } else { |
484 | 375 | for ( ; key[length] != '\0'; ++length) { |
485 | 2 | node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[length]); |
486 | 2 | unit = array_[node_pos]; |
487 | 2 | if (unit.label() != static_cast<uchar_type>(key[length])) { |
488 | 2 | return result; |
489 | 2 | } |
490 | 2 | } |
491 | 375 | } |
492 | | |
493 | 2.41M | if (!unit.has_leaf()) { |
494 | 871 | return result; |
495 | 871 | } |
496 | 2.41M | unit = array_[node_pos ^ unit.offset()]; |
497 | 2.41M | set_result(&result, static_cast<value_type>(unit.value()), length); |
498 | 2.41M | return result; |
499 | 2.41M | } |
500 | | |
501 | | template <typename A, typename B, typename T, typename C> |
502 | | template <typename U> |
503 | | inline std::size_t DoubleArrayImpl<A, B, T, C>::commonPrefixSearch( |
504 | | const key_type *key, U *results, std::size_t max_num_results, |
505 | 155M | std::size_t length, std::size_t node_pos) const { |
506 | 155M | std::size_t num_results = 0; |
507 | | |
508 | 155M | unit_type unit = array_[node_pos]; |
509 | 155M | node_pos ^= unit.offset(); |
510 | 155M | if (length != 0) { |
511 | 793M | for (std::size_t i = 0; i < length; ++i) { |
512 | 774M | node_pos ^= static_cast<uchar_type>(key[i]); |
513 | 774M | unit = array_[node_pos]; |
514 | 774M | if (unit.label() != static_cast<uchar_type>(key[i])) { |
515 | 135M | return num_results; |
516 | 135M | } |
517 | | |
518 | 638M | node_pos ^= unit.offset(); |
519 | 638M | if (unit.has_leaf()) { |
520 | 234M | if (num_results < max_num_results) { |
521 | 234M | set_result(&results[num_results], static_cast<value_type>( |
522 | 234M | array_[node_pos].value()), i + 1); |
523 | 234M | } |
524 | 234M | ++num_results; |
525 | 234M | } |
526 | 638M | } |
527 | 155M | } else { |
528 | 0 | for ( ; key[length] != '\0'; ++length) { |
529 | 0 | node_pos ^= static_cast<uchar_type>(key[length]); |
530 | 0 | unit = array_[node_pos]; |
531 | 0 | if (unit.label() != static_cast<uchar_type>(key[length])) { |
532 | 0 | return num_results; |
533 | 0 | } |
534 | | |
535 | 0 | node_pos ^= unit.offset(); |
536 | 0 | if (unit.has_leaf()) { |
537 | 0 | if (num_results < max_num_results) { |
538 | 0 | set_result(&results[num_results], static_cast<value_type>( |
539 | 0 | array_[node_pos].value()), length + 1); |
540 | 0 | } |
541 | 0 | ++num_results; |
542 | 0 | } |
543 | 0 | } |
544 | 0 | } |
545 | | |
546 | 19.4M | return num_results; |
547 | 155M | } |
548 | | |
549 | | template <typename A, typename B, typename T, typename C> |
550 | | inline typename DoubleArrayImpl<A, B, T, C>::value_type |
551 | | DoubleArrayImpl<A, B, T, C>::traverse(const key_type *key, |
552 | 61.2M | std::size_t &node_pos, std::size_t &key_pos, std::size_t length) const { |
553 | 61.2M | id_type id = static_cast<id_type>(node_pos); |
554 | 61.2M | unit_type unit = array_[id]; |
555 | | |
556 | 61.2M | if (length != 0) { |
557 | 73.1M | for ( ; key_pos < length; ++key_pos) { |
558 | 61.2M | id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]); |
559 | 61.2M | unit = array_[id]; |
560 | 61.2M | if (unit.label() != static_cast<uchar_type>(key[key_pos])) { |
561 | 49.3M | return static_cast<value_type>(-2); |
562 | 49.3M | } |
563 | 11.9M | node_pos = id; |
564 | 11.9M | } |
565 | 61.2M | } else { |
566 | 0 | for ( ; key[key_pos] != '\0'; ++key_pos) { |
567 | 0 | id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]); |
568 | 0 | unit = array_[id]; |
569 | 0 | if (unit.label() != static_cast<uchar_type>(key[key_pos])) { |
570 | 0 | return static_cast<value_type>(-2); |
571 | 0 | } |
572 | 0 | node_pos = id; |
573 | 0 | } |
574 | 0 | } |
575 | | |
576 | 11.9M | if (!unit.has_leaf()) { |
577 | 5.79M | return static_cast<value_type>(-1); |
578 | 5.79M | } |
579 | 6.12M | unit = array_[id ^ unit.offset()]; |
580 | 6.12M | return static_cast<value_type>(unit.value()); |
581 | 11.9M | } |
582 | | |
583 | | namespace Details { |
584 | | |
585 | | // |
586 | | // Memory management of array. |
587 | | // |
588 | | |
589 | | template <typename T> |
590 | | class AutoArray { |
591 | | public: |
592 | 5.96M | explicit AutoArray(T *array = NULL) : array_(array) {}Darts::Details::AutoArray<char>::AutoArray(char*) Line | Count | Source | 592 | 5.64M | explicit AutoArray(T *array = NULL) : array_(array) {} |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::AutoArray(Darts::Details::DoubleArrayBuilderExtraUnit*) Line | Count | Source | 592 | 115k | explicit AutoArray(T *array = NULL) : array_(array) {} |
Darts::Details::AutoArray<unsigned int>::AutoArray(unsigned int*) Line | Count | Source | 592 | 203k | explicit AutoArray(T *array = NULL) : array_(array) {} |
|
593 | 5.96M | ~AutoArray() { |
594 | 5.96M | clear(); |
595 | 5.96M | } Darts::Details::AutoArray<char>::~AutoArray() Line | Count | Source | 593 | 5.64M | ~AutoArray() { | 594 | 5.64M | clear(); | 595 | 5.64M | } |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::~AutoArray() Line | Count | Source | 593 | 115k | ~AutoArray() { | 594 | 115k | clear(); | 595 | 115k | } |
Darts::Details::AutoArray<unsigned int>::~AutoArray() Line | Count | Source | 593 | 203k | ~AutoArray() { | 594 | 203k | clear(); | 595 | 203k | } |
|
596 | | |
597 | 1.41G | const T &operator[](std::size_t id) const { |
598 | 1.41G | return array_[id]; |
599 | 1.41G | } Darts::Details::AutoArray<char>::operator[](unsigned long) const Line | Count | Source | 597 | 974M | const T &operator[](std::size_t id) const { | 598 | 974M | return array_[id]; | 599 | 974M | } |
Unexecuted instantiation: Darts::Details::AutoArray<unsigned int>::operator[](unsigned long) const Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::operator[](unsigned long) const Line | Count | Source | 597 | 436M | const T &operator[](std::size_t id) const { | 598 | 436M | return array_[id]; | 599 | 436M | } |
|
600 | 1.96G | T &operator[](std::size_t id) { |
601 | 1.96G | return array_[id]; |
602 | 1.96G | } Darts::Details::AutoArray<char>::operator[](unsigned long) Line | Count | Source | 600 | 1.46G | T &operator[](std::size_t id) { | 601 | 1.46G | return array_[id]; | 602 | 1.46G | } |
Darts::Details::AutoArray<unsigned int>::operator[](unsigned long) Line | Count | Source | 600 | 1.02M | T &operator[](std::size_t id) { | 601 | 1.02M | return array_[id]; | 602 | 1.02M | } |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::operator[](unsigned long) Line | Count | Source | 600 | 495M | T &operator[](std::size_t id) { | 601 | 495M | return array_[id]; | 602 | 495M | } |
|
603 | | |
604 | | bool empty() const { |
605 | | return array_ == NULL; |
606 | | } |
607 | | |
608 | 8.00M | void clear() { |
609 | 8.00M | if (array_ != NULL) { |
610 | 2.74M | delete[] array_; |
611 | 2.74M | array_ = NULL; |
612 | 2.74M | } |
613 | 8.00M | } Darts::Details::AutoArray<char>::clear() Line | Count | Source | 608 | 7.31M | void clear() { | 609 | 7.31M | if (array_ != NULL) { | 610 | 2.59M | delete[] array_; | 611 | | array_ = NULL; | 612 | 2.59M | } | 613 | 7.31M | } |
Darts::Details::AutoArray<unsigned int>::clear() Line | Count | Source | 608 | 456k | void clear() { | 609 | 456k | if (array_ != NULL) { | 610 | 97.4k | delete[] array_; | 611 | | array_ = NULL; | 612 | 97.4k | } | 613 | 456k | } |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::clear() Line | Count | Source | 608 | 230k | void clear() { | 609 | 230k | if (array_ != NULL) { | 610 | 57.7k | delete[] array_; | 611 | | array_ = NULL; | 612 | 57.7k | } | 613 | 230k | } |
|
614 | 5.34M | void swap(AutoArray *array) { |
615 | 5.34M | T *temp = array_; |
616 | 5.34M | array_ = array->array_; |
617 | 5.34M | array->array_ = temp; |
618 | 5.34M | } Darts::Details::AutoArray<char>::swap(Darts::Details::AutoArray<char>*) Line | Count | Source | 614 | 5.18M | void swap(AutoArray *array) { | 615 | 5.18M | T *temp = array_; | 616 | 5.18M | array_ = array->array_; | 617 | 5.18M | array->array_ = temp; | 618 | 5.18M | } |
Darts::Details::AutoArray<unsigned int>::swap(Darts::Details::AutoArray<unsigned int>*) Line | Count | Source | 614 | 97.4k | void swap(AutoArray *array) { | 615 | 97.4k | T *temp = array_; | 616 | 97.4k | array_ = array->array_; | 617 | 97.4k | array->array_ = temp; | 618 | 97.4k | } |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::swap(Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>*) Line | Count | Source | 614 | 57.7k | void swap(AutoArray *array) { | 615 | 57.7k | T *temp = array_; | 616 | 57.7k | array_ = array->array_; | 617 | 57.7k | array->array_ = temp; | 618 | 57.7k | } |
|
619 | 2.74M | void reset(T *array = NULL) { |
620 | 2.74M | AutoArray(array).swap(this); |
621 | 2.74M | } Darts::Details::AutoArray<char>::reset(char*) Line | Count | Source | 619 | 2.59M | void reset(T *array = NULL) { | 620 | 2.59M | AutoArray(array).swap(this); | 621 | 2.59M | } |
Darts::Details::AutoArray<unsigned int>::reset(unsigned int*) Line | Count | Source | 619 | 97.4k | void reset(T *array = NULL) { | 620 | 97.4k | AutoArray(array).swap(this); | 621 | 97.4k | } |
Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::reset(Darts::Details::DoubleArrayBuilderExtraUnit*) Line | Count | Source | 619 | 57.7k | void reset(T *array = NULL) { | 620 | 57.7k | AutoArray(array).swap(this); | 621 | 57.7k | } |
|
622 | | |
623 | | private: |
624 | | T *array_; |
625 | | |
626 | | // Disallows copy and assignment. |
627 | | AutoArray(const AutoArray &); |
628 | | AutoArray &operator=(const AutoArray &); |
629 | | }; |
630 | | |
631 | | // |
632 | | // Memory management of resizable array. |
633 | | // |
634 | | |
635 | | template <typename T> |
636 | | class AutoPool { |
637 | | public: |
638 | 456k | AutoPool() : buf_(), size_(0), capacity_(0) {}Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::AutoPool() Line | Count | Source | 638 | 57.7k | AutoPool() : buf_(), size_(0), capacity_(0) {} |
Darts::Details::AutoPool<unsigned char>::AutoPool() Line | Count | Source | 638 | 106k | AutoPool() : buf_(), size_(0), capacity_(0) {} |
Darts::Details::AutoPool<Darts::Details::DawgNode>::AutoPool() Line | Count | Source | 638 | 48.7k | AutoPool() : buf_(), size_(0), capacity_(0) {} |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::AutoPool() Line | Count | Source | 638 | 48.7k | AutoPool() : buf_(), size_(0), capacity_(0) {} |
Darts::Details::AutoPool<unsigned int>::AutoPool() Line | Count | Source | 638 | 194k | AutoPool() : buf_(), size_(0), capacity_(0) {} |
|
639 | 456k | ~AutoPool() { clear(); }Darts::Details::AutoPool<unsigned char>::~AutoPool() Line | Count | Source | 639 | 106k | ~AutoPool() { clear(); } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::~AutoPool() Line | Count | Source | 639 | 57.7k | ~AutoPool() { clear(); } |
Darts::Details::AutoPool<unsigned int>::~AutoPool() Line | Count | Source | 639 | 194k | ~AutoPool() { clear(); } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::~AutoPool() Line | Count | Source | 639 | 48.7k | ~AutoPool() { clear(); } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::~AutoPool() Line | Count | Source | 639 | 48.7k | ~AutoPool() { clear(); } |
|
640 | | |
641 | 974M | const T &operator[](std::size_t id) const { |
642 | 974M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); |
643 | 974M | } Darts::Details::AutoPool<unsigned int>::operator[](unsigned long) const Line | Count | Source | 641 | 127M | const T &operator[](std::size_t id) const { | 642 | 127M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); | 643 | 127M | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::operator[](unsigned long) const Line | Count | Source | 641 | 276M | const T &operator[](std::size_t id) const { | 642 | 276M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); | 643 | 276M | } |
Darts::Details::AutoPool<unsigned char>::operator[](unsigned long) const Line | Count | Source | 641 | 370M | const T &operator[](std::size_t id) const { | 642 | 370M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); | 643 | 370M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::operator[](unsigned long) const Line | Count | Source | 641 | 155M | const T &operator[](std::size_t id) const { | 642 | 155M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); | 643 | 155M | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::operator[](unsigned long) const Line | Count | Source | 641 | 44.5M | const T &operator[](std::size_t id) const { | 642 | 44.5M | return *(reinterpret_cast<const T *>(&buf_[0]) + id); | 643 | 44.5M | } |
|
644 | 1.46G | T &operator[](std::size_t id) { |
645 | 1.46G | return *(reinterpret_cast<T *>(&buf_[0]) + id); |
646 | 1.46G | } Darts::Details::AutoPool<unsigned int>::operator[](unsigned long) Line | Count | Source | 644 | 468M | T &operator[](std::size_t id) { | 645 | 468M | return *(reinterpret_cast<T *>(&buf_[0]) + id); | 646 | 468M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::operator[](unsigned long) Line | Count | Source | 644 | 479M | T &operator[](std::size_t id) { | 645 | 479M | return *(reinterpret_cast<T *>(&buf_[0]) + id); | 646 | 479M | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::operator[](unsigned long) Line | Count | Source | 644 | 111M | T &operator[](std::size_t id) { | 645 | 111M | return *(reinterpret_cast<T *>(&buf_[0]) + id); | 646 | 111M | } |
Darts::Details::AutoPool<unsigned char>::operator[](unsigned long) Line | Count | Source | 644 | 236M | T &operator[](std::size_t id) { | 645 | 236M | return *(reinterpret_cast<T *>(&buf_[0]) + id); | 646 | 236M | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::operator[](unsigned long) Line | Count | Source | 644 | 165M | T &operator[](std::size_t id) { | 645 | 165M | return *(reinterpret_cast<T *>(&buf_[0]) + id); | 646 | 165M | } |
|
647 | | |
648 | 32.1M | bool empty() const { |
649 | 32.1M | return size_ == 0; |
650 | 32.1M | } Darts::Details::AutoPool<unsigned int>::empty() const Line | Count | Source | 648 | 31.8M | bool empty() const { | 649 | 31.8M | return size_ == 0; | 650 | 31.8M | } |
Darts::Details::AutoPool<unsigned char>::empty() const Line | Count | Source | 648 | 252k | bool empty() const { | 649 | 252k | return size_ == 0; | 650 | 252k | } |
|
651 | 506M | std::size_t size() const { |
652 | 506M | return size_; |
653 | 506M | } Darts::Details::AutoPool<unsigned int>::size() const Line | Count | Source | 651 | 238M | std::size_t size() const { | 652 | 238M | return size_; | 653 | 238M | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::size() const Line | Count | Source | 651 | 20.8M | std::size_t size() const { | 652 | 20.8M | return size_; | 653 | 20.8M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::size() const Line | Count | Source | 651 | 3.15M | std::size_t size() const { | 652 | 3.15M | return size_; | 653 | 3.15M | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::size() const Line | Count | Source | 651 | 114M | std::size_t size() const { | 652 | 114M | return size_; | 653 | 114M | } |
Darts::Details::AutoPool<unsigned char>::size() const Line | Count | Source | 651 | 129M | std::size_t size() const { | 652 | 129M | return size_; | 653 | 129M | } |
|
654 | | |
655 | 1.66M | void clear() { |
656 | 1.66M | resize(0); |
657 | 1.66M | buf_.clear(); |
658 | 1.66M | size_ = 0; |
659 | 1.66M | capacity_ = 0; |
660 | 1.66M | } Darts::Details::AutoPool<unsigned int>::clear() Line | Count | Source | 655 | 891k | void clear() { | 656 | 891k | resize(0); | 657 | 891k | buf_.clear(); | 658 | 891k | size_ = 0; | 659 | 891k | capacity_ = 0; | 660 | 891k | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::clear() Line | Count | Source | 655 | 194k | void clear() { | 656 | 194k | resize(0); | 657 | 194k | buf_.clear(); | 658 | 194k | size_ = 0; | 659 | 194k | capacity_ = 0; | 660 | 194k | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::clear() Line | Count | Source | 655 | 146k | void clear() { | 656 | 146k | resize(0); | 657 | 146k | buf_.clear(); | 658 | 146k | size_ = 0; | 659 | 146k | capacity_ = 0; | 660 | 146k | } |
Darts::Details::AutoPool<unsigned char>::clear() Line | Count | Source | 655 | 319k | void clear() { | 656 | 319k | resize(0); | 657 | 319k | buf_.clear(); | 658 | 319k | size_ = 0; | 659 | 319k | capacity_ = 0; | 660 | 319k | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::clear() Line | Count | Source | 655 | 115k | void clear() { | 656 | 115k | resize(0); | 657 | 115k | buf_.clear(); | 658 | 115k | size_ = 0; | 659 | 115k | capacity_ = 0; | 660 | 115k | } |
|
661 | | |
662 | 63.7M | void push_back(const T &value) { |
663 | 63.7M | append(value); |
664 | 63.7M | } |
665 | 60.6M | void pop_back() { |
666 | 60.6M | (*this)[--size_].~T(); |
667 | 60.6M | } |
668 | | |
669 | 66.9M | void append() { |
670 | 66.9M | if (size_ == capacity_) |
671 | 1.27M | resize_buf(size_ + 1); |
672 | 66.9M | new(&(*this)[size_++]) T; |
673 | 66.9M | } Darts::Details::AutoPool<Darts::Details::DawgUnit>::append() Line | Count | Source | 669 | 31.8M | void append() { | 670 | 31.8M | if (size_ == capacity_) | 671 | 465k | resize_buf(size_ + 1); | 672 | 31.8M | new(&(*this)[size_++]) T; | 673 | 31.8M | } |
Darts::Details::AutoPool<unsigned char>::append() Line | Count | Source | 669 | 31.8M | void append() { | 670 | 31.8M | if (size_ == capacity_) | 671 | 465k | resize_buf(size_ + 1); | 672 | 31.8M | new(&(*this)[size_++]) T; | 673 | 31.8M | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::append() Line | Count | Source | 669 | 3.15M | void append() { | 670 | 3.15M | if (size_ == capacity_) | 671 | 344k | resize_buf(size_ + 1); | 672 | 3.15M | new(&(*this)[size_++]) T; | 673 | 3.15M | } |
|
674 | 96.7M | void append(const T &value) { |
675 | 96.7M | if (size_ == capacity_) |
676 | 1.16M | resize_buf(size_ + 1); |
677 | 96.7M | new(&(*this)[size_++]) T(value); |
678 | 96.7M | } Darts::Details::AutoPool<unsigned int>::append(unsigned int const&) Line | Count | Source | 674 | 64.7M | void append(const T &value) { | 675 | 64.7M | if (size_ == capacity_) | 676 | 860k | resize_buf(size_ + 1); | 677 | 64.7M | new(&(*this)[size_++]) T(value); | 678 | 64.7M | } |
Darts::Details::AutoPool<unsigned char>::append(unsigned char const&) Line | Count | Source | 674 | 32.0M | void append(const T &value) { | 675 | 32.0M | if (size_ == capacity_) | 676 | 305k | resize_buf(size_ + 1); | 677 | 32.0M | new(&(*this)[size_++]) T(value); | 678 | 32.0M | } |
|
679 | | |
680 | 26.4M | void resize(std::size_t size) { |
681 | 267M | while (size_ > size) { |
682 | 241M | (*this)[--size_].~T(); |
683 | 241M | } |
684 | 26.4M | if (size > capacity_) { |
685 | 32.1k | resize_buf(size); |
686 | 32.1k | } |
687 | 70.9M | while (size_ < size) { |
688 | 44.5M | new(&(*this)[size_++]) T; |
689 | 44.5M | } |
690 | 26.4M | } Darts::Details::AutoPool<unsigned int>::resize(unsigned long) Line | Count | Source | 680 | 891k | void resize(std::size_t size) { | 681 | 98.6M | while (size_ > size) { | 682 | 97.8M | (*this)[--size_].~T(); | 683 | 97.8M | } | 684 | 891k | if (size > capacity_) { | 685 | 0 | resize_buf(size); | 686 | 0 | } | 687 | 891k | while (size_ < size) { | 688 | 0 | new(&(*this)[size_++]) T; | 689 | 0 | } | 690 | 891k | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::resize(unsigned long) Line | Count | Source | 680 | 194k | void resize(std::size_t size) { | 681 | 3.35M | while (size_ > size) { | 682 | 3.15M | (*this)[--size_].~T(); | 683 | 3.15M | } | 684 | 194k | if (size > capacity_) { | 685 | 0 | resize_buf(size); | 686 | 0 | } | 687 | 194k | while (size_ < size) { | 688 | 0 | new(&(*this)[size_++]) T; | 689 | 0 | } | 690 | 194k | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::resize(unsigned long) Line | Count | Source | 680 | 146k | void resize(std::size_t size) { | 681 | 32.0M | while (size_ > size) { | 682 | 31.8M | (*this)[--size_].~T(); | 683 | 31.8M | } | 684 | 146k | if (size > capacity_) { | 685 | 0 | resize_buf(size); | 686 | 0 | } | 687 | 146k | while (size_ < size) { | 688 | 0 | new(&(*this)[size_++]) T; | 689 | 0 | } | 690 | 146k | } |
Darts::Details::AutoPool<unsigned char>::resize(unsigned long) Line | Count | Source | 680 | 24.9M | void resize(std::size_t size) { | 681 | 88.7M | while (size_ > size) { | 682 | 63.8M | (*this)[--size_].~T(); | 683 | 63.8M | } | 684 | 24.9M | if (size > capacity_) { | 685 | 0 | resize_buf(size); | 686 | 0 | } | 687 | 24.9M | while (size_ < size) { | 688 | 0 | new(&(*this)[size_++]) T; | 689 | 0 | } | 690 | 24.9M | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::resize(unsigned long) Line | Count | Source | 680 | 289k | void resize(std::size_t size) { | 681 | 44.8M | while (size_ > size) { | 682 | 44.5M | (*this)[--size_].~T(); | 683 | 44.5M | } | 684 | 289k | if (size > capacity_) { | 685 | 32.1k | resize_buf(size); | 686 | 32.1k | } | 687 | 44.8M | while (size_ < size) { | 688 | 44.5M | new(&(*this)[size_++]) T; | 689 | 44.5M | } | 690 | 289k | } |
|
691 | 62.4k | void resize(std::size_t size, const T &value) { |
692 | 62.4k | while (size_ > size) { |
693 | 0 | (*this)[--size_].~T(); |
694 | 0 | } |
695 | 62.4k | if (size > capacity_) { |
696 | 62.4k | resize_buf(size); |
697 | 62.4k | } |
698 | 93.7M | while (size_ < size) { |
699 | 93.6M | new(&(*this)[size_++]) T(value); |
700 | 93.6M | } |
701 | 62.4k | } |
702 | | |
703 | 57.7k | void reserve(std::size_t size) { |
704 | 57.7k | if (size > capacity_) { |
705 | 57.7k | resize_buf(size); |
706 | 57.7k | } |
707 | 57.7k | } |
708 | | |
709 | | private: |
710 | | AutoArray<char> buf_; |
711 | | std::size_t size_; |
712 | | std::size_t capacity_; |
713 | | |
714 | | // Disallows copy and assignment. |
715 | | AutoPool(const AutoPool &); |
716 | | AutoPool &operator=(const AutoPool &); |
717 | | |
718 | | void resize_buf(std::size_t size); |
719 | | }; |
720 | | |
721 | | template <typename T> |
722 | 2.59M | void AutoPool<T>::resize_buf(std::size_t size) { |
723 | 2.59M | std::size_t capacity; |
724 | 2.59M | if (size >= capacity_ * 2) { |
725 | 843k | capacity = size; |
726 | 1.75M | } else { |
727 | 1.75M | capacity = 1; |
728 | 9.82M | while (capacity < size) { |
729 | 8.07M | capacity <<= 1; |
730 | 8.07M | } |
731 | 1.75M | } |
732 | | |
733 | 2.59M | AutoArray<char> buf; |
734 | 2.59M | try { |
735 | 2.59M | buf.reset(new char[sizeof(T) * capacity]); |
736 | 2.59M | } catch (const std::bad_alloc &) { |
737 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); |
738 | 0 | } |
739 | | |
740 | 2.59M | if (size_ > 0) { |
741 | 2.10M | T *src = reinterpret_cast<T *>(&buf_[0]); |
742 | 2.10M | T *dest = reinterpret_cast<T *>(&buf[0]); |
743 | 112M | for (std::size_t i = 0; i < size_; ++i) { |
744 | 110M | new(&dest[i]) T(src[i]); |
745 | 110M | src[i].~T(); |
746 | 110M | } |
747 | 2.10M | } |
748 | | |
749 | 2.59M | buf_.swap(&buf); |
750 | 2.59M | capacity_ = capacity; |
751 | 2.59M | } Darts::Details::AutoPool<unsigned int>::resize_buf(unsigned long) Line | Count | Source | 722 | 922k | void AutoPool<T>::resize_buf(std::size_t size) { | 723 | 922k | std::size_t capacity; | 724 | 922k | if (size >= capacity_ * 2) { | 725 | 350k | capacity = size; | 726 | 572k | } else { | 727 | 572k | capacity = 1; | 728 | 2.76M | while (capacity < size) { | 729 | 2.19M | capacity <<= 1; | 730 | 2.19M | } | 731 | 572k | } | 732 | | | 733 | 922k | AutoArray<char> buf; | 734 | 922k | try { | 735 | 922k | buf.reset(new char[sizeof(T) * capacity]); | 736 | 922k | } catch (const std::bad_alloc &) { | 737 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); | 738 | 0 | } | 739 | | | 740 | 922k | if (size_ > 0) { | 741 | 714k | T *src = reinterpret_cast<T *>(&buf_[0]); | 742 | 714k | T *dest = reinterpret_cast<T *>(&buf[0]); | 743 | 8.34M | for (std::size_t i = 0; i < size_; ++i) { | 744 | 7.63M | new(&dest[i]) T(src[i]); | 745 | 7.63M | src[i].~T(); | 746 | 7.63M | } | 747 | 714k | } | 748 | | | 749 | 922k | buf_.swap(&buf); | 750 | 922k | capacity_ = capacity; | 751 | 922k | } |
Darts::Details::AutoPool<Darts::Details::DawgNode>::resize_buf(unsigned long) Line | Count | Source | 722 | 344k | void AutoPool<T>::resize_buf(std::size_t size) { | 723 | 344k | std::size_t capacity; | 724 | 344k | if (size >= capacity_ * 2) { | 725 | 97.4k | capacity = size; | 726 | 247k | } else { | 727 | 247k | capacity = 1; | 728 | 1.28M | while (capacity < size) { | 729 | 1.03M | capacity <<= 1; | 730 | 1.03M | } | 731 | 247k | } | 732 | | | 733 | 344k | AutoArray<char> buf; | 734 | 344k | try { | 735 | 344k | buf.reset(new char[sizeof(T) * capacity]); | 736 | 344k | } catch (const std::bad_alloc &) { | 737 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); | 738 | 0 | } | 739 | | | 740 | 344k | if (size_ > 0) { | 741 | 296k | T *src = reinterpret_cast<T *>(&buf_[0]); | 742 | 296k | T *dest = reinterpret_cast<T *>(&buf[0]); | 743 | 4.70M | for (std::size_t i = 0; i < size_; ++i) { | 744 | 4.40M | new(&dest[i]) T(src[i]); | 745 | 4.40M | src[i].~T(); | 746 | 4.40M | } | 747 | 296k | } | 748 | | | 749 | 344k | buf_.swap(&buf); | 750 | 344k | capacity_ = capacity; | 751 | 344k | } |
Darts::Details::AutoPool<Darts::Details::DawgUnit>::resize_buf(unsigned long) Line | Count | Source | 722 | 465k | void AutoPool<T>::resize_buf(std::size_t size) { | 723 | 465k | std::size_t capacity; | 724 | 465k | if (size >= capacity_ * 2) { | 725 | 97.4k | capacity = size; | 726 | 368k | } else { | 727 | 368k | capacity = 1; | 728 | 2.42M | while (capacity < size) { | 729 | 2.05M | capacity <<= 1; | 730 | 2.05M | } | 731 | 368k | } | 732 | | | 733 | 465k | AutoArray<char> buf; | 734 | 465k | try { | 735 | 465k | buf.reset(new char[sizeof(T) * capacity]); | 736 | 465k | } catch (const std::bad_alloc &) { | 737 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); | 738 | 0 | } | 739 | | | 740 | 465k | if (size_ > 0) { | 741 | 417k | T *src = reinterpret_cast<T *>(&buf_[0]); | 742 | 417k | T *dest = reinterpret_cast<T *>(&buf[0]); | 743 | 46.2M | for (std::size_t i = 0; i < size_; ++i) { | 744 | 45.8M | new(&dest[i]) T(src[i]); | 745 | 45.8M | src[i].~T(); | 746 | 45.8M | } | 747 | 417k | } | 748 | | | 749 | 465k | buf_.swap(&buf); | 750 | 465k | capacity_ = capacity; | 751 | 465k | } |
Darts::Details::AutoPool<unsigned char>::resize_buf(unsigned long) Line | Count | Source | 722 | 771k | void AutoPool<T>::resize_buf(std::size_t size) { | 723 | 771k | std::size_t capacity; | 724 | 771k | if (size >= capacity_ * 2) { | 725 | 211k | capacity = size; | 726 | 559k | } else { | 727 | 559k | capacity = 1; | 728 | 3.30M | while (capacity < size) { | 729 | 2.74M | capacity <<= 1; | 730 | 2.74M | } | 731 | 559k | } | 732 | | | 733 | 771k | AutoArray<char> buf; | 734 | 771k | try { | 735 | 771k | buf.reset(new char[sizeof(T) * capacity]); | 736 | 771k | } catch (const std::bad_alloc &) { | 737 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); | 738 | 0 | } | 739 | | | 740 | 771k | if (size_ > 0) { | 741 | 664k | T *src = reinterpret_cast<T *>(&buf_[0]); | 742 | 664k | T *dest = reinterpret_cast<T *>(&buf[0]); | 743 | 48.4M | for (std::size_t i = 0; i < size_; ++i) { | 744 | 47.8M | new(&dest[i]) T(src[i]); | 745 | 47.8M | src[i].~T(); | 746 | 47.8M | } | 747 | 664k | } | 748 | | | 749 | 771k | buf_.swap(&buf); | 750 | 771k | capacity_ = capacity; | 751 | 771k | } |
Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::resize_buf(unsigned long) Line | Count | Source | 722 | 89.8k | void AutoPool<T>::resize_buf(std::size_t size) { | 723 | 89.8k | std::size_t capacity; | 724 | 89.8k | if (size >= capacity_ * 2) { | 725 | 85.8k | capacity = size; | 726 | 85.8k | } else { | 727 | 4.08k | capacity = 1; | 728 | 46.7k | while (capacity < size) { | 729 | 42.6k | capacity <<= 1; | 730 | 42.6k | } | 731 | 4.08k | } | 732 | | | 733 | 89.8k | AutoArray<char> buf; | 734 | 89.8k | try { | 735 | 89.8k | buf.reset(new char[sizeof(T) * capacity]); | 736 | 89.8k | } catch (const std::bad_alloc &) { | 737 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); | 738 | 0 | } | 739 | | | 740 | 89.8k | if (size_ > 0) { | 741 | 8.94k | T *src = reinterpret_cast<T *>(&buf_[0]); | 742 | 8.94k | T *dest = reinterpret_cast<T *>(&buf[0]); | 743 | 4.59M | for (std::size_t i = 0; i < size_; ++i) { | 744 | 4.58M | new(&dest[i]) T(src[i]); | 745 | 4.58M | src[i].~T(); | 746 | 4.58M | } | 747 | 8.94k | } | 748 | | | 749 | 89.8k | buf_.swap(&buf); | 750 | 89.8k | capacity_ = capacity; | 751 | 89.8k | } |
|
752 | | |
753 | | // |
754 | | // Memory management of stack. |
755 | | // |
756 | | |
757 | | template <typename T> |
758 | | class AutoStack { |
759 | | public: |
760 | 97.4k | AutoStack() : pool_() {} |
761 | 97.4k | ~AutoStack() { |
762 | 97.4k | clear(); |
763 | 97.4k | } |
764 | | |
765 | | const T &top() const { |
766 | | return pool_[size() - 1]; |
767 | | } |
768 | 109M | T &top() { |
769 | 109M | return pool_[size() - 1]; |
770 | 109M | } |
771 | | |
772 | 31.8M | bool empty() const { |
773 | 31.8M | return pool_.empty(); |
774 | 31.8M | } |
775 | 109M | std::size_t size() const { |
776 | 109M | return pool_.size(); |
777 | 109M | } |
778 | | |
779 | 63.7M | void push(const T &value) { |
780 | 63.7M | pool_.push_back(value); |
781 | 63.7M | } |
782 | 60.6M | void pop() { |
783 | 60.6M | pool_.pop_back(); |
784 | 60.6M | } |
785 | | |
786 | 389k | void clear() { |
787 | 389k | pool_.clear(); |
788 | 389k | } |
789 | | |
790 | | private: |
791 | | AutoPool<T> pool_; |
792 | | |
793 | | // Disallows copy and assignment. |
794 | | AutoStack(const AutoStack &); |
795 | | AutoStack &operator=(const AutoStack &); |
796 | | }; |
797 | | |
798 | | // |
799 | | // Succinct bit vector. |
800 | | // |
801 | | |
802 | | class BitVector { |
803 | | public: |
804 | 48.7k | BitVector() : units_(), ranks_(), num_ones_(0), size_(0) {} |
805 | 48.7k | ~BitVector() { |
806 | 48.7k | clear(); |
807 | 48.7k | } |
808 | | |
809 | 48.9M | bool operator[](std::size_t id) const { |
810 | 48.9M | return (units_[id / UNIT_SIZE] >> (id % UNIT_SIZE) & 1) == 1; |
811 | 48.9M | } |
812 | | |
813 | 0 | id_type rank(std::size_t id) const { |
814 | 0 | std::size_t unit_id = id / UNIT_SIZE; |
815 | 0 | return ranks_[unit_id] + pop_count(units_[unit_id] |
816 | 0 | & (~0U >> (UNIT_SIZE - (id % UNIT_SIZE) - 1))); |
817 | 0 | } |
818 | | |
819 | 0 | void set(std::size_t id, bool bit) { |
820 | 0 | if (bit) { |
821 | 0 | units_[id / UNIT_SIZE] |= 1U << (id % UNIT_SIZE); |
822 | 0 | } else { |
823 | 0 | units_[id / UNIT_SIZE] &= ~(1U << (id % UNIT_SIZE)); |
824 | 0 | } |
825 | 0 | } |
826 | | |
827 | 0 | bool empty() const { |
828 | 0 | return units_.empty(); |
829 | 0 | } |
830 | 97.4k | std::size_t num_ones() const { |
831 | 97.4k | return num_ones_; |
832 | 97.4k | } |
833 | 31.8M | std::size_t size() const { |
834 | 31.8M | return size_; |
835 | 31.8M | } |
836 | | |
837 | 31.8M | void append() { |
838 | 31.8M | if ((size_ % UNIT_SIZE) == 0) { |
839 | 1.02M | units_.append(0); |
840 | 1.02M | } |
841 | 31.8M | ++size_; |
842 | 31.8M | } |
843 | | void build(); |
844 | | |
845 | 146k | void clear() { |
846 | 146k | units_.clear(); |
847 | 146k | ranks_.clear(); |
848 | 146k | } |
849 | | |
850 | | private: |
851 | | enum { UNIT_SIZE = sizeof(id_type) * 8 }; |
852 | | |
853 | | AutoPool<id_type> units_; |
854 | | AutoArray<id_type> ranks_; |
855 | | std::size_t num_ones_; |
856 | | std::size_t size_; |
857 | | |
858 | | // Disallows copy and assignment. |
859 | | BitVector(const BitVector &); |
860 | | BitVector &operator=(const BitVector &); |
861 | | |
862 | 1.02M | static id_type pop_count(id_type unit) { |
863 | 1.02M | unit = ((unit & 0xAAAAAAAA) >> 1) + (unit & 0x55555555); |
864 | 1.02M | unit = ((unit & 0xCCCCCCCC) >> 2) + (unit & 0x33333333); |
865 | 1.02M | unit = ((unit >> 4) + unit) & 0x0F0F0F0F; |
866 | 1.02M | unit += unit >> 8; |
867 | 1.02M | unit += unit >> 16; |
868 | 1.02M | return unit & 0xFF; |
869 | 1.02M | } |
870 | | }; |
871 | | |
872 | 48.7k | inline void BitVector::build() { |
873 | 48.7k | try { |
874 | 48.7k | ranks_.reset(new id_type[units_.size()]); |
875 | 48.7k | } catch (const std::bad_alloc &) { |
876 | 0 | DARTS_THROW("failed to build rank index: std::bad_alloc"); |
877 | 0 | } |
878 | | |
879 | 48.7k | num_ones_ = 0; |
880 | 1.06M | for (std::size_t i = 0; i < units_.size(); ++i) { |
881 | 1.02M | ranks_[i] = num_ones_; |
882 | 1.02M | num_ones_ += pop_count(units_[i]); |
883 | 1.02M | } |
884 | 48.7k | } |
885 | | |
886 | | // |
887 | | // Keyset. |
888 | | // |
889 | | |
890 | | template <typename T> |
891 | | class Keyset { |
892 | | public: |
893 | | Keyset(std::size_t num_keys, const char_type * const *keys, |
894 | | const std::size_t *lengths, const T *values) : |
895 | 57.7k | num_keys_(num_keys), keys_(keys), lengths_(lengths), values_(values) {} |
896 | | |
897 | 7.51M | std::size_t num_keys() const { |
898 | 7.51M | return num_keys_; |
899 | 7.51M | } |
900 | 7.42M | const char_type *keys(std::size_t id) const { |
901 | 7.42M | return keys_[id]; |
902 | 7.42M | } |
903 | 628k | uchar_type keys(std::size_t key_id, std::size_t char_id) const { |
904 | 628k | if (has_lengths() && char_id >= lengths_[key_id]) |
905 | 87.1k | return '\0'; |
906 | 541k | return keys_[key_id][char_id]; |
907 | 628k | } |
908 | | |
909 | 8.13M | bool has_lengths() const { |
910 | 8.13M | return lengths_ != NULL; |
911 | 8.13M | } |
912 | 7.46M | std::size_t lengths(std::size_t id) const { |
913 | 7.46M | if (has_lengths()) { |
914 | 7.46M | return lengths_[id]; |
915 | 7.46M | } |
916 | 0 | std::size_t length = 0; |
917 | 0 | while (keys_[id][length] != '\0') { |
918 | 0 | ++length; |
919 | 0 | } |
920 | 0 | return length; |
921 | 7.46M | } |
922 | | |
923 | 7.56M | bool has_values() const { |
924 | 7.56M | return values_ != NULL; |
925 | 7.56M | } |
926 | 7.50M | const value_type values(std::size_t id) const { |
927 | 7.50M | if (has_values()) { |
928 | 7.42M | return static_cast<value_type>(values_[id]); |
929 | 7.42M | } |
930 | 87.1k | return static_cast<value_type>(id); |
931 | 7.50M | } |
932 | | |
933 | | private: |
934 | | std::size_t num_keys_; |
935 | | const char_type * const * keys_; |
936 | | const std::size_t *lengths_; |
937 | | const T *values_; |
938 | | |
939 | | // Disallows copy and assignment. |
940 | | Keyset(const Keyset &); |
941 | | Keyset &operator=(const Keyset &); |
942 | | }; |
943 | | |
944 | | // |
945 | | // Node of Directed Acyclic Word Graph (DAWG). |
946 | | // |
947 | | |
948 | | class DawgNode { |
949 | | public: |
950 | 31.8M | DawgNode() : child_(0), sibling_(0), label_('\0'), |
951 | 31.8M | is_state_(false), has_sibling_(false) {} |
952 | | |
953 | 56.2M | void set_child(id_type child) { |
954 | 56.2M | child_ = child; |
955 | 56.2M | } |
956 | 31.8M | void set_sibling(id_type sibling) { |
957 | 31.8M | sibling_ = sibling; |
958 | 31.8M | } |
959 | 7.42M | void set_value(value_type value) { |
960 | 7.42M | child_ = value; |
961 | 7.42M | } |
962 | 31.8M | void set_label(uchar_type label) { |
963 | 31.8M | label_ = label; |
964 | 31.8M | } |
965 | 24.4M | void set_is_state(bool is_state) { |
966 | 24.4M | is_state_ = is_state; |
967 | 24.4M | } |
968 | 7.37M | void set_has_sibling(bool has_sibling) { |
969 | 7.37M | has_sibling_ = has_sibling; |
970 | 7.37M | } |
971 | | |
972 | 94.7M | id_type child() const { |
973 | 94.7M | return child_; |
974 | 94.7M | } |
975 | 160M | id_type sibling() const { |
976 | 160M | return sibling_; |
977 | 160M | } |
978 | 0 | value_type value() const { |
979 | 0 | return static_cast<value_type>(child_); |
980 | 0 | } |
981 | 94.7M | uchar_type label() const { |
982 | 94.7M | return label_; |
983 | 94.7M | } |
984 | 0 | bool is_state() const { |
985 | 0 | return is_state_; |
986 | 0 | } |
987 | 0 | bool has_sibling() const { |
988 | 0 | return has_sibling_; |
989 | 0 | } |
990 | | |
991 | 89.9M | id_type unit() const { |
992 | 89.9M | if (label_ == '\0') { |
993 | 19.8M | return (child_ << 1) | (has_sibling_ ? 1 : 0); |
994 | 19.8M | } |
995 | 70.1M | return (child_ << 2) | (is_state_ ? 2 : 0) | (has_sibling_ ? 1 : 0); |
996 | 89.9M | } |
997 | | |
998 | | private: |
999 | | id_type child_; |
1000 | | id_type sibling_; |
1001 | | uchar_type label_; |
1002 | | bool is_state_; |
1003 | | bool has_sibling_; |
1004 | | |
1005 | | // Copyable. |
1006 | | }; |
1007 | | |
1008 | | // |
1009 | | // Fixed unit of Directed Acyclic Word Graph (DAWG). |
1010 | | // |
1011 | | |
1012 | | class DawgUnit { |
1013 | | public: |
1014 | 31.8M | explicit DawgUnit(id_type unit = 0) : unit_(unit) {} |
1015 | 45.8M | DawgUnit(const DawgUnit &unit) : unit_(unit.unit_) {} |
1016 | | |
1017 | 31.8M | DawgUnit &operator=(id_type unit) { |
1018 | 31.8M | unit_ = unit; |
1019 | 31.8M | return *this; |
1020 | 31.8M | } |
1021 | | |
1022 | 46.6M | id_type unit() const { |
1023 | 46.6M | return unit_; |
1024 | 46.6M | } |
1025 | | |
1026 | 73.4M | id_type child() const { |
1027 | 73.4M | return unit_ >> 2; |
1028 | 73.4M | } |
1029 | 149M | bool has_sibling() const { |
1030 | 149M | return (unit_ & 1) == 1; |
1031 | 149M | } |
1032 | 7.42M | value_type value() const { |
1033 | 7.42M | return static_cast<value_type>(unit_ >> 1); |
1034 | 7.42M | } |
1035 | 15.7M | bool is_state() const { |
1036 | 15.7M | return (unit_ & 2) == 2; |
1037 | 15.7M | } |
1038 | | |
1039 | | private: |
1040 | | id_type unit_; |
1041 | | |
1042 | | // Copyable. |
1043 | | }; |
1044 | | |
1045 | | // |
1046 | | // Directed Acyclic Word Graph (DAWG) builder. |
1047 | | // |
1048 | | |
1049 | | class DawgBuilder { |
1050 | | public: |
1051 | 48.7k | DawgBuilder() : nodes_(), units_(), labels_(), is_intersections_(), |
1052 | 48.7k | table_(), node_stack_(), recycle_bin_(), num_states_(0) {} |
1053 | 48.7k | ~DawgBuilder() { |
1054 | 48.7k | clear(); |
1055 | 48.7k | } |
1056 | | |
1057 | 97.4k | id_type root() const { |
1058 | 97.4k | return 0; |
1059 | 97.4k | } |
1060 | | |
1061 | 73.4M | id_type child(id_type id) const { |
1062 | 73.4M | return units_[id].child(); |
1063 | 73.4M | } |
1064 | 95.5M | id_type sibling(id_type id) const { |
1065 | 95.5M | return units_[id].has_sibling() ? (id + 1) : 0; |
1066 | 95.5M | } |
1067 | 7.42M | int value(id_type id) const { |
1068 | 7.42M | return units_[id].value(); |
1069 | 7.42M | } |
1070 | | |
1071 | 31.8M | bool is_leaf(id_type id) const { |
1072 | 31.8M | return label(id) == '\0'; |
1073 | 31.8M | } |
1074 | 95.5M | uchar_type label(id_type id) const { |
1075 | 95.5M | return labels_[id]; |
1076 | 95.5M | } |
1077 | | |
1078 | 48.9M | bool is_intersection(id_type id) const { |
1079 | 48.9M | return is_intersections_[id]; |
1080 | 48.9M | } |
1081 | 0 | id_type intersection_id(id_type id) const { |
1082 | 0 | return is_intersections_.rank(id) - 1; |
1083 | 0 | } |
1084 | | |
1085 | 97.4k | std::size_t num_intersections() const { |
1086 | 97.4k | return is_intersections_.num_ones(); |
1087 | 97.4k | } |
1088 | | |
1089 | 465k | std::size_t size() const { |
1090 | 465k | return units_.size(); |
1091 | 465k | } |
1092 | | |
1093 | | void init(); |
1094 | | void finish(); |
1095 | | |
1096 | | void insert(const char *key, std::size_t length, value_type value); |
1097 | | |
1098 | | void clear(); |
1099 | | |
1100 | | private: |
1101 | | enum { INITIAL_TABLE_SIZE = 1 << 10 }; |
1102 | | |
1103 | | AutoPool<DawgNode> nodes_; |
1104 | | AutoPool<DawgUnit> units_; |
1105 | | AutoPool<uchar_type> labels_; |
1106 | | BitVector is_intersections_; |
1107 | | AutoPool<id_type> table_; |
1108 | | AutoStack<id_type> node_stack_; |
1109 | | AutoStack<id_type> recycle_bin_; |
1110 | | std::size_t num_states_; |
1111 | | |
1112 | | // Disallows copy and assignment. |
1113 | | DawgBuilder(const DawgBuilder &); |
1114 | | DawgBuilder &operator=(const DawgBuilder &); |
1115 | | |
1116 | | void flush(id_type id); |
1117 | | |
1118 | | void expand_table(); |
1119 | | |
1120 | | id_type find_unit(id_type id, id_type *hash_id) const; |
1121 | | id_type find_node(id_type node_id, id_type *hash_id) const; |
1122 | | |
1123 | | bool are_equal(id_type node_id, id_type unit_id) const; |
1124 | | |
1125 | | id_type hash_unit(id_type id) const; |
1126 | | id_type hash_node(id_type id) const; |
1127 | | |
1128 | | id_type append_node(); |
1129 | | id_type append_unit(); |
1130 | | |
1131 | 31.8M | void free_node(id_type id) { |
1132 | 31.8M | recycle_bin_.push(id); |
1133 | 31.8M | } |
1134 | | |
1135 | 52.2M | static id_type hash(id_type key) { |
1136 | 52.2M | key = ~key + (key << 15); // key = (key << 15) - key - 1; |
1137 | 52.2M | key = key ^ (key >> 12); |
1138 | 52.2M | key = key + (key << 2); |
1139 | 52.2M | key = key ^ (key >> 4); |
1140 | 52.2M | key = key * 2057; // key = (key + (key << 3)) + (key << 11); |
1141 | 52.2M | key = key ^ (key >> 16); |
1142 | 52.2M | return key; |
1143 | 52.2M | } |
1144 | | }; |
1145 | | |
1146 | 48.7k | inline void DawgBuilder::init() { |
1147 | 48.7k | table_.resize(INITIAL_TABLE_SIZE, 0); |
1148 | | |
1149 | 48.7k | append_node(); |
1150 | 48.7k | append_unit(); |
1151 | | |
1152 | 48.7k | num_states_ = 1; |
1153 | | |
1154 | 48.7k | nodes_[0].set_label(0xFF); |
1155 | 48.7k | node_stack_.push(0); |
1156 | 48.7k | } |
1157 | | |
1158 | 48.7k | inline void DawgBuilder::finish() { |
1159 | 48.7k | flush(0); |
1160 | | |
1161 | 48.7k | units_[0] = nodes_[0].unit(); |
1162 | 48.7k | labels_[0] = nodes_[0].label(); |
1163 | | |
1164 | 48.7k | nodes_.clear(); |
1165 | 48.7k | table_.clear(); |
1166 | 48.7k | node_stack_.clear(); |
1167 | 48.7k | recycle_bin_.clear(); |
1168 | | |
1169 | 48.7k | is_intersections_.build(); |
1170 | 48.7k | } |
1171 | | |
1172 | | inline void DawgBuilder::insert(const char *key, std::size_t length, |
1173 | 7.42M | value_type value) { |
1174 | 7.42M | if (value < 0) { |
1175 | 0 | DARTS_THROW("failed to insert key: negative value"); |
1176 | 7.42M | } else if (length == 0) { |
1177 | 0 | DARTS_THROW("failed to insert key: zero-length key"); |
1178 | 0 | } |
1179 | | |
1180 | 7.42M | id_type id = 0; |
1181 | 7.42M | std::size_t key_pos = 0; |
1182 | | |
1183 | 31.0M | for ( ; key_pos <= length; ++key_pos) { |
1184 | 31.0M | id_type child_id = nodes_[id].child(); |
1185 | 31.0M | if (child_id == 0) { |
1186 | 48.7k | break; |
1187 | 48.7k | } |
1188 | | |
1189 | 30.9M | uchar_type key_label = static_cast<uchar_type>(key[key_pos]); |
1190 | 30.9M | if (key_pos < length && key_label == '\0') { |
1191 | 0 | DARTS_THROW("failed to insert key: invalid null character"); |
1192 | 0 | } |
1193 | | |
1194 | 30.9M | uchar_type unit_label = nodes_[child_id].label(); |
1195 | 30.9M | if (key_label < unit_label) { |
1196 | 0 | DARTS_THROW("failed to insert key: wrong key order"); |
1197 | 30.9M | } else if (key_label > unit_label) { |
1198 | 7.37M | nodes_[child_id].set_has_sibling(true); |
1199 | 7.37M | flush(child_id); |
1200 | 7.37M | break; |
1201 | 7.37M | } |
1202 | 23.6M | id = child_id; |
1203 | 23.6M | } |
1204 | | |
1205 | 7.42M | if (key_pos > length) { |
1206 | 0 | return; |
1207 | 0 | } |
1208 | | |
1209 | 39.2M | for ( ; key_pos <= length; ++key_pos) { |
1210 | 31.8M | uchar_type key_label = static_cast<uchar_type>( |
1211 | 31.8M | (key_pos < length) ? key[key_pos] : '\0'); |
1212 | 31.8M | id_type child_id = append_node(); |
1213 | | |
1214 | 31.8M | if (nodes_[id].child() == 0) { |
1215 | 24.4M | nodes_[child_id].set_is_state(true); |
1216 | 24.4M | } |
1217 | 31.8M | nodes_[child_id].set_sibling(nodes_[id].child()); |
1218 | 31.8M | nodes_[child_id].set_label(key_label); |
1219 | 31.8M | nodes_[id].set_child(child_id); |
1220 | 31.8M | node_stack_.push(child_id); |
1221 | | |
1222 | 31.8M | id = child_id; |
1223 | 31.8M | } |
1224 | 7.42M | nodes_[id].set_value(value); |
1225 | 7.42M | } |
1226 | | |
1227 | 97.4k | inline void DawgBuilder::clear() { |
1228 | 97.4k | nodes_.clear(); |
1229 | 97.4k | units_.clear(); |
1230 | 97.4k | labels_.clear(); |
1231 | 97.4k | is_intersections_.clear(); |
1232 | 97.4k | table_.clear(); |
1233 | 97.4k | node_stack_.clear(); |
1234 | 97.4k | recycle_bin_.clear(); |
1235 | 97.4k | num_states_ = 0; |
1236 | 97.4k | } |
1237 | | |
1238 | 7.42M | inline void DawgBuilder::flush(id_type id) { |
1239 | 31.8M | while (node_stack_.top() != id) { |
1240 | 24.4M | id_type node_id = node_stack_.top(); |
1241 | 24.4M | node_stack_.pop(); |
1242 | | |
1243 | 24.4M | if (num_states_ >= table_.size() - (table_.size() >> 2)) { |
1244 | 13.7k | expand_table(); |
1245 | 13.7k | } |
1246 | | |
1247 | 24.4M | id_type num_siblings = 0; |
1248 | 56.2M | for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { |
1249 | 31.8M | ++num_siblings; |
1250 | 31.8M | } |
1251 | | |
1252 | 24.4M | id_type hash_id; |
1253 | 24.4M | id_type match_id = find_node(node_id, &hash_id); |
1254 | 24.4M | if (match_id != 0) { |
1255 | 0 | is_intersections_.set(match_id, true); |
1256 | 24.4M | } else { |
1257 | 24.4M | id_type unit_id = 0; |
1258 | 56.2M | for (id_type i = 0; i < num_siblings; ++i) { |
1259 | 31.8M | unit_id = append_unit(); |
1260 | 31.8M | } |
1261 | 56.2M | for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { |
1262 | 31.8M | units_[unit_id] = nodes_[i].unit(); |
1263 | 31.8M | labels_[unit_id] = nodes_[i].label(); |
1264 | 31.8M | --unit_id; |
1265 | 31.8M | } |
1266 | 24.4M | match_id = unit_id + 1; |
1267 | 24.4M | table_[hash_id] = match_id; |
1268 | 24.4M | ++num_states_; |
1269 | 24.4M | } |
1270 | | |
1271 | 56.2M | for (id_type i = node_id, next; i != 0; i = next) { |
1272 | 31.8M | next = nodes_[i].sibling(); |
1273 | 31.8M | free_node(i); |
1274 | 31.8M | } |
1275 | | |
1276 | 24.4M | nodes_[node_stack_.top()].set_child(match_id); |
1277 | 24.4M | } |
1278 | 7.42M | node_stack_.pop(); |
1279 | 7.42M | } |
1280 | | |
1281 | 13.7k | inline void DawgBuilder::expand_table() { |
1282 | 13.7k | std::size_t table_size = table_.size() << 1; |
1283 | 13.7k | table_.clear(); |
1284 | 13.7k | table_.resize(table_size, 0); |
1285 | | |
1286 | 20.4M | for (std::size_t i = 1; i < units_.size(); ++i) { |
1287 | 20.3M | id_type id = static_cast<id_type>(i); |
1288 | 20.3M | if (labels_[id] == '\0' || units_[id].is_state()) { |
1289 | 16.3M | id_type hash_id; |
1290 | 16.3M | find_unit(id, &hash_id); |
1291 | 16.3M | table_[hash_id] = id; |
1292 | 16.3M | } |
1293 | 20.3M | } |
1294 | 13.7k | } |
1295 | | |
1296 | 16.3M | inline id_type DawgBuilder::find_unit(id_type id, id_type *hash_id) const { |
1297 | 16.3M | *hash_id = hash_unit(id) % table_.size(); |
1298 | 21.3M | for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { |
1299 | 21.3M | id_type unit_id = table_[*hash_id]; |
1300 | 21.3M | if (unit_id == 0) { |
1301 | 16.3M | break; |
1302 | 16.3M | } |
1303 | | |
1304 | | // There must not be the same unit. |
1305 | 21.3M | } |
1306 | 16.3M | return 0; |
1307 | 16.3M | } |
1308 | | |
1309 | | inline id_type DawgBuilder::find_node(id_type node_id, |
1310 | 24.4M | id_type *hash_id) const { |
1311 | 24.4M | *hash_id = hash_node(node_id) % table_.size(); |
1312 | 57.2M | for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { |
1313 | 57.2M | id_type unit_id = table_[*hash_id]; |
1314 | 57.2M | if (unit_id == 0) { |
1315 | 24.4M | break; |
1316 | 24.4M | } |
1317 | | |
1318 | 32.8M | if (are_equal(node_id, unit_id)) { |
1319 | 0 | return unit_id; |
1320 | 0 | } |
1321 | 32.8M | } |
1322 | 24.4M | return 0; |
1323 | 24.4M | } |
1324 | | |
1325 | 32.8M | inline bool DawgBuilder::are_equal(id_type node_id, id_type unit_id) const { |
1326 | 33.6M | for (id_type i = nodes_[node_id].sibling(); i != 0; |
1327 | 32.8M | i = nodes_[i].sibling()) { |
1328 | 4.06M | if (units_[unit_id].has_sibling() == false) { |
1329 | 3.24M | return false; |
1330 | 3.24M | } |
1331 | 822k | ++unit_id; |
1332 | 822k | } |
1333 | 29.5M | if (units_[unit_id].has_sibling() == true) { |
1334 | 3.34M | return false; |
1335 | 3.34M | } |
1336 | | |
1337 | 26.2M | for (id_type i = node_id; i != 0; i = nodes_[i].sibling(), --unit_id) { |
1338 | 26.2M | if (nodes_[i].unit() != units_[unit_id].unit() || |
1339 | 26.2M | nodes_[i].label() != labels_[unit_id]) { |
1340 | 26.2M | return false; |
1341 | 26.2M | } |
1342 | 26.2M | } |
1343 | 0 | return true; |
1344 | 26.2M | } |
1345 | | |
1346 | 16.3M | inline id_type DawgBuilder::hash_unit(id_type id) const { |
1347 | 16.3M | id_type hash_value = 0; |
1348 | 20.3M | for ( ; id != 0; ++id) { |
1349 | 20.3M | id_type unit = units_[id].unit(); |
1350 | 20.3M | uchar_type label = labels_[id]; |
1351 | 20.3M | hash_value ^= hash((label << 24) ^ unit); |
1352 | | |
1353 | 20.3M | if (units_[id].has_sibling() == false) { |
1354 | 16.3M | break; |
1355 | 16.3M | } |
1356 | 20.3M | } |
1357 | 16.3M | return hash_value; |
1358 | 16.3M | } |
1359 | | |
1360 | 24.4M | inline id_type DawgBuilder::hash_node(id_type id) const { |
1361 | 24.4M | id_type hash_value = 0; |
1362 | 56.2M | for ( ; id != 0; id = nodes_[id].sibling()) { |
1363 | 31.8M | id_type unit = nodes_[id].unit(); |
1364 | 31.8M | uchar_type label = nodes_[id].label(); |
1365 | 31.8M | hash_value ^= hash((label << 24) ^ unit); |
1366 | 31.8M | } |
1367 | 24.4M | return hash_value; |
1368 | 24.4M | } |
1369 | | |
1370 | 31.8M | inline id_type DawgBuilder::append_unit() { |
1371 | 31.8M | is_intersections_.append(); |
1372 | 31.8M | units_.append(); |
1373 | 31.8M | labels_.append(); |
1374 | | |
1375 | 31.8M | return static_cast<id_type>(is_intersections_.size() - 1); |
1376 | 31.8M | } |
1377 | | |
1378 | 31.8M | inline id_type DawgBuilder::append_node() { |
1379 | 31.8M | id_type id; |
1380 | 31.8M | if (recycle_bin_.empty()) { |
1381 | 3.15M | id = static_cast<id_type>(nodes_.size()); |
1382 | 3.15M | nodes_.append(); |
1383 | 28.7M | } else { |
1384 | 28.7M | id = recycle_bin_.top(); |
1385 | 28.7M | nodes_[id] = DawgNode(); |
1386 | 28.7M | recycle_bin_.pop(); |
1387 | 28.7M | } |
1388 | 31.8M | return id; |
1389 | 31.8M | } |
1390 | | |
1391 | | // |
1392 | | // Unit of double-array builder. |
1393 | | // |
1394 | | |
1395 | | class DoubleArrayBuilderUnit { |
1396 | | public: |
1397 | 44.5M | DoubleArrayBuilderUnit() : unit_(0) {} |
1398 | | |
1399 | 7.46M | void set_has_leaf(bool has_leaf) { |
1400 | 7.46M | if (has_leaf) { |
1401 | 7.46M | unit_ |= 1U << 8; |
1402 | 7.46M | } else { |
1403 | 0 | unit_ &= ~(1U << 8); |
1404 | 0 | } |
1405 | 7.46M | } |
1406 | 7.46M | void set_value(value_type value) { |
1407 | 7.46M | unit_ = value | (1U << 31); |
1408 | 7.46M | } |
1409 | 37.0M | void set_label(uchar_type label) { |
1410 | 37.0M | unit_ = (unit_ & ~0xFFU) | label; |
1411 | 37.0M | } |
1412 | 24.6M | void set_offset(id_type offset) { |
1413 | 24.6M | if (offset >= 1U << 29) { |
1414 | 0 | DARTS_THROW("failed to modify unit: too large offset"); |
1415 | 0 | } |
1416 | 24.6M | unit_ &= (1U << 31) | (1U << 8) | 0xFF; |
1417 | 24.6M | if (offset < 1U << 21) { |
1418 | 24.6M | unit_ |= (offset << 10); |
1419 | 24.6M | } else { |
1420 | 0 | unit_ |= (offset << 2) | (1U << 9); |
1421 | 0 | } |
1422 | 24.6M | } |
1423 | | |
1424 | | private: |
1425 | | id_type unit_; |
1426 | | |
1427 | | // Copyable. |
1428 | | }; |
1429 | | |
1430 | | // |
1431 | | // Extra unit of double-array builder. |
1432 | | // |
1433 | | |
1434 | | class DoubleArrayBuilderExtraUnit { |
1435 | | public: |
1436 | 236M | DoubleArrayBuilderExtraUnit() : prev_(0), next_(0), |
1437 | 236M | is_fixed_(false), is_used_(false) {} |
1438 | | |
1439 | 89.4M | void set_prev(id_type prev) { |
1440 | 89.4M | prev_ = prev; |
1441 | 89.4M | } |
1442 | 89.4M | void set_next(id_type next) { |
1443 | 89.4M | next_ = next; |
1444 | 89.4M | } |
1445 | 46.8M | void set_is_fixed(bool is_fixed) { |
1446 | 46.8M | is_fixed_ = is_fixed; |
1447 | 46.8M | } |
1448 | 26.9M | void set_is_used(bool is_used) { |
1449 | 26.9M | is_used_ = is_used; |
1450 | 26.9M | } |
1451 | | |
1452 | 89.4M | id_type prev() const { |
1453 | 89.4M | return prev_; |
1454 | 89.4M | } |
1455 | 288M | id_type next() const { |
1456 | 288M | return next_; |
1457 | 288M | } |
1458 | 92.8M | bool is_fixed() const { |
1459 | 92.8M | return is_fixed_; |
1460 | 92.8M | } |
1461 | 208M | bool is_used() const { |
1462 | 208M | return is_used_; |
1463 | 208M | } |
1464 | | |
1465 | | private: |
1466 | | id_type prev_; |
1467 | | id_type next_; |
1468 | | bool is_fixed_; |
1469 | | bool is_used_; |
1470 | | |
1471 | | // Copyable. |
1472 | | }; |
1473 | | |
1474 | | // |
1475 | | // DAWG -> double-array converter. |
1476 | | // |
1477 | | |
1478 | | class DoubleArrayBuilder { |
1479 | | public: |
1480 | | explicit DoubleArrayBuilder(progress_func_type progress_func) |
1481 | 57.7k | : progress_func_(progress_func), units_(), extras_(), labels_(), |
1482 | 57.7k | table_(), extras_head_(0) {} |
1483 | 57.7k | ~DoubleArrayBuilder() { |
1484 | 57.7k | clear(); |
1485 | 57.7k | } |
1486 | | |
1487 | | template <typename T> |
1488 | | void build(const Keyset<T> &keyset); |
1489 | | void copy(std::size_t *size_ptr, DoubleArrayUnit **buf_ptr) const; |
1490 | | |
1491 | | void clear(); |
1492 | | |
1493 | | private: |
1494 | | static constexpr int BLOCK_SIZE = 256; |
1495 | | static constexpr int NUM_EXTRA_BLOCKS = 16; |
1496 | | static constexpr int NUM_EXTRAS = BLOCK_SIZE * NUM_EXTRA_BLOCKS; |
1497 | | |
1498 | | static constexpr int UPPER_MASK = 0xFF << 21; |
1499 | | static constexpr int LOWER_MASK = 0xFF; |
1500 | | |
1501 | | typedef DoubleArrayBuilderUnit unit_type; |
1502 | | typedef DoubleArrayBuilderExtraUnit extra_type; |
1503 | | |
1504 | | progress_func_type progress_func_; |
1505 | | AutoPool<unit_type> units_; |
1506 | | AutoArray<extra_type> extras_; |
1507 | | AutoPool<uchar_type> labels_; |
1508 | | AutoArray<id_type> table_; |
1509 | | id_type extras_head_; |
1510 | | |
1511 | | // Disallows copy and assignment. |
1512 | | DoubleArrayBuilder(const DoubleArrayBuilder &); |
1513 | | DoubleArrayBuilder &operator=(const DoubleArrayBuilder &); |
1514 | | |
1515 | 290k | std::size_t num_blocks() const { |
1516 | 290k | return units_.size() / BLOCK_SIZE; |
1517 | 290k | } |
1518 | | |
1519 | 436M | const extra_type &extras(id_type id) const { |
1520 | 436M | return extras_[id % NUM_EXTRAS]; |
1521 | 436M | } |
1522 | 495M | extra_type &extras(id_type id) { |
1523 | 495M | return extras_[id % NUM_EXTRAS]; |
1524 | 495M | } |
1525 | | |
1526 | | template <typename T> |
1527 | | void build_dawg(const Keyset<T> &keyset, DawgBuilder *dawg_builder); |
1528 | | void build_from_dawg(const DawgBuilder &dawg); |
1529 | | void build_from_dawg(const DawgBuilder &dawg, |
1530 | | id_type dawg_id, id_type dic_id); |
1531 | | id_type arrange_from_dawg(const DawgBuilder &dawg, |
1532 | | id_type dawg_id, id_type dic_id); |
1533 | | |
1534 | | template <typename T> |
1535 | | void build_from_keyset(const Keyset<T> &keyset); |
1536 | | template <typename T> |
1537 | | void build_from_keyset(const Keyset<T> &keyset, std::size_t begin, |
1538 | | std::size_t end, std::size_t depth, id_type dic_id); |
1539 | | template <typename T> |
1540 | | id_type arrange_from_keyset(const Keyset<T> &keyset, std::size_t begin, |
1541 | | std::size_t end, std::size_t depth, id_type dic_id); |
1542 | | |
1543 | | id_type find_valid_offset(id_type id) const; |
1544 | | bool is_valid_offset(id_type id, id_type offset) const; |
1545 | | |
1546 | | void reserve_id(id_type id); |
1547 | | void expand_units(); |
1548 | | |
1549 | | void fix_all_blocks(); |
1550 | | void fix_block(id_type block_id); |
1551 | | }; |
1552 | | |
1553 | | template <typename T> |
1554 | 57.7k | void DoubleArrayBuilder::build(const Keyset<T> &keyset) { |
1555 | 57.7k | if (keyset.has_values()) { |
1556 | 48.7k | Details::DawgBuilder dawg_builder; |
1557 | 48.7k | build_dawg(keyset, &dawg_builder); |
1558 | 48.7k | build_from_dawg(dawg_builder); |
1559 | 48.7k | dawg_builder.clear(); |
1560 | 48.7k | } else { |
1561 | 8.96k | build_from_keyset(keyset); |
1562 | 8.96k | } |
1563 | 57.7k | } |
1564 | | |
1565 | | inline void DoubleArrayBuilder::copy(std::size_t *size_ptr, |
1566 | 57.7k | DoubleArrayUnit **buf_ptr) const { |
1567 | 57.7k | if (size_ptr != NULL) { |
1568 | 57.7k | *size_ptr = units_.size(); |
1569 | 57.7k | } |
1570 | 57.7k | if (buf_ptr != NULL) { |
1571 | 57.7k | *buf_ptr = new DoubleArrayUnit[units_.size()]; |
1572 | 57.7k | unit_type *units = reinterpret_cast<unit_type *>(*buf_ptr); |
1573 | 44.6M | for (std::size_t i = 0; i < units_.size(); ++i) { |
1574 | 44.5M | units[i] = units_[i]; |
1575 | 44.5M | } |
1576 | 57.7k | } |
1577 | 57.7k | } |
1578 | | |
1579 | 57.7k | inline void DoubleArrayBuilder::clear() { |
1580 | 57.7k | units_.clear(); |
1581 | 57.7k | extras_.clear(); |
1582 | 57.7k | labels_.clear(); |
1583 | 57.7k | table_.clear(); |
1584 | 57.7k | extras_head_ = 0; |
1585 | 57.7k | } |
1586 | | |
1587 | | template <typename T> |
1588 | | void DoubleArrayBuilder::build_dawg(const Keyset<T> &keyset, |
1589 | 48.7k | DawgBuilder *dawg_builder) { |
1590 | 48.7k | dawg_builder->init(); |
1591 | 7.47M | for (std::size_t i = 0; i < keyset.num_keys(); ++i) { |
1592 | 7.42M | dawg_builder->insert(keyset.keys(i), keyset.lengths(i), keyset.values(i)); |
1593 | 7.42M | if (progress_func_ != NULL) { |
1594 | 0 | progress_func_(i + 1, keyset.num_keys() + 1); |
1595 | 0 | } |
1596 | 7.42M | } |
1597 | 48.7k | dawg_builder->finish(); |
1598 | 48.7k | } |
1599 | | |
1600 | 48.7k | inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg) { |
1601 | 48.7k | std::size_t num_units = 1; |
1602 | 465k | while (num_units < dawg.size()) { |
1603 | 417k | num_units <<= 1; |
1604 | 417k | } |
1605 | 48.7k | units_.reserve(num_units); |
1606 | | |
1607 | 48.7k | table_.reset(new id_type[dawg.num_intersections()]); |
1608 | 48.7k | for (std::size_t i = 0; i < dawg.num_intersections(); ++i) { |
1609 | 0 | table_[i] = 0; |
1610 | 0 | } |
1611 | | |
1612 | 48.7k | extras_.reset(new extra_type[NUM_EXTRAS]); |
1613 | | |
1614 | 48.7k | reserve_id(0); |
1615 | 48.7k | extras(0).set_is_used(true); |
1616 | 48.7k | units_[0].set_offset(1); |
1617 | 48.7k | units_[0].set_label('\0'); |
1618 | | |
1619 | 48.7k | if (dawg.child(dawg.root()) != 0) { |
1620 | 48.7k | build_from_dawg(dawg, dawg.root(), 0); |
1621 | 48.7k | } |
1622 | | |
1623 | 48.7k | fix_all_blocks(); |
1624 | | |
1625 | 48.7k | extras_.clear(); |
1626 | 48.7k | labels_.clear(); |
1627 | 48.7k | table_.clear(); |
1628 | 48.7k | } |
1629 | | |
1630 | | inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg, |
1631 | 24.4M | id_type dawg_id, id_type dic_id) { |
1632 | 24.4M | id_type dawg_child_id = dawg.child(dawg_id); |
1633 | 24.4M | if (dawg.is_intersection(dawg_child_id)) { |
1634 | 0 | id_type intersection_id = dawg.intersection_id(dawg_child_id); |
1635 | 0 | id_type offset = table_[intersection_id]; |
1636 | 0 | if (offset != 0) { |
1637 | 0 | offset ^= dic_id; |
1638 | 0 | if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) { |
1639 | 0 | if (dawg.is_leaf(dawg_child_id)) { |
1640 | 0 | units_[dic_id].set_has_leaf(true); |
1641 | 0 | } |
1642 | 0 | units_[dic_id].set_offset(offset); |
1643 | 0 | return; |
1644 | 0 | } |
1645 | 0 | } |
1646 | 0 | } |
1647 | | |
1648 | 24.4M | id_type offset = arrange_from_dawg(dawg, dawg_id, dic_id); |
1649 | 24.4M | if (dawg.is_intersection(dawg_child_id)) { |
1650 | 0 | table_[dawg.intersection_id(dawg_child_id)] = offset; |
1651 | 0 | } |
1652 | | |
1653 | 31.8M | do { |
1654 | 31.8M | uchar_type child_label = dawg.label(dawg_child_id); |
1655 | 31.8M | id_type dic_child_id = offset ^ child_label; |
1656 | 31.8M | if (child_label != '\0') { |
1657 | 24.4M | build_from_dawg(dawg, dawg_child_id, dic_child_id); |
1658 | 24.4M | } |
1659 | 31.8M | dawg_child_id = dawg.sibling(dawg_child_id); |
1660 | 31.8M | } while (dawg_child_id != 0); |
1661 | 24.4M | } |
1662 | | |
1663 | | inline id_type DoubleArrayBuilder::arrange_from_dawg(const DawgBuilder &dawg, |
1664 | 24.4M | id_type dawg_id, id_type dic_id) { |
1665 | 24.4M | labels_.resize(0); |
1666 | | |
1667 | 24.4M | id_type dawg_child_id = dawg.child(dawg_id); |
1668 | 56.2M | while (dawg_child_id != 0) { |
1669 | 31.8M | labels_.append(dawg.label(dawg_child_id)); |
1670 | 31.8M | dawg_child_id = dawg.sibling(dawg_child_id); |
1671 | 31.8M | } |
1672 | | |
1673 | 24.4M | id_type offset = find_valid_offset(dic_id); |
1674 | 24.4M | units_[dic_id].set_offset(dic_id ^ offset); |
1675 | | |
1676 | 24.4M | dawg_child_id = dawg.child(dawg_id); |
1677 | 56.2M | for (std::size_t i = 0; i < labels_.size(); ++i) { |
1678 | 31.8M | id_type dic_child_id = offset ^ labels_[i]; |
1679 | 31.8M | reserve_id(dic_child_id); |
1680 | | |
1681 | 31.8M | if (dawg.is_leaf(dawg_child_id)) { |
1682 | 7.42M | units_[dic_id].set_has_leaf(true); |
1683 | 7.42M | units_[dic_child_id].set_value(dawg.value(dawg_child_id)); |
1684 | 24.4M | } else { |
1685 | 24.4M | units_[dic_child_id].set_label(labels_[i]); |
1686 | 24.4M | } |
1687 | | |
1688 | 31.8M | dawg_child_id = dawg.sibling(dawg_child_id); |
1689 | 31.8M | } |
1690 | 24.4M | extras(offset).set_is_used(true); |
1691 | | |
1692 | 24.4M | return offset; |
1693 | 24.4M | } |
1694 | | |
1695 | | template <typename T> |
1696 | 8.96k | void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset) { |
1697 | 8.96k | std::size_t num_units = 1; |
1698 | 26.5k | while (num_units < keyset.num_keys()) { |
1699 | 17.6k | num_units <<= 1; |
1700 | 17.6k | } |
1701 | 8.96k | units_.reserve(num_units); |
1702 | | |
1703 | 8.96k | extras_.reset(new extra_type[NUM_EXTRAS]); |
1704 | | |
1705 | 8.96k | reserve_id(0); |
1706 | 8.96k | extras(0).set_is_used(true); |
1707 | 8.96k | units_[0].set_offset(1); |
1708 | 8.96k | units_[0].set_label('\0'); |
1709 | | |
1710 | 8.96k | if (keyset.num_keys() > 0) { |
1711 | 8.96k | build_from_keyset(keyset, 0, keyset.num_keys(), 0, 0); |
1712 | 8.96k | } |
1713 | | |
1714 | 8.96k | fix_all_blocks(); |
1715 | | |
1716 | 8.96k | extras_.clear(); |
1717 | 8.96k | labels_.clear(); |
1718 | 8.96k | } |
1719 | | |
1720 | | template <typename T> |
1721 | | void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset, |
1722 | 132k | std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { |
1723 | 132k | id_type offset = arrange_from_keyset(keyset, begin, end, depth, dic_id); |
1724 | | |
1725 | 175k | while (begin < end) { |
1726 | 132k | if (keyset.keys(begin, depth) != '\0') { |
1727 | 88.7k | break; |
1728 | 88.7k | } |
1729 | 43.5k | ++begin; |
1730 | 43.5k | } |
1731 | 132k | if (begin == end) { |
1732 | 43.5k | return; |
1733 | 43.5k | } |
1734 | | |
1735 | 88.7k | std::size_t last_begin = begin; |
1736 | 88.7k | uchar_type last_label = keyset.keys(begin, depth); |
1737 | 208k | while (++begin < end) { |
1738 | 120k | uchar_type label = keyset.keys(begin, depth); |
1739 | 120k | if (label != last_label) { |
1740 | 34.5k | build_from_keyset(keyset, last_begin, begin, |
1741 | 34.5k | depth + 1, offset ^ last_label); |
1742 | 34.5k | last_begin = begin; |
1743 | 34.5k | last_label = keyset.keys(begin, depth); |
1744 | 34.5k | } |
1745 | 120k | } |
1746 | 88.7k | build_from_keyset(keyset, last_begin, end, depth + 1, offset ^ last_label); |
1747 | 88.7k | } |
1748 | | |
1749 | | template <typename T> |
1750 | | id_type DoubleArrayBuilder::arrange_from_keyset(const Keyset<T> &keyset, |
1751 | 132k | std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { |
1752 | 132k | labels_.resize(0); |
1753 | | |
1754 | 132k | value_type value = -1; |
1755 | 384k | for (std::size_t i = begin; i < end; ++i) { |
1756 | 252k | uchar_type label = keyset.keys(i, depth); |
1757 | 252k | if (label == '\0') { |
1758 | 43.5k | if (keyset.has_lengths() && depth < keyset.lengths(i)) { |
1759 | 0 | DARTS_THROW("failed to build double-array: " |
1760 | 0 | "invalid null character"); |
1761 | 43.5k | } else if (keyset.values(i) < 0) { |
1762 | 0 | DARTS_THROW("failed to build double-array: negative value"); |
1763 | 0 | } |
1764 | | |
1765 | 43.5k | if (value == -1) { |
1766 | 43.5k | value = keyset.values(i); |
1767 | 43.5k | } |
1768 | 43.5k | if (progress_func_ != NULL) { |
1769 | 0 | progress_func_(i + 1, keyset.num_keys() + 1); |
1770 | 0 | } |
1771 | 43.5k | } |
1772 | | |
1773 | 252k | if (labels_.empty()) { |
1774 | 132k | labels_.append(label); |
1775 | 132k | } else if (label != labels_[labels_.size() - 1]) { |
1776 | 34.5k | if (label < labels_[labels_.size() - 1]) { |
1777 | 0 | DARTS_THROW("failed to build double-array: wrong key order"); |
1778 | 0 | } |
1779 | 34.5k | labels_.append(label); |
1780 | 34.5k | } |
1781 | 252k | } |
1782 | | |
1783 | 132k | id_type offset = find_valid_offset(dic_id); |
1784 | 132k | units_[dic_id].set_offset(dic_id ^ offset); |
1785 | | |
1786 | 299k | for (std::size_t i = 0; i < labels_.size(); ++i) { |
1787 | 166k | id_type dic_child_id = offset ^ labels_[i]; |
1788 | 166k | reserve_id(dic_child_id); |
1789 | 166k | if (labels_[i] == '\0') { |
1790 | 43.5k | units_[dic_id].set_has_leaf(true); |
1791 | 43.5k | units_[dic_child_id].set_value(value); |
1792 | 123k | } else { |
1793 | 123k | units_[dic_child_id].set_label(labels_[i]); |
1794 | 123k | } |
1795 | 166k | } |
1796 | 132k | extras(offset).set_is_used(true); |
1797 | | |
1798 | 132k | return offset; |
1799 | 132k | } |
1800 | | |
1801 | 24.5M | inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const { |
1802 | 24.5M | if (extras_head_ >= units_.size()) { |
1803 | 90 | return units_.size() | (id & LOWER_MASK); |
1804 | 90 | } |
1805 | | |
1806 | 24.5M | id_type unfixed_id = extras_head_; |
1807 | 206M | do { |
1808 | 206M | id_type offset = unfixed_id ^ labels_[0]; |
1809 | 206M | if (is_valid_offset(id, offset)) { |
1810 | 24.4M | return offset; |
1811 | 24.4M | } |
1812 | 181M | unfixed_id = extras(unfixed_id).next(); |
1813 | 181M | } while (unfixed_id != extras_head_); |
1814 | | |
1815 | 116k | return units_.size() | (id & LOWER_MASK); |
1816 | 24.5M | } |
1817 | | |
1818 | | inline bool DoubleArrayBuilder::is_valid_offset(id_type id, |
1819 | 206M | id_type offset) const { |
1820 | 206M | if (extras(offset).is_used()) { |
1821 | 156M | return false; |
1822 | 156M | } |
1823 | | |
1824 | 49.7M | id_type rel_offset = id ^ offset; |
1825 | 49.7M | if ((rel_offset & LOWER_MASK) && (rel_offset & UPPER_MASK)) { |
1826 | 0 | return false; |
1827 | 0 | } |
1828 | | |
1829 | 72.7M | for (std::size_t i = 1; i < labels_.size(); ++i) { |
1830 | 48.2M | if (extras(offset ^ labels_[i]).is_fixed()) { |
1831 | 25.2M | return false; |
1832 | 25.2M | } |
1833 | 48.2M | } |
1834 | | |
1835 | 24.4M | return true; |
1836 | 49.7M | } |
1837 | | |
1838 | 44.5M | inline void DoubleArrayBuilder::reserve_id(id_type id) { |
1839 | 44.5M | if (id >= units_.size()) { |
1840 | 174k | expand_units(); |
1841 | 174k | } |
1842 | | |
1843 | 44.5M | if (id == extras_head_) { |
1844 | 17.7M | extras_head_ = extras(id).next(); |
1845 | 17.7M | if (extras_head_ == id) { |
1846 | 57.8k | extras_head_ = units_.size(); |
1847 | 57.8k | } |
1848 | 17.7M | } |
1849 | 44.5M | extras(extras(id).prev()).set_next(extras(id).next()); |
1850 | 44.5M | extras(extras(id).next()).set_prev(extras(id).prev()); |
1851 | 44.5M | extras(id).set_is_fixed(true); |
1852 | 44.5M | } |
1853 | | |
1854 | 174k | inline void DoubleArrayBuilder::expand_units() { |
1855 | 174k | id_type src_num_units = units_.size(); |
1856 | 174k | id_type src_num_blocks = num_blocks(); |
1857 | | |
1858 | 174k | id_type dest_num_units = src_num_units + BLOCK_SIZE; |
1859 | 174k | id_type dest_num_blocks = src_num_blocks + 1; |
1860 | | |
1861 | 174k | if (dest_num_blocks > NUM_EXTRA_BLOCKS) { |
1862 | 9.08k | fix_block(src_num_blocks - NUM_EXTRA_BLOCKS); |
1863 | 9.08k | } |
1864 | | |
1865 | 174k | units_.resize(dest_num_units); |
1866 | | |
1867 | 174k | if (dest_num_blocks > NUM_EXTRA_BLOCKS) { |
1868 | 2.33M | for (std::size_t id = src_num_units; id < dest_num_units; ++id) { |
1869 | 2.32M | extras(id).set_is_used(false); |
1870 | 2.32M | extras(id).set_is_fixed(false); |
1871 | 2.32M | } |
1872 | 9.08k | } |
1873 | | |
1874 | 44.5M | for (id_type i = src_num_units + 1; i < dest_num_units; ++i) { |
1875 | 44.3M | extras(i - 1).set_next(i); |
1876 | 44.3M | extras(i).set_prev(i - 1); |
1877 | 44.3M | } |
1878 | | |
1879 | 174k | extras(src_num_units).set_prev(dest_num_units - 1); |
1880 | 174k | extras(dest_num_units - 1).set_next(src_num_units); |
1881 | | |
1882 | 174k | extras(src_num_units).set_prev(extras(extras_head_).prev()); |
1883 | 174k | extras(dest_num_units - 1).set_next(extras_head_); |
1884 | | |
1885 | 174k | extras(extras(extras_head_).prev()).set_next(src_num_units); |
1886 | 174k | extras(extras_head_).set_prev(dest_num_units - 1); |
1887 | 174k | } |
1888 | | |
1889 | 57.7k | inline void DoubleArrayBuilder::fix_all_blocks() { |
1890 | 57.7k | id_type begin = 0; |
1891 | 57.7k | if (num_blocks() > NUM_EXTRA_BLOCKS) { |
1892 | 705 | begin = num_blocks() - NUM_EXTRA_BLOCKS; |
1893 | 705 | } |
1894 | 57.7k | id_type end = num_blocks(); |
1895 | | |
1896 | 222k | for (id_type block_id = begin; block_id != end; ++block_id) { |
1897 | 164k | fix_block(block_id); |
1898 | 164k | } |
1899 | 57.7k | } |
1900 | | |
1901 | 174k | inline void DoubleArrayBuilder::fix_block(id_type block_id) { |
1902 | 174k | id_type begin = block_id * BLOCK_SIZE; |
1903 | 174k | id_type end = begin + BLOCK_SIZE; |
1904 | | |
1905 | 174k | id_type unused_offset = 0; |
1906 | 2.20M | for (id_type offset = begin; offset != end; ++offset) { |
1907 | 2.20M | if (!extras(offset).is_used()) { |
1908 | 174k | unused_offset = offset; |
1909 | 174k | break; |
1910 | 174k | } |
1911 | 2.20M | } |
1912 | | |
1913 | 44.7M | for (id_type id = begin; id != end; ++id) { |
1914 | 44.5M | if (!extras(id).is_fixed()) { |
1915 | 12.5M | reserve_id(id); |
1916 | 12.5M | units_[id].set_label(static_cast<uchar_type>(id ^ unused_offset)); |
1917 | 12.5M | } |
1918 | 44.5M | } |
1919 | 174k | } |
1920 | | |
1921 | | } // namespace Details |
1922 | | |
1923 | | // |
1924 | | // Member function build() of DoubleArrayImpl. |
1925 | | // |
1926 | | |
1927 | | template <typename A, typename B, typename T, typename C> |
1928 | | int DoubleArrayImpl<A, B, T, C>::build(std::size_t num_keys, |
1929 | | const key_type * const *keys, const std::size_t *lengths, |
1930 | 57.7k | const value_type *values, Details::progress_func_type progress_func) { |
1931 | 57.7k | Details::Keyset<value_type> keyset(num_keys, keys, lengths, values); |
1932 | | |
1933 | 57.7k | Details::DoubleArrayBuilder builder(progress_func); |
1934 | 57.7k | builder.build(keyset); |
1935 | | |
1936 | 57.7k | std::size_t size = 0; |
1937 | 57.7k | unit_type *buf = NULL; |
1938 | 57.7k | builder.copy(&size, &buf); |
1939 | | |
1940 | 57.7k | clear(); |
1941 | | |
1942 | 57.7k | size_ = size; |
1943 | 57.7k | array_ = buf; |
1944 | 57.7k | buf_ = buf; |
1945 | | |
1946 | 57.7k | if (progress_func != NULL) { |
1947 | 0 | progress_func(num_keys + 1, num_keys + 1); |
1948 | 0 | } |
1949 | | |
1950 | 57.7k | return 0; |
1951 | 57.7k | } |
1952 | | |
1953 | | } // namespace Darts |
1954 | | |
1955 | | #undef DARTS_INT_TO_STR |
1956 | | #undef DARTS_LINE_TO_STR |
1957 | | #undef DARTS_LINE_STR |
1958 | | #undef DARTS_THROW |
1959 | | |
1960 | | #endif // DARTS_H_ |