/src/sentencepiece/third_party/darts_clone/darts.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef DARTS_H_ |
2 | | #define DARTS_H_ |
3 | | |
4 | | #include <cstdio> |
5 | | #include <exception> |
6 | | #include <new> |
7 | | |
8 | | #define DARTS_VERSION "0.32" |
9 | | |
10 | | // DARTS_THROW() throws a <Darts::Exception> whose message starts with the |
11 | | // file name and the line number. For example, DARTS_THROW("error message") at |
12 | | // line 123 of "darts.h" throws a <Darts::Exception> which has a pointer to |
13 | | // "darts.h:123: exception: error message". The message is available by using |
14 | | // what() as well as that of <std::exception>. |
15 | | #define DARTS_INT_TO_STR(value) #value |
16 | | #define DARTS_LINE_TO_STR(line) DARTS_INT_TO_STR(line) |
17 | | #define DARTS_LINE_STR DARTS_LINE_TO_STR(__LINE__) |
18 | 0 | #define DARTS_THROW(msg) throw Darts::Details::Exception( \ |
19 | 0 | __FILE__ ":" DARTS_LINE_STR ": exception: " msg) |
20 | | |
21 | | namespace Darts { |
22 | | |
23 | | // The following namespace hides the internal types and classes. |
24 | | namespace Details { |
25 | | |
26 | | // This header assumes that <int> and <unsigned int> are 32-bit integer types. |
27 | | // |
28 | | // Darts-clone keeps values associated with keys. The type of the values is |
29 | | // <value_type>. Note that the values must be positive integers because the |
30 | | // most significant bit (MSB) of each value is used to represent whether the |
31 | | // corresponding unit is a leaf or not. Also, the keys are represented by |
32 | | // sequences of <char_type>s. <uchar_type> is the unsigned type of <char_type>. |
33 | | typedef char char_type; |
34 | | typedef unsigned char uchar_type; |
35 | | typedef int value_type; |
36 | | |
37 | | // The main structure of Darts-clone is an array of <DoubleArrayUnit>s, and the |
38 | | // unit type is actually a wrapper of <id_type>. |
39 | | typedef unsigned int id_type; |
40 | | |
41 | | // <progress_func_type> is the type of callback functions for reporting the |
42 | | // progress of building a dictionary. See also build() of <DoubleArray>. |
43 | | // The 1st argument receives the progress value and the 2nd argument receives |
44 | | // the maximum progress value. A usage example is to show the progress |
45 | | // percentage, 100.0 * (the 1st argument) / (the 2nd argument). |
46 | | typedef int (*progress_func_type)(std::size_t, std::size_t); |
47 | | |
48 | | // <DoubleArrayUnit> is the type of double-array units and it is a wrapper of |
49 | | // <id_type> in practice. |
50 | | class DoubleArrayUnit { |
51 | | public: |
52 | 0 | DoubleArrayUnit() : unit_() {} |
53 | | |
54 | | // has_leaf() returns whether a leaf unit is immediately derived from the |
55 | | // unit (true) or not (false). |
56 | 0 | bool has_leaf() const { |
57 | 0 | return ((unit_ >> 8) & 1) == 1; |
58 | 0 | } |
59 | | // value() returns the value stored in the unit, and thus value() is |
60 | | // available when and only when the unit is a leaf unit. |
61 | 0 | value_type value() const { |
62 | 0 | return static_cast<value_type>(unit_ & ((1U << 31) - 1)); |
63 | 0 | } |
64 | | |
65 | | // label() returns the label associted with the unit. Note that a leaf unit |
66 | | // always returns an invalid label. For this feature, leaf unit's label() |
67 | | // returns an <id_type> that has the MSB of 1. |
68 | 0 | id_type label() const { |
69 | 0 | return unit_ & ((1U << 31) | 0xFF); |
70 | 0 | } |
71 | | // offset() returns the offset from the unit to its derived units. |
72 | 0 | id_type offset() const { |
73 | 0 | return (unit_ >> 10) << ((unit_ & (1U << 9)) >> 6); |
74 | 0 | } |
75 | | |
76 | | private: |
77 | | id_type unit_; |
78 | | |
79 | | // Copyable. |
80 | | }; |
81 | | |
82 | | // Darts-clone throws an <Exception> for memory allocation failure, invalid |
83 | | // arguments or a too large offset. The last case means that there are too many |
84 | | // keys in the given set of keys. Note that the `msg' of <Exception> must be a |
85 | | // constant or static string because an <Exception> keeps only a pointer to |
86 | | // that string. |
87 | | class Exception : public std::exception { |
88 | | public: |
89 | 0 | explicit Exception(const char *msg = NULL) throw() : msg_(msg) {} |
90 | 0 | Exception(const Exception &rhs) throw() : msg_(rhs.msg_) {} |
91 | 0 | virtual ~Exception() throw() {} |
92 | | |
93 | | // <Exception> overrides what() of <std::exception>. |
94 | 0 | virtual const char *what() const throw() { |
95 | 0 | return (msg_ != NULL) ? msg_ : ""; |
96 | 0 | } |
97 | | |
98 | | private: |
99 | | const char *msg_; |
100 | | |
101 | | // Disallows operator=. |
102 | | Exception &operator=(const Exception &); |
103 | | }; |
104 | | |
105 | | } // namespace Details |
106 | | |
107 | | // <DoubleArrayImpl> is the interface of Darts-clone. Note that other |
108 | | // classes should not be accessed from outside. |
109 | | // |
110 | | // <DoubleArrayImpl> has 4 template arguments but only the 3rd one is used as |
111 | | // the type of values. Note that the given <T> is used only from outside, and |
112 | | // the internal value type is not changed from <Darts::Details::value_type>. |
113 | | // In build(), given values are casted from <T> to <Darts::Details::value_type> |
114 | | // by using static_cast. On the other hand, values are casted from |
115 | | // <Darts::Details::value_type> to <T> in searching dictionaries. |
116 | | template <typename, typename, typename T, typename> |
117 | | class DoubleArrayImpl { |
118 | | public: |
119 | | // Even if this <value_type> is changed, the internal value type is still |
120 | | // <Darts::Details::value_type>. Other types, such as 64-bit integer types |
121 | | // and floating-point number types, should not be used. |
122 | | typedef T value_type; |
123 | | // A key is reprenseted by a sequence of <key_type>s. For example, |
124 | | // exactMatchSearch() takes a <const key_type *>. |
125 | | typedef Details::char_type key_type; |
126 | | // In searching dictionaries, the values associated with the matched keys are |
127 | | // stored into or returned as <result_type>s. |
128 | | typedef value_type result_type; |
129 | | |
130 | | // <result_pair_type> enables applications to get the lengths of the matched |
131 | | // keys in addition to the values. |
132 | | struct result_pair_type { |
133 | | value_type value; |
134 | | std::size_t length; |
135 | | }; |
136 | | |
137 | | // The constructor initializes member variables with 0 and NULLs. |
138 | 0 | DoubleArrayImpl() : size_(0), array_(NULL), buf_(NULL) {} |
139 | | // The destructor frees memory allocated for units and then initializes |
140 | | // member variables with 0 and NULLs. |
141 | 0 | virtual ~DoubleArrayImpl() { |
142 | 0 | clear(); |
143 | 0 | } |
144 | | |
145 | | // <DoubleArrayImpl> has 2 kinds of set_result()s. The 1st set_result() is to |
146 | | // set a value to a <value_type>. The 2nd set_result() is to set a value and |
147 | | // a length to a <result_pair_type>. By using set_result()s, search methods |
148 | | // can return the 2 kinds of results in the same way. |
149 | | // Why the set_result()s are non-static? It is for compatibility. |
150 | | // |
151 | | // The 1st set_result() takes a length as the 3rd argument but it is not |
152 | | // used. If a compiler does a good job, codes for getting the length may be |
153 | | // removed. |
154 | 0 | void set_result(value_type *result, value_type value, std::size_t) const { |
155 | 0 | *result = value; |
156 | 0 | } |
157 | | // The 2nd set_result() uses both `value' and `length'. |
158 | | void set_result(result_pair_type *result, |
159 | 0 | value_type value, std::size_t length) const { |
160 | 0 | result->value = value; |
161 | 0 | result->length = length; |
162 | 0 | } |
163 | | |
164 | | // set_array() calls clear() in order to free memory allocated to the old |
165 | | // array and then sets a new array. This function is useful to set a memory- |
166 | | // mapped array. Note that the array set by set_array() is not freed in |
167 | | // clear() and the destructor of <DoubleArrayImpl>. |
168 | | // set_array() can also set the size of the new array but the size is not |
169 | | // used in search methods. So it works well even if the 2nd argument is 0 or |
170 | | // omitted. Remember that size() and total_size() returns 0 in such a case. |
171 | 0 | void set_array(const void *ptr, std::size_t size = 0) { |
172 | 0 | clear(); |
173 | 0 | array_ = static_cast<const unit_type *>(ptr); |
174 | 0 | size_ = size; |
175 | 0 | } |
176 | | // array() returns a pointer to the array of units. |
177 | | const void *array() const { |
178 | | return array_; |
179 | | } |
180 | | |
181 | | // clear() frees memory allocated to units and then initializes member |
182 | | // variables with 0 and NULLs. Note that clear() does not free memory if the |
183 | | // array of units was set by set_array(). In such a case, `array_' is not |
184 | | // NULL and `buf_' is NULL. |
185 | 0 | void clear() { |
186 | 0 | size_ = 0; |
187 | 0 | array_ = NULL; |
188 | 0 | if (buf_ != NULL) { |
189 | 0 | delete[] buf_; |
190 | 0 | buf_ = NULL; |
191 | 0 | } |
192 | 0 | } |
193 | | |
194 | | // unit_size() returns the size of each unit. The size must be 4 bytes. |
195 | 0 | std::size_t unit_size() const { |
196 | 0 | return sizeof(unit_type); |
197 | 0 | } |
198 | | // size() returns the number of units. It can be 0 if set_array() is used. |
199 | | std::size_t size() const { |
200 | | return size_; |
201 | | } |
202 | | // total_size() returns the number of bytes allocated to the array of units. |
203 | | // It can be 0 if set_array() is used. |
204 | | std::size_t total_size() const { |
205 | | return unit_size() * size(); |
206 | | } |
207 | | // nonzero_size() exists for compatibility. It always returns the number of |
208 | | // units because it takes long time to count the number of non-zero units. |
209 | | std::size_t nonzero_size() const { |
210 | | return size(); |
211 | | } |
212 | | |
213 | | // build() constructs a dictionary from given key-value pairs. If `lengths' |
214 | | // is NULL, `keys' is handled as an array of zero-terminated strings. If |
215 | | // `values' is NULL, the index in `keys' is associated with each key, i.e. |
216 | | // the ith key has (i - 1) as its value. |
217 | | // Note that the key-value pairs must be arranged in key order and the values |
218 | | // must not be negative. Also, if there are duplicate keys, only the first |
219 | | // pair will be stored in the resultant dictionary. |
220 | | // `progress_func' is a pointer to a callback function. If it is not NULL, |
221 | | // it will be called in build() so that the caller can check the progress of |
222 | | // dictionary construction. For details, please see the definition of |
223 | | // <Darts::Details::progress_func_type>. |
224 | | // The return value of build() is 0, and it indicates the success of the |
225 | | // operation. Otherwise, build() throws a <Darts::Exception>, which is a |
226 | | // derived class of <std::exception>. |
227 | | // build() uses another construction algorithm if `values' is not NULL. In |
228 | | // this case, Darts-clone uses a Directed Acyclic Word Graph (DAWG) instead |
229 | | // of a trie because a DAWG is likely to be more compact than a trie. |
230 | | int build(std::size_t num_keys, const key_type * const *keys, |
231 | | const std::size_t *lengths = NULL, const value_type *values = NULL, |
232 | | Details::progress_func_type progress_func = NULL); |
233 | | |
234 | | // open() reads an array of units from the specified file. And if it goes |
235 | | // well, the old array will be freed and replaced with the new array read |
236 | | // from the file. `offset' specifies the number of bytes to be skipped before |
237 | | // reading an array. `size' specifies the number of bytes to be read from the |
238 | | // file. If the `size' is 0, the whole file will be read. |
239 | | // open() returns 0 iff the operation succeeds. Otherwise, it returns a |
240 | | // non-zero value or throws a <Darts::Exception>. The exception is thrown |
241 | | // when and only when a memory allocation fails. |
242 | | int open(const char *file_name, const char *mode = "rb", |
243 | | std::size_t offset = 0, std::size_t size = 0); |
244 | | // save() writes the array of units into the specified file. `offset' |
245 | | // specifies the number of bytes to be skipped before writing the array. |
246 | | // open() returns 0 iff the operation succeeds. Otherwise, it returns a |
247 | | // non-zero value. |
248 | | int save(const char *file_name, const char *mode = "wb", |
249 | | std::size_t offset = 0) const; |
250 | | |
251 | | // The 1st exactMatchSearch() tests whether the given key exists or not, and |
252 | | // if it exists, its value and length are set to `result'. Otherwise, the |
253 | | // value and the length of `result' are set to -1 and 0 respectively. |
254 | | // Note that if `length' is 0, `key' is handled as a zero-terminated string. |
255 | | // `node_pos' specifies the start position of matching. This argument enables |
256 | | // the combination of exactMatchSearch() and traverse(). For example, if you |
257 | | // want to test "xyzA", "xyzBC", and "xyzDE", you can use traverse() to get |
258 | | // the node position corresponding to "xyz" and then you can use |
259 | | // exactMatchSearch() to test "A", "BC", and "DE" from that position. |
260 | | // Note that the length of `result' indicates the length from the `node_pos'. |
261 | | // In the above example, the lengths are { 1, 2, 2 }, not { 4, 5, 5 }. |
262 | | template <class U> |
263 | | void exactMatchSearch(const key_type *key, U &result, |
264 | 0 | std::size_t length = 0, std::size_t node_pos = 0) const { |
265 | 0 | result = exactMatchSearch<U>(key, length, node_pos); |
266 | 0 | } |
267 | | // The 2nd exactMatchSearch() returns a result instead of updating the 2nd |
268 | | // argument. So, the following exactMatchSearch() has only 3 arguments. |
269 | | template <class U> |
270 | | inline U exactMatchSearch(const key_type *key, std::size_t length = 0, |
271 | | std::size_t node_pos = 0) const; |
272 | | |
273 | | // commonPrefixSearch() searches for keys which match a prefix of the given |
274 | | // string. If `length' is 0, `key' is handled as a zero-terminated string. |
275 | | // The values and the lengths of at most `max_num_results' matched keys are |
276 | | // stored in `results'. commonPrefixSearch() returns the number of matched |
277 | | // keys. Note that the return value can be larger than `max_num_results' if |
278 | | // there are more than `max_num_results' matches. If you want to get all the |
279 | | // results, allocate more spaces and call commonPrefixSearch() again. |
280 | | // `node_pos' works as well as in exactMatchSearch(). |
281 | | template <class U> |
282 | | inline std::size_t commonPrefixSearch(const key_type *key, U *results, |
283 | | std::size_t max_num_results, std::size_t length = 0, |
284 | | std::size_t node_pos = 0) const; |
285 | | |
286 | | // In Darts-clone, a dictionary is a deterministic finite-state automaton |
287 | | // (DFA) and traverse() tests transitions on the DFA. The initial state is |
288 | | // `node_pos' and traverse() chooses transitions labeled key[key_pos], |
289 | | // key[key_pos + 1], ... in order. If there is not a transition labeled |
290 | | // key[key_pos + i], traverse() terminates the transitions at that state and |
291 | | // returns -2. Otherwise, traverse() ends without a termination and returns |
292 | | // -1 or a nonnegative value, -1 indicates that the final state was not an |
293 | | // accept state. When a nonnegative value is returned, it is the value |
294 | | // associated with the final accept state. That is, traverse() returns the |
295 | | // value associated with the given key if it exists. Note that traverse() |
296 | | // updates `node_pos' and `key_pos' after each transition. |
297 | | inline value_type traverse(const key_type *key, std::size_t &node_pos, |
298 | | std::size_t &key_pos, std::size_t length = 0) const; |
299 | | |
300 | | private: |
301 | | typedef Details::uchar_type uchar_type; |
302 | | typedef Details::id_type id_type; |
303 | | typedef Details::DoubleArrayUnit unit_type; |
304 | | |
305 | | std::size_t size_; |
306 | | const unit_type *array_; |
307 | | unit_type *buf_; |
308 | | |
309 | | // Disallows copy and assignment. |
310 | | DoubleArrayImpl(const DoubleArrayImpl &); |
311 | | DoubleArrayImpl &operator=(const DoubleArrayImpl &); |
312 | | }; |
313 | | |
314 | | // <DoubleArray> is the typical instance of <DoubleArrayImpl>. It uses <int> |
315 | | // as the type of values and it is suitable for most cases. |
316 | | typedef DoubleArrayImpl<void, void, int, void> DoubleArray; |
317 | | |
318 | | // The interface section ends here. For using Darts-clone, there is no need |
319 | | // to read the remaining section, which gives the implementation of |
320 | | // Darts-clone. |
321 | | |
322 | | // |
323 | | // Member functions of DoubleArrayImpl (except build()). |
324 | | // |
325 | | |
326 | | template <typename A, typename B, typename T, typename C> |
327 | | int DoubleArrayImpl<A, B, T, C>::open(const char *file_name, |
328 | | const char *mode, std::size_t offset, std::size_t size) { |
329 | | #ifdef _MSC_VER |
330 | | std::FILE *file; |
331 | | if (::fopen_s(&file, file_name, mode) != 0) { |
332 | | return -1; |
333 | | } |
334 | | #else |
335 | | std::FILE *file = std::fopen(file_name, mode); |
336 | | if (file == NULL) { |
337 | | return -1; |
338 | | } |
339 | | #endif |
340 | | |
341 | | if (size == 0) { |
342 | | if (std::fseek(file, 0, SEEK_END) != 0) { |
343 | | std::fclose(file); |
344 | | return -1; |
345 | | } |
346 | | size = std::ftell(file) - offset; |
347 | | } |
348 | | |
349 | | size /= unit_size(); |
350 | | if (size < 256 || (size & 0xFF) != 0) { |
351 | | std::fclose(file); |
352 | | return -1; |
353 | | } |
354 | | |
355 | | if (std::fseek(file, offset, SEEK_SET) != 0) { |
356 | | std::fclose(file); |
357 | | return -1; |
358 | | } |
359 | | |
360 | | unit_type units[256]; |
361 | | if (std::fread(units, unit_size(), 256, file) != 256) { |
362 | | std::fclose(file); |
363 | | return -1; |
364 | | } |
365 | | |
366 | | if (units[0].label() != '\0' || units[0].has_leaf() || |
367 | | units[0].offset() == 0 || units[0].offset() >= 512) { |
368 | | std::fclose(file); |
369 | | return -1; |
370 | | } |
371 | | for (id_type i = 1; i < 256; ++i) { |
372 | | if (units[i].label() <= 0xFF && units[i].offset() >= size) { |
373 | | std::fclose(file); |
374 | | return -1; |
375 | | } |
376 | | } |
377 | | |
378 | | unit_type *buf; |
379 | | try { |
380 | | buf = new unit_type[size]; |
381 | | for (id_type i = 0; i < 256; ++i) { |
382 | | buf[i] = units[i]; |
383 | | } |
384 | | } catch (const std::bad_alloc &) { |
385 | | std::fclose(file); |
386 | | DARTS_THROW("failed to open double-array: std::bad_alloc"); |
387 | | } |
388 | | |
389 | | if (size > 256) { |
390 | | if (std::fread(buf + 256, unit_size(), size - 256, file) != size - 256) { |
391 | | std::fclose(file); |
392 | | delete[] buf; |
393 | | return -1; |
394 | | } |
395 | | } |
396 | | std::fclose(file); |
397 | | |
398 | | clear(); |
399 | | |
400 | | size_ = size; |
401 | | array_ = buf; |
402 | | buf_ = buf; |
403 | | return 0; |
404 | | } |
405 | | |
406 | | template <typename A, typename B, typename T, typename C> |
407 | | int DoubleArrayImpl<A, B, T, C>::save(const char *file_name, |
408 | | const char *mode, std::size_t) const { |
409 | | if (size() == 0) { |
410 | | return -1; |
411 | | } |
412 | | |
413 | | #ifdef _MSC_VER |
414 | | std::FILE *file; |
415 | | if (::fopen_s(&file, file_name, mode) != 0) { |
416 | | return -1; |
417 | | } |
418 | | #else |
419 | | std::FILE *file = std::fopen(file_name, mode); |
420 | | if (file == NULL) { |
421 | | return -1; |
422 | | } |
423 | | #endif |
424 | | |
425 | | if (std::fwrite(array_, unit_size(), size(), file) != size()) { |
426 | | std::fclose(file); |
427 | | return -1; |
428 | | } |
429 | | std::fclose(file); |
430 | | return 0; |
431 | | } |
432 | | |
433 | | template <typename A, typename B, typename T, typename C> |
434 | | template <typename U> |
435 | | inline U DoubleArrayImpl<A, B, T, C>::exactMatchSearch(const key_type *key, |
436 | 0 | std::size_t length, std::size_t node_pos) const { |
437 | 0 | U result; |
438 | 0 | set_result(&result, static_cast<value_type>(-1), 0); |
439 | |
|
440 | 0 | unit_type unit = array_[node_pos]; |
441 | 0 | if (length != 0) { |
442 | 0 | for (std::size_t i = 0; i < length; ++i) { |
443 | 0 | node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[i]); |
444 | 0 | unit = array_[node_pos]; |
445 | 0 | if (unit.label() != static_cast<uchar_type>(key[i])) { |
446 | 0 | return result; |
447 | 0 | } |
448 | 0 | } |
449 | 0 | } else { |
450 | 0 | for ( ; key[length] != '\0'; ++length) { |
451 | 0 | node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[length]); |
452 | 0 | unit = array_[node_pos]; |
453 | 0 | if (unit.label() != static_cast<uchar_type>(key[length])) { |
454 | 0 | return result; |
455 | 0 | } |
456 | 0 | } |
457 | 0 | } |
458 | | |
459 | 0 | if (!unit.has_leaf()) { |
460 | 0 | return result; |
461 | 0 | } |
462 | 0 | unit = array_[node_pos ^ unit.offset()]; |
463 | 0 | set_result(&result, static_cast<value_type>(unit.value()), length); |
464 | 0 | return result; |
465 | 0 | } |
466 | | |
467 | | template <typename A, typename B, typename T, typename C> |
468 | | template <typename U> |
469 | | inline std::size_t DoubleArrayImpl<A, B, T, C>::commonPrefixSearch( |
470 | | const key_type *key, U *results, std::size_t max_num_results, |
471 | 0 | std::size_t length, std::size_t node_pos) const { |
472 | 0 | std::size_t num_results = 0; |
473 | |
|
474 | 0 | unit_type unit = array_[node_pos]; |
475 | 0 | node_pos ^= unit.offset(); |
476 | 0 | if (length != 0) { |
477 | 0 | for (std::size_t i = 0; i < length; ++i) { |
478 | 0 | node_pos ^= static_cast<uchar_type>(key[i]); |
479 | 0 | unit = array_[node_pos]; |
480 | 0 | if (unit.label() != static_cast<uchar_type>(key[i])) { |
481 | 0 | return num_results; |
482 | 0 | } |
483 | | |
484 | 0 | node_pos ^= unit.offset(); |
485 | 0 | if (unit.has_leaf()) { |
486 | 0 | if (num_results < max_num_results) { |
487 | 0 | set_result(&results[num_results], static_cast<value_type>( |
488 | 0 | array_[node_pos].value()), i + 1); |
489 | 0 | } |
490 | 0 | ++num_results; |
491 | 0 | } |
492 | 0 | } |
493 | 0 | } else { |
494 | 0 | for ( ; key[length] != '\0'; ++length) { |
495 | 0 | node_pos ^= static_cast<uchar_type>(key[length]); |
496 | 0 | unit = array_[node_pos]; |
497 | 0 | if (unit.label() != static_cast<uchar_type>(key[length])) { |
498 | 0 | return num_results; |
499 | 0 | } |
500 | | |
501 | 0 | node_pos ^= unit.offset(); |
502 | 0 | if (unit.has_leaf()) { |
503 | 0 | if (num_results < max_num_results) { |
504 | 0 | set_result(&results[num_results], static_cast<value_type>( |
505 | 0 | array_[node_pos].value()), length + 1); |
506 | 0 | } |
507 | 0 | ++num_results; |
508 | 0 | } |
509 | 0 | } |
510 | 0 | } |
511 | | |
512 | 0 | return num_results; |
513 | 0 | } |
514 | | |
515 | | template <typename A, typename B, typename T, typename C> |
516 | | inline typename DoubleArrayImpl<A, B, T, C>::value_type |
517 | | DoubleArrayImpl<A, B, T, C>::traverse(const key_type *key, |
518 | 0 | std::size_t &node_pos, std::size_t &key_pos, std::size_t length) const { |
519 | 0 | id_type id = static_cast<id_type>(node_pos); |
520 | 0 | unit_type unit = array_[id]; |
521 | |
|
522 | 0 | if (length != 0) { |
523 | 0 | for ( ; key_pos < length; ++key_pos) { |
524 | 0 | id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]); |
525 | 0 | unit = array_[id]; |
526 | 0 | if (unit.label() != static_cast<uchar_type>(key[key_pos])) { |
527 | 0 | return static_cast<value_type>(-2); |
528 | 0 | } |
529 | 0 | node_pos = id; |
530 | 0 | } |
531 | 0 | } else { |
532 | 0 | for ( ; key[key_pos] != '\0'; ++key_pos) { |
533 | 0 | id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]); |
534 | 0 | unit = array_[id]; |
535 | 0 | if (unit.label() != static_cast<uchar_type>(key[key_pos])) { |
536 | 0 | return static_cast<value_type>(-2); |
537 | 0 | } |
538 | 0 | node_pos = id; |
539 | 0 | } |
540 | 0 | } |
541 | | |
542 | 0 | if (!unit.has_leaf()) { |
543 | 0 | return static_cast<value_type>(-1); |
544 | 0 | } |
545 | 0 | unit = array_[id ^ unit.offset()]; |
546 | 0 | return static_cast<value_type>(unit.value()); |
547 | 0 | } |
548 | | |
549 | | namespace Details { |
550 | | |
551 | | // |
552 | | // Memory management of array. |
553 | | // |
554 | | |
555 | | template <typename T> |
556 | | class AutoArray { |
557 | | public: |
558 | 0 | explicit AutoArray(T *array = NULL) : array_(array) {} Unexecuted instantiation: Darts::Details::AutoArray<char>::AutoArray(char*) Unexecuted instantiation: Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::AutoArray(Darts::Details::DoubleArrayBuilderExtraUnit*) Unexecuted instantiation: Darts::Details::AutoArray<unsigned int>::AutoArray(unsigned int*) |
559 | 0 | ~AutoArray() { |
560 | 0 | clear(); |
561 | 0 | } Unexecuted instantiation: Darts::Details::AutoArray<char>::~AutoArray() Unexecuted instantiation: Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::~AutoArray() Unexecuted instantiation: Darts::Details::AutoArray<unsigned int>::~AutoArray() |
562 | | |
563 | 0 | const T &operator[](std::size_t id) const { |
564 | 0 | return array_[id]; |
565 | 0 | } Unexecuted instantiation: Darts::Details::AutoArray<char>::operator[](unsigned long) const Unexecuted instantiation: Darts::Details::AutoArray<unsigned int>::operator[](unsigned long) const Unexecuted instantiation: Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::operator[](unsigned long) const |
566 | 0 | T &operator[](std::size_t id) { |
567 | 0 | return array_[id]; |
568 | 0 | } Unexecuted instantiation: Darts::Details::AutoArray<char>::operator[](unsigned long) Unexecuted instantiation: Darts::Details::AutoArray<unsigned int>::operator[](unsigned long) Unexecuted instantiation: Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::operator[](unsigned long) |
569 | | |
570 | | bool empty() const { |
571 | | return array_ == NULL; |
572 | | } |
573 | | |
574 | 0 | void clear() { |
575 | 0 | if (array_ != NULL) { |
576 | 0 | delete[] array_; |
577 | 0 | array_ = NULL; |
578 | 0 | } |
579 | 0 | } Unexecuted instantiation: Darts::Details::AutoArray<char>::clear() Unexecuted instantiation: Darts::Details::AutoArray<unsigned int>::clear() Unexecuted instantiation: Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::clear() |
580 | 0 | void swap(AutoArray *array) { |
581 | 0 | T *temp = array_; |
582 | 0 | array_ = array->array_; |
583 | 0 | array->array_ = temp; |
584 | 0 | } Unexecuted instantiation: Darts::Details::AutoArray<char>::swap(Darts::Details::AutoArray<char>*) Unexecuted instantiation: Darts::Details::AutoArray<unsigned int>::swap(Darts::Details::AutoArray<unsigned int>*) Unexecuted instantiation: Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::swap(Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>*) |
585 | 0 | void reset(T *array = NULL) { |
586 | 0 | AutoArray(array).swap(this); |
587 | 0 | } Unexecuted instantiation: Darts::Details::AutoArray<char>::reset(char*) Unexecuted instantiation: Darts::Details::AutoArray<unsigned int>::reset(unsigned int*) Unexecuted instantiation: Darts::Details::AutoArray<Darts::Details::DoubleArrayBuilderExtraUnit>::reset(Darts::Details::DoubleArrayBuilderExtraUnit*) |
588 | | |
589 | | private: |
590 | | T *array_; |
591 | | |
592 | | // Disallows copy and assignment. |
593 | | AutoArray(const AutoArray &); |
594 | | AutoArray &operator=(const AutoArray &); |
595 | | }; |
596 | | |
597 | | // |
598 | | // Memory management of resizable array. |
599 | | // |
600 | | |
601 | | template <typename T> |
602 | | class AutoPool { |
603 | | public: |
604 | 0 | AutoPool() : buf_(), size_(0), capacity_(0) {} Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::AutoPool() Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::AutoPool() Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgNode>::AutoPool() Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgUnit>::AutoPool() Unexecuted instantiation: Darts::Details::AutoPool<unsigned int>::AutoPool() |
605 | 0 | ~AutoPool() { clear(); } Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::~AutoPool() Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::~AutoPool() Unexecuted instantiation: Darts::Details::AutoPool<unsigned int>::~AutoPool() Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgUnit>::~AutoPool() Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgNode>::~AutoPool() |
606 | | |
607 | 0 | const T &operator[](std::size_t id) const { |
608 | 0 | return *(reinterpret_cast<const T *>(&buf_[0]) + id); |
609 | 0 | } Unexecuted instantiation: Darts::Details::AutoPool<unsigned int>::operator[](unsigned long) const Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgUnit>::operator[](unsigned long) const Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::operator[](unsigned long) const Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgNode>::operator[](unsigned long) const Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::operator[](unsigned long) const |
610 | 0 | T &operator[](std::size_t id) { |
611 | 0 | return *(reinterpret_cast<T *>(&buf_[0]) + id); |
612 | 0 | } Unexecuted instantiation: Darts::Details::AutoPool<unsigned int>::operator[](unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgNode>::operator[](unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgUnit>::operator[](unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::operator[](unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::operator[](unsigned long) |
613 | | |
614 | 0 | bool empty() const { |
615 | 0 | return size_ == 0; |
616 | 0 | } Unexecuted instantiation: Darts::Details::AutoPool<unsigned int>::empty() const Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::empty() const |
617 | 0 | std::size_t size() const { |
618 | 0 | return size_; |
619 | 0 | } Unexecuted instantiation: Darts::Details::AutoPool<unsigned int>::size() const Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgUnit>::size() const Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgNode>::size() const Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::size() const Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::size() const |
620 | | |
621 | 0 | void clear() { |
622 | 0 | resize(0); |
623 | 0 | buf_.clear(); |
624 | 0 | size_ = 0; |
625 | 0 | capacity_ = 0; |
626 | 0 | } Unexecuted instantiation: Darts::Details::AutoPool<unsigned int>::clear() Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgNode>::clear() Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgUnit>::clear() Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::clear() Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::clear() |
627 | | |
628 | 0 | void push_back(const T &value) { |
629 | 0 | append(value); |
630 | 0 | } |
631 | 0 | void pop_back() { |
632 | 0 | (*this)[--size_].~T(); |
633 | 0 | } |
634 | | |
635 | 0 | void append() { |
636 | 0 | if (size_ == capacity_) |
637 | 0 | resize_buf(size_ + 1); |
638 | 0 | new(&(*this)[size_++]) T; |
639 | 0 | } Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgUnit>::append() Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::append() Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgNode>::append() |
640 | 0 | void append(const T &value) { |
641 | 0 | if (size_ == capacity_) |
642 | 0 | resize_buf(size_ + 1); |
643 | 0 | new(&(*this)[size_++]) T(value); |
644 | 0 | } Unexecuted instantiation: Darts::Details::AutoPool<unsigned int>::append(unsigned int const&) Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::append(unsigned char const&) |
645 | | |
646 | 0 | void resize(std::size_t size) { |
647 | 0 | while (size_ > size) { |
648 | 0 | (*this)[--size_].~T(); |
649 | 0 | } |
650 | 0 | if (size > capacity_) { |
651 | 0 | resize_buf(size); |
652 | 0 | } |
653 | 0 | while (size_ < size) { |
654 | 0 | new(&(*this)[size_++]) T; |
655 | 0 | } |
656 | 0 | } Unexecuted instantiation: Darts::Details::AutoPool<unsigned int>::resize(unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgNode>::resize(unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgUnit>::resize(unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::resize(unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::resize(unsigned long) |
657 | 0 | void resize(std::size_t size, const T &value) { |
658 | 0 | while (size_ > size) { |
659 | 0 | (*this)[--size_].~T(); |
660 | 0 | } |
661 | 0 | if (size > capacity_) { |
662 | 0 | resize_buf(size); |
663 | 0 | } |
664 | 0 | while (size_ < size) { |
665 | 0 | new(&(*this)[size_++]) T(value); |
666 | 0 | } |
667 | 0 | } |
668 | | |
669 | 0 | void reserve(std::size_t size) { |
670 | 0 | if (size > capacity_) { |
671 | 0 | resize_buf(size); |
672 | 0 | } |
673 | 0 | } |
674 | | |
675 | | private: |
676 | | AutoArray<char> buf_; |
677 | | std::size_t size_; |
678 | | std::size_t capacity_; |
679 | | |
680 | | // Disallows copy and assignment. |
681 | | AutoPool(const AutoPool &); |
682 | | AutoPool &operator=(const AutoPool &); |
683 | | |
684 | | void resize_buf(std::size_t size); |
685 | | }; |
686 | | |
687 | | template <typename T> |
688 | 0 | void AutoPool<T>::resize_buf(std::size_t size) { |
689 | 0 | std::size_t capacity; |
690 | 0 | if (size >= capacity_ * 2) { |
691 | 0 | capacity = size; |
692 | 0 | } else { |
693 | 0 | capacity = 1; |
694 | 0 | while (capacity < size) { |
695 | 0 | capacity <<= 1; |
696 | 0 | } |
697 | 0 | } |
698 | |
|
699 | 0 | AutoArray<char> buf; |
700 | 0 | try { |
701 | 0 | buf.reset(new char[sizeof(T) * capacity]); |
702 | 0 | } catch (const std::bad_alloc &) { |
703 | 0 | DARTS_THROW("failed to resize pool: std::bad_alloc"); |
704 | 0 | } |
705 | | |
706 | 0 | if (size_ > 0) { |
707 | 0 | T *src = reinterpret_cast<T *>(&buf_[0]); |
708 | 0 | T *dest = reinterpret_cast<T *>(&buf[0]); |
709 | 0 | for (std::size_t i = 0; i < size_; ++i) { |
710 | 0 | new(&dest[i]) T(src[i]); |
711 | 0 | src[i].~T(); |
712 | 0 | } |
713 | 0 | } |
714 | |
|
715 | 0 | buf_.swap(&buf); |
716 | 0 | capacity_ = capacity; |
717 | 0 | } Unexecuted instantiation: Darts::Details::AutoPool<unsigned int>::resize_buf(unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgNode>::resize_buf(unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DawgUnit>::resize_buf(unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<unsigned char>::resize_buf(unsigned long) Unexecuted instantiation: Darts::Details::AutoPool<Darts::Details::DoubleArrayBuilderUnit>::resize_buf(unsigned long) |
718 | | |
719 | | // |
720 | | // Memory management of stack. |
721 | | // |
722 | | |
723 | | template <typename T> |
724 | | class AutoStack { |
725 | | public: |
726 | 0 | AutoStack() : pool_() {} |
727 | 0 | ~AutoStack() { |
728 | 0 | clear(); |
729 | 0 | } |
730 | | |
731 | | const T &top() const { |
732 | | return pool_[size() - 1]; |
733 | | } |
734 | 0 | T &top() { |
735 | 0 | return pool_[size() - 1]; |
736 | 0 | } |
737 | | |
738 | 0 | bool empty() const { |
739 | 0 | return pool_.empty(); |
740 | 0 | } |
741 | 0 | std::size_t size() const { |
742 | 0 | return pool_.size(); |
743 | 0 | } |
744 | | |
745 | 0 | void push(const T &value) { |
746 | 0 | pool_.push_back(value); |
747 | 0 | } |
748 | 0 | void pop() { |
749 | 0 | pool_.pop_back(); |
750 | 0 | } |
751 | | |
752 | 0 | void clear() { |
753 | 0 | pool_.clear(); |
754 | 0 | } |
755 | | |
756 | | private: |
757 | | AutoPool<T> pool_; |
758 | | |
759 | | // Disallows copy and assignment. |
760 | | AutoStack(const AutoStack &); |
761 | | AutoStack &operator=(const AutoStack &); |
762 | | }; |
763 | | |
764 | | // |
765 | | // Succinct bit vector. |
766 | | // |
767 | | |
768 | | class BitVector { |
769 | | public: |
770 | 0 | BitVector() : units_(), ranks_(), num_ones_(0), size_(0) {} |
771 | 0 | ~BitVector() { |
772 | 0 | clear(); |
773 | 0 | } |
774 | | |
775 | 0 | bool operator[](std::size_t id) const { |
776 | 0 | return (units_[id / UNIT_SIZE] >> (id % UNIT_SIZE) & 1) == 1; |
777 | 0 | } |
778 | | |
779 | 0 | id_type rank(std::size_t id) const { |
780 | 0 | std::size_t unit_id = id / UNIT_SIZE; |
781 | 0 | return ranks_[unit_id] + pop_count(units_[unit_id] |
782 | 0 | & (~0U >> (UNIT_SIZE - (id % UNIT_SIZE) - 1))); |
783 | 0 | } |
784 | | |
785 | 0 | void set(std::size_t id, bool bit) { |
786 | 0 | if (bit) { |
787 | 0 | units_[id / UNIT_SIZE] |= 1U << (id % UNIT_SIZE); |
788 | 0 | } else { |
789 | 0 | units_[id / UNIT_SIZE] &= ~(1U << (id % UNIT_SIZE)); |
790 | 0 | } |
791 | 0 | } |
792 | | |
793 | 0 | bool empty() const { |
794 | 0 | return units_.empty(); |
795 | 0 | } |
796 | 0 | std::size_t num_ones() const { |
797 | 0 | return num_ones_; |
798 | 0 | } |
799 | 0 | std::size_t size() const { |
800 | 0 | return size_; |
801 | 0 | } |
802 | | |
803 | 0 | void append() { |
804 | 0 | if ((size_ % UNIT_SIZE) == 0) { |
805 | 0 | units_.append(0); |
806 | 0 | } |
807 | 0 | ++size_; |
808 | 0 | } |
809 | | void build(); |
810 | | |
811 | 0 | void clear() { |
812 | 0 | units_.clear(); |
813 | 0 | ranks_.clear(); |
814 | 0 | } |
815 | | |
816 | | private: |
817 | | enum { UNIT_SIZE = sizeof(id_type) * 8 }; |
818 | | |
819 | | AutoPool<id_type> units_; |
820 | | AutoArray<id_type> ranks_; |
821 | | std::size_t num_ones_; |
822 | | std::size_t size_; |
823 | | |
824 | | // Disallows copy and assignment. |
825 | | BitVector(const BitVector &); |
826 | | BitVector &operator=(const BitVector &); |
827 | | |
828 | 0 | static id_type pop_count(id_type unit) { |
829 | 0 | unit = ((unit & 0xAAAAAAAA) >> 1) + (unit & 0x55555555); |
830 | 0 | unit = ((unit & 0xCCCCCCCC) >> 2) + (unit & 0x33333333); |
831 | 0 | unit = ((unit >> 4) + unit) & 0x0F0F0F0F; |
832 | 0 | unit += unit >> 8; |
833 | 0 | unit += unit >> 16; |
834 | 0 | return unit & 0xFF; |
835 | 0 | } |
836 | | }; |
837 | | |
838 | 0 | inline void BitVector::build() { |
839 | 0 | try { |
840 | 0 | ranks_.reset(new id_type[units_.size()]); |
841 | 0 | } catch (const std::bad_alloc &) { |
842 | 0 | DARTS_THROW("failed to build rank index: std::bad_alloc"); |
843 | 0 | } |
844 | | |
845 | 0 | num_ones_ = 0; |
846 | 0 | for (std::size_t i = 0; i < units_.size(); ++i) { |
847 | 0 | ranks_[i] = num_ones_; |
848 | 0 | num_ones_ += pop_count(units_[i]); |
849 | 0 | } |
850 | 0 | } |
851 | | |
852 | | // |
853 | | // Keyset. |
854 | | // |
855 | | |
856 | | template <typename T> |
857 | | class Keyset { |
858 | | public: |
859 | | Keyset(std::size_t num_keys, const char_type * const *keys, |
860 | | const std::size_t *lengths, const T *values) : |
861 | 0 | num_keys_(num_keys), keys_(keys), lengths_(lengths), values_(values) {} |
862 | | |
863 | 0 | std::size_t num_keys() const { |
864 | 0 | return num_keys_; |
865 | 0 | } |
866 | 0 | const char_type *keys(std::size_t id) const { |
867 | 0 | return keys_[id]; |
868 | 0 | } |
869 | 0 | uchar_type keys(std::size_t key_id, std::size_t char_id) const { |
870 | 0 | if (has_lengths() && char_id >= lengths_[key_id]) |
871 | 0 | return '\0'; |
872 | 0 | return keys_[key_id][char_id]; |
873 | 0 | } |
874 | | |
875 | 0 | bool has_lengths() const { |
876 | 0 | return lengths_ != NULL; |
877 | 0 | } |
878 | 0 | std::size_t lengths(std::size_t id) const { |
879 | 0 | if (has_lengths()) { |
880 | 0 | return lengths_[id]; |
881 | 0 | } |
882 | 0 | std::size_t length = 0; |
883 | 0 | while (keys_[id][length] != '\0') { |
884 | 0 | ++length; |
885 | 0 | } |
886 | 0 | return length; |
887 | 0 | } |
888 | | |
889 | 0 | bool has_values() const { |
890 | 0 | return values_ != NULL; |
891 | 0 | } |
892 | 0 | const value_type values(std::size_t id) const { |
893 | 0 | if (has_values()) { |
894 | 0 | return static_cast<value_type>(values_[id]); |
895 | 0 | } |
896 | 0 | return static_cast<value_type>(id); |
897 | 0 | } |
898 | | |
899 | | private: |
900 | | std::size_t num_keys_; |
901 | | const char_type * const * keys_; |
902 | | const std::size_t *lengths_; |
903 | | const T *values_; |
904 | | |
905 | | // Disallows copy and assignment. |
906 | | Keyset(const Keyset &); |
907 | | Keyset &operator=(const Keyset &); |
908 | | }; |
909 | | |
910 | | // |
911 | | // Node of Directed Acyclic Word Graph (DAWG). |
912 | | // |
913 | | |
914 | | class DawgNode { |
915 | | public: |
916 | 0 | DawgNode() : child_(0), sibling_(0), label_('\0'), |
917 | 0 | is_state_(false), has_sibling_(false) {} |
918 | | |
919 | 0 | void set_child(id_type child) { |
920 | 0 | child_ = child; |
921 | 0 | } |
922 | 0 | void set_sibling(id_type sibling) { |
923 | 0 | sibling_ = sibling; |
924 | 0 | } |
925 | 0 | void set_value(value_type value) { |
926 | 0 | child_ = value; |
927 | 0 | } |
928 | 0 | void set_label(uchar_type label) { |
929 | 0 | label_ = label; |
930 | 0 | } |
931 | 0 | void set_is_state(bool is_state) { |
932 | 0 | is_state_ = is_state; |
933 | 0 | } |
934 | 0 | void set_has_sibling(bool has_sibling) { |
935 | 0 | has_sibling_ = has_sibling; |
936 | 0 | } |
937 | | |
938 | 0 | id_type child() const { |
939 | 0 | return child_; |
940 | 0 | } |
941 | 0 | id_type sibling() const { |
942 | 0 | return sibling_; |
943 | 0 | } |
944 | 0 | value_type value() const { |
945 | 0 | return static_cast<value_type>(child_); |
946 | 0 | } |
947 | 0 | uchar_type label() const { |
948 | 0 | return label_; |
949 | 0 | } |
950 | 0 | bool is_state() const { |
951 | 0 | return is_state_; |
952 | 0 | } |
953 | 0 | bool has_sibling() const { |
954 | 0 | return has_sibling_; |
955 | 0 | } |
956 | | |
957 | 0 | id_type unit() const { |
958 | 0 | if (label_ == '\0') { |
959 | 0 | return (child_ << 1) | (has_sibling_ ? 1 : 0); |
960 | 0 | } |
961 | 0 | return (child_ << 2) | (is_state_ ? 2 : 0) | (has_sibling_ ? 1 : 0); |
962 | 0 | } |
963 | | |
964 | | private: |
965 | | id_type child_; |
966 | | id_type sibling_; |
967 | | uchar_type label_; |
968 | | bool is_state_; |
969 | | bool has_sibling_; |
970 | | |
971 | | // Copyable. |
972 | | }; |
973 | | |
974 | | // |
975 | | // Fixed unit of Directed Acyclic Word Graph (DAWG). |
976 | | // |
977 | | |
978 | | class DawgUnit { |
979 | | public: |
980 | 0 | explicit DawgUnit(id_type unit = 0) : unit_(unit) {} |
981 | 0 | DawgUnit(const DawgUnit &unit) : unit_(unit.unit_) {} |
982 | | |
983 | 0 | DawgUnit &operator=(id_type unit) { |
984 | 0 | unit_ = unit; |
985 | 0 | return *this; |
986 | 0 | } |
987 | | |
988 | 0 | id_type unit() const { |
989 | 0 | return unit_; |
990 | 0 | } |
991 | | |
992 | 0 | id_type child() const { |
993 | 0 | return unit_ >> 2; |
994 | 0 | } |
995 | 0 | bool has_sibling() const { |
996 | 0 | return (unit_ & 1) == 1; |
997 | 0 | } |
998 | 0 | value_type value() const { |
999 | 0 | return static_cast<value_type>(unit_ >> 1); |
1000 | 0 | } |
1001 | 0 | bool is_state() const { |
1002 | 0 | return (unit_ & 2) == 2; |
1003 | 0 | } |
1004 | | |
1005 | | private: |
1006 | | id_type unit_; |
1007 | | |
1008 | | // Copyable. |
1009 | | }; |
1010 | | |
1011 | | // |
1012 | | // Directed Acyclic Word Graph (DAWG) builder. |
1013 | | // |
1014 | | |
1015 | | class DawgBuilder { |
1016 | | public: |
1017 | 0 | DawgBuilder() : nodes_(), units_(), labels_(), is_intersections_(), |
1018 | 0 | table_(), node_stack_(), recycle_bin_(), num_states_(0) {} |
1019 | 0 | ~DawgBuilder() { |
1020 | 0 | clear(); |
1021 | 0 | } |
1022 | | |
1023 | 0 | id_type root() const { |
1024 | 0 | return 0; |
1025 | 0 | } |
1026 | | |
1027 | 0 | id_type child(id_type id) const { |
1028 | 0 | return units_[id].child(); |
1029 | 0 | } |
1030 | 0 | id_type sibling(id_type id) const { |
1031 | 0 | return units_[id].has_sibling() ? (id + 1) : 0; |
1032 | 0 | } |
1033 | 0 | int value(id_type id) const { |
1034 | 0 | return units_[id].value(); |
1035 | 0 | } |
1036 | | |
1037 | 0 | bool is_leaf(id_type id) const { |
1038 | 0 | return label(id) == '\0'; |
1039 | 0 | } |
1040 | 0 | uchar_type label(id_type id) const { |
1041 | 0 | return labels_[id]; |
1042 | 0 | } |
1043 | | |
1044 | 0 | bool is_intersection(id_type id) const { |
1045 | 0 | return is_intersections_[id]; |
1046 | 0 | } |
1047 | 0 | id_type intersection_id(id_type id) const { |
1048 | 0 | return is_intersections_.rank(id) - 1; |
1049 | 0 | } |
1050 | | |
1051 | 0 | std::size_t num_intersections() const { |
1052 | 0 | return is_intersections_.num_ones(); |
1053 | 0 | } |
1054 | | |
1055 | 0 | std::size_t size() const { |
1056 | 0 | return units_.size(); |
1057 | 0 | } |
1058 | | |
1059 | | void init(); |
1060 | | void finish(); |
1061 | | |
1062 | | void insert(const char *key, std::size_t length, value_type value); |
1063 | | |
1064 | | void clear(); |
1065 | | |
1066 | | private: |
1067 | | enum { INITIAL_TABLE_SIZE = 1 << 10 }; |
1068 | | |
1069 | | AutoPool<DawgNode> nodes_; |
1070 | | AutoPool<DawgUnit> units_; |
1071 | | AutoPool<uchar_type> labels_; |
1072 | | BitVector is_intersections_; |
1073 | | AutoPool<id_type> table_; |
1074 | | AutoStack<id_type> node_stack_; |
1075 | | AutoStack<id_type> recycle_bin_; |
1076 | | std::size_t num_states_; |
1077 | | |
1078 | | // Disallows copy and assignment. |
1079 | | DawgBuilder(const DawgBuilder &); |
1080 | | DawgBuilder &operator=(const DawgBuilder &); |
1081 | | |
1082 | | void flush(id_type id); |
1083 | | |
1084 | | void expand_table(); |
1085 | | |
1086 | | id_type find_unit(id_type id, id_type *hash_id) const; |
1087 | | id_type find_node(id_type node_id, id_type *hash_id) const; |
1088 | | |
1089 | | bool are_equal(id_type node_id, id_type unit_id) const; |
1090 | | |
1091 | | id_type hash_unit(id_type id) const; |
1092 | | id_type hash_node(id_type id) const; |
1093 | | |
1094 | | id_type append_node(); |
1095 | | id_type append_unit(); |
1096 | | |
1097 | 0 | void free_node(id_type id) { |
1098 | 0 | recycle_bin_.push(id); |
1099 | 0 | } |
1100 | | |
1101 | 0 | static id_type hash(id_type key) { |
1102 | 0 | key = ~key + (key << 15); // key = (key << 15) - key - 1; |
1103 | 0 | key = key ^ (key >> 12); |
1104 | 0 | key = key + (key << 2); |
1105 | 0 | key = key ^ (key >> 4); |
1106 | 0 | key = key * 2057; // key = (key + (key << 3)) + (key << 11); |
1107 | 0 | key = key ^ (key >> 16); |
1108 | 0 | return key; |
1109 | 0 | } |
1110 | | }; |
1111 | | |
1112 | 0 | inline void DawgBuilder::init() { |
1113 | 0 | table_.resize(INITIAL_TABLE_SIZE, 0); |
1114 | |
|
1115 | 0 | append_node(); |
1116 | 0 | append_unit(); |
1117 | |
|
1118 | 0 | num_states_ = 1; |
1119 | |
|
1120 | 0 | nodes_[0].set_label(0xFF); |
1121 | 0 | node_stack_.push(0); |
1122 | 0 | } |
1123 | | |
1124 | 0 | inline void DawgBuilder::finish() { |
1125 | 0 | flush(0); |
1126 | |
|
1127 | 0 | units_[0] = nodes_[0].unit(); |
1128 | 0 | labels_[0] = nodes_[0].label(); |
1129 | |
|
1130 | 0 | nodes_.clear(); |
1131 | 0 | table_.clear(); |
1132 | 0 | node_stack_.clear(); |
1133 | 0 | recycle_bin_.clear(); |
1134 | |
|
1135 | 0 | is_intersections_.build(); |
1136 | 0 | } |
1137 | | |
1138 | | inline void DawgBuilder::insert(const char *key, std::size_t length, |
1139 | 0 | value_type value) { |
1140 | 0 | if (value < 0) { |
1141 | 0 | DARTS_THROW("failed to insert key: negative value"); |
1142 | 0 | } else if (length == 0) { |
1143 | 0 | DARTS_THROW("failed to insert key: zero-length key"); |
1144 | 0 | } |
1145 | | |
1146 | 0 | id_type id = 0; |
1147 | 0 | std::size_t key_pos = 0; |
1148 | |
|
1149 | 0 | for ( ; key_pos <= length; ++key_pos) { |
1150 | 0 | id_type child_id = nodes_[id].child(); |
1151 | 0 | if (child_id == 0) { |
1152 | 0 | break; |
1153 | 0 | } |
1154 | | |
1155 | 0 | uchar_type key_label = static_cast<uchar_type>(key[key_pos]); |
1156 | 0 | if (key_pos < length && key_label == '\0') { |
1157 | 0 | DARTS_THROW("failed to insert key: invalid null character"); |
1158 | 0 | } |
1159 | | |
1160 | 0 | uchar_type unit_label = nodes_[child_id].label(); |
1161 | 0 | if (key_label < unit_label) { |
1162 | 0 | DARTS_THROW("failed to insert key: wrong key order"); |
1163 | 0 | } else if (key_label > unit_label) { |
1164 | 0 | nodes_[child_id].set_has_sibling(true); |
1165 | 0 | flush(child_id); |
1166 | 0 | break; |
1167 | 0 | } |
1168 | 0 | id = child_id; |
1169 | 0 | } |
1170 | | |
1171 | 0 | if (key_pos > length) { |
1172 | 0 | return; |
1173 | 0 | } |
1174 | | |
1175 | 0 | for ( ; key_pos <= length; ++key_pos) { |
1176 | 0 | uchar_type key_label = static_cast<uchar_type>( |
1177 | 0 | (key_pos < length) ? key[key_pos] : '\0'); |
1178 | 0 | id_type child_id = append_node(); |
1179 | |
|
1180 | 0 | if (nodes_[id].child() == 0) { |
1181 | 0 | nodes_[child_id].set_is_state(true); |
1182 | 0 | } |
1183 | 0 | nodes_[child_id].set_sibling(nodes_[id].child()); |
1184 | 0 | nodes_[child_id].set_label(key_label); |
1185 | 0 | nodes_[id].set_child(child_id); |
1186 | 0 | node_stack_.push(child_id); |
1187 | |
|
1188 | 0 | id = child_id; |
1189 | 0 | } |
1190 | 0 | nodes_[id].set_value(value); |
1191 | 0 | } |
1192 | | |
1193 | 0 | inline void DawgBuilder::clear() { |
1194 | 0 | nodes_.clear(); |
1195 | 0 | units_.clear(); |
1196 | 0 | labels_.clear(); |
1197 | 0 | is_intersections_.clear(); |
1198 | 0 | table_.clear(); |
1199 | 0 | node_stack_.clear(); |
1200 | 0 | recycle_bin_.clear(); |
1201 | 0 | num_states_ = 0; |
1202 | 0 | } |
1203 | | |
1204 | 0 | inline void DawgBuilder::flush(id_type id) { |
1205 | 0 | while (node_stack_.top() != id) { |
1206 | 0 | id_type node_id = node_stack_.top(); |
1207 | 0 | node_stack_.pop(); |
1208 | |
|
1209 | 0 | if (num_states_ >= table_.size() - (table_.size() >> 2)) { |
1210 | 0 | expand_table(); |
1211 | 0 | } |
1212 | |
|
1213 | 0 | id_type num_siblings = 0; |
1214 | 0 | for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { |
1215 | 0 | ++num_siblings; |
1216 | 0 | } |
1217 | |
|
1218 | 0 | id_type hash_id; |
1219 | 0 | id_type match_id = find_node(node_id, &hash_id); |
1220 | 0 | if (match_id != 0) { |
1221 | 0 | is_intersections_.set(match_id, true); |
1222 | 0 | } else { |
1223 | 0 | id_type unit_id = 0; |
1224 | 0 | for (id_type i = 0; i < num_siblings; ++i) { |
1225 | 0 | unit_id = append_unit(); |
1226 | 0 | } |
1227 | 0 | for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) { |
1228 | 0 | units_[unit_id] = nodes_[i].unit(); |
1229 | 0 | labels_[unit_id] = nodes_[i].label(); |
1230 | 0 | --unit_id; |
1231 | 0 | } |
1232 | 0 | match_id = unit_id + 1; |
1233 | 0 | table_[hash_id] = match_id; |
1234 | 0 | ++num_states_; |
1235 | 0 | } |
1236 | |
|
1237 | 0 | for (id_type i = node_id, next; i != 0; i = next) { |
1238 | 0 | next = nodes_[i].sibling(); |
1239 | 0 | free_node(i); |
1240 | 0 | } |
1241 | |
|
1242 | 0 | nodes_[node_stack_.top()].set_child(match_id); |
1243 | 0 | } |
1244 | 0 | node_stack_.pop(); |
1245 | 0 | } |
1246 | | |
1247 | 0 | inline void DawgBuilder::expand_table() { |
1248 | 0 | std::size_t table_size = table_.size() << 1; |
1249 | 0 | table_.clear(); |
1250 | 0 | table_.resize(table_size, 0); |
1251 | |
|
1252 | 0 | for (std::size_t i = 1; i < units_.size(); ++i) { |
1253 | 0 | id_type id = static_cast<id_type>(i); |
1254 | 0 | if (labels_[id] == '\0' || units_[id].is_state()) { |
1255 | 0 | id_type hash_id; |
1256 | 0 | find_unit(id, &hash_id); |
1257 | 0 | table_[hash_id] = id; |
1258 | 0 | } |
1259 | 0 | } |
1260 | 0 | } |
1261 | | |
1262 | 0 | inline id_type DawgBuilder::find_unit(id_type id, id_type *hash_id) const { |
1263 | 0 | *hash_id = hash_unit(id) % table_.size(); |
1264 | 0 | for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { |
1265 | 0 | id_type unit_id = table_[*hash_id]; |
1266 | 0 | if (unit_id == 0) { |
1267 | 0 | break; |
1268 | 0 | } |
1269 | | |
1270 | | // There must not be the same unit. |
1271 | 0 | } |
1272 | 0 | return 0; |
1273 | 0 | } |
1274 | | |
1275 | | inline id_type DawgBuilder::find_node(id_type node_id, |
1276 | 0 | id_type *hash_id) const { |
1277 | 0 | *hash_id = hash_node(node_id) % table_.size(); |
1278 | 0 | for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) { |
1279 | 0 | id_type unit_id = table_[*hash_id]; |
1280 | 0 | if (unit_id == 0) { |
1281 | 0 | break; |
1282 | 0 | } |
1283 | | |
1284 | 0 | if (are_equal(node_id, unit_id)) { |
1285 | 0 | return unit_id; |
1286 | 0 | } |
1287 | 0 | } |
1288 | 0 | return 0; |
1289 | 0 | } |
1290 | | |
1291 | 0 | inline bool DawgBuilder::are_equal(id_type node_id, id_type unit_id) const { |
1292 | 0 | for (id_type i = nodes_[node_id].sibling(); i != 0; |
1293 | 0 | i = nodes_[i].sibling()) { |
1294 | 0 | if (units_[unit_id].has_sibling() == false) { |
1295 | 0 | return false; |
1296 | 0 | } |
1297 | 0 | ++unit_id; |
1298 | 0 | } |
1299 | 0 | if (units_[unit_id].has_sibling() == true) { |
1300 | 0 | return false; |
1301 | 0 | } |
1302 | | |
1303 | 0 | for (id_type i = node_id; i != 0; i = nodes_[i].sibling(), --unit_id) { |
1304 | 0 | if (nodes_[i].unit() != units_[unit_id].unit() || |
1305 | 0 | nodes_[i].label() != labels_[unit_id]) { |
1306 | 0 | return false; |
1307 | 0 | } |
1308 | 0 | } |
1309 | 0 | return true; |
1310 | 0 | } |
1311 | | |
1312 | 0 | inline id_type DawgBuilder::hash_unit(id_type id) const { |
1313 | 0 | id_type hash_value = 0; |
1314 | 0 | for ( ; id != 0; ++id) { |
1315 | 0 | id_type unit = units_[id].unit(); |
1316 | 0 | uchar_type label = labels_[id]; |
1317 | 0 | hash_value ^= hash((label << 24) ^ unit); |
1318 | |
|
1319 | 0 | if (units_[id].has_sibling() == false) { |
1320 | 0 | break; |
1321 | 0 | } |
1322 | 0 | } |
1323 | 0 | return hash_value; |
1324 | 0 | } |
1325 | | |
1326 | 0 | inline id_type DawgBuilder::hash_node(id_type id) const { |
1327 | 0 | id_type hash_value = 0; |
1328 | 0 | for ( ; id != 0; id = nodes_[id].sibling()) { |
1329 | 0 | id_type unit = nodes_[id].unit(); |
1330 | 0 | uchar_type label = nodes_[id].label(); |
1331 | 0 | hash_value ^= hash((label << 24) ^ unit); |
1332 | 0 | } |
1333 | 0 | return hash_value; |
1334 | 0 | } |
1335 | | |
1336 | 0 | inline id_type DawgBuilder::append_unit() { |
1337 | 0 | is_intersections_.append(); |
1338 | 0 | units_.append(); |
1339 | 0 | labels_.append(); |
1340 | |
|
1341 | 0 | return static_cast<id_type>(is_intersections_.size() - 1); |
1342 | 0 | } |
1343 | | |
1344 | 0 | inline id_type DawgBuilder::append_node() { |
1345 | 0 | id_type id; |
1346 | 0 | if (recycle_bin_.empty()) { |
1347 | 0 | id = static_cast<id_type>(nodes_.size()); |
1348 | 0 | nodes_.append(); |
1349 | 0 | } else { |
1350 | 0 | id = recycle_bin_.top(); |
1351 | 0 | nodes_[id] = DawgNode(); |
1352 | 0 | recycle_bin_.pop(); |
1353 | 0 | } |
1354 | 0 | return id; |
1355 | 0 | } |
1356 | | |
1357 | | // |
1358 | | // Unit of double-array builder. |
1359 | | // |
1360 | | |
1361 | | class DoubleArrayBuilderUnit { |
1362 | | public: |
1363 | 0 | DoubleArrayBuilderUnit() : unit_(0) {} |
1364 | | |
1365 | 0 | void set_has_leaf(bool has_leaf) { |
1366 | 0 | if (has_leaf) { |
1367 | 0 | unit_ |= 1U << 8; |
1368 | 0 | } else { |
1369 | 0 | unit_ &= ~(1U << 8); |
1370 | 0 | } |
1371 | 0 | } |
1372 | 0 | void set_value(value_type value) { |
1373 | 0 | unit_ = value | (1U << 31); |
1374 | 0 | } |
1375 | 0 | void set_label(uchar_type label) { |
1376 | 0 | unit_ = (unit_ & ~0xFFU) | label; |
1377 | 0 | } |
1378 | 0 | void set_offset(id_type offset) { |
1379 | 0 | if (offset >= 1U << 29) { |
1380 | 0 | DARTS_THROW("failed to modify unit: too large offset"); |
1381 | 0 | } |
1382 | 0 | unit_ &= (1U << 31) | (1U << 8) | 0xFF; |
1383 | 0 | if (offset < 1U << 21) { |
1384 | 0 | unit_ |= (offset << 10); |
1385 | 0 | } else { |
1386 | 0 | unit_ |= (offset << 2) | (1U << 9); |
1387 | 0 | } |
1388 | 0 | } |
1389 | | |
1390 | | private: |
1391 | | id_type unit_; |
1392 | | |
1393 | | // Copyable. |
1394 | | }; |
1395 | | |
1396 | | // |
1397 | | // Extra unit of double-array builder. |
1398 | | // |
1399 | | |
1400 | | class DoubleArrayBuilderExtraUnit { |
1401 | | public: |
1402 | 0 | DoubleArrayBuilderExtraUnit() : prev_(0), next_(0), |
1403 | 0 | is_fixed_(false), is_used_(false) {} |
1404 | | |
1405 | 0 | void set_prev(id_type prev) { |
1406 | 0 | prev_ = prev; |
1407 | 0 | } |
1408 | 0 | void set_next(id_type next) { |
1409 | 0 | next_ = next; |
1410 | 0 | } |
1411 | 0 | void set_is_fixed(bool is_fixed) { |
1412 | 0 | is_fixed_ = is_fixed; |
1413 | 0 | } |
1414 | 0 | void set_is_used(bool is_used) { |
1415 | 0 | is_used_ = is_used; |
1416 | 0 | } |
1417 | | |
1418 | 0 | id_type prev() const { |
1419 | 0 | return prev_; |
1420 | 0 | } |
1421 | 0 | id_type next() const { |
1422 | 0 | return next_; |
1423 | 0 | } |
1424 | 0 | bool is_fixed() const { |
1425 | 0 | return is_fixed_; |
1426 | 0 | } |
1427 | 0 | bool is_used() const { |
1428 | 0 | return is_used_; |
1429 | 0 | } |
1430 | | |
1431 | | private: |
1432 | | id_type prev_; |
1433 | | id_type next_; |
1434 | | bool is_fixed_; |
1435 | | bool is_used_; |
1436 | | |
1437 | | // Copyable. |
1438 | | }; |
1439 | | |
1440 | | // |
1441 | | // DAWG -> double-array converter. |
1442 | | // |
1443 | | |
1444 | | class DoubleArrayBuilder { |
1445 | | public: |
1446 | | explicit DoubleArrayBuilder(progress_func_type progress_func) |
1447 | 0 | : progress_func_(progress_func), units_(), extras_(), labels_(), |
1448 | 0 | table_(), extras_head_(0) {} |
1449 | 0 | ~DoubleArrayBuilder() { |
1450 | 0 | clear(); |
1451 | 0 | } |
1452 | | |
1453 | | template <typename T> |
1454 | | void build(const Keyset<T> &keyset); |
1455 | | void copy(std::size_t *size_ptr, DoubleArrayUnit **buf_ptr) const; |
1456 | | |
1457 | | void clear(); |
1458 | | |
1459 | | private: |
1460 | | enum { BLOCK_SIZE = 256 }; |
1461 | | enum { NUM_EXTRA_BLOCKS = 16 }; |
1462 | | enum { NUM_EXTRAS = BLOCK_SIZE * NUM_EXTRA_BLOCKS }; |
1463 | | |
1464 | | enum { UPPER_MASK = 0xFF << 21 }; |
1465 | | enum { LOWER_MASK = 0xFF }; |
1466 | | |
1467 | | typedef DoubleArrayBuilderUnit unit_type; |
1468 | | typedef DoubleArrayBuilderExtraUnit extra_type; |
1469 | | |
1470 | | progress_func_type progress_func_; |
1471 | | AutoPool<unit_type> units_; |
1472 | | AutoArray<extra_type> extras_; |
1473 | | AutoPool<uchar_type> labels_; |
1474 | | AutoArray<id_type> table_; |
1475 | | id_type extras_head_; |
1476 | | |
1477 | | // Disallows copy and assignment. |
1478 | | DoubleArrayBuilder(const DoubleArrayBuilder &); |
1479 | | DoubleArrayBuilder &operator=(const DoubleArrayBuilder &); |
1480 | | |
1481 | 0 | std::size_t num_blocks() const { |
1482 | 0 | return units_.size() / BLOCK_SIZE; |
1483 | 0 | } |
1484 | | |
1485 | 0 | const extra_type &extras(id_type id) const { |
1486 | 0 | return extras_[id % NUM_EXTRAS]; |
1487 | 0 | } |
1488 | 0 | extra_type &extras(id_type id) { |
1489 | 0 | return extras_[id % NUM_EXTRAS]; |
1490 | 0 | } |
1491 | | |
1492 | | template <typename T> |
1493 | | void build_dawg(const Keyset<T> &keyset, DawgBuilder *dawg_builder); |
1494 | | void build_from_dawg(const DawgBuilder &dawg); |
1495 | | void build_from_dawg(const DawgBuilder &dawg, |
1496 | | id_type dawg_id, id_type dic_id); |
1497 | | id_type arrange_from_dawg(const DawgBuilder &dawg, |
1498 | | id_type dawg_id, id_type dic_id); |
1499 | | |
1500 | | template <typename T> |
1501 | | void build_from_keyset(const Keyset<T> &keyset); |
1502 | | template <typename T> |
1503 | | void build_from_keyset(const Keyset<T> &keyset, std::size_t begin, |
1504 | | std::size_t end, std::size_t depth, id_type dic_id); |
1505 | | template <typename T> |
1506 | | id_type arrange_from_keyset(const Keyset<T> &keyset, std::size_t begin, |
1507 | | std::size_t end, std::size_t depth, id_type dic_id); |
1508 | | |
1509 | | id_type find_valid_offset(id_type id) const; |
1510 | | bool is_valid_offset(id_type id, id_type offset) const; |
1511 | | |
1512 | | void reserve_id(id_type id); |
1513 | | void expand_units(); |
1514 | | |
1515 | | void fix_all_blocks(); |
1516 | | void fix_block(id_type block_id); |
1517 | | }; |
1518 | | |
1519 | | template <typename T> |
1520 | 0 | void DoubleArrayBuilder::build(const Keyset<T> &keyset) { |
1521 | 0 | if (keyset.has_values()) { |
1522 | 0 | Details::DawgBuilder dawg_builder; |
1523 | 0 | build_dawg(keyset, &dawg_builder); |
1524 | 0 | build_from_dawg(dawg_builder); |
1525 | 0 | dawg_builder.clear(); |
1526 | 0 | } else { |
1527 | 0 | build_from_keyset(keyset); |
1528 | 0 | } |
1529 | 0 | } |
1530 | | |
1531 | | inline void DoubleArrayBuilder::copy(std::size_t *size_ptr, |
1532 | 0 | DoubleArrayUnit **buf_ptr) const { |
1533 | 0 | if (size_ptr != NULL) { |
1534 | 0 | *size_ptr = units_.size(); |
1535 | 0 | } |
1536 | 0 | if (buf_ptr != NULL) { |
1537 | 0 | *buf_ptr = new DoubleArrayUnit[units_.size()]; |
1538 | 0 | unit_type *units = reinterpret_cast<unit_type *>(*buf_ptr); |
1539 | 0 | for (std::size_t i = 0; i < units_.size(); ++i) { |
1540 | 0 | units[i] = units_[i]; |
1541 | 0 | } |
1542 | 0 | } |
1543 | 0 | } |
1544 | | |
1545 | 0 | inline void DoubleArrayBuilder::clear() { |
1546 | 0 | units_.clear(); |
1547 | 0 | extras_.clear(); |
1548 | 0 | labels_.clear(); |
1549 | 0 | table_.clear(); |
1550 | 0 | extras_head_ = 0; |
1551 | 0 | } |
1552 | | |
1553 | | template <typename T> |
1554 | | void DoubleArrayBuilder::build_dawg(const Keyset<T> &keyset, |
1555 | 0 | DawgBuilder *dawg_builder) { |
1556 | 0 | dawg_builder->init(); |
1557 | 0 | for (std::size_t i = 0; i < keyset.num_keys(); ++i) { |
1558 | 0 | dawg_builder->insert(keyset.keys(i), keyset.lengths(i), keyset.values(i)); |
1559 | 0 | if (progress_func_ != NULL) { |
1560 | 0 | progress_func_(i + 1, keyset.num_keys() + 1); |
1561 | 0 | } |
1562 | 0 | } |
1563 | 0 | dawg_builder->finish(); |
1564 | 0 | } |
1565 | | |
1566 | 0 | inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg) { |
1567 | 0 | std::size_t num_units = 1; |
1568 | 0 | while (num_units < dawg.size()) { |
1569 | 0 | num_units <<= 1; |
1570 | 0 | } |
1571 | 0 | units_.reserve(num_units); |
1572 | |
|
1573 | 0 | table_.reset(new id_type[dawg.num_intersections()]); |
1574 | 0 | for (std::size_t i = 0; i < dawg.num_intersections(); ++i) { |
1575 | 0 | table_[i] = 0; |
1576 | 0 | } |
1577 | |
|
1578 | 0 | extras_.reset(new extra_type[NUM_EXTRAS]); |
1579 | |
|
1580 | 0 | reserve_id(0); |
1581 | 0 | extras(0).set_is_used(true); |
1582 | 0 | units_[0].set_offset(1); |
1583 | 0 | units_[0].set_label('\0'); |
1584 | |
|
1585 | 0 | if (dawg.child(dawg.root()) != 0) { |
1586 | 0 | build_from_dawg(dawg, dawg.root(), 0); |
1587 | 0 | } |
1588 | |
|
1589 | 0 | fix_all_blocks(); |
1590 | |
|
1591 | 0 | extras_.clear(); |
1592 | 0 | labels_.clear(); |
1593 | 0 | table_.clear(); |
1594 | 0 | } |
1595 | | |
1596 | | inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg, |
1597 | 0 | id_type dawg_id, id_type dic_id) { |
1598 | 0 | id_type dawg_child_id = dawg.child(dawg_id); |
1599 | 0 | if (dawg.is_intersection(dawg_child_id)) { |
1600 | 0 | id_type intersection_id = dawg.intersection_id(dawg_child_id); |
1601 | 0 | id_type offset = table_[intersection_id]; |
1602 | 0 | if (offset != 0) { |
1603 | 0 | offset ^= dic_id; |
1604 | 0 | if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) { |
1605 | 0 | if (dawg.is_leaf(dawg_child_id)) { |
1606 | 0 | units_[dic_id].set_has_leaf(true); |
1607 | 0 | } |
1608 | 0 | units_[dic_id].set_offset(offset); |
1609 | 0 | return; |
1610 | 0 | } |
1611 | 0 | } |
1612 | 0 | } |
1613 | | |
1614 | 0 | id_type offset = arrange_from_dawg(dawg, dawg_id, dic_id); |
1615 | 0 | if (dawg.is_intersection(dawg_child_id)) { |
1616 | 0 | table_[dawg.intersection_id(dawg_child_id)] = offset; |
1617 | 0 | } |
1618 | |
|
1619 | 0 | do { |
1620 | 0 | uchar_type child_label = dawg.label(dawg_child_id); |
1621 | 0 | id_type dic_child_id = offset ^ child_label; |
1622 | 0 | if (child_label != '\0') { |
1623 | 0 | build_from_dawg(dawg, dawg_child_id, dic_child_id); |
1624 | 0 | } |
1625 | 0 | dawg_child_id = dawg.sibling(dawg_child_id); |
1626 | 0 | } while (dawg_child_id != 0); |
1627 | 0 | } |
1628 | | |
1629 | | inline id_type DoubleArrayBuilder::arrange_from_dawg(const DawgBuilder &dawg, |
1630 | 0 | id_type dawg_id, id_type dic_id) { |
1631 | 0 | labels_.resize(0); |
1632 | |
|
1633 | 0 | id_type dawg_child_id = dawg.child(dawg_id); |
1634 | 0 | while (dawg_child_id != 0) { |
1635 | 0 | labels_.append(dawg.label(dawg_child_id)); |
1636 | 0 | dawg_child_id = dawg.sibling(dawg_child_id); |
1637 | 0 | } |
1638 | |
|
1639 | 0 | id_type offset = find_valid_offset(dic_id); |
1640 | 0 | units_[dic_id].set_offset(dic_id ^ offset); |
1641 | |
|
1642 | 0 | dawg_child_id = dawg.child(dawg_id); |
1643 | 0 | for (std::size_t i = 0; i < labels_.size(); ++i) { |
1644 | 0 | id_type dic_child_id = offset ^ labels_[i]; |
1645 | 0 | reserve_id(dic_child_id); |
1646 | |
|
1647 | 0 | if (dawg.is_leaf(dawg_child_id)) { |
1648 | 0 | units_[dic_id].set_has_leaf(true); |
1649 | 0 | units_[dic_child_id].set_value(dawg.value(dawg_child_id)); |
1650 | 0 | } else { |
1651 | 0 | units_[dic_child_id].set_label(labels_[i]); |
1652 | 0 | } |
1653 | |
|
1654 | 0 | dawg_child_id = dawg.sibling(dawg_child_id); |
1655 | 0 | } |
1656 | 0 | extras(offset).set_is_used(true); |
1657 | |
|
1658 | 0 | return offset; |
1659 | 0 | } |
1660 | | |
1661 | | template <typename T> |
1662 | 0 | void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset) { |
1663 | 0 | std::size_t num_units = 1; |
1664 | 0 | while (num_units < keyset.num_keys()) { |
1665 | 0 | num_units <<= 1; |
1666 | 0 | } |
1667 | 0 | units_.reserve(num_units); |
1668 | |
|
1669 | 0 | extras_.reset(new extra_type[NUM_EXTRAS]); |
1670 | |
|
1671 | 0 | reserve_id(0); |
1672 | 0 | extras(0).set_is_used(true); |
1673 | 0 | units_[0].set_offset(1); |
1674 | 0 | units_[0].set_label('\0'); |
1675 | |
|
1676 | 0 | if (keyset.num_keys() > 0) { |
1677 | 0 | build_from_keyset(keyset, 0, keyset.num_keys(), 0, 0); |
1678 | 0 | } |
1679 | |
|
1680 | 0 | fix_all_blocks(); |
1681 | |
|
1682 | 0 | extras_.clear(); |
1683 | 0 | labels_.clear(); |
1684 | 0 | } |
1685 | | |
1686 | | template <typename T> |
1687 | | void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset, |
1688 | 0 | std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { |
1689 | 0 | id_type offset = arrange_from_keyset(keyset, begin, end, depth, dic_id); |
1690 | |
|
1691 | 0 | while (begin < end) { |
1692 | 0 | if (keyset.keys(begin, depth) != '\0') { |
1693 | 0 | break; |
1694 | 0 | } |
1695 | 0 | ++begin; |
1696 | 0 | } |
1697 | 0 | if (begin == end) { |
1698 | 0 | return; |
1699 | 0 | } |
1700 | | |
1701 | 0 | std::size_t last_begin = begin; |
1702 | 0 | uchar_type last_label = keyset.keys(begin, depth); |
1703 | 0 | while (++begin < end) { |
1704 | 0 | uchar_type label = keyset.keys(begin, depth); |
1705 | 0 | if (label != last_label) { |
1706 | 0 | build_from_keyset(keyset, last_begin, begin, |
1707 | 0 | depth + 1, offset ^ last_label); |
1708 | 0 | last_begin = begin; |
1709 | 0 | last_label = keyset.keys(begin, depth); |
1710 | 0 | } |
1711 | 0 | } |
1712 | 0 | build_from_keyset(keyset, last_begin, end, depth + 1, offset ^ last_label); |
1713 | 0 | } |
1714 | | |
1715 | | template <typename T> |
1716 | | id_type DoubleArrayBuilder::arrange_from_keyset(const Keyset<T> &keyset, |
1717 | 0 | std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) { |
1718 | 0 | labels_.resize(0); |
1719 | |
|
1720 | 0 | value_type value = -1; |
1721 | 0 | for (std::size_t i = begin; i < end; ++i) { |
1722 | 0 | uchar_type label = keyset.keys(i, depth); |
1723 | 0 | if (label == '\0') { |
1724 | 0 | if (keyset.has_lengths() && depth < keyset.lengths(i)) { |
1725 | 0 | DARTS_THROW("failed to build double-array: " |
1726 | 0 | "invalid null character"); |
1727 | 0 | } else if (keyset.values(i) < 0) { |
1728 | 0 | DARTS_THROW("failed to build double-array: negative value"); |
1729 | 0 | } |
1730 | | |
1731 | 0 | if (value == -1) { |
1732 | 0 | value = keyset.values(i); |
1733 | 0 | } |
1734 | 0 | if (progress_func_ != NULL) { |
1735 | 0 | progress_func_(i + 1, keyset.num_keys() + 1); |
1736 | 0 | } |
1737 | 0 | } |
1738 | | |
1739 | 0 | if (labels_.empty()) { |
1740 | 0 | labels_.append(label); |
1741 | 0 | } else if (label != labels_[labels_.size() - 1]) { |
1742 | 0 | if (label < labels_[labels_.size() - 1]) { |
1743 | 0 | DARTS_THROW("failed to build double-array: wrong key order"); |
1744 | 0 | } |
1745 | 0 | labels_.append(label); |
1746 | 0 | } |
1747 | 0 | } |
1748 | | |
1749 | 0 | id_type offset = find_valid_offset(dic_id); |
1750 | 0 | units_[dic_id].set_offset(dic_id ^ offset); |
1751 | |
|
1752 | 0 | for (std::size_t i = 0; i < labels_.size(); ++i) { |
1753 | 0 | id_type dic_child_id = offset ^ labels_[i]; |
1754 | 0 | reserve_id(dic_child_id); |
1755 | 0 | if (labels_[i] == '\0') { |
1756 | 0 | units_[dic_id].set_has_leaf(true); |
1757 | 0 | units_[dic_child_id].set_value(value); |
1758 | 0 | } else { |
1759 | 0 | units_[dic_child_id].set_label(labels_[i]); |
1760 | 0 | } |
1761 | 0 | } |
1762 | 0 | extras(offset).set_is_used(true); |
1763 | |
|
1764 | 0 | return offset; |
1765 | 0 | } |
1766 | | |
1767 | 0 | inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const { |
1768 | 0 | if (extras_head_ >= units_.size()) { |
1769 | 0 | return units_.size() | (id & LOWER_MASK); |
1770 | 0 | } |
1771 | | |
1772 | 0 | id_type unfixed_id = extras_head_; |
1773 | 0 | do { |
1774 | 0 | id_type offset = unfixed_id ^ labels_[0]; |
1775 | 0 | if (is_valid_offset(id, offset)) { |
1776 | 0 | return offset; |
1777 | 0 | } |
1778 | 0 | unfixed_id = extras(unfixed_id).next(); |
1779 | 0 | } while (unfixed_id != extras_head_); |
1780 | | |
1781 | 0 | return units_.size() | (id & LOWER_MASK); |
1782 | 0 | } |
1783 | | |
1784 | | inline bool DoubleArrayBuilder::is_valid_offset(id_type id, |
1785 | 0 | id_type offset) const { |
1786 | 0 | if (extras(offset).is_used()) { |
1787 | 0 | return false; |
1788 | 0 | } |
1789 | | |
1790 | 0 | id_type rel_offset = id ^ offset; |
1791 | 0 | if ((rel_offset & LOWER_MASK) && (rel_offset & UPPER_MASK)) { |
1792 | 0 | return false; |
1793 | 0 | } |
1794 | | |
1795 | 0 | for (std::size_t i = 1; i < labels_.size(); ++i) { |
1796 | 0 | if (extras(offset ^ labels_[i]).is_fixed()) { |
1797 | 0 | return false; |
1798 | 0 | } |
1799 | 0 | } |
1800 | | |
1801 | 0 | return true; |
1802 | 0 | } |
1803 | | |
1804 | 0 | inline void DoubleArrayBuilder::reserve_id(id_type id) { |
1805 | 0 | if (id >= units_.size()) { |
1806 | 0 | expand_units(); |
1807 | 0 | } |
1808 | |
|
1809 | 0 | if (id == extras_head_) { |
1810 | 0 | extras_head_ = extras(id).next(); |
1811 | 0 | if (extras_head_ == id) { |
1812 | 0 | extras_head_ = units_.size(); |
1813 | 0 | } |
1814 | 0 | } |
1815 | 0 | extras(extras(id).prev()).set_next(extras(id).next()); |
1816 | 0 | extras(extras(id).next()).set_prev(extras(id).prev()); |
1817 | 0 | extras(id).set_is_fixed(true); |
1818 | 0 | } |
1819 | | |
1820 | 0 | inline void DoubleArrayBuilder::expand_units() { |
1821 | 0 | id_type src_num_units = units_.size(); |
1822 | 0 | id_type src_num_blocks = num_blocks(); |
1823 | |
|
1824 | 0 | id_type dest_num_units = src_num_units + BLOCK_SIZE; |
1825 | 0 | id_type dest_num_blocks = src_num_blocks + 1; |
1826 | |
|
1827 | 0 | if (dest_num_blocks > NUM_EXTRA_BLOCKS) { |
1828 | 0 | fix_block(src_num_blocks - NUM_EXTRA_BLOCKS); |
1829 | 0 | } |
1830 | |
|
1831 | 0 | units_.resize(dest_num_units); |
1832 | |
|
1833 | 0 | if (dest_num_blocks > NUM_EXTRA_BLOCKS) { |
1834 | 0 | for (std::size_t id = src_num_units; id < dest_num_units; ++id) { |
1835 | 0 | extras(id).set_is_used(false); |
1836 | 0 | extras(id).set_is_fixed(false); |
1837 | 0 | } |
1838 | 0 | } |
1839 | |
|
1840 | 0 | for (id_type i = src_num_units + 1; i < dest_num_units; ++i) { |
1841 | 0 | extras(i - 1).set_next(i); |
1842 | 0 | extras(i).set_prev(i - 1); |
1843 | 0 | } |
1844 | |
|
1845 | 0 | extras(src_num_units).set_prev(dest_num_units - 1); |
1846 | 0 | extras(dest_num_units - 1).set_next(src_num_units); |
1847 | |
|
1848 | 0 | extras(src_num_units).set_prev(extras(extras_head_).prev()); |
1849 | 0 | extras(dest_num_units - 1).set_next(extras_head_); |
1850 | |
|
1851 | 0 | extras(extras(extras_head_).prev()).set_next(src_num_units); |
1852 | 0 | extras(extras_head_).set_prev(dest_num_units - 1); |
1853 | 0 | } |
1854 | | |
1855 | 0 | inline void DoubleArrayBuilder::fix_all_blocks() { |
1856 | 0 | id_type begin = 0; |
1857 | 0 | if (num_blocks() > NUM_EXTRA_BLOCKS) { |
1858 | 0 | begin = num_blocks() - NUM_EXTRA_BLOCKS; |
1859 | 0 | } |
1860 | 0 | id_type end = num_blocks(); |
1861 | |
|
1862 | 0 | for (id_type block_id = begin; block_id != end; ++block_id) { |
1863 | 0 | fix_block(block_id); |
1864 | 0 | } |
1865 | 0 | } |
1866 | | |
1867 | 0 | inline void DoubleArrayBuilder::fix_block(id_type block_id) { |
1868 | 0 | id_type begin = block_id * BLOCK_SIZE; |
1869 | 0 | id_type end = begin + BLOCK_SIZE; |
1870 | |
|
1871 | 0 | id_type unused_offset = 0; |
1872 | 0 | for (id_type offset = begin; offset != end; ++offset) { |
1873 | 0 | if (!extras(offset).is_used()) { |
1874 | 0 | unused_offset = offset; |
1875 | 0 | break; |
1876 | 0 | } |
1877 | 0 | } |
1878 | |
|
1879 | 0 | for (id_type id = begin; id != end; ++id) { |
1880 | 0 | if (!extras(id).is_fixed()) { |
1881 | 0 | reserve_id(id); |
1882 | 0 | units_[id].set_label(static_cast<uchar_type>(id ^ unused_offset)); |
1883 | 0 | } |
1884 | 0 | } |
1885 | 0 | } |
1886 | | |
1887 | | } // namespace Details |
1888 | | |
1889 | | // |
1890 | | // Member function build() of DoubleArrayImpl. |
1891 | | // |
1892 | | |
1893 | | template <typename A, typename B, typename T, typename C> |
1894 | | int DoubleArrayImpl<A, B, T, C>::build(std::size_t num_keys, |
1895 | | const key_type * const *keys, const std::size_t *lengths, |
1896 | 0 | const value_type *values, Details::progress_func_type progress_func) { |
1897 | 0 | Details::Keyset<value_type> keyset(num_keys, keys, lengths, values); |
1898 | |
|
1899 | 0 | Details::DoubleArrayBuilder builder(progress_func); |
1900 | 0 | builder.build(keyset); |
1901 | |
|
1902 | 0 | std::size_t size = 0; |
1903 | 0 | unit_type *buf = NULL; |
1904 | 0 | builder.copy(&size, &buf); |
1905 | |
|
1906 | 0 | clear(); |
1907 | |
|
1908 | 0 | size_ = size; |
1909 | 0 | array_ = buf; |
1910 | 0 | buf_ = buf; |
1911 | |
|
1912 | 0 | if (progress_func != NULL) { |
1913 | 0 | progress_func(num_keys + 1, num_keys + 1); |
1914 | 0 | } |
1915 | |
|
1916 | 0 | return 0; |
1917 | 0 | } |
1918 | | |
1919 | | } // namespace Darts |
1920 | | |
1921 | | #undef DARTS_INT_TO_STR |
1922 | | #undef DARTS_LINE_TO_STR |
1923 | | #undef DARTS_LINE_STR |
1924 | | #undef DARTS_THROW |
1925 | | |
1926 | | #endif // DARTS_H_ |