/src/tesseract/src/dict/dawg.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /******************************************************************************** |
2 | | * |
3 | | * File: dawg.cpp (Formerly dawg.c) |
4 | | * Description: Use a Directed Acyclic Word Graph |
5 | | * Author: Mark Seaman, OCR Technology |
6 | | * |
7 | | * (c) Copyright 1987, Hewlett-Packard Company. |
8 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | ** you may not use this file except in compliance with the License. |
10 | | ** You may obtain a copy of the License at |
11 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
12 | | ** Unless required by applicable law or agreed to in writing, software |
13 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
14 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | ** See the License for the specific language governing permissions and |
16 | | ** limitations under the License. |
17 | | * |
18 | | *********************************************************************************/ |
19 | | /*---------------------------------------------------------------------- |
20 | | I n c l u d e s |
21 | | ----------------------------------------------------------------------*/ |
22 | | |
23 | | #include "dawg.h" |
24 | | |
25 | | #include "dict.h" |
26 | | #include "helpers.h" |
27 | | #include "tprintf.h" |
28 | | |
29 | | #include <memory> |
30 | | |
31 | | /*---------------------------------------------------------------------- |
32 | | F u n c t i o n s f o r D a w g |
33 | | ----------------------------------------------------------------------*/ |
34 | | namespace tesseract { |
35 | | |
36 | | // Destructor. |
37 | | // It is defined here, so the compiler can create a single vtable |
38 | | // instead of weak vtables in every compilation unit. |
39 | 6 | Dawg::~Dawg() = default; |
40 | | |
41 | | bool Dawg::prefix_in_dawg(const WERD_CHOICE &word, |
42 | 0 | bool requires_complete) const { |
43 | 0 | if (word.empty()) { |
44 | 0 | return !requires_complete; |
45 | 0 | } |
46 | 0 | NODE_REF node = 0; |
47 | 0 | int end_index = word.length() - 1; |
48 | 0 | for (int i = 0; i < end_index; i++) { |
49 | 0 | EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false); |
50 | 0 | if (edge == NO_EDGE) { |
51 | 0 | return false; |
52 | 0 | } |
53 | 0 | if ((node = next_node(edge)) == 0) { |
54 | | // This only happens if all words following this edge terminate -- |
55 | | // there are no larger words. See Trie::add_word_to_dawg() |
56 | 0 | return false; |
57 | 0 | } |
58 | 0 | } |
59 | | // Now check the last character. |
60 | 0 | return edge_char_of(node, word.unichar_id(end_index), requires_complete) != |
61 | 0 | NO_EDGE; |
62 | 0 | } |
63 | | |
64 | 0 | bool Dawg::word_in_dawg(const WERD_CHOICE &word) const { |
65 | 0 | return prefix_in_dawg(word, true); |
66 | 0 | } |
67 | | |
68 | | int Dawg::check_for_words(const char *filename, const UNICHARSET &unicharset, |
69 | 0 | bool enable_wildcard) const { |
70 | 0 | if (filename == nullptr) { |
71 | 0 | return 0; |
72 | 0 | } |
73 | | |
74 | 0 | FILE *word_file; |
75 | 0 | char string[CHARS_PER_LINE]; |
76 | 0 | int misses = 0; |
77 | 0 | UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard); |
78 | |
|
79 | 0 | word_file = fopen(filename, "r"); |
80 | 0 | if (word_file == nullptr) { |
81 | 0 | tprintf("Error: Could not open file %s\n", filename); |
82 | 0 | ASSERT_HOST(word_file); |
83 | 0 | } |
84 | |
|
85 | 0 | while (fgets(string, CHARS_PER_LINE, word_file) != nullptr) { |
86 | 0 | chomp_string(string); // remove newline |
87 | 0 | WERD_CHOICE word(string, unicharset); |
88 | 0 | if (word.length() > 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) { |
89 | 0 | if (!match_words(&word, 0, 0, |
90 | 0 | enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) { |
91 | 0 | tprintf("Missing word: %s\n", string); |
92 | 0 | ++misses; |
93 | 0 | } |
94 | 0 | } else { |
95 | 0 | tprintf("Failed to create a valid word from %s\n", string); |
96 | 0 | } |
97 | 0 | } |
98 | 0 | fclose(word_file); |
99 | | // Make sure the user sees this with fprintf instead of tprintf. |
100 | 0 | if (debug_level_) { |
101 | 0 | tprintf("Number of lost words=%d\n", misses); |
102 | 0 | } |
103 | 0 | return misses; |
104 | 0 | } |
105 | | |
106 | | void Dawg::iterate_words(const UNICHARSET &unicharset, |
107 | 0 | std::function<void(const WERD_CHOICE *)> cb) const { |
108 | 0 | WERD_CHOICE word(&unicharset); |
109 | 0 | iterate_words_rec(word, 0, cb); |
110 | 0 | } |
111 | | |
112 | | static void CallWithUTF8(const std::function<void(const char *)> &cb, |
113 | 0 | const WERD_CHOICE *wc) { |
114 | 0 | std::string s; |
115 | 0 | wc->string_and_lengths(&s, nullptr); |
116 | 0 | cb(s.c_str()); |
117 | 0 | } |
118 | | |
119 | | void Dawg::iterate_words(const UNICHARSET &unicharset, |
120 | 0 | const std::function<void(const char *)> &cb) const { |
121 | 0 | using namespace std::placeholders; // for _1 |
122 | 0 | std::function<void(const WERD_CHOICE *)> shim( |
123 | 0 | std::bind(CallWithUTF8, cb, _1)); |
124 | 0 | WERD_CHOICE word(&unicharset); |
125 | 0 | iterate_words_rec(word, 0, shim); |
126 | 0 | } |
127 | | |
128 | | void Dawg::iterate_words_rec( |
129 | | const WERD_CHOICE &word_so_far, NODE_REF to_explore, |
130 | 0 | const std::function<void(const WERD_CHOICE *)> &cb) const { |
131 | 0 | NodeChildVector children; |
132 | 0 | this->unichar_ids_of(to_explore, &children, false); |
133 | 0 | for (auto &i : children) { |
134 | 0 | WERD_CHOICE next_word(word_so_far); |
135 | 0 | next_word.append_unichar_id(i.unichar_id, 1, 0.0, 0.0); |
136 | 0 | if (this->end_of_word(i.edge_ref)) { |
137 | 0 | cb(&next_word); |
138 | 0 | } |
139 | 0 | NODE_REF next = next_node(i.edge_ref); |
140 | 0 | if (next != 0) { |
141 | 0 | iterate_words_rec(next_word, next, cb); |
142 | 0 | } |
143 | 0 | } |
144 | 0 | } |
145 | | |
146 | | bool Dawg::match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node, |
147 | 0 | UNICHAR_ID wildcard) const { |
148 | 0 | if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) { |
149 | 0 | bool any_matched = false; |
150 | 0 | NodeChildVector vec; |
151 | 0 | this->unichar_ids_of(node, &vec, false); |
152 | 0 | for (auto &i : vec) { |
153 | 0 | word->set_unichar_id(i.unichar_id, index); |
154 | 0 | if (match_words(word, index, node, wildcard)) { |
155 | 0 | any_matched = true; |
156 | 0 | } |
157 | 0 | } |
158 | 0 | word->set_unichar_id(wildcard, index); |
159 | 0 | return any_matched; |
160 | 0 | } else { |
161 | 0 | auto word_end = index == word->length() - 1; |
162 | 0 | auto edge = edge_char_of(node, word->unichar_id(index), word_end); |
163 | 0 | if (edge != NO_EDGE) { // normal edge in DAWG |
164 | 0 | node = next_node(edge); |
165 | 0 | if (word_end) { |
166 | 0 | if (debug_level_ > 1) { |
167 | 0 | word->print("match_words() found: "); |
168 | 0 | } |
169 | 0 | return true; |
170 | 0 | } else if (node != 0) { |
171 | 0 | return match_words(word, index + 1, node, wildcard); |
172 | 0 | } |
173 | 0 | } |
174 | 0 | } |
175 | 0 | return false; |
176 | 0 | } |
177 | | |
178 | 20 | void Dawg::init(int unicharset_size) { |
179 | 20 | ASSERT_HOST(unicharset_size > 0); |
180 | 20 | unicharset_size_ = unicharset_size; |
181 | | // Set bit masks. We will use the value unicharset_size_ as a null char, so |
182 | | // the actual number of unichars is unicharset_size_ + 1. |
183 | 20 | flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0)); |
184 | 20 | next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS; |
185 | 20 | letter_mask_ = ~(~0ull << flag_start_bit_); |
186 | 20 | next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS); |
187 | 20 | flags_mask_ = ~(letter_mask_ | next_node_mask_); |
188 | 20 | } |
189 | | |
190 | | /*---------------------------------------------------------------------- |
191 | | F u n c t i o n s f o r S q u i s h e d D a w g |
192 | | ----------------------------------------------------------------------*/ |
193 | | |
194 | 6 | SquishedDawg::~SquishedDawg() { |
195 | 6 | delete[] edges_; |
196 | 6 | } |
197 | | |
198 | | EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, |
199 | 22.6M | bool word_end) const { |
200 | 22.6M | EDGE_REF edge = node; |
201 | 22.6M | if (node == 0) { // binary search |
202 | 13.8M | EDGE_REF start = 0; |
203 | 13.8M | EDGE_REF end = num_forward_edges_in_node0 - 1; |
204 | 13.8M | int compare; |
205 | 65.0M | while (start <= end) { |
206 | 58.2M | edge = (start + end) >> 1; // (start + end) / 2 |
207 | 58.2M | compare = given_greater_than_edge_rec(NO_EDGE, word_end, unichar_id, |
208 | 58.2M | edges_[edge]); |
209 | 58.2M | if (compare == 0) { // given == vec[k] |
210 | 7.02M | return edge; |
211 | 51.2M | } else if (compare == 1) { // given > vec[k] |
212 | 27.0M | start = edge + 1; |
213 | 27.0M | } else { // given < vec[k] |
214 | 24.1M | end = edge - 1; |
215 | 24.1M | } |
216 | 58.2M | } |
217 | 13.8M | } else { // linear search |
218 | 8.81M | if (edge != NO_EDGE && edge_occupied(edge)) { |
219 | 102M | do { |
220 | 102M | if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) && |
221 | 102M | (!word_end || end_of_word_from_edge_rec(edges_[edge]))) { |
222 | 856k | return (edge); |
223 | 856k | } |
224 | 102M | } while (!last_edge(edge++)); |
225 | 8.81M | } |
226 | 8.81M | } |
227 | 14.8M | return (NO_EDGE); // not found |
228 | 22.6M | } |
229 | | |
230 | 16 | int32_t SquishedDawg::num_forward_edges(NODE_REF node) const { |
231 | 16 | EDGE_REF edge = node; |
232 | 16 | int32_t num = 0; |
233 | | |
234 | 16 | if (forward_edge(edge)) { |
235 | 636 | do { |
236 | 636 | num++; |
237 | 636 | } while (!last_edge(edge++)); |
238 | 16 | } |
239 | | |
240 | 16 | return (num); |
241 | 16 | } |
242 | | |
243 | 0 | void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const { |
244 | 0 | if (node == NO_EDGE) { |
245 | 0 | return; // nothing to print |
246 | 0 | } |
247 | | |
248 | 0 | EDGE_REF edge = node; |
249 | 0 | const char *forward_string = "FORWARD"; |
250 | 0 | const char *backward_string = " "; |
251 | |
|
252 | 0 | const char *last_string = "LAST"; |
253 | 0 | const char *not_last_string = " "; |
254 | |
|
255 | 0 | const char *eow_string = "EOW"; |
256 | 0 | const char *not_eow_string = " "; |
257 | |
|
258 | 0 | const char *direction; |
259 | 0 | const char *is_last; |
260 | 0 | const char *eow; |
261 | |
|
262 | 0 | UNICHAR_ID unichar_id; |
263 | |
|
264 | 0 | if (edge_occupied(edge)) { |
265 | 0 | do { |
266 | 0 | direction = forward_edge(edge) ? forward_string : backward_string; |
267 | 0 | is_last = last_edge(edge) ? last_string : not_last_string; |
268 | 0 | eow = end_of_word(edge) ? eow_string : not_eow_string; |
269 | |
|
270 | 0 | unichar_id = edge_letter(edge); |
271 | 0 | tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n", |
272 | 0 | edge, next_node(edge), unichar_id, direction, is_last, eow); |
273 | |
|
274 | 0 | if (edge - node > max_num_edges) { |
275 | 0 | return; |
276 | 0 | } |
277 | 0 | } while (!last_edge(edge++)); |
278 | | |
279 | 0 | if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) { |
280 | 0 | do { |
281 | 0 | direction = forward_edge(edge) ? forward_string : backward_string; |
282 | 0 | is_last = last_edge(edge) ? last_string : not_last_string; |
283 | 0 | eow = end_of_word(edge) ? eow_string : not_eow_string; |
284 | |
|
285 | 0 | unichar_id = edge_letter(edge); |
286 | 0 | tprintf(REFFORMAT " : next = " REFFORMAT |
287 | 0 | ", unichar_id = %d, %s %s %s\n", |
288 | 0 | edge, next_node(edge), unichar_id, direction, is_last, eow); |
289 | |
|
290 | 0 | if (edge - node > MAX_NODE_EDGES_DISPLAY) { |
291 | 0 | return; |
292 | 0 | } |
293 | 0 | } while (!last_edge(edge++)); |
294 | 0 | } |
295 | 0 | } else { |
296 | 0 | tprintf(REFFORMAT " : no edges in this node\n", node); |
297 | 0 | } |
298 | 0 | tprintf("\n"); |
299 | 0 | } |
300 | | |
301 | 0 | void SquishedDawg::print_edge(EDGE_REF edge) const { |
302 | 0 | if (edge == NO_EDGE) { |
303 | 0 | tprintf("NO_EDGE\n"); |
304 | 0 | } else { |
305 | 0 | tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = '%d', %s %s %s\n", |
306 | 0 | edge, next_node(edge), edge_letter(edge), |
307 | 0 | (forward_edge(edge) ? "FORWARD" : " "), |
308 | 0 | (last_edge(edge) ? "LAST" : " "), |
309 | 0 | (end_of_word(edge) ? "EOW" : "")); |
310 | 0 | } |
311 | 0 | } |
312 | | |
313 | 16 | bool SquishedDawg::read_squished_dawg(TFile *file) { |
314 | 16 | if (debug_level_) { |
315 | 0 | tprintf("Reading squished dawg\n"); |
316 | 0 | } |
317 | | |
318 | | // Read the magic number and check that it matches kDawgMagicNumber, as |
319 | | // auto-endian fixing should make sure it is always correct. |
320 | 16 | int16_t magic; |
321 | 16 | if (!file->DeSerialize(&magic)) { |
322 | 0 | return false; |
323 | 0 | } |
324 | 16 | if (magic != kDawgMagicNumber) { |
325 | 0 | tprintf("Bad magic number on dawg: %d vs %d\n", magic, kDawgMagicNumber); |
326 | 0 | return false; |
327 | 0 | } |
328 | | |
329 | 16 | int32_t unicharset_size; |
330 | 16 | if (!file->DeSerialize(&unicharset_size)) { |
331 | 0 | return false; |
332 | 0 | } |
333 | 16 | if (!file->DeSerialize(&num_edges_)) { |
334 | 0 | return false; |
335 | 0 | } |
336 | 16 | ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty |
337 | 16 | Dawg::init(unicharset_size); |
338 | | |
339 | 16 | edges_ = new EDGE_RECORD[num_edges_]; |
340 | 16 | if (!file->DeSerialize(&edges_[0], num_edges_)) { |
341 | 0 | return false; |
342 | 0 | } |
343 | 16 | if (debug_level_ > 2) { |
344 | 0 | tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n", |
345 | 0 | type_, lang_.c_str(), perm_, unicharset_size_, num_edges_); |
346 | 0 | for (EDGE_REF edge = 0; edge < num_edges_; ++edge) { |
347 | 0 | print_edge(edge); |
348 | 0 | } |
349 | 0 | } |
350 | 16 | return true; |
351 | 16 | } |
352 | | |
353 | | std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map( |
354 | 0 | int32_t *num_nodes) const { |
355 | 0 | EDGE_REF edge; |
356 | 0 | std::unique_ptr<EDGE_REF[]> node_map(new EDGE_REF[num_edges_]); |
357 | 0 | int32_t node_counter; |
358 | 0 | int32_t num_edges; |
359 | |
|
360 | 0 | for (edge = 0; edge < num_edges_; edge++) { // init all slots |
361 | 0 | node_map[edge] = -1; |
362 | 0 | } |
363 | |
|
364 | 0 | node_counter = num_forward_edges(0); |
365 | |
|
366 | 0 | *num_nodes = 0; |
367 | 0 | for (edge = 0; edge < num_edges_; edge++) { // search all slots |
368 | |
|
369 | 0 | if (forward_edge(edge)) { |
370 | 0 | (*num_nodes)++; // count nodes links |
371 | 0 | node_map[edge] = (edge ? node_counter : 0); |
372 | 0 | num_edges = num_forward_edges(edge); |
373 | 0 | if (edge != 0) { |
374 | 0 | node_counter += num_edges; |
375 | 0 | } |
376 | 0 | edge += num_edges; |
377 | 0 | if (edge >= num_edges_) { |
378 | 0 | break; |
379 | 0 | } |
380 | 0 | if (backward_edge(edge)) { |
381 | 0 | while (!last_edge(edge++)) { |
382 | 0 | ; |
383 | 0 | } |
384 | 0 | } |
385 | 0 | edge--; |
386 | 0 | } |
387 | 0 | } |
388 | 0 | return node_map; |
389 | 0 | } |
390 | | |
391 | 0 | bool SquishedDawg::write_squished_dawg(TFile *file) { |
392 | 0 | EDGE_REF edge; |
393 | 0 | int32_t num_edges; |
394 | 0 | int32_t node_count = 0; |
395 | 0 | EDGE_REF old_index; |
396 | 0 | EDGE_RECORD temp_record; |
397 | |
|
398 | 0 | if (debug_level_) { |
399 | 0 | tprintf("write_squished_dawg\n"); |
400 | 0 | } |
401 | |
|
402 | 0 | std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count)); |
403 | | |
404 | | // Write the magic number to help detecting a change in endianness. |
405 | 0 | int16_t magic = kDawgMagicNumber; |
406 | 0 | if (!file->Serialize(&magic)) { |
407 | 0 | return false; |
408 | 0 | } |
409 | 0 | if (!file->Serialize(&unicharset_size_)) { |
410 | 0 | return false; |
411 | 0 | } |
412 | | |
413 | | // Count the number of edges in this Dawg. |
414 | 0 | num_edges = 0; |
415 | 0 | for (edge = 0; edge < num_edges_; edge++) { |
416 | 0 | if (forward_edge(edge)) { |
417 | 0 | num_edges++; |
418 | 0 | } |
419 | 0 | } |
420 | | |
421 | | // Write edge count to file. |
422 | 0 | if (!file->Serialize(&num_edges)) { |
423 | 0 | return false; |
424 | 0 | } |
425 | | |
426 | 0 | if (debug_level_) { |
427 | 0 | tprintf("%d nodes in DAWG\n", node_count); |
428 | 0 | tprintf("%d edges in DAWG\n", num_edges); |
429 | 0 | } |
430 | |
|
431 | 0 | for (edge = 0; edge < num_edges_; edge++) { |
432 | 0 | if (forward_edge(edge)) { // write forward edges |
433 | 0 | do { |
434 | 0 | old_index = next_node_from_edge_rec(edges_[edge]); |
435 | 0 | set_next_node(edge, node_map[old_index]); |
436 | 0 | temp_record = edges_[edge]; |
437 | 0 | if (!file->Serialize(&temp_record)) { |
438 | 0 | return false; |
439 | 0 | } |
440 | 0 | set_next_node(edge, old_index); |
441 | 0 | } while (!last_edge(edge++)); |
442 | | |
443 | 0 | if (edge >= num_edges_) { |
444 | 0 | break; |
445 | 0 | } |
446 | 0 | if (backward_edge(edge)) { // skip back links |
447 | 0 | while (!last_edge(edge++)) { |
448 | 0 | ; |
449 | 0 | } |
450 | 0 | } |
451 | |
|
452 | 0 | edge--; |
453 | 0 | } |
454 | 0 | } |
455 | 0 | return true; |
456 | 0 | } |
457 | | |
458 | | } // namespace tesseract |