/src/tesseract/src/dict/trie.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * File: trie.cpp (Formerly trie.c) |
4 | | * Description: Functions to build a trie data structure. |
5 | | * Author: Mark Seaman, OCR Technology |
6 | | * |
7 | | * (c) Copyright 1987, Hewlett-Packard Company. |
8 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | ** you may not use this file except in compliance with the License. |
10 | | ** You may obtain a copy of the License at |
11 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
12 | | ** Unless required by applicable law or agreed to in writing, software |
13 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
14 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | ** See the License for the specific language governing permissions and |
16 | | ** limitations under the License. |
17 | | * |
18 | | *****************************************************************************/ |
19 | | /*---------------------------------------------------------------------- |
20 | | I n c l u d e s |
21 | | ----------------------------------------------------------------------*/ |
22 | | |
23 | | #include "trie.h" |
24 | | |
25 | | #include "dawg.h" |
26 | | #include "dict.h" |
27 | | #include "helpers.h" |
28 | | #include "kdpair.h" |
29 | | |
30 | | namespace tesseract { |
31 | | |
// String names of the word-list reversal policies, indexed by the
// RTLReversePolicy enum (see get_reverse_policy_name below).
const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
const char kForceReverse[] = "RRP_FORCE_REVERSE";

const char *const RTLReversePolicyNames[] = {kDoNotReverse, kReverseIfHasRTL, kForceReverse};

// Sentinel UTF-8 strings (code points U+2000..U+2005) inserted into the
// unicharset by initialize_patterns() to stand for user-pattern character
// classes (alpha, digit, alphanumeric, punctuation, lower, upper).
const char Trie::kAlphaPatternUnicode[] = "\u2000";
const char Trie::kDigitPatternUnicode[] = "\u2001";
const char Trie::kAlphanumPatternUnicode[] = "\u2002";
const char Trie::kPuncPatternUnicode[] = "\u2003";
const char Trie::kLowerPatternUnicode[] = "\u2004";
const char Trie::kUpperPatternUnicode[] = "\u2005";
44 | | |
45 | 0 | const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) { |
46 | 0 | return RTLReversePolicyNames[reverse_policy]; |
47 | 0 | } |
48 | | |
49 | | // Reset the Trie to empty. |
50 | 0 | void Trie::clear() { |
51 | 0 | for (auto node : nodes_) { |
52 | 0 | delete node; |
53 | 0 | } |
54 | 0 | nodes_.clear(); |
55 | 0 | root_back_freelist_.clear(); |
56 | 0 | num_edges_ = 0; |
57 | 0 | new_dawg_node(); // Need to allocate node 0. |
58 | 0 | } |
59 | | |
// Searches the edge vector of node_ref in the given direction for an edge
// matching (next_node, word_end, unichar_id). On success fills *edge_ptr and
// *edge_index and returns true; returns false if node_ref is NO_EDGE or no
// matching edge exists. The root node's forward edges are kept sorted (see
// add_edge_linkage), so they are binary-searched; every other edge vector is
// scanned linearly.
bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,
                        UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr,
                        EDGE_INDEX *edge_index) const {
  if (debug_level_ == 3) {
    tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
            " direction %d word_end %d unichar_id %d, exploring node:\n",
            node_ref, next_node, direction, word_end, unichar_id);
    if (node_ref != NO_EDGE) {
      print_node(node_ref, nodes_[node_ref]->forward_edges.size());
    }
  }
  if (node_ref == NO_EDGE) {
    return false;
  }
  assert(static_cast<size_t>(node_ref) < nodes_.size());
  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? nodes_[node_ref]->forward_edges
                                                 : nodes_[node_ref]->backward_edges;
  int vec_size = vec.size();
  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
    EDGE_INDEX start = 0;
    EDGE_INDEX end = vec_size - 1;
    EDGE_INDEX k;
    int compare;
    while (start <= end) {
      k = (start + end) >> 1; // (start + end) / 2
      // given_greater_than_edge_rec returns 0 on equal, 1 on greater,
      // otherwise the given key is smaller than vec[k].
      compare = given_greater_than_edge_rec(next_node, word_end, unichar_id, vec[k]);
      if (compare == 0) { // given == vec[k]
        *edge_ptr = &(vec[k]);
        *edge_index = k;
        return true;
      } else if (compare == 1) { // given > vec[k]
        start = k + 1;
      } else { // given < vec[k]
        end = k - 1;
      }
    }
  } else { // linear search
    for (int i = 0; i < vec_size; ++i) {
      EDGE_RECORD &edge_rec = vec[i];
      if (edge_rec_match(next_node, word_end, unichar_id, next_node_from_edge_rec(edge_rec),
                         end_of_word_from_edge_rec(edge_rec), unichar_id_from_edge_rec(edge_rec))) {
        *edge_ptr = &(edge_rec);
        *edge_index = i;
        return true;
      }
    }
  }
  return false; // not found
}
109 | | |
// Adds a single directed edge from node1 to node2 with the given flags.
// Root-node forward edges are kept sorted, so the insertion point is found
// by a linear scan; all other edges are simply appended. Root-node backward
// edges re-use slots from root_back_freelist_ when available (slots freed by
// remove_edge_linkage) to avoid growing that huge vector. Always returns
// true (kept bool for interface symmetry with callers using add_new_edge).
bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, int direction,
                            bool word_end, UNICHAR_ID unichar_id) {
  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? &(nodes_[node1]->forward_edges)
                                                 : &(nodes_[node1]->backward_edges);
  unsigned search_index;
  if (node1 == 0 && direction == FORWARD_EDGE) {
    search_index = 0; // find the index to make the add sorted
    while (search_index < vec->size() &&
           given_greater_than_edge_rec(node2, word_end, unichar_id, (*vec)[search_index]) == 1) {
      search_index++;
    }
  } else {
    search_index = vec->size(); // add is unsorted, so index does not matter
  }
  EDGE_RECORD edge_rec;
  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
  if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {
    // Re-use a previously freed slot in the root's backward edge vector.
    EDGE_INDEX edge_index = root_back_freelist_.back();
    root_back_freelist_.pop_back();
    (*vec)[edge_index] = edge_rec;
  } else if (search_index < vec->size()) {
    vec->insert(vec->begin() + search_index, edge_rec);
  } else {
    vec->push_back(edge_rec);
  }
  if (debug_level_ > 1) {
    tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
    print_edge_rec(edge_rec);
    tprintf("\n");
  }
  num_edges_++;
  return true;
}
143 | | |
144 | | void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool marker_flag, |
145 | 2 | UNICHAR_ID unichar_id) { |
146 | 2 | EDGE_RECORD *back_edge_ptr; |
147 | 2 | EDGE_INDEX back_edge_index; |
148 | 2 | ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false, unichar_id, &back_edge_ptr, |
149 | 2 | &back_edge_index)); |
150 | 2 | if (marker_flag) { |
151 | 0 | *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_); |
152 | 0 | *edge_ptr |= (MARKER_FLAG << flag_start_bit_); |
153 | 0 | } |
154 | | // Mark both directions as end of word. |
155 | 2 | *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_); |
156 | 2 | *edge_ptr |= (WERD_END_FLAG << flag_start_bit_); |
157 | 2 | } |
158 | | |
// Adds a word (with optional per-unichar repetition flags) to the trie.
// Walks existing edges as far as possible, then appends new nodes for the
// remaining suffix, and finally links the last unichar to node 0 (or just
// sets end-of-word flags if an extension of the word already exists).
// Returns false for empty/invalid words; on allocation/edge failure the
// whole trie is cleared and false is returned.
bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions) {
  if (word.length() <= 0) {
    return false; // can't add empty words
  }
  if (repetitions != nullptr) {
    ASSERT_HOST(repetitions->size() == word.length());
  }
  // Make sure the word does not contain invalid unichar ids.
  for (unsigned i = 0; i < word.length(); ++i) {
    if (word.unichar_id(i) < 0 || word.unichar_id(i) >= unicharset_size_) {
      return false;
    }
  }

  EDGE_RECORD *edge_ptr;
  NODE_REF last_node = 0;
  NODE_REF the_next_node;
  bool marker_flag = false;
  EDGE_INDEX edge_index;
  int32_t still_finding_chars = true;
  int32_t word_end = false;
  bool add_failed = false;
  bool found;

  if (debug_level_ > 1) {
    word.print("\nAdding word: ");
  }

  UNICHAR_ID unichar_id;
  unsigned i;
  // All unichars except the last: follow existing edges while possible,
  // then switch to creating new nodes.
  for (i = 0; i < word.length() - 1; ++i) {
    unichar_id = word.unichar_id(i);
    marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
    if (debug_level_ > 1) {
      tprintf("Adding letter %d\n", unichar_id);
    }
    if (still_finding_chars) {
      found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
                           &edge_index);
      if (found && debug_level_ > 1) {
        tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n", edge_index, last_node);
      }
      if (!found) {
        still_finding_chars = false;
      } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
        // We hit the end of an existing word, but the new word is longer.
        // In this case we have to disconnect the existing word from the
        // backwards root node, mark the current position as end-of-word
        // and add new nodes for the increased length. Disconnecting the
        // existing word from the backwards root node requires a linear
        // search, so it is much faster to add the longest words first,
        // to avoid having to come here.
        word_end = true;
        still_finding_chars = false;
        remove_edge(last_node, 0, word_end, unichar_id);
      } else {
        // We have to add a new branch here for the new word.
        if (marker_flag) {
          set_marker_flag_in_edge_rec(edge_ptr);
        }
        last_node = next_node_from_edge_rec(*edge_ptr);
      }
    }
    if (!still_finding_chars) {
      // Past the shared prefix: grow the trie with a fresh node per unichar.
      the_next_node = new_dawg_node();
      if (debug_level_ > 1) {
        tprintf("adding node " REFFORMAT "\n", the_next_node);
      }
      if (the_next_node == 0) {
        add_failed = true;
        break;
      }
      if (!add_new_edge(last_node, the_next_node, marker_flag, word_end, unichar_id)) {
        add_failed = true;
        break;
      }
      word_end = false;
      last_node = the_next_node;
    }
  }
  // The last unichar of the word.
  the_next_node = 0;
  unichar_id = word.unichar_id(i);
  marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
  if (debug_level_ > 1) {
    tprintf("Adding letter %d\n", unichar_id);
  }
  if (still_finding_chars &&
      edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, unichar_id, &edge_ptr, &edge_index)) {
    // An extension of this word already exists in the trie, so we
    // only have to add the ending flags in both directions.
    add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), marker_flag, unichar_id);
  } else {
    // Add a link to node 0. All leaves connect to node 0 so the back links can
    // be used in reduction to a dawg. This root backward node has one edge
    // entry for every word, (except prefixes of longer words) so it is huge.
    if (!add_failed && !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id)) {
      add_failed = true;
    }
  }
  if (add_failed) {
    tprintf("Re-initializing document dictionary...\n");
    clear();
    return false;
  } else {
    return true;
  }
}
266 | | |
267 | 66 | NODE_REF Trie::new_dawg_node() { |
268 | 66 | auto *node = new TRIE_NODE_RECORD(); |
269 | 66 | nodes_.push_back(node); |
270 | 66 | return nodes_.size() - 1; |
271 | 66 | } |
272 | | |
273 | | bool Trie::read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, |
274 | 0 | Trie::RTLReversePolicy reverse_policy) { |
275 | 0 | std::vector<std::string> word_list; |
276 | 0 | if (!read_word_list(filename, &word_list)) { |
277 | 0 | return false; |
278 | 0 | } |
279 | 0 | std::sort(word_list.begin(), word_list.end(), |
280 | 0 | [](auto &s1, auto &s2) { return s1.size() > s2.size(); }); |
281 | 0 | return add_word_list(word_list, unicharset, reverse_policy); |
282 | 0 | } |
283 | | |
284 | 0 | bool Trie::read_word_list(const char *filename, std::vector<std::string> *words) { |
285 | 0 | FILE *word_file; |
286 | 0 | char line_str[CHARS_PER_LINE]; |
287 | 0 | int word_count = 0; |
288 | |
|
289 | 0 | word_file = fopen(filename, "rb"); |
290 | 0 | if (word_file == nullptr) { |
291 | 0 | return false; |
292 | 0 | } |
293 | | |
294 | 0 | while (fgets(line_str, sizeof(line_str), word_file) != nullptr) { |
295 | 0 | chomp_string(line_str); // remove newline |
296 | 0 | std::string word_str(line_str); |
297 | 0 | ++word_count; |
298 | 0 | if (debug_level_ && word_count % 10000 == 0) { |
299 | 0 | tprintf("Read %d words so far\n", word_count); |
300 | 0 | } |
301 | 0 | words->push_back(word_str); |
302 | 0 | } |
303 | 0 | if (debug_level_) { |
304 | 0 | tprintf("Read %d words total.\n", word_count); |
305 | 0 | } |
306 | 0 | fclose(word_file); |
307 | 0 | return true; |
308 | 0 | } |
309 | | |
310 | | bool Trie::add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset, |
311 | 0 | Trie::RTLReversePolicy reverse_policy) { |
312 | 0 | for (const auto &i : words) { |
313 | 0 | WERD_CHOICE word(i.c_str(), unicharset); |
314 | 0 | if (word.empty() || word.contains_unichar_id(INVALID_UNICHAR_ID)) { |
315 | 0 | continue; |
316 | 0 | } |
317 | 0 | if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && word.has_rtl_unichar_id()) || |
318 | 0 | reverse_policy == RRP_FORCE_REVERSE) { |
319 | 0 | word.reverse_and_mirror_unichar_ids(); |
320 | 0 | } |
321 | 0 | if (!word_in_dawg(word)) { |
322 | 0 | add_word_to_dawg(word); |
323 | 0 | if (!word_in_dawg(word)) { |
324 | 0 | tprintf("Error: word '%s' not in DAWG after adding it\n", i.c_str()); |
325 | 0 | return false; |
326 | 0 | } |
327 | 0 | } |
328 | 0 | } |
329 | 0 | return true; |
330 | 0 | } |
331 | | |
332 | 0 | void Trie::initialize_patterns(UNICHARSET *unicharset) { |
333 | 0 | unicharset->unichar_insert(kAlphaPatternUnicode); |
334 | 0 | alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode); |
335 | 0 | unicharset->unichar_insert(kDigitPatternUnicode); |
336 | 0 | digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode); |
337 | 0 | unicharset->unichar_insert(kAlphanumPatternUnicode); |
338 | 0 | alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode); |
339 | 0 | unicharset->unichar_insert(kPuncPatternUnicode); |
340 | 0 | punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode); |
341 | 0 | unicharset->unichar_insert(kLowerPatternUnicode); |
342 | 0 | lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode); |
343 | 0 | unicharset->unichar_insert(kUpperPatternUnicode); |
344 | 0 | upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode); |
345 | 0 | initialized_patterns_ = true; |
346 | 0 | unicharset_size_ = unicharset->size(); |
347 | 0 | } |
348 | | |
349 | | void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, |
350 | 0 | std::vector<UNICHAR_ID> *vec) const { |
351 | 0 | bool is_alpha = unicharset.get_isalpha(unichar_id); |
352 | 0 | if (is_alpha) { |
353 | 0 | vec->push_back(alpha_pattern_); |
354 | 0 | vec->push_back(alphanum_pattern_); |
355 | 0 | if (unicharset.get_islower(unichar_id)) { |
356 | 0 | vec->push_back(lower_pattern_); |
357 | 0 | } else if (unicharset.get_isupper(unichar_id)) { |
358 | 0 | vec->push_back(upper_pattern_); |
359 | 0 | } |
360 | 0 | } |
361 | 0 | if (unicharset.get_isdigit(unichar_id)) { |
362 | 0 | vec->push_back(digit_pattern_); |
363 | 0 | if (!is_alpha) { |
364 | 0 | vec->push_back(alphanum_pattern_); |
365 | 0 | } |
366 | 0 | } |
367 | 0 | if (unicharset.get_ispunctuation(unichar_id)) { |
368 | 0 | vec->push_back(punc_pattern_); |
369 | 0 | } |
370 | 0 | } |
371 | | |
372 | 0 | UNICHAR_ID Trie::character_class_to_pattern(char ch) { |
373 | 0 | if (ch == 'c') { |
374 | 0 | return alpha_pattern_; |
375 | 0 | } else if (ch == 'd') { |
376 | 0 | return digit_pattern_; |
377 | 0 | } else if (ch == 'n') { |
378 | 0 | return alphanum_pattern_; |
379 | 0 | } else if (ch == 'p') { |
380 | 0 | return punc_pattern_; |
381 | 0 | } else if (ch == 'a') { |
382 | 0 | return lower_pattern_; |
383 | 0 | } else if (ch == 'A') { |
384 | 0 | return upper_pattern_; |
385 | 0 | } else { |
386 | 0 | return INVALID_UNICHAR_ID; |
387 | 0 | } |
388 | 0 | } |
389 | | |
// Reads user patterns (one per line) from filename and inserts them into
// the trie. A pattern is a UTF-8 string where "\c", "\d", "\n", "\p",
// "\a", "\A" stand for character classes, "\\" is a literal backslash, and
// a trailing "\*" after a unichar marks it as repeatable. Requires
// initialize_patterns() to have been called first. Returns false on setup
// or file-open failure; invalid individual patterns are skipped.
bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset) {
  if (!initialized_patterns_) {
    tprintf("please call initialize_patterns() before read_pattern_list()\n");
    return false;
  }

  FILE *pattern_file = fopen(filename, "rb");
  if (pattern_file == nullptr) {
    tprintf("Error opening pattern file %s\n", filename);
    return false;
  }

  int pattern_count = 0;
  char string[CHARS_PER_LINE];
  while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {
    chomp_string(string); // remove newline
    // Parse the pattern and construct a unichar id vector.
    // Record the number of repetitions of each unichar in the parallel vector.
    WERD_CHOICE word(&unicharset);
    std::vector<bool> repetitions_vec;
    const char *str_ptr = string;
    // unicharset.step() returns the byte length of the next unichar (0 on error).
    int step = unicharset.step(str_ptr);
    bool failed = false;
    while (step > 0) {
      UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
      if (step == 1 && *str_ptr == '\\') {
        ++str_ptr;
        if (*str_ptr == '\\') { // regular '\' unichar that was escaped
          curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
        } else {
#if 0 // TODO: This code should be enabled if kSaneNumConcreteChars != 0.
          if (word.length() < kSaneNumConcreteChars) {
            tprintf(
                "Please provide at least %d concrete characters at the"
                " beginning of the pattern\n",
                kSaneNumConcreteChars);
            failed = true;
            break;
          }
#endif
          // Parse character class from expression.
          curr_unichar_id = character_class_to_pattern(*str_ptr);
        }
      } else {
        curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
      }
      if (curr_unichar_id == INVALID_UNICHAR_ID) {
        failed = true;
        break; // failed to parse this pattern
      }
      word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
      repetitions_vec.push_back(false);
      str_ptr += step;
      step = unicharset.step(str_ptr);
      // Check if there is a repetition pattern specified after this unichar.
      if (step == 1 && *str_ptr == '\\' && *(str_ptr + 1) == '*') {
        repetitions_vec[repetitions_vec.size() - 1] = true;
        str_ptr += 2;
        step = unicharset.step(str_ptr);
      }
    }
    if (failed) {
      tprintf("Invalid user pattern %s\n", string);
      continue;
    }
    // Insert the pattern into the trie.
    if (debug_level_ > 2) {
      tprintf("Inserting expanded user pattern %s\n", word.debug_string().c_str());
    }
    if (!this->word_in_dawg(word)) {
      this->add_word_to_dawg(word, &repetitions_vec);
      if (!this->word_in_dawg(word)) {
        tprintf("Error: failed to insert pattern '%s'\n", string);
      }
    }
    ++pattern_count;
  }
  if (debug_level_) {
    tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
  }
  fclose(pattern_file);
  return true;
}
473 | | |
// Removes the edge from node1 matching (node2, direction, word_end,
// unichar_id); the edge must exist (asserted). Backward edges of the root
// node are only zeroed (killed) and their slot pushed on the freelist,
// because erasing from that huge vector would be too slow; all other edges
// are physically erased.
void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,
                               UNICHAR_ID unichar_id) {
  EDGE_RECORD *edge_ptr = nullptr;
  EDGE_INDEX edge_index = 0;
  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, unichar_id, &edge_ptr, &edge_index));
  if (debug_level_ > 1) {
    tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
    print_edge_rec(*edge_ptr);
    tprintf("\n");
  }
  if (direction == FORWARD_EDGE) {
    nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);
  } else if (node1 == 0) {
    // Root backward edges: kill in place and remember the free slot.
    KillEdge(&nodes_[node1]->backward_edges[edge_index]);
    root_back_freelist_.push_back(edge_index);
  } else {
    nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);
  }
  --num_edges_;
}
494 | | |
495 | | // Some optimizations employed in add_word_to_dawg and trie_to_dawg: |
496 | | // 1 Avoid insertion sorting or bubble sorting the tail root node |
497 | | // (back links on node 0, a list of all the leaves.). The node is |
498 | | // huge, and sorting it with n^2 time is terrible. |
499 | | // 2 Avoid using vector::erase on the tail root node. |
500 | | // (a) During add of words to the trie, zero-out the unichars and |
501 | | // keep a freelist of spaces to re-use. |
502 | | // (b) During reduction, just zero-out the unichars of deleted back |
503 | | // links, skipping zero entries while searching. |
504 | | // 3 Avoid linear search of the tail root node. This has to be done when |
505 | | // a suffix is added to an existing word. Adding words by decreasing |
506 | | // length avoids this problem entirely. Words can still be added in |
507 | | // any order, but it is faster to add the longest first. |
// Reduces the trie to a DAWG and packs the surviving forward edges into a
// newly allocated SquishedDawg (caller takes ownership). The trie itself is
// left in the reduced state and should not be used for further additions.
SquishedDawg *Trie::trie_to_dawg() {
  root_back_freelist_.clear(); // Will be invalidated by the reduction below.
  if (debug_level_ > 2) {
    print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
  }
  std::vector<bool> reduced_nodes(nodes_.size());
  this->reduce_node_input(0, reduced_nodes);

  if (debug_level_ > 2) {
    print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
  }
  // Build a translation map from node indices in nodes_ vector to
  // their target indices in EDGE_ARRAY. node_ref_map[i] is the running
  // total of forward edges in nodes 0..i-1, i.e. the EDGE_ARRAY offset
  // where node i's edges begin.
  std::vector<NODE_REF> node_ref_map(nodes_.size() + 1);
  unsigned i;
  for (i = 0; i < nodes_.size(); ++i) {
    node_ref_map[i + 1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
  }
  int num_forward_edges = node_ref_map[i];

  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
  auto edge_array = new EDGE_RECORD[num_forward_edges];
  EDGE_ARRAY edge_array_ptr = edge_array;
  for (i = 0; i < nodes_.size(); ++i) {
    TRIE_NODE_RECORD *node_ptr = nodes_[i];
    int end = node_ptr->forward_edges.size();
    for (int j = 0; j < end; ++j) {
      EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
      NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
      ASSERT_HOST(static_cast<size_t>(node_ref) < nodes_.size());
      UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
      link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
                end_of_word_from_edge_rec(edge_rec), unichar_id);
      // The marker flag on the last edge marks the end of a node's edge list.
      if (j == end - 1) {
        set_marker_flag_in_edge_rec(edge_array_ptr);
      }
      ++edge_array_ptr;
    }
  }

  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, perm_, unicharset_size_,
                          debug_level_);
}
552 | | |
// Merges the targets of two equivalent edges of node: every edge going
// to/from next_node2 is redirected to next_node1, and next_node2's edge
// vectors are emptied (the node itself stays allocated but unreachable).
// The caller kills the backward link from node to next_node2. Returns true.
bool Trie::eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
                                     const EDGE_RECORD &edge2) {
  if (debug_level_ > 1) {
    tprintf("\nCollapsing node %" PRIi64 ":\n", node);
    print_node(node, MAX_NODE_EDGES_DISPLAY);
    tprintf("Candidate edges: ");
    print_edge_rec(edge1);
    tprintf(", ");
    print_edge_rec(edge2);
    tprintf("\n\n");
  }
  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
  // Translate all edges going to/from next_node2 to go to/from next_node1.
  EDGE_RECORD *edge_ptr = nullptr;
  EDGE_INDEX edge_index;
  // The backward link in node to next_node2 will be zeroed out by the caller.
  // Copy all the backward links in next_node2 to node next_node1
  for (unsigned i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
    const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
    NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
    UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
    int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
    bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
    add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, curr_word_end,
                     curr_unichar_id);
    // Relocate the corresponding forward edge in curr_next_node
    ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, curr_word_end,
                             curr_unichar_id, &edge_ptr, &edge_index));
    set_next_node_in_edge_rec(edge_ptr, next_node1);
  }
  int next_node2_num_edges =
      (next_node2_ptr->forward_edges.size() + next_node2_ptr->backward_edges.size());
  if (debug_level_ > 1) {
    tprintf("removed %d edges from node " REFFORMAT "\n", next_node2_num_edges, next_node2);
  }
  next_node2_ptr->forward_edges.clear();
  next_node2_ptr->backward_edges.clear();
  num_edges_ -= next_node2_num_edges;
  return true;
}
595 | | |
// Scans node's backward edges starting at edge_index for pairs of edges
// carrying the same unichar_id whose target subtrees can be merged, and
// collapses each redundant pair via eliminate_redundant_edges (killing the
// losing edge in place). The edge vector is assumed sorted by unichar_id
// (see sort_edges); dead (killed) entries are skipped. Returns true if any
// merge happened, so the caller can iterate until a fixed point.
bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
                                 EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes) {
  if (debug_level_ > 1) {
    tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
  }
  // Compare each of the edge pairs with the given unichar_id.
  bool did_something = false;
  for (unsigned i = edge_index; i < backward_edges->size() - 1; ++i) {
    // Find the first edge that can be eliminated.
    // NOTE: the inner while advances i past dead and non-eliminable edges;
    // it returns early once a different unichar_id is reached.
    UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
    while (i < backward_edges->size()) {
      if (!DeadEdge((*backward_edges)[i])) {
        curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
        if (curr_unichar_id != unichar_id) {
          return did_something;
        }
        if (can_be_eliminated((*backward_edges)[i])) {
          break;
        }
      }
      ++i;
    }
    if (i == backward_edges->size()) {
      break;
    }
    const EDGE_RECORD &edge_rec = (*backward_edges)[i];
    // Compare it to the rest of the edges with the given unichar_id.
    for (auto j = i + 1; j < backward_edges->size(); ++j) {
      const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
      if (DeadEdge(next_edge_rec)) {
        continue;
      }
      UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
      if (next_id != unichar_id) {
        break;
      }
      // Merge only when both edges agree on end-of-word; the merged-away
      // target must be re-marked unreduced so it gets revisited.
      if (end_of_word_from_edge_rec(next_edge_rec) == end_of_word_from_edge_rec(edge_rec) &&
          can_be_eliminated(next_edge_rec) &&
          eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
        reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;
        did_something = true;
        KillEdge(&(*backward_edges)[j]);
      }
    }
  }
  return did_something;
}
643 | | |
644 | 0 | void Trie::sort_edges(EDGE_VECTOR *edges) { |
645 | 0 | int num_edges = edges->size(); |
646 | 0 | if (num_edges <= 1) { |
647 | 0 | return; |
648 | 0 | } |
649 | 0 | std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec; |
650 | 0 | sort_vec.reserve(num_edges); |
651 | 0 | for (int i = 0; i < num_edges; ++i) { |
652 | 0 | sort_vec.emplace_back(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]); |
653 | 0 | } |
654 | 0 | std::sort(sort_vec.begin(), sort_vec.end()); |
655 | 0 | for (int i = 0; i < num_edges; ++i) { |
656 | 0 | (*edges)[i] = sort_vec[i].data(); |
657 | 0 | } |
658 | 0 | } |
659 | | |
660 | 0 | void Trie::reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes) { |
661 | 0 | EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges; |
662 | 0 | sort_edges(&backward_edges); |
663 | 0 | if (debug_level_ > 1) { |
664 | 0 | tprintf("reduce_node_input(node=" REFFORMAT ")\n", node); |
665 | 0 | print_node(node, MAX_NODE_EDGES_DISPLAY); |
666 | 0 | } |
667 | |
|
668 | 0 | EDGE_INDEX edge_index = 0; |
669 | 0 | while (static_cast<size_t>(edge_index) < backward_edges.size()) { |
670 | 0 | if (DeadEdge(backward_edges[edge_index])) { |
671 | 0 | continue; |
672 | 0 | } |
673 | 0 | UNICHAR_ID unichar_id = unichar_id_from_edge_rec(backward_edges[edge_index]); |
674 | 0 | while (reduce_lettered_edges(edge_index, unichar_id, node, &backward_edges, reduced_nodes)) { |
675 | 0 | ; |
676 | 0 | } |
677 | 0 | while (static_cast<size_t>(++edge_index) < backward_edges.size()) { |
678 | 0 | UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]); |
679 | 0 | if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) { |
680 | 0 | break; |
681 | 0 | } |
682 | 0 | } |
683 | 0 | } |
684 | 0 | reduced_nodes[node] = true; // mark as reduced |
685 | |
|
686 | 0 | if (debug_level_ > 1) { |
687 | 0 | tprintf("Node " REFFORMAT " after reduction:\n", node); |
688 | 0 | print_node(node, MAX_NODE_EDGES_DISPLAY); |
689 | 0 | } |
690 | |
|
691 | 0 | for (auto &backward_edge : backward_edges) { |
692 | 0 | if (DeadEdge(backward_edge)) { |
693 | 0 | continue; |
694 | 0 | } |
695 | 0 | NODE_REF next_node = next_node_from_edge_rec(backward_edge); |
696 | 0 | if (next_node != 0 && !reduced_nodes[next_node]) { |
697 | 0 | reduce_node_input(next_node, reduced_nodes); |
698 | 0 | } |
699 | 0 | } |
700 | 0 | } |
701 | | |
// Debug-prints the forward then backward edge lists of the given node,
// showing at most max_num_edges per direction ("..." marks truncation).
// Dead (killed) edges are skipped. No-op for NO_EDGE.
void Trie::print_node(NODE_REF node, int max_num_edges) const {
  if (node == NO_EDGE) {
    return; // nothing to print
  }
  TRIE_NODE_RECORD *node_ptr = nodes_[node];
  int num_fwd = node_ptr->forward_edges.size();
  int num_bkw = node_ptr->backward_edges.size();
  EDGE_VECTOR *vec;
  // dir 0 = forward edges (with node header), dir 1 = backward edges.
  for (int dir = 0; dir < 2; ++dir) {
    if (dir == 0) {
      vec = &(node_ptr->forward_edges);
      tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
    } else {
      vec = &(node_ptr->backward_edges);
      tprintf("\t");
    }
    int i;
    for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && i < max_num_edges; ++i) {
      if (DeadEdge((*vec)[i])) {
        continue;
      }
      print_edge_rec((*vec)[i]);
      tprintf(" ");
    }
    // Indicate that the list was cut off at max_num_edges.
    if (dir == 0 ? i < num_fwd : i < num_bkw) {
      tprintf("...");
    }
    tprintf("\n");
  }
}
732 | | |
733 | | } // namespace tesseract |