/src/tesseract/src/dict/permdawg.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * File: permdawg.cpp (Formerly permdawg.c) |
4 | | * Description: Scale word choices by a dictionary |
5 | | * Author: Mark Seaman, OCR Technology |
6 | | * |
7 | | * (c) Copyright 1987, Hewlett-Packard Company. |
8 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | ** you may not use this file except in compliance with the License. |
10 | | ** You may obtain a copy of the License at |
11 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
12 | | ** Unless required by applicable law or agreed to in writing, software |
13 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
14 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | ** See the License for the specific language governing permissions and |
16 | | ** limitations under the License. |
17 | | * |
18 | | *****************************************************************************/ |
19 | | /*---------------------------------------------------------------------- |
20 | | I n c l u d e s |
21 | | ----------------------------------------------------------------------*/ |
22 | | |
23 | | #include "dawg.h" |
24 | | #include "params.h" |
25 | | #include "stopper.h" |
26 | | #include "tprintf.h" |
27 | | |
28 | | #include <algorithm> |
29 | | #include <cctype> |
30 | | #include "dict.h" |
31 | | |
32 | | /*---------------------------------------------------------------------- |
33 | | F u n c t i o n s |
34 | | ----------------------------------------------------------------------*/ |
35 | | namespace tesseract { |
36 | | |
37 | | /** |
38 | | * @name go_deeper_dawg_fxn |
39 | | * |
40 | | * If the choice being composed so far could be a dictionary word |
41 | | * keep exploring choices. |
42 | | */ |
43 | | void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, |
44 | | int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, |
45 | | bool word_ending, WERD_CHOICE *word, float certainties[], |
46 | | float *limit, WERD_CHOICE *best_choice, int *attempts_left, |
47 | 617k | void *void_more_args) { |
48 | 617k | auto *more_args = static_cast<DawgArgs *>(void_more_args); |
49 | 617k | word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1); |
50 | 617k | int word_index = word->length() - 1; |
51 | 617k | if (best_choice->rating() < *limit) { |
52 | 0 | return; |
53 | 0 | } |
54 | | // Look up char in DAWG |
55 | | |
56 | | // If the current unichar is an ngram first try calling |
57 | | // letter_is_okay() for each unigram it contains separately. |
58 | 617k | UNICHAR_ID orig_uch_id = word->unichar_id(word_index); |
59 | 617k | bool checked_unigrams = false; |
60 | 617k | if (getUnicharset().get_isngram(orig_uch_id)) { |
61 | 132k | if (dawg_debug_level) { |
62 | 0 | tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str()); |
63 | 0 | } |
64 | 132k | int num_unigrams = 0; |
65 | 132k | word->remove_last_unichar_id(); |
66 | 132k | std::vector<UNICHAR_ID> encoding; |
67 | 132k | const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); |
68 | | // Since the string came out of the unicharset, failure is impossible. |
69 | 132k | ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr)); |
70 | 132k | bool unigrams_ok = true; |
71 | | // Construct DawgArgs that reflect the current state. |
72 | 132k | DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs); |
73 | 132k | DawgPositionVector unigram_updated_dawgs; |
74 | 132k | DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter); |
75 | | // Check unigrams in the ngram with letter_is_okay(). |
76 | 264k | for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) { |
77 | 132k | UNICHAR_ID uch_id = encoding[i]; |
78 | 132k | ASSERT_HOST(uch_id != INVALID_UNICHAR_ID); |
79 | 132k | ++num_unigrams; |
80 | 132k | word->append_unichar_id(uch_id, 1, 0.0, 0.0); |
81 | 132k | unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(), |
82 | 132k | word->unichar_id(word_index + num_unigrams - 1), |
83 | 132k | word_ending && i == encoding.size() - 1); |
84 | 132k | (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs); |
85 | 132k | if (dawg_debug_level) { |
86 | 0 | tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(), |
87 | 0 | unigrams_ok ? "OK" : "not OK"); |
88 | 0 | } |
89 | 132k | } |
90 | | // Restore the word and copy the updated dawg state if needed. |
91 | 264k | while (num_unigrams-- > 0) { |
92 | 132k | word->remove_last_unichar_id(); |
93 | 132k | } |
94 | 132k | word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0); |
95 | 132k | if (unigrams_ok) { |
96 | 0 | checked_unigrams = true; |
97 | 0 | more_args->permuter = unigram_dawg_args.permuter; |
98 | 0 | *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs); |
99 | 0 | } |
100 | 132k | } |
101 | | |
102 | | // Check which dawgs from the dawgs_ vector contain the word |
103 | | // up to and including the current unichar. |
104 | 617k | if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(), |
105 | 617k | word->unichar_id(word_index), word_ending)) { |
106 | | // Add a new word choice |
107 | 0 | if (word_ending) { |
108 | 0 | if (dawg_debug_level) { |
109 | 0 | tprintf("found word = %s\n", word->debug_string().c_str()); |
110 | 0 | } |
111 | 0 | if (strcmp(output_ambig_words_file.c_str(), "") != 0) { |
112 | 0 | if (output_ambig_words_file_ == nullptr) { |
113 | 0 | output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+"); |
114 | 0 | if (output_ambig_words_file_ == nullptr) { |
115 | 0 | tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str()); |
116 | 0 | exit(1); |
117 | 0 | } |
118 | 0 | std::string word_str; |
119 | 0 | word->string_and_lengths(&word_str, nullptr); |
120 | 0 | word_str += " "; |
121 | 0 | fprintf(output_ambig_words_file_, "%s", word_str.c_str()); |
122 | 0 | } |
123 | 0 | std::string word_str; |
124 | 0 | word->string_and_lengths(&word_str, nullptr); |
125 | 0 | word_str += " "; |
126 | 0 | fprintf(output_ambig_words_file_, "%s", word_str.c_str()); |
127 | 0 | } |
128 | 0 | WERD_CHOICE *adjusted_word = word; |
129 | 0 | adjusted_word->set_permuter(more_args->permuter); |
130 | 0 | update_best_choice(*adjusted_word, best_choice); |
131 | 0 | } else { // search the next letter |
132 | | // Make updated_* point to the next entries in the DawgPositionVector |
133 | | // arrays (that were originally created in dawg_permute_and_select) |
134 | 0 | ++(more_args->updated_dawgs); |
135 | | // Make active_dawgs and constraints point to the updated ones. |
136 | 0 | ++(more_args->active_dawgs); |
137 | 0 | permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word, |
138 | 0 | certainties, limit, best_choice, attempts_left, more_args); |
139 | | // Restore previous state to explore another letter in this position. |
140 | 0 | --(more_args->updated_dawgs); |
141 | 0 | --(more_args->active_dawgs); |
142 | 0 | } |
143 | 617k | } else { |
144 | 617k | if (dawg_debug_level) { |
145 | 0 | tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str()); |
146 | 0 | } |
147 | 617k | } |
148 | 617k | } |
149 | | |
150 | | /** |
151 | | * dawg_permute_and_select |
152 | | * |
153 | | * Recursively explore all the possible character combinations in |
154 | | * the given char_choices. Use go_deeper_dawg_fxn() to search all the |
155 | | * dawgs in the dawgs_ vector in parallel and discard invalid words. |
156 | | * |
157 | | * Allocate and return a WERD_CHOICE with the best valid word found. |
158 | | */ |
159 | | WERD_CHOICE *Dict::dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, |
160 | 414k | float rating_limit) { |
161 | 414k | auto *best_choice = new WERD_CHOICE(&getUnicharset()); |
162 | 414k | best_choice->make_bad(); |
163 | 414k | best_choice->set_rating(rating_limit); |
164 | 414k | if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) { |
165 | 0 | return best_choice; |
166 | 0 | } |
167 | 414k | auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1]; |
168 | 414k | init_active_dawgs(&(active_dawgs[0]), true); |
169 | 414k | DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); |
170 | 414k | WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH); |
171 | | |
172 | 414k | float certainties[MAX_WERD_LENGTH]; |
173 | 414k | this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn; |
174 | 414k | int attempts_left = max_permuter_attempts; |
175 | 414k | permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr, |
176 | 414k | &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args); |
177 | 414k | delete[] active_dawgs; |
178 | 414k | return best_choice; |
179 | 414k | } |
180 | | |
181 | | /** |
182 | | * permute_choices |
183 | | * |
184 | | * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST |
185 | | * with the given char_choice_index in char_choices. |
186 | | */ |
187 | | void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, |
188 | | int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, |
189 | | WERD_CHOICE *word, float certainties[], float *limit, |
190 | 432k | WERD_CHOICE *best_choice, int *attempts_left, void *more_args) { |
191 | 432k | if (debug) { |
192 | 0 | tprintf( |
193 | 0 | "%s permute_choices: char_choice_index=%d" |
194 | 0 | " limit=%g rating=%g, certainty=%g word=%s\n", |
195 | 0 | debug, char_choice_index, *limit, word->rating(), word->certainty(), |
196 | 0 | word->debug_string().c_str()); |
197 | 0 | } |
198 | 432k | if (static_cast<unsigned>(char_choice_index) < char_choices.size()) { |
199 | 432k | BLOB_CHOICE_IT blob_choice_it; |
200 | 432k | blob_choice_it.set_to_list(char_choices.at(char_choice_index)); |
201 | 1.12M | for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) { |
202 | 693k | (*attempts_left)--; |
203 | 693k | append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index, |
204 | 693k | prev_char_frag_info, word, certainties, limit, best_choice, attempts_left, |
205 | 693k | more_args); |
206 | 693k | if (*attempts_left <= 0) { |
207 | 0 | if (debug) { |
208 | 0 | tprintf("permute_choices(): attempts_left is 0\n"); |
209 | 0 | } |
210 | 0 | break; |
211 | 0 | } |
212 | 693k | } |
213 | 432k | } |
214 | 432k | } |
215 | | |
216 | | /** |
217 | | * append_choices |
218 | | * |
219 | | * Checks to see whether or not the next choice is worth appending to |
220 | | * the word being generated. If so then keeps going deeper into the word. |
221 | | * |
222 | | * This function assumes that Dict::go_deeper_fxn_ is set. |
223 | | */ |
224 | | void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, |
225 | | const BLOB_CHOICE &blob_choice, int char_choice_index, |
226 | | const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, |
227 | | float certainties[], float *limit, WERD_CHOICE *best_choice, |
228 | 693k | int *attempts_left, void *more_args) { |
229 | 693k | auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1); |
230 | | |
231 | | // Deal with fragments. |
232 | 693k | CHAR_FRAGMENT_INFO char_frag_info; |
233 | 693k | if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(), |
234 | 693k | prev_char_frag_info, debug, word_ending, &char_frag_info)) { |
235 | 57.3k | return; // blob_choice must be an invalid fragment |
236 | 57.3k | } |
237 | | // Search the next letter if this character is a fragment. |
238 | 636k | if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) { |
239 | 18.5k | permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties, |
240 | 18.5k | limit, best_choice, attempts_left, more_args); |
241 | 18.5k | return; |
242 | 18.5k | } |
243 | | |
244 | | // Add the next unichar. |
245 | 617k | float old_rating = word->rating(); |
246 | 617k | float old_certainty = word->certainty(); |
247 | 617k | uint8_t old_permuter = word->permuter(); |
248 | 617k | certainties[word->length()] = char_frag_info.certainty; |
249 | 617k | word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments, |
250 | 617k | char_frag_info.rating, char_frag_info.certainty); |
251 | | |
252 | | // Explore the next unichar. |
253 | 617k | (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending, |
254 | 617k | word, certainties, limit, best_choice, attempts_left, more_args); |
255 | | |
256 | | // Remove the unichar we added to explore other choices in it's place. |
257 | 617k | word->remove_last_unichar_id(); |
258 | 617k | word->set_rating(old_rating); |
259 | 617k | word->set_certainty(old_certainty); |
260 | 617k | word->set_permuter(old_permuter); |
261 | 617k | } |
262 | | |
263 | | /** |
264 | | * @name fragment_state |
265 | | * |
266 | | * Given the current char choice and information about previously seen |
267 | | * fragments, determines whether adjacent character fragments are |
268 | | * present and whether they can be concatenated. |
269 | | * |
270 | | * The given prev_char_frag_info contains: |
271 | | * - fragment: if not nullptr contains information about immediately |
272 | | * preceding fragmented character choice |
273 | | * - num_fragments: number of fragments that have been used so far |
274 | | * to construct a character |
275 | | * - certainty: certainty of the current choice or minimum |
276 | | * certainty of all fragments concatenated so far |
277 | | * - rating: rating of the current choice or sum of fragment |
278 | | * ratings concatenated so far |
279 | | * |
280 | | * The output char_frag_info is filled in as follows: |
281 | | * - character: is set to be nullptr if the choice is a non-matching |
282 | | * or non-ending fragment piece; is set to unichar of the given choice |
283 | | * if it represents a regular character or a matching ending fragment |
284 | | * - fragment,num_fragments,certainty,rating are set as described above |
285 | | * |
286 | | * @returns false if a non-matching fragment is discovered, true otherwise. |
287 | | */ |
288 | | bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, |
289 | | const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, |
290 | 693k | int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) { |
291 | 693k | const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id); |
292 | 693k | const CHAR_FRAGMENT *prev_fragment = |
293 | 693k | prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr; |
294 | | |
295 | | // Print debug info for fragments. |
296 | 693k | if (debug && (prev_fragment || this_fragment)) { |
297 | 0 | tprintf("%s check fragments: choice=%s word_ending=%d\n", debug, |
298 | 0 | getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending); |
299 | 0 | if (prev_fragment) { |
300 | 0 | tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str()); |
301 | 0 | } |
302 | 0 | if (this_fragment) { |
303 | 0 | tprintf("this_fragment %s\n", this_fragment->to_string().c_str()); |
304 | 0 | } |
305 | 0 | } |
306 | | |
307 | 693k | char_frag_info->unichar_id = curr_unichar_id; |
308 | 693k | char_frag_info->fragment = this_fragment; |
309 | 693k | char_frag_info->rating = curr_rating; |
310 | 693k | char_frag_info->certainty = curr_certainty; |
311 | 693k | char_frag_info->num_fragments = 1; |
312 | 693k | if (prev_fragment && !this_fragment) { |
313 | 23.9k | if (debug) { |
314 | 0 | tprintf("Skip choice with incomplete fragment\n"); |
315 | 0 | } |
316 | 23.9k | return false; |
317 | 23.9k | } |
318 | 669k | if (this_fragment) { |
319 | | // We are dealing with a fragment. |
320 | 69.1k | char_frag_info->unichar_id = INVALID_UNICHAR_ID; |
321 | 69.1k | if (prev_fragment) { |
322 | 52.0k | if (!this_fragment->is_continuation_of(prev_fragment)) { |
323 | 33.4k | if (debug) { |
324 | 0 | tprintf("Non-matching fragment piece\n"); |
325 | 0 | } |
326 | 33.4k | return false; |
327 | 33.4k | } |
328 | 18.5k | if (this_fragment->is_ending()) { |
329 | 17.1k | char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar()); |
330 | 17.1k | char_frag_info->fragment = nullptr; |
331 | 17.1k | if (debug) { |
332 | 0 | tprintf("Built character %s from fragments\n", |
333 | 0 | getUnicharset().debug_str(char_frag_info->unichar_id).c_str()); |
334 | 0 | } |
335 | 17.1k | } else { |
336 | 1.46k | if (debug) { |
337 | 0 | tprintf("Record fragment continuation\n"); |
338 | 0 | } |
339 | 1.46k | char_frag_info->fragment = this_fragment; |
340 | 1.46k | } |
341 | | // Update certainty and rating. |
342 | 18.5k | char_frag_info->rating = prev_char_frag_info->rating + curr_rating; |
343 | 18.5k | char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1; |
344 | 18.5k | char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty); |
345 | 18.5k | } else { |
346 | 17.1k | if (this_fragment->is_beginning()) { |
347 | 17.1k | if (debug) { |
348 | 0 | tprintf("Record fragment beginning\n"); |
349 | 0 | } |
350 | 17.1k | } else { |
351 | 0 | if (debug) { |
352 | 0 | tprintf("Non-starting fragment piece with no prev_fragment\n"); |
353 | 0 | } |
354 | 0 | return false; |
355 | 0 | } |
356 | 17.1k | } |
357 | 69.1k | } |
358 | 636k | if (word_ending && char_frag_info->fragment) { |
359 | 0 | if (debug) { |
360 | 0 | tprintf("Word cannot end with a fragment\n"); |
361 | 0 | } |
362 | 0 | return false; |
363 | 0 | } |
364 | 636k | return true; |
365 | 636k | } |
366 | | |
367 | | } // namespace tesseract |