/src/tesseract/src/dict/permdawg.cpp
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * File: permdawg.cpp (Formerly permdawg.c) |
4 | | * Description: Scale word choices by a dictionary |
5 | | * Author: Mark Seaman, OCR Technology |
6 | | * |
7 | | * (c) Copyright 1987, Hewlett-Packard Company. |
8 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | ** you may not use this file except in compliance with the License. |
10 | | ** You may obtain a copy of the License at |
11 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
12 | | ** Unless required by applicable law or agreed to in writing, software |
13 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
14 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | ** See the License for the specific language governing permissions and |
16 | | ** limitations under the License. |
17 | | * |
18 | | *****************************************************************************/ |
19 | | /*---------------------------------------------------------------------- |
20 | | I n c l u d e s |
21 | | ----------------------------------------------------------------------*/ |
22 | | |
23 | | #include "dawg.h" |
24 | | #include "params.h" |
25 | | #include "stopper.h" |
26 | | #include "tprintf.h" |
27 | | |
28 | | #include <algorithm> |
29 | | #include <cctype> |
30 | | #include "dict.h" |
31 | | |
32 | | /*---------------------------------------------------------------------- |
33 | | F u n c t i o n s |
34 | | ----------------------------------------------------------------------*/ |
35 | | namespace tesseract { |
36 | | |
37 | | /** |
38 | | * @name go_deeper_dawg_fxn |
39 | | * |
40 | | * If the choice being composed so far could be a dictionary word |
41 | | * keep exploring choices. |
42 | | */ |
43 | | void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, |
44 | | int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, |
45 | | bool word_ending, WERD_CHOICE *word, float certainties[], |
46 | | float *limit, WERD_CHOICE *best_choice, int *attempts_left, |
47 | 661k | void *void_more_args) { |
48 | 661k | auto *more_args = static_cast<DawgArgs *>(void_more_args); |
49 | 661k | word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1); |
50 | 661k | int word_index = word->length() - 1; |
51 | 661k | if (best_choice->rating() < *limit) { |
52 | 0 | return; |
53 | 0 | } |
54 | | // Look up char in DAWG |
55 | | |
56 | | // If the current unichar is an ngram first try calling |
57 | | // letter_is_okay() for each unigram it contains separately. |
58 | 661k | UNICHAR_ID orig_uch_id = word->unichar_id(word_index); |
59 | 661k | bool checked_unigrams = false; |
60 | 661k | if (getUnicharset().get_isngram(orig_uch_id)) { |
61 | 131k | if (dawg_debug_level) { |
62 | 0 | tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str()); |
63 | 0 | } |
64 | 131k | int num_unigrams = 0; |
65 | 131k | word->remove_last_unichar_id(); |
66 | 131k | std::vector<UNICHAR_ID> encoding; |
67 | 131k | const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); |
68 | | // Since the string came out of the unicharset, failure is impossible. |
69 | 131k | ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr)); |
70 | 131k | bool unigrams_ok = true; |
71 | | // Construct DawgArgs that reflect the current state. |
72 | 131k | DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs); |
73 | 131k | DawgPositionVector unigram_updated_dawgs; |
74 | 131k | DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter); |
75 | | // Check unigrams in the ngram with letter_is_okay(). |
76 | 262k | for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) { |
77 | 131k | UNICHAR_ID uch_id = encoding[i]; |
78 | 131k | ASSERT_HOST(uch_id != INVALID_UNICHAR_ID); |
79 | 131k | ++num_unigrams; |
80 | 131k | word->append_unichar_id(uch_id, 1, 0.0, 0.0); |
81 | 131k | unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(), |
82 | 131k | word->unichar_id(word_index + num_unigrams - 1), |
83 | 131k | word_ending && i == encoding.size() - 1); |
84 | 131k | (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs); |
85 | 131k | if (dawg_debug_level) { |
86 | 0 | tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(), |
87 | 0 | unigrams_ok ? "OK" : "not OK"); |
88 | 0 | } |
89 | 131k | } |
90 | | // Restore the word and copy the updated dawg state if needed. |
91 | 262k | while (num_unigrams-- > 0) { |
92 | 131k | word->remove_last_unichar_id(); |
93 | 131k | } |
94 | 131k | word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0); |
95 | 131k | if (unigrams_ok) { |
96 | 0 | checked_unigrams = true; |
97 | 0 | more_args->permuter = unigram_dawg_args.permuter; |
98 | 0 | *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs); |
99 | 0 | } |
100 | 131k | } |
101 | | |
102 | | // Check which dawgs from the dawgs_ vector contain the word |
103 | | // up to and including the current unichar. |
104 | 661k | if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(), |
105 | 661k | word->unichar_id(word_index), word_ending)) { |
106 | | // Add a new word choice |
107 | 0 | if (word_ending) { |
108 | 0 | if (dawg_debug_level) { |
109 | 0 | tprintf("found word = %s\n", word->debug_string().c_str()); |
110 | 0 | } |
111 | 0 | if (strcmp(output_ambig_words_file.c_str(), "") != 0) { |
112 | 0 | if (output_ambig_words_file_ == nullptr) { |
113 | 0 | output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+"); |
114 | 0 | if (output_ambig_words_file_ == nullptr) { |
115 | 0 | tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str()); |
116 | 0 | exit(1); |
117 | 0 | } |
118 | 0 | std::string word_str; |
119 | 0 | word->string_and_lengths(&word_str, nullptr); |
120 | 0 | word_str += " "; |
121 | 0 | fprintf(output_ambig_words_file_, "%s", word_str.c_str()); |
122 | 0 | } |
123 | 0 | std::string word_str; |
124 | 0 | word->string_and_lengths(&word_str, nullptr); |
125 | 0 | word_str += " "; |
126 | 0 | fprintf(output_ambig_words_file_, "%s", word_str.c_str()); |
127 | 0 | } |
128 | 0 | WERD_CHOICE *adjusted_word = word; |
129 | 0 | adjusted_word->set_permuter(more_args->permuter); |
130 | 0 | update_best_choice(*adjusted_word, best_choice); |
131 | 0 | } else { // search the next letter |
132 | | // Make updated_* point to the next entries in the DawgPositionVector |
133 | | // arrays (that were originally created in dawg_permute_and_select) |
134 | 0 | ++(more_args->updated_dawgs); |
135 | | // Make active_dawgs and constraints point to the updated ones. |
136 | 0 | ++(more_args->active_dawgs); |
137 | 0 | permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word, |
138 | 0 | certainties, limit, best_choice, attempts_left, more_args); |
139 | | // Restore previous state to explore another letter in this position. |
140 | 0 | --(more_args->updated_dawgs); |
141 | 0 | --(more_args->active_dawgs); |
142 | 0 | } |
143 | 661k | } else { |
144 | 661k | if (dawg_debug_level) { |
145 | 0 | tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str()); |
146 | 0 | } |
147 | 661k | } |
148 | 661k | } |
149 | | |
150 | | /** |
151 | | * dawg_permute_and_select |
152 | | * |
153 | | * Recursively explore all the possible character combinations in |
154 | | * the given char_choices. Use go_deeper_dawg_fxn() to search all the |
155 | | * dawgs in the dawgs_ vector in parallel and discard invalid words. |
156 | | * |
157 | | * Allocate and return a WERD_CHOICE with the best valid word found. |
158 | | */ |
159 | | WERD_CHOICE *Dict::dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, |
160 | 417k | float rating_limit) { |
161 | 417k | auto *best_choice = new WERD_CHOICE(&getUnicharset()); |
162 | 417k | best_choice->make_bad(); |
163 | 417k | best_choice->set_rating(rating_limit); |
164 | 417k | if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) { |
165 | 0 | return best_choice; |
166 | 0 | } |
167 | 417k | auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1]; |
168 | 417k | init_active_dawgs(&(active_dawgs[0]), true); |
169 | 417k | DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); |
170 | 417k | WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH); |
171 | | |
172 | 417k | float certainties[MAX_WERD_LENGTH]; |
173 | 417k | this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn; |
174 | 417k | int attempts_left = max_permuter_attempts; |
175 | 417k | permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr, |
176 | 417k | &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args); |
177 | 417k | delete[] active_dawgs; |
178 | 417k | return best_choice; |
179 | 417k | } |
180 | | |
181 | | /** |
182 | | * permute_choices |
183 | | * |
184 | | * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST |
185 | | * with the given char_choice_index in char_choices. |
186 | | */ |
187 | | void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, |
188 | | int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, |
189 | | WERD_CHOICE *word, float certainties[], float *limit, |
190 | 491k | WERD_CHOICE *best_choice, int *attempts_left, void *more_args) { |
191 | 491k | if (debug) { |
192 | 0 | tprintf( |
193 | 0 | "%s permute_choices: char_choice_index=%d" |
194 | 0 | " limit=%g rating=%g, certainty=%g word=%s\n", |
195 | 0 | debug, char_choice_index, *limit, word->rating(), word->certainty(), |
196 | 0 | word->debug_string().c_str()); |
197 | 0 | } |
198 | 491k | if (static_cast<unsigned>(char_choice_index) < char_choices.size()) { |
199 | 491k | BLOB_CHOICE_IT blob_choice_it; |
200 | 491k | blob_choice_it.set_to_list(char_choices.at(char_choice_index)); |
201 | 1.56M | for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) { |
202 | 1.07M | (*attempts_left)--; |
203 | 1.07M | append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index, |
204 | 1.07M | prev_char_frag_info, word, certainties, limit, best_choice, attempts_left, |
205 | 1.07M | more_args); |
206 | 1.07M | if (*attempts_left <= 0) { |
207 | 0 | if (debug) { |
208 | 0 | tprintf("permute_choices(): attempts_left is 0\n"); |
209 | 0 | } |
210 | 0 | break; |
211 | 0 | } |
212 | 1.07M | } |
213 | 491k | } |
214 | 491k | } |
215 | | |
216 | | /** |
217 | | * append_choices |
218 | | * |
219 | | * Checks to see whether or not the next choice is worth appending to |
220 | | * the word being generated. If so then keeps going deeper into the word. |
221 | | * |
222 | | * This function assumes that Dict::go_deeper_fxn_ is set. |
223 | | */ |
224 | | void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, |
225 | | const BLOB_CHOICE &blob_choice, int char_choice_index, |
226 | | const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, |
227 | | float certainties[], float *limit, WERD_CHOICE *best_choice, |
228 | 1.07M | int *attempts_left, void *more_args) { |
229 | 1.07M | auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1); |
230 | | |
231 | | // Deal with fragments. |
232 | 1.07M | CHAR_FRAGMENT_INFO char_frag_info; |
233 | 1.07M | if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(), |
234 | 1.07M | prev_char_frag_info, debug, word_ending, &char_frag_info)) { |
235 | 337k | return; // blob_choice must be an invalid fragment |
236 | 337k | } |
237 | | // Search the next letter if this character is a fragment. |
238 | 734k | if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) { |
239 | 73.4k | permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties, |
240 | 73.4k | limit, best_choice, attempts_left, more_args); |
241 | 73.4k | return; |
242 | 73.4k | } |
243 | | |
244 | | // Add the next unichar. |
245 | 661k | float old_rating = word->rating(); |
246 | 661k | float old_certainty = word->certainty(); |
247 | 661k | uint8_t old_permuter = word->permuter(); |
248 | 661k | certainties[word->length()] = char_frag_info.certainty; |
249 | 661k | word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments, |
250 | 661k | char_frag_info.rating, char_frag_info.certainty); |
251 | | |
252 | | // Explore the next unichar. |
253 | 661k | (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending, |
254 | 661k | word, certainties, limit, best_choice, attempts_left, more_args); |
255 | | |
256 | | // Remove the unichar we added to explore other choices in it's place. |
257 | 661k | word->remove_last_unichar_id(); |
258 | 661k | word->set_rating(old_rating); |
259 | 661k | word->set_certainty(old_certainty); |
260 | 661k | word->set_permuter(old_permuter); |
261 | 661k | } |
262 | | |
263 | | /** |
264 | | * @name fragment_state |
265 | | * |
266 | | * Given the current char choice and information about previously seen |
267 | | * fragments, determines whether adjacent character fragments are |
268 | | * present and whether they can be concatenated. |
269 | | * |
270 | | * The given prev_char_frag_info contains: |
271 | | * - fragment: if not nullptr contains information about immediately |
272 | | * preceding fragmented character choice |
273 | | * - num_fragments: number of fragments that have been used so far |
274 | | * to construct a character |
275 | | * - certainty: certainty of the current choice or minimum |
276 | | * certainty of all fragments concatenated so far |
277 | | * - rating: rating of the current choice or sum of fragment |
278 | | * ratings concatenated so far |
279 | | * |
280 | | * The output char_frag_info is filled in as follows: |
281 | | * - character: is set to be nullptr if the choice is a non-matching |
282 | | * or non-ending fragment piece; is set to unichar of the given choice |
283 | | * if it represents a regular character or a matching ending fragment |
284 | | * - fragment,num_fragments,certainty,rating are set as described above |
285 | | * |
286 | | * @returns false if a non-matching fragment is discovered, true otherwise. |
287 | | */ |
288 | | bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, |
289 | | const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, |
290 | 1.07M | int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) { |
291 | 1.07M | const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id); |
292 | 1.07M | const CHAR_FRAGMENT *prev_fragment = |
293 | 1.07M | prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr; |
294 | | |
295 | | // Print debug info for fragments. |
296 | 1.07M | if (debug && (prev_fragment || this_fragment)) { |
297 | 0 | tprintf("%s check fragments: choice=%s word_ending=%d\n", debug, |
298 | 0 | getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending); |
299 | 0 | if (prev_fragment) { |
300 | 0 | tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str()); |
301 | 0 | } |
302 | 0 | if (this_fragment) { |
303 | 0 | tprintf("this_fragment %s\n", this_fragment->to_string().c_str()); |
304 | 0 | } |
305 | 0 | } |
306 | | |
307 | 1.07M | char_frag_info->unichar_id = curr_unichar_id; |
308 | 1.07M | char_frag_info->fragment = this_fragment; |
309 | 1.07M | char_frag_info->rating = curr_rating; |
310 | 1.07M | char_frag_info->certainty = curr_certainty; |
311 | 1.07M | char_frag_info->num_fragments = 1; |
312 | 1.07M | if (prev_fragment && !this_fragment) { |
313 | 80.2k | if (debug) { |
314 | 0 | tprintf("Skip choice with incomplete fragment\n"); |
315 | 0 | } |
316 | 80.2k | return false; |
317 | 80.2k | } |
318 | 992k | if (this_fragment) { |
319 | | // We are dealing with a fragment. |
320 | 391k | char_frag_info->unichar_id = INVALID_UNICHAR_ID; |
321 | 391k | if (prev_fragment) { |
322 | 330k | if (!this_fragment->is_continuation_of(prev_fragment)) { |
323 | 257k | if (debug) { |
324 | 0 | tprintf("Non-matching fragment piece\n"); |
325 | 0 | } |
326 | 257k | return false; |
327 | 257k | } |
328 | 73.4k | if (this_fragment->is_ending()) { |
329 | 61.0k | char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar()); |
330 | 61.0k | char_frag_info->fragment = nullptr; |
331 | 61.0k | if (debug) { |
332 | 0 | tprintf("Built character %s from fragments\n", |
333 | 0 | getUnicharset().debug_str(char_frag_info->unichar_id).c_str()); |
334 | 0 | } |
335 | 61.0k | } else { |
336 | 12.4k | if (debug) { |
337 | 0 | tprintf("Record fragment continuation\n"); |
338 | 0 | } |
339 | 12.4k | char_frag_info->fragment = this_fragment; |
340 | 12.4k | } |
341 | | // Update certainty and rating. |
342 | 73.4k | char_frag_info->rating = prev_char_frag_info->rating + curr_rating; |
343 | 73.4k | char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1; |
344 | 73.4k | char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty); |
345 | 73.4k | } else { |
346 | 61.0k | if (this_fragment->is_beginning()) { |
347 | 61.0k | if (debug) { |
348 | 0 | tprintf("Record fragment beginning\n"); |
349 | 0 | } |
350 | 61.0k | } else { |
351 | 0 | if (debug) { |
352 | 0 | tprintf("Non-starting fragment piece with no prev_fragment\n"); |
353 | 0 | } |
354 | 0 | return false; |
355 | 0 | } |
356 | 61.0k | } |
357 | 391k | } |
358 | 734k | if (word_ending && char_frag_info->fragment) { |
359 | 0 | if (debug) { |
360 | 0 | tprintf("Word cannot end with a fragment\n"); |
361 | 0 | } |
362 | 0 | return false; |
363 | 0 | } |
364 | 734k | return true; |
365 | 734k | } |
366 | | |
367 | | } // namespace tesseract |