/src/tesseract/src/ccmain/control.cpp
Line | Count | Source |
1 | | /****************************************************************** |
2 | | * File: control.cpp (Formerly control.c) |
3 | | * Description: Module-independent matcher controller. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1992, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | // Include automatically generated configuration file if running autoconf. |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" |
22 | | #endif |
23 | | |
24 | | #include <cctype> |
25 | | #include <cmath> |
26 | | #include <cstdint> // for int16_t, int32_t |
27 | | #include <cstdio> // for fclose, fopen, FILE |
28 | | #include <ctime> // for clock |
29 | | #include "control.h" |
30 | | #ifndef DISABLED_LEGACY_ENGINE |
31 | | # include "docqual.h" |
32 | | # include "drawfx.h" |
33 | | # include "fixspace.h" |
34 | | #endif |
35 | | #include <tesseract/ocrclass.h> |
36 | | #include "lstmrecognizer.h" |
37 | | #include "output.h" |
38 | | #include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO... |
39 | | #ifndef DISABLED_LEGACY_ENGINE |
40 | | # include "reject.h" |
41 | | #endif |
42 | | #include "sorthelper.h" |
43 | | #include "tesseractclass.h" |
44 | | #include "tessvars.h" |
45 | | #include "werdit.h" |
46 | | |
47 | | const char *const kBackUpConfigFile = "tempconfigdata.config"; |
48 | | #ifndef DISABLED_LEGACY_ENGINE |
49 | | // Min believable x-height for any text when refitting as a fraction of |
50 | | // original x-height |
51 | | const double kMinRefitXHeightFraction = 0.5; |
52 | | #endif // ! DISABLED_LEGACY_ENGINE |
53 | | |
54 | | /** |
55 | | * Make a word from the selected blobs and run Tess on them. |
56 | | * |
57 | | * @param page_res recognise blobs |
58 | | * @param selection_box within this box |
59 | | */ |
60 | | namespace tesseract { |
61 | | |
62 | 0 | void Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) { |
63 | 0 | PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box); |
64 | 0 | if (it != nullptr) { |
65 | 0 | recog_interactive(it); |
66 | 0 | it->DeleteCurrentWord(); |
67 | 0 | delete it; |
68 | 0 | } |
69 | 0 | } |
70 | | |
71 | | /** |
72 | | * Recognize a single word in interactive mode. |
73 | | * |
74 | | * @param pr_it the page results iterator |
75 | | */ |
76 | 0 | bool Tesseract::recog_interactive(PAGE_RES_IT *pr_it) { |
77 | 0 | WordData word_data(*pr_it); |
78 | 0 | SetupWordPassN(2, &word_data); |
79 | | // LSTM doesn't run on pass2, but we want to run pass2 for tesseract. |
80 | 0 | if (lstm_recognizer_ == nullptr) { |
81 | 0 | #ifndef DISABLED_LEGACY_ENGINE |
82 | 0 | classify_word_and_language(2, pr_it, &word_data); |
83 | 0 | #endif // ndef DISABLED_LEGACY_ENGINE |
84 | 0 | } else { |
85 | 0 | classify_word_and_language(1, pr_it, &word_data); |
86 | 0 | } |
87 | 0 | #ifndef DISABLED_LEGACY_ENGINE |
88 | 0 | if (tessedit_debug_quality_metrics) { |
89 | 0 | int16_t char_qual; |
90 | 0 | int16_t good_char_qual; |
91 | 0 | WERD_RES *word_res = pr_it->word(); |
92 | 0 | word_char_quality(word_res, &char_qual, &good_char_qual); |
93 | 0 | tprintf( |
94 | 0 | "\n%d chars; word_blob_quality: %d; outline_errs: %d; " |
95 | 0 | "char_quality: %d; good_char_quality: %d\n", |
96 | 0 | word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res), |
97 | 0 | char_qual, good_char_qual); |
98 | 0 | } |
99 | 0 | #endif // ndef DISABLED_LEGACY_ENGINE |
100 | 0 | return true; |
101 | 0 | } |
102 | | |
103 | | // Helper function to check for a target word and handle it appropriately. |
104 | | // Inspired by Jetsoft's requirement to process only single words on pass2 |
105 | | // and beyond. |
106 | | // If word_config is not null: |
107 | | // If the word_box and target_word_box overlap, read the word_config file |
108 | | // else reset to previous config data. |
109 | | // return true. |
110 | | // else |
111 | | // If the word_box and target_word_box overlap or pass <= 1, return true. |
112 | | // Note that this function uses a fixed temporary file for storing the previous |
113 | | // configs, so it is neither thread-safe, nor process-safe, but the assumption |
114 | | // is that it will only be used for one debug window at a time. |
115 | | // |
116 | | // Since this function is used for debugging (and not to change OCR results) |
117 | | // set only debug params from the word config file. |
118 | | bool Tesseract::ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, |
119 | 0 | const char *word_config, int pass) { |
120 | 0 | if (word_config != nullptr) { |
121 | 0 | if (word_box.major_overlap(target_word_box)) { |
122 | 0 | if (backup_config_file_ == nullptr) { |
123 | 0 | backup_config_file_ = kBackUpConfigFile; |
124 | 0 | FILE *config_fp = fopen(backup_config_file_, "wb"); |
125 | 0 | if (config_fp == nullptr) { |
126 | 0 | tprintf("Error, failed to open file \"%s\"\n", backup_config_file_); |
127 | 0 | } else { |
128 | 0 | ParamUtils::PrintParams(config_fp, params()); |
129 | 0 | fclose(config_fp); |
130 | 0 | } |
131 | 0 | ParamUtils::ReadParamsFile(word_config, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params()); |
132 | 0 | } |
133 | 0 | } else { |
134 | 0 | if (backup_config_file_ != nullptr) { |
135 | 0 | ParamUtils::ReadParamsFile(backup_config_file_, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params()); |
136 | 0 | backup_config_file_ = nullptr; |
137 | 0 | } |
138 | 0 | } |
139 | 0 | } else if (pass > 1 && !word_box.major_overlap(target_word_box)) { |
140 | 0 | return false; |
141 | 0 | } |
142 | 0 | return true; |
143 | 0 | } |
144 | | |
145 | | /** If tesseract is to be run, sets the words up ready for it. */ |
146 | | void Tesseract::SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, |
147 | 31.3k | PAGE_RES *page_res, std::vector<WordData> *words) { |
148 | | // Prepare all the words. |
149 | 31.3k | PAGE_RES_IT page_res_it(page_res); |
150 | 476k | for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { |
151 | 444k | if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(), |
152 | 444k | *target_word_box, word_config, 1)) { |
153 | 444k | words->push_back(WordData(page_res_it)); |
154 | 444k | } |
155 | 444k | } |
156 | | // Setup all the words for recognition with polygonal approximation. |
157 | 476k | for (unsigned w = 0; w < words->size(); ++w) { |
158 | 444k | SetupWordPassN(pass_n, &(*words)[w]); |
159 | 444k | if (w > 0) { |
160 | 413k | (*words)[w].prev_word = &(*words)[w - 1]; |
161 | 413k | } |
162 | 444k | } |
163 | 31.3k | } |
164 | | |
165 | | // Sets up the single word ready for whichever engine is to be run. |
166 | 444k | void Tesseract::SetupWordPassN(int pass_n, WordData *word) { |
167 | 444k | if (pass_n == 1 || !word->word->done) { |
168 | 277k | if (pass_n == 1) { |
169 | 214k | word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, |
170 | 214k | nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model, |
171 | 214k | poly_allow_detailed_fx, word->row, word->block); |
172 | 214k | } else if (pass_n == 2) { |
173 | | // TODO(rays) Should we do this on pass1 too? |
174 | 63.2k | word->word->caps_height = 0.0; |
175 | 63.2k | if (word->word->x_height == 0.0f) { |
176 | 0 | word->word->x_height = word->row->x_height(); |
177 | 0 | } |
178 | 63.2k | } |
179 | 277k | word->lang_words.truncate(0); |
180 | 554k | for (unsigned s = 0; s <= sub_langs_.size(); ++s) { |
181 | | // The sub_langs_.size() entry is for the master language. |
182 | 277k | Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this; |
183 | 277k | auto *word_res = new WERD_RES; |
184 | 277k | word_res->InitForRetryRecognition(*word->word); |
185 | 277k | word->lang_words.push_back(word_res); |
186 | | // LSTM doesn't get setup for pass2. |
187 | 277k | if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) { |
188 | 277k | word_res->SetupForRecognition( |
189 | 277k | lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr, |
190 | 277k | lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model, |
191 | 277k | lang_t->poly_allow_detailed_fx, word->row, word->block); |
192 | 277k | } |
193 | 277k | } |
194 | 277k | } |
195 | 444k | } |
196 | | |
197 | | // Runs word recognition on all the words. |
198 | | bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, |
199 | 31.3k | std::vector<WordData> *words) { |
200 | | // TODO(rays) Before this loop can be parallelized (it would yield a massive |
201 | | // speed-up) all remaining member globals need to be converted to local/heap |
202 | | // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be |
203 | | // added. The results will be significantly different with adaption on, and |
204 | | // deterioration will need investigation. |
205 | 31.3k | pr_it->restart_page(); |
206 | 476k | for (unsigned w = 0; w < words->size(); ++w) { |
207 | 444k | WordData *word = &(*words)[w]; |
208 | 444k | if (w > 0) { |
209 | 413k | word->prev_word = &(*words)[w - 1]; |
210 | 413k | } |
211 | 444k | if (monitor != nullptr) { |
212 | 0 | monitor->ocr_alive = true; |
213 | 0 | if (pass_n == 1) { |
214 | 0 | monitor->progress = 70 * w / words->size(); |
215 | 0 | } else { |
216 | 0 | monitor->progress = 70 + 30 * w / words->size(); |
217 | 0 | } |
218 | 0 | if (monitor->progress_callback2 != nullptr) { |
219 | 0 | TBOX box = pr_it->word()->word->bounding_box(); |
220 | 0 | (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom()); |
221 | 0 | } |
222 | 0 | if (monitor->deadline_exceeded() || |
223 | 0 | (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) { |
224 | | // Timeout. Fake out the rest of the words. |
225 | 0 | for (; w < words->size(); ++w) { |
226 | 0 | (*words)[w].word->SetupFake(unicharset); |
227 | 0 | } |
228 | 0 | return false; |
229 | 0 | } |
230 | 0 | } |
231 | 444k | if (word->word->tess_failed) { |
232 | 2.76k | unsigned s; |
233 | 4.13k | for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) { |
234 | 1.37k | } |
235 | | // If all are failed, skip it. Image words are skipped by this test. |
236 | 2.76k | if (s > word->lang_words.size()) { |
237 | 0 | continue; |
238 | 0 | } |
239 | 2.76k | } |
240 | | // Sync pr_it with the WordData. |
241 | 444k | while (pr_it->word() != nullptr && pr_it->word() != word->word) { |
242 | 0 | pr_it->forward(); |
243 | 0 | } |
244 | 444k | ASSERT_HOST(pr_it->word() != nullptr); |
245 | 444k | bool make_next_word_fuzzy = false; |
246 | 444k | #ifndef DISABLED_LEGACY_ENGINE |
247 | 444k | if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) { |
248 | | // Needs to be setup again to see the new outlines in the chopped_word. |
249 | 0 | SetupWordPassN(pass_n, word); |
250 | 0 | } |
251 | 444k | #endif // ndef DISABLED_LEGACY_ENGINE |
252 | | |
253 | 444k | classify_word_and_language(pass_n, pr_it, word); |
254 | 444k | if (tessedit_dump_choices || debug_noise_removal) { |
255 | 0 | tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(), |
256 | 0 | word->word->best_choice->debug_string().c_str()); |
257 | 0 | } |
258 | 444k | pr_it->forward(); |
259 | 444k | if (make_next_word_fuzzy && pr_it->word() != nullptr) { |
260 | 0 | pr_it->MakeCurrentWordFuzzy(); |
261 | 0 | } |
262 | 444k | } |
263 | 31.3k | return true; |
264 | 31.3k | } |
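The monitor handling above splits progress 0-70% across pass 1 and 70-100% across pass 2, and checks the deadline and cancel callback between words; all of it is driven by the ETEXT_DESC the caller supplies. A minimal sketch of wiring such a monitor, assuming the field names and CANCEL_FUNC shape from <tesseract/ocrclass.h> as they appear at the call sites above (set_deadline_msecs() is assumed from the same header):

    #include <tesseract/ocrclass.h>

    // Shape matched to the call above: (*monitor->cancel)(monitor->cancel_this, n).
    // Returning true makes the remaining words be faked out and the pass return false.
    static bool CancelIfFlagSet(void *cancel_this, int /*word_count*/) {
      return *static_cast<volatile bool *>(cancel_this);
    }

    void SetupMonitor(ETEXT_DESC *monitor, volatile bool *stop_flag) {
      monitor->cancel = CancelIfFlagSet;
      monitor->cancel_this = stop_flag;
      monitor->set_deadline_msecs(5000);  // pairs with deadline_exceeded() above
    }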
265 | | |
266 | | /** |
267 | | * recog_all_words() |
268 | | * |
269 | | * Walk the page_res, recognizing all the words. |
270 | | * If monitor is not null, it is used as a progress monitor/timeout/cancel. |
271 | | * If dopasses is 0, all recognition passes are run, |
272 | | * 1 just pass 1, 2 passes 2 and higher. |
273 | | * If target_word_box is not null, special things are done to words that |
274 | | * overlap the target_word_box: |
275 | | * if word_config is not null, the word config file is read for just the |
276 | | * target word(s), otherwise, on pass 2 and beyond ONLY the target words |
277 | | * are processed (Jetsoft modification.) |
278 | | * Returns false if we cancelled prematurely. |
279 | | * |
280 | | * @param page_res page structure |
281 | | * @param monitor progress monitor |
282 | | * @param word_config word_config file |
283 | | * @param target_word_box specifies just to extract a rectangle |
284 | | * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher |
285 | | */ |
286 | | |
287 | | bool Tesseract::recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, |
288 | | const TBOX *target_word_box, const char *word_config, |
289 | 15.6k | int dopasses) { |
290 | 15.6k | PAGE_RES_IT page_res_it(page_res); |
291 | | |
292 | 15.6k | if (tessedit_minimal_rej_pass1) { |
293 | 0 | tessedit_test_adaption.set_value(true); |
294 | 0 | tessedit_minimal_rejection.set_value(true); |
295 | 0 | } |
296 | | |
297 | 15.6k | if (dopasses == 0 || dopasses == 1) { |
298 | 15.6k | page_res_it.restart_page(); |
299 | | // ****************** Pass 1 ******************* |
300 | | |
301 | 15.6k | #ifndef DISABLED_LEGACY_ENGINE |
302 | | // If the adaptive classifier is full switch to one we prepared earlier, |
303 | | // ie on the previous page. If the current adaptive classifier is non-empty, |
304 | | // prepare a backup starting at this page, in case it fills up. Do all this |
305 | | // independently for each language. |
306 | 15.6k | if (AdaptiveClassifierIsFull()) { |
307 | 1 | SwitchAdaptiveClassifier(); |
308 | 15.6k | } else if (!AdaptiveClassifierIsEmpty()) { |
309 | 13.2k | StartBackupAdaptiveClassifier(); |
310 | 13.2k | } |
311 | | // Now check the sub-langs as well. |
312 | 15.6k | for (auto &lang : sub_langs_) { |
313 | 0 | if (lang->AdaptiveClassifierIsFull()) { |
314 | 0 | lang->SwitchAdaptiveClassifier(); |
315 | 0 | } else if (!lang->AdaptiveClassifierIsEmpty()) { |
316 | 0 | lang->StartBackupAdaptiveClassifier(); |
317 | 0 | } |
318 | 0 | } |
319 | | |
320 | 15.6k | #endif // ndef DISABLED_LEGACY_ENGINE |
321 | | |
322 | | // Set up all words ready for recognition, so that if parallelism is on |
323 | | // all the input and output classes are ready to run the classifier. |
324 | 15.6k | std::vector<WordData> words; |
325 | 15.6k | SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words); |
326 | 15.6k | #ifndef DISABLED_LEGACY_ENGINE |
327 | 15.6k | if (tessedit_parallelize) { |
328 | 0 | PrerecAllWordsPar(words); |
329 | 0 | } |
330 | 15.6k | #endif // ndef DISABLED_LEGACY_ENGINE |
331 | | |
332 | 15.6k | stats_.word_count = words.size(); |
333 | | |
334 | 15.6k | stats_.dict_words = 0; |
335 | 15.6k | stats_.doc_blob_quality = 0; |
336 | 15.6k | stats_.doc_outline_errs = 0; |
337 | 15.6k | stats_.doc_char_quality = 0; |
338 | 15.6k | stats_.good_char_count = 0; |
339 | 15.6k | stats_.doc_good_char_quality = 0; |
340 | | |
341 | 15.6k | most_recently_used_ = this; |
342 | | // Run pass 1 word recognition. |
343 | 15.6k | if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) { |
344 | 0 | return false; |
345 | 0 | } |
346 | | // Pass 1 post-processing. |
347 | 246k | for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { |
348 | 230k | if (page_res_it.word()->word->flag(W_REP_CHAR)) { |
349 | 0 | fix_rep_char(&page_res_it); |
350 | 0 | continue; |
351 | 0 | } |
352 | | |
353 | | // Count dict words. |
354 | 230k | if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) { |
355 | 0 | ++(stats_.dict_words); |
356 | 0 | } |
357 | | |
358 | | // Update misadaption log (we only need to do it on pass 1, since |
359 | | // adaption only happens on this pass). |
360 | 230k | if (page_res_it.word()->blamer_bundle != nullptr && |
361 | 230k | page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) { |
362 | 0 | page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug()); |
363 | 0 | } |
364 | 230k | } |
365 | 15.6k | } |
366 | | |
367 | 15.6k | if (dopasses == 1) { |
368 | 0 | return true; |
369 | 0 | } |
370 | | |
371 | 15.6k | #ifndef DISABLED_LEGACY_ENGINE |
372 | | |
373 | | // ****************** Pass 2 ******************* |
374 | 15.6k | if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) { |
375 | 15.6k | page_res_it.restart_page(); |
376 | 15.6k | std::vector<WordData> words; |
377 | 15.6k | SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words); |
378 | 15.6k | if (tessedit_parallelize) { |
379 | 0 | PrerecAllWordsPar(words); |
380 | 0 | } |
381 | 15.6k | most_recently_used_ = this; |
382 | | // Run pass 2 word recognition. |
383 | 15.6k | if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) { |
384 | 0 | return false; |
385 | 0 | } |
386 | 15.6k | } |
387 | | |
388 | | // The next passes are only required for Tess-only. |
389 | 15.6k | if (AnyTessLang() && !AnyLSTMLang()) { |
390 | | // ****************** Pass 3 ******************* |
391 | | // Fix fuzzy spaces. |
392 | |
393 | 0 | if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word && |
394 | 0 | !right_to_left()) { |
395 | 0 | fix_fuzzy_spaces(monitor, stats_.word_count, page_res); |
396 | 0 | } |
397 | | |
398 | | // ****************** Pass 4 ******************* |
399 | 0 | if (tessedit_enable_dict_correction) { |
400 | 0 | dictionary_correction_pass(page_res); |
401 | 0 | } |
402 | 0 | if (tessedit_enable_bigram_correction) { |
403 | 0 | bigram_correction_pass(page_res); |
404 | 0 | } |
405 | | |
406 | | // ****************** Pass 5,6 ******************* |
407 | 0 | rejection_passes(page_res, monitor, target_word_box, word_config); |
408 | | |
409 | | // ****************** Pass 8 ******************* |
410 | 0 | font_recognition_pass(page_res); |
411 | | |
412 | | // ****************** Pass 9 ******************* |
413 | | // Check the correctness of the final results. |
414 | 0 | blamer_pass(page_res); |
415 | 0 | script_pos_pass(page_res); |
416 | 0 | } |
417 | | |
418 | 15.6k | #endif // ndef DISABLED_LEGACY_ENGINE |
419 | | |
420 | | // Write results pass. |
421 | | // This is now redundant, but retained commented so show how to obtain |
422 | | // bounding boxes and style information. |
423 | | |
424 | 15.6k | #ifndef DISABLED_LEGACY_ENGINE |
425 | | // changed by jetsoft |
426 | | // needed for dll to output memory structure |
427 | 15.6k | if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) { |
428 | 0 | output_pass(page_res_it, target_word_box); |
429 | 0 | } |
430 | | // end jetsoft |
431 | 15.6k | #endif // ndef DISABLED_LEGACY_ENGINE |
432 | | |
433 | 15.6k | const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode)); |
434 | 15.6k | textord_.CleanupSingleRowResult(pageseg_mode, page_res); |
435 | | |
436 | | // Remove empty words, as these mess up the result iterators. |
437 | 246k | for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { |
438 | 230k | const WERD_RES *word = page_res_it.word(); |
439 | 230k | const POLY_BLOCK *pb = page_res_it.block()->block != nullptr |
440 | 230k | ? page_res_it.block()->block->pdblk.poly_block() |
441 | 230k | : nullptr; |
442 | 230k | if (word->best_choice == nullptr || word->best_choice->empty() || |
443 | 230k | (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) { |
444 | 1.87k | page_res_it.DeleteCurrentWord(); |
445 | 1.87k | } |
446 | 230k | } |
447 | | |
448 | 15.6k | if (monitor != nullptr) { |
449 | 0 | monitor->progress = 100; |
450 | 0 | } |
451 | 15.6k | return true; |
452 | 15.6k | } |
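From the caller's side, everything above runs behind the public API: GetUTF8Text()/Recognize() end up in recog_all_words(), and SetRectangle() restricts recognition to a sub-region of the image. A hedged usage sketch (standard TessBaseAPI and Leptonica calls; file name and coordinates are hypothetical):

    #include <tesseract/baseapi.h>
    #include <leptonica/allheaders.h>
    #include <cstdio>

    int main() {
      tesseract::TessBaseAPI api;
      if (api.Init(nullptr, "eng") != 0) {  // load eng.traineddata
        return 1;
      }
      Pix *image = pixRead("page.png");     // hypothetical input
      if (image == nullptr) {
        return 1;
      }
      api.SetImage(image);
      api.SetRectangle(30, 86, 590, 100);   // left, top, width, height
      char *text = api.GetUTF8Text();       // triggers the recognition passes
      std::printf("%s", text);
      delete[] text;
      api.End();
      pixDestroy(&image);
      return 0;
    }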
453 | | |
454 | | #ifndef DISABLED_LEGACY_ENGINE |
455 | | |
456 | 0 | void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { |
457 | 0 | PAGE_RES_IT word_it(page_res); |
458 | |
459 | 0 | WERD_RES *w_prev = nullptr; |
460 | 0 | WERD_RES *w = word_it.word(); |
461 | 0 | while (true) { |
462 | 0 | w_prev = w; |
463 | 0 | while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) { |
464 | | // advance word_it, skipping over parts of combos |
465 | 0 | } |
466 | 0 | if (!word_it.word()) { |
467 | 0 | break; |
468 | 0 | } |
469 | 0 | w = word_it.word(); |
470 | 0 | if (!w || !w_prev || w->uch_set != w_prev->uch_set) { |
471 | 0 | continue; |
472 | 0 | } |
473 | 0 | if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) { |
474 | 0 | if (tessedit_bigram_debug) { |
475 | 0 | tprintf("Skipping because one of the words is W_REP_CHAR\n"); |
476 | 0 | } |
477 | 0 | continue; |
478 | 0 | } |
479 | | // Two words sharing the same language model, excellent! |
480 | 0 | std::vector<WERD_CHOICE *> overrides_word1; |
481 | 0 | std::vector<WERD_CHOICE *> overrides_word2; |
482 | |
483 | 0 | const auto orig_w1_str = w_prev->best_choice->unichar_string(); |
484 | 0 | const auto orig_w2_str = w->best_choice->unichar_string(); |
485 | 0 | WERD_CHOICE prev_best(w->uch_set); |
486 | 0 | { |
487 | 0 | int w1start, w1end; |
488 | 0 | w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end); |
489 | 0 | prev_best = w_prev->best_choice->shallow_copy(w1start, w1end); |
490 | 0 | } |
491 | 0 | WERD_CHOICE this_best(w->uch_set); |
492 | 0 | { |
493 | 0 | int w2start, w2end; |
494 | 0 | w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end); |
495 | 0 | this_best = w->best_choice->shallow_copy(w2start, w2end); |
496 | 0 | } |
497 | |
498 | 0 | if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) { |
499 | 0 | if (tessedit_bigram_debug) { |
500 | 0 | tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(), |
501 | 0 | orig_w2_str.c_str()); |
502 | 0 | } |
503 | 0 | continue; |
504 | 0 | } |
505 | 0 | if (tessedit_bigram_debug > 2) { |
506 | 0 | tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str()); |
507 | 0 | } |
508 | 0 | if (tessedit_bigram_debug > 1) { |
509 | 0 | if (!w_prev->best_choices.singleton()) { |
510 | 0 | w_prev->PrintBestChoices(); |
511 | 0 | } |
512 | 0 | if (!w->best_choices.singleton()) { |
513 | 0 | w->PrintBestChoices(); |
514 | 0 | } |
515 | 0 | } |
516 | 0 | float best_rating = 0.0; |
517 | 0 | int best_idx = 0; |
518 | 0 | WERD_CHOICE_IT prev_it(&w_prev->best_choices); |
519 | 0 | for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) { |
520 | 0 | WERD_CHOICE *p1 = prev_it.data(); |
521 | 0 | WERD_CHOICE strip1(w->uch_set); |
522 | 0 | { |
523 | 0 | int p1start, p1end; |
524 | 0 | p1->GetNonSuperscriptSpan(&p1start, &p1end); |
525 | 0 | strip1 = p1->shallow_copy(p1start, p1end); |
526 | 0 | } |
527 | 0 | WERD_CHOICE_IT w_it(&w->best_choices); |
528 | 0 | for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { |
529 | 0 | WERD_CHOICE *p2 = w_it.data(); |
530 | 0 | WERD_CHOICE strip2(w->uch_set); |
531 | 0 | { |
532 | 0 | int p2start, p2end; |
533 | 0 | p2->GetNonSuperscriptSpan(&p2start, &p2end); |
534 | 0 | strip2 = p2->shallow_copy(p2start, p2end); |
535 | 0 | } |
536 | 0 | if (w->tesseract->getDict().valid_bigram(strip1, strip2)) { |
537 | 0 | overrides_word1.push_back(p1); |
538 | 0 | overrides_word2.push_back(p2); |
539 | 0 | if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) { |
540 | 0 | best_rating = p1->rating() + p2->rating(); |
541 | 0 | best_idx = overrides_word1.size() - 1; |
542 | 0 | } |
543 | 0 | } |
544 | 0 | } |
545 | 0 | } |
546 | 0 | if (!overrides_word1.empty()) { |
547 | | // Excellent, we have some bigram matches. |
548 | 0 | if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) && |
549 | 0 | EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) { |
550 | 0 | if (tessedit_bigram_debug > 1) { |
551 | 0 | tprintf( |
552 | 0 | "Top choice \"%s %s\" verified (sans case) by bigram " |
553 | 0 | "model.\n", |
554 | 0 | orig_w1_str.c_str(), orig_w2_str.c_str()); |
555 | 0 | } |
556 | 0 | continue; |
557 | 0 | } |
558 | 0 | const auto new_w1_str = overrides_word1[best_idx]->unichar_string(); |
559 | 0 | const auto new_w2_str = overrides_word2[best_idx]->unichar_string(); |
560 | 0 | if (new_w1_str != orig_w1_str) { |
561 | 0 | w_prev->ReplaceBestChoice(overrides_word1[best_idx]); |
562 | 0 | } |
563 | 0 | if (new_w2_str != orig_w2_str) { |
564 | 0 | w->ReplaceBestChoice(overrides_word2[best_idx]); |
565 | 0 | } |
566 | 0 | if (tessedit_bigram_debug > 0) { |
567 | 0 | std::string choices_description; |
568 | 0 | int num_bigram_choices = overrides_word1.size() * overrides_word2.size(); |
569 | 0 | if (num_bigram_choices == 1) { |
570 | 0 | choices_description = "This was the unique bigram choice."; |
571 | 0 | } else { |
572 | 0 | if (tessedit_bigram_debug > 1) { |
573 | 0 | std::string bigrams_list; |
574 | 0 | const int kMaxChoicesToPrint = 20; |
575 | 0 | for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) { |
576 | 0 | if (i > 0) { |
577 | 0 | bigrams_list += ", "; |
578 | 0 | } |
579 | 0 | WERD_CHOICE *p1 = overrides_word1[i]; |
580 | 0 | WERD_CHOICE *p2 = overrides_word2[i]; |
581 | 0 | bigrams_list += p1->unichar_string() + " " + p2->unichar_string(); |
582 | 0 | } |
583 | 0 | choices_description = "There were many choices: {"; |
584 | 0 | choices_description += bigrams_list; |
585 | 0 | choices_description += "}"; |
586 | 0 | } else { |
587 | 0 | choices_description += "There were " + std::to_string(num_bigram_choices); |
588 | 0 | choices_description += " compatible bigrams."; |
589 | 0 | } |
590 | 0 | } |
591 | 0 | tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(), |
592 | 0 | orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(), |
593 | 0 | choices_description.c_str()); |
594 | 0 | } |
595 | 0 | } |
596 | 0 | } |
597 | 0 | } |
598 | | |
599 | | void Tesseract::rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, |
600 | 0 | const TBOX *target_word_box, const char *word_config) { |
601 | 0 | PAGE_RES_IT page_res_it(page_res); |
602 | | // ****************** Pass 5 ******************* |
603 | | // Gather statistics on rejects. |
604 | 0 | int word_index = 0; |
605 | 0 | while (!tessedit_test_adaption && page_res_it.word() != nullptr) { |
606 | 0 | WERD_RES *word = page_res_it.word(); |
607 | 0 | word_index++; |
608 | 0 | if (monitor != nullptr) { |
609 | 0 | monitor->ocr_alive = true; |
610 | 0 | monitor->progress = 95 + 5 * word_index / stats_.word_count; |
611 | 0 | } |
612 | 0 | if (word->rebuild_word == nullptr) { |
613 | | // Word was not processed by tesseract. |
614 | 0 | page_res_it.forward(); |
615 | 0 | continue; |
616 | 0 | } |
617 | 0 | check_debug_pt(word, 70); |
618 | | |
619 | | // changed by jetsoft |
620 | | // specific to its needs to extract one word when need |
621 | 0 | if (target_word_box && |
622 | 0 | !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) { |
623 | 0 | page_res_it.forward(); |
624 | 0 | continue; |
625 | 0 | } |
626 | | // end jetsoft |
627 | | |
628 | 0 | page_res_it.rej_stat_word(); |
629 | 0 | const int chars_in_word = word->reject_map.length(); |
630 | 0 | const int rejects_in_word = word->reject_map.reject_count(); |
631 | |
632 | 0 | const int blob_quality = word_blob_quality(word); |
633 | 0 | stats_.doc_blob_quality += blob_quality; |
634 | 0 | const int outline_errs = word_outline_errs(word); |
635 | 0 | stats_.doc_outline_errs += outline_errs; |
636 | 0 | int16_t all_char_quality; |
637 | 0 | int16_t accepted_all_char_quality; |
638 | 0 | word_char_quality(word, &all_char_quality, &accepted_all_char_quality); |
639 | 0 | stats_.doc_char_quality += all_char_quality; |
640 | 0 | const uint8_t permuter_type = word->best_choice->permuter(); |
641 | 0 | if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) || |
642 | 0 | (permuter_type == USER_DAWG_PERM)) { |
643 | 0 | stats_.good_char_count += chars_in_word - rejects_in_word; |
644 | 0 | stats_.doc_good_char_quality += accepted_all_char_quality; |
645 | 0 | } |
646 | 0 | check_debug_pt(word, 80); |
647 | 0 | if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) { |
648 | 0 | word->reject_map.rej_word_bad_quality(); |
649 | 0 | } |
650 | 0 | check_debug_pt(word, 90); |
651 | 0 | page_res_it.forward(); |
652 | 0 | } |
653 | |
654 | 0 | if (tessedit_debug_quality_metrics) { |
655 | 0 | tprintf( |
656 | 0 | "QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f" |
657 | 0 | " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n", |
658 | 0 | page_res->char_count, page_res->rej_count, |
659 | 0 | page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality, |
660 | 0 | stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs, |
661 | 0 | stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality, |
662 | 0 | stats_.doc_char_quality / static_cast<float>(page_res->char_count), |
663 | 0 | stats_.doc_good_char_quality, |
664 | 0 | (stats_.good_char_count > 0) |
665 | 0 | ? (stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count)) |
666 | 0 | : 0.0); |
667 | 0 | } |
668 | 0 | bool good_quality_doc = |
669 | 0 | ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) && |
670 | 0 | (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) && |
671 | 0 | (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) && |
672 | 0 | (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc); |
673 | | |
674 | | // ****************** Pass 6 ******************* |
675 | | // Do whole document or whole block rejection pass |
676 | 0 | if (!tessedit_test_adaption) { |
677 | 0 | quality_based_rejection(page_res_it, good_quality_doc); |
678 | 0 | } |
679 | 0 | } |
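The good_quality_doc decision computed before Pass 6 reduces to four per-character averages, each compared against one of the quality_* parameters. Restated as a standalone predicate (struct and names here are illustrative, not part of the class):

    struct PageQualityStats {
      int char_count;    // page_res->char_count
      int rej_count;     // page_res->rej_count
      int blob_quality;  // stats_.doc_blob_quality
      int outline_errs;  // stats_.doc_outline_errs
      int char_quality;  // stats_.doc_char_quality
    };

    static bool GoodQualityDoc(const PageQualityStats &q, float quality_rej_pc,
                               float quality_blob_pc, float quality_outline_pc,
                               float quality_char_pc) {
      const float chars = static_cast<float>(q.char_count);
      return q.rej_count / chars <= quality_rej_pc &&
             q.blob_quality / chars >= quality_blob_pc &&
             q.outline_errs / chars <= quality_outline_pc &&
             q.char_quality / chars >= quality_char_pc;
    }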
680 | | |
681 | | #endif // ndef DISABLED_LEGACY_ENGINE |
682 | | |
683 | 0 | void Tesseract::blamer_pass(PAGE_RES *page_res) { |
684 | 0 | if (!wordrec_run_blamer) { |
685 | 0 | return; |
686 | 0 | } |
687 | 0 | PAGE_RES_IT page_res_it(page_res); |
688 | 0 | for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { |
689 | 0 | WERD_RES *word = page_res_it.word(); |
690 | 0 | BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word); |
691 | 0 | page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++; |
692 | 0 | } |
693 | 0 | tprintf("Blame reasons:\n"); |
694 | 0 | for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) { |
695 | 0 | tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(static_cast<IncorrectResultReason>(bl)), |
696 | 0 | page_res->blame_reasons[bl]); |
697 | 0 | } |
698 | 0 | if (page_res->misadaption_log.size() > 0) { |
699 | 0 | tprintf("Misadaption log:\n"); |
700 | 0 | for (auto &log : page_res->misadaption_log) { |
701 | 0 | tprintf("%s\n", log.c_str()); |
702 | 0 | } |
703 | 0 | } |
704 | 0 | } |
705 | | |
706 | | // Sets script positions and detects smallcaps on all output words. |
707 | 0 | void Tesseract::script_pos_pass(PAGE_RES *page_res) { |
708 | 0 | PAGE_RES_IT page_res_it(page_res); |
709 | 0 | for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { |
710 | 0 | WERD_RES *word = page_res_it.word(); |
711 | 0 | if (word->word->flag(W_REP_CHAR)) { |
712 | 0 | page_res_it.forward(); |
713 | 0 | continue; |
714 | 0 | } |
715 | 0 | const float x_height = page_res_it.block()->block->x_height(); |
716 | 0 | float word_x_height = word->x_height; |
717 | 0 | if (word_x_height < word->best_choice->min_x_height() || |
718 | 0 | word_x_height > word->best_choice->max_x_height()) { |
719 | 0 | word_x_height = |
720 | 0 | (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f; |
721 | 0 | } |
722 | | // Test for small caps. Word capheight must be close to block xheight, |
723 | | // and word must contain no lower case letters, and at least one upper case. |
724 | 0 | const double small_cap_xheight = x_height * kXHeightCapRatio; |
725 | 0 | const double small_cap_delta = (x_height - small_cap_xheight) / 2.0; |
726 | 0 | if (word->uch_set->script_has_xheight() && |
727 | 0 | small_cap_xheight - small_cap_delta <= word_x_height && |
728 | 0 | word_x_height <= small_cap_xheight + small_cap_delta) { |
729 | | // Scan for upper/lower. |
730 | 0 | int num_upper = 0; |
731 | 0 | int num_lower = 0; |
732 | 0 | for (unsigned i = 0; i < word->best_choice->length(); ++i) { |
733 | 0 | if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) { |
734 | 0 | ++num_upper; |
735 | 0 | } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) { |
736 | 0 | ++num_lower; |
737 | 0 | } |
738 | 0 | } |
739 | 0 | if (num_upper > 0 && num_lower == 0) { |
740 | 0 | word->small_caps = true; |
741 | 0 | } |
742 | 0 | } |
743 | 0 | word->SetScriptPositions(); |
744 | 0 | } |
745 | 0 | } |
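The small-caps test above amounts to a single predicate: the word x-height must fall within half the gap around the expected small-cap height (block x-height scaled by kXHeightCapRatio), and the word must contain at least one upper-case letter and no lower-case ones (the code additionally requires the script to have an x-height). A detached sketch, with the ratio passed in as a parameter:

    // cap_ratio stands in for kXHeightCapRatio; num_upper/num_lower are the
    // counts scanned from the best choice, as in the loop above.
    static bool LooksLikeSmallCaps(float word_x_height, float block_x_height,
                                   double cap_ratio, int num_upper, int num_lower) {
      const double small_cap_xheight = block_x_height * cap_ratio;
      const double small_cap_delta = (block_x_height - small_cap_xheight) / 2.0;
      return small_cap_xheight - small_cap_delta <= word_x_height &&
             word_x_height <= small_cap_xheight + small_cap_delta &&
             num_upper > 0 && num_lower == 0;
    }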
746 | | |
747 | | // Helper finds the gap between the index word and the next. |
748 | 584k | static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) { |
749 | 584k | *right = -INT32_MAX; |
750 | 584k | *next_left = INT32_MAX; |
751 | 584k | if (index < words.size()) { |
752 | 292k | *right = words[index]->word->bounding_box().right(); |
753 | 292k | if (index + 1 < words.size()) { |
754 | 16.3k | *next_left = words[index + 1]->word->bounding_box().left(); |
755 | 16.3k | } |
756 | 292k | } |
757 | 584k | } |
758 | | |
759 | | // Factored helper computes the rating, certainty, badness and validity of |
760 | | // the permuter of the words in [first_index, end_index). |
761 | | static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index, |
762 | 584k | float *rating, float *certainty, bool *bad, bool *valid_permuter) { |
763 | 584k | if (end_index <= first_index) { |
764 | 292k | *bad = true; |
765 | 292k | *valid_permuter = false; |
766 | 292k | } |
767 | 876k | for (unsigned index = first_index; index < end_index && index < words.size(); ++index) { |
768 | 292k | WERD_CHOICE *choice = words[index]->best_choice; |
769 | 292k | if (choice == nullptr) { |
770 | 0 | *bad = true; |
771 | 292k | } else { |
772 | 292k | *rating += choice->rating(); |
773 | 292k | *certainty = std::min(*certainty, choice->certainty()); |
774 | 292k | if (!Dict::valid_word_permuter(choice->permuter(), false)) { |
775 | 106k | *valid_permuter = false; |
776 | 106k | } |
777 | 292k | } |
778 | 292k | } |
779 | 584k | } |
780 | | |
781 | | // Helper chooses the best combination of words, transferring good ones from |
782 | | // new_words to best_words. To win, a new word must have (better rating and |
783 | | // certainty) or (better permuter status and rating within rating ratio and |
784 | | // certainty within certainty margin) than current best. |
785 | | // All the new_words are consumed (moved to best_words or deleted.) |
786 | | // The return value is the number of new_words used minus the number of |
787 | | // best_words that remain in the output. |
788 | | static int SelectBestWords(double rating_ratio, double certainty_margin, bool debug, |
789 | | PointerVector<WERD_RES> *new_words, |
790 | 276k | PointerVector<WERD_RES> *best_words) { |
791 | | // Process the smallest groups of words that have an overlapping word |
792 | | // boundary at the end. |
793 | 276k | std::vector<WERD_RES *> out_words; |
794 | | // Index into each word vector (best, new). |
795 | 276k | unsigned b = 0, n = 0; |
796 | 276k | int num_best = 0, num_new = 0; |
797 | 568k | while (b < best_words->size() || n < new_words->size()) { |
798 | | // Start of the current run in each. |
799 | 292k | auto start_b = b, start_n = n; |
800 | 292k | while (b < best_words->size() || n < new_words->size()) { |
801 | 292k | int b_right = -INT32_MAX; |
802 | 292k | int next_b_left = INT32_MAX; |
803 | 292k | WordGap(*best_words, b, &b_right, &next_b_left); |
804 | 292k | int n_right = -INT32_MAX; |
805 | 292k | int next_n_left = INT32_MAX; |
806 | 292k | WordGap(*new_words, n, &n_right, &next_n_left); |
807 | 292k | if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) { |
808 | | // The word breaks overlap. [start_b,b] and [start_n, n] match. |
809 | 292k | break; |
810 | 292k | } |
811 | | // Keep searching for the matching word break. |
812 | 162 | if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) { |
813 | 0 | ++b; |
814 | 162 | } else { |
815 | 162 | ++n; |
816 | 162 | } |
817 | 162 | } |
818 | | // Rating of the current run in each. |
819 | 292k | float b_rating = 0.0f, n_rating = 0.0f; |
820 | | // Certainty of the current run in each. |
821 | 292k | float b_certainty = 0.0f, n_certainty = 0.0f; |
822 | | // True if any word is missing its best choice. |
823 | 292k | bool b_bad = false, n_bad = false; |
824 | | // True if all words have a valid permuter. |
825 | 292k | bool b_valid_permuter = true, n_valid_permuter = true; |
826 | 292k | const int end_b = b < best_words->size() ? b + 1 : b; |
827 | 292k | const int end_n = n < new_words->size() ? n + 1 : n; |
828 | 292k | EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad, |
829 | 292k | &b_valid_permuter); |
830 | 292k | EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad, |
831 | 292k | &n_valid_permuter); |
832 | 292k | bool new_better = false; |
833 | 292k | if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) || |
834 | 292k | (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio && |
835 | 292k | n_certainty > b_certainty - certainty_margin))) { |
836 | | // New is better. |
837 | 584k | for (int i = start_n; i < end_n; ++i) { |
838 | 292k | out_words.push_back((*new_words)[i]); |
839 | 292k | (*new_words)[i] = nullptr; |
840 | 292k | ++num_new; |
841 | 292k | } |
842 | 292k | new_better = true; |
843 | 292k | } else if (!b_bad) { |
844 | | // Current best is better. |
845 | 0 | for (int i = start_b; i < end_b; ++i) { |
846 | 0 | out_words.push_back((*best_words)[i]); |
847 | 0 | (*best_words)[i] = nullptr; |
848 | 0 | ++num_best; |
849 | 0 | } |
850 | 0 | } |
851 | 292k | if (debug) { |
852 | 0 | tprintf( |
853 | 0 | "%d new words %s than %d old words: r: %g v %g c: %g v %g" |
854 | 0 | " valid dict: %d v %d\n", |
855 | 0 | end_n - start_n, new_better ? "better" : "worse", end_b - start_b, n_rating, b_rating, |
856 | 0 | n_certainty, b_certainty, n_valid_permuter, b_valid_permuter); |
857 | 0 | } |
858 | | // Move on to the next group. |
859 | 292k | b = end_b; |
860 | 292k | n = end_n; |
861 | 292k | } |
862 | | // Transfer from out_words to best_words. |
863 | 276k | best_words->clear(); |
864 | 292k | for (auto &out_word : out_words) { |
865 | 292k | best_words->push_back(out_word); |
866 | 292k | } |
867 | 276k | return num_new - num_best; |
868 | 276k | } |
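The decision at the heart of the loop above - whether a run of new words displaces the matching run of current best words - is the compound test on ratings, certainties and permuter validity described in the header comment. Isolated as a predicate (parameter names mirror the locals above; lower rating is better):

    static bool NewSpanWins(bool n_bad, bool b_bad, float n_rating, float b_rating,
                            float n_certainty, float b_certainty,
                            bool n_valid_permuter, bool b_valid_permuter,
                            double rating_ratio, double certainty_margin) {
      if (n_bad) return false;  // a new word is missing its best choice
      if (b_bad) return true;   // an old word is missing its best choice
      // Strictly better on both certainty and rating wins outright.
      if (n_certainty > b_certainty && n_rating < b_rating) return true;
      // Otherwise a dictionary (valid-permuter) result may displace a
      // non-dictionary one if its rating and certainty are close enough.
      return !b_valid_permuter && n_valid_permuter &&
             n_rating < b_rating * rating_ratio &&
             n_certainty > b_certainty - certainty_margin;
    }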
869 | | |
870 | | // Helper to recognize the word using the given (language-specific) tesseract. |
871 | | // Returns positive if this recognizer found more new best words than the |
872 | | // number kept from best_words. |
873 | | int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, |
874 | 276k | WERD_RES **in_word, PointerVector<WERD_RES> *best_words) { |
875 | 276k | if (debug) { |
876 | 0 | tprintf("Trying word using lang %s, oem %d\n", lang.c_str(), |
877 | 0 | static_cast<int>(tessedit_ocr_engine_mode)); |
878 | 0 | } |
879 | | // Run the recognizer on the word. |
880 | 276k | PointerVector<WERD_RES> new_words; |
881 | 276k | (this->*recognizer)(word_data, in_word, &new_words); |
882 | 276k | if (new_words.empty()) { |
883 | | // Transfer input word to new_words, as the classifier must have put |
884 | | // the result back in the input. |
885 | 128k | new_words.push_back(*in_word); |
886 | 128k | *in_word = nullptr; |
887 | 128k | } |
888 | 276k | if (debug) { |
889 | 0 | for (unsigned i = 0; i < new_words.size(); ++i) { |
890 | 0 | new_words[i]->DebugTopChoice("Lang result"); |
891 | 0 | } |
892 | 0 | } |
893 | | // Initial version is a bit of a hack based on better certainty and rating |
894 | | // or a dictionary vs non-dictionary word. |
895 | 276k | return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug, |
896 | 276k | &new_words, best_words); |
897 | 276k | } |
898 | | |
899 | | // Helper returns true if all the words are acceptable. |
900 | 536k | static bool WordsAcceptable(const PointerVector<WERD_RES> &words) { |
901 | 552k | for (unsigned w = 0; w < words.size(); ++w) { |
902 | 536k | if (words[w]->tess_failed || !words[w]->tess_accepted) { |
903 | 521k | return false; |
904 | 521k | } |
905 | 536k | } |
906 | 15.4k | return true; |
907 | 536k | } |
908 | | |
909 | | #ifndef DISABLED_LEGACY_ENGINE |
910 | | |
911 | | // Moves good-looking "noise"/diacritics from the reject list to the main |
912 | | // blob list on the current word. Returns true if anything was done, and |
913 | | // sets make_next_word_fuzzy if blob(s) were added to the end of the word. |
914 | 0 | bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) { |
915 | 0 | *make_next_word_fuzzy = false; |
916 | 0 | WERD *real_word = pr_it->word()->word; |
917 | 0 | if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() || |
918 | 0 | real_word->rej_cblob_list()->length() > noise_maxperword) { |
919 | 0 | return false; |
920 | 0 | } |
921 | 0 | real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle); |
922 | | // Get the noise outlines into a vector with matching bool map. |
923 | 0 | std::vector<C_OUTLINE *> outlines; |
924 | 0 | real_word->GetNoiseOutlines(&outlines); |
925 | 0 | std::vector<bool> word_wanted; |
926 | 0 | std::vector<bool> overlapped_any_blob; |
927 | 0 | std::vector<C_BLOB *> target_blobs; |
928 | 0 | AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted, |
929 | 0 | &overlapped_any_blob, &target_blobs); |
930 | | // Filter the outlines that overlapped any blob and put them into the word |
931 | | // now. This simplifies the remaining task and also makes it more accurate |
932 | | // as it has more completed blobs to work on. |
933 | 0 | std::vector<bool> wanted; |
934 | 0 | std::vector<C_BLOB *> wanted_blobs; |
935 | 0 | std::vector<C_OUTLINE *> wanted_outlines; |
936 | 0 | int num_overlapped = 0; |
937 | 0 | int num_overlapped_used = 0; |
938 | 0 | for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) { |
939 | 0 | if (overlapped_any_blob[i]) { |
940 | 0 | ++num_overlapped; |
941 | 0 | if (word_wanted[i]) { |
942 | 0 | ++num_overlapped_used; |
943 | 0 | } |
944 | 0 | wanted.push_back(word_wanted[i]); |
945 | 0 | wanted_blobs.push_back(target_blobs[i]); |
946 | 0 | wanted_outlines.push_back(outlines[i]); |
947 | 0 | outlines[i] = nullptr; |
948 | 0 | } |
949 | 0 | } |
950 | 0 | real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr); |
951 | 0 | AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs); |
952 | 0 | int non_overlapped = 0; |
953 | 0 | int non_overlapped_used = 0; |
954 | 0 | for (unsigned i = 0; i < word_wanted.size(); ++i) { |
955 | 0 | if (word_wanted[i]) { |
956 | 0 | ++non_overlapped_used; |
957 | 0 | } |
958 | 0 | if (outlines[i] != nullptr) { |
959 | 0 | ++non_overlapped_used; |
960 | 0 | } |
961 | 0 | } |
962 | 0 | if (debug_noise_removal) { |
963 | 0 | tprintf("Used %d/%d overlapped %d/%d non-overlapped diacritics on word:", num_overlapped_used, |
964 | 0 | num_overlapped, non_overlapped_used, non_overlapped); |
965 | 0 | real_word->bounding_box().print(); |
966 | 0 | } |
967 | | // Now we have decided which outlines we want, put them into the real_word. |
968 | 0 | if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) { |
969 | 0 | pr_it->MakeCurrentWordFuzzy(); |
970 | 0 | } |
971 | | // TODO(rays) Parts of combos have a deep copy of the real word, and need |
972 | | // to have their noise outlines moved/assigned in the same way!! |
973 | 0 | return num_overlapped_used != 0 || non_overlapped_used != 0; |
974 | 0 | } |
975 | | |
976 | | // Attempts to put noise/diacritic outlines into the blobs that they overlap. |
977 | | // Input: a set of noisy outlines that probably belong to the real_word. |
978 | | // Output: word_wanted indicates which outlines are to be assigned to a blob, |
979 | | // target_blobs indicates which to assign to, and overlapped_any_blob is |
980 | | // true for all outlines that overlapped a blob. |
981 | | void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines, |
982 | | int pass, WERD *real_word, PAGE_RES_IT *pr_it, |
983 | | std::vector<bool> *word_wanted, |
984 | | std::vector<bool> *overlapped_any_blob, |
985 | 0 | std::vector<C_BLOB *> *target_blobs) { |
986 | 0 | std::vector<bool> blob_wanted; |
987 | 0 | word_wanted->clear(); |
988 | 0 | word_wanted->resize(outlines.size()); |
989 | 0 | overlapped_any_blob->clear(); |
990 | 0 | overlapped_any_blob->resize(outlines.size()); |
991 | 0 | target_blobs->clear(); |
992 | 0 | target_blobs->resize(outlines.size()); |
993 | | // For each real blob, find the outlines that seriously overlap it. |
994 | | // A single blob could be several merged characters, so there can be quite |
995 | | // a few outlines overlapping, and the full engine needs to be used to chop |
996 | | // and join to get a sensible result. |
997 | 0 | C_BLOB_IT blob_it(real_word->cblob_list()); |
998 | 0 | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
999 | 0 | C_BLOB *blob = blob_it.data(); |
1000 | 0 | const TBOX blob_box = blob->bounding_box(); |
1001 | 0 | blob_wanted.clear(); |
1002 | 0 | blob_wanted.resize(outlines.size()); |
1003 | 0 | int num_blob_outlines = 0; |
1004 | 0 | for (unsigned i = 0; i < outlines.size(); ++i) { |
1005 | 0 | if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) { |
1006 | 0 | blob_wanted[i] = true; |
1007 | 0 | (*overlapped_any_blob)[i] = true; |
1008 | 0 | ++num_blob_outlines; |
1009 | 0 | } |
1010 | 0 | } |
1011 | 0 | if (debug_noise_removal) { |
1012 | 0 | tprintf("%d noise outlines overlap blob at:", num_blob_outlines); |
1013 | 0 | blob_box.print(); |
1014 | 0 | } |
1015 | | // If any outlines overlap the blob, and not too many, classify the blob |
1016 | | // (using the full engine, languages and all), and choose the maximal |
1017 | | // combination of outlines that doesn't hurt the end-result classification |
1018 | | // by too much. Mark them as wanted. |
1019 | 0 | if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) { |
1020 | 0 | if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines, |
1021 | 0 | num_blob_outlines, &blob_wanted)) { |
1022 | 0 | for (unsigned i = 0; i < blob_wanted.size(); ++i) { |
1023 | 0 | if (blob_wanted[i]) { |
1024 | | // Claim the outline and record where it is going. |
1025 | 0 | (*word_wanted)[i] = true; |
1026 | 0 | (*target_blobs)[i] = blob; |
1027 | 0 | } |
1028 | 0 | } |
1029 | 0 | } |
1030 | 0 | } |
1031 | 0 | } |
1032 | 0 | } |
1033 | | |
1034 | | // Attempts to assign non-overlapping outlines to their nearest blobs or |
1035 | | // make new blobs out of them. |
1036 | | void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass, |
1037 | | WERD *real_word, PAGE_RES_IT *pr_it, |
1038 | | std::vector<bool> *word_wanted, |
1039 | 0 | std::vector<C_BLOB *> *target_blobs) { |
1040 | 0 | std::vector<bool> blob_wanted; |
1041 | 0 | word_wanted->clear(); |
1042 | 0 | word_wanted->resize(outlines.size()); |
1043 | 0 | target_blobs->clear(); |
1044 | 0 | target_blobs->resize(outlines.size()); |
1045 | | // Check for outlines that need to be turned into stand-alone blobs. |
1046 | 0 | for (unsigned i = 0; i < outlines.size(); ++i) { |
1047 | 0 | if (outlines[i] == nullptr) { |
1048 | 0 | continue; |
1049 | 0 | } |
1050 | | // Get a set of adjacent outlines that don't overlap any existing blob. |
1051 | 0 | blob_wanted.clear(); |
1052 | 0 | blob_wanted.resize(outlines.size()); |
1053 | 0 | int num_blob_outlines = 0; |
1054 | 0 | TBOX total_ol_box(outlines[i]->bounding_box()); |
1055 | 0 | while (i < outlines.size() && outlines[i] != nullptr) { |
1056 | 0 | blob_wanted[i] = true; |
1057 | 0 | total_ol_box += outlines[i]->bounding_box(); |
1058 | 0 | ++i; |
1059 | 0 | ++num_blob_outlines; |
1060 | 0 | } |
1061 | | // Find the insertion point. |
1062 | 0 | C_BLOB_IT blob_it(real_word->cblob_list()); |
1063 | 0 | while (!blob_it.at_last() && |
1064 | 0 | blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) { |
1065 | 0 | blob_it.forward(); |
1066 | 0 | } |
1067 | | // Choose which combination of them we actually want and where to put |
1068 | | // them. |
1069 | 0 | if (debug_noise_removal) { |
1070 | 0 | tprintf("Num blobless outlines = %d\n", num_blob_outlines); |
1071 | 0 | } |
1072 | 0 | C_BLOB *left_blob = blob_it.data(); |
1073 | 0 | TBOX left_box = left_blob->bounding_box(); |
1074 | 0 | C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1); |
1075 | 0 | if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr || |
1076 | 0 | !right_blob->bounding_box().x_overlap(total_ol_box)) && |
1077 | 0 | SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines, |
1078 | 0 | num_blob_outlines, &blob_wanted)) { |
1079 | 0 | if (debug_noise_removal) { |
1080 | 0 | tprintf("Added to left blob\n"); |
1081 | 0 | } |
1082 | 0 | for (unsigned j = 0; j < blob_wanted.size(); ++j) { |
1083 | 0 | if (blob_wanted[j]) { |
1084 | 0 | (*word_wanted)[j] = true; |
1085 | 0 | (*target_blobs)[j] = left_blob; |
1086 | 0 | } |
1087 | 0 | } |
1088 | 0 | } else if (right_blob != nullptr && |
1089 | 0 | (!left_box.x_overlap(total_ol_box) || |
1090 | 0 | right_blob->bounding_box().x_overlap(total_ol_box)) && |
1091 | 0 | SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines, |
1092 | 0 | num_blob_outlines, &blob_wanted)) { |
1093 | 0 | if (debug_noise_removal) { |
1094 | 0 | tprintf("Added to right blob\n"); |
1095 | 0 | } |
1096 | 0 | for (unsigned j = 0; j < blob_wanted.size(); ++j) { |
1097 | 0 | if (blob_wanted[j]) { |
1098 | 0 | (*word_wanted)[j] = true; |
1099 | 0 | (*target_blobs)[j] = right_blob; |
1100 | 0 | } |
1101 | 0 | } |
1102 | 0 | } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines, |
1103 | 0 | num_blob_outlines, &blob_wanted)) { |
1104 | 0 | if (debug_noise_removal) { |
1105 | 0 | tprintf("Fitted between blobs\n"); |
1106 | 0 | } |
1107 | 0 | for (unsigned j = 0; j < blob_wanted.size(); ++j) { |
1108 | 0 | if (blob_wanted[j]) { |
1109 | 0 | (*word_wanted)[j] = true; |
1110 | 0 | (*target_blobs)[j] = nullptr; |
1111 | 0 | } |
1112 | 0 | } |
1113 | 0 | } |
1114 | 0 | } |
1115 | 0 | } |
1116 | | |
1117 | | // Starting with ok_outlines set to indicate which outlines overlap the blob, |
1118 | | // chooses the optimal set (approximately) and returns true if any outlines |
1119 | | // are desired, in which case ok_outlines indicates which ones. |
1120 | | bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, |
1121 | | C_BLOB *blob, |
1122 | | const std::vector<C_OUTLINE *> &outlines, |
1123 | 0 | int num_outlines, std::vector<bool> *ok_outlines) { |
1124 | 0 | std::string best_str; |
1125 | 0 | float target_cert = certainty_threshold; |
1126 | 0 | if (blob != nullptr) { |
1127 | 0 | float target_c2; |
1128 | 0 | target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2); |
1129 | 0 | if (debug_noise_removal) { |
1130 | 0 | tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert, |
1131 | 0 | target_c2); |
1132 | 0 | blob->bounding_box().print(); |
1133 | 0 | } |
1134 | 0 | target_cert -= (target_cert - certainty_threshold) * noise_cert_factor; |
1135 | 0 | } |
1136 | 0 | std::vector<bool> test_outlines = *ok_outlines; |
1137 | | // Start with all the outlines in. |
1138 | 0 | std::string all_str; |
1139 | 0 | std::vector<bool> best_outlines = *ok_outlines; |
1140 | 0 | float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str); |
1141 | 0 | if (debug_noise_removal) { |
1142 | 0 | TBOX ol_box; |
1143 | 0 | for (unsigned i = 0; i < test_outlines.size(); ++i) { |
1144 | 0 | if (test_outlines[i]) { |
1145 | 0 | ol_box += outlines[i]->bounding_box(); |
1146 | 0 | } |
1147 | 0 | } |
1148 | 0 | tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert, |
1149 | 0 | best_cert - target_cert); |
1150 | 0 | ol_box.print(); |
1151 | 0 | } |
1152 | | // Iteratively zero out the bit that improves the certainty the most, until |
1153 | | // we get past the threshold, have zero bits, or fail to improve. |
1154 | 0 | int best_index = 0; // To zero out. |
1155 | 0 | while (num_outlines > 1 && best_index >= 0 && |
1156 | 0 | (blob == nullptr || best_cert < target_cert || blob != nullptr)) { |
1157 | | // Find the best bit to zero out. |
1158 | 0 | best_index = -1; |
1159 | 0 | for (unsigned i = 0; i < outlines.size(); ++i) { |
1160 | 0 | if (test_outlines[i]) { |
1161 | 0 | test_outlines[i] = false; |
1162 | 0 | std::string str; |
1163 | 0 | float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str); |
1164 | 0 | if (debug_noise_removal) { |
1165 | 0 | TBOX ol_box; |
1166 | 0 | for (unsigned j = 0; j < outlines.size(); ++j) { |
1167 | 0 | if (test_outlines[j]) { |
1168 | 0 | ol_box += outlines[j]->bounding_box(); |
1169 | 0 | } |
1170 | 0 | tprintf("%c", test_outlines[j] ? 'T' : 'F'); |
1171 | 0 | } |
1172 | 0 | tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert, |
1173 | 0 | cert - target_cert); |
1174 | 0 | ol_box.print(); |
1175 | 0 | } |
1176 | 0 | if (cert > best_cert) { |
1177 | 0 | best_cert = cert; |
1178 | 0 | best_index = i; |
1179 | 0 | best_outlines = test_outlines; |
1180 | 0 | } |
1181 | 0 | test_outlines[i] = true; |
1182 | 0 | } |
1183 | 0 | } |
1184 | 0 | if (best_index >= 0) { |
1185 | 0 | test_outlines[best_index] = false; |
1186 | 0 | --num_outlines; |
1187 | 0 | } |
1188 | 0 | } |
1189 | 0 | if (best_cert >= target_cert) { |
1190 | | // Save the best combination. |
1191 | 0 | *ok_outlines = best_outlines; |
1192 | 0 | if (debug_noise_removal) { |
1193 | 0 | tprintf("%s noise combination ", blob ? "Adding" : "New"); |
1194 | 0 | for (auto &&best_outline : best_outlines) { |
1195 | 0 | tprintf("%c", best_outline ? 'T' : 'F'); |
1196 | 0 | } |
1197 | 0 | tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert); |
1198 | 0 | } |
1199 | 0 | return true; |
1200 | 0 | } |
1201 | | |
1202 | 0 | return false; |
1203 | 0 | } |
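The while loop above is a greedy subset search: start with every candidate outline included, then repeatedly drop whichever single outline most improves the classification certainty, stopping once the target certainty is reached (per the comment's intent), only one outline is left, or no drop helps. The same idea detached from the Tesseract classes, with Score() standing in for ClassifyBlobPlusOutlines():

    #include <functional>
    #include <vector>

    // kept[i] marks outline i as currently included; Score() returns the
    // certainty of classifying the blob plus the included outlines.
    std::vector<bool> GreedyPruneOutlines(
        std::vector<bool> kept, float target_cert,
        const std::function<float(const std::vector<bool> &)> &Score) {
      int num_kept = 0;
      for (bool k : kept) {
        num_kept += k ? 1 : 0;
      }
      float best_cert = Score(kept);
      int best_index = 0;  // sentinel so the loop is entered
      while (num_kept > 1 && best_index >= 0 && best_cert < target_cert) {
        best_index = -1;
        for (size_t i = 0; i < kept.size(); ++i) {
          if (!kept[i]) {
            continue;
          }
          kept[i] = false;  // try the set without outline i
          const float cert = Score(kept);
          kept[i] = true;
          if (cert > best_cert) {
            best_cert = cert;
            best_index = static_cast<int>(i);
          }
        }
        if (best_index >= 0) {
          kept[best_index] = false;  // commit the most helpful single drop
          --num_kept;
        }
      }
      return kept;
    }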
1204 | | |
1205 | | // Classifies the given blob plus the outlines flagged by ok_outlines, undoes |
1206 | | // the inclusion of the outlines, and returns the certainty of the raw choice. |
1207 | | float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines, |
1208 | | const std::vector<C_OUTLINE *> &outlines, int pass_n, |
1209 | 0 | PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) { |
1210 | 0 | C_OUTLINE_IT ol_it; |
1211 | 0 | C_OUTLINE *first_to_keep = nullptr; |
1212 | 0 | C_BLOB *local_blob = nullptr; |
1213 | 0 | if (blob != nullptr) { |
1214 | | // Add the required outlines to the blob. |
1215 | 0 | ol_it.set_to_list(blob->out_list()); |
1216 | 0 | first_to_keep = ol_it.data(); |
1217 | 0 | } |
1218 | 0 | for (unsigned i = 0; i < ok_outlines.size(); ++i) { |
1219 | 0 | if (ok_outlines[i]) { |
1220 | | // This outline is to be added. |
1221 | 0 | if (blob == nullptr) { |
1222 | 0 | local_blob = new C_BLOB(outlines[i]); |
1223 | 0 | blob = local_blob; |
1224 | 0 | ol_it.set_to_list(blob->out_list()); |
1225 | 0 | } else { |
1226 | 0 | ol_it.add_before_stay_put(outlines[i]); |
1227 | 0 | } |
1228 | 0 | } |
1229 | 0 | } |
1230 | 0 | float c2; |
1231 | 0 | float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2); |
1232 | 0 | ol_it.move_to_first(); |
1233 | 0 | if (first_to_keep == nullptr) { |
1234 | | // We created blob. Empty its outlines and delete it. |
1235 | 0 | for (; !ol_it.empty(); ol_it.forward()) { |
1236 | 0 | ol_it.extract(); |
1237 | 0 | } |
1238 | 0 | delete local_blob; |
1239 | 0 | cert = -c2; |
1240 | 0 | } else { |
1241 | | // Remove the outlines that we put in. |
1242 | 0 | for (; ol_it.data() != first_to_keep; ol_it.forward()) { |
1243 | 0 | ol_it.extract(); |
1244 | 0 | } |
1245 | 0 | } |
1246 | 0 | return cert; |
1247 | 0 | } |
1248 | | |
1249 | | // Classifies the given blob (part of word_data->word->word) as an individual |
1250 | | // word, using languages, chopper etc, returning only the certainty of the |
1251 | | // best raw choice, and undoing all the work done to fake out the word. |
1252 | | float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, |
1253 | 0 | float *c2) { |
1254 | 0 | WERD *real_word = pr_it->word()->word; |
1255 | 0 | WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL), |
1256 | 0 | C_BLOB::deep_copy(blob)); |
1257 | 0 | WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word); |
1258 | | // Get a new iterator that points to the new word. |
1259 | 0 | PAGE_RES_IT it(pr_it->page_res); |
1260 | 0 | while (it.word() != word_res && it.word() != nullptr) { |
1261 | 0 | it.forward(); |
1262 | 0 | } |
1263 | 0 | ASSERT_HOST(it.word() == word_res); |
1264 | 0 | WordData wd(it); |
1265 | | // Force full initialization. |
1266 | 0 | SetupWordPassN(1, &wd); |
1267 | 0 | classify_word_and_language(pass_n, &it, &wd); |
1268 | 0 | if (debug_noise_removal) { |
1269 | 0 | if (wd.word->raw_choice != nullptr) { |
1270 | 0 | tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(), |
1271 | 0 | wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height()); |
1272 | 0 | } else { |
1273 | 0 | tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height, |
1274 | 0 | wd.row->x_height()); |
1275 | 0 | } |
1276 | 0 | } |
1277 | 0 | float cert = 0.0f; |
1278 | 0 | if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but... |
1279 | 0 | cert = wd.word->raw_choice->certainty(); |
1280 | 0 | float rat = wd.word->raw_choice->rating(); |
1281 | 0 | *c2 = rat > 0.0f ? cert * cert / rat : 0.0f; |
1282 | 0 | best_str = wd.word->raw_choice->unichar_string(); |
1283 | 0 | } else { |
1284 | 0 | *c2 = 0.0f; |
1285 | 0 | best_str.clear(); |
1286 | 0 | } |
1287 | 0 | it.DeleteCurrentWord(); |
1288 | 0 | pr_it->ResetWordIterator(); |
1289 | 0 | return cert; |
1290 | 0 | } |
1291 | | |
1292 | | #endif // ndef DISABLED_LEGACY_ENGINE |
1293 | | |
1294 | | // Generic function for classifying a word. Can be used either for pass1 or |
1295 | | // pass2 according to the function passed to recognizer. |
1296 | | // word_data holds the word to be recognized, and its block and row, and |
1297 | | // pr_it points to the word as well, in case we are running LSTM and it wants |
1298 | | // to output multiple words. |
1299 | | // Recognizes in the current language, and if successful that is all. |
1300 | | // If recognition was not successful, tries all available languages until |
1301 | | // it gets a successful result or runs out of languages. Keeps the best result. |
1302 | 444k | void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) { |
1303 | | #ifdef DISABLED_LEGACY_ENGINE |
1304 | | WordRecognizer recognizer = &Tesseract::classify_word_pass1; |
1305 | | #else |
1306 | 444k | WordRecognizer recognizer = |
1307 | 444k | pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2; |
1308 | 444k | #endif // def DISABLED_LEGACY_ENGINE |
1309 | | |
1310 | | // Best result so far. |
1311 | 444k | PointerVector<WERD_RES> best_words; |
1312 | | // Points to the best result. May be word or in lang_words. |
1313 | 444k | const WERD_RES *word = word_data->word; |
1314 | 444k | clock_t start_t = clock(); |
1315 | 444k | const bool debug = classify_debug_level > 0 || multilang_debug_level > 0; |
1316 | 444k | if (debug) { |
1317 | 0 | tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing", |
1318 | 0 | most_recently_used_->lang.c_str()); |
1319 | 0 | word->word->bounding_box().print(); |
1320 | 0 | } |
1321 | 444k | if (word->done) { |
1322 | | // If done on pass1, leave it as-is. |
1323 | 168k | if (!word->tess_failed) { |
1324 | 166k | most_recently_used_ = word->tesseract; |
1325 | 166k | } |
1326 | 168k | return; |
1327 | 168k | } |
1328 | 276k | auto sub = sub_langs_.size(); |
1329 | 276k | if (most_recently_used_ != this) { |
1330 | | // Get the index of the most_recently_used_. |
1331 | 0 | for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) { |
1332 | 0 | } |
1333 | 0 | } |
1334 | 276k | most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub], |
1335 | 276k | &best_words); |
1336 | 276k | Tesseract *best_lang_tess = most_recently_used_; |
1337 | 276k | if (!WordsAcceptable(best_words)) { |
1338 | | // Try all the other languages to see if they are any better. |
1339 | 260k | if (most_recently_used_ != this && |
1340 | 260k | this->RetryWithLanguage(*word_data, recognizer, debug, |
1341 | 0 | &word_data->lang_words[sub_langs_.size()], &best_words) > 0) { |
1342 | 0 | best_lang_tess = this; |
1343 | 0 | } |
1344 | 260k | for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) { |
1345 | 0 | if (most_recently_used_ != sub_langs_[i] && |
1346 | 0 | sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i], |
1347 | 0 | &best_words) > 0) { |
1348 | 0 | best_lang_tess = sub_langs_[i]; |
1349 | 0 | } |
1350 | 0 | } |
1351 | 260k | } |
1352 | 276k | most_recently_used_ = best_lang_tess; |
1353 | 276k | if (!best_words.empty()) { |
1354 | 276k | if (best_words.size() == 1 && !best_words[0]->combination) { |
1355 | | // Move the best single result to the main word. |
1356 | 128k | word_data->word->ConsumeWordResults(best_words[0]); |
1357 | 147k | } else { |
1358 | | // Words came from LSTM, and must be moved to the PAGE_RES properly. |
1359 | 147k | word_data->word = best_words.back(); |
1360 | 147k | pr_it->ReplaceCurrentWord(&best_words); |
1361 | 147k | } |
1362 | 276k | ASSERT_HOST(word_data->word->box_word != nullptr); |
1363 | 276k | } else { |
1364 | 0 | tprintf("no best words!!\n"); |
1365 | 0 | } |
1366 | 276k | clock_t ocr_t = clock(); |
1367 | 276k | if (tessedit_timing_debug) { |
1368 | 0 | tprintf("%s (ocr took %.2f sec)\n", word_data->word->best_choice->unichar_string().c_str(), |
1369 | 0 | static_cast<double>(ocr_t - start_t) / CLOCKS_PER_SEC); |
1370 | 0 | } |
1371 | 276k | } |
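classify_word_and_language tries the most recently used language first and consults the other languages only while the best result is still unacceptable, keeping whichever result scores best. The following is a minimal sketch of that retry ordering with illustrative stand-in types (Result, Recognizer); it omits Tesseract's per-language word caches and timing/debug handling.

    // Most-recently-used-first retry across languages: run the last successful
    // recognizer before the others, keep the best-scoring result, and stop as
    // soon as a result is acceptable. Assumes at least one recognizer.
    #include <cstddef>
    #include <functional>
    #include <string>
    #include <vector>

    struct Result {
      std::string text;
      float certainty = -1e30f;  // Higher is better.
      bool acceptable = false;
    };

    using Recognizer = std::function<Result(const std::string &)>;

    Result RecognizeWithRetry(const std::string &word,
                              const std::vector<Recognizer> &langs,
                              std::size_t &mru) {
      Result best = langs[mru](word);  // Most recently used language first.
      for (std::size_t i = 0; i < langs.size() && !best.acceptable; ++i) {
        if (i == mru) {
          continue;  // Already tried.
        }
        Result r = langs[i](word);
        if (r.certainty > best.certainty) {  // Keep the better result.
          best = r;
          mru = i;  // Remember which language produced it.
        }
      }
      return best;
    }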
1372 | | |
1373 | | /** |
1374 | | * classify_word_pass1 |
1375 | | * |
1376 | | * Baseline normalize the word and pass it to Tess. |
1377 | | */ |
1378 | | |
1379 | | void Tesseract::classify_word_pass1(const WordData &word_data, WERD_RES **in_word, |
1380 | 212k | PointerVector<WERD_RES> *out_words) { |
1381 | 212k | ROW *row = word_data.row; |
1382 | 212k | BLOCK *block = word_data.block; |
1383 | 212k | prev_word_best_choice_ = |
1384 | 212k | word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr; |
1385 | | #ifdef DISABLED_LEGACY_ENGINE |
1386 | | if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { |
1387 | | #else |
1388 | 212k | if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY || |
1389 | 212k | tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) { |
1390 | 212k | #endif // def DISABLED_LEGACY_ENGINE |
1391 | 212k | if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { |
1392 | 180k | LSTMRecognizeWord(*block, row, *in_word, out_words); |
1393 | 180k | if (!out_words->empty()) { |
1394 | 147k | return; // Successful lstm recognition. |
1395 | 147k | } |
1396 | 180k | } |
1397 | 65.5k | if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { |
1398 | | // No fallback allowed, so use a fake. |
1399 | 0 | (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset()); |
1400 | 0 | return; |
1401 | 0 | } |
1402 | | |
1403 | 65.5k | #ifndef DISABLED_LEGACY_ENGINE |
1404 | | // Fall back to tesseract for failed words or odd words. |
1405 | 65.5k | (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr, |
1406 | 65.5k | classify_bln_numeric_mode, textord_use_cjk_fp_model, |
1407 | 65.5k | poly_allow_detailed_fx, row, block); |
1408 | 65.5k | #endif // ndef DISABLED_LEGACY_ENGINE |
1409 | 65.5k | } |
1410 | | |
1411 | 65.5k | #ifndef DISABLED_LEGACY_ENGINE |
1412 | 65.5k | WERD_RES *word = *in_word; |
1413 | 65.5k | match_word_pass_n(1, word, row, block); |
1414 | 65.5k | if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { |
1415 | 65.5k | word->tess_would_adapt = AdaptableWord(word); |
1416 | 65.5k | bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode); |
1417 | | |
1418 | 65.5k | if (adapt_ok) { |
1419 | | // Send word to adaptive classifier for training. |
1420 | 2.36k | word->BestChoiceToCorrectText(); |
1421 | 2.36k | LearnWord(nullptr, word); |
1422 | | // Mark misadaptions if running blamer. |
1423 | 2.36k | if (word->blamer_bundle != nullptr) { |
1424 | 0 | word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer); |
1425 | 0 | } |
1426 | 2.36k | } |
1427 | | |
1428 | 65.5k | if (tessedit_enable_doc_dict && !word->IsAmbiguous()) { |
1429 | 36.2k | tess_add_doc_word(word->best_choice); |
1430 | 36.2k | } |
1431 | 65.5k | } |
1432 | 65.5k | #endif // ndef DISABLED_LEGACY_ENGINE |
1433 | 65.5k | } |
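Pass 1 prefers the LSTM engine and falls back to the legacy matcher only when the LSTM produces no output (or, in combined mode, when the word is odd-sized). A compact sketch of that primary-then-fallback flow, with Engine as an illustrative stand-in type rather than a Tesseract API:

    // Primary-engine-first with fallback: return the primary (LSTM-style)
    // engine's output when it produces any; otherwise run the fallback
    // (legacy-style) engine.
    #include <functional>
    #include <optional>
    #include <string>

    using Engine = std::function<std::optional<std::string>(const std::string &)>;

    std::string RecognizeWithFallback(const std::string &word_image,
                                      const Engine &primary,
                                      const Engine &fallback) {
      if (auto out = primary(word_image)) {
        return *out;  // Primary engine succeeded; no fallback needed.
      }
      // Primary produced nothing: re-run with the fallback engine.
      return fallback(word_image).value_or("");
    }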
1434 | | |
1435 | | // Helper to report the result of the xheight fix. |
1436 | | void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, |
1437 | 0 | WERD_RES *new_word) { |
1438 | 0 | tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().c_str(), |
1439 | 0 | word->best_choice->debug_string().c_str()); |
1440 | 0 | word->reject_map.print(debug_fp); |
1441 | 0 | tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().c_str(), |
1442 | 0 | new_word->best_choice->debug_string().c_str()); |
1443 | 0 | new_word->reject_map.print(debug_fp); |
1444 | 0 | tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT", |
1445 | 0 | new_word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK", |
1446 | 0 | accept_new_word ? "ACCEPTED" : ""); |
1447 | 0 | } |
1448 | | |
1449 | | #ifndef DISABLED_LEGACY_ENGINE |
1450 | | |
1451 | | // Run the x-height fix-up, based on min/max top/bottom information in |
1452 | | // unicharset. |
1453 | | // Returns true if the word was changed. |
1454 | | // See the comment in fixxht.cpp for a description of the overall process. |
1455 | 63.1k | bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row) { |
1456 | 63.1k | int original_misfits = CountMisfitTops(word); |
1457 | 63.1k | if (original_misfits == 0) { |
1458 | 27.4k | return false; |
1459 | 27.4k | } |
1460 | 35.7k | float baseline_shift = 0.0f; |
1461 | 35.7k | float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift); |
1462 | 35.7k | if (baseline_shift != 0.0f) { |
1463 | | // Try the shift on its own first. |
1464 | 12.3k | if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) { |
1465 | 8.33k | return false; |
1466 | 8.33k | } |
1467 | 4.06k | original_misfits = CountMisfitTops(word); |
1468 | 4.06k | if (original_misfits > 0) { |
1469 | 1.48k | float new_baseline_shift; |
1470 | | // Now recompute the new x_height. |
1471 | 1.48k | new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift); |
1472 | 1.48k | if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { |
1473 | | // No test of return value here, as we are definitely making a change |
1474 | | // to the word by shifting the baseline. |
1475 | 1.26k | TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row); |
1476 | 1.26k | } |
1477 | 1.48k | } |
1478 | 4.06k | return true; |
1479 | 23.3k | } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { |
1480 | 18.4k | return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row); |
1481 | 18.4k | } else { |
1482 | 4.90k | return false; |
1483 | 4.90k | } |
1484 | 35.7k | } |
1485 | | |
1486 | | // Runs recognition with the test baseline shift and x-height and returns true |
1487 | | // if there was an improvement in recognition result. |
1488 | | bool Tesseract::TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, |
1489 | 32.0k | WERD_RES *word, BLOCK *block, ROW *row) { |
1490 | 32.0k | bool accept_new_x_ht = false; |
1491 | 32.0k | WERD_RES new_x_ht_word(word->word); |
1492 | 32.0k | if (word->blamer_bundle != nullptr) { |
1493 | 0 | new_x_ht_word.blamer_bundle = new BlamerBundle(); |
1494 | 0 | new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle)); |
1495 | 0 | } |
1496 | 32.0k | new_x_ht_word.x_height = new_x_ht; |
1497 | 32.0k | new_x_ht_word.baseline_shift = baseline_shift; |
1498 | 32.0k | new_x_ht_word.caps_height = 0.0; |
1499 | 32.0k | new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr, |
1500 | 32.0k | classify_bln_numeric_mode, textord_use_cjk_fp_model, |
1501 | 32.0k | poly_allow_detailed_fx, row, block); |
1502 | 32.0k | match_word_pass_n(2, &new_x_ht_word, row, block); |
1503 | 32.0k | if (!new_x_ht_word.tess_failed) { |
1504 | 32.0k | int new_misfits = CountMisfitTops(&new_x_ht_word); |
1505 | 32.0k | if (debug_x_ht_level >= 1) { |
1506 | 0 | tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits, |
1507 | 0 | word->x_height, new_misfits, new_x_ht); |
1508 | 0 | tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(), |
1509 | 0 | word->best_choice->certainty(), new_x_ht_word.best_choice->rating(), |
1510 | 0 | new_x_ht_word.best_choice->certainty()); |
1511 | 0 | } |
 1512 |       |     // The misfits must improve, and so must either the rating or the certainty.
1513 | 32.0k | accept_new_x_ht = new_misfits < original_misfits && |
1514 | 32.0k | (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() || |
1515 | 22.6k | new_x_ht_word.best_choice->rating() < word->best_choice->rating()); |
1516 | 32.0k | if (debug_x_ht_level >= 1) { |
1517 | 0 | ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word); |
1518 | 0 | } |
1519 | 32.0k | } |
1520 | 32.0k | if (accept_new_x_ht) { |
1521 | 18.2k | word->ConsumeWordResults(&new_x_ht_word); |
1522 | 18.2k | return true; |
1523 | 18.2k | } |
1524 | 13.8k | return false; |
1525 | 32.0k | } |
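The acceptance test above boils down to: the count of misfit tops must drop, and either certainty must rise or rating (a cost) must fall. A small predicate capturing just that rule, using an illustrative stand-in struct for the relevant WERD_RES fields:

    // Acceptance rule for a re-normalized word: fewer misfit tops than before,
    // plus an improvement in either certainty (higher) or rating (lower).
    struct Candidate {
      int misfits;
      float certainty;  // Higher is better.
      float rating;     // Lower is better.
    };

    bool AcceptNewNormalization(const Candidate &old_word, const Candidate &new_word) {
      return new_word.misfits < old_word.misfits &&
             (new_word.certainty > old_word.certainty ||
              new_word.rating < old_word.rating);
    }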
1526 | | |
1527 | | #endif // ndef DISABLED_LEGACY_ENGINE |
1528 | | |
1529 | | /** |
1530 | | * classify_word_pass2 |
1531 | | * |
1532 | | * Control what to do with the word in pass 2 |
1533 | | */ |
1534 | | |
1535 | | void Tesseract::classify_word_pass2(const WordData &word_data, WERD_RES **in_word, |
1536 | 63.2k | PointerVector<WERD_RES> *out_words) { |
1537 | | // Return if we do not want to run Tesseract. |
1538 | 63.2k | if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { |
1539 | 0 | return; |
1540 | 0 | } |
1541 | 63.2k | #ifndef DISABLED_LEGACY_ENGINE |
1542 | 63.2k | ROW *row = word_data.row; |
1543 | 63.2k | BLOCK *block = word_data.block; |
1544 | 63.2k | WERD_RES *word = *in_word; |
1545 | 63.2k | prev_word_best_choice_ = |
1546 | 63.2k | word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr; |
1547 | | |
1548 | 63.2k | check_debug_pt(word, 30); |
1549 | 63.2k | if (!word->done) { |
1550 | 63.2k | word->caps_height = 0.0; |
1551 | 63.2k | if (word->x_height == 0.0f) { |
1552 | 0 | word->x_height = row->x_height(); |
1553 | 0 | } |
1554 | 63.2k | match_word_pass_n(2, word, row, block); |
1555 | 63.2k | check_debug_pt(word, 40); |
1556 | 63.2k | } |
1557 | | |
1558 | 63.2k | SubAndSuperscriptFix(word); |
1559 | | |
1560 | 63.2k | if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { |
1561 | 63.1k | if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() && |
1562 | 63.1k | block->classify_rotation().y() == 0.0f) { |
1563 | | // Use the tops and bottoms since they are available. |
1564 | 63.1k | TrainedXheightFix(word, block, row); |
1565 | 63.1k | } |
1566 | 63.1k | } |
1567 | | # ifndef GRAPHICS_DISABLED |
1568 | | if (tessedit_display_outwords) { |
1569 | | if (fx_win == nullptr) { |
1570 | | create_fx_win(); |
1571 | | } |
1572 | | clear_fx_win(); |
1573 | | word->rebuild_word->plot(fx_win); |
1574 | | TBOX wbox = word->rebuild_word->bounding_box(); |
1575 | | fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom()); |
1576 | | ScrollView::Update(); |
1577 | | } |
1578 | | # endif |
1579 | 63.2k | check_debug_pt(word, 50); |
1580 | 63.2k | #endif // ndef DISABLED_LEGACY_ENGINE |
1581 | 63.2k | } |
1582 | | |
1583 | | #ifndef DISABLED_LEGACY_ENGINE |
1584 | | /** |
1585 | | * match_word_pass2 |
 1586 |       | * match_word_pass_n
1587 | | * Baseline normalize the word and pass it to Tess. |
1588 | | */ |
1589 | 160k | void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block) { |
1590 | 160k | if (word->tess_failed) { |
1591 | 0 | return; |
1592 | 0 | } |
1593 | 160k | tess_segment_pass_n(pass_n, word); |
1594 | | |
1595 | 160k | if (!word->tess_failed) { |
1596 | 160k | if (!word->word->flag(W_REP_CHAR)) { |
1597 | 160k | word->fix_quotes(); |
1598 | 160k | if (tessedit_fix_hyphens) { |
1599 | 160k | word->fix_hyphens(); |
1600 | 160k | } |
1601 | | /* Don't trust fix_quotes! - though I think I've fixed the bug */ |
1602 | 160k | if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) { |
1603 | 0 | tprintf( |
1604 | 0 | "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;" |
1605 | 0 | " #Blobs=%u\n", |
1606 | 0 | word->best_choice->debug_string().c_str(), word->best_choice->length(), |
1607 | 0 | word->box_word->length()); |
1608 | 0 | } |
1609 | 160k | word->tess_accepted = tess_acceptable_word(word); |
1610 | | |
1611 | | // Also sets word->done flag |
1612 | 160k | make_reject_map(word, row, pass_n); |
1613 | 160k | } |
1614 | 160k | } |
1615 | 160k | set_word_fonts(word); |
1616 | | |
1617 | 160k | ASSERT_HOST(word->raw_choice != nullptr); |
1618 | 160k | } |
1619 | | #endif // ndef DISABLED_LEGACY_ENGINE |
1620 | | |
1621 | | // Helper to return the best rated BLOB_CHOICE in the whole word that matches |
1622 | | // the given char_id, or nullptr if none can be found. |
1623 | 0 | static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) { |
1624 | | // Find the corresponding best BLOB_CHOICE from any position in the word_res. |
1625 | 0 | BLOB_CHOICE *best_choice = nullptr; |
1626 | 0 | for (unsigned i = 0; i < word_res->best_choice->length(); ++i) { |
1627 | 0 | BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i)); |
1628 | 0 | if (choice != nullptr) { |
1629 | 0 | if (best_choice == nullptr || choice->rating() < best_choice->rating()) { |
1630 | 0 | best_choice = choice; |
1631 | 0 | } |
1632 | 0 | } |
1633 | 0 | } |
1634 | 0 | return best_choice; |
1635 | 0 | } |
1636 | | |
1637 | | // Helper to insert blob_choice in each location in the leader word if there is |
1638 | | // no matching BLOB_CHOICE there already, and correct any incorrect results |
1639 | | // in the best_choice. |
1640 | 0 | static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) { |
1641 | 0 | WERD_CHOICE *word = word_res->best_choice; |
1642 | 0 | for (unsigned i = 0; i < word_res->best_choice->length(); ++i) { |
1643 | 0 | BLOB_CHOICE *choice = |
1644 | 0 | FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i)); |
1645 | 0 | if (choice == nullptr) { |
1646 | 0 | BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i)); |
1647 | 0 | choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice)); |
1648 | 0 | } |
1649 | 0 | } |
1650 | | // Correct any incorrect results in word. |
1651 | 0 | for (unsigned i = 0; i < word->length(); ++i) { |
1652 | 0 | if (word->unichar_id(i) != blob_choice->unichar_id()) { |
1653 | 0 | word->set_unichar_id(blob_choice->unichar_id(), i); |
1654 | 0 | } |
1655 | 0 | } |
1656 | 0 | } |
1657 | | |
1658 | | /** |
1659 | | * fix_rep_char() |
 1660 |       | * The word is a repeated char (a leader). Find the repeated character.
1661 | | * Create the appropriate single-word or multi-word sequence according to |
1662 | | * the size of spaces in between blobs, and correct the classifications |
1663 | | * where some of the characters disagree with the majority. |
1664 | | */ |
1665 | 0 | void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) { |
1666 | 0 | WERD_RES *word_res = page_res_it->word(); |
1667 | 0 | const WERD_CHOICE &word = *(word_res->best_choice); |
1668 | | |
1669 | | // Find the frequency of each unique character in the word. |
1670 | 0 | SortHelper<UNICHAR_ID> rep_ch(word.length()); |
1671 | 0 | for (unsigned i = 0; i < word.length(); ++i) { |
1672 | 0 | rep_ch.Add(word.unichar_id(i), 1); |
1673 | 0 | } |
1674 | | |
1675 | | // Find the most frequent result. |
1676 | 0 | UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char |
1677 | 0 | int max_count = rep_ch.MaxCount(&maxch_id); |
1678 | | // Find the best exemplar of a classifier result for maxch_id. |
1679 | 0 | BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res); |
1680 | 0 | if (best_choice == nullptr) { |
1681 | 0 | tprintf("Failed to find a choice for %s, occurring %d times\n", |
1682 | 0 | word_res->uch_set->debug_str(maxch_id).c_str(), max_count); |
1683 | 0 | return; |
1684 | 0 | } |
1685 | 0 | word_res->done = true; |
1686 | | |
1687 | | // Just correct existing classification. |
1688 | 0 | CorrectRepcharChoices(best_choice, word_res); |
1689 | 0 | word_res->reject_map.initialise(word.length()); |
1690 | 0 | } |
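fix_rep_char counts the frequency of each classified character, picks the majority character, and overwrites the positions that disagree. The same majority-vote correction on a plain std::string, as a standalone sketch:

    // Majority-vote correction for a repeated-character (leader) word: find
    // the most frequent character and overwrite every position with it.
    // Works on a plain std::string rather than Tesseract's WERD_CHOICE.
    #include <map>
    #include <string>

    std::string FixRepeatedChar(std::string word) {
      if (word.empty()) {
        return word;
      }
      std::map<char, int> counts;
      for (char c : word) {
        ++counts[c];  // Frequency of each character.
      }
      char majority = word[0];
      int best = 0;
      for (const auto &kv : counts) {
        if (kv.second > best) {  // Most frequent character wins.
          best = kv.second;
          majority = kv.first;
        }
      }
      for (char &c : word) {
        c = majority;  // Correct the disagreeing positions.
      }
      return word;
    }

For example, FixRepeatedChar(".._...") yields "......".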
1691 | | |
1692 | | ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_set, const char *s, |
1693 | 26 | const char *lengths) { |
1694 | 26 | int i = 0; |
1695 | 26 | int offset = 0; |
1696 | 26 | int leading_punct_count; |
1697 | 26 | int upper_count = 0; |
1698 | 26 | int hyphen_pos = -1; |
1699 | 26 | ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE; |
1700 | | |
1701 | 26 | if (strlen(lengths) > 20) { |
1702 | 0 | return word_type; |
1703 | 0 | } |
1704 | | |
1705 | | /* Single Leading punctuation char*/ |
1706 | | |
1707 | 26 | if (s[offset] != '\0' && chs_leading_punct.contains(s[offset])) { |
1708 | 0 | offset += lengths[i++]; |
1709 | 0 | } |
1710 | 26 | leading_punct_count = i; |
1711 | | |
1712 | | /* Initial cap */ |
1713 | 32 | while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) { |
1714 | 6 | offset += lengths[i++]; |
1715 | 6 | upper_count++; |
1716 | 6 | } |
1717 | 26 | if (upper_count > 1) { |
1718 | 1 | word_type = AC_UPPER_CASE; |
1719 | 25 | } else { |
1720 | | /* Lower case word, possibly with an initial cap */ |
1721 | 59 | while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) { |
1722 | 34 | offset += lengths[i++]; |
1723 | 34 | } |
1724 | 25 | if (i - leading_punct_count < quality_min_initial_alphas_reqd) { |
1725 | 20 | goto not_a_word; |
1726 | 20 | } |
1727 | | /* |
1728 | | Allow a single hyphen in a lower case word |
1729 | | - don't trust upper case - I've seen several cases of "H" -> "I-I" |
1730 | | */ |
1731 | 5 | if (lengths[i] == 1 && s[offset] == '-') { |
1732 | 0 | hyphen_pos = i; |
1733 | 0 | offset += lengths[i++]; |
1734 | 0 | if (s[offset] != '\0') { |
1735 | 0 | while ((s[offset] != '\0') && char_set.get_islower(s + offset, lengths[i])) { |
1736 | 0 | offset += lengths[i++]; |
1737 | 0 | } |
1738 | 0 | if (i < hyphen_pos + 3) { |
1739 | 0 | goto not_a_word; |
1740 | 0 | } |
1741 | 0 | } |
1742 | 5 | } else { |
1743 | | /* Allow "'s" in NON hyphenated lower case words */ |
1744 | 5 | if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 && |
1745 | 5 | (s[offset + lengths[i]] == 's')) { |
1746 | 0 | offset += lengths[i++]; |
1747 | 0 | offset += lengths[i++]; |
1748 | 0 | } |
1749 | 5 | } |
1750 | 5 | if (upper_count > 0) { |
1751 | 4 | word_type = AC_INITIAL_CAP; |
1752 | 4 | } else { |
1753 | 1 | word_type = AC_LOWER_CASE; |
1754 | 1 | } |
1755 | 5 | } |
1756 | | |
1757 | | /* Up to two different, constrained trailing punctuation chars */ |
1758 | 6 | if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset])) { |
1759 | 0 | offset += lengths[i++]; |
1760 | 0 | } |
1761 | 6 | if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] && |
1762 | 6 | chs_trailing_punct2.contains(s[offset])) { |
1763 | 0 | offset += lengths[i++]; |
1764 | 0 | } |
1765 | | |
1766 | 6 | if (s[offset] != '\0') { |
1767 | 2 | word_type = AC_UNACCEPTABLE; |
1768 | 2 | } |
1769 | | |
1770 | 26 | not_a_word: |
1771 | | |
1772 | 26 | if (word_type == AC_UNACCEPTABLE) { |
1773 | | /* Look for abbreviation string */ |
1774 | 22 | i = 0; |
1775 | 22 | offset = 0; |
1776 | 22 | if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) { |
1777 | 1 | word_type = AC_UC_ABBREV; |
1778 | 1 | while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) && |
1779 | 1 | lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { |
1780 | 0 | offset += lengths[i++]; |
1781 | 0 | offset += lengths[i++]; |
1782 | 0 | } |
1783 | 21 | } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) { |
1784 | 21 | word_type = AC_LC_ABBREV; |
1785 | 21 | while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) && |
1786 | 21 | lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { |
1787 | 0 | offset += lengths[i++]; |
1788 | 0 | offset += lengths[i++]; |
1789 | 0 | } |
1790 | 21 | } |
1791 | 22 | if (s[offset] != '\0') { |
1792 | 22 | word_type = AC_UNACCEPTABLE; |
1793 | 22 | } |
1794 | 22 | } |
1795 | | |
1796 | 26 | return word_type; |
1797 | 6 | } |
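acceptable_word_string accepts only a few word shapes: optional leading punctuation, then either an all-caps run or an (optionally capitalized) lower-case body with at most one internal hyphen or a trailing 's, then up to two constrained trailing punctuation characters, with dotted abbreviations handled separately. Below is a deliberately simplified, ASCII-only approximation of those shapes using std::regex; it does not reproduce the exact thresholds (e.g. quality_min_initial_alphas_reqd) or the UTF-8 "lengths" handling of the real routine.

    // A much-simplified approximation of the accepted word shapes, using
    // std::regex instead of per-character scanning.
    #include <cctype>
    #include <regex>
    #include <string>

    enum class WordShape { Unacceptable, LowerCase, InitialCap, UpperCase, Abbrev };

    WordShape ClassifyWordShape(const std::string &s) {
      static const std::regex upper_word("[[:punct:]]?[A-Z]{2,}[.,;:!?]{0,2}");
      static const std::regex capped(
          "[[:punct:]]?[A-Z]?[a-z]{2,}(-[a-z]{2,}|'s)?[.,;:!?]{0,2}");
      static const std::regex abbrev("([A-Za-z]\\.)+");
      if (std::regex_match(s, upper_word)) {
        return WordShape::UpperCase;
      }
      if (std::regex_match(s, capped)) {
        return std::isupper(static_cast<unsigned char>(s[0])) ? WordShape::InitialCap
                                                              : WordShape::LowerCase;
      }
      if (std::regex_match(s, abbrev)) {
        return WordShape::Abbrev;
      }
      return WordShape::Unacceptable;
    }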
1798 | | |
1799 | 672k | bool Tesseract::check_debug_pt(WERD_RES *word, int location) { |
1800 | 672k | bool show_map_detail = false; |
1801 | 672k | int16_t i; |
1802 | | |
1803 | 672k | if (!test_pt) { |
1804 | 672k | return false; |
1805 | 672k | } |
1806 | | |
1807 | 0 | tessedit_rejection_debug.set_value(false); |
1808 | 0 | debug_x_ht_level.set_value(0); |
 1809 |       |
1810 | 0 | if (word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) { |
1811 | 0 | if (location < 0) { |
1812 | 0 | return true; // For breakpoint use |
1813 | 0 | } |
1814 | 0 | tessedit_rejection_debug.set_value(true); |
1815 | 0 | debug_x_ht_level.set_value(2); |
1816 | 0 | tprintf("\n\nTESTWD::"); |
1817 | 0 | switch (location) { |
1818 | 0 | case 0: |
1819 | 0 | tprintf("classify_word_pass1 start\n"); |
1820 | 0 | word->word->print(); |
1821 | 0 | break; |
1822 | 0 | case 10: |
1823 | 0 | tprintf("make_reject_map: initial map"); |
1824 | 0 | break; |
1825 | 0 | case 20: |
1826 | 0 | tprintf("make_reject_map: after NN"); |
1827 | 0 | break; |
1828 | 0 | case 30: |
1829 | 0 | tprintf("classify_word_pass2 - START"); |
1830 | 0 | break; |
1831 | 0 | case 40: |
1832 | 0 | tprintf("classify_word_pass2 - Pre Xht"); |
1833 | 0 | break; |
1834 | 0 | case 50: |
1835 | 0 | tprintf("classify_word_pass2 - END"); |
1836 | 0 | show_map_detail = true; |
1837 | 0 | break; |
1838 | 0 | case 60: |
1839 | 0 | tprintf("fixspace"); |
1840 | 0 | break; |
1841 | 0 | case 70: |
1842 | 0 | tprintf("MM pass START"); |
1843 | 0 | break; |
1844 | 0 | case 80: |
1845 | 0 | tprintf("MM pass END"); |
1846 | 0 | break; |
1847 | 0 | case 90: |
1848 | 0 | tprintf("After Poor quality rejection"); |
1849 | 0 | break; |
1850 | 0 | case 100: |
1851 | 0 | tprintf("unrej_good_quality_words - START"); |
1852 | 0 | break; |
1853 | 0 | case 110: |
1854 | 0 | tprintf("unrej_good_quality_words - END"); |
1855 | 0 | break; |
1856 | 0 | case 120: |
1857 | 0 | tprintf("Write results pass"); |
1858 | 0 | show_map_detail = true; |
1859 | 0 | break; |
1860 | 0 | } |
1861 | 0 | if (word->best_choice != nullptr) { |
1862 | 0 | tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str()); |
1863 | 0 | word->reject_map.print(debug_fp); |
1864 | 0 | tprintf("\n"); |
1865 | 0 | if (show_map_detail) { |
1866 | 0 | tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str()); |
1867 | 0 | for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { |
1868 | 0 | tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); |
1869 | 0 | word->reject_map[i].full_print(debug_fp); |
1870 | 0 | } |
1871 | 0 | } |
1872 | 0 | } else { |
1873 | 0 | tprintf("null best choice\n"); |
1874 | 0 | } |
1875 | 0 | tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); |
1876 | 0 | tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); |
1877 | 0 | return true; |
1878 | 0 | } else { |
1879 | 0 | return false; |
1880 | 0 | } |
1881 | 0 | } |
1882 | | |
1883 | | /** |
1884 | | * find_modal_font |
1885 | | * |
 1886 |       | * Find the modal font and remove it from the stats.
1887 | | */ |
1888 | | #ifndef DISABLED_LEGACY_ENGINE |
1889 | | static void find_modal_font( // good chars in word |
1890 | | STATS *fonts, // font stats |
1891 | | int16_t *font_out, // output font |
1892 | | int8_t *font_count // output count |
1893 | 0 | ) { |
1894 | 0 | int16_t font; // font index |
1895 | 0 | int32_t count; // pile count |
1896 | |
|
1897 | 0 | if (fonts->get_total() > 0) { |
1898 | 0 | font = static_cast<int16_t>(fonts->mode()); |
1899 | 0 | *font_out = font; |
1900 | 0 | count = fonts->pile_count(font); |
1901 | 0 | *font_count = count < INT8_MAX ? count : INT8_MAX; |
1902 | 0 | fonts->add(font, -*font_count); |
1903 | 0 | } else { |
1904 | 0 | *font_out = -1; |
1905 | 0 | *font_count = 0; |
1906 | 0 | } |
1907 | 0 | } |
1908 | | #endif // ! DISABLED_LEGACY_ENGINE |
1909 | | |
1910 | | /** |
1911 | | * set_word_fonts |
1912 | | * |
1913 | | * Get the fonts for the word. |
1914 | | */ |
1915 | 160k | void Tesseract::set_word_fonts(WERD_RES *word) { |
1916 | | // Don't try to set the word fonts for an lstm word, as the configs |
1917 | | // will be meaningless. |
1918 | 160k | if (word->chopped_word == nullptr) { |
1919 | 0 | return; |
1920 | 0 | } |
1921 | 160k | ASSERT_HOST(word->best_choice != nullptr); |
1922 | | |
1923 | 160k | #ifndef DISABLED_LEGACY_ENGINE |
1924 | 160k | const int fontinfo_size = fontinfo_table_.size(); |
1925 | 160k | if (fontinfo_size == 0) { |
1926 | 0 | return; |
1927 | 0 | } |
1928 | 160k | if (tessedit_font_id > 0) { |
1929 | 0 | if (tessedit_font_id >= fontinfo_size) { |
1930 | 0 | tprintf("Error, invalid font ID provided: must be below %d.\n" |
1931 | 0 | "Falling back to font auto-detection.\n", fontinfo_size); |
1932 | 0 | } else { |
1933 | 0 | word->fontinfo = &fontinfo_table_.at(tessedit_font_id); |
1934 | 0 | word->fontinfo2 = nullptr; |
1935 | 0 | word->fontinfo_id_count = INT8_MAX; |
1936 | 0 | word->fontinfo_id2_count = 0; |
1937 | 0 | return; |
1938 | 0 | } |
1939 | 0 | } |
1940 | 160k | std::vector<int> font_total_score(fontinfo_size); |
1941 | | |
1942 | | // Compute the font scores for the word |
1943 | 160k | if (tessedit_debug_fonts) { |
1944 | 0 | tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str()); |
1945 | 0 | } |
1946 | 853k | for (unsigned b = 0; b < word->best_choice->length(); ++b) { |
1947 | 692k | const BLOB_CHOICE *choice = word->GetBlobChoice(b); |
1948 | 692k | if (choice == nullptr) { |
1949 | 81 | continue; |
1950 | 81 | } |
1951 | 692k | auto &fonts = choice->fonts(); |
1952 | 22.2M | for (auto &f : fonts) { |
1953 | 22.2M | const int fontinfo_id = f.fontinfo_id; |
1954 | 22.2M | if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) { |
1955 | 22.2M | font_total_score[fontinfo_id] += f.score; |
1956 | 22.2M | } |
1957 | 22.2M | } |
1958 | 692k | } |
1959 | | // Find the top and 2nd choice for the word. |
1960 | 160k | int score1 = 0, score2 = 0; |
1961 | 160k | int16_t font_id1 = -1, font_id2 = -1; |
1962 | 68.8M | for (int f = 0; f < fontinfo_size; ++f) { |
1963 | 68.6M | if (tessedit_debug_fonts && font_total_score[f] > 0) { |
1964 | 0 | tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]); |
1965 | 0 | } |
1966 | 68.6M | if (font_total_score[f] > score1) { |
1967 | 494k | score2 = score1; |
1968 | 494k | font_id2 = font_id1; |
1969 | 494k | score1 = font_total_score[f]; |
1970 | 494k | font_id1 = f; |
1971 | 68.1M | } else if (font_total_score[f] > score2) { |
1972 | 319k | score2 = font_total_score[f]; |
1973 | 319k | font_id2 = f; |
1974 | 319k | } |
1975 | 68.6M | } |
1976 | 160k | word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr; |
1977 | 160k | word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr; |
1978 | | // Each score has a limit of UINT16_MAX, so divide by that to get the number |
 1979 |       |   // of "votes" for that font, i.e. the number of perfect scores.
1980 | 160k | word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX); |
1981 | 160k | word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX); |
1982 | 160k | if (score1 > 0) { |
1983 | 160k | const FontInfo fi = fontinfo_table_.at(font_id1); |
1984 | 160k | if (tessedit_debug_fonts) { |
1985 | 0 | if (word->fontinfo_id2_count > 0 && font_id2 >= 0) { |
1986 | 0 | tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name, |
1987 | 0 | word->fontinfo_id_count, fontinfo_table_.at(font_id2).name, |
1988 | 0 | word->fontinfo_id2_count); |
1989 | 0 | } else { |
1990 | 0 | tprintf("Word modal font=%s, score=%d. No 2nd choice\n", fi.name, word->fontinfo_id_count); |
1991 | 0 | } |
1992 | 0 | } |
1993 | 160k | } |
1994 | 160k | #endif // ndef DISABLED_LEGACY_ENGINE |
1995 | 160k | } |
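set_word_fonts accumulates each font's score over every blob in the word and keeps the top two fonts; the winning totals are then converted into vote counts by dividing by the per-blob maximum score (UINT16_MAX) and clipping, as the code above does with ClipToRange. A standalone sketch of the accumulation and top-two selection over a plain score matrix, independent of Tesseract's types:

    // Per-word font voting: sum each font's score over all blobs, then keep
    // the best and second-best fonts. scores[blob][font] holds that blob's
    // score for that font; every inner vector is assumed to have num_fonts
    // entries.
    #include <utility>
    #include <vector>

    std::pair<int, int> TopTwoFonts(const std::vector<std::vector<int>> &scores,
                                    int num_fonts) {
      std::vector<long> total(num_fonts, 0);
      for (const auto &blob : scores) {
        for (int f = 0; f < num_fonts; ++f) {
          total[f] += blob[f];
        }
      }
      int best = -1, second = -1;
      long best_score = 0, second_score = 0;
      for (int f = 0; f < num_fonts; ++f) {
        if (total[f] > best_score) {
          second = best;           // Old winner becomes runner-up.
          second_score = best_score;
          best = f;
          best_score = total[f];
        } else if (total[f] > second_score) {
          second = f;
          second_score = total[f];
        }
      }
      return {best, second};  // -1 means "no font scored above zero".
    }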
1996 | | |
1997 | | #ifndef DISABLED_LEGACY_ENGINE |
1998 | | /** |
1999 | | * font_recognition_pass |
2000 | | * |
2001 | | * Smooth the fonts for the document. |
2002 | | */ |
2003 | 0 | void Tesseract::font_recognition_pass(PAGE_RES *page_res) { |
2004 | 0 | PAGE_RES_IT page_res_it(page_res); |
2005 | 0 | WERD_RES *word; // current word |
2006 | 0 | STATS doc_fonts(0, font_table_size_ - 1); // font counters |
2007 | | |
2008 | | // Gather font id statistics. |
2009 | 0 | for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { |
2010 | 0 | word = page_res_it.word(); |
2011 | 0 | if (word->fontinfo != nullptr) { |
2012 | 0 | doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count); |
2013 | 0 | } |
2014 | 0 | if (word->fontinfo2 != nullptr) { |
2015 | 0 | doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count); |
2016 | 0 | } |
2017 | 0 | } |
2018 | 0 | int16_t doc_font; // modal font |
 2019 |     0 |   int8_t doc_font_count;  // modal font count
2020 | 0 | find_modal_font(&doc_fonts, &doc_font, &doc_font_count); |
2021 | 0 | if (doc_font_count == 0) { |
2022 | 0 | return; |
2023 | 0 | } |
2024 | | // Get the modal font pointer. |
2025 | 0 | const FontInfo *modal_font = nullptr; |
2026 | 0 | for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { |
2027 | 0 | word = page_res_it.word(); |
2028 | 0 | if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) { |
2029 | 0 | modal_font = word->fontinfo; |
2030 | 0 | break; |
2031 | 0 | } |
2032 | 0 | if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) { |
2033 | 0 | modal_font = word->fontinfo2; |
2034 | 0 | break; |
2035 | 0 | } |
2036 | 0 | } |
2037 | 0 | ASSERT_HOST(modal_font != nullptr); |
2038 | | |
2039 | | // Assign modal font to weak words. |
2040 | 0 | for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { |
2041 | 0 | word = page_res_it.word(); |
2042 | 0 | const int length = word->best_choice->length(); |
 2043 |       |
2044 | 0 | const int count = word->fontinfo_id_count; |
2045 | 0 | if (!(count == length || (length > 3 && count >= length * 3 / 4))) { |
2046 | 0 | word->fontinfo = modal_font; |
2047 | | // Counts only get 1 as it came from the doc. |
2048 | 0 | word->fontinfo_id_count = 1; |
2049 | 0 | } |
2050 | 0 | } |
2051 | 0 | } |
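font_recognition_pass finds the document's modal font and assigns it to any word whose own attribution is weak: a word keeps its font only if its vote count equals its length, or, for words longer than three characters, covers at least three quarters of it. That test and the reassignment as a small sketch, with WordFont as an illustrative stand-in struct:

    // Smooth weakly attributed words toward the document's modal font.
    struct WordFont {
      int font_id;
      int vote_count;
      int length;
    };

    void SmoothToModalFont(WordFont &word, int modal_font_id) {
      const bool strong =
          word.vote_count == word.length ||
          (word.length > 3 && word.vote_count >= word.length * 3 / 4);
      if (!strong) {
        word.font_id = modal_font_id;
        word.vote_count = 1;  // Only one vote, since it came from the doc.
      }
    }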
2052 | | #endif // ndef DISABLED_LEGACY_ENGINE |
2053 | | |
 2054 |       | // If a word has multiple alternates, check whether the best choice is in the
 2055 |       | // dictionary. If not, replace it with an alternate that exists in the
 2056 |       | // dictionary.
2057 | 0 | void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) { |
2058 | 0 | PAGE_RES_IT word_it(page_res); |
2059 | 0 | for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) { |
2060 | 0 | if (word->best_choices.singleton()) { |
2061 | 0 | continue; // There are no alternates. |
2062 | 0 | } |
2063 | | |
2064 | 0 | const WERD_CHOICE *best = word->best_choice; |
2065 | 0 | if (word->tesseract->getDict().valid_word(*best) != 0) { |
2066 | 0 | continue; // The best choice is in the dictionary. |
2067 | 0 | } |
2068 | | |
2069 | 0 | WERD_CHOICE_IT choice_it(&word->best_choices); |
2070 | 0 | for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { |
2071 | 0 | WERD_CHOICE *alternate = choice_it.data(); |
2072 | 0 | if (word->tesseract->getDict().valid_word(*alternate)) { |
2073 | | // The alternate choice is in the dictionary. |
2074 | 0 | if (tessedit_bigram_debug) { |
2075 | 0 | tprintf("Dictionary correction replaces best choice '%s' with '%s'\n", |
2076 | 0 | best->unichar_string().c_str(), alternate->unichar_string().c_str()); |
2077 | 0 | } |
2078 | | // Replace the 'best' choice with a better choice. |
2079 | 0 | word->ReplaceBestChoice(alternate); |
2080 | 0 | break; |
2081 | 0 | } |
2082 | 0 | } |
2083 | 0 | } |
2084 | 0 | } |
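dictionary_correction_pass keeps the best choice when the dictionary already accepts it and otherwise swaps in the first alternate the dictionary does accept. A standalone sketch of that decision, with a std::set of strings standing in for Dict::valid_word():

    // Dictionary correction: if the best candidate is not a known word,
    // replace it with the first alternate that is.
    #include <set>
    #include <string>
    #include <vector>

    std::string CorrectWithDictionary(const std::string &best,
                                      const std::vector<std::string> &alternates,
                                      const std::set<std::string> &dictionary) {
      if (dictionary.count(best) != 0) {
        return best;  // Already a dictionary word; keep it.
      }
      for (const auto &alt : alternates) {
        if (dictionary.count(alt) != 0) {
          return alt;  // First alternate found in the dictionary wins.
        }
      }
      return best;  // No alternate is better; keep the original.
    }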
2085 | | |
2086 | | } // namespace tesseract |