Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccmain/control.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************
2
 * File:        control.cpp  (Formerly control.c)
3
 * Description: Module-independent matcher controller.
4
 * Author:      Ray Smith
5
 *
6
 * (C) Copyright 1992, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
// Include automatically generated configuration file if running autoconf.
20
#ifdef HAVE_CONFIG_H
21
#  include "config_auto.h"
22
#endif
23
24
#include <cctype>
25
#include <cmath>
26
#include <cstdint> // for int16_t, int32_t
27
#include <cstdio>  // for fclose, fopen, FILE
28
#include <ctime>   // for clock
29
#include "control.h"
30
#ifndef DISABLED_LEGACY_ENGINE
31
#  include "docqual.h"
32
#  include "drawfx.h"
33
#  include "fixspace.h"
34
#endif
35
#include <tesseract/ocrclass.h>
36
#include "lstmrecognizer.h"
37
#include "output.h"
38
#include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...
39
#ifndef DISABLED_LEGACY_ENGINE
40
#  include "reject.h"
41
#endif
42
#include "sorthelper.h"
43
#include "tesseractclass.h"
44
#include "tesserrstream.h"  // for tesserr
45
#include "tessvars.h"
46
#include "werdit.h"
47
48
// Name of the fixed temporary file that receives a dump of the current
// parameters before a per-word config file is applied.
constexpr const char *kBackUpConfigFile = "tempconfigdata.config";
#ifndef DISABLED_LEGACY_ENGINE
// Smallest x-height that is still believable for any text when refitting,
// expressed as a fraction of the original x-height.
constexpr double kMinRefitXHeightFraction = 0.5;
#endif // ! DISABLED_LEGACY_ENGINE
54
55
namespace tesseract {
56
57
/**
58
 * Make a word from the selected blobs and run Tess on them.
59
 *
60
 * @param page_res recognise blobs
61
 * @param selection_box within this box
62
 */
63
64
0
void Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) {
  // Ask for a pseudo word covering the selection; a null result means there
  // is nothing to recognize.
  PAGE_RES_IT *pseudo_it = make_pseudo_word(page_res, selection_box);
  if (pseudo_it == nullptr) {
    return;
  }
  recog_interactive(pseudo_it);
  // Remove the pseudo word again, then release the iterator, which this
  // function deletes itself.
  pseudo_it->DeleteCurrentWord();
  delete pseudo_it;
}
72
73
/**
74
 * Recognize a single word in interactive mode.
75
 *
76
 * @param pr_it the page results iterator
77
 */
78
0
bool Tesseract::recog_interactive(PAGE_RES_IT *pr_it) {
  WordData word_data(*pr_it);
  SetupWordPassN(2, &word_data);
  if (lstm_recognizer_ != nullptr) {
    // LSTM doesn't run on pass2, so classify as pass 1 when it is loaded.
    classify_word_and_language(1, pr_it, &word_data);
  } else {
    // No LSTM recognizer: run pass2 for tesseract (legacy builds only).
#ifndef DISABLED_LEGACY_ENGINE
    classify_word_and_language(2, pr_it, &word_data);
#endif // ndef DISABLED_LEGACY_ENGINE
  }
#ifndef DISABLED_LEGACY_ENGINE
  if (tessedit_debug_quality_metrics) {
    // Dump the per-word quality figures for debugging.
    WERD_RES *word_res = pr_it->word();
    int16_t char_qual;
    int16_t good_char_qual;
    word_char_quality(word_res, &char_qual, &good_char_qual);
    const int num_chars = word_res->reject_map.length();
    const int blob_quality = word_blob_quality(word_res);
    const int outline_errs = word_outline_errs(word_res);
    tprintf(
        "\n%d chars;  word_blob_quality: %d;  outline_errs: %d; "
        "char_quality: %d; good_char_quality: %d\n",
        num_chars, blob_quality, outline_errs, char_qual, good_char_qual);
  }
#endif // ndef DISABLED_LEGACY_ENGINE
  // Always reports success.
  return true;
}
104
105
// Helper function to check for a target word and handle it appropriately.
106
// Inspired by Jetsoft's requirement to process only single words on pass2
107
// and beyond.
108
// If word_config is not null:
109
//   If the word_box and target_word_box overlap, read the word_config file
110
//   else reset to previous config data.
111
//   return true.
112
// else
113
//   If the word_box and target_word_box overlap or pass <= 1, return true.
114
// Note that this function uses a fixed temporary file for storing the previous
115
// configs, so it is neither thread-safe, nor process-safe, but the assumption
116
// is that it will only be used for one debug window at a time.
117
//
118
// Since this function is used for debugging (and not to change OCR results)
119
// set only debug params from the word config file.
120
bool Tesseract::ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box,
121
0
                                  const char *word_config, int pass) {
122
0
  if (word_config != nullptr) {
123
0
    if (word_box.major_overlap(target_word_box)) {
124
0
      if (backup_config_file_ == nullptr) {
125
0
        backup_config_file_ = kBackUpConfigFile;
126
0
        FILE *config_fp = fopen(backup_config_file_, "wb");
127
0
        if (config_fp == nullptr) {
128
0
          tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
129
0
        } else {
130
0
          ParamUtils::PrintParams(config_fp, params());
131
0
          fclose(config_fp);
132
0
        }
133
0
        ParamUtils::ReadParamsFile(word_config, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());
134
0
      }
135
0
    } else {
136
0
      if (backup_config_file_ != nullptr) {
137
0
        ParamUtils::ReadParamsFile(backup_config_file_, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());
138
0
        backup_config_file_ = nullptr;
139
0
      }
140
0
    }
141
0
  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
142
0
    return false;
143
0
  }
144
0
  return true;
145
0
}
146
147
/** If tesseract is to be run, sets the words up ready for it. */
148
void Tesseract::SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config,
149
27.8k
                                   PAGE_RES *page_res, std::vector<WordData> *words) {
150
  // Prepare all the words.
151
27.8k
  PAGE_RES_IT page_res_it(page_res);
152
366k
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
153
338k
    if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(),
154
338k
                                                        *target_word_box, word_config, 1)) {
155
338k
      words->push_back(WordData(page_res_it));
156
338k
    }
157
338k
  }
158
  // Setup all the words for recognition with polygonal approximation.
159
366k
  for (unsigned w = 0; w < words->size(); ++w) {
160
338k
    SetupWordPassN(pass_n, &(*words)[w]);
161
338k
    if (w > 0) {
162
311k
      (*words)[w].prev_word = &(*words)[w - 1];
163
311k
    }
164
338k
  }
165
27.8k
}
166
167
// Sets up the single word ready for whichever engine is to be run.
168
338k
void Tesseract::SetupWordPassN(int pass_n, WordData *word) {
169
338k
  if (pass_n == 1 || !word->word->done) {
170
206k
    if (pass_n == 1) {
171
162k
      word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode,
172
162k
                                      nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model,
173
162k
                                      poly_allow_detailed_fx, word->row, word->block);
174
162k
    } else if (pass_n == 2) {
175
      // TODO(rays) Should we do this on pass1 too?
176
43.6k
      word->word->caps_height = 0.0;
177
43.6k
      if (word->word->x_height == 0.0f) {
178
0
        word->word->x_height = word->row->x_height();
179
0
      }
180
43.6k
    }
181
206k
    word->lang_words.truncate(0);
182
413k
    for (unsigned s = 0; s <= sub_langs_.size(); ++s) {
183
      // The sub_langs_.size() entry is for the master language.
184
206k
      Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
185
206k
      auto *word_res = new WERD_RES;
186
206k
      word_res->InitForRetryRecognition(*word->word);
187
206k
      word->lang_words.push_back(word_res);
188
      // LSTM doesn't get setup for pass2.
189
206k
      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
190
206k
        word_res->SetupForRecognition(
191
206k
            lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr,
192
206k
            lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model,
193
206k
            lang_t->poly_allow_detailed_fx, word->row, word->block);
194
206k
      }
195
206k
    }
196
206k
  }
197
338k
}
198
199
// Runs word recognition on all the words.
200
bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it,
201
27.8k
                                   std::vector<WordData> *words) {
202
  // TODO(rays) Before this loop can be parallelized (it would yield a massive
203
  // speed-up) all remaining member globals need to be converted to local/heap
204
  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
205
  // added. The results will be significantly different with adaption on, and
206
  // deterioration will need investigation.
207
27.8k
  pr_it->restart_page();
208
366k
  for (unsigned w = 0; w < words->size(); ++w) {
209
338k
    WordData *word = &(*words)[w];
210
338k
    if (w > 0) {
211
311k
      word->prev_word = &(*words)[w - 1];
212
311k
    }
213
338k
    if (monitor != nullptr) {
214
0
      monitor->ocr_alive = true;
215
0
      if (pass_n == 1) {
216
0
        monitor->progress = 70 * w / words->size();
217
0
      } else {
218
0
        monitor->progress = 70 + 30 * w / words->size();
219
0
      }
220
0
      if (monitor->progress_callback2 != nullptr) {
221
0
        TBOX box = pr_it->word()->word->bounding_box();
222
0
        (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom());
223
0
      }
224
0
      if (monitor->deadline_exceeded() ||
225
0
          (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) {
226
        // Timeout. Fake out the rest of the words.
227
0
        for (; w < words->size(); ++w) {
228
0
          (*words)[w].word->SetupFake(unicharset);
229
0
        }
230
0
        return false;
231
0
      }
232
0
    }
233
338k
    if (word->word->tess_failed) {
234
283
      unsigned s;
235
420
      for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
236
137
      }
237
      // If all are failed, skip it. Image words are skipped by this test.
238
283
      if (s > word->lang_words.size()) {
239
0
        continue;
240
0
      }
241
283
    }
242
    // Sync pr_it with the WordData.
243
338k
    while (pr_it->word() != nullptr && pr_it->word() != word->word) {
244
0
      pr_it->forward();
245
0
    }
246
338k
    ASSERT_HOST(pr_it->word() != nullptr);
247
338k
    bool make_next_word_fuzzy = false;
248
338k
#ifndef DISABLED_LEGACY_ENGINE
249
338k
    if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
250
      // Needs to be setup again to see the new outlines in the chopped_word.
251
0
      SetupWordPassN(pass_n, word);
252
0
    }
253
338k
#endif // ndef DISABLED_LEGACY_ENGINE
254
255
338k
    classify_word_and_language(pass_n, pr_it, word);
256
338k
    if (tessedit_dump_choices || debug_noise_removal) {
257
0
      tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(),
258
0
              word->word->best_choice->debug_string().c_str());
259
0
    }
260
338k
    pr_it->forward();
261
338k
    if (make_next_word_fuzzy && pr_it->word() != nullptr) {
262
0
      pr_it->MakeCurrentWordFuzzy();
263
0
    }
264
338k
  }
265
27.8k
  return true;
266
27.8k
}
267
268
/**
269
 * recog_all_words()
270
 *
271
 * Walk the page_res, recognizing all the words.
272
 * If monitor is not null, it is used as a progress monitor/timeout/cancel.
273
 * If dopasses is 0, all recognition passes are run,
274
 * 1 just pass 1, 2 passes 2 and higher.
275
 * If target_word_box is not null, special things are done to words that
276
 * overlap the target_word_box:
277
 * if word_config is not null, the word config file is read for just the
278
 * target word(s), otherwise, on pass 2 and beyond ONLY the target words
279
 * are processed (Jetsoft modification.)
280
 * Returns false if we cancelled prematurely.
281
 *
282
 * @param page_res page structure
283
 * @param monitor progress monitor
284
 * @param word_config word_config file
285
 * @param target_word_box specifies just to extract a rectangle
286
 * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher
287
 */
288
289
bool Tesseract::recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor,
290
                                const TBOX *target_word_box, const char *word_config,
291
13.9k
                                int dopasses) {
292
13.9k
  PAGE_RES_IT page_res_it(page_res);
293
294
13.9k
  if (tessedit_minimal_rej_pass1) {
295
0
    tessedit_test_adaption.set_value(true);
296
0
    tessedit_minimal_rejection.set_value(true);
297
0
  }
298
299
13.9k
  if (dopasses == 0 || dopasses == 1) {
300
13.9k
    page_res_it.restart_page();
301
    // ****************** Pass 1 *******************
302
303
13.9k
#ifndef DISABLED_LEGACY_ENGINE
304
    // If the adaptive classifier is full switch to one we prepared earlier,
305
    // ie on the previous page. If the current adaptive classifier is non-empty,
306
    // prepare a backup starting at this page, in case it fills up. Do all this
307
    // independently for each language.
308
13.9k
    if (AdaptiveClassifierIsFull()) {
309
1
      SwitchAdaptiveClassifier();
310
13.9k
    } else if (!AdaptiveClassifierIsEmpty()) {
311
13.0k
      StartBackupAdaptiveClassifier();
312
13.0k
    }
313
    // Now check the sub-langs as well.
314
13.9k
    for (auto &lang : sub_langs_) {
315
0
      if (lang->AdaptiveClassifierIsFull()) {
316
0
        lang->SwitchAdaptiveClassifier();
317
0
      } else if (!lang->AdaptiveClassifierIsEmpty()) {
318
0
        lang->StartBackupAdaptiveClassifier();
319
0
      }
320
0
    }
321
322
13.9k
#endif // ndef DISABLED_LEGACY_ENGINE
323
324
    // Set up all words ready for recognition, so that if parallelism is on
325
    // all the input and output classes are ready to run the classifier.
326
13.9k
    std::vector<WordData> words;
327
13.9k
    SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
328
13.9k
#ifndef DISABLED_LEGACY_ENGINE
329
13.9k
    if (tessedit_parallelize) {
330
0
      PrerecAllWordsPar(words);
331
0
    }
332
13.9k
#endif // ndef DISABLED_LEGACY_ENGINE
333
334
13.9k
    stats_.word_count = words.size();
335
336
13.9k
    stats_.dict_words = 0;
337
13.9k
    stats_.doc_blob_quality = 0;
338
13.9k
    stats_.doc_outline_errs = 0;
339
13.9k
    stats_.doc_char_quality = 0;
340
13.9k
    stats_.good_char_count = 0;
341
13.9k
    stats_.doc_good_char_quality = 0;
342
343
13.9k
    most_recently_used_ = this;
344
    // Run pass 1 word recognition.
345
13.9k
    if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) {
346
0
      return false;
347
0
    }
348
    // Pass 1 post-processing.
349
189k
    for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
350
175k
      if (page_res_it.word()->word->flag(W_REP_CHAR)) {
351
0
        fix_rep_char(&page_res_it);
352
0
        continue;
353
0
      }
354
355
      // Count dict words.
356
175k
      if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) {
357
0
        ++(stats_.dict_words);
358
0
      }
359
360
      // Update misadaption log (we only need to do it on pass 1, since
361
      // adaption only happens on this pass).
362
175k
      if (page_res_it.word()->blamer_bundle != nullptr &&
363
175k
          page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
364
0
        page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug());
365
0
      }
366
175k
    }
367
13.9k
  }
368
369
13.9k
  if (dopasses == 1) {
370
0
    return true;
371
0
  }
372
373
13.9k
#ifndef DISABLED_LEGACY_ENGINE
374
375
  // ****************** Pass 2 *******************
376
13.9k
  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) {
377
13.9k
    page_res_it.restart_page();
378
13.9k
    std::vector<WordData> words;
379
13.9k
    SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
380
13.9k
    if (tessedit_parallelize) {
381
0
      PrerecAllWordsPar(words);
382
0
    }
383
13.9k
    most_recently_used_ = this;
384
    // Run pass 2 word recognition.
385
13.9k
    if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) {
386
0
      return false;
387
0
    }
388
13.9k
  }
389
390
  // The next passes are only required for Tess-only.
391
13.9k
  if (AnyTessLang() && !AnyLSTMLang()) {
392
    // ****************** Pass 3 *******************
393
    // Fix fuzzy spaces.
394
395
0
    if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&
396
0
        !right_to_left()) {
397
0
      fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
398
0
    }
399
400
    // ****************** Pass 4 *******************
401
0
    if (tessedit_enable_dict_correction) {
402
0
      dictionary_correction_pass(page_res);
403
0
    }
404
0
    if (tessedit_enable_bigram_correction) {
405
0
      bigram_correction_pass(page_res);
406
0
    }
407
408
    // ****************** Pass 5,6 *******************
409
0
    rejection_passes(page_res, monitor, target_word_box, word_config);
410
411
    // ****************** Pass 8 *******************
412
0
    font_recognition_pass(page_res);
413
414
    // ****************** Pass 9 *******************
415
    // Check the correctness of the final results.
416
0
    blamer_pass(page_res);
417
0
    script_pos_pass(page_res);
418
0
  }
419
420
13.9k
#endif // ndef DISABLED_LEGACY_ENGINE
421
422
  // Write results pass.
423
  // This is now redundant, but retained commented so show how to obtain
424
  // bounding boxes and style information.
425
426
13.9k
#ifndef DISABLED_LEGACY_ENGINE
427
  // changed by jetsoft
428
  // needed for dll to output memory structure
429
13.9k
  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
430
0
    output_pass(page_res_it, target_word_box);
431
0
  }
432
// end jetsoft
433
13.9k
#endif // ndef DISABLED_LEGACY_ENGINE
434
435
13.9k
  const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
436
13.9k
  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
437
438
  // Remove empty words, as these mess up the result iterators.
439
189k
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
440
175k
    const WERD_RES *word = page_res_it.word();
441
175k
    const POLY_BLOCK *pb = page_res_it.block()->block != nullptr
442
175k
                               ? page_res_it.block()->block->pdblk.poly_block()
443
175k
                               : nullptr;
444
175k
    if (word->best_choice == nullptr || word->best_choice->empty() ||
445
175k
        (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
446
498
      page_res_it.DeleteCurrentWord();
447
498
    }
448
175k
  }
449
450
13.9k
  if (monitor != nullptr) {
451
0
    monitor->progress = 100;
452
0
  }
453
13.9k
  return true;
454
13.9k
}
455
456
#ifndef DISABLED_LEGACY_ENGINE
457
458
0
// Walks consecutive word pairs on the page and, when the pair of top choices
// is not a valid bigram, looks through the alternative choices of both words
// for a pair that is. The valid pair with the lowest summed rating replaces
// the best choices, unless it only differs from them in case or terminal
// punctuation. Pairs with different unicharsets or containing a repeated-char
// word are left alone. tessedit_bigram_debug controls the amount of logging.
void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
  PAGE_RES_IT word_it(page_res);

  WERD_RES *w_prev = nullptr;
  WERD_RES *w = word_it.word();
  while (true) {
    w_prev = w;
    while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) {
      // advance word_it, skipping over parts of combos
    }
    if (!word_it.word()) {
      break;
    }
    w = word_it.word();
    if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
      continue;
    }
    if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
      if (tessedit_bigram_debug) {
        tprintf("Skipping because one of the words is W_REP_CHAR\n");
      }
      continue;
    }
    // Two words sharing the same language model, excellent!
    // overrides_word1[i] / overrides_word2[i] together form the i-th valid
    // bigram, so the two vectors always have the same length.
    std::vector<WERD_CHOICE *> overrides_word1;
    std::vector<WERD_CHOICE *> overrides_word2;

    const auto &orig_w1_str = w_prev->best_choice->unichar_string();
    const auto &orig_w2_str = w->best_choice->unichar_string();
    // Compare the words with superscripts removed.
    WERD_CHOICE prev_best(w->uch_set);
    {
      int w1start, w1end;
      w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
      prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
    }
    WERD_CHOICE this_best(w->uch_set);
    {
      int w2start, w2end;
      w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
      this_best = w->best_choice->shallow_copy(w2start, w2end);
    }

    if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
      if (tessedit_bigram_debug) {
        tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(),
                orig_w2_str.c_str());
      }
      continue;
    }
    if (tessedit_bigram_debug > 2) {
      tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str());
    }
    if (tessedit_bigram_debug > 1) {
      if (!w_prev->best_choices.singleton()) {
        w_prev->PrintBestChoices();
      }
      if (!w->best_choices.singleton()) {
        w->PrintBestChoices();
      }
    }
    // Try every combination of alternatives, remembering the valid bigram
    // with the lowest summed rating.
    float best_rating = 0.0;
    int best_idx = 0;
    WERD_CHOICE_IT prev_it(&w_prev->best_choices);
    for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
      WERD_CHOICE *p1 = prev_it.data();
      WERD_CHOICE strip1(w->uch_set);
      {
        int p1start, p1end;
        p1->GetNonSuperscriptSpan(&p1start, &p1end);
        strip1 = p1->shallow_copy(p1start, p1end);
      }
      WERD_CHOICE_IT w_it(&w->best_choices);
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD_CHOICE *p2 = w_it.data();
        WERD_CHOICE strip2(w->uch_set);
        {
          int p2start, p2end;
          p2->GetNonSuperscriptSpan(&p2start, &p2end);
          strip2 = p2->shallow_copy(p2start, p2end);
        }
        if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
          overrides_word1.push_back(p1);
          overrides_word2.push_back(p2);
          if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) {
            best_rating = p1->rating() + p2->rating();
            best_idx = static_cast<int>(overrides_word1.size()) - 1;
          }
        }
      }
    }
    if (!overrides_word1.empty()) {
      // Excellent, we have some bigram matches.
      if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) &&
          EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) {
        if (tessedit_bigram_debug > 1) {
          tprintf(
              "Top choice \"%s %s\" verified (sans case) by bigram "
              "model.\n",
              orig_w1_str.c_str(), orig_w2_str.c_str());
        }
        continue;
      }
      const auto &new_w1_str = overrides_word1[best_idx]->unichar_string();
      const auto &new_w2_str = overrides_word2[best_idx]->unichar_string();
      if (new_w1_str != orig_w1_str) {
        w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
      }
      if (new_w2_str != orig_w2_str) {
        w->ReplaceBestChoice(overrides_word2[best_idx]);
      }
      if (tessedit_bigram_debug > 0) {
        std::string choices_description;
        // One entry per valid bigram; the vectors are parallel, so the count
        // is the length of either one (not the product of both lengths).
        const auto num_bigram_choices = overrides_word1.size();
        if (num_bigram_choices == 1) {
          choices_description = "This was the unique bigram choice.";
        } else {
          if (tessedit_bigram_debug > 1) {
            std::string bigrams_list;
            const unsigned kMaxChoicesToPrint = 20;
            for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) {
              if (i > 0) {
                bigrams_list += ", ";
              }
              WERD_CHOICE *p1 = overrides_word1[i];
              WERD_CHOICE *p2 = overrides_word2[i];
              bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
            }
            choices_description = "There were many choices: {";
            choices_description += bigrams_list;
            choices_description += "}";
          } else {
            choices_description += "There were " + std::to_string(num_bigram_choices);
            choices_description += " compatible bigrams.";
          }
        }
        tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(),
                orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(),
                choices_description.c_str());
      }
    }
  }
}
600
601
// Pass 5: walks all words, accumulating reject and quality statistics into
// stats_ and optionally rejecting words of bad quality.
// Pass 6: runs quality_based_rejection using the document-level quality
// verdict derived from those statistics.
// Both passes are skipped while tessedit_test_adaption is set.
// If monitor is not null its progress is advanced from 95 upwards.
// If target_word_box is not null, ProcessTargetWord decides which words are
// included in the statistics.
void Tesseract::rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor,
                                 const TBOX *target_word_box, const char *word_config) {
  PAGE_RES_IT page_res_it(page_res);
  // ****************** Pass 5 *******************
  // Gather statistics on rejects.
  int word_index = 0;
  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
    WERD_RES *word = page_res_it.word();
    word_index++;
    if (monitor != nullptr) {
      monitor->ocr_alive = true;
      // stats_.word_count is assigned in the pass-1 branch of recog_all_words,
      // so it may still be zero if that pass was not run; avoid an integer
      // division by zero and leave the progress value as it was in that case.
      if (stats_.word_count > 0) {
        monitor->progress = 95 + 5 * word_index / stats_.word_count;
      }
    }
    if (word->rebuild_word == nullptr) {
      // Word was not processed by tesseract.
      page_res_it.forward();
      continue;
    }
    check_debug_pt(word, 70);

    // changed by jetsoft
    // specific to its needs to extract one word when need
    if (target_word_box &&
        !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) {
      page_res_it.forward();
      continue;
    }
    // end jetsoft

    page_res_it.rej_stat_word();
    const int chars_in_word = word->reject_map.length();
    const int rejects_in_word = word->reject_map.reject_count();

    const int blob_quality = word_blob_quality(word);
    stats_.doc_blob_quality += blob_quality;
    const int outline_errs = word_outline_errs(word);
    stats_.doc_outline_errs += outline_errs;
    int16_t all_char_quality;
    int16_t accepted_all_char_quality;
    word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
    stats_.doc_char_quality += all_char_quality;
    // Only words found via one of the dawg permuters contribute to the
    // "good char" statistics.
    const uint8_t permuter_type = word->best_choice->permuter();
    if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) ||
        (permuter_type == USER_DAWG_PERM)) {
      stats_.good_char_count += chars_in_word - rejects_in_word;
      stats_.doc_good_char_quality += accepted_all_char_quality;
    }
    check_debug_pt(word, 80);
    if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {
      word->reject_map.rej_word_bad_quality();
    }
    check_debug_pt(word, 90);
    page_res_it.forward();
  }

  if (tessedit_debug_quality_metrics) {
    tprintf(
        "QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f"
        " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
        page_res->char_count, page_res->rej_count,
        page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality,
        stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs,
        stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality,
        stats_.doc_char_quality / static_cast<float>(page_res->char_count),
        stats_.doc_good_char_quality,
        (stats_.good_char_count > 0)
            ? (stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count))
            : 0.0);
  }
  // The document counts as good quality only if all four per-character
  // ratios are within their configured limits.
  bool good_quality_doc =
      ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) &&
      (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) &&
      (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) &&
      (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc);

  // ****************** Pass 6 *******************
  // Do whole document or whole block rejection pass
  if (!tessedit_test_adaption) {
    quality_based_rejection(page_res_it, good_quality_doc);
  }
}
682
683
#endif // ndef DISABLED_LEGACY_ENGINE
684
685
0
// When wordrec_run_blamer is set, assigns a final blame to every word, tallies
// the blame reasons in page_res and prints the tally followed by the
// misadaption log (if it has any entries).
void Tesseract::blamer_pass(PAGE_RES *page_res) {
  if (!wordrec_run_blamer) {
    return;
  }
  PAGE_RES_IT page_res_it(page_res);
  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    WERD_RES *word = page_res_it.word();
    BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
    ++page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()];
    page_res_it.forward();
  }

  tprintf("Blame reasons:\n");
  for (int reason = 0; reason < IRR_NUM_REASONS; ++reason) {
    const auto irr = static_cast<IncorrectResultReason>(reason);
    tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(irr), page_res->blame_reasons[reason]);
  }

  if (!page_res->misadaption_log.empty()) {
    tprintf("Misadaption log:\n");
    for (auto &entry : page_res->misadaption_log) {
      tprintf("%s\n", entry.c_str());
    }
  }
}
707
708
// Sets script positions and detects smallcaps on all output words.
709
0
void Tesseract::script_pos_pass(PAGE_RES *page_res) {
710
0
  PAGE_RES_IT page_res_it(page_res);
711
0
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
712
0
    WERD_RES *word = page_res_it.word();
713
0
    if (word->word->flag(W_REP_CHAR)) {
714
0
      page_res_it.forward();
715
0
      continue;
716
0
    }
717
0
    const float x_height = page_res_it.block()->block->x_height();
718
0
    float word_x_height = word->x_height;
719
0
    if (word_x_height < word->best_choice->min_x_height() ||
720
0
        word_x_height > word->best_choice->max_x_height()) {
721
0
      word_x_height =
722
0
          (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f;
723
0
    }
724
    // Test for small caps. Word capheight must be close to block xheight,
725
    // and word must contain no lower case letters, and at least one upper case.
726
0
    const double small_cap_xheight = x_height * kXHeightCapRatio;
727
0
    const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
728
0
    if (word->uch_set->script_has_xheight() &&
729
0
        small_cap_xheight - small_cap_delta <= word_x_height &&
730
0
        word_x_height <= small_cap_xheight + small_cap_delta) {
731
      // Scan for upper/lower.
732
0
      int num_upper = 0;
733
0
      int num_lower = 0;
734
0
      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
735
0
        if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
736
0
          ++num_upper;
737
0
        } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
738
0
          ++num_lower;
739
0
        }
740
0
      }
741
0
      if (num_upper > 0 && num_lower == 0) {
742
0
        word->small_caps = true;
743
0
      }
744
0
    }
745
0
    word->SetScriptPositions();
746
0
  }
747
0
}
748
749
// Helper finds the gap between the index word and the next.
750
438k
static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) {
751
438k
  *right = -INT32_MAX;
752
438k
  *next_left = INT32_MAX;
753
438k
  if (index < words.size()) {
754
219k
    *right = words[index]->word->bounding_box().right();
755
219k
    if (index + 1 < words.size()) {
756
12.9k
      *next_left = words[index + 1]->word->bounding_box().left();
757
12.9k
    }
758
219k
  }
759
438k
}
760
761
// Factored helper computes the rating, certainty, badness and validity of
762
// the permuter of the words in [first_index, end_index).
763
static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index,
764
438k
                             float *rating, float *certainty, bool *bad, bool *valid_permuter) {
765
438k
  if (end_index <= first_index) {
766
219k
    *bad = true;
767
219k
    *valid_permuter = false;
768
219k
  }
769
658k
  for (unsigned index = first_index; index < end_index && index < words.size(); ++index) {
770
219k
    WERD_CHOICE *choice = words[index]->best_choice;
771
219k
    if (choice == nullptr) {
772
0
      *bad = true;
773
219k
    } else {
774
219k
      *rating += choice->rating();
775
219k
      *certainty = std::min(*certainty, choice->certainty());
776
219k
      if (!Dict::valid_word_permuter(choice->permuter(), false)) {
777
69.8k
        *valid_permuter = false;
778
69.8k
      }
779
219k
    }
780
219k
  }
781
438k
}
782
783
// Helper chooses the best combination of words, transferring good ones from
784
// new_words to best_words. To win, a new word must have (better rating and
785
// certainty) or (better permuter status and rating within rating ratio and
786
// certainty within certainty margin) than current best.
787
// All the new_words are consumed (moved to best_words or deleted.)
788
// The return value is the number of new_words used minus the number of
789
// best_words that remain in the output.
790
static int SelectBestWords(double rating_ratio, double certainty_margin, bool debug,
791
                           PointerVector<WERD_RES> *new_words,
792
206k
                           PointerVector<WERD_RES> *best_words) {
793
  // Process the smallest groups of words that have an overlapping word
794
  // boundary at the end.
795
206k
  std::vector<WERD_RES *> out_words;
796
  // Index into each word vector (best, new).
797
206k
  unsigned b = 0, n = 0;
798
206k
  int num_best = 0, num_new = 0;
799
425k
  while (b < best_words->size() || n < new_words->size()) {
800
    // Start of the current run in each.
801
219k
    auto start_b = b, start_n = n;
802
219k
    while (b < best_words->size() || n < new_words->size()) {
803
219k
      int b_right = -INT32_MAX;
804
219k
      int next_b_left = INT32_MAX;
805
219k
      WordGap(*best_words, b, &b_right, &next_b_left);
806
219k
      int n_right = -INT32_MAX;
807
219k
      int next_n_left = INT32_MAX;
808
219k
      WordGap(*new_words, n, &n_right, &next_n_left);
809
219k
      if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
810
        // The word breaks overlap. [start_b,b] and [start_n, n] match.
811
219k
        break;
812
219k
      }
813
      // Keep searching for the matching word break.
814
128
      if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) {
815
0
        ++b;
816
128
      } else {
817
128
        ++n;
818
128
      }
819
128
    }
820
    // Rating of the current run in each.
821
219k
    float b_rating = 0.0f, n_rating = 0.0f;
822
    // Certainty of the current run in each.
823
219k
    float b_certainty = 0.0f, n_certainty = 0.0f;
824
    // True if any word is missing its best choice.
825
219k
    bool b_bad = false, n_bad = false;
826
    // True if all words have a valid permuter.
827
219k
    bool b_valid_permuter = true, n_valid_permuter = true;
828
219k
    const int end_b = b < best_words->size() ? b + 1 : b;
829
219k
    const int end_n = n < new_words->size() ? n + 1 : n;
830
219k
    EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad,
831
219k
                     &b_valid_permuter);
832
219k
    EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad,
833
219k
                     &n_valid_permuter);
834
219k
    bool new_better = false;
835
219k
    if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) ||
836
219k
                   (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio &&
837
219k
                    n_certainty > b_certainty - certainty_margin))) {
838
      // New is better.
839
438k
      for (int i = start_n; i < end_n; ++i) {
840
219k
        out_words.push_back((*new_words)[i]);
841
219k
        (*new_words)[i] = nullptr;
842
219k
        ++num_new;
843
219k
      }
844
219k
      new_better = true;
845
219k
    } else if (!b_bad) {
846
      // Current best is better.
847
0
      for (int i = start_b; i < end_b; ++i) {
848
0
        out_words.push_back((*best_words)[i]);
849
0
        (*best_words)[i] = nullptr;
850
0
        ++num_best;
851
0
      }
852
0
    }
853
219k
    if (debug) {
854
0
      tprintf(
855
0
          "%d new words %s than %d old words: r: %g v %g c: %g v %g"
856
0
          " valid dict: %d v %d\n",
857
0
          end_n - start_n, new_better ? "better" : "worse", end_b - start_b, n_rating, b_rating,
858
0
          n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
859
0
    }
860
    // Move on to the next group.
861
219k
    b = end_b;
862
219k
    n = end_n;
863
219k
  }
864
  // Transfer from out_words to best_words.
865
206k
  best_words->clear();
866
219k
  for (auto &out_word : out_words) {
867
219k
    best_words->push_back(out_word);
868
219k
  }
869
206k
  return num_new - num_best;
870
206k
}
871
872
// Helper to recognize the word using the given (language-specific) tesseract.
873
// Returns positive if this recognizer found more new best words than the
874
// number kept from best_words.
875
int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug,
876
206k
                                 WERD_RES **in_word, PointerVector<WERD_RES> *best_words) {
877
206k
  if (debug) {
878
0
    tprintf("Trying word using lang %s, oem %d\n", lang.c_str(),
879
0
            static_cast<int>(tessedit_ocr_engine_mode));
880
0
  }
881
  // Run the recognizer on the word.
882
206k
  PointerVector<WERD_RES> new_words;
883
206k
  (this->*recognizer)(word_data, in_word, &new_words);
884
206k
  if (new_words.empty()) {
885
    // Transfer input word to new_words, as the classifier must have put
886
    // the result back in the input.
887
89.8k
    new_words.push_back(*in_word);
888
89.8k
    *in_word = nullptr;
889
89.8k
  }
890
206k
  if (debug) {
891
0
    for (unsigned i = 0; i < new_words.size(); ++i) {
892
0
      new_words[i]->DebugTopChoice("Lang result");
893
0
    }
894
0
  }
895
  // Initial version is a bit of a hack based on better certainty and rating
896
  // or a dictionary vs non-dictionary word.
897
206k
  return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug,
898
206k
                         &new_words, best_words);
899
206k
}
900
901
// Helper returns true if all the words are acceptable.
902
397k
static bool WordsAcceptable(const PointerVector<WERD_RES> &words) {
903
413k
  for (unsigned w = 0; w < words.size(); ++w) {
904
397k
    if (words[w]->tess_failed || !words[w]->tess_accepted) {
905
382k
      return false;
906
382k
    }
907
397k
  }
908
15.3k
  return true;
909
397k
}
910
911
#ifndef DISABLED_LEGACY_ENGINE
912
913
// Moves good-looking "noise"/diacritics from the reject list to the main
914
// blob list on the current word. Returns true if anything was done, and
915
// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
916
0
bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) {
917
0
  *make_next_word_fuzzy = false;
918
0
  WERD *real_word = pr_it->word()->word;
919
0
  if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() ||
920
0
      real_word->rej_cblob_list()->length() > noise_maxperword) {
921
0
    return false;
922
0
  }
923
0
  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
924
  // Get the noise outlines into a vector with matching bool map.
925
0
  std::vector<C_OUTLINE *> outlines;
926
0
  real_word->GetNoiseOutlines(&outlines);
927
0
  std::vector<bool> word_wanted;
928
0
  std::vector<bool> overlapped_any_blob;
929
0
  std::vector<C_BLOB *> target_blobs;
930
0
  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
931
0
                                     &overlapped_any_blob, &target_blobs);
932
  // Filter the outlines that overlapped any blob and put them into the word
933
  // now. This simplifies the remaining task and also makes it more accurate
934
  // as it has more completed blobs to work on.
935
0
  std::vector<bool> wanted;
936
0
  std::vector<C_BLOB *> wanted_blobs;
937
0
  std::vector<C_OUTLINE *> wanted_outlines;
938
0
  int num_overlapped = 0;
939
0
  int num_overlapped_used = 0;
940
0
  for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) {
941
0
    if (overlapped_any_blob[i]) {
942
0
      ++num_overlapped;
943
0
      if (word_wanted[i]) {
944
0
        ++num_overlapped_used;
945
0
      }
946
0
      wanted.push_back(word_wanted[i]);
947
0
      wanted_blobs.push_back(target_blobs[i]);
948
0
      wanted_outlines.push_back(outlines[i]);
949
0
      outlines[i] = nullptr;
950
0
    }
951
0
  }
952
0
  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
953
0
  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs);
954
  // TODO: check code.
955
0
  int non_overlapped = 0;
956
0
  int non_overlapped_used = 0;
957
0
  for (unsigned i = 0; i < word_wanted.size(); ++i) {
958
0
    if (word_wanted[i]) {
959
0
      ++non_overlapped_used;
960
0
    }
961
0
    if (outlines[i] != nullptr) {
962
0
      ++non_overlapped_used;
963
0
    }
964
0
  }
965
0
  if (debug_noise_removal) {
966
0
    tprintf("Used %d/%d overlapped %d/%d non-overlapped diacritics on word:", num_overlapped_used,
967
0
            num_overlapped, non_overlapped_used, non_overlapped);
968
0
    real_word->bounding_box().print();
969
0
  }
970
  // Now we have decided which outlines we want, put them into the real_word.
971
0
  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) {
972
0
    pr_it->MakeCurrentWordFuzzy();
973
0
  }
974
  // TODO(rays) Parts of combos have a deep copy of the real word, and need
975
  // to have their noise outlines moved/assigned in the same way!!
976
0
  return num_overlapped_used != 0 || non_overlapped_used != 0;
977
0
}
978
979
// Attempts to put noise/diacritic outlines into the blobs that they overlap.
980
// Input: a set of noisy outlines that probably belong to the real_word.
981
// Output: word_wanted indicates which outlines are to be assigned to a blob,
982
//   target_blobs indicates which to assign to, and overlapped_any_blob is
983
//   true for all outlines that overlapped a blob.
984
void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,
985
                                                   int pass, WERD *real_word, PAGE_RES_IT *pr_it,
986
                                                   std::vector<bool> *word_wanted,
987
                                                   std::vector<bool> *overlapped_any_blob,
988
0
                                                   std::vector<C_BLOB *> *target_blobs) {
989
0
  std::vector<bool> blob_wanted;
990
0
  word_wanted->clear();
991
0
  word_wanted->resize(outlines.size());
992
0
  overlapped_any_blob->clear();
993
0
  overlapped_any_blob->resize(outlines.size());
994
0
  target_blobs->clear();
995
0
  target_blobs->resize(outlines.size());
996
  // For each real blob, find the outlines that seriously overlap it.
997
  // A single blob could be several merged characters, so there can be quite
998
  // a few outlines overlapping, and the full engine needs to be used to chop
999
  // and join to get a sensible result.
1000
0
  C_BLOB_IT blob_it(real_word->cblob_list());
1001
0
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1002
0
    C_BLOB *blob = blob_it.data();
1003
0
    const TBOX blob_box = blob->bounding_box();
1004
0
    blob_wanted.clear();
1005
0
    blob_wanted.resize(outlines.size());
1006
0
    int num_blob_outlines = 0;
1007
0
    for (unsigned i = 0; i < outlines.size(); ++i) {
1008
0
      if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) {
1009
0
        blob_wanted[i] = true;
1010
0
        (*overlapped_any_blob)[i] = true;
1011
0
        ++num_blob_outlines;
1012
0
      }
1013
0
    }
1014
0
    if (debug_noise_removal) {
1015
0
      tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1016
0
      blob_box.print();
1017
0
    }
1018
    // If any outlines overlap the blob, and not too many, classify the blob
1019
    // (using the full engine, languages and all), and choose the maximal
1020
    // combination of outlines that doesn't hurt the end-result classification
1021
    // by too much. Mark them as wanted.
1022
0
    if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1023
0
      if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines,
1024
0
                                      num_blob_outlines, &blob_wanted)) {
1025
0
        for (unsigned i = 0; i < blob_wanted.size(); ++i) {
1026
0
          if (blob_wanted[i]) {
1027
            // Claim the outline and record where it is going.
1028
0
            (*word_wanted)[i] = true;
1029
0
            (*target_blobs)[i] = blob;
1030
0
          }
1031
0
        }
1032
0
      }
1033
0
    }
1034
0
  }
1035
0
}
1036
1037
// Attempts to assign non-overlapping outlines to their nearest blobs or
1038
// make new blobs out of them.
1039
void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
1040
                                           WERD *real_word, PAGE_RES_IT *pr_it,
1041
                                           std::vector<bool> *word_wanted,
1042
0
                                           std::vector<C_BLOB *> *target_blobs) {
1043
0
  std::vector<bool> blob_wanted;
1044
0
  word_wanted->clear();
1045
0
  word_wanted->resize(outlines.size());
1046
0
  target_blobs->clear();
1047
0
  target_blobs->resize(outlines.size());
1048
  // Check for outlines that need to be turned into stand-alone blobs.
1049
0
  for (unsigned i = 0; i < outlines.size(); ++i) {
1050
0
    if (outlines[i] == nullptr) {
1051
0
      continue;
1052
0
    }
1053
    // Get a set of adjacent outlines that don't overlap any existing blob.
1054
0
    blob_wanted.clear();
1055
0
    blob_wanted.resize(outlines.size());
1056
0
    int num_blob_outlines = 0;
1057
0
    TBOX total_ol_box(outlines[i]->bounding_box());
1058
0
    while (i < outlines.size() && outlines[i] != nullptr) {
1059
0
      blob_wanted[i] = true;
1060
0
      total_ol_box += outlines[i]->bounding_box();
1061
0
      ++i;
1062
0
      ++num_blob_outlines;
1063
0
    }
1064
    // Find the insertion point.
1065
0
    C_BLOB_IT blob_it(real_word->cblob_list());
1066
0
    while (!blob_it.at_last() &&
1067
0
           blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) {
1068
0
      blob_it.forward();
1069
0
    }
1070
    // Choose which combination of them we actually want and where to put
1071
    // them.
1072
0
    if (debug_noise_removal) {
1073
0
      tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1074
0
    }
1075
0
    C_BLOB *left_blob = blob_it.data();
1076
0
    TBOX left_box = left_blob->bounding_box();
1077
0
    C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1078
0
    if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1079
0
         !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1080
0
        SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines,
1081
0
                                    num_blob_outlines, &blob_wanted)) {
1082
0
      if (debug_noise_removal) {
1083
0
        tprintf("Added to left blob\n");
1084
0
      }
1085
0
      for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1086
0
        if (blob_wanted[j]) {
1087
0
          (*word_wanted)[j] = true;
1088
0
          (*target_blobs)[j] = left_blob;
1089
0
        }
1090
0
      }
1091
0
    } else if (right_blob != nullptr &&
1092
0
               (!left_box.x_overlap(total_ol_box) ||
1093
0
                right_blob->bounding_box().x_overlap(total_ol_box)) &&
1094
0
               SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines,
1095
0
                                           num_blob_outlines, &blob_wanted)) {
1096
0
      if (debug_noise_removal) {
1097
0
        tprintf("Added to right blob\n");
1098
0
      }
1099
0
      for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1100
0
        if (blob_wanted[j]) {
1101
0
          (*word_wanted)[j] = true;
1102
0
          (*target_blobs)[j] = right_blob;
1103
0
        }
1104
0
      }
1105
0
    } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines,
1106
0
                                           num_blob_outlines, &blob_wanted)) {
1107
0
      if (debug_noise_removal) {
1108
0
        tprintf("Fitted between blobs\n");
1109
0
      }
1110
0
      for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1111
0
        if (blob_wanted[j]) {
1112
0
          (*word_wanted)[j] = true;
1113
0
          (*target_blobs)[j] = nullptr;
1114
0
        }
1115
0
      }
1116
0
    }
1117
0
  }
1118
0
}
1119
1120
// Starting with ok_outlines set to indicate which outlines overlap the blob,
1121
// chooses the optimal set (approximately) and returns true if any outlines
1122
// are desired, in which case ok_outlines indicates which ones.
1123
bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
1124
                                            C_BLOB *blob,
1125
                                            const std::vector<C_OUTLINE *> &outlines,
1126
0
                                            int num_outlines, std::vector<bool> *ok_outlines) {
1127
0
  float target_cert = certainty_threshold;
1128
0
  if (blob != nullptr) {
1129
0
    std::string best_str;
1130
0
    float target_c2;
1131
0
    target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2);
1132
0
    if (debug_noise_removal) {
1133
0
      tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert,
1134
0
              target_c2);
1135
0
      blob->bounding_box().print();
1136
0
    }
1137
0
    target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1138
0
  }
1139
0
  std::vector<bool> test_outlines = *ok_outlines;
1140
  // Start with all the outlines in.
1141
0
  std::string all_str;
1142
0
  std::vector<bool> best_outlines = *ok_outlines;
1143
0
  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str);
1144
0
  if (debug_noise_removal) {
1145
0
    TBOX ol_box;
1146
0
    for (unsigned i = 0; i < test_outlines.size(); ++i) {
1147
0
      if (test_outlines[i]) {
1148
0
        ol_box += outlines[i]->bounding_box();
1149
0
      }
1150
0
    }
1151
0
    tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert,
1152
0
            best_cert - target_cert);
1153
0
    ol_box.print();
1154
0
  }
1155
  // Iteratively zero out the bit that improves the certainty the most, until
1156
  // we get past the threshold, have zero bits, or fail to improve.
1157
0
  int best_index = 0; // To zero out.
1158
0
  while (num_outlines > 1 && best_index >= 0 &&
1159
0
         (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1160
    // Find the best bit to zero out.
1161
0
    best_index = -1;
1162
0
    for (unsigned i = 0; i < outlines.size(); ++i) {
1163
0
      if (test_outlines[i]) {
1164
0
        test_outlines[i] = false;
1165
0
        std::string str;
1166
0
        float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str);
1167
0
        if (debug_noise_removal) {
1168
0
          TBOX ol_box;
1169
0
          for (unsigned j = 0; j < outlines.size(); ++j) {
1170
0
            if (test_outlines[j]) {
1171
0
              ol_box += outlines[j]->bounding_box();
1172
0
            }
1173
0
            tprintf("%c", test_outlines[j] ? 'T' : 'F');
1174
0
          }
1175
0
          tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert,
1176
0
                  cert - target_cert);
1177
0
          ol_box.print();
1178
0
        }
1179
0
        if (cert > best_cert) {
1180
0
          best_cert = cert;
1181
0
          best_index = i;
1182
0
          best_outlines = test_outlines;
1183
0
        }
1184
0
        test_outlines[i] = true;
1185
0
      }
1186
0
    }
1187
0
    if (best_index >= 0) {
1188
0
      test_outlines[best_index] = false;
1189
0
      --num_outlines;
1190
0
    }
1191
0
  }
1192
0
  if (best_cert >= target_cert) {
1193
    // Save the best combination.
1194
0
    *ok_outlines = best_outlines;
1195
0
    if (debug_noise_removal) {
1196
0
      tprintf("%s noise combination ", blob ? "Adding" : "New");
1197
0
      for (auto &&best_outline : best_outlines) {
1198
0
        tprintf("%c", best_outline ? 'T' : 'F');
1199
0
      }
1200
0
      tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert);
1201
0
    }
1202
0
    return true;
1203
0
  }
1204
1205
0
  return false;
1206
0
}
1207
1208
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
1209
// the inclusion of the outlines, and returns the certainty of the raw choice.
1210
float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
1211
                                          const std::vector<C_OUTLINE *> &outlines, int pass_n,
1212
0
                                          PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {
1213
0
  C_OUTLINE_IT ol_it;
1214
0
  C_OUTLINE *first_to_keep = nullptr;
1215
0
  C_BLOB *local_blob = nullptr;
1216
0
  if (blob != nullptr) {
1217
    // Add the required outlines to the blob.
1218
0
    ol_it.set_to_list(blob->out_list());
1219
0
    first_to_keep = ol_it.data();
1220
0
  }
1221
0
  for (unsigned i = 0; i < ok_outlines.size(); ++i) {
1222
0
    if (ok_outlines[i]) {
1223
      // This outline is to be added.
1224
0
      if (blob == nullptr) {
1225
0
        local_blob = new C_BLOB(outlines[i]);
1226
0
        blob = local_blob;
1227
0
        ol_it.set_to_list(blob->out_list());
1228
0
      } else {
1229
0
        ol_it.add_before_stay_put(outlines[i]);
1230
0
      }
1231
0
    }
1232
0
  }
1233
0
  float c2;
1234
0
  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1235
0
  ol_it.move_to_first();
1236
0
  if (first_to_keep == nullptr) {
1237
    // We created blob. Empty its outlines and delete it.
1238
0
    for (; !ol_it.empty(); ol_it.forward()) {
1239
0
      ol_it.extract();
1240
0
    }
1241
0
    delete local_blob;
1242
0
    cert = -c2;
1243
0
  } else {
1244
    // Remove the outlines that we put in.
1245
0
    for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1246
0
      ol_it.extract();
1247
0
    }
1248
0
  }
1249
0
  return cert;
1250
0
}
1251
1252
// Classifies the given blob (part of word_data->word->word) as an individual
1253
// word, using languages, chopper etc, returning only the certainty of the
1254
// best raw choice, and undoing all the work done to fake out the word.
1255
float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str,
1256
0
                                    float *c2) {
1257
0
  WERD *real_word = pr_it->word()->word;
1258
0
  WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL),
1259
0
                                                  C_BLOB::deep_copy(blob));
1260
0
  WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1261
  // Get a new iterator that points to the new word.
1262
0
  PAGE_RES_IT it(pr_it->page_res);
1263
0
  while (it.word() != word_res && it.word() != nullptr) {
1264
0
    it.forward();
1265
0
  }
1266
0
  ASSERT_HOST(it.word() == word_res);
1267
0
  WordData wd(it);
1268
  // Force full initialization.
1269
0
  SetupWordPassN(1, &wd);
1270
0
  classify_word_and_language(pass_n, &it, &wd);
1271
0
  if (debug_noise_removal) {
1272
0
    if (wd.word->raw_choice != nullptr) {
1273
0
      tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(),
1274
0
              wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height());
1275
0
    } else {
1276
0
      tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1277
0
              wd.row->x_height());
1278
0
    }
1279
0
  }
1280
0
  float cert = 0.0f;
1281
0
  if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1282
0
    cert = wd.word->raw_choice->certainty();
1283
0
    float rat = wd.word->raw_choice->rating();
1284
0
    *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1285
0
    best_str = wd.word->raw_choice->unichar_string();
1286
0
  } else {
1287
0
    *c2 = 0.0f;
1288
0
    best_str.clear();
1289
0
  }
1290
0
  it.DeleteCurrentWord();
1291
0
  pr_it->ResetWordIterator();
1292
0
  return cert;
1293
0
}
1294
1295
#endif // ndef DISABLED_LEGACY_ENGINE
1296
1297
// Generic function for classifying a word. Can be used either for pass1 or
1298
// pass2 according to the function passed to recognizer.
1299
// word_data holds the word to be recognized, and its block and row, and
1300
// pr_it points to the word as well, in case we are running LSTM and it wants
1301
// to output multiple words.
1302
// Recognizes in the current language, and if successful that is all.
1303
// If recognition was not successful, tries all available languages until
1304
// it gets a successful result or runs out of languages. Keeps the best result.
1305
338k
void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) {
1306
#ifdef DISABLED_LEGACY_ENGINE
1307
  WordRecognizer recognizer = &Tesseract::classify_word_pass1;
1308
#else
1309
338k
  WordRecognizer recognizer =
1310
338k
      pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2;
1311
338k
#endif // def DISABLED_LEGACY_ENGINE
1312
1313
  // Best result so far.
1314
338k
  PointerVector<WERD_RES> best_words;
1315
  // Points to the best result. May be word or in lang_words.
1316
338k
  const WERD_RES *word = word_data->word;
1317
338k
  clock_t total_time = 0;
1318
338k
  const bool timing_debug = tessedit_timing_debug;
1319
338k
  if (timing_debug) {
1320
0
    total_time = clock();
1321
0
  }
1322
338k
  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1323
338k
  if (debug) {
1324
0
    tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing",
1325
0
            most_recently_used_->lang.c_str());
1326
0
    word->word->bounding_box().print();
1327
0
  }
1328
338k
  if (word->done) {
1329
    // If done on pass1, leave it as-is.
1330
132k
    if (!word->tess_failed) {
1331
132k
      most_recently_used_ = word->tesseract;
1332
132k
    }
1333
132k
    return;
1334
132k
  }
1335
206k
  auto sub = sub_langs_.size();
1336
206k
  if (most_recently_used_ != this) {
1337
    // Get the index of the most_recently_used_.
1338
0
    for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {
1339
0
    }
1340
0
  }
1341
206k
  most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub],
1342
206k
                                         &best_words);
1343
206k
  Tesseract *best_lang_tess = most_recently_used_;
1344
206k
  if (!WordsAcceptable(best_words)) {
1345
    // Try all the other languages to see if they are any better.
1346
191k
    if (most_recently_used_ != this &&
1347
191k
        this->RetryWithLanguage(*word_data, recognizer, debug,
1348
0
                                &word_data->lang_words[sub_langs_.size()], &best_words) > 0) {
1349
0
      best_lang_tess = this;
1350
0
    }
1351
191k
    for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) {
1352
0
      if (most_recently_used_ != sub_langs_[i] &&
1353
0
          sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i],
1354
0
                                           &best_words) > 0) {
1355
0
        best_lang_tess = sub_langs_[i];
1356
0
      }
1357
0
    }
1358
191k
  }
1359
206k
  most_recently_used_ = best_lang_tess;
1360
206k
  if (!best_words.empty()) {
1361
206k
    if (best_words.size() == 1 && !best_words[0]->combination) {
1362
      // Move the best single result to the main word.
1363
89.8k
      word_data->word->ConsumeWordResults(best_words[0]);
1364
116k
    } else {
1365
      // Words came from LSTM, and must be moved to the PAGE_RES properly.
1366
116k
      word_data->word = best_words.back();
1367
116k
      pr_it->ReplaceCurrentWord(&best_words);
1368
116k
    }
1369
206k
    ASSERT_HOST(word_data->word->box_word != nullptr);
1370
206k
  } else {
1371
0
    tprintf("no best words!!\n");
1372
0
  }
1373
206k
  if (timing_debug) {
1374
0
    total_time = clock() - total_time;
1375
0
    tesserr << word_data->word->best_choice->unichar_string()
1376
0
            << " (ocr took " << 1000 * total_time / CLOCKS_PER_SEC << " ms)\n";
1377
0
  }
1378
206k
}
1379
1380
/**
1381
 * classify_word_pass1
1382
 *
1383
 * Baseline normalize the word and pass it to Tess.
1384
 */
1385
1386
void Tesseract::classify_word_pass1(const WordData &word_data, WERD_RES **in_word,
1387
162k
                                    PointerVector<WERD_RES> *out_words) {
1388
162k
  ROW *row = word_data.row;
1389
162k
  BLOCK *block = word_data.block;
1390
162k
  prev_word_best_choice_ =
1391
162k
      word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1392
#ifdef DISABLED_LEGACY_ENGINE
1393
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1394
#else
1395
162k
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
1396
162k
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
1397
162k
#endif // def DISABLED_LEGACY_ENGINE
1398
162k
    if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1399
142k
      LSTMRecognizeWord(*block, row, *in_word, out_words);
1400
142k
      if (!out_words->empty()) {
1401
116k
        return; // Successful lstm recognition.
1402
116k
      }
1403
142k
    }
1404
46.2k
    if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1405
      // No fallback allowed, so use a fake.
1406
0
      (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1407
0
      return;
1408
0
    }
1409
1410
46.2k
#ifndef DISABLED_LEGACY_ENGINE
1411
    // Fall back to tesseract for failed words or odd words.
1412
46.2k
    (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr,
1413
46.2k
                                    classify_bln_numeric_mode, textord_use_cjk_fp_model,
1414
46.2k
                                    poly_allow_detailed_fx, row, block);
1415
46.2k
#endif // ndef DISABLED_LEGACY_ENGINE
1416
46.2k
  }
1417
1418
46.2k
#ifndef DISABLED_LEGACY_ENGINE
1419
46.2k
  WERD_RES *word = *in_word;
1420
46.2k
  match_word_pass_n(1, word, row, block);
1421
46.2k
  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1422
46.2k
    word->tess_would_adapt = AdaptableWord(word);
1423
46.2k
    bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1424
1425
46.2k
    if (adapt_ok) {
1426
      // Send word to adaptive classifier for training.
1427
2.61k
      word->BestChoiceToCorrectText();
1428
2.61k
      LearnWord(nullptr, word);
1429
      // Mark misadaptions if running blamer.
1430
2.61k
      if (word->blamer_bundle != nullptr) {
1431
0
        word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer);
1432
0
      }
1433
2.61k
    }
1434
1435
46.2k
    if (tessedit_enable_doc_dict && !word->IsAmbiguous()) {
1436
26.7k
      tess_add_doc_word(word->best_choice);
1437
26.7k
    }
1438
46.2k
  }
1439
46.2k
#endif // ndef DISABLED_LEGACY_ENGINE
1440
46.2k
}
1441
1442
// Helper to report the result of the xheight fix.
1443
void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word,
1444
0
                                   WERD_RES *new_word) {
1445
0
  tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().c_str(),
1446
0
          word->best_choice->debug_string().c_str());
1447
0
  word->reject_map.print(debug_fp);
1448
0
  tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().c_str(),
1449
0
          new_word->best_choice->debug_string().c_str());
1450
0
  new_word->reject_map.print(debug_fp);
1451
0
  tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT",
1452
0
          new_word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1453
0
          accept_new_word ? "ACCEPTED" : "");
1454
0
}
1455
1456
#ifndef DISABLED_LEGACY_ENGINE
1457
1458
// Run the x-height fix-up, based on min/max top/bottom information in
1459
// unicharset.
1460
// Returns true if the word was changed.
1461
// See the comment in fixxht.cpp for a description of the overall process.
1462
43.6k
bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row) {
1463
43.6k
  int original_misfits = CountMisfitTops(word);
1464
43.6k
  if (original_misfits == 0) {
1465
15.8k
    return false;
1466
15.8k
  }
1467
27.7k
  float baseline_shift = 0.0f;
1468
27.7k
  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1469
27.7k
  if (baseline_shift != 0.0f) {
1470
    // Try the shift on its own first.
1471
7.28k
    if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) {
1472
5.01k
      return false;
1473
5.01k
    }
1474
2.27k
    original_misfits = CountMisfitTops(word);
1475
2.27k
    if (original_misfits > 0) {
1476
925
      float new_baseline_shift;
1477
      // Now recompute the new x_height.
1478
925
      new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1479
925
      if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1480
        // No test of return value here, as we are definitely making a change
1481
        // to the word by shifting the baseline.
1482
809
        TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row);
1483
809
      }
1484
925
    }
1485
2.27k
    return true;
1486
20.5k
  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1487
15.1k
    return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row);
1488
15.1k
  } else {
1489
5.37k
    return false;
1490
5.37k
  }
1491
27.7k
}
1492
1493
// Runs recognition with the test baseline shift and x-height and returns true
1494
// if there was an improvement in recognition result.
1495
bool Tesseract::TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht,
1496
23.2k
                                     WERD_RES *word, BLOCK *block, ROW *row) {
1497
23.2k
  bool accept_new_x_ht = false;
1498
23.2k
  WERD_RES new_x_ht_word(word->word);
1499
23.2k
  if (word->blamer_bundle != nullptr) {
1500
0
    new_x_ht_word.blamer_bundle = new BlamerBundle();
1501
0
    new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1502
0
  }
1503
23.2k
  new_x_ht_word.x_height = new_x_ht;
1504
23.2k
  new_x_ht_word.baseline_shift = baseline_shift;
1505
23.2k
  new_x_ht_word.caps_height = 0.0;
1506
23.2k
  new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1507
23.2k
                                    classify_bln_numeric_mode, textord_use_cjk_fp_model,
1508
23.2k
                                    poly_allow_detailed_fx, row, block);
1509
23.2k
  match_word_pass_n(2, &new_x_ht_word, row, block);
1510
23.2k
  if (!new_x_ht_word.tess_failed) {
1511
23.2k
    int new_misfits = CountMisfitTops(&new_x_ht_word);
1512
23.2k
    if (debug_x_ht_level >= 1) {
1513
0
      tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits,
1514
0
              word->x_height, new_misfits, new_x_ht);
1515
0
      tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(),
1516
0
              word->best_choice->certainty(), new_x_ht_word.best_choice->rating(),
1517
0
              new_x_ht_word.best_choice->certainty());
1518
0
    }
1519
    // The misfits must improve and either the rating or certainty.
1520
23.2k
    accept_new_x_ht = new_misfits < original_misfits &&
1521
23.2k
                      (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() ||
1522
17.4k
                       new_x_ht_word.best_choice->rating() < word->best_choice->rating());
1523
23.2k
    if (debug_x_ht_level >= 1) {
1524
0
      ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1525
0
    }
1526
23.2k
  }
1527
23.2k
  if (accept_new_x_ht) {
1528
13.0k
    word->ConsumeWordResults(&new_x_ht_word);
1529
13.0k
    return true;
1530
13.0k
  }
1531
10.1k
  return false;
1532
23.2k
}
1533
1534
#endif // ndef DISABLED_LEGACY_ENGINE
1535
1536
/**
1537
 * classify_word_pass2
1538
 *
1539
 * Control what to do with the word in pass 2
1540
 */
1541
1542
void Tesseract::classify_word_pass2(const WordData &word_data, WERD_RES **in_word,
1543
43.6k
                                    PointerVector<WERD_RES> *out_words) {
1544
  // Return if we do not want to run Tesseract.
1545
43.6k
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1546
0
    return;
1547
0
  }
1548
43.6k
#ifndef DISABLED_LEGACY_ENGINE
1549
43.6k
  ROW *row = word_data.row;
1550
43.6k
  BLOCK *block = word_data.block;
1551
43.6k
  WERD_RES *word = *in_word;
1552
43.6k
  prev_word_best_choice_ =
1553
43.6k
      word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1554
1555
43.6k
  check_debug_pt(word, 30);
1556
43.6k
  if (!word->done) {
1557
43.6k
    word->caps_height = 0.0;
1558
43.6k
    if (word->x_height == 0.0f) {
1559
0
      word->x_height = row->x_height();
1560
0
    }
1561
43.6k
    match_word_pass_n(2, word, row, block);
1562
43.6k
    check_debug_pt(word, 40);
1563
43.6k
  }
1564
1565
43.6k
  SubAndSuperscriptFix(word);
1566
1567
43.6k
  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1568
43.6k
    if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
1569
43.6k
        block->classify_rotation().y() == 0.0f) {
1570
      // Use the tops and bottoms since they are available.
1571
43.6k
      TrainedXheightFix(word, block, row);
1572
43.6k
    }
1573
43.6k
  }
1574
#  ifndef GRAPHICS_DISABLED
1575
  if (tessedit_display_outwords) {
1576
    if (fx_win == nullptr) {
1577
      create_fx_win();
1578
    }
1579
    clear_fx_win();
1580
    word->rebuild_word->plot(fx_win);
1581
    TBOX wbox = word->rebuild_word->bounding_box();
1582
    fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
1583
    ScrollView::Update();
1584
  }
1585
#  endif
1586
43.6k
  check_debug_pt(word, 50);
1587
43.6k
#endif // ndef DISABLED_LEGACY_ENGINE
1588
43.6k
}
1589
1590
#ifndef DISABLED_LEGACY_ENGINE
1591
/**
1592
 * match_word_pass2
1593
 *
1594
 * Baseline normalize the word and pass it to Tess.
1595
 */
1596
113k
void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block) {
1597
113k
  if (word->tess_failed) {
1598
0
    return;
1599
0
  }
1600
113k
  tess_segment_pass_n(pass_n, word);
1601
1602
113k
  if (!word->tess_failed) {
1603
113k
    if (!word->word->flag(W_REP_CHAR)) {
1604
113k
      word->fix_quotes();
1605
113k
      if (tessedit_fix_hyphens) {
1606
113k
        word->fix_hyphens();
1607
113k
      }
1608
      /* Don't trust fix_quotes! - though I think I've fixed the bug */
1609
113k
      if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {
1610
0
        tprintf(
1611
0
            "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1612
0
            " #Blobs=%u\n",
1613
0
            word->best_choice->debug_string().c_str(), word->best_choice->length(),
1614
0
            word->box_word->length());
1615
0
      }
1616
113k
      word->tess_accepted = tess_acceptable_word(word);
1617
1618
      // Also sets word->done flag
1619
113k
      make_reject_map(word, row, pass_n);
1620
113k
    }
1621
113k
  }
1622
113k
  set_word_fonts(word);
1623
1624
113k
  ASSERT_HOST(word->raw_choice != nullptr);
1625
113k
}
1626
#endif // ndef DISABLED_LEGACY_ENGINE
1627
1628
// Helper to return the best rated BLOB_CHOICE in the whole word that matches
1629
// the given char_id, or nullptr if none can be found.
1630
0
static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {
1631
  // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1632
0
  BLOB_CHOICE *best_choice = nullptr;
1633
0
  for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
1634
0
    BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i));
1635
0
    if (choice != nullptr) {
1636
0
      if (best_choice == nullptr || choice->rating() < best_choice->rating()) {
1637
0
        best_choice = choice;
1638
0
      }
1639
0
    }
1640
0
  }
1641
0
  return best_choice;
1642
0
}
1643
1644
// Helper to insert blob_choice in each location in the leader word if there is
1645
// no matching BLOB_CHOICE there already, and correct any incorrect results
1646
// in the best_choice.
1647
0
static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
1648
0
  WERD_CHOICE *word = word_res->best_choice;
1649
0
  for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
1650
0
    BLOB_CHOICE *choice =
1651
0
        FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i));
1652
0
    if (choice == nullptr) {
1653
0
      BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
1654
0
      choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1655
0
    }
1656
0
  }
1657
  // Correct any incorrect results in word.
1658
0
  for (unsigned i = 0; i < word->length(); ++i) {
1659
0
    if (word->unichar_id(i) != blob_choice->unichar_id()) {
1660
0
      word->set_unichar_id(blob_choice->unichar_id(), i);
1661
0
    }
1662
0
  }
1663
0
}
1664
1665
/**
1666
 * fix_rep_char()
1667
 * The word is a repeated char. (Leader.) Find the repeated char character.
1668
 * Create the appropriate single-word or multi-word sequence according to
1669
 * the size of spaces in between blobs, and correct the classifications
1670
 * where some of the characters disagree with the majority.
1671
 */
1672
0
void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) {
1673
0
  WERD_RES *word_res = page_res_it->word();
1674
0
  const WERD_CHOICE &word = *(word_res->best_choice);
1675
1676
  // Find the frequency of each unique character in the word.
1677
0
  SortHelper<UNICHAR_ID> rep_ch(word.length());
1678
0
  for (unsigned i = 0; i < word.length(); ++i) {
1679
0
    rep_ch.Add(word.unichar_id(i), 1);
1680
0
  }
1681
1682
  // Find the most frequent result.
1683
0
  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1684
0
  int max_count = rep_ch.MaxCount(&maxch_id);
1685
  // Find the best exemplar of a classifier result for maxch_id.
1686
0
  BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res);
1687
0
  if (best_choice == nullptr) {
1688
0
    tprintf("Failed to find a choice for %s, occurring %d times\n",
1689
0
            word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
1690
0
    return;
1691
0
  }
1692
0
  word_res->done = true;
1693
1694
  // Just correct existing classification.
1695
0
  CorrectRepcharChoices(best_choice, word_res);
1696
0
  word_res->reject_map.initialise(word.length());
1697
0
}
1698
1699
// Classifies the string s, whose per-character byte lengths are given by the
// NUL-terminated array lengths, as one of the ACCEPTABLE_WERD_TYPE values.
// Strings with more than 20 characters are unacceptable. An ordinary word is
// tried first: one optional leading punctuation char, then either an
// all-upper-case run or an (optionally initial-capped) lower-case word that
// may contain one hyphen or end in "'s", followed by up to two constrained
// trailing punctuation chars. If that fails, the string is tested as an
// upper- or lower-case abbreviation made of letter/'.' pairs.
ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_set, const char *s,
                                                       const char *lengths) {
  if (strlen(lengths) > 20) {
    return AC_UNACCEPTABLE;
  }

  // Ordinary-word grammar. Returns AC_UNACCEPTABLE as soon as the string is
  // known not to be an ordinary word.
  const auto classify_plain_word = [&]() -> ACCEPTABLE_WERD_TYPE {
    int idx = 0; // index into lengths
    int pos = 0; // byte offset into s

    /* Single Leading punctuation char*/
    if (s[pos] != '\0' && chs_leading_punct.contains(s[pos])) {
      pos += lengths[idx++];
    }
    const int num_leading_punct = idx;

    /* Initial cap */
    int num_upper = 0;
    while (s[pos] != '\0' && char_set.get_isupper(s + pos, lengths[idx])) {
      pos += lengths[idx++];
      ++num_upper;
    }

    ACCEPTABLE_WERD_TYPE kind = AC_UNACCEPTABLE;
    if (num_upper > 1) {
      kind = AC_UPPER_CASE;
    } else {
      /* Lower case word, possibly with an initial cap */
      while (s[pos] != '\0' && char_set.get_islower(s + pos, lengths[idx])) {
        pos += lengths[idx++];
      }
      if (idx - num_leading_punct < quality_min_initial_alphas_reqd) {
        return AC_UNACCEPTABLE;
      }
      /*
      Allow a single hyphen in a lower case word
      - don't trust upper case - I've seen several cases of "H" -> "I-I"
      */
      if (lengths[idx] == 1 && s[pos] == '-') {
        const int hyphen_idx = idx;
        pos += lengths[idx++];
        if (s[pos] != '\0') {
          while ((s[pos] != '\0') && char_set.get_islower(s + pos, lengths[idx])) {
            pos += lengths[idx++];
          }
          if (idx < hyphen_idx + 3) {
            return AC_UNACCEPTABLE;
          }
        }
      } else if (lengths[idx] == 1 && (s[pos] == '\'') && lengths[idx + 1] == 1 &&
                 (s[pos + lengths[idx]] == 's')) {
        /* Allow "'s" in NON hyphenated lower case words */
        pos += lengths[idx++];
        pos += lengths[idx++];
      }
      kind = num_upper > 0 ? AC_INITIAL_CAP : AC_LOWER_CASE;
    }

    /* Up to two different, constrained trailing punctuation chars */
    if (lengths[idx] == 1 && s[pos] != '\0' && chs_trailing_punct1.contains(s[pos])) {
      pos += lengths[idx++];
    }
    if (lengths[idx] == 1 && s[pos] != '\0' && idx > 0 && s[pos - lengths[idx - 1]] != s[pos] &&
        chs_trailing_punct2.contains(s[pos])) {
      pos += lengths[idx++];
    }

    // Anything left over means the string is not an ordinary word.
    if (s[pos] != '\0') {
      kind = AC_UNACCEPTABLE;
    }
    return kind;
  };

  ACCEPTABLE_WERD_TYPE word_type = classify_plain_word();
  if (word_type != AC_UNACCEPTABLE) {
    return word_type;
  }

  /* Look for abbreviation string */
  int i = 0;
  int offset = 0;
  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
    word_type = AC_UC_ABBREV;
    while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) &&
           lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
      // Consume the letter and its following '.'.
      offset += lengths[i++];
      offset += lengths[i++];
    }
  } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
    word_type = AC_LC_ABBREV;
    while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) &&
           lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
      // Consume the letter and its following '.'.
      offset += lengths[i++];
      offset += lengths[i++];
    }
  }
  if (s[offset] != '\0') {
    word_type = AC_UNACCEPTABLE;
  }

  return word_type;
}
1805
1806
470k
// Debug hook for the word under the configured test point.
// Does nothing and returns false unless test_pt is set. Otherwise the
// rejection and x-height debug levels are switched off, and switched back on
// only if the word's bounding box contains (test_pt_x, test_pt_y). In that
// case a banner identifying the call site (location) is printed, followed by
// the best choice, its reject map, and the accepted/done flags; locations 50
// and 120 additionally dump the reject-map entry for each character.
// A negative location prints nothing and is intended for breakpoint use.
// Returns true if the word contains the test point.
bool Tesseract::check_debug_pt(WERD_RES *word, int location) {
  if (!test_pt) {
    return false;
  }

  tessedit_rejection_debug.set_value(false);
  debug_x_ht_level.set_value(0);

  if (!word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) {
    return false;
  }
  if (location < 0) {
    return true; // For breakpoint use
  }

  bool show_map_detail = false;
  tessedit_rejection_debug.set_value(true);
  debug_x_ht_level.set_value(2);
  tprintf("\n\nTESTWD::");
  switch (location) {
    case 0:
      tprintf("classify_word_pass1 start\n");
      word->word->print();
      break;
    case 10:
      tprintf("make_reject_map: initial map");
      break;
    case 20:
      tprintf("make_reject_map: after NN");
      break;
    case 30:
      tprintf("classify_word_pass2 - START");
      break;
    case 40:
      tprintf("classify_word_pass2 - Pre Xht");
      break;
    case 50:
      tprintf("classify_word_pass2 - END");
      show_map_detail = true;
      break;
    case 60:
      tprintf("fixspace");
      break;
    case 70:
      tprintf("MM pass START");
      break;
    case 80:
      tprintf("MM pass END");
      break;
    case 90:
      tprintf("After Poor quality rejection");
      break;
    case 100:
      tprintf("unrej_good_quality_words - START");
      break;
    case 110:
      tprintf("unrej_good_quality_words - END");
      break;
    case 120:
      tprintf("Write results pass");
      show_map_detail = true;
      break;
  }

  if (word->best_choice != nullptr) {
    const WERD_CHOICE &best = *word->best_choice;
    tprintf(" \"%s\" ", best.unichar_string().c_str());
    word->reject_map.print(debug_fp);
    tprintf("\n");
    if (show_map_detail) {
      tprintf("\"%s\"\n", best.unichar_string().c_str());
      // Walk unichar positions, not UTF-8 bytes: a byte index would run past
      // the reject map for multi-byte characters and would print partial
      // characters. Also stay within the map in case its length differs from
      // the best choice.
      const unsigned num_chars = best.length();
      const unsigned num_map_entries = word->reject_map.length();
      for (unsigned i = 0; i < num_chars && i < num_map_entries; ++i) {
        tprintf("**** \"%s\" ****\n", word->uch_set->id_to_unichar(best.unichar_id(i)));
        word->reject_map[i].full_print(debug_fp);
      }
    }
  } else {
    tprintf("null best choice\n");
  }
  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
  return true;
}
1887
1888
/**
1889
 * find_modal_font
1890
 *
1891
 * Find the modal font and remove from the stats.
1892
 */
1893
#ifndef DISABLED_LEGACY_ENGINE
1894
static void find_modal_font( // good chars in word
1895
    STATS *fonts,            // font stats
1896
    int16_t *font_out,       // output font
1897
    int8_t *font_count       // output count
1898
0
) {
1899
0
  if (fonts->get_total() > 0) {
1900
    // font index
1901
0
    int16_t font = static_cast<int16_t>(fonts->mode());
1902
0
    *font_out = font;
1903
    // pile count
1904
0
    int32_t count = fonts->pile_count(font);
1905
0
    *font_count = count < INT8_MAX ? count : INT8_MAX;
1906
0
    fonts->add(font, -*font_count);
1907
0
  } else {
1908
0
    *font_out = -1;
1909
0
    *font_count = 0;
1910
0
  }
1911
0
}
1912
#endif // ! DISABLED_LEGACY_ENGINE
1913
1914
/**
1915
 * set_word_fonts
1916
 *
1917
 * Get the fonts for the word.
1918
 */
1919
113k
void Tesseract::set_word_fonts(WERD_RES *word) {
1920
  // Don't try to set the word fonts for an lstm word, as the configs
1921
  // will be meaningless.
1922
113k
  if (word->chopped_word == nullptr) {
1923
0
    return;
1924
0
  }
1925
113k
  ASSERT_HOST(word->best_choice != nullptr);
1926
1927
113k
#ifndef DISABLED_LEGACY_ENGINE
1928
113k
  const int fontinfo_size = fontinfo_table_.size();
1929
113k
  if (fontinfo_size == 0) {
1930
0
    return;
1931
0
  }
1932
113k
  if (tessedit_font_id > 0) {
1933
0
    if (tessedit_font_id >= fontinfo_size) {
1934
0
      tprintf("Error, invalid font ID provided: must be below %d.\n"
1935
0
              "Falling back to font auto-detection.\n", fontinfo_size);
1936
0
    } else {
1937
0
      word->fontinfo = &fontinfo_table_.at(tessedit_font_id);
1938
0
      word->fontinfo2 = nullptr;
1939
0
      word->fontinfo_id_count = INT8_MAX;
1940
0
      word->fontinfo_id2_count = 0;
1941
0
      return;
1942
0
    }
1943
0
  }
1944
113k
  std::vector<int> font_total_score(fontinfo_size);
1945
1946
  // Compute the font scores for the word
1947
113k
  if (tessedit_debug_fonts) {
1948
0
    tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
1949
0
  }
1950
600k
  for (unsigned b = 0; b < word->best_choice->length(); ++b) {
1951
487k
    const BLOB_CHOICE *choice = word->GetBlobChoice(b);
1952
487k
    if (choice == nullptr) {
1953
50
      continue;
1954
50
    }
1955
487k
    auto &fonts = choice->fonts();
1956
17.0M
    for (auto &f : fonts) {
1957
17.0M
      const int fontinfo_id = f.fontinfo_id;
1958
17.0M
      if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1959
17.0M
        font_total_score[fontinfo_id] += f.score;
1960
17.0M
      }
1961
17.0M
    }
1962
487k
  }
1963
  // Find the top and 2nd choice for the word.
1964
113k
  int score1 = 0, score2 = 0;
1965
113k
  int16_t font_id1 = -1, font_id2 = -1;
1966
48.4M
  for (int f = 0; f < fontinfo_size; ++f) {
1967
48.2M
    if (tessedit_debug_fonts && font_total_score[f] > 0) {
1968
0
      tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]);
1969
0
    }
1970
48.2M
    if (font_total_score[f] > score1) {
1971
361k
      score2 = score1;
1972
361k
      font_id2 = font_id1;
1973
361k
      score1 = font_total_score[f];
1974
361k
      font_id1 = f;
1975
47.9M
    } else if (font_total_score[f] > score2) {
1976
206k
      score2 = font_total_score[f];
1977
206k
      font_id2 = f;
1978
206k
    }
1979
48.2M
  }
1980
113k
  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr;
1981
113k
  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr;
1982
  // Each score has a limit of UINT16_MAX, so divide by that to get the number
1983
  // of "votes" for that font, ie number of perfect scores.
1984
113k
  word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
1985
113k
  word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
1986
113k
  if (score1 > 0) {
1987
113k
    const FontInfo fi = fontinfo_table_.at(font_id1);
1988
113k
    if (tessedit_debug_fonts) {
1989
0
      if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
1990
0
        tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name,
1991
0
                word->fontinfo_id_count, fontinfo_table_.at(font_id2).name,
1992
0
                word->fontinfo_id2_count);
1993
0
      } else {
1994
0
        tprintf("Word modal font=%s, score=%d. No 2nd choice\n", fi.name, word->fontinfo_id_count);
1995
0
      }
1996
0
    }
1997
113k
  }
1998
113k
#endif // ndef DISABLED_LEGACY_ENGINE
1999
113k
}
2000
2001
#ifndef DISABLED_LEGACY_ENGINE
2002
/**
2003
 * font_recognition_pass
2004
 *
2005
 * Smooth the fonts for the document.
2006
 */
2007
0
void Tesseract::font_recognition_pass(PAGE_RES *page_res) {
2008
0
  PAGE_RES_IT page_res_it(page_res);
2009
0
  WERD_RES *word;                       // current word
2010
0
  STATS doc_fonts(0, font_table_size_ - 1); // font counters
2011
2012
  // Gather font id statistics.
2013
0
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2014
0
    word = page_res_it.word();
2015
0
    if (word->fontinfo != nullptr) {
2016
0
      doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2017
0
    }
2018
0
    if (word->fontinfo2 != nullptr) {
2019
0
      doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2020
0
    }
2021
0
  }
2022
0
  int16_t doc_font;      // modal font
2023
0
  int8_t doc_font_count; // modal font
2024
0
  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2025
0
  if (doc_font_count == 0) {
2026
0
    return;
2027
0
  }
2028
  // Get the modal font pointer.
2029
0
  const FontInfo *modal_font = nullptr;
2030
0
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2031
0
    word = page_res_it.word();
2032
0
    if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2033
0
      modal_font = word->fontinfo;
2034
0
      break;
2035
0
    }
2036
0
    if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2037
0
      modal_font = word->fontinfo2;
2038
0
      break;
2039
0
    }
2040
0
  }
2041
0
  ASSERT_HOST(modal_font != nullptr);
2042
2043
  // Assign modal font to weak words.
2044
0
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2045
0
    word = page_res_it.word();
2046
0
    const int length = word->best_choice->length();
2047
2048
0
    const int count = word->fontinfo_id_count;
2049
0
    if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2050
0
      word->fontinfo = modal_font;
2051
      // Counts only get 1 as it came from the doc.
2052
0
      word->fontinfo_id_count = 1;
2053
0
    }
2054
0
  }
2055
0
}
2056
#endif // ndef DISABLED_LEGACY_ENGINE
2057
2058
// If a word has multiple alternates check if the best choice is in the
2059
// dictionary. If not, replace it with an alternate that exists in the
2060
// dictionary.
2061
0
void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {
2062
0
  PAGE_RES_IT word_it(page_res);
2063
0
  for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) {
2064
0
    if (word->best_choices.singleton()) {
2065
0
      continue; // There are no alternates.
2066
0
    }
2067
2068
0
    const WERD_CHOICE *best = word->best_choice;
2069
0
    if (word->tesseract->getDict().valid_word(*best) != 0) {
2070
0
      continue; // The best choice is in the dictionary.
2071
0
    }
2072
2073
0
    WERD_CHOICE_IT choice_it(&word->best_choices);
2074
0
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
2075
0
      WERD_CHOICE *alternate = choice_it.data();
2076
0
      if (word->tesseract->getDict().valid_word(*alternate)) {
2077
        // The alternate choice is in the dictionary.
2078
0
        if (tessedit_bigram_debug) {
2079
0
          tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2080
0
                  best->unichar_string().c_str(), alternate->unichar_string().c_str());
2081
0
        }
2082
        // Replace the 'best' choice with a better choice.
2083
0
        word->ReplaceBestChoice(alternate);
2084
0
        break;
2085
0
      }
2086
0
    }
2087
0
  }
2088
0
}
2089
2090
} // namespace tesseract