Coverage Report

Created: 2024-02-28 06:46

/src/tesseract/src/ccmain/control.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************
2
 * File:        control.cpp  (Formerly control.c)
3
 * Description: Module-independent matcher controller.
4
 * Author:      Ray Smith
5
 *
6
 * (C) Copyright 1992, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
// Include automatically generated configuration file if running autoconf.
20
#ifdef HAVE_CONFIG_H
21
#  include "config_auto.h"
22
#endif
23
24
#include <cctype>
25
#include <cmath>
26
#include <cstdint> // for int16_t, int32_t
27
#include <cstdio>  // for fclose, fopen, FILE
28
#include <ctime>   // for clock
29
#include "control.h"
30
#ifndef DISABLED_LEGACY_ENGINE
31
#  include "docqual.h"
32
#  include "drawfx.h"
33
#  include "fixspace.h"
34
#endif
35
#include <tesseract/ocrclass.h>
36
#include "lstmrecognizer.h"
37
#include "output.h"
38
#include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...
39
#ifndef DISABLED_LEGACY_ENGINE
40
#  include "reject.h"
41
#endif
42
#include "sorthelper.h"
43
#include "tesseractclass.h"
44
#include "tessvars.h"
45
#include "werdit.h"
46
47
const char *const kBackUpConfigFile = "tempconfigdata.config";
48
#ifndef DISABLED_LEGACY_ENGINE
49
// Min believable x-height for any text when refitting as a fraction of
50
// original x-height
51
const double kMinRefitXHeightFraction = 0.5;
52
#endif // ! DISABLED_LEGACY_ENGINE
53
54
/**
55
 * Make a word from the selected blobs and run Tess on them.
56
 *
57
 * @param page_res recognise blobs
58
 * @param selection_box within this box
59
 */
60
namespace tesseract {
61
62
0
void Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) {
  // Build a temporary word out of the selection; a null iterator means no
  // pseudo word could be made, in which case there is nothing to recognize.
  PAGE_RES_IT *pseudo_it = make_pseudo_word(page_res, selection_box);
  if (pseudo_it == nullptr) {
    return;
  }
  recog_interactive(pseudo_it);
  // Remove the pseudo word again before releasing the iterator that
  // make_pseudo_word handed to us.
  pseudo_it->DeleteCurrentWord();
  delete pseudo_it;
}
70
71
/**
72
 * Recognize a single word in interactive mode.
73
 *
74
 * @param pr_it the page results iterator
75
 */
76
0
bool Tesseract::recog_interactive(PAGE_RES_IT *pr_it) {
  // The word is always set up as for pass 2, whichever engine then runs.
  WordData word_data(*pr_it);
  SetupWordPassN(2, &word_data);
  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
  if (lstm_recognizer_ != nullptr) {
    classify_word_and_language(1, pr_it, &word_data);
  } else {
#ifndef DISABLED_LEGACY_ENGINE
    classify_word_and_language(2, pr_it, &word_data);
#endif // ndef DISABLED_LEGACY_ENGINE
  }
#ifndef DISABLED_LEGACY_ENGINE
  // Optional debug dump of the per-word quality measures.
  if (tessedit_debug_quality_metrics) {
    WERD_RES *word_res = pr_it->word();
    int16_t char_qual;
    int16_t good_char_qual;
    word_char_quality(word_res, &char_qual, &good_char_qual);
    tprintf(
        "\n%d chars;  word_blob_quality: %d;  outline_errs: %d; "
        "char_quality: %d; good_char_quality: %d\n",
        word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res),
        char_qual, good_char_qual);
  }
#endif // ndef DISABLED_LEGACY_ENGINE
  // This function always reports success.
  return true;
}
102
103
// Helper function to check for a target word and handle it appropriately.
104
// Inspired by Jetsoft's requirement to process only single words on pass2
105
// and beyond.
106
// If word_config is not null:
107
//   If the word_box and target_word_box overlap, read the word_config file
108
//   else reset to previous config data.
109
//   return true.
110
// else
111
//   If the word_box and target_word_box overlap or pass <= 1, return true.
112
// Note that this function uses a fixed temporary file for storing the previous
113
// configs, so it is neither thread-safe, nor process-safe, but the assumption
114
// is that it will only be used for one debug window at a time.
115
//
116
// Since this function is used for debugging (and not to change OCR results)
117
// set only debug params from the word config file.
118
bool Tesseract::ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box,
119
0
                                  const char *word_config, int pass) {
120
0
  if (word_config != nullptr) {
121
0
    if (word_box.major_overlap(target_word_box)) {
122
0
      if (backup_config_file_ == nullptr) {
123
0
        backup_config_file_ = kBackUpConfigFile;
124
0
        FILE *config_fp = fopen(backup_config_file_, "wb");
125
0
        if (config_fp == nullptr) {
126
0
          tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
127
0
        } else {
128
0
          ParamUtils::PrintParams(config_fp, params());
129
0
          fclose(config_fp);
130
0
        }
131
0
        ParamUtils::ReadParamsFile(word_config, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());
132
0
      }
133
0
    } else {
134
0
      if (backup_config_file_ != nullptr) {
135
0
        ParamUtils::ReadParamsFile(backup_config_file_, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());
136
0
        backup_config_file_ = nullptr;
137
0
      }
138
0
    }
139
0
  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
140
0
    return false;
141
0
  }
142
0
  return true;
143
0
}
144
145
/** If tesseract is to be run, sets the words up ready for it. */
146
void Tesseract::SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config,
147
31.3k
                                   PAGE_RES *page_res, std::vector<WordData> *words) {
148
  // Prepare all the words.
149
31.3k
  PAGE_RES_IT page_res_it(page_res);
150
476k
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
151
444k
    if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(),
152
444k
                                                        *target_word_box, word_config, 1)) {
153
444k
      words->push_back(WordData(page_res_it));
154
444k
    }
155
444k
  }
156
  // Setup all the words for recognition with polygonal approximation.
157
476k
  for (unsigned w = 0; w < words->size(); ++w) {
158
444k
    SetupWordPassN(pass_n, &(*words)[w]);
159
444k
    if (w > 0) {
160
413k
      (*words)[w].prev_word = &(*words)[w - 1];
161
413k
    }
162
444k
  }
163
31.3k
}
164
165
// Sets up the single word ready for whichever engine is to be run.
166
444k
void Tesseract::SetupWordPassN(int pass_n, WordData *word) {
167
444k
  if (pass_n == 1 || !word->word->done) {
168
277k
    if (pass_n == 1) {
169
214k
      word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode,
170
214k
                                      nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model,
171
214k
                                      poly_allow_detailed_fx, word->row, word->block);
172
214k
    } else if (pass_n == 2) {
173
      // TODO(rays) Should we do this on pass1 too?
174
63.2k
      word->word->caps_height = 0.0;
175
63.2k
      if (word->word->x_height == 0.0f) {
176
0
        word->word->x_height = word->row->x_height();
177
0
      }
178
63.2k
    }
179
277k
    word->lang_words.truncate(0);
180
554k
    for (unsigned s = 0; s <= sub_langs_.size(); ++s) {
181
      // The sub_langs_.size() entry is for the master language.
182
277k
      Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
183
277k
      auto *word_res = new WERD_RES;
184
277k
      word_res->InitForRetryRecognition(*word->word);
185
277k
      word->lang_words.push_back(word_res);
186
      // LSTM doesn't get setup for pass2.
187
277k
      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
188
277k
        word_res->SetupForRecognition(
189
277k
            lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr,
190
277k
            lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model,
191
277k
            lang_t->poly_allow_detailed_fx, word->row, word->block);
192
277k
      }
193
277k
    }
194
277k
  }
195
444k
}
196
197
// Runs word recognition on all the words.
198
bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it,
199
31.3k
                                   std::vector<WordData> *words) {
200
  // TODO(rays) Before this loop can be parallelized (it would yield a massive
201
  // speed-up) all remaining member globals need to be converted to local/heap
202
  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
203
  // added. The results will be significantly different with adaption on, and
204
  // deterioration will need investigation.
205
31.3k
  pr_it->restart_page();
206
476k
  for (unsigned w = 0; w < words->size(); ++w) {
207
444k
    WordData *word = &(*words)[w];
208
444k
    if (w > 0) {
209
413k
      word->prev_word = &(*words)[w - 1];
210
413k
    }
211
444k
    if (monitor != nullptr) {
212
0
      monitor->ocr_alive = true;
213
0
      if (pass_n == 1) {
214
0
        monitor->progress = 70 * w / words->size();
215
0
      } else {
216
0
        monitor->progress = 70 + 30 * w / words->size();
217
0
      }
218
0
      if (monitor->progress_callback2 != nullptr) {
219
0
        TBOX box = pr_it->word()->word->bounding_box();
220
0
        (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom());
221
0
      }
222
0
      if (monitor->deadline_exceeded() ||
223
0
          (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) {
224
        // Timeout. Fake out the rest of the words.
225
0
        for (; w < words->size(); ++w) {
226
0
          (*words)[w].word->SetupFake(unicharset);
227
0
        }
228
0
        return false;
229
0
      }
230
0
    }
231
444k
    if (word->word->tess_failed) {
232
2.76k
      unsigned s;
233
4.13k
      for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
234
1.37k
      }
235
      // If all are failed, skip it. Image words are skipped by this test.
236
2.76k
      if (s > word->lang_words.size()) {
237
0
        continue;
238
0
      }
239
2.76k
    }
240
    // Sync pr_it with the WordData.
241
444k
    while (pr_it->word() != nullptr && pr_it->word() != word->word) {
242
0
      pr_it->forward();
243
0
    }
244
444k
    ASSERT_HOST(pr_it->word() != nullptr);
245
444k
    bool make_next_word_fuzzy = false;
246
444k
#ifndef DISABLED_LEGACY_ENGINE
247
444k
    if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
248
      // Needs to be setup again to see the new outlines in the chopped_word.
249
0
      SetupWordPassN(pass_n, word);
250
0
    }
251
444k
#endif // ndef DISABLED_LEGACY_ENGINE
252
253
444k
    classify_word_and_language(pass_n, pr_it, word);
254
444k
    if (tessedit_dump_choices || debug_noise_removal) {
255
0
      tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(),
256
0
              word->word->best_choice->debug_string().c_str());
257
0
    }
258
444k
    pr_it->forward();
259
444k
    if (make_next_word_fuzzy && pr_it->word() != nullptr) {
260
0
      pr_it->MakeCurrentWordFuzzy();
261
0
    }
262
444k
  }
263
31.3k
  return true;
264
31.3k
}
265
266
/**
267
 * recog_all_words()
268
 *
269
 * Walk the page_res, recognizing all the words.
270
 * If monitor is not null, it is used as a progress monitor/timeout/cancel.
271
 * If dopasses is 0, all recognition passes are run,
272
 * 1 just pass 1, 2 passes 2 and higher.
273
 * If target_word_box is not null, special things are done to words that
274
 * overlap the target_word_box:
275
 * if word_config is not null, the word config file is read for just the
276
 * target word(s), otherwise, on pass 2 and beyond ONLY the target words
277
 * are processed (Jetsoft modification.)
278
 * Returns false if we cancelled prematurely.
279
 *
280
 * @param page_res page structure
281
 * @param monitor progress monitor
282
 * @param word_config word_config file
283
 * @param target_word_box specifies just to extract a rectangle
284
 * @param dopasses 0 - all, 1 just pass 1, 2 passes 2 and higher
285
 */
286
287
bool Tesseract::recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor,
288
                                const TBOX *target_word_box, const char *word_config,
289
15.6k
                                int dopasses) {
290
15.6k
  PAGE_RES_IT page_res_it(page_res);
291
292
15.6k
  if (tessedit_minimal_rej_pass1) {
293
0
    tessedit_test_adaption.set_value(true);
294
0
    tessedit_minimal_rejection.set_value(true);
295
0
  }
296
297
15.6k
  if (dopasses == 0 || dopasses == 1) {
298
15.6k
    page_res_it.restart_page();
299
    // ****************** Pass 1 *******************
300
301
15.6k
#ifndef DISABLED_LEGACY_ENGINE
302
    // If the adaptive classifier is full switch to one we prepared earlier,
303
    // ie on the previous page. If the current adaptive classifier is non-empty,
304
    // prepare a backup starting at this page, in case it fills up. Do all this
305
    // independently for each language.
306
15.6k
    if (AdaptiveClassifierIsFull()) {
307
1
      SwitchAdaptiveClassifier();
308
15.6k
    } else if (!AdaptiveClassifierIsEmpty()) {
309
13.2k
      StartBackupAdaptiveClassifier();
310
13.2k
    }
311
    // Now check the sub-langs as well.
312
15.6k
    for (auto &lang : sub_langs_) {
313
0
      if (lang->AdaptiveClassifierIsFull()) {
314
0
        lang->SwitchAdaptiveClassifier();
315
0
      } else if (!lang->AdaptiveClassifierIsEmpty()) {
316
0
        lang->StartBackupAdaptiveClassifier();
317
0
      }
318
0
    }
319
320
15.6k
#endif // ndef DISABLED_LEGACY_ENGINE
321
322
    // Set up all words ready for recognition, so that if parallelism is on
323
    // all the input and output classes are ready to run the classifier.
324
15.6k
    std::vector<WordData> words;
325
15.6k
    SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
326
15.6k
#ifndef DISABLED_LEGACY_ENGINE
327
15.6k
    if (tessedit_parallelize) {
328
0
      PrerecAllWordsPar(words);
329
0
    }
330
15.6k
#endif // ndef DISABLED_LEGACY_ENGINE
331
332
15.6k
    stats_.word_count = words.size();
333
334
15.6k
    stats_.dict_words = 0;
335
15.6k
    stats_.doc_blob_quality = 0;
336
15.6k
    stats_.doc_outline_errs = 0;
337
15.6k
    stats_.doc_char_quality = 0;
338
15.6k
    stats_.good_char_count = 0;
339
15.6k
    stats_.doc_good_char_quality = 0;
340
341
15.6k
    most_recently_used_ = this;
342
    // Run pass 1 word recognition.
343
15.6k
    if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) {
344
0
      return false;
345
0
    }
346
    // Pass 1 post-processing.
347
246k
    for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
348
230k
      if (page_res_it.word()->word->flag(W_REP_CHAR)) {
349
0
        fix_rep_char(&page_res_it);
350
0
        continue;
351
0
      }
352
353
      // Count dict words.
354
230k
      if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) {
355
0
        ++(stats_.dict_words);
356
0
      }
357
358
      // Update misadaption log (we only need to do it on pass 1, since
359
      // adaption only happens on this pass).
360
230k
      if (page_res_it.word()->blamer_bundle != nullptr &&
361
230k
          page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
362
0
        page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug());
363
0
      }
364
230k
    }
365
15.6k
  }
366
367
15.6k
  if (dopasses == 1) {
368
0
    return true;
369
0
  }
370
371
15.6k
#ifndef DISABLED_LEGACY_ENGINE
372
373
  // ****************** Pass 2 *******************
374
15.6k
  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) {
375
15.6k
    page_res_it.restart_page();
376
15.6k
    std::vector<WordData> words;
377
15.6k
    SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
378
15.6k
    if (tessedit_parallelize) {
379
0
      PrerecAllWordsPar(words);
380
0
    }
381
15.6k
    most_recently_used_ = this;
382
    // Run pass 2 word recognition.
383
15.6k
    if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) {
384
0
      return false;
385
0
    }
386
15.6k
  }
387
388
  // The next passes are only required for Tess-only.
389
15.6k
  if (AnyTessLang() && !AnyLSTMLang()) {
390
    // ****************** Pass 3 *******************
391
    // Fix fuzzy spaces.
392
393
0
    if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&
394
0
        !right_to_left()) {
395
0
      fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
396
0
    }
397
398
    // ****************** Pass 4 *******************
399
0
    if (tessedit_enable_dict_correction) {
400
0
      dictionary_correction_pass(page_res);
401
0
    }
402
0
    if (tessedit_enable_bigram_correction) {
403
0
      bigram_correction_pass(page_res);
404
0
    }
405
406
    // ****************** Pass 5,6 *******************
407
0
    rejection_passes(page_res, monitor, target_word_box, word_config);
408
409
    // ****************** Pass 8 *******************
410
0
    font_recognition_pass(page_res);
411
412
    // ****************** Pass 9 *******************
413
    // Check the correctness of the final results.
414
0
    blamer_pass(page_res);
415
0
    script_pos_pass(page_res);
416
0
  }
417
418
15.6k
#endif // ndef DISABLED_LEGACY_ENGINE
419
420
  // Write results pass.
421
  // This is now redundant, but retained commented so show how to obtain
422
  // bounding boxes and style information.
423
424
15.6k
#ifndef DISABLED_LEGACY_ENGINE
425
  // changed by jetsoft
426
  // needed for dll to output memory structure
427
15.6k
  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
428
0
    output_pass(page_res_it, target_word_box);
429
0
  }
430
// end jetsoft
431
15.6k
#endif // ndef DISABLED_LEGACY_ENGINE
432
433
15.6k
  const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
434
15.6k
  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
435
436
  // Remove empty words, as these mess up the result iterators.
437
246k
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
438
230k
    const WERD_RES *word = page_res_it.word();
439
230k
    const POLY_BLOCK *pb = page_res_it.block()->block != nullptr
440
230k
                               ? page_res_it.block()->block->pdblk.poly_block()
441
230k
                               : nullptr;
442
230k
    if (word->best_choice == nullptr || word->best_choice->empty() ||
443
230k
        (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
444
1.87k
      page_res_it.DeleteCurrentWord();
445
1.87k
    }
446
230k
  }
447
448
15.6k
  if (monitor != nullptr) {
449
0
    monitor->progress = 100;
450
0
  }
451
15.6k
  return true;
452
15.6k
}
453
454
#ifndef DISABLED_LEGACY_ENGINE
455
456
0
// Walks consecutive word pairs that share a unicharset and, when the pair of
// top choices is not a valid bigram, looks through the alternative choices of
// both words for a valid bigram. The valid pair with the lowest summed rating
// replaces the current best choices, unless it only differs from them in case
// or terminal punctuation.
void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
  PAGE_RES_IT word_it(page_res);

  WERD_RES *w_prev = nullptr;
  WERD_RES *w = word_it.word();
  while (true) {
    w_prev = w;
    while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) {
      // advance word_it, skipping over parts of combos
    }
    if (!word_it.word()) {
      break;
    }
    w = word_it.word();
    if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
      continue;
    }
    if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
      if (tessedit_bigram_debug) {
        tprintf("Skipping because one of the words is W_REP_CHAR\n");
      }
      continue;
    }
    // Two words sharing the same language model, excellent!
    // The two vectors below are filled in lockstep: entry i of each forms one
    // valid bigram.
    std::vector<WERD_CHOICE *> overrides_word1;
    std::vector<WERD_CHOICE *> overrides_word2;

    const auto orig_w1_str = w_prev->best_choice->unichar_string();
    const auto orig_w2_str = w->best_choice->unichar_string();
    // Compare only the non-superscript span of each top choice.
    WERD_CHOICE prev_best(w->uch_set);
    {
      int w1start, w1end;
      w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
      prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
    }
    WERD_CHOICE this_best(w->uch_set);
    {
      int w2start, w2end;
      w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
      this_best = w->best_choice->shallow_copy(w2start, w2end);
    }

    if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
      if (tessedit_bigram_debug) {
        tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(),
                orig_w2_str.c_str());
      }
      continue;
    }
    if (tessedit_bigram_debug > 2) {
      tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str());
    }
    if (tessedit_bigram_debug > 1) {
      if (!w_prev->best_choices.singleton()) {
        w_prev->PrintBestChoices();
      }
      if (!w->best_choices.singleton()) {
        w->PrintBestChoices();
      }
    }
    // Try every combination of alternative choices and remember the valid
    // bigram with the lowest summed rating.
    float best_rating = 0.0;
    int best_idx = 0;
    WERD_CHOICE_IT prev_it(&w_prev->best_choices);
    for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
      WERD_CHOICE *p1 = prev_it.data();
      WERD_CHOICE strip1(w->uch_set);
      {
        int p1start, p1end;
        p1->GetNonSuperscriptSpan(&p1start, &p1end);
        strip1 = p1->shallow_copy(p1start, p1end);
      }
      WERD_CHOICE_IT w_it(&w->best_choices);
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD_CHOICE *p2 = w_it.data();
        WERD_CHOICE strip2(w->uch_set);
        {
          int p2start, p2end;
          p2->GetNonSuperscriptSpan(&p2start, &p2end);
          strip2 = p2->shallow_copy(p2start, p2end);
        }
        if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
          overrides_word1.push_back(p1);
          overrides_word2.push_back(p2);
          if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) {
            best_rating = p1->rating() + p2->rating();
            best_idx = overrides_word1.size() - 1;
          }
        }
      }
    }
    if (!overrides_word1.empty()) {
      // Excellent, we have some bigram matches.
      if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) &&
          EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) {
        if (tessedit_bigram_debug > 1) {
          tprintf(
              "Top choice \"%s %s\" verified (sans case) by bigram "
              "model.\n",
              orig_w1_str.c_str(), orig_w2_str.c_str());
        }
        continue;
      }
      const auto new_w1_str = overrides_word1[best_idx]->unichar_string();
      const auto new_w2_str = overrides_word2[best_idx]->unichar_string();
      if (new_w1_str != orig_w1_str) {
        w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
      }
      if (new_w2_str != orig_w2_str) {
        w->ReplaceBestChoice(overrides_word2[best_idx]);
      }
      if (tessedit_bigram_debug > 0) {
        std::string choices_description;
        // One compatible bigram per (word1, word2) entry pair. The vectors
        // always have equal length, so multiplying their sizes would report
        // the square of the real count.
        const int num_bigram_choices = static_cast<int>(overrides_word1.size());
        if (num_bigram_choices == 1) {
          choices_description = "This was the unique bigram choice.";
        } else {
          if (tessedit_bigram_debug > 1) {
            std::string bigrams_list;
            // Unsigned to match the index type it is compared against.
            const unsigned kMaxChoicesToPrint = 20;
            for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) {
              if (i > 0) {
                bigrams_list += ", ";
              }
              WERD_CHOICE *p1 = overrides_word1[i];
              WERD_CHOICE *p2 = overrides_word2[i];
              bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
            }
            choices_description = "There were many choices: {";
            choices_description += bigrams_list;
            choices_description += "}";
          } else {
            choices_description += "There were " + std::to_string(num_bigram_choices);
            choices_description += " compatible bigrams.";
          }
        }
        tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(),
                orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(),
                choices_description.c_str());
      }
    }
  }
}
598
599
// Pass 5 gathers per-word reject and quality statistics into stats_ and
// optionally rejects words of bad quality; pass 6 then runs the document or
// block level quality-based rejection. Both are skipped when
// tessedit_test_adaption is set.
//
// @param page_res page structure whose words are examined
// @param monitor optional progress monitor, may be null
// @param target_word_box if not null, passed to ProcessTargetWord to decide
//        whether each word is handled
// @param word_config word config file passed on to ProcessTargetWord
void Tesseract::rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor,
                                 const TBOX *target_word_box, const char *word_config) {
  PAGE_RES_IT page_res_it(page_res);
  // ****************** Pass 5 *******************
  // Gather statistics on rejects.
  int word_index = 0;
  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
    WERD_RES *word = page_res_it.word();
    word_index++;
    if (monitor != nullptr) {
      monitor->ocr_alive = true;
      // stats_.word_count is only assigned by pass 1 of recog_all_words, so it
      // may still be zero if that pass did not run; avoid dividing by zero and
      // report the start of this pass's progress range instead.
      if (stats_.word_count != 0) {
        monitor->progress = 95 + 5 * word_index / stats_.word_count;
      } else {
        monitor->progress = 95;
      }
    }
    if (word->rebuild_word == nullptr) {
      // Word was not processed by tesseract.
      page_res_it.forward();
      continue;
    }
    check_debug_pt(word, 70);

    // changed by jetsoft
    // specific to its needs to extract one word when need
    if (target_word_box &&
        !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) {
      page_res_it.forward();
      continue;
    }
    // end jetsoft

    page_res_it.rej_stat_word();
    const int chars_in_word = word->reject_map.length();
    const int rejects_in_word = word->reject_map.reject_count();

    // Accumulate the document-wide quality totals.
    const int blob_quality = word_blob_quality(word);
    stats_.doc_blob_quality += blob_quality;
    const int outline_errs = word_outline_errs(word);
    stats_.doc_outline_errs += outline_errs;
    int16_t all_char_quality;
    int16_t accepted_all_char_quality;
    word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
    stats_.doc_char_quality += all_char_quality;
    // Only words found via one of the dawg permuters contribute to the
    // good-character statistics.
    const uint8_t permuter_type = word->best_choice->permuter();
    if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) ||
        (permuter_type == USER_DAWG_PERM)) {
      stats_.good_char_count += chars_in_word - rejects_in_word;
      stats_.doc_good_char_quality += accepted_all_char_quality;
    }
    check_debug_pt(word, 80);
    if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {
      word->reject_map.rej_word_bad_quality();
    }
    check_debug_pt(word, 90);
    page_res_it.forward();
  }

  if (tessedit_debug_quality_metrics) {
    tprintf(
        "QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f"
        " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
        page_res->char_count, page_res->rej_count,
        page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality,
        stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs,
        stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality,
        stats_.doc_char_quality / static_cast<float>(page_res->char_count),
        stats_.doc_good_char_quality,
        (stats_.good_char_count > 0)
            ? (stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count))
            : 0.0);
  }
  // The document counts as good quality only if every per-character ratio is
  // on the right side of its configured threshold.
  bool good_quality_doc =
      ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) &&
      (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) &&
      (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) &&
      (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc);

  // ****************** Pass 6 *******************
  // Do whole document or whole block rejection pass
  if (!tessedit_test_adaption) {
    quality_based_rejection(page_res_it, good_quality_doc);
  }
}
680
681
#endif // ndef DISABLED_LEGACY_ENGINE
682
683
0
// When wordrec_run_blamer is set, gives every word a last-chance blame,
// tallies the resulting reasons in page_res->blame_reasons, and prints the
// tally followed by the misadaption log (if it has any entries).
void Tesseract::blamer_pass(PAGE_RES *page_res) {
  if (!wordrec_run_blamer) {
    return;
  }
  PAGE_RES_IT word_it(page_res);
  word_it.restart_page();
  while (word_it.word() != nullptr) {
    WERD_RES *word = word_it.word();
    BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
    ++page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()];
    word_it.forward();
  }
  tprintf("Blame reasons:\n");
  for (int reason = 0; reason < IRR_NUM_REASONS; ++reason) {
    const auto irr = static_cast<IncorrectResultReason>(reason);
    tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(irr), page_res->blame_reasons[reason]);
  }
  if (page_res->misadaption_log.size() != 0) {
    tprintf("Misadaption log:\n");
    for (auto &entry : page_res->misadaption_log) {
      tprintf("%s\n", entry.c_str());
    }
  }
}
705
706
// Sets script positions and detects smallcaps on all output words.
707
0
void Tesseract::script_pos_pass(PAGE_RES *page_res) {
708
0
  PAGE_RES_IT page_res_it(page_res);
709
0
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
710
0
    WERD_RES *word = page_res_it.word();
711
0
    if (word->word->flag(W_REP_CHAR)) {
712
0
      page_res_it.forward();
713
0
      continue;
714
0
    }
715
0
    const float x_height = page_res_it.block()->block->x_height();
716
0
    float word_x_height = word->x_height;
717
0
    if (word_x_height < word->best_choice->min_x_height() ||
718
0
        word_x_height > word->best_choice->max_x_height()) {
719
0
      word_x_height =
720
0
          (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f;
721
0
    }
722
    // Test for small caps. Word capheight must be close to block xheight,
723
    // and word must contain no lower case letters, and at least one upper case.
724
0
    const double small_cap_xheight = x_height * kXHeightCapRatio;
725
0
    const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
726
0
    if (word->uch_set->script_has_xheight() &&
727
0
        small_cap_xheight - small_cap_delta <= word_x_height &&
728
0
        word_x_height <= small_cap_xheight + small_cap_delta) {
729
      // Scan for upper/lower.
730
0
      int num_upper = 0;
731
0
      int num_lower = 0;
732
0
      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
733
0
        if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
734
0
          ++num_upper;
735
0
        } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
736
0
          ++num_lower;
737
0
        }
738
0
      }
739
0
      if (num_upper > 0 && num_lower == 0) {
740
0
        word->small_caps = true;
741
0
      }
742
0
    }
743
0
    word->SetScriptPositions();
744
0
  }
745
0
}
746
747
// Helper finds the gap between the index word and the next.
748
584k
static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) {
749
584k
  *right = -INT32_MAX;
750
584k
  *next_left = INT32_MAX;
751
584k
  if (index < words.size()) {
752
292k
    *right = words[index]->word->bounding_box().right();
753
292k
    if (index + 1 < words.size()) {
754
16.3k
      *next_left = words[index + 1]->word->bounding_box().left();
755
16.3k
    }
756
292k
  }
757
584k
}
758
759
// Factored helper computes the rating, certainty, badness and validity of
760
// the permuter of the words in [first_index, end_index).
761
static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index,
762
584k
                             float *rating, float *certainty, bool *bad, bool *valid_permuter) {
763
584k
  if (end_index <= first_index) {
764
292k
    *bad = true;
765
292k
    *valid_permuter = false;
766
292k
  }
767
876k
  for (unsigned index = first_index; index < end_index && index < words.size(); ++index) {
768
292k
    WERD_CHOICE *choice = words[index]->best_choice;
769
292k
    if (choice == nullptr) {
770
0
      *bad = true;
771
292k
    } else {
772
292k
      *rating += choice->rating();
773
292k
      *certainty = std::min(*certainty, choice->certainty());
774
292k
      if (!Dict::valid_word_permuter(choice->permuter(), false)) {
775
106k
        *valid_permuter = false;
776
106k
      }
777
292k
    }
778
292k
  }
779
584k
}
780
781
// Helper chooses the best combination of words, transferring good ones from
782
// new_words to best_words. To win, a new word must have (better rating and
783
// certainty) or (better permuter status and rating within rating ratio and
784
// certainty within certainty margin) than current best.
785
// All the new_words are consumed (moved to best_words or deleted.)
786
// The return value is the number of new_words used minus the number of
787
// best_words that remain in the output.
788
static int SelectBestWords(double rating_ratio, double certainty_margin, bool debug,
789
                           PointerVector<WERD_RES> *new_words,
790
276k
                           PointerVector<WERD_RES> *best_words) {
791
  // Process the smallest groups of words that have an overlapping word
792
  // boundary at the end.
793
276k
  std::vector<WERD_RES *> out_words;
794
  // Index into each word vector (best, new).
795
276k
  unsigned b = 0, n = 0;
796
276k
  int num_best = 0, num_new = 0;
797
568k
  while (b < best_words->size() || n < new_words->size()) {
798
    // Start of the current run in each.
799
292k
    auto start_b = b, start_n = n;
800
292k
    while (b < best_words->size() || n < new_words->size()) {
801
292k
      int b_right = -INT32_MAX;
802
292k
      int next_b_left = INT32_MAX;
803
292k
      WordGap(*best_words, b, &b_right, &next_b_left);
804
292k
      int n_right = -INT32_MAX;
805
292k
      int next_n_left = INT32_MAX;
806
292k
      WordGap(*new_words, n, &n_right, &next_n_left);
807
292k
      if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
808
        // The word breaks overlap. [start_b,b] and [start_n, n] match.
809
292k
        break;
810
292k
      }
811
      // Keep searching for the matching word break.
812
162
      if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) {
813
0
        ++b;
814
162
      } else {
815
162
        ++n;
816
162
      }
817
162
    }
818
    // Rating of the current run in each.
819
292k
    float b_rating = 0.0f, n_rating = 0.0f;
820
    // Certainty of the current run in each.
821
292k
    float b_certainty = 0.0f, n_certainty = 0.0f;
822
    // True if any word is missing its best choice.
823
292k
    bool b_bad = false, n_bad = false;
824
    // True if all words have a valid permuter.
825
292k
    bool b_valid_permuter = true, n_valid_permuter = true;
826
292k
    const int end_b = b < best_words->size() ? b + 1 : b;
827
292k
    const int end_n = n < new_words->size() ? n + 1 : n;
828
292k
    EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad,
829
292k
                     &b_valid_permuter);
830
292k
    EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad,
831
292k
                     &n_valid_permuter);
832
292k
    bool new_better = false;
833
292k
    if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) ||
834
292k
                   (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio &&
835
292k
                    n_certainty > b_certainty - certainty_margin))) {
836
      // New is better.
837
584k
      for (int i = start_n; i < end_n; ++i) {
838
292k
        out_words.push_back((*new_words)[i]);
839
292k
        (*new_words)[i] = nullptr;
840
292k
        ++num_new;
841
292k
      }
842
292k
      new_better = true;
843
292k
    } else if (!b_bad) {
844
      // Current best is better.
845
0
      for (int i = start_b; i < end_b; ++i) {
846
0
        out_words.push_back((*best_words)[i]);
847
0
        (*best_words)[i] = nullptr;
848
0
        ++num_best;
849
0
      }
850
0
    }
851
292k
    if (debug) {
852
0
      tprintf(
853
0
          "%d new words %s than %d old words: r: %g v %g c: %g v %g"
854
0
          " valid dict: %d v %d\n",
855
0
          end_n - start_n, new_better ? "better" : "worse", end_b - start_b, n_rating, b_rating,
856
0
          n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
857
0
    }
858
    // Move on to the next group.
859
292k
    b = end_b;
860
292k
    n = end_n;
861
292k
  }
862
  // Transfer from out_words to best_words.
863
276k
  best_words->clear();
864
292k
  for (auto &out_word : out_words) {
865
292k
    best_words->push_back(out_word);
866
292k
  }
867
276k
  return num_new - num_best;
868
276k
}
869
870
// Helper to recognize the word using the given (language-specific) tesseract.
871
// Returns positive if this recognizer found more new best words than the
872
// number kept from best_words.
873
int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug,
874
276k
                                 WERD_RES **in_word, PointerVector<WERD_RES> *best_words) {
875
276k
  if (debug) {
876
0
    tprintf("Trying word using lang %s, oem %d\n", lang.c_str(),
877
0
            static_cast<int>(tessedit_ocr_engine_mode));
878
0
  }
879
  // Run the recognizer on the word.
880
276k
  PointerVector<WERD_RES> new_words;
881
276k
  (this->*recognizer)(word_data, in_word, &new_words);
882
276k
  if (new_words.empty()) {
883
    // Transfer input word to new_words, as the classifier must have put
884
    // the result back in the input.
885
128k
    new_words.push_back(*in_word);
886
128k
    *in_word = nullptr;
887
128k
  }
888
276k
  if (debug) {
889
0
    for (unsigned i = 0; i < new_words.size(); ++i) {
890
0
      new_words[i]->DebugTopChoice("Lang result");
891
0
    }
892
0
  }
893
  // Initial version is a bit of a hack based on better certainty and rating
894
  // or a dictionary vs non-dictionary word.
895
276k
  return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug,
896
276k
                         &new_words, best_words);
897
276k
}
898
899
// Helper returns true if all the words are acceptable.
900
536k
static bool WordsAcceptable(const PointerVector<WERD_RES> &words) {
901
552k
  for (unsigned w = 0; w < words.size(); ++w) {
902
536k
    if (words[w]->tess_failed || !words[w]->tess_accepted) {
903
521k
      return false;
904
521k
    }
905
536k
  }
906
15.4k
  return true;
907
536k
}
908
909
#ifndef DISABLED_LEGACY_ENGINE
910
911
// Moves good-looking "noise"/diacritics from the reject list to the main
912
// blob list on the current word. Returns true if anything was done, and
913
// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
914
0
bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) {
915
0
  *make_next_word_fuzzy = false;
916
0
  WERD *real_word = pr_it->word()->word;
917
0
  if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() ||
918
0
      real_word->rej_cblob_list()->length() > noise_maxperword) {
919
0
    return false;
920
0
  }
921
0
  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
922
  // Get the noise outlines into a vector with matching bool map.
923
0
  std::vector<C_OUTLINE *> outlines;
924
0
  real_word->GetNoiseOutlines(&outlines);
925
0
  std::vector<bool> word_wanted;
926
0
  std::vector<bool> overlapped_any_blob;
927
0
  std::vector<C_BLOB *> target_blobs;
928
0
  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
929
0
                                     &overlapped_any_blob, &target_blobs);
930
  // Filter the outlines that overlapped any blob and put them into the word
931
  // now. This simplifies the remaining task and also makes it more accurate
932
  // as it has more completed blobs to work on.
933
0
  std::vector<bool> wanted;
934
0
  std::vector<C_BLOB *> wanted_blobs;
935
0
  std::vector<C_OUTLINE *> wanted_outlines;
936
0
  int num_overlapped = 0;
937
0
  int num_overlapped_used = 0;
938
0
  for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) {
939
0
    if (overlapped_any_blob[i]) {
940
0
      ++num_overlapped;
941
0
      if (word_wanted[i]) {
942
0
        ++num_overlapped_used;
943
0
      }
944
0
      wanted.push_back(word_wanted[i]);
945
0
      wanted_blobs.push_back(target_blobs[i]);
946
0
      wanted_outlines.push_back(outlines[i]);
947
0
      outlines[i] = nullptr;
948
0
    }
949
0
  }
950
0
  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
951
0
  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs);
952
0
  int non_overlapped = 0;
953
0
  int non_overlapped_used = 0;
954
0
  for (unsigned i = 0; i < word_wanted.size(); ++i) {
955
0
    if (word_wanted[i]) {
956
0
      ++non_overlapped_used;
957
0
    }
958
0
    if (outlines[i] != nullptr) {
959
0
      ++non_overlapped_used;
960
0
    }
961
0
  }
962
0
  if (debug_noise_removal) {
963
0
    tprintf("Used %d/%d overlapped %d/%d non-overlapped diacritics on word:", num_overlapped_used,
964
0
            num_overlapped, non_overlapped_used, non_overlapped);
965
0
    real_word->bounding_box().print();
966
0
  }
967
  // Now we have decided which outlines we want, put them into the real_word.
968
0
  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) {
969
0
    pr_it->MakeCurrentWordFuzzy();
970
0
  }
971
  // TODO(rays) Parts of combos have a deep copy of the real word, and need
972
  // to have their noise outlines moved/assigned in the same way!!
973
0
  return num_overlapped_used != 0 || non_overlapped_used != 0;
974
0
}
975
976
// Attempts to put noise/diacritic outlines into the blobs that they overlap.
977
// Input: a set of noisy outlines that probably belong to the real_word.
978
// Output: word_wanted indicates which outlines are to be assigned to a blob,
979
//   target_blobs indicates which to assign to, and overlapped_any_blob is
980
//   true for all outlines that overlapped a blob.
981
void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,
982
                                                   int pass, WERD *real_word, PAGE_RES_IT *pr_it,
983
                                                   std::vector<bool> *word_wanted,
984
                                                   std::vector<bool> *overlapped_any_blob,
985
0
                                                   std::vector<C_BLOB *> *target_blobs) {
986
0
  std::vector<bool> blob_wanted;
987
0
  word_wanted->clear();
988
0
  word_wanted->resize(outlines.size());
989
0
  overlapped_any_blob->clear();
990
0
  overlapped_any_blob->resize(outlines.size());
991
0
  target_blobs->clear();
992
0
  target_blobs->resize(outlines.size());
993
  // For each real blob, find the outlines that seriously overlap it.
994
  // A single blob could be several merged characters, so there can be quite
995
  // a few outlines overlapping, and the full engine needs to be used to chop
996
  // and join to get a sensible result.
997
0
  C_BLOB_IT blob_it(real_word->cblob_list());
998
0
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
999
0
    C_BLOB *blob = blob_it.data();
1000
0
    const TBOX blob_box = blob->bounding_box();
1001
0
    blob_wanted.clear();
1002
0
    blob_wanted.resize(outlines.size());
1003
0
    int num_blob_outlines = 0;
1004
0
    for (unsigned i = 0; i < outlines.size(); ++i) {
1005
0
      if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) {
1006
0
        blob_wanted[i] = true;
1007
0
        (*overlapped_any_blob)[i] = true;
1008
0
        ++num_blob_outlines;
1009
0
      }
1010
0
    }
1011
0
    if (debug_noise_removal) {
1012
0
      tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1013
0
      blob_box.print();
1014
0
    }
1015
    // If any outlines overlap the blob, and not too many, classify the blob
1016
    // (using the full engine, languages and all), and choose the maximal
1017
    // combination of outlines that doesn't hurt the end-result classification
1018
    // by too much. Mark them as wanted.
1019
0
    if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1020
0
      if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines,
1021
0
                                      num_blob_outlines, &blob_wanted)) {
1022
0
        for (unsigned i = 0; i < blob_wanted.size(); ++i) {
1023
0
          if (blob_wanted[i]) {
1024
            // Claim the outline and record where it is going.
1025
0
            (*word_wanted)[i] = true;
1026
0
            (*target_blobs)[i] = blob;
1027
0
          }
1028
0
        }
1029
0
      }
1030
0
    }
1031
0
  }
1032
0
}
1033
1034
// Attempts to assign non-overlapping outlines to their nearest blobs or
1035
// make new blobs out of them.
1036
void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
1037
                                           WERD *real_word, PAGE_RES_IT *pr_it,
1038
                                           std::vector<bool> *word_wanted,
1039
0
                                           std::vector<C_BLOB *> *target_blobs) {
1040
0
  std::vector<bool> blob_wanted;
1041
0
  word_wanted->clear();
1042
0
  word_wanted->resize(outlines.size());
1043
0
  target_blobs->clear();
1044
0
  target_blobs->resize(outlines.size());
1045
  // Check for outlines that need to be turned into stand-alone blobs.
1046
0
  for (unsigned i = 0; i < outlines.size(); ++i) {
1047
0
    if (outlines[i] == nullptr) {
1048
0
      continue;
1049
0
    }
1050
    // Get a set of adjacent outlines that don't overlap any existing blob.
1051
0
    blob_wanted.clear();
1052
0
    blob_wanted.resize(outlines.size());
1053
0
    int num_blob_outlines = 0;
1054
0
    TBOX total_ol_box(outlines[i]->bounding_box());
1055
0
    while (i < outlines.size() && outlines[i] != nullptr) {
1056
0
      blob_wanted[i] = true;
1057
0
      total_ol_box += outlines[i]->bounding_box();
1058
0
      ++i;
1059
0
      ++num_blob_outlines;
1060
0
    }
1061
    // Find the insertion point.
1062
0
    C_BLOB_IT blob_it(real_word->cblob_list());
1063
0
    while (!blob_it.at_last() &&
1064
0
           blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) {
1065
0
      blob_it.forward();
1066
0
    }
1067
    // Choose which combination of them we actually want and where to put
1068
    // them.
1069
0
    if (debug_noise_removal) {
1070
0
      tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1071
0
    }
1072
0
    C_BLOB *left_blob = blob_it.data();
1073
0
    TBOX left_box = left_blob->bounding_box();
1074
0
    C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1075
0
    if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1076
0
         !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1077
0
        SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines,
1078
0
                                    num_blob_outlines, &blob_wanted)) {
1079
0
      if (debug_noise_removal) {
1080
0
        tprintf("Added to left blob\n");
1081
0
      }
1082
0
      for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1083
0
        if (blob_wanted[j]) {
1084
0
          (*word_wanted)[j] = true;
1085
0
          (*target_blobs)[j] = left_blob;
1086
0
        }
1087
0
      }
1088
0
    } else if (right_blob != nullptr &&
1089
0
               (!left_box.x_overlap(total_ol_box) ||
1090
0
                right_blob->bounding_box().x_overlap(total_ol_box)) &&
1091
0
               SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines,
1092
0
                                           num_blob_outlines, &blob_wanted)) {
1093
0
      if (debug_noise_removal) {
1094
0
        tprintf("Added to right blob\n");
1095
0
      }
1096
0
      for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1097
0
        if (blob_wanted[j]) {
1098
0
          (*word_wanted)[j] = true;
1099
0
          (*target_blobs)[j] = right_blob;
1100
0
        }
1101
0
      }
1102
0
    } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines,
1103
0
                                           num_blob_outlines, &blob_wanted)) {
1104
0
      if (debug_noise_removal) {
1105
0
        tprintf("Fitted between blobs\n");
1106
0
      }
1107
0
      for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1108
0
        if (blob_wanted[j]) {
1109
0
          (*word_wanted)[j] = true;
1110
0
          (*target_blobs)[j] = nullptr;
1111
0
        }
1112
0
      }
1113
0
    }
1114
0
  }
1115
0
}
1116
1117
// Starting with ok_outlines set to indicate which outlines overlap the blob,
1118
// chooses the optimal set (approximately) and returns true if any outlines
1119
// are desired, in which case ok_outlines indicates which ones.
1120
bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
1121
                                            C_BLOB *blob,
1122
                                            const std::vector<C_OUTLINE *> &outlines,
1123
0
                                            int num_outlines, std::vector<bool> *ok_outlines) {
1124
0
  std::string best_str;
1125
0
  float target_cert = certainty_threshold;
1126
0
  if (blob != nullptr) {
1127
0
    float target_c2;
1128
0
    target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2);
1129
0
    if (debug_noise_removal) {
1130
0
      tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert,
1131
0
              target_c2);
1132
0
      blob->bounding_box().print();
1133
0
    }
1134
0
    target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1135
0
  }
1136
0
  std::vector<bool> test_outlines = *ok_outlines;
1137
  // Start with all the outlines in.
1138
0
  std::string all_str;
1139
0
  std::vector<bool> best_outlines = *ok_outlines;
1140
0
  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str);
1141
0
  if (debug_noise_removal) {
1142
0
    TBOX ol_box;
1143
0
    for (unsigned i = 0; i < test_outlines.size(); ++i) {
1144
0
      if (test_outlines[i]) {
1145
0
        ol_box += outlines[i]->bounding_box();
1146
0
      }
1147
0
    }
1148
0
    tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert,
1149
0
            best_cert - target_cert);
1150
0
    ol_box.print();
1151
0
  }
1152
  // Iteratively zero out the bit that improves the certainty the most, until
1153
  // we get past the threshold, have zero bits, or fail to improve.
1154
0
  int best_index = 0; // To zero out.
1155
0
  while (num_outlines > 1 && best_index >= 0 &&
1156
0
         (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1157
    // Find the best bit to zero out.
1158
0
    best_index = -1;
1159
0
    for (unsigned i = 0; i < outlines.size(); ++i) {
1160
0
      if (test_outlines[i]) {
1161
0
        test_outlines[i] = false;
1162
0
        std::string str;
1163
0
        float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str);
1164
0
        if (debug_noise_removal) {
1165
0
          TBOX ol_box;
1166
0
          for (unsigned j = 0; j < outlines.size(); ++j) {
1167
0
            if (test_outlines[j]) {
1168
0
              ol_box += outlines[j]->bounding_box();
1169
0
            }
1170
0
            tprintf("%c", test_outlines[j] ? 'T' : 'F');
1171
0
          }
1172
0
          tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert,
1173
0
                  cert - target_cert);
1174
0
          ol_box.print();
1175
0
        }
1176
0
        if (cert > best_cert) {
1177
0
          best_cert = cert;
1178
0
          best_index = i;
1179
0
          best_outlines = test_outlines;
1180
0
        }
1181
0
        test_outlines[i] = true;
1182
0
      }
1183
0
    }
1184
0
    if (best_index >= 0) {
1185
0
      test_outlines[best_index] = false;
1186
0
      --num_outlines;
1187
0
    }
1188
0
  }
1189
0
  if (best_cert >= target_cert) {
1190
    // Save the best combination.
1191
0
    *ok_outlines = best_outlines;
1192
0
    if (debug_noise_removal) {
1193
0
      tprintf("%s noise combination ", blob ? "Adding" : "New");
1194
0
      for (auto &&best_outline : best_outlines) {
1195
0
        tprintf("%c", best_outline ? 'T' : 'F');
1196
0
      }
1197
0
      tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert);
1198
0
    }
1199
0
    return true;
1200
0
  }
1201
1202
0
  return false;
1203
0
}
1204
1205
// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
1206
// the inclusion of the outlines, and returns the certainty of the raw choice.
1207
float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
1208
                                          const std::vector<C_OUTLINE *> &outlines, int pass_n,
1209
0
                                          PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {
1210
0
  C_OUTLINE_IT ol_it;
1211
0
  C_OUTLINE *first_to_keep = nullptr;
1212
0
  C_BLOB *local_blob = nullptr;
1213
0
  if (blob != nullptr) {
1214
    // Add the required outlines to the blob.
1215
0
    ol_it.set_to_list(blob->out_list());
1216
0
    first_to_keep = ol_it.data();
1217
0
  }
1218
0
  for (unsigned i = 0; i < ok_outlines.size(); ++i) {
1219
0
    if (ok_outlines[i]) {
1220
      // This outline is to be added.
1221
0
      if (blob == nullptr) {
1222
0
        local_blob = new C_BLOB(outlines[i]);
1223
0
        blob = local_blob;
1224
0
        ol_it.set_to_list(blob->out_list());
1225
0
      } else {
1226
0
        ol_it.add_before_stay_put(outlines[i]);
1227
0
      }
1228
0
    }
1229
0
  }
1230
0
  float c2;
1231
0
  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1232
0
  ol_it.move_to_first();
1233
0
  if (first_to_keep == nullptr) {
1234
    // We created blob. Empty its outlines and delete it.
1235
0
    for (; !ol_it.empty(); ol_it.forward()) {
1236
0
      ol_it.extract();
1237
0
    }
1238
0
    delete local_blob;
1239
0
    cert = -c2;
1240
0
  } else {
1241
    // Remove the outlines that we put in.
1242
0
    for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1243
0
      ol_it.extract();
1244
0
    }
1245
0
  }
1246
0
  return cert;
1247
0
}
1248
1249
// Classifies the given blob (part of word_data->word->word) as an individual
1250
// word, using languages, chopper etc, returning only the certainty of the
1251
// best raw choice, and undoing all the work done to fake out the word.
1252
float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str,
1253
0
                                    float *c2) {
1254
0
  WERD *real_word = pr_it->word()->word;
1255
0
  WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL),
1256
0
                                                  C_BLOB::deep_copy(blob));
1257
0
  WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1258
  // Get a new iterator that points to the new word.
1259
0
  PAGE_RES_IT it(pr_it->page_res);
1260
0
  while (it.word() != word_res && it.word() != nullptr) {
1261
0
    it.forward();
1262
0
  }
1263
0
  ASSERT_HOST(it.word() == word_res);
1264
0
  WordData wd(it);
1265
  // Force full initialization.
1266
0
  SetupWordPassN(1, &wd);
1267
0
  classify_word_and_language(pass_n, &it, &wd);
1268
0
  if (debug_noise_removal) {
1269
0
    if (wd.word->raw_choice != nullptr) {
1270
0
      tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(),
1271
0
              wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height());
1272
0
    } else {
1273
0
      tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1274
0
              wd.row->x_height());
1275
0
    }
1276
0
  }
1277
0
  float cert = 0.0f;
1278
0
  if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1279
0
    cert = wd.word->raw_choice->certainty();
1280
0
    float rat = wd.word->raw_choice->rating();
1281
0
    *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1282
0
    best_str = wd.word->raw_choice->unichar_string();
1283
0
  } else {
1284
0
    *c2 = 0.0f;
1285
0
    best_str.clear();
1286
0
  }
1287
0
  it.DeleteCurrentWord();
1288
0
  pr_it->ResetWordIterator();
1289
0
  return cert;
1290
0
}
1291
1292
#endif // ndef DISABLED_LEGACY_ENGINE
1293
1294
// Generic function for classifying a word. Can be used either for pass1 or
1295
// pass2 according to the function passed to recognizer.
1296
// word_data holds the word to be recognized, and its block and row, and
1297
// pr_it points to the word as well, in case we are running LSTM and it wants
1298
// to output multiple words.
1299
// Recognizes in the current language, and if successful that is all.
1300
// If recognition was not successful, tries all available languages until
1301
// it gets a successful result or runs out of languages. Keeps the best result.
1302
444k
void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) {
1303
#ifdef DISABLED_LEGACY_ENGINE
1304
  WordRecognizer recognizer = &Tesseract::classify_word_pass1;
1305
#else
1306
444k
  WordRecognizer recognizer =
1307
444k
      pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2;
1308
444k
#endif // def DISABLED_LEGACY_ENGINE
1309
1310
  // Best result so far.
1311
444k
  PointerVector<WERD_RES> best_words;
1312
  // Points to the best result. May be word or in lang_words.
1313
444k
  const WERD_RES *word = word_data->word;
1314
444k
  clock_t start_t = clock();
1315
444k
  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1316
444k
  if (debug) {
1317
0
    tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing",
1318
0
            most_recently_used_->lang.c_str());
1319
0
    word->word->bounding_box().print();
1320
0
  }
1321
444k
  if (word->done) {
1322
    // If done on pass1, leave it as-is.
1323
168k
    if (!word->tess_failed) {
1324
166k
      most_recently_used_ = word->tesseract;
1325
166k
    }
1326
168k
    return;
1327
168k
  }
1328
276k
  auto sub = sub_langs_.size();
1329
276k
  if (most_recently_used_ != this) {
1330
    // Get the index of the most_recently_used_.
1331
0
    for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {
1332
0
    }
1333
0
  }
1334
276k
  most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub],
1335
276k
                                         &best_words);
1336
276k
  Tesseract *best_lang_tess = most_recently_used_;
1337
276k
  if (!WordsAcceptable(best_words)) {
1338
    // Try all the other languages to see if they are any better.
1339
260k
    if (most_recently_used_ != this &&
1340
260k
        this->RetryWithLanguage(*word_data, recognizer, debug,
1341
0
                                &word_data->lang_words[sub_langs_.size()], &best_words) > 0) {
1342
0
      best_lang_tess = this;
1343
0
    }
1344
260k
    for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) {
1345
0
      if (most_recently_used_ != sub_langs_[i] &&
1346
0
          sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i],
1347
0
                                           &best_words) > 0) {
1348
0
        best_lang_tess = sub_langs_[i];
1349
0
      }
1350
0
    }
1351
260k
  }
1352
276k
  most_recently_used_ = best_lang_tess;
1353
276k
  if (!best_words.empty()) {
1354
276k
    if (best_words.size() == 1 && !best_words[0]->combination) {
1355
      // Move the best single result to the main word.
1356
128k
      word_data->word->ConsumeWordResults(best_words[0]);
1357
147k
    } else {
1358
      // Words came from LSTM, and must be moved to the PAGE_RES properly.
1359
147k
      word_data->word = best_words.back();
1360
147k
      pr_it->ReplaceCurrentWord(&best_words);
1361
147k
    }
1362
276k
    ASSERT_HOST(word_data->word->box_word != nullptr);
1363
276k
  } else {
1364
0
    tprintf("no best words!!\n");
1365
0
  }
1366
276k
  clock_t ocr_t = clock();
1367
276k
  if (tessedit_timing_debug) {
1368
0
    tprintf("%s (ocr took %.2f sec)\n", word_data->word->best_choice->unichar_string().c_str(),
1369
0
            static_cast<double>(ocr_t - start_t) / CLOCKS_PER_SEC);
1370
0
  }
1371
276k
}
1372
1373
/**
1374
 * classify_word_pass1
1375
 *
1376
 * Baseline normalize the word and pass it to Tess.
1377
 */
1378
1379
void Tesseract::classify_word_pass1(const WordData &word_data, WERD_RES **in_word,
1380
212k
                                    PointerVector<WERD_RES> *out_words) {
1381
212k
  ROW *row = word_data.row;
1382
212k
  BLOCK *block = word_data.block;
1383
212k
  prev_word_best_choice_ =
1384
212k
      word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1385
#ifdef DISABLED_LEGACY_ENGINE
1386
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1387
#else
1388
212k
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
1389
212k
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
1390
212k
#endif // def DISABLED_LEGACY_ENGINE
1391
212k
    if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1392
180k
      LSTMRecognizeWord(*block, row, *in_word, out_words);
1393
180k
      if (!out_words->empty()) {
1394
147k
        return; // Successful lstm recognition.
1395
147k
      }
1396
180k
    }
1397
65.5k
    if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1398
      // No fallback allowed, so use a fake.
1399
0
      (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1400
0
      return;
1401
0
    }
1402
1403
65.5k
#ifndef DISABLED_LEGACY_ENGINE
1404
    // Fall back to tesseract for failed words or odd words.
1405
65.5k
    (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr,
1406
65.5k
                                    classify_bln_numeric_mode, textord_use_cjk_fp_model,
1407
65.5k
                                    poly_allow_detailed_fx, row, block);
1408
65.5k
#endif // ndef DISABLED_LEGACY_ENGINE
1409
65.5k
  }
1410
1411
65.5k
#ifndef DISABLED_LEGACY_ENGINE
1412
65.5k
  WERD_RES *word = *in_word;
1413
65.5k
  match_word_pass_n(1, word, row, block);
1414
65.5k
  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1415
65.5k
    word->tess_would_adapt = AdaptableWord(word);
1416
65.5k
    bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1417
1418
65.5k
    if (adapt_ok) {
1419
      // Send word to adaptive classifier for training.
1420
2.36k
      word->BestChoiceToCorrectText();
1421
2.36k
      LearnWord(nullptr, word);
1422
      // Mark misadaptions if running blamer.
1423
2.36k
      if (word->blamer_bundle != nullptr) {
1424
0
        word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer);
1425
0
      }
1426
2.36k
    }
1427
1428
65.5k
    if (tessedit_enable_doc_dict && !word->IsAmbiguous()) {
1429
36.2k
      tess_add_doc_word(word->best_choice);
1430
36.2k
    }
1431
65.5k
  }
1432
65.5k
#endif // ndef DISABLED_LEGACY_ENGINE
1433
65.5k
}
1434
1435
// Helper to report the result of the xheight fix.
1436
void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word,
1437
0
                                   WERD_RES *new_word) {
1438
0
  tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().c_str(),
1439
0
          word->best_choice->debug_string().c_str());
1440
0
  word->reject_map.print(debug_fp);
1441
0
  tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().c_str(),
1442
0
          new_word->best_choice->debug_string().c_str());
1443
0
  new_word->reject_map.print(debug_fp);
1444
0
  tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT",
1445
0
          new_word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1446
0
          accept_new_word ? "ACCEPTED" : "");
1447
0
}
1448
1449
#ifndef DISABLED_LEGACY_ENGINE
1450
1451
// Run the x-height fix-up, based on min/max top/bottom information in
1452
// unicharset.
1453
// Returns true if the word was changed.
1454
// See the comment in fixxht.cpp for a description of the overall process.
1455
63.1k
bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row) {
1456
63.1k
  int original_misfits = CountMisfitTops(word);
1457
63.1k
  if (original_misfits == 0) {
1458
27.4k
    return false;
1459
27.4k
  }
1460
35.7k
  float baseline_shift = 0.0f;
1461
35.7k
  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1462
35.7k
  if (baseline_shift != 0.0f) {
1463
    // Try the shift on its own first.
1464
12.3k
    if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) {
1465
8.33k
      return false;
1466
8.33k
    }
1467
4.06k
    original_misfits = CountMisfitTops(word);
1468
4.06k
    if (original_misfits > 0) {
1469
1.48k
      float new_baseline_shift;
1470
      // Now recompute the new x_height.
1471
1.48k
      new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1472
1.48k
      if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1473
        // No test of return value here, as we are definitely making a change
1474
        // to the word by shifting the baseline.
1475
1.26k
        TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row);
1476
1.26k
      }
1477
1.48k
    }
1478
4.06k
    return true;
1479
23.3k
  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1480
18.4k
    return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row);
1481
18.4k
  } else {
1482
4.90k
    return false;
1483
4.90k
  }
1484
35.7k
}
1485
1486
// Runs recognition with the test baseline shift and x-height and returns true
1487
// if there was an improvement in recognition result.
1488
bool Tesseract::TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht,
1489
32.0k
                                     WERD_RES *word, BLOCK *block, ROW *row) {
1490
32.0k
  bool accept_new_x_ht = false;
1491
32.0k
  WERD_RES new_x_ht_word(word->word);
1492
32.0k
  if (word->blamer_bundle != nullptr) {
1493
0
    new_x_ht_word.blamer_bundle = new BlamerBundle();
1494
0
    new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1495
0
  }
1496
32.0k
  new_x_ht_word.x_height = new_x_ht;
1497
32.0k
  new_x_ht_word.baseline_shift = baseline_shift;
1498
32.0k
  new_x_ht_word.caps_height = 0.0;
1499
32.0k
  new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1500
32.0k
                                    classify_bln_numeric_mode, textord_use_cjk_fp_model,
1501
32.0k
                                    poly_allow_detailed_fx, row, block);
1502
32.0k
  match_word_pass_n(2, &new_x_ht_word, row, block);
1503
32.0k
  if (!new_x_ht_word.tess_failed) {
1504
32.0k
    int new_misfits = CountMisfitTops(&new_x_ht_word);
1505
32.0k
    if (debug_x_ht_level >= 1) {
1506
0
      tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits,
1507
0
              word->x_height, new_misfits, new_x_ht);
1508
0
      tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(),
1509
0
              word->best_choice->certainty(), new_x_ht_word.best_choice->rating(),
1510
0
              new_x_ht_word.best_choice->certainty());
1511
0
    }
1512
    // The misfits must improve and either the rating or certainty.
1513
32.0k
    accept_new_x_ht = new_misfits < original_misfits &&
1514
32.0k
                      (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() ||
1515
22.6k
                       new_x_ht_word.best_choice->rating() < word->best_choice->rating());
1516
32.0k
    if (debug_x_ht_level >= 1) {
1517
0
      ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1518
0
    }
1519
32.0k
  }
1520
32.0k
  if (accept_new_x_ht) {
1521
18.2k
    word->ConsumeWordResults(&new_x_ht_word);
1522
18.2k
    return true;
1523
18.2k
  }
1524
13.8k
  return false;
1525
32.0k
}
1526
1527
#endif // ndef DISABLED_LEGACY_ENGINE
1528
1529
/**
1530
 * classify_word_pass2
1531
 *
1532
 * Control what to do with the word in pass 2
1533
 */
1534
1535
void Tesseract::classify_word_pass2(const WordData &word_data, WERD_RES **in_word,
1536
63.2k
                                    PointerVector<WERD_RES> *out_words) {
1537
  // Return if we do not want to run Tesseract.
1538
63.2k
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1539
0
    return;
1540
0
  }
1541
63.2k
#ifndef DISABLED_LEGACY_ENGINE
1542
63.2k
  ROW *row = word_data.row;
1543
63.2k
  BLOCK *block = word_data.block;
1544
63.2k
  WERD_RES *word = *in_word;
1545
63.2k
  prev_word_best_choice_ =
1546
63.2k
      word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1547
1548
63.2k
  check_debug_pt(word, 30);
1549
63.2k
  if (!word->done) {
1550
63.2k
    word->caps_height = 0.0;
1551
63.2k
    if (word->x_height == 0.0f) {
1552
0
      word->x_height = row->x_height();
1553
0
    }
1554
63.2k
    match_word_pass_n(2, word, row, block);
1555
63.2k
    check_debug_pt(word, 40);
1556
63.2k
  }
1557
1558
63.2k
  SubAndSuperscriptFix(word);
1559
1560
63.2k
  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1561
63.1k
    if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
1562
63.1k
        block->classify_rotation().y() == 0.0f) {
1563
      // Use the tops and bottoms since they are available.
1564
63.1k
      TrainedXheightFix(word, block, row);
1565
63.1k
    }
1566
63.1k
  }
1567
#  ifndef GRAPHICS_DISABLED
1568
  if (tessedit_display_outwords) {
1569
    if (fx_win == nullptr) {
1570
      create_fx_win();
1571
    }
1572
    clear_fx_win();
1573
    word->rebuild_word->plot(fx_win);
1574
    TBOX wbox = word->rebuild_word->bounding_box();
1575
    fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
1576
    ScrollView::Update();
1577
  }
1578
#  endif
1579
63.2k
  check_debug_pt(word, 50);
1580
63.2k
#endif // ndef DISABLED_LEGACY_ENGINE
1581
63.2k
}
1582
1583
#ifndef DISABLED_LEGACY_ENGINE
1584
/**
1585
 * match_word_pass2
1586
 *
1587
 * Baseline normalize the word and pass it to Tess.
1588
 */
1589
160k
void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block) {
1590
160k
  if (word->tess_failed) {
1591
0
    return;
1592
0
  }
1593
160k
  tess_segment_pass_n(pass_n, word);
1594
1595
160k
  if (!word->tess_failed) {
1596
160k
    if (!word->word->flag(W_REP_CHAR)) {
1597
160k
      word->fix_quotes();
1598
160k
      if (tessedit_fix_hyphens) {
1599
160k
        word->fix_hyphens();
1600
160k
      }
1601
      /* Don't trust fix_quotes! - though I think I've fixed the bug */
1602
160k
      if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {
1603
0
        tprintf(
1604
0
            "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1605
0
            " #Blobs=%u\n",
1606
0
            word->best_choice->debug_string().c_str(), word->best_choice->length(),
1607
0
            word->box_word->length());
1608
0
      }
1609
160k
      word->tess_accepted = tess_acceptable_word(word);
1610
1611
      // Also sets word->done flag
1612
160k
      make_reject_map(word, row, pass_n);
1613
160k
    }
1614
160k
  }
1615
160k
  set_word_fonts(word);
1616
1617
160k
  ASSERT_HOST(word->raw_choice != nullptr);
1618
160k
}
1619
#endif // ndef DISABLED_LEGACY_ENGINE
1620
1621
// Helper to return the best rated BLOB_CHOICE in the whole word that matches
1622
// the given char_id, or nullptr if none can be found.
1623
0
static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {
1624
  // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1625
0
  BLOB_CHOICE *best_choice = nullptr;
1626
0
  for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
1627
0
    BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i));
1628
0
    if (choice != nullptr) {
1629
0
      if (best_choice == nullptr || choice->rating() < best_choice->rating()) {
1630
0
        best_choice = choice;
1631
0
      }
1632
0
    }
1633
0
  }
1634
0
  return best_choice;
1635
0
}
1636
1637
// Helper to insert blob_choice in each location in the leader word if there is
1638
// no matching BLOB_CHOICE there already, and correct any incorrect results
1639
// in the best_choice.
1640
0
static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
1641
0
  WERD_CHOICE *word = word_res->best_choice;
1642
0
  for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
1643
0
    BLOB_CHOICE *choice =
1644
0
        FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i));
1645
0
    if (choice == nullptr) {
1646
0
      BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
1647
0
      choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1648
0
    }
1649
0
  }
1650
  // Correct any incorrect results in word.
1651
0
  for (unsigned i = 0; i < word->length(); ++i) {
1652
0
    if (word->unichar_id(i) != blob_choice->unichar_id()) {
1653
0
      word->set_unichar_id(blob_choice->unichar_id(), i);
1654
0
    }
1655
0
  }
1656
0
}
1657
1658
/**
1659
 * fix_rep_char()
1660
 * The word is a repeated char. (Leader.) Find the repeated char character.
1661
 * Create the appropriate single-word or multi-word sequence according to
1662
 * the size of spaces in between blobs, and correct the classifications
1663
 * where some of the characters disagree with the majority.
1664
 */
1665
0
void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) {
1666
0
  WERD_RES *word_res = page_res_it->word();
1667
0
  const WERD_CHOICE &word = *(word_res->best_choice);
1668
1669
  // Find the frequency of each unique character in the word.
1670
0
  SortHelper<UNICHAR_ID> rep_ch(word.length());
1671
0
  for (unsigned i = 0; i < word.length(); ++i) {
1672
0
    rep_ch.Add(word.unichar_id(i), 1);
1673
0
  }
1674
1675
  // Find the most frequent result.
1676
0
  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1677
0
  int max_count = rep_ch.MaxCount(&maxch_id);
1678
  // Find the best exemplar of a classifier result for maxch_id.
1679
0
  BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res);
1680
0
  if (best_choice == nullptr) {
1681
0
    tprintf("Failed to find a choice for %s, occurring %d times\n",
1682
0
            word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
1683
0
    return;
1684
0
  }
1685
0
  word_res->done = true;
1686
1687
  // Just correct existing classification.
1688
0
  CorrectRepcharChoices(best_choice, word_res);
1689
0
  word_res->reject_map.initialise(word.length());
1690
0
}
1691
1692
// Classifies the string s, whose per-character byte lengths are given by the
// NUL-terminated array lengths, as one of the ACCEPTABLE_WERD_TYPE values.
// The word form accepted is: an optional single leading punctuation char,
// then either two or more upper-case chars, or an optional initial cap
// followed by lower-case chars (with either one embedded hyphen or a trailing
// "'s"), then up to two different trailing punctuation chars.
// Strings that do not match are re-examined as abbreviations made of
// <letter>'.' pairs of a single case. Anything else, and any string with more
// than 20 characters, is AC_UNACCEPTABLE.
ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_set, const char *s,
                                                       const char *lengths) {
  // All locals are declared before the first goto so that no initialisation
  // is jumped over.
  int i = 0;      // index of the current character in lengths
  int offset = 0; // byte offset of the current character in s
  int leading_punct_count;
  int upper_count = 0;
  int hyphen_pos = -1;
  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;

  // Steps over the current character.
  auto advance = [&]() { offset += lengths[i++]; };
  // True if the string has not ended and the current character has the
  // requested case.
  auto at_case = [&](bool upper) {
    if (s[offset] == '\0') {
      return false;
    }
    return upper ? char_set.get_isupper(s + offset, lengths[i])
                 : char_set.get_islower(s + offset, lengths[i]);
  };

  if (strlen(lengths) > 20) {
    return word_type;
  }

  /* Single Leading punctuation char*/
  if (s[offset] != '\0' && chs_leading_punct.contains(s[offset])) {
    advance();
  }
  leading_punct_count = i;

  /* Initial cap */
  while (at_case(true)) {
    advance();
    upper_count++;
  }

  if (upper_count > 1) {
    word_type = AC_UPPER_CASE;
  } else {
    /* Lower case word, possibly with an initial cap */
    while (at_case(false)) {
      advance();
    }
    if (i - leading_punct_count < quality_min_initial_alphas_reqd) {
      goto not_a_word;
    }
    /*
Allow a single hyphen in a lower case word
- don't trust upper case - I've seen several cases of "H" -> "I-I"
*/
    if (lengths[i] == 1 && s[offset] == '-') {
      hyphen_pos = i;
      advance();
      if (s[offset] != '\0') {
        while (at_case(false)) {
          advance();
        }
        if (i < hyphen_pos + 3) {
          goto not_a_word;
        }
      }
    } else if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 &&
               (s[offset + lengths[i]] == 's')) {
      /* Allow "'s" in NON hyphenated lower case words */
      advance();
      advance();
    }
    word_type = upper_count > 0 ? AC_INITIAL_CAP : AC_LOWER_CASE;
  }

  /* Up to two different, constrained trailing punctuation chars */
  if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset])) {
    advance();
  }
  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
      chs_trailing_punct2.contains(s[offset])) {
    advance();
  }

  // Anything left over means the string is not a plain word.
  if (s[offset] != '\0') {
    word_type = AC_UNACCEPTABLE;
  }

not_a_word:

  if (word_type == AC_UNACCEPTABLE) {
    /* Look for abbreviation string */
    i = 0;
    offset = 0;
    const bool starts_upper = s[0] != '\0' && char_set.get_isupper(s, lengths[0]);
    const bool starts_lower =
        !starts_upper && s[0] != '\0' && char_set.get_islower(s, lengths[0]);
    if (starts_upper || starts_lower) {
      word_type = starts_upper ? AC_UC_ABBREV : AC_LC_ABBREV;
      // Consume <letter>'.' pairs of the starting case.
      while (at_case(starts_upper) && lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        advance();
        advance();
      }
    }
    if (s[offset] != '\0') {
      word_type = AC_UNACCEPTABLE;
    }
  }

  return word_type;
}
1798
1799
672k
// Debug hook keyed on a test point. Returns false when no test point is set
// or when the word's bounding box does not contain (test_pt_x, test_pt_y);
// in both of the latter cases rejection debug and x-height debug are first
// switched off. When the word contains the point, returns true; for a
// non-negative location it also enables rejection debug, sets the x-height
// debug level to 2 and prints the stage named by location together with the
// word's best choice, reject map, and accepted/done flags.
bool Tesseract::check_debug_pt(WERD_RES *word, int location) {
  if (!test_pt) {
    return false;
  }

  tessedit_rejection_debug.set_value(false);
  debug_x_ht_level.set_value(0);

  if (!word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) {
    return false;
  }
  if (location < 0) {
    return true; // For breakpoint use
  }

  tessedit_rejection_debug.set_value(true);
  debug_x_ht_level.set_value(2);
  tprintf("\n\nTESTWD::");

  // Text announcing the stage; unknown locations print nothing.
  const char *stage = nullptr;
  switch (location) {
    case 0:
      stage = "classify_word_pass1 start\n";
      break;
    case 10:
      stage = "make_reject_map: initial map";
      break;
    case 20:
      stage = "make_reject_map: after NN";
      break;
    case 30:
      stage = "classify_word_pass2 - START";
      break;
    case 40:
      stage = "classify_word_pass2 - Pre Xht";
      break;
    case 50:
      stage = "classify_word_pass2 - END";
      break;
    case 60:
      stage = "fixspace";
      break;
    case 70:
      stage = "MM pass START";
      break;
    case 80:
      stage = "MM pass END";
      break;
    case 90:
      stage = "After Poor quality rejection";
      break;
    case 100:
      stage = "unrej_good_quality_words - START";
      break;
    case 110:
      stage = "unrej_good_quality_words - END";
      break;
    case 120:
      stage = "Write results pass";
      break;
  }
  if (stage != nullptr) {
    tprintf("%s", stage);
  }
  if (location == 0) {
    word->word->print();
  }
  // Only the end-of-pass2 and write-results stages dump the per-char map.
  const bool show_map_detail = location == 50 || location == 120;

  if (word->best_choice == nullptr) {
    tprintf("null best choice\n");
  } else {
    tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
    word->reject_map.print(debug_fp);
    tprintf("\n");
    if (show_map_detail) {
      tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
      // NOTE(review): i walks the bytes of the best-choice string and is also
      // used to index reject_map; this presumably assumes one byte per reject
      // map entry - confirm for multi-byte text.
      for (int16_t i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
        tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
        word->reject_map[i].full_print(debug_fp);
      }
    }
  }
  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
  return true;
}
1882
1883
/**
1884
 * find_modal_font
1885
 *
1886
 * Find the modal font and remove from the stats.
1887
 */
1888
#ifndef DISABLED_LEGACY_ENGINE
1889
static void find_modal_font( // good chars in word
1890
    STATS *fonts,            // font stats
1891
    int16_t *font_out,       // output font
1892
    int8_t *font_count       // output count
1893
0
) {
1894
0
  int16_t font;  // font index
1895
0
  int32_t count; // pile count
1896
1897
0
  if (fonts->get_total() > 0) {
1898
0
    font = static_cast<int16_t>(fonts->mode());
1899
0
    *font_out = font;
1900
0
    count = fonts->pile_count(font);
1901
0
    *font_count = count < INT8_MAX ? count : INT8_MAX;
1902
0
    fonts->add(font, -*font_count);
1903
0
  } else {
1904
0
    *font_out = -1;
1905
0
    *font_count = 0;
1906
0
  }
1907
0
}
1908
#endif // ! DISABLED_LEGACY_ENGINE
1909
1910
/**
1911
 * set_word_fonts
1912
 *
1913
 * Get the fonts for the word.
1914
 */
1915
160k
void Tesseract::set_word_fonts(WERD_RES *word) {
1916
  // Don't try to set the word fonts for an lstm word, as the configs
1917
  // will be meaningless.
1918
160k
  if (word->chopped_word == nullptr) {
1919
0
    return;
1920
0
  }
1921
160k
  ASSERT_HOST(word->best_choice != nullptr);
1922
1923
160k
#ifndef DISABLED_LEGACY_ENGINE
1924
160k
  const int fontinfo_size = fontinfo_table_.size();
1925
160k
  if (fontinfo_size == 0) {
1926
0
    return;
1927
0
  }
1928
160k
  if (tessedit_font_id > 0) {
1929
0
    if (tessedit_font_id >= fontinfo_size) {
1930
0
      tprintf("Error, invalid font ID provided: must be below %d.\n"
1931
0
              "Falling back to font auto-detection.\n", fontinfo_size);
1932
0
    } else {
1933
0
      word->fontinfo = &fontinfo_table_.at(tessedit_font_id);
1934
0
      word->fontinfo2 = nullptr;
1935
0
      word->fontinfo_id_count = INT8_MAX;
1936
0
      word->fontinfo_id2_count = 0;
1937
0
      return;
1938
0
    }
1939
0
  }
1940
160k
  std::vector<int> font_total_score(fontinfo_size);
1941
1942
  // Compute the font scores for the word
1943
160k
  if (tessedit_debug_fonts) {
1944
0
    tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
1945
0
  }
1946
853k
  for (unsigned b = 0; b < word->best_choice->length(); ++b) {
1947
692k
    const BLOB_CHOICE *choice = word->GetBlobChoice(b);
1948
692k
    if (choice == nullptr) {
1949
81
      continue;
1950
81
    }
1951
692k
    auto &fonts = choice->fonts();
1952
22.2M
    for (auto &f : fonts) {
1953
22.2M
      const int fontinfo_id = f.fontinfo_id;
1954
22.2M
      if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1955
22.2M
        font_total_score[fontinfo_id] += f.score;
1956
22.2M
      }
1957
22.2M
    }
1958
692k
  }
1959
  // Find the top and 2nd choice for the word.
1960
160k
  int score1 = 0, score2 = 0;
1961
160k
  int16_t font_id1 = -1, font_id2 = -1;
1962
68.8M
  for (int f = 0; f < fontinfo_size; ++f) {
1963
68.6M
    if (tessedit_debug_fonts && font_total_score[f] > 0) {
1964
0
      tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]);
1965
0
    }
1966
68.6M
    if (font_total_score[f] > score1) {
1967
494k
      score2 = score1;
1968
494k
      font_id2 = font_id1;
1969
494k
      score1 = font_total_score[f];
1970
494k
      font_id1 = f;
1971
68.1M
    } else if (font_total_score[f] > score2) {
1972
319k
      score2 = font_total_score[f];
1973
319k
      font_id2 = f;
1974
319k
    }
1975
68.6M
  }
1976
160k
  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr;
1977
160k
  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr;
1978
  // Each score has a limit of UINT16_MAX, so divide by that to get the number
1979
  // of "votes" for that font, ie number of perfect scores.
1980
160k
  word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
1981
160k
  word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
1982
160k
  if (score1 > 0) {
1983
160k
    const FontInfo fi = fontinfo_table_.at(font_id1);
1984
160k
    if (tessedit_debug_fonts) {
1985
0
      if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
1986
0
        tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name,
1987
0
                word->fontinfo_id_count, fontinfo_table_.at(font_id2).name,
1988
0
                word->fontinfo_id2_count);
1989
0
      } else {
1990
0
        tprintf("Word modal font=%s, score=%d. No 2nd choice\n", fi.name, word->fontinfo_id_count);
1991
0
      }
1992
0
    }
1993
160k
  }
1994
160k
#endif // ndef DISABLED_LEGACY_ENGINE
1995
160k
}
1996
1997
#ifndef DISABLED_LEGACY_ENGINE
1998
/**
1999
 * font_recognition_pass
2000
 *
2001
 * Smooth the fonts for the document.
2002
 */
2003
0
void Tesseract::font_recognition_pass(PAGE_RES *page_res) {
2004
0
  PAGE_RES_IT page_res_it(page_res);
2005
0
  WERD_RES *word;                       // current word
2006
0
  STATS doc_fonts(0, font_table_size_ - 1); // font counters
2007
2008
  // Gather font id statistics.
2009
0
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2010
0
    word = page_res_it.word();
2011
0
    if (word->fontinfo != nullptr) {
2012
0
      doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2013
0
    }
2014
0
    if (word->fontinfo2 != nullptr) {
2015
0
      doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2016
0
    }
2017
0
  }
2018
0
  int16_t doc_font;      // modal font
2019
0
  int8_t doc_font_count; // modal font
2020
0
  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2021
0
  if (doc_font_count == 0) {
2022
0
    return;
2023
0
  }
2024
  // Get the modal font pointer.
2025
0
  const FontInfo *modal_font = nullptr;
2026
0
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2027
0
    word = page_res_it.word();
2028
0
    if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2029
0
      modal_font = word->fontinfo;
2030
0
      break;
2031
0
    }
2032
0
    if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2033
0
      modal_font = word->fontinfo2;
2034
0
      break;
2035
0
    }
2036
0
  }
2037
0
  ASSERT_HOST(modal_font != nullptr);
2038
2039
  // Assign modal font to weak words.
2040
0
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2041
0
    word = page_res_it.word();
2042
0
    const int length = word->best_choice->length();
2043
2044
0
    const int count = word->fontinfo_id_count;
2045
0
    if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2046
0
      word->fontinfo = modal_font;
2047
      // Counts only get 1 as it came from the doc.
2048
0
      word->fontinfo_id_count = 1;
2049
0
    }
2050
0
  }
2051
0
}
2052
#endif // ndef DISABLED_LEGACY_ENGINE
2053
2054
// If a word has multiple alternates check if the best choice is in the
2055
// dictionary. If not, replace it with an alternate that exists in the
2056
// dictionary.
2057
0
void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {
2058
0
  PAGE_RES_IT word_it(page_res);
2059
0
  for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) {
2060
0
    if (word->best_choices.singleton()) {
2061
0
      continue; // There are no alternates.
2062
0
    }
2063
2064
0
    const WERD_CHOICE *best = word->best_choice;
2065
0
    if (word->tesseract->getDict().valid_word(*best) != 0) {
2066
0
      continue; // The best choice is in the dictionary.
2067
0
    }
2068
2069
0
    WERD_CHOICE_IT choice_it(&word->best_choices);
2070
0
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
2071
0
      WERD_CHOICE *alternate = choice_it.data();
2072
0
      if (word->tesseract->getDict().valid_word(*alternate)) {
2073
        // The alternate choice is in the dictionary.
2074
0
        if (tessedit_bigram_debug) {
2075
0
          tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2076
0
                  best->unichar_string().c_str(), alternate->unichar_string().c_str());
2077
0
        }
2078
        // Replace the 'best' choice with a better choice.
2079
0
        word->ReplaceBestChoice(alternate);
2080
0
        break;
2081
0
      }
2082
0
    }
2083
0
  }
2084
0
}
2085
2086
} // namespace tesseract