Coverage Report

Created: 2024-02-28 06:46

/src/tesseract/src/ccmain/recogtraining.cpp
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        recogtraining.cpp
3
// Description: Functions for ambiguity and parameter training.
4
// Author:      Daria Antonova
5
//
6
// (C) Copyright 2009, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#include "tesseractclass.h"
20
21
#include "boxread.h"
22
#include "control.h"
23
#include "host.h" // for NearlyEqual
24
#include "ratngs.h"
25
#ifndef DISABLED_LEGACY_ENGINE
26
#  include "reject.h"
27
#endif
28
#include "stopper.h"
29
30
namespace tesseract {
31
32
// Tolerance passed to NearlyEqual when comparing an edge of a
// tesseract-identified box with the corresponding edge of a box-file box.
constexpr int16_t kMaxBoxEdgeDiff = 2;
33
34
// Sets flags necessary for recognition in the training mode.
35
// Opens and returns the pointer to the output file.
36
0
FILE *Tesseract::init_recog_training(const char *filename) {
37
0
  if (tessedit_ambigs_training) {
38
0
    tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39
0
    tessedit_enable_doc_dict.set_value(false); // turn off document dictionary
40
    // Explore all segmentations.
41
0
    getDict().stopper_no_acceptable_choices.set_value(true);
42
0
  }
43
44
0
  std::string output_fname = filename;
45
0
  const char *lastdot = strrchr(output_fname.c_str(), '.');
46
0
  if (lastdot != nullptr) {
47
0
    output_fname[lastdot - output_fname.c_str()] = '\0';
48
0
  }
49
0
  output_fname += ".txt";
50
0
  FILE *output_file = fopen(output_fname.c_str(), "a+");
51
0
  if (output_file == nullptr) {
52
0
    tprintf("Error: Could not open file %s\n", output_fname.c_str());
53
0
    ASSERT_HOST(output_file);
54
0
  }
55
0
  return output_file;
56
0
}
57
58
// Copies the bounding box from page_res_it->word() to the given TBOX.
59
0
static bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
60
0
  while (page_res_it->block() != nullptr && page_res_it->word() == nullptr) {
61
0
    page_res_it->forward();
62
0
  }
63
64
0
  if (page_res_it->word() != nullptr) {
65
0
    *tbox = page_res_it->word()->word->bounding_box();
66
67
    // If tbox->left() is negative, the training image has vertical text and
68
    // all the coordinates of bounding boxes of page_res are rotated by 90
69
    // degrees in a counterclockwise direction. We need to rotate the TBOX back
70
    // in order to compare with the TBOXes of box files.
71
0
    if (tbox->left() < 0) {
72
0
      tbox->rotate(FCOORD(0.0, -1.0));
73
0
    }
74
75
0
    return true;
76
0
  } else {
77
0
    return false;
78
0
  }
79
0
}
80
81
// This function takes tif/box pair of files and runs recognition on the image,
82
// while making sure that the word bounds that tesseract identified roughly
83
// match to those specified by the input box file. For each word (ngram in a
84
// single bounding box from the input box file) it outputs the ocred result,
85
// the correct label, rating and certainty.
86
void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_res,
87
0
                                         volatile ETEXT_DESC *monitor, FILE *output_file) {
88
0
  std::string box_fname = filename;
89
0
  const char *lastdot = strrchr(box_fname.c_str(), '.');
90
0
  if (lastdot != nullptr) {
91
0
    box_fname[lastdot - box_fname.c_str()] = '\0';
92
0
  }
93
0
  box_fname += ".box";
94
  // ReadNextBox() will close box_file
95
0
  FILE *box_file = fopen(box_fname.c_str(), "r");
96
0
  if (box_file == nullptr) {
97
0
    tprintf("Error: Could not open file %s\n", box_fname.c_str());
98
0
    ASSERT_HOST(box_file);
99
0
  }
100
101
0
  PAGE_RES_IT page_res_it;
102
0
  page_res_it.page_res = page_res;
103
0
  page_res_it.restart_page();
104
0
  std::string label;
105
106
  // Process all the words on this page.
107
0
  TBOX tbox; // tesseract-identified box
108
0
  TBOX bbox; // box from the box file
109
0
  bool keep_going;
110
0
  int line_number = 0;
111
0
  int examined_words = 0;
112
0
  do {
113
0
    keep_going = read_t(&page_res_it, &tbox);
114
0
    keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
115
    // Align bottom left points of the TBOXes.
116
0
    while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
117
0
      if (bbox.bottom() < tbox.bottom()) {
118
0
        page_res_it.forward();
119
0
        keep_going = read_t(&page_res_it, &tbox);
120
0
      } else {
121
0
        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
122
0
      }
123
0
    }
124
0
    while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
125
0
      if (bbox.left() > tbox.left()) {
126
0
        page_res_it.forward();
127
0
        keep_going = read_t(&page_res_it, &tbox);
128
0
      } else {
129
0
        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
130
0
      }
131
0
    }
132
    // OCR the word if top right points of the TBOXes are similar.
133
0
    if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
134
0
        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
135
0
      ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
136
0
      examined_words++;
137
0
    }
138
0
    page_res_it.forward();
139
0
  } while (keep_going);
140
141
  // Set up scripts on all of the words that did not get sent to
142
  // ambigs_classify_and_output.  They all should have, but if all the
143
  // werd_res's don't get uch_sets, tesseract will crash when you try
144
  // to iterate over them. :-(
145
0
  int total_words = 0;
146
0
  for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {
147
0
    if (page_res_it.word()) {
148
0
      if (page_res_it.word()->uch_set == nullptr) {
149
0
        page_res_it.word()->SetupFake(unicharset);
150
0
      }
151
0
      total_words++;
152
0
    }
153
0
  }
154
0
  if (examined_words < 0.85 * total_words) {
155
0
    tprintf(
156
0
        "TODO(antonova): clean up recog_training_segmented; "
157
0
        " It examined only a small fraction of the ambigs image.\n");
158
0
  }
159
0
  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words);
160
0
}
161
162
// Helper prints the given set of blob choices.
163
static void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
164
0
                      const char *label, FILE *output_file) {
165
0
  float rating = 0.0f;
166
0
  float certainty = 0.0f;
167
0
  for (int i = 0; i < length; ++i) {
168
0
    const BLOB_CHOICE *blob_choice = blob_choices[i];
169
0
    fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id()));
170
0
    rating += blob_choice->rating();
171
0
    if (certainty > blob_choice->certainty()) {
172
0
      certainty = blob_choice->certainty();
173
0
    }
174
0
  }
175
0
  fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty);
176
0
}
177
178
// Helper recursively prints all paths through the ratings matrix, starting
179
// at column col.
180
static void PrintMatrixPaths(int col, int dim, const MATRIX &ratings, int length,
181
                             const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
182
0
                             const char *label, FILE *output_file) {
183
0
  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
184
0
    if (ratings.get(col, row) != NOT_CLASSIFIED) {
185
0
      BLOB_CHOICE_IT bc_it(ratings.get(col, row));
186
0
      for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
187
0
        blob_choices[length] = bc_it.data();
188
0
        if (row + 1 < dim) {
189
0
          PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, unicharset, label,
190
0
                           output_file);
191
0
        } else {
192
0
          PrintPath(length + 1, blob_choices, unicharset, label, output_file);
193
0
        }
194
0
      }
195
0
    }
196
0
  }
197
0
}
198
199
// Runs classify_word_pass1() on the current word. Outputs Tesseract's
200
// raw choice as a result of the classification. For words labeled with a
201
// single unichar also outputs all alternatives from blob_choices of the
202
// best choice.
203
void Tesseract::ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it,
204
0
                                           FILE *output_file) {
205
  // Classify word.
206
0
  fflush(stdout);
207
0
  WordData word_data(*pr_it);
208
0
  SetupWordPassN(1, &word_data);
209
0
  classify_word_and_language(1, pr_it, &word_data);
210
0
  WERD_RES *werd_res = word_data.word;
211
0
  WERD_CHOICE *best_choice = werd_res->best_choice;
212
0
  ASSERT_HOST(best_choice != nullptr);
213
214
  // Compute the number of unichars in the label.
215
0
  std::vector<UNICHAR_ID> encoding;
216
0
  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
217
0
    tprintf("Not outputting illegal unichar %s\n", label);
218
0
    return;
219
0
  }
220
221
  // Dump all paths through the ratings matrix (which is normally small).
222
0
  int dim = werd_res->ratings->dimension();
223
0
  const auto **blob_choices = new const BLOB_CHOICE *[dim];
224
0
  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file);
225
0
  delete[] blob_choices;
226
0
}
227
228
} // namespace tesseract