Coverage Report

Created: 2024-02-28 06:46

/src/tesseract/src/ccmain/linerec.cpp
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        linerec.cpp
3
// Description: Top-level line-based recognition module for Tesseract.
4
// Author:      Ray Smith
5
//
6
// (C) Copyright 2013, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
///////////////////////////////////////////////////////////////////////
17
18
#include "tesseractclass.h"
19
20
#include <allheaders.h>
21
#include "boxread.h"
22
#include "imagedata.h" // for ImageData
23
#include "lstmrecognizer.h"
24
#include "pageres.h"
25
#include "recodebeam.h"
26
#include "tprintf.h"
27
28
#include <algorithm>
29
30
namespace tesseract {
31
32
// Scale factor to make certainty more comparable to Tesseract.
// SearchWords multiplies the LSTM certainty by this value, and
// LSTMRecognizeWord divides kWorstDictCertainty by it before passing the
// limit to the LSTM recognizer.
constexpr float kCertaintyScale = 7.0f;
// Worst acceptable certainty for a dictionary word.
constexpr float kWorstDictCertainty = -25.0f;
36
37
// Generates training data for training a line recognizer, eg LSTM.
38
// Breaks the page into lines, according to the boxes, and writes them to a
39
// serialized DocumentData based on output_basename.
40
// Return true if successful, false if an error occurred.
41
bool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename,
42
0
                                    BLOCK_LIST *block_list) {
43
0
  std::string lstmf_name = output_basename + ".lstmf";
44
0
  DocumentData images(lstmf_name);
45
0
  if (applybox_page > 0) {
46
    // Load existing document for the previous pages.
47
0
    if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
48
0
      tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
49
0
      return false;
50
0
    }
51
0
  }
52
0
  std::vector<TBOX> boxes;
53
0
  std::vector<std::string> texts;
54
  // Get the boxes for this page, if there are any.
55
0
  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) ||
56
0
      boxes.empty()) {
57
0
    tprintf("Failed to read boxes from %s\n", input_imagename);
58
0
    return false;
59
0
  }
60
0
  TrainFromBoxes(boxes, texts, block_list, &images);
61
0
  if (images.PagesSize() == 0) {
62
0
    tprintf("Failed to read pages from %s\n", input_imagename);
63
0
    return false;
64
0
  }
65
0
  images.Shuffle();
66
0
  if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
67
0
    tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
68
0
    return false;
69
0
  }
70
0
  return true;
71
0
}
72
73
// Generates training data for training a line recognizer, eg LSTM.
74
// Breaks the boxes into lines, normalizes them, converts to ImageData and
75
// appends them to the given training_data.
76
void Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
77
0
                               BLOCK_LIST *block_list, DocumentData *training_data) {
78
0
  auto box_count = boxes.size();
79
  // Process all the text lines in this page, as defined by the boxes.
80
0
  unsigned end_box = 0;
81
  // Don't let \t, which marks newlines in the box file, get into the line
82
  // content, as that makes the line unusable in training.
83
0
  while (end_box < texts.size() && texts[end_box] == "\t") {
84
0
    ++end_box;
85
0
  }
86
0
  for (auto start_box = end_box; start_box < box_count; start_box = end_box) {
87
    // Find the textline of boxes starting at start and their bounding box.
88
0
    TBOX line_box = boxes[start_box];
89
0
    std::string line_str = texts[start_box];
90
0
    for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) {
91
0
      line_box += boxes[end_box];
92
0
      line_str += texts[end_box];
93
0
    }
94
    // Find the most overlapping block.
95
0
    BLOCK *best_block = nullptr;
96
0
    int best_overlap = 0;
97
0
    BLOCK_IT b_it(block_list);
98
0
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
99
0
      BLOCK *block = b_it.data();
100
0
      if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
101
0
        continue; // Not a text block.
102
0
      }
103
0
      TBOX block_box = block->pdblk.bounding_box();
104
0
      block_box.rotate(block->re_rotation());
105
0
      if (block_box.major_overlap(line_box)) {
106
0
        TBOX overlap_box = line_box.intersection(block_box);
107
0
        if (overlap_box.area() > best_overlap) {
108
0
          best_overlap = overlap_box.area();
109
0
          best_block = block;
110
0
        }
111
0
      }
112
0
    }
113
0
    ImageData *imagedata = nullptr;
114
0
    if (best_block == nullptr) {
115
0
      tprintf("No block overlapping textline: %s\n", line_str.c_str());
116
0
    } else {
117
0
      imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block);
118
0
    }
119
0
    if (imagedata != nullptr) {
120
0
      training_data->AddPageToDocument(imagedata);
121
0
    }
122
    // Don't let \t, which marks newlines in the box file, get into the line
123
    // content, as that makes the line unusable in training.
124
0
    while (end_box < texts.size() && texts[end_box] == "\t") {
125
0
      ++end_box;
126
0
    }
127
0
  }
128
0
}
129
130
// Returns an Imagedata containing the image of the given box,
131
// and ground truth boxes/truth text if available in the input.
132
// The image is not normalized in any way.
133
ImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes,
134
                                  const std::vector<std::string> &texts, int start_box, int end_box,
135
0
                                  const BLOCK &block) {
136
0
  TBOX revised_box;
137
0
  ImageData *image_data = GetRectImage(line_box, block, kImagePadding, &revised_box);
138
0
  if (image_data == nullptr) {
139
0
    return nullptr;
140
0
  }
141
0
  image_data->set_page_number(applybox_page);
142
  // Copy the boxes and shift them so they are relative to the image.
143
0
  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
144
0
  ICOORD shift = -revised_box.botleft();
145
0
  std::vector<TBOX> line_boxes;
146
0
  std::vector<std::string> line_texts;
147
0
  for (int b = start_box; b < end_box; ++b) {
148
0
    TBOX box = boxes[b];
149
0
    box.rotate(block_rotation);
150
0
    box.move(shift);
151
0
    line_boxes.push_back(box);
152
0
    line_texts.push_back(texts[b]);
153
0
  }
154
0
  std::vector<int> page_numbers(line_boxes.size(), applybox_page);
155
0
  image_data->AddBoxes(line_boxes, line_texts, page_numbers);
156
0
  return image_data;
157
0
}
158
159
// Helper gets the image of a rectangle, using the block.re_rotation() if
160
// needed to get to the image, and rotating the result back to horizontal
161
// layout. (CJK characters will be on their left sides) The vertical text flag
162
// is set in the returned ImageData if the text was originally vertical, which
163
// can be used to invoke a different CJK recognition engine. The revised_box
164
// is also returned to enable calculation of output bounding boxes.
165
ImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding,
166
180k
                                   TBOX *revised_box) const {
167
180k
  TBOX wbox = box;
168
180k
  wbox.pad(padding, padding);
169
180k
  *revised_box = wbox;
170
  // Number of clockwise 90 degree rotations needed to get back to tesseract
171
  // coords from the clipped image.
172
180k
  int num_rotations = 0;
173
180k
  if (block.re_rotation().y() > 0.0f) {
174
0
    num_rotations = 1;
175
180k
  } else if (block.re_rotation().x() < 0.0f) {
176
0
    num_rotations = 2;
177
180k
  } else if (block.re_rotation().y() < 0.0f) {
178
0
    num_rotations = 3;
179
0
  }
180
  // Handle two cases automatically: 1 the box came from the block, 2 the box
181
  // came from a box file, and refers to the image, which the block may not.
182
180k
  if (block.pdblk.bounding_box().major_overlap(*revised_box)) {
183
177k
    revised_box->rotate(block.re_rotation());
184
177k
  }
185
  // Now revised_box always refers to the image.
186
  // BestPix is never colormapped, but may be of any depth.
187
180k
  Image pix = BestPix();
188
180k
  int width = pixGetWidth(pix);
189
180k
  int height = pixGetHeight(pix);
190
180k
  TBOX image_box(0, 0, width, height);
191
  // Clip to image bounds;
192
180k
  *revised_box &= image_box;
193
180k
  if (revised_box->null_box()) {
194
0
    return nullptr;
195
0
  }
196
180k
  Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(),
197
180k
                            revised_box->height());
198
180k
  Image box_pix = pixClipRectangle(pix, clip_box, nullptr);
199
180k
  boxDestroy(&clip_box);
200
180k
  if (box_pix == nullptr) {
201
0
    return nullptr;
202
0
  }
203
180k
  if (num_rotations > 0) {
204
0
    Image rot_pix = pixRotateOrth(box_pix, num_rotations);
205
0
    box_pix.destroy();
206
0
    box_pix = rot_pix;
207
0
  }
208
  // Convert sub-8-bit images to 8 bit.
209
180k
  int depth = pixGetDepth(box_pix);
210
180k
  if (depth < 8) {
211
180k
    Image grey;
212
180k
    grey = pixConvertTo8(box_pix, false);
213
180k
    box_pix.destroy();
214
180k
    box_pix = grey;
215
180k
  }
216
180k
  bool vertical_text = false;
217
180k
  if (num_rotations > 0) {
218
    // Rotated the clipped revised box back to internal coordinates.
219
0
    FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
220
0
    revised_box->rotate(rotation);
221
0
    if (num_rotations != 2) {
222
0
      vertical_text = true;
223
0
    }
224
0
  }
225
180k
  return new ImageData(vertical_text, box_pix);
226
180k
}
227
228
// Recognizes a word or group of words, converting to WERD_RES in *words.
229
// Analogous to classify_word_pass1, but can handle a group of words as well.
230
void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
231
180k
                                  PointerVector<WERD_RES> *words) {
232
180k
  TBOX word_box = word->word->bounding_box();
233
  // Get the word image - no frills.
234
180k
  if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) {
235
    // In single word mode, use the whole image without any other row/word
236
    // interpretation.
237
0
    word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
238
180k
  } else {
239
180k
    float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
240
180k
    if (baseline + row->descenders() < word_box.bottom()) {
241
149k
      word_box.set_bottom(baseline + row->descenders());
242
149k
    }
243
180k
    if (baseline + row->x_height() + row->ascenders() > word_box.top()) {
244
143k
      word_box.set_top(baseline + row->x_height() + row->ascenders());
245
143k
    }
246
180k
  }
247
180k
  ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
248
180k
  if (im_data == nullptr) {
249
0
    return;
250
0
  }
251
252
180k
  bool do_invert = tessedit_do_invert;
253
180k
  float threshold = do_invert ? double(invert_threshold) : 0.0f;
254
180k
  lstm_recognizer_->RecognizeLine(*im_data, threshold, classify_debug_level > 0,
255
180k
                                  kWorstDictCertainty / kCertaintyScale, word_box, words,
256
180k
                                  lstm_choice_mode, lstm_choice_iterations);
257
180k
  delete im_data;
258
180k
  SearchWords(words);
259
180k
}
260
261
// Apply segmentation search to the given set of words, within the constraints
262
// of the existing ratings matrix. If there is already a best_choice on a word
263
// leaves it untouched and just sets the done/accepted etc flags.
264
180k
void Tesseract::SearchWords(PointerVector<WERD_RES> *words) {
265
  // Run the segmentation search on the network outputs and make a BoxWord
266
  // for each of the output words.
267
  // If we drop a word as junk, then there is always a space in front of the
268
  // next.
269
180k
  const Dict *stopper_dict = lstm_recognizer_->GetDict();
270
180k
  if (stopper_dict == nullptr) {
271
0
    stopper_dict = &getDict();
272
0
  }
273
343k
  for (unsigned w = 0; w < words->size(); ++w) {
274
163k
    WERD_RES *word = (*words)[w];
275
163k
    if (word->best_choice == nullptr) {
276
      // It is a dud.
277
0
      word->SetupFake(lstm_recognizer_->GetUnicharset());
278
163k
    } else {
279
      // Set the best state.
280
384k
      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
281
221k
        int length = word->best_choice->state(i);
282
221k
        word->best_state.push_back(length);
283
221k
      }
284
163k
      word->reject_map.initialise(word->best_choice->length());
285
163k
      word->tess_failed = false;
286
163k
      word->tess_accepted = true;
287
163k
      word->tess_would_adapt = false;
288
163k
      word->done = true;
289
163k
      word->tesseract = this;
290
163k
      float word_certainty = std::min(word->space_certainty, word->best_choice->certainty());
291
163k
      word_certainty *= kCertaintyScale;
292
163k
      if (getDict().stopper_debug_level >= 1) {
293
0
        tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
294
0
                word->best_choice->certainty(), word->space_certainty,
295
0
                std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale,
296
0
                word_certainty);
297
0
        word->best_choice->print();
298
0
      }
299
163k
      word->best_choice->set_certainty(word_certainty);
300
301
163k
      word->tess_accepted = stopper_dict->AcceptableResult(word);
302
163k
    }
303
163k
  }
304
180k
}
305
306
} // namespace tesseract.