/src/tesseract/src/ccmain/linerec.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: linerec.cpp |
3 | | // Description: Top-level line-based recognition module for Tesseract. |
4 | | // Author: Ray Smith |
5 | | // |
6 | | // (C) Copyright 2013, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | /////////////////////////////////////////////////////////////////////// |
17 | | |
18 | | #include "tesseractclass.h" |
19 | | |
20 | | #include <allheaders.h> |
21 | | #include "boxread.h" |
22 | | #include "imagedata.h" // for ImageData |
23 | | #include "lstmrecognizer.h" |
24 | | #include "pageres.h" |
25 | | #include "recodebeam.h" |
26 | | #include "tprintf.h" |
27 | | |
28 | | #include <algorithm> |
29 | | |
30 | | namespace tesseract { |
31 | | |
// Multiplier applied to a word's certainty so that the value is more
// comparable to the certainties Tesseract produces.
constexpr float kCertaintyScale = 7.0f;
// Lowest certainty at which a dictionary word is still acceptable.
constexpr float kWorstDictCertainty = -25.0f;
36 | | |
37 | | // Generates training data for training a line recognizer, eg LSTM. |
38 | | // Breaks the page into lines, according to the boxes, and writes them to a |
39 | | // serialized DocumentData based on output_basename. |
40 | | // Return true if successful, false if an error occurred. |
41 | | bool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename, |
42 | 0 | BLOCK_LIST *block_list) { |
43 | 0 | std::string lstmf_name = output_basename + ".lstmf"; |
44 | 0 | DocumentData images(lstmf_name); |
45 | 0 | if (applybox_page > 0) { |
46 | | // Load existing document for the previous pages. |
47 | 0 | if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) { |
48 | 0 | tprintf("Failed to read training data from %s!\n", lstmf_name.c_str()); |
49 | 0 | return false; |
50 | 0 | } |
51 | 0 | } |
52 | 0 | std::vector<TBOX> boxes; |
53 | 0 | std::vector<std::string> texts; |
54 | | // Get the boxes for this page, if there are any. |
55 | 0 | if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) || |
56 | 0 | boxes.empty()) { |
57 | 0 | tprintf("Failed to read boxes from %s\n", input_imagename); |
58 | 0 | return false; |
59 | 0 | } |
60 | 0 | TrainFromBoxes(boxes, texts, block_list, &images); |
61 | 0 | if (images.PagesSize() == 0) { |
62 | 0 | tprintf("Failed to read pages from %s\n", input_imagename); |
63 | 0 | return false; |
64 | 0 | } |
65 | 0 | images.Shuffle(); |
66 | 0 | if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) { |
67 | 0 | tprintf("Failed to write training data to %s!\n", lstmf_name.c_str()); |
68 | 0 | return false; |
69 | 0 | } |
70 | 0 | return true; |
71 | 0 | } |
72 | | |
73 | | // Generates training data for training a line recognizer, eg LSTM. |
74 | | // Breaks the boxes into lines, normalizes them, converts to ImageData and |
75 | | // appends them to the given training_data. |
76 | | void Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts, |
77 | 0 | BLOCK_LIST *block_list, DocumentData *training_data) { |
78 | 0 | auto box_count = boxes.size(); |
79 | | // Process all the text lines in this page, as defined by the boxes. |
80 | 0 | unsigned end_box = 0; |
81 | | // Don't let \t, which marks newlines in the box file, get into the line |
82 | | // content, as that makes the line unusable in training. |
83 | 0 | while (end_box < texts.size() && texts[end_box] == "\t") { |
84 | 0 | ++end_box; |
85 | 0 | } |
86 | 0 | for (auto start_box = end_box; start_box < box_count; start_box = end_box) { |
87 | | // Find the textline of boxes starting at start and their bounding box. |
88 | 0 | TBOX line_box = boxes[start_box]; |
89 | 0 | std::string line_str = texts[start_box]; |
90 | 0 | for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) { |
91 | 0 | line_box += boxes[end_box]; |
92 | 0 | line_str += texts[end_box]; |
93 | 0 | } |
94 | | // Find the most overlapping block. |
95 | 0 | BLOCK *best_block = nullptr; |
96 | 0 | int best_overlap = 0; |
97 | 0 | BLOCK_IT b_it(block_list); |
98 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
99 | 0 | BLOCK *block = b_it.data(); |
100 | 0 | if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { |
101 | 0 | continue; // Not a text block. |
102 | 0 | } |
103 | 0 | TBOX block_box = block->pdblk.bounding_box(); |
104 | 0 | block_box.rotate(block->re_rotation()); |
105 | 0 | if (block_box.major_overlap(line_box)) { |
106 | 0 | TBOX overlap_box = line_box.intersection(block_box); |
107 | 0 | if (overlap_box.area() > best_overlap) { |
108 | 0 | best_overlap = overlap_box.area(); |
109 | 0 | best_block = block; |
110 | 0 | } |
111 | 0 | } |
112 | 0 | } |
113 | 0 | ImageData *imagedata = nullptr; |
114 | 0 | if (best_block == nullptr) { |
115 | 0 | tprintf("No block overlapping textline: %s\n", line_str.c_str()); |
116 | 0 | } else { |
117 | 0 | imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block); |
118 | 0 | } |
119 | 0 | if (imagedata != nullptr) { |
120 | 0 | training_data->AddPageToDocument(imagedata); |
121 | 0 | } |
122 | | // Don't let \t, which marks newlines in the box file, get into the line |
123 | | // content, as that makes the line unusable in training. |
124 | 0 | while (end_box < texts.size() && texts[end_box] == "\t") { |
125 | 0 | ++end_box; |
126 | 0 | } |
127 | 0 | } |
128 | 0 | } |
129 | | |
130 | | // Returns an Imagedata containing the image of the given box, |
131 | | // and ground truth boxes/truth text if available in the input. |
132 | | // The image is not normalized in any way. |
133 | | ImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes, |
134 | | const std::vector<std::string> &texts, int start_box, int end_box, |
135 | 0 | const BLOCK &block) { |
136 | 0 | TBOX revised_box; |
137 | 0 | ImageData *image_data = GetRectImage(line_box, block, kImagePadding, &revised_box); |
138 | 0 | if (image_data == nullptr) { |
139 | 0 | return nullptr; |
140 | 0 | } |
141 | 0 | image_data->set_page_number(applybox_page); |
142 | | // Copy the boxes and shift them so they are relative to the image. |
143 | 0 | FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y()); |
144 | 0 | ICOORD shift = -revised_box.botleft(); |
145 | 0 | std::vector<TBOX> line_boxes; |
146 | 0 | std::vector<std::string> line_texts; |
147 | 0 | for (int b = start_box; b < end_box; ++b) { |
148 | 0 | TBOX box = boxes[b]; |
149 | 0 | box.rotate(block_rotation); |
150 | 0 | box.move(shift); |
151 | 0 | line_boxes.push_back(box); |
152 | 0 | line_texts.push_back(texts[b]); |
153 | 0 | } |
154 | 0 | std::vector<int> page_numbers(line_boxes.size(), applybox_page); |
155 | 0 | image_data->AddBoxes(line_boxes, line_texts, page_numbers); |
156 | 0 | return image_data; |
157 | 0 | } |
158 | | |
159 | | // Helper gets the image of a rectangle, using the block.re_rotation() if |
160 | | // needed to get to the image, and rotating the result back to horizontal |
161 | | // layout. (CJK characters will be on their left sides) The vertical text flag |
162 | | // is set in the returned ImageData if the text was originally vertical, which |
163 | | // can be used to invoke a different CJK recognition engine. The revised_box |
164 | | // is also returned to enable calculation of output bounding boxes. |
165 | | ImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding, |
166 | 180k | TBOX *revised_box) const { |
167 | 180k | TBOX wbox = box; |
168 | 180k | wbox.pad(padding, padding); |
169 | 180k | *revised_box = wbox; |
170 | | // Number of clockwise 90 degree rotations needed to get back to tesseract |
171 | | // coords from the clipped image. |
172 | 180k | int num_rotations = 0; |
173 | 180k | if (block.re_rotation().y() > 0.0f) { |
174 | 0 | num_rotations = 1; |
175 | 180k | } else if (block.re_rotation().x() < 0.0f) { |
176 | 0 | num_rotations = 2; |
177 | 180k | } else if (block.re_rotation().y() < 0.0f) { |
178 | 0 | num_rotations = 3; |
179 | 0 | } |
180 | | // Handle two cases automatically: 1 the box came from the block, 2 the box |
181 | | // came from a box file, and refers to the image, which the block may not. |
182 | 180k | if (block.pdblk.bounding_box().major_overlap(*revised_box)) { |
183 | 177k | revised_box->rotate(block.re_rotation()); |
184 | 177k | } |
185 | | // Now revised_box always refers to the image. |
186 | | // BestPix is never colormapped, but may be of any depth. |
187 | 180k | Image pix = BestPix(); |
188 | 180k | int width = pixGetWidth(pix); |
189 | 180k | int height = pixGetHeight(pix); |
190 | 180k | TBOX image_box(0, 0, width, height); |
191 | | // Clip to image bounds; |
192 | 180k | *revised_box &= image_box; |
193 | 180k | if (revised_box->null_box()) { |
194 | 0 | return nullptr; |
195 | 0 | } |
196 | 180k | Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(), |
197 | 180k | revised_box->height()); |
198 | 180k | Image box_pix = pixClipRectangle(pix, clip_box, nullptr); |
199 | 180k | boxDestroy(&clip_box); |
200 | 180k | if (box_pix == nullptr) { |
201 | 0 | return nullptr; |
202 | 0 | } |
203 | 180k | if (num_rotations > 0) { |
204 | 0 | Image rot_pix = pixRotateOrth(box_pix, num_rotations); |
205 | 0 | box_pix.destroy(); |
206 | 0 | box_pix = rot_pix; |
207 | 0 | } |
208 | | // Convert sub-8-bit images to 8 bit. |
209 | 180k | int depth = pixGetDepth(box_pix); |
210 | 180k | if (depth < 8) { |
211 | 180k | Image grey; |
212 | 180k | grey = pixConvertTo8(box_pix, false); |
213 | 180k | box_pix.destroy(); |
214 | 180k | box_pix = grey; |
215 | 180k | } |
216 | 180k | bool vertical_text = false; |
217 | 180k | if (num_rotations > 0) { |
218 | | // Rotated the clipped revised box back to internal coordinates. |
219 | 0 | FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y()); |
220 | 0 | revised_box->rotate(rotation); |
221 | 0 | if (num_rotations != 2) { |
222 | 0 | vertical_text = true; |
223 | 0 | } |
224 | 0 | } |
225 | 180k | return new ImageData(vertical_text, box_pix); |
226 | 180k | } |
227 | | |
228 | | // Recognizes a word or group of words, converting to WERD_RES in *words. |
229 | | // Analogous to classify_word_pass1, but can handle a group of words as well. |
230 | | void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, |
231 | 180k | PointerVector<WERD_RES> *words) { |
232 | 180k | TBOX word_box = word->word->bounding_box(); |
233 | | // Get the word image - no frills. |
234 | 180k | if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) { |
235 | | // In single word mode, use the whole image without any other row/word |
236 | | // interpretation. |
237 | 0 | word_box = TBOX(0, 0, ImageWidth(), ImageHeight()); |
238 | 180k | } else { |
239 | 180k | float baseline = row->base_line((word_box.left() + word_box.right()) / 2); |
240 | 180k | if (baseline + row->descenders() < word_box.bottom()) { |
241 | 149k | word_box.set_bottom(baseline + row->descenders()); |
242 | 149k | } |
243 | 180k | if (baseline + row->x_height() + row->ascenders() > word_box.top()) { |
244 | 143k | word_box.set_top(baseline + row->x_height() + row->ascenders()); |
245 | 143k | } |
246 | 180k | } |
247 | 180k | ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box); |
248 | 180k | if (im_data == nullptr) { |
249 | 0 | return; |
250 | 0 | } |
251 | | |
252 | 180k | bool do_invert = tessedit_do_invert; |
253 | 180k | float threshold = do_invert ? double(invert_threshold) : 0.0f; |
254 | 180k | lstm_recognizer_->RecognizeLine(*im_data, threshold, classify_debug_level > 0, |
255 | 180k | kWorstDictCertainty / kCertaintyScale, word_box, words, |
256 | 180k | lstm_choice_mode, lstm_choice_iterations); |
257 | 180k | delete im_data; |
258 | 180k | SearchWords(words); |
259 | 180k | } |
260 | | |
261 | | // Apply segmentation search to the given set of words, within the constraints |
262 | | // of the existing ratings matrix. If there is already a best_choice on a word |
263 | | // leaves it untouched and just sets the done/accepted etc flags. |
264 | 180k | void Tesseract::SearchWords(PointerVector<WERD_RES> *words) { |
265 | | // Run the segmentation search on the network outputs and make a BoxWord |
266 | | // for each of the output words. |
267 | | // If we drop a word as junk, then there is always a space in front of the |
268 | | // next. |
269 | 180k | const Dict *stopper_dict = lstm_recognizer_->GetDict(); |
270 | 180k | if (stopper_dict == nullptr) { |
271 | 0 | stopper_dict = &getDict(); |
272 | 0 | } |
273 | 343k | for (unsigned w = 0; w < words->size(); ++w) { |
274 | 163k | WERD_RES *word = (*words)[w]; |
275 | 163k | if (word->best_choice == nullptr) { |
276 | | // It is a dud. |
277 | 0 | word->SetupFake(lstm_recognizer_->GetUnicharset()); |
278 | 163k | } else { |
279 | | // Set the best state. |
280 | 384k | for (unsigned i = 0; i < word->best_choice->length(); ++i) { |
281 | 221k | int length = word->best_choice->state(i); |
282 | 221k | word->best_state.push_back(length); |
283 | 221k | } |
284 | 163k | word->reject_map.initialise(word->best_choice->length()); |
285 | 163k | word->tess_failed = false; |
286 | 163k | word->tess_accepted = true; |
287 | 163k | word->tess_would_adapt = false; |
288 | 163k | word->done = true; |
289 | 163k | word->tesseract = this; |
290 | 163k | float word_certainty = std::min(word->space_certainty, word->best_choice->certainty()); |
291 | 163k | word_certainty *= kCertaintyScale; |
292 | 163k | if (getDict().stopper_debug_level >= 1) { |
293 | 0 | tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n", |
294 | 0 | word->best_choice->certainty(), word->space_certainty, |
295 | 0 | std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale, |
296 | 0 | word_certainty); |
297 | 0 | word->best_choice->print(); |
298 | 0 | } |
299 | 163k | word->best_choice->set_certainty(word_certainty); |
300 | | |
301 | 163k | word->tess_accepted = stopper_dict->AcceptableResult(word); |
302 | 163k | } |
303 | 163k | } |
304 | 180k | } |
305 | | |
306 | | } // namespace tesseract. |