/src/tesseract/src/ccmain/applybox.cpp
Line | Count | Source |
1 | | /********************************************************************** |
2 | | * File: applybox.cpp (Formerly applybox.c) |
3 | | * Description: Re segment rows according to box file data |
4 | | * Author: Phil Cheatle |
5 | | * |
6 | | * (C) Copyright 1993, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #ifndef DISABLED_LEGACY_ENGINE |
20 | | # include <allheaders.h> |
21 | | # include <cctype> |
22 | | # include <cerrno> |
23 | | # include <cstring> |
24 | | # include "boxread.h" |
25 | | #endif // ndef DISABLED_LEGACY_ENGINE |
26 | | #include <tesseract/unichar.h> |
27 | | #include "pageres.h" |
28 | | #include "tesseractclass.h" |
29 | | #include "tesserrstream.h" // for tesserr |
30 | | #include "unicharset.h" |
31 | | |
32 | | #ifndef DISABLED_LEGACY_ENGINE |
33 | | /** Max number of blobs to classify together in FindSegmentation. */ |
34 | | const int kMaxGroupSize = 4; |
35 | | /// Max fraction of median allowed as deviation in xheight before switching |
36 | | /// to median. |
37 | | const double kMaxXHeightDeviationFraction = 0.125; |
38 | | #endif // ndef DISABLED_LEGACY_ENGINE |
39 | | |
40 | | /** |
41 | | * The box file is assumed to contain box definitions, one per line, of the |
42 | | * following format for blob-level boxes: |
43 | | * @verbatim |
44 | | * <UTF8 str> <left> <bottom> <right> <top> <page id> |
45 | | * @endverbatim |
46 | | * and for word/line-level boxes: |
47 | | * @verbatim |
48 | | * WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> |
49 | | * @endverbatim |
50 | | * NOTES: |
51 | | * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT. |
52 | | * |
53 | | * <page id> is 0-based, and the page number is used for multipage input (tiff). |
54 | | * |
55 | | * In the blob-level form, each line represents a recognizable unit, which may |
56 | | * be several UTF-8 bytes, but there is a bounding box around each recognizable |
57 | | * unit, and no classifier is needed to train in this mode (bootstrapping.) |
58 | | * |
59 | | * In the word/line-level form, the line begins with the literal "WordStr", and |
60 | | * the bounding box bounds either a whole line or a whole word. The recognizable |
61 | | * units in the word/line are listed after the # at the end of the line and |
62 | | * are space delimited, ignoring any original spaces on the line. |
63 | | * Eg. |
64 | | * @verbatim |
65 | | * word -> #w o r d |
66 | | * multi word line -> #m u l t i w o r d l i n e |
67 | | * @endverbatim |
68 | | * The recognizable units must be space-delimited in order to allow multiple |
69 | | * unicodes to be used for a single recognizable unit, eg Hindi. |
70 | | * |
71 | | * In this mode, the classifier must have been pre-trained with the desired |
72 | | * character set, or it will not be able to find the character segmentations. |
73 | | */ |
74 | | |
75 | | namespace tesseract { |
76 | | |
77 | | #ifndef DISABLED_LEGACY_ENGINE |
78 | 0 | static void clear_any_old_text(BLOCK_LIST *block_list) { |
79 | 0 | BLOCK_IT block_it(block_list); |
80 | 0 | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
81 | 0 | ROW_IT row_it(block_it.data()->row_list()); |
82 | 0 | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
83 | 0 | WERD_IT word_it(row_it.data()->word_list()); |
84 | 0 | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
85 | 0 | word_it.data()->set_text(""); |
86 | 0 | } |
87 | 0 | } |
88 | 0 | } |
89 | 0 | } |
90 | | |
91 | | // Applies the box file based on the image name filename, and resegments |
92 | | // the words in the block_list (page), with: |
93 | | // blob-mode: one blob per line in the box file, words as input. |
94 | | // word/line-mode: one blob per space-delimited unit after the #, and one word |
95 | | // per line in the box file. (See comment above for box file format.) |
96 | | // If find_segmentation is true, (word/line mode) then the classifier is used |
97 | | // to re-segment words/lines to match the space-delimited truth string for |
98 | | // each box. In this case, the input box may be for a word or even a whole |
99 | | // text line, and the output words will contain multiple blobs corresponding |
100 | | // to the space-delimited input string. |
101 | | // With find_segmentation false, no classifier is needed, but the chopper |
102 | | // can still be used to correctly segment touching characters with the help |
103 | | // of the input boxes. |
104 | | // In the returned PAGE_RES, the WERD_RES are setup as they would be returned |
105 | | // from normal classification, ie. with a word, chopped_word, rebuild_word, |
106 | | // seam_array, denorm, box_word, and best_state, but NO best_choice or |
107 | | // raw_choice, as they would require a UNICHARSET, which we aim to avoid. |
108 | | // Instead, the correct_text member of WERD_RES is set, and this may be later |
109 | | // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords |
110 | | // is not required before calling ApplyBoxTraining. |
111 | | PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation, |
112 | 0 | BLOCK_LIST *block_list) { |
113 | 0 | std::vector<TBOX> boxes; |
114 | 0 | std::vector<std::string> texts, full_texts; |
115 | 0 | if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) { |
116 | 0 | return nullptr; // Can't do it. |
117 | 0 | } |
118 | | |
119 | 0 | const int box_count = boxes.size(); |
120 | 0 | int box_failures = 0; |
121 | | |
122 | | // In word mode, we use the boxes to make a word for each box, but |
123 | | // in blob mode we use the existing words and maximally chop them first. |
124 | 0 | PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list); |
125 | 0 | clear_any_old_text(block_list); |
126 | |
|
127 | 0 | for (int i = 0; i < box_count; i++) { |
128 | 0 | bool foundit = false; |
129 | 0 | if (page_res != nullptr) { |
130 | 0 | foundit = |
131 | 0 | ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i], |
132 | 0 | (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str()); |
133 | 0 | } else { |
134 | 0 | foundit = ResegmentWordBox(block_list, boxes[i], |
135 | 0 | (i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str()); |
136 | 0 | } |
137 | 0 | if (!foundit) { |
138 | 0 | box_failures++; |
139 | 0 | ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob"); |
140 | 0 | } |
141 | 0 | } |
142 | |
|
143 | 0 | if (page_res == nullptr) { |
144 | | // In word/line mode, we now maximally chop all the words and resegment |
145 | | // them with the classifier. |
146 | 0 | page_res = SetupApplyBoxes(boxes, block_list); |
147 | 0 | ReSegmentByClassification(page_res); |
148 | 0 | } |
149 | 0 | if (applybox_debug > 0) { |
150 | 0 | tprintf("APPLY_BOXES:\n"); |
151 | 0 | tprintf(" Boxes read from boxfile: %6d\n", box_count); |
152 | 0 | if (box_failures > 0) { |
153 | 0 | tprintf(" Boxes failed resegmentation: %6d\n", box_failures); |
154 | 0 | } |
155 | 0 | } |
156 | 0 | TidyUp(page_res); |
157 | 0 | return page_res; |
158 | 0 | } |
159 | | |
160 | | // Helper computes median xheight in the image. |
161 | 0 | static double MedianXHeight(BLOCK_LIST *block_list) { |
162 | 0 | BLOCK_IT block_it(block_list); |
163 | 0 | STATS xheights(0, block_it.data()->pdblk.bounding_box().height() - 1); |
164 | 0 | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
165 | 0 | ROW_IT row_it(block_it.data()->row_list()); |
166 | 0 | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
167 | 0 | xheights.add(IntCastRounded(row_it.data()->x_height()), 1); |
168 | 0 | } |
169 | 0 | } |
170 | 0 | return xheights.median(); |
171 | 0 | } |
172 | | |
173 | | /// Any row xheight that is significantly different from the median is set |
174 | | /// to the median. |
175 | 0 | void Tesseract::PreenXHeights(BLOCK_LIST *block_list) { |
176 | 0 | const double median_xheight = MedianXHeight(block_list); |
177 | 0 | const double max_deviation = kMaxXHeightDeviationFraction * median_xheight; |
178 | | // Strip all fuzzy space markers to simplify the PAGE_RES. |
179 | 0 | BLOCK_IT b_it(block_list); |
180 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
181 | 0 | BLOCK *block = b_it.data(); |
182 | 0 | ROW_IT r_it(block->row_list()); |
183 | 0 | for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { |
184 | 0 | ROW *row = r_it.data(); |
185 | 0 | const double diff = fabs(row->x_height() - median_xheight); |
186 | 0 | if (diff > max_deviation) { |
187 | 0 | if (applybox_debug) { |
188 | 0 | tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight); |
189 | 0 | } |
190 | 0 | row->set_x_height(static_cast<float>(median_xheight)); |
191 | 0 | } |
192 | 0 | } |
193 | 0 | } |
194 | 0 | } |
195 | | |
196 | | /// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: |
197 | | /// All fuzzy spaces are removed, and all the words are maximally chopped. |
198 | 0 | PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) { |
199 | 0 | PreenXHeights(block_list); |
200 | | // Strip all fuzzy space markers to simplify the PAGE_RES. |
201 | 0 | BLOCK_IT b_it(block_list); |
202 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
203 | 0 | BLOCK *block = b_it.data(); |
204 | 0 | ROW_IT r_it(block->row_list()); |
205 | 0 | for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { |
206 | 0 | ROW *row = r_it.data(); |
207 | 0 | WERD_IT w_it(row->word_list()); |
208 | 0 | for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { |
209 | 0 | WERD *word = w_it.data(); |
210 | 0 | if (word->cblob_list()->empty()) { |
211 | 0 | delete w_it.extract(); |
212 | 0 | } else { |
213 | 0 | word->set_flag(W_FUZZY_SP, false); |
214 | 0 | word->set_flag(W_FUZZY_NON, false); |
215 | 0 | } |
216 | 0 | } |
217 | 0 | } |
218 | 0 | } |
219 | 0 | auto *page_res = new PAGE_RES(false, block_list, nullptr); |
220 | 0 | PAGE_RES_IT pr_it(page_res); |
221 | 0 | WERD_RES *word_res; |
222 | 0 | while ((word_res = pr_it.word()) != nullptr) { |
223 | 0 | MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res); |
224 | 0 | pr_it.forward(); |
225 | 0 | } |
226 | 0 | return page_res; |
227 | 0 | } |
228 | | |
229 | | /// Tests the chopper by exhaustively running chop_one_blob. |
230 | | /// The word_res will contain filled chopped_word, seam_array, denorm, |
231 | | /// box_word and best_state for the maximally chopped word. |
232 | | void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row, |
233 | 0 | WERD_RES *word_res) { |
234 | 0 | if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr, |
235 | 0 | classify_bln_numeric_mode, textord_use_cjk_fp_model, |
236 | 0 | poly_allow_detailed_fx, row, block)) { |
237 | 0 | word_res->CloneChoppedToRebuild(); |
238 | 0 | return; |
239 | 0 | } |
240 | 0 | if (chop_debug) { |
241 | 0 | tprintf("Maximally chopping word at:"); |
242 | 0 | word_res->word->bounding_box().print(); |
243 | 0 | } |
244 | 0 | std::vector<BLOB_CHOICE *> blob_choices; |
245 | 0 | ASSERT_HOST(!word_res->chopped_word->blobs.empty()); |
246 | 0 | auto rating = static_cast<float>(INT8_MAX); |
247 | 0 | for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) { |
248 | | // The rating and certainty are not quite arbitrary. Since |
249 | | // select_blob_to_chop uses the worst certainty to choose, they all have |
250 | | // to be different, so starting with INT8_MAX, subtract 1/8 for each blob |
251 | | // in here, and then divide by e each time they are chopped, which |
252 | | // should guarantee a set of unequal values for the whole tree of blobs |
253 | | // produced, however much chopping is required. The chops are thus only |
254 | | // limited by the ability of the chopper to find suitable chop points, |
255 | | // and not by the value of the certainties. |
256 | 0 | auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE); |
257 | 0 | blob_choices.push_back(choice); |
258 | 0 | rating -= 0.125f; |
259 | 0 | } |
260 | 0 | const double e = exp(1.0); // The base of natural logs. |
261 | 0 | unsigned blob_number; |
262 | 0 | if (!assume_fixed_pitch_char_segment) { |
263 | | // We only chop if the language is not fixed pitch like CJK. |
264 | 0 | SEAM *seam = nullptr; |
265 | 0 | int right_chop_index = 0; |
266 | 0 | while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) { |
267 | 0 | word_res->InsertSeam(blob_number, seam); |
268 | 0 | BLOB_CHOICE *left_choice = blob_choices[blob_number]; |
269 | 0 | rating = left_choice->rating() / e; |
270 | 0 | left_choice->set_rating(rating); |
271 | 0 | left_choice->set_certainty(-rating); |
272 | | // combine confidence w/ serial # |
273 | 0 | auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f, |
274 | 0 | 0.0f, 0.0f, BCC_FAKE); |
275 | 0 | blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice); |
276 | 0 | } |
277 | 0 | } |
278 | 0 | word_res->CloneChoppedToRebuild(); |
279 | 0 | word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]); |
280 | 0 | } |
281 | | |
282 | | /// Helper to compute the dispute resolution metric. |
283 | | /// Disputed blob resolution. The aim is to give the blob to the most |
284 | | /// appropriate boxfile box. Most of the time it is obvious, but if |
285 | | /// two boxfile boxes overlap significantly it is not. If a small boxfile |
286 | | /// box takes most of the blob, and a large boxfile box does too, then |
287 | | /// we want the small boxfile box to get it, but if the small box |
288 | | /// is much smaller than the blob, we don't want it to get it. |
289 | | /// Details of the disputed blob resolution: |
290 | | /// Given a box with area A, and a blob with area B, with overlap area C, |
291 | | /// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum |
292 | | /// miss metric gets the blob. |
293 | 0 | static double BoxMissMetric(const TBOX &box1, const TBOX &box2) { |
294 | 0 | const int overlap_area = box1.intersection(box2).area(); |
295 | 0 | const int a = box1.area(); |
296 | 0 | const int b = box2.area(); |
297 | 0 | ASSERT_HOST(a != 0 && b != 0); |
298 | 0 | return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b; |
299 | 0 | } |
300 | | |
301 | | /// Gather consecutive blobs that match the given box into the best_state |
302 | | /// and corresponding correct_text. |
303 | | /// |
304 | | /// Fights over which box owns which blobs are settled by pre-chopping and |
305 | | /// applying the blobs to box or next_box with the least non-overlap. |
306 | | /// @return false if the box was in error, which can only be caused by |
307 | | /// failing to find an appropriate blob for a box. |
308 | | /// |
309 | | /// This means that occasionally, blobs may be incorrectly segmented if the |
310 | | /// chopper fails to find a suitable chop point. |
311 | | bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, |
312 | 0 | const TBOX *next_box, const char *correct_text) { |
313 | 0 | if (applybox_debug > 1) { |
314 | 0 | tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text); |
315 | 0 | } |
316 | 0 | PAGE_RES_IT page_res_it(page_res); |
317 | 0 | WERD_RES *word_res; |
318 | 0 | for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) { |
319 | 0 | if (!word_res->box_word->bounding_box().major_overlap(box)) { |
320 | 0 | continue; |
321 | 0 | } |
322 | 0 | if (applybox_debug > 1) { |
323 | 0 | tprintf("Checking word box:"); |
324 | 0 | word_res->box_word->bounding_box().print(); |
325 | 0 | } |
326 | 0 | int word_len = word_res->box_word->length(); |
327 | 0 | for (int i = 0; i < word_len; ++i) { |
328 | 0 | TBOX char_box = TBOX(); |
329 | 0 | int blob_count = 0; |
330 | 0 | for (blob_count = 0; i + blob_count < word_len; ++blob_count) { |
331 | 0 | TBOX blob_box = word_res->box_word->BlobBox(i + blob_count); |
332 | 0 | if (!blob_box.major_overlap(box)) { |
333 | 0 | break; |
334 | 0 | } |
335 | 0 | if (word_res->correct_text[i + blob_count].length() > 0) { |
336 | 0 | break; // Blob is claimed already. |
337 | 0 | } |
338 | 0 | if (next_box != nullptr) { |
339 | 0 | const double current_box_miss_metric = BoxMissMetric(blob_box, box); |
340 | 0 | const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box); |
341 | 0 | if (applybox_debug > 2) { |
342 | 0 | tprintf("Checking blob:"); |
343 | 0 | blob_box.print(); |
344 | 0 | tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric, |
345 | 0 | next_box_miss_metric); |
346 | 0 | } |
347 | 0 | if (current_box_miss_metric > next_box_miss_metric) { |
348 | 0 | break; // Blob is a better match for next box. |
349 | 0 | } |
350 | 0 | } |
351 | 0 | char_box += blob_box; |
352 | 0 | } |
353 | 0 | if (blob_count > 0) { |
354 | 0 | if (applybox_debug > 1) { |
355 | 0 | tprintf("Index [%d, %d) seem good.\n", i, i + blob_count); |
356 | 0 | } |
357 | 0 | if (!char_box.almost_equal(box, 3) && |
358 | 0 | ((next_box != nullptr && box.x_gap(*next_box) < -3) || |
359 | 0 | (prev_box != nullptr && prev_box->x_gap(box) < -3))) { |
360 | 0 | return false; |
361 | 0 | } |
362 | | // We refine just the box_word, best_state and correct_text here. |
363 | | // The rebuild_word is made in TidyUp. |
364 | | // blob_count blobs are put together to match the box. Merge the |
365 | | // box_word boxes, save the blob_count in the state and the text. |
366 | 0 | word_res->box_word->MergeBoxes(i, i + blob_count); |
367 | 0 | word_res->best_state[i] = blob_count; |
368 | 0 | word_res->correct_text[i] = correct_text; |
369 | 0 | if (applybox_debug > 2) { |
370 | 0 | tprintf("%d Blobs match: blob box:", blob_count); |
371 | 0 | word_res->box_word->BlobBox(i).print(); |
372 | 0 | tprintf("Matches box:"); |
373 | 0 | box.print(); |
374 | 0 | if (next_box != nullptr) { |
375 | 0 | tprintf("With next box:"); |
376 | 0 | next_box->print(); |
377 | 0 | } |
378 | 0 | } |
379 | | // Eliminated best_state and correct_text entries for the consumed |
380 | | // blobs. |
381 | 0 | for (int j = 1; j < blob_count; ++j) { |
382 | 0 | word_res->best_state.erase(word_res->best_state.begin() + i + 1); |
383 | 0 | word_res->correct_text.erase(word_res->correct_text.begin() + i + 1); |
384 | 0 | } |
385 | | // Assume that no box spans multiple source words, so we are done with |
386 | | // this box. |
387 | 0 | if (applybox_debug > 1) { |
388 | 0 | tprintf("Best state = "); |
389 | 0 | for (auto best_state : word_res->best_state) { |
390 | 0 | tprintf("%d ", best_state); |
391 | 0 | } |
392 | 0 | tprintf("\n"); |
393 | 0 | tprintf("Correct text = [[ "); |
394 | 0 | for (auto &it : word_res->correct_text) { |
395 | 0 | tprintf("%s ", it.c_str()); |
396 | 0 | } |
397 | 0 | tprintf("]]\n"); |
398 | 0 | } |
399 | 0 | return true; |
400 | 0 | } |
401 | 0 | } |
402 | 0 | } |
403 | 0 | if (applybox_debug > 0) { |
404 | 0 | tprintf("FAIL!\n"); |
405 | 0 | } |
406 | 0 | return false; // Failure. |
407 | 0 | } |
408 | | |
409 | | /// Consume all source blobs that strongly overlap the given box, |
410 | | /// putting them into a new word, with the correct_text label. |
411 | | /// Fights over which box owns which blobs are settled by |
412 | | /// applying the blobs to box or next_box with the least non-overlap. |
413 | | /// @return false if the box was in error, which can only be caused by |
414 | | /// failing to find an overlapping blob for a box. |
415 | | bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, |
416 | 0 | const char *correct_text) { |
417 | 0 | if (applybox_debug > 1) { |
418 | 0 | tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); |
419 | 0 | } |
420 | 0 | WERD *new_word = nullptr; |
421 | 0 | BLOCK_IT b_it(block_list); |
422 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
423 | 0 | BLOCK *block = b_it.data(); |
424 | 0 | if (!box.major_overlap(block->pdblk.bounding_box())) { |
425 | 0 | continue; |
426 | 0 | } |
427 | 0 | ROW_IT r_it(block->row_list()); |
428 | 0 | for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { |
429 | 0 | ROW *row = r_it.data(); |
430 | 0 | if (!box.major_overlap(row->bounding_box())) { |
431 | 0 | continue; |
432 | 0 | } |
433 | 0 | WERD_IT w_it(row->word_list()); |
434 | 0 | for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { |
435 | 0 | WERD *word = w_it.data(); |
436 | 0 | if (applybox_debug > 2) { |
437 | 0 | tprintf("Checking word:"); |
438 | 0 | word->bounding_box().print(); |
439 | 0 | } |
440 | 0 | if (word->text() != nullptr && word->text()[0] != '\0') { |
441 | 0 | continue; // Ignore words that are already done. |
442 | 0 | } |
443 | 0 | if (!box.major_overlap(word->bounding_box())) { |
444 | 0 | continue; |
445 | 0 | } |
446 | 0 | C_BLOB_IT blob_it(word->cblob_list()); |
447 | 0 | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
448 | 0 | C_BLOB *blob = blob_it.data(); |
449 | 0 | TBOX blob_box = blob->bounding_box(); |
450 | 0 | if (!blob_box.major_overlap(box)) { |
451 | 0 | continue; |
452 | 0 | } |
453 | 0 | if (next_box != nullptr) { |
454 | 0 | const double current_box_miss_metric = BoxMissMetric(blob_box, box); |
455 | 0 | const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box); |
456 | 0 | if (applybox_debug > 2) { |
457 | 0 | tprintf("Checking blob:"); |
458 | 0 | blob_box.print(); |
459 | 0 | tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric, |
460 | 0 | next_box_miss_metric); |
461 | 0 | } |
462 | 0 | if (current_box_miss_metric > next_box_miss_metric) { |
463 | 0 | continue; // Blob is a better match for next box. |
464 | 0 | } |
465 | 0 | } |
466 | 0 | if (applybox_debug > 2) { |
467 | 0 | tprintf("Blob match: blob:"); |
468 | 0 | blob_box.print(); |
469 | 0 | tprintf("Matches box:"); |
470 | 0 | box.print(); |
471 | 0 | if (next_box != nullptr) { |
472 | 0 | tprintf("With next box:"); |
473 | 0 | next_box->print(); |
474 | 0 | } |
475 | 0 | } |
476 | 0 | if (new_word == nullptr) { |
477 | | // Make a new word with a single blob. |
478 | 0 | new_word = word->shallow_copy(); |
479 | 0 | new_word->set_text(correct_text); |
480 | 0 | w_it.add_to_end(new_word); |
481 | 0 | } |
482 | 0 | C_BLOB_IT new_blob_it(new_word->cblob_list()); |
483 | 0 | new_blob_it.add_to_end(blob_it.extract()); |
484 | 0 | } |
485 | 0 | } |
486 | 0 | } |
487 | 0 | } |
488 | 0 | if (new_word == nullptr && applybox_debug > 0) { |
489 | 0 | tprintf("FAIL!\n"); |
490 | 0 | } |
491 | 0 | return new_word != nullptr; |
492 | 0 | } |
493 | | |
494 | | /// Resegments the words by running the classifier in an attempt to find the |
495 | | /// correct segmentation that produces the required string. |
496 | 0 | void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) { |
497 | 0 | PAGE_RES_IT pr_it(page_res); |
498 | 0 | WERD_RES *word_res; |
499 | 0 | for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) { |
500 | 0 | const WERD *word = word_res->word; |
501 | 0 | if (word->text() == nullptr || word->text()[0] == '\0') { |
502 | 0 | continue; // Ignore words that have no text. |
503 | 0 | } |
504 | | // Convert the correct text to a vector of UNICHAR_ID |
505 | 0 | std::vector<UNICHAR_ID> target_text; |
506 | 0 | if (!ConvertStringToUnichars(word->text(), &target_text)) { |
507 | 0 | tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text()); |
508 | 0 | pr_it.DeleteCurrentWord(); |
509 | 0 | continue; |
510 | 0 | } |
511 | 0 | if (!FindSegmentation(target_text, word_res)) { |
512 | 0 | tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text()); |
513 | 0 | pr_it.DeleteCurrentWord(); |
514 | 0 | continue; |
515 | 0 | } |
516 | 0 | } |
517 | 0 | } |
518 | | |
519 | | /// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. |
520 | | /// @return false if an invalid UNICHAR_ID is encountered. |
521 | 0 | bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) { |
522 | 0 | for (int step = 0; *utf8 != '\0'; utf8 += step) { |
523 | 0 | const char *next_space = strchr(utf8, ' '); |
524 | 0 | if (next_space == nullptr) { |
525 | 0 | next_space = utf8 + strlen(utf8); |
526 | 0 | } |
527 | 0 | step = next_space - utf8; |
528 | 0 | UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step); |
529 | 0 | if (class_id == INVALID_UNICHAR_ID) { |
530 | 0 | return false; |
531 | 0 | } |
532 | 0 | while (utf8[step] == ' ') { |
533 | 0 | ++step; |
534 | 0 | } |
535 | 0 | class_ids->push_back(class_id); |
536 | 0 | } |
537 | 0 | return true; |
538 | 0 | } |
539 | | |
540 | | /// Resegments the word to achieve the target_text from the classifier. |
541 | | /// Returns false if the re-segmentation fails. |
542 | | /// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and |
543 | | /// applies a full search on the classifier results to find the best classified |
544 | | /// segmentation. As a compromise to obtain better recall, 1-1 ambiguity |
545 | | /// substitutions ARE used. |
546 | 0 | bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) { |
547 | | // Classify all required combinations of blobs and save results in choices. |
548 | 0 | const int word_length = word_res->box_word->length(); |
549 | 0 | auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length]; |
550 | 0 | for (int i = 0; i < word_length; ++i) { |
551 | 0 | for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { |
552 | 0 | BLOB_CHOICE_LIST *match_result = |
553 | 0 | classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word, |
554 | 0 | word_res->blamer_bundle); |
555 | 0 | if (applybox_debug > 2) { |
556 | 0 | tprintf("%d+%d:", i, j); |
557 | 0 | print_ratings_list("Segment:", match_result, unicharset); |
558 | 0 | } |
559 | 0 | choices[i].push_back(match_result); |
560 | 0 | } |
561 | 0 | } |
562 | | // Search the segmentation graph for the target text. Must be an exact |
563 | | // match. Using wildcards makes it difficult to find the correct |
564 | | // segmentation even when it is there. |
565 | 0 | word_res->best_state.clear(); |
566 | 0 | std::vector<int> search_segmentation; |
567 | 0 | float best_rating = 0.0f; |
568 | 0 | SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating, |
569 | 0 | &word_res->best_state); |
570 | 0 | for (int i = 0; i < word_length; ++i) { |
571 | 0 | for (auto choice : choices[i]) { |
572 | 0 | delete choice; |
573 | 0 | } |
574 | 0 | } |
575 | 0 | delete[] choices; |
576 | 0 | if (word_res->best_state.empty()) { |
577 | | // Build the original segmentation and if it is the same length as the |
578 | | // truth, assume it will do. |
579 | 0 | int blob_count = 1; |
580 | 0 | for (auto s : word_res->seam_array) { |
581 | 0 | SEAM *seam = s; |
582 | 0 | if (!seam->HasAnySplits()) { |
583 | 0 | word_res->best_state.push_back(blob_count); |
584 | 0 | blob_count = 1; |
585 | 0 | } else { |
586 | 0 | ++blob_count; |
587 | 0 | } |
588 | 0 | } |
589 | 0 | word_res->best_state.push_back(blob_count); |
590 | 0 | if (word_res->best_state.size() != target_text.size()) { |
591 | 0 | word_res->best_state.clear(); // No good. Original segmentation bad size. |
592 | 0 | return false; |
593 | 0 | } |
594 | 0 | } |
595 | 0 | word_res->correct_text.clear(); |
596 | 0 | for (auto &text : target_text) { |
597 | 0 | word_res->correct_text.emplace_back(unicharset.id_to_unichar(text)); |
598 | 0 | } |
599 | 0 | return true; |
600 | 0 | } |
601 | | |
602 | | /// Recursive helper to find a match to the target_text (from text_index |
603 | | /// position) in the choices (from choices_pos position). |
604 | | /// @param choices is an array of vectors of length choices_length, |
605 | | /// with each element representing a starting position in the word, and the |
606 | | /// #vector holding classification results for a sequence of consecutive |
607 | | /// blobs, with index 0 being a single blob, index 1 being 2 blobs etc. |
608 | | /// @param choices_pos |
609 | | /// @param choices_length |
610 | | /// @param target_text |
611 | | /// @param text_index |
612 | | /// @param rating |
613 | | /// @param segmentation |
614 | | /// @param best_rating |
615 | | /// @param best_segmentation |
616 | | void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos, |
617 | | unsigned choices_length, const std::vector<UNICHAR_ID> &target_text, |
618 | | unsigned text_index, float rating, std::vector<int> *segmentation, |
619 | 0 | float *best_rating, std::vector<int> *best_segmentation) { |
620 | 0 | const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs(); |
621 | 0 | for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) { |
622 | | // Rating of matching choice or worst choice if no match. |
623 | 0 | float choice_rating = 0.0f; |
624 | | // Find the corresponding best BLOB_CHOICE. |
625 | 0 | BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]); |
626 | 0 | for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { |
627 | 0 | const BLOB_CHOICE *choice = choice_it.data(); |
628 | 0 | choice_rating = choice->rating(); |
629 | 0 | auto class_id = choice->unichar_id(); |
630 | 0 | if (class_id == target_text[text_index]) { |
631 | 0 | break; |
632 | 0 | } |
633 | | // Search ambigs table. |
634 | 0 | if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) { |
635 | 0 | AmbigSpec_IT spec_it(table[class_id]); |
636 | 0 | for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) { |
637 | 0 | const AmbigSpec *ambig_spec = spec_it.data(); |
638 | | // We'll only do 1-1. |
639 | 0 | if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID && |
640 | 0 | ambig_spec->correct_ngram_id == target_text[text_index]) { |
641 | 0 | break; |
642 | 0 | } |
643 | 0 | } |
644 | 0 | if (!spec_it.cycled_list()) { |
645 | 0 | break; // Found an ambig. |
646 | 0 | } |
647 | 0 | } |
648 | 0 | } |
649 | 0 | if (choice_it.cycled_list()) { |
650 | 0 | continue; // No match. |
651 | 0 | } |
652 | 0 | segmentation->push_back(length); |
653 | 0 | if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) { |
654 | | // This is a complete match. If the rating is good record a new best. |
655 | 0 | if (applybox_debug > 2) { |
656 | 0 | tesserr << "Complete match, rating = " << rating + choice_rating |
657 | 0 | << ", best=" << *best_rating |
658 | 0 | << ", seglength=" << segmentation->size() |
659 | 0 | << ", best=" << best_segmentation->size() << '\n'; |
660 | 0 | } |
661 | 0 | if (best_segmentation->empty() || rating + choice_rating < *best_rating) { |
662 | 0 | *best_segmentation = *segmentation; |
663 | 0 | *best_rating = rating + choice_rating; |
664 | 0 | } |
665 | 0 | } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) { |
666 | 0 | if (applybox_debug > 3) { |
667 | 0 | tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index], |
668 | 0 | unicharset.id_to_unichar(target_text[text_index]), |
669 | 0 | choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig", |
670 | 0 | choices_pos, length); |
671 | 0 | } |
672 | 0 | SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1, |
673 | 0 | rating + choice_rating, segmentation, best_rating, best_segmentation); |
674 | 0 | if (applybox_debug > 3) { |
675 | 0 | tprintf("End recursion for %d=%s\n", target_text[text_index], |
676 | 0 | unicharset.id_to_unichar(target_text[text_index])); |
677 | 0 | } |
678 | 0 | } |
679 | 0 | segmentation->resize(segmentation->size() - 1); |
680 | 0 | } |
681 | 0 | } |
682 | | |
683 | | /// - Counts up the labelled words and the blobs within. |
684 | | /// - Deletes all unused or emptied words, counting the unused ones. |
685 | | /// - Resets W_BOL and W_EOL flags correctly. |
686 | | /// - Builds the rebuild_word and rebuilds the box_word and the best_choice. |
687 | 0 | void Tesseract::TidyUp(PAGE_RES *page_res) { |
688 | 0 | int ok_blob_count = 0; |
689 | 0 | int bad_blob_count = 0; |
690 | | // TODO: check usage of ok_word_count. |
691 | 0 | int ok_word_count = 0; |
692 | 0 | int unlabelled_words = 0; |
693 | 0 | PAGE_RES_IT pr_it(page_res); |
694 | 0 | WERD_RES *word_res; |
695 | 0 | for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) { |
696 | 0 | int ok_in_word = 0; |
697 | 0 | int blob_count = word_res->correct_text.size(); |
698 | 0 | auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count); |
699 | 0 | word_choice->set_permuter(TOP_CHOICE_PERM); |
700 | 0 | for (int c = 0; c < blob_count; ++c) { |
701 | 0 | if (word_res->correct_text[c].length() > 0) { |
702 | 0 | ++ok_in_word; |
703 | 0 | } |
704 | | // Since we only need a fake word_res->best_choice, the actual |
705 | | // unichar_ids do not matter. Which is fortunate, since TidyUp() |
706 | | // can be called while training Tesseract, at the stage where |
707 | | // unicharset is not meaningful yet. |
708 | 0 | word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c], |
709 | 0 | 1.0f, -1.0f); |
710 | 0 | } |
711 | 0 | if (ok_in_word > 0) { |
712 | 0 | ok_blob_count += ok_in_word; |
713 | 0 | bad_blob_count += word_res->correct_text.size() - ok_in_word; |
714 | 0 | word_res->LogNewRawChoice(word_choice); |
715 | 0 | word_res->LogNewCookedChoice(1, false, word_choice); |
716 | 0 | } else { |
717 | 0 | ++unlabelled_words; |
718 | 0 | if (applybox_debug > 0) { |
719 | 0 | tprintf("APPLY_BOXES: Unlabelled word at :"); |
720 | 0 | word_res->word->bounding_box().print(); |
721 | 0 | } |
722 | 0 | pr_it.DeleteCurrentWord(); |
723 | 0 | delete word_choice; |
724 | 0 | } |
725 | 0 | } |
726 | 0 | pr_it.restart_page(); |
727 | 0 | for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) { |
728 | | // Denormalize back to a BoxWord. |
729 | 0 | word_res->RebuildBestState(); |
730 | 0 | word_res->SetupBoxWord(); |
731 | 0 | word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row()); |
732 | 0 | word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row()); |
733 | 0 | } |
734 | 0 | if (applybox_debug > 0) { |
735 | 0 | tprintf(" Found %d good blobs.\n", ok_blob_count); |
736 | 0 | if (bad_blob_count > 0) { |
737 | 0 | tprintf(" Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count); |
738 | 0 | } |
739 | 0 | if (unlabelled_words > 0) { |
740 | 0 | tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words); |
741 | 0 | } |
742 | 0 | } |
743 | 0 | } |
744 | | |
745 | | /** Logs a bad box by line in the box file and box coords.*/ |
746 | | void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, |
747 | 0 | const char *err_msg) { |
748 | 0 | tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch, |
749 | 0 | box.left(), box.bottom(), box.right(), box.top(), err_msg); |
750 | 0 | } |
751 | | |
752 | | /// Calls #LearnWord to extract features for labelled blobs within each word. |
753 | | /// Features are stored in an internal buffer. |
754 | 0 | void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) { |
755 | 0 | PAGE_RES_IT pr_it(page_res); |
756 | 0 | int word_count = 0; |
757 | 0 | for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) { |
758 | 0 | LearnWord(fontname.c_str(), word_res); |
759 | 0 | ++word_count; |
760 | 0 | } |
761 | 0 | tprintf("Generated training data for %d words\n", word_count); |
762 | 0 | } |
763 | | |
764 | | #endif // ndef DISABLED_LEGACY_ENGINE |
765 | | |
766 | | /** Creates a fake best_choice entry in each WERD_RES with the correct text.*/ |
767 | 0 | void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) { |
768 | 0 | PAGE_RES_IT pr_it(page_res); |
769 | 0 | for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) { |
770 | 0 | auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size()); |
771 | 0 | for (auto &correct_text : word_res->correct_text) { |
772 | | // The part before the first space is the real ground truth, and the |
773 | | // rest is the bounding box location and page number. |
774 | 0 | std::vector<std::string> tokens = split(correct_text, ' '); |
775 | 0 | UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str()); |
776 | 0 | choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f); |
777 | 0 | } |
778 | 0 | word_res->ClearWordChoices(); |
779 | 0 | word_res->LogNewRawChoice(choice); |
780 | 0 | word_res->LogNewCookedChoice(1, false, choice); |
781 | 0 | } |
782 | 0 | } |
783 | | |
784 | | } // namespace tesseract |