/src/tesseract/src/ccstruct/pageres.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: pageres.cpp (Formerly page_res.c) |
3 | | * Description: Hierarchy of results classes from PAGE_RES to WERD_RES |
4 | | * and an iterator class to iterate over the words. |
5 | | * Main purposes: |
6 | | * Easy way to iterate over the words without a 3-nested loop. |
7 | | * Holds data used during word recognition. |
8 | | * Holds information about alternative spacing paths. |
9 | | * Author: Phil Cheatle |
10 | | * |
11 | | * (C) Copyright 1992, Hewlett-Packard Ltd. |
12 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
13 | | ** you may not use this file except in compliance with the License. |
14 | | ** You may obtain a copy of the License at |
15 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
16 | | ** Unless required by applicable law or agreed to in writing, software |
17 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
18 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
19 | | ** See the License for the specific language governing permissions and |
20 | | ** limitations under the License. |
21 | | * |
22 | | **********************************************************************/ |
23 | | |
24 | | #include "pageres.h" |
25 | | |
26 | | #include "blamer.h" // for BlamerBundle |
27 | | #include "blobs.h" // for TWERD, TBLOB |
28 | | #include "boxword.h" // for BoxWord |
29 | | #include "errcode.h" // for ASSERT_HOST |
30 | | #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only) |
31 | | #include "ocrrow.h" // for ROW, ROW_IT |
32 | | #include "pdblock.h" // for PDBLK |
33 | | #include "polyblk.h" // for POLY_BLOCK |
34 | | #include "seam.h" // for SEAM, start_seam_list |
35 | | #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST |
36 | | #include "tprintf.h" // for tprintf |
37 | | |
38 | | #include <tesseract/publictypes.h> // for OcrEngineMode, OEM_LSTM_ONLY |
39 | | |
40 | | #include <cassert> // for assert |
41 | | #include <cstdint> // for INT32_MAX |
42 | | #include <cstring> // for strlen |
43 | | |
44 | | struct Pix; |
45 | | |
46 | | namespace tesseract { |
47 | | |
// Multiplier applied to the difference of two adjustment factors when
// computing the threshold that determines the ambiguity of a word.
static const double kStopperAmbiguityThresholdGain = 8.0;
// Constant subtracted from the scaled difference when computing the
// threshold that determines the ambiguity of a word.
static const double kStopperAmbiguityThresholdOffset = 1.5;
// Upper limit on the number of broken pieces to associate.
const int kWordrecMaxNumJoinChunks = 4;
// Largest ratio of word box height to line size at which a word may still be
// processed as a line together with other words.
const double kMaxWordSizeRatio = 1.25;
// Largest ratio of line box height to line size at which a new word may still
// be added.
const double kMaxLineSizeRatio = 1.25;
// Largest ratio of word gap to line size at which a new word may still be
// added.
const double kMaxWordGapRatio = 2.0;
63 | | |
64 | | // Computes and returns a threshold of certainty difference used to determine |
65 | | // which words to keep, based on the adjustment factors of the two words. |
66 | | // TODO(rays) This is horrible. Replace with an enhance params training model. |
67 | 485k | static double StopperAmbigThreshold(double f1, double f2) { |
68 | 485k | return (f2 - f1) * kStopperAmbiguityThresholdGain - |
69 | 485k | kStopperAmbiguityThresholdOffset; |
70 | 485k | } |
71 | | |
72 | | /************************************************************************* |
73 | | * PAGE_RES::PAGE_RES |
74 | | * |
75 | | * Constructor for page results |
76 | | *************************************************************************/ |
77 | | PAGE_RES::PAGE_RES(bool merge_similar_words, BLOCK_LIST *the_block_list, |
78 | 7.72k | WERD_CHOICE **prev_word_best_choice_ptr) { |
79 | 7.72k | Init(); |
80 | 7.72k | BLOCK_IT block_it(the_block_list); |
81 | 7.72k | BLOCK_RES_IT block_res_it(&block_res_list); |
82 | 14.7k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
83 | 7.06k | block_res_it.add_to_end( |
84 | 7.06k | new BLOCK_RES(merge_similar_words, block_it.data())); |
85 | 7.06k | } |
86 | 7.72k | prev_word_best_choice = prev_word_best_choice_ptr; |
87 | 7.72k | } |
88 | | |
89 | | /************************************************************************* |
90 | | * BLOCK_RES::BLOCK_RES |
91 | | * |
92 | | * Constructor for BLOCK results |
93 | | *************************************************************************/ |
94 | | |
95 | 7.06k | BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) { |
96 | 7.06k | ROW_IT row_it(the_block->row_list()); |
97 | 7.06k | ROW_RES_IT row_res_it(&row_res_list); |
98 | | |
99 | 7.06k | char_count = 0; |
100 | 7.06k | rej_count = 0; |
101 | 7.06k | font_class = -1; // not assigned |
102 | 7.06k | x_height = -1.0; |
103 | 7.06k | font_assigned = false; |
104 | 7.06k | row_count = 0; |
105 | | |
106 | 7.06k | block = the_block; |
107 | | |
108 | 102k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
109 | 95.5k | row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data())); |
110 | 95.5k | } |
111 | 7.06k | } |
112 | | |
113 | | /************************************************************************* |
114 | | * ROW_RES::ROW_RES |
115 | | * |
116 | | * Constructor for ROW results |
117 | | *************************************************************************/ |
118 | | |
119 | 95.5k | ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) { |
120 | 95.5k | WERD_IT word_it(the_row->word_list()); |
121 | 95.5k | WERD_RES_IT word_res_it(&word_res_list); |
122 | 95.5k | WERD_RES *combo = nullptr; // current combination of fuzzies |
123 | 95.5k | WERD *copy_word; |
124 | | |
125 | 95.5k | char_count = 0; |
126 | 95.5k | rej_count = 0; |
127 | 95.5k | whole_word_rej_count = 0; |
128 | | |
129 | 95.5k | row = the_row; |
130 | 95.5k | bool add_next_word = false; |
131 | 95.5k | TBOX union_box; |
132 | 95.5k | float line_height = |
133 | 95.5k | the_row->x_height() + the_row->ascenders() - the_row->descenders(); |
134 | 241k | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
135 | 145k | auto *word_res = new WERD_RES(word_it.data()); |
136 | 145k | word_res->x_height = the_row->x_height(); |
137 | 145k | if (add_next_word) { |
138 | 33.9k | ASSERT_HOST(combo != nullptr); |
139 | | // We are adding this word to the combination. |
140 | 33.9k | word_res->part_of_combo = true; |
141 | 33.9k | combo->copy_on(word_res); |
142 | 111k | } else if (merge_similar_words) { |
143 | 111k | union_box = word_res->word->bounding_box(); |
144 | 111k | add_next_word = !word_res->word->flag(W_REP_CHAR) && |
145 | 111k | union_box.height() <= line_height * kMaxWordSizeRatio; |
146 | 111k | word_res->odd_size = !add_next_word; |
147 | 111k | } |
148 | 145k | WERD *next_word = word_it.data_relative(1); |
149 | 145k | if (merge_similar_words) { |
150 | 145k | if (add_next_word && !next_word->flag(W_REP_CHAR)) { |
151 | | // Next word will be added on if all of the following are true: |
152 | | // Not a rep char. |
153 | | // Box height small enough. |
154 | | // Union box height small enough. |
155 | | // Horizontal gap small enough. |
156 | 132k | TBOX next_box = next_word->bounding_box(); |
157 | 132k | int prev_right = union_box.right(); |
158 | 132k | union_box += next_box; |
159 | 132k | if (next_box.height() > line_height * kMaxWordSizeRatio || |
160 | 132k | union_box.height() > line_height * kMaxLineSizeRatio || |
161 | 132k | next_box.left() > prev_right + line_height * kMaxWordGapRatio) { |
162 | 17.2k | add_next_word = false; |
163 | 17.2k | } |
164 | 132k | } |
165 | 145k | next_word->set_flag(W_FUZZY_NON, add_next_word); |
166 | 145k | } else { |
167 | 0 | add_next_word = next_word->flag(W_FUZZY_NON); |
168 | 0 | } |
169 | 145k | if (add_next_word) { |
170 | 115k | if (combo == nullptr) { |
171 | 84.1k | copy_word = new WERD; |
172 | 84.1k | *copy_word = *(word_it.data()); // deep copy |
173 | 84.1k | combo = new WERD_RES(copy_word); |
174 | 84.1k | combo->x_height = the_row->x_height(); |
175 | 84.1k | combo->combination = true; |
176 | 84.1k | word_res_it.add_to_end(combo); |
177 | 84.1k | } |
178 | 115k | word_res->part_of_combo = true; |
179 | 115k | } else { |
180 | 30.5k | combo = nullptr; |
181 | 30.5k | } |
182 | 145k | word_res_it.add_to_end(word_res); |
183 | 145k | } |
184 | 95.5k | } |
185 | | |
186 | 26.8k | WERD_RES &WERD_RES::operator=(const WERD_RES &source) { |
187 | 26.8k | this->ELIST<WERD_RES>::LINK::operator=(source); |
188 | 26.8k | Clear(); |
189 | 26.8k | if (source.combination) { |
190 | 0 | word = new WERD; |
191 | 0 | *word = *(source.word); // deep copy |
192 | 26.8k | } else { |
193 | 26.8k | word = source.word; // pt to same word |
194 | 26.8k | } |
195 | 26.8k | if (source.bln_boxes != nullptr) { |
196 | 26.8k | bln_boxes = new tesseract::BoxWord(*source.bln_boxes); |
197 | 26.8k | } |
198 | 26.8k | if (source.chopped_word != nullptr) { |
199 | 26.8k | chopped_word = new TWERD(*source.chopped_word); |
200 | 26.8k | } |
201 | 26.8k | if (source.rebuild_word != nullptr) { |
202 | 13.1k | rebuild_word = new TWERD(*source.rebuild_word); |
203 | 13.1k | } |
204 | | // TODO(rays) Do we ever need to copy the seam_array? |
205 | 26.8k | blob_row = source.blob_row; |
206 | 26.8k | denorm = source.denorm; |
207 | 26.8k | if (source.box_word != nullptr) { |
208 | 13.1k | box_word = new tesseract::BoxWord(*source.box_word); |
209 | 13.1k | } |
210 | 26.8k | best_state = source.best_state; |
211 | 26.8k | correct_text = source.correct_text; |
212 | 26.8k | blob_widths = source.blob_widths; |
213 | 26.8k | blob_gaps = source.blob_gaps; |
214 | | // None of the uses of operator= require the ratings matrix to be copied, |
215 | | // so don't as it would be really slow. |
216 | | |
217 | | // Copy the cooked choices. |
218 | 26.8k | WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&source.best_choices)); |
219 | 26.8k | WERD_CHOICE_IT wc_dest_it(&best_choices); |
220 | 83.8k | for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) { |
221 | 57.0k | const WERD_CHOICE *choice = wc_it.data(); |
222 | 57.0k | wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice)); |
223 | 57.0k | } |
224 | 26.8k | if (!wc_dest_it.empty()) { |
225 | 13.1k | wc_dest_it.move_to_first(); |
226 | 13.1k | best_choice = wc_dest_it.data(); |
227 | 13.7k | } else { |
228 | 13.7k | best_choice = nullptr; |
229 | 13.7k | } |
230 | | |
231 | 26.8k | if (source.raw_choice != nullptr) { |
232 | 13.1k | raw_choice = new WERD_CHOICE(*source.raw_choice); |
233 | 13.7k | } else { |
234 | 13.7k | raw_choice = nullptr; |
235 | 13.7k | } |
236 | 26.8k | if (source.ep_choice != nullptr) { |
237 | 0 | ep_choice = new WERD_CHOICE(*source.ep_choice); |
238 | 26.8k | } else { |
239 | 26.8k | ep_choice = nullptr; |
240 | 26.8k | } |
241 | 26.8k | reject_map = source.reject_map; |
242 | 26.8k | combination = source.combination; |
243 | 26.8k | part_of_combo = source.part_of_combo; |
244 | 26.8k | CopySimpleFields(source); |
245 | 26.8k | if (source.blamer_bundle != nullptr) { |
246 | 0 | blamer_bundle = new BlamerBundle(*(source.blamer_bundle)); |
247 | 0 | } |
248 | 26.8k | return *this; |
249 | 26.8k | } |
250 | | |
251 | | // Copies basic fields that don't involve pointers that might be useful |
252 | | // to copy when making one WERD_RES from another. |
253 | 239k | void WERD_RES::CopySimpleFields(const WERD_RES &source) { |
254 | 239k | tess_failed = source.tess_failed; |
255 | 239k | tess_accepted = source.tess_accepted; |
256 | 239k | tess_would_adapt = source.tess_would_adapt; |
257 | 239k | done = source.done; |
258 | 239k | unlv_crunch_mode = source.unlv_crunch_mode; |
259 | 239k | small_caps = source.small_caps; |
260 | 239k | odd_size = source.odd_size; |
261 | 239k | fontinfo = source.fontinfo; |
262 | 239k | fontinfo2 = source.fontinfo2; |
263 | 239k | fontinfo_id_count = source.fontinfo_id_count; |
264 | 239k | fontinfo_id2_count = source.fontinfo_id2_count; |
265 | 239k | x_height = source.x_height; |
266 | 239k | caps_height = source.caps_height; |
267 | 239k | baseline_shift = source.baseline_shift; |
268 | 239k | guessed_x_ht = source.guessed_x_ht; |
269 | 239k | guessed_caps_ht = source.guessed_caps_ht; |
270 | 239k | reject_spaces = source.reject_spaces; |
271 | 239k | uch_set = source.uch_set; |
272 | 239k | tesseract = source.tesseract; |
273 | 239k | } |
274 | | |
275 | | // Initializes a blank (default constructed) WERD_RES from one that has |
276 | | // already been recognized. |
277 | | // Use SetupFor*Recognition afterwards to complete the setup and make |
278 | | // it ready for a retry recognition. |
279 | 141k | void WERD_RES::InitForRetryRecognition(const WERD_RES &source) { |
280 | 141k | word = source.word; |
281 | 141k | CopySimpleFields(source); |
282 | 141k | if (source.blamer_bundle != nullptr) { |
283 | 0 | blamer_bundle = new BlamerBundle(); |
284 | 0 | blamer_bundle->CopyTruth(*source.blamer_bundle); |
285 | 0 | } |
286 | 141k | } |
287 | | |
288 | | // Sets up the members used in recognition: bln_boxes, chopped_word, |
289 | | // seam_array, denorm. Returns false if |
290 | | // the word is empty and sets up fake results. If use_body_size is |
291 | | // true and row->body_size is set, then body_size will be used for |
292 | | // blob normalization instead of xheight + ascrise. This flag is for |
293 | | // those languages that are using CJK pitch model and thus it has to |
294 | | // be true if and only if tesseract->textord_use_cjk_fp_model is |
295 | | // true. |
296 | | // If allow_detailed_fx is true, the feature extractor will receive fine |
297 | | // precision outline information, allowing smoother features and better |
298 | | // features on low resolution images. |
299 | | // The norm_mode_hint sets the default mode for normalization in absence |
300 | | // of any of the above flags. |
301 | | // norm_box is used to override the word bounding box to determine the |
302 | | // normalization scale and offset. |
303 | | // Returns false if the word is empty and sets up fake results. |
304 | | bool WERD_RES::SetupForRecognition(const UNICHARSET &unicharset_in, |
305 | | tesseract::Tesseract *tess, Image pix, |
306 | | int norm_mode, const TBOX *norm_box, |
307 | | bool numeric_mode, bool use_body_size, |
308 | | bool allow_detailed_fx, ROW *row, |
309 | 300k | const BLOCK *block) { |
310 | 300k | auto norm_mode_hint = static_cast<tesseract::OcrEngineMode>(norm_mode); |
311 | 300k | tesseract = tess; |
312 | 300k | POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr; |
313 | 300k | if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY && |
314 | 300k | word->cblob_list()->empty()) || |
315 | 300k | (pb != nullptr && !pb->IsText())) { |
316 | | // Empty words occur when all the blobs have been moved to the rej_blobs |
317 | | // list, which seems to occur frequently in junk. |
318 | 150 | SetupFake(unicharset_in); |
319 | 150 | word->set_flag(W_REP_CHAR, false); |
320 | 150 | return false; |
321 | 150 | } |
322 | 300k | ClearResults(); |
323 | 300k | SetupWordScript(unicharset_in); |
324 | 300k | chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word); |
325 | 300k | float word_xheight = |
326 | 300k | use_body_size && row != nullptr && row->body_size() > 0.0f |
327 | 300k | ? row->body_size() |
328 | 300k | : x_height; |
329 | 300k | chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE), |
330 | 300k | word_xheight, baseline_shift, numeric_mode, |
331 | 300k | norm_mode_hint, norm_box, &denorm); |
332 | 300k | blob_row = row; |
333 | 300k | SetupBasicsFromChoppedWord(unicharset_in); |
334 | 300k | SetupBlamerBundle(); |
335 | 300k | int num_blobs = chopped_word->NumBlobs(); |
336 | 300k | ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks); |
337 | 300k | tess_failed = false; |
338 | 300k | return true; |
339 | 300k | } |
340 | | |
341 | | // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty |
342 | | // accumulators from a made chopped word. We presume the fields are already |
343 | | // empty. |
344 | 340k | void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) { |
345 | 340k | bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word); |
346 | 340k | start_seam_list(chopped_word, &seam_array); |
347 | 340k | SetupBlobWidthsAndGaps(); |
348 | 340k | ClearWordChoices(); |
349 | 340k | } |
350 | | |
351 | | // Sets up the members used in recognition for an empty recognition result: |
352 | | // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. |
353 | 150 | void WERD_RES::SetupFake(const UNICHARSET &unicharset_in) { |
354 | 150 | ClearResults(); |
355 | 150 | SetupWordScript(unicharset_in); |
356 | 150 | chopped_word = new TWERD; |
357 | 150 | rebuild_word = new TWERD; |
358 | 150 | bln_boxes = new tesseract::BoxWord; |
359 | 150 | box_word = new tesseract::BoxWord; |
360 | 150 | int blob_count = word->cblob_list()->length(); |
361 | 150 | if (blob_count > 0) { |
362 | 0 | auto **fake_choices = new BLOB_CHOICE *[blob_count]; |
363 | | // For non-text blocks, just pass any blobs through to the box_word |
364 | | // and call the word failed with a fake classification. |
365 | 0 | C_BLOB_IT b_it(word->cblob_list()); |
366 | 0 | int blob_id = 0; |
367 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
368 | 0 | TBOX box = b_it.data()->bounding_box(); |
369 | 0 | box_word->InsertBox(box_word->length(), box); |
370 | 0 | fake_choices[blob_id++] = new BLOB_CHOICE; |
371 | 0 | } |
372 | 0 | FakeClassifyWord(blob_count, fake_choices); |
373 | 0 | delete[] fake_choices; |
374 | 150 | } else { |
375 | 150 | auto *word = new WERD_CHOICE(&unicharset_in); |
376 | 150 | word->make_bad(); |
377 | 150 | LogNewRawChoice(word); |
378 | | // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice. |
379 | 150 | LogNewCookedChoice(1, false, word); |
380 | 150 | } |
381 | 150 | tess_failed = true; |
382 | 150 | done = true; |
383 | 150 | } |
384 | | |
385 | 300k | void WERD_RES::SetupWordScript(const UNICHARSET &uch) { |
386 | 300k | uch_set = &uch; |
387 | 300k | int script = uch.default_sid(); |
388 | 300k | word->set_script_id(script); |
389 | 300k | word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight()); |
390 | 300k | word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid()); |
391 | 300k | } |
392 | | |
393 | | // Sets up the blamer_bundle if it is not null, using the initialized denorm. |
394 | 300k | void WERD_RES::SetupBlamerBundle() { |
395 | 300k | if (blamer_bundle != nullptr) { |
396 | 0 | blamer_bundle->SetupNormTruthWord(denorm); |
397 | 0 | } |
398 | 300k | } |
399 | | |
400 | | // Computes the blob_widths and blob_gaps from the chopped_word. |
401 | 449k | void WERD_RES::SetupBlobWidthsAndGaps() { |
402 | 449k | blob_widths.clear(); |
403 | 449k | blob_gaps.clear(); |
404 | 449k | int num_blobs = chopped_word->NumBlobs(); |
405 | 3.97M | for (int b = 0; b < num_blobs; ++b) { |
406 | 3.52M | TBLOB *blob = chopped_word->blobs[b]; |
407 | 3.52M | TBOX box = blob->bounding_box(); |
408 | 3.52M | blob_widths.push_back(box.width()); |
409 | 3.52M | if (b + 1 < num_blobs) { |
410 | 3.07M | blob_gaps.push_back(chopped_word->blobs[b + 1]->bounding_box().left() - |
411 | 3.07M | box.right()); |
412 | 3.07M | } |
413 | 3.52M | } |
414 | 449k | } |
415 | | |
416 | | // Updates internal data to account for a new SEAM (chop) at the given |
417 | | // blob_number. Fixes the ratings matrix and states in the choices, as well |
418 | | // as the blob widths and gaps. |
419 | 109k | void WERD_RES::InsertSeam(int blob_number, SEAM *seam) { |
420 | | // Insert the seam into the SEAMS array. |
421 | 109k | seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true); |
422 | 109k | seam_array.insert(seam_array.begin() + blob_number, seam); |
423 | 109k | if (ratings != nullptr) { |
424 | | // Expand the ratings matrix. |
425 | 109k | ratings = ratings->ConsumeAndMakeBigger(blob_number); |
426 | | // Fix all the segmentation states. |
427 | 109k | if (raw_choice != nullptr) { |
428 | 109k | raw_choice->UpdateStateForSplit(blob_number); |
429 | 109k | } |
430 | 109k | WERD_CHOICE_IT wc_it(&best_choices); |
431 | 567k | for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) { |
432 | 457k | WERD_CHOICE *choice = wc_it.data(); |
433 | 457k | choice->UpdateStateForSplit(blob_number); |
434 | 457k | } |
435 | 109k | SetupBlobWidthsAndGaps(); |
436 | 109k | } |
437 | 109k | } |
438 | | |
439 | | // Returns true if all the word choices except the first have adjust_factors |
440 | | // worse than the given threshold. |
441 | 0 | bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const { |
442 | | // The choices are not changed by this iteration. |
443 | 0 | WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&best_choices)); |
444 | 0 | for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) { |
445 | 0 | WERD_CHOICE *choice = wc_it.data(); |
446 | 0 | if (choice->adjust_factor() <= threshold) { |
447 | 0 | return false; |
448 | 0 | } |
449 | 0 | } |
450 | 0 | return true; |
451 | 0 | } |
452 | | |
453 | | // Returns true if the current word is ambiguous (by number of answers or |
454 | | // by dangerous ambigs.) |
455 | 31.9k | bool WERD_RES::IsAmbiguous() { |
456 | 31.9k | return !best_choices.singleton() || best_choice->dangerous_ambig_found(); |
457 | 31.9k | } |
458 | | |
459 | | // Returns true if the ratings matrix size matches the sum of each of the |
460 | | // segmentation states. |
461 | 996k | bool WERD_RES::StatesAllValid() { |
462 | 996k | unsigned ratings_dim = ratings->dimension(); |
463 | 996k | if (raw_choice->TotalOfStates() != ratings_dim) { |
464 | 0 | tprintf("raw_choice has total of states = %u vs ratings dim of %u\n", |
465 | 0 | raw_choice->TotalOfStates(), ratings_dim); |
466 | 0 | return false; |
467 | 0 | } |
468 | 996k | WERD_CHOICE_IT it(&best_choices); |
469 | 996k | unsigned index = 0; |
470 | 5.23M | for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) { |
471 | 4.23M | WERD_CHOICE *choice = it.data(); |
472 | 4.23M | if (choice->TotalOfStates() != ratings_dim) { |
473 | 0 | tprintf("Cooked #%u has total of states = %u vs ratings dim of %u\n", |
474 | 0 | index, choice->TotalOfStates(), ratings_dim); |
475 | 0 | return false; |
476 | 0 | } |
477 | 4.23M | } |
478 | 996k | return true; |
479 | 996k | } |
480 | | |
481 | | // Prints a list of words found if debug is true or the word result matches |
482 | | // the word_to_debug. |
483 | 98.6k | void WERD_RES::DebugWordChoices(bool debug, const char *word_to_debug) { |
484 | 98.6k | if (debug || (word_to_debug != nullptr && *word_to_debug != '\0' && |
485 | 98.6k | best_choice != nullptr && |
486 | 98.6k | best_choice->unichar_string() == std::string(word_to_debug))) { |
487 | 0 | if (raw_choice != nullptr) { |
488 | 0 | raw_choice->print("\nBest Raw Choice"); |
489 | 0 | } |
490 | |
|
491 | 0 | WERD_CHOICE_IT it(&best_choices); |
492 | 0 | int index = 0; |
493 | 0 | for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) { |
494 | 0 | WERD_CHOICE *choice = it.data(); |
495 | 0 | std::string label; |
496 | 0 | label += "\nCooked Choice #" + std::to_string(index); |
497 | 0 | choice->print(label.c_str()); |
498 | 0 | } |
499 | 0 | } |
500 | 98.6k | } |
501 | | |
502 | | // Prints the top choice along with the accepted/done flags. |
503 | 0 | void WERD_RES::DebugTopChoice(const char *msg) const { |
504 | 0 | tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ", tess_accepted, |
505 | 0 | tess_would_adapt, done); |
506 | 0 | if (best_choice == nullptr) { |
507 | 0 | tprintf("<Null choice>\n"); |
508 | 0 | } else { |
509 | 0 | best_choice->print(msg); |
510 | 0 | } |
511 | 0 | } |
512 | | |
513 | | // Removes from best_choices all choices which are not within a reasonable |
514 | | // range of the best choice. |
515 | | // TODO(rays) incorporate the information used here into the params training |
516 | | // re-ranker, in place of this heuristic that is based on the previous |
517 | | // adjustment factor. |
518 | 98.6k | void WERD_RES::FilterWordChoices(int debug_level) { |
519 | 98.6k | if (best_choice == nullptr || best_choices.singleton()) { |
520 | 50.6k | return; |
521 | 50.6k | } |
522 | | |
523 | 48.0k | if (debug_level >= 2) { |
524 | 0 | best_choice->print("\nFiltering against best choice"); |
525 | 0 | } |
526 | 48.0k | WERD_CHOICE_IT it(&best_choices); |
527 | 48.0k | int index = 0; |
528 | 233k | for (it.forward(); !it.at_first(); it.forward(), ++index) { |
529 | 185k | WERD_CHOICE *choice = it.data(); |
530 | 185k | float threshold = StopperAmbigThreshold(best_choice->adjust_factor(), |
531 | 185k | choice->adjust_factor()); |
532 | | // i, j index the blob choice in choice, best_choice. |
533 | | // chunk is an index into the chopped_word blobs (AKA chunks). |
534 | | // Since the two words may use different segmentations of the chunks, we |
535 | | // iterate over the chunks to find out whether a comparable blob |
536 | | // classification is much worse than the best result. |
537 | 185k | unsigned i = 0, j = 0, chunk = 0; |
538 | | // Each iteration of the while deals with 1 chunk. On entry choice_chunk |
539 | | // and best_chunk are the indices of the first chunk in the NEXT blob, |
540 | | // i.e. we don't have to increment i, j while chunk < choice_chunk and |
541 | | // best_chunk respectively. |
542 | 185k | auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0); |
543 | 1.74M | while (i < choice->length() && j < best_choice->length()) { |
544 | 1.66M | if (choice->unichar_id(i) != best_choice->unichar_id(j) && |
545 | 1.66M | choice->certainty(i) - best_choice->certainty(j) < threshold) { |
546 | 100k | if (debug_level >= 2) { |
547 | 0 | choice->print("WorstCertaintyDiffWorseThan"); |
548 | 0 | tprintf( |
549 | 0 | "i %u j %u Choice->Blob[i].Certainty %.4g" |
550 | 0 | " WorstOtherChoiceCertainty %g Threshold %g\n", |
551 | 0 | i, j, choice->certainty(i), best_choice->certainty(j), threshold); |
552 | 0 | tprintf("Discarding bad choice #%d\n", index); |
553 | 0 | } |
554 | 100k | delete it.extract(); |
555 | 100k | break; |
556 | 100k | } |
557 | 1.56M | ++chunk; |
558 | | // If needed, advance choice_chunk to keep up with chunk. |
559 | 2.35M | while (choice_chunk < chunk && ++i < choice->length()) { |
560 | 797k | choice_chunk += choice->state(i); |
561 | 797k | } |
562 | | // If needed, advance best_chunk to keep up with chunk. |
563 | 2.31M | while (best_chunk < chunk && ++j < best_choice->length()) { |
564 | 756k | best_chunk += best_choice->state(j); |
565 | 756k | } |
566 | 1.56M | } |
567 | 185k | } |
568 | 48.0k | } |
569 | | |
570 | | void WERD_RES::ComputeAdaptionThresholds(float certainty_scale, |
571 | | float min_rating, float max_rating, |
572 | | float rating_margin, |
573 | 2.15k | float *thresholds) { |
574 | 2.15k | int chunk = 0; |
575 | 2.15k | int end_chunk = best_choice->state(0); |
576 | 2.15k | int end_raw_chunk = raw_choice->state(0); |
577 | 2.15k | int raw_blob = 0; |
578 | 4.43k | for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) { |
579 | 2.28k | float avg_rating = 0.0f; |
580 | 2.28k | int num_error_chunks = 0; |
581 | | |
582 | | // For each chunk in best choice blob i, count non-matching raw results. |
583 | 4.49k | while (chunk < end_chunk) { |
584 | 2.21k | if (chunk >= end_raw_chunk) { |
585 | 3 | ++raw_blob; |
586 | 3 | end_raw_chunk += raw_choice->state(raw_blob); |
587 | 3 | } |
588 | 2.21k | if (best_choice->unichar_id(i) != raw_choice->unichar_id(raw_blob)) { |
589 | 3 | avg_rating += raw_choice->certainty(raw_blob); |
590 | 3 | ++num_error_chunks; |
591 | 3 | } |
592 | 2.21k | ++chunk; |
593 | 2.21k | } |
594 | | |
595 | 2.28k | if (num_error_chunks > 0) { |
596 | 2 | avg_rating /= num_error_chunks; |
597 | 2 | *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin); |
598 | 2.27k | } else { |
599 | 2.27k | *thresholds = max_rating; |
600 | 2.27k | } |
601 | | |
602 | 2.28k | if (*thresholds > max_rating) { |
603 | 2 | *thresholds = max_rating; |
604 | 2 | } |
605 | 2.28k | if (*thresholds < min_rating) { |
606 | 0 | *thresholds = min_rating; |
607 | 0 | } |
608 | 2.28k | } |
609 | 2.15k | } |
610 | | |
611 | | // Saves a copy of the word_choice if it has the best unadjusted rating. |
612 | | // Returns true if the word_choice was the new best. |
613 | 291k | bool WERD_RES::LogNewRawChoice(WERD_CHOICE *word_choice) { |
614 | 291k | if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) { |
615 | 291k | delete raw_choice; |
616 | 291k | raw_choice = new WERD_CHOICE(*word_choice); |
617 | 291k | raw_choice->set_permuter(TOP_CHOICE_PERM); |
618 | 291k | return true; |
619 | 291k | } |
620 | 0 | return false; |
621 | 291k | } |
622 | | |
623 | | // Consumes word_choice by adding it to best_choices, (taking ownership) if |
624 | | // the certainty for word_choice is some distance of the best choice in |
625 | | // best_choices, or by deleting the word_choice and returning false. |
626 | | // The best_choices list is kept in sorted order by rating. Duplicates are |
627 | | // removed, and the list is kept no longer than max_num_choices in length. |
628 | | // Returns true if the word_choice is still a valid pointer. |
629 | | bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug, |
630 | 490k | WERD_CHOICE *word_choice) { |
631 | 490k | if (best_choice != nullptr) { |
632 | | // Throw out obviously bad choices to save some work. |
633 | | // TODO(rays) Get rid of this! This piece of code produces different |
634 | | // results according to the order in which words are found, which is an |
635 | | // undesirable behavior. It would be better to keep all the choices and |
636 | | // prune them later when more information is available. |
637 | 300k | float max_certainty_delta = StopperAmbigThreshold( |
638 | 300k | best_choice->adjust_factor(), word_choice->adjust_factor()); |
639 | 300k | if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) { |
640 | 18.3k | max_certainty_delta = -kStopperAmbiguityThresholdOffset; |
641 | 18.3k | } |
642 | 300k | if (word_choice->certainty() - best_choice->certainty() < |
643 | 300k | max_certainty_delta) { |
644 | 38.6k | if (debug) { |
645 | 0 | std::string bad_string; |
646 | 0 | word_choice->string_and_lengths(&bad_string, nullptr); |
647 | 0 | tprintf( |
648 | 0 | "Discarding choice \"%s\" with an overly low certainty" |
649 | 0 | " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n", |
650 | 0 | bad_string.c_str(), word_choice->certainty(), |
651 | 0 | best_choice->certainty(), |
652 | 0 | max_certainty_delta + best_choice->certainty()); |
653 | 0 | } |
654 | 38.6k | delete word_choice; |
655 | 38.6k | return false; |
656 | 38.6k | } |
657 | 300k | } |
658 | | |
659 | | // Insert in the list in order of increasing rating, but knock out worse |
660 | | // string duplicates. |
661 | 451k | WERD_CHOICE_IT it(&best_choices); |
662 | 451k | const std::string &new_str = word_choice->unichar_string(); |
663 | 451k | bool inserted = false; |
664 | 451k | int num_choices = 0; |
665 | 451k | if (!it.empty()) { |
666 | 1.26M | do { |
667 | 1.26M | WERD_CHOICE *choice = it.data(); |
668 | 1.26M | if (choice->rating() > word_choice->rating() && !inserted) { |
669 | | // Time to insert. |
670 | 213k | it.add_before_stay_put(word_choice); |
671 | 213k | inserted = true; |
672 | 213k | if (num_choices == 0) { |
673 | 118k | best_choice = word_choice; // This is the new best. |
674 | 118k | } |
675 | 213k | ++num_choices; |
676 | 213k | } |
677 | 1.26M | if (choice->unichar_string() == new_str) { |
678 | 25.0k | if (inserted) { |
679 | | // New is better. |
680 | 20.7k | delete it.extract(); |
681 | 20.7k | } else { |
682 | | // Old is better. |
683 | 4.29k | if (debug) { |
684 | 0 | tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n", |
685 | 0 | new_str.c_str(), word_choice->rating(), choice->rating()); |
686 | 0 | } |
687 | 4.29k | delete word_choice; |
688 | 4.29k | return false; |
689 | 4.29k | } |
690 | 1.24M | } else { |
691 | 1.24M | ++num_choices; |
692 | 1.24M | if (num_choices > max_num_choices) { |
693 | 47.8k | delete it.extract(); |
694 | 47.8k | } |
695 | 1.24M | } |
696 | 1.26M | it.forward(); |
697 | 1.26M | } while (!it.at_first()); |
698 | 262k | } |
699 | 447k | if (!inserted && num_choices < max_num_choices) { |
700 | 230k | it.add_to_end(word_choice); |
701 | 230k | inserted = true; |
702 | 230k | if (num_choices == 0) { |
703 | 189k | best_choice = word_choice; // This is the new best. |
704 | 189k | } |
705 | 230k | } |
706 | 447k | if (debug) { |
707 | 0 | if (inserted) { |
708 | 0 | tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary"); |
709 | 0 | } else { |
710 | 0 | tprintf("Poor"); |
711 | 0 | } |
712 | 0 | word_choice->print(" Word Choice"); |
713 | 0 | } |
714 | 447k | if (!inserted) { |
715 | 3.96k | delete word_choice; |
716 | 3.96k | return false; |
717 | 3.96k | } |
718 | 443k | return true; |
719 | 447k | } |
720 | | |
// Hands the object owned through *src over to *dest. Whatever *dest pointed
// at beforehand is deleted first, and *src is set to nullptr afterwards so
// that only *dest refers to the moved object.
template <class T>
static void MovePointerData(T **dest, T **src) {
  T *&target = *dest;
  T *&source = *src;
  delete target;
  target = source;
  source = nullptr;
}
|
729 | | |
730 | | // Prints a brief list of all the best choices. |
731 | 0 | void WERD_RES::PrintBestChoices() const { |
732 | 0 | std::string alternates_str; |
733 | 0 | WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST *>(&best_choices)); |
734 | 0 | for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
735 | 0 | if (!it.at_first()) { |
736 | 0 | alternates_str += "\", \""; |
737 | 0 | } |
738 | 0 | alternates_str += it.data()->unichar_string(); |
739 | 0 | } |
740 | 0 | tprintf("Alternates for \"%s\": {\"%s\"}\n", |
741 | 0 | best_choice->unichar_string().c_str(), alternates_str.c_str()); |
742 | 0 | } |
743 | | |
744 | | // Returns the sum of the widths of the blob between start_blob and last_blob |
745 | | // inclusive. |
746 | 11.3M | int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) const { |
747 | 11.3M | int result = 0; |
748 | 40.7M | for (int b = start_blob; b <= last_blob; ++b) { |
749 | 29.3M | result += blob_widths[b]; |
750 | 29.3M | if (b < last_blob) { |
751 | 18.0M | result += blob_gaps[b]; |
752 | 18.0M | } |
753 | 29.3M | } |
754 | 11.3M | return result; |
755 | 11.3M | } |
756 | | // Returns the width of a gap between the specified blob and the next one. |
757 | 22.9M | int WERD_RES::GetBlobsGap(unsigned blob_index) const { |
758 | 22.9M | if (blob_index >= blob_gaps.size()) { |
759 | 0 | return 0; |
760 | 0 | } |
761 | 22.9M | return blob_gaps[blob_index]; |
762 | 22.9M | } |
763 | | |
764 | | // Returns the BLOB_CHOICE corresponding to the given index in the |
765 | | // best choice word taken from the appropriate cell in the ratings MATRIX. |
766 | | // Borrowed pointer, so do not delete. May return nullptr if there is no |
767 | | // BLOB_CHOICE matching the unichar_id at the given index. |
768 | 331k | BLOB_CHOICE *WERD_RES::GetBlobChoice(unsigned index) const { |
769 | 331k | if (index >= best_choice->length()) { |
770 | 0 | return nullptr; |
771 | 0 | } |
772 | 331k | BLOB_CHOICE_LIST *choices = GetBlobChoices(index); |
773 | 331k | return FindMatchingChoice(best_choice->unichar_id(index), choices); |
774 | 331k | } |
775 | | |
776 | | // Returns the BLOB_CHOICE_LIST corresponding to the given index in the |
777 | | // best choice word taken from the appropriate cell in the ratings MATRIX. |
778 | | // Borrowed pointer, so do not delete. |
779 | 331k | BLOB_CHOICE_LIST *WERD_RES::GetBlobChoices(int index) const { |
780 | 331k | return best_choice->blob_choices(index, ratings); |
781 | 331k | } |
782 | | |
783 | | // Moves the results fields from word to this. This takes ownership of all |
784 | | // the data, so src can be destructed. |
785 | 70.6k | void WERD_RES::ConsumeWordResults(WERD_RES *word) { |
786 | 70.6k | denorm = word->denorm; |
787 | 70.6k | blob_row = word->blob_row; |
788 | 70.6k | MovePointerData(&chopped_word, &word->chopped_word); |
789 | 70.6k | MovePointerData(&rebuild_word, &word->rebuild_word); |
790 | 70.6k | MovePointerData(&box_word, &word->box_word); |
791 | 272k | for (auto data : seam_array) { |
792 | 272k | delete data; |
793 | 272k | } |
794 | 70.6k | seam_array = word->seam_array; |
795 | 70.6k | word->seam_array.clear(); |
796 | | // TODO: optimize moves. |
797 | 70.6k | best_state = word->best_state; |
798 | 70.6k | word->best_state.clear(); |
799 | 70.6k | correct_text = word->correct_text; |
800 | 70.6k | word->correct_text.clear(); |
801 | 70.6k | blob_widths = word->blob_widths; |
802 | 70.6k | word->blob_widths.clear(); |
803 | 70.6k | blob_gaps = word->blob_gaps; |
804 | 70.6k | word->blob_gaps.clear(); |
805 | 70.6k | if (ratings != nullptr) { |
806 | 70.6k | ratings->delete_matrix_pointers(); |
807 | 70.6k | } |
808 | 70.6k | MovePointerData(&ratings, &word->ratings); |
809 | 70.6k | best_choice = word->best_choice; |
810 | 70.6k | MovePointerData(&raw_choice, &word->raw_choice); |
811 | 70.6k | best_choices.clear(); |
812 | 70.6k | WERD_CHOICE_IT wc_it(&best_choices); |
813 | 70.6k | wc_it.add_list_after(&word->best_choices); |
814 | 70.6k | reject_map = word->reject_map; |
815 | 70.6k | if (word->blamer_bundle != nullptr) { |
816 | 0 | assert(blamer_bundle != nullptr); |
817 | 0 | blamer_bundle->CopyResults(*(word->blamer_bundle)); |
818 | 0 | } |
819 | 70.6k | CopySimpleFields(*word); |
820 | 70.6k | } |
821 | | |
822 | | // Replace the best choice and rebuild box word. |
823 | | // choice must be from the current best_choices list. |
824 | 0 | void WERD_RES::ReplaceBestChoice(WERD_CHOICE *choice) { |
825 | 0 | best_choice = choice; |
826 | 0 | RebuildBestState(); |
827 | 0 | SetupBoxWord(); |
828 | | // Make up a fake reject map of the right length to keep the |
829 | | // rejection pass happy. |
830 | 0 | reject_map.initialise(best_state.size()); |
831 | 0 | done = tess_accepted = tess_would_adapt = true; |
832 | 0 | SetScriptPositions(); |
833 | 0 | } |
834 | | |
835 | | // Builds the rebuild_word and sets the best_state from the chopped_word and |
836 | | // the best_choice->state. |
837 | 98.6k | void WERD_RES::RebuildBestState() { |
838 | 98.6k | ASSERT_HOST(best_choice != nullptr); |
839 | 98.6k | delete rebuild_word; |
840 | 98.6k | rebuild_word = new TWERD; |
841 | 98.6k | if (seam_array.empty()) { |
842 | 45.4k | start_seam_list(chopped_word, &seam_array); |
843 | 45.4k | } |
844 | 98.6k | best_state.clear(); |
845 | 98.6k | int start = 0; |
846 | 441k | for (unsigned i = 0; i < best_choice->length(); ++i) { |
847 | 342k | int length = best_choice->state(i); |
848 | 342k | best_state.push_back(length); |
849 | 342k | if (length > 1) { |
850 | 93.7k | SEAM::JoinPieces(seam_array, chopped_word->blobs, start, |
851 | 93.7k | start + length - 1); |
852 | 93.7k | } |
853 | 342k | TBLOB *blob = chopped_word->blobs[start]; |
854 | 342k | rebuild_word->blobs.push_back(new TBLOB(*blob)); |
855 | 342k | if (length > 1) { |
856 | 93.7k | SEAM::BreakPieces(seam_array, chopped_word->blobs, start, |
857 | 93.7k | start + length - 1); |
858 | 93.7k | } |
859 | 342k | start += length; |
860 | 342k | } |
861 | 98.6k | } |
862 | | |
863 | | // Copies the chopped_word to the rebuild_word, faking a best_state as well. |
864 | | // Also sets up the output box_word. |
865 | 0 | void WERD_RES::CloneChoppedToRebuild() { |
866 | 0 | delete rebuild_word; |
867 | 0 | rebuild_word = new TWERD(*chopped_word); |
868 | 0 | SetupBoxWord(); |
869 | 0 | auto word_len = box_word->length(); |
870 | 0 | best_state.reserve(word_len); |
871 | 0 | correct_text.reserve(word_len); |
872 | 0 | for (unsigned i = 0; i < word_len; ++i) { |
873 | 0 | best_state.push_back(1); |
874 | 0 | correct_text.emplace_back(""); |
875 | 0 | } |
876 | 0 | } |
877 | | |
878 | | // Sets/replaces the box_word with one made from the rebuild_word. |
879 | 91.3k | void WERD_RES::SetupBoxWord() { |
880 | 91.3k | delete box_word; |
881 | 91.3k | rebuild_word->ComputeBoundingBoxes(); |
882 | 91.3k | box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word); |
883 | 91.3k | box_word->ClipToOriginalWord(denorm.block(), word); |
884 | 91.3k | } |
885 | | |
886 | | // Sets up the script positions in the output best_choice using the best_choice |
887 | | // to get the unichars, and the unicharset to get the target positions. |
888 | 0 | void WERD_RES::SetScriptPositions() { |
889 | 0 | best_choice->SetScriptPositions(small_caps, chopped_word); |
890 | 0 | } |
891 | | // Sets all the blobs in all the words (raw choice and best choices) to be |
892 | | // the given position. (When a sub/superscript is recognized as a separate |
893 | | // word, it falls victim to the rule that a whole word cannot be sub or |
894 | | // superscript, so this function overrides that problem.) |
895 | 1.79k | void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) { |
896 | 1.79k | raw_choice->SetAllScriptPositions(position); |
897 | 1.79k | WERD_CHOICE_IT wc_it(&best_choices); |
898 | 5.33k | for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) { |
899 | 3.53k | wc_it.data()->SetAllScriptPositions(position); |
900 | 3.53k | } |
901 | 1.79k | } |
902 | | |
903 | | // Classifies the word with some already-calculated BLOB_CHOICEs. |
904 | | // The choices are an array of blob_count pointers to BLOB_CHOICE, |
905 | | // providing a single classifier result for each blob. |
906 | | // The BLOB_CHOICEs are consumed and the word takes ownership. |
907 | | // The number of blobs in the box_word must match blob_count. |
908 | 0 | void WERD_RES::FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices) { |
909 | | // Setup the WERD_RES. |
910 | 0 | ASSERT_HOST(box_word != nullptr); |
911 | 0 | ASSERT_HOST(blob_count == box_word->length()); |
912 | 0 | ClearWordChoices(); |
913 | 0 | ClearRatings(); |
914 | 0 | ratings = new MATRIX(blob_count, 1); |
915 | 0 | for (unsigned c = 0; c < blob_count; ++c) { |
916 | 0 | auto *choice_list = new BLOB_CHOICE_LIST; |
917 | 0 | BLOB_CHOICE_IT choice_it(choice_list); |
918 | 0 | choice_it.add_after_then_move(choices[c]); |
919 | 0 | ratings->put(c, c, choice_list); |
920 | 0 | } |
921 | 0 | FakeWordFromRatings(TOP_CHOICE_PERM); |
922 | 0 | reject_map.initialise(blob_count); |
923 | 0 | best_state.clear(); |
924 | 0 | best_state.resize(blob_count, 1); |
925 | 0 | done = true; |
926 | 0 | } |
927 | | |
928 | | // Creates a WERD_CHOICE for the word using the top choices from the leading |
929 | | // diagonal of the ratings matrix. |
930 | 90.7k | void WERD_RES::FakeWordFromRatings(PermuterType permuter) { |
931 | 90.7k | int num_blobs = ratings->dimension(); |
932 | 90.7k | auto *word_choice = new WERD_CHOICE(uch_set, num_blobs); |
933 | 90.7k | word_choice->set_permuter(permuter); |
934 | 218k | for (int b = 0; b < num_blobs; ++b) { |
935 | 127k | UNICHAR_ID unichar_id = UNICHAR_SPACE; |
936 | | // Initialize rating and certainty like in WERD_CHOICE::make_bad(). |
937 | 127k | float rating = WERD_CHOICE::kBadRating; |
938 | 127k | float certainty = -FLT_MAX; |
939 | 127k | BLOB_CHOICE_LIST *choices = ratings->get(b, b); |
940 | 127k | if (choices != nullptr && !choices->empty()) { |
941 | 127k | BLOB_CHOICE_IT bc_it(choices); |
942 | 127k | BLOB_CHOICE *choice = bc_it.data(); |
943 | 127k | unichar_id = choice->unichar_id(); |
944 | 127k | rating = choice->rating(); |
945 | 127k | certainty = choice->certainty(); |
946 | 127k | } |
947 | 127k | word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating, |
948 | 127k | certainty); |
949 | 127k | } |
950 | 90.7k | LogNewRawChoice(word_choice); |
951 | | // Ownership of word_choice taken by word here. |
952 | 90.7k | LogNewCookedChoice(1, false, word_choice); |
953 | 90.7k | } |
954 | | |
955 | | // Copies the best_choice strings to the correct_text for adaption/training. |
956 | 2.15k | void WERD_RES::BestChoiceToCorrectText() { |
957 | 2.15k | correct_text.clear(); |
958 | 2.15k | ASSERT_HOST(best_choice != nullptr); |
959 | 4.43k | for (unsigned i = 0; i < best_choice->length(); ++i) { |
960 | 2.28k | UNICHAR_ID choice_id = best_choice->unichar_id(i); |
961 | 2.28k | const char *blob_choice = uch_set->id_to_unichar(choice_id); |
962 | 2.28k | correct_text.emplace_back(blob_choice); |
963 | 2.28k | } |
964 | 2.15k | } |
965 | | |
966 | | // Merges 2 adjacent blobs in the result if the permanent callback |
967 | | // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent |
968 | | // callback box_cb is nullptr or returns true, setting the merged blob |
969 | | // result to the class returned from class_cb. |
970 | | // Returns true if anything was merged. |
971 | | bool WERD_RES::ConditionalBlobMerge( |
972 | | const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb, |
973 | 153k | const std::function<bool(const TBOX &, const TBOX &)> &box_cb) { |
974 | 153k | ASSERT_HOST(best_choice->empty() || ratings != nullptr); |
975 | 153k | bool modified = false; |
976 | 619k | for (unsigned i = 0; i + 1 < best_choice->length(); ++i) { |
977 | 465k | UNICHAR_ID new_id = |
978 | 465k | class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1)); |
979 | 465k | if (new_id != INVALID_UNICHAR_ID && |
980 | 465k | (box_cb == nullptr || |
981 | 575 | box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) { |
982 | | // Raw choice should not be fixed. |
983 | 407 | best_choice->set_unichar_id(new_id, i); |
984 | 407 | modified = true; |
985 | 407 | MergeAdjacentBlobs(i); |
986 | 407 | const MATRIX_COORD &coord = best_choice->MatrixCoord(i); |
987 | 407 | if (!coord.Valid(*ratings)) { |
988 | 0 | ratings->IncreaseBandSize(coord.row + 1 - coord.col); |
989 | 0 | } |
990 | 407 | BLOB_CHOICE_LIST *blob_choices = GetBlobChoices(i); |
991 | 407 | if (FindMatchingChoice(new_id, blob_choices) == nullptr) { |
992 | | // Insert a fake result. |
993 | 398 | auto *blob_choice = new BLOB_CHOICE; |
994 | 398 | blob_choice->set_unichar_id(new_id); |
995 | 398 | BLOB_CHOICE_IT bc_it(blob_choices); |
996 | 398 | bc_it.add_before_then_move(blob_choice); |
997 | 398 | } |
998 | 407 | } |
999 | 465k | } |
1000 | 153k | return modified; |
1001 | 153k | } |
1002 | | |
1003 | | // Merges 2 adjacent blobs in the result (index and index+1) and corrects |
1004 | | // all the data to account for the change. |
1005 | 407 | void WERD_RES::MergeAdjacentBlobs(unsigned index) { |
1006 | 407 | if (reject_map.length() == best_choice->length()) { |
1007 | 284 | reject_map.remove_pos(index); |
1008 | 284 | } |
1009 | 407 | best_choice->remove_unichar_id(index + 1); |
1010 | 407 | rebuild_word->MergeBlobs(index, index + 2); |
1011 | 407 | box_word->MergeBoxes(index, index + 2); |
1012 | 407 | if (index + 1 < best_state.size()) { |
1013 | 407 | best_state[index] += best_state[index + 1]; |
1014 | 407 | best_state.erase(best_state.begin() + index + 1); |
1015 | 407 | } |
1016 | 407 | } |
1017 | | |
1018 | | // TODO(tkielbus) Decide between keeping this behavior here or modifying the |
1019 | | // training data. |
1020 | | |
// Utility function for fix_quotes.
// Returns non-zero if the character at signed_str, whose UTF-8 encoding is
// length bytes long, is a single quote: either the 1-byte ' or `, or one of
// the 3-byte curved quotes E2 80 98 / E2 80 99.
static int is_simple_quote(const char *signed_str, int length) {
  // Compare as unsigned bytes so that values >= 0x80 match the constants.
  const auto *bytes = reinterpret_cast<const unsigned char *>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 bytes curved quotes share the first two bytes.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return false;
}
1033 | | |
1034 | | // Callback helper for fix_quotes returns a double quote if both |
1035 | | // arguments are quote, otherwise INVALID_UNICHAR_ID. |
1036 | 232k | UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) { |
1037 | 232k | const char *ch = uch_set->id_to_unichar(id1); |
1038 | 232k | const char *next_ch = uch_set->id_to_unichar(id2); |
1039 | 232k | if (is_simple_quote(ch, strlen(ch)) && |
1040 | 232k | is_simple_quote(next_ch, strlen(next_ch))) { |
1041 | 284 | return uch_set->unichar_to_id("\""); |
1042 | 284 | } |
1043 | 232k | return INVALID_UNICHAR_ID; |
1044 | 232k | } |
1045 | | |
1046 | | // Change pairs of quotes to double quotes. |
1047 | 76.7k | void WERD_RES::fix_quotes() { |
1048 | 76.7k | if (!uch_set->contains_unichar("\"") || |
1049 | 76.7k | !uch_set->get_enabled(uch_set->unichar_to_id("\""))) { |
1050 | 0 | return; // Don't create it if it is disallowed. |
1051 | 0 | } |
1052 | | |
1053 | 76.7k | using namespace std::placeholders; // for _1, _2 |
1054 | 76.7k | ConditionalBlobMerge(std::bind(&WERD_RES::BothQuotes, this, _1, _2), nullptr); |
1055 | 76.7k | } |
1056 | | |
1057 | | // Callback helper for fix_hyphens returns UNICHAR_ID of - if both |
1058 | | // arguments are hyphen, otherwise INVALID_UNICHAR_ID. |
1059 | 232k | UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) { |
1060 | 232k | const char *ch = uch_set->id_to_unichar(id1); |
1061 | 232k | const char *next_ch = uch_set->id_to_unichar(id2); |
1062 | 232k | if (strlen(ch) == 1 && strlen(next_ch) == 1 && (*ch == '-' || *ch == '~') && |
1063 | 232k | (*next_ch == '-' || *next_ch == '~')) { |
1064 | 291 | return uch_set->unichar_to_id("-"); |
1065 | 291 | } |
1066 | 232k | return INVALID_UNICHAR_ID; |
1067 | 232k | } |
1068 | | |
1069 | | // Callback helper for fix_hyphens returns true if box1 and box2 overlap |
1070 | | // (assuming both on the same textline, are in order and a chopped em dash.) |
1071 | 291 | bool WERD_RES::HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2) { |
1072 | 291 | return box1.right() >= box2.left(); |
1073 | 291 | } |
1074 | | |
1075 | | // Change pairs of hyphens to a single hyphen if the bounding boxes touch |
1076 | | // Typically a long dash which has been segmented. |
1077 | 76.7k | void WERD_RES::fix_hyphens() { |
1078 | 76.7k | if (!uch_set->contains_unichar("-") || |
1079 | 76.7k | !uch_set->get_enabled(uch_set->unichar_to_id("-"))) { |
1080 | 0 | return; // Don't create it if it is disallowed. |
1081 | 0 | } |
1082 | | |
1083 | 76.7k | using namespace std::placeholders; // for _1, _2 |
1084 | 76.7k | ConditionalBlobMerge(std::bind(&WERD_RES::BothHyphens, this, _1, _2), |
1085 | 76.7k | std::bind(&WERD_RES::HyphenBoxesOverlap, this, _1, _2)); |
1086 | 76.7k | } |
1087 | | |
1088 | | // Callback helper for merge_tess_fails returns a space if both |
1089 | | // arguments are space, otherwise INVALID_UNICHAR_ID. |
1090 | 0 | UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) { |
1091 | 0 | if (id1 == id2 && id1 == uch_set->unichar_to_id(" ")) { |
1092 | 0 | return id1; |
1093 | 0 | } else { |
1094 | 0 | return INVALID_UNICHAR_ID; |
1095 | 0 | } |
1096 | 0 | } |
1097 | | |
1098 | | // Change pairs of tess failures to a single one |
1099 | 0 | void WERD_RES::merge_tess_fails() { |
1100 | 0 | using namespace std::placeholders; // for _1, _2 |
1101 | 0 | if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces, this, _1, _2), |
1102 | 0 | nullptr)) { |
1103 | 0 | unsigned len = best_choice->length(); |
1104 | 0 | ASSERT_HOST(reject_map.length() == len); |
1105 | 0 | ASSERT_HOST(box_word->length() == len); |
1106 | 0 | } |
1107 | 0 | } |
1108 | | |
1109 | | // Returns true if the collection of count pieces, starting at start, are all |
1110 | | // natural connected components, ie there are no real chops involved. |
1111 | 0 | bool WERD_RES::PiecesAllNatural(int start, int count) const { |
1112 | | // all seams must have no splits. |
1113 | 0 | for (int index = start; index < start + count - 1; ++index) { |
1114 | 0 | if (index >= 0 && static_cast<size_t>(index) < seam_array.size()) { |
1115 | 0 | SEAM *seam = seam_array[index]; |
1116 | 0 | if (seam != nullptr && seam->HasAnySplits()) { |
1117 | 0 | return false; |
1118 | 0 | } |
1119 | 0 | } |
1120 | 0 | } |
1121 | 0 | return true; |
1122 | 0 | } |
1123 | | |
1124 | 503k | WERD_RES::~WERD_RES() { |
1125 | 503k | Clear(); |
1126 | 503k | } |
1127 | | |
1128 | 530k | void WERD_RES::Clear() { |
1129 | 530k | if (combination) { |
1130 | 164k | delete word; |
1131 | 164k | } |
1132 | 530k | word = nullptr; |
1133 | 530k | delete blamer_bundle; |
1134 | 530k | blamer_bundle = nullptr; |
1135 | 530k | ClearResults(); |
1136 | 530k | } |
1137 | | |
1138 | 871k | void WERD_RES::ClearResults() { |
1139 | 871k | done = false; |
1140 | 871k | fontinfo = nullptr; |
1141 | 871k | fontinfo2 = nullptr; |
1142 | 871k | fontinfo_id_count = 0; |
1143 | 871k | fontinfo_id2_count = 0; |
1144 | 871k | delete bln_boxes; |
1145 | 871k | bln_boxes = nullptr; |
1146 | 871k | blob_row = nullptr; |
1147 | 871k | delete chopped_word; |
1148 | 871k | chopped_word = nullptr; |
1149 | 871k | delete rebuild_word; |
1150 | 871k | rebuild_word = nullptr; |
1151 | 871k | delete box_word; |
1152 | 871k | box_word = nullptr; |
1153 | 871k | best_state.clear(); |
1154 | 871k | correct_text.clear(); |
1155 | 1.47M | for (auto data : seam_array) { |
1156 | 1.47M | delete data; |
1157 | 1.47M | } |
1158 | 871k | seam_array.clear(); |
1159 | 871k | blob_widths.clear(); |
1160 | 871k | blob_gaps.clear(); |
1161 | 871k | ClearRatings(); |
1162 | 871k | ClearWordChoices(); |
1163 | 871k | if (blamer_bundle != nullptr) { |
1164 | 0 | blamer_bundle->ClearResults(); |
1165 | 0 | } |
1166 | 871k | } |
1167 | 1.21M | void WERD_RES::ClearWordChoices() { |
1168 | 1.21M | best_choice = nullptr; |
1169 | 1.21M | delete raw_choice; |
1170 | 1.21M | raw_choice = nullptr; |
1171 | 1.21M | best_choices.clear(); |
1172 | 1.21M | delete ep_choice; |
1173 | 1.21M | ep_choice = nullptr; |
1174 | 1.21M | } |
1175 | 871k | void WERD_RES::ClearRatings() { |
1176 | 871k | if (ratings != nullptr) { |
1177 | 343k | ratings->delete_matrix_pointers(); |
1178 | 343k | delete ratings; |
1179 | 343k | ratings = nullptr; |
1180 | 343k | } |
1181 | 871k | } |
1182 | | |
1183 | 930k | int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const { |
1184 | 930k | ASSERT_HOST(page_res == other.page_res); |
1185 | 930k | if (other.block_res == nullptr) { |
1186 | | // other points to the end of the page. |
1187 | 0 | if (block_res == nullptr) { |
1188 | 0 | return 0; |
1189 | 0 | } |
1190 | 0 | return -1; |
1191 | 0 | } |
1192 | 930k | if (block_res == nullptr) { |
1193 | 543k | return 1; // we point to the end of the page. |
1194 | 543k | } |
1195 | 386k | if (block_res == other.block_res) { |
1196 | 386k | if (other.row_res == nullptr || row_res == nullptr) { |
1197 | | // this should only happen if we hit an image block. |
1198 | 0 | return 0; |
1199 | 0 | } |
1200 | 386k | if (row_res == other.row_res) { |
1201 | | // we point to the same block and row. |
1202 | 195k | ASSERT_HOST(other.word_res != nullptr && word_res != nullptr); |
1203 | 195k | if (word_res == other.word_res) { |
1204 | | // we point to the same word! |
1205 | 195k | return 0; |
1206 | 195k | } |
1207 | | |
1208 | 0 | WERD_RES_IT word_res_it(&row_res->word_res_list); |
1209 | 0 | for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); |
1210 | 0 | word_res_it.forward()) { |
1211 | 0 | if (word_res_it.data() == word_res) { |
1212 | 0 | return -1; |
1213 | 0 | } else if (word_res_it.data() == other.word_res) { |
1214 | 0 | return 1; |
1215 | 0 | } |
1216 | 0 | } |
1217 | 0 | ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr); |
1218 | 0 | } |
1219 | | |
1220 | | // we both point to the same block, but different rows. |
1221 | 191k | ROW_RES_IT row_res_it(&block_res->row_res_list); |
1222 | 2.27M | for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); |
1223 | 2.27M | row_res_it.forward()) { |
1224 | 2.27M | if (row_res_it.data() == row_res) { |
1225 | 167k | return -1; |
1226 | 2.11M | } else if (row_res_it.data() == other.row_res) { |
1227 | 23.3k | return 1; |
1228 | 23.3k | } |
1229 | 2.27M | } |
1230 | 0 | ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr); |
1231 | 0 | } |
1232 | | |
1233 | | // We point to different blocks. |
1234 | 0 | BLOCK_RES_IT block_res_it(&page_res->block_res_list); |
1235 | 0 | for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); |
1236 | 0 | block_res_it.forward()) { |
1237 | 0 | if (block_res_it.data() == block_res) { |
1238 | 0 | return -1; |
1239 | 0 | } else if (block_res_it.data() == other.block_res) { |
1240 | 0 | return 1; |
1241 | 0 | } |
1242 | 0 | } |
1243 | | // Shouldn't happen... |
1244 | 0 | ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr); |
1245 | 0 | return 0; |
1246 | 0 | } |
1247 | | |
1248 | | // Inserts the new_word as a combination owned by a corresponding WERD_RES |
1249 | | // before the current position. The simple fields of the WERD_RES are copied |
1250 | | // from clone_res and the resulting WERD_RES is returned for further setup |
1251 | | // with best_choice etc. |
1252 | | WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res, |
1253 | 0 | WERD *new_word) { |
1254 | | // Make a WERD_RES for the new_word. |
1255 | 0 | auto *new_res = new WERD_RES(new_word); |
1256 | 0 | new_res->CopySimpleFields(clone_res); |
1257 | 0 | new_res->combination = true; |
1258 | | // Insert into the appropriate place in the ROW_RES. |
1259 | 0 | WERD_RES_IT wr_it(&row()->word_res_list); |
1260 | 0 | for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { |
1261 | 0 | WERD_RES *word = wr_it.data(); |
1262 | 0 | if (word == word_res) { |
1263 | 0 | break; |
1264 | 0 | } |
1265 | 0 | } |
1266 | 0 | ASSERT_HOST(!wr_it.cycled_list()); |
1267 | 0 | wr_it.add_before_then_move(new_res); |
1268 | 0 | if (wr_it.at_first()) { |
1269 | | // This is the new first word, so reset the member iterator so it |
1270 | | // detects the cycled_list state correctly. |
1271 | 0 | ResetWordIterator(); |
1272 | 0 | } |
1273 | 0 | return new_res; |
1274 | 0 | } |
1275 | | |
1276 | | // Helper computes the boundaries between blobs in the word. The blob bounds |
1277 | | // are likely very poor, if they come from LSTM, where it only outputs the |
1278 | | // character at one pixel within it, so we find the midpoints between them. |
1279 | | static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box, |
1280 | | C_BLOB_LIST *next_word_blobs, |
1281 | 90.7k | std::vector<int> *blob_ends) { |
1282 | 90.7k | C_BLOB_IT blob_it(word.word->cblob_list()); |
1283 | 127k | for (int length : word.best_state) { |
1284 | | // Get the bounding box of the fake blobs |
1285 | 127k | TBOX blob_box = blob_it.data()->bounding_box(); |
1286 | 127k | blob_it.forward(); |
1287 | 127k | for (int b = 1; b < length; ++b) { |
1288 | 0 | blob_box += blob_it.data()->bounding_box(); |
1289 | 0 | blob_it.forward(); |
1290 | 0 | } |
1291 | | // This blob_box is crap, so for now we are only looking for the |
1292 | | // boundaries between them. |
1293 | 127k | int blob_end = INT32_MAX; |
1294 | 127k | if (!blob_it.at_first() || next_word_blobs != nullptr) { |
1295 | 47.6k | if (blob_it.at_first()) { |
1296 | 11.1k | blob_it.set_to_list(next_word_blobs); |
1297 | 11.1k | } |
1298 | 47.6k | blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2; |
1299 | 47.6k | } |
1300 | 127k | blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right()); |
1301 | 127k | blob_ends->push_back(blob_end); |
1302 | 127k | } |
1303 | 90.7k | blob_ends->back() = clip_box.right(); |
1304 | 90.7k | } |
1305 | | |
1306 | | // Helper computes the bounds of a word by restricting it to existing words |
1307 | | // that significantly overlap. |
1308 | | static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words, |
1309 | 90.7k | int w_index, TBOX prev_box, WERD_RES_IT w_it) { |
1310 | 90.7k | constexpr int kSignificantOverlapFraction = 4; |
1311 | 90.7k | TBOX clipped_box; |
1312 | 90.7k | TBOX current_box = words[w_index]->word->bounding_box(); |
1313 | 90.7k | TBOX next_box; |
1314 | 90.7k | if (static_cast<size_t>(w_index + 1) < words.size() && |
1315 | 90.7k | words[w_index + 1] != nullptr && words[w_index + 1]->word != nullptr) { |
1316 | 11.1k | next_box = words[w_index + 1]->word->bounding_box(); |
1317 | 11.1k | } |
1318 | 241k | for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo; |
1319 | 150k | w_it.forward()) { |
1320 | 150k | if (w_it.data() == nullptr || w_it.data()->word == nullptr) { |
1321 | 0 | continue; |
1322 | 0 | } |
1323 | 150k | TBOX w_box = w_it.data()->word->bounding_box(); |
1324 | 150k | int height_limit = std::min<int>(w_box.height(), w_box.width() / 2); |
1325 | 150k | int width_limit = w_box.width() / kSignificantOverlapFraction; |
1326 | 150k | int min_significant_overlap = std::max(height_limit, width_limit); |
1327 | 150k | int overlap = w_box.intersection(current_box).width(); |
1328 | 150k | int prev_overlap = w_box.intersection(prev_box).width(); |
1329 | 150k | int next_overlap = w_box.intersection(next_box).width(); |
1330 | 150k | if (overlap > min_significant_overlap) { |
1331 | 96.4k | if (prev_overlap > min_significant_overlap) { |
1332 | | // We have no choice but to use the LSTM word edge. |
1333 | 746 | clipped_box.set_left(current_box.left()); |
1334 | 95.6k | } else if (next_overlap > min_significant_overlap) { |
1335 | | // We have no choice but to use the LSTM word edge. |
1336 | 713 | clipped_box.set_right(current_box.right()); |
1337 | 94.9k | } else { |
1338 | 94.9k | clipped_box += w_box; |
1339 | 94.9k | } |
1340 | 96.4k | } |
1341 | 150k | } |
1342 | 90.7k | if (clipped_box.height() <= 0) { |
1343 | 18.4k | clipped_box.set_top(current_box.top()); |
1344 | 18.4k | clipped_box.set_bottom(current_box.bottom()); |
1345 | 18.4k | } |
1346 | 90.7k | if (clipped_box.width() <= 0) { |
1347 | 18.3k | clipped_box = current_box; |
1348 | 18.3k | } |
1349 | 90.7k | return clipped_box; |
1350 | 90.7k | } |
1351 | | |
1352 | | // Helper moves the blob from src to dest. If it isn't contained by clip_box, |
1353 | | // the blob is replaced by a fake that is contained. |
1354 | | static TBOX MoveAndClipBlob(C_BLOB_IT *src_it, C_BLOB_IT *dest_it, |
1355 | 404k | const TBOX &clip_box) { |
1356 | 404k | C_BLOB *src_blob = src_it->extract(); |
1357 | 404k | TBOX box = src_blob->bounding_box(); |
1358 | 404k | if (!clip_box.contains(box)) { |
1359 | 27.2k | int left = |
1360 | 27.2k | ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1); |
1361 | 27.2k | int right = |
1362 | 27.2k | ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right()); |
1363 | 27.2k | int top = |
1364 | 27.2k | ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top()); |
1365 | 27.2k | int bottom = |
1366 | 27.2k | ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1); |
1367 | 27.2k | box = TBOX(left, bottom, right, top); |
1368 | 27.2k | delete src_blob; |
1369 | 27.2k | src_blob = C_BLOB::FakeBlob(box); |
1370 | 27.2k | } |
1371 | 404k | dest_it->add_after_then_move(src_blob); |
1372 | 404k | return box; |
1373 | 404k | } |
1374 | | |
1375 | | // Replaces the current WERD/WERD_RES with the given words. The given words |
1376 | | // contain fake blobs that indicate the position of the characters. These are |
1377 | | // replaced with real blobs from the current word as much as possible. |
1378 | | void PAGE_RES_IT::ReplaceCurrentWord( |
1379 | 79.6k | tesseract::PointerVector<WERD_RES> *words) { |
1380 | 79.6k | if (words->empty()) { |
1381 | 0 | DeleteCurrentWord(); |
1382 | 0 | return; |
1383 | 0 | } |
1384 | 79.6k | WERD_RES *input_word = word(); |
1385 | | // Set the BOL/EOL flags on the words from the input word. |
1386 | 79.6k | if (input_word->word->flag(W_BOL)) { |
1387 | 68.7k | (*words)[0]->word->set_flag(W_BOL, true); |
1388 | 68.7k | } else { |
1389 | 10.9k | (*words)[0]->word->set_blanks(input_word->word->space()); |
1390 | 10.9k | } |
1391 | 79.6k | words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL)); |
1392 | | |
1393 | | // Move the blobs from the input word to the new set of words. |
1394 | | // If the input word_res is a combination, then the replacements will also be |
1395 | | // combinations, and will own their own words. If the input word_res is not a |
1396 | | // combination, then the final replacements will not be either, (although it |
1397 | | // is allowed for the input words to be combinations) and their words |
1398 | | // will get put on the row list. This maintains the ownership rules. |
1399 | 79.6k | WERD_IT w_it(row()->row->word_list()); |
1400 | 79.6k | if (!input_word->combination) { |
1401 | 36.9k | for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { |
1402 | 36.9k | WERD *word = w_it.data(); |
1403 | 36.9k | if (word == input_word->word) { |
1404 | 10.7k | break; |
1405 | 10.7k | } |
1406 | 36.9k | } |
1407 | | // w_it is now set to the input_word's word. |
1408 | 10.7k | ASSERT_HOST(!w_it.cycled_list()); |
1409 | 10.7k | } |
1410 | | // Insert into the appropriate place in the ROW_RES. |
1411 | 79.6k | WERD_RES_IT wr_it(&row()->word_res_list); |
1412 | 120k | for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { |
1413 | 120k | WERD_RES *word = wr_it.data(); |
1414 | 120k | if (word == input_word) { |
1415 | 79.6k | break; |
1416 | 79.6k | } |
1417 | 120k | } |
1418 | 79.6k | ASSERT_HOST(!wr_it.cycled_list()); |
1419 | | // Since we only have an estimate of the bounds between blobs, use the blob |
1420 | | // x-middle as the determiner of where to put the blobs |
1421 | 79.6k | C_BLOB_IT src_b_it(input_word->word->cblob_list()); |
1422 | 79.6k | src_b_it.sort(&C_BLOB::SortByXMiddle); |
1423 | 79.6k | C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list()); |
1424 | 79.6k | rej_b_it.sort(&C_BLOB::SortByXMiddle); |
1425 | 79.6k | TBOX clip_box; |
1426 | 170k | for (size_t w = 0; w < words->size(); ++w) { |
1427 | 90.7k | WERD_RES *word_w = (*words)[w]; |
1428 | 90.7k | clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word); |
1429 | | // Compute blob boundaries. |
1430 | 90.7k | std::vector<int> blob_ends; |
1431 | 90.7k | C_BLOB_LIST *next_word_blobs = |
1432 | 90.7k | w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr; |
1433 | 90.7k | ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends); |
1434 | | // Remove the fake blobs on the current word, but keep safe for back-up if |
1435 | | // no blob can be found. |
1436 | 90.7k | C_BLOB_LIST fake_blobs; |
1437 | 90.7k | C_BLOB_IT fake_b_it(&fake_blobs); |
1438 | 90.7k | fake_b_it.add_list_after(word_w->word->cblob_list()); |
1439 | 90.7k | fake_b_it.move_to_first(); |
1440 | 90.7k | word_w->word->cblob_list()->clear(); |
1441 | 90.7k | C_BLOB_IT dest_it(word_w->word->cblob_list()); |
1442 | | // Build the box word as we move the blobs. |
1443 | 90.7k | auto *box_word = new tesseract::BoxWord; |
1444 | 218k | for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) { |
1445 | 127k | int end_x = blob_ends[i]; |
1446 | 127k | TBOX blob_box; |
1447 | | // Add the blobs up to end_x. |
1448 | 452k | while (!src_b_it.empty() && |
1449 | 452k | src_b_it.data()->bounding_box().x_middle() < end_x) { |
1450 | 325k | blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box); |
1451 | 325k | src_b_it.forward(); |
1452 | 325k | } |
1453 | 191k | while (!rej_b_it.empty() && |
1454 | 191k | rej_b_it.data()->bounding_box().x_middle() < end_x) { |
1455 | 64.2k | blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box); |
1456 | 64.2k | rej_b_it.forward(); |
1457 | 64.2k | } |
1458 | 127k | if (blob_box.null_box()) { |
1459 | | // Use the original box as a back-up. |
1460 | 15.3k | blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box); |
1461 | 15.3k | } |
1462 | 127k | box_word->InsertBox(i, blob_box); |
1463 | 127k | } |
1464 | 90.7k | delete word_w->box_word; |
1465 | 90.7k | word_w->box_word = box_word; |
1466 | 90.7k | if (!input_word->combination) { |
1467 | | // Insert word_w->word into the ROW. It doesn't own its word, so the |
1468 | | // ROW needs to own it. |
1469 | 10.8k | w_it.add_before_stay_put(word_w->word); |
1470 | 10.8k | word_w->combination = false; |
1471 | 10.8k | } |
1472 | 90.7k | (*words)[w] = nullptr; // We are taking ownership. |
1473 | 90.7k | wr_it.add_before_stay_put(word_w); |
1474 | 90.7k | } |
1475 | | // We have taken ownership of the words. |
1476 | 79.6k | words->clear(); |
1477 | | // Delete the current word, which has been replaced. We could just call |
1478 | | // DeleteCurrentWord, but that would iterate both lists again, and we know |
1479 | | // we are already in the right place. |
1480 | 79.6k | if (!input_word->combination) { |
1481 | 10.7k | delete w_it.extract(); |
1482 | 10.7k | } |
1483 | 79.6k | delete wr_it.extract(); |
1484 | 79.6k | ResetWordIterator(); |
1485 | 79.6k | } |
1486 | | |
1487 | | // Deletes the current WERD_RES and its underlying WERD. |
1488 | 382 | void PAGE_RES_IT::DeleteCurrentWord() { |
1489 | | // Check that this word is as we expect. part_of_combos are NEVER iterated |
1490 | | // by the normal iterator, so we should never be trying to delete them. |
1491 | 382 | ASSERT_HOST(!word_res->part_of_combo); |
1492 | 382 | if (!word_res->combination) { |
1493 | | // Combinations own their own word, so we won't find the word on the |
1494 | | // row's word_list, but it is legitimate to try to delete them. |
1495 | | // Delete word from the ROW when not a combination. |
1496 | 71 | WERD_IT w_it(row()->row->word_list()); |
1497 | 245 | for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { |
1498 | 245 | if (w_it.data() == word_res->word) { |
1499 | 71 | break; |
1500 | 71 | } |
1501 | 245 | } |
1502 | 71 | ASSERT_HOST(!w_it.cycled_list()); |
1503 | 71 | delete w_it.extract(); |
1504 | 71 | } |
1505 | | // Remove the WERD_RES for the new_word. |
1506 | | // Remove the WORD_RES from the ROW_RES. |
1507 | 382 | WERD_RES_IT wr_it(&row()->word_res_list); |
1508 | 1.05k | for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { |
1509 | 1.05k | if (wr_it.data() == word_res) { |
1510 | 382 | word_res = nullptr; |
1511 | 382 | break; |
1512 | 382 | } |
1513 | 1.05k | } |
1514 | 382 | ASSERT_HOST(!wr_it.cycled_list()); |
1515 | 382 | delete wr_it.extract(); |
1516 | 382 | ResetWordIterator(); |
1517 | 382 | } |
1518 | | |
1519 | | // Makes the current word a fuzzy space if not already fuzzy. Updates |
1520 | | // corresponding part of combo if required. |
1521 | 0 | void PAGE_RES_IT::MakeCurrentWordFuzzy() { |
1522 | 0 | WERD *real_word = word_res->word; |
1523 | 0 | if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) { |
1524 | 0 | real_word->set_flag(W_FUZZY_SP, true); |
1525 | 0 | if (word_res->combination) { |
1526 | | // The next word should be the corresponding part of combo, but we have |
1527 | | // already stepped past it, so find it by search. |
1528 | 0 | WERD_RES_IT wr_it(&row()->word_res_list); |
1529 | 0 | for (wr_it.mark_cycle_pt(); |
1530 | 0 | !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) { |
1531 | 0 | } |
1532 | 0 | wr_it.forward(); |
1533 | 0 | ASSERT_HOST(wr_it.data()->part_of_combo); |
1534 | 0 | real_word = wr_it.data()->word; |
1535 | 0 | ASSERT_HOST(!real_word->flag(W_FUZZY_SP) && |
1536 | 0 | !real_word->flag(W_FUZZY_NON)); |
1537 | 0 | real_word->set_flag(W_FUZZY_SP, true); |
1538 | 0 | } |
1539 | 0 | } |
1540 | 0 | } |
1541 | | |
1542 | | /************************************************************************* |
1543 | | * PAGE_RES_IT::restart_page |
1544 | | * |
1545 | | * Set things up at the start of the page |
1546 | | *************************************************************************/ |
1547 | | |
1548 | 4.46M | WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) { |
1549 | 4.46M | block_res_it.set_to_list(&page_res->block_res_list); |
1550 | 4.46M | block_res_it.mark_cycle_pt(); |
1551 | 4.46M | prev_block_res = nullptr; |
1552 | 4.46M | prev_row_res = nullptr; |
1553 | 4.46M | prev_word_res = nullptr; |
1554 | 4.46M | block_res = nullptr; |
1555 | 4.46M | row_res = nullptr; |
1556 | 4.46M | word_res = nullptr; |
1557 | 4.46M | next_block_res = nullptr; |
1558 | 4.46M | next_row_res = nullptr; |
1559 | 4.46M | next_word_res = nullptr; |
1560 | 4.46M | internal_forward(true, empty_ok); |
1561 | 4.46M | return internal_forward(false, empty_ok); |
1562 | 4.46M | } |
1563 | | |
1564 | | // Recovers from operations on the current word, such as in InsertCloneWord |
1565 | | // and DeleteCurrentWord. |
1566 | | // Resets the word_res_it so that it is one past the next_word_res, as |
1567 | | // it should be after internal_forward. If next_row_res != row_res, |
1568 | | // then the next_word_res is in the next row, so there is no need to do |
1569 | | // anything to word_res_it, but it is still a good idea to reset the pointers |
1570 | | // word_res and prev_word_res, which are still in the current row. |
1571 | 80.0k | void PAGE_RES_IT::ResetWordIterator() { |
1572 | 80.0k | if (row_res == next_row_res) { |
1573 | | // Reset the member iterator so it can move forward and detect the |
1574 | | // cycled_list state correctly. |
1575 | 9.49k | word_res_it.move_to_first(); |
1576 | 9.49k | for (word_res_it.mark_cycle_pt(); |
1577 | 49.7k | !word_res_it.cycled_list() && word_res_it.data() != next_word_res; |
1578 | 40.2k | word_res_it.forward()) { |
1579 | 40.2k | if (!word_res_it.data()->part_of_combo) { |
1580 | 29.1k | if (prev_row_res == row_res) { |
1581 | 24.5k | prev_word_res = word_res; |
1582 | 24.5k | } |
1583 | 29.1k | word_res = word_res_it.data(); |
1584 | 29.1k | } |
1585 | 40.2k | } |
1586 | 9.49k | ASSERT_HOST(!word_res_it.cycled_list()); |
1587 | 9.49k | wr_it_of_next_word = word_res_it; |
1588 | 9.49k | word_res_it.forward(); |
1589 | 70.5k | } else { |
1590 | | // word_res_it is OK, but reset word_res and prev_word_res if needed. |
1591 | 70.5k | WERD_RES_IT wr_it(&row_res->word_res_list); |
1592 | 265k | for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { |
1593 | 195k | if (!wr_it.data()->part_of_combo) { |
1594 | 93.0k | if (prev_row_res == row_res) { |
1595 | 18.5k | prev_word_res = word_res; |
1596 | 18.5k | } |
1597 | 93.0k | word_res = wr_it.data(); |
1598 | 93.0k | } |
1599 | 195k | } |
1600 | 70.5k | } |
1601 | 80.0k | } |
1602 | | |
1603 | | /************************************************************************* |
1604 | | * PAGE_RES_IT::internal_forward |
1605 | | * |
1606 | | * Find the next word on the page. If empty_ok is true, then non-text blocks |
1607 | | * and text blocks with no text are visited as if they contain a single |
1608 | | * imaginary word in a single imaginary row. (word() and row() both return |
1609 | | *nullptr in such a block and the return value is nullptr.) If empty_ok is |
1610 | | *false, the old behaviour is maintained. Each real word is visited and empty |
1611 | | *and non-text blocks and rows are skipped. new_block is used to initialize the |
1612 | | *iterators for a new block. The iterator maintains pointers to block, row and |
1613 | | *word for the previous, current and next words. These are correct, regardless |
1614 | | *of block/row boundaries. nullptr values denote start and end of the page. |
1615 | | *************************************************************************/ |
1616 | | |
1617 | 95.7M | WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) { |
1618 | 95.7M | bool new_row = false; |
1619 | | |
1620 | 95.7M | prev_block_res = block_res; |
1621 | 95.7M | prev_row_res = row_res; |
1622 | 95.7M | prev_word_res = word_res; |
1623 | 95.7M | block_res = next_block_res; |
1624 | 95.7M | row_res = next_row_res; |
1625 | 95.7M | word_res = next_word_res; |
1626 | 95.7M | wr_it_of_current_word = wr_it_of_next_word; |
1627 | 95.7M | next_block_res = nullptr; |
1628 | 95.7M | next_row_res = nullptr; |
1629 | 95.7M | next_word_res = nullptr; |
1630 | | |
1631 | 96.7M | while (!block_res_it.cycled_list()) { |
1632 | 94.9M | if (new_block) { |
1633 | 4.45M | new_block = false; |
1634 | 4.45M | row_res_it.set_to_list(&block_res_it.data()->row_res_list); |
1635 | 4.45M | row_res_it.mark_cycle_pt(); |
1636 | 4.45M | if (row_res_it.empty() && empty_ok) { |
1637 | 0 | next_block_res = block_res_it.data(); |
1638 | 0 | break; |
1639 | 0 | } |
1640 | 4.45M | new_row = true; |
1641 | 4.45M | } |
1642 | 154M | while (!row_res_it.cycled_list()) { |
1643 | 153M | if (new_row) { |
1644 | 63.2M | new_row = false; |
1645 | 63.2M | word_res_it.set_to_list(&row_res_it.data()->word_res_list); |
1646 | 63.2M | word_res_it.mark_cycle_pt(); |
1647 | 63.2M | } |
1648 | | // Skip any part_of_combo words. |
1649 | 231M | while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) { |
1650 | 77.8M | word_res_it.forward(); |
1651 | 77.8M | } |
1652 | 153M | if (!word_res_it.cycled_list()) { |
1653 | 93.9M | next_block_res = block_res_it.data(); |
1654 | 93.9M | next_row_res = row_res_it.data(); |
1655 | 93.9M | next_word_res = word_res_it.data(); |
1656 | 93.9M | wr_it_of_next_word = word_res_it; |
1657 | 93.9M | word_res_it.forward(); |
1658 | 93.9M | goto foundword; |
1659 | 93.9M | } |
1660 | | // end of row reached |
1661 | 59.8M | row_res_it.forward(); |
1662 | 59.8M | new_row = true; |
1663 | 59.8M | } |
1664 | | // end of block reached |
1665 | 1.02M | block_res_it.forward(); |
1666 | 1.02M | new_block = true; |
1667 | 1.02M | } |
1668 | 95.7M | foundword: |
1669 | | // Update prev_word_best_choice pointer. |
1670 | 95.7M | if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) { |
1671 | 95.7M | *page_res->prev_word_best_choice = (new_block || prev_word_res == nullptr) |
1672 | 95.7M | ? nullptr |
1673 | 95.7M | : prev_word_res->best_choice; |
1674 | 95.7M | } |
1675 | 95.7M | return word_res; |
1676 | 95.7M | } |
1677 | | |
1678 | | /************************************************************************* |
1679 | | * PAGE_RES_IT::restart_row() |
1680 | | * |
1681 | | * Move to the beginning (leftmost word) of the current row. |
1682 | | *************************************************************************/ |
1683 | 3.78M | WERD_RES *PAGE_RES_IT::restart_row() { |
1684 | 3.78M | ROW_RES *row = this->row(); |
1685 | 3.78M | if (!row) { |
1686 | 1.35k | return nullptr; |
1687 | 1.35k | } |
1688 | 68.8M | for (restart_page(); this->row() != row; forward()) { |
1689 | | // pass |
1690 | 65.0M | } |
1691 | 3.78M | return word(); |
1692 | 3.78M | } |
1693 | | |
1694 | | /************************************************************************* |
1695 | | * PAGE_RES_IT::forward_paragraph |
1696 | | * |
1697 | | * Move to the beginning of the next paragraph, allowing empty blocks. |
1698 | | *************************************************************************/ |
1699 | | |
1700 | 941k | WERD_RES *PAGE_RES_IT::forward_paragraph() { |
1701 | 16.3M | while (block_res == next_block_res && |
1702 | 16.3M | (next_row_res != nullptr && next_row_res->row != nullptr && |
1703 | 15.7M | row_res->row->para() == next_row_res->row->para())) { |
1704 | 15.4M | internal_forward(false, true); |
1705 | 15.4M | } |
1706 | 941k | return internal_forward(false, true); |
1707 | 941k | } |
1708 | | |
1709 | | /************************************************************************* |
1710 | | * PAGE_RES_IT::forward_block |
1711 | | * |
1712 | | * Move to the beginning of the next block, allowing empty blocks. |
1713 | | *************************************************************************/ |
1714 | | |
1715 | 7.05k | WERD_RES *PAGE_RES_IT::forward_block() { |
1716 | 122k | while (block_res == next_block_res) { |
1717 | 115k | internal_forward(false, true); |
1718 | 115k | } |
1719 | 7.05k | return internal_forward(false, true); |
1720 | 7.05k | } |
1721 | | |
1722 | 0 | void PAGE_RES_IT::rej_stat_word() { |
1723 | 0 | int16_t chars_in_word; |
1724 | 0 | int16_t rejects_in_word = 0; |
1725 | |
|
1726 | 0 | chars_in_word = word_res->reject_map.length(); |
1727 | 0 | page_res->char_count += chars_in_word; |
1728 | 0 | block_res->char_count += chars_in_word; |
1729 | 0 | row_res->char_count += chars_in_word; |
1730 | |
|
1731 | 0 | rejects_in_word = word_res->reject_map.reject_count(); |
1732 | |
|
1733 | 0 | page_res->rej_count += rejects_in_word; |
1734 | 0 | block_res->rej_count += rejects_in_word; |
1735 | 0 | row_res->rej_count += rejects_in_word; |
1736 | 0 | if (chars_in_word == rejects_in_word) { |
1737 | 0 | row_res->whole_word_rej_count += rejects_in_word; |
1738 | 0 | } |
1739 | 0 | } |
1740 | | |
1741 | | } // namespace tesseract |