/src/tesseract/src/ccstruct/pageres.h
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: pageres.h (Formerly page_res.h) |
3 | | * Description: Results classes used by control.c |
4 | | * Author: Phil Cheatle |
5 | | * |
6 | | * (C) Copyright 1992, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #ifndef PAGERES_H |
20 | | #define PAGERES_H |
21 | | |
22 | | #include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS |
23 | | #include "clst.h" // for CLIST_ITERATOR, CLISTIZEH |
24 | | #include "elst.h" // for ELIST_ITERATOR, ELIST_LINK, ELISTIZEH |
25 | | #include "genericvector.h" // for PointerVector |
26 | | #include "matrix.h" // for MATRIX |
27 | | #include "normalis.h" // for DENORM |
28 | | #include "ratngs.h" // for WERD_CHOICE, BLOB_CHOICE (ptr only) |
29 | | #include "rect.h" // for TBOX |
30 | | #include "rejctmap.h" // for REJMAP |
31 | | #include "unicharset.h" // for UNICHARSET, UNICHARSET::Direction, UNI... |
32 | | #include "werd.h" // for WERD, W_BOL, W_EOL |
33 | | |
34 | | #include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID |
35 | | |
36 | | #include <cstdint> // for int32_t, int16_t |
37 | | #include <functional> // for std::function |
38 | | #include <set> // for std::pair |
39 | | #include <vector> // for std::vector |
40 | | |
41 | | #include <sys/types.h> // for int8_t |
42 | | |
43 | | struct Pix; |
44 | | |
45 | | namespace tesseract { |
46 | | |
47 | | class BLOCK; |
48 | | class BLOCK_LIST; |
49 | | class BLOCK_RES; |
50 | | class ROW; |
51 | | class ROW_RES; |
52 | | class SEAM; |
53 | | class WERD_RES; |
54 | | |
55 | | struct TWERD; |
56 | | |
57 | | class BoxWord; |
58 | | class Tesseract; |
59 | | struct FontInfo; |
60 | | |
61 | | /* Forward declarations */ |
62 | | |
63 | | class BLOCK_RES; |
64 | | |
65 | | ELISTIZEH(BLOCK_RES) |
66 | | CLISTIZEH(BLOCK_RES) |
67 | | class ROW_RES; |
68 | | |
69 | | ELISTIZEH(ROW_RES) |
70 | | class WERD_RES; |
71 | | |
72 | | ELISTIZEH(WERD_RES) |
73 | | |
74 | | /************************************************************************* |
75 | | * PAGE_RES - Page results |
76 | | *************************************************************************/ |
77 | | class PAGE_RES { // page result |
78 | | public: |
79 | | int32_t char_count; |
80 | | int32_t rej_count; |
81 | | BLOCK_RES_LIST block_res_list; |
82 | | bool rejected; |
83 | | // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to |
84 | | // the next word. This pointer is not owned by PAGE_RES class. |
85 | | WERD_CHOICE **prev_word_best_choice; |
86 | | // Sums of blame reasons computed by the blamer. |
87 | | std::vector<int> blame_reasons; |
88 | | // Debug information about all the misadaptions on this page. |
89 | | // Each BlamerBundle contains an index into this vector, so that words that |
90 | | // caused misadaption could be marked. However, since words could be |
91 | | // deleted/split/merged, the log is stored on the PAGE_RES level. |
92 | | std::vector<std::string> misadaption_log; |
93 | | |
94 | 7.72k | inline void Init() { |
95 | 7.72k | char_count = 0; |
96 | 7.72k | rej_count = 0; |
97 | 7.72k | rejected = false; |
98 | 7.72k | prev_word_best_choice = nullptr; |
99 | 7.72k | blame_reasons.clear(); |
100 | 7.72k | blame_reasons.resize(IRR_NUM_REASONS); |
101 | 7.72k | } |
102 | | |
103 | 0 | PAGE_RES() { |
104 | 0 | Init(); |
105 | 0 | } // empty constructor |
106 | | |
107 | | PAGE_RES(bool merge_similar_words, |
108 | | BLOCK_LIST *block_list, // real blocks |
109 | | WERD_CHOICE **prev_word_best_choice_ptr); |
110 | | |
111 | 7.72k | ~PAGE_RES() = default; |
112 | | }; |
113 | | |
114 | | /************************************************************************* |
115 | | * BLOCK_RES - Block results |
116 | | *************************************************************************/ |
117 | | |
118 | | class BLOCK_RES : public ELIST<BLOCK_RES>::LINK { |
119 | | public: |
120 | | BLOCK *block; // real block |
121 | | int32_t char_count; // chars in block |
122 | | int32_t rej_count; // rejected chars |
123 | | int16_t font_class; // |
124 | | int16_t row_count; |
125 | | float x_height; |
126 | | bool font_assigned; // block already |
127 | | // processed |
128 | | |
129 | | ROW_RES_LIST row_res_list; |
130 | | |
131 | | BLOCK_RES() = default; |
132 | | |
133 | | BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block |
134 | | |
135 | 7.06k | ~BLOCK_RES() = default; |
136 | | }; |
137 | | |
138 | | /************************************************************************* |
139 | | * ROW_RES - Row results |
140 | | *************************************************************************/ |
141 | | |
142 | | class ROW_RES : public ELIST<ROW_RES>::LINK { |
143 | | public: |
144 | | ROW *row; // real row |
145 | | int32_t char_count; // chars in block |
146 | | int32_t rej_count; // rejected chars |
147 | | int32_t whole_word_rej_count; // rejs in total rej wds |
148 | | WERD_RES_LIST word_res_list; |
149 | | |
150 | | ROW_RES() = default; |
151 | | |
152 | | ROW_RES(bool merge_similar_words, ROW *the_row); // real row |
153 | | |
154 | 95.5k | ~ROW_RES() = default; |
155 | | }; |
156 | | |
157 | | /************************************************************************* |
158 | | * WERD_RES - Word results |
159 | | *************************************************************************/ |
// Crunch mode stored per word in WERD_RES::unlv_crunch_mode (CR_NONE there by
// default). Enumerator values are the implicit 0..3.
enum CRUNCH_MODE {
  CR_NONE,
  CR_KEEP_SPACE,
  CR_LOOSE_SPACE,
  CR_DELETE
};
161 | | |
162 | | // WERD_RES is a collection of publicly accessible members that gathers |
163 | | // information about a word result. |
164 | | class TESS_API WERD_RES : public ELIST<WERD_RES>::LINK { |
165 | | public: |
166 | | // Which word is which? |
167 | | // There are 3 coordinate spaces in use here: a possibly rotated pixel space, |
168 | | // the original image coordinate space, and the BLN space in which the |
169 | | // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight, |
170 | | // and the x-middle of the word is at 0. |
171 | | // In the rotated pixel space, coordinates correspond to the input image, |
172 | | // but may be rotated about the origin by a multiple of 90 degrees, |
173 | | // and may therefore be negative. |
174 | | // In any case a rotation by denorm.block()->re_rotation() will take them |
175 | | // back to the original image. |
176 | | // The other differences between words all represent different stages of |
177 | | // processing during recognition. |
178 | | |
179 | | // ---------------------------INPUT------------------------------------- |
180 | | |
181 | | // The word is the input C_BLOBs in the rotated pixel space. |
182 | | // word is NOT owned by the WERD_RES unless combination is true. |
183 | | // All the other word pointers ARE owned by the WERD_RES. |
184 | | WERD *word = nullptr; // Input C_BLOB word. |
185 | | |
186 | | // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------ |
187 | | |
188 | | // The bln_boxes contains the bounding boxes (only) of the input word, in the |
189 | | // BLN space. The lengths of word and bln_boxes |
190 | | // match as they are both before any chopping. |
191 | | // TODO(rays) determine if docqual does anything useful and delete bln_boxes |
192 | | // if it doesn't. |
193 | | tesseract::BoxWord *bln_boxes = nullptr; // BLN input bounding boxes. |
194 | | // The ROW that this word sits in. NOT owned by the WERD_RES. |
195 | | ROW *blob_row = nullptr; |
196 | | // The denorm provides the transformation to get back to the rotated image |
197 | | // coords from the chopped_word/rebuild_word BLN coords, but each blob also |
198 | | // has its own denorm. |
199 | | DENORM denorm; // For use on chopped_word. |
200 | | // Unicharset used by the classifier output in best_choice and raw_choice. |
201 | | const UNICHARSET *uch_set = nullptr; // For converting back to utf8. |
202 | | |
203 | | // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION---- |
204 | | // ----Setup to a (different!) state expected by the various classifiers---- |
205 | | // TODO(rays) Tidy and make more consistent. |
206 | | |
207 | | // The chopped_word is also in BLN space, and represents the fully chopped |
208 | | // character fragments that make up the word. |
209 | | // The length of chopped_word matches length of seam_array + 1 (if set). |
210 | | TWERD *chopped_word = nullptr; // BLN chopped fragments output. |
211 | | // Vector of SEAM* holding chopping points matching chopped_word. |
212 | | std::vector<SEAM *> seam_array; |
213 | | // Widths of blobs in chopped_word. |
214 | | std::vector<int> blob_widths; |
215 | | // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between |
216 | | // blob i and blob i+1. |
217 | | std::vector<int> blob_gaps; |
218 | | // Stores the lstm choices of every timestep |
219 | | std::vector<std::vector<std::pair<const char *, float>>> timesteps; |
220 | | // Stores the lstm choices of every timestep segmented by character |
221 | | std::vector<std::vector<std::vector<std::pair<const char *, float>>>> |
222 | | segmented_timesteps; |
223 | | // Symbolchoices acquired during CTC |
224 | | std::vector<std::vector<std::pair<const char *, float>>> CTC_symbol_choices; |
225 | | // Stores if the timestep vector starts with a space |
226 | | bool leading_space = false; |
227 | | // Stores value when the word ends |
228 | | int end = 0; |
229 | | // Ratings matrix contains classifier choices for each classified combination |
230 | | // of blobs. The dimension is the same as the number of blobs in chopped_word |
231 | | // and the leading diagonal corresponds to classifier results of the blobs |
232 | | // in chopped_word. The state_ members of best_choice, raw_choice and |
233 | | // best_choices all correspond to this ratings matrix and allow extraction |
234 | | // of the blob choices for any given WERD_CHOICE. |
235 | | MATRIX *ratings = nullptr; // Owned pointer. |
236 | | // Pointer to the first WERD_CHOICE in best_choices. This is the result that |
237 | | // will be output from Tesseract. Note that this is now a borrowed pointer |
238 | | // and should NOT be deleted. |
239 | | WERD_CHOICE *best_choice = nullptr; // Borrowed pointer. |
240 | | // The best raw_choice found during segmentation search. Differs from the |
241 | | // best_choice by being the best result according to just the character |
242 | | // classifier, not taking any language model information into account. |
243 | | // Unlike best_choice, the pointer IS owned by this WERD_RES. |
244 | | WERD_CHOICE *raw_choice = nullptr; // Owned pointer. |
245 | | // Alternative results found during chopping/segmentation search stages. |
246 | | // Note that being an ELIST, best_choices owns the WERD_CHOICEs. |
247 | | WERD_CHOICE_LIST best_choices; |
248 | | |
249 | | // Truth bounding boxes, text and incorrect choice reason. |
250 | | BlamerBundle *blamer_bundle = nullptr; |
251 | | |
252 | | // --------------OUTPUT FROM RECOGNITION------------------------------- |
253 | | // --------------Not all fields are necessarily set.------------------- |
254 | | // ---best_choice, raw_choice *must* end up set, with a box_word------- |
255 | | // ---In complete output, the number of blobs in rebuild_word matches--- |
256 | | // ---the number of boxes in box_word, the number of unichar_ids in--- |
257 | | // ---best_choice, the number of ints in best_state, and the number--- |
258 | | // ---of strings in correct_text-------------------------------------- |
259 | | // ---SetupFake Sets everything to appropriate values if the word is--- |
260 | | // ---known to be bad before recognition.------------------------------ |
261 | | |
262 | | // The rebuild_word is also in BLN space, but represents the final best |
263 | | // segmentation of the word. Its length is therefore the same as box_word. |
264 | | TWERD *rebuild_word = nullptr; // BLN best segmented word. |
265 | | // The box_word is in the original image coordinate space. It is the |
266 | | // bounding boxes of the rebuild_word, after denormalization. |
267 | | // The length of box_word matches rebuild_word, best_state (if set) and |
268 | | // correct_text (if set), as well as best_choice and represents the |
269 | | // number of classified units in the output. |
270 | | tesseract::BoxWord *box_word = nullptr; // Denormalized output boxes. |
271 | | // The Tesseract that was used to recognize this word. Just a borrowed |
272 | | // pointer. Note: Tesseract's class definition is in a higher-level library. |
273 | | // We avoid introducing a cyclic dependency by not using the Tesseract |
274 | | // within WERD_RES. We are just storing it to provide access to it |
275 | | // for the top-level multi-language controller, and maybe for output of |
276 | | // the recognized language. |
277 | | // tesseract points to data owned elsewhere. |
278 | | tesseract::Tesseract *tesseract = nullptr; |
279 | | // The best_state stores the relationship between chopped_word and |
280 | | // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i] |
281 | | // adjacent blobs in chopped_word. The seams in seam_array are hidden |
282 | | // within a rebuild_word blob and revealed between them. |
283 | | std::vector<int> best_state; // Number of blobs in each best blob. |
284 | | // The correct_text is used during training and adaption to carry the |
285 | | // text to the training system without the need for a unicharset. There |
286 | | // is one entry in the vector for each blob in rebuild_word and box_word. |
287 | | std::vector<std::string> correct_text; |
288 | | |
289 | | // Less-well documented members. |
290 | | // TODO(rays) Add more documentation here. |
291 | | WERD_CHOICE *ep_choice = nullptr; // ep text TODO(rays) delete this. |
292 | | REJMAP reject_map; // best_choice rejects |
293 | | bool tess_failed = false; |
294 | | /* |
295 | | If tess_failed is true, one of the following tests failed when Tess |
296 | | returned: |
297 | | - The outword blob list was not the same length as the best_choice string; |
298 | | - The best_choice string contained ALL blanks; |
299 | | - The best_choice string was zero length |
300 | | */ |
301 | | bool tess_accepted = false; // Tess thinks its ok? |
302 | | bool tess_would_adapt = false; // Tess would adapt? |
303 | | bool done = false; // ready for output? |
304 | | bool small_caps = false; // word appears to be small caps |
305 | | bool odd_size = false; // word is bigger than line or leader dots. |
306 | | // The fontinfos are pointers to data owned by the classifier. |
307 | | const FontInfo *fontinfo = nullptr; |
308 | | const FontInfo *fontinfo2 = nullptr; |
309 | | int8_t fontinfo_id_count = 0; // number of votes |
310 | | int8_t fontinfo_id2_count = 0; // number of votes |
311 | | bool guessed_x_ht = true; |
312 | | bool guessed_caps_ht = true; |
313 | | CRUNCH_MODE unlv_crunch_mode = CR_NONE; |
314 | | float x_height = 0.0f; // post match estimate |
315 | | float caps_height = 0.0f; // post match estimate |
316 | | float baseline_shift = 0.0f; // post match estimate. |
317 | | // Certainty score for the spaces either side of this word (LSTM mode). |
318 | | // MIN this value with the actual word certainty. |
319 | | float space_certainty = 0.0f; |
320 | | |
321 | | /* |
322 | | To deal with fuzzy spaces we need to be able to combine "words" to form |
323 | | combinations when we suspect that the gap is a non-space. The (new) text |
324 | | ord code generates separate words for EVERY fuzzy gap - flags in the word |
325 | | indicate whether the gap is below the threshold (fuzzy kern) and is thus |
326 | | NOT a real word break by default, or above the threshold (fuzzy space) and |
327 | | this is a real word break by default. |
328 | | |
329 | | The WERD_RES list contains all these words PLUS "combination" words built |
330 | | out of (copies of) the words split by fuzzy kerns. The separate parts have |
331 | | their "part_of_combo" flag set true and should be IGNORED on a default |
332 | | reading of the list. |
333 | | |
334 | | Combination words are FOLLOWED by the sequence of part_of_combo words |
335 | | which they combine. |
336 | | */ |
337 | | bool combination = false; // of two fuzzy gap wds |
338 | | bool part_of_combo = false; // part of a combo |
339 | | bool reject_spaces = false; // Reject spacing? |
340 | | |
341 | 141k | WERD_RES() = default; |
342 | | |
343 | 335k | WERD_RES(WERD *the_word) { |
344 | 335k | word = the_word; |
345 | 335k | } |
346 | | // Deep copies everything except the ratings MATRIX. |
347 | | // To get that use deep_copy below. |
348 | 26.8k | WERD_RES(const WERD_RES &source) : ELIST<WERD_RES>::LINK(source) { |
349 | | // combination is used in function Clear which is called from operator=. |
350 | 26.8k | combination = false; |
351 | 26.8k | *this = source; // see operator= |
352 | 26.8k | } |
353 | | |
354 | | ~WERD_RES(); |
355 | | |
356 | | // Returns the UTF-8 string for the given blob index in the best_choice word, |
357 | | // given that we know whether we are in a right-to-left reading context. |
358 | | // This matters for mirrorable characters such as parentheses. We recognize |
359 | | // characters purely based on their shape on the page, and by default produce |
360 | | // the corresponding unicode for a left-to-right context. |
361 | 488k | const char *BestUTF8(unsigned blob_index, bool in_rtl_context) const { |
362 | 488k | if (best_choice == nullptr || blob_index >= best_choice->length()) { |
363 | 0 | return nullptr; |
364 | 0 | } |
365 | 488k | UNICHAR_ID id = best_choice->unichar_id(blob_index); |
366 | 488k | if (static_cast<unsigned>(id) >= uch_set->size()) { |
367 | 0 | return nullptr; |
368 | 0 | } |
369 | 488k | UNICHAR_ID mirrored = uch_set->get_mirror(id); |
370 | 488k | if (in_rtl_context && mirrored > 0) { |
371 | 0 | id = mirrored; |
372 | 0 | } |
373 | 488k | return uch_set->id_to_unichar_ext(id); |
374 | 488k | } |
375 | | // Returns the UTF-8 string for the given blob index in the raw_choice word. |
376 | 0 | const char *RawUTF8(unsigned blob_index) const { |
377 | 0 | if (blob_index >= raw_choice->length()) { |
378 | 0 | return nullptr; |
379 | 0 | } |
380 | 0 | UNICHAR_ID id = raw_choice->unichar_id(blob_index); |
381 | 0 | if (static_cast<unsigned>(id) >= uch_set->size()) { |
382 | 0 | return nullptr; |
383 | 0 | } |
384 | 0 | return uch_set->id_to_unichar(id); |
385 | 0 | } |
386 | | |
387 | 0 | UNICHARSET::Direction SymbolDirection(unsigned blob_index) const { |
388 | 0 | if (best_choice == nullptr || blob_index >= best_choice->length()) { |
389 | 0 | return UNICHARSET::U_OTHER_NEUTRAL; |
390 | 0 | } |
391 | 0 | return uch_set->get_direction(best_choice->unichar_id(blob_index)); |
392 | 0 | } |
393 | | |
394 | 3.79M | bool AnyRtlCharsInWord() const { |
395 | 3.79M | if (uch_set == nullptr || best_choice == nullptr || |
396 | 3.79M | best_choice->length() < 1) { |
397 | 0 | return false; |
398 | 0 | } |
399 | 11.1M | for (unsigned id = 0; id < best_choice->length(); id++) { |
400 | 7.30M | unsigned unichar_id = best_choice->unichar_id(id); |
401 | 7.30M | if (unichar_id >= uch_set->size()) { |
402 | 0 | continue; // Ignore illegal chars. |
403 | 0 | } |
404 | 7.30M | UNICHARSET::Direction dir = uch_set->get_direction(unichar_id); |
405 | 7.30M | if (dir == UNICHARSET::U_RIGHT_TO_LEFT || |
406 | 7.30M | dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) { |
407 | 0 | return true; |
408 | 0 | } |
409 | 7.30M | } |
410 | 3.79M | return false; |
411 | 3.79M | } |
412 | | |
413 | 3.79M | bool AnyLtrCharsInWord() const { |
414 | 3.79M | if (uch_set == nullptr || best_choice == nullptr || |
415 | 3.79M | best_choice->length() < 1) { |
416 | 0 | return false; |
417 | 0 | } |
418 | 5.90M | for (unsigned id = 0; id < best_choice->length(); id++) { |
419 | 4.42M | unsigned unichar_id = best_choice->unichar_id(id); |
420 | 4.42M | if (unichar_id >= uch_set->size()) { |
421 | 0 | continue; // Ignore illegal chars. |
422 | 0 | } |
423 | 4.42M | UNICHARSET::Direction dir = uch_set->get_direction(unichar_id); |
424 | 4.42M | if (dir == UNICHARSET::U_LEFT_TO_RIGHT || |
425 | 4.42M | dir == UNICHARSET::U_ARABIC_NUMBER) { |
426 | 2.31M | return true; |
427 | 2.31M | } |
428 | 4.42M | } |
429 | 1.47M | return false; |
430 | 3.79M | } |
431 | | |
432 | | // Return whether the blobs in this WERD_RES 0, 1,... come from an engine |
433 | | // that gave us the unichars in reading order (as opposed to strict left |
434 | | // to right). |
435 | 0 | bool UnicharsInReadingOrder() const { |
436 | 0 | return best_choice->unichars_in_script_order(); |
437 | 0 | } |
438 | | |
439 | | void Clear(); |
440 | | void ClearResults(); |
441 | | void ClearWordChoices(); |
442 | | void ClearRatings(); |
443 | | |
444 | | // Deep copies everything except the ratings MATRIX. |
445 | | // To get that use deep_copy below. |
446 | | WERD_RES &operator=(const WERD_RES &source); // from this |
447 | | |
448 | | void CopySimpleFields(const WERD_RES &source); |
449 | | |
450 | | // Initializes a blank (default constructed) WERD_RES from one that has |
451 | | // already been recognized. |
452 | | // Use SetupFor*Recognition afterwards to complete the setup and make |
453 | | // it ready for a retry recognition. |
454 | | void InitForRetryRecognition(const WERD_RES &source); |
455 | | |
456 | | // Sets up the members used in recognition: bln_boxes, chopped_word, |
457 | | // seam_array, denorm. Returns false if |
458 | | // the word is empty and sets up fake results. If use_body_size is |
459 | | // true and row->body_size is set, then body_size will be used for |
460 | | // blob normalization instead of xheight + ascrise. This flag is for |
461 | | // those languages that are using CJK pitch model and thus it has to |
462 | | // be true if and only if tesseract->textord_use_cjk_fp_model is |
463 | | // true. |
464 | | // If allow_detailed_fx is true, the feature extractor will receive fine |
465 | | // precision outline information, allowing smoother features and better |
466 | | // features on low resolution images. |
467 | | // The norm_mode sets the default mode for normalization in absence |
468 | | // of any of the above flags. It should really be a tesseract::OcrEngineMode |
469 | | // but is declared as int for ease of use with tessedit_ocr_engine_mode. |
470 | | // Returns false if the word is empty and sets up fake results. |
471 | | bool SetupForRecognition(const UNICHARSET &unicharset_in, |
472 | | tesseract::Tesseract *tesseract, Image pix, |
473 | | int norm_mode, const TBOX *norm_box, |
474 | | bool numeric_mode, bool use_body_size, |
475 | | bool allow_detailed_fx, ROW *row, |
476 | | const BLOCK *block); |
477 | | |
478 | | // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty |
479 | | // accumulators from a made chopped word. We presume the fields are already |
480 | | // empty. |
481 | | void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in); |
482 | | |
483 | | // Sets up the members used in recognition for an empty recognition result: |
484 | | // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. |
485 | | void SetupFake(const UNICHARSET &uch); |
486 | | |
487 | | // Set the word as having the script of the input unicharset. |
488 | | void SetupWordScript(const UNICHARSET &unicharset_in); |
489 | | |
490 | | // Sets up the blamer_bundle if it is not null, using the initialized denorm. |
491 | | void SetupBlamerBundle(); |
492 | | |
493 | | // Computes the blob_widths and blob_gaps from the chopped_word. |
494 | | void SetupBlobWidthsAndGaps(); |
495 | | |
496 | | // Updates internal data to account for a new SEAM (chop) at the given |
497 | | // blob_number. Fixes the ratings matrix and states in the choices, as well |
498 | | // as the blob widths and gaps. |
499 | | void InsertSeam(int blob_number, SEAM *seam); |
500 | | |
501 | | // Returns true if all the word choices except the first have adjust_factors |
502 | | // worse than the given threshold. |
503 | | bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const; |
504 | | |
505 | | // Returns true if the current word is ambiguous (by number of answers or |
506 | | // by dangerous ambigs.) |
507 | | bool IsAmbiguous(); |
508 | | |
509 | | // Returns true if the ratings matrix size matches the sum of each of the |
510 | | // segmentation states. |
511 | | bool StatesAllValid(); |
512 | | |
513 | | // Prints a list of words found if debug is true or the word result matches |
514 | | // the word_to_debug. |
515 | | void DebugWordChoices(bool debug, const char *word_to_debug); |
516 | | |
517 | | // Prints the top choice along with the accepted/done flags. |
518 | | void DebugTopChoice(const char *msg) const; |
519 | | |
520 | | // Removes from best_choices all choices which are not within a reasonable |
521 | | // range of the best choice. |
522 | | void FilterWordChoices(int debug_level); |
523 | | |
524 | | // Computes a set of distance thresholds used to control adaption. |
525 | | // Compares the best choice for the current word to the best raw choice |
526 | | // to determine which characters were classified incorrectly by the |
527 | | // classifier. Then places a separate threshold into thresholds for each |
528 | | // character in the word. If the classifier was correct, max_rating is placed |
529 | | // into thresholds. If the classifier was incorrect, the mean match rating |
530 | | // (error percentage) of the classifier's incorrect choice minus some margin |
531 | | // is placed into thresholds. This can then be used by the caller to try to |
532 | | // create a new template for the desired class that will classify the |
533 | | // character with a rating better than the threshold value. The match rating |
534 | | // placed into thresholds is never allowed to be below min_rating in order to |
535 | | // prevent trying to make overly tight templates. |
536 | | // min_rating limits how tight to make a template. |
537 | | // max_rating limits how loose to make a template. |
538 | | // rating_margin denotes the amount of margin to put in template. |
539 | | void ComputeAdaptionThresholds(float certainty_scale, float min_rating, |
540 | | float max_rating, float rating_margin, |
541 | | float *thresholds); |
542 | | |
543 | | // Saves a copy of the word_choice if it has the best unadjusted rating. |
544 | | // Returns true if the word_choice was the new best. |
545 | | bool LogNewRawChoice(WERD_CHOICE *word_choice); |
546 | | // Consumes word_choice by adding it to best_choices, (taking ownership) if |
547 | | // the certainty for word_choice is some distance of the best choice in |
548 | | // best_choices, or by deleting the word_choice and returning false. |
549 | | // The best_choices list is kept in sorted order by rating. Duplicates are |
550 | | // removed, and the list is kept no longer than max_num_choices in length. |
551 | | // Returns true if the word_choice is still a valid pointer. |
552 | | bool LogNewCookedChoice(int max_num_choices, bool debug, |
553 | | WERD_CHOICE *word_choice); |
554 | | |
555 | | // Prints a brief list of all the best choices. |
556 | | void PrintBestChoices() const; |
557 | | |
558 | | // Returns the sum of the widths of the blob between start_blob and last_blob |
559 | | // inclusive. |
560 | | int GetBlobsWidth(int start_blob, int last_blob) const; |
561 | | // Returns the width of a gap between the specified blob and the next one. |
562 | | int GetBlobsGap(unsigned blob_index) const; |
563 | | |
564 | | // Returns the BLOB_CHOICE corresponding to the given index in the |
565 | | // best choice word taken from the appropriate cell in the ratings MATRIX. |
566 | | // Borrowed pointer, so do not delete. May return nullptr if there is no |
567 | | // BLOB_CHOICE matching the unichar_id at the given index. |
568 | | BLOB_CHOICE *GetBlobChoice(unsigned index) const; |
569 | | |
570 | | // Returns the BLOB_CHOICE_LIST corresponding to the given index in the |
571 | | // best choice word taken from the appropriate cell in the ratings MATRIX. |
572 | | // Borrowed pointer, so do not delete. |
573 | | BLOB_CHOICE_LIST *GetBlobChoices(int index) const; |
574 | | |
575 | | // Moves the results fields from word to this. This takes ownership of all |
576 | | // the data, so src can be destructed. |
//     word1.ConsumeWordResults(word);
578 | | // delete word; |
579 | | // is simpler and faster than: |
580 | | // word1 = *word; |
581 | | // delete word; |
582 | | // as it doesn't need to copy and reallocate anything. |
583 | | void ConsumeWordResults(WERD_RES *word); |
584 | | |
585 | | // Replace the best choice and rebuild box word. |
586 | | // choice must be from the current best_choices list. |
587 | | void ReplaceBestChoice(WERD_CHOICE *choice); |
588 | | |
589 | | // Builds the rebuild_word and sets the best_state from the chopped_word and |
590 | | // the best_choice->state. |
591 | | void RebuildBestState(); |
592 | | |
593 | | // Copies the chopped_word to the rebuild_word, faking a best_state as well. |
594 | | // Also sets up the output box_word. |
595 | | void CloneChoppedToRebuild(); |
596 | | |
597 | | // Sets/replaces the box_word with one made from the rebuild_word. |
598 | | void SetupBoxWord(); |
599 | | |
600 | | // Sets up the script positions in the best_choice using the best_choice |
601 | | // to get the unichars, and the unicharset to get the target positions. |
602 | | void SetScriptPositions(); |
603 | | // Sets all the blobs in all the words (best choice and alternates) to be |
604 | | // the given position. (When a sub/superscript is recognized as a separate |
605 | | // word, it falls victim to the rule that a whole word cannot be sub or |
606 | | // superscript, so this function overrides that problem.) |
607 | | void SetAllScriptPositions(tesseract::ScriptPos position); |
608 | | |
609 | | // Classifies the word with some already-calculated BLOB_CHOICEs. |
610 | | // The choices are an array of blob_count pointers to BLOB_CHOICE, |
611 | | // providing a single classifier result for each blob. |
612 | | // The BLOB_CHOICEs are consumed and the word takes ownership. |
613 | | // The number of blobs in the box_word must match blob_count. |
614 | | void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices); |
615 | | |
616 | | // Creates a WERD_CHOICE for the word using the top choices from the leading |
617 | | // diagonal of the ratings matrix. |
618 | | void FakeWordFromRatings(PermuterType permuter); |
619 | | |
620 | | // Copies the best_choice strings to the correct_text for adaption/training. |
621 | | void BestChoiceToCorrectText(); |
622 | | |
623 | | // Merges 2 adjacent blobs in the result if the callback class_cb
624 | | // returns something other than INVALID_UNICHAR_ID, AND the callback
625 | | // box_cb is empty (nullptr) or returns true, setting the merged blob
626 | | // result to the class returned from class_cb.
627 | | // Returns true if anything was merged.
628 | | bool ConditionalBlobMerge(
629 | | const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,
630 | | const std::function<bool(const TBOX &, const TBOX &)> &box_cb);
631 | | 
632 | | // Merges 2 adjacent blobs in the result (index and index+1) and corrects
633 | | // all the data to account for the change.
634 | | void MergeAdjacentBlobs(unsigned index);
635 | | |
636 | | // Callback helper for fix_quotes: returns a double quote if both
637 | | // arguments are quotes, otherwise INVALID_UNICHAR_ID.
638 | | UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
639 | | void fix_quotes();
640 | | 
641 | | // Callback helper for fix_hyphens: returns the UNICHAR_ID of - if both
642 | | // arguments are hyphens, otherwise INVALID_UNICHAR_ID.
643 | | UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2);
644 | | // Callback helper for fix_hyphens: returns true if box1 and box2 overlap
645 | | // (assuming both are on the same textline, in order, and a chopped em dash).
646 | | bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2);
647 | | void fix_hyphens();
648 | | 
649 | | // Callback helper for merge_tess_fails: returns a space if both
650 | | // arguments are spaces, otherwise INVALID_UNICHAR_ID.
651 | | UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
652 | | void merge_tess_fails();
653 | | |
654 | | // Returns a new-allocated deep copy of *src (src must not be null), including the ratings MATRIX.
655 | 0 | static WERD_RES *deep_copy(const WERD_RES *src) {
656 | 0 | auto *result = new WERD_RES(*src); // Raw new: the caller receives an owning pointer.
657 | | // That didn't copy the ratings, but we want a copy if there is one to
658 | | // begin with.
659 | 0 | if (src->ratings != nullptr) {
660 | 0 | result->ratings = src->ratings->DeepCopy();
661 | 0 | }
662 | 0 | return result;
663 | 0 | }
664 | | |
665 | | // Copies blobs from word_res onto this word (eliminating spaces between).
666 | | // Since this may be called in either direction, ORs the BOL and EOL flags of both words.
667 | 33.9k | void copy_on(WERD_RES *word_res) { // word_res: the word to copy from
668 | 33.9k | word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));
669 | 33.9k | word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
670 | 33.9k | word->copy_on(word_res->word);
671 | 33.9k | }
672 | | |
673 | | // Returns true if the count pieces, starting at start, are all
674 | | // natural connected components, i.e. there are no real chops involved.
675 | | bool PiecesAllNatural(int start, int count) const;
676 | | }; |
677 | | |
678 | | /************************************************************************* |
679 | | * PAGE_RES_IT - Page results iterator |
680 | | *************************************************************************/ |
681 | | |
682 | | class TESS_API PAGE_RES_IT {
683 | | public:
684 | | PAGE_RES *page_res = nullptr; // page being iterated
685 | | 
686 | 0 | PAGE_RES_IT() = default; // All pointer members start out as nullptr.
687 | | 
688 | 603k | PAGE_RES_IT(PAGE_RES *the_page_res) { // page result
689 | 603k | page_res = the_page_res;
690 | 603k | restart_page(); // ready to scan
691 | 603k | }
692 | | 
693 | | // Do two PAGE_RES_ITs point at the same word?
694 | | // This is much cheaper than cmp().
695 | 1.37M | bool operator==(const PAGE_RES_IT &other) const {
696 | 1.37M | return word_res == other.word_res && row_res == other.row_res &&
697 | 1.37M | block_res == other.block_res;
698 | 1.37M | }
699 | | 
700 | 0 | bool operator!=(const PAGE_RES_IT &other) const {
701 | 0 | return !(*this == other);
702 | 0 | }
703 | | 
704 | | // Given another PAGE_RES_IT to the same page, returns:
705 | | // this before other: -1
706 | | // this equal to other: 0
707 | | // this later than other: 1
708 | | int cmp(const PAGE_RES_IT &other) const;
709 | | 
710 | 4.44M | WERD_RES *restart_page() {
711 | 4.44M | return start_page(false); // Skip empty blocks.
712 | 4.44M | }
713 | 14.7k | WERD_RES *restart_page_with_empties() {
714 | 14.7k | return start_page(true); // Allow empty blocks.
715 | 14.7k | }
716 | | WERD_RES *start_page(bool empty_ok);
717 | | 
718 | | WERD_RES *restart_row();
719 | | 
720 | | // ============ Methods that mutate the underlying structures ===========
721 | | // Note that these methods will potentially invalidate other PAGE_RES_ITs
722 | | // and are intended to be used only while a single PAGE_RES_IT is active.
723 | | // This problem needs to be taken into account if these mutation operators
724 | | // are ever provided to PageIterator or its subclasses.
725 | | 
726 | | // Inserts the new_word and a corresponding WERD_RES before the current
727 | | // position. The simple fields of the WERD_RES are copied from clone_res and
728 | | // the resulting WERD_RES is returned for further setup with best_choice etc.
729 | | WERD_RES *InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word);
730 | | 
731 | | // Replaces the current WERD/WERD_RES with the given words. The given words
732 | | // contain fake blobs that indicate the position of the characters. These are
733 | | // replaced with real blobs from the current word as much as possible.
734 | | void ReplaceCurrentWord(PointerVector<WERD_RES> *words);
735 | | 
736 | | // Deletes the current WERD_RES and its underlying WERD.
737 | | void DeleteCurrentWord();
738 | | 
739 | | // Makes the current word a fuzzy space if not already fuzzy. Updates
740 | | // corresponding part of combo if required.
741 | | void MakeCurrentWordFuzzy();
742 | | 
743 | 65.9M | WERD_RES *forward() { // Get next word.
744 | 65.9M | return internal_forward(false, false);
745 | 65.9M | }
746 | | // Move forward, but allow empty blocks to show as single nullptr words.
747 | 4.45M | WERD_RES *forward_with_empties() {
748 | 4.45M | return internal_forward(false, true);
749 | 4.45M | }
750 | | 
751 | | WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph
752 | | WERD_RES *forward_block(); // get first word in next non-empty block
753 | | 
754 | 0 | WERD_RES *prev_word() const { // previous word
755 | 0 | return prev_word_res;
756 | 0 | }
757 | 4.23M | ROW_RES *prev_row() const { // row of prev word
758 | 4.23M | return prev_row_res;
759 | 4.23M | }
760 | 876k | BLOCK_RES *prev_block() const { // block of prev word
761 | 876k | return prev_block_res;
762 | 876k | }
763 | 43.8M | WERD_RES *word() const { // current word
764 | 43.8M | return word_res;
765 | 43.8M | }
766 | 78.9M | ROW_RES *row() const { // row of current word
767 | 78.9M | return row_res;
768 | 78.9M | }
769 | 20.2M | BLOCK_RES *block() const { // block of cur. word
770 | 20.2M | return block_res;
771 | 20.2M | }
772 | 0 | WERD_RES *next_word() const { // next word
773 | 0 | return next_word_res;
774 | 0 | }
775 | 0 | ROW_RES *next_row() const { // row of next word
776 | 0 | return next_row_res;
777 | 0 | }
778 | 0 | BLOCK_RES *next_block() const { // block of next word
779 | 0 | return next_block_res;
780 | 0 | }
781 | | void rej_stat_word(); // for page/block/row
782 | | void ResetWordIterator();
783 | | 
784 | | private:
785 | | WERD_RES *internal_forward(bool new_block, bool empty_ok);
786 | | 
787 | | WERD_RES *prev_word_res = nullptr; // previous word
788 | | ROW_RES *prev_row_res = nullptr; // row of prev word
789 | | BLOCK_RES *prev_block_res = nullptr; // block of prev word
790 | | 
791 | | WERD_RES *word_res = nullptr; // current word
792 | | ROW_RES *row_res = nullptr; // row of current word
793 | | BLOCK_RES *block_res = nullptr; // block of cur. word
794 | | 
795 | | WERD_RES *next_word_res = nullptr; // next word
796 | | ROW_RES *next_row_res = nullptr; // row of next word
797 | | BLOCK_RES *next_block_res = nullptr; // block of next word
798 | | 
799 | | BLOCK_RES_IT block_res_it; // iterators
800 | | ROW_RES_IT row_res_it;
801 | | WERD_RES_IT word_res_it;
802 | | // Iterators used to get the state of word_res_it for the current word.
803 | | // Since word_res_it is 2 words further on, this is otherwise hard to do.
804 | | WERD_RES_IT wr_it_of_current_word;
805 | | WERD_RES_IT wr_it_of_next_word;
806 | | };
807 | | |
808 | | } // namespace tesseract |
809 | | |
810 | | #endif |