Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccstruct/pageres.h
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        pageres.h  (Formerly page_res.h)
3
 * Description: Results classes used by control.c
4
 * Author:      Phil Cheatle
5
 *
6
 * (C) Copyright 1992, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
#ifndef PAGERES_H
20
#define PAGERES_H
21
22
#include "blamer.h"        // for BlamerBundle (ptr only), IRR_NUM_REASONS
23
#include "clst.h"          // for CLIST_ITERATOR, CLISTIZEH
24
#include "elst.h"          // for ELIST_ITERATOR, ELIST_LINK, ELISTIZEH
25
#include "genericvector.h" // for PointerVector
26
#include "matrix.h"        // for MATRIX
27
#include "normalis.h"      // for DENORM
28
#include "ratngs.h"        // for WERD_CHOICE, BLOB_CHOICE (ptr only)
29
#include "rect.h"          // for TBOX
30
#include "rejctmap.h"      // for REJMAP
31
#include "unicharset.h"    // for UNICHARSET, UNICHARSET::Direction, UNI...
32
#include "werd.h"          // for WERD, W_BOL, W_EOL
33
34
#include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
35
36
#include <cstdint>    // for int32_t, int16_t
37
#include <functional> // for std::function
38
#include <set>        // for std::pair
39
#include <vector>     // for std::vector
40
41
#include <sys/types.h> // for int8_t
42
43
struct Pix;
44
45
namespace tesseract {
46
47
class BLOCK;
48
class BLOCK_LIST;
49
class BLOCK_RES;
50
class ROW;
51
class ROW_RES;
52
class SEAM;
53
class WERD_RES;
54
55
struct TWERD;
56
57
class BoxWord;
58
class Tesseract;
59
struct FontInfo;
60
61
/* Forward declarations */
62
63
class BLOCK_RES;
64
65
ELISTIZEH(BLOCK_RES)
66
CLISTIZEH(BLOCK_RES)
67
class ROW_RES;
68
69
ELISTIZEH(ROW_RES)
70
class WERD_RES;
71
72
ELISTIZEH(WERD_RES)
73
74
/*************************************************************************
75
 * PAGE_RES - Page results
76
 *************************************************************************/
77
class PAGE_RES { // page result
78
public:
79
  int32_t char_count;
80
  int32_t rej_count;
81
  BLOCK_RES_LIST block_res_list;
82
  bool rejected;
83
  // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to
84
  // the next word. This pointer is not owned by PAGE_RES class.
85
  WERD_CHOICE **prev_word_best_choice;
86
  // Sums of blame reasons computed by the blamer.
87
  std::vector<int> blame_reasons;
88
  // Debug information about all the misadaptions on this page.
89
  // Each BlamerBundle contains an index into this vector, so that words that
90
  // caused misadaption could be marked. However, since words could be
91
  // deleted/split/merged, the log is stored on the PAGE_RES level.
92
  std::vector<std::string> misadaption_log;
93
94
7.72k
  inline void Init() {
95
7.72k
    char_count = 0;
96
7.72k
    rej_count = 0;
97
7.72k
    rejected = false;
98
7.72k
    prev_word_best_choice = nullptr;
99
7.72k
    blame_reasons.clear();
100
7.72k
    blame_reasons.resize(IRR_NUM_REASONS);
101
7.72k
  }
102
103
0
  PAGE_RES() {
104
0
    Init();
105
0
  } // empty constructor
106
107
  PAGE_RES(bool merge_similar_words,
108
           BLOCK_LIST *block_list, // real blocks
109
           WERD_CHOICE **prev_word_best_choice_ptr);
110
111
7.72k
  ~PAGE_RES() = default;
112
};
113
114
/*************************************************************************
115
 * BLOCK_RES - Block results
116
 *************************************************************************/
117
118
class BLOCK_RES : public ELIST<BLOCK_RES>::LINK {
119
public:
120
  BLOCK *block;       // real block
121
  int32_t char_count; // chars in block
122
  int32_t rej_count;  // rejected chars
123
  int16_t font_class; //
124
  int16_t row_count;
125
  float x_height;
126
  bool font_assigned; // block already
127
  //      processed
128
129
  ROW_RES_LIST row_res_list;
130
131
  BLOCK_RES() = default;
132
133
  BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block
134
135
7.06k
  ~BLOCK_RES() = default;
136
};
137
138
/*************************************************************************
139
 * ROW_RES - Row results
140
 *************************************************************************/
141
142
class ROW_RES : public ELIST<ROW_RES>::LINK {
143
public:
144
  ROW *row;                     // real row
145
  int32_t char_count;           // chars in block
146
  int32_t rej_count;            // rejected chars
147
  int32_t whole_word_rej_count; // rejs in total rej wds
148
  WERD_RES_LIST word_res_list;
149
150
  ROW_RES() = default;
151
152
  ROW_RES(bool merge_similar_words, ROW *the_row); // real row
153
154
95.5k
  ~ROW_RES() = default;
155
};
156
157
/*************************************************************************
158
 * WERD_RES - Word results
159
 *************************************************************************/
160
enum CRUNCH_MODE { CR_NONE, CR_KEEP_SPACE, CR_LOOSE_SPACE, CR_DELETE };
161
162
// WERD_RES is a collection of publicly accessible members that gathers
163
// information about a word result.
164
class TESS_API WERD_RES : public ELIST<WERD_RES>::LINK {
165
public:
166
  // Which word is which?
167
  // There are 3 coordinate spaces in use here: a possibly rotated pixel space,
168
  // the original image coordinate space, and the BLN space in which the
169
  // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,
170
  // and the x-middle of the word is at 0.
171
  // In the rotated pixel space, coordinates correspond to the input image,
172
  // but may be rotated about the origin by a multiple of 90 degrees,
173
  // and may therefore be negative.
174
  // In any case a rotation by denorm.block()->re_rotation() will take them
175
  // back to the original image.
176
  // The other differences between words all represent different stages of
177
  // processing during recognition.
178
179
  // ---------------------------INPUT-------------------------------------
180
181
  // The word is the input C_BLOBs in the rotated pixel space.
182
  // word is NOT owned by the WERD_RES unless combination is true.
183
  // All the other word pointers ARE owned by the WERD_RES.
184
  WERD *word = nullptr; // Input C_BLOB word.
185
186
  // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
187
188
  // The bln_boxes contains the bounding boxes (only) of the input word, in the
189
  // BLN space. The lengths of word and bln_boxes
190
  // match as they are both before any chopping.
191
  // TODO(rays) determine if docqual does anything useful and delete bln_boxes
192
  // if it doesn't.
193
  tesseract::BoxWord *bln_boxes = nullptr; // BLN input bounding boxes.
194
  // The ROW that this word sits in. NOT owned by the WERD_RES.
195
  ROW *blob_row = nullptr;
196
  // The denorm provides the transformation to get back to the rotated image
197
  // coords from the chopped_word/rebuild_word BLN coords, but each blob also
198
  // has its own denorm.
199
  DENORM denorm; // For use on chopped_word.
200
  // Unicharset used by the classifier output in best_choice and raw_choice.
201
  const UNICHARSET *uch_set = nullptr; // For converting back to utf8.
202
203
  // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
204
  // ----Setup to a (different!) state expected by the various classifiers----
205
  // TODO(rays) Tidy and make more consistent.
206
207
  // The chopped_word is also in BLN space, and represents the fully chopped
208
  // character fragments that make up the word.
209
  // The length of chopped_word matches length of seam_array + 1 (if set).
210
  TWERD *chopped_word = nullptr; // BLN chopped fragments output.
211
  // Vector of SEAM* holding chopping points matching chopped_word.
212
  std::vector<SEAM *> seam_array;
213
  // Widths of blobs in chopped_word.
214
  std::vector<int> blob_widths;
215
  // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
216
  // blob i and blob i+1.
217
  std::vector<int> blob_gaps;
218
  // Stores the lstm choices of every timestep
219
  std::vector<std::vector<std::pair<const char *, float>>> timesteps;
220
  // Stores the lstm choices of every timestep segmented by character
221
  std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
222
      segmented_timesteps;
223
  // Symbolchoices acquired during CTC
224
  std::vector<std::vector<std::pair<const char *, float>>> CTC_symbol_choices;
225
  // Stores if the timestep vector starts with a space
226
  bool leading_space = false;
227
  // Stores value when the word ends
228
  int end = 0;
229
  // Ratings matrix contains classifier choices for each classified combination
230
  // of blobs. The dimension is the same as the number of blobs in chopped_word
231
  // and the leading diagonal corresponds to classifier results of the blobs
232
  // in chopped_word. The state_ members of best_choice, raw_choice and
233
  // best_choices all correspond to this ratings matrix and allow extraction
234
  // of the blob choices for any given WERD_CHOICE.
235
  MATRIX *ratings = nullptr; // Owned pointer.
236
  // Pointer to the first WERD_CHOICE in best_choices. This is the result that
237
  // will be output from Tesseract. Note that this is now a borrowed pointer
238
  // and should NOT be deleted.
239
  WERD_CHOICE *best_choice = nullptr; // Borrowed pointer.
240
  // The best raw_choice found during segmentation search. Differs from the
241
  // best_choice by being the best result according to just the character
242
  // classifier, not taking any language model information into account.
243
  // Unlike best_choice, the pointer IS owned by this WERD_RES.
244
  WERD_CHOICE *raw_choice = nullptr; // Owned pointer.
245
  // Alternative results found during chopping/segmentation search stages.
246
  // Note that being an ELIST, best_choices owns the WERD_CHOICEs.
247
  WERD_CHOICE_LIST best_choices;
248
249
  // Truth bounding boxes, text and incorrect choice reason.
250
  BlamerBundle *blamer_bundle = nullptr;
251
252
  // --------------OUTPUT FROM RECOGNITION-------------------------------
253
  // --------------Not all fields are necessarily set.-------------------
254
  // ---best_choice, raw_choice *must* end up set, with a box_word-------
255
  // ---In complete output, the number of blobs in rebuild_word matches---
256
  // ---the number of boxes in box_word, the number of unichar_ids in---
257
  // ---best_choice, the number of ints in best_state, and the number---
258
  // ---of strings in correct_text--------------------------------------
259
  // ---SetupFake Sets everything to appropriate values if the word is---
260
  // ---known to be bad before recognition.------------------------------
261
262
  // The rebuild_word is also in BLN space, but represents the final best
263
  // segmentation of the word. Its length is therefore the same as box_word.
264
  TWERD *rebuild_word = nullptr; // BLN best segmented word.
265
  // The box_word is in the original image coordinate space. It is the
266
  // bounding boxes of the rebuild_word, after denormalization.
267
  // The length of box_word matches rebuild_word, best_state (if set) and
268
  // correct_text (if set), as well as best_choice and represents the
269
  // number of classified units in the output.
270
  tesseract::BoxWord *box_word = nullptr; // Denormalized output boxes.
271
  // The Tesseract that was used to recognize this word. Just a borrowed
272
  // pointer. Note: Tesseract's class definition is in a higher-level library.
273
  // We avoid introducing a cyclic dependency by not using the Tesseract
274
  // within WERD_RES. We are just storing it to provide access to it
275
  // for the top-level multi-language controller, and maybe for output of
276
  // the recognized language.
277
  // tesseract points to data owned elsewhere.
278
  tesseract::Tesseract *tesseract = nullptr;
279
  // The best_state stores the relationship between chopped_word and
280
  // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
281
  // adjacent blobs in chopped_word. The seams in seam_array are hidden
282
  // within a rebuild_word blob and revealed between them.
283
  std::vector<int> best_state; // Number of blobs in each best blob.
284
  // The correct_text is used during training and adaption to carry the
285
  // text to the training system without the need for a unicharset. There
286
  // is one entry in the vector for each blob in rebuild_word and box_word.
287
  std::vector<std::string> correct_text;
288
289
  // Less-well documented members.
290
  // TODO(rays) Add more documentation here.
291
  WERD_CHOICE *ep_choice = nullptr; // ep text TODO(rays) delete this.
292
  REJMAP reject_map;                // best_choice rejects
293
  bool tess_failed = false;
294
  /*
295
  If tess_failed is true, one of the following tests failed when Tess
296
  returned:
297
  - The outword blob list was not the same length as the best_choice string;
298
  - The best_choice string contained ALL blanks;
299
  - The best_choice string was zero length
300
*/
301
  bool tess_accepted = false;    // Tess thinks its ok?
302
  bool tess_would_adapt = false; // Tess would adapt?
303
  bool done = false;             // ready for output?
304
  bool small_caps = false;       // word appears to be small caps
305
  bool odd_size = false;         // word is bigger than line or leader dots.
306
  // The fontinfos are pointers to data owned by the classifier.
307
  const FontInfo *fontinfo = nullptr;
308
  const FontInfo *fontinfo2 = nullptr;
309
  int8_t fontinfo_id_count = 0;  // number of votes
310
  int8_t fontinfo_id2_count = 0; // number of votes
311
  bool guessed_x_ht = true;
312
  bool guessed_caps_ht = true;
313
  CRUNCH_MODE unlv_crunch_mode = CR_NONE;
314
  float x_height = 0.0f;       // post match estimate
315
  float caps_height = 0.0f;    // post match estimate
316
  float baseline_shift = 0.0f; // post match estimate.
317
  // Certainty score for the spaces either side of this word (LSTM mode).
318
  // MIN this value with the actual word certainty.
319
  float space_certainty = 0.0f;
320
321
  /*
322
  To deal with fuzzy spaces we need to be able to combine "words" to form
323
  combinations when we suspect that the gap is a non-space. The (new) text
324
  ord code generates separate words for EVERY fuzzy gap - flags in the word
325
  indicate whether the gap is below the threshold (fuzzy kern) and is thus
326
  NOT a real word break by default, or above the threshold (fuzzy space) and
327
  this is a real word break by default.
328
329
  The WERD_RES list contains all these words PLUS "combination" words built
330
  out of (copies of) the words split by fuzzy kerns. The separate parts have
331
  their "part_of_combo" flag set true and should be IGNORED on a default
332
  reading of the list.
333
334
  Combination words are FOLLOWED by the sequence of part_of_combo words
335
  which they combine.
336
*/
337
  bool combination = false;   // of two fuzzy gap wds
338
  bool part_of_combo = false; // part of a combo
339
  bool reject_spaces = false; // Reject spacing?
340
341
141k
  WERD_RES() = default;
342
343
335k
  WERD_RES(WERD *the_word) {
344
335k
    word = the_word;
345
335k
  }
346
  // Deep copies everything except the ratings MATRIX.
347
  // To get that use deep_copy below.
348
26.8k
  WERD_RES(const WERD_RES &source) : ELIST<WERD_RES>::LINK(source) {
349
    // combination is used in function Clear which is called from operator=.
350
26.8k
    combination = false;
351
26.8k
    *this = source; // see operator=
352
26.8k
  }
353
354
  ~WERD_RES();
355
356
  // Returns the UTF-8 string for the given blob index in the best_choice word,
357
  // given that we know whether we are in a right-to-left reading context.
358
  // This matters for mirrorable characters such as parentheses.  We recognize
359
  // characters purely based on their shape on the page, and by default produce
360
  // the corresponding unicode for a left-to-right context.
361
488k
  const char *BestUTF8(unsigned blob_index, bool in_rtl_context) const {
362
488k
    if (best_choice == nullptr || blob_index >= best_choice->length()) {
363
0
      return nullptr;
364
0
    }
365
488k
    UNICHAR_ID id = best_choice->unichar_id(blob_index);
366
488k
    if (static_cast<unsigned>(id) >= uch_set->size()) {
367
0
      return nullptr;
368
0
    }
369
488k
    UNICHAR_ID mirrored = uch_set->get_mirror(id);
370
488k
    if (in_rtl_context && mirrored > 0) {
371
0
      id = mirrored;
372
0
    }
373
488k
    return uch_set->id_to_unichar_ext(id);
374
488k
  }
375
  // Returns the UTF-8 string for the given blob index in the raw_choice word.
376
0
  const char *RawUTF8(unsigned blob_index) const {
377
0
    if (blob_index >= raw_choice->length()) {
378
0
      return nullptr;
379
0
    }
380
0
    UNICHAR_ID id = raw_choice->unichar_id(blob_index);
381
0
    if (static_cast<unsigned>(id) >= uch_set->size()) {
382
0
      return nullptr;
383
0
    }
384
0
    return uch_set->id_to_unichar(id);
385
0
  }
386
387
0
  UNICHARSET::Direction SymbolDirection(unsigned blob_index) const {
388
0
    if (best_choice == nullptr || blob_index >= best_choice->length()) {
389
0
      return UNICHARSET::U_OTHER_NEUTRAL;
390
0
    }
391
0
    return uch_set->get_direction(best_choice->unichar_id(blob_index));
392
0
  }
393
394
3.79M
  bool AnyRtlCharsInWord() const {
395
3.79M
    if (uch_set == nullptr || best_choice == nullptr ||
396
3.79M
        best_choice->length() < 1) {
397
0
      return false;
398
0
    }
399
11.1M
    for (unsigned id = 0; id < best_choice->length(); id++) {
400
7.30M
      unsigned unichar_id = best_choice->unichar_id(id);
401
7.30M
      if (unichar_id >= uch_set->size()) {
402
0
        continue; // Ignore illegal chars.
403
0
      }
404
7.30M
      UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
405
7.30M
      if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
406
7.30M
          dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
407
0
        return true;
408
0
      }
409
7.30M
    }
410
3.79M
    return false;
411
3.79M
  }
412
413
3.79M
  bool AnyLtrCharsInWord() const {
414
3.79M
    if (uch_set == nullptr || best_choice == nullptr ||
415
3.79M
        best_choice->length() < 1) {
416
0
      return false;
417
0
    }
418
5.90M
    for (unsigned id = 0; id < best_choice->length(); id++) {
419
4.42M
      unsigned unichar_id = best_choice->unichar_id(id);
420
4.42M
      if (unichar_id >= uch_set->size()) {
421
0
        continue; // Ignore illegal chars.
422
0
      }
423
4.42M
      UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
424
4.42M
      if (dir == UNICHARSET::U_LEFT_TO_RIGHT ||
425
4.42M
          dir == UNICHARSET::U_ARABIC_NUMBER) {
426
2.31M
        return true;
427
2.31M
      }
428
4.42M
    }
429
1.47M
    return false;
430
3.79M
  }
431
432
  // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
433
  // that gave us the unichars in reading order (as opposed to strict left
434
  // to right).
435
0
  bool UnicharsInReadingOrder() const {
436
0
    return best_choice->unichars_in_script_order();
437
0
  }
438
439
  void Clear();
440
  void ClearResults();
441
  void ClearWordChoices();
442
  void ClearRatings();
443
444
  // Deep copies everything except the ratings MATRIX.
445
  // To get that use deep_copy below.
446
  WERD_RES &operator=(const WERD_RES &source); // from this
447
448
  void CopySimpleFields(const WERD_RES &source);
449
450
  // Initializes a blank (default constructed) WERD_RES from one that has
451
  // already been recognized.
452
  // Use SetupFor*Recognition afterwards to complete the setup and make
453
  // it ready for a retry recognition.
454
  void InitForRetryRecognition(const WERD_RES &source);
455
456
  // Sets up the members used in recognition: bln_boxes, chopped_word,
457
  // seam_array, denorm.  Returns false if
458
  // the word is empty and sets up fake results.  If use_body_size is
459
  // true and row->body_size is set, then body_size will be used for
460
  // blob normalization instead of xheight + ascrise. This flag is for
461
  // those languages that are using CJK pitch model and thus it has to
462
  // be true if and only if tesseract->textord_use_cjk_fp_model is
463
  // true.
464
  // If allow_detailed_fx is true, the feature extractor will receive fine
465
  // precision outline information, allowing smoother features and better
466
  // features on low resolution images.
467
  // The norm_mode sets the default mode for normalization in absence
468
  // of any of the above flags. It should really be a tesseract::OcrEngineMode
469
  // but is declared as int for ease of use with tessedit_ocr_engine_mode.
470
  // Returns false if the word is empty and sets up fake results.
471
  bool SetupForRecognition(const UNICHARSET &unicharset_in,
472
                           tesseract::Tesseract *tesseract, Image pix,
473
                           int norm_mode, const TBOX *norm_box,
474
                           bool numeric_mode, bool use_body_size,
475
                           bool allow_detailed_fx, ROW *row,
476
                           const BLOCK *block);
477
478
  // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
479
  // accumulators from a made chopped word.  We presume the fields are already
480
  // empty.
481
  void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
482
483
  // Sets up the members used in recognition for an empty recognition result:
484
  // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
485
  void SetupFake(const UNICHARSET &uch);
486
487
  // Set the word as having the script of the input unicharset.
488
  void SetupWordScript(const UNICHARSET &unicharset_in);
489
490
  // Sets up the blamer_bundle if it is not null, using the initialized denorm.
491
  void SetupBlamerBundle();
492
493
  // Computes the blob_widths and blob_gaps from the chopped_word.
494
  void SetupBlobWidthsAndGaps();
495
496
  // Updates internal data to account for a new SEAM (chop) at the given
497
  // blob_number. Fixes the ratings matrix and states in the choices, as well
498
  // as the blob widths and gaps.
499
  void InsertSeam(int blob_number, SEAM *seam);
500
501
  // Returns true if all the word choices except the first have adjust_factors
502
  // worse than the given threshold.
503
  bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
504
505
  // Returns true if the current word is ambiguous (by number of answers or
506
  // by dangerous ambigs.)
507
  bool IsAmbiguous();
508
509
  // Returns true if the ratings matrix size matches the sum of each of the
510
  // segmentation states.
511
  bool StatesAllValid();
512
513
  // Prints a list of words found if debug is true or the word result matches
514
  // the word_to_debug.
515
  void DebugWordChoices(bool debug, const char *word_to_debug);
516
517
  // Prints the top choice along with the accepted/done flags.
518
  void DebugTopChoice(const char *msg) const;
519
520
  // Removes from best_choices all choices which are not within a reasonable
521
  // range of the best choice.
522
  void FilterWordChoices(int debug_level);
523
524
  // Computes a set of distance thresholds used to control adaption.
525
  // Compares the best choice for the current word to the best raw choice
526
  // to determine which characters were classified incorrectly by the
527
  // classifier. Then places a separate threshold into thresholds for each
528
  // character in the word. If the classifier was correct, max_rating is placed
529
  // into thresholds. If the classifier was incorrect, the mean match rating
530
  // (error percentage) of the classifier's incorrect choice minus some margin
531
  // is placed into thresholds. This can then be used by the caller to try to
532
  // create a new template for the desired class that will classify the
533
  // character with a rating better than the threshold value. The match rating
534
  // placed into thresholds is never allowed to be below min_rating in order to
535
  // prevent trying to make overly tight templates.
536
  // min_rating limits how tight to make a template.
537
  // max_rating limits how loose to make a template.
538
  // rating_margin denotes the amount of margin to put in template.
539
  void ComputeAdaptionThresholds(float certainty_scale, float min_rating,
540
                                 float max_rating, float rating_margin,
541
                                 float *thresholds);
542
543
  // Saves a copy of the word_choice if it has the best unadjusted rating.
544
  // Returns true if the word_choice was the new best.
545
  bool LogNewRawChoice(WERD_CHOICE *word_choice);
546
  // Consumes word_choice by adding it to best_choices, (taking ownership) if
547
  // the certainty for word_choice is some distance of the best choice in
548
  // best_choices, or by deleting the word_choice and returning false.
549
  // The best_choices list is kept in sorted order by rating. Duplicates are
550
  // removed, and the list is kept no longer than max_num_choices in length.
551
  // Returns true if the word_choice is still a valid pointer.
552
  bool LogNewCookedChoice(int max_num_choices, bool debug,
553
                          WERD_CHOICE *word_choice);
554
555
  // Prints a brief list of all the best choices.
556
  void PrintBestChoices() const;
557
558
  // Returns the sum of the widths of the blobs between start_blob and last_blob
559
  // inclusive.
560
  int GetBlobsWidth(int start_blob, int last_blob) const;
561
  // Returns the width of a gap between the specified blob and the next one.
562
  int GetBlobsGap(unsigned blob_index) const;
563
564
  // Returns the BLOB_CHOICE corresponding to the given index in the
565
  // best choice word taken from the appropriate cell in the ratings MATRIX.
566
  // Borrowed pointer, so do not delete. May return nullptr if there is no
567
  // BLOB_CHOICE matching the unichar_id at the given index.
568
  BLOB_CHOICE *GetBlobChoice(unsigned index) const;
569
570
  // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
571
  // best choice word taken from the appropriate cell in the ratings MATRIX.
572
  // Borrowed pointer, so do not delete.
573
  BLOB_CHOICE_LIST *GetBlobChoices(int index) const;
574
575
  // Moves the results fields from word to this. This takes ownership of all
576
  // the data, so src can be destructed.
577
  // word1.ConsumeWordResults(word);
578
  // delete word;
579
  // is simpler and faster than:
580
  // word1 = *word;
581
  // delete word;
582
  // as it doesn't need to copy and reallocate anything.
583
  void ConsumeWordResults(WERD_RES *word);
584
585
  // Replace the best choice and rebuild box word.
586
  // choice must be from the current best_choices list.
587
  void ReplaceBestChoice(WERD_CHOICE *choice);
588
589
  // Builds the rebuild_word and sets the best_state from the chopped_word and
590
  // the best_choice->state.
591
  void RebuildBestState();
592
593
  // Copies the chopped_word to the rebuild_word, faking a best_state as well.
594
  // Also sets up the output box_word.
595
  void CloneChoppedToRebuild();
596
597
  // Sets/replaces the box_word with one made from the rebuild_word.
598
  void SetupBoxWord();
599
600
  // Sets up the script positions in the best_choice using the best_choice
601
  // to get the unichars, and the unicharset to get the target positions.
602
  void SetScriptPositions();
603
  // Sets all the blobs in all the words (best choice and alternates) to be
604
  // the given position. (When a sub/superscript is recognized as a separate
605
  // word, it falls victim to the rule that a whole word cannot be sub or
606
  // superscript, so this function overrides that problem.)
607
  void SetAllScriptPositions(tesseract::ScriptPos position);
608
609
  // Classifies the word with some already-calculated BLOB_CHOICEs.
610
  // The choices are an array of blob_count pointers to BLOB_CHOICE,
611
  // providing a single classifier result for each blob.
612
  // The BLOB_CHOICEs are consumed and the word takes ownership.
613
  // The number of blobs in the box_word must match blob_count.
614
  void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices);
615
616
  // Creates a WERD_CHOICE for the word using the top choices from the leading
617
  // diagonal of the ratings matrix.
618
  void FakeWordFromRatings(PermuterType permuter);
619
620
  // Copies the best_choice strings to the correct_text for adaption/training.
621
  void BestChoiceToCorrectText();
622
623
  // Merges 2 adjacent blobs in the result if the permanent callback
624
  // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
625
  // callback box_cb is nullptr or returns true, setting the merged blob
626
  // result to the class returned from class_cb.
627
  // Returns true if anything was merged.
628
  bool ConditionalBlobMerge(
629
      const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,
630
      const std::function<bool(const TBOX &, const TBOX &)> &box_cb);
631
632
  // Merges 2 adjacent blobs in the result (index and index+1) and corrects
633
  // all the data to account for the change.
634
  void MergeAdjacentBlobs(unsigned index);
635
636
  // Callback helper for fix_quotes returns a double quote if both
637
  // arguments are quote, otherwise INVALID_UNICHAR_ID.
638
  UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
639
  void fix_quotes();
640
641
  // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
642
  // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
643
  UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2);
644
  // Callback helper for fix_hyphens returns true if box1 and box2 overlap
645
  // (assuming both on the same textline, are in order and a chopped em dash.)
646
  bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2);
647
  void fix_hyphens();
648
649
  // Callback helper for merge_tess_fails returns a space if both
650
  // arguments are space, otherwise INVALID_UNICHAR_ID.
651
  UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
652
  void merge_tess_fails();
653
654
  // Returns a really deep copy of *src, including the ratings MATRIX.
655
0
  static WERD_RES *deep_copy(const WERD_RES *src) {
656
0
    auto *result = new WERD_RES(*src);
657
    // That didn't copy the ratings, but we want a copy if there is one to
658
    // begin with.
659
0
    if (src->ratings != nullptr) {
660
0
      result->ratings = src->ratings->DeepCopy();
661
0
    }
662
0
    return result;
663
0
  }
664
665
  // Copy blobs from word_res onto this word (eliminating spaces between).
666
  // Since this may be called bidirectionally, OR both the BOL and EOL flags.
667
33.9k
  void copy_on(WERD_RES *word_res) { // from this word
668
33.9k
    word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));
669
33.9k
    word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
670
33.9k
    word->copy_on(word_res->word);
671
33.9k
  }
672
673
  // Returns true if the collection of count pieces, starting at start, are all
674
  // natural connected components, ie there are no real chops involved.
675
  bool PiecesAllNatural(int start, int count) const;
676
};
677
678
/*************************************************************************
679
 * PAGE_RES_IT - Page results iterator
680
 *************************************************************************/
681
682
class TESS_API PAGE_RES_IT {
683
public:
684
  PAGE_RES *page_res; // page being iterated
685
686
0
  PAGE_RES_IT() = default;
687
688
603k
  PAGE_RES_IT(PAGE_RES *the_page_res) { // page result
689
603k
    page_res = the_page_res;
690
603k
    restart_page(); // ready to scan
691
603k
  }
692
693
  // Do two PAGE_RES_ITs point at the same word?
694
  // This is much cheaper than cmp().
695
1.37M
  bool operator==(const PAGE_RES_IT &other) const {
696
1.37M
    return word_res == other.word_res && row_res == other.row_res &&
697
1.37M
           block_res == other.block_res;
698
1.37M
  }
699
700
0
  bool operator!=(const PAGE_RES_IT &other) const {
701
0
    return !(*this == other);
702
0
  }
703
704
  // Given another PAGE_RES_IT to the same page,
705
  //  this before other:     -1
706
  //  this equal to other:    0
707
  //  this later than other:  1
708
  int cmp(const PAGE_RES_IT &other) const;
709
710
4.44M
  WERD_RES *restart_page() {
711
4.44M
    return start_page(false); // Skip empty blocks.
712
4.44M
  }
713
14.7k
  WERD_RES *restart_page_with_empties() {
714
14.7k
    return start_page(true); // Allow empty blocks.
715
14.7k
  }
716
  WERD_RES *start_page(bool empty_ok);
717
718
  WERD_RES *restart_row();
719
720
  // ============ Methods that mutate the underling structures ===========
721
  // Note that these methods will potentially invalidate other PAGE_RES_ITs
722
  // and are intended to be used only while a single PAGE_RES_IT is  active.
723
  // This problem needs to be taken into account if these mutation operators
724
  // are ever provided to PageIterator or its subclasses.
725
726
  // Inserts the new_word and a corresponding WERD_RES before the current
727
  // position. The simple fields of the WERD_RES are copied from clone_res and
728
  // the resulting WERD_RES is returned for further setup with best_choice etc.
729
  WERD_RES *InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word);
730
731
  // Replaces the current WERD/WERD_RES with the given words. The given words
732
  // contain fake blobs that indicate the position of the characters. These are
733
  // replaced with real blobs from the current word as much as possible.
734
  void ReplaceCurrentWord(PointerVector<WERD_RES> *words);
735
736
  // Deletes the current WERD_RES and its underlying WERD.
737
  void DeleteCurrentWord();
738
739
  // Makes the current word a fuzzy space if not already fuzzy. Updates
740
  // corresponding part of combo if required.
741
  void MakeCurrentWordFuzzy();
742
743
65.9M
  WERD_RES *forward() { // Get next word.
744
65.9M
    return internal_forward(false, false);
745
65.9M
  }
746
  // Move forward, but allow empty blocks to show as single nullptr words.
747
4.45M
  WERD_RES *forward_with_empties() {
748
4.45M
    return internal_forward(false, true);
749
4.45M
  }
750
751
  WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph
752
  WERD_RES *forward_block();     // get first word in next non-empty block
753
754
0
  WERD_RES *prev_word() const { // previous word
755
0
    return prev_word_res;
756
0
  }
757
4.23M
  ROW_RES *prev_row() const { // row of prev word
758
4.23M
    return prev_row_res;
759
4.23M
  }
760
876k
  BLOCK_RES *prev_block() const { // block of prev word
761
876k
    return prev_block_res;
762
876k
  }
763
43.8M
  WERD_RES *word() const { // current word
764
43.8M
    return word_res;
765
43.8M
  }
766
78.9M
  ROW_RES *row() const { // row of current word
767
78.9M
    return row_res;
768
78.9M
  }
769
20.2M
  BLOCK_RES *block() const { // block of cur. word
770
20.2M
    return block_res;
771
20.2M
  }
772
0
  WERD_RES *next_word() const { // next word
773
0
    return next_word_res;
774
0
  }
775
0
  ROW_RES *next_row() const { // row of next word
776
0
    return next_row_res;
777
0
  }
778
0
  BLOCK_RES *next_block() const { // block of next word
779
0
    return next_block_res;
780
0
  }
781
  void rej_stat_word(); // for page/block/row
782
  void ResetWordIterator();
783
784
private:
785
  WERD_RES *internal_forward(bool new_block, bool empty_ok);
786
787
  WERD_RES *prev_word_res;   // previous word
788
  ROW_RES *prev_row_res;     // row of prev word
789
  BLOCK_RES *prev_block_res; // block of prev word
790
791
  WERD_RES *word_res;   // current word
792
  ROW_RES *row_res;     // row of current word
793
  BLOCK_RES *block_res; // block of cur. word
794
795
  WERD_RES *next_word_res;   // next word
796
  ROW_RES *next_row_res;     // row of next word
797
  BLOCK_RES *next_block_res; // block of next word
798
799
  BLOCK_RES_IT block_res_it; // iterators
800
  ROW_RES_IT row_res_it;
801
  WERD_RES_IT word_res_it;
802
  // Iterators used to get the state of word_res_it for the current word.
803
  // Since word_res_it is 2 words further on, this is otherwise hard to do.
804
  WERD_RES_IT wr_it_of_current_word;
805
  WERD_RES_IT wr_it_of_next_word;
806
};
807
808
} // namespace tesseract
809
810
#endif