Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/wordrec/wordrec.h
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        wordrec.h
3
// Description: wordrec class.
4
// Author:      Samuel Charron
5
//
6
// (C) Copyright 2006, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#ifndef TESSERACT_WORDREC_WORDREC_H_
20
#define TESSERACT_WORDREC_WORDREC_H_
21
22
#ifdef HAVE_CONFIG_H
23
#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
24
#endif
25
26
#ifdef DISABLED_LEGACY_ENGINE
27
28
#  include <cstdint>    // for int16_t, int32_t
29
#  include "classify.h" // for Classify
30
#  include "params.h"   // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...
31
#  include "ratngs.h"   // for WERD_CHOICE
32
33
namespace tesseract {
34
class TessdataManager;
35
}
36
37
namespace tesseract {
38
39
/* ccmain/tstruct.cpp */
40
41
class TESS_API Wordrec : public Classify {
42
public:
43
  // config parameters
44
45
  BOOL_VAR_H(wordrec_debug_blamer);
46
  BOOL_VAR_H(wordrec_run_blamer);
47
48
  // methods
49
  Wordrec();
50
  virtual ~Wordrec() = default;
51
52
  // tface.cpp
53
  void program_editup(const std::string &textbase, TessdataManager *init_classifier,
54
                      TessdataManager *init_dict);
55
  void program_editdown(int32_t elapsed_time);
56
  int end_recog();
57
  int dict_word(const WERD_CHOICE &word);
58
59
  // Member variables
60
  WERD_CHOICE *prev_word_best_choice_;
61
};
62
63
} // namespace tesseract
64
65
#else // DISABLED_LEGACY_ENGINE not defined
66
67
#  include <memory>
68
#  include "associate.h"
69
#  include "chop.h"     // for PointHeap, MAX_NUM_POINTS
70
#  include "classify.h" // for Classify
71
#  include "dict.h"
72
#  include "elst.h"     // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
73
#  include "findseam.h" // for SeamQueue, SeamPile
74
#  include "language_model.h"
75
#  include "matrix.h"
76
#  include "oldlist.h" // for LIST
77
#  include "params.h"  // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...
78
#  include "points.h"  // for ICOORD
79
#  include "ratngs.h"  // for BLOB_CHOICE_LIST (ptr only), BLOB_CHOI...
80
#  include "seam.h"    // for SEAM (ptr only), PRIORITY
81
#  include "stopper.h" // for DANGERR
82
83
#  include <cstdint> // for int16_t, int32_t
84
85
namespace tesseract {
86
87
class EDGEPT_CLIST;
88
class MATRIX;
89
class TBOX;
90
class UNICHARSET;
91
class WERD_RES;
92
93
class LMPainPoints;
94
class TessdataManager;
95
struct BestChoiceBundle;
96
97
struct BlamerBundle;
98
struct EDGEPT;
99
struct MATRIX_COORD;
100
struct SPLIT;
101
struct TBLOB;
102
struct TESSLINE;
103
struct TWERD;
104
105
// A class for storing which nodes are to be processed by the segmentation
106
// search. There is a single SegSearchPending for each column in the ratings
107
// matrix, and it indicates whether the segsearch should combine all
108
// BLOB_CHOICES in the column, or just the given row with the parents
109
// corresponding to *this SegSearchPending, and whether only updated parent
110
// ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs.
111
class SegSearchPending {
112
public:
113
  SegSearchPending()
114
333k
      : classified_row_(-1), revisit_whole_column_(false), column_classified_(false) {}
115
116
  // Marks the whole column as just classified. Used to start a search on
117
  // a newly initialized ratings matrix.
118
144k
  void SetColumnClassified() {
119
144k
    column_classified_ = true;
120
144k
  }
121
  // Marks the matrix entry at the given row as just classified.
122
  // Used after classifying a new matrix cell.
123
  // Additional to, not overriding a previous RevisitWholeColumn.
124
1.36M
  void SetBlobClassified(int row) {
125
1.36M
    classified_row_ = row;
126
1.36M
  }
127
  // Marks the whole column as needing work, but not just classified.
128
  // Used when the parent vse list is updated.
129
  // Additional to, not overriding a previous SetBlobClassified.
130
2.44M
  void RevisitWholeColumn() {
131
2.44M
    revisit_whole_column_ = true;
132
2.44M
  }
133
134
  // Clears *this to indicate no work to do.
135
17.2M
  void Clear() {
136
17.2M
    classified_row_ = -1;
137
17.2M
    revisit_whole_column_ = false;
138
17.2M
    column_classified_ = false;
139
17.2M
  }
140
141
  // Returns true if there are updates to do in the column that *this
142
  // represents.
143
10.7M
  bool WorkToDo() const {
144
10.7M
    return revisit_whole_column_ || column_classified_ || classified_row_ >= 0;
145
10.7M
  }
146
  // Returns true if the given row was just classified.
147
4.47M
  bool IsRowJustClassified(int row) const {
148
4.47M
    return row == classified_row_ || column_classified_;
149
4.47M
  }
150
  // Returns the single row to process if there is only one, otherwise -1.
151
4.72M
  int SingleRow() const {
152
4.72M
    return revisit_whole_column_ || column_classified_ ? -1 : classified_row_;
153
4.72M
  }
154
155
private:
156
  // If non-negative, indicates the single row in the ratings matrix that has
157
  // just been classified, and so should be combined with all the parents in the
158
  // column that this SegSearchPending represents.
159
  // Operates independently of revisit_whole_column.
160
  int classified_row_;
161
  // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will
162
  // be processed, but classified_row can indicate a row that is newly
163
  // classified. Overridden if column_classified is true.
164
  bool revisit_whole_column_;
165
  // If column_classified is true, parent vses are processed with all rows
166
  // regardless of whether they are just updated, overriding
167
  // revisit_whole_column and classified_row.
168
  bool column_classified_;
169
};
170
171
/* ccmain/tstruct.cpp *********************************************************/
172
class FRAGMENT : public ELIST<FRAGMENT>::LINK {
173
public:
174
0
  FRAGMENT() { // constructor
175
0
  }
176
  FRAGMENT(EDGEPT *head_pt,  // start
177
           EDGEPT *tail_pt); // end
178
179
  ICOORD head;    // coords of start
180
  ICOORD tail;    // coords of end
181
  EDGEPT *headpt; // start point
182
  EDGEPT *tailpt; // end point
183
};
184
ELISTIZEH(FRAGMENT)
185
186
class TESS_API Wordrec : public Classify {
187
public:
188
  // config parameters *******************************************************
189
  BOOL_VAR_H(merge_fragments_in_matrix);
190
  BOOL_VAR_H(wordrec_enable_assoc);
191
  BOOL_VAR_H(force_word_assoc);
192
  INT_VAR_H(repair_unchopped_blobs);
193
  double_VAR_H(tessedit_certainty_threshold);
194
  INT_VAR_H(chop_debug);
195
  BOOL_VAR_H(chop_enable);
196
  BOOL_VAR_H(chop_vertical_creep);
197
  INT_VAR_H(chop_split_length);
198
  INT_VAR_H(chop_same_distance);
199
  INT_VAR_H(chop_min_outline_points);
200
  INT_VAR_H(chop_seam_pile_size);
201
  BOOL_VAR_H(chop_new_seam_pile);
202
  INT_VAR_H(chop_inside_angle);
203
  INT_VAR_H(chop_min_outline_area);
204
  double_VAR_H(chop_split_dist_knob);
205
  double_VAR_H(chop_overlap_knob);
206
  double_VAR_H(chop_center_knob);
207
  INT_VAR_H(chop_centered_maxwidth);
208
  double_VAR_H(chop_sharpness_knob);
209
  double_VAR_H(chop_width_change_knob);
210
  double_VAR_H(chop_ok_split);
211
  double_VAR_H(chop_good_split);
212
  INT_VAR_H(chop_x_y_weight);
213
  BOOL_VAR_H(assume_fixed_pitch_char_segment);
214
  INT_VAR_H(wordrec_debug_level);
215
  INT_VAR_H(wordrec_max_join_chunks);
216
  BOOL_VAR_H(wordrec_skip_no_truth_words);
217
  BOOL_VAR_H(wordrec_debug_blamer);
218
  BOOL_VAR_H(wordrec_run_blamer);
219
  INT_VAR_H(segsearch_debug_level);
220
  INT_VAR_H(segsearch_max_pain_points);
221
  INT_VAR_H(segsearch_max_futile_classifications);
222
  double_VAR_H(segsearch_max_char_wh_ratio);
223
  BOOL_VAR_H(save_alt_choices);
224
225
  // methods from wordrec/*.cpp ***********************************************
226
  Wordrec();
227
0
  ~Wordrec() override = default;
228
229
  // Fills word->alt_choices with alternative paths found during
230
  // chopping/segmentation search that are kept in best_choices.
231
  void SaveAltChoices(const LIST &best_choices, WERD_RES *word);
232
233
  // Fills character choice lattice in the given BlamerBundle
234
  // using the given ratings matrix and best choice list.
235
  void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
236
                   const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
237
238
  // Calls fill_lattice_ member function
239
  // (assumes that fill_lattice_ is not nullptr).
240
  void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
241
0
                       const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) {
242
0
    (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
243
0
  }
244
245
  // tface.cpp
246
  void program_editup(const std::string &textbase, TessdataManager *init_classifier,
247
                      TessdataManager *init_dict);
248
  void cc_recog(WERD_RES *word);
249
  void program_editdown(int32_t elapsed_time);
250
  void set_pass1();
251
  void set_pass2();
252
  int end_recog();
253
  BLOB_CHOICE_LIST *call_matcher(TBLOB *blob);
254
  int dict_word(const WERD_CHOICE &word);
255
  // wordclass.cpp
256
  BLOB_CHOICE_LIST *classify_blob(TBLOB *blob, const char *string, ScrollView::Color color,
257
                                  BlamerBundle *blamer_bundle);
258
259
  // segsearch.cpp
260
  // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.
261
  // Each entry in the matrix represents the classification choice
262
  // for a chunk, i.e. an entry in row 2, column 1 represents the list
263
  // of ratings for the chunks 1 and 2 classified as a single blob.
264
  // The entries on the diagonal of the matrix are classifier choice lists
265
  // for a single chunk from the maximal segmentation.
266
  //
267
  // The ratings matrix given to SegSearch represents the segmentation
268
  // graph / trellis for the current word. The nodes in the graph are the
269
  // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings
270
  // matrix. The children of each node (nodes connected by outgoing links)
271
  // are the entries in the column that is equal to node's row+1. The parents
272
  // (nodes connected by the incoming links) are the entries in the row that
273
  // is equal to the node's column-1. Here is an example ratings matrix:
274
  //
275
  //    0    1    2   3   4
276
  //  -------------------------
277
  // 0| c,(                   |
278
  // 1| d    l,1              |
279
  // 2|           o           |
280
  // 3|              c,(      |
281
  // 4|              g,y  l,1 |
282
  //  -------------------------
283
  //
284
  // In the example above node "o" has children (outgoing connection to nodes)
285
  // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d".
286
  //
287
  // The objective of the search is to find the least cost path, where the cost
288
  // is determined by the language model components and the properties of the
289
  // cut between the blobs on the path. SegSearch starts by populating the
290
  // matrix with the all the entries that were classified by the chopper and
291
  // finding the initial best path. Based on the classifier ratings, language
292
  // model scores and the properties of each cut, a list of "pain points" is
293
  // constructed - those are the points on the path where the choices do not
294
  // look consistent with the neighboring choices, the cuts look particularly
295
  // problematic, or the certainties of the blobs are low. The most troublesome
296
  // "pain point" is picked from the list and the new entry in the ratings
297
  // matrix corresponding to this "pain point" is filled in. Then the language
298
  // model state is updated to reflect the new classification and the new
299
  // "pain points" are added to the list and the next most troublesome
300
  // "pain point" is determined. This continues until either the word choice
301
  // composed from the best paths in the segmentation graph is "good enough"
302
  // (e.g. above a certain certainty threshold, is an unambiguous dictionary
303
  // word, etc) or there are no more "pain points" to explore.
304
  //
305
  // If associate_blobs is set to false no new classifications will be done
306
  // to combine blobs. Segmentation search will run only one "iteration"
307
  // on the classifications already recorded in chunks_record.ratings.
308
  //
309
  // Note: this function assumes that word_res, best_choice_bundle arguments
310
  // are not nullptr.
311
  void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
312
                 BlamerBundle *blamer_bundle);
313
314
  // Setup and run just the initial segsearch on an established matrix,
315
  // without doing any additional chopping or joining.
316
  // (Internal factored version that can be used as part of the main SegSearch.)
317
  void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
318
                        std::vector<SegSearchPending> *pending,
319
                        BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
320
321
  // chop.cpp
322
  PRIORITY point_priority(EDGEPT *point);
323
  void add_point_to_list(PointHeap *point_heap, EDGEPT *point);
324
  // Returns true if the edgept supplied as input is an inside angle.  This
325
  // is determined by the angular change of the vectors from point to point.
326
  bool is_inside_angle(EDGEPT *pt);
327
  int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);
328
  EDGEPT *pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist);
329
  void prioritize_points(TESSLINE *outline, PointHeap *points);
330
  void new_min_point(EDGEPT *local_min, PointHeap *points);
331
  void new_max_point(EDGEPT *local_max, PointHeap *points);
332
  void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point,
333
                                 EDGEPT_CLIST *new_points);
334
335
  // chopper.cpp
336
  SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
337
                          const std::vector<SEAM *> &seams);
338
  SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
339
                           const std::vector<SEAM *> &seams);
340
  SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res,
341
                              unsigned *blob_number);
342
  SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
343
                         bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
344
                         unsigned *blob_number);
345
  SEAM *chop_one_blob(const std::vector<TBOX> &boxes,
346
                      const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
347
                      unsigned *blob_number);
348
  void chop_word_main(WERD_RES *word);
349
  void improve_by_chopping(float rating_cert_scale, WERD_RES *word,
350
                           BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
351
                           LMPainPoints *pain_points, std::vector<SegSearchPending> *pending);
352
  int select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices, float rating_ceiling,
353
                           bool split_next_to_fragment);
354
  int select_blob_to_split_from_fixpt(DANGERR *fixpt);
355
356
  // findseam.cpp
357
  void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams);
358
  void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority,
359
                        SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile);
360
  void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue);
361
  SEAM *pick_good_seam(TBLOB *blob);
362
  void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue,
363
                       SeamPile *seam_pile, SEAM **seam, TBLOB *blob);
364
  void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points,
365
                           EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile,
366
                           SEAM **seam, TBLOB *blob);
367
368
  // gradechop.cpp
369
  PRIORITY grade_split_length(SPLIT *split);
370
  PRIORITY grade_sharpness(SPLIT *split);
371
372
  // outlines.cpp
373
  bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt);
374
375
  // pieces.cpp
376
  virtual BLOB_CHOICE_LIST *classify_piece(const std::vector<SEAM *> &seams, int16_t start,
377
                                           int16_t end, const char *description, TWERD *word,
378
                                           BlamerBundle *blamer_bundle);
379
380
  // Member variables.
381
382
  std::unique_ptr<LanguageModel> language_model_;
383
  PRIORITY pass2_ok_split;
384
  // Stores the best choice for the previous word in the paragraph.
385
  // This variable is modified by PAGE_RES_IT when iterating over
386
  // words to OCR on the page.
387
  WERD_CHOICE *prev_word_best_choice_;
388
389
  // Function used to fill char choice lattices.
390
  void (Wordrec::*fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
391
                                 const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
392
393
protected:
394
2.25M
  inline bool SegSearchDone(int num_futile_classifications) {
395
2.25M
    return (language_model_->AcceptableChoiceFound() ||
396
2.25M
            num_futile_classifications >= segsearch_max_futile_classifications);
397
2.25M
  }
398
399
  // Updates the language model state recorded for the child entries specified
400
  // in pending[starting_col]. Enqueues the children of the updated entries
401
  // into pending and proceeds to update (and remove from pending) all the
402
  // remaining entries in pending[col] (col >= starting_col). Upon termination
403
  // of this function all the pending[col] lists will be empty.
404
  //
405
  // The arguments:
406
  //
407
  // starting_col: index of the column in chunks_record->ratings from
408
  // which the update should be started
409
  //
410
  // pending: list of entries listing chunks_record->ratings entries
411
  // that should be updated
412
  //
413
  // pain_points: priority heap listing the pain points generated by
414
  // the language model
415
  //
416
  // temp_pain_points: temporary storage for tentative pain points generated
417
  // by the language model after a single call to LanguageModel::UpdateState()
418
  // (the argument is passed in rather than created before each
419
  // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)
420
  //
421
  // best_choice_bundle: a collection of variables that should be updated
422
  // if a new best choice is found
423
  //
424
  void UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
425
                            std::vector<SegSearchPending> *pending, WERD_RES *word_res,
426
                            LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
427
                            BlamerBundle *blamer_bundle);
428
429
  // Process the given pain point: classify the corresponding blob, enqueue
430
  // new pain points to join the newly classified blob with its neighbors.
431
  void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,
432
                                 const char *pain_point_type,
433
                                 std::vector<SegSearchPending> *pending, WERD_RES *word_res,
434
                                 LMPainPoints *pain_points, BlamerBundle *blamer_bundle);
435
  // Resets enough of the results so that the Viterbi search is re-run.
436
  // Needed when the n-gram model is enabled, as the multi-length comparison
437
  // implementation will re-value existing paths to worse values.
438
  void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
439
                        std::vector<SegSearchPending> &pending);
440
441
  // Add pain points for classifying blobs on the correct segmentation path
442
  // (so that we can evaluate correct segmentation path and discover the reason
443
  // for incorrect result).
444
  void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
445
                              BlamerBundle *blamer_bundle, std::string &blamer_debug);
446
};
447
448
} // namespace tesseract
449
450
#endif // DISABLED_LEGACY_ENGINE
451
452
#endif // TESSERACT_WORDREC_WORDREC_H_