/src/tesseract/src/wordrec/wordrec.h

Source (jump to first uncovered line)
///////////////////////////////////////////////////////////////////////
// File:        wordrec.h
// Description: wordrec class.
// Author:      Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_WORDREC_WORDREC_H_
#define TESSERACT_WORDREC_WORDREC_H_

#ifdef HAVE_CONFIG_H
#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
#endif

#ifdef DISABLED_LEGACY_ENGINE

#  include <cstdint>    // for int16_t, int32_t
#  include "classify.h" // for Classify
#  include "params.h"   // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...
#  include "ratngs.h"   // for WERD_CHOICE

namespace tesseract {
class TessdataManager;
}

namespace tesseract {

/* ccmain/tstruct.cpp */

class TESS_API Wordrec : public Classify {
public:
  // config parameters

  BOOL_VAR_H(wordrec_debug_blamer);
  BOOL_VAR_H(wordrec_run_blamer);

  // methods
  Wordrec();
  virtual ~Wordrec() = default;

  // tface.cpp
  void program_editup(const std::string &textbase, TessdataManager *init_classifier,
                      TessdataManager *init_dict);
  void program_editdown(int32_t elapsed_time);
  int end_recog();
  int dict_word(const WERD_CHOICE &word);

  // Member variables
  WERD_CHOICE *prev_word_best_choice_;
};

} // namespace tesseract

#else // DISABLED_LEGACY_ENGINE not defined

#  include <memory>
#  include "associate.h"
#  include "chop.h"     // for PointHeap, MAX_NUM_POINTS
#  include "classify.h" // for Classify
#  include "dict.h"
#  include "elst.h"     // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
#  include "findseam.h" // for SeamQueue, SeamPile
#  include "language_model.h"
#  include "matrix.h"
#  include "oldlist.h" // for LIST
#  include "params.h"  // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...
#  include "points.h"  // for ICOORD
#  include "ratngs.h"  // for BLOB_CHOICE_LIST (ptr only), BLOB_CHOI...
#  include "seam.h"    // for SEAM (ptr only), PRIORITY
#  include "stopper.h" // for DANGERR

#  include <cstdint> // for int16_t, int32_t

namespace tesseract {

class EDGEPT_CLIST;
class MATRIX;
class TBOX;
class UNICHARSET;
class WERD_RES;

class LMPainPoints;
class TessdataManager;
struct BestChoiceBundle;

struct BlamerBundle;
struct EDGEPT;
struct MATRIX_COORD;
struct SPLIT;
struct TBLOB;
struct TESSLINE;
struct TWERD;

// A class for storing which nodes are to be processed by the segmentation
// search. There is a single SegSearchPending for each column in the ratings
// matrix, and it indicates whether the segsearch should combine all
// BLOB_CHOICES in the column, or just the given row with the parents
// corresponding to *this SegSearchPending, and whether only updated parent
// ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs.
class SegSearchPending {
public:
  SegSearchPending()
      : classified_row_(-1), revisit_whole_column_(false), column_classified_(false) {}

  // Marks the whole column as just classified. Used to start a search on
  // a newly initialized ratings matrix.
  void SetColumnClassified() {
    column_classified_ = true;
  }
  // Marks the matrix entry at the given row as just classified.
  // Used after classifying a new matrix cell.
  // Additional to, not overriding a previous RevisitWholeColumn.
  void SetBlobClassified(int row) {
    classified_row_ = row;
  }
  // Marks the whole column as needing work, but not just classified.
  // Used when the parent vse list is updated.
  // Additional to, not overriding a previous SetBlobClassified.
  void RevisitWholeColumn() {
    revisit_whole_column_ = true;
  }

  // Clears *this to indicate no work to do.
  void Clear() {
    classified_row_ = -1;
    revisit_whole_column_ = false;
    column_classified_ = false;
  }

  // Returns true if there are updates to do in the column that *this
  // represents.
  bool WorkToDo() const {
    return revisit_whole_column_ || column_classified_ || classified_row_ >= 0;
  }
  // Returns true if the given row was just classified.
  bool IsRowJustClassified(int row) const {
    return row == classified_row_ || column_classified_;
  }
  // Returns the single row to process if there is only one, otherwise -1.
  int SingleRow() const {
    return revisit_whole_column_ || column_classified_ ? -1 : classified_row_;
  }

private:
  // If non-negative, indicates the single row in the ratings matrix that has
  // just been classified, and so should be combined with all the parents in the
  // column that this SegSearchPending represents.
  // Operates independently of revisit_whole_column.
  int classified_row_;
  // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will
  // be processed, but classified_row can indicate a row that is newly
  // classified. Overridden if column_classified is true.
  bool revisit_whole_column_;
  // If column_classified is true, parent vses are processed with all rows
  // regardless of whether they are just updated, overriding
  // revisit_whole_column and classified_row.
  bool column_classified_;
};

/* ccmain/tstruct.cpp *********************************************************/
class FRAGMENT : public ELIST<FRAGMENT>::LINK {
public:
  FRAGMENT() { // constructor
  }
  FRAGMENT(EDGEPT *head_pt,  // start
           EDGEPT *tail_pt); // end

  ICOORD head;    // coords of start
  ICOORD tail;    // coords of end
  EDGEPT *headpt; // start point
  EDGEPT *tailpt; // end point
};
ELISTIZEH(FRAGMENT)

class TESS_API Wordrec : public Classify {
public:
  // config parameters *******************************************************
  BOOL_VAR_H(merge_fragments_in_matrix);
  BOOL_VAR_H(wordrec_enable_assoc);
  BOOL_VAR_H(force_word_assoc);
  INT_VAR_H(repair_unchopped_blobs);
  double_VAR_H(tessedit_certainty_threshold);
  INT_VAR_H(chop_debug);
  BOOL_VAR_H(chop_enable);
  BOOL_VAR_H(chop_vertical_creep);
  INT_VAR_H(chop_split_length);
  INT_VAR_H(chop_same_distance);
  INT_VAR_H(chop_min_outline_points);
  INT_VAR_H(chop_seam_pile_size);
  BOOL_VAR_H(chop_new_seam_pile);
  INT_VAR_H(chop_inside_angle);
  INT_VAR_H(chop_min_outline_area);
  double_VAR_H(chop_split_dist_knob);
  double_VAR_H(chop_overlap_knob);
  double_VAR_H(chop_center_knob);
  INT_VAR_H(chop_centered_maxwidth);
  double_VAR_H(chop_sharpness_knob);
  double_VAR_H(chop_width_change_knob);
  double_VAR_H(chop_ok_split);
  double_VAR_H(chop_good_split);
  INT_VAR_H(chop_x_y_weight);
  BOOL_VAR_H(assume_fixed_pitch_char_segment);
  INT_VAR_H(wordrec_debug_level);
  INT_VAR_H(wordrec_max_join_chunks);
  BOOL_VAR_H(wordrec_skip_no_truth_words);
  BOOL_VAR_H(wordrec_debug_blamer);
  BOOL_VAR_H(wordrec_run_blamer);
  INT_VAR_H(segsearch_debug_level);
  INT_VAR_H(segsearch_max_pain_points);
  INT_VAR_H(segsearch_max_futile_classifications);
  double_VAR_H(segsearch_max_char_wh_ratio);
  BOOL_VAR_H(save_alt_choices);

  // methods from wordrec/*.cpp ***********************************************
  Wordrec();
  ~Wordrec() override = default;

  // Fills word->alt_choices with alternative paths found during
  // chopping/segmentation search that are kept in best_choices.
  void SaveAltChoices(const LIST &best_choices, WERD_RES *word);

  // Fills character choice lattice in the given BlamerBundle
  // using the given ratings matrix and best choice list.
  void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
                   const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);

  // Calls fill_lattice_ member function
  // (assumes that fill_lattice_ is not nullptr).
  void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
                       const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) {
    (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
  }

  // tface.cpp
  void program_editup(const std::string &textbase, TessdataManager *init_classifier,
                      TessdataManager *init_dict);
  void cc_recog(WERD_RES *word);
  void program_editdown(int32_t elapsed_time);
  void set_pass1();
  void set_pass2();
  int end_recog();
  BLOB_CHOICE_LIST *call_matcher(TBLOB *blob);
  int dict_word(const WERD_CHOICE &word);
  // wordclass.cpp
  BLOB_CHOICE_LIST *classify_blob(TBLOB *blob, const char *string, ScrollView::Color color,
                                  BlamerBundle *blamer_bundle);

  // segsearch.cpp
  // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.
  // Each entry in the matrix represents the classification choice
  // for a chunk, i.e. an entry in row 2, column 1 represents the list
  // of ratings for the chunks 1 and 2 classified as a single blob.
  // The entries on the diagonal of the matrix are classifier choice lists
  // for a single chunk from the maximal segmentation.
  //
  // The ratings matrix given to SegSearch represents the segmentation
  // graph / trellis for the current word. The nodes in the graph are the
  // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings
  // matrix. The children of each node (nodes connected by outgoing links)
  // are the entries in the column that is equal to node's row+1. The parents
  // (nodes connected by the incoming links) are the entries in the row that
  // is equal to the node's column-1. Here is an example ratings matrix:
  //
  //    0    1    2   3   4
  //  -------------------------
  // 0| c,(                   |
  // 1| d    l,1              |
  // 2|           o           |
  // 3|              c,(      |
  // 4|              g,y  l,1 |
  //  -------------------------
  //
  // In the example above node "o" has children (outgoing connection to nodes)
  // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d".
  //
  // The objective of the search is to find the least cost path, where the cost
  // is determined by the language model components and the properties of the
  // cut between the blobs on the path. SegSearch starts by populating the
  // matrix with the all the entries that were classified by the chopper and
  // finding the initial best path. Based on the classifier ratings, language
  // model scores and the properties of each cut, a list of "pain points" is
  // constructed - those are the points on the path where the choices do not
  // look consistent with the neighboring choices, the cuts look particularly
  // problematic, or the certainties of the blobs are low. The most troublesome
  // "pain point" is picked from the list and the new entry in the ratings
  // matrix corresponding to this "pain point" is filled in. Then the language
  // model state is updated to reflect the new classification and the new
  // "pain points" are added to the list and the next most troublesome
  // "pain point" is determined. This continues until either the word choice
  // composed from the best paths in the segmentation graph is "good enough"
  // (e.g. above a certain certainty threshold, is an unambiguous dictionary
  // word, etc) or there are no more "pain points" to explore.
  //
  // If associate_blobs is set to false no new classifications will be done
  // to combine blobs. Segmentation search will run only one "iteration"
  // on the classifications already recorded in chunks_record.ratings.
  //
  // Note: this function assumes that word_res, best_choice_bundle arguments
  // are not nullptr.
  void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
                 BlamerBundle *blamer_bundle);

  // Setup and run just the initial segsearch on an established matrix,
  // without doing any additional chopping or joining.
  // (Internal factored version that can be used as part of the main SegSearch.)
  void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
                        std::vector<SegSearchPending> *pending,
                        BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);

  // chop.cpp
  PRIORITY point_priority(EDGEPT *point);
  void add_point_to_list(PointHeap *point_heap, EDGEPT *point);
  // Returns true if the edgept supplied as input is an inside angle.  This
  // is determined by the angular change of the vectors from point to point.
  bool is_inside_angle(EDGEPT *pt);
  int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);
  EDGEPT *pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist);
  void prioritize_points(TESSLINE *outline, PointHeap *points);
  void new_min_point(EDGEPT *local_min, PointHeap *points);
  void new_max_point(EDGEPT *local_max, PointHeap *points);
  void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point,
                                 EDGEPT_CLIST *new_points);

  // chopper.cpp
  SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
                          const std::vector<SEAM *> &seams);
  SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
                           const std::vector<SEAM *> &seams);
  SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res,
                              unsigned *blob_number);
  SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
                         bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
                         unsigned *blob_number);
  SEAM *chop_one_blob(const std::vector<TBOX> &boxes,
                      const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
                      unsigned *blob_number);
  void chop_word_main(WERD_RES *word);
  void improve_by_chopping(float rating_cert_scale, WERD_RES *word,
                           BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
                           LMPainPoints *pain_points, std::vector<SegSearchPending> *pending);
  int select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices, float rating_ceiling,
                           bool split_next_to_fragment);
  int select_blob_to_split_from_fixpt(DANGERR *fixpt);

  // findseam.cpp
  void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams);
  void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority,
                        SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile);
  void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue);
  SEAM *pick_good_seam(TBLOB *blob);
  void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue,
                       SeamPile *seam_pile, SEAM **seam, TBLOB *blob);
  void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points,
                           EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile,
                           SEAM **seam, TBLOB *blob);

  // gradechop.cpp
  PRIORITY grade_split_length(SPLIT *split);
  PRIORITY grade_sharpness(SPLIT *split);

  // outlines.cpp
  bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt);

  // pieces.cpp
  virtual BLOB_CHOICE_LIST *classify_piece(const std::vector<SEAM *> &seams, int16_t start,
                                           int16_t end, const char *description, TWERD *word,
                                           BlamerBundle *blamer_bundle);

  // Member variables.

  std::unique_ptr<LanguageModel> language_model_;
  PRIORITY pass2_ok_split;
  // Stores the best choice for the previous word in the paragraph.
  // This variable is modified by PAGE_RES_IT when iterating over
  // words to OCR on the page.
  WERD_CHOICE *prev_word_best_choice_;

  // Function used to fill char choice lattices.
  void (Wordrec::*fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
                                 const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);

protected:
  inline bool SegSearchDone(int num_futile_classifications) {
    return (language_model_->AcceptableChoiceFound() ||
            num_futile_classifications >= segsearch_max_futile_classifications);
  }

  // Updates the language model state recorded for the child entries specified
  // in pending[starting_col]. Enqueues the children of the updated entries
  // into pending and proceeds to update (and remove from pending) all the
  // remaining entries in pending[col] (col >= starting_col). Upon termination
  // of this function all the pending[col] lists will be empty.
  //
  // The arguments:
  //
  // starting_col: index of the column in chunks_record->ratings from
  // which the update should be started
  //
  // pending: list of entries listing chunks_record->ratings entries
  // that should be updated
  //
  // pain_points: priority heap listing the pain points generated by
  // the language model
  //
  // temp_pain_points: temporary storage for tentative pain points generated
  // by the language model after a single call to LanguageModel::UpdateState()
  // (the argument is passed in rather than created before each
  // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)
  //
  // best_choice_bundle: a collection of variables that should be updated
  // if a new best choice is found
  //
  void UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
                            std::vector<SegSearchPending> *pending, WERD_RES *word_res,
                            LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
                            BlamerBundle *blamer_bundle);

  // Process the given pain point: classify the corresponding blob, enqueue
  // new pain points to join the newly classified blob with its neighbors.
  void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,
                                 const char *pain_point_type,
                                 std::vector<SegSearchPending> *pending, WERD_RES *word_res,
                                 LMPainPoints *pain_points, BlamerBundle *blamer_bundle);
  // Resets enough of the results so that the Viterbi search is re-run.
  // Needed when the n-gram model is enabled, as the multi-length comparison
  // implementation will re-value existing paths to worse values.
  void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
                        std::vector<SegSearchPending> &pending);

  // Add pain points for classifying blobs on the correct segmentation path
  // (so that we can evaluate correct segmentation path and discover the reason
  // for incorrect result).
  void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
                              BlamerBundle *blamer_bundle, std::string &blamer_debug);
};

} // namespace tesseract

#endif // DISABLED_LEGACY_ENGINE

#endif // TESSERACT_WORDREC_WORDREC_H_

Coverage Report

Created: 2025-06-13 07:15

Line	Count	Source (jump to first uncovered line)
1		///////////////////////////////////////////////////////////////////////
2		// File: wordrec.h
3		// Description: wordrec class.
4		// Author: Samuel Charron
5		//
6		// (C) Copyright 2006, Google Inc.
7		// Licensed under the Apache License, Version 2.0 (the "License");
8		// you may not use this file except in compliance with the License.
9		// You may obtain a copy of the License at
10		// http://www.apache.org/licenses/LICENSE-2.0
11		// Unless required by applicable law or agreed to in writing, software
12		// distributed under the License is distributed on an "AS IS" BASIS,
13		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		// See the License for the specific language governing permissions and
15		// limitations under the License.
16		//
17		///////////////////////////////////////////////////////////////////////
18
19		#ifndef TESSERACT_WORDREC_WORDREC_H_
20		#define TESSERACT_WORDREC_WORDREC_H_
21
22		#ifdef HAVE_CONFIG_H
23		# include "config_auto.h" // DISABLED_LEGACY_ENGINE
24		#endif
25
26		#ifdef DISABLED_LEGACY_ENGINE
27
28		# include <cstdint> // for int16_t, int32_t
29		# include "classify.h" // for Classify
30		# include "params.h" // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...
31		# include "ratngs.h" // for WERD_CHOICE
32
33		namespace tesseract {
34		class TessdataManager;
35		}
36
37		namespace tesseract {
38
39		/* ccmain/tstruct.cpp */
40
41		class TESS_API Wordrec : public Classify {
42		public:
43		// config parameters
44
45		BOOL_VAR_H(wordrec_debug_blamer);
46		BOOL_VAR_H(wordrec_run_blamer);
47
48		// methods
49		Wordrec();
50		virtual ~Wordrec() = default;
51
52		// tface.cpp
53		void program_editup(const std::string &textbase, TessdataManager *init_classifier,
54		TessdataManager *init_dict);
55		void program_editdown(int32_t elapsed_time);
56		int end_recog();
57		int dict_word(const WERD_CHOICE &word);
58
59		// Member variables
60		WERD_CHOICE *prev_word_best_choice_;
61		};
62
63		} // namespace tesseract
64
65		#else // DISABLED_LEGACY_ENGINE not defined
66
67		# include <memory>
68		# include "associate.h"
69		# include "chop.h" // for PointHeap, MAX_NUM_POINTS
70		# include "classify.h" // for Classify
71		# include "dict.h"
72		# include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
73		# include "findseam.h" // for SeamQueue, SeamPile
74		# include "language_model.h"
75		# include "matrix.h"
76		# include "oldlist.h" // for LIST
77		# include "params.h" // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...
78		# include "points.h" // for ICOORD
79		# include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only), BLOB_CHOI...
80		# include "seam.h" // for SEAM (ptr only), PRIORITY
81		# include "stopper.h" // for DANGERR
82
83		# include <cstdint> // for int16_t, int32_t
84
85		namespace tesseract {
86
87		class EDGEPT_CLIST;
88		class MATRIX;
89		class TBOX;
90		class UNICHARSET;
91		class WERD_RES;
92
93		class LMPainPoints;
94		class TessdataManager;
95		struct BestChoiceBundle;
96
97		struct BlamerBundle;
98		struct EDGEPT;
99		struct MATRIX_COORD;
100		struct SPLIT;
101		struct TBLOB;
102		struct TESSLINE;
103		struct TWERD;
104
105		// A class for storing which nodes are to be processed by the segmentation
106		// search. There is a single SegSearchPending for each column in the ratings
107		// matrix, and it indicates whether the segsearch should combine all
108		// BLOB_CHOICES in the column, or just the given row with the parents
109		// corresponding to *this SegSearchPending, and whether only updated parent
110		// ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs.
111		class SegSearchPending {
112		public:
113		SegSearchPending()
114	333k	: classified_row_(-1), revisit_whole_column_(false), column_classified_(false) {}
115
116		// Marks the whole column as just classified. Used to start a search on
117		// a newly initialized ratings matrix.
118	144k	void SetColumnClassified() {
119	144k	column_classified_ = true;
120	144k	}
121		// Marks the matrix entry at the given row as just classified.
122		// Used after classifying a new matrix cell.
123		// Additional to, not overriding a previous RevisitWholeColumn.
124	1.36M	void SetBlobClassified(int row) {
125	1.36M	classified_row_ = row;
126	1.36M	}
127		// Marks the whole column as needing work, but not just classified.
128		// Used when the parent vse list is updated.
129		// Additional to, not overriding a previous SetBlobClassified.
130	2.44M	void RevisitWholeColumn() {
131	2.44M	revisit_whole_column_ = true;
132	2.44M	}
133
134		// Clears *this to indicate no work to do.
135	17.2M	void Clear() {
136	17.2M	classified_row_ = -1;
137	17.2M	revisit_whole_column_ = false;
138	17.2M	column_classified_ = false;
139	17.2M	}
140
141		// Returns true if there are updates to do in the column that *this
142		// represents.
143	10.7M	bool WorkToDo() const {
144	10.7M	return revisit_whole_column_ \|\| column_classified_ \|\| classified_row_ >= 0;
145	10.7M	}
146		// Returns true if the given row was just classified.
147	4.47M	bool IsRowJustClassified(int row) const {
148	4.47M	return row == classified_row_ \|\| column_classified_;
149	4.47M	}
150		// Returns the single row to process if there is only one, otherwise -1.
151	4.72M	int SingleRow() const {
152	4.72M	return revisit_whole_column_ \|\| column_classified_ ? -1 : classified_row_;
153	4.72M	}
154
155		private:
156		// If non-negative, indicates the single row in the ratings matrix that has
157		// just been classified, and so should be combined with all the parents in the
158		// column that this SegSearchPending represents.
159		// Operates independently of revisit_whole_column.
160		int classified_row_;
161		// If revisit_whole_column is true, then all BLOB_CHOICEs in this column will
162		// be processed, but classified_row can indicate a row that is newly
163		// classified. Overridden if column_classified is true.
164		bool revisit_whole_column_;
165		// If column_classified is true, parent vses are processed with all rows
166		// regardless of whether they are just updated, overriding
167		// revisit_whole_column and classified_row.
168		bool column_classified_;
169		};
170
171		/* ccmain/tstruct.cpp *********************************************************/
172		class FRAGMENT : public ELIST<FRAGMENT>::LINK {
173		public:
174	0	FRAGMENT() { // constructor
175	0	}
176		FRAGMENT(EDGEPT *head_pt, // start
177		EDGEPT *tail_pt); // end
178
179		ICOORD head; // coords of start
180		ICOORD tail; // coords of end
181		EDGEPT *headpt; // start point
182		EDGEPT *tailpt; // end point
183		};
184		ELISTIZEH(FRAGMENT)
185
186		class TESS_API Wordrec : public Classify {
187		public:
188		// config parameters *******************************************************
189		BOOL_VAR_H(merge_fragments_in_matrix);
190		BOOL_VAR_H(wordrec_enable_assoc);
191		BOOL_VAR_H(force_word_assoc);
192		INT_VAR_H(repair_unchopped_blobs);
193		double_VAR_H(tessedit_certainty_threshold);
194		INT_VAR_H(chop_debug);
195		BOOL_VAR_H(chop_enable);
196		BOOL_VAR_H(chop_vertical_creep);
197		INT_VAR_H(chop_split_length);
198		INT_VAR_H(chop_same_distance);
199		INT_VAR_H(chop_min_outline_points);
200		INT_VAR_H(chop_seam_pile_size);
201		BOOL_VAR_H(chop_new_seam_pile);
202		INT_VAR_H(chop_inside_angle);
203		INT_VAR_H(chop_min_outline_area);
204		double_VAR_H(chop_split_dist_knob);
205		double_VAR_H(chop_overlap_knob);
206		double_VAR_H(chop_center_knob);
207		INT_VAR_H(chop_centered_maxwidth);
208		double_VAR_H(chop_sharpness_knob);
209		double_VAR_H(chop_width_change_knob);
210		double_VAR_H(chop_ok_split);
211		double_VAR_H(chop_good_split);
212		INT_VAR_H(chop_x_y_weight);
213		BOOL_VAR_H(assume_fixed_pitch_char_segment);
214		INT_VAR_H(wordrec_debug_level);
215		INT_VAR_H(wordrec_max_join_chunks);
216		BOOL_VAR_H(wordrec_skip_no_truth_words);
217		BOOL_VAR_H(wordrec_debug_blamer);
218		BOOL_VAR_H(wordrec_run_blamer);
219		INT_VAR_H(segsearch_debug_level);
220		INT_VAR_H(segsearch_max_pain_points);
221		INT_VAR_H(segsearch_max_futile_classifications);
222		double_VAR_H(segsearch_max_char_wh_ratio);
223		BOOL_VAR_H(save_alt_choices);
224
225		// methods from wordrec/.cpp **********************************************
226		Wordrec();
227	0	~Wordrec() override = default;
228
229		// Fills word->alt_choices with alternative paths found during
230		// chopping/segmentation search that are kept in best_choices.
231		void SaveAltChoices(const LIST &best_choices, WERD_RES *word);
232
233		// Fills character choice lattice in the given BlamerBundle
234		// using the given ratings matrix and best choice list.
235		void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
236		const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
237
238		// Calls fill_lattice_ member function
239		// (assumes that fill_lattice_ is not nullptr).
240		void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
241	0	const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) {
242	0	(this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
243	0	}
244
245		// tface.cpp
246		void program_editup(const std::string &textbase, TessdataManager *init_classifier,
247		TessdataManager *init_dict);
248		void cc_recog(WERD_RES *word);
249		void program_editdown(int32_t elapsed_time);
250		void set_pass1();
251		void set_pass2();
252		int end_recog();
253		BLOB_CHOICE_LIST call_matcher(TBLOB blob);
254		int dict_word(const WERD_CHOICE &word);
255		// wordclass.cpp
256		BLOB_CHOICE_LIST classify_blob(TBLOB blob, const char *string, ScrollView::Color color,
257		BlamerBundle *blamer_bundle);
258
259		// segsearch.cpp
260		// SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.
261		// Each entry in the matrix represents the classification choice
262		// for a chunk, i.e. an entry in row 2, column 1 represents the list
263		// of ratings for the chunks 1 and 2 classified as a single blob.
264		// The entries on the diagonal of the matrix are classifier choice lists
265		// for a single chunk from the maximal segmentation.
266		//
267		// The ratings matrix given to SegSearch represents the segmentation
268		// graph / trellis for the current word. The nodes in the graph are the
269		// individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings
270		// matrix. The children of each node (nodes connected by outgoing links)
271		// are the entries in the column that is equal to node's row+1. The parents
272		// (nodes connected by the incoming links) are the entries in the row that
273		// is equal to the node's column-1. Here is an example ratings matrix:
274		//
275		// 0 1 2 3 4
276		// -------------------------
277		// 0\| c,( \|
278		// 1\| d l,1 \|
279		// 2\| o \|
280		// 3\| c,( \|
281		// 4\| g,y l,1 \|
282		// -------------------------
283		//
284		// In the example above node "o" has children (outgoing connection to nodes)
285		// "c","(","g","y" and parents (incoming connections from nodes) "l","1","d".
286		//
287		// The objective of the search is to find the least cost path, where the cost
288		// is determined by the language model components and the properties of the
289		// cut between the blobs on the path. SegSearch starts by populating the
290		// matrix with the all the entries that were classified by the chopper and
291		// finding the initial best path. Based on the classifier ratings, language
292		// model scores and the properties of each cut, a list of "pain points" is
293		// constructed - those are the points on the path where the choices do not
294		// look consistent with the neighboring choices, the cuts look particularly
295		// problematic, or the certainties of the blobs are low. The most troublesome
296		// "pain point" is picked from the list and the new entry in the ratings
297		// matrix corresponding to this "pain point" is filled in. Then the language
298		// model state is updated to reflect the new classification and the new
299		// "pain points" are added to the list and the next most troublesome
300		// "pain point" is determined. This continues until either the word choice
301		// composed from the best paths in the segmentation graph is "good enough"
302		// (e.g. above a certain certainty threshold, is an unambiguous dictionary
303		// word, etc) or there are no more "pain points" to explore.
304		//
305		// If associate_blobs is set to false no new classifications will be done
306		// to combine blobs. Segmentation search will run only one "iteration"
307		// on the classifications already recorded in chunks_record.ratings.
308		//
309		// Note: this function assumes that word_res, best_choice_bundle arguments
310		// are not nullptr.
311		void SegSearch(WERD_RES word_res, BestChoiceBundle best_choice_bundle,
312		BlamerBundle *blamer_bundle);
313
314		// Setup and run just the initial segsearch on an established matrix,
315		// without doing any additional chopping or joining.
316		// (Internal factored version that can be used as part of the main SegSearch.)
317		void InitialSegSearch(WERD_RES word_res, LMPainPoints pain_points,
318		std::vector<SegSearchPending> *pending,
319		BestChoiceBundle best_choice_bundle, BlamerBundle blamer_bundle);
320
321		// chop.cpp
322		PRIORITY point_priority(EDGEPT *point);
323		void add_point_to_list(PointHeap point_heap, EDGEPT point);
324		// Returns true if the edgept supplied as input is an inside angle. This
325		// is determined by the angular change of the vectors from point to point.
326		bool is_inside_angle(EDGEPT *pt);
327		int angle_change(EDGEPT point1, EDGEPT point2, EDGEPT *point3);
328		EDGEPT pick_close_point(EDGEPT critical_point, EDGEPT vertical_point, int best_dist);
329		void prioritize_points(TESSLINE outline, PointHeap points);
330		void new_min_point(EDGEPT local_min, PointHeap points);
331		void new_max_point(EDGEPT local_max, PointHeap points);
332		void vertical_projection_point(EDGEPT split_point, EDGEPT target_point, EDGEPT **best_point,
333		EDGEPT_CLIST *new_points);
334
335		// chopper.cpp
336		SEAM attempt_blob_chop(TWERD word, TBLOB *blob, int32_t blob_number, bool italic_blob,
337		const std::vector<SEAM *> &seams);
338		SEAM chop_numbered_blob(TWERD word, int32_t blob_number, bool italic_blob,
339		const std::vector<SEAM *> &seams);
340		SEAM chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES word_res,
341		unsigned *blob_number);
342		SEAM improve_one_blob(const std::vector<BLOB_CHOICE > &blob_choices, DANGERR *fixpt,
343		bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
344		unsigned *blob_number);
345		SEAM *chop_one_blob(const std::vector<TBOX> &boxes,
346		const std::vector<BLOB_CHOICE > &blob_choices, WERD_RES word_res,
347		unsigned *blob_number);
348		void chop_word_main(WERD_RES *word);
349		void improve_by_chopping(float rating_cert_scale, WERD_RES *word,
350		BestChoiceBundle best_choice_bundle, BlamerBundle blamer_bundle,
351		LMPainPoints pain_points, std::vector<SegSearchPending> pending);
352		int select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices, float rating_ceiling,
353		bool split_next_to_fragment);
354		int select_blob_to_split_from_fixpt(DANGERR *fixpt);
355
356		// findseam.cpp
357		void add_seam_to_queue(float new_priority, SEAM new_seam, SeamQueue seams);
358		void choose_best_seam(SeamQueue seam_queue, const SPLIT split, PRIORITY priority,
359		SEAM *seam_result, TBLOB blob, SeamPile *seam_pile);
360		void combine_seam(const SeamPile &seam_pile, const SEAM seam, SeamQueue seam_queue);
361		SEAM pick_good_seam(TBLOB blob);
362		void try_point_pairs(EDGEPT points[MAX_NUM_POINTS], int16_t num_points, SeamQueue seam_queue,
363		SeamPile seam_pile, SEAM seam, TBLOB blob);
364		void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points,
365		EDGEPT_CLIST new_points, SeamQueue seam_queue, SeamPile *seam_pile,
366		SEAM *seam, TBLOB blob);
367
368		// gradechop.cpp
369		PRIORITY grade_split_length(SPLIT *split);
370		PRIORITY grade_sharpness(SPLIT *split);
371
372		// outlines.cpp
373		bool near_point(EDGEPT point, EDGEPT line_pt_0, EDGEPT line_pt_1, EDGEPT *near_pt);
374
375		// pieces.cpp
376		virtual BLOB_CHOICE_LIST classify_piece(const std::vector<SEAM > &seams, int16_t start,
377		int16_t end, const char description, TWERD word,
378		BlamerBundle *blamer_bundle);
379
380		// Member variables.
381
382		std::unique_ptr<LanguageModel> language_model_;
383		PRIORITY pass2_ok_split;
384		// Stores the best choice for the previous word in the paragraph.
385		// This variable is modified by PAGE_RES_IT when iterating over
386		// words to OCR on the page.
387		WERD_CHOICE *prev_word_best_choice_;
388
389		// Function used to fill char choice lattices.
390		void (Wordrec::*fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
391		const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
392
393		protected:
394	2.25M	inline bool SegSearchDone(int num_futile_classifications) {
395	2.25M	return (language_model_->AcceptableChoiceFound() \|\|
396	2.25M	num_futile_classifications >= segsearch_max_futile_classifications);
397	2.25M	}
398
399		// Updates the language model state recorded for the child entries specified
400		// in pending[starting_col]. Enqueues the children of the updated entries
401		// into pending and proceeds to update (and remove from pending) all the
402		// remaining entries in pending[col] (col >= starting_col). Upon termination
403		// of this function all the pending[col] lists will be empty.
404		//
405		// The arguments:
406		//
407		// starting_col: index of the column in chunks_record->ratings from
408		// which the update should be started
409		//
410		// pending: list of entries listing chunks_record->ratings entries
411		// that should be updated
412		//
413		// pain_points: priority heap listing the pain points generated by
414		// the language model
415		//
416		// temp_pain_points: temporary storage for tentative pain points generated
417		// by the language model after a single call to LanguageModel::UpdateState()
418		// (the argument is passed in rather than created before each
419		// LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)
420		//
421		// best_choice_bundle: a collection of variables that should be updated
422		// if a new best choice is found
423		//
424		void UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
425		std::vector<SegSearchPending> pending, WERD_RES word_res,
426		LMPainPoints pain_points, BestChoiceBundle best_choice_bundle,
427		BlamerBundle *blamer_bundle);
428
429		// Process the given pain point: classify the corresponding blob, enqueue
430		// new pain points to join the newly classified blob with its neighbors.
431		void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,
432		const char *pain_point_type,
433		std::vector<SegSearchPending> pending, WERD_RES word_res,
434		LMPainPoints pain_points, BlamerBundle blamer_bundle);
435		// Resets enough of the results so that the Viterbi search is re-run.
436		// Needed when the n-gram model is enabled, as the multi-length comparison
437		// implementation will re-value existing paths to worse values.
438		void ResetNGramSearch(WERD_RES word_res, BestChoiceBundle best_choice_bundle,
439		std::vector<SegSearchPending> &pending);
440
441		// Add pain points for classifying blobs on the correct segmentation path
442		// (so that we can evaluate correct segmentation path and discover the reason
443		// for incorrect result).
444		void InitBlamerForSegSearch(WERD_RES word_res, LMPainPoints pain_points,
445		BlamerBundle *blamer_bundle, std::string &blamer_debug);
446		};
447
448		} // namespace tesseract
449
450		#endif // DISABLED_LEGACY_ENGINE
451
452		#endif // TESSERACT_WORDREC_WORDREC_H_