/src/tesseract/src/wordrec/lm_state.h

Source (jump to first uncovered line)
///////////////////////////////////////////////////////////////////////
// File:        lm_state.h
// Description: Structures and functionality for capturing the state of
//              segmentation search guided by the language model.
// Author:      Rika Antonova
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
#define TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_

#include <tesseract/unichar.h> // for UNICHAR_ID
#include "associate.h"         // for AssociateStats
#include "dawg.h"              // for DawgPositionVector
#include "elst.h"              // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
#include "lm_consistency.h"    // for LMConsistencyInfo
#include "ratngs.h"            // for BLOB_CHOICE, PermuterType
#include "stopper.h"           // for DANGERR
#include "unicharset.h"        // for UNICHARSET

namespace tesseract {

/// Used for expressing various language model flags.
using LanguageModelFlagsType = unsigned char;

/// The following structs are used for storing the state of the language model
/// in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs
/// and the links are the relationships between the underlying blobs (see
/// segsearch.h for a more detailed description).
///
/// Each of the BLOB_CHOICEs contains LanguageModelState struct, which has
/// a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi
/// search leading up to and including this BLOB_CHOICE.
///
/// Each ViterbiStateEntry contains information from various components of the
/// language model: dawgs in which the path is found, character ngram model
/// probability of the path, script/chartype/font consistency info, state for
/// language-specific heuristics (e.g. hyphenated and compound words,
/// lower/upper case preferences, etc).
///
/// Each ViterbiStateEntry also contains the parent pointer, so that the path
/// that it represents (WERD_CHOICE) can be constructed by following these
/// parent pointers.

/// Struct for storing additional information used by Dawg language model
/// component. It stores the set of active dawgs in which the sequence of
/// letters on a path can be found.
struct LanguageModelDawgInfo {
  LanguageModelDawgInfo(const DawgPositionVector *a, PermuterType pt)
      : active_dawgs(*a), permuter(pt) {}
  DawgPositionVector active_dawgs;
  PermuterType permuter;
};

/// Struct for storing additional information used by Ngram language model
/// component.
struct LanguageModelNgramInfo {
  LanguageModelNgramInfo(const char *c, int l, bool p, float nc, float ncc)
      : context(c)
      , context_unichar_step_len(l)
      , pruned(p)
      , ngram_cost(nc)
      , ngram_and_classifier_cost(ncc) {}
  std::string context; ///< context string
  /// Length of the context measured by advancing using UNICHAR::utf8_step()
  /// (should be at most the order of the character ngram model used).
  int context_unichar_step_len;
  /// The paths with pruned set are pruned out from the perspective of the
  /// character ngram model. They are explored further because they represent
  /// a dictionary match or a top choice. Thus ngram_info is still computed
  /// for them in order to calculate the combined cost.
  bool pruned;
  /// -ln(P_ngram_model(path))
  float ngram_cost;
  /// -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
  float ngram_and_classifier_cost;
};

/// Struct for storing the information about a path in the segmentation graph
/// explored by Viterbi search.
struct ViterbiStateEntry : public ELIST<ViterbiStateEntry>::LINK {
  ViterbiStateEntry(ViterbiStateEntry *pe, BLOB_CHOICE *b, float c, float ol,
                    const LMConsistencyInfo &ci, const AssociateStats &as,
                    LanguageModelFlagsType tcf, LanguageModelDawgInfo *d, LanguageModelNgramInfo *n,
                    const char *debug_uch)
      : curr_b(b)
      , parent_vse(pe)
      , competing_vse(nullptr)
      , dawg_info(d)
      , ngram_info(n)
      , cost(c)
      , ratings_sum(b->rating())
      , min_certainty(b->certainty())
      , adapted(b->IsAdapted())
      , length(1)
      , outline_length(ol)
      , consistency_info(ci)
      , associate_stats(as)
      , top_choice_flags(tcf)
      , updated(true) {
    debug_str = (debug_uch == nullptr) ? nullptr : new std::string();
    if (pe != nullptr) {
      ratings_sum += pe->ratings_sum;
      if (pe->min_certainty < min_certainty) {
        min_certainty = pe->min_certainty;
      }
      adapted += pe->adapted;
      length += pe->length;
      outline_length += pe->outline_length;
      if (debug_uch != nullptr) {
        *debug_str += *(pe->debug_str);
      }
    }
    if (debug_str != nullptr && debug_uch != nullptr) {
      *debug_str += debug_uch;
    }
  }
  ~ViterbiStateEntry() {
    delete dawg_info;
    delete ngram_info;
    delete debug_str;
  }
  /// Comparator function for sorting ViterbiStateEntry_LISTs in
  /// non-increasing order of costs.
  static int Compare(const ViterbiStateEntry *ve1, const ViterbiStateEntry *ve2) {
    return (ve1->cost < ve2->cost) ? -1 : 1;
  }
  inline bool Consistent() const {
    if (dawg_info != nullptr && consistency_info.NumInconsistentCase() == 0) {
      return true;
    }
    return consistency_info.Consistent();
  }
  /// Returns true if this VSE has an alphanumeric character as its classifier
  /// result.
  bool HasAlnumChoice(const UNICHARSET &unicharset) {
    if (curr_b == nullptr) {
      return false;
    }
    UNICHAR_ID unichar_id = curr_b->unichar_id();
    if (unicharset.get_isalpha(unichar_id) || unicharset.get_isdigit(unichar_id)) {
      return true;
    }
    return false;
  }
  void Print(const char *msg) const;

  /// Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
  BLOB_CHOICE *curr_b;
  ViterbiStateEntry *parent_vse;
  /// Pointer to a case-competing ViterbiStateEntry in the same list that
  /// represents a path ending in the same letter of the opposite case.
  ViterbiStateEntry *competing_vse;

  /// Extra information maintained by Dawg language model component
  /// (owned by ViterbiStateEntry).
  LanguageModelDawgInfo *dawg_info;

  /// Extra information maintained by Ngram language model component
  /// (owned by ViterbiStateEntry).
  LanguageModelNgramInfo *ngram_info;

  /// UTF8 string representing the path corresponding to this vse.
  /// Populated only in when language_model_debug_level > 0.
  std::string *debug_str;

  /// The cost is an adjusted ratings sum, that is adjusted by all the language
  /// model components that use Viterbi search.
  float cost;

  /// Various information about the characters on the path represented
  /// by this ViterbiStateEntry.
  float ratings_sum;                  ///< sum of ratings of character on the path
  float min_certainty;                ///< minimum certainty on the path
  int adapted;                        ///< number of BLOB_CHOICES from adapted templates
  int length;                         ///< number of characters on the path
  float outline_length;               ///< length of the outline so far
  LMConsistencyInfo consistency_info; ///< path consistency info
  AssociateStats associate_stats;     ///< character widths/gaps/seams

  /// Flags for marking the entry as a top choice path with
  /// the smallest rating or lower/upper case letters).
  LanguageModelFlagsType top_choice_flags;

  bool updated; ///< set to true if the entry has just been created/updated
};

ELISTIZEH(ViterbiStateEntry)

/// Struct to store information maintained by various language model components.
struct LanguageModelState {
  LanguageModelState()
      : viterbi_state_entries_prunable_length(0)
      , viterbi_state_entries_prunable_max_cost(FLT_MAX)
      , viterbi_state_entries_length(0) {}
  ~LanguageModelState() = default;

  /// Clears the viterbi search state back to its initial conditions.
  void Clear();

  void Print(const char *msg);

  /// Storage for the Viterbi state.
  ViterbiStateEntry_LIST viterbi_state_entries;
  /// Number and max cost of prunable paths in viterbi_state_entries.
  int viterbi_state_entries_prunable_length;
  float viterbi_state_entries_prunable_max_cost;
  /// Total number of entries in viterbi_state_entries.
  int viterbi_state_entries_length;
};

/// Bundle together all the things pertaining to the best choice/state.
struct BestChoiceBundle {
  explicit BestChoiceBundle(int matrix_dimension) : updated(false), best_vse(nullptr) {
    beam.reserve(matrix_dimension);
    for (int i = 0; i < matrix_dimension; ++i) {
      beam.push_back(new LanguageModelState);
    }
  }
  ~BestChoiceBundle() {
    for (auto &state : beam) {
      delete state;
    }
  }

  /// Flag to indicate whether anything was changed.
  bool updated;
  /// Places to try to fix the word suggested by ambiguity checking.
  DANGERR fixpt;
  /// The beam. One LanguageModelState containing a list of ViterbiStateEntry
  /// per row in the ratings matrix containing all VSEs whose BLOB_CHOICE is
  /// somewhere in the corresponding row.
  std::vector<LanguageModelState *> beam;
  /// Best ViterbiStateEntry and BLOB_CHOICE.
  ViterbiStateEntry *best_vse;
};

} // namespace tesseract

#endif // TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_

Coverage Report

Created: 2025-07-12 06:44

Line	Count	Source (jump to first uncovered line)
1		///////////////////////////////////////////////////////////////////////
2		// File: lm_state.h
3		// Description: Structures and functionality for capturing the state of
4		// segmentation search guided by the language model.
5		// Author: Rika Antonova
6		//
7		// (C) Copyright 2012, Google Inc.
8		// Licensed under the Apache License, Version 2.0 (the "License");
9		// you may not use this file except in compliance with the License.
10		// You may obtain a copy of the License at
11		// http://www.apache.org/licenses/LICENSE-2.0
12		// Unless required by applicable law or agreed to in writing, software
13		// distributed under the License is distributed on an "AS IS" BASIS,
14		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15		// See the License for the specific language governing permissions and
16		// limitations under the License.
17		//
18		///////////////////////////////////////////////////////////////////////
19
20		#ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
21		#define TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_
22
23		#include <tesseract/unichar.h> // for UNICHAR_ID
24		#include "associate.h" // for AssociateStats
25		#include "dawg.h" // for DawgPositionVector
26		#include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
27		#include "lm_consistency.h" // for LMConsistencyInfo
28		#include "ratngs.h" // for BLOB_CHOICE, PermuterType
29		#include "stopper.h" // for DANGERR
30		#include "unicharset.h" // for UNICHARSET
31
32		namespace tesseract {
33
34		/// Used for expressing various language model flags.
35		using LanguageModelFlagsType = unsigned char;
36
37		/// The following structs are used for storing the state of the language model
38		/// in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs
39		/// and the links are the relationships between the underlying blobs (see
40		/// segsearch.h for a more detailed description).
41		///
42		/// Each of the BLOB_CHOICEs contains LanguageModelState struct, which has
43		/// a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi
44		/// search leading up to and including this BLOB_CHOICE.
45		///
46		/// Each ViterbiStateEntry contains information from various components of the
47		/// language model: dawgs in which the path is found, character ngram model
48		/// probability of the path, script/chartype/font consistency info, state for
49		/// language-specific heuristics (e.g. hyphenated and compound words,
50		/// lower/upper case preferences, etc).
51		///
52		/// Each ViterbiStateEntry also contains the parent pointer, so that the path
53		/// that it represents (WERD_CHOICE) can be constructed by following these
54		/// parent pointers.
55
56		/// Struct for storing additional information used by Dawg language model
57		/// component. It stores the set of active dawgs in which the sequence of
58		/// letters on a path can be found.
59		struct LanguageModelDawgInfo {
60		LanguageModelDawgInfo(const DawgPositionVector *a, PermuterType pt)
61	0	: active_dawgs(*a), permuter(pt) {}
62		DawgPositionVector active_dawgs;
63		PermuterType permuter;
64		};
65
66		/// Struct for storing additional information used by Ngram language model
67		/// component.
68		struct LanguageModelNgramInfo {
69		LanguageModelNgramInfo(const char *c, int l, bool p, float nc, float ncc)
70	0	: context(c)
71	0	, context_unichar_step_len(l)
72	0	, pruned(p)
73	0	, ngram_cost(nc)
74	0	, ngram_and_classifier_cost(ncc) {}
75		std::string context; ///< context string
76		/// Length of the context measured by advancing using UNICHAR::utf8_step()
77		/// (should be at most the order of the character ngram model used).
78		int context_unichar_step_len;
79		/// The paths with pruned set are pruned out from the perspective of the
80		/// character ngram model. They are explored further because they represent
81		/// a dictionary match or a top choice. Thus ngram_info is still computed
82		/// for them in order to calculate the combined cost.
83		bool pruned;
84		/// -ln(P_ngram_model(path))
85		float ngram_cost;
86		/// -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
87		float ngram_and_classifier_cost;
88		};
89
90		/// Struct for storing the information about a path in the segmentation graph
91		/// explored by Viterbi search.
92		struct ViterbiStateEntry : public ELIST<ViterbiStateEntry>::LINK {
93		ViterbiStateEntry(ViterbiStateEntry pe, BLOB_CHOICE b, float c, float ol,
94		const LMConsistencyInfo &ci, const AssociateStats &as,
95		LanguageModelFlagsType tcf, LanguageModelDawgInfo d, LanguageModelNgramInfo n,
96		const char *debug_uch)
97	22.3M	: curr_b(b)
98	22.3M	, parent_vse(pe)
99	22.3M	, competing_vse(nullptr)
100	22.3M	, dawg_info(d)
101	22.3M	, ngram_info(n)
102	22.3M	, cost(c)
103	22.3M	, ratings_sum(b->rating())
104	22.3M	, min_certainty(b->certainty())
105	22.3M	, adapted(b->IsAdapted())
106	22.3M	, length(1)
107	22.3M	, outline_length(ol)
108	22.3M	, consistency_info(ci)
109	22.3M	, associate_stats(as)
110	22.3M	, top_choice_flags(tcf)
111	22.3M	, updated(true) {
112	22.3M	debug_str = (debug_uch == nullptr) ? nullptr : new std::string();
113	22.3M	if (pe != nullptr) {
114	20.7M	ratings_sum += pe->ratings_sum;
115	20.7M	if (pe->min_certainty < min_certainty) {
116	11.4M	min_certainty = pe->min_certainty;
117	11.4M	}
118	20.7M	adapted += pe->adapted;
119	20.7M	length += pe->length;
120	20.7M	outline_length += pe->outline_length;
121	20.7M	if (debug_uch != nullptr) {
122	0	debug_str += (pe->debug_str);
123	0	}
124	20.7M	}
125	22.3M	if (debug_str != nullptr && debug_uch != nullptr) {
126	0	*debug_str += debug_uch;
127	0	}
128	22.3M	}
129	22.3M	~ViterbiStateEntry() {
130	22.3M	delete dawg_info;
131	22.3M	delete ngram_info;
132	22.3M	delete debug_str;
133	22.3M	}
134		/// Comparator function for sorting ViterbiStateEntry_LISTs in
135		/// non-increasing order of costs.
136	16.0M	static int Compare(const ViterbiStateEntry ve1, const ViterbiStateEntry ve2) {
137	16.0M	return (ve1->cost < ve2->cost) ? -1 : 1;
138	16.0M	}
139	0	inline bool Consistent() const {
140	0	if (dawg_info != nullptr && consistency_info.NumInconsistentCase() == 0) {
141	0	return true;
142	0	}
143	0	return consistency_info.Consistent();
144	0	}
145		/// Returns true if this VSE has an alphanumeric character as its classifier
146		/// result.
147	66.5M	bool HasAlnumChoice(const UNICHARSET &unicharset) {
148	66.5M	if (curr_b == nullptr) {
149	0	return false;
150	0	}
151	66.5M	UNICHAR_ID unichar_id = curr_b->unichar_id();
152	66.5M	if (unicharset.get_isalpha(unichar_id) \|\| unicharset.get_isdigit(unichar_id)) {
153	35.9M	return true;
154	35.9M	}
155	30.6M	return false;
156	66.5M	}
157		void Print(const char *msg) const;
158
159		/// Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
160		BLOB_CHOICE *curr_b;
161		ViterbiStateEntry *parent_vse;
162		/// Pointer to a case-competing ViterbiStateEntry in the same list that
163		/// represents a path ending in the same letter of the opposite case.
164		ViterbiStateEntry *competing_vse;
165
166		/// Extra information maintained by Dawg language model component
167		/// (owned by ViterbiStateEntry).
168		LanguageModelDawgInfo *dawg_info;
169
170		/// Extra information maintained by Ngram language model component
171		/// (owned by ViterbiStateEntry).
172		LanguageModelNgramInfo *ngram_info;
173
174		/// UTF8 string representing the path corresponding to this vse.
175		/// Populated only in when language_model_debug_level > 0.
176		std::string *debug_str;
177
178		/// The cost is an adjusted ratings sum, that is adjusted by all the language
179		/// model components that use Viterbi search.
180		float cost;
181
182		/// Various information about the characters on the path represented
183		/// by this ViterbiStateEntry.
184		float ratings_sum; ///< sum of ratings of character on the path
185		float min_certainty; ///< minimum certainty on the path
186		int adapted; ///< number of BLOB_CHOICES from adapted templates
187		int length; ///< number of characters on the path
188		float outline_length; ///< length of the outline so far
189		LMConsistencyInfo consistency_info; ///< path consistency info
190		AssociateStats associate_stats; ///< character widths/gaps/seams
191
192		/// Flags for marking the entry as a top choice path with
193		/// the smallest rating or lower/upper case letters).
194		LanguageModelFlagsType top_choice_flags;
195
196		bool updated; ///< set to true if the entry has just been created/updated
197		};
198
199		ELISTIZEH(ViterbiStateEntry)
200
201		/// Struct to store information maintained by various language model components.
202		struct LanguageModelState {
203		LanguageModelState()
204	814k	: viterbi_state_entries_prunable_length(0)
205		, viterbi_state_entries_prunable_max_cost(FLT_MAX)
206	814k	, viterbi_state_entries_length(0) {}
207	814k	~LanguageModelState() = default;
208
209		/// Clears the viterbi search state back to its initial conditions.
210		void Clear();
211
212		void Print(const char *msg);
213
214		/// Storage for the Viterbi state.
215		ViterbiStateEntry_LIST viterbi_state_entries;
216		/// Number and max cost of prunable paths in viterbi_state_entries.
217		int viterbi_state_entries_prunable_length;
218		float viterbi_state_entries_prunable_max_cost;
219		/// Total number of entries in viterbi_state_entries.
220		int viterbi_state_entries_length;
221		};
222
223		/// Bundle together all the things pertaining to the best choice/state.
224		struct BestChoiceBundle {
225	155k	explicit BestChoiceBundle(int matrix_dimension) : updated(false), best_vse(nullptr) {
226	155k	beam.reserve(matrix_dimension);
227	762k	for (int i = 0; i < matrix_dimension; ++i) {
228	606k	beam.push_back(new LanguageModelState);
229	606k	}
230	155k	}
231	155k	~BestChoiceBundle() {
232	814k	for (auto &state : beam) {
233	814k	delete state;
234	814k	}
235	155k	}
236
237		/// Flag to indicate whether anything was changed.
238		bool updated;
239		/// Places to try to fix the word suggested by ambiguity checking.
240		DANGERR fixpt;
241		/// The beam. One LanguageModelState containing a list of ViterbiStateEntry
242		/// per row in the ratings matrix containing all VSEs whose BLOB_CHOICE is
243		/// somewhere in the corresponding row.
244		std::vector<LanguageModelState *> beam;
245		/// Best ViterbiStateEntry and BLOB_CHOICE.
246		ViterbiStateEntry *best_vse;
247		};
248
249		} // namespace tesseract
250
251		#endif // TESSERACT_WORDREC_LANGUAGE_MODEL_DEFS_H_