/src/tesseract/src/wordrec/lm_pain_points.h

Source (jump to first uncovered line)
///////////////////////////////////////////////////////////////////////
// File:        lm_pain_points.h
// Description: Functions that utilize the knowledge about the properties
//              of the paths explored by the segmentation search in order
//              to generate "pain points" - the locations in the ratings
//              matrix which should be classified next.
// Author:      Rika Antonova
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_WORDREC_PAIN_POINTS_H_
#define TESSERACT_WORDREC_PAIN_POINTS_H_

#include "genericheap.h" // for GenericHeap
#include "matrix.h"      // for MATRIX_COORD (ptr only), MatrixCoordPair
#include "stopper.h"     // for DANGERR

namespace tesseract {

// Forward declarations: this header only refers to these types through
// pointers, so their full definitions are not needed here.
class Dict;
struct ViterbiStateEntry;
class WERD_RES;

// Heap of pain points used for determining where to chop/join.
// LMPainPoints keeps one such heap per LMPainPointsType.
using PainPointHeap = GenericHeap<MatrixCoordPair>;

// Types of pain points (ordered in the decreasing level of importance).
// The enumerator values are used directly as array indices (see
// LMPainPointsTypeName below), so they must stay contiguous and start at 0.
enum LMPainPointsType {
  LM_PPTYPE_BLAMER,
  LM_PPTYPE_AMBIG,
  LM_PPTYPE_PATH,
  LM_PPTYPE_SHAPE,

  // Number of real pain point types; not a pain point type itself.
  LM_PPTYPE_NUM
};

// Printable name for each LMPainPointsType, indexed by the enumerator value.
// NOTE: the entry for LM_PPTYPE_AMBIG is spelled "LM_PPTYPE_AMBIGS" (with a
// trailing 'S'), unlike the enumerator. It is kept as-is because the string
// is emitted at runtime.
static const char *const LMPainPointsTypeName[] = {
    "LM_PPTYPE_BLAMER",
    "LM_PPTYPE_AMBIGS",
    "LM_PPTYPE_PATH",
    "LM_PPTYPE_SHAPE",
};

// Adding an enumerator without a matching name would make lookups by type
// read past the end of the table; fail the build instead.
static_assert(static_cast<int>(sizeof(LMPainPointsTypeName) / sizeof(LMPainPointsTypeName[0])) ==
                  LM_PPTYPE_NUM,
              "LMPainPointsTypeName must have exactly one entry per LMPainPointsType");

class LMPainPoints {
public:
  static const float kDefaultPainPointPriorityAdjustment;
  // If there is a significant drop in character ngram probability or a
  // dangerous ambiguity make the thresholds on what blob combinations
  // can be classified looser.
  static const float kLooseMaxCharWhRatio;
  // Returns a description of the type of a pain point.
  static const char *PainPointDescription(LMPainPointsType type) {
    return LMPainPointsTypeName[type];
  }

  LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
      : max_heap_size_(max)
      , max_char_wh_ratio_(rat)
      , fixed_pitch_(fp)
      , dict_(d)
      , debug_level_(deb) {}
  ~LMPainPoints() = default;

  // Returns true if the heap of pain points of pp_type is not empty().
  inline bool HasPainPoints(LMPainPointsType pp_type) const {
    return !pain_points_heaps_[pp_type].empty();
  }

  // Dequeues the next pain point from the pain points queue and copies
  // its contents and priority to *pp and *priority.
  // Returns LM_PPTYPE_NUM if pain points queue is empty, otherwise the type.
  LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);

  // Clears pain points heap.
  void Clear() {
    for (auto &pain_points_heap : pain_points_heaps_) {
      pain_points_heap.clear();
    }
  }

  // For each cell, generate a "pain point" if the cell is not classified
  // and has a left or right neighbor that was classified.
  void GenerateInitial(WERD_RES *word_res);

  // Generate pain points from the given path.
  void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res);

  // Generate pain points from dangerous ambiguities in best choice.
  void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res);

  // Adds a pain point to classify chunks_record->ratings(col, row).
  // Returns true if a new pain point was added to an appropriate heap.
  // Pain point priority is set to special_priority for pain points of
  // LM_PPTYPE_AMBIG or LM_PPTYPE_PATH, for other pain points
  // AssociateStats::gap_sum is used.
  bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority,
                         bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res);

  // Adjusts the pain point coordinates to cope with expansion of the ratings
  // matrix due to a split of the blob with the given index.
  void RemapForSplit(int index);

private:
  // Priority queues containing pain points generated by the language model
  // The priority is set by the language model components, adjustments like
  // seam cost and width priority are factored into the priority.
  PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM];
  // Maximum number of points to keep in the heap.
  int max_heap_size_;
  // Maximum character width/height ratio.
  float max_char_wh_ratio_;
  // Set to true if fixed pitch should be assumed.
  bool fixed_pitch_;
  // Cached pointer to dictionary.
  const Dict *dict_;
  // Debug level for print statements.
  int debug_level_;
};

} // namespace tesseract

#endif // TESSERACT_WORDREC_PAIN_POINTS_H_

Line	Count	Source (jump to first uncovered line)
1		///////////////////////////////////////////////////////////////////////
2		// File: lm_pain_points.h
3		// Description: Functions that utilize the knowledge about the properties
4		// of the paths explored by the segmentation search in order
5		// to generate "pain points" - the locations in the ratings
6		// matrix which should be classified next.
7		// Author: Rika Antonova
8		//
9		// (C) Copyright 2012, Google Inc.
10		// Licensed under the Apache License, Version 2.0 (the "License");
11		// you may not use this file except in compliance with the License.
12		// You may obtain a copy of the License at
13		// http://www.apache.org/licenses/LICENSE-2.0
14		// Unless required by applicable law or agreed to in writing, software
15		// distributed under the License is distributed on an "AS IS" BASIS,
16		// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17		// See the License for the specific language governing permissions and
18		// limitations under the License.
19		//
20		///////////////////////////////////////////////////////////////////////
21
22		#ifndef TESSERACT_WORDREC_PAIN_POINTS_H_
23		#define TESSERACT_WORDREC_PAIN_POINTS_H_
24
25		#include "genericheap.h" // for GenericHeap
26		#include "matrix.h" // for MATRIX_COORD (ptr only), MatrixCoordPair
27		#include "stopper.h" // for DANGERR
28
29		namespace tesseract {
30
31		class Dict;
32		struct ViterbiStateEntry;
33		class WERD_RES;
34
35		// Heap of pain points used for determining where to chop/join.
36		using PainPointHeap = GenericHeap<MatrixCoordPair>;
37
38		// Types of pain points (ordered in the decreasing level of importance).
39		enum LMPainPointsType {
40		LM_PPTYPE_BLAMER,
41		LM_PPTYPE_AMBIG,
42		LM_PPTYPE_PATH,
43		LM_PPTYPE_SHAPE,
44
45		LM_PPTYPE_NUM
46		};
47
48		static const char *const LMPainPointsTypeName[] = {
49		"LM_PPTYPE_BLAMER",
50		"LM_PPTYPE_AMBIGS",
51		"LM_PPTYPE_PATH",
52		"LM_PPTYPE_SHAPE",
53		};
54
55		class LMPainPoints {
56		public:
57		static const float kDefaultPainPointPriorityAdjustment;
58		// If there is a significant drop in character ngram probability or a
59		// dangerous ambiguity make the thresholds on what blob combinations
60		// can be classified looser.
61		static const float kLooseMaxCharWhRatio;
62		// Returns a description of the type of a pain point.
63	1.48M	static const char *PainPointDescription(LMPainPointsType type) {
64	1.48M	return LMPainPointsTypeName[type];
65	1.48M	}
66
67		LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
68		: max_heap_size_(max)
69		, max_char_wh_ratio_(rat)
70		, fixed_pitch_(fp)
71		, dict_(d)
72	196k	, debug_level_(deb) {}
73	196k	~LMPainPoints() = default;
74
75		// Returns true if the heap of pain points of pp_type is not empty().
76	0	inline bool HasPainPoints(LMPainPointsType pp_type) const {
77	0	return !pain_points_heaps_[pp_type].empty();
78	0	}
79
80		// Dequeues the next pain point from the pain points queue and copies
81		// its contents and priority to *pp and *priority.
82		// Returns LM_PPTYPE_NUM if pain points queue is empty, otherwise the type.
83		LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);
84
85		// Clears pain points heap.
86	0	void Clear() {
87	0	for (auto &pain_points_heap : pain_points_heaps_) {
88	0	pain_points_heap.clear();
89	0	}
90	0	}
91
92		// For each cell, generate a "pain point" if the cell is not classified
93		// and has a left or right neighbor that was classified.
94		void GenerateInitial(WERD_RES *word_res);
95
96		// Generate pain points from the given path.
97		void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res);
98
99		// Generate pain points from dangerous ambiguities in best choice.
100		void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res);
101
102		// Adds a pain point to classify chunks_record->ratings(col, row).
103		// Returns true if a new pain point was added to an appropriate heap.
104		// Pain point priority is set to special_priority for pain points of
105		// LM_PPTYPE_AMBIG or LM_PPTYPE_PATH, for other pain points
106		// AssociateStats::gap_sum is used.
107		bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority,
108		bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res);
109
110		// Adjusts the pain point coordinates to cope with expansion of the ratings
111		// matrix due to a split of the blob with the given index.
112		void RemapForSplit(int index);
113
114		private:
115		// Priority queues containing pain points generated by the language model
116		// The priority is set by the language model components, adjustments like
117		// seam cost and width priority are factored into the priority.
118		PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM];
119		// Maximum number of points to keep in the heap.
120		int max_heap_size_;
121		// Maximum character width/height ratio.
122		float max_char_wh_ratio_;
123		// Set to true if fixed pitch should be assumed.
124		bool fixed_pitch_;
125		// Cached pointer to dictionary.
126		const Dict *dict_;
127		// Debug level for print statements.
128		int debug_level_;
129		};
130
131		} // namespace tesseract
132
133		#endif // TESSERACT_WORDREC_PAIN_POINTS_H_

Coverage Report

Created: 2024-02-28 06:46