Coverage Report

Created: 2024-02-28 06:46

/src/tesseract/src/wordrec/lm_pain_points.h
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        lm_pain_points.h
3
// Description: Functions that utilize the knowledge about the properties
4
//              of the paths explored by the segmentation search in order
5
//              to generate "pain points" - the locations in the ratings
6
//              matrix which should be classified next.
7
// Author:      Rika Antonova
8
//
9
// (C) Copyright 2012, Google Inc.
10
// Licensed under the Apache License, Version 2.0 (the "License");
11
// you may not use this file except in compliance with the License.
12
// You may obtain a copy of the License at
13
// http://www.apache.org/licenses/LICENSE-2.0
14
// Unless required by applicable law or agreed to in writing, software
15
// distributed under the License is distributed on an "AS IS" BASIS,
16
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
// See the License for the specific language governing permissions and
18
// limitations under the License.
19
//
20
///////////////////////////////////////////////////////////////////////
21
22
#ifndef TESSERACT_WORDREC_PAIN_POINTS_H_
23
#define TESSERACT_WORDREC_PAIN_POINTS_H_
24
25
#include "genericheap.h" // for GenericHeap
26
#include "matrix.h"      // for MATRIX_COORD (ptr only), MatrixCoordPair
27
#include "stopper.h"     // for DANGERR
28
29
namespace tesseract {
30
31
// Forward declarations: only pointers to these types are used in this header.
class Dict;
32
struct ViterbiStateEntry;
33
class WERD_RES;
34
35
// Heap of MatrixCoordPair pain points used for determining where to chop/join.
36
using PainPointHeap = GenericHeap<MatrixCoordPair>;
37
38
// Types of pain points (listed in decreasing order of importance).
38
enum LMPainPointsType {
39
  LM_PPTYPE_BLAMER,
40
  LM_PPTYPE_AMBIG, // GeneratePainPoint() uses special_priority for this type.
41
  LM_PPTYPE_PATH,  // GeneratePainPoint() uses special_priority for this type.
42
  LM_PPTYPE_SHAPE,
43
44
  // Number of real types above: sizes LMPainPoints::pain_points_heaps_ and is
  // the value LMPainPoints::Deque() is documented to return for an empty queue.
  LM_PPTYPE_NUM
45
46
};
47
48
// Printable names of the pain point types, indexed by LMPainPointsType value.
// There are exactly four entries, so there is no entry for LM_PPTYPE_NUM.
static const char *const LMPainPointsTypeName[] = {
49
    "LM_PPTYPE_BLAMER",
50
    "LM_PPTYPE_AMBIGS", // NOTE(review): trailing 'S' differs from enumerator LM_PPTYPE_AMBIG.
51
    "LM_PPTYPE_PATH",
52
    "LM_PPTYPE_SHAPE",
53
};
54
55
// Holds one heap of pain points per LMPainPointsType and declares the
// operations that generate, queue, dequeue and remap those pain points.
class LMPainPoints {
56
public:
57
  static const float kDefaultPainPointPriorityAdjustment;
58
  // If there is a significant drop in character ngram probability or a
59
  // dangerous ambiguity, make the thresholds on what blob combinations
60
  // can be classified looser.
61
  static const float kLooseMaxCharWhRatio;
62
  // Returns the name of a pain point type (unchecked LMPainPointsTypeName lookup, so type must not be LM_PPTYPE_NUM).
63
1.48M
  static const char *PainPointDescription(LMPainPointsType type) {
64
1.48M
    return LMPainPointsTypeName[type];
65
1.48M
  }
66
67
  // Stores the arguments in the members: max -> max_heap_size_,
  // rat -> max_char_wh_ratio_, fp -> fixed_pitch_, d -> dict_ (the pointer
  // itself is kept, nothing is copied), deb -> debug_level_.
  LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
68
      : max_heap_size_(max)
69
      , max_char_wh_ratio_(rat)
70
      , fixed_pitch_(fp)
71
      , dict_(d)
72
196k
      , debug_level_(deb) {}
73
196k
  ~LMPainPoints() = default;
74
75
  // Returns true if the heap of pain points of pp_type is not empty.
76
0
  inline bool HasPainPoints(LMPainPointsType pp_type) const {
77
0
    return !pain_points_heaps_[pp_type].empty();
78
0
  }
79
80
  // Dequeues the next pain point from the pain points queue and copies
81
  // its contents and priority to *pp and *priority.
82
  // Returns LM_PPTYPE_NUM if the pain points queue is empty, otherwise the type.
83
  LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);
84
85
  // Clears every per-type pain points heap.
86
0
  void Clear() {
87
0
    for (auto &pain_points_heap : pain_points_heaps_) {
88
0
      pain_points_heap.clear();
89
0
    }
90
0
  }
91
92
  // For each cell, generate a "pain point" if the cell is not classified
93
  // and has a left or right neighbor that was classified.
94
  void GenerateInitial(WERD_RES *word_res);
95
96
  // Generate pain points from the given path.
97
  void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res);
98
99
  // Generate pain points from dangerous ambiguities in best choice.
100
  void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res);
101
102
  // Adds a pain point to classify chunks_record->ratings(col, row).
103
  // Returns true if a new pain point was added to an appropriate heap.
104
  // Pain point priority is set to special_priority for pain points of
105
  // LM_PPTYPE_AMBIG or LM_PPTYPE_PATH, for other pain points
106
  // AssociateStats::gap_sum is used.
107
  bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority,
108
                         bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res);
109
110
  // Adjusts the pain point coordinates to cope with expansion of the ratings
111
  // matrix due to a split of the blob with the given index.
112
  void RemapForSplit(int index);
113
114
private:
115
  // Priority queues (one per LMPainPointsType) containing pain points generated by the language model.
116
  // The priority is set by the language model components, adjustments like
117
  // seam cost and width priority are factored into the priority.
118
  PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM];
119
  // Maximum number of points to keep in the heap.
120
  int max_heap_size_;
121
  // Maximum character width/height ratio.
122
  float max_char_wh_ratio_;
123
  // Set to true if fixed pitch should be assumed.
124
  bool fixed_pitch_;
125
  // Cached pointer to dictionary.
126
  const Dict *dict_;
127
  // Debug level for print statements.
128
  int debug_level_;
129
};
130
131
} // namespace tesseract
132
133
#endif // TESSERACT_WORDREC_PAIN_POINTS_H_