/src/tesseract/src/wordrec/lm_pain_points.h
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: lm_pain_points.h |
3 | | // Description: Functions that utilize the knowledge about the properties |
4 | | // of the paths explored by the segmentation search in order |
5 | | // to generate "pain points" - the locations in the ratings |
6 | | // matrix which should be classified next. |
7 | | // Author: Rika Antonova |
8 | | // |
9 | | // (C) Copyright 2012, Google Inc. |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
11 | | // you may not use this file except in compliance with the License. |
12 | | // You may obtain a copy of the License at |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // Unless required by applicable law or agreed to in writing, software |
15 | | // distributed under the License is distributed on an "AS IS" BASIS, |
16 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
17 | | // See the License for the specific language governing permissions and |
18 | | // limitations under the License. |
19 | | // |
20 | | /////////////////////////////////////////////////////////////////////// |
21 | | |
22 | | #ifndef TESSERACT_WORDREC_PAIN_POINTS_H_ |
23 | | #define TESSERACT_WORDREC_PAIN_POINTS_H_ |
24 | | |
25 | | #include "genericheap.h" // for GenericHeap |
26 | | #include "matrix.h" // for MATRIX_COORD (ptr only), MatrixCoordPair |
27 | | #include "stopper.h" // for DANGERR |
28 | | |
29 | | namespace tesseract { |
30 | | |
// Forward declarations: this header only refers to these types through
// pointers, so their full definitions are not needed here.
class Dict;
struct ViterbiStateEntry;
class WERD_RES;

// Heap of pain points used for determining where to chop/join.
// Each element is a MatrixCoordPair (declared in matrix.h).
using PainPointHeap = GenericHeap<MatrixCoordPair>;
37 | | |
// Types of pain points (ordered in the decreasing level of importance).
// LM_PPTYPE_NUM is not a pain point type: it is the number of real types,
// used to size per-type arrays and as the "queue is empty" result of
// LMPainPoints::Deque().
enum LMPainPointsType {
  LM_PPTYPE_BLAMER,
  LM_PPTYPE_AMBIG,
  LM_PPTYPE_PATH,
  LM_PPTYPE_SHAPE,

  LM_PPTYPE_NUM
};

// Printable names of the pain point types, indexed by LMPainPointsType.
// Each string is the spelling of the corresponding enumerator and the entries
// must stay in the same order as the enum. There is no entry for
// LM_PPTYPE_NUM.
static const char *const LMPainPointsTypeName[] = {
    "LM_PPTYPE_BLAMER",
    "LM_PPTYPE_AMBIG",
    "LM_PPTYPE_PATH",
    "LM_PPTYPE_SHAPE",
};

// Fail the build if a type is added to the enum without a matching name,
// instead of letting lookups read past the end of the table at run time.
static_assert(sizeof(LMPainPointsTypeName) / sizeof(LMPainPointsTypeName[0]) == LM_PPTYPE_NUM,
              "LMPainPointsTypeName must have exactly one entry per LMPainPointsType");
54 | | |
55 | | class LMPainPoints { |
56 | | public: |
57 | | static const float kDefaultPainPointPriorityAdjustment; |
58 | | // If there is a significant drop in character ngram probability or a |
59 | | // dangerous ambiguity make the thresholds on what blob combinations |
60 | | // can be classified looser. |
61 | | static const float kLooseMaxCharWhRatio; |
62 | | // Returns a description of the type of a pain point. |
63 | 1.48M | static const char *PainPointDescription(LMPainPointsType type) { |
64 | 1.48M | return LMPainPointsTypeName[type]; |
65 | 1.48M | } |
66 | | |
67 | | LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb) |
68 | | : max_heap_size_(max) |
69 | | , max_char_wh_ratio_(rat) |
70 | | , fixed_pitch_(fp) |
71 | | , dict_(d) |
72 | 196k | , debug_level_(deb) {} |
73 | 196k | ~LMPainPoints() = default; |
74 | | |
75 | | // Returns true if the heap of pain points of pp_type is not empty(). |
76 | 0 | inline bool HasPainPoints(LMPainPointsType pp_type) const { |
77 | 0 | return !pain_points_heaps_[pp_type].empty(); |
78 | 0 | } |
79 | | |
80 | | // Dequeues the next pain point from the pain points queue and copies |
81 | | // its contents and priority to *pp and *priority. |
82 | | // Returns LM_PPTYPE_NUM if pain points queue is empty, otherwise the type. |
83 | | LMPainPointsType Deque(MATRIX_COORD *pp, float *priority); |
84 | | |
85 | | // Clears pain points heap. |
86 | 0 | void Clear() { |
87 | 0 | for (auto &pain_points_heap : pain_points_heaps_) { |
88 | 0 | pain_points_heap.clear(); |
89 | 0 | } |
90 | 0 | } |
91 | | |
92 | | // For each cell, generate a "pain point" if the cell is not classified |
93 | | // and has a left or right neighbor that was classified. |
94 | | void GenerateInitial(WERD_RES *word_res); |
95 | | |
96 | | // Generate pain points from the given path. |
97 | | void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res); |
98 | | |
99 | | // Generate pain points from dangerous ambiguities in best choice. |
100 | | void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res); |
101 | | |
102 | | // Adds a pain point to classify chunks_record->ratings(col, row). |
103 | | // Returns true if a new pain point was added to an appropriate heap. |
104 | | // Pain point priority is set to special_priority for pain points of |
105 | | // LM_PPTYPE_AMBIG or LM_PPTYPE_PATH, for other pain points |
106 | | // AssociateStats::gap_sum is used. |
107 | | bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, |
108 | | bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res); |
109 | | |
110 | | // Adjusts the pain point coordinates to cope with expansion of the ratings |
111 | | // matrix due to a split of the blob with the given index. |
112 | | void RemapForSplit(int index); |
113 | | |
114 | | private: |
115 | | // Priority queues containing pain points generated by the language model |
116 | | // The priority is set by the language model components, adjustments like |
117 | | // seam cost and width priority are factored into the priority. |
118 | | PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM]; |
119 | | // Maximum number of points to keep in the heap. |
120 | | int max_heap_size_; |
121 | | // Maximum character width/height ratio. |
122 | | float max_char_wh_ratio_; |
123 | | // Set to true if fixed pitch should be assumed. |
124 | | bool fixed_pitch_; |
125 | | // Cached pointer to dictionary. |
126 | | const Dict *dict_; |
127 | | // Debug level for print statements. |
128 | | int debug_level_; |
129 | | }; |
130 | | |
131 | | } // namespace tesseract |
132 | | |
133 | | #endif // TESSERACT_WORDREC_PAIN_POINTS_H_ |