/src/tesseract/src/ccstruct/blamer.h
Line | Count | Source |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: blamer.h |
3 | | // Description: Module allowing precise error causes to be allocated. |
4 | | // Author: Rike Antonova |
5 | | // Refactored: Ray Smith |
6 | | // |
7 | | // (C) Copyright 2013, Google Inc. |
8 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | // you may not use this file except in compliance with the License. |
10 | | // You may obtain a copy of the License at |
11 | | // http://www.apache.org/licenses/LICENSE-2.0 |
12 | | // Unless required by applicable law or agreed to in writing, software |
13 | | // distributed under the License is distributed on an "AS IS" BASIS, |
14 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | // See the License for the specific language governing permissions and |
16 | | // limitations under the License. |
17 | | // |
18 | | /////////////////////////////////////////////////////////////////////// |
19 | | |
20 | | #ifndef TESSERACT_CCSTRUCT_BLAMER_H_ |
21 | | #define TESSERACT_CCSTRUCT_BLAMER_H_ |
22 | | |
23 | | #ifdef HAVE_CONFIG_H |
24 | | # include "config_auto.h" // DISABLED_LEGACY_ENGINE |
25 | | #endif |
26 | | #include "boxword.h" // for BoxWord |
27 | | #ifndef DISABLED_LEGACY_ENGINE |
28 | | # include "params_training_featdef.h" // for ParamsTrainingBundle, ParamsTra... |
29 | | #endif // ndef DISABLED_LEGACY_ENGINE |
30 | | #include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only) |
31 | | #include "rect.h" // for TBOX |
32 | | #include "tprintf.h" // for tprintf |
33 | | |
34 | | #include <tesseract/unichar.h> // for UNICHAR_ID |
35 | | |
36 | | #include <cstdint> // for int16_t |
37 | | #include <cstring> // for memcpy |
38 | | #include <vector> // for std::vector |
39 | | |
40 | | namespace tesseract { |
41 | | |
// Forward declarations of types that this header refers to by pointer or
// reference in function signatures.
class DENORM;
class LMPainPoints;
class MATRIX;
class UNICHARSET;
class WERD_RES;

struct MATRIX_COORD;
struct TWERD;

// Blamer bounding-box tolerance. Declared constexpr so it is guaranteed to be
// a compile-time constant in every translation unit that includes this header.
// NOTE(review): presumably the slack allowed when matching boxes against the
// truth boxes; the uses are outside this header - confirm.
static constexpr int16_t kBlamerBoxTolerance = 5;
53 | | |
// Identifies the component that is blamed for an incorrect result.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
enum IncorrectResultReason {
  // The text recorded in the best choice equals the truth text.
  IRR_CORRECT = 0,
  // Either: the top choice is incorrect and is a dictionary word (the
  // language model is unlikely to help correct such errors, so the classifier
  // is blamed).
  // Or: the correct unichar was not included at all in the shortlist produced
  // by the classifier.
  IRR_CLASSIFIER,
  // The chopper has not found one or more splits that correspond to the
  // correct character bounding boxes recorded in BlamerBundle::truth_word.
  IRR_CHOPPER,
  // The classifier did include the correct unichars for each blob in the
  // correct segmentation, but its rating could have been too bad to allow the
  // language model to pull out the correct choice. On the other hand, the
  // language model might have been too weak to favor the correct answer.
  // This case is therefore called a classifier-language model tradeoff error.
  IRR_CLASS_LM_TRADEOFF,
  // Page layout failed to produce the correct bounding box. Page layout is
  // blamed if the truth was not found for the word, which implies that the
  // bounding box of the word was incorrect (no truth word had a similar
  // bounding box).
  IRR_PAGE_LAYOUT,
  // A SegSearch heuristic prevented one or more blobs of the correct
  // segmentation state from being classified (e.g. the blob was too wide).
  IRR_SEGSEARCH_HEUR,
  // The correct segmentation state was not explored because of poor SegSearch
  // pain point prioritization. This is blamed if the best rating of a choice
  // constructed from the correct segmentation is better than that of the best
  // choice (i.e. had the correct segmentation state been explored, the
  // language model would have picked the correct choice).
  IRR_SEGSEARCH_PP,
  // Same as IRR_CLASS_LM_TRADEOFF, but used when only the chopper is run on a
  // word, and thus the old language model (permuters) is used.
  // TODO(antonova): integrate the new language model with chopper
  IRR_CLASS_OLD_LM_TRADEOFF,
  // An incorrect adaptive template match has a better score than a correct
  // one (either pre-trained or adapted); marked as an adaption error.
  IRR_ADAPTION,
  // split_and_recog_word() failed to find a suitable split in truth.
  IRR_NO_TRUTH_SPLIT,
  // Truth is not available for this word (e.g. when words in the corrected
  // content file are turned into ~~~~ because an appropriate alignment was
  // not found).
  IRR_NO_TRUTH,
  // The text recorded in the best choice differs from the truth text, but
  // none of the above reasons are set.
  IRR_UNKNOWN,

  // Number of reasons listed above; keep this as the final enumerator.
  IRR_NUM_REASONS
};
105 | | |
106 | | // Blamer-related information to determine the source of errors. |
107 | | struct BlamerBundle { |
108 | | static const char *IncorrectReasonName(IncorrectResultReason irr); |
109 | | BlamerBundle() |
110 | 0 | : truth_has_char_boxes_(false) |
111 | 0 | , incorrect_result_reason_(IRR_CORRECT) |
112 | 0 | , lattice_data_(nullptr) { |
113 | 0 | ClearResults(); |
114 | 0 | } |
115 | 0 | BlamerBundle(const BlamerBundle &other) { |
116 | 0 | this->CopyTruth(other); |
117 | 0 | this->CopyResults(other); |
118 | 0 | } |
119 | 0 | ~BlamerBundle() { |
120 | 0 | delete[] lattice_data_; |
121 | 0 | } |
122 | | |
123 | | // Accessors. |
124 | 0 | std::string TruthString() const { |
125 | 0 | std::string truth_str; |
126 | 0 | for (auto &text : truth_text_) { |
127 | 0 | truth_str += text; |
128 | 0 | } |
129 | 0 | return truth_str; |
130 | 0 | } |
131 | 0 | IncorrectResultReason incorrect_result_reason() const { |
132 | 0 | return incorrect_result_reason_; |
133 | 0 | } |
134 | 0 | bool NoTruth() const { |
135 | 0 | return incorrect_result_reason_ == IRR_NO_TRUTH || incorrect_result_reason_ == IRR_PAGE_LAYOUT; |
136 | 0 | } |
137 | 0 | bool HasDebugInfo() const { |
138 | 0 | return debug_.length() > 0 || misadaption_debug_.length() > 0; |
139 | 0 | } |
140 | 0 | const std::string &debug() const { |
141 | 0 | return debug_; |
142 | 0 | } |
143 | 0 | const std::string &misadaption_debug() const { |
144 | 0 | return misadaption_debug_; |
145 | 0 | } |
146 | 0 | void UpdateBestRating(float rating) { |
147 | 0 | if (rating < best_correctly_segmented_rating_) { |
148 | 0 | best_correctly_segmented_rating_ = rating; |
149 | 0 | } |
150 | 0 | } |
151 | 0 | int correct_segmentation_length() const { |
152 | 0 | return correct_segmentation_cols_.size(); |
153 | 0 | } |
154 | | // Returns true if the given ratings matrix col,row position is included |
155 | | // in the correct segmentation path at the given index. |
156 | 0 | bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord) { |
157 | 0 | return correct_segmentation_cols_[index] == coord.col && |
158 | 0 | correct_segmentation_rows_[index] == coord.row; |
159 | 0 | } |
160 | 0 | void set_best_choice_is_dict_and_top_choice(bool value) { |
161 | 0 | best_choice_is_dict_and_top_choice_ = value; |
162 | 0 | } |
163 | 0 | const char *lattice_data() const { |
164 | 0 | return lattice_data_; |
165 | 0 | } |
166 | 0 | int lattice_size() const { |
167 | 0 | return lattice_size_; // size of lattice_data in bytes |
168 | 0 | } |
169 | 0 | void set_lattice_data(const char *data, int size) { |
170 | 0 | lattice_size_ = size; |
171 | 0 | delete[] lattice_data_; |
172 | 0 | lattice_data_ = new char[lattice_size_]; |
173 | 0 | memcpy(lattice_data_, data, lattice_size_); |
174 | 0 | } |
175 | | #ifndef DISABLED_LEGACY_ENGINE |
176 | 0 | const tesseract::ParamsTrainingBundle ¶ms_training_bundle() const { |
177 | 0 | return params_training_bundle_; |
178 | 0 | } |
179 | | // Adds a new ParamsTrainingHypothesis to the current hypothesis list. |
180 | 0 | void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo) { |
181 | 0 | params_training_bundle_.AddHypothesis(hypo); |
182 | 0 | } |
183 | | #endif // ndef DISABLED_LEGACY_ENGINE |
184 | | |
185 | | // Functions to setup the blamer. |
186 | | // Whole word string, whole word bounding box. |
187 | | void SetWordTruth(const UNICHARSET &unicharset, const char *truth_str, const TBOX &word_box); |
188 | | // Single "character" string, "character" bounding box. |
189 | | // May be called multiple times to indicate the characters in a word. |
190 | | void SetSymbolTruth(const UNICHARSET &unicharset, const char *char_str, const TBOX &char_box); |
191 | | // Marks that there is something wrong with the truth text, like it contains |
192 | | // reject characters. |
193 | | void SetRejectedTruth(); |
194 | | |
195 | | // Returns true if the provided word_choice is correct. |
196 | | bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const; |
197 | | |
198 | 0 | void ClearResults() { |
199 | 0 | norm_truth_word_.DeleteAllBoxes(); |
200 | 0 | norm_box_tolerance_ = 0; |
201 | 0 | if (!NoTruth()) { |
202 | 0 | incorrect_result_reason_ = IRR_CORRECT; |
203 | 0 | } |
204 | 0 | debug_ = ""; |
205 | 0 | segsearch_is_looking_for_blame_ = false; |
206 | 0 | best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating; |
207 | 0 | correct_segmentation_cols_.clear(); |
208 | 0 | correct_segmentation_rows_.clear(); |
209 | 0 | best_choice_is_dict_and_top_choice_ = false; |
210 | 0 | delete[] lattice_data_; |
211 | 0 | lattice_data_ = nullptr; |
212 | 0 | lattice_size_ = 0; |
213 | 0 | } |
214 | 0 | void CopyTruth(const BlamerBundle &other) { |
215 | 0 | truth_has_char_boxes_ = other.truth_has_char_boxes_; |
216 | 0 | truth_word_ = other.truth_word_; |
217 | 0 | truth_text_ = other.truth_text_; |
218 | 0 | incorrect_result_reason_ = (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT); |
219 | 0 | } |
220 | 0 | void CopyResults(const BlamerBundle &other) { |
221 | 0 | norm_truth_word_ = other.norm_truth_word_; |
222 | 0 | norm_box_tolerance_ = other.norm_box_tolerance_; |
223 | 0 | incorrect_result_reason_ = other.incorrect_result_reason_; |
224 | 0 | segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_; |
225 | 0 | best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_; |
226 | 0 | correct_segmentation_cols_ = other.correct_segmentation_cols_; |
227 | 0 | correct_segmentation_rows_ = other.correct_segmentation_rows_; |
228 | 0 | best_choice_is_dict_and_top_choice_ = other.best_choice_is_dict_and_top_choice_; |
229 | 0 | if (other.lattice_data_ != nullptr) { |
230 | 0 | lattice_data_ = new char[other.lattice_size_]; |
231 | 0 | memcpy(lattice_data_, other.lattice_data_, other.lattice_size_); |
232 | 0 | lattice_size_ = other.lattice_size_; |
233 | 0 | } else { |
234 | 0 | lattice_data_ = nullptr; |
235 | 0 | } |
236 | 0 | } |
237 | | const char *IncorrectReason() const; |
238 | | |
239 | | // Appends choice and truth details to the given debug string. |
240 | | void FillDebugString(const std::string &msg, const WERD_CHOICE *choice, std::string &debug); |
241 | | |
242 | | // Sets up the norm_truth_word from truth_word using the given DENORM. |
243 | | void SetupNormTruthWord(const DENORM &denorm); |
244 | | |
245 | | // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty |
246 | | // bundles) where the right edge/ of the left-hand word is word1_right, |
247 | | // and the left edge of the right-hand word is word2_left. |
248 | | void SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle *bundle1, |
249 | | BlamerBundle *bundle2) const; |
250 | | // "Joins" the blames from bundle1 and bundle2 into *this. |
251 | | void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug); |
252 | | |
253 | | // If a blob with the same bounding box as one of the truth character |
254 | | // bounding boxes is not classified as the corresponding truth character |
255 | | // blames character classifier for incorrect answer. |
256 | | void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, |
257 | | const BLOB_CHOICE_LIST &choices, bool debug); |
258 | | |
259 | | // Checks whether chops were made at all the character bounding box |
260 | | // boundaries in word->truth_word. If not - blames the chopper for an |
261 | | // incorrect answer. |
262 | | void SetChopperBlame(const WERD_RES *word, bool debug); |
263 | | // Blames the classifier or the language model if, after running only the |
264 | | // chopper, best_choice is incorrect and no blame has been yet set. |
265 | | // Blames the classifier if best_choice is classifier's top choice and is a |
266 | | // dictionary word (i.e. language model could not have helped). |
267 | | // Otherwise, blames the language model (formerly permuter word adjustment). |
268 | | void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, |
269 | | bool valid_permuter, bool debug); |
270 | | // Sets up the correct_segmentation_* to mark the correct bounding boxes. |
271 | | void SetupCorrectSegmentation(const TWERD *word, bool debug); |
272 | | |
273 | | // Returns true if a guided segmentation search is needed. |
274 | | bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const; |
275 | | // Setup ready to guide the segmentation search to the correct segmentation. |
276 | | void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, |
277 | | bool debug, std::string &debug_str, tesseract::LMPainPoints *pain_points, |
278 | | double max_char_wh_ratio, WERD_RES *word_res); |
279 | | // Returns true if the guided segsearch is in progress. |
280 | | bool GuidedSegsearchStillGoing() const; |
281 | | // The segmentation search has ended. Sets the blame appropriately. |
282 | | void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, std::string &debug_str); |
283 | | |
284 | | // If the bundle is null or still does not indicate the correct result, |
285 | | // fix it and use some backup reason for the blame. |
286 | | static void LastChanceBlame(bool debug, WERD_RES *word); |
287 | | |
288 | | // Sets the misadaption debug if this word is incorrect, as this word is |
289 | | // being adapted to. |
290 | | void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug); |
291 | | |
292 | | private: |
293 | | // Copy assignment operator (currently unused, therefore private). |
294 | | BlamerBundle &operator=(const BlamerBundle &other) = delete; |
295 | | void SetBlame(IncorrectResultReason irr, const std::string &msg, const WERD_CHOICE *choice, |
296 | 0 | bool debug) { |
297 | 0 | incorrect_result_reason_ = irr; |
298 | 0 | debug_ = IncorrectReason(); |
299 | 0 | debug_ += " to blame: "; |
300 | 0 | FillDebugString(msg, choice, debug_); |
301 | 0 | if (debug) { |
302 | 0 | tprintf("SetBlame(): %s", debug_.c_str()); |
303 | 0 | } |
304 | 0 | } |
305 | | |
306 | | private: |
307 | | // Set to true when bounding boxes for individual unichars are recorded. |
308 | | bool truth_has_char_boxes_; |
309 | | // Variables used by the segmentation search when looking for the blame. |
310 | | // Set to true while segmentation search is continued after the usual |
311 | | // termination condition in order to look for the blame. |
312 | | bool segsearch_is_looking_for_blame_; |
313 | | // Set to true if best choice is a dictionary word and |
314 | | // classifier's top choice. |
315 | | bool best_choice_is_dict_and_top_choice_; |
316 | | // Tolerance for bounding box comparisons in normalized space. |
317 | | int norm_box_tolerance_; |
318 | | // The true_word (in the original image coordinate space) contains ground |
319 | | // truth bounding boxes for this WERD_RES. |
320 | | tesseract::BoxWord truth_word_; |
321 | | // Same as above, but in normalized coordinates |
322 | | // (filled in by WERD_RES::SetupForRecognition()). |
323 | | tesseract::BoxWord norm_truth_word_; |
324 | | // Contains ground truth unichar for each of the bounding boxes in truth_word. |
325 | | std::vector<std::string> truth_text_; |
326 | | // The reason for incorrect OCR result. |
327 | | IncorrectResultReason incorrect_result_reason_; |
328 | | // Debug text associated with the blame. |
329 | | std::string debug_; |
330 | | // Misadaption debug information (filled in if this word was misadapted to). |
331 | | std::string misadaption_debug_; |
332 | | // Vectors populated by SegSearch to indicate column and row indices that |
333 | | // correspond to blobs with correct bounding boxes. |
334 | | std::vector<int> correct_segmentation_cols_; |
335 | | std::vector<int> correct_segmentation_rows_; |
336 | | // Best rating for correctly segmented path |
337 | | // (set and used by SegSearch when looking for blame). |
338 | | float best_correctly_segmented_rating_; |
339 | | int lattice_size_; // size of lattice_data in bytes |
340 | | // Serialized segmentation search lattice. |
341 | | char *lattice_data_; |
342 | | // Information about hypotheses (paths) explored by the segmentation search. |
343 | | #ifndef DISABLED_LEGACY_ENGINE |
344 | | tesseract::ParamsTrainingBundle params_training_bundle_; |
345 | | #endif // ndef DISABLED_LEGACY_ENGINE |
346 | | }; |
347 | | |
348 | | } // namespace tesseract |
349 | | |
350 | | #endif // TESSERACT_CCSTRUCT_BLAMER_H_ |