/src/tesseract/src/classify/classify.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: classify.cpp |
3 | | // Description: classify class. |
4 | | // Author: Samuel Charron |
5 | | // |
6 | | // (C) Copyright 2006, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #include "classify.h" |
20 | | |
21 | | #ifdef DISABLED_LEGACY_ENGINE |
22 | | |
23 | | # include <string.h> |
24 | | |
25 | | namespace tesseract { |
26 | | |
27 | | Classify::Classify() |
28 | | : INT_MEMBER(classify_debug_level, 0, "Classify debug level", this->params()) |
29 | | , |
30 | | |
31 | | BOOL_MEMBER(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9].", this->params()) |
32 | | , |
33 | | |
34 | | double_MEMBER(classify_max_rating_ratio, 1.5, "Veto ratio between classifier ratings", |
35 | | this->params()) |
36 | | , |
37 | | |
38 | | double_MEMBER(classify_max_certainty_margin, 5.5, |
39 | | "Veto difference between classifier certainties", this->params()) |
40 | | , |
41 | | |
42 | | dict_(this) {} |
43 | | |
44 | | Classify::~Classify() {} |
45 | | |
46 | | } // namespace tesseract |
47 | | |
48 | | #else // DISABLED_LEGACY_ENGINE not defined |
49 | | |
50 | | # include <cstring> |
51 | | # include "fontinfo.h" |
52 | | # include "intproto.h" |
53 | | # include "mfoutline.h" |
54 | | # include "scrollview.h" |
55 | | # include "shapeclassifier.h" |
56 | | # include "shapetable.h" |
57 | | # include "unicity_table.h" |
58 | | |
59 | | namespace tesseract { |
60 | | Classify::Classify() |
61 | 2 | : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping", this->params()) |
62 | 2 | , BOOL_MEMBER(prioritize_division, false, "Prioritize blob division over chopping", |
63 | | this->params()) |
64 | 2 | , BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier", this->params()) |
65 | 2 | , INT_MEMBER(classify_debug_level, 0, "Classify debug level", this->params()) |
66 | 2 | , INT_MEMBER(classify_norm_method, character, "Normalization Method ...", this->params()) |
67 | 2 | , double_MEMBER(classify_char_norm_range, 0.2, "Character Normalization Range ...", |
68 | | this->params()) |
69 | 2 | , double_MEMBER(classify_max_rating_ratio, 1.5, "Veto ratio between classifier ratings", |
70 | | this->params()) |
71 | 2 | , double_MEMBER(classify_max_certainty_margin, 5.5, |
72 | | "Veto difference between classifier certainties", this->params()) |
73 | 2 | , BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching", this->params()) |
74 | 2 | , BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching", this->params()) |
75 | 2 | , BOOL_MEMBER(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier", this->params()) |
76 | 2 | , BOOL_MEMBER(classify_use_pre_adapted_templates, 0, "Use pre-adapted classifier templates", |
77 | | this->params()) |
78 | 2 | , BOOL_MEMBER(classify_save_adapted_templates, 0, "Save adapted templates to a file", |
79 | | this->params()) |
80 | 2 | , BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger", this->params()) |
81 | 2 | , BOOL_MEMBER(classify_nonlinear_norm, 0, "Non-linear stroke-density normalization", |
82 | | this->params()) |
83 | 2 | , INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()) |
84 | 2 | , INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()) |
85 | 2 | , INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ", this->params()) |
86 | 2 | , double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)", this->params()) |
87 | 2 | , double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)", this->params()) |
88 | 2 | , double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)", this->params()) |
89 | 2 | , double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)", this->params()) |
90 | 2 | , double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)", this->params()) |
91 | 2 | , double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length", this->params()) |
92 | 2 | , INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes", this->params()) |
93 | 2 | , INT_MEMBER(matcher_min_examples_for_prototyping, 3, "Reliable Config Threshold", |
94 | | this->params()) |
95 | 2 | , INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5, |
96 | | "Enable adaption even if the ambiguities have not been seen", this->params()) |
97 | 2 | , double_MEMBER(matcher_clustering_max_angle_delta, 0.015, |
98 | | "Maximum angle delta for prototype clustering", this->params()) |
99 | 2 | , double_MEMBER(classify_misfit_junk_penalty, 0.0, |
100 | | "Penalty to apply when a non-alnum is vertically out of " |
101 | | "its expected textline position", |
102 | | this->params()) |
103 | 2 | , double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()) |
104 | 2 | , double_MEMBER(tessedit_class_miss_scale, 0.00390625, "Scale factor for features not used", |
105 | | this->params()) |
106 | 2 | , double_MEMBER(classify_adapted_pruning_factor, 2.5, |
107 | | "Prune poor adapted results this much worse than best result", this->params()) |
108 | 2 | , double_MEMBER(classify_adapted_pruning_threshold, -1.0, |
109 | | "Threshold at which classify_adapted_pruning_factor starts", this->params()) |
110 | 2 | , INT_MEMBER(classify_adapt_proto_threshold, 230, |
111 | | "Threshold for good protos during adaptive 0-255", this->params()) |
112 | 2 | , INT_MEMBER(classify_adapt_feature_threshold, 230, |
113 | | "Threshold for good features during adaptive 0-255", this->params()) |
114 | 2 | , BOOL_MEMBER(disable_character_fragments, true, |
115 | | "Do not include character fragments in the" |
116 | | " results of the classifier", |
117 | | this->params()) |
118 | 2 | , double_MEMBER(classify_character_fragments_garbage_certainty_threshold, -3.0, |
119 | | "Exclude fragments that do not look like whole" |
120 | | " characters from training and adaption", |
121 | | this->params()) |
122 | 2 | , BOOL_MEMBER(classify_debug_character_fragments, false, |
123 | | "Bring up graphical debugging windows for fragments training", this->params()) |
124 | 2 | , BOOL_MEMBER(matcher_debug_separate_windows, false, |
125 | | "Use two different windows for debugging the matching: " |
126 | | "One for the protos and one for the features.", |
127 | | this->params()) |
128 | 2 | , STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning", this->params()) |
129 | 2 | , INT_MEMBER(classify_class_pruner_threshold, 229, "Class Pruner Threshold 0-255", |
130 | | this->params()) |
131 | 2 | , INT_MEMBER(classify_class_pruner_multiplier, 15, |
132 | | "Class Pruner Multiplier 0-255: ", this->params()) |
133 | 2 | , INT_MEMBER(classify_cp_cutoff_strength, 7, |
134 | | "Class Pruner CutoffStrength: ", this->params()) |
135 | 2 | , INT_MEMBER(classify_integer_matcher_multiplier, 10, |
136 | | "Integer Matcher Multiplier 0-255: ", this->params()) |
137 | 2 | , BOOL_MEMBER(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9].", |
138 | | this->params()) |
139 | 2 | , double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size", this->params()) |
140 | 2 | , double_MEMBER(speckle_rating_penalty, 10.0, "Penalty to add to worst rating for noise", |
141 | | this->params()) |
142 | 2 | , im_(&classify_debug_level) |
143 | 2 | , dict_(this) { |
144 | 2 | using namespace std::placeholders; // for _1, _2 |
145 | 2 | fontinfo_table_.set_clear_callback(std::bind(FontInfoDeleteCallback, _1)); |
146 | | |
147 | 2 | InitFeatureDefs(&feature_defs_); |
148 | 2 | } |
149 | | |
150 | 0 | Classify::~Classify() { |
151 | 0 | EndAdaptiveClassifier(); |
152 | | #ifndef GRAPHICS_DISABLED |
153 | | delete learn_debug_win_; |
154 | | delete learn_fragmented_word_debug_win_; |
155 | | delete learn_fragments_debug_win_; |
156 | | #endif |
157 | 0 | } |
158 | | |
159 | | // Takes ownership of the given classifier, and uses it for future calls |
160 | | // to CharNormClassifier. |
161 | 0 | void Classify::SetStaticClassifier(ShapeClassifier *static_classifier) { |
162 | 0 | delete static_classifier_; |
163 | 0 | static_classifier_ = static_classifier; |
164 | 0 | } |
165 | | |
166 | | // Moved from speckle.cpp |
167 | | // Adds a noise classification result that is a bit worse than the worst |
168 | | // current result, or the worst possible result if no current results. |
169 | 34.8k | void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) { |
170 | 34.8k | BLOB_CHOICE_IT bc_it(choices); |
171 | | // If there is no classifier result, we will use the worst possible certainty |
172 | | // and corresponding rating. |
173 | 34.8k | float certainty = -getDict().certainty_scale; |
174 | 34.8k | float rating = rating_scale * blob_length; |
175 | 34.8k | if (!choices->empty() && blob_length > 0) { |
176 | 34.8k | bc_it.move_to_last(); |
177 | 34.8k | BLOB_CHOICE *worst_choice = bc_it.data(); |
178 | | // Add speckle_rating_penalty to worst rating, matching old value. |
179 | 34.8k | rating = worst_choice->rating() + speckle_rating_penalty; |
180 | | // Compute the rating to correspond to the certainty. (Used to be kept |
181 | | // the same, but that messes up the language model search.) |
182 | 34.8k | certainty = -rating * getDict().certainty_scale / (rating_scale * blob_length); |
183 | 34.8k | } |
184 | 34.8k | auto *blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty, -1, 0.0f, FLT_MAX, 0, |
185 | 34.8k | BCC_SPECKLE_CLASSIFIER); |
186 | 34.8k | bc_it.add_to_end(blob_choice); |
187 | 34.8k | } |
188 | | |
189 | | // Returns true if the blob is small enough to be a large speckle. |
190 | 755k | bool Classify::LargeSpeckle(const TBLOB &blob) { |
191 | 755k | double speckle_size = kBlnXHeight * speckle_large_max_size; |
192 | 755k | TBOX bbox = blob.bounding_box(); |
193 | 755k | return bbox.width() < speckle_size && bbox.height() < speckle_size; |
194 | 755k | } |
195 | | |
196 | | } // namespace tesseract |
197 | | |
198 | | #endif // def DISABLED_LEGACY_ENGINE |