/src/tesseract/src/ccmain/fixxht.cpp
Line | Count | Source |
1 | | /********************************************************************** |
2 | | * File: fixxht.cpp (Formerly fixxht.c) |
3 | | * Description: Improve x_ht and look out for case inconsistencies |
4 | | * Author: Phil Cheatle |
5 | | * Created: Thu Aug 5 14:11:08 BST 1993 |
6 | | * |
7 | | * (C) Copyright 1992, Hewlett-Packard Ltd. |
8 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | ** you may not use this file except in compliance with the License. |
10 | | ** You may obtain a copy of the License at |
11 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
12 | | ** Unless required by applicable law or agreed to in writing, software |
13 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
14 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | ** See the License for the specific language governing permissions and |
16 | | ** limitations under the License. |
17 | | * |
18 | | **********************************************************************/ |
19 | | |
20 | | #include "float2int.h" |
21 | | #include "params.h" |
22 | | #include "tesseractclass.h" |
23 | | |
24 | | #include <algorithm> |
25 | | #include <cctype> |
26 | | #include <cmath> |
27 | | #include <cstring> |
28 | | |
29 | | namespace tesseract { |
30 | | |
31 | | // Fixxht overview. |
32 | | // Premise: Initial estimate of x-height is adequate most of the time, but |
33 | | // occasionally it is incorrect. Most notable causes of failure are: |
34 | | // 1. Small caps, where the top of the caps is the same as the body text |
35 | | // xheight. For small caps words the xheight needs to be reduced to correctly |
36 | | // recognize the caps in the small caps word. |
37 | | // 2. All xheight lines, such as summer. Here the initial estimate will have |
38 | | // guessed that the blob tops are caps and will have placed the xheight too low. |
39 | | // 3. Noise/logos beside words, or changes in font size on a line. Such |
40 | | // things can blow the statistics and cause an incorrect estimate. |
41 | | // 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged. |
42 | | // In this case the x-height is often still correct. |
43 | | // |
44 | | // Algorithm. |
45 | | // Compare the vertical position (top only) of alphnumerics in a word with |
46 | | // the range of positions in training data (in the unicharset). |
47 | | // See CountMisfitTops. If any characters disagree sufficiently with the |
48 | | // initial xheight estimate, then recalculate the xheight, re-run OCR on |
49 | | // the word, and if the number of vertical misfits goes down, along with |
50 | | // either the word rating or certainty, then keep the new xheight. |
51 | | // The new xheight is calculated as follows:ComputeCompatibleXHeight |
52 | | // For each alphanumeric character that has a vertically misplaced top |
53 | | // (a misfit), yet its bottom is within the acceptable range (ie it is not |
54 | | // likely a sub-or super-script) calculate the range of acceptable xheight |
55 | | // positions from its range of tops, and give each value in the range a |
56 | | // number of votes equal to the distance of its top from its acceptance range. |
57 | | // The x-height position with the median of the votes becomes the new |
58 | | // x-height. This assumes that most characters will be correctly recognized |
59 | | // even if the x-height is incorrect. This is not a terrible assumption, but |
60 | | // it is not great. An improvement would be to use a classifier that does |
61 | | // not care about vertical position or scaling at all. |
62 | | // Separately collect stats on shifted baselines and apply the same logic to |
63 | | // computing a best-fit shift to fix the error. If the baseline needs to be |
64 | | // shifted, but the x-height is OK, returns the original x-height along with |
65 | | // the baseline shift to indicate that recognition needs to re-run. |
66 | | |
67 | | // If the max-min top of a unicharset char is bigger than kMaxCharTopRange |
68 | | // then the char top cannot be used to judge misfits or suggest a new top. |
69 | | const int kMaxCharTopRange = 48; |
70 | | |
71 | | // Returns the number of misfit blob tops in this word. |
72 | 72.3k | int Tesseract::CountMisfitTops(WERD_RES *word_res) { |
73 | 72.3k | int bad_blobs = 0; |
74 | 72.3k | int num_blobs = word_res->rebuild_word->NumBlobs(); |
75 | 391k | for (int blob_id = 0; blob_id < num_blobs; ++blob_id) { |
76 | 318k | TBLOB *blob = word_res->rebuild_word->blobs[blob_id]; |
77 | 318k | UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); |
78 | 318k | if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { |
79 | 196k | int top = blob->bounding_box().top(); |
80 | 196k | if (top >= INT_FEAT_RANGE) { |
81 | 55.6k | top = INT_FEAT_RANGE - 1; |
82 | 55.6k | } |
83 | 196k | int min_bottom, max_bottom, min_top, max_top; |
84 | 196k | unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top); |
85 | 196k | if (max_top - min_top > kMaxCharTopRange) { |
86 | 4.39k | continue; |
87 | 4.39k | } |
88 | 191k | bool bad = |
89 | 191k | top < min_top - x_ht_acceptance_tolerance || top > max_top + x_ht_acceptance_tolerance; |
90 | 191k | if (bad) { |
91 | 88.0k | ++bad_blobs; |
92 | 88.0k | } |
93 | 191k | if (debug_x_ht_level >= 1) { |
94 | 0 | tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n", |
95 | 0 | unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top, |
96 | 0 | static_cast<int>(x_ht_acceptance_tolerance)); |
97 | 0 | } |
98 | 191k | } |
99 | 318k | } |
100 | 72.3k | return bad_blobs; |
101 | 72.3k | } |
102 | | |
103 | | // Returns a new x-height maximally compatible with the result in word_res. |
104 | | // See comment above for overall algorithm. |
105 | 28.0k | float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift) { |
106 | 28.0k | STATS top_stats(0, UINT8_MAX - 1); |
107 | 28.0k | STATS shift_stats(-UINT8_MAX, UINT8_MAX - 1); |
108 | 28.0k | int bottom_shift = 0; |
109 | 28.0k | int num_blobs = word_res->rebuild_word->NumBlobs(); |
110 | 37.0k | do { |
111 | 37.0k | top_stats.clear(); |
112 | 37.0k | shift_stats.clear(); |
113 | 233k | for (int blob_id = 0; blob_id < num_blobs; ++blob_id) { |
114 | 196k | TBLOB *blob = word_res->rebuild_word->blobs[blob_id]; |
115 | 196k | UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); |
116 | 196k | if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { |
117 | 127k | int top = blob->bounding_box().top() + bottom_shift; |
118 | | // Clip the top to the limit of normalized feature space. |
119 | 127k | if (top >= INT_FEAT_RANGE) { |
120 | 32.0k | top = INT_FEAT_RANGE - 1; |
121 | 32.0k | } |
122 | 127k | int bottom = blob->bounding_box().bottom() + bottom_shift; |
123 | 127k | int min_bottom, max_bottom, min_top, max_top; |
124 | 127k | unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top); |
125 | | // Chars with a wild top range would mess up the result so ignore them. |
126 | 127k | if (max_top - min_top > kMaxCharTopRange) { |
127 | 1.03k | continue; |
128 | 1.03k | } |
129 | 126k | int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top, |
130 | 126k | top - (max_top + x_ht_acceptance_tolerance)); |
131 | 126k | int height = top - kBlnBaselineOffset; |
132 | 126k | if (debug_x_ht_level >= 2) { |
133 | 0 | tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ", |
134 | 0 | unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top, |
135 | 0 | max_top, bottom, top); |
136 | 0 | } |
137 | | // Use only chars that fit in the expected bottom range, and where |
138 | | // the range of tops is sensibly near the xheight. |
139 | 126k | if (min_bottom <= bottom + x_ht_acceptance_tolerance && |
140 | 89.5k | bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset && |
141 | 61.7k | max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) { |
142 | | // Compute the x-height position using proportionality between the |
143 | | // actual height and expected height. |
144 | 41.4k | int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset); |
145 | 41.4k | int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset); |
146 | 41.4k | if (debug_x_ht_level >= 2) { |
147 | 0 | tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht); |
148 | 0 | } |
149 | | // The range of expected heights gets a vote equal to the distance |
150 | | // of the actual top from the expected top. |
151 | 739k | for (int y = min_xht; y <= max_xht; ++y) { |
152 | 697k | top_stats.add(y, misfit_dist); |
153 | 697k | } |
154 | 84.6k | } else if ((min_bottom > bottom + x_ht_acceptance_tolerance || |
155 | 48.1k | bottom - x_ht_acceptance_tolerance > max_bottom) && |
156 | 64.2k | bottom_shift == 0) { |
157 | | // Get the range of required bottom shift. |
158 | 42.1k | int min_shift = min_bottom - bottom; |
159 | 42.1k | int max_shift = max_bottom - bottom; |
160 | 42.1k | if (debug_x_ht_level >= 2) { |
161 | 0 | tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift); |
162 | 0 | } |
163 | | // The range of expected shifts gets a vote equal to the min distance |
164 | | // of the actual bottom from the expected bottom, spread over the |
165 | | // range of its acceptance. |
166 | 42.1k | int misfit_weight = abs(min_shift); |
167 | 42.1k | if (max_shift > min_shift) { |
168 | 42.1k | misfit_weight /= max_shift - min_shift; |
169 | 42.1k | } |
170 | 738k | for (int y = min_shift; y <= max_shift; ++y) { |
171 | 696k | shift_stats.add(y, misfit_weight); |
172 | 696k | } |
173 | 42.4k | } else { |
174 | 42.4k | if (bottom_shift == 0) { |
175 | | // Things with bottoms that are already ok need to say so, on the |
176 | | // 1st iteration only. |
177 | 13.0k | shift_stats.add(0, kBlnBaselineOffset); |
178 | 13.0k | } |
179 | 42.4k | if (debug_x_ht_level >= 2) { |
180 | 0 | tprintf(" already OK\n"); |
181 | 0 | } |
182 | 42.4k | } |
183 | 126k | } |
184 | 196k | } |
185 | 37.0k | if (shift_stats.get_total() > top_stats.get_total()) { |
186 | 9.41k | bottom_shift = IntCastRounded(shift_stats.median()); |
187 | 9.41k | if (debug_x_ht_level >= 2) { |
188 | 0 | tprintf("Applying bottom shift=%d\n", bottom_shift); |
189 | 0 | } |
190 | 9.41k | } |
191 | 37.0k | } while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total()); |
192 | | // Baseline shift is opposite sign to the bottom shift. |
193 | 28.0k | *baseline_shift = -bottom_shift / word_res->denorm.y_scale(); |
194 | 28.0k | if (debug_x_ht_level >= 2) { |
195 | 0 | tprintf("baseline shift=%g\n", *baseline_shift); |
196 | 0 | } |
197 | 28.0k | if (top_stats.get_total() == 0) { |
198 | 5.85k | return bottom_shift != 0 ? word_res->x_height : 0.0f; |
199 | 5.85k | } |
200 | | // The new xheight is just the median vote, which is then scaled out |
201 | | // of BLN space back to pixel space to get the x-height in pixel space. |
202 | 22.1k | float new_xht = top_stats.median(); |
203 | 22.1k | if (debug_x_ht_level >= 2) { |
204 | 0 | tprintf("Median xht=%f\n", new_xht); |
205 | 0 | tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht, |
206 | 0 | new_xht / word_res->denorm.y_scale()); |
207 | 0 | } |
208 | | // The xheight must change by at least x_ht_min_change to be used. |
209 | 22.1k | if (std::fabs(new_xht - kBlnXHeight) >= x_ht_min_change) { |
210 | 22.1k | return new_xht / word_res->denorm.y_scale(); |
211 | 22.1k | } else { |
212 | 0 | return bottom_shift != 0 ? word_res->x_height : 0.0f; |
213 | 0 | } |
214 | 22.1k | } |
215 | | |
216 | | } // namespace tesseract |