Coverage Report

Created: 2025-09-27 07:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/ccmain/fixxht.cpp
Line
Count
Source
1
/**********************************************************************
2
 * File:        fixxht.cpp  (Formerly fixxht.c)
3
 * Description: Improve x_ht and look out for case inconsistencies
4
 * Author:      Phil Cheatle
5
 * Created:     Thu Aug  5 14:11:08 BST 1993
6
 *
7
 * (C) Copyright 1992, Hewlett-Packard Ltd.
8
 ** Licensed under the Apache License, Version 2.0 (the "License");
9
 ** you may not use this file except in compliance with the License.
10
 ** You may obtain a copy of the License at
11
 ** http://www.apache.org/licenses/LICENSE-2.0
12
 ** Unless required by applicable law or agreed to in writing, software
13
 ** distributed under the License is distributed on an "AS IS" BASIS,
14
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 ** See the License for the specific language governing permissions and
16
 ** limitations under the License.
17
 *
18
 **********************************************************************/
19
20
#include "float2int.h"
21
#include "params.h"
22
#include "tesseractclass.h"
23
24
#include <algorithm>
25
#include <cctype>
26
#include <cmath>
27
#include <cstring>
28
29
namespace tesseract {
30
31
// Fixxht overview.
32
// Premise: Initial estimate of x-height is adequate most of the time, but
33
// occasionally it is incorrect. Most notable causes of failure are:
34
// 1. Small caps, where the top of the caps is the same as the body text
35
// xheight. For small caps words the xheight needs to be reduced to correctly
36
// recognize the caps in the small caps word.
37
// 2. All xheight lines, such as summer. Here the initial estimate will have
38
// guessed that the blob tops are caps and will have placed the xheight too low.
39
// 3. Noise/logos beside words, or changes in font size on a line. Such
40
// things can blow the statistics and cause an incorrect estimate.
41
// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
42
// In this case the x-height is often still correct.
43
//
44
// Algorithm.
45
// Compare the vertical position (top only) of alphnumerics in a word with
46
// the range of positions in training data (in the unicharset).
47
// See CountMisfitTops. If any characters disagree sufficiently with the
48
// initial xheight estimate, then recalculate the xheight, re-run OCR on
49
// the word, and if the number of vertical misfits goes down, along with
50
// either the word rating or certainty, then keep the new xheight.
51
// The new xheight is calculated as follows:ComputeCompatibleXHeight
52
// For each alphanumeric character that has a vertically misplaced top
53
// (a misfit), yet its bottom is within the acceptable range (ie it is not
54
// likely a sub-or super-script) calculate the range of acceptable xheight
55
// positions from its range of tops, and give each value in the range a
56
// number of votes equal to the distance of its top from its acceptance range.
57
// The x-height position with the median of the votes becomes the new
58
// x-height. This assumes that most characters will be correctly recognized
59
// even if the x-height is incorrect. This is not a terrible assumption, but
60
// it is not great. An improvement would be to use a classifier that does
61
// not care about vertical position or scaling at all.
62
// Separately collect stats on shifted baselines and apply the same logic to
63
// computing a best-fit shift to fix the error. If the baseline needs to be
64
// shifted, but the x-height is OK, returns the original x-height along with
65
// the baseline shift to indicate that recognition needs to re-run.
66
67
// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
68
// then the char top cannot be used to judge misfits or suggest a new top.
69
const int kMaxCharTopRange = 48;
70
71
// Returns the number of misfit blob tops in this word.
72
72.3k
int Tesseract::CountMisfitTops(WERD_RES *word_res) {
73
72.3k
  int bad_blobs = 0;
74
72.3k
  int num_blobs = word_res->rebuild_word->NumBlobs();
75
391k
  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
76
318k
    TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
77
318k
    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
78
318k
    if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
79
196k
      int top = blob->bounding_box().top();
80
196k
      if (top >= INT_FEAT_RANGE) {
81
55.6k
        top = INT_FEAT_RANGE - 1;
82
55.6k
      }
83
196k
      int min_bottom, max_bottom, min_top, max_top;
84
196k
      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
85
196k
      if (max_top - min_top > kMaxCharTopRange) {
86
4.39k
        continue;
87
4.39k
      }
88
191k
      bool bad =
89
191k
          top < min_top - x_ht_acceptance_tolerance || top > max_top + x_ht_acceptance_tolerance;
90
191k
      if (bad) {
91
88.0k
        ++bad_blobs;
92
88.0k
      }
93
191k
      if (debug_x_ht_level >= 1) {
94
0
        tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
95
0
                unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top,
96
0
                static_cast<int>(x_ht_acceptance_tolerance));
97
0
      }
98
191k
    }
99
318k
  }
100
72.3k
  return bad_blobs;
101
72.3k
}
102
103
// Returns a new x-height maximally compatible with the result in word_res.
104
// See comment above for overall algorithm.
105
28.0k
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift) {
106
28.0k
  STATS top_stats(0, UINT8_MAX - 1);
107
28.0k
  STATS shift_stats(-UINT8_MAX, UINT8_MAX - 1);
108
28.0k
  int bottom_shift = 0;
109
28.0k
  int num_blobs = word_res->rebuild_word->NumBlobs();
110
37.0k
  do {
111
37.0k
    top_stats.clear();
112
37.0k
    shift_stats.clear();
113
233k
    for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
114
196k
      TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
115
196k
      UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
116
196k
      if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
117
127k
        int top = blob->bounding_box().top() + bottom_shift;
118
        // Clip the top to the limit of normalized feature space.
119
127k
        if (top >= INT_FEAT_RANGE) {
120
32.0k
          top = INT_FEAT_RANGE - 1;
121
32.0k
        }
122
127k
        int bottom = blob->bounding_box().bottom() + bottom_shift;
123
127k
        int min_bottom, max_bottom, min_top, max_top;
124
127k
        unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
125
        // Chars with a wild top range would mess up the result so ignore them.
126
127k
        if (max_top - min_top > kMaxCharTopRange) {
127
1.03k
          continue;
128
1.03k
        }
129
126k
        int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
130
126k
                                   top - (max_top + x_ht_acceptance_tolerance));
131
126k
        int height = top - kBlnBaselineOffset;
132
126k
        if (debug_x_ht_level >= 2) {
133
0
          tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
134
0
                  unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top,
135
0
                  max_top, bottom, top);
136
0
        }
137
        // Use only chars that fit in the expected bottom range, and where
138
        // the range of tops is sensibly near the xheight.
139
126k
        if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
140
89.5k
            bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset &&
141
61.7k
            max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) {
142
          // Compute the x-height position using proportionality between the
143
          // actual height and expected height.
144
41.4k
          int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset);
145
41.4k
          int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset);
146
41.4k
          if (debug_x_ht_level >= 2) {
147
0
            tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
148
0
          }
149
          // The range of expected heights gets a vote equal to the distance
150
          // of the actual top from the expected top.
151
739k
          for (int y = min_xht; y <= max_xht; ++y) {
152
697k
            top_stats.add(y, misfit_dist);
153
697k
          }
154
84.6k
        } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
155
48.1k
                    bottom - x_ht_acceptance_tolerance > max_bottom) &&
156
64.2k
                   bottom_shift == 0) {
157
          // Get the range of required bottom shift.
158
42.1k
          int min_shift = min_bottom - bottom;
159
42.1k
          int max_shift = max_bottom - bottom;
160
42.1k
          if (debug_x_ht_level >= 2) {
161
0
            tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
162
0
          }
163
          // The range of expected shifts gets a vote equal to the min distance
164
          // of the actual bottom from the expected bottom, spread over the
165
          // range of its acceptance.
166
42.1k
          int misfit_weight = abs(min_shift);
167
42.1k
          if (max_shift > min_shift) {
168
42.1k
            misfit_weight /= max_shift - min_shift;
169
42.1k
          }
170
738k
          for (int y = min_shift; y <= max_shift; ++y) {
171
696k
            shift_stats.add(y, misfit_weight);
172
696k
          }
173
42.4k
        } else {
174
42.4k
          if (bottom_shift == 0) {
175
            // Things with bottoms that are already ok need to say so, on the
176
            // 1st iteration only.
177
13.0k
            shift_stats.add(0, kBlnBaselineOffset);
178
13.0k
          }
179
42.4k
          if (debug_x_ht_level >= 2) {
180
0
            tprintf(" already OK\n");
181
0
          }
182
42.4k
        }
183
126k
      }
184
196k
    }
185
37.0k
    if (shift_stats.get_total() > top_stats.get_total()) {
186
9.41k
      bottom_shift = IntCastRounded(shift_stats.median());
187
9.41k
      if (debug_x_ht_level >= 2) {
188
0
        tprintf("Applying bottom shift=%d\n", bottom_shift);
189
0
      }
190
9.41k
    }
191
37.0k
  } while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total());
192
  // Baseline shift is opposite sign to the bottom shift.
193
28.0k
  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
194
28.0k
  if (debug_x_ht_level >= 2) {
195
0
    tprintf("baseline shift=%g\n", *baseline_shift);
196
0
  }
197
28.0k
  if (top_stats.get_total() == 0) {
198
5.85k
    return bottom_shift != 0 ? word_res->x_height : 0.0f;
199
5.85k
  }
200
  // The new xheight is just the median vote, which is then scaled out
201
  // of BLN space back to pixel space to get the x-height in pixel space.
202
22.1k
  float new_xht = top_stats.median();
203
22.1k
  if (debug_x_ht_level >= 2) {
204
0
    tprintf("Median xht=%f\n", new_xht);
205
0
    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht,
206
0
            new_xht / word_res->denorm.y_scale());
207
0
  }
208
  // The xheight must change by at least x_ht_min_change to be used.
209
22.1k
  if (std::fabs(new_xht - kBlnXHeight) >= x_ht_min_change) {
210
22.1k
    return new_xht / word_res->denorm.y_scale();
211
22.1k
  } else {
212
0
    return bottom_shift != 0 ? word_res->x_height : 0.0f;
213
0
  }
214
22.1k
}
215
216
} // namespace tesseract