/src/tesseract/src/ccmain/fixxht.cpp

Source
/**********************************************************************
 * File:        fixxht.cpp  (Formerly fixxht.c)
 * Description: Improve x_ht and look out for case inconsistencies
 * Author:      Phil Cheatle
 * Created:     Thu Aug  5 14:11:08 BST 1993
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "float2int.h"
#include "params.h"
#include "tesseractclass.h"

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstring>

namespace tesseract {

// Fixxht overview.
// Premise: Initial estimate of x-height is adequate most of the time, but
// occasionally it is incorrect. Most notable causes of failure are:
// 1. Small caps, where the top of the caps is the same as the body text
// xheight. For small caps words the xheight needs to be reduced to correctly
// recognize the caps in the small caps word.
// 2. All xheight lines, such as summer. Here the initial estimate will have
// guessed that the blob tops are caps and will have placed the xheight too low.
// 3. Noise/logos beside words, or changes in font size on a line. Such
// things can blow the statistics and cause an incorrect estimate.
// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
// In this case the x-height is often still correct.
//
// Algorithm.
// Compare the vertical position (top only) of alphanumerics in a word with
// the range of positions in training data (in the unicharset).
// See CountMisfitTops. If any characters disagree sufficiently with the
// initial xheight estimate, then recalculate the xheight, re-run OCR on
// the word, and if the number of vertical misfits goes down, along with
// either the word rating or certainty, then keep the new xheight.
// The new xheight is calculated as follows (see ComputeCompatibleXheight):
// For each alphanumeric character that has a vertically misplaced top
// (a misfit), yet its bottom is within the acceptable range (ie it is not
// likely a sub- or super-script) calculate the range of acceptable xheight
// positions from its range of tops, and give each value in the range a
// number of votes equal to the distance of its top from its acceptance range.
// The x-height position with the median of the votes becomes the new
// x-height. This assumes that most characters will be correctly recognized
// even if the x-height is incorrect. This is not a terrible assumption, but
// it is not great. An improvement would be to use a classifier that does
// not care about vertical position or scaling at all.
// Separately collect stats on shifted baselines and apply the same logic to
// computing a best-fit shift to fix the error. If the baseline needs to be
// shifted, but the x-height is OK, returns the original x-height along with
// the baseline shift to indicate that recognition needs to re-run.

// Upper limit on the spread (max_top - min_top) of a character's top range as
// reported by unicharset.get_top_bottom(). A character whose spread exceeds
// this limit is skipped by both CountMisfitTops and ComputeCompatibleXheight:
// its top can neither be judged a misfit nor be used to suggest a new top.
const int kMaxCharTopRange = 48;

// Counts the blobs of word_res->rebuild_word whose top edge falls outside the
// range of tops held in the unicharset for the class chosen in
// word_res->best_choice, after widening that range by
// x_ht_acceptance_tolerance on both sides.
// Only alphabetic and digit classes are examined, and classes whose top range
// is wider than kMaxCharTopRange are ignored altogether.
// Returns the number of such misfit blobs.
int Tesseract::CountMisfitTops(WERD_RES *word_res) {
  const int blob_total = word_res->rebuild_word->NumBlobs();
  const int tolerance = x_ht_acceptance_tolerance;
  int misfit_count = 0;
  for (int index = 0; index < blob_total; ++index) {
    TBLOB *blob = word_res->rebuild_word->blobs[index];
    UNICHAR_ID class_id = word_res->best_choice->unichar_id(index);
    // Punctuation and other non-alphanumeric classes are not judged.
    if (!unicharset.get_isalpha(class_id) && !unicharset.get_isdigit(class_id)) {
      continue;
    }
    // Clip the top so that it never reaches INT_FEAT_RANGE.
    const int raw_top = blob->bounding_box().top();
    const int top = raw_top >= INT_FEAT_RANGE ? INT_FEAT_RANGE - 1 : raw_top;
    int min_bottom, max_bottom, min_top, max_top;
    unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
    // A class with a very wide top range says nothing useful about misfits.
    if (max_top - min_top > kMaxCharTopRange) {
      continue;
    }
    const bool fits = top >= min_top - tolerance && top <= max_top + tolerance;
    if (!fits) {
      ++misfit_count;
    }
    if (debug_x_ht_level >= 1) {
      tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
              unicharset.id_to_unichar(class_id), fits ? "OK" : "Misfit", top, min_top, max_top,
              tolerance);
    }
  }
  return misfit_count;
}

// Votes for an x-height and, if needed, a baseline shift that best agree with
// the classes chosen in word_res->best_choice. See the overview comment at the
// top of the file for the overall algorithm.
// Every alphabetic or digit blob is compared with the bottom/top ranges held
// in the unicharset for its class:
//  - if its bottom is acceptable, its class tops are sensibly placed and its
//    top is a misfit, it votes for the range of x-heights that would make the
//    top fit, weighted by how far the top lies outside the accepted range;
//  - if its bottom is out of range, it votes (first pass only) for the range
//    of shifts that would bring the bottom back into range;
//  - otherwise it votes (first pass only) for a shift of zero.
// When the shift votes outweigh the x-height votes and the median shift is
// non-zero, the blobs are examined once more with that shift applied.
// Writes to *baseline_shift the negated bottom shift divided by
// word_res->denorm.y_scale().
// Returns the median x-height vote divided by word_res->denorm.y_scale() when
// x-height votes were cast and the median differs from kBlnXHeight by at least
// x_ht_min_change. Otherwise returns word_res->x_height if a non-zero shift
// was chosen, and 0.0f if not.
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift) {
  STATS top_stats(0, UINT8_MAX - 1);
  STATS shift_stats(-UINT8_MAX, UINT8_MAX - 1);
  const int blob_total = word_res->rebuild_word->NumBlobs();
  const int tolerance = x_ht_acceptance_tolerance;
  const bool verbose = debug_x_ht_level >= 2;
  int bottom_shift = 0;
  while (true) {
    top_stats.clear();
    shift_stats.clear();
    // Shift votes are only collected while no shift has been applied yet.
    const bool first_pass = bottom_shift == 0;
    for (int index = 0; index < blob_total; ++index) {
      TBLOB *blob = word_res->rebuild_word->blobs[index];
      UNICHAR_ID class_id = word_res->best_choice->unichar_id(index);
      if (!unicharset.get_isalpha(class_id) && !unicharset.get_isdigit(class_id)) {
        continue;
      }
      // Clip the shifted top to the limit of normalized feature space.
      const int shifted_top = blob->bounding_box().top() + bottom_shift;
      const int top = shifted_top >= INT_FEAT_RANGE ? INT_FEAT_RANGE - 1 : shifted_top;
      const int bottom = blob->bounding_box().bottom() + bottom_shift;
      int min_bottom, max_bottom, min_top, max_top;
      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
      // Chars with a wild top range would mess up the result so ignore them.
      if (max_top - min_top > kMaxCharTopRange) {
        continue;
      }
      // Positive when the top lies outside the tolerated range of tops.
      const int below_by = (min_top - tolerance) - top;
      const int above_by = top - (max_top + tolerance);
      const int misfit_dist = std::max(below_by, above_by);
      const int height = top - kBlnBaselineOffset;
      if (verbose) {
        tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
                unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top,
                max_top, bottom, top);
      }
      const bool bottom_fits =
          min_bottom <= bottom + tolerance && bottom - tolerance <= max_bottom;
      // The class's range of tops must sit sensibly near the x-height.
      const bool tops_usable =
          min_top > kBlnBaselineOffset && max_top - kBlnBaselineOffset >= kBlnXHeight;
      if (bottom_fits && tops_usable && misfit_dist > 0) {
        // Scale the x-height in proportion to actual height / expected height,
        // once for each end of the expected range of tops.
        const int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset);
        const int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset);
        if (verbose) {
          tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
        }
        // Every x-height in the range gets misfit_dist votes.
        for (int xht = min_xht; xht <= max_xht; ++xht) {
          top_stats.add(xht, misfit_dist);
        }
      } else if (!bottom_fits && first_pass) {
        // Range of shifts that would move the bottom into its expected range.
        const int min_shift = min_bottom - bottom;
        const int max_shift = max_bottom - bottom;
        if (verbose) {
          tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
        }
        // The vote is abs(min_shift), spread over the width of the range when
        // that width is positive.
        const int shift_span = max_shift - min_shift;
        const int misfit_weight = shift_span > 0 ? abs(min_shift) / shift_span : abs(min_shift);
        for (int shift = min_shift; shift <= max_shift; ++shift) {
          shift_stats.add(shift, misfit_weight);
        }
      } else {
        // Blobs that land here vote for "no shift", on the 1st pass only.
        if (first_pass) {
          shift_stats.add(0, kBlnBaselineOffset);
        }
        if (verbose) {
          tprintf(" already OK\n");
        }
      }
    }
    const auto shift_total = shift_stats.get_total();
    const auto top_total = top_stats.get_total();
    if (shift_total > top_total) {
      bottom_shift = IntCastRounded(shift_stats.median());
      if (verbose) {
        tprintf("Applying bottom shift=%d\n", bottom_shift);
      }
    }
    // Go round again only if a non-zero shift won the vote.
    if (bottom_shift == 0 || top_total >= shift_total) {
      break;
    }
  }
  // Baseline shift is opposite sign to the bottom shift.
  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
  if (verbose) {
    tprintf("baseline shift=%g\n", *baseline_shift);
  }
  // Result when no new x-height is proposed: the current x-height if a shift
  // was chosen, else 0.
  const float unchanged_xht = bottom_shift != 0 ? word_res->x_height : 0.0f;
  if (top_stats.get_total() == 0) {
    return unchanged_xht;
  }
  // The new xheight is just the median vote, which is then scaled out
  // of BLN space back to pixel space to get the x-height in pixel space.
  const float new_xht = top_stats.median();
  if (verbose) {
    tprintf("Median xht=%f\n", new_xht);
    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht,
            new_xht / word_res->denorm.y_scale());
  }
  // The xheight must change by at least x_ht_min_change to be used.
  if (std::fabs(new_xht - kBlnXHeight) >= x_ht_min_change) {
    return new_xht / word_res->denorm.y_scale();
  }
  return unchanged_xht;
}

} // namespace tesseract

Coverage Report

Created: 2025-09-27 07:10

Line	Count	Source
1		/**********************************************************************
2		* File: fixxht.cpp (Formerly fixxht.c)
3		* Description: Improve x_ht and look out for case inconsistencies
4		* Author: Phil Cheatle
5		* Created: Thu Aug 5 14:11:08 BST 1993
6		*
7		* (C) Copyright 1992, Hewlett-Packard Ltd.
8		** Licensed under the Apache License, Version 2.0 (the "License");
9		** you may not use this file except in compliance with the License.
10		** You may obtain a copy of the License at
11		** http://www.apache.org/licenses/LICENSE-2.0
12		** Unless required by applicable law or agreed to in writing, software
13		** distributed under the License is distributed on an "AS IS" BASIS,
14		** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15		** See the License for the specific language governing permissions and
16		** limitations under the License.
17		*
18		**********************************************************************/
19
20		#include "float2int.h"
21		#include "params.h"
22		#include "tesseractclass.h"
23
24		#include <algorithm>
25		#include <cctype>
26		#include <cmath>
27		#include <cstring>
28
29		namespace tesseract {
30
31		// Fixxht overview.
32		// Premise: Initial estimate of x-height is adequate most of the time, but
33		// occasionally it is incorrect. Most notable causes of failure are:
34		// 1. Small caps, where the top of the caps is the same as the body text
35		// xheight. For small caps words the xheight needs to be reduced to correctly
36		// recognize the caps in the small caps word.
37		// 2. All xheight lines, such as summer. Here the initial estimate will have
38		// guessed that the blob tops are caps and will have placed the xheight too low.
39		// 3. Noise/logos beside words, or changes in font size on a line. Such
40		// things can blow the statistics and cause an incorrect estimate.
41		// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
42		// In this case the x-height is often still correct.
43		//
44		// Algorithm.
45		// Compare the vertical position (top only) of alphnumerics in a word with
46		// the range of positions in training data (in the unicharset).
47		// See CountMisfitTops. If any characters disagree sufficiently with the
48		// initial xheight estimate, then recalculate the xheight, re-run OCR on
49		// the word, and if the number of vertical misfits goes down, along with
50		// either the word rating or certainty, then keep the new xheight.
51		// The new xheight is calculated as follows:ComputeCompatibleXHeight
52		// For each alphanumeric character that has a vertically misplaced top
53		// (a misfit), yet its bottom is within the acceptable range (ie it is not
54		// likely a sub-or super-script) calculate the range of acceptable xheight
55		// positions from its range of tops, and give each value in the range a
56		// number of votes equal to the distance of its top from its acceptance range.
57		// The x-height position with the median of the votes becomes the new
58		// x-height. This assumes that most characters will be correctly recognized
59		// even if the x-height is incorrect. This is not a terrible assumption, but
60		// it is not great. An improvement would be to use a classifier that does
61		// not care about vertical position or scaling at all.
62		// Separately collect stats on shifted baselines and apply the same logic to
63		// computing a best-fit shift to fix the error. If the baseline needs to be
64		// shifted, but the x-height is OK, returns the original x-height along with
65		// the baseline shift to indicate that recognition needs to re-run.
66
67		// If the max-min top of a unicharset char is bigger than kMaxCharTopRange
68		// then the char top cannot be used to judge misfits or suggest a new top.
69		const int kMaxCharTopRange = 48;
70
71		// Returns the number of misfit blob tops in this word.
72	72.3k	int Tesseract::CountMisfitTops(WERD_RES *word_res) {
73	72.3k	int bad_blobs = 0;
74	72.3k	int num_blobs = word_res->rebuild_word->NumBlobs();
75	391k	for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
76	318k	TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
77	318k	UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
78	318k	if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
79	196k	int top = blob->bounding_box().top();
80	196k	if (top >= INT_FEAT_RANGE) {
81	55.6k	top = INT_FEAT_RANGE - 1;
82	55.6k	}
83	196k	int min_bottom, max_bottom, min_top, max_top;
84	196k	unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
85	196k	if (max_top - min_top > kMaxCharTopRange) {
86	4.39k	continue;
87	4.39k	}
88	191k	bool bad =
89	191k	top < min_top - x_ht_acceptance_tolerance || top > max_top + x_ht_acceptance_tolerance;
90	191k	if (bad) {
91	88.0k	++bad_blobs;
92	88.0k	}
93	191k	if (debug_x_ht_level >= 1) {
94	0	tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
95	0	unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top,
96	0	static_cast<int>(x_ht_acceptance_tolerance));
97	0	}
98	191k	}
99	318k	}
100	72.3k	return bad_blobs;
101	72.3k	}
102
103		// Returns a new x-height maximally compatible with the result in word_res.
104		// See comment above for overall algorithm.
105	28.0k	float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift) {
106	28.0k	STATS top_stats(0, UINT8_MAX - 1);
107	28.0k	STATS shift_stats(-UINT8_MAX, UINT8_MAX - 1);
108	28.0k	int bottom_shift = 0;
109	28.0k	int num_blobs = word_res->rebuild_word->NumBlobs();
110	37.0k	do {
111	37.0k	top_stats.clear();
112	37.0k	shift_stats.clear();
113	233k	for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
114	196k	TBLOB *blob = word_res->rebuild_word->blobs[blob_id];
115	196k	UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
116	196k	if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
117	127k	int top = blob->bounding_box().top() + bottom_shift;
118		// Clip the top to the limit of normalized feature space.
119	127k	if (top >= INT_FEAT_RANGE) {
120	32.0k	top = INT_FEAT_RANGE - 1;
121	32.0k	}
122	127k	int bottom = blob->bounding_box().bottom() + bottom_shift;
123	127k	int min_bottom, max_bottom, min_top, max_top;
124	127k	unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top);
125		// Chars with a wild top range would mess up the result so ignore them.
126	127k	if (max_top - min_top > kMaxCharTopRange) {
127	1.03k	continue;
128	1.03k	}
129	126k	int misfit_dist = std::max((min_top - x_ht_acceptance_tolerance) - top,
130	126k	top - (max_top + x_ht_acceptance_tolerance));
131	126k	int height = top - kBlnBaselineOffset;
132	126k	if (debug_x_ht_level >= 2) {
133	0	tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
134	0	unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top,
135	0	max_top, bottom, top);
136	0	}
137		// Use only chars that fit in the expected bottom range, and where
138		// the range of tops is sensibly near the xheight.
139	126k	if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
140	89.5k	bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset &&
141	61.7k	max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) {
142		// Compute the x-height position using proportionality between the
143		// actual height and expected height.
144	41.4k	int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset);
145	41.4k	int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset);
146	41.4k	if (debug_x_ht_level >= 2) {
147	0	tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
148	0	}
149		// The range of expected heights gets a vote equal to the distance
150		// of the actual top from the expected top.
151	739k	for (int y = min_xht; y <= max_xht; ++y) {
152	697k	top_stats.add(y, misfit_dist);
153	697k	}
154	84.6k	} else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
155	48.1k	bottom - x_ht_acceptance_tolerance > max_bottom) &&
156	64.2k	bottom_shift == 0) {
157		// Get the range of required bottom shift.
158	42.1k	int min_shift = min_bottom - bottom;
159	42.1k	int max_shift = max_bottom - bottom;
160	42.1k	if (debug_x_ht_level >= 2) {
161	0	tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
162	0	}
163		// The range of expected shifts gets a vote equal to the min distance
164		// of the actual bottom from the expected bottom, spread over the
165		// range of its acceptance.
166	42.1k	int misfit_weight = abs(min_shift);
167	42.1k	if (max_shift > min_shift) {
168	42.1k	misfit_weight /= max_shift - min_shift;
169	42.1k	}
170	738k	for (int y = min_shift; y <= max_shift; ++y) {
171	696k	shift_stats.add(y, misfit_weight);
172	696k	}
173	42.4k	} else {
174	42.4k	if (bottom_shift == 0) {
175		// Things with bottoms that are already ok need to say so, on the
176		// 1st iteration only.
177	13.0k	shift_stats.add(0, kBlnBaselineOffset);
178	13.0k	}
179	42.4k	if (debug_x_ht_level >= 2) {
180	0	tprintf(" already OK\n");
181	0	}
182	42.4k	}
183	126k	}
184	196k	}
185	37.0k	if (shift_stats.get_total() > top_stats.get_total()) {
186	9.41k	bottom_shift = IntCastRounded(shift_stats.median());
187	9.41k	if (debug_x_ht_level >= 2) {
188	0	tprintf("Applying bottom shift=%d\n", bottom_shift);
189	0	}
190	9.41k	}
191	37.0k	} while (bottom_shift != 0 && top_stats.get_total() < shift_stats.get_total());
192		// Baseline shift is opposite sign to the bottom shift.
193	28.0k	*baseline_shift = -bottom_shift / word_res->denorm.y_scale();
194	28.0k	if (debug_x_ht_level >= 2) {
195	0	tprintf("baseline shift=%g\n", *baseline_shift);
196	0	}
197	28.0k	if (top_stats.get_total() == 0) {
198	5.85k	return bottom_shift != 0 ? word_res->x_height : 0.0f;
199	5.85k	}
200		// The new xheight is just the median vote, which is then scaled out
201		// of BLN space back to pixel space to get the x-height in pixel space.
202	22.1k	float new_xht = top_stats.median();
203	22.1k	if (debug_x_ht_level >= 2) {
204	0	tprintf("Median xht=%f\n", new_xht);
205	0	tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht,
206	0	new_xht / word_res->denorm.y_scale());
207	0	}
208		// The xheight must change by at least x_ht_min_change to be used.
209	22.1k	if (std::fabs(new_xht - kBlnXHeight) >= x_ht_min_change) {
210	22.1k	return new_xht / word_res->denorm.y_scale();
211	22.1k	} else {
212	0	return bottom_shift != 0 ? word_res->x_height : 0.0f;
213	0	}
214	22.1k	}
215
216		} // namespace tesseract