Coverage Report

Created: 2026-04-01 07:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/wordrec/lm_consistency.cpp
Line
Count
Source
1
///////////////////////////////////////////////////////////////////////
2
// File:        lm_consistency.cpp
3
// Description: Struct for recording consistency of the paths representing
4
//              OCR hypotheses.
5
// Author:      Rika Antonova
6
// Created:     Mon Jun 20 11:26:43 PST 2012
7
//
8
// (C) Copyright 2012, Google Inc.
9
// Licensed under the Apache License, Version 2.0 (the "License");
10
// you may not use this file except in compliance with the License.
11
// You may obtain a copy of the License at
12
// http://www.apache.org/licenses/LICENSE-2.0
13
// Unless required by applicable law or agreed to in writing, software
14
// distributed under the License is distributed on an "AS IS" BASIS,
15
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
// See the License for the specific language governing permissions and
17
// limitations under the License.
18
//
19
////////////////////////////////////////////////////////////////////////
20
21
#include "lm_consistency.h"
22
23
#include "associate.h"
24
#include "dict.h"
25
#include "ratngs.h"
26
27
namespace tesseract {
28
29
20.4M
void LMConsistencyInfo::ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc) {
30
20.4M
  if (xht_decision == XH_INCONSISTENT) {
31
8.28M
    return; // It isn't going to get any better.
32
8.28M
  }
33
34
  // Compute xheight consistency.
35
12.1M
  bool parent_null = xht_sp < 0;
36
12.1M
  int parent_sp = xht_sp;
37
  // Debug strings.
38
12.1M
  if (b->yshift() > LMConsistencyInfo::kShiftThresh) {
39
2.35M
    xht_sp = LMConsistencyInfo::kSUP;
40
9.82M
  } else if (b->yshift() < -LMConsistencyInfo::kShiftThresh) {
41
2.50M
    xht_sp = LMConsistencyInfo::kSUB;
42
7.31M
  } else {
43
7.31M
    xht_sp = LMConsistencyInfo::kNORM;
44
7.31M
  }
45
12.1M
  xht_count[xht_sp]++;
46
12.1M
  if (is_punc) {
47
3.54M
    xht_count_punc[xht_sp]++;
48
3.54M
  }
49
12.1M
  if (!parent_null) {
50
10.8M
    xpos_entropy += abs(parent_sp - xht_sp);
51
10.8M
  }
52
  // TODO(eger): Figure out a better way to account for small caps.
53
  // For the first character not y-shifted, we only care if it is too small.
54
  // Too large is common in drop caps and small caps.
55
  // int16_t small_xht = b->min_xheight();
56
  //  if (parent_vse == nullptr && sp == LanguageModelConsistencyInfo::kNORM) {
57
  //  small_xht = 0;
58
  // }
59
12.1M
  IntersectRange(b->min_xheight(), b->max_xheight(), &(xht_lo[xht_sp]), &(xht_hi[xht_sp]));
60
61
  // Compute xheight inconsistency kinds.
62
12.1M
  if (parent_null) {
63
1.37M
    if (xht_count[kNORM] == 1) {
64
908k
      xht_decision = XH_GOOD;
65
908k
    } else {
66
469k
      xht_decision = XH_SUBNORMAL;
67
469k
    }
68
1.37M
    return;
69
1.37M
  }
70
71
  // When we intersect the ranges of xheights in pixels for all characters in
72
  // each position (subscript, normal, superscript),
73
  // How much range must be left?  0? [exactly one pixel height for xheight] 1?
74
  // TODO(eger): Extend this code to take a prior for the rest of the line.
75
10.8M
  const int kMinIntersectedXHeightRange = 0;
76
39.9M
  for (int i = 0; i < kNumPos; i++) {
77
30.6M
    if (xht_lo[i] > xht_hi[i] - kMinIntersectedXHeightRange) {
78
1.42M
      xht_decision = XH_INCONSISTENT;
79
1.42M
      return;
80
1.42M
    }
81
30.6M
  }
82
83
  // Reject as improbable anything where there's much punctuation in subscript
84
  // or superscript regions.
85
9.38M
  if (xht_count_punc[kSUB] > xht_count[kSUB] * 0.4 ||
86
8.48M
      xht_count_punc[kSUP] > xht_count[kSUP] * 0.4) {
87
1.27M
    xht_decision = XH_INCONSISTENT;
88
1.27M
    return;
89
1.27M
  }
90
91
  // Now check that the subscript and superscript aren't too small relative to
92
  // the mainline.
93
8.10M
  auto mainline_xht = static_cast<double>(xht_lo[kNORM]);
94
8.10M
  double kMinSizeRatio = 0.4;
95
8.10M
  if (mainline_xht > 0.0 && (static_cast<double>(xht_hi[kSUB]) / mainline_xht < kMinSizeRatio ||
96
7.37M
                             static_cast<double>(xht_hi[kSUP]) / mainline_xht < kMinSizeRatio)) {
97
1.26k
    xht_decision = XH_INCONSISTENT;
98
1.26k
    return;
99
1.26k
  }
100
  // TODO(eger): Check into inconsistency of super/subscript y offsets.
101
8.10M
  if (xpos_entropy > kMaxEntropy) {
102
1.01M
    xht_decision = XH_INCONSISTENT;
103
1.01M
    return;
104
1.01M
  }
105
7.08M
  if (xht_count[kSUB] == 0 && xht_count[kSUP] == 0) {
106
3.60M
    xht_decision = XH_GOOD;
107
3.60M
    return;
108
3.60M
  }
109
3.48M
  xht_decision = XH_SUBNORMAL;
110
3.48M
}
111
112
} // namespace tesseract