Coverage Report

Created: 2025-09-27 07:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/textord/gap_map.cpp
Line
Count
Source
1
// Licensed under the Apache License, Version 2.0 (the "License");
2
// you may not use this file except in compliance with the License.
3
// You may obtain a copy of the License at
4
// http://www.apache.org/licenses/LICENSE-2.0
5
// Unless required by applicable law or agreed to in writing, software
6
// distributed under the License is distributed on an "AS IS" BASIS,
7
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8
// See the License for the specific language governing permissions and
9
// limitations under the License.
10
11
#include "gap_map.h"
12
13
#include "statistc.h"
14
15
namespace tesseract {
16
17
BOOL_VAR(gapmap_debug, false, "Say which blocks have tables");
18
BOOL_VAR(gapmap_use_ends, false, "Use large space at start and end of rows");
19
BOOL_VAR(gapmap_no_isolated_quanta, false, "Ensure gaps not less than 2quanta wide");
20
double_VAR(gapmap_big_gaps, 1.75, "xht multiplier");
21
22
/*************************************************************************
23
 * A block gap map is a quantised histogram of whitespace regions in the
24
 * block. It is a vertical projection of wide gaps WITHIN lines
25
 *
26
 * The map is held as an array of counts of rows which have a wide gap
27
 * covering that region of the row. Each bucket in the map represents a width
28
 * of about half an xheight - (The median of the xhts in the rows is used.)
29
 *
30
 * The block is considered RECTANGULAR - delimited by the left and right
31
 * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
32
 * counted.
33
 *
34
 *************************************************************************/
35
36
GAPMAP::GAPMAP(     // Constructor
37
    TO_BLOCK *block // block
38
16.1k
) {
39
16.1k
  TO_ROW *row;         // current row
40
16.1k
  BLOBNBOX_IT blob_it; // iterator
41
16.1k
  TBOX blob_box;
42
16.1k
  TBOX prev_blob_box;
43
16.1k
  int16_t gap_width;
44
16.1k
  int16_t start_of_row;
45
16.1k
  int16_t end_of_row;
46
16.1k
  STATS xht_stats(0, 127);
47
16.1k
  int16_t min_quantum;
48
16.1k
  int16_t max_quantum;
49
16.1k
  int16_t i;
50
51
  /*
52
  Find left and right extremes and bucket size
53
*/
54
16.1k
  map = nullptr;
55
16.1k
  min_left = INT16_MAX;
56
16.1k
  max_right = -INT16_MAX;
57
16.1k
  total_rows = 0;
58
16.1k
  any_tabs = false;
59
60
  // row iterator
61
16.1k
  TO_ROW_IT row_it(block->get_rows());
62
193k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
63
177k
    row = row_it.data();
64
177k
    if (!row->blob_list()->empty()) {
65
177k
      total_rows++;
66
177k
      xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1);
67
177k
      blob_it.set_to_list(row->blob_list());
68
177k
      start_of_row = blob_it.data()->bounding_box().left();
69
177k
      end_of_row = blob_it.data_relative(-1)->bounding_box().right();
70
177k
      if (min_left > start_of_row) {
71
21.5k
        min_left = start_of_row;
72
21.5k
      }
73
177k
      if (max_right < end_of_row) {
74
30.8k
        max_right = end_of_row;
75
30.8k
      }
76
177k
    }
77
177k
  }
78
16.1k
  if ((total_rows < 3) || (min_left >= max_right)) {
79
3.13k
    bucket_size = 0;
80
3.13k
    map_max = 0;
81
3.13k
    total_rows = 0;
82
3.13k
    min_left = max_right = 0;
83
3.13k
    return;
84
3.13k
  }
85
13.0k
  bucket_size = static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2;
86
13.0k
  map_max = (max_right - min_left) / bucket_size;
87
13.0k
  map = new int16_t[map_max + 1];
88
118k
  for (i = 0; i <= map_max; i++) {
89
105k
    map[i] = 0;
90
105k
  }
91
92
186k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
93
173k
    row = row_it.data();
94
173k
    if (!row->blob_list()->empty()) {
95
173k
      blob_it.set_to_list(row->blob_list());
96
173k
      blob_it.mark_cycle_pt();
97
173k
      blob_box = box_next(&blob_it);
98
173k
      prev_blob_box = blob_box;
99
173k
      if (gapmap_use_ends) {
100
        /* Leading space */
101
0
        gap_width = blob_box.left() - min_left;
102
0
        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
103
0
          max_quantum = (blob_box.left() - min_left) / bucket_size;
104
0
          if (max_quantum > map_max) {
105
0
            max_quantum = map_max;
106
0
          }
107
0
          for (i = 0; i <= max_quantum; i++) {
108
0
            map[i]++;
109
0
          }
110
0
        }
111
0
      }
112
1.28M
      while (!blob_it.cycled_list()) {
113
1.11M
        blob_box = box_next(&blob_it);
114
1.11M
        gap_width = blob_box.left() - prev_blob_box.right();
115
1.11M
        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
116
15.4k
          min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
117
15.4k
          max_quantum = (blob_box.left() - min_left) / bucket_size;
118
15.4k
          if (max_quantum > map_max) {
119
0
            max_quantum = map_max;
120
0
          }
121
145k
          for (i = min_quantum; i <= max_quantum; i++) {
122
129k
            map[i]++;
123
129k
          }
124
15.4k
        }
125
1.11M
        prev_blob_box = blob_box;
126
1.11M
      }
127
173k
      if (gapmap_use_ends) {
128
        /* Trailing space */
129
0
        gap_width = max_right - prev_blob_box.right();
130
0
        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
131
0
          min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
132
0
          if (min_quantum < 0) {
133
0
            min_quantum = 0;
134
0
          }
135
0
          for (i = min_quantum; i <= map_max; i++) {
136
0
            map[i]++;
137
0
          }
138
0
        }
139
0
      }
140
173k
    }
141
173k
  }
142
118k
  for (i = 0; i <= map_max; i++) {
143
105k
    if (map[i] > total_rows / 2) {
144
3.39k
      if (gapmap_no_isolated_quanta &&
145
0
          (((i == 0) && (map[i + 1] <= total_rows / 2)) ||
146
0
           ((i == map_max) && (map[i - 1] <= total_rows / 2)) ||
147
0
           ((i > 0) && (i < map_max) && (map[i - 1] <= total_rows / 2) &&
148
0
            (map[i + 1] <= total_rows / 2)))) {
149
0
        map[i] = 0; // prevent isolated quantum
150
3.39k
      } else {
151
3.39k
        any_tabs = true;
152
3.39k
      }
153
3.39k
    }
154
105k
  }
155
13.0k
  if (gapmap_debug && any_tabs) {
156
0
    tprintf("Table found\n");
157
0
  }
158
13.0k
}
159
160
/*************************************************************************
161
 * GAPMAP::table_gap()
162
 * Is there a bucket in the specified range where more than half the rows in the
163
 * block have a wide gap?
164
 *************************************************************************/
165
166
bool GAPMAP::table_gap( // Is gap a table?
167
    int16_t left,       // From here
168
    int16_t right       // To here
169
30.9k
) {
170
30.9k
  int16_t min_quantum;
171
30.9k
  int16_t max_quantum;
172
30.9k
  int16_t i;
173
30.9k
  bool tab_found = false;
174
175
30.9k
  if (!any_tabs) {
176
17.2k
    return false;
177
17.2k
  }
178
179
13.7k
  min_quantum = (left - min_left) / bucket_size;
180
13.7k
  max_quantum = (right - min_left) / bucket_size;
181
  // Clip to the bounds of the array. In some circumstances (big blob followed
182
  // by small blob) max_quantum can exceed the map_max bounds, but we clip
183
  // here instead, as it provides better long-term safety.
184
13.7k
  if (min_quantum < 0) {
185
0
    min_quantum = 0;
186
0
  }
187
13.7k
  if (max_quantum > map_max) {
188
0
    max_quantum = map_max;
189
0
  }
190
44.5k
  for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) {
191
30.7k
    if (map[i] > total_rows / 2) {
192
11.2k
      tab_found = true;
193
11.2k
    }
194
30.7k
  }
195
13.7k
  return tab_found;
196
30.9k
}
197
198
} // namespace tesseract