/src/tesseract/src/textord/gap_map.cpp
Line | Count | Source |
1 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
2 | | // you may not use this file except in compliance with the License. |
3 | | // You may obtain a copy of the License at |
4 | | // http://www.apache.org/licenses/LICENSE-2.0 |
5 | | // Unless required by applicable law or agreed to in writing, software |
6 | | // distributed under the License is distributed on an "AS IS" BASIS, |
7 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
8 | | // See the License for the specific language governing permissions and |
9 | | // limitations under the License. |
10 | | |
11 | | #include "gap_map.h" |
12 | | |
13 | | #include "statistc.h" |
14 | | |
15 | | namespace tesseract { |
16 | | |
17 | | BOOL_VAR(gapmap_debug, false, "Say which blocks have tables"); |
18 | | BOOL_VAR(gapmap_use_ends, false, "Use large space at start and end of rows"); |
19 | | BOOL_VAR(gapmap_no_isolated_quanta, false, "Ensure gaps not less than 2quanta wide"); |
20 | | double_VAR(gapmap_big_gaps, 1.75, "xht multiplier"); |
21 | | |
22 | | /************************************************************************* |
23 | | * A block gap map is a quantised histogram of whitespace regions in the |
24 | | * block. It is a vertical projection of wide gaps WITHIN lines |
25 | | * |
26 | | * The map is held as an array of counts of rows which have a wide gap |
27 | | * covering that region of the row. Each bucket in the map represents a width |
28 | | * of about half an xheight - (The median of the xhts in the rows is used.) |
29 | | * |
30 | | * The block is considered RECTANGULAR - delimited by the left and right |
31 | | * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are |
32 | | * counted. |
33 | | * |
34 | | *************************************************************************/ |
35 | | |
36 | | GAPMAP::GAPMAP( // Constructor |
37 | | TO_BLOCK *block // block |
38 | 16.1k | ) { |
39 | 16.1k | TO_ROW *row; // current row |
40 | 16.1k | BLOBNBOX_IT blob_it; // iterator |
41 | 16.1k | TBOX blob_box; |
42 | 16.1k | TBOX prev_blob_box; |
43 | 16.1k | int16_t gap_width; |
44 | 16.1k | int16_t start_of_row; |
45 | 16.1k | int16_t end_of_row; |
46 | 16.1k | STATS xht_stats(0, 127); |
47 | 16.1k | int16_t min_quantum; |
48 | 16.1k | int16_t max_quantum; |
49 | 16.1k | int16_t i; |
50 | | |
51 | | /* |
52 | | Find left and right extremes and bucket size |
53 | | */ |
54 | 16.1k | map = nullptr; |
55 | 16.1k | min_left = INT16_MAX; |
56 | 16.1k | max_right = -INT16_MAX; |
57 | 16.1k | total_rows = 0; |
58 | 16.1k | any_tabs = false; |
59 | | |
60 | | // row iterator |
61 | 16.1k | TO_ROW_IT row_it(block->get_rows()); |
62 | 193k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
63 | 177k | row = row_it.data(); |
64 | 177k | if (!row->blob_list()->empty()) { |
65 | 177k | total_rows++; |
66 | 177k | xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1); |
67 | 177k | blob_it.set_to_list(row->blob_list()); |
68 | 177k | start_of_row = blob_it.data()->bounding_box().left(); |
69 | 177k | end_of_row = blob_it.data_relative(-1)->bounding_box().right(); |
70 | 177k | if (min_left > start_of_row) { |
71 | 21.5k | min_left = start_of_row; |
72 | 21.5k | } |
73 | 177k | if (max_right < end_of_row) { |
74 | 30.8k | max_right = end_of_row; |
75 | 30.8k | } |
76 | 177k | } |
77 | 177k | } |
78 | 16.1k | if ((total_rows < 3) || (min_left >= max_right)) { |
79 | 3.13k | bucket_size = 0; |
80 | 3.13k | map_max = 0; |
81 | 3.13k | total_rows = 0; |
82 | 3.13k | min_left = max_right = 0; |
83 | 3.13k | return; |
84 | 3.13k | } |
85 | 13.0k | bucket_size = static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2; |
86 | 13.0k | map_max = (max_right - min_left) / bucket_size; |
87 | 13.0k | map = new int16_t[map_max + 1]; |
88 | 118k | for (i = 0; i <= map_max; i++) { |
89 | 105k | map[i] = 0; |
90 | 105k | } |
91 | | |
92 | 186k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
93 | 173k | row = row_it.data(); |
94 | 173k | if (!row->blob_list()->empty()) { |
95 | 173k | blob_it.set_to_list(row->blob_list()); |
96 | 173k | blob_it.mark_cycle_pt(); |
97 | 173k | blob_box = box_next(&blob_it); |
98 | 173k | prev_blob_box = blob_box; |
99 | 173k | if (gapmap_use_ends) { |
100 | | /* Leading space */ |
101 | 0 | gap_width = blob_box.left() - min_left; |
102 | 0 | if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { |
103 | 0 | max_quantum = (blob_box.left() - min_left) / bucket_size; |
104 | 0 | if (max_quantum > map_max) { |
105 | 0 | max_quantum = map_max; |
106 | 0 | } |
107 | 0 | for (i = 0; i <= max_quantum; i++) { |
108 | 0 | map[i]++; |
109 | 0 | } |
110 | 0 | } |
111 | 0 | } |
112 | 1.28M | while (!blob_it.cycled_list()) { |
113 | 1.11M | blob_box = box_next(&blob_it); |
114 | 1.11M | gap_width = blob_box.left() - prev_blob_box.right(); |
115 | 1.11M | if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { |
116 | 15.4k | min_quantum = (prev_blob_box.right() - min_left) / bucket_size; |
117 | 15.4k | max_quantum = (blob_box.left() - min_left) / bucket_size; |
118 | 15.4k | if (max_quantum > map_max) { |
119 | 0 | max_quantum = map_max; |
120 | 0 | } |
121 | 145k | for (i = min_quantum; i <= max_quantum; i++) { |
122 | 129k | map[i]++; |
123 | 129k | } |
124 | 15.4k | } |
125 | 1.11M | prev_blob_box = blob_box; |
126 | 1.11M | } |
127 | 173k | if (gapmap_use_ends) { |
128 | | /* Trailing space */ |
129 | 0 | gap_width = max_right - prev_blob_box.right(); |
130 | 0 | if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { |
131 | 0 | min_quantum = (prev_blob_box.right() - min_left) / bucket_size; |
132 | 0 | if (min_quantum < 0) { |
133 | 0 | min_quantum = 0; |
134 | 0 | } |
135 | 0 | for (i = min_quantum; i <= map_max; i++) { |
136 | 0 | map[i]++; |
137 | 0 | } |
138 | 0 | } |
139 | 0 | } |
140 | 173k | } |
141 | 173k | } |
142 | 118k | for (i = 0; i <= map_max; i++) { |
143 | 105k | if (map[i] > total_rows / 2) { |
144 | 3.39k | if (gapmap_no_isolated_quanta && |
145 | 0 | (((i == 0) && (map[i + 1] <= total_rows / 2)) || |
146 | 0 | ((i == map_max) && (map[i - 1] <= total_rows / 2)) || |
147 | 0 | ((i > 0) && (i < map_max) && (map[i - 1] <= total_rows / 2) && |
148 | 0 | (map[i + 1] <= total_rows / 2)))) { |
149 | 0 | map[i] = 0; // prevent isolated quantum |
150 | 3.39k | } else { |
151 | 3.39k | any_tabs = true; |
152 | 3.39k | } |
153 | 3.39k | } |
154 | 105k | } |
155 | 13.0k | if (gapmap_debug && any_tabs) { |
156 | 0 | tprintf("Table found\n"); |
157 | 0 | } |
158 | 13.0k | } |
159 | | |
160 | | /************************************************************************* |
161 | | * GAPMAP::table_gap() |
162 | | * Is there a bucket in the specified range where more than half the rows in the |
163 | | * block have a wide gap? |
164 | | *************************************************************************/ |
165 | | |
166 | | bool GAPMAP::table_gap( // Is gap a table? |
167 | | int16_t left, // From here |
168 | | int16_t right // To here |
169 | 30.9k | ) { |
170 | 30.9k | int16_t min_quantum; |
171 | 30.9k | int16_t max_quantum; |
172 | 30.9k | int16_t i; |
173 | 30.9k | bool tab_found = false; |
174 | | |
175 | 30.9k | if (!any_tabs) { |
176 | 17.2k | return false; |
177 | 17.2k | } |
178 | | |
179 | 13.7k | min_quantum = (left - min_left) / bucket_size; |
180 | 13.7k | max_quantum = (right - min_left) / bucket_size; |
181 | | // Clip to the bounds of the array. In some circumstances (big blob followed |
182 | | // by small blob) max_quantum can exceed the map_max bounds, but we clip |
183 | | // here instead, as it provides better long-term safety. |
184 | 13.7k | if (min_quantum < 0) { |
185 | 0 | min_quantum = 0; |
186 | 0 | } |
187 | 13.7k | if (max_quantum > map_max) { |
188 | 0 | max_quantum = map_max; |
189 | 0 | } |
190 | 44.5k | for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) { |
191 | 30.7k | if (map[i] > total_rows / 2) { |
192 | 11.2k | tab_found = true; |
193 | 11.2k | } |
194 | 30.7k | } |
195 | 13.7k | return tab_found; |
196 | 30.9k | } |
197 | | |
198 | | } // namespace tesseract |