/src/tesseract/src/textord/edgblob.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: edgblob.cpp (Formerly edgeloop.c) |
3 | | * Description: Functions to clean up an outline before approximation. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | *(C) Copyright 1991, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0(the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | // Include automatically generated configuration file if running autoconf. |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" |
22 | | #endif |
23 | | |
24 | | #include "edgblob.h" |
25 | | |
26 | | #include "edgloop.h" |
27 | | #include "scanedg.h" |
28 | | |
29 | 14.0M | #define BUCKETSIZE 16 |
30 | | |
31 | | namespace tesseract { |
32 | | |
33 | | // Control parameters used in outline_complexity(), which rejects an outline |
34 | | // if any one of the 3 conditions is satisfied: |
35 | | // - number of children exceeds edges_max_children_per_outline |
36 | | // - number of nested layers exceeds edges_max_children_layers |
37 | | // - joint complexity exceeds edges_children_count_limit(as in child_count()) |
38 | | static BOOL_VAR(edges_use_new_outline_complexity, false, |
39 | | "Use the new outline complexity module"); |
40 | | static INT_VAR(edges_max_children_per_outline, 10, |
41 | | "Max number of children inside a character outline"); |
42 | | static INT_VAR(edges_max_children_layers, 5, |
43 | | "Max layers of nested children inside a character outline"); |
44 | | static BOOL_VAR(edges_debug, false, "turn on debugging for this module"); |
45 | | |
46 | | static INT_VAR(edges_children_per_grandchild, 10, |
47 | | "Importance ratio for chucking outlines"); |
48 | | static INT_VAR(edges_children_count_limit, 45, "Max holes allowed in blob"); |
49 | | static BOOL_VAR(edges_children_fix, false, |
50 | | "Remove boxy parents of char-like children"); |
51 | | static INT_VAR(edges_min_nonhole, 12, "Min pixels for potential char in box"); |
52 | | static INT_VAR(edges_patharea_ratio, 40, |
53 | | "Max lensq/area for acceptable child outline"); |
54 | | static double_VAR(edges_childarea, 0.5, "Min area fraction of child outline"); |
55 | | static double_VAR(edges_boxarea, 0.875, |
56 | | "Min area fraction of grandchild for box"); |
57 | | |
58 | | /** |
59 | | * @name OL_BUCKETS::OL_BUCKETS |
60 | | * |
61 | | * Construct an array of buckets for associating outlines into blobs. |
62 | | */ |
63 | | |
64 | | OL_BUCKETS::OL_BUCKETS(ICOORD bleft, // corners |
65 | | ICOORD tright) |
66 | 7.72k | : bxdim((tright.x() - bleft.x()) / BUCKETSIZE + 1), |
67 | 7.72k | bydim((tright.y() - bleft.y()) / BUCKETSIZE + 1), |
68 | 7.72k | buckets(bxdim * bydim), |
69 | 7.72k | bl(bleft), |
70 | 7.72k | tr(tright) {} |
71 | | |
72 | | /** |
73 | | * @name OL_BUCKETS::operator( |
74 | | * |
75 | | * Return a pointer to a list of C_OUTLINEs corresponding to the |
76 | | * given pixel coordinates. |
77 | | */ |
78 | | |
79 | | C_OUTLINE_LIST *OL_BUCKETS::operator()( // array access |
80 | | TDimension x, // image coords |
81 | 2.25M | TDimension y) { |
82 | 2.25M | return &buckets[(y - bl.y()) / BUCKETSIZE * bxdim + |
83 | 2.25M | (x - bl.x()) / BUCKETSIZE]; |
84 | 2.25M | } |
85 | | |
86 | 7.72k | C_OUTLINE_LIST *OL_BUCKETS::start_scan() { |
87 | 7.72k | return scan_next(buckets.begin()); |
88 | 7.72k | } |
89 | | |
90 | 2.07M | C_OUTLINE_LIST *OL_BUCKETS::scan_next() { |
91 | 2.07M | return scan_next(it); |
92 | 2.07M | } |
93 | | |
94 | 2.08M | C_OUTLINE_LIST *OL_BUCKETS::scan_next(decltype(buckets)::iterator in_it) { |
95 | 6.40M | it = std::find_if(in_it, buckets.end(), [](auto &&b) { return !b.empty(); }); |
96 | 2.08M | if (it == buckets.end()) |
97 | 7.72k | return nullptr; |
98 | 2.07M | return &*it; |
99 | 2.08M | } |
100 | | |
101 | | /** |
102 | | * @name OL_BUCKETS::outline_complexity |
103 | | * |
104 | | * This is the new version of count_child. |
105 | | * |
106 | | * The goal of this function is to determine if an outline and its |
107 | | * interiors could be part of a character blob. This is done by |
108 | | * computing a "complexity" index for the outline, which is the return |
109 | | * value of this function, and checking it against a threshold. |
110 | | * The max_count is used for short-circuiting the recursion and forcing |
111 | | * a rejection that guarantees to fail the threshold test. |
112 | | * The complexity F for outline X with N children X[i] is |
113 | | * F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild |
114 | | * so each layer of nesting increases complexity exponentially. |
115 | | * An outline can be rejected as a text blob candidate if its complexity |
116 | | * is too high, has too many children(likely a container), or has too |
117 | | * many layers of nested inner loops. This has the side-effect of |
118 | | * flattening out boxed or reversed video text regions. |
119 | | */ |
120 | | |
121 | | int32_t OL_BUCKETS::outline_complexity(C_OUTLINE *outline, // parent outline |
122 | | int32_t max_count, // max output |
123 | | int16_t depth // recursion depth |
124 | 0 | ) { |
125 | 0 | TDimension xmin, xmax; // coord limits |
126 | 0 | TDimension ymin, ymax; |
127 | 0 | C_OUTLINE *child; // current child |
128 | 0 | int32_t child_count; // no of children |
129 | 0 | int32_t grandchild_count; // no of grandchildren |
130 | 0 | C_OUTLINE_IT child_it; // search iterator |
131 | |
|
132 | 0 | TBOX olbox = outline->bounding_box(); |
133 | 0 | xmin = (olbox.left() - bl.x()) / BUCKETSIZE; |
134 | 0 | xmax = (olbox.right() - bl.x()) / BUCKETSIZE; |
135 | 0 | ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE; |
136 | 0 | ymax = (olbox.top() - bl.y()) / BUCKETSIZE; |
137 | 0 | child_count = 0; |
138 | 0 | grandchild_count = 0; |
139 | 0 | if (++depth > edges_max_children_layers) { // nested loops are too deep |
140 | 0 | return max_count + depth; |
141 | 0 | } |
142 | | |
143 | 0 | for (auto yindex = ymin; yindex <= ymax; yindex++) { |
144 | 0 | for (auto xindex = xmin; xindex <= xmax; xindex++) { |
145 | 0 | child_it.set_to_list(&buckets[yindex * bxdim + xindex]); |
146 | 0 | if (child_it.empty()) { |
147 | 0 | continue; |
148 | 0 | } |
149 | 0 | for (child_it.mark_cycle_pt(); !child_it.cycled_list(); |
150 | 0 | child_it.forward()) { |
151 | 0 | child = child_it.data(); |
152 | 0 | if (child == outline || !(*child < *outline)) { |
153 | 0 | continue; |
154 | 0 | } |
155 | 0 | child_count++; |
156 | |
|
157 | 0 | if (child_count > edges_max_children_per_outline) { // too fragmented |
158 | 0 | if (edges_debug) { |
159 | 0 | tprintf( |
160 | 0 | "Discard outline on child_count=%d > " |
161 | 0 | "max_children_per_outline=%d\n", |
162 | 0 | child_count, |
163 | 0 | static_cast<int32_t>(edges_max_children_per_outline)); |
164 | 0 | } |
165 | 0 | return max_count + child_count; |
166 | 0 | } |
167 | | |
168 | | // Compute the "complexity" of each child recursively |
169 | 0 | int32_t remaining_count = max_count - child_count - grandchild_count; |
170 | 0 | if (remaining_count > 0) { |
171 | 0 | grandchild_count += edges_children_per_grandchild * |
172 | 0 | outline_complexity(child, remaining_count, depth); |
173 | 0 | } |
174 | 0 | if (child_count + grandchild_count > max_count) { // too complex |
175 | 0 | if (edges_debug) { |
176 | 0 | tprintf( |
177 | 0 | "Discard outline on child_count=%d + grandchild_count=%d " |
178 | 0 | "> max_count=%d\n", |
179 | 0 | child_count, grandchild_count, max_count); |
180 | 0 | } |
181 | 0 | return child_count + grandchild_count; |
182 | 0 | } |
183 | 0 | } |
184 | 0 | } |
185 | 0 | } |
186 | 0 | return child_count + grandchild_count; |
187 | 0 | } |
188 | | |
189 | | /** |
190 | | * @name OL_BUCKETS::count_children |
191 | | * |
192 | | * Find number of descendants of this outline. |
193 | | */ |
194 | | // TODO(rays) Merge with outline_complexity. |
195 | | int32_t OL_BUCKETS::count_children( // recursive count |
196 | | C_OUTLINE *outline, // parent outline |
197 | | int32_t max_count // max output |
198 | 2.32M | ) { |
199 | 2.32M | bool parent_box; // could it be boxy |
200 | 2.32M | TDimension xmin, xmax; // coord limits |
201 | 2.32M | TDimension ymin, ymax; |
202 | 2.32M | C_OUTLINE *child; // current child |
203 | 2.32M | int32_t child_count; // no of children |
204 | 2.32M | int32_t grandchild_count; // no of grandchildren |
205 | 2.32M | int32_t parent_area; // potential box |
206 | 2.32M | float max_parent_area; // potential box |
207 | 2.32M | int32_t child_area; // current child |
208 | 2.32M | int32_t child_length; // current child |
209 | 2.32M | TBOX olbox; |
210 | 2.32M | C_OUTLINE_IT child_it; // search iterator |
211 | | |
212 | 2.32M | olbox = outline->bounding_box(); |
213 | 2.32M | xmin = (olbox.left() - bl.x()) / BUCKETSIZE; |
214 | 2.32M | xmax = (olbox.right() - bl.x()) / BUCKETSIZE; |
215 | 2.32M | ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE; |
216 | 2.32M | ymax = (olbox.top() - bl.y()) / BUCKETSIZE; |
217 | 2.32M | child_count = 0; |
218 | 2.32M | grandchild_count = 0; |
219 | 2.32M | parent_area = 0; |
220 | 2.32M | max_parent_area = 0; |
221 | 2.32M | parent_box = true; |
222 | 5.49M | for (auto yindex = ymin; yindex <= ymax; yindex++) { |
223 | 7.37M | for (auto xindex = xmin; xindex <= xmax; xindex++) { |
224 | 4.20M | child_it.set_to_list(&buckets[yindex * bxdim + xindex]); |
225 | 4.20M | if (child_it.empty()) { |
226 | 794k | continue; |
227 | 794k | } |
228 | 19.9M | for (child_it.mark_cycle_pt(); !child_it.cycled_list(); |
229 | 16.5M | child_it.forward()) { |
230 | 16.5M | child = child_it.data(); |
231 | 16.5M | if (child != outline && *child < *outline) { |
232 | 250k | child_count++; |
233 | 250k | if (child_count <= max_count) { |
234 | 249k | int max_grand = |
235 | 249k | (max_count - child_count) / edges_children_per_grandchild; |
236 | 249k | if (max_grand > 0) { |
237 | 232k | grandchild_count += count_children(child, max_grand) * |
238 | 232k | edges_children_per_grandchild; |
239 | 232k | } else { |
240 | 17.3k | grandchild_count += count_children(child, 1); |
241 | 17.3k | } |
242 | 249k | } |
243 | 250k | if (child_count + grandchild_count > max_count) { |
244 | 2.07k | if (edges_debug) { |
245 | 0 | tprintf("Discarding parent with child count=%d, gc=%d\n", |
246 | 0 | child_count, grandchild_count); |
247 | 0 | } |
248 | 2.07k | return child_count + grandchild_count; |
249 | 2.07k | } |
250 | 248k | if (parent_area == 0) { |
251 | 58.5k | parent_area = outline->outer_area(); |
252 | 58.5k | if (parent_area < 0) { |
253 | 11.4k | parent_area = -parent_area; |
254 | 11.4k | } |
255 | 58.5k | max_parent_area = outline->bounding_box().area() * edges_boxarea; |
256 | 58.5k | if (parent_area < max_parent_area) { |
257 | 54.4k | parent_box = false; |
258 | 54.4k | } |
259 | 58.5k | } |
260 | 248k | if (parent_box && |
261 | 248k | (!edges_children_fix || |
262 | 36.9k | child->bounding_box().height() > edges_min_nonhole)) { |
263 | 36.9k | child_area = child->outer_area(); |
264 | 36.9k | if (child_area < 0) { |
265 | 35.0k | child_area = -child_area; |
266 | 35.0k | } |
267 | 36.9k | if (edges_children_fix) { |
268 | 0 | if (parent_area - child_area < max_parent_area) { |
269 | 0 | parent_box = false; |
270 | 0 | continue; |
271 | 0 | } |
272 | 0 | if (grandchild_count > 0) { |
273 | 0 | if (edges_debug) { |
274 | 0 | tprintf( |
275 | 0 | "Discarding parent of area %d, child area=%d, max%g " |
276 | 0 | "with gc=%d\n", |
277 | 0 | parent_area, child_area, max_parent_area, |
278 | 0 | grandchild_count); |
279 | 0 | } |
280 | 0 | return max_count + 1; |
281 | 0 | } |
282 | 0 | child_length = child->pathlength(); |
283 | 0 | if (child_length * child_length > |
284 | 0 | child_area * edges_patharea_ratio) { |
285 | 0 | if (edges_debug) { |
286 | 0 | tprintf( |
287 | 0 | "Discarding parent of area %d, child area=%d, max%g " |
288 | 0 | "with child length=%d\n", |
289 | 0 | parent_area, child_area, max_parent_area, child_length); |
290 | 0 | } |
291 | 0 | return max_count + 1; |
292 | 0 | } |
293 | 0 | } |
294 | 36.9k | if (child_area < child->bounding_box().area() * edges_childarea) { |
295 | 640 | if (edges_debug) { |
296 | 0 | tprintf( |
297 | 0 | "Discarding parent of area %d, child area=%d, max%g " |
298 | 0 | "with child rect=%d\n", |
299 | 0 | parent_area, child_area, max_parent_area, |
300 | 0 | child->bounding_box().area()); |
301 | 0 | } |
302 | 640 | return max_count + 1; |
303 | 640 | } |
304 | 36.9k | } |
305 | 248k | } |
306 | 16.5M | } |
307 | 3.40M | } |
308 | 3.17M | } |
309 | 2.32M | return child_count + grandchild_count; |
310 | 2.32M | } |
311 | | |
312 | | /** |
313 | | * @name OL_BUCKETS::extract_children |
314 | | * |
315 | | * Find number of descendants of this outline. |
316 | | */ |
317 | | |
318 | | void OL_BUCKETS::extract_children( // recursive count |
319 | | C_OUTLINE *outline, // parent outline |
320 | | C_OUTLINE_IT *it // destination iterator |
321 | 54.1k | ) { |
322 | 54.1k | TDimension xmin, xmax; // coord limits |
323 | 54.1k | TDimension ymin, ymax; |
324 | 54.1k | TBOX olbox; |
325 | 54.1k | C_OUTLINE_IT child_it; // search iterator |
326 | | |
327 | 54.1k | olbox = outline->bounding_box(); |
328 | 54.1k | xmin = (olbox.left() - bl.x()) / BUCKETSIZE; |
329 | 54.1k | xmax = (olbox.right() - bl.x()) / BUCKETSIZE; |
330 | 54.1k | ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE; |
331 | 54.1k | ymax = (olbox.top() - bl.y()) / BUCKETSIZE; |
332 | 173k | for (auto yindex = ymin; yindex <= ymax; yindex++) { |
333 | 488k | for (auto xindex = xmin; xindex <= xmax; xindex++) { |
334 | 368k | child_it.set_to_list(&buckets[yindex * bxdim + xindex]); |
335 | 1.47M | for (child_it.mark_cycle_pt(); !child_it.cycled_list(); |
336 | 1.10M | child_it.forward()) { |
337 | 1.10M | if (*child_it.data() < *outline) { |
338 | 176k | it->add_after_then_move(child_it.extract()); |
339 | 176k | } |
340 | 1.10M | } |
341 | 368k | } |
342 | 119k | } |
343 | 54.1k | } |
344 | | |
345 | | /// @name extract_edges |
346 | | |
347 | | void extract_edges(Image pix, // thresholded image |
348 | 7.72k | BLOCK *block) { // block to scan |
349 | 7.72k | C_OUTLINE_LIST outlines; // outlines in block |
350 | 7.72k | C_OUTLINE_IT out_it = &outlines; |
351 | | |
352 | 7.72k | block_edges(pix, &(block->pdblk), &out_it); |
353 | 7.72k | ICOORD bleft; // block box |
354 | 7.72k | ICOORD tright; |
355 | 7.72k | block->pdblk.bounding_box(bleft, tright); |
356 | | // make blobs |
357 | 7.72k | outlines_to_blobs(block, bleft, tright, &outlines); |
358 | 7.72k | } |
359 | | |
360 | | /// @name fill_buckets |
361 | | |
362 | | static void fill_buckets(C_OUTLINE_LIST *outlines, // outlines in block |
363 | | OL_BUCKETS *buckets // output buckets |
364 | 7.72k | ) { |
365 | 7.72k | C_OUTLINE_IT out_it = outlines; // iterator |
366 | 7.72k | C_OUTLINE_IT bucket_it; // iterator in bucket |
367 | | |
368 | 2.25M | for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { |
369 | 2.25M | auto outline = out_it.extract(); // take off list |
370 | | // get box |
371 | 2.25M | const TBOX &ol_box(outline->bounding_box()); |
372 | 2.25M | bucket_it.set_to_list((*buckets)(ol_box.left(), ol_box.bottom())); |
373 | 2.25M | bucket_it.add_to_end(outline); |
374 | 2.25M | } |
375 | 7.72k | } |
376 | | |
377 | | /** |
378 | | * @name capture_children |
379 | | * |
380 | | * Find all neighbouring outlines that are children of this outline |
381 | | * and either move them to the output list or declare this outline |
382 | | * illegal and return false. |
383 | | */ |
384 | | |
385 | | static bool capture_children(OL_BUCKETS *buckets, // bucket sort class |
386 | | C_BLOB_IT *reject_it, // dead grandchildren |
387 | | C_OUTLINE_IT *blob_it // output outlines |
388 | 2.07M | ) { |
389 | | // master outline |
390 | 2.07M | auto outline = blob_it->data(); |
391 | | // no of children |
392 | 2.07M | int32_t child_count; |
393 | 2.07M | if (edges_use_new_outline_complexity) { |
394 | 0 | child_count = |
395 | 0 | buckets->outline_complexity(outline, edges_children_count_limit, 0); |
396 | 2.07M | } else { |
397 | 2.07M | child_count = buckets->count_children(outline, edges_children_count_limit); |
398 | 2.07M | } |
399 | 2.07M | if (child_count > edges_children_count_limit) { |
400 | 2.45k | return false; |
401 | 2.45k | } |
402 | | |
403 | 2.07M | if (child_count > 0) { |
404 | 54.1k | buckets->extract_children(outline, blob_it); |
405 | 54.1k | } |
406 | 2.07M | return true; |
407 | 2.07M | } |
408 | | |
409 | | /** |
410 | | * @name empty_buckets |
411 | | * |
412 | | * Run the edge detector over the block and return a list of blobs. |
413 | | */ |
414 | | |
415 | | static void empty_buckets(BLOCK *block, // block to scan |
416 | | OL_BUCKETS *buckets // output buckets |
417 | 7.72k | ) { |
418 | 7.72k | C_OUTLINE_LIST outlines; // outlines in block |
419 | | // iterator |
420 | 7.72k | C_OUTLINE_IT out_it = &outlines; |
421 | 7.72k | auto start_scan = buckets->start_scan(); |
422 | 7.72k | if (start_scan == nullptr) { |
423 | 11 | return; |
424 | 11 | } |
425 | 7.71k | C_OUTLINE_IT bucket_it = start_scan; |
426 | 7.71k | C_BLOB_IT good_blobs = block->blob_list(); |
427 | 7.71k | C_BLOB_IT junk_blobs = block->reject_blobs(); |
428 | | |
429 | 2.07M | while (!bucket_it.empty()) { |
430 | 2.07M | out_it.set_to_list(&outlines); |
431 | 2.07M | C_OUTLINE_IT parent_it; // parent outline |
432 | 2.10M | do { |
433 | 2.10M | parent_it = bucket_it; // find outermost |
434 | 8.58M | do { |
435 | 8.58M | bucket_it.forward(); |
436 | 8.58M | } while (!bucket_it.at_first() && |
437 | 8.58M | !(*parent_it.data() < *bucket_it.data())); |
438 | 2.10M | } while (!bucket_it.at_first()); |
439 | | |
440 | | // move to new list |
441 | 2.07M | out_it.add_after_then_move(parent_it.extract()); |
442 | | // healthy blob |
443 | 2.07M | bool good_blob = capture_children(buckets, &junk_blobs, &out_it); |
444 | 2.07M | C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs, |
445 | 2.07M | &junk_blobs); |
446 | | |
447 | 2.07M | if (auto l = buckets->scan_next()) |
448 | 2.06M | bucket_it.set_to_list(l); |
449 | 7.71k | else |
450 | 7.71k | break; |
451 | 2.07M | } |
452 | 7.71k | } |
453 | | |
454 | | /** |
455 | | * @name outlines_to_blobs |
456 | | * |
457 | | * Gather together outlines into blobs using the usual bucket sort. |
458 | | */ |
459 | | |
460 | | void outlines_to_blobs( // find blobs |
461 | | BLOCK *block, // block to scan |
462 | 7.72k | ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines) { |
463 | | // make buckets |
464 | 7.72k | OL_BUCKETS buckets(bleft, tright); |
465 | | |
466 | 7.72k | fill_buckets(outlines, &buckets); |
467 | 7.72k | empty_buckets(block, &buckets); |
468 | 7.72k | } |
469 | | |
470 | | } // namespace tesseract |