Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/textord/edgblob.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        edgblob.cpp (Formerly edgeloop.c)
3
 * Description: Functions to clean up an outline before approximation.
4
 * Author:      Ray Smith
5
 *
6
 *(C) Copyright 1991, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0(the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
// Include automatically generated configuration file if running autoconf.
20
#ifdef HAVE_CONFIG_H
21
#  include "config_auto.h"
22
#endif
23
24
#include "edgblob.h"
25
26
#include "edgloop.h"
27
#include "scanedg.h"
28
29
14.0M
#define BUCKETSIZE 16
30
31
namespace tesseract {
32
33
// Control parameters used in outline_complexity(), which rejects an outline
34
// if any one of the 3 conditions is satisfied:
35
//  - number of children exceeds edges_max_children_per_outline
36
//  - number of nested layers exceeds edges_max_children_layers
37
//  - joint complexity exceeds edges_children_count_limit(as in child_count())
38
static BOOL_VAR(edges_use_new_outline_complexity, false,
39
                "Use the new outline complexity module");
40
static INT_VAR(edges_max_children_per_outline, 10,
41
               "Max number of children inside a character outline");
42
static INT_VAR(edges_max_children_layers, 5,
43
               "Max layers of nested children inside a character outline");
44
static BOOL_VAR(edges_debug, false, "turn on debugging for this module");
45
46
static INT_VAR(edges_children_per_grandchild, 10,
47
               "Importance ratio for chucking outlines");
48
static INT_VAR(edges_children_count_limit, 45, "Max holes allowed in blob");
49
static BOOL_VAR(edges_children_fix, false,
50
                "Remove boxy parents of char-like children");
51
static INT_VAR(edges_min_nonhole, 12, "Min pixels for potential char in box");
52
static INT_VAR(edges_patharea_ratio, 40,
53
               "Max lensq/area for acceptable child outline");
54
static double_VAR(edges_childarea, 0.5, "Min area fraction of child outline");
55
static double_VAR(edges_boxarea, 0.875,
56
                  "Min area fraction of grandchild for box");
57
58
/**
59
 * @name OL_BUCKETS::OL_BUCKETS
60
 *
61
 * Construct an array of buckets for associating outlines into blobs.
62
 */
63
64
OL_BUCKETS::OL_BUCKETS(ICOORD bleft, // corners
65
                       ICOORD tright)
66
7.72k
    : bxdim((tright.x() - bleft.x()) / BUCKETSIZE + 1),
67
7.72k
      bydim((tright.y() - bleft.y()) / BUCKETSIZE + 1),
68
7.72k
      buckets(bxdim * bydim),
69
7.72k
      bl(bleft),
70
7.72k
      tr(tright) {}
71
72
/**
73
 * @name OL_BUCKETS::operator(
74
 *
75
 * Return a pointer to a list of C_OUTLINEs corresponding to the
76
 * given pixel coordinates.
77
 */
78
79
C_OUTLINE_LIST *OL_BUCKETS::operator()( // array access
80
    TDimension x,                       // image coords
81
2.25M
    TDimension y) {
82
2.25M
  return &buckets[(y - bl.y()) / BUCKETSIZE * bxdim +
83
2.25M
                  (x - bl.x()) / BUCKETSIZE];
84
2.25M
}
85
86
7.72k
C_OUTLINE_LIST *OL_BUCKETS::start_scan() {
87
7.72k
  return scan_next(buckets.begin());
88
7.72k
}
89
90
2.07M
C_OUTLINE_LIST *OL_BUCKETS::scan_next() {
91
2.07M
  return scan_next(it);
92
2.07M
}
93
94
2.08M
C_OUTLINE_LIST *OL_BUCKETS::scan_next(decltype(buckets)::iterator in_it) {
95
6.40M
  it = std::find_if(in_it, buckets.end(), [](auto &&b) { return !b.empty(); });
96
2.08M
  if (it == buckets.end())
97
7.72k
    return nullptr;
98
2.07M
  return &*it;
99
2.08M
}
100
101
/**
102
 * @name OL_BUCKETS::outline_complexity
103
 *
104
 * This is the new version of count_child.
105
 *
106
 * The goal of this function is to determine if an outline and its
107
 * interiors could be part of a character blob.  This is done by
108
 * computing a "complexity" index for the outline, which is the return
109
 * value of this function, and checking it against a threshold.
110
 * The max_count is used for short-circuiting the recursion and forcing
111
 * a rejection that guarantees to fail the threshold test.
112
 * The complexity F for outline X with N children X[i] is
113
 *   F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild
114
 * so each layer of nesting increases complexity exponentially.
115
 * An outline can be rejected as a text blob candidate if its complexity
116
 * is too high, has too many children(likely a container), or has too
117
 * many layers of nested inner loops.  This has the side-effect of
118
 * flattening out boxed or reversed video text regions.
119
 */
120
121
int32_t OL_BUCKETS::outline_complexity(C_OUTLINE *outline, // parent outline
122
                                       int32_t max_count,  // max output
123
                                       int16_t depth       // recursion depth
124
0
) {
125
0
  TDimension xmin, xmax;    // coord limits
126
0
  TDimension ymin, ymax;
127
0
  C_OUTLINE *child;         // current child
128
0
  int32_t child_count;      // no of children
129
0
  int32_t grandchild_count; // no of grandchildren
130
0
  C_OUTLINE_IT child_it;    // search iterator
131
132
0
  TBOX olbox = outline->bounding_box();
133
0
  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
134
0
  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
135
0
  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
136
0
  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
137
0
  child_count = 0;
138
0
  grandchild_count = 0;
139
0
  if (++depth > edges_max_children_layers) { // nested loops are too deep
140
0
    return max_count + depth;
141
0
  }
142
143
0
  for (auto yindex = ymin; yindex <= ymax; yindex++) {
144
0
    for (auto xindex = xmin; xindex <= xmax; xindex++) {
145
0
      child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
146
0
      if (child_it.empty()) {
147
0
        continue;
148
0
      }
149
0
      for (child_it.mark_cycle_pt(); !child_it.cycled_list();
150
0
           child_it.forward()) {
151
0
        child = child_it.data();
152
0
        if (child == outline || !(*child < *outline)) {
153
0
          continue;
154
0
        }
155
0
        child_count++;
156
157
0
        if (child_count > edges_max_children_per_outline) { // too fragmented
158
0
          if (edges_debug) {
159
0
            tprintf(
160
0
                "Discard outline on child_count=%d > "
161
0
                "max_children_per_outline=%d\n",
162
0
                child_count,
163
0
                static_cast<int32_t>(edges_max_children_per_outline));
164
0
          }
165
0
          return max_count + child_count;
166
0
        }
167
168
        // Compute the "complexity" of each child recursively
169
0
        int32_t remaining_count = max_count - child_count - grandchild_count;
170
0
        if (remaining_count > 0) {
171
0
          grandchild_count += edges_children_per_grandchild *
172
0
                              outline_complexity(child, remaining_count, depth);
173
0
        }
174
0
        if (child_count + grandchild_count > max_count) { // too complex
175
0
          if (edges_debug) {
176
0
            tprintf(
177
0
                "Discard outline on child_count=%d + grandchild_count=%d "
178
0
                "> max_count=%d\n",
179
0
                child_count, grandchild_count, max_count);
180
0
          }
181
0
          return child_count + grandchild_count;
182
0
        }
183
0
      }
184
0
    }
185
0
  }
186
0
  return child_count + grandchild_count;
187
0
}
188
189
/**
190
 * @name OL_BUCKETS::count_children
191
 *
192
 * Find number of descendants of this outline.
193
 */
194
// TODO(rays) Merge with outline_complexity.
195
int32_t OL_BUCKETS::count_children( // recursive count
196
    C_OUTLINE *outline,             // parent outline
197
    int32_t max_count               // max output
198
2.32M
) {
199
2.32M
  bool parent_box;    // could it be boxy
200
2.32M
  TDimension xmin, xmax;    // coord limits
201
2.32M
  TDimension ymin, ymax;
202
2.32M
  C_OUTLINE *child;         // current child
203
2.32M
  int32_t child_count;      // no of children
204
2.32M
  int32_t grandchild_count; // no of grandchildren
205
2.32M
  int32_t parent_area;      // potential box
206
2.32M
  float max_parent_area;    // potential box
207
2.32M
  int32_t child_area;       // current child
208
2.32M
  int32_t child_length;     // current child
209
2.32M
  TBOX olbox;
210
2.32M
  C_OUTLINE_IT child_it; // search iterator
211
212
2.32M
  olbox = outline->bounding_box();
213
2.32M
  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
214
2.32M
  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
215
2.32M
  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
216
2.32M
  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
217
2.32M
  child_count = 0;
218
2.32M
  grandchild_count = 0;
219
2.32M
  parent_area = 0;
220
2.32M
  max_parent_area = 0;
221
2.32M
  parent_box = true;
222
5.49M
  for (auto yindex = ymin; yindex <= ymax; yindex++) {
223
7.37M
    for (auto xindex = xmin; xindex <= xmax; xindex++) {
224
4.20M
      child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
225
4.20M
      if (child_it.empty()) {
226
794k
        continue;
227
794k
      }
228
19.9M
      for (child_it.mark_cycle_pt(); !child_it.cycled_list();
229
16.5M
           child_it.forward()) {
230
16.5M
        child = child_it.data();
231
16.5M
        if (child != outline && *child < *outline) {
232
250k
          child_count++;
233
250k
          if (child_count <= max_count) {
234
249k
            int max_grand =
235
249k
                (max_count - child_count) / edges_children_per_grandchild;
236
249k
            if (max_grand > 0) {
237
232k
              grandchild_count += count_children(child, max_grand) *
238
232k
                                  edges_children_per_grandchild;
239
232k
            } else {
240
17.3k
              grandchild_count += count_children(child, 1);
241
17.3k
            }
242
249k
          }
243
250k
          if (child_count + grandchild_count > max_count) {
244
2.07k
            if (edges_debug) {
245
0
              tprintf("Discarding parent with child count=%d, gc=%d\n",
246
0
                      child_count, grandchild_count);
247
0
            }
248
2.07k
            return child_count + grandchild_count;
249
2.07k
          }
250
248k
          if (parent_area == 0) {
251
58.5k
            parent_area = outline->outer_area();
252
58.5k
            if (parent_area < 0) {
253
11.4k
              parent_area = -parent_area;
254
11.4k
            }
255
58.5k
            max_parent_area = outline->bounding_box().area() * edges_boxarea;
256
58.5k
            if (parent_area < max_parent_area) {
257
54.4k
              parent_box = false;
258
54.4k
            }
259
58.5k
          }
260
248k
          if (parent_box &&
261
248k
              (!edges_children_fix ||
262
36.9k
               child->bounding_box().height() > edges_min_nonhole)) {
263
36.9k
            child_area = child->outer_area();
264
36.9k
            if (child_area < 0) {
265
35.0k
              child_area = -child_area;
266
35.0k
            }
267
36.9k
            if (edges_children_fix) {
268
0
              if (parent_area - child_area < max_parent_area) {
269
0
                parent_box = false;
270
0
                continue;
271
0
              }
272
0
              if (grandchild_count > 0) {
273
0
                if (edges_debug) {
274
0
                  tprintf(
275
0
                      "Discarding parent of area %d, child area=%d, max%g "
276
0
                      "with gc=%d\n",
277
0
                      parent_area, child_area, max_parent_area,
278
0
                      grandchild_count);
279
0
                }
280
0
                return max_count + 1;
281
0
              }
282
0
              child_length = child->pathlength();
283
0
              if (child_length * child_length >
284
0
                  child_area * edges_patharea_ratio) {
285
0
                if (edges_debug) {
286
0
                  tprintf(
287
0
                      "Discarding parent of area %d, child area=%d, max%g "
288
0
                      "with child length=%d\n",
289
0
                      parent_area, child_area, max_parent_area, child_length);
290
0
                }
291
0
                return max_count + 1;
292
0
              }
293
0
            }
294
36.9k
            if (child_area < child->bounding_box().area() * edges_childarea) {
295
640
              if (edges_debug) {
296
0
                tprintf(
297
0
                    "Discarding parent of area %d, child area=%d, max%g "
298
0
                    "with child rect=%d\n",
299
0
                    parent_area, child_area, max_parent_area,
300
0
                    child->bounding_box().area());
301
0
              }
302
640
              return max_count + 1;
303
640
            }
304
36.9k
          }
305
248k
        }
306
16.5M
      }
307
3.40M
    }
308
3.17M
  }
309
2.32M
  return child_count + grandchild_count;
310
2.32M
}
311
312
/**
313
 * @name OL_BUCKETS::extract_children
314
 *
315
 * Find number of descendants of this outline.
316
 */
317
318
void OL_BUCKETS::extract_children( // recursive count
319
    C_OUTLINE *outline,            // parent outline
320
    C_OUTLINE_IT *it               // destination iterator
321
54.1k
) {
322
54.1k
  TDimension xmin, xmax; // coord limits
323
54.1k
  TDimension ymin, ymax;
324
54.1k
  TBOX olbox;
325
54.1k
  C_OUTLINE_IT child_it; // search iterator
326
327
54.1k
  olbox = outline->bounding_box();
328
54.1k
  xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
329
54.1k
  xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
330
54.1k
  ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
331
54.1k
  ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
332
173k
  for (auto yindex = ymin; yindex <= ymax; yindex++) {
333
488k
    for (auto xindex = xmin; xindex <= xmax; xindex++) {
334
368k
      child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
335
1.47M
      for (child_it.mark_cycle_pt(); !child_it.cycled_list();
336
1.10M
           child_it.forward()) {
337
1.10M
        if (*child_it.data() < *outline) {
338
176k
          it->add_after_then_move(child_it.extract());
339
176k
        }
340
1.10M
      }
341
368k
    }
342
119k
  }
343
54.1k
}
344
345
/// @name extract_edges
346
347
void extract_edges(Image pix,      // thresholded image
348
7.72k
                   BLOCK *block) { // block to scan
349
7.72k
  C_OUTLINE_LIST outlines;         // outlines in block
350
7.72k
  C_OUTLINE_IT out_it = &outlines;
351
352
7.72k
  block_edges(pix, &(block->pdblk), &out_it);
353
7.72k
  ICOORD bleft; // block box
354
7.72k
  ICOORD tright;
355
7.72k
  block->pdblk.bounding_box(bleft, tright);
356
  // make blobs
357
7.72k
  outlines_to_blobs(block, bleft, tright, &outlines);
358
7.72k
}
359
360
/// @name fill_buckets
361
362
static void fill_buckets(C_OUTLINE_LIST *outlines, // outlines in block
363
                         OL_BUCKETS *buckets       // output buckets
364
7.72k
) {
365
7.72k
  C_OUTLINE_IT out_it = outlines; // iterator
366
7.72k
  C_OUTLINE_IT bucket_it;         // iterator in bucket
367
368
2.25M
  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
369
2.25M
    auto outline = out_it.extract(); // take off list
370
                                     // get box
371
2.25M
    const TBOX &ol_box(outline->bounding_box());
372
2.25M
    bucket_it.set_to_list((*buckets)(ol_box.left(), ol_box.bottom()));
373
2.25M
    bucket_it.add_to_end(outline);
374
2.25M
  }
375
7.72k
}
376
377
/**
378
 * @name capture_children
379
 *
380
 * Find all neighbouring outlines that are children of this outline
381
 * and either move them to the output list or declare this outline
382
 * illegal and return false.
383
 */
384
385
static bool capture_children(OL_BUCKETS *buckets,  // bucket sort class
386
                             C_BLOB_IT *reject_it, // dead grandchildren
387
                             C_OUTLINE_IT *blob_it // output outlines
388
2.07M
) {
389
  // master outline
390
2.07M
  auto outline = blob_it->data();
391
  // no of children
392
2.07M
  int32_t child_count;
393
2.07M
  if (edges_use_new_outline_complexity) {
394
0
    child_count =
395
0
        buckets->outline_complexity(outline, edges_children_count_limit, 0);
396
2.07M
  } else {
397
2.07M
    child_count = buckets->count_children(outline, edges_children_count_limit);
398
2.07M
  }
399
2.07M
  if (child_count > edges_children_count_limit) {
400
2.45k
    return false;
401
2.45k
  }
402
403
2.07M
  if (child_count > 0) {
404
54.1k
    buckets->extract_children(outline, blob_it);
405
54.1k
  }
406
2.07M
  return true;
407
2.07M
}
408
409
/**
410
 * @name empty_buckets
411
 *
412
 * Run the edge detector over the block and return a list of blobs.
413
 */
414
415
static void empty_buckets(BLOCK *block,       // block to scan
416
                          OL_BUCKETS *buckets // output buckets
417
7.72k
) {
418
7.72k
  C_OUTLINE_LIST outlines; // outlines in block
419
                           // iterator
420
7.72k
  C_OUTLINE_IT out_it = &outlines;
421
7.72k
  auto start_scan = buckets->start_scan();
422
7.72k
  if (start_scan == nullptr) {
423
11
    return;
424
11
  }
425
7.71k
  C_OUTLINE_IT bucket_it = start_scan;
426
7.71k
  C_BLOB_IT good_blobs = block->blob_list();
427
7.71k
  C_BLOB_IT junk_blobs = block->reject_blobs();
428
429
2.07M
  while (!bucket_it.empty()) {
430
2.07M
    out_it.set_to_list(&outlines);
431
2.07M
    C_OUTLINE_IT parent_it; // parent outline
432
2.10M
    do {
433
2.10M
      parent_it = bucket_it; // find outermost
434
8.58M
      do {
435
8.58M
        bucket_it.forward();
436
8.58M
      } while (!bucket_it.at_first() &&
437
8.58M
               !(*parent_it.data() < *bucket_it.data()));
438
2.10M
    } while (!bucket_it.at_first());
439
440
    // move to new list
441
2.07M
    out_it.add_after_then_move(parent_it.extract());
442
    // healthy blob
443
2.07M
    bool good_blob = capture_children(buckets, &junk_blobs, &out_it);
444
2.07M
    C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,
445
2.07M
                                       &junk_blobs);
446
447
2.07M
    if (auto l = buckets->scan_next())
448
2.06M
      bucket_it.set_to_list(l);
449
7.71k
    else
450
7.71k
      break;
451
2.07M
  }
452
7.71k
}
453
454
/**
455
 * @name outlines_to_blobs
456
 *
457
 * Gather together outlines into blobs using the usual bucket sort.
458
 */
459
460
void outlines_to_blobs( // find blobs
461
    BLOCK *block,       // block to scan
462
7.72k
    ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines) {
463
  // make buckets
464
7.72k
  OL_BUCKETS buckets(bleft, tright);
465
466
7.72k
  fill_buckets(outlines, &buckets);
467
7.72k
  empty_buckets(block, &buckets);
468
7.72k
}
469
470
} // namespace tesseract