Coverage Report

Created: 2025-09-27 07:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/textord/tablerecog.cpp
Line
Count
Source
1
///////////////////////////////////////////////////////////////////////
2
// File:        tablerecog.cpp
3
// Description: Helper class to help structure table areas. Given an bounding
4
//              box from TableFinder, the TableRecognizer should give a
5
//              StructuredTable (maybe a list in the future) of "good" tables
6
//              in that area.
7
// Author:      Nicholas Beato
8
//
9
// (C) Copyright 2009, Google Inc.
10
// Licensed under the Apache License, Version 2.0 (the "License");
11
// you may not use this file except in compliance with the License.
12
// You may obtain a copy of the License at
13
// http://www.apache.org/licenses/LICENSE-2.0
14
// Unless required by applicable law or agreed to in writing, software
15
// distributed under the License is distributed on an "AS IS" BASIS,
16
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
// See the License for the specific language governing permissions and
18
// limitations under the License.
19
//
20
///////////////////////////////////////////////////////////////////////
21
22
#ifdef HAVE_CONFIG_H
23
#  include "config_auto.h"
24
#endif
25
26
#include "tablerecog.h"
27
28
#include <algorithm>
29
30
namespace tesseract {
31
32
// The amount of space required between the ColPartitions in 2 columns
33
// of a non-lined table as a multiple of the median width.
34
const double kHorizontalSpacing = 0.30;
35
// The amount of space required between the ColPartitions in 2 rows
36
// of a non-lined table as multiples of the median height.
37
const double kVerticalSpacing = -0.2;
38
// The number of cells that the grid lines may intersect.
39
// See FindCellSplitLocations for explanation.
40
const int kCellSplitRowThreshold = 0;
41
const int kCellSplitColumnThreshold = 0;
42
// For "lined tables", the number of required lines. Currently a guess.
43
const int kLinedTableMinVerticalLines = 3;
44
const int kLinedTableMinHorizontalLines = 3;
45
// Number of columns required, as a fraction of the most columns found.
46
// None of these are tweaked at all.
47
const double kRequiredColumns = 0.7;
48
// The tolerance for comparing margins of potential tables.
49
const double kMarginFactor = 1.1;
50
// The first and last row should be consistent cell height.
51
// This factor is the first and last row cell height max.
52
const double kMaxRowSize = 2.5;
53
// Number of filled columns required to form a strong table row.
54
// For small tables, this is an absolute number.
55
const double kGoodRowNumberOfColumnsSmall[] = {2, 2, 2, 2, 2, 3, 3};
56
// For large tables, it is a relative number
57
const double kGoodRowNumberOfColumnsLarge = 0.7;
58
// The amount of area that must be covered in a cell by ColPartitions to
59
// be considered "filled"
60
const double kMinFilledArea = 0.35;
61
62
// Indicates that a table row is weak. This means that it has
63
// many missing data cells or very large cell heights compared.
64
// to the rest of the table.
65
// Code is buggy right now. It is disabled in the calling function.
66
// It seems like sometimes the row that is passed in is not correct
67
// sometimes (like a phantom row is introduced). There's something going
68
// on in the cell_y_ data member before this is called... not certain.
69
0
static bool IsWeakTableRow(StructuredTable *table, int row) {
70
0
  if (!table->VerifyRowFilled(row)) {
71
0
    return false;
72
0
  }
73
0
74
0
  double threshold;
75
0
  if (table->column_count() < countof(kGoodRowNumberOfColumnsSmall)) {
76
0
    threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
77
0
  } else {
78
0
    threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
79
0
  }
80
0
81
0
  return table->CountFilledCellsInRow(row) < threshold;
82
0
}
83
84
////////
85
//////// StructuredTable Class
86
////////
87
88
StructuredTable::StructuredTable()
89
0
    : text_grid_(nullptr)
90
0
    , line_grid_(nullptr)
91
0
    , is_lined_(false)
92
0
    , space_above_(0)
93
0
    , space_below_(0)
94
0
    , space_left_(0)
95
0
    , space_right_(0)
96
0
    , median_cell_height_(0)
97
0
    , median_cell_width_(0)
98
0
    , max_text_height_(INT32_MAX) {}
99
100
0
void StructuredTable::Init() {}
101
102
0
void StructuredTable::set_text_grid(ColPartitionGrid *text_grid) {
103
0
  text_grid_ = text_grid;
104
0
}
105
0
void StructuredTable::set_line_grid(ColPartitionGrid *line_grid) {
106
0
  line_grid_ = line_grid;
107
0
}
108
0
void StructuredTable::set_max_text_height(int height) {
109
0
  max_text_height_ = height;
110
0
}
111
0
bool StructuredTable::is_lined() const {
112
0
  return is_lined_;
113
0
}
114
0
unsigned StructuredTable::row_count() const {
115
0
  return cell_y_.empty() ? 0 : cell_y_.size() - 1;
116
0
}
117
0
unsigned StructuredTable::column_count() const {
118
0
  return cell_x_.empty() ? 0 : cell_x_.size() - 1;
119
0
}
120
0
unsigned StructuredTable::cell_count() const {
121
0
  return row_count() * column_count();
122
0
}
123
0
void StructuredTable::set_bounding_box(const TBOX &box) {
124
0
  bounding_box_ = box;
125
0
}
126
0
const TBOX &StructuredTable::bounding_box() const {
127
0
  return bounding_box_;
128
0
}
129
0
int StructuredTable::median_cell_height() {
130
0
  return median_cell_height_;
131
0
}
132
0
int StructuredTable::median_cell_width() {
133
0
  return median_cell_width_;
134
0
}
135
0
int StructuredTable::row_height(unsigned row) const {
136
0
  ASSERT_HOST(row < row_count());
137
0
  return cell_y_[row + 1] - cell_y_[row];
138
0
}
139
0
int StructuredTable::column_width(unsigned column) const {
140
0
  ASSERT_HOST(column < column_count());
141
0
  return cell_x_[column + 1] - cell_x_[column];
142
0
}
143
0
int StructuredTable::space_above() const {
144
0
  return space_above_;
145
0
}
146
0
int StructuredTable::space_below() const {
147
0
  return space_below_;
148
0
}
149
150
// At this point, we know that the lines are contained
151
// by the box (by FindLinesBoundingBox).
152
// So try to find the cell structure and make sure it works out.
153
// The assumption is that all lines span the table. If this
154
// assumption fails, the VerifyLinedTable method will
155
// abort the lined table. The TableRecognizer will fall
156
// back on FindWhitespacedStructure.
157
0
bool StructuredTable::FindLinedStructure() {
158
0
  ClearStructure();
159
160
  // Search for all of the lines in the current box.
161
  // Update the cellular structure with the exact lines.
162
0
  ColPartitionGridSearch box_search(line_grid_);
163
0
  box_search.SetUniqueMode(true);
164
0
  box_search.StartRectSearch(bounding_box_);
165
0
  ColPartition *line = nullptr;
166
167
0
  while ((line = box_search.NextRectSearch()) != nullptr) {
168
0
    if (line->IsHorizontalLine()) {
169
0
      cell_y_.push_back(line->MidY());
170
0
    }
171
0
    if (line->IsVerticalLine()) {
172
0
      cell_x_.push_back(line->MidX());
173
0
    }
174
0
  }
175
176
  // HasSignificantLines should guarantee cells.
177
  // Because that code is a different class, just gracefully
178
  // return false. This could be an assert.
179
0
  if (cell_x_.size() < 3 || cell_y_.size() < 3) {
180
0
    return false;
181
0
  }
182
183
  // Sort and remove duplicates that may have occurred due to split lines.
184
0
  std::sort(cell_x_.begin(), cell_x_.end());
185
0
  auto last_x = std::unique(cell_x_.begin(), cell_x_.end());
186
0
  cell_x_.erase(last_x, cell_x_.end());
187
0
  std::sort(cell_y_.begin(), cell_y_.end());
188
0
  auto last_y = std::unique(cell_y_.begin(), cell_y_.end());
189
0
  cell_y_.erase(last_y, cell_y_.end());
190
191
  // The border should be the extents of line boxes, not middle.
192
0
  cell_x_[0] = bounding_box_.left();
193
0
  cell_x_[cell_x_.size() - 1] = bounding_box_.right();
194
0
  cell_y_[0] = bounding_box_.bottom();
195
0
  cell_y_[cell_y_.size() - 1] = bounding_box_.top();
196
197
  // Remove duplicates that may have occurred due to moving the borders.
198
0
  last_x = std::unique(cell_x_.begin(), cell_x_.end());
199
0
  cell_x_.erase(last_x, cell_x_.end());
200
0
  last_y = std::unique(cell_y_.begin(), cell_y_.end());
201
0
  cell_y_.erase(last_y, cell_y_.end());
202
203
0
  CalculateMargins();
204
0
  CalculateStats();
205
0
  is_lined_ = VerifyLinedTableCells();
206
0
  return is_lined_;
207
0
}
208
209
// Finds the cellular structure given a particular box.
210
0
bool StructuredTable::FindWhitespacedStructure() {
211
0
  ClearStructure();
212
0
  FindWhitespacedColumns();
213
0
  FindWhitespacedRows();
214
215
0
  if (!VerifyWhitespacedTable()) {
216
0
    return false;
217
0
  } else {
218
0
    bounding_box_.set_left(cell_x_[0]);
219
0
    bounding_box_.set_right(cell_x_[cell_x_.size() - 1]);
220
0
    bounding_box_.set_bottom(cell_y_[0]);
221
0
    bounding_box_.set_top(cell_y_[cell_y_.size() - 1]);
222
0
    AbsorbNearbyLines();
223
0
    CalculateMargins();
224
0
    CalculateStats();
225
0
    return true;
226
0
  }
227
0
}
228
229
// Tests if a partition fits inside the table structure.
230
// Partitions must fully span a grid line in order to intersect it.
231
// This means that a partition does not intersect a line
232
// that it "just" touches. This is mainly because the assumption
233
// throughout the code is that "0" distance is a very very small space.
234
0
bool StructuredTable::DoesPartitionFit(const ColPartition &part) const {
235
0
  const TBOX &box = part.bounding_box();
236
0
  for (int i : cell_x_) {
237
0
    if (box.left() < i && i < box.right()) {
238
0
      return false;
239
0
    }
240
0
  }
241
0
  for (int i : cell_y_) {
242
0
    if (box.bottom() < i && i < box.top()) {
243
0
      return false;
244
0
    }
245
0
  }
246
0
  return true;
247
0
}
248
249
// Checks if a sub-table has multiple data cells filled.
250
0
int StructuredTable::CountFilledCells() {
251
0
  return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
252
0
}
253
0
int StructuredTable::CountFilledCellsInRow(int row) {
254
0
  return CountFilledCells(row, row, 0, column_count() - 1);
255
0
}
256
0
int StructuredTable::CountFilledCellsInColumn(int column) {
257
0
  return CountFilledCells(0, row_count() - 1, column, column);
258
0
}
259
int StructuredTable::CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start,
260
0
                                      unsigned column_end) {
261
0
  ASSERT_HOST(row_start <= row_end && row_end < row_count());
262
0
  ASSERT_HOST(column_start <= column_end && column_end < column_count());
263
0
  int cell_count = 0;
264
0
  TBOX cell_box;
265
0
  for (unsigned row = row_start; row <= row_end; ++row) {
266
0
    cell_box.set_bottom(cell_y_[row]);
267
0
    cell_box.set_top(cell_y_[row + 1]);
268
0
    for (unsigned col = column_start; col <= column_end; ++col) {
269
0
      cell_box.set_left(cell_x_[col]);
270
0
      cell_box.set_right(cell_x_[col + 1]);
271
0
      if (CountPartitions(cell_box) > 0) {
272
0
        ++cell_count;
273
0
      }
274
0
    }
275
0
  }
276
0
  return cell_count;
277
0
}
278
279
// Makes sure that at least one cell in a row has substantial area filled.
280
// This can filter out large whitespace caused by growing tables too far
281
// and page numbers.
282
0
bool StructuredTable::VerifyRowFilled(int row) {
283
0
  for (unsigned i = 0; i < column_count(); ++i) {
284
0
    auto area_filled = CalculateCellFilledPercentage(row, i);
285
0
    if (area_filled >= kMinFilledArea) {
286
0
      return true;
287
0
    }
288
0
  }
289
0
  return false;
290
0
}
291
292
// Finds the filled area in a cell.
293
// Assume ColPartitions do not overlap for simplicity (even though they do).
294
0
double StructuredTable::CalculateCellFilledPercentage(unsigned row, unsigned column) {
295
0
  ASSERT_HOST(row <= row_count());
296
0
  ASSERT_HOST(column <= column_count());
297
0
  const TBOX kCellBox(cell_x_[column], cell_y_[row], cell_x_[column + 1], cell_y_[row + 1]);
298
0
  ASSERT_HOST(!kCellBox.null_box());
299
300
0
  ColPartitionGridSearch gsearch(text_grid_);
301
0
  gsearch.SetUniqueMode(true);
302
0
  gsearch.StartRectSearch(kCellBox);
303
0
  double area_covered = 0;
304
0
  ColPartition *text = nullptr;
305
0
  while ((text = gsearch.NextRectSearch()) != nullptr) {
306
0
    if (text->IsTextType()) {
307
0
      area_covered += text->bounding_box().intersection(kCellBox).area();
308
0
    }
309
0
  }
310
0
  const int32_t current_area = kCellBox.area();
311
0
  if (current_area == 0) {
312
0
    return 1.0;
313
0
  }
314
0
  return std::min(1.0, area_covered / current_area);
315
0
}
316
317
#ifndef GRAPHICS_DISABLED
318
319
void StructuredTable::Display(ScrollView *window, ScrollView::Color color) {
320
  window->Brush(ScrollView::NONE);
321
  window->Pen(color);
322
  window->Rectangle(bounding_box_.left(), bounding_box_.bottom(), bounding_box_.right(),
323
                    bounding_box_.top());
324
  for (int i : cell_x_) {
325
    window->Line(i, bounding_box_.bottom(), i, bounding_box_.top());
326
  }
327
  for (int i : cell_y_) {
328
    window->Line(bounding_box_.left(), i, bounding_box_.right(), i);
329
  }
330
  window->UpdateWindow();
331
}
332
333
#endif
334
335
// Clear structure information.
336
0
void StructuredTable::ClearStructure() {
337
0
  cell_x_.clear();
338
0
  cell_y_.clear();
339
0
  is_lined_ = false;
340
0
  space_above_ = 0;
341
0
  space_below_ = 0;
342
0
  space_left_ = 0;
343
0
  space_right_ = 0;
344
0
  median_cell_height_ = 0;
345
0
  median_cell_width_ = 0;
346
0
}
347
348
// When a table has lines, the lines should not intersect any partitions.
349
// The following function makes sure the previous assumption is met.
350
0
bool StructuredTable::VerifyLinedTableCells() {
351
  // Function only called when lines exist.
352
0
  ASSERT_HOST(cell_y_.size() >= 2 && cell_x_.size() >= 2);
353
0
  for (int i : cell_y_) {
354
0
    if (CountHorizontalIntersections(i) > 0) {
355
0
      return false;
356
0
    }
357
0
  }
358
0
  for (int i : cell_x_) {
359
0
    if (CountVerticalIntersections(i) > 0) {
360
0
      return false;
361
0
    }
362
0
  }
363
0
  return true;
364
0
}
365
366
// TODO(nbeato): Could be much better than this.
367
// Examples:
368
//   - Calculate the percentage of filled cells.
369
//   - Calculate the average number of ColPartitions per cell.
370
//   - Calculate the number of cells per row with partitions.
371
//   - Check if ColPartitions in adjacent cells are similar.
372
//   - Check that all columns are at least a certain width.
373
//   - etc.
374
0
bool StructuredTable::VerifyWhitespacedTable() {
375
  // criteria for a table, must be at least 2x3 or 3x2
376
0
  return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
377
0
}
378
379
// Finds vertical splits in the ColPartitions of text_grid_ by considering
380
// all possible "good" guesses. A good guess is just the left/right sides of
381
// the partitions, since these locations will uniquely define where the
382
// extremal values where the splits can occur. The split happens
383
// in the middle of the two nearest partitions.
384
0
void StructuredTable::FindWhitespacedColumns() {
385
  // Set of the extents of all partitions on the page.
386
0
  std::vector<int> left_sides;
387
0
  std::vector<int> right_sides;
388
389
  // Look at each text partition. We want to find the partitions
390
  // that have extremal left/right sides. These will give us a basis
391
  // for the table columns.
392
0
  ColPartitionGridSearch gsearch(text_grid_);
393
0
  gsearch.SetUniqueMode(true);
394
0
  gsearch.StartRectSearch(bounding_box_);
395
0
  ColPartition *text = nullptr;
396
0
  while ((text = gsearch.NextRectSearch()) != nullptr) {
397
0
    if (!text->IsTextType()) {
398
0
      continue;
399
0
    }
400
401
0
    ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
402
0
    int spacing = static_cast<int>(text->median_width() * kHorizontalSpacing / 2.0 + 0.5);
403
0
    left_sides.push_back(text->bounding_box().left() - spacing);
404
0
    right_sides.push_back(text->bounding_box().right() + spacing);
405
0
  }
406
  // It causes disaster below, so avoid it!
407
0
  if (left_sides.empty() || right_sides.empty()) {
408
0
    return;
409
0
  }
410
411
  // Since data may be inserted in grid order, we sort the left/right sides.
412
0
  std::sort(left_sides.begin(), left_sides.end());
413
0
  std::sort(right_sides.begin(), right_sides.end());
414
415
  // At this point, in the "merged list", we expect to have a left side,
416
  // followed by either more left sides or a right side. The last number
417
  // should be a right side. We find places where the splits occur by looking
418
  // for "valleys". If we want to force gap sizes or allow overlap, change
419
  // the spacing above. If you want to let lines "slice" partitions as long
420
  // as it is infrequent, change the following function.
421
0
  FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, &cell_x_);
422
0
}
423
424
// Finds horizontal splits in the ColPartitions of text_grid_ by considering
425
// all possible "good" guesses. A good guess is just the bottom/top sides of
426
// the partitions, since these locations will uniquely define where the
427
// extremal values where the splits can occur. The split happens
428
// in the middle of the two nearest partitions.
429
0
void StructuredTable::FindWhitespacedRows() {
430
  // Set of the extents of all partitions on the page.
431
0
  std::vector<int> bottom_sides;
432
0
  std::vector<int> top_sides;
433
  // We will be "shrinking" partitions, so keep the min/max around to
434
  // make sure the bottom/top lines do not intersect text.
435
0
  int min_bottom = INT32_MAX;
436
0
  int max_top = INT32_MIN;
437
438
  // Look at each text partition. We want to find the partitions
439
  // that have extremal bottom/top sides. These will give us a basis
440
  // for the table rows. Because the textlines can be skewed and close due
441
  // to warping, the height of the partitions is toned down a little bit.
442
0
  ColPartitionGridSearch gsearch(text_grid_);
443
0
  gsearch.SetUniqueMode(true);
444
0
  gsearch.StartRectSearch(bounding_box_);
445
0
  ColPartition *text = nullptr;
446
0
  while ((text = gsearch.NextRectSearch()) != nullptr) {
447
0
    if (!text->IsTextType()) {
448
0
      continue;
449
0
    }
450
451
0
    ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
452
0
    min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom()));
453
0
    max_top = std::max(max_top, static_cast<int>(text->bounding_box().top()));
454
455
    // Ignore "tall" text partitions, as these are usually false positive
456
    // vertical text or multiple lines pulled together.
457
0
    if (text->bounding_box().height() > max_text_height_) {
458
0
      continue;
459
0
    }
460
461
0
    int spacing = static_cast<int>(text->bounding_box().height() * kVerticalSpacing / 2.0 + 0.5);
462
0
    int bottom = text->bounding_box().bottom() - spacing;
463
0
    int top = text->bounding_box().top() + spacing;
464
    // For horizontal text, the factor can be negative. This should
465
    // probably cause a warning or failure. I haven't actually checked if
466
    // it happens.
467
0
    if (bottom >= top) {
468
0
      continue;
469
0
    }
470
471
0
    bottom_sides.push_back(bottom);
472
0
    top_sides.push_back(top);
473
0
  }
474
  // It causes disaster below, so avoid it!
475
0
  if (bottom_sides.empty() || top_sides.empty()) {
476
0
    return;
477
0
  }
478
479
  // Since data may be inserted in grid order, we sort the bottom/top sides.
480
0
  std::sort(bottom_sides.begin(), bottom_sides.end());
481
0
  std::sort(top_sides.begin(), top_sides.end());
482
483
  // At this point, in the "merged list", we expect to have a bottom side,
484
  // followed by either more bottom sides or a top side. The last number
485
  // should be a top side. We find places where the splits occur by looking
486
  // for "valleys". If we want to force gap sizes or allow overlap, change
487
  // the spacing above. If you want to let lines "slice" partitions as long
488
  // as it is infrequent, change the following function.
489
0
  FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, &cell_y_);
490
491
  // Recover the min/max correctly since it was shifted.
492
0
  cell_y_[0] = min_bottom;
493
0
  cell_y_[cell_y_.size() - 1] = max_top;
494
0
}
495
496
0
void StructuredTable::CalculateMargins() {
497
0
  space_above_ = INT32_MAX;
498
0
  space_below_ = INT32_MAX;
499
0
  space_right_ = INT32_MAX;
500
0
  space_left_ = INT32_MAX;
501
0
  UpdateMargins(text_grid_);
502
0
  UpdateMargins(line_grid_);
503
0
}
504
// Finds the nearest partition in grid to the table
505
// boundaries and updates the margin.
506
0
void StructuredTable::UpdateMargins(ColPartitionGrid *grid) {
507
0
  int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
508
0
  space_below_ = std::min(space_below_, below);
509
0
  int above = FindVerticalMargin(grid, bounding_box_.top(), false);
510
0
  space_above_ = std::min(space_above_, above);
511
0
  int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
512
0
  space_left_ = std::min(space_left_, left);
513
0
  int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
514
0
  space_right_ = std::min(space_right_, right);
515
0
}
516
0
int StructuredTable::FindVerticalMargin(ColPartitionGrid *grid, int border, bool decrease) const {
517
0
  ColPartitionGridSearch gsearch(grid);
518
0
  gsearch.SetUniqueMode(true);
519
0
  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), border);
520
0
  ColPartition *part = nullptr;
521
0
  while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) {
522
0
    if (!part->IsTextType() && !part->IsHorizontalLine()) {
523
0
      continue;
524
0
    }
525
0
    int distance =
526
0
        decrease ? border - part->bounding_box().top() : part->bounding_box().bottom() - border;
527
0
    if (distance >= 0) {
528
0
      return distance;
529
0
    }
530
0
  }
531
0
  return INT32_MAX;
532
0
}
533
0
int StructuredTable::FindHorizontalMargin(ColPartitionGrid *grid, int border, bool decrease) const {
534
0
  ColPartitionGridSearch gsearch(grid);
535
0
  gsearch.SetUniqueMode(true);
536
0
  gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
537
0
  ColPartition *part = nullptr;
538
0
  while ((part = gsearch.NextSideSearch(decrease)) != nullptr) {
539
0
    if (!part->IsTextType() && !part->IsVerticalLine()) {
540
0
      continue;
541
0
    }
542
0
    int distance =
543
0
        decrease ? border - part->bounding_box().right() : part->bounding_box().left() - border;
544
0
    if (distance >= 0) {
545
0
      return distance;
546
0
    }
547
0
  }
548
0
  return INT32_MAX;
549
0
}
550
551
0
void StructuredTable::CalculateStats() {
552
0
  const int kMaxCellHeight = 1000;
553
0
  const int kMaxCellWidth = 1000;
554
0
  STATS height_stats(0, kMaxCellHeight);
555
0
  STATS width_stats(0, kMaxCellWidth);
556
557
0
  for (unsigned i = 0; i < row_count(); ++i) {
558
0
    height_stats.add(row_height(i), column_count());
559
0
  }
560
0
  for (unsigned i = 0; i < column_count(); ++i) {
561
0
    width_stats.add(column_width(i), row_count());
562
0
  }
563
564
0
  median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
565
0
  median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
566
0
}
567
568
// Looks for grid lines near the current bounding box and
569
// grows the bounding box to include them if no intersections
570
// will occur as a result. This is necessary because the margins
571
// are calculated relative to the closest line/text. If the
572
// line isn't absorbed, the margin will be the distance to the line.
573
0
void StructuredTable::AbsorbNearbyLines() {
574
0
  ColPartitionGridSearch gsearch(line_grid_);
575
0
  gsearch.SetUniqueMode(true);
576
577
  // Is the closest line above good? Loop multiple times for tables with
578
  // multi-line (sometimes 2) borders. Limit the number of lines by
579
  // making sure they stay within a table cell or so.
580
0
  ColPartition *line = nullptr;
581
0
  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.top());
582
0
  while ((line = gsearch.NextVerticalSearch(false)) != nullptr) {
583
0
    if (!line->IsHorizontalLine()) {
584
0
      break;
585
0
    }
586
0
    TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1, bounding_box_.right(),
587
0
                     line->MidY());
588
0
    if (text_search.height() > median_cell_height_ * 2) {
589
0
      break;
590
0
    }
591
0
    if (CountPartitions(text_search) > 0) {
592
0
      break;
593
0
    }
594
0
    bounding_box_.set_top(line->MidY());
595
0
  }
596
  // As above, is the closest line below good?
597
0
  line = nullptr;
598
0
  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.bottom());
599
0
  while ((line = gsearch.NextVerticalSearch(true)) != nullptr) {
600
0
    if (!line->IsHorizontalLine()) {
601
0
      break;
602
0
    }
603
0
    TBOX text_search(bounding_box_.left(), line->MidY(), bounding_box_.right(),
604
0
                     bounding_box_.bottom() - 1);
605
0
    if (text_search.height() > median_cell_height_ * 2) {
606
0
      break;
607
0
    }
608
0
    if (CountPartitions(text_search) > 0) {
609
0
      break;
610
0
    }
611
0
    bounding_box_.set_bottom(line->MidY());
612
0
  }
613
  // TODO(nbeato): vertical lines
614
0
}
615
616
// This function will find all "0 valleys" (of any length) given two
617
// arrays. The arrays are the mins and maxes of partitions (either
618
// left and right or bottom and top). Since the min/max lists are generated
619
// with pairs of increasing integers, we can make some assumptions in
620
// the function about ordering of the overall list, which are shown in the
621
// asserts.
622
// The algorithm works as follows:
623
//   While there are numbers to process, take the smallest number.
624
//     If it is from the min_list, increment the "hill" counter.
625
//     Otherwise, decrement the "hill" counter.
626
//     In the process of doing this, keep track of "crossing" the
627
//     desired height.
628
// The first/last items are extremal values of the list and known.
629
// NOTE: This function assumes the lists are sorted!
630
void StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list,
631
                                             const std::vector<int> &max_list, int max_merged,
632
0
                                             std::vector<int> *locations) {
633
0
  locations->clear();
634
0
  ASSERT_HOST(min_list.size() == max_list.size());
635
0
  if (min_list.empty()) {
636
0
    return;
637
0
  }
638
0
  ASSERT_HOST(min_list.at(0) < max_list.at(0));
639
0
  ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1));
640
641
0
  locations->push_back(min_list.at(0));
642
0
  unsigned min_index = 0;
643
0
  unsigned max_index = 0;
644
0
  int stacked_partitions = 0;
645
0
  int last_cross_position = INT32_MAX;
646
  // max_index will expire after min_index.
647
  // However, we can't "increase" the hill size if min_index expired.
648
  // So finish processing when min_index expires.
649
0
  while (min_index < min_list.size()) {
650
    // Increase the hill count.
651
0
    if (min_list[min_index] < max_list[max_index]) {
652
0
      ++stacked_partitions;
653
0
      if (last_cross_position != INT32_MAX && stacked_partitions > max_merged) {
654
0
        int mid = (last_cross_position + min_list[min_index]) / 2;
655
0
        locations->push_back(mid);
656
0
        last_cross_position = INT32_MAX;
657
0
      }
658
0
      ++min_index;
659
0
    } else {
660
      // Decrease the hill count.
661
0
      --stacked_partitions;
662
0
      if (last_cross_position == INT32_MAX && stacked_partitions <= max_merged) {
663
0
        last_cross_position = max_list[max_index];
664
0
      }
665
0
      ++max_index;
666
0
    }
667
0
  }
668
0
  locations->push_back(max_list.at(max_list.size() - 1));
669
0
}
670
671
// Counts the number of partitions in the table
672
// box that intersection the given x value.
673
0
int StructuredTable::CountVerticalIntersections(int x) {
674
0
  int count = 0;
675
  // Make a small box to keep the search time down.
676
0
  const int kGridSize = text_grid_->gridsize();
677
0
  TBOX vertical_box = bounding_box_;
678
0
  vertical_box.set_left(x - kGridSize);
679
0
  vertical_box.set_right(x + kGridSize);
680
681
0
  ColPartitionGridSearch gsearch(text_grid_);
682
0
  gsearch.SetUniqueMode(true);
683
0
  gsearch.StartRectSearch(vertical_box);
684
0
  ColPartition *text = nullptr;
685
0
  while ((text = gsearch.NextRectSearch()) != nullptr) {
686
0
    if (!text->IsTextType()) {
687
0
      continue;
688
0
    }
689
0
    const TBOX &box = text->bounding_box();
690
0
    if (box.left() < x && x < box.right()) {
691
0
      ++count;
692
0
    }
693
0
  }
694
0
  return count;
695
0
}
696
697
// Counts the number of partitions in the table
698
// box that intersection the given y value.
699
0
int StructuredTable::CountHorizontalIntersections(int y) {
700
0
  int count = 0;
701
  // Make a small box to keep the search time down.
702
0
  const int kGridSize = text_grid_->gridsize();
703
0
  TBOX horizontal_box = bounding_box_;
704
0
  horizontal_box.set_bottom(y - kGridSize);
705
0
  horizontal_box.set_top(y + kGridSize);
706
707
0
  ColPartitionGridSearch gsearch(text_grid_);
708
0
  gsearch.SetUniqueMode(true);
709
0
  gsearch.StartRectSearch(horizontal_box);
710
0
  ColPartition *text = nullptr;
711
0
  while ((text = gsearch.NextRectSearch()) != nullptr) {
712
0
    if (!text->IsTextType()) {
713
0
      continue;
714
0
    }
715
716
0
    const TBOX &box = text->bounding_box();
717
0
    if (box.bottom() < y && y < box.top()) {
718
0
      ++count;
719
0
    }
720
0
  }
721
0
  return count;
722
0
}
723
724
// Counts how many text partitions are in this box.
725
// This is used to count partitions in cells, as that can indicate
726
// how "strong" a potential table row/column (or even full table) actually is.
727
0
int StructuredTable::CountPartitions(const TBOX &box) {
728
0
  ColPartitionGridSearch gsearch(text_grid_);
729
0
  gsearch.SetUniqueMode(true);
730
0
  gsearch.StartRectSearch(box);
731
0
  int count = 0;
732
0
  ColPartition *text = nullptr;
733
0
  while ((text = gsearch.NextRectSearch()) != nullptr) {
734
0
    if (text->IsTextType()) {
735
0
      ++count;
736
0
    }
737
0
  }
738
0
  return count;
739
0
}
740
741
////////
742
//////// TableRecognizer Class
743
////////
744
745
0
void TableRecognizer::Init() {}
746
747
0
void TableRecognizer::set_text_grid(ColPartitionGrid *text_grid) {
748
0
  text_grid_ = text_grid;
749
0
}
750
0
void TableRecognizer::set_line_grid(ColPartitionGrid *line_grid) {
751
0
  line_grid_ = line_grid;
752
0
}
753
0
void TableRecognizer::set_min_height(int height) {
754
0
  min_height_ = height;
755
0
}
756
0
void TableRecognizer::set_min_width(int width) {
757
0
  min_width_ = width;
758
0
}
759
0
void TableRecognizer::set_max_text_height(int height) {
760
0
  max_text_height_ = height;
761
0
}
762
763
0
StructuredTable *TableRecognizer::RecognizeTable(const TBOX &guess) {
764
0
  auto *table = new StructuredTable();
765
0
  table->Init();
766
0
  table->set_text_grid(text_grid_);
767
0
  table->set_line_grid(line_grid_);
768
0
  table->set_max_text_height(max_text_height_);
769
770
  // Try to solve this simple case, a table with *both*
771
  // vertical and horizontal lines.
772
0
  if (RecognizeLinedTable(guess, table)) {
773
0
    return table;
774
0
  }
775
776
  // Fallback to whitespace if that failed.
777
  // TODO(nbeato): Break this apart to take advantage of horizontal
778
  // lines or vertical lines when present.
779
0
  if (RecognizeWhitespacedTable(guess, table)) {
780
0
    return table;
781
0
  }
782
783
  // No table found...
784
0
  delete table;
785
0
  return nullptr;
786
0
}
787
788
0
bool TableRecognizer::RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table) {
789
0
  if (!HasSignificantLines(guess_box)) {
790
0
    return false;
791
0
  }
792
0
  TBOX line_bound = guess_box;
793
0
  if (!FindLinesBoundingBox(&line_bound)) {
794
0
    return false;
795
0
  }
796
0
  table->set_bounding_box(line_bound);
797
0
  return table->FindLinedStructure();
798
0
}
799
800
// Quick implementation. Just count the number of lines in the box.
801
// A better implementation would counter intersections and look for connected
802
// components. It could even go as far as finding similar length lines.
803
// To account for these possible issues, the VerifyLinedTableCells function
804
// will reject lined tables that cause intersections with text on the page.
805
// TODO(nbeato): look for "better" lines
806
0
bool TableRecognizer::HasSignificantLines(const TBOX &guess) {
807
0
  ColPartitionGridSearch box_search(line_grid_);
808
0
  box_search.SetUniqueMode(true);
809
0
  box_search.StartRectSearch(guess);
810
0
  ColPartition *line = nullptr;
811
0
  int vertical_count = 0;
812
0
  int horizontal_count = 0;
813
814
0
  while ((line = box_search.NextRectSearch()) != nullptr) {
815
0
    if (line->IsHorizontalLine()) {
816
0
      ++horizontal_count;
817
0
    }
818
0
    if (line->IsVerticalLine()) {
819
0
      ++vertical_count;
820
0
    }
821
0
  }
822
823
0
  return vertical_count >= kLinedTableMinVerticalLines &&
824
0
         horizontal_count >= kLinedTableMinHorizontalLines;
825
0
}
826
827
// Given a bounding box with a bunch of horizontal / vertical lines,
828
// we just find the extents of all of these lines iteratively.
829
// The box will be at least as large as guess. This
830
// could possibly be a bad assumption.
831
// It is guaranteed to halt in at least O(n * gridarea) where n
832
// is the number of lines.
833
// The assumption is that growing the box iteratively will add lines
834
// several times, but eventually we'll find the extents.
835
//
836
// For tables, the approach is a bit aggressive, a single line (which could be
837
// noise or a column ruling) can destroy the table inside.
838
//
839
// TODO(nbeato): This is a quick first implementation.
840
// A better implementation would actually look for consistency
841
// in extents of the lines and find the extents using lines
842
// that clearly describe the table. This would allow the
843
// lines to "vote" for height/width. An approach like
844
// this would solve issues with page layout rulings.
845
// I haven't looked for these issues yet, so I can't even
846
// say they happen confidently.
847
0
bool TableRecognizer::FindLinesBoundingBox(TBOX *bounding_box) {
848
  // The first iteration will tell us if there are lines
849
  // present and shrink the box to a minimal iterative size.
850
0
  if (!FindLinesBoundingBoxIteration(bounding_box)) {
851
0
    return false;
852
0
  }
853
854
  // Keep growing until the area of the table stabilizes.
855
  // The box can only get bigger, increasing area.
856
0
  bool changed = true;
857
0
  while (changed) {
858
0
    changed = false;
859
0
    int old_area = bounding_box->area();
860
0
    bool check = FindLinesBoundingBoxIteration(bounding_box);
861
    // At this point, the function will return true.
862
0
    ASSERT_HOST(check);
863
0
    ASSERT_HOST(bounding_box->area() >= old_area);
864
0
    changed = (bounding_box->area() > old_area);
865
0
  }
866
867
0
  return true;
868
0
}
869
870
0
bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX *bounding_box) {
871
  // Search for all of the lines in the current box, keeping track of extents.
872
0
  ColPartitionGridSearch box_search(line_grid_);
873
0
  box_search.SetUniqueMode(true);
874
0
  box_search.StartRectSearch(*bounding_box);
875
0
  ColPartition *line = nullptr;
876
0
  bool first_line = true;
877
878
0
  while ((line = box_search.NextRectSearch()) != nullptr) {
879
0
    if (line->IsLineType()) {
880
0
      if (first_line) {
881
        // The first iteration can shrink the box.
882
0
        *bounding_box = line->bounding_box();
883
0
        first_line = false;
884
0
      } else {
885
0
        *bounding_box += line->bounding_box();
886
0
      }
887
0
    }
888
0
  }
889
0
  return !first_line;
890
0
}
891
892
// The goal of this function is to move the table boundaries around and find
893
// a table that maximizes the whitespace around the table while maximizing
894
// the cellular structure. As a result, it gets confused by headers, footers,
895
// and merged columns (text that crosses columns). There is a tolerance
896
// that allows a few partitions to count towards potential cell merges.
897
// It's the max_merged parameter to FindPartitionLocations.
898
// It can work, but it needs some false positive remove on boundaries.
899
// For now, the grid structure must not intersect any partitions.
900
// Also, small tolerance is added to the horizontal lines for tightly packed
901
// tables. The tolerance is added by adjusting the bounding boxes of the
902
// partitions (in FindHorizontalPartitions). The current implementation
903
// only adjusts the vertical extents of the table.
904
//
905
// Also note. This was hacked at a lot. It could probably use some
906
// more hacking at to find a good set of border conditions and then a
907
// nice clean up.
908
0
bool TableRecognizer::RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table) {
909
0
  TBOX best_box = guess_box; // Best borders known.
910
0
  int best_below = 0;        // Margin size above best table.
911
0
  int best_above = 0;        // Margin size below best table.
912
0
  TBOX adjusted = guess_box; // The search box.
913
914
  // We assume that the guess box is somewhat accurate, so we don't allow
915
  // the adjusted border to pass half of the guessed area. This prevents
916
  // "negative" tables from forming.
917
0
  const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
918
  // Keeps track of the most columns in an accepted table. The resulting table
919
  // may be less than the max, but we don't want to stray too far.
920
0
  unsigned best_cols = 0;
921
  // Make sure we find a good border.
922
0
  bool found_good_border = false;
923
924
  // Find the bottom of the table by trying a few different locations. For
925
  // each location, the top, left, and right are fixed. We start the search
926
  // in a smaller table to favor best_cols getting a good estimate sooner.
927
0
  int last_bottom = INT32_MAX;
928
0
  int bottom =
929
0
      NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY - min_height_ / 2, true);
930
0
  int top =
931
0
      NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
932
0
  adjusted.set_top(top);
933
934
  // Headers/footers can be spaced far from everything.
935
  // Make sure that the space below is greater than the space above
936
  // the lowest row.
937
0
  int previous_below = 0;
938
0
  const int kMaxChances = 10;
939
0
  int chances = kMaxChances;
940
0
  while (bottom != last_bottom) {
941
0
    adjusted.set_bottom(bottom);
942
943
0
    if (adjusted.height() >= min_height_) {
944
      // Try to fit the grid on the current box. We give it a chance
945
      // if the number of columns didn't significantly drop.
946
0
      table->set_bounding_box(adjusted);
947
0
      if (table->FindWhitespacedStructure() &&
948
0
          table->column_count() >= best_cols * kRequiredColumns) {
949
0
        if (false && IsWeakTableRow(table, 0)) {
950
          // Currently buggy, but was looking promising so disabled.
951
0
          --chances;
952
0
        } else {
953
          // We favor 2 things,
954
          //   1- Adding rows that have partitioned data.
955
          //   2- Better margins (to find header/footer).
956
          // For better tables, we just look for multiple cells in the
957
          // bottom row with data in them.
958
          // For margins, the space below the last row should
959
          // be better than a table with the last row removed.
960
0
          chances = kMaxChances;
961
0
          double max_row_height = kMaxRowSize * table->median_cell_height();
962
0
          if ((table->space_below() * kMarginFactor >= best_below &&
963
0
               table->space_below() >= previous_below) ||
964
0
              (table->CountFilledCellsInRow(0) > 1 && table->row_height(0) < max_row_height)) {
965
0
            best_box.set_bottom(bottom);
966
0
            best_below = table->space_below();
967
0
            best_cols = std::max(table->column_count(), best_cols);
968
0
            found_good_border = true;
969
0
          }
970
0
        }
971
0
        previous_below = table->space_below();
972
0
      } else {
973
0
        --chances;
974
0
      }
975
0
    }
976
0
    if (chances <= 0) {
977
0
      break;
978
0
    }
979
980
0
    last_bottom = bottom;
981
0
    bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_bottom, true);
982
0
  }
983
0
  if (!found_good_border) {
984
0
    return false;
985
0
  }
986
987
  // TODO(nbeato) comments: follow modified code above... put it in a function!
988
0
  found_good_border = false;
989
0
  int last_top = INT32_MIN;
990
0
  top =
991
0
      NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
992
0
  int previous_above = 0;
993
0
  chances = kMaxChances;
994
995
0
  adjusted.set_bottom(best_box.bottom());
996
0
  while (last_top != top) {
997
0
    adjusted.set_top(top);
998
0
    if (adjusted.height() >= min_height_) {
999
0
      table->set_bounding_box(adjusted);
1000
0
      if (table->FindWhitespacedStructure() &&
1001
0
          table->column_count() >= best_cols * kRequiredColumns) {
1002
0
        int last_row = table->row_count() - 1;
1003
0
        if (false && IsWeakTableRow(table, last_row)) {
1004
          // Currently buggy, but was looking promising so disabled.
1005
0
          --chances;
1006
0
        } else {
1007
0
          chances = kMaxChances;
1008
0
          double max_row_height = kMaxRowSize * table->median_cell_height();
1009
0
          if ((table->space_above() * kMarginFactor >= best_above &&
1010
0
               table->space_above() >= previous_above) ||
1011
0
              (table->CountFilledCellsInRow(last_row) > 1 &&
1012
0
               table->row_height(last_row) < max_row_height)) {
1013
0
            best_box.set_top(top);
1014
0
            best_above = table->space_above();
1015
0
            best_cols = std::max(table->column_count(), best_cols);
1016
0
            found_good_border = true;
1017
0
          }
1018
0
        }
1019
0
        previous_above = table->space_above();
1020
0
      } else {
1021
0
        --chances;
1022
0
      }
1023
0
    }
1024
0
    if (chances <= 0) {
1025
0
      break;
1026
0
    }
1027
1028
0
    last_top = top;
1029
0
    top = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_top, false);
1030
0
  }
1031
1032
0
  if (!found_good_border) {
1033
0
    return false;
1034
0
  }
1035
1036
  // If we get here, this shouldn't happen. It can be an assert, but
1037
  // I haven't tested it enough to make it crash things.
1038
0
  if (best_box.null_box()) {
1039
0
    return false;
1040
0
  }
1041
1042
  // Given the best locations, fit the box to those locations.
1043
0
  table->set_bounding_box(best_box);
1044
0
  return table->FindWhitespacedStructure();
1045
0
}
1046
1047
// Finds the closest value to y that can safely cause a horizontal
1048
// split in the partitions.
1049
// This function has been buggy and not as reliable as I would've
1050
// liked. I suggest finding all of the splits using the
1051
// FindPartitionLocations once and then just keeping the results
1052
// of that function cached somewhere.
1053
0
int TableRecognizer::NextHorizontalSplit(int left, int right, int y, bool top_to_bottom) {
1054
0
  ColPartitionGridSearch gsearch(text_grid_);
1055
0
  gsearch.SetUniqueMode(true);
1056
0
  gsearch.StartVerticalSearch(left, right, y);
1057
0
  ColPartition *text = nullptr;
1058
0
  int last_y = y;
1059
0
  while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != nullptr) {
1060
0
    if (!text->IsTextType() || !text->IsHorizontalType()) {
1061
0
      continue;
1062
0
    }
1063
0
    if (text->bounding_box().height() > max_text_height_) {
1064
0
      continue;
1065
0
    }
1066
1067
0
    const TBOX &text_box = text->bounding_box();
1068
0
    if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
1069
0
      last_y = std::min(last_y, static_cast<int>(text_box.bottom()));
1070
0
      continue;
1071
0
    }
1072
0
    if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
1073
0
      last_y = std::max(last_y, static_cast<int>(text_box.top()));
1074
0
      continue;
1075
0
    }
1076
1077
0
    return last_y;
1078
0
  }
1079
  // If none is found, we at least want to preserve the min/max,
1080
  // which defines the overlap of y with the last partition in the grid.
1081
0
  return last_y;
1082
0
}
1083
1084
} // namespace tesseract