Coverage Report

Created: 2025-06-13 07:02

/src/tesseract/src/textord/tablerecog.h
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////
2
// File:        tablerecog.h
3
// Description: Functions to detect structure of tables.
4
// Author:      Nicholas Beato
5
//
6
// (C) Copyright 2010, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#ifndef TABLERECOG_H_
20
#define TABLERECOG_H_
21
22
#include "colpartitiongrid.h"
23
24
namespace tesseract {
25
26
// There are 2 classes in this file. They have 2 different purposes.
27
//  - StructuredTable contains the methods to find the structure given
28
//    a specific bounding box and grow that structure.
29
//  - TableRecognizer contains the methods to adjust the possible positions
30
//    of a table without worrying about structure.
31
//
32
// To use these classes, the assumption is that the TableFinder will
33
// have a guess of the location of a table (or possibly over/undersegmented
34
// tables). The TableRecognizer is responsible for finding the table boundaries
35
// at a high level. The StructuredTable class is responsible for determining
36
// the structure of the table and trying to maximize its bounds while retaining
37
// the structure.
38
// (The latter part is not implemented yet, but that was the goal).
39
//
40
// While on the boundary discussion, keep in mind that this is a first pass.
41
// There should eventually be some things like internal structure checks,
42
// and, more importantly, surrounding text flow checks.
43
//
44
45
// Usage:
46
// The StructuredTable class contains methods to query a potential table.
47
// It has functions to find structure, count rows, find ColPartitions that
48
// intersect gridlines, etc. It is not meant to blindly find a table. It
49
// is meant to start with a known table location and enhance it.
50
// Usage:
51
//    ColPartitionGrid text_grid, line_grid;  // init
52
//    TBOX table_box;  // known location of table location
53
//
54
//    StructuredTable table;
55
//    table.Init();  // construction code
56
//    table.set_text_grid(/* text */);  // These 2 grids can be the same!
57
//    table.set_line_grid(/* lines */);
58
//    table.set_min_text_height(10);    // Filter vertical and tall text.
59
//    // IMPORTANT! The table needs to be told where it is!
60
//    table.set_bounding_box(table_box);  // Set initial table location.
61
//    if (table.FindWhitespacedStructure()) {
62
//      // process table
63
//      table.column_count();  // number of columns
64
//      table.row_count();     // number of rows
65
//      table.cells_count();   // number of cells
66
//      table.bounding_box();  // updated bounding box
67
//      // etc.
68
//    }
69
//
70
class TESS_API StructuredTable {
71
public:
72
  StructuredTable();
73
0
  ~StructuredTable() = default;
74
75
  // Initialization code. Must be called after the constructor.
76
  void Init();
77
78
  // Sets the grids used by the table. These can be changed between
79
  // calls to Recognize. They are treated as read-only data.
80
  void set_text_grid(ColPartitionGrid *text);
81
  void set_line_grid(ColPartitionGrid *lines);
82
  // Filters text partitions that are ridiculously tall to prevent
83
  // merging rows.
84
  void set_max_text_height(int height);
85
86
  // Basic accessors. Some are treated as attributes despite having indirect
87
  // representation.
88
  bool is_lined() const;
89
  unsigned row_count() const;
90
  unsigned column_count() const;
91
  unsigned cell_count() const;
92
  void set_bounding_box(const TBOX &box);
93
  const TBOX &bounding_box() const;
94
  int median_cell_height();
95
  int median_cell_width();
96
  int row_height(unsigned row) const;
97
  int column_width(unsigned column) const;
98
  int space_above() const;
99
  int space_below() const;
100
101
  // Given enough horizontal and vertical lines in a region, create this table
102
  // based on the structure given by the lines. Return true if it worked out.
103
  // Code assumes the lines exist. It is the caller's responsibility to check
104
  // for lines and find an appropriate bounding box.
105
  bool FindLinedStructure();
106
107
  // The main subroutine for finding generic table structure. The function
108
  // finds the grid structure in the given box. Returns true if a good grid
109
  // exists, implying that "this" table is valid.
110
  bool FindWhitespacedStructure();
111
112
  ////////
113
  //////// Functions to query table info.
114
  ////////
115
116
  // Returns true if inserting part into the table does not cause any
117
  // cell merges.
118
  bool DoesPartitionFit(const ColPartition &part) const;
119
  // Checks if a sub-table has multiple data cells filled.
120
  int CountFilledCells();
121
  int CountFilledCellsInRow(int row);
122
  int CountFilledCellsInColumn(int column);
123
  int CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start, unsigned column_end);
124
125
  // Makes sure that at least one cell in a row has substantial area filled.
126
  // This can filter out large whitespace caused by growing tables too far
127
  // and page numbers.
128
  // (currently bugged for some reason).
129
  bool VerifyRowFilled(int row);
130
  // Finds the filled area in a cell.
131
  double CalculateCellFilledPercentage(unsigned row, unsigned column);
132
133
  // Debug display, draws the table in the given color. If the table is not
134
  // valid, the table and "best" grid lines are still drawn in the given color.
135
  void Display(ScrollView *window, ScrollView::Color color);
136
137
protected:
138
  // Clear the structure information.
139
  void ClearStructure();
140
141
  ////////
142
  //////// Lined tables
143
  ////////
144
145
  // Verifies the lines do not intersect partitions. This happens when
146
  // the lines are in column boundaries and extend the full page. As a result,
147
  // the grid lines go through column text. The condition is detectable.
148
  bool VerifyLinedTableCells();
149
150
  ////////
151
  //////// Tables with whitespace
152
  ////////
153
154
  // This is the function to change if you want to filter resulting tables
155
  // better. Right now it just checks for a minimum cell count and such.
156
  // You could add things like maximum number of ColPartitions per cell or
157
  // similar.
158
  bool VerifyWhitespacedTable();
159
  // Find the columns of a table using whitespace.
160
  void FindWhitespacedColumns();
161
  // Find the rows of a table using whitespace.
162
  void FindWhitespacedRows();
163
164
  ////////
165
  //////// Functions to provide information about the table.
166
  ////////
167
168
  // Calculates the whitespace around the table using the table boundary and
169
  // the supplied grids (set_text_grid and set_line_grid).
170
  void CalculateMargins();
171
  // Update the table margins with the supplied grid. This is
172
  // only called by calculate margins to use multiple grid sources.
173
  void UpdateMargins(ColPartitionGrid *grid);
174
  int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const;
175
  int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const;
176
  // Calculates stats on the table, namely the median cell height and width.
177
  void CalculateStats();
178
179
  ////////
180
  //////// Functions to try to "fix" some table errors.
181
  ////////
182
183
  // Given a whitespaced table, this looks for bordering lines that might
184
  // be page layout boxes around the table. It is necessary to get the margins
185
  // correct on the table. If the lines are not joined, the margins will be
186
  // the distance to the line, which is not right.
187
  void AbsorbNearbyLines();
188
189
  // Nice utility function for finding partition gaps. You feed it a sorted
190
  // list of all of the mins/maxes of the partitions in the table, and it gives
191
  // you the gaps (middle). This works for both vertical and horizontal
192
  // gaps.
193
  //
194
  // If you want to allow slight overlap in the division and the partitions,
195
  // just scale down the partitions before inserting them in the list.
196
  // Likewise, you can force at least some space between partitions.
197
  // This trick is how the horizontal partitions are done (since the page
198
  // skew could make it hard to find splits in the text).
199
  //
200
  // As a result, "0 distance" between closest partitions causes a gap.
201
  // This is not a programmatic assumption. It is intentional and simplifies
202
  // things.
203
  //
204
  // "max_merged" indicates both the minimum number of stacked partitions
205
  // to cause a cell (add 1 to it), and the maximum number of partitions that
206
  // a grid line can intersect. For example, if max_merged is 0, then lines
207
  // are inserted wherever space exists between partitions. If it is 2,
208
  // lines may intersect 2 partitions at most, but you also need at least
209
  // 2 partitions to generate a line.
210
  static void FindCellSplitLocations(const std::vector<int> &min_list,
211
                                     const std::vector<int> &max_list, int max_merged,
212
                                     std::vector<int> *locations);
213
214
  ////////
215
  //////// Utility function for table queries
216
  ////////
217
218
  // Counts the number of ColPartitions that intersect vertical cell
219
  // division at this x value. Used by VerifyLinedTable.
220
  int CountVerticalIntersections(int x);
221
  int CountHorizontalIntersections(int y);
222
223
  // Counts how many text partitions are in this box.
224
  int CountPartitions(const TBOX &box);
225
226
  ////////
227
  //////// Data members.
228
  ////////
229
230
  // Input data, used as read only data to make decisions.
231
  ColPartitionGrid *text_grid_; // Text ColPartitions
232
  ColPartitionGrid *line_grid_; // Line ColPartitions
233
  // Table structure.
234
  // bounding box is a convenient external representation.
235
  // cell_x_ and cell_y_ indicate the grid lines.
236
  TBOX bounding_box_;         // Bounding box
237
  std::vector<int> cell_x_; // Locations of vertical divisions (sorted)
238
  std::vector<int> cell_y_; // Locations of horizontal divisions (sorted)
239
  bool is_lined_;             // Is the table backed up by a line structure
240
  // Table margins, set via CalculateMargins
241
  int space_above_;
242
  int space_below_;
243
  int space_left_;
244
  int space_right_;
245
  int median_cell_height_;
246
  int median_cell_width_;
247
  // Filters, used to prevent awkward partitions from destroying structure.
248
  int max_text_height_;
249
};
250
251
class TESS_API TableRecognizer {
252
public:
253
0
  TableRecognizer() = default;
254
  ~TableRecognizer() = default;
255
256
  // Initialization code. Must be called after the constructor.
257
  void Init();
258
259
  ////////
260
  //////// Pre-recognize methods to initial table constraints.
261
  ////////
262
263
  // Sets the grids used by the table. These can be changed between
264
  // calls to Recognize. They are treated as read-only data.
265
  void set_text_grid(ColPartitionGrid *text);
266
  void set_line_grid(ColPartitionGrid *lines);
267
  // Sets some additional constraints on the table.
268
  void set_min_height(int height);
269
  void set_min_width(int width);
270
  // Filters text partitions that are ridiculously tall to prevent
271
  // merging rows. Note that "filters" refers to allowing horizontal
272
  // cells to slice through them on the premise that they were
273
  // merged text rows during previous layout.
274
  void set_max_text_height(int height);
275
276
  // Given a guess location, the RecognizeTable function will try to find a
277
  // structured grid in the area. On success, it will return a new
278
  // StructuredTable (and assumes you will delete it). Otherwise,
279
  // nullptr is returned.
280
  //
281
  // Keep in mind, this may "overgrow" or "undergrow" the size of guess.
282
  // Ideally, there is either a one-to-one correspondence between
283
  // the guess and table or no table at all. This is not the best of
284
  // assumptions right now, but was made to try to keep things simple in
285
  // the first pass.
286
  //
287
  // If a line structure is available on the page in the given region,
288
  // the table will use the linear structure as it is.
289
  // Otherwise, it will try to maximize the whitespace around it while keeping
290
  // a grid structure. This is somewhat working.
291
  //
292
  // Since the combination of adjustments can get high, effort was
293
  // originally made to keep the number of adjustments linear in the number
294
  // of partitions. The underlying structure finding code used to be
295
  // much more complex. I don't know how necessary this constraint is anymore.
296
  // The evaluation of a possible table is kept within O(nlogn) in the size of
297
  // the table (where size is the number of partitions in the table).
298
  // As a result, the algorithm is capable of O(n^2 log n). Depending
299
  // on the grid search size, it may be higher.
300
  //
301
  // Last note: it is possible to just try all partition boundaries at a high
302
  // level O(n^4) and do a verification scheme (at least O(nlogn)). If there
303
  // area 200 partitions on a page, this could be too costly. Effort could go
304
  // into pruning the search, but I opted for something quicker. I'm confident
305
  // that the independent adjustments can get similar results and keep the
306
  // complextiy down. However, the other approach could work without using
307
  // TableFinder at all if it is fast enough.  It comes down to properly
308
  // deciding what is a table. The code currently relies on TableFinder's
309
  // guess to the location of a table for that.
310
  StructuredTable *RecognizeTable(const TBOX &guess_box);
311
312
protected:
313
  ////////
314
  //////// Lined tables
315
  ////////
316
317
  // Returns true if the given box has a lined table within it. The
318
  // table argument will be updated with the table if the table exists.
319
  bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table);
320
  // Returns true if the given box has a large number of horizontal and
321
  // vertical lines present. If so, we assume the extent of these lines
322
  // uniquely defines a table and find that table via SolveLinedTable.
323
  bool HasSignificantLines(const TBOX &guess);
324
325
  // Given enough horizontal and vertical lines in a region, find a bounding
326
  // box that encloses all of them (as well as newly introduced lines).
327
  // The bounding box is the smallest box that encloses the lines in guess
328
  // without having any lines sticking out of it.
329
  // bounding_box is an in/out parameter.
330
  // On input, it in the extents of the box to search.
331
  // On output, it is the resulting bounding box.
332
  bool FindLinesBoundingBox(TBOX *bounding_box);
333
  // Iteration in above search.
334
  // bounding_box is an in/out parameter.
335
  // On input, it in the extents of the box to search.
336
  // On output, it is the resulting bounding box.
337
  bool FindLinesBoundingBoxIteration(TBOX *bounding_box);
338
339
  ////////
340
  //////// Generic "whitespaced" tables
341
  ////////
342
343
  // Returns true if the given box has a whitespaced table within it. The
344
  // table argument will be updated if the table exists. Also note
345
  // that this method will fail if the guess_box center is not
346
  // mostly within the table.
347
  bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table);
348
349
  // Finds the location of a horizontal split relative to y.
350
  // This function is mostly unused now. If the SolveWhitespacedTable
351
  // changes much, it can be removed. Note, it isn't really as reliable
352
  // as I thought. I went with alternatives for most of the other uses.
353
  int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom);
354
355
  // Input data, used as read only data to make decisions.
356
  ColPartitionGrid *text_grid_ = nullptr; // Text ColPartitions
357
  ColPartitionGrid *line_grid_ = nullptr; // Line ColPartitions
358
  // Table constraints, a "good" table must satisfy these.
359
  int min_height_ = 0;
360
  int min_width_ = 0;
361
  // Filters, used to prevent awkward partitions from destroying structure.
362
  int max_text_height_ = INT32_MAX; // Horizontal lines may intersect taller text.
363
};
364
365
} // namespace tesseract
366
367
#endif /* TABLERECOG_H_ */