/src/tesseract/src/textord/tablerecog.h
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: tablerecog.h |
3 | | // Description: Functions to detect structure of tables. |
4 | | // Author: Nicholas Beato |
5 | | // |
6 | | // (C) Copyright 2010, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #ifndef TABLERECOG_H_ |
20 | | #define TABLERECOG_H_ |
21 | | |
22 | | #include "colpartitiongrid.h" |
23 | | |
24 | | namespace tesseract { |
25 | | |
26 | | // There are 2 classes in this file. They have 2 different purposes. |
27 | | // - StructuredTable contains the methods to find the structure given |
28 | | // a specific bounding box and grow that structure. |
29 | | // - TableRecognizer contains the methods to adjust the possible positions |
30 | | // of a table without worrying about structure. |
31 | | // |
32 | | // To use these classes, the assumption is that the TableFinder will |
33 | | // have a guess of the location of a table (or possibly over/undersegmented |
34 | | // tables). The TableRecognizer is responsible for finding the table boundaries |
35 | | // at a high level. The StructuredTable class is responsible for determining |
36 | | // the structure of the table and trying to maximize its bounds while retaining |
37 | | // the structure. |
38 | | // (The latter part is not implemented yet, but that was the goal). |
39 | | // |
40 | | // While on the boundary discussion, keep in mind that this is a first pass. |
41 | | // There should eventually be some things like internal structure checks, |
42 | | // and, more importantly, surrounding text flow checks. |
43 | | // |
44 | | |
45 | | // Usage: |
46 | | // The StructuredTable class contains methods to query a potential table. |
47 | | // It has functions to find structure, count rows, find ColPartitions that |
48 | | // intersect gridlines, etc. It is not meant to blindly find a table. It |
49 | | // is meant to start with a known table location and enhance it. |
50 | | // Usage: |
51 | | // ColPartitionGrid text_grid, line_grid; // init |
52 | | // TBOX table_box; // known location of table location |
53 | | // |
54 | | // StructuredTable table; |
55 | | // table.Init(); // construction code |
56 | | // table.set_text_grid(/* text */); // These 2 grids can be the same! |
57 | | // table.set_line_grid(/* lines */); |
58 | | // table.set_min_text_height(10); // Filter vertical and tall text. |
59 | | // // IMPORTANT! The table needs to be told where it is! |
60 | | // table.set_bounding_box(table_box); // Set initial table location. |
61 | | // if (table.FindWhitespacedStructure()) { |
62 | | // // process table |
63 | | // table.column_count(); // number of columns |
64 | | // table.row_count(); // number of rows |
65 | | // table.cells_count(); // number of cells |
66 | | // table.bounding_box(); // updated bounding box |
67 | | // // etc. |
68 | | // } |
69 | | // |
70 | | class TESS_API StructuredTable { |
71 | | public: |
72 | | StructuredTable(); |
73 | 0 | ~StructuredTable() = default; |
74 | | |
75 | | // Initialization code. Must be called after the constructor. |
76 | | void Init(); |
77 | | |
78 | | // Sets the grids used by the table. These can be changed between |
79 | | // calls to Recognize. They are treated as read-only data. |
80 | | void set_text_grid(ColPartitionGrid *text); |
81 | | void set_line_grid(ColPartitionGrid *lines); |
82 | | // Filters text partitions that are ridiculously tall to prevent |
83 | | // merging rows. |
84 | | void set_max_text_height(int height); |
85 | | |
86 | | // Basic accessors. Some are treated as attributes despite having indirect |
87 | | // representation. |
88 | | bool is_lined() const; |
89 | | unsigned row_count() const; |
90 | | unsigned column_count() const; |
91 | | unsigned cell_count() const; |
92 | | void set_bounding_box(const TBOX &box); |
93 | | const TBOX &bounding_box() const; |
94 | | int median_cell_height(); |
95 | | int median_cell_width(); |
96 | | int row_height(unsigned row) const; |
97 | | int column_width(unsigned column) const; |
98 | | int space_above() const; |
99 | | int space_below() const; |
100 | | |
101 | | // Given enough horizontal and vertical lines in a region, create this table |
102 | | // based on the structure given by the lines. Return true if it worked out. |
103 | | // Code assumes the lines exist. It is the caller's responsibility to check |
104 | | // for lines and find an appropriate bounding box. |
105 | | bool FindLinedStructure(); |
106 | | |
107 | | // The main subroutine for finding generic table structure. The function |
108 | | // finds the grid structure in the given box. Returns true if a good grid |
109 | | // exists, implying that "this" table is valid. |
110 | | bool FindWhitespacedStructure(); |
111 | | |
112 | | //////// |
113 | | //////// Functions to query table info. |
114 | | //////// |
115 | | |
116 | | // Returns true if inserting part into the table does not cause any |
117 | | // cell merges. |
118 | | bool DoesPartitionFit(const ColPartition &part) const; |
119 | | // Checks if a sub-table has multiple data cells filled. |
120 | | int CountFilledCells(); |
121 | | int CountFilledCellsInRow(int row); |
122 | | int CountFilledCellsInColumn(int column); |
123 | | int CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start, unsigned column_end); |
124 | | |
125 | | // Makes sure that at least one cell in a row has substantial area filled. |
126 | | // This can filter out large whitespace caused by growing tables too far |
127 | | // and page numbers. |
128 | | // (currently bugged for some reason). |
129 | | bool VerifyRowFilled(int row); |
130 | | // Finds the filled area in a cell. |
131 | | double CalculateCellFilledPercentage(unsigned row, unsigned column); |
132 | | |
133 | | // Debug display, draws the table in the given color. If the table is not |
134 | | // valid, the table and "best" grid lines are still drawn in the given color. |
135 | | void Display(ScrollView *window, ScrollView::Color color); |
136 | | |
137 | | protected: |
138 | | // Clear the structure information. |
139 | | void ClearStructure(); |
140 | | |
141 | | //////// |
142 | | //////// Lined tables |
143 | | //////// |
144 | | |
145 | | // Verifies the lines do not intersect partitions. This happens when |
146 | | // the lines are in column boundaries and extend the full page. As a result, |
147 | | // the grid lines go through column text. The condition is detectable. |
148 | | bool VerifyLinedTableCells(); |
149 | | |
150 | | //////// |
151 | | //////// Tables with whitespace |
152 | | //////// |
153 | | |
154 | | // This is the function to change if you want to filter resulting tables |
155 | | // better. Right now it just checks for a minimum cell count and such. |
156 | | // You could add things like maximum number of ColPartitions per cell or |
157 | | // similar. |
158 | | bool VerifyWhitespacedTable(); |
159 | | // Find the columns of a table using whitespace. |
160 | | void FindWhitespacedColumns(); |
161 | | // Find the rows of a table using whitespace. |
162 | | void FindWhitespacedRows(); |
163 | | |
164 | | //////// |
165 | | //////// Functions to provide information about the table. |
166 | | //////// |
167 | | |
168 | | // Calculates the whitespace around the table using the table boundary and |
169 | | // the supplied grids (set_text_grid and set_line_grid). |
170 | | void CalculateMargins(); |
171 | | // Update the table margins with the supplied grid. This is |
172 | | // only called by calculate margins to use multiple grid sources. |
173 | | void UpdateMargins(ColPartitionGrid *grid); |
174 | | int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const; |
175 | | int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const; |
176 | | // Calculates stats on the table, namely the median cell height and width. |
177 | | void CalculateStats(); |
178 | | |
179 | | //////// |
180 | | //////// Functions to try to "fix" some table errors. |
181 | | //////// |
182 | | |
183 | | // Given a whitespaced table, this looks for bordering lines that might |
184 | | // be page layout boxes around the table. It is necessary to get the margins |
185 | | // correct on the table. If the lines are not joined, the margins will be |
186 | | // the distance to the line, which is not right. |
187 | | void AbsorbNearbyLines(); |
188 | | |
189 | | // Nice utility function for finding partition gaps. You feed it a sorted |
190 | | // list of all of the mins/maxes of the partitions in the table, and it gives |
191 | | // you the gaps (middle). This works for both vertical and horizontal |
192 | | // gaps. |
193 | | // |
194 | | // If you want to allow slight overlap in the division and the partitions, |
195 | | // just scale down the partitions before inserting them in the list. |
196 | | // Likewise, you can force at least some space between partitions. |
197 | | // This trick is how the horizontal partitions are done (since the page |
198 | | // skew could make it hard to find splits in the text). |
199 | | // |
200 | | // As a result, "0 distance" between closest partitions causes a gap. |
201 | | // This is not a programmatic assumption. It is intentional and simplifies |
202 | | // things. |
203 | | // |
204 | | // "max_merged" indicates both the minimum number of stacked partitions |
205 | | // to cause a cell (add 1 to it), and the maximum number of partitions that |
206 | | // a grid line can intersect. For example, if max_merged is 0, then lines |
207 | | // are inserted wherever space exists between partitions. If it is 2, |
208 | | // lines may intersect 2 partitions at most, but you also need at least |
209 | | // 2 partitions to generate a line. |
210 | | static void FindCellSplitLocations(const std::vector<int> &min_list, |
211 | | const std::vector<int> &max_list, int max_merged, |
212 | | std::vector<int> *locations); |
213 | | |
214 | | //////// |
215 | | //////// Utility function for table queries |
216 | | //////// |
217 | | |
218 | | // Counts the number of ColPartitions that intersect vertical cell |
219 | | // division at this x value. Used by VerifyLinedTable. |
220 | | int CountVerticalIntersections(int x); |
221 | | int CountHorizontalIntersections(int y); |
222 | | |
223 | | // Counts how many text partitions are in this box. |
224 | | int CountPartitions(const TBOX &box); |
225 | | |
226 | | //////// |
227 | | //////// Data members. |
228 | | //////// |
229 | | |
230 | | // Input data, used as read only data to make decisions. |
231 | | ColPartitionGrid *text_grid_; // Text ColPartitions |
232 | | ColPartitionGrid *line_grid_; // Line ColPartitions |
233 | | // Table structure. |
234 | | // bounding box is a convenient external representation. |
235 | | // cell_x_ and cell_y_ indicate the grid lines. |
236 | | TBOX bounding_box_; // Bounding box |
237 | | std::vector<int> cell_x_; // Locations of vertical divisions (sorted) |
238 | | std::vector<int> cell_y_; // Locations of horizontal divisions (sorted) |
239 | | bool is_lined_; // Is the table backed up by a line structure |
240 | | // Table margins, set via CalculateMargins |
241 | | int space_above_; |
242 | | int space_below_; |
243 | | int space_left_; |
244 | | int space_right_; |
245 | | int median_cell_height_; |
246 | | int median_cell_width_; |
247 | | // Filters, used to prevent awkward partitions from destroying structure. |
248 | | int max_text_height_; |
249 | | }; |
250 | | |
251 | | class TESS_API TableRecognizer { |
252 | | public: |
253 | 0 | TableRecognizer() = default; |
254 | | ~TableRecognizer() = default; |
255 | | |
256 | | // Initialization code. Must be called after the constructor. |
257 | | void Init(); |
258 | | |
259 | | //////// |
260 | | //////// Pre-recognize methods to initial table constraints. |
261 | | //////// |
262 | | |
263 | | // Sets the grids used by the table. These can be changed between |
264 | | // calls to Recognize. They are treated as read-only data. |
265 | | void set_text_grid(ColPartitionGrid *text); |
266 | | void set_line_grid(ColPartitionGrid *lines); |
267 | | // Sets some additional constraints on the table. |
268 | | void set_min_height(int height); |
269 | | void set_min_width(int width); |
270 | | // Filters text partitions that are ridiculously tall to prevent |
271 | | // merging rows. Note that "filters" refers to allowing horizontal |
272 | | // cells to slice through them on the premise that they were |
273 | | // merged text rows during previous layout. |
274 | | void set_max_text_height(int height); |
275 | | |
276 | | // Given a guess location, the RecognizeTable function will try to find a |
277 | | // structured grid in the area. On success, it will return a new |
278 | | // StructuredTable (and assumes you will delete it). Otherwise, |
279 | | // nullptr is returned. |
280 | | // |
281 | | // Keep in mind, this may "overgrow" or "undergrow" the size of guess. |
282 | | // Ideally, there is either a one-to-one correspondence between |
283 | | // the guess and table or no table at all. This is not the best of |
284 | | // assumptions right now, but was made to try to keep things simple in |
285 | | // the first pass. |
286 | | // |
287 | | // If a line structure is available on the page in the given region, |
288 | | // the table will use the linear structure as it is. |
289 | | // Otherwise, it will try to maximize the whitespace around it while keeping |
290 | | // a grid structure. This is somewhat working. |
291 | | // |
292 | | // Since the combination of adjustments can get high, effort was |
293 | | // originally made to keep the number of adjustments linear in the number |
294 | | // of partitions. The underlying structure finding code used to be |
295 | | // much more complex. I don't know how necessary this constraint is anymore. |
296 | | // The evaluation of a possible table is kept within O(nlogn) in the size of |
297 | | // the table (where size is the number of partitions in the table). |
298 | | // As a result, the algorithm is capable of O(n^2 log n). Depending |
299 | | // on the grid search size, it may be higher. |
300 | | // |
301 | | // Last note: it is possible to just try all partition boundaries at a high |
302 | | // level O(n^4) and do a verification scheme (at least O(nlogn)). If there |
303 | | // area 200 partitions on a page, this could be too costly. Effort could go |
304 | | // into pruning the search, but I opted for something quicker. I'm confident |
305 | | // that the independent adjustments can get similar results and keep the |
306 | | // complextiy down. However, the other approach could work without using |
307 | | // TableFinder at all if it is fast enough. It comes down to properly |
308 | | // deciding what is a table. The code currently relies on TableFinder's |
309 | | // guess to the location of a table for that. |
310 | | StructuredTable *RecognizeTable(const TBOX &guess_box); |
311 | | |
312 | | protected: |
313 | | //////// |
314 | | //////// Lined tables |
315 | | //////// |
316 | | |
317 | | // Returns true if the given box has a lined table within it. The |
318 | | // table argument will be updated with the table if the table exists. |
319 | | bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table); |
320 | | // Returns true if the given box has a large number of horizontal and |
321 | | // vertical lines present. If so, we assume the extent of these lines |
322 | | // uniquely defines a table and find that table via SolveLinedTable. |
323 | | bool HasSignificantLines(const TBOX &guess); |
324 | | |
325 | | // Given enough horizontal and vertical lines in a region, find a bounding |
326 | | // box that encloses all of them (as well as newly introduced lines). |
327 | | // The bounding box is the smallest box that encloses the lines in guess |
328 | | // without having any lines sticking out of it. |
329 | | // bounding_box is an in/out parameter. |
330 | | // On input, it in the extents of the box to search. |
331 | | // On output, it is the resulting bounding box. |
332 | | bool FindLinesBoundingBox(TBOX *bounding_box); |
333 | | // Iteration in above search. |
334 | | // bounding_box is an in/out parameter. |
335 | | // On input, it in the extents of the box to search. |
336 | | // On output, it is the resulting bounding box. |
337 | | bool FindLinesBoundingBoxIteration(TBOX *bounding_box); |
338 | | |
339 | | //////// |
340 | | //////// Generic "whitespaced" tables |
341 | | //////// |
342 | | |
343 | | // Returns true if the given box has a whitespaced table within it. The |
344 | | // table argument will be updated if the table exists. Also note |
345 | | // that this method will fail if the guess_box center is not |
346 | | // mostly within the table. |
347 | | bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table); |
348 | | |
349 | | // Finds the location of a horizontal split relative to y. |
350 | | // This function is mostly unused now. If the SolveWhitespacedTable |
351 | | // changes much, it can be removed. Note, it isn't really as reliable |
352 | | // as I thought. I went with alternatives for most of the other uses. |
353 | | int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom); |
354 | | |
355 | | // Input data, used as read only data to make decisions. |
356 | | ColPartitionGrid *text_grid_ = nullptr; // Text ColPartitions |
357 | | ColPartitionGrid *line_grid_ = nullptr; // Line ColPartitions |
358 | | // Table constraints, a "good" table must satisfy these. |
359 | | int min_height_ = 0; |
360 | | int min_width_ = 0; |
361 | | // Filters, used to prevent awkward partitions from destroying structure. |
362 | | int max_text_height_ = INT32_MAX; // Horizontal lines may intersect taller text. |
363 | | }; |
364 | | |
365 | | } // namespace tesseract |
366 | | |
367 | | #endif /* TABLERECOG_H_ */ |