/src/tesseract/src/textord/tablerecog.cpp
Line | Count | Source |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: tablerecog.cpp |
3 | | // Description: Helper class to help structure table areas. Given an bounding |
4 | | // box from TableFinder, the TableRecognizer should give a |
5 | | // StructuredTable (maybe a list in the future) of "good" tables |
6 | | // in that area. |
7 | | // Author: Nicholas Beato |
8 | | // |
9 | | // (C) Copyright 2009, Google Inc. |
10 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
11 | | // you may not use this file except in compliance with the License. |
12 | | // You may obtain a copy of the License at |
13 | | // http://www.apache.org/licenses/LICENSE-2.0 |
14 | | // Unless required by applicable law or agreed to in writing, software |
15 | | // distributed under the License is distributed on an "AS IS" BASIS, |
16 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
17 | | // See the License for the specific language governing permissions and |
18 | | // limitations under the License. |
19 | | // |
20 | | /////////////////////////////////////////////////////////////////////// |
21 | | |
22 | | #ifdef HAVE_CONFIG_H |
23 | | # include "config_auto.h" |
24 | | #endif |
25 | | |
26 | | #include "tablerecog.h" |
27 | | |
28 | | #include <algorithm> |
29 | | |
30 | | namespace tesseract { |
31 | | |
32 | | // The amount of space required between the ColPartitions in 2 columns |
33 | | // of a non-lined table as a multiple of the median width. |
34 | | const double kHorizontalSpacing = 0.30; |
35 | | // The amount of space required between the ColPartitions in 2 rows |
36 | | // of a non-lined table as multiples of the median height. |
37 | | const double kVerticalSpacing = -0.2; |
38 | | // The number of cells that the grid lines may intersect. |
39 | | // See FindCellSplitLocations for explanation. |
40 | | const int kCellSplitRowThreshold = 0; |
41 | | const int kCellSplitColumnThreshold = 0; |
42 | | // For "lined tables", the number of required lines. Currently a guess. |
43 | | const int kLinedTableMinVerticalLines = 3; |
44 | | const int kLinedTableMinHorizontalLines = 3; |
45 | | // Number of columns required, as a fraction of the most columns found. |
46 | | // None of these are tweaked at all. |
47 | | const double kRequiredColumns = 0.7; |
48 | | // The tolerance for comparing margins of potential tables. |
49 | | const double kMarginFactor = 1.1; |
50 | | // The first and last row should be consistent cell height. |
51 | | // This factor is the first and last row cell height max. |
52 | | const double kMaxRowSize = 2.5; |
53 | | // Number of filled columns required to form a strong table row. |
54 | | // For small tables, this is an absolute number. |
55 | | const double kGoodRowNumberOfColumnsSmall[] = {2, 2, 2, 2, 2, 3, 3}; |
56 | | // For large tables, it is a relative number |
57 | | const double kGoodRowNumberOfColumnsLarge = 0.7; |
58 | | // The amount of area that must be covered in a cell by ColPartitions to |
59 | | // be considered "filled" |
60 | | const double kMinFilledArea = 0.35; |
61 | | |
62 | | // Indicates that a table row is weak. This means that it has |
63 | | // many missing data cells or very large cell heights compared. |
64 | | // to the rest of the table. |
65 | | // Code is buggy right now. It is disabled in the calling function. |
66 | | // It seems like sometimes the row that is passed in is not correct |
67 | | // sometimes (like a phantom row is introduced). There's something going |
68 | | // on in the cell_y_ data member before this is called... not certain. |
69 | 0 | static bool IsWeakTableRow(StructuredTable *table, int row) { |
70 | 0 | if (!table->VerifyRowFilled(row)) { |
71 | 0 | return false; |
72 | 0 | } |
73 | 0 |
|
74 | 0 | double threshold; |
75 | 0 | if (table->column_count() < countof(kGoodRowNumberOfColumnsSmall)) { |
76 | 0 | threshold = kGoodRowNumberOfColumnsSmall[table->column_count()]; |
77 | 0 | } else { |
78 | 0 | threshold = table->column_count() * kGoodRowNumberOfColumnsLarge; |
79 | 0 | } |
80 | 0 |
|
81 | 0 | return table->CountFilledCellsInRow(row) < threshold; |
82 | 0 | } |
83 | | |
84 | | //////// |
85 | | //////// StructuredTable Class |
86 | | //////// |
87 | | |
88 | | StructuredTable::StructuredTable() |
89 | 0 | : text_grid_(nullptr) |
90 | 0 | , line_grid_(nullptr) |
91 | 0 | , is_lined_(false) |
92 | 0 | , space_above_(0) |
93 | 0 | , space_below_(0) |
94 | 0 | , space_left_(0) |
95 | 0 | , space_right_(0) |
96 | 0 | , median_cell_height_(0) |
97 | 0 | , median_cell_width_(0) |
98 | 0 | , max_text_height_(INT32_MAX) {} |
99 | | |
100 | 0 | void StructuredTable::Init() {} |
101 | | |
102 | 0 | void StructuredTable::set_text_grid(ColPartitionGrid *text_grid) { |
103 | 0 | text_grid_ = text_grid; |
104 | 0 | } |
105 | 0 | void StructuredTable::set_line_grid(ColPartitionGrid *line_grid) { |
106 | 0 | line_grid_ = line_grid; |
107 | 0 | } |
108 | 0 | void StructuredTable::set_max_text_height(int height) { |
109 | 0 | max_text_height_ = height; |
110 | 0 | } |
111 | 0 | bool StructuredTable::is_lined() const { |
112 | 0 | return is_lined_; |
113 | 0 | } |
114 | 0 | unsigned StructuredTable::row_count() const { |
115 | 0 | return cell_y_.empty() ? 0 : cell_y_.size() - 1; |
116 | 0 | } |
117 | 0 | unsigned StructuredTable::column_count() const { |
118 | 0 | return cell_x_.empty() ? 0 : cell_x_.size() - 1; |
119 | 0 | } |
120 | 0 | unsigned StructuredTable::cell_count() const { |
121 | 0 | return row_count() * column_count(); |
122 | 0 | } |
123 | 0 | void StructuredTable::set_bounding_box(const TBOX &box) { |
124 | 0 | bounding_box_ = box; |
125 | 0 | } |
126 | 0 | const TBOX &StructuredTable::bounding_box() const { |
127 | 0 | return bounding_box_; |
128 | 0 | } |
129 | 0 | int StructuredTable::median_cell_height() { |
130 | 0 | return median_cell_height_; |
131 | 0 | } |
132 | 0 | int StructuredTable::median_cell_width() { |
133 | 0 | return median_cell_width_; |
134 | 0 | } |
135 | 0 | int StructuredTable::row_height(unsigned row) const { |
136 | 0 | ASSERT_HOST(row < row_count()); |
137 | 0 | return cell_y_[row + 1] - cell_y_[row]; |
138 | 0 | } |
139 | 0 | int StructuredTable::column_width(unsigned column) const { |
140 | 0 | ASSERT_HOST(column < column_count()); |
141 | 0 | return cell_x_[column + 1] - cell_x_[column]; |
142 | 0 | } |
143 | 0 | int StructuredTable::space_above() const { |
144 | 0 | return space_above_; |
145 | 0 | } |
146 | 0 | int StructuredTable::space_below() const { |
147 | 0 | return space_below_; |
148 | 0 | } |
149 | | |
150 | | // At this point, we know that the lines are contained |
151 | | // by the box (by FindLinesBoundingBox). |
152 | | // So try to find the cell structure and make sure it works out. |
153 | | // The assumption is that all lines span the table. If this |
154 | | // assumption fails, the VerifyLinedTable method will |
155 | | // abort the lined table. The TableRecognizer will fall |
156 | | // back on FindWhitespacedStructure. |
157 | 0 | bool StructuredTable::FindLinedStructure() { |
158 | 0 | ClearStructure(); |
159 | | |
160 | | // Search for all of the lines in the current box. |
161 | | // Update the cellular structure with the exact lines. |
162 | 0 | ColPartitionGridSearch box_search(line_grid_); |
163 | 0 | box_search.SetUniqueMode(true); |
164 | 0 | box_search.StartRectSearch(bounding_box_); |
165 | 0 | ColPartition *line = nullptr; |
166 | |
|
167 | 0 | while ((line = box_search.NextRectSearch()) != nullptr) { |
168 | 0 | if (line->IsHorizontalLine()) { |
169 | 0 | cell_y_.push_back(line->MidY()); |
170 | 0 | } |
171 | 0 | if (line->IsVerticalLine()) { |
172 | 0 | cell_x_.push_back(line->MidX()); |
173 | 0 | } |
174 | 0 | } |
175 | | |
176 | | // HasSignificantLines should guarantee cells. |
177 | | // Because that code is a different class, just gracefully |
178 | | // return false. This could be an assert. |
179 | 0 | if (cell_x_.size() < 3 || cell_y_.size() < 3) { |
180 | 0 | return false; |
181 | 0 | } |
182 | | |
183 | | // Sort and remove duplicates that may have occurred due to split lines. |
184 | 0 | std::sort(cell_x_.begin(), cell_x_.end()); |
185 | 0 | auto last_x = std::unique(cell_x_.begin(), cell_x_.end()); |
186 | 0 | cell_x_.erase(last_x, cell_x_.end()); |
187 | 0 | std::sort(cell_y_.begin(), cell_y_.end()); |
188 | 0 | auto last_y = std::unique(cell_y_.begin(), cell_y_.end()); |
189 | 0 | cell_y_.erase(last_y, cell_y_.end()); |
190 | | |
191 | | // The border should be the extents of line boxes, not middle. |
192 | 0 | cell_x_[0] = bounding_box_.left(); |
193 | 0 | cell_x_[cell_x_.size() - 1] = bounding_box_.right(); |
194 | 0 | cell_y_[0] = bounding_box_.bottom(); |
195 | 0 | cell_y_[cell_y_.size() - 1] = bounding_box_.top(); |
196 | | |
197 | | // Remove duplicates that may have occurred due to moving the borders. |
198 | 0 | last_x = std::unique(cell_x_.begin(), cell_x_.end()); |
199 | 0 | cell_x_.erase(last_x, cell_x_.end()); |
200 | 0 | last_y = std::unique(cell_y_.begin(), cell_y_.end()); |
201 | 0 | cell_y_.erase(last_y, cell_y_.end()); |
202 | |
|
203 | 0 | CalculateMargins(); |
204 | 0 | CalculateStats(); |
205 | 0 | is_lined_ = VerifyLinedTableCells(); |
206 | 0 | return is_lined_; |
207 | 0 | } |
208 | | |
209 | | // Finds the cellular structure given a particular box. |
210 | 0 | bool StructuredTable::FindWhitespacedStructure() { |
211 | 0 | ClearStructure(); |
212 | 0 | FindWhitespacedColumns(); |
213 | 0 | FindWhitespacedRows(); |
214 | |
|
215 | 0 | if (!VerifyWhitespacedTable()) { |
216 | 0 | return false; |
217 | 0 | } else { |
218 | 0 | bounding_box_.set_left(cell_x_[0]); |
219 | 0 | bounding_box_.set_right(cell_x_[cell_x_.size() - 1]); |
220 | 0 | bounding_box_.set_bottom(cell_y_[0]); |
221 | 0 | bounding_box_.set_top(cell_y_[cell_y_.size() - 1]); |
222 | 0 | AbsorbNearbyLines(); |
223 | 0 | CalculateMargins(); |
224 | 0 | CalculateStats(); |
225 | 0 | return true; |
226 | 0 | } |
227 | 0 | } |
228 | | |
229 | | // Tests if a partition fits inside the table structure. |
230 | | // Partitions must fully span a grid line in order to intersect it. |
231 | | // This means that a partition does not intersect a line |
232 | | // that it "just" touches. This is mainly because the assumption |
233 | | // throughout the code is that "0" distance is a very very small space. |
234 | 0 | bool StructuredTable::DoesPartitionFit(const ColPartition &part) const { |
235 | 0 | const TBOX &box = part.bounding_box(); |
236 | 0 | for (int i : cell_x_) { |
237 | 0 | if (box.left() < i && i < box.right()) { |
238 | 0 | return false; |
239 | 0 | } |
240 | 0 | } |
241 | 0 | for (int i : cell_y_) { |
242 | 0 | if (box.bottom() < i && i < box.top()) { |
243 | 0 | return false; |
244 | 0 | } |
245 | 0 | } |
246 | 0 | return true; |
247 | 0 | } |
248 | | |
249 | | // Checks if a sub-table has multiple data cells filled. |
250 | 0 | int StructuredTable::CountFilledCells() { |
251 | 0 | return CountFilledCells(0, row_count() - 1, 0, column_count() - 1); |
252 | 0 | } |
253 | 0 | int StructuredTable::CountFilledCellsInRow(int row) { |
254 | 0 | return CountFilledCells(row, row, 0, column_count() - 1); |
255 | 0 | } |
256 | 0 | int StructuredTable::CountFilledCellsInColumn(int column) { |
257 | 0 | return CountFilledCells(0, row_count() - 1, column, column); |
258 | 0 | } |
259 | | int StructuredTable::CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start, |
260 | 0 | unsigned column_end) { |
261 | 0 | ASSERT_HOST(row_start <= row_end && row_end < row_count()); |
262 | 0 | ASSERT_HOST(column_start <= column_end && column_end < column_count()); |
263 | 0 | int cell_count = 0; |
264 | 0 | TBOX cell_box; |
265 | 0 | for (unsigned row = row_start; row <= row_end; ++row) { |
266 | 0 | cell_box.set_bottom(cell_y_[row]); |
267 | 0 | cell_box.set_top(cell_y_[row + 1]); |
268 | 0 | for (unsigned col = column_start; col <= column_end; ++col) { |
269 | 0 | cell_box.set_left(cell_x_[col]); |
270 | 0 | cell_box.set_right(cell_x_[col + 1]); |
271 | 0 | if (CountPartitions(cell_box) > 0) { |
272 | 0 | ++cell_count; |
273 | 0 | } |
274 | 0 | } |
275 | 0 | } |
276 | 0 | return cell_count; |
277 | 0 | } |
278 | | |
279 | | // Makes sure that at least one cell in a row has substantial area filled. |
280 | | // This can filter out large whitespace caused by growing tables too far |
281 | | // and page numbers. |
282 | 0 | bool StructuredTable::VerifyRowFilled(int row) { |
283 | 0 | for (unsigned i = 0; i < column_count(); ++i) { |
284 | 0 | auto area_filled = CalculateCellFilledPercentage(row, i); |
285 | 0 | if (area_filled >= kMinFilledArea) { |
286 | 0 | return true; |
287 | 0 | } |
288 | 0 | } |
289 | 0 | return false; |
290 | 0 | } |
291 | | |
292 | | // Finds the filled area in a cell. |
293 | | // Assume ColPartitions do not overlap for simplicity (even though they do). |
294 | 0 | double StructuredTable::CalculateCellFilledPercentage(unsigned row, unsigned column) { |
295 | 0 | ASSERT_HOST(row <= row_count()); |
296 | 0 | ASSERT_HOST(column <= column_count()); |
297 | 0 | const TBOX kCellBox(cell_x_[column], cell_y_[row], cell_x_[column + 1], cell_y_[row + 1]); |
298 | 0 | ASSERT_HOST(!kCellBox.null_box()); |
299 | |
|
300 | 0 | ColPartitionGridSearch gsearch(text_grid_); |
301 | 0 | gsearch.SetUniqueMode(true); |
302 | 0 | gsearch.StartRectSearch(kCellBox); |
303 | 0 | double area_covered = 0; |
304 | 0 | ColPartition *text = nullptr; |
305 | 0 | while ((text = gsearch.NextRectSearch()) != nullptr) { |
306 | 0 | if (text->IsTextType()) { |
307 | 0 | area_covered += text->bounding_box().intersection(kCellBox).area(); |
308 | 0 | } |
309 | 0 | } |
310 | 0 | const int32_t current_area = kCellBox.area(); |
311 | 0 | if (current_area == 0) { |
312 | 0 | return 1.0; |
313 | 0 | } |
314 | 0 | return std::min(1.0, area_covered / current_area); |
315 | 0 | } |
316 | | |
317 | | #ifndef GRAPHICS_DISABLED |
318 | | |
319 | | void StructuredTable::Display(ScrollView *window, ScrollView::Color color) { |
320 | | window->Brush(ScrollView::NONE); |
321 | | window->Pen(color); |
322 | | window->Rectangle(bounding_box_.left(), bounding_box_.bottom(), bounding_box_.right(), |
323 | | bounding_box_.top()); |
324 | | for (int i : cell_x_) { |
325 | | window->Line(i, bounding_box_.bottom(), i, bounding_box_.top()); |
326 | | } |
327 | | for (int i : cell_y_) { |
328 | | window->Line(bounding_box_.left(), i, bounding_box_.right(), i); |
329 | | } |
330 | | window->UpdateWindow(); |
331 | | } |
332 | | |
333 | | #endif |
334 | | |
335 | | // Clear structure information. |
336 | 0 | void StructuredTable::ClearStructure() { |
337 | 0 | cell_x_.clear(); |
338 | 0 | cell_y_.clear(); |
339 | 0 | is_lined_ = false; |
340 | 0 | space_above_ = 0; |
341 | 0 | space_below_ = 0; |
342 | 0 | space_left_ = 0; |
343 | 0 | space_right_ = 0; |
344 | 0 | median_cell_height_ = 0; |
345 | 0 | median_cell_width_ = 0; |
346 | 0 | } |
347 | | |
348 | | // When a table has lines, the lines should not intersect any partitions. |
349 | | // The following function makes sure the previous assumption is met. |
350 | 0 | bool StructuredTable::VerifyLinedTableCells() { |
351 | | // Function only called when lines exist. |
352 | 0 | ASSERT_HOST(cell_y_.size() >= 2 && cell_x_.size() >= 2); |
353 | 0 | for (int i : cell_y_) { |
354 | 0 | if (CountHorizontalIntersections(i) > 0) { |
355 | 0 | return false; |
356 | 0 | } |
357 | 0 | } |
358 | 0 | for (int i : cell_x_) { |
359 | 0 | if (CountVerticalIntersections(i) > 0) { |
360 | 0 | return false; |
361 | 0 | } |
362 | 0 | } |
363 | 0 | return true; |
364 | 0 | } |
365 | | |
366 | | // TODO(nbeato): Could be much better than this. |
367 | | // Examples: |
368 | | // - Calculate the percentage of filled cells. |
369 | | // - Calculate the average number of ColPartitions per cell. |
370 | | // - Calculate the number of cells per row with partitions. |
371 | | // - Check if ColPartitions in adjacent cells are similar. |
372 | | // - Check that all columns are at least a certain width. |
373 | | // - etc. |
374 | 0 | bool StructuredTable::VerifyWhitespacedTable() { |
375 | | // criteria for a table, must be at least 2x3 or 3x2 |
376 | 0 | return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6; |
377 | 0 | } |
378 | | |
379 | | // Finds vertical splits in the ColPartitions of text_grid_ by considering |
380 | | // all possible "good" guesses. A good guess is just the left/right sides of |
381 | | // the partitions, since these locations will uniquely define where the |
382 | | // extremal values where the splits can occur. The split happens |
383 | | // in the middle of the two nearest partitions. |
384 | 0 | void StructuredTable::FindWhitespacedColumns() { |
385 | | // Set of the extents of all partitions on the page. |
386 | 0 | std::vector<int> left_sides; |
387 | 0 | std::vector<int> right_sides; |
388 | | |
389 | | // Look at each text partition. We want to find the partitions |
390 | | // that have extremal left/right sides. These will give us a basis |
391 | | // for the table columns. |
392 | 0 | ColPartitionGridSearch gsearch(text_grid_); |
393 | 0 | gsearch.SetUniqueMode(true); |
394 | 0 | gsearch.StartRectSearch(bounding_box_); |
395 | 0 | ColPartition *text = nullptr; |
396 | 0 | while ((text = gsearch.NextRectSearch()) != nullptr) { |
397 | 0 | if (!text->IsTextType()) { |
398 | 0 | continue; |
399 | 0 | } |
400 | | |
401 | 0 | ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right()); |
402 | 0 | int spacing = static_cast<int>(text->median_width() * kHorizontalSpacing / 2.0 + 0.5); |
403 | 0 | left_sides.push_back(text->bounding_box().left() - spacing); |
404 | 0 | right_sides.push_back(text->bounding_box().right() + spacing); |
405 | 0 | } |
406 | | // It causes disaster below, so avoid it! |
407 | 0 | if (left_sides.empty() || right_sides.empty()) { |
408 | 0 | return; |
409 | 0 | } |
410 | | |
411 | | // Since data may be inserted in grid order, we sort the left/right sides. |
412 | 0 | std::sort(left_sides.begin(), left_sides.end()); |
413 | 0 | std::sort(right_sides.begin(), right_sides.end()); |
414 | | |
415 | | // At this point, in the "merged list", we expect to have a left side, |
416 | | // followed by either more left sides or a right side. The last number |
417 | | // should be a right side. We find places where the splits occur by looking |
418 | | // for "valleys". If we want to force gap sizes or allow overlap, change |
419 | | // the spacing above. If you want to let lines "slice" partitions as long |
420 | | // as it is infrequent, change the following function. |
421 | 0 | FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, &cell_x_); |
422 | 0 | } |
423 | | |
424 | | // Finds horizontal splits in the ColPartitions of text_grid_ by considering |
425 | | // all possible "good" guesses. A good guess is just the bottom/top sides of |
426 | | // the partitions, since these locations will uniquely define where the |
427 | | // extremal values where the splits can occur. The split happens |
428 | | // in the middle of the two nearest partitions. |
429 | 0 | void StructuredTable::FindWhitespacedRows() { |
430 | | // Set of the extents of all partitions on the page. |
431 | 0 | std::vector<int> bottom_sides; |
432 | 0 | std::vector<int> top_sides; |
433 | | // We will be "shrinking" partitions, so keep the min/max around to |
434 | | // make sure the bottom/top lines do not intersect text. |
435 | 0 | int min_bottom = INT32_MAX; |
436 | 0 | int max_top = INT32_MIN; |
437 | | |
438 | | // Look at each text partition. We want to find the partitions |
439 | | // that have extremal bottom/top sides. These will give us a basis |
440 | | // for the table rows. Because the textlines can be skewed and close due |
441 | | // to warping, the height of the partitions is toned down a little bit. |
442 | 0 | ColPartitionGridSearch gsearch(text_grid_); |
443 | 0 | gsearch.SetUniqueMode(true); |
444 | 0 | gsearch.StartRectSearch(bounding_box_); |
445 | 0 | ColPartition *text = nullptr; |
446 | 0 | while ((text = gsearch.NextRectSearch()) != nullptr) { |
447 | 0 | if (!text->IsTextType()) { |
448 | 0 | continue; |
449 | 0 | } |
450 | | |
451 | 0 | ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top()); |
452 | 0 | min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom())); |
453 | 0 | max_top = std::max(max_top, static_cast<int>(text->bounding_box().top())); |
454 | | |
455 | | // Ignore "tall" text partitions, as these are usually false positive |
456 | | // vertical text or multiple lines pulled together. |
457 | 0 | if (text->bounding_box().height() > max_text_height_) { |
458 | 0 | continue; |
459 | 0 | } |
460 | | |
461 | 0 | int spacing = static_cast<int>(text->bounding_box().height() * kVerticalSpacing / 2.0 + 0.5); |
462 | 0 | int bottom = text->bounding_box().bottom() - spacing; |
463 | 0 | int top = text->bounding_box().top() + spacing; |
464 | | // For horizontal text, the factor can be negative. This should |
465 | | // probably cause a warning or failure. I haven't actually checked if |
466 | | // it happens. |
467 | 0 | if (bottom >= top) { |
468 | 0 | continue; |
469 | 0 | } |
470 | | |
471 | 0 | bottom_sides.push_back(bottom); |
472 | 0 | top_sides.push_back(top); |
473 | 0 | } |
474 | | // It causes disaster below, so avoid it! |
475 | 0 | if (bottom_sides.empty() || top_sides.empty()) { |
476 | 0 | return; |
477 | 0 | } |
478 | | |
479 | | // Since data may be inserted in grid order, we sort the bottom/top sides. |
480 | 0 | std::sort(bottom_sides.begin(), bottom_sides.end()); |
481 | 0 | std::sort(top_sides.begin(), top_sides.end()); |
482 | | |
483 | | // At this point, in the "merged list", we expect to have a bottom side, |
484 | | // followed by either more bottom sides or a top side. The last number |
485 | | // should be a top side. We find places where the splits occur by looking |
486 | | // for "valleys". If we want to force gap sizes or allow overlap, change |
487 | | // the spacing above. If you want to let lines "slice" partitions as long |
488 | | // as it is infrequent, change the following function. |
489 | 0 | FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, &cell_y_); |
490 | | |
491 | | // Recover the min/max correctly since it was shifted. |
492 | 0 | cell_y_[0] = min_bottom; |
493 | 0 | cell_y_[cell_y_.size() - 1] = max_top; |
494 | 0 | } |
495 | | |
496 | 0 | void StructuredTable::CalculateMargins() { |
497 | 0 | space_above_ = INT32_MAX; |
498 | 0 | space_below_ = INT32_MAX; |
499 | 0 | space_right_ = INT32_MAX; |
500 | 0 | space_left_ = INT32_MAX; |
501 | 0 | UpdateMargins(text_grid_); |
502 | 0 | UpdateMargins(line_grid_); |
503 | 0 | } |
504 | | // Finds the nearest partition in grid to the table |
505 | | // boundaries and updates the margin. |
506 | 0 | void StructuredTable::UpdateMargins(ColPartitionGrid *grid) { |
507 | 0 | int below = FindVerticalMargin(grid, bounding_box_.bottom(), true); |
508 | 0 | space_below_ = std::min(space_below_, below); |
509 | 0 | int above = FindVerticalMargin(grid, bounding_box_.top(), false); |
510 | 0 | space_above_ = std::min(space_above_, above); |
511 | 0 | int left = FindHorizontalMargin(grid, bounding_box_.left(), true); |
512 | 0 | space_left_ = std::min(space_left_, left); |
513 | 0 | int right = FindHorizontalMargin(grid, bounding_box_.right(), false); |
514 | 0 | space_right_ = std::min(space_right_, right); |
515 | 0 | } |
516 | 0 | int StructuredTable::FindVerticalMargin(ColPartitionGrid *grid, int border, bool decrease) const { |
517 | 0 | ColPartitionGridSearch gsearch(grid); |
518 | 0 | gsearch.SetUniqueMode(true); |
519 | 0 | gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), border); |
520 | 0 | ColPartition *part = nullptr; |
521 | 0 | while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) { |
522 | 0 | if (!part->IsTextType() && !part->IsHorizontalLine()) { |
523 | 0 | continue; |
524 | 0 | } |
525 | 0 | int distance = |
526 | 0 | decrease ? border - part->bounding_box().top() : part->bounding_box().bottom() - border; |
527 | 0 | if (distance >= 0) { |
528 | 0 | return distance; |
529 | 0 | } |
530 | 0 | } |
531 | 0 | return INT32_MAX; |
532 | 0 | } |
533 | 0 | int StructuredTable::FindHorizontalMargin(ColPartitionGrid *grid, int border, bool decrease) const { |
534 | 0 | ColPartitionGridSearch gsearch(grid); |
535 | 0 | gsearch.SetUniqueMode(true); |
536 | 0 | gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top()); |
537 | 0 | ColPartition *part = nullptr; |
538 | 0 | while ((part = gsearch.NextSideSearch(decrease)) != nullptr) { |
539 | 0 | if (!part->IsTextType() && !part->IsVerticalLine()) { |
540 | 0 | continue; |
541 | 0 | } |
542 | 0 | int distance = |
543 | 0 | decrease ? border - part->bounding_box().right() : part->bounding_box().left() - border; |
544 | 0 | if (distance >= 0) { |
545 | 0 | return distance; |
546 | 0 | } |
547 | 0 | } |
548 | 0 | return INT32_MAX; |
549 | 0 | } |
550 | | |
551 | 0 | void StructuredTable::CalculateStats() { |
552 | 0 | const int kMaxCellHeight = 1000; |
553 | 0 | const int kMaxCellWidth = 1000; |
554 | 0 | STATS height_stats(0, kMaxCellHeight); |
555 | 0 | STATS width_stats(0, kMaxCellWidth); |
556 | |
|
557 | 0 | for (unsigned i = 0; i < row_count(); ++i) { |
558 | 0 | height_stats.add(row_height(i), column_count()); |
559 | 0 | } |
560 | 0 | for (unsigned i = 0; i < column_count(); ++i) { |
561 | 0 | width_stats.add(column_width(i), row_count()); |
562 | 0 | } |
563 | |
|
564 | 0 | median_cell_height_ = static_cast<int>(height_stats.median() + 0.5); |
565 | 0 | median_cell_width_ = static_cast<int>(width_stats.median() + 0.5); |
566 | 0 | } |
567 | | |
568 | | // Looks for grid lines near the current bounding box and |
569 | | // grows the bounding box to include them if no intersections |
570 | | // will occur as a result. This is necessary because the margins |
571 | | // are calculated relative to the closest line/text. If the |
572 | | // line isn't absorbed, the margin will be the distance to the line. |
573 | 0 | void StructuredTable::AbsorbNearbyLines() { |
574 | 0 | ColPartitionGridSearch gsearch(line_grid_); |
575 | 0 | gsearch.SetUniqueMode(true); |
576 | | |
577 | | // Is the closest line above good? Loop multiple times for tables with |
578 | | // multi-line (sometimes 2) borders. Limit the number of lines by |
579 | | // making sure they stay within a table cell or so. |
580 | 0 | ColPartition *line = nullptr; |
581 | 0 | gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.top()); |
582 | 0 | while ((line = gsearch.NextVerticalSearch(false)) != nullptr) { |
583 | 0 | if (!line->IsHorizontalLine()) { |
584 | 0 | break; |
585 | 0 | } |
586 | 0 | TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1, bounding_box_.right(), |
587 | 0 | line->MidY()); |
588 | 0 | if (text_search.height() > median_cell_height_ * 2) { |
589 | 0 | break; |
590 | 0 | } |
591 | 0 | if (CountPartitions(text_search) > 0) { |
592 | 0 | break; |
593 | 0 | } |
594 | 0 | bounding_box_.set_top(line->MidY()); |
595 | 0 | } |
596 | | // As above, is the closest line below good? |
597 | 0 | line = nullptr; |
598 | 0 | gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), bounding_box_.bottom()); |
599 | 0 | while ((line = gsearch.NextVerticalSearch(true)) != nullptr) { |
600 | 0 | if (!line->IsHorizontalLine()) { |
601 | 0 | break; |
602 | 0 | } |
603 | 0 | TBOX text_search(bounding_box_.left(), line->MidY(), bounding_box_.right(), |
604 | 0 | bounding_box_.bottom() - 1); |
605 | 0 | if (text_search.height() > median_cell_height_ * 2) { |
606 | 0 | break; |
607 | 0 | } |
608 | 0 | if (CountPartitions(text_search) > 0) { |
609 | 0 | break; |
610 | 0 | } |
611 | 0 | bounding_box_.set_bottom(line->MidY()); |
612 | 0 | } |
613 | | // TODO(nbeato): vertical lines |
614 | 0 | } |
615 | | |
616 | | // This function will find all "0 valleys" (of any length) given two |
617 | | // arrays. The arrays are the mins and maxes of partitions (either |
618 | | // left and right or bottom and top). Since the min/max lists are generated |
619 | | // with pairs of increasing integers, we can make some assumptions in |
620 | | // the function about ordering of the overall list, which are shown in the |
621 | | // asserts. |
622 | | // The algorithm works as follows: |
623 | | // While there are numbers to process, take the smallest number. |
624 | | // If it is from the min_list, increment the "hill" counter. |
625 | | // Otherwise, decrement the "hill" counter. |
626 | | // In the process of doing this, keep track of "crossing" the |
627 | | // desired height. |
628 | | // The first/last items are extremal values of the list and known. |
629 | | // NOTE: This function assumes the lists are sorted! |
630 | | void StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list, |
631 | | const std::vector<int> &max_list, int max_merged, |
632 | 0 | std::vector<int> *locations) { |
633 | 0 | locations->clear(); |
634 | 0 | ASSERT_HOST(min_list.size() == max_list.size()); |
635 | 0 | if (min_list.empty()) { |
636 | 0 | return; |
637 | 0 | } |
638 | 0 | ASSERT_HOST(min_list.at(0) < max_list.at(0)); |
639 | 0 | ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1)); |
640 | |
|
641 | 0 | locations->push_back(min_list.at(0)); |
642 | 0 | unsigned min_index = 0; |
643 | 0 | unsigned max_index = 0; |
644 | 0 | int stacked_partitions = 0; |
645 | 0 | int last_cross_position = INT32_MAX; |
646 | | // max_index will expire after min_index. |
647 | | // However, we can't "increase" the hill size if min_index expired. |
648 | | // So finish processing when min_index expires. |
649 | 0 | while (min_index < min_list.size()) { |
650 | | // Increase the hill count. |
651 | 0 | if (min_list[min_index] < max_list[max_index]) { |
652 | 0 | ++stacked_partitions; |
653 | 0 | if (last_cross_position != INT32_MAX && stacked_partitions > max_merged) { |
654 | 0 | int mid = (last_cross_position + min_list[min_index]) / 2; |
655 | 0 | locations->push_back(mid); |
656 | 0 | last_cross_position = INT32_MAX; |
657 | 0 | } |
658 | 0 | ++min_index; |
659 | 0 | } else { |
660 | | // Decrease the hill count. |
661 | 0 | --stacked_partitions; |
662 | 0 | if (last_cross_position == INT32_MAX && stacked_partitions <= max_merged) { |
663 | 0 | last_cross_position = max_list[max_index]; |
664 | 0 | } |
665 | 0 | ++max_index; |
666 | 0 | } |
667 | 0 | } |
668 | 0 | locations->push_back(max_list.at(max_list.size() - 1)); |
669 | 0 | } |
670 | | |
671 | | // Counts the number of partitions in the table |
672 | | // box that intersection the given x value. |
673 | 0 | int StructuredTable::CountVerticalIntersections(int x) { |
674 | 0 | int count = 0; |
675 | | // Make a small box to keep the search time down. |
676 | 0 | const int kGridSize = text_grid_->gridsize(); |
677 | 0 | TBOX vertical_box = bounding_box_; |
678 | 0 | vertical_box.set_left(x - kGridSize); |
679 | 0 | vertical_box.set_right(x + kGridSize); |
680 | |
|
681 | 0 | ColPartitionGridSearch gsearch(text_grid_); |
682 | 0 | gsearch.SetUniqueMode(true); |
683 | 0 | gsearch.StartRectSearch(vertical_box); |
684 | 0 | ColPartition *text = nullptr; |
685 | 0 | while ((text = gsearch.NextRectSearch()) != nullptr) { |
686 | 0 | if (!text->IsTextType()) { |
687 | 0 | continue; |
688 | 0 | } |
689 | 0 | const TBOX &box = text->bounding_box(); |
690 | 0 | if (box.left() < x && x < box.right()) { |
691 | 0 | ++count; |
692 | 0 | } |
693 | 0 | } |
694 | 0 | return count; |
695 | 0 | } |
696 | | |
697 | | // Counts the number of partitions in the table |
698 | | // box that intersection the given y value. |
699 | 0 | int StructuredTable::CountHorizontalIntersections(int y) { |
700 | 0 | int count = 0; |
701 | | // Make a small box to keep the search time down. |
702 | 0 | const int kGridSize = text_grid_->gridsize(); |
703 | 0 | TBOX horizontal_box = bounding_box_; |
704 | 0 | horizontal_box.set_bottom(y - kGridSize); |
705 | 0 | horizontal_box.set_top(y + kGridSize); |
706 | |
|
707 | 0 | ColPartitionGridSearch gsearch(text_grid_); |
708 | 0 | gsearch.SetUniqueMode(true); |
709 | 0 | gsearch.StartRectSearch(horizontal_box); |
710 | 0 | ColPartition *text = nullptr; |
711 | 0 | while ((text = gsearch.NextRectSearch()) != nullptr) { |
712 | 0 | if (!text->IsTextType()) { |
713 | 0 | continue; |
714 | 0 | } |
715 | | |
716 | 0 | const TBOX &box = text->bounding_box(); |
717 | 0 | if (box.bottom() < y && y < box.top()) { |
718 | 0 | ++count; |
719 | 0 | } |
720 | 0 | } |
721 | 0 | return count; |
722 | 0 | } |
723 | | |
724 | | // Counts how many text partitions are in this box. |
725 | | // This is used to count partitions in cells, as that can indicate |
726 | | // how "strong" a potential table row/column (or even full table) actually is. |
727 | 0 | int StructuredTable::CountPartitions(const TBOX &box) { |
728 | 0 | ColPartitionGridSearch gsearch(text_grid_); |
729 | 0 | gsearch.SetUniqueMode(true); |
730 | 0 | gsearch.StartRectSearch(box); |
731 | 0 | int count = 0; |
732 | 0 | ColPartition *text = nullptr; |
733 | 0 | while ((text = gsearch.NextRectSearch()) != nullptr) { |
734 | 0 | if (text->IsTextType()) { |
735 | 0 | ++count; |
736 | 0 | } |
737 | 0 | } |
738 | 0 | return count; |
739 | 0 | } |
740 | | |
741 | | //////// |
742 | | //////// TableRecognizer Class |
743 | | //////// |
744 | | |
745 | 0 | void TableRecognizer::Init() {} |
746 | | |
747 | 0 | void TableRecognizer::set_text_grid(ColPartitionGrid *text_grid) { |
748 | 0 | text_grid_ = text_grid; |
749 | 0 | } |
750 | 0 | void TableRecognizer::set_line_grid(ColPartitionGrid *line_grid) { |
751 | 0 | line_grid_ = line_grid; |
752 | 0 | } |
753 | 0 | void TableRecognizer::set_min_height(int height) { |
754 | 0 | min_height_ = height; |
755 | 0 | } |
756 | 0 | void TableRecognizer::set_min_width(int width) { |
757 | 0 | min_width_ = width; |
758 | 0 | } |
759 | 0 | void TableRecognizer::set_max_text_height(int height) { |
760 | 0 | max_text_height_ = height; |
761 | 0 | } |
762 | | |
763 | 0 | StructuredTable *TableRecognizer::RecognizeTable(const TBOX &guess) { |
764 | 0 | auto *table = new StructuredTable(); |
765 | 0 | table->Init(); |
766 | 0 | table->set_text_grid(text_grid_); |
767 | 0 | table->set_line_grid(line_grid_); |
768 | 0 | table->set_max_text_height(max_text_height_); |
769 | | |
770 | | // Try to solve this simple case, a table with *both* |
771 | | // vertical and horizontal lines. |
772 | 0 | if (RecognizeLinedTable(guess, table)) { |
773 | 0 | return table; |
774 | 0 | } |
775 | | |
776 | | // Fallback to whitespace if that failed. |
777 | | // TODO(nbeato): Break this apart to take advantage of horizontal |
778 | | // lines or vertical lines when present. |
779 | 0 | if (RecognizeWhitespacedTable(guess, table)) { |
780 | 0 | return table; |
781 | 0 | } |
782 | | |
783 | | // No table found... |
784 | 0 | delete table; |
785 | 0 | return nullptr; |
786 | 0 | } |
787 | | |
788 | 0 | bool TableRecognizer::RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table) { |
789 | 0 | if (!HasSignificantLines(guess_box)) { |
790 | 0 | return false; |
791 | 0 | } |
792 | 0 | TBOX line_bound = guess_box; |
793 | 0 | if (!FindLinesBoundingBox(&line_bound)) { |
794 | 0 | return false; |
795 | 0 | } |
796 | 0 | table->set_bounding_box(line_bound); |
797 | 0 | return table->FindLinedStructure(); |
798 | 0 | } |
799 | | |
800 | | // Quick implementation. Just count the number of lines in the box. |
801 | | // A better implementation would counter intersections and look for connected |
802 | | // components. It could even go as far as finding similar length lines. |
803 | | // To account for these possible issues, the VerifyLinedTableCells function |
804 | | // will reject lined tables that cause intersections with text on the page. |
805 | | // TODO(nbeato): look for "better" lines |
806 | 0 | bool TableRecognizer::HasSignificantLines(const TBOX &guess) { |
807 | 0 | ColPartitionGridSearch box_search(line_grid_); |
808 | 0 | box_search.SetUniqueMode(true); |
809 | 0 | box_search.StartRectSearch(guess); |
810 | 0 | ColPartition *line = nullptr; |
811 | 0 | int vertical_count = 0; |
812 | 0 | int horizontal_count = 0; |
813 | |
|
814 | 0 | while ((line = box_search.NextRectSearch()) != nullptr) { |
815 | 0 | if (line->IsHorizontalLine()) { |
816 | 0 | ++horizontal_count; |
817 | 0 | } |
818 | 0 | if (line->IsVerticalLine()) { |
819 | 0 | ++vertical_count; |
820 | 0 | } |
821 | 0 | } |
822 | |
|
823 | 0 | return vertical_count >= kLinedTableMinVerticalLines && |
824 | 0 | horizontal_count >= kLinedTableMinHorizontalLines; |
825 | 0 | } |
826 | | |
827 | | // Given a bounding box with a bunch of horizontal / vertical lines, |
828 | | // we just find the extents of all of these lines iteratively. |
829 | | // The box will be at least as large as guess. This |
830 | | // could possibly be a bad assumption. |
831 | | // It is guaranteed to halt in at least O(n * gridarea) where n |
832 | | // is the number of lines. |
833 | | // The assumption is that growing the box iteratively will add lines |
834 | | // several times, but eventually we'll find the extents. |
835 | | // |
836 | | // For tables, the approach is a bit aggressive, a single line (which could be |
837 | | // noise or a column ruling) can destroy the table inside. |
838 | | // |
839 | | // TODO(nbeato): This is a quick first implementation. |
840 | | // A better implementation would actually look for consistency |
841 | | // in extents of the lines and find the extents using lines |
842 | | // that clearly describe the table. This would allow the |
843 | | // lines to "vote" for height/width. An approach like |
844 | | // this would solve issues with page layout rulings. |
845 | | // I haven't looked for these issues yet, so I can't even |
846 | | // say they happen confidently. |
847 | 0 | bool TableRecognizer::FindLinesBoundingBox(TBOX *bounding_box) { |
848 | | // The first iteration will tell us if there are lines |
849 | | // present and shrink the box to a minimal iterative size. |
850 | 0 | if (!FindLinesBoundingBoxIteration(bounding_box)) { |
851 | 0 | return false; |
852 | 0 | } |
853 | | |
854 | | // Keep growing until the area of the table stabilizes. |
855 | | // The box can only get bigger, increasing area. |
856 | 0 | bool changed = true; |
857 | 0 | while (changed) { |
858 | 0 | changed = false; |
859 | 0 | int old_area = bounding_box->area(); |
860 | 0 | bool check = FindLinesBoundingBoxIteration(bounding_box); |
861 | | // At this point, the function will return true. |
862 | 0 | ASSERT_HOST(check); |
863 | 0 | ASSERT_HOST(bounding_box->area() >= old_area); |
864 | 0 | changed = (bounding_box->area() > old_area); |
865 | 0 | } |
866 | |
|
867 | 0 | return true; |
868 | 0 | } |
869 | | |
870 | 0 | bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX *bounding_box) { |
871 | | // Search for all of the lines in the current box, keeping track of extents. |
872 | 0 | ColPartitionGridSearch box_search(line_grid_); |
873 | 0 | box_search.SetUniqueMode(true); |
874 | 0 | box_search.StartRectSearch(*bounding_box); |
875 | 0 | ColPartition *line = nullptr; |
876 | 0 | bool first_line = true; |
877 | |
|
878 | 0 | while ((line = box_search.NextRectSearch()) != nullptr) { |
879 | 0 | if (line->IsLineType()) { |
880 | 0 | if (first_line) { |
881 | | // The first iteration can shrink the box. |
882 | 0 | *bounding_box = line->bounding_box(); |
883 | 0 | first_line = false; |
884 | 0 | } else { |
885 | 0 | *bounding_box += line->bounding_box(); |
886 | 0 | } |
887 | 0 | } |
888 | 0 | } |
889 | 0 | return !first_line; |
890 | 0 | } |
891 | | |
892 | | // The goal of this function is to move the table boundaries around and find |
893 | | // a table that maximizes the whitespace around the table while maximizing |
894 | | // the cellular structure. As a result, it gets confused by headers, footers, |
895 | | // and merged columns (text that crosses columns). There is a tolerance |
896 | | // that allows a few partitions to count towards potential cell merges. |
897 | | // It's the max_merged parameter to FindPartitionLocations. |
898 | | // It can work, but it needs some false positive remove on boundaries. |
899 | | // For now, the grid structure must not intersect any partitions. |
900 | | // Also, small tolerance is added to the horizontal lines for tightly packed |
901 | | // tables. The tolerance is added by adjusting the bounding boxes of the |
902 | | // partitions (in FindHorizontalPartitions). The current implementation |
903 | | // only adjusts the vertical extents of the table. |
904 | | // |
905 | | // Also note. This was hacked at a lot. It could probably use some |
906 | | // more hacking at to find a good set of border conditions and then a |
907 | | // nice clean up. |
908 | 0 | bool TableRecognizer::RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table) { |
909 | 0 | TBOX best_box = guess_box; // Best borders known. |
910 | 0 | int best_below = 0; // Margin size above best table. |
911 | 0 | int best_above = 0; // Margin size below best table. |
912 | 0 | TBOX adjusted = guess_box; // The search box. |
913 | | |
914 | | // We assume that the guess box is somewhat accurate, so we don't allow |
915 | | // the adjusted border to pass half of the guessed area. This prevents |
916 | | // "negative" tables from forming. |
917 | 0 | const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2; |
918 | | // Keeps track of the most columns in an accepted table. The resulting table |
919 | | // may be less than the max, but we don't want to stray too far. |
920 | 0 | unsigned best_cols = 0; |
921 | | // Make sure we find a good border. |
922 | 0 | bool found_good_border = false; |
923 | | |
924 | | // Find the bottom of the table by trying a few different locations. For |
925 | | // each location, the top, left, and right are fixed. We start the search |
926 | | // in a smaller table to favor best_cols getting a good estimate sooner. |
927 | 0 | int last_bottom = INT32_MAX; |
928 | 0 | int bottom = |
929 | 0 | NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY - min_height_ / 2, true); |
930 | 0 | int top = |
931 | 0 | NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false); |
932 | 0 | adjusted.set_top(top); |
933 | | |
934 | | // Headers/footers can be spaced far from everything. |
935 | | // Make sure that the space below is greater than the space above |
936 | | // the lowest row. |
937 | 0 | int previous_below = 0; |
938 | 0 | const int kMaxChances = 10; |
939 | 0 | int chances = kMaxChances; |
940 | 0 | while (bottom != last_bottom) { |
941 | 0 | adjusted.set_bottom(bottom); |
942 | |
|
943 | 0 | if (adjusted.height() >= min_height_) { |
944 | | // Try to fit the grid on the current box. We give it a chance |
945 | | // if the number of columns didn't significantly drop. |
946 | 0 | table->set_bounding_box(adjusted); |
947 | 0 | if (table->FindWhitespacedStructure() && |
948 | 0 | table->column_count() >= best_cols * kRequiredColumns) { |
949 | 0 | if (false && IsWeakTableRow(table, 0)) { |
950 | | // Currently buggy, but was looking promising so disabled. |
951 | 0 | --chances; |
952 | 0 | } else { |
953 | | // We favor 2 things, |
954 | | // 1- Adding rows that have partitioned data. |
955 | | // 2- Better margins (to find header/footer). |
956 | | // For better tables, we just look for multiple cells in the |
957 | | // bottom row with data in them. |
958 | | // For margins, the space below the last row should |
959 | | // be better than a table with the last row removed. |
960 | 0 | chances = kMaxChances; |
961 | 0 | double max_row_height = kMaxRowSize * table->median_cell_height(); |
962 | 0 | if ((table->space_below() * kMarginFactor >= best_below && |
963 | 0 | table->space_below() >= previous_below) || |
964 | 0 | (table->CountFilledCellsInRow(0) > 1 && table->row_height(0) < max_row_height)) { |
965 | 0 | best_box.set_bottom(bottom); |
966 | 0 | best_below = table->space_below(); |
967 | 0 | best_cols = std::max(table->column_count(), best_cols); |
968 | 0 | found_good_border = true; |
969 | 0 | } |
970 | 0 | } |
971 | 0 | previous_below = table->space_below(); |
972 | 0 | } else { |
973 | 0 | --chances; |
974 | 0 | } |
975 | 0 | } |
976 | 0 | if (chances <= 0) { |
977 | 0 | break; |
978 | 0 | } |
979 | | |
980 | 0 | last_bottom = bottom; |
981 | 0 | bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_bottom, true); |
982 | 0 | } |
983 | 0 | if (!found_good_border) { |
984 | 0 | return false; |
985 | 0 | } |
986 | | |
987 | | // TODO(nbeato) comments: follow modified code above... put it in a function! |
988 | 0 | found_good_border = false; |
989 | 0 | int last_top = INT32_MIN; |
990 | 0 | top = |
991 | 0 | NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false); |
992 | 0 | int previous_above = 0; |
993 | 0 | chances = kMaxChances; |
994 | |
|
995 | 0 | adjusted.set_bottom(best_box.bottom()); |
996 | 0 | while (last_top != top) { |
997 | 0 | adjusted.set_top(top); |
998 | 0 | if (adjusted.height() >= min_height_) { |
999 | 0 | table->set_bounding_box(adjusted); |
1000 | 0 | if (table->FindWhitespacedStructure() && |
1001 | 0 | table->column_count() >= best_cols * kRequiredColumns) { |
1002 | 0 | int last_row = table->row_count() - 1; |
1003 | 0 | if (false && IsWeakTableRow(table, last_row)) { |
1004 | | // Currently buggy, but was looking promising so disabled. |
1005 | 0 | --chances; |
1006 | 0 | } else { |
1007 | 0 | chances = kMaxChances; |
1008 | 0 | double max_row_height = kMaxRowSize * table->median_cell_height(); |
1009 | 0 | if ((table->space_above() * kMarginFactor >= best_above && |
1010 | 0 | table->space_above() >= previous_above) || |
1011 | 0 | (table->CountFilledCellsInRow(last_row) > 1 && |
1012 | 0 | table->row_height(last_row) < max_row_height)) { |
1013 | 0 | best_box.set_top(top); |
1014 | 0 | best_above = table->space_above(); |
1015 | 0 | best_cols = std::max(table->column_count(), best_cols); |
1016 | 0 | found_good_border = true; |
1017 | 0 | } |
1018 | 0 | } |
1019 | 0 | previous_above = table->space_above(); |
1020 | 0 | } else { |
1021 | 0 | --chances; |
1022 | 0 | } |
1023 | 0 | } |
1024 | 0 | if (chances <= 0) { |
1025 | 0 | break; |
1026 | 0 | } |
1027 | | |
1028 | 0 | last_top = top; |
1029 | 0 | top = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_top, false); |
1030 | 0 | } |
1031 | |
|
1032 | 0 | if (!found_good_border) { |
1033 | 0 | return false; |
1034 | 0 | } |
1035 | | |
1036 | | // If we get here, this shouldn't happen. It can be an assert, but |
1037 | | // I haven't tested it enough to make it crash things. |
1038 | 0 | if (best_box.null_box()) { |
1039 | 0 | return false; |
1040 | 0 | } |
1041 | | |
1042 | | // Given the best locations, fit the box to those locations. |
1043 | 0 | table->set_bounding_box(best_box); |
1044 | 0 | return table->FindWhitespacedStructure(); |
1045 | 0 | } |
1046 | | |
1047 | | // Finds the closest value to y that can safely cause a horizontal |
1048 | | // split in the partitions. |
1049 | | // This function has been buggy and not as reliable as I would've |
1050 | | // liked. I suggest finding all of the splits using the |
1051 | | // FindPartitionLocations once and then just keeping the results |
1052 | | // of that function cached somewhere. |
1053 | 0 | int TableRecognizer::NextHorizontalSplit(int left, int right, int y, bool top_to_bottom) { |
1054 | 0 | ColPartitionGridSearch gsearch(text_grid_); |
1055 | 0 | gsearch.SetUniqueMode(true); |
1056 | 0 | gsearch.StartVerticalSearch(left, right, y); |
1057 | 0 | ColPartition *text = nullptr; |
1058 | 0 | int last_y = y; |
1059 | 0 | while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != nullptr) { |
1060 | 0 | if (!text->IsTextType() || !text->IsHorizontalType()) { |
1061 | 0 | continue; |
1062 | 0 | } |
1063 | 0 | if (text->bounding_box().height() > max_text_height_) { |
1064 | 0 | continue; |
1065 | 0 | } |
1066 | | |
1067 | 0 | const TBOX &text_box = text->bounding_box(); |
1068 | 0 | if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) { |
1069 | 0 | last_y = std::min(last_y, static_cast<int>(text_box.bottom())); |
1070 | 0 | continue; |
1071 | 0 | } |
1072 | 0 | if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) { |
1073 | 0 | last_y = std::max(last_y, static_cast<int>(text_box.top())); |
1074 | 0 | continue; |
1075 | 0 | } |
1076 | | |
1077 | 0 | return last_y; |
1078 | 0 | } |
1079 | | // If none is found, we at least want to preserve the min/max, |
1080 | | // which defines the overlap of y with the last partition in the grid. |
1081 | 0 | return last_y; |
1082 | 0 | } |
1083 | | |
1084 | | } // namespace tesseract |