/src/tesseract/src/textord/tabfind.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | ///////////////////////////////////////////////////////////////////////  | 
2  |  | // File:        tabfind.cpp  | 
3  |  | // Description: Subclass of BBGrid to find vertically aligned blobs.  | 
4  |  | // Author:      Ray Smith  | 
5  |  | //  | 
6  |  | // (C) Copyright 2008, Google Inc.  | 
7  |  | // Licensed under the Apache License, Version 2.0 (the "License");  | 
8  |  | // you may not use this file except in compliance with the License.  | 
9  |  | // You may obtain a copy of the License at  | 
10  |  | // http://www.apache.org/licenses/LICENSE-2.0  | 
11  |  | // Unless required by applicable law or agreed to in writing, software  | 
12  |  | // distributed under the License is distributed on an "AS IS" BASIS,  | 
13  |  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
14  |  | // See the License for the specific language governing permissions and  | 
15  |  | // limitations under the License.  | 
16  |  | //  | 
17  |  | ///////////////////////////////////////////////////////////////////////  | 
18  |  |  | 
19  |  | #ifdef HAVE_CONFIG_H  | 
20  |  | #  include "config_auto.h"  | 
21  |  | #endif  | 
22  |  |  | 
23  |  | #include "alignedblob.h"  | 
24  |  | #include "colpartitiongrid.h"  | 
25  |  | #include "detlinefit.h"  | 
26  |  | #include "host.h" // for NearlyEqual  | 
27  |  | #include "linefind.h"  | 
28  |  | #include "tabfind.h"  | 
29  |  |  | 
30  |  | #include <algorithm>  | 
31  |  |  | 
32  |  | namespace tesseract { | 
33  |  |  | 
34  |  | // Multiple of box size to search for initial gaps.  | 
35  |  | const int kTabRadiusFactor = 5;  | 
36  |  | // Min and Max multiple of height to search vertically when extrapolating.  | 
37  |  | const int kMinVerticalSearch = 3;  | 
38  |  | const int kMaxVerticalSearch = 12;  | 
39  |  | const int kMaxRaggedSearch = 25;  | 
40  |  | // Minimum number of lines in a column width to make it interesting.  | 
41  |  | const int kMinLinesInColumn = 10;  | 
42  |  | // Minimum width of a column to be interesting.  | 
43  |  | const int kMinColumnWidth = 200;  | 
44  |  | // Minimum fraction of total column lines for a column to be interesting.  | 
45  |  | const double kMinFractionalLinesInColumn = 0.125;  | 
46  |  | // Fraction of height used as alignment tolerance for aligned tabs.  | 
47  |  | const double kAlignedFraction = 0.03125;  | 
48  |  | // Maximum gutter width (in absolute inch) that we care about  | 
49  |  | const double kMaxGutterWidthAbsolute = 2.00;  | 
50  |  | // Multiplier of gridsize for min gutter width of TT_MAYBE_RAGGED blobs.  | 
51  |  | const int kRaggedGutterMultiple = 5;  | 
52  |  | // Min aspect ratio of tall objects to be considered a separator line.  | 
53  |  | // (These will be ignored in searching the gutter for obstructions.)  | 
54  |  | const double kLineFragmentAspectRatio = 10.0;  | 
55  |  | // Min number of points to accept after evaluation.  | 
56  |  | const int kMinEvaluatedTabs = 3;  | 
57  |  | // Up to 30 degrees is allowed for rotations of diacritic blobs.  | 
58  |  | // Keep this value slightly larger than kCosSmallAngle in blobbox.cpp  | 
59  |  | // so that the assert there never fails.  | 
60  |  | const double kCosMaxSkewAngle = 0.866025;  | 
61  |  |  | 
62  |  | static BOOL_VAR(textord_tabfind_show_initialtabs, false, "Show tab candidates");  | 
63  |  | static BOOL_VAR(textord_tabfind_show_finaltabs, false, "Show tab vectors");  | 
64  |  |  | 
65  |  | TabFind::TabFind(int gridsize, const ICOORD &bleft, const ICOORD &tright, TabVector_LIST *vlines,  | 
66  |  |                  int vertical_x, int vertical_y, int resolution)  | 
67  | 0  |     : AlignedBlob(gridsize, bleft, tright)  | 
68  | 0  |     , resolution_(resolution)  | 
69  | 0  |     , image_origin_(0, tright.y() - 1)  | 
70  | 0  |     , v_it_(&vectors_)  | 
71  | 0  |     , width_cb_(nullptr) { | 
72  | 0  |   v_it_.add_list_after(vlines);  | 
73  | 0  |   SetVerticalSkewAndParallelize(vertical_x, vertical_y);  | 
74  | 0  |   using namespace std::placeholders; // for _1  | 
75  | 0  |   width_cb_ = std::bind(&TabFind::CommonWidth, this, _1);  | 
76  | 0  | }  | 
77  |  |  | 
78  | 0  | TabFind::~TabFind() = default;  | 
79  |  |  | 
80  |  | ///////////////// PUBLIC functions (mostly used by TabVector). //////////////  | 
81  |  |  | 
82  |  | // Insert a list of blobs into the given grid (not necessarily this).  | 
83  |  | // If take_ownership is true, then the blobs are removed from the source list.  | 
84  |  | // See InsertBlob for the other arguments.  | 
85  |  | // It would seem to make more sense to swap this and grid, but this way  | 
86  |  | // around allows grid to not be derived from TabFind, eg a ColPartitionGrid,  | 
87  |  | // while the grid that provides the tab stops(this) has to be derived from  | 
88  |  | // TabFind.  | 
89  |  | void TabFind::InsertBlobsToGrid(bool h_spread, bool v_spread, BLOBNBOX_LIST *blobs,  | 
90  | 0  |                                 BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> *grid) { | 
91  | 0  |   BLOBNBOX_IT blob_it(blobs);  | 
92  | 0  |   int b_count = 0;  | 
93  | 0  |   int reject_count = 0;  | 
94  | 0  |   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | 
95  | 0  |     BLOBNBOX *blob = blob_it.data();  | 
96  |  |     //    if (InsertBlob(true, true, blob, grid)) { | 
97  | 0  |     if (InsertBlob(h_spread, v_spread, blob, grid)) { | 
98  | 0  |       ++b_count;  | 
99  | 0  |     } else { | 
100  | 0  |       ++reject_count;  | 
101  | 0  |     }  | 
102  | 0  |   }  | 
103  | 0  |   if (textord_debug_tabfind) { | 
104  | 0  |     tprintf("Inserted %d blobs into grid, %d rejected.\n", b_count, reject_count); | 
105  | 0  |   }  | 
106  | 0  | }  | 
107  |  |  | 
108  |  | // Insert a single blob into the given grid (not necessarily this).  | 
109  |  | // If h_spread, then all cells covered horizontally by the box are  | 
110  |  | // used, otherwise, just the bottom-left. Similarly for v_spread.  | 
111  |  | // A side effect is that the left and right rule edges of the blob are  | 
112  |  | // set according to the tab vectors in this (not grid).  | 
113  |  | bool TabFind::InsertBlob(bool h_spread, bool v_spread, BLOBNBOX *blob,  | 
114  | 0  |                          BBGrid<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> *grid) { | 
115  | 0  |   TBOX box = blob->bounding_box();  | 
116  | 0  |   blob->set_left_rule(LeftEdgeForBox(box, false, false));  | 
117  | 0  |   blob->set_right_rule(RightEdgeForBox(box, false, false));  | 
118  | 0  |   blob->set_left_crossing_rule(LeftEdgeForBox(box, true, false));  | 
119  | 0  |   blob->set_right_crossing_rule(RightEdgeForBox(box, true, false));  | 
120  | 0  |   if (blob->joined_to_prev()) { | 
121  | 0  |     return false;  | 
122  | 0  |   }  | 
123  | 0  |   grid->InsertBBox(h_spread, v_spread, blob);  | 
124  | 0  |   return true;  | 
125  | 0  | }  | 
126  |  |  | 
127  |  | // Calls SetBlobRuleEdges for all the blobs in the given block.  | 
128  | 0  | void TabFind::SetBlockRuleEdges(TO_BLOCK *block) { | 
129  | 0  |   SetBlobRuleEdges(&block->blobs);  | 
130  | 0  |   SetBlobRuleEdges(&block->small_blobs);  | 
131  | 0  |   SetBlobRuleEdges(&block->noise_blobs);  | 
132  | 0  |   SetBlobRuleEdges(&block->large_blobs);  | 
133  | 0  | }  | 
134  |  |  | 
135  |  | // Sets the left and right rule and crossing_rules for the blobs in the given  | 
136  |  | // list by fiding the next outermost tabvectors for each blob.  | 
137  | 0  | void TabFind::SetBlobRuleEdges(BLOBNBOX_LIST *blobs) { | 
138  | 0  |   BLOBNBOX_IT blob_it(blobs);  | 
139  | 0  |   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { | 
140  | 0  |     BLOBNBOX *blob = blob_it.data();  | 
141  | 0  |     TBOX box = blob->bounding_box();  | 
142  | 0  |     blob->set_left_rule(LeftEdgeForBox(box, false, false));  | 
143  | 0  |     blob->set_right_rule(RightEdgeForBox(box, false, false));  | 
144  | 0  |     blob->set_left_crossing_rule(LeftEdgeForBox(box, true, false));  | 
145  | 0  |     blob->set_right_crossing_rule(RightEdgeForBox(box, true, false));  | 
146  | 0  |   }  | 
147  | 0  | }  | 
148  |  |  | 
149  |  | // Returns the gutter width of the given TabVector between the given y limits.  | 
150  |  | // Also returns x-shift to be added to the vector to clear any intersecting  | 
151  |  | // blobs. The shift is deducted from the returned gutter.  | 
152  |  | // If ignore_unmergeables is true, then blobs of UnMergeableType are  | 
153  |  | // ignored as if they don't exist. (Used for text on image.)  | 
154  |  | // max_gutter_width is used as the maximum width worth searching for in case  | 
155  |  | // there is nothing near the TabVector.  | 
156  |  | int TabFind::GutterWidth(int bottom_y, int top_y, const TabVector &v, bool ignore_unmergeables,  | 
157  | 0  |                          int max_gutter_width, int *required_shift) { | 
158  | 0  |   bool right_to_left = v.IsLeftTab();  | 
159  | 0  |   int bottom_x = v.XAtY(bottom_y);  | 
160  | 0  |   int top_x = v.XAtY(top_y);  | 
161  | 0  |   int start_x = right_to_left ? std::max(top_x, bottom_x) : std::min(top_x, bottom_x);  | 
162  | 0  |   BlobGridSearch sidesearch(this);  | 
163  | 0  |   sidesearch.StartSideSearch(start_x, bottom_y, top_y);  | 
164  | 0  |   int min_gap = max_gutter_width;  | 
165  | 0  |   *required_shift = 0;  | 
166  | 0  |   BLOBNBOX *blob = nullptr;  | 
167  | 0  |   while ((blob = sidesearch.NextSideSearch(right_to_left)) != nullptr) { | 
168  | 0  |     const TBOX &box = blob->bounding_box();  | 
169  | 0  |     if (box.bottom() >= top_y || box.top() <= bottom_y) { | 
170  | 0  |       continue; // Doesn't overlap enough.  | 
171  | 0  |     }  | 
172  | 0  |     if (box.height() >= gridsize() * 2 && box.height() > box.width() * kLineFragmentAspectRatio) { | 
173  |  |       // Skip likely separator line residue.  | 
174  | 0  |       continue;  | 
175  | 0  |     }  | 
176  | 0  |     if (ignore_unmergeables && BLOBNBOX::UnMergeableType(blob->region_type())) { | 
177  | 0  |       continue; // Skip non-text if required.  | 
178  | 0  |     }  | 
179  | 0  |     int mid_y = (box.bottom() + box.top()) / 2;  | 
180  |  |     // We use the x at the mid-y so that the required_shift guarantees  | 
181  |  |     // to clear all the blobs on the tab-stop. If we use the min/max  | 
182  |  |     // of x at top/bottom of the blob, then exactness would be required,  | 
183  |  |     // which is not a good thing.  | 
184  | 0  |     int tab_x = v.XAtY(mid_y);  | 
185  | 0  |     int gap;  | 
186  | 0  |     if (right_to_left) { | 
187  | 0  |       gap = tab_x - box.right();  | 
188  | 0  |       if (gap < 0 && box.left() - tab_x < *required_shift) { | 
189  | 0  |         *required_shift = box.left() - tab_x;  | 
190  | 0  |       }  | 
191  | 0  |     } else { | 
192  | 0  |       gap = box.left() - tab_x;  | 
193  | 0  |       if (gap < 0 && box.right() - tab_x > *required_shift) { | 
194  | 0  |         *required_shift = box.right() - tab_x;  | 
195  | 0  |       }  | 
196  | 0  |     }  | 
197  | 0  |     if (gap > 0 && gap < min_gap) { | 
198  | 0  |       min_gap = gap;  | 
199  | 0  |     }  | 
200  | 0  |   }  | 
201  |  |   // Result may be negative, in which case,  this is a really bad tabstop.  | 
202  | 0  |   return min_gap - abs(*required_shift);  | 
203  | 0  | }  | 
204  |  |  | 
205  |  | // Find the gutter width and distance to inner neighbour for the given blob.  | 
206  |  | void TabFind::GutterWidthAndNeighbourGap(int tab_x, int mean_height, int max_gutter, bool left,  | 
207  | 0  |                                          BLOBNBOX *bbox, int *gutter_width, int *neighbour_gap) { | 
208  | 0  |   const TBOX &box = bbox->bounding_box();  | 
209  |  |   // The gutter and internal sides of the box.  | 
210  | 0  |   int gutter_x = left ? box.left() : box.right();  | 
211  | 0  |   int internal_x = left ? box.right() : box.left();  | 
212  |  |   // On ragged edges, the gutter side of the box is away from the tabstop.  | 
213  | 0  |   int tab_gap = left ? gutter_x - tab_x : tab_x - gutter_x;  | 
214  | 0  |   *gutter_width = max_gutter;  | 
215  |  |   // If the box is away from the tabstop, we need to increase  | 
216  |  |   // the allowed gutter width.  | 
217  | 0  |   if (tab_gap > 0) { | 
218  | 0  |     *gutter_width += tab_gap;  | 
219  | 0  |   }  | 
220  | 0  |   bool debug = WithinTestRegion(2, box.left(), box.bottom());  | 
221  | 0  |   if (debug) { | 
222  | 0  |     tprintf("Looking in gutter\n"); | 
223  | 0  |   }  | 
224  |  |   // Find the nearest blob on the outside of the column.  | 
225  | 0  |   BLOBNBOX *gutter_bbox = AdjacentBlob(bbox, left, bbox->flow() == BTFT_TEXT_ON_IMAGE, 0.0,  | 
226  | 0  |                                        *gutter_width, box.top(), box.bottom());  | 
227  | 0  |   if (gutter_bbox != nullptr) { | 
228  | 0  |     const TBOX &gutter_box = gutter_bbox->bounding_box();  | 
229  | 0  |     *gutter_width = left ? tab_x - gutter_box.right() : gutter_box.left() - tab_x;  | 
230  | 0  |   }  | 
231  | 0  |   if (*gutter_width >= max_gutter) { | 
232  |  |     // If there is no box because a tab was in the way, get the tab coord.  | 
233  | 0  |     TBOX gutter_box(box);  | 
234  | 0  |     if (left) { | 
235  | 0  |       gutter_box.set_left(tab_x - max_gutter - 1);  | 
236  | 0  |       gutter_box.set_right(tab_x - max_gutter);  | 
237  | 0  |       int tab_gutter = RightEdgeForBox(gutter_box, true, false);  | 
238  | 0  |       if (tab_gutter < tab_x - 1) { | 
239  | 0  |         *gutter_width = tab_x - tab_gutter;  | 
240  | 0  |       }  | 
241  | 0  |     } else { | 
242  | 0  |       gutter_box.set_left(tab_x + max_gutter);  | 
243  | 0  |       gutter_box.set_right(tab_x + max_gutter + 1);  | 
244  | 0  |       int tab_gutter = LeftEdgeForBox(gutter_box, true, false);  | 
245  | 0  |       if (tab_gutter > tab_x + 1) { | 
246  | 0  |         *gutter_width = tab_gutter - tab_x;  | 
247  | 0  |       }  | 
248  | 0  |     }  | 
249  | 0  |   }  | 
250  | 0  |   if (*gutter_width > max_gutter) { | 
251  | 0  |     *gutter_width = max_gutter;  | 
252  | 0  |   }  | 
253  |  |   // Now look for a neighbour on the inside.  | 
254  | 0  |   if (debug) { | 
255  | 0  |     tprintf("Looking for neighbour\n"); | 
256  | 0  |   }  | 
257  | 0  |   BLOBNBOX *neighbour = AdjacentBlob(bbox, !left, bbox->flow() == BTFT_TEXT_ON_IMAGE, 0.0,  | 
258  | 0  |                                      *gutter_width, box.top(), box.bottom());  | 
259  | 0  |   int neighbour_edge = left ? RightEdgeForBox(box, true, false) : LeftEdgeForBox(box, true, false);  | 
260  | 0  |   if (neighbour != nullptr) { | 
261  | 0  |     const TBOX &n_box = neighbour->bounding_box();  | 
262  | 0  |     if (debug) { | 
263  | 0  |       tprintf("Found neighbour:"); | 
264  | 0  |       n_box.print();  | 
265  | 0  |     }  | 
266  | 0  |     if (left && n_box.left() < neighbour_edge) { | 
267  | 0  |       neighbour_edge = n_box.left();  | 
268  | 0  |     } else if (!left && n_box.right() > neighbour_edge) { | 
269  | 0  |       neighbour_edge = n_box.right();  | 
270  | 0  |     }  | 
271  | 0  |   }  | 
272  | 0  |   *neighbour_gap = left ? neighbour_edge - internal_x : internal_x - neighbour_edge;  | 
273  | 0  | }  | 
274  |  |  | 
275  |  | // Return the x-coord that corresponds to the right edge for the given  | 
276  |  | // box. If there is a rule line to the right that vertically overlaps it,  | 
277  |  | // then return the x-coord of the rule line, otherwise return the right  | 
278  |  | // edge of the page. For details see RightTabForBox below.  | 
279  | 0  | int TabFind::RightEdgeForBox(const TBOX &box, bool crossing, bool extended) { | 
280  | 0  |   TabVector *v = RightTabForBox(box, crossing, extended);  | 
281  | 0  |   return v == nullptr ? tright_.x() : v->XAtY((box.top() + box.bottom()) / 2);  | 
282  | 0  | }  | 
283  |  | // As RightEdgeForBox, but finds the left Edge instead.  | 
284  | 0  | int TabFind::LeftEdgeForBox(const TBOX &box, bool crossing, bool extended) { | 
285  | 0  |   TabVector *v = LeftTabForBox(box, crossing, extended);  | 
286  | 0  |   return v == nullptr ? bleft_.x() : v->XAtY((box.top() + box.bottom()) / 2);  | 
287  | 0  | }  | 
288  |  |  | 
289  |  | // This comment documents how this function works.  | 
290  |  | // For its purpose and arguments, see the comment in tabfind.h.  | 
291  |  | // TabVectors are stored sorted by perpendicular distance of middle from  | 
292  |  | // the global mean vertical vector. Since the individual vectors can have  | 
293  |  | // differing directions, their XAtY for a given y is not necessarily in the  | 
294  |  | // right order. Therefore the search has to be run with a margin.  | 
295  |  | // The middle of a vector that passes through (x,y) cannot be higher than  | 
296  |  | // halfway from y to the top, or lower than halfway from y to the bottom  | 
297  |  | // of the coordinate range; therefore, the search margin is the range of  | 
298  |  | // sort keys between these halfway points. Any vector with a sort key greater  | 
299  |  | // than the upper margin must be to the right of x at y, and likewise any  | 
300  |  | // vector with a sort key less than the lower margin must pass to the left  | 
301  |  | // of x at y.  | 
302  | 0  | TabVector *TabFind::RightTabForBox(const TBOX &box, bool crossing, bool extended) { | 
303  | 0  |   if (v_it_.empty()) { | 
304  | 0  |     return nullptr;  | 
305  | 0  |   }  | 
306  | 0  |   int top_y = box.top();  | 
307  | 0  |   int bottom_y = box.bottom();  | 
308  | 0  |   int mid_y = (top_y + bottom_y) / 2;  | 
309  | 0  |   int right = crossing ? (box.left() + box.right()) / 2 : box.right();  | 
310  | 0  |   int min_key, max_key;  | 
311  | 0  |   SetupTabSearch(right, mid_y, &min_key, &max_key);  | 
312  |  |   // Position the iterator at the first TabVector with sort_key >= min_key.  | 
313  | 0  |   while (!v_it_.at_first() && v_it_.data()->sort_key() >= min_key) { | 
314  | 0  |     v_it_.backward();  | 
315  | 0  |   }  | 
316  | 0  |   while (!v_it_.at_last() && v_it_.data()->sort_key() < min_key) { | 
317  | 0  |     v_it_.forward();  | 
318  | 0  |   }  | 
319  |  |   // Find the leftmost tab vector that overlaps and has XAtY(mid_y) >= right.  | 
320  | 0  |   TabVector *best_v = nullptr;  | 
321  | 0  |   int best_x = -1;  | 
322  | 0  |   int key_limit = -1;  | 
323  | 0  |   do { | 
324  | 0  |     TabVector *v = v_it_.data();  | 
325  | 0  |     int x = v->XAtY(mid_y);  | 
326  | 0  |     if (x >= right && (v->VOverlap(top_y, bottom_y) > 0 ||  | 
327  | 0  |                        (extended && v->ExtendedOverlap(top_y, bottom_y) > 0))) { | 
328  | 0  |       if (best_v == nullptr || x < best_x) { | 
329  | 0  |         best_v = v;  | 
330  | 0  |         best_x = x;  | 
331  |  |         // We can guarantee that no better vector can be found if the  | 
332  |  |         // sort key exceeds that of the best by max_key - min_key.  | 
333  | 0  |         key_limit = v->sort_key() + max_key - min_key;  | 
334  | 0  |       }  | 
335  | 0  |     }  | 
336  |  |     // Break when the search is done to avoid wrapping the iterator and  | 
337  |  |     // thereby potentially slowing the next search.  | 
338  | 0  |     if (v_it_.at_last() || (best_v != nullptr && v->sort_key() > key_limit)) { | 
339  | 0  |       break; // Prevent restarting list for next call.  | 
340  | 0  |     }  | 
341  | 0  |     v_it_.forward();  | 
342  | 0  |   } while (!v_it_.at_first());  | 
343  | 0  |   return best_v;  | 
344  | 0  | }  | 
345  |  |  | 
346  |  | // As RightTabForBox, but finds the left TabVector instead.  | 
347  | 0  | TabVector *TabFind::LeftTabForBox(const TBOX &box, bool crossing, bool extended) { | 
348  | 0  |   if (v_it_.empty()) { | 
349  | 0  |     return nullptr;  | 
350  | 0  |   }  | 
351  | 0  |   int top_y = box.top();  | 
352  | 0  |   int bottom_y = box.bottom();  | 
353  | 0  |   int mid_y = (top_y + bottom_y) / 2;  | 
354  | 0  |   int left = crossing ? (box.left() + box.right()) / 2 : box.left();  | 
355  | 0  |   int min_key, max_key;  | 
356  | 0  |   SetupTabSearch(left, mid_y, &min_key, &max_key);  | 
357  |  |   // Position the iterator at the last TabVector with sort_key <= max_key.  | 
358  | 0  |   while (!v_it_.at_last() && v_it_.data()->sort_key() <= max_key) { | 
359  | 0  |     v_it_.forward();  | 
360  | 0  |   }  | 
361  | 0  |   while (!v_it_.at_first() && v_it_.data()->sort_key() > max_key) { | 
362  | 0  |     v_it_.backward();  | 
363  | 0  |   }  | 
364  |  |   // Find the rightmost tab vector that overlaps and has XAtY(mid_y) <= left.  | 
365  | 0  |   TabVector *best_v = nullptr;  | 
366  | 0  |   int best_x = -1;  | 
367  | 0  |   int key_limit = -1;  | 
368  | 0  |   do { | 
369  | 0  |     TabVector *v = v_it_.data();  | 
370  | 0  |     int x = v->XAtY(mid_y);  | 
371  | 0  |     if (x <= left && (v->VOverlap(top_y, bottom_y) > 0 ||  | 
372  | 0  |                       (extended && v->ExtendedOverlap(top_y, bottom_y) > 0))) { | 
373  | 0  |       if (best_v == nullptr || x > best_x) { | 
374  | 0  |         best_v = v;  | 
375  | 0  |         best_x = x;  | 
376  |  |         // We can guarantee that no better vector can be found if the  | 
377  |  |         // sort key is less than that of the best by max_key - min_key.  | 
378  | 0  |         key_limit = v->sort_key() - (max_key - min_key);  | 
379  | 0  |       }  | 
380  | 0  |     }  | 
381  |  |     // Break when the search is done to avoid wrapping the iterator and  | 
382  |  |     // thereby potentially slowing the next search.  | 
383  | 0  |     if (v_it_.at_first() || (best_v != nullptr && v->sort_key() < key_limit)) { | 
384  | 0  |       break; // Prevent restarting list for next call.  | 
385  | 0  |     }  | 
386  | 0  |     v_it_.backward();  | 
387  | 0  |   } while (!v_it_.at_last());  | 
388  | 0  |   return best_v;  | 
389  | 0  | }  | 
390  |  |  | 
391  |  | // Return true if the given width is close to one of the common  | 
392  |  | // widths in column_widths_.  | 
393  | 0  | bool TabFind::CommonWidth(int width) { | 
394  | 0  |   width /= kColumnWidthFactor;  | 
395  | 0  |   ICOORDELT_IT it(&column_widths_);  | 
396  | 0  |   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | 
397  | 0  |     ICOORDELT *w = it.data();  | 
398  | 0  |     if (w->x() - 1 <= width && width <= w->y() + 1) { | 
399  | 0  |       return true;  | 
400  | 0  |     }  | 
401  | 0  |   }  | 
402  | 0  |   return false;  | 
403  | 0  | }  | 
404  |  |  | 
405  |  | // Return true if the sizes are more than a  | 
406  |  | // factor of 2 different.  | 
407  | 0  | bool TabFind::DifferentSizes(int size1, int size2) { | 
408  | 0  |   return size1 > size2 * 2 || size2 > size1 * 2;  | 
409  | 0  | }  | 
410  |  |  | 
411  |  | // Return true if the sizes are more than a  | 
412  |  | // factor of 5 different.  | 
413  | 0  | bool TabFind::VeryDifferentSizes(int size1, int size2) { | 
414  | 0  |   return size1 > size2 * 5 || size2 > size1 * 5;  | 
415  | 0  | }  | 
416  |  |  | 
417  |  | ///////////////// PROTECTED functions (used by ColumnFinder). //////////////  | 
418  |  |  | 
419  |  | // Top-level function to find TabVectors in an input page block.  | 
420  |  | // Returns false if the detected skew angle is impossible.  | 
421  |  | // Applies the detected skew angle to deskew the tabs, blobs and part_grid.  | 
422  |  | bool TabFind::FindTabVectors(TabVector_LIST *hlines, BLOBNBOX_LIST *image_blobs, TO_BLOCK *block,  | 
423  |  |                              int min_gutter_width, double tabfind_aligned_gap_fraction,  | 
424  | 0  |                              ColPartitionGrid *part_grid, FCOORD *deskew, FCOORD *reskew) { | 
425  | 0  |   ScrollView *tab_win =  | 
426  | 0  |       FindInitialTabVectors(image_blobs, min_gutter_width, tabfind_aligned_gap_fraction, block);  | 
427  | 0  |   ComputeColumnWidths(tab_win, part_grid);  | 
428  | 0  |   TabVector::MergeSimilarTabVectors(vertical_skew_, &vectors_, this);  | 
429  | 0  |   SortVectors();  | 
430  | 0  |   CleanupTabs();  | 
431  | 0  |   if (!Deskew(hlines, image_blobs, block, deskew, reskew)) { | 
432  | 0  |     return false; // Skew angle is too large.  | 
433  | 0  |   }  | 
434  | 0  |   part_grid->Deskew(*deskew);  | 
435  | 0  |   ApplyTabConstraints();  | 
436  |  | #ifndef GRAPHICS_DISABLED  | 
437  |  |   if (textord_tabfind_show_finaltabs) { | 
438  |  |     tab_win = MakeWindow(640, 50, "FinalTabs");  | 
439  |  |     DisplayBoxes(tab_win);  | 
440  |  |     DisplayTabs("FinalTabs", tab_win); | 
441  |  |     tab_win = DisplayTabVectors(tab_win);  | 
442  |  |   }  | 
443  |  | #endif // !GRAPHICS_DISABLED  | 
444  | 0  |   return true;  | 
445  | 0  | }  | 
446  |  |  | 
447  |  | // Top-level function to not find TabVectors in an input page block,  | 
448  |  | // but setup for single column mode.  | 
449  |  | void TabFind::DontFindTabVectors(BLOBNBOX_LIST *image_blobs, TO_BLOCK *block, FCOORD *deskew,  | 
450  | 0  |                                  FCOORD *reskew) { | 
451  | 0  |   InsertBlobsToGrid(false, false, image_blobs, this);  | 
452  | 0  |   InsertBlobsToGrid(true, false, &block->blobs, this);  | 
453  | 0  |   deskew->set_x(1.0f);  | 
454  | 0  |   deskew->set_y(0.0f);  | 
455  | 0  |   reskew->set_x(1.0f);  | 
456  | 0  |   reskew->set_y(0.0f);  | 
457  | 0  | }  | 
458  |  |  | 
459  |  | // Cleans up the lists of blobs in the block ready for use by TabFind.  | 
460  |  | // Large blobs that look like text are moved to the main blobs list.  | 
461  |  | // Main blobs that are superseded by the image blobs are deleted.  | 
462  | 0  | void TabFind::TidyBlobs(TO_BLOCK *block) { | 
463  | 0  |   BLOBNBOX_IT large_it = &block->large_blobs;  | 
464  | 0  |   BLOBNBOX_IT blob_it = &block->blobs;  | 
465  | 0  |   int b_count = 0;  | 
466  | 0  |   for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) { | 
467  | 0  |     BLOBNBOX *large_blob = large_it.data();  | 
468  | 0  |     if (large_blob->owner() != nullptr) { | 
469  | 0  |       blob_it.add_to_end(large_it.extract());  | 
470  | 0  |       ++b_count;  | 
471  | 0  |     }  | 
472  | 0  |   }  | 
473  | 0  |   if (textord_debug_tabfind) { | 
474  | 0  |     tprintf("Moved %d large blobs to normal list\n", b_count); | 
475  |  | #ifndef GRAPHICS_DISABLED  | 
476  |  |     ScrollView *rej_win = MakeWindow(500, 300, "Image blobs");  | 
477  |  |     block->plot_graded_blobs(rej_win);  | 
478  |  |     block->plot_noise_blobs(rej_win);  | 
479  |  |     rej_win->Update();  | 
480  |  | #endif // !GRAPHICS_DISABLED  | 
481  | 0  |   }  | 
482  | 0  |   block->DeleteUnownedNoise();  | 
483  | 0  | }  | 
484  |  |  | 
485  |  | // Helper function to setup search limits for *TabForBox.  | 
486  | 0  | void TabFind::SetupTabSearch(int x, int y, int *min_key, int *max_key) { | 
487  | 0  |   int key1 = TabVector::SortKey(vertical_skew_, x, (y + tright_.y()) / 2);  | 
488  | 0  |   int key2 = TabVector::SortKey(vertical_skew_, x, (y + bleft_.y()) / 2);  | 
489  | 0  |   *min_key = std::min(key1, key2);  | 
490  | 0  |   *max_key = std::max(key1, key2);  | 
491  | 0  | }  | 
492  |  |  | 
493  |  | #ifndef GRAPHICS_DISABLED  | 
494  |  |  | 
495  |  | ScrollView *TabFind::DisplayTabVectors(ScrollView *tab_win) { | 
496  |  |   // For every vector, display it.  | 
497  |  |   TabVector_IT it(&vectors_);  | 
498  |  |   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | 
499  |  |     TabVector *vector = it.data();  | 
500  |  |     vector->Display(tab_win);  | 
501  |  |   }  | 
502  |  |   tab_win->Update();  | 
503  |  |   return tab_win;  | 
504  |  | }  | 
505  |  |  | 
506  |  | #endif  | 
507  |  |  | 
508  |  | // PRIVATE CODE.  | 
509  |  | //  | 
510  |  | // First part of FindTabVectors, which may be used twice if the text  | 
511  |  | // is mostly of vertical alignment.  | 
512  |  | ScrollView *TabFind::FindInitialTabVectors(BLOBNBOX_LIST *image_blobs, int min_gutter_width,  | 
513  | 0  |                                            double tabfind_aligned_gap_fraction, TO_BLOCK *block) { | 
514  |  | #ifndef GRAPHICS_DISABLED  | 
515  |  |   if (textord_tabfind_show_initialtabs) { | 
516  |  |     ScrollView *line_win = MakeWindow(0, 0, "VerticalLines");  | 
517  |  |     line_win = DisplayTabVectors(line_win);  | 
518  |  |   }  | 
519  |  | #endif  | 
520  |  |   // Prepare the grid.  | 
521  | 0  |   if (image_blobs != nullptr) { | 
522  | 0  |     InsertBlobsToGrid(true, false, image_blobs, this);  | 
523  | 0  |   }  | 
524  | 0  |   InsertBlobsToGrid(true, false, &block->blobs, this);  | 
525  | 0  |   ScrollView *initial_win = FindTabBoxes(min_gutter_width, tabfind_aligned_gap_fraction);  | 
526  | 0  |   FindAllTabVectors(min_gutter_width);  | 
527  |  | 
  | 
528  | 0  |   TabVector::MergeSimilarTabVectors(vertical_skew_, &vectors_, this);  | 
529  | 0  |   SortVectors();  | 
530  | 0  |   EvaluateTabs();  | 
531  |  | #ifndef GRAPHICS_DISABLED  | 
532  |  |   if (textord_tabfind_show_initialtabs && initial_win != nullptr) { | 
533  |  |     initial_win = DisplayTabVectors(initial_win);  | 
534  |  |   }  | 
535  |  | #endif  | 
536  | 0  |   MarkVerticalText();  | 
537  | 0  |   return initial_win;  | 
538  | 0  | }  | 
539  |  |  | 
540  |  | #ifndef GRAPHICS_DISABLED  | 
541  |  |  | 
542  |  | // Helper displays all the boxes in the given vector on the given window.  | 
543  |  | static void DisplayBoxVector(const std::vector<BLOBNBOX *> &boxes, ScrollView *win) { | 
544  |  |   for (auto boxe : boxes) { | 
545  |  |     TBOX box = boxe->bounding_box();  | 
546  |  |     int left_x = box.left();  | 
547  |  |     int right_x = box.right();  | 
548  |  |     int top_y = box.top();  | 
549  |  |     int bottom_y = box.bottom();  | 
550  |  |     ScrollView::Color box_color = boxe->BoxColor();  | 
551  |  |     win->Pen(box_color);  | 
552  |  |     win->Rectangle(left_x, bottom_y, right_x, top_y);  | 
553  |  |   }  | 
554  |  |   win->Update();  | 
555  |  | }  | 
556  |  |  | 
557  |  | #endif // !GRAPHICS_DISABLED  | 
558  |  |  | 
559  |  | // For each box in the grid, decide whether it is a candidate tab-stop,  | 
560  |  | // and if so add it to the left/right tab boxes.  | 
561  | 0  | ScrollView *TabFind::FindTabBoxes(int min_gutter_width, double tabfind_aligned_gap_fraction) { | 
562  | 0  |   left_tab_boxes_.clear();  | 
563  | 0  |   right_tab_boxes_.clear();  | 
564  |  |   // For every bbox in the grid, determine whether it uses a tab on an edge.  | 
565  | 0  |   BlobGridSearch gsearch(this);  | 
566  | 0  |   gsearch.StartFullSearch();  | 
567  | 0  |   BLOBNBOX *bbox;  | 
568  | 0  |   while ((bbox = gsearch.NextFullSearch()) != nullptr) { | 
569  | 0  |     if (TestBoxForTabs(bbox, min_gutter_width, tabfind_aligned_gap_fraction)) { | 
570  |  |       // If it is any kind of tab, insert it into the vectors.  | 
571  | 0  |       if (bbox->left_tab_type() != TT_NONE) { | 
572  | 0  |         left_tab_boxes_.push_back(bbox);  | 
573  | 0  |       }  | 
574  | 0  |       if (bbox->right_tab_type() != TT_NONE) { | 
575  | 0  |         right_tab_boxes_.push_back(bbox);  | 
576  | 0  |       }  | 
577  | 0  |     }  | 
578  | 0  |   }  | 
579  |  |   // Sort left tabs by left and right by right to see the outermost one first  | 
580  |  |   // on a ragged tab.  | 
581  | 0  |   std::sort(left_tab_boxes_.begin(), left_tab_boxes_.end(), StdSortByBoxLeft<BLOBNBOX>);  | 
582  | 0  |   std::sort(right_tab_boxes_.begin(), right_tab_boxes_.end(), StdSortRightToLeft<BLOBNBOX>);  | 
583  | 0  |   ScrollView *tab_win = nullptr;  | 
584  |  | #ifndef GRAPHICS_DISABLED  | 
585  |  |   if (textord_tabfind_show_initialtabs) { | 
586  |  |     tab_win = MakeWindow(0, 100, "InitialTabs");  | 
587  |  |     tab_win->Pen(ScrollView::BLUE);  | 
588  |  |     tab_win->Brush(ScrollView::NONE);  | 
589  |  |     // Display the left and right tab boxes.  | 
590  |  |     DisplayBoxVector(left_tab_boxes_, tab_win);  | 
591  |  |     DisplayBoxVector(right_tab_boxes_, tab_win);  | 
592  |  |     tab_win = DisplayTabs("Tabs", tab_win); | 
593  |  |   }  | 
594  |  | #endif // !GRAPHICS_DISABLED  | 
595  | 0  |   return tab_win;  | 
596  | 0  | }  | 
597  |  |  | 
598  |  | bool TabFind::TestBoxForTabs(BLOBNBOX *bbox, int min_gutter_width,  | 
599  | 0  |                              double tabfind_aligned_gap_fraction) { | 
600  | 0  |   GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> radsearch(this);  | 
601  | 0  |   TBOX box = bbox->bounding_box();  | 
602  |  |   // If there are separator lines, get the column edges.  | 
603  | 0  |   int left_column_edge = bbox->left_rule();  | 
604  | 0  |   int right_column_edge = bbox->right_rule();  | 
605  |  |   // The edges of the bounding box of the blob being processed.  | 
606  | 0  |   int left_x = box.left();  | 
607  | 0  |   int right_x = box.right();  | 
608  | 0  |   int top_y = box.top();  | 
609  | 0  |   int bottom_y = box.bottom();  | 
610  | 0  |   int height = box.height();  | 
611  | 0  |   bool debug = WithinTestRegion(3, left_x, top_y);  | 
612  | 0  |   if (debug) { | 
613  | 0  |     tprintf("Column edges for blob at (%d,%d)->(%d,%d) are [%d, %d]\n", left_x, top_y, right_x, | 
614  | 0  |             bottom_y, left_column_edge, right_column_edge);  | 
615  | 0  |   }  | 
616  |  |   // Compute a search radius based on a multiple of the height.  | 
617  | 0  |   int radius = (height * kTabRadiusFactor + gridsize_ - 1) / gridsize_;  | 
618  | 0  |   radsearch.StartRadSearch((left_x + right_x) / 2, (top_y + bottom_y) / 2, radius);  | 
619  |  |   // In Vertical Page mode, once we have an estimate of the vertical line  | 
620  |  |   // spacing, the minimum amount of gutter space before a possible tab is  | 
621  |  |   // increased under the assumption that column partition is always larger  | 
622  |  |   // than line spacing.  | 
623  | 0  |   int min_spacing = static_cast<int>(height * tabfind_aligned_gap_fraction);  | 
624  | 0  |   if (min_gutter_width > min_spacing) { | 
625  | 0  |     min_spacing = min_gutter_width;  | 
626  | 0  |   }  | 
627  | 0  |   int min_ragged_gutter = kRaggedGutterMultiple * gridsize();  | 
628  | 0  |   if (min_gutter_width > min_ragged_gutter) { | 
629  | 0  |     min_ragged_gutter = min_gutter_width;  | 
630  | 0  |   }  | 
631  | 0  |   int target_right = left_x - min_spacing;  | 
632  | 0  |   int target_left = right_x + min_spacing;  | 
633  |  |   // We will be evaluating whether the left edge could be a left tab, and  | 
634  |  |   // whether the right edge could be a right tab.  | 
635  |  |   // A box can be a tab if its bool is_(left/right)_tab remains true, meaning  | 
636  |  |   // that no blobs have been found in the gutter during the radial search.  | 
637  |  |   // A box can also be a tab if there are objects in the gutter only above  | 
638  |  |   // or only below, and there are aligned objects on the opposite side, but  | 
639  |  |   // not too many unaligned objects. The maybe_(left/right)_tab_up counts  | 
640  |  |   // aligned objects above and negatively counts unaligned objects above,  | 
641  |  |   // and is set to -INT32_MAX if a gutter object is found above.  | 
642  |  |   // The other 3 maybe ints work similarly for the other sides.  | 
643  |  |   // These conditions are very strict, to minimize false positives, and really  | 
644  |  |   // only aligned tabs and outermost ragged tab blobs will qualify, so we  | 
645  |  |   // also have maybe_ragged_left/right with less stringent rules.  | 
646  |  |   // A blob that is maybe_ragged_left/right will be further qualified later,  | 
647  |  |   // using the min_ragged_gutter.  | 
648  | 0  |   bool is_left_tab = true;  | 
649  | 0  |   bool is_right_tab = true;  | 
650  | 0  |   bool maybe_ragged_left = true;  | 
651  | 0  |   bool maybe_ragged_right = true;  | 
652  | 0  |   int maybe_left_tab_up = 0;  | 
653  | 0  |   int maybe_right_tab_up = 0;  | 
654  | 0  |   int maybe_left_tab_down = 0;  | 
655  | 0  |   int maybe_right_tab_down = 0;  | 
656  | 0  |   if (bbox->leader_on_left()) { | 
657  | 0  |     is_left_tab = false;  | 
658  | 0  |     maybe_ragged_left = false;  | 
659  | 0  |     maybe_left_tab_up = -INT32_MAX;  | 
660  | 0  |     maybe_left_tab_down = -INT32_MAX;  | 
661  | 0  |   }  | 
662  | 0  |   if (bbox->leader_on_right()) { | 
663  | 0  |     is_right_tab = false;  | 
664  | 0  |     maybe_ragged_right = false;  | 
665  | 0  |     maybe_right_tab_up = -INT32_MAX;  | 
666  | 0  |     maybe_right_tab_down = -INT32_MAX;  | 
667  | 0  |   }  | 
668  | 0  |   int alignment_tolerance = static_cast<int>(resolution_ * kAlignedFraction);  | 
669  | 0  |   BLOBNBOX *neighbour = nullptr;  | 
670  | 0  |   while ((neighbour = radsearch.NextRadSearch()) != nullptr) { | 
671  | 0  |     if (neighbour == bbox) { | 
672  | 0  |       continue;  | 
673  | 0  |     }  | 
674  | 0  |     TBOX nbox = neighbour->bounding_box();  | 
675  | 0  |     int n_left = nbox.left();  | 
676  | 0  |     int n_right = nbox.right();  | 
677  | 0  |     if (debug) { | 
678  | 0  |       tprintf("Neighbour at (%d,%d)->(%d,%d)\n", n_left, nbox.bottom(), n_right, nbox.top()); | 
679  | 0  |     }  | 
680  |  |     // If the neighbouring blob is the wrong side of a separator line, then it  | 
681  |  |     // "doesn't exist" as far as we are concerned.  | 
682  | 0  |     if (n_right > right_column_edge || n_left < left_column_edge ||  | 
683  | 0  |         left_x < neighbour->left_rule() || right_x > neighbour->right_rule()) { | 
684  | 0  |       continue; // Separator line in the way.  | 
685  | 0  |     }  | 
686  | 0  |     int n_mid_x = (n_left + n_right) / 2;  | 
687  | 0  |     int n_mid_y = (nbox.top() + nbox.bottom()) / 2;  | 
688  | 0  |     if (n_mid_x <= left_x && n_right >= target_right) { | 
689  | 0  |       if (debug) { | 
690  | 0  |         tprintf("Not a left tab\n"); | 
691  | 0  |       }  | 
692  | 0  |       is_left_tab = false;  | 
693  | 0  |       if (n_mid_y < top_y) { | 
694  | 0  |         maybe_left_tab_down = -INT32_MAX;  | 
695  | 0  |       }  | 
696  | 0  |       if (n_mid_y > bottom_y) { | 
697  | 0  |         maybe_left_tab_up = -INT32_MAX;  | 
698  | 0  |       }  | 
699  | 0  |     } else if (NearlyEqual(left_x, n_left, alignment_tolerance)) { | 
700  | 0  |       if (debug) { | 
701  | 0  |         tprintf("Maybe a left tab\n"); | 
702  | 0  |       }  | 
703  | 0  |       if (n_mid_y > top_y && maybe_left_tab_up > -INT32_MAX) { | 
704  | 0  |         ++maybe_left_tab_up;  | 
705  | 0  |       }  | 
706  | 0  |       if (n_mid_y < bottom_y && maybe_left_tab_down > -INT32_MAX) { | 
707  | 0  |         ++maybe_left_tab_down;  | 
708  | 0  |       }  | 
709  | 0  |     } else if (n_left < left_x && n_right >= left_x) { | 
710  |  |       // Overlaps but not aligned so negative points on a maybe.  | 
711  | 0  |       if (debug) { | 
712  | 0  |         tprintf("Maybe Not a left tab\n"); | 
713  | 0  |       }  | 
714  | 0  |       if (n_mid_y > top_y && maybe_left_tab_up > -INT32_MAX) { | 
715  | 0  |         --maybe_left_tab_up;  | 
716  | 0  |       }  | 
717  | 0  |       if (n_mid_y < bottom_y && maybe_left_tab_down > -INT32_MAX) { | 
718  | 0  |         --maybe_left_tab_down;  | 
719  | 0  |       }  | 
720  | 0  |     }  | 
721  | 0  |     if (n_left < left_x && nbox.y_overlap(box) && n_right >= target_right) { | 
722  | 0  |       maybe_ragged_left = false;  | 
723  | 0  |       if (debug) { | 
724  | 0  |         tprintf("Not a ragged left\n"); | 
725  | 0  |       }  | 
726  | 0  |     }  | 
727  | 0  |     if (n_mid_x >= right_x && n_left <= target_left) { | 
728  | 0  |       if (debug) { | 
729  | 0  |         tprintf("Not a right tab\n"); | 
730  | 0  |       }  | 
731  | 0  |       is_right_tab = false;  | 
732  | 0  |       if (n_mid_y < top_y) { | 
733  | 0  |         maybe_right_tab_down = -INT32_MAX;  | 
734  | 0  |       }  | 
735  | 0  |       if (n_mid_y > bottom_y) { | 
736  | 0  |         maybe_right_tab_up = -INT32_MAX;  | 
737  | 0  |       }  | 
738  | 0  |     } else if (NearlyEqual(right_x, n_right, alignment_tolerance)) { | 
739  | 0  |       if (debug) { | 
740  | 0  |         tprintf("Maybe a right tab\n"); | 
741  | 0  |       }  | 
742  | 0  |       if (n_mid_y > top_y && maybe_right_tab_up > -INT32_MAX) { | 
743  | 0  |         ++maybe_right_tab_up;  | 
744  | 0  |       }  | 
745  | 0  |       if (n_mid_y < bottom_y && maybe_right_tab_down > -INT32_MAX) { | 
746  | 0  |         ++maybe_right_tab_down;  | 
747  | 0  |       }  | 
748  | 0  |     } else if (n_right > right_x && n_left <= right_x) { | 
749  |  |       // Overlaps but not aligned so negative points on a maybe.  | 
750  | 0  |       if (debug) { | 
751  | 0  |         tprintf("Maybe Not a right tab\n"); | 
752  | 0  |       }  | 
753  | 0  |       if (n_mid_y > top_y && maybe_right_tab_up > -INT32_MAX) { | 
754  | 0  |         --maybe_right_tab_up;  | 
755  | 0  |       }  | 
756  | 0  |       if (n_mid_y < bottom_y && maybe_right_tab_down > -INT32_MAX) { | 
757  | 0  |         --maybe_right_tab_down;  | 
758  | 0  |       }  | 
759  | 0  |     }  | 
760  | 0  |     if (n_right > right_x && nbox.y_overlap(box) && n_left <= target_left) { | 
761  | 0  |       maybe_ragged_right = false;  | 
762  | 0  |       if (debug) { | 
763  | 0  |         tprintf("Not a ragged right\n"); | 
764  | 0  |       }  | 
765  | 0  |     }  | 
766  | 0  |     if (maybe_left_tab_down == -INT32_MAX && maybe_left_tab_up == -INT32_MAX &&  | 
767  | 0  |         maybe_right_tab_down == -INT32_MAX && maybe_right_tab_up == -INT32_MAX) { | 
768  | 0  |       break;  | 
769  | 0  |     }  | 
770  | 0  |   }  | 
771  | 0  |   if (is_left_tab || maybe_left_tab_up > 1 || maybe_left_tab_down > 1) { | 
772  | 0  |     bbox->set_left_tab_type(TT_MAYBE_ALIGNED);  | 
773  | 0  |   } else if (maybe_ragged_left && ConfirmRaggedLeft(bbox, min_ragged_gutter)) { | 
774  | 0  |     bbox->set_left_tab_type(TT_MAYBE_RAGGED);  | 
775  | 0  |   } else { | 
776  | 0  |     bbox->set_left_tab_type(TT_NONE);  | 
777  | 0  |   }  | 
778  | 0  |   if (is_right_tab || maybe_right_tab_up > 1 || maybe_right_tab_down > 1) { | 
779  | 0  |     bbox->set_right_tab_type(TT_MAYBE_ALIGNED);  | 
780  | 0  |   } else if (maybe_ragged_right && ConfirmRaggedRight(bbox, min_ragged_gutter)) { | 
781  | 0  |     bbox->set_right_tab_type(TT_MAYBE_RAGGED);  | 
782  | 0  |   } else { | 
783  | 0  |     bbox->set_right_tab_type(TT_NONE);  | 
784  | 0  |   }  | 
785  | 0  |   if (debug) { | 
786  | 0  |     tprintf("Left result = %s, Right result=%s\n", | 
787  | 0  |             bbox->left_tab_type() == TT_MAYBE_ALIGNED  | 
788  | 0  |                 ? "Aligned"  | 
789  | 0  |                 : (bbox->left_tab_type() == TT_MAYBE_RAGGED ? "Ragged" : "None"),  | 
790  | 0  |             bbox->right_tab_type() == TT_MAYBE_ALIGNED  | 
791  | 0  |                 ? "Aligned"  | 
792  | 0  |                 : (bbox->right_tab_type() == TT_MAYBE_RAGGED ? "Ragged" : "None"));  | 
793  | 0  |   }  | 
794  | 0  |   return bbox->left_tab_type() != TT_NONE || bbox->right_tab_type() != TT_NONE;  | 
795  | 0  | }  | 
796  |  |  | 
797  |  | // Returns true if there is nothing in the rectangle of width min_gutter to  | 
798  |  | // the left of bbox.  | 
799  | 0  | bool TabFind::ConfirmRaggedLeft(BLOBNBOX *bbox, int min_gutter) { | 
800  | 0  |   TBOX search_box(bbox->bounding_box());  | 
801  | 0  |   search_box.set_right(search_box.left());  | 
802  | 0  |   search_box.set_left(search_box.left() - min_gutter);  | 
803  | 0  |   return NothingYOverlapsInBox(search_box, bbox->bounding_box());  | 
804  | 0  | }  | 
805  |  |  | 
806  |  | // Returns true if there is nothing in the rectangle of width min_gutter to  | 
807  |  | // the right of bbox.  | 
808  | 0  | bool TabFind::ConfirmRaggedRight(BLOBNBOX *bbox, int min_gutter) { | 
809  | 0  |   TBOX search_box(bbox->bounding_box());  | 
810  | 0  |   search_box.set_left(search_box.right());  | 
811  | 0  |   search_box.set_right(search_box.right() + min_gutter);  | 
812  | 0  |   return NothingYOverlapsInBox(search_box, bbox->bounding_box());  | 
813  | 0  | }  | 
814  |  |  | 
815  |  | // Returns true if there is nothing in the given search_box that vertically  | 
816  |  | // overlaps target_box other than target_box itself.  | 
817  | 0  | bool TabFind::NothingYOverlapsInBox(const TBOX &search_box, const TBOX &target_box) { | 
818  | 0  |   BlobGridSearch rsearch(this);  | 
819  | 0  |   rsearch.StartRectSearch(search_box);  | 
820  | 0  |   BLOBNBOX *blob;  | 
821  | 0  |   while ((blob = rsearch.NextRectSearch()) != nullptr) { | 
822  | 0  |     const TBOX &box = blob->bounding_box();  | 
823  | 0  |     if (box.y_overlap(target_box) && !(box == target_box)) { | 
824  | 0  |       return false;  | 
825  | 0  |     }  | 
826  | 0  |   }  | 
827  | 0  |   return true;  | 
828  | 0  | }  | 
829  |  |  | 
830  | 0  | void TabFind::FindAllTabVectors(int min_gutter_width) { | 
831  |  |   // A list of vectors that will be created in estimating the skew.  | 
832  | 0  |   TabVector_LIST dummy_vectors;  | 
833  |  |   // An estimate of the vertical direction, revised as more lines are added.  | 
834  | 0  |   int vertical_x = 0;  | 
835  | 0  |   int vertical_y = 1;  | 
836  |  |   // Find an estimate of the vertical direction by finding some tab vectors.  | 
837  |  |   // Slowly up the search size until we get some vectors.  | 
838  | 0  |   for (int search_size = kMinVerticalSearch; search_size < kMaxVerticalSearch;  | 
839  | 0  |        search_size += kMinVerticalSearch) { | 
840  | 0  |     int vector_count = FindTabVectors(search_size, TA_LEFT_ALIGNED, min_gutter_width,  | 
841  | 0  |                                       &dummy_vectors, &vertical_x, &vertical_y);  | 
842  | 0  |     vector_count += FindTabVectors(search_size, TA_RIGHT_ALIGNED, min_gutter_width, &dummy_vectors,  | 
843  | 0  |                                    &vertical_x, &vertical_y);  | 
844  | 0  |     if (vector_count > 0) { | 
845  | 0  |       break;  | 
846  | 0  |     }  | 
847  | 0  |   }  | 
848  |  |   // Get rid of the test vectors and reset the types of the tabs.  | 
849  | 0  |   dummy_vectors.clear();  | 
850  | 0  |   for (auto bbox : left_tab_boxes_) { | 
851  | 0  |     if (bbox->left_tab_type() == TT_CONFIRMED) { | 
852  | 0  |       bbox->set_left_tab_type(TT_MAYBE_ALIGNED);  | 
853  | 0  |     }  | 
854  | 0  |   }  | 
855  | 0  |   for (auto bbox : right_tab_boxes_) { | 
856  | 0  |     if (bbox->right_tab_type() == TT_CONFIRMED) { | 
857  | 0  |       bbox->set_right_tab_type(TT_MAYBE_ALIGNED);  | 
858  | 0  |     }  | 
859  | 0  |   }  | 
860  | 0  |   if (textord_debug_tabfind) { | 
861  | 0  |     tprintf("Beginning real tab search with vertical = %d,%d...\n", vertical_x, vertical_y); | 
862  | 0  |   }  | 
863  |  |   // Now do the real thing ,but keep the vectors in the dummy_vectors list  | 
864  |  |   // until they are all done, so we don't get the tab vectors confused with  | 
865  |  |   // the rule line vectors.  | 
866  | 0  |   FindTabVectors(kMaxVerticalSearch, TA_LEFT_ALIGNED, min_gutter_width, &dummy_vectors, &vertical_x,  | 
867  | 0  |                  &vertical_y);  | 
868  | 0  |   FindTabVectors(kMaxVerticalSearch, TA_RIGHT_ALIGNED, min_gutter_width, &dummy_vectors,  | 
869  | 0  |                  &vertical_x, &vertical_y);  | 
870  | 0  |   FindTabVectors(kMaxRaggedSearch, TA_LEFT_RAGGED, min_gutter_width, &dummy_vectors, &vertical_x,  | 
871  | 0  |                  &vertical_y);  | 
872  | 0  |   FindTabVectors(kMaxRaggedSearch, TA_RIGHT_RAGGED, min_gutter_width, &dummy_vectors, &vertical_x,  | 
873  | 0  |                  &vertical_y);  | 
874  |  |   // Now add the vectors to the vectors_ list.  | 
875  | 0  |   TabVector_IT v_it(&vectors_);  | 
876  | 0  |   v_it.add_list_after(&dummy_vectors);  | 
877  |  |   // Now use the summed (mean) vertical vector as the direction for everything.  | 
878  | 0  |   SetVerticalSkewAndParallelize(vertical_x, vertical_y);  | 
879  | 0  | }  | 
880  |  |  | 
881  |  | // Helper for FindAllTabVectors finds the vectors of a particular type.  | 
882  |  | int TabFind::FindTabVectors(int search_size_multiple, TabAlignment alignment, int min_gutter_width,  | 
883  | 0  |                             TabVector_LIST *vectors, int *vertical_x, int *vertical_y) { | 
884  | 0  |   TabVector_IT vector_it(vectors);  | 
885  | 0  |   int vector_count = 0;  | 
886  |  |   // Search the right or left tab boxes, looking for tab vectors.  | 
887  | 0  |   bool right = alignment == TA_RIGHT_ALIGNED || alignment == TA_RIGHT_RAGGED;  | 
888  | 0  |   const std::vector<BLOBNBOX *> &boxes = right ? right_tab_boxes_ : left_tab_boxes_;  | 
889  | 0  |   for (auto bbox : boxes) { | 
890  | 0  |     if ((!right && bbox->left_tab_type() == TT_MAYBE_ALIGNED) ||  | 
891  | 0  |         (right && bbox->right_tab_type() == TT_MAYBE_ALIGNED)) { | 
892  | 0  |       TabVector *vector = FindTabVector(search_size_multiple, min_gutter_width, alignment, bbox,  | 
893  | 0  |                                         vertical_x, vertical_y);  | 
894  | 0  |       if (vector != nullptr) { | 
895  | 0  |         ++vector_count;  | 
896  | 0  |         vector_it.add_to_end(vector);  | 
897  | 0  |       }  | 
898  | 0  |     }  | 
899  | 0  |   }  | 
900  | 0  |   return vector_count;  | 
901  | 0  | }  | 
902  |  |  | 
903  |  | // Finds a vector corresponding to a tabstop running through the  | 
904  |  | // given box of the given alignment type.  | 
905  |  | // search_size_multiple is a multiple of height used to control  | 
906  |  | // the size of the search.  | 
907  |  | // vertical_x and y are updated with an estimate of the real  | 
908  |  | // vertical direction. (skew finding.)  | 
909  |  | // Returns nullptr if no decent tabstop can be found.  | 
910  |  | TabVector *TabFind::FindTabVector(int search_size_multiple, int min_gutter_width,  | 
911  |  |                                   TabAlignment alignment, BLOBNBOX *bbox, int *vertical_x,  | 
912  | 0  |                                   int *vertical_y) { | 
913  | 0  |   int height = std::max(static_cast<int>(bbox->bounding_box().height()), gridsize());  | 
914  | 0  |   AlignedBlobParams align_params(*vertical_x, *vertical_y, height, search_size_multiple,  | 
915  | 0  |                                  min_gutter_width, resolution_, alignment);  | 
916  |  |   // FindVerticalAlignment is in the parent (AlignedBlob) class.  | 
917  | 0  |   return FindVerticalAlignment(align_params, bbox, vertical_x, vertical_y);  | 
918  | 0  | }  | 
919  |  |  | 
920  |  | // Set the vertical_skew_ member from the given vector and refit  | 
921  |  | // all vectors parallel to the skew vector.  | 
922  | 0  | void TabFind::SetVerticalSkewAndParallelize(int vertical_x, int vertical_y) { | 
923  |  |   // Fit the vertical vector into an ICOORD, which is 16 bit.  | 
924  | 0  |   vertical_skew_.set_with_shrink(vertical_x, vertical_y);  | 
925  | 0  |   if (textord_debug_tabfind) { | 
926  | 0  |     tprintf("Vertical skew vector=(%d,%d)\n", vertical_skew_.x(), vertical_skew_.y()); | 
927  | 0  |   }  | 
928  | 0  |   v_it_.set_to_list(&vectors_);  | 
929  | 0  |   for (v_it_.mark_cycle_pt(); !v_it_.cycled_list(); v_it_.forward()) { | 
930  | 0  |     TabVector *v = v_it_.data();  | 
931  | 0  |     v->Fit(vertical_skew_, true);  | 
932  | 0  |   }  | 
933  |  |   // Now sort the vectors as their direction has potentially changed.  | 
934  | 0  |   SortVectors();  | 
935  | 0  | }  | 
936  |  |  | 
937  |  | // Sort all the current vectors using the given vertical direction vector.  | 
938  | 0  | void TabFind::SortVectors() { | 
939  | 0  |   vectors_.sort(TabVector::SortVectorsByKey);  | 
940  | 0  |   v_it_.set_to_list(&vectors_);  | 
941  | 0  | }  | 
942  |  |  | 
943  |  | // Evaluate all the current tab vectors.  | 
944  | 0  | void TabFind::EvaluateTabs() { | 
945  | 0  |   TabVector_IT rule_it(&vectors_);  | 
946  | 0  |   for (rule_it.mark_cycle_pt(); !rule_it.cycled_list(); rule_it.forward()) { | 
947  | 0  |     TabVector *tab = rule_it.data();  | 
948  | 0  |     if (!tab->IsSeparator()) { | 
949  | 0  |       tab->Evaluate(vertical_skew_, this);  | 
950  | 0  |       if (tab->BoxCount() < kMinEvaluatedTabs) { | 
951  | 0  |         if (textord_debug_tabfind > 2) { | 
952  | 0  |           tab->Print("Too few boxes"); | 
953  | 0  |         }  | 
954  | 0  |         delete rule_it.extract();  | 
955  | 0  |         v_it_.set_to_list(&vectors_);  | 
956  | 0  |       } else if (WithinTestRegion(3, tab->startpt().x(), tab->startpt().y())) { | 
957  | 0  |         tab->Print("Evaluated tab"); | 
958  | 0  |       }  | 
959  | 0  |     }  | 
960  | 0  |   }  | 
961  | 0  | }  | 
962  |  |  | 
963  |  | // Trace textlines from one side to the other of each tab vector, saving  | 
964  |  | // the most frequent column widths found in a list so that a given width  | 
965  |  | // can be tested for being a common width with a simple callback function.  | 
966  | 0  | void TabFind::ComputeColumnWidths(ScrollView *tab_win, ColPartitionGrid *part_grid) { | 
967  |  | #ifndef GRAPHICS_DISABLED  | 
968  |  |   if (tab_win != nullptr) { | 
969  |  |     tab_win->Pen(ScrollView::WHITE);  | 
970  |  |   }  | 
971  |  | #endif // !GRAPHICS_DISABLED  | 
972  |  |   // Accumulate column sections into a STATS  | 
973  | 0  |   int col_widths_size = (tright_.x() - bleft_.x()) / kColumnWidthFactor;  | 
974  | 0  |   STATS col_widths(0, col_widths_size);  | 
975  | 0  |   ApplyPartitionsToColumnWidths(part_grid, &col_widths);  | 
976  |  | #ifndef GRAPHICS_DISABLED  | 
977  |  |   if (tab_win != nullptr) { | 
978  |  |     tab_win->Update();  | 
979  |  |   }  | 
980  |  | #endif // !GRAPHICS_DISABLED  | 
981  | 0  |   if (textord_debug_tabfind > 1) { | 
982  | 0  |     col_widths.print();  | 
983  | 0  |   }  | 
984  |  |   // Now make a list of column widths.  | 
985  | 0  |   MakeColumnWidths(col_widths_size, &col_widths);  | 
986  |  |   // Turn the column width into a range.  | 
987  | 0  |   ApplyPartitionsToColumnWidths(part_grid, nullptr);  | 
988  | 0  | }  | 
989  |  |  | 
990  |  | // Finds column width and:  | 
991  |  | //   if col_widths is not null (pass1):  | 
992  |  | //     pair-up tab vectors with existing ColPartitions and accumulate widths.  | 
993  |  | //   else (pass2):  | 
994  |  | //     find the largest real partition width for each recorded column width,  | 
995  |  | //     to be used as the minimum acceptable width.  | 
996  | 0  | void TabFind::ApplyPartitionsToColumnWidths(ColPartitionGrid *part_grid, STATS *col_widths) { | 
997  |  |   // For every ColPartition in the part_grid, add partners to the tabvectors  | 
998  |  |   // and accumulate the column widths.  | 
999  | 0  |   ColPartitionGridSearch gsearch(part_grid);  | 
1000  | 0  |   gsearch.StartFullSearch();  | 
1001  | 0  |   ColPartition *part;  | 
1002  | 0  |   while ((part = gsearch.NextFullSearch()) != nullptr) { | 
1003  | 0  |     BLOBNBOX_C_IT blob_it(part->boxes());  | 
1004  | 0  |     if (blob_it.empty()) { | 
1005  | 0  |       continue;  | 
1006  | 0  |     }  | 
1007  | 0  |     BLOBNBOX *left_blob = blob_it.data();  | 
1008  | 0  |     blob_it.move_to_last();  | 
1009  | 0  |     BLOBNBOX *right_blob = blob_it.data();  | 
1010  | 0  |     TabVector *left_vector = LeftTabForBox(left_blob->bounding_box(), true, false);  | 
1011  | 0  |     if (left_vector == nullptr || left_vector->IsRightTab()) { | 
1012  | 0  |       continue;  | 
1013  | 0  |     }  | 
1014  | 0  |     TabVector *right_vector = RightTabForBox(right_blob->bounding_box(), true, false);  | 
1015  | 0  |     if (right_vector == nullptr || right_vector->IsLeftTab()) { | 
1016  | 0  |       continue;  | 
1017  | 0  |     }  | 
1018  |  |  | 
1019  | 0  |     int line_left = left_vector->XAtY(left_blob->bounding_box().bottom());  | 
1020  | 0  |     int line_right = right_vector->XAtY(right_blob->bounding_box().bottom());  | 
1021  |  |     // Add to STATS of measurements if the width is significant.  | 
1022  | 0  |     int width = line_right - line_left;  | 
1023  | 0  |     if (col_widths != nullptr) { | 
1024  | 0  |       AddPartnerVector(left_blob, right_blob, left_vector, right_vector);  | 
1025  | 0  |       if (width >= kMinColumnWidth) { | 
1026  | 0  |         col_widths->add(width / kColumnWidthFactor, 1);  | 
1027  | 0  |       }  | 
1028  | 0  |     } else { | 
1029  | 0  |       width /= kColumnWidthFactor;  | 
1030  | 0  |       ICOORDELT_IT it(&column_widths_);  | 
1031  | 0  |       for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | 
1032  | 0  |         ICOORDELT *w = it.data();  | 
1033  | 0  |         if (NearlyEqual<int>(width, w->y(), 1)) { | 
1034  | 0  |           int true_width = part->bounding_box().width() / kColumnWidthFactor;  | 
1035  | 0  |           if (true_width <= w->y() && true_width > w->x()) { | 
1036  | 0  |             w->set_x(true_width);  | 
1037  | 0  |           }  | 
1038  | 0  |           break;  | 
1039  | 0  |         }  | 
1040  | 0  |       }  | 
1041  | 0  |     }  | 
1042  | 0  |   }  | 
1043  | 0  | }  | 
1044  |  |  | 
1045  |  | // Helper makes the list of common column widths in column_widths_ from the  | 
1046  |  | // input col_widths. Destroys the content of col_widths by repeatedly  | 
1047  |  | // finding the mode and erasing the peak.  | 
1048  | 0  | void TabFind::MakeColumnWidths(int col_widths_size, STATS *col_widths) { | 
1049  | 0  |   ICOORDELT_IT w_it(&column_widths_);  | 
1050  | 0  |   int total_col_count = col_widths->get_total();  | 
1051  | 0  |   while (col_widths->get_total() > 0) { | 
1052  | 0  |     int width = col_widths->mode();  | 
1053  | 0  |     int col_count = col_widths->pile_count(width);  | 
1054  | 0  |     col_widths->add(width, -col_count);  | 
1055  |  |     // Get the entire peak.  | 
1056  | 0  |     for (int left = width - 1; left > 0 && col_widths->pile_count(left) > 0; --left) { | 
1057  | 0  |       int new_count = col_widths->pile_count(left);  | 
1058  | 0  |       col_count += new_count;  | 
1059  | 0  |       col_widths->add(left, -new_count);  | 
1060  | 0  |     }  | 
1061  | 0  |     for (int right = width + 1; right < col_widths_size && col_widths->pile_count(right) > 0;  | 
1062  | 0  |          ++right) { | 
1063  | 0  |       int new_count = col_widths->pile_count(right);  | 
1064  | 0  |       col_count += new_count;  | 
1065  | 0  |       col_widths->add(right, -new_count);  | 
1066  | 0  |     }  | 
1067  | 0  |     if (col_count > kMinLinesInColumn &&  | 
1068  | 0  |         col_count > kMinFractionalLinesInColumn * total_col_count) { | 
1069  | 0  |       auto *w = new ICOORDELT(0, width);  | 
1070  | 0  |       w_it.add_after_then_move(w);  | 
1071  | 0  |       if (textord_debug_tabfind) { | 
1072  | 0  |         tprintf("Column of width %d has %d = %.2f%% lines\n", width * kColumnWidthFactor, col_count, | 
1073  | 0  |                 100.0 * col_count / total_col_count);  | 
1074  | 0  |       }  | 
1075  | 0  |     }  | 
1076  | 0  |   }  | 
1077  | 0  | }  | 
1078  |  |  | 
1079  |  | // Mark blobs as being in a vertical text line where that is the case.  | 
1080  |  | // Returns true if the majority of the image is vertical text lines.  | 
1081  | 0  | void TabFind::MarkVerticalText() { | 
1082  | 0  |   if (textord_debug_tabfind) { | 
1083  | 0  |     tprintf("Checking for vertical lines\n"); | 
1084  | 0  |   }  | 
1085  | 0  |   BlobGridSearch gsearch(this);  | 
1086  | 0  |   gsearch.StartFullSearch();  | 
1087  | 0  |   BLOBNBOX *blob = nullptr;  | 
1088  | 0  |   while ((blob = gsearch.NextFullSearch()) != nullptr) { | 
1089  | 0  |     if (blob->region_type() < BRT_UNKNOWN) { | 
1090  | 0  |       continue;  | 
1091  | 0  |     }  | 
1092  | 0  |     if (blob->UniquelyVertical()) { | 
1093  | 0  |       blob->set_region_type(BRT_VERT_TEXT);  | 
1094  | 0  |     }  | 
1095  | 0  |   }  | 
1096  | 0  | }  | 
1097  |  |  | 
1098  | 0  | int TabFind::FindMedianGutterWidth(TabVector_LIST *lines) { | 
1099  | 0  |   TabVector_IT it(lines);  | 
1100  | 0  |   int prev_right = -1;  | 
1101  | 0  |   int max_gap = static_cast<int>(kMaxGutterWidthAbsolute * resolution_);  | 
1102  | 0  |   STATS gaps(0, max_gap - 1);  | 
1103  | 0  |   STATS heights(0, max_gap - 1);  | 
1104  | 0  |   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | 
1105  | 0  |     TabVector *v = it.data();  | 
1106  | 0  |     TabVector *partner = v->GetSinglePartner();  | 
1107  | 0  |     if (!v->IsLeftTab() || v->IsSeparator() || !partner) { | 
1108  | 0  |       continue;  | 
1109  | 0  |     }  | 
1110  | 0  |     heights.add(partner->startpt().x() - v->startpt().x(), 1);  | 
1111  | 0  |     if (prev_right > 0 && v->startpt().x() > prev_right) { | 
1112  | 0  |       gaps.add(v->startpt().x() - prev_right, 1);  | 
1113  | 0  |     }  | 
1114  | 0  |     prev_right = partner->startpt().x();  | 
1115  | 0  |   }  | 
1116  | 0  |   if (textord_debug_tabfind) { | 
1117  | 0  |     tprintf("TabGutter total %d  median_gap %.2f  median_hgt %.2f\n", gaps.get_total(), | 
1118  | 0  |             gaps.median(), heights.median());  | 
1119  | 0  |   }  | 
1120  | 0  |   if (gaps.get_total() < kMinLinesInColumn) { | 
1121  | 0  |     return 0;  | 
1122  | 0  |   }  | 
1123  | 0  |   return static_cast<int>(gaps.median());  | 
1124  | 0  | }  | 
1125  |  |  | 
1126  |  | // Find the next adjacent (looking to the left or right) blob on this text  | 
1127  |  | // line, with the constraint that it must vertically significantly overlap  | 
1128  |  | // the [top_y, bottom_y] range.  | 
1129  |  | // If ignore_images is true, then blobs with aligned_text() < 0 are treated  | 
1130  |  | // as if they do not exist.  | 
1131  |  | BLOBNBOX *TabFind::AdjacentBlob(const BLOBNBOX *bbox, bool look_left, bool ignore_images,  | 
1132  |  |                                 double min_overlap_fraction, int gap_limit, int top_y,  | 
1133  | 0  |                                 int bottom_y) { | 
1134  | 0  |   GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> sidesearch(this);  | 
1135  | 0  |   const TBOX &box = bbox->bounding_box();  | 
1136  | 0  |   int left = box.left();  | 
1137  | 0  |   int right = box.right();  | 
1138  | 0  |   int mid_x = (left + right) / 2;  | 
1139  | 0  |   sidesearch.StartSideSearch(mid_x, bottom_y, top_y);  | 
1140  | 0  |   int best_gap = 0;  | 
1141  | 0  |   bool debug = WithinTestRegion(3, left, bottom_y);  | 
1142  | 0  |   BLOBNBOX *result = nullptr;  | 
1143  | 0  |   BLOBNBOX *neighbour = nullptr;  | 
1144  | 0  |   while ((neighbour = sidesearch.NextSideSearch(look_left)) != nullptr) { | 
1145  | 0  |     if (debug) { | 
1146  | 0  |       tprintf("Adjacent blob: considering box:"); | 
1147  | 0  |       neighbour->bounding_box().print();  | 
1148  | 0  |     }  | 
1149  | 0  |     if (neighbour == bbox || (ignore_images && neighbour->region_type() < BRT_UNKNOWN)) { | 
1150  | 0  |       continue;  | 
1151  | 0  |     }  | 
1152  | 0  |     const TBOX &nbox = neighbour->bounding_box();  | 
1153  | 0  |     int n_top_y = nbox.top();  | 
1154  | 0  |     int n_bottom_y = nbox.bottom();  | 
1155  | 0  |     int v_overlap = std::min(n_top_y, top_y) - std::max(n_bottom_y, bottom_y);  | 
1156  | 0  |     int height = top_y - bottom_y;  | 
1157  | 0  |     int n_height = n_top_y - n_bottom_y;  | 
1158  | 0  |     if (v_overlap > min_overlap_fraction * std::min(height, n_height) &&  | 
1159  | 0  |         (min_overlap_fraction == 0.0 || !DifferentSizes(height, n_height))) { | 
1160  | 0  |       int n_left = nbox.left();  | 
1161  | 0  |       int n_right = nbox.right();  | 
1162  | 0  |       int h_gap = std::max(n_left, left) - std::min(n_right, right);  | 
1163  | 0  |       int n_mid_x = (n_left + n_right) / 2;  | 
1164  | 0  |       if (look_left == (n_mid_x < mid_x) && n_mid_x != mid_x) { | 
1165  | 0  |         if (h_gap > gap_limit) { | 
1166  |  |           // Hit a big gap before next tab so don't return anything.  | 
1167  | 0  |           if (debug) { | 
1168  | 0  |             tprintf("Giving up due to big gap = %d vs %d\n", h_gap, gap_limit); | 
1169  | 0  |           }  | 
1170  | 0  |           return result;  | 
1171  | 0  |         }  | 
1172  | 0  |         if (h_gap > 0 && (look_left ? neighbour->right_tab_type() : neighbour->left_tab_type()) >=  | 
1173  | 0  |                              TT_CONFIRMED) { | 
1174  |  |           // Hit a tab facing the wrong way. Stop in case we are crossing  | 
1175  |  |           // the column boundary.  | 
1176  | 0  |           if (debug) { | 
1177  | 0  |             tprintf("Collision with like tab of type %d at %d,%d\n", | 
1178  | 0  |                     look_left ? neighbour->right_tab_type() : neighbour->left_tab_type(), n_left,  | 
1179  | 0  |                     nbox.bottom());  | 
1180  | 0  |           }  | 
1181  | 0  |           return result;  | 
1182  | 0  |         }  | 
1183  |  |         // This is a good fit to the line. Continue with this  | 
1184  |  |         // neighbour as the bbox if the best gap.  | 
1185  | 0  |         if (result == nullptr || h_gap < best_gap) { | 
1186  | 0  |           if (debug) { | 
1187  | 0  |             tprintf("Good result\n"); | 
1188  | 0  |           }  | 
1189  | 0  |           result = neighbour;  | 
1190  | 0  |           best_gap = h_gap;  | 
1191  | 0  |         } else { | 
1192  |  |           // The new one is worse, so we probably already have the best result.  | 
1193  | 0  |           return result;  | 
1194  | 0  |         }  | 
1195  | 0  |       } else if (debug) { | 
1196  | 0  |         tprintf("Wrong way\n"); | 
1197  | 0  |       }  | 
1198  | 0  |     } else if (debug) { | 
1199  | 0  |       tprintf("Insufficient overlap\n"); | 
1200  | 0  |     }  | 
1201  | 0  |   }  | 
1202  | 0  |   if (WithinTestRegion(3, left, box.top())) { | 
1203  | 0  |     tprintf("Giving up due to end of search\n"); | 
1204  | 0  |   }  | 
1205  | 0  |   return result; // Hit the edge and found nothing.  | 
1206  | 0  | }  | 
1207  |  |  | 
1208  |  | // Add a bi-directional partner relationship between the left  | 
1209  |  | // and the right. If one (or both) of the vectors is a separator,  | 
1210  |  | // extend a nearby extendable vector or create a new one of the  | 
1211  |  | // correct type, using the given left or right blob as a guide.  | 
1212  |  | void TabFind::AddPartnerVector(BLOBNBOX *left_blob, BLOBNBOX *right_blob, TabVector *left,  | 
1213  | 0  |                                TabVector *right) { | 
1214  | 0  |   const TBOX &left_box = left_blob->bounding_box();  | 
1215  | 0  |   const TBOX &right_box = right_blob->bounding_box();  | 
1216  | 0  |   if (left->IsSeparator()) { | 
1217  |  |     // Try to find a nearby left edge to extend.  | 
1218  | 0  |     TabVector *v = LeftTabForBox(left_box, true, true);  | 
1219  | 0  |     if (v != nullptr && v != left && v->IsLeftTab() &&  | 
1220  | 0  |         v->XAtY(left_box.top()) > left->XAtY(left_box.top())) { | 
1221  | 0  |       left = v; // Found a good replacement.  | 
1222  | 0  |       left->ExtendToBox(left_blob);  | 
1223  | 0  |     } else { | 
1224  |  |       // Fake a vector.  | 
1225  | 0  |       left = new TabVector(*left, TA_LEFT_RAGGED, vertical_skew_, left_blob);  | 
1226  | 0  |       vectors_.add_sorted(TabVector::SortVectorsByKey, left);  | 
1227  | 0  |       v_it_.move_to_first();  | 
1228  | 0  |     }  | 
1229  | 0  |   }  | 
1230  | 0  |   if (right->IsSeparator()) { | 
1231  |  |     // Try to find a nearby left edge to extend.  | 
1232  | 0  |     if (WithinTestRegion(3, right_box.right(), right_box.bottom())) { | 
1233  | 0  |       tprintf("Box edge (%d,%d-%d)", right_box.right(), right_box.bottom(), right_box.top()); | 
1234  | 0  |       right->Print(" looking for improvement for"); | 
1235  | 0  |     }  | 
1236  | 0  |     TabVector *v = RightTabForBox(right_box, true, true);  | 
1237  | 0  |     if (v != nullptr && v != right && v->IsRightTab() &&  | 
1238  | 0  |         v->XAtY(right_box.top()) < right->XAtY(right_box.top())) { | 
1239  | 0  |       right = v; // Found a good replacement.  | 
1240  | 0  |       right->ExtendToBox(right_blob);  | 
1241  | 0  |       if (WithinTestRegion(3, right_box.right(), right_box.bottom())) { | 
1242  | 0  |         right->Print("Extended vector"); | 
1243  | 0  |       }  | 
1244  | 0  |     } else { | 
1245  |  |       // Fake a vector.  | 
1246  | 0  |       right = new TabVector(*right, TA_RIGHT_RAGGED, vertical_skew_, right_blob);  | 
1247  | 0  |       vectors_.add_sorted(TabVector::SortVectorsByKey, right);  | 
1248  | 0  |       v_it_.move_to_first();  | 
1249  | 0  |       if (WithinTestRegion(3, right_box.right(), right_box.bottom())) { | 
1250  | 0  |         right->Print("Created new vector"); | 
1251  | 0  |       }  | 
1252  | 0  |     }  | 
1253  | 0  |   }  | 
1254  | 0  |   left->AddPartner(right);  | 
1255  | 0  |   right->AddPartner(left);  | 
1256  | 0  | }  | 
1257  |  |  | 
1258  |  | // Remove separators and unused tabs from the main vectors_ list  | 
1259  |  | // to the dead_vectors_ list.  | 
1260  | 0  | void TabFind::CleanupTabs() { | 
1261  |  |   // TODO(rays) Before getting rid of separators and unused vectors, it  | 
1262  |  |   // would be useful to try moving ragged vectors outwards to see if this  | 
1263  |  |   // allows useful extension. Could be combined with checking ends of partners.  | 
1264  | 0  |   TabVector_IT it(&vectors_);  | 
1265  | 0  |   TabVector_IT dead_it(&dead_vectors_);  | 
1266  | 0  |   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | 
1267  | 0  |     TabVector *v = it.data();  | 
1268  | 0  |     if (v->IsSeparator() || v->Partnerless()) { | 
1269  | 0  |       dead_it.add_after_then_move(it.extract());  | 
1270  | 0  |       v_it_.set_to_list(&vectors_);  | 
1271  | 0  |     } else { | 
1272  | 0  |       v->FitAndEvaluateIfNeeded(vertical_skew_, this);  | 
1273  | 0  |     }  | 
1274  | 0  |   }  | 
1275  | 0  | }  | 
1276  |  |  | 
1277  |  | // Apply the given rotation to the given list of blobs.  | 
1278  | 0  | void TabFind::RotateBlobList(const FCOORD &rotation, BLOBNBOX_LIST *blobs) { | 
1279  | 0  |   BLOBNBOX_IT it(blobs);  | 
1280  | 0  |   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | 
1281  | 0  |     it.data()->rotate_box(rotation);  | 
1282  | 0  |   }  | 
1283  | 0  | }  | 
1284  |  |  | 
1285  |  | // Recreate the grid with deskewed BLOBNBOXes.  | 
1286  |  | // Returns false if the detected skew angle is impossible.  | 
1287  |  | bool TabFind::Deskew(TabVector_LIST *hlines, BLOBNBOX_LIST *image_blobs, TO_BLOCK *block,  | 
1288  | 0  |                      FCOORD *deskew, FCOORD *reskew) { | 
1289  | 0  |   ComputeDeskewVectors(deskew, reskew);  | 
1290  | 0  |   if (deskew->x() < kCosMaxSkewAngle) { | 
1291  | 0  |     return false;  | 
1292  | 0  |   }  | 
1293  | 0  |   RotateBlobList(*deskew, image_blobs);  | 
1294  | 0  |   RotateBlobList(*deskew, &block->blobs);  | 
1295  | 0  |   RotateBlobList(*deskew, &block->small_blobs);  | 
1296  | 0  |   RotateBlobList(*deskew, &block->noise_blobs);  | 
1297  |  |  | 
1298  |  |   // Rotate the horizontal vectors. The vertical vectors don't need  | 
1299  |  |   // rotating as they can just be refitted.  | 
1300  | 0  |   TabVector_IT h_it(hlines);  | 
1301  | 0  |   for (h_it.mark_cycle_pt(); !h_it.cycled_list(); h_it.forward()) { | 
1302  | 0  |     TabVector *h = h_it.data();  | 
1303  | 0  |     h->Rotate(*deskew);  | 
1304  | 0  |   }  | 
1305  | 0  |   TabVector_IT d_it(&dead_vectors_);  | 
1306  | 0  |   for (d_it.mark_cycle_pt(); !d_it.cycled_list(); d_it.forward()) { | 
1307  | 0  |     TabVector *d = d_it.data();  | 
1308  | 0  |     d->Rotate(*deskew);  | 
1309  | 0  |   }  | 
1310  | 0  |   SetVerticalSkewAndParallelize(0, 1);  | 
1311  |  |   // Rebuild the grid to the new size.  | 
1312  | 0  |   TBOX grid_box(bleft_, tright_);  | 
1313  | 0  |   grid_box.rotate_large(*deskew);  | 
1314  | 0  |   Init(gridsize(), grid_box.botleft(), grid_box.topright());  | 
1315  | 0  |   InsertBlobsToGrid(false, false, image_blobs, this);  | 
1316  | 0  |   InsertBlobsToGrid(true, false, &block->blobs, this);  | 
1317  | 0  |   return true;  | 
1318  | 0  | }  | 
1319  |  |  | 
1320  |  | // Flip the vertical and horizontal lines and rotate the grid ready  | 
1321  |  | // for working on the rotated image.  | 
1322  |  | // This also makes parameter adjustments for FindInitialTabVectors().  | 
1323  |  | void TabFind::ResetForVerticalText(const FCOORD &rotate, const FCOORD &rerotate,  | 
1324  | 0  |                                    TabVector_LIST *horizontal_lines, int *min_gutter_width) { | 
1325  |  |   // Rotate the horizontal and vertical vectors and swap them over.  | 
1326  |  |   // Only the separators are kept and rotated; other tabs are used  | 
1327  |  |   // to estimate the gutter width then thrown away.  | 
1328  | 0  |   TabVector_LIST ex_verticals;  | 
1329  | 0  |   TabVector_IT ex_v_it(&ex_verticals);  | 
1330  | 0  |   TabVector_LIST vlines;  | 
1331  | 0  |   TabVector_IT v_it(&vlines);  | 
1332  | 0  |   while (!v_it_.empty()) { | 
1333  | 0  |     TabVector *v = v_it_.extract();  | 
1334  | 0  |     if (v->IsSeparator()) { | 
1335  | 0  |       v->Rotate(rotate);  | 
1336  | 0  |       ex_v_it.add_after_then_move(v);  | 
1337  | 0  |     } else { | 
1338  | 0  |       v_it.add_after_then_move(v);  | 
1339  | 0  |     }  | 
1340  | 0  |     v_it_.forward();  | 
1341  | 0  |   }  | 
1342  |  |  | 
1343  |  |   // Adjust the min gutter width for better tabbox selection  | 
1344  |  |   // in 2nd call to FindInitialTabVectors().  | 
1345  | 0  |   int median_gutter = FindMedianGutterWidth(&vlines);  | 
1346  | 0  |   if (median_gutter > *min_gutter_width) { | 
1347  | 0  |     *min_gutter_width = median_gutter;  | 
1348  | 0  |   }  | 
1349  |  | 
  | 
1350  | 0  |   TabVector_IT h_it(horizontal_lines);  | 
1351  | 0  |   for (h_it.mark_cycle_pt(); !h_it.cycled_list(); h_it.forward()) { | 
1352  | 0  |     TabVector *h = h_it.data();  | 
1353  | 0  |     h->Rotate(rotate);  | 
1354  | 0  |   }  | 
1355  | 0  |   v_it_.add_list_after(horizontal_lines);  | 
1356  | 0  |   v_it_.move_to_first();  | 
1357  | 0  |   h_it.set_to_list(horizontal_lines);  | 
1358  | 0  |   h_it.add_list_after(&ex_verticals);  | 
1359  |  |  | 
1360  |  |   // Rebuild the grid to the new size.  | 
1361  | 0  |   TBOX grid_box(bleft(), tright());  | 
1362  | 0  |   grid_box.rotate_large(rotate);  | 
1363  | 0  |   Init(gridsize(), grid_box.botleft(), grid_box.topright());  | 
1364  | 0  | }  | 
1365  |  |  | 
1366  |  | // Clear the grid and get rid of the tab vectors, but not separators,  | 
1367  |  | // ready to start again.  | 
1368  | 0  | void TabFind::Reset() { | 
1369  | 0  |   v_it_.move_to_first();  | 
1370  | 0  |   for (v_it_.mark_cycle_pt(); !v_it_.cycled_list(); v_it_.forward()) { | 
1371  | 0  |     if (!v_it_.data()->IsSeparator()) { | 
1372  | 0  |       delete v_it_.extract();  | 
1373  | 0  |     }  | 
1374  | 0  |   }  | 
1375  | 0  |   Clear();  | 
1376  | 0  | }  | 
1377  |  |  | 
1378  |  | // Reflect the separator tab vectors and the grids in the y-axis.  | 
1379  |  | // Can only be called after Reset!  | 
1380  | 0  | void TabFind::ReflectInYAxis() { | 
1381  | 0  |   TabVector_LIST temp_list;  | 
1382  | 0  |   TabVector_IT temp_it(&temp_list);  | 
1383  | 0  |   v_it_.move_to_first();  | 
1384  |  |   // The TabVector list only contains vertical lines, but they need to be  | 
1385  |  |   // reflected and the list needs to be reversed, so they are still in  | 
1386  |  |   // sort_key order.  | 
1387  | 0  |   while (!v_it_.empty()) { | 
1388  | 0  |     TabVector *v = v_it_.extract();  | 
1389  | 0  |     v_it_.forward();  | 
1390  | 0  |     v->ReflectInYAxis();  | 
1391  | 0  |     temp_it.add_before_then_move(v);  | 
1392  | 0  |   }  | 
1393  | 0  |   v_it_.add_list_after(&temp_list);  | 
1394  | 0  |   v_it_.move_to_first();  | 
1395  |  |   // Reset this grid with reflected bounding boxes.  | 
1396  | 0  |   TBOX grid_box(bleft(), tright());  | 
1397  | 0  |   int tmp = grid_box.left();  | 
1398  | 0  |   grid_box.set_left(-grid_box.right());  | 
1399  | 0  |   grid_box.set_right(-tmp);  | 
1400  | 0  |   Init(gridsize(), grid_box.botleft(), grid_box.topright());  | 
1401  | 0  | }  | 
1402  |  |  | 
1403  |  | // Compute the rotation required to deskew, and its inverse rotation.  | 
1404  | 0  | void TabFind::ComputeDeskewVectors(FCOORD *deskew, FCOORD *reskew) { | 
1405  | 0  |   double length = vertical_skew_ % vertical_skew_;  | 
1406  | 0  |   length = sqrt(length);  | 
1407  | 0  |   deskew->set_x(static_cast<float>(vertical_skew_.y() / length));  | 
1408  | 0  |   deskew->set_y(static_cast<float>(vertical_skew_.x() / length));  | 
1409  | 0  |   reskew->set_x(deskew->x());  | 
1410  | 0  |   reskew->set_y(-deskew->y());  | 
1411  | 0  | }  | 
1412  |  |  | 
1413  |  | // Compute and apply constraints to the end positions of TabVectors so  | 
1414  |  | // that where possible partners end at the same y coordinate.  | 
1415  | 0  | void TabFind::ApplyTabConstraints() { | 
1416  | 0  |   TabVector_IT it(&vectors_);  | 
1417  | 0  |   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | 
1418  | 0  |     TabVector *v = it.data();  | 
1419  | 0  |     v->SetupConstraints();  | 
1420  | 0  |   }  | 
1421  | 0  |   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | 
1422  | 0  |     TabVector *v = it.data();  | 
1423  |  |     // With the first and last partner, we want a common bottom and top,  | 
1424  |  |     // respectively, and for each change of partner, we want a common  | 
1425  |  |     // top of first with bottom of next.  | 
1426  | 0  |     v->SetupPartnerConstraints();  | 
1427  | 0  |   }  | 
1428  |  |   // TODO(rays) The back-to-back pairs should really be done like the  | 
1429  |  |   // front-to-front pairs, but there is no convenient way of producing the  | 
1430  |  |   // list of partners like there is with the front-to-front.  | 
1431  | 0  |   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | 
1432  | 0  |     TabVector *v = it.data();  | 
1433  | 0  |     if (!v->IsRightTab()) { | 
1434  | 0  |       continue;  | 
1435  | 0  |     }  | 
1436  |  |     // For each back-to-back pair of vectors, try for common top and bottom.  | 
1437  | 0  |     TabVector_IT partner_it(it);  | 
1438  | 0  |     for (partner_it.forward(); !partner_it.at_first(); partner_it.forward()) { | 
1439  | 0  |       TabVector *partner = partner_it.data();  | 
1440  | 0  |       if (!partner->IsLeftTab() || !v->VOverlap(*partner)) { | 
1441  | 0  |         continue;  | 
1442  | 0  |       }  | 
1443  | 0  |       v->SetupPartnerConstraints(partner);  | 
1444  | 0  |     }  | 
1445  | 0  |   }  | 
1446  |  |   // Now actually apply the constraints to get common start/end points.  | 
1447  | 0  |   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { | 
1448  | 0  |     TabVector *v = it.data();  | 
1449  | 0  |     if (!v->IsSeparator()) { | 
1450  | 0  |       v->ApplyConstraints();  | 
1451  | 0  |     }  | 
1452  | 0  |   }  | 
1453  |  |   // TODO(rays) Where constraint application fails, it would be good to try  | 
1454  |  |   // checking the ends to see if they really should be moved.  | 
1455  | 0  | }  | 
1456  |  |  | 
1457  |  | } // namespace tesseract.  |