/src/tesseract/src/textord/tospace.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // Licensed under the Apache License, Version 2.0 (the "License");  | 
2  |  | // you may not use this file except in compliance with the License.  | 
3  |  | // You may obtain a copy of the License at  | 
4  |  | // http://www.apache.org/licenses/LICENSE-2.0  | 
5  |  | // Unless required by applicable law or agreed to in writing, software  | 
6  |  | // distributed under the License is distributed on an "AS IS" BASIS,  | 
7  |  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
8  |  | // See the License for the specific language governing permissions and  | 
9  |  | // limitations under the License.  | 
10  |  | /**********************************************************************  | 
11  |  |  * tospace.cpp  | 
12  |  |  *  | 
13  |  |  * Compute fuzzy word spacing thresholds for each row.  | 
14  |  |  * I.e. set :   max_nonspace  | 
15  |  |  *              space_threshold  | 
16  |  |  *              min_space  | 
17  |  |  *              kern_size  | 
18  |  |  *              space_size  | 
19  |  |  * for each row.  | 
20  |  |  * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE  | 
21  |  |  *  | 
22  |  |  * Note: functions in this file were originally not members of any  | 
23  |  |  * class or enclosed by any namespace. Now they are all static members  | 
24  |  |  * of the Textord class.  | 
25  |  |  *  | 
26  |  |  **********************************************************************/  | 
27  |  |  | 
28  |  | #include "drawtord.h"  | 
29  |  | #include "statistc.h"  | 
30  |  | #include "textord.h"  | 
31  |  | #include "tovars.h"  | 
32  |  |  | 
33  |  | // Include automatically generated configuration file if running autoconf.  | 
34  |  | #ifdef HAVE_CONFIG_H  | 
35  |  | #  include "config_auto.h"  | 
36  |  | #endif  | 
37  |  |  | 
38  |  | #include <algorithm>  | 
39  |  | #include <cmath>  | 
40  |  | #include <memory>  | 
41  |  |  | 
42  | 1.21M  | #define MAXSPACING 128 /* max expected spacing in pix; the STATS objects in this file are constructed with (0, MAXSPACING - 1) */  | 
43  |  |  | 
44  |  | namespace tesseract { | 
45  |  | void Textord::to_spacing(ICOORD page_tr,       // topright of page  | 
46  |  |                          TO_BLOCK_LIST *blocks // blocks on page  | 
47  | 15.4k  | ) { | 
48  | 15.4k  |   TO_BLOCK_IT block_it; // iterator  | 
49  | 15.4k  |   TO_BLOCK *block;      // current block;  | 
50  | 15.4k  |   TO_ROW *row;          // current row  | 
51  | 15.4k  |   int block_index;      // block number  | 
52  | 15.4k  |   int row_index;        // row number  | 
53  |  |   // estimated width of real spaces for whole block  | 
54  | 15.4k  |   int16_t block_space_gap_width;  | 
55  |  |   // estimated width of non space gaps for whole block  | 
56  | 15.4k  |   int16_t block_non_space_gap_width;  | 
57  | 15.4k  |   bool old_text_ord_proportional; // old fixed/prop result  | 
58  |  |  | 
59  | 15.4k  |   block_it.set_to_list(blocks);  | 
60  | 15.4k  |   block_index = 1;  | 
61  | 30.9k  |   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { | 
62  | 15.4k  |     block = block_it.data();  | 
63  | 15.4k  |     std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk  | 
64  | 15.4k  |     block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width,  | 
65  | 15.4k  |                         block_non_space_gap_width);  | 
66  |  |     // Make sure relative values of block-level space and non-space gap  | 
67  |  |     // widths are reasonable. The ratio of 1:3 is also used in  | 
68  |  |     // block_spacing_stats, to correct the block_space_gap_width.  | 
69  |  |     // Useful for arabic and hindi, when the non-space gap width is  | 
70  |  |     // often over-estimated and should not be trusted. A similar ratio  | 
71  |  |     // is found in block_spacing_stats.  | 
72  | 15.4k  |     if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&  | 
73  | 15.4k  |         block_non_space_gap_width > block_space_gap_width / 3) { | 
74  | 0  |       block_non_space_gap_width = block_space_gap_width / 3;  | 
75  | 0  |     }  | 
76  |  |     // row iterator  | 
77  | 15.4k  |     TO_ROW_IT row_it(block->get_rows());  | 
78  | 15.4k  |     row_index = 1;  | 
79  | 194k  |     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | 
80  | 178k  |       row = row_it.data();  | 
81  | 178k  |       if ((row->pitch_decision == PITCH_DEF_PROP) || (row->pitch_decision == PITCH_CORR_PROP)) { | 
82  | 173k  |         if ((tosp_debug_level > 0) && !old_text_ord_proportional) { | 
83  | 0  |           tprintf("Block %d Row %d: Now Proportional\n", block_index, row_index); | 
84  | 0  |         }  | 
85  | 173k  |         row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width,  | 
86  | 173k  |                           block_non_space_gap_width);  | 
87  | 173k  |       } else { | 
88  | 5.71k  |         if ((tosp_debug_level > 0) && old_text_ord_proportional) { | 
89  | 0  |           tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index, | 
90  | 0  |                   row_index, row->pitch_decision, row->fixed_pitch);  | 
91  | 0  |         }  | 
92  | 5.71k  |       }  | 
93  |  | #ifndef GRAPHICS_DISABLED  | 
94  |  |       if (textord_show_initial_words) { | 
95  |  |         plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);  | 
96  |  |       }  | 
97  |  | #endif  | 
98  | 178k  |       row_index++;  | 
99  | 178k  |     }  | 
100  | 15.4k  |     block_index++;  | 
101  | 15.4k  |   }  | 
102  | 15.4k  | }  | 
103  |  |  | 
104  |  | /*************************************************************************  | 
105  |  |  * block_spacing_stats()  | 
106  |  |  *************************************************************************/  | 
107  |  |  | 
108  |  | void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,  | 
109  |  |                                   int16_t &block_space_gap_width,    // resulting estimate  | 
110  |  |                                   int16_t &block_non_space_gap_width // resulting estimate  | 
111  | 15.4k  | ) { | 
112  | 15.4k  |   TO_ROW *row;         // current row  | 
113  | 15.4k  |   BLOBNBOX_IT blob_it; // iterator  | 
114  |  |  | 
115  | 15.4k  |   STATS centre_to_centre_stats(0, MAXSPACING - 1);  | 
116  |  |   // DEBUG USE ONLY  | 
117  | 15.4k  |   STATS all_gap_stats(0, MAXSPACING - 1);  | 
118  | 15.4k  |   STATS space_gap_stats(0, MAXSPACING - 1);  | 
119  | 15.4k  |   int16_t minwidth = MAXSPACING; // narrowest blob  | 
120  | 15.4k  |   TBOX blob_box;  | 
121  | 15.4k  |   TBOX prev_blob_box;  | 
122  | 15.4k  |   int16_t centre_to_centre;  | 
123  | 15.4k  |   int16_t gap_width;  | 
124  | 15.4k  |   float real_space_threshold;  | 
125  | 15.4k  |   float iqr_centre_to_centre; // DEBUG USE ONLY  | 
126  | 15.4k  |   float iqr_all_gap_stats;    // DEBUG USE ONLY  | 
127  | 15.4k  |   int32_t end_of_row;  | 
128  | 15.4k  |   int32_t row_length;  | 
129  |  |  | 
130  |  |   // row iterator  | 
131  | 15.4k  |   TO_ROW_IT row_it(block->get_rows());  | 
132  | 194k  |   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | 
133  | 178k  |     row = row_it.data();  | 
134  | 178k  |     if (!row->blob_list()->empty() &&  | 
135  | 178k  |         (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||  | 
136  | 178k  |          (row->pitch_decision == PITCH_CORR_PROP))) { | 
137  | 173k  |       blob_it.set_to_list(row->blob_list());  | 
138  | 173k  |       blob_it.mark_cycle_pt();  | 
139  | 173k  |       end_of_row = blob_it.data_relative(-1)->bounding_box().right();  | 
140  | 173k  |       if (tosp_use_pre_chopping) { | 
141  | 0  |         blob_box = box_next_pre_chopped(&blob_it);  | 
142  | 173k  |       } else if (tosp_stats_use_xht_gaps) { | 
143  | 173k  |         blob_box = reduced_box_next(row, &blob_it);  | 
144  | 173k  |       } else { | 
145  | 0  |         blob_box = box_next(&blob_it);  | 
146  | 0  |       }  | 
147  | 173k  |       row_length = end_of_row - blob_box.left();  | 
148  | 173k  |       if (blob_box.width() < minwidth) { | 
149  | 21.5k  |         minwidth = blob_box.width();  | 
150  | 21.5k  |       }  | 
151  | 173k  |       prev_blob_box = blob_box;  | 
152  | 1.23M  |       while (!blob_it.cycled_list()) { | 
153  | 1.06M  |         if (tosp_use_pre_chopping) { | 
154  | 0  |           blob_box = box_next_pre_chopped(&blob_it);  | 
155  | 1.06M  |         } else if (tosp_stats_use_xht_gaps) { | 
156  | 1.06M  |           blob_box = reduced_box_next(row, &blob_it);  | 
157  | 1.06M  |         } else { | 
158  | 0  |           blob_box = box_next(&blob_it);  | 
159  | 0  |         }  | 
160  | 1.06M  |         if (blob_box.width() < minwidth) { | 
161  | 8.10k  |           minwidth = blob_box.width();  | 
162  | 8.10k  |         }  | 
163  | 1.06M  |         int16_t left = prev_blob_box.right();  | 
164  | 1.06M  |         int16_t right = blob_box.left();  | 
165  | 1.06M  |         gap_width = right - left;  | 
166  | 1.06M  |         if (!ignore_big_gap(row, row_length, gapmap, left, right)) { | 
167  | 1.05M  |           all_gap_stats.add(gap_width, 1);  | 
168  |  |  | 
169  | 1.05M  |           centre_to_centre = (right + blob_box.right() - (prev_blob_box.left() + left)) / 2;  | 
170  |  |           // DEBUG  | 
171  | 1.05M  |           centre_to_centre_stats.add(centre_to_centre, 1);  | 
172  |  |           // DEBUG  | 
173  | 1.05M  |         }  | 
174  | 1.06M  |         prev_blob_box = blob_box;  | 
175  | 1.06M  |       }  | 
176  | 173k  |     }  | 
177  | 178k  |   }  | 
178  |  |  | 
179  |  |   // Inadequate samples  | 
180  | 15.4k  |   if (all_gap_stats.get_total() <= 1) { | 
181  | 6.42k  |     block_non_space_gap_width = minwidth;  | 
182  | 6.42k  |     block_space_gap_width = -1; // No est. space width  | 
183  |  |                                 // DEBUG  | 
184  | 6.42k  |     old_text_ord_proportional = true;  | 
185  | 9.04k  |   } else { | 
186  |  |     /* For debug only ..... */  | 
187  | 9.04k  |     iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25);  | 
188  | 9.04k  |     iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25);  | 
189  | 9.04k  |     old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats;  | 
190  |  |     /* .......For debug only */  | 
191  |  |  | 
192  |  |     /*  | 
193  |  | The median of the gaps is used as an estimate of the NON-SPACE gap width.  | 
194  |  | This RELIES on the assumption that there are more gaps WITHIN words than  | 
195  |  | BETWEEN words in a block  | 
196  |  |  | 
197  |  | Now try to estimate the width of a real space for all real spaces in the  | 
198  |  | block. Do this by using a crude threshold to ignore "narrow" gaps, then  | 
199  |  | find the median of the "wide" gaps and use this.  | 
200  |  | */  | 
201  | 9.04k  |     block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median()));  | 
202  |  |     // median gap  | 
203  |  |  | 
204  | 9.04k  |     row_it.set_to_list(block->get_rows());  | 
205  | 143k  |     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | 
206  | 134k  |       row = row_it.data();  | 
207  | 134k  |       if (!row->blob_list()->empty() &&  | 
208  | 134k  |           (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||  | 
209  | 134k  |            (row->pitch_decision == PITCH_CORR_PROP))) { | 
210  | 130k  |         real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width,  | 
211  | 130k  |                                         tosp_init_guess_xht_mult * row->xheight);  | 
212  | 130k  |         blob_it.set_to_list(row->blob_list());  | 
213  | 130k  |         blob_it.mark_cycle_pt();  | 
214  | 130k  |         end_of_row = blob_it.data_relative(-1)->bounding_box().right();  | 
215  | 130k  |         if (tosp_use_pre_chopping) { | 
216  | 0  |           blob_box = box_next_pre_chopped(&blob_it);  | 
217  | 130k  |         } else if (tosp_stats_use_xht_gaps) { | 
218  | 130k  |           blob_box = reduced_box_next(row, &blob_it);  | 
219  | 130k  |         } else { | 
220  | 0  |           blob_box = box_next(&blob_it);  | 
221  | 0  |         }  | 
222  | 130k  |         row_length = blob_box.left() - end_of_row;  | 
223  | 130k  |         prev_blob_box = blob_box;  | 
224  | 1.19M  |         while (!blob_it.cycled_list()) { | 
225  | 1.06M  |           if (tosp_use_pre_chopping) { | 
226  | 0  |             blob_box = box_next_pre_chopped(&blob_it);  | 
227  | 1.06M  |           } else if (tosp_stats_use_xht_gaps) { | 
228  | 1.06M  |             blob_box = reduced_box_next(row, &blob_it);  | 
229  | 1.06M  |           } else { | 
230  | 0  |             blob_box = box_next(&blob_it);  | 
231  | 0  |           }  | 
232  | 1.06M  |           int16_t left = prev_blob_box.right();  | 
233  | 1.06M  |           int16_t right = blob_box.left();  | 
234  | 1.06M  |           gap_width = right - left;  | 
235  | 1.06M  |           if ((gap_width > real_space_threshold) &&  | 
236  | 1.06M  |               !ignore_big_gap(row, row_length, gapmap, left, right)) { | 
237  |  |             /*  | 
238  |  | If tosp_use_cert_spaces is enabled, the estimate of the space gap is  | 
239  |  | restricted to obvious spaces - those wider than half the xht or  | 
240  |  | those with wide blobs on both sides - i.e not things that are  | 
241  |  | suspect 1's or punctuation that is sometimes widely spaced.  | 
242  |  | */  | 
243  | 110k  |             if (!tosp_block_use_cert_spaces ||  | 
244  | 110k  |                 (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||  | 
245  | 110k  |                 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&  | 
246  | 77.2k  |                  (!tosp_narrow_blobs_not_cert ||  | 
247  | 20.0k  |                   (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||  | 
248  | 110k  |                 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { | 
249  | 38.3k  |               space_gap_stats.add(gap_width, 1);  | 
250  | 38.3k  |             }  | 
251  | 110k  |           }  | 
252  | 1.06M  |           prev_blob_box = blob_box;  | 
253  | 1.06M  |         }  | 
254  | 130k  |       }  | 
255  | 134k  |     }  | 
256  |  |     // Inadequate samples  | 
257  | 9.04k  |     if (space_gap_stats.get_total() <= 2) { | 
258  | 7.36k  |       block_space_gap_width = -1; // No est. space width  | 
259  | 7.36k  |     } else { | 
260  | 1.68k  |       block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())),  | 
261  | 1.68k  |                                        static_cast<int16_t>(3 * block_non_space_gap_width));  | 
262  | 1.68k  |     }  | 
263  | 9.04k  |   }  | 
264  | 15.4k  | }  | 
265  |  |  | 
266  |  | /*************************************************************************  | 
267  |  |  * row_spacing_stats()  | 
268  |  |  * Set values for min_space, max_non_space based on row stats only  | 
269  |  |  * If failure - return 0 values.  | 
270  |  |  *************************************************************************/  | 
271  |  | void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,  | 
272  |  |                                 int16_t block_space_gap_width,    // estimate for block  | 
273  |  |                                 int16_t block_non_space_gap_width // estimate for block  | 
274  | 173k  | ) { | 
275  |  |   // iterator  | 
276  | 173k  |   BLOBNBOX_IT blob_it = row->blob_list();  | 
277  | 173k  |   STATS all_gap_stats(0, MAXSPACING - 1);  | 
278  | 173k  |   STATS cert_space_gap_stats(0, MAXSPACING - 1);  | 
279  | 173k  |   STATS all_space_gap_stats(0, MAXSPACING - 1);  | 
280  | 173k  |   STATS small_gap_stats(0, MAXSPACING - 1);  | 
281  | 173k  |   TBOX blob_box;  | 
282  | 173k  |   TBOX prev_blob_box;  | 
283  | 173k  |   int16_t gap_width;  | 
284  | 173k  |   int16_t real_space_threshold = 0;  | 
285  | 173k  |   int16_t max = 0;  | 
286  | 173k  |   int16_t large_gap_count = 0;  | 
287  | 173k  |   bool suspected_table;  | 
288  | 173k  |   bool good_block_space_estimate = block_space_gap_width > 0;  | 
289  | 173k  |   int32_t end_of_row;  | 
290  | 173k  |   int32_t row_length = 0;  | 
291  | 173k  |   float sane_space;  | 
292  | 173k  |   int32_t sane_threshold;  | 
293  |  |  | 
294  |  |   /* Collect first pass stats for row */  | 
295  |  |  | 
296  | 173k  |   if (!good_block_space_estimate) { | 
297  | 138k  |     block_space_gap_width = int16_t(std::floor(row->xheight / 2));  | 
298  | 138k  |   }  | 
299  | 173k  |   if (!row->blob_list()->empty()) { | 
300  | 173k  |     if (tosp_threshold_bias1 > 0) { | 
301  | 0  |       real_space_threshold =  | 
302  | 0  |           block_non_space_gap_width +  | 
303  | 0  |           int16_t(floor(0.5 + tosp_threshold_bias1 *  | 
304  | 0  |                                   (block_space_gap_width - block_non_space_gap_width)));  | 
305  | 173k  |     } else { | 
306  | 173k  |       real_space_threshold = // Old TO method  | 
307  | 173k  |           (block_space_gap_width + block_non_space_gap_width) / 2;  | 
308  | 173k  |     }  | 
309  | 173k  |     blob_it.set_to_list(row->blob_list());  | 
310  | 173k  |     blob_it.mark_cycle_pt();  | 
311  | 173k  |     end_of_row = blob_it.data_relative(-1)->bounding_box().right();  | 
312  | 173k  |     if (tosp_use_pre_chopping) { | 
313  | 0  |       blob_box = box_next_pre_chopped(&blob_it);  | 
314  | 173k  |     } else if (tosp_stats_use_xht_gaps) { | 
315  | 173k  |       blob_box = reduced_box_next(row, &blob_it);  | 
316  | 173k  |     } else { | 
317  | 0  |       blob_box = box_next(&blob_it);  | 
318  | 0  |     }  | 
319  | 173k  |     row_length = end_of_row - blob_box.left();  | 
320  | 173k  |     prev_blob_box = blob_box;  | 
321  | 1.23M  |     while (!blob_it.cycled_list()) { | 
322  | 1.06M  |       if (tosp_use_pre_chopping) { | 
323  | 0  |         blob_box = box_next_pre_chopped(&blob_it);  | 
324  | 1.06M  |       } else if (tosp_stats_use_xht_gaps) { | 
325  | 1.06M  |         blob_box = reduced_box_next(row, &blob_it);  | 
326  | 1.06M  |       } else { | 
327  | 0  |         blob_box = box_next(&blob_it);  | 
328  | 0  |       }  | 
329  | 1.06M  |       int16_t left = prev_blob_box.right();  | 
330  | 1.06M  |       int16_t right = blob_box.left();  | 
331  | 1.06M  |       gap_width = right - left;  | 
332  | 1.06M  |       if (ignore_big_gap(row, row_length, gapmap, left, right)) { | 
333  | 10.3k  |         large_gap_count++;  | 
334  | 1.05M  |       } else { | 
335  | 1.05M  |         if (gap_width >= real_space_threshold) { | 
336  | 137k  |           if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||  | 
337  | 137k  |               ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&  | 
338  | 96.0k  |                (!tosp_narrow_blobs_not_cert ||  | 
339  | 22.5k  |                 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||  | 
340  | 137k  |               (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { | 
341  | 49.7k  |             cert_space_gap_stats.add(gap_width, 1);  | 
342  | 49.7k  |           }  | 
343  | 137k  |           all_space_gap_stats.add(gap_width, 1);  | 
344  | 918k  |         } else { | 
345  | 918k  |           small_gap_stats.add(gap_width, 1);  | 
346  | 918k  |         }  | 
347  | 1.05M  |         all_gap_stats.add(gap_width, 1);  | 
348  | 1.05M  |       }  | 
349  | 1.06M  |       prev_blob_box = blob_box;  | 
350  | 1.06M  |     }  | 
351  | 173k  |   }  | 
352  | 173k  |   suspected_table = (large_gap_count > 1) ||  | 
353  | 173k  |                     ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples));  | 
354  |  |  | 
355  |  |   /* Now determine row kern size, space size and threshold */  | 
356  |  |  | 
357  | 173k  |   if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) ||  | 
358  | 173k  |       ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) &&  | 
359  | 167k  |        cert_space_gap_stats.get_total() > 0)) { | 
360  | 18.1k  |     old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats,  | 
361  | 18.1k  |                   block_space_gap_width, block_non_space_gap_width);  | 
362  | 155k  |   } else { | 
363  | 155k  |     if (!tosp_recovery_isolated_row_stats ||  | 
364  | 155k  |         !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) { | 
365  | 146k  |       if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) { | 
366  | 0  |         tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx); | 
367  | 0  |       }  | 
368  | 146k  |       if (tosp_row_use_cert_spaces1 && good_block_space_estimate) { | 
369  |  |         // Use block default  | 
370  | 17.1k  |         row->space_size = block_space_gap_width;  | 
371  | 17.1k  |         if (all_gap_stats.get_total() > tosp_redo_kern_limit) { | 
372  | 6.27k  |           row->kern_size = all_gap_stats.median();  | 
373  | 10.8k  |         } else { | 
374  | 10.8k  |           row->kern_size = block_non_space_gap_width;  | 
375  | 10.8k  |         }  | 
376  | 17.1k  |         row->space_threshold =  | 
377  | 17.1k  |             int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));  | 
378  | 129k  |       } else { | 
379  | 129k  |         old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats,  | 
380  | 129k  |                       block_space_gap_width, block_non_space_gap_width);  | 
381  | 129k  |       }  | 
382  | 146k  |     }  | 
383  | 155k  |   }  | 
384  |  |  | 
385  | 173k  |   if (tosp_improve_thresh && !suspected_table) { | 
386  | 0  |     improve_row_threshold(row, &all_gap_stats);  | 
387  | 0  |   }  | 
388  |  |  | 
389  |  |   /* Now lets try to be careful not to do anything silly with tables when we  | 
390  |  | are ignoring big gaps*/  | 
391  | 173k  |   if (tosp_sanity_method == 0) { | 
392  | 0  |     if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) { | 
393  | 0  |       if (tosp_debug_level > 5) { | 
394  | 0  |         tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx, | 
395  | 0  |                 row->kern_size, row->space_threshold, row->space_size);  | 
396  | 0  |       }  | 
397  | 0  |       row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);  | 
398  | 0  |       row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);  | 
399  | 0  |     }  | 
400  | 173k  |   } else if (tosp_sanity_method == 1) { | 
401  | 173k  |     sane_space = row->space_size;  | 
402  |  |     /* NEVER let space size get too close to kern size */  | 
403  | 173k  |     if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||  | 
404  | 173k  |         ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) { | 
405  | 24.3k  |       if (good_block_space_estimate &&  | 
406  | 24.3k  |           (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) { | 
407  | 1.28k  |         sane_space = block_space_gap_width;  | 
408  | 23.0k  |       } else { | 
409  | 23.0k  |         sane_space =  | 
410  | 23.0k  |             std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),  | 
411  | 23.0k  |                      row->xheight / 2.0f);  | 
412  | 23.0k  |       }  | 
413  | 24.3k  |       if (tosp_debug_level > 5) { | 
414  | 0  |         tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx, | 
415  | 0  |                 row->kern_size, row->space_threshold, row->space_size, sane_space);  | 
416  | 0  |       }  | 
417  | 24.3k  |       row->space_size = sane_space;  | 
418  | 24.3k  |       row->space_threshold =  | 
419  | 24.3k  |           int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));  | 
420  | 24.3k  |     }  | 
421  |  |     /* NEVER let threshold get VERY far away from kern */  | 
422  | 173k  |     sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f)));  | 
423  | 173k  |     if (row->space_threshold > sane_threshold) { | 
424  | 4.35k  |       if (tosp_debug_level > 5) { | 
425  | 0  |         tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx, | 
426  | 0  |                 row->kern_size, row->space_threshold, row->space_size, sane_threshold);  | 
427  | 0  |       }  | 
428  | 4.35k  |       row->space_threshold = sane_threshold;  | 
429  | 4.35k  |       if (row->space_size <= sane_threshold) { | 
430  | 0  |         row->space_size = row->space_threshold + 1.0f;  | 
431  | 0  |       }  | 
432  | 4.35k  |     }  | 
433  |  |     /* Beware of tables - there may be NO spaces */  | 
434  | 173k  |     if (suspected_table) { | 
435  | 5.89k  |       sane_space =  | 
436  | 5.89k  |           std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight);  | 
437  | 5.89k  |       sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2));  | 
438  |  |  | 
439  | 5.89k  |       if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) { | 
440  | 758  |         if (tosp_debug_level > 5) { | 
441  | 0  |           tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx, | 
442  | 0  |                   row->kern_size, row->space_threshold, row->space_size);  | 
443  | 0  |         }  | 
444  |  |         // the minimum sane value  | 
445  | 758  |         row->space_threshold = static_cast<int32_t>(sane_space);  | 
446  | 758  |         row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);  | 
447  | 758  |       }  | 
448  | 5.89k  |     }  | 
449  | 173k  |   }  | 
450  |  |  | 
451  |  |   /* Now lets try to put some error limits on the threshold */  | 
452  |  |  | 
453  | 173k  |   if (tosp_old_to_method) { | 
454  |  |     /* Old textord made a space if gap >= threshold */  | 
455  |  |     // NO FUZZY SPACES YET  | 
456  | 0  |     row->max_nonspace = row->space_threshold;  | 
457  |  |     // NO FUZZY SPACES       YET  | 
458  | 0  |     row->min_space = row->space_threshold + 1;  | 
459  | 173k  |   } else { | 
460  |  |     /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */  | 
461  | 173k  |     row->min_space =  | 
462  | 173k  |         std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size));  | 
463  | 173k  |     if (row->min_space <= row->space_threshold) { | 
464  |  |       // Don't be silly  | 
465  | 30.5k  |       row->min_space = row->space_threshold + 1;  | 
466  | 30.5k  |     }  | 
467  |  |     /*  | 
468  |  | Lets try to guess the max certain kern gap by looking at the cluster of  | 
469  |  | kerns for the row. The row is proportional so the kerns should cluster  | 
470  |  | tightly at the bottom of the distribution. We also expect most gaps to be  | 
471  |  | kerns. Find the maximum of the kern piles between 0 and twice the kern  | 
472  |  | estimate. Piles before the first one with less than 1/10 the maximum  | 
473  |  | number of samples can be taken as certain kerns.  | 
474  |  |  | 
475  |  |   Of course, there are some cases where the kern peak and space peaks merge,  | 
476  |  |   so we will put an UPPER limit on the max certain kern gap of some fraction  | 
477  |  |   below the threshold.  | 
478  |  | */  | 
479  |  |  | 
480  |  |     // upper bound  | 
481  | 173k  |     int32_t max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2);  | 
482  |  |  | 
483  |  |     // default  | 
484  | 173k  |     row->max_nonspace = max_max_nonspace;  | 
485  | 877k  |     for (int32_t index = 0; index <= max_max_nonspace; index++) { | 
486  | 776k  |       if (all_gap_stats.pile_count(index) > max) { | 
487  | 121k  |         max = all_gap_stats.pile_count(index);  | 
488  | 121k  |       }  | 
489  | 776k  |       if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) { | 
490  | 71.8k  |         row->max_nonspace = index;  | 
491  | 71.8k  |         break;  | 
492  | 71.8k  |       }  | 
493  | 776k  |     }  | 
494  | 173k  |   }  | 
495  |  |  | 
496  |  |   /* Yet another algorithm - simpler this time - just choose a fraction of the  | 
497  |  | threshold to space range */  | 
498  |  |  | 
499  | 173k  |   if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) { | 
500  | 173k  |     row->min_space = std::max(  | 
501  | 173k  |         row->min_space, static_cast<int32_t>(ceil(row->space_threshold +  | 
502  | 173k  |                                                   tosp_fuzzy_sp_fraction *  | 
503  | 173k  |                                                       (row->space_size - row->space_threshold))));  | 
504  | 173k  |   }  | 
505  |  |  | 
506  |  |   /* Ensure that ANY space less than some multiplier times the kern size is  | 
507  |  | fuzzy.  In tables there is a risk of erroneously setting a small space size  | 
508  |  | when there are no real spaces. Sometimes tables have text squashed into  | 
509  |  | columns so that the kn->sp ratio is small anyway - this means that we can't  | 
510  |  | use this to force a wider separation - hence we rely on context to join any  | 
511  |  | dubious breaks. */  | 
512  |  |  | 
513  | 173k  |   if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) { | 
514  | 173k  |     row->min_space = std::max(  | 
515  | 173k  |         row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size)));  | 
516  | 173k  |   }  | 
517  |  |  | 
518  | 173k  |   if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) { | 
519  | 172k  |     row->max_nonspace = static_cast<int32_t>(floor(  | 
520  | 172k  |         0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)));  | 
521  | 172k  |   }  | 
522  | 173k  |   if (row->max_nonspace > row->space_threshold) { | 
523  |  |     // Don't be silly  | 
524  | 0  |     row->max_nonspace = row->space_threshold;  | 
525  | 0  |   }  | 
526  |  |  | 
527  | 173k  |   if (tosp_debug_level > 5) { | 
528  | 0  |     tprintf(  | 
529  | 0  |         "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) "  | 
530  | 0  |         "Sp:%3.2f\n",  | 
531  | 0  |         block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width,  | 
532  | 0  |         real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold,  | 
533  | 0  |         row->min_space, row->space_size);  | 
534  | 0  |   }  | 
535  | 173k  |   if (tosp_debug_level > 10) { | 
536  | 0  |     tprintf(  | 
537  | 0  |         "row->kern_size = %3.2f, row->space_size = %3.2f, "  | 
538  | 0  |         "row->space_threshold = %d\n",  | 
539  | 0  |         row->kern_size, row->space_size, row->space_threshold);  | 
540  | 0  |   }  | 
541  | 173k  | }  | 
542  |  |  | 
543  |  | void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,  | 
544  |  |                             STATS *small_gap_stats,  | 
545  |  |                             int16_t block_space_gap_width,    // estimate for block  | 
546  |  |                             int16_t block_non_space_gap_width // estimate for block  | 
547  | 147k  | ) { | 
548  |  |   /* First, estimate row space size */  | 
549  |  |   /* Old to condition was > 2 */  | 
550  | 147k  |   if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) { | 
551  |  |     // Adequate samples  | 
552  |  |     /* Set space size to median of spaces BUT limits it if it seems wildly out  | 
553  |  |      */  | 
554  | 9.83k  |     row->space_size = space_gap_stats->median();  | 
555  | 9.83k  |     if (row->space_size > block_space_gap_width * 1.5) { | 
556  | 1.08k  |       if (tosp_old_to_bug_fix) { | 
557  | 0  |         row->space_size = block_space_gap_width * 1.5;  | 
558  | 1.08k  |       } else { | 
559  |  |         // BUG??? should be *1.5  | 
560  | 1.08k  |         row->space_size = block_space_gap_width;  | 
561  | 1.08k  |       }  | 
562  | 1.08k  |     }  | 
563  | 9.83k  |     if (row->space_size < (block_non_space_gap_width * 2) + 1) { | 
564  | 2.70k  |       row->space_size = (block_non_space_gap_width * 2) + 1;  | 
565  | 2.70k  |     }  | 
566  | 9.83k  |   }  | 
567  |  |   // Only 1 or 2 samples  | 
568  | 138k  |   else if (space_gap_stats->get_total() >= 1) { | 
569  |  |     // hence mean not median  | 
570  | 28.9k  |     row->space_size = space_gap_stats->mean();  | 
571  | 28.9k  |     if (row->space_size > block_space_gap_width * 1.5) { | 
572  | 3.74k  |       if (tosp_old_to_bug_fix) { | 
573  | 0  |         row->space_size = block_space_gap_width * 1.5;  | 
574  | 3.74k  |       } else { | 
575  |  |         // BUG??? should be *1.5  | 
576  | 3.74k  |         row->space_size = block_space_gap_width;  | 
577  | 3.74k  |       }  | 
578  | 3.74k  |     }  | 
579  | 28.9k  |     if (row->space_size < (block_non_space_gap_width * 3) + 1) { | 
580  | 13.2k  |       row->space_size = (block_non_space_gap_width * 3) + 1;  | 
581  | 13.2k  |     }  | 
582  | 109k  |   } else { | 
583  |  |     // Use block default  | 
584  | 109k  |     row->space_size = block_space_gap_width;  | 
585  | 109k  |   }  | 
586  |  |  | 
587  |  |   /* Next, estimate row kern size */  | 
588  | 147k  |   if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) { | 
589  | 0  |     row->kern_size = small_gap_stats->median();  | 
590  | 147k  |   } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) { | 
591  | 15.7k  |     row->kern_size = all_gap_stats->median();  | 
592  | 132k  |   } else { // old TO -SAME FOR ALL ROWS | 
593  | 132k  |     row->kern_size = block_non_space_gap_width;  | 
594  | 132k  |   }  | 
595  |  |  | 
596  |  |   /* Finally, estimate row space threshold */  | 
597  | 147k  |   if (tosp_threshold_bias2 > 0) { | 
598  | 0  |     row->space_threshold = int32_t(  | 
599  | 0  |         floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size)));  | 
600  | 147k  |   } else { | 
601  |  |     /*  | 
602  |  |   NOTE old text ord uses (space_size + kern_size + 1)/2  as the threshold  | 
603  |  | and holds this in a float. The use is with a >= test  | 
604  |  | NEW textord uses an integer threshold and a > test  | 
605  |  | It comes to the same thing.  | 
606  |  |   (Though there is a difference in that old textor has integer space_size  | 
607  |  |   and kern_size.)  | 
608  |  | */  | 
609  | 147k  |     row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));  | 
610  | 147k  |   }  | 
611  |  |  | 
612  |  |   // Apply the same logic and ratios as in row_spacing_stats to  | 
613  |  |   // restrict relative values of the row's space_size, kern_size, and  | 
614  |  |   // space_threshold  | 
615  | 147k  |   if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&  | 
616  | 147k  |       ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||  | 
617  | 0  |        ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) { | 
618  | 0  |     if (row->kern_size > 2.5) { | 
619  | 0  |       row->kern_size = row->space_size / tosp_min_sane_kn_sp;  | 
620  | 0  |     }  | 
621  | 0  |     row->space_threshold =  | 
622  | 0  |         int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));  | 
623  | 0  |   }  | 
624  | 147k  | }  | 
625  |  |  | 
626  |  | /*************************************************************************  | 
627  |  |  * isolated_row_stats()  | 
628  |  |  * Set values for min_space, max_non_space based on row stats only  | 
629  |  |  *************************************************************************/  | 
630  |  | bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats,  | 
631  | 155k  |                                  bool suspected_table, int16_t block_idx, int16_t row_idx) { | 
632  | 155k  |   float kern_estimate;  | 
633  | 155k  |   float crude_threshold_estimate;  | 
634  | 155k  |   int16_t small_gaps_count;  | 
635  | 155k  |   int16_t total;  | 
636  |  |   // iterator  | 
637  | 155k  |   BLOBNBOX_IT blob_it = row->blob_list();  | 
638  | 155k  |   STATS cert_space_gap_stats(0, MAXSPACING - 1);  | 
639  | 155k  |   STATS all_space_gap_stats(0, MAXSPACING - 1);  | 
640  | 155k  |   STATS small_gap_stats(0, MAXSPACING - 1);  | 
641  | 155k  |   TBOX blob_box;  | 
642  | 155k  |   TBOX prev_blob_box;  | 
643  | 155k  |   int16_t gap_width;  | 
644  | 155k  |   int32_t end_of_row;  | 
645  | 155k  |   int32_t row_length;  | 
646  |  |  | 
647  | 155k  |   kern_estimate = all_gap_stats->median();  | 
648  | 155k  |   crude_threshold_estimate =  | 
649  | 155k  |       std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight);  | 
650  | 155k  |   small_gaps_count =  | 
651  | 155k  |       stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate)));  | 
652  | 155k  |   total = all_gap_stats->get_total();  | 
653  |  |  | 
654  | 155k  |   if ((total <= tosp_redo_kern_limit) ||  | 
655  | 155k  |       ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) ||  | 
656  | 155k  |       (total - small_gaps_count < 1)) { | 
657  | 146k  |     if (tosp_debug_level > 5) { | 
658  | 0  |       tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx); | 
659  | 0  |     }  | 
660  | 146k  |     return false;  | 
661  | 146k  |   }  | 
662  | 8.13k  |   blob_it.set_to_list(row->blob_list());  | 
663  | 8.13k  |   blob_it.mark_cycle_pt();  | 
664  | 8.13k  |   end_of_row = blob_it.data_relative(-1)->bounding_box().right();  | 
665  | 8.13k  |   if (tosp_use_pre_chopping) { | 
666  | 0  |     blob_box = box_next_pre_chopped(&blob_it);  | 
667  | 8.13k  |   } else if (tosp_stats_use_xht_gaps) { | 
668  | 8.13k  |     blob_box = reduced_box_next(row, &blob_it);  | 
669  | 8.13k  |   } else { | 
670  | 0  |     blob_box = box_next(&blob_it);  | 
671  | 0  |   }  | 
672  | 8.13k  |   row_length = end_of_row - blob_box.left();  | 
673  | 8.13k  |   prev_blob_box = blob_box;  | 
674  | 245k  |   while (!blob_it.cycled_list()) { | 
675  | 237k  |     if (tosp_use_pre_chopping) { | 
676  | 0  |       blob_box = box_next_pre_chopped(&blob_it);  | 
677  | 237k  |     } else if (tosp_stats_use_xht_gaps) { | 
678  | 237k  |       blob_box = reduced_box_next(row, &blob_it);  | 
679  | 237k  |     } else { | 
680  | 0  |       blob_box = box_next(&blob_it);  | 
681  | 0  |     }  | 
682  | 237k  |     int16_t left = prev_blob_box.right();  | 
683  | 237k  |     int16_t right = blob_box.left();  | 
684  | 237k  |     gap_width = right - left;  | 
685  | 237k  |     if (!ignore_big_gap(row, row_length, gapmap, left, right) &&  | 
686  | 237k  |         (gap_width > crude_threshold_estimate)) { | 
687  | 21.7k  |       if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||  | 
688  | 21.7k  |           ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&  | 
689  | 19.6k  |            (!tosp_narrow_blobs_not_cert ||  | 
690  | 6.56k  |             (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||  | 
691  | 21.7k  |           (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { | 
692  | 2.72k  |         cert_space_gap_stats.add(gap_width, 1);  | 
693  | 2.72k  |       }  | 
694  | 21.7k  |       all_space_gap_stats.add(gap_width, 1);  | 
695  | 21.7k  |     }  | 
696  | 237k  |     if (gap_width < crude_threshold_estimate) { | 
697  | 215k  |       small_gap_stats.add(gap_width, 1);  | 
698  | 215k  |     }  | 
699  |  |  | 
700  | 237k  |     prev_blob_box = blob_box;  | 
701  | 237k  |   }  | 
702  | 8.13k  |   if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) { | 
703  |  |     // median  | 
704  | 89  |     row->space_size = cert_space_gap_stats.median();  | 
705  | 8.04k  |   } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) { | 
706  |  |     // to avoid spaced  | 
707  | 33  |     row->space_size = cert_space_gap_stats.mean();  | 
708  |  |   //      1's in tables  | 
709  | 8.01k  |   } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) { | 
710  |  |     // median  | 
711  | 2.63k  |     row->space_size = all_space_gap_stats.median();  | 
712  | 5.37k  |   } else { | 
713  | 5.37k  |     row->space_size = all_space_gap_stats.mean();  | 
714  | 5.37k  |   }  | 
715  |  |  | 
716  | 8.13k  |   if (tosp_only_small_gaps_for_kern) { | 
717  | 0  |     row->kern_size = small_gap_stats.median();  | 
718  | 8.13k  |   } else { | 
719  | 8.13k  |     row->kern_size = all_gap_stats->median();  | 
720  | 8.13k  |   }  | 
721  | 8.13k  |   row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));  | 
722  |  |   /* Sanity check */  | 
723  | 8.13k  |   if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) ||  | 
724  | 8.13k  |       (row->space_threshold <= 0)) { | 
725  | 28  |     if (tosp_debug_level > 5) { | 
726  | 0  |       tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx, | 
727  | 0  |               row->kern_size, row->space_threshold, row->space_size);  | 
728  | 0  |     }  | 
729  | 28  |     row->kern_size = 0.0f;  | 
730  | 28  |     row->space_threshold = 0;  | 
731  | 28  |     row->space_size = 0.0f;  | 
732  | 28  |     return false;  | 
733  | 28  |   }  | 
734  |  |  | 
735  | 8.10k  |   if (tosp_debug_level > 5) { | 
736  | 0  |     tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size, | 
737  | 0  |             row->space_threshold, row->space_size);  | 
738  | 0  |   }  | 
739  | 8.10k  |   return true;  | 
740  | 8.13k  | }  | 
741  |  |  | 
742  | 155k  | int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) { | 
743  | 155k  |   int16_t index;  | 
744  | 155k  |   int16_t total = 0;  | 
745  |  |  | 
746  | 925k  |   for (index = 0; index < threshold; index++) { | 
747  | 770k  |     total += stats->pile_count(index);  | 
748  | 770k  |   }  | 
749  | 155k  |   return total;  | 
750  | 155k  | }  | 
751  |  |  | 
752  |  | /*************************************************************************  | 
753  |  |  * improve_row_threshold()  | 
754  |  |  *    Try to recognise a "normal line" -  | 
755  |  |  *           > 25 gaps  | 
756  |  |  *     &&    space > 3 * kn  && space > 10  | 
757  |  |  *              (I.e. reasonably large space and kn:sp ratio)  | 
758  |  |  *     &&    > 3/4 # gaps < kn + (sp - kn)/3  | 
759  |  |  *              (I.e. most gaps are well away from space estimate)  | 
760  |  |  *     &&    a gap of max(3, (sp - kn) / 3) empty histogram positions is found  | 
761  |  |  *           somewhere in the histogram between kn and sp  | 
762  |  |  *     THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies  | 
763  |  |  *          NO!!!!! the bristol line has "11" with a gap of 12 between the  | 
764  |  |  *1's!!! try moving the default threshold to within this band but leave the  | 
765  |  |  *          fuzzy limit calculation as at present.  | 
766  |  |  *************************************************************************/  | 
767  | 0  | void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) { | 
768  | 0  |   float sp = row->space_size;  | 
769  | 0  |   float kn = row->kern_size;  | 
770  | 0  |   int16_t reqd_zero_width = 0;  | 
771  | 0  |   int16_t zero_width = 0;  | 
772  | 0  |   int16_t zero_start = 0;  | 
773  | 0  |   int16_t index = 0;  | 
774  |  | 
  | 
775  | 0  |   if (tosp_debug_level > 10) { | 
776  | 0  |     tprintf("Improve row threshold 0"); | 
777  | 0  |   }  | 
778  | 0  |   if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) ||  | 
779  | 0  |       (stats_count_under(all_gap_stats, static_cast<int16_t>(ceil(kn + (sp - kn) / 3 + 0.5))) <  | 
780  | 0  |        (0.75 * all_gap_stats->get_total()))) { | 
781  | 0  |     return;  | 
782  | 0  |   }  | 
783  | 0  |   if (tosp_debug_level > 10) { | 
784  | 0  |     tprintf(" 1"); | 
785  | 0  |   }  | 
786  |  |   /*  | 
787  |  | Look for the first region of all 0's in the histogram which is wider than  | 
788  |  | max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current  | 
789  |  | threshold is not within it, move the threshold so that is just inside it.  | 
790  |  | */  | 
791  | 0  |   reqd_zero_width = static_cast<int16_t>(floor((sp - kn) / 3 + 0.5));  | 
792  | 0  |   if (reqd_zero_width < 3) { | 
793  | 0  |     reqd_zero_width = 3;  | 
794  | 0  |   }  | 
795  |  | 
  | 
796  | 0  |   for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) { | 
797  | 0  |     if (all_gap_stats->pile_count(index) == 0) { | 
798  | 0  |       if (zero_width == 0) { | 
799  | 0  |         zero_start = index;  | 
800  | 0  |       }  | 
801  | 0  |       zero_width++;  | 
802  | 0  |     } else { | 
803  | 0  |       if (zero_width >= reqd_zero_width) { | 
804  | 0  |         break;  | 
805  | 0  |       } else { | 
806  | 0  |         zero_width = 0;  | 
807  | 0  |       }  | 
808  | 0  |     }  | 
809  | 0  |   }  | 
810  | 0  |   index--;  | 
811  | 0  |   if (tosp_debug_level > 10) { | 
812  | 0  |     tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", reqd_zero_width, | 
813  | 0  |             zero_width, zero_start, row->space_threshold);  | 
814  | 0  |   }  | 
815  | 0  |   if ((zero_width < reqd_zero_width) ||  | 
816  | 0  |       ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) { | 
817  | 0  |     return;  | 
818  | 0  |   }  | 
819  | 0  |   if (tosp_debug_level > 10) { | 
820  | 0  |     tprintf(" 2"); | 
821  | 0  |   }  | 
822  | 0  |   if (row->space_threshold < zero_start) { | 
823  | 0  |     if (tosp_debug_level > 5) { | 
824  | 0  |       tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n", kn, sp, zero_start, | 
825  | 0  |               index, row->space_threshold, zero_start);  | 
826  | 0  |     }  | 
827  | 0  |     row->space_threshold = zero_start;  | 
828  | 0  |   }  | 
829  | 0  |   if (row->space_threshold > index) { | 
830  | 0  |     if (tosp_debug_level > 5) { | 
831  | 0  |       tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n", kn, sp, zero_start, | 
832  | 0  |               index, row->space_threshold, index);  | 
833  | 0  |     }  | 
834  | 0  |     row->space_threshold = index;  | 
835  | 0  |   }  | 
836  | 0  | }  | 
837  |  |  | 
/**********************************************************************
 * make_prop_words
 *
 * Convert a TO_ROW to a ROW.
 *
 * Walks the row's blob list once, collecting C_BLOBs into a pending list and
 * emitting a WERD each time a word break is decided (by make_a_word_break,
 * by reaching a pending repeated-character word, or by wrapping round to the
 * first blob). Words from row->rep_words are extracted and spliced into the
 * output at the points where the walk passes their right edge.
 *
 * Returns a ROW allocated with new, or nullptr when the row's blob list is
 * empty. The rotation argument is not referenced in this function's body.
 **********************************************************************/
ROW *Textord::make_prop_words(TO_ROW *row,    // row to make
                              FCOORD rotation // for drawing
) {
  bool bol; // start of line
  /* prev_ values are for start of word being built. non prev_ values are for
the gap between the word being built and the next one. */
  bool prev_fuzzy_sp;     // probably space
  bool prev_fuzzy_non;    // probably not
  uint8_t prev_blanks;    // in front of word
  bool fuzzy_sp = false;  // probably space
  bool fuzzy_non = false; // probably not
  uint8_t blanks = 0;     // in front of word
  bool prev_gap_was_a_space = false;
  bool break_at_next_gap = false;
  ROW *real_row; // output row
  C_OUTLINE_IT cout_it;
  C_BLOB_LIST cblobs; // blobs collected for the word currently being built
  C_BLOB_IT cblob_it = &cblobs;
  WERD_LIST words; // words made so far, moved into real_row at the end
  WERD *word; // new word
  // Right edge of the next pending repeated-char word; INT32_MAX when none.
  int32_t next_rep_char_word_right = INT32_MAX;
  float repetition_spacing; // gap between repetitions
  // row ends; xstarts[1] is only ever written in this function
  int32_t xstarts[2];
  int32_t prev_x;           // end of prev blob
  BLOBNBOX_IT box_it;       // iterator
  TBOX prev_blob_box;
  TBOX next_blob_box;
  int16_t prev_gap = INT16_MAX;
  int16_t current_gap = INT16_MAX;
  int16_t next_gap = INT16_MAX;
  int16_t prev_within_xht_gap = INT16_MAX;
  int16_t current_within_xht_gap = INT16_MAX;
  int16_t next_within_xht_gap = INT16_MAX;
  int16_t word_count = 0; // only reported in the debug output below

  // repeated char words
  WERD_IT rep_char_it(&(row->rep_words));
  if (!rep_char_it.empty()) {
    next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
  }

  prev_x = -INT16_MAX;
  cblob_it.set_to_list(&cblobs);
  box_it.set_to_list(row->blob_list());
  // new words
  WERD_IT word_it(&words);
  bol = true;
  prev_blanks = 0;
  prev_fuzzy_sp = false;
  prev_fuzzy_non = false;
  if (!box_it.empty()) {
    xstarts[0] = box_it.data()->bounding_box().left();
    if (xstarts[0] > next_rep_char_word_right) {
      /* We need to insert a repeated char word at the start of the row */
      word = rep_char_it.extract();
      word_it.add_after_then_move(word);
      /* Set spaces before repeated char word */
      word->set_flag(W_BOL, true);
      bol = false;
      word->set_blanks(0);
      // NO uncertainty
      word->set_flag(W_FUZZY_SP, false);
      word->set_flag(W_FUZZY_NON, false);
      xstarts[0] = word->bounding_box().left();
      /* Set spaces after repeated char word (and leave current word set) */
      repetition_spacing = find_mean_blob_spacing(word);
      current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;
      current_within_xht_gap = current_gap;
      // A gap wider than tosp_rep_space * repetition_spacing becomes
      // floor(gap / space_size) blanks, but never fewer than one.
      if (current_gap > tosp_rep_space * repetition_spacing) {
        prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
        if (prev_blanks < 1) {
          prev_blanks = 1;
        }
      } else {
        prev_blanks = 0;
      }
      if (tosp_debug_level > 5) {
        tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f;  Rgap:%d  ",
                box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),
                repetition_spacing, current_gap);
      }
      prev_fuzzy_sp = false;
      prev_fuzzy_non = false;
      // Advance to the next repeated-char word, if any remain.
      if (rep_char_it.empty()) {
        next_rep_char_word_right = INT32_MAX;
      } else {
        rep_char_it.forward();
        next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
      }
    }

    // NOTE(review): next_blob_box, next_gap and next_within_xht_gap look like
    // output arguments of peek_at_next_gap - confirm against its declaration.
    peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
    do {
      auto bblob = box_it.data();
      auto blob_box = bblob->bounding_box();
      if (bblob->joined_to_prev()) {
        // Append this blob's outlines to the most recently collected C_BLOB,
        // then discard the emptied C_BLOB.
        auto cblob = bblob->remove_cblob();
        if (cblob != nullptr) {
          cout_it.set_to_list(cblob_it.data()->out_list());
          cout_it.move_to_last();
          cout_it.add_list_after(cblob->out_list());
          delete cblob;
        }
      } else {
        // Collect the C_BLOB for the word being built; the BLOBNBOX is told
        // that it no longer owns it.
        auto cblob = bblob->cblob();
        if (cblob != nullptr) {
          bblob->set_owns_cblob(false);
          cblob_it.add_after_then_move(cblob);
        }
        prev_x = blob_box.right();
      }
      box_it.forward(); // next one
      // From here on bblob / blob_box describe the blob AFTER the one just
      // collected.
      bblob = box_it.data();
      blob_box = bblob->bounding_box();

      if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
        /* Real Blob - not multiple outlines or pre-chopped */
        // Shift the gap window along by one: current -> prev, next -> current.
        prev_gap = current_gap;
        prev_within_xht_gap = current_within_xht_gap;
        prev_blob_box = next_blob_box;
        current_gap = next_gap;
        current_within_xht_gap = next_within_xht_gap;
        peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);

        int16_t prev_gap_arg = prev_gap;
        int16_t next_gap_arg = next_gap;
        if (tosp_only_use_xht_gaps) {
          prev_gap_arg = prev_within_xht_gap;
          next_gap_arg = next_within_xht_gap;
        }
        // Decide if a word-break should be inserted
        // NOTE(review): blanks, fuzzy_sp, fuzzy_non, prev_gap_was_a_space and
        // break_at_next_gap are presumably updated by make_a_word_break via
        // reference parameters - confirm against its declaration.
        if (blob_box.left() > next_rep_char_word_right ||
            make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,
                              current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,
                              fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||
            box_it.at_first()) {
          /* Form a new word out of the blobs collected */
          word = new WERD(&cblobs, prev_blanks, nullptr);
          word_count++;
          word_it.add_after_then_move(word);
          if (bol) {
            word->set_flag(W_BOL, true);
            bol = false;
          }
          // At most one fuzzy flag is set, with "probably space" taking
          // precedence over "probably not".
          if (prev_fuzzy_sp) {
            // probably space
            word->set_flag(W_FUZZY_SP, true);
          } else if (prev_fuzzy_non) {
            // probably not
            word->set_flag(W_FUZZY_NON, true);
          }

          if (blob_box.left() > next_rep_char_word_right) {
            /* We need to insert a repeated char word */
            // From here `word` refers to the repeated char word, not the word
            // just built from cblobs.
            word = rep_char_it.extract();
            word_it.add_after_then_move(word);

            /* Set spaces before repeated char word */
            repetition_spacing = find_mean_blob_spacing(word);
            current_gap = word->bounding_box().left() - prev_x;
            current_within_xht_gap = current_gap;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
              if (blanks < 1) {
                blanks = 1;
              }
            } else {
              blanks = 0;
            }
            if (tosp_debug_level > 5) {
              tprintf("Repch wd (%d,%d) rep gap %5.2f;  Lgap:%d (%d blanks);",
                      word->bounding_box().left(), word->bounding_box().bottom(),
                      repetition_spacing, current_gap, blanks);
            }
            word->set_blanks(blanks);
            // NO uncertainty
            word->set_flag(W_FUZZY_SP, false);
            word->set_flag(W_FUZZY_NON, false);

            /* Set spaces after repeated char word (and leave current word set)
             */
            current_gap = blob_box.left() - next_rep_char_word_right;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              // Unlike the other blank computations in this function, this
              // one converts without an explicit std::floor.
              blanks = static_cast<uint8_t>(current_gap / row->space_size);
              if (blanks < 1) {
                blanks = 1;
              }
            } else {
              blanks = 0;
            }
            if (tosp_debug_level > 5) {
              tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks);
            }
            fuzzy_sp = false;
            fuzzy_non = false;

            if (rep_char_it.empty()) {
              next_rep_char_word_right = INT32_MAX;
            } else {
              rep_char_it.forward();
              next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
            }
          }

          if (box_it.at_first() && rep_char_it.empty()) {
            // at end of line
            word->set_flag(W_EOL, true);
            xstarts[1] = prev_x;
          } else {
            // Carry the gap decision forward: it describes what precedes the
            // next word to be built.
            prev_blanks = blanks;
            prev_fuzzy_sp = fuzzy_sp;
            prev_fuzzy_non = fuzzy_non;
          }
        }
      }
    } while (!box_it.at_first()); // until back at start

    /* Insert any further repeated char words */
    while (!rep_char_it.empty()) {
      word = rep_char_it.extract();
      word_it.add_after_then_move(word);

      /* Set spaces before repeated char word */
      repetition_spacing = find_mean_blob_spacing(word);
      current_gap = word->bounding_box().left() - prev_x;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
        if (blanks < 1) {
          blanks = 1;
        }
      } else {
        blanks = 0;
      }
      if (tosp_debug_level > 5) {
        tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
                word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing,
                current_gap, blanks);
      }
      word->set_blanks(blanks);
      // NO uncertainty
      word->set_flag(W_FUZZY_SP, false);
      word->set_flag(W_FUZZY_NON, false);
      prev_x = word->bounding_box().right();
      if (rep_char_it.empty()) {
        // at end of line
        word->set_flag(W_EOL, true);
        xstarts[1] = prev_x;
      } else {
        rep_char_it.forward();
      }
    }
    // Build the output row; kern_size and space_size are truncated to int16_t.
    real_row =
        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
    word_it.set_to_list(real_row->word_list());
    // put words in row
    word_it.add_list_after(&words);
    real_row->recalc_bounding_box();

    if (tosp_debug_level > 4) {
      tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
              real_row->bounding_box().left(), real_row->bounding_box().bottom(),
              real_row->bounding_box().right(), real_row->bounding_box().top());
    }
    return real_row;
  }
  // The row has no blobs: there is nothing to convert.
  return nullptr;
}
1110  |  |  | 
1111  |  | /**********************************************************************  | 
1112  |  |  * make_blob_words  | 
1113  |  |  *  | 
1114  |  |  * Converts words into blobs so that each blob is a single character.  | 
1115  |  |  *  Used for chopper test.  | 
1116  |  |  **********************************************************************/  | 
1117  |  | ROW *Textord::make_blob_words(TO_ROW *row,    // row to make  | 
1118  |  |                               FCOORD rotation // for drawing  | 
1119  | 0  | ) { | 
1120  | 0  |   bool bol;      // start of line  | 
1121  | 0  |   ROW *real_row; // output row  | 
1122  | 0  |   C_OUTLINE_IT cout_it;  | 
1123  | 0  |   C_BLOB_LIST cblobs;  | 
1124  | 0  |   C_BLOB_IT cblob_it = &cblobs;  | 
1125  | 0  |   WERD_LIST words;  | 
1126  | 0  |   WERD *word;         // new word  | 
1127  | 0  |   BLOBNBOX_IT box_it; // iterator  | 
1128  | 0  |   int16_t word_count = 0;  | 
1129  |  | 
  | 
1130  | 0  |   cblob_it.set_to_list(&cblobs);  | 
1131  | 0  |   box_it.set_to_list(row->blob_list());  | 
1132  |  |   // new words  | 
1133  | 0  |   WERD_IT word_it(&words);  | 
1134  | 0  |   bol = true;  | 
1135  | 0  |   if (!box_it.empty()) { | 
1136  | 0  |     do { | 
1137  | 0  |       auto bblob = box_it.data();  | 
1138  | 0  |       auto blob_box = bblob->bounding_box();  | 
1139  | 0  |       if (bblob->joined_to_prev()) { | 
1140  | 0  |         auto cblob = bblob->remove_cblob();  | 
1141  | 0  |         if (cblob != nullptr) { | 
1142  | 0  |           cout_it.set_to_list(cblob_it.data()->out_list());  | 
1143  | 0  |           cout_it.move_to_last();  | 
1144  | 0  |           cout_it.add_list_after(cblob->out_list());  | 
1145  | 0  |           delete cblob;  | 
1146  | 0  |         }  | 
1147  | 0  |       } else { | 
1148  | 0  |         auto cblob = bblob->cblob();  | 
1149  | 0  |         if (cblob != nullptr) { | 
1150  | 0  |           bblob->set_owns_cblob(false);  | 
1151  | 0  |           cblob_it.add_after_then_move(cblob);  | 
1152  | 0  |         }  | 
1153  | 0  |       }  | 
1154  | 0  |       box_it.forward(); // next one  | 
1155  | 0  |       bblob = box_it.data();  | 
1156  | 0  |       blob_box = bblob->bounding_box();  | 
1157  |  | 
  | 
1158  | 0  |       if (!bblob->joined_to_prev() && !cblobs.empty()) { | 
1159  | 0  |         word = new WERD(&cblobs, 1, nullptr);  | 
1160  | 0  |         word_count++;  | 
1161  | 0  |         word_it.add_after_then_move(word);  | 
1162  | 0  |         if (bol) { | 
1163  | 0  |           word->set_flag(W_BOL, true);  | 
1164  | 0  |           bol = false;  | 
1165  | 0  |         }  | 
1166  | 0  |         if (box_it.at_first()) { // at end of line | 
1167  | 0  |           word->set_flag(W_EOL, true);  | 
1168  | 0  |         }  | 
1169  | 0  |       }  | 
1170  | 0  |     } while (!box_it.at_first()); // until back at start  | 
1171  |  |     /* Setup the row with created words. */  | 
1172  | 0  |     real_row =  | 
1173  | 0  |         new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));  | 
1174  | 0  |     word_it.set_to_list(real_row->word_list());  | 
1175  |  |     // put words in row  | 
1176  | 0  |     word_it.add_list_after(&words);  | 
1177  | 0  |     real_row->recalc_bounding_box();  | 
1178  | 0  |     if (tosp_debug_level > 4) { | 
1179  | 0  |       tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count, | 
1180  | 0  |               real_row->bounding_box().left(), real_row->bounding_box().bottom(),  | 
1181  | 0  |               real_row->bounding_box().right(), real_row->bounding_box().top());  | 
1182  | 0  |     }  | 
1183  | 0  |     return real_row;  | 
1184  | 0  |   }  | 
1185  | 0  |   return nullptr;  | 
1186  | 0  | }  | 
1187  |  |  | 
1188  |  | bool Textord::make_a_word_break(TO_ROW *row,   // row being made  | 
1189  |  |                                 TBOX blob_box, // for next_blob // how many blanks?  | 
1190  |  |                                 int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,  | 
1191  |  |                                 int16_t within_xht_current_gap, TBOX next_blob_box,  | 
1192  |  |                                 int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,  | 
1193  | 1.23M  |                                 bool &prev_gap_was_a_space, bool &break_at_next_gap) { | 
1194  | 1.23M  |   bool space;  | 
1195  | 1.23M  |   int16_t current_gap;  | 
1196  | 1.23M  |   float fuzzy_sp_to_kn_limit;  | 
1197  |  |  | 
1198  | 1.23M  |   if (break_at_next_gap) { | 
1199  | 0  |     break_at_next_gap = false;  | 
1200  | 0  |     return true;  | 
1201  | 0  |   }  | 
1202  |  |   /* Inhibit using the reduced gap if  | 
1203  |  |   The kerning is large - chars are not kerned and reducing "f"s can cause  | 
1204  |  |   erroneous blanks  | 
1205  |  | OR  The real gap is less than 0  | 
1206  |  | OR  The real gap is less than the kerning estimate  | 
1207  |  | */  | 
1208  | 1.23M  |   if ((row->kern_size > tosp_large_kerning * row->xheight) ||  | 
1209  | 1.23M  |       ((tosp_dont_fool_with_small_kerns >= 0) &&  | 
1210  | 972k  |        (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) { | 
1211  |  |     // Ignore the difference  | 
1212  | 266k  |     within_xht_current_gap = real_current_gap;  | 
1213  | 266k  |   }  | 
1214  |  |  | 
1215  | 1.23M  |   if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) { | 
1216  | 0  |     current_gap = within_xht_current_gap;  | 
1217  | 1.23M  |   } else { | 
1218  | 1.23M  |     current_gap = real_current_gap;  | 
1219  | 1.23M  |   }  | 
1220  |  |  | 
1221  | 1.23M  |   if (tosp_old_to_method) { | 
1222  |  |     // Boring old method  | 
1223  | 0  |     space = current_gap > row->max_nonspace;  | 
1224  | 0  |     if (space && (current_gap < INT16_MAX)) { | 
1225  | 0  |       if (current_gap < row->min_space) { | 
1226  | 0  |         if (current_gap > row->space_threshold) { | 
1227  | 0  |           blanks = 1;  | 
1228  | 0  |           fuzzy_sp = true;  | 
1229  | 0  |           fuzzy_non = false;  | 
1230  | 0  |         } else { | 
1231  | 0  |           blanks = 0;  | 
1232  | 0  |           fuzzy_sp = false;  | 
1233  | 0  |           fuzzy_non = true;  | 
1234  | 0  |         }  | 
1235  | 0  |       } else { | 
1236  | 0  |         if (row->space_size == 0.0f) { | 
1237  |  |           // Avoid FP division by 0.  | 
1238  | 0  |           blanks = 1;  | 
1239  | 0  |         } else { | 
1240  | 0  |           blanks = static_cast<uint8_t>(current_gap / row->space_size);  | 
1241  | 0  |           if (blanks < 1) { | 
1242  | 0  |             blanks = 1;  | 
1243  | 0  |           }  | 
1244  | 0  |         }  | 
1245  | 0  |         fuzzy_sp = false;  | 
1246  | 0  |         fuzzy_non = false;  | 
1247  | 0  |       }  | 
1248  | 0  |     }  | 
1249  | 0  |     return space;  | 
1250  | 1.23M  |   } else { | 
1251  |  |     /* New exciting heuristic method */  | 
1252  | 1.23M  |     if (prev_blob_box.null_box()) { // Beginning of row | 
1253  | 94  |       prev_gap_was_a_space = true;  | 
1254  | 94  |     }  | 
1255  |  |  | 
1256  |  |     // Default as old TO  | 
1257  | 1.23M  |     space = current_gap > row->space_threshold;  | 
1258  |  |  | 
1259  |  |     /* Set defaults for the word break in case we find one.  Currently there are  | 
1260  |  | no fuzzy spaces. Depending on the reliability of the different heuristics  | 
1261  |  | we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY  | 
1262  |  | be used if the function returns true - ie the word is to be broken.  | 
1263  |  | */  | 
1264  | 1.23M  |     int num_blanks = current_gap;  | 
1265  | 1.23M  |     if (row->space_size > 1.0f) { | 
1266  | 1.23M  |       num_blanks = IntCastRounded(current_gap / row->space_size);  | 
1267  | 1.23M  |     }  | 
1268  | 1.23M  |     blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));  | 
1269  | 1.23M  |     fuzzy_sp = false;  | 
1270  | 1.23M  |     fuzzy_non = false;  | 
1271  |  |     /*  | 
1272  |  | If xht measure causes gap to flip one of the 3 thresholds act accordingly -  | 
1273  |  | despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to  | 
1274  |  | context.  | 
1275  |  | */  | 
1276  | 1.23M  |     if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) &&  | 
1277  | 1.23M  |         (within_xht_current_gap > row->max_nonspace)) { | 
1278  | 2.01k  |       space = true;  | 
1279  | 2.01k  |       fuzzy_non = true;  | 
1280  |  | #ifndef GRAPHICS_DISABLED  | 
1281  |  |       mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1282  |  |                next_gap);  | 
1283  |  | #endif  | 
1284  | 1.23M  |     } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) &&  | 
1285  | 1.23M  |                (within_xht_current_gap > row->space_threshold)) { | 
1286  | 262  |       space = true;  | 
1287  | 262  |       if (tosp_flip_fuzz_kn_to_sp) { | 
1288  | 262  |         fuzzy_sp = true;  | 
1289  | 262  |       } else { | 
1290  | 0  |         fuzzy_non = true;  | 
1291  | 0  |       }  | 
1292  |  | #ifndef GRAPHICS_DISABLED  | 
1293  |  |       mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1294  |  |                next_gap);  | 
1295  |  | #endif  | 
1296  | 1.23M  |     } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) &&  | 
1297  | 1.23M  |                (within_xht_current_gap >= row->min_space)) { | 
1298  | 268  |       space = true;  | 
1299  |  | #ifndef GRAPHICS_DISABLED  | 
1300  |  |       mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1301  |  |                next_gap);  | 
1302  |  | #endif  | 
1303  | 1.23M  |     } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) &&  | 
1304  | 1.23M  |                suspected_punct_blob(row, blob_box)) { | 
1305  | 0  |       break_at_next_gap = true;  | 
1306  | 0  |     }  | 
1307  |  |     /* Now continue with normal heuristics */  | 
1308  | 1.23M  |     else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) { | 
1309  |  |       /* Heuristics to turn dubious spaces to kerns */  | 
1310  | 43.0k  |       if (tosp_pass_wide_fuzz_sp_to_context > 0) { | 
1311  | 43.0k  |         fuzzy_sp_to_kn_limit =  | 
1312  | 43.0k  |             row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size);  | 
1313  | 43.0k  |       } else { | 
1314  | 0  |         fuzzy_sp_to_kn_limit = 99999.0f;  | 
1315  | 0  |       }  | 
1316  |  |  | 
1317  |  |       /* If current gap is significantly smaller than the previous space the  | 
1318  |  | other side of a narrow blob then this gap is a kern. */  | 
1319  | 43.0k  |       if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space &&  | 
1320  | 43.0k  |           (current_gap <= tosp_gap_factor * prev_gap)) { | 
1321  | 1.52k  |         if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { | 
1322  | 351  |           if (tosp_flip_fuzz_sp_to_kn) { | 
1323  | 351  |             fuzzy_non = true;  | 
1324  | 351  |           } else { | 
1325  | 0  |             fuzzy_sp = true;  | 
1326  | 0  |           }  | 
1327  | 1.17k  |         } else { | 
1328  | 1.17k  |           space = false;  | 
1329  | 1.17k  |         }  | 
1330  |  | #ifndef GRAPHICS_DISABLED  | 
1331  |  |         mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1332  |  |                  next_gap);  | 
1333  |  | #endif  | 
1334  | 1.52k  |       }  | 
1335  |  |       /* If current gap not much bigger than the previous kern the other side of  | 
1336  |  | a narrow blob then this gap is a kern as well */  | 
1337  | 41.5k  |       else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) &&  | 
1338  | 41.5k  |                !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) { | 
1339  | 4.75k  |         if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { | 
1340  | 1.04k  |           if (tosp_flip_fuzz_sp_to_kn) { | 
1341  | 1.04k  |             fuzzy_non = true;  | 
1342  | 1.04k  |           } else { | 
1343  | 0  |             fuzzy_sp = true;  | 
1344  | 0  |           }  | 
1345  | 3.71k  |         } else { | 
1346  | 3.71k  |           space = false;  | 
1347  | 3.71k  |         }  | 
1348  |  | #ifndef GRAPHICS_DISABLED  | 
1349  |  |         mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1350  |  |                  next_gap);  | 
1351  |  | #endif  | 
1352  | 36.7k  |       } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&  | 
1353  | 36.7k  |                  (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) { | 
1354  | 4.29k  |         if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { | 
1355  | 1.10k  |           if (tosp_flip_fuzz_sp_to_kn) { | 
1356  | 1.10k  |             fuzzy_non = true;  | 
1357  | 1.10k  |           } else { | 
1358  | 0  |             fuzzy_sp = true;  | 
1359  | 0  |           }  | 
1360  | 3.19k  |         } else { | 
1361  | 3.19k  |           space = false;  | 
1362  | 3.19k  |         }  | 
1363  |  | #ifndef GRAPHICS_DISABLED  | 
1364  |  |         mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1365  |  |                  next_gap);  | 
1366  |  | #endif  | 
1367  | 32.4k  |       } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&  | 
1368  | 32.4k  |                  (next_gap <= row->space_threshold) &&  | 
1369  | 32.4k  |                  (current_gap * tosp_gap_factor <= next_gap)) { | 
1370  | 181  |         if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { | 
1371  | 31  |           if (tosp_flip_fuzz_sp_to_kn) { | 
1372  | 31  |             fuzzy_non = true;  | 
1373  | 31  |           } else { | 
1374  | 0  |             fuzzy_sp = true;  | 
1375  | 0  |           }  | 
1376  | 150  |         } else { | 
1377  | 150  |           space = false;  | 
1378  | 150  |         }  | 
1379  |  | #ifndef GRAPHICS_DISABLED  | 
1380  |  |         mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1381  |  |                  next_gap);  | 
1382  |  | #endif  | 
1383  | 32.2k  |       } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) ||  | 
1384  | 32.2k  |                   ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) { | 
1385  | 27.8k  |         fuzzy_sp = true;  | 
1386  |  | #ifndef GRAPHICS_DISABLED  | 
1387  |  |         mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1388  |  |                  next_gap);  | 
1389  |  | #endif  | 
1390  | 27.8k  |       }  | 
1391  | 1.19M  |     } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) { | 
1392  |  |       /* Heuristics to turn dubious kerns to spaces */  | 
1393  |  |       /* TRIED THIS BUT IT MADE THINGS WORSE  | 
1394  |  |     if (prev_gap == INT16_MAX)  | 
1395  |  |       prev_gap = 0;  // start of row  | 
1396  |  |     if (next_gap == INT16_MAX)  | 
1397  |  |       next_gap = 0;  // end of row  | 
1398  |  | */  | 
1399  | 35.7k  |       if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) &&  | 
1400  | 35.7k  |           (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&  | 
1401  | 35.7k  |           wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) { | 
1402  | 591  |         space = true;  | 
1403  |  |         /*  | 
1404  |  | tosp_flip_caution is an attempt to stop the default changing in cases  | 
1405  |  | where there is a large difference between the kern and space estimates.  | 
1406  |  |   See problem in 'chiefs' where "have" gets split in the quotation.  | 
1407  |  | */  | 
1408  | 591  |         if ((tosp_flip_fuzz_kn_to_sp) &&  | 
1409  | 591  |             ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) { | 
1410  | 591  |           fuzzy_sp = true;  | 
1411  | 591  |         } else { | 
1412  | 0  |           fuzzy_non = true;  | 
1413  | 0  |         }  | 
1414  |  | #ifndef GRAPHICS_DISABLED  | 
1415  |  |         mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1416  |  |                  next_gap);  | 
1417  |  | #endif  | 
1418  | 35.1k  |       } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 &&  | 
1419  | 35.1k  |                  current_gap > 5 && // Rule 9 handles small gap, big ratio.  | 
1420  | 35.1k  |                  current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&  | 
1421  | 35.1k  |                  !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) &&  | 
1422  | 35.1k  |                  !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) { | 
1423  | 216  |         space = true;  | 
1424  | 216  |         fuzzy_non = true;  | 
1425  |  | #ifndef GRAPHICS_DISABLED  | 
1426  |  |         mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1427  |  |                  next_gap);  | 
1428  |  | #endif  | 
1429  | 34.9k  |       } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) &&  | 
1430  | 34.9k  |                  (next_blob_box.width() > 0) &&  | 
1431  | 34.9k  |                  (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&  | 
1432  | 34.9k  |                  (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) &&  | 
1433  | 14.9k  |                                               !suspected_punct_blob(row, next_blob_box)))) { | 
1434  | 14.9k  |         space = true;  | 
1435  | 14.9k  |         fuzzy_non = true;  | 
1436  |  | #ifndef GRAPHICS_DISABLED  | 
1437  |  |         mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),  | 
1438  |  |                  next_gap);  | 
1439  |  | #endif  | 
1440  | 14.9k  |       }  | 
1441  | 35.7k  |     }  | 
1442  | 1.23M  |     if (tosp_debug_level > 10) { | 
1443  | 0  |       tprintf(  | 
1444  | 0  |           "word break = %d current_gap = %d, prev_gap = %d, "  | 
1445  | 0  |           "next_gap = %d\n",  | 
1446  | 0  |           space ? 1 : 0, current_gap, prev_gap, next_gap);  | 
1447  | 0  |     }  | 
1448  | 1.23M  |     prev_gap_was_a_space = space && !(fuzzy_non);  | 
1449  | 1.23M  |     return space;  | 
1450  | 1.23M  |   }  | 
1451  | 1.23M  | }  | 
1452  |  |  | 
1453  | 262k  | bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) { | 
1454  | 262k  |   bool result;  | 
1455  | 262k  |   result =  | 
1456  | 262k  |       ((blob_box.width() <= tosp_narrow_fraction * row->xheight) ||  | 
1457  | 262k  |        ((static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio));  | 
1458  | 262k  |   return result;  | 
1459  | 262k  | }  | 
1460  |  |  | 
1461  | 227k  | bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) { | 
1462  | 227k  |   bool result;  | 
1463  | 227k  |   if (tosp_wide_fraction > 0) { | 
1464  | 227k  |     if (tosp_wide_aspect_ratio > 0) { | 
1465  | 0  |       result =  | 
1466  | 0  |           ((blob_box.width() >= tosp_wide_fraction * row->xheight) &&  | 
1467  | 0  |            ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio));  | 
1468  | 227k  |     } else { | 
1469  | 227k  |       result = (blob_box.width() >= tosp_wide_fraction * row->xheight);  | 
1470  | 227k  |     }  | 
1471  | 227k  |   } else { | 
1472  | 0  |     result = !narrow_blob(row, blob_box);  | 
1473  | 0  |   }  | 
1474  | 227k  |   return result;  | 
1475  | 227k  | }  | 
1476  |  |  | 
1477  | 2.13k  | bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) { | 
1478  | 2.13k  |   bool result;  | 
1479  | 2.13k  |   float baseline;  | 
1480  | 2.13k  |   float blob_x_centre;  | 
1481  |  |   /* Find baseline of centre of blob */  | 
1482  | 2.13k  |   blob_x_centre = (box.right() + box.left()) / 2.0;  | 
1483  | 2.13k  |   baseline = row->baseline.y(blob_x_centre);  | 
1484  |  |  | 
1485  | 2.13k  |   result = (box.height() <= 0.66 * row->xheight) || (box.top() < baseline + row->xheight / 2.0) ||  | 
1486  | 2.13k  |            (box.bottom() > baseline + row->xheight / 2.0);  | 
1487  | 2.13k  |   return result;  | 
1488  | 2.13k  | }  | 
1489  |  |  | 
1490  |  | void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box,  | 
1491  | 1.41M  |                                int16_t &next_gap, int16_t &next_within_xht_gap) { | 
1492  | 1.41M  |   TBOX next_reduced_blob_box;  | 
1493  | 1.41M  |   TBOX bit_beyond;  | 
1494  | 1.41M  |   BLOBNBOX_IT reduced_box_it = box_it;  | 
1495  |  |  | 
1496  | 1.41M  |   next_blob_box = box_next(&box_it);  | 
1497  | 1.41M  |   next_reduced_blob_box = reduced_box_next(row, &reduced_box_it);  | 
1498  | 1.41M  |   if (box_it.at_first()) { | 
1499  | 234k  |     next_gap = INT16_MAX;  | 
1500  | 234k  |     next_within_xht_gap = INT16_MAX;  | 
1501  | 1.17M  |   } else { | 
1502  | 1.17M  |     bit_beyond = box_it.data()->bounding_box();  | 
1503  | 1.17M  |     next_gap = bit_beyond.left() - next_blob_box.right();  | 
1504  | 1.17M  |     bit_beyond = reduced_box_next(row, &reduced_box_it);  | 
1505  | 1.17M  |     next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right();  | 
1506  | 1.17M  |   }  | 
1507  | 1.41M  | }  | 
1508  |  |  | 
1509  |  | #ifndef GRAPHICS_DISABLED  | 
1510  |  | void Textord::mark_gap(TBOX blob,    // blob following gap  | 
1511  |  |                        int16_t rule, // heuristic id  | 
1512  |  |                        int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,  | 
1513  |  |                        int16_t next_blob_width, int16_t next_gap) { | 
1514  |  |   ScrollView::Color col; // of ellipse marking flipped gap  | 
1515  |  |  | 
1516  |  |   switch (rule) { | 
1517  |  |     case 1:  | 
1518  |  |       col = ScrollView::RED;  | 
1519  |  |       break;  | 
1520  |  |     case 2:  | 
1521  |  |       col = ScrollView::CYAN;  | 
1522  |  |       break;  | 
1523  |  |     case 3:  | 
1524  |  |       col = ScrollView::GREEN;  | 
1525  |  |       break;  | 
1526  |  |     case 4:  | 
1527  |  |       col = ScrollView::BLACK;  | 
1528  |  |       break;  | 
1529  |  |     case 5:  | 
1530  |  |       col = ScrollView::MAGENTA;  | 
1531  |  |       break;  | 
1532  |  |     case 6:  | 
1533  |  |       col = ScrollView::BLUE;  | 
1534  |  |       break;  | 
1535  |  |  | 
1536  |  |     case 7:  | 
1537  |  |       col = ScrollView::WHITE;  | 
1538  |  |       break;  | 
1539  |  |     case 8:  | 
1540  |  |       col = ScrollView::YELLOW;  | 
1541  |  |       break;  | 
1542  |  |     case 9:  | 
1543  |  |       col = ScrollView::BLACK;  | 
1544  |  |       break;  | 
1545  |  |  | 
1546  |  |     case 20:  | 
1547  |  |       col = ScrollView::CYAN;  | 
1548  |  |       break;  | 
1549  |  |     case 21:  | 
1550  |  |       col = ScrollView::GREEN;  | 
1551  |  |       break;  | 
1552  |  |     case 22:  | 
1553  |  |       col = ScrollView::MAGENTA;  | 
1554  |  |       break;  | 
1555  |  |     default:  | 
1556  |  |       col = ScrollView::BLACK;  | 
1557  |  |   }  | 
1558  |  |   if (textord_show_initial_words) { | 
1559  |  |     to_win->Pen(col);  | 
1560  |  |     /*  if (rule < 20)  | 
1561  |  |     //interior_style(to_win, INT_SOLID, false);  | 
1562  |  |   else  | 
1563  |  |     //interior_style(to_win, INT_HOLLOW, true);*/  | 
1564  |  |     // x radius  | 
1565  |  |     to_win->Ellipse(current_gap / 2.0f,  | 
1566  |  |                     blob.height() / 2.0f, // y radius  | 
1567  |  |                                           // x centre  | 
1568  |  |                     blob.left() - current_gap / 2.0f,  | 
1569  |  |                     // y centre  | 
1570  |  |                     blob.bottom() + blob.height() / 2.0f);  | 
1571  |  |   }  | 
1572  |  |   if (tosp_debug_level > 5) { | 
1573  |  |     tprintf("  (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2, | 
1574  |  |             blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap);  | 
1575  |  |   }  | 
1576  |  | }  | 
1577  |  | #endif  | 
1578  |  |  | 
1579  | 0  | float Textord::find_mean_blob_spacing(WERD *word) { | 
1580  | 0  |   C_BLOB_IT cblob_it;  | 
1581  | 0  |   TBOX blob_box;  | 
1582  | 0  |   int32_t gap_sum = 0;  | 
1583  | 0  |   int16_t gap_count = 0;  | 
1584  | 0  |   int16_t prev_right;  | 
1585  |  | 
  | 
1586  | 0  |   cblob_it.set_to_list(word->cblob_list());  | 
1587  | 0  |   if (!cblob_it.empty()) { | 
1588  | 0  |     cblob_it.mark_cycle_pt();  | 
1589  | 0  |     prev_right = cblob_it.data()->bounding_box().right();  | 
1590  |  |     // first blob  | 
1591  | 0  |     cblob_it.forward();  | 
1592  | 0  |     for (; !cblob_it.cycled_list(); cblob_it.forward()) { | 
1593  | 0  |       blob_box = cblob_it.data()->bounding_box();  | 
1594  | 0  |       gap_sum += blob_box.left() - prev_right;  | 
1595  | 0  |       gap_count++;  | 
1596  | 0  |       prev_right = blob_box.right();  | 
1597  | 0  |     }  | 
1598  | 0  |   }  | 
1599  | 0  |   if (gap_count > 0) { | 
1600  | 0  |     return (gap_sum / static_cast<float>(gap_count));  | 
1601  | 0  |   } else { | 
1602  | 0  |     return 0.0f;  | 
1603  | 0  |   }  | 
1604  | 0  | }  | 
1605  |  |  | 
1606  |  | bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left,  | 
1607  | 2.48M  |                              int16_t right) { | 
1608  | 2.48M  |   int16_t gap = right - left + 1;  | 
1609  |  |  | 
1610  | 2.48M  |   if (tosp_ignore_big_gaps > 999) { | 
1611  | 0  |     return false; // Don't ignore  | 
1612  | 0  |   }  | 
1613  | 2.48M  |   if (tosp_ignore_big_gaps > 0) { | 
1614  | 0  |     return (gap > tosp_ignore_big_gaps * row->xheight);  | 
1615  | 0  |   }  | 
1616  | 2.48M  |   if (gap > tosp_ignore_very_big_gaps * row->xheight) { | 
1617  | 18.7k  |     return true;  | 
1618  | 18.7k  |   }  | 
1619  | 2.47M  |   if (tosp_ignore_big_gaps == 0) { | 
1620  | 0  |     if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) { | 
1621  | 0  |       return true;  | 
1622  | 0  |     }  | 
1623  | 0  |     if ((gap > 1.75 * row->xheight) &&  | 
1624  | 0  |         ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) { | 
1625  | 0  |       return true;  | 
1626  | 0  |     }  | 
1627  | 2.47M  |   } else { | 
1628  |  |     /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table  | 
1629  |  |      */  | 
1630  | 2.47M  |     if ((gap > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) { | 
1631  | 11.8k  |       return true;  | 
1632  | 11.8k  |     }  | 
1633  | 2.47M  |   }  | 
1634  | 2.45M  |   return false;  | 
1635  | 2.47M  | }  | 
1636  |  |  | 
1637  |  | /**********************************************************************  | 
1638  |  |  * reduced_box_next  | 
1639  |  |  *  | 
1640  |  |  * Compute the bounding box of this blob with merging of x overlaps  | 
1641  |  |  * but no pre-chopping.  | 
1642  |  |  * Then move the iterator on to the start of the next blob.  | 
1643  |  |  * DON'T reduce the box for small things - eg punctuation.  | 
1644  |  |  **********************************************************************/  | 
1645  |  | TBOX Textord::reduced_box_next(TO_ROW *row,    // current row  | 
1646  |  |                                BLOBNBOX_IT *it // iterator to blobds  | 
1647  | 6.51M  | ) { | 
1648  | 6.51M  |   BLOBNBOX *blob;             // current blob  | 
1649  | 6.51M  |   BLOBNBOX *head_blob;        // place to store box  | 
1650  | 6.51M  |   TBOX full_box;              // full blob boundg box  | 
1651  | 6.51M  |   TBOX reduced_box;           // box of significant part  | 
1652  | 6.51M  |   int16_t left_above_xht;     // ABOVE xht left limit  | 
1653  | 6.51M  |   int16_t new_left_above_xht; // ABOVE xht left limit  | 
1654  |  |  | 
1655  | 6.51M  |   blob = it->data();  | 
1656  | 6.51M  |   if (blob->red_box_set()) { | 
1657  | 5.27M  |     reduced_box = blob->reduced_box();  | 
1658  | 10.7M  |     do { | 
1659  | 10.7M  |       it->forward();  | 
1660  | 10.7M  |       blob = it->data();  | 
1661  | 10.7M  |     } while (blob->cblob() == nullptr || blob->joined_to_prev());  | 
1662  | 5.27M  |     return reduced_box;  | 
1663  | 5.27M  |   }  | 
1664  | 1.23M  |   head_blob = blob;  | 
1665  | 1.23M  |   full_box = blob->bounding_box();  | 
1666  | 1.23M  |   reduced_box = reduced_box_for_blob(blob, row, &left_above_xht);  | 
1667  | 2.54M  |   do { | 
1668  | 2.54M  |     it->forward();  | 
1669  | 2.54M  |     blob = it->data();  | 
1670  | 2.54M  |     if (blob->cblob() == nullptr) { | 
1671  |  |       // was pre-chopped  | 
1672  | 233k  |       full_box += blob->bounding_box();  | 
1673  | 2.31M  |     } else if (blob->joined_to_prev()) { | 
1674  | 1.07M  |       reduced_box += reduced_box_for_blob(blob, row, &new_left_above_xht);  | 
1675  | 1.07M  |       left_above_xht = std::min(left_above_xht, new_left_above_xht);  | 
1676  | 1.07M  |     }  | 
1677  | 2.54M  |   }  | 
1678  |  |   // until next real blob  | 
1679  | 2.54M  |   while (blob->cblob() == nullptr || blob->joined_to_prev());  | 
1680  |  |  | 
1681  | 1.23M  |   if ((reduced_box.width() > 0) &&  | 
1682  | 1.23M  |       ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) &&  | 
1683  | 1.23M  |       (reduced_box.height() > 0.7 * row->xheight)) { | 
1684  |  | #ifndef GRAPHICS_DISABLED  | 
1685  |  |     if (textord_show_initial_words) { | 
1686  |  |       reduced_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW);  | 
1687  |  |     }  | 
1688  |  | #endif  | 
1689  | 912k  |   } else { | 
1690  | 912k  |     reduced_box = full_box;  | 
1691  | 912k  |   }  | 
1692  | 1.23M  |   head_blob->set_reduced_box(reduced_box);  | 
1693  | 1.23M  |   return reduced_box;  | 
1694  | 6.51M  | }  | 
1695  |  |  | 
1696  |  | /*************************************************************************  | 
1697  |  |  * reduced_box_for_blob()  | 
1698  |  |  * Find box for blob which is the same height and y position as the whole blob,  | 
1699  |  |  * but whose left limit is the left most position of the blob ABOVE the  | 
1700  |  |  * baseline and whose right limit is the right most position of the blob BELOW  | 
1701  |  |  * the xheight.  | 
1702  |  |  *  | 
1703  |  |  *  | 
1704  |  |  * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on  | 
1705  |  |  *         "home".  Perhaps we need something which say if the width ABOVE the  | 
1706  |  |  *         xht alone includes the whole of the reduced width, then use the full  | 
1707  |  |  *         blob box - Might still fail on italic F  | 
1708  |  |  *  | 
1709  |  |  *         Alternatively we could be a little less severe and only reduce the  | 
1710  |  |  *         left and right edges by half the difference between the full box and  | 
1711  |  |  *         the reduced box.  | 
1712  |  |  *  | 
1713  |  |  * NOTE that we need to rotate all the coordinates as  | 
1714  |  |  * find_blob_limits finds the y min and max within a specified x band  | 
1715  |  |  *************************************************************************/  | 
1716  | 2.31M  | TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) { | 
1717  | 2.31M  |   float baseline;  | 
1718  | 2.31M  |   float blob_x_centre;  | 
1719  | 2.31M  |   float left_limit;  | 
1720  | 2.31M  |   float right_limit;  | 
1721  | 2.31M  |   float junk;  | 
1722  | 2.31M  |   TBOX blob_box;  | 
1723  |  |  | 
1724  |  |   /* Find baseline of centre of blob */  | 
1725  |  |  | 
1726  | 2.31M  |   blob_box = blob->bounding_box();  | 
1727  | 2.31M  |   blob_x_centre = (blob_box.left() + blob_box.right()) / 2.0;  | 
1728  | 2.31M  |   baseline = row->baseline.y(blob_x_centre);  | 
1729  |  |  | 
1730  |  |   /*  | 
1731  |  | Find LH limit of blob ABOVE the xht. This is so that we can detect certain  | 
1732  |  | caps ht chars which should NOT have their box reduced: T, Y, V, W etc  | 
1733  |  | */  | 
1734  | 2.31M  |   left_limit = static_cast<float>(INT32_MAX);  | 
1735  | 2.31M  |   junk = static_cast<float>(-INT32_MAX);  | 
1736  | 2.31M  |   find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), static_cast<float>(INT16_MAX),  | 
1737  | 2.31M  |                      left_limit, junk);  | 
1738  | 2.31M  |   if (left_limit > junk) { | 
1739  | 1.88M  |     *left_above_xht = INT16_MAX; // No area above xht  | 
1740  | 1.88M  |   } else { | 
1741  | 425k  |     *left_above_xht = static_cast<int16_t>(std::floor(left_limit));  | 
1742  | 425k  |   }  | 
1743  |  |   /*  | 
1744  |  | Find reduced LH limit of blob - the left extent of the region ABOVE the  | 
1745  |  | baseline.  | 
1746  |  | */  | 
1747  | 2.31M  |   left_limit = static_cast<float>(INT32_MAX);  | 
1748  | 2.31M  |   junk = static_cast<float>(-INT32_MAX);  | 
1749  | 2.31M  |   find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX), left_limit, junk);  | 
1750  |  |  | 
1751  | 2.31M  |   if (left_limit > junk) { | 
1752  | 199k  |     return TBOX(); // no area within xht so return empty box  | 
1753  | 199k  |   }  | 
1754  |  |   /*  | 
1755  |  | Find reduced RH limit of blob - the right extent of the region BELOW the xht.  | 
1756  |  | */  | 
1757  | 2.11M  |   junk = static_cast<float>(INT32_MAX);  | 
1758  | 2.11M  |   right_limit = static_cast<float>(-INT32_MAX);  | 
1759  | 2.11M  |   find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (baseline + row->xheight), junk,  | 
1760  | 2.11M  |                      right_limit);  | 
1761  | 2.11M  |   if (junk > right_limit) { | 
1762  | 258k  |     return TBOX(); // no area within xht so return empty box  | 
1763  | 258k  |   }  | 
1764  |  |  | 
1765  | 1.85M  |   return TBOX(ICOORD(static_cast<int16_t>(std::floor(left_limit)), blob_box.bottom()),  | 
1766  | 1.85M  |               ICOORD(static_cast<int16_t>(std::ceil(right_limit)), blob_box.top()));  | 
1767  | 2.11M  | }  | 
1768  |  | } // namespace tesseract  |