/src/tesseract/src/textord/tospace.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
2 | | // you may not use this file except in compliance with the License. |
3 | | // You may obtain a copy of the License at |
4 | | // http://www.apache.org/licenses/LICENSE-2.0 |
5 | | // Unless required by applicable law or agreed to in writing, software |
6 | | // distributed under the License is distributed on an "AS IS" BASIS, |
7 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
8 | | // See the License for the specific language governing permissions and |
9 | | // limitations under the License. |
10 | | /********************************************************************** |
11 | | * tospace.cpp |
12 | | * |
13 | | * Compute fuzzy word spacing thresholds for each row. |
14 | | * I.e. set : max_nonspace |
15 | | * space_threshold |
16 | | * min_space |
17 | | * kern_size |
18 | | * space_size |
19 | | * for each row. |
20 | | * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE |
21 | | * |
22 | | * Note: functions in this file were originally not members of any |
23 | | * class or enclosed by any namespace. Now they are all static members |
24 | | * of the Textord class. |
25 | | * |
26 | | **********************************************************************/ |
27 | | |
28 | | #include "drawtord.h" |
29 | | #include "statistc.h" |
30 | | #include "textord.h" |
31 | | #include "tovars.h" |
32 | | |
33 | | // Include automatically generated configuration file if running autoconf. |
34 | | #ifdef HAVE_CONFIG_H |
35 | | # include "config_auto.h" |
36 | | #endif |
37 | | |
38 | | #include <algorithm> |
39 | | #include <cmath> |
40 | | #include <memory> |
41 | | |
// Max expected spacing in pixels. The gap histograms in this file are built
// over the range [0, MAXSPACING - 1]. Kept under its original spelling so
// every existing use continues to compile, but as a typed constant rather
// than a preprocessor macro.
constexpr int MAXSPACING = 128;
43 | | |
44 | | namespace tesseract { |
45 | | void Textord::to_spacing(ICOORD page_tr, // topright of page |
46 | | TO_BLOCK_LIST *blocks // blocks on page |
47 | 15.4k | ) { |
48 | 15.4k | TO_BLOCK_IT block_it; // iterator |
49 | 15.4k | TO_BLOCK *block; // current block; |
50 | 15.4k | TO_ROW *row; // current row |
51 | 15.4k | int block_index; // block number |
52 | 15.4k | int row_index; // row number |
53 | | // estimated width of real spaces for whole block |
54 | 15.4k | int16_t block_space_gap_width; |
55 | | // estimated width of non space gaps for whole block |
56 | 15.4k | int16_t block_non_space_gap_width; |
57 | 15.4k | bool old_text_ord_proportional; // old fixed/prop result |
58 | | |
59 | 15.4k | block_it.set_to_list(blocks); |
60 | 15.4k | block_index = 1; |
61 | 30.9k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
62 | 15.4k | block = block_it.data(); |
63 | 15.4k | std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk |
64 | 15.4k | block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width, |
65 | 15.4k | block_non_space_gap_width); |
66 | | // Make sure relative values of block-level space and non-space gap |
67 | | // widths are reasonable. The ratio of 1:3 is also used in |
68 | | // block_spacing_stats, to correct the block_space_gap_width. |
69 | | // Useful for arabic and hindi, when the non-space gap width is |
70 | | // often over-estimated and should not be trusted. A similar ratio |
71 | | // is found in block_spacing_stats. |
72 | 15.4k | if (tosp_old_to_method && tosp_old_to_constrain_sp_kn && |
73 | 15.4k | block_non_space_gap_width > block_space_gap_width / 3) { |
74 | 0 | block_non_space_gap_width = block_space_gap_width / 3; |
75 | 0 | } |
76 | | // row iterator |
77 | 15.4k | TO_ROW_IT row_it(block->get_rows()); |
78 | 15.4k | row_index = 1; |
79 | 194k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
80 | 178k | row = row_it.data(); |
81 | 178k | if ((row->pitch_decision == PITCH_DEF_PROP) || (row->pitch_decision == PITCH_CORR_PROP)) { |
82 | 173k | if ((tosp_debug_level > 0) && !old_text_ord_proportional) { |
83 | 0 | tprintf("Block %d Row %d: Now Proportional\n", block_index, row_index); |
84 | 0 | } |
85 | 173k | row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width, |
86 | 173k | block_non_space_gap_width); |
87 | 173k | } else { |
88 | 5.71k | if ((tosp_debug_level > 0) && old_text_ord_proportional) { |
89 | 0 | tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index, |
90 | 0 | row_index, row->pitch_decision, row->fixed_pitch); |
91 | 0 | } |
92 | 5.71k | } |
93 | | #ifndef GRAPHICS_DISABLED |
94 | | if (textord_show_initial_words) { |
95 | | plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row); |
96 | | } |
97 | | #endif |
98 | 178k | row_index++; |
99 | 178k | } |
100 | 15.4k | block_index++; |
101 | 15.4k | } |
102 | 15.4k | } |
103 | | |
104 | | /************************************************************************* |
105 | | * block_spacing_stats() |
106 | | *************************************************************************/ |
107 | | |
108 | | void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional, |
109 | | int16_t &block_space_gap_width, // resulting estimate |
110 | | int16_t &block_non_space_gap_width // resulting estimate |
111 | 15.4k | ) { |
112 | 15.4k | TO_ROW *row; // current row |
113 | 15.4k | BLOBNBOX_IT blob_it; // iterator |
114 | | |
115 | 15.4k | STATS centre_to_centre_stats(0, MAXSPACING - 1); |
116 | | // DEBUG USE ONLY |
117 | 15.4k | STATS all_gap_stats(0, MAXSPACING - 1); |
118 | 15.4k | STATS space_gap_stats(0, MAXSPACING - 1); |
119 | 15.4k | int16_t minwidth = MAXSPACING; // narrowest blob |
120 | 15.4k | TBOX blob_box; |
121 | 15.4k | TBOX prev_blob_box; |
122 | 15.4k | int16_t centre_to_centre; |
123 | 15.4k | int16_t gap_width; |
124 | 15.4k | float real_space_threshold; |
125 | 15.4k | float iqr_centre_to_centre; // DEBUG USE ONLY |
126 | 15.4k | float iqr_all_gap_stats; // DEBUG USE ONLY |
127 | 15.4k | int32_t end_of_row; |
128 | 15.4k | int32_t row_length; |
129 | | |
130 | | // row iterator |
131 | 15.4k | TO_ROW_IT row_it(block->get_rows()); |
132 | 194k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
133 | 178k | row = row_it.data(); |
134 | 178k | if (!row->blob_list()->empty() && |
135 | 178k | (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) || |
136 | 178k | (row->pitch_decision == PITCH_CORR_PROP))) { |
137 | 173k | blob_it.set_to_list(row->blob_list()); |
138 | 173k | blob_it.mark_cycle_pt(); |
139 | 173k | end_of_row = blob_it.data_relative(-1)->bounding_box().right(); |
140 | 173k | if (tosp_use_pre_chopping) { |
141 | 0 | blob_box = box_next_pre_chopped(&blob_it); |
142 | 173k | } else if (tosp_stats_use_xht_gaps) { |
143 | 173k | blob_box = reduced_box_next(row, &blob_it); |
144 | 173k | } else { |
145 | 0 | blob_box = box_next(&blob_it); |
146 | 0 | } |
147 | 173k | row_length = end_of_row - blob_box.left(); |
148 | 173k | if (blob_box.width() < minwidth) { |
149 | 21.5k | minwidth = blob_box.width(); |
150 | 21.5k | } |
151 | 173k | prev_blob_box = blob_box; |
152 | 1.23M | while (!blob_it.cycled_list()) { |
153 | 1.06M | if (tosp_use_pre_chopping) { |
154 | 0 | blob_box = box_next_pre_chopped(&blob_it); |
155 | 1.06M | } else if (tosp_stats_use_xht_gaps) { |
156 | 1.06M | blob_box = reduced_box_next(row, &blob_it); |
157 | 1.06M | } else { |
158 | 0 | blob_box = box_next(&blob_it); |
159 | 0 | } |
160 | 1.06M | if (blob_box.width() < minwidth) { |
161 | 8.10k | minwidth = blob_box.width(); |
162 | 8.10k | } |
163 | 1.06M | int16_t left = prev_blob_box.right(); |
164 | 1.06M | int16_t right = blob_box.left(); |
165 | 1.06M | gap_width = right - left; |
166 | 1.06M | if (!ignore_big_gap(row, row_length, gapmap, left, right)) { |
167 | 1.05M | all_gap_stats.add(gap_width, 1); |
168 | | |
169 | 1.05M | centre_to_centre = (right + blob_box.right() - (prev_blob_box.left() + left)) / 2; |
170 | | // DEBUG |
171 | 1.05M | centre_to_centre_stats.add(centre_to_centre, 1); |
172 | | // DEBUG |
173 | 1.05M | } |
174 | 1.06M | prev_blob_box = blob_box; |
175 | 1.06M | } |
176 | 173k | } |
177 | 178k | } |
178 | | |
179 | | // Inadequate samples |
180 | 15.4k | if (all_gap_stats.get_total() <= 1) { |
181 | 6.42k | block_non_space_gap_width = minwidth; |
182 | 6.42k | block_space_gap_width = -1; // No est. space width |
183 | | // DEBUG |
184 | 6.42k | old_text_ord_proportional = true; |
185 | 9.04k | } else { |
186 | | /* For debug only ..... */ |
187 | 9.04k | iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25); |
188 | 9.04k | iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25); |
189 | 9.04k | old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats; |
190 | | /* .......For debug only */ |
191 | | |
192 | | /* |
193 | | The median of the gaps is used as an estimate of the NON-SPACE gap width. |
194 | | This RELIES on the assumption that there are more gaps WITHIN words than |
195 | | BETWEEN words in a block |
196 | | |
197 | | Now try to estimate the width of a real space for all real spaces in the |
198 | | block. Do this by using a crude threshold to ignore "narrow" gaps, then |
199 | | find the median of the "wide" gaps and use this. |
200 | | */ |
201 | 9.04k | block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median())); |
202 | | // median gap |
203 | | |
204 | 9.04k | row_it.set_to_list(block->get_rows()); |
205 | 143k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
206 | 134k | row = row_it.data(); |
207 | 134k | if (!row->blob_list()->empty() && |
208 | 134k | (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) || |
209 | 134k | (row->pitch_decision == PITCH_CORR_PROP))) { |
210 | 130k | real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width, |
211 | 130k | tosp_init_guess_xht_mult * row->xheight); |
212 | 130k | blob_it.set_to_list(row->blob_list()); |
213 | 130k | blob_it.mark_cycle_pt(); |
214 | 130k | end_of_row = blob_it.data_relative(-1)->bounding_box().right(); |
215 | 130k | if (tosp_use_pre_chopping) { |
216 | 0 | blob_box = box_next_pre_chopped(&blob_it); |
217 | 130k | } else if (tosp_stats_use_xht_gaps) { |
218 | 130k | blob_box = reduced_box_next(row, &blob_it); |
219 | 130k | } else { |
220 | 0 | blob_box = box_next(&blob_it); |
221 | 0 | } |
222 | 130k | row_length = blob_box.left() - end_of_row; |
223 | 130k | prev_blob_box = blob_box; |
224 | 1.19M | while (!blob_it.cycled_list()) { |
225 | 1.06M | if (tosp_use_pre_chopping) { |
226 | 0 | blob_box = box_next_pre_chopped(&blob_it); |
227 | 1.06M | } else if (tosp_stats_use_xht_gaps) { |
228 | 1.06M | blob_box = reduced_box_next(row, &blob_it); |
229 | 1.06M | } else { |
230 | 0 | blob_box = box_next(&blob_it); |
231 | 0 | } |
232 | 1.06M | int16_t left = prev_blob_box.right(); |
233 | 1.06M | int16_t right = blob_box.left(); |
234 | 1.06M | gap_width = right - left; |
235 | 1.06M | if ((gap_width > real_space_threshold) && |
236 | 1.06M | !ignore_big_gap(row, row_length, gapmap, left, right)) { |
237 | | /* |
238 | | If tosp_use_cert_spaces is enabled, the estimate of the space gap is |
239 | | restricted to obvious spaces - those wider than half the xht or |
240 | | those with wide blobs on both sides - i.e not things that are |
241 | | suspect 1's or punctuation that is sometimes widely spaced. |
242 | | */ |
243 | 110k | if (!tosp_block_use_cert_spaces || |
244 | 110k | (gap_width > tosp_fuzzy_space_factor2 * row->xheight) || |
245 | 110k | ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && |
246 | 77.2k | (!tosp_narrow_blobs_not_cert || |
247 | 20.0k | (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) || |
248 | 110k | (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { |
249 | 38.3k | space_gap_stats.add(gap_width, 1); |
250 | 38.3k | } |
251 | 110k | } |
252 | 1.06M | prev_blob_box = blob_box; |
253 | 1.06M | } |
254 | 130k | } |
255 | 134k | } |
256 | | // Inadequate samples |
257 | 9.04k | if (space_gap_stats.get_total() <= 2) { |
258 | 7.36k | block_space_gap_width = -1; // No est. space width |
259 | 7.36k | } else { |
260 | 1.68k | block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())), |
261 | 1.68k | static_cast<int16_t>(3 * block_non_space_gap_width)); |
262 | 1.68k | } |
263 | 9.04k | } |
264 | 15.4k | } |
265 | | |
266 | | /************************************************************************* |
267 | | * row_spacing_stats() |
268 | | * Set values for min_space, max_non_space based on row stats only |
269 | | * If failure - return 0 values. |
270 | | *************************************************************************/ |
271 | | void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx, |
272 | | int16_t block_space_gap_width, // estimate for block |
273 | | int16_t block_non_space_gap_width // estimate for block |
274 | 173k | ) { |
275 | | // iterator |
276 | 173k | BLOBNBOX_IT blob_it = row->blob_list(); |
277 | 173k | STATS all_gap_stats(0, MAXSPACING - 1); |
278 | 173k | STATS cert_space_gap_stats(0, MAXSPACING - 1); |
279 | 173k | STATS all_space_gap_stats(0, MAXSPACING - 1); |
280 | 173k | STATS small_gap_stats(0, MAXSPACING - 1); |
281 | 173k | TBOX blob_box; |
282 | 173k | TBOX prev_blob_box; |
283 | 173k | int16_t gap_width; |
284 | 173k | int16_t real_space_threshold = 0; |
285 | 173k | int16_t max = 0; |
286 | 173k | int16_t large_gap_count = 0; |
287 | 173k | bool suspected_table; |
288 | 173k | bool good_block_space_estimate = block_space_gap_width > 0; |
289 | 173k | int32_t end_of_row; |
290 | 173k | int32_t row_length = 0; |
291 | 173k | float sane_space; |
292 | 173k | int32_t sane_threshold; |
293 | | |
294 | | /* Collect first pass stats for row */ |
295 | | |
296 | 173k | if (!good_block_space_estimate) { |
297 | 138k | block_space_gap_width = int16_t(std::floor(row->xheight / 2)); |
298 | 138k | } |
299 | 173k | if (!row->blob_list()->empty()) { |
300 | 173k | if (tosp_threshold_bias1 > 0) { |
301 | 0 | real_space_threshold = |
302 | 0 | block_non_space_gap_width + |
303 | 0 | int16_t(floor(0.5 + tosp_threshold_bias1 * |
304 | 0 | (block_space_gap_width - block_non_space_gap_width))); |
305 | 173k | } else { |
306 | 173k | real_space_threshold = // Old TO method |
307 | 173k | (block_space_gap_width + block_non_space_gap_width) / 2; |
308 | 173k | } |
309 | 173k | blob_it.set_to_list(row->blob_list()); |
310 | 173k | blob_it.mark_cycle_pt(); |
311 | 173k | end_of_row = blob_it.data_relative(-1)->bounding_box().right(); |
312 | 173k | if (tosp_use_pre_chopping) { |
313 | 0 | blob_box = box_next_pre_chopped(&blob_it); |
314 | 173k | } else if (tosp_stats_use_xht_gaps) { |
315 | 173k | blob_box = reduced_box_next(row, &blob_it); |
316 | 173k | } else { |
317 | 0 | blob_box = box_next(&blob_it); |
318 | 0 | } |
319 | 173k | row_length = end_of_row - blob_box.left(); |
320 | 173k | prev_blob_box = blob_box; |
321 | 1.23M | while (!blob_it.cycled_list()) { |
322 | 1.06M | if (tosp_use_pre_chopping) { |
323 | 0 | blob_box = box_next_pre_chopped(&blob_it); |
324 | 1.06M | } else if (tosp_stats_use_xht_gaps) { |
325 | 1.06M | blob_box = reduced_box_next(row, &blob_it); |
326 | 1.06M | } else { |
327 | 0 | blob_box = box_next(&blob_it); |
328 | 0 | } |
329 | 1.06M | int16_t left = prev_blob_box.right(); |
330 | 1.06M | int16_t right = blob_box.left(); |
331 | 1.06M | gap_width = right - left; |
332 | 1.06M | if (ignore_big_gap(row, row_length, gapmap, left, right)) { |
333 | 10.3k | large_gap_count++; |
334 | 1.05M | } else { |
335 | 1.05M | if (gap_width >= real_space_threshold) { |
336 | 137k | if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) || |
337 | 137k | ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && |
338 | 96.0k | (!tosp_narrow_blobs_not_cert || |
339 | 22.5k | (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) || |
340 | 137k | (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { |
341 | 49.7k | cert_space_gap_stats.add(gap_width, 1); |
342 | 49.7k | } |
343 | 137k | all_space_gap_stats.add(gap_width, 1); |
344 | 918k | } else { |
345 | 918k | small_gap_stats.add(gap_width, 1); |
346 | 918k | } |
347 | 1.05M | all_gap_stats.add(gap_width, 1); |
348 | 1.05M | } |
349 | 1.06M | prev_blob_box = blob_box; |
350 | 1.06M | } |
351 | 173k | } |
352 | 173k | suspected_table = (large_gap_count > 1) || |
353 | 173k | ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples)); |
354 | | |
355 | | /* Now determine row kern size, space size and threshold */ |
356 | | |
357 | 173k | if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) || |
358 | 173k | ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) && |
359 | 167k | cert_space_gap_stats.get_total() > 0)) { |
360 | 18.1k | old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats, |
361 | 18.1k | block_space_gap_width, block_non_space_gap_width); |
362 | 155k | } else { |
363 | 155k | if (!tosp_recovery_isolated_row_stats || |
364 | 155k | !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) { |
365 | 146k | if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) { |
366 | 0 | tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx); |
367 | 0 | } |
368 | 146k | if (tosp_row_use_cert_spaces1 && good_block_space_estimate) { |
369 | | // Use block default |
370 | 17.1k | row->space_size = block_space_gap_width; |
371 | 17.1k | if (all_gap_stats.get_total() > tosp_redo_kern_limit) { |
372 | 6.27k | row->kern_size = all_gap_stats.median(); |
373 | 10.8k | } else { |
374 | 10.8k | row->kern_size = block_non_space_gap_width; |
375 | 10.8k | } |
376 | 17.1k | row->space_threshold = |
377 | 17.1k | int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor)); |
378 | 129k | } else { |
379 | 129k | old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats, |
380 | 129k | block_space_gap_width, block_non_space_gap_width); |
381 | 129k | } |
382 | 146k | } |
383 | 155k | } |
384 | | |
385 | 173k | if (tosp_improve_thresh && !suspected_table) { |
386 | 0 | improve_row_threshold(row, &all_gap_stats); |
387 | 0 | } |
388 | | |
389 | | /* Now lets try to be careful not to do anything silly with tables when we |
390 | | are ignoring big gaps*/ |
391 | 173k | if (tosp_sanity_method == 0) { |
392 | 0 | if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) { |
393 | 0 | if (tosp_debug_level > 5) { |
394 | 0 | tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx, |
395 | 0 | row->kern_size, row->space_threshold, row->space_size); |
396 | 0 | } |
397 | 0 | row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size); |
398 | 0 | row->space_size = std::max(row->space_threshold + 1.0f, row->xheight); |
399 | 0 | } |
400 | 173k | } else if (tosp_sanity_method == 1) { |
401 | 173k | sane_space = row->space_size; |
402 | | /* NEVER let space size get too close to kern size */ |
403 | 173k | if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) || |
404 | 173k | ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) { |
405 | 24.3k | if (good_block_space_estimate && |
406 | 24.3k | (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) { |
407 | 1.28k | sane_space = block_space_gap_width; |
408 | 23.0k | } else { |
409 | 23.0k | sane_space = |
410 | 23.0k | std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f), |
411 | 23.0k | row->xheight / 2.0f); |
412 | 23.0k | } |
413 | 24.3k | if (tosp_debug_level > 5) { |
414 | 0 | tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx, |
415 | 0 | row->kern_size, row->space_threshold, row->space_size, sane_space); |
416 | 0 | } |
417 | 24.3k | row->space_size = sane_space; |
418 | 24.3k | row->space_threshold = |
419 | 24.3k | int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor)); |
420 | 24.3k | } |
421 | | /* NEVER let threshold get VERY far away from kern */ |
422 | 173k | sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f))); |
423 | 173k | if (row->space_threshold > sane_threshold) { |
424 | 4.35k | if (tosp_debug_level > 5) { |
425 | 0 | tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx, |
426 | 0 | row->kern_size, row->space_threshold, row->space_size, sane_threshold); |
427 | 0 | } |
428 | 4.35k | row->space_threshold = sane_threshold; |
429 | 4.35k | if (row->space_size <= sane_threshold) { |
430 | 0 | row->space_size = row->space_threshold + 1.0f; |
431 | 0 | } |
432 | 4.35k | } |
433 | | /* Beware of tables - there may be NO spaces */ |
434 | 173k | if (suspected_table) { |
435 | 5.89k | sane_space = |
436 | 5.89k | std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight); |
437 | 5.89k | sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2)); |
438 | | |
439 | 5.89k | if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) { |
440 | 758 | if (tosp_debug_level > 5) { |
441 | 0 | tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx, |
442 | 0 | row->kern_size, row->space_threshold, row->space_size); |
443 | 0 | } |
444 | | // the minimum sane value |
445 | 758 | row->space_threshold = static_cast<int32_t>(sane_space); |
446 | 758 | row->space_size = std::max(row->space_threshold + 1.0f, row->xheight); |
447 | 758 | } |
448 | 5.89k | } |
449 | 173k | } |
450 | | |
451 | | /* Now lets try to put some error limits on the threshold */ |
452 | | |
453 | 173k | if (tosp_old_to_method) { |
454 | | /* Old textord made a space if gap >= threshold */ |
455 | | // NO FUZZY SPACES YET |
456 | 0 | row->max_nonspace = row->space_threshold; |
457 | | // NO FUZZY SPACES YET |
458 | 0 | row->min_space = row->space_threshold + 1; |
459 | 173k | } else { |
460 | | /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */ |
461 | 173k | row->min_space = |
462 | 173k | std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size)); |
463 | 173k | if (row->min_space <= row->space_threshold) { |
464 | | // Don't be silly |
465 | 30.5k | row->min_space = row->space_threshold + 1; |
466 | 30.5k | } |
467 | | /* |
468 | | Lets try to guess the max certain kern gap by looking at the cluster of |
469 | | kerns for the row. The row is proportional so the kerns should cluster |
470 | | tightly at the bottom of the distribution. We also expect most gaps to be |
471 | | kerns. Find the maximum of the kern piles between 0 and twice the kern |
472 | | estimate. Piles before the first one with less than 1/10 the maximum |
473 | | number of samples can be taken as certain kerns. |
474 | | |
475 | | Of course, there are some cases where the kern peak and space peaks merge, |
476 | | so we will put an UPPER limit on the max certain kern gap of some fraction |
477 | | below the threshold. |
478 | | */ |
479 | | |
480 | | // upper bound |
481 | 173k | int32_t max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2); |
482 | | |
483 | | // default |
484 | 173k | row->max_nonspace = max_max_nonspace; |
485 | 877k | for (int32_t index = 0; index <= max_max_nonspace; index++) { |
486 | 776k | if (all_gap_stats.pile_count(index) > max) { |
487 | 121k | max = all_gap_stats.pile_count(index); |
488 | 121k | } |
489 | 776k | if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) { |
490 | 71.8k | row->max_nonspace = index; |
491 | 71.8k | break; |
492 | 71.8k | } |
493 | 776k | } |
494 | 173k | } |
495 | | |
496 | | /* Yet another algorithm - simpler this time - just choose a fraction of the |
497 | | threshold to space range */ |
498 | | |
499 | 173k | if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) { |
500 | 173k | row->min_space = std::max( |
501 | 173k | row->min_space, static_cast<int32_t>(ceil(row->space_threshold + |
502 | 173k | tosp_fuzzy_sp_fraction * |
503 | 173k | (row->space_size - row->space_threshold)))); |
504 | 173k | } |
505 | | |
506 | | /* Ensure that ANY space less than some multiplier times the kern size is |
507 | | fuzzy. In tables there is a risk of erroneously setting a small space size |
508 | | when there are no real spaces. Sometimes tables have text squashed into |
509 | | columns so that the kn->sp ratio is small anyway - this means that we can't |
510 | | use this to force a wider separation - hence we rely on context to join any |
511 | | dubious breaks. */ |
512 | | |
513 | 173k | if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) { |
514 | 173k | row->min_space = std::max( |
515 | 173k | row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size))); |
516 | 173k | } |
517 | | |
518 | 173k | if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) { |
519 | 172k | row->max_nonspace = static_cast<int32_t>(floor( |
520 | 172k | 0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size))); |
521 | 172k | } |
522 | 173k | if (row->max_nonspace > row->space_threshold) { |
523 | | // Don't be silly |
524 | 0 | row->max_nonspace = row->space_threshold; |
525 | 0 | } |
526 | | |
527 | 173k | if (tosp_debug_level > 5) { |
528 | 0 | tprintf( |
529 | 0 | "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) " |
530 | 0 | "Sp:%3.2f\n", |
531 | 0 | block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width, |
532 | 0 | real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold, |
533 | 0 | row->min_space, row->space_size); |
534 | 0 | } |
535 | 173k | if (tosp_debug_level > 10) { |
536 | 0 | tprintf( |
537 | 0 | "row->kern_size = %3.2f, row->space_size = %3.2f, " |
538 | 0 | "row->space_threshold = %d\n", |
539 | 0 | row->kern_size, row->space_size, row->space_threshold); |
540 | 0 | } |
541 | 173k | } |
542 | | |
543 | | void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats, |
544 | | STATS *small_gap_stats, |
545 | | int16_t block_space_gap_width, // estimate for block |
546 | | int16_t block_non_space_gap_width // estimate for block |
547 | 147k | ) { |
548 | | /* First, estimate row space size */ |
549 | | /* Old to condition was > 2 */ |
550 | 147k | if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) { |
551 | | // Adequate samples |
552 | | /* Set space size to median of spaces BUT limits it if it seems wildly out |
553 | | */ |
554 | 9.83k | row->space_size = space_gap_stats->median(); |
555 | 9.83k | if (row->space_size > block_space_gap_width * 1.5) { |
556 | 1.08k | if (tosp_old_to_bug_fix) { |
557 | 0 | row->space_size = block_space_gap_width * 1.5; |
558 | 1.08k | } else { |
559 | | // BUG??? should be *1.5 |
560 | 1.08k | row->space_size = block_space_gap_width; |
561 | 1.08k | } |
562 | 1.08k | } |
563 | 9.83k | if (row->space_size < (block_non_space_gap_width * 2) + 1) { |
564 | 2.70k | row->space_size = (block_non_space_gap_width * 2) + 1; |
565 | 2.70k | } |
566 | 9.83k | } |
567 | | // Only 1 or 2 samples |
568 | 138k | else if (space_gap_stats->get_total() >= 1) { |
569 | | // hence mean not median |
570 | 28.9k | row->space_size = space_gap_stats->mean(); |
571 | 28.9k | if (row->space_size > block_space_gap_width * 1.5) { |
572 | 3.74k | if (tosp_old_to_bug_fix) { |
573 | 0 | row->space_size = block_space_gap_width * 1.5; |
574 | 3.74k | } else { |
575 | | // BUG??? should be *1.5 |
576 | 3.74k | row->space_size = block_space_gap_width; |
577 | 3.74k | } |
578 | 3.74k | } |
579 | 28.9k | if (row->space_size < (block_non_space_gap_width * 3) + 1) { |
580 | 13.2k | row->space_size = (block_non_space_gap_width * 3) + 1; |
581 | 13.2k | } |
582 | 109k | } else { |
583 | | // Use block default |
584 | 109k | row->space_size = block_space_gap_width; |
585 | 109k | } |
586 | | |
587 | | /* Next, estimate row kern size */ |
588 | 147k | if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) { |
589 | 0 | row->kern_size = small_gap_stats->median(); |
590 | 147k | } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) { |
591 | 15.7k | row->kern_size = all_gap_stats->median(); |
592 | 132k | } else { // old TO -SAME FOR ALL ROWS |
593 | 132k | row->kern_size = block_non_space_gap_width; |
594 | 132k | } |
595 | | |
596 | | /* Finally, estimate row space threshold */ |
597 | 147k | if (tosp_threshold_bias2 > 0) { |
598 | 0 | row->space_threshold = int32_t( |
599 | 0 | floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size))); |
600 | 147k | } else { |
601 | | /* |
602 | | NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold |
603 | | and holds this in a float. The use is with a >= test |
604 | | NEW textord uses an integer threshold and a > test |
605 | | It comes to the same thing. |
606 | | (Though there is a difference in that old textor has integer space_size |
607 | | and kern_size.) |
608 | | */ |
609 | 147k | row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2)); |
610 | 147k | } |
611 | | |
612 | | // Apply the same logic and ratios as in row_spacing_stats to |
613 | | // restrict relative values of the row's space_size, kern_size, and |
614 | | // space_threshold |
615 | 147k | if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 && |
616 | 147k | ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) || |
617 | 0 | ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) { |
618 | 0 | if (row->kern_size > 2.5) { |
619 | 0 | row->kern_size = row->space_size / tosp_min_sane_kn_sp; |
620 | 0 | } |
621 | 0 | row->space_threshold = |
622 | 0 | int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor)); |
623 | 0 | } |
624 | 147k | } |
625 | | |
626 | | /************************************************************************* |
627 | | * isolated_row_stats() |
628 | | * Set values for min_space, max_non_space based on row stats only |
629 | | *************************************************************************/ |
630 | | bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats, |
631 | 155k | bool suspected_table, int16_t block_idx, int16_t row_idx) { |
632 | 155k | float kern_estimate; |
633 | 155k | float crude_threshold_estimate; |
634 | 155k | int16_t small_gaps_count; |
635 | 155k | int16_t total; |
636 | | // iterator |
637 | 155k | BLOBNBOX_IT blob_it = row->blob_list(); |
638 | 155k | STATS cert_space_gap_stats(0, MAXSPACING - 1); |
639 | 155k | STATS all_space_gap_stats(0, MAXSPACING - 1); |
640 | 155k | STATS small_gap_stats(0, MAXSPACING - 1); |
641 | 155k | TBOX blob_box; |
642 | 155k | TBOX prev_blob_box; |
643 | 155k | int16_t gap_width; |
644 | 155k | int32_t end_of_row; |
645 | 155k | int32_t row_length; |
646 | | |
647 | 155k | kern_estimate = all_gap_stats->median(); |
648 | 155k | crude_threshold_estimate = |
649 | 155k | std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight); |
650 | 155k | small_gaps_count = |
651 | 155k | stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate))); |
652 | 155k | total = all_gap_stats->get_total(); |
653 | | |
654 | 155k | if ((total <= tosp_redo_kern_limit) || |
655 | 155k | ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) || |
656 | 155k | (total - small_gaps_count < 1)) { |
657 | 146k | if (tosp_debug_level > 5) { |
658 | 0 | tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx); |
659 | 0 | } |
660 | 146k | return false; |
661 | 146k | } |
662 | 8.13k | blob_it.set_to_list(row->blob_list()); |
663 | 8.13k | blob_it.mark_cycle_pt(); |
664 | 8.13k | end_of_row = blob_it.data_relative(-1)->bounding_box().right(); |
665 | 8.13k | if (tosp_use_pre_chopping) { |
666 | 0 | blob_box = box_next_pre_chopped(&blob_it); |
667 | 8.13k | } else if (tosp_stats_use_xht_gaps) { |
668 | 8.13k | blob_box = reduced_box_next(row, &blob_it); |
669 | 8.13k | } else { |
670 | 0 | blob_box = box_next(&blob_it); |
671 | 0 | } |
672 | 8.13k | row_length = end_of_row - blob_box.left(); |
673 | 8.13k | prev_blob_box = blob_box; |
674 | 245k | while (!blob_it.cycled_list()) { |
675 | 237k | if (tosp_use_pre_chopping) { |
676 | 0 | blob_box = box_next_pre_chopped(&blob_it); |
677 | 237k | } else if (tosp_stats_use_xht_gaps) { |
678 | 237k | blob_box = reduced_box_next(row, &blob_it); |
679 | 237k | } else { |
680 | 0 | blob_box = box_next(&blob_it); |
681 | 0 | } |
682 | 237k | int16_t left = prev_blob_box.right(); |
683 | 237k | int16_t right = blob_box.left(); |
684 | 237k | gap_width = right - left; |
685 | 237k | if (!ignore_big_gap(row, row_length, gapmap, left, right) && |
686 | 237k | (gap_width > crude_threshold_estimate)) { |
687 | 21.7k | if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) || |
688 | 21.7k | ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && |
689 | 19.6k | (!tosp_narrow_blobs_not_cert || |
690 | 6.56k | (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) || |
691 | 21.7k | (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) { |
692 | 2.72k | cert_space_gap_stats.add(gap_width, 1); |
693 | 2.72k | } |
694 | 21.7k | all_space_gap_stats.add(gap_width, 1); |
695 | 21.7k | } |
696 | 237k | if (gap_width < crude_threshold_estimate) { |
697 | 215k | small_gap_stats.add(gap_width, 1); |
698 | 215k | } |
699 | | |
700 | 237k | prev_blob_box = blob_box; |
701 | 237k | } |
702 | 8.13k | if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) { |
703 | | // median |
704 | 89 | row->space_size = cert_space_gap_stats.median(); |
705 | 8.04k | } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) { |
706 | | // to avoid spaced |
707 | 33 | row->space_size = cert_space_gap_stats.mean(); |
708 | | // 1's in tables |
709 | 8.01k | } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) { |
710 | | // median |
711 | 2.63k | row->space_size = all_space_gap_stats.median(); |
712 | 5.37k | } else { |
713 | 5.37k | row->space_size = all_space_gap_stats.mean(); |
714 | 5.37k | } |
715 | | |
716 | 8.13k | if (tosp_only_small_gaps_for_kern) { |
717 | 0 | row->kern_size = small_gap_stats.median(); |
718 | 8.13k | } else { |
719 | 8.13k | row->kern_size = all_gap_stats->median(); |
720 | 8.13k | } |
721 | 8.13k | row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2)); |
722 | | /* Sanity check */ |
723 | 8.13k | if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) || |
724 | 8.13k | (row->space_threshold <= 0)) { |
725 | 28 | if (tosp_debug_level > 5) { |
726 | 0 | tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx, |
727 | 0 | row->kern_size, row->space_threshold, row->space_size); |
728 | 0 | } |
729 | 28 | row->kern_size = 0.0f; |
730 | 28 | row->space_threshold = 0; |
731 | 28 | row->space_size = 0.0f; |
732 | 28 | return false; |
733 | 28 | } |
734 | | |
735 | 8.10k | if (tosp_debug_level > 5) { |
736 | 0 | tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size, |
737 | 0 | row->space_threshold, row->space_size); |
738 | 0 | } |
739 | 8.10k | return true; |
740 | 8.13k | } |
741 | | |
742 | 155k | int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) { |
743 | 155k | int16_t index; |
744 | 155k | int16_t total = 0; |
745 | | |
746 | 925k | for (index = 0; index < threshold; index++) { |
747 | 770k | total += stats->pile_count(index); |
748 | 770k | } |
749 | 155k | return total; |
750 | 155k | } |
751 | | |
752 | | /************************************************************************* |
753 | | * improve_row_threshold() |
754 | | * Try to recognise a "normal line" - |
755 | | * > 25 gaps |
756 | | * && space > 3 * kn && space > 10 |
757 | | * (I.e. reasonably large space and kn:sp ratio) |
758 | | * && > 3/4 # gaps < kn + (sp - kn)/3 |
759 | | * (I.e. most gaps are well away from space estimate) |
760 | | * && a gap of max(3, (sp - kn) / 3) empty histogram positions is found |
761 | | * somewhere in the histogram between kn and sp |
762 | | * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies |
763 | | * NO!!!!! the bristol line has "11" with a gap of 12 between the |
764 | | *1's!!! try moving the default threshold to within this band but leave the |
765 | | * fuzzy limit calculation as at present. |
766 | | *************************************************************************/ |
767 | 0 | void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) { |
768 | 0 | float sp = row->space_size; |
769 | 0 | float kn = row->kern_size; |
770 | 0 | int16_t reqd_zero_width = 0; |
771 | 0 | int16_t zero_width = 0; |
772 | 0 | int16_t zero_start = 0; |
773 | 0 | int16_t index = 0; |
774 | |
|
775 | 0 | if (tosp_debug_level > 10) { |
776 | 0 | tprintf("Improve row threshold 0"); |
777 | 0 | } |
778 | 0 | if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) || |
779 | 0 | (stats_count_under(all_gap_stats, static_cast<int16_t>(ceil(kn + (sp - kn) / 3 + 0.5))) < |
780 | 0 | (0.75 * all_gap_stats->get_total()))) { |
781 | 0 | return; |
782 | 0 | } |
783 | 0 | if (tosp_debug_level > 10) { |
784 | 0 | tprintf(" 1"); |
785 | 0 | } |
786 | | /* |
787 | | Look for the first region of all 0's in the histogram which is wider than |
788 | | max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current |
789 | | threshold is not within it, move the threshold so that is just inside it. |
790 | | */ |
791 | 0 | reqd_zero_width = static_cast<int16_t>(floor((sp - kn) / 3 + 0.5)); |
792 | 0 | if (reqd_zero_width < 3) { |
793 | 0 | reqd_zero_width = 3; |
794 | 0 | } |
795 | |
|
796 | 0 | for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) { |
797 | 0 | if (all_gap_stats->pile_count(index) == 0) { |
798 | 0 | if (zero_width == 0) { |
799 | 0 | zero_start = index; |
800 | 0 | } |
801 | 0 | zero_width++; |
802 | 0 | } else { |
803 | 0 | if (zero_width >= reqd_zero_width) { |
804 | 0 | break; |
805 | 0 | } else { |
806 | 0 | zero_width = 0; |
807 | 0 | } |
808 | 0 | } |
809 | 0 | } |
810 | 0 | index--; |
811 | 0 | if (tosp_debug_level > 10) { |
812 | 0 | tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", reqd_zero_width, |
813 | 0 | zero_width, zero_start, row->space_threshold); |
814 | 0 | } |
815 | 0 | if ((zero_width < reqd_zero_width) || |
816 | 0 | ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) { |
817 | 0 | return; |
818 | 0 | } |
819 | 0 | if (tosp_debug_level > 10) { |
820 | 0 | tprintf(" 2"); |
821 | 0 | } |
822 | 0 | if (row->space_threshold < zero_start) { |
823 | 0 | if (tosp_debug_level > 5) { |
824 | 0 | tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start, |
825 | 0 | index, row->space_threshold, zero_start); |
826 | 0 | } |
827 | 0 | row->space_threshold = zero_start; |
828 | 0 | } |
829 | 0 | if (row->space_threshold > index) { |
830 | 0 | if (tosp_debug_level > 5) { |
831 | 0 | tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start, |
832 | 0 | index, row->space_threshold, index); |
833 | 0 | } |
834 | 0 | row->space_threshold = index; |
835 | 0 | } |
836 | 0 | } |
837 | | |
838 | | /********************************************************************** |
839 | | * make_prop_words |
840 | | * |
841 | | * Convert a TO_ROW to a ROW. |
842 | | **********************************************************************/ |
843 | | ROW *Textord::make_prop_words(TO_ROW *row, // row to make |
844 | | FCOORD rotation // for drawing |
845 | 173k | ) { |
846 | 173k | bool bol; // start of line |
847 | | /* prev_ values are for start of word being built. non prev_ values are for |
848 | | the gap between the word being built and the next one. */ |
849 | 173k | bool prev_fuzzy_sp; // probably space |
850 | 173k | bool prev_fuzzy_non; // probably not |
851 | 173k | uint8_t prev_blanks; // in front of word |
852 | 173k | bool fuzzy_sp = false; // probably space |
853 | 173k | bool fuzzy_non = false; // probably not |
854 | 173k | uint8_t blanks = 0; // in front of word |
855 | 173k | bool prev_gap_was_a_space = false; |
856 | 173k | bool break_at_next_gap = false; |
857 | 173k | ROW *real_row; // output row |
858 | 173k | C_OUTLINE_IT cout_it; |
859 | 173k | C_BLOB_LIST cblobs; |
860 | 173k | C_BLOB_IT cblob_it = &cblobs; |
861 | 173k | WERD_LIST words; |
862 | 173k | WERD *word; // new word |
863 | 173k | int32_t next_rep_char_word_right = INT32_MAX; |
864 | 173k | float repetition_spacing; // gap between repetitions |
865 | 173k | int32_t xstarts[2]; // row ends |
866 | 173k | int32_t prev_x; // end of prev blob |
867 | 173k | BLOBNBOX_IT box_it; // iterator |
868 | 173k | TBOX prev_blob_box; |
869 | 173k | TBOX next_blob_box; |
870 | 173k | int16_t prev_gap = INT16_MAX; |
871 | 173k | int16_t current_gap = INT16_MAX; |
872 | 173k | int16_t next_gap = INT16_MAX; |
873 | 173k | int16_t prev_within_xht_gap = INT16_MAX; |
874 | 173k | int16_t current_within_xht_gap = INT16_MAX; |
875 | 173k | int16_t next_within_xht_gap = INT16_MAX; |
876 | 173k | int16_t word_count = 0; |
877 | | |
878 | | // repeated char words |
879 | 173k | WERD_IT rep_char_it(&(row->rep_words)); |
880 | 173k | if (!rep_char_it.empty()) { |
881 | 0 | next_rep_char_word_right = rep_char_it.data()->bounding_box().right(); |
882 | 0 | } |
883 | | |
884 | 173k | prev_x = -INT16_MAX; |
885 | 173k | cblob_it.set_to_list(&cblobs); |
886 | 173k | box_it.set_to_list(row->blob_list()); |
887 | | // new words |
888 | 173k | WERD_IT word_it(&words); |
889 | 173k | bol = true; |
890 | 173k | prev_blanks = 0; |
891 | 173k | prev_fuzzy_sp = false; |
892 | 173k | prev_fuzzy_non = false; |
893 | 173k | if (!box_it.empty()) { |
894 | 173k | xstarts[0] = box_it.data()->bounding_box().left(); |
895 | 173k | if (xstarts[0] > next_rep_char_word_right) { |
896 | | /* We need to insert a repeated char word at the start of the row */ |
897 | 0 | word = rep_char_it.extract(); |
898 | 0 | word_it.add_after_then_move(word); |
899 | | /* Set spaces before repeated char word */ |
900 | 0 | word->set_flag(W_BOL, true); |
901 | 0 | bol = false; |
902 | 0 | word->set_blanks(0); |
903 | | // NO uncertainty |
904 | 0 | word->set_flag(W_FUZZY_SP, false); |
905 | 0 | word->set_flag(W_FUZZY_NON, false); |
906 | 0 | xstarts[0] = word->bounding_box().left(); |
907 | | /* Set spaces after repeated char word (and leave current word set) */ |
908 | 0 | repetition_spacing = find_mean_blob_spacing(word); |
909 | 0 | current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right; |
910 | 0 | current_within_xht_gap = current_gap; |
911 | 0 | if (current_gap > tosp_rep_space * repetition_spacing) { |
912 | 0 | prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size)); |
913 | 0 | if (prev_blanks < 1) { |
914 | 0 | prev_blanks = 1; |
915 | 0 | } |
916 | 0 | } else { |
917 | 0 | prev_blanks = 0; |
918 | 0 | } |
919 | 0 | if (tosp_debug_level > 5) { |
920 | 0 | tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ", |
921 | 0 | box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(), |
922 | 0 | repetition_spacing, current_gap); |
923 | 0 | } |
924 | 0 | prev_fuzzy_sp = false; |
925 | 0 | prev_fuzzy_non = false; |
926 | 0 | if (rep_char_it.empty()) { |
927 | 0 | next_rep_char_word_right = INT32_MAX; |
928 | 0 | } else { |
929 | 0 | rep_char_it.forward(); |
930 | 0 | next_rep_char_word_right = rep_char_it.data()->bounding_box().right(); |
931 | 0 | } |
932 | 0 | } |
933 | | |
934 | 173k | peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap); |
935 | 2.54M | do { |
936 | 2.54M | auto bblob = box_it.data(); |
937 | 2.54M | auto blob_box = bblob->bounding_box(); |
938 | 2.54M | if (bblob->joined_to_prev()) { |
939 | 1.07M | auto cblob = bblob->remove_cblob(); |
940 | 1.07M | if (cblob != nullptr) { |
941 | 1.07M | cout_it.set_to_list(cblob_it.data()->out_list()); |
942 | 1.07M | cout_it.move_to_last(); |
943 | 1.07M | cout_it.add_list_after(cblob->out_list()); |
944 | 1.07M | delete cblob; |
945 | 1.07M | } |
946 | 1.46M | } else { |
947 | 1.46M | auto cblob = bblob->cblob(); |
948 | 1.46M | if (cblob != nullptr) { |
949 | 1.23M | bblob->set_owns_cblob(false); |
950 | 1.23M | cblob_it.add_after_then_move(cblob); |
951 | 1.23M | } |
952 | 1.46M | prev_x = blob_box.right(); |
953 | 1.46M | } |
954 | 2.54M | box_it.forward(); // next one |
955 | 2.54M | bblob = box_it.data(); |
956 | 2.54M | blob_box = bblob->bounding_box(); |
957 | | |
958 | 2.54M | if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) { |
959 | | /* Real Blob - not multiple outlines or pre-chopped */ |
960 | 1.23M | prev_gap = current_gap; |
961 | 1.23M | prev_within_xht_gap = current_within_xht_gap; |
962 | 1.23M | prev_blob_box = next_blob_box; |
963 | 1.23M | current_gap = next_gap; |
964 | 1.23M | current_within_xht_gap = next_within_xht_gap; |
965 | 1.23M | peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap); |
966 | | |
967 | 1.23M | int16_t prev_gap_arg = prev_gap; |
968 | 1.23M | int16_t next_gap_arg = next_gap; |
969 | 1.23M | if (tosp_only_use_xht_gaps) { |
970 | 0 | prev_gap_arg = prev_within_xht_gap; |
971 | 0 | next_gap_arg = next_within_xht_gap; |
972 | 0 | } |
973 | | // Decide if a word-break should be inserted |
974 | 1.23M | if (blob_box.left() > next_rep_char_word_right || |
975 | 1.23M | make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap, |
976 | 1.23M | current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp, |
977 | 1.23M | fuzzy_non, prev_gap_was_a_space, break_at_next_gap) || |
978 | 1.23M | box_it.at_first()) { |
979 | | /* Form a new word out of the blobs collected */ |
980 | 280k | word = new WERD(&cblobs, prev_blanks, nullptr); |
981 | 280k | word_count++; |
982 | 280k | word_it.add_after_then_move(word); |
983 | 280k | if (bol) { |
984 | 173k | word->set_flag(W_BOL, true); |
985 | 173k | bol = false; |
986 | 173k | } |
987 | 280k | if (prev_fuzzy_sp) { |
988 | | // probably space |
989 | 28.6k | word->set_flag(W_FUZZY_SP, true); |
990 | 252k | } else if (prev_fuzzy_non) { |
991 | 19.7k | word->set_flag(W_FUZZY_NON, true); |
992 | 19.7k | } |
993 | | // probably not |
994 | | |
995 | 280k | if (blob_box.left() > next_rep_char_word_right) { |
996 | | /* We need to insert a repeated char word */ |
997 | 0 | word = rep_char_it.extract(); |
998 | 0 | word_it.add_after_then_move(word); |
999 | | |
1000 | | /* Set spaces before repeated char word */ |
1001 | 0 | repetition_spacing = find_mean_blob_spacing(word); |
1002 | 0 | current_gap = word->bounding_box().left() - prev_x; |
1003 | 0 | current_within_xht_gap = current_gap; |
1004 | 0 | if (current_gap > tosp_rep_space * repetition_spacing) { |
1005 | 0 | blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size)); |
1006 | 0 | if (blanks < 1) { |
1007 | 0 | blanks = 1; |
1008 | 0 | } |
1009 | 0 | } else { |
1010 | 0 | blanks = 0; |
1011 | 0 | } |
1012 | 0 | if (tosp_debug_level > 5) { |
1013 | 0 | tprintf("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);", |
1014 | 0 | word->bounding_box().left(), word->bounding_box().bottom(), |
1015 | 0 | repetition_spacing, current_gap, blanks); |
1016 | 0 | } |
1017 | 0 | word->set_blanks(blanks); |
1018 | | // NO uncertainty |
1019 | 0 | word->set_flag(W_FUZZY_SP, false); |
1020 | 0 | word->set_flag(W_FUZZY_NON, false); |
1021 | | |
1022 | | /* Set spaces after repeated char word (and leave current word set) |
1023 | | */ |
1024 | 0 | current_gap = blob_box.left() - next_rep_char_word_right; |
1025 | 0 | if (current_gap > tosp_rep_space * repetition_spacing) { |
1026 | 0 | blanks = static_cast<uint8_t>(current_gap / row->space_size); |
1027 | 0 | if (blanks < 1) { |
1028 | 0 | blanks = 1; |
1029 | 0 | } |
1030 | 0 | } else { |
1031 | 0 | blanks = 0; |
1032 | 0 | } |
1033 | 0 | if (tosp_debug_level > 5) { |
1034 | 0 | tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks); |
1035 | 0 | } |
1036 | 0 | fuzzy_sp = false; |
1037 | 0 | fuzzy_non = false; |
1038 | |
|
1039 | 0 | if (rep_char_it.empty()) { |
1040 | 0 | next_rep_char_word_right = INT32_MAX; |
1041 | 0 | } else { |
1042 | 0 | rep_char_it.forward(); |
1043 | 0 | next_rep_char_word_right = rep_char_it.data()->bounding_box().right(); |
1044 | 0 | } |
1045 | 0 | } |
1046 | | |
1047 | 280k | if (box_it.at_first() && rep_char_it.empty()) { |
1048 | | // at end of line |
1049 | 173k | word->set_flag(W_EOL, true); |
1050 | 173k | xstarts[1] = prev_x; |
1051 | 173k | } else { |
1052 | 107k | prev_blanks = blanks; |
1053 | 107k | prev_fuzzy_sp = fuzzy_sp; |
1054 | 107k | prev_fuzzy_non = fuzzy_non; |
1055 | 107k | } |
1056 | 280k | } |
1057 | 1.23M | } |
1058 | 2.54M | } while (!box_it.at_first()); // until back at start |
1059 | | |
1060 | | /* Insert any further repeated char words */ |
1061 | 173k | while (!rep_char_it.empty()) { |
1062 | 0 | word = rep_char_it.extract(); |
1063 | 0 | word_it.add_after_then_move(word); |
1064 | | |
1065 | | /* Set spaces before repeated char word */ |
1066 | 0 | repetition_spacing = find_mean_blob_spacing(word); |
1067 | 0 | current_gap = word->bounding_box().left() - prev_x; |
1068 | 0 | if (current_gap > tosp_rep_space * repetition_spacing) { |
1069 | 0 | blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size)); |
1070 | 0 | if (blanks < 1) { |
1071 | 0 | blanks = 1; |
1072 | 0 | } |
1073 | 0 | } else { |
1074 | 0 | blanks = 0; |
1075 | 0 | } |
1076 | 0 | if (tosp_debug_level > 5) { |
1077 | 0 | tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n", |
1078 | 0 | word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing, |
1079 | 0 | current_gap, blanks); |
1080 | 0 | } |
1081 | 0 | word->set_blanks(blanks); |
1082 | | // NO uncertainty |
1083 | 0 | word->set_flag(W_FUZZY_SP, false); |
1084 | 0 | word->set_flag(W_FUZZY_NON, false); |
1085 | 0 | prev_x = word->bounding_box().right(); |
1086 | 0 | if (rep_char_it.empty()) { |
1087 | | // at end of line |
1088 | 0 | word->set_flag(W_EOL, true); |
1089 | 0 | xstarts[1] = prev_x; |
1090 | 0 | } else { |
1091 | 0 | rep_char_it.forward(); |
1092 | 0 | } |
1093 | 0 | } |
1094 | 173k | real_row = |
1095 | 173k | new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size)); |
1096 | 173k | word_it.set_to_list(real_row->word_list()); |
1097 | | // put words in row |
1098 | 173k | word_it.add_list_after(&words); |
1099 | 173k | real_row->recalc_bounding_box(); |
1100 | | |
1101 | 173k | if (tosp_debug_level > 4) { |
1102 | 0 | tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count, |
1103 | 0 | real_row->bounding_box().left(), real_row->bounding_box().bottom(), |
1104 | 0 | real_row->bounding_box().right(), real_row->bounding_box().top()); |
1105 | 0 | } |
1106 | 173k | return real_row; |
1107 | 173k | } |
1108 | 0 | return nullptr; |
1109 | 173k | } |
1110 | | |
1111 | | /********************************************************************** |
1112 | | * make_blob_words |
1113 | | * |
1114 | | * Converts words into blobs so that each blob is a single character. |
1115 | | * Used for chopper test. |
1116 | | **********************************************************************/ |
1117 | | ROW *Textord::make_blob_words(TO_ROW *row, // row to make |
1118 | | FCOORD rotation // for drawing |
1119 | 0 | ) { |
1120 | 0 | bool bol; // start of line |
1121 | 0 | ROW *real_row; // output row |
1122 | 0 | C_OUTLINE_IT cout_it; |
1123 | 0 | C_BLOB_LIST cblobs; |
1124 | 0 | C_BLOB_IT cblob_it = &cblobs; |
1125 | 0 | WERD_LIST words; |
1126 | 0 | WERD *word; // new word |
1127 | 0 | BLOBNBOX_IT box_it; // iterator |
1128 | 0 | int16_t word_count = 0; |
1129 | |
|
1130 | 0 | cblob_it.set_to_list(&cblobs); |
1131 | 0 | box_it.set_to_list(row->blob_list()); |
1132 | | // new words |
1133 | 0 | WERD_IT word_it(&words); |
1134 | 0 | bol = true; |
1135 | 0 | if (!box_it.empty()) { |
1136 | 0 | do { |
1137 | 0 | auto bblob = box_it.data(); |
1138 | 0 | auto blob_box = bblob->bounding_box(); |
1139 | 0 | if (bblob->joined_to_prev()) { |
1140 | 0 | auto cblob = bblob->remove_cblob(); |
1141 | 0 | if (cblob != nullptr) { |
1142 | 0 | cout_it.set_to_list(cblob_it.data()->out_list()); |
1143 | 0 | cout_it.move_to_last(); |
1144 | 0 | cout_it.add_list_after(cblob->out_list()); |
1145 | 0 | delete cblob; |
1146 | 0 | } |
1147 | 0 | } else { |
1148 | 0 | auto cblob = bblob->cblob(); |
1149 | 0 | if (cblob != nullptr) { |
1150 | 0 | bblob->set_owns_cblob(false); |
1151 | 0 | cblob_it.add_after_then_move(cblob); |
1152 | 0 | } |
1153 | 0 | } |
1154 | 0 | box_it.forward(); // next one |
1155 | 0 | bblob = box_it.data(); |
1156 | 0 | blob_box = bblob->bounding_box(); |
1157 | |
|
1158 | 0 | if (!bblob->joined_to_prev() && !cblobs.empty()) { |
1159 | 0 | word = new WERD(&cblobs, 1, nullptr); |
1160 | 0 | word_count++; |
1161 | 0 | word_it.add_after_then_move(word); |
1162 | 0 | if (bol) { |
1163 | 0 | word->set_flag(W_BOL, true); |
1164 | 0 | bol = false; |
1165 | 0 | } |
1166 | 0 | if (box_it.at_first()) { // at end of line |
1167 | 0 | word->set_flag(W_EOL, true); |
1168 | 0 | } |
1169 | 0 | } |
1170 | 0 | } while (!box_it.at_first()); // until back at start |
1171 | | /* Setup the row with created words. */ |
1172 | 0 | real_row = |
1173 | 0 | new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size)); |
1174 | 0 | word_it.set_to_list(real_row->word_list()); |
1175 | | // put words in row |
1176 | 0 | word_it.add_list_after(&words); |
1177 | 0 | real_row->recalc_bounding_box(); |
1178 | 0 | if (tosp_debug_level > 4) { |
1179 | 0 | tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count, |
1180 | 0 | real_row->bounding_box().left(), real_row->bounding_box().bottom(), |
1181 | 0 | real_row->bounding_box().right(), real_row->bounding_box().top()); |
1182 | 0 | } |
1183 | 0 | return real_row; |
1184 | 0 | } |
1185 | 0 | return nullptr; |
1186 | 0 | } |
1187 | | |
1188 | | bool Textord::make_a_word_break(TO_ROW *row, // row being made |
1189 | | TBOX blob_box, // for next_blob // how many blanks? |
1190 | | int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap, |
1191 | | int16_t within_xht_current_gap, TBOX next_blob_box, |
1192 | | int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non, |
1193 | 1.23M | bool &prev_gap_was_a_space, bool &break_at_next_gap) { |
1194 | 1.23M | bool space; |
1195 | 1.23M | int16_t current_gap; |
1196 | 1.23M | float fuzzy_sp_to_kn_limit; |
1197 | | |
1198 | 1.23M | if (break_at_next_gap) { |
1199 | 0 | break_at_next_gap = false; |
1200 | 0 | return true; |
1201 | 0 | } |
1202 | | /* Inhibit using the reduced gap if |
1203 | | The kerning is large - chars are not kerned and reducing "f"s can cause |
1204 | | erroneous blanks |
1205 | | OR The real gap is less than 0 |
1206 | | OR The real gap is less than the kerning estimate |
1207 | | */ |
1208 | 1.23M | if ((row->kern_size > tosp_large_kerning * row->xheight) || |
1209 | 1.23M | ((tosp_dont_fool_with_small_kerns >= 0) && |
1210 | 972k | (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) { |
1211 | | // Ignore the difference |
1212 | 266k | within_xht_current_gap = real_current_gap; |
1213 | 266k | } |
1214 | | |
1215 | 1.23M | if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) { |
1216 | 0 | current_gap = within_xht_current_gap; |
1217 | 1.23M | } else { |
1218 | 1.23M | current_gap = real_current_gap; |
1219 | 1.23M | } |
1220 | | |
1221 | 1.23M | if (tosp_old_to_method) { |
1222 | | // Boring old method |
1223 | 0 | space = current_gap > row->max_nonspace; |
1224 | 0 | if (space && (current_gap < INT16_MAX)) { |
1225 | 0 | if (current_gap < row->min_space) { |
1226 | 0 | if (current_gap > row->space_threshold) { |
1227 | 0 | blanks = 1; |
1228 | 0 | fuzzy_sp = true; |
1229 | 0 | fuzzy_non = false; |
1230 | 0 | } else { |
1231 | 0 | blanks = 0; |
1232 | 0 | fuzzy_sp = false; |
1233 | 0 | fuzzy_non = true; |
1234 | 0 | } |
1235 | 0 | } else { |
1236 | 0 | if (row->space_size == 0.0f) { |
1237 | | // Avoid FP division by 0. |
1238 | 0 | blanks = 1; |
1239 | 0 | } else { |
1240 | 0 | blanks = static_cast<uint8_t>(current_gap / row->space_size); |
1241 | 0 | if (blanks < 1) { |
1242 | 0 | blanks = 1; |
1243 | 0 | } |
1244 | 0 | } |
1245 | 0 | fuzzy_sp = false; |
1246 | 0 | fuzzy_non = false; |
1247 | 0 | } |
1248 | 0 | } |
1249 | 0 | return space; |
1250 | 1.23M | } else { |
1251 | | /* New exciting heuristic method */ |
1252 | 1.23M | if (prev_blob_box.null_box()) { // Beginning of row |
1253 | 94 | prev_gap_was_a_space = true; |
1254 | 94 | } |
1255 | | |
1256 | | // Default as old TO |
1257 | 1.23M | space = current_gap > row->space_threshold; |
1258 | | |
1259 | | /* Set defaults for the word break in case we find one. Currently there are |
1260 | | no fuzzy spaces. Depending on the reliability of the different heuristics |
1261 | | we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY |
1262 | | be used if the function returns true - ie the word is to be broken. |
1263 | | */ |
1264 | 1.23M | int num_blanks = current_gap; |
1265 | 1.23M | if (row->space_size > 1.0f) { |
1266 | 1.23M | num_blanks = IntCastRounded(current_gap / row->space_size); |
1267 | 1.23M | } |
1268 | 1.23M | blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX)); |
1269 | 1.23M | fuzzy_sp = false; |
1270 | 1.23M | fuzzy_non = false; |
1271 | | /* |
1272 | | If xht measure causes gap to flip one of the 3 thresholds act accordingly - |
1273 | | despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to |
1274 | | context. |
1275 | | */ |
1276 | 1.23M | if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) && |
1277 | 1.23M | (within_xht_current_gap > row->max_nonspace)) { |
1278 | 2.01k | space = true; |
1279 | 2.01k | fuzzy_non = true; |
1280 | | #ifndef GRAPHICS_DISABLED |
1281 | | mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1282 | | next_gap); |
1283 | | #endif |
1284 | 1.23M | } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) && |
1285 | 1.23M | (within_xht_current_gap > row->space_threshold)) { |
1286 | 262 | space = true; |
1287 | 262 | if (tosp_flip_fuzz_kn_to_sp) { |
1288 | 262 | fuzzy_sp = true; |
1289 | 262 | } else { |
1290 | 0 | fuzzy_non = true; |
1291 | 0 | } |
1292 | | #ifndef GRAPHICS_DISABLED |
1293 | | mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1294 | | next_gap); |
1295 | | #endif |
1296 | 1.23M | } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) && |
1297 | 1.23M | (within_xht_current_gap >= row->min_space)) { |
1298 | 268 | space = true; |
1299 | | #ifndef GRAPHICS_DISABLED |
1300 | | mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1301 | | next_gap); |
1302 | | #endif |
1303 | 1.23M | } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) && |
1304 | 1.23M | suspected_punct_blob(row, blob_box)) { |
1305 | 0 | break_at_next_gap = true; |
1306 | 0 | } |
1307 | | /* Now continue with normal heuristics */ |
1308 | 1.23M | else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) { |
1309 | | /* Heuristics to turn dubious spaces to kerns */ |
1310 | 43.0k | if (tosp_pass_wide_fuzz_sp_to_context > 0) { |
1311 | 43.0k | fuzzy_sp_to_kn_limit = |
1312 | 43.0k | row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size); |
1313 | 43.0k | } else { |
1314 | 0 | fuzzy_sp_to_kn_limit = 99999.0f; |
1315 | 0 | } |
1316 | | |
1317 | | /* If current gap is significantly smaller than the previous space the |
1318 | | other side of a narrow blob then this gap is a kern. */ |
1319 | 43.0k | if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space && |
1320 | 43.0k | (current_gap <= tosp_gap_factor * prev_gap)) { |
1321 | 1.52k | if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { |
1322 | 351 | if (tosp_flip_fuzz_sp_to_kn) { |
1323 | 351 | fuzzy_non = true; |
1324 | 351 | } else { |
1325 | 0 | fuzzy_sp = true; |
1326 | 0 | } |
1327 | 1.17k | } else { |
1328 | 1.17k | space = false; |
1329 | 1.17k | } |
1330 | | #ifndef GRAPHICS_DISABLED |
1331 | | mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1332 | | next_gap); |
1333 | | #endif |
1334 | 1.52k | } |
1335 | | /* If current gap not much bigger than the previous kern the other side of |
1336 | | a narrow blob then this gap is a kern as well */ |
1337 | 41.5k | else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && |
1338 | 41.5k | !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) { |
1339 | 4.75k | if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { |
1340 | 1.04k | if (tosp_flip_fuzz_sp_to_kn) { |
1341 | 1.04k | fuzzy_non = true; |
1342 | 1.04k | } else { |
1343 | 0 | fuzzy_sp = true; |
1344 | 0 | } |
1345 | 3.71k | } else { |
1346 | 3.71k | space = false; |
1347 | 3.71k | } |
1348 | | #ifndef GRAPHICS_DISABLED |
1349 | | mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1350 | | next_gap); |
1351 | | #endif |
1352 | 36.7k | } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) && |
1353 | 36.7k | (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) { |
1354 | 4.29k | if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { |
1355 | 1.10k | if (tosp_flip_fuzz_sp_to_kn) { |
1356 | 1.10k | fuzzy_non = true; |
1357 | 1.10k | } else { |
1358 | 0 | fuzzy_sp = true; |
1359 | 0 | } |
1360 | 3.19k | } else { |
1361 | 3.19k | space = false; |
1362 | 3.19k | } |
1363 | | #ifndef GRAPHICS_DISABLED |
1364 | | mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1365 | | next_gap); |
1366 | | #endif |
1367 | 32.4k | } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) && |
1368 | 32.4k | (next_gap <= row->space_threshold) && |
1369 | 32.4k | (current_gap * tosp_gap_factor <= next_gap)) { |
1370 | 181 | if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) { |
1371 | 31 | if (tosp_flip_fuzz_sp_to_kn) { |
1372 | 31 | fuzzy_non = true; |
1373 | 31 | } else { |
1374 | 0 | fuzzy_sp = true; |
1375 | 0 | } |
1376 | 150 | } else { |
1377 | 150 | space = false; |
1378 | 150 | } |
1379 | | #ifndef GRAPHICS_DISABLED |
1380 | | mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1381 | | next_gap); |
1382 | | #endif |
1383 | 32.2k | } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) || |
1384 | 32.2k | ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) { |
1385 | 27.8k | fuzzy_sp = true; |
1386 | | #ifndef GRAPHICS_DISABLED |
1387 | | mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1388 | | next_gap); |
1389 | | #endif |
1390 | 27.8k | } |
1391 | 1.19M | } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) { |
1392 | | /* Heuristics to turn dubious kerns to spaces */ |
1393 | | /* TRIED THIS BUT IT MADE THINGS WORSE |
1394 | | if (prev_gap == INT16_MAX) |
1395 | | prev_gap = 0; // start of row |
1396 | | if (next_gap == INT16_MAX) |
1397 | | next_gap = 0; // end of row |
1398 | | */ |
1399 | 35.7k | if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) && |
1400 | 35.7k | (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) && |
1401 | 35.7k | wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) { |
1402 | 591 | space = true; |
1403 | | /* |
1404 | | tosp_flip_caution is an attempt to stop the default changing in cases |
1405 | | where there is a large difference between the kern and space estimates. |
1406 | | See problem in 'chiefs' where "have" gets split in the quotation. |
1407 | | */ |
1408 | 591 | if ((tosp_flip_fuzz_kn_to_sp) && |
1409 | 591 | ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) { |
1410 | 591 | fuzzy_sp = true; |
1411 | 591 | } else { |
1412 | 0 | fuzzy_non = true; |
1413 | 0 | } |
1414 | | #ifndef GRAPHICS_DISABLED |
1415 | | mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1416 | | next_gap); |
1417 | | #endif |
1418 | 35.1k | } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 && |
1419 | 35.1k | current_gap > 5 && // Rule 9 handles small gap, big ratio. |
1420 | 35.1k | current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) && |
1421 | 35.1k | !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) && |
1422 | 35.1k | !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) { |
1423 | 216 | space = true; |
1424 | 216 | fuzzy_non = true; |
1425 | | #ifndef GRAPHICS_DISABLED |
1426 | | mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1427 | | next_gap); |
1428 | | #endif |
1429 | 34.9k | } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) && |
1430 | 34.9k | (next_blob_box.width() > 0) && |
1431 | 34.9k | (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) && |
1432 | 34.9k | (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) && |
1433 | 14.9k | !suspected_punct_blob(row, next_blob_box)))) { |
1434 | 14.9k | space = true; |
1435 | 14.9k | fuzzy_non = true; |
1436 | | #ifndef GRAPHICS_DISABLED |
1437 | | mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(), |
1438 | | next_gap); |
1439 | | #endif |
1440 | 14.9k | } |
1441 | 35.7k | } |
1442 | 1.23M | if (tosp_debug_level > 10) { |
1443 | 0 | tprintf( |
1444 | 0 | "word break = %d current_gap = %d, prev_gap = %d, " |
1445 | 0 | "next_gap = %d\n", |
1446 | 0 | space ? 1 : 0, current_gap, prev_gap, next_gap); |
1447 | 0 | } |
1448 | 1.23M | prev_gap_was_a_space = space && !(fuzzy_non); |
1449 | 1.23M | return space; |
1450 | 1.23M | } |
1451 | 1.23M | } |
1452 | | |
1453 | 262k | bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) { |
1454 | 262k | bool result; |
1455 | 262k | result = |
1456 | 262k | ((blob_box.width() <= tosp_narrow_fraction * row->xheight) || |
1457 | 262k | ((static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio)); |
1458 | 262k | return result; |
1459 | 262k | } |
1460 | | |
1461 | 227k | bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) { |
1462 | 227k | bool result; |
1463 | 227k | if (tosp_wide_fraction > 0) { |
1464 | 227k | if (tosp_wide_aspect_ratio > 0) { |
1465 | 0 | result = |
1466 | 0 | ((blob_box.width() >= tosp_wide_fraction * row->xheight) && |
1467 | 0 | ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio)); |
1468 | 227k | } else { |
1469 | 227k | result = (blob_box.width() >= tosp_wide_fraction * row->xheight); |
1470 | 227k | } |
1471 | 227k | } else { |
1472 | 0 | result = !narrow_blob(row, blob_box); |
1473 | 0 | } |
1474 | 227k | return result; |
1475 | 227k | } |
1476 | | |
1477 | 2.13k | bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) { |
1478 | 2.13k | bool result; |
1479 | 2.13k | float baseline; |
1480 | 2.13k | float blob_x_centre; |
1481 | | /* Find baseline of centre of blob */ |
1482 | 2.13k | blob_x_centre = (box.right() + box.left()) / 2.0; |
1483 | 2.13k | baseline = row->baseline.y(blob_x_centre); |
1484 | | |
1485 | 2.13k | result = (box.height() <= 0.66 * row->xheight) || (box.top() < baseline + row->xheight / 2.0) || |
1486 | 2.13k | (box.bottom() > baseline + row->xheight / 2.0); |
1487 | 2.13k | return result; |
1488 | 2.13k | } |
1489 | | |
1490 | | void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box, |
1491 | 1.41M | int16_t &next_gap, int16_t &next_within_xht_gap) { |
1492 | 1.41M | TBOX next_reduced_blob_box; |
1493 | 1.41M | TBOX bit_beyond; |
1494 | 1.41M | BLOBNBOX_IT reduced_box_it = box_it; |
1495 | | |
1496 | 1.41M | next_blob_box = box_next(&box_it); |
1497 | 1.41M | next_reduced_blob_box = reduced_box_next(row, &reduced_box_it); |
1498 | 1.41M | if (box_it.at_first()) { |
1499 | 234k | next_gap = INT16_MAX; |
1500 | 234k | next_within_xht_gap = INT16_MAX; |
1501 | 1.17M | } else { |
1502 | 1.17M | bit_beyond = box_it.data()->bounding_box(); |
1503 | 1.17M | next_gap = bit_beyond.left() - next_blob_box.right(); |
1504 | 1.17M | bit_beyond = reduced_box_next(row, &reduced_box_it); |
1505 | 1.17M | next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right(); |
1506 | 1.17M | } |
1507 | 1.41M | } |
1508 | | |
1509 | | #ifndef GRAPHICS_DISABLED |
1510 | | void Textord::mark_gap(TBOX blob, // blob following gap |
1511 | | int16_t rule, // heuristic id |
1512 | | int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap, |
1513 | | int16_t next_blob_width, int16_t next_gap) { |
1514 | | ScrollView::Color col; // of ellipse marking flipped gap |
1515 | | |
1516 | | switch (rule) { |
1517 | | case 1: |
1518 | | col = ScrollView::RED; |
1519 | | break; |
1520 | | case 2: |
1521 | | col = ScrollView::CYAN; |
1522 | | break; |
1523 | | case 3: |
1524 | | col = ScrollView::GREEN; |
1525 | | break; |
1526 | | case 4: |
1527 | | col = ScrollView::BLACK; |
1528 | | break; |
1529 | | case 5: |
1530 | | col = ScrollView::MAGENTA; |
1531 | | break; |
1532 | | case 6: |
1533 | | col = ScrollView::BLUE; |
1534 | | break; |
1535 | | |
1536 | | case 7: |
1537 | | col = ScrollView::WHITE; |
1538 | | break; |
1539 | | case 8: |
1540 | | col = ScrollView::YELLOW; |
1541 | | break; |
1542 | | case 9: |
1543 | | col = ScrollView::BLACK; |
1544 | | break; |
1545 | | |
1546 | | case 20: |
1547 | | col = ScrollView::CYAN; |
1548 | | break; |
1549 | | case 21: |
1550 | | col = ScrollView::GREEN; |
1551 | | break; |
1552 | | case 22: |
1553 | | col = ScrollView::MAGENTA; |
1554 | | break; |
1555 | | default: |
1556 | | col = ScrollView::BLACK; |
1557 | | } |
1558 | | if (textord_show_initial_words) { |
1559 | | to_win->Pen(col); |
1560 | | /* if (rule < 20) |
1561 | | //interior_style(to_win, INT_SOLID, false); |
1562 | | else |
1563 | | //interior_style(to_win, INT_HOLLOW, true);*/ |
1564 | | // x radius |
1565 | | to_win->Ellipse(current_gap / 2.0f, |
1566 | | blob.height() / 2.0f, // y radius |
1567 | | // x centre |
1568 | | blob.left() - current_gap / 2.0f, |
1569 | | // y centre |
1570 | | blob.bottom() + blob.height() / 2.0f); |
1571 | | } |
1572 | | if (tosp_debug_level > 5) { |
1573 | | tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2, |
1574 | | blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap); |
1575 | | } |
1576 | | } |
1577 | | #endif |
1578 | | |
1579 | 0 | float Textord::find_mean_blob_spacing(WERD *word) { |
1580 | 0 | C_BLOB_IT cblob_it; |
1581 | 0 | TBOX blob_box; |
1582 | 0 | int32_t gap_sum = 0; |
1583 | 0 | int16_t gap_count = 0; |
1584 | 0 | int16_t prev_right; |
1585 | |
|
1586 | 0 | cblob_it.set_to_list(word->cblob_list()); |
1587 | 0 | if (!cblob_it.empty()) { |
1588 | 0 | cblob_it.mark_cycle_pt(); |
1589 | 0 | prev_right = cblob_it.data()->bounding_box().right(); |
1590 | | // first blob |
1591 | 0 | cblob_it.forward(); |
1592 | 0 | for (; !cblob_it.cycled_list(); cblob_it.forward()) { |
1593 | 0 | blob_box = cblob_it.data()->bounding_box(); |
1594 | 0 | gap_sum += blob_box.left() - prev_right; |
1595 | 0 | gap_count++; |
1596 | 0 | prev_right = blob_box.right(); |
1597 | 0 | } |
1598 | 0 | } |
1599 | 0 | if (gap_count > 0) { |
1600 | 0 | return (gap_sum / static_cast<float>(gap_count)); |
1601 | 0 | } else { |
1602 | 0 | return 0.0f; |
1603 | 0 | } |
1604 | 0 | } |
1605 | | |
1606 | | bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left, |
1607 | 2.48M | int16_t right) { |
1608 | 2.48M | int16_t gap = right - left + 1; |
1609 | | |
1610 | 2.48M | if (tosp_ignore_big_gaps > 999) { |
1611 | 0 | return false; // Don't ignore |
1612 | 0 | } |
1613 | 2.48M | if (tosp_ignore_big_gaps > 0) { |
1614 | 0 | return (gap > tosp_ignore_big_gaps * row->xheight); |
1615 | 0 | } |
1616 | 2.48M | if (gap > tosp_ignore_very_big_gaps * row->xheight) { |
1617 | 18.7k | return true; |
1618 | 18.7k | } |
1619 | 2.47M | if (tosp_ignore_big_gaps == 0) { |
1620 | 0 | if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) { |
1621 | 0 | return true; |
1622 | 0 | } |
1623 | 0 | if ((gap > 1.75 * row->xheight) && |
1624 | 0 | ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) { |
1625 | 0 | return true; |
1626 | 0 | } |
1627 | 2.47M | } else { |
1628 | | /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table |
1629 | | */ |
1630 | 2.47M | if ((gap > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) { |
1631 | 11.8k | return true; |
1632 | 11.8k | } |
1633 | 2.47M | } |
1634 | 2.45M | return false; |
1635 | 2.47M | } |
1636 | | |
1637 | | /********************************************************************** |
1638 | | * reduced_box_next |
1639 | | * |
1640 | | * Compute the bounding box of this blob with merging of x overlaps |
1641 | | * but no pre-chopping. |
1642 | | * Then move the iterator on to the start of the next blob. |
1643 | | * DON'T reduce the box for small things - eg punctuation. |
1644 | | **********************************************************************/ |
1645 | | TBOX Textord::reduced_box_next(TO_ROW *row, // current row |
1646 | | BLOBNBOX_IT *it // iterator to blobds |
1647 | 6.51M | ) { |
1648 | 6.51M | BLOBNBOX *blob; // current blob |
1649 | 6.51M | BLOBNBOX *head_blob; // place to store box |
1650 | 6.51M | TBOX full_box; // full blob boundg box |
1651 | 6.51M | TBOX reduced_box; // box of significant part |
1652 | 6.51M | int16_t left_above_xht; // ABOVE xht left limit |
1653 | 6.51M | int16_t new_left_above_xht; // ABOVE xht left limit |
1654 | | |
1655 | 6.51M | blob = it->data(); |
1656 | 6.51M | if (blob->red_box_set()) { |
1657 | 5.27M | reduced_box = blob->reduced_box(); |
1658 | 10.7M | do { |
1659 | 10.7M | it->forward(); |
1660 | 10.7M | blob = it->data(); |
1661 | 10.7M | } while (blob->cblob() == nullptr || blob->joined_to_prev()); |
1662 | 5.27M | return reduced_box; |
1663 | 5.27M | } |
1664 | 1.23M | head_blob = blob; |
1665 | 1.23M | full_box = blob->bounding_box(); |
1666 | 1.23M | reduced_box = reduced_box_for_blob(blob, row, &left_above_xht); |
1667 | 2.54M | do { |
1668 | 2.54M | it->forward(); |
1669 | 2.54M | blob = it->data(); |
1670 | 2.54M | if (blob->cblob() == nullptr) { |
1671 | | // was pre-chopped |
1672 | 233k | full_box += blob->bounding_box(); |
1673 | 2.31M | } else if (blob->joined_to_prev()) { |
1674 | 1.07M | reduced_box += reduced_box_for_blob(blob, row, &new_left_above_xht); |
1675 | 1.07M | left_above_xht = std::min(left_above_xht, new_left_above_xht); |
1676 | 1.07M | } |
1677 | 2.54M | } |
1678 | | // until next real blob |
1679 | 2.54M | while (blob->cblob() == nullptr || blob->joined_to_prev()); |
1680 | | |
1681 | 1.23M | if ((reduced_box.width() > 0) && |
1682 | 1.23M | ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) && |
1683 | 1.23M | (reduced_box.height() > 0.7 * row->xheight)) { |
1684 | | #ifndef GRAPHICS_DISABLED |
1685 | | if (textord_show_initial_words) { |
1686 | | reduced_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW); |
1687 | | } |
1688 | | #endif |
1689 | 912k | } else { |
1690 | 912k | reduced_box = full_box; |
1691 | 912k | } |
1692 | 1.23M | head_blob->set_reduced_box(reduced_box); |
1693 | 1.23M | return reduced_box; |
1694 | 6.51M | } |
1695 | | |
1696 | | /************************************************************************* |
1697 | | * reduced_box_for_blob() |
1698 | | * Find box for blob which is the same height and y position as the whole blob, |
1699 | | * but whose left limit is the left most position of the blob ABOVE the |
1700 | | * baseline and whose right limit is the right most position of the blob BELOW |
1701 | | * the xheight. |
1702 | | * |
1703 | | * |
1704 | | * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on |
1705 | | * "home". Perhaps we need something which say if the width ABOVE the |
1706 | | * xht alone includes the whole of the reduced width, then use the full |
1707 | | * blob box - Might still fail on italic F |
1708 | | * |
1709 | | * Alternatively we could be a little less severe and only reduce the |
1710 | | * left and right edges by half the difference between the full box and |
1711 | | * the reduced box. |
1712 | | * |
1713 | | * NOTE that we need to rotate all the coordinates as |
1714 | | * find_blob_limits finds the y min and max within a specified x band |
1715 | | *************************************************************************/ |
1716 | 2.31M | TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) { |
1717 | 2.31M | float baseline; |
1718 | 2.31M | float blob_x_centre; |
1719 | 2.31M | float left_limit; |
1720 | 2.31M | float right_limit; |
1721 | 2.31M | float junk; |
1722 | 2.31M | TBOX blob_box; |
1723 | | |
1724 | | /* Find baseline of centre of blob */ |
1725 | | |
1726 | 2.31M | blob_box = blob->bounding_box(); |
1727 | 2.31M | blob_x_centre = (blob_box.left() + blob_box.right()) / 2.0; |
1728 | 2.31M | baseline = row->baseline.y(blob_x_centre); |
1729 | | |
1730 | | /* |
1731 | | Find LH limit of blob ABOVE the xht. This is so that we can detect certain |
1732 | | caps ht chars which should NOT have their box reduced: T, Y, V, W etc |
1733 | | */ |
1734 | 2.31M | left_limit = static_cast<float>(INT32_MAX); |
1735 | 2.31M | junk = static_cast<float>(-INT32_MAX); |
1736 | 2.31M | find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), static_cast<float>(INT16_MAX), |
1737 | 2.31M | left_limit, junk); |
1738 | 2.31M | if (left_limit > junk) { |
1739 | 1.88M | *left_above_xht = INT16_MAX; // No area above xht |
1740 | 1.88M | } else { |
1741 | 425k | *left_above_xht = static_cast<int16_t>(std::floor(left_limit)); |
1742 | 425k | } |
1743 | | /* |
1744 | | Find reduced LH limit of blob - the left extent of the region ABOVE the |
1745 | | baseline. |
1746 | | */ |
1747 | 2.31M | left_limit = static_cast<float>(INT32_MAX); |
1748 | 2.31M | junk = static_cast<float>(-INT32_MAX); |
1749 | 2.31M | find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX), left_limit, junk); |
1750 | | |
1751 | 2.31M | if (left_limit > junk) { |
1752 | 199k | return TBOX(); // no area within xht so return empty box |
1753 | 199k | } |
1754 | | /* |
1755 | | Find reduced RH limit of blob - the right extent of the region BELOW the xht. |
1756 | | */ |
1757 | 2.11M | junk = static_cast<float>(INT32_MAX); |
1758 | 2.11M | right_limit = static_cast<float>(-INT32_MAX); |
1759 | 2.11M | find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (baseline + row->xheight), junk, |
1760 | 2.11M | right_limit); |
1761 | 2.11M | if (junk > right_limit) { |
1762 | 258k | return TBOX(); // no area within xht so return empty box |
1763 | 258k | } |
1764 | | |
1765 | 1.85M | return TBOX(ICOORD(static_cast<int16_t>(std::floor(left_limit)), blob_box.bottom()), |
1766 | 1.85M | ICOORD(static_cast<int16_t>(std::ceil(right_limit)), blob_box.top())); |
1767 | 2.11M | } |
1768 | | } // namespace tesseract |