/src/tesseract/src/textord/topitch.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: topitch.cpp (Formerly to_pitch.c) |
3 | | * Description: Code to determine fixed pitchness and the pitch if fixed. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1993, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | // Include automatically generated configuration file if running autoconf. |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" |
22 | | #endif |
23 | | |
24 | | #include "topitch.h" |
25 | | |
26 | | #include "blobbox.h" |
27 | | #include "drawtord.h" |
28 | | #include "makerow.h" |
29 | | #include "pithsync.h" |
30 | | #include "pitsync1.h" |
31 | | #include "statistc.h" |
32 | | #include "tovars.h" |
33 | | #include "wordseg.h" |
34 | | |
35 | | #include "helpers.h" |
36 | | |
37 | | #include <memory> |
38 | | |
39 | | namespace tesseract { |
40 | | |
41 | | static BOOL_VAR(textord_all_prop, false, "All doc is proportial text"); |
42 | | BOOL_VAR(textord_debug_pitch_test, false, "Debug on fixed pitch test"); |
43 | | static BOOL_VAR(textord_disable_pitch_test, false, "Turn off dp fixed pitch algorithm"); |
44 | | BOOL_VAR(textord_fast_pitch_test, false, "Do even faster pitch algorithm"); |
45 | | BOOL_VAR(textord_debug_pitch_metric, false, "Write full metric stuff"); |
46 | | BOOL_VAR(textord_show_row_cuts, false, "Draw row-level cuts"); |
47 | | BOOL_VAR(textord_show_page_cuts, false, "Draw page-level cuts"); |
48 | | BOOL_VAR(textord_blockndoc_fixed, false, "Attempt whole doc/block fixed pitch"); |
49 | | double_VAR(textord_projection_scale, 0.200, "Ding rate for mid-cuts"); |
50 | | double_VAR(textord_balance_factor, 1.0, "Ding rate for unbalanced char cells"); |
51 | | |
52 | 386k | #define BLOCK_STATS_CLUSTERS 10 |
53 | 17.2k | #define MAX_ALLOWED_PITCH 100 // max pixel pitch. |
54 | | |
// qsort comparator ordering floats ascending.
// Returns 1 when *arg1 is larger, -1 when it is smaller, and 0 otherwise
// (equal values, or a comparison involving NaN).
static int sort_floats(const void *arg1, const void *arg2) {
  const float lhs = *static_cast<const float *>(arg1);
  const float rhs = *static_cast<const float *>(arg2);
  if (lhs > rhs) {
    return 1;
  }
  return lhs < rhs ? -1 : 0;
}
66 | | |
67 | | /********************************************************************** |
68 | | * compute_fixed_pitch |
69 | | * |
70 | | * Decide whether each row is fixed pitch individually. |
71 | | * Correlate definite and uncertain results to obtain an individual |
72 | | * result for each row in the TO_ROW class. |
73 | | **********************************************************************/ |
74 | | |
75 | | void compute_fixed_pitch(ICOORD page_tr, // top right |
76 | | TO_BLOCK_LIST *port_blocks, // input list |
77 | | float gradient, // page skew |
78 | | FCOORD rotation, // for drawing |
79 | 17.2k | bool testing_on) { // correct orientation |
80 | 17.2k | TO_BLOCK_IT block_it; // iterator |
81 | 17.2k | TO_BLOCK *block; // current block; |
82 | 17.2k | TO_ROW *row; // current row |
83 | 17.2k | int block_index; // block number |
84 | 17.2k | int row_index; // row number |
85 | | |
86 | | #ifndef GRAPHICS_DISABLED |
87 | | if (textord_show_initial_words && testing_on) { |
88 | | if (to_win == nullptr) { |
89 | | create_to_win(page_tr); |
90 | | } |
91 | | } |
92 | | #endif |
93 | | |
94 | 17.2k | block_it.set_to_list(port_blocks); |
95 | 17.2k | block_index = 1; |
96 | 34.5k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
97 | 17.2k | block = block_it.data(); |
98 | 17.2k | compute_block_pitch(block, rotation, block_index, testing_on); |
99 | 17.2k | block_index++; |
100 | 17.2k | } |
101 | | |
102 | 17.2k | if (!try_doc_fixed(page_tr, port_blocks, gradient)) { |
103 | 17.2k | block_index = 1; |
104 | 34.5k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
105 | 17.2k | block = block_it.data(); |
106 | 17.2k | if (!try_block_fixed(block, block_index)) { |
107 | 17.2k | try_rows_fixed(block, block_index, testing_on); |
108 | 17.2k | } |
109 | 17.2k | block_index++; |
110 | 17.2k | } |
111 | 17.2k | } |
112 | | |
113 | 17.2k | block_index = 1; |
114 | 34.5k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
115 | 17.2k | block = block_it.data(); |
116 | 17.2k | POLY_BLOCK *pb = block->block->pdblk.poly_block(); |
117 | 17.2k | if (pb != nullptr && !pb->IsText()) { |
118 | 0 | continue; // Non-text doesn't exist! |
119 | 0 | } |
120 | | // row iterator |
121 | 17.2k | TO_ROW_IT row_it(block->get_rows()); |
122 | 17.2k | row_index = 1; |
123 | 203k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
124 | 185k | row = row_it.data(); |
125 | 185k | fix_row_pitch(row, block, port_blocks, row_index, block_index); |
126 | 185k | row_index++; |
127 | 185k | } |
128 | 17.2k | block_index++; |
129 | 17.2k | } |
130 | | #ifndef GRAPHICS_DISABLED |
131 | | if (textord_show_initial_words && testing_on) { |
132 | | ScrollView::Update(); |
133 | | } |
134 | | #endif |
135 | 17.2k | } |
136 | | |
137 | | /********************************************************************** |
138 | | * fix_row_pitch |
139 | | * |
140 | | * Get a pitch_decision for this row by voting among similar rows in the |
141 | | * block, then similar rows over all the page, or any other rows at all. |
142 | | **********************************************************************/ |
143 | | |
144 | | void fix_row_pitch(TO_ROW *bad_row, // row to fix |
145 | | TO_BLOCK *bad_block, // block of bad_row |
146 | | TO_BLOCK_LIST *blocks, // blocks to scan |
147 | | int32_t row_target, // number of row |
148 | 185k | int32_t block_target) { // number of block |
149 | 185k | int16_t mid_cuts; |
150 | 185k | int block_votes; // votes in block |
151 | 185k | int like_votes; // votes over page |
152 | 185k | int other_votes; // votes of unlike blocks |
153 | 185k | int block_index; // number of block |
154 | 185k | int maxwidth; // max pitch |
155 | 185k | TO_BLOCK_IT block_it = blocks; // block iterator |
156 | 185k | TO_BLOCK *block; // current block |
157 | 185k | TO_ROW *row; // current row |
158 | 185k | float sp_sd; // space deviation |
159 | 185k | STATS block_stats; // pitches in block |
160 | 185k | STATS like_stats; // pitches in page |
161 | | |
162 | 185k | block_votes = like_votes = other_votes = 0; |
163 | 185k | maxwidth = static_cast<int32_t>(ceil(bad_row->xheight * textord_words_maxspace)); |
164 | 185k | if (bad_row->pitch_decision != PITCH_DEF_FIXED && bad_row->pitch_decision != PITCH_DEF_PROP) { |
165 | 176k | block_stats.set_range(0, maxwidth - 1); |
166 | 176k | like_stats.set_range(0, maxwidth - 1); |
167 | 176k | block_index = 1; |
168 | 352k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
169 | 176k | block = block_it.data(); |
170 | 176k | POLY_BLOCK *pb = block->block->pdblk.poly_block(); |
171 | 176k | if (pb != nullptr && !pb->IsText()) { |
172 | 0 | continue; // Non text doesn't exist! |
173 | 0 | } |
174 | 176k | TO_ROW_IT row_it(block->get_rows()); |
175 | 3.84M | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
176 | 3.67M | row = row_it.data(); |
177 | 3.67M | if ((bad_row->all_caps && |
178 | 3.67M | row->xheight + row->ascrise < |
179 | 423k | (bad_row->xheight + bad_row->ascrise) * (1 + textord_pitch_rowsimilarity) && |
180 | 3.67M | row->xheight + row->ascrise > |
181 | 217k | (bad_row->xheight + bad_row->ascrise) * (1 - textord_pitch_rowsimilarity)) || |
182 | 3.67M | (!bad_row->all_caps && |
183 | 3.51M | row->xheight < bad_row->xheight * (1 + textord_pitch_rowsimilarity) && |
184 | 3.51M | row->xheight > bad_row->xheight * (1 - textord_pitch_rowsimilarity))) { |
185 | 2.94M | if (block_index == block_target) { |
186 | 2.94M | if (row->pitch_decision == PITCH_DEF_FIXED) { |
187 | 20.0k | block_votes += textord_words_veto_power; |
188 | 20.0k | block_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power); |
189 | 2.92M | } else if (row->pitch_decision == PITCH_MAYBE_FIXED || |
190 | 2.92M | row->pitch_decision == PITCH_CORR_FIXED) { |
191 | 43.9k | block_votes++; |
192 | 43.9k | block_stats.add(static_cast<int32_t>(row->fixed_pitch), 1); |
193 | 2.88M | } else if (row->pitch_decision == PITCH_DEF_PROP) { |
194 | 48.3k | block_votes -= textord_words_veto_power; |
195 | 2.83M | } else if (row->pitch_decision == PITCH_MAYBE_PROP || |
196 | 2.83M | row->pitch_decision == PITCH_CORR_PROP) { |
197 | 1.34M | block_votes--; |
198 | 1.34M | } |
199 | 2.94M | } else { |
200 | 0 | if (row->pitch_decision == PITCH_DEF_FIXED) { |
201 | 0 | like_votes += textord_words_veto_power; |
202 | 0 | like_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power); |
203 | 0 | } else if (row->pitch_decision == PITCH_MAYBE_FIXED || |
204 | 0 | row->pitch_decision == PITCH_CORR_FIXED) { |
205 | 0 | like_votes++; |
206 | 0 | like_stats.add(static_cast<int32_t>(row->fixed_pitch), 1); |
207 | 0 | } else if (row->pitch_decision == PITCH_DEF_PROP) { |
208 | 0 | like_votes -= textord_words_veto_power; |
209 | 0 | } else if (row->pitch_decision == PITCH_MAYBE_PROP || |
210 | 0 | row->pitch_decision == PITCH_CORR_PROP) { |
211 | 0 | like_votes--; |
212 | 0 | } |
213 | 0 | } |
214 | 2.94M | } else { |
215 | 728k | if (row->pitch_decision == PITCH_DEF_FIXED) { |
216 | 2.67k | other_votes += textord_words_veto_power; |
217 | 725k | } else if (row->pitch_decision == PITCH_MAYBE_FIXED || |
218 | 725k | row->pitch_decision == PITCH_CORR_FIXED) { |
219 | 5.38k | other_votes++; |
220 | 720k | } else if (row->pitch_decision == PITCH_DEF_PROP) { |
221 | 30.9k | other_votes -= textord_words_veto_power; |
222 | 689k | } else if (row->pitch_decision == PITCH_MAYBE_PROP || |
223 | 689k | row->pitch_decision == PITCH_CORR_PROP) { |
224 | 350k | other_votes--; |
225 | 350k | } |
226 | 728k | } |
227 | 3.67M | } |
228 | 176k | block_index++; |
229 | 176k | } |
230 | 176k | if (block_votes > textord_words_veto_power) { |
231 | 3.79k | bad_row->fixed_pitch = block_stats.ile(0.5); |
232 | 3.79k | bad_row->pitch_decision = PITCH_CORR_FIXED; |
233 | 172k | } else if (block_votes <= textord_words_veto_power && like_votes > 0) { |
234 | 0 | bad_row->fixed_pitch = like_stats.ile(0.5); |
235 | 0 | bad_row->pitch_decision = PITCH_CORR_FIXED; |
236 | 172k | } else { |
237 | 172k | bad_row->pitch_decision = PITCH_CORR_PROP; |
238 | 172k | if (block_votes == 0 && like_votes == 0 && other_votes > 0 && |
239 | 172k | (textord_debug_pitch_test || textord_debug_pitch_metric)) { |
240 | 0 | tprintf( |
241 | 0 | "Warning:row %d of block %d set prop with no like rows against " |
242 | 0 | "trend\n", |
243 | 0 | row_target, block_target); |
244 | 0 | } |
245 | 172k | } |
246 | 176k | } |
247 | 185k | if (textord_debug_pitch_metric) { |
248 | 0 | tprintf(":b_votes=%d:l_votes=%d:o_votes=%d", block_votes, like_votes, other_votes); |
249 | 0 | tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise); |
250 | 0 | } |
251 | 185k | if (bad_row->pitch_decision == PITCH_CORR_FIXED) { |
252 | 3.79k | if (bad_row->fixed_pitch < textord_min_xheight) { |
253 | 1.98k | if (block_votes > 0) { |
254 | 1.98k | bad_row->fixed_pitch = block_stats.ile(0.5); |
255 | 1.98k | } else if (block_votes == 0 && like_votes > 0) { |
256 | 0 | bad_row->fixed_pitch = like_stats.ile(0.5); |
257 | 0 | } else { |
258 | 0 | tprintf("Warning:guessing pitch as xheight on row %d, block %d\n", row_target, |
259 | 0 | block_target); |
260 | 0 | bad_row->fixed_pitch = bad_row->xheight; |
261 | 0 | } |
262 | 1.98k | } |
263 | 3.79k | if (bad_row->fixed_pitch < textord_min_xheight) { |
264 | 1.98k | bad_row->fixed_pitch = (float)textord_min_xheight; |
265 | 1.98k | } |
266 | 3.79k | bad_row->kern_size = bad_row->fixed_pitch / 4; |
267 | 3.79k | bad_row->min_space = static_cast<int32_t>(bad_row->fixed_pitch * 0.6); |
268 | 3.79k | bad_row->max_nonspace = static_cast<int32_t>(bad_row->fixed_pitch * 0.4); |
269 | 3.79k | bad_row->space_threshold = (bad_row->min_space + bad_row->max_nonspace) / 2; |
270 | 3.79k | bad_row->space_size = bad_row->fixed_pitch; |
271 | 3.79k | if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) { |
272 | 2.63k | tune_row_pitch(bad_row, &bad_row->projection, bad_row->projection_left, |
273 | 2.63k | bad_row->projection_right, |
274 | 2.63k | (bad_row->fixed_pitch + bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch, |
275 | 2.63k | sp_sd, mid_cuts, &bad_row->char_cells, false); |
276 | 2.63k | } |
277 | 182k | } else if (bad_row->pitch_decision == PITCH_CORR_PROP || |
278 | 182k | bad_row->pitch_decision == PITCH_DEF_PROP) { |
279 | 180k | bad_row->fixed_pitch = 0.0f; |
280 | 180k | bad_row->char_cells.clear(); |
281 | 180k | } |
282 | 185k | } |
283 | | |
284 | | /********************************************************************** |
285 | | * compute_block_pitch |
286 | | * |
287 | | * Decide whether each block is fixed pitch individually. |
288 | | **********************************************************************/ |
289 | | |
290 | | void compute_block_pitch(TO_BLOCK *block, // input list |
291 | | FCOORD rotation, // for drawing |
292 | | int32_t block_index, // block number |
293 | 17.2k | bool testing_on) { // correct orientation |
294 | 17.2k | TBOX block_box; // bounding box |
295 | | |
296 | 17.2k | block_box = block->block->pdblk.bounding_box(); |
297 | 17.2k | if (testing_on && textord_debug_pitch_test) { |
298 | 0 | tprintf("Block %d at (%d,%d)->(%d,%d)\n", block_index, block_box.left(), block_box.bottom(), |
299 | 0 | block_box.right(), block_box.top()); |
300 | 0 | } |
301 | 17.2k | block->min_space = static_cast<int32_t>(floor(block->xheight * textord_words_default_minspace)); |
302 | 17.2k | block->max_nonspace = static_cast<int32_t>(ceil(block->xheight * textord_words_default_nonspace)); |
303 | 17.2k | block->fixed_pitch = 0.0f; |
304 | 17.2k | block->space_size = static_cast<float>(block->min_space); |
305 | 17.2k | block->kern_size = static_cast<float>(block->max_nonspace); |
306 | 17.2k | block->pr_nonsp = block->xheight * words_default_prop_nonspace; |
307 | 17.2k | block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop; |
308 | 17.2k | if (!block->get_rows()->empty()) { |
309 | 16.7k | ASSERT_HOST(block->xheight > 0); |
310 | 16.7k | find_repeated_chars(block, textord_show_initial_words && testing_on); |
311 | | #ifndef GRAPHICS_DISABLED |
312 | | if (textord_show_initial_words && testing_on) { |
313 | | // overlap_picture_ops(true); |
314 | | ScrollView::Update(); |
315 | | } |
316 | | #endif |
317 | 16.7k | compute_rows_pitch(block, block_index, textord_debug_pitch_test && testing_on); |
318 | 16.7k | } |
319 | 17.2k | } |
320 | | |
321 | | /********************************************************************** |
322 | | * compute_rows_pitch |
323 | | * |
324 | | * Decide whether each row is fixed pitch individually. |
325 | | **********************************************************************/ |
326 | | |
327 | | bool compute_rows_pitch( // find line stats |
328 | | TO_BLOCK *block, // block to do |
329 | | int32_t block_index, // block number |
330 | | bool testing_on // correct orientation |
331 | 16.7k | ) { |
332 | 16.7k | int32_t maxwidth; // of spaces |
333 | 16.7k | TO_ROW *row; // current row |
334 | 16.7k | int32_t row_index; // row number. |
335 | 16.7k | float lower, upper; // cluster thresholds |
336 | 16.7k | TO_ROW_IT row_it = block->get_rows(); |
337 | | |
338 | 16.7k | row_index = 1; |
339 | 202k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
340 | 185k | row = row_it.data(); |
341 | 185k | ASSERT_HOST(row->xheight > 0); |
342 | 185k | row->compute_vertical_projection(); |
343 | 185k | maxwidth = static_cast<int32_t>(ceil(row->xheight * textord_words_maxspace)); |
344 | 185k | if (row_pitch_stats(row, maxwidth, testing_on) && |
345 | 185k | find_row_pitch(row, maxwidth, textord_dotmatrix_gap + 1, block, block_index, row_index, |
346 | 128k | testing_on)) { |
347 | 14.6k | if (row->fixed_pitch == 0) { |
348 | 0 | lower = row->pr_nonsp; |
349 | 0 | upper = row->pr_space; |
350 | 0 | row->space_size = upper; |
351 | 0 | row->kern_size = lower; |
352 | 0 | } |
353 | 171k | } else { |
354 | 171k | row->fixed_pitch = 0.0f; // insufficient data |
355 | 171k | row->pitch_decision = PITCH_DUNNO; |
356 | 171k | } |
357 | 185k | row_index++; |
358 | 185k | } |
359 | 16.7k | return false; |
360 | 16.7k | } |
361 | | |
/**********************************************************************
 * try_doc_fixed
 *
 * Attempt to call the entire document fixed pitch.
 *
 * Does nothing unless textord_blockndoc_fixed is set and the first block
 * has rows. Otherwise it merges the vertical projections of all rows into
 * one page-wide projection (shifting each row to compensate for the page
 * skew), takes pitches.ile(0.5) of the positive row pitches as the
 * candidate pitch and runs tune_row_pitch on the merged projection.
 * The result is only reported/plotted: as written, every path of this
 * function returns false.
 **********************************************************************/

bool try_doc_fixed(             // determine pitch
    ICOORD page_tr,             // top right
    TO_BLOCK_LIST *port_blocks, // input list
    float gradient              // page skew
) {
  int16_t master_x;    // uniform shifts
  int16_t pitch;       // median pitch.
  int x;               // profile coord
  int prop_blocks;     // correct counts
  int fixed_blocks;
  int total_row_count; // total in page
                       // iterator
  TO_BLOCK_IT block_it = port_blocks;
  TO_BLOCK *block;         // current block;
  TO_ROW *row;             // current row
  int16_t projection_left; // edges
  int16_t projection_right;
  int16_t row_left; // edges of row
  int16_t row_right;
  float master_y;     // uniform shifts
  float shift_factor; // page skew correction
  float final_pitch;  // output pitch
  float row_y;        // baseline
  STATS projection;   // entire page
  STATS pitches(0, MAX_ALLOWED_PITCH - 1);
  // for median
  float sp_sd;      // space sd
  int16_t mid_cuts; // no of cheap cuts
  float pitch_sd;   // sync rating

  // The whole test is opt-in, and it needs at least one row in the first
  // block to serve as the reference row.
  if (!textord_blockndoc_fixed ||
      block_it.empty() || block_it.data()->get_rows()->empty()) {
    return false;
  }
  // Horizontal shift per unit of vertical baseline offset.
  shift_factor = gradient / (gradient * gradient + 1);
  // row iterator
  TO_ROW_IT row_it(block_it.data()->get_rows());
  // Reference point: left projection edge of the first row of the first
  // block, and that row's baseline height there.
  master_x = row_it.data()->projection_left;
  master_y = row_it.data()->baseline.y(master_x);
  projection_left = INT16_MAX;
  projection_right = -INT16_MAX;
  // prop_blocks and fixed_blocks are never changed after this point; they
  // are only printed by the debug output below.
  prop_blocks = 0;
  fixed_blocks = 0;
  total_row_count = 0;

  // Pass 1: count the rows, collect the positive row pitches and find the
  // extent of the skew-corrected projections over the whole page.
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    block = block_it.data();
    row_it.set_to_list(block->get_rows());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      row = row_it.data();
      total_row_count++;
      if (row->fixed_pitch > 0) {
        pitches.add(static_cast<int32_t>(row->fixed_pitch), 1);
      }
      // find median
      row_y = row->baseline.y(master_x);
      row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));
      row_right = static_cast<int16_t>(row->projection_right - shift_factor * (master_y - row_y));
      if (row_left < projection_left) {
        projection_left = row_left;
      }
      if (row_right > projection_right) {
        projection_right = row_right;
      }
    }
  }
  // Without any pitch sample there is nothing to test.
  // (presumably get_total() is the number of samples added - confirm in STATS)
  if (pitches.get_total() == 0) {
    return false;
  }
  projection.set_range(projection_left, projection_right - 1);

  // Pass 2: add every row's projection into the page projection at its
  // skew-corrected position.
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    block = block_it.data();
    row_it.set_to_list(block->get_rows());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      row = row_it.data();
      row_y = row->baseline.y(master_x);
      row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));
      for (x = row->projection_left; x < row->projection_right; x++, row_left++) {
        projection.add(row_left, row->projection.pile_count(x));
      }
    }
  }

  // Use the first row of the block the iterator now points at as the
  // holder of the page-level character cells.
  // NOTE(review): presumably this is the first block again after the
  // completed cycle loop - confirm against the list iterator.
  row_it.set_to_list(block_it.data()->get_rows());
  row = row_it.data();
#ifndef GRAPHICS_DISABLED
  if (textord_show_page_cuts && to_win != nullptr) {
    projection.plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
  }
#endif
  // Candidate pitch from the collected row pitches; pitch is its
  // truncation to an integer.
  final_pitch = pitches.ile(0.5);
  pitch = static_cast<int16_t>(final_pitch);
  // final_pitch, sp_sd and mid_cuts are handed to tune_row_pitch along
  // with the row's cell list; the return value is kept as the sync rating.
  pitch_sd = tune_row_pitch(row, &projection, projection_left, projection_right, pitch * 0.75,
                            final_pitch, sp_sd, mid_cuts, &row->char_cells, false);

  if (textord_debug_pitch_metric) {
    tprintf(
        "try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%"
        "g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
        prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, pitch_sd / total_row_count,
        pitch_sd / pitch, pitch_sd / total_row_count / pitch);
  }

#ifndef GRAPHICS_DISABLED
  if (textord_show_page_cuts && to_win != nullptr) {
    float row_shift;              // shift for row
    ICOORDELT_LIST *master_cells; // cells for page
    master_cells = &row->char_cells;
    // Draw the page-level cells on every row, shifted for the skew.
    for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
      block = block_it.data();
      row_it.set_to_list(block->get_rows());
      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
        row = row_it.data();
        row_y = row->baseline.y(master_x);
        row_shift = shift_factor * (master_y - row_y);
        plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);
      }
    }
  }
#endif
  // Clear the cell list of whichever row `row` points at now, and report
  // "not fixed" regardless of the rating.
  row->char_cells.clear();
  return false;
}
492 | | |
493 | | /********************************************************************** |
494 | | * try_block_fixed |
495 | | * |
496 | | * Try to call the entire block fixed. |
497 | | **********************************************************************/ |
498 | | |
499 | | bool try_block_fixed( // find line stats |
500 | | TO_BLOCK *block, // block to do |
501 | | int32_t block_index // block number |
502 | 17.2k | ) { |
503 | 17.2k | return false; |
504 | 17.2k | } |
505 | | |
506 | | /********************************************************************** |
507 | | * try_rows_fixed |
508 | | * |
509 | | * Decide whether each row is fixed pitch individually. |
510 | | **********************************************************************/ |
511 | | |
512 | | bool try_rows_fixed( // find line stats |
513 | | TO_BLOCK *block, // block to do |
514 | | int32_t block_index, // block number |
515 | | bool testing_on // correct orientation |
516 | 17.2k | ) { |
517 | 17.2k | TO_ROW *row; // current row |
518 | 17.2k | int32_t def_fixed = 0; // counters |
519 | 17.2k | int32_t def_prop = 0; |
520 | 17.2k | int32_t maybe_fixed = 0; |
521 | 17.2k | int32_t maybe_prop = 0; |
522 | 17.2k | int32_t dunno = 0; |
523 | 17.2k | int32_t corr_fixed = 0; |
524 | 17.2k | int32_t corr_prop = 0; |
525 | 17.2k | float lower, upper; // cluster thresholds |
526 | 17.2k | TO_ROW_IT row_it = block->get_rows(); |
527 | | |
528 | 203k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
529 | 185k | row = row_it.data(); |
530 | 185k | ASSERT_HOST(row->xheight > 0); |
531 | 185k | if (row->fixed_pitch > 0 && fixed_pitch_row(row, block->block, block_index)) { |
532 | 14.6k | if (row->fixed_pitch == 0) { |
533 | 0 | lower = row->pr_nonsp; |
534 | 0 | upper = row->pr_space; |
535 | 0 | row->space_size = upper; |
536 | 0 | row->kern_size = lower; |
537 | 0 | } |
538 | 14.6k | } |
539 | 185k | } |
540 | 17.2k | count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop, |
541 | 17.2k | dunno); |
542 | 17.2k | if (testing_on && |
543 | 17.2k | (textord_debug_pitch_test || textord_blocksall_prop || textord_blocksall_fixed)) { |
544 | 0 | tprintf("Initially:"); |
545 | 0 | print_block_counts(block, block_index); |
546 | 0 | } |
547 | 17.2k | if (def_fixed > def_prop * textord_words_veto_power) { |
548 | 269 | block->pitch_decision = PITCH_DEF_FIXED; |
549 | 17.0k | } else if (def_prop > def_fixed * textord_words_veto_power) { |
550 | 1.45k | block->pitch_decision = PITCH_DEF_PROP; |
551 | 15.5k | } else if (def_fixed > 0 || def_prop > 0) { |
552 | 190 | block->pitch_decision = PITCH_DUNNO; |
553 | 15.3k | } else if (maybe_fixed > maybe_prop * textord_words_veto_power) { |
554 | 58 | block->pitch_decision = PITCH_MAYBE_FIXED; |
555 | 15.3k | } else if (maybe_prop > maybe_fixed * textord_words_veto_power) { |
556 | 188 | block->pitch_decision = PITCH_MAYBE_PROP; |
557 | 15.1k | } else { |
558 | 15.1k | block->pitch_decision = PITCH_DUNNO; |
559 | 15.1k | } |
560 | 17.2k | return false; |
561 | 17.2k | } |
562 | | |
563 | | /********************************************************************** |
564 | | * print_block_counts |
565 | | * |
566 | | * Count up how many rows have what decision and print the results. |
567 | | **********************************************************************/ |
568 | | |
569 | | void print_block_counts( // find line stats |
570 | | TO_BLOCK *block, // block to do |
571 | | int32_t block_index // block number |
572 | 0 | ) { |
573 | 0 | int32_t def_fixed = 0; // counters |
574 | 0 | int32_t def_prop = 0; |
575 | 0 | int32_t maybe_fixed = 0; |
576 | 0 | int32_t maybe_prop = 0; |
577 | 0 | int32_t dunno = 0; |
578 | 0 | int32_t corr_fixed = 0; |
579 | 0 | int32_t corr_prop = 0; |
580 | |
|
581 | 0 | count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop, |
582 | 0 | dunno); |
583 | 0 | tprintf("Block %d has (%d,%d,%d)", block_index, def_fixed, maybe_fixed, corr_fixed); |
584 | 0 | if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed)) { |
585 | 0 | tprintf(" (Wrongly)"); |
586 | 0 | } |
587 | 0 | tprintf(" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop); |
588 | 0 | if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop)) { |
589 | 0 | tprintf(" (Wrongly)"); |
590 | 0 | } |
591 | 0 | tprintf(" prop, %d dunno\n", dunno); |
592 | 0 | } |
593 | | |
594 | | /********************************************************************** |
595 | | * count_block_votes |
596 | | * |
597 | | * Count the number of rows in the block with each kind of pitch_decision. |
598 | | **********************************************************************/ |
599 | | |
600 | | void count_block_votes( // find line stats |
601 | | TO_BLOCK *block, // block to do |
602 | | int32_t &def_fixed, // add to counts |
603 | | int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed, |
604 | 17.2k | int32_t &corr_prop, int32_t &dunno) { |
605 | 17.2k | TO_ROW *row; // current row |
606 | 17.2k | TO_ROW_IT row_it = block->get_rows(); |
607 | | |
608 | 203k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
609 | 185k | row = row_it.data(); |
610 | 185k | switch (row->pitch_decision) { |
611 | 171k | case PITCH_DUNNO: |
612 | 171k | dunno++; |
613 | 171k | break; |
614 | 7.88k | case PITCH_DEF_PROP: |
615 | 7.88k | def_prop++; |
616 | 7.88k | break; |
617 | 3.95k | case PITCH_MAYBE_PROP: |
618 | 3.95k | maybe_prop++; |
619 | 3.95k | break; |
620 | 1.77k | case PITCH_DEF_FIXED: |
621 | 1.77k | def_fixed++; |
622 | 1.77k | break; |
623 | 1.06k | case PITCH_MAYBE_FIXED: |
624 | 1.06k | maybe_fixed++; |
625 | 1.06k | break; |
626 | 0 | case PITCH_CORR_PROP: |
627 | 0 | corr_prop++; |
628 | 0 | break; |
629 | 0 | case PITCH_CORR_FIXED: |
630 | 0 | corr_fixed++; |
631 | 0 | break; |
632 | 185k | } |
633 | 185k | } |
634 | 17.2k | } |
635 | | |
636 | | /********************************************************************** |
637 | | * row_pitch_stats |
638 | | * |
639 | | * Decide whether each row is fixed pitch individually. |
640 | | **********************************************************************/ |
641 | | |
642 | | bool row_pitch_stats( // find line stats |
643 | | TO_ROW *row, // current row |
644 | | int32_t maxwidth, // of spaces |
645 | | bool testing_on // correct orientation |
646 | 185k | ) { |
647 | 185k | BLOBNBOX *blob; // current blob |
648 | 185k | int gap_index; // current gap |
649 | 185k | int32_t prev_x; // end of prev blob |
650 | 185k | int32_t cluster_count; // no of clusters |
651 | 185k | int32_t prev_count; // of clusters |
652 | 185k | int32_t smooth_factor; // for smoothing stats |
653 | 185k | TBOX blob_box; // bounding box |
654 | 185k | float lower, upper; // cluster thresholds |
655 | | // gap sizes |
656 | 185k | float gaps[BLOCK_STATS_CLUSTERS]; |
657 | | // blobs |
658 | 185k | BLOBNBOX_IT blob_it = row->blob_list(); |
659 | 185k | STATS gap_stats(0, maxwidth - 1); |
660 | 185k | STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; |
661 | | // clusters |
662 | | |
663 | 185k | smooth_factor = static_cast<int32_t>(row->xheight * textord_wordstats_smooth_factor + 1.5); |
664 | 185k | if (!blob_it.empty()) { |
665 | 185k | prev_x = blob_it.data()->bounding_box().right(); |
666 | 185k | blob_it.forward(); |
667 | 2.68M | while (!blob_it.at_first()) { |
668 | 2.50M | blob = blob_it.data(); |
669 | 2.50M | if (!blob->joined_to_prev()) { |
670 | 1.33M | blob_box = blob->bounding_box(); |
671 | 1.33M | if (blob_box.left() - prev_x < maxwidth) { |
672 | 1.33M | gap_stats.add(blob_box.left() - prev_x, 1); |
673 | 1.33M | } |
674 | 1.33M | prev_x = blob_box.right(); |
675 | 1.33M | } |
676 | 2.50M | blob_it.forward(); |
677 | 2.50M | } |
678 | 185k | } |
679 | 185k | if (gap_stats.get_total() == 0) { |
680 | 57.1k | return false; |
681 | 57.1k | } |
682 | 128k | cluster_count = 0; |
683 | 128k | lower = row->xheight * words_initial_lower; |
684 | 128k | upper = row->xheight * words_initial_upper; |
685 | 128k | gap_stats.smooth(smooth_factor); |
686 | 257k | do { |
687 | 257k | prev_count = cluster_count; |
688 | 257k | cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, |
689 | 257k | BLOCK_STATS_CLUSTERS, cluster_stats); |
690 | 257k | } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); |
691 | 128k | if (cluster_count < 1) { |
692 | 0 | return false; |
693 | 0 | } |
694 | 304k | for (gap_index = 0; gap_index < cluster_count; gap_index++) { |
695 | 175k | gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5); |
696 | 175k | } |
697 | | // get medians |
698 | 128k | if (testing_on) { |
699 | 0 | tprintf("cluster_count=%d:", cluster_count); |
700 | 0 | for (gap_index = 0; gap_index < cluster_count; gap_index++) { |
701 | 0 | tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total()); |
702 | 0 | } |
703 | 0 | tprintf("\n"); |
704 | 0 | } |
705 | 128k | qsort(gaps, cluster_count, sizeof(float), sort_floats); |
706 | | |
707 | | // Try to find proportional non-space and space for row. |
708 | 128k | lower = row->xheight * words_default_prop_nonspace; |
709 | 128k | upper = row->xheight * textord_words_min_minspace; |
710 | 239k | for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < lower; gap_index++) { |
711 | 110k | ; |
712 | 110k | } |
713 | 128k | if (gap_index == 0) { |
714 | 18.6k | if (testing_on) { |
715 | 0 | tprintf("No clusters below nonspace threshold!!\n"); |
716 | 0 | } |
717 | 18.6k | if (cluster_count > 1) { |
718 | 4.91k | row->pr_nonsp = gaps[0]; |
719 | 4.91k | row->pr_space = gaps[1]; |
720 | 13.6k | } else { |
721 | 13.6k | row->pr_nonsp = lower; |
722 | 13.6k | row->pr_space = gaps[0]; |
723 | 13.6k | } |
724 | 110k | } else { |
725 | 110k | row->pr_nonsp = gaps[gap_index - 1]; |
726 | 111k | while (gap_index < cluster_count && gaps[gap_index] < upper) { |
727 | 1.14k | gap_index++; |
728 | 1.14k | } |
729 | 110k | if (gap_index == cluster_count) { |
730 | 78.7k | if (testing_on) { |
731 | 0 | tprintf("No clusters above nonspace threshold!!\n"); |
732 | 0 | } |
733 | 78.7k | row->pr_space = lower * textord_spacesize_ratioprop; |
734 | 78.7k | } else { |
735 | 31.4k | row->pr_space = gaps[gap_index]; |
736 | 31.4k | } |
737 | 110k | } |
738 | | |
739 | | // Now try to find the fixed pitch space and non-space. |
740 | 128k | upper = row->xheight * words_default_fixed_space; |
741 | 281k | for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < upper; gap_index++) { |
742 | 152k | ; |
743 | 152k | } |
744 | 128k | if (gap_index == 0) { |
745 | 3.46k | if (testing_on) { |
746 | 0 | tprintf("No clusters below space threshold!!\n"); |
747 | 0 | } |
748 | 3.46k | row->fp_nonsp = upper; |
749 | 3.46k | row->fp_space = gaps[0]; |
750 | 125k | } else { |
751 | 125k | row->fp_nonsp = gaps[gap_index - 1]; |
752 | 125k | if (gap_index == cluster_count) { |
753 | 108k | if (testing_on) { |
754 | 0 | tprintf("No clusters above space threshold!!\n"); |
755 | 0 | } |
756 | 108k | row->fp_space = row->xheight; |
757 | 108k | } else { |
758 | 16.4k | row->fp_space = gaps[gap_index]; |
759 | 16.4k | } |
760 | 125k | } |
761 | 128k | if (testing_on) { |
762 | 0 | tprintf( |
763 | 0 | "Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, " |
764 | 0 | "fp_space=%g\n", |
765 | 0 | row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space); |
766 | 0 | } |
767 | 128k | return true; // computed some stats |
768 | 128k | } |
769 | | |
770 | | /********************************************************************** |
771 | | * find_row_pitch |
772 | | * |
773 | | * Check to see if this row could be fixed pitch using the given spacings. |
774 | | * Blobs with gaps smaller than the lower threshold are assumed to be one. |
775 | | * The larger threshold is the word gap threshold. |
776 | | **********************************************************************/ |
777 | | |
778 | | bool find_row_pitch( // find lines |
779 | | TO_ROW *row, // row to do |
780 | | int32_t maxwidth, // max permitted space |
781 | | int32_t dm_gap, // ignorable gaps |
782 | | TO_BLOCK *block, // block of row |
783 | | int32_t block_index, // block_number |
784 | | int32_t row_index, // number of row |
785 | | bool testing_on // correct orientation |
786 | 128k | ) { |
787 | 128k | bool used_dm_model; // looks like dot matrix |
788 | 128k | float min_space; // estimate threshold |
789 | 128k | float non_space; // gap size |
790 | 128k | float gap_iqr; // interquartile range |
791 | 128k | float pitch_iqr; |
792 | 128k | float dm_gap_iqr; // interquartile range |
793 | 128k | float dm_pitch_iqr; |
794 | 128k | float dm_pitch; // pitch with dm on |
795 | 128k | float pitch; // revised estimate |
796 | 128k | float initial_pitch; // guess at pitch |
797 | 128k | STATS gap_stats(0, maxwidth - 1); |
798 | | // centre-centre |
799 | 128k | STATS pitch_stats(0, maxwidth - 1); |
800 | | |
801 | 128k | row->fixed_pitch = 0.0f; |
802 | 128k | initial_pitch = row->fp_space; |
803 | 128k | if (initial_pitch > row->xheight * (1 + words_default_fixed_limit)) { |
804 | 5.36k | initial_pitch = row->xheight; // keep pitch decent |
805 | 5.36k | } |
806 | 128k | non_space = row->fp_nonsp; |
807 | 128k | if (non_space > initial_pitch) { |
808 | 0 | non_space = initial_pitch; |
809 | 0 | } |
810 | 128k | min_space = (initial_pitch + non_space) / 2; |
811 | | |
812 | 128k | if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false, |
813 | 128k | dm_gap)) { |
814 | 125k | dm_gap_iqr = 0.0001f; |
815 | 125k | dm_pitch_iqr = maxwidth * 2.0f; |
816 | 125k | dm_pitch = initial_pitch; |
817 | 125k | } else { |
818 | 3.14k | dm_gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25); |
819 | 3.14k | dm_pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25); |
820 | 3.14k | dm_pitch = pitch_stats.ile(0.5); |
821 | 3.14k | } |
822 | 128k | gap_stats.clear(); |
823 | 128k | pitch_stats.clear(); |
824 | 128k | if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false, 0)) { |
825 | 116k | gap_iqr = 0.0001f; |
826 | 116k | pitch_iqr = maxwidth * 3.0f; |
827 | 116k | } else { |
828 | 12.7k | gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25); |
829 | 12.7k | pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25); |
830 | 12.7k | if (testing_on) { |
831 | 0 | tprintf( |
832 | 0 | "First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, " |
833 | 0 | "pitch=%g\n", |
834 | 0 | initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5)); |
835 | 0 | } |
836 | 12.7k | initial_pitch = pitch_stats.ile(0.5); |
837 | 12.7k | if (min_space > initial_pitch && count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, |
838 | 3.33k | initial_pitch, true, false, 0)) { |
839 | 3.25k | min_space = initial_pitch; |
840 | 3.25k | gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25); |
841 | 3.25k | pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25); |
842 | 3.25k | if (testing_on) { |
843 | 0 | tprintf( |
844 | 0 | "Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, " |
845 | 0 | "pitch=%g\n", |
846 | 0 | initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5)); |
847 | 0 | } |
848 | 3.25k | initial_pitch = pitch_stats.ile(0.5); |
849 | 3.25k | } |
850 | 12.7k | } |
851 | 128k | if (textord_debug_pitch_metric) { |
852 | 0 | tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:", block_index, |
853 | 0 | row_index, 'X', pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr, |
854 | 0 | pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth |
855 | 0 | ? 'D' |
856 | 0 | : (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M')); |
857 | 0 | } |
858 | 128k | if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) { |
859 | 114k | row->pitch_decision = PITCH_DUNNO; |
860 | 114k | if (textord_debug_pitch_metric) { |
861 | 0 | tprintf("\n"); |
862 | 0 | } |
863 | 114k | return false; // insufficient data |
864 | 114k | } |
865 | 14.6k | if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) { |
866 | 12.7k | if (testing_on) { |
867 | 0 | tprintf( |
868 | 0 | "Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, " |
869 | 0 | "dm_gap_iqr=%g\n", |
870 | 0 | pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr); |
871 | 0 | } |
872 | 12.7k | gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25); |
873 | 12.7k | pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25); |
874 | 12.7k | pitch = pitch_stats.ile(0.5); |
875 | 12.7k | used_dm_model = false; |
876 | 12.7k | } else { |
877 | 1.97k | if (testing_on) { |
878 | 0 | tprintf( |
879 | 0 | "Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, " |
880 | 0 | "dm_gap_iqr=%g\n", |
881 | 0 | pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr); |
882 | 0 | } |
883 | 1.97k | gap_iqr = dm_gap_iqr; |
884 | 1.97k | pitch_iqr = dm_pitch_iqr; |
885 | 1.97k | pitch = dm_pitch; |
886 | 1.97k | used_dm_model = true; |
887 | 1.97k | } |
888 | 14.6k | if (textord_debug_pitch_metric) { |
889 | 0 | tprintf("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:", pitch_iqr, gap_iqr, pitch); |
890 | 0 | tprintf("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:", pitch_iqr / gap_iqr, pitch_iqr / block->xheight, |
891 | 0 | pitch_iqr < gap_iqr * textord_fpiqr_ratio && |
892 | 0 | pitch_iqr < block->xheight * textord_max_pitch_iqr && |
893 | 0 | pitch < block->xheight * textord_words_default_maxspace |
894 | 0 | ? 'F' |
895 | 0 | : 'P'); |
896 | 0 | } |
897 | 14.6k | if (pitch_iqr < gap_iqr * textord_fpiqr_ratio && |
898 | 14.6k | pitch_iqr < block->xheight * textord_max_pitch_iqr && |
899 | 14.6k | pitch < block->xheight * textord_words_default_maxspace) { |
900 | 7.27k | row->pitch_decision = PITCH_MAYBE_FIXED; |
901 | 7.39k | } else { |
902 | 7.39k | row->pitch_decision = PITCH_MAYBE_PROP; |
903 | 7.39k | } |
904 | 14.6k | row->fixed_pitch = pitch; |
905 | 14.6k | row->kern_size = gap_stats.ile(0.5); |
906 | 14.6k | row->min_space = static_cast<int32_t>(row->fixed_pitch + non_space) / 2; |
907 | 14.6k | if (row->min_space > row->fixed_pitch) { |
908 | 168 | row->min_space = static_cast<int32_t>(row->fixed_pitch); |
909 | 168 | } |
910 | 14.6k | row->max_nonspace = row->min_space; |
911 | 14.6k | row->space_size = row->fixed_pitch; |
912 | 14.6k | row->space_threshold = (row->max_nonspace + row->min_space) / 2; |
913 | 14.6k | row->used_dm_model = used_dm_model; |
914 | 14.6k | return true; |
915 | 128k | } |
916 | | |
917 | | /********************************************************************** |
918 | | * fixed_pitch_row |
919 | | * |
920 | | * Check to see if this row could be fixed pitch using the given spacings. |
921 | | * Blobs with gaps smaller than the lower threshold are assumed to be one. |
922 | | * The larger threshold is the word gap threshold. |
923 | | **********************************************************************/ |
924 | | |
925 | | bool fixed_pitch_row(TO_ROW *row, // row to do |
926 | | BLOCK *block, |
927 | | int32_t block_index // block_number |
928 | 14.6k | ) { |
929 | 14.6k | const char *res_string; // pitch result |
930 | 14.6k | int16_t mid_cuts; // no of cheap cuts |
931 | 14.6k | float non_space; // gap size |
932 | 14.6k | float pitch_sd; // error on pitch |
933 | 14.6k | float sp_sd = 0.0f; // space sd |
934 | | |
935 | 14.6k | non_space = row->fp_nonsp; |
936 | 14.6k | if (non_space > row->fixed_pitch) { |
937 | 281 | non_space = row->fixed_pitch; |
938 | 281 | } |
939 | 14.6k | POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr; |
940 | 14.6k | if (textord_all_prop || (pb != nullptr && !pb->IsText())) { |
941 | | // Set the decision to definitely proportional. |
942 | 0 | pitch_sd = textord_words_def_prop * row->fixed_pitch; |
943 | 0 | row->pitch_decision = PITCH_DEF_PROP; |
944 | 14.6k | } else { |
945 | 14.6k | pitch_sd = tune_row_pitch(row, &row->projection, row->projection_left, row->projection_right, |
946 | 14.6k | (row->fixed_pitch + non_space * 3) / 4, row->fixed_pitch, sp_sd, |
947 | 14.6k | mid_cuts, &row->char_cells, block_index == textord_debug_block); |
948 | 14.6k | if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch && |
949 | 14.6k | ((pitsync_linear_version & 3) < 3 || |
950 | 2.83k | ((pitsync_linear_version & 3) >= 3 && |
951 | 2.83k | (row->used_dm_model || sp_sd > 20 || (pitch_sd == 0 && sp_sd > 10))))) { |
952 | 2.83k | if (pitch_sd < textord_words_def_fixed * row->fixed_pitch && !row->all_caps && |
953 | 2.83k | ((pitsync_linear_version & 3) < 3 || sp_sd > 20)) { |
954 | 1.77k | row->pitch_decision = PITCH_DEF_FIXED; |
955 | 1.77k | } else { |
956 | 1.06k | row->pitch_decision = PITCH_MAYBE_FIXED; |
957 | 1.06k | } |
958 | 11.8k | } else if ((pitsync_linear_version & 3) < 3 || sp_sd > 20 || mid_cuts > 0 || |
959 | 11.8k | pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) { |
960 | 11.8k | if (pitch_sd < textord_words_def_prop * row->fixed_pitch) { |
961 | 3.95k | row->pitch_decision = PITCH_MAYBE_PROP; |
962 | 7.88k | } else { |
963 | 7.88k | row->pitch_decision = PITCH_DEF_PROP; |
964 | 7.88k | } |
965 | 11.8k | } else { |
966 | 0 | row->pitch_decision = PITCH_DUNNO; |
967 | 0 | } |
968 | 14.6k | } |
969 | | |
970 | 14.6k | if (textord_debug_pitch_metric) { |
971 | 0 | res_string = "??"; |
972 | 0 | switch (row->pitch_decision) { |
973 | 0 | case PITCH_DEF_PROP: |
974 | 0 | res_string = "DP"; |
975 | 0 | break; |
976 | 0 | case PITCH_MAYBE_PROP: |
977 | 0 | res_string = "MP"; |
978 | 0 | break; |
979 | 0 | case PITCH_DEF_FIXED: |
980 | 0 | res_string = "DF"; |
981 | 0 | break; |
982 | 0 | case PITCH_MAYBE_FIXED: |
983 | 0 | res_string = "MF"; |
984 | 0 | break; |
985 | 0 | default: |
986 | 0 | res_string = "??"; |
987 | 0 | } |
988 | 0 | tprintf(":sd/p=%g:occ=%g:init_res=%s\n", pitch_sd / row->fixed_pitch, sp_sd, res_string); |
989 | 0 | } |
990 | 14.6k | return true; |
991 | 14.6k | } |
992 | | |
993 | | /********************************************************************** |
994 | | * count_pitch_stats |
995 | | * |
996 | | * Count up the gap and pitch stats on the block to see if it is fixed pitch. |
997 | | * Blobs with gaps smaller than the lower threshold are assumed to be one. |
998 | | * The larger threshold is the word gap threshold. |
999 | | * The return value indicates whether there were any decent values to use. |
1000 | | **********************************************************************/ |
1001 | | |
1002 | | bool count_pitch_stats( // find lines |
1003 | | TO_ROW *row, // row to do |
1004 | | STATS *gap_stats, // blob gaps |
1005 | | STATS *pitch_stats, // centre-centre stats |
1006 | | float initial_pitch, // guess at pitch |
1007 | | float min_space, // estimate space size |
1008 | | bool ignore_outsize, // discard big objects |
1009 | | bool split_outsize, // split big objects |
1010 | | int32_t dm_gap // ignorable gaps |
1011 | 261k | ) { |
1012 | 261k | bool prev_valid; // not word broken |
1013 | 261k | BLOBNBOX *blob; // current blob |
1014 | | // blobs |
1015 | 261k | BLOBNBOX_IT blob_it = row->blob_list(); |
1016 | 261k | int32_t prev_right; // end of prev blob |
1017 | 261k | int32_t prev_centre; // centre of previous blob |
1018 | 261k | int32_t x_centre; // centre of this blob |
1019 | 261k | int32_t blob_width; // width of blob |
1020 | 261k | int32_t width_units; // no of widths in blob |
1021 | 261k | float width; // blob width |
1022 | 261k | TBOX blob_box; // bounding box |
1023 | 261k | TBOX joined_box; // of super blob |
1024 | | |
1025 | 261k | gap_stats->clear(); |
1026 | 261k | pitch_stats->clear(); |
1027 | 261k | if (blob_it.empty()) { |
1028 | 0 | return false; |
1029 | 0 | } |
1030 | 261k | prev_valid = false; |
1031 | 261k | prev_centre = 0; |
1032 | 261k | prev_right = 0; // stop compiler warning |
1033 | 261k | joined_box = blob_it.data()->bounding_box(); |
1034 | 5.19M | do { |
1035 | 5.19M | blob_it.forward(); |
1036 | 5.19M | blob = blob_it.data(); |
1037 | 5.19M | if (!blob->joined_to_prev()) { |
1038 | 2.97M | blob_box = blob->bounding_box(); |
1039 | 2.97M | if ((blob_box.left() - joined_box.right() < dm_gap && !blob_it.at_first()) || |
1040 | 2.97M | blob->cblob() == nullptr) { |
1041 | 1.46M | joined_box += blob_box; // merge blobs |
1042 | 1.50M | } else { |
1043 | 1.50M | blob_width = joined_box.width(); |
1044 | 1.50M | if (split_outsize) { |
1045 | 0 | width_units = |
1046 | 0 | static_cast<int32_t>(floor(static_cast<float>(blob_width) / initial_pitch + 0.5)); |
1047 | 0 | if (width_units < 1) { |
1048 | 0 | width_units = 1; |
1049 | 0 | } |
1050 | 0 | width_units--; |
1051 | 1.50M | } else if (ignore_outsize) { |
1052 | 1.50M | width = static_cast<float>(blob_width) / initial_pitch; |
1053 | 1.50M | width_units = |
1054 | 1.50M | width < 1 + words_default_fixed_limit && width > 1 - words_default_fixed_limit ? 0 |
1055 | 1.50M | : -1; |
1056 | 1.50M | } else { |
1057 | 0 | width_units = 0; // everything in |
1058 | 0 | } |
1059 | 1.50M | x_centre = static_cast<int32_t>(joined_box.left() + |
1060 | 1.50M | (blob_width - width_units * initial_pitch) / 2); |
1061 | 1.50M | if (prev_valid && width_units >= 0) { |
1062 | | // if (width_units>0) |
1063 | | // { |
1064 | | // tprintf("wu=%d, |
1065 | | // width=%d, |
1066 | | // xc=%d, adding |
1067 | | // %d\n", |
1068 | | // width_units,blob_width,x_centre,x_centre-prev_centre); |
1069 | | // } |
1070 | 134k | gap_stats->add(joined_box.left() - prev_right, 1); |
1071 | 134k | pitch_stats->add(x_centre - prev_centre, 1); |
1072 | 134k | } |
1073 | 1.50M | prev_centre = static_cast<int32_t>(x_centre + width_units * initial_pitch); |
1074 | 1.50M | prev_right = joined_box.right(); |
1075 | 1.50M | prev_valid = blob_box.left() - joined_box.right() < min_space; |
1076 | 1.50M | prev_valid = prev_valid && width_units >= 0; |
1077 | 1.50M | joined_box = blob_box; |
1078 | 1.50M | } |
1079 | 2.97M | } |
1080 | 5.19M | } while (!blob_it.at_first()); |
1081 | 261k | return gap_stats->get_total() >= 3; |
1082 | 261k | } |
1083 | | |
1084 | | /********************************************************************** |
1085 | | * tune_row_pitch |
1086 | | * |
1087 | | * Use a dp algorithm to fit the character cells and return the sd of |
1088 | | * the cell size over the row. |
1089 | | **********************************************************************/ |
1090 | | |
1091 | | float tune_row_pitch( // find fp cells |
1092 | | TO_ROW *row, // row to do |
1093 | | STATS *projection, // vertical projection |
1094 | | int16_t projection_left, // edge of projection |
1095 | | int16_t projection_right, // edge of projection |
1096 | | float space_size, // size of blank |
1097 | | float &initial_pitch, // guess at pitch |
1098 | | float &best_sp_sd, // space sd |
1099 | | int16_t &best_mid_cuts, // no of cheap cuts |
1100 | | ICOORDELT_LIST *best_cells, // row cells |
1101 | | bool testing_on // individual words |
1102 | 17.3k | ) { |
1103 | 17.3k | int pitch_delta; // offset pitch |
1104 | 17.3k | int16_t mid_cuts; // cheap cuts |
1105 | 17.3k | float pitch_sd; // current sd |
1106 | 17.3k | float best_sd; // best result |
1107 | 17.3k | float best_pitch; // pitch for best result |
1108 | 17.3k | float initial_sd; // starting error |
1109 | 17.3k | float sp_sd; // space sd |
1110 | 17.3k | ICOORDELT_LIST test_cells; // row cells |
1111 | 17.3k | ICOORDELT_IT best_it; // start of best list |
1112 | | |
1113 | 17.3k | if (textord_fast_pitch_test) { |
1114 | 0 | return tune_row_pitch2(row, projection, projection_left, projection_right, space_size, |
1115 | 0 | initial_pitch, best_sp_sd, |
1116 | | // space sd |
1117 | 0 | best_mid_cuts, best_cells, testing_on); |
1118 | 0 | } |
1119 | 17.3k | if (textord_disable_pitch_test) { |
1120 | 0 | best_sp_sd = initial_pitch; |
1121 | 0 | return initial_pitch; |
1122 | 0 | } |
1123 | 17.3k | initial_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size, |
1124 | 17.3k | initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on); |
1125 | 17.3k | best_sd = initial_sd; |
1126 | 17.3k | best_pitch = initial_pitch; |
1127 | 17.3k | if (testing_on) { |
1128 | 0 | tprintf("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd); |
1129 | 0 | } |
1130 | 28.1k | for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) { |
1131 | 23.9k | pitch_sd = |
1132 | 23.9k | compute_pitch_sd(row, projection, projection_left, projection_right, space_size, |
1133 | 23.9k | initial_pitch + pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on); |
1134 | 23.9k | if (testing_on) { |
1135 | 0 | tprintf("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta, pitch_sd); |
1136 | 0 | } |
1137 | 23.9k | if (pitch_sd < best_sd) { |
1138 | 8.31k | best_sd = pitch_sd; |
1139 | 8.31k | best_mid_cuts = mid_cuts; |
1140 | 8.31k | best_sp_sd = sp_sd; |
1141 | 8.31k | best_pitch = initial_pitch + pitch_delta; |
1142 | 8.31k | best_cells->clear(); |
1143 | 8.31k | best_it.set_to_list(best_cells); |
1144 | 8.31k | best_it.add_list_after(&test_cells); |
1145 | 15.6k | } else { |
1146 | 15.6k | test_cells.clear(); |
1147 | 15.6k | } |
1148 | 23.9k | if (pitch_sd > initial_sd) { |
1149 | 13.1k | break; // getting worse |
1150 | 13.1k | } |
1151 | 23.9k | } |
1152 | 28.9k | for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) { |
1153 | 24.2k | pitch_sd = |
1154 | 24.2k | compute_pitch_sd(row, projection, projection_left, projection_right, space_size, |
1155 | 24.2k | initial_pitch - pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on); |
1156 | 24.2k | if (testing_on) { |
1157 | 0 | tprintf("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta, pitch_sd); |
1158 | 0 | } |
1159 | 24.2k | if (pitch_sd < best_sd) { |
1160 | 5.68k | best_sd = pitch_sd; |
1161 | 5.68k | best_mid_cuts = mid_cuts; |
1162 | 5.68k | best_sp_sd = sp_sd; |
1163 | 5.68k | best_pitch = initial_pitch - pitch_delta; |
1164 | 5.68k | best_cells->clear(); |
1165 | 5.68k | best_it.set_to_list(best_cells); |
1166 | 5.68k | best_it.add_list_after(&test_cells); |
1167 | 18.6k | } else { |
1168 | 18.6k | test_cells.clear(); |
1169 | 18.6k | } |
1170 | 24.2k | if (pitch_sd > initial_sd) { |
1171 | 12.6k | break; |
1172 | 12.6k | } |
1173 | 24.2k | } |
1174 | 17.3k | initial_pitch = best_pitch; |
1175 | | |
1176 | 17.3k | if (textord_debug_pitch_metric) { |
1177 | 0 | print_pitch_sd(row, projection, projection_left, projection_right, space_size, best_pitch); |
1178 | 0 | } |
1179 | | |
1180 | 17.3k | return best_sd; |
1181 | 17.3k | } |
1182 | | |
1183 | | /********************************************************************** |
1184 | | * tune_row_pitch |
1185 | | * |
1186 | | * Use a dp algorithm to fit the character cells and return the sd of |
1187 | | * the cell size over the row. |
1188 | | **********************************************************************/ |
1189 | | |
1190 | | float tune_row_pitch2( // find fp cells |
1191 | | TO_ROW *row, // row to do |
1192 | | STATS *projection, // vertical projection |
1193 | | int16_t projection_left, // edge of projection |
1194 | | int16_t projection_right, // edge of projection |
1195 | | float space_size, // size of blank |
1196 | | float &initial_pitch, // guess at pitch |
1197 | | float &best_sp_sd, // space sd |
1198 | | int16_t &best_mid_cuts, // no of cheap cuts |
1199 | | ICOORDELT_LIST *best_cells, // row cells |
1200 | | bool testing_on // individual words |
1201 | 0 | ) { |
1202 | 0 | int pitch_delta; // offset pitch |
1203 | 0 | int16_t pixel; // pixel coord |
1204 | 0 | int16_t best_pixel; // pixel coord |
1205 | 0 | int16_t best_delta; // best pitch |
1206 | 0 | int16_t best_pitch; // best pitch |
1207 | 0 | int16_t start; // of good range |
1208 | 0 | int16_t end; // of good range |
1209 | 0 | int32_t best_count; // lowest sum |
1210 | 0 | float best_sd; // best result |
1211 | |
|
1212 | 0 | best_sp_sd = initial_pitch; |
1213 | |
|
1214 | 0 | best_pitch = static_cast<int>(initial_pitch); |
1215 | 0 | if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) { |
1216 | 0 | return initial_pitch; |
1217 | 0 | } |
1218 | 0 | std::unique_ptr<STATS[]> sum_proj(new STATS[textord_pitch_range * 2 + 1]); // summed projection |
1219 | |
|
1220 | 0 | for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) { |
1221 | 0 | sum_proj[textord_pitch_range + pitch_delta].set_range(0, best_pitch + pitch_delta); |
1222 | 0 | } |
1223 | 0 | for (pixel = projection_left; pixel <= projection_right; pixel++) { |
1224 | 0 | for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) { |
1225 | 0 | sum_proj[textord_pitch_range + pitch_delta].add( |
1226 | 0 | (pixel - projection_left) % (best_pitch + pitch_delta), projection->pile_count(pixel)); |
1227 | 0 | } |
1228 | 0 | } |
1229 | 0 | best_count = sum_proj[textord_pitch_range].pile_count(0); |
1230 | 0 | best_delta = 0; |
1231 | 0 | best_pixel = 0; |
1232 | 0 | for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) { |
1233 | 0 | for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) { |
1234 | 0 | if (sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel) < best_count) { |
1235 | 0 | best_count = sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel); |
1236 | 0 | best_delta = pitch_delta; |
1237 | 0 | best_pixel = pixel; |
1238 | 0 | } |
1239 | 0 | } |
1240 | 0 | } |
1241 | 0 | if (testing_on) { |
1242 | 0 | tprintf("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n", initial_pitch, best_delta, |
1243 | 0 | best_count); |
1244 | 0 | } |
1245 | 0 | best_pitch += best_delta; |
1246 | 0 | initial_pitch = best_pitch; |
1247 | 0 | best_count++; |
1248 | 0 | best_count += best_count; |
1249 | 0 | for (start = best_pixel - 2; |
1250 | 0 | start > best_pixel - best_pitch && |
1251 | 0 | sum_proj[textord_pitch_range + best_delta].pile_count(start % best_pitch) <= best_count; |
1252 | 0 | start--) { |
1253 | 0 | ; |
1254 | 0 | } |
1255 | 0 | for (end = best_pixel + 2; |
1256 | 0 | end < best_pixel + best_pitch && |
1257 | 0 | sum_proj[textord_pitch_range + best_delta].pile_count(end % best_pitch) <= best_count; |
1258 | 0 | end++) { |
1259 | 0 | ; |
1260 | 0 | } |
1261 | |
|
1262 | 0 | best_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size, |
1263 | 0 | initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on, |
1264 | 0 | start, end); |
1265 | 0 | if (testing_on) { |
1266 | 0 | tprintf("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch, best_sd); |
1267 | 0 | } |
1268 | |
|
1269 | 0 | if (textord_debug_pitch_metric) { |
1270 | 0 | print_pitch_sd(row, projection, projection_left, projection_right, space_size, initial_pitch); |
1271 | 0 | } |
1272 | |
|
1273 | 0 | return best_sd; |
1274 | 0 | } |
1275 | | |
1276 | | /********************************************************************** |
1277 | | * compute_pitch_sd |
1278 | | * |
1279 | | * Use a dp algorithm to fit the character cells and return the sd of |
1280 | | * the cell size over the row. |
1281 | | **********************************************************************/ |
1282 | | |
1283 | | float compute_pitch_sd( // find fp cells |
1284 | | TO_ROW *row, // row to do |
1285 | | STATS *projection, // vertical projection |
1286 | | int16_t projection_left, // edge |
1287 | | int16_t projection_right, // edge |
1288 | | float space_size, // size of blank |
1289 | | float initial_pitch, // guess at pitch |
1290 | | float &sp_sd, // space sd |
1291 | | int16_t &mid_cuts, // no of free cuts |
1292 | | ICOORDELT_LIST *row_cells, // list of chop pts |
1293 | | bool testing_on, // individual words |
1294 | | int16_t start, // start of good range |
1295 | | int16_t end // end of good range |
1296 | 65.5k | ) { |
1297 | 65.5k | int16_t occupation; // no of cells in word. |
1298 | | // blobs |
1299 | 65.5k | BLOBNBOX_IT blob_it = row->blob_list(); |
1300 | 65.5k | BLOBNBOX_IT start_it; // start of word |
1301 | 65.5k | BLOBNBOX_IT plot_it; // for plotting |
1302 | 65.5k | int16_t blob_count; // no of blobs |
1303 | 65.5k | TBOX blob_box; // bounding box |
1304 | 65.5k | TBOX prev_box; // of super blob |
1305 | 65.5k | int32_t prev_right; // of word sync |
1306 | 65.5k | int scale_factor; // on scores for big words |
1307 | 65.5k | int32_t sp_count; // spaces |
1308 | 65.5k | FPSEGPT_LIST seg_list; // char cells |
1309 | 65.5k | FPSEGPT_IT seg_it; // iterator |
1310 | 65.5k | int16_t segpos; // position of segment |
1311 | 65.5k | int16_t cellpos; // previous cell boundary |
1312 | | // iterator |
1313 | 65.5k | ICOORDELT_IT cell_it = row_cells; |
1314 | 65.5k | ICOORDELT *cell; // new cell |
1315 | 65.5k | double sqsum; // sum of squares |
1316 | 65.5k | double spsum; // of spaces |
1317 | 65.5k | double sp_var; // space error |
1318 | 65.5k | double word_sync; // result for word |
1319 | 65.5k | int32_t total_count; // total blobs |
1320 | | |
1321 | 65.5k | if ((pitsync_linear_version & 3) > 1) { |
1322 | 65.5k | word_sync = compute_pitch_sd2(row, projection, projection_left, projection_right, initial_pitch, |
1323 | 65.5k | occupation, mid_cuts, row_cells, testing_on, start, end); |
1324 | 65.5k | sp_sd = occupation; |
1325 | 65.5k | return word_sync; |
1326 | 65.5k | } |
1327 | 0 | mid_cuts = 0; |
1328 | 0 | cellpos = 0; |
1329 | 0 | total_count = 0; |
1330 | 0 | sqsum = 0; |
1331 | 0 | sp_count = 0; |
1332 | 0 | spsum = 0; |
1333 | 0 | prev_right = -1; |
1334 | 0 | if (blob_it.empty()) { |
1335 | 0 | return space_size * 10; |
1336 | 0 | } |
1337 | | #ifndef GRAPHICS_DISABLED |
1338 | | if (testing_on && to_win != nullptr) { |
1339 | | blob_box = blob_it.data()->bounding_box(); |
1340 | | projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL); |
1341 | | } |
1342 | | #endif |
1343 | 0 | start_it = blob_it; |
1344 | 0 | blob_count = 0; |
1345 | 0 | blob_box = box_next(&blob_it); // first blob |
1346 | 0 | blob_it.mark_cycle_pt(); |
1347 | 0 | do { |
1348 | 0 | for (; blob_count > 0; blob_count--) { |
1349 | 0 | box_next(&start_it); |
1350 | 0 | } |
1351 | 0 | do { |
1352 | 0 | prev_box = blob_box; |
1353 | 0 | blob_count++; |
1354 | 0 | blob_box = box_next(&blob_it); |
1355 | 0 | } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size); |
1356 | 0 | plot_it = start_it; |
1357 | 0 | if (pitsync_linear_version & 3) { |
1358 | 0 | word_sync = check_pitch_sync2(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, |
1359 | 0 | projection, projection_left, projection_right, |
1360 | 0 | row->xheight * textord_projection_scale, occupation, &seg_list, |
1361 | 0 | start, end); |
1362 | 0 | } else { |
1363 | 0 | word_sync = check_pitch_sync(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, |
1364 | 0 | projection, &seg_list); |
1365 | 0 | } |
1366 | 0 | if (testing_on) { |
1367 | 0 | tprintf("Word ending at (%d,%d), len=%d, sync rating=%g, ", prev_box.right(), prev_box.top(), |
1368 | 0 | seg_list.length() - 1, word_sync); |
1369 | 0 | seg_it.set_to_list(&seg_list); |
1370 | 0 | for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { |
1371 | 0 | if (seg_it.data()->faked) { |
1372 | 0 | tprintf("(F)"); |
1373 | 0 | } |
1374 | 0 | tprintf("%d, ", seg_it.data()->position()); |
1375 | | // tprintf("C=%g, s=%g, sq=%g\n", |
1376 | | // seg_it.data()->cost_function(), |
1377 | | // seg_it.data()->sum(), |
1378 | | // seg_it.data()->squares()); |
1379 | 0 | } |
1380 | 0 | tprintf("\n"); |
1381 | 0 | } |
1382 | | #ifndef GRAPHICS_DISABLED |
1383 | | if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) { |
1384 | | plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); |
1385 | | } |
1386 | | #endif |
1387 | 0 | seg_it.set_to_list(&seg_list); |
1388 | 0 | if (prev_right >= 0) { |
1389 | 0 | sp_var = seg_it.data()->position() - prev_right; |
1390 | 0 | sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch; |
1391 | 0 | sp_var *= sp_var; |
1392 | 0 | spsum += sp_var; |
1393 | 0 | sp_count++; |
1394 | 0 | } |
1395 | 0 | for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { |
1396 | 0 | segpos = seg_it.data()->position(); |
1397 | 0 | if (cell_it.empty() || segpos > cellpos + initial_pitch / 2) { |
1398 | | // big gap |
1399 | 0 | while (!cell_it.empty() && segpos > cellpos + initial_pitch * 3 / 2) { |
1400 | 0 | cell = new ICOORDELT(cellpos + static_cast<int16_t>(initial_pitch), 0); |
1401 | 0 | cell_it.add_after_then_move(cell); |
1402 | 0 | cellpos += static_cast<int16_t>(initial_pitch); |
1403 | 0 | } |
1404 | | // make new one |
1405 | 0 | cell = new ICOORDELT(segpos, 0); |
1406 | 0 | cell_it.add_after_then_move(cell); |
1407 | 0 | cellpos = segpos; |
1408 | 0 | } else if (segpos > cellpos - initial_pitch / 2) { |
1409 | 0 | cell = cell_it.data(); |
1410 | | // average positions |
1411 | 0 | cell->set_x((cellpos + segpos) / 2); |
1412 | 0 | cellpos = cell->x(); |
1413 | 0 | } |
1414 | 0 | } |
1415 | 0 | seg_it.move_to_last(); |
1416 | 0 | prev_right = seg_it.data()->position(); |
1417 | 0 | if (textord_pitch_scalebigwords) { |
1418 | 0 | scale_factor = (seg_list.length() - 2) / 2; |
1419 | 0 | if (scale_factor < 1) { |
1420 | 0 | scale_factor = 1; |
1421 | 0 | } |
1422 | 0 | } else { |
1423 | 0 | scale_factor = 1; |
1424 | 0 | } |
1425 | 0 | sqsum += word_sync * scale_factor; |
1426 | 0 | total_count += (seg_list.length() - 1) * scale_factor; |
1427 | 0 | seg_list.clear(); |
1428 | 0 | } while (!blob_it.cycled_list()); |
1429 | 0 | sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0; |
1430 | 0 | return total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10; |
1431 | 0 | } |
1432 | | |
1433 | | /********************************************************************** |
1434 | | * compute_pitch_sd2 |
1435 | | * |
1436 | | * Use a dp algorithm to fit the character cells and return the sd of |
1437 | | * the cell size over the row. |
1438 | | **********************************************************************/ |
1439 | | |
1440 | | float compute_pitch_sd2( // find fp cells |
1441 | | TO_ROW *row, // row to do |
1442 | | STATS *projection, // vertical projection |
1443 | | int16_t projection_left, // edge |
1444 | | int16_t projection_right, // edge |
1445 | | float initial_pitch, // guess at pitch |
1446 | | int16_t &occupation, // no of occupied cells |
1447 | | int16_t &mid_cuts, // no of free cuts |
1448 | | ICOORDELT_LIST *row_cells, // list of chop pts |
1449 | | bool testing_on, // individual words |
1450 | | int16_t start, // start of good range |
1451 | | int16_t end // end of good range |
1452 | 65.5k | ) { |
1453 | | // blobs |
1454 | 65.5k | BLOBNBOX_IT blob_it = row->blob_list(); |
1455 | 65.5k | BLOBNBOX_IT plot_it; |
1456 | 65.5k | int16_t blob_count; // no of blobs |
1457 | 65.5k | TBOX blob_box; // bounding box |
1458 | 65.5k | FPSEGPT_LIST seg_list; // char cells |
1459 | 65.5k | FPSEGPT_IT seg_it; // iterator |
1460 | 65.5k | int16_t segpos; // position of segment |
1461 | | // iterator |
1462 | 65.5k | ICOORDELT_IT cell_it = row_cells; |
1463 | 65.5k | ICOORDELT *cell; // new cell |
1464 | 65.5k | double word_sync; // result for word |
1465 | | |
1466 | 65.5k | mid_cuts = 0; |
1467 | 65.5k | if (blob_it.empty()) { |
1468 | 0 | occupation = 0; |
1469 | 0 | return initial_pitch * 10; |
1470 | 0 | } |
1471 | | #ifndef GRAPHICS_DISABLED |
1472 | | if (testing_on && to_win != nullptr) { |
1473 | | projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL); |
1474 | | } |
1475 | | #endif |
1476 | 65.5k | blob_count = 0; |
1477 | 65.5k | blob_it.mark_cycle_pt(); |
1478 | 909k | do { |
1479 | | // first blob |
1480 | 909k | blob_box = box_next(&blob_it); |
1481 | 909k | blob_count++; |
1482 | 909k | } while (!blob_it.cycled_list()); |
1483 | 65.5k | plot_it = blob_it; |
1484 | 65.5k | word_sync = check_pitch_sync2( |
1485 | 65.5k | &blob_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left, |
1486 | 65.5k | projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, start, end); |
1487 | 65.5k | if (testing_on) { |
1488 | 0 | tprintf("Row ending at (%d,%d), len=%d, sync rating=%g, ", blob_box.right(), blob_box.top(), |
1489 | 0 | seg_list.length() - 1, word_sync); |
1490 | 0 | seg_it.set_to_list(&seg_list); |
1491 | 0 | for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { |
1492 | 0 | if (seg_it.data()->faked) { |
1493 | 0 | tprintf("(F)"); |
1494 | 0 | } |
1495 | 0 | tprintf("%d, ", seg_it.data()->position()); |
1496 | | // tprintf("C=%g, s=%g, sq=%g\n", |
1497 | | // seg_it.data()->cost_function(), |
1498 | | // seg_it.data()->sum(), |
1499 | | // seg_it.data()->squares()); |
1500 | 0 | } |
1501 | 0 | tprintf("\n"); |
1502 | 0 | } |
1503 | | #ifndef GRAPHICS_DISABLED |
1504 | | if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) { |
1505 | | plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); |
1506 | | } |
1507 | | #endif |
1508 | 65.5k | seg_it.set_to_list(&seg_list); |
1509 | 866k | for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { |
1510 | 801k | segpos = seg_it.data()->position(); |
1511 | | // make new one |
1512 | 801k | cell = new ICOORDELT(segpos, 0); |
1513 | 801k | cell_it.add_after_then_move(cell); |
1514 | 801k | if (seg_it.at_last()) { |
1515 | 65.5k | mid_cuts = seg_it.data()->cheap_cuts(); |
1516 | 65.5k | } |
1517 | 801k | } |
1518 | 65.5k | seg_list.clear(); |
1519 | 65.5k | return occupation > 0 ? sqrt(word_sync / occupation) : initial_pitch * 10; |
1520 | 65.5k | } |
1521 | | |
1522 | | /********************************************************************** |
1523 | | * print_pitch_sd |
1524 | | * |
1525 | | * Use a dp algorithm to fit the character cells and return the sd of |
1526 | | * the cell size over the row. |
1527 | | **********************************************************************/ |
1528 | | |
1529 | | void print_pitch_sd( // find fp cells |
1530 | | TO_ROW *row, // row to do |
1531 | | STATS *projection, // vertical projection |
1532 | | int16_t projection_left, // edges //size of blank |
1533 | | int16_t projection_right, float space_size, |
1534 | | float initial_pitch // guess at pitch |
1535 | 0 | ) { |
1536 | 0 | const char *res2; // pitch result |
1537 | 0 | int16_t occupation; // used cells |
1538 | 0 | float sp_sd; // space sd |
1539 | | // blobs |
1540 | 0 | BLOBNBOX_IT blob_it = row->blob_list(); |
1541 | 0 | BLOBNBOX_IT start_it; // start of word |
1542 | 0 | BLOBNBOX_IT row_start; // start of row |
1543 | 0 | int16_t blob_count; // no of blobs |
1544 | 0 | int16_t total_blob_count; // total blobs in line |
1545 | 0 | TBOX blob_box; // bounding box |
1546 | 0 | TBOX prev_box; // of super blob |
1547 | 0 | int32_t prev_right; // of word sync |
1548 | 0 | int scale_factor; // on scores for big words |
1549 | 0 | int32_t sp_count; // spaces |
1550 | 0 | FPSEGPT_LIST seg_list; // char cells |
1551 | 0 | FPSEGPT_IT seg_it; // iterator |
1552 | 0 | double sqsum; // sum of squares |
1553 | 0 | double spsum; // of spaces |
1554 | 0 | double sp_var; // space error |
1555 | 0 | double word_sync; // result for word |
1556 | 0 | double total_count; // total cuts |
1557 | |
|
1558 | 0 | if (blob_it.empty()) { |
1559 | 0 | return; |
1560 | 0 | } |
1561 | 0 | row_start = blob_it; |
1562 | 0 | total_blob_count = 0; |
1563 | |
|
1564 | 0 | total_count = 0; |
1565 | 0 | sqsum = 0; |
1566 | 0 | sp_count = 0; |
1567 | 0 | spsum = 0; |
1568 | 0 | prev_right = -1; |
1569 | 0 | blob_it = row_start; |
1570 | 0 | start_it = blob_it; |
1571 | 0 | blob_count = 0; |
1572 | 0 | blob_box = box_next(&blob_it); // first blob |
1573 | 0 | blob_it.mark_cycle_pt(); |
1574 | 0 | do { |
1575 | 0 | for (; blob_count > 0; blob_count--) { |
1576 | 0 | box_next(&start_it); |
1577 | 0 | } |
1578 | 0 | do { |
1579 | 0 | prev_box = blob_box; |
1580 | 0 | blob_count++; |
1581 | 0 | blob_box = box_next(&blob_it); |
1582 | 0 | } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size); |
1583 | 0 | word_sync = check_pitch_sync2( |
1584 | 0 | &start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left, |
1585 | 0 | projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0); |
1586 | 0 | total_blob_count += blob_count; |
1587 | 0 | seg_it.set_to_list(&seg_list); |
1588 | 0 | if (prev_right >= 0) { |
1589 | 0 | sp_var = seg_it.data()->position() - prev_right; |
1590 | 0 | sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch; |
1591 | 0 | sp_var *= sp_var; |
1592 | 0 | spsum += sp_var; |
1593 | 0 | sp_count++; |
1594 | 0 | } |
1595 | 0 | seg_it.move_to_last(); |
1596 | 0 | prev_right = seg_it.data()->position(); |
1597 | 0 | if (textord_pitch_scalebigwords) { |
1598 | 0 | scale_factor = (seg_list.length() - 2) / 2; |
1599 | 0 | if (scale_factor < 1) { |
1600 | 0 | scale_factor = 1; |
1601 | 0 | } |
1602 | 0 | } else { |
1603 | 0 | scale_factor = 1; |
1604 | 0 | } |
1605 | 0 | sqsum += word_sync * scale_factor; |
1606 | 0 | total_count += (seg_list.length() - 1) * scale_factor; |
1607 | 0 | seg_list.clear(); |
1608 | 0 | } while (!blob_it.cycled_list()); |
1609 | 0 | sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0; |
1610 | 0 | word_sync = total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10; |
1611 | 0 | tprintf("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:", word_sync, word_sync / initial_pitch, sp_sd, |
1612 | 0 | word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P'); |
1613 | |
|
1614 | 0 | start_it = row_start; |
1615 | 0 | blob_it = row_start; |
1616 | 0 | word_sync = |
1617 | 0 | check_pitch_sync2(&blob_it, total_blob_count, static_cast<int16_t>(initial_pitch), 2, |
1618 | 0 | projection, projection_left, projection_right, |
1619 | 0 | row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0); |
1620 | 0 | if (occupation > 1) { |
1621 | 0 | word_sync /= occupation; |
1622 | 0 | } |
1623 | 0 | word_sync = sqrt(word_sync); |
1624 | |
|
1625 | | #ifndef GRAPHICS_DISABLED |
1626 | | if (textord_show_row_cuts && to_win != nullptr) { |
1627 | | plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list); |
1628 | | } |
1629 | | #endif |
1630 | 0 | seg_list.clear(); |
1631 | 0 | if (word_sync < textord_words_pitchsd_threshold * initial_pitch) { |
1632 | 0 | if (word_sync < textord_words_def_fixed * initial_pitch && !row->all_caps) { |
1633 | 0 | res2 = "DF"; |
1634 | 0 | } else { |
1635 | 0 | res2 = "MF"; |
1636 | 0 | } |
1637 | 0 | } else { |
1638 | 0 | res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP"; |
1639 | 0 | } |
1640 | 0 | tprintf( |
1641 | 0 | "row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, " |
1642 | 0 | "all_caps=%d\n", |
1643 | 0 | word_sync, word_sync / initial_pitch, |
1644 | 0 | word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P', occupation, res2, |
1645 | 0 | initial_pitch, row->fixed_pitch, row->all_caps); |
1646 | 0 | } |
1647 | | |
1648 | | /********************************************************************** |
1649 | | * find_repeated_chars |
1650 | | * |
1651 | | * Extract marked leader blobs and put them |
1652 | | * into words in advance of fixed pitch checking and word generation. |
1653 | | **********************************************************************/ |
1654 | | void find_repeated_chars(TO_BLOCK *block, // Block to search. |
1655 | 16.7k | bool testing_on) { // Debug mode. |
1656 | 16.7k | POLY_BLOCK *pb = block->block->pdblk.poly_block(); |
1657 | 16.7k | if (pb != nullptr && !pb->IsText()) { |
1658 | 0 | return; // Don't find repeated chars in non-text blocks. |
1659 | 0 | } |
1660 | | |
1661 | 16.7k | TO_ROW *row; |
1662 | 16.7k | BLOBNBOX_IT box_it; |
1663 | 16.7k | BLOBNBOX_IT search_it; // forward search |
1664 | 16.7k | WERD *word; // new word |
1665 | 16.7k | TBOX word_box; // for plotting |
1666 | 16.7k | int blobcount, repeated_set; |
1667 | | |
1668 | 16.7k | TO_ROW_IT row_it = block->get_rows(); |
1669 | 16.7k | if (row_it.empty()) { |
1670 | 0 | return; // empty block |
1671 | 0 | } |
1672 | 202k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1673 | 185k | row = row_it.data(); |
1674 | 185k | box_it.set_to_list(row->blob_list()); |
1675 | 185k | if (box_it.empty()) { |
1676 | 0 | continue; // no blobs in this row |
1677 | 0 | } |
1678 | 185k | if (!row->rep_chars_marked()) { |
1679 | 0 | mark_repeated_chars(row); |
1680 | 0 | } |
1681 | 185k | if (row->num_repeated_sets() == 0) { |
1682 | 185k | continue; // nothing to do for this row |
1683 | 185k | } |
1684 | | // new words |
1685 | 0 | WERD_IT word_it(&row->rep_words); |
1686 | 0 | do { |
1687 | 0 | if (box_it.data()->repeated_set() != 0 && !box_it.data()->joined_to_prev()) { |
1688 | 0 | blobcount = 1; |
1689 | 0 | repeated_set = box_it.data()->repeated_set(); |
1690 | 0 | search_it = box_it; |
1691 | 0 | search_it.forward(); |
1692 | 0 | while (!search_it.at_first() && search_it.data()->repeated_set() == repeated_set) { |
1693 | 0 | blobcount++; |
1694 | 0 | search_it.forward(); |
1695 | 0 | } |
1696 | | // After the call to make_real_word() all the blobs from this |
1697 | | // repeated set will be removed from the blob list. box_it will be |
1698 | | // set to point to the blob after the end of the extracted sequence. |
1699 | 0 | word = make_real_word(&box_it, blobcount, box_it.at_first(), 1); |
1700 | 0 | if (!box_it.empty() && box_it.data()->joined_to_prev()) { |
1701 | 0 | tprintf("Bad box joined to prev at"); |
1702 | 0 | box_it.data()->bounding_box().print(); |
1703 | 0 | tprintf("After repeated word:"); |
1704 | 0 | word->bounding_box().print(); |
1705 | 0 | } |
1706 | 0 | ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev()); |
1707 | 0 | word->set_flag(W_REP_CHAR, true); |
1708 | 0 | word->set_flag(W_DONT_CHOP, true); |
1709 | 0 | word_it.add_after_then_move(word); |
1710 | 0 | } else { |
1711 | 0 | box_it.forward(); |
1712 | 0 | } |
1713 | 0 | } while (!box_it.at_first()); |
1714 | 0 | } |
1715 | 16.7k | } |
1716 | | |
1717 | | /********************************************************************** |
1718 | | * plot_fp_word |
1719 | | * |
1720 | | * Plot a block of words as if fixed pitch. |
1721 | | **********************************************************************/ |
1722 | | |
1723 | | #ifndef GRAPHICS_DISABLED |
1724 | | void plot_fp_word( // draw block of words |
1725 | | TO_BLOCK *block, // block to draw |
1726 | | float pitch, // pitch to draw with |
1727 | | float nonspace // for space threshold |
1728 | | ) { |
1729 | | TO_ROW *row; // current row |
1730 | | TO_ROW_IT row_it = block->get_rows(); |
1731 | | |
1732 | | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1733 | | row = row_it.data(); |
1734 | | row->min_space = static_cast<int32_t>((pitch + nonspace) / 2); |
1735 | | row->max_nonspace = row->min_space; |
1736 | | row->space_threshold = row->min_space; |
1737 | | plot_word_decisions(to_win, static_cast<int16_t>(pitch), row); |
1738 | | } |
1739 | | } |
1740 | | #endif |
1741 | | |
1742 | | } // namespace tesseract |