/src/tesseract/src/textord/makerow.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: makerow.cpp (Formerly makerows.c) |
3 | | * Description: Code to arrange blobs into rows of text. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1992, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | // Include automatically generated configuration file if running autoconf. |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" |
22 | | #endif |
23 | | |
24 | | #include "makerow.h" |
25 | | |
26 | | #include "blkocc.h" |
27 | | #include "blobbox.h" |
28 | | #include "ccstruct.h" |
29 | | #include "detlinefit.h" |
30 | | #include "drawtord.h" |
31 | | #include "oldbasel.h" |
32 | | #include "sortflts.h" |
33 | | #include "statistc.h" |
34 | | #include "textord.h" |
35 | | #include "tordmain.h" |
36 | | #include "tovars.h" |
37 | | #include "tprintf.h" |
38 | | #include "underlin.h" |
39 | | |
40 | | #include <algorithm> |
41 | | #include <cmath> |
42 | | #include <vector> // for std::vector |
43 | | |
44 | | namespace tesseract { |
45 | | |
46 | | BOOL_VAR(textord_heavy_nr, false, "Vigorously remove noise"); |
47 | | BOOL_VAR(textord_show_initial_rows, false, "Display row accumulation"); |
48 | | BOOL_VAR(textord_show_parallel_rows, false, "Display page correlated rows"); |
49 | | BOOL_VAR(textord_show_expanded_rows, false, "Display rows after expanding"); |
50 | | BOOL_VAR(textord_show_final_rows, false, "Display rows after final fitting"); |
51 | | BOOL_VAR(textord_show_final_blobs, false, "Display blob bounds after pre-ass"); |
52 | | BOOL_VAR(textord_test_landscape, false, "Tests refer to land/port"); |
53 | | BOOL_VAR(textord_parallel_baselines, true, "Force parallel baselines"); |
54 | | BOOL_VAR(textord_straight_baselines, false, "Force straight baselines"); |
55 | | BOOL_VAR(textord_old_baselines, true, "Use old baseline algorithm"); |
56 | | BOOL_VAR(textord_old_xheight, false, "Use old xheight algorithm"); |
57 | | BOOL_VAR(textord_fix_xheight_bug, true, "Use spline baseline"); |
58 | | BOOL_VAR(textord_fix_makerow_bug, true, "Prevent multiple baselines"); |
59 | | BOOL_VAR(textord_debug_xheights, false, "Test xheight algorithms"); |
60 | | static BOOL_VAR(textord_biased_skewcalc, true, "Bias skew estimates with line length"); |
61 | | static BOOL_VAR(textord_interpolating_skew, true, "Interpolate across gaps"); |
62 | | static INT_VAR(textord_skewsmooth_offset, 4, "For smooth factor"); |
63 | | static INT_VAR(textord_skewsmooth_offset2, 1, "For smooth factor"); |
64 | | INT_VAR(textord_test_x, -INT32_MAX, "coord of test pt"); |
65 | | INT_VAR(textord_test_y, -INT32_MAX, "coord of test pt"); |
66 | | INT_VAR(textord_min_blobs_in_row, 4, "Min blobs before gradient counted"); |
67 | | INT_VAR(textord_spline_minblobs, 8, "Min blobs in each spline segment"); |
68 | | INT_VAR(textord_spline_medianwin, 6, "Size of window for spline segmentation"); |
69 | | static INT_VAR(textord_max_blob_overlaps, 4, "Max number of blobs a big blob can overlap"); |
70 | | INT_VAR(textord_min_xheight, 10, "Min credible pixel xheight"); |
71 | | double_VAR(textord_spline_shift_fraction, 0.02, "Fraction of line spacing for quad"); |
72 | | double_VAR(textord_skew_ile, 0.5, "Ile of gradients for page skew"); |
73 | | double_VAR(textord_skew_lag, 0.02, "Lag for skew on row accumulation"); |
74 | | double_VAR(textord_linespace_iqrlimit, 0.2, "Max iqr/median for linespace"); |
75 | | double_VAR(textord_width_limit, 8, "Max width of blobs to make rows"); |
76 | | double_VAR(textord_chop_width, 1.5, "Max width before chopping"); |
77 | | static double_VAR(textord_expansion_factor, 1.0, "Factor to expand rows by in expand_rows"); |
78 | | static double_VAR(textord_overlap_x, 0.375, "Fraction of linespace for good overlap"); |
79 | | double_VAR(textord_minxh, 0.25, "fraction of linesize for min xheight"); |
80 | | double_VAR(textord_min_linesize, 1.25, "* blob height for initial linesize"); |
81 | | double_VAR(textord_excess_blobsize, 1.3, "New row made if blob makes row this big"); |
82 | | double_VAR(textord_occupancy_threshold, 0.4, "Fraction of neighbourhood"); |
83 | | double_VAR(textord_underline_width, 2.0, "Multiple of line_size for underline"); |
84 | | double_VAR(textord_min_blob_height_fraction, 0.75, |
85 | | "Min blob height/top to include blob top into xheight stats"); |
86 | | double_VAR(textord_xheight_mode_fraction, 0.4, "Min pile height to make xheight"); |
87 | | double_VAR(textord_ascheight_mode_fraction, 0.08, "Min pile height to make ascheight"); |
88 | | static double_VAR(textord_descheight_mode_fraction, 0.08, "Min pile height to make descheight"); |
89 | | double_VAR(textord_ascx_ratio_min, 1.25, "Min cap/xheight"); |
90 | | double_VAR(textord_ascx_ratio_max, 1.8, "Max cap/xheight"); |
91 | | double_VAR(textord_descx_ratio_min, 0.25, "Min desc/xheight"); |
92 | | double_VAR(textord_descx_ratio_max, 0.6, "Max desc/xheight"); |
93 | | double_VAR(textord_xheight_error_margin, 0.1, "Accepted variation"); |
94 | | INT_VAR(textord_lms_line_trials, 12, "Number of linew fits to do"); |
95 | | BOOL_VAR(textord_new_initial_xheight, true, "Use test xheight mechanism"); |
96 | | BOOL_VAR(textord_debug_blob, false, "Print test blob information"); |
97 | | |
98 | 87.5k | #define MAX_HEIGHT_MODES 12 |
99 | | |
100 | | const int kMinLeaderCount = 5; |
101 | | |
102 | | /** |
103 | | * @name row_y_order |
104 | | * |
105 | | * Sort function to sort rows in y from page top. |
106 | | */ |
107 | | static int row_y_order( // sort function |
108 | | const void *item1, // items to compare |
109 | 740k | const void *item2) { |
110 | | // converted ptr |
111 | 740k | const TO_ROW *row1 = *reinterpret_cast<const TO_ROW *const *>(item1); |
112 | | // converted ptr |
113 | 740k | const TO_ROW *row2 = *reinterpret_cast<const TO_ROW *const *>(item2); |
114 | | |
115 | 740k | if (row1->parallel_c() > row2->parallel_c()) { |
116 | 735k | return -1; |
117 | 735k | } else if (row1->parallel_c() < row2->parallel_c()) { |
118 | 3.06k | return 1; |
119 | 3.06k | } else { |
120 | 1.92k | return 0; |
121 | 1.92k | } |
122 | 740k | } |
123 | | |
124 | | /** |
125 | | * @name row_spacing_order |
126 | | * |
127 | | * Qsort style function to compare 2 TO_ROWS based on their spacing value. |
128 | | */ |
129 | | static int row_spacing_order( // sort function |
130 | | const TO_ROW *row1, // items to compare |
131 | 819k | const TO_ROW *row2) { |
132 | 819k | return row1->spacing < row2->spacing; |
133 | 819k | } |
134 | | |
135 | | // Factored-out helper to build a single row from a list of blobs. |
136 | | // Returns the mean blob size. |
137 | 0 | static float MakeRowFromBlobs(float line_size, BLOBNBOX_IT *blob_it, TO_ROW_IT *row_it) { |
138 | 0 | blob_it->sort(blob_x_order); |
139 | 0 | blob_it->move_to_first(); |
140 | 0 | TO_ROW *row = nullptr; |
141 | 0 | float total_size = 0.0f; |
142 | 0 | int blob_count = 0; |
143 | | // Add all the blobs to a single TO_ROW. |
144 | 0 | for (; !blob_it->empty(); blob_it->forward()) { |
145 | 0 | BLOBNBOX *blob = blob_it->extract(); |
146 | 0 | int top = blob->bounding_box().top(); |
147 | 0 | int bottom = blob->bounding_box().bottom(); |
148 | 0 | if (row == nullptr) { |
149 | 0 | row = new TO_ROW(blob, top, bottom, line_size); |
150 | 0 | row_it->add_before_then_move(row); |
151 | 0 | } else { |
152 | 0 | row->add_blob(blob, top, bottom, line_size); |
153 | 0 | } |
154 | 0 | total_size += top - bottom; |
155 | 0 | ++blob_count; |
156 | 0 | } |
157 | 0 | return blob_count > 0 ? total_size / blob_count : total_size; |
158 | 0 | } |
159 | | |
160 | | // Helper to make a row using the children of a single blob. |
161 | | // Returns the mean size of the blobs created. |
162 | 0 | static float MakeRowFromSubBlobs(TO_BLOCK *block, C_BLOB *blob, TO_ROW_IT *row_it) { |
163 | | // The blobs made from the children will go in the small_blobs list. |
164 | 0 | BLOBNBOX_IT bb_it(&block->small_blobs); |
165 | 0 | C_OUTLINE_IT ol_it(blob->out_list()); |
166 | | // Get the children. |
167 | 0 | ol_it.set_to_list(ol_it.data()->child()); |
168 | 0 | if (ol_it.empty()) { |
169 | 0 | return 0.0f; |
170 | 0 | } |
171 | 0 | for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { |
172 | | // Deep copy the child outline and use that to make a blob. |
173 | 0 | blob = new C_BLOB(C_OUTLINE::deep_copy(ol_it.data())); |
174 | | // Correct direction as needed. |
175 | 0 | blob->CheckInverseFlagAndDirection(); |
176 | 0 | auto *bbox = new BLOBNBOX(blob); |
177 | 0 | bb_it.add_after_then_move(bbox); |
178 | 0 | } |
179 | | // Now we can make a row from the blobs. |
180 | 0 | return MakeRowFromBlobs(block->line_size, &bb_it, row_it); |
181 | 0 | } |
182 | | |
183 | | /** |
184 | | * @name make_single_row |
185 | | * |
186 | | * Arrange the blobs into a single row... well actually, if there is |
187 | | * only a single blob, it makes 2 rows, in case the top-level blob |
188 | | * is a container of the real blobs to recognize. |
189 | | */ |
190 | | float make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK *block, |
191 | 0 | TO_BLOCK_LIST *blocks) { |
192 | 0 | BLOBNBOX_IT blob_it = &block->blobs; |
193 | 0 | TO_ROW_IT row_it = block->get_rows(); |
194 | | |
195 | | // Include all the small blobs and large blobs. |
196 | 0 | blob_it.add_list_after(&block->small_blobs); |
197 | 0 | blob_it.add_list_after(&block->noise_blobs); |
198 | 0 | blob_it.add_list_after(&block->large_blobs); |
199 | 0 | if (block->blobs.singleton() && allow_sub_blobs) { |
200 | 0 | blob_it.move_to_first(); |
201 | 0 | float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it); |
202 | 0 | if (size > block->line_size) { |
203 | 0 | block->line_size = size; |
204 | 0 | } |
205 | 0 | } else if (block->blobs.empty()) { |
206 | | // Make a fake blob. |
207 | 0 | C_BLOB *blob = C_BLOB::FakeBlob(block->block->pdblk.bounding_box()); |
208 | | // The blobnbox owns the blob. |
209 | 0 | auto *bblob = new BLOBNBOX(blob); |
210 | 0 | blob_it.add_after_then_move(bblob); |
211 | 0 | } |
212 | 0 | MakeRowFromBlobs(block->line_size, &blob_it, &row_it); |
213 | | // Fit an LMS line to the rows. |
214 | 0 | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
215 | 0 | fit_lms_line(row_it.data()); |
216 | 0 | } |
217 | 0 | float gradient; |
218 | 0 | float fit_error; |
219 | | // Compute the skew based on the fitted line. |
220 | 0 | compute_page_skew(blocks, gradient, fit_error); |
221 | 0 | return gradient; |
222 | 0 | } |
223 | | |
224 | | /** |
225 | | * @name make_rows |
226 | | * |
227 | | * Arrange the blobs into rows. |
228 | | */ |
229 | 16.6k | float make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks) { |
230 | 16.6k | float port_m; // global skew |
231 | 16.6k | float port_err; // global noise |
232 | 16.6k | TO_BLOCK_IT block_it; // iterator |
233 | | |
234 | 16.6k | block_it.set_to_list(port_blocks); |
235 | 33.3k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
236 | 16.6k | make_initial_textrows(page_tr, block_it.data(), FCOORD(1.0f, 0.0f), !textord_test_landscape); |
237 | 16.6k | } |
238 | | // compute globally |
239 | 16.6k | compute_page_skew(port_blocks, port_m, port_err); |
240 | 16.6k | block_it.set_to_list(port_blocks); |
241 | 33.3k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
242 | 16.6k | cleanup_rows_making(page_tr, block_it.data(), port_m, FCOORD(1.0f, 0.0f), |
243 | 16.6k | block_it.data()->block->pdblk.bounding_box().left(), |
244 | 16.6k | !textord_test_landscape); |
245 | 16.6k | } |
246 | 16.6k | return port_m; // global skew |
247 | 16.6k | } |
248 | | |
249 | | /** |
250 | | * @name make_initial_textrows |
251 | | * |
252 | | * Arrange the good blobs into rows of text. |
253 | | */ |
254 | | void make_initial_textrows( // find lines |
255 | | ICOORD page_tr, |
256 | | TO_BLOCK *block, // block to do |
257 | | FCOORD rotation, // for drawing |
258 | | bool testing_on // correct orientation |
259 | 16.6k | ) { |
260 | 16.6k | TO_ROW_IT row_it = block->get_rows(); |
261 | | |
262 | | #ifndef GRAPHICS_DISABLED |
263 | | ScrollView::Color colour; // of row |
264 | | |
265 | | if (textord_show_initial_rows && testing_on) { |
266 | | if (to_win == nullptr) { |
267 | | create_to_win(page_tr); |
268 | | } |
269 | | } |
270 | | #endif |
271 | | // guess skew |
272 | 16.6k | assign_blobs_to_rows(block, nullptr, 0, true, true, textord_show_initial_rows && testing_on); |
273 | 16.6k | row_it.move_to_first(); |
274 | 248k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
275 | 231k | fit_lms_line(row_it.data()); |
276 | 231k | } |
277 | | #ifndef GRAPHICS_DISABLED |
278 | | if (textord_show_initial_rows && testing_on) { |
279 | | colour = ScrollView::RED; |
280 | | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
281 | | plot_to_row(row_it.data(), colour, rotation); |
282 | | colour = static_cast<ScrollView::Color>(colour + 1); |
283 | | if (colour > ScrollView::MAGENTA) { |
284 | | colour = ScrollView::RED; |
285 | | } |
286 | | } |
287 | | } |
288 | | #endif |
289 | 16.6k | } |
290 | | |
291 | | /** |
292 | | * @name fit_lms_line |
293 | | * |
294 | | * Fit an LMS line to a row. |
295 | | */ |
296 | 231k | void fit_lms_line(TO_ROW *row) { |
297 | 231k | float m, c; // fitted line |
298 | 231k | tesseract::DetLineFit lms; |
299 | 231k | BLOBNBOX_IT blob_it = row->blob_list(); |
300 | | |
301 | 1.61M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
302 | 1.38M | const TBOX &box = blob_it.data()->bounding_box(); |
303 | 1.38M | lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom())); |
304 | 1.38M | } |
305 | 231k | double error = lms.Fit(&m, &c); |
306 | 231k | row->set_line(m, c, error); |
307 | 231k | } |
308 | | |
309 | | /** |
310 | | * @name compute_page_skew |
311 | | * |
312 | | * Compute the skew over a full page by averaging the gradients over |
313 | | * all the lines. Get the error of the same row. |
314 | | */ |
315 | | void compute_page_skew( // get average gradient |
316 | | TO_BLOCK_LIST *blocks, // list of blocks |
317 | | float &page_m, // average gradient |
318 | | float &page_err // average error |
319 | 16.6k | ) { |
320 | 16.6k | int32_t row_count; // total rows |
321 | 16.6k | int32_t blob_count; // total_blobs |
322 | 16.6k | int32_t row_err; // integer error |
323 | 16.6k | int32_t row_index; // of total |
324 | 16.6k | TO_ROW *row; // current row |
325 | 16.6k | TO_BLOCK_IT block_it = blocks; // iterator |
326 | | |
327 | 16.6k | row_count = 0; |
328 | 16.6k | blob_count = 0; |
329 | 33.3k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
330 | 16.6k | POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block(); |
331 | 16.6k | if (pb != nullptr && !pb->IsText()) { |
332 | 0 | continue; // Pretend non-text blocks don't exist. |
333 | 0 | } |
334 | 16.6k | row_count += block_it.data()->get_rows()->length(); |
335 | | // count up rows |
336 | 16.6k | TO_ROW_IT row_it(block_it.data()->get_rows()); |
337 | 248k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
338 | 231k | blob_count += row_it.data()->blob_list()->length(); |
339 | 231k | } |
340 | 16.6k | } |
341 | 16.6k | if (row_count == 0) { |
342 | 552 | page_m = 0.0f; |
343 | 552 | page_err = 0.0f; |
344 | 552 | return; |
345 | 552 | } |
346 | | // of rows |
347 | 16.1k | std::vector<float> gradients(blob_count); |
348 | | // of rows |
349 | 16.1k | std::vector<float> errors(blob_count); |
350 | | |
351 | 16.1k | row_index = 0; |
352 | 32.2k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
353 | 16.1k | POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block(); |
354 | 16.1k | if (pb != nullptr && !pb->IsText()) { |
355 | 0 | continue; // Pretend non-text blocks don't exist. |
356 | 0 | } |
357 | 16.1k | TO_ROW_IT row_it(block_it.data()->get_rows()); |
358 | 247k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
359 | 231k | row = row_it.data(); |
360 | 231k | blob_count = row->blob_list()->length(); |
361 | 231k | row_err = static_cast<int32_t>(std::ceil(row->line_error())); |
362 | 231k | if (row_err <= 0) { |
363 | 172k | row_err = 1; |
364 | 172k | } |
365 | 231k | if (textord_biased_skewcalc) { |
366 | 231k | blob_count /= row_err; |
367 | 1.15M | for (blob_count /= row_err; blob_count > 0; blob_count--) { |
368 | 923k | gradients[row_index] = row->line_m(); |
369 | 923k | errors[row_index] = row->line_error(); |
370 | 923k | row_index++; |
371 | 923k | } |
372 | 231k | } else if (blob_count >= textord_min_blobs_in_row) { |
373 | | // get gradient |
374 | 0 | gradients[row_index] = row->line_m(); |
375 | 0 | errors[row_index] = row->line_error(); |
376 | 0 | row_index++; |
377 | 0 | } |
378 | 231k | } |
379 | 16.1k | } |
380 | 16.1k | if (row_index == 0) { |
381 | | // desperate |
382 | 250 | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
383 | 125 | POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block(); |
384 | 125 | if (pb != nullptr && !pb->IsText()) { |
385 | 0 | continue; // Pretend non-text blocks don't exist. |
386 | 0 | } |
387 | 125 | TO_ROW_IT row_it(block_it.data()->get_rows()); |
388 | 583 | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
389 | 458 | row = row_it.data(); |
390 | 458 | gradients[row_index] = row->line_m(); |
391 | 458 | errors[row_index] = row->line_error(); |
392 | 458 | row_index++; |
393 | 458 | } |
394 | 125 | } |
395 | 125 | } |
396 | 16.1k | row_count = row_index; |
397 | 16.1k | row_index = static_cast<int32_t>(row_count * textord_skew_ile); |
398 | 16.1k | gradients.resize(row_count); |
399 | 16.1k | std::nth_element(gradients.begin(), gradients.begin() + row_index, gradients.end()); |
400 | 16.1k | page_m = gradients[row_index]; |
401 | 16.1k | row_index = static_cast<int32_t>(row_count * textord_skew_ile); |
402 | 16.1k | errors.resize(row_count); |
403 | 16.1k | std::nth_element(errors.begin(), errors.begin() + row_index, errors.end()); |
404 | 16.1k | page_err = errors[row_index]; |
405 | 16.1k | } |
406 | | |
407 | | const double kNoiseSize = 0.5; // Fraction of xheight. |
408 | | const int kMinSize = 8; // Min pixels to be xheight. |
409 | | |
410 | | /** |
411 | | * Return true if the dot looks like it is part of the i. |
412 | | * Doesn't work for any other diacritical. |
413 | | */ |
414 | 0 | static bool dot_of_i(BLOBNBOX *dot, BLOBNBOX *i, TO_ROW *row) { |
415 | 0 | const TBOX &ibox = i->bounding_box(); |
416 | 0 | const TBOX &dotbox = dot->bounding_box(); |
417 | | |
418 | | // Must overlap horizontally by enough and be high enough. |
419 | 0 | int overlap = std::min(dotbox.right(), ibox.right()) - std::max(dotbox.left(), ibox.left()); |
420 | 0 | if (ibox.height() <= 2 * dotbox.height() || |
421 | 0 | (overlap * 2 < ibox.width() && overlap < dotbox.width())) { |
422 | 0 | return false; |
423 | 0 | } |
424 | | |
425 | | // If the i is tall and thin then it is good. |
426 | 0 | if (ibox.height() > ibox.width() * 2) { |
427 | 0 | return true; // The i or ! must be tall and thin. |
428 | 0 | } |
429 | | |
430 | | // It might still be tall and thin, but it might be joined to something. |
431 | | // So search the outline for a piece of large height close to the edges |
432 | | // of the dot. |
433 | 0 | const double kHeightFraction = 0.6; |
434 | 0 | double target_height = std::min(dotbox.bottom(), ibox.top()); |
435 | 0 | target_height -= row->line_m() * dotbox.left() + row->line_c(); |
436 | 0 | target_height *= kHeightFraction; |
437 | 0 | int left_min = dotbox.left() - dotbox.width(); |
438 | 0 | int middle = (dotbox.left() + dotbox.right()) / 2; |
439 | 0 | int right_max = dotbox.right() + dotbox.width(); |
440 | 0 | int left_miny = 0; |
441 | 0 | int left_maxy = 0; |
442 | 0 | int right_miny = 0; |
443 | 0 | int right_maxy = 0; |
444 | 0 | bool found_left = false; |
445 | 0 | bool found_right = false; |
446 | 0 | bool in_left = false; |
447 | 0 | bool in_right = false; |
448 | 0 | C_BLOB *blob = i->cblob(); |
449 | 0 | C_OUTLINE_IT o_it = blob->out_list(); |
450 | 0 | for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) { |
451 | 0 | C_OUTLINE *outline = o_it.data(); |
452 | 0 | int length = outline->pathlength(); |
453 | 0 | ICOORD pos = outline->start_pos(); |
454 | 0 | for (int step = 0; step < length; pos += outline->step(step++)) { |
455 | 0 | int x = pos.x(); |
456 | 0 | int y = pos.y(); |
457 | 0 | if (x >= left_min && x < middle && !found_left) { |
458 | | // We are in the left part so find min and max y. |
459 | 0 | if (in_left) { |
460 | 0 | if (y > left_maxy) { |
461 | 0 | left_maxy = y; |
462 | 0 | } |
463 | 0 | if (y < left_miny) { |
464 | 0 | left_miny = y; |
465 | 0 | } |
466 | 0 | } else { |
467 | 0 | left_maxy = left_miny = y; |
468 | 0 | in_left = true; |
469 | 0 | } |
470 | 0 | } else if (in_left) { |
471 | | // We just left the left so look for size. |
472 | 0 | if (left_maxy - left_miny > target_height) { |
473 | 0 | if (found_right) { |
474 | 0 | return true; |
475 | 0 | } |
476 | 0 | found_left = true; |
477 | 0 | } |
478 | 0 | in_left = false; |
479 | 0 | } |
480 | 0 | if (x <= right_max && x > middle && !found_right) { |
481 | | // We are in the right part so find min and max y. |
482 | 0 | if (in_right) { |
483 | 0 | if (y > right_maxy) { |
484 | 0 | right_maxy = y; |
485 | 0 | } |
486 | 0 | if (y < right_miny) { |
487 | 0 | right_miny = y; |
488 | 0 | } |
489 | 0 | } else { |
490 | 0 | right_maxy = right_miny = y; |
491 | 0 | in_right = true; |
492 | 0 | } |
493 | 0 | } else if (in_right) { |
494 | | // We just left the right so look for size. |
495 | 0 | if (right_maxy - right_miny > target_height) { |
496 | 0 | if (found_left) { |
497 | 0 | return true; |
498 | 0 | } |
499 | 0 | found_right = true; |
500 | 0 | } |
501 | 0 | in_right = false; |
502 | 0 | } |
503 | 0 | } |
504 | 0 | } |
505 | 0 | return false; |
506 | 0 | } |
507 | | |
508 | 0 | void vigorous_noise_removal(TO_BLOCK *block) { |
509 | 0 | TO_ROW_IT row_it = block->get_rows(); |
510 | 0 | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
511 | 0 | TO_ROW *row = row_it.data(); |
512 | 0 | BLOBNBOX_IT b_it = row->blob_list(); |
513 | | // Estimate the xheight on the row. |
514 | 0 | int max_height = 0; |
515 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
516 | 0 | BLOBNBOX *blob = b_it.data(); |
517 | 0 | if (blob->bounding_box().height() > max_height) { |
518 | 0 | max_height = blob->bounding_box().height(); |
519 | 0 | } |
520 | 0 | } |
521 | 0 | STATS hstats(0, max_height); |
522 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
523 | 0 | BLOBNBOX *blob = b_it.data(); |
524 | 0 | int height = blob->bounding_box().height(); |
525 | 0 | if (height >= kMinSize) { |
526 | 0 | hstats.add(blob->bounding_box().height(), 1); |
527 | 0 | } |
528 | 0 | } |
529 | 0 | float xheight = hstats.median(); |
530 | | // Delete small objects. |
531 | 0 | BLOBNBOX *prev = nullptr; |
532 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
533 | 0 | BLOBNBOX *blob = b_it.data(); |
534 | 0 | const TBOX &box = blob->bounding_box(); |
535 | 0 | if (box.height() < kNoiseSize * xheight) { |
536 | | // Small so delete unless it looks like an i dot. |
537 | 0 | if (prev != nullptr) { |
538 | 0 | if (dot_of_i(blob, prev, row)) { |
539 | 0 | continue; // Looks OK. |
540 | 0 | } |
541 | 0 | } |
542 | 0 | if (!b_it.at_last()) { |
543 | 0 | BLOBNBOX *next = b_it.data_relative(1); |
544 | 0 | if (dot_of_i(blob, next, row)) { |
545 | 0 | continue; // Looks OK. |
546 | 0 | } |
547 | 0 | } |
548 | | // It might be noise so get rid of it. |
549 | 0 | delete blob->remove_cblob(); |
550 | 0 | delete b_it.extract(); |
551 | 0 | } else { |
552 | 0 | prev = blob; |
553 | 0 | } |
554 | 0 | } |
555 | 0 | } |
556 | 0 | } |
557 | | |
558 | | /** |
559 | | * cleanup_rows_making |
560 | | * |
561 | | * Remove overlapping rows and fit all the blobs to what's left. |
562 | | */ |
563 | | void cleanup_rows_making( // find lines |
564 | | ICOORD page_tr, // top right |
565 | | TO_BLOCK *block, // block to do |
566 | | float gradient, // gradient to fit |
567 | | FCOORD rotation, // for drawing |
568 | | int32_t block_edge, // edge of block |
569 | | bool testing_on // correct orientation |
570 | 16.6k | ) { |
571 | | // iterators |
572 | 16.6k | BLOBNBOX_IT blob_it = &block->blobs; |
573 | 16.6k | TO_ROW_IT row_it = block->get_rows(); |
574 | | |
575 | | #ifndef GRAPHICS_DISABLED |
576 | | if (textord_show_parallel_rows && testing_on) { |
577 | | if (to_win == nullptr) { |
578 | | create_to_win(page_tr); |
579 | | } |
580 | | } |
581 | | #endif |
582 | | // get row coords |
583 | 16.6k | fit_parallel_rows(block, gradient, rotation, block_edge, |
584 | 16.6k | textord_show_parallel_rows && testing_on); |
585 | 16.6k | delete_non_dropout_rows(block, gradient, rotation, block_edge, |
586 | 16.6k | textord_show_parallel_rows && testing_on); |
587 | 16.6k | expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on); |
588 | 16.6k | blob_it.set_to_list(&block->blobs); |
589 | 16.6k | row_it.set_to_list(block->get_rows()); |
590 | 147k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
591 | 130k | blob_it.add_list_after(row_it.data()->blob_list()); |
592 | 130k | } |
593 | | // give blobs back |
594 | 16.6k | assign_blobs_to_rows(block, &gradient, 1, false, false, false); |
595 | | // now new rows must be genuine |
596 | 16.6k | blob_it.set_to_list(&block->blobs); |
597 | 16.6k | blob_it.add_list_after(&block->large_blobs); |
598 | 16.6k | assign_blobs_to_rows(block, &gradient, 2, true, true, false); |
599 | | // safe to use big ones now |
600 | 16.6k | blob_it.set_to_list(&block->blobs); |
601 | | // throw all blobs in |
602 | 16.6k | blob_it.add_list_after(&block->noise_blobs); |
603 | 16.6k | blob_it.add_list_after(&block->small_blobs); |
604 | 16.6k | assign_blobs_to_rows(block, &gradient, 3, false, false, false); |
605 | 16.6k | } |
606 | | |
607 | | /** |
608 | | * delete_non_dropout_rows |
609 | | * |
610 | | * Compute the linespacing and offset. |
611 | | */ |
612 | | void delete_non_dropout_rows( // find lines |
613 | | TO_BLOCK *block, // block to do |
614 | | float gradient, // global skew |
615 | | FCOORD rotation, // deskew vector |
616 | | int32_t block_edge, // left edge |
617 | | bool testing_on // correct orientation |
618 | 16.6k | ) { |
619 | 16.6k | TBOX block_box; // deskewed block |
620 | 16.6k | int32_t max_y; // in block |
621 | 16.6k | int32_t min_y; |
622 | 16.6k | int32_t line_index; // of scan line |
623 | 16.6k | int32_t line_count; // no of scan lines |
624 | 16.6k | int32_t distance; // to drop-out |
625 | 16.6k | int32_t xleft; // of block |
626 | 16.6k | int32_t ybottom; // of block |
627 | 16.6k | TO_ROW *row; // current row |
628 | 16.6k | TO_ROW_IT row_it = block->get_rows(); |
629 | 16.6k | BLOBNBOX_IT blob_it = &block->blobs; |
630 | | |
631 | 16.6k | if (row_it.empty()) { |
632 | 552 | return; // empty block |
633 | 552 | } |
634 | 16.1k | block_box = deskew_block_coords(block, gradient); |
635 | 16.1k | xleft = block->block->pdblk.bounding_box().left(); |
636 | 16.1k | ybottom = block->block->pdblk.bounding_box().bottom(); |
637 | 16.1k | min_y = block_box.bottom() - 1; |
638 | 16.1k | max_y = block_box.top() + 1; |
639 | 247k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
640 | 231k | line_index = static_cast<int32_t>(std::floor(row_it.data()->intercept())); |
641 | 231k | if (line_index <= min_y) { |
642 | 1.40k | min_y = line_index - 1; |
643 | 1.40k | } |
644 | 231k | if (line_index >= max_y) { |
645 | 0 | max_y = line_index + 1; |
646 | 0 | } |
647 | 231k | } |
648 | 16.1k | line_count = max_y - min_y + 1; |
649 | 16.1k | if (line_count <= 0) { |
650 | 0 | return; // empty block |
651 | 0 | } |
652 | | // change in occupation |
653 | 16.1k | std::vector<int32_t> deltas(line_count); |
654 | | // of pixel coords |
655 | 16.1k | std::vector<int32_t> occupation(line_count); |
656 | | |
657 | 16.1k | compute_line_occupation(block, gradient, min_y, max_y, &occupation[0], &deltas[0]); |
658 | 16.1k | compute_occupation_threshold( |
659 | 16.1k | static_cast<int32_t>(ceil(block->line_spacing * (tesseract::CCStruct::kDescenderFraction + |
660 | 16.1k | tesseract::CCStruct::kAscenderFraction))), |
661 | 16.1k | static_cast<int32_t>(ceil(block->line_spacing * (tesseract::CCStruct::kXHeightFraction + |
662 | 16.1k | tesseract::CCStruct::kAscenderFraction))), |
663 | 16.1k | max_y - min_y + 1, &occupation[0], &deltas[0]); |
664 | | #ifndef GRAPHICS_DISABLED |
665 | | if (testing_on) { |
666 | | draw_occupation(xleft, ybottom, min_y, max_y, &occupation[0], &deltas[0]); |
667 | | } |
668 | | #endif |
669 | 16.1k | compute_dropout_distances(&occupation[0], &deltas[0], line_count); |
670 | 247k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
671 | 231k | row = row_it.data(); |
672 | 231k | line_index = static_cast<int32_t>(std::floor(row->intercept())); |
673 | 231k | distance = deltas[line_index - min_y]; |
674 | 231k | if (find_best_dropout_row(row, distance, block->line_spacing / 2, line_index, &row_it, |
675 | 231k | testing_on)) { |
676 | | #ifndef GRAPHICS_DISABLED |
677 | | if (testing_on) { |
678 | | plot_parallel_row(row, gradient, block_edge, ScrollView::WHITE, rotation); |
679 | | } |
680 | | #endif |
681 | 95.6k | blob_it.add_list_after(row_it.data()->blob_list()); |
682 | 95.6k | delete row_it.extract(); // too far away |
683 | 95.6k | } |
684 | 231k | } |
685 | 152k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
686 | 135k | blob_it.add_list_after(row_it.data()->blob_list()); |
687 | 135k | } |
688 | 16.1k | } |
689 | | |
690 | | /** |
691 | | * @name find_best_dropout_row |
692 | | * |
693 | | * Delete this row if it has a neighbour with better dropout characteristics. |
694 | | * true is returned if the row should be deleted. |
695 | | */ |
696 | | bool find_best_dropout_row( // find neighbours |
697 | | TO_ROW *row, // row to test |
698 | | int32_t distance, // dropout dist |
699 | | float dist_limit, // threshold distance |
700 | | int32_t line_index, // index of row |
701 | | TO_ROW_IT *row_it, // current position |
702 | | bool testing_on // correct orientation |
703 | 231k | ) { |
704 | 231k | int32_t next_index; // of neighbouring row |
705 | 231k | int32_t row_offset; // from current row |
706 | 231k | int32_t abs_dist; // absolute distance |
707 | 231k | int8_t row_inc; // increment to row_index |
708 | 231k | TO_ROW *next_row; // nextious row |
709 | | |
710 | 231k | if (testing_on) { |
711 | 0 | tprintf("Row at %g(%g), dropout dist=%d,", row->intercept(), row->parallel_c(), distance); |
712 | 0 | } |
713 | 231k | if (distance < 0) { |
714 | 87.5k | row_inc = 1; |
715 | 87.5k | abs_dist = -distance; |
716 | 144k | } else { |
717 | 144k | row_inc = -1; |
718 | 144k | abs_dist = distance; |
719 | 144k | } |
720 | 231k | if (abs_dist > dist_limit) { |
721 | 62.6k | if (testing_on) { |
722 | 0 | tprintf(" too far - deleting\n"); |
723 | 0 | } |
724 | 62.6k | return true; |
725 | 62.6k | } |
726 | 168k | if ((distance < 0 && !row_it->at_last()) || (distance >= 0 && !row_it->at_first())) { |
727 | 157k | row_offset = row_inc; |
728 | 159k | do { |
729 | 159k | next_row = row_it->data_relative(row_offset); |
730 | 159k | next_index = static_cast<int32_t>(std::floor(next_row->intercept())); |
731 | 159k | if ((distance < 0 && next_index < line_index && |
732 | 159k | next_index > line_index + distance + distance) || |
733 | 159k | (distance >= 0 && next_index > line_index && |
734 | 139k | next_index < line_index + distance + distance)) { |
735 | 30.4k | if (testing_on) { |
736 | 0 | tprintf(" nearer neighbour (%d) at %g\n", line_index + distance - next_index, |
737 | 0 | next_row->intercept()); |
738 | 0 | } |
739 | 30.4k | return true; // other is nearer |
740 | 129k | } else if (next_index == line_index || next_index == line_index + distance + distance) { |
741 | 4.16k | if (row->believability() <= next_row->believability()) { |
742 | 2.51k | if (testing_on) { |
743 | 0 | tprintf(" equal but more believable at %g (%g/%g)\n", next_row->intercept(), |
744 | 0 | row->believability(), next_row->believability()); |
745 | 0 | } |
746 | 2.51k | return true; // other is more believable |
747 | 2.51k | } |
748 | 4.16k | } |
749 | 126k | row_offset += row_inc; |
750 | 126k | } while ((next_index == line_index || next_index == line_index + distance + distance) && |
751 | 126k | row_offset < row_it->length()); |
752 | 124k | if (testing_on) { |
753 | 0 | tprintf(" keeping\n"); |
754 | 0 | } |
755 | 124k | } |
756 | 135k | return false; |
757 | 168k | } |
758 | | |
759 | | /** |
760 | | * @name deskew_block_coords |
761 | | * |
762 | | * Compute the bounding box of all the blobs in the block |
763 | | * if they were deskewed without actually doing it. |
764 | | */ |
765 | | TBOX deskew_block_coords( // block box |
766 | | TO_BLOCK *block, // block to do |
767 | | float gradient // global skew |
768 | 16.1k | ) { |
769 | 16.1k | TBOX result; // block bounds |
770 | 16.1k | TBOX blob_box; // of block |
771 | 16.1k | FCOORD rotation; // deskew vector |
772 | 16.1k | float length; // of gradient vector |
773 | 16.1k | TO_ROW_IT row_it = block->get_rows(); |
774 | 16.1k | TO_ROW *row; // current row |
775 | 16.1k | BLOBNBOX *blob; // current blob |
776 | 16.1k | BLOBNBOX_IT blob_it; // iterator |
777 | | |
778 | 16.1k | length = std::sqrt(gradient * gradient + 1); |
779 | 16.1k | rotation = FCOORD(1 / length, -gradient / length); |
780 | 247k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
781 | 231k | row = row_it.data(); |
782 | 231k | blob_it.set_to_list(row->blob_list()); |
783 | 1.61M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
784 | 1.38M | blob = blob_it.data(); |
785 | 1.38M | blob_box = blob->bounding_box(); |
786 | 1.38M | blob_box.rotate(rotation); // de-skew it |
787 | 1.38M | result += blob_box; |
788 | 1.38M | } |
789 | 231k | } |
790 | 16.1k | return result; |
791 | 16.1k | } |
792 | | |
793 | | /** |
794 | | * @name compute_line_occupation |
795 | | * |
796 | | * Compute the pixel projection back on the y axis given the global |
797 | | * skew. Also compute the 1st derivative. |
798 | | */ |
799 | | void compute_line_occupation( // project blobs |
800 | | TO_BLOCK *block, // block to do |
801 | | float gradient, // global skew |
802 | | int32_t min_y, // min coord in block |
803 | | int32_t max_y, // in block |
804 | | int32_t *occupation, // output projection |
805 | | int32_t *deltas // derivative |
806 | 16.1k | ) { |
807 | 16.1k | int32_t line_count; // maxy-miny+1 |
808 | 16.1k | int32_t line_index; // of scan line |
809 | 16.1k | int index; // array index for daft compilers |
810 | 16.1k | TO_ROW *row; // current row |
811 | 16.1k | TO_ROW_IT row_it = block->get_rows(); |
812 | 16.1k | BLOBNBOX *blob; // current blob |
813 | 16.1k | BLOBNBOX_IT blob_it; // iterator |
814 | 16.1k | float length; // of skew vector |
815 | 16.1k | TBOX blob_box; // bounding box |
816 | 16.1k | FCOORD rotation; // inverse of skew |
817 | | |
818 | 16.1k | line_count = max_y - min_y + 1; |
819 | 16.1k | length = std::sqrt(gradient * gradient + 1); |
820 | 16.1k | rotation = FCOORD(1 / length, -gradient / length); |
821 | 2.53M | for (line_index = 0; line_index < line_count; line_index++) { |
822 | 2.51M | deltas[line_index] = 0; |
823 | 2.51M | } |
824 | 247k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
825 | 231k | row = row_it.data(); |
826 | 231k | blob_it.set_to_list(row->blob_list()); |
827 | 1.61M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
828 | 1.38M | blob = blob_it.data(); |
829 | 1.38M | blob_box = blob->bounding_box(); |
830 | 1.38M | blob_box.rotate(rotation); // de-skew it |
831 | 1.38M | int32_t width = blob_box.right() - blob_box.left(); |
832 | 1.38M | index = blob_box.bottom() - min_y; |
833 | 1.38M | ASSERT_HOST(index >= 0 && index < line_count); |
834 | | // count transitions |
835 | 1.38M | deltas[index] += width; |
836 | 1.38M | index = blob_box.top() - min_y; |
837 | 1.38M | ASSERT_HOST(index >= 0 && index < line_count); |
838 | 1.38M | deltas[index] -= width; |
839 | 1.38M | } |
840 | 231k | } |
841 | 16.1k | occupation[0] = deltas[0]; |
842 | 2.51M | for (line_index = 1; line_index < line_count; line_index++) { |
843 | 2.50M | occupation[line_index] = occupation[line_index - 1] + deltas[line_index]; |
844 | 2.50M | } |
845 | 16.1k | } |
846 | | |
847 | | /** |
848 | | * compute_occupation_threshold |
849 | | * |
850 | | * Compute thresholds for textline or not for the occupation array. |
851 | | */ |
852 | | void compute_occupation_threshold( // project blobs |
853 | | int32_t low_window, // below result point |
854 | | int32_t high_window, // above result point |
855 | | int32_t line_count, // array sizes |
856 | | int32_t *occupation, // input projection |
857 | | int32_t *thresholds // output thresholds |
858 | 16.1k | ) { |
859 | 16.1k | int32_t line_index; // of thresholds line |
860 | 16.1k | int32_t low_index; // in occupation |
861 | 16.1k | int32_t high_index; // in occupation |
862 | 16.1k | int32_t sum; // current average |
863 | 16.1k | int32_t divisor; // to get thresholds |
864 | 16.1k | int32_t min_index; // of min occ |
865 | 16.1k | int32_t min_occ; // min in locality |
866 | 16.1k | int32_t test_index; // for finding min |
867 | | |
868 | 16.1k | divisor = static_cast<int32_t>(ceil((low_window + high_window) / textord_occupancy_threshold)); |
869 | 16.1k | if (low_window + high_window < line_count) { |
870 | 240k | for (sum = 0, high_index = 0; high_index < low_window; high_index++) { |
871 | 226k | sum += occupation[high_index]; |
872 | 226k | } |
873 | 351k | for (low_index = 0; low_index < high_window; low_index++, high_index++) { |
874 | 337k | sum += occupation[high_index]; |
875 | 337k | } |
876 | 13.7k | min_occ = occupation[0]; |
877 | 13.7k | min_index = 0; |
878 | 563k | for (test_index = 1; test_index < high_index; test_index++) { |
879 | 550k | if (occupation[test_index] <= min_occ) { |
880 | 138k | min_occ = occupation[test_index]; |
881 | 138k | min_index = test_index; // find min in region |
882 | 138k | } |
883 | 550k | } |
884 | 240k | for (line_index = 0; line_index < low_window; line_index++) { |
885 | 226k | thresholds[line_index] = (sum - min_occ) / divisor + min_occ; |
886 | 226k | } |
887 | | // same out to end |
888 | 1.77M | for (low_index = 0; high_index < line_count; low_index++, high_index++) { |
889 | 1.75M | sum -= occupation[low_index]; |
890 | 1.75M | sum += occupation[high_index]; |
891 | 1.75M | if (occupation[high_index] <= min_occ) { |
892 | | // find min in region |
893 | 530k | min_occ = occupation[high_index]; |
894 | 530k | min_index = high_index; |
895 | 530k | } |
896 | | // lost min from region |
897 | 1.75M | if (min_index <= low_index) { |
898 | 16.8k | min_occ = occupation[low_index + 1]; |
899 | 16.8k | min_index = low_index + 1; |
900 | 702k | for (test_index = low_index + 2; test_index <= high_index; test_index++) { |
901 | 685k | if (occupation[test_index] <= min_occ) { |
902 | 87.1k | min_occ = occupation[test_index]; |
903 | | // find min in region |
904 | 87.1k | min_index = test_index; |
905 | 87.1k | } |
906 | 685k | } |
907 | 16.8k | } |
908 | 1.75M | thresholds[line_index++] = (sum - min_occ) / divisor + min_occ; |
909 | 1.75M | } |
910 | 13.7k | } else { |
911 | 2.40k | min_occ = occupation[0]; |
912 | 2.40k | min_index = 0; |
913 | 198k | for (sum = 0, low_index = 0; low_index < line_count; low_index++) { |
914 | 195k | if (occupation[low_index] < min_occ) { |
915 | 0 | min_occ = occupation[low_index]; |
916 | 0 | min_index = low_index; |
917 | 0 | } |
918 | 195k | sum += occupation[low_index]; |
919 | 195k | } |
920 | 2.40k | line_index = 0; |
921 | 2.40k | } |
922 | 549k | for (; line_index < line_count; line_index++) { |
923 | 533k | thresholds[line_index] = (sum - min_occ) / divisor + min_occ; |
924 | 533k | } |
925 | | // same out to end |
926 | 16.1k | } |
927 | | |
928 | | /** |
929 | | * @name compute_dropout_distances |
930 | | * |
931 | | * Compute the distance from each coordinate to the nearest dropout. |
932 | | */ |
933 | | void compute_dropout_distances( // project blobs |
934 | | int32_t *occupation, // input projection |
935 | | int32_t *thresholds, // output thresholds |
936 | | int32_t line_count // array sizes |
937 | 16.1k | ) { |
938 | 16.1k | int32_t line_index; // of thresholds line |
939 | 16.1k | int32_t distance; // from prev dropout |
940 | 16.1k | int32_t next_dist; // to next dropout |
941 | 16.1k | int32_t back_index; // for back filling |
942 | 16.1k | int32_t prev_threshold; // before overwrite |
943 | | |
944 | 16.1k | distance = -line_count; |
945 | 16.1k | line_index = 0; |
946 | 158k | do { |
947 | 2.51M | do { |
948 | 2.51M | distance--; |
949 | 2.51M | prev_threshold = thresholds[line_index]; |
950 | | // distance from prev |
951 | 2.51M | thresholds[line_index] = distance; |
952 | 2.51M | line_index++; |
953 | 2.51M | } while (line_index < line_count && (occupation[line_index] < thresholds[line_index] || |
954 | 2.50M | occupation[line_index - 1] >= prev_threshold)); |
955 | 158k | if (line_index < line_count) { |
956 | 142k | back_index = line_index - 1; |
957 | 142k | next_dist = 1; |
958 | 832k | while (next_dist < -distance && back_index >= 0) { |
959 | 690k | thresholds[back_index] = next_dist; |
960 | 690k | back_index--; |
961 | 690k | next_dist++; |
962 | 690k | distance++; |
963 | 690k | } |
964 | 142k | distance = 1; |
965 | 142k | } |
966 | 158k | } while (line_index < line_count); |
967 | 16.1k | } |
968 | | |
969 | | /** |
970 | | * @name expand_rows |
971 | | * |
972 | | * Expand each row to the least of its allowed size and touching its |
973 | | * neighbours. If the expansion would entirely swallow a neighbouring row |
974 | | * then do so. |
975 | | */ |
976 | | void expand_rows( // find lines |
977 | | ICOORD page_tr, // top right |
978 | | TO_BLOCK *block, // block to do |
979 | | float gradient, // gradient to fit |
980 | | FCOORD rotation, // for drawing |
981 | | int32_t block_edge, // edge of block |
982 | | bool testing_on // correct orientation |
983 | 16.6k | ) { |
984 | 16.6k | bool swallowed_row; // eaten a neighbour |
985 | 16.6k | float y_max, y_min; // new row limits |
986 | 16.6k | float y_bottom, y_top; // allowed limits |
987 | 16.6k | TO_ROW *test_row; // next row |
988 | 16.6k | TO_ROW *row; // current row |
989 | | // iterators |
990 | 16.6k | BLOBNBOX_IT blob_it = &block->blobs; |
991 | 16.6k | TO_ROW_IT row_it = block->get_rows(); |
992 | | |
993 | | #ifndef GRAPHICS_DISABLED |
994 | | if (textord_show_expanded_rows && testing_on) { |
995 | | if (to_win == nullptr) { |
996 | | create_to_win(page_tr); |
997 | | } |
998 | | } |
999 | | #endif |
1000 | | |
1001 | 16.6k | adjust_row_limits(block); // shift min,max. |
1002 | 16.6k | if (textord_new_initial_xheight) { |
1003 | 16.6k | if (block->get_rows()->empty()) { |
1004 | 5.24k | return; |
1005 | 5.24k | } |
1006 | 11.4k | compute_row_stats(block, textord_show_expanded_rows && testing_on); |
1007 | 11.4k | } |
1008 | 11.4k | assign_blobs_to_rows(block, &gradient, 4, true, false, false); |
1009 | | // get real membership |
1010 | 11.4k | if (block->get_rows()->empty()) { |
1011 | 18 | return; |
1012 | 18 | } |
1013 | 11.4k | fit_parallel_rows(block, gradient, rotation, block_edge, |
1014 | 11.4k | textord_show_expanded_rows && testing_on); |
1015 | 11.4k | if (!textord_new_initial_xheight) { |
1016 | 0 | compute_row_stats(block, textord_show_expanded_rows && testing_on); |
1017 | 0 | } |
1018 | 11.4k | row_it.move_to_last(); |
1019 | 130k | do { |
1020 | 130k | row = row_it.data(); |
1021 | 130k | y_max = row->max_y(); // get current limits |
1022 | 130k | y_min = row->min_y(); |
1023 | 130k | y_bottom = row->intercept() - block->line_size * textord_expansion_factor * |
1024 | 130k | tesseract::CCStruct::kDescenderFraction; |
1025 | 130k | y_top = row->intercept() + |
1026 | 130k | block->line_size * textord_expansion_factor * |
1027 | 130k | (tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction); |
1028 | 130k | if (y_min > y_bottom) { // expansion allowed |
1029 | 98.5k | if (textord_show_expanded_rows && testing_on) { |
1030 | 0 | tprintf("Expanding bottom of row at %f from %f to %f\n", row->intercept(), y_min, y_bottom); |
1031 | 0 | } |
1032 | | // expandable |
1033 | 98.5k | swallowed_row = true; |
1034 | 188k | while (swallowed_row && !row_it.at_last()) { |
1035 | 90.2k | swallowed_row = false; |
1036 | | // get next one |
1037 | 90.2k | test_row = row_it.data_relative(1); |
1038 | | // overlaps space |
1039 | 90.2k | if (test_row->max_y() > y_bottom) { |
1040 | 76.0k | if (test_row->min_y() > y_bottom) { |
1041 | 469 | if (textord_show_expanded_rows && testing_on) { |
1042 | 0 | tprintf("Eating row below at %f\n", test_row->intercept()); |
1043 | 0 | } |
1044 | 469 | row_it.forward(); |
1045 | | #ifndef GRAPHICS_DISABLED |
1046 | | if (textord_show_expanded_rows && testing_on) { |
1047 | | plot_parallel_row(test_row, gradient, block_edge, ScrollView::WHITE, rotation); |
1048 | | } |
1049 | | #endif |
1050 | 469 | blob_it.set_to_list(row->blob_list()); |
1051 | 469 | blob_it.add_list_after(test_row->blob_list()); |
1052 | | // swallow complete row |
1053 | 469 | delete row_it.extract(); |
1054 | 469 | row_it.backward(); |
1055 | 469 | swallowed_row = true; |
1056 | 75.5k | } else if (test_row->max_y() < y_min) { |
1057 | | // shorter limit |
1058 | 2.47k | y_bottom = test_row->max_y(); |
1059 | 2.47k | if (textord_show_expanded_rows && testing_on) { |
1060 | 0 | tprintf("Truncating limit to %f due to touching row at %f\n", y_bottom, |
1061 | 0 | test_row->intercept()); |
1062 | 0 | } |
1063 | 73.0k | } else { |
1064 | 73.0k | y_bottom = y_min; // can't expand it |
1065 | 73.0k | if (textord_show_expanded_rows && testing_on) { |
1066 | 0 | tprintf("Not expanding limit beyond %f due to touching row at %f\n", y_bottom, |
1067 | 0 | test_row->intercept()); |
1068 | 0 | } |
1069 | 73.0k | } |
1070 | 76.0k | } |
1071 | 90.2k | } |
1072 | 98.5k | y_min = y_bottom; // expand it |
1073 | 98.5k | } |
1074 | 130k | if (y_max < y_top) { // expansion allowed |
1075 | 80.1k | if (textord_show_expanded_rows && testing_on) { |
1076 | 0 | tprintf("Expanding top of row at %f from %f to %f\n", row->intercept(), y_max, y_top); |
1077 | 0 | } |
1078 | 80.1k | swallowed_row = true; |
1079 | 151k | while (swallowed_row && !row_it.at_first()) { |
1080 | 71.8k | swallowed_row = false; |
1081 | | // get one above |
1082 | 71.8k | test_row = row_it.data_relative(-1); |
1083 | 71.8k | if (test_row->min_y() < y_top) { |
1084 | 56.7k | if (test_row->max_y() < y_top) { |
1085 | 28 | if (textord_show_expanded_rows && testing_on) { |
1086 | 0 | tprintf("Eating row above at %f\n", test_row->intercept()); |
1087 | 0 | } |
1088 | 28 | row_it.backward(); |
1089 | 28 | blob_it.set_to_list(row->blob_list()); |
1090 | | #ifndef GRAPHICS_DISABLED |
1091 | | if (textord_show_expanded_rows && testing_on) { |
1092 | | plot_parallel_row(test_row, gradient, block_edge, ScrollView::WHITE, rotation); |
1093 | | } |
1094 | | #endif |
1095 | 28 | blob_it.add_list_after(test_row->blob_list()); |
1096 | | // swallow complete row |
1097 | 28 | delete row_it.extract(); |
1098 | 28 | row_it.forward(); |
1099 | 28 | swallowed_row = true; |
1100 | 56.6k | } else if (test_row->min_y() < y_max) { |
1101 | | // shorter limit |
1102 | 54.6k | y_top = test_row->min_y(); |
1103 | 54.6k | if (textord_show_expanded_rows && testing_on) { |
1104 | 0 | tprintf("Truncating limit to %f due to touching row at %f\n", y_top, |
1105 | 0 | test_row->intercept()); |
1106 | 0 | } |
1107 | 54.6k | } else { |
1108 | 2.03k | y_top = y_max; // can't expand it |
1109 | 2.03k | if (textord_show_expanded_rows && testing_on) { |
1110 | 0 | tprintf("Not expanding limit beyond %f due to touching row at %f\n", y_top, |
1111 | 0 | test_row->intercept()); |
1112 | 0 | } |
1113 | 2.03k | } |
1114 | 56.7k | } |
1115 | 71.8k | } |
1116 | 80.1k | y_max = y_top; |
1117 | 80.1k | } |
1118 | | // new limits |
1119 | 130k | row->set_limits(y_min, y_max); |
1120 | 130k | row_it.backward(); |
1121 | 130k | } while (!row_it.at_last()); |
1122 | 11.4k | } |
1123 | | |
1124 | | /** |
1125 | | * adjust_row_limits |
1126 | | * |
1127 | | * Change the limits of rows to suit the default fractions. |
1128 | | */ |
1129 | | void adjust_row_limits( // tidy limits |
1130 | | TO_BLOCK *block // block to do |
1131 | 16.6k | ) { |
1132 | 16.6k | TO_ROW *row; // current row |
1133 | 16.6k | float size; // size of row |
1134 | 16.6k | float ymax; // top of row |
1135 | 16.6k | float ymin; // bottom of row |
1136 | 16.6k | TO_ROW_IT row_it = block->get_rows(); |
1137 | | |
1138 | 16.6k | if (textord_show_expanded_rows) { |
1139 | 0 | tprintf("Adjusting row limits for block(%d,%d)\n", block->block->pdblk.bounding_box().left(), |
1140 | 0 | block->block->pdblk.bounding_box().top()); |
1141 | 0 | } |
1142 | 152k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1143 | 135k | row = row_it.data(); |
1144 | 135k | size = row->max_y() - row->min_y(); |
1145 | 135k | if (textord_show_expanded_rows) { |
1146 | 0 | tprintf("Row at %f has min %f, max %f, size %f\n", row->intercept(), row->min_y(), |
1147 | 0 | row->max_y(), size); |
1148 | 0 | } |
1149 | 135k | size /= tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction + |
1150 | 135k | tesseract::CCStruct::kDescenderFraction; |
1151 | 135k | ymax = size * (tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction); |
1152 | 135k | ymin = -size * tesseract::CCStruct::kDescenderFraction; |
1153 | 135k | row->set_limits(row->intercept() + ymin, row->intercept() + ymax); |
1154 | 135k | row->merged = false; |
1155 | 135k | } |
1156 | 16.6k | } |
1157 | | |
1158 | | /** |
1159 | | * @name compute_row_stats |
1160 | | * |
1161 | | * Compute the linespacing and offset. |
1162 | | */ |
1163 | | void compute_row_stats( // find lines |
1164 | | TO_BLOCK *block, // block to do |
1165 | | bool testing_on // correct orientation |
1166 | 11.4k | ) { |
1167 | 11.4k | int32_t row_index; // of median |
1168 | 11.4k | TO_ROW *row; // current row |
1169 | 11.4k | TO_ROW *prev_row; // previous row |
1170 | 11.4k | float iqr; // inter quartile range |
1171 | 11.4k | TO_ROW_IT row_it = block->get_rows(); |
1172 | | // number of rows |
1173 | 11.4k | int16_t rowcount = row_it.length(); |
1174 | | // for choose nth |
1175 | 11.4k | std::vector<TO_ROW *> rows(rowcount); |
1176 | 11.4k | rowcount = 0; |
1177 | 11.4k | prev_row = nullptr; |
1178 | 11.4k | row_it.move_to_last(); // start at bottom |
1179 | 135k | do { |
1180 | 135k | row = row_it.data(); |
1181 | 135k | if (prev_row != nullptr) { |
1182 | 124k | rows[rowcount++] = prev_row; |
1183 | 124k | prev_row->spacing = row->intercept() - prev_row->intercept(); |
1184 | 124k | if (prev_row->spacing < 0.1 && prev_row->spacing > -0.1) { |
1185 | | // Avoid small spacing values which give a small disp_quant_factor_. |
1186 | | // That can cause large memory allocations with out-of-memory. |
1187 | 544 | prev_row->spacing = 0; |
1188 | 544 | } |
1189 | 124k | if (testing_on) { |
1190 | 0 | tprintf("Row at %g yields spacing of %g\n", row->intercept(), prev_row->spacing); |
1191 | 0 | } |
1192 | 124k | } |
1193 | 135k | prev_row = row; |
1194 | 135k | row_it.backward(); |
1195 | 135k | } while (!row_it.at_last()); |
1196 | 11.4k | block->key_row = prev_row; |
1197 | 11.4k | block->baseline_offset = std::fmod(prev_row->parallel_c(), block->line_spacing); |
1198 | 11.4k | if (testing_on) { |
1199 | 0 | tprintf("Blob based spacing=(%g,%g), offset=%g", block->line_size, block->line_spacing, |
1200 | 0 | block->baseline_offset); |
1201 | 0 | } |
1202 | 11.4k | if (rowcount > 0) { |
1203 | 9.55k | rows.resize(rowcount); |
1204 | 9.55k | row_index = rowcount * 3 / 4; |
1205 | 9.55k | std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order); |
1206 | 9.55k | iqr = rows[row_index]->spacing; |
1207 | 9.55k | row_index = rowcount / 4; |
1208 | 9.55k | std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order); |
1209 | 9.55k | iqr -= rows[row_index]->spacing; |
1210 | 9.55k | row_index = rowcount / 2; |
1211 | 9.55k | std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order); |
1212 | 9.55k | block->key_row = rows[row_index]; |
1213 | 9.55k | if (testing_on) { |
1214 | 0 | tprintf(" row based=%g(%g)", rows[row_index]->spacing, iqr); |
1215 | 0 | } |
1216 | 9.55k | if (rowcount > 2 && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) { |
1217 | 4.24k | if (!textord_new_initial_xheight) { |
1218 | 0 | if (rows[row_index]->spacing < block->line_spacing && |
1219 | 0 | rows[row_index]->spacing > block->line_size) { |
1220 | | // within range |
1221 | 0 | block->line_size = rows[row_index]->spacing; |
1222 | | // spacing=size |
1223 | 0 | } else if (rows[row_index]->spacing > block->line_spacing) { |
1224 | 0 | block->line_size = block->line_spacing; |
1225 | 0 | } |
1226 | | // too big so use max |
1227 | 4.24k | } else { |
1228 | 4.24k | if (rows[row_index]->spacing < block->line_spacing) { |
1229 | 4.20k | block->line_size = rows[row_index]->spacing; |
1230 | 4.20k | } else { |
1231 | 38 | block->line_size = block->line_spacing; |
1232 | 38 | } |
1233 | | // too big so use max |
1234 | 4.24k | } |
1235 | 4.24k | if (block->line_size < textord_min_xheight) { |
1236 | 3.60k | block->line_size = (float)textord_min_xheight; |
1237 | 3.60k | } |
1238 | 4.24k | block->line_spacing = rows[row_index]->spacing; |
1239 | 4.24k | block->max_blob_size = block->line_spacing * textord_excess_blobsize; |
1240 | 4.24k | } |
1241 | 9.55k | block->baseline_offset = std::fmod(rows[row_index]->intercept(), block->line_spacing); |
1242 | 9.55k | } |
1243 | 11.4k | if (testing_on) { |
1244 | 0 | tprintf("\nEstimate line size=%g, spacing=%g, offset=%g\n", block->line_size, |
1245 | 0 | block->line_spacing, block->baseline_offset); |
1246 | 0 | } |
1247 | 11.4k | } |
1248 | | |
1249 | | /** |
1250 | | * @name compute_block_xheight |
1251 | | * |
1252 | | * Compute the xheight of the individual rows, then correlate them |
1253 | | * and interpret ascenderless lines, correcting xheights. |
1254 | | * |
1255 | | * First we compute our best guess of the x-height of each row independently |
1256 | | * with compute_row_xheight(), which looks for a pair of commonly occurring |
1257 | | * heights that could be x-height and ascender height. This function also |
1258 | | * attempts to find descenders of lowercase letters (i.e. not the small |
1259 | | * descenders that could appear in upper case letters as Q,J). |
1260 | | * |
1261 | | * After this computation each row falls into one of the following categories: |
1262 | | * ROW_ASCENDERS_FOUND: we found xheight and ascender modes, so this must be |
1263 | | * a regular row; we'll use its xheight to compute |
1264 | | * xheight and ascrise estimates for the block |
1265 | | * ROW_DESCENDERS_FOUND: no ascenders, so we do not have a high confidence in |
1266 | | * the xheight of this row (don't use it for estimating |
1267 | | * block xheight), but this row can't contain all caps |
1268 | | * ROW_UNKNOWN: a row with no ascenders/descenders, could be all lowercase |
1269 | | * (or mostly lowercase for fonts with very few ascenders), |
1270 | | * all upper case or small caps |
1271 | | * ROW_INVALID: no meaningful xheight could be found for this row |
1272 | | * |
1273 | | * We then run correct_row_xheight() and use the computed xheight and ascrise |
1274 | | * averages to correct xheight values of the rows in ROW_DESCENDERS_FOUND, |
1275 | | * ROW_UNKNOWN and ROW_INVALID categories. |
1276 | | * |
1277 | | */ |
1278 | 33.0k | void Textord::compute_block_xheight(TO_BLOCK *block, float gradient) { |
1279 | 33.0k | TO_ROW *row; // current row |
1280 | 33.0k | float asc_frac_xheight = CCStruct::kAscenderFraction / CCStruct::kXHeightFraction; |
1281 | 33.0k | float desc_frac_xheight = CCStruct::kDescenderFraction / CCStruct::kXHeightFraction; |
1282 | 33.0k | int32_t min_height, max_height; // limits on xheight |
1283 | 33.0k | TO_ROW_IT row_it = block->get_rows(); |
1284 | 33.0k | if (row_it.empty()) { |
1285 | 377 | return; // no rows |
1286 | 377 | } |
1287 | | |
1288 | | // Compute the best guess of xheight of each row individually. |
1289 | | // Use xheight and ascrise values of the rows where ascenders were found. |
1290 | 32.6k | get_min_max_xheight(block->line_size, &min_height, &max_height); |
1291 | 32.6k | STATS row_asc_xheights(min_height, max_height); |
1292 | 32.6k | STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight), |
1293 | 32.6k | static_cast<int>(max_height * asc_frac_xheight)); |
1294 | 32.6k | int min_desc_height = static_cast<int>(min_height * desc_frac_xheight); |
1295 | 32.6k | int max_desc_height = static_cast<int>(max_height * desc_frac_xheight); |
1296 | 32.6k | STATS row_asc_descdrop(min_desc_height, max_desc_height); |
1297 | 32.6k | STATS row_desc_xheights(min_height, max_height); |
1298 | 32.6k | STATS row_desc_descdrop(min_desc_height, max_desc_height); |
1299 | 32.6k | STATS row_cap_xheights(min_height, max_height); |
1300 | 32.6k | STATS row_cap_floating_xheights(min_height, max_height); |
1301 | 432k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1302 | 399k | row = row_it.data(); |
1303 | | // Compute the xheight of this row if it has not been computed before. |
1304 | 399k | if (row->xheight <= 0) { |
1305 | 118k | compute_row_xheight(row, block->block->classify_rotation(), gradient, block->line_size); |
1306 | 118k | } |
1307 | 399k | ROW_CATEGORY row_category = get_row_category(row); |
1308 | 399k | if (row_category == ROW_ASCENDERS_FOUND) { |
1309 | 218k | row_asc_xheights.add(static_cast<int32_t>(row->xheight), row->xheight_evidence); |
1310 | 218k | row_asc_ascrise.add(static_cast<int32_t>(row->ascrise), row->xheight_evidence); |
1311 | 218k | row_asc_descdrop.add(static_cast<int32_t>(-row->descdrop), row->xheight_evidence); |
1312 | 218k | } else if (row_category == ROW_DESCENDERS_FOUND) { |
1313 | 15.7k | row_desc_xheights.add(static_cast<int32_t>(row->xheight), row->xheight_evidence); |
1314 | 15.7k | row_desc_descdrop.add(static_cast<int32_t>(-row->descdrop), row->xheight_evidence); |
1315 | 165k | } else if (row_category == ROW_UNKNOWN) { |
1316 | 46.8k | fill_heights(row, gradient, min_height, max_height, &row_cap_xheights, |
1317 | 46.8k | &row_cap_floating_xheights); |
1318 | 46.8k | } |
1319 | 399k | } |
1320 | | |
1321 | 32.6k | float xheight = 0.0; |
1322 | 32.6k | float ascrise = 0.0; |
1323 | 32.6k | float descdrop = 0.0; |
1324 | | // Compute our best guess of xheight of this block. |
1325 | 32.6k | if (row_asc_xheights.get_total() > 0) { |
1326 | | // Determine xheight from rows where ascenders were found. |
1327 | 18.6k | xheight = row_asc_xheights.median(); |
1328 | 18.6k | ascrise = row_asc_ascrise.median(); |
1329 | 18.6k | descdrop = -row_asc_descdrop.median(); |
1330 | 18.6k | } else if (row_desc_xheights.get_total() > 0) { |
1331 | | // Determine xheight from rows where descenders were found. |
1332 | 3.01k | xheight = row_desc_xheights.median(); |
1333 | 3.01k | descdrop = -row_desc_descdrop.median(); |
1334 | 10.9k | } else if (row_cap_xheights.get_total() > 0) { |
1335 | | // All the rows in the block were (a/de)scenderless. |
1336 | | // Try to search for two modes in row_cap_heights that could |
1337 | | // be the xheight and the capheight (e.g. some of the rows |
1338 | | // were lowercase, but did not have enough (a/de)scenders. |
1339 | | // If such two modes cannot be found, this block is most |
1340 | | // likely all caps (or all small caps, in which case the code |
1341 | | // still works as intended). |
1342 | 6.40k | compute_xheight_from_modes( |
1343 | 6.40k | &row_cap_xheights, &row_cap_floating_xheights, |
1344 | 6.40k | textord_single_height_mode && block->block->classify_rotation().y() == 0.0, min_height, |
1345 | 6.40k | max_height, &(xheight), &(ascrise)); |
1346 | 6.40k | if (ascrise == 0) { // assume only caps in the whole block |
1347 | 4.29k | xheight = row_cap_xheights.median() * CCStruct::kXHeightCapRatio; |
1348 | 4.29k | } |
1349 | 6.40k | } else { // default block sizes |
1350 | 4.52k | xheight = block->line_size * CCStruct::kXHeightFraction; |
1351 | 4.52k | } |
1352 | | // Correct xheight, ascrise and descdrop if necessary. |
1353 | 32.6k | bool corrected_xheight = false; |
1354 | 32.6k | if (xheight < textord_min_xheight) { |
1355 | 6.14k | xheight = static_cast<float>(textord_min_xheight); |
1356 | 6.14k | corrected_xheight = true; |
1357 | 6.14k | } |
1358 | 32.6k | if (corrected_xheight || ascrise <= 0) { |
1359 | 11.8k | ascrise = xheight * asc_frac_xheight; |
1360 | 11.8k | } |
1361 | 32.6k | if (corrected_xheight || descdrop >= 0) { |
1362 | 10.9k | descdrop = -(xheight * desc_frac_xheight); |
1363 | 10.9k | } |
1364 | 32.6k | block->xheight = xheight; |
1365 | | |
1366 | 32.6k | if (textord_debug_xheights) { |
1367 | 0 | tprintf("Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n", xheight, ascrise, |
1368 | 0 | descdrop); |
1369 | 0 | } |
1370 | | // Correct xheight, ascrise, descdrop of rows based on block averages. |
1371 | 432k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1372 | 399k | correct_row_xheight(row_it.data(), xheight, ascrise, descdrop); |
1373 | 399k | } |
1374 | 32.6k | } |
1375 | | |
1376 | | /** |
1377 | | * @name compute_row_xheight |
1378 | | * |
1379 | | * Estimate the xheight of this row. |
1380 | | * Compute the ascender rise and descender drop at the same time. |
1381 | | * Set xheigh_evidence to the number of blobs with the chosen xheight |
1382 | | * that appear in this row. |
1383 | | */ |
1384 | | void Textord::compute_row_xheight(TO_ROW *row, // row to do |
1385 | | const FCOORD &rotation, |
1386 | | float gradient, // global skew |
1387 | 337k | int block_line_size) { |
1388 | | // Find blobs representing repeated characters in rows and mark them. |
1389 | | // This information is used for computing row xheight and at a later |
1390 | | // stage when words are formed by make_words. |
1391 | 337k | if (!row->rep_chars_marked()) { |
1392 | 199k | mark_repeated_chars(row); |
1393 | 199k | } |
1394 | | |
1395 | 337k | int min_height, max_height; |
1396 | 337k | get_min_max_xheight(block_line_size, &min_height, &max_height); |
1397 | 337k | STATS heights(min_height, max_height); |
1398 | 337k | STATS floating_heights(min_height, max_height); |
1399 | 337k | fill_heights(row, gradient, min_height, max_height, &heights, &floating_heights); |
1400 | 337k | row->ascrise = 0.0f; |
1401 | 337k | row->xheight = 0.0f; |
1402 | 337k | row->xheight_evidence = compute_xheight_from_modes( |
1403 | 337k | &heights, &floating_heights, textord_single_height_mode && rotation.y() == 0.0, min_height, |
1404 | 337k | max_height, &(row->xheight), &(row->ascrise)); |
1405 | 337k | row->descdrop = 0.0f; |
1406 | 337k | if (row->xheight > 0) { |
1407 | 81.1k | row->descdrop = |
1408 | 81.1k | static_cast<float>(compute_row_descdrop(row, gradient, row->xheight_evidence, &heights)); |
1409 | 81.1k | } |
1410 | 337k | } |
1411 | | |
1412 | | /** |
1413 | | * @name fill_heights |
1414 | | * |
1415 | | * Fill the given heights with heights of the blobs that are legal |
1416 | | * candidates for estimating xheight. |
1417 | | */ |
1418 | | void fill_heights(TO_ROW *row, float gradient, int min_height, int max_height, STATS *heights, |
1419 | 384k | STATS *floating_heights) { |
1420 | 384k | float xcentre; // centre of blob |
1421 | 384k | float top; // top y coord of blob |
1422 | 384k | float height; // height of blob |
1423 | 384k | BLOBNBOX *blob; // current blob |
1424 | 384k | int repeated_set; |
1425 | 384k | BLOBNBOX_IT blob_it = row->blob_list(); |
1426 | 384k | if (blob_it.empty()) { |
1427 | 0 | return; // no blobs in this row |
1428 | 0 | } |
1429 | 384k | bool has_rep_chars = row->rep_chars_marked() && row->num_repeated_sets() > 0; |
1430 | 4.78M | do { |
1431 | 4.78M | blob = blob_it.data(); |
1432 | 4.78M | if (!blob->joined_to_prev()) { |
1433 | 2.98M | xcentre = (blob->bounding_box().left() + blob->bounding_box().right()) / 2.0f; |
1434 | 2.98M | top = blob->bounding_box().top(); |
1435 | 2.98M | height = blob->bounding_box().height(); |
1436 | 2.98M | if (textord_fix_xheight_bug) { |
1437 | 2.98M | top -= row->baseline.y(xcentre); |
1438 | 2.98M | } else { |
1439 | 0 | top -= gradient * xcentre + row->parallel_c(); |
1440 | 0 | } |
1441 | 2.98M | if (top >= min_height && top <= max_height) { |
1442 | 596k | heights->add(static_cast<int32_t>(floor(top + 0.5)), 1); |
1443 | 596k | if (height / top < textord_min_blob_height_fraction) { |
1444 | 159k | floating_heights->add(static_cast<int32_t>(floor(top + 0.5)), 1); |
1445 | 159k | } |
1446 | 596k | } |
1447 | 2.98M | } |
1448 | | // Skip repeated chars, since they are likely to skew the height stats. |
1449 | 4.78M | if (has_rep_chars && blob->repeated_set() != 0) { |
1450 | 0 | repeated_set = blob->repeated_set(); |
1451 | 0 | blob_it.forward(); |
1452 | 0 | while (!blob_it.at_first() && blob_it.data()->repeated_set() == repeated_set) { |
1453 | 0 | blob_it.forward(); |
1454 | 0 | if (textord_debug_xheights) { |
1455 | 0 | tprintf("Skipping repeated char when computing xheight\n"); |
1456 | 0 | } |
1457 | 0 | } |
1458 | 4.78M | } else { |
1459 | 4.78M | blob_it.forward(); |
1460 | 4.78M | } |
1461 | 4.78M | } while (!blob_it.at_first()); |
1462 | 384k | } |
1463 | | |
1464 | | /** |
1465 | | * @name compute_xheight_from_modes |
1466 | | * |
1467 | | * Given a STATS object heights, looks for two most frequently occurring |
1468 | | * heights that look like xheight and xheight + ascrise. If found, sets |
1469 | | * the values of *xheight and *ascrise accordingly, otherwise sets xheight |
1470 | | * to any most frequently occurring height and sets *ascrise to 0. |
1471 | | * Returns the number of times xheight occurred in heights. |
1472 | | * For each mode that is considered for being an xheight the count of |
1473 | | * floating blobs (stored in floating_heights) is subtracted from the |
1474 | | * total count of the blobs of this height. This is done because blobs |
1475 | | * that sit far above the baseline could represent valid ascenders, but |
1476 | | * it is highly unlikely that such a character's height will be an xheight |
1477 | | * (e.g. -, ', =, ^, `, ", ', etc) |
1478 | | * If cap_only, then force finding of only the top mode. |
1479 | | */ |
1480 | | int compute_xheight_from_modes(STATS *heights, STATS *floating_heights, bool cap_only, |
1481 | 344k | int min_height, int max_height, float *xheight, float *ascrise) { |
1482 | 344k | int blob_index = heights->mode(); // find mode |
1483 | 344k | int blob_count = heights->pile_count(blob_index); // get count of mode |
1484 | 344k | if (textord_debug_xheights) { |
1485 | 0 | tprintf("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n", min_height, max_height, |
1486 | 0 | blob_index, blob_count, heights->get_total()); |
1487 | 0 | heights->print(); |
1488 | 0 | floating_heights->print(); |
1489 | 0 | } |
1490 | 344k | if (blob_count == 0) { |
1491 | 256k | return 0; |
1492 | 256k | } |
1493 | 87.5k | int modes[MAX_HEIGHT_MODES]; // biggest piles |
1494 | 87.5k | bool in_best_pile = false; |
1495 | 87.5k | int prev_size = -INT32_MAX; |
1496 | 87.5k | int best_count = 0; |
1497 | 87.5k | int mode_count = compute_height_modes(heights, min_height, max_height, modes, MAX_HEIGHT_MODES); |
1498 | 87.5k | if (cap_only && mode_count > 1) { |
1499 | 0 | mode_count = 1; |
1500 | 0 | } |
1501 | 87.5k | int x; |
1502 | 87.5k | if (textord_debug_xheights) { |
1503 | 0 | tprintf("found %d modes: ", mode_count); |
1504 | 0 | for (x = 0; x < mode_count; x++) { |
1505 | 0 | tprintf("%d ", modes[x]); |
1506 | 0 | } |
1507 | 0 | tprintf("\n"); |
1508 | 0 | } |
1509 | | |
1510 | 216k | for (x = 0; x < mode_count - 1; x++) { |
1511 | 128k | if (modes[x] != prev_size + 1) { |
1512 | 117k | in_best_pile = false; // had empty height |
1513 | 117k | } |
1514 | 128k | int modes_x_count = heights->pile_count(modes[x]) - floating_heights->pile_count(modes[x]); |
1515 | 128k | if ((modes_x_count >= blob_count * textord_xheight_mode_fraction) && |
1516 | 128k | (in_best_pile || modes_x_count > best_count)) { |
1517 | 186k | for (int asc = x + 1; asc < mode_count; asc++) { |
1518 | 140k | float ratio = static_cast<float>(modes[asc]) / static_cast<float>(modes[x]); |
1519 | 140k | if (textord_ascx_ratio_min < ratio && ratio < textord_ascx_ratio_max && |
1520 | 140k | (heights->pile_count(modes[asc]) >= blob_count * textord_ascheight_mode_fraction)) { |
1521 | 56.9k | if (modes_x_count > best_count) { |
1522 | 23.5k | in_best_pile = true; |
1523 | 23.5k | best_count = modes_x_count; |
1524 | 23.5k | } |
1525 | 56.9k | if (textord_debug_xheights) { |
1526 | 0 | tprintf("X=%d, asc=%d, count=%d, ratio=%g\n", modes[x], modes[asc] - modes[x], |
1527 | 0 | modes_x_count, ratio); |
1528 | 0 | } |
1529 | 56.9k | prev_size = modes[x]; |
1530 | 56.9k | *xheight = static_cast<float>(modes[x]); |
1531 | 56.9k | *ascrise = static_cast<float>(modes[asc] - modes[x]); |
1532 | 56.9k | } |
1533 | 140k | } |
1534 | 45.8k | } |
1535 | 128k | } |
1536 | 87.5k | if (*xheight == 0) { // single mode |
1537 | | // Remove counts of the "floating" blobs (the one whose height is too |
1538 | | // small in relation to it's top end of the bounding box) from heights |
1539 | | // before computing the single-mode xheight. |
1540 | | // Restore the counts in heights after the mode is found, since |
1541 | | // floating blobs might be useful for determining potential ascenders |
1542 | | // in compute_row_descdrop(). |
1543 | 66.9k | if (floating_heights->get_total() > 0) { |
1544 | 495k | for (x = min_height; x < max_height; ++x) { |
1545 | 473k | heights->add(x, -(floating_heights->pile_count(x))); |
1546 | 473k | } |
1547 | 21.7k | blob_index = heights->mode(); // find the modified mode |
1548 | 495k | for (x = min_height; x < max_height; ++x) { |
1549 | 473k | heights->add(x, floating_heights->pile_count(x)); |
1550 | 473k | } |
1551 | 21.7k | } |
1552 | 66.9k | *xheight = static_cast<float>(blob_index); |
1553 | 66.9k | *ascrise = 0.0f; |
1554 | 66.9k | best_count = heights->pile_count(blob_index); |
1555 | 66.9k | if (textord_debug_xheights) { |
1556 | 0 | tprintf("Single mode xheight set to %g\n", *xheight); |
1557 | 0 | } |
1558 | 66.9k | } else if (textord_debug_xheights) { |
1559 | 0 | tprintf("Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise); |
1560 | 0 | } |
1561 | 87.5k | return best_count; |
1562 | 344k | } |
1563 | | |
1564 | | /** |
1565 | | * @name compute_row_descdrop |
1566 | | * |
1567 | | * Estimates the descdrop of this row. This function looks for |
1568 | | * "significant" descenders of lowercase letters (those that could |
1569 | | * not just be the small descenders of upper case letters like Q,J). |
1570 | | * The function also takes into account how many potential ascenders |
1571 | | * this row might contain. If the number of potential ascenders along |
1572 | | * with descenders is close to the expected fraction of the total |
1573 | | * number of blobs in the row, the function returns the descender |
1574 | | * height, returns 0 otherwise. |
1575 | | */ |
1576 | | int32_t compute_row_descdrop(TO_ROW *row, float gradient, int xheight_blob_count, |
1577 | 81.1k | STATS *asc_heights) { |
1578 | | // Count how many potential ascenders are in this row. |
1579 | 81.1k | int i_min = asc_heights->min_bucket(); |
1580 | 81.1k | if ((i_min / row->xheight) < textord_ascx_ratio_min) { |
1581 | 78.9k | i_min = static_cast<int>(floor(row->xheight * textord_ascx_ratio_min + 0.5)); |
1582 | 78.9k | } |
1583 | 81.1k | int i_max = asc_heights->max_bucket(); |
1584 | 81.1k | if ((i_max / row->xheight) > textord_ascx_ratio_max) { |
1585 | 7.78k | i_max = static_cast<int>(floor(row->xheight * textord_ascx_ratio_max)); |
1586 | 7.78k | } |
1587 | 81.1k | int num_potential_asc = 0; |
1588 | 206k | for (int i = i_min; i <= i_max; ++i) { |
1589 | 125k | num_potential_asc += asc_heights->pile_count(i); |
1590 | 125k | } |
1591 | 81.1k | auto min_height = static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_min + 0.5)); |
1592 | 81.1k | auto max_height = static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_max)); |
1593 | 81.1k | float xcentre; // centre of blob |
1594 | 81.1k | float height; // height of blob |
1595 | 81.1k | BLOBNBOX_IT blob_it = row->blob_list(); |
1596 | 81.1k | BLOBNBOX *blob; // current blob |
1597 | 81.1k | STATS heights(min_height, max_height); |
1598 | 1.80M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1599 | 1.72M | blob = blob_it.data(); |
1600 | 1.72M | if (!blob->joined_to_prev()) { |
1601 | 808k | xcentre = (blob->bounding_box().left() + blob->bounding_box().right()) / 2.0f; |
1602 | 808k | height = (gradient * xcentre + row->parallel_c() - blob->bounding_box().bottom()); |
1603 | 808k | if (height >= min_height && height <= max_height) { |
1604 | 110k | heights.add(static_cast<int>(floor(height + 0.5)), 1); |
1605 | 110k | } |
1606 | 808k | } |
1607 | 1.72M | } |
1608 | 81.1k | int blob_index = heights.mode(); // find mode |
1609 | 81.1k | int blob_count = heights.pile_count(blob_index); // get count of mode |
1610 | 81.1k | float total_fraction = (textord_descheight_mode_fraction + textord_ascheight_mode_fraction); |
1611 | 81.1k | if (static_cast<float>(blob_count + num_potential_asc) < xheight_blob_count * total_fraction) { |
1612 | 40.3k | blob_count = 0; |
1613 | 40.3k | } |
1614 | 81.1k | int descdrop = blob_count > 0 ? -blob_index : 0; |
1615 | 81.1k | if (textord_debug_xheights) { |
1616 | 0 | tprintf("Descdrop: %d (potential ascenders %d, descenders %d)\n", descdrop, num_potential_asc, |
1617 | 0 | blob_count); |
1618 | 0 | heights.print(); |
1619 | 0 | } |
1620 | 81.1k | return descdrop; |
1621 | 81.1k | } |
1622 | | |
1623 | | /** |
1624 | | * @name compute_height_modes |
1625 | | * |
1626 | | * Find the top maxmodes values in the input array and put their |
1627 | | * indices in the output in the order in which they occurred. |
1628 | | */ |
1629 | | int32_t compute_height_modes(STATS *heights, // stats to search |
1630 | | int32_t min_height, // bottom of range |
1631 | | int32_t max_height, // top of range |
1632 | | int32_t *modes, // output array |
1633 | 87.5k | int32_t maxmodes) { // size of modes |
1634 | 87.5k | int32_t pile_count; // no in source pile |
1635 | 87.5k | int32_t src_count; // no of source entries |
1636 | 87.5k | int32_t src_index; // current entry |
1637 | 87.5k | int32_t least_count; // height of smalllest |
1638 | 87.5k | int32_t least_index; // index of least |
1639 | 87.5k | int32_t dest_count; // index in modes |
1640 | | |
1641 | 87.5k | src_count = max_height + 1 - min_height; |
1642 | 87.5k | dest_count = 0; |
1643 | 87.5k | least_count = INT32_MAX; |
1644 | 87.5k | least_index = -1; |
1645 | 3.36M | for (src_index = 0; src_index < src_count; src_index++) { |
1646 | 3.27M | pile_count = heights->pile_count(min_height + src_index); |
1647 | 3.27M | if (pile_count > 0) { |
1648 | 228k | if (dest_count < maxmodes) { |
1649 | 216k | if (pile_count < least_count) { |
1650 | | // find smallest in array |
1651 | 101k | least_count = pile_count; |
1652 | 101k | least_index = dest_count; |
1653 | 101k | } |
1654 | 216k | modes[dest_count++] = min_height + src_index; |
1655 | 216k | } else if (pile_count >= least_count) { |
1656 | 63.6k | while (least_index < maxmodes - 1) { |
1657 | 53.0k | modes[least_index] = modes[least_index + 1]; |
1658 | | // shuffle up |
1659 | 53.0k | least_index++; |
1660 | 53.0k | } |
1661 | | // new one on end |
1662 | 10.6k | modes[maxmodes - 1] = min_height + src_index; |
1663 | 10.6k | if (pile_count == least_count) { |
1664 | | // new smallest |
1665 | 6.39k | least_index = maxmodes - 1; |
1666 | 6.39k | } else { |
1667 | 4.23k | least_count = heights->pile_count(modes[0]); |
1668 | 4.23k | least_index = 0; |
1669 | 50.8k | for (dest_count = 1; dest_count < maxmodes; dest_count++) { |
1670 | 46.5k | pile_count = heights->pile_count(modes[dest_count]); |
1671 | 46.5k | if (pile_count < least_count) { |
1672 | | // find smallest |
1673 | 3.19k | least_count = pile_count; |
1674 | 3.19k | least_index = dest_count; |
1675 | 3.19k | } |
1676 | 46.5k | } |
1677 | 4.23k | } |
1678 | 10.6k | } |
1679 | 228k | } |
1680 | 3.27M | } |
1681 | 87.5k | return dest_count; |
1682 | 87.5k | } |
1683 | | |
1684 | | /** |
1685 | | * @name correct_row_xheight |
1686 | | * |
1687 | | * Adjust the xheight etc of this row if not within reasonable limits |
1688 | | * of the average for the block. |
1689 | | */ |
1690 | 399k | void correct_row_xheight(TO_ROW *row, float xheight, float ascrise, float descdrop) { |
1691 | 399k | ROW_CATEGORY row_category = get_row_category(row); |
1692 | 399k | if (textord_debug_xheights) { |
1693 | 0 | tprintf( |
1694 | 0 | "correcting row xheight: row->xheight %.4f" |
1695 | 0 | ", row->acrise %.4f row->descdrop %.4f\n", |
1696 | 0 | row->xheight, row->ascrise, row->descdrop); |
1697 | 0 | } |
1698 | 399k | bool normal_xheight = within_error_margin(row->xheight, xheight, textord_xheight_error_margin); |
1699 | 399k | bool cap_xheight = |
1700 | 399k | within_error_margin(row->xheight, xheight + ascrise, textord_xheight_error_margin); |
1701 | | // Use the average xheight/ascrise for the following cases: |
1702 | | // -- the xheight of the row could not be determined at all |
1703 | | // -- the row has descenders (e.g. "many groups", "ISBN 12345 p.3") |
1704 | | // and its xheight is close to either cap height or average xheight |
1705 | | // -- the row does not have ascenders or descenders, but its xheight |
1706 | | // is close to the average block xheight (e.g. row with "www.mmm.com") |
1707 | 399k | if (row_category == ROW_ASCENDERS_FOUND) { |
1708 | 218k | if (row->descdrop >= 0) { |
1709 | 8.73k | row->descdrop = row->xheight * (descdrop / xheight); |
1710 | 8.73k | } |
1711 | 218k | } else if (row_category == ROW_INVALID || |
1712 | 181k | (row_category == ROW_DESCENDERS_FOUND && (normal_xheight || cap_xheight)) || |
1713 | 181k | (row_category == ROW_UNKNOWN && normal_xheight)) { |
1714 | 143k | if (textord_debug_xheights) { |
1715 | 0 | tprintf("using average xheight\n"); |
1716 | 0 | } |
1717 | 143k | row->xheight = xheight; |
1718 | 143k | row->ascrise = ascrise; |
1719 | 143k | row->descdrop = descdrop; |
1720 | 143k | } else if (row_category == ROW_DESCENDERS_FOUND) { |
1721 | | // Assume this is a row with mostly lowercase letters and it's xheight |
1722 | | // is computed correctly (unfortunately there is no way to distinguish |
1723 | | // this from the case when descenders are found, but the most common |
1724 | | // height is capheight). |
1725 | 7.07k | if (textord_debug_xheights) { |
1726 | 0 | tprintf("lowercase, corrected ascrise\n"); |
1727 | 0 | } |
1728 | 7.07k | row->ascrise = row->xheight * (ascrise / xheight); |
1729 | 30.5k | } else if (row_category == ROW_UNKNOWN) { |
1730 | | // Otherwise assume this row is an all-caps or small-caps row |
1731 | | // and adjust xheight and ascrise of the row. |
1732 | | |
1733 | 30.5k | row->all_caps = true; |
1734 | 30.5k | if (cap_xheight) { // regular all caps |
1735 | 7.95k | if (textord_debug_xheights) { |
1736 | 0 | tprintf("all caps\n"); |
1737 | 0 | } |
1738 | 7.95k | row->xheight = xheight; |
1739 | 7.95k | row->ascrise = ascrise; |
1740 | 7.95k | row->descdrop = descdrop; |
1741 | 22.6k | } else { // small caps or caps with an odd xheight |
1742 | 22.6k | if (textord_debug_xheights) { |
1743 | 0 | if (row->xheight < xheight + ascrise && row->xheight > xheight) { |
1744 | 0 | tprintf("small caps\n"); |
1745 | 0 | } else { |
1746 | 0 | tprintf("all caps with irregular xheight\n"); |
1747 | 0 | } |
1748 | 0 | } |
1749 | 22.6k | row->ascrise = row->xheight * (ascrise / (xheight + ascrise)); |
1750 | 22.6k | row->xheight -= row->ascrise; |
1751 | 22.6k | row->descdrop = row->xheight * (descdrop / xheight); |
1752 | 22.6k | } |
1753 | 30.5k | } |
1754 | 399k | if (textord_debug_xheights) { |
1755 | 0 | tprintf( |
1756 | 0 | "corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop" |
1757 | 0 | " = %.4f\n", |
1758 | 0 | row->xheight, row->ascrise, row->descdrop); |
1759 | 0 | } |
1760 | 399k | } |
1761 | | |
1762 | 35.2k | static int CountOverlaps(const TBOX &box, int min_height, BLOBNBOX_LIST *blobs) { |
1763 | 35.2k | int overlaps = 0; |
1764 | 35.2k | BLOBNBOX_IT blob_it(blobs); |
1765 | 899k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1766 | 864k | BLOBNBOX *blob = blob_it.data(); |
1767 | 864k | const TBOX &blob_box = blob->bounding_box(); |
1768 | 864k | if (blob_box.height() >= min_height && box.major_overlap(blob_box)) { |
1769 | 32.5k | ++overlaps; |
1770 | 32.5k | } |
1771 | 864k | } |
1772 | 35.2k | return overlaps; |
1773 | 35.2k | } |
1774 | | |
1775 | | /** |
1776 | | * @name separate_underlines |
1777 | | * |
1778 | | * Test wide objects for being potential underlines. If they are then |
1779 | | * put them in a separate list in the block. |
1780 | | */ |
1781 | | void separate_underlines(TO_BLOCK *block, // block to do |
1782 | | float gradient, // skew angle |
1783 | | FCOORD rotation, // inverse landscape |
1784 | 16.6k | bool testing_on) { // correct orientation |
1785 | 16.6k | BLOBNBOX *blob; // current blob |
1786 | 16.6k | C_BLOB *rotated_blob; // rotated blob |
1787 | 16.6k | TO_ROW *row; // current row |
1788 | 16.6k | float length; // of g_vec |
1789 | 16.6k | TBOX blob_box; |
1790 | 16.6k | FCOORD blob_rotation; // inverse of rotation |
1791 | 16.6k | FCOORD g_vec; // skew rotation |
1792 | 16.6k | BLOBNBOX_IT blob_it; // iterator |
1793 | | // iterator |
1794 | 16.6k | BLOBNBOX_IT under_it = &block->underlines; |
1795 | 16.6k | BLOBNBOX_IT large_it = &block->large_blobs; |
1796 | 16.6k | TO_ROW_IT row_it = block->get_rows(); |
1797 | 16.6k | int min_blob_height = static_cast<int>(textord_min_blob_height_fraction * block->line_size + 0.5); |
1798 | | |
1799 | | // length of vector |
1800 | 16.6k | length = std::sqrt(1 + gradient * gradient); |
1801 | 16.6k | g_vec = FCOORD(1 / length, -gradient / length); |
1802 | 16.6k | blob_rotation = FCOORD(rotation.x(), -rotation.y()); |
1803 | 16.6k | blob_rotation.rotate(g_vec); // undoing everything |
1804 | 216k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1805 | 199k | row = row_it.data(); |
1806 | | // get blobs |
1807 | 199k | blob_it.set_to_list(row->blob_list()); |
1808 | 2.83M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1809 | 2.63M | blob = blob_it.data(); |
1810 | 2.63M | blob_box = blob->bounding_box(); |
1811 | 2.63M | if (blob_box.width() > block->line_size * textord_underline_width) { |
1812 | 67.2k | ASSERT_HOST(blob->cblob() != nullptr); |
1813 | 67.2k | rotated_blob = crotate_cblob(blob->cblob(), blob_rotation); |
1814 | 67.2k | if (test_underline(testing_on && textord_show_final_rows, rotated_blob, |
1815 | 67.2k | static_cast<int16_t>(row->intercept()), |
1816 | 67.2k | static_cast<int16_t>(block->line_size * |
1817 | 67.2k | (tesseract::CCStruct::kXHeightFraction + |
1818 | 67.2k | tesseract::CCStruct::kAscenderFraction / 2.0f)))) { |
1819 | 31.9k | under_it.add_after_then_move(blob_it.extract()); |
1820 | 31.9k | if (testing_on && textord_show_final_rows) { |
1821 | 0 | tprintf("Underlined blob at:"); |
1822 | 0 | rotated_blob->bounding_box().print(); |
1823 | 0 | tprintf("Was:"); |
1824 | 0 | blob_box.print(); |
1825 | 0 | } |
1826 | 35.2k | } else if (CountOverlaps(blob->bounding_box(), min_blob_height, row->blob_list()) > |
1827 | 35.2k | textord_max_blob_overlaps) { |
1828 | 329 | large_it.add_after_then_move(blob_it.extract()); |
1829 | 329 | if (testing_on && textord_show_final_rows) { |
1830 | 0 | tprintf("Large blob overlaps %d blobs at:", |
1831 | 0 | CountOverlaps(blob_box, min_blob_height, row->blob_list())); |
1832 | 0 | blob_box.print(); |
1833 | 0 | } |
1834 | 329 | } |
1835 | 67.2k | delete rotated_blob; |
1836 | 67.2k | } |
1837 | 2.63M | } |
1838 | 199k | } |
1839 | 16.6k | } |
1840 | | |
1841 | | /** |
1842 | | * @name pre_associate_blobs |
1843 | | * |
1844 | | * Associate overlapping blobs and fake chop wide blobs. |
1845 | | */ |
1846 | | void pre_associate_blobs( // make rough chars |
1847 | | ICOORD page_tr, // top right |
1848 | | TO_BLOCK *block, // block to do |
1849 | | FCOORD rotation, // inverse landscape |
1850 | | bool testing_on // correct orientation |
1851 | 16.6k | ) { |
1852 | | #ifndef GRAPHICS_DISABLED |
1853 | | ScrollView::Color colour; // of boxes |
1854 | | #endif |
1855 | 16.6k | BLOBNBOX *blob; // current blob |
1856 | 16.6k | BLOBNBOX *nextblob; // next in list |
1857 | 16.6k | TBOX blob_box; |
1858 | 16.6k | FCOORD blob_rotation; // inverse of rotation |
1859 | 16.6k | BLOBNBOX_IT blob_it; // iterator |
1860 | 16.6k | BLOBNBOX_IT start_it; // iterator |
1861 | 16.6k | TO_ROW_IT row_it = block->get_rows(); |
1862 | | |
1863 | | #ifndef GRAPHICS_DISABLED |
1864 | | colour = ScrollView::RED; |
1865 | | #endif |
1866 | | |
1867 | 16.6k | blob_rotation = FCOORD(rotation.x(), -rotation.y()); |
1868 | 216k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1869 | | // get blobs |
1870 | 199k | blob_it.set_to_list(row_it.data()->blob_list()); |
1871 | 1.87M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1872 | 1.67M | blob = blob_it.data(); |
1873 | 1.67M | blob_box = blob->bounding_box(); |
1874 | 1.67M | start_it = blob_it; // save start point |
1875 | | // if (testing_on && textord_show_final_blobs) |
1876 | | // { |
1877 | | // tprintf("Blob at (%d,%d)->(%d,%d), |
1878 | | // addr=%x, count=%d\n", |
1879 | | // blob_box.left(),blob_box.bottom(), |
1880 | | // blob_box.right(),blob_box.top(), |
1881 | | // (void*)blob,blob_it.length()); |
1882 | | // } |
1883 | 1.67M | bool overlap; |
1884 | 2.87M | do { |
1885 | 2.87M | overlap = false; |
1886 | 2.87M | if (!blob_it.at_last()) { |
1887 | 2.65M | nextblob = blob_it.data_relative(1); |
1888 | 2.65M | overlap = blob_box.major_x_overlap(nextblob->bounding_box()); |
1889 | 2.65M | if (overlap) { |
1890 | 1.20M | blob->merge(nextblob); // merge new blob |
1891 | 1.20M | blob_box = blob->bounding_box(); // get bigger box |
1892 | 1.20M | blob_it.forward(); |
1893 | 1.20M | } |
1894 | 2.65M | } |
1895 | 2.87M | } while (overlap); |
1896 | 1.67M | blob->chop(&start_it, &blob_it, blob_rotation, |
1897 | 1.67M | block->line_size * tesseract::CCStruct::kXHeightFraction * textord_chop_width); |
1898 | | // attempt chop |
1899 | 1.67M | } |
1900 | | #ifndef GRAPHICS_DISABLED |
1901 | | if (testing_on && textord_show_final_blobs) { |
1902 | | if (to_win == nullptr) { |
1903 | | create_to_win(page_tr); |
1904 | | } |
1905 | | to_win->Pen(colour); |
1906 | | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1907 | | blob = blob_it.data(); |
1908 | | blob_box = blob->bounding_box(); |
1909 | | blob_box.rotate(rotation); |
1910 | | if (!blob->joined_to_prev()) { |
1911 | | to_win->Rectangle(blob_box.left(), blob_box.bottom(), blob_box.right(), blob_box.top()); |
1912 | | } |
1913 | | } |
1914 | | colour = static_cast<ScrollView::Color>(colour + 1); |
1915 | | if (colour > ScrollView::MAGENTA) { |
1916 | | colour = ScrollView::RED; |
1917 | | } |
1918 | | } |
1919 | | #endif |
1920 | 199k | } |
1921 | 16.6k | } |
1922 | | |
1923 | | /** |
1924 | | * @name fit_parallel_rows |
1925 | | * |
1926 | | * Re-fit the rows in the block to the given gradient. |
1927 | | */ |
1928 | | void fit_parallel_rows( // find lines |
1929 | | TO_BLOCK *block, // block to do |
1930 | | float gradient, // gradient to fit |
1931 | | FCOORD rotation, // for drawing |
1932 | | int32_t block_edge, // edge of block |
1933 | | bool testing_on // correct orientation |
1934 | 28.1k | ) { |
1935 | | #ifndef GRAPHICS_DISABLED |
1936 | | ScrollView::Color colour; // of row |
1937 | | #endif |
1938 | 28.1k | TO_ROW_IT row_it = block->get_rows(); |
1939 | | |
1940 | 28.1k | row_it.move_to_first(); |
1941 | 390k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1942 | 362k | if (row_it.data()->blob_list()->empty()) { |
1943 | 0 | delete row_it.extract(); // nothing in it |
1944 | 362k | } else { |
1945 | 362k | fit_parallel_lms(gradient, row_it.data()); |
1946 | 362k | } |
1947 | 362k | } |
1948 | | #ifndef GRAPHICS_DISABLED |
1949 | | if (testing_on) { |
1950 | | colour = ScrollView::RED; |
1951 | | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1952 | | plot_parallel_row(row_it.data(), gradient, block_edge, colour, rotation); |
1953 | | colour = static_cast<ScrollView::Color>(colour + 1); |
1954 | | if (colour > ScrollView::MAGENTA) { |
1955 | | colour = ScrollView::RED; |
1956 | | } |
1957 | | } |
1958 | | } |
1959 | | #endif |
1960 | 28.1k | row_it.sort(row_y_order); // may have gone out of order |
1961 | 28.1k | } |
1962 | | |
1963 | | /** |
1964 | | * @name fit_parallel_lms |
1965 | | * |
1966 | | * Fit an LMS line to a row. |
1967 | | * Make the fit parallel to the given gradient and set the |
1968 | | * row accordingly. |
1969 | | */ |
1970 | 362k | void fit_parallel_lms(float gradient, TO_ROW *row) { |
1971 | 362k | float c; // fitted line |
1972 | 362k | int blobcount; // no of blobs |
1973 | 362k | tesseract::DetLineFit lms; |
1974 | 362k | BLOBNBOX_IT blob_it = row->blob_list(); |
1975 | | |
1976 | 362k | blobcount = 0; |
1977 | 2.76M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1978 | 2.39M | if (!blob_it.data()->joined_to_prev()) { |
1979 | 2.39M | const TBOX &box = blob_it.data()->bounding_box(); |
1980 | 2.39M | lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom())); |
1981 | 2.39M | blobcount++; |
1982 | 2.39M | } |
1983 | 2.39M | } |
1984 | 362k | double error = lms.ConstrainedFit(gradient, &c); |
1985 | 362k | row->set_parallel_line(gradient, c, error); |
1986 | 362k | if (textord_straight_baselines && blobcount > textord_lms_line_trials) { |
1987 | 0 | error = lms.Fit(&gradient, &c); |
1988 | 0 | } |
1989 | | // set the other too |
1990 | 362k | row->set_line(gradient, c, error); |
1991 | 362k | } |
1992 | | |
1993 | | /** |
1994 | | * @name make_spline_rows |
1995 | | * |
1996 | | * Re-fit the rows in the block to the given gradient. |
1997 | | */ |
1998 | | void Textord::make_spline_rows(TO_BLOCK *block, // block to do |
1999 | | float gradient, // gradient to fit |
2000 | 16.6k | bool testing_on) { |
2001 | | #ifndef GRAPHICS_DISABLED |
2002 | | ScrollView::Color colour; // of row |
2003 | | if (testing_on && to_win == nullptr) { |
2004 | | create_to_win(page_tr_); |
2005 | | } |
2006 | | #endif |
2007 | 16.6k | TO_ROW_IT row_it = block->get_rows(); |
2008 | | |
2009 | 16.6k | row_it.move_to_first(); |
2010 | 216k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
2011 | 199k | if (row_it.data()->blob_list()->empty()) { |
2012 | 133 | delete row_it.extract(); // nothing in it |
2013 | 199k | } else { |
2014 | 199k | make_baseline_spline(row_it.data(), block); |
2015 | 199k | } |
2016 | 199k | } |
2017 | 16.6k | if (textord_old_baselines) { |
2018 | | #ifndef GRAPHICS_DISABLED |
2019 | | if (testing_on) { |
2020 | | colour = ScrollView::RED; |
2021 | | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
2022 | | row_it.data()->baseline.plot(to_win, colour); |
2023 | | colour = static_cast<ScrollView::Color>(colour + 1); |
2024 | | if (colour > ScrollView::MAGENTA) { |
2025 | | colour = ScrollView::RED; |
2026 | | } |
2027 | | } |
2028 | | } |
2029 | | #endif |
2030 | 16.6k | make_old_baselines(block, testing_on, gradient); |
2031 | 16.6k | } |
2032 | | #ifndef GRAPHICS_DISABLED |
2033 | | if (testing_on) { |
2034 | | colour = ScrollView::RED; |
2035 | | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
2036 | | row_it.data()->baseline.plot(to_win, colour); |
2037 | | colour = static_cast<ScrollView::Color>(colour + 1); |
2038 | | if (colour > ScrollView::MAGENTA) { |
2039 | | colour = ScrollView::RED; |
2040 | | } |
2041 | | } |
2042 | | } |
2043 | | #endif |
2044 | 16.6k | } |
2045 | | |
2046 | | /** |
2047 | | * @name make_baseline_spline |
2048 | | * |
2049 | | * Fit an LMS line to a row. |
2050 | | * Make the fit parallel to the given gradient and set the |
2051 | | * row accordingly. |
2052 | | */ |
2053 | | void make_baseline_spline(TO_ROW *row, // row to fit |
2054 | 199k | TO_BLOCK *block) { |
2055 | 199k | double *coeffs; // quadratic coeffs |
2056 | 199k | int32_t segments; // no of segments |
2057 | | |
2058 | | // spline boundaries |
2059 | 199k | auto *xstarts = new int32_t[row->blob_list()->length() + 1]; |
2060 | 199k | if (segment_baseline(row, block, segments, xstarts) && !textord_straight_baselines && |
2061 | 199k | !textord_parallel_baselines) { |
2062 | 0 | coeffs = linear_spline_baseline(row, block, segments, xstarts); |
2063 | 199k | } else { |
2064 | 199k | xstarts[1] = xstarts[segments]; |
2065 | 199k | segments = 1; |
2066 | 199k | coeffs = new double[3]; |
2067 | 199k | coeffs[0] = 0; |
2068 | 199k | coeffs[1] = row->line_m(); |
2069 | 199k | coeffs[2] = row->line_c(); |
2070 | 199k | } |
2071 | 199k | row->baseline = QSPLINE(segments, xstarts, coeffs); |
2072 | 199k | delete[] coeffs; |
2073 | 199k | delete[] xstarts; |
2074 | 199k | } |
2075 | | |
2076 | | /** |
2077 | | * @name segment_baseline |
2078 | | * |
2079 | | * Divide the baseline up into segments which require a different |
2080 | | * quadratic fitted to them. |
2081 | | * Return true if enough blobs were far enough away to need a quadratic. |
2082 | | */ |
2083 | | bool segment_baseline( // split baseline |
2084 | | TO_ROW *row, // row to fit |
2085 | | TO_BLOCK *block, // block it came from |
2086 | | int32_t &segments, // no fo segments |
2087 | | int32_t *xstarts // coords of segments |
2088 | 199k | ) { |
2089 | 199k | bool needs_curve; // needs curved line |
2090 | 199k | int blobcount; // no of blobs |
2091 | 199k | int blobindex; // current blob |
2092 | 199k | int last_state; // above, on , below |
2093 | 199k | int state; // of current blob |
2094 | 199k | float yshift; // from baseline |
2095 | 199k | TBOX box; // blob box |
2096 | 199k | TBOX new_box; // new_it box |
2097 | 199k | float middle; // xcentre of blob |
2098 | | // blobs |
2099 | 199k | BLOBNBOX_IT blob_it = row->blob_list(); |
2100 | 199k | BLOBNBOX_IT new_it = blob_it; // front end |
2101 | 199k | SORTED_FLOATS yshifts; // shifts from baseline |
2102 | | |
2103 | 199k | needs_curve = false; |
2104 | 199k | box = box_next_pre_chopped(&blob_it); |
2105 | 199k | xstarts[0] = box.left(); |
2106 | 199k | segments = 1; |
2107 | 199k | blobcount = row->blob_list()->length(); |
2108 | 199k | if (textord_oldbl_debug) { |
2109 | 0 | tprintf("Segmenting baseline of %d blobs at (%d,%d)\n", blobcount, box.left(), box.bottom()); |
2110 | 0 | } |
2111 | 199k | if (blobcount <= textord_spline_medianwin || blobcount < textord_spline_minblobs) { |
2112 | 119k | blob_it.move_to_last(); |
2113 | 119k | box = blob_it.data()->bounding_box(); |
2114 | 119k | xstarts[1] = box.right(); |
2115 | 119k | return false; |
2116 | 119k | } |
2117 | 80.7k | last_state = 0; |
2118 | 80.7k | new_it.mark_cycle_pt(); |
2119 | 520k | for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) { |
2120 | 454k | new_box = box_next_pre_chopped(&new_it); |
2121 | 454k | middle = (new_box.left() + new_box.right()) / 2.0; |
2122 | 454k | yshift = new_box.bottom() - row->line_m() * middle - row->line_c(); |
2123 | | // record shift |
2124 | 454k | yshifts.add(yshift, blobindex); |
2125 | 454k | if (new_it.cycled_list()) { |
2126 | 15.2k | xstarts[1] = new_box.right(); |
2127 | 15.2k | return false; |
2128 | 15.2k | } |
2129 | 454k | } |
2130 | 262k | for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++) { |
2131 | 196k | box = box_next_pre_chopped(&blob_it); |
2132 | 196k | } |
2133 | 944k | do { |
2134 | 944k | new_box = box_next_pre_chopped(&new_it); |
2135 | | // get middle one |
2136 | 944k | yshift = yshifts[textord_spline_medianwin / 2]; |
2137 | 944k | if (yshift > textord_spline_shift_fraction * block->line_size) { |
2138 | 398k | state = 1; |
2139 | 545k | } else if (-yshift > textord_spline_shift_fraction * block->line_size) { |
2140 | 251k | state = -1; |
2141 | 293k | } else { |
2142 | 293k | state = 0; |
2143 | 293k | } |
2144 | 944k | if (state != 0) { |
2145 | 650k | needs_curve = true; |
2146 | 650k | } |
2147 | | // tprintf("State=%d, prev=%d, shift=%g\n", |
2148 | | // state,last_state,yshift); |
2149 | 944k | if (state != last_state && blobcount > textord_spline_minblobs) { |
2150 | 17.8k | xstarts[segments++] = box.left(); |
2151 | 17.8k | blobcount = 0; |
2152 | 17.8k | } |
2153 | 944k | last_state = state; |
2154 | 944k | yshifts.remove(blobindex - textord_spline_medianwin); |
2155 | 944k | box = box_next_pre_chopped(&blob_it); |
2156 | 944k | middle = (new_box.left() + new_box.right()) / 2.0; |
2157 | 944k | yshift = new_box.bottom() - row->line_m() * middle - row->line_c(); |
2158 | 944k | yshifts.add(yshift, blobindex); |
2159 | 944k | blobindex++; |
2160 | 944k | blobcount++; |
2161 | 944k | } while (!new_it.cycled_list()); |
2162 | 65.5k | if (blobcount > textord_spline_minblobs || segments == 1) { |
2163 | 57.8k | xstarts[segments] = new_box.right(); |
2164 | 57.8k | } else { |
2165 | 7.66k | xstarts[--segments] = new_box.right(); |
2166 | 7.66k | } |
2167 | 65.5k | if (textord_oldbl_debug) { |
2168 | 0 | tprintf("Made %d segments on row at (%d,%d)\n", segments, box.right(), box.bottom()); |
2169 | 0 | } |
2170 | 65.5k | return needs_curve; |
2171 | 80.7k | } |
2172 | | |
2173 | | /** |
2174 | | * @name linear_spline_baseline |
2175 | | * |
2176 | | * Divide the baseline up into segments which require a different |
2177 | | * quadratic fitted to them. |
2178 | | * @return true if enough blobs were far enough away to need a quadratic. |
2179 | | */ |
2180 | | double *linear_spline_baseline( // split baseline |
2181 | | TO_ROW *row, // row to fit |
2182 | | TO_BLOCK *block, // block it came from |
2183 | | int32_t &segments, // no fo segments |
2184 | | int32_t xstarts[] // coords of segments |
2185 | 0 | ) { |
2186 | 0 | int blobcount; // no of blobs |
2187 | 0 | int blobindex; // current blob |
2188 | 0 | int index1, index2; // blob numbers |
2189 | 0 | int blobs_per_segment; // blobs in each |
2190 | 0 | TBOX box; // blob box |
2191 | 0 | TBOX new_box; // new_it box |
2192 | | // blobs |
2193 | 0 | BLOBNBOX_IT blob_it = row->blob_list(); |
2194 | 0 | BLOBNBOX_IT new_it = blob_it; // front end |
2195 | 0 | float b, c; // fitted curve |
2196 | 0 | tesseract::DetLineFit lms; |
2197 | 0 | int32_t segment; // current segment |
2198 | |
|
2199 | 0 | box = box_next_pre_chopped(&blob_it); |
2200 | 0 | xstarts[0] = box.left(); |
2201 | 0 | blobcount = 1; |
2202 | 0 | while (!blob_it.at_first()) { |
2203 | 0 | blobcount++; |
2204 | 0 | box = box_next_pre_chopped(&blob_it); |
2205 | 0 | } |
2206 | 0 | segments = blobcount / textord_spline_medianwin; |
2207 | 0 | if (segments < 1) { |
2208 | 0 | segments = 1; |
2209 | 0 | } |
2210 | 0 | blobs_per_segment = blobcount / segments; |
2211 | | // quadratic coeffs |
2212 | 0 | auto *coeffs = new double[segments * 3]; |
2213 | 0 | if (textord_oldbl_debug) { |
2214 | 0 | tprintf( |
2215 | 0 | "Linear splining baseline of %d blobs at (%d,%d), into %d segments of " |
2216 | 0 | "%d blobs\n", |
2217 | 0 | blobcount, box.left(), box.bottom(), segments, blobs_per_segment); |
2218 | 0 | } |
2219 | 0 | segment = 1; |
2220 | 0 | for (index2 = 0; index2 < blobs_per_segment / 2; index2++) { |
2221 | 0 | box_next_pre_chopped(&new_it); |
2222 | 0 | } |
2223 | 0 | index1 = 0; |
2224 | 0 | blobindex = index2; |
2225 | 0 | do { |
2226 | 0 | blobindex += blobs_per_segment; |
2227 | 0 | lms.Clear(); |
2228 | 0 | while (index1 < blobindex || (segment == segments && index1 < blobcount)) { |
2229 | 0 | box = box_next_pre_chopped(&blob_it); |
2230 | 0 | int middle = (box.left() + box.right()) / 2; |
2231 | 0 | lms.Add(ICOORD(middle, box.bottom())); |
2232 | 0 | index1++; |
2233 | 0 | if (index1 == blobindex - blobs_per_segment / 2 || index1 == blobcount - 1) { |
2234 | 0 | xstarts[segment] = box.left(); |
2235 | 0 | } |
2236 | 0 | } |
2237 | 0 | lms.Fit(&b, &c); |
2238 | 0 | coeffs[segment * 3 - 3] = 0; |
2239 | 0 | coeffs[segment * 3 - 2] = b; |
2240 | 0 | coeffs[segment * 3 - 1] = c; |
2241 | 0 | segment++; |
2242 | 0 | if (segment > segments) { |
2243 | 0 | break; |
2244 | 0 | } |
2245 | | |
2246 | 0 | blobindex += blobs_per_segment; |
2247 | 0 | lms.Clear(); |
2248 | 0 | while (index2 < blobindex || (segment == segments && index2 < blobcount)) { |
2249 | 0 | new_box = box_next_pre_chopped(&new_it); |
2250 | 0 | int middle = (new_box.left() + new_box.right()) / 2; |
2251 | 0 | lms.Add(ICOORD(middle, new_box.bottom())); |
2252 | 0 | index2++; |
2253 | 0 | if (index2 == blobindex - blobs_per_segment / 2 || index2 == blobcount - 1) { |
2254 | 0 | xstarts[segment] = new_box.left(); |
2255 | 0 | } |
2256 | 0 | } |
2257 | 0 | lms.Fit(&b, &c); |
2258 | 0 | coeffs[segment * 3 - 3] = 0; |
2259 | 0 | coeffs[segment * 3 - 2] = b; |
2260 | 0 | coeffs[segment * 3 - 1] = c; |
2261 | 0 | segment++; |
2262 | 0 | } while (segment <= segments); |
2263 | 0 | return coeffs; |
2264 | 0 | } |
2265 | | |
2266 | | /** |
2267 | | * @name assign_blobs_to_rows |
2268 | | * |
2269 | | * Make enough rows to allocate all the given blobs to one. |
2270 | | * If a block skew is given, use that, else attempt to track it. |
2271 | | */ |
2272 | | void assign_blobs_to_rows( // find lines |
2273 | | TO_BLOCK *block, // block to do |
2274 | | float *gradient, // block skew |
2275 | | int pass, // identification |
2276 | | bool reject_misses, // chuck big ones out |
2277 | | bool make_new_rows, // add rows for unmatched |
2278 | | bool drawing_skew // draw smoothed skew |
2279 | 78.2k | ) { |
2280 | 78.2k | OVERLAP_STATE overlap_result; // what to do with it |
2281 | 78.2k | float ycoord; // current y |
2282 | 78.2k | float top, bottom; // of blob |
2283 | 78.2k | float g_length = 1.0f; // from gradient |
2284 | 78.2k | int16_t row_count; // no of rows |
2285 | 78.2k | int16_t left_x; // left edge |
2286 | 78.2k | int16_t last_x; // previous edge |
2287 | 78.2k | float block_skew; // y delta |
2288 | 78.2k | float smooth_factor; // for new coords |
2289 | 78.2k | float near_dist; // dist to nearest row |
2290 | 78.2k | ICOORD testpt; // testing only |
2291 | 78.2k | BLOBNBOX *blob; // current blob |
2292 | 78.2k | TO_ROW *row; // current row |
2293 | 78.2k | TO_ROW *dest_row = nullptr; // row to put blob in |
2294 | | // iterators |
2295 | 78.2k | BLOBNBOX_IT blob_it = &block->blobs; |
2296 | 78.2k | TO_ROW_IT row_it = block->get_rows(); |
2297 | | |
2298 | 78.2k | ycoord = |
2299 | 78.2k | (block->block->pdblk.bounding_box().bottom() + block->block->pdblk.bounding_box().top()) / |
2300 | 78.2k | 2.0f; |
2301 | 78.2k | if (gradient != nullptr) { |
2302 | 61.5k | g_length = std::sqrt(1 + *gradient * *gradient); |
2303 | 61.5k | } |
2304 | | #ifndef GRAPHICS_DISABLED |
2305 | | if (drawing_skew) { |
2306 | | to_win->SetCursor(block->block->pdblk.bounding_box().left(), ycoord); |
2307 | | } |
2308 | | #endif |
2309 | 78.2k | testpt = ICOORD(textord_test_x, textord_test_y); |
2310 | 78.2k | blob_it.sort(blob_x_order); |
2311 | 78.2k | smooth_factor = 1.0; |
2312 | 78.2k | block_skew = 0.0f; |
2313 | 78.2k | row_count = row_it.length(); // might have rows |
2314 | 78.2k | if (!blob_it.empty()) { |
2315 | 72.4k | left_x = blob_it.data()->bounding_box().left(); |
2316 | 72.4k | } else { |
2317 | 5.71k | left_x = block->block->pdblk.bounding_box().left(); |
2318 | 5.71k | } |
2319 | 78.2k | last_x = left_x; |
2320 | 6.22M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
2321 | 6.14M | blob = blob_it.data(); |
2322 | 6.14M | if (gradient != nullptr) { |
2323 | 4.76M | block_skew = (1 - 1 / g_length) * blob->bounding_box().bottom() + |
2324 | 4.76M | *gradient / g_length * blob->bounding_box().left(); |
2325 | 4.76M | } else if (blob->bounding_box().left() - last_x > block->line_size / 2 && |
2326 | 1.38M | last_x - left_x > block->line_size * 2 && textord_interpolating_skew) { |
2327 | | // tprintf("Interpolating skew from %g",block_skew); |
2328 | 3.06k | block_skew *= static_cast<float>(blob->bounding_box().left() - left_x) / (last_x - left_x); |
2329 | | // tprintf("to %g\n",block_skew); |
2330 | 3.06k | } |
2331 | 6.14M | last_x = blob->bounding_box().left(); |
2332 | 6.14M | top = blob->bounding_box().top() - block_skew; |
2333 | 6.14M | bottom = blob->bounding_box().bottom() - block_skew; |
2334 | | #ifndef GRAPHICS_DISABLED |
2335 | | if (drawing_skew) { |
2336 | | to_win->DrawTo(blob->bounding_box().left(), ycoord + block_skew); |
2337 | | } |
2338 | | #endif |
2339 | 6.14M | if (!row_it.empty()) { |
2340 | 108M | for (row_it.move_to_first(); !row_it.at_last() && row_it.data()->min_y() > top; |
2341 | 102M | row_it.forward()) { |
2342 | 102M | } |
2343 | 6.05M | row = row_it.data(); |
2344 | 6.05M | if (row->min_y() <= top && row->max_y() >= bottom) { |
2345 | | // any overlap |
2346 | 5.38M | dest_row = row; |
2347 | 5.38M | overlap_result = most_overlapping_row(&row_it, dest_row, top, bottom, block->line_size, |
2348 | 5.38M | blob->bounding_box().contains(testpt)); |
2349 | 5.38M | if (overlap_result == NEW_ROW && !reject_misses) { |
2350 | 49.9k | overlap_result = ASSIGN; |
2351 | 49.9k | } |
2352 | 5.38M | } else { |
2353 | 667k | overlap_result = NEW_ROW; |
2354 | 667k | if (!make_new_rows) { |
2355 | 425k | near_dist = row_it.data_relative(-1)->min_y() - top; |
2356 | | // below bottom |
2357 | 425k | if (bottom < row->min_y()) { |
2358 | 86.9k | if (row->min_y() - bottom <= (block->line_spacing - block->line_size) * |
2359 | 86.9k | tesseract::CCStruct::kDescenderFraction) { |
2360 | | // done it |
2361 | 5.43k | overlap_result = ASSIGN; |
2362 | 5.43k | dest_row = row; |
2363 | 5.43k | } |
2364 | 338k | } else if (near_dist > 0 && near_dist < bottom - row->max_y()) { |
2365 | 116k | row_it.backward(); |
2366 | 116k | dest_row = row_it.data(); |
2367 | 116k | if (dest_row->min_y() - bottom <= (block->line_spacing - block->line_size) * |
2368 | 116k | tesseract::CCStruct::kDescenderFraction) { |
2369 | | // done it |
2370 | 14.0k | overlap_result = ASSIGN; |
2371 | 14.0k | } |
2372 | 222k | } else { |
2373 | 222k | if (top - row->max_y() <= |
2374 | 222k | (block->line_spacing - block->line_size) * |
2375 | 222k | (textord_overlap_x + tesseract::CCStruct::kAscenderFraction)) { |
2376 | | // done it |
2377 | 73.6k | overlap_result = ASSIGN; |
2378 | 73.6k | dest_row = row; |
2379 | 73.6k | } |
2380 | 222k | } |
2381 | 425k | } |
2382 | 667k | } |
2383 | 6.05M | if (overlap_result == ASSIGN) { |
2384 | 4.72M | dest_row->add_blob(blob_it.extract(), top, bottom, block->line_size); |
2385 | 4.72M | } |
2386 | 6.05M | if (overlap_result == NEW_ROW) { |
2387 | 687k | if (make_new_rows && top - bottom < block->max_blob_size) { |
2388 | 287k | dest_row = new TO_ROW(blob_it.extract(), top, bottom, block->line_size); |
2389 | 287k | row_count++; |
2390 | 287k | if (bottom > row_it.data()->min_y()) { |
2391 | 211k | row_it.add_before_then_move(dest_row); |
2392 | | // insert in right place |
2393 | 211k | } else { |
2394 | 76.6k | row_it.add_after_then_move(dest_row); |
2395 | 76.6k | } |
2396 | 287k | smooth_factor = 1.0 / (row_count * textord_skew_lag + textord_skewsmooth_offset); |
2397 | 399k | } else { |
2398 | 399k | overlap_result = REJECT; |
2399 | 399k | } |
2400 | 687k | } |
2401 | 6.05M | } else if (make_new_rows && top - bottom < block->max_blob_size) { |
2402 | 21.0k | overlap_result = NEW_ROW; |
2403 | 21.0k | dest_row = new TO_ROW(blob_it.extract(), top, bottom, block->line_size); |
2404 | 21.0k | row_count++; |
2405 | 21.0k | row_it.add_after_then_move(dest_row); |
2406 | 21.0k | smooth_factor = 1.0 / (row_count * textord_skew_lag + textord_skewsmooth_offset2); |
2407 | 73.6k | } else { |
2408 | 73.6k | overlap_result = REJECT; |
2409 | 73.6k | } |
2410 | 6.14M | if (blob->bounding_box().contains(testpt) && textord_debug_blob) { |
2411 | 0 | if (overlap_result != REJECT) { |
2412 | 0 | tprintf("Test blob assigned to row at (%g,%g) on pass %d\n", dest_row->min_y(), |
2413 | 0 | dest_row->max_y(), pass); |
2414 | 0 | } else { |
2415 | 0 | tprintf("Test blob assigned to no row on pass %d\n", pass); |
2416 | 0 | } |
2417 | 0 | } |
2418 | 6.14M | if (overlap_result != REJECT) { |
2419 | 5.03M | while (!row_it.at_first() && row_it.data()->min_y() > row_it.data_relative(-1)->min_y()) { |
2420 | 37 | row = row_it.extract(); |
2421 | 37 | row_it.backward(); |
2422 | 37 | row_it.add_before_then_move(row); |
2423 | 37 | } |
2424 | 5.03M | while (!row_it.at_last() && row_it.data()->min_y() < row_it.data_relative(1)->min_y()) { |
2425 | 148 | row = row_it.extract(); |
2426 | 148 | row_it.forward(); |
2427 | | // Keep rows in order. |
2428 | 148 | row_it.add_after_then_move(row); |
2429 | 148 | } |
2430 | 5.03M | BLOBNBOX_IT added_blob_it(dest_row->blob_list()); |
2431 | 5.03M | added_blob_it.move_to_last(); |
2432 | 5.03M | TBOX prev_box = added_blob_it.data_relative(-1)->bounding_box(); |
2433 | 5.03M | if (dest_row->blob_list()->singleton() || !prev_box.major_x_overlap(blob->bounding_box())) { |
2434 | 3.80M | block_skew = (1 - smooth_factor) * block_skew + |
2435 | 3.80M | smooth_factor * (blob->bounding_box().bottom() - dest_row->initial_min_y()); |
2436 | 3.80M | } |
2437 | 5.03M | } |
2438 | 6.14M | } |
2439 | 972k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
2440 | 894k | if (row_it.data()->blob_list()->empty()) { |
2441 | 3.89k | delete row_it.extract(); // Discard empty rows. |
2442 | 3.89k | } |
2443 | 894k | } |
2444 | 78.2k | } |
2445 | | |
2446 | | /** |
2447 | | * @name most_overlapping_row |
2448 | | * |
2449 | | * Return the row which most overlaps the blob. |
2450 | | */ |
2451 | | OVERLAP_STATE most_overlapping_row( // find best row |
2452 | | TO_ROW_IT *row_it, // iterator |
2453 | | TO_ROW *&best_row, // output row |
2454 | | float top, // top of blob |
2455 | | float bottom, // bottom of blob |
2456 | | float rowsize, // max row size |
2457 | | bool testing_blob // test stuff |
2458 | 5.38M | ) { |
2459 | 5.38M | OVERLAP_STATE result; // result of tests |
2460 | 5.38M | float overlap; // of blob & row |
2461 | 5.38M | float bestover; // nearest row |
2462 | 5.38M | float merge_top, merge_bottom; // size of merged row |
2463 | 5.38M | ICOORD testpt; // testing only |
2464 | 5.38M | TO_ROW *row; // current row |
2465 | 5.38M | TO_ROW *test_row; // for multiple overlaps |
2466 | 5.38M | BLOBNBOX_IT blob_it; // for merging rows |
2467 | | |
2468 | 5.38M | result = ASSIGN; |
2469 | 5.38M | row = row_it->data(); |
2470 | 5.38M | bestover = top - bottom; |
2471 | 5.38M | if (top > row->max_y()) { |
2472 | 541k | bestover -= top - row->max_y(); |
2473 | 541k | } |
2474 | 5.38M | if (bottom < row->min_y()) { |
2475 | | // compute overlap |
2476 | 3.31M | bestover -= row->min_y() - bottom; |
2477 | 3.31M | } |
2478 | 5.38M | if (testing_blob && textord_debug_blob) { |
2479 | 0 | tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f\n", bottom, top, row->min_y(), |
2480 | 0 | row->max_y(), rowsize, bestover); |
2481 | 0 | } |
2482 | 5.38M | test_row = row; |
2483 | 13.5M | do { |
2484 | 13.5M | if (!row_it->at_last()) { |
2485 | 13.2M | row_it->forward(); |
2486 | 13.2M | test_row = row_it->data(); |
2487 | 13.2M | if (test_row->min_y() <= top && test_row->max_y() >= bottom) { |
2488 | 8.27M | merge_top = test_row->max_y() > row->max_y() ? test_row->max_y() : row->max_y(); |
2489 | 8.27M | merge_bottom = test_row->min_y() < row->min_y() ? test_row->min_y() : row->min_y(); |
2490 | 8.27M | if (merge_top - merge_bottom <= rowsize) { |
2491 | 9.03k | if (testing_blob && textord_debug_blob) { |
2492 | 0 | tprintf("Merging rows at (%g,%g), (%g,%g)\n", row->min_y(), row->max_y(), |
2493 | 0 | test_row->min_y(), test_row->max_y()); |
2494 | 0 | } |
2495 | 9.03k | test_row->set_limits(merge_bottom, merge_top); |
2496 | 9.03k | blob_it.set_to_list(test_row->blob_list()); |
2497 | 9.03k | blob_it.add_list_after(row->blob_list()); |
2498 | 9.03k | blob_it.sort(blob_x_order); |
2499 | 9.03k | row_it->backward(); |
2500 | 9.03k | delete row_it->extract(); |
2501 | 9.03k | row_it->forward(); |
2502 | 9.03k | bestover = -1.0f; // force replacement |
2503 | 9.03k | } |
2504 | 8.27M | overlap = top - bottom; |
2505 | 8.27M | if (top > test_row->max_y()) { |
2506 | 4.60M | overlap -= top - test_row->max_y(); |
2507 | 4.60M | } |
2508 | 8.27M | if (bottom < test_row->min_y()) { |
2509 | 1.82M | overlap -= test_row->min_y() - bottom; |
2510 | 1.82M | } |
2511 | 8.27M | if (bestover >= rowsize - 1 && overlap >= rowsize - 1) { |
2512 | 711k | result = REJECT; |
2513 | 711k | } |
2514 | 8.27M | if (overlap > bestover) { |
2515 | 4.30M | bestover = overlap; // find biggest overlap |
2516 | 4.30M | row = test_row; |
2517 | 4.30M | } |
2518 | 8.27M | if (testing_blob && textord_debug_blob) { |
2519 | 0 | tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f->%f\n", bottom, top, |
2520 | 0 | test_row->min_y(), test_row->max_y(), rowsize, overlap, bestover); |
2521 | 0 | } |
2522 | 8.27M | } |
2523 | 13.2M | } |
2524 | 13.5M | } while (!row_it->at_last() && test_row->min_y() <= top && test_row->max_y() >= bottom); |
2525 | 14.3M | while (row_it->data() != row) { |
2526 | 8.93M | row_it->backward(); // make it point to row |
2527 | 8.93M | } |
2528 | | // doesn't overlap much |
2529 | 5.38M | if (top - bottom - bestover > rowsize * textord_overlap_x && |
2530 | 5.38M | (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x) && result == ASSIGN) { |
2531 | 163k | result = NEW_ROW; // doesn't overlap enough |
2532 | 163k | } |
2533 | 5.38M | best_row = row; |
2534 | 5.38M | return result; |
2535 | 5.38M | } |
2536 | | |
2537 | | /** |
2538 | | * @name blob_x_order |
2539 | | * |
2540 | | * Sort function to sort blobs in x from page left. |
2541 | | */ |
2542 | | int blob_x_order( // sort function |
2543 | | const void *item1, // items to compare |
2544 | 46.5M | const void *item2) { |
2545 | | // converted ptr |
2546 | 46.5M | const BLOBNBOX *blob1 = *reinterpret_cast<const BLOBNBOX *const *>(item1); |
2547 | | // converted ptr |
2548 | 46.5M | const BLOBNBOX *blob2 = *reinterpret_cast<const BLOBNBOX *const *>(item2); |
2549 | | |
2550 | 46.5M | if (blob1->bounding_box().left() < blob2->bounding_box().left()) { |
2551 | 21.3M | return -1; |
2552 | 25.2M | } else if (blob1->bounding_box().left() > blob2->bounding_box().left()) { |
2553 | 16.7M | return 1; |
2554 | 16.7M | } else { |
2555 | 8.46M | return 0; |
2556 | 8.46M | } |
2557 | 46.5M | } |
2558 | | |
2559 | | /** |
2560 | | * @name mark_repeated_chars |
2561 | | * |
2562 | | * Mark blobs marked with BTFT_LEADER in repeated sets using the |
2563 | | * repeated_set member of BLOBNBOX. |
2564 | | */ |
2565 | 199k | void mark_repeated_chars(TO_ROW *row) { |
2566 | 199k | BLOBNBOX_IT box_it(row->blob_list()); // Iterator. |
2567 | 199k | int num_repeated_sets = 0; |
2568 | 199k | if (!box_it.empty()) { |
2569 | 2.87M | do { |
2570 | 2.87M | BLOBNBOX *bblob = box_it.data(); |
2571 | 2.87M | int repeat_length = 1; |
2572 | 2.87M | if (bblob->flow() == BTFT_LEADER && !bblob->joined_to_prev() && bblob->cblob() != nullptr) { |
2573 | 0 | BLOBNBOX_IT test_it(box_it); |
2574 | 0 | for (test_it.forward(); !test_it.at_first();) { |
2575 | 0 | bblob = test_it.data(); |
2576 | 0 | if (bblob->flow() != BTFT_LEADER) { |
2577 | 0 | break; |
2578 | 0 | } |
2579 | 0 | test_it.forward(); |
2580 | 0 | bblob = test_it.data(); |
2581 | 0 | if (bblob->joined_to_prev() || bblob->cblob() == nullptr) { |
2582 | 0 | repeat_length = 0; |
2583 | 0 | break; |
2584 | 0 | } |
2585 | 0 | ++repeat_length; |
2586 | 0 | } |
2587 | 0 | } |
2588 | 2.87M | if (repeat_length >= kMinLeaderCount) { |
2589 | 0 | num_repeated_sets++; |
2590 | 0 | for (; repeat_length > 0; box_it.forward(), --repeat_length) { |
2591 | 0 | bblob = box_it.data(); |
2592 | 0 | bblob->set_repeated_set(num_repeated_sets); |
2593 | 0 | } |
2594 | 2.87M | } else { |
2595 | 2.87M | bblob->set_repeated_set(0); |
2596 | 2.87M | box_it.forward(); |
2597 | 2.87M | } |
2598 | 2.87M | } while (!box_it.at_first()); // until all done |
2599 | 199k | } |
2600 | 199k | row->set_num_repeated_sets(num_repeated_sets); |
2601 | 199k | } |
2602 | | |
2603 | | } // namespace tesseract |