/src/tesseract/src/textord/makerow.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: makerow.cpp (Formerly makerows.c) |
3 | | * Description: Code to arrange blobs into rows of text. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1992, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | // Include automatically generated configuration file if running autoconf. |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" |
22 | | #endif |
23 | | |
24 | | #include "makerow.h" |
25 | | |
26 | | #include "blkocc.h" |
27 | | #include "blobbox.h" |
28 | | #include "ccstruct.h" |
29 | | #include "detlinefit.h" |
30 | | #include "drawtord.h" |
31 | | #include "oldbasel.h" |
32 | | #include "sortflts.h" |
33 | | #include "statistc.h" |
34 | | #include "textord.h" |
35 | | #include "tordmain.h" |
36 | | #include "tovars.h" |
37 | | #include "tprintf.h" |
38 | | #include "underlin.h" |
39 | | |
40 | | #include <algorithm> |
41 | | #include <cmath> |
42 | | #include <vector> // for std::vector |
43 | | |
44 | | namespace tesseract { |
45 | | |
46 | | BOOL_VAR(textord_heavy_nr, false, "Vigorously remove noise"); |
47 | | BOOL_VAR(textord_show_initial_rows, false, "Display row accumulation"); |
48 | | BOOL_VAR(textord_show_parallel_rows, false, "Display page correlated rows"); |
49 | | BOOL_VAR(textord_show_expanded_rows, false, "Display rows after expanding"); |
50 | | BOOL_VAR(textord_show_final_rows, false, "Display rows after final fitting"); |
51 | | BOOL_VAR(textord_show_final_blobs, false, "Display blob bounds after pre-ass"); |
52 | | BOOL_VAR(textord_test_landscape, false, "Tests refer to land/port"); |
53 | | BOOL_VAR(textord_parallel_baselines, true, "Force parallel baselines"); |
54 | | BOOL_VAR(textord_straight_baselines, false, "Force straight baselines"); |
55 | | BOOL_VAR(textord_old_baselines, true, "Use old baseline algorithm"); |
56 | | BOOL_VAR(textord_old_xheight, false, "Use old xheight algorithm"); |
57 | | BOOL_VAR(textord_fix_xheight_bug, true, "Use spline baseline"); |
58 | | BOOL_VAR(textord_fix_makerow_bug, true, "Prevent multiple baselines"); |
59 | | BOOL_VAR(textord_debug_xheights, false, "Test xheight algorithms"); |
60 | | static BOOL_VAR(textord_biased_skewcalc, true, "Bias skew estimates with line length"); |
61 | | static BOOL_VAR(textord_interpolating_skew, true, "Interpolate across gaps"); |
62 | | static INT_VAR(textord_skewsmooth_offset, 4, "For smooth factor"); |
63 | | static INT_VAR(textord_skewsmooth_offset2, 1, "For smooth factor"); |
64 | | INT_VAR(textord_test_x, -INT32_MAX, "coord of test pt"); |
65 | | INT_VAR(textord_test_y, -INT32_MAX, "coord of test pt"); |
66 | | INT_VAR(textord_min_blobs_in_row, 4, "Min blobs before gradient counted"); |
67 | | INT_VAR(textord_spline_minblobs, 8, "Min blobs in each spline segment"); |
68 | | INT_VAR(textord_spline_medianwin, 6, "Size of window for spline segmentation"); |
69 | | static INT_VAR(textord_max_blob_overlaps, 4, "Max number of blobs a big blob can overlap"); |
70 | | INT_VAR(textord_min_xheight, 10, "Min credible pixel xheight"); |
71 | | double_VAR(textord_spline_shift_fraction, 0.02, "Fraction of line spacing for quad"); |
72 | | double_VAR(textord_skew_ile, 0.5, "Ile of gradients for page skew"); |
73 | | double_VAR(textord_skew_lag, 0.02, "Lag for skew on row accumulation"); |
74 | | double_VAR(textord_linespace_iqrlimit, 0.2, "Max iqr/median for linespace"); |
75 | | double_VAR(textord_width_limit, 8, "Max width of blobs to make rows"); |
76 | | double_VAR(textord_chop_width, 1.5, "Max width before chopping"); |
77 | | static double_VAR(textord_expansion_factor, 1.0, "Factor to expand rows by in expand_rows"); |
78 | | static double_VAR(textord_overlap_x, 0.375, "Fraction of linespace for good overlap"); |
79 | | double_VAR(textord_minxh, 0.25, "fraction of linesize for min xheight"); |
80 | | double_VAR(textord_min_linesize, 1.25, "* blob height for initial linesize"); |
81 | | double_VAR(textord_excess_blobsize, 1.3, "New row made if blob makes row this big"); |
82 | | double_VAR(textord_occupancy_threshold, 0.4, "Fraction of neighbourhood"); |
83 | | double_VAR(textord_underline_width, 2.0, "Multiple of line_size for underline"); |
84 | | double_VAR(textord_min_blob_height_fraction, 0.75, |
85 | | "Min blob height/top to include blob top into xheight stats"); |
86 | | double_VAR(textord_xheight_mode_fraction, 0.4, "Min pile height to make xheight"); |
87 | | double_VAR(textord_ascheight_mode_fraction, 0.08, "Min pile height to make ascheight"); |
88 | | static double_VAR(textord_descheight_mode_fraction, 0.08, "Min pile height to make descheight"); |
89 | | double_VAR(textord_ascx_ratio_min, 1.25, "Min cap/xheight"); |
90 | | double_VAR(textord_ascx_ratio_max, 1.8, "Max cap/xheight"); |
91 | | double_VAR(textord_descx_ratio_min, 0.25, "Min desc/xheight"); |
92 | | double_VAR(textord_descx_ratio_max, 0.6, "Max desc/xheight"); |
93 | | double_VAR(textord_xheight_error_margin, 0.1, "Accepted variation"); |
94 | | INT_VAR(textord_lms_line_trials, 12, "Number of linew fits to do"); |
95 | | BOOL_VAR(textord_new_initial_xheight, true, "Use test xheight mechanism"); |
96 | | BOOL_VAR(textord_debug_blob, false, "Print test blob information"); |
97 | | |
98 | 29.0k | #define MAX_HEIGHT_MODES 12 |
99 | | |
100 | | const int kMinLeaderCount = 5; |
101 | | |
102 | | /** |
103 | | * @name row_y_order |
104 | | * |
105 | | * Sort function to sort rows in y from page top. |
106 | | */ |
107 | | static int row_y_order( // sort function |
108 | | const TO_ROW *row1, // items to compare |
109 | 97.9k | const TO_ROW *row2) { |
110 | 97.9k | if (row1->parallel_c() > row2->parallel_c()) { |
111 | 5.15k | return -1; |
112 | 92.8k | } else if (row1->parallel_c() < row2->parallel_c()) { |
113 | 92.2k | return 1; |
114 | 92.2k | } else { |
115 | 542 | return 0; |
116 | 542 | } |
117 | 97.9k | } |
118 | | |
119 | | /** |
120 | | * @name row_spacing_order |
121 | | * |
122 | | * Qsort style function to compare 2 TO_ROWS based on their spacing value. |
123 | | */ |
124 | | static int row_spacing_order( // sort function |
125 | | const TO_ROW *row1, // items to compare |
126 | 170k | const TO_ROW *row2) { |
127 | 170k | return row1->spacing < row2->spacing; |
128 | 170k | } |
129 | | |
130 | | // Factored-out helper to build a single row from a list of blobs. |
131 | | // Returns the mean blob size. |
132 | 0 | static float MakeRowFromBlobs(float line_size, BLOBNBOX_IT *blob_it, TO_ROW_IT *row_it) { |
133 | 0 | blob_it->sort(blob_x_order); |
134 | 0 | blob_it->move_to_first(); |
135 | 0 | TO_ROW *row = nullptr; |
136 | 0 | float total_size = 0.0f; |
137 | 0 | int blob_count = 0; |
138 | | // Add all the blobs to a single TO_ROW. |
139 | 0 | for (; !blob_it->empty(); blob_it->forward()) { |
140 | 0 | BLOBNBOX *blob = blob_it->extract(); |
141 | 0 | int top = blob->bounding_box().top(); |
142 | 0 | int bottom = blob->bounding_box().bottom(); |
143 | 0 | if (row == nullptr) { |
144 | 0 | row = new TO_ROW(blob, top, bottom, line_size); |
145 | 0 | row_it->add_before_then_move(row); |
146 | 0 | } else { |
147 | 0 | row->add_blob(blob, top, bottom, line_size); |
148 | 0 | } |
149 | 0 | total_size += top - bottom; |
150 | 0 | ++blob_count; |
151 | 0 | } |
152 | 0 | return blob_count > 0 ? total_size / blob_count : total_size; |
153 | 0 | } |
154 | | |
155 | | // Helper to make a row using the children of a single blob. |
156 | | // Returns the mean size of the blobs created. |
157 | 0 | static float MakeRowFromSubBlobs(TO_BLOCK *block, C_BLOB *blob, TO_ROW_IT *row_it) { |
158 | | // The blobs made from the children will go in the small_blobs list. |
159 | 0 | BLOBNBOX_IT bb_it(&block->small_blobs); |
160 | 0 | C_OUTLINE_IT ol_it(blob->out_list()); |
161 | | // Get the children. |
162 | 0 | ol_it.set_to_list(ol_it.data()->child()); |
163 | 0 | if (ol_it.empty()) { |
164 | 0 | return 0.0f; |
165 | 0 | } |
166 | 0 | for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { |
167 | | // Deep copy the child outline and use that to make a blob. |
168 | 0 | blob = new C_BLOB(C_OUTLINE::deep_copy(ol_it.data())); |
169 | | // Correct direction as needed. |
170 | 0 | blob->CheckInverseFlagAndDirection(); |
171 | 0 | auto *bbox = new BLOBNBOX(blob); |
172 | 0 | bb_it.add_after_then_move(bbox); |
173 | 0 | } |
174 | | // Now we can make a row from the blobs. |
175 | 0 | return MakeRowFromBlobs(block->line_size, &bb_it, row_it); |
176 | 0 | } |
177 | | |
178 | | /** |
179 | | * @name make_single_row |
180 | | * |
181 | | * Arrange the blobs into a single row... well actually, if there is |
182 | | * only a single blob, it makes 2 rows, in case the top-level blob |
183 | | * is a container of the real blobs to recognize. |
184 | | */ |
185 | | float make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK *block, |
186 | 0 | TO_BLOCK_LIST *blocks) { |
187 | 0 | BLOBNBOX_IT blob_it = &block->blobs; |
188 | 0 | TO_ROW_IT row_it = block->get_rows(); |
189 | | |
190 | | // Include all the small blobs and large blobs. |
191 | 0 | blob_it.add_list_after(&block->small_blobs); |
192 | 0 | blob_it.add_list_after(&block->noise_blobs); |
193 | 0 | blob_it.add_list_after(&block->large_blobs); |
194 | 0 | if (block->blobs.singleton() && allow_sub_blobs) { |
195 | 0 | blob_it.move_to_first(); |
196 | 0 | float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it); |
197 | 0 | if (size > block->line_size) { |
198 | 0 | block->line_size = size; |
199 | 0 | } |
200 | 0 | } else if (block->blobs.empty()) { |
201 | | // Make a fake blob. |
202 | 0 | C_BLOB *blob = C_BLOB::FakeBlob(block->block->pdblk.bounding_box()); |
203 | | // The blobnbox owns the blob. |
204 | 0 | auto *bblob = new BLOBNBOX(blob); |
205 | 0 | blob_it.add_after_then_move(bblob); |
206 | 0 | } |
207 | 0 | MakeRowFromBlobs(block->line_size, &blob_it, &row_it); |
208 | | // Fit an LMS line to the rows. |
209 | 0 | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
210 | 0 | fit_lms_line(row_it.data()); |
211 | 0 | } |
212 | 0 | float gradient; |
213 | 0 | float fit_error; |
214 | | // Compute the skew based on the fitted line. |
215 | 0 | compute_page_skew(blocks, gradient, fit_error); |
216 | 0 | return gradient; |
217 | 0 | } |
218 | | |
219 | | /** |
220 | | * @name make_rows |
221 | | * |
222 | | * Arrange the blobs into rows. |
223 | | */ |
224 | 7.74k | float make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks) { |
225 | 7.74k | float port_m; // global skew |
226 | 7.74k | float port_err; // global noise |
227 | 7.74k | TO_BLOCK_IT block_it; // iterator |
228 | | |
229 | 7.74k | block_it.set_to_list(port_blocks); |
230 | 15.4k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
231 | 7.74k | make_initial_textrows(page_tr, block_it.data(), FCOORD(1.0f, 0.0f), !textord_test_landscape); |
232 | 7.74k | } |
233 | | // compute globally |
234 | 7.74k | compute_page_skew(port_blocks, port_m, port_err); |
235 | 7.74k | block_it.set_to_list(port_blocks); |
236 | 15.4k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
237 | 7.74k | cleanup_rows_making(page_tr, block_it.data(), port_m, FCOORD(1.0f, 0.0f), |
238 | 7.74k | block_it.data()->block->pdblk.bounding_box().left(), |
239 | 7.74k | !textord_test_landscape); |
240 | 7.74k | } |
241 | 7.74k | return port_m; // global skew |
242 | 7.74k | } |
243 | | |
244 | | /** |
245 | | * @name make_initial_textrows |
246 | | * |
247 | | * Arrange the good blobs into rows of text. |
248 | | */ |
249 | | void make_initial_textrows( // find lines |
250 | | ICOORD page_tr, |
251 | | TO_BLOCK *block, // block to do |
252 | | FCOORD rotation, // for drawing |
253 | | bool testing_on // correct orientation |
254 | 7.74k | ) { |
255 | 7.74k | TO_ROW_IT row_it = block->get_rows(); |
256 | | |
257 | | #ifndef GRAPHICS_DISABLED |
258 | | ScrollView::Color colour; // of row |
259 | | |
260 | | if (textord_show_initial_rows && testing_on) { |
261 | | if (to_win == nullptr) { |
262 | | create_to_win(page_tr); |
263 | | } |
264 | | } |
265 | | #endif |
266 | | // guess skew |
267 | 7.74k | assign_blobs_to_rows(block, nullptr, 0, true, true, textord_show_initial_rows && testing_on); |
268 | 7.74k | row_it.move_to_first(); |
269 | 77.6k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
270 | 69.9k | fit_lms_line(row_it.data()); |
271 | 69.9k | } |
272 | | #ifndef GRAPHICS_DISABLED |
273 | | if (textord_show_initial_rows && testing_on) { |
274 | | colour = ScrollView::RED; |
275 | | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
276 | | plot_to_row(row_it.data(), colour, rotation); |
277 | | colour = static_cast<ScrollView::Color>(colour + 1); |
278 | | if (colour > ScrollView::MAGENTA) { |
279 | | colour = ScrollView::RED; |
280 | | } |
281 | | } |
282 | | } |
283 | | #endif |
284 | 7.74k | } |
285 | | |
286 | | /** |
287 | | * @name fit_lms_line |
288 | | * |
289 | | * Fit an LMS line to a row. |
290 | | */ |
291 | 69.9k | void fit_lms_line(TO_ROW *row) { |
292 | 69.9k | float m, c; // fitted line |
293 | 69.9k | tesseract::DetLineFit lms; |
294 | 69.9k | BLOBNBOX_IT blob_it = row->blob_list(); |
295 | | |
296 | 306k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
297 | 236k | const TBOX &box = blob_it.data()->bounding_box(); |
298 | 236k | lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom())); |
299 | 236k | } |
300 | 69.9k | double error = lms.Fit(&m, &c); |
301 | 69.9k | row->set_line(m, c, error); |
302 | 69.9k | } |
303 | | |
304 | | /** |
305 | | * @name compute_page_skew |
306 | | * |
307 | | * Compute the skew over a full page by averaging the gradients over |
308 | | * all the lines. Get the error of the same row. |
309 | | */ |
310 | | void compute_page_skew( // get average gradient |
311 | | TO_BLOCK_LIST *blocks, // list of blocks |
312 | | float &page_m, // average gradient |
313 | | float &page_err // average error |
314 | 7.74k | ) { |
315 | 7.74k | int32_t row_count; // total rows |
316 | 7.74k | int32_t blob_count; // total_blobs |
317 | 7.74k | int32_t row_err; // integer error |
318 | 7.74k | int32_t row_index; // of total |
319 | 7.74k | TO_ROW *row; // current row |
320 | 7.74k | TO_BLOCK_IT block_it = blocks; // iterator |
321 | | |
322 | 7.74k | row_count = 0; |
323 | 7.74k | blob_count = 0; |
324 | 15.4k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
325 | 7.74k | POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block(); |
326 | 7.74k | if (pb != nullptr && !pb->IsText()) { |
327 | 0 | continue; // Pretend non-text blocks don't exist. |
328 | 0 | } |
329 | 7.74k | row_count += block_it.data()->get_rows()->length(); |
330 | | // count up rows |
331 | 7.74k | TO_ROW_IT row_it(block_it.data()->get_rows()); |
332 | 77.6k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
333 | 69.9k | blob_count += row_it.data()->blob_list()->length(); |
334 | 69.9k | } |
335 | 7.74k | } |
336 | 7.74k | if (row_count == 0) { |
337 | 207 | page_m = 0.0f; |
338 | 207 | page_err = 0.0f; |
339 | 207 | return; |
340 | 207 | } |
341 | | // of rows |
342 | 7.53k | std::vector<float> gradients(blob_count); |
343 | | // of rows |
344 | 7.53k | std::vector<float> errors(blob_count); |
345 | | |
346 | 7.53k | row_index = 0; |
347 | 15.0k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
348 | 7.53k | POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block(); |
349 | 7.53k | if (pb != nullptr && !pb->IsText()) { |
350 | 0 | continue; // Pretend non-text blocks don't exist. |
351 | 0 | } |
352 | 7.53k | TO_ROW_IT row_it(block_it.data()->get_rows()); |
353 | 77.4k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
354 | 69.9k | row = row_it.data(); |
355 | 69.9k | blob_count = row->blob_list()->length(); |
356 | 69.9k | row_err = static_cast<int32_t>(std::ceil(row->line_error())); |
357 | 69.9k | if (row_err <= 0) { |
358 | 55.2k | row_err = 1; |
359 | 55.2k | } |
360 | 69.9k | if (textord_biased_skewcalc) { |
361 | 69.9k | blob_count /= row_err; |
362 | 240k | for (blob_count /= row_err; blob_count > 0; blob_count--) { |
363 | 170k | gradients[row_index] = row->line_m(); |
364 | 170k | errors[row_index] = row->line_error(); |
365 | 170k | row_index++; |
366 | 170k | } |
367 | 69.9k | } else if (blob_count >= textord_min_blobs_in_row) { |
368 | | // get gradient |
369 | 0 | gradients[row_index] = row->line_m(); |
370 | 0 | errors[row_index] = row->line_error(); |
371 | 0 | row_index++; |
372 | 0 | } |
373 | 69.9k | } |
374 | 7.53k | } |
375 | 7.53k | if (row_index == 0) { |
376 | | // desperate |
377 | 150 | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
378 | 75 | POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block(); |
379 | 75 | if (pb != nullptr && !pb->IsText()) { |
380 | 0 | continue; // Pretend non-text blocks don't exist. |
381 | 0 | } |
382 | 75 | TO_ROW_IT row_it(block_it.data()->get_rows()); |
383 | 321 | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
384 | 246 | row = row_it.data(); |
385 | 246 | gradients[row_index] = row->line_m(); |
386 | 246 | errors[row_index] = row->line_error(); |
387 | 246 | row_index++; |
388 | 246 | } |
389 | 75 | } |
390 | 75 | } |
391 | 7.53k | row_count = row_index; |
392 | 7.53k | row_index = static_cast<int32_t>(row_count * textord_skew_ile); |
393 | 7.53k | gradients.resize(row_count); |
394 | 7.53k | std::nth_element(gradients.begin(), gradients.begin() + row_index, gradients.end()); |
395 | 7.53k | page_m = gradients[row_index]; |
396 | 7.53k | row_index = static_cast<int32_t>(row_count * textord_skew_ile); |
397 | 7.53k | errors.resize(row_count); |
398 | 7.53k | std::nth_element(errors.begin(), errors.begin() + row_index, errors.end()); |
399 | 7.53k | page_err = errors[row_index]; |
400 | 7.53k | } |
401 | | |
402 | | const double kNoiseSize = 0.5; // Fraction of xheight. |
403 | | const int kMinSize = 8; // Min pixels to be xheight. |
404 | | |
405 | | /** |
406 | | * Return true if the dot looks like it is part of the i. |
407 | | * Doesn't work for any other diacritical. |
408 | | */ |
409 | 0 | static bool dot_of_i(BLOBNBOX *dot, BLOBNBOX *i, TO_ROW *row) { |
410 | 0 | const TBOX &ibox = i->bounding_box(); |
411 | 0 | const TBOX &dotbox = dot->bounding_box(); |
412 | | |
413 | | // Must overlap horizontally by enough and be high enough. |
414 | 0 | int overlap = std::min(dotbox.right(), ibox.right()) - std::max(dotbox.left(), ibox.left()); |
415 | 0 | if (ibox.height() <= 2 * dotbox.height() || |
416 | 0 | (overlap * 2 < ibox.width() && overlap < dotbox.width())) { |
417 | 0 | return false; |
418 | 0 | } |
419 | | |
420 | | // If the i is tall and thin then it is good. |
421 | 0 | if (ibox.height() > ibox.width() * 2) { |
422 | 0 | return true; // The i or ! must be tall and thin. |
423 | 0 | } |
424 | | |
425 | | // It might still be tall and thin, but it might be joined to something. |
426 | | // So search the outline for a piece of large height close to the edges |
427 | | // of the dot. |
428 | 0 | const double kHeightFraction = 0.6; |
429 | 0 | double target_height = std::min(dotbox.bottom(), ibox.top()); |
430 | 0 | target_height -= row->line_m() * dotbox.left() + row->line_c(); |
431 | 0 | target_height *= kHeightFraction; |
432 | 0 | int left_min = dotbox.left() - dotbox.width(); |
433 | 0 | int middle = (dotbox.left() + dotbox.right()) / 2; |
434 | 0 | int right_max = dotbox.right() + dotbox.width(); |
435 | 0 | int left_miny = 0; |
436 | 0 | int left_maxy = 0; |
437 | 0 | int right_miny = 0; |
438 | 0 | int right_maxy = 0; |
439 | 0 | bool found_left = false; |
440 | 0 | bool found_right = false; |
441 | 0 | bool in_left = false; |
442 | 0 | bool in_right = false; |
443 | 0 | C_BLOB *blob = i->cblob(); |
444 | 0 | C_OUTLINE_IT o_it = blob->out_list(); |
445 | 0 | for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) { |
446 | 0 | C_OUTLINE *outline = o_it.data(); |
447 | 0 | int length = outline->pathlength(); |
448 | 0 | ICOORD pos = outline->start_pos(); |
449 | 0 | for (int step = 0; step < length; pos += outline->step(step++)) { |
450 | 0 | int x = pos.x(); |
451 | 0 | int y = pos.y(); |
452 | 0 | if (x >= left_min && x < middle && !found_left) { |
453 | | // We are in the left part so find min and max y. |
454 | 0 | if (in_left) { |
455 | 0 | if (y > left_maxy) { |
456 | 0 | left_maxy = y; |
457 | 0 | } |
458 | 0 | if (y < left_miny) { |
459 | 0 | left_miny = y; |
460 | 0 | } |
461 | 0 | } else { |
462 | 0 | left_maxy = left_miny = y; |
463 | 0 | in_left = true; |
464 | 0 | } |
465 | 0 | } else if (in_left) { |
466 | | // We just left the left so look for size. |
467 | 0 | if (left_maxy - left_miny > target_height) { |
468 | 0 | if (found_right) { |
469 | 0 | return true; |
470 | 0 | } |
471 | 0 | found_left = true; |
472 | 0 | } |
473 | 0 | in_left = false; |
474 | 0 | } |
475 | 0 | if (x <= right_max && x > middle && !found_right) { |
476 | | // We are in the right part so find min and max y. |
477 | 0 | if (in_right) { |
478 | 0 | if (y > right_maxy) { |
479 | 0 | right_maxy = y; |
480 | 0 | } |
481 | 0 | if (y < right_miny) { |
482 | 0 | right_miny = y; |
483 | 0 | } |
484 | 0 | } else { |
485 | 0 | right_maxy = right_miny = y; |
486 | 0 | in_right = true; |
487 | 0 | } |
488 | 0 | } else if (in_right) { |
489 | | // We just left the right so look for size. |
490 | 0 | if (right_maxy - right_miny > target_height) { |
491 | 0 | if (found_left) { |
492 | 0 | return true; |
493 | 0 | } |
494 | 0 | found_right = true; |
495 | 0 | } |
496 | 0 | in_right = false; |
497 | 0 | } |
498 | 0 | } |
499 | 0 | } |
500 | 0 | return false; |
501 | 0 | } |
502 | | |
503 | 0 | void vigorous_noise_removal(TO_BLOCK *block) { |
504 | 0 | TO_ROW_IT row_it = block->get_rows(); |
505 | 0 | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
506 | 0 | TO_ROW *row = row_it.data(); |
507 | 0 | BLOBNBOX_IT b_it = row->blob_list(); |
508 | | // Estimate the xheight on the row. |
509 | 0 | int max_height = 0; |
510 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
511 | 0 | BLOBNBOX *blob = b_it.data(); |
512 | 0 | if (blob->bounding_box().height() > max_height) { |
513 | 0 | max_height = blob->bounding_box().height(); |
514 | 0 | } |
515 | 0 | } |
516 | 0 | STATS hstats(0, max_height); |
517 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
518 | 0 | BLOBNBOX *blob = b_it.data(); |
519 | 0 | int height = blob->bounding_box().height(); |
520 | 0 | if (height >= kMinSize) { |
521 | 0 | hstats.add(blob->bounding_box().height(), 1); |
522 | 0 | } |
523 | 0 | } |
524 | 0 | float xheight = hstats.median(); |
525 | | // Delete small objects. |
526 | 0 | BLOBNBOX *prev = nullptr; |
527 | 0 | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
528 | 0 | BLOBNBOX *blob = b_it.data(); |
529 | 0 | const TBOX &box = blob->bounding_box(); |
530 | 0 | if (box.height() < kNoiseSize * xheight) { |
531 | | // Small so delete unless it looks like an i dot. |
532 | 0 | if (prev != nullptr) { |
533 | 0 | if (dot_of_i(blob, prev, row)) { |
534 | 0 | continue; // Looks OK. |
535 | 0 | } |
536 | 0 | } |
537 | 0 | if (!b_it.at_last()) { |
538 | 0 | BLOBNBOX *next = b_it.data_relative(1); |
539 | 0 | if (dot_of_i(blob, next, row)) { |
540 | 0 | continue; // Looks OK. |
541 | 0 | } |
542 | 0 | } |
543 | | // It might be noise so get rid of it. |
544 | 0 | delete blob->remove_cblob(); |
545 | 0 | delete b_it.extract(); |
546 | 0 | } else { |
547 | 0 | prev = blob; |
548 | 0 | } |
549 | 0 | } |
550 | 0 | } |
551 | 0 | } |
552 | | |
553 | | /** |
554 | | * cleanup_rows_making |
555 | | * |
556 | | * Remove overlapping rows and fit all the blobs to what's left. |
557 | | */ |
558 | | void cleanup_rows_making( // find lines |
559 | | ICOORD page_tr, // top right |
560 | | TO_BLOCK *block, // block to do |
561 | | float gradient, // gradient to fit |
562 | | FCOORD rotation, // for drawing |
563 | | int32_t block_edge, // edge of block |
564 | | bool testing_on // correct orientation |
565 | 7.74k | ) { |
566 | | // iterators |
567 | 7.74k | BLOBNBOX_IT blob_it = &block->blobs; |
568 | 7.74k | TO_ROW_IT row_it = block->get_rows(); |
569 | | |
570 | | #ifndef GRAPHICS_DISABLED |
571 | | if (textord_show_parallel_rows && testing_on) { |
572 | | if (to_win == nullptr) { |
573 | | create_to_win(page_tr); |
574 | | } |
575 | | } |
576 | | #endif |
577 | | // get row coords |
578 | 7.74k | fit_parallel_rows(block, gradient, rotation, block_edge, |
579 | 7.74k | textord_show_parallel_rows && testing_on); |
580 | 7.74k | delete_non_dropout_rows(block, gradient, rotation, block_edge, |
581 | 7.74k | textord_show_parallel_rows && testing_on); |
582 | 7.74k | expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on); |
583 | 7.74k | blob_it.set_to_list(&block->blobs); |
584 | 7.74k | row_it.set_to_list(block->get_rows()); |
585 | 39.7k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
586 | 31.9k | blob_it.add_list_after(row_it.data()->blob_list()); |
587 | 31.9k | } |
588 | | // give blobs back |
589 | 7.74k | assign_blobs_to_rows(block, &gradient, 1, false, false, false); |
590 | | // now new rows must be genuine |
591 | 7.74k | blob_it.set_to_list(&block->blobs); |
592 | 7.74k | blob_it.add_list_after(&block->large_blobs); |
593 | 7.74k | assign_blobs_to_rows(block, &gradient, 2, true, true, false); |
594 | | // safe to use big ones now |
595 | 7.74k | blob_it.set_to_list(&block->blobs); |
596 | | // throw all blobs in |
597 | 7.74k | blob_it.add_list_after(&block->noise_blobs); |
598 | 7.74k | blob_it.add_list_after(&block->small_blobs); |
599 | 7.74k | assign_blobs_to_rows(block, &gradient, 3, false, false, false); |
600 | 7.74k | } |
601 | | |
602 | | /** |
603 | | * delete_non_dropout_rows |
604 | | * |
605 | | * Compute the linespacing and offset. |
606 | | */ |
607 | | void delete_non_dropout_rows( // find lines |
608 | | TO_BLOCK *block, // block to do |
609 | | float gradient, // global skew |
610 | | FCOORD rotation, // deskew vector |
611 | | int32_t block_edge, // left edge |
612 | | bool testing_on // correct orientation |
613 | 7.74k | ) { |
614 | 7.74k | TBOX block_box; // deskewed block |
615 | 7.74k | int32_t max_y; // in block |
616 | 7.74k | int32_t min_y; |
617 | 7.74k | int32_t line_index; // of scan line |
618 | 7.74k | int32_t line_count; // no of scan lines |
619 | 7.74k | int32_t distance; // to drop-out |
620 | 7.74k | int32_t xleft; // of block |
621 | 7.74k | int32_t ybottom; // of block |
622 | 7.74k | TO_ROW *row; // current row |
623 | 7.74k | TO_ROW_IT row_it = block->get_rows(); |
624 | 7.74k | BLOBNBOX_IT blob_it = &block->blobs; |
625 | | |
626 | 7.74k | if (row_it.empty()) { |
627 | 207 | return; // empty block |
628 | 207 | } |
629 | 7.53k | block_box = deskew_block_coords(block, gradient); |
630 | 7.53k | xleft = block->block->pdblk.bounding_box().left(); |
631 | 7.53k | ybottom = block->block->pdblk.bounding_box().bottom(); |
632 | 7.53k | min_y = block_box.bottom() - 1; |
633 | 7.53k | max_y = block_box.top() + 1; |
634 | 77.4k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
635 | 69.9k | line_index = static_cast<int32_t>(std::floor(row_it.data()->intercept())); |
636 | 69.9k | if (line_index <= min_y) { |
637 | 667 | min_y = line_index - 1; |
638 | 667 | } |
639 | 69.9k | if (line_index >= max_y) { |
640 | 0 | max_y = line_index + 1; |
641 | 0 | } |
642 | 69.9k | } |
643 | 7.53k | line_count = max_y - min_y + 1; |
644 | 7.53k | if (line_count <= 0) { |
645 | 0 | return; // empty block |
646 | 0 | } |
647 | | // change in occupation |
648 | 7.53k | std::vector<int32_t> deltas(line_count); |
649 | | // of pixel coords |
650 | 7.53k | std::vector<int32_t> occupation(line_count); |
651 | | |
652 | 7.53k | compute_line_occupation(block, gradient, min_y, max_y, &occupation[0], &deltas[0]); |
653 | 7.53k | compute_occupation_threshold( |
654 | 7.53k | static_cast<int32_t>(ceil(block->line_spacing * (tesseract::CCStruct::kDescenderFraction + |
655 | 7.53k | tesseract::CCStruct::kAscenderFraction))), |
656 | 7.53k | static_cast<int32_t>(ceil(block->line_spacing * (tesseract::CCStruct::kXHeightFraction + |
657 | 7.53k | tesseract::CCStruct::kAscenderFraction))), |
658 | 7.53k | max_y - min_y + 1, &occupation[0], &deltas[0]); |
659 | | #ifndef GRAPHICS_DISABLED |
660 | | if (testing_on) { |
661 | | draw_occupation(xleft, ybottom, min_y, max_y, &occupation[0], &deltas[0]); |
662 | | } |
663 | | #endif |
664 | 7.53k | compute_dropout_distances(&occupation[0], &deltas[0], line_count); |
665 | 77.4k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
666 | 69.9k | row = row_it.data(); |
667 | 69.9k | line_index = static_cast<int32_t>(std::floor(row->intercept())); |
668 | 69.9k | distance = deltas[line_index - min_y]; |
669 | 69.9k | if (find_best_dropout_row(row, distance, block->line_spacing / 2, line_index, &row_it, |
670 | 69.9k | testing_on)) { |
671 | | #ifndef GRAPHICS_DISABLED |
672 | | if (testing_on) { |
673 | | plot_parallel_row(row, gradient, block_edge, ScrollView::WHITE, rotation); |
674 | | } |
675 | | #endif |
676 | 37.0k | blob_it.add_list_after(row_it.data()->blob_list()); |
677 | 37.0k | delete row_it.extract(); // too far away |
678 | 37.0k | } |
679 | 69.9k | } |
680 | 40.3k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
681 | 32.8k | blob_it.add_list_after(row_it.data()->blob_list()); |
682 | 32.8k | } |
683 | 7.53k | } |
684 | | |
685 | | /** |
686 | | * @name find_best_dropout_row |
687 | | * |
688 | | * Delete this row if it has a neighbour with better dropout characteristics. |
689 | | * true is returned if the row should be deleted. |
690 | | */ |
691 | | bool find_best_dropout_row( // find neighbours |
692 | | TO_ROW *row, // row to test |
693 | | int32_t distance, // dropout dist |
694 | | float dist_limit, // threshold distance |
695 | | int32_t line_index, // index of row |
696 | | TO_ROW_IT *row_it, // current position |
697 | | bool testing_on // correct orientation |
698 | 69.9k | ) { |
699 | 69.9k | int32_t next_index; // of neighbouring row |
700 | 69.9k | int32_t row_offset; // from current row |
701 | 69.9k | int32_t abs_dist; // absolute distance |
702 | 69.9k | int8_t row_inc; // increment to row_index |
703 | 69.9k | TO_ROW *next_row; // nextious row |
704 | | |
705 | 69.9k | if (testing_on) { |
706 | 0 | tprintf("Row at %g(%g), dropout dist=%d,", row->intercept(), row->parallel_c(), distance); |
707 | 0 | } |
708 | 69.9k | if (distance < 0) { |
709 | 34.1k | row_inc = 1; |
710 | 34.1k | abs_dist = -distance; |
711 | 35.8k | } else { |
712 | 35.8k | row_inc = -1; |
713 | 35.8k | abs_dist = distance; |
714 | 35.8k | } |
715 | 69.9k | if (abs_dist > dist_limit) { |
716 | 24.9k | if (testing_on) { |
717 | 0 | tprintf(" too far - deleting\n"); |
718 | 0 | } |
719 | 24.9k | return true; |
720 | 24.9k | } |
721 | 44.9k | if ((distance < 0 && !row_it->at_last()) || (distance >= 0 && !row_it->at_first())) { |
722 | 39.9k | row_offset = row_inc; |
723 | 40.5k | do { |
724 | 40.5k | next_row = row_it->data_relative(row_offset); |
725 | 40.5k | next_index = static_cast<int32_t>(std::floor(next_row->intercept())); |
726 | 40.5k | if ((distance < 0 && next_index < line_index && |
727 | 40.5k | next_index > line_index + distance + distance) || |
728 | 40.5k | (distance >= 0 && next_index > line_index && |
729 | 32.7k | next_index < line_index + distance + distance)) { |
730 | 11.3k | if (testing_on) { |
731 | 0 | tprintf(" nearer neighbour (%d) at %g\n", line_index + distance - next_index, |
732 | 0 | next_row->intercept()); |
733 | 0 | } |
734 | 11.3k | return true; // other is nearer |
735 | 29.2k | } else if (next_index == line_index || next_index == line_index + distance + distance) { |
736 | 1.37k | if (row->believability() <= next_row->believability()) { |
737 | 749 | if (testing_on) { |
738 | 0 | tprintf(" equal but more believable at %g (%g/%g)\n", next_row->intercept(), |
739 | 0 | row->believability(), next_row->believability()); |
740 | 0 | } |
741 | 749 | return true; // other is more believable |
742 | 749 | } |
743 | 1.37k | } |
744 | 28.4k | row_offset += row_inc; |
745 | 28.4k | } while ((next_index == line_index || next_index == line_index + distance + distance) && |
746 | 28.4k | row_offset < row_it->length()); |
747 | 27.8k | if (testing_on) { |
748 | 0 | tprintf(" keeping\n"); |
749 | 0 | } |
750 | 27.8k | } |
751 | 32.8k | return false; |
752 | 44.9k | } |
753 | | |
754 | | /** |
755 | | * @name deskew_block_coords |
756 | | * |
757 | | * Compute the bounding box of all the blobs in the block |
758 | | * if they were deskewed without actually doing it. |
759 | | */ |
760 | | TBOX deskew_block_coords( // block box |
761 | | TO_BLOCK *block, // block to do |
762 | | float gradient // global skew |
763 | 7.53k | ) { |
764 | 7.53k | TBOX result; // block bounds |
765 | 7.53k | TBOX blob_box; // of block |
766 | 7.53k | FCOORD rotation; // deskew vector |
767 | 7.53k | float length; // of gradient vector |
768 | 7.53k | TO_ROW_IT row_it = block->get_rows(); |
769 | 7.53k | TO_ROW *row; // current row |
770 | 7.53k | BLOBNBOX *blob; // current blob |
771 | 7.53k | BLOBNBOX_IT blob_it; // iterator |
772 | | |
773 | 7.53k | length = std::sqrt(gradient * gradient + 1); |
774 | 7.53k | rotation = FCOORD(1 / length, -gradient / length); |
775 | 77.4k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
776 | 69.9k | row = row_it.data(); |
777 | 69.9k | blob_it.set_to_list(row->blob_list()); |
778 | 306k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
779 | 236k | blob = blob_it.data(); |
780 | 236k | blob_box = blob->bounding_box(); |
781 | 236k | blob_box.rotate(rotation); // de-skew it |
782 | 236k | result += blob_box; |
783 | 236k | } |
784 | 69.9k | } |
785 | 7.53k | return result; |
786 | 7.53k | } |
787 | | |
788 | | /** |
789 | | * @name compute_line_occupation |
790 | | * |
791 | | * Compute the pixel projection back on the y axis given the global |
792 | | * skew. Also compute the 1st derivative. |
793 | | */ |
794 | | void compute_line_occupation( // project blobs |
795 | | TO_BLOCK *block, // block to do |
796 | | float gradient, // global skew |
797 | | int32_t min_y, // min coord in block |
798 | | int32_t max_y, // in block |
799 | | int32_t *occupation, // output projection |
800 | | int32_t *deltas // derivative |
801 | 7.53k | ) { |
802 | 7.53k | int32_t line_count; // maxy-miny+1 |
803 | 7.53k | int32_t line_index; // of scan line |
804 | 7.53k | int index; // array index for daft compilers |
805 | 7.53k | TO_ROW *row; // current row |
806 | 7.53k | TO_ROW_IT row_it = block->get_rows(); |
807 | 7.53k | BLOBNBOX *blob; // current blob |
808 | 7.53k | BLOBNBOX_IT blob_it; // iterator |
809 | 7.53k | float length; // of skew vector |
810 | 7.53k | TBOX blob_box; // bounding box |
811 | 7.53k | FCOORD rotation; // inverse of skew |
812 | | |
813 | 7.53k | line_count = max_y - min_y + 1; |
814 | 7.53k | length = std::sqrt(gradient * gradient + 1); |
815 | 7.53k | rotation = FCOORD(1 / length, -gradient / length); |
816 | 661k | for (line_index = 0; line_index < line_count; line_index++) { |
817 | 653k | deltas[line_index] = 0; |
818 | 653k | } |
819 | 77.4k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
820 | 69.9k | row = row_it.data(); |
821 | 69.9k | blob_it.set_to_list(row->blob_list()); |
822 | 306k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
823 | 236k | blob = blob_it.data(); |
824 | 236k | blob_box = blob->bounding_box(); |
825 | 236k | blob_box.rotate(rotation); // de-skew it |
826 | 236k | int32_t width = blob_box.right() - blob_box.left(); |
827 | 236k | index = blob_box.bottom() - min_y; |
828 | 236k | ASSERT_HOST(index >= 0 && index < line_count); |
829 | | // count transitions |
830 | 236k | deltas[index] += width; |
831 | 236k | index = blob_box.top() - min_y; |
832 | 236k | ASSERT_HOST(index >= 0 && index < line_count); |
833 | 236k | deltas[index] -= width; |
834 | 236k | } |
835 | 69.9k | } |
836 | 7.53k | occupation[0] = deltas[0]; |
837 | 653k | for (line_index = 1; line_index < line_count; line_index++) { |
838 | 646k | occupation[line_index] = occupation[line_index - 1] + deltas[line_index]; |
839 | 646k | } |
840 | 7.53k | } |
841 | | |
842 | | /** |
843 | | * compute_occupation_threshold |
844 | | * |
845 | | * Compute thresholds for textline or not for the occupation array. |
846 | | */ |
847 | | void compute_occupation_threshold( // project blobs |
848 | | int32_t low_window, // below result point |
849 | | int32_t high_window, // above result point |
850 | | int32_t line_count, // array sizes |
851 | | int32_t *occupation, // input projection |
852 | | int32_t *thresholds // output thresholds |
853 | 7.53k | ) { |
854 | 7.53k | int32_t line_index; // of thresholds line |
855 | 7.53k | int32_t low_index; // in occupation |
856 | 7.53k | int32_t high_index; // in occupation |
857 | 7.53k | int32_t sum; // current average |
858 | 7.53k | int32_t divisor; // to get thresholds |
859 | 7.53k | int32_t min_index; // of min occ |
860 | 7.53k | int32_t min_occ; // min in locality |
861 | 7.53k | int32_t test_index; // for finding min |
862 | | |
863 | 7.53k | divisor = static_cast<int32_t>(ceil((low_window + high_window) / textord_occupancy_threshold)); |
864 | 7.53k | if (low_window + high_window < line_count) { |
865 | 92.8k | for (sum = 0, high_index = 0; high_index < low_window; high_index++) { |
866 | 86.7k | sum += occupation[high_index]; |
867 | 86.7k | } |
868 | 135k | for (low_index = 0; low_index < high_window; low_index++, high_index++) { |
869 | 129k | sum += occupation[high_index]; |
870 | 129k | } |
871 | 6.08k | min_occ = occupation[0]; |
872 | 6.08k | min_index = 0; |
873 | 216k | for (test_index = 1; test_index < high_index; test_index++) { |
874 | 210k | if (occupation[test_index] <= min_occ) { |
875 | 41.8k | min_occ = occupation[test_index]; |
876 | 41.8k | min_index = test_index; // find min in region |
877 | 41.8k | } |
878 | 210k | } |
879 | 92.8k | for (line_index = 0; line_index < low_window; line_index++) { |
880 | 86.7k | thresholds[line_index] = (sum - min_occ) / divisor + min_occ; |
881 | 86.7k | } |
882 | | // same out to end |
883 | 365k | for (low_index = 0; high_index < line_count; low_index++, high_index++) { |
884 | 359k | sum -= occupation[low_index]; |
885 | 359k | sum += occupation[high_index]; |
886 | 359k | if (occupation[high_index] <= min_occ) { |
887 | | // find min in region |
888 | 85.9k | min_occ = occupation[high_index]; |
889 | 85.9k | min_index = high_index; |
890 | 85.9k | } |
891 | | // lost min from region |
892 | 359k | if (min_index <= low_index) { |
893 | 6.08k | min_occ = occupation[low_index + 1]; |
894 | 6.08k | min_index = low_index + 1; |
895 | 191k | for (test_index = low_index + 2; test_index <= high_index; test_index++) { |
896 | 185k | if (occupation[test_index] <= min_occ) { |
897 | 30.0k | min_occ = occupation[test_index]; |
898 | | // find min in region |
899 | 30.0k | min_index = test_index; |
900 | 30.0k | } |
901 | 185k | } |
902 | 6.08k | } |
903 | 359k | thresholds[line_index++] = (sum - min_occ) / divisor + min_occ; |
904 | 359k | } |
905 | 6.08k | } else { |
906 | 1.44k | min_occ = occupation[0]; |
907 | 1.44k | min_index = 0; |
908 | 79.0k | for (sum = 0, low_index = 0; low_index < line_count; low_index++) { |
909 | 77.6k | if (occupation[low_index] < min_occ) { |
910 | 0 | min_occ = occupation[low_index]; |
911 | 0 | min_index = low_index; |
912 | 0 | } |
913 | 77.6k | sum += occupation[low_index]; |
914 | 77.6k | } |
915 | 1.44k | line_index = 0; |
916 | 1.44k | } |
917 | 214k | for (; line_index < line_count; line_index++) { |
918 | 206k | thresholds[line_index] = (sum - min_occ) / divisor + min_occ; |
919 | 206k | } |
920 | | // same out to end |
921 | 7.53k | } |
922 | | |
923 | | /** |
924 | | * @name compute_dropout_distances |
925 | | * |
926 | | * Compute the distance from each coordinate to the nearest dropout. |
927 | | */ |
928 | | void compute_dropout_distances( // project blobs |
929 | | int32_t *occupation, // input projection |
930 | | int32_t *thresholds, // output thresholds |
931 | | int32_t line_count // array sizes |
932 | 7.53k | ) { |
933 | 7.53k | int32_t line_index; // of thresholds line |
934 | 7.53k | int32_t distance; // from prev dropout |
935 | 7.53k | int32_t next_dist; // to next dropout |
936 | 7.53k | int32_t back_index; // for back filling |
937 | 7.53k | int32_t prev_threshold; // before overwrite |
938 | | |
939 | 7.53k | distance = -line_count; |
940 | 7.53k | line_index = 0; |
941 | 41.2k | do { |
942 | 653k | do { |
943 | 653k | distance--; |
944 | 653k | prev_threshold = thresholds[line_index]; |
945 | | // distance from prev |
946 | 653k | thresholds[line_index] = distance; |
947 | 653k | line_index++; |
948 | 653k | } while (line_index < line_count && (occupation[line_index] < thresholds[line_index] || |
949 | 646k | occupation[line_index - 1] >= prev_threshold)); |
950 | 41.2k | if (line_index < line_count) { |
951 | 33.6k | back_index = line_index - 1; |
952 | 33.6k | next_dist = 1; |
953 | 180k | while (next_dist < -distance && back_index >= 0) { |
954 | 146k | thresholds[back_index] = next_dist; |
955 | 146k | back_index--; |
956 | 146k | next_dist++; |
957 | 146k | distance++; |
958 | 146k | } |
959 | 33.6k | distance = 1; |
960 | 33.6k | } |
961 | 41.2k | } while (line_index < line_count); |
962 | 7.53k | } |
963 | | |
964 | | /** |
965 | | * @name expand_rows |
966 | | * |
967 | | * Expand each row to the least of its allowed size and touching its |
968 | | * neighbours. If the expansion would entirely swallow a neighbouring row |
969 | | * then do so. |
970 | | */ |
971 | | void expand_rows( // find lines |
972 | | ICOORD page_tr, // top right |
973 | | TO_BLOCK *block, // block to do |
974 | | float gradient, // gradient to fit |
975 | | FCOORD rotation, // for drawing |
976 | | int32_t block_edge, // edge of block |
977 | | bool testing_on // correct orientation |
978 | 7.74k | ) { |
979 | 7.74k | bool swallowed_row; // eaten a neighbour |
980 | 7.74k | float y_max, y_min; // new row limits |
981 | 7.74k | float y_bottom, y_top; // allowed limits |
982 | 7.74k | TO_ROW *test_row; // next row |
983 | 7.74k | TO_ROW *row; // current row |
984 | | // iterators |
985 | 7.74k | BLOBNBOX_IT blob_it = &block->blobs; |
986 | 7.74k | TO_ROW_IT row_it = block->get_rows(); |
987 | | |
988 | | #ifndef GRAPHICS_DISABLED |
989 | | if (textord_show_expanded_rows && testing_on) { |
990 | | if (to_win == nullptr) { |
991 | | create_to_win(page_tr); |
992 | | } |
993 | | } |
994 | | #endif |
995 | | |
996 | 7.74k | adjust_row_limits(block); // shift min,max. |
997 | 7.74k | if (textord_new_initial_xheight) { |
998 | 7.74k | if (block->get_rows()->empty()) { |
999 | 2.60k | return; |
1000 | 2.60k | } |
1001 | 5.13k | compute_row_stats(block, textord_show_expanded_rows && testing_on); |
1002 | 5.13k | } |
1003 | 5.13k | assign_blobs_to_rows(block, &gradient, 4, true, false, false); |
1004 | | // get real membership |
1005 | 5.13k | if (block->get_rows()->empty()) { |
1006 | 15 | return; |
1007 | 15 | } |
1008 | 5.12k | fit_parallel_rows(block, gradient, rotation, block_edge, |
1009 | 5.12k | textord_show_expanded_rows && testing_on); |
1010 | 5.12k | if (!textord_new_initial_xheight) { |
1011 | 0 | compute_row_stats(block, textord_show_expanded_rows && testing_on); |
1012 | 0 | } |
1013 | 5.12k | row_it.move_to_last(); |
1014 | 31.9k | do { |
1015 | 31.9k | row = row_it.data(); |
1016 | 31.9k | y_max = row->max_y(); // get current limits |
1017 | 31.9k | y_min = row->min_y(); |
1018 | 31.9k | y_bottom = row->intercept() - block->line_size * textord_expansion_factor * |
1019 | 31.9k | tesseract::CCStruct::kDescenderFraction; |
1020 | 31.9k | y_top = row->intercept() + |
1021 | 31.9k | block->line_size * textord_expansion_factor * |
1022 | 31.9k | (tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction); |
1023 | 31.9k | if (y_min > y_bottom) { // expansion allowed |
1024 | 24.5k | if (textord_show_expanded_rows && testing_on) { |
1025 | 0 | tprintf("Expanding bottom of row at %f from %f to %f\n", row->intercept(), y_min, y_bottom); |
1026 | 0 | } |
1027 | | // expandable |
1028 | 24.5k | swallowed_row = true; |
1029 | 45.0k | while (swallowed_row && !row_it.at_last()) { |
1030 | 20.5k | swallowed_row = false; |
1031 | | // get next one |
1032 | 20.5k | test_row = row_it.data_relative(1); |
1033 | | // overlaps space |
1034 | 20.5k | if (test_row->max_y() > y_bottom) { |
1035 | 17.0k | if (test_row->min_y() > y_bottom) { |
1036 | 29 | if (textord_show_expanded_rows && testing_on) { |
1037 | 0 | tprintf("Eating row below at %f\n", test_row->intercept()); |
1038 | 0 | } |
1039 | 29 | row_it.forward(); |
1040 | | #ifndef GRAPHICS_DISABLED |
1041 | | if (textord_show_expanded_rows && testing_on) { |
1042 | | plot_parallel_row(test_row, gradient, block_edge, ScrollView::WHITE, rotation); |
1043 | | } |
1044 | | #endif |
1045 | 29 | blob_it.set_to_list(row->blob_list()); |
1046 | 29 | blob_it.add_list_after(test_row->blob_list()); |
1047 | | // swallow complete row |
1048 | 29 | delete row_it.extract(); |
1049 | 29 | row_it.backward(); |
1050 | 29 | swallowed_row = true; |
1051 | 17.0k | } else if (test_row->max_y() < y_min) { |
1052 | | // shorter limit |
1053 | 925 | y_bottom = test_row->max_y(); |
1054 | 925 | if (textord_show_expanded_rows && testing_on) { |
1055 | 0 | tprintf("Truncating limit to %f due to touching row at %f\n", y_bottom, |
1056 | 0 | test_row->intercept()); |
1057 | 0 | } |
1058 | 16.1k | } else { |
1059 | 16.1k | y_bottom = y_min; // can't expand it |
1060 | 16.1k | if (textord_show_expanded_rows && testing_on) { |
1061 | 0 | tprintf("Not expanding limit beyond %f due to touching row at %f\n", y_bottom, |
1062 | 0 | test_row->intercept()); |
1063 | 0 | } |
1064 | 16.1k | } |
1065 | 17.0k | } |
1066 | 20.5k | } |
1067 | 24.5k | y_min = y_bottom; // expand it |
1068 | 24.5k | } |
1069 | 31.9k | if (y_max < y_top) { // expansion allowed |
1070 | 21.6k | if (textord_show_expanded_rows && testing_on) { |
1071 | 0 | tprintf("Expanding top of row at %f from %f to %f\n", row->intercept(), y_max, y_top); |
1072 | 0 | } |
1073 | 21.6k | swallowed_row = true; |
1074 | 39.3k | while (swallowed_row && !row_it.at_first()) { |
1075 | 17.7k | swallowed_row = false; |
1076 | | // get one above |
1077 | 17.7k | test_row = row_it.data_relative(-1); |
1078 | 17.7k | if (test_row->min_y() < y_top) { |
1079 | 13.4k | if (test_row->max_y() < y_top) { |
1080 | 8 | if (textord_show_expanded_rows && testing_on) { |
1081 | 0 | tprintf("Eating row above at %f\n", test_row->intercept()); |
1082 | 0 | } |
1083 | 8 | row_it.backward(); |
1084 | 8 | blob_it.set_to_list(row->blob_list()); |
1085 | | #ifndef GRAPHICS_DISABLED |
1086 | | if (textord_show_expanded_rows && testing_on) { |
1087 | | plot_parallel_row(test_row, gradient, block_edge, ScrollView::WHITE, rotation); |
1088 | | } |
1089 | | #endif |
1090 | 8 | blob_it.add_list_after(test_row->blob_list()); |
1091 | | // swallow complete row |
1092 | 8 | delete row_it.extract(); |
1093 | 8 | row_it.forward(); |
1094 | 8 | swallowed_row = true; |
1095 | 13.4k | } else if (test_row->min_y() < y_max) { |
1096 | | // shorter limit |
1097 | 12.7k | y_top = test_row->min_y(); |
1098 | 12.7k | if (textord_show_expanded_rows && testing_on) { |
1099 | 0 | tprintf("Truncating limit to %f due to touching row at %f\n", y_top, |
1100 | 0 | test_row->intercept()); |
1101 | 0 | } |
1102 | 12.7k | } else { |
1103 | 696 | y_top = y_max; // can't expand it |
1104 | 696 | if (textord_show_expanded_rows && testing_on) { |
1105 | 0 | tprintf("Not expanding limit beyond %f due to touching row at %f\n", y_top, |
1106 | 0 | test_row->intercept()); |
1107 | 0 | } |
1108 | 696 | } |
1109 | 13.4k | } |
1110 | 17.7k | } |
1111 | 21.6k | y_max = y_top; |
1112 | 21.6k | } |
1113 | | // new limits |
1114 | 31.9k | row->set_limits(y_min, y_max); |
1115 | 31.9k | row_it.backward(); |
1116 | 31.9k | } while (!row_it.at_last()); |
1117 | 5.12k | } |
1118 | | |
1119 | | /** |
1120 | | * adjust_row_limits |
1121 | | * |
1122 | | * Change the limits of rows to suit the default fractions. |
1123 | | */ |
1124 | | void adjust_row_limits( // tidy limits |
1125 | | TO_BLOCK *block // block to do |
1126 | 7.74k | ) { |
1127 | 7.74k | TO_ROW *row; // current row |
1128 | 7.74k | float size; // size of row |
1129 | 7.74k | float ymax; // top of row |
1130 | 7.74k | float ymin; // bottom of row |
1131 | 7.74k | TO_ROW_IT row_it = block->get_rows(); |
1132 | | |
1133 | 7.74k | if (textord_show_expanded_rows) { |
1134 | 0 | tprintf("Adjusting row limits for block(%d,%d)\n", block->block->pdblk.bounding_box().left(), |
1135 | 0 | block->block->pdblk.bounding_box().top()); |
1136 | 0 | } |
1137 | 40.5k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1138 | 32.8k | row = row_it.data(); |
1139 | 32.8k | size = row->max_y() - row->min_y(); |
1140 | 32.8k | if (textord_show_expanded_rows) { |
1141 | 0 | tprintf("Row at %f has min %f, max %f, size %f\n", row->intercept(), row->min_y(), |
1142 | 0 | row->max_y(), size); |
1143 | 0 | } |
1144 | 32.8k | size /= tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction + |
1145 | 32.8k | tesseract::CCStruct::kDescenderFraction; |
1146 | 32.8k | ymax = size * (tesseract::CCStruct::kXHeightFraction + tesseract::CCStruct::kAscenderFraction); |
1147 | 32.8k | ymin = -size * tesseract::CCStruct::kDescenderFraction; |
1148 | 32.8k | row->set_limits(row->intercept() + ymin, row->intercept() + ymax); |
1149 | 32.8k | row->merged = false; |
1150 | 32.8k | } |
1151 | 7.74k | } |
1152 | | |
1153 | | /** |
1154 | | * @name compute_row_stats |
1155 | | * |
1156 | | * Compute the linespacing and offset. |
1157 | | */ |
1158 | | void compute_row_stats( // find lines |
1159 | | TO_BLOCK *block, // block to do |
1160 | | bool testing_on // correct orientation |
1161 | 5.13k | ) { |
1162 | 5.13k | int32_t row_index; // of median |
1163 | 5.13k | TO_ROW *row; // current row |
1164 | 5.13k | TO_ROW *prev_row; // previous row |
1165 | 5.13k | float iqr; // inter quartile range |
1166 | 5.13k | TO_ROW_IT row_it = block->get_rows(); |
1167 | | // number of rows |
1168 | 5.13k | int16_t rowcount = row_it.length(); |
1169 | | // for choose nth |
1170 | 5.13k | std::vector<TO_ROW *> rows(rowcount); |
1171 | 5.13k | rowcount = 0; |
1172 | 5.13k | prev_row = nullptr; |
1173 | 5.13k | row_it.move_to_last(); // start at bottom |
1174 | 32.8k | do { |
1175 | 32.8k | row = row_it.data(); |
1176 | 32.8k | if (prev_row != nullptr) { |
1177 | 27.7k | rows[rowcount++] = prev_row; |
1178 | 27.7k | prev_row->spacing = row->intercept() - prev_row->intercept(); |
1179 | 27.7k | if (prev_row->spacing < 0.1 && prev_row->spacing > -0.1) { |
1180 | | // Avoid small spacing values which give a small disp_quant_factor_. |
1181 | | // That can cause large memory allocations with out-of-memory. |
1182 | 189 | prev_row->spacing = 0; |
1183 | 189 | } |
1184 | 27.7k | if (testing_on) { |
1185 | 0 | tprintf("Row at %g yields spacing of %g\n", row->intercept(), prev_row->spacing); |
1186 | 0 | } |
1187 | 27.7k | } |
1188 | 32.8k | prev_row = row; |
1189 | 32.8k | row_it.backward(); |
1190 | 32.8k | } while (!row_it.at_last()); |
1191 | 5.13k | block->key_row = prev_row; |
1192 | 5.13k | block->baseline_offset = std::fmod(prev_row->parallel_c(), block->line_spacing); |
1193 | 5.13k | if (testing_on) { |
1194 | 0 | tprintf("Blob based spacing=(%g,%g), offset=%g", block->line_size, block->line_spacing, |
1195 | 0 | block->baseline_offset); |
1196 | 0 | } |
1197 | 5.13k | if (rowcount > 0) { |
1198 | 4.01k | rows.resize(rowcount); |
1199 | 4.01k | row_index = rowcount * 3 / 4; |
1200 | 4.01k | std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order); |
1201 | 4.01k | iqr = rows[row_index]->spacing; |
1202 | 4.01k | row_index = rowcount / 4; |
1203 | 4.01k | std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order); |
1204 | 4.01k | iqr -= rows[row_index]->spacing; |
1205 | 4.01k | row_index = rowcount / 2; |
1206 | 4.01k | std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order); |
1207 | 4.01k | block->key_row = rows[row_index]; |
1208 | 4.01k | if (testing_on) { |
1209 | 0 | tprintf(" row based=%g(%g)", rows[row_index]->spacing, iqr); |
1210 | 0 | } |
1211 | 4.01k | if (rowcount > 2 && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) { |
1212 | 1.46k | if (!textord_new_initial_xheight) { |
1213 | 0 | if (rows[row_index]->spacing < block->line_spacing && |
1214 | 0 | rows[row_index]->spacing > block->line_size) { |
1215 | | // within range |
1216 | 0 | block->line_size = rows[row_index]->spacing; |
1217 | | // spacing=size |
1218 | 0 | } else if (rows[row_index]->spacing > block->line_spacing) { |
1219 | 0 | block->line_size = block->line_spacing; |
1220 | 0 | } |
1221 | | // too big so use max |
1222 | 1.46k | } else { |
1223 | 1.46k | if (rows[row_index]->spacing < block->line_spacing) { |
1224 | 1.46k | block->line_size = rows[row_index]->spacing; |
1225 | 1.46k | } else { |
1226 | 2 | block->line_size = block->line_spacing; |
1227 | 2 | } |
1228 | | // too big so use max |
1229 | 1.46k | } |
1230 | 1.46k | if (block->line_size < textord_min_xheight) { |
1231 | 1.26k | block->line_size = (float)textord_min_xheight; |
1232 | 1.26k | } |
1233 | 1.46k | block->line_spacing = rows[row_index]->spacing; |
1234 | 1.46k | block->max_blob_size = block->line_spacing * textord_excess_blobsize; |
1235 | 1.46k | } |
1236 | 4.01k | block->baseline_offset = std::fmod(rows[row_index]->intercept(), block->line_spacing); |
1237 | 4.01k | } |
1238 | 5.13k | if (testing_on) { |
1239 | 0 | tprintf("\nEstimate line size=%g, spacing=%g, offset=%g\n", block->line_size, |
1240 | 0 | block->line_spacing, block->baseline_offset); |
1241 | 0 | } |
1242 | 5.13k | } |
1243 | | |
1244 | | /** |
1245 | | * @name compute_block_xheight |
1246 | | * |
1247 | | * Compute the xheight of the individual rows, then correlate them |
1248 | | * and interpret ascenderless lines, correcting xheights. |
1249 | | * |
1250 | | * First we compute our best guess of the x-height of each row independently |
1251 | | * with compute_row_xheight(), which looks for a pair of commonly occurring |
1252 | | * heights that could be x-height and ascender height. This function also |
1253 | | * attempts to find descenders of lowercase letters (i.e. not the small |
1254 | | * descenders that could appear in upper case letters as Q,J). |
1255 | | * |
1256 | | * After this computation each row falls into one of the following categories: |
1257 | | * ROW_ASCENDERS_FOUND: we found xheight and ascender modes, so this must be |
1258 | | * a regular row; we'll use its xheight to compute |
1259 | | * xheight and ascrise estimates for the block |
1260 | | * ROW_DESCENDERS_FOUND: no ascenders, so we do not have a high confidence in |
1261 | | * the xheight of this row (don't use it for estimating |
1262 | | * block xheight), but this row can't contain all caps |
1263 | | * ROW_UNKNOWN: a row with no ascenders/descenders, could be all lowercase |
1264 | | * (or mostly lowercase for fonts with very few ascenders), |
1265 | | * all upper case or small caps |
1266 | | * ROW_INVALID: no meaningful xheight could be found for this row |
1267 | | * |
1268 | | * We then run correct_row_xheight() and use the computed xheight and ascrise |
1269 | | * averages to correct xheight values of the rows in ROW_DESCENDERS_FOUND, |
1270 | | * ROW_UNKNOWN and ROW_INVALID categories. |
1271 | | * |
1272 | | */ |
1273 | 15.2k | void Textord::compute_block_xheight(TO_BLOCK *block, float gradient) { |
1274 | 15.2k | TO_ROW *row; // current row |
1275 | 15.2k | float asc_frac_xheight = CCStruct::kAscenderFraction / CCStruct::kXHeightFraction; |
1276 | 15.2k | float desc_frac_xheight = CCStruct::kDescenderFraction / CCStruct::kXHeightFraction; |
1277 | 15.2k | int32_t min_height, max_height; // limits on xheight |
1278 | 15.2k | TO_ROW_IT row_it = block->get_rows(); |
1279 | 15.2k | if (row_it.empty()) { |
1280 | 190 | return; // no rows |
1281 | 190 | } |
1282 | | |
1283 | | // Compute the best guess of xheight of each row individually. |
1284 | | // Use xheight and ascrise values of the rows where ascenders were found. |
1285 | 15.1k | get_min_max_xheight(block->line_size, &min_height, &max_height); |
1286 | 15.1k | STATS row_asc_xheights(min_height, max_height); |
1287 | 15.1k | STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight), |
1288 | 15.1k | static_cast<int>(max_height * asc_frac_xheight)); |
1289 | 15.1k | int min_desc_height = static_cast<int>(min_height * desc_frac_xheight); |
1290 | 15.1k | int max_desc_height = static_cast<int>(max_height * desc_frac_xheight); |
1291 | 15.1k | STATS row_asc_descdrop(min_desc_height, max_desc_height); |
1292 | 15.1k | STATS row_desc_xheights(min_height, max_height); |
1293 | 15.1k | STATS row_desc_descdrop(min_desc_height, max_desc_height); |
1294 | 15.1k | STATS row_cap_xheights(min_height, max_height); |
1295 | 15.1k | STATS row_cap_floating_xheights(min_height, max_height); |
1296 | 127k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1297 | 112k | row = row_it.data(); |
1298 | | // Compute the xheight of this row if it has not been computed before. |
1299 | 112k | if (row->xheight <= 0) { |
1300 | 30.5k | compute_row_xheight(row, block->block->classify_rotation(), gradient, block->line_size); |
1301 | 30.5k | } |
1302 | 112k | ROW_CATEGORY row_category = get_row_category(row); |
1303 | 112k | if (row_category == ROW_ASCENDERS_FOUND) { |
1304 | 61.1k | row_asc_xheights.add(static_cast<int32_t>(row->xheight), row->xheight_evidence); |
1305 | 61.1k | row_asc_ascrise.add(static_cast<int32_t>(row->ascrise), row->xheight_evidence); |
1306 | 61.1k | row_asc_descdrop.add(static_cast<int32_t>(-row->descdrop), row->xheight_evidence); |
1307 | 61.1k | } else if (row_category == ROW_DESCENDERS_FOUND) { |
1308 | 5.68k | row_desc_xheights.add(static_cast<int32_t>(row->xheight), row->xheight_evidence); |
1309 | 5.68k | row_desc_descdrop.add(static_cast<int32_t>(-row->descdrop), row->xheight_evidence); |
1310 | 45.7k | } else if (row_category == ROW_UNKNOWN) { |
1311 | 15.1k | fill_heights(row, gradient, min_height, max_height, &row_cap_xheights, |
1312 | 15.1k | &row_cap_floating_xheights); |
1313 | 15.1k | } |
1314 | 112k | } |
1315 | | |
1316 | 15.1k | float xheight = 0.0; |
1317 | 15.1k | float ascrise = 0.0; |
1318 | 15.1k | float descdrop = 0.0; |
1319 | | // Compute our best guess of xheight of this block. |
1320 | 15.1k | if (row_asc_xheights.get_total() > 0) { |
1321 | | // Determine xheight from rows where ascenders were found. |
1322 | 8.62k | xheight = row_asc_xheights.median(); |
1323 | 8.62k | ascrise = row_asc_ascrise.median(); |
1324 | 8.62k | descdrop = -row_asc_descdrop.median(); |
1325 | 8.62k | } else if (row_desc_xheights.get_total() > 0) { |
1326 | | // Determine xheight from rows where descenders were found. |
1327 | 1.40k | xheight = row_desc_xheights.median(); |
1328 | 1.40k | descdrop = -row_desc_descdrop.median(); |
1329 | 5.07k | } else if (row_cap_xheights.get_total() > 0) { |
1330 | | // All the rows in the block were (a/de)scenderless. |
1331 | | // Try to search for two modes in row_cap_heights that could |
1332 | | // be the xheight and the capheight (e.g. some of the rows |
1333 | | // were lowercase, but did not have enough (a/de)scenders. |
1334 | | // If such two modes cannot be found, this block is most |
1335 | | // likely all caps (or all small caps, in which case the code |
1336 | | // still works as intended). |
1337 | 3.26k | compute_xheight_from_modes( |
1338 | 3.26k | &row_cap_xheights, &row_cap_floating_xheights, |
1339 | 3.26k | textord_single_height_mode && block->block->classify_rotation().y() == 0.0, min_height, |
1340 | 3.26k | max_height, &(xheight), &(ascrise)); |
1341 | 3.26k | if (ascrise == 0) { // assume only caps in the whole block |
1342 | 2.40k | xheight = row_cap_xheights.median() * CCStruct::kXHeightCapRatio; |
1343 | 2.40k | } |
1344 | 3.26k | } else { // default block sizes |
1345 | 1.81k | xheight = block->line_size * CCStruct::kXHeightFraction; |
1346 | 1.81k | } |
1347 | | // Correct xheight, ascrise and descdrop if necessary. |
1348 | 15.1k | bool corrected_xheight = false; |
1349 | 15.1k | if (xheight < textord_min_xheight) { |
1350 | 2.75k | xheight = static_cast<float>(textord_min_xheight); |
1351 | 2.75k | corrected_xheight = true; |
1352 | 2.75k | } |
1353 | 15.1k | if (corrected_xheight || ascrise <= 0) { |
1354 | 5.62k | ascrise = xheight * asc_frac_xheight; |
1355 | 5.62k | } |
1356 | 15.1k | if (corrected_xheight || descdrop >= 0) { |
1357 | 5.07k | descdrop = -(xheight * desc_frac_xheight); |
1358 | 5.07k | } |
1359 | 15.1k | block->xheight = xheight; |
1360 | | |
1361 | 15.1k | if (textord_debug_xheights) { |
1362 | 0 | tprintf("Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n", xheight, ascrise, |
1363 | 0 | descdrop); |
1364 | 0 | } |
1365 | | // Correct xheight, ascrise, descdrop of rows based on block averages. |
1366 | 127k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1367 | 112k | correct_row_xheight(row_it.data(), xheight, ascrise, descdrop); |
1368 | 112k | } |
1369 | 15.1k | } |
1370 | | |
1371 | | /** |
1372 | | * @name compute_row_xheight |
1373 | | * |
1374 | | * Estimate the xheight of this row. |
1375 | | * Compute the ascender rise and descender drop at the same time. |
1376 | | * Set xheigh_evidence to the number of blobs with the chosen xheight |
1377 | | * that appear in this row. |
1378 | | */ |
1379 | | void Textord::compute_row_xheight(TO_ROW *row, // row to do |
1380 | | const FCOORD &rotation, |
1381 | | float gradient, // global skew |
1382 | 93.5k | int block_line_size) { |
1383 | | // Find blobs representing repeated characters in rows and mark them. |
1384 | | // This information is used for computing row xheight and at a later |
1385 | | // stage when words are formed by make_words. |
1386 | 93.5k | if (!row->rep_chars_marked()) { |
1387 | 56.2k | mark_repeated_chars(row); |
1388 | 56.2k | } |
1389 | | |
1390 | 93.5k | int min_height, max_height; |
1391 | 93.5k | get_min_max_xheight(block_line_size, &min_height, &max_height); |
1392 | 93.5k | STATS heights(min_height, max_height); |
1393 | 93.5k | STATS floating_heights(min_height, max_height); |
1394 | 93.5k | fill_heights(row, gradient, min_height, max_height, &heights, &floating_heights); |
1395 | 93.5k | row->ascrise = 0.0f; |
1396 | 93.5k | row->xheight = 0.0f; |
1397 | 93.5k | row->xheight_evidence = compute_xheight_from_modes( |
1398 | 93.5k | &heights, &floating_heights, textord_single_height_mode && rotation.y() == 0.0, min_height, |
1399 | 93.5k | max_height, &(row->xheight), &(row->ascrise)); |
1400 | 93.5k | row->descdrop = 0.0f; |
1401 | 93.5k | if (row->xheight > 0) { |
1402 | 25.7k | row->descdrop = |
1403 | 25.7k | static_cast<float>(compute_row_descdrop(row, gradient, row->xheight_evidence, &heights)); |
1404 | 25.7k | } |
1405 | 93.5k | } |
1406 | | |
1407 | | /** |
1408 | | * @name fill_heights |
1409 | | * |
1410 | | * Fill the given heights with heights of the blobs that are legal |
1411 | | * candidates for estimating xheight. |
1412 | | */ |
1413 | | void fill_heights(TO_ROW *row, float gradient, int min_height, int max_height, STATS *heights, |
1414 | 108k | STATS *floating_heights) { |
1415 | 108k | float xcentre; // centre of blob |
1416 | 108k | float top; // top y coord of blob |
1417 | 108k | float height; // height of blob |
1418 | 108k | BLOBNBOX *blob; // current blob |
1419 | 108k | int repeated_set; |
1420 | 108k | BLOBNBOX_IT blob_it = row->blob_list(); |
1421 | 108k | if (blob_it.empty()) { |
1422 | 0 | return; // no blobs in this row |
1423 | 0 | } |
1424 | 108k | bool has_rep_chars = row->rep_chars_marked() && row->num_repeated_sets() > 0; |
1425 | 920k | do { |
1426 | 920k | blob = blob_it.data(); |
1427 | 920k | if (!blob->joined_to_prev()) { |
1428 | 564k | xcentre = (blob->bounding_box().left() + blob->bounding_box().right()) / 2.0f; |
1429 | 564k | top = blob->bounding_box().top(); |
1430 | 564k | height = blob->bounding_box().height(); |
1431 | 564k | if (textord_fix_xheight_bug) { |
1432 | 564k | top -= row->baseline.y(xcentre); |
1433 | 564k | } else { |
1434 | 0 | top -= gradient * xcentre + row->parallel_c(); |
1435 | 0 | } |
1436 | 564k | if (top >= min_height && top <= max_height) { |
1437 | 118k | heights->add(static_cast<int32_t>(floor(top + 0.5)), 1); |
1438 | 118k | if (height / top < textord_min_blob_height_fraction) { |
1439 | 37.7k | floating_heights->add(static_cast<int32_t>(floor(top + 0.5)), 1); |
1440 | 37.7k | } |
1441 | 118k | } |
1442 | 564k | } |
1443 | | // Skip repeated chars, since they are likely to skew the height stats. |
1444 | 920k | if (has_rep_chars && blob->repeated_set() != 0) { |
1445 | 0 | repeated_set = blob->repeated_set(); |
1446 | 0 | blob_it.forward(); |
1447 | 0 | while (!blob_it.at_first() && blob_it.data()->repeated_set() == repeated_set) { |
1448 | 0 | blob_it.forward(); |
1449 | 0 | if (textord_debug_xheights) { |
1450 | 0 | tprintf("Skipping repeated char when computing xheight\n"); |
1451 | 0 | } |
1452 | 0 | } |
1453 | 920k | } else { |
1454 | 920k | blob_it.forward(); |
1455 | 920k | } |
1456 | 920k | } while (!blob_it.at_first()); |
1457 | 108k | } |
1458 | | |
1459 | | /** |
1460 | | * @name compute_xheight_from_modes |
1461 | | * |
1462 | | * Given a STATS object heights, looks for two most frequently occurring |
1463 | | * heights that look like xheight and xheight + ascrise. If found, sets |
1464 | | * the values of *xheight and *ascrise accordingly, otherwise sets xheight |
1465 | | * to any most frequently occurring height and sets *ascrise to 0. |
1466 | | * Returns the number of times xheight occurred in heights. |
1467 | | * For each mode that is considered for being an xheight the count of |
1468 | | * floating blobs (stored in floating_heights) is subtracted from the |
1469 | | * total count of the blobs of this height. This is done because blobs |
1470 | | * that sit far above the baseline could represent valid ascenders, but |
1471 | | * it is highly unlikely that such a character's height will be an xheight |
1472 | | * (e.g. -, ', =, ^, `, ", ', etc) |
1473 | | * If cap_only, then force finding of only the top mode. |
1474 | | */ |
1475 | | int compute_xheight_from_modes(STATS *heights, STATS *floating_heights, bool cap_only, |
1476 | 96.8k | int min_height, int max_height, float *xheight, float *ascrise) { |
1477 | 96.8k | int blob_index = heights->mode(); // find mode |
1478 | 96.8k | int blob_count = heights->pile_count(blob_index); // get count of mode |
1479 | 96.8k | if (textord_debug_xheights) { |
1480 | 0 | tprintf("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n", min_height, max_height, |
1481 | 0 | blob_index, blob_count, heights->get_total()); |
1482 | 0 | heights->print(); |
1483 | 0 | floating_heights->print(); |
1484 | 0 | } |
1485 | 96.8k | if (blob_count == 0) { |
1486 | 67.7k | return 0; |
1487 | 67.7k | } |
1488 | 29.0k | int modes[MAX_HEIGHT_MODES]; // biggest piles |
1489 | 29.0k | bool in_best_pile = false; |
1490 | 29.0k | int prev_size = -INT32_MAX; |
1491 | 29.0k | int best_count = 0; |
1492 | 29.0k | int mode_count = compute_height_modes(heights, min_height, max_height, modes, MAX_HEIGHT_MODES); |
1493 | 29.0k | if (cap_only && mode_count > 1) { |
1494 | 0 | mode_count = 1; |
1495 | 0 | } |
1496 | 29.0k | int x; |
1497 | 29.0k | if (textord_debug_xheights) { |
1498 | 0 | tprintf("found %d modes: ", mode_count); |
1499 | 0 | for (x = 0; x < mode_count; x++) { |
1500 | 0 | tprintf("%d ", modes[x]); |
1501 | 0 | } |
1502 | 0 | tprintf("\n"); |
1503 | 0 | } |
1504 | | |
1505 | 57.0k | for (x = 0; x < mode_count - 1; x++) { |
1506 | 28.0k | if (modes[x] != prev_size + 1) { |
1507 | 26.3k | in_best_pile = false; // had empty height |
1508 | 26.3k | } |
1509 | 28.0k | int modes_x_count = heights->pile_count(modes[x]) - floating_heights->pile_count(modes[x]); |
1510 | 28.0k | if ((modes_x_count >= blob_count * textord_xheight_mode_fraction) && |
1511 | 28.0k | (in_best_pile || modes_x_count > best_count)) { |
1512 | 37.6k | for (int asc = x + 1; asc < mode_count; asc++) { |
1513 | 25.7k | float ratio = static_cast<float>(modes[asc]) / static_cast<float>(modes[x]); |
1514 | 25.7k | if (textord_ascx_ratio_min < ratio && ratio < textord_ascx_ratio_max && |
1515 | 25.7k | (heights->pile_count(modes[asc]) >= blob_count * textord_ascheight_mode_fraction)) { |
1516 | 11.1k | if (modes_x_count > best_count) { |
1517 | 6.12k | in_best_pile = true; |
1518 | 6.12k | best_count = modes_x_count; |
1519 | 6.12k | } |
1520 | 11.1k | if (textord_debug_xheights) { |
1521 | 0 | tprintf("X=%d, asc=%d, count=%d, ratio=%g\n", modes[x], modes[asc] - modes[x], |
1522 | 0 | modes_x_count, ratio); |
1523 | 0 | } |
1524 | 11.1k | prev_size = modes[x]; |
1525 | 11.1k | *xheight = static_cast<float>(modes[x]); |
1526 | 11.1k | *ascrise = static_cast<float>(modes[asc] - modes[x]); |
1527 | 11.1k | } |
1528 | 25.7k | } |
1529 | 11.9k | } |
1530 | 28.0k | } |
1531 | 29.0k | if (*xheight == 0) { // single mode |
1532 | | // Remove counts of the "floating" blobs (the one whose height is too |
1533 | | // small in relation to it's top end of the bounding box) from heights |
1534 | | // before computing the single-mode xheight. |
1535 | | // Restore the counts in heights after the mode is found, since |
1536 | | // floating blobs might be useful for determining potential ascenders |
1537 | | // in compute_row_descdrop(). |
1538 | 23.2k | if (floating_heights->get_total() > 0) { |
1539 | 126k | for (x = min_height; x < max_height; ++x) { |
1540 | 117k | heights->add(x, -(floating_heights->pile_count(x))); |
1541 | 117k | } |
1542 | 8.31k | blob_index = heights->mode(); // find the modified mode |
1543 | 126k | for (x = min_height; x < max_height; ++x) { |
1544 | 117k | heights->add(x, floating_heights->pile_count(x)); |
1545 | 117k | } |
1546 | 8.31k | } |
1547 | 23.2k | *xheight = static_cast<float>(blob_index); |
1548 | 23.2k | *ascrise = 0.0f; |
1549 | 23.2k | best_count = heights->pile_count(blob_index); |
1550 | 23.2k | if (textord_debug_xheights) { |
1551 | 0 | tprintf("Single mode xheight set to %g\n", *xheight); |
1552 | 0 | } |
1553 | 23.2k | } else if (textord_debug_xheights) { |
1554 | 0 | tprintf("Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise); |
1555 | 0 | } |
1556 | 29.0k | return best_count; |
1557 | 96.8k | } |
1558 | | |
1559 | | /** |
1560 | | * @name compute_row_descdrop |
1561 | | * |
1562 | | * Estimates the descdrop of this row. This function looks for |
1563 | | * "significant" descenders of lowercase letters (those that could |
1564 | | * not just be the small descenders of upper case letters like Q,J). |
1565 | | * The function also takes into account how many potential ascenders |
1566 | | * this row might contain. If the number of potential ascenders along |
1567 | | * with descenders is close to the expected fraction of the total |
1568 | | * number of blobs in the row, the function returns the descender |
1569 | | * height, returns 0 otherwise. |
1570 | | */ |
1571 | | int32_t compute_row_descdrop(TO_ROW *row, float gradient, int xheight_blob_count, |
1572 | 25.7k | STATS *asc_heights) { |
1573 | | // Count how many potential ascenders are in this row. |
1574 | 25.7k | int i_min = asc_heights->min_bucket(); |
1575 | 25.7k | if ((i_min / row->xheight) < textord_ascx_ratio_min) { |
1576 | 25.1k | i_min = static_cast<int>(floor(row->xheight * textord_ascx_ratio_min + 0.5)); |
1577 | 25.1k | } |
1578 | 25.7k | int i_max = asc_heights->max_bucket(); |
1579 | 25.7k | if ((i_max / row->xheight) > textord_ascx_ratio_max) { |
1580 | 1.52k | i_max = static_cast<int>(floor(row->xheight * textord_ascx_ratio_max)); |
1581 | 1.52k | } |
1582 | 25.7k | int num_potential_asc = 0; |
1583 | 52.8k | for (int i = i_min; i <= i_max; ++i) { |
1584 | 27.1k | num_potential_asc += asc_heights->pile_count(i); |
1585 | 27.1k | } |
1586 | 25.7k | auto min_height = static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_min + 0.5)); |
1587 | 25.7k | auto max_height = static_cast<int32_t>(floor(row->xheight * textord_descx_ratio_max)); |
1588 | 25.7k | float xcentre; // centre of blob |
1589 | 25.7k | float height; // height of blob |
1590 | 25.7k | BLOBNBOX_IT blob_it = row->blob_list(); |
1591 | 25.7k | BLOBNBOX *blob; // current blob |
1592 | 25.7k | STATS heights(min_height, max_height); |
1593 | 382k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1594 | 357k | blob = blob_it.data(); |
1595 | 357k | if (!blob->joined_to_prev()) { |
1596 | 167k | xcentre = (blob->bounding_box().left() + blob->bounding_box().right()) / 2.0f; |
1597 | 167k | height = (gradient * xcentre + row->parallel_c() - blob->bounding_box().bottom()); |
1598 | 167k | if (height >= min_height && height <= max_height) { |
1599 | 23.0k | heights.add(static_cast<int>(floor(height + 0.5)), 1); |
1600 | 23.0k | } |
1601 | 167k | } |
1602 | 357k | } |
1603 | 25.7k | int blob_index = heights.mode(); // find mode |
1604 | 25.7k | int blob_count = heights.pile_count(blob_index); // get count of mode |
1605 | 25.7k | float total_fraction = (textord_descheight_mode_fraction + textord_ascheight_mode_fraction); |
1606 | 25.7k | if (static_cast<float>(blob_count + num_potential_asc) < xheight_blob_count * total_fraction) { |
1607 | 12.2k | blob_count = 0; |
1608 | 12.2k | } |
1609 | 25.7k | int descdrop = blob_count > 0 ? -blob_index : 0; |
1610 | 25.7k | if (textord_debug_xheights) { |
1611 | 0 | tprintf("Descdrop: %d (potential ascenders %d, descenders %d)\n", descdrop, num_potential_asc, |
1612 | 0 | blob_count); |
1613 | 0 | heights.print(); |
1614 | 0 | } |
1615 | 25.7k | return descdrop; |
1616 | 25.7k | } |
1617 | | |
1618 | | /** |
1619 | | * @name compute_height_modes |
1620 | | * |
1621 | | * Find the top maxmodes values in the input array and put their |
1622 | | * indices in the output in the order in which they occurred. |
1623 | | */ |
1624 | | int32_t compute_height_modes(STATS *heights, // stats to search |
1625 | | int32_t min_height, // bottom of range |
1626 | | int32_t max_height, // top of range |
1627 | | int32_t *modes, // output array |
1628 | 29.0k | int32_t maxmodes) { // size of modes |
1629 | 29.0k | int32_t pile_count; // no in source pile |
1630 | 29.0k | int32_t src_count; // no of source entries |
1631 | 29.0k | int32_t src_index; // current entry |
1632 | 29.0k | int32_t least_count; // height of smalllest |
1633 | 29.0k | int32_t least_index; // index of least |
1634 | 29.0k | int32_t dest_count; // index in modes |
1635 | | |
1636 | 29.0k | src_count = max_height + 1 - min_height; |
1637 | 29.0k | dest_count = 0; |
1638 | 29.0k | least_count = INT32_MAX; |
1639 | 29.0k | least_index = -1; |
1640 | 937k | for (src_index = 0; src_index < src_count; src_index++) { |
1641 | 908k | pile_count = heights->pile_count(min_height + src_index); |
1642 | 908k | if (pile_count > 0) { |
1643 | 57.4k | if (dest_count < maxmodes) { |
1644 | 57.0k | if (pile_count < least_count) { |
1645 | | // find smallest in array |
1646 | 32.2k | least_count = pile_count; |
1647 | 32.2k | least_index = dest_count; |
1648 | 32.2k | } |
1649 | 57.0k | modes[dest_count++] = min_height + src_index; |
1650 | 57.0k | } else if (pile_count >= least_count) { |
1651 | 2.07k | while (least_index < maxmodes - 1) { |
1652 | 1.70k | modes[least_index] = modes[least_index + 1]; |
1653 | | // shuffle up |
1654 | 1.70k | least_index++; |
1655 | 1.70k | } |
1656 | | // new one on end |
1657 | 373 | modes[maxmodes - 1] = min_height + src_index; |
1658 | 373 | if (pile_count == least_count) { |
1659 | | // new smallest |
1660 | 236 | least_index = maxmodes - 1; |
1661 | 236 | } else { |
1662 | 137 | least_count = heights->pile_count(modes[0]); |
1663 | 137 | least_index = 0; |
1664 | 1.64k | for (dest_count = 1; dest_count < maxmodes; dest_count++) { |
1665 | 1.50k | pile_count = heights->pile_count(modes[dest_count]); |
1666 | 1.50k | if (pile_count < least_count) { |
1667 | | // find smallest |
1668 | 127 | least_count = pile_count; |
1669 | 127 | least_index = dest_count; |
1670 | 127 | } |
1671 | 1.50k | } |
1672 | 137 | } |
1673 | 373 | } |
1674 | 57.4k | } |
1675 | 908k | } |
1676 | 29.0k | return dest_count; |
1677 | 29.0k | } |
1678 | | |
1679 | | /** |
1680 | | * @name correct_row_xheight |
1681 | | * |
1682 | | * Adjust the xheight etc of this row if not within reasonable limits |
1683 | | * of the average for the block. |
1684 | | */ |
1685 | 112k | void correct_row_xheight(TO_ROW *row, float xheight, float ascrise, float descdrop) { |
1686 | 112k | ROW_CATEGORY row_category = get_row_category(row); |
1687 | 112k | if (textord_debug_xheights) { |
1688 | 0 | tprintf( |
1689 | 0 | "correcting row xheight: row->xheight %.4f" |
1690 | 0 | ", row->acrise %.4f row->descdrop %.4f\n", |
1691 | 0 | row->xheight, row->ascrise, row->descdrop); |
1692 | 0 | } |
1693 | 112k | bool normal_xheight = within_error_margin(row->xheight, xheight, textord_xheight_error_margin); |
1694 | 112k | bool cap_xheight = |
1695 | 112k | within_error_margin(row->xheight, xheight + ascrise, textord_xheight_error_margin); |
1696 | | // Use the average xheight/ascrise for the following cases: |
1697 | | // -- the xheight of the row could not be determined at all |
1698 | | // -- the row has descenders (e.g. "many groups", "ISBN 12345 p.3") |
1699 | | // and its xheight is close to either cap height or average xheight |
1700 | | // -- the row does not have ascenders or descenders, but its xheight |
1701 | | // is close to the average block xheight (e.g. row with "www.mmm.com") |
1702 | 112k | if (row_category == ROW_ASCENDERS_FOUND) { |
1703 | 61.1k | if (row->descdrop >= 0) { |
1704 | 2.11k | row->descdrop = row->xheight * (descdrop / xheight); |
1705 | 2.11k | } |
1706 | 61.1k | } else if (row_category == ROW_INVALID || |
1707 | 51.4k | (row_category == ROW_DESCENDERS_FOUND && (normal_xheight || cap_xheight)) || |
1708 | 51.4k | (row_category == ROW_UNKNOWN && normal_xheight)) { |
1709 | 39.0k | if (textord_debug_xheights) { |
1710 | 0 | tprintf("using average xheight\n"); |
1711 | 0 | } |
1712 | 39.0k | row->xheight = xheight; |
1713 | 39.0k | row->ascrise = ascrise; |
1714 | 39.0k | row->descdrop = descdrop; |
1715 | 39.0k | } else if (row_category == ROW_DESCENDERS_FOUND) { |
1716 | | // Assume this is a row with mostly lowercase letters and it's xheight |
1717 | | // is computed correctly (unfortunately there is no way to distinguish |
1718 | | // this from the case when descenders are found, but the most common |
1719 | | // height is capheight). |
1720 | 2.35k | if (textord_debug_xheights) { |
1721 | 0 | tprintf("lowercase, corrected ascrise\n"); |
1722 | 0 | } |
1723 | 2.35k | row->ascrise = row->xheight * (ascrise / xheight); |
1724 | 10.0k | } else if (row_category == ROW_UNKNOWN) { |
1725 | | // Otherwise assume this row is an all-caps or small-caps row |
1726 | | // and adjust xheight and ascrise of the row. |
1727 | | |
1728 | 10.0k | row->all_caps = true; |
1729 | 10.0k | if (cap_xheight) { // regular all caps |
1730 | 2.87k | if (textord_debug_xheights) { |
1731 | 0 | tprintf("all caps\n"); |
1732 | 0 | } |
1733 | 2.87k | row->xheight = xheight; |
1734 | 2.87k | row->ascrise = ascrise; |
1735 | 2.87k | row->descdrop = descdrop; |
1736 | 7.16k | } else { // small caps or caps with an odd xheight |
1737 | 7.16k | if (textord_debug_xheights) { |
1738 | 0 | if (row->xheight < xheight + ascrise && row->xheight > xheight) { |
1739 | 0 | tprintf("small caps\n"); |
1740 | 0 | } else { |
1741 | 0 | tprintf("all caps with irregular xheight\n"); |
1742 | 0 | } |
1743 | 0 | } |
1744 | 7.16k | row->ascrise = row->xheight * (ascrise / (xheight + ascrise)); |
1745 | 7.16k | row->xheight -= row->ascrise; |
1746 | 7.16k | row->descdrop = row->xheight * (descdrop / xheight); |
1747 | 7.16k | } |
1748 | 10.0k | } |
1749 | 112k | if (textord_debug_xheights) { |
1750 | 0 | tprintf( |
1751 | 0 | "corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop" |
1752 | 0 | " = %.4f\n", |
1753 | 0 | row->xheight, row->ascrise, row->descdrop); |
1754 | 0 | } |
1755 | 112k | } |
1756 | | |
1757 | 7.32k | static int CountOverlaps(const TBOX &box, int min_height, BLOBNBOX_LIST *blobs) { |
1758 | 7.32k | int overlaps = 0; |
1759 | 7.32k | BLOBNBOX_IT blob_it(blobs); |
1760 | 85.8k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1761 | 78.5k | BLOBNBOX *blob = blob_it.data(); |
1762 | 78.5k | const TBOX &blob_box = blob->bounding_box(); |
1763 | 78.5k | if (blob_box.height() >= min_height && box.major_overlap(blob_box)) { |
1764 | 8.39k | ++overlaps; |
1765 | 8.39k | } |
1766 | 78.5k | } |
1767 | 7.32k | return overlaps; |
1768 | 7.32k | } |
1769 | | |
1770 | | /** |
1771 | | * @name separate_underlines |
1772 | | * |
1773 | | * Test wide objects for being potential underlines. If they are then |
1774 | | * put them in a separate list in the block. |
1775 | | */ |
1776 | | void separate_underlines(TO_BLOCK *block, // block to do |
1777 | | float gradient, // skew angle |
1778 | | FCOORD rotation, // inverse landscape |
1779 | 7.74k | bool testing_on) { // correct orientation |
1780 | 7.74k | BLOBNBOX *blob; // current blob |
1781 | 7.74k | C_BLOB *rotated_blob; // rotated blob |
1782 | 7.74k | TO_ROW *row; // current row |
1783 | 7.74k | float length; // of g_vec |
1784 | 7.74k | TBOX blob_box; |
1785 | 7.74k | FCOORD blob_rotation; // inverse of rotation |
1786 | 7.74k | FCOORD g_vec; // skew rotation |
1787 | 7.74k | BLOBNBOX_IT blob_it; // iterator |
1788 | | // iterator |
1789 | 7.74k | BLOBNBOX_IT under_it = &block->underlines; |
1790 | 7.74k | BLOBNBOX_IT large_it = &block->large_blobs; |
1791 | 7.74k | TO_ROW_IT row_it = block->get_rows(); |
1792 | 7.74k | int min_blob_height = static_cast<int>(textord_min_blob_height_fraction * block->line_size + 0.5); |
1793 | | |
1794 | | // length of vector |
1795 | 7.74k | length = std::sqrt(1 + gradient * gradient); |
1796 | 7.74k | g_vec = FCOORD(1 / length, -gradient / length); |
1797 | 7.74k | blob_rotation = FCOORD(rotation.x(), -rotation.y()); |
1798 | 7.74k | blob_rotation.rotate(g_vec); // undoing everything |
1799 | 64.0k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1800 | 56.3k | row = row_it.data(); |
1801 | | // get blobs |
1802 | 56.3k | blob_it.set_to_list(row->blob_list()); |
1803 | 578k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1804 | 522k | blob = blob_it.data(); |
1805 | 522k | blob_box = blob->bounding_box(); |
1806 | 522k | if (blob_box.width() > block->line_size * textord_underline_width) { |
1807 | 14.4k | ASSERT_HOST(blob->cblob() != nullptr); |
1808 | 14.4k | rotated_blob = crotate_cblob(blob->cblob(), blob_rotation); |
1809 | 14.4k | if (test_underline(testing_on && textord_show_final_rows, rotated_blob, |
1810 | 14.4k | static_cast<int16_t>(row->intercept()), |
1811 | 14.4k | static_cast<int16_t>(block->line_size * |
1812 | 14.4k | (tesseract::CCStruct::kXHeightFraction + |
1813 | 14.4k | tesseract::CCStruct::kAscenderFraction / 2.0f)))) { |
1814 | 7.14k | under_it.add_after_then_move(blob_it.extract()); |
1815 | 7.14k | if (testing_on && textord_show_final_rows) { |
1816 | 0 | tprintf("Underlined blob at:"); |
1817 | 0 | rotated_blob->bounding_box().print(); |
1818 | 0 | tprintf("Was:"); |
1819 | 0 | blob_box.print(); |
1820 | 0 | } |
1821 | 7.32k | } else if (CountOverlaps(blob->bounding_box(), min_blob_height, row->blob_list()) > |
1822 | 7.32k | textord_max_blob_overlaps) { |
1823 | 124 | large_it.add_after_then_move(blob_it.extract()); |
1824 | 124 | if (testing_on && textord_show_final_rows) { |
1825 | 0 | tprintf("Large blob overlaps %d blobs at:", |
1826 | 0 | CountOverlaps(blob_box, min_blob_height, row->blob_list())); |
1827 | 0 | blob_box.print(); |
1828 | 0 | } |
1829 | 124 | } |
1830 | 14.4k | delete rotated_blob; |
1831 | 14.4k | } |
1832 | 522k | } |
1833 | 56.3k | } |
1834 | 7.74k | } |
1835 | | |
1836 | | /** |
1837 | | * @name pre_associate_blobs |
1838 | | * |
1839 | | * Associate overlapping blobs and fake chop wide blobs. |
1840 | | */ |
1841 | | void pre_associate_blobs( // make rough chars |
1842 | | ICOORD page_tr, // top right |
1843 | | TO_BLOCK *block, // block to do |
1844 | | FCOORD rotation, // inverse landscape |
1845 | | bool testing_on // correct orientation |
1846 | 7.74k | ) { |
1847 | | #ifndef GRAPHICS_DISABLED |
1848 | | ScrollView::Color colour; // of boxes |
1849 | | #endif |
1850 | 7.74k | BLOBNBOX *blob; // current blob |
1851 | 7.74k | BLOBNBOX *nextblob; // next in list |
1852 | 7.74k | TBOX blob_box; |
1853 | 7.74k | FCOORD blob_rotation; // inverse of rotation |
1854 | 7.74k | BLOBNBOX_IT blob_it; // iterator |
1855 | 7.74k | BLOBNBOX_IT start_it; // iterator |
1856 | 7.74k | TO_ROW_IT row_it = block->get_rows(); |
1857 | | |
1858 | | #ifndef GRAPHICS_DISABLED |
1859 | | colour = ScrollView::RED; |
1860 | | #endif |
1861 | | |
1862 | 7.74k | blob_rotation = FCOORD(rotation.x(), -rotation.y()); |
1863 | 64.0k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1864 | | // get blobs |
1865 | 56.3k | blob_it.set_to_list(row_it.data()->blob_list()); |
1866 | 368k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1867 | 311k | blob = blob_it.data(); |
1868 | 311k | blob_box = blob->bounding_box(); |
1869 | 311k | start_it = blob_it; // save start point |
1870 | | // if (testing_on && textord_show_final_blobs) |
1871 | | // { |
1872 | | // tprintf("Blob at (%d,%d)->(%d,%d), |
1873 | | // addr=%x, count=%d\n", |
1874 | | // blob_box.left(),blob_box.bottom(), |
1875 | | // blob_box.right(),blob_box.top(), |
1876 | | // (void*)blob,blob_it.length()); |
1877 | | // } |
1878 | 311k | bool overlap; |
1879 | 545k | do { |
1880 | 545k | overlap = false; |
1881 | 545k | if (!blob_it.at_last()) { |
1882 | 484k | nextblob = blob_it.data_relative(1); |
1883 | 484k | overlap = blob_box.major_x_overlap(nextblob->bounding_box()); |
1884 | 484k | if (overlap) { |
1885 | 233k | blob->merge(nextblob); // merge new blob |
1886 | 233k | blob_box = blob->bounding_box(); // get bigger box |
1887 | 233k | blob_it.forward(); |
1888 | 233k | } |
1889 | 484k | } |
1890 | 545k | } while (overlap); |
1891 | 311k | blob->chop(&start_it, &blob_it, blob_rotation, |
1892 | 311k | block->line_size * tesseract::CCStruct::kXHeightFraction * textord_chop_width); |
1893 | | // attempt chop |
1894 | 311k | } |
1895 | | #ifndef GRAPHICS_DISABLED |
1896 | | if (testing_on && textord_show_final_blobs) { |
1897 | | if (to_win == nullptr) { |
1898 | | create_to_win(page_tr); |
1899 | | } |
1900 | | to_win->Pen(colour); |
1901 | | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1902 | | blob = blob_it.data(); |
1903 | | blob_box = blob->bounding_box(); |
1904 | | blob_box.rotate(rotation); |
1905 | | if (!blob->joined_to_prev()) { |
1906 | | to_win->Rectangle(blob_box.left(), blob_box.bottom(), blob_box.right(), blob_box.top()); |
1907 | | } |
1908 | | } |
1909 | | colour = static_cast<ScrollView::Color>(colour + 1); |
1910 | | if (colour > ScrollView::MAGENTA) { |
1911 | | colour = ScrollView::RED; |
1912 | | } |
1913 | | } |
1914 | | #endif |
1915 | 56.3k | } |
1916 | 7.74k | } |
1917 | | |
1918 | | /** |
1919 | | * @name fit_parallel_rows |
1920 | | * |
1921 | | * Re-fit the rows in the block to the given gradient. |
1922 | | */ |
1923 | | void fit_parallel_rows( // find lines |
1924 | | TO_BLOCK *block, // block to do |
1925 | | float gradient, // gradient to fit |
1926 | | FCOORD rotation, // for drawing |
1927 | | int32_t block_edge, // edge of block |
1928 | | bool testing_on // correct orientation |
1929 | 12.8k | ) { |
1930 | | #ifndef GRAPHICS_DISABLED |
1931 | | ScrollView::Color colour; // of row |
1932 | | #endif |
1933 | 12.8k | TO_ROW_IT row_it = block->get_rows(); |
1934 | | |
1935 | 12.8k | row_it.move_to_first(); |
1936 | 114k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1937 | 101k | if (row_it.data()->blob_list()->empty()) { |
1938 | 0 | delete row_it.extract(); // nothing in it |
1939 | 101k | } else { |
1940 | 101k | fit_parallel_lms(gradient, row_it.data()); |
1941 | 101k | } |
1942 | 101k | } |
1943 | | #ifndef GRAPHICS_DISABLED |
1944 | | if (testing_on) { |
1945 | | colour = ScrollView::RED; |
1946 | | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
1947 | | plot_parallel_row(row_it.data(), gradient, block_edge, colour, rotation); |
1948 | | colour = static_cast<ScrollView::Color>(colour + 1); |
1949 | | if (colour > ScrollView::MAGENTA) { |
1950 | | colour = ScrollView::RED; |
1951 | | } |
1952 | | } |
1953 | | } |
1954 | | #endif |
1955 | 12.8k | row_it.sort(row_y_order); // may have gone out of order |
1956 | 12.8k | } |
1957 | | |
1958 | | /** |
1959 | | * @name fit_parallel_lms |
1960 | | * |
1961 | | * Fit an LMS line to a row. |
1962 | | * Make the fit parallel to the given gradient and set the |
1963 | | * row accordingly. |
1964 | | */ |
1965 | 101k | void fit_parallel_lms(float gradient, TO_ROW *row) { |
1966 | 101k | float c; // fitted line |
1967 | 101k | int blobcount; // no of blobs |
1968 | 101k | tesseract::DetLineFit lms; |
1969 | 101k | BLOBNBOX_IT blob_it = row->blob_list(); |
1970 | | |
1971 | 101k | blobcount = 0; |
1972 | 509k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
1973 | 407k | if (!blob_it.data()->joined_to_prev()) { |
1974 | 407k | const TBOX &box = blob_it.data()->bounding_box(); |
1975 | 407k | lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom())); |
1976 | 407k | blobcount++; |
1977 | 407k | } |
1978 | 407k | } |
1979 | 101k | double error = lms.ConstrainedFit(gradient, &c); |
1980 | 101k | row->set_parallel_line(gradient, c, error); |
1981 | 101k | if (textord_straight_baselines && blobcount > textord_lms_line_trials) { |
1982 | 0 | error = lms.Fit(&gradient, &c); |
1983 | 0 | } |
1984 | | // set the other too |
1985 | 101k | row->set_line(gradient, c, error); |
1986 | 101k | } |
1987 | | |
1988 | | /** |
1989 | | * @name make_spline_rows |
1990 | | * |
1991 | | * Re-fit the rows in the block to the given gradient. |
1992 | | */ |
1993 | | void Textord::make_spline_rows(TO_BLOCK *block, // block to do |
1994 | | float gradient, // gradient to fit |
1995 | 7.74k | bool testing_on) { |
1996 | | #ifndef GRAPHICS_DISABLED |
1997 | | ScrollView::Color colour; // of row |
1998 | | if (testing_on && to_win == nullptr) { |
1999 | | create_to_win(page_tr_); |
2000 | | } |
2001 | | #endif |
2002 | 7.74k | TO_ROW_IT row_it = block->get_rows(); |
2003 | | |
2004 | 7.74k | row_it.move_to_first(); |
2005 | 64.0k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
2006 | 56.3k | if (row_it.data()->blob_list()->empty()) { |
2007 | 40 | delete row_it.extract(); // nothing in it |
2008 | 56.2k | } else { |
2009 | 56.2k | make_baseline_spline(row_it.data(), block); |
2010 | 56.2k | } |
2011 | 56.3k | } |
2012 | 7.74k | if (textord_old_baselines) { |
2013 | | #ifndef GRAPHICS_DISABLED |
2014 | | if (testing_on) { |
2015 | | colour = ScrollView::RED; |
2016 | | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
2017 | | row_it.data()->baseline.plot(to_win, colour); |
2018 | | colour = static_cast<ScrollView::Color>(colour + 1); |
2019 | | if (colour > ScrollView::MAGENTA) { |
2020 | | colour = ScrollView::RED; |
2021 | | } |
2022 | | } |
2023 | | } |
2024 | | #endif |
2025 | 7.74k | make_old_baselines(block, testing_on, gradient); |
2026 | 7.74k | } |
2027 | | #ifndef GRAPHICS_DISABLED |
2028 | | if (testing_on) { |
2029 | | colour = ScrollView::RED; |
2030 | | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
2031 | | row_it.data()->baseline.plot(to_win, colour); |
2032 | | colour = static_cast<ScrollView::Color>(colour + 1); |
2033 | | if (colour > ScrollView::MAGENTA) { |
2034 | | colour = ScrollView::RED; |
2035 | | } |
2036 | | } |
2037 | | } |
2038 | | #endif |
2039 | 7.74k | } |
2040 | | |
2041 | | /** |
2042 | | * @name make_baseline_spline |
2043 | | * |
2044 | | * Fit an LMS line to a row. |
2045 | | * Make the fit parallel to the given gradient and set the |
2046 | | * row accordingly. |
2047 | | */ |
2048 | | void make_baseline_spline(TO_ROW *row, // row to fit |
2049 | 56.2k | TO_BLOCK *block) { |
2050 | 56.2k | double *coeffs; // quadratic coeffs |
2051 | 56.2k | int32_t segments; // no of segments |
2052 | | |
2053 | | // spline boundaries |
2054 | 56.2k | auto *xstarts = new int32_t[row->blob_list()->length() + 1]; |
2055 | 56.2k | if (segment_baseline(row, block, segments, xstarts) && !textord_straight_baselines && |
2056 | 56.2k | !textord_parallel_baselines) { |
2057 | 0 | coeffs = linear_spline_baseline(row, block, segments, xstarts); |
2058 | 56.2k | } else { |
2059 | 56.2k | xstarts[1] = xstarts[segments]; |
2060 | 56.2k | segments = 1; |
2061 | 56.2k | coeffs = new double[3]; |
2062 | 56.2k | coeffs[0] = 0; |
2063 | 56.2k | coeffs[1] = row->line_m(); |
2064 | 56.2k | coeffs[2] = row->line_c(); |
2065 | 56.2k | } |
2066 | 56.2k | row->baseline = QSPLINE(segments, xstarts, coeffs); |
2067 | 56.2k | delete[] coeffs; |
2068 | 56.2k | delete[] xstarts; |
2069 | 56.2k | } |
2070 | | |
2071 | | /** |
2072 | | * @name segment_baseline |
2073 | | * |
2074 | | * Divide the baseline up into segments which require a different |
2075 | | * quadratic fitted to them. |
2076 | | * Return true if enough blobs were far enough away to need a quadratic. |
2077 | | */ |
2078 | | bool segment_baseline( // split baseline |
2079 | | TO_ROW *row, // row to fit |
2080 | | TO_BLOCK *block, // block it came from |
2081 | | int32_t &segments, // no fo segments |
2082 | | int32_t *xstarts // coords of segments |
2083 | 56.2k | ) { |
2084 | 56.2k | bool needs_curve; // needs curved line |
2085 | 56.2k | int blobcount; // no of blobs |
2086 | 56.2k | int blobindex; // current blob |
2087 | 56.2k | int last_state; // above, on , below |
2088 | 56.2k | int state; // of current blob |
2089 | 56.2k | float yshift; // from baseline |
2090 | 56.2k | TBOX box; // blob box |
2091 | 56.2k | TBOX new_box; // new_it box |
2092 | 56.2k | float middle; // xcentre of blob |
2093 | | // blobs |
2094 | 56.2k | BLOBNBOX_IT blob_it = row->blob_list(); |
2095 | 56.2k | BLOBNBOX_IT new_it = blob_it; // front end |
2096 | 56.2k | SORTED_FLOATS yshifts; // shifts from baseline |
2097 | | |
2098 | 56.2k | needs_curve = false; |
2099 | 56.2k | box = box_next_pre_chopped(&blob_it); |
2100 | 56.2k | xstarts[0] = box.left(); |
2101 | 56.2k | segments = 1; |
2102 | 56.2k | blobcount = row->blob_list()->length(); |
2103 | 56.2k | if (textord_oldbl_debug) { |
2104 | 0 | tprintf("Segmenting baseline of %d blobs at (%d,%d)\n", blobcount, box.left(), box.bottom()); |
2105 | 0 | } |
2106 | 56.2k | if (blobcount <= textord_spline_medianwin || blobcount < textord_spline_minblobs) { |
2107 | 36.6k | blob_it.move_to_last(); |
2108 | 36.6k | box = blob_it.data()->bounding_box(); |
2109 | 36.6k | xstarts[1] = box.right(); |
2110 | 36.6k | return false; |
2111 | 36.6k | } |
2112 | 19.6k | last_state = 0; |
2113 | 19.6k | new_it.mark_cycle_pt(); |
2114 | 121k | for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) { |
2115 | 107k | new_box = box_next_pre_chopped(&new_it); |
2116 | 107k | middle = (new_box.left() + new_box.right()) / 2.0; |
2117 | 107k | yshift = new_box.bottom() - row->line_m() * middle - row->line_c(); |
2118 | | // record shift |
2119 | 107k | yshifts.add(yshift, blobindex); |
2120 | 107k | if (new_it.cycled_list()) { |
2121 | 5.21k | xstarts[1] = new_box.right(); |
2122 | 5.21k | return false; |
2123 | 5.21k | } |
2124 | 107k | } |
2125 | 57.8k | for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++) { |
2126 | 43.3k | box = box_next_pre_chopped(&blob_it); |
2127 | 43.3k | } |
2128 | 119k | do { |
2129 | 119k | new_box = box_next_pre_chopped(&new_it); |
2130 | | // get middle one |
2131 | 119k | yshift = yshifts[textord_spline_medianwin / 2]; |
2132 | 119k | if (yshift > textord_spline_shift_fraction * block->line_size) { |
2133 | 53.3k | state = 1; |
2134 | 66.1k | } else if (-yshift > textord_spline_shift_fraction * block->line_size) { |
2135 | 30.8k | state = -1; |
2136 | 35.3k | } else { |
2137 | 35.3k | state = 0; |
2138 | 35.3k | } |
2139 | 119k | if (state != 0) { |
2140 | 84.2k | needs_curve = true; |
2141 | 84.2k | } |
2142 | | // tprintf("State=%d, prev=%d, shift=%g\n", |
2143 | | // state,last_state,yshift); |
2144 | 119k | if (state != last_state && blobcount > textord_spline_minblobs) { |
2145 | 2.50k | xstarts[segments++] = box.left(); |
2146 | 2.50k | blobcount = 0; |
2147 | 2.50k | } |
2148 | 119k | last_state = state; |
2149 | 119k | yshifts.remove(blobindex - textord_spline_medianwin); |
2150 | 119k | box = box_next_pre_chopped(&blob_it); |
2151 | 119k | middle = (new_box.left() + new_box.right()) / 2.0; |
2152 | 119k | yshift = new_box.bottom() - row->line_m() * middle - row->line_c(); |
2153 | 119k | yshifts.add(yshift, blobindex); |
2154 | 119k | blobindex++; |
2155 | 119k | blobcount++; |
2156 | 119k | } while (!new_it.cycled_list()); |
2157 | 14.4k | if (blobcount > textord_spline_minblobs || segments == 1) { |
2158 | 12.7k | xstarts[segments] = new_box.right(); |
2159 | 12.7k | } else { |
2160 | 1.73k | xstarts[--segments] = new_box.right(); |
2161 | 1.73k | } |
2162 | 14.4k | if (textord_oldbl_debug) { |
2163 | 0 | tprintf("Made %d segments on row at (%d,%d)\n", segments, box.right(), box.bottom()); |
2164 | 0 | } |
2165 | 14.4k | return needs_curve; |
2166 | 19.6k | } |
2167 | | |
2168 | | /** |
2169 | | * @name linear_spline_baseline |
2170 | | * |
2171 | | * Divide the baseline up into segments which require a different |
2172 | | * quadratic fitted to them. |
2173 | | * @return true if enough blobs were far enough away to need a quadratic. |
2174 | | */ |
2175 | | double *linear_spline_baseline( // split baseline |
2176 | | TO_ROW *row, // row to fit |
2177 | | TO_BLOCK *block, // block it came from |
2178 | | int32_t &segments, // no fo segments |
2179 | | int32_t xstarts[] // coords of segments |
2180 | 0 | ) { |
2181 | 0 | int blobcount; // no of blobs |
2182 | 0 | int blobindex; // current blob |
2183 | 0 | int index1, index2; // blob numbers |
2184 | 0 | int blobs_per_segment; // blobs in each |
2185 | 0 | TBOX box; // blob box |
2186 | 0 | TBOX new_box; // new_it box |
2187 | | // blobs |
2188 | 0 | BLOBNBOX_IT blob_it = row->blob_list(); |
2189 | 0 | BLOBNBOX_IT new_it = blob_it; // front end |
2190 | 0 | float b, c; // fitted curve |
2191 | 0 | tesseract::DetLineFit lms; |
2192 | 0 | int32_t segment; // current segment |
2193 | |
|
2194 | 0 | box = box_next_pre_chopped(&blob_it); |
2195 | 0 | xstarts[0] = box.left(); |
2196 | 0 | blobcount = 1; |
2197 | 0 | while (!blob_it.at_first()) { |
2198 | 0 | blobcount++; |
2199 | 0 | box = box_next_pre_chopped(&blob_it); |
2200 | 0 | } |
2201 | 0 | segments = blobcount / textord_spline_medianwin; |
2202 | 0 | if (segments < 1) { |
2203 | 0 | segments = 1; |
2204 | 0 | } |
2205 | 0 | blobs_per_segment = blobcount / segments; |
2206 | | // quadratic coeffs |
2207 | 0 | auto *coeffs = new double[segments * 3]; |
2208 | 0 | if (textord_oldbl_debug) { |
2209 | 0 | tprintf( |
2210 | 0 | "Linear splining baseline of %d blobs at (%d,%d), into %d segments of " |
2211 | 0 | "%d blobs\n", |
2212 | 0 | blobcount, box.left(), box.bottom(), segments, blobs_per_segment); |
2213 | 0 | } |
2214 | 0 | segment = 1; |
2215 | 0 | for (index2 = 0; index2 < blobs_per_segment / 2; index2++) { |
2216 | 0 | box_next_pre_chopped(&new_it); |
2217 | 0 | } |
2218 | 0 | index1 = 0; |
2219 | 0 | blobindex = index2; |
2220 | 0 | do { |
2221 | 0 | blobindex += blobs_per_segment; |
2222 | 0 | lms.Clear(); |
2223 | 0 | while (index1 < blobindex || (segment == segments && index1 < blobcount)) { |
2224 | 0 | box = box_next_pre_chopped(&blob_it); |
2225 | 0 | int middle = (box.left() + box.right()) / 2; |
2226 | 0 | lms.Add(ICOORD(middle, box.bottom())); |
2227 | 0 | index1++; |
2228 | 0 | if (index1 == blobindex - blobs_per_segment / 2 || index1 == blobcount - 1) { |
2229 | 0 | xstarts[segment] = box.left(); |
2230 | 0 | } |
2231 | 0 | } |
2232 | 0 | lms.Fit(&b, &c); |
2233 | 0 | coeffs[segment * 3 - 3] = 0; |
2234 | 0 | coeffs[segment * 3 - 2] = b; |
2235 | 0 | coeffs[segment * 3 - 1] = c; |
2236 | 0 | segment++; |
2237 | 0 | if (segment > segments) { |
2238 | 0 | break; |
2239 | 0 | } |
2240 | | |
2241 | 0 | blobindex += blobs_per_segment; |
2242 | 0 | lms.Clear(); |
2243 | 0 | while (index2 < blobindex || (segment == segments && index2 < blobcount)) { |
2244 | 0 | new_box = box_next_pre_chopped(&new_it); |
2245 | 0 | int middle = (new_box.left() + new_box.right()) / 2; |
2246 | 0 | lms.Add(ICOORD(middle, new_box.bottom())); |
2247 | 0 | index2++; |
2248 | 0 | if (index2 == blobindex - blobs_per_segment / 2 || index2 == blobcount - 1) { |
2249 | 0 | xstarts[segment] = new_box.left(); |
2250 | 0 | } |
2251 | 0 | } |
2252 | 0 | lms.Fit(&b, &c); |
2253 | 0 | coeffs[segment * 3 - 3] = 0; |
2254 | 0 | coeffs[segment * 3 - 2] = b; |
2255 | 0 | coeffs[segment * 3 - 1] = c; |
2256 | 0 | segment++; |
2257 | 0 | } while (segment <= segments); |
2258 | 0 | return coeffs; |
2259 | 0 | } |
2260 | | |
2261 | | /** |
2262 | | * @name assign_blobs_to_rows |
2263 | | * |
2264 | | * Make enough rows to allocate all the given blobs to one. |
2265 | | * If a block skew is given, use that, else attempt to track it. |
2266 | | */ |
2267 | | void assign_blobs_to_rows( // find lines |
2268 | | TO_BLOCK *block, // block to do |
2269 | | float *gradient, // block skew |
2270 | | int pass, // identification |
2271 | | bool reject_misses, // chuck big ones out |
2272 | | bool make_new_rows, // add rows for unmatched |
2273 | | bool drawing_skew // draw smoothed skew |
2274 | 36.1k | ) { |
2275 | 36.1k | OVERLAP_STATE overlap_result; // what to do with it |
2276 | 36.1k | float ycoord; // current y |
2277 | 36.1k | float top, bottom; // of blob |
2278 | 36.1k | float g_length = 1.0f; // from gradient |
2279 | 36.1k | int16_t row_count; // no of rows |
2280 | 36.1k | int16_t left_x; // left edge |
2281 | 36.1k | int16_t last_x; // previous edge |
2282 | 36.1k | float block_skew; // y delta |
2283 | 36.1k | float smooth_factor; // for new coords |
2284 | 36.1k | float near_dist; // dist to nearest row |
2285 | 36.1k | ICOORD testpt; // testing only |
2286 | 36.1k | BLOBNBOX *blob; // current blob |
2287 | 36.1k | TO_ROW *row; // current row |
2288 | 36.1k | TO_ROW *dest_row = nullptr; // row to put blob in |
2289 | | // iterators |
2290 | 36.1k | BLOBNBOX_IT blob_it = &block->blobs; |
2291 | 36.1k | TO_ROW_IT row_it = block->get_rows(); |
2292 | | |
2293 | 36.1k | ycoord = |
2294 | 36.1k | (block->block->pdblk.bounding_box().bottom() + block->block->pdblk.bounding_box().top()) / |
2295 | 36.1k | 2.0f; |
2296 | 36.1k | if (gradient != nullptr) { |
2297 | 28.3k | g_length = std::sqrt(1 + *gradient * *gradient); |
2298 | 28.3k | } |
2299 | | #ifndef GRAPHICS_DISABLED |
2300 | | if (drawing_skew) { |
2301 | | to_win->SetCursor(block->block->pdblk.bounding_box().left(), ycoord); |
2302 | | } |
2303 | | #endif |
2304 | 36.1k | testpt = ICOORD(textord_test_x, textord_test_y); |
2305 | 36.1k | blob_it.sort(blob_x_order); |
2306 | 36.1k | smooth_factor = 1.0; |
2307 | 36.1k | block_skew = 0.0f; |
2308 | 36.1k | row_count = row_it.length(); // might have rows |
2309 | 36.1k | if (!blob_it.empty()) { |
2310 | 33.4k | left_x = blob_it.data()->bounding_box().left(); |
2311 | 33.4k | } else { |
2312 | 2.67k | left_x = block->block->pdblk.bounding_box().left(); |
2313 | 2.67k | } |
2314 | 36.1k | last_x = left_x; |
2315 | 1.09M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
2316 | 1.05M | blob = blob_it.data(); |
2317 | 1.05M | if (gradient != nullptr) { |
2318 | 819k | block_skew = (1 - 1 / g_length) * blob->bounding_box().bottom() + |
2319 | 819k | *gradient / g_length * blob->bounding_box().left(); |
2320 | 819k | } else if (blob->bounding_box().left() - last_x > block->line_size / 2 && |
2321 | 237k | last_x - left_x > block->line_size * 2 && textord_interpolating_skew) { |
2322 | | // tprintf("Interpolating skew from %g",block_skew); |
2323 | 457 | block_skew *= static_cast<float>(blob->bounding_box().left() - left_x) / (last_x - left_x); |
2324 | | // tprintf("to %g\n",block_skew); |
2325 | 457 | } |
2326 | 1.05M | last_x = blob->bounding_box().left(); |
2327 | 1.05M | top = blob->bounding_box().top() - block_skew; |
2328 | 1.05M | bottom = blob->bounding_box().bottom() - block_skew; |
2329 | | #ifndef GRAPHICS_DISABLED |
2330 | | if (drawing_skew) { |
2331 | | to_win->DrawTo(blob->bounding_box().left(), ycoord + block_skew); |
2332 | | } |
2333 | | #endif |
2334 | 1.05M | if (!row_it.empty()) { |
2335 | 5.12M | for (row_it.move_to_first(); !row_it.at_last() && row_it.data()->min_y() > top; |
2336 | 4.10M | row_it.forward()) { |
2337 | 4.10M | } |
2338 | 1.01M | row = row_it.data(); |
2339 | 1.01M | if (row->min_y() <= top && row->max_y() >= bottom) { |
2340 | | // any overlap |
2341 | 859k | dest_row = row; |
2342 | 859k | overlap_result = most_overlapping_row(&row_it, dest_row, top, bottom, block->line_size, |
2343 | 859k | blob->bounding_box().contains(testpt)); |
2344 | 859k | if (overlap_result == NEW_ROW && !reject_misses) { |
2345 | 15.3k | overlap_result = ASSIGN; |
2346 | 15.3k | } |
2347 | 859k | } else { |
2348 | 156k | overlap_result = NEW_ROW; |
2349 | 156k | if (!make_new_rows) { |
2350 | 92.9k | near_dist = row_it.data_relative(-1)->min_y() - top; |
2351 | | // below bottom |
2352 | 92.9k | if (bottom < row->min_y()) { |
2353 | 20.5k | if (row->min_y() - bottom <= (block->line_spacing - block->line_size) * |
2354 | 20.5k | tesseract::CCStruct::kDescenderFraction) { |
2355 | | // done it |
2356 | 1.77k | overlap_result = ASSIGN; |
2357 | 1.77k | dest_row = row; |
2358 | 1.77k | } |
2359 | 72.4k | } else if (near_dist > 0 && near_dist < bottom - row->max_y()) { |
2360 | 13.6k | row_it.backward(); |
2361 | 13.6k | dest_row = row_it.data(); |
2362 | 13.6k | if (dest_row->min_y() - bottom <= (block->line_spacing - block->line_size) * |
2363 | 13.6k | tesseract::CCStruct::kDescenderFraction) { |
2364 | | // done it |
2365 | 1.82k | overlap_result = ASSIGN; |
2366 | 1.82k | } |
2367 | 58.7k | } else { |
2368 | 58.7k | if (top - row->max_y() <= |
2369 | 58.7k | (block->line_spacing - block->line_size) * |
2370 | 58.7k | (textord_overlap_x + tesseract::CCStruct::kAscenderFraction)) { |
2371 | | // done it |
2372 | 17.4k | overlap_result = ASSIGN; |
2373 | 17.4k | dest_row = row; |
2374 | 17.4k | } |
2375 | 58.7k | } |
2376 | 92.9k | } |
2377 | 156k | } |
2378 | 1.01M | if (overlap_result == ASSIGN) { |
2379 | 832k | dest_row->add_blob(blob_it.extract(), top, bottom, block->line_size); |
2380 | 832k | } |
2381 | 1.01M | if (overlap_result == NEW_ROW) { |
2382 | 180k | if (make_new_rows && top - bottom < block->max_blob_size) { |
2383 | 87.6k | dest_row = new TO_ROW(blob_it.extract(), top, bottom, block->line_size); |
2384 | 87.6k | row_count++; |
2385 | 87.6k | if (bottom > row_it.data()->min_y()) { |
2386 | 59.2k | row_it.add_before_then_move(dest_row); |
2387 | | // insert in right place |
2388 | 59.2k | } else { |
2389 | 28.4k | row_it.add_after_then_move(dest_row); |
2390 | 28.4k | } |
2391 | 87.6k | smooth_factor = 1.0 / (row_count * textord_skew_lag + textord_skewsmooth_offset); |
2392 | 92.8k | } else { |
2393 | 92.8k | overlap_result = REJECT; |
2394 | 92.8k | } |
2395 | 180k | } |
2396 | 1.01M | } else if (make_new_rows && top - bottom < block->max_blob_size) { |
2397 | 9.96k | overlap_result = NEW_ROW; |
2398 | 9.96k | dest_row = new TO_ROW(blob_it.extract(), top, bottom, block->line_size); |
2399 | 9.96k | row_count++; |
2400 | 9.96k | row_it.add_after_then_move(dest_row); |
2401 | 9.96k | smooth_factor = 1.0 / (row_count * textord_skew_lag + textord_skewsmooth_offset2); |
2402 | 30.6k | } else { |
2403 | 30.6k | overlap_result = REJECT; |
2404 | 30.6k | } |
2405 | 1.05M | if (blob->bounding_box().contains(testpt) && textord_debug_blob) { |
2406 | 0 | if (overlap_result != REJECT) { |
2407 | 0 | tprintf("Test blob assigned to row at (%g,%g) on pass %d\n", dest_row->min_y(), |
2408 | 0 | dest_row->max_y(), pass); |
2409 | 0 | } else { |
2410 | 0 | tprintf("Test blob assigned to no row on pass %d\n", pass); |
2411 | 0 | } |
2412 | 0 | } |
2413 | 1.05M | if (overlap_result != REJECT) { |
2414 | 930k | while (!row_it.at_first() && row_it.data()->min_y() > row_it.data_relative(-1)->min_y()) { |
2415 | 13 | row = row_it.extract(); |
2416 | 13 | row_it.backward(); |
2417 | 13 | row_it.add_before_then_move(row); |
2418 | 13 | } |
2419 | 930k | while (!row_it.at_last() && row_it.data()->min_y() < row_it.data_relative(1)->min_y()) { |
2420 | 27 | row = row_it.extract(); |
2421 | 27 | row_it.forward(); |
2422 | | // Keep rows in order. |
2423 | 27 | row_it.add_after_then_move(row); |
2424 | 27 | } |
2425 | 930k | BLOBNBOX_IT added_blob_it(dest_row->blob_list()); |
2426 | 930k | added_blob_it.move_to_last(); |
2427 | 930k | TBOX prev_box = added_blob_it.data_relative(-1)->bounding_box(); |
2428 | 930k | if (dest_row->blob_list()->singleton() || !prev_box.major_x_overlap(blob->bounding_box())) { |
2429 | 719k | block_skew = (1 - smooth_factor) * block_skew + |
2430 | 719k | smooth_factor * (blob->bounding_box().bottom() - dest_row->initial_min_y()); |
2431 | 719k | } |
2432 | 930k | } |
2433 | 1.05M | } |
2434 | 280k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
2435 | 244k | if (row_it.data()->blob_list()->empty()) { |
2436 | 668 | delete row_it.extract(); // Discard empty rows. |
2437 | 668 | } |
2438 | 244k | } |
2439 | 36.1k | } |
2440 | | |
2441 | | /** |
2442 | | * @name most_overlapping_row |
2443 | | * |
2444 | | * Return the row which most overlaps the blob. |
2445 | | */ |
2446 | | OVERLAP_STATE most_overlapping_row( // find best row |
2447 | | TO_ROW_IT *row_it, // iterator |
2448 | | TO_ROW *&best_row, // output row |
2449 | | float top, // top of blob |
2450 | | float bottom, // bottom of blob |
2451 | | float rowsize, // max row size |
2452 | | bool testing_blob // test stuff |
2453 | 859k | ) { |
2454 | 859k | OVERLAP_STATE result; // result of tests |
2455 | 859k | float overlap; // of blob & row |
2456 | 859k | float bestover; // nearest row |
2457 | 859k | float merge_top, merge_bottom; // size of merged row |
2458 | 859k | ICOORD testpt; // testing only |
2459 | 859k | TO_ROW *row; // current row |
2460 | 859k | TO_ROW *test_row; // for multiple overlaps |
2461 | 859k | BLOBNBOX_IT blob_it; // for merging rows |
2462 | | |
2463 | 859k | result = ASSIGN; |
2464 | 859k | row = row_it->data(); |
2465 | 859k | bestover = top - bottom; |
2466 | 859k | if (top > row->max_y()) { |
2467 | 89.3k | bestover -= top - row->max_y(); |
2468 | 89.3k | } |
2469 | 859k | if (bottom < row->min_y()) { |
2470 | | // compute overlap |
2471 | 495k | bestover -= row->min_y() - bottom; |
2472 | 495k | } |
2473 | 859k | if (testing_blob && textord_debug_blob) { |
2474 | 0 | tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f\n", bottom, top, row->min_y(), |
2475 | 0 | row->max_y(), rowsize, bestover); |
2476 | 0 | } |
2477 | 859k | test_row = row; |
2478 | 1.56M | do { |
2479 | 1.56M | if (!row_it->at_last()) { |
2480 | 1.47M | row_it->forward(); |
2481 | 1.47M | test_row = row_it->data(); |
2482 | 1.47M | if (test_row->min_y() <= top && test_row->max_y() >= bottom) { |
2483 | 747k | merge_top = std::max(test_row->max_y(),row->max_y()); |
2484 | 747k | merge_bottom = std::min(test_row->min_y(),row->min_y()); |
2485 | 747k | if (merge_top - merge_bottom <= rowsize) { |
2486 | 3.56k | if (testing_blob && textord_debug_blob) { |
2487 | 0 | tprintf("Merging rows at (%g,%g), (%g,%g)\n", row->min_y(), row->max_y(), |
2488 | 0 | test_row->min_y(), test_row->max_y()); |
2489 | 0 | } |
2490 | 3.56k | test_row->set_limits(merge_bottom, merge_top); |
2491 | 3.56k | blob_it.set_to_list(test_row->blob_list()); |
2492 | 3.56k | blob_it.add_list_after(row->blob_list()); |
2493 | 3.56k | blob_it.sort(blob_x_order); |
2494 | 3.56k | row_it->backward(); |
2495 | 3.56k | delete row_it->extract(); |
2496 | 3.56k | row_it->forward(); |
2497 | 3.56k | bestover = -1.0f; // force replacement |
2498 | 3.56k | } |
2499 | 747k | overlap = top - bottom; |
2500 | 747k | if (top > test_row->max_y()) { |
2501 | 504k | overlap -= top - test_row->max_y(); |
2502 | 504k | } |
2503 | 747k | if (bottom < test_row->min_y()) { |
2504 | 90.1k | overlap -= test_row->min_y() - bottom; |
2505 | 90.1k | } |
2506 | 747k | if (bestover >= rowsize - 1 && overlap >= rowsize - 1) { |
2507 | 9.37k | result = REJECT; |
2508 | 9.37k | } |
2509 | 747k | if (overlap > bestover) { |
2510 | 413k | bestover = overlap; // find biggest overlap |
2511 | 413k | row = test_row; |
2512 | 413k | } |
2513 | 747k | if (testing_blob && textord_debug_blob) { |
2514 | 0 | tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f->%f\n", bottom, top, |
2515 | 0 | test_row->min_y(), test_row->max_y(), rowsize, overlap, bestover); |
2516 | 0 | } |
2517 | 747k | } |
2518 | 1.47M | } |
2519 | 1.56M | } while (!row_it->at_last() && test_row->min_y() <= top && test_row->max_y() >= bottom); |
2520 | 1.92M | while (row_it->data() != row) { |
2521 | 1.06M | row_it->backward(); // make it point to row |
2522 | 1.06M | } |
2523 | | // doesn't overlap much |
2524 | 859k | if (top - bottom - bestover > rowsize * textord_overlap_x && |
2525 | 859k | (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x) && result == ASSIGN) { |
2526 | 60.2k | result = NEW_ROW; // doesn't overlap enough |
2527 | 60.2k | } |
2528 | 859k | best_row = row; |
2529 | 859k | return result; |
2530 | 859k | } |
2531 | | |
2532 | | /** |
2533 | | * @name blob_x_order |
2534 | | * |
2535 | | * Sort function to sort blobs in x from page left. |
2536 | | */ |
2537 | | int blob_x_order( // sort function |
2538 | | const BLOBNBOX *blob1, // items to compare |
2539 | 8.75M | const BLOBNBOX *blob2) { |
2540 | 8.75M | if (blob1->bounding_box().left() < blob2->bounding_box().left()) { |
2541 | 5.24M | return -1; |
2542 | 5.24M | } else if (blob1->bounding_box().left() > blob2->bounding_box().left()) { |
2543 | 2.45M | return 1; |
2544 | 2.45M | } else { |
2545 | 1.05M | return 0; |
2546 | 1.05M | } |
2547 | 8.75M | } |
2548 | | |
2549 | | /** |
2550 | | * @name mark_repeated_chars |
2551 | | * |
2552 | | * Mark blobs marked with BTFT_LEADER in repeated sets using the |
2553 | | * repeated_set member of BLOBNBOX. |
2554 | | */ |
2555 | 56.2k | void mark_repeated_chars(TO_ROW *row) { |
2556 | 56.2k | BLOBNBOX_IT box_it(row->blob_list()); // Iterator. |
2557 | 56.2k | int num_repeated_sets = 0; |
2558 | 56.2k | if (!box_it.empty()) { |
2559 | 545k | do { |
2560 | 545k | BLOBNBOX *bblob = box_it.data(); |
2561 | 545k | int repeat_length = 1; |
2562 | 545k | if (bblob->flow() == BTFT_LEADER && !bblob->joined_to_prev() && bblob->cblob() != nullptr) { |
2563 | 0 | BLOBNBOX_IT test_it(box_it); |
2564 | 0 | for (test_it.forward(); !test_it.at_first();) { |
2565 | 0 | bblob = test_it.data(); |
2566 | 0 | if (bblob->flow() != BTFT_LEADER) { |
2567 | 0 | break; |
2568 | 0 | } |
2569 | 0 | test_it.forward(); |
2570 | 0 | bblob = test_it.data(); |
2571 | 0 | if (bblob->joined_to_prev() || bblob->cblob() == nullptr) { |
2572 | 0 | repeat_length = 0; |
2573 | 0 | break; |
2574 | 0 | } |
2575 | 0 | ++repeat_length; |
2576 | 0 | } |
2577 | 0 | } |
2578 | 545k | if (repeat_length >= kMinLeaderCount) { |
2579 | 0 | num_repeated_sets++; |
2580 | 0 | for (; repeat_length > 0; box_it.forward(), --repeat_length) { |
2581 | 0 | bblob = box_it.data(); |
2582 | 0 | bblob->set_repeated_set(num_repeated_sets); |
2583 | 0 | } |
2584 | 545k | } else { |
2585 | 545k | bblob->set_repeated_set(0); |
2586 | 545k | box_it.forward(); |
2587 | 545k | } |
2588 | 545k | } while (!box_it.at_first()); // until all done |
2589 | 56.2k | } |
2590 | 56.2k | row->set_num_repeated_sets(num_repeated_sets); |
2591 | 56.2k | } |
2592 | | |
2593 | | } // namespace tesseract |