/src/tesseract/src/textord/tordmain.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: tordmain.cpp (Formerly textordp.c) |
3 | | * Description: C++ top level textord code. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1992, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #define _USE_MATH_DEFINES // for M_PI |
20 | | |
21 | | #ifdef HAVE_CONFIG_H |
22 | | # include "config_auto.h" |
23 | | #endif |
24 | | |
25 | | #include "tordmain.h" |
26 | | |
27 | | #include "arrayaccess.h" // for GET_DATA_BYTE |
28 | | #include "blobbox.h" // for BLOBNBOX_IT, BLOBNBOX, TO_BLOCK, TO_B... |
29 | | #include "ccstruct.h" // for CCStruct, CCStruct::kXHeightFraction |
30 | | #include "clst.h" // for CLISTIZE |
31 | | #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST, C_OUTLINE |
32 | | #include "drawtord.h" // for plot_box_list, to_win, create_to_win |
33 | | #include "edgblob.h" // for extract_edges |
34 | | #include "errcode.h" // for ASSERT_HOST, ... |
35 | | #include "makerow.h" // for textord_test_x, textord_test_y, texto... |
36 | | #include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only) |
37 | | #include "ocrrow.h" // for ROW, ROW_IT, ROW_LIST, tweak_row_base... |
38 | | #include "params.h" // for DoubleParam, BoolParam, IntParam |
39 | | #include "pdblock.h" // for PDBLK |
40 | | #include "points.h" // for FCOORD, ICOORD |
41 | | #include "polyblk.h" // for POLY_BLOCK |
42 | | #include "quadratc.h" // for QUAD_COEFFS |
43 | | #include "quspline.h" // for QSPLINE, tweak_row_baseline |
44 | | #include "rect.h" // for TBOX |
45 | | #include "scrollview.h" // for ScrollView, ScrollView::WHITE |
46 | | #include "statistc.h" // for STATS |
47 | | #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST |
48 | | #include "textord.h" // for Textord, WordWithBox, WordGrid, WordS... |
49 | | #include "tprintf.h" // for tprintf |
50 | | #include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP |
51 | | |
52 | | #include <allheaders.h> // for pixDestroy, pixGetHeight, boxCreate |
53 | | |
54 | | #include <cfloat> // for FLT_MAX |
55 | | #include <cmath> // for ceil, floor, M_PI |
56 | | #include <cstdint> // for INT16_MAX, uint32_t, int32_t, int16_t |
57 | | #include <memory> |
58 | | |
59 | | namespace tesseract { |
60 | | |
61 | 7.72k | #define MAX_NEAREST_DIST 600 // for block skew stats |
62 | | |
63 | | /********************************************************************** |
64 | | * SetBlobStrokeWidth |
65 | | * |
66 | | * Set the horizontal and vertical stroke widths in the blob. |
67 | | **********************************************************************/ |
68 | 2.07M | void SetBlobStrokeWidth(Image pix, BLOBNBOX *blob) { |
69 | | // Cut the blob rectangle into a Pix. |
70 | 2.07M | int pix_height = pixGetHeight(pix); |
71 | 2.07M | const TBOX &box = blob->bounding_box(); |
72 | 2.07M | int width = box.width(); |
73 | 2.07M | int height = box.height(); |
74 | 2.07M | Box *blob_pix_box = boxCreate(box.left(), pix_height - box.top(), width, height); |
75 | 2.07M | Image pix_blob = pixClipRectangle(pix, blob_pix_box, nullptr); |
76 | 2.07M | boxDestroy(&blob_pix_box); |
77 | 2.07M | Image dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG); |
78 | 2.07M | pix_blob.destroy(); |
79 | | // Compute the stroke widths. |
80 | 2.07M | uint32_t *data = pixGetData(dist_pix); |
81 | 2.07M | int wpl = pixGetWpl(dist_pix); |
82 | | // Horizontal width of stroke. |
83 | 2.07M | STATS h_stats(0, width); |
84 | 15.7M | for (int y = 0; y < height; ++y) { |
85 | 13.6M | uint32_t *pixels = data + y * wpl; |
86 | 13.6M | int prev_pixel = 0; |
87 | 13.6M | int pixel = GET_DATA_BYTE(pixels, 0); |
88 | 124M | for (int x = 1; x < width; ++x) { |
89 | 111M | int next_pixel = GET_DATA_BYTE(pixels, x); |
90 | | // We are looking for a pixel that is equal to its vertical neighbours, |
91 | | // yet greater than its left neighbour. |
92 | 111M | if (prev_pixel < pixel && (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && |
93 | 111M | (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) { |
94 | 10.2M | if (pixel > next_pixel) { |
95 | | // Single local max, so an odd width. |
96 | 5.33M | h_stats.add(pixel * 2 - 1, 1); |
97 | 5.33M | } else if (pixel == next_pixel && x + 1 < width && pixel > GET_DATA_BYTE(pixels, x + 1)) { |
98 | | // Double local max, so an even width. |
99 | 2.65M | h_stats.add(pixel * 2, 1); |
100 | 2.65M | } |
101 | 10.2M | } |
102 | 111M | prev_pixel = pixel; |
103 | 111M | pixel = next_pixel; |
104 | 111M | } |
105 | 13.6M | } |
106 | | // Vertical width of stroke. |
107 | 2.07M | STATS v_stats(0, height); |
108 | 12.6M | for (int x = 0; x < width; ++x) { |
109 | 10.6M | int prev_pixel = 0; |
110 | 10.6M | int pixel = GET_DATA_BYTE(data, x); |
111 | 124M | for (int y = 1; y < height; ++y) { |
112 | 114M | uint32_t *pixels = data + y * wpl; |
113 | 114M | int next_pixel = GET_DATA_BYTE(pixels, x); |
114 | | // We are looking for a pixel that is equal to its horizontal neighbours, |
115 | | // yet greater than its upper neighbour. |
116 | 114M | if (prev_pixel < pixel && (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && |
117 | 114M | (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) { |
118 | 11.9M | if (pixel > next_pixel) { |
119 | | // Single local max, so an odd width. |
120 | 5.62M | v_stats.add(pixel * 2 - 1, 1); |
121 | 6.33M | } else if (pixel == next_pixel && y + 1 < height && |
122 | 6.33M | pixel > GET_DATA_BYTE(pixels + wpl, x)) { |
123 | | // Double local max, so an even width. |
124 | 2.40M | v_stats.add(pixel * 2, 1); |
125 | 2.40M | } |
126 | 11.9M | } |
127 | 114M | prev_pixel = pixel; |
128 | 114M | pixel = next_pixel; |
129 | 114M | } |
130 | 10.6M | } |
131 | 2.07M | dist_pix.destroy(); |
132 | | // Store the horizontal and vertical width in the blob, keeping both |
133 | | // widths if there is enough information, otherwise only the one with |
134 | | // the most samples. |
135 | | // If there are insufficient samples, store zero, rather than using |
136 | | // 2*area/perimeter, as the numbers that gives do not match the numbers |
137 | | // from the distance method. |
138 | 2.07M | if (h_stats.get_total() >= (width + height) / 4) { |
139 | 499k | blob->set_horz_stroke_width(h_stats.ile(0.5f)); |
140 | 499k | if (v_stats.get_total() >= (width + height) / 4) { |
141 | 138k | blob->set_vert_stroke_width(v_stats.ile(0.5f)); |
142 | 360k | } else { |
143 | 360k | blob->set_vert_stroke_width(0.0f); |
144 | 360k | } |
145 | 1.57M | } else { |
146 | 1.57M | if (v_stats.get_total() >= (width + height) / 4 || v_stats.get_total() > h_stats.get_total()) { |
147 | 266k | blob->set_horz_stroke_width(0.0f); |
148 | 266k | blob->set_vert_stroke_width(v_stats.ile(0.5f)); |
149 | 1.30M | } else { |
150 | 1.30M | blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f) : 0.0f); |
151 | 1.30M | blob->set_vert_stroke_width(0.0f); |
152 | 1.30M | } |
153 | 1.57M | } |
154 | 2.07M | } |
155 | | |
156 | | /********************************************************************** |
157 | | * assign_blobs_to_blocks2 |
158 | | * |
159 | | * Make a list of TO_BLOCKs for portrait and landscape orientation. |
160 | | **********************************************************************/ |
161 | | |
162 | | void assign_blobs_to_blocks2(Image pix, |
163 | | BLOCK_LIST *blocks, // blocks to process |
164 | 7.72k | TO_BLOCK_LIST *port_blocks) { // output list |
165 | 7.72k | BLOCK_IT block_it = blocks; |
166 | 7.72k | C_BLOB_IT blob_it; // iterator |
167 | 7.72k | BLOBNBOX_IT port_box_it; // iterator |
168 | | // destination iterator |
169 | 7.72k | TO_BLOCK_IT port_block_it = port_blocks; |
170 | | |
171 | 15.4k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
172 | 7.72k | auto block = block_it.data(); |
173 | 7.72k | auto port_block = new TO_BLOCK(block); |
174 | | |
175 | | // Convert the good outlines to block->blob_list |
176 | 7.72k | port_box_it.set_to_list(&port_block->blobs); |
177 | 7.72k | blob_it.set_to_list(block->blob_list()); |
178 | 2.07M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
179 | 2.07M | auto blob = blob_it.extract(); |
180 | 2.07M | auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. |
181 | 2.07M | newblob->set_owns_cblob(true); |
182 | 2.07M | SetBlobStrokeWidth(pix, newblob); |
183 | 2.07M | port_box_it.add_after_then_move(newblob); |
184 | 2.07M | } |
185 | | |
186 | | // Put the rejected outlines in block->noise_blobs, which allows them to |
187 | | // be reconsidered and sorted back into rows and recover outlines mistakenly |
188 | | // rejected. |
189 | 7.72k | port_box_it.set_to_list(&port_block->noise_blobs); |
190 | 7.72k | blob_it.set_to_list(block->reject_blobs()); |
191 | 10.1k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
192 | 2.45k | auto blob = blob_it.extract(); |
193 | 2.45k | auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. |
194 | 2.45k | newblob->set_owns_cblob(true); |
195 | 2.45k | SetBlobStrokeWidth(pix, newblob); |
196 | 2.45k | port_box_it.add_after_then_move(newblob); |
197 | 2.45k | } |
198 | | |
199 | 7.72k | port_block_it.add_after_then_move(port_block); |
200 | 7.72k | } |
201 | 7.72k | } |
202 | | |
203 | | /********************************************************************** |
204 | | * find_components |
205 | | * |
206 | | * Find the C_OUTLINEs of the connected components in each block, put them |
207 | | * in C_BLOBs, and filter them by size, putting the different size |
208 | | * grades on different lists in the matching TO_BLOCK in to_blocks. |
209 | | **********************************************************************/ |
210 | | |
211 | 7.72k | void Textord::find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) { |
212 | 7.72k | int width = pixGetWidth(pix); |
213 | 7.72k | int height = pixGetHeight(pix); |
214 | 7.72k | if (width > INT16_MAX || height > INT16_MAX) { |
215 | 0 | tprintf("Input image too large! (%d, %d)\n", width, height); |
216 | 0 | return; // Can't handle it. |
217 | 0 | } |
218 | | |
219 | 7.72k | BLOCK_IT block_it(blocks); // iterator |
220 | 15.4k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
221 | 7.72k | BLOCK *block = block_it.data(); |
222 | 7.72k | if (block->pdblk.poly_block() == nullptr || block->pdblk.poly_block()->IsText()) { |
223 | 7.72k | extract_edges(pix, block); |
224 | 7.72k | } |
225 | 7.72k | } |
226 | | |
227 | 7.72k | assign_blobs_to_blocks2(pix, blocks, to_blocks); |
228 | 7.72k | ICOORD page_tr(width, height); |
229 | 7.72k | filter_blobs(page_tr, to_blocks, !textord_test_landscape); |
230 | 7.72k | } |
231 | | |
232 | | /********************************************************************** |
233 | | * filter_blobs |
234 | | * |
235 | | * Sort the blobs into sizes in all the blocks for later work. |
236 | | **********************************************************************/ |
237 | | |
238 | | void Textord::filter_blobs(ICOORD page_tr, // top right |
239 | | TO_BLOCK_LIST *blocks, // output list |
240 | 7.72k | bool testing_on) { // for plotting |
241 | 7.72k | TO_BLOCK_IT block_it = blocks; // destination iterator |
242 | 7.72k | TO_BLOCK *block; // created block |
243 | | |
244 | | #ifndef GRAPHICS_DISABLED |
245 | | if (to_win != nullptr) { |
246 | | to_win->Clear(); |
247 | | } |
248 | | #endif // !GRAPHICS_DISABLED |
249 | | |
250 | 15.4k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
251 | 7.72k | block = block_it.data(); |
252 | 7.72k | block->line_size = filter_noise_blobs(&block->blobs, &block->noise_blobs, &block->small_blobs, |
253 | 7.72k | &block->large_blobs); |
254 | 7.72k | if (block->line_size == 0) { |
255 | 190 | block->line_size = 1; |
256 | 190 | } |
257 | 7.72k | block->line_spacing = |
258 | 7.72k | block->line_size * |
259 | 7.72k | (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction + |
260 | 7.72k | 2 * tesseract::CCStruct::kAscenderFraction) / |
261 | 7.72k | tesseract::CCStruct::kXHeightFraction; |
262 | 7.72k | block->line_size *= textord_min_linesize; |
263 | 7.72k | block->max_blob_size = block->line_size * textord_excess_blobsize; |
264 | | |
265 | | #ifndef GRAPHICS_DISABLED |
266 | | if (textord_show_blobs && testing_on) { |
267 | | if (to_win == nullptr) { |
268 | | create_to_win(page_tr); |
269 | | } |
270 | | block->plot_graded_blobs(to_win); |
271 | | } |
272 | | if (textord_show_boxes && testing_on) { |
273 | | if (to_win == nullptr) { |
274 | | create_to_win(page_tr); |
275 | | } |
276 | | plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE); |
277 | | plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE); |
278 | | plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE); |
279 | | plot_box_list(to_win, &block->blobs, ScrollView::WHITE); |
280 | | } |
281 | | #endif // !GRAPHICS_DISABLED |
282 | 7.72k | } |
283 | 7.72k | } |
284 | | |
285 | | /********************************************************************** |
286 | | * filter_noise_blobs |
287 | | * |
288 | | * Move small blobs to a separate list. |
289 | | **********************************************************************/ |
290 | | |
291 | | float Textord::filter_noise_blobs(BLOBNBOX_LIST *src_list, // original list |
292 | | BLOBNBOX_LIST *noise_list, // noise list |
293 | | BLOBNBOX_LIST *small_list, // small blobs |
294 | 7.72k | BLOBNBOX_LIST *large_list) { // large blobs |
295 | 7.72k | int16_t height; // height of blob |
296 | 7.72k | int16_t width; // of blob |
297 | 7.72k | BLOBNBOX *blob; // current blob |
298 | 7.72k | float initial_x; // first guess |
299 | 7.72k | BLOBNBOX_IT src_it = src_list; // iterators |
300 | 7.72k | BLOBNBOX_IT noise_it = noise_list; |
301 | 7.72k | BLOBNBOX_IT small_it = small_list; |
302 | 7.72k | BLOBNBOX_IT large_it = large_list; |
303 | 7.72k | STATS size_stats(0, MAX_NEAREST_DIST - 1); |
304 | | // blob heights |
305 | 7.72k | float min_y; // size limits |
306 | 7.72k | float max_y; |
307 | 7.72k | float max_x; |
308 | 7.72k | float max_height; // of good blobs |
309 | | |
310 | 2.07M | for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { |
311 | 2.07M | blob = src_it.data(); |
312 | 2.07M | if (blob->bounding_box().height() < textord_max_noise_size) { |
313 | 1.19M | noise_it.add_after_then_move(src_it.extract()); |
314 | 1.19M | } else if (blob->enclosed_area() >= blob->bounding_box().height() * |
315 | 874k | blob->bounding_box().width() * |
316 | 874k | textord_noise_area_ratio) { |
317 | 454k | small_it.add_after_then_move(src_it.extract()); |
318 | 454k | } |
319 | 2.07M | } |
320 | 427k | for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { |
321 | 420k | size_stats.add(src_it.data()->bounding_box().height(), 1); |
322 | 420k | } |
323 | 7.72k | initial_x = size_stats.ile(textord_initialx_ile); |
324 | 7.72k | max_y = ceil(initial_x * |
325 | 7.72k | (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction + |
326 | 7.72k | 2 * tesseract::CCStruct::kAscenderFraction) / |
327 | 7.72k | tesseract::CCStruct::kXHeightFraction); |
328 | 7.72k | min_y = std::floor(initial_x / 2); |
329 | 7.72k | max_x = ceil(initial_x * textord_width_limit); |
330 | 7.72k | small_it.move_to_first(); |
331 | 462k | for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) { |
332 | 454k | height = small_it.data()->bounding_box().height(); |
333 | 454k | if (height > max_y) { |
334 | 2.83k | large_it.add_after_then_move(small_it.extract()); |
335 | 451k | } else if (height >= min_y) { |
336 | 436k | src_it.add_after_then_move(small_it.extract()); |
337 | 436k | } |
338 | 454k | } |
339 | 7.72k | size_stats.clear(); |
340 | 864k | for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { |
341 | 856k | height = src_it.data()->bounding_box().height(); |
342 | 856k | width = src_it.data()->bounding_box().width(); |
343 | 856k | if (height < min_y) { |
344 | 17.4k | small_it.add_after_then_move(src_it.extract()); |
345 | 839k | } else if (height > max_y || width > max_x) { |
346 | 13.5k | large_it.add_after_then_move(src_it.extract()); |
347 | 825k | } else { |
348 | 825k | size_stats.add(height, 1); |
349 | 825k | } |
350 | 856k | } |
351 | 7.72k | max_height = size_stats.ile(textord_initialasc_ile); |
352 | | // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,", |
353 | | // max_y,min_y,initial_x,max_height); |
354 | 7.72k | max_height *= tesseract::CCStruct::kXHeightCapRatio; |
355 | 7.72k | if (max_height > initial_x) { |
356 | 1.03k | initial_x = max_height; |
357 | 1.03k | } |
358 | | // tprintf(" ret=%g\n",initial_x); |
359 | 7.72k | return initial_x; |
360 | 7.72k | } |
361 | | |
362 | | // Fixes the block so it obeys all the rules: |
363 | | // Must have at least one ROW. |
364 | | // Must have at least one WERD. |
365 | | // WERDs contain a fake blob. |
366 | 0 | void Textord::cleanup_nontext_block(BLOCK *block) { |
367 | | // Non-text blocks must contain at least one row. |
368 | 0 | ROW_IT row_it(block->row_list()); |
369 | 0 | if (row_it.empty()) { |
370 | 0 | const TBOX &box = block->pdblk.bounding_box(); |
371 | 0 | float height = box.height(); |
372 | 0 | int32_t xstarts[2] = {box.left(), box.right()}; |
373 | 0 | double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())}; |
374 | 0 | ROW *row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1); |
375 | 0 | row_it.add_after_then_move(row); |
376 | 0 | } |
377 | | // Each row must contain at least one word. |
378 | 0 | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
379 | 0 | ROW *row = row_it.data(); |
380 | 0 | WERD_IT w_it(row->word_list()); |
381 | 0 | if (w_it.empty()) { |
382 | | // Make a fake blob to put in the word. |
383 | 0 | TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box(); |
384 | 0 | C_BLOB *blob = C_BLOB::FakeBlob(box); |
385 | 0 | C_BLOB_LIST blobs; |
386 | 0 | C_BLOB_IT blob_it(&blobs); |
387 | 0 | blob_it.add_after_then_move(blob); |
388 | 0 | WERD *word = new WERD(&blobs, 0, nullptr); |
389 | 0 | w_it.add_after_then_move(word); |
390 | 0 | } |
391 | | // Each word must contain a fake blob. |
392 | 0 | for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { |
393 | 0 | WERD *word = w_it.data(); |
394 | | // Just assert that this is true, as it would be useful to find |
395 | | // out why it isn't. |
396 | 0 | ASSERT_HOST(!word->cblob_list()->empty()); |
397 | 0 | } |
398 | 0 | row->recalc_bounding_box(); |
399 | 0 | } |
400 | 0 | } |
401 | | |
402 | | /********************************************************************** |
403 | | * cleanup_blocks |
404 | | * |
405 | | * Delete empty blocks, rows from the page. |
406 | | **********************************************************************/ |
407 | | |
408 | 7.72k | void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) { |
409 | 7.72k | BLOCK_IT block_it = blocks; // iterator |
410 | 7.72k | ROW_IT row_it; // row iterator |
411 | | |
412 | 7.72k | int num_rows = 0; |
413 | 7.72k | int num_rows_all = 0; |
414 | 7.72k | int num_blocks = 0; |
415 | 7.72k | int num_blocks_all = 0; |
416 | 15.4k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
417 | 7.72k | BLOCK *block = block_it.data(); |
418 | 7.72k | if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { |
419 | 0 | cleanup_nontext_block(block); |
420 | 0 | continue; |
421 | 0 | } |
422 | 7.72k | num_rows = 0; |
423 | 7.72k | num_rows_all = 0; |
424 | 7.72k | if (clean_noise) { |
425 | 7.72k | row_it.set_to_list(block->row_list()); |
426 | 130k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
427 | 122k | ROW *row = row_it.data(); |
428 | 122k | ++num_rows_all; |
429 | 122k | clean_small_noise_from_words(row); |
430 | 122k | if ((textord_noise_rejrows && !row->word_list()->empty() && clean_noise_from_row(row)) || |
431 | 122k | row->word_list()->empty()) { |
432 | 27.0k | delete row_it.extract(); // lose empty row. |
433 | 95.5k | } else { |
434 | 95.5k | if (textord_noise_rejwords) { |
435 | 95.5k | clean_noise_from_words(row_it.data()); |
436 | 95.5k | } |
437 | 95.5k | if (textord_blshift_maxshift >= 0) { |
438 | 95.5k | tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction); |
439 | 95.5k | } |
440 | 95.5k | ++num_rows; |
441 | 95.5k | } |
442 | 122k | } |
443 | 7.72k | } |
444 | 7.72k | if (block->row_list()->empty()) { |
445 | 665 | delete block_it.extract(); // Lose empty text blocks. |
446 | 7.06k | } else { |
447 | 7.06k | ++num_blocks; |
448 | 7.06k | } |
449 | 7.72k | ++num_blocks_all; |
450 | 7.72k | if (textord_noise_debug) { |
451 | 0 | tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all); |
452 | 0 | } |
453 | 7.72k | } |
454 | 7.72k | if (textord_noise_debug) { |
455 | 0 | tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all); |
456 | 0 | } |
457 | 7.72k | } |
458 | | |
459 | | /********************************************************************** |
460 | | * clean_noise_from_row |
461 | | * |
462 | | * Move blobs of words from rows of garbage into the reject blobs list. |
463 | | **********************************************************************/ |
464 | | |
465 | | bool Textord::clean_noise_from_row( // remove empties |
466 | | ROW *row // row to clean |
467 | 117k | ) { |
468 | 117k | bool testing_on; |
469 | 117k | TBOX blob_box; // bounding box |
470 | 117k | C_BLOB *blob; // current blob |
471 | 117k | C_OUTLINE *outline; // current outline |
472 | 117k | WERD *word; // current word |
473 | 117k | int32_t blob_size; // biggest size |
474 | 117k | int32_t trans_count = 0; // no of transitions |
475 | 117k | int32_t trans_threshold; // noise tolerance |
476 | 117k | int32_t dot_count; // small objects |
477 | 117k | int32_t norm_count; // normal objects |
478 | 117k | int32_t super_norm_count; // real char-like |
479 | | // words of row |
480 | 117k | WERD_IT word_it = row->word_list(); |
481 | 117k | C_BLOB_IT blob_it; // blob iterator |
482 | 117k | C_OUTLINE_IT out_it; // outline iterator |
483 | | |
484 | 117k | testing_on = textord_test_y > row->base_line(textord_test_x) && textord_show_blobs && |
485 | 117k | textord_test_y < row->base_line(textord_test_x) + row->x_height(); |
486 | 117k | dot_count = 0; |
487 | 117k | norm_count = 0; |
488 | 117k | super_norm_count = 0; |
489 | 322k | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
490 | 205k | word = word_it.data(); // current word |
491 | | // blobs in word |
492 | 205k | blob_it.set_to_list(word->cblob_list()); |
493 | 999k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
494 | 794k | blob = blob_it.data(); |
495 | 794k | if (!word->flag(W_DONT_CHOP)) { |
496 | | // get outlines |
497 | 765k | out_it.set_to_list(blob->out_list()); |
498 | 2.13M | for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { |
499 | 1.36M | outline = out_it.data(); |
500 | 1.36M | blob_box = outline->bounding_box(); |
501 | 1.36M | blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height(); |
502 | 1.36M | if (blob_size < textord_noise_sizelimit * row->x_height()) { |
503 | 834k | dot_count++; // count small outlines |
504 | 834k | } |
505 | 1.36M | if (!outline->child()->empty() && |
506 | 1.36M | blob_box.height() < (1 + textord_noise_syfract) * row->x_height() && |
507 | 1.36M | blob_box.height() > (1 - textord_noise_syfract) * row->x_height() && |
508 | 1.36M | blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() && |
509 | 1.36M | blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) { |
510 | 2.54k | super_norm_count++; // count small outlines |
511 | 2.54k | } |
512 | 1.36M | } |
513 | 765k | } else { |
514 | 28.8k | super_norm_count++; |
515 | 28.8k | } |
516 | 794k | blob_box = blob->bounding_box(); |
517 | 794k | blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height(); |
518 | 794k | if (blob_size >= textord_noise_sizelimit * row->x_height() && |
519 | 794k | blob_size < row->x_height() * 2) { |
520 | 508k | trans_threshold = blob_size / textord_noise_sizefraction; |
521 | 508k | trans_count = blob->count_transitions(trans_threshold); |
522 | 508k | if (trans_count < textord_noise_translimit) { |
523 | 449k | norm_count++; |
524 | 449k | } |
525 | 508k | } else if (blob_box.height() > row->x_height() * 2 && |
526 | 285k | (!word_it.at_first() || !blob_it.at_first())) { |
527 | 10.9k | dot_count += 2; |
528 | 10.9k | } |
529 | 794k | if (testing_on) { |
530 | 0 | tprintf("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", blob_box.left(), |
531 | 0 | blob_box.bottom(), blob_box.right(), blob_box.top(), blob->out_list()->length(), |
532 | 0 | trans_count, blob_box.bottom() - row->base_line(blob_box.left())); |
533 | 0 | } |
534 | 794k | } |
535 | 205k | } |
536 | | // TODO: check whether `&& super_norm_count < textord_noise_sncount`should always be added here. |
537 | 117k | bool rejected = dot_count > norm_count * textord_noise_normratio && |
538 | 117k | dot_count > 2; |
539 | 117k | if (textord_noise_debug) { |
540 | 0 | tprintf("Row ending at (%d,%g):", blob_box.right(), row->base_line(blob_box.right())); |
541 | 0 | tprintf(" R=%g, dc=%d, nc=%d, %s\n", |
542 | 0 | norm_count > 0 ? static_cast<float>(dot_count) / norm_count : 9999, dot_count, |
543 | 0 | norm_count, |
544 | 0 | rejected? "REJECTED": "ACCEPTED"); |
545 | 0 | } |
546 | 117k | return super_norm_count < textord_noise_sncount && rejected; |
547 | 117k | } |
548 | | |
549 | | /********************************************************************** |
550 | | * clean_noise_from_words |
551 | | * |
552 | | * Move blobs of words from rows of garbage into the reject blobs list. |
553 | | **********************************************************************/ |
554 | | |
555 | | void Textord::clean_noise_from_words( // remove empties |
556 | | ROW *row // row to clean |
557 | 95.5k | ) { |
558 | 95.5k | TBOX blob_box; // bounding box |
559 | 95.5k | C_BLOB *blob; // current blob |
560 | 95.5k | C_OUTLINE *outline; // current outline |
561 | 95.5k | WERD *word; // current word |
562 | 95.5k | int32_t blob_size; // biggest size |
563 | 95.5k | int32_t trans_count; // no of transitions |
564 | 95.5k | int32_t trans_threshold; // noise tolerance |
565 | 95.5k | int32_t dot_count; // small objects |
566 | 95.5k | int32_t norm_count; // normal objects |
567 | 95.5k | int32_t dud_words; // number discarded |
568 | 95.5k | int32_t ok_words; // number remaining |
569 | 95.5k | int32_t word_index; // current word |
570 | | // words of row |
571 | 95.5k | WERD_IT word_it = row->word_list(); |
572 | 95.5k | C_BLOB_IT blob_it; // blob iterator |
573 | 95.5k | C_OUTLINE_IT out_it; // outline iterator |
574 | | |
575 | 95.5k | ok_words = word_it.length(); |
576 | 95.5k | if (ok_words == 0 || textord_no_rejects) { |
577 | 0 | return; |
578 | 0 | } |
579 | | // was it chucked |
580 | 95.5k | std::vector<int8_t> word_dud(ok_words); |
581 | 95.5k | dud_words = 0; |
582 | 95.5k | ok_words = 0; |
583 | 95.5k | word_index = 0; |
584 | 241k | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
585 | 145k | word = word_it.data(); // current word |
586 | 145k | dot_count = 0; |
587 | 145k | norm_count = 0; |
588 | | // blobs in word |
589 | 145k | blob_it.set_to_list(word->cblob_list()); |
590 | 621k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
591 | 476k | blob = blob_it.data(); |
592 | 476k | if (!word->flag(W_DONT_CHOP)) { |
593 | | // get outlines |
594 | 447k | out_it.set_to_list(blob->out_list()); |
595 | 1.10M | for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { |
596 | 659k | outline = out_it.data(); |
597 | 659k | blob_box = outline->bounding_box(); |
598 | 659k | blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height(); |
599 | 659k | if (blob_size < textord_noise_sizelimit * row->x_height()) { |
600 | 255k | dot_count++; // count small outlines |
601 | 255k | } |
602 | 659k | if (!outline->child()->empty() && |
603 | 659k | blob_box.height() < (1 + textord_noise_syfract) * row->x_height() && |
604 | 659k | blob_box.height() > (1 - textord_noise_syfract) * row->x_height() && |
605 | 659k | blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() && |
606 | 659k | blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) { |
607 | 2.54k | norm_count++; // count small outlines |
608 | 2.54k | } |
609 | 659k | } |
610 | 447k | } else { |
611 | 28.8k | norm_count++; |
612 | 28.8k | } |
613 | 476k | blob_box = blob->bounding_box(); |
614 | 476k | blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height(); |
615 | 476k | if (blob_size >= textord_noise_sizelimit * row->x_height() && |
616 | 476k | blob_size < row->x_height() * 2) { |
617 | 377k | trans_threshold = blob_size / textord_noise_sizefraction; |
618 | 377k | trans_count = blob->count_transitions(trans_threshold); |
619 | 377k | if (trans_count < textord_noise_translimit) { |
620 | 349k | norm_count++; |
621 | 349k | } |
622 | 377k | } else if (blob_box.height() > row->x_height() * 2 && |
623 | 99.0k | (!word_it.at_first() || !blob_it.at_first())) { |
624 | 5.38k | dot_count += 2; |
625 | 5.38k | } |
626 | 476k | } |
627 | 145k | if (dot_count > 2 && !word->flag(W_REP_CHAR)) { |
628 | 19.8k | if (dot_count > norm_count * textord_noise_normratio * 2) { |
629 | 3.34k | word_dud[word_index] = 2; |
630 | 16.4k | } else if (dot_count > norm_count * textord_noise_normratio) { |
631 | 2.82k | word_dud[word_index] = 1; |
632 | 13.6k | } else { |
633 | 13.6k | word_dud[word_index] = 0; |
634 | 13.6k | } |
635 | 125k | } else { |
636 | 125k | word_dud[word_index] = 0; |
637 | 125k | } |
638 | 145k | if (word_dud[word_index] == 2) { |
639 | 3.34k | dud_words++; |
640 | 142k | } else { |
641 | 142k | ok_words++; |
642 | 142k | } |
643 | 145k | word_index++; |
644 | 145k | } |
645 | | |
646 | 95.5k | word_index = 0; |
647 | 241k | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
648 | 145k | if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { |
649 | 3.54k | word = word_it.data(); // Current word. |
650 | | // Previously we threw away the entire word. |
651 | | // Now just aggressively throw all small blobs into the reject list, where |
652 | | // the classifier can decide whether they are actually needed. |
653 | 3.54k | word->CleanNoise(textord_noise_sizelimit * row->x_height()); |
654 | 3.54k | } |
655 | 145k | word_index++; |
656 | 145k | } |
657 | 95.5k | } |
658 | | |
659 | | // Remove outlines that are a tiny fraction in either width or height |
660 | | // of the word height. |
661 | 122k | void Textord::clean_small_noise_from_words(ROW *row) { |
662 | 122k | WERD_IT word_it(row->word_list()); |
663 | 334k | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
664 | 211k | WERD *word = word_it.data(); |
665 | 211k | int min_size = static_cast<int>(textord_noise_hfract * word->bounding_box().height() + 0.5); |
666 | 211k | C_BLOB_IT blob_it(word->cblob_list()); |
667 | 1.17M | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
668 | 963k | C_BLOB *blob = blob_it.data(); |
669 | 963k | C_OUTLINE_IT out_it(blob->out_list()); |
670 | 2.74M | for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { |
671 | 1.78M | C_OUTLINE *outline = out_it.data(); |
672 | 1.78M | outline->RemoveSmallRecursive(min_size, &out_it); |
673 | 1.78M | } |
674 | 963k | if (blob->out_list()->empty()) { |
675 | 169k | delete blob_it.extract(); |
676 | 169k | } |
677 | 963k | } |
678 | 211k | if (word->cblob_list()->empty()) { |
679 | 6.70k | if (!word_it.at_last()) { |
680 | | // The next word is no longer a fuzzy non space if it was before, |
681 | | // since the word before is about to be deleted. |
682 | 847 | WERD *next_word = word_it.data_relative(1); |
683 | 847 | if (next_word->flag(W_FUZZY_NON)) { |
684 | 264 | next_word->set_flag(W_FUZZY_NON, false); |
685 | 264 | } |
686 | 847 | } |
687 | 6.70k | delete word_it.extract(); |
688 | 6.70k | } |
689 | 211k | } |
690 | 122k | } |
691 | | |
692 | | // Local struct to hold a group of blocks. |
693 | | struct BlockGroup { |
694 | 0 | BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {} |
695 | | explicit BlockGroup(BLOCK *block) |
696 | 7.06k | : bounding_box(block->pdblk.bounding_box()) |
697 | 7.06k | , rotation(block->re_rotation()) |
698 | 7.06k | , angle(block->re_rotation().angle()) |
699 | 7.06k | , min_xheight(block->x_height()) { |
700 | 7.06k | blocks.push_back(block); |
701 | 7.06k | } |
702 | | // Union of block bounding boxes. |
703 | | TBOX bounding_box; |
704 | | // Common rotation of the blocks. |
705 | | FCOORD rotation; |
706 | | // Angle of rotation. |
707 | | float angle; |
708 | | // Min xheight of the blocks. |
709 | | float min_xheight; |
710 | | // Collection of borrowed pointers to the blocks in the group. |
711 | | std::vector<BLOCK *> blocks; |
712 | | }; |
713 | | |
714 | | // Groups blocks by rotation, then, for each group, makes a WordGrid and calls |
715 | | // TransferDiacriticsToWords to copy the diacritic blobs to the most |
716 | | // appropriate words in the group of blocks. Source blobs are not touched. |
717 | 7.72k | void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks) { |
718 | | // Angle difference larger than this is too much to consider equal. |
719 | | // They should only be in multiples of M_PI/2 anyway. |
720 | 7.72k | const double kMaxAngleDiff = 0.01; // About 0.6 degrees. |
721 | 7.72k | std::vector<std::unique_ptr<BlockGroup>> groups; |
722 | 7.72k | BLOCK_IT bk_it(blocks); |
723 | 14.7k | for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) { |
724 | 7.06k | BLOCK *block = bk_it.data(); |
725 | 7.06k | if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { |
726 | 0 | continue; |
727 | 0 | } |
728 | | // Linear search of the groups to find a matching rotation. |
729 | 7.06k | float block_angle = block->re_rotation().angle(); |
730 | 7.06k | int best_g = 0; |
731 | 7.06k | float best_angle_diff = FLT_MAX; |
732 | 7.06k | for (const auto &group : groups) { |
733 | 0 | double angle_diff = std::fabs(block_angle - group->angle); |
734 | 0 | if (angle_diff > M_PI) { |
735 | 0 | angle_diff = fabs(angle_diff - 2.0 * M_PI); |
736 | 0 | } |
737 | 0 | if (angle_diff < best_angle_diff) { |
738 | 0 | best_angle_diff = angle_diff; |
739 | 0 | best_g = &group - &groups[0]; |
740 | 0 | } |
741 | 0 | } |
742 | 7.06k | if (best_angle_diff > kMaxAngleDiff) { |
743 | 7.06k | groups.push_back(std::make_unique<BlockGroup>(block)); |
744 | 7.06k | } else { |
745 | 0 | groups[best_g]->blocks.push_back(block); |
746 | 0 | groups[best_g]->bounding_box += block->pdblk.bounding_box(); |
747 | 0 | float x_height = block->x_height(); |
748 | 0 | if (x_height < groups[best_g]->min_xheight) { |
749 | 0 | groups[best_g]->min_xheight = x_height; |
750 | 0 | } |
751 | 0 | } |
752 | 7.06k | } |
753 | | // Now process each group of blocks. |
754 | 7.72k | std::vector<std::unique_ptr<WordWithBox>> word_ptrs; |
755 | 7.72k | for (const auto &group : groups) { |
756 | 7.06k | if (group->bounding_box.null_box()) { |
757 | 0 | continue; |
758 | 0 | } |
759 | 7.06k | WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), |
760 | 7.06k | group->bounding_box.topright()); |
761 | 7.06k | for (auto b : group->blocks) { |
762 | 7.06k | ROW_IT row_it(b->row_list()); |
763 | 102k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
764 | 95.5k | ROW *row = row_it.data(); |
765 | | // Put the words of the row into the grid. |
766 | 95.5k | WERD_IT w_it(row->word_list()); |
767 | 241k | for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { |
768 | 145k | WERD *word = w_it.data(); |
769 | 145k | auto box_word = std::make_unique<WordWithBox>(word); |
770 | 145k | word_grid.InsertBBox(true, true, box_word.get()); |
771 | | // Save the pointer where it will be auto-deleted. |
772 | 145k | word_ptrs.emplace_back(std::move(box_word)); |
773 | 145k | } |
774 | 95.5k | } |
775 | 7.06k | } |
776 | 7.06k | FCOORD rotation = group->rotation; |
777 | | // Make it a forward rotation that will transform blob coords to block. |
778 | 7.06k | rotation.set_y(-rotation.y()); |
779 | 7.06k | TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid); |
780 | 7.06k | } |
781 | 7.72k | } |
782 | | |
783 | | // Places a copy of blobs that are near a word (after applying rotation to the |
784 | | // blob) in the most appropriate word, unless there is doubt, in which case a |
785 | | // blob can end up in two words. Source blobs are not touched. |
786 | | void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, const FCOORD &rotation, |
787 | 7.06k | WordGrid *word_grid) { |
788 | 7.06k | WordSearch ws(word_grid); |
789 | 7.06k | BLOBNBOX_IT b_it(diacritic_blobs); |
790 | | // Apply rotation to each blob before finding the nearest words. The rotation |
791 | | // allows us to only consider above/below placement and not left/right on |
792 | | // vertical text, because all text is horizontal here. |
793 | 7.06k | for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { |
794 | 0 | BLOBNBOX *blobnbox = b_it.data(); |
795 | 0 | TBOX blob_box = blobnbox->bounding_box(); |
796 | 0 | blob_box.rotate(rotation); |
797 | 0 | ws.StartRectSearch(blob_box); |
798 | | // Above/below refer to word position relative to diacritic. Since some |
799 | | // scripts eg Kannada/Telugu habitually put diacritics below words, and |
800 | | // others eg Thai/Vietnamese/Latin put most diacritics above words, try |
801 | | // for both if there isn't much in it. |
802 | 0 | WordWithBox *best_above_word = nullptr; |
803 | 0 | WordWithBox *best_below_word = nullptr; |
804 | 0 | int best_above_distance = 0; |
805 | 0 | int best_below_distance = 0; |
806 | 0 | for (WordWithBox *word = ws.NextRectSearch(); word != nullptr; word = ws.NextRectSearch()) { |
807 | 0 | if (word->word()->flag(W_REP_CHAR)) { |
808 | 0 | continue; |
809 | 0 | } |
810 | 0 | TBOX word_box = word->true_bounding_box(); |
811 | 0 | int x_distance = blob_box.x_gap(word_box); |
812 | 0 | int y_distance = blob_box.y_gap(word_box); |
813 | 0 | if (x_distance > 0) { |
814 | | // Arbitrarily divide x-distance by 2 if there is a major y overlap, |
815 | | // and the word is to the left of the diacritic. If the |
816 | | // diacritic is a dropped broken character between two words, this will |
817 | | // help send all the pieces to a single word, instead of splitting them |
818 | | // over the 2 words. |
819 | 0 | if (word_box.major_y_overlap(blob_box) && blob_box.left() > word_box.right()) { |
820 | 0 | x_distance /= 2; |
821 | 0 | } |
822 | 0 | y_distance += x_distance; |
823 | 0 | } |
824 | 0 | if (word_box.y_middle() > blob_box.y_middle() && |
825 | 0 | (best_above_word == nullptr || y_distance < best_above_distance)) { |
826 | 0 | best_above_word = word; |
827 | 0 | best_above_distance = y_distance; |
828 | 0 | } |
829 | 0 | if (word_box.y_middle() <= blob_box.y_middle() && |
830 | 0 | (best_below_word == nullptr || y_distance < best_below_distance)) { |
831 | 0 | best_below_word = word; |
832 | 0 | best_below_distance = y_distance; |
833 | 0 | } |
834 | 0 | } |
835 | 0 | bool above_good = best_above_word != nullptr && |
836 | 0 | (best_below_word == nullptr || |
837 | 0 | best_above_distance < best_below_distance + blob_box.height()); |
838 | 0 | bool below_good = best_below_word != nullptr && best_below_word != best_above_word && |
839 | 0 | (best_above_word == nullptr || |
840 | 0 | best_below_distance < best_above_distance + blob_box.height()); |
841 | 0 | if (below_good) { |
842 | 0 | C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); |
843 | 0 | copied_blob->rotate(rotation); |
844 | | // Put the blob into the word's reject blobs list. |
845 | 0 | C_BLOB_IT blob_it(best_below_word->RejBlobs()); |
846 | 0 | blob_it.add_to_end(copied_blob); |
847 | 0 | } |
848 | 0 | if (above_good) { |
849 | 0 | C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); |
850 | 0 | copied_blob->rotate(rotation); |
851 | | // Put the blob into the word's reject blobs list. |
852 | 0 | C_BLOB_IT blob_it(best_above_word->RejBlobs()); |
853 | 0 | blob_it.add_to_end(copied_blob); |
854 | 0 | } |
855 | 0 | } |
856 | 7.06k | } |
857 | | |
858 | | /********************************************************************** |
859 | | * tweak_row_baseline |
860 | | * |
861 | | * Shift baseline to fit the blobs more accurately where they are |
862 | | * close enough. |
863 | | **********************************************************************/ |
864 | | |
865 | 95.5k | void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction) { |
866 | 95.5k | TBOX blob_box; // bounding box |
867 | 95.5k | C_BLOB *blob; // current blob |
868 | 95.5k | WERD *word; // current word |
869 | 95.5k | int32_t blob_count; // no of blobs |
870 | 95.5k | int32_t src_index; // source segment |
871 | 95.5k | int32_t dest_index; // destination segment |
872 | 95.5k | float ydiff; // baseline error |
873 | 95.5k | float x_centre; // centre of blob |
874 | | // words of row |
875 | 95.5k | WERD_IT word_it = row->word_list(); |
876 | 95.5k | C_BLOB_IT blob_it; // blob iterator |
877 | | |
878 | 95.5k | blob_count = 0; |
879 | 241k | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
880 | 145k | word = word_it.data(); // current word |
881 | | // get total blobs |
882 | 145k | blob_count += word->cblob_list()->length(); |
883 | 145k | } |
884 | 95.5k | if (blob_count == 0) { |
885 | 0 | return; |
886 | 0 | } |
887 | | // spline segments |
888 | 95.5k | std::vector<int32_t> xstarts(blob_count + row->baseline.segments + 1); |
889 | | // spline coeffs |
890 | 95.5k | std::vector<double> coeffs((blob_count + row->baseline.segments) * 3); |
891 | | |
892 | 95.5k | src_index = 0; |
893 | 95.5k | dest_index = 0; |
894 | 95.5k | xstarts[0] = row->baseline.xcoords[0]; |
895 | 241k | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
896 | 145k | word = word_it.data(); // current word |
897 | | // blobs in word |
898 | 145k | blob_it.set_to_list(word->cblob_list()); |
899 | 607k | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
900 | 462k | blob = blob_it.data(); |
901 | 462k | blob_box = blob->bounding_box(); |
902 | 462k | x_centre = (blob_box.left() + blob_box.right()) / 2.0; |
903 | 462k | ydiff = blob_box.bottom() - row->base_line(x_centre); |
904 | 462k | if (ydiff < 0) { |
905 | 157k | ydiff = -ydiff / row->x_height(); |
906 | 304k | } else { |
907 | 304k | ydiff = ydiff / row->x_height(); |
908 | 304k | } |
909 | 462k | if (ydiff < blshift_maxshift && blob_box.height() / row->x_height() > blshift_xfraction) { |
910 | 0 | if (xstarts[dest_index] >= x_centre) { |
911 | 0 | xstarts[dest_index] = blob_box.left(); |
912 | 0 | } |
913 | 0 | coeffs[dest_index * 3] = 0; |
914 | 0 | coeffs[dest_index * 3 + 1] = 0; |
915 | 0 | coeffs[dest_index * 3 + 2] = blob_box.bottom(); |
916 | | // shift it |
917 | 0 | dest_index++; |
918 | 0 | xstarts[dest_index] = blob_box.right() + 1; |
919 | 462k | } else { |
920 | 462k | if (xstarts[dest_index] <= x_centre) { |
921 | 269k | while (row->baseline.xcoords[src_index + 1] <= x_centre && |
922 | 269k | src_index < row->baseline.segments - 1) { |
923 | 110k | if (row->baseline.xcoords[src_index + 1] > xstarts[dest_index]) { |
924 | 46.7k | coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; |
925 | 46.7k | coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; |
926 | 46.7k | coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; |
927 | 46.7k | dest_index++; |
928 | 46.7k | xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; |
929 | 46.7k | } |
930 | 110k | src_index++; |
931 | 110k | } |
932 | 159k | coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; |
933 | 159k | coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; |
934 | 159k | coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; |
935 | 159k | dest_index++; |
936 | 159k | xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; |
937 | 159k | } |
938 | 462k | } |
939 | 462k | } |
940 | 145k | } |
941 | 191k | while (src_index < row->baseline.segments && |
942 | 191k | row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) { |
943 | 95.5k | src_index++; |
944 | 95.5k | } |
945 | 179k | while (src_index < row->baseline.segments) { |
946 | 84.4k | coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; |
947 | 84.4k | coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; |
948 | 84.4k | coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; |
949 | 84.4k | dest_index++; |
950 | 84.4k | src_index++; |
951 | 84.4k | xstarts[dest_index] = row->baseline.xcoords[src_index]; |
952 | 84.4k | } |
953 | | // turn to spline |
954 | 95.5k | row->baseline = QSPLINE(dest_index, &xstarts[0], &coeffs[0]); |
955 | 95.5k | } |
956 | | |
957 | | } // namespace tesseract |