/src/tesseract/src/textord/wordseg.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: wordseg.cpp (Formerly wspace.c) |
3 | | * Description: Code to segment the blobs into words. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1992, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | // Include automatically generated configuration file if running autoconf. |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" |
22 | | #endif |
23 | | |
24 | | #include "wordseg.h" |
25 | | |
26 | | #include <cmath> |
27 | | |
28 | | #include "blobbox.h" |
29 | | #include "cjkpitch.h" |
30 | | #include "drawtord.h" |
31 | | #include "fpchop.h" |
32 | | #include "makerow.h" |
33 | | #include "pitsync1.h" |
34 | | #include "statistc.h" |
35 | | #include "textord.h" |
36 | | #include "topitch.h" |
37 | | #include "tovars.h" |
38 | | |
39 | | namespace tesseract { |
40 | | |
41 | | BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows"); |
42 | | BOOL_VAR(textord_chopper_test, false, "Chopper is being tested."); |
43 | | |
44 | 0 | #define BLOCK_STATS_CLUSTERS 10 |
45 | | |
46 | | /** |
47 | | * @name make_single_word |
48 | | * |
49 | | * For each row, arrange the blobs into one word. There is no fixed |
50 | | * pitch detection. |
51 | | */ |
52 | | |
53 | 0 | void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) { |
54 | 0 | TO_ROW_IT to_row_it(rows); |
55 | 0 | ROW_IT row_it(real_rows); |
56 | 0 | for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) { |
57 | 0 | TO_ROW *row = to_row_it.data(); |
58 | | // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready |
59 | | // to create the word. |
60 | 0 | C_BLOB_LIST cblobs; |
61 | 0 | C_BLOB_IT cblob_it(&cblobs); |
62 | 0 | BLOBNBOX_IT box_it(row->blob_list()); |
63 | 0 | for (; !box_it.empty(); box_it.forward()) { |
64 | 0 | BLOBNBOX *bblob = box_it.extract(); |
65 | 0 | if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { |
66 | 0 | auto cblob = bblob->remove_cblob(); |
67 | 0 | if (cblob != nullptr) { |
68 | 0 | C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); |
69 | 0 | cout_it.move_to_last(); |
70 | 0 | cout_it.add_list_after(cblob->out_list()); |
71 | 0 | delete cblob; |
72 | 0 | } |
73 | 0 | } else { |
74 | 0 | auto cblob = bblob->remove_cblob(); |
75 | 0 | if (cblob != nullptr) { |
76 | 0 | cblob_it.add_after_then_move(cblob); |
77 | 0 | } |
78 | 0 | } |
79 | 0 | delete bblob; |
80 | 0 | } |
81 | | // Convert the TO_ROW to a ROW. |
82 | 0 | ROW *real_row = |
83 | 0 | new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size)); |
84 | 0 | WERD_IT word_it(real_row->word_list()); |
85 | 0 | WERD *word = new WERD(&cblobs, 0, nullptr); |
86 | 0 | word->set_flag(W_BOL, true); |
87 | 0 | word->set_flag(W_EOL, true); |
88 | 0 | word->set_flag(W_DONT_CHOP, one_blob); |
89 | 0 | word_it.add_after_then_move(word); |
90 | 0 | real_row->recalc_bounding_box(); |
91 | 0 | row_it.add_after_then_move(real_row); |
92 | 0 | } |
93 | 0 | } |
94 | | |
95 | | /** |
96 | | * make_words |
97 | | * |
98 | | * Arrange the blobs into words. |
99 | | */ |
100 | | void make_words(tesseract::Textord *textord, |
101 | | ICOORD page_tr, // top right |
102 | | float gradient, // page skew |
103 | | BLOCK_LIST *blocks, // block list |
104 | 17.2k | TO_BLOCK_LIST *port_blocks) { // output list |
105 | 17.2k | TO_BLOCK_IT block_it; // iterator |
106 | 17.2k | TO_BLOCK *block; // current block |
107 | | |
108 | 17.2k | if (textord->use_cjk_fp_model()) { |
109 | 0 | compute_fixed_pitch_cjk(page_tr, port_blocks); |
110 | 17.2k | } else { |
111 | 17.2k | compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f), |
112 | 17.2k | !bool(textord_test_landscape)); |
113 | 17.2k | } |
114 | 17.2k | textord->to_spacing(page_tr, port_blocks); |
115 | 17.2k | block_it.set_to_list(port_blocks); |
116 | 34.5k | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
117 | 17.2k | block = block_it.data(); |
118 | 17.2k | make_real_words(textord, block, FCOORD(1.0f, 0.0f)); |
119 | 17.2k | } |
120 | 17.2k | } |
121 | | |
122 | | /** |
123 | | * @name set_row_spaces |
124 | | * |
125 | | * Set the min_space and max_nonspace members of the row so that |
126 | | * the blobs can be arranged into words. |
127 | | */ |
128 | | |
129 | | void set_row_spaces( // find space sizes |
130 | | TO_BLOCK *block, // block to do |
131 | | FCOORD rotation, // for drawing |
132 | | bool testing_on // correct orientation |
133 | 0 | ) { |
134 | 0 | TO_ROW *row; // current row |
135 | 0 | TO_ROW_IT row_it = block->get_rows(); |
136 | |
|
137 | 0 | if (row_it.empty()) { |
138 | 0 | return; // empty block |
139 | 0 | } |
140 | 0 | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
141 | 0 | row = row_it.data(); |
142 | 0 | if (row->fixed_pitch == 0) { |
143 | 0 | row->min_space = static_cast<int32_t>( |
144 | 0 | ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread)); |
145 | 0 | row->max_nonspace = static_cast<int32_t>( |
146 | 0 | floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread)); |
147 | 0 | if (testing_on && textord_show_initial_words) { |
148 | 0 | tprintf("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace, |
149 | 0 | row->min_space, row->intercept()); |
150 | 0 | } |
151 | 0 | row->space_threshold = (row->max_nonspace + row->min_space) / 2; |
152 | 0 | row->space_size = row->pr_space; |
153 | 0 | row->kern_size = row->pr_nonsp; |
154 | 0 | } |
155 | | #ifndef GRAPHICS_DISABLED |
156 | | if (textord_show_initial_words && testing_on) { |
157 | | plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row); |
158 | | } |
159 | | #endif |
160 | 0 | } |
161 | 0 | } |
162 | | |
163 | | /** |
164 | | * @name row_words |
165 | | * |
166 | | * Compute the max nonspace and min space for the row. |
167 | | */ |
168 | | |
169 | | int32_t row_words( // compute space size |
170 | | TO_BLOCK *block, // block it came from |
171 | | TO_ROW *row, // row to operate on |
172 | | int32_t maxwidth, // max expected space size |
173 | | FCOORD rotation, // for drawing |
174 | | bool testing_on // for debug |
175 | 0 | ) { |
176 | 0 | bool testing_row; // contains testpt |
177 | 0 | bool prev_valid; // if decent size |
178 | 0 | int32_t prev_x; // end of prev blob |
179 | 0 | int32_t cluster_count; // no of clusters |
180 | 0 | int32_t gap_index; // which cluster |
181 | 0 | int32_t smooth_factor; // for smoothing stats |
182 | 0 | BLOBNBOX *blob; // current blob |
183 | 0 | float lower, upper; // clustering parameters |
184 | 0 | float gaps[3]; // gap clusers |
185 | 0 | ICOORD testpt; |
186 | 0 | TBOX blob_box; // bounding box |
187 | | // iterator |
188 | 0 | BLOBNBOX_IT blob_it = row->blob_list(); |
189 | 0 | STATS gap_stats(0, maxwidth - 1); |
190 | 0 | STATS cluster_stats[4]; // clusters |
191 | |
|
192 | 0 | testpt = ICOORD(textord_test_x, textord_test_y); |
193 | 0 | smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5); |
194 | | // if (testing_on) |
195 | | // tprintf("Row smooth factor=%d\n",smooth_factor); |
196 | 0 | prev_valid = false; |
197 | 0 | prev_x = -INT32_MAX; |
198 | 0 | testing_row = false; |
199 | 0 | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
200 | 0 | blob = blob_it.data(); |
201 | 0 | blob_box = blob->bounding_box(); |
202 | 0 | if (blob_box.contains(testpt)) { |
203 | 0 | testing_row = true; |
204 | 0 | } |
205 | 0 | gap_stats.add(blob_box.width(), 1); |
206 | 0 | } |
207 | 0 | gap_stats.clear(); |
208 | 0 | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
209 | 0 | blob = blob_it.data(); |
210 | 0 | if (!blob->joined_to_prev()) { |
211 | 0 | blob_box = blob->bounding_box(); |
212 | 0 | if (prev_valid && blob_box.left() - prev_x < maxwidth) { |
213 | 0 | gap_stats.add(blob_box.left() - prev_x, 1); |
214 | 0 | } |
215 | 0 | prev_valid = true; |
216 | 0 | prev_x = blob_box.right(); |
217 | 0 | } |
218 | 0 | } |
219 | 0 | if (gap_stats.get_total() == 0) { |
220 | 0 | row->min_space = 0; // no evidence |
221 | 0 | row->max_nonspace = 0; |
222 | 0 | return 0; |
223 | 0 | } |
224 | 0 | gap_stats.smooth(smooth_factor); |
225 | 0 | lower = row->xheight * textord_words_initial_lower; |
226 | 0 | upper = row->xheight * textord_words_initial_upper; |
227 | 0 | cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats); |
228 | 0 | while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) { |
229 | | // shrink gap |
230 | 0 | upper = (upper * 3 + lower) / 4; |
231 | 0 | lower = (lower * 3 + upper) / 4; |
232 | 0 | cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats); |
233 | 0 | } |
234 | 0 | if (cluster_count < 2) { |
235 | 0 | row->min_space = 0; // no evidence |
236 | 0 | row->max_nonspace = 0; |
237 | 0 | return 0; |
238 | 0 | } |
239 | 0 | for (gap_index = 0; gap_index < cluster_count; gap_index++) { |
240 | 0 | gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5); |
241 | 0 | } |
242 | | // get medians |
243 | 0 | if (cluster_count > 2) { |
244 | 0 | if (testing_on && textord_show_initial_words) { |
245 | 0 | tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(), |
246 | 0 | cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5)); |
247 | 0 | } |
248 | 0 | lower = gaps[0]; |
249 | 0 | if (gaps[1] > lower) { |
250 | 0 | upper = gaps[1]; // prefer most frequent |
251 | 0 | if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) { |
252 | 0 | upper = gaps[2]; |
253 | 0 | } |
254 | 0 | } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) { |
255 | 0 | upper = gaps[2]; |
256 | 0 | } else if (lower >= block->xheight * textord_words_min_minspace) { |
257 | 0 | upper = lower; // not nice |
258 | 0 | lower = gaps[1]; |
259 | 0 | if (testing_on && textord_show_initial_words) { |
260 | 0 | tprintf("Had to switch most common from lower to upper!!\n"); |
261 | 0 | gap_stats.print(); |
262 | 0 | } |
263 | 0 | } else { |
264 | 0 | row->min_space = 0; // no evidence |
265 | 0 | row->max_nonspace = 0; |
266 | 0 | return 0; |
267 | 0 | } |
268 | 0 | } else { |
269 | 0 | if (gaps[1] < gaps[0]) { |
270 | 0 | if (testing_on && textord_show_initial_words) { |
271 | 0 | tprintf("Had to switch most common from lower to upper!!\n"); |
272 | 0 | gap_stats.print(); |
273 | 0 | } |
274 | 0 | lower = gaps[1]; |
275 | 0 | upper = gaps[0]; |
276 | 0 | } else { |
277 | 0 | upper = gaps[1]; |
278 | 0 | lower = gaps[0]; |
279 | 0 | } |
280 | 0 | } |
281 | 0 | if (upper < block->xheight * textord_words_min_minspace) { |
282 | 0 | row->min_space = 0; // no evidence |
283 | 0 | row->max_nonspace = 0; |
284 | 0 | return 0; |
285 | 0 | } |
286 | 0 | if (upper * 3 < block->min_space * 2 + block->max_nonspace || |
287 | 0 | lower * 3 > block->min_space * 2 + block->max_nonspace) { |
288 | 0 | if (testing_on && textord_show_initial_words) { |
289 | 0 | tprintf("Disagreement between block and row at %g!!\n", row->intercept()); |
290 | 0 | tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper); |
291 | 0 | gap_stats.print(); |
292 | 0 | } |
293 | 0 | } |
294 | 0 | row->min_space = |
295 | 0 | static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread)); |
296 | 0 | row->max_nonspace = |
297 | 0 | static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread)); |
298 | 0 | row->space_threshold = (row->max_nonspace + row->min_space) / 2; |
299 | 0 | row->space_size = upper; |
300 | 0 | row->kern_size = lower; |
301 | 0 | if (testing_on && textord_show_initial_words) { |
302 | 0 | if (testing_row) { |
303 | 0 | tprintf("GAP STATS\n"); |
304 | 0 | gap_stats.print(); |
305 | 0 | tprintf("SPACE stats\n"); |
306 | 0 | cluster_stats[2].print_summary(); |
307 | 0 | tprintf("NONSPACE stats\n"); |
308 | 0 | cluster_stats[1].print_summary(); |
309 | 0 | } |
310 | 0 | tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space, |
311 | 0 | upper, row->max_nonspace, lower); |
312 | 0 | } |
313 | 0 | return cluster_stats[2].get_total(); |
314 | 0 | } |
315 | | |
316 | | /** |
317 | | * @name row_words2 |
318 | | * |
319 | | * Compute the max nonspace and min space for the row. |
320 | | */ |
321 | | |
322 | | int32_t row_words2( // compute space size |
323 | | TO_BLOCK *block, // block it came from |
324 | | TO_ROW *row, // row to operate on |
325 | | int32_t maxwidth, // max expected space size |
326 | | FCOORD rotation, // for drawing |
327 | | bool testing_on // for debug |
328 | 0 | ) { |
329 | 0 | bool prev_valid; // if decent size |
330 | 0 | bool this_valid; // current blob big enough |
331 | 0 | int32_t prev_x; // end of prev blob |
332 | 0 | int32_t min_width; // min interesting width |
333 | 0 | int32_t valid_count; // good gaps |
334 | 0 | int32_t total_count; // total gaps |
335 | 0 | int32_t cluster_count; // no of clusters |
336 | 0 | int32_t prev_count; // previous cluster_count |
337 | 0 | int32_t gap_index; // which cluster |
338 | 0 | int32_t smooth_factor; // for smoothing stats |
339 | 0 | BLOBNBOX *blob; // current blob |
340 | 0 | float lower, upper; // clustering parameters |
341 | 0 | ICOORD testpt; |
342 | 0 | TBOX blob_box; // bounding box |
343 | | // iterator |
344 | 0 | BLOBNBOX_IT blob_it = row->blob_list(); |
345 | 0 | STATS gap_stats(0, maxwidth - 1); |
346 | | // gap sizes |
347 | 0 | float gaps[BLOCK_STATS_CLUSTERS]; |
348 | 0 | STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; |
349 | | // clusters |
350 | |
|
351 | 0 | testpt = ICOORD(textord_test_x, textord_test_y); |
352 | 0 | smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5); |
353 | | // if (testing_on) |
354 | | // tprintf("Row smooth factor=%d\n",smooth_factor); |
355 | 0 | prev_valid = false; |
356 | 0 | prev_x = -INT16_MAX; |
357 | 0 | const bool testing_row = false; |
358 | | // min blob size |
359 | 0 | min_width = static_cast<int32_t>(block->pr_space); |
360 | 0 | total_count = 0; |
361 | 0 | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
362 | 0 | blob = blob_it.data(); |
363 | 0 | if (!blob->joined_to_prev()) { |
364 | 0 | blob_box = blob->bounding_box(); |
365 | 0 | this_valid = blob_box.width() >= min_width; |
366 | 0 | if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) { |
367 | 0 | gap_stats.add(blob_box.left() - prev_x, 1); |
368 | 0 | } |
369 | 0 | total_count++; // count possibles |
370 | 0 | prev_x = blob_box.right(); |
371 | 0 | prev_valid = this_valid; |
372 | 0 | } |
373 | 0 | } |
374 | 0 | valid_count = gap_stats.get_total(); |
375 | 0 | if (valid_count < total_count * textord_words_minlarge) { |
376 | 0 | gap_stats.clear(); |
377 | 0 | prev_x = -INT16_MAX; |
378 | 0 | for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { |
379 | 0 | blob = blob_it.data(); |
380 | 0 | if (!blob->joined_to_prev()) { |
381 | 0 | blob_box = blob->bounding_box(); |
382 | 0 | if (blob_box.left() - prev_x < maxwidth) { |
383 | 0 | gap_stats.add(blob_box.left() - prev_x, 1); |
384 | 0 | } |
385 | 0 | prev_x = blob_box.right(); |
386 | 0 | } |
387 | 0 | } |
388 | 0 | } |
389 | 0 | if (gap_stats.get_total() == 0) { |
390 | 0 | row->min_space = 0; // no evidence |
391 | 0 | row->max_nonspace = 0; |
392 | 0 | return 0; |
393 | 0 | } |
394 | | |
395 | 0 | cluster_count = 0; |
396 | 0 | lower = block->xheight * words_initial_lower; |
397 | 0 | upper = block->xheight * words_initial_upper; |
398 | 0 | gap_stats.smooth(smooth_factor); |
399 | 0 | do { |
400 | 0 | prev_count = cluster_count; |
401 | 0 | cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, |
402 | 0 | BLOCK_STATS_CLUSTERS, cluster_stats); |
403 | 0 | } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); |
404 | 0 | if (cluster_count < 1) { |
405 | 0 | row->min_space = 0; |
406 | 0 | row->max_nonspace = 0; |
407 | 0 | return 0; |
408 | 0 | } |
409 | 0 | for (gap_index = 0; gap_index < cluster_count; gap_index++) { |
410 | 0 | gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5); |
411 | 0 | } |
412 | | // get medians |
413 | 0 | if (testing_on) { |
414 | 0 | tprintf("cluster_count=%d:", cluster_count); |
415 | 0 | for (gap_index = 0; gap_index < cluster_count; gap_index++) { |
416 | 0 | tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total()); |
417 | 0 | } |
418 | 0 | tprintf("\n"); |
419 | 0 | } |
420 | | |
421 | | // Try to find proportional non-space and space for row. |
422 | 0 | for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace; |
423 | 0 | gap_index++) { |
424 | 0 | ; |
425 | 0 | } |
426 | 0 | if (gap_index < cluster_count) { |
427 | 0 | lower = gaps[gap_index]; // most frequent below |
428 | 0 | } else { |
429 | 0 | if (testing_on) { |
430 | 0 | tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp); |
431 | 0 | } |
432 | 0 | lower = block->pr_nonsp; |
433 | 0 | } |
434 | 0 | for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace; |
435 | 0 | gap_index++) { |
436 | 0 | ; |
437 | 0 | } |
438 | 0 | if (gap_index < cluster_count) { |
439 | 0 | upper = gaps[gap_index]; // most frequent above |
440 | 0 | } else { |
441 | 0 | if (testing_on) { |
442 | 0 | tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space); |
443 | 0 | } |
444 | 0 | upper = block->pr_space; |
445 | 0 | } |
446 | 0 | row->min_space = |
447 | 0 | static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread)); |
448 | 0 | row->max_nonspace = |
449 | 0 | static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread)); |
450 | 0 | row->space_threshold = (row->max_nonspace + row->min_space) / 2; |
451 | 0 | row->space_size = upper; |
452 | 0 | row->kern_size = lower; |
453 | 0 | if (testing_on) { |
454 | 0 | if (testing_row) { |
455 | 0 | tprintf("GAP STATS\n"); |
456 | 0 | gap_stats.print(); |
457 | 0 | tprintf("SPACE stats\n"); |
458 | 0 | cluster_stats[2].print_summary(); |
459 | 0 | tprintf("NONSPACE stats\n"); |
460 | 0 | cluster_stats[1].print_summary(); |
461 | 0 | } |
462 | 0 | tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space, |
463 | 0 | upper, row->max_nonspace, lower); |
464 | 0 | } |
465 | 0 | return 1; |
466 | 0 | } |
467 | | |
468 | | /** |
469 | | * @name make_real_words |
470 | | * |
471 | | * Convert a TO_BLOCK to a BLOCK. |
472 | | */ |
473 | | |
474 | | void make_real_words(tesseract::Textord *textord, |
475 | | TO_BLOCK *block, // block to do |
476 | | FCOORD rotation // for drawing |
477 | 17.2k | ) { |
478 | 17.2k | TO_ROW *row; // current row |
479 | 17.2k | TO_ROW_IT row_it = block->get_rows(); |
480 | 17.2k | ROW *real_row = nullptr; // output row |
481 | 17.2k | ROW_IT real_row_it = block->block->row_list(); |
482 | | |
483 | 17.2k | if (row_it.empty()) { |
484 | 519 | return; // empty block |
485 | 519 | } |
486 | 202k | for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { |
487 | 185k | row = row_it.data(); |
488 | 185k | if (row->blob_list()->empty() && !row->rep_words.empty()) { |
489 | 0 | real_row = make_rep_words(row, block); |
490 | 185k | } else if (!row->blob_list()->empty()) { |
491 | | // In a fixed pitch document, some lines may be detected as fixed pitch |
492 | | // while others don't, and will go through different path. |
493 | | // For non-space delimited language like CJK, fixed pitch chop always |
494 | | // leave the entire line as one word. We can force consistent chopping |
495 | | // with force_make_prop_words flag. |
496 | 185k | POLY_BLOCK *pb = block->block->pdblk.poly_block(); |
497 | 185k | if (textord_chopper_test) { |
498 | 0 | real_row = textord->make_blob_words(row, rotation); |
499 | 185k | } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) || |
500 | 185k | row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) { |
501 | 180k | real_row = textord->make_prop_words(row, rotation); |
502 | 180k | } else if (row->pitch_decision == PITCH_DEF_FIXED || |
503 | 5.56k | row->pitch_decision == PITCH_CORR_FIXED) { |
504 | 5.56k | real_row = fixed_pitch_words(row, rotation); |
505 | 5.56k | } else { |
506 | 0 | ASSERT_HOST(false); |
507 | 0 | } |
508 | 185k | } |
509 | 185k | if (real_row != nullptr) { |
510 | | // put row in block |
511 | 185k | real_row_it.add_after_then_move(real_row); |
512 | 185k | } |
513 | 185k | } |
514 | 16.7k | block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size), |
515 | 16.7k | static_cast<int16_t>(block->space_size), |
516 | 16.7k | static_cast<int16_t>(block->fixed_pitch)); |
517 | 16.7k | block->block->check_pitch(); |
518 | 16.7k | } |
519 | | |
520 | | /** |
521 | | * @name make_rep_words |
522 | | * |
523 | | * Fabricate a real row from only the repeated blob words. |
524 | | * Get the xheight from the block as it may be more meaningful. |
525 | | */ |
526 | | |
527 | | ROW *make_rep_words( // make a row |
528 | | TO_ROW *row, // row to convert |
529 | | TO_BLOCK *block // block it lives in |
530 | 0 | ) { |
531 | 0 | ROW *real_row; // output row |
532 | 0 | TBOX word_box; // bounding box |
533 | | // iterator |
534 | 0 | WERD_IT word_it = &row->rep_words; |
535 | |
|
536 | 0 | if (word_it.empty()) { |
537 | 0 | return nullptr; |
538 | 0 | } |
539 | 0 | word_box = word_it.data()->bounding_box(); |
540 | 0 | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
541 | 0 | word_box += word_it.data()->bounding_box(); |
542 | 0 | } |
543 | 0 | row->xheight = block->xheight; |
544 | 0 | real_row = |
545 | 0 | new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size)); |
546 | 0 | word_it.set_to_list(real_row->word_list()); |
547 | | // put words in row |
548 | 0 | word_it.add_list_after(&row->rep_words); |
549 | 0 | real_row->recalc_bounding_box(); |
550 | 0 | return real_row; |
551 | 0 | } |
552 | | |
553 | | /** |
554 | | * @name make_real_word |
555 | | * |
556 | | * Construct a WERD from a given number of adjacent entries in a |
557 | | * list of BLOBNBOXs. |
558 | | */ |
559 | | |
560 | | WERD *make_real_word(BLOBNBOX_IT *box_it, // iterator |
561 | | int32_t blobcount, // no of blobs to use |
562 | | bool bol, // start of line |
563 | | uint8_t blanks // no of blanks |
564 | 0 | ) { |
565 | 0 | C_OUTLINE_IT cout_it; |
566 | 0 | C_BLOB_LIST cblobs; |
567 | 0 | C_BLOB_IT cblob_it = &cblobs; |
568 | |
|
569 | 0 | for (int blobindex = 0; blobindex < blobcount; blobindex++) { |
570 | 0 | auto bblob = box_it->extract(); |
571 | 0 | if (bblob->joined_to_prev()) { |
572 | 0 | auto cblob = bblob->remove_cblob(); |
573 | 0 | if (cblob != nullptr) { |
574 | 0 | cout_it.set_to_list(cblob_it.data()->out_list()); |
575 | 0 | cout_it.move_to_last(); |
576 | 0 | cout_it.add_list_after(cblob->out_list()); |
577 | 0 | delete cblob; |
578 | 0 | } |
579 | 0 | } else { |
580 | 0 | auto cblob = bblob->remove_cblob(); |
581 | 0 | if (cblob != nullptr) { |
582 | 0 | cblob_it.add_after_then_move(cblob); |
583 | 0 | } |
584 | 0 | } |
585 | 0 | delete bblob; |
586 | 0 | box_it->forward(); // next one |
587 | 0 | } |
588 | |
|
589 | 0 | if (blanks < 1) { |
590 | 0 | blanks = 1; |
591 | 0 | } |
592 | |
|
593 | 0 | auto word = new WERD(&cblobs, blanks, nullptr); |
594 | |
|
595 | 0 | if (bol) { |
596 | 0 | word->set_flag(W_BOL, true); |
597 | 0 | } |
598 | 0 | if (box_it->at_first()) { |
599 | 0 | word->set_flag(W_EOL, true); // at end of line |
600 | 0 | } |
601 | |
|
602 | 0 | return word; |
603 | 0 | } |
604 | | |
605 | | } // namespace tesseract |