/src/tesseract/src/ccmain/fixspace.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************** |
2 | | * File: fixspace.cpp (Formerly fixspace.c) |
3 | | * Description: Implements a pass over the page res, exploring the alternative |
4 | | * spacing possibilities, trying to use context to improve the |
5 | | * word spacing |
6 | | * Author: Phil Cheatle |
7 | | * |
8 | | * (C) Copyright 1993, Hewlett-Packard Ltd. |
9 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
10 | | ** you may not use this file except in compliance with the License. |
11 | | ** You may obtain a copy of the License at |
12 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
13 | | ** Unless required by applicable law or agreed to in writing, software |
14 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
15 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | | ** See the License for the specific language governing permissions and |
17 | | ** limitations under the License. |
18 | | * |
19 | | **********************************************************************/ |
20 | | |
21 | | #include "fixspace.h" |
22 | | |
23 | | #include "blobs.h" // for TWERD, TBLOB, TESSLINE |
24 | | #include "boxword.h" // for BoxWord |
25 | | #include "errcode.h" // for ASSERT_HOST |
26 | | #include "normalis.h" // for kBlnXHeight, kBlnBaselineOffset |
27 | | #include "pageres.h" // for WERD_RES_IT, WERD_RES, WERD_RES_LIST |
28 | | #include "params.h" // for IntParam, StringParam, BoolParam, DoubleParam, ... |
29 | | #include "ratngs.h" // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM |
30 | | #include "rect.h" // for TBOX |
31 | | #include "stepblob.h" // for C_BLOB_IT, C_BLOB_LIST, C_BLOB |
32 | | #include "tesseractclass.h" // for Tesseract, TesseractStats, WordData |
33 | | #include "tessvars.h" // for debug_fp |
34 | | #include "tprintf.h" // for tprintf |
35 | | #include "unicharset.h" // for UNICHARSET |
36 | | #include "werd.h" // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP |
37 | | |
38 | | #include <tesseract/ocrclass.h> // for ETEXT_DESC |
39 | | #include <tesseract/unichar.h> // for UNICHAR_ID |
40 | | |
41 | | #include <cstdint> // for INT16_MAX, int16_t, int32_t |
42 | | |
43 | | namespace tesseract { |
44 | | |
45 | | class BLOCK; |
46 | | class ROW; |
47 | | |
48 | 0 | #define PERFECT_WERDS 999 |
49 | | |
50 | | /********************************************************************** |
51 | | * c_blob_comparator() |
52 | | * |
53 | | * Blob comparator used to sort a blob list so that blobs are in increasing |
54 | | * order of left edge. |
55 | | **********************************************************************/ |
56 | | |
57 | | static int c_blob_comparator( // sort blobs |
58 | | const C_BLOB *blob1, |
59 | | const C_BLOB *blob2 |
60 | 0 | ) { |
61 | 0 | return blob1->bounding_box().left() - blob2->bounding_box().left(); |
62 | 0 | } |
63 | | |
64 | | /** |
65 | | * @name fix_fuzzy_spaces() |
66 | | * Walk over the page finding sequences of words joined by fuzzy spaces. Extract |
67 | | * them as a sublist, process the sublist to find the optimal arrangement of |
68 | | * spaces then replace the sublist in the ROW_RES. |
69 | | * |
70 | | * @param monitor progress monitor |
71 | | * @param word_count count of words in doc |
72 | | * @param[out] page_res |
73 | | */ |
74 | 0 | void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) { |
75 | 0 | BLOCK_RES_IT block_res_it; |
76 | 0 | ROW_RES_IT row_res_it; |
77 | 0 | WERD_RES_IT word_res_it_from; |
78 | 0 | WERD_RES_IT word_res_it_to; |
79 | 0 | WERD_RES *word_res; |
80 | 0 | WERD_RES_LIST fuzzy_space_words; |
81 | 0 | int16_t new_length; |
82 | 0 | bool prevent_null_wd_fixsp; // DON'T process blobless wds |
83 | 0 | int32_t word_index; // current word |
84 | |
|
85 | 0 | block_res_it.set_to_list(&page_res->block_res_list); |
86 | 0 | word_index = 0; |
87 | 0 | for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) { |
88 | 0 | row_res_it.set_to_list(&block_res_it.data()->row_res_list); |
89 | 0 | for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) { |
90 | 0 | word_res_it_from.set_to_list(&row_res_it.data()->word_res_list); |
91 | 0 | while (!word_res_it_from.at_last()) { |
92 | 0 | word_res = word_res_it_from.data(); |
93 | 0 | while (!word_res_it_from.at_last() && |
94 | 0 | !(word_res->combination || |
95 | 0 | word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) || |
96 | 0 | word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) { |
97 | 0 | fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block); |
98 | 0 | word_res = word_res_it_from.forward(); |
99 | 0 | word_index++; |
100 | 0 | if (monitor != nullptr) { |
101 | 0 | monitor->ocr_alive = true; |
102 | 0 | monitor->progress = 90 + 5 * word_index / word_count; |
103 | 0 | if (monitor->deadline_exceeded() || |
104 | 0 | (monitor->cancel != nullptr && |
105 | 0 | (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) { |
106 | 0 | return; |
107 | 0 | } |
108 | 0 | } |
109 | 0 | } |
110 | | |
111 | 0 | if (!word_res_it_from.at_last()) { |
112 | 0 | word_res_it_to = word_res_it_from; |
113 | 0 | prevent_null_wd_fixsp = word_res->word->cblob_list()->empty(); |
114 | 0 | if (check_debug_pt(word_res, 60)) { |
115 | 0 | debug_fix_space_level.set_value(10); |
116 | 0 | } |
117 | 0 | word_res_it_to.forward(); |
118 | 0 | word_index++; |
119 | 0 | if (monitor != nullptr) { |
120 | 0 | monitor->ocr_alive = true; |
121 | 0 | monitor->progress = 90 + 5 * word_index / word_count; |
122 | 0 | if (monitor->deadline_exceeded() || |
123 | 0 | (monitor->cancel != nullptr && |
124 | 0 | (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) { |
125 | 0 | return; |
126 | 0 | } |
127 | 0 | } |
128 | 0 | while (!word_res_it_to.at_last() && |
129 | 0 | (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) || |
130 | 0 | word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) { |
131 | 0 | if (check_debug_pt(word_res, 60)) { |
132 | 0 | debug_fix_space_level.set_value(10); |
133 | 0 | } |
134 | 0 | if (word_res->word->cblob_list()->empty()) { |
135 | 0 | prevent_null_wd_fixsp = true; |
136 | 0 | } |
137 | 0 | word_res = word_res_it_to.forward(); |
138 | 0 | } |
139 | 0 | if (check_debug_pt(word_res, 60)) { |
140 | 0 | debug_fix_space_level.set_value(10); |
141 | 0 | } |
142 | 0 | if (word_res->word->cblob_list()->empty()) { |
143 | 0 | prevent_null_wd_fixsp = true; |
144 | 0 | } |
145 | 0 | if (prevent_null_wd_fixsp) { |
146 | 0 | word_res_it_from = word_res_it_to; |
147 | 0 | } else { |
148 | 0 | fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to); |
149 | 0 | fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row, |
150 | 0 | block_res_it.data()->block); |
151 | 0 | new_length = fuzzy_space_words.length(); |
152 | 0 | word_res_it_from.add_list_before(&fuzzy_space_words); |
153 | 0 | for (; !word_res_it_from.at_last() && new_length > 0; new_length--) { |
154 | 0 | word_res_it_from.forward(); |
155 | 0 | } |
156 | 0 | } |
157 | 0 | if (test_pt) { |
158 | 0 | debug_fix_space_level.set_value(0); |
159 | 0 | } |
160 | 0 | } |
161 | 0 | fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block); |
162 | | // Last word in row |
163 | 0 | } |
164 | 0 | } |
165 | 0 | } |
166 | 0 | } |
167 | | |
168 | 0 | void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) { |
169 | 0 | int16_t best_score; |
170 | 0 | WERD_RES_LIST current_perm; |
171 | 0 | bool improved = false; |
172 | |
|
173 | 0 | best_score = eval_word_spacing(best_perm); // default score |
174 | 0 | dump_words(best_perm, best_score, 1, improved); |
175 | |
|
176 | 0 | if (best_score != PERFECT_WERDS) { |
177 | 0 | initialise_search(best_perm, current_perm); |
178 | 0 | } |
179 | |
|
180 | 0 | while ((best_score != PERFECT_WERDS) && !current_perm.empty()) { |
181 | 0 | match_current_words(current_perm, row, block); |
182 | 0 | int16_t current_score = eval_word_spacing(current_perm); |
183 | 0 | dump_words(current_perm, current_score, 2, improved); |
184 | 0 | if (current_score > best_score) { |
185 | 0 | best_perm.clear(); |
186 | 0 | best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy); |
187 | 0 | best_score = current_score; |
188 | 0 | improved = true; |
189 | 0 | } |
190 | 0 | if (current_score < PERFECT_WERDS) { |
191 | 0 | transform_to_next_perm(current_perm); |
192 | 0 | } |
193 | 0 | } |
194 | 0 | dump_words(best_perm, best_score, 3, improved); |
195 | 0 | } |
196 | | |
197 | 0 | void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) { |
198 | 0 | WERD_RES_IT src_it(&src_list); |
199 | 0 | WERD_RES_IT new_it(&new_list); |
200 | 0 | WERD_RES *new_wd; |
201 | |
|
202 | 0 | for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { |
203 | 0 | WERD_RES *src_wd = src_it.data(); |
204 | 0 | if (!src_wd->combination) { |
205 | 0 | new_wd = WERD_RES::deep_copy(src_wd); |
206 | 0 | new_wd->combination = false; |
207 | 0 | new_wd->part_of_combo = false; |
208 | 0 | new_it.add_after_then_move(new_wd); |
209 | 0 | } |
210 | 0 | } |
211 | 0 | } |
212 | | |
213 | 0 | void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) { |
214 | 0 | WERD_RES_IT word_it(&words); |
215 | 0 | WERD_RES *word; |
216 | | // Since we are not using PAGE_RES to iterate over words, we need to update |
217 | | // prev_word_best_choice_ before calling classify_word_pass2(). |
218 | 0 | prev_word_best_choice_ = nullptr; |
219 | 0 | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
220 | 0 | word = word_it.data(); |
221 | 0 | if ((!word->part_of_combo) && (word->box_word == nullptr)) { |
222 | 0 | WordData word_data(block, row, word); |
223 | 0 | SetupWordPassN(2, &word_data); |
224 | 0 | classify_word_and_language(2, nullptr, &word_data); |
225 | 0 | } |
226 | 0 | prev_word_best_choice_ = word->best_choice; |
227 | 0 | } |
228 | 0 | } |
229 | | |
230 | | /** |
231 | | * @name eval_word_spacing() |
232 | | * The basic measure is the number of characters in contextually confirmed |
233 | | * words. (I.e the word is done) |
234 | | * If all words are contextually confirmed the evaluation is deemed perfect. |
235 | | * |
236 | | * Some fiddles are done to handle "1"s as these are VERY frequent causes of |
237 | | * fuzzy spaces. The problem with the basic measure is that "561 63" would score |
238 | | * the same as "56163", though given our knowledge that the space is fuzzy, and |
239 | | * that there is a "1" next to the fuzzy space, we need to ensure that "56163" |
240 | | * is preferred. |
241 | | * |
242 | | * The solution is to NOT COUNT the score of any word which has a digit at one |
243 | | * end and a "1Il" as the character the other side of the space. |
244 | | * |
245 | | * Conversely, any character next to a "1" within a word is counted as a |
246 | | * positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 |
247 | | * side of the "1" joined). "56163" would score 7 - all chars in a numeric word |
248 | | * + 2 sides of a "1" joined. |
249 | | * |
250 | | * The joined 1 rule is applied to any word REGARDLESS of contextual |
251 | | * confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contexutally |
252 | | * confirmed. The only score is from the joined 1. "PS7a713/7a" scores 2. |
253 | | * |
254 | | */ |
255 | 0 | int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) { |
256 | 0 | WERD_RES_IT word_res_it(&word_res_list); |
257 | 0 | int16_t total_score = 0; |
258 | 0 | int16_t word_count = 0; |
259 | 0 | int16_t done_word_count = 0; |
260 | 0 | int i; |
261 | 0 | int16_t offset; |
262 | 0 | int16_t prev_word_score = 0; |
263 | 0 | bool prev_word_done = false; |
264 | 0 | bool prev_char_1 = false; // prev ch a "1/I/l"? |
265 | 0 | bool prev_char_digit = false; // prev ch 2..9 or 0 |
266 | 0 | const char *punct_chars = "!\"`',.:;"; |
267 | 0 | do { |
268 | | // current word |
269 | 0 | WERD_RES *word = word_res_it.data(); |
270 | 0 | bool word_done = fixspace_thinks_word_done(word); |
271 | 0 | word_count++; |
272 | 0 | if (word->tess_failed) { |
273 | 0 | total_score += prev_word_score; |
274 | 0 | if (prev_word_done) { |
275 | 0 | done_word_count++; |
276 | 0 | } |
277 | 0 | prev_word_score = 0; |
278 | 0 | prev_char_1 = false; |
279 | 0 | prev_char_digit = false; |
280 | 0 | prev_word_done = false; |
281 | 0 | } else { |
282 | | /* |
283 | | Can we add the prev word score and potentially count this word? |
284 | | Yes IF it didn't end in a 1 when the first char of this word is a digit |
285 | | AND it didn't end in a digit when the first char of this word is a 1 |
286 | | */ |
287 | 0 | auto word_len = word->reject_map.length(); |
288 | 0 | bool current_word_ok_so_far = false; |
289 | 0 | if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) || |
290 | 0 | (prev_char_digit && |
291 | 0 | ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 && |
292 | 0 | word->best_choice->unichar_string()[0] == '1') || |
293 | 0 | (!word_done && |
294 | 0 | conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) { |
295 | 0 | total_score += prev_word_score; |
296 | 0 | if (prev_word_done) { |
297 | 0 | done_word_count++; |
298 | 0 | } |
299 | 0 | current_word_ok_so_far = word_done; |
300 | 0 | } |
301 | |
|
302 | 0 | if (current_word_ok_so_far) { |
303 | 0 | prev_word_done = true; |
304 | 0 | prev_word_score = word_len; |
305 | 0 | } else { |
306 | 0 | prev_word_done = false; |
307 | 0 | prev_word_score = 0; |
308 | 0 | } |
309 | | |
310 | | /* Add 1 to total score for every joined 1 regardless of context and |
311 | | rejtn */ |
312 | 0 | for (i = 0, prev_char_1 = false; i < word_len; i++) { |
313 | 0 | bool current_char_1 = word->best_choice->unichar_string()[i] == '1'; |
314 | 0 | if (prev_char_1 || (current_char_1 && (i > 0))) { |
315 | 0 | total_score++; |
316 | 0 | } |
317 | 0 | prev_char_1 = current_char_1; |
318 | 0 | } |
319 | | |
320 | | /* Add 1 to total score for every joined punctuation regardless of context |
321 | | and rejtn */ |
322 | 0 | if (tessedit_prefer_joined_punct) { |
323 | 0 | bool prev_char_punct; |
324 | 0 | for (i = 0, offset = 0, prev_char_punct = false; i < word_len; |
325 | 0 | offset += word->best_choice->unichar_lengths()[i++]) { |
326 | 0 | bool current_char_punct = |
327 | 0 | strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr; |
328 | 0 | if (prev_char_punct || (current_char_punct && i > 0)) { |
329 | 0 | total_score++; |
330 | 0 | } |
331 | 0 | prev_char_punct = current_char_punct; |
332 | 0 | } |
333 | 0 | } |
334 | 0 | prev_char_digit = digit_or_numeric_punct(word, word_len - 1); |
335 | 0 | for (i = 0, offset = 0; i < word_len - 1; |
336 | 0 | offset += word->best_choice->unichar_lengths()[i++]) { |
337 | 0 | ; |
338 | 0 | } |
339 | 0 | prev_char_1 = |
340 | 0 | ((word_done && (word->best_choice->unichar_string()[offset] == '1')) || |
341 | 0 | (!word_done && |
342 | 0 | conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset]))); |
343 | 0 | } |
344 | | /* Find next word */ |
345 | 0 | do { |
346 | 0 | word_res_it.forward(); |
347 | 0 | } while (word_res_it.data()->part_of_combo); |
348 | 0 | } while (!word_res_it.at_first()); |
349 | 0 | total_score += prev_word_score; |
350 | 0 | if (prev_word_done) { |
351 | 0 | done_word_count++; |
352 | 0 | } |
353 | 0 | if (done_word_count == word_count) { |
354 | 0 | return PERFECT_WERDS; |
355 | 0 | } else { |
356 | 0 | return total_score; |
357 | 0 | } |
358 | 0 | } |
359 | | |
360 | 0 | bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) { |
361 | 0 | int i; |
362 | 0 | int offset; |
363 | |
|
364 | 0 | for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++]) { |
365 | 0 | ; |
366 | 0 | } |
367 | 0 | return ( |
368 | 0 | word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset, |
369 | 0 | word->best_choice->unichar_lengths()[i]) || |
370 | 0 | (word->best_choice->permuter() == NUMBER_PERM && |
371 | 0 | numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset]))); |
372 | 0 | } |
373 | | |
374 | | /** |
375 | | * @name transform_to_next_perm() |
376 | | * Examines the current word list to find the smallest word gap size. Then walks |
377 | | * the word list closing any gaps of this size by either inserted new |
378 | | * combination words, or extending existing ones. |
379 | | * |
380 | | * The routine COULD be limited to stop it building words longer than N blobs. |
381 | | * |
382 | | * If there are no more gaps then it DELETES the entire list and returns the |
383 | | * empty list to cause termination. |
384 | | */ |
385 | 0 | void transform_to_next_perm(WERD_RES_LIST &words) { |
386 | 0 | WERD_RES_IT word_it(&words); |
387 | 0 | WERD_RES_IT prev_word_it(&words); |
388 | 0 | WERD_RES *word; |
389 | 0 | WERD_RES *prev_word; |
390 | 0 | int16_t prev_right = -INT16_MAX; |
391 | 0 | TBOX box; |
392 | 0 | int16_t gap; |
393 | 0 | int16_t min_gap = INT16_MAX; |
394 | |
|
395 | 0 | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
396 | 0 | word = word_it.data(); |
397 | 0 | if (!word->part_of_combo) { |
398 | 0 | box = word->word->bounding_box(); |
399 | 0 | if (prev_right > -INT16_MAX) { |
400 | 0 | gap = box.left() - prev_right; |
401 | 0 | if (gap < min_gap) { |
402 | 0 | min_gap = gap; |
403 | 0 | } |
404 | 0 | } |
405 | 0 | prev_right = box.right(); |
406 | 0 | } |
407 | 0 | } |
408 | 0 | if (min_gap < INT16_MAX) { |
409 | 0 | prev_right = -INT16_MAX; // back to start |
410 | 0 | word_it.set_to_list(&words); |
411 | | // Note: we can't use cycle_pt due to inserted combos at start of list. |
412 | 0 | for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) { |
413 | 0 | word = word_it.data(); |
414 | 0 | if (!word->part_of_combo) { |
415 | 0 | box = word->word->bounding_box(); |
416 | 0 | if (prev_right > -INT16_MAX) { |
417 | 0 | gap = box.left() - prev_right; |
418 | 0 | if (gap <= min_gap) { |
419 | 0 | prev_word = prev_word_it.data(); |
420 | 0 | WERD_RES *combo; |
421 | 0 | if (prev_word->combination) { |
422 | 0 | combo = prev_word; |
423 | 0 | } else { |
424 | | /* Make a new combination and insert before |
425 | | * the first word being joined. */ |
426 | 0 | auto *copy_word = new WERD; |
427 | 0 | *copy_word = *(prev_word->word); |
428 | | // deep copy |
429 | 0 | combo = new WERD_RES(copy_word); |
430 | 0 | combo->combination = true; |
431 | 0 | combo->x_height = prev_word->x_height; |
432 | 0 | prev_word->part_of_combo = true; |
433 | 0 | prev_word_it.add_before_then_move(combo); |
434 | 0 | } |
435 | 0 | combo->word->set_flag(W_EOL, word->word->flag(W_EOL)); |
436 | 0 | if (word->combination) { |
437 | 0 | combo->word->join_on(word->word); |
438 | | // Move blobs to combo |
439 | | // old combo no longer needed |
440 | 0 | delete word_it.extract(); |
441 | 0 | } else { |
442 | | // Copy current wd to combo |
443 | 0 | combo->copy_on(word); |
444 | 0 | word->part_of_combo = true; |
445 | 0 | } |
446 | 0 | combo->done = false; |
447 | 0 | combo->ClearResults(); |
448 | 0 | } else { |
449 | 0 | prev_word_it = word_it; // catch up |
450 | 0 | } |
451 | 0 | } |
452 | 0 | prev_right = box.right(); |
453 | 0 | } |
454 | 0 | } |
455 | 0 | } else { |
456 | 0 | words.clear(); // signal termination |
457 | 0 | } |
458 | 0 | } |
459 | | |
460 | 0 | void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) { |
461 | 0 | WERD_RES_IT word_res_it(&perm); |
462 | |
|
463 | 0 | if (debug_fix_space_level > 0) { |
464 | 0 | if (mode == 1) { |
465 | 0 | stats_.dump_words_str = ""; |
466 | 0 | for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) { |
467 | 0 | if (!word_res_it.data()->part_of_combo) { |
468 | 0 | stats_.dump_words_str += word_res_it.data()->best_choice->unichar_string(); |
469 | 0 | stats_.dump_words_str += ' '; |
470 | 0 | } |
471 | 0 | } |
472 | 0 | } |
473 | |
|
474 | 0 | if (debug_fix_space_level > 1) { |
475 | 0 | switch (mode) { |
476 | 0 | case 1: |
477 | 0 | tprintf("EXTRACTED (%d): \"", score); |
478 | 0 | break; |
479 | 0 | case 2: |
480 | 0 | tprintf("TESTED (%d): \"", score); |
481 | 0 | break; |
482 | 0 | case 3: |
483 | 0 | tprintf("RETURNED (%d): \"", score); |
484 | 0 | break; |
485 | 0 | } |
486 | | |
487 | 0 | for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) { |
488 | 0 | if (!word_res_it.data()->part_of_combo) { |
489 | 0 | tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(), |
490 | 0 | static_cast<int>(word_res_it.data()->best_choice->permuter())); |
491 | 0 | } |
492 | 0 | } |
493 | 0 | tprintf("\"\n"); |
494 | 0 | } else if (improved) { |
495 | 0 | tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str()); |
496 | 0 | for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) { |
497 | 0 | if (!word_res_it.data()->part_of_combo) { |
498 | 0 | tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(), |
499 | 0 | static_cast<int>(word_res_it.data()->best_choice->permuter())); |
500 | 0 | } |
501 | 0 | } |
502 | 0 | tprintf("\"\n"); |
503 | 0 | } |
504 | 0 | } |
505 | 0 | } |
506 | | |
507 | 0 | bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) { |
508 | 0 | if (word->done) { |
509 | 0 | return true; |
510 | 0 | } |
511 | | |
512 | | /* |
513 | | Use all the standard pass 2 conditions for mode 5 in set_done() in |
514 | | reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T |
515 | | CARE WHETHER WE HAVE of/at on/an etc. |
516 | | */ |
517 | 0 | if (fixsp_done_mode > 0 && |
518 | 0 | (word->tess_accepted || (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) || |
519 | 0 | fixsp_done_mode == 3) && |
520 | 0 | (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) && |
521 | 0 | ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) || |
522 | 0 | (word->best_choice->permuter() == FREQ_DAWG_PERM) || |
523 | 0 | (word->best_choice->permuter() == USER_DAWG_PERM) || |
524 | 0 | (word->best_choice->permuter() == NUMBER_PERM))) { |
525 | 0 | return true; |
526 | 0 | } else { |
527 | 0 | return false; |
528 | 0 | } |
529 | 0 | } |
530 | | |
531 | | /** |
532 | | * @name fix_sp_fp_word() |
533 | | * Test the current word to see if it can be split by deleting noise blobs. If |
534 | | * so, do the business. |
535 | | * Return with the iterator pointing to the same place if the word is unchanged, |
536 | | * or the last of the replacement words. |
537 | | */ |
538 | 0 | void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) { |
539 | 0 | WERD_RES *word_res; |
540 | 0 | WERD_RES_LIST sub_word_list; |
541 | 0 | WERD_RES_IT sub_word_list_it(&sub_word_list); |
542 | 0 | int16_t new_length; |
543 | 0 | float junk; |
544 | |
|
545 | 0 | word_res = word_res_it.data(); |
546 | 0 | if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo || |
547 | 0 | !word_res->word->flag(W_DONT_CHOP)) { |
548 | 0 | return; |
549 | 0 | } |
550 | | |
551 | 0 | auto blob_index = worst_noise_blob(word_res, &junk); |
552 | 0 | if (blob_index < 0) { |
553 | 0 | return; |
554 | 0 | } |
555 | | |
556 | 0 | if (debug_fix_space_level > 1) { |
557 | 0 | tprintf("FP fixspace working on \"%s\"\n", word_res->best_choice->unichar_string().c_str()); |
558 | 0 | } |
559 | 0 | word_res->word->rej_cblob_list()->sort(c_blob_comparator); |
560 | 0 | sub_word_list_it.add_after_stay_put(word_res_it.extract()); |
561 | 0 | fix_noisy_space_list(sub_word_list, row, block); |
562 | 0 | new_length = sub_word_list.length(); |
563 | 0 | word_res_it.add_list_before(&sub_word_list); |
564 | 0 | for (; !word_res_it.at_last() && new_length > 1; new_length--) { |
565 | 0 | word_res_it.forward(); |
566 | 0 | } |
567 | 0 | } |
568 | | |
569 | 0 | void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) { |
570 | 0 | int16_t best_score; |
571 | 0 | WERD_RES_IT best_perm_it(&best_perm); |
572 | 0 | WERD_RES_LIST current_perm; |
573 | 0 | WERD_RES_IT current_perm_it(¤t_perm); |
574 | 0 | WERD_RES *old_word_res; |
575 | 0 | int16_t current_score; |
576 | 0 | bool improved = false; |
577 | |
|
578 | 0 | best_score = fp_eval_word_spacing(best_perm); // default score |
579 | |
|
580 | 0 | dump_words(best_perm, best_score, 1, improved); |
581 | |
|
582 | 0 | old_word_res = best_perm_it.data(); |
583 | | // Even deep_copy doesn't copy the underlying WERD unless its combination |
584 | | // flag is true!. |
585 | 0 | old_word_res->combination = true; // Kludge to force deep copy |
586 | 0 | current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res)); |
587 | 0 | old_word_res->combination = false; // Undo kludge |
588 | |
|
589 | 0 | break_noisiest_blob_word(current_perm); |
590 | |
|
591 | 0 | while (best_score != PERFECT_WERDS && !current_perm.empty()) { |
592 | 0 | match_current_words(current_perm, row, block); |
593 | 0 | current_score = fp_eval_word_spacing(current_perm); |
594 | 0 | dump_words(current_perm, current_score, 2, improved); |
595 | 0 | if (current_score > best_score) { |
596 | 0 | best_perm.clear(); |
597 | 0 | best_perm.deep_copy(¤t_perm, &WERD_RES::deep_copy); |
598 | 0 | best_score = current_score; |
599 | 0 | improved = true; |
600 | 0 | } |
601 | 0 | if (current_score < PERFECT_WERDS) { |
602 | 0 | break_noisiest_blob_word(current_perm); |
603 | 0 | } |
604 | 0 | } |
605 | 0 | dump_words(best_perm, best_score, 3, improved); |
606 | 0 | } |
607 | | |
608 | | /** |
609 | | * break_noisiest_blob_word() |
610 | | * Find the word with the blob which looks like the worst noise. |
611 | | * Break the word into two, deleting the noise blob. |
612 | | */ |
613 | 0 | void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) { |
614 | 0 | WERD_RES_IT word_it(&words); |
615 | 0 | WERD_RES_IT worst_word_it; |
616 | 0 | float worst_noise_score = 9999; |
617 | 0 | int worst_blob_index = -1; // Noisiest blob of noisiest wd |
618 | 0 | float noise_score; // of wds noisiest blob |
619 | 0 | WERD_RES *word_res; |
620 | 0 | C_BLOB_IT blob_it; |
621 | 0 | C_BLOB_IT rej_cblob_it; |
622 | 0 | C_BLOB_LIST new_blob_list; |
623 | 0 | C_BLOB_IT new_blob_it; |
624 | 0 | C_BLOB_IT new_rej_cblob_it; |
625 | 0 | WERD *new_word; |
626 | 0 | int16_t start_of_noise_blob; |
627 | 0 | int16_t i; |
628 | |
|
629 | 0 | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
630 | 0 | auto blob_index = worst_noise_blob(word_it.data(), &noise_score); |
631 | 0 | if (blob_index > -1 && worst_noise_score > noise_score) { |
632 | 0 | worst_noise_score = noise_score; |
633 | 0 | worst_blob_index = blob_index; |
634 | 0 | worst_word_it = word_it; |
635 | 0 | } |
636 | 0 | } |
637 | 0 | if (worst_blob_index < 0) { |
638 | 0 | words.clear(); // signal termination |
639 | 0 | return; |
640 | 0 | } |
641 | | |
642 | | /* Now split the worst_word_it */ |
643 | | |
644 | 0 | word_res = worst_word_it.data(); |
645 | | |
646 | | /* Move blobs before noise blob to a new bloblist */ |
647 | |
|
648 | 0 | new_blob_it.set_to_list(&new_blob_list); |
649 | 0 | blob_it.set_to_list(word_res->word->cblob_list()); |
650 | 0 | for (i = 0; i < worst_blob_index; i++, blob_it.forward()) { |
651 | 0 | new_blob_it.add_after_then_move(blob_it.extract()); |
652 | 0 | } |
653 | 0 | start_of_noise_blob = blob_it.data()->bounding_box().left(); |
654 | 0 | delete blob_it.extract(); // throw out noise blob |
655 | |
|
656 | 0 | new_word = new WERD(&new_blob_list, word_res->word); |
657 | 0 | new_word->set_flag(W_EOL, false); |
658 | 0 | word_res->word->set_flag(W_BOL, false); |
659 | 0 | word_res->word->set_blanks(1); // After break |
660 | |
|
661 | 0 | new_rej_cblob_it.set_to_list(new_word->rej_cblob_list()); |
662 | 0 | rej_cblob_it.set_to_list(word_res->word->rej_cblob_list()); |
663 | 0 | for (; (!rej_cblob_it.empty() && |
664 | 0 | (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob)); |
665 | 0 | rej_cblob_it.forward()) { |
666 | 0 | new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract()); |
667 | 0 | } |
668 | |
|
669 | 0 | auto *new_word_res = new WERD_RES(new_word); |
670 | 0 | new_word_res->combination = true; |
671 | 0 | worst_word_it.add_before_then_move(new_word_res); |
672 | |
|
673 | 0 | word_res->ClearResults(); |
674 | 0 | } |
675 | | |
676 | 0 | int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) { |
677 | 0 | float noise_score[512]; |
678 | 0 | int min_noise_blob; // 1st contender |
679 | 0 | int max_noise_blob; // last contender |
680 | 0 | int non_noise_count; |
681 | 0 | int worst_noise_blob; // Worst blob |
682 | 0 | float small_limit = kBlnXHeight * fixsp_small_outlines_size; |
683 | 0 | float non_noise_limit = kBlnXHeight * 0.8; |
684 | |
|
685 | 0 | if (word_res->rebuild_word == nullptr) { |
686 | 0 | return -1; // Can't handle cube words. |
687 | 0 | } |
688 | | |
689 | | // Normalised. |
690 | 0 | auto blob_count = word_res->box_word->length(); |
691 | 0 | ASSERT_HOST(blob_count <= 512); |
692 | 0 | if (blob_count < 5) { |
693 | 0 | return -1; // too short to split |
694 | 0 | } |
695 | | |
696 | | /* Get the noise scores for all blobs */ |
697 | | |
698 | 0 | #ifndef SECURE_NAMES |
699 | 0 | if (debug_fix_space_level > 5) { |
700 | 0 | tprintf("FP fixspace Noise metrics for \"%s\": ", |
701 | 0 | word_res->best_choice->unichar_string().c_str()); |
702 | 0 | } |
703 | 0 | #endif |
704 | |
|
705 | 0 | for (unsigned i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) { |
706 | 0 | TBLOB *blob = word_res->rebuild_word->blobs[i]; |
707 | 0 | if (word_res->reject_map[i].accepted()) { |
708 | 0 | noise_score[i] = non_noise_limit; |
709 | 0 | } else { |
710 | 0 | noise_score[i] = blob_noise_score(blob); |
711 | 0 | } |
712 | |
|
713 | 0 | if (debug_fix_space_level > 5) { |
714 | 0 | tprintf("%1.1f ", noise_score[i]); |
715 | 0 | } |
716 | 0 | } |
717 | 0 | if (debug_fix_space_level > 5) { |
718 | 0 | tprintf("\n"); |
719 | 0 | } |
720 | | |
721 | | /* Now find the worst one which is far enough away from the end of the word */ |
722 | |
|
723 | 0 | non_noise_count = 0; |
724 | 0 | int i; |
725 | 0 | for (i = 0; static_cast<unsigned>(i) < blob_count && non_noise_count < fixsp_non_noise_limit; i++) { |
726 | 0 | if (noise_score[i] >= non_noise_limit) { |
727 | 0 | non_noise_count++; |
728 | 0 | } |
729 | 0 | } |
730 | 0 | if (non_noise_count < fixsp_non_noise_limit) { |
731 | 0 | return -1; |
732 | 0 | } |
733 | | |
734 | 0 | min_noise_blob = i; |
735 | |
|
736 | 0 | non_noise_count = 0; |
737 | 0 | for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) { |
738 | 0 | if (noise_score[i] >= non_noise_limit) { |
739 | 0 | non_noise_count++; |
740 | 0 | } |
741 | 0 | } |
742 | 0 | if (non_noise_count < fixsp_non_noise_limit) { |
743 | 0 | return -1; |
744 | 0 | } |
745 | | |
746 | 0 | max_noise_blob = i; |
747 | |
|
748 | 0 | if (min_noise_blob > max_noise_blob) { |
749 | 0 | return -1; |
750 | 0 | } |
751 | | |
752 | 0 | *worst_noise_score = small_limit; |
753 | 0 | worst_noise_blob = -1; |
754 | 0 | for (auto i = min_noise_blob; i <= max_noise_blob; i++) { |
755 | 0 | if (noise_score[i] < *worst_noise_score) { |
756 | 0 | worst_noise_blob = i; |
757 | 0 | *worst_noise_score = noise_score[i]; |
758 | 0 | } |
759 | 0 | } |
760 | 0 | return worst_noise_blob; |
761 | 0 | } |
762 | | |
763 | 0 | float Tesseract::blob_noise_score(TBLOB *blob) { |
764 | 0 | TBOX box; // BB of outline |
765 | 0 | int16_t outline_count = 0; |
766 | 0 | int16_t max_dimension; |
767 | 0 | int16_t largest_outline_dimension = 0; |
768 | |
|
769 | 0 | for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) { |
770 | 0 | outline_count++; |
771 | 0 | box = ol->bounding_box(); |
772 | 0 | if (box.height() > box.width()) { |
773 | 0 | max_dimension = box.height(); |
774 | 0 | } else { |
775 | 0 | max_dimension = box.width(); |
776 | 0 | } |
777 | |
|
778 | 0 | if (largest_outline_dimension < max_dimension) { |
779 | 0 | largest_outline_dimension = max_dimension; |
780 | 0 | } |
781 | 0 | } |
782 | |
|
783 | 0 | if (outline_count > 5) { |
784 | | // penalise LOTS of blobs |
785 | 0 | largest_outline_dimension *= 2; |
786 | 0 | } |
787 | |
|
788 | 0 | box = blob->bounding_box(); |
789 | 0 | if (box.bottom() > kBlnBaselineOffset * 4 || box.top() < kBlnBaselineOffset / 2) { |
790 | | // Lax blob is if high or low |
791 | 0 | largest_outline_dimension /= 2; |
792 | 0 | } |
793 | |
|
794 | 0 | return largest_outline_dimension; |
795 | 0 | } |
796 | | |
797 | 0 | void fixspace_dbg(WERD_RES *word) { |
798 | 0 | TBOX box = word->word->bounding_box(); |
799 | 0 | const bool show_map_detail = false; |
800 | |
|
801 | 0 | box.print(); |
802 | 0 | tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str()); |
803 | 0 | tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", word->word->cblob_list()->length(), |
804 | 0 | word->rebuild_word->NumBlobs(), word->box_word->length()); |
805 | 0 | word->reject_map.print(debug_fp); |
806 | 0 | tprintf("\n"); |
807 | 0 | if (show_map_detail) { |
808 | 0 | tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str()); |
809 | 0 | for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { |
810 | 0 | tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); |
811 | 0 | word->reject_map[i].full_print(debug_fp); |
812 | 0 | } |
813 | 0 | } |
814 | |
|
815 | 0 | tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); |
816 | 0 | tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); |
817 | 0 | } |
818 | | |
819 | | /** |
820 | | * fp_eval_word_spacing() |
821 | | * Evaluation function for fixed pitch word lists. |
822 | | * |
823 | | * Basically, count the number of "nice" characters - those which are in tess |
824 | | * acceptable words or in dict words and are not rejected. |
825 | | * Penalise any potential noise chars |
826 | | */ |
827 | 0 | int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { |
828 | 0 | WERD_RES_IT word_it(&word_res_list); |
829 | 0 | WERD_RES *word; |
830 | 0 | int16_t score = 0; |
831 | 0 | float small_limit = kBlnXHeight * fixsp_small_outlines_size; |
832 | |
|
833 | 0 | for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { |
834 | 0 | word = word_it.data(); |
835 | 0 | if (word->rebuild_word == nullptr) { |
836 | 0 | continue; // Can't handle cube words. |
837 | 0 | } |
838 | 0 | if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM || |
839 | 0 | word->best_choice->permuter() == FREQ_DAWG_PERM || |
840 | 0 | word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) { |
841 | 0 | auto num_blobs = word->rebuild_word->NumBlobs(); |
842 | 0 | UNICHAR_ID space = word->uch_set->unichar_to_id(" "); |
843 | 0 | for (unsigned i = 0; i < word->best_choice->length() && i < num_blobs; ++i) { |
844 | 0 | TBLOB *blob = word->rebuild_word->blobs[i]; |
845 | 0 | if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) { |
846 | 0 | score -= 1; // penalise possibly erroneous non-space |
847 | 0 | } else if (word->reject_map[i].accepted()) { |
848 | 0 | score++; |
849 | 0 | } |
850 | 0 | } |
851 | 0 | } |
852 | 0 | } |
853 | 0 | if (score < 0) { |
854 | 0 | score = 0; |
855 | 0 | } |
856 | 0 | return score; |
857 | 0 | } |
858 | | |
859 | | } // namespace tesseract |