/src/tesseract/src/ccmain/output.cpp
Line | Count | Source |
1 | | /****************************************************************** |
2 | | * File: output.cpp (Formerly output.c) |
3 | | * Description: Output pass |
4 | | * Author: Phil Cheatle |
5 | | * |
6 | | * (C) Copyright 1994, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #include "output.h" |
20 | | |
21 | | #include "control.h" |
22 | | #include "tesseractclass.h" |
23 | | #include "tessvars.h" |
24 | | #ifndef DISABLED_LEGACY_ENGINE |
25 | | # include "docqual.h" |
26 | | # include "reject.h" |
27 | | #endif |
28 | | |
29 | | #include "helpers.h" |
30 | | |
31 | | #include <cctype> |
32 | | #include <cerrno> |
33 | | #include <cstring> |
34 | | |
// Line-ending codes returned by determine_newline_type().
constexpr char CTRL_NEWLINE = '\012';  // line feed: ordinary (wrapping) newline
constexpr char CTRL_HARDLINE = '\015'; // carriage return: hard newline
37 | | |
38 | | namespace tesseract { |
39 | | void Tesseract::output_pass( // Tess output pass //send to api |
40 | 0 | PAGE_RES_IT &page_res_it, const TBOX *target_word_box) { |
41 | 0 | BLOCK_RES *block_of_last_word; |
42 | 0 | bool force_eol; // During output |
43 | 0 | BLOCK *nextblock; // block of next word |
44 | 0 | WERD *nextword; // next word |
45 | |
|
46 | 0 | page_res_it.restart_page(); |
47 | 0 | block_of_last_word = nullptr; |
48 | 0 | while (page_res_it.word() != nullptr) { |
49 | 0 | check_debug_pt(page_res_it.word(), 120); |
50 | |
|
51 | 0 | if (target_word_box) { |
52 | 0 | TBOX current_word_box = page_res_it.word()->word->bounding_box(); |
53 | 0 | FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2, |
54 | 0 | (current_word_box.bottom() + current_word_box.top()) / 2); |
55 | 0 | if (!target_word_box->contains(center_pt)) { |
56 | 0 | page_res_it.forward(); |
57 | 0 | continue; |
58 | 0 | } |
59 | 0 | } |
60 | 0 | if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) { |
61 | 0 | block_of_last_word = page_res_it.block(); |
62 | 0 | } |
63 | |
|
64 | 0 | force_eol = |
65 | 0 | (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) || |
66 | 0 | (page_res_it.next_word() == nullptr); |
67 | |
|
68 | 0 | if (page_res_it.next_word() != nullptr) { |
69 | 0 | nextword = page_res_it.next_word()->word; |
70 | 0 | } else { |
71 | 0 | nextword = nullptr; |
72 | 0 | } |
73 | 0 | if (page_res_it.next_block() != nullptr) { |
74 | 0 | nextblock = page_res_it.next_block()->block; |
75 | 0 | } else { |
76 | 0 | nextblock = nullptr; |
77 | 0 | } |
78 | | // regardless of tilde crunching |
79 | 0 | write_results(page_res_it, |
80 | 0 | determine_newline_type(page_res_it.word()->word, page_res_it.block()->block, |
81 | 0 | nextword, nextblock), |
82 | 0 | force_eol); |
83 | 0 | page_res_it.forward(); |
84 | 0 | } |
85 | 0 | } |
86 | | |
87 | | /************************************************************************* |
88 | | * write_results() |
89 | | * |
90 | | * All recognition and rejection has now been done. Generate the following: |
91 | | * .txt file - giving the final best choices with NO highlighting |
92 | | * .raw file - giving the tesseract top choice output for each word |
93 | | * .map file - showing how the .txt file has been rejected in the .ep file |
94 | | * epchoice list - a list of one element per word, containing the text for the |
95 | | * epaper. Reject strings are inserted. |
96 | | * inset list - a list of bounding boxes of reject insets - indexed by the |
97 | | * reject strings in the epchoice text. |
98 | | *************************************************************************/ |
99 | | void Tesseract::write_results(PAGE_RES_IT &page_res_it, |
100 | | char newline_type, // type of newline |
101 | 0 | bool force_eol) { // override tilde crunch? |
102 | 0 | WERD_RES *word = page_res_it.word(); |
103 | 0 | const UNICHARSET &uchset = *word->uch_set; |
104 | 0 | UNICHAR_ID space = uchset.unichar_to_id(" "); |
105 | |
|
106 | 0 | if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) && |
107 | 0 | !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { |
108 | 0 | bool need_reject = false; |
109 | 0 | if ((word->unlv_crunch_mode != CR_DELETE) && |
110 | 0 | (!stats_.tilde_crunch_written || |
111 | 0 | ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) && |
112 | 0 | !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) { |
113 | 0 | if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) && |
114 | 0 | !word->word->flag(W_FUZZY_SP)) { |
115 | 0 | stats_.last_char_was_tilde = false; |
116 | 0 | } |
117 | 0 | need_reject = true; |
118 | 0 | } |
119 | 0 | if ((need_reject && !stats_.last_char_was_tilde) || |
120 | 0 | (force_eol && stats_.write_results_empty_block)) { |
121 | | /* Write a reject char - mark as rejected unless zero_rejection mode */ |
122 | 0 | stats_.last_char_was_tilde = true; |
123 | 0 | stats_.tilde_crunch_written = true; |
124 | 0 | stats_.last_char_was_newline = false; |
125 | 0 | stats_.write_results_empty_block = false; |
126 | 0 | } |
127 | |
|
128 | 0 | if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) { |
129 | 0 | stats_.tilde_crunch_written = false; |
130 | 0 | stats_.last_char_was_newline = true; |
131 | 0 | stats_.last_char_was_tilde = false; |
132 | 0 | } |
133 | |
|
134 | 0 | if (force_eol) { |
135 | 0 | stats_.write_results_empty_block = true; |
136 | 0 | } |
137 | 0 | return; |
138 | 0 | } |
139 | | |
140 | | /* NORMAL PROCESSING of non tilde crunched words */ |
141 | | |
142 | 0 | stats_.tilde_crunch_written = false; |
143 | 0 | if (newline_type) { |
144 | 0 | stats_.last_char_was_newline = true; |
145 | 0 | } else { |
146 | 0 | stats_.last_char_was_newline = false; |
147 | 0 | } |
148 | 0 | stats_.write_results_empty_block = force_eol; // about to write a real word |
149 | |
|
150 | 0 | if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) && |
151 | 0 | !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && |
152 | 0 | (word->best_choice->unichar_id(0) == space)) { |
153 | | /* Prevent adjacent tilde across words - we know that adjacent tildes within |
154 | | words have been removed */ |
155 | 0 | word->MergeAdjacentBlobs(0); |
156 | 0 | } |
157 | 0 | if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) { |
158 | 0 | stats_.last_char_was_tilde = false; |
159 | 0 | } else { |
160 | 0 | if (word->reject_map.length() > 0) { |
161 | 0 | if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) { |
162 | 0 | stats_.last_char_was_tilde = true; |
163 | 0 | } else { |
164 | 0 | stats_.last_char_was_tilde = false; |
165 | 0 | } |
166 | 0 | } else if (word->word->space() > 0) { |
167 | 0 | stats_.last_char_was_tilde = false; |
168 | 0 | } |
169 | | /* else it is unchanged as there are no output chars */ |
170 | 0 | } |
171 | |
|
172 | 0 | ASSERT_HOST(word->best_choice->length() == word->reject_map.length()); |
173 | |
|
174 | 0 | set_unlv_suspects(word); |
175 | 0 | check_debug_pt(word, 120); |
176 | 0 | if (tessedit_rejection_debug) { |
177 | 0 | tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(), |
178 | 0 | dict_word(*(word->best_choice))); |
179 | 0 | } |
180 | 0 | if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) { |
181 | 0 | if (tessedit_zero_rejection) { |
182 | | /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ |
183 | 0 | for (unsigned i = 0; i < word->best_choice->length(); ++i) { |
184 | 0 | if (word->reject_map[i].rejected()) { |
185 | 0 | word->reject_map[i].setrej_minimal_rej_accept(); |
186 | 0 | } |
187 | 0 | } |
188 | 0 | } |
189 | 0 | if (tessedit_minimal_rejection) { |
190 | | /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ |
191 | 0 | for (unsigned i = 0; i < word->best_choice->length(); ++i) { |
192 | 0 | if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) { |
193 | 0 | word->reject_map[i].setrej_minimal_rej_accept(); |
194 | 0 | } |
195 | 0 | } |
196 | 0 | } |
197 | 0 | } |
198 | 0 | } |
199 | | |
200 | | /********************************************************************** |
201 | | * determine_newline_type |
202 | | * |
203 | | * Find whether we have a wrapping or hard newline. |
204 | | * Return false if not at end of line. |
205 | | **********************************************************************/ |
206 | | |
207 | | char determine_newline_type( // test line ends |
208 | | WERD *word, // word to do |
209 | | BLOCK *block, // current block |
210 | | WERD *next_word, // next word |
211 | | BLOCK *next_block // block of next word |
212 | 0 | ) { |
213 | 0 | int16_t end_gap; // to right edge |
214 | 0 | int16_t width; // of next word |
215 | 0 | TBOX word_box; // bounding |
216 | 0 | TBOX next_box; // next word |
217 | 0 | TBOX block_box; // block bounding |
218 | |
|
219 | 0 | if (!word->flag(W_EOL)) { |
220 | 0 | return false; // not end of line |
221 | 0 | } |
222 | 0 | if (next_word == nullptr || next_block == nullptr || block != next_block) { |
223 | 0 | return CTRL_NEWLINE; |
224 | 0 | } |
225 | 0 | if (next_word->space() > 0) { |
226 | 0 | return CTRL_HARDLINE; // it is tabbed |
227 | 0 | } |
228 | 0 | word_box = word->bounding_box(); |
229 | 0 | next_box = next_word->bounding_box(); |
230 | 0 | block_box = block->pdblk.bounding_box(); |
231 | | // gap to eol |
232 | 0 | end_gap = block_box.right() - word_box.right(); |
233 | 0 | end_gap -= static_cast<int32_t>(block->space()); |
234 | 0 | width = next_box.right() - next_box.left(); |
235 | | // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n", |
236 | | // block_box.right(),word_box.right(),end_gap, |
237 | | // next_box.right(),next_box.left(),width, |
238 | | // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE); |
239 | 0 | return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE; |
240 | 0 | } |
241 | | |
242 | | /************************************************************************* |
243 | | * get_rep_char() |
244 | | * Return the first accepted character from the repetition string. This is the |
245 | | * character which is repeated - as determined earlier by fix_rep_char() |
246 | | *************************************************************************/ |
247 | 0 | UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? |
248 | 0 | int i; |
249 | 0 | for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i) { |
250 | 0 | ; |
251 | 0 | } |
252 | |
|
253 | 0 | if (i < word->reject_map.length()) { |
254 | 0 | return word->best_choice->unichar_id(i); |
255 | 0 | } else { |
256 | 0 | return word->uch_set->unichar_to_id(unrecognised_char.c_str()); |
257 | 0 | } |
258 | 0 | } |
259 | | |
260 | | /************************************************************************* |
261 | | * SUSPECT LEVELS |
262 | | * |
263 | | * 0 - don't reject ANYTHING |
264 | | * 1,2 - partial rejection |
265 | | * 3 - BEST |
266 | | * |
267 | | * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and |
268 | | * tessedit_minimal_rejection. |
269 | | *************************************************************************/ |
270 | 0 | void Tesseract::set_unlv_suspects(WERD_RES *word_res) { |
271 | 0 | int len = word_res->reject_map.length(); |
272 | 0 | const WERD_CHOICE &word = *(word_res->best_choice); |
273 | 0 | const UNICHARSET &uchset = *word.unicharset(); |
274 | 0 | int i; |
275 | 0 | float rating_per_ch; |
276 | |
|
277 | 0 | if (suspect_level == 0) { |
278 | 0 | for (i = 0; i < len; i++) { |
279 | 0 | if (word_res->reject_map[i].rejected()) { |
280 | 0 | word_res->reject_map[i].setrej_minimal_rej_accept(); |
281 | 0 | } |
282 | 0 | } |
283 | 0 | return; |
284 | 0 | } |
285 | | |
286 | 0 | if (suspect_level >= 3) { |
287 | 0 | return; // Use defaults |
288 | 0 | } |
289 | | |
290 | | /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ |
291 | | |
292 | 0 | if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) { |
293 | | /* Unreject alphas in dictionary words */ |
294 | 0 | for (i = 0; i < len; ++i) { |
295 | 0 | if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) { |
296 | 0 | word_res->reject_map[i].setrej_minimal_rej_accept(); |
297 | 0 | } |
298 | 0 | } |
299 | 0 | } |
300 | |
|
301 | 0 | rating_per_ch = word.rating() / word_res->reject_map.length(); |
302 | |
|
303 | 0 | if (rating_per_ch >= suspect_rating_per_ch) { |
304 | 0 | return; // Don't touch bad ratings |
305 | 0 | } |
306 | | |
307 | 0 | if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { |
308 | | /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ |
309 | 0 | for (i = 0; i < len; ++i) { |
310 | 0 | if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) { |
311 | 0 | word_res->reject_map[i].setrej_minimal_rej_accept(); |
312 | 0 | } |
313 | 0 | } |
314 | 0 | } |
315 | |
|
316 | 0 | for (i = 0; i < len; i++) { |
317 | 0 | if (word_res->reject_map[i].rejected()) { |
318 | 0 | if (word_res->reject_map[i].flag(R_DOC_REJ)) { |
319 | 0 | word_res->reject_map[i].setrej_minimal_rej_accept(); |
320 | 0 | } |
321 | 0 | if (word_res->reject_map[i].flag(R_BLOCK_REJ)) { |
322 | 0 | word_res->reject_map[i].setrej_minimal_rej_accept(); |
323 | 0 | } |
324 | 0 | if (word_res->reject_map[i].flag(R_ROW_REJ)) { |
325 | 0 | word_res->reject_map[i].setrej_minimal_rej_accept(); |
326 | 0 | } |
327 | 0 | } |
328 | 0 | } |
329 | |
|
330 | 0 | if (suspect_level == 2) { |
331 | 0 | return; |
332 | 0 | } |
333 | | |
334 | 0 | if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) { |
335 | 0 | for (i = 0; i < len; i++) { |
336 | 0 | if (word_res->reject_map[i].rejected()) { |
337 | 0 | if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) || |
338 | 0 | word_res->reject_map[i].flag(R_POSTNN_1IL))) { |
339 | 0 | word_res->reject_map[i].setrej_minimal_rej_accept(); |
340 | 0 | } |
341 | |
|
342 | 0 | if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) { |
343 | 0 | word_res->reject_map[i].setrej_minimal_rej_accept(); |
344 | 0 | } |
345 | 0 | } |
346 | 0 | } |
347 | 0 | } |
348 | |
|
349 | 0 | if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(), |
350 | 0 | word.unichar_lengths().c_str()) != AC_UNACCEPTABLE || |
351 | 0 | acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) { |
352 | 0 | if (word_res->reject_map.length() > suspect_short_words) { |
353 | 0 | for (i = 0; i < len; i++) { |
354 | 0 | if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() || |
355 | 0 | word_res->reject_map[i].flag(R_1IL_CONFLICT) || |
356 | 0 | word_res->reject_map[i].flag(R_POSTNN_1IL) || |
357 | 0 | word_res->reject_map[i].flag(R_MM_REJECT))) { |
358 | 0 | word_res->reject_map[i].setrej_minimal_rej_accept(); |
359 | 0 | } |
360 | 0 | } |
361 | 0 | } |
362 | 0 | } |
363 | 0 | } |
364 | | |
365 | 0 | int16_t Tesseract::count_alphas(const WERD_CHOICE &word) { |
366 | 0 | int count = 0; |
367 | 0 | for (unsigned i = 0; i < word.length(); ++i) { |
368 | 0 | if (word.unicharset()->get_isalpha(word.unichar_id(i))) { |
369 | 0 | count++; |
370 | 0 | } |
371 | 0 | } |
372 | 0 | return count; |
373 | 0 | } |
374 | | |
375 | 0 | int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) { |
376 | 0 | int count = 0; |
377 | 0 | for (unsigned i = 0; i < word.length(); ++i) { |
378 | 0 | if (word.unicharset()->get_isalpha(word.unichar_id(i)) || |
379 | 0 | word.unicharset()->get_isdigit(word.unichar_id(i))) { |
380 | 0 | count++; |
381 | 0 | } |
382 | 0 | } |
383 | 0 | return count; |
384 | 0 | } |
385 | | |
386 | 0 | bool Tesseract::acceptable_number_string(const char *s, const char *lengths) { |
387 | 0 | bool prev_digit = false; |
388 | |
|
389 | 0 | if (*lengths == 1 && *s == '(') { |
390 | 0 | s++; |
391 | 0 | } |
392 | |
|
393 | 0 | if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) { |
394 | 0 | s++; |
395 | 0 | } |
396 | |
|
397 | 0 | for (; *s != '\0'; s += *(lengths++)) { |
398 | 0 | if (unicharset.get_isdigit(s, *lengths)) { |
399 | 0 | prev_digit = true; |
400 | 0 | } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) { |
401 | 0 | prev_digit = false; |
402 | 0 | } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') && |
403 | 0 | ((*s == '%') || (*s == ')'))) { |
404 | 0 | return true; |
405 | 0 | } else if (prev_digit && *lengths == 1 && (*s == '%') && |
406 | 0 | (*(lengths + 1) == 1 && *(s + *lengths) == ')') && |
407 | 0 | (*(s + *lengths + *(lengths + 1)) == '\0')) { |
408 | 0 | return true; |
409 | 0 | } else { |
410 | 0 | return false; |
411 | 0 | } |
412 | 0 | } |
413 | 0 | return true; |
414 | 0 | } |
415 | | } // namespace tesseract |