Coverage Report

Created: 2025-11-16 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/ccmain/output.cpp
Line
Count
Source
1
/******************************************************************
2
 * File:        output.cpp  (Formerly output.c)
3
 * Description: Output pass
4
 * Author:      Phil Cheatle
5
 *
6
 * (C) Copyright 1994, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
#include "output.h"
20
21
#include "control.h"
22
#include "tesseractclass.h"
23
#include "tessvars.h"
24
#ifndef DISABLED_LEGACY_ENGINE
25
#  include "docqual.h"
26
#  include "reject.h"
27
#endif
28
29
#include "helpers.h"
30
31
#include <cctype>
32
#include <cerrno>
33
#include <cstring>
34
35
0
#define CTRL_NEWLINE '\012'  // newline
36
0
#define CTRL_HARDLINE '\015' // cr
37
38
namespace tesseract {
39
void Tesseract::output_pass( // Tess output pass //send to api
40
0
    PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {
41
0
  BLOCK_RES *block_of_last_word;
42
0
  bool force_eol;   // During output
43
0
  BLOCK *nextblock; // block of next word
44
0
  WERD *nextword;   // next word
45
46
0
  page_res_it.restart_page();
47
0
  block_of_last_word = nullptr;
48
0
  while (page_res_it.word() != nullptr) {
49
0
    check_debug_pt(page_res_it.word(), 120);
50
51
0
    if (target_word_box) {
52
0
      TBOX current_word_box = page_res_it.word()->word->bounding_box();
53
0
      FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,
54
0
                       (current_word_box.bottom() + current_word_box.top()) / 2);
55
0
      if (!target_word_box->contains(center_pt)) {
56
0
        page_res_it.forward();
57
0
        continue;
58
0
      }
59
0
    }
60
0
    if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
61
0
      block_of_last_word = page_res_it.block();
62
0
    }
63
64
0
    force_eol =
65
0
        (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) ||
66
0
        (page_res_it.next_word() == nullptr);
67
68
0
    if (page_res_it.next_word() != nullptr) {
69
0
      nextword = page_res_it.next_word()->word;
70
0
    } else {
71
0
      nextword = nullptr;
72
0
    }
73
0
    if (page_res_it.next_block() != nullptr) {
74
0
      nextblock = page_res_it.next_block()->block;
75
0
    } else {
76
0
      nextblock = nullptr;
77
0
    }
78
    // regardless of tilde crunching
79
0
    write_results(page_res_it,
80
0
                  determine_newline_type(page_res_it.word()->word, page_res_it.block()->block,
81
0
                                         nextword, nextblock),
82
0
                  force_eol);
83
0
    page_res_it.forward();
84
0
  }
85
0
}
86
87
/*************************************************************************
88
 * write_results()
89
 *
90
 * All recognition and rejection has now been done. Generate the following:
91
 *   .txt file     - giving the final best choices with NO highlighting
92
 *   .raw file     - giving the tesseract top choice output for each word
93
 *   .map file     - showing how the .txt file has been rejected in the .ep file
94
 *   epchoice list - a list of one element per word, containing the text for the
95
 *                   epaper. Reject strings are inserted.
96
 *   inset list    - a list of bounding boxes of reject insets - indexed by the
97
 *                   reject strings in the epchoice text.
98
 *************************************************************************/
99
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
100
                              char newline_type, // type of newline
101
0
                              bool force_eol) {  // override tilde crunch?
102
0
  WERD_RES *word = page_res_it.word();
103
0
  const UNICHARSET &uchset = *word->uch_set;
104
0
  UNICHAR_ID space = uchset.unichar_to_id(" ");
105
106
0
  if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
107
0
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
108
0
    bool need_reject = false;
109
0
    if ((word->unlv_crunch_mode != CR_DELETE) &&
110
0
        (!stats_.tilde_crunch_written ||
111
0
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
112
0
          !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
113
0
      if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
114
0
          !word->word->flag(W_FUZZY_SP)) {
115
0
        stats_.last_char_was_tilde = false;
116
0
      }
117
0
      need_reject = true;
118
0
    }
119
0
    if ((need_reject && !stats_.last_char_was_tilde) ||
120
0
        (force_eol && stats_.write_results_empty_block)) {
121
      /* Write a reject char - mark as rejected unless zero_rejection mode */
122
0
      stats_.last_char_was_tilde = true;
123
0
      stats_.tilde_crunch_written = true;
124
0
      stats_.last_char_was_newline = false;
125
0
      stats_.write_results_empty_block = false;
126
0
    }
127
128
0
    if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
129
0
      stats_.tilde_crunch_written = false;
130
0
      stats_.last_char_was_newline = true;
131
0
      stats_.last_char_was_tilde = false;
132
0
    }
133
134
0
    if (force_eol) {
135
0
      stats_.write_results_empty_block = true;
136
0
    }
137
0
    return;
138
0
  }
139
140
  /* NORMAL PROCESSING of non tilde crunched words */
141
142
0
  stats_.tilde_crunch_written = false;
143
0
  if (newline_type) {
144
0
    stats_.last_char_was_newline = true;
145
0
  } else {
146
0
    stats_.last_char_was_newline = false;
147
0
  }
148
0
  stats_.write_results_empty_block = force_eol; // about to write a real word
149
150
0
  if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
151
0
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
152
0
      (word->best_choice->unichar_id(0) == space)) {
153
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
154
   words have been removed */
155
0
    word->MergeAdjacentBlobs(0);
156
0
  }
157
0
  if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
158
0
    stats_.last_char_was_tilde = false;
159
0
  } else {
160
0
    if (word->reject_map.length() > 0) {
161
0
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {
162
0
        stats_.last_char_was_tilde = true;
163
0
      } else {
164
0
        stats_.last_char_was_tilde = false;
165
0
      }
166
0
    } else if (word->word->space() > 0) {
167
0
      stats_.last_char_was_tilde = false;
168
0
    }
169
    /* else it is unchanged as there are no output chars */
170
0
  }
171
172
0
  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
173
174
0
  set_unlv_suspects(word);
175
0
  check_debug_pt(word, 120);
176
0
  if (tessedit_rejection_debug) {
177
0
    tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
178
0
            dict_word(*(word->best_choice)));
179
0
  }
180
0
  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
181
0
    if (tessedit_zero_rejection) {
182
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
183
0
      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
184
0
        if (word->reject_map[i].rejected()) {
185
0
          word->reject_map[i].setrej_minimal_rej_accept();
186
0
        }
187
0
      }
188
0
    }
189
0
    if (tessedit_minimal_rejection) {
190
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
191
0
      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
192
0
        if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
193
0
          word->reject_map[i].setrej_minimal_rej_accept();
194
0
        }
195
0
      }
196
0
    }
197
0
  }
198
0
}
199
200
/**********************************************************************
201
 * determine_newline_type
202
 *
203
 * Find whether we have a wrapping or hard newline.
204
 * Return false if not at end of line.
205
 **********************************************************************/
206
207
char determine_newline_type( // test line ends
208
    WERD *word,              // word to do
209
    BLOCK *block,            // current block
210
    WERD *next_word,         // next word
211
    BLOCK *next_block        // block of next word
212
0
) {
213
0
  int16_t end_gap; // to right edge
214
0
  int16_t width;   // of next word
215
0
  TBOX word_box;   // bounding
216
0
  TBOX next_box;   // next word
217
0
  TBOX block_box;  // block bounding
218
219
0
  if (!word->flag(W_EOL)) {
220
0
    return false; // not end of line
221
0
  }
222
0
  if (next_word == nullptr || next_block == nullptr || block != next_block) {
223
0
    return CTRL_NEWLINE;
224
0
  }
225
0
  if (next_word->space() > 0) {
226
0
    return CTRL_HARDLINE; // it is tabbed
227
0
  }
228
0
  word_box = word->bounding_box();
229
0
  next_box = next_word->bounding_box();
230
0
  block_box = block->pdblk.bounding_box();
231
  // gap to eol
232
0
  end_gap = block_box.right() - word_box.right();
233
0
  end_gap -= static_cast<int32_t>(block->space());
234
0
  width = next_box.right() - next_box.left();
235
  //      tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
236
  //              block_box.right(),word_box.right(),end_gap,
237
  //              next_box.right(),next_box.left(),width,
238
  //              end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
239
0
  return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
240
0
}
241
242
/*************************************************************************
243
 * get_rep_char()
244
 * Return the first accepted character from the repetition string. This is the
245
 * character which is repeated - as determined earlier by fix_rep_char()
246
 *************************************************************************/
247
0
UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
248
0
  int i;
249
0
  for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i) {
250
0
    ;
251
0
  }
252
253
0
  if (i < word->reject_map.length()) {
254
0
    return word->best_choice->unichar_id(i);
255
0
  } else {
256
0
    return word->uch_set->unichar_to_id(unrecognised_char.c_str());
257
0
  }
258
0
}
259
260
/*************************************************************************
261
 * SUSPECT LEVELS
262
 *
263
 * 0 - don't reject ANYTHING
264
 * 1,2 - partial rejection
265
 * 3 - BEST
266
 *
267
 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
268
 * tessedit_minimal_rejection.
269
 *************************************************************************/
270
0
void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
271
0
  int len = word_res->reject_map.length();
272
0
  const WERD_CHOICE &word = *(word_res->best_choice);
273
0
  const UNICHARSET &uchset = *word.unicharset();
274
0
  int i;
275
0
  float rating_per_ch;
276
277
0
  if (suspect_level == 0) {
278
0
    for (i = 0; i < len; i++) {
279
0
      if (word_res->reject_map[i].rejected()) {
280
0
        word_res->reject_map[i].setrej_minimal_rej_accept();
281
0
      }
282
0
    }
283
0
    return;
284
0
  }
285
286
0
  if (suspect_level >= 3) {
287
0
    return; // Use defaults
288
0
  }
289
290
  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
291
292
0
  if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
293
    /* Unreject alphas in dictionary words */
294
0
    for (i = 0; i < len; ++i) {
295
0
      if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
296
0
        word_res->reject_map[i].setrej_minimal_rej_accept();
297
0
      }
298
0
    }
299
0
  }
300
301
0
  rating_per_ch = word.rating() / word_res->reject_map.length();
302
303
0
  if (rating_per_ch >= suspect_rating_per_ch) {
304
0
    return; // Don't touch bad ratings
305
0
  }
306
307
0
  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
308
    /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
309
0
    for (i = 0; i < len; ++i) {
310
0
      if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
311
0
        word_res->reject_map[i].setrej_minimal_rej_accept();
312
0
      }
313
0
    }
314
0
  }
315
316
0
  for (i = 0; i < len; i++) {
317
0
    if (word_res->reject_map[i].rejected()) {
318
0
      if (word_res->reject_map[i].flag(R_DOC_REJ)) {
319
0
        word_res->reject_map[i].setrej_minimal_rej_accept();
320
0
      }
321
0
      if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {
322
0
        word_res->reject_map[i].setrej_minimal_rej_accept();
323
0
      }
324
0
      if (word_res->reject_map[i].flag(R_ROW_REJ)) {
325
0
        word_res->reject_map[i].setrej_minimal_rej_accept();
326
0
      }
327
0
    }
328
0
  }
329
330
0
  if (suspect_level == 2) {
331
0
    return;
332
0
  }
333
334
0
  if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) {
335
0
    for (i = 0; i < len; i++) {
336
0
      if (word_res->reject_map[i].rejected()) {
337
0
        if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
338
0
             word_res->reject_map[i].flag(R_POSTNN_1IL))) {
339
0
          word_res->reject_map[i].setrej_minimal_rej_accept();
340
0
        }
341
342
0
        if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {
343
0
          word_res->reject_map[i].setrej_minimal_rej_accept();
344
0
        }
345
0
      }
346
0
    }
347
0
  }
348
349
0
  if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
350
0
                             word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
351
0
      acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
352
0
    if (word_res->reject_map.length() > suspect_short_words) {
353
0
      for (i = 0; i < len; i++) {
354
0
        if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||
355
0
                                                   word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
356
0
                                                   word_res->reject_map[i].flag(R_POSTNN_1IL) ||
357
0
                                                   word_res->reject_map[i].flag(R_MM_REJECT))) {
358
0
          word_res->reject_map[i].setrej_minimal_rej_accept();
359
0
        }
360
0
      }
361
0
    }
362
0
  }
363
0
}
364
365
0
int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
366
0
  int count = 0;
367
0
  for (unsigned i = 0; i < word.length(); ++i) {
368
0
    if (word.unicharset()->get_isalpha(word.unichar_id(i))) {
369
0
      count++;
370
0
    }
371
0
  }
372
0
  return count;
373
0
}
374
375
0
int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
376
0
  int count = 0;
377
0
  for (unsigned i = 0; i < word.length(); ++i) {
378
0
    if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
379
0
        word.unicharset()->get_isdigit(word.unichar_id(i))) {
380
0
      count++;
381
0
    }
382
0
  }
383
0
  return count;
384
0
}
385
386
0
bool Tesseract::acceptable_number_string(const char *s, const char *lengths) {
387
0
  bool prev_digit = false;
388
389
0
  if (*lengths == 1 && *s == '(') {
390
0
    s++;
391
0
  }
392
393
0
  if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
394
0
    s++;
395
0
  }
396
397
0
  for (; *s != '\0'; s += *(lengths++)) {
398
0
    if (unicharset.get_isdigit(s, *lengths)) {
399
0
      prev_digit = true;
400
0
    } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {
401
0
      prev_digit = false;
402
0
    } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
403
0
               ((*s == '%') || (*s == ')'))) {
404
0
      return true;
405
0
    } else if (prev_digit && *lengths == 1 && (*s == '%') &&
406
0
               (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
407
0
               (*(s + *lengths + *(lengths + 1)) == '\0')) {
408
0
      return true;
409
0
    } else {
410
0
      return false;
411
0
    }
412
0
  }
413
0
  return true;
414
0
}
415
} // namespace tesseract