/src/tesseract/src/ccmain/output.cpp

Source
/******************************************************************
 * File:        output.cpp  (Formerly output.c)
 * Description: Output pass
 * Author:      Phil Cheatle
 *
 * (C) Copyright 1994, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "output.h"

#include "control.h"
#include "tesseractclass.h"
#include "tessvars.h"
#ifndef DISABLED_LEGACY_ENGINE
#  include "docqual.h"
#  include "reject.h"
#endif

#include "helpers.h"

#include <cctype>
#include <cerrno>
#include <cstring>

#define CTRL_NEWLINE '\012'  // newline
#define CTRL_HARDLINE '\015' // cr

namespace tesseract {
// Output pass over a whole page: restarts page_res_it, then visits every word
// in order and hands it to write_results() together with the newline type that
// follows it and a flag saying whether an end-of-line must be forced.
//
// page_res_it      iterator over the page results; left past the last word.
// target_word_box  optional filter; when non-null, only words whose bounding
//                  box centre lies inside this box are processed.
void Tesseract::output_pass( // Tess output pass //send to api
    PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {
  BLOCK_RES *block_of_last_word = nullptr;

  // Every "continue" below still advances the iterator through the loop header.
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    check_debug_pt(page_res_it.word(), 120);

    if (target_word_box != nullptr) {
      // Only words centred inside the requested box take part in the pass.
      const TBOX current_word_box = page_res_it.word()->word->bounding_box();
      FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,
                       (current_word_box.bottom() + current_word_box.top()) / 2);
      if (!target_word_box->contains(center_pt)) {
        continue;
      }
    }

    // Remember which block the most recently handled word belonged to.
    if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
      block_of_last_word = page_res_it.block();
    }

    WERD_RES *upcoming_word = page_res_it.next_word();
    BLOCK_RES *upcoming_block = page_res_it.next_block();

    // An end-of-line is forced after the last word of the page and, when block
    // separators are enabled, after the last word of each block.
    const bool is_last_word = (upcoming_word == nullptr);
    const bool leaves_block =
        tessedit_write_block_separators && (page_res_it.block() != upcoming_block);
    const bool force_eol = leaves_block || is_last_word;

    WERD *nextword = is_last_word ? nullptr : upcoming_word->word;
    BLOCK *nextblock = (upcoming_block == nullptr) ? nullptr : upcoming_block->block;

    // regardless of tilde crunching
    const char newline_type = determine_newline_type(
        page_res_it.word()->word, page_res_it.block()->block, nextword, nextblock);
    write_results(page_res_it, newline_type, force_eol);
  }
}

/*************************************************************************
 * write_results()
 *
 * All recognition and rejection has now been done. For the word currently
 * under page_res_it this routine:
 *   - keeps the output bookkeeping in stats_ up to date
 *     (tilde_crunch_written, last_char_was_tilde, last_char_was_newline,
 *     write_results_empty_block);
 *   - for words that are not tilde crunched, finalises the reject map via
 *     set_unlv_suspects() and the tessedit_zero_rejection /
 *     tessedit_minimal_rejection overrides.
 *
 * The code below performs no file output itself; the only text it emits is
 * the optional tessedit_rejection_debug trace.
 *
 * page_res_it   iterator positioned on the word to process.
 * newline_type  result of determine_newline_type() for this word
 *               (0 when the word does not end a line).
 * force_eol     true when an end-of-line must be recorded after this word
 *               even if the word itself is crunched.
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                              char newline_type, // type of newline
                              bool force_eol) {  // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  const UNICHARSET &uchset = *word->uch_set;
  UNICHAR_ID space = uchset.unichar_to_id(" ");

  /* TILDE CRUNCHED or empty words: only the stats_ flags are updated. */
  if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    bool need_reject = false;
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
          !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
      // A definite (non-fuzzy) leading space on a word that does not start a
      // line separates this reject from any previous one.
      if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
          !word->word->flag(W_FUZZY_SP)) {
        stats_.last_char_was_tilde = false;
      }
      need_reject = true;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Record that a reject char stands in for the crunched word(s). */
      stats_.last_char_was_tilde = true;
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }

    if (force_eol) {
      stats_.write_results_empty_block = true;
    }
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  stats_.last_char_was_newline = (newline_type != 0);
  stats_.write_results_empty_block = force_eol; // about to write a real word

  // The empty() check matters: with tessedit_zero_kelvin_rejection or
  // tessedit_word_for_word set, an empty best_choice reaches this point and
  // must not be indexed.
  if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      !word->best_choice->empty() && (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
   words have been removed */
    word->MergeAdjacentBlobs(0);
  }
  if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
    stats_.last_char_was_tilde = false;
  } else {
    if (word->reject_map.length() > 0) {
      // The word's final character decides whether it ends in a reject (space).
      stats_.last_char_was_tilde =
          (word->best_choice->unichar_id(word->reject_map.length() - 1) == space);
    } else if (word->word->space() > 0) {
      stats_.last_char_was_tilde = false;
    }
    /* else it is unchanged as there are no output chars */
  }

  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt(word, 120);
  if (tessedit_rejection_debug) {
    tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
            dict_word(*(word->best_choice)));
  }
  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - accept every rejected character */
      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected()) {
          word->reject_map[i].setrej_minimal_rej_accept();
        }
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES
         (characters whose id is the space unichar stay rejected) */
      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
          word->reject_map[i].setrej_minimal_rej_accept();
        }
      }
    }
  }
}

/**********************************************************************
 * determine_newline_type
 *
 * Classify the line break that follows a word.
 *   - 0 (false)      the word is not flagged W_EOL;
 *   - CTRL_NEWLINE   there is no following word/block, the following word
 *                    is in another block, or the next word would not have
 *                    fitted in the room left on this line;
 *   - CTRL_HARDLINE  the next word has leading space, or the room left on
 *                    this line is wider than the next word.
 **********************************************************************/

char determine_newline_type( // test line ends
    WERD *word,              // word to do
    BLOCK *block,            // current block
    WERD *next_word,         // next word
    BLOCK *next_block        // block of next word
) {
  if (!word->flag(W_EOL)) {
    return false; // not end of line
  }

  const bool continues_in_block =
      next_word != nullptr && next_block != nullptr && block == next_block;
  if (!continues_in_block) {
    return CTRL_NEWLINE;
  }
  if (next_word->space() > 0) {
    return CTRL_HARDLINE; // it is tabbed
  }

  const TBOX word_box = word->bounding_box();         // this word
  const TBOX next_box = next_word->bounding_box();    // following word
  const TBOX block_box = block->pdblk.bounding_box(); // enclosing block

  // Room between this word and the block's right edge, less one block space.
  int16_t end_gap = block_box.right() - word_box.right();
  end_gap -= static_cast<int32_t>(block->space());
  const int16_t width = next_box.right() - next_box.left(); // of next word
  //      tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
  //              block_box.right(),word_box.right(),end_gap,
  //              next_box.right(),next_box.left(),width,
  //              end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
  if (end_gap > width) {
    return CTRL_HARDLINE;
  }
  return CTRL_NEWLINE;
}

/*************************************************************************
 * get_rep_char()
 * Return the first accepted character from the repetition string. This is the
 * character which is repeated - as determined earlier by fix_rep_char().
 * When every position of the reject map is rejected (or the map is empty),
 * the id of unrecognised_char is returned instead.
 *************************************************************************/
UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
  const int map_len = word->reject_map.length();

  // Skip over the leading run of rejected positions.
  int pos = 0;
  while (pos < map_len && word->reject_map[pos].rejected()) {
    ++pos;
  }

  if (pos >= map_len) {
    return word->uch_set->unichar_to_id(unrecognised_char.c_str());
  }
  return word->best_choice->unichar_id(pos);
}

/*************************************************************************
 * SUSPECT LEVELS
 *
 * 0 - don't reject ANYTHING
 * 1,2 - partial rejection
 * 3 - BEST
 *
 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
 * tessedit_minimal_rejection.
 *
 * set_unlv_suspects() relaxes the reject map of word_res according to
 * suspect_level by calling setrej_minimal_rej_accept() on selected rejected
 * positions:
 *   level 0   every rejected position is accepted;
 *   level >=3 the reject map is left untouched;
 *   level 2   dictionary alphas, well-rated / tess-accepted words and
 *             document/block/row rejects are accepted;
 *   level 1   as level 2, plus 1/I/l conflict and MM rejects and the
 *             non-permanent rejects of acceptable words / numbers.
 * A word with an empty reject map is returned unchanged.
 *************************************************************************/
void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
  REJMAP &rej_map = word_res->reject_map;
  const int len = rej_map.length();

  // Every step below only touches positions 0..len-1, so an empty map has
  // nothing to relax. Returning here also keeps the per-character rating
  // further down from dividing by zero.
  if (len == 0) {
    return;
  }

  const WERD_CHOICE &word = *(word_res->best_choice);
  const UNICHARSET &uchset = *word.unicharset();

  if (suspect_level == 0) {
    for (int i = 0; i < len; i++) {
      if (rej_map[i].rejected()) {
        rej_map[i].setrej_minimal_rej_accept();
      }
    }
    return;
  }

  if (suspect_level >= 3) {
    return; // Use defaults
  }

  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/

  if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
    /* Unreject alphas in dictionary words */
    for (int i = 0; i < len; ++i) {
      if (rej_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
        rej_map[i].setrej_minimal_rej_accept();
      }
    }
  }

  const float rating_per_ch = word.rating() / len;

  if (rating_per_ch >= suspect_rating_per_ch) {
    return; // Don't touch bad ratings
  }

  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
    /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
    for (int i = 0; i < len; ++i) {
      if (rej_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
        rej_map[i].setrej_minimal_rej_accept();
      }
    }
  }

  /* Unreject characters that were rejected at document, block or row level */
  for (int i = 0; i < len; i++) {
    if (rej_map[i].rejected()) {
      if (rej_map[i].flag(R_DOC_REJ)) {
        rej_map[i].setrej_minimal_rej_accept();
      }
      if (rej_map[i].flag(R_BLOCK_REJ)) {
        rej_map[i].setrej_minimal_rej_accept();
      }
      if (rej_map[i].flag(R_ROW_REJ)) {
        rej_map[i].setrej_minimal_rej_accept();
      }
    }
  }

  if (suspect_level == 2) {
    return;
  }

  /* LEVEL 1 ONLY from here on */

  if (!suspect_constrain_1Il || (len <= suspect_short_words)) {
    for (int i = 0; i < len; i++) {
      if (rej_map[i].rejected()) {
        if ((rej_map[i].flag(R_1IL_CONFLICT) || rej_map[i].flag(R_POSTNN_1IL))) {
          rej_map[i].setrej_minimal_rej_accept();
        }

        if (!suspect_constrain_1Il && rej_map[i].flag(R_MM_REJECT)) {
          rej_map[i].setrej_minimal_rej_accept();
        }
      }
    }
  }

  if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
                             word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
      acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
    if (len > suspect_short_words) {
      /* Longer acceptable words / numbers: unreject everything that is not
         permanently rejected, plus the 1/I/l and MM rejects */
      for (int i = 0; i < len; i++) {
        if (rej_map[i].rejected() &&
            (!rej_map[i].perm_rejected() || rej_map[i].flag(R_1IL_CONFLICT) ||
             rej_map[i].flag(R_POSTNN_1IL) || rej_map[i].flag(R_MM_REJECT))) {
          rej_map[i].setrej_minimal_rej_accept();
        }
      }
    }
  }
}

// Returns how many unichars of `word` its unicharset classifies as alphabetic.
int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
  const auto *charset = word.unicharset();
  int alpha_total = 0;
  for (unsigned pos = 0; pos < word.length(); ++pos) {
    alpha_total += charset->get_isalpha(word.unichar_id(pos)) ? 1 : 0;
  }
  return alpha_total;
}

// Returns how many unichars of `word` its unicharset classifies as either
// alphabetic or a digit.
int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
  const auto *charset = word.unicharset();
  int alnum_total = 0;
  for (unsigned pos = 0; pos < word.length(); ++pos) {
    const auto id = word.unichar_id(pos);
    const bool is_alnum = charset->get_isalpha(id) || charset->get_isdigit(id);
    alnum_total += is_alnum ? 1 : 0;
  }
  return alnum_total;
}

// Decides whether a word looks like a number.
//
// s        the word's characters, concatenated (NUL terminated).
// lengths  one entry per character of the word giving its byte length in s
//          (NUL terminated); s and lengths are walked in step.
//
// Accepted shape: an optional leading '(', an optional one of '$' '.' '+' '-',
// then digits in which a single '.', ',' or '-' may follow a digit. The string
// may end with '%', ')' or "%)" directly after a digit. Any other character
// makes the result false. An empty remainder yields true.
bool Tesseract::acceptable_number_string(const char *s, const char *lengths) {
  bool prev_digit = false;

  // Each optional prefix character consumes its lengths entry as well, so the
  // byte lengths used by the loop below belong to the characters being tested.
  if (*lengths == 1 && *s == '(') {
    s++;
    lengths++;
  }

  if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
    s++;
    lengths++;
  }

  for (; *s != '\0'; s += *(lengths++)) {
    if (unicharset.get_isdigit(s, *lengths)) {
      prev_digit = true;
    } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {
      // A separator is only legal straight after a digit.
      prev_digit = false;
    } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
               ((*s == '%') || (*s == ')'))) {
      // Trailing '%' or ')' as the very last character.
      return true;
    } else if (prev_digit && *lengths == 1 && (*s == '%') &&
               (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
               (*(s + *lengths + *(lengths + 1)) == '\0')) {
      // Trailing "%)" as the last two characters.
      return true;
    } else {
      return false;
    }
  }
  return true;
}
} // namespace tesseract

Coverage Report

Created: 2025-11-16 06:50

Line	Count	Source
1		/******************************************************************
2		* File: output.cpp (Formerly output.c)
3		* Description: Output pass
4		* Author: Phil Cheatle
5		*
6		* (C) Copyright 1994, Hewlett-Packard Ltd.
7		** Licensed under the Apache License, Version 2.0 (the "License");
8		** you may not use this file except in compliance with the License.
9		** You may obtain a copy of the License at
10		** http://www.apache.org/licenses/LICENSE-2.0
11		** Unless required by applicable law or agreed to in writing, software
12		** distributed under the License is distributed on an "AS IS" BASIS,
13		** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		** See the License for the specific language governing permissions and
15		** limitations under the License.
16		*
17		**********************************************************************/
18
19		#include "output.h"
20
21		#include "control.h"
22		#include "tesseractclass.h"
23		#include "tessvars.h"
24		#ifndef DISABLED_LEGACY_ENGINE
25		# include "docqual.h"
26		# include "reject.h"
27		#endif
28
29		#include "helpers.h"
30
31		#include <cctype>
32		#include <cerrno>
33		#include <cstring>
34
35	0	#define CTRL_NEWLINE '\012' // newline
36	0	#define CTRL_HARDLINE '\015' // cr
37
38		namespace tesseract {
39		void Tesseract::output_pass( // Tess output pass //send to api
40	0	PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {
41	0	BLOCK_RES *block_of_last_word;
42	0	bool force_eol; // During output
43	0	BLOCK *nextblock; // block of next word
44	0	WERD *nextword; // next word
45
46	0	page_res_it.restart_page();
47	0	block_of_last_word = nullptr;
48	0	while (page_res_it.word() != nullptr) {
49	0	check_debug_pt(page_res_it.word(), 120);
50
51	0	if (target_word_box) {
52	0	TBOX current_word_box = page_res_it.word()->word->bounding_box();
53	0	FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,
54	0	(current_word_box.bottom() + current_word_box.top()) / 2);
55	0	if (!target_word_box->contains(center_pt)) {
56	0	page_res_it.forward();
57	0	continue;
58	0	}
59	0	}
60	0	if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
61	0	block_of_last_word = page_res_it.block();
62	0	}
63
64	0	force_eol =
65	0	(tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) \|\|
66	0	(page_res_it.next_word() == nullptr);
67
68	0	if (page_res_it.next_word() != nullptr) {
69	0	nextword = page_res_it.next_word()->word;
70	0	} else {
71	0	nextword = nullptr;
72	0	}
73	0	if (page_res_it.next_block() != nullptr) {
74	0	nextblock = page_res_it.next_block()->block;
75	0	} else {
76	0	nextblock = nullptr;
77	0	}
78		// regardless of tilde crunching
79	0	write_results(page_res_it,
80	0	determine_newline_type(page_res_it.word()->word, page_res_it.block()->block,
81	0	nextword, nextblock),
82	0	force_eol);
83	0	page_res_it.forward();
84	0	}
85	0	}
86
87		/*************************************************************************
88		* write_results()
89		*
90		* All recognition and rejection has now been done. Generate the following:
91		* .txt file - giving the final best choices with NO highlighting
92		* .raw file - giving the tesseract top choice output for each word
93		* .map file - showing how the .txt file has been rejected in the .ep file
94		* epchoice list - a list of one element per word, containing the text for the
95		* epaper. Reject strings are inserted.
96		* inset list - a list of bounding boxes of reject insets - indexed by the
97		* reject strings in the epchoice text.
98		*************************************************************************/
99		void Tesseract::write_results(PAGE_RES_IT &page_res_it,
100		char newline_type, // type of newline
101	0	bool force_eol) { // override tilde crunch?
102	0	WERD_RES *word = page_res_it.word();
103	0	const UNICHARSET &uchset = *word->uch_set;
104	0	UNICHAR_ID space = uchset.unichar_to_id(" ");
105
106	0	if ((word->unlv_crunch_mode != CR_NONE \|\| word->best_choice->empty()) &&
107	0	!tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
108	0	bool need_reject = false;
109	0	if ((word->unlv_crunch_mode != CR_DELETE) &&
110	0	(!stats_.tilde_crunch_written \|\|
111	0	((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
112	0	!word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
113	0	if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
114	0	!word->word->flag(W_FUZZY_SP)) {
115	0	stats_.last_char_was_tilde = false;
116	0	}
117	0	need_reject = true;
118	0	}
119	0	if ((need_reject && !stats_.last_char_was_tilde) \|\|
120	0	(force_eol && stats_.write_results_empty_block)) {
121		/* Write a reject char - mark as rejected unless zero_rejection mode */
122	0	stats_.last_char_was_tilde = true;
123	0	stats_.tilde_crunch_written = true;
124	0	stats_.last_char_was_newline = false;
125	0	stats_.write_results_empty_block = false;
126	0	}
127
128	0	if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) \|\| force_eol) {
129	0	stats_.tilde_crunch_written = false;
130	0	stats_.last_char_was_newline = true;
131	0	stats_.last_char_was_tilde = false;
132	0	}
133
134	0	if (force_eol) {
135	0	stats_.write_results_empty_block = true;
136	0	}
137	0	return;
138	0	}
139
140		/* NORMAL PROCESSING of non tilde crunched words */
141
142	0	stats_.tilde_crunch_written = false;
143	0	if (newline_type) {
144	0	stats_.last_char_was_newline = true;
145	0	} else {
146	0	stats_.last_char_was_newline = false;
147	0	}
148	0	stats_.write_results_empty_block = force_eol; // about to write a real word
149
150	0	if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
151	0	!(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
152	0	(word->best_choice->unichar_id(0) == space)) {
153		/* Prevent adjacent tilde across words - we know that adjacent tildes within
154		words have been removed */
155	0	word->MergeAdjacentBlobs(0);
156	0	}
157	0	if (newline_type \|\| (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
158	0	stats_.last_char_was_tilde = false;
159	0	} else {
160	0	if (word->reject_map.length() > 0) {
161	0	if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {
162	0	stats_.last_char_was_tilde = true;
163	0	} else {
164	0	stats_.last_char_was_tilde = false;
165	0	}
166	0	} else if (word->word->space() > 0) {
167	0	stats_.last_char_was_tilde = false;
168	0	}
169		/* else it is unchanged as there are no output chars */
170	0	}
171
172	0	ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
173
174	0	set_unlv_suspects(word);
175	0	check_debug_pt(word, 120);
176	0	if (tessedit_rejection_debug) {
177	0	tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
178	0	dict_word(*(word->best_choice)));
179	0	}
180	0	if (!word->word->flag(W_REP_CHAR) \|\| !tessedit_write_rep_codes) {
181	0	if (tessedit_zero_rejection) {
182		/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
183	0	for (unsigned i = 0; i < word->best_choice->length(); ++i) {
184	0	if (word->reject_map[i].rejected()) {
185	0	word->reject_map[i].setrej_minimal_rej_accept();
186	0	}
187	0	}
188	0	}
189	0	if (tessedit_minimal_rejection) {
190		/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
191	0	for (unsigned i = 0; i < word->best_choice->length(); ++i) {
192	0	if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
193	0	word->reject_map[i].setrej_minimal_rej_accept();
194	0	}
195	0	}
196	0	}
197	0	}
198	0	}
199
200		/**********************************************************************
201		* determine_newline_type
202		*
203		* Find whether we have a wrapping or hard newline.
204		* Return false if not at end of line.
205		**********************************************************************/
206
207		char determine_newline_type( // test line ends
208		WERD *word, // word to do
209		BLOCK *block, // current block
210		WERD *next_word, // next word
211		BLOCK *next_block // block of next word
212	0	) {
213	0	int16_t end_gap; // to right edge
214	0	int16_t width; // of next word
215	0	TBOX word_box; // bounding
216	0	TBOX next_box; // next word
217	0	TBOX block_box; // block bounding
218
219	0	if (!word->flag(W_EOL)) {
220	0	return false; // not end of line
221	0	}
222	0	if (next_word == nullptr \|\| next_block == nullptr \|\| block != next_block) {
223	0	return CTRL_NEWLINE;
224	0	}
225	0	if (next_word->space() > 0) {
226	0	return CTRL_HARDLINE; // it is tabbed
227	0	}
228	0	word_box = word->bounding_box();
229	0	next_box = next_word->bounding_box();
230	0	block_box = block->pdblk.bounding_box();
231		// gap to eol
232	0	end_gap = block_box.right() - word_box.right();
233	0	end_gap -= static_cast<int32_t>(block->space());
234	0	width = next_box.right() - next_box.left();
235		// tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
236		// block_box.right(),word_box.right(),end_gap,
237		// next_box.right(),next_box.left(),width,
238		// end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
239	0	return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
240	0	}
241
242		/*************************************************************************
243		* get_rep_char()
244		* Return the first accepted character from the repetition string. This is the
245		* character which is repeated - as determined earlier by fix_rep_char()
246		*************************************************************************/
247	0	UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
248	0	int i;
249	0	for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i) {
250	0	;
251	0	}
252
253	0	if (i < word->reject_map.length()) {
254	0	return word->best_choice->unichar_id(i);
255	0	} else {
256	0	return word->uch_set->unichar_to_id(unrecognised_char.c_str());
257	0	}
258	0	}
259
260		/*************************************************************************
261		* SUSPECT LEVELS
262		*
263		* 0 - don't reject ANYTHING
264		* 1,2 - partial rejection
265		* 3 - BEST
266		*
267		* NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
268		* tessedit_minimal_rejection.
269		*************************************************************************/
270	0	void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
271	0	int len = word_res->reject_map.length();
272	0	const WERD_CHOICE &word = *(word_res->best_choice);
273	0	const UNICHARSET &uchset = *word.unicharset();
274	0	int i;
275	0	float rating_per_ch;
276
277	0	if (suspect_level == 0) {
278	0	for (i = 0; i < len; i++) {
279	0	if (word_res->reject_map[i].rejected()) {
280	0	word_res->reject_map[i].setrej_minimal_rej_accept();
281	0	}
282	0	}
283	0	return;
284	0	}
285
286	0	if (suspect_level >= 3) {
287	0	return; // Use defaults
288	0	}
289
290		/* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
291
292	0	if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
293		/* Unreject alphas in dictionary words */
294	0	for (i = 0; i < len; ++i) {
295	0	if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
296	0	word_res->reject_map[i].setrej_minimal_rej_accept();
297	0	}
298	0	}
299	0	}
300
301	0	rating_per_ch = word.rating() / word_res->reject_map.length();
302
303	0	if (rating_per_ch >= suspect_rating_per_ch) {
304	0	return; // Don't touch bad ratings
305	0	}
306
307	0	if ((word_res->tess_accepted) \|\| (rating_per_ch < suspect_accept_rating)) {
308		/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
309	0	for (i = 0; i < len; ++i) {
310	0	if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
311	0	word_res->reject_map[i].setrej_minimal_rej_accept();
312	0	}
313	0	}
314	0	}
315
316	0	for (i = 0; i < len; i++) {
317	0	if (word_res->reject_map[i].rejected()) {
318	0	if (word_res->reject_map[i].flag(R_DOC_REJ)) {
319	0	word_res->reject_map[i].setrej_minimal_rej_accept();
320	0	}
321	0	if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {
322	0	word_res->reject_map[i].setrej_minimal_rej_accept();
323	0	}
324	0	if (word_res->reject_map[i].flag(R_ROW_REJ)) {
325	0	word_res->reject_map[i].setrej_minimal_rej_accept();
326	0	}
327	0	}
328	0	}
329
330	0	if (suspect_level == 2) {
331	0	return;
332	0	}
333
334	0	if (!suspect_constrain_1Il \|\| (word_res->reject_map.length() <= suspect_short_words)) {
335	0	for (i = 0; i < len; i++) {
336	0	if (word_res->reject_map[i].rejected()) {
337	0	if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) \|\|
338	0	word_res->reject_map[i].flag(R_POSTNN_1IL))) {
339	0	word_res->reject_map[i].setrej_minimal_rej_accept();
340	0	}
341
342	0	if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {
343	0	word_res->reject_map[i].setrej_minimal_rej_accept();
344	0	}
345	0	}
346	0	}
347	0	}
348
349	0	if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
350	0	word.unichar_lengths().c_str()) != AC_UNACCEPTABLE \|\|
351	0	acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
352	0	if (word_res->reject_map.length() > suspect_short_words) {
353	0	for (i = 0; i < len; i++) {
354	0	if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() \|\|
355	0	word_res->reject_map[i].flag(R_1IL_CONFLICT) \|\|
356	0	word_res->reject_map[i].flag(R_POSTNN_1IL) \|\|
357	0	word_res->reject_map[i].flag(R_MM_REJECT))) {
358	0	word_res->reject_map[i].setrej_minimal_rej_accept();
359	0	}
360	0	}
361	0	}
362	0	}
363	0	}
364
365	0	int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
366	0	int count = 0;
367	0	for (unsigned i = 0; i < word.length(); ++i) {
368	0	if (word.unicharset()->get_isalpha(word.unichar_id(i))) {
369	0	count++;
370	0	}
371	0	}
372	0	return count;
373	0	}
374
375	0	int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
376	0	int count = 0;
377	0	for (unsigned i = 0; i < word.length(); ++i) {
378	0	if (word.unicharset()->get_isalpha(word.unichar_id(i)) \|\|
379	0	word.unicharset()->get_isdigit(word.unichar_id(i))) {
380	0	count++;
381	0	}
382	0	}
383	0	return count;
384	0	}
385
386	0	bool Tesseract::acceptable_number_string(const char *s, const char *lengths) {
387	0	bool prev_digit = false;
388
389	0	if (*lengths == 1 && *s == '(') {
390	0	s++;
391	0	}
392
393	0	if (*lengths == 1 && ((*s == '$') \|\| (*s == '.') \|\| (*s == '+') \|\| (*s == '-'))) {
394	0	s++;
395	0	}
396
397	0	for (; *s != '\0'; s += *(lengths++)) {
398	0	if (unicharset.get_isdigit(s, *lengths)) {
399	0	prev_digit = true;
400	0	} else if (prev_digit && (*lengths == 1 && ((*s == '.') \|\| (*s == ',') \|\| (*s == '-')))) {
401	0	prev_digit = false;
402	0	} else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
403	0	((*s == '%') \|\| (*s == ')'))) {
404	0	return true;
405	0	} else if (prev_digit && *lengths == 1 && (*s == '%') &&
406	0	(*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
407	0	(*(s + *lengths + *(lengths + 1)) == '\0')) {
408	0	return true;
409	0	} else {
410	0	return false;
411	0	}
412	0	}
413	0	return true;
414	0	}
415		} // namespace tesseract