/src/tesseract/src/dict/permdawg.cpp

Source (jump to first uncovered line)
/******************************************************************************
 *
 * File:         permdawg.cpp  (Formerly permdawg.c)
 * Description:  Scale word choices by a dictionary
 * Author:       Mark Seaman, OCR Technology
 *
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *****************************************************************************/
/*----------------------------------------------------------------------
              I n c l u d e s
----------------------------------------------------------------------*/

#include "dawg.h"
#include "params.h"
#include "stopper.h"
#include "tprintf.h"

#include <algorithm>
#include <cctype>
#include "dict.h"

/*----------------------------------------------------------------------
              F u n c t i o n s
----------------------------------------------------------------------*/
namespace tesseract {

/**
 * @name go_deeper_dawg_fxn
 *
 * If the choice being composed so far could be a dictionary word
 * keep exploring choices.
 */
void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
                              int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                              bool word_ending, WERD_CHOICE *word, float certainties[],
                              float *limit, WERD_CHOICE *best_choice, int *attempts_left,
                              void *void_more_args) {
  auto *more_args = static_cast<DawgArgs *>(void_more_args);
  word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
  int word_index = word->length() - 1;
  if (best_choice->rating() < *limit) {
    return;
  }
  // Look up char in DAWG

  // If the current unichar is an ngram first try calling
  // letter_is_okay() for each unigram it contains separately.
  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
  bool checked_unigrams = false;
  if (getUnicharset().get_isngram(orig_uch_id)) {
    if (dawg_debug_level) {
      tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
    }
    int num_unigrams = 0;
    word->remove_last_unichar_id();
    std::vector<UNICHAR_ID> encoding;
    const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
    // Since the string came out of the unicharset, failure is impossible.
    ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
    bool unigrams_ok = true;
    // Construct DawgArgs that reflect the current state.
    DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
    DawgPositionVector unigram_updated_dawgs;
    DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
    // Check unigrams in the ngram with letter_is_okay().
    for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
      UNICHAR_ID uch_id = encoding[i];
      ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
      ++num_unigrams;
      word->append_unichar_id(uch_id, 1, 0.0, 0.0);
      unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),
                                             word->unichar_id(word_index + num_unigrams - 1),
                                             word_ending && i == encoding.size() - 1);
      (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
      if (dawg_debug_level) {
        tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
                unigrams_ok ? "OK" : "not OK");
      }
    }
    // Restore the word and copy the updated dawg state if needed.
    while (num_unigrams-- > 0) {
      word->remove_last_unichar_id();
    }
    word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
    if (unigrams_ok) {
      checked_unigrams = true;
      more_args->permuter = unigram_dawg_args.permuter;
      *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
    }
  }

  // Check which dawgs from the dawgs_ vector contain the word
  // up to and including the current unichar.
  if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),
                                                   word->unichar_id(word_index), word_ending)) {
    // Add a new word choice
    if (word_ending) {
      if (dawg_debug_level) {
        tprintf("found word = %s\n", word->debug_string().c_str());
      }
      if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
        if (output_ambig_words_file_ == nullptr) {
          output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
          if (output_ambig_words_file_ == nullptr) {
            tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
            exit(1);
          }
          std::string word_str;
          word->string_and_lengths(&word_str, nullptr);
          word_str += " ";
          fprintf(output_ambig_words_file_, "%s", word_str.c_str());
        }
        std::string word_str;
        word->string_and_lengths(&word_str, nullptr);
        word_str += " ";
        fprintf(output_ambig_words_file_, "%s", word_str.c_str());
      }
      WERD_CHOICE *adjusted_word = word;
      adjusted_word->set_permuter(more_args->permuter);
      update_best_choice(*adjusted_word, best_choice);
    } else { // search the next letter
      // Make updated_* point to the next entries in the DawgPositionVector
      // arrays (that were originally created in dawg_permute_and_select)
      ++(more_args->updated_dawgs);
      // Make active_dawgs and constraints point to the updated ones.
      ++(more_args->active_dawgs);
      permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
                      certainties, limit, best_choice, attempts_left, more_args);
      // Restore previous state to explore another letter in this position.
      --(more_args->updated_dawgs);
      --(more_args->active_dawgs);
    }
  } else {
    if (dawg_debug_level) {
      tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
    }
  }
}

/**
 * dawg_permute_and_select
 *
 * Recursively explore all the possible character combinations in
 * the given char_choices. Use go_deeper_dawg_fxn() to search all the
 * dawgs in the dawgs_ vector in parallel and discard invalid words.
 *
 * Allocate and return a WERD_CHOICE with the best valid word found.
 */
WERD_CHOICE *Dict::dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                                           float rating_limit) {
  auto *best_choice = new WERD_CHOICE(&getUnicharset());
  best_choice->make_bad();
  best_choice->set_rating(rating_limit);
  if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {
    return best_choice;
  }
  auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];
  init_active_dawgs(&(active_dawgs[0]), true);
  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);

  float certainties[MAX_WERD_LENGTH];
  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
  int attempts_left = max_permuter_attempts;
  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
                  &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
  delete[] active_dawgs;
  return best_choice;
}

/**
 * permute_choices
 *
 * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
 * with the given char_choice_index in char_choices.
 */
void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
                           int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                           WERD_CHOICE *word, float certainties[], float *limit,
                           WERD_CHOICE *best_choice, int *attempts_left, void *more_args) {
  if (debug) {
    tprintf(
        "%s permute_choices: char_choice_index=%d"
        " limit=%g rating=%g, certainty=%g word=%s\n",
        debug, char_choice_index, *limit, word->rating(), word->certainty(),
        word->debug_string().c_str());
  }
  if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
    BLOB_CHOICE_IT blob_choice_it;
    blob_choice_it.set_to_list(char_choices.at(char_choice_index));
    for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
      (*attempts_left)--;
      append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,
                     prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
                     more_args);
      if (*attempts_left <= 0) {
        if (debug) {
          tprintf("permute_choices(): attempts_left is 0\n");
        }
        break;
      }
    }
  }
}

/**
 * append_choices
 *
 * Checks to see whether or not the next choice is worth appending to
 * the word being generated. If so then keeps going deeper into the word.
 *
 * This function assumes that Dict::go_deeper_fxn_ is set.
 */
void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
                          const BLOB_CHOICE &blob_choice, int char_choice_index,
                          const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
                          float certainties[], float *limit, WERD_CHOICE *best_choice,
                          int *attempts_left, void *more_args) {
  auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);

  // Deal with fragments.
  CHAR_FRAGMENT_INFO char_frag_info;
  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
                           prev_char_frag_info, debug, word_ending, &char_frag_info)) {
    return; // blob_choice must be an invalid fragment
  }
  // Search the next letter if this character is a fragment.
  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
    permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,
                    limit, best_choice, attempts_left, more_args);
    return;
  }

  // Add the next unichar.
  float old_rating = word->rating();
  float old_certainty = word->certainty();
  uint8_t old_permuter = word->permuter();
  certainties[word->length()] = char_frag_info.certainty;
  word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,
                                          char_frag_info.rating, char_frag_info.certainty);

  // Explore the next unichar.
  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,
                          word, certainties, limit, best_choice, attempts_left, more_args);

  // Remove the unichar we added to explore other choices in it's place.
  word->remove_last_unichar_id();
  word->set_rating(old_rating);
  word->set_certainty(old_certainty);
  word->set_permuter(old_permuter);
}

/**
 * @name fragment_state
 *
 * Given the current char choice and information about previously seen
 * fragments, determines whether adjacent character fragments are
 * present and whether they can be concatenated.
 *
 * The given prev_char_frag_info contains:
 * - fragment: if not nullptr contains information about immediately
 *   preceding fragmented character choice
 * - num_fragments: number of fragments that have been used so far
 *   to construct a character
 * - certainty: certainty of the current choice or minimum
 *   certainty of all fragments concatenated so far
 * - rating: rating of the current choice or sum of fragment
 *   ratings concatenated so far
 *
 * The output char_frag_info is filled in as follows:
 * - character: is set to be nullptr if the choice is a non-matching
 *   or non-ending fragment piece; is set to unichar of the given choice
 *   if it represents a regular character or a matching ending fragment
 * - fragment,num_fragments,certainty,rating are set as described above
 *
 * @returns false if a non-matching fragment is discovered, true otherwise.
 */
bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
                               const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
                               int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) {
  const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
  const CHAR_FRAGMENT *prev_fragment =
      prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;

  // Print debug info for fragments.
  if (debug && (prev_fragment || this_fragment)) {
    tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
            getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
    if (prev_fragment) {
      tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
    }
    if (this_fragment) {
      tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
    }
  }

  char_frag_info->unichar_id = curr_unichar_id;
  char_frag_info->fragment = this_fragment;
  char_frag_info->rating = curr_rating;
  char_frag_info->certainty = curr_certainty;
  char_frag_info->num_fragments = 1;
  if (prev_fragment && !this_fragment) {
    if (debug) {
      tprintf("Skip choice with incomplete fragment\n");
    }
    return false;
  }
  if (this_fragment) {
    // We are dealing with a fragment.
    char_frag_info->unichar_id = INVALID_UNICHAR_ID;
    if (prev_fragment) {
      if (!this_fragment->is_continuation_of(prev_fragment)) {
        if (debug) {
          tprintf("Non-matching fragment piece\n");
        }
        return false;
      }
      if (this_fragment->is_ending()) {
        char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
        char_frag_info->fragment = nullptr;
        if (debug) {
          tprintf("Built character %s from fragments\n",
                  getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
        }
      } else {
        if (debug) {
          tprintf("Record fragment continuation\n");
        }
        char_frag_info->fragment = this_fragment;
      }
      // Update certainty and rating.
      char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
      char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
      char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
    } else {
      if (this_fragment->is_beginning()) {
        if (debug) {
          tprintf("Record fragment beginning\n");
        }
      } else {
        if (debug) {
          tprintf("Non-starting fragment piece with no prev_fragment\n");
        }
        return false;
      }
    }
  }
  if (word_ending && char_frag_info->fragment) {
    if (debug) {
      tprintf("Word cannot end with a fragment\n");
    }
    return false;
  }
  return true;
}

} // namespace tesseract

Coverage Report

Created: 2025-07-23 07:12

Line	Count	Source (jump to first uncovered line)
1		/******************************************************************************
2		*
3		* File: permdawg.cpp (Formerly permdawg.c)
4		* Description: Scale word choices by a dictionary
5		* Author: Mark Seaman, OCR Technology
6		*
7		* (c) Copyright 1987, Hewlett-Packard Company.
8		** Licensed under the Apache License, Version 2.0 (the "License");
9		** you may not use this file except in compliance with the License.
10		** You may obtain a copy of the License at
11		** http://www.apache.org/licenses/LICENSE-2.0
12		** Unless required by applicable law or agreed to in writing, software
13		** distributed under the License is distributed on an "AS IS" BASIS,
14		** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15		** See the License for the specific language governing permissions and
16		** limitations under the License.
17		*
18		*****************************************************************************/
19		/*----------------------------------------------------------------------
20		I n c l u d e s
21		----------------------------------------------------------------------*/
22
23		#include "dawg.h"
24		#include "params.h"
25		#include "stopper.h"
26		#include "tprintf.h"
27
28		#include <algorithm>
29		#include <cctype>
30		#include "dict.h"
31
32		/*----------------------------------------------------------------------
33		F u n c t i o n s
34		----------------------------------------------------------------------*/
35		namespace tesseract {
36
37		/**
38		* @name go_deeper_dawg_fxn
39		*
40		* If the choice being composed so far could be a dictionary word
41		* keep exploring choices.
42		*/
43		void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
44		int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
45		bool word_ending, WERD_CHOICE *word, float certainties[],
46		float limit, WERD_CHOICE best_choice, int *attempts_left,
47	617k	void *void_more_args) {
48	617k	auto more_args = static_cast<DawgArgs >(void_more_args);
49	617k	word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
50	617k	int word_index = word->length() - 1;
51	617k	if (best_choice->rating() < *limit) {
52	0	return;
53	0	}
54		// Look up char in DAWG
55
56		// If the current unichar is an ngram first try calling
57		// letter_is_okay() for each unigram it contains separately.
58	617k	UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
59	617k	bool checked_unigrams = false;
60	617k	if (getUnicharset().get_isngram(orig_uch_id)) {
61	132k	if (dawg_debug_level) {
62	0	tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
63	0	}
64	132k	int num_unigrams = 0;
65	132k	word->remove_last_unichar_id();
66	132k	std::vector<UNICHAR_ID> encoding;
67	132k	const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68		// Since the string came out of the unicharset, failure is impossible.
69	132k	ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
70	132k	bool unigrams_ok = true;
71		// Construct DawgArgs that reflect the current state.
72	132k	DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
73	132k	DawgPositionVector unigram_updated_dawgs;
74	132k	DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
75		// Check unigrams in the ngram with letter_is_okay().
76	264k	for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
77	132k	UNICHAR_ID uch_id = encoding[i];
78	132k	ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
79	132k	++num_unigrams;
80	132k	word->append_unichar_id(uch_id, 1, 0.0, 0.0);
81	132k	unigrams_ok = (this->letter_is_okay_)(&unigram_dawg_args, word->unicharset(),
82	132k	word->unichar_id(word_index + num_unigrams - 1),
83	132k	word_ending && i == encoding.size() - 1);
84	132k	(unigram_dawg_args.active_dawgs) = (unigram_dawg_args.updated_dawgs);
85	132k	if (dawg_debug_level) {
86	0	tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
87	0	unigrams_ok ? "OK" : "not OK");
88	0	}
89	132k	}
90		// Restore the word and copy the updated dawg state if needed.
91	264k	while (num_unigrams-- > 0) {
92	132k	word->remove_last_unichar_id();
93	132k	}
94	132k	word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
95	132k	if (unigrams_ok) {
96	0	checked_unigrams = true;
97	0	more_args->permuter = unigram_dawg_args.permuter;
98	0	(more_args->updated_dawgs) = (unigram_dawg_args.updated_dawgs);
99	0	}
100	132k	}
101
102		// Check which dawgs from the dawgs_ vector contain the word
103		// up to and including the current unichar.
104	617k	if (checked_unigrams \|\| (this->letter_is_okay_)(more_args, word->unicharset(),
105	617k	word->unichar_id(word_index), word_ending)) {
106		// Add a new word choice
107	0	if (word_ending) {
108	0	if (dawg_debug_level) {
109	0	tprintf("found word = %s\n", word->debug_string().c_str());
110	0	}
111	0	if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
112	0	if (output_ambig_words_file_ == nullptr) {
113	0	output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
114	0	if (output_ambig_words_file_ == nullptr) {
115	0	tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
116	0	exit(1);
117	0	}
118	0	std::string word_str;
119	0	word->string_and_lengths(&word_str, nullptr);
120	0	word_str += " ";
121	0	fprintf(output_ambig_words_file_, "%s", word_str.c_str());
122	0	}
123	0	std::string word_str;
124	0	word->string_and_lengths(&word_str, nullptr);
125	0	word_str += " ";
126	0	fprintf(output_ambig_words_file_, "%s", word_str.c_str());
127	0	}
128	0	WERD_CHOICE *adjusted_word = word;
129	0	adjusted_word->set_permuter(more_args->permuter);
130	0	update_best_choice(*adjusted_word, best_choice);
131	0	} else { // search the next letter
132		// Make updated_* point to the next entries in the DawgPositionVector
133		// arrays (that were originally created in dawg_permute_and_select)
134	0	++(more_args->updated_dawgs);
135		// Make active_dawgs and constraints point to the updated ones.
136	0	++(more_args->active_dawgs);
137	0	permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
138	0	certainties, limit, best_choice, attempts_left, more_args);
139		// Restore previous state to explore another letter in this position.
140	0	--(more_args->updated_dawgs);
141	0	--(more_args->active_dawgs);
142	0	}
143	617k	} else {
144	617k	if (dawg_debug_level) {
145	0	tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
146	0	}
147	617k	}
148	617k	}
149
150		/**
151		* dawg_permute_and_select
152		*
153		* Recursively explore all the possible character combinations in
154		* the given char_choices. Use go_deeper_dawg_fxn() to search all the
155		* dawgs in the dawgs_ vector in parallel and discard invalid words.
156		*
157		* Allocate and return a WERD_CHOICE with the best valid word found.
158		*/
159		WERD_CHOICE *Dict::dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
160	414k	float rating_limit) {
161	414k	auto *best_choice = new WERD_CHOICE(&getUnicharset());
162	414k	best_choice->make_bad();
163	414k	best_choice->set_rating(rating_limit);
164	414k	if (char_choices.empty() \|\| char_choices.size() > MAX_WERD_LENGTH) {
165	0	return best_choice;
166	0	}
167	414k	auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];
168	414k	init_active_dawgs(&(active_dawgs[0]), true);
169	414k	DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
170	414k	WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
171
172	414k	float certainties[MAX_WERD_LENGTH];
173	414k	this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
174	414k	int attempts_left = max_permuter_attempts;
175	414k	permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
176	414k	&word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
177	414k	delete[] active_dawgs;
178	414k	return best_choice;
179	414k	}
180
181		/**
182		* permute_choices
183		*
184		* Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
185		* with the given char_choice_index in char_choices.
186		*/
187		void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
188		int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
189		WERD_CHOICE word, float certainties[], float limit,
190	432k	WERD_CHOICE best_choice, int attempts_left, void *more_args) {
191	432k	if (debug) {
192	0	tprintf(
193	0	"%s permute_choices: char_choice_index=%d"
194	0	" limit=%g rating=%g, certainty=%g word=%s\n",
195	0	debug, char_choice_index, *limit, word->rating(), word->certainty(),
196	0	word->debug_string().c_str());
197	0	}
198	432k	if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
199	432k	BLOB_CHOICE_IT blob_choice_it;
200	432k	blob_choice_it.set_to_list(char_choices.at(char_choice_index));
201	1.12M	for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
202	693k	(*attempts_left)--;
203	693k	append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,
204	693k	prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
205	693k	more_args);
206	693k	if (*attempts_left <= 0) {
207	0	if (debug) {
208	0	tprintf("permute_choices(): attempts_left is 0\n");
209	0	}
210	0	break;
211	0	}
212	693k	}
213	432k	}
214	432k	}
215
216		/**
217		* append_choices
218		*
219		* Checks to see whether or not the next choice is worth appending to
220		* the word being generated. If so then keeps going deeper into the word.
221		*
222		* This function assumes that Dict::go_deeper_fxn_ is set.
223		*/
224		void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
225		const BLOB_CHOICE &blob_choice, int char_choice_index,
226		const CHAR_FRAGMENT_INFO prev_char_frag_info, WERD_CHOICE word,
227		float certainties[], float limit, WERD_CHOICE best_choice,
228	693k	int attempts_left, void more_args) {
229	693k	auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
230
231		// Deal with fragments.
232	693k	CHAR_FRAGMENT_INFO char_frag_info;
233	693k	if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
234	693k	prev_char_frag_info, debug, word_ending, &char_frag_info)) {
235	57.3k	return; // blob_choice must be an invalid fragment
236	57.3k	}
237		// Search the next letter if this character is a fragment.
238	636k	if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
239	18.5k	permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,
240	18.5k	limit, best_choice, attempts_left, more_args);
241	18.5k	return;
242	18.5k	}
243
244		// Add the next unichar.
245	617k	float old_rating = word->rating();
246	617k	float old_certainty = word->certainty();
247	617k	uint8_t old_permuter = word->permuter();
248	617k	certainties[word->length()] = char_frag_info.certainty;
249	617k	word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,
250	617k	char_frag_info.rating, char_frag_info.certainty);
251
252		// Explore the next unichar.
253	617k	(this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,
254	617k	word, certainties, limit, best_choice, attempts_left, more_args);
255
256		// Remove the unichar we added to explore other choices in it's place.
257	617k	word->remove_last_unichar_id();
258	617k	word->set_rating(old_rating);
259	617k	word->set_certainty(old_certainty);
260	617k	word->set_permuter(old_permuter);
261	617k	}
262
263		/**
264		* @name fragment_state
265		*
266		* Given the current char choice and information about previously seen
267		* fragments, determines whether adjacent character fragments are
268		* present and whether they can be concatenated.
269		*
270		* The given prev_char_frag_info contains:
271		* - fragment: if not nullptr contains information about immediately
272		* preceding fragmented character choice
273		* - num_fragments: number of fragments that have been used so far
274		* to construct a character
275		* - certainty: certainty of the current choice or minimum
276		* certainty of all fragments concatenated so far
277		* - rating: rating of the current choice or sum of fragment
278		* ratings concatenated so far
279		*
280		* The output char_frag_info is filled in as follows:
281		* - character: is set to be nullptr if the choice is a non-matching
282		* or non-ending fragment piece; is set to unichar of the given choice
283		* if it represents a regular character or a matching ending fragment
284		* - fragment,num_fragments,certainty,rating are set as described above
285		*
286		* @returns false if a non-matching fragment is discovered, true otherwise.
287		*/
288		bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
289		const CHAR_FRAGMENT_INFO prev_char_frag_info, const char debug,
290	693k	int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) {
291	693k	const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
292	693k	const CHAR_FRAGMENT *prev_fragment =
293	693k	prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
294
295		// Print debug info for fragments.
296	693k	if (debug && (prev_fragment \|\| this_fragment)) {
297	0	tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
298	0	getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
299	0	if (prev_fragment) {
300	0	tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
301	0	}
302	0	if (this_fragment) {
303	0	tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
304	0	}
305	0	}
306
307	693k	char_frag_info->unichar_id = curr_unichar_id;
308	693k	char_frag_info->fragment = this_fragment;
309	693k	char_frag_info->rating = curr_rating;
310	693k	char_frag_info->certainty = curr_certainty;
311	693k	char_frag_info->num_fragments = 1;
312	693k	if (prev_fragment && !this_fragment) {
313	23.9k	if (debug) {
314	0	tprintf("Skip choice with incomplete fragment\n");
315	0	}
316	23.9k	return false;
317	23.9k	}
318	669k	if (this_fragment) {
319		// We are dealing with a fragment.
320	69.1k	char_frag_info->unichar_id = INVALID_UNICHAR_ID;
321	69.1k	if (prev_fragment) {
322	52.0k	if (!this_fragment->is_continuation_of(prev_fragment)) {
323	33.4k	if (debug) {
324	0	tprintf("Non-matching fragment piece\n");
325	0	}
326	33.4k	return false;
327	33.4k	}
328	18.5k	if (this_fragment->is_ending()) {
329	17.1k	char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
330	17.1k	char_frag_info->fragment = nullptr;
331	17.1k	if (debug) {
332	0	tprintf("Built character %s from fragments\n",
333	0	getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
334	0	}
335	17.1k	} else {
336	1.46k	if (debug) {
337	0	tprintf("Record fragment continuation\n");
338	0	}
339	1.46k	char_frag_info->fragment = this_fragment;
340	1.46k	}
341		// Update certainty and rating.
342	18.5k	char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
343	18.5k	char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
344	18.5k	char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
345	18.5k	} else {
346	17.1k	if (this_fragment->is_beginning()) {
347	17.1k	if (debug) {
348	0	tprintf("Record fragment beginning\n");
349	0	}
350	17.1k	} else {
351	0	if (debug) {
352	0	tprintf("Non-starting fragment piece with no prev_fragment\n");
353	0	}
354	0	return false;
355	0	}
356	17.1k	}
357	69.1k	}
358	636k	if (word_ending && char_frag_info->fragment) {
359	0	if (debug) {
360	0	tprintf("Word cannot end with a fragment\n");
361	0	}
362	0	return false;
363	0	}
364	636k	return true;
365	636k	}
366
367		} // namespace tesseract