Coverage Report

Created: 2025-07-23 07:12

/src/tesseract/src/dict/permdawg.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * File:         permdawg.cpp  (Formerly permdawg.c)
4
 * Description:  Scale word choices by a dictionary
5
 * Author:       Mark Seaman, OCR Technology
6
 *
7
 * (c) Copyright 1987, Hewlett-Packard Company.
8
 ** Licensed under the Apache License, Version 2.0 (the "License");
9
 ** you may not use this file except in compliance with the License.
10
 ** You may obtain a copy of the License at
11
 ** http://www.apache.org/licenses/LICENSE-2.0
12
 ** Unless required by applicable law or agreed to in writing, software
13
 ** distributed under the License is distributed on an "AS IS" BASIS,
14
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
 ** See the License for the specific language governing permissions and
16
 ** limitations under the License.
17
 *
18
 *****************************************************************************/
19
/*----------------------------------------------------------------------
20
              I n c l u d e s
21
----------------------------------------------------------------------*/
22
23
#include "dawg.h"
24
#include "params.h"
25
#include "stopper.h"
26
#include "tprintf.h"
27
28
#include <algorithm>
29
#include <cctype>
30
#include "dict.h"
31
32
/*----------------------------------------------------------------------
33
              F u n c t i o n s
34
----------------------------------------------------------------------*/
35
namespace tesseract {
36
37
/**
38
 * @name go_deeper_dawg_fxn
39
 *
40
 * If the choice being composed so far could be a dictionary word
41
 * keep exploring choices.
42
 */
43
void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
44
                              int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
45
                              bool word_ending, WERD_CHOICE *word, float certainties[],
46
                              float *limit, WERD_CHOICE *best_choice, int *attempts_left,
47
617k
                              void *void_more_args) {
48
617k
  auto *more_args = static_cast<DawgArgs *>(void_more_args);
49
617k
  word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
50
617k
  int word_index = word->length() - 1;
51
617k
  if (best_choice->rating() < *limit) {
52
0
    return;
53
0
  }
54
  // Look up char in DAWG
55
56
  // If the current unichar is an ngram first try calling
57
  // letter_is_okay() for each unigram it contains separately.
58
617k
  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
59
617k
  bool checked_unigrams = false;
60
617k
  if (getUnicharset().get_isngram(orig_uch_id)) {
61
132k
    if (dawg_debug_level) {
62
0
      tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
63
0
    }
64
132k
    int num_unigrams = 0;
65
132k
    word->remove_last_unichar_id();
66
132k
    std::vector<UNICHAR_ID> encoding;
67
132k
    const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68
    // Since the string came out of the unicharset, failure is impossible.
69
132k
    ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
70
132k
    bool unigrams_ok = true;
71
    // Construct DawgArgs that reflect the current state.
72
132k
    DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
73
132k
    DawgPositionVector unigram_updated_dawgs;
74
132k
    DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
75
    // Check unigrams in the ngram with letter_is_okay().
76
264k
    for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
77
132k
      UNICHAR_ID uch_id = encoding[i];
78
132k
      ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
79
132k
      ++num_unigrams;
80
132k
      word->append_unichar_id(uch_id, 1, 0.0, 0.0);
81
132k
      unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),
82
132k
                                             word->unichar_id(word_index + num_unigrams - 1),
83
132k
                                             word_ending && i == encoding.size() - 1);
84
132k
      (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
85
132k
      if (dawg_debug_level) {
86
0
        tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
87
0
                unigrams_ok ? "OK" : "not OK");
88
0
      }
89
132k
    }
90
    // Restore the word and copy the updated dawg state if needed.
91
264k
    while (num_unigrams-- > 0) {
92
132k
      word->remove_last_unichar_id();
93
132k
    }
94
132k
    word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
95
132k
    if (unigrams_ok) {
96
0
      checked_unigrams = true;
97
0
      more_args->permuter = unigram_dawg_args.permuter;
98
0
      *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
99
0
    }
100
132k
  }
101
102
  // Check which dawgs from the dawgs_ vector contain the word
103
  // up to and including the current unichar.
104
617k
  if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),
105
617k
                                                   word->unichar_id(word_index), word_ending)) {
106
    // Add a new word choice
107
0
    if (word_ending) {
108
0
      if (dawg_debug_level) {
109
0
        tprintf("found word = %s\n", word->debug_string().c_str());
110
0
      }
111
0
      if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
112
0
        if (output_ambig_words_file_ == nullptr) {
113
0
          output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
114
0
          if (output_ambig_words_file_ == nullptr) {
115
0
            tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
116
0
            exit(1);
117
0
          }
118
0
          std::string word_str;
119
0
          word->string_and_lengths(&word_str, nullptr);
120
0
          word_str += " ";
121
0
          fprintf(output_ambig_words_file_, "%s", word_str.c_str());
122
0
        }
123
0
        std::string word_str;
124
0
        word->string_and_lengths(&word_str, nullptr);
125
0
        word_str += " ";
126
0
        fprintf(output_ambig_words_file_, "%s", word_str.c_str());
127
0
      }
128
0
      WERD_CHOICE *adjusted_word = word;
129
0
      adjusted_word->set_permuter(more_args->permuter);
130
0
      update_best_choice(*adjusted_word, best_choice);
131
0
    } else { // search the next letter
132
      // Make updated_* point to the next entries in the DawgPositionVector
133
      // arrays (that were originally created in dawg_permute_and_select)
134
0
      ++(more_args->updated_dawgs);
135
      // Make active_dawgs and constraints point to the updated ones.
136
0
      ++(more_args->active_dawgs);
137
0
      permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
138
0
                      certainties, limit, best_choice, attempts_left, more_args);
139
      // Restore previous state to explore another letter in this position.
140
0
      --(more_args->updated_dawgs);
141
0
      --(more_args->active_dawgs);
142
0
    }
143
617k
  } else {
144
617k
    if (dawg_debug_level) {
145
0
      tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
146
0
    }
147
617k
  }
148
617k
}
149
150
/**
151
 * dawg_permute_and_select
152
 *
153
 * Recursively explore all the possible character combinations in
154
 * the given char_choices. Use go_deeper_dawg_fxn() to search all the
155
 * dawgs in the dawgs_ vector in parallel and discard invalid words.
156
 *
157
 * Allocate and return a WERD_CHOICE with the best valid word found.
158
 */
159
WERD_CHOICE *Dict::dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
160
414k
                                           float rating_limit) {
161
414k
  auto *best_choice = new WERD_CHOICE(&getUnicharset());
162
414k
  best_choice->make_bad();
163
414k
  best_choice->set_rating(rating_limit);
164
414k
  if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {
165
0
    return best_choice;
166
0
  }
167
414k
  auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];
168
414k
  init_active_dawgs(&(active_dawgs[0]), true);
169
414k
  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
170
414k
  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
171
172
414k
  float certainties[MAX_WERD_LENGTH];
173
414k
  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
174
414k
  int attempts_left = max_permuter_attempts;
175
414k
  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
176
414k
                  &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
177
414k
  delete[] active_dawgs;
178
414k
  return best_choice;
179
414k
}
180
181
/**
182
 * permute_choices
183
 *
184
 * Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST
185
 * with the given char_choice_index in char_choices.
186
 */
187
void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
188
                           int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
189
                           WERD_CHOICE *word, float certainties[], float *limit,
190
432k
                           WERD_CHOICE *best_choice, int *attempts_left, void *more_args) {
191
432k
  if (debug) {
192
0
    tprintf(
193
0
        "%s permute_choices: char_choice_index=%d"
194
0
        " limit=%g rating=%g, certainty=%g word=%s\n",
195
0
        debug, char_choice_index, *limit, word->rating(), word->certainty(),
196
0
        word->debug_string().c_str());
197
0
  }
198
432k
  if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
199
432k
    BLOB_CHOICE_IT blob_choice_it;
200
432k
    blob_choice_it.set_to_list(char_choices.at(char_choice_index));
201
1.12M
    for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
202
693k
      (*attempts_left)--;
203
693k
      append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,
204
693k
                     prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
205
693k
                     more_args);
206
693k
      if (*attempts_left <= 0) {
207
0
        if (debug) {
208
0
          tprintf("permute_choices(): attempts_left is 0\n");
209
0
        }
210
0
        break;
211
0
      }
212
693k
    }
213
432k
  }
214
432k
}
215
216
/**
217
 * append_choices
218
 *
219
 * Checks to see whether or not the next choice is worth appending to
220
 * the word being generated. If so then keeps going deeper into the word.
221
 *
222
 * This function assumes that Dict::go_deeper_fxn_ is set.
223
 */
224
void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
225
                          const BLOB_CHOICE &blob_choice, int char_choice_index,
226
                          const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
227
                          float certainties[], float *limit, WERD_CHOICE *best_choice,
228
693k
                          int *attempts_left, void *more_args) {
229
693k
  auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
230
231
  // Deal with fragments.
232
693k
  CHAR_FRAGMENT_INFO char_frag_info;
233
693k
  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
234
693k
                           prev_char_frag_info, debug, word_ending, &char_frag_info)) {
235
57.3k
    return; // blob_choice must be an invalid fragment
236
57.3k
  }
237
  // Search the next letter if this character is a fragment.
238
636k
  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
239
18.5k
    permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,
240
18.5k
                    limit, best_choice, attempts_left, more_args);
241
18.5k
    return;
242
18.5k
  }
243
244
  // Add the next unichar.
245
617k
  float old_rating = word->rating();
246
617k
  float old_certainty = word->certainty();
247
617k
  uint8_t old_permuter = word->permuter();
248
617k
  certainties[word->length()] = char_frag_info.certainty;
249
617k
  word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,
250
617k
                                          char_frag_info.rating, char_frag_info.certainty);
251
252
  // Explore the next unichar.
253
617k
  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,
254
617k
                          word, certainties, limit, best_choice, attempts_left, more_args);
255
256
  // Remove the unichar we added to explore other choices in it's place.
257
617k
  word->remove_last_unichar_id();
258
617k
  word->set_rating(old_rating);
259
617k
  word->set_certainty(old_certainty);
260
617k
  word->set_permuter(old_permuter);
261
617k
}
262
263
/**
264
 * @name fragment_state
265
 *
266
 * Given the current char choice and information about previously seen
267
 * fragments, determines whether adjacent character fragments are
268
 * present and whether they can be concatenated.
269
 *
270
 * The given prev_char_frag_info contains:
271
 * - fragment: if not nullptr contains information about immediately
272
 *   preceding fragmented character choice
273
 * - num_fragments: number of fragments that have been used so far
274
 *   to construct a character
275
 * - certainty: certainty of the current choice or minimum
276
 *   certainty of all fragments concatenated so far
277
 * - rating: rating of the current choice or sum of fragment
278
 *   ratings concatenated so far
279
 *
280
 * The output char_frag_info is filled in as follows:
281
 * - character: is set to be nullptr if the choice is a non-matching
282
 *   or non-ending fragment piece; is set to unichar of the given choice
283
 *   if it represents a regular character or a matching ending fragment
284
 * - fragment,num_fragments,certainty,rating are set as described above
285
 *
286
 * @returns false if a non-matching fragment is discovered, true otherwise.
287
 */
288
bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
289
                               const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
290
693k
                               int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) {
291
693k
  const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
292
693k
  const CHAR_FRAGMENT *prev_fragment =
293
693k
      prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
294
295
  // Print debug info for fragments.
296
693k
  if (debug && (prev_fragment || this_fragment)) {
297
0
    tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
298
0
            getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
299
0
    if (prev_fragment) {
300
0
      tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
301
0
    }
302
0
    if (this_fragment) {
303
0
      tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
304
0
    }
305
0
  }
306
307
693k
  char_frag_info->unichar_id = curr_unichar_id;
308
693k
  char_frag_info->fragment = this_fragment;
309
693k
  char_frag_info->rating = curr_rating;
310
693k
  char_frag_info->certainty = curr_certainty;
311
693k
  char_frag_info->num_fragments = 1;
312
693k
  if (prev_fragment && !this_fragment) {
313
23.9k
    if (debug) {
314
0
      tprintf("Skip choice with incomplete fragment\n");
315
0
    }
316
23.9k
    return false;
317
23.9k
  }
318
669k
  if (this_fragment) {
319
    // We are dealing with a fragment.
320
69.1k
    char_frag_info->unichar_id = INVALID_UNICHAR_ID;
321
69.1k
    if (prev_fragment) {
322
52.0k
      if (!this_fragment->is_continuation_of(prev_fragment)) {
323
33.4k
        if (debug) {
324
0
          tprintf("Non-matching fragment piece\n");
325
0
        }
326
33.4k
        return false;
327
33.4k
      }
328
18.5k
      if (this_fragment->is_ending()) {
329
17.1k
        char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
330
17.1k
        char_frag_info->fragment = nullptr;
331
17.1k
        if (debug) {
332
0
          tprintf("Built character %s from fragments\n",
333
0
                  getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
334
0
        }
335
17.1k
      } else {
336
1.46k
        if (debug) {
337
0
          tprintf("Record fragment continuation\n");
338
0
        }
339
1.46k
        char_frag_info->fragment = this_fragment;
340
1.46k
      }
341
      // Update certainty and rating.
342
18.5k
      char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
343
18.5k
      char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
344
18.5k
      char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
345
18.5k
    } else {
346
17.1k
      if (this_fragment->is_beginning()) {
347
17.1k
        if (debug) {
348
0
          tprintf("Record fragment beginning\n");
349
0
        }
350
17.1k
      } else {
351
0
        if (debug) {
352
0
          tprintf("Non-starting fragment piece with no prev_fragment\n");
353
0
        }
354
0
        return false;
355
0
      }
356
17.1k
    }
357
69.1k
  }
358
636k
  if (word_ending && char_frag_info->fragment) {
359
0
    if (debug) {
360
0
      tprintf("Word cannot end with a fragment\n");
361
0
    }
362
0
    return false;
363
0
  }
364
636k
  return true;
365
636k
}
366
367
} // namespace tesseract