Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/wordrec/language_model.cpp
Line |  Count | Source
   1 |        | ///////////////////////////////////////////////////////////////////////
   2 |        | // File:        language_model.cpp
   3 |        | // Description: Functions that utilize the knowledge about the properties,
   4 |        | //              structure and statistics of the language to help recognition.
   5 |        | // Author:      Daria Antonova
   6 |        | //
   7 |        | // (C) Copyright 2009, Google Inc.
   8 |        | // Licensed under the Apache License, Version 2.0 (the "License");
   9 |        | // you may not use this file except in compliance with the License.
  10 |        | // You may obtain a copy of the License at
  11 |        | // http://www.apache.org/licenses/LICENSE-2.0
  12 |        | // Unless required by applicable law or agreed to in writing, software
  13 |        | // distributed under the License is distributed on an "AS IS" BASIS,
  14 |        | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 |        | // See the License for the specific language governing permissions and
  16 |        | // limitations under the License.
  17 |        | //
  18 |        | ///////////////////////////////////////////////////////////////////////
  19 |        |
  20 |        | #include "language_model.h"
  21 |        | #include <tesseract/unichar.h>       // for UNICHAR_ID, INVALID_UNICHAR_ID
  22 |        | #include <cassert>                   // for assert
  23 |        | #include <cmath>                     // for log2, pow
  24 |        | #include "blamer.h"                  // for BlamerBundle
  25 |        | #include "ccutil.h"                  // for CCUtil
  26 |        | #include "dawg.h"                    // for NO_EDGE, Dawg, Dawg::kPatternUn...
  27 |        | #include "errcode.h"                 // for ASSERT_HOST
  28 |        | #include "lm_state.h"                // for ViterbiStateEntry, ViterbiState...
  29 |        | #include "matrix.h"                  // for MATRIX_COORD
  30 |        | #include "pageres.h"                 // for WERD_RES
  31 |        | #include "params.h"                  // for IntParam, BoolParam, DoubleParam
  32 |        | #include "params_training_featdef.h" // for ParamsTrainingHypothesis, PTRAI...
  33 |        | #include "tprintf.h"                 // for tprintf
  34 |        | #include "unicharset.h"              // for UNICHARSET
  35 |        | #include "unicity_table.h"           // for UnicityTable
  36 |        |
  37 |        | template <typename T>
  38 |        | class UnicityTable;
  39 |        |
  40 |        | namespace tesseract {
  41 |        |
  42 |        | class LMPainPoints;
  43 |        | struct FontInfo;
  44 |        |
  45 |        | #if defined(ANDROID)
  46 |        | static inline double log2(double n) {
  47 |        |   return log(n) / log(2.0);
  48 |        | }
  49 |        | #endif // ANDROID
  50 |        |
  51 |        | const float LanguageModel::kMaxAvgNgramCost = 25.0f;
  52 |        |
  53 |        | LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict)
  54 |      2 |     : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
  55 |        |                  dict->getCCUtil()->params())
  56 |      2 |     , BOOL_INIT_MEMBER(language_model_ngram_on, false,
  57 |        |                        "Turn on/off the use of character ngram model", dict->getCCUtil()->params())
  58 |      2 |     , INT_MEMBER(language_model_ngram_order, 8, "Maximum order of the character ngram model",
  59 |        |                  dict->getCCUtil()->params())
  60 |      2 |     , INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
  61 |        |                  "Maximum number of prunable (those for which"
  62 |        |                  " PrunablePath() is true) entries in each viterbi list"
  63 |        |                  " recorded in BLOB_CHOICEs",
  64 |        |                  dict->getCCUtil()->params())
  65 |      2 |     , INT_MEMBER(language_model_viterbi_list_max_size, 500,
  66 |        |                  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
  67 |        |                  dict->getCCUtil()->params())
  68 |      2 |     , double_MEMBER(language_model_ngram_small_prob, 0.000001,
  69 |        |                     "To avoid overly small denominators use this as the "
  70 |        |                     "floor of the probability returned by the ngram model.",
  71 |        |                     dict->getCCUtil()->params())
  72 |      2 |     , double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
  73 |        |                     "Average classifier score of a non-matching unichar.",
  74 |        |                     dict->getCCUtil()->params())
  75 |      2 |     , BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
  76 |        |                   "Use only the first UTF8 step of the given string"
  77 |        |                   " when computing log probabilities.",
  78 |        |                   dict->getCCUtil()->params())
  79 |      2 |     , double_MEMBER(language_model_ngram_scale_factor, 0.03,
  80 |        |                     "Strength of the character ngram model relative to the"
  81 |        |                     " character classifier ",
  82 |        |                     dict->getCCUtil()->params())
  83 |      2 |     , double_MEMBER(language_model_ngram_rating_factor, 16.0,
  84 |        |                     "Factor to bring log-probs into the same range as ratings"
  85 |        |                     " when multiplied by outline length ",
  86 |        |                     dict->getCCUtil()->params())
  87 |      2 |     , BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
  88 |        |                   "Words are delimited by space", dict->getCCUtil()->params())
  89 |      2 |     , INT_MEMBER(language_model_min_compound_length, 3, "Minimum length of compound words",
  90 |        |                  dict->getCCUtil()->params())
  91 |      2 |     , double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
  92 |        |                     "Penalty for words not in the frequent word dictionary",
  93 |        |                     dict->getCCUtil()->params())
  94 |      2 |     , double_MEMBER(language_model_penalty_non_dict_word, 0.15, "Penalty for non-dictionary words",
  95 |        |                     dict->getCCUtil()->params())
  96 |      2 |     , double_MEMBER(language_model_penalty_punc, 0.2, "Penalty for inconsistent punctuation",
  97 |        |                     dict->getCCUtil()->params())
  98 |      2 |     , double_MEMBER(language_model_penalty_case, 0.1, "Penalty for inconsistent case",
  99 |        |                     dict->getCCUtil()->params())
 100 |      2 |     , double_MEMBER(language_model_penalty_script, 0.5, "Penalty for inconsistent script",
 101 |        |                     dict->getCCUtil()->params())
 102 |      2 |     , double_MEMBER(language_model_penalty_chartype, 0.3, "Penalty for inconsistent character type",
 103 |        |                     dict->getCCUtil()->params())
 104 |        |     ,
 105 |        |     // TODO(daria, rays): enable font consistency checking
 106 |        |     // after improving font analysis.
 107 |      2 |     double_MEMBER(language_model_penalty_font, 0.00, "Penalty for inconsistent font",
 108 |        |                   dict->getCCUtil()->params())
 109 |      2 |     , double_MEMBER(language_model_penalty_spacing, 0.05, "Penalty for inconsistent spacing",
 110 |        |                     dict->getCCUtil()->params())
 111 |      2 |     , double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
 112 |        |                     dict->getCCUtil()->params())
 113 |      2 |     , INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations (ScrollView)",
 114 |        |                  dict->getCCUtil()->params())
 115 |      2 |     , BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
 116 |        |                        "Use sigmoidal score for certainty", dict->getCCUtil()->params())
 117 |      2 |     , dawg_args_(nullptr, new DawgPositionVector(), NO_PERM)
 118 |      2 |     , fontinfo_table_(fontinfo_table)
 119 |      2 |     , dict_(dict) {
 120 |      2 |   ASSERT_HOST(dict_ != nullptr);
 121 |      2 | }
 122 |        |
 123 |      0 | LanguageModel::~LanguageModel() {
 124 |      0 |   delete dawg_args_.updated_dawgs;
 125 |      0 | }
 126 |        |
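Note: the members above are Tesseract Params, registered with the Dict's CCUtil params
list so they can be changed at runtime. A minimal sketch of overriding them from client
code through the public API; the parameter names are the real ones defined above, while
the init arguments are illustrative only:

    #include <tesseract/baseapi.h>

    int main() {
      tesseract::TessBaseAPI api;
      // nullptr datapath: let Tesseract locate tessdata via its usual search.
      if (api.Init(nullptr, "eng") != 0) return 1;
      // Debug tracing in this file is gated on language_model_debug_level.
      api.SetVariable("language_model_debug_level", "2");
      // The character ngram component is off by default (language_model_ngram_on).
      api.SetVariable("language_model_ngram_on", "true");
      api.End();
      return 0;
    }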
 127 |        | void LanguageModel::InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch,
 128 |  98.6k |                                 float max_char_wh_ratio, float rating_cert_scale) {
 129 |  98.6k |   fixed_pitch_ = fixed_pitch;
 130 |  98.6k |   max_char_wh_ratio_ = max_char_wh_ratio;
 131 |  98.6k |   rating_cert_scale_ = rating_cert_scale;
 132 |  98.6k |   acceptable_choice_found_ = false;
 133 |  98.6k |   correct_segmentation_explored_ = false;
 134 |        |
 135 |        |   // Initialize vectors with beginning DawgInfos.
 136 |  98.6k |   very_beginning_active_dawgs_.clear();
 137 |  98.6k |   dict_->init_active_dawgs(&very_beginning_active_dawgs_, false);
 138 |  98.6k |   beginning_active_dawgs_.clear();
 139 |  98.6k |   dict_->default_dawgs(&beginning_active_dawgs_, false);
 140 |        |
 141 |        |   // Fill prev_word_str_ with the last language_model_ngram_order
 142 |        |   // unichars from prev_word.
 143 |  98.6k |   if (language_model_ngram_on) {
 144 |      0 |     if (prev_word != nullptr && !prev_word->unichar_string().empty()) {
 145 |      0 |       prev_word_str_ = prev_word->unichar_string();
 146 |      0 |       if (language_model_ngram_space_delimited_language) {
 147 |      0 |         prev_word_str_ += ' ';
 148 |      0 |       }
 149 |      0 |     } else {
 150 |      0 |       prev_word_str_ = " ";
 151 |      0 |     }
 152 |      0 |     const char *str_ptr = prev_word_str_.c_str();
 153 |      0 |     const char *str_end = str_ptr + prev_word_str_.length();
 154 |      0 |     int step;
 155 |      0 |     prev_word_unichar_step_len_ = 0;
 156 |      0 |     while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
 157 |      0 |       str_ptr += step;
 158 |      0 |       ++prev_word_unichar_step_len_;
 159 |      0 |     }
 160 |      0 |     ASSERT_HOST(str_ptr == str_end);
 161 |      0 |   }
 162 |  98.6k | }
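Note: the loop above counts unichars by stepping through prev_word_str_ one UTF-8
codepoint at a time. A self-contained sketch of the same idiom, with a hand-rolled
decoder standing in for UNICHAR::utf8_step (which likewise returns the byte length of
the next codepoint, or 0 on malformed input):

    #include <cstdio>

    // Byte length of the UTF-8 sequence starting at s (0 if invalid lead byte).
    static int Utf8Step(const char *s) {
      unsigned char c = static_cast<unsigned char>(*s);
      if (c < 0x80) return 1;
      if ((c & 0xE0) == 0xC0) return 2;
      if ((c & 0xF0) == 0xE0) return 3;
      if ((c & 0xF8) == 0xF0) return 4;
      return 0; // continuation or invalid lead byte
    }

    int main() {
      const char *str = "naïve "; // 6 codepoints in 7 bytes
      int unichar_len = 0;
      int step;
      for (const char *p = str; *p != '\0' && (step = Utf8Step(p)) > 0; p += step) {
        ++unichar_len;
      }
      std::printf("%d unichars\n", unichar_len); // prints 6
      return 0;
    }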
 163 |        |
 164 |        | /**
 165 |        |  * Helper scans the collection of predecessors for competing siblings that
 166 |        |  * have the same letter with the opposite case, setting competing_vse.
 167 |        |  */
 168 |  2.49M | static void ScanParentsForCaseMix(const UNICHARSET &unicharset, LanguageModelState *parent_node) {
 169 |  2.49M |   if (parent_node == nullptr) {
 170 |   238k |     return;
 171 |   238k |   }
 172 |  2.25M |   ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
 173 |  15.0M |   for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
 174 |  12.7M |     ViterbiStateEntry *vse = vit.data();
 175 |  12.7M |     vse->competing_vse = nullptr;
 176 |  12.7M |     UNICHAR_ID unichar_id = vse->curr_b->unichar_id();
 177 |  12.7M |     if (unicharset.get_isupper(unichar_id) || unicharset.get_islower(unichar_id)) {
 178 |  7.98M |       UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);
 179 |  7.98M |       if (other_case == unichar_id) {
 180 |   385k |         continue; // Not in unicharset.
 181 |   385k |       }
 182 |        |       // Find other case in same list. There could be multiple entries with
 183 |        |       // the same unichar_id, but in theory, they should all point to the
 184 |        |       // same BLOB_CHOICE, and that is what we will be using to decide
 185 |        |       // which to keep.
 186 |  7.60M |       ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);
 187 |  7.60M |       for (vit2.mark_cycle_pt();
 188 |  82.0M |            !vit2.cycled_list() && vit2.data()->curr_b->unichar_id() != other_case; vit2.forward()) {
 189 |  74.4M |       }
 190 |  7.60M |       if (!vit2.cycled_list()) {
 191 |  1.10M |         vse->competing_vse = vit2.data();
 192 |  1.10M |       }
 193 |  7.60M |     }
 194 |  12.7M |   }
 195 |  2.25M | }
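Note: a toy illustration of the competing-sibling scan, with a plain vector in place of
the ViterbiStateEntry list and OtherCase() standing in for UNICHARSET::get_other_case().
Entries whose letter has an opposite-case twin in the same list get linked to it;
caseless entries (like digits) are skipped:

    #include <cctype>
    #include <cstdio>
    #include <vector>

    struct Entry {
      char ch;
      const Entry *competing = nullptr;
    };

    static char OtherCase(char c) {
      unsigned char u = static_cast<unsigned char>(c);
      if (std::islower(u)) return static_cast<char>(std::toupper(u));
      if (std::isupper(u)) return static_cast<char>(std::tolower(u));
      return c; // no case pair, like "Not in unicharset" above
    }

    int main() {
      std::vector<Entry> parents = {{'S'}, {'s'}, {'5'}};
      for (auto &e : parents) {
        char other = OtherCase(e.ch);
        if (other == e.ch) continue; // no opposite-case variant
        for (const auto &cand : parents) {
          if (cand.ch == other) {
            e.competing = &cand;
            break;
          }
        }
      }
      std::printf("'S' competes with '%c'\n", parents[0].competing->ch); // 's'
      return 0;
    }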
 196 |        |
 197 |        | /**
 198 |        |  * Helper returns true if the given choice has a better case variant before
 199 |        |  * it in the choice_list that is not distinguishable by size.
 200 |        |  */
 201 |        | static bool HasBetterCaseVariant(const UNICHARSET &unicharset, const BLOB_CHOICE *choice,
 202 |  7.60M |                                  BLOB_CHOICE_LIST *choices) {
 203 |  7.60M |   UNICHAR_ID choice_id = choice->unichar_id();
 204 |  7.60M |   UNICHAR_ID other_case = unicharset.get_other_case(choice_id);
 205 |  7.60M |   if (other_case == choice_id || other_case == INVALID_UNICHAR_ID) {
 206 |  4.16M |     return false; // Not upper or lower or not in unicharset.
 207 |  4.16M |   }
 208 |  3.44M |   if (unicharset.SizesDistinct(choice_id, other_case)) {
 209 |  1.91M |     return false; // Can be separated by size.
 210 |  1.91M |   }
 211 |  1.53M |   BLOB_CHOICE_IT bc_it(choices);
 212 |  4.23M |   for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
 213 |  4.23M |     BLOB_CHOICE *better_choice = bc_it.data();
 214 |  4.23M |     if (better_choice->unichar_id() == other_case) {
 215 |   112k |       return true; // Found an earlier instance of other_case.
 216 |  4.12M |     } else if (better_choice == choice) {
 217 |  1.41M |       return false; // Reached the original choice.
 218 |  1.41M |     }
 219 |  4.23M |   }
 220 |      0 |   return false; // Should never happen, but just in case.
 221 |  1.53M | }
 222 |        |
 223 |        | /**
 224 |        |  * UpdateState has the job of combining the ViterbiStateEntry lists on each
 225 |        |  * of the choices on parent_list with each of the blob choices in curr_list,
 226 |        |  * making a new ViterbiStateEntry for each sensible path.
 227 |        |  *
 228 |        |  * This could be a huge set of combinations, creating a lot of work only to
 229 |        |  * be truncated by some beam limit, but only certain kinds of paths will
 230 |        |  * continue at the next step:
 231 |        |  * - paths that are liked by the language model: either a DAWG or the n-gram
 232 |        |  *   model, where active.
 233 |        |  * - paths that represent some kind of top choice. The old permuter permuted
 234 |        |  *   the top raw classifier score, the top upper case word and the top lower-
 235 |        |  *   case word. UpdateState now concentrates its top-choice paths on top
 236 |        |  *   lower-case, top upper-case (or caseless alpha), and top digit sequence,
 237 |        |  *   with allowance for continuation of these paths through blobs where such
 238 |        |  *   a character does not appear in the choices list.
 239 |        |  *
 240 |        |  * GetNextParentVSE enforces some of these models to minimize the number of
 241 |        |  * calls to AddViterbiStateEntry, even prior to looking at the language model.
 242 |        |  * Thus an n-blob sequence of [l1I] will produce 3n calls to
 243 |        |  * AddViterbiStateEntry instead of 3^n.
 244 |        |  *
 245 |        |  * Of course it isn't quite that simple as Title Case is handled by allowing
 246 |        |  * lower case to continue an upper case initial, but it has to be detected
 247 |        |  * in the combiner so it knows which upper case letters are initial alphas.
 248 |        |  */
 249 |        | bool LanguageModel::UpdateState(bool just_classified, int curr_col, int curr_row,
 250 |        |                                 BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node,
 251 |        |                                 LMPainPoints *pain_points, WERD_RES *word_res,
 252 |  2.49M |                                 BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) {
 253 |  2.49M |   if (language_model_debug_level > 0) {
 254 |      0 |     tprintf("\nUpdateState: col=%d row=%d %s", curr_col, curr_row,
 255 |      0 |             just_classified ? "just_classified" : "");
 256 |      0 |     if (language_model_debug_level > 5) {
 257 |      0 |       tprintf("(parent=%p)\n", static_cast<void *>(parent_node));
 258 |      0 |     } else {
 259 |      0 |       tprintf("\n");
 260 |      0 |     }
 261 |      0 |   }
 262 |        |   // Initialize helper variables.
 263 |  2.49M |   bool word_end = (curr_row + 1 >= word_res->ratings->dimension());
 264 |  2.49M |   bool new_changed = false;
 265 |  2.49M |   float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
 266 |  2.49M |   const UNICHARSET &unicharset = dict_->getUnicharset();
 267 |  2.49M |   BLOB_CHOICE *first_lower = nullptr;
 268 |  2.49M |   BLOB_CHOICE *first_upper = nullptr;
 269 |  2.49M |   BLOB_CHOICE *first_digit = nullptr;
 270 |  2.49M |   bool has_alnum_mix = false;
 271 |  2.49M |   if (parent_node != nullptr) {
 272 |  2.25M |     int result = SetTopParentLowerUpperDigit(parent_node);
 273 |  2.25M |     if (result < 0) {
 274 |      0 |       if (language_model_debug_level > 0) {
 275 |      0 |         tprintf("No parents found to process\n");
 276 |      0 |       }
 277 |      0 |       return false;
 278 |      0 |     }
 279 |  2.25M |     if (result > 0) {
 280 |   262k |       has_alnum_mix = true;
 281 |   262k |     }
 282 |  2.25M |   }
 283 |  2.49M |   if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper, &first_digit)) {
 284 |  2.24M |     has_alnum_mix = false;
 285 |  2.24M |   }
 286 |  2.49M |   ScanParentsForCaseMix(unicharset, parent_node);
 287 |  2.49M |   if (language_model_debug_level > 3 && parent_node != nullptr) {
 288 |      0 |     parent_node->Print("Parent viterbi list");
 289 |      0 |   }
 290 |  2.49M |   LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
 291 |        |
 292 |        |   // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
 293 |  2.49M |   ViterbiStateEntry_IT vit;
 294 |  2.49M |   BLOB_CHOICE_IT c_it(curr_list);
 295 |  11.7M |   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
 296 |  9.23M |     BLOB_CHOICE *choice = c_it.data();
 297 |        |     // TODO(antonova): make sure commenting this out is ok for ngram
 298 |        |     // model scoring (I think this was introduced to fix ngram model quirks).
 299 |        |     // Skip nullptr unichars unless it is the only choice.
 300 |        |     // if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
 301 |  9.23M |     UNICHAR_ID unichar_id = choice->unichar_id();
 302 |  9.23M |     if (unicharset.get_fragment(unichar_id)) {
 303 |      0 |       continue; // Skip fragments.
 304 |      0 |     }
 305 |        |     // Set top choice flags.
 306 |  9.23M |     LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
 307 |  9.23M |     if (c_it.at_first() || !new_changed) {
 308 |  5.71M |       blob_choice_flags |= kSmallestRatingFlag;
 309 |  5.71M |     }
 310 |  9.23M |     if (first_lower == choice) {
 311 |  2.49M |       blob_choice_flags |= kLowerCaseFlag;
 312 |  2.49M |     }
 313 |  9.23M |     if (first_upper == choice) {
 314 |  2.49M |       blob_choice_flags |= kUpperCaseFlag;
 315 |  2.49M |     }
 316 |  9.23M |     if (first_digit == choice) {
 317 |  2.49M |       blob_choice_flags |= kDigitFlag;
 318 |  2.49M |     }
 319 |        |
 320 |  9.23M |     if (parent_node == nullptr) {
 321 |        |       // Process the beginning of a word.
 322 |        |       // If there is a better case variant that is not distinguished by size,
 323 |        |       // skip this blob choice, as we have no choice but to accept the result
 324 |        |       // of the character classifier to distinguish between them, even if
 325 |        |       // followed by an upper case.
 326 |        |       // With words like iPoc, and other CamelBackWords, the lower-upper
 327 |        |       // transition can only be achieved if the classifier has the correct case
 328 |        |       // as the top choice, and leaving an initial I lower down the list
 329 |        |       // increases the chances of choosing IPoc simply because it doesn't
 330 |        |       // include such a transition. iPoc will beat iPOC and ipoc because
 331 |        |       // the other words are baseline/x-height inconsistent.
 332 |   977k |       if (HasBetterCaseVariant(unicharset, choice, curr_list)) {
 333 |  23.5k |         continue;
 334 |  23.5k |       }
 335 |        |       // Upper counts as lower at the beginning of a word.
 336 |   953k |       if (blob_choice_flags & kUpperCaseFlag) {
 337 |   226k |         blob_choice_flags |= kLowerCaseFlag;
 338 |   226k |       }
 339 |   953k |       new_changed |= AddViterbiStateEntry(blob_choice_flags, denom, word_end, curr_col, curr_row,
 340 |   953k |                                           choice, curr_state, nullptr, pain_points, word_res,
 341 |   953k |                                           best_choice_bundle, blamer_bundle);
 342 |  8.25M |     } else {
 343 |        |       // Get viterbi entries from each parent ViterbiStateEntry.
 344 |  8.25M |       vit.set_to_list(&parent_node->viterbi_state_entries);
 345 |  8.25M |       int vit_counter = 0;
 346 |  8.25M |       vit.mark_cycle_pt();
 347 |  8.25M |       ViterbiStateEntry *parent_vse = nullptr;
 348 |  8.25M |       LanguageModelFlagsType top_choice_flags;
 349 |  26.6M |       while ((parent_vse =
 350 |  26.6M |                   GetNextParentVSE(just_classified, has_alnum_mix, c_it.data(), blob_choice_flags,
 351 |  26.6M |                                    unicharset, word_res, &vit, &top_choice_flags)) != nullptr) {
 352 |        |         // Skip pruned entries and do not look at prunable entries if already
 353 |        |         // examined language_model_viterbi_list_max_num_prunable of those.
 354 |  18.3M |         if (PrunablePath(*parent_vse) &&
 355 |  18.3M |             (++vit_counter > language_model_viterbi_list_max_num_prunable ||
 356 |  3.77M |              (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
 357 |   256k |           continue;
 358 |   256k |         }
 359 |        |         // If the parent has no alnum choice, (i.e. choice is the first in a
 360 |        |         // string of alnum), and there is a better case variant that is not
 361 |        |         // distinguished by size, skip this blob choice/parent, as with the
 362 |        |         // initial blob treatment above.
 363 |  18.1M |         if (!parent_vse->HasAlnumChoice(unicharset) &&
 364 |  18.1M |             HasBetterCaseVariant(unicharset, choice, curr_list)) {
 365 |  88.6k |           continue;
 366 |  88.6k |         }
 367 |        |         // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
 368 |        |         // looks good according to the Dawgs or character ngram model.
 369 |  18.0M |         new_changed |= AddViterbiStateEntry(top_choice_flags, denom, word_end, curr_col, curr_row,
 370 |  18.0M |                                             c_it.data(), curr_state, parent_vse, pain_points,
 371 |  18.0M |                                             word_res, best_choice_bundle, blamer_bundle);
 372 |  18.0M |       }
 373 |  8.25M |     }
 374 |  9.23M |   }
 375 |  2.49M |   return new_changed;
 376 |  2.49M | }
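Note: to make the 3n-versus-3^n claim in the comment concrete, consider a 4-blob word
where every blob's choice list contains the look-alikes l, 1 and I. Pairing every
surviving parent entry with every current choice would accumulate 3^4 = 81 calls to
AddViterbiStateEntry by the last column, whereas extending only the top lower-case, top
upper-case and top digit paths costs 3 calls per column, i.e. 3 * 4 = 12 in total.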
 377 |        |
 378 |        | /**
 379 |        |  * Finds the first lower and upper case letter and first digit in curr_list.
 380 |        |  * For non-upper/lower languages, alpha counts as upper.
 381 |        |  * Uses the first character in the list in place of empty results.
 382 |        |  * Returns true if both alpha and digits are found.
 383 |        |  */
 384 |        | bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower,
 385 |        |                                           BLOB_CHOICE **first_upper,
 386 |  2.49M |                                           BLOB_CHOICE **first_digit) const {
 387 |  2.49M |   BLOB_CHOICE_IT c_it(curr_list);
 388 |  2.49M |   const UNICHARSET &unicharset = dict_->getUnicharset();
 389 |  2.49M |   BLOB_CHOICE *first_unichar = nullptr;
 390 |  11.7M |   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
 391 |  9.23M |     UNICHAR_ID unichar_id = c_it.data()->unichar_id();
 392 |  9.23M |     if (unicharset.get_fragment(unichar_id)) {
 393 |      0 |       continue; // skip fragments
 394 |      0 |     }
 395 |  9.23M |     if (first_unichar == nullptr) {
 396 |  2.49M |       first_unichar = c_it.data();
 397 |  2.49M |     }
 398 |  9.23M |     if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {
 399 |  2.04M |       *first_lower = c_it.data();
 400 |  2.04M |     }
 401 |  9.23M |     if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&
 402 |  9.23M |         !unicharset.get_islower(unichar_id)) {
 403 |   804k |       *first_upper = c_it.data();
 404 |   804k |     }
 405 |  9.23M |     if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {
 406 |   256k |       *first_digit = c_it.data();
 407 |   256k |     }
 408 |  9.23M |   }
 409 |  2.49M |   ASSERT_HOST(first_unichar != nullptr);
 410 |  2.49M |   bool mixed = (*first_lower != nullptr || *first_upper != nullptr) && *first_digit != nullptr;
 411 |  2.49M |   if (*first_lower == nullptr) {
 412 |   444k |     *first_lower = first_unichar;
 413 |   444k |   }
 414 |  2.49M |   if (*first_upper == nullptr) {
 415 |  1.69M |     *first_upper = first_unichar;
 416 |  1.69M |   }
 417 |  2.49M |   if (*first_digit == nullptr) {
 418 |  2.23M |     *first_digit = first_unichar;
 419 |  2.23M |   }
 420 |  2.49M |   return mixed;
 421 |  2.49M | }
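Note: for example, given a choice list [., 3, a, A] this returns first_lower = 'a',
first_upper = 'A' and first_digit = '3', and the result is true because both an alpha
and a digit are present; for a list [-, .] all three outputs fall back to the first
entry '-' and the result is false.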
 422 |        |
 423 |        | /**
 424 |        |  * Forces there to be at least one entry in the overall set of the
 425 |        |  * viterbi_state_entries of each element of parent_node that has the
 426 |        |  * top_choice_flag set for lower, upper and digit using the same rules as
 427 |        |  * GetTopLowerUpperDigit, setting the flag on the first found suitable
 428 |        |  * candidate, whether or not the flag is set on some other parent.
 429 |        |  * Returns 1 if both alpha and digits are found among the parents, -1 if no
 430 |        |  * parents are found at all (a legitimate case), and 0 otherwise.
 431 |        |  */
 432 |  2.25M | int LanguageModel::SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const {
 433 |  2.25M |   if (parent_node == nullptr) {
 434 |      0 |     return -1;
 435 |      0 |   }
 436 |  2.25M |   UNICHAR_ID top_id = INVALID_UNICHAR_ID;
 437 |  2.25M |   ViterbiStateEntry *top_lower = nullptr;
 438 |  2.25M |   ViterbiStateEntry *top_upper = nullptr;
 439 |  2.25M |   ViterbiStateEntry *top_digit = nullptr;
 440 |  2.25M |   ViterbiStateEntry *top_choice = nullptr;
 441 |  2.25M |   float lower_rating = 0.0f;
 442 |  2.25M |   float upper_rating = 0.0f;
 443 |  2.25M |   float digit_rating = 0.0f;
 444 |  2.25M |   float top_rating = 0.0f;
 445 |  2.25M |   const UNICHARSET &unicharset = dict_->getUnicharset();
 446 |  2.25M |   ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
 447 |  15.0M |   for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
 448 |  12.7M |     ViterbiStateEntry *vse = vit.data();
 449 |        |     // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
 450 |        |     // back to the real character if needed.
 451 |  12.7M |     ViterbiStateEntry *unichar_vse = vse;
 452 |  12.7M |     UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
 453 |  12.7M |     float rating = unichar_vse->curr_b->rating();
 454 |  12.7M |     while (unichar_id == INVALID_UNICHAR_ID && unichar_vse->parent_vse != nullptr) {
 455 |      0 |       unichar_vse = unichar_vse->parent_vse;
 456 |      0 |       unichar_id = unichar_vse->curr_b->unichar_id();
 457 |      0 |       rating = unichar_vse->curr_b->rating();
 458 |      0 |     }
 459 |  12.7M |     if (unichar_id != INVALID_UNICHAR_ID) {
 460 |  12.7M |       if (unicharset.get_islower(unichar_id)) {
 461 |  6.55M |         if (top_lower == nullptr || lower_rating > rating) {
 462 |  2.07M |           top_lower = vse;
 463 |  2.07M |           lower_rating = rating;
 464 |  2.07M |         }
 465 |  6.55M |       } else if (unicharset.get_isalpha(unichar_id)) {
 466 |  1.43M |         if (top_upper == nullptr || upper_rating > rating) {
 467 |   650k |           top_upper = vse;
 468 |   650k |           upper_rating = rating;
 469 |   650k |         }
 470 |  4.76M |       } else if (unicharset.get_isdigit(unichar_id)) {
 471 |   609k |         if (top_digit == nullptr || digit_rating > rating) {
 472 |   278k |           top_digit = vse;
 473 |   278k |           digit_rating = rating;
 474 |   278k |         }
 475 |   609k |       }
 476 |  12.7M |     }
 477 |  12.7M |     if (top_choice == nullptr || top_rating > rating) {
 478 |  2.99M |       top_choice = vse;
 479 |  2.99M |       top_rating = rating;
 480 |  2.99M |       top_id = unichar_id;
 481 |  2.99M |     }
 482 |  12.7M |   }
 483 |  2.25M |   if (top_choice == nullptr) {
 484 |      0 |     return -1;
 485 |      0 |   }
 486 |  2.25M |   bool mixed = (top_lower != nullptr || top_upper != nullptr) && top_digit != nullptr;
 487 |  2.25M |   if (top_lower == nullptr) {
 488 |   488k |     top_lower = top_choice;
 489 |   488k |   }
 490 |  2.25M |   top_lower->top_choice_flags |= kLowerCaseFlag;
 491 |  2.25M |   if (top_upper == nullptr) {
 492 |  1.64M |     top_upper = top_choice;
 493 |  1.64M |   }
 494 |  2.25M |   top_upper->top_choice_flags |= kUpperCaseFlag;
 495 |  2.25M |   if (top_digit == nullptr) {
 496 |  1.98M |     top_digit = top_choice;
 497 |  1.98M |   }
 498 |  2.25M |   top_digit->top_choice_flags |= kDigitFlag;
 499 |  2.25M |   top_choice->top_choice_flags |= kSmallestRatingFlag;
 500 |  2.25M |   if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
 501 |  2.25M |       (top_choice->top_choice_flags & (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {
 502 |        |     // If the compound marker top choice carries any of the top alnum flags,
 503 |        |     // then give it all of them, allowing words like I-295 to be chosen.
 504 |      0 |     top_choice->top_choice_flags |= kLowerCaseFlag | kUpperCaseFlag | kDigitFlag;
 505 |      0 |   }
 506 |  2.25M |   return mixed ? 1 : 0;
 507 |  2.25M | }
 508 |        |
 509 |        | /**
 510 |        |  * Finds the next ViterbiStateEntry with which the given unichar_id can
 511 |        |  * combine sensibly, taking into account any mixed alnum/mixed case
 512 |        |  * situation, and whether this combination has been inspected before.
 513 |        |  */
 514 |        | ViterbiStateEntry *LanguageModel::GetNextParentVSE(bool just_classified, bool mixed_alnum,
 515 |        |                                                    const BLOB_CHOICE *bc,
 516 |        |                                                    LanguageModelFlagsType blob_choice_flags,
 517 |        |                                                    const UNICHARSET &unicharset, WERD_RES *word_res,
 518 |        |                                                    ViterbiStateEntry_IT *vse_it,
 519 |  26.6M |                                                    LanguageModelFlagsType *top_choice_flags) const {
 520 |  57.8M |   for (; !vse_it->cycled_list(); vse_it->forward()) {
 521 |  49.6M |     ViterbiStateEntry *parent_vse = vse_it->data();
 522 |        |     // Only consider the parent if it has been updated or
 523 |        |     // if the current ratings cell has just been classified.
 524 |  49.6M |     if (!just_classified && !parent_vse->updated) {
 525 |  30.6M |       continue;
 526 |  30.6M |     }
 527 |  19.0M |     if (language_model_debug_level > 2) {
 528 |      0 |       parent_vse->Print("Considering");
 529 |      0 |     }
 530 |        |     // If the parent is non-alnum, then upper counts as lower.
 531 |  19.0M |     *top_choice_flags = blob_choice_flags;
 532 |  19.0M |     if ((blob_choice_flags & kUpperCaseFlag) && !parent_vse->HasAlnumChoice(unicharset)) {
 533 |  1.57M |       *top_choice_flags |= kLowerCaseFlag;
 534 |  1.57M |     }
 535 |  19.0M |     *top_choice_flags &= parent_vse->top_choice_flags;
 536 |  19.0M |     UNICHAR_ID unichar_id = bc->unichar_id();
 537 |  19.0M |     const BLOB_CHOICE *parent_b = parent_vse->curr_b;
 538 |  19.0M |     UNICHAR_ID parent_id = parent_b->unichar_id();
 539 |        |     // Digits do not bind to alphas if there is a mix in both parent and current
 540 |        |     // or if the alpha is not the top choice.
 541 |  19.0M |     if (unicharset.get_isdigit(unichar_id) && unicharset.get_isalpha(parent_id) &&
 542 |  19.0M |         (mixed_alnum || *top_choice_flags == 0)) {
 543 |   197k |       continue; // Digits don't bind to alphas.
 544 |   197k |     }
 545 |        |     // Likewise alphas do not bind to digits if there is a mix in both or if
 546 |        |     // the digit is not the top choice.
 547 |  18.8M |     if (unicharset.get_isalpha(unichar_id) && unicharset.get_isdigit(parent_id) &&
 548 |  18.8M |         (mixed_alnum || *top_choice_flags == 0)) {
 549 |   391k |       continue; // Alphas don't bind to digits.
 550 |   391k |     }
 551 |        |     // If there is a case mix of the same alpha in the parent list, then
 552 |        |     // competing_vse is non-null and will be used to determine whether
 553 |        |     // or not to bind the current blob choice.
 554 |  18.4M |     if (parent_vse->competing_vse != nullptr) {
 555 |  1.25M |       const BLOB_CHOICE *competing_b = parent_vse->competing_vse->curr_b;
 556 |  1.25M |       UNICHAR_ID other_id = competing_b->unichar_id();
 557 |  1.25M |       if (language_model_debug_level >= 5) {
 558 |      0 |         tprintf("Parent %s has competition %s\n", unicharset.id_to_unichar(parent_id),
 559 |      0 |                 unicharset.id_to_unichar(other_id));
 560 |      0 |       }
 561 |  1.25M |       if (unicharset.SizesDistinct(parent_id, other_id)) {
 562 |        |         // If other_id matches bc wrt position and size, and parent_id doesn't,
 563 |        |         // don't bind to the current parent.
 564 |   525k |         if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
 565 |   525k |                                 language_model_debug_level >= 5) &&
 566 |   525k |             !bc->PosAndSizeAgree(*parent_b, word_res->x_height, language_model_debug_level >= 5)) {
 567 |  78.7k |           continue; // Competing blobchoice has a better vertical match.
 568 |  78.7k |         }
 569 |   525k |       }
 570 |  1.25M |     }
 571 |  18.3M |     vse_it->forward();
 572 |  18.3M |     return parent_vse; // This one is good!
 573 |  18.4M |   }
 574 |  8.25M |   return nullptr; // Ran out of possibilities.
 575 |  26.6M | }
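Note: concretely, the binding rules above mean that a current-blob digit '8' is not
allowed to extend a parent alpha 'B' when the column mix is alnum-ambiguous or the '8'
is not a top choice, and symmetrically for an alpha extending a digit. This is what
keeps mixed hypotheses like B8/8B from multiplying while still letting an unambiguous
top choice through.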
 576 |        |
 577 |        | bool LanguageModel::AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom,
 578 |        |                                          bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b,
 579 |        |                                          LanguageModelState *curr_state,
 580 |        |                                          ViterbiStateEntry *parent_vse, LMPainPoints *pain_points,
 581 |        |                                          WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
 582 |  18.9M |                                          BlamerBundle *blamer_bundle) {
 583 |  18.9M |   ViterbiStateEntry_IT vit;
 584 |  18.9M |   if (language_model_debug_level > 1) {
 585 |      0 |     tprintf(
 586 |      0 |         "AddViterbiStateEntry for unichar %s rating=%.4f"
 587 |      0 |         " certainty=%.4f top_choice_flags=0x%x",
 588 |      0 |         dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->rating(), b->certainty(),
 589 |      0 |         top_choice_flags);
 590 |      0 |     if (language_model_debug_level > 5) {
 591 |      0 |       tprintf(" parent_vse=%p\n", static_cast<void *>(parent_vse));
 592 |      0 |     } else {
 593 |      0 |       tprintf("\n");
 594 |      0 |     }
 595 |      0 |   }
 596 |  18.9M |   ASSERT_HOST(curr_state != nullptr);
 597 |        |   // Check whether the list is full.
 598 |  18.9M |   if (curr_state->viterbi_state_entries_length >= language_model_viterbi_list_max_size) {
 599 |      0 |     if (language_model_debug_level > 1) {
 600 |      0 |       tprintf("AddViterbiStateEntry: viterbi list is full!\n");
 601 |      0 |     }
 602 |      0 |     return false;
 603 |      0 |   }
 604 |        |
 605 |        |   // Invoke Dawg language model component.
 606 |  18.9M |   LanguageModelDawgInfo *dawg_info = GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
 607 |        |
 608 |  18.9M |   float outline_length = AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);
 609 |        |   // Invoke Ngram language model component.
 610 |  18.9M |   LanguageModelNgramInfo *ngram_info = nullptr;
 611 |  18.9M |   if (language_model_ngram_on) {
 612 |      0 |     ngram_info =
 613 |      0 |         GenerateNgramInfo(dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
 614 |      0 |                           denom, curr_col, curr_row, outline_length, parent_vse);
 615 |      0 |     ASSERT_HOST(ngram_info != nullptr);
 616 |      0 |   }
 617 |  18.9M |   bool liked_by_language_model =
 618 |  18.9M |       dawg_info != nullptr || (ngram_info != nullptr && !ngram_info->pruned);
 619 |        |   // Quick escape if not liked by the language model, can't be consistent
 620 |        |   // xheight, and not top choice.
 621 |  18.9M |   if (!liked_by_language_model && top_choice_flags == 0) {
 622 |  8.75M |     if (language_model_debug_level > 1) {
 623 |      0 |       tprintf("Language model components very early pruned this entry\n");
 624 |      0 |     }
 625 |  8.75M |     delete ngram_info;
 626 |  8.75M |     delete dawg_info;
 627 |  8.75M |     return false;
 628 |  8.75M |   }
 629 |        |
 630 |        |   // Check consistency of the path and set the relevant consistency_info.
 631 |  10.2M |   LMConsistencyInfo consistency_info(parent_vse != nullptr ? &parent_vse->consistency_info
 632 |  10.2M |                                                            : nullptr);
 633 |        |   // Start with just the x-height consistency, as it provides significant
 634 |        |   // pruning opportunity.
 635 |  10.2M |   consistency_info.ComputeXheightConsistency(
 636 |  10.2M |       b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));
 637 |        |   // Turn off xheight consistent flag if not consistent.
 638 |  10.2M |   if (consistency_info.InconsistentXHeight()) {
 639 |  6.01M |     top_choice_flags &= ~kXhtConsistentFlag;
 640 |  6.01M |   }
 641 |        |
 642 |        |   // Quick escape if not liked by the language model, not consistent xheight,
 643 |        |   // and not top choice.
 644 |  10.2M |   if (!liked_by_language_model && top_choice_flags == 0) {
 645 |  1.14M |     if (language_model_debug_level > 1) {
 646 |      0 |       tprintf("Language model components early pruned this entry\n");
 647 |      0 |     }
 648 |  1.14M |     delete ngram_info;
 649 |  1.14M |     delete dawg_info;
 650 |  1.14M |     return false;
 651 |  1.14M |   }
 652 |        |
 653 |        |   // Compute the rest of the consistency info.
 654 |  9.07M |   FillConsistencyInfo(curr_col, word_end, b, parent_vse, word_res, &consistency_info);
 655 |  9.07M |   if (dawg_info != nullptr && consistency_info.invalid_punc) {
 656 |      0 |     consistency_info.invalid_punc = false; // do not penalize dict words
 657 |      0 |   }
 658 |        |
 659 |        |   // Compute cost of associating the blobs that represent the current unichar.
 660 |  9.07M |   AssociateStats associate_stats;
 661 |  9.07M |   ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_, parent_vse, word_res,
 662 |  9.07M |                         &associate_stats);
 663 |  9.07M |   if (parent_vse != nullptr) {
 664 |  8.12M |     associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
 665 |  8.12M |     associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
 666 |  8.12M |   }
 667 |        |
 668 |        |   // Create the new ViterbiStateEntry and compute the adjusted cost of the path.
 669 |  9.07M |   auto *new_vse = new ViterbiStateEntry(parent_vse, b, 0.0, outline_length, consistency_info,
 670 |  9.07M |                                         associate_stats, top_choice_flags, dawg_info, ngram_info,
 671 |  9.07M |                                         (language_model_debug_level > 0)
 672 |  9.07M |                                             ? dict_->getUnicharset().id_to_unichar(b->unichar_id())
 673 |  9.07M |                                             : nullptr);
 674 |  9.07M |   new_vse->cost = ComputeAdjustedPathCost(new_vse);
 675 |  9.07M |   if (language_model_debug_level >= 3) {
 676 |      0 |     tprintf("Adjusted cost = %g\n", new_vse->cost);
 677 |      0 |   }
 678 |        |
 679 |        |   // Invoke Top Choice language model component to make the final adjustments
 680 |        |   // to new_vse->top_choice_flags.
 681 |  9.07M |   if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
 682 |  8.58M |     GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
 683 |  8.58M |   }
 684 |        |
 685 |        |   // If language model components did not like this unichar - return.
 686 |  9.07M |   bool keep = new_vse->top_choice_flags || liked_by_language_model;
 687 |  9.07M |   if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
 688 |  9.07M |       consistency_info.inconsistent_script) {      // with inconsistent script
 689 |  20.8k |     keep = false;
 690 |  20.8k |   }
 691 |  9.07M |   if (!keep) {
 692 |  6.72M |     if (language_model_debug_level > 1) {
 693 |      0 |       tprintf("Language model components did not like this entry\n");
 694 |      0 |     }
 695 |  6.72M |     delete new_vse;
 696 |  6.72M |     return false;
 697 |  6.72M |   }
 698 |        |
 699 |        |   // Discard this entry if it represents a prunable path and
 700 |        |   // language_model_viterbi_list_max_num_prunable such entries with a lower
 701 |        |   // cost have already been recorded.
 702 |  2.35M |   if (PrunablePath(*new_vse) &&
 703 |  2.35M |       (curr_state->viterbi_state_entries_prunable_length >=
 704 |      0 |        language_model_viterbi_list_max_num_prunable) &&
 705 |  2.35M |       new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
 706 |      0 |     if (language_model_debug_level > 1) {
 707 |      0 |       tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n", new_vse->cost,
 708 |      0 |               curr_state->viterbi_state_entries_prunable_max_cost);
 709 |      0 |     }
 710 |      0 |     delete new_vse;
 711 |      0 |     return false;
 712 |      0 |   }
 713 |        |
 714 |        |   // Update best choice if needed.
 715 |  2.35M |   if (word_end) {
 716 |   399k |     UpdateBestChoice(new_vse, pain_points, word_res, best_choice_bundle, blamer_bundle);
 717 |        |     // Discard the entry if UpdateBestChoice() found flaws in it.
 718 |   399k |     if (new_vse->cost >= WERD_CHOICE::kBadRating && new_vse != best_choice_bundle->best_vse) {
 719 |      0 |       if (language_model_debug_level > 1) {
 720 |      0 |         tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
 721 |      0 |       }
 722 |      0 |       delete new_vse;
 723 |      0 |       return false;
 724 |      0 |     }
 725 |   399k |   }
 726 |        |
 727 |        |   // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.
 728 |  2.35M |   curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare, false, new_vse);
 729 |  2.35M |   curr_state->viterbi_state_entries_length++;
 730 |  2.35M |   if (PrunablePath(*new_vse)) {
 731 |      0 |     curr_state->viterbi_state_entries_prunable_length++;
 732 |      0 |   }
 733 |        |
 734 |        |   // Update lms->viterbi_state_entries_prunable_max_cost and clear
 735 |        |   // top_choice_flags of entries with ratings_sum higher than new_vse->ratings_sum.
 736 |  2.35M |   if ((curr_state->viterbi_state_entries_prunable_length >=
 737 |  2.35M |        language_model_viterbi_list_max_num_prunable) ||
 738 |  2.35M |       new_vse->top_choice_flags) {
 739 |  2.35M |     ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
 740 |  2.35M |     int prunable_counter = language_model_viterbi_list_max_num_prunable;
 741 |  2.35M |     vit.set_to_list(&(curr_state->viterbi_state_entries));
 742 |  16.5M |     for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
 743 |  14.1M |       ViterbiStateEntry *curr_vse = vit.data();
 744 |        |       // Clear the appropriate top choice flags of the entries in the
 745 |        |       // list that have cost higher than new_entry->cost
 746 |        |       // (since they will not be top choices any more).
 747 |  14.1M |       if (curr_vse->top_choice_flags && curr_vse != new_vse && curr_vse->cost > new_vse->cost) {
 748 |  3.27M |         curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
 749 |  3.27M |       }
 750 |  14.1M |       if (prunable_counter > 0 && PrunablePath(*curr_vse)) {
 751 |  6.69M |         --prunable_counter;
 752 |  6.69M |       }
 753 |        |       // Update curr_state->viterbi_state_entries_prunable_max_cost.
 754 |  14.1M |       if (prunable_counter == 0) {
 755 |   272k |         curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
 756 |   272k |         if (language_model_debug_level > 1) {
 757 |      0 |           tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
 758 |      0 |                   curr_state->viterbi_state_entries_prunable_max_cost);
 759 |      0 |         }
 760 |   272k |         prunable_counter = -1; // stop counting
 761 |   272k |       }
 762 |  14.1M |     }
 763 |  2.35M |   }
 764 |        |
 765 |        |   // Print the newly created ViterbiStateEntry.
 766 |  2.35M |   if (language_model_debug_level > 2) {
 767 |      0 |     new_vse->Print("New");
 768 |      0 |     if (language_model_debug_level > 5) {
 769 |      0 |       curr_state->Print("Updated viterbi list");
 770 |      0 |     }
 771 |      0 |   }
 772 |        |
 773 |  2.35M |   return true;
 774 |  2.35M | }
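Note: a toy sketch of the prunable-entry bookkeeping above. A sorted multiset stands in
for the cost-sorted entry list, k for language_model_viterbi_list_max_num_prunable, and
every entry is treated as prunable; once k entries exist, the k-th cheapest cost becomes
the admission threshold, playing the role of viterbi_state_entries_prunable_max_cost:

    #include <cstdio>
    #include <iterator>
    #include <set>

    int main() {
      const std::size_t k = 3;
      std::multiset<float> beam; // costs, kept sorted like add_sorted()
      float prunable_max_cost = 1e30f;
      for (float cost : {0.4f, 0.9f, 0.2f, 0.7f, 0.8f}) {
        if (beam.size() >= k && cost >= prunable_max_cost) {
          std::printf("discard %.1f\n", cost); // too expensive to admit
          continue;
        }
        beam.insert(cost);
        if (beam.size() >= k) {
          auto it = beam.begin();
          std::advance(it, k - 1); // cost of the k-th cheapest entry
          prunable_max_cost = *it;
        }
      }
      // Prints: discard 0.8, then threshold 0.7, kept 4.
      std::printf("threshold %.1f, kept %zu\n", prunable_max_cost, beam.size());
      return 0;
    }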
 775 |        |
 776 |        | void LanguageModel::GenerateTopChoiceInfo(ViterbiStateEntry *new_vse,
 777 |        |                                           const ViterbiStateEntry *parent_vse,
 778 |  8.58M |                                           LanguageModelState *lms) {
 779 |  8.58M |   ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
 780 |  8.58M |   for (vit.mark_cycle_pt();
 781 |  24.3M |        !vit.cycled_list() && new_vse->top_choice_flags && new_vse->cost >= vit.data()->cost;
 782 |  15.7M |        vit.forward()) {
 783 |        |     // Clear the appropriate flags if the list already contains
 784 |        |     // a top choice entry with a lower cost.
 785 |  15.7M |     new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
 786 |  15.7M |   }
 787 |  8.58M |   if (language_model_debug_level > 2) {
 788 |      0 |     tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n", new_vse->top_choice_flags);
 789 |      0 |   }
 790 |  8.58M | }
 791 |        |
 792 |        | LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(bool word_end, int curr_col, int curr_row,
 793 |        |                                                        const BLOB_CHOICE &b,
 794 |  18.9M |                                                        const ViterbiStateEntry *parent_vse) {
 795 |        |   // Initialize active_dawgs from parent_vse if it is not nullptr.
 796 |        |   // Otherwise use very_beginning_active_dawgs_.
 797 |  18.9M |   if (parent_vse == nullptr) {
 798 |   953k |     dawg_args_.active_dawgs = &very_beginning_active_dawgs_;
 799 |   953k |     dawg_args_.permuter = NO_PERM;
 800 |  18.0M |   } else {
 801 |  18.0M |     if (parent_vse->dawg_info == nullptr) {
 802 |  18.0M |       return nullptr; // not a dict word path
 803 |  18.0M |     }
 804 |      0 |     dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
 805 |      0 |     dawg_args_.permuter = parent_vse->dawg_info->permuter;
 806 |      0 |   }
 807 |        |
 808 |        |   // Deal with hyphenated words.
 809 |   953k |   if (word_end && dict_->has_hyphen_end(&dict_->getUnicharset(), b.unichar_id(), curr_col == 0)) {
 810 |      0 |     if (language_model_debug_level > 0) {
 811 |      0 |       tprintf("Hyphenated word found\n");
 812 |      0 |     }
 813 |      0 |     return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
 814 |      0 |   }
 815 |        |
 816 |        |   // Deal with compound words.
 817 |   953k |   if (dict_->compound_marker(b.unichar_id()) &&
 818 |   953k |       (parent_vse == nullptr || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
 819 |      0 |     if (language_model_debug_level > 0) {
 820 |      0 |       tprintf("Found compound marker\n");
 821 |      0 |     }
 822 |        |     // Do not allow compound operators at the beginning and end of the word.
 823 |        |     // Do not allow more than one compound operator per word.
 824 |        |     // Do not allow compounding of words with lengths shorter than
 825 |        |     // language_model_min_compound_length
 826 |      0 |     if (parent_vse == nullptr || word_end || dawg_args_.permuter == COMPOUND_PERM ||
 827 |      0 |         parent_vse->length < language_model_min_compound_length) {
 828 |      0 |       return nullptr;
 829 |      0 |     }
 830 |        |
 831 |        |     // Check that the path terminated before the current character is a word.
 832 |      0 |     bool has_word_ending = false;
 833 |      0 |     for (unsigned i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
 834 |      0 |       const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
 835 |      0 |       const Dawg *pdawg = pos.dawg_index < 0 ? nullptr : dict_->GetDawg(pos.dawg_index);
 836 |      0 |       if (pdawg == nullptr || pos.back_to_punc) {
 837 |      0 |         continue;
 838 |      0 |       }
 839 |      0 |       if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
 840 |      0 |           pdawg->end_of_word(pos.dawg_ref)) {
 841 |      0 |         has_word_ending = true;
 842 |      0 |         break;
 843 |      0 |       }
 844 |      0 |     }
 845 |      0 |     if (!has_word_ending) {
 846 |      0 |       return nullptr;
 847 |      0 |     }
 848 |        |
 849 |      0 |     if (language_model_debug_level > 0) {
 850 |      0 |       tprintf("Compound word found\n");
 851 |      0 |     }
 852 |      0 |     return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
 853 |      0 |   } // done dealing with compound words
 854 |        |
 855 |   953k |   LanguageModelDawgInfo *dawg_info = nullptr;
 856 |        |
 857 |        |   // Call LetterIsOkay().
 858 |        |   // Use the normalized IDs so that all shapes of ' can be allowed in words
 859 |        |   // like don't.
 860 |   953k |   const auto &normed_ids = dict_->getUnicharset().normed_ids(b.unichar_id());
 861 |   953k |   DawgPositionVector tmp_active_dawgs;
 862 |   953k |   for (unsigned i = 0; i < normed_ids.size(); ++i) {
 863 |   953k |     if (language_model_debug_level > 2) {
 864 |      0 |       tprintf("Test Letter OK for unichar %d, normed %d\n", b.unichar_id(), normed_ids[i]);
 865 |      0 |     }
 866 |   953k |     dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
 867 |   953k |                         word_end && i == normed_ids.size() - 1);
 868 |   953k |     if (dawg_args_.permuter == NO_PERM) {
 869 |   953k |       break;
 870 |   953k |     } else if (i < normed_ids.size() - 1) {
 871 |      0 |       tmp_active_dawgs = *dawg_args_.updated_dawgs;
 872 |      0 |       dawg_args_.active_dawgs = &tmp_active_dawgs;
 873 |      0 |     }
 874 |      0 |     if (language_model_debug_level > 2) {
 875 |      0 |       tprintf("Letter was OK for unichar %d, normed %d\n", b.unichar_id(), normed_ids[i]);
 876 |      0 |     }
 877 |      0 |   }
 878 |   953k |   dawg_args_.active_dawgs = nullptr;
 879 |   953k |   if (dawg_args_.permuter != NO_PERM) {
 880 |      0 |     dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs, dawg_args_.permuter);
 881 |   953k |   } else if (language_model_debug_level > 3) {
 882 |      0 |     tprintf("Letter %s not OK!\n", dict_->getUnicharset().id_to_unichar(b.unichar_id()));
 883 |      0 |   }
 884 |        |
 885 |   953k |   return dawg_info;
 886 |   953k | }
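Note: the per-letter DAWG walk above (LetterIsOkay over normalized unichar IDs) is the
incremental form of a whole-word dictionary lookup. A minimal client-side sketch of the
word-level check through the public API, assuming "eng" traineddata is installed:

    #include <cstdio>
    #include <tesseract/baseapi.h>

    int main() {
      tesseract::TessBaseAPI api;
      if (api.Init(nullptr, "eng") != 0) return 1;
      // Normalized apostrophe handling is what lets words like don't pass.
      std::printf("don't -> %d\n", api.IsValidWord("don't"));
      api.End();
      return 0;
    }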
 887 |        |
 888 |        | LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(const char *unichar, float certainty,
 889 |        |                                                          float denom, int curr_col, int curr_row,
 890 |        |                                                          float outline_length,
 891 |      0 |                                                          const ViterbiStateEntry *parent_vse) {
 892 |        |   // Initialize parent context.
 893 |      0 |   const char *pcontext_ptr = "";
 894 |      0 |   int pcontext_unichar_step_len = 0;
 895 |      0 |   if (parent_vse == nullptr) {
 896 |      0 |     pcontext_ptr = prev_word_str_.c_str();
 897 |      0 |     pcontext_unichar_step_len = prev_word_unichar_step_len_;
 898 |      0 |   } else {
 899 |      0 |     pcontext_ptr = parent_vse->ngram_info->context.c_str();
 900 |      0 |     pcontext_unichar_step_len = parent_vse->ngram_info->context_unichar_step_len;
 901 |      0 |   }
 902 |        |   // Compute p(unichar | parent context).
 903 |      0 |   int unichar_step_len = 0;
 904 |      0 |   bool pruned = false;
 905 |      0 |   float ngram_cost;
 906 |      0 |   float ngram_and_classifier_cost = ComputeNgramCost(unichar, certainty, denom, pcontext_ptr,
 907 |      0 |                                                      &unichar_step_len, &pruned, &ngram_cost);
 908 |        |   // Normalize just the ngram_and_classifier_cost by outline_length.
 909 |        |   // The ngram_cost is used by the params_model, so it needs to be left as-is,
 910 |        |   // and the params model cost will be normalized by outline_length.
 911 |      0 |   ngram_and_classifier_cost *= outline_length / language_model_ngram_rating_factor;
 912 |        |   // Add the ngram_cost of the parent.
 913 |      0 |   if (parent_vse != nullptr) {
 914 |      0 |     ngram_and_classifier_cost += parent_vse->ngram_info->ngram_and_classifier_cost;
 915 |      0 |     ngram_cost += parent_vse->ngram_info->ngram_cost;
 916 |      0 |   }
 917 |        |
 918 |        |   // Shorten parent context string by unichar_step_len unichars.
 919 |      0 |   int num_remove = (unichar_step_len + pcontext_unichar_step_len - language_model_ngram_order);
 920 |      0 |   if (num_remove > 0) {
 921 |      0 |     pcontext_unichar_step_len -= num_remove;
 922 |      0 |   }
 923 |      0 |   while (num_remove > 0 && *pcontext_ptr != '\0') {
 924 |      0 |     pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
 925 |      0 |     --num_remove;
 926 |      0 |   }
 927 |        |
 928 |        |   // Decide whether to prune this ngram path and update pruned accordingly.
 929 |      0 |   if (parent_vse != nullptr && parent_vse->ngram_info->pruned) {
 930 |      0 |     pruned = true;
 931 |      0 |   }
 932 |        |
 933 |        |   // Construct and return the new LanguageModelNgramInfo.
 934 |      0 |   auto *ngram_info = new LanguageModelNgramInfo(pcontext_ptr, pcontext_unichar_step_len, pruned,
 935 |      0 |                                                 ngram_cost, ngram_and_classifier_cost);
 936 |      0 |   ngram_info->context += unichar;
 937 |      0 |   ngram_info->context_unichar_step_len += unichar_step_len;
 938 |      0 |   assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
 939 |      0 |   return ngram_info;
 940 |      0 | }
 941 |        |
 942 |        | float LanguageModel::ComputeNgramCost(const char *unichar, float certainty, float denom,
 943 |        |                                       const char *context, int *unichar_step_len,
 944 |      0 |                                       bool *found_small_prob, float *ngram_cost) {
 945 |      0 |   const char *context_ptr = context;
 946 |      0 |   char *modified_context = nullptr;
 947 |      0 |   char *modified_context_end = nullptr;
 948 |      0 |   const char *unichar_ptr = unichar;
 949 |      0 |   const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
 950 |      0 |   float prob = 0.0f;
 951 |      0 |   int step = 0;
 952 |      0 |   while (unichar_ptr < unichar_end && (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
 953 |      0 |     if (language_model_debug_level > 1) {
 954 |      0 |       tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
 955 |      0 |               dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
 956 |      0 |     }
 957 |      0 |     prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
 958 |      0 |     ++(*unichar_step_len);
 959 |      0 |     if (language_model_ngram_use_only_first_uft8_step) {
 960 |      0 |       break;
 961 |      0 |     }
 962 |      0 |     unichar_ptr += step;
 963 |        |     // If there are multiple UTF8 characters present in unichar, context is
 964 |        |     // updated to include the previously examined characters from str,
 965 |        |     // unless use_only_first_uft8_step is true.
 966 |      0 |     if (unichar_ptr < unichar_end) {
 967 |      0 |       if (modified_context == nullptr) {
 968 |      0 |         size_t context_len = strlen(context);
 969 |      0 |         modified_context = new char[context_len + strlen(unichar_ptr) + step + 1];
 970 |      0 |         memcpy(modified_context, context, context_len);
 971 |      0 |         modified_context_end = modified_context + context_len;
 972 |      0 |         context_ptr = modified_context;
 973 |      0 |       }
 974 |      0 |       strncpy(modified_context_end, unichar_ptr - step, step);
 975 |      0 |       modified_context_end += step;
 976 |      0 |       *modified_context_end = '\0';
 977 |      0 |     }
 978 |      0 |   }
 979 |      0 |   prob /= static_cast<float>(*unichar_step_len); // normalize
 980 |      0 |   if (prob < language_model_ngram_small_prob) {
 981 |      0 |     if (language_model_debug_level > 0) {
 982 |      0 |       tprintf("Found small prob %g\n", prob);
 983 |      0 |     }
 984 |      0 |     *found_small_prob = true;
 985 |      0 |     prob = language_model_ngram_small_prob;
 986 |      0 |   }
 987 |      0 |   *ngram_cost = -1 * std::log2(prob);
 988 |      0 |   float ngram_and_classifier_cost = -1 * std::log2(CertaintyScore(certainty) / denom) +
 989 |      0 |                                     *ngram_cost * language_model_ngram_scale_factor;
 990 |      0 |   if (language_model_debug_level > 1) {
 991 |      0 |     tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar, unichar, context_ptr,
 992 |      0 |             CertaintyScore(certainty) / denom, prob, ngram_and_classifier_cost);
 993 |      0 |   }
 994 |      0 |   delete[] modified_context;
 995 |      0 |   return ngram_and_classifier_cost;
 996 |      0 | }
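Note: a worked instance of the cost combination above, with assumed numbers. If the
averaged ngram probability is prob = 0.25, then ngram_cost = -log2(0.25) = 2.0. If
CertaintyScore(certainty) / denom = 0.5, the classifier term is -log2(0.5) = 1.0, so
with the default language_model_ngram_scale_factor of 0.03 the returned
ngram_and_classifier_cost is 1.0 + 2.0 * 0.03 = 1.06.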
 997 |        |
 998 |      0 | float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
 999 |      0 |   if (curr_list->empty()) {
1000 |      0 |     return 1.0f;
1001 |      0 |   }
1002 |      0 |   float denom = 0.0f;
1003 |      0 |   int len = 0;
1004 |      0 |   BLOB_CHOICE_IT c_it(curr_list);
1005 |      0 |   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
1006 |      0 |     ASSERT_HOST(c_it.data() != nullptr);
1007 |      0 |     ++len;
1008 |      0 |     denom += CertaintyScore(c_it.data()->certainty());
1009 |      0 |   }
1010 |      0 |   assert(len != 0);
1011 |        |   // The ideal situation would be to have the classifier scores for
1012 |        |   // classifying each position as each of the characters in the unicharset.
1013 |        |   // Since we cannot do this because of speed, we add a very crude estimate
1014 |        |   // of what these scores for the "missing" classifications would sum up to.
1015 |      0 |   denom +=
1016 |      0 |       (dict_->getUnicharset().size() - len) * CertaintyScore(language_model_ngram_nonmatch_score);
1017 |        |
1018 |      0 |   return denom;
1019 |      0 | }
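Note: for example, with two choices in the list whose CertaintyScore values are 0.5 and
0.3, and a unicharset of 100 characters, the result is
0.8 + (100 - 2) * CertaintyScore(language_model_ngram_nonmatch_score), the second term
being the crude estimate for the 98 "missing" classifications.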
1020
1021
void LanguageModel::FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b,
1022
                                        ViterbiStateEntry *parent_vse, WERD_RES *word_res,
1023
9.07M
                                        LMConsistencyInfo *consistency_info) {
1024
9.07M
  const UNICHARSET &unicharset = dict_->getUnicharset();
1025
9.07M
  UNICHAR_ID unichar_id = b->unichar_id();
1026
9.07M
  BLOB_CHOICE *parent_b = parent_vse != nullptr ? parent_vse->curr_b : nullptr;
1027
1028
  // Check punctuation validity.
1029
9.07M
  if (unicharset.get_ispunctuation(unichar_id)) {
1030
2.17M
    consistency_info->num_punc++;
1031
2.17M
  }
1032
9.07M
  if (dict_->GetPuncDawg() != nullptr && !consistency_info->invalid_punc) {
1033
0
    if (dict_->compound_marker(unichar_id) && parent_b != nullptr &&
1034
0
        (unicharset.get_isalpha(parent_b->unichar_id()) ||
1035
0
         unicharset.get_isdigit(parent_b->unichar_id()))) {
1036
      // reset punc_ref for compound words
1037
0
      consistency_info->punc_ref = NO_EDGE;
1038
0
    } else {
1039
0
      bool is_apos = dict_->is_apostrophe(unichar_id);
1040
0
      bool prev_is_numalpha =
1041
0
          (parent_b != nullptr && (unicharset.get_isalpha(parent_b->unichar_id()) ||
1042
0
                                   unicharset.get_isdigit(parent_b->unichar_id())));
1043
0
      UNICHAR_ID pattern_unichar_id =
1044
0
          (unicharset.get_isalpha(unichar_id) || unicharset.get_isdigit(unichar_id) ||
1045
0
           (is_apos && prev_is_numalpha))
1046
0
              ? Dawg::kPatternUnicharID
1047
0
              : unichar_id;
1048
0
      if (consistency_info->punc_ref == NO_EDGE || pattern_unichar_id != Dawg::kPatternUnicharID ||
1049
0
          dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
1050
0
              Dawg::kPatternUnicharID) {
1051
0
        NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(), consistency_info->punc_ref);
1052
0
        consistency_info->punc_ref = (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1053
0
                                                             node, pattern_unichar_id, word_end)
1054
0
                                                       : NO_EDGE;
1055
0
        if (consistency_info->punc_ref == NO_EDGE) {
1056
0
          consistency_info->invalid_punc = true;
1057
0
        }
1058
0
      }
1059
0
    }
1060
0
  }
1061
1062
  // Update case related counters.
1063
9.07M
  if (parent_vse != nullptr && !word_end && dict_->compound_marker(unichar_id)) {
1064
    // Reset counters if we are dealing with a compound word.
1065
0
    consistency_info->num_lower = 0;
1066
0
    consistency_info->num_non_first_upper = 0;
1067
9.07M
  } else if (unicharset.get_islower(unichar_id)) {
1068
4.22M
    consistency_info->num_lower++;
1069
4.85M
  } else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {
1070
1.35M
    if (unicharset.get_isupper(parent_b->unichar_id()) || consistency_info->num_lower > 0 ||
1071
1.35M
        consistency_info->num_non_first_upper > 0) {
1072
1.13M
      consistency_info->num_non_first_upper++;
1073
1.13M
    }
1074
1.35M
  }
1075
1076
  // Initialize consistency_info->script_id (use script of unichar_id
1077
  // if it is not Common, use script id recorded by the parent otherwise).
1078
  // Set inconsistent_script to true if the script of the current unichar
1079
  // is not consistent with that of the parent.
1080
9.07M
  consistency_info->script_id = unicharset.get_script(unichar_id);
1081
  // Hiragana and Katakana can mix with Han.
1082
9.07M
  if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
1083
0
    if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1084
0
         consistency_info->script_id == unicharset.hiragana_sid()) ||
1085
0
        (unicharset.katakana_sid() != unicharset.null_sid() &&
1086
0
         consistency_info->script_id == unicharset.katakana_sid())) {
1087
0
      consistency_info->script_id = dict_->getUnicharset().han_sid();
1088
0
    }
1089
0
  }
1090
1091
9.07M
  if (parent_vse != nullptr &&
1092
9.07M
      (parent_vse->consistency_info.script_id != dict_->getUnicharset().common_sid())) {
1093
7.00M
    int parent_script_id = parent_vse->consistency_info.script_id;
1094
    // If script_id is Common, use script id of the parent instead.
1095
7.00M
    if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1096
2.41M
      consistency_info->script_id = parent_script_id;
1097
2.41M
    }
1098
7.00M
    if (consistency_info->script_id != parent_script_id) {
1099
33.1k
      consistency_info->inconsistent_script = true;
1100
33.1k
    }
1101
7.00M
  }
1102
1103
  // Update chartype-related counters.
1104
9.07M
  if (unicharset.get_isalpha(unichar_id)) {
1105
5.75M
    consistency_info->num_alphas++;
1106
5.75M
  } else if (unicharset.get_isdigit(unichar_id)) {
1107
283k
    consistency_info->num_digits++;
1108
3.03M
  } else if (!unicharset.get_ispunctuation(unichar_id)) {
1109
854k
    consistency_info->num_other++;
1110
854k
  }
1111
1112
  // Check font and spacing consistency.
1113
9.07M
  if (fontinfo_table_->size() > 0 && parent_b != nullptr) {
1114
8.12M
    int fontinfo_id = -1;
1115
8.12M
    if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1116
8.12M
        parent_b->fontinfo_id2() == b->fontinfo_id()) {
1117
1.90M
      fontinfo_id = b->fontinfo_id();
1118
6.21M
    } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1119
6.21M
               parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1120
498k
      fontinfo_id = b->fontinfo_id2();
1121
498k
    }
1122
8.12M
    if (language_model_debug_level > 1) {
1123
0
      tprintf(
1124
0
          "pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1125
0
          (parent_b->fontinfo_id() >= 0) ? fontinfo_table_->at(parent_b->fontinfo_id()).name : "",
1126
0
          (parent_b->fontinfo_id2() >= 0) ? fontinfo_table_->at(parent_b->fontinfo_id2()).name
1127
0
                                          : "",
1128
0
          (b->fontinfo_id() >= 0) ? fontinfo_table_->at(b->fontinfo_id()).name : "",
1129
0
          (fontinfo_id >= 0) ? fontinfo_table_->at(fontinfo_id).name : "",
1130
0
          (fontinfo_id >= 0) ? fontinfo_table_->at(fontinfo_id).name : "", fontinfo_id);
1131
0
    }
1132
8.12M
    if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
1133
8.12M
      bool expected_gap_found = false;
1134
8.12M
      float expected_gap = 0.0f;
1135
8.12M
      int temp_gap;
1136
8.12M
      if (fontinfo_id >= 0) { // found a common font
1137
2.40M
        ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1138
2.40M
        if (fontinfo_table_->at(fontinfo_id)
1139
2.40M
                .get_spacing(parent_b->unichar_id(), unichar_id, &temp_gap)) {
1140
1.66M
          expected_gap = temp_gap;
1141
1.66M
          expected_gap_found = true;
1142
1.66M
        }
1143
5.71M
      } else {
1144
5.71M
        consistency_info->inconsistent_font = true;
1145
        // Get an average of the expected gaps across the candidate fonts.
1146
5.71M
        int num_addends = 0;
1147
5.71M
        int temp_fid;
1148
28.5M
        for (int i = 0; i < 4; ++i) {
1149
22.8M
          if (i == 0) {
1150
5.71M
            temp_fid = parent_b->fontinfo_id();
1151
17.1M
          } else if (i == 1) {
1152
5.71M
            temp_fid = parent_b->fontinfo_id2();
1153
11.4M
          } else if (i == 2) {
1154
5.71M
            temp_fid = b->fontinfo_id();
1155
5.71M
          } else {
1156
5.71M
            temp_fid = b->fontinfo_id2();
1157
5.71M
          }
1158
22.8M
          ASSERT_HOST(temp_fid < 0 || temp_fid < fontinfo_table_->size());
1159
22.8M
          if (temp_fid >= 0 && fontinfo_table_->at(temp_fid).get_spacing(parent_b->unichar_id(),
1160
22.6M
                                                                         unichar_id, &temp_gap)) {
1161
13.0M
            expected_gap += temp_gap;
1162
13.0M
            num_addends++;
1163
13.0M
          }
1164
22.8M
        }
1165
5.71M
        if (num_addends > 0) {
1166
3.30M
          expected_gap /= static_cast<float>(num_addends);
1167
3.30M
          expected_gap_found = true;
1168
3.30M
        }
1169
5.71M
      }
1170
8.12M
      if (expected_gap_found) {
1171
4.96M
        int actual_gap = word_res->GetBlobsGap(curr_col - 1);
1172
4.96M
        if (actual_gap == 0) {
1173
913k
          consistency_info->num_inconsistent_spaces++;
1174
4.05M
        } else {
1175
4.05M
          float gap_ratio = expected_gap / actual_gap;
1176
          // TODO(rays) The gaps seem to be way off most of the time, saved by
1177
          // the error here that the ratio was compared to 1/2 (integer
1178
          // division, hence 0) when it should have been 0.5f. Find the source
1179
          // of the gaps discrepancy and put the 0.5f here in place of 0.0f.
1180
          // Test on 2476595.sj, pages 0 to 6. (In French.)
1181
4.05M
          if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1182
1.83M
            consistency_info->num_inconsistent_spaces++;
1183
1.83M
          }
1184
4.05M
        }
1185
4.96M
        if (language_model_debug_level > 1) {
1186
0
          tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
1187
0
                  unicharset.id_to_unichar(parent_b->unichar_id()), parent_b->unichar_id(),
1188
0
                  unicharset.id_to_unichar(unichar_id), unichar_id, curr_col, expected_gap,
1189
0
                  actual_gap);
1190
0
        }
1191
4.96M
      }
1192
8.12M
    }
1193
8.12M
  }
1194
9.07M
}
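The spacing check above has two parts: when the parent and child blobs share no font, the expected gap is averaged over whatever candidate fonts do know the character pair, and the result is then compared against the measured gap with a wide tolerance window. A minimal standalone sketch of those two pieces, using hypothetical names (ExpectedGap, SpacingConsistent) rather than the FontInfo::get_spacing() interface used above:

#include <vector>

// Average the expected gaps proposed by the candidate font ids; mirrors
// the num_addends loop above. Returns false when no font knows the pair.
static bool ExpectedGap(const std::vector<int> &per_font_gaps, float *expected) {
  if (per_font_gaps.empty()) {
    return false;
  }
  float sum = 0.0f;
  for (int gap : per_font_gaps) {
    sum += gap;
  }
  *expected = sum / static_cast<float>(per_font_gaps.size());
  return true;
}

// Mirrors the tolerance window above: a zero actual gap always counts as
// inconsistent; otherwise the expected/actual ratio must lie in [0, 2].
// (Per the TODO in the source, the lower bound should eventually be 0.5f.)
static bool SpacingConsistent(float expected_gap, int actual_gap) {
  if (actual_gap == 0) {
    return false;
  }
  const float gap_ratio = expected_gap / actual_gap;
  return gap_ratio >= 0.0f && gap_ratio <= 2.0f;
}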
1195
1196
9.07M
float LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse) {
1197
9.07M
  ASSERT_HOST(vse != nullptr);
1198
9.07M
  if (params_model_.Initialized()) {
1199
0
    float features[PTRAIN_NUM_FEATURE_TYPES];
1200
0
    ExtractFeaturesFromPath(*vse, features);
1201
0
    float cost = params_model_.ComputeCost(features);
1202
0
    if (language_model_debug_level > 3) {
1203
0
      tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1204
0
      if (language_model_debug_level >= 5) {
1205
0
        for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
1206
0
          tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1207
0
        }
1208
0
      }
1209
0
    }
1210
0
    return cost * vse->outline_length;
1211
9.07M
  } else {
1212
9.07M
    float adjustment = 1.0f;
1213
9.07M
    if (vse->dawg_info == nullptr || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
1214
9.07M
      adjustment += language_model_penalty_non_freq_dict_word;
1215
9.07M
    }
1216
9.07M
    if (vse->dawg_info == nullptr) {
1217
9.07M
      adjustment += language_model_penalty_non_dict_word;
1218
9.07M
      if (vse->length > language_model_min_compound_length) {
1219
5.99M
        adjustment +=
1220
5.99M
            ((vse->length - language_model_min_compound_length) * language_model_penalty_increment);
1221
5.99M
      }
1222
9.07M
    }
1223
9.07M
    if (vse->associate_stats.shape_cost > 0) {
1224
0
      adjustment += vse->associate_stats.shape_cost / static_cast<float>(vse->length);
1225
0
    }
1226
9.07M
    if (language_model_ngram_on) {
1227
0
      ASSERT_HOST(vse->ngram_info != nullptr);
1228
0
      return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1229
9.07M
    } else {
1230
9.07M
      adjustment += ComputeConsistencyAdjustment(vse->dawg_info, vse->consistency_info);
1231
9.07M
      return vse->ratings_sum * adjustment;
1232
9.07M
    }
1233
9.07M
  }
1234
9.07M
}
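When no trained params model is loaded, the adjustment above is purely multiplicative: a fixed penalty for not being a frequent dictionary word, a larger one for not being a dictionary word at all, and a per-character increment for long non-dictionary paths. A standalone sketch with the language_model_* parameters turned into plain arguments; the numeric defaults are illustrative assumptions rather than the shipped values, and the ngram, shape-cost, and consistency terms are omitted:

#include <cstdio>

struct PathInfo {
  bool in_dict = false;        // vse->dawg_info != nullptr
  bool freq_dict_word = false; // permuter == FREQ_DAWG_PERM
  int length = 0;              // number of unichars on the path
  float ratings_sum = 0.0f;    // summed classifier ratings
};

static float AdjustedCost(const PathInfo &p,
                          float penalty_non_freq = 0.1f,   // assumed default
                          float penalty_non_dict = 0.15f,  // assumed default
                          float penalty_increment = 0.01f, // assumed default
                          int min_compound_length = 3) {   // assumed default
  float adjustment = 1.0f;
  if (!p.in_dict || !p.freq_dict_word) {
    adjustment += penalty_non_freq;
  }
  if (!p.in_dict) {
    adjustment += penalty_non_dict;
    if (p.length > min_compound_length) {
      adjustment += (p.length - min_compound_length) * penalty_increment;
    }
  }
  return p.ratings_sum * adjustment; // longer non-dictionary paths cost more
}

int main() {
  // A 6-unichar non-dictionary path: 1 + 0.1 + 0.15 + 3 * 0.01 = 1.28,
  // so its summed rating is inflated by 28% before comparison.
  PathInfo p;
  p.length = 6;
  p.ratings_sum = 10.0f;
  std::printf("adjusted cost = %g\n", AdjustedCost(p)); // prints 12.8
}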
1235
1236
void LanguageModel::UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points,
1237
                                     WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
1238
399k
                                     BlamerBundle *blamer_bundle) {
1239
399k
  bool truth_path;
1240
399k
  WERD_CHOICE *word =
1241
399k
      ConstructWord(vse, word_res, &best_choice_bundle->fixpt, blamer_bundle, &truth_path);
1242
399k
  ASSERT_HOST(word != nullptr);
1243
399k
  if (dict_->stopper_debug_level >= 1) {
1244
0
    std::string word_str;
1245
0
    word->string_and_lengths(&word_str, nullptr);
1246
0
    vse->Print(word_str.c_str());
1247
0
  }
1248
399k
  if (language_model_debug_level > 0) {
1249
0
    word->print("UpdateBestChoice() constructed word");
1250
0
  }
1251
  // Record features from the current path if necessary.
1252
399k
  ParamsTrainingHypothesis curr_hyp;
1253
399k
  if (blamer_bundle != nullptr) {
1254
0
    if (vse->dawg_info != nullptr) {
1255
0
      vse->dawg_info->permuter = static_cast<PermuterType>(word->permuter());
1256
0
    }
1257
0
    ExtractFeaturesFromPath(*vse, curr_hyp.features);
1258
0
    word->string_and_lengths(&(curr_hyp.str), nullptr);
1259
0
    curr_hyp.cost = vse->cost; // record cost for error rate computations
1260
0
    if (language_model_debug_level > 0) {
1261
0
      tprintf("Raw features extracted from %s (cost=%g) [ ", curr_hyp.str.c_str(), curr_hyp.cost);
1262
0
      for (float feature : curr_hyp.features) {
1263
0
        tprintf("%g ", feature);
1264
0
      }
1265
0
      tprintf("]\n");
1266
0
    }
1267
    // Record the current hypothesis in params_training_bundle.
1268
0
    blamer_bundle->AddHypothesis(curr_hyp);
1269
0
    if (truth_path) {
1270
0
      blamer_bundle->UpdateBestRating(word->rating());
1271
0
    }
1272
0
  }
1273
399k
  if (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()) {
1274
    // The word was constructed solely for blamer_bundle->AddHypothesis, so
1275
    // we no longer need it.
1276
0
    delete word;
1277
0
    return;
1278
0
  }
1279
399k
  if (word_res->chopped_word != nullptr && !word_res->chopped_word->blobs.empty()) {
1280
399k
    word->SetScriptPositions(false, word_res->chopped_word, language_model_debug_level);
1281
399k
  }
1282
  // Update and log new raw_choice if needed.
1283
399k
  if (word_res->raw_choice == nullptr || word->rating() < word_res->raw_choice->rating()) {
1284
200k
    if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0) {
1285
0
      tprintf("Updated raw choice\n");
1286
0
    }
1287
200k
  }
1288
  // Set the modified rating for the best choice to vse->cost and log the best choice.
1289
399k
  word->set_rating(vse->cost);
1290
  // Call LogNewChoice() for the best choice from within Dict::adjust_word(),
1291
  // since it computes the adjust_factor used by the adaptation code (e.g. by
1292
  // ClassifyAdaptableWord() to compute adaptation acceptance thresholds).
1293
  // Note: the rating of the word is not adjusted.
1294
399k
  dict_->adjust_word(word, vse->dawg_info == nullptr, vse->consistency_info.xht_decision, 0.0,
1295
399k
                     false, language_model_debug_level > 0);
1296
  // Hand ownership of the word over to the word_res.
1297
399k
  if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
1298
399k
                                    dict_->stopper_debug_level >= 1, word)) {
1299
    // The word was so bad that it was deleted.
1300
46.9k
    return;
1301
46.9k
  }
1302
352k
  if (word_res->best_choice == word) {
1303
    // Word was the new best.
1304
216k
    if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
1305
216k
        AcceptablePath(*vse)) {
1306
0
      acceptable_choice_found_ = true;
1307
0
    }
1308
    // Update best_choice_bundle.
1309
216k
    best_choice_bundle->updated = true;
1310
216k
    best_choice_bundle->best_vse = vse;
1311
216k
    if (language_model_debug_level > 0) {
1312
0
      tprintf("Updated best choice\n");
1313
0
      word->print_state("New state ");
1314
0
    }
1315
    // Update hyphen state if we are dealing with a dictionary word.
1316
216k
    if (vse->dawg_info != nullptr) {
1317
0
      if (dict_->has_hyphen_end(*word)) {
1318
0
        dict_->set_hyphen_word(*word, *(dawg_args_.active_dawgs));
1319
0
      } else {
1320
0
        dict_->reset_hyphen_vars(true);
1321
0
      }
1322
0
    }
1323
1324
216k
    if (blamer_bundle != nullptr) {
1325
0
      blamer_bundle->set_best_choice_is_dict_and_top_choice(vse->dawg_info != nullptr &&
1326
0
                                                            vse->top_choice_flags);
1327
0
    }
1328
216k
  }
1329
#ifndef GRAPHICS_DISABLED
1330
  if (wordrec_display_segmentations && word_res->chopped_word != nullptr) {
1331
    word->DisplaySegmentation(word_res->chopped_word);
1332
  }
1333
#endif
1334
352k
}
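The tail of this function hands ownership of word to word_res: LogNewCookedChoice() either keeps the word, possibly making it the new best choice, or deletes it and returns false, which is why the early return above must not touch word afterwards. A minimal sketch of that contract with hypothetical stand-in types (Candidate, Result), not the WERD_RES interface:

#include <memory>
#include <vector>

struct Candidate {
  float rating = 0.0f;
};

struct Result {
  std::vector<std::unique_ptr<Candidate>> kept;
  const Candidate *best = nullptr;

  // Takes ownership. Returns false when the candidate was discarded
  // (and therefore already destroyed), like LogNewCookedChoice() above.
  bool LogNewChoice(std::unique_ptr<Candidate> cand, float max_rating) {
    if (cand->rating > max_rating) {
      return false; // the unique_ptr deletes the rejected candidate here
    }
    if (best == nullptr || cand->rating < best->rating) {
      best = cand.get();
    }
    kept.push_back(std::move(cand));
    return true;
  }
};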
1335
1336
0
void LanguageModel::ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[]) {
1337
0
  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
1338
  // Record dictionary match info.
1339
0
  int len = vse.length <= kMaxSmallWordUnichars ? 0 : vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1340
0
  if (vse.dawg_info != nullptr) {
1341
0
    int permuter = vse.dawg_info->permuter;
1342
0
    if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
1343
0
      if (vse.consistency_info.num_digits == vse.length) {
1344
0
        features[PTRAIN_DIGITS_SHORT + len] = 1.0f;
1345
0
      } else {
1346
0
        features[PTRAIN_NUM_SHORT + len] = 1.0f;
1347
0
      }
1348
0
    } else if (permuter == DOC_DAWG_PERM) {
1349
0
      features[PTRAIN_DOC_SHORT + len] = 1.0f;
1350
0
    } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
1351
0
               permuter == COMPOUND_PERM) {
1352
0
      features[PTRAIN_DICT_SHORT + len] = 1.0f;
1353
0
    } else if (permuter == FREQ_DAWG_PERM) {
1354
0
      features[PTRAIN_FREQ_SHORT + len] = 1.0f;
1355
0
    }
1356
0
  }
1357
  // Record shape cost feature (normalized by path length).
1358
0
  features[PTRAIN_SHAPE_COST_PER_CHAR] =
1359
0
      vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1360
  // Record ngram cost (normalized by the path length).
1361
0
  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0f;
1362
0
  if (vse.ngram_info != nullptr) {
1363
0
    features[PTRAIN_NGRAM_COST_PER_CHAR] =
1364
0
        vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1365
0
  }
1366
  // Record consistency-related features.
1367
  // Disabled this feature for now due to its poor performance.
1368
  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
1369
0
  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
1370
0
  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
1371
0
  features[PTRAIN_NUM_BAD_CHAR_TYPE] =
1372
0
      vse.dawg_info == nullptr ? vse.consistency_info.NumInconsistentChartype() : 0.0f;
1373
0
  features[PTRAIN_NUM_BAD_SPACING] = vse.consistency_info.NumInconsistentSpaces();
1374
  // Disabled this feature for now due to its poor performance.
1375
  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
1376
1377
  // Classifier-related features.
1378
0
  if (vse.outline_length > 0.0f) {
1379
0
    features[PTRAIN_RATING_PER_CHAR] = vse.ratings_sum / vse.outline_length;
1380
0
  } else {
1381
    // Avoid FP division by 0.
1382
0
    features[PTRAIN_RATING_PER_CHAR] = 0.0f;
1383
0
  }
1384
0
}
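The len index above buckets the word into short/medium/long before offsetting the one-hot dictionary-match features. The same bucketing in isolation; the cutoff values below are illustrative assumptions standing in for kMaxSmallWordUnichars and kMaxMediumWordUnichars from language_model.h:

enum LengthBucket { kShortWord = 0, kMediumWord = 1, kLongWord = 2 };

// Mirrors the nested ternary computing len above.
static LengthBucket BucketFor(int length,
                              int max_small = 3,    // assumed cutoff
                              int max_medium = 6) { // assumed cutoff
  return length <= max_small    ? kShortWord
         : length <= max_medium ? kMediumWord
                                : kLongWord;
}

// With these cutoffs, a 5-unichar system-dictionary word would set
// features[PTRAIN_DICT_SHORT + kMediumWord] = 1.0f.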
1385
1386
WERD_CHOICE *LanguageModel::ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res,
1387
                                          DANGERR *fixpt, BlamerBundle *blamer_bundle,
1388
399k
                                          bool *truth_path) {
1389
399k
  if (truth_path != nullptr) {
1390
399k
    *truth_path =
1391
399k
        (blamer_bundle != nullptr && vse->length == blamer_bundle->correct_segmentation_length());
1392
399k
  }
1393
399k
  BLOB_CHOICE *curr_b = vse->curr_b;
1394
399k
  ViterbiStateEntry *curr_vse = vse;
1395
1396
399k
  int i;
1397
399k
  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1398
1399
  // Re-compute the variance of the width-to-height ratios (since we now
1400
  // can compute the mean over the whole word).
1401
399k
  float full_wh_ratio_mean = 0.0f;
1402
399k
  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
1403
0
    vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
1404
0
    full_wh_ratio_mean =
1405
0
        (vse->associate_stats.full_wh_ratio_total / static_cast<float>(vse->length));
1406
0
    vse->associate_stats.full_wh_ratio_var = 0.0f;
1407
0
  }
1408
1409
  // Construct a WERD_CHOICE by tracing parent pointers.
1410
399k
  auto *word = new WERD_CHOICE(word_res->uch_set, vse->length);
1411
399k
  word->set_length(vse->length);
1412
399k
  int total_blobs = 0;
1413
2.88M
  for (i = (vse->length - 1); i >= 0; --i) {
1414
2.88M
    if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&
1415
2.88M
        !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
1416
0
      *truth_path = false;
1417
0
    }
1418
    // The number of blobs used for this choice is row - col + 1.
1419
2.88M
    int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
1420
2.88M
    total_blobs += num_blobs;
1421
2.88M
    word->set_blob_choice(i, num_blobs, curr_b);
1422
    // Update the width-to-height ratio variance. Useful for non-space-delimited
1423
    // languages to ensure that the blobs are of uniform width.
1424
    // Skip leading and trailing punctuation when computing the variance.
1425
2.88M
    if ((full_wh_ratio_mean != 0.0f &&
1426
2.88M
         ((curr_vse != vse && curr_vse->parent_vse != nullptr) ||
1427
0
          !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
1428
0
      vse->associate_stats.full_wh_ratio_var +=
1429
0
          pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1430
0
      if (language_model_debug_level > 2) {
1431
0
        tprintf("full_wh_ratio_var += (%g-%g)^2\n", full_wh_ratio_mean,
1432
0
                curr_vse->associate_stats.full_wh_ratio);
1433
0
      }
1434
0
    }
1435
1436
    // Mark the word as compound if the compound permuter was set for any of
1437
    // the unichars on the path (usually this will happen for unichars
1438
    // that are compounding operators, like "-" and "/").
1439
2.88M
    if (!compound && curr_vse->dawg_info && curr_vse->dawg_info->permuter == COMPOUND_PERM) {
1440
0
      compound = true;
1441
0
    }
1442
1443
    // Update curr_* pointers.
1444
2.88M
    curr_vse = curr_vse->parent_vse;
1445
2.88M
    if (curr_vse == nullptr) {
1446
399k
      break;
1447
399k
    }
1448
2.48M
    curr_b = curr_vse->curr_b;
1449
2.48M
  }
1450
399k
  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
1451
399k
  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
1452
  // Re-adjust shape cost to include the updated width-to-height variance.
1453
399k
  if (full_wh_ratio_mean != 0.0f) {
1454
0
    vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1455
0
  }
1456
1457
399k
  word->set_rating(vse->ratings_sum);
1458
399k
  word->set_certainty(vse->min_certainty);
1459
399k
  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
1460
399k
                      vse->consistency_info.BodyMaxXHeight());
1461
399k
  if (vse->dawg_info != nullptr) {
1462
0
    word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1463
399k
  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1464
0
    word->set_permuter(NGRAM_PERM);
1465
399k
  } else if (vse->top_choice_flags) {
1466
399k
    word->set_permuter(TOP_CHOICE_PERM);
1467
399k
  } else {
1468
0
    word->set_permuter(NO_PERM);
1469
0
  }
1470
399k
  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true, word_res->ratings));
1471
399k
  return word;
1472
399k
}
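ConstructWord() rebuilds the word back to front: each ViterbiStateEntry stores only a pointer to its parent, so the loop fills slot i = length - 1 down to 0 while chasing parent_vse links, then asserts that exactly length entries were seen. The same backtrace pattern in isolation, with hypothetical minimal types in place of ViterbiStateEntry and BLOB_CHOICE:

#include <cassert>
#include <vector>

struct Entry {
  int unichar_id = 0;
  const Entry *parent = nullptr; // nullptr at the first character
};

// Walk parent pointers from the last entry, filling slots from the end,
// like the set_blob_choice(i, ...) loop above.
static std::vector<int> Backtrace(const Entry *last, int length) {
  std::vector<int> ids(length);
  int i;
  for (i = length - 1; i >= 0; --i) {
    ids[i] = last->unichar_id;
    if (last->parent == nullptr) {
      break; // reached the first character
    }
    last = last->parent;
  }
  assert(i == 0); // chain length matched, as ASSERT_HOST(i == 0) checks above
  return ids;
}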
1473
1474
} // namespace tesseract