Coverage Report

Created: 2025-11-16 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/dict/stopper.cpp
Line
Count
Source
1
/******************************************************************************
2
 ** Filename:    stopper.cpp
3
 ** Purpose:     Stopping criteria for word classifier.
4
 ** Author:      Dan Johnson
5
 **
6
 ** (c) Copyright Hewlett-Packard Company, 1988.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 ******************************************************************************/
17
18
#include <cctype>
19
#include <cmath>
20
#include <cstdio>
21
#include <cstring>
22
23
#include "stopper.h"
24
#ifndef DISABLED_LEGACY_ENGINE
25
#  include "ambigs.h"
26
#endif
27
#include <tesseract/unichar.h>
28
#include "ccutil.h"
29
#include "dict.h"
30
#include "helpers.h"
31
#include "matchdefs.h"
32
#include "pageres.h"
33
#include "params.h"
34
#include "ratngs.h"
35
36
/*----------------------------------------------------------------------------
37
              Private Code
38
----------------------------------------------------------------------------*/
39
40
namespace tesseract {
41
42
bool Dict::AcceptableChoice(const WERD_CHOICE &best_choice,
43
393k
                            XHeightConsistencyEnum xheight_consistency) {
44
393k
  float CertaintyThreshold = stopper_nondict_certainty_base;
45
393k
  int WordSize;
46
47
393k
  if (stopper_no_acceptable_choices) {
48
0
    return false;
49
0
  }
50
51
393k
  if (best_choice.empty()) {
52
0
    return false;
53
0
  }
54
55
393k
  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
56
393k
  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
57
393k
  bool is_case_ok = case_ok(best_choice);
58
59
393k
  if (stopper_debug_level >= 1) {
60
0
    const char *xht = "UNKNOWN";
61
0
    switch (xheight_consistency) {
62
0
      case XH_GOOD:
63
0
        xht = "NORMAL";
64
0
        break;
65
0
      case XH_SUBNORMAL:
66
0
        xht = "SUBNORMAL";
67
0
        break;
68
0
      case XH_INCONSISTENT:
69
0
        xht = "INCONSISTENT";
70
0
        break;
71
0
      default:
72
0
        xht = "UNKNOWN";
73
0
    }
74
0
    tprintf("\nStopper:  %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
75
0
            best_choice.unichar_string().c_str(), (is_valid_word ? 'y' : 'n'),
76
0
            (is_case_ok ? 'y' : 'n'), xht, best_choice.min_x_height(), best_choice.max_x_height());
77
0
  }
78
  // Do not accept invalid words in PASS1.
79
393k
  if (reject_offset_ <= 0.0f && !is_valid_word) {
80
393k
    return false;
81
393k
  }
82
0
  if (is_valid_word && is_case_ok) {
83
0
    WordSize = LengthOfShortestAlphaRun(best_choice);
84
0
    WordSize -= stopper_smallword_size;
85
0
    if (WordSize < 0) {
86
0
      WordSize = 0;
87
0
    }
88
0
    CertaintyThreshold += WordSize * stopper_certainty_per_char;
89
0
  }
90
91
0
  if (stopper_debug_level >= 1) {
92
0
    tprintf("Stopper:  Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
93
0
            best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
94
0
  }
95
96
0
  if (no_dang_ambigs && best_choice.certainty() > CertaintyThreshold &&
97
0
      xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) {
98
0
    return true;
99
0
  } else {
100
0
    if (stopper_debug_level >= 1) {
101
0
      tprintf(
102
0
          "AcceptableChoice() returned false"
103
0
          " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
104
0
          no_dang_ambigs, best_choice.certainty(), CertaintyThreshold,
105
0
          UniformCertainties(best_choice));
106
0
    }
107
0
    return false;
108
0
  }
109
0
}
110
111
241k
/// Decides whether the best choice stored in word is acceptable as a final
/// result.
///
/// Returns false when there is no best choice, when it is empty, or when
/// word->best_choices is not a singleton. Otherwise the certainty of the best
/// choice is compared against stopper_nondict_certainty_base - reject_offset_,
/// raised for valid, correctly cased words by the number of characters in the
/// shortest alpha run beyond stopper_smallword_size. Nothing is accepted while
/// stopper_no_acceptable_choices is set.
///
/// @param word  recognition result whose best choice is examined.
/// @return true if the result is accepted, false if it is rejected.
bool Dict::AcceptableResult(WERD_RES *word) const {
  if (word->best_choice == nullptr) {
    return false;
  }
  auto &best = *word->best_choice;
  const bool debugging = stopper_debug_level >= 1;

  if (debugging) {
    tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
            best.debug_string().c_str(), (valid_word(best) ? 'y' : 'n'),
            (case_ok(best) ? 'y' : 'n'), best.dangerous_ambig_found() ? 'n' : 'y',
            word->best_choices.singleton() ? 'n' : 'y');
  }

  if (best.empty() || !word->best_choices.singleton()) {
    return false;
  }

  float threshold = stopper_nondict_certainty_base - reject_offset_;
  if (valid_word(best) && case_ok(best)) {
    // Only the characters beyond the small-word size move the threshold.
    int extra_chars = LengthOfShortestAlphaRun(best);
    extra_chars -= stopper_smallword_size;
    if (extra_chars < 0) {
      extra_chars = 0;
    }
    threshold += extra_chars * stopper_certainty_per_char;
  }

  if (debugging) {
    tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ", best.certainty(), threshold);
  }

  const bool accepted = best.certainty() > threshold && !stopper_no_acceptable_choices;
  if (debugging) {
    if (accepted) {
      tprintf("ACCEPTED\n");
    } else {
      tprintf("REJECTED\n");
    }
  }
  return accepted;
}
155
156
#if !defined(DISABLED_LEGACY_ENGINE)
157
158
bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_replaceable,
159
817k
                            MATRIX *ratings) {
160
817k
  if (stopper_debug_level > 2) {
161
0
    tprintf("\nRunning NoDangerousAmbig() for %s\n", best_choice->debug_string().c_str());
162
0
  }
163
164
  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
165
  // for each unichar id in BestChoice.
166
817k
  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
167
817k
  bool ambigs_found = false;
168
  // For each position in best_choice:
169
  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
170
  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
171
  // -- look for ambiguities corresponding to wrong_ngram in the list while
172
  //    adding the following unichar_ids from best_choice to wrong_ngram
173
  //
174
  // Repeat the above procedure twice: first time look through
175
  // ambigs to be replaced and replace all the ambiguities found;
176
  // second time look through dangerous ambiguities and construct
177
  // ambig_blob_choices with fake a blob choice for each ambiguity
178
  // and pass them to dawg_permute_and_select() to search for
179
  // ambiguous words in the dictionaries.
180
  //
181
  // Note that during the execution of the for loop (on the first pass)
182
  // if replacements are made the length of best_choice might change.
183
2.45M
  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
184
1.63M
    bool replace = (fix_replaceable && pass == 0);
185
1.63M
    const UnicharAmbigsVector &table =
186
1.63M
        replace ? getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
187
1.63M
    if (!replace) {
188
      // Initialize ambig_blob_choices with lists containing a single
189
      // unichar id for the corresponding position in best_choice.
190
      // best_choice consisting from only the original letters will
191
      // have a rating of 0.0.
192
7.39M
      for (unsigned i = 0; i < best_choice->length(); ++i) {
193
6.58M
        auto *lst = new BLOB_CHOICE_LIST();
194
6.58M
        BLOB_CHOICE_IT lst_it(lst);
195
        // TODO(rays/antonova) Put real xheights and y shifts here.
196
6.58M
        lst_it.add_to_end(
197
6.58M
            new BLOB_CHOICE(best_choice->unichar_id(i), 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
198
6.58M
        ambig_blob_choices.push_back(lst);
199
6.58M
      }
200
817k
    }
201
1.63M
    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
202
1.63M
    int wrong_ngram_index;
203
1.63M
    int blob_index = 0;
204
14.7M
    for (unsigned i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) {
205
13.1M
      auto curr_unichar_id = best_choice->unichar_id(i);
206
13.1M
      if (stopper_debug_level > 2) {
207
0
        tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous",
208
0
                getUnicharset().debug_str(curr_unichar_id).c_str());
209
0
      }
210
13.1M
      int num_wrong_blobs = best_choice->state(i);
211
13.1M
      wrong_ngram_index = 0;
212
13.1M
      wrong_ngram[wrong_ngram_index] = curr_unichar_id;
213
13.1M
      if (curr_unichar_id == INVALID_UNICHAR_ID || static_cast<size_t>(curr_unichar_id) >= table.size() ||
214
13.1M
          table[curr_unichar_id] == nullptr) {
215
7.60M
        continue; // there is no ambig spec for this unichar id
216
7.60M
      }
217
5.55M
      AmbigSpec_IT spec_it(table[curr_unichar_id]);
218
584M
      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
219
582M
        const AmbigSpec *ambig_spec = spec_it.data();
220
582M
        wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID;
221
582M
        int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram);
222
582M
        if (stopper_debug_level > 2) {
223
0
          tprintf("candidate ngram: ");
224
0
          UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
225
0
          tprintf("current ngram from spec: ");
226
0
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
227
0
          tprintf("comparison result: %d\n", compare);
228
0
        }
229
582M
        if (compare == 0) {
230
          // Record the place where we found an ambiguity.
231
2.09M
          if (fixpt != nullptr) {
232
2.09M
            UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
233
2.09M
            fixpt->push_back(DANGERR_INFO(blob_index, blob_index + num_wrong_blobs, replace,
234
2.09M
                                          getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
235
2.09M
                                          leftmost_id));
236
2.09M
            if (stopper_debug_level > 1) {
237
0
              tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index, blob_index + num_wrong_blobs, false,
238
0
                      getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
239
0
                      getUnicharset().id_to_unichar(leftmost_id));
240
0
            }
241
2.09M
          }
242
243
2.09M
          if (replace) {
244
341k
            if (stopper_debug_level > 2) {
245
0
              tprintf("replace ambiguity with %s : ",
246
0
                      getUnicharset().id_to_unichar(ambig_spec->correct_ngram_id));
247
0
              UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());
248
0
            }
249
341k
            ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice,
250
341k
                         ratings);
251
1.75M
          } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
252
            // We found dang ambig - update ambig_blob_choices.
253
1.75M
            if (stopper_debug_level > 2) {
254
0
              tprintf("found ambiguity: ");
255
0
              UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());
256
0
            }
257
1.75M
            ambigs_found = true;
258
3.85M
            for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) {
259
              // Add a blob choice for the corresponding fragment of the
260
              // ambiguity. These fake blob choices are initialized with
261
              // negative ratings (which are not possible for real blob
262
              // choices), so that dawg_permute_and_select() considers any
263
              // word not consisting of only the original letters a better
264
              // choice and stops searching for alternatives once such a
265
              // choice is found.
266
2.10M
              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i + tmp_index]);
267
2.10M
              bc_it.add_to_end(new BLOB_CHOICE(ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
268
2.10M
                                               -1, 0, 1, 0, BCC_AMBIG));
269
2.10M
            }
270
1.75M
          }
271
2.09M
          spec_it.forward();
272
580M
        } else if (compare == -1) {
273
9.74M
          unsigned next_index;
274
9.74M
          if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size &&
275
6.84M
              ((next_index = wrong_ngram_index + 1 + i) < best_choice->length())) {
276
            // Add the next unichar id to wrong_ngram and keep looking for
277
            // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
278
5.94M
            wrong_ngram[++wrong_ngram_index] = best_choice->unichar_id(next_index);
279
5.94M
            num_wrong_blobs += best_choice->state(next_index);
280
5.94M
          } else {
281
3.79M
            break; // no more matching ambigs in this AMBIG_SPEC_LIST
282
3.79M
          }
283
570M
        } else {
284
570M
          spec_it.forward();
285
570M
        }
286
582M
      } // end searching AmbigSpec_LIST
287
5.55M
    }   // end searching best_choice
288
1.63M
  }     // end searching replace and dangerous ambigs
289
290
  // If any ambiguities were found permute the constructed ambig_blob_choices
291
  // to see if an alternative dictionary word can be found.
292
817k
  if (ambigs_found) {
293
401k
    if (stopper_debug_level > 2) {
294
0
      tprintf("\nResulting ambig_blob_choices:\n");
295
0
      for (unsigned i = 0; i < ambig_blob_choices.size(); ++i) {
296
0
        print_ratings_list("", ambig_blob_choices.at(i), getUnicharset());
297
0
        tprintf("\n");
298
0
      }
299
0
    }
300
401k
    WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
301
401k
    ambigs_found = (alt_word->rating() < 0.0);
302
401k
    if (ambigs_found) {
303
0
      if (stopper_debug_level >= 1) {
304
0
        tprintf("Stopper: Possible ambiguous word = %s\n", alt_word->debug_string().c_str());
305
0
      }
306
0
      if (fixpt != nullptr) {
307
        // Note: Currently character choices combined from fragments can only
308
        // be generated by NoDangrousAmbigs(). This code should be updated if
309
        // the capability to produce classifications combined from character
310
        // fragments is added to other functions.
311
0
        int orig_i = 0;
312
0
        for (unsigned i = 0; i < alt_word->length(); ++i) {
313
0
          const UNICHARSET &uchset = getUnicharset();
314
0
          bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i));
315
0
          UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
316
0
          if (replacement_is_ngram) {
317
            // we have to extract the leftmost unichar from the ngram.
318
0
            const char *str = uchset.id_to_unichar(leftmost_id);
319
0
            int step = uchset.step(str);
320
0
            if (step) {
321
0
              leftmost_id = uchset.unichar_to_id(str, step);
322
0
            }
323
0
          }
324
0
          int end_i = orig_i + alt_word->state(i);
325
0
          if (alt_word->state(i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) {
326
            // Compute proper blob indices.
327
0
            int blob_start = 0;
328
0
            for (int j = 0; j < orig_i; ++j) {
329
0
              blob_start += best_choice->state(j);
330
0
            }
331
0
            int blob_end = blob_start;
332
0
            for (int j = orig_i; j < end_i; ++j) {
333
0
              blob_end += best_choice->state(j);
334
0
            }
335
0
            fixpt->push_back(
336
0
                DANGERR_INFO(blob_start, blob_end, true, replacement_is_ngram, leftmost_id));
337
0
            if (stopper_debug_level > 1) {
338
0
              tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i, true,
339
0
                      replacement_is_ngram, uchset.id_to_unichar(leftmost_id));
340
0
            }
341
0
          }
342
0
          orig_i += alt_word->state(i);
343
0
        }
344
0
      }
345
0
    }
346
401k
    delete alt_word;
347
401k
  }
348
817k
  if (output_ambig_words_file_ != nullptr) {
349
0
    fprintf(output_ambig_words_file_, "\n");
350
0
  }
351
352
6.58M
  for (auto data : ambig_blob_choices) {
353
6.58M
    delete data;
354
6.58M
  }
355
817k
  return !ambigs_found;
356
817k
}
357
358
0
/// Has an empty body and performs no work.
void Dict::EndDangerousAmbigs() {
}
359
360
#endif // !defined(DISABLED_LEGACY_ENGINE)
361
362
45.9k
void Dict::SetupStopperPass1() {
363
45.9k
  reject_offset_ = 0.0;
364
45.9k
}
365
366
67.2k
void Dict::SetupStopperPass2() {
367
67.2k
  reject_offset_ = stopper_phase2_certainty_rejection_offset;
368
67.2k
}
369
370
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
371
341k
                        UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings) {
372
341k
  int num_blobs_to_replace = 0;
373
341k
  int begin_blob_index = 0;
374
341k
  int i;
375
  // Rating and certainty for the new BLOB_CHOICE are derived from the
376
  // replaced choices.
377
341k
  float new_rating = 0.0f;
378
341k
  float new_certainty = 0.0f;
379
341k
  BLOB_CHOICE *old_choice = nullptr;
380
2.70M
  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
381
2.36M
    if (i >= wrong_ngram_begin_index) {
382
694k
      int num_blobs = werd_choice->state(i);
383
694k
      int col = begin_blob_index + num_blobs_to_replace;
384
694k
      int row = col + num_blobs - 1;
385
694k
      BLOB_CHOICE_LIST *choices = ratings->get(col, row);
386
694k
      ASSERT_HOST(choices != nullptr);
387
694k
      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
388
694k
      ASSERT_HOST(old_choice != nullptr);
389
694k
      new_rating += old_choice->rating();
390
694k
      new_certainty += old_choice->certainty();
391
694k
      num_blobs_to_replace += num_blobs;
392
1.66M
    } else {
393
1.66M
      begin_blob_index += werd_choice->state(i);
394
1.66M
    }
395
2.36M
  }
396
341k
  new_certainty /= wrong_ngram_size;
397
  // If there is no entry in the ratings matrix, add it.
398
341k
  MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1);
399
341k
  if (!coord.Valid(*ratings)) {
400
2.07k
    ratings->IncreaseBandSize(coord.row - coord.col + 1);
401
2.07k
  }
402
341k
  if (ratings->get(coord.col, coord.row) == nullptr) {
403
66.6k
    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
404
66.6k
  }
405
341k
  BLOB_CHOICE_LIST *new_choices = ratings->get(coord.col, coord.row);
406
341k
  BLOB_CHOICE *choice = FindMatchingChoice(correct_ngram_id, new_choices);
407
341k
  if (choice != nullptr) {
408
    // Already there. Upgrade if new rating better.
409
262k
    if (new_rating < choice->rating()) {
410
9.16k
      choice->set_rating(new_rating);
411
9.16k
    }
412
262k
    if (new_certainty < choice->certainty()) {
413
3.47k
      choice->set_certainty(new_certainty);
414
3.47k
    }
415
    // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
416
262k
  } else {
417
    // Need a new choice with the correct_ngram_id.
418
79.5k
    choice = new BLOB_CHOICE(*old_choice);
419
79.5k
    choice->set_unichar_id(correct_ngram_id);
420
79.5k
    choice->set_rating(new_rating);
421
79.5k
    choice->set_certainty(new_certainty);
422
79.5k
    choice->set_classifier(BCC_AMBIG);
423
79.5k
    choice->set_matrix_cell(coord.col, coord.row);
424
79.5k
    BLOB_CHOICE_IT it(new_choices);
425
79.5k
    it.add_to_end(choice);
426
79.5k
  }
427
  // Remove current unichar from werd_choice. On the last iteration
428
  // set the correct replacement unichar instead of removing a unichar.
429
1.03M
  for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) {
430
694k
    if (replaced_count + 1 == wrong_ngram_size) {
431
341k
      werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice);
432
352k
    } else {
433
352k
      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
434
352k
    }
435
694k
  }
436
341k
  if (stopper_debug_level >= 1) {
437
0
    werd_choice->print("ReplaceAmbig() ");
438
0
    tprintf("Modified blob_choices: ");
439
0
    print_ratings_list("\n", new_choices, getUnicharset());
440
0
  }
441
341k
}
442
443
82.7k
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
444
82.7k
  int shortest = INT32_MAX;
445
82.7k
  int curr_len = 0;
446
182k
  for (unsigned w = 0; w < WordChoice.length(); ++w) {
447
99.3k
    if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
448
47.8k
      curr_len++;
449
51.4k
    } else if (curr_len > 0) {
450
749
      if (curr_len < shortest) {
451
748
        shortest = curr_len;
452
748
      }
453
749
      curr_len = 0;
454
749
    }
455
99.3k
  }
456
82.7k
  if (curr_len > 0 && curr_len < shortest) {
457
33.7k
    shortest = curr_len;
458
49.0k
  } else if (shortest == INT32_MAX) {
459
48.3k
    shortest = 0;
460
48.3k
  }
461
82.7k
  return shortest;
462
82.7k
}
463
464
0
int Dict::UniformCertainties(const WERD_CHOICE &word) {
465
0
  float Certainty;
466
0
  float WorstCertainty = FLT_MAX;
467
0
  float CertaintyThreshold;
468
0
  double TotalCertainty;
469
0
  double TotalCertaintySquared;
470
0
  double Variance;
471
0
  float Mean, StdDev;
472
0
  int word_length = word.length();
473
474
0
  if (word_length < 3) {
475
0
    return true;
476
0
  }
477
478
0
  TotalCertainty = TotalCertaintySquared = 0.0;
479
0
  for (int i = 0; i < word_length; ++i) {
480
0
    Certainty = word.certainty(i);
481
0
    TotalCertainty += Certainty;
482
0
    TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
483
0
    if (Certainty < WorstCertainty) {
484
0
      WorstCertainty = Certainty;
485
0
    }
486
0
  }
487
488
  // Subtract off worst certainty from statistics.
489
0
  word_length--;
490
0
  TotalCertainty -= WorstCertainty;
491
0
  TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
492
493
0
  Mean = TotalCertainty / word_length;
494
0
  Variance = ((word_length * TotalCertaintySquared - TotalCertainty * TotalCertainty) /
495
0
              (word_length * (word_length - 1)));
496
0
  if (Variance < 0.0) {
497
0
    Variance = 0.0;
498
0
  }
499
0
  StdDev = sqrt(Variance);
500
501
0
  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
502
0
  if (CertaintyThreshold > stopper_nondict_certainty_base) {
503
0
    CertaintyThreshold = stopper_nondict_certainty_base;
504
0
  }
505
506
0
  if (word.certainty() < CertaintyThreshold) {
507
0
    if (stopper_debug_level >= 1) {
508
0
      tprintf(
509
0
          "Stopper: Non-uniform certainty = %4.1f"
510
0
          " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
511
0
          word.certainty(), Mean, StdDev, CertaintyThreshold);
512
0
    }
513
0
    return false;
514
0
  } else {
515
0
    return true;
516
0
  }
517
0
}
518
519
} // namespace tesseract