/src/tesseract/src/dict/stopper.cpp
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | ** Filename: stopper.c |
3 | | ** Purpose: Stopping criteria for word classifier. |
4 | | ** Author: Dan Johnson |
5 | | ** |
6 | | ** (c) Copyright Hewlett-Packard Company, 1988. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | ******************************************************************************/ |
17 | | |
18 | | #include <cctype> |
19 | | #include <cmath> |
20 | | #include <cstdio> |
21 | | #include <cstring> |
22 | | |
23 | | #include "stopper.h" |
24 | | #ifndef DISABLED_LEGACY_ENGINE |
25 | | # include "ambigs.h" |
26 | | #endif |
27 | | #include <tesseract/unichar.h> |
28 | | #include "ccutil.h" |
29 | | #include "dict.h" |
30 | | #include "helpers.h" |
31 | | #include "matchdefs.h" |
32 | | #include "pageres.h" |
33 | | #include "params.h" |
34 | | #include "ratngs.h" |
35 | | |
36 | | /*---------------------------------------------------------------------------- |
37 | | Private Code |
38 | | ----------------------------------------------------------------------------*/ |
39 | | |
40 | | namespace tesseract { |
41 | | |
42 | | bool Dict::AcceptableChoice(const WERD_CHOICE &best_choice, |
43 | 393k | XHeightConsistencyEnum xheight_consistency) { |
44 | 393k | float CertaintyThreshold = stopper_nondict_certainty_base; |
45 | 393k | int WordSize; |
46 | | |
47 | 393k | if (stopper_no_acceptable_choices) { |
48 | 0 | return false; |
49 | 0 | } |
50 | | |
51 | 393k | if (best_choice.empty()) { |
52 | 0 | return false; |
53 | 0 | } |
54 | | |
55 | 393k | bool no_dang_ambigs = !best_choice.dangerous_ambig_found(); |
56 | 393k | bool is_valid_word = valid_word_permuter(best_choice.permuter(), false); |
57 | 393k | bool is_case_ok = case_ok(best_choice); |
58 | | |
59 | 393k | if (stopper_debug_level >= 1) { |
60 | 0 | const char *xht = "UNKNOWN"; |
61 | 0 | switch (xheight_consistency) { |
62 | 0 | case XH_GOOD: |
63 | 0 | xht = "NORMAL"; |
64 | 0 | break; |
65 | 0 | case XH_SUBNORMAL: |
66 | 0 | xht = "SUBNORMAL"; |
67 | 0 | break; |
68 | 0 | case XH_INCONSISTENT: |
69 | 0 | xht = "INCONSISTENT"; |
70 | 0 | break; |
71 | 0 | default: |
72 | 0 | xht = "UNKNOWN"; |
73 | 0 | } |
74 | 0 | tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n", |
75 | 0 | best_choice.unichar_string().c_str(), (is_valid_word ? 'y' : 'n'), |
76 | 0 | (is_case_ok ? 'y' : 'n'), xht, best_choice.min_x_height(), best_choice.max_x_height()); |
77 | 0 | } |
78 | | // Do not accept invalid words in PASS1. |
79 | 393k | if (reject_offset_ <= 0.0f && !is_valid_word) { |
80 | 393k | return false; |
81 | 393k | } |
82 | 0 | if (is_valid_word && is_case_ok) { |
83 | 0 | WordSize = LengthOfShortestAlphaRun(best_choice); |
84 | 0 | WordSize -= stopper_smallword_size; |
85 | 0 | if (WordSize < 0) { |
86 | 0 | WordSize = 0; |
87 | 0 | } |
88 | 0 | CertaintyThreshold += WordSize * stopper_certainty_per_char; |
89 | 0 | } |
90 | |
91 | 0 | if (stopper_debug_level >= 1) { |
92 | 0 | tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n", |
93 | 0 | best_choice.rating(), best_choice.certainty(), CertaintyThreshold); |
94 | 0 | } |
95 | |
96 | 0 | if (no_dang_ambigs && best_choice.certainty() > CertaintyThreshold && |
97 | 0 | xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) { |
98 | 0 | return true; |
99 | 0 | } else { |
100 | 0 | if (stopper_debug_level >= 1) { |
101 | 0 | tprintf( |
102 | 0 | "AcceptableChoice() returned false" |
103 | 0 | " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n", |
104 | 0 | no_dang_ambigs, best_choice.certainty(), CertaintyThreshold, |
105 | 0 | UniformCertainties(best_choice)); |
106 | 0 | } |
107 | 0 | return false; |
108 | 0 | } |
109 | 0 | } |
110 | | |
111 | 241k | bool Dict::AcceptableResult(WERD_RES *word) const { |
112 | 241k | if (word->best_choice == nullptr) { |
113 | 0 | return false; |
114 | 0 | } |
115 | 241k | float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_; |
116 | 241k | int WordSize; |
117 | | |
118 | 241k | if (stopper_debug_level >= 1) { |
119 | 0 | tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n", |
120 | 0 | word->best_choice->debug_string().c_str(), (valid_word(*word->best_choice) ? 'y' : 'n'), |
121 | 0 | (case_ok(*word->best_choice) ? 'y' : 'n'), |
122 | 0 | word->best_choice->dangerous_ambig_found() ? 'n' : 'y', |
123 | 0 | word->best_choices.singleton() ? 'n' : 'y'); |
124 | 0 | } |
125 | | |
126 | 241k | if (word->best_choice->empty() || !word->best_choices.singleton()) { |
127 | 48.3k | return false; |
128 | 48.3k | } |
129 | 192k | if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) { |
130 | 82.7k | WordSize = LengthOfShortestAlphaRun(*word->best_choice); |
131 | 82.7k | WordSize -= stopper_smallword_size; |
132 | 82.7k | if (WordSize < 0) { |
133 | 70.3k | WordSize = 0; |
134 | 70.3k | } |
135 | 82.7k | CertaintyThreshold += WordSize * stopper_certainty_per_char; |
136 | 82.7k | } |
137 | | |
138 | 192k | if (stopper_debug_level >= 1) { |
139 | 0 | tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", word->best_choice->certainty(), |
140 | 0 | CertaintyThreshold); |
141 | 0 | } |
142 | | |
143 | 192k | if (word->best_choice->certainty() > CertaintyThreshold && !stopper_no_acceptable_choices) { |
144 | 14.6k | if (stopper_debug_level >= 1) { |
145 | 0 | tprintf("ACCEPTED\n"); |
146 | 0 | } |
147 | 14.6k | return true; |
148 | 178k | } else { |
149 | 178k | if (stopper_debug_level >= 1) { |
150 | 0 | tprintf("REJECTED\n"); |
151 | 0 | } |
152 | 178k | return false; |
153 | 178k | } |
154 | 192k | } |
155 | | |
156 | | #if !defined(DISABLED_LEGACY_ENGINE) |
157 | | |
158 | | bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_replaceable, |
159 | 817k | MATRIX *ratings) { |
160 | 817k | if (stopper_debug_level > 2) { |
161 | 0 | tprintf("\nRunning NoDangerousAmbig() for %s\n", best_choice->debug_string().c_str()); |
162 | 0 | } |
163 | | |
164 | | // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities |
165 | | // for each unichar id in BestChoice. |
166 | 817k | BLOB_CHOICE_LIST_VECTOR ambig_blob_choices; |
167 | 817k | bool ambigs_found = false; |
168 | | // For each position in best_choice: |
169 | | // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i] |
170 | | // -- initialize wrong_ngram with a single unichar_id at best_choice[i] |
171 | | // -- look for ambiguities corresponding to wrong_ngram in the list while |
172 | | // adding the following unichar_ids from best_choice to wrong_ngram |
173 | | // |
174 | | // Repeat the above procedure twice: first time look through |
175 | | // ambigs to be replaced and replace all the ambiguities found; |
176 | | // second time look through dangerous ambiguities and construct |
177 | | // ambig_blob_choices with a fake blob choice for each ambiguity |
178 | | // and pass them to dawg_permute_and_select() to search for |
179 | | // ambiguous words in the dictionaries. |
180 | | // |
181 | | // Note that during the execution of the for loop (on the first pass) |
182 | | // if replacements are made the length of best_choice might change. |
183 | 2.45M | for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) { |
184 | 1.63M | bool replace = (fix_replaceable && pass == 0); |
185 | 1.63M | const UnicharAmbigsVector &table = |
186 | 1.63M | replace ? getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs(); |
187 | 1.63M | if (!replace) { |
188 | | // Initialize ambig_blob_choices with lists containing a single |
189 | | // unichar id for the corresponding position in best_choice. |
190 | | // best_choice consisting of only the original letters will |
191 | | // have a rating of 0.0. |
192 | 7.39M | for (unsigned i = 0; i < best_choice->length(); ++i) { |
193 | 6.58M | auto *lst = new BLOB_CHOICE_LIST(); |
194 | 6.58M | BLOB_CHOICE_IT lst_it(lst); |
195 | | // TODO(rays/antonova) Put real xheights and y shifts here. |
196 | 6.58M | lst_it.add_to_end( |
197 | 6.58M | new BLOB_CHOICE(best_choice->unichar_id(i), 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG)); |
198 | 6.58M | ambig_blob_choices.push_back(lst); |
199 | 6.58M | } |
200 | 817k | } |
201 | 1.63M | UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; |
202 | 1.63M | int wrong_ngram_index; |
203 | 1.63M | int blob_index = 0; |
204 | 14.7M | for (unsigned i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) { |
205 | 13.1M | auto curr_unichar_id = best_choice->unichar_id(i); |
206 | 13.1M | if (stopper_debug_level > 2) { |
207 | 0 | tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous", |
208 | 0 | getUnicharset().debug_str(curr_unichar_id).c_str()); |
209 | 0 | } |
210 | 13.1M | int num_wrong_blobs = best_choice->state(i); |
211 | 13.1M | wrong_ngram_index = 0; |
212 | 13.1M | wrong_ngram[wrong_ngram_index] = curr_unichar_id; |
213 | 13.1M | if (curr_unichar_id == INVALID_UNICHAR_ID || static_cast<size_t>(curr_unichar_id) >= table.size() || |
214 | 13.1M | table[curr_unichar_id] == nullptr) { |
215 | 7.60M | continue; // there is no ambig spec for this unichar id |
216 | 7.60M | } |
217 | 5.55M | AmbigSpec_IT spec_it(table[curr_unichar_id]); |
218 | 584M | for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) { |
219 | 582M | const AmbigSpec *ambig_spec = spec_it.data(); |
220 | 582M | wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID; |
221 | 582M | int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram); |
222 | 582M | if (stopper_debug_level > 2) { |
223 | 0 | tprintf("candidate ngram: "); |
224 | 0 | UnicharIdArrayUtils::print(wrong_ngram, getUnicharset()); |
225 | 0 | tprintf("current ngram from spec: "); |
226 | 0 | UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset()); |
227 | 0 | tprintf("comparison result: %d\n", compare); |
228 | 0 | } |
229 | 582M | if (compare == 0) { |
230 | | // Record the place where we found an ambiguity. |
231 | 2.09M | if (fixpt != nullptr) { |
232 | 2.09M | UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0]; |
233 | 2.09M | fixpt->push_back(DANGERR_INFO(blob_index, blob_index + num_wrong_blobs, replace, |
234 | 2.09M | getUnicharset().get_isngram(ambig_spec->correct_ngram_id), |
235 | 2.09M | leftmost_id)); |
236 | 2.09M | if (stopper_debug_level > 1) { |
237 | 0 | tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index, blob_index + num_wrong_blobs, false, |
238 | 0 | getUnicharset().get_isngram(ambig_spec->correct_ngram_id), |
239 | 0 | getUnicharset().id_to_unichar(leftmost_id)); |
240 | 0 | } |
241 | 2.09M | } |
242 | | |
243 | 2.09M | if (replace) { |
244 | 341k | if (stopper_debug_level > 2) { |
245 | 0 | tprintf("replace ambiguity with %s : ", |
246 | 0 | getUnicharset().id_to_unichar(ambig_spec->correct_ngram_id)); |
247 | 0 | UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset()); |
248 | 0 | } |
249 | 341k | ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice, |
250 | 341k | ratings); |
251 | 1.75M | } else if (i > 0 || ambig_spec->type != CASE_AMBIG) { |
252 | | // We found dang ambig - update ambig_blob_choices. |
253 | 1.75M | if (stopper_debug_level > 2) { |
254 | 0 | tprintf("found ambiguity: "); |
255 | 0 | UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset()); |
256 | 0 | } |
257 | 1.75M | ambigs_found = true; |
258 | 3.85M | for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) { |
259 | | // Add a blob choice for the corresponding fragment of the |
260 | | // ambiguity. These fake blob choices are initialized with |
261 | | // negative ratings (which are not possible for real blob |
262 | | // choices), so that dawg_permute_and_select() considers any |
263 | | // word not consisting of only the original letters a better |
264 | | // choice and stops searching for alternatives once such a |
265 | | // choice is found. |
266 | 2.10M | BLOB_CHOICE_IT bc_it(ambig_blob_choices[i + tmp_index]); |
267 | 2.10M | bc_it.add_to_end(new BLOB_CHOICE(ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, |
268 | 2.10M | -1, 0, 1, 0, BCC_AMBIG)); |
269 | 2.10M | } |
270 | 1.75M | } |
271 | 2.09M | spec_it.forward(); |
272 | 580M | } else if (compare == -1) { |
273 | 9.74M | unsigned next_index; |
274 | 9.74M | if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size && |
275 | 6.84M | ((next_index = wrong_ngram_index + 1 + i) < best_choice->length())) { |
276 | | // Add the next unichar id to wrong_ngram and keep looking for |
277 | | // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST. |
278 | 5.94M | wrong_ngram[++wrong_ngram_index] = best_choice->unichar_id(next_index); |
279 | 5.94M | num_wrong_blobs += best_choice->state(next_index); |
280 | 5.94M | } else { |
281 | 3.79M | break; // no more matching ambigs in this AMBIG_SPEC_LIST |
282 | 3.79M | } |
283 | 570M | } else { |
284 | 570M | spec_it.forward(); |
285 | 570M | } |
286 | 582M | } // end searching AmbigSpec_LIST |
287 | 5.55M | } // end searching best_choice |
288 | 1.63M | } // end searching replace and dangerous ambigs |
289 | | |
290 | | // If any ambiguities were found permute the constructed ambig_blob_choices |
291 | | // to see if an alternative dictionary word can be found. |
292 | 817k | if (ambigs_found) { |
293 | 401k | if (stopper_debug_level > 2) { |
294 | 0 | tprintf("\nResulting ambig_blob_choices:\n"); |
295 | 0 | for (unsigned i = 0; i < ambig_blob_choices.size(); ++i) { |
296 | 0 | print_ratings_list("", ambig_blob_choices.at(i), getUnicharset()); |
297 | 0 | tprintf("\n"); |
298 | 0 | } |
299 | 0 | } |
300 | 401k | WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0); |
301 | 401k | ambigs_found = (alt_word->rating() < 0.0); |
302 | 401k | if (ambigs_found) { |
303 | 0 | if (stopper_debug_level >= 1) { |
304 | 0 | tprintf("Stopper: Possible ambiguous word = %s\n", alt_word->debug_string().c_str()); |
305 | 0 | } |
306 | 0 | if (fixpt != nullptr) { |
307 | | // Note: Currently character choices combined from fragments can only |
308 | | // be generated by NoDangerousAmbig(). This code should be updated if |
309 | | // the capability to produce classifications combined from character |
310 | | // fragments is added to other functions. |
311 | 0 | int orig_i = 0; |
312 | 0 | for (unsigned i = 0; i < alt_word->length(); ++i) { |
313 | 0 | const UNICHARSET &uchset = getUnicharset(); |
314 | 0 | bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i)); |
315 | 0 | UNICHAR_ID leftmost_id = alt_word->unichar_id(i); |
316 | 0 | if (replacement_is_ngram) { |
317 | | // we have to extract the leftmost unichar from the ngram. |
318 | 0 | const char *str = uchset.id_to_unichar(leftmost_id); |
319 | 0 | int step = uchset.step(str); |
320 | 0 | if (step) { |
321 | 0 | leftmost_id = uchset.unichar_to_id(str, step); |
322 | 0 | } |
323 | 0 | } |
324 | 0 | int end_i = orig_i + alt_word->state(i); |
325 | 0 | if (alt_word->state(i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) { |
326 | | // Compute proper blob indices. |
327 | 0 | int blob_start = 0; |
328 | 0 | for (int j = 0; j < orig_i; ++j) { |
329 | 0 | blob_start += best_choice->state(j); |
330 | 0 | } |
331 | 0 | int blob_end = blob_start; |
332 | 0 | for (int j = orig_i; j < end_i; ++j) { |
333 | 0 | blob_end += best_choice->state(j); |
334 | 0 | } |
335 | 0 | fixpt->push_back( |
336 | 0 | DANGERR_INFO(blob_start, blob_end, true, replacement_is_ngram, leftmost_id)); |
337 | 0 | if (stopper_debug_level > 1) { |
338 | 0 | tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i, true, |
339 | 0 | replacement_is_ngram, uchset.id_to_unichar(leftmost_id)); |
340 | 0 | } |
341 | 0 | } |
342 | 0 | orig_i += alt_word->state(i); |
343 | 0 | } |
344 | 0 | } |
345 | 0 | } |
346 | 401k | delete alt_word; |
347 | 401k | } |
348 | 817k | if (output_ambig_words_file_ != nullptr) { |
349 | 0 | fprintf(output_ambig_words_file_, "\n"); |
350 | 0 | } |
351 | | |
352 | 6.58M | for (auto data : ambig_blob_choices) { |
353 | 6.58M | delete data; |
354 | 6.58M | } |
355 | 817k | return !ambigs_found; |
356 | 817k | } |
357 | | |
358 | 0 | void Dict::EndDangerousAmbigs() {} |
359 | | |
360 | | #endif // !defined(DISABLED_LEGACY_ENGINE) |
361 | | |
362 | 45.9k | void Dict::SetupStopperPass1() { |
363 | 45.9k | reject_offset_ = 0.0; |
364 | 45.9k | } |
365 | | |
366 | 67.2k | void Dict::SetupStopperPass2() { |
367 | 67.2k | reject_offset_ = stopper_phase2_certainty_rejection_offset; |
368 | 67.2k | } |
369 | | |
370 | | void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, |
371 | 341k | UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings) { |
372 | 341k | int num_blobs_to_replace = 0; |
373 | 341k | int begin_blob_index = 0; |
374 | 341k | int i; |
375 | | // Rating and certainty for the new BLOB_CHOICE are derived from the |
376 | | // replaced choices. |
377 | 341k | float new_rating = 0.0f; |
378 | 341k | float new_certainty = 0.0f; |
379 | 341k | BLOB_CHOICE *old_choice = nullptr; |
380 | 2.70M | for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) { |
381 | 2.36M | if (i >= wrong_ngram_begin_index) { |
382 | 694k | int num_blobs = werd_choice->state(i); |
383 | 694k | int col = begin_blob_index + num_blobs_to_replace; |
384 | 694k | int row = col + num_blobs - 1; |
385 | 694k | BLOB_CHOICE_LIST *choices = ratings->get(col, row); |
386 | 694k | ASSERT_HOST(choices != nullptr); |
387 | 694k | old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices); |
388 | 694k | ASSERT_HOST(old_choice != nullptr); |
389 | 694k | new_rating += old_choice->rating(); |
390 | 694k | new_certainty += old_choice->certainty(); |
391 | 694k | num_blobs_to_replace += num_blobs; |
392 | 1.66M | } else { |
393 | 1.66M | begin_blob_index += werd_choice->state(i); |
394 | 1.66M | } |
395 | 2.36M | } |
396 | 341k | new_certainty /= wrong_ngram_size; |
397 | | // If there is no entry in the ratings matrix, add it. |
398 | 341k | MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1); |
399 | 341k | if (!coord.Valid(*ratings)) { |
400 | 2.07k | ratings->IncreaseBandSize(coord.row - coord.col + 1); |
401 | 2.07k | } |
402 | 341k | if (ratings->get(coord.col, coord.row) == nullptr) { |
403 | 66.6k | ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST); |
404 | 66.6k | } |
405 | 341k | BLOB_CHOICE_LIST *new_choices = ratings->get(coord.col, coord.row); |
406 | 341k | BLOB_CHOICE *choice = FindMatchingChoice(correct_ngram_id, new_choices); |
407 | 341k | if (choice != nullptr) { |
408 | | // Already there. Upgrade if new rating better. |
409 | 262k | if (new_rating < choice->rating()) { |
410 | 9.16k | choice->set_rating(new_rating); |
411 | 9.16k | } |
412 | 262k | if (new_certainty < choice->certainty()) { |
413 | 3.47k | choice->set_certainty(new_certainty); |
414 | 3.47k | } |
415 | | // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState. |
416 | 262k | } else { |
417 | | // Need a new choice with the correct_ngram_id. |
418 | 79.5k | choice = new BLOB_CHOICE(*old_choice); |
419 | 79.5k | choice->set_unichar_id(correct_ngram_id); |
420 | 79.5k | choice->set_rating(new_rating); |
421 | 79.5k | choice->set_certainty(new_certainty); |
422 | 79.5k | choice->set_classifier(BCC_AMBIG); |
423 | 79.5k | choice->set_matrix_cell(coord.col, coord.row); |
424 | 79.5k | BLOB_CHOICE_IT it(new_choices); |
425 | 79.5k | it.add_to_end(choice); |
426 | 79.5k | } |
427 | | // Remove current unichar from werd_choice. On the last iteration |
428 | | // set the correct replacement unichar instead of removing a unichar. |
429 | 1.03M | for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) { |
430 | 694k | if (replaced_count + 1 == wrong_ngram_size) { |
431 | 341k | werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice); |
432 | 352k | } else { |
433 | 352k | werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1); |
434 | 352k | } |
435 | 694k | } |
436 | 341k | if (stopper_debug_level >= 1) { |
437 | 0 | werd_choice->print("ReplaceAmbig() "); |
438 | 0 | tprintf("Modified blob_choices: "); |
439 | 0 | print_ratings_list("\n", new_choices, getUnicharset()); |
440 | 0 | } |
441 | 341k | } |
442 | | |
443 | 82.7k | int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const { |
444 | 82.7k | int shortest = INT32_MAX; |
445 | 82.7k | int curr_len = 0; |
446 | 182k | for (unsigned w = 0; w < WordChoice.length(); ++w) { |
447 | 99.3k | if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) { |
448 | 47.8k | curr_len++; |
449 | 51.4k | } else if (curr_len > 0) { |
450 | 749 | if (curr_len < shortest) { |
451 | 748 | shortest = curr_len; |
452 | 748 | } |
453 | 749 | curr_len = 0; |
454 | 749 | } |
455 | 99.3k | } |
456 | 82.7k | if (curr_len > 0 && curr_len < shortest) { |
457 | 33.7k | shortest = curr_len; |
458 | 49.0k | } else if (shortest == INT32_MAX) { |
459 | 48.3k | shortest = 0; |
460 | 48.3k | } |
461 | 82.7k | return shortest; |
462 | 82.7k | } |
463 | | |
464 | 0 | int Dict::UniformCertainties(const WERD_CHOICE &word) { |
465 | 0 | float Certainty; |
466 | 0 | float WorstCertainty = FLT_MAX; |
467 | 0 | float CertaintyThreshold; |
468 | 0 | double TotalCertainty; |
469 | 0 | double TotalCertaintySquared; |
470 | 0 | double Variance; |
471 | 0 | float Mean, StdDev; |
472 | 0 | int word_length = word.length(); |
473 | |
474 | 0 | if (word_length < 3) { |
475 | 0 | return true; |
476 | 0 | } |
477 | | |
478 | 0 | TotalCertainty = TotalCertaintySquared = 0.0; |
479 | 0 | for (int i = 0; i < word_length; ++i) { |
480 | 0 | Certainty = word.certainty(i); |
481 | 0 | TotalCertainty += Certainty; |
482 | 0 | TotalCertaintySquared += static_cast<double>(Certainty) * Certainty; |
483 | 0 | if (Certainty < WorstCertainty) { |
484 | 0 | WorstCertainty = Certainty; |
485 | 0 | } |
486 | 0 | } |
487 | | |
488 | | // Subtract off worst certainty from statistics. |
489 | 0 | word_length--; |
490 | 0 | TotalCertainty -= WorstCertainty; |
491 | 0 | TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty; |
492 | |
493 | 0 | Mean = TotalCertainty / word_length; |
494 | 0 | Variance = ((word_length * TotalCertaintySquared - TotalCertainty * TotalCertainty) / |
495 | 0 | (word_length * (word_length - 1))); |
496 | 0 | if (Variance < 0.0) { |
497 | 0 | Variance = 0.0; |
498 | 0 | } |
499 | 0 | StdDev = sqrt(Variance); |
500 | |
501 | 0 | CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev; |
502 | 0 | if (CertaintyThreshold > stopper_nondict_certainty_base) { |
503 | 0 | CertaintyThreshold = stopper_nondict_certainty_base; |
504 | 0 | } |
505 | |
506 | 0 | if (word.certainty() < CertaintyThreshold) { |
507 | 0 | if (stopper_debug_level >= 1) { |
508 | 0 | tprintf( |
509 | 0 | "Stopper: Non-uniform certainty = %4.1f" |
510 | 0 | " (m=%4.1f, s=%4.1f, t=%4.1f)\n", |
511 | 0 | word.certainty(), Mean, StdDev, CertaintyThreshold); |
512 | 0 | } |
513 | 0 | return false; |
514 | 0 | } else { |
515 | 0 | return true; |
516 | 0 | } |
517 | 0 | } |
518 | | |
519 | | } // namespace tesseract |
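
Editorial note: the acceptance test in Dict::AcceptableResult() above reduces to a simple threshold rule. The base threshold is stopper_nondict_certainty_base minus reject_offset_ (zero in pass 1; stopper_phase2_certainty_rejection_offset in pass 2, per SetupStopperPass2()), and for valid, correctly-cased words it is further shifted by stopper_certainty_per_char for every character of the shortest alpha run beyond stopper_smallword_size; the word is accepted when its certainty exceeds the result. A minimal standalone sketch of that rule, with hypothetical constants standing in for the Dict parameters:

// Editorial sketch, not part of stopper.cpp: mirrors the threshold logic of
// Dict::AcceptableResult() with hypothetical constants in place of the Dict
// members (stopper_nondict_certainty_base, stopper_certainty_per_char,
// stopper_smallword_size, stopper_phase2_certainty_rejection_offset).
#include <algorithm>
#include <cstdio>

namespace {

constexpr float kNondictCertaintyBase = -2.5f;  // stand-in for stopper_nondict_certainty_base
constexpr float kCertaintyPerChar = -0.2f;      // stand-in for stopper_certainty_per_char
constexpr int kSmallwordSize = 2;               // stand-in for stopper_smallword_size
constexpr float kPhase2RejectOffset = 8.0f;     // stand-in for stopper_phase2_certainty_rejection_offset

// Accept a word when its certainty clears a threshold that is loosened
// (here, made more negative) for long, valid, correctly-cased words.
bool AcceptableCertainty(float word_certainty, bool valid_and_case_ok,
                         int shortest_alpha_run, bool pass2) {
  float threshold = kNondictCertaintyBase - (pass2 ? kPhase2RejectOffset : 0.0f);
  if (valid_and_case_ok) {
    int word_size = std::max(0, shortest_alpha_run - kSmallwordSize);
    threshold += word_size * kCertaintyPerChar;
  }
  return word_certainty > threshold;
}

}  // namespace

int main() {
  // A valid 6-letter word with certainty -3.0 in pass 2:
  // threshold = -2.5 - 8.0 + 4 * (-0.2) = -11.3, so the word is accepted.
  std::printf("accepted: %d\n",
              AcceptableCertainty(-3.0f, /*valid_and_case_ok=*/true,
                                  /*shortest_alpha_run=*/6, /*pass2=*/true));
  return 0;
}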
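
Likewise, Dict::UniformCertainties() drops the single worst per-character certainty, computes the mean and standard deviation of the remaining ones, and fails the word when its overall certainty falls more than stopper_allowable_character_badness standard deviations below that mean, with the threshold capped so it is never stricter than stopper_nondict_certainty_base. A hedged restatement over a plain vector, again with hypothetical parameter values:

// Editorial sketch, not part of stopper.cpp: the statistic used by
// Dict::UniformCertainties(), restated over a plain vector of per-character
// certainties. allowable_badness and nondict_base are hypothetical stand-ins
// for stopper_allowable_character_badness and stopper_nondict_certainty_base.
#include <algorithm>
#include <cmath>
#include <vector>

bool CertaintiesAreUniform(const std::vector<float> &char_certainties,
                           float word_certainty,
                           double allowable_badness = 4.0,
                           double nondict_base = -2.5) {
  if (char_certainties.size() < 3) {
    return true;  // too short to estimate a spread
  }
  double total = 0.0, total_sq = 0.0;
  float worst = char_certainties[0];
  for (float c : char_certainties) {
    total += c;
    total_sq += static_cast<double>(c) * c;
    worst = std::min(worst, c);
  }
  // Drop the single worst certainty so one bad character does not
  // distort the estimate of what is "typical" for this word.
  const int n = static_cast<int>(char_certainties.size()) - 1;
  total -= worst;
  total_sq -= static_cast<double>(worst) * worst;
  const double mean = total / n;
  const double variance = (n * total_sq - total * total) / (n * (n - 1.0));
  const double std_dev = std::sqrt(std::max(variance, 0.0));
  // The word's overall certainty must not fall too far below the mean.
  const double threshold = std::min(mean - allowable_badness * std_dev, nondict_base);
  return word_certainty >= threshold;
}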