/src/tesseract/src/dict/stopper.cpp
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | ** Filename: stopper.c |
3 | | ** Purpose: Stopping criteria for word classifier. |
4 | | ** Author: Dan Johnson |
5 | | ** |
6 | | ** (c) Copyright Hewlett-Packard Company, 1988. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | ******************************************************************************/ |
17 | | |
18 | | #include <cctype> |
19 | | #include <cmath> |
20 | | #include <cstdio> |
21 | | #include <cstring> |
22 | | |
23 | | #include "stopper.h" |
24 | | #ifndef DISABLED_LEGACY_ENGINE |
25 | | # include "ambigs.h" |
26 | | #endif |
27 | | #include <tesseract/unichar.h> |
28 | | #include "ccutil.h" |
29 | | #include "dict.h" |
30 | | #include "helpers.h" |
31 | | #include "matchdefs.h" |
32 | | #include "pageres.h" |
33 | | #include "params.h" |
34 | | #include "ratngs.h" |
35 | | |
36 | | /*---------------------------------------------------------------------------- |
37 | | Private Code |
38 | | ----------------------------------------------------------------------------*/ |
39 | | |
40 | | namespace tesseract { |
41 | | |
42 | | bool Dict::AcceptableChoice(const WERD_CHOICE &best_choice, |
43 | 393k | XHeightConsistencyEnum xheight_consistency) { |
44 | 393k | float CertaintyThreshold = stopper_nondict_certainty_base; |
45 | 393k | int WordSize; |
46 | | |
47 | 393k | if (stopper_no_acceptable_choices) { |
48 | 0 | return false; |
49 | 0 | } |
50 | | |
51 | 393k | if (best_choice.empty()) { |
52 | 0 | return false; |
53 | 0 | } |
54 | | |
55 | 393k | bool no_dang_ambigs = !best_choice.dangerous_ambig_found(); |
56 | 393k | bool is_valid_word = valid_word_permuter(best_choice.permuter(), false); |
57 | 393k | bool is_case_ok = case_ok(best_choice); |
58 | | |
59 | 393k | if (stopper_debug_level >= 1) { |
60 | 0 | const char *xht = "UNKNOWN"; |
61 | 0 | switch (xheight_consistency) { |
62 | 0 | case XH_GOOD: |
63 | 0 | xht = "NORMAL"; |
64 | 0 | break; |
65 | 0 | case XH_SUBNORMAL: |
66 | 0 | xht = "SUBNORMAL"; |
67 | 0 | break; |
68 | 0 | case XH_INCONSISTENT: |
69 | 0 | xht = "INCONSISTENT"; |
70 | 0 | break; |
71 | 0 | default: |
72 | 0 | xht = "UNKNOWN"; |
73 | 0 | } |
74 | 0 | tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n", |
75 | 0 | best_choice.unichar_string().c_str(), (is_valid_word ? 'y' : 'n'), |
76 | 0 | (is_case_ok ? 'y' : 'n'), xht, best_choice.min_x_height(), best_choice.max_x_height()); |
77 | 0 | } |
78 | | // Do not accept invalid words in PASS1. |
79 | 393k | if (reject_offset_ <= 0.0f && !is_valid_word) { |
80 | 393k | return false; |
81 | 393k | } |
82 | 0 | if (is_valid_word && is_case_ok) { |
83 | 0 | WordSize = LengthOfShortestAlphaRun(best_choice); |
84 | 0 | WordSize -= stopper_smallword_size; |
85 | 0 | if (WordSize < 0) { |
86 | 0 | WordSize = 0; |
87 | 0 | } |
88 | 0 | CertaintyThreshold += WordSize * stopper_certainty_per_char; |
89 | 0 | } |
90 | |
91 | 0 | if (stopper_debug_level >= 1) { |
92 | 0 | tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n", |
93 | 0 | best_choice.rating(), best_choice.certainty(), CertaintyThreshold); |
94 | 0 | } |
95 | |
96 | 0 | if (no_dang_ambigs && best_choice.certainty() > CertaintyThreshold && |
97 | 0 | xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) { |
98 | 0 | return true; |
99 | 0 | } else { |
100 | 0 | if (stopper_debug_level >= 1) { |
101 | 0 | tprintf( |
102 | 0 | "AcceptableChoice() returned false" |
103 | 0 | " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n", |
104 | 0 | no_dang_ambigs, best_choice.certainty(), CertaintyThreshold, |
105 | 0 | UniformCertainties(best_choice)); |
106 | 0 | } |
107 | 0 | return false; |
108 | 0 | } |
109 | 0 | } |
110 | | |
111 | 241k | bool Dict::AcceptableResult(WERD_RES *word) const { |
112 | 241k | if (word->best_choice == nullptr) { |
113 | 0 | return false; |
114 | 0 | } |
115 | 241k | float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_; |
116 | 241k | int WordSize; |
117 | | |
118 | 241k | if (stopper_debug_level >= 1) { |
119 | 0 | tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n", |
120 | 0 | word->best_choice->debug_string().c_str(), (valid_word(*word->best_choice) ? 'y' : 'n'), |
121 | 0 | (case_ok(*word->best_choice) ? 'y' : 'n'), |
122 | 0 | word->best_choice->dangerous_ambig_found() ? 'n' : 'y', |
123 | 0 | word->best_choices.singleton() ? 'n' : 'y'); |
124 | 0 | } |
125 | | |
126 | 241k | if (word->best_choice->empty() || !word->best_choices.singleton()) { |
127 | 48.3k | return false; |
128 | 48.3k | } |
129 | 192k | if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) { |
130 | 82.7k | WordSize = LengthOfShortestAlphaRun(*word->best_choice); |
131 | 82.7k | WordSize -= stopper_smallword_size; |
132 | 82.7k | if (WordSize < 0) { |
133 | 70.3k | WordSize = 0; |
134 | 70.3k | } |
135 | 82.7k | CertaintyThreshold += WordSize * stopper_certainty_per_char; |
136 | 82.7k | } |
137 | | |
138 | 192k | if (stopper_debug_level >= 1) { |
139 | 0 | tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", word->best_choice->certainty(), |
140 | 0 | CertaintyThreshold); |
141 | 0 | } |
142 | | |
143 | 192k | if (word->best_choice->certainty() > CertaintyThreshold && !stopper_no_acceptable_choices) { |
144 | 14.6k | if (stopper_debug_level >= 1) { |
145 | 0 | tprintf("ACCEPTED\n"); |
146 | 0 | } |
147 | 14.6k | return true; |
148 | 178k | } else { |
149 | 178k | if (stopper_debug_level >= 1) { |
150 | 0 | tprintf("REJECTED\n"); |
151 | 0 | } |
152 | 178k | return false; |
153 | 178k | } |
154 | 192k | } |
155 | | |
156 | | #if !defined(DISABLED_LEGACY_ENGINE) |
157 | | |
158 | | bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_replaceable, |
159 | 817k | MATRIX *ratings) { |
160 | 817k | if (stopper_debug_level > 2) { |
161 | 0 | tprintf("\nRunning NoDangerousAmbig() for %s\n", best_choice->debug_string().c_str()); |
162 | 0 | } |
163 | | |
164 | | // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities |
165 | | // for each unichar id in BestChoice. |
166 | 817k | BLOB_CHOICE_LIST_VECTOR ambig_blob_choices; |
167 | 817k | bool ambigs_found = false; |
168 | | // For each position in best_choice: |
169 | | // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i] |
170 | | // -- initialize wrong_ngram with a single unichar_id at best_choice[i] |
171 | | // -- look for ambiguities corresponding to wrong_ngram in the list while |
172 | | // adding the following unichar_ids from best_choice to wrong_ngram |
173 | | // |
174 | | // Repeat the above procedure twice: first time look through |
175 | | // ambigs to be replaced and replace all the ambiguities found; |
176 | | // second time look through dangerous ambiguities and construct |
177 | | // ambig_blob_choices with a fake blob choice for each ambiguity |
178 | | // and pass them to dawg_permute_and_select() to search for |
179 | | // ambiguous words in the dictionaries. |
180 | | // |
181 | | // Note that during the execution of the for loop (on the first pass) |
182 | | // if replacements are made the length of best_choice might change. |
183 | 2.45M | for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) { |
184 | 1.63M | bool replace = (fix_replaceable && pass == 0); |
185 | 1.63M | const UnicharAmbigsVector &table = |
186 | 1.63M | replace ? getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs(); |
187 | 1.63M | if (!replace) { |
188 | | // Initialize ambig_blob_choices with lists containing a single |
189 | | // unichar id for the corresponding position in best_choice. |
190 | | // best_choice consisting of only the original letters will |
191 | | // have a rating of 0.0. |
192 | 7.39M | for (unsigned i = 0; i < best_choice->length(); ++i) { |
193 | 6.58M | auto *lst = new BLOB_CHOICE_LIST(); |
194 | 6.58M | BLOB_CHOICE_IT lst_it(lst); |
195 | | // TODO(rays/antonova) Put real xheights and y shifts here. |
196 | 6.58M | lst_it.add_to_end( |
197 | 6.58M | new BLOB_CHOICE(best_choice->unichar_id(i), 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG)); |
198 | 6.58M | ambig_blob_choices.push_back(lst); |
199 | 6.58M | } |
200 | 817k | } |
201 | 1.63M | UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; |
202 | 1.63M | int wrong_ngram_index; |
203 | 1.63M | int blob_index = 0; |
204 | 14.7M | for (unsigned i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) { |
205 | 13.1M | auto curr_unichar_id = best_choice->unichar_id(i); |
206 | 13.1M | if (stopper_debug_level > 2) { |
207 | 0 | tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous", |
208 | 0 | getUnicharset().debug_str(curr_unichar_id).c_str()); |
209 | 0 | } |
210 | 13.1M | int num_wrong_blobs = best_choice->state(i); |
211 | 13.1M | wrong_ngram_index = 0; |
212 | 13.1M | wrong_ngram[wrong_ngram_index] = curr_unichar_id; |
213 | 13.1M | if (curr_unichar_id == INVALID_UNICHAR_ID || static_cast<size_t>(curr_unichar_id) >= table.size() || |
214 | 13.1M | table[curr_unichar_id] == nullptr) { |
215 | 7.60M | continue; // there is no ambig spec for this unichar id |
216 | 7.60M | } |
217 | 5.55M | AmbigSpec_IT spec_it(table[curr_unichar_id]); |
218 | 584M | for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) { |
219 | 582M | const AmbigSpec *ambig_spec = spec_it.data(); |
220 | 582M | wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID; |
221 | 582M | int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram); |
222 | 582M | if (stopper_debug_level > 2) { |
223 | 0 | tprintf("candidate ngram: "); |
224 | 0 | UnicharIdArrayUtils::print(wrong_ngram, getUnicharset()); |
225 | 0 | tprintf("current ngram from spec: "); |
226 | 0 | UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset()); |
227 | 0 | tprintf("comparison result: %d\n", compare); |
228 | 0 | } |
229 | 582M | if (compare == 0) { |
230 | | // Record the place where we found an ambiguity. |
231 | 2.09M | if (fixpt != nullptr) { |
232 | 2.09M | UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0]; |
233 | 2.09M | fixpt->push_back(DANGERR_INFO(blob_index, blob_index + num_wrong_blobs, replace, |
234 | 2.09M | getUnicharset().get_isngram(ambig_spec->correct_ngram_id), |
235 | 2.09M | leftmost_id)); |
236 | 2.09M | if (stopper_debug_level > 1) { |
237 | 0 | tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index, blob_index + num_wrong_blobs, false, |
238 | 0 | getUnicharset().get_isngram(ambig_spec->correct_ngram_id), |
239 | 0 | getUnicharset().id_to_unichar(leftmost_id)); |
240 | 0 | } |
241 | 2.09M | } |
242 | | |
243 | 2.09M | if (replace) { |
244 | 341k | if (stopper_debug_level > 2) { |
245 | 0 | tprintf("replace ambiguity with %s : ", |
246 | 0 | getUnicharset().id_to_unichar(ambig_spec->correct_ngram_id)); |
247 | 0 | UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset()); |
248 | 0 | } |
249 | 341k | ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice, |
250 | 341k | ratings); |
251 | 1.75M | } else if (i > 0 || ambig_spec->type != CASE_AMBIG) { |
252 | | // We found dang ambig - update ambig_blob_choices. |
253 | 1.75M | if (stopper_debug_level > 2) { |
254 | 0 | tprintf("found ambiguity: "); |
255 | 0 | UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset()); |
256 | 0 | } |
257 | 1.75M | ambigs_found = true; |
258 | 3.85M | for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) { |
259 | | // Add a blob choice for the corresponding fragment of the |
260 | | // ambiguity. These fake blob choices are initialized with |
261 | | // negative ratings (which are not possible for real blob |
262 | | // choices), so that dawg_permute_and_select() considers any |
263 | | // word not consisting of only the original letters a better |
264 | | // choice and stops searching for alternatives once such a |
265 | | // choice is found. |
266 | 2.10M | BLOB_CHOICE_IT bc_it(ambig_blob_choices[i + tmp_index]); |
267 | 2.10M | bc_it.add_to_end(new BLOB_CHOICE(ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, |
268 | 2.10M | -1, 0, 1, 0, BCC_AMBIG)); |
269 | 2.10M | } |
270 | 1.75M | } |
271 | 2.09M | spec_it.forward(); |
272 | 580M | } else if (compare == -1) { |
273 | 9.74M | unsigned next_index; |
274 | 9.74M | if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size && |
275 | 6.84M | ((next_index = wrong_ngram_index + 1 + i) < best_choice->length())) { |
276 | | // Add the next unichar id to wrong_ngram and keep looking for |
277 | | // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST. |
278 | 5.94M | wrong_ngram[++wrong_ngram_index] = best_choice->unichar_id(next_index); |
279 | 5.94M | num_wrong_blobs += best_choice->state(next_index); |
280 | 5.94M | } else { |
281 | 3.79M | break; // no more matching ambigs in this AMBIG_SPEC_LIST |
282 | 3.79M | } |
283 | 570M | } else { |
284 | 570M | spec_it.forward(); |
285 | 570M | } |
286 | 582M | } // end searching AmbigSpec_LIST |
287 | 5.55M | } // end searching best_choice |
288 | 1.63M | } // end searching replace and dangerous ambigs |
289 | | |
290 | | // If any ambiguities were found permute the constructed ambig_blob_choices |
291 | | // to see if an alternative dictionary word can be found. |
292 | 817k | if (ambigs_found) { |
293 | 401k | if (stopper_debug_level > 2) { |
294 | 0 | tprintf("\nResulting ambig_blob_choices:\n"); |
295 | 0 | for (unsigned i = 0; i < ambig_blob_choices.size(); ++i) { |
296 | 0 | print_ratings_list("", ambig_blob_choices.at(i), getUnicharset()); |
297 | 0 | tprintf("\n"); |
298 | 0 | } |
299 | 0 | } |
300 | 401k | WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0); |
301 | 401k | ambigs_found = (alt_word->rating() < 0.0); |
302 | 401k | if (ambigs_found) { |
303 | 0 | if (stopper_debug_level >= 1) { |
304 | 0 | tprintf("Stopper: Possible ambiguous word = %s\n", alt_word->debug_string().c_str()); |
305 | 0 | } |
306 | 0 | if (fixpt != nullptr) { |
307 | | // Note: Currently character choices combined from fragments can only |
308 | | // be generated by NoDangerousAmbig(). This code should be updated if |
309 | | // the capability to produce classifications combined from character |
310 | | // fragments is added to other functions. |
311 | 0 | int orig_i = 0; |
312 | 0 | for (unsigned i = 0; i < alt_word->length(); ++i) { |
313 | 0 | const UNICHARSET &uchset = getUnicharset(); |
314 | 0 | bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i)); |
315 | 0 | UNICHAR_ID leftmost_id = alt_word->unichar_id(i); |
316 | 0 | if (replacement_is_ngram) { |
317 | | // we have to extract the leftmost unichar from the ngram. |
318 | 0 | const char *str = uchset.id_to_unichar(leftmost_id); |
319 | 0 | int step = uchset.step(str); |
320 | 0 | if (step) { |
321 | 0 | leftmost_id = uchset.unichar_to_id(str, step); |
322 | 0 | } |
323 | 0 | } |
324 | 0 | int end_i = orig_i + alt_word->state(i); |
325 | 0 | if (alt_word->state(i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) { |
326 | | // Compute proper blob indices. |
327 | 0 | int blob_start = 0; |
328 | 0 | for (int j = 0; j < orig_i; ++j) { |
329 | 0 | blob_start += best_choice->state(j); |
330 | 0 | } |
331 | 0 | int blob_end = blob_start; |
332 | 0 | for (int j = orig_i; j < end_i; ++j) { |
333 | 0 | blob_end += best_choice->state(j); |
334 | 0 | } |
335 | 0 | fixpt->push_back( |
336 | 0 | DANGERR_INFO(blob_start, blob_end, true, replacement_is_ngram, leftmost_id)); |
337 | 0 | if (stopper_debug_level > 1) { |
338 | 0 | tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i, true, |
339 | 0 | replacement_is_ngram, uchset.id_to_unichar(leftmost_id)); |
340 | 0 | } |
341 | 0 | } |
342 | 0 | orig_i += alt_word->state(i); |
343 | 0 | } |
344 | 0 | } |
345 | 0 | } |
346 | 401k | delete alt_word; |
347 | 401k | } |
348 | 817k | if (output_ambig_words_file_ != nullptr) { |
349 | 0 | fprintf(output_ambig_words_file_, "\n"); |
350 | 0 | } |
351 | | |
352 | 6.58M | for (auto data : ambig_blob_choices) { |
353 | 6.58M | delete data; |
354 | 6.58M | } |
355 | 817k | return !ambigs_found; |
356 | 817k | } |
357 | | |
358 | 0 | void Dict::EndDangerousAmbigs() {} |
359 | | |
360 | | #endif // !defined(DISABLED_LEGACY_ENGINE) |
361 | | |
362 | 45.9k | void Dict::SetupStopperPass1() { |
363 | 45.9k | reject_offset_ = 0.0; |
364 | 45.9k | } |
365 | | |
366 | 67.2k | void Dict::SetupStopperPass2() { |
367 | 67.2k | reject_offset_ = stopper_phase2_certainty_rejection_offset; |
368 | 67.2k | } |
369 | | |
370 | | void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, |
371 | 341k | UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings) { |
372 | 341k | int num_blobs_to_replace = 0; |
373 | 341k | int begin_blob_index = 0; |
374 | 341k | int i; |
375 | | // Rating and certainty for the new BLOB_CHOICE are derived from the |
376 | | // replaced choices. |
377 | 341k | float new_rating = 0.0f; |
378 | 341k | float new_certainty = 0.0f; |
379 | 341k | BLOB_CHOICE *old_choice = nullptr; |
380 | 2.70M | for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) { |
381 | 2.36M | if (i >= wrong_ngram_begin_index) { |
382 | 694k | int num_blobs = werd_choice->state(i); |
383 | 694k | int col = begin_blob_index + num_blobs_to_replace; |
384 | 694k | int row = col + num_blobs - 1; |
385 | 694k | BLOB_CHOICE_LIST *choices = ratings->get(col, row); |
386 | 694k | ASSERT_HOST(choices != nullptr); |
387 | 694k | old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices); |
388 | 694k | ASSERT_HOST(old_choice != nullptr); |
389 | 694k | new_rating += old_choice->rating(); |
390 | 694k | new_certainty += old_choice->certainty(); |
391 | 694k | num_blobs_to_replace += num_blobs; |
392 | 1.66M | } else { |
393 | 1.66M | begin_blob_index += werd_choice->state(i); |
394 | 1.66M | } |
395 | 2.36M | } |
396 | 341k | new_certainty /= wrong_ngram_size; |
397 | | // If there is no entry in the ratings matrix, add it. |
398 | 341k | MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1); |
399 | 341k | if (!coord.Valid(*ratings)) { |
400 | 2.07k | ratings->IncreaseBandSize(coord.row - coord.col + 1); |
401 | 2.07k | } |
402 | 341k | if (ratings->get(coord.col, coord.row) == nullptr) { |
403 | 66.6k | ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST); |
404 | 66.6k | } |
405 | 341k | BLOB_CHOICE_LIST *new_choices = ratings->get(coord.col, coord.row); |
406 | 341k | BLOB_CHOICE *choice = FindMatchingChoice(correct_ngram_id, new_choices); |
407 | 341k | if (choice != nullptr) { |
408 | | // Already there. Upgrade if new rating better. |
409 | 262k | if (new_rating < choice->rating()) { |
410 | 9.16k | choice->set_rating(new_rating); |
411 | 9.16k | } |
412 | 262k | if (new_certainty < choice->certainty()) { |
413 | 3.47k | choice->set_certainty(new_certainty); |
414 | 3.47k | } |
415 | | // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState. |
416 | 262k | } else { |
417 | | // Need a new choice with the correct_ngram_id. |
418 | 79.5k | choice = new BLOB_CHOICE(*old_choice); |
419 | 79.5k | choice->set_unichar_id(correct_ngram_id); |
420 | 79.5k | choice->set_rating(new_rating); |
421 | 79.5k | choice->set_certainty(new_certainty); |
422 | 79.5k | choice->set_classifier(BCC_AMBIG); |
423 | 79.5k | choice->set_matrix_cell(coord.col, coord.row); |
424 | 79.5k | BLOB_CHOICE_IT it(new_choices); |
425 | 79.5k | it.add_to_end(choice); |
426 | 79.5k | } |
427 | | // Remove current unichar from werd_choice. On the last iteration |
428 | | // set the correct replacement unichar instead of removing a unichar. |
429 | 1.03M | for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) { |
430 | 694k | if (replaced_count + 1 == wrong_ngram_size) { |
431 | 341k | werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice); |
432 | 352k | } else { |
433 | 352k | werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1); |
434 | 352k | } |
435 | 694k | } |
436 | 341k | if (stopper_debug_level >= 1) { |
437 | 0 | werd_choice->print("ReplaceAmbig() "); |
438 | 0 | tprintf("Modified blob_choices: "); |
439 | 0 | print_ratings_list("\n", new_choices, getUnicharset()); |
440 | 0 | } |
441 | 341k | } |
442 | | |
443 | 82.7k | int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const { |
444 | 82.7k | int shortest = INT32_MAX; |
445 | 82.7k | int curr_len = 0; |
446 | 182k | for (unsigned w = 0; w < WordChoice.length(); ++w) { |
447 | 99.3k | if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) { |
448 | 47.8k | curr_len++; |
449 | 51.4k | } else if (curr_len > 0) { |
450 | 749 | if (curr_len < shortest) { |
451 | 748 | shortest = curr_len; |
452 | 748 | } |
453 | 749 | curr_len = 0; |
454 | 749 | } |
455 | 99.3k | } |
456 | 82.7k | if (curr_len > 0 && curr_len < shortest) { |
457 | 33.7k | shortest = curr_len; |
458 | 49.0k | } else if (shortest == INT32_MAX) { |
459 | 48.3k | shortest = 0; |
460 | 48.3k | } |
461 | 82.7k | return shortest; |
462 | 82.7k | } |
463 | | |
464 | 0 | int Dict::UniformCertainties(const WERD_CHOICE &word) { |
465 | 0 | float Certainty; |
466 | 0 | float WorstCertainty = FLT_MAX; |
467 | 0 | float CertaintyThreshold; |
468 | 0 | double TotalCertainty; |
469 | 0 | double TotalCertaintySquared; |
470 | 0 | double Variance; |
471 | 0 | float Mean, StdDev; |
472 | 0 | int word_length = word.length(); |
473 | |
474 | 0 | if (word_length < 3) { |
475 | 0 | return true; |
476 | 0 | } |
477 | | |
478 | 0 | TotalCertainty = TotalCertaintySquared = 0.0; |
479 | 0 | for (int i = 0; i < word_length; ++i) { |
480 | 0 | Certainty = word.certainty(i); |
481 | 0 | TotalCertainty += Certainty; |
482 | 0 | TotalCertaintySquared += static_cast<double>(Certainty) * Certainty; |
483 | 0 | if (Certainty < WorstCertainty) { |
484 | 0 | WorstCertainty = Certainty; |
485 | 0 | } |
486 | 0 | } |
487 | | |
488 | | // Subtract off worst certainty from statistics. |
489 | 0 | word_length--; |
490 | 0 | TotalCertainty -= WorstCertainty; |
491 | 0 | TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty; |
492 | |
493 | 0 | Mean = TotalCertainty / word_length; |
494 | 0 | Variance = ((word_length * TotalCertaintySquared - TotalCertainty * TotalCertainty) / |
495 | 0 | (word_length * (word_length - 1))); |
496 | 0 | if (Variance < 0.0) { |
497 | 0 | Variance = 0.0; |
498 | 0 | } |
499 | 0 | StdDev = sqrt(Variance); |
500 | |
501 | 0 | CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev; |
502 | 0 | if (CertaintyThreshold > stopper_nondict_certainty_base) { |
503 | 0 | CertaintyThreshold = stopper_nondict_certainty_base; |
504 | 0 | } |
505 | |
506 | 0 | if (word.certainty() < CertaintyThreshold) { |
507 | 0 | if (stopper_debug_level >= 1) { |
508 | 0 | tprintf( |
509 | 0 | "Stopper: Non-uniform certainty = %4.1f" |
510 | 0 | " (m=%4.1f, s=%4.1f, t=%4.1f)\n", |
511 | 0 | word.certainty(), Mean, StdDev, CertaintyThreshold); |
512 | 0 | } |
513 | 0 | return false; |
514 | 0 | } else { |
515 | 0 | return true; |
516 | 0 | } |
517 | 0 | } |
518 | | |
519 | | } // namespace tesseract |
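
Editorial note: the acceptance test in Dict::AcceptableResult() above reduces to a simple threshold rule. The base threshold is stopper_nondict_certainty_base minus reject_offset_ (zero in pass 1; stopper_phase2_certainty_rejection_offset in pass 2, per SetupStopperPass2()), and for valid, correctly-cased words it is further shifted by stopper_certainty_per_char for every character of the shortest alpha run beyond stopper_smallword_size; the word is accepted when its certainty exceeds the result. A minimal standalone sketch of that rule, with hypothetical constants standing in for the Dict parameters:

// Editorial sketch, not part of stopper.cpp: mirrors the threshold logic of
// Dict::AcceptableResult() with hypothetical constants in place of the Dict
// members (stopper_nondict_certainty_base, stopper_certainty_per_char,
// stopper_smallword_size, stopper_phase2_certainty_rejection_offset).
#include <algorithm>
#include <cstdio>

namespace {

constexpr float kNondictCertaintyBase = -2.5f;  // stand-in for stopper_nondict_certainty_base
constexpr float kCertaintyPerChar = -0.2f;      // stand-in for stopper_certainty_per_char
constexpr int kSmallwordSize = 2;               // stand-in for stopper_smallword_size
constexpr float kPhase2RejectOffset = 8.0f;     // stand-in for stopper_phase2_certainty_rejection_offset

// Accept a word when its certainty clears a threshold that is loosened
// (here, made more negative) for long, valid, correctly-cased words.
bool AcceptableCertainty(float word_certainty, bool valid_and_case_ok,
                         int shortest_alpha_run, bool pass2) {
  float threshold = kNondictCertaintyBase - (pass2 ? kPhase2RejectOffset : 0.0f);
  if (valid_and_case_ok) {
    int word_size = std::max(0, shortest_alpha_run - kSmallwordSize);
    threshold += word_size * kCertaintyPerChar;
  }
  return word_certainty > threshold;
}

}  // namespace

int main() {
  // A valid 6-letter word with certainty -3.0 in pass 2:
  // threshold = -2.5 - 8.0 + 4 * (-0.2) = -11.3, so the word is accepted.
  std::printf("accepted: %d\n",
              AcceptableCertainty(-3.0f, /*valid_and_case_ok=*/true,
                                  /*shortest_alpha_run=*/6, /*pass2=*/true));
  return 0;
}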
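
Likewise, Dict::UniformCertainties() drops the single worst per-character certainty, computes the mean and standard deviation of the remaining ones, and fails the word when its overall certainty falls more than stopper_allowable_character_badness standard deviations below that mean, with the threshold capped so it is never stricter than stopper_nondict_certainty_base. A hedged restatement over a plain vector, again with hypothetical parameter values:

// Editorial sketch, not part of stopper.cpp: the statistic used by
// Dict::UniformCertainties(), restated over a plain vector of per-character
// certainties. allowable_badness and nondict_base are hypothetical stand-ins
// for stopper_allowable_character_badness and stopper_nondict_certainty_base.
#include <algorithm>
#include <cmath>
#include <vector>

bool CertaintiesAreUniform(const std::vector<float> &char_certainties,
                           float word_certainty,
                           double allowable_badness = 4.0,
                           double nondict_base = -2.5) {
  if (char_certainties.size() < 3) {
    return true;  // too short to estimate a spread
  }
  double total = 0.0, total_sq = 0.0;
  float worst = char_certainties[0];
  for (float c : char_certainties) {
    total += c;
    total_sq += static_cast<double>(c) * c;
    worst = std::min(worst, c);
  }
  // Drop the single worst certainty so one bad character does not
  // distort the estimate of what is "typical" for this word.
  const int n = static_cast<int>(char_certainties.size()) - 1;
  total -= worst;
  total_sq -= static_cast<double>(worst) * worst;
  const double mean = total / n;
  const double variance = (n * total_sq - total * total) / (n * (n - 1.0));
  const double std_dev = std::sqrt(std::max(variance, 0.0));
  // The word's overall certainty must not fall too far below the mean.
  const double threshold = std::min(mean - allowable_badness * std_dev, nondict_base);
  return word_certainty >= threshold;
}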