Coverage Report

Created: 2025-06-13 07:02

/src/tesseract/src/dict/dict.cpp
Line
Count
Source
1
///////////////////////////////////////////////////////////////////////
2
// File:        dict.cpp
3
// Description: dict class.
4
// Author:      Samuel Charron
5
//
6
// (C) Copyright 2006, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#include "dict.h"
20
21
#include "tesserrstream.h"  // for tesserr
22
#include "tprintf.h"
23
24
#include <cstdio>
25
26
namespace tesseract {
27
28
class Image;
29
30
Dict::Dict(CCUtil *ccutil)
31
4
    : letter_is_okay_(&tesseract::Dict::def_letter_is_okay)
32
4
    , probability_in_context_(&tesseract::Dict::def_probability_in_context)
33
4
    , ccutil_(ccutil)
34
4
    , wildcard_unichar_id_(INVALID_UNICHAR_ID)
35
4
    , apostrophe_unichar_id_(INVALID_UNICHAR_ID)
36
4
    , question_unichar_id_(INVALID_UNICHAR_ID)
37
4
    , slash_unichar_id_(INVALID_UNICHAR_ID)
38
4
    , hyphen_unichar_id_(INVALID_UNICHAR_ID)
39
4
    , STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
40
                    getCCUtil()->params())
41
4
    , STRING_INIT_MEMBER(user_words_suffix, "",
42
                         "A suffix of user-provided words located in tessdata.",
43
                         getCCUtil()->params())
44
4
    , STRING_MEMBER(user_patterns_file, "", "A filename of user-provided patterns.",
45
                    getCCUtil()->params())
46
4
    , STRING_INIT_MEMBER(user_patterns_suffix, "",
47
                         "A suffix of user-provided patterns located in "
48
                         "tessdata.",
49
                         getCCUtil()->params())
50
4
    , BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", getCCUtil()->params())
51
4
    , BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", getCCUtil()->params())
52
4
    , BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
53
                       getCCUtil()->params())
54
4
    , BOOL_INIT_MEMBER(load_punc_dawg, true,
55
                       "Load dawg with punctuation"
56
                       " patterns.",
57
                       getCCUtil()->params())
58
4
    , BOOL_INIT_MEMBER(load_number_dawg, true,
59
                       "Load dawg with number"
60
                       " patterns.",
61
                       getCCUtil()->params())
62
4
    , BOOL_INIT_MEMBER(load_bigram_dawg, true,
63
                       "Load dawg with special word "
64
                       "bigrams.",
65
                       getCCUtil()->params())
66
4
    , double_MEMBER(xheight_penalty_subscripts, 0.125,
67
                    "Score penalty (0.1 = 10%) added if there are subscripts "
68
                    "or superscripts in a word, but it is otherwise OK.",
69
                    getCCUtil()->params())
70
4
    , double_MEMBER(xheight_penalty_inconsistent, 0.25,
71
                    "Score penalty (0.1 = 10%) added if an xheight is "
72
                    "inconsistent.",
73
                    getCCUtil()->params())
74
4
    , double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
75
                    "Score multiplier for word matches which have good case and"
76
                    " are frequent in the given language (lower is better).",
77
                    getCCUtil()->params())
78
4
    , double_MEMBER(segment_penalty_dict_case_ok, 1.1,
79
                    "Score multiplier for word matches that have good case "
80
                    "(lower is better).",
81
                    getCCUtil()->params())
82
4
    , double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
83
                    "Default score multiplier for word matches, which may have "
84
                    "case issues (lower is better).",
85
                    getCCUtil()->params())
86
4
    , double_MEMBER(segment_penalty_dict_nonword, 1.25,
87
                    "Score multiplier for glyph fragment segmentations which "
88
                    "do not match a dictionary word (lower is better).",
89
                    getCCUtil()->params())
90
4
    , double_MEMBER(segment_penalty_garbage, 1.50,
91
                    "Score multiplier for poorly cased strings that are not in"
92
                    " the dictionary and generally look like garbage (lower is"
93
                    " better).",
94
                    getCCUtil()->params())
95
4
    , STRING_MEMBER(output_ambig_words_file, "",
96
                    "Output file for ambiguities found in the dictionary", getCCUtil()->params())
97
4
    , INT_MEMBER(dawg_debug_level, 0,
98
                 "Set to 1 for general debug info"
99
                 ", to 2 for more details, to 3 to see all the debug messages",
100
                 getCCUtil()->params())
101
4
    , INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", getCCUtil()->params())
102
4
    , BOOL_MEMBER(use_only_first_uft8_step, false,
103
                  "Use only the first UTF8 step of the given string"
104
                  " when computing log probabilities.",
105
                  getCCUtil()->params())
106
4
    , double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", getCCUtil()->params())
107
4
    , double_MEMBER(stopper_nondict_certainty_base, -2.50, "Certainty threshold for non-dict words",
108
                    getCCUtil()->params())
109
4
    , double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, "Reject certainty offset",
110
                    getCCUtil()->params())
111
4
    , INT_MEMBER(stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word",
112
                 getCCUtil()->params())
113
4
    , double_MEMBER(stopper_certainty_per_char, -0.50,
114
                    "Certainty to add"
115
                    " for each dict char above small word size.",
116
                    getCCUtil()->params())
117
4
    , double_MEMBER(stopper_allowable_character_badness, 3.0,
118
                    "Max certainty variation allowed in a word (in sigma)", getCCUtil()->params())
119
4
    , INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", getCCUtil()->params())
120
4
    , BOOL_MEMBER(stopper_no_acceptable_choices, false,
121
                  "Make AcceptableChoice() always return false. Useful"
122
                  " when there is a need to explore all segmentations",
123
                  getCCUtil()->params())
124
4
    , INT_MEMBER(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list",
125
                 getCCUtil()->params())
126
4
    , STRING_MEMBER(word_to_debug, "",
127
                    "Word for which stopper debug"
128
                    " information should be printed to stdout",
129
                    getCCUtil()->params())
130
4
    , BOOL_MEMBER(segment_nonalphabetic_script, false,
131
                  "Don't use any alphabetic-specific tricks."
132
                  " Set to true in the traineddata config file for"
133
                  " scripts that are cursive or inherently fixed-pitch",
134
                  getCCUtil()->params())
135
4
    , BOOL_MEMBER(save_doc_words, 0, "Save Document Words", getCCUtil()->params())
136
4
    , double_MEMBER(doc_dict_pending_threshold, 0.0, "Worst certainty for using pending dictionary",
137
                    getCCUtil()->params())
138
4
    , double_MEMBER(doc_dict_certainty_threshold, -2.25,
139
                    "Worst certainty for words that can be inserted into the"
140
                    " document dictionary",
141
                    getCCUtil()->params())
142
4
    , INT_MEMBER(max_permuter_attempts, 10000,
143
                 "Maximum number of different"
144
                 " character choices to consider during permutation."
145
                 " This limit is especially useful when user patterns"
146
                 " are specified, since overly generic patterns can result in"
147
                 " dawg search exploring an overly large number of options.",
148
4
                 getCCUtil()->params()) {
149
4
  reject_offset_ = 0.0;
150
4
  go_deeper_fxn_ = nullptr;
151
4
  hyphen_word_ = nullptr;
152
4
  last_word_on_line_ = false;
153
4
  document_words_ = nullptr;
154
4
  dawg_cache_ = nullptr;
155
4
  dawg_cache_is_ours_ = false;
156
4
  pending_words_ = nullptr;
157
4
  bigram_dawg_ = nullptr;
158
4
  freq_dawg_ = nullptr;
159
4
  punc_dawg_ = nullptr;
160
4
  unambig_dawg_ = nullptr;
161
4
  wordseg_rating_adjust_factor_ = -1.0f;
162
4
  output_ambig_words_file_ = nullptr;
163
4
}
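
The parameters registered in the constructor above are ordinary Tesseract config variables. A minimal sketch of overriding a few of them through the public TessBaseAPI (outside this file): the exact Init() overload and header path are assumptions based on the Tesseract 5 API, and init-only variables such as load_system_dawg have to be supplied at Init() time.

#include <tesseract/baseapi.h>

#include <string>
#include <vector>

int main() {
  // Init-only dictionary variables (declared with *_INIT_MEMBER above)
  // must be passed to Init(); "eng" and the nullptr datapath are
  // placeholders for a real installation.
  std::vector<std::string> vars = {"load_system_dawg", "load_freq_dawg"};
  std::vector<std::string> vals = {"false", "false"};
  tesseract::TessBaseAPI api;
  if (api.Init(nullptr, "eng", tesseract::OEM_LSTM_ONLY, nullptr, 0, &vars,
               &vals, false) != 0) {
    return 1;
  }
  // Non-init variables (plain *_MEMBER above) can be changed afterwards.
  api.SetVariable("dawg_debug_level", "1");
  api.End();
  return 0;
}
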
164
165
0
Dict::~Dict() {
166
0
  End();
167
0
  delete hyphen_word_;
168
0
  if (output_ambig_words_file_ != nullptr) {
169
0
    fclose(output_ambig_words_file_);
170
0
  }
171
0
}
172
173
4
DawgCache *Dict::GlobalDawgCache() {
174
  // This global cache (a singleton) will outlive every Tesseract instance
175
  // (even those that someone else might declare as global static variables).
176
4
  static DawgCache cache;
177
4
  return &cache;
178
4
}
179
180
// Sets up ready for a Load or LoadLSTM.
181
4
void Dict::SetupForLoad(DawgCache *dawg_cache) {
182
4
  if (dawgs_.size() != 0) {
183
2
    this->End();
184
2
  }
185
186
4
  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
187
4
  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
188
4
  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
189
4
  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
190
191
4
  if (dawg_cache != nullptr) {
192
4
    dawg_cache_ = dawg_cache;
193
4
    dawg_cache_is_ours_ = false;
194
4
  } else {
195
0
    dawg_cache_ = new DawgCache();
196
0
    dawg_cache_is_ours_ = true;
197
0
  }
198
4
}
199
200
// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
201
2
void Dict::Load(const std::string &lang, TessdataManager *data_file) {
202
  // Load dawgs_.
203
2
  if (load_punc_dawg) {
204
2
    punc_dawg_ =
205
2
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
206
2
    if (punc_dawg_) {
207
2
      dawgs_.push_back(punc_dawg_);
208
2
    }
209
2
  }
210
2
  if (load_system_dawg) {
211
2
    Dawg *system_dawg =
212
2
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
213
2
    if (system_dawg) {
214
2
      dawgs_.push_back(system_dawg);
215
2
    }
216
2
  }
217
2
  if (load_number_dawg) {
218
2
    Dawg *number_dawg =
219
2
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
220
2
    if (number_dawg) {
221
2
      dawgs_.push_back(number_dawg);
222
2
    }
223
2
  }
224
2
  if (load_bigram_dawg) {
225
2
    bigram_dawg_ =
226
2
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, dawg_debug_level, data_file);
227
    // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
228
    // dawgs_!!
229
2
  }
230
2
  if (load_freq_dawg) {
231
2
    freq_dawg_ =
232
2
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
233
2
    if (freq_dawg_) {
234
2
      dawgs_.push_back(freq_dawg_);
235
2
    }
236
2
  }
237
2
  if (load_unambig_dawg) {
238
2
    unambig_dawg_ =
239
2
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
240
2
    if (unambig_dawg_) {
241
0
      dawgs_.push_back(unambig_dawg_);
242
0
    }
243
2
  }
244
245
2
  std::string name;
246
2
  if (!user_words_suffix.empty() || !user_words_file.empty()) {
247
0
    Trie *trie_ptr =
248
0
        new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
249
0
    if (!user_words_file.empty()) {
250
0
      name = user_words_file;
251
0
    } else {
252
0
      name = getCCUtil()->language_data_path_prefix;
253
0
      name += user_words_suffix;
254
0
    }
255
0
    if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
256
0
                                          Trie::RRP_REVERSE_IF_HAS_RTL)) {
257
0
      tprintf("Error: failed to load %s\n", name.c_str());
258
0
      delete trie_ptr;
259
0
    } else {
260
0
      dawgs_.push_back(trie_ptr);
261
0
    }
262
0
  }
263
264
2
  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
265
0
    Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
266
0
                              dawg_debug_level);
267
0
    trie_ptr->initialize_patterns(&(getUnicharset()));
268
0
    if (!user_patterns_file.empty()) {
269
0
      name = user_patterns_file;
270
0
    } else {
271
0
      name = getCCUtil()->language_data_path_prefix;
272
0
      name += user_patterns_suffix;
273
0
    }
274
0
    if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
275
0
      tprintf("Error: failed to load %s\n", name.c_str());
276
0
      delete trie_ptr;
277
0
    } else {
278
0
      dawgs_.push_back(trie_ptr);
279
0
    }
280
0
  }
281
282
2
  document_words_ =
283
2
      new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
284
2
  dawgs_.push_back(document_words_);
285
286
  // This dawg is temporary and should not be searched by letter_is_ok.
287
2
  pending_words_ =
288
2
      new Trie(DAWG_TYPE_WORD, lang, NO_PERM, getUnicharset().size(), dawg_debug_level);
289
2
}
290
291
// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
292
2
void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
293
  // Load dawgs_.
294
2
  if (load_punc_dawg) {
295
2
    punc_dawg_ =
296
2
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
297
2
    if (punc_dawg_) {
298
2
      dawgs_.push_back(punc_dawg_);
299
2
    }
300
2
  }
301
2
  if (load_system_dawg) {
302
2
    Dawg *system_dawg =
303
2
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
304
2
    if (system_dawg) {
305
2
      dawgs_.push_back(system_dawg);
306
2
    }
307
2
  }
308
2
  if (load_number_dawg) {
309
2
    Dawg *number_dawg =
310
2
        dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
311
2
    if (number_dawg) {
312
2
      dawgs_.push_back(number_dawg);
313
2
    }
314
2
  }
315
316
  // stolen from Dict::Load (but needs params_ from Tesseract
317
  // langdata/config/api):
318
2
  std::string name;
319
2
  if (!user_words_suffix.empty() || !user_words_file.empty()) {
320
0
    Trie *trie_ptr =
321
0
        new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
322
0
    if (!user_words_file.empty()) {
323
0
      name = user_words_file;
324
0
    } else {
325
0
      name = getCCUtil()->language_data_path_prefix;
326
0
      name += user_words_suffix;
327
0
    }
328
0
    if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
329
0
                                          Trie::RRP_REVERSE_IF_HAS_RTL)) {
330
0
      tprintf("Error: failed to load %s\n", name.c_str());
331
0
      delete trie_ptr;
332
0
    } else {
333
0
      dawgs_.push_back(trie_ptr);
334
0
    }
335
0
  }
336
337
2
  if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
338
0
    Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
339
0
                              dawg_debug_level);
340
0
    trie_ptr->initialize_patterns(&(getUnicharset()));
341
0
    if (!user_patterns_file.empty()) {
342
0
      name = user_patterns_file;
343
0
    } else {
344
0
      name = getCCUtil()->language_data_path_prefix;
345
0
      name += user_patterns_suffix;
346
0
    }
347
0
    if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
348
0
      tprintf("Error: failed to load %s\n", name.c_str());
349
0
      delete trie_ptr;
350
0
    } else {
351
0
      dawgs_.push_back(trie_ptr);
352
0
    }
353
0
  }
354
2
}
355
356
// Completes the loading process after Load() and/or LoadLSTM().
357
// Returns false if no dictionaries were loaded.
358
4
bool Dict::FinishLoad() {
359
4
  if (dawgs_.empty()) {
360
0
    return false;
361
0
  }
362
  // Construct a list of corresponding successors for each dawg. Each entry, i,
363
  // in the successors_ vector is a vector of integers that represent the
364
  // indices into the dawgs_ vector of the successors for dawg i.
365
4
  successors_.reserve(dawgs_.size());
366
16
  for (auto dawg : dawgs_) {
367
16
    auto *lst = new SuccessorList();
368
84
    for (unsigned j = 0; j < dawgs_.size(); ++j) {
369
68
      const Dawg *other = dawgs_[j];
370
68
      if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
371
68
          kDawgSuccessors[dawg->type()][other->type()]) {
372
24
        lst->push_back(j);
373
24
      }
374
68
    }
375
16
    successors_.push_back(lst);
376
16
  }
377
4
  return true;
378
4
}
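
To make the successor-table construction above concrete: for each dawg i, FinishLoad() records the indices j of the dawgs of the same language whose type may follow dawg i's type according to kDawgSuccessors. Below is a minimal standalone sketch with toy types and a toy compatibility matrix; these are assumptions for illustration only, not the real Tesseract definitions.

#include <iostream>
#include <string>
#include <vector>

// Toy stand-ins for the real Tesseract types.
enum ToyDawgType { TOY_PUNCTUATION = 0, TOY_WORD = 1, TOY_NUMBER = 2 };
struct ToyDawg {
  ToyDawgType type;
  std::string lang;
};

// Hypothetical compatibility matrix: kToySuccessors[a][b] means a dawg of
// type b may follow a dawg of type a (mirrors kDawgSuccessors).
const bool kToySuccessors[3][3] = {
    {false, true, true},   // after punctuation: word or number
    {true, false, false},  // after word: punctuation
    {true, false, false},  // after number: punctuation
};

int main() {
  std::vector<ToyDawg> dawgs = {
      {TOY_PUNCTUATION, "eng"}, {TOY_WORD, "eng"}, {TOY_NUMBER, "eng"}};
  // successors[i] lists the indices of the dawgs that may follow dawgs[i],
  // exactly as FinishLoad() builds successors_.
  std::vector<std::vector<unsigned>> successors(dawgs.size());
  for (unsigned i = 0; i < dawgs.size(); ++i) {
    for (unsigned j = 0; j < dawgs.size(); ++j) {
      if (dawgs[i].lang == dawgs[j].lang &&
          kToySuccessors[dawgs[i].type][dawgs[j].type]) {
        successors[i].push_back(j);
      }
    }
  }
  for (unsigned i = 0; i < successors.size(); ++i) {
    std::cout << "dawg " << i << " ->";
    for (unsigned j : successors[i]) {
      std::cout << ' ' << j;
    }
    std::cout << '\n';
  }
  return 0;
}
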
379
380
2
void Dict::End() {
381
2
  if (dawgs_.empty()) {
382
0
    return; // Not safe to call twice.
383
0
  }
384
6
  for (auto &dawg : dawgs_) {
385
6
    if (!dawg_cache_->FreeDawg(dawg)) {
386
0
      delete dawg;
387
0
    }
388
6
  }
389
2
  dawg_cache_->FreeDawg(bigram_dawg_);
390
2
  if (dawg_cache_is_ours_) {
391
0
    delete dawg_cache_;
392
0
    dawg_cache_ = nullptr;
393
0
  }
394
6
  for (auto successor : successors_) {
395
6
    delete successor;
396
6
  }
397
2
  dawgs_.clear();
398
2
  successors_.clear();
399
2
  document_words_ = nullptr;
400
2
  delete pending_words_;
401
2
  pending_words_ = nullptr;
402
2
}
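
Taken together, the comments above describe a load lifecycle: SetupForLoad(), then Load() and/or LoadLSTM(), then FinishLoad(), and End() when the dictionaries are no longer needed. A hedged sketch of that sequence follows; the header names, the TessdataManager usage, and the prior loading of the unicharset into CCUtil are assumptions not shown in this report.

#include "ccutil.h"           // tesseract::CCUtil (header name assumed)
#include "dict.h"             // tesseract::Dict
#include "tessdatamanager.h"  // tesseract::TessdataManager (header name assumed)

void LoadDictionariesSketch(tesseract::CCUtil *ccutil,
                            tesseract::TessdataManager *data_file) {
  // Assumes ccutil's unicharset and params were already initialized from
  // the traineddata file; SetupForLoad() looks up unichar ids in it.
  tesseract::Dict dict(ccutil);
  dict.SetupForLoad(tesseract::Dict::GlobalDawgCache());
  dict.Load("eng", data_file);      // legacy-engine dawgs
  dict.LoadLSTM("eng", data_file);  // LSTM dawgs
  if (!dict.FinishLoad()) {
    // No dictionaries were loaded.
  }
  // ... recognition would use the dictionary here ...
  dict.End();  // returns the dawgs to the cache
}
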
403
404
// Returns true if in light of the current state unichar_id is allowed
405
// according to at least one of the dawgs in the dawgs_ vector.
406
// See more extensive comments in dict.h where this function is declared.
407
int Dict::def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset,
408
2.74M
                             UNICHAR_ID unichar_id, bool word_end) const {
409
2.74M
  auto *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
410
411
2.74M
  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
412
413
2.74M
  if (dawg_debug_level >= 3) {
414
0
    tesserr << "def_letter_is_okay: current unichar="
415
0
            << getUnicharset().debug_str(unichar_id)
416
0
            << " word_end=" << word_end
417
0
            << " num active dawgs=" << dawg_args->active_dawgs->size() << '\n';
418
0
  }
419
420
  // Do not accept words that contain kPatternUnicharID.
421
  // (otherwise pattern dawgs would not function correctly).
422
  // Do not accept words containing INVALID_UNICHAR_IDs.
423
2.74M
  if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) {
424
3.43k
    dawg_args->permuter = NO_PERM;
425
3.43k
    return NO_PERM;
426
3.43k
  }
427
428
  // Initialization.
429
2.73M
  PermuterType curr_perm = NO_PERM;
430
2.73M
  dawg_args->updated_dawgs->clear();
431
2.73M
  dawg_args->valid_end = false;
432
433
  // Go over the active_dawgs vector and insert DawgPosition records
434
  // with the updated ref (an edge with the corresponding unichar id) into
435
  // dawg_args->updated_pos.
436
5.12M
  for (unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {
437
2.38M
    const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
438
2.38M
    const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
439
2.38M
    const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
440
441
2.38M
    if (!dawg && !punc_dawg) {
442
      // shouldn't happen.
443
0
      tprintf("Received DawgPosition with no dawg or punc_dawg.  wth?\n");
444
0
      continue;
445
0
    }
446
2.38M
    if (!dawg) {
447
      // We're in the punctuation dawg.  A core dawg has not been chosen.
448
711k
      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
449
711k
      EDGE_REF punc_transition_edge =
450
711k
          punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
451
711k
      if (punc_transition_edge != NO_EDGE) {
452
        // Find all successors, and see which can transition.
453
711k
        const SuccessorList &slist = *(successors_[pos.punc_index]);
454
2.84M
        for (int sdawg_index : slist) {
455
2.84M
          const Dawg *sdawg = dawgs_[sdawg_index];
456
2.84M
          UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
457
2.84M
          EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
458
2.84M
          if (dawg_edge != NO_EDGE) {
459
795k
            if (dawg_debug_level >= 3) {
460
0
              tprintf("Letter found in dawg %d\n", sdawg_index);
461
0
            }
462
795k
            dawg_args->updated_dawgs->add_unique(
463
795k
                DawgPosition(sdawg_index, dawg_edge, pos.punc_index, punc_transition_edge, false),
464
795k
                dawg_debug_level > 0, "Append transition from punc dawg to current dawgs: ");
465
795k
            if (sdawg->permuter() > curr_perm) {
466
622k
              curr_perm = sdawg->permuter();
467
622k
            }
468
795k
            if (sdawg->end_of_word(dawg_edge) && punc_dawg->end_of_word(punc_transition_edge)) {
469
515k
              dawg_args->valid_end = true;
470
515k
            }
471
795k
          }
472
2.84M
        }
473
711k
      }
474
711k
      EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
475
711k
      if (punc_edge != NO_EDGE) {
476
9.62k
        if (dawg_debug_level >= 3) {
477
0
          tprintf("Letter found in punctuation dawg\n");
478
0
        }
479
9.62k
        dawg_args->updated_dawgs->add_unique(
480
9.62k
            DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), dawg_debug_level > 0,
481
9.62k
            "Extend punctuation dawg: ");
482
9.62k
        if (PUNC_PERM > curr_perm) {
483
9.62k
          curr_perm = PUNC_PERM;
484
9.62k
        }
485
9.62k
        if (punc_dawg->end_of_word(punc_edge)) {
486
0
          dawg_args->valid_end = true;
487
0
        }
488
9.62k
      }
489
711k
      continue;
490
711k
    }
491
492
1.67M
    if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
493
      // We can end the main word here.
494
      //  If we can continue on the punc ref, add that possibility.
495
1.02M
      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
496
1.02M
      EDGE_REF punc_edge =
497
1.02M
          punc_node == NO_EDGE ? NO_EDGE : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
498
1.02M
      if (punc_edge != NO_EDGE) {
499
140k
        dawg_args->updated_dawgs->add_unique(
500
140k
            DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, punc_edge, true),
501
140k
            dawg_debug_level > 0, "Return to punctuation dawg: ");
502
140k
        if (dawg->permuter() > curr_perm) {
503
135k
          curr_perm = dawg->permuter();
504
135k
        }
505
140k
        if (punc_dawg->end_of_word(punc_edge)) {
506
138k
          dawg_args->valid_end = true;
507
138k
        }
508
140k
      }
509
1.02M
    }
510
511
1.67M
    if (pos.back_to_punc) {
512
129k
      continue;
513
129k
    }
514
515
    // If we are dealing with the pattern dawg, look up all the
516
    // possible edges, not only for the exact unichar_id, but also
517
    // for all its character classes (alpha, digit, etc).
518
1.54M
    if (dawg->type() == DAWG_TYPE_PATTERN) {
519
0
      ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, &curr_perm);
520
      // There can't be any successors to dawg that is of type
521
      // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
522
0
      continue;
523
0
    }
524
525
    // Find the edge out of the node for the unichar_id.
526
1.54M
    NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
527
1.54M
    EDGE_REF edge =
528
1.54M
        (node == NO_EDGE)
529
1.54M
            ? NO_EDGE
530
1.54M
            : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
531
532
1.54M
    if (dawg_debug_level >= 3) {
533
0
      tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node, edge);
534
0
    }
535
536
1.54M
    if (edge != NO_EDGE) { // the unichar was found in the current dawg
537
136k
      if (dawg_debug_level >= 3) {
538
0
        tprintf("Letter found in dawg %d\n", pos.dawg_index);
539
0
      }
540
136k
      if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
541
0
        if (dawg_debug_level >= 3) {
542
0
          tprintf("Punctuation constraint not satisfied at end of word.\n");
543
0
        }
544
0
        continue;
545
0
      }
546
136k
      if (dawg->permuter() > curr_perm) {
547
124k
        curr_perm = dawg->permuter();
548
124k
      }
549
136k
      if (dawg->end_of_word(edge) &&
550
136k
          (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref))) {
551
95.6k
        dawg_args->valid_end = true;
552
95.6k
      }
553
136k
      dawg_args->updated_dawgs->add_unique(
554
136k
          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, false),
555
136k
          dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
556
136k
    }
557
1.54M
  } // end for
558
  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
559
  // or if we found the current letter in a non-punctuation dawg. This
560
  // allows preserving information on which dawg the "core" word came from.
561
  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
562
2.73M
  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
563
2.73M
      (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
564
2.73M
    dawg_args->permuter = curr_perm;
565
2.73M
  }
566
2.73M
  if (dawg_debug_level >= 2) {
567
0
    tprintf("Returning %d for permuter code for this character.\n", dawg_args->permuter);
568
0
  }
569
2.73M
  return dawg_args->permuter;
570
2.74M
}
571
572
void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHAR_ID unichar_id,
573
0
                               bool word_end, DawgArgs *dawg_args, PermuterType *curr_perm) const {
574
0
  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
575
  // Try to find the edge corresponding to the exact unichar_id and to all the
576
  // edges corresponding to the character class of unichar_id.
577
0
  std::vector<UNICHAR_ID> unichar_id_patterns;
578
0
  unichar_id_patterns.push_back(unichar_id);
579
0
  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
580
0
  for (int unichar_id_pattern : unichar_id_patterns) {
581
    // On the first iteration check all the outgoing edges.
582
    // On the second iteration check all self-loops.
583
0
    for (int k = 0; k < 2; ++k) {
584
0
      EDGE_REF edge = (k == 0)
585
0
                          ? dawg->edge_char_of(node, unichar_id_pattern, word_end)
586
0
                          : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_pattern, word_end);
587
0
      if (edge == NO_EDGE) {
588
0
        continue;
589
0
      }
590
0
      if (dawg_debug_level >= 3) {
591
0
        tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node,
592
0
                edge);
593
0
        tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
594
0
      }
595
0
      if (dawg->permuter() > *curr_perm) {
596
0
        *curr_perm = dawg->permuter();
597
0
      }
598
0
      if (dawg->end_of_word(edge)) {
599
0
        dawg_args->valid_end = true;
600
0
      }
601
0
      dawg_args->updated_dawgs->add_unique(
602
0
          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, pos.back_to_punc),
603
0
          dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
604
0
    }
605
0
  }
606
0
}
607
608
// Fill the given active_dawgs vector with dawgs that could contain the
609
// beginning of the word. If hyphenated() returns true, copy the entries
610
// from hyphen_active_dawgs_ instead.
611
358k
void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const {
612
358k
  if (hyphenated()) {
613
0
    *active_dawgs = hyphen_active_dawgs_;
614
0
    if (dawg_debug_level >= 3) {
615
0
      for (unsigned i = 0; i < hyphen_active_dawgs_.size(); ++i) {
616
0
        tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
617
0
                hyphen_active_dawgs_[i].dawg_index, hyphen_active_dawgs_[i].dawg_ref);
618
0
      }
619
0
    }
620
358k
  } else {
621
358k
    default_dawgs(active_dawgs, ambigs_mode);
622
358k
  }
623
358k
}
624
625
1.02M
void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_patterns) const {
626
1.02M
  bool punc_dawg_available = (punc_dawg_ != nullptr) &&
627
1.02M
                             punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
628
629
4.64M
  for (unsigned i = 0; i < dawgs_.size(); i++) {
630
3.62M
    if (dawgs_[i] != nullptr && !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
631
3.62M
      int dawg_ty = dawgs_[i]->type();
632
3.62M
      bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
633
3.62M
      if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
634
724k
        dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
635
724k
        if (dawg_debug_level >= 3) {
636
0
          tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
637
0
        }
638
2.89M
      } else if (!punc_dawg_available || !subsumed_by_punc) {
639
0
        dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
640
0
        if (dawg_debug_level >= 3) {
641
0
          tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
642
0
        }
643
0
      }
644
3.62M
    }
645
3.62M
  }
646
1.02M
}
647
648
7.50k
void Dict::add_document_word(const WERD_CHOICE &best_choice) {
649
  // Do not add hyphenated word parts to the document dawg.
650
  // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
651
  // called when the first part of the hyphenated word is
652
  // discovered and while the second part of the word is recognized.
653
  // hyphen_word_ is cleared in cc_recg() before the next word on
654
  // the line is recognized.
655
7.50k
  if (hyphen_word_) {
656
0
    return;
657
0
  }
658
659
7.50k
  int stringlen = best_choice.length();
660
661
7.50k
  if (valid_word(best_choice) || stringlen < 2) {
662
6.10k
    return;
663
6.10k
  }
664
665
  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
666
1.39k
  if (best_choice.length() >= kDocDictMaxRepChars) {
667
1.04k
    int num_rep_chars = 1;
668
1.04k
    UNICHAR_ID uch_id = best_choice.unichar_id(0);
669
7.10k
    for (unsigned i = 1; i < best_choice.length(); ++i) {
670
6.40k
      if (best_choice.unichar_id(i) != uch_id) {
671
4.62k
        num_rep_chars = 1;
672
4.62k
        uch_id = best_choice.unichar_id(i);
673
4.62k
      } else {
674
1.78k
        ++num_rep_chars;
675
1.78k
        if (num_rep_chars == kDocDictMaxRepChars) {
676
340
          return;
677
340
        }
678
1.78k
      }
679
6.40k
    }
680
1.04k
  }
681
682
1.05k
  if (best_choice.certainty() < doc_dict_certainty_threshold || stringlen == 2) {
683
1.05k
    if (best_choice.certainty() < doc_dict_pending_threshold) {
684
1.05k
      return;
685
1.05k
    }
686
687
0
    if (!pending_words_->word_in_dawg(best_choice)) {
688
0
      if (stringlen > 2 ||
689
0
          (stringlen == 2 && getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
690
0
           getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
691
0
        pending_words_->add_word_to_dawg(best_choice);
692
0
      }
693
0
      return;
694
0
    }
695
0
  }
696
697
2
  if (save_doc_words) {
698
0
    std::string filename(getCCUtil()->imagefile);
699
0
    filename += ".doc";
700
0
    FILE *doc_word_file = fopen(filename.c_str(), "a");
701
0
    if (doc_word_file == nullptr) {
702
0
      tprintf("Error: Could not open file %s\n", filename.c_str());
703
0
      ASSERT_HOST(doc_word_file);
704
0
    }
705
0
    fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
706
0
    fclose(doc_word_file);
707
0
  }
708
2
  document_words_->add_word_to_dawg(best_choice);
709
2
}
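
A byte-level sketch of the repeated-character filter in add_document_word() above: a candidate word is discarded once any unichar repeats kDocDictMaxRepChars times in a row. The constant is defined elsewhere; 4 is assumed here purely for illustration.

#include <string>

// Returns true if any character occurs max_rep times in a row, mirroring
// the UNICHAR_ID loop above (max_rep stands in for kDocDictMaxRepChars).
bool HasRepeatedRun(const std::string &word, int max_rep = 4 /* assumed */) {
  if (static_cast<int>(word.size()) < max_rep) {
    return false;
  }
  int num_rep_chars = 1;
  char prev = word[0];
  for (size_t i = 1; i < word.size(); ++i) {
    if (word[i] != prev) {
      num_rep_chars = 1;
      prev = word[i];
    } else if (++num_rep_chars == max_rep) {
      return true;  // e.g. "loooop" would be rejected
    }
  }
  return false;
}
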
710
711
void Dict::adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency,
712
331k
                       float additional_adjust, bool modify_rating, bool debug) {
713
331k
  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
714
331k
                 word->GetTopScriptID() == getUnicharset().han_sid());
715
331k
  bool case_is_ok = (is_han || case_ok(*word));
716
331k
  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
717
718
331k
  float adjust_factor = additional_adjust;
719
331k
  float new_rating = word->rating();
720
331k
  new_rating += kRatingPad;
721
331k
  const char *xheight_triggered = "";
722
331k
  if (word->length() > 1) {
723
    // Calculate x-height and y-offset consistency penalties.
724
297k
    switch (xheight_consistency) {
725
209k
      case XH_INCONSISTENT:
726
209k
        adjust_factor += xheight_penalty_inconsistent;
727
209k
        xheight_triggered = ", xhtBAD";
728
209k
        break;
729
51.8k
      case XH_SUBNORMAL:
730
51.8k
        adjust_factor += xheight_penalty_subscripts;
731
51.8k
        xheight_triggered = ", xhtSUB";
732
51.8k
        break;
733
36.3k
      case XH_GOOD:
734
        // leave the factor alone - all good!
735
36.3k
        break;
736
297k
    }
737
    // TODO(eger): if nonword is true, but there is a "core" that is a dict
738
    // word, negate nonword status.
739
297k
  } else {
740
34.0k
    if (debug) {
741
0
      tprintf("Consistency could not be calculated.\n");
742
0
    }
743
34.0k
  }
744
331k
  if (debug) {
745
0
    tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", word->unichar_string().c_str(),
746
0
            word->rating(), xheight_triggered);
747
0
  }
748
749
331k
  if (nonword) { // non-dictionary word
750
331k
    if (case_is_ok && punc_is_ok) {
751
0
      adjust_factor += segment_penalty_dict_nonword;
752
0
      new_rating *= adjust_factor;
753
0
      if (debug) {
754
0
        tprintf(", W");
755
0
      }
756
331k
    } else {
757
331k
      adjust_factor += segment_penalty_garbage;
758
331k
      new_rating *= adjust_factor;
759
331k
      if (debug) {
760
0
        if (!case_is_ok) {
761
0
          tprintf(", C");
762
0
        }
763
0
        if (!punc_is_ok) {
764
0
          tprintf(", P");
765
0
        }
766
0
      }
767
331k
    }
768
331k
  } else { // dictionary word
769
0
    if (case_is_ok) {
770
0
      if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
771
0
        word->set_permuter(FREQ_DAWG_PERM);
772
0
        adjust_factor += segment_penalty_dict_frequent_word;
773
0
        new_rating *= adjust_factor;
774
0
        if (debug) {
775
0
          tprintf(", F");
776
0
        }
777
0
      } else {
778
0
        adjust_factor += segment_penalty_dict_case_ok;
779
0
        new_rating *= adjust_factor;
780
0
        if (debug) {
781
0
          tprintf(", ");
782
0
        }
783
0
      }
784
0
    } else {
785
0
      adjust_factor += segment_penalty_dict_case_bad;
786
0
      new_rating *= adjust_factor;
787
0
      if (debug) {
788
0
        tprintf(", C");
789
0
      }
790
0
    }
791
0
  }
792
331k
  new_rating -= kRatingPad;
793
331k
  if (modify_rating) {
794
0
    word->set_rating(new_rating);
795
0
  }
796
331k
  if (debug) {
797
0
    tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
798
0
  }
799
331k
  word->set_adjust_factor(adjust_factor);
800
331k
}
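
A worked illustration of the scoring in adjust_word() above: the rating is padded by kRatingPad, multiplied by the accumulated adjust_factor, then unpadded. The penalty values mirror the defaults visible in the constructor; kRatingPad is defined elsewhere and 4.0 is only an assumed value.

#include <iostream>

int main() {
  const float kRatingPad = 4.0f;                     // assumed value
  const float xheight_penalty_inconsistent = 0.25f;  // default shown above
  const float segment_penalty_garbage = 1.50f;       // default shown above

  float rating = 10.0f;        // hypothetical incoming word rating
  float adjust_factor = 0.0f;  // additional_adjust taken as 0
  adjust_factor += xheight_penalty_inconsistent;  // XH_INCONSISTENT word
  adjust_factor += segment_penalty_garbage;       // non-word, bad case/punc

  float new_rating = (rating + kRatingPad) * adjust_factor - kRatingPad;
  std::cout << adjust_factor << " -> " << new_rating << '\n';  // 1.75 -> 20.5
  return 0;
}
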
801
802
101k
int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
803
101k
  const WERD_CHOICE *word_ptr = &word;
804
101k
  WERD_CHOICE temp_word(word.unicharset());
805
101k
  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
806
0
    copy_hyphen_info(&temp_word);
807
0
    temp_word += word;
808
0
    word_ptr = &temp_word;
809
0
  }
810
101k
  if (word_ptr->empty()) {
811
0
    return NO_PERM;
812
0
  }
813
  // Allocate vectors for holding current and updated
814
  // active_dawgs and initialize them.
815
101k
  DawgPositionVector active_dawgs[2];
816
101k
  init_active_dawgs(&(active_dawgs[0]), false);
817
101k
  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
818
101k
  int last_index = word_ptr->length() - 1;
819
  // Call letter_is_okay for each letter in the word.
820
215k
  for (int i = hyphen_base_size(); i <= last_index; ++i) {
821
147k
    if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), word_ptr->unichar_id(i),
822
147k
                                   i == last_index))) {
823
32.1k
      break;
824
32.1k
    }
825
    // Swap active_dawgs, constraints with the corresponding updated vector.
826
114k
    if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
827
93.6k
      dawg_args.updated_dawgs = &(active_dawgs[0]);
828
93.6k
      ++(dawg_args.active_dawgs);
829
93.6k
    } else {
830
21.2k
      ++(dawg_args.updated_dawgs);
831
21.2k
      dawg_args.active_dawgs = &(active_dawgs[0]);
832
21.2k
    }
833
114k
  }
834
101k
  return valid_word_permuter(dawg_args.permuter, numbers_ok) ? dawg_args.permuter : NO_PERM;
835
101k
}
836
837
0
bool Dict::valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const {
838
0
  if (bigram_dawg_ == nullptr) {
839
0
    return false;
840
0
  }
841
842
  // Extract the core word from the middle of each word with any digits
843
  //         replaced with question marks.
844
0
  unsigned w1start, w1end, w2start, w2end;
845
0
  word1.punct_stripped(&w1start, &w1end);
846
0
  word2.punct_stripped(&w2start, &w2end);
847
848
  // We don't want to penalize a single guillemet, hyphen, etc.
849
  // But our bigram list doesn't have any information about punctuation.
850
0
  if (w1start >= w1end) {
851
0
    return word1.length() < 3;
852
0
  }
853
0
  if (w2start >= w2end) {
854
0
    return word2.length() < 3;
855
0
  }
856
857
0
  const UNICHARSET &uchset = getUnicharset();
858
0
  std::vector<UNICHAR_ID> bigram_string;
859
0
  bigram_string.reserve(w1end + w2end + 1);
860
0
  for (auto i = w1start; i < w1end; i++) {
861
0
    const auto &normed_ids = getUnicharset().normed_ids(word1.unichar_id(i));
862
0
    if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
863
0
      bigram_string.push_back(question_unichar_id_);
864
0
    } else {
865
0
      bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
866
0
    }
867
0
  }
868
0
  bigram_string.push_back(UNICHAR_SPACE);
869
0
  for (auto i = w2start; i < w2end; i++) {
870
0
    const auto &normed_ids = getUnicharset().normed_ids(word2.unichar_id(i));
871
0
    if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
872
0
      bigram_string.push_back(question_unichar_id_);
873
0
    } else {
874
0
      bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
875
0
    }
876
0
  }
877
0
  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
878
0
  for (int i : bigram_string) {
879
0
    normalized_word.append_unichar_id_space_allocated(i, 1, 0.0f, 0.0f);
880
0
  }
881
0
  return bigram_dawg_->word_in_dawg(normalized_word);
882
0
}
883
884
331k
bool Dict::valid_punctuation(const WERD_CHOICE &word) {
885
331k
  if (word.empty()) {
886
0
    return NO_PERM;
887
0
  }
888
331k
  WERD_CHOICE new_word(word.unicharset());
889
331k
  auto last_index = word.length() - 1;
890
331k
  int new_len;
891
3.35M
  for (unsigned i = 0; i <= last_index; ++i) {
892
3.05M
    UNICHAR_ID unichar_id = (word.unichar_id(i));
893
3.05M
    if (getUnicharset().get_ispunctuation(unichar_id)) {
894
845k
      new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
895
2.20M
    } else if (!getUnicharset().get_isalpha(unichar_id) &&
896
2.20M
               !getUnicharset().get_isdigit(unichar_id)) {
897
29.6k
      return false; // neither punc, nor alpha, nor digit
898
2.17M
    } else if ((new_len = new_word.length()) == 0 ||
899
2.17M
               new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
900
645k
      new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
901
645k
    }
902
3.05M
  }
903
301k
  for (unsigned i = 0; i < dawgs_.size(); ++i) {
904
0
    if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
905
0
        dawgs_[i]->word_in_dawg(new_word)) {
906
0
      return true;
907
0
    }
908
0
  }
909
301k
  return false;
910
301k
}
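
A byte-level analogue of the pattern construction in valid_punctuation() above: punctuation characters are kept, every run of letters or digits collapses to a single placeholder ('*' stands in for Dawg::kPatternUnicharID), and anything else rejects the word. The real code works on UNICHAR_IDs and then checks the resulting pattern word against the punctuation dawgs.

#include <cctype>
#include <iostream>
#include <string>

std::string PuncPattern(const std::string &word) {
  std::string pattern;
  for (unsigned char c : word) {
    if (std::ispunct(c)) {
      pattern += static_cast<char>(c);
    } else if (!std::isalnum(c)) {
      return "";  // neither punctuation, alpha, nor digit: reject
    } else if (pattern.empty() || pattern.back() != '*') {
      pattern += '*';
    }
  }
  return pattern;
}

int main() {
  std::cout << PuncPattern("(word).") << '\n';  // prints "(*)."
  std::cout << PuncPattern("12,345") << '\n';   // prints "*,*"
  return 0;
}
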
911
912
/// Returns true if the language is space-delimited (not Chinese, Japanese, or Thai).
913
1
bool Dict::IsSpaceDelimitedLang() const {
914
1
  const UNICHARSET &u_set = getUnicharset();
915
1
  if (u_set.han_sid() > 0) {
916
0
    return false;
917
0
  }
918
1
  if (u_set.katakana_sid() > 0) {
919
0
    return false;
920
0
  }
921
1
  if (u_set.thai_sid() > 0) {
922
0
    return false;
923
0
  }
924
1
  return true;
925
1
}
926
927
} // namespace tesseract