Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccmain/tessedit.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        tessedit.cpp  (Formerly tessedit.c)
3
 * Description: (Previously) Main program for merge of tess and editor.
4
 *              Now just code to load the language model and various
5
 *              engine-specific data files.
6
 * Author:      Ray Smith
7
 *
8
 * (C) Copyright 1992, Hewlett-Packard Ltd.
9
 ** Licensed under the Apache License, Version 2.0 (the "License");
10
 ** you may not use this file except in compliance with the License.
11
 ** You may obtain a copy of the License at
12
 ** http://www.apache.org/licenses/LICENSE-2.0
13
 ** Unless required by applicable law or agreed to in writing, software
14
 ** distributed under the License is distributed on an "AS IS" BASIS,
15
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 ** See the License for the specific language governing permissions and
17
 ** limitations under the License.
18
 *
19
 **********************************************************************/
20
21
// Include automatically generated configuration file if running autoconf.
22
#ifdef HAVE_CONFIG_H
23
#  include "config_auto.h"
24
#endif
25
26
#include "control.h"
27
#include "matchdefs.h"
28
#include "pageres.h"
29
#include "params.h"
30
#include "stopper.h"
31
#include "tesseractclass.h"
32
#include "tessvars.h"
33
#include "tprintf.h"
34
#ifndef DISABLED_LEGACY_ENGINE
35
#  include "chop.h"
36
#  include "intmatcher.h"
37
#  include "reject.h"
38
#endif
39
#include "lstmrecognizer.h"
40
41
namespace tesseract {
42
43
// Read a "config" file containing a set of variable, value pairs.
44
// Searches the standard places: tessdata/configs, tessdata/tessconfigs
45
// and also accepts a relative or absolute path name.
46
0
void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
47
0
  std::string path = datadir;
48
0
  path += "configs/";
49
0
  path += filename;
50
0
  FILE *fp;
51
0
  if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
52
0
    fclose(fp);
53
0
  } else {
54
0
    path = datadir;
55
0
    path += "tessconfigs/";
56
0
    path += filename;
57
0
    if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
58
0
      fclose(fp);
59
0
    } else {
60
0
      path = filename;
61
0
    }
62
0
  }
63
0
  ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
64
0
}
65
66
// Returns false if a unicharset file for the specified language was not found
67
// or was invalid.
68
// This function initializes TessdataManager. After TessdataManager is
69
// no longer needed, TessdataManager::End() should be called.
70
//
71
// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
72
// it is OEM_DEFAULT, in which case the value of the variable will be obtained
73
// from the language-specific config file (stored in [lang].traineddata), from
74
// the config files specified on the command line, or left at the value chosen
75
// from the engines available in the traineddata if no config sets it.
76
bool Tesseract::init_tesseract_lang_data(const std::string &arg0,
77
                                         const std::string &language, OcrEngineMode oem,
78
                                         char **configs, int configs_size,
79
                                         const std::vector<std::string> *vars_vec,
80
                                         const std::vector<std::string> *vars_values,
81
4
                                         bool set_only_non_debug_params, TessdataManager *mgr) {
82
  // Set the language data path prefix
83
4
  lang = !language.empty() ? language : "eng";
84
4
  language_data_path_prefix = datadir;
85
4
  language_data_path_prefix += lang;
86
4
  language_data_path_prefix += ".";
87
88
  // Initialize TessdataManager.
89
4
  std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
90
4
  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
91
0
    tprintf("Error opening data file %s\n", tessdata_path.c_str());
92
0
    tprintf(
93
0
        "Please make sure the TESSDATA_PREFIX environment variable is set"
94
0
        " to your \"tessdata\" directory.\n");
95
0
    return false;
96
0
  }
97
#ifdef DISABLED_LEGACY_ENGINE
98
  tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
99
#else
100
4
  if (oem == OEM_DEFAULT) {
101
    // Set the engine mode from availability, which can then be overridden by
102
    // the config file when we read it below.
103
4
    if (!mgr->IsLSTMAvailable()) {
104
0
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
105
4
    } else if (!mgr->IsBaseAvailable()) {
106
0
      tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
107
4
    } else {
108
4
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
109
4
    }
110
4
  }
111
4
#endif // ndef DISABLED_LEGACY_ENGINE
112
113
  // If a language specific config file (lang.config) exists, load it in.
114
4
  TFile fp;
115
4
  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
116
0
    ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());
117
0
  }
118
119
4
  SetParamConstraint set_params_constraint =
120
4
      set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
121
  // Load tesseract variables from config files. This is done after loading
122
  // language-specific variables from [lang].traineddata file, so that custom
123
  // config files can override values in [lang].traineddata file.
124
4
  for (int i = 0; i < configs_size; ++i) {
125
0
    read_config_file(configs[i], set_params_constraint);
126
0
  }
127
128
  // Set params specified in vars_vec (done after setting params from config
129
  // files, so that params in vars_vec can override those from files).
130
4
  if (vars_vec != nullptr && vars_values != nullptr) {
131
0
    for (unsigned i = 0; i < vars_vec->size(); ++i) {
132
0
      if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
133
0
                                set_params_constraint, this->params())) {
134
0
        tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
135
0
      }
136
0
    }
137
0
  }
138
139
4
  if (!tessedit_write_params_to_file.empty()) {
140
0
    FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
141
0
    if (params_file != nullptr) {
142
0
      ParamUtils::PrintParams(params_file, this->params());
143
0
      fclose(params_file);
144
0
    } else {
145
0
      tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
146
0
    }
147
0
  }
148
149
4
#ifndef DISABLED_LEGACY_ENGINE
150
  // Determine which ocr engine(s) should be loaded and used for recognition.
151
4
  if (oem != OEM_DEFAULT) {
152
0
    tessedit_ocr_engine_mode.set_value(oem);
153
0
  }
154
4
#endif
155
156
  // If we are only loading the config file (and so not planning on doing any
157
  // recognition) then there's nothing else to do here.
158
4
  if (tessedit_init_config_only) {
159
0
    return true;
160
0
  }
161
162
// The various OcrEngineMode settings (see tesseract/publictypes.h) determine
163
// which engine-specific data files need to be loaded. If LSTM_ONLY is
164
// requested, the base Tesseract files are *Not* required.
165
#ifdef DISABLED_LEGACY_ENGINE
166
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
167
#else
168
4
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
169
4
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
170
4
#endif // ndef DISABLED_LEGACY_ENGINE
171
4
    if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
172
4
      lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
173
4
      ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
174
4
    } else {
175
0
      tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
176
0
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
177
0
    }
178
4
  }
179
180
  // Load the unicharset
181
4
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
182
    // Avoid requiring a unicharset when we aren't running base tesseract.
183
0
    unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
184
0
  }
185
4
#ifndef DISABLED_LEGACY_ENGINE
186
4
  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
187
0
    tprintf(
188
0
        "Error: Tesseract (legacy) engine requested, but components are "
189
0
        "not present in %s!!\n",
190
0
        tessdata_path.c_str());
191
0
    return false;
192
0
  }
193
4
#endif // ndef DISABLED_LEGACY_ENGINE
194
4
  if (unicharset.size() > MAX_NUM_CLASSES) {
195
0
    tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
196
0
    return false;
197
0
  }
198
4
  right_to_left_ = unicharset.major_right_to_left();
199
200
4
#ifndef DISABLED_LEGACY_ENGINE
201
202
  // Setup initial unichar ambigs table and read universal ambigs.
203
4
  UNICHARSET encoder_unicharset;
204
4
  encoder_unicharset.CopyFrom(unicharset);
205
4
  unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
206
4
  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
207
208
4
  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
209
4
    unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
210
4
                                     use_ambigs_for_adaption, &unicharset);
211
4
  }
212
213
  // Init ParamsModel.
214
  // Load pass1 and pass2 weights (for now these two sets are the same, but in
215
  // the future separate sets of weights can be generated).
216
12
  for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
217
8
    language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
218
8
    if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
219
0
      if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
220
0
        return false;
221
0
      }
222
0
    }
223
8
  }
224
4
#endif // ndef DISABLED_LEGACY_ENGINE
225
226
4
  return true;
227
4
}
228
229
// Helper returns true if the given string is in the vector of strings.
230
8
static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
231
8
  for (const auto &i : str_list) {
232
0
    if (i == str) {
233
0
      return true;
234
0
    }
235
0
  }
236
8
  return false;
237
8
}
238
239
// Parse a string of the form [~]<lang>[+[~]<lang>]*.
240
// Langs with no prefix get appended to to_load, provided they
241
// are not in there already.
242
// Langs with ~ prefix get appended to not_to_load, provided they are not in
243
// there already.
244
void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
245
8
                                    std::vector<std::string> *not_to_load) {
246
8
  std::string remains(lang_str);
247
  // Look whether the model file uses a prefix which must be applied to
248
  // included model files as well.
249
8
  std::string prefix;
250
8
  size_t found = lang.find_last_of('/');
251
8
  if (found != std::string::npos) {
252
    // A prefix was found.
253
0
    prefix = lang.substr(0, found + 1);
254
0
  }
255
12
  while (!remains.empty()) {
256
    // Find the start of the lang code and which vector to add to.
257
4
    const char *start = remains.c_str();
258
4
    while (*start == '+') {
259
0
      ++start;
260
0
    }
261
4
    std::vector<std::string> *target = to_load;
262
4
    if (*start == '~') {
263
0
      target = not_to_load;
264
0
      ++start;
265
0
    }
266
    // Find the index of the end of the lang code in string start.
267
4
    int end = strlen(start);
268
4
    const char *plus = strchr(start, '+');
269
4
    if (plus != nullptr && plus - start < end) {
270
0
      end = plus - start;
271
0
    }
272
4
    std::string lang_code(start);
273
4
    lang_code.resize(end);
274
4
    std::string next(start + end);
275
4
    remains = std::move(next);
276
4
    lang_code = prefix + lang_code;
277
    // Check whether lang_code is already in the target vector and add.
278
4
    if (!IsStrInList(lang_code, *target)) {
279
4
      target->push_back(lang_code);
280
4
    }
281
4
  }
282
8
}
283
284
// Initialize for potentially a set of languages defined by the language
285
// string and recursively any additional languages required by any language
286
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
287
// See init_tesseract_internal for args.
288
int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,
289
                              const std::string &language, OcrEngineMode oem, char **configs,
290
                              int configs_size, const std::vector<std::string> *vars_vec,
291
                              const std::vector<std::string> *vars_values,
292
4
                              bool set_only_non_debug_params, TessdataManager *mgr) {
293
4
  std::vector<std::string> langs_to_load;
294
4
  std::vector<std::string> langs_not_to_load;
295
4
  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
296
297
4
  for (auto *lang : sub_langs_) {
298
0
    delete lang;
299
0
  }
300
301
  // Set the basename, compute the data directory.
302
4
  main_setup(arg0, textbase);
303
304
4
  sub_langs_.clear();
305
  // Find the first loadable lang and load into this.
306
  // Add any languages that this language requires
307
4
  bool loaded_primary = false;
308
  // Load the rest into sub_langs_.
309
  // WARNING: A range based for loop does not work here because langs_to_load
310
  // might be changed in the loop when a new submodel is found.
311
8
  for (size_t lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
312
4
    auto &lang_to_load = langs_to_load[lang_index];
313
4
    if (!IsStrInList(lang_to_load, langs_not_to_load)) {
314
4
      const char *lang_str = lang_to_load.c_str();
315
4
      Tesseract *tess_to_init;
316
4
      if (!loaded_primary) {
317
4
        tess_to_init = this;
318
4
      } else {
319
0
        tess_to_init = new Tesseract;
320
0
        tess_to_init->main_setup(arg0, textbase);
321
0
      }
322
323
4
      int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
324
4
                                                         configs_size, vars_vec, vars_values,
325
4
                                                         set_only_non_debug_params, mgr);
326
      // Forget that language, but keep any reader we were given.
327
4
      mgr->Clear();
328
329
4
      if (!loaded_primary) {
330
4
        if (result < 0) {
331
0
          tprintf("Failed loading language '%s'\n", lang_str);
332
4
        } else {
333
4
          ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
334
4
                              &langs_not_to_load);
335
4
          loaded_primary = true;
336
4
        }
337
4
      } else {
338
0
        if (result < 0) {
339
0
          tprintf("Failed loading language '%s'\n", lang_str);
340
0
          delete tess_to_init;
341
0
        } else {
342
0
          sub_langs_.push_back(tess_to_init);
343
          // Add any languages that this language requires
344
0
          ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
345
0
                              &langs_not_to_load);
346
0
        }
347
0
      }
348
4
    }
349
4
  }
350
4
  if (!loaded_primary && !langs_to_load.empty()) {
351
0
    tprintf("Tesseract couldn't load any languages!\n");
352
0
    return -1; // Couldn't load any language!
353
0
  }
354
4
#ifndef DISABLED_LEGACY_ENGINE
355
4
  if (!sub_langs_.empty()) {
356
    // In multilingual mode word ratings have to be directly comparable,
357
    // so use the same language model weights for all languages:
358
    // use the primary language's params model if
359
    // tessedit_use_primary_params_model is set,
360
    // otherwise use default language model weights.
361
0
    if (tessedit_use_primary_params_model) {
362
0
      for (auto &sub_lang : sub_langs_) {
363
0
        sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
364
0
      }
365
0
      tprintf("Using params model of the primary language\n");
366
0
    } else {
367
0
      this->language_model_->getParamsModel().Clear();
368
0
      for (auto &sub_lang : sub_langs_) {
369
0
        sub_lang->language_model_->getParamsModel().Clear();
370
0
      }
371
0
    }
372
0
  }
373
374
4
  SetupUniversalFontIds();
375
4
#endif // ndef DISABLED_LEGACY_ENGINE
376
4
  return 0;
377
4
}
378
379
// Common initialization for a single language.
380
// arg0 is the datapath for the tessdata directory, which could be the
381
// path of the tessdata directory with no trailing /, or (if tessdata
382
// lives in the same directory as the executable), the path of the executable,
383
// hence the name arg0.
384
// textbase is an optional output file basename (used only for training)
385
// language is the language code to load.
386
// oem controls which engine(s) will operate on the image
387
// configs (argv) is an array of config filenames to load variables from.
388
// May be nullptr.
389
// configs_size (argc) is the number of elements in configs.
390
// vars_vec is an optional vector of variables to set.
391
// vars_values is an optional corresponding vector of values for the variables
392
// in vars_vec.
393
// If set_only_non_debug_params is true, only params that do not contain
394
// "debug" in the name will be set.
395
int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,
396
                                       const std::string &language, OcrEngineMode oem,
397
                                       char **configs, int configs_size,
398
                                       const std::vector<std::string> *vars_vec,
399
                                       const std::vector<std::string> *vars_values,
400
4
                                       bool set_only_non_debug_params, TessdataManager *mgr) {
401
4
  if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,
402
4
                                vars_values, set_only_non_debug_params, mgr)) {
403
0
    return -1;
404
0
  }
405
4
  if (tessedit_init_config_only) {
406
0
    return 0;
407
0
  }
408
  // If only LSTM will be used, skip loading Tesseract classifier's
409
  // pre-trained templates and dictionary.
410
4
  bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
411
4
  program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
412
4
  return 0; // Normal exit
413
4
}
414
415
#ifndef DISABLED_LEGACY_ENGINE
416
417
// Helper builds the all_fonts table by adding new fonts from new_fonts.
418
static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,
419
4
                         UnicityTable<FontInfo> *all_fonts) {
420
1.71k
  for (int i = 0; i < new_fonts.size(); ++i) {
421
    // UnicityTable uniques as we go.
422
1.70k
    all_fonts->push_back(new_fonts.at(i));
423
1.70k
  }
424
4
}
425
426
// Helper assigns an id to lang_fonts using the index in all_fonts table.
427
4
static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {
428
1.71k
  for (int i = 0; i < lang_fonts->size(); ++i) {
429
1.70k
    auto index = all_fonts.get_index(lang_fonts->at(i));
430
1.70k
    lang_fonts->at(i).universal_id = index;
431
1.70k
  }
432
4
}
433
434
// Set the universal_id member of each font to be unique among all
435
// instances of the same font loaded.
436
4
void Tesseract::SetupUniversalFontIds() {
437
  // Note that we can get away with bitwise copying FontInfo in
438
  // all_fonts, as it is a temporary structure and we avoid setting the
439
  // delete callback.
440
4
  UnicityTable<FontInfo> all_fonts;
441
442
  // Create the universal ID table.
443
4
  CollectFonts(get_fontinfo_table(), &all_fonts);
444
4
  for (auto &sub_lang : sub_langs_) {
445
0
    CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
446
0
  }
447
  // Assign ids from the table to each font table.
448
4
  AssignIds(all_fonts, &get_fontinfo_table());
449
4
  for (auto &sub_lang : sub_langs_) {
450
0
    AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
451
0
  }
452
4
  font_table_size_ = all_fonts.size();
453
4
}
454
455
#endif // ndef DISABLED_LEGACY_ENGINE
456
457
0
void Tesseract::end_tesseract() {
458
0
  end_recog();
459
0
}
460
461
/* Define command type identifiers */
462
463
enum CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, ACTION_2_CMD_EVENT };
464
} // namespace tesseract