/src/tesseract/src/ccmain/tessedit.cpp

Source (jump to first uncovered line)
/**********************************************************************
 * File:        tessedit.cpp  (Formerly tessedit.c)
 * Description: (Previously) Main program for merge of tess and editor.
 *              Now just code to load the language model and various
 *              engine-specific data files.
 * Author:      Ray Smith
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#  include "config_auto.h"
#endif

#include "control.h"
#include "matchdefs.h"
#include "pageres.h"
#include "params.h"
#include "stopper.h"
#include "tesseractclass.h"
#include "tessvars.h"
#include "tprintf.h"
#ifndef DISABLED_LEGACY_ENGINE
#  include "chop.h"
#  include "intmatcher.h"
#  include "reject.h"
#endif
#include "lstmrecognizer.h"

namespace tesseract {

// Loads parameter settings from the config file called filename.
// Candidate locations are probed in this order:
//   1. <datadir>configs/<filename>
//   2. <datadir>tessconfigs/<filename>
//   3. filename itself, used verbatim as a relative or absolute path.
// The first candidate that can be opened for reading wins; the third one is
// used without probing. constraint is forwarded unchanged to
// ParamUtils::ReadParamsFile together with this object's params.
void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
  // True if the candidate can be opened for reading. The handle is closed
  // again straight away because only the existence check is needed here.
  auto is_readable = [](const std::string &candidate) {
    FILE *probe = fopen(candidate.c_str(), "rb");
    if (probe == nullptr) {
      return false;
    }
    fclose(probe);
    return true;
  };

  const std::string data_root = datadir;
  std::string path = data_root + "configs/" + filename;
  if (!is_readable(path)) {
    path = data_root + "tessconfigs/" + filename;
    if (!is_readable(path)) {
      // Neither standard location has it: treat the name as a plain path.
      path = filename;
    }
  }
  ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
}

// Loads the language-specific data for one language into this object.
//
// Returns false if:
//   - the [lang].traineddata file could not be opened through mgr,
//   - the legacy engine is needed but its unicharset is missing or invalid,
//   - the unicharset holds more than MAX_NUM_CLASSES entries, or
//   - a params model component is present but fails to load.
// Returns true otherwise, including the early exit taken when
// tessedit_init_config_only is set.
//
// language is the language code to load; an empty string selects "eng".
// mgr is initialized here from <datadir><lang>.<kTrainedDataSuffix> unless it
// is already loaded. This function does not clear mgr.
//
// Engine mode (legacy-enabled builds): when oem is OEM_DEFAULT,
// tessedit_ocr_engine_mode is first derived from what the traineddata
// provides (legacy only, LSTM only, or both) and may then be overridden by
// the language config stored in [lang].traineddata, by the config files in
// configs, or by vars_vec. Any other oem value is applied after all of those
// and therefore wins. With DISABLED_LEGACY_ENGINE the mode is set to
// OEM_LSTM_ONLY before the configs are read.
//
// configs/configs_size name config files to read (see read_config_file).
// vars_vec/vars_values are optional parallel vectors of parameter names and
// values; they are only used when both are non-null.
// If set_only_non_debug_params is true, config files and vars_vec are applied
// with SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY.
// arg0 is not used by this function.
bool Tesseract::init_tesseract_lang_data(const std::string &arg0,
                                         const std::string &language, OcrEngineMode oem,
                                         char **configs, int configs_size,
                                         const std::vector<std::string> *vars_vec,
                                         const std::vector<std::string> *vars_values,
                                         bool set_only_non_debug_params, TessdataManager *mgr) {
  // Set the language data path prefix
  lang = !language.empty() ? language : "eng";
  language_data_path_prefix = datadir;
  language_data_path_prefix += lang;
  language_data_path_prefix += ".";

  // Initialize TessdataManager.
  std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
    tprintf("Error opening data file %s\n", tessdata_path.c_str());
    tprintf(
        "Please make sure the TESSDATA_PREFIX environment variable is set"
        " to your \"tessdata\" directory.\n");
    return false;
  }
#ifdef DISABLED_LEGACY_ENGINE
  tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
#else
  if (oem == OEM_DEFAULT) {
    // Set the engine mode from availability, which can then be overridden by
    // the config file when we read it below.
    if (!mgr->IsLSTMAvailable()) {
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
    } else if (!mgr->IsBaseAvailable()) {
      tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
    } else {
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE

  // If a language specific config file (lang.config) exists, load it in.
  TFile fp;
  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
    ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());
  }

  SetParamConstraint set_params_constraint =
      set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
  // Load tesseract variables from config files. This is done after loading
  // language-specific variables from [lang].traineddata file, so that custom
  // config files can override values in [lang].traineddata file.
  for (int i = 0; i < configs_size; ++i) {
    read_config_file(configs[i], set_params_constraint);
  }

  // Set params specified in vars_vec (done after setting params from config
  // files, so that params in vars_vec can override those from files).
  if (vars_vec != nullptr && vars_values != nullptr) {
    // The two vectors are meant to be parallel. Never index past the shorter
    // one: a name without a value (or the reverse) is reported and ignored
    // instead of being read out of bounds.
    const unsigned num_names = static_cast<unsigned>(vars_vec->size());
    const unsigned num_values = static_cast<unsigned>(vars_values->size());
    const unsigned num_vars = num_names < num_values ? num_names : num_values;
    if (num_names != num_values) {
      tprintf("Warning: %u parameter names but %u values given; ignoring unmatched entries.\n",
              num_names, num_values);
    }
    for (unsigned i = 0; i < num_vars; ++i) {
      if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
                                set_params_constraint, this->params())) {
        tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
      }
    }
  }

  // Optionally dump the resulting parameter set to a file.
  if (!tessedit_write_params_to_file.empty()) {
    FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
    if (params_file != nullptr) {
      ParamUtils::PrintParams(params_file, this->params());
      fclose(params_file);
    } else {
      tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
    }
  }

#ifndef DISABLED_LEGACY_ENGINE
  // Determine which ocr engine(s) should be loaded and used for recognition.
  // An explicit oem is applied last so that it overrides every config source.
  if (oem != OEM_DEFAULT) {
    tessedit_ocr_engine_mode.set_value(oem);
  }
#endif

  // If we are only loading the config file (and so not planning on doing any
  // recognition) then there's nothing else do here.
  if (tessedit_init_config_only) {
    return true;
  }

// The various OcrEngineMode settings (see tesseract/publictypes.h) determine
// which engine-specific data files need to be loaded. If LSTM_ONLY is
// requested, the base Tesseract files are *Not* required.
#ifdef DISABLED_LEGACY_ENGINE
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
#else
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
#endif // ndef DISABLED_LEGACY_ENGINE
    if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
      lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
      ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
    } else {
      // No LSTM component in the traineddata: fall back to the legacy engine.
      tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
      tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
    }
  }

  // Load the unicharset
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
    // Avoid requiring a unicharset when we aren't running base tesseract.
    unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
  }
#ifndef DISABLED_LEGACY_ENGINE
  else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
    tprintf(
        "Error: Tesseract (legacy) engine requested, but components are "
        "not present in %s!!\n",
        tessdata_path.c_str());
    return false;
  }
#endif // ndef DISABLED_LEGACY_ENGINE
  if (unicharset.size() > MAX_NUM_CLASSES) {
    tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
    return false;
  }
  right_to_left_ = unicharset.major_right_to_left();

#ifndef DISABLED_LEGACY_ENGINE

  // Setup initial unichar ambigs table and read universal ambigs.
  // encoder_unicharset is a snapshot taken before the ambigs are loaded; the
  // loaders below receive both the snapshot and the live unicharset.
  UNICHARSET encoder_unicharset;
  encoder_unicharset.CopyFrom(unicharset);
  unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);

  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
    unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
                                     use_ambigs_for_adaption, &unicharset);
  }

  // Init ParamsModel.
  // Load pass1 and pass2 weights (for now these two sets are the same, but in
  // the future separate sets of weights can be generated).
  for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
    language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
    if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
      if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
        return false;
      }
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE

  return true;
}

// Helper: reports whether str occurs (exact, case-sensitive match) anywhere
// in str_list. An empty list never contains anything.
static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
  auto it = str_list.begin();
  const auto last = str_list.end();
  // Advance until a match is found or the list is exhausted.
  while (it != last && *it != str) {
    ++it;
  }
  return it != last;
}

// Parse a string of the form [~]<lang>[+[~]<lang>]*.
// Langs with no prefix get appended to to_load, provided they
// are not in there already.
// Langs with ~ prefix get appended to not_to_load, provided they are not in
// there already.
// If the current value of the lang member contains a '/', everything up to
// and including the last '/' is prepended to each parsed language code.
// Empty language codes (as produced by a trailing '+', repeated '+' or a
// bare '~') are skipped rather than added to either list.
// Parsing stops at the first embedded NUL character in lang_str, if any.
void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
                                    std::vector<std::string> *not_to_load) {
  // Look whether the model file uses a prefix which must be applied to
  // included model files as well.
  std::string prefix;
  const size_t last_slash = lang.find_last_of('/');
  if (last_slash != std::string::npos) {
    // A prefix was found.
    prefix = lang.substr(0, last_slash + 1);
  }

  // Only the part before an embedded NUL is considered (C-string semantics).
  size_t length = lang_str.find('\0');
  if (length == std::string::npos) {
    length = lang_str.size();
  }

  size_t pos = 0;
  while (pos < length) {
    // Skip separators between (or in front of) language codes.
    if (lang_str[pos] == '+') {
      ++pos;
      continue;
    }
    // A single leading '~' redirects the code to the exclusion list.
    std::vector<std::string> *target = to_load;
    if (lang_str[pos] == '~') {
      target = not_to_load;
      ++pos;
    }
    // The code runs up to the next '+' or the end of the string.
    size_t end = lang_str.find('+', pos);
    if (end == std::string::npos || end > length) {
      end = length;
    }
    if (end > pos) {
      const std::string lang_code = prefix + lang_str.substr(pos, end - pos);
      // Check whether lang_code is already in the target vector and add.
      if (!IsStrInList(lang_code, *target)) {
        target->push_back(lang_code);
      }
    }
    pos = end;
  }
}

// Initialize for potentially a set of languages defined by the language
// string and recursively any additional languages required by any language
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
// See init_tesseract_internal for args.
int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,
                              const std::string &language, OcrEngineMode oem, char **configs,
                              int configs_size, const std::vector<std::string> *vars_vec,
                              const std::vector<std::string> *vars_values,
                              bool set_only_non_debug_params, TessdataManager *mgr) {
  std::vector<std::string> langs_to_load;
  std::vector<std::string> langs_not_to_load;
  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);

  for (auto *lang : sub_langs_) {
    delete lang;
  }

  // Set the basename, compute the data directory.
  main_setup(arg0, textbase);

  sub_langs_.clear();
  // Find the first loadable lang and load into this.
  // Add any languages that this language requires
  bool loaded_primary = false;
  // Load the rest into sub_langs_.
  // WARNING: A range based for loop does not work here because langs_to_load
  // might be changed in the loop when a new submodel is found.
  for (size_t lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
    auto &lang_to_load = langs_to_load[lang_index];
    if (!IsStrInList(lang_to_load, langs_not_to_load)) {
      const char *lang_str = lang_to_load.c_str();
      Tesseract *tess_to_init;
      if (!loaded_primary) {
        tess_to_init = this;
      } else {
        tess_to_init = new Tesseract;
        tess_to_init->main_setup(arg0, textbase);
      }

      int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
                                                         configs_size, vars_vec, vars_values,
                                                         set_only_non_debug_params, mgr);
      // Forget that language, but keep any reader we were given.
      mgr->Clear();

      if (!loaded_primary) {
        if (result < 0) {
          tprintf("Failed loading language '%s'\n", lang_str);
        } else {
          ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
                              &langs_not_to_load);
          loaded_primary = true;
        }
      } else {
        if (result < 0) {
          tprintf("Failed loading language '%s'\n", lang_str);
          delete tess_to_init;
        } else {
          sub_langs_.push_back(tess_to_init);
          // Add any languages that this language requires
          ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
                              &langs_not_to_load);
        }
      }
    }
  }
  if (!loaded_primary && !langs_to_load.empty()) {
    tprintf("Tesseract couldn't load any languages!\n");
    return -1; // Couldn't load any language!
  }
#ifndef DISABLED_LEGACY_ENGINE
  if (!sub_langs_.empty()) {
    // In multilingual mode word ratings have to be directly comparable,
    // so use the same language model weights for all languages:
    // use the primary language's params model if
    // tessedit_use_primary_params_model is set,
    // otherwise use default language model weights.
    if (tessedit_use_primary_params_model) {
      for (auto &sub_lang : sub_langs_) {
        sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
      }
      tprintf("Using params model of the primary language\n");
    } else {
      this->language_model_->getParamsModel().Clear();
      for (auto &sub_lang : sub_langs_) {
        sub_lang->language_model_->getParamsModel().Clear();
      }
    }
  }

  SetupUniversalFontIds();
#endif // ndef DISABLED_LEGACY_ENGINE
  return 0;
}

// Common initialization for a single language.
// arg0 is the datapath for the tessdata directory, which could be the
// path of the tessdata directory with no trailing /, or (if tessdata
// lives in the same directory as the executable, the path of the executable,
// hence the name arg0.
// textbase is an optional output file basename (used only for training)
// language is the language code to load.
// oem controls which engine(s) will operate on the image
// configs (argv) is an array of config filenames to load variables from.
// May be nullptr.
// configs_size (argc) is the number of elements in configs.
// vars_vec is an optional vector of variables to set.
// vars_values is an optional corresponding vector of values for the variables
// in vars_vec.
// If set_only_non_debug_params is true, only params that do not contain
// "debug" in the name will be set.
int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,
                                       const std::string &language, OcrEngineMode oem,
                                       char **configs, int configs_size,
                                       const std::vector<std::string> *vars_vec,
                                       const std::vector<std::string> *vars_values,
                                       bool set_only_non_debug_params, TessdataManager *mgr) {
  if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,
                                vars_values, set_only_non_debug_params, mgr)) {
    return -1;
  }
  if (tessedit_init_config_only) {
    return 0;
  }
  // If only LSTM will be used, skip loading Tesseract classifier's
  // pre-trained templates and dictionary.
  bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
  program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
  return 0; // Normal exit
}

#ifndef DISABLED_LEGACY_ENGINE

// Helper: appends every entry of new_fonts, in index order, to all_fonts.
// Duplicates are left to all_fonts itself, since UnicityTable uniques on
// insertion.
static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,
                         UnicityTable<FontInfo> *all_fonts) {
  int font_idx = 0;
  while (font_idx < new_fonts.size()) {
    all_fonts->push_back(new_fonts.at(font_idx));
    ++font_idx;
  }
}

// Helper: stamps each font in lang_fonts with its position in all_fonts,
// storing that index in the font's universal_id member.
static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {
  int font_idx = 0;
  while (font_idx < lang_fonts->size()) {
    // Look the font up first, then write the result back into the same entry.
    auto &font = lang_fonts->at(font_idx);
    auto universal_index = all_fonts.get_index(font);
    font.universal_id = universal_index;
    ++font_idx;
  }
}

// Set the universal_id member of each font to be unique among all
// instances of the same font loaded.
void Tesseract::SetupUniversalFontIds() {
  // Note that we can get away with bitwise copying FontInfo in
  // all_fonts, as it is a temporary structure and we avoid setting the
  // delete callback.
  UnicityTable<FontInfo> all_fonts;

  // Create the universal ID table.
  CollectFonts(get_fontinfo_table(), &all_fonts);
  for (auto &sub_lang : sub_langs_) {
    CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
  }
  // Assign ids from the table to each font table.
  AssignIds(all_fonts, &get_fontinfo_table());
  for (auto &sub_lang : sub_langs_) {
    AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
  }
  font_table_size_ = all_fonts.size();
}

#endif // ndef DISABLED_LEGACY_ENGINE

// Shuts this object down by delegating to end_recog().
// NOTE(review): what end_recog() releases is not visible in this file;
// confirm against its definition before relying on specific cleanup.
void Tesseract::end_tesseract() {
  end_recog();
}

/* Define command type identifiers */
// The enumerators carry no explicit values, so they are numbered
// consecutively from 0 in declaration order.
enum CMD_EVENTS {
  ACTION_1_CMD_EVENT,
  RECOG_WERDS,
  RECOG_PSEUDO,
  ACTION_2_CMD_EVENT
};
} // namespace tesseract

Coverage Report

Created: 2025-06-13 07:15

Line	Count	Source (jump to first uncovered line)
1		/**********************************************************************
2		* File: tessedit.cpp (Formerly tessedit.c)
3		* Description: (Previously) Main program for merge of tess and editor.
4		* Now just code to load the language model and various
5		* engine-specific data files.
6		* Author: Ray Smith
7		*
8		* (C) Copyright 1992, Hewlett-Packard Ltd.
9		** Licensed under the Apache License, Version 2.0 (the "License");
10		** you may not use this file except in compliance with the License.
11		** You may obtain a copy of the License at
12		** http://www.apache.org/licenses/LICENSE-2.0
13		** Unless required by applicable law or agreed to in writing, software
14		** distributed under the License is distributed on an "AS IS" BASIS,
15		** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16		** See the License for the specific language governing permissions and
17		** limitations under the License.
18		*
19		**********************************************************************/
20
21		// Include automatically generated configuration file if running autoconf.
22		#ifdef HAVE_CONFIG_H
23		# include "config_auto.h"
24		#endif
25
26		#include "control.h"
27		#include "matchdefs.h"
28		#include "pageres.h"
29		#include "params.h"
30		#include "stopper.h"
31		#include "tesseractclass.h"
32		#include "tessvars.h"
33		#include "tprintf.h"
34		#ifndef DISABLED_LEGACY_ENGINE
35		# include "chop.h"
36		# include "intmatcher.h"
37		# include "reject.h"
38		#endif
39		#include "lstmrecognizer.h"
40
41		namespace tesseract {
42
43		// Read a "config" file containing a set of variable, value pairs.
44		// Searches the standard places: tessdata/configs, tessdata/tessconfigs
45		// and also accepts a relative or absolute path name.
46	0	void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
47	0	std::string path = datadir;
48	0	path += "configs/";
49	0	path += filename;
50	0	FILE *fp;
51	0	if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
52	0	fclose(fp);
53	0	} else {
54	0	path = datadir;
55	0	path += "tessconfigs/";
56	0	path += filename;
57	0	if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
58	0	fclose(fp);
59	0	} else {
60	0	path = filename;
61	0	}
62	0	}
63	0	ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
64	0	}
65
66		// Returns false if a unicharset file for the specified language was not found
67		// or was invalid.
68		// This function initializes TessdataManager. After TessdataManager is
69		// no longer needed, TessdataManager::End() should be called.
70		//
71		// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
72		// it is OEM_DEFAULT, in which case the value of the variable will be obtained
73		// from the language-specific config file (stored in [lang].traineddata), from
74		// the config files specified on the command line or left as the default
75		// OEM_TESSERACT_ONLY if none of the configs specify this variable.
76		bool Tesseract::init_tesseract_lang_data(const std::string &arg0,
77		const std::string &language, OcrEngineMode oem,
78		char **configs, int configs_size,
79		const std::vector<std::string> *vars_vec,
80		const std::vector<std::string> *vars_values,
81	4	bool set_only_non_debug_params, TessdataManager *mgr) {
82		// Set the language data path prefix
83	4	lang = !language.empty() ? language : "eng";
84	4	language_data_path_prefix = datadir;
85	4	language_data_path_prefix += lang;
86	4	language_data_path_prefix += ".";
87
88		// Initialize TessdataManager.
89	4	std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
90	4	if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
91	0	tprintf("Error opening data file %s\n", tessdata_path.c_str());
92	0	tprintf(
93	0	"Please make sure the TESSDATA_PREFIX environment variable is set"
94	0	" to your \"tessdata\" directory.\n");
95	0	return false;
96	0	}
97		#ifdef DISABLED_LEGACY_ENGINE
98		tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
99		#else
100	4	if (oem == OEM_DEFAULT) {
101		// Set the engine mode from availability, which can then be overridden by
102		// the config file when we read it below.
103	4	if (!mgr->IsLSTMAvailable()) {
104	0	tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
105	4	} else if (!mgr->IsBaseAvailable()) {
106	0	tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
107	4	} else {
108	4	tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
109	4	}
110	4	}
111	4	#endif // ndef DISABLED_LEGACY_ENGINE
112
113		// If a language specific config file (lang.config) exists, load it in.
114	4	TFile fp;
115	4	if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
116	0	ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());
117	0	}
118
119	4	SetParamConstraint set_params_constraint =
120	4	set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
121		// Load tesseract variables from config files. This is done after loading
122		// language-specific variables from [lang].traineddata file, so that custom
123		// config files can override values in [lang].traineddata file.
124	4	for (int i = 0; i < configs_size; ++i) {
125	0	read_config_file(configs[i], set_params_constraint);
126	0	}
127
128		// Set params specified in vars_vec (done after setting params from config
129		// files, so that params in vars_vec can override those from files).
130	4	if (vars_vec != nullptr && vars_values != nullptr) {
131	0	for (unsigned i = 0; i < vars_vec->size(); ++i) {
132	0	if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
133	0	set_params_constraint, this->params())) {
134	0	tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
135	0	}
136	0	}
137	0	}
138
139	4	if (!tessedit_write_params_to_file.empty()) {
140	0	FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
141	0	if (params_file != nullptr) {
142	0	ParamUtils::PrintParams(params_file, this->params());
143	0	fclose(params_file);
144	0	} else {
145	0	tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
146	0	}
147	0	}
148
149	4	#ifndef DISABLED_LEGACY_ENGINE
150		// Determine which ocr engine(s) should be loaded and used for recognition.
151	4	if (oem != OEM_DEFAULT) {
152	0	tessedit_ocr_engine_mode.set_value(oem);
153	0	}
154	4	#endif
155
156		// If we are only loading the config file (and so not planning on doing any
157		// recognition) then there's nothing else do here.
158	4	if (tessedit_init_config_only) {
159	0	return true;
160	0	}
161
162		// The various OcrEngineMode settings (see tesseract/publictypes.h) determine
163		// which engine-specific data files need to be loaded. If LSTM_ONLY is
164		// requested, the base Tesseract files are *Not* required.
165		#ifdef DISABLED_LEGACY_ENGINE
166		if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
167		#else
168	4	if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
169	4	tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
170	4	#endif // ndef DISABLED_LEGACY_ENGINE
171	4	if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
172	4	lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
173	4	ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
174	4	} else {
175	0	tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
176	0	tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
177	0	}
178	4	}
179
180		// Load the unicharset
181	4	if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
182		// Avoid requiring a unicharset when we aren't running base tesseract.
183	0	unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
184	0	}
185	4	#ifndef DISABLED_LEGACY_ENGINE
186	4	else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
187	0	tprintf(
188	0	"Error: Tesseract (legacy) engine requested, but components are "
189	0	"not present in %s!!\n",
190	0	tessdata_path.c_str());
191	0	return false;
192	0	}
193	4	#endif // ndef DISABLED_LEGACY_ENGINE
194	4	if (unicharset.size() > MAX_NUM_CLASSES) {
195	0	tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
196	0	return false;
197	0	}
198	4	right_to_left_ = unicharset.major_right_to_left();
199
200	4	#ifndef DISABLED_LEGACY_ENGINE
201
202		// Setup initial unichar ambigs table and read universal ambigs.
203	4	UNICHARSET encoder_unicharset;
204	4	encoder_unicharset.CopyFrom(unicharset);
205	4	unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
206	4	unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
207
208	4	if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
209	4	unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
210	4	use_ambigs_for_adaption, &unicharset);
211	4	}
212
213		// Init ParamsModel.
214		// Load pass1 and pass2 weights (for now these two sets are the same, but in
215		// the future separate sets of weights can be generated).
216	12	for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
217	8	language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
218	8	if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
219	0	if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
220	0	return false;
221	0	}
222	0	}
223	8	}
224	4	#endif // ndef DISABLED_LEGACY_ENGINE
225
226	4	return true;
227	4	}
228
229		// Helper returns true if the given string is in the vector of strings.
230	8	static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
231	8	for (const auto &i : str_list) {
232	0	if (i == str) {
233	0	return true;
234	0	}
235	0	}
236	8	return false;
237	8	}
238
239		// Parse a string of the form [~]<lang>[+[~]<lang>]*.
240		// Langs with no prefix get appended to to_load, provided they
241		// are not in there already.
242		// Langs with ~ prefix get appended to not_to_load, provided they are not in
243		// there already.
244		void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
245	8	std::vector<std::string> *not_to_load) {
246	8	std::string remains(lang_str);
247		// Look whether the model file uses a prefix which must be applied to
248		// included model files as well.
249	8	std::string prefix;
250	8	size_t found = lang.find_last_of('/');
251	8	if (found != std::string::npos) {
252		// A prefix was found.
253	0	prefix = lang.substr(0, found + 1);
254	0	}
255	12	while (!remains.empty()) {
256		// Find the start of the lang code and which vector to add to.
257	4	const char *start = remains.c_str();
258	4	while (*start == '+') {
259	0	++start;
260	0	}
261	4	std::vector<std::string> *target = to_load;
262	4	if (*start == '~') {
263	0	target = not_to_load;
264	0	++start;
265	0	}
266		// Find the index of the end of the lang code in string start.
267	4	int end = strlen(start);
268	4	const char *plus = strchr(start, '+');
269	4	if (plus != nullptr && plus - start < end) {
270	0	end = plus - start;
271	0	}
272	4	std::string lang_code(start);
273	4	lang_code.resize(end);
274	4	std::string next(start + end);
275	4	remains = std::move(next);
276	4	lang_code = prefix + lang_code;
277		// Check whether lang_code is already in the target vector and add.
278	4	if (!IsStrInList(lang_code, *target)) {
279	4	target->push_back(lang_code);
280	4	}
281	4	}
282	8	}
283
284		// Initialize for potentially a set of languages defined by the language
285		// string and recursively any additional languages required by any language
286		// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
287		// See init_tesseract_internal for args.
288		int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,
289		const std::string &language, OcrEngineMode oem, char **configs,
290		int configs_size, const std::vector<std::string> *vars_vec,
291		const std::vector<std::string> *vars_values,
292	4	bool set_only_non_debug_params, TessdataManager *mgr) {
293	4	std::vector<std::string> langs_to_load;
294	4	std::vector<std::string> langs_not_to_load;
295	4	ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
296
297	4	for (auto *lang : sub_langs_) {
298	0	delete lang;
299	0	}
300
301		// Set the basename, compute the data directory.
302	4	main_setup(arg0, textbase);
303
304	4	sub_langs_.clear();
305		// Find the first loadable lang and load into this.
306		// Add any languages that this language requires
307	4	bool loaded_primary = false;
308		// Load the rest into sub_langs_.
309		// WARNING: A range based for loop does not work here because langs_to_load
310		// might be changed in the loop when a new submodel is found.
311	8	for (size_t lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
312	4	auto &lang_to_load = langs_to_load[lang_index];
313	4	if (!IsStrInList(lang_to_load, langs_not_to_load)) {
314	4	const char *lang_str = lang_to_load.c_str();
315	4	Tesseract *tess_to_init;
316	4	if (!loaded_primary) {
317	4	tess_to_init = this;
318	4	} else {
319	0	tess_to_init = new Tesseract;
320	0	tess_to_init->main_setup(arg0, textbase);
321	0	}
322
323	4	int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
324	4	configs_size, vars_vec, vars_values,
325	4	set_only_non_debug_params, mgr);
326		// Forget that language, but keep any reader we were given.
327	4	mgr->Clear();
328
329	4	if (!loaded_primary) {
330	4	if (result < 0) {
331	0	tprintf("Failed loading language '%s'\n", lang_str);
332	4	} else {
333	4	ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
334	4	&langs_not_to_load);
335	4	loaded_primary = true;
336	4	}
337	4	} else {
338	0	if (result < 0) {
339	0	tprintf("Failed loading language '%s'\n", lang_str);
340	0	delete tess_to_init;
341	0	} else {
342	0	sub_langs_.push_back(tess_to_init);
343		// Add any languages that this language requires
344	0	ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
345	0	&langs_not_to_load);
346	0	}
347	0	}
348	4	}
349	4	}
350	4	if (!loaded_primary && !langs_to_load.empty()) {
351	0	tprintf("Tesseract couldn't load any languages!\n");
352	0	return -1; // Couldn't load any language!
353	0	}
354	4	#ifndef DISABLED_LEGACY_ENGINE
355	4	if (!sub_langs_.empty()) {
356		// In multilingual mode word ratings have to be directly comparable,
357		// so use the same language model weights for all languages:
358		// use the primary language's params model if
359		// tessedit_use_primary_params_model is set,
360		// otherwise use default language model weights.
361	0	if (tessedit_use_primary_params_model) {
362	0	for (auto &sub_lang : sub_langs_) {
363	0	sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
364	0	}
365	0	tprintf("Using params model of the primary language\n");
366	0	} else {
367	0	this->language_model_->getParamsModel().Clear();
368	0	for (auto &sub_lang : sub_langs_) {
369	0	sub_lang->language_model_->getParamsModel().Clear();
370	0	}
371	0	}
372	0	}
373
374	4	SetupUniversalFontIds();
375	4	#endif // ndef DISABLED_LEGACY_ENGINE
376	4	return 0;
377	4	}
378
379		// Common initialization for a single language.
380		// arg0 is the datapath for the tessdata directory, which could be the
381		// path of the tessdata directory with no trailing /, or (if tessdata
382		// lives in the same directory as the executable, the path of the executable,
383		// hence the name arg0.
384		// textbase is an optional output file basename (used only for training)
385		// language is the language code to load.
386		// oem controls which engine(s) will operate on the image
387		// configs (argv) is an array of config filenames to load variables from.
388		// May be nullptr.
389		// configs_size (argc) is the number of elements in configs.
390		// vars_vec is an optional vector of variables to set.
391		// vars_values is an optional corresponding vector of values for the variables
392		// in vars_vec.
393		// If set_only_non_debug_params is true, only params that do not contain
394		// "debug" in the name will be set.
395		int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,
396		const std::string &language, OcrEngineMode oem,
397		char **configs, int configs_size,
398		const std::vector<std::string> *vars_vec,
399		const std::vector<std::string> *vars_values,
400	4	bool set_only_non_debug_params, TessdataManager *mgr) {
401	4	if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,
402	4	vars_values, set_only_non_debug_params, mgr)) {
403	0	return -1;
404	0	}
405	4	if (tessedit_init_config_only) {
406	0	return 0;
407	0	}
408		// If only LSTM will be used, skip loading Tesseract classifier's
409		// pre-trained templates and dictionary.
410	4	bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
411	4	program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
412	4	return 0; // Normal exit
413	4	}
414
415		#ifndef DISABLED_LEGACY_ENGINE
416
417		// Helper builds the all_fonts table by adding new fonts from new_fonts.
418		static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,
419	4	UnicityTable<FontInfo> *all_fonts) {
420	1.71k	for (int i = 0; i < new_fonts.size(); ++i) {
421		// UnicityTable uniques as we go.
422	1.70k	all_fonts->push_back(new_fonts.at(i));
423	1.70k	}
424	4	}
425
426		// Helper assigns an id to lang_fonts using the index in all_fonts table.
427	4	static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {
428	1.71k	for (int i = 0; i < lang_fonts->size(); ++i) {
429	1.70k	auto index = all_fonts.get_index(lang_fonts->at(i));
430	1.70k	lang_fonts->at(i).universal_id = index;
431	1.70k	}
432	4	}
433
434		// Set the universal_id member of each font to be unique among all
435		// instances of the same font loaded.
436	4	void Tesseract::SetupUniversalFontIds() {
437		// Note that we can get away with bitwise copying FontInfo in
438		// all_fonts, as it is a temporary structure and we avoid setting the
439		// delete callback.
440	4	UnicityTable<FontInfo> all_fonts;
441
442		// Create the universal ID table.
443	4	CollectFonts(get_fontinfo_table(), &all_fonts);
444	4	for (auto &sub_lang : sub_langs_) {
445	0	CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
446	0	}
447		// Assign ids from the table to each font table.
448	4	AssignIds(all_fonts, &get_fontinfo_table());
449	4	for (auto &sub_lang : sub_langs_) {
450	0	AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
451	0	}
452	4	font_table_size_ = all_fonts.size();
453	4	}
454
455		#endif // ndef DISABLED_LEGACY_ENGINE
456
457	0	void Tesseract::end_tesseract() {
458	0	end_recog();
459	0	}
460
/* Define command type identifiers */

enum CMD_EVENTS {
  ACTION_1_CMD_EVENT, // = 0
  RECOG_WERDS,        // = 1
  RECOG_PSEUDO,       // = 2
  ACTION_2_CMD_EVENT  // = 3
};
464		} // namespace tesseract