/src/tesseract/src/ccmain/tessedit.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /**********************************************************************  | 
2  |  |  * File:        tessedit.cpp  (Formerly tessedit.c)  | 
3  |  |  * Description: (Previously) Main program for merge of tess and editor.  | 
4  |  |  *              Now just code to load the language model and various  | 
5  |  |  *              engine-specific data files.  | 
6  |  |  * Author:      Ray Smith  | 
7  |  |  *  | 
8  |  |  * (C) Copyright 1992, Hewlett-Packard Ltd.  | 
9  |  |  ** Licensed under the Apache License, Version 2.0 (the "License");  | 
10  |  |  ** you may not use this file except in compliance with the License.  | 
11  |  |  ** You may obtain a copy of the License at  | 
12  |  |  ** http://www.apache.org/licenses/LICENSE-2.0  | 
13  |  |  ** Unless required by applicable law or agreed to in writing, software  | 
14  |  |  ** distributed under the License is distributed on an "AS IS" BASIS,  | 
15  |  |  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
16  |  |  ** See the License for the specific language governing permissions and  | 
17  |  |  ** limitations under the License.  | 
18  |  |  *  | 
19  |  |  **********************************************************************/  | 
20  |  |  | 
21  |  | // Include automatically generated configuration file if running autoconf.  | 
22  |  | #ifdef HAVE_CONFIG_H  | 
23  |  | #  include "config_auto.h"  | 
24  |  | #endif  | 
25  |  |  | 
26  |  | #include "control.h"  | 
27  |  | #include "matchdefs.h"  | 
28  |  | #include "pageres.h"  | 
29  |  | #include "params.h"  | 
30  |  | #include "stopper.h"  | 
31  |  | #include "tesseractclass.h"  | 
32  |  | #include "tessvars.h"  | 
33  |  | #include "tprintf.h"  | 
34  |  | #ifndef DISABLED_LEGACY_ENGINE  | 
35  |  | #  include "chop.h"  | 
36  |  | #  include "intmatcher.h"  | 
37  |  | #  include "reject.h"  | 
38  |  | #endif  | 
39  |  | #include "lstmrecognizer.h"  | 
40  |  |  | 
41  |  | namespace tesseract { | 
42  |  |  | 
43  |  | // Read a "config" file containing a set of variable, value pairs.  | 
44  |  | // Searches the standard places: tessdata/configs, tessdata/tessconfigs  | 
45  |  | // and also accepts a relative or absolute path name.  | 
46  | 0  | void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) { | 
47  | 0  |   std::string path = datadir;  | 
48  | 0  |   path += "configs/";  | 
49  | 0  |   path += filename;  | 
50  | 0  |   FILE *fp;  | 
51  | 0  |   if ((fp = fopen(path.c_str(), "rb")) != nullptr) { | 
52  | 0  |     fclose(fp);  | 
53  | 0  |   } else { | 
54  | 0  |     path = datadir;  | 
55  | 0  |     path += "tessconfigs/";  | 
56  | 0  |     path += filename;  | 
57  | 0  |     if ((fp = fopen(path.c_str(), "rb")) != nullptr) { | 
58  | 0  |       fclose(fp);  | 
59  | 0  |     } else { | 
60  | 0  |       path = filename;  | 
61  | 0  |     }  | 
62  | 0  |   }  | 
63  | 0  |   ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());  | 
64  | 0  | }  | 
65  |  |  | 
66  |  | // Returns false if a unicharset file for the specified language was not found  | 
67  |  | // or was invalid.  | 
68  |  | // This function initializes TessdataManager. After TessdataManager is  | 
69  |  | // no longer needed, TessdataManager::End() should be called.  | 
70  |  | //  | 
71  |  | // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless  | 
72  |  | // it is OEM_DEFAULT, in which case the value of the variable will be obtained  | 
73  |  | // from the language-specific config file (stored in [lang].traineddata), from  | 
74  |  | // the config files specified on the command line or left as the default  | 
75  |  | // OEM_TESSERACT_ONLY if none of the configs specify this variable.  | 
76  |  | bool Tesseract::init_tesseract_lang_data(const std::string &arg0,  | 
77  |  |                                          const std::string &language, OcrEngineMode oem,  | 
78  |  |                                          char **configs, int configs_size,  | 
79  |  |                                          const std::vector<std::string> *vars_vec,  | 
80  |  |                                          const std::vector<std::string> *vars_values,  | 
81  | 4  |                                          bool set_only_non_debug_params, TessdataManager *mgr) { | 
82  |  |   // Set the language data path prefix  | 
83  | 4  |   lang = !language.empty() ? language : "eng";  | 
84  | 4  |   language_data_path_prefix = datadir;  | 
85  | 4  |   language_data_path_prefix += lang;  | 
86  | 4  |   language_data_path_prefix += ".";  | 
87  |  |  | 
88  |  |   // Initialize TessdataManager.  | 
89  | 4  |   std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;  | 
90  | 4  |   if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) { | 
91  | 0  |     tprintf("Error opening data file %s\n", tessdata_path.c_str()); | 
92  | 0  |     tprintf(  | 
93  | 0  |         "Please make sure the TESSDATA_PREFIX environment variable is set"  | 
94  | 0  |         " to your \"tessdata\" directory.\n");  | 
95  | 0  |     return false;  | 
96  | 0  |   }  | 
97  |  | #ifdef DISABLED_LEGACY_ENGINE  | 
98  |  |   tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);  | 
99  |  | #else  | 
100  | 4  |   if (oem == OEM_DEFAULT) { | 
101  |  |     // Set the engine mode from availability, which can then be overridden by  | 
102  |  |     // the config file when we read it below.  | 
103  | 4  |     if (!mgr->IsLSTMAvailable()) { | 
104  | 0  |       tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);  | 
105  | 4  |     } else if (!mgr->IsBaseAvailable()) { | 
106  | 0  |       tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);  | 
107  | 4  |     } else { | 
108  | 4  |       tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);  | 
109  | 4  |     }  | 
110  | 4  |   }  | 
111  | 4  | #endif // ndef DISABLED_LEGACY_ENGINE  | 
112  |  |  | 
113  |  |   // If a language specific config file (lang.config) exists, load it in.  | 
114  | 4  |   TFile fp;  | 
115  | 4  |   if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) { | 
116  | 0  |     ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, this->params());  | 
117  | 0  |   }  | 
118  |  |  | 
119  | 4  |   SetParamConstraint set_params_constraint =  | 
120  | 4  |       set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;  | 
121  |  |   // Load tesseract variables from config files. This is done after loading  | 
122  |  |   // language-specific variables from [lang].traineddata file, so that custom  | 
123  |  |   // config files can override values in [lang].traineddata file.  | 
124  | 4  |   for (int i = 0; i < configs_size; ++i) { | 
125  | 0  |     read_config_file(configs[i], set_params_constraint);  | 
126  | 0  |   }  | 
127  |  |  | 
128  |  |   // Set params specified in vars_vec (done after setting params from config  | 
129  |  |   // files, so that params in vars_vec can override those from files).  | 
130  | 4  |   if (vars_vec != nullptr && vars_values != nullptr) { | 
131  | 0  |     for (unsigned i = 0; i < vars_vec->size(); ++i) { | 
132  | 0  |       if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),  | 
133  | 0  |                                 set_params_constraint, this->params())) { | 
134  | 0  |         tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str()); | 
135  | 0  |       }  | 
136  | 0  |     }  | 
137  | 0  |   }  | 
138  |  |  | 
139  | 4  |   if (!tessedit_write_params_to_file.empty()) { | 
140  | 0  |     FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");  | 
141  | 0  |     if (params_file != nullptr) { | 
142  | 0  |       ParamUtils::PrintParams(params_file, this->params());  | 
143  | 0  |       fclose(params_file);  | 
144  | 0  |     } else { | 
145  | 0  |       tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str()); | 
146  | 0  |     }  | 
147  | 0  |   }  | 
148  |  |  | 
149  | 4  | #ifndef DISABLED_LEGACY_ENGINE  | 
150  |  |   // Determine which ocr engine(s) should be loaded and used for recognition.  | 
151  | 4  |   if (oem != OEM_DEFAULT) { | 
152  | 0  |     tessedit_ocr_engine_mode.set_value(oem);  | 
153  | 0  |   }  | 
154  | 4  | #endif  | 
155  |  |  | 
156  |  |   // If we are only loading the config file (and so not planning on doing any  | 
157  |  |   // recognition) then there's nothing else do here.  | 
158  | 4  |   if (tessedit_init_config_only) { | 
159  | 0  |     return true;  | 
160  | 0  |   }  | 
161  |  |  | 
162  |  | // The various OcrEngineMode settings (see tesseract/publictypes.h) determine  | 
163  |  | // which engine-specific data files need to be loaded. If LSTM_ONLY is  | 
164  |  | // requested, the base Tesseract files are *Not* required.  | 
165  |  | #ifdef DISABLED_LEGACY_ENGINE  | 
166  |  |   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { | 
167  |  | #else  | 
168  | 4  |   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||  | 
169  | 4  |       tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) { | 
170  | 4  | #endif // ndef DISABLED_LEGACY_ENGINE  | 
171  | 4  |     if (mgr->IsComponentAvailable(TESSDATA_LSTM)) { | 
172  | 4  |       lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());  | 
173  | 4  |       ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));  | 
174  | 4  |     } else { | 
175  | 0  |       tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n"); | 
176  | 0  |       tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);  | 
177  | 0  |     }  | 
178  | 4  |   }  | 
179  |  |  | 
180  |  |   // Load the unicharset  | 
181  | 4  |   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { | 
182  |  |     // Avoid requiring a unicharset when we aren't running base tesseract.  | 
183  | 0  |     unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());  | 
184  | 0  |   }  | 
185  | 4  | #ifndef DISABLED_LEGACY_ENGINE  | 
186  | 4  |   else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) { | 
187  | 0  |     tprintf(  | 
188  | 0  |         "Error: Tesseract (legacy) engine requested, but components are "  | 
189  | 0  |         "not present in %s!!\n",  | 
190  | 0  |         tessdata_path.c_str());  | 
191  | 0  |     return false;  | 
192  | 0  |   }  | 
193  | 4  | #endif // ndef DISABLED_LEGACY_ENGINE  | 
194  | 4  |   if (unicharset.size() > MAX_NUM_CLASSES) { | 
195  | 0  |     tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); | 
196  | 0  |     return false;  | 
197  | 0  |   }  | 
198  | 4  |   right_to_left_ = unicharset.major_right_to_left();  | 
199  |  |  | 
200  | 4  | #ifndef DISABLED_LEGACY_ENGINE  | 
201  |  |  | 
202  |  |   // Setup initial unichar ambigs table and read universal ambigs.  | 
203  | 4  |   UNICHARSET encoder_unicharset;  | 
204  | 4  |   encoder_unicharset.CopyFrom(unicharset);  | 
205  | 4  |   unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);  | 
206  | 4  |   unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);  | 
207  |  |  | 
208  | 4  |   if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) { | 
209  | 4  |     unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,  | 
210  | 4  |                                      use_ambigs_for_adaption, &unicharset);  | 
211  | 4  |   }  | 
212  |  |  | 
213  |  |   // Init ParamsModel.  | 
214  |  |   // Load pass1 and pass2 weights (for now these two sets are the same, but in  | 
215  |  |   // the future separate sets of weights can be generated).  | 
216  | 12  |   for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) { | 
217  | 8  |     language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));  | 
218  | 8  |     if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) { | 
219  | 0  |       if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) { | 
220  | 0  |         return false;  | 
221  | 0  |       }  | 
222  | 0  |     }  | 
223  | 8  |   }  | 
224  | 4  | #endif // ndef DISABLED_LEGACY_ENGINE  | 
225  |  |  | 
226  | 4  |   return true;  | 
227  | 4  | }  | 
228  |  |  | 
// Helper: reports whether str equals any element of str_list.
// An empty list never contains anything.
static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
  bool found = false;
  for (auto it = str_list.begin(); it != str_list.end() && !found; ++it) {
    found = (*it == str);
  }
  return found;
}
238  |  |  | 
239  |  | // Parse a string of the form [~]<lang>[+[~]<lang>]*.  | 
240  |  | // Langs with no prefix get appended to to_load, provided they  | 
241  |  | // are not in there already.  | 
242  |  | // Langs with ~ prefix get appended to not_to_load, provided they are not in  | 
243  |  | // there already.  | 
244  |  | void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,  | 
245  | 8  |                                     std::vector<std::string> *not_to_load) { | 
246  | 8  |   std::string remains(lang_str);  | 
247  |  |   // Look whether the model file uses a prefix which must be applied to  | 
248  |  |   // included model files as well.  | 
249  | 8  |   std::string prefix;  | 
250  | 8  |   size_t found = lang.find_last_of('/'); | 
251  | 8  |   if (found != std::string::npos) { | 
252  |  |     // A prefix was found.  | 
253  | 0  |     prefix = lang.substr(0, found + 1);  | 
254  | 0  |   }  | 
255  | 12  |   while (!remains.empty()) { | 
256  |  |     // Find the start of the lang code and which vector to add to.  | 
257  | 4  |     const char *start = remains.c_str();  | 
258  | 4  |     while (*start == '+') { | 
259  | 0  |       ++start;  | 
260  | 0  |     }  | 
261  | 4  |     std::vector<std::string> *target = to_load;  | 
262  | 4  |     if (*start == '~') { | 
263  | 0  |       target = not_to_load;  | 
264  | 0  |       ++start;  | 
265  | 0  |     }  | 
266  |  |     // Find the index of the end of the lang code in string start.  | 
267  | 4  |     int end = strlen(start);  | 
268  | 4  |     const char *plus = strchr(start, '+');  | 
269  | 4  |     if (plus != nullptr && plus - start < end) { | 
270  | 0  |       end = plus - start;  | 
271  | 0  |     }  | 
272  | 4  |     std::string lang_code(start);  | 
273  | 4  |     lang_code.resize(end);  | 
274  | 4  |     std::string next(start + end);  | 
275  | 4  |     remains = std::move(next);  | 
276  | 4  |     lang_code = prefix + lang_code;  | 
277  |  |     // Check whether lang_code is already in the target vector and add.  | 
278  | 4  |     if (!IsStrInList(lang_code, *target)) { | 
279  | 4  |       target->push_back(lang_code);  | 
280  | 4  |     }  | 
281  | 4  |   }  | 
282  | 8  | }  | 
283  |  |  | 
284  |  | // Initialize for potentially a set of languages defined by the language  | 
285  |  | // string and recursively any additional languages required by any language  | 
286  |  | // traineddata file (via tessedit_load_sublangs in its config) that is loaded.  | 
287  |  | // See init_tesseract_internal for args.  | 
288  |  | int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,  | 
289  |  |                               const std::string &language, OcrEngineMode oem, char **configs,  | 
290  |  |                               int configs_size, const std::vector<std::string> *vars_vec,  | 
291  |  |                               const std::vector<std::string> *vars_values,  | 
292  | 4  |                               bool set_only_non_debug_params, TessdataManager *mgr) { | 
293  | 4  |   std::vector<std::string> langs_to_load;  | 
294  | 4  |   std::vector<std::string> langs_not_to_load;  | 
295  | 4  |   ParseLanguageString(language, &langs_to_load, &langs_not_to_load);  | 
296  |  |  | 
297  | 4  |   for (auto *lang : sub_langs_) { | 
298  | 0  |     delete lang;  | 
299  | 0  |   }  | 
300  |  |  | 
301  |  |   // Set the basename, compute the data directory.  | 
302  | 4  |   main_setup(arg0, textbase);  | 
303  |  |  | 
304  | 4  |   sub_langs_.clear();  | 
305  |  |   // Find the first loadable lang and load into this.  | 
306  |  |   // Add any languages that this language requires  | 
307  | 4  |   bool loaded_primary = false;  | 
308  |  |   // Load the rest into sub_langs_.  | 
309  |  |   // WARNING: A range based for loop does not work here because langs_to_load  | 
310  |  |   // might be changed in the loop when a new submodel is found.  | 
311  | 8  |   for (size_t lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) { | 
312  | 4  |     auto &lang_to_load = langs_to_load[lang_index];  | 
313  | 4  |     if (!IsStrInList(lang_to_load, langs_not_to_load)) { | 
314  | 4  |       const char *lang_str = lang_to_load.c_str();  | 
315  | 4  |       Tesseract *tess_to_init;  | 
316  | 4  |       if (!loaded_primary) { | 
317  | 4  |         tess_to_init = this;  | 
318  | 4  |       } else { | 
319  | 0  |         tess_to_init = new Tesseract;  | 
320  | 0  |         tess_to_init->main_setup(arg0, textbase);  | 
321  | 0  |       }  | 
322  |  |  | 
323  | 4  |       int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,  | 
324  | 4  |                                                          configs_size, vars_vec, vars_values,  | 
325  | 4  |                                                          set_only_non_debug_params, mgr);  | 
326  |  |       // Forget that language, but keep any reader we were given.  | 
327  | 4  |       mgr->Clear();  | 
328  |  |  | 
329  | 4  |       if (!loaded_primary) { | 
330  | 4  |         if (result < 0) { | 
331  | 0  |           tprintf("Failed loading language '%s'\n", lang_str); | 
332  | 4  |         } else { | 
333  | 4  |           ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,  | 
334  | 4  |                               &langs_not_to_load);  | 
335  | 4  |           loaded_primary = true;  | 
336  | 4  |         }  | 
337  | 4  |       } else { | 
338  | 0  |         if (result < 0) { | 
339  | 0  |           tprintf("Failed loading language '%s'\n", lang_str); | 
340  | 0  |           delete tess_to_init;  | 
341  | 0  |         } else { | 
342  | 0  |           sub_langs_.push_back(tess_to_init);  | 
343  |  |           // Add any languages that this language requires  | 
344  | 0  |           ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,  | 
345  | 0  |                               &langs_not_to_load);  | 
346  | 0  |         }  | 
347  | 0  |       }  | 
348  | 4  |     }  | 
349  | 4  |   }  | 
350  | 4  |   if (!loaded_primary && !langs_to_load.empty()) { | 
351  | 0  |     tprintf("Tesseract couldn't load any languages!\n"); | 
352  | 0  |     return -1; // Couldn't load any language!  | 
353  | 0  |   }  | 
354  | 4  | #ifndef DISABLED_LEGACY_ENGINE  | 
355  | 4  |   if (!sub_langs_.empty()) { | 
356  |  |     // In multilingual mode word ratings have to be directly comparable,  | 
357  |  |     // so use the same language model weights for all languages:  | 
358  |  |     // use the primary language's params model if  | 
359  |  |     // tessedit_use_primary_params_model is set,  | 
360  |  |     // otherwise use default language model weights.  | 
361  | 0  |     if (tessedit_use_primary_params_model) { | 
362  | 0  |       for (auto &sub_lang : sub_langs_) { | 
363  | 0  |         sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());  | 
364  | 0  |       }  | 
365  | 0  |       tprintf("Using params model of the primary language\n"); | 
366  | 0  |     } else { | 
367  | 0  |       this->language_model_->getParamsModel().Clear();  | 
368  | 0  |       for (auto &sub_lang : sub_langs_) { | 
369  | 0  |         sub_lang->language_model_->getParamsModel().Clear();  | 
370  | 0  |       }  | 
371  | 0  |     }  | 
372  | 0  |   }  | 
373  |  |  | 
374  | 4  |   SetupUniversalFontIds();  | 
375  | 4  | #endif // ndef DISABLED_LEGACY_ENGINE  | 
376  | 4  |   return 0;  | 
377  | 4  | }  | 
378  |  |  | 
379  |  | // Common initialization for a single language.  | 
380  |  | // arg0 is the datapath for the tessdata directory, which could be the  | 
381  |  | // path of the tessdata directory with no trailing /, or (if tessdata  | 
382  |  | // lives in the same directory as the executable, the path of the executable,  | 
383  |  | // hence the name arg0.  | 
384  |  | // textbase is an optional output file basename (used only for training)  | 
385  |  | // language is the language code to load.  | 
386  |  | // oem controls which engine(s) will operate on the image  | 
387  |  | // configs (argv) is an array of config filenames to load variables from.  | 
388  |  | // May be nullptr.  | 
389  |  | // configs_size (argc) is the number of elements in configs.  | 
390  |  | // vars_vec is an optional vector of variables to set.  | 
391  |  | // vars_values is an optional corresponding vector of values for the variables  | 
392  |  | // in vars_vec.  | 
393  |  | // If set_only_non_debug_params is true, only params that do not contain  | 
394  |  | // "debug" in the name will be set.  | 
395  |  | int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,  | 
396  |  |                                        const std::string &language, OcrEngineMode oem,  | 
397  |  |                                        char **configs, int configs_size,  | 
398  |  |                                        const std::vector<std::string> *vars_vec,  | 
399  |  |                                        const std::vector<std::string> *vars_values,  | 
400  | 4  |                                        bool set_only_non_debug_params, TessdataManager *mgr) { | 
401  | 4  |   if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,  | 
402  | 4  |                                 vars_values, set_only_non_debug_params, mgr)) { | 
403  | 0  |     return -1;  | 
404  | 0  |   }  | 
405  | 4  |   if (tessedit_init_config_only) { | 
406  | 0  |     return 0;  | 
407  | 0  |   }  | 
408  |  |   // If only LSTM will be used, skip loading Tesseract classifier's  | 
409  |  |   // pre-trained templates and dictionary.  | 
410  | 4  |   bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;  | 
411  | 4  |   program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);  | 
412  | 4  |   return 0; // Normal exit  | 
413  | 4  | }  | 
414  |  |  | 
415  |  | #ifndef DISABLED_LEGACY_ENGINE  | 
416  |  |  | 
417  |  | // Helper builds the all_fonts table by adding new fonts from new_fonts.  | 
418  |  | static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,  | 
419  | 4  |                          UnicityTable<FontInfo> *all_fonts) { | 
420  | 1.71k  |   for (int i = 0; i < new_fonts.size(); ++i) { | 
421  |  |     // UnicityTable uniques as we go.  | 
422  | 1.70k  |     all_fonts->push_back(new_fonts.at(i));  | 
423  | 1.70k  |   }  | 
424  | 4  | }  | 
425  |  |  | 
426  |  | // Helper assigns an id to lang_fonts using the index in all_fonts table.  | 
427  | 4  | static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) { | 
428  | 1.71k  |   for (int i = 0; i < lang_fonts->size(); ++i) { | 
429  | 1.70k  |     auto index = all_fonts.get_index(lang_fonts->at(i));  | 
430  | 1.70k  |     lang_fonts->at(i).universal_id = index;  | 
431  | 1.70k  |   }  | 
432  | 4  | }  | 
433  |  |  | 
434  |  | // Set the universal_id member of each font to be unique among all  | 
435  |  | // instances of the same font loaded.  | 
436  | 4  | void Tesseract::SetupUniversalFontIds() { | 
437  |  |   // Note that we can get away with bitwise copying FontInfo in  | 
438  |  |   // all_fonts, as it is a temporary structure and we avoid setting the  | 
439  |  |   // delete callback.  | 
440  | 4  |   UnicityTable<FontInfo> all_fonts;  | 
441  |  |  | 
442  |  |   // Create the universal ID table.  | 
443  | 4  |   CollectFonts(get_fontinfo_table(), &all_fonts);  | 
444  | 4  |   for (auto &sub_lang : sub_langs_) { | 
445  | 0  |     CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);  | 
446  | 0  |   }  | 
447  |  |   // Assign ids from the table to each font table.  | 
448  | 4  |   AssignIds(all_fonts, &get_fontinfo_table());  | 
449  | 4  |   for (auto &sub_lang : sub_langs_) { | 
450  | 0  |     AssignIds(all_fonts, &sub_lang->get_fontinfo_table());  | 
451  | 0  |   }  | 
452  | 4  |   font_table_size_ = all_fonts.size();  | 
453  | 4  | }  | 
454  |  |  | 
455  |  | #endif // ndef DISABLED_LEGACY_ENGINE  | 
456  |  |  | 
457  | 0  | void Tesseract::end_tesseract() { | 
458  | 0  |   end_recog();  | 
459  | 0  | }  | 
460  |  |  | 
/* Command type identifiers. Values are assigned implicitly, starting at 0. */

enum CMD_EVENTS {
  ACTION_1_CMD_EVENT, // 0
  RECOG_WERDS,        // 1
  RECOG_PSEUDO,       // 2
  ACTION_2_CMD_EVENT  // 3
};
464  |  | } // namespace tesseract  |