/src/tesseract/src/api/baseapi.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: baseapi.cpp |
3 | | * Description: Simple API for calling tesseract. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 2006, Google Inc. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #define _USE_MATH_DEFINES // for M_PI |
20 | | |
21 | | // Include automatically generated configuration file if running autoconf. |
22 | | #ifdef HAVE_CONFIG_H |
23 | | # include "config_auto.h" |
24 | | #endif |
25 | | |
26 | | #include "boxword.h" // for BoxWord |
27 | | #include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST |
28 | | #include "dawg_cache.h" // for DawgCache |
29 | | #include "dict.h" // for Dict |
30 | | #include "elst.h" // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH |
31 | | #include "environ.h" // for l_uint8 |
32 | | #ifndef DISABLED_LEGACY_ENGINE |
33 | | #include "equationdetect.h" // for EquationDetect, destructor of equ_detect_ |
34 | | #endif // ndef DISABLED_LEGACY_ENGINE |
35 | | #include "errcode.h" // for ASSERT_HOST |
36 | | #include "helpers.h" // for IntCastRounded, chomp_string, copy_string |
37 | | #include "host.h" // for MAX_PATH |
38 | | #include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ... |
39 | | #ifndef DISABLED_LEGACY_ENGINE |
40 | | # include "intfx.h" // for INT_FX_RESULT_STRUCT |
41 | | #endif |
42 | | #include "mutableiterator.h" // for MutableIterator |
43 | | #include "normalis.h" // for kBlnBaselineOffset, kBlnXHeight |
44 | | #include "pageres.h" // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE... |
45 | | #include "paragraphs.h" // for DetectParagraphs |
46 | | #include "params.h" // for BoolParam, IntParam, DoubleParam, Stri... |
47 | | #include "pdblock.h" // for PDBLK |
48 | | #include "points.h" // for FCOORD |
49 | | #include "polyblk.h" // for POLY_BLOCK |
50 | | #include "rect.h" // for TBOX |
51 | | #include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST |
52 | | #include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix |
53 | | #include "tesseractclass.h" // for Tesseract |
54 | | #include "tprintf.h" // for tprintf |
55 | | #include "werd.h" // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP |
56 | | #include "thresholder.h" // for ImageThresholder |
57 | | |
58 | | #include <tesseract/baseapi.h> |
59 | | #include <tesseract/ocrclass.h> // for ETEXT_DESC |
60 | | #include <tesseract/osdetect.h> // for OSResults, OSBestResult, OrientationId... |
61 | | #include <tesseract/renderer.h> // for TessResultRenderer |
62 | | #include <tesseract/resultiterator.h> // for ResultIterator |
63 | | |
#include <algorithm>    // for std::sort
#include <cmath>        // for round, M_PI
#include <cstdint>      // for int32_t
#include <cstring>      // for strcmp, strcpy
#include <filesystem>   // for std::filesystem
#include <fstream>      // for size_t
#include <iostream>     // for std::cin
#include <locale>       // for std::locale::classic
#include <memory>       // for std::unique_ptr
#include <set>          // for std::pair
#include <sstream>      // for std::stringstream
#include <system_error> // for std::error_code
#include <vector>       // for std::vector
75 | | |
76 | | #include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box... |
77 | | #ifdef HAVE_LIBCURL |
78 | | # include <curl/curl.h> |
79 | | #endif |
80 | | |
81 | | #ifdef __linux__ |
82 | | # include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE |
83 | | #endif |
84 | | |
85 | | #if defined(_WIN32) |
86 | | # include <fcntl.h> // for _O_BINARY |
87 | | # include <io.h> // for _setmode |
88 | | #endif |
89 | | |
90 | | namespace tesseract { |
91 | | |
// Parameters that are private to this translation unit.
static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
#ifdef HAVE_LIBCURL
// The curl parameters only exist when the build defines HAVE_LIBCURL.
static INT_VAR(curl_timeout, 0, "Timeout for curl in seconds");
static STRING_VAR(curl_cookiefile, "", "File with cookie data for curl");
#endif
98 | | |
/** Minimum sensible image size to be worth running Tesseract. */
constexpr int kMinRectSize = 10;
/** Character returned when Tesseract couldn't recognize as anything. */
constexpr char kTesseractReject = '~';
/** Character used by UNLV error counter as a reject. */
constexpr char kUNLVReject = '~';
/** Character used by UNLV as a suspect marker. */
constexpr char kUNLVSuspect = '^';
/**
 * Temp file used for storing current parameters before applying retry values.
 * The pointer itself is const so the name cannot be re-pointed at run time.
 */
static const char *const kOldVarsFile = "failed_vars.txt";
111 | | |
112 | | #ifndef DISABLED_LEGACY_ENGINE |
/**
 * Filename used for input image file, from which to derive a name to search
 * for a possible UNLV zone file, if none is specified by SetInputName.
 */
static const char *kInputFile = "noname.tif";
// Default value of classify_font_name. ExtractFontName only derives a font
// name from the file name while the parameter still holds this value.
static const char kUnknownFontName[] = "UnknownFont";

static STRING_VAR(classify_font_name, kUnknownFontName,
                  "Default font name to be used in training");
122 | | |
123 | | // Finds the name of the training font and returns it in fontname, by cutting |
124 | | // it out based on the expectation that the filename is of the form: |
125 | | // /path/to/dir/[lang].[fontname].exp[num] |
126 | | // The [lang], [fontname] and [num] fields should not have '.' characters. |
127 | | // If the global parameter classify_font_name is set, its value is used instead. |
128 | 0 | static void ExtractFontName(const char* filename, std::string* fontname) { |
129 | 0 | *fontname = classify_font_name; |
130 | 0 | if (*fontname == kUnknownFontName) { |
131 | | // filename is expected to be of the form [lang].[fontname].exp[num] |
132 | | // The [lang], [fontname] and [num] fields should not have '.' characters. |
133 | 0 | const char *basename = strrchr(filename, '/'); |
134 | 0 | const char *firstdot = strchr(basename ? basename : filename, '.'); |
135 | 0 | const char *lastdot = strrchr(filename, '.'); |
136 | 0 | if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) { |
137 | 0 | ++firstdot; |
138 | 0 | *fontname = firstdot; |
139 | 0 | fontname->resize(lastdot - firstdot); |
140 | 0 | } |
141 | 0 | } |
142 | 0 | } |
143 | | #endif |
144 | | |
/* Add all available languages recursively.
 *
 * Walks datadir (following directory symlinks and skipping directories that
 * may not be read) and appends to *langs the path, relative to datadir and
 * without extension, of every entry whose extension is ".traineddata".
 *
 * Errors reported by the filesystem (for example an empty, missing or
 * non-directory datadir) end the walk instead of throwing
 * std::filesystem::filesystem_error; entries found before the error are kept.
 */
static void addAvailableLanguages(const std::string &datadir,
                                  std::vector<std::string> *langs) {
  namespace fs = std::filesystem;
  const auto options = fs::directory_options::follow_directory_symlink |
                       fs::directory_options::skip_permission_denied;
  std::error_code ec;
  fs::recursive_directory_iterator it(datadir, options, ec);
  const fs::recursive_directory_iterator end;
  // Checking ec as well as the end iterator stops the loop no matter how the
  // library represents a failed construction or increment.
  while (!ec && it != end) {
    auto path = it->path().lexically_relative(datadir);
    if (path.extension() == ".traineddata") {
      langs->push_back(path.replace_extension("").string());
    }
    it.increment(ec);
  }
}
159 | | |
160 | | TessBaseAPI::TessBaseAPI() |
161 | 2 | : tesseract_(nullptr) |
162 | 2 | , osd_tesseract_(nullptr) |
163 | 2 | , equ_detect_(nullptr) |
164 | 2 | , reader_(nullptr) |
165 | | , |
166 | | // thresholder_ is initialized to nullptr here, but will be set before use |
167 | | // by: A constructor of a derived API or created |
168 | | // implicitly when used in InternalSetImage. |
169 | 2 | thresholder_(nullptr) |
170 | 2 | , paragraph_models_(nullptr) |
171 | 2 | , block_list_(nullptr) |
172 | 2 | , page_res_(nullptr) |
173 | 2 | , last_oem_requested_(OEM_DEFAULT) |
174 | 2 | , recognition_done_(false) |
175 | 2 | , rect_left_(0) |
176 | 2 | , rect_top_(0) |
177 | 2 | , rect_width_(0) |
178 | 2 | , rect_height_(0) |
179 | 2 | , image_width_(0) |
180 | 2 | , image_height_(0) { |
181 | 2 | } |
182 | | |
/** Destroys the API object; all cleanup is delegated to End(). */
TessBaseAPI::~TessBaseAPI() {
  End();
}
186 | | |
/**
 * Returns the version identifier as a static string. Do not delete.
 * The value is the TESSERACT_VERSION_STR macro.
 */
const char *TessBaseAPI::Version() {
  return TESSERACT_VERSION_STR;
}
193 | | |
194 | | /** |
195 | | * Set the name of the input file. Needed only for training and |
196 | | * loading a UNLV zone file. |
197 | | */ |
198 | 0 | void TessBaseAPI::SetInputName(const char *name) { |
199 | 0 | input_file_ = name ? name : ""; |
200 | 0 | } |
201 | | |
202 | | /** Set the name of the output files. Needed only for debugging. */ |
203 | 0 | void TessBaseAPI::SetOutputName(const char *name) { |
204 | 0 | output_file_ = name ? name : ""; |
205 | 0 | } |
206 | | |
207 | 2 | bool TessBaseAPI::SetVariable(const char *name, const char *value) { |
208 | 2 | if (tesseract_ == nullptr) { |
209 | 0 | tesseract_ = new Tesseract; |
210 | 0 | } |
211 | 2 | return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY, |
212 | 2 | tesseract_->params()); |
213 | 2 | } |
214 | | |
215 | 0 | bool TessBaseAPI::SetDebugVariable(const char *name, const char *value) { |
216 | 0 | if (tesseract_ == nullptr) { |
217 | 0 | tesseract_ = new Tesseract; |
218 | 0 | } |
219 | 0 | return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, tesseract_->params()); |
220 | 0 | } |
221 | | |
222 | 14.6k | bool TessBaseAPI::GetIntVariable(const char *name, int *value) const { |
223 | 14.6k | auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params, |
224 | 14.6k | tesseract_->params()->int_params); |
225 | 14.6k | if (p == nullptr) { |
226 | 0 | return false; |
227 | 0 | } |
228 | 14.6k | *value = (int32_t)(*p); |
229 | 14.6k | return true; |
230 | 14.6k | } |
231 | | |
232 | 6.88k | bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const { |
233 | 6.88k | auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params, |
234 | 6.88k | tesseract_->params()->bool_params); |
235 | 6.88k | if (p == nullptr) { |
236 | 0 | return false; |
237 | 0 | } |
238 | 6.88k | *value = bool(*p); |
239 | 6.88k | return true; |
240 | 6.88k | } |
241 | | |
242 | 0 | const char *TessBaseAPI::GetStringVariable(const char *name) const { |
243 | 0 | auto *p = ParamUtils::FindParam<StringParam>(name, GlobalParams()->string_params, |
244 | 0 | tesseract_->params()->string_params); |
245 | 0 | return (p != nullptr) ? p->c_str() : nullptr; |
246 | 0 | } |
247 | | |
248 | 0 | bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const { |
249 | 0 | auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params, |
250 | 0 | tesseract_->params()->double_params); |
251 | 0 | if (p == nullptr) { |
252 | 0 | return false; |
253 | 0 | } |
254 | 0 | *value = (double)(*p); |
255 | 0 | return true; |
256 | 0 | } |
257 | | |
258 | | /** Get value of named variable as a string, if it exists. */ |
259 | 0 | bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const { |
260 | 0 | return ParamUtils::GetParamAsString(name, tesseract_->params(), val); |
261 | 0 | } |
262 | | |
263 | | #ifndef DISABLED_LEGACY_ENGINE |
264 | | |
265 | | /** Print Tesseract fonts table to the given file. */ |
266 | 0 | void TessBaseAPI::PrintFontsTable(FILE *fp) const { |
267 | 0 | const int fontinfo_size = tesseract_->get_fontinfo_table().size(); |
268 | 0 | for (int font_index = 1; font_index < fontinfo_size; ++font_index) { |
269 | 0 | FontInfo font = tesseract_->get_fontinfo_table().at(font_index); |
270 | 0 | fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s" |
271 | 0 | " is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n", |
272 | 0 | font_index, font.name, |
273 | 0 | font.is_italic() ? "true" : "false", |
274 | 0 | font.is_bold() ? "true" : "false", |
275 | 0 | font.is_fixed_pitch() ? "true" : "false", |
276 | 0 | font.is_serif() ? "true" : "false", |
277 | 0 | font.is_fraktur() ? "true" : "false"); |
278 | 0 | } |
279 | 0 | } |
280 | | |
281 | | #endif |
282 | | |
/**
 * Print Tesseract parameters to the given file.
 * Forwards to ParamUtils::PrintParams with the parameters of the current
 * Tesseract instance; tesseract_ is dereferenced without a null check here.
 */
void TessBaseAPI::PrintVariables(FILE *fp) const {
  ParamUtils::PrintParams(fp, tesseract_->params());
}
287 | | |
/**
 * The datapath must be the name of the data directory or
 * some other file in which the data directory resides (for instance argv[0].)
 * The language is (usually) an ISO 639-3 string or nullptr will default to eng.
 * If numeric_mode is true, then only digits and Roman numerals will
 * be returned.
 *
 * Forwards to the in-memory overload with a data size of 0 (which makes that
 * overload treat its first argument as a datapath) and a null FileReader.
 * @return: 0 on success and -1 on initialization failure.
 */
int TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs,
                      int configs_size, const std::vector<std::string> *vars_vec,
                      const std::vector<std::string> *vars_values, bool set_only_non_debug_params) {
  return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values,
              set_only_non_debug_params, nullptr);
}
302 | | |
303 | | // In-memory version reads the traineddata file directly from the given |
304 | | // data[data_size] array. Also implements the version with a datapath in data, |
305 | | // flagged by data_size = 0. |
306 | | int TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem, |
307 | | char **configs, int configs_size, const std::vector<std::string> *vars_vec, |
308 | | const std::vector<std::string> *vars_values, bool set_only_non_debug_params, |
309 | 2 | FileReader reader) { |
310 | 2 | if (language == nullptr) { |
311 | 0 | language = ""; |
312 | 0 | } |
313 | 2 | if (data == nullptr) { |
314 | 2 | data = ""; |
315 | 2 | } |
316 | 2 | std::string datapath = data_size == 0 ? data : language; |
317 | | // If the datapath, OcrEngineMode or the language have changed - start again. |
318 | | // Note that the language_ field stores the last requested language that was |
319 | | // initialized successfully, while tesseract_->lang stores the language |
320 | | // actually used. They differ only if the requested language was nullptr, in |
321 | | // which case tesseract_->lang is set to the Tesseract default ("eng"). |
322 | 2 | if (tesseract_ != nullptr && |
323 | 2 | (datapath_.empty() || language_.empty() || datapath_ != datapath || |
324 | 0 | last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) { |
325 | 0 | delete tesseract_; |
326 | 0 | tesseract_ = nullptr; |
327 | 0 | } |
328 | 2 | bool reset_classifier = true; |
329 | 2 | if (tesseract_ == nullptr) { |
330 | 2 | reset_classifier = false; |
331 | 2 | tesseract_ = new Tesseract; |
332 | 2 | if (reader != nullptr) { |
333 | 0 | reader_ = reader; |
334 | 0 | } |
335 | 2 | TessdataManager mgr(reader_); |
336 | 2 | if (data_size != 0) { |
337 | 0 | mgr.LoadMemBuffer(language, data, data_size); |
338 | 0 | } |
339 | 2 | if (tesseract_->init_tesseract(datapath, output_file_, language, oem, configs, |
340 | 2 | configs_size, vars_vec, vars_values, set_only_non_debug_params, |
341 | 2 | &mgr) != 0) { |
342 | 0 | return -1; |
343 | 0 | } |
344 | 2 | } |
345 | | |
346 | | // Update datapath and language requested for the last valid initialization. |
347 | 2 | datapath_ = std::move(datapath); |
348 | 2 | if (datapath_.empty() && !tesseract_->datadir.empty()) { |
349 | 2 | datapath_ = tesseract_->datadir; |
350 | 2 | } |
351 | | |
352 | 2 | language_ = language; |
353 | 2 | last_oem_requested_ = oem; |
354 | | |
355 | 2 | #ifndef DISABLED_LEGACY_ENGINE |
356 | | // For same language and datapath, just reset the adaptive classifier. |
357 | 2 | if (reset_classifier) { |
358 | 0 | tesseract_->ResetAdaptiveClassifier(); |
359 | 0 | } |
360 | 2 | #endif // ndef DISABLED_LEGACY_ENGINE |
361 | 2 | return 0; |
362 | 2 | } |
363 | | |
/**
 * Returns the languages string used in the last valid initialization.
 * If the last initialization specified "deu+hin" then that will be
 * returned. If hin loaded eng automatically as well, then that will
 * not be included in this list. To find the languages actually
 * loaded use GetLoadedLanguagesAsVector.
 * The returned string should NOT be deleted: it points into the language_
 * member of this object.
 */
const char *TessBaseAPI::GetInitLanguagesAsString() const {
  return language_.c_str();
}
375 | | |
376 | | /** |
377 | | * Returns the loaded languages in the vector of std::string. |
378 | | * Includes all languages loaded by the last Init, including those loaded |
379 | | * as dependencies of other loaded languages. |
380 | | */ |
381 | 0 | void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const { |
382 | 0 | langs->clear(); |
383 | 0 | if (tesseract_ != nullptr) { |
384 | 0 | langs->push_back(tesseract_->lang); |
385 | 0 | int num_subs = tesseract_->num_sub_langs(); |
386 | 0 | for (int i = 0; i < num_subs; ++i) { |
387 | 0 | langs->push_back(tesseract_->get_sub_lang(i)->lang); |
388 | 0 | } |
389 | 0 | } |
390 | 0 | } |
391 | | |
392 | | /** |
393 | | * Returns the available languages in the sorted vector of std::string. |
394 | | */ |
395 | 0 | void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const { |
396 | 0 | langs->clear(); |
397 | 0 | if (tesseract_ != nullptr) { |
398 | 0 | addAvailableLanguages(tesseract_->datadir, langs); |
399 | 0 | std::sort(langs->begin(), langs->end()); |
400 | 0 | } |
401 | 0 | } |
402 | | |
403 | | /** |
404 | | * Init only for page layout analysis. Use only for calls to SetImage and |
405 | | * AnalysePage. Calls that attempt recognition will generate an error. |
406 | | */ |
407 | 0 | void TessBaseAPI::InitForAnalysePage() { |
408 | 0 | if (tesseract_ == nullptr) { |
409 | 0 | tesseract_ = new Tesseract; |
410 | 0 | #ifndef DISABLED_LEGACY_ENGINE |
411 | 0 | tesseract_->InitAdaptiveClassifier(nullptr); |
412 | 0 | #endif |
413 | 0 | } |
414 | 0 | } |
415 | | |
/**
 * Read a "config" file containing a set of parameter name, value pairs.
 * Searches the standard places: tessdata/configs, tessdata/tessconfigs
 * and also accepts a relative or absolute path name.
 * Parameters are applied under SET_PARAM_CONSTRAINT_NON_INIT_ONLY.
 * tesseract_ is dereferenced without a null check here.
 */
void TessBaseAPI::ReadConfigFile(const char *filename) {
  tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY);
}
424 | | |
/**
 * Reads a config file like ReadConfigFile, but only sets debug params from
 * it (SET_PARAM_CONSTRAINT_DEBUG_ONLY).
 * tesseract_ is dereferenced without a null check here.
 */
void TessBaseAPI::ReadDebugConfigFile(const char *filename) {
  tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY);
}
429 | | |
430 | | /** |
431 | | * Set the current page segmentation mode. Defaults to PSM_AUTO. |
432 | | * The mode is stored as an IntParam so it can also be modified by |
433 | | * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). |
434 | | */ |
435 | 0 | void TessBaseAPI::SetPageSegMode(PageSegMode mode) { |
436 | 0 | if (tesseract_ == nullptr) { |
437 | 0 | tesseract_ = new Tesseract; |
438 | 0 | } |
439 | 0 | tesseract_->tessedit_pageseg_mode.set_value(mode); |
440 | 0 | } |
441 | | |
442 | | /** Return the current page segmentation mode. */ |
443 | 0 | PageSegMode TessBaseAPI::GetPageSegMode() const { |
444 | 0 | if (tesseract_ == nullptr) { |
445 | 0 | return PSM_SINGLE_BLOCK; |
446 | 0 | } |
447 | 0 | return static_cast<PageSegMode>(static_cast<int>(tesseract_->tessedit_pageseg_mode)); |
448 | 0 | } |
449 | | |
450 | | /** |
451 | | * Recognize a rectangle from an image and return the result as a string. |
452 | | * May be called many times for a single Init. |
453 | | * Currently has no error checking. |
454 | | * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. |
455 | | * Palette color images will not work properly and must be converted to |
456 | | * 24 bit. |
457 | | * Binary images of 1 bit per pixel may also be given but they must be |
458 | | * byte packed with the MSB of the first byte being the first pixel, and a |
459 | | * one pixel is WHITE. For binary images set bytes_per_pixel=0. |
460 | | * The recognized text is returned as a char* which is coded |
461 | | * as UTF8 and must be freed with the delete [] operator. |
462 | | */ |
463 | | char *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, |
464 | 0 | int bytes_per_line, int left, int top, int width, int height) { |
465 | 0 | if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) { |
466 | 0 | return nullptr; // Nothing worth doing. |
467 | 0 | } |
468 | | |
469 | | // Since this original api didn't give the exact size of the image, |
470 | | // we have to invent a reasonable value. |
471 | 0 | int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8; |
472 | 0 | SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel, |
473 | 0 | bytes_per_line); |
474 | 0 | SetRectangle(left, top, width, height); |
475 | |
|
476 | 0 | return GetUTF8Text(); |
477 | 0 | } |
478 | | |
479 | | #ifndef DISABLED_LEGACY_ENGINE |
480 | | /** |
481 | | * Call between pages or documents etc to free up memory and forget |
482 | | * adaptive data. |
483 | | */ |
484 | 0 | void TessBaseAPI::ClearAdaptiveClassifier() { |
485 | 0 | if (tesseract_ == nullptr) { |
486 | 0 | return; |
487 | 0 | } |
488 | 0 | tesseract_->ResetAdaptiveClassifier(); |
489 | 0 | tesseract_->ResetDocumentDictionary(); |
490 | 0 | } |
491 | | #endif // ndef DISABLED_LEGACY_ENGINE |
492 | | |
493 | | /** |
494 | | * Provide an image for Tesseract to recognize. Format is as |
495 | | * TesseractRect above. Copies the image buffer and converts to Pix. |
496 | | * SetImage clears all recognition results, and sets the rectangle to the |
497 | | * full image, so it may be followed immediately by a GetUTF8Text, and it |
498 | | * will automatically perform recognition. |
499 | | */ |
500 | | void TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height, |
501 | 0 | int bytes_per_pixel, int bytes_per_line) { |
502 | 0 | if (InternalSetImage()) { |
503 | 0 | thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line); |
504 | 0 | SetInputImage(thresholder_->GetPixRect()); |
505 | 0 | } |
506 | 0 | } |
507 | | |
508 | 0 | void TessBaseAPI::SetSourceResolution(int ppi) { |
509 | 0 | if (thresholder_) { |
510 | 0 | thresholder_->SetSourceYResolution(ppi); |
511 | 0 | } else { |
512 | 0 | tprintf("Please call SetImage before SetSourceResolution.\n"); |
513 | 0 | } |
514 | 0 | } |
515 | | |
516 | | /** |
517 | | * Provide an image for Tesseract to recognize. As with SetImage above, |
518 | | * Tesseract takes its own copy of the image, so it need not persist until |
519 | | * after Recognize. |
520 | | * Pix vs raw, which to use? |
521 | | * Use Pix where possible. Tesseract uses Pix as its internal representation |
522 | | * and it is therefore more efficient to provide a Pix directly. |
523 | | */ |
524 | 7.74k | void TessBaseAPI::SetImage(Pix *pix) { |
525 | 7.74k | if (InternalSetImage()) { |
526 | 7.74k | if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) { |
527 | | // remove alpha channel from png |
528 | 0 | Pix *p1 = pixRemoveAlpha(pix); |
529 | 0 | pixSetSpp(p1, 3); |
530 | 0 | (void)pixCopy(pix, p1); |
531 | 0 | pixDestroy(&p1); |
532 | 0 | } |
533 | 7.74k | thresholder_->SetImage(pix); |
534 | 7.74k | SetInputImage(thresholder_->GetPixRect()); |
535 | 7.74k | } |
536 | 7.74k | } |
537 | | |
538 | | /** |
539 | | * Restrict recognition to a sub-rectangle of the image. Call after SetImage. |
540 | | * Each SetRectangle clears the recognition results so multiple rectangles |
541 | | * can be recognized with the same image. |
542 | | */ |
543 | 0 | void TessBaseAPI::SetRectangle(int left, int top, int width, int height) { |
544 | 0 | if (thresholder_ == nullptr) { |
545 | 0 | return; |
546 | 0 | } |
547 | 0 | thresholder_->SetRectangle(left, top, width, height); |
548 | 0 | ClearResults(); |
549 | 0 | } |
550 | | |
551 | | /** |
552 | | * ONLY available after SetImage if you have Leptonica installed. |
553 | | * Get a copy of the internal thresholded image from Tesseract. |
554 | | */ |
555 | 0 | Pix *TessBaseAPI::GetThresholdedImage() { |
556 | 0 | if (tesseract_ == nullptr || thresholder_ == nullptr) { |
557 | 0 | return nullptr; |
558 | 0 | } |
559 | 0 | if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) { |
560 | 0 | return nullptr; |
561 | 0 | } |
562 | 0 | return tesseract_->pix_binary().clone(); |
563 | 0 | } |
564 | | |
/**
 * Get the result of page layout analysis as a leptonica-style
 * Boxa, Pixa pair, in reading order.
 * Can be called before or after Recognize.
 * Implemented as GetComponentImages at RIL_BLOCK level with text_only off
 * and no block ids requested.
 */
Boxa *TessBaseAPI::GetRegions(Pixa **pixa) {
  return GetComponentImages(RIL_BLOCK, false, pixa, nullptr);
}
573 | | |
/**
 * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order.
 * Can be called before or after Recognize.
 * If blockids is not nullptr, the block-id of each line is also returned as an
 * array of one element per line. delete [] after use.
 * If paraids is not nullptr, the paragraph-id of each line within its block is
 * also returned as an array of one element per line. delete [] after use.
 * Implemented as GetComponentImages at RIL_TEXTLINE level with text_only on;
 * raw_image and raw_padding are passed through unchanged.
 */
Boxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa,
                                int **blockids, int **paraids) {
  return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, pixa, blockids, paraids);
}
586 | | |
/**
 * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
 * pair, in reading order. Enables downstream handling of non-rectangular
 * regions.
 * Can be called before or after Recognize.
 * If blockids is not nullptr, the block-id of each line is also returned as an
 * array of one element per line. delete [] after use.
 * Implemented as GetComponentImages at RIL_TEXTLINE level with text_only off.
 */
Boxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) {
  return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
}
598 | | |
/**
 * Get the words as a leptonica-style
 * Boxa, Pixa pair, in reading order.
 * Can be called before or after Recognize.
 * Implemented as GetComponentImages at RIL_WORD level with text_only on and
 * no block ids requested.
 */
Boxa *TessBaseAPI::GetWords(Pixa **pixa) {
  return GetComponentImages(RIL_WORD, true, pixa, nullptr);
}
607 | | |
/**
 * Gets the individual connected (text) components (created
 * after pages segmentation step, but before recognition)
 * as a leptonica-style Boxa, Pixa pair, in reading order.
 * Can be called before or after Recognize.
 * Implemented as GetComponentImages at RIL_SYMBOL level with text_only on and
 * no block ids requested.
 */
Boxa *TessBaseAPI::GetConnectedComponents(Pixa **pixa) {
  return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr);
}
617 | | |
618 | | /** |
619 | | * Get the given level kind of components (block, textline, word etc.) as a |
620 | | * leptonica-style Boxa, Pixa pair, in reading order. |
621 | | * Can be called before or after Recognize. |
622 | | * If blockids is not nullptr, the block-id of each component is also returned |
623 | | * as an array of one element per component. delete [] after use. |
624 | | * If text_only is true, then only text components are returned. |
625 | | */ |
626 | | Boxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, |
627 | | const int raw_padding, Pixa **pixa, int **blockids, |
628 | 0 | int **paraids) { |
629 | 0 | /*non-const*/ std::unique_ptr</*non-const*/ PageIterator> page_it(GetIterator()); |
630 | 0 | if (page_it == nullptr) { |
631 | 0 | page_it.reset(AnalyseLayout()); |
632 | 0 | } |
633 | 0 | if (page_it == nullptr) { |
634 | 0 | return nullptr; // Failed. |
635 | 0 | } |
636 | | |
637 | | // Count the components to get a size for the arrays. |
638 | 0 | int component_count = 0; |
639 | 0 | int left, top, right, bottom; |
640 | |
|
641 | 0 | if (raw_image) { |
642 | | // Get bounding box in original raw image with padding. |
643 | 0 | do { |
644 | 0 | if (page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom) && |
645 | 0 | (!text_only || PTIsTextType(page_it->BlockType()))) { |
646 | 0 | ++component_count; |
647 | 0 | } |
648 | 0 | } while (page_it->Next(level)); |
649 | 0 | } else { |
650 | | // Get bounding box from binarized imaged. Note that this could be |
651 | | // differently scaled from the original image. |
652 | 0 | do { |
653 | 0 | if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) && |
654 | 0 | (!text_only || PTIsTextType(page_it->BlockType()))) { |
655 | 0 | ++component_count; |
656 | 0 | } |
657 | 0 | } while (page_it->Next(level)); |
658 | 0 | } |
659 | |
|
660 | 0 | Boxa *boxa = boxaCreate(component_count); |
661 | 0 | if (pixa != nullptr) { |
662 | 0 | *pixa = pixaCreate(component_count); |
663 | 0 | } |
664 | 0 | if (blockids != nullptr) { |
665 | 0 | *blockids = new int[component_count]; |
666 | 0 | } |
667 | 0 | if (paraids != nullptr) { |
668 | 0 | *paraids = new int[component_count]; |
669 | 0 | } |
670 | |
|
671 | 0 | int blockid = 0; |
672 | 0 | int paraid = 0; |
673 | 0 | int component_index = 0; |
674 | 0 | page_it->Begin(); |
675 | 0 | do { |
676 | 0 | bool got_bounding_box; |
677 | 0 | if (raw_image) { |
678 | 0 | got_bounding_box = page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom); |
679 | 0 | } else { |
680 | 0 | got_bounding_box = page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom); |
681 | 0 | } |
682 | 0 | if (got_bounding_box && (!text_only || PTIsTextType(page_it->BlockType()))) { |
683 | 0 | Box *lbox = boxCreate(left, top, right - left, bottom - top); |
684 | 0 | boxaAddBox(boxa, lbox, L_INSERT); |
685 | 0 | if (pixa != nullptr) { |
686 | 0 | Pix *pix = nullptr; |
687 | 0 | if (raw_image) { |
688 | 0 | pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top); |
689 | 0 | } else { |
690 | 0 | pix = page_it->GetBinaryImage(level); |
691 | 0 | } |
692 | 0 | pixaAddPix(*pixa, pix, L_INSERT); |
693 | 0 | pixaAddBox(*pixa, lbox, L_CLONE); |
694 | 0 | } |
695 | 0 | if (paraids != nullptr) { |
696 | 0 | (*paraids)[component_index] = paraid; |
697 | 0 | if (page_it->IsAtFinalElement(RIL_PARA, level)) { |
698 | 0 | ++paraid; |
699 | 0 | } |
700 | 0 | } |
701 | 0 | if (blockids != nullptr) { |
702 | 0 | (*blockids)[component_index] = blockid; |
703 | 0 | if (page_it->IsAtFinalElement(RIL_BLOCK, level)) { |
704 | 0 | ++blockid; |
705 | 0 | paraid = 0; |
706 | 0 | } |
707 | 0 | } |
708 | 0 | ++component_index; |
709 | 0 | } |
710 | 0 | } while (page_it->Next(level)); |
711 | 0 | return boxa; |
712 | 0 | } |
713 | | |
714 | 0 | int TessBaseAPI::GetThresholdedImageScaleFactor() const { |
715 | 0 | if (thresholder_ == nullptr) { |
716 | 0 | return 0; |
717 | 0 | } |
718 | 0 | return thresholder_->GetScaleFactor(); |
719 | 0 | } |
720 | | |
/**
 * Runs page layout analysis in the mode set by SetPageSegMode.
 * May optionally be called prior to Recognize to get access to just
 * the page layout results. Returns an iterator to the results.
 * If merge_similar_words is true, words are combined where suitable for use
 * with a line recognizer. Use if you want to use AnalyseLayout to find the
 * textlines, and then want to process textline fragments with an external
 * line recognizer.
 * Returns nullptr on error or an empty page.
 * The returned iterator must be deleted after use.
 * WARNING! This class points to data held within the TessBaseAPI class, and
 * therefore can only be used while the TessBaseAPI class still exists and
 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
 * DetectOS, or anything else that changes the internal PAGE_RES.
 *
 * This overload forwards with merge_similar_words set to false.
 */
PageIterator *TessBaseAPI::AnalyseLayout() {
  return AnalyseLayout(false);
}
739 | | |
740 | 0 | PageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) { |
741 | 0 | if (FindLines() == 0) { |
742 | 0 | if (block_list_->empty()) { |
743 | 0 | return nullptr; // The page was empty. |
744 | 0 | } |
745 | 0 | page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr); |
746 | 0 | DetectParagraphs(false); |
747 | 0 | return new PageIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(), |
748 | 0 | thresholder_->GetScaledYResolution(), rect_left_, rect_top_, |
749 | 0 | rect_width_, rect_height_); |
750 | 0 | } |
751 | 0 | return nullptr; |
752 | 0 | } |
753 | | |
754 | | /** |
755 | | * Recognize the tesseract global image and return the result as Tesseract |
756 | | * internal structures. |
757 | | */ |
758 | 7.74k | int TessBaseAPI::Recognize(ETEXT_DESC *monitor) { |
759 | 7.74k | if (tesseract_ == nullptr) { |
760 | 0 | return -1; |
761 | 0 | } |
762 | 7.74k | if (FindLines() != 0) { |
763 | 0 | return -1; |
764 | 0 | } |
765 | 7.74k | delete page_res_; |
766 | 7.74k | if (block_list_->empty()) { |
767 | 863 | page_res_ = new PAGE_RES(false, block_list_, &tesseract_->prev_word_best_choice_); |
768 | 863 | return 0; // Empty page. |
769 | 863 | } |
770 | | |
771 | 6.88k | tesseract_->SetBlackAndWhitelist(); |
772 | 6.88k | recognition_done_ = true; |
773 | 6.88k | #ifndef DISABLED_LEGACY_ENGINE |
774 | 6.88k | if (tesseract_->tessedit_resegment_from_line_boxes) { |
775 | 0 | page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), true, block_list_); |
776 | 6.88k | } else if (tesseract_->tessedit_resegment_from_boxes) { |
777 | 0 | page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), false, block_list_); |
778 | 0 | } else |
779 | 6.88k | #endif // ndef DISABLED_LEGACY_ENGINE |
780 | 6.88k | { |
781 | 6.88k | page_res_ = |
782 | 6.88k | new PAGE_RES(tesseract_->AnyLSTMLang(), block_list_, &tesseract_->prev_word_best_choice_); |
783 | 6.88k | } |
784 | | |
785 | 6.88k | if (page_res_ == nullptr) { |
786 | 0 | return -1; |
787 | 0 | } |
788 | | |
789 | 6.88k | if (tesseract_->tessedit_train_line_recognizer) { |
790 | 0 | if (!tesseract_->TrainLineRecognizer(input_file_.c_str(), output_file_, block_list_)) { |
791 | 0 | return -1; |
792 | 0 | } |
793 | 0 | tesseract_->CorrectClassifyWords(page_res_); |
794 | 0 | return 0; |
795 | 0 | } |
796 | 6.88k | #ifndef DISABLED_LEGACY_ENGINE |
797 | 6.88k | if (tesseract_->tessedit_make_boxes_from_boxes) { |
798 | 0 | tesseract_->CorrectClassifyWords(page_res_); |
799 | 0 | return 0; |
800 | 0 | } |
801 | 6.88k | #endif // ndef DISABLED_LEGACY_ENGINE |
802 | | |
803 | 6.88k | int result = 0; |
804 | 6.88k | if (tesseract_->interactive_display_mode) { |
805 | | #ifndef GRAPHICS_DISABLED |
806 | | tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_); |
807 | | #endif // !GRAPHICS_DISABLED |
808 | | // The page_res is invalid after an interactive session, so cleanup |
809 | | // in a way that lets us continue to the next page without crashing. |
810 | 0 | delete page_res_; |
811 | 0 | page_res_ = nullptr; |
812 | 0 | return -1; |
813 | 0 | #ifndef DISABLED_LEGACY_ENGINE |
814 | 6.88k | } else if (tesseract_->tessedit_train_from_boxes) { |
815 | 0 | std::string fontname; |
816 | 0 | ExtractFontName(output_file_.c_str(), &fontname); |
817 | 0 | tesseract_->ApplyBoxTraining(fontname, page_res_); |
818 | 6.88k | } else if (tesseract_->tessedit_ambigs_training) { |
819 | 0 | FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str()); |
820 | | // OCR the page segmented into words by tesseract. |
821 | 0 | tesseract_->recog_training_segmented(input_file_.c_str(), page_res_, monitor, |
822 | 0 | training_output_file); |
823 | 0 | fclose(training_output_file); |
824 | 0 | #endif // ndef DISABLED_LEGACY_ENGINE |
825 | 6.88k | } else { |
826 | | // Now run the main recognition. |
827 | 6.88k | bool wait_for_text = true; |
828 | 6.88k | GetBoolVariable("paragraph_text_based", &wait_for_text); |
829 | 6.88k | if (!wait_for_text) { |
830 | 0 | DetectParagraphs(false); |
831 | 0 | } |
832 | 6.88k | if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) { |
833 | 6.88k | if (wait_for_text) { |
834 | 6.88k | DetectParagraphs(true); |
835 | 6.88k | } |
836 | 6.88k | } else { |
837 | 0 | result = -1; |
838 | 0 | } |
839 | 6.88k | } |
840 | 6.88k | return result; |
841 | 6.88k | } |
842 | | |
843 | | // Takes ownership of the input pix. |
844 | 7.74k | void TessBaseAPI::SetInputImage(Pix *pix) { |
845 | 7.74k | tesseract_->set_pix_original(pix); |
846 | 7.74k | } |
847 | | |
848 | 0 | Pix *TessBaseAPI::GetInputImage() { |
849 | 0 | return tesseract_->pix_original(); |
850 | 0 | } |
851 | | |
852 | 0 | const char *TessBaseAPI::GetInputName() { |
853 | 0 | if (!input_file_.empty()) { |
854 | 0 | return input_file_.c_str(); |
855 | 0 | } |
856 | 0 | return nullptr; |
857 | 0 | } |
858 | | |
859 | 0 | const char *TessBaseAPI::GetDatapath() { |
860 | 0 | return tesseract_->datadir.c_str(); |
861 | 0 | } |
862 | | |
863 | 0 | int TessBaseAPI::GetSourceYResolution() { |
864 | 0 | if (thresholder_ == nullptr) |
865 | 0 | return -1; |
866 | 0 | return thresholder_->GetSourceYResolution(); |
867 | 0 | } |
868 | | |
869 | | // If flist exists, get data from there. Otherwise get data from buf. |
870 | | // Seems convoluted, but is the easiest way I know of to meet multiple |
871 | | // goals. Support streaming from stdin, and also work on platforms |
872 | | // lacking fmemopen. |
873 | | // TODO: check different logic for flist/buf and simplify. |
874 | | bool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config, |
875 | | int timeout_millisec, TessResultRenderer *renderer, |
876 | 0 | int tessedit_page_number) { |
877 | 0 | if (!flist && !buf) { |
878 | 0 | return false; |
879 | 0 | } |
880 | 0 | unsigned page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; |
881 | 0 | char pagename[MAX_PATH]; |
882 | |
|
883 | 0 | std::vector<std::string> lines; |
884 | 0 | if (!flist) { |
885 | 0 | std::string line; |
886 | 0 | for (const auto ch : *buf) { |
887 | 0 | if (ch == '\n') { |
888 | 0 | lines.push_back(line); |
889 | 0 | line.clear(); |
890 | 0 | } else { |
891 | 0 | line.push_back(ch); |
892 | 0 | } |
893 | 0 | } |
894 | 0 | if (!line.empty()) { |
895 | | // Add last line without terminating LF. |
896 | 0 | lines.push_back(line); |
897 | 0 | } |
898 | 0 | if (lines.empty()) { |
899 | 0 | return false; |
900 | 0 | } |
901 | 0 | } |
902 | | |
903 | | // Skip to the requested page number. |
904 | 0 | for (unsigned i = 0; i < page; i++) { |
905 | 0 | if (flist) { |
906 | 0 | if (fgets(pagename, sizeof(pagename), flist) == nullptr) { |
907 | 0 | break; |
908 | 0 | } |
909 | 0 | } |
910 | 0 | } |
911 | | |
912 | | // Begin producing output |
913 | 0 | if (renderer && !renderer->BeginDocument(document_title.c_str())) { |
914 | 0 | return false; |
915 | 0 | } |
916 | | |
917 | | // Loop over all pages - or just the requested one |
918 | 0 | while (true) { |
919 | 0 | if (flist) { |
920 | 0 | if (fgets(pagename, sizeof(pagename), flist) == nullptr) { |
921 | 0 | break; |
922 | 0 | } |
923 | 0 | } else { |
924 | 0 | if (page >= lines.size()) { |
925 | 0 | break; |
926 | 0 | } |
927 | 0 | snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str()); |
928 | 0 | } |
929 | 0 | chomp_string(pagename); |
930 | 0 | Pix *pix = pixRead(pagename); |
931 | 0 | if (pix == nullptr) { |
932 | 0 | tprintf("Image file %s cannot be read!\n", pagename); |
933 | 0 | return false; |
934 | 0 | } |
935 | 0 | tprintf("Page %u : %s\n", page, pagename); |
936 | 0 | bool r = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer); |
937 | 0 | pixDestroy(&pix); |
938 | 0 | if (!r) { |
939 | 0 | return false; |
940 | 0 | } |
941 | 0 | if (tessedit_page_number >= 0) { |
942 | 0 | break; |
943 | 0 | } |
944 | 0 | ++page; |
945 | 0 | } |
946 | | |
947 | | // Finish producing output |
948 | 0 | if (renderer && !renderer->EndDocument()) { |
949 | 0 | return false; |
950 | 0 | } |
951 | 0 | return true; |
952 | 0 | } |
953 | | |
954 | | bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename, |
955 | | const char *retry_config, int timeout_millisec, |
956 | | TessResultRenderer *renderer, |
957 | 0 | int tessedit_page_number) { |
958 | 0 | Pix *pix = nullptr; |
959 | 0 | int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; |
960 | 0 | size_t offset = 0; |
961 | 0 | for (;; ++page) { |
962 | 0 | if (tessedit_page_number >= 0) { |
963 | 0 | page = tessedit_page_number; |
964 | 0 | pix = (data) ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page); |
965 | 0 | } else { |
966 | 0 | pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset) |
967 | 0 | : pixReadFromMultipageTiff(filename, &offset); |
968 | 0 | } |
969 | 0 | if (pix == nullptr) { |
970 | 0 | break; |
971 | 0 | } |
972 | 0 | if (offset || page > 0) { |
973 | | // Only print page number for multipage TIFF file. |
974 | 0 | tprintf("Page %d\n", page + 1); |
975 | 0 | } |
976 | 0 | auto page_string = std::to_string(page); |
977 | 0 | SetVariable("applybox_page", page_string.c_str()); |
978 | 0 | bool r = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer); |
979 | 0 | pixDestroy(&pix); |
980 | 0 | if (!r) { |
981 | 0 | return false; |
982 | 0 | } |
983 | 0 | if (tessedit_page_number >= 0) { |
984 | 0 | break; |
985 | 0 | } |
986 | 0 | if (!offset) { |
987 | 0 | break; |
988 | 0 | } |
989 | 0 | } |
990 | 0 | return true; |
991 | 0 | } |
992 | | |
993 | | // Master ProcessPages calls ProcessPagesInternal and then does any post- |
994 | | // processing required due to being in a training mode. |
995 | | bool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec, |
996 | 0 | TessResultRenderer *renderer) { |
997 | 0 | bool result = ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer); |
998 | 0 | #ifndef DISABLED_LEGACY_ENGINE |
999 | 0 | if (result) { |
1000 | 0 | if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) { |
1001 | 0 | tprintf("Write of TR file failed: %s\n", output_file_.c_str()); |
1002 | 0 | return false; |
1003 | 0 | } |
1004 | 0 | } |
1005 | 0 | #endif // ndef DISABLED_LEGACY_ENGINE |
1006 | 0 | return result; |
1007 | 0 | } |
1008 | | |
1009 | | #ifdef HAVE_LIBCURL |
// Write callback handed to libcurl: appends the size * nmemb bytes found at
// contents to the std::string that userp points to, and returns that byte
// count.
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
  const size_t byte_count = size * nmemb;
  auto *sink = static_cast<std::string *>(userp);
  sink->append(static_cast<const char *>(contents), byte_count);
  return byte_count;
}
1016 | | #endif |
1017 | | |
1018 | | // In the ideal scenario, Tesseract will start working on data as soon |
1019 | | // as it can. For example, if you stream a filelist through stdin, we |
1020 | | // should start the OCR process as soon as the first filename is |
1021 | | // available. This is particularly useful when hooking Tesseract up to |
1022 | | // slow hardware such as a book scanning machine. |
1023 | | // |
1024 | | // Unfortunately there are tradeoffs. You can't seek on stdin. That |
1025 | | // makes automatic detection of datatype (TIFF? filelist? PNG?) |
1026 | | // impractical. So we support a command line flag to explicitly |
1027 | | // identify the scenario that really matters: filelists on |
1028 | | // stdin. We'll still do our best if the user likes pipes. |
1029 | | bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config, |
1030 | 0 | int timeout_millisec, TessResultRenderer *renderer) { |
1031 | 0 | bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-"); |
1032 | 0 | if (stdInput) { |
1033 | | #ifdef WIN32 |
1034 | | if (_setmode(_fileno(stdin), _O_BINARY) == -1) |
1035 | | tprintf("ERROR: cin to binary: %s", strerror(errno)); |
1036 | | #endif // WIN32 |
1037 | 0 | } |
1038 | |
|
1039 | 0 | if (stream_filelist) { |
1040 | 0 | return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer, |
1041 | 0 | tesseract_->tessedit_page_number); |
1042 | 0 | } |
1043 | | |
1044 | | // At this point we are officially in autodection territory. |
1045 | | // That means any data in stdin must be buffered, to make it |
1046 | | // seekable. |
1047 | 0 | std::string buf; |
1048 | 0 | const l_uint8 *data = nullptr; |
1049 | 0 | if (stdInput) { |
1050 | 0 | buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>())); |
1051 | 0 | data = reinterpret_cast<const l_uint8 *>(buf.data()); |
1052 | 0 | } else if (strstr(filename, "://") != nullptr) { |
1053 | | // Get image or image list by URL. |
1054 | | #ifdef HAVE_LIBCURL |
1055 | | CURL *curl = curl_easy_init(); |
1056 | | if (curl == nullptr) { |
1057 | | fprintf(stderr, "Error, curl_easy_init failed\n"); |
1058 | | return false; |
1059 | | } else { |
1060 | | CURLcode curlcode; |
1061 | | auto error = [curl, &curlcode](const char *function) { |
1062 | | fprintf(stderr, "Error, %s failed with error %s\n", function, curl_easy_strerror(curlcode)); |
1063 | | curl_easy_cleanup(curl); |
1064 | | return false; |
1065 | | }; |
1066 | | curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename); |
1067 | | if (curlcode != CURLE_OK) { |
1068 | | return error("curl_easy_setopt"); |
1069 | | } |
1070 | | curlcode = curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L); |
1071 | | if (curlcode != CURLE_OK) { |
1072 | | return error("curl_easy_setopt"); |
1073 | | } |
1074 | | // Follow HTTP, HTTPS, FTP and FTPS redirects. |
1075 | | curlcode = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); |
1076 | | if (curlcode != CURLE_OK) { |
1077 | | return error("curl_easy_setopt"); |
1078 | | } |
1079 | | // Allow no more than 8 redirections to prevent endless loops. |
1080 | | curlcode = curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 8); |
1081 | | if (curlcode != CURLE_OK) { |
1082 | | return error("curl_easy_setopt"); |
1083 | | } |
1084 | | int timeout = curl_timeout; |
1085 | | if (timeout > 0) { |
1086 | | curlcode = curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L); |
1087 | | if (curlcode != CURLE_OK) { |
1088 | | return error("curl_easy_setopt"); |
1089 | | } |
1090 | | curlcode = curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); |
1091 | | if (curlcode != CURLE_OK) { |
1092 | | return error("curl_easy_setopt"); |
1093 | | } |
1094 | | } |
1095 | | std::string cookiefile = curl_cookiefile; |
1096 | | if (!cookiefile.empty()) { |
1097 | | curlcode = curl_easy_setopt(curl, CURLOPT_COOKIEFILE, cookiefile.c_str()); |
1098 | | if (curlcode != CURLE_OK) { |
1099 | | return error("curl_easy_setopt"); |
1100 | | } |
1101 | | } |
1102 | | curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); |
1103 | | if (curlcode != CURLE_OK) { |
1104 | | return error("curl_easy_setopt"); |
1105 | | } |
1106 | | curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf); |
1107 | | if (curlcode != CURLE_OK) { |
1108 | | return error("curl_easy_setopt"); |
1109 | | } |
1110 | | curlcode = curl_easy_setopt(curl, CURLOPT_USERAGENT, "Tesseract OCR"); |
1111 | | if (curlcode != CURLE_OK) { |
1112 | | return error("curl_easy_setopt"); |
1113 | | } |
1114 | | curlcode = curl_easy_perform(curl); |
1115 | | if (curlcode != CURLE_OK) { |
1116 | | return error("curl_easy_perform"); |
1117 | | } |
1118 | | curl_easy_cleanup(curl); |
1119 | | data = reinterpret_cast<const l_uint8 *>(buf.data()); |
1120 | | } |
1121 | | #else |
1122 | 0 | fprintf(stderr, "Error, this tesseract has no URL support\n"); |
1123 | 0 | return false; |
1124 | 0 | #endif |
1125 | 0 | } else { |
1126 | | // Check whether the input file can be read. |
1127 | 0 | if (FILE *file = fopen(filename, "rb")) { |
1128 | 0 | fclose(file); |
1129 | 0 | } else { |
1130 | 0 | fprintf(stderr, "Error, cannot read input file %s: %s\n", filename, strerror(errno)); |
1131 | 0 | return false; |
1132 | 0 | } |
1133 | 0 | } |
1134 | | |
1135 | | // Here is our autodetection |
1136 | 0 | int format; |
1137 | 0 | int r = |
1138 | 0 | (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format); |
1139 | | |
1140 | | // Maybe we have a filelist |
1141 | 0 | if (r != 0 || format == IFF_UNKNOWN) { |
1142 | 0 | std::string s; |
1143 | 0 | if (data != nullptr) { |
1144 | 0 | s = buf.c_str(); |
1145 | 0 | } else { |
1146 | 0 | std::ifstream t(filename); |
1147 | 0 | std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>()); |
1148 | 0 | s = u.c_str(); |
1149 | 0 | } |
1150 | 0 | return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer, |
1151 | 0 | tesseract_->tessedit_page_number); |
1152 | 0 | } |
1153 | | |
1154 | | // Maybe we have a TIFF which is potentially multipage |
1155 | 0 | bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE || |
1156 | 0 | format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW || |
1157 | 0 | #if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76 |
1158 | 0 | format == IFF_TIFF_JPEG || |
1159 | 0 | #endif |
1160 | 0 | format == IFF_TIFF_ZIP); |
1161 | | |
1162 | | // Fail early if we can, before producing any output |
1163 | 0 | Pix *pix = nullptr; |
1164 | 0 | if (!tiff) { |
1165 | 0 | pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename); |
1166 | 0 | if (pix == nullptr) { |
1167 | 0 | return false; |
1168 | 0 | } |
1169 | 0 | } |
1170 | | |
1171 | | // Begin the output |
1172 | 0 | if (renderer && !renderer->BeginDocument(document_title.c_str())) { |
1173 | 0 | pixDestroy(&pix); |
1174 | 0 | return false; |
1175 | 0 | } |
1176 | | |
1177 | | // Produce output |
1178 | 0 | r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec, |
1179 | 0 | renderer, tesseract_->tessedit_page_number) |
1180 | 0 | : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer); |
1181 | | |
1182 | | // Clean up memory as needed |
1183 | 0 | pixDestroy(&pix); |
1184 | | |
1185 | | // End the output |
1186 | 0 | if (!r || (renderer && !renderer->EndDocument())) { |
1187 | 0 | return false; |
1188 | 0 | } |
1189 | 0 | return true; |
1190 | 0 | } |
1191 | | |
1192 | | bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename, |
1193 | | const char *retry_config, int timeout_millisec, |
1194 | 0 | TessResultRenderer *renderer) { |
1195 | 0 | SetInputName(filename); |
1196 | 0 | SetImage(pix); |
1197 | 0 | bool failed = false; |
1198 | |
|
1199 | 0 | if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) { |
1200 | | // Disabled character recognition |
1201 | 0 | if (! std::unique_ptr<const PageIterator>(AnalyseLayout())) { |
1202 | 0 | failed = true; |
1203 | 0 | } |
1204 | 0 | } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) { |
1205 | 0 | failed = FindLines() != 0; |
1206 | 0 | } else if (timeout_millisec > 0) { |
1207 | | // Running with a timeout. |
1208 | 0 | ETEXT_DESC monitor; |
1209 | 0 | monitor.cancel = nullptr; |
1210 | 0 | monitor.cancel_this = nullptr; |
1211 | 0 | monitor.set_deadline_msecs(timeout_millisec); |
1212 | | |
1213 | | // Now run the main recognition. |
1214 | 0 | failed = Recognize(&monitor) < 0; |
1215 | 0 | } else { |
1216 | | // Normal layout and character recognition with no timeout. |
1217 | 0 | failed = Recognize(nullptr) < 0; |
1218 | 0 | } |
1219 | |
|
1220 | 0 | if (tesseract_->tessedit_write_images) { |
1221 | 0 | Pix *page_pix = GetThresholdedImage(); |
1222 | 0 | std::string output_filename = output_file_ + ".processed"; |
1223 | 0 | if (page_index > 0) { |
1224 | 0 | output_filename += std::to_string(page_index); |
1225 | 0 | } |
1226 | 0 | output_filename += ".tif"; |
1227 | 0 | pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4); |
1228 | 0 | pixDestroy(&page_pix); |
1229 | 0 | } |
1230 | |
|
1231 | 0 | if (failed && retry_config != nullptr && retry_config[0] != '\0') { |
1232 | | // Save current config variables before switching modes. |
1233 | 0 | FILE *fp = fopen(kOldVarsFile, "wb"); |
1234 | 0 | if (fp == nullptr) { |
1235 | 0 | tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile); |
1236 | 0 | } else { |
1237 | 0 | PrintVariables(fp); |
1238 | 0 | fclose(fp); |
1239 | 0 | } |
1240 | | // Switch to alternate mode for retry. |
1241 | 0 | ReadConfigFile(retry_config); |
1242 | 0 | SetImage(pix); |
1243 | 0 | Recognize(nullptr); |
1244 | | // Restore saved config variables. |
1245 | 0 | ReadConfigFile(kOldVarsFile); |
1246 | 0 | } |
1247 | |
|
1248 | 0 | if (renderer && !failed) { |
1249 | 0 | failed = !renderer->AddImage(this); |
1250 | 0 | } |
1251 | |
|
1252 | 0 | return !failed; |
1253 | 0 | } |
1254 | | |
1255 | | /** |
1256 | | * Get a left-to-right iterator to the results of LayoutAnalysis and/or |
1257 | | * Recognize. The returned iterator must be deleted after use. |
1258 | | */ |
1259 | 0 | LTRResultIterator *TessBaseAPI::GetLTRIterator() { |
1260 | 0 | if (tesseract_ == nullptr || page_res_ == nullptr) { |
1261 | 0 | return nullptr; |
1262 | 0 | } |
1263 | 0 | return new LTRResultIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(), |
1264 | 0 | thresholder_->GetScaledYResolution(), rect_left_, rect_top_, |
1265 | 0 | rect_width_, rect_height_); |
1266 | 0 | } |
1267 | | |
1268 | | /** |
1269 | | * Get a reading-order iterator to the results of LayoutAnalysis and/or |
1270 | | * Recognize. The returned iterator must be deleted after use. |
1271 | | * WARNING! This class points to data held within the TessBaseAPI class, and |
1272 | | * therefore can only be used while the TessBaseAPI class still exists and |
1273 | | * has not been subjected to a call of Init, SetImage, Recognize, Clear, End |
1274 | | * DetectOS, or anything else that changes the internal PAGE_RES. |
1275 | | */ |
1276 | 7.74k | ResultIterator *TessBaseAPI::GetIterator() { |
1277 | 7.74k | if (tesseract_ == nullptr || page_res_ == nullptr) { |
1278 | 0 | return nullptr; |
1279 | 0 | } |
1280 | 7.74k | return ResultIterator::StartOfParagraph(LTRResultIterator( |
1281 | 7.74k | page_res_, tesseract_, thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(), |
1282 | 7.74k | rect_left_, rect_top_, rect_width_, rect_height_)); |
1283 | 7.74k | } |
1284 | | |
1285 | | /** |
1286 | | * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. |
1287 | | * The returned iterator must be deleted after use. |
1288 | | * WARNING! This class points to data held within the TessBaseAPI class, and |
1289 | | * therefore can only be used while the TessBaseAPI class still exists and |
1290 | | * has not been subjected to a call of Init, SetImage, Recognize, Clear, End |
1291 | | * DetectOS, or anything else that changes the internal PAGE_RES. |
1292 | | */ |
1293 | 6.88k | MutableIterator *TessBaseAPI::GetMutableIterator() { |
1294 | 6.88k | if (tesseract_ == nullptr || page_res_ == nullptr) { |
1295 | 0 | return nullptr; |
1296 | 0 | } |
1297 | 6.88k | return new MutableIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(), |
1298 | 6.88k | thresholder_->GetScaledYResolution(), rect_left_, rect_top_, |
1299 | 6.88k | rect_width_, rect_height_); |
1300 | 6.88k | } |
1301 | | |
1302 | | /** Make a text string from the internal data structures. */ |
1303 | 7.74k | char *TessBaseAPI::GetUTF8Text() { |
1304 | 7.74k | if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { |
1305 | 0 | return nullptr; |
1306 | 0 | } |
1307 | 7.74k | std::string text(""); |
1308 | 7.74k | const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator()); |
1309 | 10.8k | do { |
1310 | 10.8k | if (it->Empty(RIL_PARA)) { |
1311 | 865 | continue; |
1312 | 865 | } |
1313 | 9.97k | auto block_type = it->BlockType(); |
1314 | 9.97k | switch (block_type) { |
1315 | 0 | case PT_FLOWING_IMAGE: |
1316 | 0 | case PT_HEADING_IMAGE: |
1317 | 0 | case PT_PULLOUT_IMAGE: |
1318 | 0 | case PT_HORZ_LINE: |
1319 | 0 | case PT_VERT_LINE: |
1320 | | // Ignore images and lines for text output. |
1321 | 0 | continue; |
1322 | 0 | case PT_NOISE: |
1323 | 0 | tprintf("TODO: Please report image which triggers the noise case.\n"); |
1324 | 0 | ASSERT_HOST(false); |
1325 | 9.97k | default: |
1326 | 9.97k | break; |
1327 | 9.97k | } |
1328 | | |
1329 | 9.97k | const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA)); |
1330 | 9.97k | text += para_text.get(); |
1331 | 10.8k | } while (it->Next(RIL_PARA)); |
1332 | 7.74k | return copy_string(text); |
1333 | 7.74k | } |
1334 | | |
1335 | 0 | static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) { |
1336 | 0 | int left, top, right, bottom; |
1337 | 0 | it->BoundingBox(level, &left, &top, &right, &bottom); |
1338 | 0 | text += "\t" + std::to_string(left); |
1339 | 0 | text += "\t" + std::to_string(top); |
1340 | 0 | text += "\t" + std::to_string(right - left); |
1341 | 0 | text += "\t" + std::to_string(bottom - top); |
1342 | 0 | } |
1343 | | |
1344 | | /** |
1345 | | * Make a TSV-formatted string from the internal data structures. |
1346 | | * page_number is 0-based but will appear in the output as 1-based. |
1347 | | * Returned string must be freed with the delete [] operator. |
1348 | | */ |
1349 | 0 | char *TessBaseAPI::GetTSVText(int page_number) { |
1350 | 0 | if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) { |
1351 | 0 | return nullptr; |
1352 | 0 | } |
1353 | | |
1354 | | #if !defined(NDEBUG) |
1355 | | int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; |
1356 | | #endif |
1357 | 0 | int page_id = page_number + 1; // we use 1-based page numbers. |
1358 | |
|
1359 | 0 | int page_num = page_id; |
1360 | 0 | int block_num = 0; |
1361 | 0 | int par_num = 0; |
1362 | 0 | int line_num = 0; |
1363 | 0 | int word_num = 0; |
1364 | |
|
1365 | 0 | std::string tsv_str; |
1366 | 0 | tsv_str += "1\t" + std::to_string(page_num); // level 1 - page |
1367 | 0 | tsv_str += "\t" + std::to_string(block_num); |
1368 | 0 | tsv_str += "\t" + std::to_string(par_num); |
1369 | 0 | tsv_str += "\t" + std::to_string(line_num); |
1370 | 0 | tsv_str += "\t" + std::to_string(word_num); |
1371 | 0 | tsv_str += "\t" + std::to_string(rect_left_); |
1372 | 0 | tsv_str += "\t" + std::to_string(rect_top_); |
1373 | 0 | tsv_str += "\t" + std::to_string(rect_width_); |
1374 | 0 | tsv_str += "\t" + std::to_string(rect_height_); |
1375 | 0 | tsv_str += "\t-1\t\n"; |
1376 | |
|
1377 | 0 | const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator()); |
1378 | 0 | while (!res_it->Empty(RIL_BLOCK)) { |
1379 | 0 | if (res_it->Empty(RIL_WORD)) { |
1380 | 0 | res_it->Next(RIL_WORD); |
1381 | 0 | continue; |
1382 | 0 | } |
1383 | | |
1384 | | // Add rows for any new block/paragraph/textline. |
1385 | 0 | if (res_it->IsAtBeginningOf(RIL_BLOCK)) { |
1386 | 0 | block_num++; |
1387 | 0 | par_num = 0; |
1388 | 0 | line_num = 0; |
1389 | 0 | word_num = 0; |
1390 | 0 | tsv_str += "2\t" + std::to_string(page_num); // level 2 - block |
1391 | 0 | tsv_str += "\t" + std::to_string(block_num); |
1392 | 0 | tsv_str += "\t" + std::to_string(par_num); |
1393 | 0 | tsv_str += "\t" + std::to_string(line_num); |
1394 | 0 | tsv_str += "\t" + std::to_string(word_num); |
1395 | 0 | AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str); |
1396 | 0 | tsv_str += "\t-1\t\n"; // end of row for block |
1397 | 0 | } |
1398 | 0 | if (res_it->IsAtBeginningOf(RIL_PARA)) { |
1399 | 0 | par_num++; |
1400 | 0 | line_num = 0; |
1401 | 0 | word_num = 0; |
1402 | 0 | tsv_str += "3\t" + std::to_string(page_num); // level 3 - paragraph |
1403 | 0 | tsv_str += "\t" + std::to_string(block_num); |
1404 | 0 | tsv_str += "\t" + std::to_string(par_num); |
1405 | 0 | tsv_str += "\t" + std::to_string(line_num); |
1406 | 0 | tsv_str += "\t" + std::to_string(word_num); |
1407 | 0 | AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str); |
1408 | 0 | tsv_str += "\t-1\t\n"; // end of row for para |
1409 | 0 | } |
1410 | 0 | if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { |
1411 | 0 | line_num++; |
1412 | 0 | word_num = 0; |
1413 | 0 | tsv_str += "4\t" + std::to_string(page_num); // level 4 - line |
1414 | 0 | tsv_str += "\t" + std::to_string(block_num); |
1415 | 0 | tsv_str += "\t" + std::to_string(par_num); |
1416 | 0 | tsv_str += "\t" + std::to_string(line_num); |
1417 | 0 | tsv_str += "\t" + std::to_string(word_num); |
1418 | 0 | AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str); |
1419 | 0 | tsv_str += "\t-1\t\n"; // end of row for line |
1420 | 0 | } |
1421 | | |
1422 | | // Now, process the word... |
1423 | 0 | int left, top, right, bottom; |
1424 | 0 | res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); |
1425 | 0 | word_num++; |
1426 | 0 | tsv_str += "5\t" + std::to_string(page_num); // level 5 - word |
1427 | 0 | tsv_str += "\t" + std::to_string(block_num); |
1428 | 0 | tsv_str += "\t" + std::to_string(par_num); |
1429 | 0 | tsv_str += "\t" + std::to_string(line_num); |
1430 | 0 | tsv_str += "\t" + std::to_string(word_num); |
1431 | 0 | tsv_str += "\t" + std::to_string(left); |
1432 | 0 | tsv_str += "\t" + std::to_string(top); |
1433 | 0 | tsv_str += "\t" + std::to_string(right - left); |
1434 | 0 | tsv_str += "\t" + std::to_string(bottom - top); |
1435 | 0 | tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD)); |
1436 | 0 | tsv_str += "\t"; |
1437 | |
|
1438 | | #if !defined(NDEBUG) |
1439 | | // Increment counts if at end of block/paragraph/textline. |
1440 | | if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) { |
1441 | | lcnt++; |
1442 | | } |
1443 | | if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) { |
1444 | | pcnt++; |
1445 | | } |
1446 | | if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) { |
1447 | | bcnt++; |
1448 | | } |
1449 | | #endif |
1450 | |
|
1451 | 0 | do { |
1452 | 0 | tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get(); |
1453 | 0 | res_it->Next(RIL_SYMBOL); |
1454 | 0 | } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); |
1455 | 0 | tsv_str += "\n"; // end of row |
1456 | | #if !defined(NDEBUG) |
1457 | | wcnt++; |
1458 | | #endif |
1459 | 0 | } |
1460 | |
|
1461 | 0 | return copy_string(tsv_str); |
1462 | 0 | } |
1463 | | |
1464 | | /** The 5 numbers output for each box (the usual 4 and a page number.) */ |
1465 | | const int kNumbersPerBlob = 5; |
1466 | | /** |
1467 | | * The number of bytes taken by each number. Since we use int16_t for ICOORD, |
1468 | | * assume only 5 digits max. |
1469 | | */ |
1470 | | const int kBytesPerNumber = 5; |
1471 | | /** |
1472 | | * Multiplier for max expected textlength assumes (kBytesPerNumber + space) |
1473 | | * * kNumbersPerBlob plus the newline. Add to this the |
1474 | | * original UTF8 characters, and one kMaxBytesPerLine for safety. |
1475 | | */ |
1476 | | const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1; |
1477 | | /** Max bytes in the decimal representation of int64_t. */ |
1478 | | const int kBytesPer64BitNumber = 20; |
1479 | | /** |
1480 | | * A maximal single box could occupy kNumbersPerBlob numbers at |
1481 | | * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a |
1482 | | * space plus the newline and the maximum length of a UNICHAR. |
1483 | | * Test against this on each iteration for safety. |
1484 | | */ |
1485 | | const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + UNICHAR_LEN; |
1486 | | |
1487 | | /** |
1488 | | * The recognized text is returned as a char* which is coded |
1489 | | * as a UTF8 box file. |
1490 | | * page_number is a 0-base page index that will appear in the box file. |
1491 | | * Returned string must be freed with the delete [] operator. |
1492 | | */ |
1493 | 0 | char *TessBaseAPI::GetBoxText(int page_number) { |
1494 | 0 | if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { |
1495 | 0 | return nullptr; |
1496 | 0 | } |
1497 | 0 | int blob_count; |
1498 | 0 | int utf8_length = TextLength(&blob_count); |
1499 | 0 | int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine; |
1500 | 0 | char *result = new char[total_length]; |
1501 | 0 | result[0] = '\0'; |
1502 | 0 | int output_length = 0; |
1503 | 0 | LTRResultIterator *it = GetLTRIterator(); |
1504 | 0 | do { |
1505 | 0 | int left, top, right, bottom; |
1506 | 0 | if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) { |
1507 | 0 | const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL)); |
1508 | | // Tesseract uses space for recognition failure. Fix to a reject |
1509 | | // character, kTesseractReject so we don't create illegal box files. |
1510 | 0 | for (int i = 0; text[i] != '\0'; ++i) { |
1511 | 0 | if (text[i] == ' ') { |
1512 | 0 | text[i] = kTesseractReject; |
1513 | 0 | } |
1514 | 0 | } |
1515 | 0 | snprintf(result + output_length, total_length - output_length, "%s %d %d %d %d %d\n", |
1516 | 0 | text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number); |
1517 | 0 | output_length += strlen(result + output_length); |
1518 | | // Just in case... |
1519 | 0 | if (output_length + kMaxBytesPerLine > total_length) { |
1520 | 0 | break; |
1521 | 0 | } |
1522 | 0 | } |
1523 | 0 | } while (it->Next(RIL_SYMBOL)); |
1524 | 0 | delete it; |
1525 | 0 | return result; |
1526 | 0 | } |
1527 | | |
/**
 * Conversion table for non-latin characters.
 * Each entry of kUniChs is replaced by the entry of kLatinChs at the same
 * index; both tables are terminated by 0.
 * TODO(rays) incorporate this translation into unicharset.
 */
constexpr int kUniChs[] = {
    0x20ac, // euro sign
    0x201c, // left double quotation mark
    0x201d, // right double quotation mark
    0x2018, // left single quotation mark
    0x2019, // right single quotation mark
    0x2022, // bullet
    0x2014, // em dash
    0,      // terminator
};
/** Latin chars corresponding, index by index, to the unicode chars above. */
constexpr int kLatinChs[] = {
    0x00a2, // cent sign
    0x0022, // quotation mark
    0x0022, // quotation mark
    0x0027, // apostrophe
    0x0027, // apostrophe
    0x00b7, // middle dot
    0x002d, // hyphen-minus
    0,      // terminator
};
1536 | | |
1537 | | /** |
1538 | | * The recognized text is returned as a char* which is coded |
1539 | | * as UNLV format Latin-1 with specific reject and suspect codes. |
1540 | | * Returned string must be freed with the delete [] operator. |
1541 | | */ |
1542 | 0 | char *TessBaseAPI::GetUNLVText() { |
1543 | 0 | if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { |
1544 | 0 | return nullptr; |
1545 | 0 | } |
1546 | 0 | bool tilde_crunch_written = false; |
1547 | 0 | bool last_char_was_newline = true; |
1548 | 0 | bool last_char_was_tilde = false; |
1549 | |
|
1550 | 0 | int total_length = TextLength(nullptr); |
1551 | 0 | PAGE_RES_IT page_res_it(page_res_); |
1552 | 0 | char *result = new char[total_length]; |
1553 | 0 | char *ptr = result; |
1554 | 0 | for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { |
1555 | 0 | WERD_RES *word = page_res_it.word(); |
1556 | | // Process the current word. |
1557 | 0 | if (word->unlv_crunch_mode != CR_NONE) { |
1558 | 0 | if (word->unlv_crunch_mode != CR_DELETE && |
1559 | 0 | (!tilde_crunch_written || |
1560 | 0 | (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 && |
1561 | 0 | !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) { |
1562 | 0 | if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) && |
1563 | 0 | !word->word->flag(W_FUZZY_SP)) { |
1564 | | /* Write a space to separate from preceding good text */ |
1565 | 0 | *ptr++ = ' '; |
1566 | 0 | last_char_was_tilde = false; |
1567 | 0 | } |
1568 | 0 | if (!last_char_was_tilde) { |
1569 | | // Write a reject char. |
1570 | 0 | last_char_was_tilde = true; |
1571 | 0 | *ptr++ = kUNLVReject; |
1572 | 0 | tilde_crunch_written = true; |
1573 | 0 | last_char_was_newline = false; |
1574 | 0 | } |
1575 | 0 | } |
1576 | 0 | } else { |
1577 | | // NORMAL PROCESSING of non tilde crunched words. |
1578 | 0 | tilde_crunch_written = false; |
1579 | 0 | tesseract_->set_unlv_suspects(word); |
1580 | 0 | const char *wordstr = word->best_choice->unichar_string().c_str(); |
1581 | 0 | const auto &lengths = word->best_choice->unichar_lengths(); |
1582 | 0 | int length = lengths.length(); |
1583 | 0 | int i = 0; |
1584 | 0 | int offset = 0; |
1585 | |
|
1586 | 0 | if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') { |
1587 | | // Prevent adjacent tilde across words - we know that adjacent tildes |
1588 | | // within words have been removed. |
1589 | | // Skip the first character. |
1590 | 0 | offset = lengths[i++]; |
1591 | 0 | } |
1592 | 0 | if (i < length && wordstr[offset] != 0) { |
1593 | 0 | if (!last_char_was_newline) { |
1594 | 0 | *ptr++ = ' '; |
1595 | 0 | } else { |
1596 | 0 | last_char_was_newline = false; |
1597 | 0 | } |
1598 | 0 | for (; i < length; offset += lengths[i++]) { |
1599 | 0 | if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) { |
1600 | 0 | *ptr++ = kUNLVReject; |
1601 | 0 | last_char_was_tilde = true; |
1602 | 0 | } else { |
1603 | 0 | if (word->reject_map[i].rejected()) { |
1604 | 0 | *ptr++ = kUNLVSuspect; |
1605 | 0 | } |
1606 | 0 | UNICHAR ch(wordstr + offset, lengths[i]); |
1607 | 0 | int uni_ch = ch.first_uni(); |
1608 | 0 | for (int j = 0; kUniChs[j] != 0; ++j) { |
1609 | 0 | if (kUniChs[j] == uni_ch) { |
1610 | 0 | uni_ch = kLatinChs[j]; |
1611 | 0 | break; |
1612 | 0 | } |
1613 | 0 | } |
1614 | 0 | if (uni_ch <= 0xff) { |
1615 | 0 | *ptr++ = static_cast<char>(uni_ch); |
1616 | 0 | last_char_was_tilde = false; |
1617 | 0 | } else { |
1618 | 0 | *ptr++ = kUNLVReject; |
1619 | 0 | last_char_was_tilde = true; |
1620 | 0 | } |
1621 | 0 | } |
1622 | 0 | } |
1623 | 0 | } |
1624 | 0 | } |
1625 | 0 | if (word->word->flag(W_EOL) && !last_char_was_newline) { |
1626 | | /* Add a new line output */ |
1627 | 0 | *ptr++ = '\n'; |
1628 | 0 | tilde_crunch_written = false; |
1629 | 0 | last_char_was_newline = true; |
1630 | 0 | last_char_was_tilde = false; |
1631 | 0 | } |
1632 | 0 | } |
1633 | 0 | *ptr++ = '\n'; |
1634 | 0 | *ptr = '\0'; |
1635 | 0 | return result; |
1636 | 0 | } |
1637 | | |
1638 | | #ifndef DISABLED_LEGACY_ENGINE |
1639 | | |
1640 | | /** |
1641 | | * Detect the orientation of the input image and apparent script (alphabet). |
1642 | | * orient_deg is the detected clockwise rotation of the input image in degrees |
1643 | | * (0, 90, 180, 270) |
1644 | | * orient_conf is the confidence (15.0 is reasonably confident) |
1645 | | * script_name is an ASCII string, the name of the script, e.g. "Latin" |
1646 | | * script_conf is confidence level in the script |
1647 | | * Returns true on success and writes values to each parameter as an output |
1648 | | */ |
1649 | | bool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf, |
1650 | 0 | const char **script_name, float *script_conf) { |
1651 | 0 | OSResults osr; |
1652 | |
|
1653 | 0 | bool osd = DetectOS(&osr); |
1654 | 0 | if (!osd) { |
1655 | 0 | return false; |
1656 | 0 | } |
1657 | | |
1658 | 0 | int orient_id = osr.best_result.orientation_id; |
1659 | 0 | int script_id = osr.get_best_script(orient_id); |
1660 | 0 | if (orient_conf) { |
1661 | 0 | *orient_conf = osr.best_result.oconfidence; |
1662 | 0 | } |
1663 | 0 | if (orient_deg) { |
1664 | 0 | *orient_deg = orient_id * 90; // convert quadrant to degrees |
1665 | 0 | } |
1666 | |
|
1667 | 0 | if (script_name) { |
1668 | 0 | const char *script = osr.unicharset->get_script_from_script_id(script_id); |
1669 | |
|
1670 | 0 | *script_name = script; |
1671 | 0 | } |
1672 | |
|
1673 | 0 | if (script_conf) { |
1674 | 0 | *script_conf = osr.best_result.sconfidence; |
1675 | 0 | } |
1676 | |
|
1677 | 0 | return true; |
1678 | 0 | } |
1679 | | |
1680 | | /** |
1681 | | * The recognized text is returned as a char* which is coded |
1682 | | * as UTF8 and must be freed with the delete [] operator. |
1683 | | * page_number is a 0-based page index that will appear in the osd file. |
1684 | | */ |
1685 | 0 | char *TessBaseAPI::GetOsdText(int page_number) { |
1686 | 0 | int orient_deg; |
1687 | 0 | float orient_conf; |
1688 | 0 | const char *script_name; |
1689 | 0 | float script_conf; |
1690 | |
|
1691 | 0 | if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf)) { |
1692 | 0 | return nullptr; |
1693 | 0 | } |
1694 | | |
1695 | | // clockwise rotation needed to make the page upright |
1696 | 0 | int rotate = OrientationIdToValue(orient_deg / 90); |
1697 | |
|
1698 | 0 | std::stringstream stream; |
1699 | | // Use "C" locale (needed for float values orient_conf and script_conf). |
1700 | 0 | stream.imbue(std::locale::classic()); |
1701 | | // Use fixed notation with 2 digits after the decimal point for float values. |
1702 | 0 | stream.precision(2); |
1703 | 0 | stream << std::fixed << "Page number: " << page_number << "\n" |
1704 | 0 | << "Orientation in degrees: " << orient_deg << "\n" |
1705 | 0 | << "Rotate: " << rotate << "\n" |
1706 | 0 | << "Orientation confidence: " << orient_conf << "\n" |
1707 | 0 | << "Script: " << script_name << "\n" |
1708 | 0 | << "Script confidence: " << script_conf << "\n"; |
1709 | 0 | return copy_string(stream.str()); |
1710 | 0 | } |
1711 | | |
1712 | | #endif // ndef DISABLED_LEGACY_ENGINE |
1713 | | |
1714 | | /** Returns the average word confidence for Tesseract page result. */ |
1715 | 0 | int TessBaseAPI::MeanTextConf() { |
1716 | 0 | int *conf = AllWordConfidences(); |
1717 | 0 | if (!conf) { |
1718 | 0 | return 0; |
1719 | 0 | } |
1720 | 0 | int sum = 0; |
1721 | 0 | int *pt = conf; |
1722 | 0 | while (*pt >= 0) { |
1723 | 0 | sum += *pt++; |
1724 | 0 | } |
1725 | 0 | if (pt != conf) { |
1726 | 0 | sum /= pt - conf; |
1727 | 0 | } |
1728 | 0 | delete[] conf; |
1729 | 0 | return sum; |
1730 | 0 | } |
1731 | | |
1732 | | /** Returns an array of all word confidences, terminated by -1. */ |
1733 | 0 | int *TessBaseAPI::AllWordConfidences() { |
1734 | 0 | if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) { |
1735 | 0 | return nullptr; |
1736 | 0 | } |
1737 | 0 | int n_word = 0; |
1738 | 0 | PAGE_RES_IT res_it(page_res_); |
1739 | 0 | for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) { |
1740 | 0 | n_word++; |
1741 | 0 | } |
1742 | |
|
1743 | 0 | int *conf = new int[n_word + 1]; |
1744 | 0 | n_word = 0; |
1745 | 0 | for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) { |
1746 | 0 | WERD_RES *word = res_it.word(); |
1747 | 0 | WERD_CHOICE *choice = word->best_choice; |
1748 | 0 | int w_conf = static_cast<int>(100 + 5 * choice->certainty()); |
1749 | | // This is the eq for converting Tesseract confidence to 1..100 |
1750 | 0 | if (w_conf < 0) { |
1751 | 0 | w_conf = 0; |
1752 | 0 | } |
1753 | 0 | if (w_conf > 100) { |
1754 | 0 | w_conf = 100; |
1755 | 0 | } |
1756 | 0 | conf[n_word++] = w_conf; |
1757 | 0 | } |
1758 | 0 | conf[n_word] = -1; |
1759 | 0 | return conf; |
1760 | 0 | } |
1761 | | |
1762 | | #ifndef DISABLED_LEGACY_ENGINE |
1763 | | /** |
1764 | | * Applies the given word to the adaptive classifier if possible. |
1765 | | * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can |
1766 | | * tell the boundaries of the graphemes. |
1767 | | * Assumes that SetImage/SetRectangle have been used to set the image |
1768 | | * to the given word. The mode arg should be PSM_SINGLE_WORD or |
1769 | | * PSM_CIRCLE_WORD, as that will be used to control layout analysis. |
1770 | | * The currently set PageSegMode is preserved. |
1771 | | * Returns false if adaption was not possible for some reason. |
1772 | | */ |
1773 | 0 | bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) { |
1774 | 0 | int debug = 0; |
1775 | 0 | GetIntVariable("applybox_debug", &debug); |
1776 | 0 | bool success = true; |
1777 | 0 | PageSegMode current_psm = GetPageSegMode(); |
1778 | 0 | SetPageSegMode(mode); |
1779 | 0 | SetVariable("classify_enable_learning", "0"); |
1780 | 0 | const std::unique_ptr<const char[]> text(GetUTF8Text()); |
1781 | 0 | if (debug) { |
1782 | 0 | tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr); |
1783 | 0 | } |
1784 | 0 | if (text != nullptr) { |
1785 | 0 | PAGE_RES_IT it(page_res_); |
1786 | 0 | WERD_RES *word_res = it.word(); |
1787 | 0 | if (word_res != nullptr) { |
1788 | 0 | word_res->word->set_text(wordstr); |
1789 | | // Check to see if text matches wordstr. |
1790 | 0 | int w = 0; |
1791 | 0 | int t; |
1792 | 0 | for (t = 0; text[t] != '\0'; ++t) { |
1793 | 0 | if (text[t] == '\n' || text[t] == ' ') { |
1794 | 0 | continue; |
1795 | 0 | } |
1796 | 0 | while (wordstr[w] == ' ') { |
1797 | 0 | ++w; |
1798 | 0 | } |
1799 | 0 | if (text[t] != wordstr[w]) { |
1800 | 0 | break; |
1801 | 0 | } |
1802 | 0 | ++w; |
1803 | 0 | } |
1804 | 0 | if (text[t] != '\0' || wordstr[w] != '\0') { |
1805 | | // No match. |
1806 | 0 | delete page_res_; |
1807 | 0 | std::vector<TBOX> boxes; |
1808 | 0 | page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_); |
1809 | 0 | tesseract_->ReSegmentByClassification(page_res_); |
1810 | 0 | tesseract_->TidyUp(page_res_); |
1811 | 0 | PAGE_RES_IT pr_it(page_res_); |
1812 | 0 | if (pr_it.word() == nullptr) { |
1813 | 0 | success = false; |
1814 | 0 | } else { |
1815 | 0 | word_res = pr_it.word(); |
1816 | 0 | } |
1817 | 0 | } else { |
1818 | 0 | word_res->BestChoiceToCorrectText(); |
1819 | 0 | } |
1820 | 0 | if (success) { |
1821 | 0 | tesseract_->EnableLearning = true; |
1822 | 0 | tesseract_->LearnWord(nullptr, word_res); |
1823 | 0 | } |
1824 | 0 | } else { |
1825 | 0 | success = false; |
1826 | 0 | } |
1827 | 0 | } else { |
1828 | 0 | success = false; |
1829 | 0 | } |
1830 | 0 | SetPageSegMode(current_psm); |
1831 | 0 | return success; |
1832 | 0 | } |
1833 | | #endif // ndef DISABLED_LEGACY_ENGINE |
1834 | | |
1835 | | /** |
1836 | | * Free up recognition results and any stored image data, without actually |
1837 | | * freeing any recognition data that would be time-consuming to reload. |
1838 | | * Afterwards, you must call SetImage or TesseractRect before doing |
1839 | | * any Recognize or Get* operation. |
1840 | | */ |
1841 | 0 | void TessBaseAPI::Clear() { |
1842 | 0 | if (thresholder_ != nullptr) { |
1843 | 0 | thresholder_->Clear(); |
1844 | 0 | } |
1845 | 0 | ClearResults(); |
1846 | 0 | if (tesseract_ != nullptr) { |
1847 | 0 | SetInputImage(nullptr); |
1848 | 0 | } |
1849 | 0 | } |
1850 | | |
1851 | | /** |
1852 | | * Close down tesseract and free up all memory. End() is equivalent to |
1853 | | * destructing and reconstructing your TessBaseAPI. |
1854 | | * Once End() has been used, none of the other API functions may be used |
1855 | | * other than Init and anything declared above it in the class definition. |
1856 | | */ |
1857 | 0 | void TessBaseAPI::End() { |
1858 | 0 | Clear(); |
1859 | 0 | delete thresholder_; |
1860 | 0 | thresholder_ = nullptr; |
1861 | 0 | delete page_res_; |
1862 | 0 | page_res_ = nullptr; |
1863 | 0 | delete block_list_; |
1864 | 0 | block_list_ = nullptr; |
1865 | 0 | if (paragraph_models_ != nullptr) { |
1866 | 0 | for (auto model : *paragraph_models_) { |
1867 | 0 | delete model; |
1868 | 0 | } |
1869 | 0 | delete paragraph_models_; |
1870 | 0 | paragraph_models_ = nullptr; |
1871 | 0 | } |
1872 | 0 | #ifndef DISABLED_LEGACY_ENGINE |
1873 | 0 | if (osd_tesseract_ == tesseract_) { |
1874 | 0 | osd_tesseract_ = nullptr; |
1875 | 0 | } |
1876 | 0 | delete osd_tesseract_; |
1877 | 0 | osd_tesseract_ = nullptr; |
1878 | 0 | delete equ_detect_; |
1879 | 0 | equ_detect_ = nullptr; |
1880 | 0 | #endif // ndef DISABLED_LEGACY_ENGINE |
1881 | 0 | delete tesseract_; |
1882 | 0 | tesseract_ = nullptr; |
1883 | 0 | input_file_.clear(); |
1884 | 0 | output_file_.clear(); |
1885 | 0 | datapath_.clear(); |
1886 | 0 | language_.clear(); |
1887 | 0 | } |
1888 | | |
1889 | | // Clear any library-level memory caches. |
1890 | | // There are a variety of expensive-to-load constant data structures (mostly |
1891 | | // language dictionaries) that are cached globally -- surviving the Init() |
1892 | | // and End() of individual TessBaseAPI's. This function allows the clearing |
1893 | | // of these caches. |
1894 | 0 | void TessBaseAPI::ClearPersistentCache() { |
1895 | 0 | Dict::GlobalDawgCache()->DeleteUnusedDawgs(); |
1896 | 0 | } |
1897 | | |
1898 | | /** |
1899 | | * Check whether a word is valid according to Tesseract's language model |
1900 | | * returns 0 if the word is invalid, non-zero if valid |
1901 | | */ |
1902 | 0 | int TessBaseAPI::IsValidWord(const char *word) const { |
1903 | 0 | return tesseract_->getDict().valid_word(word); |
1904 | 0 | } |
1905 | | // Returns true if utf8_character is defined in the UniCharset. |
1906 | 0 | bool TessBaseAPI::IsValidCharacter(const char *utf8_character) const { |
1907 | 0 | return tesseract_->unicharset.contains_unichar(utf8_character); |
1908 | 0 | } |
1909 | | |
1910 | | // TODO(rays) Obsolete this function and replace with a more aptly named |
1911 | | // function that returns image coordinates rather than tesseract coordinates. |
1912 | 0 | bool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) { |
1913 | 0 | const std::unique_ptr<const PageIterator> it(AnalyseLayout()); |
1914 | 0 | if (it == nullptr) { |
1915 | 0 | return false; |
1916 | 0 | } |
1917 | 0 | int x1, x2, y1, y2; |
1918 | 0 | it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2); |
1919 | | // Calculate offset and slope (NOTE: Kind of ugly) |
1920 | 0 | if (x2 <= x1) { |
1921 | 0 | x2 = x1 + 1; |
1922 | 0 | } |
1923 | | // Convert the point pair to slope/offset of the baseline (in image coords.) |
1924 | 0 | *out_slope = static_cast<float>(y2 - y1) / (x2 - x1); |
1925 | 0 | *out_offset = static_cast<int>(y1 - *out_slope * x1); |
1926 | | // Get the y-coord of the baseline at the left and right edges of the |
1927 | | // textline's bounding box. |
1928 | 0 | int left, top, right, bottom; |
1929 | 0 | if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) { |
1930 | 0 | return false; |
1931 | 0 | } |
1932 | 0 | int left_y = IntCastRounded(*out_slope * left + *out_offset); |
1933 | 0 | int right_y = IntCastRounded(*out_slope * right + *out_offset); |
1934 | | // Shift the baseline down so it passes through the nearest bottom-corner |
1935 | | // of the textline's bounding box. This is the difference between the y |
1936 | | // at the lowest (max) edge of the box and the actual box bottom. |
1937 | 0 | *out_offset += bottom - std::max(left_y, right_y); |
1938 | | // Switch back to bottom-up tesseract coordinates. Requires negation of |
1939 | | // the slope and height - offset for the offset. |
1940 | 0 | *out_slope = -*out_slope; |
1941 | 0 | *out_offset = rect_height_ - *out_offset; |
1942 | |
|
1943 | 0 | return true; |
1944 | 0 | } |
1945 | | |
1946 | | /** Sets Dict::letter_is_okay_ function to point to the given function. */ |
1947 | 0 | void TessBaseAPI::SetDictFunc(DictFunc f) { |
1948 | 0 | if (tesseract_ != nullptr) { |
1949 | 0 | tesseract_->getDict().letter_is_okay_ = f; |
1950 | 0 | } |
1951 | 0 | } |
1952 | | |
1953 | | /** |
1954 | | * Sets Dict::probability_in_context_ function to point to the given |
1955 | | * function. |
1956 | | * |
1957 | | * @param f A single function that returns the probability of the current |
1958 | | * "character" (in general a utf-8 string), given the context of a previous |
1959 | | * utf-8 string. |
1960 | | */ |
1961 | 0 | void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) { |
1962 | 0 | if (tesseract_ != nullptr) { |
1963 | 0 | tesseract_->getDict().probability_in_context_ = f; |
1964 | | // Set it for the sublangs too. |
1965 | 0 | int num_subs = tesseract_->num_sub_langs(); |
1966 | 0 | for (int i = 0; i < num_subs; ++i) { |
1967 | 0 | tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f; |
1968 | 0 | } |
1969 | 0 | } |
1970 | 0 | } |
1971 | | |
1972 | | /** Common code for setting the image. */ |
1973 | 7.74k | bool TessBaseAPI::InternalSetImage() { |
1974 | 7.74k | if (tesseract_ == nullptr) { |
1975 | 0 | tprintf("Please call Init before attempting to set an image.\n"); |
1976 | 0 | return false; |
1977 | 0 | } |
1978 | 7.74k | if (thresholder_ == nullptr) { |
1979 | 1 | thresholder_ = new ImageThresholder; |
1980 | 1 | } |
1981 | 7.74k | ClearResults(); |
1982 | 7.74k | return true; |
1983 | 7.74k | } |
1984 | | |
1985 | | /** |
1986 | | * Run the thresholder to make the thresholded image, returned in pix, |
1987 | | * which must not be nullptr. *pix must be initialized to nullptr, or point |
1988 | | * to an existing pixDestroyable Pix. |
1989 | | * The usual argument to Threshold is Tesseract::mutable_pix_binary(). |
1990 | | */ |
1991 | 7.74k | bool TessBaseAPI::Threshold(Pix **pix) { |
1992 | 7.74k | ASSERT_HOST(pix != nullptr); |
1993 | 7.74k | if (*pix != nullptr) { |
1994 | 0 | pixDestroy(pix); |
1995 | 0 | } |
1996 | | // Zero resolution messes up the algorithms, so make sure it is credible. |
1997 | 7.74k | int user_dpi = 0; |
1998 | 7.74k | GetIntVariable("user_defined_dpi", &user_dpi); |
1999 | 7.74k | int y_res = thresholder_->GetScaledYResolution(); |
2000 | 7.74k | if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) { |
2001 | 0 | tprintf( |
2002 | 0 | "Warning: User defined image dpi is outside of expected range " |
2003 | 0 | "(%d - %d)!\n", |
2004 | 0 | kMinCredibleResolution, kMaxCredibleResolution); |
2005 | 0 | } |
2006 | | // Always use user defined dpi |
2007 | 7.74k | if (user_dpi) { |
2008 | 0 | thresholder_->SetSourceYResolution(user_dpi); |
2009 | 7.74k | } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) { |
2010 | 7.74k | if (y_res != 0) { |
2011 | | // Show warning only if a resolution was given. |
2012 | 0 | tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n", |
2013 | 0 | y_res, kMinCredibleResolution); |
2014 | 0 | } |
2015 | 7.74k | thresholder_->SetSourceYResolution(kMinCredibleResolution); |
2016 | 7.74k | } |
2017 | | |
2018 | 7.74k | auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method)); |
2019 | | |
2020 | 7.74k | if (thresholding_method == ThresholdMethod::Otsu) { |
2021 | 7.74k | Image pix_binary(*pix); |
2022 | 7.74k | if (!thresholder_->ThresholdToPix(&pix_binary)) { |
2023 | 0 | return false; |
2024 | 0 | } |
2025 | 7.74k | *pix = pix_binary; |
2026 | | |
2027 | 7.74k | if (!thresholder_->IsBinary()) { |
2028 | 0 | tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds()); |
2029 | 0 | tesseract_->set_pix_grey(thresholder_->GetPixRectGrey()); |
2030 | 7.74k | } else { |
2031 | 7.74k | tesseract_->set_pix_thresholds(nullptr); |
2032 | 7.74k | tesseract_->set_pix_grey(nullptr); |
2033 | 7.74k | } |
2034 | 7.74k | } else { |
2035 | 0 | auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method); |
2036 | |
|
2037 | 0 | if (!ok) { |
2038 | 0 | return false; |
2039 | 0 | } |
2040 | 0 | *pix = pix_binary; |
2041 | |
|
2042 | 0 | tesseract_->set_pix_thresholds(pix_thresholds); |
2043 | 0 | tesseract_->set_pix_grey(pix_grey); |
2044 | 0 | } |
2045 | | |
2046 | 7.74k | thresholder_->GetImageSizes(&rect_left_, &rect_top_, &rect_width_, &rect_height_, &image_width_, |
2047 | 7.74k | &image_height_); |
2048 | | |
2049 | | // Set the internal resolution that is used for layout parameters from the |
2050 | | // estimated resolution, rather than the image resolution, which may be |
2051 | | // fabricated, but we will use the image resolution, if there is one, to |
2052 | | // report output point sizes. |
2053 | 7.74k | int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(), |
2054 | 7.74k | kMinCredibleResolution, kMaxCredibleResolution); |
2055 | 7.74k | if (estimated_res != thresholder_->GetScaledEstimatedResolution()) { |
2056 | 0 | tprintf( |
2057 | 0 | "Estimated internal resolution %d out of range! " |
2058 | 0 | "Corrected to %d.\n", |
2059 | 0 | thresholder_->GetScaledEstimatedResolution(), estimated_res); |
2060 | 0 | } |
2061 | 7.74k | tesseract_->set_source_resolution(estimated_res); |
2062 | 7.74k | return true; |
2063 | 7.74k | } |
2064 | | |
2065 | | /** Find lines from the image making the BLOCK_LIST. */ |
2066 | 7.74k | int TessBaseAPI::FindLines() { |
2067 | 7.74k | if (thresholder_ == nullptr || thresholder_->IsEmpty()) { |
2068 | 0 | tprintf("Please call SetImage before attempting recognition.\n"); |
2069 | 0 | return -1; |
2070 | 0 | } |
2071 | 7.74k | if (recognition_done_) { |
2072 | 0 | ClearResults(); |
2073 | 0 | } |
2074 | 7.74k | if (!block_list_->empty()) { |
2075 | 0 | return 0; |
2076 | 0 | } |
2077 | 7.74k | if (tesseract_ == nullptr) { |
2078 | 0 | tesseract_ = new Tesseract; |
2079 | 0 | #ifndef DISABLED_LEGACY_ENGINE |
2080 | 0 | tesseract_->InitAdaptiveClassifier(nullptr); |
2081 | 0 | #endif |
2082 | 0 | } |
2083 | 7.74k | if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) { |
2084 | 0 | return -1; |
2085 | 0 | } |
2086 | | |
2087 | 7.74k | tesseract_->PrepareForPageseg(); |
2088 | | |
2089 | 7.74k | #ifndef DISABLED_LEGACY_ENGINE |
2090 | 7.74k | if (tesseract_->textord_equation_detect) { |
2091 | 0 | if (equ_detect_ == nullptr && !datapath_.empty()) { |
2092 | 0 | equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr); |
2093 | 0 | } |
2094 | 0 | if (equ_detect_ == nullptr) { |
2095 | 0 | tprintf("Warning: Could not set equation detector\n"); |
2096 | 0 | } else { |
2097 | 0 | tesseract_->SetEquationDetect(equ_detect_); |
2098 | 0 | } |
2099 | 0 | } |
2100 | 7.74k | #endif // ndef DISABLED_LEGACY_ENGINE |
2101 | | |
2102 | 7.74k | Tesseract *osd_tess = osd_tesseract_; |
2103 | 7.74k | OSResults osr; |
2104 | 7.74k | #ifndef DISABLED_LEGACY_ENGINE |
2105 | 7.74k | if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) { |
2106 | 0 | if (strcmp(language_.c_str(), "osd") == 0) { |
2107 | 0 | osd_tess = tesseract_; |
2108 | 0 | } else { |
2109 | 0 | osd_tesseract_ = new Tesseract; |
2110 | 0 | TessdataManager mgr(reader_); |
2111 | 0 | if (datapath_.empty()) { |
2112 | 0 | tprintf( |
2113 | 0 | "Warning: Auto orientation and script detection requested," |
2114 | 0 | " but data path is undefined\n"); |
2115 | 0 | delete osd_tesseract_; |
2116 | 0 | osd_tesseract_ = nullptr; |
2117 | 0 | } else if (osd_tesseract_->init_tesseract(datapath_, "", "osd", OEM_TESSERACT_ONLY, |
2118 | 0 | nullptr, 0, nullptr, nullptr, false, &mgr) == 0) { |
2119 | 0 | osd_tess = osd_tesseract_; |
2120 | 0 | osd_tesseract_->set_source_resolution(thresholder_->GetSourceYResolution()); |
2121 | 0 | } else { |
2122 | 0 | tprintf( |
2123 | 0 | "Warning: Auto orientation and script detection requested," |
2124 | 0 | " but osd language failed to load\n"); |
2125 | 0 | delete osd_tesseract_; |
2126 | 0 | osd_tesseract_ = nullptr; |
2127 | 0 | } |
2128 | 0 | } |
2129 | 0 | } |
2130 | 7.74k | #endif // ndef DISABLED_LEGACY_ENGINE |
2131 | | |
2132 | 7.74k | if (tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) { |
2133 | 0 | return -1; |
2134 | 0 | } |
2135 | | |
2136 | | // If Devanagari is being recognized, we use different images for page seg |
2137 | | // and for OCR. |
2138 | 7.74k | tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr); |
2139 | 7.74k | return 0; |
2140 | 7.74k | } |
2141 | | |
2142 | | /** |
2143 | | * Return average gradient of lines on page. |
2144 | | */ |
2145 | 0 | float TessBaseAPI::GetGradient() { |
2146 | 0 | return tesseract_->gradient(); |
2147 | 0 | } |
2148 | | |
2149 | | /** Delete the pageres and clear the block list ready for a new page. */ |
2150 | 7.74k | void TessBaseAPI::ClearResults() { |
2151 | 7.74k | if (tesseract_ != nullptr) { |
2152 | 7.74k | tesseract_->Clear(); |
2153 | 7.74k | } |
2154 | 7.74k | delete page_res_; |
2155 | 7.74k | page_res_ = nullptr; |
2156 | 7.74k | recognition_done_ = false; |
2157 | 7.74k | if (block_list_ == nullptr) { |
2158 | 1 | block_list_ = new BLOCK_LIST; |
2159 | 7.74k | } else { |
2160 | 7.74k | block_list_->clear(); |
2161 | 7.74k | } |
2162 | 7.74k | if (paragraph_models_ != nullptr) { |
2163 | 6.88k | for (auto model : *paragraph_models_) { |
2164 | 1.39k | delete model; |
2165 | 1.39k | } |
2166 | 6.88k | delete paragraph_models_; |
2167 | 6.88k | paragraph_models_ = nullptr; |
2168 | 6.88k | } |
2169 | 7.74k | } |
2170 | | |
2171 | | /** |
2172 | | * Return the length of the output text string, as UTF8, assuming |
2173 | | * liberally two spacing marks after each word (as paragraphs end with two |
2174 | | * newlines), and assuming a single character reject marker for each rejected |
2175 | | * character. |
2176 | | * Also return the number of recognized blobs in blob_count. |
2177 | | */ |
2178 | 0 | int TessBaseAPI::TextLength(int *blob_count) const { |
2179 | 0 | if (tesseract_ == nullptr || page_res_ == nullptr) { |
2180 | 0 | return 0; |
2181 | 0 | } |
2182 | | |
2183 | 0 | PAGE_RES_IT page_res_it(page_res_); |
2184 | 0 | int total_length = 2; |
2185 | 0 | int total_blobs = 0; |
2186 | | // Iterate over the data structures to extract the recognition result. |
2187 | 0 | for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) { |
2188 | 0 | WERD_RES *word = page_res_it.word(); |
2189 | 0 | WERD_CHOICE *choice = word->best_choice; |
2190 | 0 | if (choice != nullptr) { |
2191 | 0 | total_blobs += choice->length() + 2; |
2192 | 0 | total_length += choice->unichar_string().length() + 2; |
2193 | 0 | for (int i = 0; i < word->reject_map.length(); ++i) { |
2194 | 0 | if (word->reject_map[i].rejected()) { |
2195 | 0 | ++total_length; |
2196 | 0 | } |
2197 | 0 | } |
2198 | 0 | } |
2199 | 0 | } |
2200 | 0 | if (blob_count != nullptr) { |
2201 | 0 | *blob_count = total_blobs; |
2202 | 0 | } |
2203 | 0 | return total_length; |
2204 | 0 | } |
2205 | | |
2206 | | #ifndef DISABLED_LEGACY_ENGINE |
2207 | | /** |
2208 | | * Estimates the Orientation And Script of the image. |
2209 | | * Returns true if the image was processed successfully. |
2210 | | */ |
2211 | 0 | bool TessBaseAPI::DetectOS(OSResults *osr) { |
2212 | 0 | if (tesseract_ == nullptr) { |
2213 | 0 | return false; |
2214 | 0 | } |
2215 | 0 | ClearResults(); |
2216 | 0 | if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) { |
2217 | 0 | return false; |
2218 | 0 | } |
2219 | | |
2220 | 0 | if (input_file_.empty()) { |
2221 | 0 | input_file_ = kInputFile; |
2222 | 0 | } |
2223 | 0 | return orientation_and_script_detection(input_file_.c_str(), osr, tesseract_) > 0; |
2224 | 0 | } |
2225 | | #endif // #ifndef DISABLED_LEGACY_ENGINE |
2226 | | |
2227 | 0 | void TessBaseAPI::set_min_orientation_margin(double margin) { |
2228 | 0 | tesseract_->min_orientation_margin.set_value(margin); |
2229 | 0 | } |
2230 | | |
2231 | | /** |
2232 | | * Return text orientation of each block as determined in an earlier page layout |
2233 | | * analysis operation. Orientation is returned as the number of ccw 90-degree |
2234 | | * rotations (in [0..3]) required to make the text in the block upright |
2235 | | * (readable). Note that this may not necessary be the block orientation |
2236 | | * preferred for recognition (such as the case of vertical CJK text). |
2237 | | * |
2238 | | * Also returns whether the text in the block is believed to have vertical |
2239 | | * writing direction (when in an upright page orientation). |
2240 | | * |
2241 | | * The returned array is of length equal to the number of text blocks, which may |
2242 | | * be less than the total number of blocks. The ordering is intended to be |
2243 | | * consistent with GetTextLines(). |
2244 | | */ |
2245 | 0 | void TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) { |
2246 | 0 | delete[] * block_orientation; |
2247 | 0 | *block_orientation = nullptr; |
2248 | 0 | delete[] * vertical_writing; |
2249 | 0 | *vertical_writing = nullptr; |
2250 | 0 | BLOCK_IT block_it(block_list_); |
2251 | |
|
2252 | 0 | block_it.move_to_first(); |
2253 | 0 | int num_blocks = 0; |
2254 | 0 | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
2255 | 0 | if (!block_it.data()->pdblk.poly_block()->IsText()) { |
2256 | 0 | continue; |
2257 | 0 | } |
2258 | 0 | ++num_blocks; |
2259 | 0 | } |
2260 | 0 | if (!num_blocks) { |
2261 | 0 | tprintf("WARNING: Found no blocks\n"); |
2262 | 0 | return; |
2263 | 0 | } |
2264 | 0 | *block_orientation = new int[num_blocks]; |
2265 | 0 | *vertical_writing = new bool[num_blocks]; |
2266 | 0 | block_it.move_to_first(); |
2267 | 0 | int i = 0; |
2268 | 0 | for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { |
2269 | 0 | if (!block_it.data()->pdblk.poly_block()->IsText()) { |
2270 | 0 | continue; |
2271 | 0 | } |
2272 | 0 | FCOORD re_rotation = block_it.data()->re_rotation(); |
2273 | 0 | float re_theta = re_rotation.angle(); |
2274 | 0 | FCOORD classify_rotation = block_it.data()->classify_rotation(); |
2275 | 0 | float classify_theta = classify_rotation.angle(); |
2276 | 0 | double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI; |
2277 | 0 | if (rot_theta < 0) { |
2278 | 0 | rot_theta += 4; |
2279 | 0 | } |
2280 | 0 | int num_rotations = static_cast<int>(rot_theta + 0.5); |
2281 | 0 | (*block_orientation)[i] = num_rotations; |
2282 | | // The classify_rotation is non-zero only if the text has vertical |
2283 | | // writing direction. |
2284 | 0 | (*vertical_writing)[i] = classify_rotation.y() != 0.0f; |
2285 | 0 | ++i; |
2286 | 0 | } |
2287 | 0 | } |
2288 | | |
2289 | 6.88k | void TessBaseAPI::DetectParagraphs(bool after_text_recognition) { |
2290 | 6.88k | int debug_level = 0; |
2291 | 6.88k | GetIntVariable("paragraph_debug_level", &debug_level); |
2292 | 6.88k | if (paragraph_models_ == nullptr) { |
2293 | 6.88k | paragraph_models_ = new std::vector<ParagraphModel *>; |
2294 | 6.88k | } |
2295 | 6.88k | MutableIterator *result_it = GetMutableIterator(); |
2296 | 6.88k | do { // Detect paragraphs for this block |
2297 | 6.88k | std::vector<ParagraphModel *> models; |
2298 | 6.88k | ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it, &models); |
2299 | 6.88k | paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end()); |
2300 | 6.88k | } while (result_it->Next(RIL_BLOCK)); |
2301 | 6.88k | delete result_it; |
2302 | 6.88k | } |
2303 | | |
2304 | | /** This method returns the string form of the specified unichar. */ |
2305 | 0 | const char *TessBaseAPI::GetUnichar(int unichar_id) const { |
2306 | 0 | return tesseract_->unicharset.id_to_unichar(unichar_id); |
2307 | 0 | } |
2308 | | |
2309 | | /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ |
2310 | 0 | const Dawg *TessBaseAPI::GetDawg(int i) const { |
2311 | 0 | if (tesseract_ == nullptr || i >= NumDawgs()) { |
2312 | 0 | return nullptr; |
2313 | 0 | } |
2314 | 0 | return tesseract_->getDict().GetDawg(i); |
2315 | 0 | } |
2316 | | |
2317 | | /** Return the number of dawgs loaded into tesseract_ object. */ |
2318 | 0 | int TessBaseAPI::NumDawgs() const { |
2319 | 0 | return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs(); |
2320 | 0 | } |
2321 | | |
/**
 * Escape a char string - replace <>&"' with HTML codes.
 *
 * Each of the five characters is replaced by its HTML entity (&lt; &gt;
 * &amp; &quot; &#39;); every other byte is copied through unchanged.
 *
 * @param text NUL-terminated input; a null pointer is treated as empty.
 * @return The escaped copy of text.
 */
std::string HOcrEscape(const char *text) {
  std::string ret;
  if (text == nullptr) {
    return ret;
  }
  for (const char *ptr = text; *ptr != '\0'; ++ptr) {
    switch (*ptr) {
      case '<':
        ret += "&lt;";
        break;
      case '>':
        ret += "&gt;";
        break;
      case '&':
        ret += "&amp;";
        break;
      case '"':
        ret += "&quot;";
        break;
      case '\'':
        ret += "&#39;";
        break;
      default:
        ret += *ptr;
        break;
    }
  }
  return ret;
}
2349 | | |
2350 | | } // namespace tesseract |