Coverage Report

Created: 2025-06-13 07:02

/src/tesseract/src/api/baseapi.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        baseapi.cpp
3
 * Description: Simple API for calling tesseract.
4
 * Author:      Ray Smith
5
 *
6
 * (C) Copyright 2006, Google Inc.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
#define _USE_MATH_DEFINES // for M_PI
20
21
// Include automatically generated configuration file if running autoconf.
22
#ifdef HAVE_CONFIG_H
23
#  include "config_auto.h"
24
#endif
25
26
#include "boxword.h"    // for BoxWord
27
#include "coutln.h"     // for C_OUTLINE_IT, C_OUTLINE_LIST
28
#include "dawg_cache.h" // for DawgCache
29
#include "dict.h"       // for Dict
30
#include "elst.h"       // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH
31
#include "environ.h"    // for l_uint8
32
#ifndef DISABLED_LEGACY_ENGINE
33
#include "equationdetect.h" // for EquationDetect, destructor of equ_detect_
34
#endif // ndef DISABLED_LEGACY_ENGINE
35
#include "errcode.h" // for ASSERT_HOST
36
#include "helpers.h" // for IntCastRounded, chomp_string, copy_string
37
#include "host.h"    // for MAX_PATH
38
#include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ...
39
#ifndef DISABLED_LEGACY_ENGINE
40
#  include "intfx.h" // for INT_FX_RESULT_STRUCT
41
#endif
42
#include "mutableiterator.h" // for MutableIterator
43
#include "normalis.h"        // for kBlnBaselineOffset, kBlnXHeight
44
#include "pageres.h"         // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE...
45
#include "paragraphs.h"      // for DetectParagraphs
46
#include "params.h"          // for BoolParam, IntParam, DoubleParam, Stri...
47
#include "pdblock.h"         // for PDBLK
48
#include "points.h"          // for FCOORD
49
#include "polyblk.h"         // for POLY_BLOCK
50
#include "rect.h"            // for TBOX
51
#include "stepblob.h"        // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
52
#include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix
53
#include "tesseractclass.h"  // for Tesseract
54
#include "tprintf.h"         // for tprintf
55
#include "werd.h"            // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP
56
#include "thresholder.h"     // for ImageThresholder
57
58
#include <tesseract/baseapi.h>
59
#include <tesseract/ocrclass.h>       // for ETEXT_DESC
60
#include <tesseract/osdetect.h>       // for OSResults, OSBestResult, OrientationId...
61
#include <tesseract/renderer.h>       // for TessResultRenderer
62
#include <tesseract/resultiterator.h> // for ResultIterator
63
64
#include <algorithm> // for std::sort
#include <cmath>    // for round, M_PI
#include <cstdint>  // for int32_t
#include <cstring>  // for strcmp, strcpy
#include <filesystem> // for std::filesystem
#include <fstream>  // for size_t
#include <iostream> // for std::cin
#include <locale>   // for std::locale::classic
#include <memory>   // for std::unique_ptr
#include <set>      // for std::pair
#include <sstream>  // for std::stringstream
#include <system_error> // for std::error_code
#include <vector>   // for std::vector
75
76
#include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box...
77
#ifdef HAVE_LIBCURL
78
#  include <curl/curl.h>
79
#endif
80
81
#ifdef __linux__
82
#  include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE
83
#endif
84
85
#if defined(_WIN32)
86
#  include <fcntl.h> // for _O_BINARY
87
#  include <io.h>    // for _setmode
88
#endif
89
90
namespace tesseract {
91
92
static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
93
static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
94
#ifdef HAVE_LIBCURL
95
static INT_VAR(curl_timeout, 0, "Timeout for curl in seconds");
96
static STRING_VAR(curl_cookiefile, "", "File with cookie data for curl");
97
#endif
98
99
/** Minimum sensible image size to be worth running Tesseract. */
100
const int kMinRectSize = 10;
101
/** Character returned when Tesseract couldn't recognize as anything. */
102
const char kTesseractReject = '~';
103
/** Character used by UNLV error counter as a reject. */
104
const char kUNLVReject = '~';
105
/** Character used by UNLV as a suspect marker. */
106
const char kUNLVSuspect = '^';
107
/**
108
 * Temp file used for storing current parameters before applying retry values.
109
 */
110
static const char *kOldVarsFile = "failed_vars.txt";
111
112
#ifndef DISABLED_LEGACY_ENGINE
113
/**
114
 * Filename used for input image file, from which to derive a name to search
115
 * for a possible UNLV zone file, if none is specified by SetInputName.
116
 */
117
static const char *kInputFile = "noname.tif";
118
static const char kUnknownFontName[] = "UnknownFont";
119
120
static STRING_VAR(classify_font_name, kUnknownFontName,
121
                  "Default font name to be used in training");
122
123
// Finds the name of the training font and returns it in fontname, by cutting
124
// it out based on the expectation that the filename is of the form:
125
// /path/to/dir/[lang].[fontname].exp[num]
126
// The [lang], [fontname] and [num] fields should not have '.' characters.
127
// If the global parameter classify_font_name is set, its value is used instead.
128
0
static void ExtractFontName(const char* filename, std::string* fontname) {
129
0
  *fontname = classify_font_name;
130
0
  if (*fontname == kUnknownFontName) {
131
    // filename is expected to be of the form [lang].[fontname].exp[num]
132
    // The [lang], [fontname] and [num] fields should not have '.' characters.
133
0
    const char *basename = strrchr(filename, '/');
134
0
    const char *firstdot = strchr(basename ? basename : filename, '.');
135
0
    const char *lastdot  = strrchr(filename, '.');
136
0
    if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) {
137
0
      ++firstdot;
138
0
      *fontname = firstdot;
139
0
      fontname->resize(lastdot - firstdot);
140
0
    }
141
0
  }
142
0
}
143
#endif
144
145
/* Add all available languages recursively.
146
 */
147
static void addAvailableLanguages(const std::string &datadir,
                                  std::vector<std::string> *langs) {
  // Appends to langs the name of every *.traineddata file found below
  // datadir. Names are relative to datadir with the extension removed, so a
  // file in a subdirectory keeps its directory prefix.
  //
  // The error_code overloads are used throughout so that a missing or
  // unreadable datadir yields no additional entries instead of throwing
  // std::filesystem::filesystem_error out of the public API.
  namespace fs = std::filesystem;
  std::error_code ec;
  fs::recursive_directory_iterator it(
      datadir,
      fs::directory_options::follow_directory_symlink |
          fs::directory_options::skip_permission_denied,
      ec);
  const fs::recursive_directory_iterator end;
  // Stop at the first iteration error; entries collected so far are kept.
  for (; !ec && it != end; it.increment(ec)) {
    auto path = it->path().lexically_relative(datadir);
    if (path.extension() == ".traineddata") {
      langs->push_back(path.replace_extension("").string());
    }
  }
}
159
160
TessBaseAPI::TessBaseAPI()
    : tesseract_(nullptr),
      osd_tesseract_(nullptr),
      equ_detect_(nullptr),
      reader_(nullptr),
      // No thresholder yet: it is expected to be supplied before use, either
      // by the constructor of a derived API or implicitly when
      // InternalSetImage needs one.
      thresholder_(nullptr),
      paragraph_models_(nullptr),
      block_list_(nullptr),
      page_res_(nullptr),
      last_oem_requested_(OEM_DEFAULT),
      recognition_done_(false),
      rect_left_(0),
      rect_top_(0),
      rect_width_(0),
      rect_height_(0),
      image_width_(0),
      image_height_(0) {}
182
183
0
TessBaseAPI::~TessBaseAPI() {
  // All teardown is delegated to End().
  this->End();
}
186
187
/**
188
 * Returns the version identifier as a static string. Do not delete.
189
 */
190
0
const char *TessBaseAPI::Version() {
191
0
  return TESSERACT_VERSION_STR;
192
0
}
193
194
/**
195
 * Set the name of the input file. Needed only for training and
196
 * loading a UNLV zone file.
197
 */
198
0
void TessBaseAPI::SetInputName(const char *name) {
199
0
  input_file_ = name ? name : "";
200
0
}
201
202
/** Set the name of the output files. Needed only for debugging. */
203
0
void TessBaseAPI::SetOutputName(const char *name) {
204
0
  output_file_ = name ? name : "";
205
0
}
206
207
2
bool TessBaseAPI::SetVariable(const char *name, const char *value) {
208
2
  if (tesseract_ == nullptr) {
209
0
    tesseract_ = new Tesseract;
210
0
  }
211
2
  return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
212
2
                              tesseract_->params());
213
2
}
214
215
0
bool TessBaseAPI::SetDebugVariable(const char *name, const char *value) {
216
0
  if (tesseract_ == nullptr) {
217
0
    tesseract_ = new Tesseract;
218
0
  }
219
0
  return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY, tesseract_->params());
220
0
}
221
222
14.6k
bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
223
14.6k
  auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params,
224
14.6k
                                            tesseract_->params()->int_params);
225
14.6k
  if (p == nullptr) {
226
0
    return false;
227
0
  }
228
14.6k
  *value = (int32_t)(*p);
229
14.6k
  return true;
230
14.6k
}
231
232
6.88k
bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
233
6.88k
  auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params,
234
6.88k
                                             tesseract_->params()->bool_params);
235
6.88k
  if (p == nullptr) {
236
0
    return false;
237
0
  }
238
6.88k
  *value = bool(*p);
239
6.88k
  return true;
240
6.88k
}
241
242
0
const char *TessBaseAPI::GetStringVariable(const char *name) const {
243
0
  auto *p = ParamUtils::FindParam<StringParam>(name, GlobalParams()->string_params,
244
0
                                               tesseract_->params()->string_params);
245
0
  return (p != nullptr) ? p->c_str() : nullptr;
246
0
}
247
248
0
bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
249
0
  auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params,
250
0
                                               tesseract_->params()->double_params);
251
0
  if (p == nullptr) {
252
0
    return false;
253
0
  }
254
0
  *value = (double)(*p);
255
0
  return true;
256
0
}
257
258
/** Get value of named variable as a string, if it exists. */
259
0
bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const {
260
0
  return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
261
0
}
262
263
#ifndef DISABLED_LEGACY_ENGINE
264
265
/** Print Tesseract fonts table to the given file. */
266
0
void TessBaseAPI::PrintFontsTable(FILE *fp) const {
267
0
  const int fontinfo_size = tesseract_->get_fontinfo_table().size();
268
0
  for (int font_index = 1; font_index < fontinfo_size; ++font_index) {
269
0
    FontInfo font = tesseract_->get_fontinfo_table().at(font_index);
270
0
    fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s"
271
0
                " is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n",
272
0
                font_index, font.name,
273
0
                font.is_italic() ? "true" : "false",
274
0
                font.is_bold() ? "true" : "false",
275
0
                font.is_fixed_pitch() ? "true" : "false",
276
0
                font.is_serif() ? "true" : "false",
277
0
                font.is_fraktur() ? "true" : "false");
278
0
  }
279
0
}
280
281
#endif
282
283
/** Print Tesseract parameters to the given file. */
284
0
void TessBaseAPI::PrintVariables(FILE *fp) const {
285
0
  ParamUtils::PrintParams(fp, tesseract_->params());
286
0
}
287
288
/**
289
 * The datapath must be the name of the data directory or
290
 * some other file in which the data directory resides (for instance argv[0].)
291
 * The language is (usually) an ISO 639-3 string or nullptr will default to eng.
292
 * If numeric_mode is true, then only digits and Roman numerals will
293
 * be returned.
294
 * @return: 0 on success and -1 on initialization failure.
295
 */
296
int TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs,
297
                      int configs_size, const std::vector<std::string> *vars_vec,
298
2
                      const std::vector<std::string> *vars_values, bool set_only_non_debug_params) {
299
2
  return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values,
300
2
              set_only_non_debug_params, nullptr);
301
2
}
302
303
// In-memory version reads the traineddata file directly from the given
304
// data[data_size] array. Also implements the version with a datapath in data,
305
// flagged by data_size = 0.
306
int TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem,
307
                      char **configs, int configs_size, const std::vector<std::string> *vars_vec,
308
                      const std::vector<std::string> *vars_values, bool set_only_non_debug_params,
309
2
                      FileReader reader) {
310
2
  if (language == nullptr) {
311
0
    language = "";
312
0
  }
313
2
  if (data == nullptr) {
314
2
    data = "";
315
2
  }
316
2
  std::string datapath = data_size == 0 ? data : language;
317
  // If the datapath, OcrEngineMode or the language have changed - start again.
318
  // Note that the language_ field stores the last requested language that was
319
  // initialized successfully, while tesseract_->lang stores the language
320
  // actually used. They differ only if the requested language was nullptr, in
321
  // which case tesseract_->lang is set to the Tesseract default ("eng").
322
2
  if (tesseract_ != nullptr &&
323
2
      (datapath_.empty() || language_.empty() || datapath_ != datapath ||
324
0
       last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) {
325
0
    delete tesseract_;
326
0
    tesseract_ = nullptr;
327
0
  }
328
2
  bool reset_classifier = true;
329
2
  if (tesseract_ == nullptr) {
330
2
    reset_classifier = false;
331
2
    tesseract_ = new Tesseract;
332
2
    if (reader != nullptr) {
333
0
      reader_ = reader;
334
0
    }
335
2
    TessdataManager mgr(reader_);
336
2
    if (data_size != 0) {
337
0
      mgr.LoadMemBuffer(language, data, data_size);
338
0
    }
339
2
    if (tesseract_->init_tesseract(datapath, output_file_, language, oem, configs,
340
2
                                   configs_size, vars_vec, vars_values, set_only_non_debug_params,
341
2
                                   &mgr) != 0) {
342
0
      return -1;
343
0
    }
344
2
  }
345
346
  // Update datapath and language requested for the last valid initialization.
347
2
  datapath_ = std::move(datapath);
348
2
  if (datapath_.empty() && !tesseract_->datadir.empty()) {
349
2
    datapath_ = tesseract_->datadir;
350
2
  }
351
352
2
  language_ = language;
353
2
  last_oem_requested_ = oem;
354
355
2
#ifndef DISABLED_LEGACY_ENGINE
356
  // For same language and datapath, just reset the adaptive classifier.
357
2
  if (reset_classifier) {
358
0
    tesseract_->ResetAdaptiveClassifier();
359
0
  }
360
2
#endif // ndef DISABLED_LEGACY_ENGINE
361
2
  return 0;
362
2
}
363
364
/**
365
 * Returns the languages string used in the last valid initialization.
366
 * If the last initialization specified "deu+hin" then that will be
367
 * returned. If hin loaded eng automatically as well, then that will
368
 * not be included in this list. To find the languages actually
369
 * loaded use GetLoadedLanguagesAsVector.
370
 * The returned string should NOT be deleted.
371
 */
372
0
const char *TessBaseAPI::GetInitLanguagesAsString() const {
373
0
  return language_.c_str();
374
0
}
375
376
/**
377
 * Returns the loaded languages in the vector of std::string.
378
 * Includes all languages loaded by the last Init, including those loaded
379
 * as dependencies of other loaded languages.
380
 */
381
0
void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const {
382
0
  langs->clear();
383
0
  if (tesseract_ != nullptr) {
384
0
    langs->push_back(tesseract_->lang);
385
0
    int num_subs = tesseract_->num_sub_langs();
386
0
    for (int i = 0; i < num_subs; ++i) {
387
0
      langs->push_back(tesseract_->get_sub_lang(i)->lang);
388
0
    }
389
0
  }
390
0
}
391
392
/**
393
 * Returns the available languages in the sorted vector of std::string.
394
 */
395
0
void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const {
396
0
  langs->clear();
397
0
  if (tesseract_ != nullptr) {
398
0
    addAvailableLanguages(tesseract_->datadir, langs);
399
0
    std::sort(langs->begin(), langs->end());
400
0
  }
401
0
}
402
403
/**
404
 * Init only for page layout analysis. Use only for calls to SetImage and
405
 * AnalysePage. Calls that attempt recognition will generate an error.
406
 */
407
0
void TessBaseAPI::InitForAnalysePage() {
408
0
  if (tesseract_ == nullptr) {
409
0
    tesseract_ = new Tesseract;
410
0
#ifndef DISABLED_LEGACY_ENGINE
411
0
    tesseract_->InitAdaptiveClassifier(nullptr);
412
0
#endif
413
0
  }
414
0
}
415
416
/**
417
 * Read a "config" file containing a set of parameter name, value pairs.
418
 * Searches the standard places: tessdata/configs, tessdata/tessconfigs
419
 * and also accepts a relative or absolute path name.
420
 */
421
0
void TessBaseAPI::ReadConfigFile(const char *filename) {
422
0
  tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY);
423
0
}
424
425
/** Same as above, but only set debug params from the given config file. */
426
0
void TessBaseAPI::ReadDebugConfigFile(const char *filename) {
427
0
  tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY);
428
0
}
429
430
/**
431
 * Set the current page segmentation mode. Defaults to PSM_AUTO.
432
 * The mode is stored as an IntParam so it can also be modified by
433
 * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
434
 */
435
0
void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
  // Create the engine on demand so the mode can be stored when no Tesseract
  // instance exists yet.
  if (!tesseract_) {
    tesseract_ = new Tesseract;
  }
  auto &pageseg_mode = tesseract_->tessedit_pageseg_mode;
  pageseg_mode.set_value(mode);
}
441
442
/** Return the current page segmentation mode. */
443
0
PageSegMode TessBaseAPI::GetPageSegMode() const {
  if (tesseract_ != nullptr) {
    // The mode is held as an integer parameter; convert it back to the enum.
    const int stored_mode = static_cast<int>(tesseract_->tessedit_pageseg_mode);
    return static_cast<PageSegMode>(stored_mode);
  }
  // No engine yet: report PSM_SINGLE_BLOCK.
  return PSM_SINGLE_BLOCK;
}
449
450
/**
451
 * Recognize a rectangle from an image and return the result as a string.
452
 * May be called many times for a single Init.
453
 * Currently has no error checking.
454
 * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
455
 * Palette color images will not work properly and must be converted to
456
 * 24 bit.
457
 * Binary images of 1 bit per pixel may also be given but they must be
458
 * byte packed with the MSB of the first byte being the first pixel, and a
459
 * one pixel is WHITE. For binary images set bytes_per_pixel=0.
460
 * The recognized text is returned as a char* which is coded
461
 * as UTF8 and must be freed with the delete [] operator.
462
 */
463
char *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
464
0
                                 int bytes_per_line, int left, int top, int width, int height) {
465
0
  if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) {
466
0
    return nullptr; // Nothing worth doing.
467
0
  }
468
469
  // Since this original api didn't give the exact size of the image,
470
  // we have to invent a reasonable value.
471
0
  int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
472
0
  SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel,
473
0
           bytes_per_line);
474
0
  SetRectangle(left, top, width, height);
475
476
0
  return GetUTF8Text();
477
0
}
478
479
#ifndef DISABLED_LEGACY_ENGINE
480
/**
481
 * Call between pages or documents etc to free up memory and forget
482
 * adaptive data.
483
 */
484
0
void TessBaseAPI::ClearAdaptiveClassifier() {
485
0
  if (tesseract_ == nullptr) {
486
0
    return;
487
0
  }
488
0
  tesseract_->ResetAdaptiveClassifier();
489
0
  tesseract_->ResetDocumentDictionary();
490
0
}
491
#endif // ndef DISABLED_LEGACY_ENGINE
492
493
/**
494
 * Provide an image for Tesseract to recognize. Format is as
495
 * TesseractRect above. Copies the image buffer and converts to Pix.
496
 * SetImage clears all recognition results, and sets the rectangle to the
497
 * full image, so it may be followed immediately by a GetUTF8Text, and it
498
 * will automatically perform recognition.
499
 */
500
void TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height,
501
0
                           int bytes_per_pixel, int bytes_per_line) {
502
0
  if (InternalSetImage()) {
503
0
    thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
504
0
    SetInputImage(thresholder_->GetPixRect());
505
0
  }
506
0
}
507
508
0
void TessBaseAPI::SetSourceResolution(int ppi) {
509
0
  if (thresholder_) {
510
0
    thresholder_->SetSourceYResolution(ppi);
511
0
  } else {
512
0
    tprintf("Please call SetImage before SetSourceResolution.\n");
513
0
  }
514
0
}
515
516
/**
517
 * Provide an image for Tesseract to recognize. As with SetImage above,
518
 * Tesseract takes its own copy of the image, so it need not persist until
519
 * after Recognize.
520
 * Pix vs raw, which to use?
521
 * Use Pix where possible. Tesseract uses Pix as its internal representation
522
 * and it is therefore more efficient to provide a Pix directly.
523
 */
524
7.74k
void TessBaseAPI::SetImage(Pix *pix) {
  if (!InternalSetImage()) {
    return; // Could not prepare for a new image.
  }
  const bool png_with_alpha = pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG;
  if (png_with_alpha) {
    // Strip the alpha channel and copy the 3-sample result back into pix.
    Pix *without_alpha = pixRemoveAlpha(pix);
    pixSetSpp(without_alpha, 3);
    (void)pixCopy(pix, without_alpha);
    pixDestroy(&without_alpha);
  }
  thresholder_->SetImage(pix);
  SetInputImage(thresholder_->GetPixRect());
}
537
538
/**
539
 * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
540
 * Each SetRectangle clears the recognition results so multiple rectangles
541
 * can be recognized with the same image.
542
 */
543
0
void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
544
0
  if (thresholder_ == nullptr) {
545
0
    return;
546
0
  }
547
0
  thresholder_->SetRectangle(left, top, width, height);
548
0
  ClearResults();
549
0
}
550
551
/**
552
 * ONLY available after SetImage if you have Leptonica installed.
553
 * Get a copy of the internal thresholded image from Tesseract.
554
 */
555
0
Pix *TessBaseAPI::GetThresholdedImage() {
  if (tesseract_ == nullptr || thresholder_ == nullptr) {
    return nullptr;
  }
  // Threshold on demand if the engine has no binary image yet.
  if (tesseract_->pix_binary() == nullptr) {
    if (!Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
      return nullptr;
    }
  }
  return tesseract_->pix_binary().clone();
}
564
565
/**
566
 * Get the result of page layout analysis as a leptonica-style
567
 * Boxa, Pixa pair, in reading order.
568
 * Can be called before or after Recognize.
569
 */
570
0
Boxa *TessBaseAPI::GetRegions(Pixa **pixa) {
  // Block level, not restricted to text, and no block-id array requested.
  const bool text_only = false;
  return GetComponentImages(RIL_BLOCK, text_only, pixa, nullptr);
}
573
574
/**
575
 * Get the textlines as a leptonica-style Boxa, Pixa pair, in reading order.
576
 * Can be called before or after Recognize.
577
 * If blockids is not nullptr, the block-id of each line is also returned as an
578
 * array of one element per line. delete [] after use.
579
 * If paraids is not nullptr, the paragraph-id of each line within its block is
580
 * also returned as an array of one element per line. delete [] after use.
581
 */
582
Boxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa,
                                int **blockids, int **paraids) {
  // Textline level, text components only; everything else is passed through.
  const bool text_only = true;
  return GetComponentImages(RIL_TEXTLINE, text_only, raw_image, raw_padding, pixa, blockids,
                            paraids);
}
586
587
/**
588
 * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
589
 * pair, in reading order. Enables downstream handling of non-rectangular
590
 * regions.
591
 * Can be called before or after Recognize.
592
 * If blockids is not nullptr, the block-id of each line is also returned as an
593
 * array of one element per line. delete [] after use.
594
 */
595
0
Boxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) {
  // Textline level, but not restricted to text components.
  const bool text_only = false;
  return GetComponentImages(RIL_TEXTLINE, text_only, pixa, blockids);
}
598
599
/**
600
 * Get the words as a leptonica-style
601
 * Boxa, Pixa pair, in reading order.
602
 * Can be called before or after Recognize.
603
 */
604
0
Boxa *TessBaseAPI::GetWords(Pixa **pixa) {
  // Word level, text components only, no block-id array requested.
  const bool text_only = true;
  return GetComponentImages(RIL_WORD, text_only, pixa, nullptr);
}
607
608
/**
609
 * Gets the individual connected (text) components (created
610
 * after pages segmentation step, but before recognition)
611
 * as a leptonica-style Boxa, Pixa pair, in reading order.
612
 * Can be called before or after Recognize.
613
 */
614
0
Boxa *TessBaseAPI::GetConnectedComponents(Pixa **pixa) {
  // Symbol level, text components only, no block-id array requested.
  const bool text_only = true;
  return GetComponentImages(RIL_SYMBOL, text_only, pixa, nullptr);
}
617
618
/**
619
 * Get the given level kind of components (block, textline, word etc.) as a
620
 * leptonica-style Boxa, Pixa pair, in reading order.
621
 * Can be called before or after Recognize.
622
 * If blockids is not nullptr, the block-id of each component is also returned
623
 * as an array of one element per component. delete [] after use.
624
 * If text_only is true, then only text components are returned.
625
 */
626
Boxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image,
                                      const int raw_padding, Pixa **pixa, int **blockids,
                                      int **paraids) {
  // Use the current results if there are any, otherwise run layout analysis.
  std::unique_ptr<PageIterator> page_it(GetIterator());
  if (page_it == nullptr) {
    page_it.reset(AnalyseLayout());
  }
  if (page_it == nullptr) {
    return nullptr; // Failed.
  }

  int left, top, right, bottom;
  // Reads the box of the element under the iterator into left/top/right/
  // bottom and reports whether that element should be returned. With
  // raw_image the box is in the original image with raw_padding applied;
  // otherwise it comes from the binarized image, which may be scaled
  // differently from the original.
  auto accept_current = [&]() -> bool {
    const bool has_box =
        raw_image ? page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom)
                  : page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom);
    return has_box && (!text_only || PTIsTextType(page_it->BlockType()));
  };

  // First pass: count the accepted components to size the outputs.
  int component_count = 0;
  do {
    if (accept_current()) {
      ++component_count;
    }
  } while (page_it->Next(level));

  Boxa *boxa = boxaCreate(component_count);
  if (pixa != nullptr) {
    *pixa = pixaCreate(component_count);
  }
  if (blockids != nullptr) {
    *blockids = new int[component_count];
  }
  if (paraids != nullptr) {
    *paraids = new int[component_count];
  }

  // Second pass: rewind and fill the outputs.
  int blockid = 0;
  int paraid = 0;
  int component_index = 0;
  page_it->Begin();
  do {
    if (!accept_current()) {
      continue; // Skipped element; the loop condition still advances.
    }
    Box *lbox = boxCreate(left, top, right - left, bottom - top);
    boxaAddBox(boxa, lbox, L_INSERT);
    if (pixa != nullptr) {
      Pix *pix = raw_image
                     ? page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top)
                     : page_it->GetBinaryImage(level);
      pixaAddPix(*pixa, pix, L_INSERT);
      pixaAddBox(*pixa, lbox, L_CLONE);
    }
    if (paraids != nullptr) {
      (*paraids)[component_index] = paraid;
      if (page_it->IsAtFinalElement(RIL_PARA, level)) {
        ++paraid;
      }
    }
    if (blockids != nullptr) {
      (*blockids)[component_index] = blockid;
      if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
        // Paragraph ids restart in the next block.
        ++blockid;
        paraid = 0;
      }
    }
    ++component_index;
  } while (page_it->Next(level));
  return boxa;
}
713
714
0
int TessBaseAPI::GetThresholdedImageScaleFactor() const {
715
0
  if (thresholder_ == nullptr) {
716
0
    return 0;
717
0
  }
718
0
  return thresholder_->GetScaleFactor();
719
0
}
720
721
/**
722
 * Runs page layout analysis in the mode set by SetPageSegMode.
723
 * May optionally be called prior to Recognize to get access to just
724
 * the page layout results. Returns an iterator to the results.
725
 * If merge_similar_words is true, words are combined where suitable for use
726
 * with a line recognizer. Use if you want to use AnalyseLayout to find the
727
 * textlines, and then want to process textline fragments with an external
728
 * line recognizer.
729
 * Returns nullptr on error or an empty page.
730
 * The returned iterator must be deleted after use.
731
 * WARNING! This class points to data held within the TessBaseAPI class, and
732
 * therefore can only be used while the TessBaseAPI class still exists and
733
 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
734
 * DetectOS, or anything else that changes the internal PAGE_RES.
735
 */
736
0
PageIterator *TessBaseAPI::AnalyseLayout() {
  // Convenience overload: analyse without merging similar words.
  const bool merge_similar_words = false;
  return AnalyseLayout(merge_similar_words);
}
739
740
0
PageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
  if (FindLines() != 0) {
    return nullptr; // Line finding failed.
  }
  if (block_list_->empty()) {
    return nullptr; // The page was empty.
  }
  // NOTE(review): whatever page_res_ held before is overwritten here without
  // being deleted - confirm that results are always cleared before this call.
  page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr);
  DetectParagraphs(false);
  // The caller owns the returned iterator.
  return new PageIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
                          thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
                          rect_width_, rect_height_);
}
753
754
/**
755
 * Recognize the tesseract global image and return the result as Tesseract
756
 * internal structures.
757
 */
758
7.74k
int TessBaseAPI::Recognize(ETEXT_DESC *monitor) {
759
7.74k
  if (tesseract_ == nullptr) {
760
0
    return -1;
761
0
  }
762
7.74k
  if (FindLines() != 0) {
763
0
    return -1;
764
0
  }
765
7.74k
  delete page_res_;
766
7.74k
  if (block_list_->empty()) {
767
863
    page_res_ = new PAGE_RES(false, block_list_, &tesseract_->prev_word_best_choice_);
768
863
    return 0; // Empty page.
769
863
  }
770
771
6.88k
  tesseract_->SetBlackAndWhitelist();
772
6.88k
  recognition_done_ = true;
773
6.88k
#ifndef DISABLED_LEGACY_ENGINE
774
6.88k
  if (tesseract_->tessedit_resegment_from_line_boxes) {
775
0
    page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), true, block_list_);
776
6.88k
  } else if (tesseract_->tessedit_resegment_from_boxes) {
777
0
    page_res_ = tesseract_->ApplyBoxes(input_file_.c_str(), false, block_list_);
778
0
  } else
779
6.88k
#endif // ndef DISABLED_LEGACY_ENGINE
780
6.88k
  {
781
6.88k
    page_res_ =
782
6.88k
        new PAGE_RES(tesseract_->AnyLSTMLang(), block_list_, &tesseract_->prev_word_best_choice_);
783
6.88k
  }
784
785
6.88k
  if (page_res_ == nullptr) {
786
0
    return -1;
787
0
  }
788
789
6.88k
  if (tesseract_->tessedit_train_line_recognizer) {
790
0
    if (!tesseract_->TrainLineRecognizer(input_file_.c_str(), output_file_, block_list_)) {
791
0
      return -1;
792
0
    }
793
0
    tesseract_->CorrectClassifyWords(page_res_);
794
0
    return 0;
795
0
  }
796
6.88k
#ifndef DISABLED_LEGACY_ENGINE
797
6.88k
  if (tesseract_->tessedit_make_boxes_from_boxes) {
798
0
    tesseract_->CorrectClassifyWords(page_res_);
799
0
    return 0;
800
0
  }
801
6.88k
#endif // ndef DISABLED_LEGACY_ENGINE
802
803
6.88k
  int result = 0;
804
6.88k
  if (tesseract_->interactive_display_mode) {
805
#ifndef GRAPHICS_DISABLED
806
    tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_);
807
#endif // !GRAPHICS_DISABLED
808
       // The page_res is invalid after an interactive session, so cleanup
809
       // in a way that lets us continue to the next page without crashing.
810
0
    delete page_res_;
811
0
    page_res_ = nullptr;
812
0
    return -1;
813
0
#ifndef DISABLED_LEGACY_ENGINE
814
6.88k
  } else if (tesseract_->tessedit_train_from_boxes) {
815
0
    std::string fontname;
816
0
    ExtractFontName(output_file_.c_str(), &fontname);
817
0
    tesseract_->ApplyBoxTraining(fontname, page_res_);
818
6.88k
  } else if (tesseract_->tessedit_ambigs_training) {
819
0
    FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str());
820
    // OCR the page segmented into words by tesseract.
821
0
    tesseract_->recog_training_segmented(input_file_.c_str(), page_res_, monitor,
822
0
                                         training_output_file);
823
0
    fclose(training_output_file);
824
0
#endif // ndef DISABLED_LEGACY_ENGINE
825
6.88k
  } else {
826
    // Now run the main recognition.
827
6.88k
    bool wait_for_text = true;
828
6.88k
    GetBoolVariable("paragraph_text_based", &wait_for_text);
829
6.88k
    if (!wait_for_text) {
830
0
      DetectParagraphs(false);
831
0
    }
832
6.88k
    if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) {
833
6.88k
      if (wait_for_text) {
834
6.88k
        DetectParagraphs(true);
835
6.88k
      }
836
6.88k
    } else {
837
0
      result = -1;
838
0
    }
839
6.88k
  }
840
6.88k
  return result;
841
6.88k
}
842
843
// Takes ownership of the input pix.
844
7.74k
void TessBaseAPI::SetInputImage(Pix *pix) {
845
7.74k
  tesseract_->set_pix_original(pix);
846
7.74k
}
847
848
0
Pix *TessBaseAPI::GetInputImage() {
  // Hands back whatever tesseract_ currently holds as the original image.
  Pix *const original = tesseract_->pix_original();
  return original;
}
851
852
0
const char *TessBaseAPI::GetInputName() {
853
0
  if (!input_file_.empty()) {
854
0
    return input_file_.c_str();
855
0
  }
856
0
  return nullptr;
857
0
}
858
859
0
const char *TessBaseAPI::GetDatapath() {
860
0
  return tesseract_->datadir.c_str();
861
0
}
862
863
0
int TessBaseAPI::GetSourceYResolution() {
864
0
  if (thresholder_ == nullptr)
865
0
    return -1;
866
0
  return thresholder_->GetSourceYResolution();
867
0
}
868
869
// If flist exists, get data from there. Otherwise get data from buf.
870
// Seems convoluted, but is the easiest way I know of to meet multiple
871
// goals. Support streaming from stdin, and also work on platforms
872
// lacking fmemopen.
873
// TODO: check different logic for flist/buf and simplify.
874
bool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config,
875
                                       int timeout_millisec, TessResultRenderer *renderer,
876
0
                                       int tessedit_page_number) {
877
0
  if (!flist && !buf) {
878
0
    return false;
879
0
  }
880
0
  unsigned page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
881
0
  char pagename[MAX_PATH];
882
883
0
  std::vector<std::string> lines;
884
0
  if (!flist) {
885
0
    std::string line;
886
0
    for (const auto ch : *buf) {
887
0
      if (ch == '\n') {
888
0
        lines.push_back(line);
889
0
        line.clear();
890
0
      } else {
891
0
        line.push_back(ch);
892
0
      }
893
0
    }
894
0
    if (!line.empty()) {
895
      // Add last line without terminating LF.
896
0
      lines.push_back(line);
897
0
    }
898
0
    if (lines.empty()) {
899
0
      return false;
900
0
    }
901
0
  }
902
903
  // Skip to the requested page number.
904
0
  for (unsigned i = 0; i < page; i++) {
905
0
    if (flist) {
906
0
      if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
907
0
        break;
908
0
      }
909
0
    }
910
0
  }
911
912
  // Begin producing output
913
0
  if (renderer && !renderer->BeginDocument(document_title.c_str())) {
914
0
    return false;
915
0
  }
916
917
  // Loop over all pages - or just the requested one
918
0
  while (true) {
919
0
    if (flist) {
920
0
      if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
921
0
        break;
922
0
      }
923
0
    } else {
924
0
      if (page >= lines.size()) {
925
0
        break;
926
0
      }
927
0
      snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
928
0
    }
929
0
    chomp_string(pagename);
930
0
    Pix *pix = pixRead(pagename);
931
0
    if (pix == nullptr) {
932
0
      tprintf("Image file %s cannot be read!\n", pagename);
933
0
      return false;
934
0
    }
935
0
    tprintf("Page %u : %s\n", page, pagename);
936
0
    bool r = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer);
937
0
    pixDestroy(&pix);
938
0
    if (!r) {
939
0
      return false;
940
0
    }
941
0
    if (tessedit_page_number >= 0) {
942
0
      break;
943
0
    }
944
0
    ++page;
945
0
  }
946
947
  // Finish producing output
948
0
  if (renderer && !renderer->EndDocument()) {
949
0
    return false;
950
0
  }
951
0
  return true;
952
0
}
953
954
bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename,
955
                                            const char *retry_config, int timeout_millisec,
956
                                            TessResultRenderer *renderer,
957
0
                                            int tessedit_page_number) {
958
0
  Pix *pix = nullptr;
959
0
  int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
960
0
  size_t offset = 0;
961
0
  for (;; ++page) {
962
0
    if (tessedit_page_number >= 0) {
963
0
      page = tessedit_page_number;
964
0
      pix = (data) ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page);
965
0
    } else {
966
0
      pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)
967
0
                   : pixReadFromMultipageTiff(filename, &offset);
968
0
    }
969
0
    if (pix == nullptr) {
970
0
      break;
971
0
    }
972
0
    if (offset || page > 0) {
973
      // Only print page number for multipage TIFF file.
974
0
      tprintf("Page %d\n", page + 1);
975
0
    }
976
0
    auto page_string = std::to_string(page);
977
0
    SetVariable("applybox_page", page_string.c_str());
978
0
    bool r = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer);
979
0
    pixDestroy(&pix);
980
0
    if (!r) {
981
0
      return false;
982
0
    }
983
0
    if (tessedit_page_number >= 0) {
984
0
      break;
985
0
    }
986
0
    if (!offset) {
987
0
      break;
988
0
    }
989
0
  }
990
0
  return true;
991
0
}
992
993
// Master ProcessPages calls ProcessPagesInternal and then does any post-
994
// processing required due to being in a training mode.
995
bool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec,
996
0
                               TessResultRenderer *renderer) {
997
0
  bool result = ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
998
0
#ifndef DISABLED_LEGACY_ENGINE
999
0
  if (result) {
1000
0
    if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) {
1001
0
      tprintf("Write of TR file failed: %s\n", output_file_.c_str());
1002
0
      return false;
1003
0
    }
1004
0
  }
1005
0
#endif // ndef DISABLED_LEGACY_ENGINE
1006
0
  return result;
1007
0
}
1008
1009
#ifdef HAVE_LIBCURL
1010
// Write callback handed to libcurl: appends the received chunk of
// size * nmemb bytes to the std::string that userp points to and reports
// that whole chunk as consumed.
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
  const size_t byte_count = size * nmemb;
  const char *const chunk = static_cast<const char *>(contents);
  std::string *const sink = static_cast<std::string *>(userp);
  sink->append(chunk, byte_count);
  return byte_count;
}
1016
#endif
1017
1018
// In the ideal scenario, Tesseract will start working on data as soon
1019
// as it can. For example, if you stream a filelist through stdin, we
1020
// should start the OCR process as soon as the first filename is
1021
// available. This is particularly useful when hooking Tesseract up to
1022
// slow hardware such as a book scanning machine.
1023
//
1024
// Unfortunately there are tradeoffs. You can't seek on stdin. That
1025
// makes automatic detection of datatype (TIFF? filelist? PNG?)
1026
// impractical.  So we support a command line flag to explicitly
1027
// identify the scenario that really matters: filelists on
1028
// stdin. We'll still do our best if the user likes pipes.
1029
bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config,
1030
0
                                       int timeout_millisec, TessResultRenderer *renderer) {
1031
0
  bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
1032
0
  if (stdInput) {
1033
#ifdef WIN32
1034
    if (_setmode(_fileno(stdin), _O_BINARY) == -1)
1035
      tprintf("ERROR: cin to binary: %s", strerror(errno));
1036
#endif // WIN32
1037
0
  }
1038
1039
0
  if (stream_filelist) {
1040
0
    return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer,
1041
0
                                tesseract_->tessedit_page_number);
1042
0
  }
1043
1044
  // At this point we are officially in autodection territory.
1045
  // That means any data in stdin must be buffered, to make it
1046
  // seekable.
1047
0
  std::string buf;
1048
0
  const l_uint8 *data = nullptr;
1049
0
  if (stdInput) {
1050
0
    buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>()));
1051
0
    data = reinterpret_cast<const l_uint8 *>(buf.data());
1052
0
  } else if (strstr(filename, "://") != nullptr) {
1053
    // Get image or image list by URL.
1054
#ifdef HAVE_LIBCURL
1055
    CURL *curl = curl_easy_init();
1056
    if (curl == nullptr) {
1057
      fprintf(stderr, "Error, curl_easy_init failed\n");
1058
      return false;
1059
    } else {
1060
      CURLcode curlcode;
1061
      auto error = [curl, &curlcode](const char *function) {
1062
        fprintf(stderr, "Error, %s failed with error %s\n", function, curl_easy_strerror(curlcode));
1063
        curl_easy_cleanup(curl);
1064
        return false;
1065
      };
1066
      curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
1067
      if (curlcode != CURLE_OK) {
1068
        return error("curl_easy_setopt");
1069
      }
1070
      curlcode = curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
1071
      if (curlcode != CURLE_OK) {
1072
        return error("curl_easy_setopt");
1073
      }
1074
      // Follow HTTP, HTTPS, FTP and FTPS redirects.
1075
      curlcode = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
1076
      if (curlcode != CURLE_OK) {
1077
        return error("curl_easy_setopt");
1078
      }
1079
      // Allow no more than 8 redirections to prevent endless loops.
1080
      curlcode = curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 8);
1081
      if (curlcode != CURLE_OK) {
1082
        return error("curl_easy_setopt");
1083
      }
1084
      int timeout = curl_timeout;
1085
      if (timeout > 0) {
1086
        curlcode = curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
1087
        if (curlcode != CURLE_OK) {
1088
          return error("curl_easy_setopt");
1089
        }
1090
        curlcode = curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
1091
        if (curlcode != CURLE_OK) {
1092
          return error("curl_easy_setopt");
1093
        }
1094
      }
1095
      std::string cookiefile = curl_cookiefile;
1096
      if (!cookiefile.empty()) {
1097
        curlcode = curl_easy_setopt(curl, CURLOPT_COOKIEFILE, cookiefile.c_str());
1098
        if (curlcode != CURLE_OK) {
1099
          return error("curl_easy_setopt");
1100
        }
1101
      }
1102
      curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
1103
      if (curlcode != CURLE_OK) {
1104
        return error("curl_easy_setopt");
1105
      }
1106
      curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
1107
      if (curlcode != CURLE_OK) {
1108
        return error("curl_easy_setopt");
1109
      }
1110
      curlcode = curl_easy_setopt(curl, CURLOPT_USERAGENT, "Tesseract OCR");
1111
      if (curlcode != CURLE_OK) {
1112
        return error("curl_easy_setopt");
1113
      }
1114
      curlcode = curl_easy_perform(curl);
1115
      if (curlcode != CURLE_OK) {
1116
        return error("curl_easy_perform");
1117
      }
1118
      curl_easy_cleanup(curl);
1119
      data = reinterpret_cast<const l_uint8 *>(buf.data());
1120
    }
1121
#else
1122
0
    fprintf(stderr, "Error, this tesseract has no URL support\n");
1123
0
    return false;
1124
0
#endif
1125
0
  } else {
1126
    // Check whether the input file can be read.
1127
0
    if (FILE *file = fopen(filename, "rb")) {
1128
0
      fclose(file);
1129
0
    } else {
1130
0
      fprintf(stderr, "Error, cannot read input file %s: %s\n", filename, strerror(errno));
1131
0
      return false;
1132
0
    }
1133
0
  }
1134
1135
  // Here is our autodetection
1136
0
  int format;
1137
0
  int r =
1138
0
      (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format);
1139
1140
  // Maybe we have a filelist
1141
0
  if (r != 0 || format == IFF_UNKNOWN) {
1142
0
    std::string s;
1143
0
    if (data != nullptr) {
1144
0
      s = buf.c_str();
1145
0
    } else {
1146
0
      std::ifstream t(filename);
1147
0
      std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
1148
0
      s = u.c_str();
1149
0
    }
1150
0
    return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer,
1151
0
                                tesseract_->tessedit_page_number);
1152
0
  }
1153
1154
  // Maybe we have a TIFF which is potentially multipage
1155
0
  bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE ||
1156
0
               format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
1157
0
#if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76
1158
0
               format == IFF_TIFF_JPEG ||
1159
0
#endif
1160
0
               format == IFF_TIFF_ZIP);
1161
1162
  // Fail early if we can, before producing any output
1163
0
  Pix *pix = nullptr;
1164
0
  if (!tiff) {
1165
0
    pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
1166
0
    if (pix == nullptr) {
1167
0
      return false;
1168
0
    }
1169
0
  }
1170
1171
  // Begin the output
1172
0
  if (renderer && !renderer->BeginDocument(document_title.c_str())) {
1173
0
    pixDestroy(&pix);
1174
0
    return false;
1175
0
  }
1176
1177
  // Produce output
1178
0
  r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec,
1179
0
                                         renderer, tesseract_->tessedit_page_number)
1180
0
             : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer);
1181
1182
  // Clean up memory as needed
1183
0
  pixDestroy(&pix);
1184
1185
  // End the output
1186
0
  if (!r || (renderer && !renderer->EndDocument())) {
1187
0
    return false;
1188
0
  }
1189
0
  return true;
1190
0
}
1191
1192
bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
1193
                              const char *retry_config, int timeout_millisec,
1194
0
                              TessResultRenderer *renderer) {
1195
0
  SetInputName(filename);
1196
0
  SetImage(pix);
1197
0
  bool failed = false;
1198
1199
0
  if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
1200
    // Disabled character recognition
1201
0
    if (! std::unique_ptr<const PageIterator>(AnalyseLayout())) {
1202
0
      failed = true;
1203
0
    }
1204
0
  } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {
1205
0
    failed = FindLines() != 0;
1206
0
  } else if (timeout_millisec > 0) {
1207
    // Running with a timeout.
1208
0
    ETEXT_DESC monitor;
1209
0
    monitor.cancel = nullptr;
1210
0
    monitor.cancel_this = nullptr;
1211
0
    monitor.set_deadline_msecs(timeout_millisec);
1212
1213
    // Now run the main recognition.
1214
0
    failed = Recognize(&monitor) < 0;
1215
0
  } else {
1216
    // Normal layout and character recognition with no timeout.
1217
0
    failed = Recognize(nullptr) < 0;
1218
0
  }
1219
1220
0
  if (tesseract_->tessedit_write_images) {
1221
0
    Pix *page_pix = GetThresholdedImage();
1222
0
    std::string output_filename = output_file_ + ".processed";
1223
0
    if (page_index > 0) {
1224
0
      output_filename += std::to_string(page_index);
1225
0
    }
1226
0
    output_filename += ".tif";
1227
0
    pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4);
1228
0
    pixDestroy(&page_pix);
1229
0
  }
1230
1231
0
  if (failed && retry_config != nullptr && retry_config[0] != '\0') {
1232
    // Save current config variables before switching modes.
1233
0
    FILE *fp = fopen(kOldVarsFile, "wb");
1234
0
    if (fp == nullptr) {
1235
0
      tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile);
1236
0
    } else {
1237
0
      PrintVariables(fp);
1238
0
      fclose(fp);
1239
0
    }
1240
    // Switch to alternate mode for retry.
1241
0
    ReadConfigFile(retry_config);
1242
0
    SetImage(pix);
1243
0
    Recognize(nullptr);
1244
    // Restore saved config variables.
1245
0
    ReadConfigFile(kOldVarsFile);
1246
0
  }
1247
1248
0
  if (renderer && !failed) {
1249
0
    failed = !renderer->AddImage(this);
1250
0
  }
1251
1252
0
  return !failed;
1253
0
}
1254
1255
/**
1256
 * Get a left-to-right iterator to the results of LayoutAnalysis and/or
1257
 * Recognize. The returned iterator must be deleted after use.
1258
 */
1259
0
LTRResultIterator *TessBaseAPI::GetLTRIterator() {
1260
0
  if (tesseract_ == nullptr || page_res_ == nullptr) {
1261
0
    return nullptr;
1262
0
  }
1263
0
  return new LTRResultIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
1264
0
                               thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
1265
0
                               rect_width_, rect_height_);
1266
0
}
1267
1268
/**
1269
 * Get a reading-order iterator to the results of LayoutAnalysis and/or
1270
 * Recognize. The returned iterator must be deleted after use.
1271
 * WARNING! This class points to data held within the TessBaseAPI class, and
1272
 * therefore can only be used while the TessBaseAPI class still exists and
1273
 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
1274
 * DetectOS, or anything else that changes the internal PAGE_RES.
1275
 */
1276
7.74k
ResultIterator *TessBaseAPI::GetIterator() {
1277
7.74k
  if (tesseract_ == nullptr || page_res_ == nullptr) {
1278
0
    return nullptr;
1279
0
  }
1280
7.74k
  return ResultIterator::StartOfParagraph(LTRResultIterator(
1281
7.74k
      page_res_, tesseract_, thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
1282
7.74k
      rect_left_, rect_top_, rect_width_, rect_height_));
1283
7.74k
}
1284
1285
/**
1286
 * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
1287
 * The returned iterator must be deleted after use.
1288
 * WARNING! This class points to data held within the TessBaseAPI class, and
1289
 * therefore can only be used while the TessBaseAPI class still exists and
1290
 * has not been subjected to a call of Init, SetImage, Recognize, Clear, End
1291
 * DetectOS, or anything else that changes the internal PAGE_RES.
1292
 */
1293
6.88k
MutableIterator *TessBaseAPI::GetMutableIterator() {
1294
6.88k
  if (tesseract_ == nullptr || page_res_ == nullptr) {
1295
0
    return nullptr;
1296
0
  }
1297
6.88k
  return new MutableIterator(page_res_, tesseract_, thresholder_->GetScaleFactor(),
1298
6.88k
                             thresholder_->GetScaledYResolution(), rect_left_, rect_top_,
1299
6.88k
                             rect_width_, rect_height_);
1300
6.88k
}
1301
1302
/** Make a text string from the internal data structures. */
1303
7.74k
char *TessBaseAPI::GetUTF8Text() {
1304
7.74k
  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1305
0
    return nullptr;
1306
0
  }
1307
7.74k
  std::string text("");
1308
7.74k
  const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator());
1309
10.8k
  do {
1310
10.8k
    if (it->Empty(RIL_PARA)) {
1311
865
      continue;
1312
865
    }
1313
9.97k
    auto block_type = it->BlockType();
1314
9.97k
    switch (block_type) {
1315
0
      case PT_FLOWING_IMAGE:
1316
0
      case PT_HEADING_IMAGE:
1317
0
      case PT_PULLOUT_IMAGE:
1318
0
      case PT_HORZ_LINE:
1319
0
      case PT_VERT_LINE:
1320
        // Ignore images and lines for text output.
1321
0
        continue;
1322
0
      case PT_NOISE:
1323
0
        tprintf("TODO: Please report image which triggers the noise case.\n");
1324
0
        ASSERT_HOST(false);
1325
9.97k
      default:
1326
9.97k
        break;
1327
9.97k
    }
1328
1329
9.97k
    const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
1330
9.97k
    text += para_text.get();
1331
10.8k
  } while (it->Next(RIL_PARA));
1332
7.74k
  return copy_string(text);
1333
7.74k
}
1334
1335
0
static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) {
1336
0
  int left, top, right, bottom;
1337
0
  it->BoundingBox(level, &left, &top, &right, &bottom);
1338
0
  text += "\t" + std::to_string(left);
1339
0
  text += "\t" + std::to_string(top);
1340
0
  text += "\t" + std::to_string(right - left);
1341
0
  text += "\t" + std::to_string(bottom - top);
1342
0
}
1343
1344
/**
1345
 * Make a TSV-formatted string from the internal data structures.
1346
 * page_number is 0-based but will appear in the output as 1-based.
1347
 * Returned string must be freed with the delete [] operator.
1348
 */
1349
0
char *TessBaseAPI::GetTSVText(int page_number) {
1350
0
  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
1351
0
    return nullptr;
1352
0
  }
1353
1354
#if !defined(NDEBUG)
1355
  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1356
#endif
1357
0
  int page_id = page_number + 1; // we use 1-based page numbers.
1358
1359
0
  int page_num = page_id;
1360
0
  int block_num = 0;
1361
0
  int par_num = 0;
1362
0
  int line_num = 0;
1363
0
  int word_num = 0;
1364
1365
0
  std::string tsv_str;
1366
0
  tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
1367
0
  tsv_str += "\t" + std::to_string(block_num);
1368
0
  tsv_str += "\t" + std::to_string(par_num);
1369
0
  tsv_str += "\t" + std::to_string(line_num);
1370
0
  tsv_str += "\t" + std::to_string(word_num);
1371
0
  tsv_str += "\t" + std::to_string(rect_left_);
1372
0
  tsv_str += "\t" + std::to_string(rect_top_);
1373
0
  tsv_str += "\t" + std::to_string(rect_width_);
1374
0
  tsv_str += "\t" + std::to_string(rect_height_);
1375
0
  tsv_str += "\t-1\t\n";
1376
1377
0
  const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
1378
0
  while (!res_it->Empty(RIL_BLOCK)) {
1379
0
    if (res_it->Empty(RIL_WORD)) {
1380
0
      res_it->Next(RIL_WORD);
1381
0
      continue;
1382
0
    }
1383
1384
    // Add rows for any new block/paragraph/textline.
1385
0
    if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1386
0
      block_num++;
1387
0
      par_num = 0;
1388
0
      line_num = 0;
1389
0
      word_num = 0;
1390
0
      tsv_str += "2\t" + std::to_string(page_num); // level 2 - block
1391
0
      tsv_str += "\t" + std::to_string(block_num);
1392
0
      tsv_str += "\t" + std::to_string(par_num);
1393
0
      tsv_str += "\t" + std::to_string(line_num);
1394
0
      tsv_str += "\t" + std::to_string(word_num);
1395
0
      AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
1396
0
      tsv_str += "\t-1\t\n"; // end of row for block
1397
0
    }
1398
0
    if (res_it->IsAtBeginningOf(RIL_PARA)) {
1399
0
      par_num++;
1400
0
      line_num = 0;
1401
0
      word_num = 0;
1402
0
      tsv_str += "3\t" + std::to_string(page_num); // level 3 - paragraph
1403
0
      tsv_str += "\t" + std::to_string(block_num);
1404
0
      tsv_str += "\t" + std::to_string(par_num);
1405
0
      tsv_str += "\t" + std::to_string(line_num);
1406
0
      tsv_str += "\t" + std::to_string(word_num);
1407
0
      AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
1408
0
      tsv_str += "\t-1\t\n"; // end of row for para
1409
0
    }
1410
0
    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1411
0
      line_num++;
1412
0
      word_num = 0;
1413
0
      tsv_str += "4\t" + std::to_string(page_num); // level 4 - line
1414
0
      tsv_str += "\t" + std::to_string(block_num);
1415
0
      tsv_str += "\t" + std::to_string(par_num);
1416
0
      tsv_str += "\t" + std::to_string(line_num);
1417
0
      tsv_str += "\t" + std::to_string(word_num);
1418
0
      AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
1419
0
      tsv_str += "\t-1\t\n"; // end of row for line
1420
0
    }
1421
1422
    // Now, process the word...
1423
0
    int left, top, right, bottom;
1424
0
    res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1425
0
    word_num++;
1426
0
    tsv_str += "5\t" + std::to_string(page_num); // level 5 - word
1427
0
    tsv_str += "\t" + std::to_string(block_num);
1428
0
    tsv_str += "\t" + std::to_string(par_num);
1429
0
    tsv_str += "\t" + std::to_string(line_num);
1430
0
    tsv_str += "\t" + std::to_string(word_num);
1431
0
    tsv_str += "\t" + std::to_string(left);
1432
0
    tsv_str += "\t" + std::to_string(top);
1433
0
    tsv_str += "\t" + std::to_string(right - left);
1434
0
    tsv_str += "\t" + std::to_string(bottom - top);
1435
0
    tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
1436
0
    tsv_str += "\t";
1437
1438
#if !defined(NDEBUG)
1439
    // Increment counts if at end of block/paragraph/textline.
1440
    if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
1441
      lcnt++;
1442
    }
1443
    if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) {
1444
      pcnt++;
1445
    }
1446
    if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) {
1447
      bcnt++;
1448
    }
1449
#endif
1450
1451
0
    do {
1452
0
      tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1453
0
      res_it->Next(RIL_SYMBOL);
1454
0
    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1455
0
    tsv_str += "\n"; // end of row
1456
#if !defined(NDEBUG)
1457
    wcnt++;
1458
#endif
1459
0
  }
1460
1461
0
  return copy_string(tsv_str);
1462
0
}
1463
1464
/** The 5 numbers output for each box (the usual 4 and a page number.) */
1465
const int kNumbersPerBlob = 5;
1466
/**
1467
 * The number of bytes taken by each number. Since we use int16_t for ICOORD,
1468
 * assume only 5 digits max.
1469
 */
1470
const int kBytesPerNumber = 5;
1471
/**
1472
 * Multiplier for max expected textlength assumes (kBytesPerNumber + space)
1473
 * * kNumbersPerBlob plus the newline. Add to this the
1474
 * original UTF8 characters, and one kMaxBytesPerLine for safety.
1475
 */
1476
const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1;
1477
/** Max bytes in the decimal representation of int64_t. */
1478
const int kBytesPer64BitNumber = 20;
1479
/**
1480
 * A maximal single box could occupy kNumbersPerBlob numbers at
1481
 * kBytesPer64BitNumber digits (if someone sneaks in a 64 bit value) and a
1482
 * space plus the newline and the maximum length of a UNICHAR.
1483
 * Test against this on each iteration for safety.
1484
 */
1485
const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 + UNICHAR_LEN;
1486
1487
/**
1488
 * The recognized text is returned as a char* which is coded
1489
 * as a UTF8 box file.
1490
 * page_number is a 0-base page index that will appear in the box file.
1491
 * Returned string must be freed with the delete [] operator.
1492
 */
1493
0
char *TessBaseAPI::GetBoxText(int page_number) {
1494
0
  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1495
0
    return nullptr;
1496
0
  }
1497
0
  int blob_count;
1498
0
  int utf8_length = TextLength(&blob_count);
1499
0
  int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine;
1500
0
  char *result = new char[total_length];
1501
0
  result[0] = '\0';
1502
0
  int output_length = 0;
1503
0
  LTRResultIterator *it = GetLTRIterator();
1504
0
  do {
1505
0
    int left, top, right, bottom;
1506
0
    if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1507
0
      const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
1508
      // Tesseract uses space for recognition failure. Fix to a reject
1509
      // character, kTesseractReject so we don't create illegal box files.
1510
0
      for (int i = 0; text[i] != '\0'; ++i) {
1511
0
        if (text[i] == ' ') {
1512
0
          text[i] = kTesseractReject;
1513
0
        }
1514
0
      }
1515
0
      snprintf(result + output_length, total_length - output_length, "%s %d %d %d %d %d\n",
1516
0
               text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number);
1517
0
      output_length += strlen(result + output_length);
1518
      // Just in case...
1519
0
      if (output_length + kMaxBytesPerLine > total_length) {
1520
0
        break;
1521
0
      }
1522
0
    }
1523
0
  } while (it->Next(RIL_SYMBOL));
1524
0
  delete it;
1525
0
  return result;
1526
0
}
1527
1528
/**
 * Conversion table for non-latin characters.
 * Maps characters out of the latin set into the latin set; entry i of
 * kUniChs corresponds to entry i of kLatinChs, and both are 0-terminated.
 * TODO(rays) incorporate this translation into unicharset.
 */
const int kUniChs[] = {
    0x20ac, // euro sign
    0x201c, // left double quotation mark
    0x201d, // right double quotation mark
    0x2018, // left single quotation mark
    0x2019, // right single quotation mark
    0x2022, // bullet
    0x2014, // em dash
    0,      // terminator
};
/** Latin chars corresponding to the unicode chars above. */
const int kLatinChs[] = {
    0x00a2, // cent sign
    0x0022, // quotation mark
    0x0022, // quotation mark
    0x0027, // apostrophe
    0x0027, // apostrophe
    0x00b7, // middle dot
    0x002d, // hyphen-minus
    0,      // terminator
};
1536
1537
/**
1538
 * The recognized text is returned as a char* which is coded
1539
 * as UNLV format Latin-1 with specific reject and suspect codes.
1540
 * Returned string must be freed with the delete [] operator.
1541
 */
1542
0
char *TessBaseAPI::GetUNLVText() {
1543
0
  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1544
0
    return nullptr;
1545
0
  }
1546
0
  bool tilde_crunch_written = false;
1547
0
  bool last_char_was_newline = true;
1548
0
  bool last_char_was_tilde = false;
1549
1550
0
  int total_length = TextLength(nullptr);
1551
0
  PAGE_RES_IT page_res_it(page_res_);
1552
0
  char *result = new char[total_length];
1553
0
  char *ptr = result;
1554
0
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
1555
0
    WERD_RES *word = page_res_it.word();
1556
    // Process the current word.
1557
0
    if (word->unlv_crunch_mode != CR_NONE) {
1558
0
      if (word->unlv_crunch_mode != CR_DELETE &&
1559
0
          (!tilde_crunch_written ||
1560
0
           (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 &&
1561
0
            !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
1562
0
        if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) &&
1563
0
            !word->word->flag(W_FUZZY_SP)) {
1564
          /* Write a space to separate from preceding good text */
1565
0
          *ptr++ = ' ';
1566
0
          last_char_was_tilde = false;
1567
0
        }
1568
0
        if (!last_char_was_tilde) {
1569
          // Write a reject char.
1570
0
          last_char_was_tilde = true;
1571
0
          *ptr++ = kUNLVReject;
1572
0
          tilde_crunch_written = true;
1573
0
          last_char_was_newline = false;
1574
0
        }
1575
0
      }
1576
0
    } else {
1577
      // NORMAL PROCESSING of non tilde crunched words.
1578
0
      tilde_crunch_written = false;
1579
0
      tesseract_->set_unlv_suspects(word);
1580
0
      const char *wordstr = word->best_choice->unichar_string().c_str();
1581
0
      const auto &lengths = word->best_choice->unichar_lengths();
1582
0
      int length = lengths.length();
1583
0
      int i = 0;
1584
0
      int offset = 0;
1585
1586
0
      if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') {
1587
        // Prevent adjacent tilde across words - we know that adjacent tildes
1588
        // within words have been removed.
1589
        // Skip the first character.
1590
0
        offset = lengths[i++];
1591
0
      }
1592
0
      if (i < length && wordstr[offset] != 0) {
1593
0
        if (!last_char_was_newline) {
1594
0
          *ptr++ = ' ';
1595
0
        } else {
1596
0
          last_char_was_newline = false;
1597
0
        }
1598
0
        for (; i < length; offset += lengths[i++]) {
1599
0
          if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) {
1600
0
            *ptr++ = kUNLVReject;
1601
0
            last_char_was_tilde = true;
1602
0
          } else {
1603
0
            if (word->reject_map[i].rejected()) {
1604
0
              *ptr++ = kUNLVSuspect;
1605
0
            }
1606
0
            UNICHAR ch(wordstr + offset, lengths[i]);
1607
0
            int uni_ch = ch.first_uni();
1608
0
            for (int j = 0; kUniChs[j] != 0; ++j) {
1609
0
              if (kUniChs[j] == uni_ch) {
1610
0
                uni_ch = kLatinChs[j];
1611
0
                break;
1612
0
              }
1613
0
            }
1614
0
            if (uni_ch <= 0xff) {
1615
0
              *ptr++ = static_cast<char>(uni_ch);
1616
0
              last_char_was_tilde = false;
1617
0
            } else {
1618
0
              *ptr++ = kUNLVReject;
1619
0
              last_char_was_tilde = true;
1620
0
            }
1621
0
          }
1622
0
        }
1623
0
      }
1624
0
    }
1625
0
    if (word->word->flag(W_EOL) && !last_char_was_newline) {
1626
      /* Add a new line output */
1627
0
      *ptr++ = '\n';
1628
0
      tilde_crunch_written = false;
1629
0
      last_char_was_newline = true;
1630
0
      last_char_was_tilde = false;
1631
0
    }
1632
0
  }
1633
0
  *ptr++ = '\n';
1634
0
  *ptr = '\0';
1635
0
  return result;
1636
0
}
1637
1638
#ifndef DISABLED_LEGACY_ENGINE
1639
1640
/**
1641
 * Detect the orientation of the input image and apparent script (alphabet).
1642
 * orient_deg is the detected clockwise rotation of the input image in degrees
1643
 * (0, 90, 180, 270)
1644
 * orient_conf is the confidence (15.0 is reasonably confident)
1645
 * script_name is an ASCII string, the name of the script, e.g. "Latin"
1646
 * script_conf is confidence level in the script
1647
 * Returns true on success and writes values to each parameter as an output
1648
 */
1649
bool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf,
1650
0
                                          const char **script_name, float *script_conf) {
1651
0
  OSResults osr;
1652
1653
0
  bool osd = DetectOS(&osr);
1654
0
  if (!osd) {
1655
0
    return false;
1656
0
  }
1657
1658
0
  int orient_id = osr.best_result.orientation_id;
1659
0
  int script_id = osr.get_best_script(orient_id);
1660
0
  if (orient_conf) {
1661
0
    *orient_conf = osr.best_result.oconfidence;
1662
0
  }
1663
0
  if (orient_deg) {
1664
0
    *orient_deg = orient_id * 90; // convert quadrant to degrees
1665
0
  }
1666
1667
0
  if (script_name) {
1668
0
    const char *script = osr.unicharset->get_script_from_script_id(script_id);
1669
1670
0
    *script_name = script;
1671
0
  }
1672
1673
0
  if (script_conf) {
1674
0
    *script_conf = osr.best_result.sconfidence;
1675
0
  }
1676
1677
0
  return true;
1678
0
}
1679
1680
/**
1681
 * The recognized text is returned as a char* which is coded
1682
 * as UTF8 and must be freed with the delete [] operator.
1683
 * page_number is a 0-based page index that will appear in the osd file.
1684
 */
1685
0
char *TessBaseAPI::GetOsdText(int page_number) {
1686
0
  int orient_deg;
1687
0
  float orient_conf;
1688
0
  const char *script_name;
1689
0
  float script_conf;
1690
1691
0
  if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf)) {
1692
0
    return nullptr;
1693
0
  }
1694
1695
  // clockwise rotation needed to make the page upright
1696
0
  int rotate = OrientationIdToValue(orient_deg / 90);
1697
1698
0
  std::stringstream stream;
1699
  // Use "C" locale (needed for float values orient_conf and script_conf).
1700
0
  stream.imbue(std::locale::classic());
1701
  // Use fixed notation with 2 digits after the decimal point for float values.
1702
0
  stream.precision(2);
1703
0
  stream << std::fixed << "Page number: " << page_number << "\n"
1704
0
         << "Orientation in degrees: " << orient_deg << "\n"
1705
0
         << "Rotate: " << rotate << "\n"
1706
0
         << "Orientation confidence: " << orient_conf << "\n"
1707
0
         << "Script: " << script_name << "\n"
1708
0
         << "Script confidence: " << script_conf << "\n";
1709
0
  return copy_string(stream.str());
1710
0
}
1711
1712
#endif // ndef DISABLED_LEGACY_ENGINE
1713
1714
/** Returns the average word confidence for Tesseract page result. */
1715
0
int TessBaseAPI::MeanTextConf() {
1716
0
  int *conf = AllWordConfidences();
1717
0
  if (!conf) {
1718
0
    return 0;
1719
0
  }
1720
0
  int sum = 0;
1721
0
  int *pt = conf;
1722
0
  while (*pt >= 0) {
1723
0
    sum += *pt++;
1724
0
  }
1725
0
  if (pt != conf) {
1726
0
    sum /= pt - conf;
1727
0
  }
1728
0
  delete[] conf;
1729
0
  return sum;
1730
0
}
1731
1732
/** Returns an array of all word confidences, terminated by -1. */
1733
0
int *TessBaseAPI::AllWordConfidences() {
1734
0
  if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1735
0
    return nullptr;
1736
0
  }
1737
0
  int n_word = 0;
1738
0
  PAGE_RES_IT res_it(page_res_);
1739
0
  for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1740
0
    n_word++;
1741
0
  }
1742
1743
0
  int *conf = new int[n_word + 1];
1744
0
  n_word = 0;
1745
0
  for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1746
0
    WERD_RES *word = res_it.word();
1747
0
    WERD_CHOICE *choice = word->best_choice;
1748
0
    int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1749
    // This is the eq for converting Tesseract confidence to 1..100
1750
0
    if (w_conf < 0) {
1751
0
      w_conf = 0;
1752
0
    }
1753
0
    if (w_conf > 100) {
1754
0
      w_conf = 100;
1755
0
    }
1756
0
    conf[n_word++] = w_conf;
1757
0
  }
1758
0
  conf[n_word] = -1;
1759
0
  return conf;
1760
0
}
1761
1762
#ifndef DISABLED_LEGACY_ENGINE
1763
/**
1764
 * Applies the given word to the adaptive classifier if possible.
1765
 * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
1766
 * tell the boundaries of the graphemes.
1767
 * Assumes that SetImage/SetRectangle have been used to set the image
1768
 * to the given word. The mode arg should be PSM_SINGLE_WORD or
1769
 * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
1770
 * The currently set PageSegMode is preserved.
1771
 * Returns false if adaption was not possible for some reason.
1772
 */
1773
0
bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) {
1774
0
  int debug = 0;
1775
0
  GetIntVariable("applybox_debug", &debug);
1776
0
  bool success = true;
1777
0
  PageSegMode current_psm = GetPageSegMode();
1778
0
  SetPageSegMode(mode);
1779
0
  SetVariable("classify_enable_learning", "0");
1780
0
  const std::unique_ptr<const char[]> text(GetUTF8Text());
1781
0
  if (debug) {
1782
0
    tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
1783
0
  }
1784
0
  if (text != nullptr) {
1785
0
    PAGE_RES_IT it(page_res_);
1786
0
    WERD_RES *word_res = it.word();
1787
0
    if (word_res != nullptr) {
1788
0
      word_res->word->set_text(wordstr);
1789
      // Check to see if text matches wordstr.
1790
0
      int w = 0;
1791
0
      int t;
1792
0
      for (t = 0; text[t] != '\0'; ++t) {
1793
0
        if (text[t] == '\n' || text[t] == ' ') {
1794
0
          continue;
1795
0
        }
1796
0
        while (wordstr[w] == ' ') {
1797
0
          ++w;
1798
0
        }
1799
0
        if (text[t] != wordstr[w]) {
1800
0
          break;
1801
0
        }
1802
0
        ++w;
1803
0
      }
1804
0
      if (text[t] != '\0' || wordstr[w] != '\0') {
1805
        // No match.
1806
0
        delete page_res_;
1807
0
        std::vector<TBOX> boxes;
1808
0
        page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_);
1809
0
        tesseract_->ReSegmentByClassification(page_res_);
1810
0
        tesseract_->TidyUp(page_res_);
1811
0
        PAGE_RES_IT pr_it(page_res_);
1812
0
        if (pr_it.word() == nullptr) {
1813
0
          success = false;
1814
0
        } else {
1815
0
          word_res = pr_it.word();
1816
0
        }
1817
0
      } else {
1818
0
        word_res->BestChoiceToCorrectText();
1819
0
      }
1820
0
      if (success) {
1821
0
        tesseract_->EnableLearning = true;
1822
0
        tesseract_->LearnWord(nullptr, word_res);
1823
0
      }
1824
0
    } else {
1825
0
      success = false;
1826
0
    }
1827
0
  } else {
1828
0
    success = false;
1829
0
  }
1830
0
  SetPageSegMode(current_psm);
1831
0
  return success;
1832
0
}
1833
#endif // ndef DISABLED_LEGACY_ENGINE
1834
1835
/**
1836
 * Free up recognition results and any stored image data, without actually
1837
 * freeing any recognition data that would be time-consuming to reload.
1838
 * Afterwards, you must call SetImage or TesseractRect before doing
1839
 * any Recognize or Get* operation.
1840
 */
1841
0
void TessBaseAPI::Clear() {
1842
0
  if (thresholder_ != nullptr) {
1843
0
    thresholder_->Clear();
1844
0
  }
1845
0
  ClearResults();
1846
0
  if (tesseract_ != nullptr) {
1847
0
    SetInputImage(nullptr);
1848
0
  }
1849
0
}
1850
1851
/**
1852
 * Close down tesseract and free up all memory. End() is equivalent to
1853
 * destructing and reconstructing your TessBaseAPI.
1854
 * Once End() has been used, none of the other API functions may be used
1855
 * other than Init and anything declared above it in the class definition.
1856
 */
1857
0
void TessBaseAPI::End() {
1858
0
  Clear();
1859
0
  delete thresholder_;
1860
0
  thresholder_ = nullptr;
1861
0
  delete page_res_;
1862
0
  page_res_ = nullptr;
1863
0
  delete block_list_;
1864
0
  block_list_ = nullptr;
1865
0
  if (paragraph_models_ != nullptr) {
1866
0
    for (auto model : *paragraph_models_) {
1867
0
      delete model;
1868
0
    }
1869
0
    delete paragraph_models_;
1870
0
    paragraph_models_ = nullptr;
1871
0
  }
1872
0
#ifndef DISABLED_LEGACY_ENGINE
1873
0
  if (osd_tesseract_ == tesseract_) {
1874
0
    osd_tesseract_ = nullptr;
1875
0
  }
1876
0
  delete osd_tesseract_;
1877
0
  osd_tesseract_ = nullptr;
1878
0
  delete equ_detect_;
1879
0
  equ_detect_ = nullptr;
1880
0
#endif // ndef DISABLED_LEGACY_ENGINE
1881
0
  delete tesseract_;
1882
0
  tesseract_ = nullptr;
1883
0
  input_file_.clear();
1884
0
  output_file_.clear();
1885
0
  datapath_.clear();
1886
0
  language_.clear();
1887
0
}
1888
1889
// Clear any library-level memory caches.
1890
// There are a variety of expensive-to-load constant data structures (mostly
1891
// language dictionaries) that are cached globally -- surviving the Init()
1892
// and End() of individual TessBaseAPI's.  This function allows the clearing
1893
// of these caches.
1894
0
void TessBaseAPI::ClearPersistentCache() {
1895
0
  Dict::GlobalDawgCache()->DeleteUnusedDawgs();
1896
0
}
1897
1898
/**
1899
 * Check whether a word is valid according to Tesseract's language model
1900
 * returns 0 if the word is invalid, non-zero if valid
1901
 */
1902
0
int TessBaseAPI::IsValidWord(const char *word) const {
1903
0
  return tesseract_->getDict().valid_word(word);
1904
0
}
1905
// Returns true if utf8_character is defined in the UniCharset.
1906
0
bool TessBaseAPI::IsValidCharacter(const char *utf8_character) const {
1907
0
  return tesseract_->unicharset.contains_unichar(utf8_character);
1908
0
}
1909
1910
// TODO(rays) Obsolete this function and replace with a more aptly named
1911
// function that returns image coordinates rather than tesseract coordinates.
1912
0
bool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) {
1913
0
  const std::unique_ptr<const PageIterator> it(AnalyseLayout());
1914
0
  if (it == nullptr) {
1915
0
    return false;
1916
0
  }
1917
0
  int x1, x2, y1, y2;
1918
0
  it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
1919
  // Calculate offset and slope (NOTE: Kind of ugly)
1920
0
  if (x2 <= x1) {
1921
0
    x2 = x1 + 1;
1922
0
  }
1923
  // Convert the point pair to slope/offset of the baseline (in image coords.)
1924
0
  *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
1925
0
  *out_offset = static_cast<int>(y1 - *out_slope * x1);
1926
  // Get the y-coord of the baseline at the left and right edges of the
1927
  // textline's bounding box.
1928
0
  int left, top, right, bottom;
1929
0
  if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
1930
0
    return false;
1931
0
  }
1932
0
  int left_y = IntCastRounded(*out_slope * left + *out_offset);
1933
0
  int right_y = IntCastRounded(*out_slope * right + *out_offset);
1934
  // Shift the baseline down so it passes through the nearest bottom-corner
1935
  // of the textline's bounding box. This is the difference between the y
1936
  // at the lowest (max) edge of the box and the actual box bottom.
1937
0
  *out_offset += bottom - std::max(left_y, right_y);
1938
  // Switch back to bottom-up tesseract coordinates. Requires negation of
1939
  // the slope and height - offset for the offset.
1940
0
  *out_slope = -*out_slope;
1941
0
  *out_offset = rect_height_ - *out_offset;
1942
1943
0
  return true;
1944
0
}
1945
1946
/** Sets Dict::letter_is_okay_ function to point to the given function. */
1947
0
void TessBaseAPI::SetDictFunc(DictFunc f) {
1948
0
  if (tesseract_ != nullptr) {
1949
0
    tesseract_->getDict().letter_is_okay_ = f;
1950
0
  }
1951
0
}
1952
1953
/**
1954
 * Sets Dict::probability_in_context_ function to point to the given
1955
 * function.
1956
 *
1957
 * @param f A single function that returns the probability of the current
1958
 * "character" (in general a utf-8 string), given the context of a previous
1959
 * utf-8 string.
1960
 */
1961
0
void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) {
1962
0
  if (tesseract_ != nullptr) {
1963
0
    tesseract_->getDict().probability_in_context_ = f;
1964
    // Set it for the sublangs too.
1965
0
    int num_subs = tesseract_->num_sub_langs();
1966
0
    for (int i = 0; i < num_subs; ++i) {
1967
0
      tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f;
1968
0
    }
1969
0
  }
1970
0
}
1971
1972
/** Common code for setting the image. */
1973
7.74k
bool TessBaseAPI::InternalSetImage() {
1974
7.74k
  if (tesseract_ == nullptr) {
1975
0
    tprintf("Please call Init before attempting to set an image.\n");
1976
0
    return false;
1977
0
  }
1978
7.74k
  if (thresholder_ == nullptr) {
1979
1
    thresholder_ = new ImageThresholder;
1980
1
  }
1981
7.74k
  ClearResults();
1982
7.74k
  return true;
1983
7.74k
}
1984
1985
/**
1986
 * Run the thresholder to make the thresholded image, returned in pix,
1987
 * which must not be nullptr. *pix must be initialized to nullptr, or point
1988
 * to an existing pixDestroyable Pix.
1989
 * The usual argument to Threshold is Tesseract::mutable_pix_binary().
1990
 */
1991
7.74k
bool TessBaseAPI::Threshold(Pix **pix) {
1992
7.74k
  ASSERT_HOST(pix != nullptr);
1993
7.74k
  if (*pix != nullptr) {
1994
0
    pixDestroy(pix);
1995
0
  }
1996
  // Zero resolution messes up the algorithms, so make sure it is credible.
1997
7.74k
  int user_dpi = 0;
1998
7.74k
  GetIntVariable("user_defined_dpi", &user_dpi);
1999
7.74k
  int y_res = thresholder_->GetScaledYResolution();
2000
7.74k
  if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) {
2001
0
    tprintf(
2002
0
        "Warning: User defined image dpi is outside of expected range "
2003
0
        "(%d - %d)!\n",
2004
0
        kMinCredibleResolution, kMaxCredibleResolution);
2005
0
  }
2006
  // Always use user defined dpi
2007
7.74k
  if (user_dpi) {
2008
0
    thresholder_->SetSourceYResolution(user_dpi);
2009
7.74k
  } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
2010
7.74k
    if (y_res != 0) {
2011
      // Show warning only if a resolution was given.
2012
0
      tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n",
2013
0
              y_res, kMinCredibleResolution);
2014
0
    }
2015
7.74k
    thresholder_->SetSourceYResolution(kMinCredibleResolution);
2016
7.74k
  }
2017
2018
7.74k
  auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method));
2019
2020
7.74k
  if (thresholding_method == ThresholdMethod::Otsu) {
2021
7.74k
    Image pix_binary(*pix);
2022
7.74k
    if (!thresholder_->ThresholdToPix(&pix_binary)) {
2023
0
      return false;
2024
0
    }
2025
7.74k
    *pix = pix_binary;
2026
2027
7.74k
    if (!thresholder_->IsBinary()) {
2028
0
      tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds());
2029
0
      tesseract_->set_pix_grey(thresholder_->GetPixRectGrey());
2030
7.74k
    } else {
2031
7.74k
      tesseract_->set_pix_thresholds(nullptr);
2032
7.74k
      tesseract_->set_pix_grey(nullptr);
2033
7.74k
    }
2034
7.74k
  } else {
2035
0
    auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method);
2036
2037
0
    if (!ok) {
2038
0
      return false;
2039
0
    }
2040
0
    *pix = pix_binary;
2041
2042
0
    tesseract_->set_pix_thresholds(pix_thresholds);
2043
0
    tesseract_->set_pix_grey(pix_grey);
2044
0
  }
2045
2046
7.74k
  thresholder_->GetImageSizes(&rect_left_, &rect_top_, &rect_width_, &rect_height_, &image_width_,
2047
7.74k
                              &image_height_);
2048
2049
  // Set the internal resolution that is used for layout parameters from the
2050
  // estimated resolution, rather than the image resolution, which may be
2051
  // fabricated, but we will use the image resolution, if there is one, to
2052
  // report output point sizes.
2053
7.74k
  int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
2054
7.74k
                                  kMinCredibleResolution, kMaxCredibleResolution);
2055
7.74k
  if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
2056
0
    tprintf(
2057
0
        "Estimated internal resolution %d out of range! "
2058
0
        "Corrected to %d.\n",
2059
0
        thresholder_->GetScaledEstimatedResolution(), estimated_res);
2060
0
  }
2061
7.74k
  tesseract_->set_source_resolution(estimated_res);
2062
7.74k
  return true;
2063
7.74k
}
2064
2065
/** Find lines from the image making the BLOCK_LIST. */
2066
7.74k
int TessBaseAPI::FindLines() {
2067
7.74k
  if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
2068
0
    tprintf("Please call SetImage before attempting recognition.\n");
2069
0
    return -1;
2070
0
  }
2071
7.74k
  if (recognition_done_) {
2072
0
    ClearResults();
2073
0
  }
2074
7.74k
  if (!block_list_->empty()) {
2075
0
    return 0;
2076
0
  }
2077
7.74k
  if (tesseract_ == nullptr) {
2078
0
    tesseract_ = new Tesseract;
2079
0
#ifndef DISABLED_LEGACY_ENGINE
2080
0
    tesseract_->InitAdaptiveClassifier(nullptr);
2081
0
#endif
2082
0
  }
2083
7.74k
  if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2084
0
    return -1;
2085
0
  }
2086
2087
7.74k
  tesseract_->PrepareForPageseg();
2088
2089
7.74k
#ifndef DISABLED_LEGACY_ENGINE
2090
7.74k
  if (tesseract_->textord_equation_detect) {
2091
0
    if (equ_detect_ == nullptr && !datapath_.empty()) {
2092
0
      equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr);
2093
0
    }
2094
0
    if (equ_detect_ == nullptr) {
2095
0
      tprintf("Warning: Could not set equation detector\n");
2096
0
    } else {
2097
0
      tesseract_->SetEquationDetect(equ_detect_);
2098
0
    }
2099
0
  }
2100
7.74k
#endif // ndef DISABLED_LEGACY_ENGINE
2101
2102
7.74k
  Tesseract *osd_tess = osd_tesseract_;
2103
7.74k
  OSResults osr;
2104
7.74k
#ifndef DISABLED_LEGACY_ENGINE
2105
7.74k
  if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) {
2106
0
    if (strcmp(language_.c_str(), "osd") == 0) {
2107
0
      osd_tess = tesseract_;
2108
0
    } else {
2109
0
      osd_tesseract_ = new Tesseract;
2110
0
      TessdataManager mgr(reader_);
2111
0
      if (datapath_.empty()) {
2112
0
        tprintf(
2113
0
            "Warning: Auto orientation and script detection requested,"
2114
0
            " but data path is undefined\n");
2115
0
        delete osd_tesseract_;
2116
0
        osd_tesseract_ = nullptr;
2117
0
      } else if (osd_tesseract_->init_tesseract(datapath_, "", "osd", OEM_TESSERACT_ONLY,
2118
0
                                                nullptr, 0, nullptr, nullptr, false, &mgr) == 0) {
2119
0
        osd_tess = osd_tesseract_;
2120
0
        osd_tesseract_->set_source_resolution(thresholder_->GetSourceYResolution());
2121
0
      } else {
2122
0
        tprintf(
2123
0
            "Warning: Auto orientation and script detection requested,"
2124
0
            " but osd language failed to load\n");
2125
0
        delete osd_tesseract_;
2126
0
        osd_tesseract_ = nullptr;
2127
0
      }
2128
0
    }
2129
0
  }
2130
7.74k
#endif // ndef DISABLED_LEGACY_ENGINE
2131
2132
7.74k
  if (tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) {
2133
0
    return -1;
2134
0
  }
2135
2136
  // If Devanagari is being recognized, we use different images for page seg
2137
  // and for OCR.
2138
7.74k
  tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
2139
7.74k
  return 0;
2140
7.74k
}
2141
2142
/**
2143
 * Return average gradient of lines on page.
2144
 */
2145
0
float TessBaseAPI::GetGradient() {
2146
0
  return tesseract_->gradient();
2147
0
}
2148
2149
/** Delete the pageres and clear the block list ready for a new page. */
2150
7.74k
void TessBaseAPI::ClearResults() {
2151
7.74k
  if (tesseract_ != nullptr) {
2152
7.74k
    tesseract_->Clear();
2153
7.74k
  }
2154
7.74k
  delete page_res_;
2155
7.74k
  page_res_ = nullptr;
2156
7.74k
  recognition_done_ = false;
2157
7.74k
  if (block_list_ == nullptr) {
2158
1
    block_list_ = new BLOCK_LIST;
2159
7.74k
  } else {
2160
7.74k
    block_list_->clear();
2161
7.74k
  }
2162
7.74k
  if (paragraph_models_ != nullptr) {
2163
6.88k
    for (auto model : *paragraph_models_) {
2164
1.39k
      delete model;
2165
1.39k
    }
2166
6.88k
    delete paragraph_models_;
2167
6.88k
    paragraph_models_ = nullptr;
2168
6.88k
  }
2169
7.74k
}
2170
2171
/**
2172
 * Return the length of the output text string, as UTF8, assuming
2173
 * liberally two spacing marks after each word (as paragraphs end with two
2174
 * newlines), and assuming a single character reject marker for each rejected
2175
 * character.
2176
 * Also return the number of recognized blobs in blob_count.
2177
 */
2178
0
int TessBaseAPI::TextLength(int *blob_count) const {
2179
0
  if (tesseract_ == nullptr || page_res_ == nullptr) {
2180
0
    return 0;
2181
0
  }
2182
2183
0
  PAGE_RES_IT page_res_it(page_res_);
2184
0
  int total_length = 2;
2185
0
  int total_blobs = 0;
2186
  // Iterate over the data structures to extract the recognition result.
2187
0
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2188
0
    WERD_RES *word = page_res_it.word();
2189
0
    WERD_CHOICE *choice = word->best_choice;
2190
0
    if (choice != nullptr) {
2191
0
      total_blobs += choice->length() + 2;
2192
0
      total_length += choice->unichar_string().length() + 2;
2193
0
      for (int i = 0; i < word->reject_map.length(); ++i) {
2194
0
        if (word->reject_map[i].rejected()) {
2195
0
          ++total_length;
2196
0
        }
2197
0
      }
2198
0
    }
2199
0
  }
2200
0
  if (blob_count != nullptr) {
2201
0
    *blob_count = total_blobs;
2202
0
  }
2203
0
  return total_length;
2204
0
}
2205
2206
#ifndef DISABLED_LEGACY_ENGINE
2207
/**
2208
 * Estimates the Orientation And Script of the image.
2209
 * Returns true if the image was processed successfully.
2210
 */
2211
0
bool TessBaseAPI::DetectOS(OSResults *osr) {
2212
0
  if (tesseract_ == nullptr) {
2213
0
    return false;
2214
0
  }
2215
0
  ClearResults();
2216
0
  if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2217
0
    return false;
2218
0
  }
2219
2220
0
  if (input_file_.empty()) {
2221
0
    input_file_ = kInputFile;
2222
0
  }
2223
0
  return orientation_and_script_detection(input_file_.c_str(), osr, tesseract_) > 0;
2224
0
}
2225
#endif // #ifndef DISABLED_LEGACY_ENGINE
2226
2227
0
void TessBaseAPI::set_min_orientation_margin(double margin) {
2228
0
  tesseract_->min_orientation_margin.set_value(margin);
2229
0
}
2230
2231
/**
2232
 * Return text orientation of each block as determined in an earlier page layout
2233
 * analysis operation. Orientation is returned as the number of ccw 90-degree
2234
 * rotations (in [0..3]) required to make the text in the block upright
2235
 * (readable). Note that this may not necessary be the block orientation
2236
 * preferred for recognition (such as the case of vertical CJK text).
2237
 *
2238
 * Also returns whether the text in the block is believed to have vertical
2239
 * writing direction (when in an upright page orientation).
2240
 *
2241
 * The returned array is of length equal to the number of text blocks, which may
2242
 * be less than the total number of blocks. The ordering is intended to be
2243
 * consistent with GetTextLines().
2244
 */
2245
0
void TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) {
2246
0
  delete[] * block_orientation;
2247
0
  *block_orientation = nullptr;
2248
0
  delete[] * vertical_writing;
2249
0
  *vertical_writing = nullptr;
2250
0
  BLOCK_IT block_it(block_list_);
2251
2252
0
  block_it.move_to_first();
2253
0
  int num_blocks = 0;
2254
0
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2255
0
    if (!block_it.data()->pdblk.poly_block()->IsText()) {
2256
0
      continue;
2257
0
    }
2258
0
    ++num_blocks;
2259
0
  }
2260
0
  if (!num_blocks) {
2261
0
    tprintf("WARNING: Found no blocks\n");
2262
0
    return;
2263
0
  }
2264
0
  *block_orientation = new int[num_blocks];
2265
0
  *vertical_writing = new bool[num_blocks];
2266
0
  block_it.move_to_first();
2267
0
  int i = 0;
2268
0
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2269
0
    if (!block_it.data()->pdblk.poly_block()->IsText()) {
2270
0
      continue;
2271
0
    }
2272
0
    FCOORD re_rotation = block_it.data()->re_rotation();
2273
0
    float re_theta = re_rotation.angle();
2274
0
    FCOORD classify_rotation = block_it.data()->classify_rotation();
2275
0
    float classify_theta = classify_rotation.angle();
2276
0
    double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI;
2277
0
    if (rot_theta < 0) {
2278
0
      rot_theta += 4;
2279
0
    }
2280
0
    int num_rotations = static_cast<int>(rot_theta + 0.5);
2281
0
    (*block_orientation)[i] = num_rotations;
2282
    // The classify_rotation is non-zero only if the text has vertical
2283
    // writing direction.
2284
0
    (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
2285
0
    ++i;
2286
0
  }
2287
0
}
2288
2289
6.88k
void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
2290
6.88k
  int debug_level = 0;
2291
6.88k
  GetIntVariable("paragraph_debug_level", &debug_level);
2292
6.88k
  if (paragraph_models_ == nullptr) {
2293
6.88k
    paragraph_models_ = new std::vector<ParagraphModel *>;
2294
6.88k
  }
2295
6.88k
  MutableIterator *result_it = GetMutableIterator();
2296
6.88k
  do { // Detect paragraphs for this block
2297
6.88k
    std::vector<ParagraphModel *> models;
2298
6.88k
    ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it, &models);
2299
6.88k
    paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end());
2300
6.88k
  } while (result_it->Next(RIL_BLOCK));
2301
6.88k
  delete result_it;
2302
6.88k
}
2303
2304
/** This method returns the string form of the specified unichar. */
2305
0
const char *TessBaseAPI::GetUnichar(int unichar_id) const {
2306
0
  return tesseract_->unicharset.id_to_unichar(unichar_id);
2307
0
}
2308
2309
/** Return the pointer to the i-th dawg loaded into tesseract_ object. */
2310
0
const Dawg *TessBaseAPI::GetDawg(int i) const {
2311
0
  if (tesseract_ == nullptr || i >= NumDawgs()) {
2312
0
    return nullptr;
2313
0
  }
2314
0
  return tesseract_->getDict().GetDawg(i);
2315
0
}
2316
2317
/** Return the number of dawgs loaded into tesseract_ object. */
2318
0
int TessBaseAPI::NumDawgs() const {
2319
0
  return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs();
2320
0
}
2321
2322
/**
 * Escape a char string - replace <>&"' with HTML codes.
 * All other characters are copied unchanged. A null text yields an empty
 * string.
 */
std::string HOcrEscape(const char *text) {
  std::string ret;
  if (text == nullptr) {
    return ret;
  }
  for (const char *ptr = text; *ptr; ptr++) {
    switch (*ptr) {
      case '<':
        ret += "&lt;";
        break;
      case '>':
        ret += "&gt;";
        break;
      case '&':
        ret += "&amp;";
        break;
      case '"':
        ret += "&quot;";
        break;
      case '\'':
        ret += "&#39;";
        break;
      default:
        ret += *ptr;
    }
  }
  return ret;
}
2349
2350
} // namespace tesseract