Coverage Report

Created: 2024-02-28 06:46

/src/tesseract/include/tesseract/baseapi.h
Line
Count
Source (jump to first uncovered line)
1
// SPDX-License-Identifier: Apache-2.0
2
// File:        baseapi.h
3
// Description: Simple API for calling tesseract.
4
// Author:      Ray Smith
5
//
6
// (C) Copyright 2006, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
17
#ifndef TESSERACT_API_BASEAPI_H_
18
#define TESSERACT_API_BASEAPI_H_
19
20
#ifdef HAVE_CONFIG_H
21
#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
22
#endif
23
24
#include "export.h"
25
#include "pageiterator.h"
26
#include "publictypes.h"
27
#include "resultiterator.h"
28
#include "unichar.h"
29
30
#include <tesseract/version.h>
31
32
#include <cstdio>
33
#include <vector> // for std::vector
34
35
struct Pix;
36
struct Pixa;
37
struct Boxa;
38
39
namespace tesseract {
40
41
class PAGE_RES;
42
class ParagraphModel;
43
class BLOCK_LIST;
44
class ETEXT_DESC;
45
struct OSResults;
46
class UNICHARSET;
47
48
class Dawg;
49
class Dict;
50
class EquationDetect;
51
class PageIterator;
52
class ImageThresholder;
53
class LTRResultIterator;
54
class ResultIterator;
55
class MutableIterator;
56
class TessResultRenderer;
57
class Tesseract;
58
59
// Function to read a whole file into a std::vector<char>.
60
// Returns false on failure.
61
using FileReader = bool (*)(const char *filename, std::vector<char> *data);
62
63
using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID,
64
                               bool) const;
65
using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *,
66
                                                  int, const char *, int);
67
68
/**
69
 * Base class for all tesseract APIs.
70
 * Specific classes can add ability to work on different inputs or produce
71
 * different outputs.
72
 * This class is mostly an interface layer on top of the Tesseract instance
73
 * class to hide the data types so that users of this class don't have to
74
 * include any other Tesseract headers.
75
 */
76
class TESS_API TessBaseAPI {
77
public:
78
  TessBaseAPI();
79
  virtual ~TessBaseAPI();
80
  // Copy constructor and assignment operator are currently unsupported.
81
  TessBaseAPI(TessBaseAPI const &) = delete;
82
  TessBaseAPI &operator=(TessBaseAPI const &) = delete;
83
84
  /**
85
   * Returns the version identifier as a static string. Do not delete.
86
   */
87
  static const char *Version();
88
89
  /**
90
   * If compiled with OpenCL AND an available OpenCL
91
   * device is deemed faster than serial code, then
92
   * "device" is populated with the cl_device_id
93
   * and returns sizeof(cl_device_id)
94
   * otherwise *device=nullptr and returns 0.
95
   */
96
  static size_t getOpenCLDevice(void **device);
97
98
  /**
99
   * Set the name of the input file. Needed for training and
100
   * reading a UNLV zone file, and for searchable PDF output.
101
   */
102
  void SetInputName(const char *name);
103
  /**
104
   * These functions are required for searchable PDF output.
105
   * We need our hands on the input file so that we can include
106
   * it in the PDF without transcoding. If that is not possible,
107
   * we need the original image. Finally, resolution metadata
108
   * is stored in the PDF so we need that as well.
109
   */
110
  const char *GetInputName();
111
  // Takes ownership of the input pix.
112
  void SetInputImage(Pix *pix);
113
  Pix *GetInputImage();
114
  int GetSourceYResolution();
115
  const char *GetDatapath();
116
117
  /** Set the name of the bonus output files. Needed only for debugging. */
118
  void SetOutputName(const char *name);
119
120
  /**
121
   * Set the value of an internal "parameter."
122
   * Supply the name of the parameter and the value as a string, just as
123
   * you would in a config file.
124
   * Returns false if the name lookup failed.
125
   * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z.
126
   * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode.
127
   * SetVariable may be used before Init, but settings will revert to
128
   * defaults on End().
129
   *
130
   * Note: Must be called after Init(). Only works for non-init variables
131
   * (init variables should be passed to Init()).
132
   */
133
  bool SetVariable(const char *name, const char *value);
134
  bool SetDebugVariable(const char *name, const char *value);
135
136
  /**
137
   * Returns true if the parameter was found among Tesseract parameters.
138
   * Fills in value with the value of the parameter.
139
   */
140
  bool GetIntVariable(const char *name, int *value) const;
141
  bool GetBoolVariable(const char *name, bool *value) const;
142
  bool GetDoubleVariable(const char *name, double *value) const;
143
144
  /**
145
   * Returns the pointer to the string that represents the value of the
146
   * parameter if it was found among Tesseract parameters.
147
   */
148
  const char *GetStringVariable(const char *name) const;
149
150
#ifndef DISABLED_LEGACY_ENGINE
151
152
  /**
153
   * Print Tesseract fonts table to the given file.
154
   */
155
  void PrintFontsTable(FILE *fp) const;
156
157
#endif
158
159
  /**
160
   * Print Tesseract parameters to the given file.
161
   */
162
  void PrintVariables(FILE *fp) const;
163
164
  /**
165
   * Get value of named variable as a string, if it exists.
166
   */
167
  bool GetVariableAsString(const char *name, std::string *val) const;
168
169
  /**
170
   * Instances are now mostly thread-safe and totally independent,
171
   * but some global parameters remain. Basically it is safe to use multiple
172
   * TessBaseAPIs in different threads in parallel, UNLESS:
173
   * you use SetVariable on some of the Params in classify and textord.
174
   * If you do, then the effect will be to change it for all your instances.
175
   *
176
   * Start tesseract. Returns zero on success and -1 on failure.
177
   * NOTE that the only members that may be called before Init are those
178
   * listed above here in the class definition.
179
   *
180
   * The datapath must be the name of the tessdata directory.
181
   * The language is (usually) an ISO 639-3 string; nullptr will default to
182
   * eng. It is entirely safe (and eventually will be efficient too) to call
183
   * Init multiple times on the same instance to change language, or just
184
   * to reset the classifier.
185
   * The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating
186
   * that multiple languages are to be loaded. Eg hin+eng will load Hindi and
187
   * English. Languages may specify internally that they want to be loaded
188
   * with one or more other languages, so the ~ sign is available to override
189
   * that. Eg if hin were set to load eng by default, then hin+~eng would force
190
   * loading only hin. The number of loaded languages is limited only by
191
   * memory, with the caveat that loading additional languages will impact
192
   * both speed and accuracy, as there is more work to do to decide on the
193
   * applicable language, and there is more chance of hallucinating incorrect
194
   * words.
195
   * WARNING: On changing languages, all Tesseract parameters are reset
196
   * back to their default values. (Which may vary between languages.)
197
   * If you have a rare need to set a Variable that controls
198
   * initialization for a second call to Init you should explicitly
199
   * call End() and then use SetVariable before Init. This is only a very
200
   * rare use case, since there are very few uses that require any parameters
201
   * to be set before Init.
202
   *
203
   * If set_only_non_debug_params is true, only params that do not contain
204
   * "debug" in the name will be set.
205
   */
206
  int Init(const char *datapath, const char *language, OcrEngineMode mode,
207
           char **configs, int configs_size,
208
           const std::vector<std::string> *vars_vec,
209
           const std::vector<std::string> *vars_values,
210
           bool set_only_non_debug_params);
211
0
  int Init(const char *datapath, const char *language, OcrEngineMode oem) {
212
0
    return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false);
213
0
  }
214
4
  int Init(const char *datapath, const char *language) {
215
4
    return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr,
216
4
                false);
217
4
  }
218
  // In-memory version reads the traineddata file directly from the given
219
  // data[data_size] array, and/or reads data via a FileReader.
220
  int Init(const char *data, int data_size, const char *language,
221
           OcrEngineMode mode, char **configs, int configs_size,
222
           const std::vector<std::string> *vars_vec,
223
           const std::vector<std::string> *vars_values,
224
           bool set_only_non_debug_params, FileReader reader);
225
226
  /**
227
   * Returns the languages string used in the last valid initialization.
228
   * If the last initialization specified "deu+hin" then that will be
229
   * returned. If hin loaded eng automatically as well, then that will
230
   * not be included in this list. To find the languages actually
231
   * loaded use GetLoadedLanguagesAsVector.
232
   * The returned string should NOT be deleted.
233
   */
234
  const char *GetInitLanguagesAsString() const;
235
236
  /**
237
   * Returns the loaded languages in the vector of std::string.
238
   * Includes all languages loaded by the last Init, including those loaded
239
   * as dependencies of other loaded languages.
240
   */
241
  void GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const;
242
243
  /**
244
   * Returns the available languages in the sorted vector of std::string.
245
   */
246
  void GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const;
247
248
  /**
249
   * Init only for page layout analysis. Use only for calls to SetImage and
250
   * AnalysePage. Calls that attempt recognition will generate an error.
251
   */
252
  void InitForAnalysePage();
253
254
  /**
255
   * Read a "config" file containing a set of param, value pairs.
256
   * Searches the standard places: tessdata/configs, tessdata/tessconfigs
257
   * and also accepts a relative or absolute path name.
258
   * Note: only non-init params will be set (init params are set by Init()).
259
   */
260
  void ReadConfigFile(const char *filename);
261
  /** Same as above, but only set debug params from the given config file. */
262
  void ReadDebugConfigFile(const char *filename);
263
264
  /**
265
   * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK.
266
   * The mode is stored as an IntParam so it can also be modified by
267
   * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
268
   */
269
  void SetPageSegMode(PageSegMode mode);
270
271
  /** Return the current page segmentation mode. */
272
  PageSegMode GetPageSegMode() const;
273
274
  /**
275
   * Recognize a rectangle from an image and return the result as a string.
276
   * May be called many times for a single Init.
277
   * Currently has no error checking.
278
   * Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
279
   * Palette color images will not work properly and must be converted to
280
   * 24 bit.
281
   * Binary images of 1 bit per pixel may also be given but they must be
282
   * byte packed with the MSB of the first byte being the first pixel, and a
283
   * 1 represents WHITE. For binary images set bytes_per_pixel=0.
284
   * The recognized text is returned as a char* which is coded
285
   * as UTF8 and must be freed with the delete [] operator.
286
   *
287
   * Note that TesseractRect is the simplified convenience interface.
288
   * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize,
289
   * and one or more of the Get*Text functions below.
290
   */
291
  char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
292
                      int bytes_per_line, int left, int top, int width,
293
                      int height);
294
295
  /**
296
   * Call between pages or documents etc to free up memory and forget
297
   * adaptive data.
298
   */
299
  void ClearAdaptiveClassifier();
300
301
  /**
302
   * @defgroup AdvancedAPI Advanced API
303
   * The following methods break TesseractRect into pieces, so you can
304
   * get hold of the thresholded image, get the text in different formats,
305
   * get bounding boxes, confidences etc.
306
   */
307
  /* @{ */
308
309
  /**
310
   * Provide an image for Tesseract to recognize. Format is as
311
   * TesseractRect above. Copies the image buffer and converts to Pix.
312
   * SetImage clears all recognition results, and sets the rectangle to the
313
   * full image, so it may be followed immediately by a GetUTF8Text, and it
314
   * will automatically perform recognition.
315
   */
316
  void SetImage(const unsigned char *imagedata, int width, int height,
317
                int bytes_per_pixel, int bytes_per_line);
318
319
  /**
320
   * Provide an image for Tesseract to recognize. As with SetImage above,
321
   * Tesseract takes its own copy of the image, so it need not persist until
322
   * after Recognize.
323
   * Pix vs raw, which to use?
324
   * Use Pix where possible. Tesseract uses Pix as its internal representation
325
   * and it is therefore more efficient to provide a Pix directly.
326
   */
327
  void SetImage(Pix *pix);
328
329
  /**
330
   * Set the resolution of the source image in pixels per inch so font size
331
   * information can be calculated in results.  Call this after SetImage().
332
   */
333
  void SetSourceResolution(int ppi);
334
335
  /**
336
   * Restrict recognition to a sub-rectangle of the image. Call after SetImage.
337
   * Each SetRectangle clears the recognition results so multiple rectangles
338
   * can be recognized with the same image.
339
   */
340
  void SetRectangle(int left, int top, int width, int height);
341
342
  /**
343
   * Get a copy of the internal thresholded image from Tesseract.
344
   * Caller takes ownership of the Pix and must pixDestroy it.
345
   * May be called any time after SetImage, or after TesseractRect.
346
   */
347
  Pix *GetThresholdedImage();
348
349
  /**
350
   * Get the result of page layout analysis as a leptonica-style
351
   * Boxa, Pixa pair, in reading order.
352
   * Can be called before or after Recognize.
353
   */
354
  Boxa *GetRegions(Pixa **pixa);
355
356
  /**
357
   * Get the textlines as a leptonica-style
358
   * Boxa, Pixa pair, in reading order.
359
   * Can be called before or after Recognize.
360
   * If raw_image is true, then extract from the original image instead of the
361
   * thresholded image and pad by raw_padding pixels.
362
   * If blockids is not nullptr, the block-id of each line is also returned as
363
   * an array of one element per line. delete [] after use. If paraids is not
364
   * nullptr, the paragraph-id of each line within its block is also returned as
365
   * an array of one element per line. delete [] after use.
366
   */
367
  Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa,
368
                     int **blockids, int **paraids);
369
  /*
370
   Helper method to extract from the thresholded image. (most common usage)
371
*/
372
0
  Boxa *GetTextlines(Pixa **pixa, int **blockids) {
373
0
    return GetTextlines(false, 0, pixa, blockids, nullptr);
374
0
  }
375
376
  /**
377
   * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa
378
   * pair, in reading order. Enables downstream handling of non-rectangular
379
   * regions.
380
   * Can be called before or after Recognize.
381
   * If blockids is not nullptr, the block-id of each line is also returned as
382
   * an array of one element per line. delete [] after use.
383
   */
384
  Boxa *GetStrips(Pixa **pixa, int **blockids);
385
386
  /**
387
   * Get the words as a leptonica-style
388
   * Boxa, Pixa pair, in reading order.
389
   * Can be called before or after Recognize.
390
   */
391
  Boxa *GetWords(Pixa **pixa);
392
393
  /**
394
   * Gets the individual connected (text) components (created
395
   * after the page segmentation step, but before recognition)
396
   * as a leptonica-style Boxa, Pixa pair, in reading order.
397
   * Can be called before or after Recognize.
398
   * Note: the caller is responsible for calling boxaDestroy()
399
   * on the returned Boxa array and pixaDestroy() on cc array.
400
   */
401
  Boxa *GetConnectedComponents(Pixa **cc);
402
403
  /**
404
   * Get the given level kind of components (block, textline, word etc.) as a
405
   * leptonica-style Boxa, Pixa pair, in reading order.
406
   * Can be called before or after Recognize.
407
   * If blockids is not nullptr, the block-id of each component is also returned
408
   * as an array of one element per component. delete [] after use.
409
   * If paraids is not nullptr, the paragraph-id of each component within its
410
   * block is also returned as an array of one element per component. delete []
411
   * after use. If raw_image is true, then portions of the original image are
412
   * extracted instead of the thresholded image and padded with raw_padding. If
413
   * text_only is true, then only text components are returned.
414
   */
415
  Boxa *GetComponentImages(PageIteratorLevel level, bool text_only,
416
                           bool raw_image, int raw_padding, Pixa **pixa,
417
                           int **blockids, int **paraids);
418
  // Helper function to get binary images with no padding (most common usage).
419
  Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only,
420
0
                           Pixa **pixa, int **blockids) {
421
0
    return GetComponentImages(level, text_only, false, 0, pixa, blockids,
422
0
                              nullptr);
423
0
  }
424
425
  /**
426
   * Returns the scale factor of the thresholded image that would be returned by
427
   * GetThresholdedImage() and the various GetX() methods that call
428
   * GetComponentImages().
429
   * Returns 0 if no thresholder has been set.
430
   */
431
  int GetThresholdedImageScaleFactor() const;
432
433
  /**
434
   * Runs page layout analysis in the mode set by SetPageSegMode.
435
   * May optionally be called prior to Recognize to get access to just
436
   * the page layout results. Returns an iterator to the results.
437
   * If merge_similar_words is true, words are combined where suitable for use
438
   * with a line recognizer. Use if you want to use AnalyseLayout to find the
439
   * textlines, and then want to process textline fragments with an external
440
   * line recognizer.
441
   * Returns nullptr on error or an empty page.
442
   * The returned iterator must be deleted after use.
443
   * WARNING! This class points to data held within the TessBaseAPI class, and
444
   * therefore can only be used while the TessBaseAPI class still exists and
445
   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
446
   * DetectOS, or anything else that changes the internal PAGE_RES.
447
   */
448
  PageIterator *AnalyseLayout();
449
  PageIterator *AnalyseLayout(bool merge_similar_words);
450
451
  /**
452
   * Recognize the image from SetImage, generating Tesseract
453
   * internal structures. Returns 0 on success.
454
   * Optional. The Get*Text functions below will call Recognize if needed.
455
   * After Recognize, the output is kept internally until the next SetImage.
456
   */
457
  int Recognize(ETEXT_DESC *monitor);
458
459
  /**
460
   * Methods to retrieve information after SetImage(),
461
   * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.)
462
   */
463
464
  /**
465
   * Turns images into symbolic text.
466
   *
467
   * filename can point to a single image, a multi-page TIFF,
468
   * or a plain text list of image filenames.
469
   *
470
   * retry_config is useful for debugging. If not nullptr, you can fall
471
   * back to an alternate configuration if a page fails for some
472
   * reason.
473
   *
474
   * timeout_millisec terminates processing if any single page
475
   * takes too long. Set to 0 for unlimited time.
476
   *
477
   * renderer is responsible for creating the output. For example,
478
   * use the TessTextRenderer if you want plaintext output, or
479
   * the TessPDFRenderer to produce searchable PDF.
480
   *
481
   * If tessedit_page_number is non-negative, will only process that
482
   * single page. Works for multi-page tiff file, or filelist.
483
   *
484
   * Returns true if successful, false on error.
485
   */
486
  bool ProcessPages(const char *filename, const char *retry_config,
487
                    int timeout_millisec, TessResultRenderer *renderer);
488
  // Does the real work of ProcessPages.
489
  bool ProcessPagesInternal(const char *filename, const char *retry_config,
490
                            int timeout_millisec, TessResultRenderer *renderer);
491
492
  /**
493
   * Turn a single image into symbolic text.
494
   *
495
   * The pix is the image processed. filename and page_index are
496
   * metadata used by side-effect processes, such as reading a box
497
   * file or formatting as hOCR.
498
   *
499
   * See ProcessPages for descriptions of other parameters.
500
   */
501
  bool ProcessPage(Pix *pix, int page_index, const char *filename,
502
                   const char *retry_config, int timeout_millisec,
503
                   TessResultRenderer *renderer);
504
505
  /**
506
   * Get a reading-order iterator to the results of LayoutAnalysis and/or
507
   * Recognize. The returned iterator must be deleted after use.
508
   * WARNING! This class points to data held within the TessBaseAPI class, and
509
   * therefore can only be used while the TessBaseAPI class still exists and
510
   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
511
   * DetectOS, or anything else that changes the internal PAGE_RES.
512
   */
513
  ResultIterator *GetIterator();
514
515
  /**
516
   * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize.
517
   * The returned iterator must be deleted after use.
518
   * WARNING! This class points to data held within the TessBaseAPI class, and
519
   * therefore can only be used while the TessBaseAPI class still exists and
520
   * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,
521
   * DetectOS, or anything else that changes the internal PAGE_RES.
522
   */
523
  MutableIterator *GetMutableIterator();
524
525
  /**
526
   * The recognized text is returned as a char* which is coded
527
   * as UTF8 and must be freed with the delete [] operator.
528
   */
529
  char *GetUTF8Text();
530
531
  /**
532
   * Make a HTML-formatted string with hOCR markup from the internal
533
   * data structures.
534
   * page_number is 0-based but will appear in the output as 1-based.
535
   * monitor can be used to
536
   *  cancel the recognition
537
   *  receive progress callbacks
538
   * Returned string must be freed with the delete [] operator.
539
   */
540
  char *GetHOCRText(ETEXT_DESC *monitor, int page_number);
541
542
  /**
543
   * Make a HTML-formatted string with hOCR markup from the internal
544
   * data structures.
545
   * page_number is 0-based but will appear in the output as 1-based.
546
   * Returned string must be freed with the delete [] operator.
547
   */
548
  char *GetHOCRText(int page_number);
549
550
  /**
551
   * Make an XML-formatted string with Alto markup from the internal
552
   * data structures.
553
   */
554
  char *GetAltoText(ETEXT_DESC *monitor, int page_number);
555
556
  /**
557
   * Make an XML-formatted string with Alto markup from the internal
558
   * data structures.
559
   */
560
  char *GetAltoText(int page_number);
561
562
  /**
563
   * Make a TSV-formatted string from the internal data structures.
564
   * page_number is 0-based but will appear in the output as 1-based.
565
   * Returned string must be freed with the delete [] operator.
566
   */
567
  char *GetTSVText(int page_number);
568
569
  /**
570
   * Make a box file for LSTM training from the internal data structures.
571
   * Constructs coordinates in the original image - not just the rectangle.
572
   * page_number is a 0-based page index that will appear in the box file.
573
   * Returned string must be freed with the delete [] operator.
574
   */
575
  char *GetLSTMBoxText(int page_number);
576
577
  /**
578
   * The recognized text is returned as a char* which is coded in the same
579
   * format as a box file used in training.
580
   * Constructs coordinates in the original image - not just the rectangle.
581
   * page_number is a 0-based page index that will appear in the box file.
582
   * Returned string must be freed with the delete [] operator.
583
   */
584
  char *GetBoxText(int page_number);
585
586
  /**
587
   * The recognized text is returned as a char* which is coded in the same
588
   * format as a WordStr box file used in training.
589
   * page_number is a 0-based page index that will appear in the box file.
590
   * Returned string must be freed with the delete [] operator.
591
   */
592
  char *GetWordStrBoxText(int page_number);
593
594
  /**
595
   * The recognized text is returned as a char* which is coded
596
   * as UNLV format Latin-1 with specific reject and suspect codes.
597
   * Returned string must be freed with the delete [] operator.
598
   */
599
  char *GetUNLVText();
600
601
  /**
602
   * Detect the orientation of the input image and apparent script (alphabet).
603
   * orient_deg is the detected clockwise rotation of the input image in degrees
604
   * (0, 90, 180, 270)
605
   * orient_conf is the confidence (15.0 is reasonably confident)
606
   * script_name is an ASCII string, the name of the script, e.g. "Latin"
607
   * script_conf is confidence level in the script
608
   * Returns true on success and writes values to each parameter as an output
609
   */
610
  bool DetectOrientationScript(int *orient_deg, float *orient_conf,
611
                               const char **script_name, float *script_conf);
612
613
  /**
614
   * The recognized text is returned as a char* which is coded
615
   * as UTF8 and must be freed with the delete [] operator.
616
   * page_number is a 0-based page index that will appear in the osd file.
617
   */
618
  char *GetOsdText(int page_number);
619
620
  /** Returns the (average) confidence value between 0 and 100. */
621
  int MeanTextConf();
622
  /**
623
   * Returns all word confidences (between 0 and 100) in an array, terminated
624
   * by -1.  The calling function must delete [] after use.
625
   * The number of confidences should correspond to the number of space-
626
   * delimited words in GetUTF8Text.
627
   */
628
  int *AllWordConfidences();
629
630
#ifndef DISABLED_LEGACY_ENGINE
631
  /**
632
   * Applies the given word to the adaptive classifier if possible.
633
   * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can
634
   * tell the boundaries of the graphemes.
635
   * Assumes that SetImage/SetRectangle have been used to set the image
636
   * to the given word. The mode arg should be PSM_SINGLE_WORD or
637
   * PSM_CIRCLE_WORD, as that will be used to control layout analysis.
638
   * The currently set PageSegMode is preserved.
639
   * Returns false if adaption was not possible for some reason.
640
   */
641
  bool AdaptToWordStr(PageSegMode mode, const char *wordstr);
642
#endif //  ndef DISABLED_LEGACY_ENGINE
643
644
  /**
645
   * Free up recognition results and any stored image data, without actually
646
   * freeing any recognition data that would be time-consuming to reload.
647
   * Afterwards, you must call SetImage or TesseractRect before doing
648
   * any Recognize or Get* operation.
649
   */
650
  void Clear();
651
652
  /**
653
   * Close down tesseract and free up all memory. End() is equivalent to
654
   * destructing and reconstructing your TessBaseAPI.
655
   * Once End() has been used, none of the other API functions may be used
656
   * other than Init and anything declared above it in the class definition.
657
   */
658
  void End();
659
660
  /**
661
   * Clear any library-level memory caches.
662
   * There are a variety of expensive-to-load constant data structures (mostly
663
   * language dictionaries) that are cached globally -- surviving the Init()
664
   * and End() of individual TessBaseAPI's.  This function allows the clearing
665
   * of these caches.
666
   **/
667
  static void ClearPersistentCache();
668
669
  /**
670
   * Check whether a word is valid according to Tesseract's language model
671
   * @return 0 if the word is invalid, non-zero if valid.
672
   * @warning temporary! This function will be removed from here and placed
673
   * in a separate API at some future time.
674
   */
675
  int IsValidWord(const char *word) const;
676
  // Returns true if utf8_character is defined in the UniCharset.
677
  bool IsValidCharacter(const char *utf8_character) const;
678
679
  bool GetTextDirection(int *out_offset, float *out_slope);
680
681
  /** Sets Dict::letter_is_okay_ function to point to the given function. */
682
  void SetDictFunc(DictFunc f);
683
684
  /** Sets Dict::probability_in_context_ function to point to the given
685
   * function.
686
   */
687
  void SetProbabilityInContextFunc(ProbabilityInContextFunc f);
688
689
  /**
690
   * Estimates the Orientation And Script of the image.
691
   * @return true if the image was processed successfully.
692
   */
693
  bool DetectOS(OSResults *);
694
695
  /**
696
   * Return text orientation of each block as determined by an earlier run
697
   * of layout analysis.
698
   */
699
  void GetBlockTextOrientations(int **block_orientation,
700
                                bool **vertical_writing);
701
702
  /** This method returns the string form of the specified unichar. */
703
  const char *GetUnichar(int unichar_id) const;
704
705
  /** Return the pointer to the i-th dawg loaded into tesseract_ object. */
706
  const Dawg *GetDawg(int i) const;
707
708
  /** Return the number of dawgs loaded into tesseract_ object. */
709
  int NumDawgs() const;
710
711
0
  Tesseract *tesseract() const {
712
0
    return tesseract_;
713
0
  }
714
715
0
  OcrEngineMode oem() const {
716
0
    return last_oem_requested_;
717
0
  }
718
719
  void set_min_orientation_margin(double margin);
720
  /* @} */
721
722
protected:
723
  /** Common code for setting the image. Returns true if Init has been called.
724
   */
725
  bool InternalSetImage();
726
727
  /**
728
   * Run the thresholder to make the thresholded image. If pix is not nullptr,
729
   * the source is thresholded to pix instead of the internal IMAGE.
730
   */
731
  virtual bool Threshold(Pix **pix);
732
733
  /**
734
   * Find lines from the image making the BLOCK_LIST.
735
   * @return 0 on success.
736
   */
737
  int FindLines();
738
739
  /** Delete the pageres and block list ready for a new page. */
740
  void ClearResults();
741
742
  /**
743
   * Return an LTR Result Iterator -- used only for training, as we really want
744
   * to ignore all BiDi smarts at that point.
745
   * delete once you're done with it.
746
   */
747
  LTRResultIterator *GetLTRIterator();
748
749
  /**
750
   * Return the length of the output text string, as UTF8, assuming
751
   * one newline per line and one per block, with a terminator,
752
   * and assuming a single character reject marker for each rejected character.
753
   * Also return the number of recognized blobs in blob_count.
754
   */
755
  int TextLength(int *blob_count) const;
756
757
  //// paragraphs.cpp ////////////////////////////////////////////////////
758
  void DetectParagraphs(bool after_text_recognition);
759
760
0
  const PAGE_RES *GetPageRes() const {
761
0
    return page_res_;
762
0
  }
763
764
protected:
765
  Tesseract *tesseract_;          ///< The underlying data object.
766
  Tesseract *osd_tesseract_;      ///< For orientation & script detection.
767
  EquationDetect *equ_detect_;    ///< The equation detector.
768
  FileReader reader_;             ///< Reads files from any filesystem.
769
  ImageThresholder *thresholder_; ///< Image thresholding module.
770
  std::vector<ParagraphModel *> *paragraph_models_;
771
  BLOCK_LIST *block_list_;           ///< The page layout.
772
  PAGE_RES *page_res_;               ///< The page-level data.
773
  std::string input_file_;           ///< Name used by training code.
774
  std::string output_file_;          ///< Name used by debug code.
775
  std::string datapath_;             ///< Current location of tessdata.
776
  std::string language_;             ///< Last initialized language.
777
  OcrEngineMode last_oem_requested_; ///< Last OCR engine mode requested.
778
  bool recognition_done_;            ///< page_res_ contains recognition data.
779
780
  /**
781
   * @defgroup ThresholderParams Thresholder Parameters
782
   * Parameters saved from the Thresholder. Needed to rebuild coordinates.
783
   */
784
  /* @{ */
785
  int rect_left_;
786
  int rect_top_;
787
  int rect_width_;
788
  int rect_height_;
789
  int image_width_;
790
  int image_height_;
791
  /* @} */
792
793
private:
794
  // A list of image filenames gets special consideration
795
  bool ProcessPagesFileList(FILE *fp, std::string *buf,
796
                            const char *retry_config, int timeout_millisec,
797
                            TessResultRenderer *renderer,
798
                            int tessedit_page_number);
799
  // TIFF supports multipage so gets special consideration.
800
  bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size,
801
                                 const char *filename, const char *retry_config,
802
                                 int timeout_millisec,
803
                                 TessResultRenderer *renderer,
804
                                 int tessedit_page_number);
805
}; // class TessBaseAPI.
806
807
/** Escape a char string - replace &<>"' with HTML codes. */
808
std::string HOcrEscape(const char *text);
809
810
} // namespace tesseract
811
812
#endif // TESSERACT_API_BASEAPI_H_