/src/tesseract/include/tesseract/baseapi.h
Line | Count | Source (jump to first uncovered line) |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // File: baseapi.h |
3 | | // Description: Simple API for calling tesseract. |
4 | | // Author: Ray Smith |
5 | | // |
6 | | // (C) Copyright 2006, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | |
17 | | #ifndef TESSERACT_API_BASEAPI_H_ |
18 | | #define TESSERACT_API_BASEAPI_H_ |
19 | | |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" // DISABLED_LEGACY_ENGINE |
22 | | #endif |
23 | | |
24 | | #include "export.h" |
25 | | #include "pageiterator.h" |
26 | | #include "publictypes.h" |
27 | | #include "resultiterator.h" |
28 | | #include "unichar.h" |
29 | | |
30 | | #include <tesseract/version.h> |
31 | | |
32 | | #include <cstdio> |
33 | | #include <vector> // for std::vector |
34 | | |
35 | | struct Pix; |
36 | | struct Pixa; |
37 | | struct Boxa; |
38 | | |
39 | | namespace tesseract { |
40 | | |
41 | | class PAGE_RES; |
42 | | class ParagraphModel; |
43 | | class BLOCK_LIST; |
44 | | class ETEXT_DESC; |
45 | | struct OSResults; |
46 | | class UNICHARSET; |
47 | | |
48 | | class Dawg; |
49 | | class Dict; |
50 | | class EquationDetect; |
51 | | class PageIterator; |
52 | | class ImageThresholder; |
53 | | class LTRResultIterator; |
54 | | class ResultIterator; |
55 | | class MutableIterator; |
56 | | class TessResultRenderer; |
57 | | class Tesseract; |
58 | | |
59 | | // Function to read a std::vector<char> from a whole file. |
60 | | // Returns false on failure. |
61 | | using FileReader = bool (*)(const char *filename, std::vector<char> *data); |
62 | | |
63 | | using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, |
64 | | bool) const; |
65 | | using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, |
66 | | int, const char *, int); |
67 | | |
68 | | /** |
69 | | * Base class for all tesseract APIs. |
70 | | * Specific classes can add ability to work on different inputs or produce |
71 | | * different outputs. |
72 | | * This class is mostly an interface layer on top of the Tesseract instance |
73 | | * class to hide the data types so that users of this class don't have to |
74 | | * include any other Tesseract headers. |
75 | | */ |
76 | | class TESS_API TessBaseAPI { |
77 | | public: |
78 | | TessBaseAPI(); |
79 | | virtual ~TessBaseAPI(); |
80 | | // Copy constructor and assignment operator are currently unsupported. |
81 | | TessBaseAPI(TessBaseAPI const &) = delete; |
82 | | TessBaseAPI &operator=(TessBaseAPI const &) = delete; |
83 | | |
84 | | /** |
85 | | * Returns the version identifier as a static string. Do not delete. |
86 | | */ |
87 | | static const char *Version(); |
88 | | |
89 | | /** |
90 | | * If compiled with OpenCL AND an available OpenCL |
91 | | * device is deemed faster than serial code, then |
92 | | * "device" is populated with the cl_device_id |
93 | | * and returns sizeof(cl_device_id) |
94 | | * otherwise *device=nullptr and returns 0. |
95 | | */ |
96 | | static size_t getOpenCLDevice(void **device); |
97 | | |
98 | | /** |
99 | | * Set the name of the input file. Needed for training and |
100 | | * reading a UNLV zone file, and for searchable PDF output. |
101 | | */ |
102 | | void SetInputName(const char *name); |
103 | | /** |
104 | | * These functions are required for searchable PDF output. |
105 | | * We need our hands on the input file so that we can include |
106 | | * it in the PDF without transcoding. If that is not possible, |
107 | | * we need the original image. Finally, resolution metadata |
108 | | * is stored in the PDF so we need that as well. |
109 | | */ |
110 | | const char *GetInputName(); |
111 | | // Takes ownership of the input pix. |
112 | | void SetInputImage(Pix *pix); |
113 | | Pix *GetInputImage(); |
114 | | int GetSourceYResolution(); |
115 | | const char *GetDatapath(); |
116 | | |
117 | | /** Set the name of the bonus output files. Needed only for debugging. */ |
118 | | void SetOutputName(const char *name); |
119 | | |
120 | | /** |
121 | | * Set the value of an internal "parameter." |
122 | | * Supply the name of the parameter and the value as a string, just as |
123 | | * you would in a config file. |
124 | | * Returns false if the name lookup failed. |
125 | | * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. |
126 | | * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode. |
127 | | * SetVariable may be used before Init, but settings will revert to |
128 | | * defaults on End(). |
129 | | * |
130 | | * Note: Must be called after Init(). Only works for non-init variables |
131 | | * (init variables should be passed to Init()). |
132 | | */ |
133 | | bool SetVariable(const char *name, const char *value); |
134 | | bool SetDebugVariable(const char *name, const char *value); |
135 | | |
136 | | /** |
137 | | * Returns true if the parameter was found among Tesseract parameters. |
138 | | * Fills in value with the value of the parameter. |
139 | | */ |
140 | | bool GetIntVariable(const char *name, int *value) const; |
141 | | bool GetBoolVariable(const char *name, bool *value) const; |
142 | | bool GetDoubleVariable(const char *name, double *value) const; |
143 | | |
144 | | /** |
145 | | * Returns the pointer to the string that represents the value of the |
146 | | * parameter if it was found among Tesseract parameters. |
147 | | */ |
148 | | const char *GetStringVariable(const char *name) const; |
149 | | |
150 | | #ifndef DISABLED_LEGACY_ENGINE |
151 | | |
152 | | /** |
153 | | * Print Tesseract fonts table to the given file. |
154 | | */ |
155 | | void PrintFontsTable(FILE *fp) const; |
156 | | |
157 | | #endif |
158 | | |
159 | | /** |
160 | | * Print Tesseract parameters to the given file. |
161 | | */ |
162 | | void PrintVariables(FILE *fp) const; |
163 | | |
164 | | /** |
165 | | * Get value of named variable as a string, if it exists. |
166 | | */ |
167 | | bool GetVariableAsString(const char *name, std::string *val) const; |
168 | | |
169 | | /** |
170 | | * Instances are now mostly thread-safe and totally independent, |
171 | | * but some global parameters remain. Basically it is safe to use multiple |
172 | | * TessBaseAPIs in different threads in parallel, UNLESS: |
173 | | * you use SetVariable on some of the Params in classify and textord. |
174 | | * If you do, then the effect will be to change it for all your instances. |
175 | | * |
176 | | * Start tesseract. Returns zero on success and -1 on failure. |
177 | | * NOTE that the only members that may be called before Init are those |
178 | | * listed above here in the class definition. |
179 | | * |
180 | | * The datapath must be the name of the tessdata directory. |
181 | | * The language is (usually) an ISO 639-3 string; nullptr defaults to |
182 | | * eng. It is entirely safe (and eventually will be efficient too) to call |
183 | | * Init multiple times on the same instance to change language, or just |
184 | | * to reset the classifier. |
185 | | * The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating |
186 | | * that multiple languages are to be loaded. Eg hin+eng will load Hindi and |
187 | | * English. Languages may specify internally that they want to be loaded |
188 | | * with one or more other languages, so the ~ sign is available to override |
189 | | * that. Eg if hin were set to load eng by default, then hin+~eng would force |
190 | | * loading only hin. The number of loaded languages is limited only by |
191 | | * memory, with the caveat that loading additional languages will impact |
192 | | * both speed and accuracy, as there is more work to do to decide on the |
193 | | * applicable language, and there is more chance of hallucinating incorrect |
194 | | * words. |
195 | | * WARNING: On changing languages, all Tesseract parameters are reset |
196 | | * back to their default values. (Which may vary between languages.) |
197 | | * If you have a rare need to set a Variable that controls |
198 | | * initialization for a second call to Init you should explicitly |
199 | | * call End() and then use SetVariable before Init. This is only a very |
200 | | * rare use case, since there are very few uses that require any parameters |
201 | | * to be set before Init. |
202 | | * |
203 | | * If set_only_non_debug_params is true, only params that do not contain |
204 | | * "debug" in the name will be set. |
205 | | */ |
206 | | int Init(const char *datapath, const char *language, OcrEngineMode mode, |
207 | | char **configs, int configs_size, |
208 | | const std::vector<std::string> *vars_vec, |
209 | | const std::vector<std::string> *vars_values, |
210 | | bool set_only_non_debug_params); |
211 | 0 | int Init(const char *datapath, const char *language, OcrEngineMode oem) { // Convenience overload: forwards to the full Init with no configs and no variable overrides. |
212 | 0 | return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false); // configs=nullptr, configs_size=0, vars_vec/vars_values=nullptr, set_only_non_debug_params=false |
213 | 0 | } |
214 | 4 | int Init(const char *datapath, const char *language) { // Convenience overload: uses OEM_DEFAULT, no configs and no variable overrides. |
215 | 4 | return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, |
216 | 4 | false); // set_only_non_debug_params=false |
217 | 4 | } |
218 | | // In-memory version reads the traineddata file directly from the given |
219 | | // data[data_size] array, and/or reads data via a FileReader. |
220 | | int Init(const char *data, int data_size, const char *language, |
221 | | OcrEngineMode mode, char **configs, int configs_size, |
222 | | const std::vector<std::string> *vars_vec, |
223 | | const std::vector<std::string> *vars_values, |
224 | | bool set_only_non_debug_params, FileReader reader); |
225 | | |
226 | | /** |
227 | | * Returns the languages string used in the last valid initialization. |
228 | | * If the last initialization specified "deu+hin" then that will be |
229 | | * returned. If hin loaded eng automatically as well, then that will |
230 | | * not be included in this list. To find the languages actually |
231 | | * loaded use GetLoadedLanguagesAsVector. |
232 | | * The returned string should NOT be deleted. |
233 | | */ |
234 | | const char *GetInitLanguagesAsString() const; |
235 | | |
236 | | /** |
237 | | * Returns the loaded languages in the vector of std::string. |
238 | | * Includes all languages loaded by the last Init, including those loaded |
239 | | * as dependencies of other loaded languages. |
240 | | */ |
241 | | void GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const; |
242 | | |
243 | | /** |
244 | | * Returns the available languages in the sorted vector of std::string. |
245 | | */ |
246 | | void GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const; |
247 | | |
248 | | /** |
249 | | * Init only for page layout analysis. Use only for calls to SetImage and |
250 | | * AnalysePage. Calls that attempt recognition will generate an error. |
251 | | */ |
252 | | void InitForAnalysePage(); |
253 | | |
254 | | /** |
255 | | * Read a "config" file containing a set of param, value pairs. |
256 | | * Searches the standard places: tessdata/configs, tessdata/tessconfigs |
257 | | * and also accepts a relative or absolute path name. |
258 | | * Note: only non-init params will be set (init params are set by Init()). |
259 | | */ |
260 | | void ReadConfigFile(const char *filename); |
261 | | /** Same as above, but only set debug params from the given config file. */ |
262 | | void ReadDebugConfigFile(const char *filename); |
263 | | |
264 | | /** |
265 | | * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. |
266 | | * The mode is stored as an IntParam so it can also be modified by |
267 | | * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). |
268 | | */ |
269 | | void SetPageSegMode(PageSegMode mode); |
270 | | |
271 | | /** Return the current page segmentation mode. */ |
272 | | PageSegMode GetPageSegMode() const; |
273 | | |
274 | | /** |
275 | | * Recognize a rectangle from an image and return the result as a string. |
276 | | * May be called many times for a single Init. |
277 | | * Currently has no error checking. |
278 | | * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. |
279 | | * Palette color images will not work properly and must be converted to |
280 | | * 24 bit. |
281 | | * Binary images of 1 bit per pixel may also be given but they must be |
282 | | * byte packed with the MSB of the first byte being the first pixel, and a |
283 | | * 1 represents WHITE. For binary images set bytes_per_pixel=0. |
284 | | * The recognized text is returned as a char* which is coded |
285 | | * as UTF8 and must be freed with the delete [] operator. |
286 | | * |
287 | | * Note that TesseractRect is the simplified convenience interface. |
288 | | * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, |
289 | | * and one or more of the Get*Text functions below. |
290 | | */ |
291 | | char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, |
292 | | int bytes_per_line, int left, int top, int width, |
293 | | int height); |
294 | | |
295 | | /** |
296 | | * Call between pages or documents etc to free up memory and forget |
297 | | * adaptive data. |
298 | | */ |
299 | | void ClearAdaptiveClassifier(); |
300 | | |
301 | | /** |
302 | | * @defgroup AdvancedAPI Advanced API |
303 | | * The following methods break TesseractRect into pieces, so you can |
304 | | * get hold of the thresholded image, get the text in different formats, |
305 | | * get bounding boxes, confidences etc. |
306 | | */ |
307 | | /* @{ */ |
308 | | |
309 | | /** |
310 | | * Provide an image for Tesseract to recognize. Format is as |
311 | | * TesseractRect above. Copies the image buffer and converts to Pix. |
312 | | * SetImage clears all recognition results, and sets the rectangle to the |
313 | | * full image, so it may be followed immediately by a GetUTF8Text, and it |
314 | | * will automatically perform recognition. |
315 | | */ |
316 | | void SetImage(const unsigned char *imagedata, int width, int height, |
317 | | int bytes_per_pixel, int bytes_per_line); |
318 | | |
319 | | /** |
320 | | * Provide an image for Tesseract to recognize. As with SetImage above, |
321 | | * Tesseract takes its own copy of the image, so it need not persist until |
322 | | * after Recognize. |
323 | | * Pix vs raw, which to use? |
324 | | * Use Pix where possible. Tesseract uses Pix as its internal representation |
325 | | * and it is therefore more efficient to provide a Pix directly. |
326 | | */ |
327 | | void SetImage(Pix *pix); |
328 | | |
329 | | /** |
330 | | * Set the resolution of the source image in pixels per inch so font size |
331 | | * information can be calculated in results. Call this after SetImage(). |
332 | | */ |
333 | | void SetSourceResolution(int ppi); |
334 | | |
335 | | /** |
336 | | * Restrict recognition to a sub-rectangle of the image. Call after SetImage. |
337 | | * Each SetRectangle clears the recognition results so multiple rectangles |
338 | | * can be recognized with the same image. |
339 | | */ |
340 | | void SetRectangle(int left, int top, int width, int height); |
341 | | |
342 | | /** |
343 | | * Get a copy of the internal thresholded image from Tesseract. |
344 | | * Caller takes ownership of the Pix and must pixDestroy it. |
345 | | * May be called any time after SetImage, or after TesseractRect. |
346 | | */ |
347 | | Pix *GetThresholdedImage(); |
348 | | |
349 | | /** |
350 | | * Get the result of page layout analysis as a leptonica-style |
351 | | * Boxa, Pixa pair, in reading order. |
352 | | * Can be called before or after Recognize. |
353 | | */ |
354 | | Boxa *GetRegions(Pixa **pixa); |
355 | | |
356 | | /** |
357 | | * Get the textlines as a leptonica-style |
358 | | * Boxa, Pixa pair, in reading order. |
359 | | * Can be called before or after Recognize. |
360 | | * If raw_image is true, then extract from the original image instead of the |
361 | | * thresholded image and pad by raw_padding pixels. |
362 | | * If blockids is not nullptr, the block-id of each line is also returned as |
363 | | * an array of one element per line. delete [] after use. If paraids is not |
364 | | * nullptr, the paragraph-id of each line within its block is also returned as |
365 | | * an array of one element per line. delete [] after use. |
366 | | */ |
367 | | Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, |
368 | | int **blockids, int **paraids); |
369 | | /* |
370 | | Helper method to extract from the thresholded image. (most common usage) |
371 | | */ |
372 | 0 | Boxa *GetTextlines(Pixa **pixa, int **blockids) { // Shorthand for the full overload with raw_image=false, raw_padding=0, paraids=nullptr. |
373 | 0 | return GetTextlines(false, 0, pixa, blockids, nullptr); |
374 | 0 | } |
375 | | |
376 | | /** |
377 | | * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa |
378 | | * pair, in reading order. Enables downstream handling of non-rectangular |
379 | | * regions. |
380 | | * Can be called before or after Recognize. |
381 | | * If blockids is not nullptr, the block-id of each line is also returned as |
382 | | * an array of one element per line. delete [] after use. |
383 | | */ |
384 | | Boxa *GetStrips(Pixa **pixa, int **blockids); |
385 | | |
386 | | /** |
387 | | * Get the words as a leptonica-style |
388 | | * Boxa, Pixa pair, in reading order. |
389 | | * Can be called before or after Recognize. |
390 | | */ |
391 | | Boxa *GetWords(Pixa **pixa); |
392 | | |
393 | | /** |
394 | | * Gets the individual connected (text) components (created |
395 | | * after the page segmentation step, but before recognition) |
396 | | * as a leptonica-style Boxa, Pixa pair, in reading order. |
397 | | * Can be called before or after Recognize. |
398 | | * Note: the caller is responsible for calling boxaDestroy() |
399 | | * on the returned Boxa array and pixaDestroy() on cc array. |
400 | | */ |
401 | | Boxa *GetConnectedComponents(Pixa **cc); |
402 | | |
403 | | /** |
404 | | * Get the given level kind of components (block, textline, word etc.) as a |
405 | | * leptonica-style Boxa, Pixa pair, in reading order. |
406 | | * Can be called before or after Recognize. |
407 | | * If blockids is not nullptr, the block-id of each component is also returned |
408 | | * as an array of one element per component. delete [] after use. |
409 | | * If paraids is not nullptr, the paragraph-id of each component within its |
410 | | * block is also returned as an array of one element per component. delete [] |
411 | | * after use. If raw_image is true, then portions of the original image are |
412 | | * extracted instead of the thresholded image and padded with raw_padding. If |
413 | | * text_only is true, then only text components are returned. |
414 | | */ |
415 | | Boxa *GetComponentImages(PageIteratorLevel level, bool text_only, |
416 | | bool raw_image, int raw_padding, Pixa **pixa, |
417 | | int **blockids, int **paraids); |
418 | | // Helper function to get binary images with no padding (most common usage). |
419 | | Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only, |
420 | 0 | Pixa **pixa, int **blockids) { // Shorthand for the full overload with raw_image=false, raw_padding=0, paraids=nullptr. |
421 | 0 | return GetComponentImages(level, text_only, false, 0, pixa, blockids, |
422 | 0 | nullptr); |
423 | 0 | } |
424 | | |
425 | | /** |
426 | | * Returns the scale factor of the thresholded image that would be returned by |
427 | | * GetThresholdedImage() and the various GetX() methods that call |
428 | | * GetComponentImages(). |
429 | | * Returns 0 if no thresholder has been set. |
430 | | */ |
431 | | int GetThresholdedImageScaleFactor() const; |
432 | | |
433 | | /** |
434 | | * Runs page layout analysis in the mode set by SetPageSegMode. |
435 | | * May optionally be called prior to Recognize to get access to just |
436 | | * the page layout results. Returns an iterator to the results. |
437 | | * If merge_similar_words is true, words are combined where suitable for use |
438 | | * with a line recognizer. Use if you want to use AnalyseLayout to find the |
439 | | * textlines, and then want to process textline fragments with an external |
440 | | * line recognizer. |
441 | | * Returns nullptr on error or an empty page. |
442 | | * The returned iterator must be deleted after use. |
443 | | * WARNING! This class points to data held within the TessBaseAPI class, and |
444 | | * therefore can only be used while the TessBaseAPI class still exists and |
445 | | * has not been subjected to a call of Init, SetImage, Recognize, Clear, End, |
446 | | * DetectOS, or anything else that changes the internal PAGE_RES. |
447 | | */ |
448 | | PageIterator *AnalyseLayout(); |
449 | | PageIterator *AnalyseLayout(bool merge_similar_words); |
450 | | |
451 | | /** |
452 | | * Recognize the image from SetImage, generating Tesseract |
453 | | * internal structures. Returns 0 on success. |
454 | | * Optional. The Get*Text functions below will call Recognize if needed. |
455 | | * After Recognize, the output is kept internally until the next SetImage. |
456 | | */ |
457 | | int Recognize(ETEXT_DESC *monitor); |
458 | | |
459 | | /** |
460 | | * Methods to retrieve information after SetImage(), |
461 | | * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) |
462 | | */ |
463 | | |
464 | | /** |
465 | | * Turns images into symbolic text. |
466 | | * |
467 | | * filename can point to a single image, a multi-page TIFF, |
468 | | * or a plain text list of image filenames. |
469 | | * |
470 | | * retry_config is useful for debugging. If not nullptr, you can fall |
471 | | * back to an alternate configuration if a page fails for some |
472 | | * reason. |
473 | | * |
474 | | * timeout_millisec terminates processing if any single page |
475 | | * takes too long. Set to 0 for unlimited time. |
476 | | * |
477 | | * renderer is responsible for creating the output. For example, |
478 | | * use the TessTextRenderer if you want plaintext output, or |
479 | | * the TessPDFRenderer to produce searchable PDF. |
480 | | * |
481 | | * If tessedit_page_number is non-negative, will only process that |
482 | | * single page. Works for multi-page tiff file, or filelist. |
483 | | * |
484 | | * Returns true if successful, false on error. |
485 | | */ |
486 | | bool ProcessPages(const char *filename, const char *retry_config, |
487 | | int timeout_millisec, TessResultRenderer *renderer); |
488 | | // Does the real work of ProcessPages. |
489 | | bool ProcessPagesInternal(const char *filename, const char *retry_config, |
490 | | int timeout_millisec, TessResultRenderer *renderer); |
491 | | |
492 | | /** |
493 | | * Turn a single image into symbolic text. |
494 | | * |
495 | | * The pix is the image processed. filename and page_index are |
496 | | * metadata used by side-effect processes, such as reading a box |
497 | | * file or formatting as hOCR. |
498 | | * |
499 | | * See ProcessPages for descriptions of other parameters. |
500 | | */ |
501 | | bool ProcessPage(Pix *pix, int page_index, const char *filename, |
502 | | const char *retry_config, int timeout_millisec, |
503 | | TessResultRenderer *renderer); |
504 | | |
505 | | /** |
506 | | * Get a reading-order iterator to the results of LayoutAnalysis and/or |
507 | | * Recognize. The returned iterator must be deleted after use. |
508 | | * WARNING! This class points to data held within the TessBaseAPI class, and |
509 | | * therefore can only be used while the TessBaseAPI class still exists and |
510 | | * has not been subjected to a call of Init, SetImage, Recognize, Clear, End, |
511 | | * DetectOS, or anything else that changes the internal PAGE_RES. |
512 | | */ |
513 | | ResultIterator *GetIterator(); |
514 | | |
515 | | /** |
516 | | * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. |
517 | | * The returned iterator must be deleted after use. |
518 | | * WARNING! This class points to data held within the TessBaseAPI class, and |
519 | | * therefore can only be used while the TessBaseAPI class still exists and |
520 | | * has not been subjected to a call of Init, SetImage, Recognize, Clear, End, |
521 | | * DetectOS, or anything else that changes the internal PAGE_RES. |
522 | | */ |
523 | | MutableIterator *GetMutableIterator(); |
524 | | |
525 | | /** |
526 | | * The recognized text is returned as a char* which is coded |
527 | | * as UTF8 and must be freed with the delete [] operator. |
528 | | */ |
529 | | char *GetUTF8Text(); |
530 | | |
531 | | /** |
532 | | * Make a HTML-formatted string with hOCR markup from the internal |
533 | | * data structures. |
534 | | * page_number is 0-based but will appear in the output as 1-based. |
535 | | * monitor can be used to |
536 | | * cancel the recognition |
537 | | * receive progress callbacks |
538 | | * Returned string must be freed with the delete [] operator. |
539 | | */ |
540 | | char *GetHOCRText(ETEXT_DESC *monitor, int page_number); |
541 | | |
542 | | /** |
543 | | * Make a HTML-formatted string with hOCR markup from the internal |
544 | | * data structures. |
545 | | * page_number is 0-based but will appear in the output as 1-based. |
546 | | * Returned string must be freed with the delete [] operator. |
547 | | */ |
548 | | char *GetHOCRText(int page_number); |
549 | | |
550 | | /** |
551 | | * Make an XML-formatted string with Alto markup from the internal |
552 | | * data structures. |
553 | | */ |
554 | | char *GetAltoText(ETEXT_DESC *monitor, int page_number); |
555 | | |
556 | | /** |
557 | | * Make an XML-formatted string with Alto markup from the internal |
558 | | * data structures. |
559 | | */ |
560 | | char *GetAltoText(int page_number); |
561 | | |
562 | | /** |
563 | | * Make a TSV-formatted string from the internal data structures. |
564 | | * page_number is 0-based but will appear in the output as 1-based. |
565 | | * Returned string must be freed with the delete [] operator. |
566 | | */ |
567 | | char *GetTSVText(int page_number); |
568 | | |
569 | | /** |
570 | | * Make a box file for LSTM training from the internal data structures. |
571 | | * Constructs coordinates in the original image - not just the rectangle. |
572 | | * page_number is a 0-based page index that will appear in the box file. |
573 | | * Returned string must be freed with the delete [] operator. |
574 | | */ |
575 | | char *GetLSTMBoxText(int page_number); |
576 | | |
577 | | /** |
578 | | * The recognized text is returned as a char* which is coded in the same |
579 | | * format as a box file used in training. |
580 | | * Constructs coordinates in the original image - not just the rectangle. |
581 | | * page_number is a 0-based page index that will appear in the box file. |
582 | | * Returned string must be freed with the delete [] operator. |
583 | | */ |
584 | | char *GetBoxText(int page_number); |
585 | | |
586 | | /** |
587 | | * The recognized text is returned as a char* which is coded in the same |
588 | | * format as a WordStr box file used in training. |
589 | | * page_number is a 0-based page index that will appear in the box file. |
590 | | * Returned string must be freed with the delete [] operator. |
591 | | */ |
592 | | char *GetWordStrBoxText(int page_number); |
593 | | |
594 | | /** |
595 | | * The recognized text is returned as a char* which is coded |
596 | | * as UNLV format Latin-1 with specific reject and suspect codes. |
597 | | * Returned string must be freed with the delete [] operator. |
598 | | */ |
599 | | char *GetUNLVText(); |
600 | | |
601 | | /** |
602 | | * Detect the orientation of the input image and apparent script (alphabet). |
603 | | * orient_deg is the detected clockwise rotation of the input image in degrees |
604 | | * (0, 90, 180, 270) |
605 | | * orient_conf is the confidence (15.0 is reasonably confident) |
606 | | * script_name is an ASCII string, the name of the script, e.g. "Latin" |
607 | | * script_conf is confidence level in the script |
608 | | * Returns true on success and writes values to each parameter as an output |
609 | | */ |
610 | | bool DetectOrientationScript(int *orient_deg, float *orient_conf, |
611 | | const char **script_name, float *script_conf); |
612 | | |
613 | | /** |
614 | | * The recognized text is returned as a char* which is coded |
615 | | * as UTF8 and must be freed with the delete [] operator. |
616 | | * page_number is a 0-based page index that will appear in the osd file. |
617 | | */ |
618 | | char *GetOsdText(int page_number); |
619 | | |
620 | | /** Returns the (average) confidence value between 0 and 100. */ |
621 | | int MeanTextConf(); |
622 | | /** |
623 | | * Returns all word confidences (between 0 and 100) in an array, terminated |
624 | | * by -1. The calling function must delete [] after use. |
625 | | * The number of confidences should correspond to the number of space- |
626 | | * delimited words in GetUTF8Text. |
627 | | */ |
628 | | int *AllWordConfidences(); |
629 | | |
630 | | #ifndef DISABLED_LEGACY_ENGINE |
631 | | /** |
632 | | * Applies the given word to the adaptive classifier if possible. |
633 | | * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can |
634 | | * tell the boundaries of the graphemes. |
635 | | * Assumes that SetImage/SetRectangle have been used to set the image |
636 | | * to the given word. The mode arg should be PSM_SINGLE_WORD or |
637 | | * PSM_CIRCLE_WORD, as that will be used to control layout analysis. |
638 | | * The currently set PageSegMode is preserved. |
639 | | * Returns false if adaption was not possible for some reason. |
640 | | */ |
641 | | bool AdaptToWordStr(PageSegMode mode, const char *wordstr); |
642 | | #endif // ndef DISABLED_LEGACY_ENGINE |
643 | | |
644 | | /** |
645 | | * Free up recognition results and any stored image data, without actually |
646 | | * freeing any recognition data that would be time-consuming to reload. |
647 | | * Afterwards, you must call SetImage or TesseractRect before doing |
648 | | * any Recognize or Get* operation. |
649 | | */ |
650 | | void Clear(); |
651 | | |
652 | | /** |
653 | | * Close down tesseract and free up all memory. End() is equivalent to |
654 | | * destructing and reconstructing your TessBaseAPI. |
655 | | * Once End() has been used, none of the other API functions may be used |
656 | | * other than Init and anything declared above it in the class definition. |
657 | | */ |
658 | | void End(); |
659 | | |
660 | | /** |
661 | | * Clear any library-level memory caches. |
662 | | * There are a variety of expensive-to-load constant data structures (mostly |
663 | | * language dictionaries) that are cached globally -- surviving the Init() |
664 | | * and End() of individual TessBaseAPI's. This function allows the clearing |
665 | | * of these caches. |
666 | | **/ |
667 | | static void ClearPersistentCache(); |
668 | | |
669 | | /** |
670 | | * Check whether a word is valid according to Tesseract's language model |
671 | | * @return 0 if the word is invalid, non-zero if valid. |
672 | | * @warning temporary! This function will be removed from here and placed |
673 | | * in a separate API at some future time. |
674 | | */ |
675 | | int IsValidWord(const char *word) const; |
676 | | // Returns true if utf8_character is defined in the UniCharset. |
677 | | bool IsValidCharacter(const char *utf8_character) const; |
678 | | |
679 | | bool GetTextDirection(int *out_offset, float *out_slope); |
680 | | |
681 | | /** Makes the Dict::letter_is_okay_ function point to the given function f. */ |
682 | | void SetDictFunc(DictFunc f); |
683 | | |
684 | | /** Makes the Dict::probability_in_context_ function point to the given |
685 | | * function f. |
686 | | */ |
687 | | void SetProbabilityInContextFunc(ProbabilityInContextFunc f); |
688 | | |
689 | | /** |
690 | | * Estimates the orientation and script of the image. NOTE(review): the unnamed OSResults parameter presumably receives the estimate -- confirm. |
691 | | * @return true if the image was processed successfully. |
692 | | */ |
693 | | bool DetectOS(OSResults *); |
694 | | |
695 | | /** |
696 | | * Returns the text orientation of each block, as determined by an earlier |
697 | | * run of layout analysis. NOTE(review): results presumably come back through block_orientation and vertical_writing; who owns them is not visible here -- confirm. |
698 | | */ |
699 | | void GetBlockTextOrientations(int **block_orientation, |
700 | | bool **vertical_writing); |
701 | | |
702 | | /** Returns the string form of the unichar specified by unichar_id. */ |
703 | | const char *GetUnichar(int unichar_id) const; |
704 | | |
705 | | /** Returns a pointer to the i-th dawg loaded into the tesseract_ object. */ |
706 | | const Dawg *GetDawg(int i) const; |
707 | | |
708 | | /** Returns the number of dawgs loaded into the tesseract_ object. */ |
709 | | int NumDawgs() const; |
710 | | |
711 | 0 | Tesseract *tesseract() const { |
712 | 0 | return this->tesseract_; /* accessor: hands back the stored tesseract_ pointer unchanged */ |
713 | 0 | } |
714 | | |
715 | 0 | OcrEngineMode oem() const { |
716 | 0 | return this->last_oem_requested_; /* accessor: the stored last_oem_requested_ value */ |
717 | 0 | } |
718 | | |
719 | | void set_min_orientation_margin(double margin); /**< NOTE(review): presumably stores margin as the minimum orientation margin; where the value is kept is not visible in this header -- confirm. */ |
720 | | /* @} */ |
721 | | |
722 | | protected: |
723 | | /** Common code for setting the image. |
724 | | * @return true if Init has been called. */ |
725 | | bool InternalSetImage(); |
726 | | |
727 | | /** |
728 | | * Runs the thresholder to make the thresholded image. If pix is not nullptr, |
729 | | * the source is thresholded to pix instead of the internal IMAGE. NOTE(review): the meaning of the bool result is not stated here -- confirm. |
730 | | */ |
731 | | virtual bool Threshold(Pix **pix); |
732 | | |
733 | | /** |
734 | | * Finds lines in the image, making the BLOCK_LIST. |
735 | | * @return 0 on success. |
736 | | */ |
737 | | int FindLines(); |
738 | | |
739 | | /** Deletes the pageres and the block list, ready for a new page. */ |
740 | | void ClearResults(); |
741 | | |
742 | | /** |
743 | | * Returns an LTR Result Iterator -- used only for training, as we really |
744 | | * want to ignore all BiDi smarts at that point. |
745 | | * The caller must delete the returned iterator once done with it. |
746 | | */ |
747 | | LTRResultIterator *GetLTRIterator(); |
748 | | |
749 | | /** |
750 | | * Returns the length of the output text string, as UTF8, assuming |
751 | | * one newline per line and one per block, with a terminator, |
752 | | * and assuming a single character reject marker for each rejected character. |
753 | | * The number of recognized blobs is also returned, in blob_count. |
754 | | */ |
755 | | int TextLength(int *blob_count) const; |
756 | | |
757 | | //// paragraphs.cpp //////////////////////////////////////////////////// |
758 | | void DetectParagraphs(bool after_text_recognition); /**< NOTE(review): presumably runs paragraph detection, with after_text_recognition saying whether recognition has already been done -- confirm in paragraphs.cpp. */ |
759 | | |
760 | 0 | const PAGE_RES *GetPageRes() const { |
761 | 0 | return this->page_res_; /* accessor: read-only view of the stored page_res_ pointer */ |
762 | 0 | } |
763 | | |
764 | | protected: |
765 | | Tesseract *tesseract_; ///< The underlying data object. |
766 | | Tesseract *osd_tesseract_; ///< For orientation & script detection. |
767 | | EquationDetect *equ_detect_; ///< The equation detector. |
768 | | FileReader reader_; ///< Reads files from any filesystem. |
769 | | ImageThresholder *thresholder_; ///< Image thresholding module. |
770 | | std::vector<ParagraphModel *> *paragraph_models_; /**< NOTE(review): undocumented in the original; presumably the paragraph models for the current page -- confirm. */ |
771 | | BLOCK_LIST *block_list_; ///< The page layout. |
772 | | PAGE_RES *page_res_; ///< The page-level data. |
773 | | std::string input_file_; ///< Name used by training code. |
774 | | std::string output_file_; ///< Name used by debug code. |
775 | | std::string datapath_; ///< Current location of tessdata. |
776 | | std::string language_; ///< Last initialized language. |
777 | | OcrEngineMode last_oem_requested_; ///< Last OCR engine mode requested. |
778 | | bool recognition_done_; ///< True when page_res_ contains recognition data. |
779 | | |
780 | | /** |
781 | | * @defgroup ThresholderParams Thresholder Parameters |
782 | | * Parameters saved from the Thresholder. They are needed to rebuild coordinates. NOTE(review): rect_* presumably describe the processed rectangle and image_* the full image size -- confirm. |
783 | | */ |
784 | | /* @{ */ |
785 | | int rect_left_; |
786 | | int rect_top_; |
787 | | int rect_width_; |
788 | | int rect_height_; |
789 | | int image_width_; |
790 | | int image_height_; |
791 | | /* @} */ |
792 | | |
793 | | private: |
794 | | // An input that is a list of image filenames gets special consideration: it is handled by this function. |
795 | | bool ProcessPagesFileList(FILE *fp, std::string *buf, |
796 | | const char *retry_config, int timeout_millisec, |
797 | | TessResultRenderer *renderer, |
798 | | int tessedit_page_number); |
799 | | // TIFF supports multiple pages per file, so it gets special consideration: it is handled by this function. |
800 | | bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, |
801 | | const char *filename, const char *retry_config, |
802 | | int timeout_millisec, |
803 | | TessResultRenderer *renderer, |
804 | | int tessedit_page_number); |
805 | | }; // class TessBaseAPI. |
806 | | |
807 | | /** Escapes a char string: replaces the characters &<>"' with HTML codes and returns the result as a std::string. */ |
808 | | std::string HOcrEscape(const char *text); |
809 | | |
810 | | } // namespace tesseract |
811 | | |
812 | | #endif // TESSERACT_API_BASEAPI_H_ |