/src/tesseract/include/tesseract/baseapi.h
Line | Count | Source |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // File: baseapi.h |
3 | | // Description: Simple API for calling tesseract. |
4 | | // Author: Ray Smith |
5 | | // |
6 | | // (C) Copyright 2006, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | |
17 | | #ifndef TESSERACT_API_BASEAPI_H_ |
18 | | #define TESSERACT_API_BASEAPI_H_ |
19 | | |
20 | | #ifdef HAVE_CONFIG_H |
21 | | # include "config_auto.h" // DISABLED_LEGACY_ENGINE |
22 | | #endif |
23 | | |
24 | | #include "export.h" |
25 | | #include "pageiterator.h" |
26 | | #include "publictypes.h" |
27 | | #include "resultiterator.h" |
28 | | #include "unichar.h" |
29 | | |
30 | | #include <tesseract/version.h> |
31 | | |
32 | | #include <cstdio> |
33 | | #include <vector> // for std::vector |
34 | | |
35 | | struct Pix; |
36 | | struct Pixa; |
37 | | struct Boxa; |
38 | | |
39 | | namespace tesseract { |
40 | | |
41 | | class PAGE_RES; |
42 | | class ParagraphModel; |
43 | | class BLOCK_LIST; |
44 | | class ETEXT_DESC; |
45 | | struct OSResults; |
46 | | class UNICHARSET; |
47 | | |
48 | | class Dawg; |
49 | | class Dict; |
50 | | class EquationDetect; |
51 | | class PageIterator; |
52 | | class ImageThresholder; |
53 | | class LTRResultIterator; |
54 | | class ResultIterator; |
55 | | class MutableIterator; |
56 | | class TessResultRenderer; |
57 | | class Tesseract; |
58 | | |
59 | | // Function to read a std::vector<char> from a whole file. |
60 | | // Returns false on failure. |
61 | | using FileReader = bool (*)(const char *filename, std::vector<char> *data); |
62 | | |
63 | | using DictFunc = int (Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, |
64 | | bool) const; |
65 | | using ProbabilityInContextFunc = double (Dict::*)(const char *, const char *, |
66 | | int, const char *, int); |
67 | | |
68 | | /** |
69 | | * Base class for all tesseract APIs. |
70 | | * Specific classes can add ability to work on different inputs or produce |
71 | | * different outputs. |
72 | | * This class is mostly an interface layer on top of the Tesseract instance |
73 | | * class to hide the data types so that users of this class don't have to |
74 | | * include any other Tesseract headers. |
75 | | */ |
76 | | class TESS_API TessBaseAPI { |
77 | | public: |
78 | | TessBaseAPI(); |
79 | | virtual ~TessBaseAPI(); |
80 | | // Copy constructor and assignment operator are currently unsupported. |
81 | | TessBaseAPI(TessBaseAPI const &) = delete; |
82 | | TessBaseAPI &operator=(TessBaseAPI const &) = delete; |
83 | | |
84 | | /** |
85 | | * Returns the version identifier as a static string. Do not delete. |
86 | | */ |
87 | | static const char *Version(); |
88 | | |
89 | | /** |
90 | | * Set the name of the input file. Needed for training and |
91 | | * reading a UNLV zone file, and for searchable PDF output. |
92 | | */ |
93 | | void SetInputName(const char *name); |
94 | | /** |
95 | | * These functions are required for searchable PDF output. |
96 | | * We need our hands on the input file so that we can include |
97 | | * it in the PDF without transcoding. If that is not possible, |
98 | | * we need the original image. Finally, resolution metadata |
99 | | * is stored in the PDF so we need that as well. |
100 | | */ |
101 | | const char *GetInputName(); |
102 | | // Takes ownership of the input pix. |
103 | | void SetInputImage(Pix *pix); |
104 | | Pix *GetInputImage(); |
105 | | int GetSourceYResolution(); |
106 | | const char *GetDatapath(); |
107 | | |
108 | | /** Set the name of the bonus output files. Needed only for debugging. */ |
109 | | void SetOutputName(const char *name); |
110 | | |
111 | | /** |
112 | | * Set the value of an internal "parameter." |
113 | | * Supply the name of the parameter and the value as a string, just as |
114 | | * you would in a config file. |
115 | | * Returns false if the name lookup failed. |
116 | | * Eg SetVariable("tessedit_char_blacklist", "xyz"); to ignore x, y and z. |
117 | | * Or SetVariable("classify_bln_numeric_mode", "1"); to set numeric-only mode. |
118 | | * SetVariable may be used before Init, but settings will revert to |
119 | | * defaults on End(). |
120 | | * |
121 | | * Note: Must be called after Init(). Only works for non-init variables |
122 | | * (init variables should be passed to Init()). |
123 | | */ |
124 | | bool SetVariable(const char *name, const char *value); |
125 | | bool SetDebugVariable(const char *name, const char *value); |
126 | | |
127 | | /** |
128 | | * Returns true if the parameter was found among Tesseract parameters. |
129 | | * Fills in value with the value of the parameter. |
130 | | */ |
131 | | bool GetIntVariable(const char *name, int *value) const; |
132 | | bool GetBoolVariable(const char *name, bool *value) const; |
133 | | bool GetDoubleVariable(const char *name, double *value) const; |
134 | | |
135 | | /** |
136 | | * Returns the pointer to the string that represents the value of the |
137 | | * parameter if it was found among Tesseract parameters. |
138 | | */ |
139 | | const char *GetStringVariable(const char *name) const; |
140 | | |
141 | | #ifndef DISABLED_LEGACY_ENGINE |
142 | | |
143 | | /** |
144 | | * Print Tesseract fonts table to the given file. |
145 | | */ |
146 | | void PrintFontsTable(FILE *fp) const; |
147 | | |
148 | | #endif |
149 | | |
150 | | /** |
151 | | * Print Tesseract parameters to the given file. |
152 | | */ |
153 | | void PrintVariables(FILE *fp) const; |
154 | | |
155 | | /** |
156 | | * Get value of named variable as a string, if it exists. |
157 | | */ |
158 | | bool GetVariableAsString(const char *name, std::string *val) const; |
159 | | |
160 | | /** |
161 | | * Instances are now mostly thread-safe and totally independent, |
162 | | * but some global parameters remain. Basically it is safe to use multiple |
163 | | * TessBaseAPIs in different threads in parallel, UNLESS: |
164 | | * you use SetVariable on some of the Params in classify and textord. |
165 | | * If you do, then the effect will be to change it for all your instances. |
166 | | * |
167 | | * Start tesseract. Returns zero on success and -1 on failure. |
168 | | * NOTE that the only members that may be called before Init are those |
169 | | * listed above here in the class definition. |
170 | | * |
171 | | * The datapath must be the name of the tessdata directory. |
172 | | * The language is (usually) an ISO 639-3 string; nullptr defaults to |
173 | | * eng. It is entirely safe (and eventually will be efficient too) to call |
174 | | * Init multiple times on the same instance to change language, or just |
175 | | * to reset the classifier. |
176 | | * The language may be a string of the form [~]<lang>[+[~]<lang>]* indicating |
177 | | * that multiple languages are to be loaded. Eg hin+eng will load Hindi and |
178 | | * English. Languages may specify internally that they want to be loaded |
179 | | * with one or more other languages, so the ~ sign is available to override |
180 | | * that. Eg if hin were set to load eng by default, then hin+~eng would force |
181 | | * loading only hin. The number of loaded languages is limited only by |
182 | | * memory, with the caveat that loading additional languages will impact |
183 | | * both speed and accuracy, as there is more work to do to decide on the |
184 | | * applicable language, and there is more chance of hallucinating incorrect |
185 | | * words. |
186 | | * WARNING: On changing languages, all Tesseract parameters are reset |
187 | | * back to their default values. (Which may vary between languages.) |
188 | | * If you have a rare need to set a Variable that controls |
189 | | * initialization for a second call to Init you should explicitly |
190 | | * call End() and then use SetVariable before Init. This is only a very |
191 | | * rare use case, since there are very few uses that require any parameters |
192 | | * to be set before Init. |
193 | | * |
194 | | * If set_only_non_debug_params is true, only params that do not contain |
195 | | * "debug" in the name will be set. |
196 | | */ |
197 | | int Init(const char *datapath, const char *language, OcrEngineMode mode, |
198 | | char **configs, int configs_size, |
199 | | const std::vector<std::string> *vars_vec, |
200 | | const std::vector<std::string> *vars_values, |
201 | | bool set_only_non_debug_params); |
202 | 0 | int Init(const char *datapath, const char *language, OcrEngineMode oem) { // Convenience overload: caller supplies only datapath, language and engine mode.
203 | 0 | return Init(datapath, language, oem, nullptr, 0, nullptr, nullptr, false); // Forwards to the full Init: configs = nullptr, configs_size = 0, vars_vec = nullptr, vars_values = nullptr, set_only_non_debug_params = false.
204 | 0 | } // Result is whatever the full Init overload returns.
205 | 4 | int Init(const char *datapath, const char *language) { // Simplest overload: like the full Init, with the engine mode fixed to OEM_DEFAULT.
206 | 4 | return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr, // configs = nullptr, configs_size = 0, vars_vec = nullptr, vars_values = nullptr.
207 | 4 | false); // set_only_non_debug_params = false.
208 | 4 | } // Result is whatever the full Init overload returns.
209 | | // In-memory version reads the traineddata file directly from the given |
210 | | // data[data_size] array, and/or reads data via a FileReader. |
211 | | int Init(const char *data, int data_size, const char *language, |
212 | | OcrEngineMode mode, char **configs, int configs_size, |
213 | | const std::vector<std::string> *vars_vec, |
214 | | const std::vector<std::string> *vars_values, |
215 | | bool set_only_non_debug_params, FileReader reader); |
216 | | |
217 | | /** |
218 | | * Returns the languages string used in the last valid initialization. |
219 | | * If the last initialization specified "deu+hin" then that will be |
220 | | * returned. If hin loaded eng automatically as well, then that will |
221 | | * not be included in this list. To find the languages actually |
222 | | * loaded use GetLoadedLanguagesAsVector. |
223 | | * The returned string should NOT be deleted. |
224 | | */ |
225 | | const char *GetInitLanguagesAsString() const; |
226 | | |
227 | | /** |
228 | | * Returns the loaded languages in the vector of std::string. |
229 | | * Includes all languages loaded by the last Init, including those loaded |
230 | | * as dependencies of other loaded languages. |
231 | | */ |
232 | | void GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const; |
233 | | |
234 | | /** |
235 | | * Returns the available languages in the sorted vector of std::string. |
236 | | */ |
237 | | void GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const; |
238 | | |
239 | | /** |
240 | | * Init only for page layout analysis. Use only for calls to SetImage and |
241 | | * AnalysePage. Calls that attempt recognition will generate an error. |
242 | | */ |
243 | | void InitForAnalysePage(); |
244 | | |
245 | | /** |
246 | | * Read a "config" file containing a set of param, value pairs. |
247 | | * Searches the standard places: tessdata/configs, tessdata/tessconfigs |
248 | | * and also accepts a relative or absolute path name. |
249 | | * Note: only non-init params will be set (init params are set by Init()). |
250 | | */ |
251 | | void ReadConfigFile(const char *filename); |
252 | | /** Same as above, but only set debug params from the given config file. */ |
253 | | void ReadDebugConfigFile(const char *filename); |
254 | | |
255 | | /** |
256 | | * Set the current page segmentation mode. Defaults to PSM_SINGLE_BLOCK. |
257 | | * The mode is stored as an IntParam so it can also be modified by |
258 | | * ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). |
259 | | */ |
260 | | void SetPageSegMode(PageSegMode mode); |
261 | | |
262 | | /** Return the current page segmentation mode. */ |
263 | | PageSegMode GetPageSegMode() const; |
264 | | |
265 | | /** |
266 | | * Recognize a rectangle from an image and return the result as a string. |
267 | | * May be called many times for a single Init. |
268 | | * Currently has no error checking. |
269 | | * Greyscale of 8 and color of 24 or 32 bits per pixel may be given. |
270 | | * Palette color images will not work properly and must be converted to |
271 | | * 24 bit. |
272 | | * Binary images of 1 bit per pixel may also be given but they must be |
273 | | * byte packed with the MSB of the first byte being the first pixel, and a |
274 | | * 1 represents WHITE. For binary images set bytes_per_pixel=0. |
275 | | * The recognized text is returned as a char* which is coded |
276 | | * as UTF8 and must be freed with the delete [] operator. |
277 | | * |
278 | | * Note that TesseractRect is the simplified convenience interface. |
279 | | * For advanced uses, use SetImage, (optionally) SetRectangle, Recognize, |
280 | | * and one or more of the Get*Text functions below. |
281 | | */ |
282 | | char *TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, |
283 | | int bytes_per_line, int left, int top, int width, |
284 | | int height); |
285 | | |
286 | | /** |
287 | | * Call between pages or documents etc to free up memory and forget |
288 | | * adaptive data. |
289 | | */ |
290 | | void ClearAdaptiveClassifier(); |
291 | | |
292 | | /** |
293 | | * @defgroup AdvancedAPI Advanced API |
294 | | * The following methods break TesseractRect into pieces, so you can |
295 | | * get hold of the thresholded image, get the text in different formats, |
296 | | * get bounding boxes, confidences etc. |
297 | | */ |
298 | | /* @{ */ |
299 | | |
300 | | /** |
301 | | * Provide an image for Tesseract to recognize. Format is as |
302 | | * TesseractRect above. Copies the image buffer and converts to Pix. |
303 | | * SetImage clears all recognition results, and sets the rectangle to the |
304 | | * full image, so it may be followed immediately by a GetUTF8Text, and it |
305 | | * will automatically perform recognition. |
306 | | */ |
307 | | void SetImage(const unsigned char *imagedata, int width, int height, |
308 | | int bytes_per_pixel, int bytes_per_line); |
309 | | |
310 | | /** |
311 | | * Provide an image for Tesseract to recognize. As with SetImage above, |
312 | | * Tesseract takes its own copy of the image, so it need not persist until |
313 | | * after Recognize. |
314 | | * Pix vs raw, which to use? |
315 | | * Use Pix where possible. Tesseract uses Pix as its internal representation |
316 | | * and it is therefore more efficient to provide a Pix directly. |
317 | | */ |
318 | | void SetImage(Pix *pix); |
319 | | |
320 | | /** |
321 | | * Set the resolution of the source image in pixels per inch so font size |
322 | | * information can be calculated in results. Call this after SetImage(). |
323 | | */ |
324 | | void SetSourceResolution(int ppi); |
325 | | |
326 | | /** |
327 | | * Restrict recognition to a sub-rectangle of the image. Call after SetImage. |
328 | | * Each SetRectangle clears the recognition results so multiple rectangles |
329 | | * can be recognized with the same image. |
330 | | */ |
331 | | void SetRectangle(int left, int top, int width, int height); |
332 | | |
333 | | /** |
334 | | * Get a copy of the internal thresholded image from Tesseract. |
335 | | * Caller takes ownership of the Pix and must pixDestroy it. |
336 | | * May be called any time after SetImage, or after TesseractRect. |
337 | | */ |
338 | | Pix *GetThresholdedImage(); |
339 | | |
340 | | /** |
341 | | * Return average gradient of lines on page. |
342 | | */ |
343 | | float GetGradient(); |
344 | | |
345 | | /** |
346 | | * Get the result of page layout analysis as a leptonica-style |
347 | | * Boxa, Pixa pair, in reading order. |
348 | | * Can be called before or after Recognize. |
349 | | */ |
350 | | Boxa *GetRegions(Pixa **pixa); |
351 | | |
352 | | /** |
353 | | * Get the textlines as a leptonica-style |
354 | | * Boxa, Pixa pair, in reading order. |
355 | | * Can be called before or after Recognize. |
356 | | * If raw_image is true, then extract from the original image instead of the |
357 | | * thresholded image and pad by raw_padding pixels. |
358 | | * If blockids is not nullptr, the block-id of each line is also returned as |
359 | | * an array of one element per line. delete [] after use. If paraids is not |
360 | | * nullptr, the paragraph-id of each line within its block is also returned as |
361 | | * an array of one element per line. delete [] after use. |
362 | | */ |
363 | | Boxa *GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, |
364 | | int **blockids, int **paraids); |
365 | | /* |
366 | | Convenience overload of GetTextlines (most common usage): extracts from the thresholded image (raw_image = false, raw_padding = 0) and does not return paragraph ids (paraids = nullptr). |
367 | | */ |
368 | 0 | Boxa *GetTextlines(Pixa **pixa, int **blockids) {
369 | 0 | return GetTextlines(false, 0, pixa, blockids, nullptr); // Forwards pixa and blockids unchanged to the five-argument overload.
370 | 0 | }
371 | | |
372 | | /** |
373 | | * Get textlines and strips of image regions as a leptonica-style Boxa, Pixa |
374 | | * pair, in reading order. Enables downstream handling of non-rectangular |
375 | | * regions. |
376 | | * Can be called before or after Recognize. |
377 | | * If blockids is not nullptr, the block-id of each line is also returned as |
378 | | * an array of one element per line. delete [] after use. |
379 | | */ |
380 | | Boxa *GetStrips(Pixa **pixa, int **blockids); |
381 | | |
382 | | /** |
383 | | * Get the words as a leptonica-style |
384 | | * Boxa, Pixa pair, in reading order. |
385 | | * Can be called before or after Recognize. |
386 | | */ |
387 | | Boxa *GetWords(Pixa **pixa); |
388 | | |
389 | | /** |
390 | | * Gets the individual connected (text) components (created |
391 | | * after page segmentation step, but before recognition) |
392 | | * as a leptonica-style Boxa, Pixa pair, in reading order. |
393 | | * Can be called before or after Recognize. |
394 | | * Note: the caller is responsible for calling boxaDestroy() |
395 | | * on the returned Boxa array and pixaDestroy() on cc array. |
396 | | */ |
397 | | Boxa *GetConnectedComponents(Pixa **cc); |
398 | | |
399 | | /** |
400 | | * Get the given level kind of components (block, textline, word etc.) as a |
401 | | * leptonica-style Boxa, Pixa pair, in reading order. |
402 | | * Can be called before or after Recognize. |
403 | | * If blockids is not nullptr, the block-id of each component is also returned |
404 | | * as an array of one element per component. delete [] after use. |
405 | | * If paraids is not nullptr, the paragraph-id of each component within its |
406 | | * block is also returned as an array of one element per component. delete [] |
407 | | * after use. If raw_image is true, then portions of the original image are |
408 | | * extracted instead of the thresholded image and padded with raw_padding. If |
409 | | * text_only is true, then only text components are returned. |
410 | | */ |
411 | | Boxa *GetComponentImages(PageIteratorLevel level, bool text_only, |
412 | | bool raw_image, int raw_padding, Pixa **pixa, |
413 | | int **blockids, int **paraids); |
414 | | // Helper function to get binary images with no padding (most common usage): forwards to the seven-argument overload with raw_image = false, raw_padding = 0 and paraids = nullptr. |
415 | | Boxa *GetComponentImages(const PageIteratorLevel level, const bool text_only, |
416 | 0 | Pixa **pixa, int **blockids) {
417 | 0 | return GetComponentImages(level, text_only, false, 0, pixa, blockids, // level, text_only, pixa and blockids are passed through unchanged.
418 | 0 | nullptr); // paraids = nullptr, so paragraph ids are not requested.
419 | 0 | }
420 | | |
421 | | /** |
422 | | * Returns the scale factor of the thresholded image that would be returned by |
423 | | * GetThresholdedImage() and the various GetX() methods that call |
424 | | * GetComponentImages(). |
425 | | * Returns 0 if no thresholder has been set. |
426 | | */ |
427 | | int GetThresholdedImageScaleFactor() const; |
428 | | |
429 | | /** |
430 | | * Runs page layout analysis in the mode set by SetPageSegMode. |
431 | | * May optionally be called prior to Recognize to get access to just |
432 | | * the page layout results. Returns an iterator to the results. |
433 | | * If merge_similar_words is true, words are combined where suitable for use |
434 | | * with a line recognizer. Use if you want to use AnalyseLayout to find the |
435 | | * textlines, and then want to process textline fragments with an external |
436 | | * line recognizer. |
437 | | * Returns nullptr on error or an empty page. |
438 | | * The returned iterator must be deleted after use. |
439 | | * WARNING! This class points to data held within the TessBaseAPI class, and |
440 | | * therefore can only be used while the TessBaseAPI class still exists and |
441 | | * has not been subjected to a call of Init, SetImage, Recognize, Clear, End, |
442 | | * DetectOS, or anything else that changes the internal PAGE_RES. |
443 | | */ |
444 | | PageIterator *AnalyseLayout(); |
445 | | PageIterator *AnalyseLayout(bool merge_similar_words); |
446 | | |
447 | | /** |
448 | | * Recognize the image from SetAndThresholdImage, generating Tesseract |
449 | | * internal structures. Returns 0 on success. |
450 | | * Optional. The Get*Text functions below will call Recognize if needed. |
451 | | * After Recognize, the output is kept internally until the next SetImage. |
452 | | */ |
453 | | int Recognize(ETEXT_DESC *monitor); |
454 | | |
455 | | /** |
456 | | * Methods to retrieve information after SetAndThresholdImage(), |
457 | | * Recognize() or TesseractRect(). (Recognize is called implicitly if needed.) |
458 | | */ |
459 | | |
460 | | /** |
461 | | * Turns images into symbolic text. |
462 | | * |
463 | | * filename can point to a single image, a multi-page TIFF, |
464 | | * or a plain text list of image filenames. |
465 | | * |
466 | | * retry_config is useful for debugging. If not nullptr, you can fall |
467 | | * back to an alternate configuration if a page fails for some |
468 | | * reason. |
469 | | * |
470 | | * timeout_millisec terminates processing if any single page |
471 | | * takes too long. Set to 0 for unlimited time. |
472 | | * |
473 | | * renderer is responsible for creating the output. For example, |
474 | | * use the TessTextRenderer if you want plaintext output, or |
475 | | * the TessPDFRenderer to produce searchable PDF. |
476 | | * |
477 | | * If tessedit_page_number is non-negative, will only process that |
478 | | * single page. Works for multi-page tiff file, or filelist. |
479 | | * |
480 | | * Returns true if successful, false on error. |
481 | | */ |
482 | | bool ProcessPages(const char *filename, const char *retry_config, |
483 | | int timeout_millisec, TessResultRenderer *renderer); |
484 | | // Does the real work of ProcessPages. |
485 | | bool ProcessPagesInternal(const char *filename, const char *retry_config, |
486 | | int timeout_millisec, TessResultRenderer *renderer); |
487 | | |
488 | | /** |
489 | | * Turn a single image into symbolic text. |
490 | | * |
491 | | * The pix is the image processed. filename and page_index are |
492 | | * metadata used by side-effect processes, such as reading a box |
493 | | * file or formatting as hOCR. |
494 | | * |
495 | | * See ProcessPages for descriptions of other parameters. |
496 | | */ |
497 | | bool ProcessPage(Pix *pix, int page_index, const char *filename, |
498 | | const char *retry_config, int timeout_millisec, |
499 | | TessResultRenderer *renderer); |
500 | | |
501 | | /** |
502 | | * Get a reading-order iterator to the results of LayoutAnalysis and/or |
503 | | * Recognize. The returned iterator must be deleted after use. |
504 | | * WARNING! This class points to data held within the TessBaseAPI class, and |
505 | | * therefore can only be used while the TessBaseAPI class still exists and |
506 | | * has not been subjected to a call of Init, SetImage, Recognize, Clear, End, |
507 | | * DetectOS, or anything else that changes the internal PAGE_RES. |
508 | | */ |
509 | | ResultIterator *GetIterator(); |
510 | | |
511 | | /** |
512 | | * Get a mutable iterator to the results of LayoutAnalysis and/or Recognize. |
513 | | * The returned iterator must be deleted after use. |
514 | | * WARNING! This class points to data held within the TessBaseAPI class, and |
515 | | * therefore can only be used while the TessBaseAPI class still exists and |
516 | | * has not been subjected to a call of Init, SetImage, Recognize, Clear, End, |
517 | | * DetectOS, or anything else that changes the internal PAGE_RES. |
518 | | */ |
519 | | MutableIterator *GetMutableIterator(); |
520 | | |
521 | | /** |
522 | | * The recognized text is returned as a char* which is coded |
523 | | * as UTF8 and must be freed with the delete [] operator. |
524 | | */ |
525 | | char *GetUTF8Text(); |
526 | | |
527 | | /** |
528 | | * Make a HTML-formatted string with hOCR markup from the internal |
529 | | * data structures. |
530 | | * page_number is 0-based but will appear in the output as 1-based. |
531 | | * monitor can be used to |
532 | | * cancel the recognition |
533 | | * receive progress callbacks |
534 | | * Returned string must be freed with the delete [] operator. |
535 | | */ |
536 | | char *GetHOCRText(ETEXT_DESC *monitor, int page_number); |
537 | | |
538 | | /** |
539 | | * Make a HTML-formatted string with hOCR markup from the internal |
540 | | * data structures. |
541 | | * page_number is 0-based but will appear in the output as 1-based. |
542 | | * Returned string must be freed with the delete [] operator. |
543 | | */ |
544 | | char *GetHOCRText(int page_number); |
545 | | |
546 | | /** |
547 | | * Make an XML-formatted string with Alto markup from the internal |
548 | | * data structures. |
549 | | */ |
550 | | char *GetAltoText(ETEXT_DESC *monitor, int page_number); |
551 | | |
552 | | /** |
553 | | * Make an XML-formatted string with Alto markup from the internal |
554 | | * data structures. |
555 | | */ |
556 | | char *GetAltoText(int page_number); |
557 | | |
558 | | /** |
559 | | * Make an XML-formatted string with PAGE markup from the internal |
560 | | * data structures. |
561 | | */ |
562 | | char *GetPAGEText(ETEXT_DESC *monitor, int page_number); |
563 | | |
564 | | /** |
565 | | * Make an XML-formatted string with PAGE markup from the internal |
566 | | * data structures. |
567 | | */ |
568 | | char *GetPAGEText(int page_number); |
569 | | |
570 | | /** |
571 | | * Make a TSV-formatted string from the internal data structures. |
572 | | * page_number is 0-based but will appear in the output as 1-based. |
573 | | * Returned string must be freed with the delete [] operator. |
574 | | */ |
575 | | char *GetTSVText(int page_number); |
576 | | |
577 | | /** |
578 | | * Make a box file for LSTM training from the internal data structures. |
579 | | * Constructs coordinates in the original image - not just the rectangle. |
580 | | * page_number is a 0-based page index that will appear in the box file. |
581 | | * Returned string must be freed with the delete [] operator. |
582 | | */ |
583 | | char *GetLSTMBoxText(int page_number); |
584 | | |
585 | | /** |
586 | | * The recognized text is returned as a char* which is coded in the same |
587 | | * format as a box file used in training. |
588 | | * Constructs coordinates in the original image - not just the rectangle. |
589 | | * page_number is a 0-based page index that will appear in the box file. |
590 | | * Returned string must be freed with the delete [] operator. |
591 | | */ |
592 | | char *GetBoxText(int page_number); |
593 | | |
594 | | /** |
595 | | * The recognized text is returned as a char* which is coded in the same |
596 | | * format as a WordStr box file used in training. |
597 | | * page_number is a 0-based page index that will appear in the box file. |
598 | | * Returned string must be freed with the delete [] operator. |
599 | | */ |
600 | | char *GetWordStrBoxText(int page_number); |
601 | | |
602 | | /** |
603 | | * The recognized text is returned as a char* which is coded |
604 | | * as UNLV format Latin-1 with specific reject and suspect codes. |
605 | | * Returned string must be freed with the delete [] operator. |
606 | | */ |
607 | | char *GetUNLVText(); |
608 | | |
609 | | /** |
610 | | * Detect the orientation of the input image and apparent script (alphabet). |
611 | | * orient_deg is the detected clockwise rotation of the input image in degrees |
612 | | * (0, 90, 180, 270) |
613 | | * orient_conf is the confidence (15.0 is reasonably confident) |
614 | | * script_name is an ASCII string, the name of the script, e.g. "Latin" |
615 | | * script_conf is confidence level in the script |
616 | | * Returns true on success and writes values to each parameter as an output |
617 | | */ |
618 | | bool DetectOrientationScript(int *orient_deg, float *orient_conf, |
619 | | const char **script_name, float *script_conf); |
620 | | |
621 | | /** |
622 | | * The recognized text is returned as a char* which is coded |
623 | | * as UTF8 and must be freed with the delete [] operator. |
624 | | * page_number is a 0-based page index that will appear in the osd file. |
625 | | */ |
626 | | char *GetOsdText(int page_number); |
627 | | |
628 | | /** Returns the (average) confidence value between 0 and 100. */ |
629 | | int MeanTextConf(); |
630 | | /** |
631 | | * Returns all word confidences (between 0 and 100) in an array, terminated |
632 | | * by -1. The calling function must delete [] after use. |
633 | | * The number of confidences should correspond to the number of space- |
634 | | * delimited words in GetUTF8Text. |
635 | | */ |
636 | | int *AllWordConfidences(); |
637 | | |
638 | | #ifndef DISABLED_LEGACY_ENGINE |
639 | | /** |
640 | | * Applies the given word to the adaptive classifier if possible. |
641 | | * The word must be SPACE-DELIMITED UTF-8 - l i k e t h i s , so it can |
642 | | * tell the boundaries of the graphemes. |
643 | | * Assumes that SetImage/SetRectangle have been used to set the image |
644 | | * to the given word. The mode arg should be PSM_SINGLE_WORD or |
645 | | * PSM_CIRCLE_WORD, as that will be used to control layout analysis. |
646 | | * The currently set PageSegMode is preserved. |
647 | | * Returns false if adaption was not possible for some reason. |
648 | | */ |
649 | | bool AdaptToWordStr(PageSegMode mode, const char *wordstr); |
650 | | #endif // ndef DISABLED_LEGACY_ENGINE |
651 | | |
652 | | /** |
653 | | * Free up recognition results and any stored image data, without actually |
654 | | * freeing any recognition data that would be time-consuming to reload. |
655 | | * Afterwards, you must call SetImage or TesseractRect before doing |
656 | | * any Recognize or Get* operation. |
657 | | */ |
658 | | void Clear(); |
659 | | |
660 | | /** |
661 | | * Close down tesseract and free up all memory. End() is equivalent to |
662 | | * destructing and reconstructing your TessBaseAPI. |
663 | | * Once End() has been used, none of the other API functions may be used |
664 | | * other than Init and anything declared above it in the class definition. |
665 | | */ |
666 | | void End(); |
667 | | |
668 | | /** |
669 | | * Clear any library-level memory caches. |
670 | | * There are a variety of expensive-to-load constant data structures (mostly |
671 | | * language dictionaries) that are cached globally -- surviving the Init() |
672 | | * and End() of individual TessBaseAPI's. This function allows the clearing |
673 | | * of these caches. |
674 | | **/ |
675 | | static void ClearPersistentCache(); |
676 | | |
677 | | /** |
678 | | * Check whether a word is valid according to Tesseract's language model |
679 | | * @return 0 if the word is invalid, non-zero if valid. |
680 | | * @warning temporary! This function will be removed from here and placed |
681 | | * in a separate API at some future time. |
682 | | */ |
683 | | int IsValidWord(const char *word) const; |
684 | | // Returns true if utf8_character is defined in the UniCharset. |
685 | | bool IsValidCharacter(const char *utf8_character) const; |
686 | | |
687 | | bool GetTextDirection(int *out_offset, float *out_slope); |
688 | | |
689 | | /** Sets Dict::letter_is_okay_ function to point to the given function. */ |
690 | | void SetDictFunc(DictFunc f); |
691 | | |
692 | | /** Sets Dict::probability_in_context_ function to point to the given |
693 | | * function. |
694 | | */ |
695 | | void SetProbabilityInContextFunc(ProbabilityInContextFunc f); |
696 | | |
697 | | /** |
698 | | * Estimates the Orientation And Script of the image. |
699 | | * @return true if the image was processed successfully. |
700 | | */ |
701 | | bool DetectOS(OSResults *); |
702 | | |
703 | | /** |
704 | | * Return text orientation of each block as determined by an earlier run |
705 | | * of layout analysis, through the two out-parameters. NOTE(review): these |
706 | | * are presumably arrays; their size and ownership are not stated here -- confirm. */ |
707 | | void GetBlockTextOrientations(int **block_orientation, |
708 | | bool **vertical_writing); |
709 | | |
710 | | /** This method returns the string form of the specified unichar. */ |
711 | | const char *GetUnichar(int unichar_id) const; |
712 | | |
713 | | /** Return the pointer to the i-th dawg loaded into tesseract_ object. */ |
714 | | const Dawg *GetDawg(int i) const; |
715 | | |
716 | | /** Return the number of dawgs loaded into tesseract_ object. */ |
717 | | int NumDawgs() const; |
718 | | /** Returns tesseract_, the underlying data object. */ |
719 | 0 | Tesseract *tesseract() const { |
720 | 0 | return tesseract_; |
721 | 0 | } |
722 | | /** Returns last_oem_requested_, the OCR engine mode last requested. */ |
723 | 0 | OcrEngineMode oem() const { |
724 | 0 | return last_oem_requested_; |
725 | 0 | } |
726 | | /** NOTE(review): undocumented here -- presumably sets the minimum margin required by orientation detection; confirm against the implementation. */ |
727 | | void set_min_orientation_margin(double margin); |
728 | | /* @} */ |
729 | | |
730 | | protected: |
731 | | /** Common code for setting the image. Returns true if Init has been called. |
732 | | */ |
733 | | bool InternalSetImage(); |
734 | | |
735 | | /** |
736 | | * Run the thresholder to make the thresholded image. If pix is not nullptr, |
737 | | * the source is thresholded to pix instead of the internal IMAGE. |
738 | | */ |
739 | | virtual bool Threshold(Pix **pix); |
740 | | |
741 | | /** |
742 | | * Find lines from the image making the BLOCK_LIST. |
743 | | * @return 0 on success. |
744 | | */ |
745 | | int FindLines(); |
746 | | |
747 | | /** Delete the pageres and block list ready for a new page. */ |
748 | | void ClearResults(); |
749 | | |
750 | | /** |
751 | | * Return an LTR Result Iterator -- used only for training, as we really want |
752 | | * to ignore all BiDi smarts at that point. |
753 | | * The caller must delete the returned iterator once done with it. |
754 | | */ |
755 | | LTRResultIterator *GetLTRIterator(); |
756 | | |
757 | | /** |
758 | | * Return the length of the output text string, as UTF8, assuming |
759 | | * one newline per line and one per block, with a terminator, |
760 | | * and assuming a single character reject marker for each rejected character. |
761 | | * Also return the number of recognized blobs in blob_count. |
762 | | */ |
763 | | int TextLength(int *blob_count) const; |
764 | | |
765 | | //// paragraphs.cpp //////////////////////////////////////////////////// |
766 | | void DetectParagraphs(bool after_text_recognition); |
767 | | /** Returns page_res_, the page-level data, as a pointer to const. */ |
768 | 0 | const PAGE_RES *GetPageRes() const { |
769 | 0 | return page_res_; |
770 | 0 | } |
771 | | |
772 | | protected: |
773 | | Tesseract *tesseract_; ///< The underlying data object. |
774 | | Tesseract *osd_tesseract_; ///< For orientation & script detection. |
775 | | EquationDetect *equ_detect_; ///< The equation detector. |
776 | | FileReader reader_; ///< Reads files from any filesystem. |
777 | | ImageThresholder *thresholder_; ///< Image thresholding module. |
778 | | std::vector<ParagraphModel *> *paragraph_models_; /**< NOTE(review): presumably the paragraph models produced by DetectParagraphs -- confirm. */ |
779 | | BLOCK_LIST *block_list_; ///< The page layout. |
780 | | PAGE_RES *page_res_; ///< The page-level data. |
781 | | std::string input_file_; ///< Name used by training code. |
782 | | std::string output_file_; ///< Name used by debug code. |
783 | | std::string datapath_; ///< Current location of tessdata. |
784 | | std::string language_; ///< Last initialized language. |
785 | | OcrEngineMode last_oem_requested_; ///< Last OCR engine mode requested. |
786 | | bool recognition_done_; ///< page_res_ contains recognition data. |
787 | | |
788 | | /** |
789 | | * @defgroup ThresholderParams Thresholder Parameters |
790 | | * Parameters saved from the Thresholder. Needed to rebuild coordinates. |
791 | | */ |
792 | | /* @{ */ |
793 | | int rect_left_; |
794 | | int rect_top_; |
795 | | int rect_width_; |
796 | | int rect_height_; |
797 | | int image_width_; |
798 | | int image_height_; |
799 | | /* @} */ |
800 | | |
801 | | private: |
802 | | // A list of image filenames gets special consideration |
803 | | bool ProcessPagesFileList(FILE *fp, std::string *buf, |
804 | | const char *retry_config, int timeout_millisec, |
805 | | TessResultRenderer *renderer, |
806 | | int tessedit_page_number); |
807 | | // TIFF supports multipage so gets special consideration. |
808 | | bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, |
809 | | const char *filename, const char *retry_config, |
810 | | int timeout_millisec, |
811 | | TessResultRenderer *renderer, |
812 | | int tessedit_page_number); |
813 | | }; // class TessBaseAPI. |
814 | | |
815 | | /** Escape a char string: returns a std::string in which &<>"' are replaced with HTML codes. */ |
816 | | std::string HOcrEscape(const char *text); |
817 | | |
818 | | } // namespace tesseract |
819 | | |
820 | | #endif // TESSERACT_API_BASEAPI_H_ |