/src/tesseract/include/tesseract/pageiterator.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // SPDX-License-Identifier: Apache-2.0  | 
2  |  | // File:        pageiterator.h  | 
3  |  | // Description: Iterator for tesseract page structure that avoids using  | 
4  |  | //              tesseract internal data structures.  | 
5  |  | // Author:      Ray Smith  | 
6  |  | //  | 
7  |  | // (C) Copyright 2010, Google Inc.  | 
8  |  | // Licensed under the Apache License, Version 2.0 (the "License");  | 
9  |  | // you may not use this file except in compliance with the License.  | 
10  |  | // You may obtain a copy of the License at  | 
11  |  | // http://www.apache.org/licenses/LICENSE-2.0  | 
12  |  | // Unless required by applicable law or agreed to in writing, software  | 
13  |  | // distributed under the License is distributed on an "AS IS" BASIS,  | 
14  |  | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
15  |  | // See the License for the specific language governing permissions and  | 
16  |  | // limitations under the License.  | 
17  |  |  | 
18  |  | #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_  | 
19  |  | #define TESSERACT_CCMAIN_PAGEITERATOR_H_  | 
20  |  |  | 
21  |  | #include "export.h"  | 
22  |  | #include "publictypes.h"  | 
23  |  |  | 
24  |  | struct Pix;  | 
25  |  | struct Pta;  | 
26  |  |  | 
27  |  | namespace tesseract { | 
28  |  |  | 
29  |  | struct BlamerBundle;  | 
30  |  | class C_BLOB_IT;  | 
31  |  | class PAGE_RES;  | 
32  |  | class PAGE_RES_IT;  | 
33  |  | class WERD;  | 
34  |  |  | 
35  |  | class Tesseract;  | 
36  |  |  | 
37  |  | /**  | 
38  |  |  * Class to iterate over tesseract page structure, providing access to all  | 
39  |  |  * levels of the page hierarchy, without including any tesseract headers or  | 
40  |  |  * having to handle any tesseract structures.  | 
41  |  |  * WARNING! This class points to data held within the TessBaseAPI class, and  | 
42  |  |  * therefore can only be used while the TessBaseAPI class still exists and  | 
43  |  |  * has not been subjected to a call of Init, SetImage, Recognize, Clear, End,  | 
44  |  |  * DetectOS, or anything else that changes the internal PAGE_RES.  | 
45  |  |  * See tesseract/publictypes.h for the definition of PageIteratorLevel.  | 
46  |  |  * See also ResultIterator, derived from PageIterator, which adds in the  | 
47  |  |  * ability to access OCR output with text-specific methods.  | 
48  |  |  */  | 
49  |  |  | 
50  |  | class TESS_API PageIterator { | 
51  |  | public:  | 
52  |  |   /**  | 
53  |  |    * page_res and tesseract come directly from the BaseAPI.  | 
54  |  |    * The rectangle parameters are copied indirectly from the Thresholder,  | 
55  |  |    * via the BaseAPI. They represent the coordinates of some rectangle in an  | 
56  |  |    * original image (in top-left-origin coordinates) and therefore the top-left  | 
57  |  |    * needs to be added to any output boxes in order to specify coordinates  | 
58  |  |    * in the original image. See TessBaseAPI::SetRectangle.  | 
59  |  |    * The scale and scaled_yres are in case the Thresholder scaled the image  | 
60  |  |    * rectangle prior to thresholding. Any coordinates in tesseract's image  | 
61  |  |    * must be divided by scale before adding (rect_left, rect_top).  | 
62  |  |    * The scaled_yres indicates the effective resolution of the binary image  | 
63  |  |    * that tesseract has been given by the Thresholder.  | 
64  |  |    * After the constructor, Begin has already been called.  | 
65  |  |    */  | 
66  |  |   PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,  | 
67  |  |                int scaled_yres, int rect_left, int rect_top, int rect_width,  | 
68  |  |                int rect_height);  | 
69  |  |   virtual ~PageIterator();  | 
70  |  |  | 
71  |  |   /**  | 
72  |  |    * Page/ResultIterators may be copied! This makes it possible to iterate over  | 
73  |  |    * all the objects at a lower level, while maintaining an iterator to  | 
74  |  |    * objects at a higher level. These constructors DO NOT CALL Begin, so  | 
75  |  |    * iterations will continue from the location of src.  | 
76  |  |    */  | 
77  |  |   PageIterator(const PageIterator &src);  | 
78  |  |   const PageIterator &operator=(const PageIterator &src);  | 
79  |  |  | 
80  |  |   /** Are we positioned at the same location as other? */  | 
81  |  |   bool PositionedAtSameWord(const PAGE_RES_IT *other) const;  | 
82  |  |  | 
83  |  |   // ============= Moving around within the page ============.  | 
84  |  |  | 
85  |  |   /**  | 
86  |  |    * Moves the iterator to point to the start of the page to begin an  | 
87  |  |    * iteration.  | 
88  |  |    */  | 
89  |  |   virtual void Begin();  | 
90  |  |  | 
91  |  |   /**  | 
92  |  |    * Moves the iterator to the beginning of the paragraph.  | 
93  |  |    * This class implements this functionality by moving it to the zero indexed  | 
94  |  |    * blob of the first (leftmost) word on the first row of the paragraph.  | 
95  |  |    */  | 
96  |  |   virtual void RestartParagraph();  | 
97  |  |  | 
98  |  |   /**  | 
99  |  |    * Return whether this iterator points anywhere in the first textline of a  | 
100  |  |    * paragraph.  | 
101  |  |    */  | 
102  |  |   bool IsWithinFirstTextlineOfParagraph() const;  | 
103  |  |  | 
104  |  |   /**  | 
105  |  |    * Moves the iterator to the beginning of the text line.  | 
106  |  |    * This class implements this functionality by moving it to the zero indexed  | 
107  |  |    * blob of the first (leftmost) word of the row.  | 
108  |  |    */  | 
109  |  |   virtual void RestartRow();  | 
110  |  |  | 
111  |  |   /**  | 
112  |  |    * Moves to the start of the next object at the given level in the  | 
113  |  |    * page hierarchy, and returns false if the end of the page was reached.  | 
114  |  |    * NOTE that RIL_SYMBOL will skip non-text blocks, but all other  | 
115  |  |    * PageIteratorLevel level values will visit each non-text block once.  | 
116  |  |    * Think of non text blocks as containing a single para, with a single line,  | 
117  |  |    * with a single imaginary word.  | 
118  |  |    * Calls to Next with different levels may be freely intermixed.  | 
119  |  |    * This function iterates words in right-to-left scripts correctly, if  | 
120  |  |    * the appropriate language has been loaded into Tesseract.  | 
121  |  |    */  | 
122  |  |   virtual bool Next(PageIteratorLevel level);  | 
123  |  |  | 
124  |  |   /**  | 
125  |  |    * Returns true if the iterator is at the start of an object at the given  | 
126  |  |    * level.  | 
127  |  |    *  | 
128  |  |    * For instance, suppose an iterator it is pointed to the first symbol of the  | 
129  |  |    * first word of the third line of the second paragraph of the first block in  | 
130  |  |    * a page, then:  | 
131  |  |    *   it.IsAtBeginningOf(RIL_BLOCK) = false  | 
132  |  |    *   it.IsAtBeginningOf(RIL_PARA) = false  | 
133  |  |    *   it.IsAtBeginningOf(RIL_TEXTLINE) = true  | 
134  |  |    *   it.IsAtBeginningOf(RIL_WORD) = true  | 
135  |  |    *   it.IsAtBeginningOf(RIL_SYMBOL) = true  | 
136  |  |    */  | 
137  |  |   virtual bool IsAtBeginningOf(PageIteratorLevel level) const;  | 
138  |  |  | 
139  |  |   /**  | 
140  |  |    * Returns whether the iterator is positioned at the last element in a  | 
141  |  |    * given level. (e.g. the last word in a line, the last line in a block)  | 
142  |  |    *  | 
143  |  |    *     Here's some two-paragraph example  | 
144  |  |    *   text.  It starts off innocuously  | 
145  |  |    *   enough but quickly turns bizarre.  | 
146  |  |    *     The author inserts a cornucopia  | 
147  |  |    *   of words to guard against confused  | 
148  |  |    *   references.  | 
149  |  |    *  | 
150  |  |    * Now take an iterator it pointed to the start of "bizarre."  | 
151  |  |    *  it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false  | 
152  |  |    *  it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true  | 
153  |  |    *  it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false  | 
154  |  |    */  | 
155  |  |   virtual bool IsAtFinalElement(PageIteratorLevel level,  | 
156  |  |                                 PageIteratorLevel element) const;  | 
157  |  |  | 
158  |  |   /**  | 
159  |  |    * Returns whether this iterator is positioned  | 
160  |  |    *   before other:   -1  | 
161  |  |    *   equal to other:  0  | 
162  |  |    *   after other:     1  | 
163  |  |    */  | 
164  |  |   int Cmp(const PageIterator &other) const;  | 
165  |  |  | 
166  |  |   // ============= Accessing data ==============.  | 
167  |  |   // Coordinate system:  | 
168  |  |   // Integer coordinates are at the cracks between the pixels.  | 
169  |  |   // The top-left corner of the top-left pixel in the image is at (0,0).  | 
170  |  |   // The bottom-right corner of the bottom-right pixel in the image is at  | 
171  |  |   // (width, height).  | 
172  |  |   // Every bounding box goes from the top-left of the top-left contained  | 
173  |  |   // pixel to the bottom-right of the bottom-right contained pixel, so  | 
174  |  |   // the bounding box of the single top-left pixel in the image is:  | 
175  |  |   // (0,0)->(1,1).  | 
176  |  |   // If an image rectangle has been set in the API, then returned coordinates  | 
177  |  |   // relate to the original (full) image, rather than the rectangle.  | 
178  |  |  | 
179  |  |   /**  | 
180  |  |    * Controls what to include in a bounding box. Bounding boxes of all levels  | 
181  |  |    * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.  | 
182  |  |    * Between layout analysis and recognition, it isn't known where all  | 
183  |  |    * diacritics belong, so this control is used to include or exclude some  | 
184  |  |    * diacritics that are above or below the main body of the word. In most cases  | 
185  |  |    * where the placement is obvious, and after recognition, it doesn't make as  | 
186  |  |    * much difference, as the diacritics will already be included in the word.  | 
187  |  |    */  | 
188  |  |   void SetBoundingBoxComponents(bool include_upper_dots,  | 
189  | 0  |                                 bool include_lower_dots) { | 
190  | 0  |     include_upper_dots_ = include_upper_dots;  | 
191  | 0  |     include_lower_dots_ = include_lower_dots;  | 
192  | 0  |   }  | 
193  |  |  | 
194  |  |   /**  | 
195  |  |    * Returns the bounding rectangle of the current object at the given level.  | 
196  |  |    * See comment on coordinate system above.  | 
197  |  |    * Returns false if there is no such object at the current position.  | 
198  |  |    * The returned bounding box is guaranteed to match the size and position  | 
199  |  |    * of the image returned by GetBinaryImage, but may clip foreground pixels  | 
200  |  |    * from a grey image. The padding argument to GetImage can be used to expand  | 
201  |  |    * the image to include more foreground pixels. See GetImage below.  | 
202  |  |    */  | 
203  |  |   bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,  | 
204  |  |                    int *bottom) const;  | 
205  |  |   bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,  | 
206  |  |                    int *right, int *bottom) const;  | 
207  |  |   /**  | 
208  |  |    * Returns the bounding rectangle of the object in a coordinate system of the  | 
209  |  |    * working image rectangle having its origin at (rect_left_, rect_top_) with  | 
210  |  |    * respect to the original image and is scaled by a factor scale_.  | 
211  |  |    */  | 
212  |  |   bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,  | 
213  |  |                            int *right, int *bottom) const;  | 
214  |  |  | 
215  |  |   /** Returns whether there is no object of a given level. */  | 
216  |  |   bool Empty(PageIteratorLevel level) const;  | 
217  |  |  | 
218  |  |   /**  | 
219  |  |    * Returns the type of the current block.  | 
220  |  |    * See tesseract/publictypes.h for PolyBlockType.  | 
221  |  |    */  | 
222  |  |   PolyBlockType BlockType() const;  | 
223  |  |  | 
224  |  |   /**  | 
225  |  |    * Returns the polygon outline of the current block. The returned Pta must  | 
226  |  |    * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices  | 
227  |  |    * of the polygon, and the last edge is the line segment between the last  | 
228  |  |    * point and the first point. nullptr will be returned if the iterator is  | 
229  |  |    * at the end of the document or layout analysis was not used.  | 
230  |  |    */  | 
231  |  |   Pta *BlockPolygon() const;  | 
232  |  |  | 
233  |  |   /**  | 
234  |  |    * Returns a binary image of the current object at the given level.  | 
235  |  |    * The position and size match the return from BoundingBoxInternal, and so  | 
236  |  |    * this could be upscaled with respect to the original input image.  | 
237  |  |    * Use pixDestroy to delete the image after use.  | 
238  |  |    */  | 
239  |  |   Pix *GetBinaryImage(PageIteratorLevel level) const;  | 
240  |  |  | 
241  |  |   /**  | 
242  |  |    * Returns an image of the current object at the given level in greyscale  | 
243  |  |    * if available in the input. To guarantee a binary image use GetBinaryImage.  | 
244  |  |    * NOTE that in order to give the best possible image, the bounds are  | 
245  |  |    * expanded slightly over the binary connected component, by the supplied  | 
246  |  |    * padding, so the top-left position of the returned image is returned  | 
247  |  |    * in (left,top). These will most likely not match the coordinates  | 
248  |  |    * returned by BoundingBox.  | 
249  |  |    * If you do not supply an original image, you will get a binary one.  | 
250  |  |    * Use pixDestroy to delete the image after use.  | 
251  |  |    */  | 
252  |  |   Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,  | 
253  |  |                 int *left, int *top) const;  | 
254  |  |  | 
255  |  |   /**  | 
256  |  |    * Returns the baseline of the current object at the given level.  | 
257  |  |    * The baseline is the line that passes through (x1, y1) and (x2, y2).  | 
258  |  |    * WARNING: with vertical text, baselines may be vertical!  | 
259  |  |    * Returns false if there is no baseline at the current position.  | 
260  |  |    */  | 
261  |  |   bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,  | 
262  |  |                 int *y2) const;  | 
263  |  |  | 
264  |  |   // Returns the attributes of the current row.  | 
265  |  |   void RowAttributes(float *row_height, float *descenders,  | 
266  |  |                      float *ascenders) const;  | 
267  |  |  | 
268  |  |   /**  | 
269  |  |    * Returns orientation for the block the iterator points to.  | 
270  |  |    *   orientation, writing_direction, textline_order: see publictypes.h  | 
271  |  |    *   deskew_angle: after rotating the block so the text orientation is  | 
272  |  |    *                 upright, how many radians does one have to rotate the  | 
273  |  |    *                 block anti-clockwise for it to be level?  | 
274  |  |    *                   -Pi/4 <= deskew_angle <= Pi/4  | 
275  |  |    */  | 
276  |  |   void Orientation(tesseract::Orientation *orientation,  | 
277  |  |                    tesseract::WritingDirection *writing_direction,  | 
278  |  |                    tesseract::TextlineOrder *textline_order,  | 
279  |  |                    float *deskew_angle) const;  | 
280  |  |  | 
281  |  |   /**  | 
282  |  |    * Returns information about the current paragraph, if available.  | 
283  |  |    *  | 
284  |  |    *   justification -  | 
285  |  |    *     LEFT if ragged right, or fully justified and script is left-to-right.  | 
286  |  |    *     RIGHT if ragged left, or fully justified and script is right-to-left.  | 
287  |  |    *     unknown if it looks like source code or we have very few lines.  | 
288  |  |    *   is_list_item -  | 
289  |  |    *     true if we believe this is a member of an ordered or unordered list.  | 
290  |  |    *   is_crown -  | 
291  |  |    *     true if the first line of the paragraph is aligned with the other  | 
292  |  |    *     lines of the paragraph even though subsequent paragraphs have first  | 
293  |  |    *     line indents.  This typically indicates that this is the continuation  | 
294  |  |    *     of a previous paragraph or that it is the very first paragraph in  | 
295  |  |    *     the chapter.  | 
296  |  |    *   first_line_indent -  | 
297  |  |    *     For LEFT aligned paragraphs, the first text line of paragraphs of  | 
298  |  |    *     this kind are indented this many pixels from the left edge of the  | 
299  |  |    *     rest of the paragraph.  | 
300  |  |    *     For RIGHT aligned paragraphs, the first text line of paragraphs of  | 
301  |  |    *     this kind are indented this many pixels from the right edge of the  | 
302  |  |    *     rest of the paragraph.  | 
303  |  |    *     NOTE 1: This value may be negative.  | 
304  |  |    *     NOTE 2: if *is_crown == true, the first line of this paragraph is  | 
305  |  |    *             actually flush, and first_line_indent is set to the "common"  | 
306  |  |    *             first_line_indent for subsequent paragraphs in this block  | 
307  |  |    *             of text.  | 
308  |  |    */  | 
309  |  |   void ParagraphInfo(tesseract::ParagraphJustification *justification,  | 
310  |  |                      bool *is_list_item, bool *is_crown,  | 
311  |  |                      int *first_line_indent) const;  | 
312  |  |  | 
313  |  |   // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle  | 
314  |  |   // of the current word to the given pointer (takes ownership of the pointer)  | 
315  |  |   // and returns true.  | 
316  |  |   // Can only be used when iterating on the word level.  | 
317  |  |   bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);  | 
318  |  |  | 
319  |  | protected:  | 
320  |  |   /**  | 
321  |  |    * Sets up the internal data for iterating the blobs of a new word, then  | 
322  |  |    * moves the iterator to the given offset.  | 
323  |  |    */  | 
324  |  |   void BeginWord(int offset);  | 
325  |  |  | 
326  |  |   /** Pointer to the page_res owned by the API. */  | 
327  |  |   PAGE_RES *page_res_;  | 
328  |  |   /** Pointer to the Tesseract object owned by the API. */  | 
329  |  |   Tesseract *tesseract_;  | 
330  |  |   /**  | 
331  |  |    * The iterator to the page_res_. Owned by this PageIterator.  | 
332  |  |    * A pointer just to avoid dragging in Tesseract includes.  | 
333  |  |    */  | 
334  |  |   PAGE_RES_IT *it_;  | 
335  |  |   /**  | 
336  |  |    * The current input WERD being iterated. If there is an output from OCR,  | 
337  |  |    * then word_ is nullptr. Owned by the API.  | 
338  |  |    */  | 
339  |  |   WERD *word_;  | 
340  |  |   /** The length of the current word_. */  | 
341  |  |   int word_length_;  | 
342  |  |   /** The current blob index within the word. */  | 
343  |  |   int blob_index_;  | 
344  |  |   /**  | 
345  |  |    * Iterator to the blobs within the word. If nullptr, then we are iterating  | 
346  |  |    * OCR results in the box_word.  | 
347  |  |    * Owned by this PageIterator.  | 
348  |  |    */  | 
349  |  |   C_BLOB_IT *cblob_it_;  | 
350  |  |   /** Control over what to include in bounding boxes. */  | 
351  |  |   bool include_upper_dots_;  | 
352  |  |   bool include_lower_dots_;  | 
353  |  |   /** Parameters saved from the Thresholder. Needed to rebuild coordinates. */  | 
354  |  |   int scale_;  | 
355  |  |   int scaled_yres_;  | 
356  |  |   int rect_left_;  | 
357  |  |   int rect_top_;  | 
358  |  |   int rect_width_;  | 
359  |  |   int rect_height_;  | 
360  |  | };  | 
361  |  |  | 
362  |  | } // namespace tesseract.  | 
363  |  |  | 
364  |  | #endif // TESSERACT_CCMAIN_PAGEITERATOR_H_  |