/src/tesseract/include/tesseract/pageiterator.h
Line | Count | Source (jump to first uncovered line) |
1 | | // SPDX-License-Identifier: Apache-2.0 |
2 | | // File: pageiterator.h |
3 | | // Description: Iterator for tesseract page structure that avoids using |
4 | | // tesseract internal data structures. |
5 | | // Author: Ray Smith |
6 | | // |
7 | | // (C) Copyright 2010, Google Inc. |
8 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
9 | | // you may not use this file except in compliance with the License. |
10 | | // You may obtain a copy of the License at |
11 | | // http://www.apache.org/licenses/LICENSE-2.0 |
12 | | // Unless required by applicable law or agreed to in writing, software |
13 | | // distributed under the License is distributed on an "AS IS" BASIS, |
14 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | // See the License for the specific language governing permissions and |
16 | | // limitations under the License. |
17 | | |
18 | | #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_ |
19 | | #define TESSERACT_CCMAIN_PAGEITERATOR_H_ |
20 | | |
21 | | #include "export.h" |
22 | | #include "publictypes.h" |
23 | | |
24 | | struct Pix; |
25 | | struct Pta; |
26 | | |
27 | | namespace tesseract { |
28 | | |
29 | | struct BlamerBundle; |
30 | | class C_BLOB_IT; |
31 | | class PAGE_RES; |
32 | | class PAGE_RES_IT; |
33 | | class WERD; |
34 | | |
35 | | class Tesseract; |
36 | | |
37 | | /** |
38 | | * Class to iterate over tesseract page structure, providing access to all |
39 | | * levels of the page hierarchy, without including any tesseract headers or |
40 | | * having to handle any tesseract structures. |
41 | | * WARNING! This class points to data held within the TessBaseAPI class, and |
42 | | * therefore can only be used while the TessBaseAPI class still exists and |
43 | | * has not been subjected to a call of Init, SetImage, Recognize, Clear, End, |
44 | | * DetectOS, or anything else that changes the internal PAGE_RES. |
45 | | * See tesseract/publictypes.h for the definition of PageIteratorLevel. |
46 | | * See also ResultIterator, derived from PageIterator, which adds in the |
47 | | * ability to access OCR output with text-specific methods. |
48 | | */ |
49 | | |
50 | | class TESS_API PageIterator { |
51 | | public: |
52 | | /** |
53 | | * page_res and tesseract come directly from the BaseAPI. |
54 | | * The rectangle parameters are copied indirectly from the Thresholder, |
55 | | * via the BaseAPI. They represent the coordinates of some rectangle in an |
56 | | * original image (in top-left-origin coordinates) and therefore the top-left |
57 | | * needs to be added to any output boxes in order to specify coordinates |
58 | | * in the original image. See TessBaseAPI::SetRectangle. |
59 | | * The scale and scaled_yres are in case the Thresholder scaled the image |
60 | | * rectangle prior to thresholding. Any coordinates in tesseract's image |
61 | | * must be divided by scale before adding (rect_left, rect_top). |
62 | | * The scaled_yres indicates the effective resolution of the binary image |
63 | | * that tesseract has been given by the Thresholder. |
64 | | * After the constructor, Begin has already been called. |
65 | | */ |
66 | | PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, |
67 | | int scaled_yres, int rect_left, int rect_top, int rect_width, |
68 | | int rect_height); |
69 | | virtual ~PageIterator(); |
70 | | |
71 | | /** |
72 | | * Page/ResultIterators may be copied! This makes it possible to iterate over |
73 | | * all the objects at a lower level, while maintaining an iterator to |
74 | | * objects at a higher level. These constructors DO NOT CALL Begin, so |
75 | | * iterations will continue from the location of src. |
76 | | */ |
77 | | PageIterator(const PageIterator &src); |
78 | | const PageIterator &operator=(const PageIterator &src); |
79 | | |
80 | | /** Are we positioned at the same location as other? */ |
81 | | bool PositionedAtSameWord(const PAGE_RES_IT *other) const; |
82 | | |
83 | | // ============= Moving around within the page ============. |
84 | | |
85 | | /** |
86 | | * Moves the iterator to point to the start of the page to begin an |
87 | | * iteration. |
88 | | */ |
89 | | virtual void Begin(); |
90 | | |
91 | | /** |
92 | | * Moves the iterator to the beginning of the paragraph. |
93 | | * This class implements this functionality by moving it to the zero indexed |
94 | | * blob of the first (leftmost) word on the first row of the paragraph. |
95 | | */ |
96 | | virtual void RestartParagraph(); |
97 | | |
98 | | /** |
99 | | * Return whether this iterator points anywhere in the first textline of a |
100 | | * paragraph. |
101 | | */ |
102 | | bool IsWithinFirstTextlineOfParagraph() const; |
103 | | |
104 | | /** |
105 | | * Moves the iterator to the beginning of the text line. |
106 | | * This class implements this functionality by moving it to the zero indexed |
107 | | * blob of the first (leftmost) word of the row. |
108 | | */ |
109 | | virtual void RestartRow(); |
110 | | |
111 | | /** |
112 | | * Moves to the start of the next object at the given level in the |
113 | | * page hierarchy, and returns false if the end of the page was reached. |
114 | | * NOTE that RIL_SYMBOL will skip non-text blocks, but all other |
115 | | * PageIteratorLevel level values will visit each non-text block once. |
116 | | * Think of non text blocks as containing a single para, with a single line, |
117 | | * with a single imaginary word. |
118 | | * Calls to Next with different levels may be freely intermixed. |
119 | | * This function iterates words in right-to-left scripts correctly, if |
120 | | * the appropriate language has been loaded into Tesseract. |
121 | | */ |
122 | | virtual bool Next(PageIteratorLevel level); |
123 | | |
124 | | /** |
125 | | * Returns true if the iterator is at the start of an object at the given |
126 | | * level. |
127 | | * |
128 | | * For instance, suppose an iterator it is pointed to the first symbol of the |
129 | | * first word of the third line of the second paragraph of the first block in |
130 | | * a page, then: |
131 | | * it.IsAtBeginningOf(RIL_BLOCK) = false |
132 | | * it.IsAtBeginningOf(RIL_PARA) = false |
133 | | * it.IsAtBeginningOf(RIL_TEXTLINE) = true |
134 | | * it.IsAtBeginningOf(RIL_WORD) = true |
135 | | * it.IsAtBeginningOf(RIL_SYMBOL) = true |
136 | | */ |
137 | | virtual bool IsAtBeginningOf(PageIteratorLevel level) const; |
138 | | |
139 | | /** |
140 | | * Returns whether the iterator is positioned at the last element in a |
141 | | * given level. (e.g. the last word in a line, the last line in a block) |
142 | | * |
143 | | * Here's some two-paragraph example |
144 | | * text. It starts off innocuously |
145 | | * enough but quickly turns bizarre. |
146 | | * The author inserts a cornucopia |
147 | | * of words to guard against confused |
148 | | * references. |
149 | | * |
150 | | * Now take an iterator it pointed to the start of "bizarre." |
151 | | * it.IsAtFinalElement(RIL_PARA, RIL_SYMBOL) = false |
152 | | * it.IsAtFinalElement(RIL_PARA, RIL_WORD) = true |
153 | | * it.IsAtFinalElement(RIL_BLOCK, RIL_WORD) = false |
154 | | */ |
155 | | virtual bool IsAtFinalElement(PageIteratorLevel level, |
156 | | PageIteratorLevel element) const; |
157 | | |
158 | | /** |
159 | | * Returns whether this iterator is positioned |
160 | | * before other: -1 |
161 | | * equal to other: 0 |
162 | | * after other: 1 |
163 | | */ |
164 | | int Cmp(const PageIterator &other) const; |
165 | | |
166 | | // ============= Accessing data ==============. |
167 | | // Coordinate system: |
168 | | // Integer coordinates are at the cracks between the pixels. |
169 | | // The top-left corner of the top-left pixel in the image is at (0,0). |
170 | | // The bottom-right corner of the bottom-right pixel in the image is at |
171 | | // (width, height). |
172 | | // Every bounding box goes from the top-left of the top-left contained |
173 | | // pixel to the bottom-right of the bottom-right contained pixel, so |
174 | | // the bounding box of the single top-left pixel in the image is: |
175 | | // (0,0)->(1,1). |
176 | | // If an image rectangle has been set in the API, then returned coordinates |
177 | | // relate to the original (full) image, rather than the rectangle. |
178 | | |
179 | | /** |
180 | | * Controls what to include in a bounding box. Bounding boxes of all levels |
181 | | * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. |
182 | | * Between layout analysis and recognition, it isn't known where all |
183 | | * diacritics belong, so this control is used to include or exclude some |
184 | | * diacritics that are above or below the main body of the word. In most cases |
185 | | * where the placement is obvious, and after recognition, it doesn't make as |
186 | | * much difference, as the diacritics will already be included in the word. |
187 | | */ |
188 | | void SetBoundingBoxComponents(bool include_upper_dots, |
189 | 0 | bool include_lower_dots) { |
190 | 0 | include_upper_dots_ = include_upper_dots; |
191 | 0 | include_lower_dots_ = include_lower_dots; |
192 | 0 | } |
193 | | |
194 | | /** |
195 | | * Returns the bounding rectangle of the current object at the given level. |
196 | | * See comment on coordinate system above. |
197 | | * Returns false if there is no such object at the current position. |
198 | | * The returned bounding box is guaranteed to match the size and position |
199 | | * of the image returned by GetBinaryImage, but may clip foreground pixels |
200 | | * from a grey image. The padding argument to GetImage can be used to expand |
201 | | * the image to include more foreground pixels. See GetImage below. |
202 | | */ |
203 | | bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, |
204 | | int *bottom) const; |
205 | | bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top, |
206 | | int *right, int *bottom) const; |
207 | | /** |
208 | | * Returns the bounding rectangle of the object in a coordinate system of the |
209 | | * working image rectangle having its origin at (rect_left_, rect_top_) with |
210 | | * respect to the original image and is scaled by a factor scale_. |
211 | | */ |
212 | | bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, |
213 | | int *right, int *bottom) const; |
214 | | |
215 | | /** Returns whether there is no object of a given level. */ |
216 | | bool Empty(PageIteratorLevel level) const; |
217 | | |
218 | | /** |
219 | | * Returns the type of the current block. |
220 | | * See tesseract/publictypes.h for PolyBlockType. |
221 | | */ |
222 | | PolyBlockType BlockType() const; |
223 | | |
224 | | /** |
225 | | * Returns the polygon outline of the current block. The returned Pta must |
226 | | * be ptaDestroy-ed after use. Note that the returned Pta lists the vertices |
227 | | * of the polygon, and the last edge is the line segment between the last |
228 | | * point and the first point. nullptr will be returned if the iterator is |
229 | | * at the end of the document or layout analysis was not used. |
230 | | */ |
231 | | Pta *BlockPolygon() const; |
232 | | |
233 | | /** |
234 | | * Returns a binary image of the current object at the given level. |
235 | | * The position and size match the return from BoundingBoxInternal, and so |
236 | | * this could be upscaled with respect to the original input image. |
237 | | * Use pixDestroy to delete the image after use. |
238 | | */ |
239 | | Pix *GetBinaryImage(PageIteratorLevel level) const; |
240 | | |
241 | | /** |
242 | | * Returns an image of the current object at the given level in greyscale |
243 | | * if available in the input. To guarantee a binary image use GetBinaryImage. |
244 | | * NOTE that in order to give the best possible image, the bounds are |
245 | | * expanded slightly over the binary connected component, by the supplied |
246 | | * padding, so the top-left position of the returned image is returned |
247 | | * in (left,top). These will most likely not match the coordinates |
248 | | * returned by BoundingBox. |
249 | | * If you do not supply an original image, you will get a binary one. |
250 | | * Use pixDestroy to delete the image after use. |
251 | | */ |
252 | | Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img, |
253 | | int *left, int *top) const; |
254 | | |
255 | | /** |
256 | | * Returns the baseline of the current object at the given level. |
257 | | * The baseline is the line that passes through (x1, y1) and (x2, y2). |
258 | | * WARNING: with vertical text, baselines may be vertical! |
259 | | * Returns false if there is no baseline at the current position. |
260 | | */ |
261 | | bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, |
262 | | int *y2) const; |
263 | | |
264 | | // Returns the attributes of the current row via the three output pointers. |
265 | | void RowAttributes(float *row_height, float *descenders, |
266 | | float *ascenders) const; |
267 | | |
268 | | /** |
269 | | * Returns orientation for the block the iterator points to. |
270 | | * orientation, writing_direction, textline_order: see publictypes.h |
271 | | * deskew_angle: after rotating the block so the text orientation is |
272 | | * upright, how many radians does one have to rotate the |
273 | | * block anti-clockwise for it to be level? |
274 | | * -Pi/4 <= deskew_angle <= Pi/4 |
275 | | */ |
276 | | void Orientation(tesseract::Orientation *orientation, |
277 | | tesseract::WritingDirection *writing_direction, |
278 | | tesseract::TextlineOrder *textline_order, |
279 | | float *deskew_angle) const; |
280 | | |
281 | | /** |
282 | | * Returns information about the current paragraph, if available. |
283 | | * |
284 | | * justification - |
285 | | * LEFT if ragged right, or fully justified and script is left-to-right. |
286 | | * RIGHT if ragged left, or fully justified and script is right-to-left. |
287 | | * unknown if it looks like source code or we have very few lines. |
288 | | * is_list_item - |
289 | | * true if we believe this is a member of an ordered or unordered list. |
290 | | * is_crown - |
291 | | * true if the first line of the paragraph is aligned with the other |
292 | | * lines of the paragraph even though subsequent paragraphs have first |
293 | | * line indents. This typically indicates that this is the continuation |
294 | | * of a previous paragraph or that it is the very first paragraph in |
295 | | * the chapter. |
296 | | * first_line_indent - |
297 | | * For LEFT aligned paragraphs, the first text line of paragraphs of |
298 | | * this kind are indented this many pixels from the left edge of the |
299 | | * rest of the paragraph. |
300 | | * For RIGHT aligned paragraphs, the first text line of paragraphs of |
301 | | * this kind are indented this many pixels from the right edge of the |
302 | | * rest of the paragraph. |
303 | | * NOTE 1: This value may be negative. |
304 | | * NOTE 2: if *is_crown == true, the first line of this paragraph is |
305 | | * actually flush, and first_line_indent is set to the "common" |
306 | | * first_line_indent for subsequent paragraphs in this block |
307 | | * of text. |
308 | | */ |
309 | | void ParagraphInfo(tesseract::ParagraphJustification *justification, |
310 | | bool *is_list_item, bool *is_crown, |
311 | | int *first_line_indent) const; |
312 | | |
313 | | // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle |
314 | | // of the current word to the given pointer (takes ownership of the pointer) |
315 | | // and returns true. |
316 | | // Can only be used when iterating on the word level. |
317 | | bool SetWordBlamerBundle(BlamerBundle *blamer_bundle); |
318 | | |
319 | | protected: |
320 | | /** |
321 | | * Sets up the internal data for iterating the blobs of a new word, then |
322 | | * moves the iterator to the given offset. |
323 | | */ |
324 | | void BeginWord(int offset); |
325 | | |
326 | | /** Pointer to the page_res owned by the API. */ |
327 | | PAGE_RES *page_res_; |
328 | | /** Pointer to the Tesseract object owned by the API. */ |
329 | | Tesseract *tesseract_; |
330 | | /** |
331 | | * The iterator to the page_res_. Owned by this PageIterator. |
332 | | * A pointer just to avoid dragging in Tesseract includes. |
333 | | */ |
334 | | PAGE_RES_IT *it_; |
335 | | /** |
336 | | * The current input WERD being iterated. If there is an output from OCR, |
337 | | * then word_ is nullptr. Owned by the API. |
338 | | */ |
339 | | WERD *word_; |
340 | | /** The length of the current word_. */ |
341 | | int word_length_; |
342 | | /** The current blob index within the word. */ |
343 | | int blob_index_; |
344 | | /** |
345 | | * Iterator to the blobs within the word. If nullptr, then we are iterating |
346 | | * OCR results in the box_word. |
347 | | * Owned by this PageIterator. |
348 | | */ |
349 | | C_BLOB_IT *cblob_it_; |
350 | | /** Control over what to include in bounding boxes; see SetBoundingBoxComponents. */ |
351 | | bool include_upper_dots_; |
352 | | bool include_lower_dots_; |
353 | | /** Parameters saved from the Thresholder. Needed to rebuild coordinates. */ |
354 | | int scale_; |
355 | | int scaled_yres_; |
356 | | int rect_left_; |
357 | | int rect_top_; |
358 | | int rect_width_; |
359 | | int rect_height_; |
360 | | }; |
361 | | |
362 | | } // namespace tesseract. |
363 | | |
364 | | #endif // TESSERACT_CCMAIN_PAGEITERATOR_H_ |