/src/tesseract/src/ccstruct/ocrpara.h
Line | Count | Source |
1 | | ///////////////////////////////////////////////////////////////////// |
2 | | // File: ocrpara.h |
3 | | // Description: OCR Paragraph Output Type |
4 | | // Author: David Eger |
5 | | // |
6 | | // (C) Copyright 2010, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | // |
17 | | /////////////////////////////////////////////////////////////////////// |
18 | | |
19 | | #ifndef TESSERACT_CCSTRUCT_OCRPARA_H_ |
20 | | #define TESSERACT_CCSTRUCT_OCRPARA_H_ |
21 | | |
#include "elst.h"

#include <tesseract/publictypes.h>

#include <cstdlib> // std::abs
#include <string>  // std::string
25 | | |
26 | | namespace tesseract { |
27 | | |
28 | | class ParagraphModel; |
29 | | |
30 | | struct PARA : public ELIST<PARA>::LINK { |
31 | | public: |
32 | | PARA() |
33 | 21.1k | : model(nullptr) |
34 | 21.1k | , is_list_item(false) |
35 | 21.1k | , is_very_first_or_continuation(false) |
36 | 21.1k | , has_drop_cap(false) {} |
37 | | |
38 | | // We do not own the model, we just reference it. |
39 | | // model may be nullptr if there is not a good model for this paragraph. |
40 | | const ParagraphModel *model; |
41 | | |
42 | | bool is_list_item; |
43 | | |
44 | | // The first paragraph on a page often lacks a first line indent, but should |
45 | | // still be modeled by the same model as other body text paragraphs on the |
46 | | // page. |
47 | | bool is_very_first_or_continuation; |
48 | | |
49 | | // Does this paragraph begin with a drop cap? |
50 | | bool has_drop_cap; |
51 | | }; |
52 | | |
// Macro invocation for PARA; the macro itself is not defined in this header.
// NOTE(review): presumably provided by elst.h and expected to declare the
// list/iterator helper types for PARA -- confirm against elst.h.
ELISTIZEH(PARA)
54 | | |
55 | | // A geometric model of paragraph indentation and alignment. |
56 | | // |
57 | | // Measurements are in pixels. The meaning of the integer arguments changes |
58 | | // depending upon the value of justification. Distances less than or equal |
59 | | // to tolerance apart we take as "equivalent" for the purpose of model |
60 | | // matching, and in the examples below, we assume tolerance is zero. |
61 | | // |
62 | | // justification = LEFT: |
63 | | // margin the "ignored" margin to the left block edge. |
64 | | // first_indent indent from the left margin to a typical first text line. |
65 | | // body_indent indent from the left margin of a typical body text line. |
66 | | // |
67 | | // justification = RIGHT: |
68 | | // margin the "ignored" margin to the right block edge. |
69 | | // first_indent indent from the right margin to a typical first text line. |
70 | | // body_indent indent from the right margin of a typical body text line. |
71 | | // |
72 | | // justification = CENTER: |
73 | | // margin ignored |
74 | | // first_indent ignored |
75 | | // body_indent ignored |
76 | | // |
77 | | // ====== Extended example, assuming each letter is ten pixels wide: ======= |
78 | | // |
// +--------------------------------+
// |            Awesome             |   ParagraphModel(CENTER, 0, 0, 0)
// |         Centered Title         |
// |      Paragraph Detection       |
// |            OCR TEAM            |
// |        10 November 2010        |
// |                                |
// |  Look here, I have a paragraph.|   ParagraphModel(LEFT, 0, 20, 0)
// |This paragraph starts at the top|
// |of the page and takes 3 lines.  |
// |  Here I have a second paragraph|   ParagraphModel(LEFT, 0, 20, 0)
// |which indicates that the first  |
// |paragraph is not a continuation |
// |from a previous page, as it is  |
// |indented just like this second  |
// |paragraph.                      |
// |   Here is a block quote. It    |   ParagraphModel(LEFT, 30, 0, 0)
// |   looks like the prior text    |
// |   but it is indented more      |
// |   and is fully justified.      |
// |  So how does one deal with     |   ParagraphModel(LEFT, 0, 20, 0)
// |centered text, block quotes,    |
// |normal paragraphs, and lists    |
// |like what follows?              |
// |1. Make a plan.                 |   ParagraphModel(LEFT, 0, 0, 30)
// |2. Use a heuristic, for example,|   ParagraphModel(LEFT, 0, 0, 30)
// |   looking for lines where the  |
// |   first word of the next line  |
// |   would fit on the previous    |
// |   line.                        |
// |8. Try to implement the plan in |   ParagraphModel(LEFT, 0, 0, 30)
// |   Python and try it out.       |
// |4. Determine how to fix the     |   ParagraphModel(LEFT, 0, 0, 30)
// |   mistakes.                    |
// |5. Repeat.                      |   ParagraphModel(LEFT, 0, 0, 30)
// |  For extra painful penalty work|   ParagraphModel(LEFT, 0, 20, 0)
// |you can try to identify source  |
// |code. Ouch!                     |
// +--------------------------------+
118 | | class TESS_API ParagraphModel { |
119 | | public: |
120 | | ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent, |
121 | | int body_indent, int tolerance) |
122 | 38.3k | : justification_(justification) |
123 | 38.3k | , margin_(margin) |
124 | 38.3k | , first_indent_(first_indent) |
125 | 38.3k | , body_indent_(body_indent) |
126 | 38.3k | , tolerance_(tolerance) { |
127 | | // Make one of {first_indent, body_indent} is 0. |
128 | 38.3k | int added_margin = first_indent; |
129 | 38.3k | if (body_indent < added_margin) { |
130 | 7.14k | added_margin = body_indent; |
131 | 7.14k | } |
132 | 38.3k | margin_ += added_margin; |
133 | 38.3k | first_indent_ -= added_margin; |
134 | 38.3k | body_indent_ -= added_margin; |
135 | 38.3k | } |
136 | | |
137 | | ParagraphModel() |
138 | 35.1k | : justification_(tesseract::JUSTIFICATION_UNKNOWN) |
139 | 35.1k | , margin_(0) |
140 | 35.1k | , first_indent_(0) |
141 | 35.1k | , body_indent_(0) |
142 | 35.1k | , tolerance_(0) {} |
143 | | |
144 | | // ValidFirstLine() and ValidBodyLine() take arguments describing a text line |
145 | | // in a block of text which we are trying to model: |
146 | | // lmargin, lindent: these add up to the distance from the leftmost ink |
147 | | // in the text line to the surrounding text block's left |
148 | | // edge. |
149 | | // rmargin, rindent: these add up to the distance from the rightmost ink |
150 | | // in the text line to the surrounding text block's right |
151 | | // edge. |
152 | | // The caller determines the division between "margin" and "indent", which |
153 | | // only actually affect whether we think the line may be centered. |
154 | | // |
155 | | // If the amount of whitespace matches the amount of whitespace expected on |
156 | | // the relevant side of the line (within tolerance_) we say it matches. |
157 | | |
158 | | // Return whether a given text line could be a first paragraph line according |
159 | | // to this paragraph model. |
160 | | bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const; |
161 | | |
162 | | // Return whether a given text line could be a first paragraph line according |
163 | | // to this paragraph model. |
164 | | bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const; |
165 | | |
166 | 85.7k | tesseract::ParagraphJustification justification() const { |
167 | 85.7k | return justification_; |
168 | 85.7k | } |
169 | 0 | int margin() const { |
170 | 0 | return margin_; |
171 | 0 | } |
172 | 3.76k | int first_indent() const { |
173 | 3.76k | return first_indent_; |
174 | 3.76k | } |
175 | 3.76k | int body_indent() const { |
176 | 3.76k | return body_indent_; |
177 | 3.76k | } |
178 | 3.76k | int tolerance() const { |
179 | 3.76k | return tolerance_; |
180 | 3.76k | } |
181 | 19.2k | bool is_flush() const { |
182 | 19.2k | return (justification_ == tesseract::JUSTIFICATION_LEFT || |
183 | 17.7k | justification_ == tesseract::JUSTIFICATION_RIGHT) && |
184 | 19.1k | abs(first_indent_ - body_indent_) <= tolerance_; |
185 | 19.2k | } |
186 | | |
187 | | // Return whether this model is likely to agree with the other model on most |
188 | | // paragraphs they are marked. |
189 | | bool Comparable(const ParagraphModel &other) const; |
190 | | |
191 | | std::string ToString() const; |
192 | | |
193 | | private: |
194 | | tesseract::ParagraphJustification justification_; |
195 | | int margin_; |
196 | | int first_indent_; |
197 | | int body_indent_; |
198 | | int tolerance_; |
199 | | }; |
200 | | |
201 | | } // namespace tesseract |
202 | | |
203 | | #endif // TESSERACT_CCSTRUCT_OCRPARA_H_ |