/src/tesseract/src/ccmain/paragraphs_internal.h
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: paragraphs_internal.h |
3 | | * Description: Paragraph Detection internal data structures. |
4 | | * Author: David Eger |
5 | | * |
6 | | * (C) Copyright 2011, Google Inc. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_ |
20 | | #define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_ |
21 | | |
22 | | #include <tesseract/publictypes.h> // for ParagraphJustification |
23 | | #include "paragraphs.h" |
24 | | |
25 | | // NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS |
26 | | // DATA STRUCTURES OR FUNCTIONS IN THIS FILE. |
27 | | |
28 | | namespace tesseract { |
29 | | |
30 | | class UNICHARSET; |
31 | | class WERD_CHOICE; |
32 | | |
33 | | // Return whether the given word is likely to be a list item start word. |
34 | | TESS_API |
35 | | bool AsciiLikelyListItem(const std::string &word); |
36 | | |
37 | | // Set right word attributes given either a unicharset and werd or a utf8 |
38 | | // string. |
39 | | TESS_API |
40 | | void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, |
41 | | bool *is_list, bool *starts_idea, bool *ends_idea); |
42 | | |
43 | | // Set left word attributes given either a unicharset and werd or a utf8 string. |
44 | | TESS_API |
45 | | void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, |
46 | | bool *is_list, bool *starts_idea, bool *ends_idea); |
47 | | |
// How a text line relates to a paragraph. Every enumerator carries an
// explicit character value ('S', 'C', 'U', 'M').
enum LineType {
  LT_START = 'S',    // The line opens a paragraph.
  LT_BODY = 'C',     // The line continues a paragraph.
  LT_UNKNOWN = 'U',  // Nothing is known about the line.
  LT_MULTIPLE = 'M', // The line matches both LT_START and LT_BODY.
};
54 | | |
55 | | // The first paragraph in a page of body text is often un-indented. |
56 | | // This is a typographic convention which is common to indicate either that: |
57 | | // (1) The paragraph is the continuation of a previous paragraph, or |
58 | | // (2) The paragraph is the first paragraph in a chapter. |
59 | | // |
60 | | // I refer to such paragraphs as "crown"s, and the output of the paragraph |
61 | | // detection algorithm attempts to give them the same paragraph model as |
62 | | // the rest of the body text. |
63 | | // |
64 | | // Nonetheless, while building hypotheses, it is useful to mark the lines |
65 | | // of crown paragraphs temporarily as crowns, either aligned left or right. |
66 | | extern const ParagraphModel *kCrownLeft; |
67 | | extern const ParagraphModel *kCrownRight; |
68 | | |
69 | 234k | inline bool StrongModel(const ParagraphModel *model) { |
70 | 234k | return model != nullptr && model != kCrownLeft && model != kCrownRight; |
71 | 234k | } |
72 | | |
73 | | struct LineHypothesis { |
74 | 0 | LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {} |
75 | 82.6k | LineHypothesis(LineType line_type, const ParagraphModel *m) : ty(line_type), model(m) {} |
76 | | LineHypothesis(const LineHypothesis &other) = default; |
77 | | |
78 | | // Copy assignment operator. |
79 | | LineHypothesis &operator=(const LineHypothesis &other) = default; |
80 | | |
81 | 37.6k | bool operator==(const LineHypothesis &other) const { |
82 | 37.6k | return ty == other.ty && model == other.model; |
83 | 37.6k | } |
84 | | |
85 | | LineType ty; |
86 | | const ParagraphModel *model; |
87 | | }; |
88 | | |
89 | | class ParagraphTheory; // Forward Declaration |
90 | | |
91 | | using SetOfModels = std::vector<const ParagraphModel *>; |
92 | | |
93 | | // Row Scratch Registers are data generated by the paragraph detection |
94 | | // algorithm based on a RowInfo input. |
95 | | class RowScratchRegisters { |
96 | | public: |
97 | | // We presume row will outlive us. |
98 | | void Init(const RowInfo &row); |
99 | | |
100 | | LineType GetLineType() const; |
101 | | |
102 | | LineType GetLineType(const ParagraphModel *model) const; |
103 | | |
104 | | // Mark this as a start line type, sans model. This is useful for the |
105 | | // initial marking of probable body lines or paragraph start lines. |
106 | | void SetStartLine(); |
107 | | |
108 | | // Mark this as a body line type, sans model. This is useful for the |
109 | | // initial marking of probably body lines or paragraph start lines. |
110 | | void SetBodyLine(); |
111 | | |
112 | | // Record that this row fits as a paragraph start line in the given model, |
113 | | void AddStartLine(const ParagraphModel *model); |
114 | | // Record that this row fits as a paragraph body line in the given model, |
115 | | void AddBodyLine(const ParagraphModel *model); |
116 | | |
117 | | // Clear all hypotheses about this line. |
118 | 141k | void SetUnknown() { |
119 | 141k | hypotheses_.clear(); |
120 | 141k | } |
121 | | |
122 | | // Append all hypotheses of strong models that match this row as a start. |
123 | | void StartHypotheses(SetOfModels *models) const; |
124 | | |
125 | | // Append all hypotheses of strong models matching this row. |
126 | | void StrongHypotheses(SetOfModels *models) const; |
127 | | |
128 | | // Append all hypotheses for this row. |
129 | | void NonNullHypotheses(SetOfModels *models) const; |
130 | | |
131 | | // Discard any hypotheses whose model is not in the given list. |
132 | | void DiscardNonMatchingHypotheses(const SetOfModels &models); |
133 | | |
134 | | // If we have only one hypothesis and that is that this line is a paragraph |
135 | | // start line of a certain model, return that model. Else return nullptr. |
136 | | const ParagraphModel *UniqueStartHypothesis() const; |
137 | | |
138 | | // If we have only one hypothesis and that is that this line is a paragraph |
139 | | // body line of a certain model, return that model. Else return nullptr. |
140 | | const ParagraphModel *UniqueBodyHypothesis() const; |
141 | | |
142 | | // Return the indentation for the side opposite of the aligned side. |
143 | 170k | int OffsideIndent(tesseract::ParagraphJustification just) const { |
144 | 170k | switch (just) { |
145 | 40.7k | case tesseract::JUSTIFICATION_RIGHT: |
146 | 40.7k | return lindent_; |
147 | 129k | case tesseract::JUSTIFICATION_LEFT: |
148 | 129k | return rindent_; |
149 | 0 | default: |
150 | 0 | return lindent_ > rindent_ ? lindent_ : rindent_; |
151 | 170k | } |
152 | 170k | } |
153 | | |
154 | | // Return the indentation for the side the text is aligned to. |
155 | 2.81k | int AlignsideIndent(tesseract::ParagraphJustification just) const { |
156 | 2.81k | switch (just) { |
157 | 341 | case tesseract::JUSTIFICATION_RIGHT: |
158 | 341 | return rindent_; |
159 | 2.47k | case tesseract::JUSTIFICATION_LEFT: |
160 | 2.47k | return lindent_; |
161 | 0 | default: |
162 | 0 | return lindent_ > rindent_ ? lindent_ : rindent_; |
163 | 2.81k | } |
164 | 2.81k | } |
165 | | |
166 | | // Append header fields to a vector of row headings. |
167 | | static void AppendDebugHeaderFields(std::vector<std::string> &header); |
168 | | |
169 | | // Append data for this row to a vector of debug strings. |
170 | | void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const; |
171 | | |
172 | | const RowInfo *ri_; |
173 | | |
174 | | // These four constants form a horizontal box model for the white space |
175 | | // on the edges of each line. At each point in the algorithm, the following |
176 | | // shall hold: |
177 | | // ri_->pix_ldistance = lmargin_ + lindent_ |
178 | | // ri_->pix_rdistance = rindent_ + rmargin_ |
179 | | int lmargin_; |
180 | | int lindent_; |
181 | | int rindent_; |
182 | | int rmargin_; |
183 | | |
184 | | private: |
185 | | // Hypotheses of either LT_START or LT_BODY |
186 | | std::vector<LineHypothesis> hypotheses_; |
187 | | }; |
188 | | |
189 | | // A collection of convenience functions for wrapping the set of |
190 | | // Paragraph Models we believe correctly model the paragraphs in the image. |
191 | | class ParagraphTheory { |
192 | | public: |
193 | | // We presume models will outlive us, and that models will take ownership |
194 | | // of any ParagraphModel *'s we add. |
195 | 6.87k | explicit ParagraphTheory(std::vector<ParagraphModel *> *models) : models_(models) {} |
196 | 0 | std::vector<ParagraphModel *> &models() { |
197 | 0 | return *models_; |
198 | 0 | } |
199 | 0 | const std::vector<ParagraphModel *> &models() const { |
200 | 0 | return *models_; |
201 | 0 | } |
202 | | |
203 | | // Return an existing model if one that is Comparable() can be found. |
204 | | // Else, allocate a new copy of model to save and return a pointer to it. |
205 | | const ParagraphModel *AddModel(const ParagraphModel &model); |
206 | | |
207 | | // Discard any models we've made that are not in the list of used models. |
208 | | void DiscardUnusedModels(const SetOfModels &used_models); |
209 | | |
210 | | // Return the set of all non-centered models. |
211 | | void NonCenteredModels(SetOfModels *models); |
212 | | |
213 | | // If any of the non-centered paragraph models we know about fit |
214 | | // rows[start, end), return it. Else nullptr. |
215 | | const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start, |
216 | | int end) const; |
217 | | |
218 | | int IndexOf(const ParagraphModel *model) const; |
219 | | |
220 | | private: |
221 | | std::vector<ParagraphModel *> *models_; |
222 | | std::vector<ParagraphModel *> models_we_added_; |
223 | | }; |
224 | | |
225 | | bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row, |
226 | | const ParagraphModel *model); |
227 | | bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row, |
228 | | const ParagraphModel *model); |
229 | | bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b, |
230 | | const ParagraphModel *model); |
231 | | |
232 | | // A class for smearing Paragraph Model hypotheses to surrounding rows. |
233 | | // The idea here is that StrongEvidenceClassify first marks only exceedingly |
234 | | // obvious start and body rows and constructs models of them. Thereafter, |
235 | | // we may have left over unmarked lines (mostly end-of-paragraph lines) which |
236 | | // were too short to have much confidence about, but which fit the models we've |
237 | | // constructed perfectly and which we ought to mark. This class is used to |
238 | | // "smear" our models over the text. |
239 | | class ParagraphModelSmearer { |
240 | | public: |
241 | | ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end, |
242 | | ParagraphTheory *theory); |
243 | | |
244 | | // Smear forward paragraph models from existing row markings to subsequent |
245 | | // text lines if they fit, and mark any thereafter still unmodeled rows |
246 | | // with any model in the theory that fits them. |
247 | | void Smear(); |
248 | | |
249 | | private: |
250 | | // Record in open_models_ for rows [start_row, end_row) the list of models |
251 | | // currently open at each row. |
252 | | // A model is still open in a row if some previous row has said model as a |
253 | | // start hypothesis, and all rows since (including this row) would fit as |
254 | | // either a body or start line in that model. |
255 | | void CalculateOpenModels(int row_start, int row_end); |
256 | | |
257 | 341k | SetOfModels &OpenModels(int row) { |
258 | 341k | return open_models_[row - row_start_ + 1]; |
259 | 341k | } |
260 | | |
261 | | ParagraphTheory *theory_; |
262 | | std::vector<RowScratchRegisters> *rows_; |
263 | | int row_start_; |
264 | | int row_end_; |
265 | | |
266 | | // open_models_ corresponds to rows[start_row_ - 1, end_row_] |
267 | | // |
268 | | // open_models_: Contains models which there was an active (open) paragraph |
269 | | // as of the previous line and for which the left and right |
270 | | // indents admit the possibility that this text line continues |
271 | | // to fit the same model. |
272 | | // TODO(eger): Think about whether we can get rid of "Open" models and just |
273 | | // use the current hypotheses on RowScratchRegisters. |
274 | | std::vector<SetOfModels> open_models_; |
275 | | }; |
276 | | |
277 | | // Clear all hypotheses about lines [start, end) and reset the margins to the |
278 | | // percentile (0..100) value of the left and right row edges for this run of |
279 | | // rows. |
280 | | void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start, |
281 | | int end, int percentile); |
282 | | |
283 | | // Return the median inter-word space in rows[row_start, row_end). |
284 | | int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end); |
285 | | |
286 | | // Return whether the first word on the after line can fit in the space at |
287 | | // the end of the before line (knowing which way the text is aligned and read). |
288 | | bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, |
289 | | tesseract::ParagraphJustification justification); |
290 | | |
291 | | // Return whether the first word on the after line can fit in the space at |
292 | | // the end of the before line (not knowing the text alignment). |
293 | | bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after); |
294 | | |
295 | | // Do rows[start, end) form a single instance of the given paragraph model? |
296 | | bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end, |
297 | | const ParagraphModel *model); |
298 | | |
299 | | // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known), |
300 | | // normalize each row_owner to point to an actual PARA, and output the |
301 | | // paragraphs in order onto paragraphs. |
302 | | void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs); |
303 | | |
304 | | } // namespace tesseract |
305 | | |
306 | | #endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_ |