Coverage Report

Created: 2025-06-13 07:02

/src/tesseract/src/ccmain/paragraphs_internal.h
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        paragraphs_internal.h
3
 * Description: Paragraph Detection internal data structures.
4
 * Author:      David Eger
5
 *
6
 * (C) Copyright 2011, Google Inc.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
20
#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
21
22
#include <tesseract/publictypes.h> // for ParagraphJustification
23
#include "paragraphs.h"
24
25
// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
26
// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
27
28
namespace tesseract {
29
30
class UNICHARSET;
31
class WERD_CHOICE;
32
33
// Return whether the given word is likely to be a list item start word.
34
TESS_API
35
bool AsciiLikelyListItem(const std::string &word);
36
37
// Set right word attributes given either a unicharset and werd or a utf8
38
// string.
39
TESS_API
40
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
41
                         bool *is_list, bool *starts_idea, bool *ends_idea);
42
43
// Set left word attributes given either a unicharset and werd or a utf8 string.
44
TESS_API
45
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
46
                        bool *is_list, bool *starts_idea, bool *ends_idea);
47
48
// Classification of a text line with respect to paragraph structure.
// Every enumerator is given the value of a single ASCII character.
enum LineType {
  // First line of a paragraph.
  LT_START = 'S',
  // Continuation line of a paragraph.
  LT_BODY = 'C',
  // No clues.
  LT_UNKNOWN = 'U',
  // Matches for both LT_START and LT_BODY.
  LT_MULTIPLE = 'M',
};
54
55
// The first paragraph in a page of body text is often un-indented.
56
// This is a typographic convention which is common to indicate either that:
57
// (1) The paragraph is the continuation of a previous paragraph, or
58
// (2) The paragraph is the first paragraph in a chapter.
59
//
60
// I refer to such paragraphs as "crown"s, and the output of the paragraph
61
// detection algorithm attempts to give them the same paragraph model as
62
// the rest of the body text.
63
//
64
// Nonetheless, while building hypotheses, it is useful to mark the lines
65
// of crown paragraphs temporarily as crowns, either aligned left or right.
66
extern const ParagraphModel *kCrownLeft;
67
extern const ParagraphModel *kCrownRight;
68
69
234k
inline bool StrongModel(const ParagraphModel *model) {
70
234k
  return model != nullptr && model != kCrownLeft && model != kCrownRight;
71
234k
}
72
73
struct LineHypothesis {
74
0
  LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}
75
82.6k
  LineHypothesis(LineType line_type, const ParagraphModel *m) : ty(line_type), model(m) {}
76
  LineHypothesis(const LineHypothesis &other) = default;
77
78
  // Copy assignment operator.
79
  LineHypothesis &operator=(const LineHypothesis &other) = default;
80
81
37.6k
  bool operator==(const LineHypothesis &other) const {
82
37.6k
    return ty == other.ty && model == other.model;
83
37.6k
  }
84
85
  LineType ty;
86
  const ParagraphModel *model;
87
};
88
89
class ParagraphTheory; // Forward Declaration
90
91
using SetOfModels = std::vector<const ParagraphModel *>;
92
93
// Row Scratch Registers are data generated by the paragraph detection
94
// algorithm based on a RowInfo input.
95
class RowScratchRegisters {
96
public:
97
  // We presume row will outlive us.
98
  void Init(const RowInfo &row);
99
100
  LineType GetLineType() const;
101
102
  LineType GetLineType(const ParagraphModel *model) const;
103
104
  // Mark this as a start line type, sans model.  This is useful for the
105
  // initial marking of probable body lines or paragraph start lines.
106
  void SetStartLine();
107
108
  // Mark this as a body line type, sans model.  This is useful for the
109
  // initial marking of probably body lines or paragraph start lines.
110
  void SetBodyLine();
111
112
  // Record that this row fits as a paragraph start line in the given model,
113
  void AddStartLine(const ParagraphModel *model);
114
  // Record that this row fits as a paragraph body line in the given model,
115
  void AddBodyLine(const ParagraphModel *model);
116
117
  // Clear all hypotheses about this line.
118
141k
  void SetUnknown() {
119
141k
    hypotheses_.clear();
120
141k
  }
121
122
  // Append all hypotheses of strong models that match this row as a start.
123
  void StartHypotheses(SetOfModels *models) const;
124
125
  // Append all hypotheses of strong models matching this row.
126
  void StrongHypotheses(SetOfModels *models) const;
127
128
  // Append all hypotheses for this row.
129
  void NonNullHypotheses(SetOfModels *models) const;
130
131
  // Discard any hypotheses whose model is not in the given list.
132
  void DiscardNonMatchingHypotheses(const SetOfModels &models);
133
134
  // If we have only one hypothesis and that is that this line is a paragraph
135
  // start line of a certain model, return that model.  Else return nullptr.
136
  const ParagraphModel *UniqueStartHypothesis() const;
137
138
  // If we have only one hypothesis and that is that this line is a paragraph
139
  // body line of a certain model, return that model.  Else return nullptr.
140
  const ParagraphModel *UniqueBodyHypothesis() const;
141
142
  // Return the indentation for the side opposite of the aligned side.
143
170k
  int OffsideIndent(tesseract::ParagraphJustification just) const {
144
170k
    switch (just) {
145
40.7k
      case tesseract::JUSTIFICATION_RIGHT:
146
40.7k
        return lindent_;
147
129k
      case tesseract::JUSTIFICATION_LEFT:
148
129k
        return rindent_;
149
0
      default:
150
0
        return lindent_ > rindent_ ? lindent_ : rindent_;
151
170k
    }
152
170k
  }
153
154
  // Return the indentation for the side the text is aligned to.
155
2.81k
  int AlignsideIndent(tesseract::ParagraphJustification just) const {
156
2.81k
    switch (just) {
157
341
      case tesseract::JUSTIFICATION_RIGHT:
158
341
        return rindent_;
159
2.47k
      case tesseract::JUSTIFICATION_LEFT:
160
2.47k
        return lindent_;
161
0
      default:
162
0
        return lindent_ > rindent_ ? lindent_ : rindent_;
163
2.81k
    }
164
2.81k
  }
165
166
  // Append header fields to a vector of row headings.
167
  static void AppendDebugHeaderFields(std::vector<std::string> &header);
168
169
  // Append data for this row to a vector of debug strings.
170
  void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const;
171
172
  const RowInfo *ri_;
173
174
  // These four constants form a horizontal box model for the white space
175
  // on the edges of each line.  At each point in the algorithm, the following
176
  // shall hold:
177
  //   ri_->pix_ldistance = lmargin_ + lindent_
178
  //   ri_->pix_rdistance = rindent_ + rmargin_
179
  int lmargin_;
180
  int lindent_;
181
  int rindent_;
182
  int rmargin_;
183
184
private:
185
  // Hypotheses of either LT_START or LT_BODY
186
  std::vector<LineHypothesis> hypotheses_;
187
};
188
189
// A collection of convenience functions for wrapping the set of
190
// Paragraph Models we believe correctly model the paragraphs in the image.
191
class ParagraphTheory {
192
public:
193
  // We presume models will outlive us, and that models will take ownership
194
  // of any ParagraphModel *'s we add.
195
6.87k
  explicit ParagraphTheory(std::vector<ParagraphModel *> *models) : models_(models) {}
196
0
  std::vector<ParagraphModel *> &models() {
197
0
    return *models_;
198
0
  }
199
0
  const std::vector<ParagraphModel *> &models() const {
200
0
    return *models_;
201
0
  }
202
203
  // Return an existing model if one that is Comparable() can be found.
204
  // Else, allocate a new copy of model to save and return a pointer to it.
205
  const ParagraphModel *AddModel(const ParagraphModel &model);
206
207
  // Discard any models we've made that are not in the list of used models.
208
  void DiscardUnusedModels(const SetOfModels &used_models);
209
210
  // Return the set of all non-centered models.
211
  void NonCenteredModels(SetOfModels *models);
212
213
  // If any of the non-centered paragraph models we know about fit
214
  // rows[start, end), return it.  Else nullptr.
215
  const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
216
                             int end) const;
217
218
  int IndexOf(const ParagraphModel *model) const;
219
220
private:
221
  std::vector<ParagraphModel *> *models_;
222
  std::vector<ParagraphModel *> models_we_added_;
223
};
224
225
bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
226
                    const ParagraphModel *model);
227
bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
228
                   const ParagraphModel *model);
229
bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
230
                     const ParagraphModel *model);
231
232
// A class for smearing Paragraph Model hypotheses to surrounding rows.
233
// The idea here is that StrongEvidenceClassify first marks only exceedingly
234
// obvious start and body rows and constructs models of them.  Thereafter,
235
// we may have left over unmarked lines (mostly end-of-paragraph lines) which
236
// were too short to have much confidence about, but which fit the models we've
237
// constructed perfectly and which we ought to mark.  This class is used to
238
// "smear" our models over the text.
239
class ParagraphModelSmearer {
240
public:
241
  ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
242
                        ParagraphTheory *theory);
243
244
  // Smear forward paragraph models from existing row markings to subsequent
245
  // text lines if they fit, and mark any thereafter still unmodeled rows
246
  // with any model in the theory that fits them.
247
  void Smear();
248
249
private:
250
  // Record in open_models_ for rows [start_row, end_row) the list of models
251
  // currently open at each row.
252
  // A model is still open in a row if some previous row has said model as a
253
  // start hypothesis, and all rows since (including this row) would fit as
254
  // either a body or start line in that model.
255
  void CalculateOpenModels(int row_start, int row_end);
256
257
341k
  SetOfModels &OpenModels(int row) {
258
341k
    return open_models_[row - row_start_ + 1];
259
341k
  }
260
261
  ParagraphTheory *theory_;
262
  std::vector<RowScratchRegisters> *rows_;
263
  int row_start_;
264
  int row_end_;
265
266
  // open_models_ corresponds to rows[start_row_ - 1, end_row_]
267
  //
268
  // open_models_:  Contains models which there was an active (open) paragraph
269
  //                as of the previous line and for which the left and right
270
  //                indents admit the possibility that this text line continues
271
  //                to fit the same model.
272
  // TODO(eger): Think about whether we can get rid of "Open" models and just
273
  //   use the current hypotheses on RowScratchRegisters.
274
  std::vector<SetOfModels> open_models_;
275
};
276
277
// Clear all hypotheses about lines [start, end) and reset the margins to the
278
// percentile (0..100) value of the left and right row edges for this run of
279
// rows.
280
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
281
                                        int end, int percentile);
282
283
// Return the median inter-word space in rows[row_start, row_end).
284
int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);
285
286
// Return whether the first word on the after line can fit in the space at
287
// the end of the before line (knowing which way the text is aligned and read).
288
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,
289
                           tesseract::ParagraphJustification justification);
290
291
// Return whether the first word on the after line can fit in the space at
292
// the end of the before line (not knowing the text alignment).
293
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);
294
295
// Do rows[start, end) form a single instance of the given paragraph model?
296
bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
297
                  const ParagraphModel *model);
298
299
// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
300
// normalize each row_owner to point to an actual PARA, and output the
301
// paragraphs in order onto paragraphs.
302
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);
303
304
} // namespace tesseract
305
306
#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_