/src/tesseract/src/ccmain/paragraphs_internal.h

Source (jump to first uncovered line)
/**********************************************************************
 * File:        paragraphs_internal.h
 * Description: Paragraph Detection internal data structures.
 * Author:      David Eger
 *
 * (C) Copyright 2011, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_

#include <tesseract/publictypes.h> // for ParagraphJustification
#include "paragraphs.h"

// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.

namespace tesseract {

class UNICHARSET;
class WERD_CHOICE;

// Return whether the given word is likely to be a list item start word.
// NOTE(review): carries TESS_API, presumably so that the unit tests (the only
// outside users this header allows, per the note above) can link against it.
TESS_API
bool AsciiLikelyListItem(const std::string &word);

// Set right word attributes given either a unicharset and werd or a utf8
// string.
// The attributes are handed back through the three bool pointers is_list,
// starts_idea and ends_idea.
// NOTE(review): which input wins when both (unicharset, werd) and utf8 are
// supplied, and whether null pointers are tolerated, is decided in
// paragraphs.cpp — confirm there before relying on either.
TESS_API
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
                         bool *is_list, bool *starts_idea, bool *ends_idea);

// Set left word attributes given either a unicharset and werd or a utf8 string.
// The attributes are handed back through the three bool pointers is_list,
// starts_idea and ends_idea.
// NOTE(review): which input wins when both (unicharset, werd) and utf8 are
// supplied, and whether null pointers are tolerated, is decided in
// paragraphs.cpp — confirm there before relying on either.
TESS_API
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
                        bool *is_list, bool *starts_idea, bool *ends_idea);

// How a text line relates to paragraph structure.  Every enumerator is given
// the value of a printable ASCII letter.
// NOTE(review): presumably so that a line type can be shown as a single
// character in debug output — confirm in paragraphs.cpp.
enum LineType {
  LT_START = 'S',    // First line of a paragraph.
  LT_BODY = 'C',     // Continuation line of a paragraph.
  LT_UNKNOWN = 'U',  // No clues.
  LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.
};

// The first paragraph in a page of body text is often un-indented.
// This is a typographic convention which is common to indicate either that:
// (1) The paragraph is the continuation of a previous paragraph, or
// (2) The paragraph is the first paragraph in a chapter.
//
// I refer to such paragraphs as "crown"s, and the output of the paragraph
// detection algorithm attempts to give them the same paragraph model as
// the rest of the body text.
//
// Nonetheless, while building hypotheses, it is useful to mark the lines
// of crown paragraphs temporarily as crowns, either aligned left or right.
//
// The two pointers below are those temporary markers.  They are only declared
// here (extern); their definitions live in another translation unit.
extern const ParagraphModel *kCrownLeft;
extern const ParagraphModel *kCrownRight;

// A model counts as "strong" when it is an actual paragraph model, i.e. it is
// neither absent (nullptr) nor one of the two crown markers.
inline bool StrongModel(const ParagraphModel *model) {
  if (model == nullptr) {
    return false;
  }
  const bool is_crown_marker = (model == kCrownLeft) || (model == kCrownRight);
  return !is_crown_marker;
}

struct LineHypothesis {
  LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}
  LineHypothesis(LineType line_type, const ParagraphModel *m) : ty(line_type), model(m) {}
  LineHypothesis(const LineHypothesis &other) = default;

  // Copy assignment operator.
  LineHypothesis &operator=(const LineHypothesis &other) = default;

  bool operator==(const LineHypothesis &other) const {
    return ty == other.ty && model == other.model;
  }

  LineType ty;
  const ParagraphModel *model;
};

class ParagraphTheory; // Forward declaration; the class is defined later in this header.

// A list of pointers to const ParagraphModel.  Despite the name this is a
// std::vector, so nothing in the type itself keeps the entries unique.
using SetOfModels = std::vector<const ParagraphModel *>;

// Row Scratch Registers are data generated by the paragraph detection
// algorithm based on a RowInfo input.
//
// A default-constructed object has no RowInfo attached (ri_ == nullptr), zero
// margins and indents, and no hypotheses.  Call Init() before using it.
class RowScratchRegisters {
public:
  // We presume row will outlive us.
  void Init(const RowInfo &row);

  // NOTE(review): presumably condenses the stored hypotheses into a single
  // LineType — confirm the exact rules in paragraphs.cpp.
  LineType GetLineType() const;

  // NOTE(review): presumably the same as above, restricted to hypotheses
  // for the given model — confirm in paragraphs.cpp.
  LineType GetLineType(const ParagraphModel *model) const;

  // Mark this as a start line type, sans model.  This is useful for the
  // initial marking of probable body lines or paragraph start lines.
  void SetStartLine();

  // Mark this as a body line type, sans model.  This is useful for the
  // initial marking of probable body lines or paragraph start lines.
  void SetBodyLine();

  // Record that this row fits as a paragraph start line in the given model.
  void AddStartLine(const ParagraphModel *model);
  // Record that this row fits as a paragraph body line in the given model.
  void AddBodyLine(const ParagraphModel *model);

  // Clear all hypotheses about this line.
  void SetUnknown() {
    hypotheses_.clear();
  }

  // Append all hypotheses of strong models that match this row as a start.
  void StartHypotheses(SetOfModels *models) const;

  // Append all hypotheses of strong models matching this row.
  void StrongHypotheses(SetOfModels *models) const;

  // Append all hypotheses for this row.
  // NOTE(review): going by the name, only hypotheses with a non-null model
  // are reported — confirm in paragraphs.cpp.
  void NonNullHypotheses(SetOfModels *models) const;

  // Discard any hypotheses whose model is not in the given list.
  void DiscardNonMatchingHypotheses(const SetOfModels &models);

  // If we have only one hypothesis and that is that this line is a paragraph
  // start line of a certain model, return that model.  Else return nullptr.
  const ParagraphModel *UniqueStartHypothesis() const;

  // If we have only one hypothesis and that is that this line is a paragraph
  // body line of a certain model, return that model.  Else return nullptr.
  const ParagraphModel *UniqueBodyHypothesis() const;

  // Return the indentation for the side opposite of the aligned side:
  // the left indent for right-justified text, the right indent for
  // left-justified text.  For any other justification there is no single
  // "opposite" side, so the larger of the two indents is returned.
  int OffsideIndent(tesseract::ParagraphJustification just) const {
    switch (just) {
      case tesseract::JUSTIFICATION_RIGHT:
        return lindent_;
      case tesseract::JUSTIFICATION_LEFT:
        return rindent_;
      default:
        return lindent_ > rindent_ ? lindent_ : rindent_;
    }
  }

  // Return the indentation for the side the text is aligned to:
  // the right indent for right-justified text, the left indent for
  // left-justified text.  For any other justification the larger of the two
  // indents is returned (the same fallback as OffsideIndent).
  int AlignsideIndent(tesseract::ParagraphJustification just) const {
    switch (just) {
      case tesseract::JUSTIFICATION_RIGHT:
        return rindent_;
      case tesseract::JUSTIFICATION_LEFT:
        return lindent_;
      default:
        return lindent_ > rindent_ ? lindent_ : rindent_;
    }
  }

  // Append header fields to a vector of row headings.
  static void AppendDebugHeaderFields(std::vector<std::string> &header);

  // Append data for this row to a vector of debug strings.
  void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const;

  // The row this scratch data belongs to.  Null until Init() has been called,
  // so that a forgotten Init() is detectable instead of leaving a wild pointer.
  const RowInfo *ri_ = nullptr;

  // These four values form a horizontal box model for the white space
  // on the edges of each line.  At each point in the algorithm, the following
  // shall hold:
  //   ri_->pix_ldistance = lmargin_ + lindent_
  //   ri_->pix_rdistance = rindent_ + rmargin_
  // They start at zero so that the inline indent accessors above never read
  // indeterminate memory on a freshly constructed object.
  int lmargin_ = 0;
  int lindent_ = 0;
  int rindent_ = 0;
  int rmargin_ = 0;

private:
  // Hypotheses of either LT_START or LT_BODY
  std::vector<LineHypothesis> hypotheses_;
};

// Convenience wrapper around the set of Paragraph Models that we believe
// correctly model the paragraphs in the image.
class ParagraphTheory {
public:
  // The vector behind models is presumed to outlive this object and to take
  // ownership of every ParagraphModel * that gets added through it.
  explicit ParagraphTheory(std::vector<ParagraphModel *> *models) : models_{models} {}

  // Mutable / read-only access to the wrapped vector of models.
  std::vector<ParagraphModel *> &models() { return *models_; }
  const std::vector<ParagraphModel *> &models() const { return *models_; }

  // Look for an existing model that is Comparable() to the given one and
  // return it; if there is none, store a newly allocated copy of model and
  // return a pointer to that copy.
  const ParagraphModel *AddModel(const ParagraphModel &model);

  // Drop every model we created that does not appear in used_models.
  void DiscardUnusedModels(const SetOfModels &used_models);

  // Output the set of all non-centered models through models.
  void NonCenteredModels(SetOfModels *models);

  // Return one of the known non-centered paragraph models that fits
  // rows[start, end), or nullptr when none of them does.
  const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
                             int end) const;

  // NOTE(review): presumably the position of model within models() —
  // the not-found result is defined in paragraphs.cpp; confirm there.
  int IndexOf(const ParagraphModel *model) const;

private:
  // The caller-supplied vector handed to the constructor.
  std::vector<ParagraphModel *> *models_;
  // NOTE(review): going by the name, the subset of models that were created
  // through this object — confirm in paragraphs.cpp.
  std::vector<ParagraphModel *> models_we_added_;
};

// NOTE(review): the three predicates below carry no documentation in this
// header.  Going by their names and signatures they presumably test one row
// (or the rows a and b) of *rows against the given paragraph model — confirm
// the exact criteria in paragraphs.cpp.

// Presumably: could (*rows)[row] be the first line of a paragraph of model?
bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
                    const ParagraphModel *model);
// Presumably: could (*rows)[row] be a body (continuation) line of model?
bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
                   const ParagraphModel *model);
// Presumably: are rows a and b compatible as lines of a crown paragraph of
// model?
bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
                     const ParagraphModel *model);

// Spreads ("smears") Paragraph Model hypotheses onto neighbouring rows.
//
// StrongEvidenceClassify first marks only the exceedingly obvious start and
// body rows and builds models from them.  That can leave rows unmarked —
// mostly end-of-paragraph lines that were too short to judge with confidence —
// even though they fit the constructed models perfectly and ought to be
// marked.  This class extends the models over such rows.
class ParagraphModelSmearer {
public:
  ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
                        ParagraphTheory *theory);

  // Carry paragraph models forward from rows that are already marked to the
  // text lines after them wherever they fit, then mark any row that is still
  // unmodeled with whichever models in the theory fit it.
  void Smear();

private:
  // Fill open_models_ for rows [row_start, row_end) with the list of models
  // that are open at each row.
  // A model stays open at a row when some earlier row holds it as a start
  // hypothesis and every row since then, this one included, would fit that
  // model as either a start or a body line.
  void CalculateOpenModels(int row_start, int row_end);

  // The open-model list of the given row.  Entry 0 of open_models_ belongs
  // to row_start_ - 1, hence the offset of one.
  SetOfModels &OpenModels(int row) {
    const int slot = row - row_start_ + 1;
    return open_models_[slot];
  }

  ParagraphTheory *theory_;
  std::vector<RowScratchRegisters> *rows_;
  int row_start_;
  int row_end_;

  // open_models_ corresponds to rows[row_start_ - 1, row_end_].
  //
  // Each entry holds the models for which a paragraph was active (open) as
  // of the previous line and whose left and right indents still allow this
  // text line to continue fitting the same model.
  // TODO(eger): Think about whether we can get rid of "Open" models and just
  //   use the current hypotheses on RowScratchRegisters.
  std::vector<SetOfModels> open_models_;
};

// Clear all hypotheses about lines [start, end) and reset the margins to the
// percentile (0..100) value of the left and right row edges for this run of
// rows.
// rows is taken by non-const pointer; the affected rows are updated in place.
void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
                                        int end, int percentile);

// Return the median inter-word space in rows[row_start, row_end).
// NOTE(review): the unit is presumably pixels, like the pix_* RowInfo fields
// referenced by RowScratchRegisters — confirm in paragraphs.cpp.
int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);

// Return whether the first word on the after line can fit in the space at
// the end of the before line (knowing which way the text is aligned and read).
// Use this overload when the justification of the text is already known.
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,
                           tesseract::ParagraphJustification justification);

// Return whether the first word on the after line can fit in the space at
// the end of the before line (not knowing the text alignment).
// Use this overload when no justification has been established yet.
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);

// Do rows[start, end) form a single instance of the given paragraph model?
// The range is half-open: row `end` itself is not part of the candidate
// paragraph.
bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
                  const ParagraphModel *model);

// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
// normalize each row_owner to point to an actual PARA, and output the
// paragraphs in order onto paragraphs.
// Both arguments are modified: row_owners in place, paragraphs as the output.
// NOTE(review): who owns any PARA created during normalization is not visible
// from this header — confirm in paragraphs.cpp.
void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);

} // namespace tesseract

#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_

Coverage Report

Created: 2025-06-13 07:02

Line	Count	Source (jump to first uncovered line)
1		/**********************************************************************
2		* File: paragraphs_internal.h
3		* Description: Paragraph Detection internal data structures.
4		* Author: David Eger
5		*
6		* (C) Copyright 2011, Google Inc.
7		** Licensed under the Apache License, Version 2.0 (the "License");
8		** you may not use this file except in compliance with the License.
9		** You may obtain a copy of the License at
10		** http://www.apache.org/licenses/LICENSE-2.0
11		** Unless required by applicable law or agreed to in writing, software
12		** distributed under the License is distributed on an "AS IS" BASIS,
13		** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		** See the License for the specific language governing permissions and
15		** limitations under the License.
16		*
17		**********************************************************************/
18
19		#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
20		#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
21
22		#include <tesseract/publictypes.h> // for ParagraphJustification
23		#include "paragraphs.h"
24
25		// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
26		// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
27
28		namespace tesseract {
29
30		class UNICHARSET;
31		class WERD_CHOICE;
32
33		// Return whether the given word is likely to be a list item start word.
34		TESS_API
35		bool AsciiLikelyListItem(const std::string &word);
36
37		// Set right word attributes given either a unicharset and werd or a utf8
38		// string.
39		TESS_API
40		void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
41		bool *is_list, bool *starts_idea, bool *ends_idea);
42
43		// Set left word attributes given either a unicharset and werd or a utf8 string.
44		TESS_API
45		void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
46		bool *is_list, bool *starts_idea, bool *ends_idea);
47
48		enum LineType {
49		LT_START = 'S', // First line of a paragraph.
50		LT_BODY = 'C', // Continuation line of a paragraph.
51		LT_UNKNOWN = 'U', // No clues.
52		LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY.
53		};
54
55		// The first paragraph in a page of body text is often un-indented.
56		// This is a typographic convention which is common to indicate either that:
57		// (1) The paragraph is the continuation of a previous paragraph, or
58		// (2) The paragraph is the first paragraph in a chapter.
59		//
60		// I refer to such paragraphs as "crown"s, and the output of the paragraph
61		// detection algorithm attempts to give them the same paragraph model as
62		// the rest of the body text.
63		//
64		// Nonetheless, while building hypotheses, it is useful to mark the lines
65		// of crown paragraphs temporarily as crowns, either aligned left or right.
66		extern const ParagraphModel *kCrownLeft;
67		extern const ParagraphModel *kCrownRight;
68
69	234k	inline bool StrongModel(const ParagraphModel *model) {
70	234k	return model != nullptr && model != kCrownLeft && model != kCrownRight;
71	234k	}
72
73		struct LineHypothesis {
74	0	LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {}
75	82.6k	LineHypothesis(LineType line_type, const ParagraphModel *m) : ty(line_type), model(m) {}
76		LineHypothesis(const LineHypothesis &other) = default;
77
78		// Copy assignment operator.
79		LineHypothesis &operator=(const LineHypothesis &other) = default;
80
81	37.6k	bool operator==(const LineHypothesis &other) const {
82	37.6k	return ty == other.ty && model == other.model;
83	37.6k	}
84
85		LineType ty;
86		const ParagraphModel *model;
87		};
88
89		class ParagraphTheory; // Forward Declaration
90
91		using SetOfModels = std::vector<const ParagraphModel *>;
92
93		// Row Scratch Registers are data generated by the paragraph detection
94		// algorithm based on a RowInfo input.
95		class RowScratchRegisters {
96		public:
97		// We presume row will outlive us.
98		void Init(const RowInfo &row);
99
100		LineType GetLineType() const;
101
102		LineType GetLineType(const ParagraphModel *model) const;
103
104		// Mark this as a start line type, sans model. This is useful for the
105		// initial marking of probable body lines or paragraph start lines.
106		void SetStartLine();
107
108		// Mark this as a body line type, sans model. This is useful for the
109		// initial marking of probably body lines or paragraph start lines.
110		void SetBodyLine();
111
112		// Record that this row fits as a paragraph start line in the given model,
113		void AddStartLine(const ParagraphModel *model);
114		// Record that this row fits as a paragraph body line in the given model,
115		void AddBodyLine(const ParagraphModel *model);
116
117		// Clear all hypotheses about this line.
118	141k	void SetUnknown() {
119	141k	hypotheses_.clear();
120	141k	}
121
122		// Append all hypotheses of strong models that match this row as a start.
123		void StartHypotheses(SetOfModels *models) const;
124
125		// Append all hypotheses of strong models matching this row.
126		void StrongHypotheses(SetOfModels *models) const;
127
128		// Append all hypotheses for this row.
129		void NonNullHypotheses(SetOfModels *models) const;
130
131		// Discard any hypotheses whose model is not in the given list.
132		void DiscardNonMatchingHypotheses(const SetOfModels &models);
133
134		// If we have only one hypothesis and that is that this line is a paragraph
135		// start line of a certain model, return that model. Else return nullptr.
136		const ParagraphModel *UniqueStartHypothesis() const;
137
138		// If we have only one hypothesis and that is that this line is a paragraph
139		// body line of a certain model, return that model. Else return nullptr.
140		const ParagraphModel *UniqueBodyHypothesis() const;
141
142		// Return the indentation for the side opposite of the aligned side.
143	170k	int OffsideIndent(tesseract::ParagraphJustification just) const {
144	170k	switch (just) {
145	40.7k	case tesseract::JUSTIFICATION_RIGHT:
146	40.7k	return lindent_;
147	129k	case tesseract::JUSTIFICATION_LEFT:
148	129k	return rindent_;
149	0	default:
150	0	return lindent_ > rindent_ ? lindent_ : rindent_;
151	170k	}
152	170k	}
153
154		// Return the indentation for the side the text is aligned to.
155	2.81k	int AlignsideIndent(tesseract::ParagraphJustification just) const {
156	2.81k	switch (just) {
157	341	case tesseract::JUSTIFICATION_RIGHT:
158	341	return rindent_;
159	2.47k	case tesseract::JUSTIFICATION_LEFT:
160	2.47k	return lindent_;
161	0	default:
162	0	return lindent_ > rindent_ ? lindent_ : rindent_;
163	2.81k	}
164	2.81k	}
165
166		// Append header fields to a vector of row headings.
167		static void AppendDebugHeaderFields(std::vector<std::string> &header);
168
169		// Append data for this row to a vector of debug strings.
170		void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const;
171
172		const RowInfo *ri_;
173
174		// These four constants form a horizontal box model for the white space
175		// on the edges of each line. At each point in the algorithm, the following
176		// shall hold:
177		// ri_->pix_ldistance = lmargin_ + lindent_
178		// ri_->pix_rdistance = rindent_ + rmargin_
179		int lmargin_;
180		int lindent_;
181		int rindent_;
182		int rmargin_;
183
184		private:
185		// Hypotheses of either LT_START or LT_BODY
186		std::vector<LineHypothesis> hypotheses_;
187		};
188
189		// A collection of convenience functions for wrapping the set of
190		// Paragraph Models we believe correctly model the paragraphs in the image.
191		class ParagraphTheory {
192		public:
193		// We presume models will outlive us, and that models will take ownership
194		// of any ParagraphModel *'s we add.
195	6.87k	explicit ParagraphTheory(std::vector<ParagraphModel *> *models) : models_(models) {}
196	0	std::vector<ParagraphModel *> &models() {
197	0	return *models_;
198	0	}
199	0	const std::vector<ParagraphModel *> &models() const {
200	0	return *models_;
201	0	}
202
203		// Return an existing model if one that is Comparable() can be found.
204		// Else, allocate a new copy of model to save and return a pointer to it.
205		const ParagraphModel *AddModel(const ParagraphModel &model);
206
207		// Discard any models we've made that are not in the list of used models.
208		void DiscardUnusedModels(const SetOfModels &used_models);
209
210		// Return the set of all non-centered models.
211		void NonCenteredModels(SetOfModels *models);
212
213		// If any of the non-centered paragraph models we know about fit
214		// rows[start, end), return it. Else nullptr.
215		const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start,
216		int end) const;
217
218		int IndexOf(const ParagraphModel *model) const;
219
220		private:
221		std::vector<ParagraphModel *> *models_;
222		std::vector<ParagraphModel *> models_we_added_;
223		};
224
225		bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
226		const ParagraphModel *model);
227		bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
228		const ParagraphModel *model);
229		bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
230		const ParagraphModel *model);
231
232		// A class for smearing Paragraph Model hypotheses to surrounding rows.
233		// The idea here is that StrongEvidenceClassify first marks only exceedingly
234		// obvious start and body rows and constructs models of them. Thereafter,
235		// we may have left over unmarked lines (mostly end-of-paragraph lines) which
236		// were too short to have much confidence about, but which fit the models we've
237		// constructed perfectly and which we ought to mark. This class is used to
238		// "smear" our models over the text.
239		class ParagraphModelSmearer {
240		public:
241		ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
242		ParagraphTheory *theory);
243
244		// Smear forward paragraph models from existing row markings to subsequent
245		// text lines if they fit, and mark any thereafter still unmodeled rows
246		// with any model in the theory that fits them.
247		void Smear();
248
249		private:
250		// Record in open_models_ for rows [start_row, end_row) the list of models
251		// currently open at each row.
252		// A model is still open in a row if some previous row has said model as a
253		// start hypothesis, and all rows since (including this row) would fit as
254		// either a body or start line in that model.
255		void CalculateOpenModels(int row_start, int row_end);
256
257	341k	SetOfModels &OpenModels(int row) {
258	341k	return open_models_[row - row_start_ + 1];
259	341k	}
260
261		ParagraphTheory *theory_;
262		std::vector<RowScratchRegisters> *rows_;
263		int row_start_;
264		int row_end_;
265
266		// open_models_ corresponds to rows[start_row_ - 1, end_row_]
267		//
268		// open_models_: Contains models which there was an active (open) paragraph
269		// as of the previous line and for which the left and right
270		// indents admit the possibility that this text line continues
271		// to fit the same model.
272		// TODO(eger): Think about whether we can get rid of "Open" models and just
273		// use the current hypotheses on RowScratchRegisters.
274		std::vector<SetOfModels> open_models_;
275		};
276
277		// Clear all hypotheses about lines [start, end) and reset the margins to the
278		// percentile (0..100) value of the left and right row edges for this run of
279		// rows.
280		void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
281		int end, int percentile);
282
283		// Return the median inter-word space in rows[row_start, row_end).
284		int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end);
285
286		// Return whether the first word on the after line can fit in the space at
287		// the end of the before line (knowing which way the text is aligned and read).
288		bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after,
289		tesseract::ParagraphJustification justification);
290
291		// Return whether the first word on the after line can fit in the space at
292		// the end of the before line (not knowing the text alignment).
293		bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after);
294
295		// Do rows[start, end) form a single instance of the given paragraph model?
296		bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
297		const ParagraphModel *model);
298
299		// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
300		// normalize each row_owner to point to an actual PARA, and output the
301		// paragraphs in order onto paragraphs.
302		void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs);
303
304		} // namespace tesseract
305
306		#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_