Coverage Report

Created: 2025-11-16 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/ccstruct/ocrpara.h
Line
Count
Source
1
/////////////////////////////////////////////////////////////////////
2
// File:        ocrpara.h
3
// Description: OCR Paragraph Output Type
4
// Author:      David Eger
5
//
6
// (C) Copyright 2010, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
17
///////////////////////////////////////////////////////////////////////
18
19
#ifndef TESSERACT_CCSTRUCT_OCRPARA_H_
20
#define TESSERACT_CCSTRUCT_OCRPARA_H_
21
22
#include "elst.h"
23
24
#include <tesseract/publictypes.h>
25
26
namespace tesseract {
27
28
class ParagraphModel;
29
30
struct PARA : public ELIST<PARA>::LINK {
31
public:
32
  PARA()
33
21.1k
      : model(nullptr)
34
21.1k
      , is_list_item(false)
35
21.1k
      , is_very_first_or_continuation(false)
36
21.1k
      , has_drop_cap(false) {}
37
38
  // We do not own the model, we just reference it.
39
  // model may be nullptr if there is not a good model for this paragraph.
40
  const ParagraphModel *model;
41
42
  bool is_list_item;
43
44
  // The first paragraph on a page often lacks a first line indent, but should
45
  // still be modeled by the same model as other body text paragraphs on the
46
  // page.
47
  bool is_very_first_or_continuation;
48
49
  // Does this paragraph begin with a drop cap?
50
  bool has_drop_cap;
51
};
52
53
ELISTIZEH(PARA)
54
55
// A geometric model of paragraph indentation and alignment.
56
//
57
// Measurements are in pixels. The meaning of the integer arguments changes
58
// depending upon the value of justification.  Distances less than or equal
59
// to tolerance apart we take as "equivalent" for the purpose of model
60
// matching, and in the examples below, we assume tolerance is zero.
61
//
62
// justification = LEFT:
63
//   margin       the "ignored" margin to the left block edge.
64
//   first_indent indent from the left margin to a typical first text line.
65
//   body_indent  indent from the left margin to a typical body text line.
66
//
67
// justification = RIGHT:
68
//   margin       the "ignored" margin to the right block edge.
69
//   first_indent indent from the right margin to a typical first text line.
70
//   body_indent  indent from the right margin to a typical body text line.
71
//
72
// justification = CENTER:
73
//   margin       ignored
74
//   first_indent ignored
75
//   body_indent  ignored
76
//
77
//  ====== Extended example, assuming each letter is ten pixels wide: =======
78
//
79
// +--------------------------------+
80
// |      Awesome                   | ParagraphModel(CENTER, 0, 0, 0)
81
// |   Centered Title               |
82
// | Paragraph Detection            |
83
// |      OCR TEAM                  |
84
// |  10 November 2010              |
85
// |                                |
86
// |  Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0)
87
// |This paragraph starts at the top|
88
// |of the page and takes 3 lines.  |
89
// |  Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0)
90
// |which indicates that the first  |
91
// |paragraph is not a continuation |
92
// |from a previous page, as it is  |
93
// |indented just like this second  |
94
// |paragraph.                      |
95
// |   Here is a block quote. It    | ParagraphModel(LEFT, 30, 0, 0)
96
// |   looks like the prior text    |
97
// |   but it  is indented  more    |
98
// |   and is fully justified.      |
99
// |  So how does one deal with     | ParagraphModel(LEFT, 0, 20, 0)
100
// |centered text, block quotes,    |
101
// |normal paragraphs, and lists    |
102
// |like what follows?              |
103
// |1. Make a plan.                 | ParagraphModel(LEFT, 0, 0, 30)
104
// |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30)
105
// |   looking for lines where the  |
106
// |   first word of the next line  |
107
// |   would fit on the previous    |
108
// |   line.                        |
109
// |8. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30)
110
// |   Python and try it out.       |
111
// |4. Determine how to fix the     | ParagraphModel(LEFT, 0, 0, 30)
112
// |   mistakes.                    |
113
// |5. Repeat.                      | ParagraphModel(LEFT, 0, 0, 30)
114
// |  For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0)
115
// |you can try to identify source  |
116
// |code.  Ouch!                    |
117
// +--------------------------------+
118
class TESS_API ParagraphModel {
119
public:
120
  ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent,
121
                 int body_indent, int tolerance)
122
38.3k
      : justification_(justification)
123
38.3k
      , margin_(margin)
124
38.3k
      , first_indent_(first_indent)
125
38.3k
      , body_indent_(body_indent)
126
38.3k
      , tolerance_(tolerance) {
127
    // Normalize so that the smaller of {first_indent, body_indent} is 0,
    // folding the common part of the two indents into the margin.
128
38.3k
    int added_margin = first_indent;
129
38.3k
    if (body_indent < added_margin) {
130
7.14k
      added_margin = body_indent;
131
7.14k
    }
132
38.3k
    margin_ += added_margin;
133
38.3k
    first_indent_ -= added_margin;
134
38.3k
    body_indent_ -= added_margin;
135
38.3k
  }
136
137
  ParagraphModel()
138
35.1k
      : justification_(tesseract::JUSTIFICATION_UNKNOWN)
139
35.1k
      , margin_(0)
140
35.1k
      , first_indent_(0)
141
35.1k
      , body_indent_(0)
142
35.1k
      , tolerance_(0) {}
143
144
  // ValidFirstLine() and ValidBodyLine() take arguments describing a text line
145
  // in a block of text which we are trying to model:
146
  //   lmargin, lindent:  these add up to the distance from the leftmost ink
147
  //                      in the text line to the surrounding text block's left
148
  //                      edge.
149
  //   rmargin, rindent:  these add up to the distance from the rightmost ink
150
  //                      in the text line to the surrounding text block's right
151
  //                      edge.
152
  // The caller determines the division between "margin" and "indent", which
153
  // only actually affects whether we think the line may be centered.
154
  //
155
  // If the amount of whitespace matches the amount of whitespace expected on
156
  // the relevant side of the line (within tolerance_) we say it matches.
157
158
  // Return whether a given text line could be a first paragraph line according
159
  // to this paragraph model.
160
  bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;
161
162
  // Return whether a given text line could be a body paragraph line according
163
  // to this paragraph model.
164
  bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;
165
166
85.7k
  tesseract::ParagraphJustification justification() const {
167
85.7k
    return justification_;
168
85.7k
  }
169
0
  int margin() const {
170
0
    return margin_;
171
0
  }
172
3.76k
  int first_indent() const {
173
3.76k
    return first_indent_;
174
3.76k
  }
175
3.76k
  int body_indent() const {
176
3.76k
    return body_indent_;
177
3.76k
  }
178
3.76k
  int tolerance() const {
179
3.76k
    return tolerance_;
180
3.76k
  }
181
19.2k
  bool is_flush() const {
182
19.2k
    return (justification_ == tesseract::JUSTIFICATION_LEFT ||
183
17.7k
            justification_ == tesseract::JUSTIFICATION_RIGHT) &&
184
19.1k
           abs(first_indent_ - body_indent_) <= tolerance_;
185
19.2k
  }
186
187
  // Return whether this model is likely to agree with the other model on most
188
  // paragraphs that are marked with them.
189
  bool Comparable(const ParagraphModel &other) const;
190
191
  std::string ToString() const;
192
193
private:
194
  tesseract::ParagraphJustification justification_;
195
  int margin_;
196
  int first_indent_;
197
  int body_indent_;
198
  int tolerance_;
199
};
200
201
} // namespace tesseract
202
203
#endif // TESSERACT_CCSTRUCT_OCRPARA_H_