Coverage Report

Created: 2025-07-23 07:12

/src/tesseract/src/ccstruct/werd.h
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        werd.h
3
 * Description: Code for the WERD class.
4
 * Author:      Ray Smith
5
 *
6
 * (C) Copyright 1991, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
#ifndef WERD_H
20
#define WERD_H
21
22
#include "elst2.h"
23
#include "params.h"
24
#include "stepblob.h"
25
26
#include <bitset>
27
28
namespace tesseract {
29
30
enum WERD_FLAGS { // Bit indices (not masks) into WERD's std::bitset flags; see WERD::flag()/set_flag().
31
  W_SEGMENTED,          ///< correctly segmented
32
  W_ITALIC,             ///< italic text
33
  W_BOLD,               ///< bold text
34
  W_BOL,                ///< start of line
35
  W_EOL,                ///< end of line
36
  W_NORMALIZED,         ///< flags (NOTE(review): presumably "word is normalized" - confirm)
37
  W_SCRIPT_HAS_XHEIGHT, ///< x-height concept makes sense.
38
  W_SCRIPT_IS_LATIN,    ///< Special case latin for y. splitting.
39
  W_DONT_CHOP,          ///< fixed pitch chopped
40
  W_REP_CHAR,           ///< repeated character
41
  W_FUZZY_SP,           ///< fuzzy space
42
  W_FUZZY_NON,          ///< fuzzy nonspace
43
  W_INVERSE             ///< white on black
44
};
45
46
enum DISPLAY_FLAGS {
47
  /* Display flag bit-number allocations (bit indices, presumably for WERD::display_flag()/set_display_flag()) */
48
  DF_BOX,          ///< Bounding box
49
  DF_TEXT,         ///< Correct ascii
50
  DF_POLYGONAL,    ///< Polygonal approximation
51
  DF_EDGE_STEP,    ///< Edge steps
52
  DF_BN_POLYGONAL, ///< BL-normalised polygonal approximation
53
  DF_BLAMER        ///< Blamer information
54
};
55
56
class ROW; // forward declaration; presumably defined in ocrrow.h, which is included at the end of this header
57
58
class TESS_API WERD : public ELIST2<WERD>::LINK { // A word: good and rejected blob lists plus blank count, flags, script id and correct text.
59
public:
60
128k
  WERD() = default;
61
  // WERD constructed with:
62
  //   blob_list - blobs of the word (we take this list's contents)
63
  //   blanks - number of blanks before the word
64
  //   text - correct text (outlives WERD) NOTE(review): `correct` is a std::string, so this lifetime note may be stale - confirm
65
  WERD(C_BLOB_LIST *blob_list, uint8_t blanks, const char *text);
66
67
  // WERD constructed from:
68
  //   blob_list - blobs in the word
69
  //   clone - werd to clone flags, etc from.
70
  WERD(C_BLOB_LIST *blob_list, WERD *clone);
71
72
  // Construct a WERD from a single_blob and clone the flags from this.
73
  // W_BOL and W_EOL flags are set according to the given values.
74
  WERD *ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob);
75
76
554k
  ~WERD() = default;
77
78
  // Copy assignment (declared here, defined out of line).
79
  WERD &operator=(const WERD &source);
80
81
  // This method returns a new werd constructed using the blobs in the input
82
  // all_blobs list, which correspond to the blobs in this werd object. The
83
  // blobs used to construct the new word are consumed and removed from the
84
  // input all_blobs list.
85
  // Returns nullptr if the word couldn't be constructed.
86
  // Returns original blobs for which no matches were found in the output list
87
  // orphan_blobs (appends).
88
  WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs);
89
90
  // Accessor for the reject / DUFF blob list (returns a pointer to the member list).
91
119k
  C_BLOB_LIST *rej_cblob_list() { // compact format
92
119k
    return &rej_cblobs;
93
119k
  }
94
95
  // Accessor for the good blob list (returns a pointer to the member list).
96
4.18M
  C_BLOB_LIST *cblob_list() { // get compact blobs
97
4.18M
    return &cblobs;
98
4.18M
  }
99
100
14.0k
  uint8_t space() const { // returns the stored blank count (blanks before the word)
101
14.0k
    return blanks;
102
14.0k
  }
103
14.0k
  void set_blanks(uint8_t new_blanks) { // setter counterpart of space()
104
14.0k
    blanks = new_blanks;
105
14.0k
  }
106
0
  int script_id() const {
107
0
    return script_id_;
108
0
  }
109
456k
  void set_script_id(int id) { // NOTE: id is stored in an int16_t member, so larger values are narrowed
110
456k
    script_id_ = id;
111
456k
  }
112
113
  // Returns the (default) bounding box including all the dots.
114
  TBOX bounding_box() const; // compute bounding box
115
  // Returns the bounding box including the desired combination of upper and
116
  // lower noise/diacritic elements.
117
  TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
118
  // Returns the bounding box of only the good blobs.
119
  TBOX true_bounding_box() const;
120
121
0
  const char *text() const { // pointer into the owned string `correct`; invalidated when that string changes
122
0
    return correct.c_str();
123
0
  }
124
0
  void set_text(const char *new_text) { // copies new_text into the owned string
125
0
    correct = new_text;
126
0
  }
127
128
4.62M
  bool flag(WERD_FLAGS mask) const { // mask is a bit index (a WERD_FLAGS value), not a bitmask
129
4.62M
    return flags[mask];
130
4.62M
  }
131
2.45M
  void set_flag(WERD_FLAGS mask, bool value) { // mask is a bit index, as in flag()
132
2.45M
    flags.set(mask, value);
133
2.45M
  }
134
135
0
  bool display_flag(uint8_t flag) const { // flag is a bit index; bitset::operator[] does no range check
136
0
    return disp_flags[flag];
137
0
  }
138
0
  void set_display_flag(uint8_t flag, bool value) { // flag is a bit index; bitset::set throws std::out_of_range if flag >= 16
139
0
    disp_flags.set(flag, value);
140
0
  }
141
142
  WERD *shallow_copy(); // shallow copy word
143
144
  // reposition word by vector
145
  void move(const ICOORD vec);
146
147
  // join other's blobs onto this werd, emptying out other.
148
  void join_on(WERD *other);
149
150
  // copy other's blobs onto this word, leaving other intact.
151
  void copy_on(WERD *other);
152
153
  // tprintf word metadata (but not blob innards)
154
  void print() const;
155
156
#ifndef GRAPHICS_DISABLED
157
  // plot word on window in a uniform colour
158
  void plot(ScrollView *window, ScrollView::Color colour);
159
160
  // Get the next color in the (looping) rainbow.
161
  static ScrollView::Color NextColor(ScrollView::Color colour);
162
163
  // plot word on window in a rainbow of colours
164
  void plot(ScrollView *window);
165
166
  // plot rejected blobs in a rainbow of colours
167
  void plot_rej_blobs(ScrollView *window);
168
#endif // !GRAPHICS_DISABLED
169
170
  // Removes noise from the word by moving small outlines to the rej_cblobs
171
  // list, based on the size_threshold.
172
  void CleanNoise(float size_threshold);
173
174
  // Extracts all the noise outlines and stuffs the pointers into the given
175
  // vector of outlines. Afterwards, the outlines vector owns the pointers.
176
  void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines);
177
  // Adds the selected outlines to the indicated real blobs, and puts the rest
178
  // back in rej_cblobs where they came from. Where the target_blobs entry is
179
  // nullptr, a run of wanted outlines is put into a single new blob.
180
  // Ownership of the outlines is transferred back to the word. (Hence
181
  // vector and not PointerVector.)
182
  // Returns true if any new blob was added to the start of the word, which
183
  // suggests that it might need joining to the word before it, and likewise
184
  // sets make_next_word_fuzzy true if any new blob was added to the end.
185
  bool AddSelectedOutlines(const std::vector<bool> &wanted,
186
                           const std::vector<C_BLOB *> &target_blobs,
187
                           const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);
188
189
private:
190
  uint8_t blanks = 0;     // no of blanks
191
  std::bitset<16> flags;  // flags about word, indexed by WERD_FLAGS
192
  std::bitset<16> disp_flags; // display flags, indexed by the uint8_t passed to display_flag()
193
  int16_t script_id_ = 0; // From unicharset; set_script_id(int) narrows into this.
194
  std::string correct;    // correct text
195
  C_BLOB_LIST cblobs;     // compacted blobs
196
  C_BLOB_LIST rej_cblobs; // DUFF blobs
197
};
198
199
ELIST2IZEH(WERD) // presumably the elst2.h macro that declares the WERD list/iterator types (e.g. WERD_LIST) - confirm
200
201
} // namespace tesseract
202
203
#include "ocrrow.h" // placed here, after WERD is defined (original note was truncated; presumably a circular dependency with ocrrow.h - confirm)
204
205
namespace tesseract {
206
207
// Compare words by increasing order of left edge. NOTE(review): described as suitable for qsort(3), but it takes typed WERD pointers, not const void *, so it cannot be passed to qsort directly.
208
int word_comparator(const WERD *word1, const WERD *word2);
209
210
} // namespace tesseract
211
212
#endif