/src/tesseract/src/ccstruct/werd.h
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: werd.h |
3 | | * Description: Code for the WERD class. |
4 | | * Author: Ray Smith |
5 | | * |
6 | | * (C) Copyright 1991, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | **********************************************************************/ |
18 | | |
19 | | #ifndef WERD_H |
20 | | #define WERD_H |
21 | | |
22 | | #include "elst2.h" |
23 | | #include "params.h" |
24 | | #include "stepblob.h" |
25 | | |
26 | | #include <bitset> |
27 | | |
28 | | namespace tesseract { |
29 | | |
30 | | enum WERD_FLAGS { |
31 | | W_SEGMENTED, ///< word is correctly segmented (first enumerator; these values index the bits of WERD::flags) |
32 | | W_ITALIC, ///< italic text |
33 | | W_BOLD, ///< bold text |
34 | | W_BOL, ///< word is at the start of a line |
35 | | W_EOL, ///< word is at the end of a line |
36 | | W_NORMALIZED, ///< normalized (original comment said only "flags"; exact meaning not evident here - TODO confirm) |
37 | | W_SCRIPT_HAS_XHEIGHT, ///< the x-height concept makes sense for this word's script. |
38 | | W_SCRIPT_IS_LATIN, ///< script is Latin: special case for y. splitting. |
39 | | W_DONT_CHOP, ///< fixed pitch chopped (presumably: do not chop this word again - TODO confirm) |
40 | | W_REP_CHAR, ///< repeated character |
41 | | W_FUZZY_SP, ///< fuzzy space |
42 | | W_FUZZY_NON, ///< fuzzy non-space |
43 | | W_INVERSE ///< white on black (last of 13 enumerators, so all fit in the 16-bit WERD::flags) |
44 | | }; |
45 | | |
46 | | enum DISPLAY_FLAGS { |
47 | | /* Bit-number allocations for the display flags (presumably the indices passed to WERD::display_flag / WERD::set_display_flag) */ |
48 | | DF_BOX, ///< Bounding box |
49 | | DF_TEXT, ///< Correct text |
50 | | DF_POLYGONAL, ///< Polygonal approximation |
51 | | DF_EDGE_STEP, ///< Edge steps |
52 | | DF_BN_POLYGONAL, ///< BL-normalised polygonal approximation |
53 | | DF_BLAMER ///< Blamer information |
54 | | }; |
55 | | |
56 | | class ROW; // forward declaration (no declaration visible in this header refers to ROW) |
57 | | |
58 | | class TESS_API WERD : public ELIST2<WERD>::LINK { |
59 | | public: |
60 | 128k | WERD() = default; |
61 | | // Constructs a WERD from: |
62 | | // blob_list - blobs of the word (we take this list's contents) |
63 | | // blanks - number of blanks before the word |
64 | | // text - correct text. NOTE(review): the original "(outlives WERD)" remark looks stale, since the text is kept in the std::string member `correct` - confirm against the constructor definition. |
65 | | WERD(C_BLOB_LIST *blob_list, uint8_t blanks, const char *text); |
66 | | |
67 | | // Constructs a WERD from: |
68 | | // blob_list - blobs in the word |
69 | | // clone - werd whose flags, etc. are cloned into the new word. |
70 | | WERD(C_BLOB_LIST *blob_list, WERD *clone); |
71 | | |
72 | | // Constructs a WERD from a single blob and clones the flags from this. |
73 | | // The W_BOL and W_EOL flags are set according to the given bol/eol values. |
74 | | WERD *ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob); |
75 | | |
76 | 554k | ~WERD() = default; |
77 | | |
78 | | // Copy assignment (declared here; the definition is not in this header). |
79 | | WERD &operator=(const WERD &source); |
80 | | |
81 | | // Returns a new werd constructed using the blobs in the input all_blobs |
82 | | // list that correspond to the blobs in this werd object. The blobs used |
83 | | // to construct the new word are consumed and removed from the input |
84 | | // all_blobs list. |
85 | | // Returns nullptr if the word couldn't be constructed. |
86 | | // Original blobs for which no matches were found are returned by appending |
87 | | // them to the output list orphan_blobs. |
88 | | WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs); |
89 | | |
90 | | // Accessor for the reject / DUFF blobs. |
91 | 119k | C_BLOB_LIST *rej_cblob_list() { // compact format: pointer to the rej_cblobs member |
92 | 119k | return &rej_cblobs; |
93 | 119k | } |
94 | | |
95 | | // Accessor for the good blobs. |
96 | 4.18M | C_BLOB_LIST *cblob_list() { // compact blobs: pointer to the cblobs member |
97 | 4.18M | return &cblobs; |
98 | 4.18M | } |
99 | | |
100 | 14.0k | uint8_t space() const { // returns the blanks member (number of blanks before the word) |
101 | 14.0k | return blanks; |
102 | 14.0k | } |
103 | 14.0k | void set_blanks(uint8_t new_blanks) { |
104 | 14.0k | blanks = new_blanks; |
105 | 14.0k | } |
106 | 0 | int script_id() const { |
107 | 0 | return script_id_; |
108 | 0 | } |
109 | 456k | void set_script_id(int id) { |
110 | 456k | script_id_ = id; |
111 | 456k | } |
112 | | |
113 | | // Returns the (default) bounding box including all the dots. |
114 | | TBOX bounding_box() const; // declared only; computed out of line |
115 | | // Returns the bounding box including the desired combination of upper and |
116 | | // lower noise/diacritic elements (selected by upper_dots / lower_dots). |
117 | | TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; |
118 | | // Returns the bounding box of only the good blobs. |
119 | | TBOX true_bounding_box() const; |
120 | | |
121 | 0 | const char *text() const { |
122 | 0 | return correct.c_str(); |
123 | 0 | } |
124 | 0 | void set_text(const char *new_text) { |
125 | 0 | correct = new_text; |
126 | 0 | } |
127 | | |
128 | 4.62M | bool flag(WERD_FLAGS mask) const { |
129 | 4.62M | return flags[mask]; |
130 | 4.62M | } |
131 | 2.45M | void set_flag(WERD_FLAGS mask, bool value) { |
132 | 2.45M | flags.set(mask, value); |
133 | 2.45M | } |
134 | | |
135 | 0 | bool display_flag(uint8_t flag) const { |
136 | 0 | return disp_flags[flag]; |
137 | 0 | } |
138 | 0 | void set_display_flag(uint8_t flag, bool value) { |
139 | 0 | disp_flags.set(flag, value); |
140 | 0 | } |
141 | | |
142 | | WERD *shallow_copy(); // shallow copy of the word (NOTE(review): ownership of the returned pointer is not visible here) |
143 | | |
144 | | // Repositions the word by the vector vec. |
145 | | void move(const ICOORD vec); |
146 | | |
147 | | // Joins other's blobs onto this werd, emptying out other. |
148 | | void join_on(WERD *other); |
149 | | |
150 | | // Copies other's blobs onto this word, leaving other intact. |
151 | | void copy_on(WERD *other); |
152 | | |
153 | | // Prints (tprintf) the word metadata, but not the blob innards. |
154 | | void print() const; |
155 | | |
156 | | #ifndef GRAPHICS_DISABLED |
157 | | // Plots the word on window in a uniform colour. |
158 | | void plot(ScrollView *window, ScrollView::Color colour); |
159 | | |
160 | | // Gets the next color in the (looping) rainbow. |
161 | | static ScrollView::Color NextColor(ScrollView::Color colour); |
162 | | |
163 | | // Plots the word on window in a rainbow of colours. |
164 | | void plot(ScrollView *window); |
165 | | |
166 | | // Plots the rejected blobs in a rainbow of colours. |
167 | | void plot_rej_blobs(ScrollView *window); |
168 | | #endif // !GRAPHICS_DISABLED |
169 | | |
170 | | // Removes noise from the word by moving small outlines to the rej_cblobs |
171 | | // list, based on the size_threshold. |
172 | | void CleanNoise(float size_threshold); |
173 | | |
174 | | // Extracts all the noise outlines and stuffs the pointers into the given |
175 | | // vector of outlines. Afterwards, the outlines vector owns the pointers. |
176 | | void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines); |
177 | | // Adds the selected outlines to the indicated real blobs, and puts the rest |
178 | | // back in rej_cblobs where they came from. Where the target_blobs entry is |
179 | | // nullptr, a run of wanted outlines is put into a single new blob. |
180 | | // Ownership of the outlines is transferred back to the word. (Hence |
181 | | // vector and not PointerVector.) |
182 | | // Returns true if any new blob was added to the start of the word, which |
183 | | // suggests that it might need joining to the word before it, and likewise |
184 | | // sets *make_next_word_fuzzy to true if any new blob was added to the end. |
185 | | bool AddSelectedOutlines(const std::vector<bool> &wanted, |
186 | | const std::vector<C_BLOB *> &target_blobs, |
187 | | const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy); |
188 | | |
189 | | private: |
190 | | uint8_t blanks = 0; // number of blanks before the word |
191 | | std::bitset<16> flags; // flags about the word, indexed by WERD_FLAGS (see flag / set_flag) |
192 | | std::bitset<16> disp_flags; // display flags, indexed by bit number; NOTE(review): display_flag() uses the unchecked operator[], so callers must pass an index < 16 |
193 | | int16_t script_id_ = 0; // From unicharset. NOTE(review): set_script_id() takes an int, which is narrowed to int16_t when stored here. |
194 | | std::string correct; // correct text (returned by text(), replaced by set_text()) |
195 | | C_BLOB_LIST cblobs; // compacted blobs (the good blobs of the word) |
196 | | C_BLOB_LIST rej_cblobs; // DUFF (rejected) blobs |
197 | | }; |
198 | | |
199 | | ELIST2IZEH(WERD) |
200 | | |
201 | | } // namespace tesseract |
202 | | |
203 | | #include "ocrrow.h" // placed here, after the WERD declarations above, presumably because ocrrow.h depends on them - TODO confirm |
204 | | |
205 | | namespace tesseract { |
206 | | |
207 | | // Compares two words by increasing order of left edge. NOTE(review): the parameters are typed const WERD * rather than const void *, so this is not directly usable as a qsort(3) callback despite the original remark - confirm against the callers. |
208 | | int word_comparator(const WERD *word1, const WERD *word2); |
209 | | |
210 | | } // namespace tesseract |
211 | | |
212 | | #endif |