/src/tesseract/src/ccstruct/rejctmap.h
Line | Count | Source (jump to first uncovered line) |
1 | | /********************************************************************** |
2 | | * File: rejctmap.h (Formerly rejmap.h) |
3 | | * Description: REJ and REJMAP class functions. |
4 | | * Author: Phil Cheatle |
5 | | * |
6 | | * (C) Copyright 1994, Hewlett-Packard Ltd. |
7 | | ** Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | ** you may not use this file except in compliance with the License. |
9 | | ** You may obtain a copy of the License at |
10 | | ** http://www.apache.org/licenses/LICENSE-2.0 |
11 | | ** Unless required by applicable law or agreed to in writing, software |
12 | | ** distributed under the License is distributed on an "AS IS" BASIS, |
13 | | ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | ** See the License for the specific language governing permissions and |
15 | | ** limitations under the License. |
16 | | * |
17 | | |
18 | | This module may look unnecessarily verbose, but here's the philosophy... |
19 | | |
20 | | ALL processing of the reject map is done in this module. There are lots of |
21 | | separate calls to set reject/accept flags. These have DELIBERATELY been kept |
22 | | distinct so that this module can decide what to do. |
23 | | |
24 | | Basically, there is a flag for each sort of rejection or acceptance. This |
25 | | provides a history of what has happened to EACH character. |
26 | | |
27 | | Determining whether a character is CURRENTLY rejected depends on implicit |
28 | | understanding of the SEQUENCE of possible calls. The flags are defined and |
29 | | grouped in the REJ_FLAGS enum. These groupings are used in determining a |
30 | | character's CURRENT rejection status. Basically, a character is ACCEPTED if |
31 | | |
32 | | none of the permanent rej flags are set |
33 | | AND ( the character has never been rejected |
34 | | OR an accept flag is set which is LATER than the latest reject flag ) |
35 | | |
36 | | IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE |
37 | | OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!! |
38 | | **********************************************************************/ |
39 | | |
40 | | #ifndef REJCTMAP_H |
41 | | #define REJCTMAP_H |
42 | | |
43 | | #include "errcode.h" |
44 | | #include "params.h" |
45 | | |
46 | | #include <bitset> |
47 | | #include <memory> |
48 | | |
49 | | namespace tesseract { |
50 | | |
51 | | enum REJ_FLAGS { // Each enumerator is used as a bit index into REJ::flags |
52 | | /* Reject modes which are NEVER overridden */ |
53 | | R_TESS_FAILURE, // PERM Tess didn't classify |
54 | | R_SMALL_XHT, // PERM Xht too small |
55 | | R_EDGE_CHAR, // PERM Too close to edge of image |
56 | | R_1IL_CONFLICT, // PERM 1Il confusion |
57 | | R_POSTNN_1IL, // PERM 1Il unrejected by NN |
58 | | R_REJ_CBLOB, // PERM Odd blob |
59 | | R_MM_REJECT, // PERM Matrix match rejection (m's) |
60 | | R_BAD_REPETITION, // PERM (tested by REJ::perm_rejected()) Repeated char which doesn't match trend |
61 | | |
62 | | /* Initial reject modes (pre NN_ACCEPT) */ |
63 | | R_POOR_MATCH, // TEMP Ray's original heuristic (Not used) |
64 | | R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD |
65 | | R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD |
66 | | R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD |
67 | | |
68 | | /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */ |
69 | | R_HYPHEN, // TEMP Post NN dodgy hyphen or full stop |
70 | | R_DUBIOUS, // TEMP Post NN dodgy chars |
71 | | R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN |
72 | | R_MOSTLY_REJ, // TEMP Most of word rejected so rej the rest |
73 | | R_XHT_FIXUP, // TEMP Xht tests unsure |
74 | | |
75 | | /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */ |
76 | | R_BAD_QUALITY, // TEMP Quality metrics bad for WERD |
77 | | |
78 | | /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ_ACCEPT */ |
79 | | R_DOC_REJ, // TEMP Document rejection |
80 | | R_BLOCK_REJ, // TEMP Block rejection |
81 | | R_ROW_REJ, // TEMP Row rejection |
82 | | R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space |
83 | | |
84 | | /* Accept modes which occur between the above rejection groups */ |
85 | | R_NN_ACCEPT, // NN acceptance |
86 | | R_HYPHEN_ACCEPT, // Hyphen acceptance |
87 | | R_MM_ACCEPT, // Matrix match acceptance |
88 | | R_QUALITY_ACCEPT, // Accept word in good quality doc |
89 | | R_MINIMAL_REJ_ACCEPT // Accept EVERYTHING except tess failures |
90 | | }; |
91 | | |
92 | | /* REJECT MAP VALUES */ |
93 | | |
94 | 0 | #define MAP_ACCEPT '1' // REJ::display_char(): character is not rejected |
95 | 0 | #define MAP_REJECT_PERM '0' // REJ::display_char(): perm_rejected() is true |
96 | 0 | #define MAP_REJECT_TEMP '2' // REJ::display_char(): rejected(), but neither perm nor potential |
97 | 0 | #define MAP_REJECT_POTENTIAL '3' // REJ::display_char(): accept_if_good_quality() is true |
98 | | |
99 | | class REJ { // Reject/accept flag history for one character; bits are indexed by REJ_FLAGS |
100 | | std::bitset<32> flags; |
101 | | |
102 | 321k | void set_flag(REJ_FLAGS rej_flag) { // Private helper behind the setrej_* members |
103 | 321k | flags.set(rej_flag); |
104 | 321k | } |
105 | | |
106 | | public: |
107 | 1.73M | REJ() = default; |
108 | | |
109 | | REJ( // classwise copy |
110 | 0 | const REJ &source) { |
111 | 0 | flags = source.flags; |
112 | 0 | } |
113 | | |
114 | | REJ &operator=( // assign REJ |
115 | | const REJ &source) = default; |
116 | | |
117 | 9.96k | bool flag(REJ_FLAGS rej_flag) const { // Is the given flag bit set? |
118 | 9.96k | return flags[rej_flag]; |
119 | 9.96k | } |
120 | | |
121 | 0 | char display_char() const { // Map the current status to one of the MAP_* characters |
122 | 0 | if (perm_rejected()) { |
123 | 0 | return MAP_REJECT_PERM; |
124 | 0 | } else if (accept_if_good_quality()) { |
125 | 0 | return MAP_REJECT_POTENTIAL; |
126 | 0 | } else if (rejected()) { |
127 | 0 | return MAP_REJECT_TEMP; |
128 | 0 | } else { |
129 | 0 | return MAP_ACCEPT; |
130 | 0 | } |
131 | 0 | } |
132 | | |
133 | 470 | bool perm_rejected() const { // Is char perm reject? |
134 | 470 | return (flag(R_TESS_FAILURE) || flag(R_SMALL_XHT) || flag(R_EDGE_CHAR) || |
135 | 470 | flag(R_1IL_CONFLICT) || flag(R_POSTNN_1IL) || flag(R_REJ_CBLOB) || |
136 | 470 | flag(R_BAD_REPETITION) || flag(R_MM_REJECT)); |
137 | 470 | } |
138 | | |
139 | | private: |
140 | 391 | bool rej_before_nn_accept() const { // Any initial (pre NN_ACCEPT) reject flag set? |
141 | 391 | return flag(R_POOR_MATCH) || flag(R_NOT_TESS_ACCEPTED) || |
142 | 391 | flag(R_CONTAINS_BLANKS) || flag(R_BAD_PERMUTER); |
143 | 391 | } |
144 | | |
145 | 391 | bool rej_between_nn_and_mm() const { // Any post NN_ACCEPT, pre MM_ACCEPT reject flag set? |
146 | 391 | return flag(R_HYPHEN) || flag(R_DUBIOUS) || flag(R_NO_ALPHANUMS) || |
147 | 391 | flag(R_MOSTLY_REJ) || flag(R_XHT_FIXUP); |
148 | 391 | } |
149 | | |
150 | 391 | bool rej_between_mm_and_quality_accept() const { // Post MM_ACCEPT, pre QUALITY_ACCEPT reject set? |
151 | 391 | return flag(R_BAD_QUALITY); |
152 | 391 | } |
153 | | |
154 | 391 | bool rej_between_quality_and_minimal_rej_accept() const { // Doc/block/row/UNLV reject set? |
155 | 391 | return flag(R_DOC_REJ) || flag(R_BLOCK_REJ) || flag(R_ROW_REJ) || |
156 | 391 | flag(R_UNLV_REJ); |
157 | 391 | } |
158 | | |
159 | 391 | bool rej_before_mm_accept() const { // Post-NN reject, or pre-NN reject with no NN/hyphen accept |
160 | 391 | return rej_between_nn_and_mm() || |
161 | 391 | (rej_before_nn_accept() && !flag(R_NN_ACCEPT) && |
162 | 391 | !flag(R_HYPHEN_ACCEPT)); |
163 | 391 | } |
164 | | |
165 | 391 | bool rej_before_quality_accept() const { // Post-MM reject, or earlier reject with no MM accept |
166 | 391 | return rej_between_mm_and_quality_accept() || |
167 | 391 | (!flag(R_MM_ACCEPT) && rej_before_mm_accept()); |
168 | 391 | } |
169 | | |
170 | | public: |
171 | 470 | bool rejected() const { // Is char rejected? |
172 | 470 | if (flag(R_MINIMAL_REJ_ACCEPT)) { // NOTE: checked first, so it wins even when a PERM flag is set |
173 | 0 | return false; |
174 | 470 | } else { |
175 | 470 | return (perm_rejected() || rej_between_quality_and_minimal_rej_accept() || |
176 | 470 | (!flag(R_QUALITY_ACCEPT) && rej_before_quality_accept())); |
177 | 470 | } |
178 | 470 | } |
179 | | |
180 | 0 | bool accept_if_good_quality() const { // potential rej? (R_BAD_PERMUTER is the only reject flag set) |
181 | 0 | return (rejected() && !perm_rejected() && flag(R_BAD_PERMUTER) && |
182 | 0 | !flag(R_POOR_MATCH) && !flag(R_NOT_TESS_ACCEPTED) && |
183 | 0 | !flag(R_CONTAINS_BLANKS) && |
184 | 0 | (!rej_between_nn_and_mm() && !rej_between_mm_and_quality_accept() && |
185 | 0 | !rej_between_quality_and_minimal_rej_accept())); |
186 | 0 | } |
187 | | |
188 | 85 | void setrej_tess_failure() { // Tess generated blank |
189 | 85 | set_flag(R_TESS_FAILURE); |
190 | 85 | } |
191 | | |
192 | 0 | void setrej_small_xht() { // Small xht char/wd |
193 | 0 | set_flag(R_SMALL_XHT); |
194 | 0 | } |
195 | | |
196 | 120k | void setrej_edge_char() { // Close to image edge |
197 | 120k | set_flag(R_EDGE_CHAR); |
198 | 120k | } |
199 | | |
200 | 0 | void setrej_1Il_conflict() { // Initial reject map |
201 | 0 | set_flag(R_1IL_CONFLICT); |
202 | 0 | } |
203 | | |
204 | 0 | void setrej_postNN_1Il() { // 1Il after NN |
205 | 0 | set_flag(R_POSTNN_1IL); |
206 | 0 | } |
207 | | |
208 | 0 | void setrej_rej_cblob() { // Insert duff blob |
209 | 0 | set_flag(R_REJ_CBLOB); |
210 | 0 | } |
211 | | |
212 | 0 | void setrej_mm_reject() { // Matrix matcher |
213 | 0 | set_flag(R_MM_REJECT); |
214 | 0 | } |
215 | | |
216 | 0 | void setrej_bad_repetition() { // Odd repeated char |
217 | 0 | set_flag(R_BAD_REPETITION); |
218 | 0 | } |
219 | | |
220 | 200k | void setrej_poor_match() { // Failed Rays heuristic |
221 | 200k | set_flag(R_POOR_MATCH); |
222 | 200k | } |
223 | | |
224 | 0 | void setrej_not_tess_accepted() { |
225 | | // TEMP reject_word |
226 | 0 | set_flag(R_NOT_TESS_ACCEPTED); |
227 | 0 | } |
228 | | |
229 | 0 | void setrej_contains_blanks() { |
230 | | // TEMP reject_word |
231 | 0 | set_flag(R_CONTAINS_BLANKS); |
232 | 0 | } |
233 | | |
234 | 0 | void setrej_bad_permuter() { // POTENTIAL reject_word |
235 | 0 | set_flag(R_BAD_PERMUTER); |
236 | 0 | } |
237 | | |
238 | 135 | void setrej_hyphen() { // PostNN dubious hyphen or . |
239 | 135 | set_flag(R_HYPHEN); |
240 | 135 | } |
241 | | |
242 | 0 | void setrej_dubious() { // PostNN dubious limit |
243 | 0 | set_flag(R_DUBIOUS); |
244 | 0 | } |
245 | | |
246 | 0 | void setrej_no_alphanums() { // TEMP reject_word |
247 | 0 | set_flag(R_NO_ALPHANUMS); |
248 | 0 | } |
249 | | |
250 | 0 | void setrej_mostly_rej() { // TEMP reject_word |
251 | 0 | set_flag(R_MOSTLY_REJ); |
252 | 0 | } |
253 | | |
254 | 0 | void setrej_xht_fixup() { // xht fixup |
255 | 0 | set_flag(R_XHT_FIXUP); |
256 | 0 | } |
257 | | |
258 | 0 | void setrej_bad_quality() { // TEMP reject_word |
259 | 0 | set_flag(R_BAD_QUALITY); |
260 | 0 | } |
261 | | |
262 | 0 | void setrej_doc_rej() { // TEMP reject_word |
263 | 0 | set_flag(R_DOC_REJ); |
264 | 0 | } |
265 | | |
266 | 0 | void setrej_block_rej() { // TEMP reject_word |
267 | 0 | set_flag(R_BLOCK_REJ); |
268 | 0 | } |
269 | | |
270 | 0 | void setrej_row_rej() { // TEMP reject_word |
271 | 0 | set_flag(R_ROW_REJ); |
272 | 0 | } |
273 | | |
274 | 0 | void setrej_unlv_rej() { // TEMP reject_word |
275 | 0 | set_flag(R_UNLV_REJ); |
276 | 0 | } |
277 | | |
278 | 114 | void setrej_hyphen_accept() { // Hyphen acceptance |
279 | 114 | set_flag(R_HYPHEN_ACCEPT); |
280 | 114 | } |
281 | | |
282 | 0 | void setrej_nn_accept() { // NN Flipped a char |
283 | 0 | set_flag(R_NN_ACCEPT); |
284 | 0 | } |
285 | | |
286 | 0 | void setrej_mm_accept() { // Matrix matcher |
287 | 0 | set_flag(R_MM_ACCEPT); |
288 | 0 | } |
289 | | |
290 | 0 | void setrej_quality_accept() { // Quality flip a char |
291 | 0 | set_flag(R_QUALITY_ACCEPT); |
292 | 0 | } |
293 | | |
294 | 0 | void setrej_minimal_rej_accept() { |
295 | | // Accept all except blank |
296 | 0 | set_flag(R_MINIMAL_REJ_ACCEPT); |
297 | 0 | } |
298 | | |
299 | 218 | bool accepted() const { // Is char accepted? |
300 | 218 | return !rejected(); |
301 | 218 | } |
302 | | |
303 | 0 | bool recoverable() const { // Rejected, but not permanently? |
304 | 0 | return (rejected() && !perm_rejected()); |
305 | 0 | } |
306 | | |
307 | | void full_print(FILE *fp) const; |
308 | | }; |
309 | | |
310 | | class REJMAP { // Owns an array of REJ plus its length |
311 | | std::unique_ptr<REJ[]> ptr; // ptr to the chars |
312 | | uint16_t len = 0; // Number of chars |
313 | | |
314 | | public: |
315 | 739k | REJMAP() = default; |
316 | | |
317 | 0 | REJMAP(const REJMAP &rejmap) { // Copy construction delegates to operator= |
318 | 0 | *this = rejmap; |
319 | 0 | } |
320 | | |
321 | | REJMAP &operator=(const REJMAP &source); |
322 | | |
323 | | // Sets up the ptr array to length, whatever it was before. |
324 | | void initialise(uint16_t length); |
325 | | |
326 | | REJ &operator[]( // access function |
327 | | uint16_t index) const // map index |
328 | 321k | { |
329 | 321k | ASSERT_HOST(index < len); |
330 | 321k | return ptr[index]; // index was range-checked by the ASSERT_HOST above |
331 | 321k | } |
332 | | |
333 | 96.0k | uint16_t length() const { // map length |
334 | 96.0k | return len; |
335 | 96.0k | } |
336 | | |
337 | | int16_t accept_count() const; // How many accepted? |
338 | | |
339 | 0 | int16_t reject_count() const { // How many rejects? |
340 | 0 | return len - accept_count(); |
341 | 0 | } |
342 | | |
343 | | // Cut out an element. |
344 | | void remove_pos(uint16_t pos); |
345 | | |
346 | | void print(FILE *fp) const; |
347 | | |
348 | | void full_print(FILE *fp) const; |
349 | | |
350 | | bool recoverable_rejects() const; // Any non perm rejs? |
351 | | |
352 | | bool quality_recoverable_rejects() const; |
353 | | // Any potential rejs? |
354 | | |
355 | | void rej_word_small_xht(); // Reject whole word |
356 | | // Reject whole word |
357 | | void rej_word_tess_failure(); |
358 | | void rej_word_not_tess_accepted(); |
359 | | // Reject whole word |
360 | | // Reject whole word |
361 | | void rej_word_contains_blanks(); |
362 | | // Reject whole word |
363 | | void rej_word_bad_permuter(); |
364 | | void rej_word_xht_fixup(); // Reject whole word |
365 | | // Reject whole word |
366 | | void rej_word_no_alphanums(); |
367 | | void rej_word_mostly_rej(); // Reject whole word |
368 | | void rej_word_bad_quality(); // Reject whole word |
369 | | void rej_word_doc_rej(); // Reject whole word |
370 | | void rej_word_block_rej(); // Reject whole word |
371 | | void rej_word_row_rej(); // Reject whole word |
372 | | }; |
373 | | |
374 | | } // namespace tesseract |
375 | | |
376 | | #endif |