Coverage Report

Created: 2026-06-13 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/ccstruct/rejctmap.h
Line
Count
Source
1
/**********************************************************************
2
 * File:        rejctmap.h  (Formerly rejmap.h)
3
 * Description: REJ and REJMAP class functions.
4
 * Author:    Phil Cheatle
5
 *
6
 * (C) Copyright 1994, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
18
This module may look unnecessarily verbose, but here's the philosophy...
19
20
ALL processing of the reject map is done in this module. There are lots of
21
separate calls to set reject/accept flags. These have DELIBERATELY been kept
22
distinct so that this module can decide what to do.
23
24
Basically, there is a flag for each sort of rejection or acceptance. This
25
provides a history of what has happened to EACH character.
26
27
Determining whether a character is CURRENTLY rejected depends on implicit
28
understanding of the SEQUENCE of possible calls. The flags are defined and
29
grouped in the REJ_FLAGS enum. These groupings are used in determining a
30
character's CURRENT rejection status. Basically, a character is ACCEPTED if
31
32
    none of the permanent rej flags are set
33
  AND (    the character has never been rejected
34
      OR an accept flag is set which is LATER than the latest reject flag )
35
36
IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE
37
OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!
38
**********************************************************************/
39
40
#ifndef REJCTMAP_H
41
#define REJCTMAP_H
42
43
#include "errcode.h"
44
#include "params.h"
45
46
#include <bitset>
47
#include <memory>
48
49
namespace tesseract {
50
51
// Every kind of rejection or acceptance that can be recorded for a character.
//
// The enumerator VALUES are used as bit positions in REJ::flags (a
// std::bitset<32>), so enumerators must not be given explicit values that
// exceed 31, and R_MINIMAL_REJ_ACCEPT must stay the last one.
//
// The GROUPS below mirror the order in which the flags are expected to be set
// during processing; REJ::rejected() relies on that implied ordering.
enum REJ_FLAGS {
  /* Reject modes which are NEVER overridden */
  R_TESS_FAILURE,   // PERM Tess didn't classify
  R_SMALL_XHT,      // PERM Xht too small
  R_EDGE_CHAR,      // PERM Too close to edge of image
  R_1IL_CONFLICT,   // PERM 1Il confusion
  R_POSTNN_1IL,     // PERM 1Il unrejected by NN
  R_REJ_CBLOB,      // PERM Odd blob
  R_MM_REJECT,      // PERM Matrix match rejection (m's)
  R_BAD_REPETITION, // PERM Repeated char which doesn't match trend
                    //      (formerly labelled TEMP, but REJ::perm_rejected()
                    //      treats it as a permanent rejection)

  /* Initial reject modes (pre NN_ACCEPT) */
  R_POOR_MATCH,        // TEMP Ray's original heuristic (Not used)
  R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD
  R_CONTAINS_BLANKS,   // TEMP Tess failed on other chs in WERD
  R_BAD_PERMUTER,      // POTENTIAL Bad permuter for WERD

  /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */
  R_HYPHEN,       // TEMP Post NN dodgy hyphen or full stop
  R_DUBIOUS,      // TEMP Post NN dodgy chars
  R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN
  R_MOSTLY_REJ,   // TEMP Most of word rejected so rej the rest
  R_XHT_FIXUP,    // TEMP Xht tests unsure

  /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */
  R_BAD_QUALITY, // TEMP Quality metrics bad for WERD

  /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ accept */
  R_DOC_REJ,   // TEMP Document rejection
  R_BLOCK_REJ, // TEMP Block rejection
  R_ROW_REJ,   // TEMP Row rejection
  R_UNLV_REJ,  // TEMP ~ turned to - or ^ turned to space

  /* Accept modes which occur between the above rejection groups */
  R_NN_ACCEPT,         // NN acceptance
  R_HYPHEN_ACCEPT,     // Hyphen acceptance
  R_MM_ACCEPT,         // Matrix match acceptance
  R_QUALITY_ACCEPT,    // Accept word in good quality doc
  R_MINIMAL_REJ_ACCEPT // Accept EVERYTHING except tess failures
};
91
92
/* REJECT MAP VALUES */

// Single-character codes returned by REJ::display_char(). They stay macros
// (rather than constants) so that existing users of the names keep compiling.
#define MAP_ACCEPT '1'           // character is currently accepted
#define MAP_REJECT_PERM '0'      // permanent rejection
#define MAP_REJECT_TEMP '2'      // rejected, but not permanently
#define MAP_REJECT_POTENTIAL '3' // rejected only because of a bad permuter
98
99
class REJ {
100
  std::bitset<32> flags;
101
102
282k
  void set_flag(REJ_FLAGS rej_flag) {
103
282k
    flags.set(rej_flag);
104
282k
  }
105
106
public:
107
1.72M
  REJ() = default;
108
109
  REJ( // classwise copy
110
0
      const REJ &source) {
111
0
    flags = source.flags;
112
0
  }
113
114
  REJ &operator=( // assign REJ
115
      const REJ &source) = default;
116
117
9.67k
  bool flag(REJ_FLAGS rej_flag) const {
118
9.67k
    return flags[rej_flag];
119
9.67k
  }
120
121
0
  char display_char() const {
122
0
    if (perm_rejected()) {
123
0
      return MAP_REJECT_PERM;
124
0
    } else if (accept_if_good_quality()) {
125
0
      return MAP_REJECT_POTENTIAL;
126
0
    } else if (rejected()) {
127
0
      return MAP_REJECT_TEMP;
128
0
    } else {
129
0
      return MAP_ACCEPT;
130
0
    }
131
0
  }
132
133
442
  bool perm_rejected() const { // Is char perm reject?
134
442
    return (flag(R_TESS_FAILURE) || flag(R_SMALL_XHT) || flag(R_EDGE_CHAR) ||
135
382
            flag(R_1IL_CONFLICT) || flag(R_POSTNN_1IL) || flag(R_REJ_CBLOB) ||
136
382
            flag(R_BAD_REPETITION) || flag(R_MM_REJECT));
137
442
  }
138
139
private:
140
382
  bool rej_before_nn_accept() const {
141
382
    return flag(R_POOR_MATCH) || flag(R_NOT_TESS_ACCEPTED) ||
142
269
           flag(R_CONTAINS_BLANKS) || flag(R_BAD_PERMUTER);
143
382
  }
144
145
382
  bool rej_between_nn_and_mm() const {
146
382
    return flag(R_HYPHEN) || flag(R_DUBIOUS) || flag(R_NO_ALPHANUMS) ||
147
382
           flag(R_MOSTLY_REJ) || flag(R_XHT_FIXUP);
148
382
  }
149
150
382
  bool rej_between_mm_and_quality_accept() const {
151
382
    return flag(R_BAD_QUALITY);
152
382
  }
153
154
382
  bool rej_between_quality_and_minimal_rej_accept() const {
155
382
    return flag(R_DOC_REJ) || flag(R_BLOCK_REJ) || flag(R_ROW_REJ) ||
156
382
           flag(R_UNLV_REJ);
157
382
  }
158
159
382
  bool rej_before_mm_accept() const {
160
382
    return rej_between_nn_and_mm() ||
161
382
           (rej_before_nn_accept() && !flag(R_NN_ACCEPT) &&
162
113
            !flag(R_HYPHEN_ACCEPT));
163
382
  }
164
165
382
  bool rej_before_quality_accept() const {
166
382
    return rej_between_mm_and_quality_accept() ||
167
382
           (!flag(R_MM_ACCEPT) && rej_before_mm_accept());
168
382
  }
169
170
public:
171
442
  bool rejected() const { // Is char rejected?
172
442
    if (flag(R_MINIMAL_REJ_ACCEPT)) {
173
0
      return false;
174
442
    } else {
175
442
      return (perm_rejected() || rej_between_quality_and_minimal_rej_accept() ||
176
382
              (!flag(R_QUALITY_ACCEPT) && rej_before_quality_accept()));
177
442
    }
178
442
  }
179
180
0
  bool accept_if_good_quality() const { // potential rej?
181
0
    return (rejected() && !perm_rejected() && flag(R_BAD_PERMUTER) &&
182
0
            !flag(R_POOR_MATCH) && !flag(R_NOT_TESS_ACCEPTED) &&
183
0
            !flag(R_CONTAINS_BLANKS) &&
184
0
            (!rej_between_nn_and_mm() && !rej_between_mm_and_quality_accept() &&
185
0
             !rej_between_quality_and_minimal_rej_accept()));
186
0
  }
187
188
69
  void setrej_tess_failure() { // Tess generated blank
189
69
    set_flag(R_TESS_FAILURE);
190
69
  }
191
192
0
  void setrej_small_xht() { // Small xht char/wd
193
0
    set_flag(R_SMALL_XHT);
194
0
  }
195
196
108k
  void setrej_edge_char() { // Close to image edge
197
108k
    set_flag(R_EDGE_CHAR);
198
108k
  }
199
200
0
  void setrej_1Il_conflict() { // Initial reject map
201
0
    set_flag(R_1IL_CONFLICT);
202
0
  }
203
204
0
  void setrej_postNN_1Il() { // 1Il after NN
205
0
    set_flag(R_POSTNN_1IL);
206
0
  }
207
208
0
  void setrej_rej_cblob() { // Insert duff blob
209
0
    set_flag(R_REJ_CBLOB);
210
0
  }
211
212
0
  void setrej_mm_reject() { // Matrix matcher
213
0
    set_flag(R_MM_REJECT);
214
0
  }
215
216
0
  void setrej_bad_repetition() { // Odd repeated char
217
0
    set_flag(R_BAD_REPETITION);
218
0
  }
219
220
173k
  void setrej_poor_match() { // Failed Rays heuristic
221
173k
    set_flag(R_POOR_MATCH);
222
173k
  }
223
224
0
  void setrej_not_tess_accepted() {
225
    // TEMP reject_word
226
0
    set_flag(R_NOT_TESS_ACCEPTED);
227
0
  }
228
229
0
  void setrej_contains_blanks() {
230
    // TEMP reject_word
231
0
    set_flag(R_CONTAINS_BLANKS);
232
0
  }
233
234
0
  void setrej_bad_permuter() { // POTENTIAL reject_word
235
0
    set_flag(R_BAD_PERMUTER);
236
0
  }
237
238
136
  void setrej_hyphen() { // PostNN dubious hyphen or .
239
136
    set_flag(R_HYPHEN);
240
136
  }
241
242
0
  void setrej_dubious() { // PostNN dubious limit
243
0
    set_flag(R_DUBIOUS);
244
0
  }
245
246
0
  void setrej_no_alphanums() { // TEMP reject_word
247
0
    set_flag(R_NO_ALPHANUMS);
248
0
  }
249
250
0
  void setrej_mostly_rej() { // TEMP reject_word
251
0
    set_flag(R_MOSTLY_REJ);
252
0
  }
253
254
0
  void setrej_xht_fixup() { // xht fixup
255
0
    set_flag(R_XHT_FIXUP);
256
0
  }
257
258
0
  void setrej_bad_quality() { // TEMP reject_word
259
0
    set_flag(R_BAD_QUALITY);
260
0
  }
261
262
0
  void setrej_doc_rej() { // TEMP reject_word
263
0
    set_flag(R_DOC_REJ);
264
0
  }
265
266
0
  void setrej_block_rej() { // TEMP reject_word
267
0
    set_flag(R_BLOCK_REJ);
268
0
  }
269
270
0
  void setrej_row_rej() { // TEMP reject_word
271
0
    set_flag(R_ROW_REJ);
272
0
  }
273
274
0
  void setrej_unlv_rej() { // TEMP reject_word
275
0
    set_flag(R_UNLV_REJ);
276
0
  }
277
278
81
  void setrej_hyphen_accept() { // NN Flipped a char
279
81
    set_flag(R_HYPHEN_ACCEPT);
280
81
  }
281
282
0
  void setrej_nn_accept() { // NN Flipped a char
283
0
    set_flag(R_NN_ACCEPT);
284
0
  }
285
286
0
  void setrej_mm_accept() { // Matrix matcher
287
0
    set_flag(R_MM_ACCEPT);
288
0
  }
289
290
0
  void setrej_quality_accept() { // Quality flip a char
291
0
    set_flag(R_QUALITY_ACCEPT);
292
0
  }
293
294
0
  void setrej_minimal_rej_accept() {
295
    // Accept all except blank
296
0
    set_flag(R_MINIMAL_REJ_ACCEPT);
297
0
  }
298
299
220
  bool accepted() const { // Is char accepted?
300
220
    return !rejected();
301
220
  }
302
303
0
  bool recoverable() const {
304
0
    return (rejected() && !perm_rejected());
305
0
  }
306
307
  void full_print(FILE *fp) const;
308
};
309
310
class REJMAP {
311
  std::unique_ptr<REJ[]> ptr; // ptr to the chars
312
  uint16_t len = 0;           // Number of chars
313
314
public:
315
685k
  REJMAP() = default;
316
317
0
  REJMAP(const REJMAP &rejmap) {
318
0
    *this = rejmap;
319
0
  }
320
321
  REJMAP &operator=(const REJMAP &source);
322
323
  // Sets up the ptr array to length, whatever it was before.
324
  void initialise(uint16_t length);
325
326
  REJ &operator[](         // access function
327
      uint16_t index) const // map index
328
283k
  {
329
283k
    ASSERT_HOST(index < len);
330
283k
    return ptr[index]; // no bounds checks
331
283k
  }
332
333
85.2k
  uint16_t length() const { // map length
334
85.2k
    return len;
335
85.2k
  }
336
337
  int16_t accept_count() const; // How many accepted?
338
339
0
  int16_t reject_count() const { // How many rejects?
340
0
    return len - accept_count();
341
0
  }
342
343
  // Cut out an element.
344
  void remove_pos(uint16_t pos);
345
346
  void print(FILE *fp) const;
347
348
  void full_print(FILE *fp) const;
349
350
  bool recoverable_rejects() const; // Any non perm rejs?
351
352
  bool quality_recoverable_rejects() const;
353
  // Any potential rejs?
354
355
  void rej_word_small_xht(); // Reject whole word
356
                             // Reject whole word
357
  void rej_word_tess_failure();
358
  void rej_word_not_tess_accepted();
359
  // Reject whole word
360
  // Reject whole word
361
  void rej_word_contains_blanks();
362
  // Reject whole word
363
  void rej_word_bad_permuter();
364
  void rej_word_xht_fixup(); // Reject whole word
365
                             // Reject whole word
366
  void rej_word_no_alphanums();
367
  void rej_word_mostly_rej();  // Reject whole word
368
  void rej_word_bad_quality(); // Reject whole word
369
  void rej_word_doc_rej();     // Reject whole word
370
  void rej_word_block_rej();   // Reject whole word
371
  void rej_word_row_rej();     // Reject whole word
372
};
373
374
} // namespace tesseract
375
376
#endif