Coverage Report

Created: 2025-06-13 07:02

/src/tesseract/src/ccstruct/ratngs.h
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        ratngs.h  (Formerly ratings.h)
3
 * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.
4
 * Author:      Ray Smith
5
 *
6
 * (C) Copyright 1992, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
#ifndef RATNGS_H
20
#define RATNGS_H
21
22
#ifdef HAVE_CONFIG_H
23
#  include "config_auto.h" // DISABLED_LEGACY_ENGINE
24
#endif
25
26
#include "clst.h"
27
#include "elst.h"
28
#ifndef DISABLED_LEGACY_ENGINE
29
#  include "fontinfo.h"
30
#endif // undef DISABLED_LEGACY_ENGINE
31
#include "matrix.h"
32
#include "unicharset.h"
33
#include "werd.h"
34
35
#include <tesseract/unichar.h>
36
37
#include <cassert>
38
#include <cfloat> // for FLT_MAX
39
40
namespace tesseract {
41
42
class MATRIX;
43
struct TBLOB;
44
struct TWERD;
45
46
// Enum to describe the source of a BLOB_CHOICE to make it possible to determine
47
// whether a blob has been classified by inspecting the BLOB_CHOICEs.
48
enum BlobChoiceClassifier {
  // From the char_norm classifier.
  BCC_STATIC_CLASSIFIER = 0,
  // From the adaptive classifier.
  BCC_ADAPTED_CLASSIFIER = 1,
  // Backup for failed classification.
  BCC_SPECKLE_CLASSIFIER = 2,
  // Generated by ambiguity detection.
  BCC_AMBIG = 3,
  // From some other process.
  BCC_FAKE = 4,
};
55
56
// One classification candidate for a blob: a unichar id together with its
// rating, certainty, font and script ids, supported x-height range, y shift
// and the classifier that produced it.
// Derives from ELIST<BLOB_CHOICE>::LINK so that it can be held in a
// BLOB_CHOICE_LIST.
class BLOB_CHOICE : public ELIST<BLOB_CHOICE>::LINK {
public:
  // Builds a placeholder choice: UNICHAR_SPACE, no font or script (-1),
  // rating 10, certainty -1, zero x-height range and y shift, BCC_FAKE.
  // matrix_cell_ (and fonts_, when present) are left default-constructed.
  BLOB_CHOICE()
      : unichar_id_(UNICHAR_SPACE),
        fontinfo_id_(-1),
        fontinfo_id2_(-1),
        rating_(10.0f),
        certainty_(-1.0f),
        script_id_(-1),
        min_xheight_(0.0f),
        max_xheight_(0.0f),
        yshift_(0.0f),
        classifier_(BCC_FAKE) {}
  BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
              float src_rating,          // rating
              float src_cert,            // certainty
              int script_id,             // script
              float min_xheight,         // min xheight in image pixel units
              float max_xheight,         // max xheight allowed by this char
              float yshift,              // the larger of y shift (top or bottom)
              BlobChoiceClassifier c);   // adapted match or other
  BLOB_CHOICE(const BLOB_CHOICE &other);
  ~BLOB_CHOICE() = default;

  // --- Read accessors. ---
  UNICHAR_ID unichar_id() const {
    return unichar_id_;
  }
  float rating() const {
    return rating_;
  }
  float certainty() const {
    return certainty_;
  }
  // Id of the best-scoring font, or -1 if none has been set.
  int16_t fontinfo_id() const {
    return fontinfo_id_;
  }
  // Id of the second-best-scoring font, or -1 if none has been set.
  int16_t fontinfo_id2() const {
    return fontinfo_id2_;
  }
#ifndef DISABLED_LEGACY_ENGINE
  const std::vector<ScoredFont> &fonts() const {
    return fonts_;
  }
  // Stores a copy of fonts and recomputes fontinfo_id_ / fontinfo_id2_ as the
  // ids of the entries with the highest and second-highest score.
  // Only scores strictly greater than 0 are considered; if no such entry
  // exists the corresponding id stays -1. On ties the earlier entry wins.
  void set_fonts(const std::vector<ScoredFont> &fonts) {
    fonts_ = fonts;
    int score1 = 0;
    int score2 = 0;
    fontinfo_id_ = -1;
    fontinfo_id2_ = -1;
    for (const auto &f : fonts_) {
      if (f.score > score1) {
        // New best: the previous best is demoted to second place.
        score2 = score1;
        fontinfo_id2_ = fontinfo_id_;
        score1 = f.score;
        fontinfo_id_ = f.fontinfo_id;
      } else if (f.score > score2) {
        score2 = f.score;
        fontinfo_id2_ = f.fontinfo_id;
      }
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE
  int script_id() const {
    return script_id_;
  }
  // Position of this choice in the ratings matrix (see set_matrix_cell).
  const MATRIX_COORD &matrix_cell() const {
    return matrix_cell_;
  }
  float min_xheight() const {
    return min_xheight_;
  }
  float max_xheight() const {
    return max_xheight_;
  }
  float yshift() const {
    return yshift_;
  }
  BlobChoiceClassifier classifier() const {
    return classifier_;
  }
  // True if this choice came from the adaptive classifier.
  bool IsAdapted() const {
    return classifier_ == BCC_ADAPTED_CLASSIFIER;
  }
  // True if this choice came from one of the static, adaptive or speckle
  // classifiers (i.e. not BCC_AMBIG and not BCC_FAKE).
  bool IsClassified() const {
    return classifier_ == BCC_STATIC_CLASSIFIER || classifier_ == BCC_ADAPTED_CLASSIFIER ||
           classifier_ == BCC_SPECKLE_CLASSIFIER;
  }

  // --- Setters. Each one overwrites exactly the named field(s). ---
  void set_unichar_id(UNICHAR_ID newunichar_id) {
    unichar_id_ = newunichar_id;
  }
  void set_rating(float newrat) {
    rating_ = newrat;
  }
  void set_certainty(float newrat) {
    certainty_ = newrat;
  }
  void set_script(int newscript_id) {
    script_id_ = newscript_id;
  }
  void set_matrix_cell(int col, int row) {
    matrix_cell_.col = col;
    matrix_cell_.row = row;
  }
  void set_classifier(BlobChoiceClassifier classifier) {
    classifier_ = classifier;
  }
  // Returns a newly allocated copy of *src, made by default-constructing and
  // then copy-assigning. The caller owns the returned object.
  // src must not be nullptr.
  static BLOB_CHOICE *deep_copy(const BLOB_CHOICE *src) {
    auto *choice = new BLOB_CHOICE;
    *choice = *src;
    return choice;
  }
  // Returns true if *this and other agree on the baseline and x-height
  // to within some tolerance based on a given estimate of the x-height.
  bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const;

  // Prints rating, certainty, x-height range and unichar id (no newline).
  // If unicharset is not nullptr, its debug string for the id is appended.
  void print(const UNICHARSET *unicharset) const {
    tprintf("r%.2f c%.2f x[%g,%g]: %d %s",
            static_cast<double>(rating_),
            static_cast<double>(certainty_),
            static_cast<double>(min_xheight_),
            static_cast<double>(max_xheight_),
            unichar_id_, (unicharset == nullptr) ? "" : unicharset->debug_str(unichar_id_).c_str());
  }
  // Prints everything print(nullptr) does, followed by the script, both font
  // ids, the y shift and the classifier, terminated by a newline.
  void print_full() const {
    print(nullptr);
    tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n", script_id_, fontinfo_id_,
            fontinfo_id2_, static_cast<double>(yshift_), classifier_);
  }
  // Sort function for sorting BLOB_CHOICEs in increasing order of rating.
  // p1 and p2 each point at a (const BLOB_CHOICE *) element.
  // Returns <0, 0 or >0 in the manner of a qsort comparator; equal ratings
  // compare equal so that the ordering is consistent in both directions.
  static int SortByRating(const void *p1, const void *p2) {
    const BLOB_CHOICE *bc1 = *static_cast<const BLOB_CHOICE *const *>(p1);
    const BLOB_CHOICE *bc2 = *static_cast<const BLOB_CHOICE *const *>(p2);
    if (bc1->rating_ < bc2->rating_) {
      return -1;
    }
    if (bc1->rating_ > bc2->rating_) {
      return 1;
    }
    return 0;
  }

private:
  // Copy assignment operator.
  BLOB_CHOICE &operator=(const BLOB_CHOICE &other);

  UNICHAR_ID unichar_id_; // unichar id
#ifndef DISABLED_LEGACY_ENGINE
  // Fonts and scores. Allowed to be empty.
  std::vector<ScoredFont> fonts_;
#endif                   // ndef DISABLED_LEGACY_ENGINE
  int16_t fontinfo_id_;  // char font information
  int16_t fontinfo_id2_; // 2nd choice font information
  // Rating is the classifier distance weighted by the length of the outline
  // in the blob. In terms of probability, classifier distance is -klog p such
  // that the resulting distance is in the range [0, 1] and then
  // rating = w (-k log p) where w is the weight for the length of the outline.
  // Sums of ratings may be compared meaningfully for words of different
  // segmentation.
  float rating_; // size related
  // Certainty is a number in [-20, 0] indicating the classifier certainty
  // of the choice. In terms of probability, certainty = 20 (k log p) where
  // k is defined as above to normalize -klog p to the range [0, 1].
  float certainty_; // absolute
  int script_id_;
  // Holds the position of this choice in the ratings matrix.
  // Used to locate the position in the matrix during path backtracking.
  MATRIX_COORD matrix_cell_;
  // X-height range (in image pixels) that this classification supports.
  float min_xheight_;
  float max_xheight_;
  // yshift_ - The vertical distance (in image pixels) the character is
  //           shifted (up or down) from an acceptable y position.
  float yshift_;
  BlobChoiceClassifier classifier_; // What generated *this.
};
226
227
// Make BLOB_CHOICE listable.
228
ELISTIZEH(BLOB_CHOICE)
229
230
// Return the BLOB_CHOICE in bc_list matching a given unichar_id,
231
// or nullptr if there is no match.
232
BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list);
233
234
// Permuter codes used in WERD_CHOICEs.
235
enum PermuterType {
  NO_PERM = 0,
  PUNC_PERM = 1,
  TOP_CHOICE_PERM = 2,
  LOWER_CASE_PERM = 3,
  UPPER_CASE_PERM = 4,
  NGRAM_PERM = 5,
  NUMBER_PERM = 6,
  USER_PATTERN_PERM = 7,
  SYSTEM_DAWG_PERM = 8,
  DOC_DAWG_PERM = 9,
  USER_DAWG_PERM = 10,
  FREQ_DAWG_PERM = 11,
  COMPOUND_PERM = 12,

  // Count of the permuter codes above; must stay last.
  NUM_PERMUTER_TYPES = 13
};
252
253
// ScriptPos tells whether a character is subscript, superscript or normal.
254
enum ScriptPos {
  SP_NORMAL = 0,
  SP_SUBSCRIPT = 1,
  SP_SUPERSCRIPT = 2,
  SP_DROPCAP = 3
};
255
256
const char *ScriptPosToString(ScriptPos script_pos);
257
258
// A candidate reading of a word: a sequence of unichar ids with, for each
// position, its script position, the number of blobs it spans and its
// certainty, plus word-level rating, certainty, x-height range and permuter.
// Derives from ELIST<WERD_CHOICE>::LINK so that it can be held in a list.
class TESS_API WERD_CHOICE : public ELIST<WERD_CHOICE>::LINK {
public:
  static const float kBadRating;
  static const char *permuter_name(uint8_t permuter);

  // Creates an empty word with room for 8 unichars.
  WERD_CHOICE(const UNICHARSET *unicharset) : unicharset_(unicharset) {
    this->init(8);
  }
  // Creates an empty word with room for reserved unichars.
  WERD_CHOICE(const UNICHARSET *unicharset, int reserved) : unicharset_(unicharset) {
    this->init(reserved);
  }
  // Creates a word from a string and per-unichar lengths; see the
  // init(const char *, ...) overload for the meaning of the arguments.
  WERD_CHOICE(const char *src_string, const char *src_lengths, float src_rating,
              float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)
      : unicharset_(&unicharset) {
    this->init(src_string, src_lengths, src_rating, src_certainty, src_permuter);
  }
  WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
  // Copy constructor: reserves word.length() slots, then copy-assigns.
  WERD_CHOICE(const WERD_CHOICE &word) : ELIST<WERD_CHOICE>::LINK(word), unicharset_(word.unicharset_) {
    this->init(word.length());
    this->operator=(word);
  }
  ~WERD_CHOICE();

  const UNICHARSET *unicharset() const {
    return unicharset_;
  }
  // True if the word currently holds no unichars.
  bool empty() const {
    return length_ == 0;
  }
  // Number of unichars currently in the word (not the reserved capacity).
  inline unsigned length() const {
    return length_;
  }
  float adjust_factor() const {
    return adjust_factor_;
  }
  void set_adjust_factor(float factor) {
    adjust_factor_ = factor;
  }
  // Returns the underlying id array. Note that it is sized to the reserved
  // capacity, so only the first length() entries belong to the word.
  inline const std::vector<UNICHAR_ID> &unichar_ids() const {
    return unichar_ids_;
  }
  // Returns the unichar id at index, which must be < length().
  inline UNICHAR_ID unichar_id(unsigned index) const {
    assert(index < length_);
    return unichar_ids_[index];
  }
  // Returns the number of blobs making up the unichar at index.
  // No bounds check is performed.
  inline unsigned state(unsigned index) const {
    return state_[index];
  }
  // Returns the script position of the unichar at index, or SP_NORMAL if
  // index is beyond the end of the word.
  ScriptPos BlobPosition(unsigned index) const {
    if (index >= length_) {
      return SP_NORMAL;
    }
    return script_pos_[index];
  }
  inline float rating() const {
    return rating_;
  }
  // Word-level certainty.
  inline float certainty() const {
    return certainty_;
  }
  // Certainty of the unichar at index. No bounds check is performed.
  inline float certainty(unsigned index) const {
    return certainties_[index];
  }
  inline float min_x_height() const {
    return min_x_height_;
  }
  inline float max_x_height() const {
    return max_x_height_;
  }
  inline void set_x_heights(float min_height, float max_height) {
    min_x_height_ = min_height;
    max_x_height_ = max_height;
  }
  inline uint8_t permuter() const {
    return permuter_;
  }
  const char *permuter_name() const;
  // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
  // taken from the appropriate cell in the ratings MATRIX.
  // Borrowed pointer, so do not delete.
  BLOB_CHOICE_LIST *blob_choices(unsigned index, MATRIX *ratings) const;

  // Returns the MATRIX_COORD corresponding to the location in the ratings
  // MATRIX for the given index into the word.
  MATRIX_COORD MatrixCoord(unsigned index) const;

  // Overwrites only the unichar id at index, which must be < length().
  inline void set_unichar_id(UNICHAR_ID unichar_id, unsigned index) {
    assert(index < length_);
    unichar_ids_[index] = unichar_id;
  }
  bool dangerous_ambig_found() const {
    return dangerous_ambig_found_;
  }
  void set_dangerous_ambig_found_(bool value) {
    dangerous_ambig_found_ = value;
  }
  inline void set_rating(float new_val) {
    rating_ = new_val;
  }
  inline void set_certainty(float new_val) {
    certainty_ = new_val;
  }
  inline void set_permuter(uint8_t perm) {
    permuter_ = perm;
  }
  // Note: this function should only be used if all the fields
  // are populated manually with set_* functions (rather than
  // (copy)constructors and append_* functions).
  // len must not exceed the reserved capacity.
  inline void set_length(unsigned len) {
    ASSERT_HOST(reserved_ >= len);
    length_ = len;
  }

  /// Make more space in the unichar_ids_, script_pos_, state_ and
  /// certainties_ arrays: doubles the reserved capacity, or sets it to 1 if
  /// it was 0.
  inline void double_the_size() {
    if (reserved_ > 0) {
      reserved_ *= 2;
    } else {
      reserved_ = 1;
    }
    unichar_ids_.resize(reserved_);
    script_pos_.resize(reserved_);
    state_.resize(reserved_);
    certainties_.resize(reserved_);
  }

  /// Initializes WERD_CHOICE - reserves the given number of slots in the
  /// unichar_ids_, script_pos_, state_ and certainties_ arrays (or empties
  /// them if reserved is 0). Sets other values to default (blank) values.
  inline void init(unsigned reserved) {
    reserved_ = reserved;
    if (reserved > 0) {
      unichar_ids_.resize(reserved);
      script_pos_.resize(reserved);
      state_.resize(reserved);
      certainties_.resize(reserved);
    } else {
      unichar_ids_.clear();
      script_pos_.clear();
      state_.clear();
      certainties_.clear();
    }
    length_ = 0;
    adjust_factor_ = 1.0f;
    rating_ = 0.0f;
    // Start at the maximum so that the first certainty passed to the
    // five-argument set_unichar_id() becomes the word minimum.
    certainty_ = FLT_MAX;
    min_x_height_ = 0.0f;
    max_x_height_ = FLT_MAX;
    permuter_ = NO_PERM;
    unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
    dangerous_ambig_found_ = false;
  }

  /// Helper function to build a WERD_CHOICE from the given string,
  /// fragment lengths, rating, certainty and permuter.
  /// The function assumes that src_string is not nullptr.
  /// src_lengths argument could be nullptr, in which case the unichars
  /// in src_string are assumed to all be of length 1.
  void init(const char *src_string, const char *src_lengths, float src_rating, float src_certainty,
            uint8_t src_permuter);

  /// Set the fields in this choice to be default (bad) values:
  /// zero length, kBadRating and the lowest possible certainty.
  inline void make_bad() {
    length_ = 0;
    rating_ = kBadRating;
    certainty_ = -FLT_MAX;
  }

  /// This function assumes that there is enough space reserved
  /// in the WERD_CHOICE for adding another unichar.
  /// This is an efficient alternative to append_unichar_id().
  inline void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating,
                                                float certainty) {
    assert(reserved_ > length_);
    length_++;
    this->set_unichar_id(unichar_id, blob_count, rating, certainty, length_ - 1);
  }

  void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty);

  // Fills in the entry at index (which must be < length()) with the given
  // unichar id, blob count and certainty and a script position of SP_NORMAL.
  // Also adds rating to the word rating and lowers the word certainty to
  // certainty if that is smaller than the current value.
  inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty,
                             unsigned index) {
    assert(index < length_);
    unichar_ids_[index] = unichar_id;
    state_[index] = blob_count;
    certainties_[index] = certainty;
    script_pos_[index] = SP_NORMAL;
    rating_ += rating;
    if (certainty < certainty_) {
      certainty_ = certainty;
    }
  }
  // Sets the entries for the given index from the BLOB_CHOICE, assuming
  // unit fragment lengths, but setting the state for this index to blob_count.
  void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice);

  bool contains_unichar_id(UNICHAR_ID unichar_id) const;
  void remove_unichar_ids(unsigned index, int num);
  // Drops the last unichar by shrinking the length only; rating_ and
  // certainty_ are left untouched. The word must not be empty.
  inline void remove_last_unichar_id() {
    // length_ is unsigned, so decrementing an empty word would wrap around.
    assert(length_ > 0);
    --length_;
  }
  inline void remove_unichar_id(unsigned index) {
    this->remove_unichar_ids(index, 1);
  }
  bool has_rtl_unichar_id() const;
  void reverse_and_mirror_unichar_ids();

  // Returns the half-open interval of unichar_id indices [start, end) which
  // enclose the core portion of this word -- the part after stripping
  // punctuation from the left and right.
  void punct_stripped(unsigned *start_core, unsigned *end_core) const;

  // Returns the indices [start, end) containing the core of the word, stripped
  // of any superscript digits on either side. (i.e., the non-footnote part
  // of the word). There is no guarantee that the output range is non-empty.
  void GetNonSuperscriptSpan(int *start, int *end) const;

  // Return a copy of this WERD_CHOICE with the choices [start, end).
  // The result is useful only for checking against a dictionary.
  WERD_CHOICE shallow_copy(unsigned start, unsigned end) const;

  void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const;
  // Returns the unicharset debug strings of all unichars in the word, each
  // followed by a single space.
  std::string debug_string() const {
    std::string word_str;
    for (unsigned i = 0; i < length_; ++i) {
      word_str += unicharset_->debug_str(unichar_ids_[i]);
      word_str += " ";
    }
    return word_str;
  }
  // Returns true if any unichar_id in the word is a non-space-delimited char.
  bool ContainsAnyNonSpaceDelimited() const {
    for (unsigned i = 0; i < length_; ++i) {
      if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) {
        return true;
      }
    }
    return false;
  }
  // Returns true if the word is all spaces. An empty word also returns true.
  bool IsAllSpaces() const {
    for (unsigned i = 0; i < length_; ++i) {
      if (unichar_ids_[i] != UNICHAR_SPACE) {
        return false;
      }
    }
    return true;
  }

  // Call this to override the default (strict left to right graphemes)
  // with the fact that some engine produces a "reading order" set of
  // Graphemes for each word.
  // Returns the value that was just stored.
  bool set_unichars_in_script_order(bool in_script_order) {
    return unichars_in_script_order_ = in_script_order;
  }

  bool unichars_in_script_order() const {
    return unichars_in_script_order_;
  }

  // Returns a UTF-8 string equivalent to the current choice
  // of UNICHAR IDs.
  // The cached string is regenerated on every call; the returned reference
  // refers to that cache member.
  std::string &unichar_string() {
    // Reuse the const overload to refresh the cache, then hand back the
    // (non-const) member itself.
    static_cast<const WERD_CHOICE *>(this)->unichar_string();
    return unichar_string_;
  }

  // Returns a UTF-8 string equivalent to the current choice
  // of UNICHAR IDs.
  // The cached string is regenerated on every call.
  const std::string &unichar_string() const {
    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
    return unichar_string_;
  }

  // Returns the lengths, one byte each, representing the number of bytes
  // required in the unichar_string for each UNICHAR_ID.
  // The cached string is regenerated on every call.
  const std::string &unichar_lengths() const {
    this->string_and_lengths(&unichar_string_, &unichar_lengths_);
    return unichar_lengths_;
  }

  // Sets up the script_pos_ member using the blobs_list to get the bln
  // bounding boxes, *this to get the unichars, and this->unicharset
  // to get the target positions. If small_caps is true, sub/super are not
  // considered, but dropcaps are.
  // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)
  void SetScriptPositions(bool small_caps, TWERD *word, int debug = 0);
  // Sets all the script_pos_ positions to the given position.
  void SetAllScriptPositions(ScriptPos position);

  static ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset,
                                    const TBOX &blob_box, UNICHAR_ID unichar_id);

  // Returns the "dominant" script ID for the word.  By "dominant", the script
  // must account for at least half the characters.  Otherwise, it returns 0.
  // Note that for Japanese, Hiragana and Katakana are simply treated as Han.
  int GetTopScriptID() const;

  // Fixes the state_ for a chop at the given blob_position.
  void UpdateStateForSplit(int blob_position);

  // Returns the sum of all the state elements, being the total number of blobs.
  unsigned TotalOfStates() const;

  // Prints the word with an empty introductory message.
  void print() const {
    this->print("");
  }
  void print(const char *msg) const;
  // Prints the segmentation state with an introductory message.
  void print_state(const char *msg) const;

  // Displays the segmentation state of *this (if not the same as the last
  // one displayed) and waits for a click in the window.
  void DisplaySegmentation(TWERD *word);

  WERD_CHOICE &operator+=(        // concatenate
      const WERD_CHOICE &second); // second on first

  WERD_CHOICE &operator=(const WERD_CHOICE &source);

private:
  const UNICHARSET *unicharset_;
  // TODO(rays) Perhaps replace the multiple arrays with an array of structs?
  // unichar_ids_ is an array of classifier "results" that make up a word.
  // For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position
  // of each unichar_id.
  // state_[i] indicates the number of blobs in WERD_RES::chopped_word that
  // were put together to make the classification results in the ith position
  // in unichar_ids_, and certainties_[i] is the certainty of the choice that
  // was used in this word.
  // == Change from before ==
  // Previously there was fragment_lengths_ that allowed a word to be
  // artificially composed of multiple fragment results. Since the new
  // segmentation search doesn't do fragments, treatment of fragments has
  // been moved to a lower level, augmenting the ratings matrix with the
  // combined fragments, and allowing the language-model/segmentation-search
  // to deal with only the combined unichar_ids.
  std::vector<UNICHAR_ID> unichar_ids_; // unichar ids that represent the text of the word
  std::vector<ScriptPos> script_pos_;   // Normal/Sub/Superscript of each unichar.
  std::vector<int> state_;              // Number of blobs in each unichar.
  std::vector<float> certainties_;      // Certainty of each unichar.
  unsigned reserved_;            // size of the above arrays
  unsigned length_;              // word length
  // Factor that was used to adjust the rating.
  float adjust_factor_;
  // Rating is the sum of the ratings of the individual blobs in the word.
  float rating_; // size related
  // certainty is the min (worst) certainty of the individual blobs in the word.
  float certainty_; // absolute
  // xheight computed from the result, or 0 if inconsistent.
  float min_x_height_;
  float max_x_height_;
  uint8_t permuter_; // permuter code

  // Normally, the ratings_ matrix represents the recognition results in order
  // from left-to-right.  However, some engines (say Cube) may return
  // recognition results in the order of the script's major reading direction
  // (for Arabic, that is right-to-left).
  bool unichars_in_script_order_;
  // True if NoDangerousAmbig found an ambiguity.
  bool dangerous_ambig_found_;

  // The following variables are populated and passed by reference any
  // time unichar_string() or unichar_lengths() are called.
  mutable std::string unichar_string_;
  mutable std::string unichar_lengths_;
};
624
625
// Make WERD_CHOICE listable.
626
ELISTIZEH(WERD_CHOICE)
627
using BLOB_CHOICE_LIST_VECTOR = std::vector<BLOB_CHOICE_LIST *>;
628
629
// Utilities for comparing WERD_CHOICEs
630
631
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2);
632
633
// Utilities for debug printing.
634
void print_ratings_list(const char *msg,                     // intro message
635
                        BLOB_CHOICE_LIST *ratings,           // list of results
636
                        const UNICHARSET &current_unicharset // unicharset that can be used
637
                                                             // for id-to-unichar conversion
638
);
639
640
} // namespace tesseract
641
642
#endif