Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccstruct/ratngs.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File: ratngs.cpp  (Formerly ratings.c)
3
 * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.
4
 * Author: Ray Smith
5
 *
6
 * (C) Copyright 1992, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
#ifdef HAVE_CONFIG_H
20
#  include "config_auto.h"
21
#endif
22
23
#include "ratngs.h"
24
25
#include "blobs.h"
26
#include "matrix.h"
27
#include "normalis.h" // kBlnBaselineOffset.
28
#include "unicharset.h"
29
30
#include <algorithm>
31
#include <cmath>
32
#include <string>
33
#include <vector>
34
35
namespace tesseract {
36
37
// Out-of-class definition of the static constant WERD_CHOICE::kBadRating.
// NOTE(review): presumably the rating assigned to a word that has been marked
// bad (see make_bad()) -- confirm against ratngs.h.
const float WERD_CHOICE::kBadRating = 100000.0;
38
// Min offset in baseline-normalized coords to make a character a subscript.
// ScriptPositionOf() subtracts it from both the expected minimum top and the
// baseline offset to obtain its subscript thresholds.
39
const int kMinSubscriptOffset = 20;
40
// Min offset in baseline-normalized coords to make a character a superscript.
// ScriptPositionOf() adds it to the expected maximum bottom.
41
const int kMinSuperscriptOffset = 20;
42
// Max y of bottom of a drop-cap blob: a blob whose bottom is at or below this
// value is classified as a drop-cap by ScriptPositionOf().
43
const int kMaxDropCapBottom = -128;
44
// Max fraction of x-height to use as denominator in measuring x-height overlap.
// (Upper clipping bound of the denominator in BLOB_CHOICE::PosAndSizeAgree.)
45
const double kMaxOverlapDenominator = 0.125;
46
// Min fraction of x-height range that should be in agreement for matching
47
// x-heights.
48
const double kMinXHeightMatch = 0.5;
49
// Max tolerance on baseline position as a fraction of x-height for matching
50
// baselines.
51
const double kMaxBaselineDrift = 0.0625;
52
53
// Human-readable names of the permuter types. Each name is kept as its own
// constant and then collected, in index order, in kPermuterTypeNames below.
static constexpr char kPermuterTypeNoPerm[] = "None";
static constexpr char kPermuterTypePuncPerm[] = "Punctuation";
static constexpr char kPermuterTypeTopPerm[] = "Top Choice";
static constexpr char kPermuterTypeLowerPerm[] = "Top Lower Case";
static constexpr char kPermuterTypeUpperPerm[] = "Top Upper Case";
static constexpr char kPermuterTypeNgramPerm[] = "Ngram";
static constexpr char kPermuterTypeNumberPerm[] = "Number";
static constexpr char kPermuterTypeUserPatPerm[] = "User Pattern";
static constexpr char kPermuterTypeSysDawgPerm[] = "System Dictionary";
static constexpr char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
static constexpr char kPermuterTypeUserDawgPerm[] = "User Dictionary";
static constexpr char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
static constexpr char kPermuterTypeCompoundPerm[] = "Compound";

// Lookup table indexed by the numeric permuter value (see
// WERD_CHOICE::permuter_name). The position of each entry is significant.
static constexpr const char *kPermuterTypeNames[] = {
    /*  0 */ kPermuterTypeNoPerm,
    /*  1 */ kPermuterTypePuncPerm,
    /*  2 */ kPermuterTypeTopPerm,
    /*  3 */ kPermuterTypeLowerPerm,
    /*  4 */ kPermuterTypeUpperPerm,
    /*  5 */ kPermuterTypeNgramPerm,
    /*  6 */ kPermuterTypeNumberPerm,
    /*  7 */ kPermuterTypeUserPatPerm,
    /*  8 */ kPermuterTypeSysDawgPerm,
    /*  9 */ kPermuterTypeDocDawgPerm,
    /* 10 */ kPermuterTypeUserDawgPerm,
    /* 11 */ kPermuterTypeFreqDawgPerm,
    /* 12 */ kPermuterTypeCompoundPerm,
};
82
83
/**
84
 * BLOB_CHOICE::BLOB_CHOICE
85
 *
86
 * Constructor to build a BLOB_CHOICE from a char, rating and certainty.
87
 */
88
BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
89
                         float src_rating,          // rating
90
                         float src_cert,            // certainty
91
                         int src_script_id,         // script
92
                         float min_xheight,         // min xheight allowed
93
                         float max_xheight,         // max xheight by this char
94
                         float yshift,              // yshift out of position
95
13.2M
                         BlobChoiceClassifier c) {  // adapted match or other
96
13.2M
  unichar_id_ = src_unichar_id;
97
13.2M
  rating_ = src_rating;
98
13.2M
  certainty_ = src_cert;
99
13.2M
  fontinfo_id_ = -1;
100
13.2M
  fontinfo_id2_ = -1;
101
13.2M
  script_id_ = src_script_id;
102
13.2M
  min_xheight_ = min_xheight;
103
13.2M
  max_xheight_ = max_xheight;
104
13.2M
  yshift_ = yshift;
105
13.2M
  classifier_ = c;
106
13.2M
}
107
108
/**
109
 * BLOB_CHOICE::BLOB_CHOICE
110
 *
111
 * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE.
112
 */
113
36.0k
BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) : ELIST<BLOB_CHOICE>::LINK(other) {
114
36.0k
  unichar_id_ = other.unichar_id();
115
36.0k
  rating_ = other.rating();
116
36.0k
  certainty_ = other.certainty();
117
36.0k
  fontinfo_id_ = other.fontinfo_id();
118
36.0k
  fontinfo_id2_ = other.fontinfo_id2();
119
36.0k
  script_id_ = other.script_id();
120
36.0k
  matrix_cell_ = other.matrix_cell_;
121
36.0k
  min_xheight_ = other.min_xheight_;
122
36.0k
  max_xheight_ = other.max_xheight_;
123
36.0k
  yshift_ = other.yshift();
124
36.0k
  classifier_ = other.classifier_;
125
36.0k
#ifndef DISABLED_LEGACY_ENGINE
126
36.0k
  fonts_ = other.fonts_;
127
36.0k
#endif // ndef DISABLED_LEGACY_ENGINE
128
36.0k
}
129
130
// Copy assignment operator.
131
0
BLOB_CHOICE &BLOB_CHOICE::operator=(const BLOB_CHOICE &other) {
132
0
  ELIST<BLOB_CHOICE>::LINK::operator=(other);
133
0
  unichar_id_ = other.unichar_id();
134
0
  rating_ = other.rating();
135
0
  certainty_ = other.certainty();
136
0
  fontinfo_id_ = other.fontinfo_id();
137
0
  fontinfo_id2_ = other.fontinfo_id2();
138
0
  script_id_ = other.script_id();
139
0
  matrix_cell_ = other.matrix_cell_;
140
0
  min_xheight_ = other.min_xheight_;
141
0
  max_xheight_ = other.max_xheight_;
142
0
  yshift_ = other.yshift();
143
0
  classifier_ = other.classifier_;
144
0
#ifndef DISABLED_LEGACY_ENGINE
145
0
  fonts_ = other.fonts_;
146
0
#endif // ndef DISABLED_LEGACY_ENGINE
147
0
  return *this;
148
0
}
149
150
// Returns true if *this and other agree on the baseline and x-height
151
// to within some tolerance based on a given estimate of the x-height.
152
664k
bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const {
153
664k
  double baseline_diff = std::fabs(yshift() - other.yshift());
154
664k
  if (baseline_diff > kMaxBaselineDrift * x_height) {
155
417k
    if (debug) {
156
0
      tprintf("Baseline diff %g for %d v %d\n", baseline_diff, unichar_id_, other.unichar_id_);
157
0
    }
158
417k
    return false;
159
417k
  }
160
247k
  double this_range = max_xheight() - min_xheight();
161
247k
  double other_range = other.max_xheight() - other.min_xheight();
162
247k
  double denominator =
163
247k
      ClipToRange(std::min(this_range, other_range), 1.0, kMaxOverlapDenominator * x_height);
164
247k
  double overlap =
165
247k
      std::min(max_xheight(), other.max_xheight()) - std::max(min_xheight(), other.min_xheight());
166
247k
  overlap /= denominator;
167
247k
  if (debug) {
168
0
    tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n", unichar_id_,
169
0
            other.unichar_id_, baseline_diff, this_range, other_range, denominator, overlap);
170
0
  }
171
172
247k
  return overlap >= kMinXHeightMatch;
173
664k
}
174
175
// Helper to find the BLOB_CHOICE in the bc_list that matches the given
176
// unichar_id, or nullptr if there is no match.
177
644k
BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list) {
178
  // Find the corresponding best BLOB_CHOICE.
179
644k
  BLOB_CHOICE_IT choice_it(bc_list);
180
894k
  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
181
857k
    BLOB_CHOICE *choice = choice_it.data();
182
857k
    if (choice->unichar_id() == char_id) {
183
608k
      return choice;
184
608k
    }
185
857k
  }
186
36.4k
  return nullptr;
187
644k
}
188
189
0
const char *WERD_CHOICE::permuter_name(uint8_t permuter) {
190
0
  return kPermuterTypeNames[permuter];
191
0
}
192
193
0
const char *ScriptPosToString(enum ScriptPos script_pos) {
194
0
  switch (script_pos) {
195
0
    case SP_NORMAL:
196
0
      return "NORM";
197
0
    case SP_SUBSCRIPT:
198
0
      return "SUB";
199
0
    case SP_SUPERSCRIPT:
200
0
      return "SUPER";
201
0
    case SP_DROPCAP:
202
0
      return "DROPC";
203
0
  }
204
0
  return "SP_UNKNOWN";
205
0
}
206
207
/**
208
 * WERD_CHOICE::WERD_CHOICE
209
 *
210
 * Constructor to build a WERD_CHOICE from the given string.
211
 * The function assumes that src_string is not nullptr.
212
 */
213
WERD_CHOICE::WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset)
214
0
    : unicharset_(&unicharset) {
215
0
  std::vector<UNICHAR_ID> encoding;
216
0
  std::vector<char> lengths;
217
0
  std::string cleaned = unicharset.CleanupString(src_string);
218
0
  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths, nullptr)) {
219
0
    lengths.push_back('\0');
220
0
    std::string src_lengths = &lengths[0];
221
0
    this->init(cleaned.c_str(), src_lengths.c_str(), 0.0, 0.0, NO_PERM);
222
0
  } else { // There must have been an invalid unichar in the string.
223
0
    this->init(8);
224
0
    this->make_bad();
225
0
  }
226
0
}
227
228
/**
229
 * WERD_CHOICE::init
230
 *
231
 * Helper function to build a WERD_CHOICE from the given string,
232
 * fragment lengths, rating, certainty and permuter.
233
 *
234
 * The function assumes that src_string is not nullptr.
235
 * src_lengths argument could be nullptr, in which case the unichars
236
 * in src_string are assumed to all be of length 1.
237
 */
238
void WERD_CHOICE::init(const char *src_string, const char *src_lengths, float src_rating,
239
0
                       float src_certainty, uint8_t src_permuter) {
240
0
  int src_string_len = strlen(src_string);
241
0
  if (src_string_len == 0) {
242
0
    this->init(8);
243
0
  } else {
244
0
    this->init(src_lengths ? strlen(src_lengths) : src_string_len);
245
0
    length_ = reserved_;
246
0
    int offset = 0;
247
0
    for (unsigned i = 0; i < length_; ++i) {
248
0
      int unichar_length = src_lengths ? src_lengths[i] : 1;
249
0
      unichar_ids_[i] = unicharset_->unichar_to_id(src_string + offset, unichar_length);
250
0
      state_[i] = 1;
251
0
      certainties_[i] = src_certainty;
252
0
      offset += unichar_length;
253
0
    }
254
0
  }
255
0
  adjust_factor_ = 1.0f;
256
0
  rating_ = src_rating;
257
0
  certainty_ = src_certainty;
258
0
  permuter_ = src_permuter;
259
0
  dangerous_ambig_found_ = false;
260
0
}
261
262
/**
 * WERD_CHOICE::~WERD_CHOICE
 *
 * Defaulted destructor: no manual cleanup is performed here; the members are
 * destroyed by their own destructors.
 */
265
2.18M
WERD_CHOICE::~WERD_CHOICE() = default;
266
267
0
const char *WERD_CHOICE::permuter_name() const {
268
0
  return kPermuterTypeNames[permuter_];
269
0
}
270
271
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
272
// taken from the appropriate cell in the ratings MATRIX.
273
// Borrowed pointer, so do not delete.
274
331k
BLOB_CHOICE_LIST *WERD_CHOICE::blob_choices(unsigned index, MATRIX *ratings) const {
275
331k
  MATRIX_COORD coord = MatrixCoord(index);
276
331k
  BLOB_CHOICE_LIST *result = ratings->get(coord.col, coord.row);
277
331k
  if (result == nullptr) {
278
320
    result = new BLOB_CHOICE_LIST;
279
320
    ratings->put(coord.col, coord.row, result);
280
320
  }
281
331k
  return result;
282
331k
}
283
284
// Returns the MATRIX_COORD corresponding to the location in the ratings
285
// MATRIX for the given index into the word.
286
332k
MATRIX_COORD WERD_CHOICE::MatrixCoord(unsigned index) const {
287
332k
  int col = 0;
288
2.47M
  for (unsigned i = 0; i < index; ++i) {
289
2.14M
    col += state_[i];
290
2.14M
  }
291
332k
  int row = col + state_[index] - 1;
292
332k
  return MATRIX_COORD(col, row);
293
332k
}
294
295
// Sets the entries for the given index from the BLOB_CHOICE, assuming
296
// unit fragment lengths, but setting the state for this index to blob_count.
297
2.98M
void WERD_CHOICE::set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice) {
298
2.98M
  unichar_ids_[index] = blob_choice->unichar_id();
299
2.98M
  script_pos_[index] = tesseract::SP_NORMAL;
300
2.98M
  state_[index] = blob_count;
301
2.98M
  certainties_[index] = blob_choice->certainty();
302
2.98M
}
303
304
/**
305
 * contains_unichar_id
306
 *
307
 * Returns true if unichar_ids_ contain the given unichar_id, false otherwise.
308
 */
309
0
bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
310
0
  for (unsigned i = 0; i < length_; ++i) {
311
0
    if (unichar_ids_[i] == unichar_id) {
312
0
      return true;
313
0
    }
314
0
  }
315
0
  return false;
316
0
}
317
318
/**
319
 * remove_unichar_ids
320
 *
321
 * Removes num unichar ids starting from index start from unichar_ids_
322
 * and updates length_ and fragment_lengths_ to reflect this change.
323
 * Note: this function does not modify rating_ and certainty_.
324
 */
325
105k
void WERD_CHOICE::remove_unichar_ids(unsigned start, int num) {
326
105k
  ASSERT_HOST(start + num <= length_);
327
  // Accumulate the states to account for the merged blobs.
328
211k
  for (int i = 0; i < num; ++i) {
329
105k
    if (start > 0) {
330
105k
      state_[start - 1] += state_[start + i];
331
105k
    } else if (start + num < length_) {
332
0
      state_[start + num] += state_[start + i];
333
0
    }
334
105k
  }
335
692k
  for (unsigned i = start; i + num < length_; ++i) {
336
587k
    unichar_ids_[i] = unichar_ids_[i + num];
337
587k
    script_pos_[i] = script_pos_[i + num];
338
587k
    state_[i] = state_[i + num];
339
587k
    certainties_[i] = certainties_[i + num];
340
587k
  }
341
105k
  length_ -= num;
342
105k
}
343
344
/**
345
 * reverse_and_mirror_unichar_ids
346
 *
347
 * Reverses and mirrors unichars in unichar_ids.
348
 */
349
0
void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
350
0
  for (unsigned i = 0; i < length_ / 2; ++i) {
351
0
    UNICHAR_ID tmp_id = unichar_ids_[i];
352
0
    unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_ - 1 - i]);
353
0
    unichar_ids_[length_ - 1 - i] = unicharset_->get_mirror(tmp_id);
354
0
  }
355
0
  if (length_ % 2 != 0) {
356
0
    unichar_ids_[length_ / 2] = unicharset_->get_mirror(unichar_ids_[length_ / 2]);
357
0
  }
358
0
}
359
360
/**
361
 * punct_stripped
362
 *
363
 * Returns the half-open interval of unichar_id indices [start, end) which
364
 * enclose the core portion of this word -- the part after stripping
365
 * punctuation from the left and right.
366
 */
367
0
void WERD_CHOICE::punct_stripped(unsigned *start, unsigned *end) const {
368
0
  *start = 0;
369
0
  *end = length();
370
0
  while (*start < length() && unicharset()->get_ispunctuation(unichar_id(*start))) {
371
0
    (*start)++;
372
0
  }
373
0
  while (*end > *start && unicharset()->get_ispunctuation(unichar_id(*end - 1))) {
374
0
    (*end)--;
375
0
  }
376
0
}
377
378
0
void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
379
0
  int end = length();
380
0
  while (end > 0 && unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
381
0
         BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
382
0
    end--;
383
0
  }
384
0
  int start = 0;
385
0
  while (start < end && unicharset_->get_isdigit(unichar_ids_[start]) &&
386
0
         BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
387
0
    start++;
388
0
  }
389
0
  *pstart = start;
390
0
  *pend = end;
391
0
}
392
393
0
// Returns a new WERD_CHOICE holding the unichar ids, states and certainties
// of the entries in [start, end). Each entry is appended with a rating of
// 0.0f. Both bounds must not exceed length_; if end < start the result is
// empty.
WERD_CHOICE WERD_CHOICE::shallow_copy(unsigned start, unsigned end) const {
  ASSERT_HOST(start <= length_);
  ASSERT_HOST(end <= length_);
  end = std::max(end, start);
  WERD_CHOICE retval(unicharset_, end - start);
  for (auto src = start; src < end; ++src) {
    retval.append_unichar_id_space_allocated(unichar_ids_[src], state_[src], 0.0f,
                                             certainties_[src]);
  }
  return retval;
}
405
406
/**
407
 * has_rtl_unichar_id
408
 *
409
 * Returns true if unichar_ids contain at least one "strongly" RTL unichar.
410
 */
411
0
bool WERD_CHOICE::has_rtl_unichar_id() const {
412
0
  for (unsigned i = 0; i < length_; ++i) {
413
0
    UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
414
0
    if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
415
0
      return true;
416
0
    }
417
0
  }
418
0
  return false;
419
0
}
420
421
/**
422
 * string_and_lengths
423
 *
424
 * Populates the given word_str with unichars from unichar_ids and
425
 * and word_lengths_str with the corresponding unichar lengths.
426
 */
427
3.26M
void WERD_CHOICE::string_and_lengths(std::string *word_str, std::string *word_lengths_str) const {
428
3.26M
  *word_str = "";
429
3.26M
  if (word_lengths_str != nullptr) {
430
3.26M
    *word_lengths_str = "";
431
3.26M
  }
432
34.0M
  for (unsigned i = 0; i < length_; ++i) {
433
30.8M
    const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
434
30.8M
    *word_str += ch;
435
30.8M
    if (word_lengths_str != nullptr) {
436
30.8M
      *word_lengths_str += (char)strlen(ch);
437
30.8M
    }
438
30.8M
  }
439
3.26M
}
440
441
/**
442
 * append_unichar_id
443
 *
444
 * Make sure there is enough space in the word for the new unichar id
445
 * and call append_unichar_id_space_allocated().
446
 */
447
void WERD_CHOICE::append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating,
448
1.45M
                                    float certainty) {
449
1.45M
  if (length_ == reserved_) {
450
41.4k
    this->double_the_size();
451
41.4k
  }
452
1.45M
  this->append_unichar_id_space_allocated(unichar_id, blob_count, rating, certainty);
453
1.45M
}
454
455
/**
456
 * WERD_CHOICE::operator+=
457
 *
458
 * Cat a second word rating on the end of this current one.
459
 * The ratings are added and the confidence is the min.
460
 * If the permuters are NOT the same the permuter is set to COMPOUND_PERM
461
 */
462
280k
WERD_CHOICE &WERD_CHOICE::operator+=(const WERD_CHOICE &second) {
463
280k
  ASSERT_HOST(unicharset_ == second.unicharset_);
464
1.59M
  while (reserved_ < length_ + second.length()) {
465
1.31M
    this->double_the_size();
466
1.31M
  }
467
280k
  const std::vector<UNICHAR_ID> &other_unichar_ids = second.unichar_ids();
468
12.3M
  for (unsigned i = 0; i < second.length(); ++i) {
469
12.0M
    unichar_ids_[length_ + i] = other_unichar_ids[i];
470
12.0M
    state_[length_ + i] = second.state_[i];
471
12.0M
    certainties_[length_ + i] = second.certainties_[i];
472
12.0M
    script_pos_[length_ + i] = second.BlobPosition(i);
473
12.0M
  }
474
280k
  length_ += second.length();
475
280k
  if (second.adjust_factor_ > adjust_factor_) {
476
153k
    adjust_factor_ = second.adjust_factor_;
477
153k
  }
478
280k
  rating_ += second.rating();            // add ratings
479
280k
  if (second.certainty() < certainty_) { // take min
480
224k
    certainty_ = second.certainty();
481
224k
  }
482
280k
  if (second.dangerous_ambig_found_) {
483
0
    dangerous_ambig_found_ = true;
484
0
  }
485
280k
  if (permuter_ == NO_PERM) {
486
0
    permuter_ = second.permuter();
487
280k
  } else if (second.permuter() != NO_PERM && second.permuter() != permuter_) {
488
0
    permuter_ = COMPOUND_PERM;
489
0
  }
490
280k
  return *this;
491
280k
}
492
493
/**
494
 * WERD_CHOICE::operator=
495
 *
496
 * Allocate enough memory to hold a copy of source and copy over
497
 * all the information from source to this WERD_CHOICE.
498
 */
499
598k
WERD_CHOICE &WERD_CHOICE::operator=(const WERD_CHOICE &source) {
500
598k
  while (reserved_ < source.length()) {
501
0
    this->double_the_size();
502
0
  }
503
504
598k
  unicharset_ = source.unicharset_;
505
598k
  const std::vector<UNICHAR_ID> &other_unichar_ids = source.unichar_ids();
506
3.72M
  for (unsigned i = 0; i < source.length(); ++i) {
507
3.12M
    unichar_ids_[i] = other_unichar_ids[i];
508
3.12M
    state_[i] = source.state_[i];
509
3.12M
    certainties_[i] = source.certainties_[i];
510
3.12M
    script_pos_[i] = source.BlobPosition(i);
511
3.12M
  }
512
598k
  length_ = source.length();
513
598k
  adjust_factor_ = source.adjust_factor_;
514
598k
  rating_ = source.rating();
515
598k
  certainty_ = source.certainty();
516
598k
  min_x_height_ = source.min_x_height();
517
598k
  max_x_height_ = source.max_x_height();
518
598k
  permuter_ = source.permuter();
519
598k
  dangerous_ambig_found_ = source.dangerous_ambig_found_;
520
598k
  return *this;
521
598k
}
522
523
// Sets up the script_pos_ member using the blobs_list to get the bln
524
// bounding boxes, *this to get the unichars, and this->unicharset
525
// to get the target positions. If small_caps is true, sub/super are not
526
// considered, but dropcaps are.
527
// NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)
528
399k
void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD *word, int debug) {
529
  // Initialize to normal.
530
3.17M
  for (unsigned i = 0; i < length_; ++i) {
531
2.77M
    script_pos_[i] = tesseract::SP_NORMAL;
532
2.77M
  }
533
399k
  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
534
0
    return;
535
0
  }
536
537
399k
  unsigned position_counts[4] = {0, 0, 0, 0};
538
539
399k
  int chunk_index = 0;
540
3.17M
  for (unsigned blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
541
2.77M
    TBLOB *tblob = word->blobs[chunk_index];
542
2.77M
    int uni_id = unichar_id(blob_index);
543
2.77M
    TBOX blob_box = tblob->bounding_box();
544
2.77M
    if (!state_.empty()) {
545
3.91M
      for (int i = 1; i < state_[blob_index]; ++i) {
546
1.14M
        ++chunk_index;
547
1.14M
        tblob = word->blobs[chunk_index];
548
1.14M
        blob_box += tblob->bounding_box();
549
1.14M
      }
550
2.77M
    }
551
2.77M
    script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, uni_id);
552
2.77M
    if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
553
0
      script_pos_[blob_index] = tesseract::SP_NORMAL;
554
0
    }
555
2.77M
    position_counts[script_pos_[blob_index]]++;
556
2.77M
  }
557
  // If almost everything looks like a superscript or subscript,
558
  // we most likely just got the baseline wrong.
559
399k
  if (4 * position_counts[tesseract::SP_SUBSCRIPT] > 3 * length_ ||
560
399k
      4 * position_counts[tesseract::SP_SUPERSCRIPT] > 3 * length_) {
561
37.8k
    if (debug >= 2) {
562
0
      tprintf(
563
0
          "Most characters of %s are subscript or superscript.\n"
564
0
          "That seems wrong, so I'll assume we got the baseline wrong\n",
565
0
          unichar_string().c_str());
566
0
    }
567
226k
    for (unsigned i = 0; i < length_; i++) {
568
188k
      ScriptPos sp = script_pos_[i];
569
188k
      if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
570
175k
        ASSERT_HOST(position_counts[sp] > 0);
571
175k
        position_counts[sp]--;
572
175k
        position_counts[tesseract::SP_NORMAL]++;
573
175k
        script_pos_[i] = tesseract::SP_NORMAL;
574
175k
      }
575
188k
    }
576
37.8k
  }
577
578
399k
  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) {
579
0
    tprintf("SetScriptPosition on %s\n", unichar_string().c_str());
580
0
    int chunk_index = 0;
581
0
    for (unsigned blob_index = 0; blob_index < length_; ++blob_index) {
582
0
      if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
583
0
        TBLOB *tblob = word->blobs[chunk_index];
584
0
        ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index));
585
0
      }
586
0
      chunk_index += state_.empty() ? 1 : state_[blob_index];
587
0
    }
588
0
  }
589
399k
}
590
591
// Sets all the script_pos_ positions to the given position.
592
5.33k
void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) {
593
24.2k
  for (unsigned i = 0; i < length_; ++i) {
594
18.8k
    script_pos_[i] = position;
595
18.8k
  }
596
5.33k
}
597
598
/* static */
599
ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset,
600
2.77M
                                        const TBOX &blob_box, UNICHAR_ID unichar_id) {
601
2.77M
  ScriptPos retval = tesseract::SP_NORMAL;
602
2.77M
  int top = blob_box.top();
603
2.77M
  int bottom = blob_box.bottom();
604
2.77M
  int min_bottom, max_bottom, min_top, max_top;
605
2.77M
  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);
606
607
2.77M
  int sub_thresh_top = min_top - kMinSubscriptOffset;
608
2.77M
  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
609
2.77M
  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
610
2.77M
  if (bottom <= kMaxDropCapBottom) {
611
406k
    retval = tesseract::SP_DROPCAP;
612
2.37M
  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
613
391k
    retval = tesseract::SP_SUBSCRIPT;
614
1.98M
  } else if (bottom > sup_thresh_bot) {
615
638k
    retval = tesseract::SP_SUPERSCRIPT;
616
638k
  }
617
618
2.77M
  if (print_debug) {
619
0
    const char *pos = ScriptPosToString(retval);
620
0
    tprintf(
621
0
        "%s Character %s[bot:%d top: %d]  "
622
0
        "bot_range[%d,%d]  top_range[%d, %d] "
623
0
        "sub_thresh[bot:%d top:%d]  sup_thresh_bot %d\n",
624
0
        pos, unicharset.id_to_unichar(unichar_id), bottom, top, min_bottom, max_bottom, min_top,
625
0
        max_top, sub_thresh_bot, sub_thresh_top, sup_thresh_bot);
626
0
  }
627
2.77M
  return retval;
628
2.77M
}
629
630
// Returns the script-id (eg Han) of the dominant script in the word.
631
0
int WERD_CHOICE::GetTopScriptID() const {
632
0
  unsigned max_script = unicharset_->get_script_table_size();
633
0
  std::vector<unsigned> sid(max_script);
634
0
  for (unsigned x = 0; x < length_; ++x) {
635
0
    int script_id = unicharset_->get_script(unichar_id(x));
636
0
    sid[script_id]++;
637
0
  }
638
0
  if (unicharset_->han_sid() != unicharset_->null_sid()) {
639
    // Add the Hiragana & Katakana counts to Han and zero them out.
640
0
    if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
641
0
      sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
642
0
      sid[unicharset_->hiragana_sid()] = 0;
643
0
    }
644
0
    if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
645
0
      sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
646
0
      sid[unicharset_->katakana_sid()] = 0;
647
0
    }
648
0
  }
649
  // Note that high script ID overrides lower one on a tie, thus biasing
650
  // towards non-Common script (if sorted that way in unicharset file).
651
0
  unsigned max_sid = 0;
652
0
  for (unsigned x = 1; x < max_script; x++) {
653
0
    if (sid[x] >= sid[max_sid]) {
654
0
      max_sid = x;
655
0
    }
656
0
  }
657
0
  if (sid[max_sid] < length_ / 2) {
658
0
    max_sid = unicharset_->null_sid();
659
0
  }
660
0
  return max_sid;
661
0
}
662
663
// Fixes the state_ for a chop at the given blob_posiiton.
664
567k
void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
665
567k
  int total_chunks = 0;
666
3.07M
  for (unsigned i = 0; i < length_; ++i) {
667
3.07M
    total_chunks += state_[i];
668
3.07M
    if (total_chunks > blob_position) {
669
567k
      ++state_[i];
670
567k
      return;
671
567k
    }
672
3.07M
  }
673
567k
}
674
675
// Returns the sum of all the state elements, being the total number of blobs.
676
5.63M
unsigned WERD_CHOICE::TotalOfStates() const {
677
5.63M
  unsigned total_chunks = 0;
678
54.2M
  for (unsigned i = 0; i < length_; ++i) {
679
48.6M
    total_chunks += state_[i];
680
48.6M
  }
681
5.63M
  return total_chunks;
682
5.63M
}
683
684
/**
685
 * WERD_CHOICE::print
686
 *
687
 * Print WERD_CHOICE to stdout.
688
 */
689
0
void WERD_CHOICE::print(const char *msg) const {
690
0
  tprintf("%s : ", msg);
691
0
  for (unsigned i = 0; i < length_; ++i) {
692
0
    tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
693
0
  }
694
0
  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", rating_, certainty_,
695
0
          adjust_factor_, permuter_, min_x_height_, max_x_height_, dangerous_ambig_found_);
696
0
  tprintf("pos");
697
0
  for (unsigned i = 0; i < length_; ++i) {
698
0
    tprintf("\t%s", ScriptPosToString(script_pos_[i]));
699
0
  }
700
0
  tprintf("\nstr");
701
0
  for (unsigned i = 0; i < length_; ++i) {
702
0
    tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
703
0
  }
704
0
  tprintf("\nstate:");
705
0
  for (unsigned i = 0; i < length_; ++i) {
706
0
    tprintf("\t%d ", state_[i]);
707
0
  }
708
0
  tprintf("\nC");
709
0
  for (unsigned i = 0; i < length_; ++i) {
710
0
    tprintf("\t%.3f", certainties_[i]);
711
0
  }
712
0
  tprintf("\n");
713
0
}
714
715
// Prints the segmentation state with an introductory message.
716
0
void WERD_CHOICE::print_state(const char *msg) const {
717
0
  tprintf("%s", msg);
718
0
  for (unsigned i = 0; i < length_; ++i) {
719
0
    tprintf(" %d", state_[i]);
720
0
  }
721
0
  tprintf("\n");
722
0
}
723
724
#ifndef GRAPHICS_DISABLED
725
726
// Displays the segmentation state of *this (if not the same as the last
727
// one displayed) and waits for a click in the window.
728
void WERD_CHOICE::DisplaySegmentation(TWERD *word) {
729
  // Number of different colors to draw with.
730
  const int kNumColors = 6;
731
  static ScrollView *segm_window = nullptr;
732
  // Check the state against the static prev_drawn_state.
733
  static std::vector<int> prev_drawn_state;
734
  bool already_done = prev_drawn_state.size() == length_;
735
  if (!already_done) {
736
    prev_drawn_state.clear();
737
    prev_drawn_state.resize(length_);
738
  }
739
  for (unsigned i = 0; i < length_; ++i) {
740
    if (prev_drawn_state[i] != state_[i]) {
741
      already_done = false;
742
    }
743
    prev_drawn_state[i] = state_[i];
744
  }
745
  if (already_done || word->blobs.empty()) {
746
    return;
747
  }
748
749
  // Create the window if needed.
750
  if (segm_window == nullptr) {
751
    segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 2000.0, 256.0, true);
752
  } else {
753
    segm_window->Clear();
754
  }
755
756
  TBOX bbox;
757
  int blob_index = 0;
758
  for (unsigned c = 0; c < length_; ++c) {
759
    auto color = static_cast<ScrollView::Color>(c % kNumColors + 3);
760
    for (int i = 0; i < state_[c]; ++i, ++blob_index) {
761
      TBLOB *blob = word->blobs[blob_index];
762
      bbox += blob->bounding_box();
763
      blob->plot(segm_window, color, color);
764
    }
765
  }
766
  segm_window->ZoomToRectangle(bbox.left(), bbox.top(), bbox.right(), bbox.bottom());
767
  segm_window->Update();
768
  segm_window->Wait();
769
}
770
771
#endif // !GRAPHICS_DISABLED
772
773
0
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2) {
774
0
  const UNICHARSET *uchset = word1.unicharset();
775
0
  if (word2.unicharset() != uchset) {
776
0
    return false;
777
0
  }
778
0
  unsigned w1start, w1end;
779
0
  word1.punct_stripped(&w1start, &w1end);
780
0
  unsigned w2start, w2end;
781
0
  word2.punct_stripped(&w2start, &w2end);
782
0
  if (w1end - w1start != w2end - w2start) {
783
0
    return false;
784
0
  }
785
0
  for (unsigned i = 0; i < w1end - w1start; i++) {
786
0
    if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
787
0
        uchset->to_lower(word2.unichar_id(w2start + i))) {
788
0
      return false;
789
0
    }
790
0
  }
791
0
  return true;
792
0
}
793
794
/**
795
 * print_ratings_list
796
 *
797
 * Send all the ratings out to the logfile.
798
 *
799
 * @param msg intro message
800
 * @param ratings list of ratings
801
 * @param current_unicharset unicharset that can be used
802
 * for id-to-unichar conversion
803
 */
804
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings,
805
0
                        const UNICHARSET &current_unicharset) {
806
0
  if (ratings->empty()) {
807
0
    tprintf("%s:<none>\n", msg);
808
0
    return;
809
0
  }
810
0
  if (*msg != '\0') {
811
0
    tprintf("%s\n", msg);
812
0
  }
813
0
  BLOB_CHOICE_IT c_it;
814
0
  c_it.set_to_list(ratings);
815
0
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
816
0
    c_it.data()->print(&current_unicharset);
817
0
    if (!c_it.at_last()) {
818
0
      tprintf("\n");
819
0
    }
820
0
  }
821
0
  tprintf("\n");
822
0
  fflush(stdout);
823
0
}
824
825
} // namespace tesseract