Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/ccstruct/pageres.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        pageres.cpp  (Formerly page_res.c)
3
 * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
4
 *              and an iterator class to iterate over the words.
5
 * Main purposes:
6
 *              Easy way to iterate over the words without a 3-nested loop.
7
 *              Holds data used during word recognition.
8
 *              Holds information about alternative spacing paths.
9
 * Author:      Phil Cheatle
10
 *
11
 * (C) Copyright 1992, Hewlett-Packard Ltd.
12
 ** Licensed under the Apache License, Version 2.0 (the "License");
13
 ** you may not use this file except in compliance with the License.
14
 ** You may obtain a copy of the License at
15
 ** http://www.apache.org/licenses/LICENSE-2.0
16
 ** Unless required by applicable law or agreed to in writing, software
17
 ** distributed under the License is distributed on an "AS IS" BASIS,
18
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
 ** See the License for the specific language governing permissions and
20
 ** limitations under the License.
21
 *
22
 **********************************************************************/
23
24
#include "pageres.h"
25
26
#include "blamer.h"   // for BlamerBundle
27
#include "blobs.h"    // for TWERD, TBLOB
28
#include "boxword.h"  // for BoxWord
29
#include "errcode.h"  // for ASSERT_HOST
30
#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
31
#include "ocrrow.h"   // for ROW, ROW_IT
32
#include "pdblock.h"  // for PDBLK
33
#include "polyblk.h"  // for POLY_BLOCK
34
#include "seam.h"     // for SEAM, start_seam_list
35
#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
36
#include "tprintf.h"  // for tprintf
37
38
#include <tesseract/publictypes.h> // for OcrEngineMode, OEM_LSTM_ONLY
39
40
#include <cassert> // for assert
41
#include <cstdint> // for INT32_MAX
42
#include <cstring> // for strlen
43
44
struct Pix;
45
46
namespace tesseract {
47
48
// Gain factor for computing thresholds that determine the ambiguity of a
49
// word. StopperAmbigThreshold multiplies the adjust-factor difference by it.
50
static const double kStopperAmbiguityThresholdGain = 8.0;
51
// Constant offset for computing thresholds that determine the ambiguity of a
52
// word. StopperAmbigThreshold subtracts it from the scaled difference.
53
static const double kStopperAmbiguityThresholdOffset = 1.5;
54
// Max number of broken pieces to associate (bandwidth of the ratings MATRIX).
55
const int kWordrecMaxNumJoinChunks = 4;
56
// Max ratio of word box height to line size to allow it to be processed as
57
// a line with other words (applied to single word boxes in ROW_RES).
58
const double kMaxWordSizeRatio = 1.25;
59
// Max ratio of line box height to line size to allow a new word to be added.
60
const double kMaxLineSizeRatio = 1.25;
61
// Max ratio of word gap to line size to allow a new word to be added.
62
const double kMaxWordGapRatio = 2.0;
63
64
// Computes and returns a threshold of certainty difference used to determine
65
// which words to keep, based on the adjustment factors of the two words.
66
// TODO(rays) This is horrible. Replace with an enhance params training model.
67
485k
static double StopperAmbigThreshold(double f1, double f2) {
68
485k
  return (f2 - f1) * kStopperAmbiguityThresholdGain -
69
485k
         kStopperAmbiguityThresholdOffset;
70
485k
}
71
72
/*************************************************************************
73
 * PAGE_RES::PAGE_RES
74
 *
75
 * Constructor for page results
76
 *************************************************************************/
77
PAGE_RES::PAGE_RES(bool merge_similar_words, BLOCK_LIST *the_block_list,
78
7.72k
                   WERD_CHOICE **prev_word_best_choice_ptr) {
79
7.72k
  Init();
80
7.72k
  BLOCK_IT block_it(the_block_list);
81
7.72k
  BLOCK_RES_IT block_res_it(&block_res_list);
82
14.7k
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
83
7.06k
    block_res_it.add_to_end(
84
7.06k
        new BLOCK_RES(merge_similar_words, block_it.data()));
85
7.06k
  }
86
7.72k
  prev_word_best_choice = prev_word_best_choice_ptr;
87
7.72k
}
88
89
/*************************************************************************
90
 * BLOCK_RES::BLOCK_RES
91
 *
92
 * Constructor for BLOCK results
93
 *************************************************************************/
94
95
7.06k
BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
96
7.06k
  ROW_IT row_it(the_block->row_list());
97
7.06k
  ROW_RES_IT row_res_it(&row_res_list);
98
99
7.06k
  char_count = 0;
100
7.06k
  rej_count = 0;
101
7.06k
  font_class = -1; // not assigned
102
7.06k
  x_height = -1.0;
103
7.06k
  font_assigned = false;
104
7.06k
  row_count = 0;
105
106
7.06k
  block = the_block;
107
108
102k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
109
95.5k
    row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
110
95.5k
  }
111
7.06k
}
112
113
/*************************************************************************
114
 * ROW_RES::ROW_RES
115
 *
116
 * Constructor for ROW results
117
 *************************************************************************/
118
119
95.5k
ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
120
95.5k
  WERD_IT word_it(the_row->word_list());
121
95.5k
  WERD_RES_IT word_res_it(&word_res_list);
122
95.5k
  WERD_RES *combo = nullptr; // current combination of fuzzies
123
95.5k
  WERD *copy_word;
124
125
95.5k
  char_count = 0;
126
95.5k
  rej_count = 0;
127
95.5k
  whole_word_rej_count = 0;
128
129
95.5k
  row = the_row;
130
95.5k
  bool add_next_word = false;
131
95.5k
  TBOX union_box;
132
95.5k
  float line_height =
133
95.5k
      the_row->x_height() + the_row->ascenders() - the_row->descenders();
134
241k
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
135
145k
    auto *word_res = new WERD_RES(word_it.data());
136
145k
    word_res->x_height = the_row->x_height();
137
145k
    if (add_next_word) {
138
33.9k
      ASSERT_HOST(combo != nullptr);
139
      // We are adding this word to the combination.
140
33.9k
      word_res->part_of_combo = true;
141
33.9k
      combo->copy_on(word_res);
142
111k
    } else if (merge_similar_words) {
143
111k
      union_box = word_res->word->bounding_box();
144
111k
      add_next_word = !word_res->word->flag(W_REP_CHAR) &&
145
111k
                      union_box.height() <= line_height * kMaxWordSizeRatio;
146
111k
      word_res->odd_size = !add_next_word;
147
111k
    }
148
145k
    WERD *next_word = word_it.data_relative(1);
149
145k
    if (merge_similar_words) {
150
145k
      if (add_next_word && !next_word->flag(W_REP_CHAR)) {
151
        // Next word will be added on if all of the following are true:
152
        // Not a rep char.
153
        // Box height small enough.
154
        // Union box height small enough.
155
        // Horizontal gap small enough.
156
132k
        TBOX next_box = next_word->bounding_box();
157
132k
        int prev_right = union_box.right();
158
132k
        union_box += next_box;
159
132k
        if (next_box.height() > line_height * kMaxWordSizeRatio ||
160
132k
            union_box.height() > line_height * kMaxLineSizeRatio ||
161
132k
            next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
162
17.2k
          add_next_word = false;
163
17.2k
        }
164
132k
      }
165
145k
      next_word->set_flag(W_FUZZY_NON, add_next_word);
166
145k
    } else {
167
0
      add_next_word = next_word->flag(W_FUZZY_NON);
168
0
    }
169
145k
    if (add_next_word) {
170
115k
      if (combo == nullptr) {
171
84.1k
        copy_word = new WERD;
172
84.1k
        *copy_word = *(word_it.data()); // deep copy
173
84.1k
        combo = new WERD_RES(copy_word);
174
84.1k
        combo->x_height = the_row->x_height();
175
84.1k
        combo->combination = true;
176
84.1k
        word_res_it.add_to_end(combo);
177
84.1k
      }
178
115k
      word_res->part_of_combo = true;
179
115k
    } else {
180
30.5k
      combo = nullptr;
181
30.5k
    }
182
145k
    word_res_it.add_to_end(word_res);
183
145k
  }
184
95.5k
}
185
186
26.8k
// Copy assignment. Replaces the contents of *this with a copy of source.
// Owned sub-objects (boxes, chopped/rebuilt words, choices, blamer bundle)
// are duplicated; the word itself is deep-copied only when source is a
// combination, otherwise both objects point at the same WERD.
// The seam array and the ratings matrix are NOT copied.
// Self-assignment is a no-op.
WERD_RES &WERD_RES::operator=(const WERD_RES &source) {
  if (this == &source) {
    // Clear() below would wipe the very data we are about to copy from.
    return *this;
  }
  this->ELIST<WERD_RES>::LINK::operator=(source);
  // NOTE(review): the conditional copies below only assign when source has
  // data, so they rely on Clear() leaving the owned pointers empty - confirm.
  Clear();
  if (source.combination) {
    word = new WERD;
    *word = *(source.word); // deep copy
  } else {
    word = source.word; // pt to same word
  }
  if (source.bln_boxes != nullptr) {
    bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
  }
  if (source.chopped_word != nullptr) {
    chopped_word = new TWERD(*source.chopped_word);
  }
  if (source.rebuild_word != nullptr) {
    rebuild_word = new TWERD(*source.rebuild_word);
  }
  // TODO(rays) Do we ever need to copy the seam_array?
  blob_row = source.blob_row;
  denorm = source.denorm;
  if (source.box_word != nullptr) {
    box_word = new tesseract::BoxWord(*source.box_word);
  }
  best_state = source.best_state;
  correct_text = source.correct_text;
  blob_widths = source.blob_widths;
  blob_gaps = source.blob_gaps;
  // None of the uses of operator= require the ratings matrix to be copied,
  // so don't as it would be really slow.

  // Copy the cooked choices, preserving their order.
  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&source.best_choices));
  WERD_CHOICE_IT wc_dest_it(&best_choices);
  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
    const WERD_CHOICE *choice = wc_it.data();
    wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
  }
  // best_choice always refers to the head of our own copied list.
  if (!wc_dest_it.empty()) {
    wc_dest_it.move_to_first();
    best_choice = wc_dest_it.data();
  } else {
    best_choice = nullptr;
  }

  if (source.raw_choice != nullptr) {
    raw_choice = new WERD_CHOICE(*source.raw_choice);
  } else {
    raw_choice = nullptr;
  }
  if (source.ep_choice != nullptr) {
    ep_choice = new WERD_CHOICE(*source.ep_choice);
  } else {
    ep_choice = nullptr;
  }
  reject_map = source.reject_map;
  combination = source.combination;
  part_of_combo = source.part_of_combo;
  CopySimpleFields(source);
  if (source.blamer_bundle != nullptr) {
    blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
  }
  return *this;
}
250
251
// Copies basic fields that don't involve pointers that might be useful
252
// to copy when making one WERD_RES from another.
253
239k
void WERD_RES::CopySimpleFields(const WERD_RES &source) {
254
239k
  tess_failed = source.tess_failed;
255
239k
  tess_accepted = source.tess_accepted;
256
239k
  tess_would_adapt = source.tess_would_adapt;
257
239k
  done = source.done;
258
239k
  unlv_crunch_mode = source.unlv_crunch_mode;
259
239k
  small_caps = source.small_caps;
260
239k
  odd_size = source.odd_size;
261
239k
  fontinfo = source.fontinfo;
262
239k
  fontinfo2 = source.fontinfo2;
263
239k
  fontinfo_id_count = source.fontinfo_id_count;
264
239k
  fontinfo_id2_count = source.fontinfo_id2_count;
265
239k
  x_height = source.x_height;
266
239k
  caps_height = source.caps_height;
267
239k
  baseline_shift = source.baseline_shift;
268
239k
  guessed_x_ht = source.guessed_x_ht;
269
239k
  guessed_caps_ht = source.guessed_caps_ht;
270
239k
  reject_spaces = source.reject_spaces;
271
239k
  uch_set = source.uch_set;
272
239k
  tesseract = source.tesseract;
273
239k
}
274
275
// Initializes a blank (default constructed) WERD_RES from one that has
276
// already been recognized.
277
// Use SetupFor*Recognition afterwards to complete the setup and make
278
// it ready for a retry recognition.
279
141k
void WERD_RES::InitForRetryRecognition(const WERD_RES &source) {
280
141k
  word = source.word;
281
141k
  CopySimpleFields(source);
282
141k
  if (source.blamer_bundle != nullptr) {
283
0
    blamer_bundle = new BlamerBundle();
284
0
    blamer_bundle->CopyTruth(*source.blamer_bundle);
285
0
  }
286
141k
}
287
288
// Sets up the members used in recognition: bln_boxes, chopped_word,
289
// seam_array, denorm.  Returns false if
290
// the word is empty and sets up fake results.  If use_body_size is
291
// true and row->body_size is set, then body_size will be used for
292
// blob normalization instead of xheight + ascrise. This flag is for
293
// those languages that are using CJK pitch model and thus it has to
294
// be true if and only if tesseract->textord_use_cjk_fp_model is
295
// true.
296
// If allow_detailed_fx is true, the feature extractor will receive fine
297
// precision outline information, allowing smoother features and better
298
// features on low resolution images.
299
// The norm_mode_hint sets the default mode for normalization in absence
300
// of any of the above flags.
301
// norm_box is used to override the word bounding box to determine the
302
// normalization scale and offset.
303
// Returns false if the word is empty and sets up fake results.
304
bool WERD_RES::SetupForRecognition(const UNICHARSET &unicharset_in,
305
                                   tesseract::Tesseract *tess, Image pix,
306
                                   int norm_mode, const TBOX *norm_box,
307
                                   bool numeric_mode, bool use_body_size,
308
                                   bool allow_detailed_fx, ROW *row,
309
300k
                                   const BLOCK *block) {
310
300k
  auto norm_mode_hint = static_cast<tesseract::OcrEngineMode>(norm_mode);
311
300k
  tesseract = tess;
312
300k
  POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
313
300k
  if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&
314
300k
       word->cblob_list()->empty()) ||
315
300k
      (pb != nullptr && !pb->IsText())) {
316
    // Empty words occur when all the blobs have been moved to the rej_blobs
317
    // list, which seems to occur frequently in junk.
318
150
    SetupFake(unicharset_in);
319
150
    word->set_flag(W_REP_CHAR, false);
320
150
    return false;
321
150
  }
322
300k
  ClearResults();
323
300k
  SetupWordScript(unicharset_in);
324
300k
  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
325
300k
  float word_xheight =
326
300k
      use_body_size && row != nullptr && row->body_size() > 0.0f
327
300k
          ? row->body_size()
328
300k
          : x_height;
329
300k
  chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
330
300k
                            word_xheight, baseline_shift, numeric_mode,
331
300k
                            norm_mode_hint, norm_box, &denorm);
332
300k
  blob_row = row;
333
300k
  SetupBasicsFromChoppedWord(unicharset_in);
334
300k
  SetupBlamerBundle();
335
300k
  int num_blobs = chopped_word->NumBlobs();
336
300k
  ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
337
300k
  tess_failed = false;
338
300k
  return true;
339
300k
}
340
341
// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
342
// accumulators from a made chopped word.  We presume the fields are already
343
// empty.
344
340k
void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {
345
340k
  bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);
346
340k
  start_seam_list(chopped_word, &seam_array);
347
340k
  SetupBlobWidthsAndGaps();
348
340k
  ClearWordChoices();
349
340k
}
350
351
// Sets up the members used in recognition for an empty recognition result:
352
// bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
353
150
void WERD_RES::SetupFake(const UNICHARSET &unicharset_in) {
354
150
  ClearResults();
355
150
  SetupWordScript(unicharset_in);
356
150
  chopped_word = new TWERD;
357
150
  rebuild_word = new TWERD;
358
150
  bln_boxes = new tesseract::BoxWord;
359
150
  box_word = new tesseract::BoxWord;
360
150
  int blob_count = word->cblob_list()->length();
361
150
  if (blob_count > 0) {
362
0
    auto **fake_choices = new BLOB_CHOICE *[blob_count];
363
    // For non-text blocks, just pass any blobs through to the box_word
364
    // and call the word failed with a fake classification.
365
0
    C_BLOB_IT b_it(word->cblob_list());
366
0
    int blob_id = 0;
367
0
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
368
0
      TBOX box = b_it.data()->bounding_box();
369
0
      box_word->InsertBox(box_word->length(), box);
370
0
      fake_choices[blob_id++] = new BLOB_CHOICE;
371
0
    }
372
0
    FakeClassifyWord(blob_count, fake_choices);
373
0
    delete[] fake_choices;
374
150
  } else {
375
150
    auto *word = new WERD_CHOICE(&unicharset_in);
376
150
    word->make_bad();
377
150
    LogNewRawChoice(word);
378
    // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
379
150
    LogNewCookedChoice(1, false, word);
380
150
  }
381
150
  tess_failed = true;
382
150
  done = true;
383
150
}
384
385
300k
void WERD_RES::SetupWordScript(const UNICHARSET &uch) {
386
300k
  uch_set = &uch;
387
300k
  int script = uch.default_sid();
388
300k
  word->set_script_id(script);
389
300k
  word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
390
300k
  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
391
300k
}
392
393
// Sets up the blamer_bundle if it is not null, using the initialized denorm.
394
300k
void WERD_RES::SetupBlamerBundle() {
395
300k
  if (blamer_bundle != nullptr) {
396
0
    blamer_bundle->SetupNormTruthWord(denorm);
397
0
  }
398
300k
}
399
400
// Computes the blob_widths and blob_gaps from the chopped_word.
401
449k
void WERD_RES::SetupBlobWidthsAndGaps() {
402
449k
  blob_widths.clear();
403
449k
  blob_gaps.clear();
404
449k
  int num_blobs = chopped_word->NumBlobs();
405
3.97M
  for (int b = 0; b < num_blobs; ++b) {
406
3.52M
    TBLOB *blob = chopped_word->blobs[b];
407
3.52M
    TBOX box = blob->bounding_box();
408
3.52M
    blob_widths.push_back(box.width());
409
3.52M
    if (b + 1 < num_blobs) {
410
3.07M
      blob_gaps.push_back(chopped_word->blobs[b + 1]->bounding_box().left() -
411
3.07M
                          box.right());
412
3.07M
    }
413
3.52M
  }
414
449k
}
415
416
// Updates internal data to account for a new SEAM (chop) at the given
417
// blob_number. Fixes the ratings matrix and states in the choices, as well
418
// as the blob widths and gaps.
419
109k
void WERD_RES::InsertSeam(int blob_number, SEAM *seam) {
420
  // Insert the seam into the SEAMS array.
421
109k
  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
422
109k
  seam_array.insert(seam_array.begin() + blob_number, seam);
423
109k
  if (ratings != nullptr) {
424
    // Expand the ratings matrix.
425
109k
    ratings = ratings->ConsumeAndMakeBigger(blob_number);
426
    // Fix all the segmentation states.
427
109k
    if (raw_choice != nullptr) {
428
109k
      raw_choice->UpdateStateForSplit(blob_number);
429
109k
    }
430
109k
    WERD_CHOICE_IT wc_it(&best_choices);
431
567k
    for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
432
457k
      WERD_CHOICE *choice = wc_it.data();
433
457k
      choice->UpdateStateForSplit(blob_number);
434
457k
    }
435
109k
    SetupBlobWidthsAndGaps();
436
109k
  }
437
109k
}
438
439
// Returns true if all the word choices except the first have adjust_factors
440
// worse than the given threshold.
441
0
bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {
442
  // The choices are not changed by this iteration.
443
0
  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
444
0
  for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
445
0
    WERD_CHOICE *choice = wc_it.data();
446
0
    if (choice->adjust_factor() <= threshold) {
447
0
      return false;
448
0
    }
449
0
  }
450
0
  return true;
451
0
}
452
453
// Returns true if the current word is ambiguous (by number of answers or
454
// by dangerous ambigs.)
455
31.9k
bool WERD_RES::IsAmbiguous() {
456
31.9k
  return !best_choices.singleton() || best_choice->dangerous_ambig_found();
457
31.9k
}
458
459
// Returns true if the ratings matrix size matches the sum of each of the
460
// segmentation states.
461
996k
bool WERD_RES::StatesAllValid() {
462
996k
  unsigned ratings_dim = ratings->dimension();
463
996k
  if (raw_choice->TotalOfStates() != ratings_dim) {
464
0
    tprintf("raw_choice has total of states = %u vs ratings dim of %u\n",
465
0
            raw_choice->TotalOfStates(), ratings_dim);
466
0
    return false;
467
0
  }
468
996k
  WERD_CHOICE_IT it(&best_choices);
469
996k
  unsigned index = 0;
470
5.23M
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
471
4.23M
    WERD_CHOICE *choice = it.data();
472
4.23M
    if (choice->TotalOfStates() != ratings_dim) {
473
0
      tprintf("Cooked #%u has total of states = %u vs ratings dim of %u\n",
474
0
              index, choice->TotalOfStates(), ratings_dim);
475
0
      return false;
476
0
    }
477
4.23M
  }
478
996k
  return true;
479
996k
}
480
481
// Prints a list of words found if debug is true or the word result matches
482
// the word_to_debug.
483
98.6k
void WERD_RES::DebugWordChoices(bool debug, const char *word_to_debug) {
484
98.6k
  if (debug || (word_to_debug != nullptr && *word_to_debug != '\0' &&
485
98.6k
                best_choice != nullptr &&
486
98.6k
                best_choice->unichar_string() == std::string(word_to_debug))) {
487
0
    if (raw_choice != nullptr) {
488
0
      raw_choice->print("\nBest Raw Choice");
489
0
    }
490
491
0
    WERD_CHOICE_IT it(&best_choices);
492
0
    int index = 0;
493
0
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
494
0
      WERD_CHOICE *choice = it.data();
495
0
      std::string label;
496
0
      label += "\nCooked Choice #" + std::to_string(index);
497
0
      choice->print(label.c_str());
498
0
    }
499
0
  }
500
98.6k
}
501
502
// Prints the top choice along with the accepted/done flags.
503
0
void WERD_RES::DebugTopChoice(const char *msg) const {
504
0
  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ", tess_accepted,
505
0
          tess_would_adapt, done);
506
0
  if (best_choice == nullptr) {
507
0
    tprintf("<Null choice>\n");
508
0
  } else {
509
0
    best_choice->print(msg);
510
0
  }
511
0
}
512
513
// Removes from best_choices all choices which are not within a reasonable
514
// range of the best choice.
515
// TODO(rays) incorporate the information used here into the params training
516
// re-ranker, in place of this heuristic that is based on the previous
517
// adjustment factor.
518
98.6k
void WERD_RES::FilterWordChoices(int debug_level) {
519
98.6k
  if (best_choice == nullptr || best_choices.singleton()) {
520
50.6k
    return;
521
50.6k
  }
522
523
48.0k
  if (debug_level >= 2) {
524
0
    best_choice->print("\nFiltering against best choice");
525
0
  }
526
48.0k
  WERD_CHOICE_IT it(&best_choices);
527
48.0k
  int index = 0;
528
233k
  for (it.forward(); !it.at_first(); it.forward(), ++index) {
529
185k
    WERD_CHOICE *choice = it.data();
530
185k
    float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
531
185k
                                            choice->adjust_factor());
532
    // i, j index the blob choice in choice, best_choice.
533
    // chunk is an index into the chopped_word blobs (AKA chunks).
534
    // Since the two words may use different segmentations of the chunks, we
535
    // iterate over the chunks to find out whether a comparable blob
536
    // classification is much worse than the best result.
537
185k
    unsigned i = 0, j = 0, chunk = 0;
538
    // Each iteration of the while deals with 1 chunk. On entry choice_chunk
539
    // and best_chunk are the indices of the first chunk in the NEXT blob,
540
    // i.e. we don't have to increment i, j while chunk < choice_chunk and
541
    // best_chunk respectively.
542
185k
    auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
543
1.74M
    while (i < choice->length() && j < best_choice->length()) {
544
1.66M
      if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
545
1.66M
          choice->certainty(i) - best_choice->certainty(j) < threshold) {
546
100k
        if (debug_level >= 2) {
547
0
          choice->print("WorstCertaintyDiffWorseThan");
548
0
          tprintf(
549
0
              "i %u j %u Choice->Blob[i].Certainty %.4g"
550
0
              " WorstOtherChoiceCertainty %g Threshold %g\n",
551
0
              i, j, choice->certainty(i), best_choice->certainty(j), threshold);
552
0
          tprintf("Discarding bad choice #%d\n", index);
553
0
        }
554
100k
        delete it.extract();
555
100k
        break;
556
100k
      }
557
1.56M
      ++chunk;
558
      // If needed, advance choice_chunk to keep up with chunk.
559
2.35M
      while (choice_chunk < chunk && ++i < choice->length()) {
560
797k
        choice_chunk += choice->state(i);
561
797k
      }
562
      // If needed, advance best_chunk to keep up with chunk.
563
2.31M
      while (best_chunk < chunk && ++j < best_choice->length()) {
564
756k
        best_chunk += best_choice->state(j);
565
756k
      }
566
1.56M
    }
567
185k
  }
568
48.0k
}
569
570
void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
571
                                         float min_rating, float max_rating,
572
                                         float rating_margin,
573
2.15k
                                         float *thresholds) {
574
2.15k
  int chunk = 0;
575
2.15k
  int end_chunk = best_choice->state(0);
576
2.15k
  int end_raw_chunk = raw_choice->state(0);
577
2.15k
  int raw_blob = 0;
578
4.43k
  for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) {
579
2.28k
    float avg_rating = 0.0f;
580
2.28k
    int num_error_chunks = 0;
581
582
    // For each chunk in best choice blob i, count non-matching raw results.
583
4.49k
    while (chunk < end_chunk) {
584
2.21k
      if (chunk >= end_raw_chunk) {
585
3
        ++raw_blob;
586
3
        end_raw_chunk += raw_choice->state(raw_blob);
587
3
      }
588
2.21k
      if (best_choice->unichar_id(i) != raw_choice->unichar_id(raw_blob)) {
589
3
        avg_rating += raw_choice->certainty(raw_blob);
590
3
        ++num_error_chunks;
591
3
      }
592
2.21k
      ++chunk;
593
2.21k
    }
594
595
2.28k
    if (num_error_chunks > 0) {
596
2
      avg_rating /= num_error_chunks;
597
2
      *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
598
2.27k
    } else {
599
2.27k
      *thresholds = max_rating;
600
2.27k
    }
601
602
2.28k
    if (*thresholds > max_rating) {
603
2
      *thresholds = max_rating;
604
2
    }
605
2.28k
    if (*thresholds < min_rating) {
606
0
      *thresholds = min_rating;
607
0
    }
608
2.28k
  }
609
2.15k
}
610
611
// Saves a copy of the word_choice if it has the best unadjusted rating.
612
// Returns true if the word_choice was the new best.
613
291k
bool WERD_RES::LogNewRawChoice(WERD_CHOICE *word_choice) {
614
291k
  if (raw_choice == nullptr || word_choice->rating() < raw_choice->rating()) {
615
291k
    delete raw_choice;
616
291k
    raw_choice = new WERD_CHOICE(*word_choice);
617
291k
    raw_choice->set_permuter(TOP_CHOICE_PERM);
618
291k
    return true;
619
291k
  }
620
0
  return false;
621
291k
}
622
623
// Consumes word_choice by adding it to best_choices, (taking ownership) if
624
// the certainty for word_choice is some distance of the best choice in
625
// best_choices, or by deleting the word_choice and returning false.
626
// The best_choices list is kept in sorted order by rating. Duplicates are
627
// removed, and the list is kept no longer than max_num_choices in length.
628
// Returns true if the word_choice is still a valid pointer.
629
bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
630
490k
                                  WERD_CHOICE *word_choice) {
631
490k
  if (best_choice != nullptr) {
632
    // Throw out obviously bad choices to save some work.
633
    // TODO(rays) Get rid of this! This piece of code produces different
634
    // results according to the order in which words are found, which is an
635
    // undesirable behavior. It would be better to keep all the choices and
636
    // prune them later when more information is available.
637
300k
    float max_certainty_delta = StopperAmbigThreshold(
638
300k
        best_choice->adjust_factor(), word_choice->adjust_factor());
639
300k
    if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) {
640
18.3k
      max_certainty_delta = -kStopperAmbiguityThresholdOffset;
641
18.3k
    }
642
300k
    if (word_choice->certainty() - best_choice->certainty() <
643
300k
        max_certainty_delta) {
644
38.6k
      if (debug) {
645
0
        std::string bad_string;
646
0
        word_choice->string_and_lengths(&bad_string, nullptr);
647
0
        tprintf(
648
0
            "Discarding choice \"%s\" with an overly low certainty"
649
0
            " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
650
0
            bad_string.c_str(), word_choice->certainty(),
651
0
            best_choice->certainty(),
652
0
            max_certainty_delta + best_choice->certainty());
653
0
      }
654
38.6k
      delete word_choice;
655
38.6k
      return false;
656
38.6k
    }
657
300k
  }
658
659
  // Insert in the list in order of increasing rating, but knock out worse
660
  // string duplicates.
661
451k
  WERD_CHOICE_IT it(&best_choices);
662
451k
  const std::string &new_str = word_choice->unichar_string();
663
451k
  bool inserted = false;
664
451k
  int num_choices = 0;
665
451k
  if (!it.empty()) {
666
1.26M
    do {
667
1.26M
      WERD_CHOICE *choice = it.data();
668
1.26M
      if (choice->rating() > word_choice->rating() && !inserted) {
669
        // Time to insert.
670
213k
        it.add_before_stay_put(word_choice);
671
213k
        inserted = true;
672
213k
        if (num_choices == 0) {
673
118k
          best_choice = word_choice; // This is the new best.
674
118k
        }
675
213k
        ++num_choices;
676
213k
      }
677
1.26M
      if (choice->unichar_string() == new_str) {
678
25.0k
        if (inserted) {
679
          // New is better.
680
20.7k
          delete it.extract();
681
20.7k
        } else {
682
          // Old is better.
683
4.29k
          if (debug) {
684
0
            tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
685
0
                    new_str.c_str(), word_choice->rating(), choice->rating());
686
0
          }
687
4.29k
          delete word_choice;
688
4.29k
          return false;
689
4.29k
        }
690
1.24M
      } else {
691
1.24M
        ++num_choices;
692
1.24M
        if (num_choices > max_num_choices) {
693
47.8k
          delete it.extract();
694
47.8k
        }
695
1.24M
      }
696
1.26M
      it.forward();
697
1.26M
    } while (!it.at_first());
698
262k
  }
699
447k
  if (!inserted && num_choices < max_num_choices) {
700
230k
    it.add_to_end(word_choice);
701
230k
    inserted = true;
702
230k
    if (num_choices == 0) {
703
189k
      best_choice = word_choice; // This is the new best.
704
189k
    }
705
230k
  }
706
447k
  if (debug) {
707
0
    if (inserted) {
708
0
      tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
709
0
    } else {
710
0
      tprintf("Poor");
711
0
    }
712
0
    word_choice->print(" Word Choice");
713
0
  }
714
447k
  if (!inserted) {
715
3.96k
    delete word_choice;
716
3.96k
    return false;
717
3.96k
  }
718
443k
  return true;
719
447k
}
720
721
// Transfers ownership of the object held in *src to *dest. Whatever *dest
// pointed at beforehand is deleted, and *src is left holding nullptr.
template <class T>
static void MovePointerData(T **dest, T **src) {
  T *&target = *dest;
  T *&source = *src;
  delete target;
  target = source;
  source = nullptr;
}
729
730
// Prints a brief list of all the best choices.
731
0
void WERD_RES::PrintBestChoices() const {
732
0
  std::string alternates_str;
733
0
  WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
734
0
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
735
0
    if (!it.at_first()) {
736
0
      alternates_str += "\", \"";
737
0
    }
738
0
    alternates_str += it.data()->unichar_string();
739
0
  }
740
0
  tprintf("Alternates for \"%s\": {\"%s\"}\n",
741
0
          best_choice->unichar_string().c_str(), alternates_str.c_str());
742
0
}
743
744
// Returns the sum of the widths of the blob between start_blob and last_blob
745
// inclusive.
746
11.3M
int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) const {
747
11.3M
  int result = 0;
748
40.7M
  for (int b = start_blob; b <= last_blob; ++b) {
749
29.3M
    result += blob_widths[b];
750
29.3M
    if (b < last_blob) {
751
18.0M
      result += blob_gaps[b];
752
18.0M
    }
753
29.3M
  }
754
11.3M
  return result;
755
11.3M
}
756
// Returns the width of a gap between the specified blob and the next one.
757
22.9M
int WERD_RES::GetBlobsGap(unsigned blob_index) const {
758
22.9M
  if (blob_index >= blob_gaps.size()) {
759
0
    return 0;
760
0
  }
761
22.9M
  return blob_gaps[blob_index];
762
22.9M
}
763
764
// Returns the BLOB_CHOICE corresponding to the given index in the
765
// best choice word taken from the appropriate cell in the ratings MATRIX.
766
// Borrowed pointer, so do not delete. May return nullptr if there is no
767
// BLOB_CHOICE matching the unichar_id at the given index.
768
331k
BLOB_CHOICE *WERD_RES::GetBlobChoice(unsigned index) const {
769
331k
  if (index >= best_choice->length()) {
770
0
    return nullptr;
771
0
  }
772
331k
  BLOB_CHOICE_LIST *choices = GetBlobChoices(index);
773
331k
  return FindMatchingChoice(best_choice->unichar_id(index), choices);
774
331k
}
775
776
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
777
// best choice word taken from the appropriate cell in the ratings MATRIX.
778
// Borrowed pointer, so do not delete.
779
331k
BLOB_CHOICE_LIST *WERD_RES::GetBlobChoices(int index) const {
780
331k
  return best_choice->blob_choices(index, ratings);
781
331k
}
782
783
// Moves the results fields from word to this. This takes ownership of all
784
// the data, so src can be destructed.
785
70.6k
void WERD_RES::ConsumeWordResults(WERD_RES *word) {
786
70.6k
  denorm = word->denorm;
787
70.6k
  blob_row = word->blob_row;
788
70.6k
  MovePointerData(&chopped_word, &word->chopped_word);
789
70.6k
  MovePointerData(&rebuild_word, &word->rebuild_word);
790
70.6k
  MovePointerData(&box_word, &word->box_word);
791
272k
  for (auto data : seam_array) {
792
272k
    delete data;
793
272k
  }
794
70.6k
  seam_array = word->seam_array;
795
70.6k
  word->seam_array.clear();
796
  // TODO: optimize moves.
797
70.6k
  best_state = word->best_state;
798
70.6k
  word->best_state.clear();
799
70.6k
  correct_text = word->correct_text;
800
70.6k
  word->correct_text.clear();
801
70.6k
  blob_widths = word->blob_widths;
802
70.6k
  word->blob_widths.clear();
803
70.6k
  blob_gaps = word->blob_gaps;
804
70.6k
  word->blob_gaps.clear();
805
70.6k
  if (ratings != nullptr) {
806
70.6k
    ratings->delete_matrix_pointers();
807
70.6k
  }
808
70.6k
  MovePointerData(&ratings, &word->ratings);
809
70.6k
  best_choice = word->best_choice;
810
70.6k
  MovePointerData(&raw_choice, &word->raw_choice);
811
70.6k
  best_choices.clear();
812
70.6k
  WERD_CHOICE_IT wc_it(&best_choices);
813
70.6k
  wc_it.add_list_after(&word->best_choices);
814
70.6k
  reject_map = word->reject_map;
815
70.6k
  if (word->blamer_bundle != nullptr) {
816
0
    assert(blamer_bundle != nullptr);
817
0
    blamer_bundle->CopyResults(*(word->blamer_bundle));
818
0
  }
819
70.6k
  CopySimpleFields(*word);
820
70.6k
}
821
822
// Replace the best choice and rebuild box word.
823
// choice must be from the current best_choices list.
824
0
void WERD_RES::ReplaceBestChoice(WERD_CHOICE *choice) {
825
0
  best_choice = choice;
826
0
  RebuildBestState();
827
0
  SetupBoxWord();
828
  // Make up a fake reject map of the right length to keep the
829
  // rejection pass happy.
830
0
  reject_map.initialise(best_state.size());
831
0
  done = tess_accepted = tess_would_adapt = true;
832
0
  SetScriptPositions();
833
0
}
834
835
// Builds the rebuild_word and sets the best_state from the chopped_word and
836
// the best_choice->state.
837
98.6k
void WERD_RES::RebuildBestState() {
838
98.6k
  ASSERT_HOST(best_choice != nullptr);
839
98.6k
  delete rebuild_word;
840
98.6k
  rebuild_word = new TWERD;
841
98.6k
  if (seam_array.empty()) {
842
45.4k
    start_seam_list(chopped_word, &seam_array);
843
45.4k
  }
844
98.6k
  best_state.clear();
845
98.6k
  int start = 0;
846
441k
  for (unsigned i = 0; i < best_choice->length(); ++i) {
847
342k
    int length = best_choice->state(i);
848
342k
    best_state.push_back(length);
849
342k
    if (length > 1) {
850
93.7k
      SEAM::JoinPieces(seam_array, chopped_word->blobs, start,
851
93.7k
                       start + length - 1);
852
93.7k
    }
853
342k
    TBLOB *blob = chopped_word->blobs[start];
854
342k
    rebuild_word->blobs.push_back(new TBLOB(*blob));
855
342k
    if (length > 1) {
856
93.7k
      SEAM::BreakPieces(seam_array, chopped_word->blobs, start,
857
93.7k
                        start + length - 1);
858
93.7k
    }
859
342k
    start += length;
860
342k
  }
861
98.6k
}
862
863
// Copies the chopped_word to the rebuild_word, faking a best_state as well.
864
// Also sets up the output box_word.
865
0
void WERD_RES::CloneChoppedToRebuild() {
866
0
  delete rebuild_word;
867
0
  rebuild_word = new TWERD(*chopped_word);
868
0
  SetupBoxWord();
869
0
  auto word_len = box_word->length();
870
0
  best_state.reserve(word_len);
871
0
  correct_text.reserve(word_len);
872
0
  for (unsigned i = 0; i < word_len; ++i) {
873
0
    best_state.push_back(1);
874
0
    correct_text.emplace_back("");
875
0
  }
876
0
}
877
878
// Sets/replaces the box_word with one made from the rebuild_word.
879
91.3k
void WERD_RES::SetupBoxWord() {
880
91.3k
  delete box_word;
881
91.3k
  rebuild_word->ComputeBoundingBoxes();
882
91.3k
  box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word);
883
91.3k
  box_word->ClipToOriginalWord(denorm.block(), word);
884
91.3k
}
885
886
// Sets up the script positions in the output best_choice using the best_choice
887
// to get the unichars, and the unicharset to get the target positions.
888
0
void WERD_RES::SetScriptPositions() {
889
0
  best_choice->SetScriptPositions(small_caps, chopped_word);
890
0
}
891
// Sets all the blobs in all the words (raw choice and best choices) to be
892
// the given position. (When a sub/superscript is recognized as a separate
893
// word, it falls victim to the rule that a whole word cannot be sub or
894
// superscript, so this function overrides that problem.)
895
1.79k
void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) {
896
1.79k
  raw_choice->SetAllScriptPositions(position);
897
1.79k
  WERD_CHOICE_IT wc_it(&best_choices);
898
5.33k
  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
899
3.53k
    wc_it.data()->SetAllScriptPositions(position);
900
3.53k
  }
901
1.79k
}
902
903
// Classifies the word with some already-calculated BLOB_CHOICEs.
904
// The choices are an array of blob_count pointers to BLOB_CHOICE,
905
// providing a single classifier result for each blob.
906
// The BLOB_CHOICEs are consumed and the word takes ownership.
907
// The number of blobs in the box_word must match blob_count.
908
0
void WERD_RES::FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices) {
909
  // Setup the WERD_RES.
910
0
  ASSERT_HOST(box_word != nullptr);
911
0
  ASSERT_HOST(blob_count == box_word->length());
912
0
  ClearWordChoices();
913
0
  ClearRatings();
914
0
  ratings = new MATRIX(blob_count, 1);
915
0
  for (unsigned c = 0; c < blob_count; ++c) {
916
0
    auto *choice_list = new BLOB_CHOICE_LIST;
917
0
    BLOB_CHOICE_IT choice_it(choice_list);
918
0
    choice_it.add_after_then_move(choices[c]);
919
0
    ratings->put(c, c, choice_list);
920
0
  }
921
0
  FakeWordFromRatings(TOP_CHOICE_PERM);
922
0
  reject_map.initialise(blob_count);
923
0
  best_state.clear();
924
0
  best_state.resize(blob_count, 1);
925
0
  done = true;
926
0
}
927
928
// Creates a WERD_CHOICE for the word using the top choices from the leading
929
// diagonal of the ratings matrix.
930
90.7k
void WERD_RES::FakeWordFromRatings(PermuterType permuter) {
931
90.7k
  int num_blobs = ratings->dimension();
932
90.7k
  auto *word_choice = new WERD_CHOICE(uch_set, num_blobs);
933
90.7k
  word_choice->set_permuter(permuter);
934
218k
  for (int b = 0; b < num_blobs; ++b) {
935
127k
    UNICHAR_ID unichar_id = UNICHAR_SPACE;
936
    // Initialize rating and certainty like in WERD_CHOICE::make_bad().
937
127k
    float rating = WERD_CHOICE::kBadRating;
938
127k
    float certainty = -FLT_MAX;
939
127k
    BLOB_CHOICE_LIST *choices = ratings->get(b, b);
940
127k
    if (choices != nullptr && !choices->empty()) {
941
127k
      BLOB_CHOICE_IT bc_it(choices);
942
127k
      BLOB_CHOICE *choice = bc_it.data();
943
127k
      unichar_id = choice->unichar_id();
944
127k
      rating = choice->rating();
945
127k
      certainty = choice->certainty();
946
127k
    }
947
127k
    word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
948
127k
                                                   certainty);
949
127k
  }
950
90.7k
  LogNewRawChoice(word_choice);
951
  // Ownership of word_choice taken by word here.
952
90.7k
  LogNewCookedChoice(1, false, word_choice);
953
90.7k
}
954
955
// Copies the best_choice strings to the correct_text for adaption/training.
956
2.15k
void WERD_RES::BestChoiceToCorrectText() {
957
2.15k
  correct_text.clear();
958
2.15k
  ASSERT_HOST(best_choice != nullptr);
959
4.43k
  for (unsigned i = 0; i < best_choice->length(); ++i) {
960
2.28k
    UNICHAR_ID choice_id = best_choice->unichar_id(i);
961
2.28k
    const char *blob_choice = uch_set->id_to_unichar(choice_id);
962
2.28k
    correct_text.emplace_back(blob_choice);
963
2.28k
  }
964
2.15k
}
965
966
// Merges 2 adjacent blobs in the result if the permanent callback
967
// class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
968
// callback box_cb is nullptr or returns true, setting the merged blob
969
// result to the class returned from class_cb.
970
// Returns true if anything was merged.
971
bool WERD_RES::ConditionalBlobMerge(
972
    const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,
973
153k
    const std::function<bool(const TBOX &, const TBOX &)> &box_cb) {
974
153k
  ASSERT_HOST(best_choice->empty() || ratings != nullptr);
975
153k
  bool modified = false;
976
619k
  for (unsigned i = 0; i + 1 < best_choice->length(); ++i) {
977
465k
    UNICHAR_ID new_id =
978
465k
        class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1));
979
465k
    if (new_id != INVALID_UNICHAR_ID &&
980
465k
        (box_cb == nullptr ||
981
575
         box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) {
982
      // Raw choice should not be fixed.
983
407
      best_choice->set_unichar_id(new_id, i);
984
407
      modified = true;
985
407
      MergeAdjacentBlobs(i);
986
407
      const MATRIX_COORD &coord = best_choice->MatrixCoord(i);
987
407
      if (!coord.Valid(*ratings)) {
988
0
        ratings->IncreaseBandSize(coord.row + 1 - coord.col);
989
0
      }
990
407
      BLOB_CHOICE_LIST *blob_choices = GetBlobChoices(i);
991
407
      if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
992
        // Insert a fake result.
993
398
        auto *blob_choice = new BLOB_CHOICE;
994
398
        blob_choice->set_unichar_id(new_id);
995
398
        BLOB_CHOICE_IT bc_it(blob_choices);
996
398
        bc_it.add_before_then_move(blob_choice);
997
398
      }
998
407
    }
999
465k
  }
1000
153k
  return modified;
1001
153k
}
1002
1003
// Merges 2 adjacent blobs in the result (index and index+1) and corrects
1004
// all the data to account for the change.
1005
407
void WERD_RES::MergeAdjacentBlobs(unsigned index) {
1006
407
  if (reject_map.length() == best_choice->length()) {
1007
284
    reject_map.remove_pos(index);
1008
284
  }
1009
407
  best_choice->remove_unichar_id(index + 1);
1010
407
  rebuild_word->MergeBlobs(index, index + 2);
1011
407
  box_word->MergeBoxes(index, index + 2);
1012
407
  if (index + 1 < best_state.size()) {
1013
407
    best_state[index] += best_state[index + 1];
1014
407
    best_state.erase(best_state.begin() + index + 1);
1015
407
  }
1016
407
}
1017
1018
// TODO(tkielbus) Decide between keeping this behavior here or modifying the
1019
// training data.
1020
1021
// Utility function for fix_quotes
// Returns non-zero if the character starting at signed_str, whose UTF-8
// encoding is length bytes long, is a quote: the 1-byte ' or `, or one of
// the 3-byte curved single quotes E2 80 98 / E2 80 99.
static int is_simple_quote(const char *signed_str, int length) {
  // Compare as unsigned bytes so the values above 0x7f match.
  const auto *bytes = reinterpret_cast<const unsigned char *>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 bytes curved quotes share their first two bytes.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return false;
}
1033
1034
// Callback helper for fix_quotes returns a double quote if both
1035
// arguments are quote, otherwise INVALID_UNICHAR_ID.
1036
232k
UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
1037
232k
  const char *ch = uch_set->id_to_unichar(id1);
1038
232k
  const char *next_ch = uch_set->id_to_unichar(id2);
1039
232k
  if (is_simple_quote(ch, strlen(ch)) &&
1040
232k
      is_simple_quote(next_ch, strlen(next_ch))) {
1041
284
    return uch_set->unichar_to_id("\"");
1042
284
  }
1043
232k
  return INVALID_UNICHAR_ID;
1044
232k
}
1045
1046
// Change pairs of quotes to double quotes.
1047
76.7k
void WERD_RES::fix_quotes() {
1048
76.7k
  if (!uch_set->contains_unichar("\"") ||
1049
76.7k
      !uch_set->get_enabled(uch_set->unichar_to_id("\""))) {
1050
0
    return; // Don't create it if it is disallowed.
1051
0
  }
1052
1053
76.7k
  using namespace std::placeholders; // for _1, _2
1054
76.7k
  ConditionalBlobMerge(std::bind(&WERD_RES::BothQuotes, this, _1, _2), nullptr);
1055
76.7k
}
1056
1057
// Callback helper for fix_hyphens returns UNICHAR_ID of - if both
1058
// arguments are hyphen, otherwise INVALID_UNICHAR_ID.
1059
232k
UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
1060
232k
  const char *ch = uch_set->id_to_unichar(id1);
1061
232k
  const char *next_ch = uch_set->id_to_unichar(id2);
1062
232k
  if (strlen(ch) == 1 && strlen(next_ch) == 1 && (*ch == '-' || *ch == '~') &&
1063
232k
      (*next_ch == '-' || *next_ch == '~')) {
1064
291
    return uch_set->unichar_to_id("-");
1065
291
  }
1066
232k
  return INVALID_UNICHAR_ID;
1067
232k
}
1068
1069
// Callback helper for fix_hyphens returns true if box1 and box2 overlap
1070
// (assuming both on the same textline, are in order and a chopped em dash.)
1071
291
bool WERD_RES::HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2) {
1072
291
  return box1.right() >= box2.left();
1073
291
}
1074
1075
// Change pairs of hyphens to a single hyphen if the bounding boxes touch
1076
// Typically a long dash which has been segmented.
1077
76.7k
void WERD_RES::fix_hyphens() {
1078
76.7k
  if (!uch_set->contains_unichar("-") ||
1079
76.7k
      !uch_set->get_enabled(uch_set->unichar_to_id("-"))) {
1080
0
    return; // Don't create it if it is disallowed.
1081
0
  }
1082
1083
76.7k
  using namespace std::placeholders; // for _1, _2
1084
76.7k
  ConditionalBlobMerge(std::bind(&WERD_RES::BothHyphens, this, _1, _2),
1085
76.7k
                       std::bind(&WERD_RES::HyphenBoxesOverlap, this, _1, _2));
1086
76.7k
}
1087
1088
// Callback helper for merge_tess_fails returns a space if both
1089
// arguments are space, otherwise INVALID_UNICHAR_ID.
1090
0
UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
1091
0
  if (id1 == id2 && id1 == uch_set->unichar_to_id(" ")) {
1092
0
    return id1;
1093
0
  } else {
1094
0
    return INVALID_UNICHAR_ID;
1095
0
  }
1096
0
}
1097
1098
// Change pairs of tess failures to a single one
1099
0
void WERD_RES::merge_tess_fails() {
1100
0
  using namespace std::placeholders; // for _1, _2
1101
0
  if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces, this, _1, _2),
1102
0
                           nullptr)) {
1103
0
    unsigned len = best_choice->length();
1104
0
    ASSERT_HOST(reject_map.length() == len);
1105
0
    ASSERT_HOST(box_word->length() == len);
1106
0
  }
1107
0
}
1108
1109
// Returns true if the collection of count pieces, starting at start, are all
1110
// natural connected components, ie there are no real chops involved.
1111
0
bool WERD_RES::PiecesAllNatural(int start, int count) const {
1112
  // all seams must have no splits.
1113
0
  for (int index = start; index < start + count - 1; ++index) {
1114
0
    if (index >= 0 && static_cast<size_t>(index) < seam_array.size()) {
1115
0
      SEAM *seam = seam_array[index];
1116
0
      if (seam != nullptr && seam->HasAnySplits()) {
1117
0
        return false;
1118
0
      }
1119
0
    }
1120
0
  }
1121
0
  return true;
1122
0
}
1123
1124
503k
// Releases everything the word result holds by delegating to Clear().
WERD_RES::~WERD_RES() {
  this->Clear();
}
1127
1128
530k
void WERD_RES::Clear() {
1129
530k
  if (combination) {
1130
164k
    delete word;
1131
164k
  }
1132
530k
  word = nullptr;
1133
530k
  delete blamer_bundle;
1134
530k
  blamer_bundle = nullptr;
1135
530k
  ClearResults();
1136
530k
}
1137
1138
871k
void WERD_RES::ClearResults() {
1139
871k
  done = false;
1140
871k
  fontinfo = nullptr;
1141
871k
  fontinfo2 = nullptr;
1142
871k
  fontinfo_id_count = 0;
1143
871k
  fontinfo_id2_count = 0;
1144
871k
  delete bln_boxes;
1145
871k
  bln_boxes = nullptr;
1146
871k
  blob_row = nullptr;
1147
871k
  delete chopped_word;
1148
871k
  chopped_word = nullptr;
1149
871k
  delete rebuild_word;
1150
871k
  rebuild_word = nullptr;
1151
871k
  delete box_word;
1152
871k
  box_word = nullptr;
1153
871k
  best_state.clear();
1154
871k
  correct_text.clear();
1155
1.47M
  for (auto data : seam_array) {
1156
1.47M
    delete data;
1157
1.47M
  }
1158
871k
  seam_array.clear();
1159
871k
  blob_widths.clear();
1160
871k
  blob_gaps.clear();
1161
871k
  ClearRatings();
1162
871k
  ClearWordChoices();
1163
871k
  if (blamer_bundle != nullptr) {
1164
0
    blamer_bundle->ClearResults();
1165
0
  }
1166
871k
}
1167
1.21M
void WERD_RES::ClearWordChoices() {
1168
1.21M
  best_choice = nullptr;
1169
1.21M
  delete raw_choice;
1170
1.21M
  raw_choice = nullptr;
1171
1.21M
  best_choices.clear();
1172
1.21M
  delete ep_choice;
1173
1.21M
  ep_choice = nullptr;
1174
1.21M
}
1175
871k
void WERD_RES::ClearRatings() {
1176
871k
  if (ratings != nullptr) {
1177
343k
    ratings->delete_matrix_pointers();
1178
343k
    delete ratings;
1179
343k
    ratings = nullptr;
1180
343k
  }
1181
871k
}
1182
1183
930k
int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
1184
930k
  ASSERT_HOST(page_res == other.page_res);
1185
930k
  if (other.block_res == nullptr) {
1186
    // other points to the end of the page.
1187
0
    if (block_res == nullptr) {
1188
0
      return 0;
1189
0
    }
1190
0
    return -1;
1191
0
  }
1192
930k
  if (block_res == nullptr) {
1193
543k
    return 1; // we point to the end of the page.
1194
543k
  }
1195
386k
  if (block_res == other.block_res) {
1196
386k
    if (other.row_res == nullptr || row_res == nullptr) {
1197
      // this should only happen if we hit an image block.
1198
0
      return 0;
1199
0
    }
1200
386k
    if (row_res == other.row_res) {
1201
      // we point to the same block and row.
1202
195k
      ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
1203
195k
      if (word_res == other.word_res) {
1204
        // we point to the same word!
1205
195k
        return 0;
1206
195k
      }
1207
1208
0
      WERD_RES_IT word_res_it(&row_res->word_res_list);
1209
0
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1210
0
           word_res_it.forward()) {
1211
0
        if (word_res_it.data() == word_res) {
1212
0
          return -1;
1213
0
        } else if (word_res_it.data() == other.word_res) {
1214
0
          return 1;
1215
0
        }
1216
0
      }
1217
0
      ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1218
0
    }
1219
1220
    // we both point to the same block, but different rows.
1221
191k
    ROW_RES_IT row_res_it(&block_res->row_res_list);
1222
2.27M
    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1223
2.27M
         row_res_it.forward()) {
1224
2.27M
      if (row_res_it.data() == row_res) {
1225
167k
        return -1;
1226
2.11M
      } else if (row_res_it.data() == other.row_res) {
1227
23.3k
        return 1;
1228
23.3k
      }
1229
2.27M
    }
1230
0
    ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1231
0
  }
1232
1233
  // We point to different blocks.
1234
0
  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1235
0
  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
1236
0
       block_res_it.forward()) {
1237
0
    if (block_res_it.data() == block_res) {
1238
0
      return -1;
1239
0
    } else if (block_res_it.data() == other.block_res) {
1240
0
      return 1;
1241
0
    }
1242
0
  }
1243
  // Shouldn't happen...
1244
0
  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1245
0
  return 0;
1246
0
}
1247
1248
// Inserts the new_word as a combination owned by a corresponding WERD_RES
1249
// before the current position. The simple fields of the WERD_RES are copied
1250
// from clone_res and the resulting WERD_RES is returned for further setup
1251
// with best_choice etc.
1252
WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res,
1253
0
                                             WERD *new_word) {
1254
  // Make a WERD_RES for the new_word.
1255
0
  auto *new_res = new WERD_RES(new_word);
1256
0
  new_res->CopySimpleFields(clone_res);
1257
0
  new_res->combination = true;
1258
  // Insert into the appropriate place in the ROW_RES.
1259
0
  WERD_RES_IT wr_it(&row()->word_res_list);
1260
0
  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1261
0
    WERD_RES *word = wr_it.data();
1262
0
    if (word == word_res) {
1263
0
      break;
1264
0
    }
1265
0
  }
1266
0
  ASSERT_HOST(!wr_it.cycled_list());
1267
0
  wr_it.add_before_then_move(new_res);
1268
0
  if (wr_it.at_first()) {
1269
    // This is the new first word, so reset the member iterator so it
1270
    // detects the cycled_list state correctly.
1271
0
    ResetWordIterator();
1272
0
  }
1273
0
  return new_res;
1274
0
}
1275
1276
// Helper computes the boundaries between blobs in the word. The blob bounds
1277
// are likely very poor, if they come from LSTM, where it only outputs the
1278
// character at one pixel within it, so we find the midpoints between them.
1279
static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box,
1280
                            C_BLOB_LIST *next_word_blobs,
1281
90.7k
                            std::vector<int> *blob_ends) {
1282
90.7k
  C_BLOB_IT blob_it(word.word->cblob_list());
1283
127k
  for (int length : word.best_state) {
1284
    // Get the bounding box of the fake blobs
1285
127k
    TBOX blob_box = blob_it.data()->bounding_box();
1286
127k
    blob_it.forward();
1287
127k
    for (int b = 1; b < length; ++b) {
1288
0
      blob_box += blob_it.data()->bounding_box();
1289
0
      blob_it.forward();
1290
0
    }
1291
    // This blob_box is crap, so for now we are only looking for the
1292
    // boundaries between them.
1293
127k
    int blob_end = INT32_MAX;
1294
127k
    if (!blob_it.at_first() || next_word_blobs != nullptr) {
1295
47.6k
      if (blob_it.at_first()) {
1296
11.1k
        blob_it.set_to_list(next_word_blobs);
1297
11.1k
      }
1298
47.6k
      blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
1299
47.6k
    }
1300
127k
    blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
1301
127k
    blob_ends->push_back(blob_end);
1302
127k
  }
1303
90.7k
  blob_ends->back() = clip_box.right();
1304
90.7k
}
1305
1306
// Helper computes the bounds of a word by restricting it to existing words
1307
// that significantly overlap.
1308
static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words,
1309
90.7k
                              int w_index, TBOX prev_box, WERD_RES_IT w_it) {
1310
90.7k
  constexpr int kSignificantOverlapFraction = 4;
1311
90.7k
  TBOX clipped_box;
1312
90.7k
  TBOX current_box = words[w_index]->word->bounding_box();
1313
90.7k
  TBOX next_box;
1314
90.7k
  if (static_cast<size_t>(w_index + 1) < words.size() &&
1315
90.7k
      words[w_index + 1] != nullptr && words[w_index + 1]->word != nullptr) {
1316
11.1k
    next_box = words[w_index + 1]->word->bounding_box();
1317
11.1k
  }
1318
241k
  for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;
1319
150k
       w_it.forward()) {
1320
150k
    if (w_it.data() == nullptr || w_it.data()->word == nullptr) {
1321
0
      continue;
1322
0
    }
1323
150k
    TBOX w_box = w_it.data()->word->bounding_box();
1324
150k
    int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);
1325
150k
    int width_limit = w_box.width() / kSignificantOverlapFraction;
1326
150k
    int min_significant_overlap = std::max(height_limit, width_limit);
1327
150k
    int overlap = w_box.intersection(current_box).width();
1328
150k
    int prev_overlap = w_box.intersection(prev_box).width();
1329
150k
    int next_overlap = w_box.intersection(next_box).width();
1330
150k
    if (overlap > min_significant_overlap) {
1331
96.4k
      if (prev_overlap > min_significant_overlap) {
1332
        // We have no choice but to use the LSTM word edge.
1333
746
        clipped_box.set_left(current_box.left());
1334
95.6k
      } else if (next_overlap > min_significant_overlap) {
1335
        // We have no choice but to use the LSTM word edge.
1336
713
        clipped_box.set_right(current_box.right());
1337
94.9k
      } else {
1338
94.9k
        clipped_box += w_box;
1339
94.9k
      }
1340
96.4k
    }
1341
150k
  }
1342
90.7k
  if (clipped_box.height() <= 0) {
1343
18.4k
    clipped_box.set_top(current_box.top());
1344
18.4k
    clipped_box.set_bottom(current_box.bottom());
1345
18.4k
  }
1346
90.7k
  if (clipped_box.width() <= 0) {
1347
18.3k
    clipped_box = current_box;
1348
18.3k
  }
1349
90.7k
  return clipped_box;
1350
90.7k
}
1351
1352
// Helper moves the blob from src to dest. If it isn't contained by clip_box,
1353
// the blob is replaced by a fake that is contained.
1354
static TBOX MoveAndClipBlob(C_BLOB_IT *src_it, C_BLOB_IT *dest_it,
1355
404k
                            const TBOX &clip_box) {
1356
404k
  C_BLOB *src_blob = src_it->extract();
1357
404k
  TBOX box = src_blob->bounding_box();
1358
404k
  if (!clip_box.contains(box)) {
1359
27.2k
    int left =
1360
27.2k
        ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1);
1361
27.2k
    int right =
1362
27.2k
        ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right());
1363
27.2k
    int top =
1364
27.2k
        ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top());
1365
27.2k
    int bottom =
1366
27.2k
        ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1);
1367
27.2k
    box = TBOX(left, bottom, right, top);
1368
27.2k
    delete src_blob;
1369
27.2k
    src_blob = C_BLOB::FakeBlob(box);
1370
27.2k
  }
1371
404k
  dest_it->add_after_then_move(src_blob);
1372
404k
  return box;
1373
404k
}
1374
1375
// Replaces the current WERD/WERD_RES with the given words. The given words
1376
// contain fake blobs that indicate the position of the characters. These are
1377
// replaced with real blobs from the current word as much as possible.
1378
void PAGE_RES_IT::ReplaceCurrentWord(
1379
79.6k
    tesseract::PointerVector<WERD_RES> *words) {
1380
79.6k
  if (words->empty()) {
1381
0
    DeleteCurrentWord();
1382
0
    return;
1383
0
  }
1384
79.6k
  WERD_RES *input_word = word();
1385
  // Set the BOL/EOL flags on the words from the input word.
1386
79.6k
  if (input_word->word->flag(W_BOL)) {
1387
68.7k
    (*words)[0]->word->set_flag(W_BOL, true);
1388
68.7k
  } else {
1389
10.9k
    (*words)[0]->word->set_blanks(input_word->word->space());
1390
10.9k
  }
1391
79.6k
  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
1392
1393
  // Move the blobs from the input word to the new set of words.
1394
  // If the input word_res is a combination, then the replacements will also be
1395
  // combinations, and will own their own words. If the input word_res is not a
1396
  // combination, then the final replacements will not be either, (although it
1397
  // is allowed for the input words to be combinations) and their words
1398
  // will get put on the row list. This maintains the ownership rules.
1399
79.6k
  WERD_IT w_it(row()->row->word_list());
1400
79.6k
  if (!input_word->combination) {
1401
36.9k
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1402
36.9k
      WERD *word = w_it.data();
1403
36.9k
      if (word == input_word->word) {
1404
10.7k
        break;
1405
10.7k
      }
1406
36.9k
    }
1407
    // w_it is now set to the input_word's word.
1408
10.7k
    ASSERT_HOST(!w_it.cycled_list());
1409
10.7k
  }
1410
  // Insert into the appropriate place in the ROW_RES.
1411
79.6k
  WERD_RES_IT wr_it(&row()->word_res_list);
1412
120k
  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1413
120k
    WERD_RES *word = wr_it.data();
1414
120k
    if (word == input_word) {
1415
79.6k
      break;
1416
79.6k
    }
1417
120k
  }
1418
79.6k
  ASSERT_HOST(!wr_it.cycled_list());
1419
  // Since we only have an estimate of the bounds between blobs, use the blob
1420
  // x-middle as the determiner of where to put the blobs
1421
79.6k
  C_BLOB_IT src_b_it(input_word->word->cblob_list());
1422
79.6k
  src_b_it.sort(&C_BLOB::SortByXMiddle);
1423
79.6k
  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
1424
79.6k
  rej_b_it.sort(&C_BLOB::SortByXMiddle);
1425
79.6k
  TBOX clip_box;
1426
170k
  for (size_t w = 0; w < words->size(); ++w) {
1427
90.7k
    WERD_RES *word_w = (*words)[w];
1428
90.7k
    clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
1429
    // Compute blob boundaries.
1430
90.7k
    std::vector<int> blob_ends;
1431
90.7k
    C_BLOB_LIST *next_word_blobs =
1432
90.7k
        w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
1433
90.7k
    ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1434
    // Remove the fake blobs on the current word, but keep safe for back-up if
1435
    // no blob can be found.
1436
90.7k
    C_BLOB_LIST fake_blobs;
1437
90.7k
    C_BLOB_IT fake_b_it(&fake_blobs);
1438
90.7k
    fake_b_it.add_list_after(word_w->word->cblob_list());
1439
90.7k
    fake_b_it.move_to_first();
1440
90.7k
    word_w->word->cblob_list()->clear();
1441
90.7k
    C_BLOB_IT dest_it(word_w->word->cblob_list());
1442
    // Build the box word as we move the blobs.
1443
90.7k
    auto *box_word = new tesseract::BoxWord;
1444
218k
    for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
1445
127k
      int end_x = blob_ends[i];
1446
127k
      TBOX blob_box;
1447
      // Add the blobs up to end_x.
1448
452k
      while (!src_b_it.empty() &&
1449
452k
             src_b_it.data()->bounding_box().x_middle() < end_x) {
1450
325k
        blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1451
325k
        src_b_it.forward();
1452
325k
      }
1453
191k
      while (!rej_b_it.empty() &&
1454
191k
             rej_b_it.data()->bounding_box().x_middle() < end_x) {
1455
64.2k
        blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1456
64.2k
        rej_b_it.forward();
1457
64.2k
      }
1458
127k
      if (blob_box.null_box()) {
1459
        // Use the original box as a back-up.
1460
15.3k
        blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1461
15.3k
      }
1462
127k
      box_word->InsertBox(i, blob_box);
1463
127k
    }
1464
90.7k
    delete word_w->box_word;
1465
90.7k
    word_w->box_word = box_word;
1466
90.7k
    if (!input_word->combination) {
1467
      // Insert word_w->word into the ROW. It doesn't own its word, so the
1468
      // ROW needs to own it.
1469
10.8k
      w_it.add_before_stay_put(word_w->word);
1470
10.8k
      word_w->combination = false;
1471
10.8k
    }
1472
90.7k
    (*words)[w] = nullptr; // We are taking ownership.
1473
90.7k
    wr_it.add_before_stay_put(word_w);
1474
90.7k
  }
1475
  // We have taken ownership of the words.
1476
79.6k
  words->clear();
1477
  // Delete the current word, which has been replaced. We could just call
1478
  // DeleteCurrentWord, but that would iterate both lists again, and we know
1479
  // we are already in the right place.
1480
79.6k
  if (!input_word->combination) {
1481
10.7k
    delete w_it.extract();
1482
10.7k
  }
1483
79.6k
  delete wr_it.extract();
1484
79.6k
  ResetWordIterator();
1485
79.6k
}
1486
1487
// Deletes the current WERD_RES and its underlying WERD.
1488
382
void PAGE_RES_IT::DeleteCurrentWord() {
1489
  // Check that this word is as we expect. part_of_combos are NEVER iterated
1490
  // by the normal iterator, so we should never be trying to delete them.
1491
382
  ASSERT_HOST(!word_res->part_of_combo);
1492
382
  if (!word_res->combination) {
1493
    // Combinations own their own word, so we won't find the word on the
1494
    // row's word_list, but it is legitimate to try to delete them.
1495
    // Delete word from the ROW when not a combination.
1496
71
    WERD_IT w_it(row()->row->word_list());
1497
245
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1498
245
      if (w_it.data() == word_res->word) {
1499
71
        break;
1500
71
      }
1501
245
    }
1502
71
    ASSERT_HOST(!w_it.cycled_list());
1503
71
    delete w_it.extract();
1504
71
  }
1505
  // Remove the WERD_RES for the new_word.
1506
  // Remove the WORD_RES from the ROW_RES.
1507
382
  WERD_RES_IT wr_it(&row()->word_res_list);
1508
1.05k
  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1509
1.05k
    if (wr_it.data() == word_res) {
1510
382
      word_res = nullptr;
1511
382
      break;
1512
382
    }
1513
1.05k
  }
1514
382
  ASSERT_HOST(!wr_it.cycled_list());
1515
382
  delete wr_it.extract();
1516
382
  ResetWordIterator();
1517
382
}
1518
1519
// Makes the current word a fuzzy space if not already fuzzy. Updates
1520
// corresponding part of combo if required.
1521
0
void PAGE_RES_IT::MakeCurrentWordFuzzy() {
1522
0
  WERD *real_word = word_res->word;
1523
0
  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1524
0
    real_word->set_flag(W_FUZZY_SP, true);
1525
0
    if (word_res->combination) {
1526
      // The next word should be the corresponding part of combo, but we have
1527
      // already stepped past it, so find it by search.
1528
0
      WERD_RES_IT wr_it(&row()->word_res_list);
1529
0
      for (wr_it.mark_cycle_pt();
1530
0
           !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1531
0
      }
1532
0
      wr_it.forward();
1533
0
      ASSERT_HOST(wr_it.data()->part_of_combo);
1534
0
      real_word = wr_it.data()->word;
1535
0
      ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1536
0
                  !real_word->flag(W_FUZZY_NON));
1537
0
      real_word->set_flag(W_FUZZY_SP, true);
1538
0
    }
1539
0
  }
1540
0
}
1541
1542
/*************************************************************************
1543
 * PAGE_RES_IT::restart_page
1544
 *
1545
 * Set things up at the start of the page
1546
 *************************************************************************/
1547
1548
4.46M
WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
1549
4.46M
  block_res_it.set_to_list(&page_res->block_res_list);
1550
4.46M
  block_res_it.mark_cycle_pt();
1551
4.46M
  prev_block_res = nullptr;
1552
4.46M
  prev_row_res = nullptr;
1553
4.46M
  prev_word_res = nullptr;
1554
4.46M
  block_res = nullptr;
1555
4.46M
  row_res = nullptr;
1556
4.46M
  word_res = nullptr;
1557
4.46M
  next_block_res = nullptr;
1558
4.46M
  next_row_res = nullptr;
1559
4.46M
  next_word_res = nullptr;
1560
4.46M
  internal_forward(true, empty_ok);
1561
4.46M
  return internal_forward(false, empty_ok);
1562
4.46M
}
1563
1564
// Recovers from operations on the current word, such as in InsertCloneWord
1565
// and DeleteCurrentWord.
1566
// Resets the word_res_it so that it is one past the next_word_res, as
1567
// it should be after internal_forward. If next_row_res != row_res,
1568
// then the next_word_res is in the next row, so there is no need to do
1569
// anything to word_res_it, but it is still a good idea to reset the pointers
1570
// word_res and prev_word_res, which are still in the current row.
1571
80.0k
void PAGE_RES_IT::ResetWordIterator() {
1572
80.0k
  if (row_res == next_row_res) {
1573
    // Reset the member iterator so it can move forward and detect the
1574
    // cycled_list state correctly.
1575
9.49k
    word_res_it.move_to_first();
1576
9.49k
    for (word_res_it.mark_cycle_pt();
1577
49.7k
         !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1578
40.2k
         word_res_it.forward()) {
1579
40.2k
      if (!word_res_it.data()->part_of_combo) {
1580
29.1k
        if (prev_row_res == row_res) {
1581
24.5k
          prev_word_res = word_res;
1582
24.5k
        }
1583
29.1k
        word_res = word_res_it.data();
1584
29.1k
      }
1585
40.2k
    }
1586
9.49k
    ASSERT_HOST(!word_res_it.cycled_list());
1587
9.49k
    wr_it_of_next_word = word_res_it;
1588
9.49k
    word_res_it.forward();
1589
70.5k
  } else {
1590
    // word_res_it is OK, but reset word_res and prev_word_res if needed.
1591
70.5k
    WERD_RES_IT wr_it(&row_res->word_res_list);
1592
265k
    for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1593
195k
      if (!wr_it.data()->part_of_combo) {
1594
93.0k
        if (prev_row_res == row_res) {
1595
18.5k
          prev_word_res = word_res;
1596
18.5k
        }
1597
93.0k
        word_res = wr_it.data();
1598
93.0k
      }
1599
195k
    }
1600
70.5k
  }
1601
80.0k
}
1602
1603
/*************************************************************************
1604
 * PAGE_RES_IT::internal_forward
1605
 *
1606
 * Find the next word on the page. If empty_ok is true, then non-text blocks
1607
 * and text blocks with no text are visited as if they contain a single
1608
 * imaginary word in a single imaginary row. (word() and row() both return
1609
 *nullptr in such a block and the return value is nullptr.) If empty_ok is
1610
 *false, the old behaviour is maintained. Each real word is visited and empty
1611
 *and non-text blocks and rows are skipped. new_block is used to initialize the
1612
 *iterators for a new block. The iterator maintains pointers to block, row and
1613
 *word for the previous, current and next words.  These are correct, regardless
1614
 *of block/row boundaries. nullptr values denote start and end of the page.
1615
 *************************************************************************/
1616
1617
95.7M
WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1618
95.7M
  bool new_row = false;
1619
1620
95.7M
  prev_block_res = block_res;
1621
95.7M
  prev_row_res = row_res;
1622
95.7M
  prev_word_res = word_res;
1623
95.7M
  block_res = next_block_res;
1624
95.7M
  row_res = next_row_res;
1625
95.7M
  word_res = next_word_res;
1626
95.7M
  wr_it_of_current_word = wr_it_of_next_word;
1627
95.7M
  next_block_res = nullptr;
1628
95.7M
  next_row_res = nullptr;
1629
95.7M
  next_word_res = nullptr;
1630
1631
96.7M
  while (!block_res_it.cycled_list()) {
1632
94.9M
    if (new_block) {
1633
4.45M
      new_block = false;
1634
4.45M
      row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1635
4.45M
      row_res_it.mark_cycle_pt();
1636
4.45M
      if (row_res_it.empty() && empty_ok) {
1637
0
        next_block_res = block_res_it.data();
1638
0
        break;
1639
0
      }
1640
4.45M
      new_row = true;
1641
4.45M
    }
1642
154M
    while (!row_res_it.cycled_list()) {
1643
153M
      if (new_row) {
1644
63.2M
        new_row = false;
1645
63.2M
        word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1646
63.2M
        word_res_it.mark_cycle_pt();
1647
63.2M
      }
1648
      // Skip any part_of_combo words.
1649
231M
      while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) {
1650
77.8M
        word_res_it.forward();
1651
77.8M
      }
1652
153M
      if (!word_res_it.cycled_list()) {
1653
93.9M
        next_block_res = block_res_it.data();
1654
93.9M
        next_row_res = row_res_it.data();
1655
93.9M
        next_word_res = word_res_it.data();
1656
93.9M
        wr_it_of_next_word = word_res_it;
1657
93.9M
        word_res_it.forward();
1658
93.9M
        goto foundword;
1659
93.9M
      }
1660
      // end of row reached
1661
59.8M
      row_res_it.forward();
1662
59.8M
      new_row = true;
1663
59.8M
    }
1664
    // end of block reached
1665
1.02M
    block_res_it.forward();
1666
1.02M
    new_block = true;
1667
1.02M
  }
1668
95.7M
foundword:
1669
  // Update prev_word_best_choice pointer.
1670
95.7M
  if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
1671
95.7M
    *page_res->prev_word_best_choice = (new_block || prev_word_res == nullptr)
1672
95.7M
                                           ? nullptr
1673
95.7M
                                           : prev_word_res->best_choice;
1674
95.7M
  }
1675
95.7M
  return word_res;
1676
95.7M
}
1677
1678
/*************************************************************************
1679
 * PAGE_RES_IT::restart_row()
1680
 *
1681
 * Move to the beginning (leftmost word) of the current row.
1682
 *************************************************************************/
1683
3.78M
WERD_RES *PAGE_RES_IT::restart_row() {
1684
3.78M
  ROW_RES *row = this->row();
1685
3.78M
  if (!row) {
1686
1.35k
    return nullptr;
1687
1.35k
  }
1688
68.8M
  for (restart_page(); this->row() != row; forward()) {
1689
    // pass
1690
65.0M
  }
1691
3.78M
  return word();
1692
3.78M
}
1693
1694
/*************************************************************************
1695
 * PAGE_RES_IT::forward_paragraph
1696
 *
1697
 * Move to the beginning of the next paragraph, allowing empty blocks.
1698
 *************************************************************************/
1699
1700
941k
WERD_RES *PAGE_RES_IT::forward_paragraph() {
1701
16.3M
  while (block_res == next_block_res &&
1702
16.3M
         (next_row_res != nullptr && next_row_res->row != nullptr &&
1703
15.7M
          row_res->row->para() == next_row_res->row->para())) {
1704
15.4M
    internal_forward(false, true);
1705
15.4M
  }
1706
941k
  return internal_forward(false, true);
1707
941k
}
1708
1709
/*************************************************************************
1710
 * PAGE_RES_IT::forward_block
1711
 *
1712
 * Move to the beginning of the next block, allowing empty blocks.
1713
 *************************************************************************/
1714
1715
7.05k
WERD_RES *PAGE_RES_IT::forward_block() {
1716
122k
  while (block_res == next_block_res) {
1717
115k
    internal_forward(false, true);
1718
115k
  }
1719
7.05k
  return internal_forward(false, true);
1720
7.05k
}
1721
1722
0
void PAGE_RES_IT::rej_stat_word() {
1723
0
  int16_t chars_in_word;
1724
0
  int16_t rejects_in_word = 0;
1725
1726
0
  chars_in_word = word_res->reject_map.length();
1727
0
  page_res->char_count += chars_in_word;
1728
0
  block_res->char_count += chars_in_word;
1729
0
  row_res->char_count += chars_in_word;
1730
1731
0
  rejects_in_word = word_res->reject_map.reject_count();
1732
1733
0
  page_res->rej_count += rejects_in_word;
1734
0
  block_res->rej_count += rejects_in_word;
1735
0
  row_res->rej_count += rejects_in_word;
1736
0
  if (chars_in_word == rejects_in_word) {
1737
0
    row_res->whole_word_rej_count += rejects_in_word;
1738
0
  }
1739
0
}
1740
1741
} // namespace tesseract