Coverage Report

Created: 2025-07-23 07:12

/src/tesseract/src/wordrec/chopper.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * File:         chopper.cpp  (Formerly chopper.c)
4
 * Author:       Mark Seaman, OCR Technology
5
 *
6
 * (c) Copyright 1987, Hewlett-Packard Company.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 *****************************************************************************/
18
19
// Include automatically generated configuration file if running autoconf.
20
#ifdef HAVE_CONFIG_H
21
#  include "config_auto.h"
22
#endif
23
24
#include "blamer.h"         // for BlamerBundle, IRR_CORRECT
25
#include "blobs.h"          // for TPOINT, TBLOB, EDGEPT, TESSLINE, divisible_blob
26
#include "dict.h"           // for Dict
27
#include "lm_pain_points.h" // for LMPainPoints
28
#include "lm_state.h"       // for BestChoiceBundle
29
#include "matrix.h"         // for MATRIX
30
#include "normalis.h"       // for DENORM
31
#include "pageres.h"        // for WERD_RES
32
#include "params.h"         // for IntParam, BoolParam
33
#include "ratngs.h"         // for BLOB_CHOICE (ptr only), BLOB_CHOICE_LIST (ptr ...
34
#include "rect.h"           // for TBOX
35
#include "render.h"         // for display_blob
36
#include "seam.h"           // for SEAM
37
#include "split.h"          // for remove_edgept
38
#include "stopper.h"        // for DANGERR
39
#include "tprintf.h"        // for tprintf
40
#include "wordrec.h"        // for Wordrec, SegSearchPending (ptr only)
41
42
namespace tesseract {
43
// Upper bound on the number of chunks (ratings matrix dimension) a word may
// be chopped into. The limit is no longer strictly required, but it is kept
// so that results stay repeatable, and it may also be a speed advantage.
static constexpr int kMaxNumChunks = 64;
47
48
/*----------------------------------------------------------------------
49
          F u n c t i o n s
50
----------------------------------------------------------------------*/
51
52
/**
53
 * @name check_blob
54
 *
55
 * @return true if blob has a non whole outline.
56
 */
57
209k
static int check_blob(TBLOB *blob) {
58
209k
  TESSLINE *outline;
59
209k
  EDGEPT *edgept;
60
61
582k
  for (outline = blob->outlines; outline != nullptr; outline = outline->next) {
62
372k
    edgept = outline->loop;
63
2.40M
    do {
64
2.40M
      if (edgept == nullptr) {
65
0
        break;
66
0
      }
67
2.40M
      edgept = edgept->next;
68
2.40M
    } while (edgept != outline->loop);
69
372k
    if (edgept == nullptr) {
70
0
      return 1;
71
0
    }
72
372k
  }
73
209k
  return 0;
74
209k
}
75
76
/**
77
 * @name any_shared_split_points
78
 *
79
 * Return true if any of the splits share a point with this one.
80
 */
81
201k
static int any_shared_split_points(const std::vector<SEAM *> &seams, SEAM *seam) {
82
201k
  int length;
83
201k
  int index;
84
85
201k
  length = seams.size();
86
2.89M
  for (index = 0; index < length; index++) {
87
2.69M
    if (seam->SharesPosition(*seams[index])) {
88
5.13k
      return true;
89
5.13k
    }
90
2.69M
  }
91
196k
  return false;
92
201k
}
93
94
/**
95
 * @name preserve_outline_tree
96
 *
97
 * Copy the list of outlines.
98
 */
99
2.02M
static void preserve_outline(EDGEPT *start) {
100
2.02M
  EDGEPT *srcpt;
101
102
2.02M
  if (start == nullptr) {
103
0
    return;
104
0
  }
105
2.02M
  srcpt = start;
106
13.7M
  do {
107
13.7M
    srcpt->runlength = 1;
108
13.7M
    srcpt = srcpt->next;
109
13.7M
  } while (srcpt != start);
110
2.02M
  srcpt->runlength = 2;
111
2.02M
}
112
113
1.07M
static void preserve_outline_tree(TESSLINE *srcline) {
114
1.07M
  TESSLINE *outline;
115
116
3.09M
  for (outline = srcline; outline != nullptr; outline = outline->next) {
117
2.02M
    preserve_outline(outline->loop);
118
2.02M
  }
119
1.07M
}
120
121
/**
122
 * @name restore_outline_tree
123
 *
124
 * Copy the list of outlines.
125
 */
126
1.82M
static EDGEPT *restore_outline(EDGEPT *start) {
127
1.82M
  EDGEPT *srcpt;
128
1.82M
  EDGEPT *real_start;
129
130
1.82M
  if (start == nullptr) {
131
0
    return nullptr;
132
0
  }
133
1.82M
  srcpt = start;
134
2.51M
  do {
135
2.51M
    if (srcpt->runlength == 2) {
136
1.82M
      break;
137
1.82M
    }
138
693k
    srcpt = srcpt->next;
139
693k
  } while (srcpt != start);
140
0
  real_start = srcpt;
141
11.7M
  do {
142
11.7M
    srcpt = srcpt->next;
143
11.7M
    if (srcpt->prev->runlength == 0) {
144
34.7k
      remove_edgept(srcpt->prev);
145
34.7k
    }
146
11.7M
  } while (srcpt != real_start);
147
1.82M
  return real_start;
148
1.82M
}
149
150
1.00M
static void restore_outline_tree(TESSLINE *srcline) {
151
1.00M
  TESSLINE *outline;
152
153
2.82M
  for (outline = srcline; outline != nullptr; outline = outline->next) {
154
1.82M
    outline->loop = restore_outline(outline->loop);
155
1.82M
    outline->start = outline->loop->pos;
156
1.82M
  }
157
1.00M
}
158
159
/**********************************************************************
160
 * total_containment
161
 *
162
 * Check to see if one of these outlines is totally contained within
163
 * the bounding box of the other.
164
 **********************************************************************/
165
274k
static int16_t total_containment(TBLOB *blob1, TBLOB *blob2) {
166
274k
  TBOX box1 = blob1->bounding_box();
167
274k
  TBOX box2 = blob2->bounding_box();
168
274k
  return box1.contains(box2) || box2.contains(box1);
169
274k
}
170
171
// Helper runs all the checks on a seam to make sure it is valid.
172
// Returns the seam if OK, otherwise deletes the seam and returns nullptr.
173
static SEAM *CheckSeam(int debug_level, int32_t blob_number, TWERD *word, TBLOB *blob,
174
1.23M
                       TBLOB *other_blob, const std::vector<SEAM *> &seams, SEAM *seam) {
175
1.23M
  if (seam == nullptr || blob->outlines == nullptr || other_blob->outlines == nullptr ||
176
1.23M
      total_containment(blob, other_blob) || check_blob(other_blob) ||
177
1.23M
      !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
178
1.23M
      any_shared_split_points(seams, seam) ||
179
1.23M
      !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
180
1.04M
    word->blobs.erase(word->blobs.begin() + blob_number + 1);
181
1.04M
    if (seam) {
182
84.8k
      seam->UndoSeam(blob, other_blob);
183
84.8k
      delete seam;
184
84.8k
      seam = nullptr;
185
#ifndef GRAPHICS_DISABLED
186
      if (debug_level) {
187
        if (debug_level > 2) {
188
          display_blob(blob, ScrollView::RED);
189
        }
190
        tprintf("\n** seam being removed ** \n");
191
      }
192
#endif
193
956k
    } else {
194
956k
      delete other_blob;
195
956k
    }
196
1.04M
    return nullptr;
197
1.04M
  }
198
196k
  return seam;
199
1.23M
}
200
201
/**
202
 * @name attempt_blob_chop
203
 *
204
 * Try to split the this blob after this one.  Check to make sure that
205
 * it was successful.
206
 */
207
SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
208
1.07M
                                 const std::vector<SEAM *> &seams) {
209
1.07M
  if (repair_unchopped_blobs) {
210
1.07M
    preserve_outline_tree(blob->outlines);
211
1.07M
  }
212
1.07M
  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
213
  // Insert it into the word.
214
1.07M
  word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);
215
216
1.07M
  SEAM *seam = nullptr;
217
1.07M
  if (prioritize_division) {
218
0
    TPOINT location;
219
0
    if (divisible_blob(blob, italic_blob, &location)) {
220
0
      seam = new SEAM(0.0f, location);
221
0
    }
222
0
  }
223
1.07M
  if (seam == nullptr) {
224
1.07M
    seam = pick_good_seam(blob);
225
1.07M
  }
226
1.07M
  if (chop_debug) {
227
0
    if (seam != nullptr) {
228
0
      seam->Print("Good seam picked=");
229
0
    } else {
230
0
      tprintf("\n** no seam picked *** \n");
231
0
    }
232
0
  }
233
1.07M
  if (seam) {
234
113k
    seam->ApplySeam(italic_blob, blob, other_blob);
235
113k
  }
236
237
1.07M
  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);
238
1.07M
  if (seam == nullptr) {
239
1.00M
    if (repair_unchopped_blobs) {
240
1.00M
      restore_outline_tree(blob->outlines);
241
1.00M
    }
242
1.00M
    if (allow_blob_division && !prioritize_division) {
243
      // If the blob can simply be divided into outlines, then do that.
244
1.00M
      TPOINT location;
245
1.00M
      if (divisible_blob(blob, italic_blob, &location)) {
246
167k
        other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
247
167k
        word->blobs.insert(word->blobs.begin() + blob_number + 1, other_blob);
248
167k
        seam = new SEAM(0.0f, location);
249
167k
        seam->ApplySeam(italic_blob, blob, other_blob);
250
167k
        seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam);
251
167k
      }
252
1.00M
    }
253
1.00M
  }
254
1.07M
  if (seam != nullptr) {
255
    // Make sure this seam doesn't get chopped again.
256
196k
    seam->Finalize();
257
196k
  }
258
1.07M
  return seam;
259
1.07M
}
260
261
SEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
262
1.07M
                                  const std::vector<SEAM *> &seams) {
263
1.07M
  return attempt_blob_chop(word, word->blobs[blob_number], blob_number, italic_blob, seams);
264
1.07M
}
265
266
SEAM *Wordrec::chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob,
267
0
                                     WERD_RES *word_res, unsigned *blob_number) {
268
0
  TWERD *word = word_res->chopped_word;
269
0
  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
270
0
    TBLOB *blob = word->blobs[*blob_number];
271
0
    TPOINT topleft, botright;
272
0
    topleft.x = blob->bounding_box().left();
273
0
    topleft.y = blob->bounding_box().top();
274
0
    botright.x = blob->bounding_box().right();
275
0
    botright.y = blob->bounding_box().bottom();
276
277
0
    TPOINT original_topleft, original_botright;
278
0
    word_res->denorm.DenormTransform(nullptr, topleft, &original_topleft);
279
0
    word_res->denorm.DenormTransform(nullptr, botright, &original_botright);
280
281
0
    TBOX original_box =
282
0
        TBOX(original_topleft.x, original_botright.y, original_botright.x, original_topleft.y);
283
284
0
    bool almost_equal_box = false;
285
0
    int num_overlap = 0;
286
0
    for (auto &&boxe : boxes) {
287
0
      if (original_box.overlap_fraction(boxe) > 0.125) {
288
0
        num_overlap++;
289
0
      }
290
0
      if (original_box.almost_equal(boxe, 3)) {
291
0
        almost_equal_box = true;
292
0
      }
293
0
    }
294
295
0
    TPOINT location;
296
0
    if (divisible_blob(blob, italic_blob, &location) || (!almost_equal_box && num_overlap > 1)) {
297
0
      SEAM *seam = attempt_blob_chop(word, blob, *blob_number, italic_blob, word_res->seam_array);
298
0
      if (seam != nullptr) {
299
0
        return seam;
300
0
      }
301
0
    }
302
0
  }
303
304
0
  *blob_number = UINT_MAX;
305
0
  return nullptr;
306
0
}
307
308
/**
309
 * @name improve_one_blob
310
 *
311
 * Finds the best place to chop, based on the worst blob, fixpt, or next to
312
 * a fragment, according to the input. Returns the SEAM corresponding to the
313
 * chop point, if any is found, and the index in the ratings_matrix of the
314
 * chopped blob. Note that blob_choices is just a copy of the pointers in the
315
 * leading diagonal of the ratings MATRIX.
316
 * Although the blob is chopped, the returned SEAM is yet to be inserted into
317
 * word->seam_array and the resulting blobs are unclassified, so this function
318
 * can be used by ApplyBox as well as during recognition.
319
 */
320
SEAM *Wordrec::improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
321
                                bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
322
337k
                                unsigned *blob_number) {
323
337k
  float rating_ceiling = FLT_MAX;
324
337k
  SEAM *seam = nullptr;
325
1.21M
  do {
326
1.21M
    auto blob = select_blob_to_split_from_fixpt(fixpt);
327
1.21M
    if (chop_debug) {
328
0
      tprintf("blob_number from fixpt = %d\n", blob);
329
0
    }
330
1.21M
    bool split_point_from_dict = (blob != -1);
331
1.21M
    if (split_point_from_dict) {
332
0
      fixpt->clear();
333
1.21M
    } else {
334
1.21M
      blob = select_blob_to_split(blob_choices, rating_ceiling, split_next_to_fragment);
335
1.21M
    }
336
1.21M
    if (chop_debug) {
337
0
      tprintf("blob_number = %d\n", blob);
338
0
    }
339
1.21M
    *blob_number = blob;
340
1.21M
    if (blob == -1) {
341
140k
      return nullptr;
342
140k
    }
343
344
    // TODO(rays) it may eventually help to allow italic_blob to be true,
345
1.07M
    seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob, word->seam_array);
346
1.07M
    if (seam != nullptr) {
347
196k
      break; // Success!
348
196k
    }
349
873k
    if (blob_choices[*blob_number] == nullptr) {
350
0
      return nullptr;
351
0
    }
352
873k
    if (!split_point_from_dict) {
353
      // We chopped the worst rated blob, try something else next time.
354
873k
      rating_ceiling = blob_choices[*blob_number]->rating();
355
873k
    }
356
873k
  } while (true);
357
196k
  return seam;
358
337k
}
359
360
/**
361
 * @name chop_one_blob
362
 *
363
 * Start with the current one-blob word and its classification.  Find
364
 * the worst blobs and try to divide it up to improve the ratings.
365
 * Used for testing chopper.
366
 */
367
SEAM *Wordrec::chop_one_blob(const std::vector<TBOX> &boxes,
368
                             const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
369
0
                             unsigned *blob_number) {
370
0
  if (prioritize_division) {
371
0
    return chop_overlapping_blob(boxes, true, word_res, blob_number);
372
0
  } else {
373
0
    return improve_one_blob(blob_choices, nullptr, false, true, word_res, blob_number);
374
0
  }
375
0
}
376
377
/**
378
 * @name chop_word_main
379
 *
380
 * Classify the blobs in this word and permute the results.  Find the
381
 * worst blob in the word and chop it up.  Continue this process until
382
 * a good answer has been found or all the blobs have been chopped up
383
 * enough.  The results are returned in the WERD_RES.
384
 */
385
144k
void Wordrec::chop_word_main(WERD_RES *word) {
386
144k
  int num_blobs = word->chopped_word->NumBlobs();
387
144k
  if (word->ratings == nullptr) {
388
27.8k
    word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
389
27.8k
  }
390
144k
  if (word->ratings->get(0, 0) == nullptr) {
391
    // Run initial classification.
392
724k
    for (int b = 0; b < num_blobs; ++b) {
393
580k
      BLOB_CHOICE_LIST *choices = classify_piece(
394
580k
          word->seam_array, b, b, "Initial:", word->chopped_word, word->blamer_bundle);
395
580k
      word->ratings->put(b, b, choices);
396
580k
    }
397
144k
  } else {
398
    // Blobs have been pre-classified. Set matrix cell for all blob choices
399
0
    for (int col = 0; col < word->ratings->dimension(); ++col) {
400
0
      for (int row = col;
401
0
           row < word->ratings->dimension() && row < col + word->ratings->bandwidth(); ++row) {
402
0
        BLOB_CHOICE_LIST *choices = word->ratings->get(col, row);
403
0
        if (choices != nullptr) {
404
0
          BLOB_CHOICE_IT bc_it(choices);
405
0
          for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
406
0
            bc_it.data()->set_matrix_cell(col, row);
407
0
          }
408
0
        }
409
0
      }
410
0
    }
411
0
  }
412
413
  // Run Segmentation Search.
414
144k
  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
415
144k
  SegSearch(word, &best_choice_bundle, word->blamer_bundle);
416
417
144k
  if (word->best_choice == nullptr) {
418
    // SegSearch found no valid paths, so just use the leading diagonal.
419
0
    word->FakeWordFromRatings(TOP_CHOICE_PERM);
420
0
  }
421
144k
  word->RebuildBestState();
422
  // If we finished without a hyphen at the end of the word, let the next word
423
  // be found in the dictionary.
424
144k
  if (word->word->flag(W_EOL) && !getDict().has_hyphen_end(*word->best_choice)) {
425
118k
    getDict().reset_hyphen_vars(true);
426
118k
  }
427
428
144k
  if (word->blamer_bundle != nullptr && this->fill_lattice_ != nullptr) {
429
0
    CallFillLattice(*word->ratings, word->best_choices, *word->uch_set, word->blamer_bundle);
430
0
  }
431
144k
  if (wordrec_debug_level > 0) {
432
0
    tprintf("Final Ratings Matrix:\n");
433
0
    word->ratings->print(getDict().getUnicharset());
434
0
  }
435
144k
  word->FilterWordChoices(getDict().stopper_debug_level);
436
144k
}
437
438
/**
439
 * @name improve_by_chopping
440
 *
441
 * Repeatedly chops the worst blob, classifying the new blobs fixing up all
442
 * the data, and incrementally runs the segmentation search until a good word
443
 * is found, or no more chops can be found.
444
 */
445
void Wordrec::improve_by_chopping(float rating_cert_scale, WERD_RES *word,
446
                                  BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
447
                                  LMPainPoints *pain_points,
448
140k
                                  std::vector<SegSearchPending> *pending) {
449
140k
  unsigned blob_number;
450
337k
  do { // improvement loop.
451
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
452
    // one to chop.
453
337k
    std::vector<BLOB_CHOICE *> blob_choices;
454
337k
    int num_blobs = word->ratings->dimension();
455
3.89M
    for (int i = 0; i < num_blobs; ++i) {
456
3.55M
      BLOB_CHOICE_LIST *choices = word->ratings->get(i, i);
457
3.55M
      if (choices == nullptr || choices->empty()) {
458
0
        blob_choices.push_back(nullptr);
459
3.55M
      } else {
460
3.55M
        BLOB_CHOICE_IT bc_it(choices);
461
3.55M
        blob_choices.push_back(bc_it.data());
462
3.55M
      }
463
3.55M
    }
464
337k
    SEAM *seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt, false, false, word,
465
337k
                                  &blob_number);
466
337k
    if (seam == nullptr) {
467
140k
      break;
468
140k
    }
469
    // A chop has been made. We have to correct all the data structures to
470
    // take into account the extra bottom-level blob.
471
    // Put the seam into the seam_array and correct everything else on the
472
    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
473
    // states in WERD_CHOICEs, and blob widths.
474
196k
    word->InsertSeam(blob_number, seam);
475
    // Insert a new entry in the beam array.
476
196k
    best_choice_bundle->beam.insert(best_choice_bundle->beam.begin() + blob_number, new LanguageModelState);
477
    // Fixpts are outdated, but will get recalculated.
478
196k
    best_choice_bundle->fixpt.clear();
479
    // Remap existing pain points.
480
196k
    pain_points->RemapForSplit(blob_number);
481
    // Insert a new pending at the chop point.
482
196k
    pending->insert(pending->begin() + blob_number, SegSearchPending());
483
484
    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
485
    // as that updates the pending correctly and adds new pain points.
486
196k
    MATRIX_COORD pain_point(blob_number, blob_number);
487
196k
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word, pain_points, blamer_bundle);
488
196k
    pain_point.col = blob_number + 1;
489
196k
    pain_point.row = blob_number + 1;
490
196k
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word, pain_points, blamer_bundle);
491
196k
    if (language_model_->language_model_ngram_on) {
492
      // N-gram evaluation depends on the number of blobs in a chunk, so we
493
      // have to re-evaluate everything in the word.
494
0
      ResetNGramSearch(word, best_choice_bundle, *pending);
495
0
      blob_number = 0;
496
0
    }
497
    // Run language model incrementally. (Except with the n-gram model on.)
498
196k
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending, word, pain_points,
499
196k
                         best_choice_bundle, blamer_bundle);
500
196k
  } while (!language_model_->AcceptableChoiceFound() && word->ratings->dimension() < kMaxNumChunks);
501
502
  // If after running only the chopper best_choice is incorrect and no blame
503
  // has been yet set, blame the classifier if best_choice is classifier's
504
  // top choice and is a dictionary word (i.e. language model could not have
505
  // helped). Otherwise blame the tradeoff between the classifier and
506
  // the old language model (permuters).
507
140k
  if (word->blamer_bundle != nullptr &&
508
140k
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
509
140k
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
510
0
    bool valid_permuter = word->best_choice != nullptr &&
511
0
                          Dict::valid_word_permuter(word->best_choice->permuter(), false);
512
0
    word->blamer_bundle->BlameClassifierOrLangModel(word, getDict().getUnicharset(), valid_permuter,
513
0
                                                    wordrec_debug_blamer);
514
0
  }
515
140k
}
516
517
/**********************************************************************
518
 * select_blob_to_split
519
 *
520
 * These are the results of the last classification.  Find a likely
521
 * place to apply splits.  If none, return -1.
522
 **********************************************************************/
523
int Wordrec::select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices,
524
1.21M
                                  float rating_ceiling, bool split_next_to_fragment) {
525
1.21M
  BLOB_CHOICE *blob_choice;
526
1.21M
  float worst = -FLT_MAX;
527
1.21M
  int worst_index = -1;
528
1.21M
  float worst_near_fragment = -FLT_MAX;
529
1.21M
  int worst_index_near_fragment = -1;
530
1.21M
  std::vector<const CHAR_FRAGMENT *> fragments;
531
532
1.21M
  if (chop_debug) {
533
0
    if (rating_ceiling < FLT_MAX) {
534
0
      tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
535
0
    } else {
536
0
      tprintf("rating_ceiling = No Limit\n");
537
0
    }
538
0
  }
539
540
1.21M
  if (split_next_to_fragment && blob_choices.size() > 0) {
541
0
    fragments.resize(blob_choices.size());
542
0
    if (blob_choices[0] != nullptr) {
543
0
      fragments[0] = getDict().getUnicharset().get_fragment(blob_choices[0]->unichar_id());
544
0
    } else {
545
0
      fragments[0] = nullptr;
546
0
    }
547
0
  }
548
549
20.7M
  for (unsigned x = 0; x < blob_choices.size(); ++x) {
550
19.5M
    if (blob_choices[x] == nullptr) {
551
0
      return x;
552
19.5M
    } else {
553
19.5M
      blob_choice = blob_choices[x];
554
      // Populate fragments for the following position.
555
19.5M
      if (split_next_to_fragment && x + 1 < blob_choices.size()) {
556
0
        if (blob_choices[x + 1] != nullptr) {
557
0
          fragments[x + 1] =
558
0
              getDict().getUnicharset().get_fragment(blob_choices[x + 1]->unichar_id());
559
0
        } else {
560
0
          fragments[x + 1] = nullptr;
561
0
        }
562
0
      }
563
19.5M
      if (blob_choice->rating() < rating_ceiling &&
564
19.5M
          blob_choice->certainty() < tessedit_certainty_threshold) {
565
        // Update worst and worst_index.
566
13.3M
        if (blob_choice->rating() > worst) {
567
2.83M
          worst_index = x;
568
2.83M
          worst = blob_choice->rating();
569
2.83M
        }
570
13.3M
        if (split_next_to_fragment) {
571
          // Update worst_near_fragment and worst_index_near_fragment.
572
0
          bool expand_following_fragment =
573
0
              (x + 1 < blob_choices.size() && fragments[x + 1] != nullptr &&
574
0
               !fragments[x + 1]->is_beginning());
575
0
          bool expand_preceding_fragment =
576
0
              (x > 0 && fragments[x - 1] != nullptr && !fragments[x - 1]->is_ending());
577
0
          if ((expand_following_fragment || expand_preceding_fragment) &&
578
0
              blob_choice->rating() > worst_near_fragment) {
579
0
            worst_index_near_fragment = x;
580
0
            worst_near_fragment = blob_choice->rating();
581
0
            if (chop_debug) {
582
0
              tprintf(
583
0
                  "worst_index_near_fragment=%d"
584
0
                  " expand_following_fragment=%d"
585
0
                  " expand_preceding_fragment=%d\n",
586
0
                  worst_index_near_fragment, expand_following_fragment, expand_preceding_fragment);
587
0
            }
588
0
          }
589
0
        }
590
13.3M
      }
591
19.5M
    }
592
19.5M
  }
593
  // TODO(daria): maybe a threshold of badness for
594
  // worst_near_fragment would be useful.
595
1.21M
  return worst_index_near_fragment != -1 ? worst_index_near_fragment : worst_index;
596
1.21M
}
597
598
/**********************************************************************
599
 * select_blob_to_split_from_fixpt
600
 *
601
 * Given the fix point from a dictionary search, if there is a single
602
 * dangerous blob that maps to multiple characters, return that blob
603
 * index as a place we need to split.  If none, return -1.
604
 **********************************************************************/
605
1.21M
int Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) {
606
1.21M
  if (!fixpt) {
607
0
    return -1;
608
0
  }
609
2.60M
  for (auto &i : *fixpt) {
610
2.60M
    if (i.begin + 1 == i.end && i.dangerous && i.correct_is_ngram) {
611
0
      return i.begin;
612
0
    }
613
2.60M
  }
614
1.21M
  return -1;
615
1.21M
}
616
617
} // namespace tesseract