Coverage Report

Created: 2026-06-13 06:51

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/ccmain/applybox.cpp
Line
Count
Source
1
/**********************************************************************
2
 * File:        applybox.cpp  (Formerly applybox.c)
3
 * Description: Re segment rows according to box file data
4
 * Author:      Phil Cheatle
5
 *
6
 * (C) Copyright 1993, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
#ifndef DISABLED_LEGACY_ENGINE
20
#  include <allheaders.h>
21
#  include <cctype>
22
#  include <cerrno>
23
#  include <cstring>
24
#  include "boxread.h"
25
#endif // ndef DISABLED_LEGACY_ENGINE
26
#include <tesseract/unichar.h>
27
#include "pageres.h"
28
#include "tesseractclass.h"
29
#include "tesserrstream.h"  // for tesserr
30
#include "unicharset.h"
31
32
#ifndef DISABLED_LEGACY_ENGINE
33
/** Max number of blobs to classify together in FindSegmentation. */
34
const int kMaxGroupSize = 4;
35
/// Max fraction of median allowed as deviation in xheight before switching
36
/// to median.
37
const double kMaxXHeightDeviationFraction = 0.125;
38
#endif // ndef DISABLED_LEGACY_ENGINE
39
40
/**
41
 * The box file is assumed to contain box definitions, one per line, of the
42
 * following format for blob-level boxes:
43
 * @verbatim
44
 *   <UTF8 str> <left> <bottom> <right> <top> <page id>
45
 * @endverbatim
46
 * and for word/line-level boxes:
47
 * @verbatim
48
 *   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
49
 * @endverbatim
50
 * NOTES:
51
 * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
52
 *
53
 * <page id> is 0-based, and the page number is used for multipage input (tiff).
54
 *
55
 * In the blob-level form, each line represents a recognizable unit, which may
56
 * be several UTF-8 bytes, but there is a bounding box around each recognizable
57
 * unit, and no classifier is needed to train in this mode (bootstrapping.)
58
 *
59
 * In the word/line-level form, the line begins with the literal "WordStr", and
60
 * the bounding box bounds either a whole line or a whole word. The recognizable
61
 * units in the word/line are listed after the # at the end of the line and
62
 * are space delimited, ignoring any original spaces on the line.
63
 * Eg.
64
 * @verbatim
65
 * word -> #w o r d
66
 * multi word line -> #m u l t i w o r d l i n e
67
 * @endverbatim
68
 * The recognizable units must be space-delimited in order to allow multiple
69
 * unicodes to be used for a single recognizable unit, eg Hindi.
70
 *
71
 * In this mode, the classifier must have been pre-trained with the desired
72
 * character set, or it will not be able to find the character segmentations.
73
 */
74
75
namespace tesseract {
76
77
#ifndef DISABLED_LEGACY_ENGINE
78
0
static void clear_any_old_text(BLOCK_LIST *block_list) {
79
0
  BLOCK_IT block_it(block_list);
80
0
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
81
0
    ROW_IT row_it(block_it.data()->row_list());
82
0
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
83
0
      WERD_IT word_it(row_it.data()->word_list());
84
0
      for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
85
0
        word_it.data()->set_text("");
86
0
      }
87
0
    }
88
0
  }
89
0
}
90
91
// Applies the box file based on the image name filename, and resegments
92
// the words in the block_list (page), with:
93
// blob-mode: one blob per line in the box file, words as input.
94
// word/line-mode: one blob per space-delimited unit after the #, and one word
95
// per line in the box file. (See comment above for box file format.)
96
// If find_segmentation is true, (word/line mode) then the classifier is used
97
// to re-segment words/lines to match the space-delimited truth string for
98
// each box. In this case, the input box may be for a word or even a whole
99
// text line, and the output words will contain multiple blobs corresponding
100
// to the space-delimited input string.
101
// With find_segmentation false, no classifier is needed, but the chopper
102
// can still be used to correctly segment touching characters with the help
103
// of the input boxes.
104
// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
105
// from normal classification, ie. with a word, chopped_word, rebuild_word,
106
// seam_array, denorm, box_word, and best_state, but NO best_choice or
107
// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
108
// Instead, the correct_text member of WERD_RES is set, and this may be later
109
// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
110
// is not required before calling ApplyBoxTraining.
111
PAGE_RES *Tesseract::ApplyBoxes(const char *filename, bool find_segmentation,
112
0
                                BLOCK_LIST *block_list) {
113
0
  std::vector<TBOX> boxes;
114
0
  std::vector<std::string> texts, full_texts;
115
0
  if (!ReadAllBoxes(applybox_page, true, filename, &boxes, &texts, &full_texts, nullptr)) {
116
0
    return nullptr; // Can't do it.
117
0
  }
118
119
0
  const int box_count = boxes.size();
120
0
  int box_failures = 0;
121
122
  // In word mode, we use the boxes to make a word for each box, but
123
  // in blob mode we use the existing words and maximally chop them first.
124
0
  PAGE_RES *page_res = find_segmentation ? nullptr : SetupApplyBoxes(boxes, block_list);
125
0
  clear_any_old_text(block_list);
126
127
0
  for (int i = 0; i < box_count; i++) {
128
0
    bool foundit = false;
129
0
    if (page_res != nullptr) {
130
0
      foundit =
131
0
          ResegmentCharBox(page_res, (i == 0) ? nullptr : &boxes[i - 1], boxes[i],
132
0
                           (i == box_count - 1) ? nullptr : &boxes[i + 1], full_texts[i].c_str());
133
0
    } else {
134
0
      foundit = ResegmentWordBox(block_list, boxes[i],
135
0
                                 (i == box_count - 1) ? nullptr : &boxes[i + 1], texts[i].c_str());
136
0
    }
137
0
    if (!foundit) {
138
0
      box_failures++;
139
0
      ReportFailedBox(i, boxes[i], texts[i].c_str(), "FAILURE! Couldn't find a matching blob");
140
0
    }
141
0
  }
142
143
0
  if (page_res == nullptr) {
144
    // In word/line mode, we now maximally chop all the words and resegment
145
    // them with the classifier.
146
0
    page_res = SetupApplyBoxes(boxes, block_list);
147
0
    ReSegmentByClassification(page_res);
148
0
  }
149
0
  if (applybox_debug > 0) {
150
0
    tprintf("APPLY_BOXES:\n");
151
0
    tprintf("   Boxes read from boxfile:  %6d\n", box_count);
152
0
    if (box_failures > 0) {
153
0
      tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
154
0
    }
155
0
  }
156
0
  TidyUp(page_res);
157
0
  return page_res;
158
0
}
159
160
// Helper computes median xheight in the image.
161
0
static double MedianXHeight(BLOCK_LIST *block_list) {
162
0
  BLOCK_IT block_it(block_list);
163
0
  STATS xheights(0, block_it.data()->pdblk.bounding_box().height() - 1);
164
0
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
165
0
    ROW_IT row_it(block_it.data()->row_list());
166
0
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
167
0
      xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
168
0
    }
169
0
  }
170
0
  return xheights.median();
171
0
}
172
173
/// Any row xheight that is significantly different from the median is set
174
/// to the median.
175
0
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
176
0
  const double median_xheight = MedianXHeight(block_list);
177
0
  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
178
  // Strip all fuzzy space markers to simplify the PAGE_RES.
179
0
  BLOCK_IT b_it(block_list);
180
0
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
181
0
    BLOCK *block = b_it.data();
182
0
    ROW_IT r_it(block->row_list());
183
0
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
184
0
      ROW *row = r_it.data();
185
0
      const double diff = fabs(row->x_height() - median_xheight);
186
0
      if (diff > max_deviation) {
187
0
        if (applybox_debug) {
188
0
          tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight);
189
0
        }
190
0
        row->set_x_height(static_cast<float>(median_xheight));
191
0
      }
192
0
    }
193
0
  }
194
0
}
195
196
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
197
/// All fuzzy spaces are removed, and all the words are maximally chopped.
198
0
PAGE_RES *Tesseract::SetupApplyBoxes(const std::vector<TBOX> &boxes, BLOCK_LIST *block_list) {
199
0
  PreenXHeights(block_list);
200
  // Strip all fuzzy space markers to simplify the PAGE_RES.
201
0
  BLOCK_IT b_it(block_list);
202
0
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
203
0
    BLOCK *block = b_it.data();
204
0
    ROW_IT r_it(block->row_list());
205
0
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
206
0
      ROW *row = r_it.data();
207
0
      WERD_IT w_it(row->word_list());
208
0
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
209
0
        WERD *word = w_it.data();
210
0
        if (word->cblob_list()->empty()) {
211
0
          delete w_it.extract();
212
0
        } else {
213
0
          word->set_flag(W_FUZZY_SP, false);
214
0
          word->set_flag(W_FUZZY_NON, false);
215
0
        }
216
0
      }
217
0
    }
218
0
  }
219
0
  auto *page_res = new PAGE_RES(false, block_list, nullptr);
220
0
  PAGE_RES_IT pr_it(page_res);
221
0
  WERD_RES *word_res;
222
0
  while ((word_res = pr_it.word()) != nullptr) {
223
0
    MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res);
224
0
    pr_it.forward();
225
0
  }
226
0
  return page_res;
227
0
}
228
229
/// Tests the chopper by exhaustively running chop_one_blob.
230
/// The word_res will contain filled chopped_word, seam_array, denorm,
231
/// box_word and best_state for the maximally chopped word.
232
void Tesseract::MaximallyChopWord(const std::vector<TBOX> &boxes, BLOCK *block, ROW *row,
233
0
                                  WERD_RES *word_res) {
234
0
  if (!word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
235
0
                                     classify_bln_numeric_mode, textord_use_cjk_fp_model,
236
0
                                     poly_allow_detailed_fx, row, block)) {
237
0
    word_res->CloneChoppedToRebuild();
238
0
    return;
239
0
  }
240
0
  if (chop_debug) {
241
0
    tprintf("Maximally chopping word at:");
242
0
    word_res->word->bounding_box().print();
243
0
  }
244
0
  std::vector<BLOB_CHOICE *> blob_choices;
245
0
  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
246
0
  auto rating = static_cast<float>(INT8_MAX);
247
0
  for (unsigned i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
248
    // The rating and certainty are not quite arbitrary. Since
249
    // select_blob_to_chop uses the worst certainty to choose, they all have
250
    // to be different, so starting with INT8_MAX, subtract 1/8 for each blob
251
    // in here, and then divide by e each time they are chopped, which
252
    // should guarantee a set of unequal values for the whole tree of blobs
253
    // produced, however much chopping is required. The chops are thus only
254
    // limited by the ability of the chopper to find suitable chop points,
255
    // and not by the value of the certainties.
256
0
    auto *choice = new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
257
0
    blob_choices.push_back(choice);
258
0
    rating -= 0.125f;
259
0
  }
260
0
  const double e = exp(1.0); // The base of natural logs.
261
0
  unsigned blob_number;
262
0
  if (!assume_fixed_pitch_char_segment) {
263
    // We only chop if the language is not fixed pitch like CJK.
264
0
    SEAM *seam = nullptr;
265
0
    int right_chop_index = 0;
266
0
    while ((seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) != nullptr) {
267
0
      word_res->InsertSeam(blob_number, seam);
268
0
      BLOB_CHOICE *left_choice = blob_choices[blob_number];
269
0
      rating = left_choice->rating() / e;
270
0
      left_choice->set_rating(rating);
271
0
      left_choice->set_certainty(-rating);
272
      // combine confidence w/ serial #
273
0
      auto *right_choice = new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
274
0
                                           0.0f, 0.0f, BCC_FAKE);
275
0
      blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
276
0
    }
277
0
  }
278
0
  word_res->CloneChoppedToRebuild();
279
0
  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
280
0
}
281
282
/// Helper to compute the dispute resolution metric.
283
/// Disputed blob resolution. The aim is to give the blob to the most
284
/// appropriate boxfile box. Most of the time it is obvious, but if
285
/// two boxfile boxes overlap significantly it is not. If a small boxfile
286
/// box takes most of the blob, and a large boxfile box does too, then
287
/// we want the small boxfile box to get it, but if the small box
288
/// is much smaller than the blob, we don't want it to get it.
289
/// Details of the disputed blob resolution:
290
/// Given a box with area A, and a blob with area B, with overlap area C,
291
/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
292
/// miss metric gets the blob.
293
0
static double BoxMissMetric(const TBOX &box1, const TBOX &box2) {
294
0
  const int overlap_area = box1.intersection(box2).area();
295
0
  const int a = box1.area();
296
0
  const int b = box2.area();
297
0
  ASSERT_HOST(a != 0 && b != 0);
298
0
  return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
299
0
}
300
301
/// Gather consecutive blobs that match the given box into the best_state
302
/// and corresponding correct_text.
303
///
304
/// Fights over which box owns which blobs are settled by pre-chopping and
305
/// applying the blobs to box or next_box with the least non-overlap.
306
/// @return false if the box was in error, which can only be caused by
307
/// failing to find an appropriate blob for a box.
308
///
309
/// This means that occasionally, blobs may be incorrectly segmented if the
310
/// chopper fails to find a suitable chop point.
311
bool Tesseract::ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box,
312
0
                                 const TBOX *next_box, const char *correct_text) {
313
0
  if (applybox_debug > 1) {
314
0
    tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
315
0
  }
316
0
  PAGE_RES_IT page_res_it(page_res);
317
0
  WERD_RES *word_res;
318
0
  for (word_res = page_res_it.word(); word_res != nullptr; word_res = page_res_it.forward()) {
319
0
    if (!word_res->box_word->bounding_box().major_overlap(box)) {
320
0
      continue;
321
0
    }
322
0
    if (applybox_debug > 1) {
323
0
      tprintf("Checking word box:");
324
0
      word_res->box_word->bounding_box().print();
325
0
    }
326
0
    int word_len = word_res->box_word->length();
327
0
    for (int i = 0; i < word_len; ++i) {
328
0
      TBOX char_box = TBOX();
329
0
      int blob_count = 0;
330
0
      for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
331
0
        TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
332
0
        if (!blob_box.major_overlap(box)) {
333
0
          break;
334
0
        }
335
0
        if (word_res->correct_text[i + blob_count].length() > 0) {
336
0
          break; // Blob is claimed already.
337
0
        }
338
0
        if (next_box != nullptr) {
339
0
          const double current_box_miss_metric = BoxMissMetric(blob_box, box);
340
0
          const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
341
0
          if (applybox_debug > 2) {
342
0
            tprintf("Checking blob:");
343
0
            blob_box.print();
344
0
            tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
345
0
                    next_box_miss_metric);
346
0
          }
347
0
          if (current_box_miss_metric > next_box_miss_metric) {
348
0
            break; // Blob is a better match for next box.
349
0
          }
350
0
        }
351
0
        char_box += blob_box;
352
0
      }
353
0
      if (blob_count > 0) {
354
0
        if (applybox_debug > 1) {
355
0
          tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
356
0
        }
357
0
        if (!char_box.almost_equal(box, 3) &&
358
0
            ((next_box != nullptr && box.x_gap(*next_box) < -3) ||
359
0
             (prev_box != nullptr && prev_box->x_gap(box) < -3))) {
360
0
          return false;
361
0
        }
362
        // We refine just the box_word, best_state and correct_text here.
363
        // The rebuild_word is made in TidyUp.
364
        // blob_count blobs are put together to match the box. Merge the
365
        // box_word boxes, save the blob_count in the state and the text.
366
0
        word_res->box_word->MergeBoxes(i, i + blob_count);
367
0
        word_res->best_state[i] = blob_count;
368
0
        word_res->correct_text[i] = correct_text;
369
0
        if (applybox_debug > 2) {
370
0
          tprintf("%d Blobs match: blob box:", blob_count);
371
0
          word_res->box_word->BlobBox(i).print();
372
0
          tprintf("Matches box:");
373
0
          box.print();
374
0
          if (next_box != nullptr) {
375
0
            tprintf("With next box:");
376
0
            next_box->print();
377
0
          }
378
0
        }
379
        // Eliminated best_state and correct_text entries for the consumed
380
        // blobs.
381
0
        for (int j = 1; j < blob_count; ++j) {
382
0
          word_res->best_state.erase(word_res->best_state.begin() + i + 1);
383
0
          word_res->correct_text.erase(word_res->correct_text.begin() + i + 1);
384
0
        }
385
        // Assume that no box spans multiple source words, so we are done with
386
        // this box.
387
0
        if (applybox_debug > 1) {
388
0
          tprintf("Best state = ");
389
0
          for (auto best_state : word_res->best_state) {
390
0
            tprintf("%d ", best_state);
391
0
          }
392
0
          tprintf("\n");
393
0
          tprintf("Correct text = [[ ");
394
0
          for (auto &it : word_res->correct_text) {
395
0
            tprintf("%s ", it.c_str());
396
0
          }
397
0
          tprintf("]]\n");
398
0
        }
399
0
        return true;
400
0
      }
401
0
    }
402
0
  }
403
0
  if (applybox_debug > 0) {
404
0
    tprintf("FAIL!\n");
405
0
  }
406
0
  return false; // Failure.
407
0
}
408
409
/// Consume all source blobs that strongly overlap the given box,
410
/// putting them into a new word, with the correct_text label.
411
/// Fights over which box owns which blobs are settled by
412
/// applying the blobs to box or next_box with the least non-overlap.
413
/// @return false if the box was in error, which can only be caused by
414
/// failing to find an overlapping blob for a box.
415
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box,
416
0
                                 const char *correct_text) {
417
0
  if (applybox_debug > 1) {
418
0
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
419
0
  }
420
0
  WERD *new_word = nullptr;
421
0
  BLOCK_IT b_it(block_list);
422
0
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
423
0
    BLOCK *block = b_it.data();
424
0
    if (!box.major_overlap(block->pdblk.bounding_box())) {
425
0
      continue;
426
0
    }
427
0
    ROW_IT r_it(block->row_list());
428
0
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
429
0
      ROW *row = r_it.data();
430
0
      if (!box.major_overlap(row->bounding_box())) {
431
0
        continue;
432
0
      }
433
0
      WERD_IT w_it(row->word_list());
434
0
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
435
0
        WERD *word = w_it.data();
436
0
        if (applybox_debug > 2) {
437
0
          tprintf("Checking word:");
438
0
          word->bounding_box().print();
439
0
        }
440
0
        if (word->text() != nullptr && word->text()[0] != '\0') {
441
0
          continue; // Ignore words that are already done.
442
0
        }
443
0
        if (!box.major_overlap(word->bounding_box())) {
444
0
          continue;
445
0
        }
446
0
        C_BLOB_IT blob_it(word->cblob_list());
447
0
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
448
0
          C_BLOB *blob = blob_it.data();
449
0
          TBOX blob_box = blob->bounding_box();
450
0
          if (!blob_box.major_overlap(box)) {
451
0
            continue;
452
0
          }
453
0
          if (next_box != nullptr) {
454
0
            const double current_box_miss_metric = BoxMissMetric(blob_box, box);
455
0
            const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
456
0
            if (applybox_debug > 2) {
457
0
              tprintf("Checking blob:");
458
0
              blob_box.print();
459
0
              tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric,
460
0
                      next_box_miss_metric);
461
0
            }
462
0
            if (current_box_miss_metric > next_box_miss_metric) {
463
0
              continue; // Blob is a better match for next box.
464
0
            }
465
0
          }
466
0
          if (applybox_debug > 2) {
467
0
            tprintf("Blob match: blob:");
468
0
            blob_box.print();
469
0
            tprintf("Matches box:");
470
0
            box.print();
471
0
            if (next_box != nullptr) {
472
0
              tprintf("With next box:");
473
0
              next_box->print();
474
0
            }
475
0
          }
476
0
          if (new_word == nullptr) {
477
            // Make a new word with a single blob.
478
0
            new_word = word->shallow_copy();
479
0
            new_word->set_text(correct_text);
480
0
            w_it.add_to_end(new_word);
481
0
          }
482
0
          C_BLOB_IT new_blob_it(new_word->cblob_list());
483
0
          new_blob_it.add_to_end(blob_it.extract());
484
0
        }
485
0
      }
486
0
    }
487
0
  }
488
0
  if (new_word == nullptr && applybox_debug > 0) {
489
0
    tprintf("FAIL!\n");
490
0
  }
491
0
  return new_word != nullptr;
492
0
}
493
494
/// Resegments the words by running the classifier in an attempt to find the
495
/// correct segmentation that produces the required string.
496
0
void Tesseract::ReSegmentByClassification(PAGE_RES *page_res) {
497
0
  PAGE_RES_IT pr_it(page_res);
498
0
  WERD_RES *word_res;
499
0
  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
500
0
    const WERD *word = word_res->word;
501
0
    if (word->text() == nullptr || word->text()[0] == '\0') {
502
0
      continue; // Ignore words that have no text.
503
0
    }
504
    // Convert the correct text to a vector of UNICHAR_ID
505
0
    std::vector<UNICHAR_ID> target_text;
506
0
    if (!ConvertStringToUnichars(word->text(), &target_text)) {
507
0
      tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text());
508
0
      pr_it.DeleteCurrentWord();
509
0
      continue;
510
0
    }
511
0
    if (!FindSegmentation(target_text, word_res)) {
512
0
      tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text());
513
0
      pr_it.DeleteCurrentWord();
514
0
      continue;
515
0
    }
516
0
  }
517
0
}
518
519
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
520
/// @return false if an invalid UNICHAR_ID is encountered.
521
0
bool Tesseract::ConvertStringToUnichars(const char *utf8, std::vector<UNICHAR_ID> *class_ids) {
522
0
  for (int step = 0; *utf8 != '\0'; utf8 += step) {
523
0
    const char *next_space = strchr(utf8, ' ');
524
0
    if (next_space == nullptr) {
525
0
      next_space = utf8 + strlen(utf8);
526
0
    }
527
0
    step = next_space - utf8;
528
0
    UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
529
0
    if (class_id == INVALID_UNICHAR_ID) {
530
0
      return false;
531
0
    }
532
0
    while (utf8[step] == ' ') {
533
0
      ++step;
534
0
    }
535
0
    class_ids->push_back(class_id);
536
0
  }
537
0
  return true;
538
0
}
539
540
/// Resegments the word to achieve the target_text from the classifier.
541
/// Returns false if the re-segmentation fails.
542
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
543
/// applies a full search on the classifier results to find the best classified
544
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
545
/// substitutions ARE used.
546
0
bool Tesseract::FindSegmentation(const std::vector<UNICHAR_ID> &target_text, WERD_RES *word_res) {
547
  // Classify all required combinations of blobs and save results in choices.
548
0
  const int word_length = word_res->box_word->length();
549
0
  auto *choices = new std::vector<BLOB_CHOICE_LIST *>[word_length];
550
0
  for (int i = 0; i < word_length; ++i) {
551
0
    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
552
0
      BLOB_CHOICE_LIST *match_result =
553
0
          classify_piece(word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word,
554
0
                         word_res->blamer_bundle);
555
0
      if (applybox_debug > 2) {
556
0
        tprintf("%d+%d:", i, j);
557
0
        print_ratings_list("Segment:", match_result, unicharset);
558
0
      }
559
0
      choices[i].push_back(match_result);
560
0
    }
561
0
  }
562
  // Search the segmentation graph for the target text. Must be an exact
563
  // match. Using wildcards makes it difficult to find the correct
564
  // segmentation even when it is there.
565
0
  word_res->best_state.clear();
566
0
  std::vector<int> search_segmentation;
567
0
  float best_rating = 0.0f;
568
0
  SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
569
0
                &word_res->best_state);
570
0
  for (int i = 0; i < word_length; ++i) {
571
0
    for (auto choice : choices[i]) {
572
0
      delete choice;
573
0
    }
574
0
  }
575
0
  delete[] choices;
576
0
  if (word_res->best_state.empty()) {
577
    // Build the original segmentation and if it is the same length as the
578
    // truth, assume it will do.
579
0
    int blob_count = 1;
580
0
    for (auto s : word_res->seam_array) {
581
0
      SEAM *seam = s;
582
0
      if (!seam->HasAnySplits()) {
583
0
        word_res->best_state.push_back(blob_count);
584
0
        blob_count = 1;
585
0
      } else {
586
0
        ++blob_count;
587
0
      }
588
0
    }
589
0
    word_res->best_state.push_back(blob_count);
590
0
    if (word_res->best_state.size() != target_text.size()) {
591
0
      word_res->best_state.clear(); // No good. Original segmentation bad size.
592
0
      return false;
593
0
    }
594
0
  }
595
0
  word_res->correct_text.clear();
596
0
  for (auto &text : target_text) {
597
0
    word_res->correct_text.emplace_back(unicharset.id_to_unichar(text));
598
0
  }
599
0
  return true;
600
0
}
601
602
/// Recursive helper to find a match to the target_text (from text_index
603
/// position) in the choices (from choices_pos position).
604
/// @param choices is an array of vectors of length choices_length,
605
/// with each element representing a starting position in the word, and the
606
/// #vector holding classification results for a sequence of consecutive
607
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
608
/// @param choices_pos
609
/// @param choices_length
610
/// @param target_text
611
/// @param text_index
612
/// @param rating
613
/// @param segmentation
614
/// @param best_rating
615
/// @param best_segmentation
616
void Tesseract::SearchForText(const std::vector<BLOB_CHOICE_LIST *> *choices, int choices_pos,
617
                              unsigned choices_length, const std::vector<UNICHAR_ID> &target_text,
618
                              unsigned text_index, float rating, std::vector<int> *segmentation,
619
0
                              float *best_rating, std::vector<int> *best_segmentation) {
620
0
  const UnicharAmbigsVector &table = getDict().getUnicharAmbigs().dang_ambigs();
621
0
  for (unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
622
    // Rating of matching choice or worst choice if no match.
623
0
    float choice_rating = 0.0f;
624
    // Find the corresponding best BLOB_CHOICE.
625
0
    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
626
0
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
627
0
      const BLOB_CHOICE *choice = choice_it.data();
628
0
      choice_rating = choice->rating();
629
0
      auto class_id = choice->unichar_id();
630
0
      if (class_id == target_text[text_index]) {
631
0
        break;
632
0
      }
633
      // Search ambigs table.
634
0
      if (static_cast<size_t>(class_id) < table.size() && table[class_id] != nullptr) {
635
0
        AmbigSpec_IT spec_it(table[class_id]);
636
0
        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
637
0
          const AmbigSpec *ambig_spec = spec_it.data();
638
          // We'll only do 1-1.
639
0
          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
640
0
              ambig_spec->correct_ngram_id == target_text[text_index]) {
641
0
            break;
642
0
          }
643
0
        }
644
0
        if (!spec_it.cycled_list()) {
645
0
          break; // Found an ambig.
646
0
        }
647
0
      }
648
0
    }
649
0
    if (choice_it.cycled_list()) {
650
0
      continue; // No match.
651
0
    }
652
0
    segmentation->push_back(length);
653
0
    if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
654
      // This is a complete match. If the rating is good record a new best.
655
0
      if (applybox_debug > 2) {
656
0
        tesserr << "Complete match, rating = " << rating + choice_rating
657
0
                << ", best=" << *best_rating
658
0
                << ", seglength=" << segmentation->size()
659
0
                << ", best=" << best_segmentation->size() << '\n';
660
0
      }
661
0
      if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
662
0
        *best_segmentation = *segmentation;
663
0
        *best_rating = rating + choice_rating;
664
0
      }
665
0
    } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
666
0
      if (applybox_debug > 3) {
667
0
        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
668
0
                unicharset.id_to_unichar(target_text[text_index]),
669
0
                choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig",
670
0
                choices_pos, length);
671
0
      }
672
0
      SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
673
0
                    rating + choice_rating, segmentation, best_rating, best_segmentation);
674
0
      if (applybox_debug > 3) {
675
0
        tprintf("End recursion for %d=%s\n", target_text[text_index],
676
0
                unicharset.id_to_unichar(target_text[text_index]));
677
0
      }
678
0
    }
679
0
    segmentation->resize(segmentation->size() - 1);
680
0
  }
681
0
}
682
683
/// - Counts up the labelled words and the blobs within.
684
/// - Deletes all unused or emptied words, counting the unused ones.
685
/// - Resets W_BOL and W_EOL flags correctly.
686
/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
687
0
void Tesseract::TidyUp(PAGE_RES *page_res) {
688
0
  int ok_blob_count = 0;
689
0
  int bad_blob_count = 0;
690
  // TODO: check usage of ok_word_count.
691
0
  int ok_word_count = 0;
692
0
  int unlabelled_words = 0;
693
0
  PAGE_RES_IT pr_it(page_res);
694
0
  WERD_RES *word_res;
695
0
  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
696
0
    int ok_in_word = 0;
697
0
    int blob_count = word_res->correct_text.size();
698
0
    auto *word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
699
0
    word_choice->set_permuter(TOP_CHOICE_PERM);
700
0
    for (int c = 0; c < blob_count; ++c) {
701
0
      if (word_res->correct_text[c].length() > 0) {
702
0
        ++ok_in_word;
703
0
      }
704
      // Since we only need a fake word_res->best_choice, the actual
705
      // unichar_ids do not matter. Which is fortunate, since TidyUp()
706
      // can be called while training Tesseract, at the stage where
707
      // unicharset is not meaningful yet.
708
0
      word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->best_state[c],
709
0
                                                     1.0f, -1.0f);
710
0
    }
711
0
    if (ok_in_word > 0) {
712
0
      ok_blob_count += ok_in_word;
713
0
      bad_blob_count += word_res->correct_text.size() - ok_in_word;
714
0
      word_res->LogNewRawChoice(word_choice);
715
0
      word_res->LogNewCookedChoice(1, false, word_choice);
716
0
    } else {
717
0
      ++unlabelled_words;
718
0
      if (applybox_debug > 0) {
719
0
        tprintf("APPLY_BOXES: Unlabelled word at :");
720
0
        word_res->word->bounding_box().print();
721
0
      }
722
0
      pr_it.DeleteCurrentWord();
723
0
      delete word_choice;
724
0
    }
725
0
  }
726
0
  pr_it.restart_page();
727
0
  for (; (word_res = pr_it.word()) != nullptr; pr_it.forward()) {
728
    // Denormalize back to a BoxWord.
729
0
    word_res->RebuildBestState();
730
0
    word_res->SetupBoxWord();
731
0
    word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
732
0
    word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
733
0
  }
734
0
  if (applybox_debug > 0) {
735
0
    tprintf("   Found %d good blobs.\n", ok_blob_count);
736
0
    if (bad_blob_count > 0) {
737
0
      tprintf("   Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
738
0
    }
739
0
    if (unlabelled_words > 0) {
740
0
      tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
741
0
    }
742
0
  }
743
0
}
744
745
/** Logs a bad box by line in the box file and box coords.*/
746
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
747
0
                                const char *err_msg) {
748
0
  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch,
749
0
          box.left(), box.bottom(), box.right(), box.top(), err_msg);
750
0
}
751
752
/// Calls #LearnWord to extract features for labelled blobs within each word.
753
/// Features are stored in an internal buffer.
754
0
void Tesseract::ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res) {
755
0
  PAGE_RES_IT pr_it(page_res);
756
0
  int word_count = 0;
757
0
  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
758
0
    LearnWord(fontname.c_str(), word_res);
759
0
    ++word_count;
760
0
  }
761
0
  tprintf("Generated training data for %d words\n", word_count);
762
0
}
763
764
#endif // ndef DISABLED_LEGACY_ENGINE
765
766
/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
767
0
void Tesseract::CorrectClassifyWords(PAGE_RES *page_res) {
768
0
  PAGE_RES_IT pr_it(page_res);
769
0
  for (WERD_RES *word_res = pr_it.word(); word_res != nullptr; word_res = pr_it.forward()) {
770
0
    auto *choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
771
0
    for (auto &correct_text : word_res->correct_text) {
772
      // The part before the first space is the real ground truth, and the
773
      // rest is the bounding box location and page number.
774
0
      std::vector<std::string> tokens = split(correct_text, ' ');
775
0
      UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
776
0
      choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f);
777
0
    }
778
0
    word_res->ClearWordChoices();
779
0
    word_res->LogNewRawChoice(choice);
780
0
    word_res->LogNewCookedChoice(1, false, choice);
781
0
  }
782
0
}
783
784
} // namespace tesseract