Coverage Report

Created: 2025-06-13 07:02

/src/tesseract/src/ccmain/fixspace.cpp
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************
2
 * File:        fixspace.cpp  (Formerly fixspace.c)
3
 * Description: Implements a pass over the page res, exploring the alternative
4
 *              spacing possibilities, trying to use context to improve the
5
 *              word spacing
6
 * Author:      Phil Cheatle
7
 *
8
 * (C) Copyright 1993, Hewlett-Packard Ltd.
9
 ** Licensed under the Apache License, Version 2.0 (the "License");
10
 ** you may not use this file except in compliance with the License.
11
 ** You may obtain a copy of the License at
12
 ** http://www.apache.org/licenses/LICENSE-2.0
13
 ** Unless required by applicable law or agreed to in writing, software
14
 ** distributed under the License is distributed on an "AS IS" BASIS,
15
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
 ** See the License for the specific language governing permissions and
17
 ** limitations under the License.
18
 *
19
 **********************************************************************/
20
21
#include "fixspace.h"
22
23
#include "blobs.h"          // for TWERD, TBLOB, TESSLINE
24
#include "boxword.h"        // for BoxWord
25
#include "errcode.h"        // for ASSERT_HOST
26
#include "normalis.h"       // for kBlnXHeight, kBlnBaselineOffset
27
#include "pageres.h"        // for WERD_RES_IT, WERD_RES, WERD_RES_LIST
28
#include "params.h"         // for IntParam, StringParam, BoolParam, DoubleParam, ...
29
#include "ratngs.h"         // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM
30
#include "rect.h"           // for TBOX
31
#include "stepblob.h"       // for C_BLOB_IT, C_BLOB_LIST, C_BLOB
32
#include "tesseractclass.h" // for Tesseract, TesseractStats, WordData
33
#include "tessvars.h"       // for debug_fp
34
#include "tprintf.h"        // for tprintf
35
#include "unicharset.h"     // for UNICHARSET
36
#include "werd.h"           // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP
37
38
#include <tesseract/ocrclass.h> // for ETEXT_DESC
39
#include <tesseract/unichar.h>  // for UNICHAR_ID
40
41
#include <cstdint> // for INT16_MAX, int16_t, int32_t
42
43
namespace tesseract {
44
45
class BLOCK;
46
class ROW;
47
48
0
#define PERFECT_WERDS 999
49
50
/**********************************************************************
51
 *  c_blob_comparator()
52
 *
53
 *  Blob comparator used to sort a blob list so that blobs are in increasing
54
 *  order of left edge.
55
 **********************************************************************/
56
57
static int c_blob_comparator( // sort blobs
58
    const C_BLOB *blob1,
59
    const C_BLOB *blob2
60
0
) {
61
0
  return blob1->bounding_box().left() - blob2->bounding_box().left();
62
0
}
63
64
/**
65
 * @name fix_fuzzy_spaces()
66
 * Walk over the page finding sequences of words joined by fuzzy spaces. Extract
67
 * them as a sublist, process the sublist to find the optimal arrangement of
68
 * spaces then replace the sublist in the ROW_RES.
69
 *
70
 * @param monitor progress monitor
71
 * @param word_count count of words in doc
72
 * @param[out] page_res
73
 */
74
0
void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) {
75
0
  BLOCK_RES_IT block_res_it;
76
0
  ROW_RES_IT row_res_it;
77
0
  WERD_RES_IT word_res_it_from;
78
0
  WERD_RES_IT word_res_it_to;
79
0
  WERD_RES *word_res;
80
0
  WERD_RES_LIST fuzzy_space_words;
81
0
  int16_t new_length;
82
0
  bool prevent_null_wd_fixsp; // DON'T process blobless wds
83
0
  int32_t word_index;         // current word
84
85
0
  block_res_it.set_to_list(&page_res->block_res_list);
86
0
  word_index = 0;
87
0
  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {
88
0
    row_res_it.set_to_list(&block_res_it.data()->row_res_list);
89
0
    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {
90
0
      word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
91
0
      while (!word_res_it_from.at_last()) {
92
0
        word_res = word_res_it_from.data();
93
0
        while (!word_res_it_from.at_last() &&
94
0
               !(word_res->combination ||
95
0
                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
96
0
                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
97
0
          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
98
0
          word_res = word_res_it_from.forward();
99
0
          word_index++;
100
0
          if (monitor != nullptr) {
101
0
            monitor->ocr_alive = true;
102
0
            monitor->progress = 90 + 5 * word_index / word_count;
103
0
            if (monitor->deadline_exceeded() ||
104
0
                (monitor->cancel != nullptr &&
105
0
                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
106
0
              return;
107
0
            }
108
0
          }
109
0
        }
110
111
0
        if (!word_res_it_from.at_last()) {
112
0
          word_res_it_to = word_res_it_from;
113
0
          prevent_null_wd_fixsp = word_res->word->cblob_list()->empty();
114
0
          if (check_debug_pt(word_res, 60)) {
115
0
            debug_fix_space_level.set_value(10);
116
0
          }
117
0
          word_res_it_to.forward();
118
0
          word_index++;
119
0
          if (monitor != nullptr) {
120
0
            monitor->ocr_alive = true;
121
0
            monitor->progress = 90 + 5 * word_index / word_count;
122
0
            if (monitor->deadline_exceeded() ||
123
0
                (monitor->cancel != nullptr &&
124
0
                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {
125
0
              return;
126
0
            }
127
0
          }
128
0
          while (!word_res_it_to.at_last() &&
129
0
                 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
130
0
                  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
131
0
            if (check_debug_pt(word_res, 60)) {
132
0
              debug_fix_space_level.set_value(10);
133
0
            }
134
0
            if (word_res->word->cblob_list()->empty()) {
135
0
              prevent_null_wd_fixsp = true;
136
0
            }
137
0
            word_res = word_res_it_to.forward();
138
0
          }
139
0
          if (check_debug_pt(word_res, 60)) {
140
0
            debug_fix_space_level.set_value(10);
141
0
          }
142
0
          if (word_res->word->cblob_list()->empty()) {
143
0
            prevent_null_wd_fixsp = true;
144
0
          }
145
0
          if (prevent_null_wd_fixsp) {
146
0
            word_res_it_from = word_res_it_to;
147
0
          } else {
148
0
            fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);
149
0
            fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row,
150
0
                                 block_res_it.data()->block);
151
0
            new_length = fuzzy_space_words.length();
152
0
            word_res_it_from.add_list_before(&fuzzy_space_words);
153
0
            for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {
154
0
              word_res_it_from.forward();
155
0
            }
156
0
          }
157
0
          if (test_pt) {
158
0
            debug_fix_space_level.set_value(0);
159
0
          }
160
0
        }
161
0
        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
162
        // Last word in row
163
0
      }
164
0
    }
165
0
  }
166
0
}
167
168
0
void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
169
0
  int16_t best_score;
170
0
  WERD_RES_LIST current_perm;
171
0
  bool improved = false;
172
173
0
  best_score = eval_word_spacing(best_perm); // default score
174
0
  dump_words(best_perm, best_score, 1, improved);
175
176
0
  if (best_score != PERFECT_WERDS) {
177
0
    initialise_search(best_perm, current_perm);
178
0
  }
179
180
0
  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
181
0
    match_current_words(current_perm, row, block);
182
0
    int16_t current_score = eval_word_spacing(current_perm);
183
0
    dump_words(current_perm, current_score, 2, improved);
184
0
    if (current_score > best_score) {
185
0
      best_perm.clear();
186
0
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
187
0
      best_score = current_score;
188
0
      improved = true;
189
0
    }
190
0
    if (current_score < PERFECT_WERDS) {
191
0
      transform_to_next_perm(current_perm);
192
0
    }
193
0
  }
194
0
  dump_words(best_perm, best_score, 3, improved);
195
0
}
196
197
0
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
198
0
  WERD_RES_IT src_it(&src_list);
199
0
  WERD_RES_IT new_it(&new_list);
200
0
  WERD_RES *new_wd;
201
202
0
  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
203
0
    WERD_RES *src_wd = src_it.data();
204
0
    if (!src_wd->combination) {
205
0
      new_wd = WERD_RES::deep_copy(src_wd);
206
0
      new_wd->combination = false;
207
0
      new_wd->part_of_combo = false;
208
0
      new_it.add_after_then_move(new_wd);
209
0
    }
210
0
  }
211
0
}
212
213
0
void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) {
214
0
  WERD_RES_IT word_it(&words);
215
0
  WERD_RES *word;
216
  // Since we are not using PAGE_RES to iterate over words, we need to update
217
  // prev_word_best_choice_ before calling classify_word_pass2().
218
0
  prev_word_best_choice_ = nullptr;
219
0
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
220
0
    word = word_it.data();
221
0
    if ((!word->part_of_combo) && (word->box_word == nullptr)) {
222
0
      WordData word_data(block, row, word);
223
0
      SetupWordPassN(2, &word_data);
224
0
      classify_word_and_language(2, nullptr, &word_data);
225
0
    }
226
0
    prev_word_best_choice_ = word->best_choice;
227
0
  }
228
0
}
229
230
/**
231
 * @name eval_word_spacing()
232
 * The basic measure is the number of characters in contextually confirmed
233
 * words. (I.e the word is done)
234
 * If all words are contextually confirmed the evaluation is deemed perfect.
235
 *
236
 * Some fiddles are done to handle "1"s as these are VERY frequent causes of
237
 * fuzzy spaces. The problem with the basic measure is that "561 63" would score
238
 * the same as "56163", though given our knowledge that the space is fuzzy, and
239
 * that there is a "1" next to the fuzzy space, we need to ensure that "56163"
240
 * is preferred.
241
 *
242
 * The solution is to NOT COUNT the score of any word which has a digit at one
243
 * end and a "1Il" as the character the other side of the space.
244
 *
245
 * Conversely, any character next to a "1" within a word is counted as a
246
 * positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1
247
 * side of the "1" joined).  "56163" would score 7 - all chars in a numeric word
248
 * + 2 sides of a "1" joined.
249
 *
250
 * The joined 1 rule is applied to any word REGARDLESS of contextual
251
 * confirmation.  Thus "PS7a71 3/7a" scores 1 (neither word is contextually
252
 * confirmed). The only score is from the joined 1. "PS7a713/7a" scores 2.
253
 *
254
 */
255
0
int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
256
0
  WERD_RES_IT word_res_it(&word_res_list);
257
0
  int16_t total_score = 0;
258
0
  int16_t word_count = 0;
259
0
  int16_t done_word_count = 0;
260
0
  int i;
261
0
  int16_t offset;
262
0
  int16_t prev_word_score = 0;
263
0
  bool prev_word_done = false;
264
0
  bool prev_char_1 = false;     // prev ch a "1/I/l"?
265
0
  bool prev_char_digit = false; // prev ch 2..9 or 0
266
0
  const char *punct_chars = "!\"`',.:;";
267
0
  do {
268
    // current word
269
0
    WERD_RES *word = word_res_it.data();
270
0
    bool word_done = fixspace_thinks_word_done(word);
271
0
    word_count++;
272
0
    if (word->tess_failed) {
273
0
      total_score += prev_word_score;
274
0
      if (prev_word_done) {
275
0
        done_word_count++;
276
0
      }
277
0
      prev_word_score = 0;
278
0
      prev_char_1 = false;
279
0
      prev_char_digit = false;
280
0
      prev_word_done = false;
281
0
    } else {
282
      /*
283
  Can we add the prev word score and potentially count this word?
284
  Yes IF it didn't end in a 1 when the first char of this word is a digit
285
    AND it didn't end in a digit when the first char of this word is a 1
286
*/
287
0
      auto word_len = word->reject_map.length();
288
0
      bool current_word_ok_so_far = false;
289
0
      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
290
0
            (prev_char_digit &&
291
0
             ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
292
0
               word->best_choice->unichar_string()[0] == '1') ||
293
0
              (!word_done &&
294
0
               conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
295
0
        total_score += prev_word_score;
296
0
        if (prev_word_done) {
297
0
          done_word_count++;
298
0
        }
299
0
        current_word_ok_so_far = word_done;
300
0
      }
301
302
0
      if (current_word_ok_so_far) {
303
0
        prev_word_done = true;
304
0
        prev_word_score = word_len;
305
0
      } else {
306
0
        prev_word_done = false;
307
0
        prev_word_score = 0;
308
0
      }
309
310
      /* Add 1 to total score for every joined 1 regardless of context and
311
   rejtn */
312
0
      for (i = 0, prev_char_1 = false; i < word_len; i++) {
313
0
        bool current_char_1 = word->best_choice->unichar_string()[i] == '1';
314
0
        if (prev_char_1 || (current_char_1 && (i > 0))) {
315
0
          total_score++;
316
0
        }
317
0
        prev_char_1 = current_char_1;
318
0
      }
319
320
      /* Add 1 to total score for every joined punctuation regardless of context
321
  and rejtn */
322
0
      if (tessedit_prefer_joined_punct) {
323
0
        bool prev_char_punct;
324
0
        for (i = 0, offset = 0, prev_char_punct = false; i < word_len;
325
0
             offset += word->best_choice->unichar_lengths()[i++]) {
326
0
          bool current_char_punct =
327
0
              strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;
328
0
          if (prev_char_punct || (current_char_punct && i > 0)) {
329
0
            total_score++;
330
0
          }
331
0
          prev_char_punct = current_char_punct;
332
0
        }
333
0
      }
334
0
      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
335
0
      for (i = 0, offset = 0; i < word_len - 1;
336
0
           offset += word->best_choice->unichar_lengths()[i++]) {
337
0
        ;
338
0
      }
339
0
      prev_char_1 =
340
0
          ((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
341
0
           (!word_done &&
342
0
            conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
343
0
    }
344
    /* Find next word */
345
0
    do {
346
0
      word_res_it.forward();
347
0
    } while (word_res_it.data()->part_of_combo);
348
0
  } while (!word_res_it.at_first());
349
0
  total_score += prev_word_score;
350
0
  if (prev_word_done) {
351
0
    done_word_count++;
352
0
  }
353
0
  if (done_word_count == word_count) {
354
0
    return PERFECT_WERDS;
355
0
  } else {
356
0
    return total_score;
357
0
  }
358
0
}
359
360
0
bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
361
0
  int i;
362
0
  int offset;
363
364
0
  for (i = 0, offset = 0; i < char_position; offset += word->best_choice->unichar_lengths()[i++]) {
365
0
    ;
366
0
  }
367
0
  return (
368
0
      word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset,
369
0
                                 word->best_choice->unichar_lengths()[i]) ||
370
0
      (word->best_choice->permuter() == NUMBER_PERM &&
371
0
       numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset])));
372
0
}
373
374
/**
375
 * @name transform_to_next_perm()
376
 * Examines the current word list to find the smallest word gap size. Then walks
377
 * the word list closing any gaps of this size by either inserted new
378
 * combination words, or extending existing ones.
379
 *
380
 * The routine COULD be limited to stop it building words longer than N blobs.
381
 *
382
 * If there are no more gaps then it DELETES the entire list and returns the
383
 * empty list to cause termination.
384
 */
385
0
void transform_to_next_perm(WERD_RES_LIST &words) {
386
0
  WERD_RES_IT word_it(&words);
387
0
  WERD_RES_IT prev_word_it(&words);
388
0
  WERD_RES *word;
389
0
  WERD_RES *prev_word;
390
0
  int16_t prev_right = -INT16_MAX;
391
0
  TBOX box;
392
0
  int16_t gap;
393
0
  int16_t min_gap = INT16_MAX;
394
395
0
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
396
0
    word = word_it.data();
397
0
    if (!word->part_of_combo) {
398
0
      box = word->word->bounding_box();
399
0
      if (prev_right > -INT16_MAX) {
400
0
        gap = box.left() - prev_right;
401
0
        if (gap < min_gap) {
402
0
          min_gap = gap;
403
0
        }
404
0
      }
405
0
      prev_right = box.right();
406
0
    }
407
0
  }
408
0
  if (min_gap < INT16_MAX) {
409
0
    prev_right = -INT16_MAX; // back to start
410
0
    word_it.set_to_list(&words);
411
    // Note: we can't use cycle_pt due to inserted combos at start of list.
412
0
    for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) {
413
0
      word = word_it.data();
414
0
      if (!word->part_of_combo) {
415
0
        box = word->word->bounding_box();
416
0
        if (prev_right > -INT16_MAX) {
417
0
          gap = box.left() - prev_right;
418
0
          if (gap <= min_gap) {
419
0
            prev_word = prev_word_it.data();
420
0
            WERD_RES *combo;
421
0
            if (prev_word->combination) {
422
0
              combo = prev_word;
423
0
            } else {
424
              /* Make a new combination and insert before
425
               * the first word being joined. */
426
0
              auto *copy_word = new WERD;
427
0
              *copy_word = *(prev_word->word);
428
              // deep copy
429
0
              combo = new WERD_RES(copy_word);
430
0
              combo->combination = true;
431
0
              combo->x_height = prev_word->x_height;
432
0
              prev_word->part_of_combo = true;
433
0
              prev_word_it.add_before_then_move(combo);
434
0
            }
435
0
            combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
436
0
            if (word->combination) {
437
0
              combo->word->join_on(word->word);
438
              // Move blobs to combo
439
              // old combo no longer needed
440
0
              delete word_it.extract();
441
0
            } else {
442
              // Copy current wd to combo
443
0
              combo->copy_on(word);
444
0
              word->part_of_combo = true;
445
0
            }
446
0
            combo->done = false;
447
0
            combo->ClearResults();
448
0
          } else {
449
0
            prev_word_it = word_it; // catch up
450
0
          }
451
0
        }
452
0
        prev_right = box.right();
453
0
      }
454
0
    }
455
0
  } else {
456
0
    words.clear(); // signal termination
457
0
  }
458
0
}
459
460
0
void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) {
461
0
  WERD_RES_IT word_res_it(&perm);
462
463
0
  if (debug_fix_space_level > 0) {
464
0
    if (mode == 1) {
465
0
      stats_.dump_words_str = "";
466
0
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
467
0
        if (!word_res_it.data()->part_of_combo) {
468
0
          stats_.dump_words_str += word_res_it.data()->best_choice->unichar_string();
469
0
          stats_.dump_words_str += ' ';
470
0
        }
471
0
      }
472
0
    }
473
474
0
    if (debug_fix_space_level > 1) {
475
0
      switch (mode) {
476
0
        case 1:
477
0
          tprintf("EXTRACTED (%d): \"", score);
478
0
          break;
479
0
        case 2:
480
0
          tprintf("TESTED (%d): \"", score);
481
0
          break;
482
0
        case 3:
483
0
          tprintf("RETURNED (%d): \"", score);
484
0
          break;
485
0
      }
486
487
0
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
488
0
        if (!word_res_it.data()->part_of_combo) {
489
0
          tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
490
0
                  static_cast<int>(word_res_it.data()->best_choice->permuter()));
491
0
        }
492
0
      }
493
0
      tprintf("\"\n");
494
0
    } else if (improved) {
495
0
      tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
496
0
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
497
0
        if (!word_res_it.data()->part_of_combo) {
498
0
          tprintf("%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
499
0
                  static_cast<int>(word_res_it.data()->best_choice->permuter()));
500
0
        }
501
0
      }
502
0
      tprintf("\"\n");
503
0
    }
504
0
  }
505
0
}
506
507
0
bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
508
0
  if (word->done) {
509
0
    return true;
510
0
  }
511
512
  /*
513
  Use all the standard pass 2 conditions for mode 5 in set_done() in
514
  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
515
  CARE WHETHER WE HAVE of/at on/an etc.
516
*/
517
0
  if (fixsp_done_mode > 0 &&
518
0
      (word->tess_accepted || (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
519
0
       fixsp_done_mode == 3) &&
520
0
      (strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr) &&
521
0
      ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
522
0
       (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
523
0
       (word->best_choice->permuter() == USER_DAWG_PERM) ||
524
0
       (word->best_choice->permuter() == NUMBER_PERM))) {
525
0
    return true;
526
0
  } else {
527
0
    return false;
528
0
  }
529
0
}
530
531
/**
532
 * @name fix_sp_fp_word()
533
 * Test the current word to see if it can be split by deleting noise blobs. If
534
 * so, do the business.
535
 * Return with the iterator pointing to the same place if the word is unchanged,
536
 * or the last of the replacement words.
537
 */
538
0
void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) {
539
0
  WERD_RES *word_res;
540
0
  WERD_RES_LIST sub_word_list;
541
0
  WERD_RES_IT sub_word_list_it(&sub_word_list);
542
0
  int16_t new_length;
543
0
  float junk;
544
545
0
  word_res = word_res_it.data();
546
0
  if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo ||
547
0
      !word_res->word->flag(W_DONT_CHOP)) {
548
0
    return;
549
0
  }
550
551
0
  auto blob_index = worst_noise_blob(word_res, &junk);
552
0
  if (blob_index < 0) {
553
0
    return;
554
0
  }
555
556
0
  if (debug_fix_space_level > 1) {
557
0
    tprintf("FP fixspace working on \"%s\"\n", word_res->best_choice->unichar_string().c_str());
558
0
  }
559
0
  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
560
0
  sub_word_list_it.add_after_stay_put(word_res_it.extract());
561
0
  fix_noisy_space_list(sub_word_list, row, block);
562
0
  new_length = sub_word_list.length();
563
0
  word_res_it.add_list_before(&sub_word_list);
564
0
  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
565
0
    word_res_it.forward();
566
0
  }
567
0
}
568
569
0
void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
570
0
  int16_t best_score;
571
0
  WERD_RES_IT best_perm_it(&best_perm);
572
0
  WERD_RES_LIST current_perm;
573
0
  WERD_RES_IT current_perm_it(&current_perm);
574
0
  WERD_RES *old_word_res;
575
0
  int16_t current_score;
576
0
  bool improved = false;
577
578
0
  best_score = fp_eval_word_spacing(best_perm); // default score
579
580
0
  dump_words(best_perm, best_score, 1, improved);
581
582
0
  old_word_res = best_perm_it.data();
583
  // Even deep_copy doesn't copy the underlying WERD unless its combination
584
  // flag is true!.
585
0
  old_word_res->combination = true; // Kludge to force deep copy
586
0
  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
587
0
  old_word_res->combination = false; // Undo kludge
588
589
0
  break_noisiest_blob_word(current_perm);
590
591
0
  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
592
0
    match_current_words(current_perm, row, block);
593
0
    current_score = fp_eval_word_spacing(current_perm);
594
0
    dump_words(current_perm, current_score, 2, improved);
595
0
    if (current_score > best_score) {
596
0
      best_perm.clear();
597
0
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
598
0
      best_score = current_score;
599
0
      improved = true;
600
0
    }
601
0
    if (current_score < PERFECT_WERDS) {
602
0
      break_noisiest_blob_word(current_perm);
603
0
    }
604
0
  }
605
0
  dump_words(best_perm, best_score, 3, improved);
606
0
}
607
608
/**
609
 * break_noisiest_blob_word()
610
 * Find the word with the blob which looks like the worst noise.
611
 * Break the word into two, deleting the noise blob.
612
 */
613
0
void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
614
0
  WERD_RES_IT word_it(&words);
615
0
  WERD_RES_IT worst_word_it;
616
0
  float worst_noise_score = 9999;
617
0
  int worst_blob_index = -1; // Noisiest blob of noisiest wd
618
0
  float noise_score;         // of wds noisiest blob
619
0
  WERD_RES *word_res;
620
0
  C_BLOB_IT blob_it;
621
0
  C_BLOB_IT rej_cblob_it;
622
0
  C_BLOB_LIST new_blob_list;
623
0
  C_BLOB_IT new_blob_it;
624
0
  C_BLOB_IT new_rej_cblob_it;
625
0
  WERD *new_word;
626
0
  int16_t start_of_noise_blob;
627
0
  int16_t i;
628
629
0
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
630
0
    auto blob_index = worst_noise_blob(word_it.data(), &noise_score);
631
0
    if (blob_index > -1 && worst_noise_score > noise_score) {
632
0
      worst_noise_score = noise_score;
633
0
      worst_blob_index = blob_index;
634
0
      worst_word_it = word_it;
635
0
    }
636
0
  }
637
0
  if (worst_blob_index < 0) {
638
0
    words.clear(); // signal termination
639
0
    return;
640
0
  }
641
642
  /* Now split the worst_word_it */
643
644
0
  word_res = worst_word_it.data();
645
646
  /* Move blobs before noise blob to a new bloblist */
647
648
0
  new_blob_it.set_to_list(&new_blob_list);
649
0
  blob_it.set_to_list(word_res->word->cblob_list());
650
0
  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
651
0
    new_blob_it.add_after_then_move(blob_it.extract());
652
0
  }
653
0
  start_of_noise_blob = blob_it.data()->bounding_box().left();
654
0
  delete blob_it.extract(); // throw out noise blob
655
656
0
  new_word = new WERD(&new_blob_list, word_res->word);
657
0
  new_word->set_flag(W_EOL, false);
658
0
  word_res->word->set_flag(W_BOL, false);
659
0
  word_res->word->set_blanks(1); // After break
660
661
0
  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
662
0
  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
663
0
  for (; (!rej_cblob_it.empty() &&
664
0
          (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
665
0
       rej_cblob_it.forward()) {
666
0
    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
667
0
  }
668
669
0
  auto *new_word_res = new WERD_RES(new_word);
670
0
  new_word_res->combination = true;
671
0
  worst_word_it.add_before_then_move(new_word_res);
672
673
0
  word_res->ClearResults();
674
0
}
675
676
0
int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
677
0
  float noise_score[512];
678
0
  int min_noise_blob; // 1st contender
679
0
  int max_noise_blob; // last contender
680
0
  int non_noise_count;
681
0
  int worst_noise_blob; // Worst blob
682
0
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
683
0
  float non_noise_limit = kBlnXHeight * 0.8;
684
685
0
  if (word_res->rebuild_word == nullptr) {
686
0
    return -1; // Can't handle cube words.
687
0
  }
688
689
  // Normalised.
690
0
  auto blob_count = word_res->box_word->length();
691
0
  ASSERT_HOST(blob_count <= 512);
692
0
  if (blob_count < 5) {
693
0
    return -1; // too short to split
694
0
  }
695
696
    /* Get the noise scores for all blobs */
697
698
0
#ifndef SECURE_NAMES
699
0
  if (debug_fix_space_level > 5) {
700
0
    tprintf("FP fixspace Noise metrics for \"%s\": ",
701
0
            word_res->best_choice->unichar_string().c_str());
702
0
  }
703
0
#endif
704
705
0
  for (unsigned i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
706
0
    TBLOB *blob = word_res->rebuild_word->blobs[i];
707
0
    if (word_res->reject_map[i].accepted()) {
708
0
      noise_score[i] = non_noise_limit;
709
0
    } else {
710
0
      noise_score[i] = blob_noise_score(blob);
711
0
    }
712
713
0
    if (debug_fix_space_level > 5) {
714
0
      tprintf("%1.1f ", noise_score[i]);
715
0
    }
716
0
  }
717
0
  if (debug_fix_space_level > 5) {
718
0
    tprintf("\n");
719
0
  }
720
721
  /* Now find the worst one which is far enough away from the end of the word */
722
723
0
  non_noise_count = 0;
724
0
  int i;
725
0
  for (i = 0; static_cast<unsigned>(i) < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
726
0
    if (noise_score[i] >= non_noise_limit) {
727
0
      non_noise_count++;
728
0
    }
729
0
  }
730
0
  if (non_noise_count < fixsp_non_noise_limit) {
731
0
    return -1;
732
0
  }
733
734
0
  min_noise_blob = i;
735
736
0
  non_noise_count = 0;
737
0
  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit; i--) {
738
0
    if (noise_score[i] >= non_noise_limit) {
739
0
      non_noise_count++;
740
0
    }
741
0
  }
742
0
  if (non_noise_count < fixsp_non_noise_limit) {
743
0
    return -1;
744
0
  }
745
746
0
  max_noise_blob = i;
747
748
0
  if (min_noise_blob > max_noise_blob) {
749
0
    return -1;
750
0
  }
751
752
0
  *worst_noise_score = small_limit;
753
0
  worst_noise_blob = -1;
754
0
  for (auto i = min_noise_blob; i <= max_noise_blob; i++) {
755
0
    if (noise_score[i] < *worst_noise_score) {
756
0
      worst_noise_blob = i;
757
0
      *worst_noise_score = noise_score[i];
758
0
    }
759
0
  }
760
0
  return worst_noise_blob;
761
0
}
762
763
0
float Tesseract::blob_noise_score(TBLOB *blob) {
764
0
  TBOX box; // BB of outline
765
0
  int16_t outline_count = 0;
766
0
  int16_t max_dimension;
767
0
  int16_t largest_outline_dimension = 0;
768
769
0
  for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
770
0
    outline_count++;
771
0
    box = ol->bounding_box();
772
0
    if (box.height() > box.width()) {
773
0
      max_dimension = box.height();
774
0
    } else {
775
0
      max_dimension = box.width();
776
0
    }
777
778
0
    if (largest_outline_dimension < max_dimension) {
779
0
      largest_outline_dimension = max_dimension;
780
0
    }
781
0
  }
782
783
0
  if (outline_count > 5) {
784
    // penalise LOTS of blobs
785
0
    largest_outline_dimension *= 2;
786
0
  }
787
788
0
  box = blob->bounding_box();
789
0
  if (box.bottom() > kBlnBaselineOffset * 4 || box.top() < kBlnBaselineOffset / 2) {
790
    // Lax blob is if high or low
791
0
    largest_outline_dimension /= 2;
792
0
  }
793
794
0
  return largest_outline_dimension;
795
0
}
796
797
0
void fixspace_dbg(WERD_RES *word) {
798
0
  TBOX box = word->word->bounding_box();
799
0
  const bool show_map_detail = false;
800
801
0
  box.print();
802
0
  tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
803
0
  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", word->word->cblob_list()->length(),
804
0
          word->rebuild_word->NumBlobs(), word->box_word->length());
805
0
  word->reject_map.print(debug_fp);
806
0
  tprintf("\n");
807
0
  if (show_map_detail) {
808
0
    tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
809
0
    for (unsigned i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
810
0
      tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
811
0
      word->reject_map[i].full_print(debug_fp);
812
0
    }
813
0
  }
814
815
0
  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
816
0
  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
817
0
}
818
819
/**
820
 * fp_eval_word_spacing()
821
 * Evaluation function for fixed pitch word lists.
822
 *
823
 * Basically, count the number of "nice" characters - those which are in tess
824
 * acceptable words or in dict words and are not rejected.
825
 * Penalise any potential noise chars
826
 */
827
0
int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
828
0
  WERD_RES_IT word_it(&word_res_list);
829
0
  WERD_RES *word;
830
0
  int16_t score = 0;
831
0
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
832
833
0
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
834
0
    word = word_it.data();
835
0
    if (word->rebuild_word == nullptr) {
836
0
      continue; // Can't handle cube words.
837
0
    }
838
0
    if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
839
0
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
840
0
        word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) {
841
0
      auto num_blobs = word->rebuild_word->NumBlobs();
842
0
      UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
843
0
      for (unsigned i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
844
0
        TBLOB *blob = word->rebuild_word->blobs[i];
845
0
        if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) {
846
0
          score -= 1; // penalise possibly erroneous non-space
847
0
        } else if (word->reject_map[i].accepted()) {
848
0
          score++;
849
0
        }
850
0
      }
851
0
    }
852
0
  }
853
0
  if (score < 0) {
854
0
    score = 0;
855
0
  }
856
0
  return score;
857
0
}
858
859
} // namespace tesseract