Coverage Report

Created: 2025-06-13 07:15

/src/tesseract/src/textord/tospace.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed under the Apache License, Version 2.0 (the "License");
2
// you may not use this file except in compliance with the License.
3
// You may obtain a copy of the License at
4
// http://www.apache.org/licenses/LICENSE-2.0
5
// Unless required by applicable law or agreed to in writing, software
6
// distributed under the License is distributed on an "AS IS" BASIS,
7
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8
// See the License for the specific language governing permissions and
9
// limitations under the License.
10
/**********************************************************************
11
 * tospace.cpp
12
 *
13
 * Compute fuzzy word spacing thresholds for each row.
14
 * I.e. set :   max_nonspace
15
 *              space_threshold
16
 *              min_space
17
 *              kern_size
18
 *              space_size
19
 * for each row.
20
 * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
21
 *
22
 * Note: functions in this file were originally not members of any
23
 * class or enclosed by any namespace. Now they are all static members
24
 * of the Textord class.
25
 *
26
 **********************************************************************/
27
28
#include "drawtord.h"
29
#include "statistc.h"
30
#include "textord.h"
31
#include "tovars.h"
32
33
// Include automatically generated configuration file if running autoconf.
34
#ifdef HAVE_CONFIG_H
35
#  include "config_auto.h"
36
#endif
37
38
#include <algorithm>
39
#include <cmath>
40
#include <memory>
41
42
1.21M
#define MAXSPACING 128 /*max expected spacing in pix */
43
44
namespace tesseract {
45
void Textord::to_spacing(ICOORD page_tr,       // topright of page
46
                         TO_BLOCK_LIST *blocks // blocks on page
47
15.4k
) {
48
15.4k
  TO_BLOCK_IT block_it; // iterator
49
15.4k
  TO_BLOCK *block;      // current block;
50
15.4k
  TO_ROW *row;          // current row
51
15.4k
  int block_index;      // block number
52
15.4k
  int row_index;        // row number
53
  // estimated width of real spaces for whole block
54
15.4k
  int16_t block_space_gap_width;
55
  // estimated width of non space gaps for whole block
56
15.4k
  int16_t block_non_space_gap_width;
57
15.4k
  bool old_text_ord_proportional; // old fixed/prop result
58
59
15.4k
  block_it.set_to_list(blocks);
60
15.4k
  block_index = 1;
61
30.9k
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
62
15.4k
    block = block_it.data();
63
15.4k
    std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk
64
15.4k
    block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width,
65
15.4k
                        block_non_space_gap_width);
66
    // Make sure relative values of block-level space and non-space gap
67
    // widths are reasonable. The ratio of 1:3 is also used in
68
    // block_spacing_stats, to correct the block_space_gap_width.
69
    // Useful for arabic and hindi, when the non-space gap width is
70
    // often over-estimated and should not be trusted. A similar ratio
71
    // is found in block_spacing_stats.
72
15.4k
    if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
73
15.4k
        block_non_space_gap_width > block_space_gap_width / 3) {
74
0
      block_non_space_gap_width = block_space_gap_width / 3;
75
0
    }
76
    // row iterator
77
15.4k
    TO_ROW_IT row_it(block->get_rows());
78
15.4k
    row_index = 1;
79
194k
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
80
178k
      row = row_it.data();
81
178k
      if ((row->pitch_decision == PITCH_DEF_PROP) || (row->pitch_decision == PITCH_CORR_PROP)) {
82
173k
        if ((tosp_debug_level > 0) && !old_text_ord_proportional) {
83
0
          tprintf("Block %d Row %d: Now Proportional\n", block_index, row_index);
84
0
        }
85
173k
        row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width,
86
173k
                          block_non_space_gap_width);
87
173k
      } else {
88
5.71k
        if ((tosp_debug_level > 0) && old_text_ord_proportional) {
89
0
          tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index,
90
0
                  row_index, row->pitch_decision, row->fixed_pitch);
91
0
        }
92
5.71k
      }
93
#ifndef GRAPHICS_DISABLED
94
      if (textord_show_initial_words) {
95
        plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
96
      }
97
#endif
98
178k
      row_index++;
99
178k
    }
100
15.4k
    block_index++;
101
15.4k
  }
102
15.4k
}
103
104
/*************************************************************************
105
 * block_spacing_stats()
106
 *************************************************************************/
107
108
void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,
109
                                  int16_t &block_space_gap_width,    // resulting estimate
110
                                  int16_t &block_non_space_gap_width // resulting estimate
111
15.4k
) {
112
15.4k
  TO_ROW *row;         // current row
113
15.4k
  BLOBNBOX_IT blob_it; // iterator
114
115
15.4k
  STATS centre_to_centre_stats(0, MAXSPACING - 1);
116
  // DEBUG USE ONLY
117
15.4k
  STATS all_gap_stats(0, MAXSPACING - 1);
118
15.4k
  STATS space_gap_stats(0, MAXSPACING - 1);
119
15.4k
  int16_t minwidth = MAXSPACING; // narrowest blob
120
15.4k
  TBOX blob_box;
121
15.4k
  TBOX prev_blob_box;
122
15.4k
  int16_t centre_to_centre;
123
15.4k
  int16_t gap_width;
124
15.4k
  float real_space_threshold;
125
15.4k
  float iqr_centre_to_centre; // DEBUG USE ONLY
126
15.4k
  float iqr_all_gap_stats;    // DEBUG USE ONLY
127
15.4k
  int32_t end_of_row;
128
15.4k
  int32_t row_length;
129
130
  // row iterator
131
15.4k
  TO_ROW_IT row_it(block->get_rows());
132
194k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
133
178k
    row = row_it.data();
134
178k
    if (!row->blob_list()->empty() &&
135
178k
        (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
136
178k
         (row->pitch_decision == PITCH_CORR_PROP))) {
137
173k
      blob_it.set_to_list(row->blob_list());
138
173k
      blob_it.mark_cycle_pt();
139
173k
      end_of_row = blob_it.data_relative(-1)->bounding_box().right();
140
173k
      if (tosp_use_pre_chopping) {
141
0
        blob_box = box_next_pre_chopped(&blob_it);
142
173k
      } else if (tosp_stats_use_xht_gaps) {
143
173k
        blob_box = reduced_box_next(row, &blob_it);
144
173k
      } else {
145
0
        blob_box = box_next(&blob_it);
146
0
      }
147
173k
      row_length = end_of_row - blob_box.left();
148
173k
      if (blob_box.width() < minwidth) {
149
21.5k
        minwidth = blob_box.width();
150
21.5k
      }
151
173k
      prev_blob_box = blob_box;
152
1.23M
      while (!blob_it.cycled_list()) {
153
1.06M
        if (tosp_use_pre_chopping) {
154
0
          blob_box = box_next_pre_chopped(&blob_it);
155
1.06M
        } else if (tosp_stats_use_xht_gaps) {
156
1.06M
          blob_box = reduced_box_next(row, &blob_it);
157
1.06M
        } else {
158
0
          blob_box = box_next(&blob_it);
159
0
        }
160
1.06M
        if (blob_box.width() < minwidth) {
161
8.10k
          minwidth = blob_box.width();
162
8.10k
        }
163
1.06M
        int16_t left = prev_blob_box.right();
164
1.06M
        int16_t right = blob_box.left();
165
1.06M
        gap_width = right - left;
166
1.06M
        if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
167
1.05M
          all_gap_stats.add(gap_width, 1);
168
169
1.05M
          centre_to_centre = (right + blob_box.right() - (prev_blob_box.left() + left)) / 2;
170
          // DEBUG
171
1.05M
          centre_to_centre_stats.add(centre_to_centre, 1);
172
          // DEBUG
173
1.05M
        }
174
1.06M
        prev_blob_box = blob_box;
175
1.06M
      }
176
173k
    }
177
178k
  }
178
179
  // Inadequate samples
180
15.4k
  if (all_gap_stats.get_total() <= 1) {
181
6.42k
    block_non_space_gap_width = minwidth;
182
6.42k
    block_space_gap_width = -1; // No est. space width
183
                                // DEBUG
184
6.42k
    old_text_ord_proportional = true;
185
9.04k
  } else {
186
    /* For debug only ..... */
187
9.04k
    iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25);
188
9.04k
    iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25);
189
9.04k
    old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats;
190
    /* .......For debug only */
191
192
    /*
193
The median of the gaps is used as an estimate of the NON-SPACE gap width.
194
This RELIES on the assumption that there are more gaps WITHIN words than
195
BETWEEN words in a block
196
197
Now try to estimate the width of a real space for all real spaces in the
198
block. Do this by using a crude threshold to ignore "narrow" gaps, then
199
find the median of the "wide" gaps and use this.
200
*/
201
9.04k
    block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median()));
202
    // median gap
203
204
9.04k
    row_it.set_to_list(block->get_rows());
205
143k
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
206
134k
      row = row_it.data();
207
134k
      if (!row->blob_list()->empty() &&
208
134k
          (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
209
134k
           (row->pitch_decision == PITCH_CORR_PROP))) {
210
130k
        real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width,
211
130k
                                        tosp_init_guess_xht_mult * row->xheight);
212
130k
        blob_it.set_to_list(row->blob_list());
213
130k
        blob_it.mark_cycle_pt();
214
130k
        end_of_row = blob_it.data_relative(-1)->bounding_box().right();
215
130k
        if (tosp_use_pre_chopping) {
216
0
          blob_box = box_next_pre_chopped(&blob_it);
217
130k
        } else if (tosp_stats_use_xht_gaps) {
218
130k
          blob_box = reduced_box_next(row, &blob_it);
219
130k
        } else {
220
0
          blob_box = box_next(&blob_it);
221
0
        }
222
130k
        row_length = blob_box.left() - end_of_row;
223
130k
        prev_blob_box = blob_box;
224
1.19M
        while (!blob_it.cycled_list()) {
225
1.06M
          if (tosp_use_pre_chopping) {
226
0
            blob_box = box_next_pre_chopped(&blob_it);
227
1.06M
          } else if (tosp_stats_use_xht_gaps) {
228
1.06M
            blob_box = reduced_box_next(row, &blob_it);
229
1.06M
          } else {
230
0
            blob_box = box_next(&blob_it);
231
0
          }
232
1.06M
          int16_t left = prev_blob_box.right();
233
1.06M
          int16_t right = blob_box.left();
234
1.06M
          gap_width = right - left;
235
1.06M
          if ((gap_width > real_space_threshold) &&
236
1.06M
              !ignore_big_gap(row, row_length, gapmap, left, right)) {
237
            /*
238
If tosp_use_cert_spaces is enabled, the estimate of the space gap is
239
restricted to obvious spaces - those wider than half the xht or
240
those with wide blobs on both sides - i.e not things that are
241
suspect 1's or punctuation that is sometimes widely spaced.
242
*/
243
110k
            if (!tosp_block_use_cert_spaces ||
244
110k
                (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
245
110k
                ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
246
77.2k
                 (!tosp_narrow_blobs_not_cert ||
247
20.0k
                  (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
248
110k
                (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
249
38.3k
              space_gap_stats.add(gap_width, 1);
250
38.3k
            }
251
110k
          }
252
1.06M
          prev_blob_box = blob_box;
253
1.06M
        }
254
130k
      }
255
134k
    }
256
    // Inadequate samples
257
9.04k
    if (space_gap_stats.get_total() <= 2) {
258
7.36k
      block_space_gap_width = -1; // No est. space width
259
7.36k
    } else {
260
1.68k
      block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())),
261
1.68k
                                       static_cast<int16_t>(3 * block_non_space_gap_width));
262
1.68k
    }
263
9.04k
  }
264
15.4k
}
265
266
/*************************************************************************
267
 * row_spacing_stats()
268
 * Set values for min_space, max_non_space based on row stats only
269
 * If failure - return 0 values.
270
 *************************************************************************/
271
void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,
272
                                int16_t block_space_gap_width,    // estimate for block
273
                                int16_t block_non_space_gap_width // estimate for block
274
173k
) {
275
  // iterator
276
173k
  BLOBNBOX_IT blob_it = row->blob_list();
277
173k
  STATS all_gap_stats(0, MAXSPACING - 1);
278
173k
  STATS cert_space_gap_stats(0, MAXSPACING - 1);
279
173k
  STATS all_space_gap_stats(0, MAXSPACING - 1);
280
173k
  STATS small_gap_stats(0, MAXSPACING - 1);
281
173k
  TBOX blob_box;
282
173k
  TBOX prev_blob_box;
283
173k
  int16_t gap_width;
284
173k
  int16_t real_space_threshold = 0;
285
173k
  int16_t max = 0;
286
173k
  int16_t large_gap_count = 0;
287
173k
  bool suspected_table;
288
173k
  bool good_block_space_estimate = block_space_gap_width > 0;
289
173k
  int32_t end_of_row;
290
173k
  int32_t row_length = 0;
291
173k
  float sane_space;
292
173k
  int32_t sane_threshold;
293
294
  /* Collect first pass stats for row */
295
296
173k
  if (!good_block_space_estimate) {
297
138k
    block_space_gap_width = int16_t(std::floor(row->xheight / 2));
298
138k
  }
299
173k
  if (!row->blob_list()->empty()) {
300
173k
    if (tosp_threshold_bias1 > 0) {
301
0
      real_space_threshold =
302
0
          block_non_space_gap_width +
303
0
          int16_t(floor(0.5 + tosp_threshold_bias1 *
304
0
                                  (block_space_gap_width - block_non_space_gap_width)));
305
173k
    } else {
306
173k
      real_space_threshold = // Old TO method
307
173k
          (block_space_gap_width + block_non_space_gap_width) / 2;
308
173k
    }
309
173k
    blob_it.set_to_list(row->blob_list());
310
173k
    blob_it.mark_cycle_pt();
311
173k
    end_of_row = blob_it.data_relative(-1)->bounding_box().right();
312
173k
    if (tosp_use_pre_chopping) {
313
0
      blob_box = box_next_pre_chopped(&blob_it);
314
173k
    } else if (tosp_stats_use_xht_gaps) {
315
173k
      blob_box = reduced_box_next(row, &blob_it);
316
173k
    } else {
317
0
      blob_box = box_next(&blob_it);
318
0
    }
319
173k
    row_length = end_of_row - blob_box.left();
320
173k
    prev_blob_box = blob_box;
321
1.23M
    while (!blob_it.cycled_list()) {
322
1.06M
      if (tosp_use_pre_chopping) {
323
0
        blob_box = box_next_pre_chopped(&blob_it);
324
1.06M
      } else if (tosp_stats_use_xht_gaps) {
325
1.06M
        blob_box = reduced_box_next(row, &blob_it);
326
1.06M
      } else {
327
0
        blob_box = box_next(&blob_it);
328
0
      }
329
1.06M
      int16_t left = prev_blob_box.right();
330
1.06M
      int16_t right = blob_box.left();
331
1.06M
      gap_width = right - left;
332
1.06M
      if (ignore_big_gap(row, row_length, gapmap, left, right)) {
333
10.3k
        large_gap_count++;
334
1.05M
      } else {
335
1.05M
        if (gap_width >= real_space_threshold) {
336
137k
          if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
337
137k
              ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
338
96.0k
               (!tosp_narrow_blobs_not_cert ||
339
22.5k
                (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
340
137k
              (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
341
49.7k
            cert_space_gap_stats.add(gap_width, 1);
342
49.7k
          }
343
137k
          all_space_gap_stats.add(gap_width, 1);
344
918k
        } else {
345
918k
          small_gap_stats.add(gap_width, 1);
346
918k
        }
347
1.05M
        all_gap_stats.add(gap_width, 1);
348
1.05M
      }
349
1.06M
      prev_blob_box = blob_box;
350
1.06M
    }
351
173k
  }
352
173k
  suspected_table = (large_gap_count > 1) ||
353
173k
                    ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples));
354
355
  /* Now determine row kern size, space size and threshold */
356
357
173k
  if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) ||
358
173k
      ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) &&
359
167k
       cert_space_gap_stats.get_total() > 0)) {
360
18.1k
    old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats,
361
18.1k
                  block_space_gap_width, block_non_space_gap_width);
362
155k
  } else {
363
155k
    if (!tosp_recovery_isolated_row_stats ||
364
155k
        !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) {
365
146k
      if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) {
366
0
        tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx);
367
0
      }
368
146k
      if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
369
        // Use block default
370
17.1k
        row->space_size = block_space_gap_width;
371
17.1k
        if (all_gap_stats.get_total() > tosp_redo_kern_limit) {
372
6.27k
          row->kern_size = all_gap_stats.median();
373
10.8k
        } else {
374
10.8k
          row->kern_size = block_non_space_gap_width;
375
10.8k
        }
376
17.1k
        row->space_threshold =
377
17.1k
            int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
378
129k
      } else {
379
129k
        old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats,
380
129k
                      block_space_gap_width, block_non_space_gap_width);
381
129k
      }
382
146k
    }
383
155k
  }
384
385
173k
  if (tosp_improve_thresh && !suspected_table) {
386
0
    improve_row_threshold(row, &all_gap_stats);
387
0
  }
388
389
  /* Now lets try to be careful not to do anything silly with tables when we
390
are ignoring big gaps*/
391
173k
  if (tosp_sanity_method == 0) {
392
0
    if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
393
0
      if (tosp_debug_level > 5) {
394
0
        tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx,
395
0
                row->kern_size, row->space_threshold, row->space_size);
396
0
      }
397
0
      row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);
398
0
      row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
399
0
    }
400
173k
  } else if (tosp_sanity_method == 1) {
401
173k
    sane_space = row->space_size;
402
    /* NEVER let space size get too close to kern size */
403
173k
    if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
404
173k
        ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) {
405
24.3k
      if (good_block_space_estimate &&
406
24.3k
          (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) {
407
1.28k
        sane_space = block_space_gap_width;
408
23.0k
      } else {
409
23.0k
        sane_space =
410
23.0k
            std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
411
23.0k
                     row->xheight / 2.0f);
412
23.0k
      }
413
24.3k
      if (tosp_debug_level > 5) {
414
0
        tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx,
415
0
                row->kern_size, row->space_threshold, row->space_size, sane_space);
416
0
      }
417
24.3k
      row->space_size = sane_space;
418
24.3k
      row->space_threshold =
419
24.3k
          int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
420
24.3k
    }
421
    /* NEVER let threshold get VERY far away from kern */
422
173k
    sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f)));
423
173k
    if (row->space_threshold > sane_threshold) {
424
4.35k
      if (tosp_debug_level > 5) {
425
0
        tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx,
426
0
                row->kern_size, row->space_threshold, row->space_size, sane_threshold);
427
0
      }
428
4.35k
      row->space_threshold = sane_threshold;
429
4.35k
      if (row->space_size <= sane_threshold) {
430
0
        row->space_size = row->space_threshold + 1.0f;
431
0
      }
432
4.35k
    }
433
    /* Beware of tables - there may be NO spaces */
434
173k
    if (suspected_table) {
435
5.89k
      sane_space =
436
5.89k
          std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight);
437
5.89k
      sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2));
438
439
5.89k
      if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) {
440
758
        if (tosp_debug_level > 5) {
441
0
          tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx,
442
0
                  row->kern_size, row->space_threshold, row->space_size);
443
0
        }
444
        // the minimum sane value
445
758
        row->space_threshold = static_cast<int32_t>(sane_space);
446
758
        row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
447
758
      }
448
5.89k
    }
449
173k
  }
450
451
  /* Now lets try to put some error limits on the threshold */
452
453
173k
  if (tosp_old_to_method) {
454
    /* Old textord made a space if gap >= threshold */
455
    // NO FUZZY SPACES YET
456
0
    row->max_nonspace = row->space_threshold;
457
    // NO FUZZY SPACES       YET
458
0
    row->min_space = row->space_threshold + 1;
459
173k
  } else {
460
    /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
461
173k
    row->min_space =
462
173k
        std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size));
463
173k
    if (row->min_space <= row->space_threshold) {
464
      // Don't be silly
465
30.5k
      row->min_space = row->space_threshold + 1;
466
30.5k
    }
467
    /*
468
Lets try to guess the max certain kern gap by looking at the cluster of
469
kerns for the row. The row is proportional so the kerns should cluster
470
tightly at the bottom of the distribution. We also expect most gaps to be
471
kerns. Find the maximum of the kern piles between 0 and twice the kern
472
estimate. Piles before the first one with less than 1/10 the maximum
473
number of samples can be taken as certain kerns.
474
475
  Of course, there are some cases where the kern peak and space peaks merge,
476
  so we will put an UPPER limit on the max certain kern gap of some fraction
477
  below the threshold.
478
*/
479
480
    // upper bound
481
173k
    int32_t max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2);
482
483
    // default
484
173k
    row->max_nonspace = max_max_nonspace;
485
877k
    for (int32_t index = 0; index <= max_max_nonspace; index++) {
486
776k
      if (all_gap_stats.pile_count(index) > max) {
487
121k
        max = all_gap_stats.pile_count(index);
488
121k
      }
489
776k
      if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) {
490
71.8k
        row->max_nonspace = index;
491
71.8k
        break;
492
71.8k
      }
493
776k
    }
494
173k
  }
495
496
  /* Yet another algorithm - simpler this time - just choose a fraction of the
497
threshold to space range */
498
499
173k
  if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) {
500
173k
    row->min_space = std::max(
501
173k
        row->min_space, static_cast<int32_t>(ceil(row->space_threshold +
502
173k
                                                  tosp_fuzzy_sp_fraction *
503
173k
                                                      (row->space_size - row->space_threshold))));
504
173k
  }
505
506
  /* Ensure that ANY space less than some multiplier times the kern size is
507
fuzzy.  In tables there is a risk of erroneously setting a small space size
508
when there are no real spaces. Sometimes tables have text squashed into
509
columns so that the kn->sp ratio is small anyway - this means that we can't
510
use this to force a wider separation - hence we rely on context to join any
511
dubious breaks. */
512
513
173k
  if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) {
514
173k
    row->min_space = std::max(
515
173k
        row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size)));
516
173k
  }
517
518
173k
  if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
519
172k
    row->max_nonspace = static_cast<int32_t>(floor(
520
172k
        0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)));
521
172k
  }
522
173k
  if (row->max_nonspace > row->space_threshold) {
523
    // Don't be silly
524
0
    row->max_nonspace = row->space_threshold;
525
0
  }
526
527
173k
  if (tosp_debug_level > 5) {
528
0
    tprintf(
529
0
        "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) "
530
0
        "Sp:%3.2f\n",
531
0
        block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width,
532
0
        real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold,
533
0
        row->min_space, row->space_size);
534
0
  }
535
173k
  if (tosp_debug_level > 10) {
536
0
    tprintf(
537
0
        "row->kern_size = %3.2f, row->space_size = %3.2f, "
538
0
        "row->space_threshold = %d\n",
539
0
        row->kern_size, row->space_size, row->space_threshold);
540
0
  }
541
173k
}
542
543
void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,
544
                            STATS *small_gap_stats,
545
                            int16_t block_space_gap_width,    // estimate for block
546
                            int16_t block_non_space_gap_width // estimate for block
547
147k
) {
548
  /* First, estimate row space size */
549
  /* Old to condition was > 2 */
550
147k
  if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) {
551
    // Adequate samples
552
    /* Set space size to median of spaces BUT limits it if it seems wildly out
553
     */
554
9.83k
    row->space_size = space_gap_stats->median();
555
9.83k
    if (row->space_size > block_space_gap_width * 1.5) {
556
1.08k
      if (tosp_old_to_bug_fix) {
557
0
        row->space_size = block_space_gap_width * 1.5;
558
1.08k
      } else {
559
        // BUG??? should be *1.5
560
1.08k
        row->space_size = block_space_gap_width;
561
1.08k
      }
562
1.08k
    }
563
9.83k
    if (row->space_size < (block_non_space_gap_width * 2) + 1) {
564
2.70k
      row->space_size = (block_non_space_gap_width * 2) + 1;
565
2.70k
    }
566
9.83k
  }
567
  // Only 1 or 2 samples
568
138k
  else if (space_gap_stats->get_total() >= 1) {
569
    // hence mean not median
570
28.9k
    row->space_size = space_gap_stats->mean();
571
28.9k
    if (row->space_size > block_space_gap_width * 1.5) {
572
3.74k
      if (tosp_old_to_bug_fix) {
573
0
        row->space_size = block_space_gap_width * 1.5;
574
3.74k
      } else {
575
        // BUG??? should be *1.5
576
3.74k
        row->space_size = block_space_gap_width;
577
3.74k
      }
578
3.74k
    }
579
28.9k
    if (row->space_size < (block_non_space_gap_width * 3) + 1) {
580
13.2k
      row->space_size = (block_non_space_gap_width * 3) + 1;
581
13.2k
    }
582
109k
  } else {
583
    // Use block default
584
109k
    row->space_size = block_space_gap_width;
585
109k
  }
586
587
  /* Next, estimate row kern size */
588
147k
  if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) {
589
0
    row->kern_size = small_gap_stats->median();
590
147k
  } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) {
591
15.7k
    row->kern_size = all_gap_stats->median();
592
132k
  } else { // old TO -SAME FOR ALL ROWS
593
132k
    row->kern_size = block_non_space_gap_width;
594
132k
  }
595
596
  /* Finally, estimate row space threshold */
597
147k
  if (tosp_threshold_bias2 > 0) {
598
0
    row->space_threshold = int32_t(
599
0
        floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size)));
600
147k
  } else {
601
    /*
602
  NOTE old text ord uses (space_size + kern_size + 1)/2  as the threshold
603
and holds this in a float. The use is with a >= test
604
NEW textord uses an integer threshold and a > test
605
It comes to the same thing.
606
  (Though there is a difference in that old textor has integer space_size
607
  and kern_size.)
608
*/
609
147k
    row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
610
147k
  }
611
612
  // Apply the same logic and ratios as in row_spacing_stats to
613
  // restrict relative values of the row's space_size, kern_size, and
614
  // space_threshold
615
147k
  if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&
616
147k
      ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
617
0
       ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) {
618
0
    if (row->kern_size > 2.5) {
619
0
      row->kern_size = row->space_size / tosp_min_sane_kn_sp;
620
0
    }
621
0
    row->space_threshold =
622
0
        int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
623
0
  }
624
147k
}
625
626
/*************************************************************************
627
 * isolated_row_stats()
628
 * Set values for min_space, max_non_space based on row stats only
629
 *************************************************************************/
630
bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats,
631
155k
                                 bool suspected_table, int16_t block_idx, int16_t row_idx) {
632
155k
  float kern_estimate;
633
155k
  float crude_threshold_estimate;
634
155k
  int16_t small_gaps_count;
635
155k
  int16_t total;
636
  // iterator
637
155k
  BLOBNBOX_IT blob_it = row->blob_list();
638
155k
  STATS cert_space_gap_stats(0, MAXSPACING - 1);
639
155k
  STATS all_space_gap_stats(0, MAXSPACING - 1);
640
155k
  STATS small_gap_stats(0, MAXSPACING - 1);
641
155k
  TBOX blob_box;
642
155k
  TBOX prev_blob_box;
643
155k
  int16_t gap_width;
644
155k
  int32_t end_of_row;
645
155k
  int32_t row_length;
646
647
155k
  kern_estimate = all_gap_stats->median();
648
155k
  crude_threshold_estimate =
649
155k
      std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight);
650
155k
  small_gaps_count =
651
155k
      stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate)));
652
155k
  total = all_gap_stats->get_total();
653
654
155k
  if ((total <= tosp_redo_kern_limit) ||
655
155k
      ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) ||
656
155k
      (total - small_gaps_count < 1)) {
657
146k
    if (tosp_debug_level > 5) {
658
0
      tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx);
659
0
    }
660
146k
    return false;
661
146k
  }
662
8.13k
  blob_it.set_to_list(row->blob_list());
663
8.13k
  blob_it.mark_cycle_pt();
664
8.13k
  end_of_row = blob_it.data_relative(-1)->bounding_box().right();
665
8.13k
  if (tosp_use_pre_chopping) {
666
0
    blob_box = box_next_pre_chopped(&blob_it);
667
8.13k
  } else if (tosp_stats_use_xht_gaps) {
668
8.13k
    blob_box = reduced_box_next(row, &blob_it);
669
8.13k
  } else {
670
0
    blob_box = box_next(&blob_it);
671
0
  }
672
8.13k
  row_length = end_of_row - blob_box.left();
673
8.13k
  prev_blob_box = blob_box;
674
245k
  while (!blob_it.cycled_list()) {
675
237k
    if (tosp_use_pre_chopping) {
676
0
      blob_box = box_next_pre_chopped(&blob_it);
677
237k
    } else if (tosp_stats_use_xht_gaps) {
678
237k
      blob_box = reduced_box_next(row, &blob_it);
679
237k
    } else {
680
0
      blob_box = box_next(&blob_it);
681
0
    }
682
237k
    int16_t left = prev_blob_box.right();
683
237k
    int16_t right = blob_box.left();
684
237k
    gap_width = right - left;
685
237k
    if (!ignore_big_gap(row, row_length, gapmap, left, right) &&
686
237k
        (gap_width > crude_threshold_estimate)) {
687
21.7k
      if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
688
21.7k
          ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
689
19.6k
           (!tosp_narrow_blobs_not_cert ||
690
6.56k
            (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
691
21.7k
          (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
692
2.72k
        cert_space_gap_stats.add(gap_width, 1);
693
2.72k
      }
694
21.7k
      all_space_gap_stats.add(gap_width, 1);
695
21.7k
    }
696
237k
    if (gap_width < crude_threshold_estimate) {
697
215k
      small_gap_stats.add(gap_width, 1);
698
215k
    }
699
700
237k
    prev_blob_box = blob_box;
701
237k
  }
702
8.13k
  if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
703
    // median
704
89
    row->space_size = cert_space_gap_stats.median();
705
8.04k
  } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) {
706
    // to avoid spaced
707
33
    row->space_size = cert_space_gap_stats.mean();
708
  //      1's in tables
709
8.01k
  } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
710
    // median
711
2.63k
    row->space_size = all_space_gap_stats.median();
712
5.37k
  } else {
713
5.37k
    row->space_size = all_space_gap_stats.mean();
714
5.37k
  }
715
716
8.13k
  if (tosp_only_small_gaps_for_kern) {
717
0
    row->kern_size = small_gap_stats.median();
718
8.13k
  } else {
719
8.13k
    row->kern_size = all_gap_stats->median();
720
8.13k
  }
721
8.13k
  row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
722
  /* Sanity check */
723
8.13k
  if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) ||
724
8.13k
      (row->space_threshold <= 0)) {
725
28
    if (tosp_debug_level > 5) {
726
0
      tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx,
727
0
              row->kern_size, row->space_threshold, row->space_size);
728
0
    }
729
28
    row->kern_size = 0.0f;
730
28
    row->space_threshold = 0;
731
28
    row->space_size = 0.0f;
732
28
    return false;
733
28
  }
734
735
8.10k
  if (tosp_debug_level > 5) {
736
0
    tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size,
737
0
            row->space_threshold, row->space_size);
738
0
  }
739
8.10k
  return true;
740
8.13k
}
741
742
155k
int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
  // Sum the pile counts of every histogram position in [0, threshold).
  int16_t count_below = 0;
  int16_t bucket = 0;
  while (bucket < threshold) {
    count_below += stats->pile_count(bucket);
    ++bucket;
  }
  return count_below;
}
751
752
/*************************************************************************
753
 * improve_row_threshold()
754
 *    Try to recognise a "normal line" -
755
 *           > 25 gaps
756
 *     &&    space > 3 * kn  && space > 10
757
 *              (I.e. reasonably large space and kn:sp ratio)
758
 *     &&    > 3/4 # gaps < kn + (sp - kn)/3
759
 *              (I.e. most gaps are well away from space estimate)
760
 *     &&    a gap of max(3, (sp - kn) / 3) empty histogram positions is found
761
 *           somewhere in the histogram between kn and sp
762
 *     THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
763
 *          NO!!!!! the bristol line has "11" with a gap of 12 between the
764
 *1's!!! try moving the default threshold to within this band but leave the
765
 *          fuzzy limit calculation as at present.
766
 *************************************************************************/
767
0
void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
  // Moves row->space_threshold into a sufficiently wide empty band of the gap
  // histogram lying between the kern and space estimates, if the row looks
  // "normal" and such a band exists. Otherwise the row is left untouched.
  //   row            row whose space_threshold may be adjusted
  //   all_gap_stats  histogram of all gap widths in the row
  const float sp = row->space_size;
  const float kn = row->kern_size;

  if (tosp_debug_level > 10) {
    tprintf("Improve row threshold 0");
  }
  // Only proceed for rows with > 25 gaps, a reasonably large space, a kn:sp
  // ratio above 3, and at least 3/4 of the gaps below kn + (sp - kn) / 3.
  if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) ||
      (stats_count_under(all_gap_stats,
                         static_cast<int16_t>(std::ceil(kn + (sp - kn) / 3 + 0.5))) <
       (0.75 * all_gap_stats->get_total()))) {
    return;
  }
  if (tosp_debug_level > 10) {
    tprintf(" 1");
  }
  /*
Look for the first region of all 0's in the histogram which is wider than
max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current
threshold is not within it, move the threshold so that is just inside it.
*/
  int16_t reqd_zero_width = static_cast<int16_t>(std::floor((sp - kn) / 3 + 0.5));
  if (reqd_zero_width < 3) {
    reqd_zero_width = 3;
  }

  int16_t zero_width = 0; // length of the current run of empty positions
  int16_t zero_start = 0; // first position of the current run
  int16_t index = 0;
  for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) {
    if (all_gap_stats->pile_count(index) == 0) {
      if (zero_width == 0) {
        zero_start = index;
      }
      zero_width++;
    } else {
      if (zero_width >= reqd_zero_width) {
        break; // a wide enough run has just ended
      } else {
        zero_width = 0;
      }
    }
  }
  // Step back so that index is the last position examined before the loop
  // stopped, i.e. the end of the run when one was found.
  index--;
  if (tosp_debug_level > 10) {
    // The original format ended in a literal "/n", leaving the line unterminated.
    tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d\n", reqd_zero_width,
            zero_width, zero_start, row->space_threshold);
  }
  // Nothing to do if no wide enough run exists or the threshold is already inside it.
  if ((zero_width < reqd_zero_width) ||
      ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) {
    return;
  }
  if (tosp_debug_level > 10) {
    tprintf(" 2");
  }
  if (row->space_threshold < zero_start) {
    if (tosp_debug_level > 5) {
      tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n", kn, sp, zero_start,
              index, row->space_threshold, zero_start);
    }
    row->space_threshold = zero_start;
  }
  if (row->space_threshold > index) {
    if (tosp_debug_level > 5) {
      tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n", kn, sp, zero_start,
              index, row->space_threshold, index);
    }
    row->space_threshold = index;
  }
}
837
838
/**********************************************************************
839
 * make_prop_words
840
 *
841
 * Convert a TO_ROW to a ROW.
842
 **********************************************************************/
843
ROW *Textord::make_prop_words(TO_ROW *row,    // row to make
                              FCOORD rotation // for drawing
) {
  // Walks the row's blobs, groups them into words wherever
  // make_a_word_break() (or a repeated-char word boundary) says so, merges
  // in the row's repeated-char words, and returns a newly allocated ROW
  // holding the words. Returns nullptr when the row has no blobs.
  bool bol = true; // start of line
  /* prev_ values are for start of word being built. non prev_ values are for
the gap between the word being built and the next one. */
  bool prev_fuzzy_sp = false;  // probably space
  bool prev_fuzzy_non = false; // probably not
  uint8_t prev_blanks = 0;     // in front of word
  bool fuzzy_sp = false;       // probably space
  bool fuzzy_non = false;      // probably not
  uint8_t blanks = 0;          // in front of word
  bool prev_gap_was_a_space = false;
  bool break_at_next_gap = false;
  ROW *real_row; // output row
  C_OUTLINE_IT cout_it;
  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it = &cblobs;
  WERD_LIST words;
  WERD *word; // new word
  int32_t next_rep_char_word_right = INT32_MAX;
  float repetition_spacing;    // gap between repetitions
  int32_t xstarts[2];          // row ends
  int32_t prev_x = -INT16_MAX; // end of prev blob
  BLOBNBOX_IT box_it;          // iterator
  TBOX prev_blob_box;
  TBOX next_blob_box;
  int16_t prev_gap = INT16_MAX;
  int16_t current_gap = INT16_MAX;
  int16_t next_gap = INT16_MAX;
  int16_t prev_within_xht_gap = INT16_MAX;
  int16_t current_within_xht_gap = INT16_MAX;
  int16_t next_within_xht_gap = INT16_MAX;
  int16_t word_count = 0;

  // Number of blanks represented by a gap beside a repeated-char word.
  // The result is always in [1, UINT8_MAX]: a zero space_size would divide
  // by zero, and converting an out-of-range float to uint8_t is undefined
  // behaviour, so both are handled explicitly (make_a_word_break guards the
  // same way).
  const auto blanks_for_gap = [row](int16_t gap) -> uint8_t {
    if (!(row->space_size > 0.0f)) {
      return 1;
    }
    const float count = std::floor(gap / row->space_size);
    if (!(count >= 1.0f)) { // also catches NaN
      return 1;
    }
    if (count >= static_cast<float>(UINT8_MAX)) {
      return UINT8_MAX;
    }
    return static_cast<uint8_t>(count);
  };

  // repeated char words
  WERD_IT rep_char_it(&(row->rep_words));
  if (!rep_char_it.empty()) {
    next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
  }

  cblob_it.set_to_list(&cblobs);
  box_it.set_to_list(row->blob_list());
  // new words
  WERD_IT word_it(&words);
  if (!box_it.empty()) {
    xstarts[0] = box_it.data()->bounding_box().left();
    if (xstarts[0] > next_rep_char_word_right) {
      /* We need to insert a repeated char word at the start of the row */
      word = rep_char_it.extract();
      word_it.add_after_then_move(word);
      /* Set spaces before repeated char word */
      word->set_flag(W_BOL, true);
      bol = false;
      word->set_blanks(0);
      // NO uncertainty
      word->set_flag(W_FUZZY_SP, false);
      word->set_flag(W_FUZZY_NON, false);
      xstarts[0] = word->bounding_box().left();
      /* Set spaces after repeated char word (and leave current word set) */
      repetition_spacing = find_mean_blob_spacing(word);
      current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;
      current_within_xht_gap = current_gap;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        prev_blanks = blanks_for_gap(current_gap);
      } else {
        prev_blanks = 0;
      }
      if (tosp_debug_level > 5) {
        tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f;  Rgap:%d  ",
                box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),
                repetition_spacing, current_gap);
      }
      prev_fuzzy_sp = false;
      prev_fuzzy_non = false;
      if (rep_char_it.empty()) {
        next_rep_char_word_right = INT32_MAX;
      } else {
        rep_char_it.forward();
        next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
      }
    }

    peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
    do {
      auto bblob = box_it.data();
      auto blob_box = bblob->bounding_box();
      if (bblob->joined_to_prev()) {
        // Extra outlines of the previous blob: append them to its outline list.
        auto cblob = bblob->remove_cblob();
        if (cblob != nullptr) {
          cout_it.set_to_list(cblob_it.data()->out_list());
          cout_it.move_to_last();
          cout_it.add_list_after(cblob->out_list());
          delete cblob;
        }
      } else {
        auto cblob = bblob->cblob();
        if (cblob != nullptr) {
          // The C_BLOB moves into the word under construction.
          bblob->set_owns_cblob(false);
          cblob_it.add_after_then_move(cblob);
        }
        prev_x = blob_box.right();
      }
      box_it.forward(); // next one
      bblob = box_it.data();
      blob_box = bblob->bounding_box();

      if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
        /* Real Blob - not multiple outlines or pre-chopped */
        prev_gap = current_gap;
        prev_within_xht_gap = current_within_xht_gap;
        prev_blob_box = next_blob_box;
        current_gap = next_gap;
        current_within_xht_gap = next_within_xht_gap;
        peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);

        int16_t prev_gap_arg = prev_gap;
        int16_t next_gap_arg = next_gap;
        if (tosp_only_use_xht_gaps) {
          prev_gap_arg = prev_within_xht_gap;
          next_gap_arg = next_within_xht_gap;
        }
        // Decide if a word-break should be inserted
        if (blob_box.left() > next_rep_char_word_right ||
            make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,
                              current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,
                              fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||
            box_it.at_first()) {
          /* Form a new word out of the blobs collected */
          word = new WERD(&cblobs, prev_blanks, nullptr);
          word_count++;
          word_it.add_after_then_move(word);
          if (bol) {
            word->set_flag(W_BOL, true);
            bol = false;
          }
          if (prev_fuzzy_sp) {
            // probably space
            word->set_flag(W_FUZZY_SP, true);
          } else if (prev_fuzzy_non) {
            // probably not
            word->set_flag(W_FUZZY_NON, true);
          }

          if (blob_box.left() > next_rep_char_word_right) {
            /* We need to insert a repeated char word */
            word = rep_char_it.extract();
            word_it.add_after_then_move(word);

            /* Set spaces before repeated char word */
            repetition_spacing = find_mean_blob_spacing(word);
            current_gap = word->bounding_box().left() - prev_x;
            current_within_xht_gap = current_gap;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              blanks = blanks_for_gap(current_gap);
            } else {
              blanks = 0;
            }
            if (tosp_debug_level > 5) {
              tprintf("Repch wd (%d,%d) rep gap %5.2f;  Lgap:%d (%d blanks);",
                      word->bounding_box().left(), word->bounding_box().bottom(),
                      repetition_spacing, current_gap, blanks);
            }
            word->set_blanks(blanks);
            // NO uncertainty
            word->set_flag(W_FUZZY_SP, false);
            word->set_flag(W_FUZZY_NON, false);

            /* Set spaces after repeated char word (and leave current word set)
             */
            current_gap = blob_box.left() - next_rep_char_word_right;
            if (current_gap > tosp_rep_space * repetition_spacing) {
              blanks = blanks_for_gap(current_gap);
            } else {
              blanks = 0;
            }
            if (tosp_debug_level > 5) {
              tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks);
            }
            fuzzy_sp = false;
            fuzzy_non = false;

            if (rep_char_it.empty()) {
              next_rep_char_word_right = INT32_MAX;
            } else {
              rep_char_it.forward();
              next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
            }
          }

          if (box_it.at_first() && rep_char_it.empty()) {
            // at end of line
            word->set_flag(W_EOL, true);
            xstarts[1] = prev_x;
          } else {
            // Carry this gap's verdict forward to the next word built.
            prev_blanks = blanks;
            prev_fuzzy_sp = fuzzy_sp;
            prev_fuzzy_non = fuzzy_non;
          }
        }
      }
    } while (!box_it.at_first()); // until back at start

    /* Insert any further repeated char words */
    while (!rep_char_it.empty()) {
      word = rep_char_it.extract();
      word_it.add_after_then_move(word);

      /* Set spaces before repeated char word */
      repetition_spacing = find_mean_blob_spacing(word);
      current_gap = word->bounding_box().left() - prev_x;
      if (current_gap > tosp_rep_space * repetition_spacing) {
        blanks = blanks_for_gap(current_gap);
      } else {
        blanks = 0;
      }
      if (tosp_debug_level > 5) {
        tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
                word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing,
                current_gap, blanks);
      }
      word->set_blanks(blanks);
      // NO uncertainty
      word->set_flag(W_FUZZY_SP, false);
      word->set_flag(W_FUZZY_NON, false);
      prev_x = word->bounding_box().right();
      if (rep_char_it.empty()) {
        // at end of line
        word->set_flag(W_EOL, true);
        xstarts[1] = prev_x;
      } else {
        rep_char_it.forward();
      }
    }
    real_row =
        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
    word_it.set_to_list(real_row->word_list());
    // put words in row
    word_it.add_list_after(&words);
    real_row->recalc_bounding_box();

    if (tosp_debug_level > 4) {
      tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
              real_row->bounding_box().left(), real_row->bounding_box().bottom(),
              real_row->bounding_box().right(), real_row->bounding_box().top());
    }
    return real_row;
  }
  return nullptr;
}
1110
1111
/**********************************************************************
1112
 * make_blob_words
1113
 *
1114
 * Converts words into blobs so that each blob is a single character.
1115
 *  Used for chopper test.
1116
 **********************************************************************/
1117
ROW *Textord::make_blob_words(TO_ROW *row,    // row to make
                              FCOORD rotation // for drawing
) {
  // Builds a ROW in which each blob (together with any following blobs
  // flagged joined_to_prev) becomes its own word with one leading blank.
  // Returns a newly allocated ROW, or nullptr when the row has no blobs.
  C_OUTLINE_IT cout_it;
  C_BLOB_LIST cblobs; // blobs collected for the word being built
  C_BLOB_IT cblob_it = &cblobs;
  WERD_LIST words;
  BLOBNBOX_IT box_it;
  int16_t word_count = 0;

  cblob_it.set_to_list(&cblobs);
  box_it.set_to_list(row->blob_list());
  // new words
  WERD_IT word_it(&words);
  if (box_it.empty()) {
    return nullptr;
  }

  bool bol = true; // start of line
  do {
    auto bblob = box_it.data();
    if (bblob->joined_to_prev()) {
      // Extra outlines of the previous blob: append them to its outline list.
      auto cblob = bblob->remove_cblob();
      if (cblob != nullptr) {
        cout_it.set_to_list(cblob_it.data()->out_list());
        cout_it.move_to_last();
        cout_it.add_list_after(cblob->out_list());
        delete cblob;
      }
    } else {
      auto cblob = bblob->cblob();
      if (cblob != nullptr) {
        // The C_BLOB moves into the word under construction.
        bblob->set_owns_cblob(false);
        cblob_it.add_after_then_move(cblob);
      }
    }
    box_it.forward(); // next one
    bblob = box_it.data();

    // The next blob starts a new character: emit what has been collected.
    if (!bblob->joined_to_prev() && !cblobs.empty()) {
      WERD *word = new WERD(&cblobs, 1, nullptr);
      word_count++;
      word_it.add_after_then_move(word);
      if (bol) {
        word->set_flag(W_BOL, true);
        bol = false;
      }
      if (box_it.at_first()) { // at end of line
        word->set_flag(W_EOL, true);
      }
    }
  } while (!box_it.at_first()); // until back at start

  /* Setup the row with created words. */
  ROW *real_row =
      new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
  word_it.set_to_list(real_row->word_list());
  // put words in row
  word_it.add_list_after(&words);
  real_row->recalc_bounding_box();
  if (tosp_debug_level > 4) {
    tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
            real_row->bounding_box().left(), real_row->bounding_box().bottom(),
            real_row->bounding_box().right(), real_row->bounding_box().top());
  }
  return real_row;
}
1187
1188
bool Textord::make_a_word_break(TO_ROW *row,   // row being made
1189
                                TBOX blob_box, // for next_blob // how many blanks?
1190
                                int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,
1191
                                int16_t within_xht_current_gap, TBOX next_blob_box,
1192
                                int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,
1193
1.23M
                                bool &prev_gap_was_a_space, bool &break_at_next_gap) {
1194
1.23M
  bool space;
1195
1.23M
  int16_t current_gap;
1196
1.23M
  float fuzzy_sp_to_kn_limit;
1197
1198
1.23M
  if (break_at_next_gap) {
1199
0
    break_at_next_gap = false;
1200
0
    return true;
1201
0
  }
1202
  /* Inhibit using the reduced gap if
1203
  The kerning is large - chars are not kerned and reducing "f"s can cause
1204
  erroneous blanks
1205
OR  The real gap is less than 0
1206
OR  The real gap is less than the kerning estimate
1207
*/
1208
1.23M
  if ((row->kern_size > tosp_large_kerning * row->xheight) ||
1209
1.23M
      ((tosp_dont_fool_with_small_kerns >= 0) &&
1210
972k
       (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) {
1211
    // Ignore the difference
1212
266k
    within_xht_current_gap = real_current_gap;
1213
266k
  }
1214
1215
1.23M
  if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) {
1216
0
    current_gap = within_xht_current_gap;
1217
1.23M
  } else {
1218
1.23M
    current_gap = real_current_gap;
1219
1.23M
  }
1220
1221
1.23M
  if (tosp_old_to_method) {
1222
    // Boring old method
1223
0
    space = current_gap > row->max_nonspace;
1224
0
    if (space && (current_gap < INT16_MAX)) {
1225
0
      if (current_gap < row->min_space) {
1226
0
        if (current_gap > row->space_threshold) {
1227
0
          blanks = 1;
1228
0
          fuzzy_sp = true;
1229
0
          fuzzy_non = false;
1230
0
        } else {
1231
0
          blanks = 0;
1232
0
          fuzzy_sp = false;
1233
0
          fuzzy_non = true;
1234
0
        }
1235
0
      } else {
1236
0
        if (row->space_size == 0.0f) {
1237
          // Avoid FP division by 0.
1238
0
          blanks = 1;
1239
0
        } else {
1240
0
          blanks = static_cast<uint8_t>(current_gap / row->space_size);
1241
0
          if (blanks < 1) {
1242
0
            blanks = 1;
1243
0
          }
1244
0
        }
1245
0
        fuzzy_sp = false;
1246
0
        fuzzy_non = false;
1247
0
      }
1248
0
    }
1249
0
    return space;
1250
1.23M
  } else {
1251
    /* New exciting heuristic method */
1252
1.23M
    if (prev_blob_box.null_box()) { // Beginning of row
1253
94
      prev_gap_was_a_space = true;
1254
94
    }
1255
1256
    // Default as old TO
1257
1.23M
    space = current_gap > row->space_threshold;
1258
1259
    /* Set defaults for the word break in case we find one.  Currently there are
1260
no fuzzy spaces. Depending on the reliability of the different heuristics
1261
we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1262
be used if the function returns true - ie the word is to be broken.
1263
*/
1264
1.23M
    int num_blanks = current_gap;
1265
1.23M
    if (row->space_size > 1.0f) {
1266
1.23M
      num_blanks = IntCastRounded(current_gap / row->space_size);
1267
1.23M
    }
1268
1.23M
    blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));
1269
1.23M
    fuzzy_sp = false;
1270
1.23M
    fuzzy_non = false;
1271
    /*
1272
If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1273
despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1274
context.
1275
*/
1276
1.23M
    if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) &&
1277
1.23M
        (within_xht_current_gap > row->max_nonspace)) {
1278
2.01k
      space = true;
1279
2.01k
      fuzzy_non = true;
1280
#ifndef GRAPHICS_DISABLED
1281
      mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1282
               next_gap);
1283
#endif
1284
1.23M
    } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) &&
1285
1.23M
               (within_xht_current_gap > row->space_threshold)) {
1286
262
      space = true;
1287
262
      if (tosp_flip_fuzz_kn_to_sp) {
1288
262
        fuzzy_sp = true;
1289
262
      } else {
1290
0
        fuzzy_non = true;
1291
0
      }
1292
#ifndef GRAPHICS_DISABLED
1293
      mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1294
               next_gap);
1295
#endif
1296
1.23M
    } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) &&
1297
1.23M
               (within_xht_current_gap >= row->min_space)) {
1298
268
      space = true;
1299
#ifndef GRAPHICS_DISABLED
1300
      mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1301
               next_gap);
1302
#endif
1303
1.23M
    } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) &&
1304
1.23M
               suspected_punct_blob(row, blob_box)) {
1305
0
      break_at_next_gap = true;
1306
0
    }
1307
    /* Now continue with normal heuristics */
1308
1.23M
    else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) {
1309
      /* Heuristics to turn dubious spaces to kerns */
1310
43.0k
      if (tosp_pass_wide_fuzz_sp_to_context > 0) {
1311
43.0k
        fuzzy_sp_to_kn_limit =
1312
43.0k
            row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size);
1313
43.0k
      } else {
1314
0
        fuzzy_sp_to_kn_limit = 99999.0f;
1315
0
      }
1316
1317
      /* If current gap is significantly smaller than the previous space the
1318
other side of a narrow blob then this gap is a kern. */
1319
43.0k
      if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space &&
1320
43.0k
          (current_gap <= tosp_gap_factor * prev_gap)) {
1321
1.52k
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1322
351
          if (tosp_flip_fuzz_sp_to_kn) {
1323
351
            fuzzy_non = true;
1324
351
          } else {
1325
0
            fuzzy_sp = true;
1326
0
          }
1327
1.17k
        } else {
1328
1.17k
          space = false;
1329
1.17k
        }
1330
#ifndef GRAPHICS_DISABLED
1331
        mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1332
                 next_gap);
1333
#endif
1334
1.52k
      }
1335
      /* If current gap not much bigger than the previous kern the other side of
1336
a narrow blob then this gap is a kern as well */
1337
41.5k
      else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) &&
1338
41.5k
               !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) {
1339
4.75k
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1340
1.04k
          if (tosp_flip_fuzz_sp_to_kn) {
1341
1.04k
            fuzzy_non = true;
1342
1.04k
          } else {
1343
0
            fuzzy_sp = true;
1344
0
          }
1345
3.71k
        } else {
1346
3.71k
          space = false;
1347
3.71k
        }
1348
#ifndef GRAPHICS_DISABLED
1349
        mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1350
                 next_gap);
1351
#endif
1352
36.7k
      } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
1353
36.7k
                 (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) {
1354
4.29k
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1355
1.10k
          if (tosp_flip_fuzz_sp_to_kn) {
1356
1.10k
            fuzzy_non = true;
1357
1.10k
          } else {
1358
0
            fuzzy_sp = true;
1359
0
          }
1360
3.19k
        } else {
1361
3.19k
          space = false;
1362
3.19k
        }
1363
#ifndef GRAPHICS_DISABLED
1364
        mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1365
                 next_gap);
1366
#endif
1367
32.4k
      } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
1368
32.4k
                 (next_gap <= row->space_threshold) &&
1369
32.4k
                 (current_gap * tosp_gap_factor <= next_gap)) {
1370
181
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1371
31
          if (tosp_flip_fuzz_sp_to_kn) {
1372
31
            fuzzy_non = true;
1373
31
          } else {
1374
0
            fuzzy_sp = true;
1375
0
          }
1376
150
        } else {
1377
150
          space = false;
1378
150
        }
1379
#ifndef GRAPHICS_DISABLED
1380
        mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1381
                 next_gap);
1382
#endif
1383
32.2k
      } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) ||
1384
32.2k
                  ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) {
1385
27.8k
        fuzzy_sp = true;
1386
#ifndef GRAPHICS_DISABLED
1387
        mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1388
                 next_gap);
1389
#endif
1390
27.8k
      }
1391
1.19M
    } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) {
1392
      /* Heuristics to turn dubious kerns to spaces */
1393
      /* TRIED THIS BUT IT MADE THINGS WORSE
1394
    if (prev_gap == INT16_MAX)
1395
      prev_gap = 0;  // start of row
1396
    if (next_gap == INT16_MAX)
1397
      next_gap = 0;  // end of row
1398
*/
1399
35.7k
      if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) &&
1400
35.7k
          (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
1401
35.7k
          wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) {
1402
591
        space = true;
1403
        /*
1404
tosp_flip_caution is an attempt to stop the default changing in cases
1405
where there is a large difference between the kern and space estimates.
1406
  See problem in 'chiefs' where "have" gets split in the quotation.
1407
*/
1408
591
        if ((tosp_flip_fuzz_kn_to_sp) &&
1409
591
            ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) {
1410
591
          fuzzy_sp = true;
1411
591
        } else {
1412
0
          fuzzy_non = true;
1413
0
        }
1414
#ifndef GRAPHICS_DISABLED
1415
        mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1416
                 next_gap);
1417
#endif
1418
35.1k
      } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 &&
1419
35.1k
                 current_gap > 5 && // Rule 9 handles small gap, big ratio.
1420
35.1k
                 current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
1421
35.1k
                 !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) &&
1422
35.1k
                 !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) {
1423
216
        space = true;
1424
216
        fuzzy_non = true;
1425
#ifndef GRAPHICS_DISABLED
1426
        mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1427
                 next_gap);
1428
#endif
1429
34.9k
      } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) &&
1430
34.9k
                 (next_blob_box.width() > 0) &&
1431
34.9k
                 (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
1432
34.9k
                 (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) &&
1433
14.9k
                                              !suspected_punct_blob(row, next_blob_box)))) {
1434
14.9k
        space = true;
1435
14.9k
        fuzzy_non = true;
1436
#ifndef GRAPHICS_DISABLED
1437
        mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1438
                 next_gap);
1439
#endif
1440
14.9k
      }
1441
35.7k
    }
1442
1.23M
    if (tosp_debug_level > 10) {
1443
0
      tprintf(
1444
0
          "word break = %d current_gap = %d, prev_gap = %d, "
1445
0
          "next_gap = %d\n",
1446
0
          space ? 1 : 0, current_gap, prev_gap, next_gap);
1447
0
    }
1448
1.23M
    prev_gap_was_a_space = space && !(fuzzy_non);
1449
1.23M
    return space;
1450
1.23M
  }
1451
1.23M
}
1452
1453
262k
// Decide whether a blob counts as "narrow" for the spacing heuristics.
// A blob is narrow when its width is at most tosp_narrow_fraction of the row
// x-height, or when its width/height ratio is at most
// tosp_narrow_aspect_ratio.
bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
  // Width test first: when it succeeds the aspect ratio is never computed.
  if (blob_box.width() <= tosp_narrow_fraction * row->xheight) {
    return true;
  }
  return (static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio;
}
1460
1461
227k
// Decide whether a blob counts as "wide" for the spacing heuristics.
//  - tosp_wide_fraction not positive: wide simply means "not narrow".
//  - otherwise the width must reach tosp_wide_fraction of the row x-height,
//    and, if tosp_wide_aspect_ratio is positive, the width/height ratio must
//    also exceed that value.
bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
  if (!(tosp_wide_fraction > 0)) {
    return !narrow_blob(row, blob_box);
  }
  if (!(tosp_wide_aspect_ratio > 0)) {
    return blob_box.width() >= tosp_wide_fraction * row->xheight;
  }
  // Both criteria apply; the ratio is only evaluated when the width test
  // has already passed.
  return (blob_box.width() >= tosp_wide_fraction * row->xheight) &&
         ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio);
}
1476
1477
2.13k
// Flag a blob that is suspected to be punctuation.
// The blob is suspected when it is short (height at most 0.66 of the row
// x-height), or when it lies wholly on one side of the line half an x-height
// above the baseline (top below that line, or bottom above it).
bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
  /* Baseline is evaluated at the horizontal centre of the blob */
  const float blob_x_centre = (box.right() + box.left()) / 2.0;
  const float baseline = row->baseline.y(blob_x_centre);

  if (box.height() <= 0.66 * row->xheight) {
    return true;
  }
  // Height of the line midway between the baseline and the x-height line.
  const double mid_xheight_y = baseline + row->xheight / 2.0;
  return (box.top() < mid_xheight_y) || (box.bottom() > mid_xheight_y);
}
1489
1490
void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box,
1491
1.41M
                               int16_t &next_gap, int16_t &next_within_xht_gap) {
1492
1.41M
  TBOX next_reduced_blob_box;
1493
1.41M
  TBOX bit_beyond;
1494
1.41M
  BLOBNBOX_IT reduced_box_it = box_it;
1495
1496
1.41M
  next_blob_box = box_next(&box_it);
1497
1.41M
  next_reduced_blob_box = reduced_box_next(row, &reduced_box_it);
1498
1.41M
  if (box_it.at_first()) {
1499
234k
    next_gap = INT16_MAX;
1500
234k
    next_within_xht_gap = INT16_MAX;
1501
1.17M
  } else {
1502
1.17M
    bit_beyond = box_it.data()->bounding_box();
1503
1.17M
    next_gap = bit_beyond.left() - next_blob_box.right();
1504
1.17M
    bit_beyond = reduced_box_next(row, &reduced_box_it);
1505
1.17M
    next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right();
1506
1.17M
  }
1507
1.41M
}
1508
1509
#ifndef GRAPHICS_DISABLED
1510
void Textord::mark_gap(TBOX blob,    // blob following gap
1511
                       int16_t rule, // heuristic id
1512
                       int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,
1513
                       int16_t next_blob_width, int16_t next_gap) {
1514
  ScrollView::Color col; // of ellipse marking flipped gap
1515
1516
  switch (rule) {
1517
    case 1:
1518
      col = ScrollView::RED;
1519
      break;
1520
    case 2:
1521
      col = ScrollView::CYAN;
1522
      break;
1523
    case 3:
1524
      col = ScrollView::GREEN;
1525
      break;
1526
    case 4:
1527
      col = ScrollView::BLACK;
1528
      break;
1529
    case 5:
1530
      col = ScrollView::MAGENTA;
1531
      break;
1532
    case 6:
1533
      col = ScrollView::BLUE;
1534
      break;
1535
1536
    case 7:
1537
      col = ScrollView::WHITE;
1538
      break;
1539
    case 8:
1540
      col = ScrollView::YELLOW;
1541
      break;
1542
    case 9:
1543
      col = ScrollView::BLACK;
1544
      break;
1545
1546
    case 20:
1547
      col = ScrollView::CYAN;
1548
      break;
1549
    case 21:
1550
      col = ScrollView::GREEN;
1551
      break;
1552
    case 22:
1553
      col = ScrollView::MAGENTA;
1554
      break;
1555
    default:
1556
      col = ScrollView::BLACK;
1557
  }
1558
  if (textord_show_initial_words) {
1559
    to_win->Pen(col);
1560
    /*  if (rule < 20)
1561
    //interior_style(to_win, INT_SOLID, false);
1562
  else
1563
    //interior_style(to_win, INT_HOLLOW, true);*/
1564
    // x radius
1565
    to_win->Ellipse(current_gap / 2.0f,
1566
                    blob.height() / 2.0f, // y radius
1567
                                          // x centre
1568
                    blob.left() - current_gap / 2.0f,
1569
                    // y centre
1570
                    blob.bottom() + blob.height() / 2.0f);
1571
  }
1572
  if (tosp_debug_level > 5) {
1573
    tprintf("  (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2,
1574
            blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap);
1575
  }
1576
}
1577
#endif
1578
1579
0
// Return the mean horizontal gap between consecutive blobs of `word`
// (left edge of a blob minus right edge of its predecessor), or 0 when the
// word has fewer than two blobs.
float Textord::find_mean_blob_spacing(WERD *word) {
  C_BLOB_IT cblob_it;
  cblob_it.set_to_list(word->cblob_list());
  if (cblob_it.empty()) {
    return 0.0f;
  }

  int32_t gap_sum = 0;
  int16_t gap_count = 0;

  cblob_it.mark_cycle_pt();
  // The first blob only supplies the starting right edge.
  int16_t prev_right = cblob_it.data()->bounding_box().right();
  for (cblob_it.forward(); !cblob_it.cycled_list(); cblob_it.forward()) {
    const TBOX blob_box = cblob_it.data()->bounding_box();
    gap_sum += blob_box.left() - prev_right;
    gap_count++;
    prev_right = blob_box.right();
  }

  return (gap_count > 0) ? (gap_sum / static_cast<float>(gap_count)) : 0.0f;
}
1605
1606
bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left,
1607
2.48M
                             int16_t right) {
1608
2.48M
  int16_t gap = right - left + 1;
1609
1610
2.48M
  if (tosp_ignore_big_gaps > 999) {
1611
0
    return false; // Don't ignore
1612
0
  }
1613
2.48M
  if (tosp_ignore_big_gaps > 0) {
1614
0
    return (gap > tosp_ignore_big_gaps * row->xheight);
1615
0
  }
1616
2.48M
  if (gap > tosp_ignore_very_big_gaps * row->xheight) {
1617
18.7k
    return true;
1618
18.7k
  }
1619
2.47M
  if (tosp_ignore_big_gaps == 0) {
1620
0
    if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) {
1621
0
      return true;
1622
0
    }
1623
0
    if ((gap > 1.75 * row->xheight) &&
1624
0
        ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) {
1625
0
      return true;
1626
0
    }
1627
2.47M
  } else {
1628
    /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table
1629
     */
1630
2.47M
    if ((gap > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) {
1631
11.8k
      return true;
1632
11.8k
    }
1633
2.47M
  }
1634
2.45M
  return false;
1635
2.47M
}
1636
1637
/**********************************************************************
1638
 * reduced_box_next
1639
 *
1640
 * Compute the bounding box of this blob with merging of x overlaps
1641
 * but no pre-chopping.
1642
 * Then move the iterator on to the start of the next blob.
1643
 * DON'T reduce the box for small things - eg punctuation.
1644
 **********************************************************************/
1645
TBOX Textord::reduced_box_next(TO_ROW *row,    // current row
                               BLOBNBOX_IT *it // iterator to blobs
) {
  BLOBNBOX *blob = it->data(); // current blob

  // Fast path: the reduced box was stored on this blob by an earlier call.
  // Just step the iterator past any pre-chopped / joined continuation blobs.
  if (blob->red_box_set()) {
    TBOX cached_box = blob->reduced_box();
    do {
      it->forward();
      blob = it->data();
    } while (blob->cblob() == nullptr || blob->joined_to_prev());
    return cached_box;
  }

  BLOBNBOX *const head_blob = blob;     // the result is stored on this blob
  TBOX full_box = blob->bounding_box(); // full blob bounding box
  int16_t left_above_xht;               // ABOVE xht left limit
  TBOX reduced_box = reduced_box_for_blob(blob, row, &left_above_xht);

  // Absorb continuation blobs until the next real blob is reached.
  for (;;) {
    it->forward();
    blob = it->data();
    if (blob->cblob() == nullptr) {
      // was pre-chopped: only the full box grows
      full_box += blob->bounding_box();
      continue;
    }
    if (!blob->joined_to_prev()) {
      break; // next real blob
    }
    int16_t joined_left_above_xht; // ABOVE xht left limit of the joined part
    reduced_box += reduced_box_for_blob(blob, row, &joined_left_above_xht);
    left_above_xht = std::min(left_above_xht, joined_left_above_xht);
  }

  // The reduced box is kept only if it is non-empty, its left edge (plus
  // tosp_near_lh_edge of its width) lies left of the above-xht left limit,
  // and it is taller than 0.7 of the x-height. Otherwise use the full box.
  const bool keep_reduced =
      (reduced_box.width() > 0) &&
      ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) &&
      (reduced_box.height() > 0.7 * row->xheight);

  if (keep_reduced) {
#ifndef GRAPHICS_DISABLED
    if (textord_show_initial_words) {
      reduced_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW);
    }
#endif
  } else {
    reduced_box = full_box;
  }

  head_blob->set_reduced_box(reduced_box);
  return reduced_box;
}
1695
1696
/*************************************************************************
1697
 * reduced_box_for_blob()
1698
 * Find box for blob which is the same height and y position as the whole blob,
1699
 * but whose left limit is the left most position of the blob ABOVE the
1700
 * baseline and whose right limit is the right most position of the blob BELOW
1701
 * the xheight.
1702
 *
1703
 *
1704
 * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1705
 *         "home".  Perhaps we need something which says that if the width ABOVE the
1706
 *         xht alone includes the whole of the reduced width, then use the full
1707
 *         blob box - Might still fail on italic F
1708
 *
1709
 *         Alternatively we could be a little less severe and only reduce the
1710
 *         left and right edges by half the difference between the full box and
1711
 *         the reduced box.
1712
 *
1713
 * NOTE that we need to rotate all the coordinates as
1714
 * find_blob_limits finds the y min and max within a specified x band
1715
 *************************************************************************/
1716
2.31M
TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) {
  /* Baseline is evaluated at the horizontal centre of the blob */
  const TBOX blob_box = blob->bounding_box();
  const float blob_x_centre = (blob_box.left() + blob_box.right()) / 2.0;
  const float baseline = row->baseline.y(blob_x_centre);

  /*
Probe 1: LH limit of the blob ABOVE the xht (band from baseline + 1.1 * xht
upwards). Lets the caller detect caps-height chars which should NOT have
their box reduced: T, Y, V, W etc.
*/
  float upper_left = static_cast<float>(INT32_MAX);
  float upper_right = static_cast<float>(-INT32_MAX);
  find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), static_cast<float>(INT16_MAX),
                     upper_left, upper_right);
  // Limits still inverted after the probe => no area above xht.
  *left_above_xht =
      (upper_left > upper_right) ? INT16_MAX : static_cast<int16_t>(std::floor(upper_left));

  /*
Probe 2: reduced LH limit - the left extent of the region ABOVE the baseline.
*/
  float left_limit = static_cast<float>(INT32_MAX);
  float unused_right = static_cast<float>(-INT32_MAX);
  find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX), left_limit,
                     unused_right);
  if (left_limit > unused_right) {
    return TBOX(); // no area within xht so return empty box
  }

  /*
Probe 3: reduced RH limit - the right extent of the region BELOW the xht.
*/
  float unused_left = static_cast<float>(INT32_MAX);
  float right_limit = static_cast<float>(-INT32_MAX);
  find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (baseline + row->xheight),
                     unused_left, right_limit);
  if (unused_left > right_limit) {
    return TBOX(); // no area within xht so return empty box
  }

  // Full vertical extent of the blob, horizontally clipped to the limits
  // found above (rounded outwards).
  const ICOORD bottom_left(static_cast<int16_t>(std::floor(left_limit)), blob_box.bottom());
  const ICOORD top_right(static_cast<int16_t>(std::ceil(right_limit)), blob_box.top());
  return TBOX(bottom_left, top_right);
}
1768
} // namespace tesseract