Coverage Report

Created: 2025-09-27 07:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tesseract/src/textord/tospace.cpp
Line
Count
Source
1
// Licensed under the Apache License, Version 2.0 (the "License");
2
// you may not use this file except in compliance with the License.
3
// You may obtain a copy of the License at
4
// http://www.apache.org/licenses/LICENSE-2.0
5
// Unless required by applicable law or agreed to in writing, software
6
// distributed under the License is distributed on an "AS IS" BASIS,
7
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8
// See the License for the specific language governing permissions and
9
// limitations under the License.
10
/**********************************************************************
11
 * tospace.cpp
12
 *
13
 * Compute fuzzy word spacing thresholds for each row.
14
 * I.e. set :   max_nonspace
15
 *              space_threshold
16
 *              min_space
17
 *              kern_size
18
 *              space_size
19
 * for each row.
20
 * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
21
 *
22
 * Note: functions in this file were originally not members of any
23
 * class or enclosed by any namespace. Now they are all static members
24
 * of the Textord class.
25
 *
26
 **********************************************************************/
27
28
#include "drawtord.h"
29
#include "statistc.h"
30
#include "textord.h"
31
#include "tovars.h"
32
33
// Include automatically generated configuration file if running autoconf.
34
#ifdef HAVE_CONFIG_H
35
#  include "config_auto.h"
36
#endif
37
38
#include <algorithm>
39
#include <cmath>
40
#include <memory>
41
42
1.21M
#define MAXSPACING 128 /*max expected spacing in pix */
43
44
namespace tesseract {
45
void Textord::to_spacing(ICOORD page_tr,       // topright of page
46
                         TO_BLOCK_LIST *blocks // blocks on page
47
16.1k
) {
48
16.1k
  TO_BLOCK_IT block_it; // iterator
49
16.1k
  TO_BLOCK *block;      // current block;
50
16.1k
  TO_ROW *row;          // current row
51
16.1k
  int block_index;      // block number
52
16.1k
  int row_index;        // row number
53
  // estimated width of real spaces for whole block
54
16.1k
  int16_t block_space_gap_width;
55
  // estimated width of non space gaps for whole block
56
16.1k
  int16_t block_non_space_gap_width;
57
16.1k
  bool old_text_ord_proportional; // old fixed/prop result
58
59
16.1k
  block_it.set_to_list(blocks);
60
16.1k
  block_index = 1;
61
32.3k
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
62
16.1k
    block = block_it.data();
63
16.1k
    std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk
64
16.1k
    block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width,
65
16.1k
                        block_non_space_gap_width);
66
    // Make sure relative values of block-level space and non-space gap
67
    // widths are reasonable. The ratio of 1:3 is also used in
68
    // block_spacing_stats, to correct the block_space_gap_width.
69
    // Useful for arabic and hindi, when the non-space gap width is
70
    // often over-estimated and should not be trusted. A similar ratio
71
    // is found in block_spacing_stats.
72
16.1k
    if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
73
0
        block_non_space_gap_width > block_space_gap_width / 3) {
74
0
      block_non_space_gap_width = block_space_gap_width / 3;
75
0
    }
76
    // row iterator
77
16.1k
    TO_ROW_IT row_it(block->get_rows());
78
16.1k
    row_index = 1;
79
193k
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
80
177k
      row = row_it.data();
81
177k
      if ((row->pitch_decision == PITCH_DEF_PROP) || (row->pitch_decision == PITCH_CORR_PROP)) {
82
172k
        if ((tosp_debug_level > 0) && !old_text_ord_proportional) {
83
0
          tprintf("Block %d Row %d: Now Proportional\n", block_index, row_index);
84
0
        }
85
172k
        row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width,
86
172k
                          block_non_space_gap_width);
87
172k
      } else {
88
5.61k
        if ((tosp_debug_level > 0) && old_text_ord_proportional) {
89
0
          tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index,
90
0
                  row_index, row->pitch_decision, row->fixed_pitch);
91
0
        }
92
5.61k
      }
93
#ifndef GRAPHICS_DISABLED
94
      if (textord_show_initial_words) {
95
        plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
96
      }
97
#endif
98
177k
      row_index++;
99
177k
    }
100
16.1k
    block_index++;
101
16.1k
  }
102
16.1k
}
103
104
/*************************************************************************
105
 * block_spacing_stats()
106
 *************************************************************************/
107
108
void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,
109
                                  int16_t &block_space_gap_width,    // resulting estimate
110
                                  int16_t &block_non_space_gap_width // resulting estimate
111
16.1k
) {
112
16.1k
  TO_ROW *row;         // current row
113
16.1k
  BLOBNBOX_IT blob_it; // iterator
114
115
16.1k
  STATS centre_to_centre_stats(0, MAXSPACING - 1);
116
  // DEBUG USE ONLY
117
16.1k
  STATS all_gap_stats(0, MAXSPACING - 1);
118
16.1k
  STATS space_gap_stats(0, MAXSPACING - 1);
119
16.1k
  int16_t minwidth = MAXSPACING; // narrowest blob
120
16.1k
  TBOX blob_box;
121
16.1k
  TBOX prev_blob_box;
122
16.1k
  int16_t centre_to_centre;
123
16.1k
  int16_t gap_width;
124
16.1k
  float real_space_threshold;
125
16.1k
  float iqr_centre_to_centre; // DEBUG USE ONLY
126
16.1k
  float iqr_all_gap_stats;    // DEBUG USE ONLY
127
16.1k
  int32_t end_of_row;
128
16.1k
  int32_t row_length;
129
130
  // row iterator
131
16.1k
  TO_ROW_IT row_it(block->get_rows());
132
193k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
133
177k
    row = row_it.data();
134
177k
    if (!row->blob_list()->empty() &&
135
177k
        (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
136
172k
         (row->pitch_decision == PITCH_CORR_PROP))) {
137
172k
      blob_it.set_to_list(row->blob_list());
138
172k
      blob_it.mark_cycle_pt();
139
172k
      end_of_row = blob_it.data_relative(-1)->bounding_box().right();
140
172k
      if (tosp_use_pre_chopping) {
141
0
        blob_box = box_next_pre_chopped(&blob_it);
142
172k
      } else if (tosp_stats_use_xht_gaps) {
143
172k
        blob_box = reduced_box_next(row, &blob_it);
144
172k
      } else {
145
0
        blob_box = box_next(&blob_it);
146
0
      }
147
172k
      row_length = end_of_row - blob_box.left();
148
172k
      if (blob_box.width() < minwidth) {
149
22.6k
        minwidth = blob_box.width();
150
22.6k
      }
151
172k
      prev_blob_box = blob_box;
152
1.24M
      while (!blob_it.cycled_list()) {
153
1.07M
        if (tosp_use_pre_chopping) {
154
0
          blob_box = box_next_pre_chopped(&blob_it);
155
1.07M
        } else if (tosp_stats_use_xht_gaps) {
156
1.07M
          blob_box = reduced_box_next(row, &blob_it);
157
1.07M
        } else {
158
0
          blob_box = box_next(&blob_it);
159
0
        }
160
1.07M
        if (blob_box.width() < minwidth) {
161
8.15k
          minwidth = blob_box.width();
162
8.15k
        }
163
1.07M
        int16_t left = prev_blob_box.right();
164
1.07M
        int16_t right = blob_box.left();
165
1.07M
        gap_width = right - left;
166
1.07M
        if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
167
1.06M
          all_gap_stats.add(gap_width, 1);
168
169
1.06M
          centre_to_centre = (right + blob_box.right() - (prev_blob_box.left() + left)) / 2;
170
          // DEBUG
171
1.06M
          centre_to_centre_stats.add(centre_to_centre, 1);
172
          // DEBUG
173
1.06M
        }
174
1.07M
        prev_blob_box = blob_box;
175
1.07M
      }
176
172k
    }
177
177k
  }
178
179
  // Inadequate samples
180
16.1k
  if (all_gap_stats.get_total() <= 1) {
181
7.20k
    block_non_space_gap_width = minwidth;
182
7.20k
    block_space_gap_width = -1; // No est. space width
183
                                // DEBUG
184
7.20k
    old_text_ord_proportional = true;
185
8.94k
  } else {
186
    /* For debug only ..... */
187
8.94k
    iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25);
188
8.94k
    iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25);
189
8.94k
    old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats;
190
    /* .......For debug only */
191
192
    /*
193
The median of the gaps is used as an estimate of the NON-SPACE gap width.
194
This RELIES on the assumption that there are more gaps WITHIN words than
195
BETWEEN words in a block
196
197
Now try to estimate the width of a real space for all real spaces in the
198
block. Do this by using a crude threshold to ignore "narrow" gaps, then
199
find the median of the "wide" gaps and use this.
200
*/
201
8.94k
    block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median()));
202
    // median gap
203
204
8.94k
    row_it.set_to_list(block->get_rows());
205
140k
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
206
131k
      row = row_it.data();
207
131k
      if (!row->blob_list()->empty() &&
208
131k
          (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
209
127k
           (row->pitch_decision == PITCH_CORR_PROP))) {
210
127k
        real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width,
211
127k
                                        tosp_init_guess_xht_mult * row->xheight);
212
127k
        blob_it.set_to_list(row->blob_list());
213
127k
        blob_it.mark_cycle_pt();
214
127k
        end_of_row = blob_it.data_relative(-1)->bounding_box().right();
215
127k
        if (tosp_use_pre_chopping) {
216
0
          blob_box = box_next_pre_chopped(&blob_it);
217
127k
        } else if (tosp_stats_use_xht_gaps) {
218
127k
          blob_box = reduced_box_next(row, &blob_it);
219
127k
        } else {
220
0
          blob_box = box_next(&blob_it);
221
0
        }
222
127k
        row_length = blob_box.left() - end_of_row;
223
127k
        prev_blob_box = blob_box;
224
1.19M
        while (!blob_it.cycled_list()) {
225
1.07M
          if (tosp_use_pre_chopping) {
226
0
            blob_box = box_next_pre_chopped(&blob_it);
227
1.07M
          } else if (tosp_stats_use_xht_gaps) {
228
1.07M
            blob_box = reduced_box_next(row, &blob_it);
229
1.07M
          } else {
230
0
            blob_box = box_next(&blob_it);
231
0
          }
232
1.07M
          int16_t left = prev_blob_box.right();
233
1.07M
          int16_t right = blob_box.left();
234
1.07M
          gap_width = right - left;
235
1.07M
          if ((gap_width > real_space_threshold) &&
236
118k
              !ignore_big_gap(row, row_length, gapmap, left, right)) {
237
            /*
238
If tosp_use_cert_spaces is enabled, the estimate of the space gap is
239
restricted to obvious spaces - those wider than half the xht or
240
those with wide blobs on both sides - i.e not things that are
241
suspect 1's or punctuation that is sometimes widely spaced.
242
*/
243
108k
            if (!tosp_block_use_cert_spaces ||
244
108k
                (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
245
75.1k
                ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
246
20.1k
                 (!tosp_narrow_blobs_not_cert ||
247
20.1k
                  (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
248
72.7k
                (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
249
38.4k
              space_gap_stats.add(gap_width, 1);
250
38.4k
            }
251
108k
          }
252
1.07M
          prev_blob_box = blob_box;
253
1.07M
        }
254
127k
      }
255
131k
    }
256
    // Inadequate samples
257
8.94k
    if (space_gap_stats.get_total() <= 2) {
258
7.22k
      block_space_gap_width = -1; // No est. space width
259
7.22k
    } else {
260
1.72k
      block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())),
261
1.72k
                                       static_cast<int16_t>(3 * block_non_space_gap_width));
262
1.72k
    }
263
8.94k
  }
264
16.1k
}
265
266
/*************************************************************************
267
 * row_spacing_stats()
268
 * Set values for min_space, max_non_space based on row stats only
269
 * If failure - return 0 values.
270
 *************************************************************************/
271
void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,
272
                                int16_t block_space_gap_width,    // estimate for block
273
                                int16_t block_non_space_gap_width // estimate for block
274
172k
) {
275
  // iterator
276
172k
  BLOBNBOX_IT blob_it = row->blob_list();
277
172k
  STATS all_gap_stats(0, MAXSPACING - 1);
278
172k
  STATS cert_space_gap_stats(0, MAXSPACING - 1);
279
172k
  STATS all_space_gap_stats(0, MAXSPACING - 1);
280
172k
  STATS small_gap_stats(0, MAXSPACING - 1);
281
172k
  TBOX blob_box;
282
172k
  TBOX prev_blob_box;
283
172k
  int16_t gap_width;
284
172k
  int16_t real_space_threshold = 0;
285
172k
  int16_t max = 0;
286
172k
  int16_t large_gap_count = 0;
287
172k
  bool suspected_table;
288
172k
  bool good_block_space_estimate = block_space_gap_width > 0;
289
172k
  int32_t end_of_row;
290
172k
  int32_t row_length = 0;
291
172k
  float sane_space;
292
172k
  int32_t sane_threshold;
293
294
  /* Collect first pass stats for row */
295
296
172k
  if (!good_block_space_estimate) {
297
137k
    block_space_gap_width = int16_t(std::floor(row->xheight / 2));
298
137k
  }
299
172k
  if (!row->blob_list()->empty()) {
300
172k
    if (tosp_threshold_bias1 > 0) {
301
0
      real_space_threshold =
302
0
          block_non_space_gap_width +
303
0
          int16_t(floor(0.5 + tosp_threshold_bias1 *
304
0
                                  (block_space_gap_width - block_non_space_gap_width)));
305
172k
    } else {
306
172k
      real_space_threshold = // Old TO method
307
172k
          (block_space_gap_width + block_non_space_gap_width) / 2;
308
172k
    }
309
172k
    blob_it.set_to_list(row->blob_list());
310
172k
    blob_it.mark_cycle_pt();
311
172k
    end_of_row = blob_it.data_relative(-1)->bounding_box().right();
312
172k
    if (tosp_use_pre_chopping) {
313
0
      blob_box = box_next_pre_chopped(&blob_it);
314
172k
    } else if (tosp_stats_use_xht_gaps) {
315
172k
      blob_box = reduced_box_next(row, &blob_it);
316
172k
    } else {
317
0
      blob_box = box_next(&blob_it);
318
0
    }
319
172k
    row_length = end_of_row - blob_box.left();
320
172k
    prev_blob_box = blob_box;
321
1.24M
    while (!blob_it.cycled_list()) {
322
1.07M
      if (tosp_use_pre_chopping) {
323
0
        blob_box = box_next_pre_chopped(&blob_it);
324
1.07M
      } else if (tosp_stats_use_xht_gaps) {
325
1.07M
        blob_box = reduced_box_next(row, &blob_it);
326
1.07M
      } else {
327
0
        blob_box = box_next(&blob_it);
328
0
      }
329
1.07M
      int16_t left = prev_blob_box.right();
330
1.07M
      int16_t right = blob_box.left();
331
1.07M
      gap_width = right - left;
332
1.07M
      if (ignore_big_gap(row, row_length, gapmap, left, right)) {
333
10.0k
        large_gap_count++;
334
1.06M
      } else {
335
1.06M
        if (gap_width >= real_space_threshold) {
336
138k
          if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
337
96.2k
              ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
338
23.4k
               (!tosp_narrow_blobs_not_cert ||
339
23.4k
                (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
340
92.5k
              (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
341
49.6k
            cert_space_gap_stats.add(gap_width, 1);
342
49.6k
          }
343
138k
          all_space_gap_stats.add(gap_width, 1);
344
924k
        } else {
345
924k
          small_gap_stats.add(gap_width, 1);
346
924k
        }
347
1.06M
        all_gap_stats.add(gap_width, 1);
348
1.06M
      }
349
1.07M
      prev_blob_box = blob_box;
350
1.07M
    }
351
172k
  }
352
172k
  suspected_table = (large_gap_count > 1) ||
353
170k
                    ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples));
354
355
  /* Now determine row kern size, space size and threshold */
356
357
172k
  if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) ||
358
166k
      ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) &&
359
153k
       cert_space_gap_stats.get_total() > 0)) {
360
18.3k
    old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats,
361
18.3k
                  block_space_gap_width, block_non_space_gap_width);
362
153k
  } else {
363
153k
    if (!tosp_recovery_isolated_row_stats ||
364
153k
        !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) {
365
145k
      if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) {
366
0
        tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx);
367
0
      }
368
145k
      if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
369
        // Use block default
370
17.0k
        row->space_size = block_space_gap_width;
371
17.0k
        if (all_gap_stats.get_total() > tosp_redo_kern_limit) {
372
6.16k
          row->kern_size = all_gap_stats.median();
373
10.9k
        } else {
374
10.9k
          row->kern_size = block_non_space_gap_width;
375
10.9k
        }
376
17.0k
        row->space_threshold =
377
17.0k
            int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
378
128k
      } else {
379
128k
        old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats,
380
128k
                      block_space_gap_width, block_non_space_gap_width);
381
128k
      }
382
145k
    }
383
153k
  }
384
385
172k
  if (tosp_improve_thresh && !suspected_table) {
386
0
    improve_row_threshold(row, &all_gap_stats);
387
0
  }
388
389
  /* Now lets try to be careful not to do anything silly with tables when we
390
are ignoring big gaps*/
391
172k
  if (tosp_sanity_method == 0) {
392
0
    if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
393
0
      if (tosp_debug_level > 5) {
394
0
        tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx,
395
0
                row->kern_size, row->space_threshold, row->space_size);
396
0
      }
397
0
      row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);
398
0
      row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
399
0
    }
400
172k
  } else if (tosp_sanity_method == 1) {
401
172k
    sane_space = row->space_size;
402
    /* NEVER let space size get too close to kern size */
403
172k
    if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
404
151k
        ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) {
405
23.4k
      if (good_block_space_estimate &&
406
1.38k
          (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) {
407
1.22k
        sane_space = block_space_gap_width;
408
22.2k
      } else {
409
22.2k
        sane_space =
410
22.2k
            std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
411
22.2k
                     row->xheight / 2.0f);
412
22.2k
      }
413
23.4k
      if (tosp_debug_level > 5) {
414
0
        tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx,
415
0
                row->kern_size, row->space_threshold, row->space_size, sane_space);
416
0
      }
417
23.4k
      row->space_size = sane_space;
418
23.4k
      row->space_threshold =
419
23.4k
          int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
420
23.4k
    }
421
    /* NEVER let threshold get VERY far away from kern */
422
172k
    sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f)));
423
172k
    if (row->space_threshold > sane_threshold) {
424
4.29k
      if (tosp_debug_level > 5) {
425
0
        tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx,
426
0
                row->kern_size, row->space_threshold, row->space_size, sane_threshold);
427
0
      }
428
4.29k
      row->space_threshold = sane_threshold;
429
4.29k
      if (row->space_size <= sane_threshold) {
430
0
        row->space_size = row->space_threshold + 1.0f;
431
0
      }
432
4.29k
    }
433
    /* Beware of tables - there may be NO spaces */
434
172k
    if (suspected_table) {
435
6.00k
      sane_space =
436
6.00k
          std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight);
437
6.00k
      sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2));
438
439
6.00k
      if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) {
440
723
        if (tosp_debug_level > 5) {
441
0
          tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx,
442
0
                  row->kern_size, row->space_threshold, row->space_size);
443
0
        }
444
        // the minimum sane value
445
723
        row->space_threshold = static_cast<int32_t>(sane_space);
446
723
        row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
447
723
      }
448
6.00k
    }
449
172k
  }
450
451
  /* Now lets try to put some error limits on the threshold */
452
453
172k
  if (tosp_old_to_method) {
454
    /* Old textord made a space if gap >= threshold */
455
    // NO FUZZY SPACES YET
456
0
    row->max_nonspace = row->space_threshold;
457
    // NO FUZZY SPACES       YET
458
0
    row->min_space = row->space_threshold + 1;
459
172k
  } else {
460
    /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
461
172k
    row->min_space =
462
172k
        std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size));
463
172k
    if (row->min_space <= row->space_threshold) {
464
      // Don't be silly
465
30.3k
      row->min_space = row->space_threshold + 1;
466
30.3k
    }
467
    /*
468
Lets try to guess the max certain kern gap by looking at the cluster of
469
kerns for the row. The row is proportional so the kerns should cluster
470
tightly at the bottom of the distribution. We also expect most gaps to be
471
kerns. Find the maximum of the kern piles between 0 and twice the kern
472
estimate. Piles before the first one with less than 1/10 the maximum
473
number of samples can be taken as certain kerns.
474
475
  Of course, there are some cases where the kern peak and space peaks merge,
476
  so we will put an UPPER limit on the max certain kern gap of some fraction
477
  below the threshold.
478
*/
479
480
    // upper bound
481
172k
    int32_t max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2);
482
483
    // default
484
172k
    row->max_nonspace = max_max_nonspace;
485
873k
    for (int32_t index = 0; index <= max_max_nonspace; index++) {
486
772k
      if (all_gap_stats.pile_count(index) > max) {
487
120k
        max = all_gap_stats.pile_count(index);
488
120k
      }
489
772k
      if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) {
490
71.0k
        row->max_nonspace = index;
491
71.0k
        break;
492
71.0k
      }
493
772k
    }
494
172k
  }
495
496
  /* Yet another algorithm - simpler this time - just choose a fraction of the
497
threshold to space range */
498
499
172k
  if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) {
500
172k
    row->min_space = std::max(
501
172k
        row->min_space, static_cast<int32_t>(ceil(row->space_threshold +
502
172k
                                                  tosp_fuzzy_sp_fraction *
503
172k
                                                      (row->space_size - row->space_threshold))));
504
172k
  }
505
506
  /* Ensure that ANY space less than some multiplier times the kern size is
507
fuzzy.  In tables there is a risk of erroneously setting a small space size
508
when there are no real spaces. Sometimes tables have text squashed into
509
columns so that the kn->sp ratio is small anyway - this means that we can't
510
use this to force a wider separation - hence we rely on context to join any
511
dubious breaks. */
512
513
172k
  if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) {
514
172k
    row->min_space = std::max(
515
172k
        row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size)));
516
172k
  }
517
518
172k
  if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
519
171k
    row->max_nonspace = static_cast<int32_t>(floor(
520
171k
        0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)));
521
171k
  }
522
172k
  if (row->max_nonspace > row->space_threshold) {
523
    // Don't be silly
524
0
    row->max_nonspace = row->space_threshold;
525
0
  }
526
527
172k
  if (tosp_debug_level > 5) {
528
0
    tprintf(
529
0
        "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) "
530
0
        "Sp:%3.2f\n",
531
0
        block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width,
532
0
        real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold,
533
0
        row->min_space, row->space_size);
534
0
  }
535
172k
  if (tosp_debug_level > 10) {
536
0
    tprintf(
537
0
        "row->kern_size = %3.2f, row->space_size = %3.2f, "
538
0
        "row->space_threshold = %d\n",
539
0
        row->kern_size, row->space_size, row->space_threshold);
540
0
  }
541
172k
}
542
543
void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,
544
                            STATS *small_gap_stats,
545
                            int16_t block_space_gap_width,    // estimate for block
546
                            int16_t block_non_space_gap_width // estimate for block
547
146k
) {
548
  /* First, estimate row space size */
549
  /* Old to condition was > 2 */
550
146k
  if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) {
551
    // Adequate samples
552
    /* Set space size to median of spaces BUT limits it if it seems wildly out
553
     */
554
9.75k
    row->space_size = space_gap_stats->median();
555
9.75k
    if (row->space_size > block_space_gap_width * 1.5) {
556
1.08k
      if (tosp_old_to_bug_fix) {
557
0
        row->space_size = block_space_gap_width * 1.5;
558
1.08k
      } else {
559
        // BUG??? should be *1.5
560
1.08k
        row->space_size = block_space_gap_width;
561
1.08k
      }
562
1.08k
    }
563
9.75k
    if (row->space_size < (block_non_space_gap_width * 2) + 1) {
564
2.72k
      row->space_size = (block_non_space_gap_width * 2) + 1;
565
2.72k
    }
566
9.75k
  }
567
  // Only 1 or 2 samples
568
136k
  else if (space_gap_stats->get_total() >= 1) {
569
    // hence mean not median
570
28.5k
    row->space_size = space_gap_stats->mean();
571
28.5k
    if (row->space_size > block_space_gap_width * 1.5) {
572
3.71k
      if (tosp_old_to_bug_fix) {
573
0
        row->space_size = block_space_gap_width * 1.5;
574
3.71k
      } else {
575
        // BUG??? should be *1.5
576
3.71k
        row->space_size = block_space_gap_width;
577
3.71k
      }
578
3.71k
    }
579
28.5k
    if (row->space_size < (block_non_space_gap_width * 3) + 1) {
580
12.9k
      row->space_size = (block_non_space_gap_width * 3) + 1;
581
12.9k
    }
582
108k
  } else {
583
    // Use block default
584
108k
    row->space_size = block_space_gap_width;
585
108k
  }
586
587
  /* Next, estimate row kern size */
588
146k
  if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) {
589
0
    row->kern_size = small_gap_stats->median();
590
146k
  } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) {
591
15.7k
    row->kern_size = all_gap_stats->median();
592
130k
  } else { // old TO -SAME FOR ALL ROWS
593
130k
    row->kern_size = block_non_space_gap_width;
594
130k
  }
595
596
  /* Finally, estimate row space threshold */
597
146k
  if (tosp_threshold_bias2 > 0) {
598
0
    row->space_threshold = int32_t(
599
0
        floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size)));
600
146k
  } else {
601
    /*
602
  NOTE old text ord uses (space_size + kern_size + 1)/2  as the threshold
603
and holds this in a float. The use is with a >= test
604
NEW textord uses an integer threshold and a > test
605
It comes to the same thing.
606
  (Though there is a difference in that old textor has integer space_size
607
  and kern_size.)
608
*/
609
146k
    row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
610
146k
  }
611
612
  // Apply the same logic and ratios as in row_spacing_stats to
613
  // restrict relative values of the row's space_size, kern_size, and
614
  // space_threshold
615
146k
  if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&
616
0
      ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
617
0
       ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) {
618
0
    if (row->kern_size > 2.5) {
619
0
      row->kern_size = row->space_size / tosp_min_sane_kn_sp;
620
0
    }
621
0
    row->space_threshold =
622
0
        int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
623
0
  }
624
146k
}
625
626
/*************************************************************************
627
 * isolated_row_stats()
628
 * Set values for min_space, max_non_space based on row stats only
629
 *************************************************************************/
630
bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats,
631
153k
                                 bool suspected_table, int16_t block_idx, int16_t row_idx) {
632
153k
  float kern_estimate;
633
153k
  float crude_threshold_estimate;
634
153k
  int16_t small_gaps_count;
635
153k
  int16_t total;
636
  // iterator
637
153k
  BLOBNBOX_IT blob_it = row->blob_list();
638
153k
  STATS cert_space_gap_stats(0, MAXSPACING - 1);
639
153k
  STATS all_space_gap_stats(0, MAXSPACING - 1);
640
153k
  STATS small_gap_stats(0, MAXSPACING - 1);
641
153k
  TBOX blob_box;
642
153k
  TBOX prev_blob_box;
643
153k
  int16_t gap_width;
644
153k
  int32_t end_of_row;
645
153k
  int32_t row_length;
646
647
153k
  kern_estimate = all_gap_stats->median();
648
153k
  crude_threshold_estimate =
649
153k
      std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight);
650
153k
  small_gaps_count =
651
153k
      stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate)));
652
153k
  total = all_gap_stats->get_total();
653
654
153k
  if ((total <= tosp_redo_kern_limit) ||
655
24.1k
      ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) ||
656
145k
      (total - small_gaps_count < 1)) {
657
145k
    if (tosp_debug_level > 5) {
658
0
      tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx);
659
0
    }
660
145k
    return false;
661
145k
  }
662
8.28k
  blob_it.set_to_list(row->blob_list());
663
8.28k
  blob_it.mark_cycle_pt();
664
8.28k
  end_of_row = blob_it.data_relative(-1)->bounding_box().right();
665
8.28k
  if (tosp_use_pre_chopping) {
666
0
    blob_box = box_next_pre_chopped(&blob_it);
667
8.28k
  } else if (tosp_stats_use_xht_gaps) {
668
8.28k
    blob_box = reduced_box_next(row, &blob_it);
669
8.28k
  } else {
670
0
    blob_box = box_next(&blob_it);
671
0
  }
672
8.28k
  row_length = end_of_row - blob_box.left();
673
8.28k
  prev_blob_box = blob_box;
674
255k
  while (!blob_it.cycled_list()) {
675
247k
    if (tosp_use_pre_chopping) {
676
0
      blob_box = box_next_pre_chopped(&blob_it);
677
247k
    } else if (tosp_stats_use_xht_gaps) {
678
247k
      blob_box = reduced_box_next(row, &blob_it);
679
247k
    } else {
680
0
      blob_box = box_next(&blob_it);
681
0
    }
682
247k
    int16_t left = prev_blob_box.right();
683
247k
    int16_t right = blob_box.left();
684
247k
    gap_width = right - left;
685
247k
    if (!ignore_big_gap(row, row_length, gapmap, left, right) &&
686
247k
        (gap_width > crude_threshold_estimate)) {
687
22.1k
      if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
688
19.8k
          ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
689
6.56k
           (!tosp_narrow_blobs_not_cert ||
690
6.56k
            (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
691
19.6k
          (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
692
2.82k
        cert_space_gap_stats.add(gap_width, 1);
693
2.82k
      }
694
22.1k
      all_space_gap_stats.add(gap_width, 1);
695
22.1k
    }
696
247k
    if (gap_width < crude_threshold_estimate) {
697
225k
      small_gap_stats.add(gap_width, 1);
698
225k
    }
699
700
247k
    prev_blob_box = blob_box;
701
247k
  }
702
8.28k
  if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
703
    // median
704
80
    row->space_size = cert_space_gap_stats.median();
705
8.20k
  } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) {
706
    // to avoid spaced
707
40
    row->space_size = cert_space_gap_stats.mean();
708
  //      1's in tables
709
8.16k
  } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
710
    // median
711
2.71k
    row->space_size = all_space_gap_stats.median();
712
5.44k
  } else {
713
5.44k
    row->space_size = all_space_gap_stats.mean();
714
5.44k
  }
715
716
8.28k
  if (tosp_only_small_gaps_for_kern) {
717
0
    row->kern_size = small_gap_stats.median();
718
8.28k
  } else {
719
8.28k
    row->kern_size = all_gap_stats->median();
720
8.28k
  }
721
8.28k
  row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
722
  /* Sanity check */
723
8.28k
  if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) ||
724
8.25k
      (row->space_threshold <= 0)) {
725
26
    if (tosp_debug_level > 5) {
726
0
      tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx,
727
0
              row->kern_size, row->space_threshold, row->space_size);
728
0
    }
729
26
    row->kern_size = 0.0f;
730
26
    row->space_threshold = 0;
731
26
    row->space_size = 0.0f;
732
26
    return false;
733
26
  }
734
735
8.25k
  if (tosp_debug_level > 5) {
736
0
    tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size,
737
0
            row->space_threshold, row->space_size);
738
0
  }
739
8.25k
  return true;
740
8.28k
}
741
742
153k
// Returns the sum of stats->pile_count(i) for every i in [0, threshold).
// A threshold of 0 or less yields 0.
int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
  int16_t count = 0;
  int16_t bucket = 0;
  while (bucket < threshold) {
    count += stats->pile_count(bucket);
    bucket++;
  }
  return count;
}
751
752
/*************************************************************************
753
 * improve_row_threshold()
754
 *    Try to recognise a "normal line" -
755
 *           > 25 gaps
756
 *     &&    space > 3 * kn  && space > 10
757
 *              (I.e. reasonably large space and kn:sp ratio)
758
 *     &&    > 3/4 # gaps < kn + (sp - kn)/3
759
 *              (I.e. most gaps are well away from space estimate)
760
 *     &&    a gap of max(3, (sp - kn) / 3) empty histogram positions is found
761
 *           somewhere in the histogram between kn and sp
762
 *     THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
763
 *          NO!!!!! the bristol line has "11" with a gap of 12 between the
764
 *1's!!! try moving the default threshold to within this band but leave the
765
 *          fuzzy limit calculation as at present.
766
 *************************************************************************/
767
0
void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
768
0
  float sp = row->space_size;
769
0
  float kn = row->kern_size;
770
0
  int16_t reqd_zero_width = 0;
771
0
  int16_t zero_width = 0;
772
0
  int16_t zero_start = 0;
773
0
  int16_t index = 0;
774
775
0
  if (tosp_debug_level > 10) {
776
0
    tprintf("Improve row threshold 0");
777
0
  }
778
0
  if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) ||
779
0
      (stats_count_under(all_gap_stats, static_cast<int16_t>(ceil(kn + (sp - kn) / 3 + 0.5))) <
780
0
       (0.75 * all_gap_stats->get_total()))) {
781
0
    return;
782
0
  }
783
0
  if (tosp_debug_level > 10) {
784
0
    tprintf(" 1");
785
0
  }
786
  /*
787
Look for the first region of all 0's in the histogram which is wider than
788
max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current
789
threshold is not within it, move the threshold so that is just inside it.
790
*/
791
0
  reqd_zero_width = static_cast<int16_t>(floor((sp - kn) / 3 + 0.5));
792
0
  if (reqd_zero_width < 3) {
793
0
    reqd_zero_width = 3;
794
0
  }
795
796
0
  for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) {
797
0
    if (all_gap_stats->pile_count(index) == 0) {
798
0
      if (zero_width == 0) {
799
0
        zero_start = index;
800
0
      }
801
0
      zero_width++;
802
0
    } else {
803
0
      if (zero_width >= reqd_zero_width) {
804
0
        break;
805
0
      } else {
806
0
        zero_width = 0;
807
0
      }
808
0
    }
809
0
  }
810
0
  index--;
811
0
  if (tosp_debug_level > 10) {
812
0
    tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", reqd_zero_width,
813
0
            zero_width, zero_start, row->space_threshold);
814
0
  }
815
0
  if ((zero_width < reqd_zero_width) ||
816
0
      ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) {
817
0
    return;
818
0
  }
819
0
  if (tosp_debug_level > 10) {
820
0
    tprintf(" 2");
821
0
  }
822
0
  if (row->space_threshold < zero_start) {
823
0
    if (tosp_debug_level > 5) {
824
0
      tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n", kn, sp, zero_start,
825
0
              index, row->space_threshold, zero_start);
826
0
    }
827
0
    row->space_threshold = zero_start;
828
0
  }
829
0
  if (row->space_threshold > index) {
830
0
    if (tosp_debug_level > 5) {
831
0
      tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n", kn, sp, zero_start,
832
0
              index, row->space_threshold, index);
833
0
    }
834
0
    row->space_threshold = index;
835
0
  }
836
0
}
837
838
/**********************************************************************
839
 * make_prop_words
840
 *
841
 * Convert a TO_ROW to a ROW.
842
 **********************************************************************/
843
ROW *Textord::make_prop_words(TO_ROW *row,    // row to make
844
                              FCOORD rotation // for drawing
845
172k
) {
846
172k
  bool bol; // start of line
847
  /* prev_ values are for start of word being built. non prev_ values are for
848
the gap between the word being built and the next one. */
849
172k
  bool prev_fuzzy_sp;     // probably space
850
172k
  bool prev_fuzzy_non;    // probably not
851
172k
  uint8_t prev_blanks;    // in front of word
852
172k
  bool fuzzy_sp = false;  // probably space
853
172k
  bool fuzzy_non = false; // probably not
854
172k
  uint8_t blanks = 0;     // in front of word
855
172k
  bool prev_gap_was_a_space = false;
856
172k
  bool break_at_next_gap = false;
857
172k
  ROW *real_row; // output row
858
172k
  C_OUTLINE_IT cout_it;
859
172k
  C_BLOB_LIST cblobs;
860
172k
  C_BLOB_IT cblob_it = &cblobs;
861
172k
  WERD_LIST words;
862
172k
  WERD *word; // new word
863
172k
  int32_t next_rep_char_word_right = INT32_MAX;
864
172k
  float repetition_spacing; // gap between repetitions
865
172k
  int32_t xstarts[2];       // row ends
866
172k
  int32_t prev_x;           // end of prev blob
867
172k
  BLOBNBOX_IT box_it;       // iterator
868
172k
  TBOX prev_blob_box;
869
172k
  TBOX next_blob_box;
870
172k
  int16_t prev_gap = INT16_MAX;
871
172k
  int16_t current_gap = INT16_MAX;
872
172k
  int16_t next_gap = INT16_MAX;
873
172k
  int16_t prev_within_xht_gap = INT16_MAX;
874
172k
  int16_t current_within_xht_gap = INT16_MAX;
875
172k
  int16_t next_within_xht_gap = INT16_MAX;
876
172k
  int16_t word_count = 0;
877
878
  // repeated char words
879
172k
  WERD_IT rep_char_it(&(row->rep_words));
880
172k
  if (!rep_char_it.empty()) {
881
0
    next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
882
0
  }
883
884
172k
  prev_x = -INT16_MAX;
885
172k
  cblob_it.set_to_list(&cblobs);
886
172k
  box_it.set_to_list(row->blob_list());
887
  // new words
888
172k
  WERD_IT word_it(&words);
889
172k
  bol = true;
890
172k
  prev_blanks = 0;
891
172k
  prev_fuzzy_sp = false;
892
172k
  prev_fuzzy_non = false;
893
172k
  if (!box_it.empty()) {
894
172k
    xstarts[0] = box_it.data()->bounding_box().left();
895
172k
    if (xstarts[0] > next_rep_char_word_right) {
896
      /* We need to insert a repeated char word at the start of the row */
897
0
      word = rep_char_it.extract();
898
0
      word_it.add_after_then_move(word);
899
      /* Set spaces before repeated char word */
900
0
      word->set_flag(W_BOL, true);
901
0
      bol = false;
902
0
      word->set_blanks(0);
903
      // NO uncertainty
904
0
      word->set_flag(W_FUZZY_SP, false);
905
0
      word->set_flag(W_FUZZY_NON, false);
906
0
      xstarts[0] = word->bounding_box().left();
907
      /* Set spaces after repeated char word (and leave current word set) */
908
0
      repetition_spacing = find_mean_blob_spacing(word);
909
0
      current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;
910
0
      current_within_xht_gap = current_gap;
911
0
      if (current_gap > tosp_rep_space * repetition_spacing) {
912
0
        prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
913
0
        if (prev_blanks < 1) {
914
0
          prev_blanks = 1;
915
0
        }
916
0
      } else {
917
0
        prev_blanks = 0;
918
0
      }
919
0
      if (tosp_debug_level > 5) {
920
0
        tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f;  Rgap:%d  ",
921
0
                box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),
922
0
                repetition_spacing, current_gap);
923
0
      }
924
0
      prev_fuzzy_sp = false;
925
0
      prev_fuzzy_non = false;
926
0
      if (rep_char_it.empty()) {
927
0
        next_rep_char_word_right = INT32_MAX;
928
0
      } else {
929
0
        rep_char_it.forward();
930
0
        next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
931
0
      }
932
0
    }
933
934
172k
    peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
935
2.64M
    do {
936
2.64M
      auto bblob = box_it.data();
937
2.64M
      auto blob_box = bblob->bounding_box();
938
2.64M
      if (bblob->joined_to_prev()) {
939
1.16M
        auto cblob = bblob->remove_cblob();
940
1.16M
        if (cblob != nullptr) {
941
1.15M
          cout_it.set_to_list(cblob_it.data()->out_list());
942
1.15M
          cout_it.move_to_last();
943
1.15M
          cout_it.add_list_after(cblob->out_list());
944
1.15M
          delete cblob;
945
1.15M
        }
946
1.47M
      } else {
947
1.47M
        auto cblob = bblob->cblob();
948
1.47M
        if (cblob != nullptr) {
949
1.24M
          bblob->set_owns_cblob(false);
950
1.24M
          cblob_it.add_after_then_move(cblob);
951
1.24M
        }
952
1.47M
        prev_x = blob_box.right();
953
1.47M
      }
954
2.64M
      box_it.forward(); // next one
955
2.64M
      bblob = box_it.data();
956
2.64M
      blob_box = bblob->bounding_box();
957
958
2.64M
      if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
959
        /* Real Blob - not multiple outlines or pre-chopped */
960
1.24M
        prev_gap = current_gap;
961
1.24M
        prev_within_xht_gap = current_within_xht_gap;
962
1.24M
        prev_blob_box = next_blob_box;
963
1.24M
        current_gap = next_gap;
964
1.24M
        current_within_xht_gap = next_within_xht_gap;
965
1.24M
        peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
966
967
1.24M
        int16_t prev_gap_arg = prev_gap;
968
1.24M
        int16_t next_gap_arg = next_gap;
969
1.24M
        if (tosp_only_use_xht_gaps) {
970
0
          prev_gap_arg = prev_within_xht_gap;
971
0
          next_gap_arg = next_within_xht_gap;
972
0
        }
973
        // Decide if a word-break should be inserted
974
1.24M
        if (blob_box.left() > next_rep_char_word_right ||
975
1.24M
            make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,
976
1.24M
                              current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,
977
1.24M
                              fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||
978
966k
            box_it.at_first()) {
979
          /* Form a new word out of the blobs collected */
980
278k
          word = new WERD(&cblobs, prev_blanks, nullptr);
981
278k
          word_count++;
982
278k
          word_it.add_after_then_move(word);
983
278k
          if (bol) {
984
172k
            word->set_flag(W_BOL, true);
985
172k
            bol = false;
986
172k
          }
987
278k
          if (prev_fuzzy_sp) {
988
            // probably space
989
28.5k
            word->set_flag(W_FUZZY_SP, true);
990
250k
          } else if (prev_fuzzy_non) {
991
19.2k
            word->set_flag(W_FUZZY_NON, true);
992
19.2k
          }
993
          // probably not
994
995
278k
          if (blob_box.left() > next_rep_char_word_right) {
996
            /* We need to insert a repeated char word */
997
0
            word = rep_char_it.extract();
998
0
            word_it.add_after_then_move(word);
999
1000
            /* Set spaces before repeated char word */
1001
0
            repetition_spacing = find_mean_blob_spacing(word);
1002
0
            current_gap = word->bounding_box().left() - prev_x;
1003
0
            current_within_xht_gap = current_gap;
1004
0
            if (current_gap > tosp_rep_space * repetition_spacing) {
1005
0
              blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
1006
0
              if (blanks < 1) {
1007
0
                blanks = 1;
1008
0
              }
1009
0
            } else {
1010
0
              blanks = 0;
1011
0
            }
1012
0
            if (tosp_debug_level > 5) {
1013
0
              tprintf("Repch wd (%d,%d) rep gap %5.2f;  Lgap:%d (%d blanks);",
1014
0
                      word->bounding_box().left(), word->bounding_box().bottom(),
1015
0
                      repetition_spacing, current_gap, blanks);
1016
0
            }
1017
0
            word->set_blanks(blanks);
1018
            // NO uncertainty
1019
0
            word->set_flag(W_FUZZY_SP, false);
1020
0
            word->set_flag(W_FUZZY_NON, false);
1021
1022
            /* Set spaces after repeated char word (and leave current word set)
1023
             */
1024
0
            current_gap = blob_box.left() - next_rep_char_word_right;
1025
0
            if (current_gap > tosp_rep_space * repetition_spacing) {
1026
0
              blanks = static_cast<uint8_t>(current_gap / row->space_size);
1027
0
              if (blanks < 1) {
1028
0
                blanks = 1;
1029
0
              }
1030
0
            } else {
1031
0
              blanks = 0;
1032
0
            }
1033
0
            if (tosp_debug_level > 5) {
1034
0
              tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks);
1035
0
            }
1036
0
            fuzzy_sp = false;
1037
0
            fuzzy_non = false;
1038
1039
0
            if (rep_char_it.empty()) {
1040
0
              next_rep_char_word_right = INT32_MAX;
1041
0
            } else {
1042
0
              rep_char_it.forward();
1043
0
              next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
1044
0
            }
1045
0
          }
1046
1047
278k
          if (box_it.at_first() && rep_char_it.empty()) {
1048
            // at end of line
1049
172k
            word->set_flag(W_EOL, true);
1050
172k
            xstarts[1] = prev_x;
1051
172k
          } else {
1052
106k
            prev_blanks = blanks;
1053
106k
            prev_fuzzy_sp = fuzzy_sp;
1054
106k
            prev_fuzzy_non = fuzzy_non;
1055
106k
          }
1056
278k
        }
1057
1.24M
      }
1058
2.64M
    } while (!box_it.at_first()); // until back at start
1059
1060
    /* Insert any further repeated char words */
1061
172k
    while (!rep_char_it.empty()) {
1062
0
      word = rep_char_it.extract();
1063
0
      word_it.add_after_then_move(word);
1064
1065
      /* Set spaces before repeated char word */
1066
0
      repetition_spacing = find_mean_blob_spacing(word);
1067
0
      current_gap = word->bounding_box().left() - prev_x;
1068
0
      if (current_gap > tosp_rep_space * repetition_spacing) {
1069
0
        blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
1070
0
        if (blanks < 1) {
1071
0
          blanks = 1;
1072
0
        }
1073
0
      } else {
1074
0
        blanks = 0;
1075
0
      }
1076
0
      if (tosp_debug_level > 5) {
1077
0
        tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
1078
0
                word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing,
1079
0
                current_gap, blanks);
1080
0
      }
1081
0
      word->set_blanks(blanks);
1082
      // NO uncertainty
1083
0
      word->set_flag(W_FUZZY_SP, false);
1084
0
      word->set_flag(W_FUZZY_NON, false);
1085
0
      prev_x = word->bounding_box().right();
1086
0
      if (rep_char_it.empty()) {
1087
        // at end of line
1088
0
        word->set_flag(W_EOL, true);
1089
0
        xstarts[1] = prev_x;
1090
0
      } else {
1091
0
        rep_char_it.forward();
1092
0
      }
1093
0
    }
1094
172k
    real_row =
1095
172k
        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1096
172k
    word_it.set_to_list(real_row->word_list());
1097
    // put words in row
1098
172k
    word_it.add_list_after(&words);
1099
172k
    real_row->recalc_bounding_box();
1100
1101
172k
    if (tosp_debug_level > 4) {
1102
0
      tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1103
0
              real_row->bounding_box().left(), real_row->bounding_box().bottom(),
1104
0
              real_row->bounding_box().right(), real_row->bounding_box().top());
1105
0
    }
1106
172k
    return real_row;
1107
172k
  }
1108
0
  return nullptr;
1109
172k
}
1110
1111
/**********************************************************************
1112
 * make_blob_words
1113
 *
1114
 * Converts words into blobs so that each blob is a single character.
1115
 *  Used for chopper test.
1116
 **********************************************************************/
1117
ROW *Textord::make_blob_words(TO_ROW *row,    // row to make
1118
                              FCOORD rotation // for drawing
1119
0
) {
1120
0
  bool bol;      // start of line
1121
0
  ROW *real_row; // output row
1122
0
  C_OUTLINE_IT cout_it;
1123
0
  C_BLOB_LIST cblobs;
1124
0
  C_BLOB_IT cblob_it = &cblobs;
1125
0
  WERD_LIST words;
1126
0
  WERD *word;         // new word
1127
0
  BLOBNBOX_IT box_it; // iterator
1128
0
  int16_t word_count = 0;
1129
1130
0
  cblob_it.set_to_list(&cblobs);
1131
0
  box_it.set_to_list(row->blob_list());
1132
  // new words
1133
0
  WERD_IT word_it(&words);
1134
0
  bol = true;
1135
0
  if (!box_it.empty()) {
1136
0
    do {
1137
0
      auto bblob = box_it.data();
1138
0
      auto blob_box = bblob->bounding_box();
1139
0
      if (bblob->joined_to_prev()) {
1140
0
        auto cblob = bblob->remove_cblob();
1141
0
        if (cblob != nullptr) {
1142
0
          cout_it.set_to_list(cblob_it.data()->out_list());
1143
0
          cout_it.move_to_last();
1144
0
          cout_it.add_list_after(cblob->out_list());
1145
0
          delete cblob;
1146
0
        }
1147
0
      } else {
1148
0
        auto cblob = bblob->cblob();
1149
0
        if (cblob != nullptr) {
1150
0
          bblob->set_owns_cblob(false);
1151
0
          cblob_it.add_after_then_move(cblob);
1152
0
        }
1153
0
      }
1154
0
      box_it.forward(); // next one
1155
0
      bblob = box_it.data();
1156
0
      blob_box = bblob->bounding_box();
1157
1158
0
      if (!bblob->joined_to_prev() && !cblobs.empty()) {
1159
0
        word = new WERD(&cblobs, 1, nullptr);
1160
0
        word_count++;
1161
0
        word_it.add_after_then_move(word);
1162
0
        if (bol) {
1163
0
          word->set_flag(W_BOL, true);
1164
0
          bol = false;
1165
0
        }
1166
0
        if (box_it.at_first()) { // at end of line
1167
0
          word->set_flag(W_EOL, true);
1168
0
        }
1169
0
      }
1170
0
    } while (!box_it.at_first()); // until back at start
1171
    /* Setup the row with created words. */
1172
0
    real_row =
1173
0
        new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1174
0
    word_it.set_to_list(real_row->word_list());
1175
    // put words in row
1176
0
    word_it.add_list_after(&words);
1177
0
    real_row->recalc_bounding_box();
1178
0
    if (tosp_debug_level > 4) {
1179
0
      tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1180
0
              real_row->bounding_box().left(), real_row->bounding_box().bottom(),
1181
0
              real_row->bounding_box().right(), real_row->bounding_box().top());
1182
0
    }
1183
0
    return real_row;
1184
0
  }
1185
0
  return nullptr;
1186
0
}
1187
1188
bool Textord::make_a_word_break(TO_ROW *row,   // row being made
1189
                                TBOX blob_box, // for next_blob // how many blanks?
1190
                                int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,
1191
                                int16_t within_xht_current_gap, TBOX next_blob_box,
1192
                                int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,
1193
1.24M
                                bool &prev_gap_was_a_space, bool &break_at_next_gap) {
1194
1.24M
  bool space;
1195
1.24M
  int16_t current_gap;
1196
1.24M
  float fuzzy_sp_to_kn_limit;
1197
1198
1.24M
  if (break_at_next_gap) {
1199
0
    break_at_next_gap = false;
1200
0
    return true;
1201
0
  }
1202
  /* Inhibit using the reduced gap if
1203
  The kerning is large - chars are not kerned and reducing "f"s can cause
1204
  erroneous blanks
1205
OR  The real gap is less than 0
1206
OR  The real gap is less than the kerning estimate
1207
*/
1208
1.24M
  if ((row->kern_size > tosp_large_kerning * row->xheight) ||
1209
964k
      ((tosp_dont_fool_with_small_kerns >= 0) &&
1210
280k
       (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) {
1211
    // Ignore the difference
1212
280k
    within_xht_current_gap = real_current_gap;
1213
280k
  }
1214
1215
1.24M
  if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) {
1216
0
    current_gap = within_xht_current_gap;
1217
1.24M
  } else {
1218
1.24M
    current_gap = real_current_gap;
1219
1.24M
  }
1220
1221
1.24M
  if (tosp_old_to_method) {
1222
    // Boring old method
1223
0
    space = current_gap > row->max_nonspace;
1224
0
    if (space && (current_gap < INT16_MAX)) {
1225
0
      if (current_gap < row->min_space) {
1226
0
        if (current_gap > row->space_threshold) {
1227
0
          blanks = 1;
1228
0
          fuzzy_sp = true;
1229
0
          fuzzy_non = false;
1230
0
        } else {
1231
0
          blanks = 0;
1232
0
          fuzzy_sp = false;
1233
0
          fuzzy_non = true;
1234
0
        }
1235
0
      } else {
1236
0
        if (row->space_size == 0.0f) {
1237
          // Avoid FP division by 0.
1238
0
          blanks = 1;
1239
0
        } else {
1240
0
          blanks = static_cast<uint8_t>(current_gap / row->space_size);
1241
0
          if (blanks < 1) {
1242
0
            blanks = 1;
1243
0
          }
1244
0
        }
1245
0
        fuzzy_sp = false;
1246
0
        fuzzy_non = false;
1247
0
      }
1248
0
    }
1249
0
    return space;
1250
1.24M
  } else {
1251
    /* New exciting heuristic method */
1252
1.24M
    if (prev_blob_box.null_box()) { // Beginning of row
1253
122
      prev_gap_was_a_space = true;
1254
122
    }
1255
1256
    // Default as old TO
1257
1.24M
    space = current_gap > row->space_threshold;
1258
1259
    /* Set defaults for the word break in case we find one.  Currently there are
1260
no fuzzy spaces. Depending on the reliability of the different heuristics
1261
we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1262
be used if the function returns true - ie the word is to be broken.
1263
*/
1264
1.24M
    int num_blanks = current_gap;
1265
1.24M
    if (row->space_size > 1.0f) {
1266
1.24M
      num_blanks = IntCastRounded(current_gap / row->space_size);
1267
1.24M
    }
1268
1.24M
    blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));
1269
1.24M
    fuzzy_sp = false;
1270
1.24M
    fuzzy_non = false;
1271
    /*
1272
If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1273
despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1274
context.
1275
*/
1276
1.24M
    if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) &&
1277
939k
        (within_xht_current_gap > row->max_nonspace)) {
1278
1.94k
      space = true;
1279
1.94k
      fuzzy_non = true;
1280
#ifndef GRAPHICS_DISABLED
1281
      mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1282
               next_gap);
1283
#endif
1284
1.24M
    } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) &&
1285
973k
               (within_xht_current_gap > row->space_threshold)) {
1286
238
      space = true;
1287
238
      if (tosp_flip_fuzz_kn_to_sp) {
1288
238
        fuzzy_sp = true;
1289
238
      } else {
1290
0
        fuzzy_non = true;
1291
0
      }
1292
#ifndef GRAPHICS_DISABLED
1293
      mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1294
               next_gap);
1295
#endif
1296
1.24M
    } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) &&
1297
1.01M
               (within_xht_current_gap >= row->min_space)) {
1298
286
      space = true;
1299
#ifndef GRAPHICS_DISABLED
1300
      mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1301
               next_gap);
1302
#endif
1303
1.24M
    } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) &&
1304
0
               suspected_punct_blob(row, blob_box)) {
1305
0
      break_at_next_gap = true;
1306
0
    }
1307
    /* Now continue with normal heuristics */
1308
1.24M
    else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) {
1309
      /* Heuristics to turn dubious spaces to kerns */
1310
42.5k
      if (tosp_pass_wide_fuzz_sp_to_context > 0) {
1311
42.5k
        fuzzy_sp_to_kn_limit =
1312
42.5k
            row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size);
1313
42.5k
      } else {
1314
0
        fuzzy_sp_to_kn_limit = 99999.0f;
1315
0
      }
1316
1317
      /* If current gap is significantly smaller than the previous space the
1318
other side of a narrow blob then this gap is a kern. */
1319
42.5k
      if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space &&
1320
3.43k
          (current_gap <= tosp_gap_factor * prev_gap)) {
1321
1.51k
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1322
340
          if (tosp_flip_fuzz_sp_to_kn) {
1323
340
            fuzzy_non = true;
1324
340
          } else {
1325
0
            fuzzy_sp = true;
1326
0
          }
1327
1.17k
        } else {
1328
1.17k
          space = false;
1329
1.17k
        }
1330
#ifndef GRAPHICS_DISABLED
1331
        mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1332
                 next_gap);
1333
#endif
1334
1.51k
      }
1335
      /* If current gap not much bigger than the previous kern the other side of
1336
a narrow blob then this gap is a kern as well */
1337
41.0k
      else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) &&
1338
32.1k
               !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) {
1339
4.73k
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1340
1.06k
          if (tosp_flip_fuzz_sp_to_kn) {
1341
1.06k
            fuzzy_non = true;
1342
1.06k
          } else {
1343
0
            fuzzy_sp = true;
1344
0
          }
1345
3.67k
        } else {
1346
3.67k
          space = false;
1347
3.67k
        }
1348
#ifndef GRAPHICS_DISABLED
1349
        mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1350
                 next_gap);
1351
#endif
1352
36.3k
      } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
1353
28.4k
                 (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) {
1354
4.25k
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1355
1.14k
          if (tosp_flip_fuzz_sp_to_kn) {
1356
1.14k
            fuzzy_non = true;
1357
1.14k
          } else {
1358
0
            fuzzy_sp = true;
1359
0
          }
1360
3.10k
        } else {
1361
3.10k
          space = false;
1362
3.10k
        }
1363
#ifndef GRAPHICS_DISABLED
1364
        mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1365
                 next_gap);
1366
#endif
1367
32.0k
      } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
1368
24.2k
                 (next_gap <= row->space_threshold) &&
1369
22.2k
                 (current_gap * tosp_gap_factor <= next_gap)) {
1370
169
        if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1371
30
          if (tosp_flip_fuzz_sp_to_kn) {
1372
30
            fuzzy_non = true;
1373
30
          } else {
1374
0
            fuzzy_sp = true;
1375
0
          }
1376
139
        } else {
1377
139
          space = false;
1378
139
        }
1379
#ifndef GRAPHICS_DISABLED
1380
        mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1381
                 next_gap);
1382
#endif
1383
31.8k
      } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) ||
1384
27.8k
                  ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) {
1385
27.8k
        fuzzy_sp = true;
1386
#ifndef GRAPHICS_DISABLED
1387
        mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1388
                 next_gap);
1389
#endif
1390
27.8k
      }
1391
1.19M
    } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) {
1392
      /* Heuristics to turn dubious kerns to spaces */
1393
      /* TRIED THIS BUT IT MADE THINGS WORSE
1394
    if (prev_gap == INT16_MAX)
1395
      prev_gap = 0;  // start of row
1396
    if (next_gap == INT16_MAX)
1397
      next_gap = 0;  // end of row
1398
*/
1399
35.4k
      if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) &&
1400
35.4k
          (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
1401
17.1k
          wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) {
1402
482
        space = true;
1403
        /*
1404
tosp_flip_caution is an attempt to stop the default changing in cases
1405
where there is a large difference between the kern and space estimates.
1406
  See problem in 'chiefs' where "have" gets split in the quotation.
1407
*/
1408
482
        if ((tosp_flip_fuzz_kn_to_sp) &&
1409
482
            ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) {
1410
482
          fuzzy_sp = true;
1411
482
        } else {
1412
0
          fuzzy_non = true;
1413
0
        }
1414
#ifndef GRAPHICS_DISABLED
1415
        mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1416
                 next_gap);
1417
#endif
1418
34.9k
      } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 &&
1419
34.9k
                 current_gap > 5 && // Rule 9 handles small gap, big ratio.
1420
11.9k
                 current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
1421
6.02k
                 !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) &&
1422
811
                 !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) {
1423
184
        space = true;
1424
184
        fuzzy_non = true;
1425
#ifndef GRAPHICS_DISABLED
1426
        mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1427
                 next_gap);
1428
#endif
1429
34.7k
      } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) &&
1430
34.7k
                 (next_blob_box.width() > 0) &&
1431
34.7k
                 (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
1432
14.5k
                 (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) &&
1433
14.5k
                                              !suspected_punct_blob(row, next_blob_box)))) {
1434
14.5k
        space = true;
1435
14.5k
        fuzzy_non = true;
1436
#ifndef GRAPHICS_DISABLED
1437
        mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1438
                 next_gap);
1439
#endif
1440
14.5k
      }
1441
35.4k
    }
1442
1.24M
    if (tosp_debug_level > 10) {
1443
0
      tprintf(
1444
0
          "word break = %d current_gap = %d, prev_gap = %d, "
1445
0
          "next_gap = %d\n",
1446
0
          space ? 1 : 0, current_gap, prev_gap, next_gap);
1447
0
    }
1448
1.24M
    prev_gap_was_a_space = space && !(fuzzy_non);
1449
1.24M
    return space;
1450
1.24M
  }
1451
1.24M
}
1452
1453
261k
// Returns true if the blob box counts as "narrow" for this row: either its
// width is at most tosp_narrow_fraction times the row's xheight, or its
// width/height ratio is at most tosp_narrow_aspect_ratio.
bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
  // Test 1: narrow relative to the row's xheight.
  if (blob_box.width() <= tosp_narrow_fraction * row->xheight) {
    return true;
  }
  // Test 2: tall and thin, judged by the box's own aspect ratio.
  const float aspect_ratio = static_cast<float>(blob_box.width()) / blob_box.height();
  return aspect_ratio <= tosp_narrow_aspect_ratio;
}
1460
1461
224k
// Returns true if the blob box counts as "wide" for this row.
//  - tosp_wide_fraction not positive: wide simply means "not narrow".
//  - otherwise the width must be at least tosp_wide_fraction * xheight, and,
//    when tosp_wide_aspect_ratio is positive, the width/height ratio must
//    also exceed tosp_wide_aspect_ratio.
bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
  if (!(tosp_wide_fraction > 0)) {
    return !narrow_blob(row, blob_box);
  }

  const bool wide_enough = blob_box.width() >= tosp_wide_fraction * row->xheight;
  if (!(tosp_wide_aspect_ratio > 0)) {
    // No aspect-ratio constraint configured.
    return wide_enough;
  }
  return wide_enough &&
         ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio);
}
1476
1477
2.01k
// Returns true if the box looks like punctuation relative to the row:
// it is short (height at most 0.66 * xheight), or its top lies below the
// level half an xheight above the baseline, or its bottom lies above that
// same level. The baseline is evaluated at the box's horizontal centre.
bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
  /* Find baseline of centre of blob */
  const float centre_x = (box.right() + box.left()) / 2.0;
  const float baseline = row->baseline.y(centre_x);

  if (box.height() <= 0.66 * row->xheight) {
    return true;
  }
  // Level half an xheight above the baseline.
  const double mid_xheight_level = baseline + row->xheight / 2.0;
  return (box.top() < mid_xheight_level) || (box.bottom() > mid_xheight_level);
}
1489
1490
void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box,
1491
1.41M
                               int16_t &next_gap, int16_t &next_within_xht_gap) {
1492
1.41M
  TBOX next_reduced_blob_box;
1493
1.41M
  TBOX bit_beyond;
1494
1.41M
  BLOBNBOX_IT reduced_box_it = box_it;
1495
1496
1.41M
  next_blob_box = box_next(&box_it);
1497
1.41M
  next_reduced_blob_box = reduced_box_next(row, &reduced_box_it);
1498
1.41M
  if (box_it.at_first()) {
1499
234k
    next_gap = INT16_MAX;
1500
234k
    next_within_xht_gap = INT16_MAX;
1501
1.18M
  } else {
1502
1.18M
    bit_beyond = box_it.data()->bounding_box();
1503
1.18M
    next_gap = bit_beyond.left() - next_blob_box.right();
1504
1.18M
    bit_beyond = reduced_box_next(row, &reduced_box_it);
1505
1.18M
    next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right();
1506
1.18M
  }
1507
1.41M
}
1508
1509
#ifndef GRAPHICS_DISABLED
1510
void Textord::mark_gap(TBOX blob,    // blob following gap
1511
                       int16_t rule, // heuristic id
1512
                       int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,
1513
                       int16_t next_blob_width, int16_t next_gap) {
1514
  ScrollView::Color col; // of ellipse marking flipped gap
1515
1516
  switch (rule) {
1517
    case 1:
1518
      col = ScrollView::RED;
1519
      break;
1520
    case 2:
1521
      col = ScrollView::CYAN;
1522
      break;
1523
    case 3:
1524
      col = ScrollView::GREEN;
1525
      break;
1526
    case 4:
1527
      col = ScrollView::BLACK;
1528
      break;
1529
    case 5:
1530
      col = ScrollView::MAGENTA;
1531
      break;
1532
    case 6:
1533
      col = ScrollView::BLUE;
1534
      break;
1535
1536
    case 7:
1537
      col = ScrollView::WHITE;
1538
      break;
1539
    case 8:
1540
      col = ScrollView::YELLOW;
1541
      break;
1542
    case 9:
1543
      col = ScrollView::BLACK;
1544
      break;
1545
1546
    case 20:
1547
      col = ScrollView::CYAN;
1548
      break;
1549
    case 21:
1550
      col = ScrollView::GREEN;
1551
      break;
1552
    case 22:
1553
      col = ScrollView::MAGENTA;
1554
      break;
1555
    default:
1556
      col = ScrollView::BLACK;
1557
  }
1558
  if (textord_show_initial_words) {
1559
    to_win->Pen(col);
1560
    /*  if (rule < 20)
1561
    //interior_style(to_win, INT_SOLID, false);
1562
  else
1563
    //interior_style(to_win, INT_HOLLOW, true);*/
1564
    // x radius
1565
    to_win->Ellipse(current_gap / 2.0f,
1566
                    blob.height() / 2.0f, // y radius
1567
                                          // x centre
1568
                    blob.left() - current_gap / 2.0f,
1569
                    // y centre
1570
                    blob.bottom() + blob.height() / 2.0f);
1571
  }
1572
  if (tosp_debug_level > 5) {
1573
    tprintf("  (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2,
1574
            blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap);
1575
  }
1576
}
1577
#endif
1578
1579
0
// Returns the mean horizontal gap between consecutive blobs of the word,
// measured from the right edge of one bounding box to the left edge of the
// next. Returns 0 when the word has fewer than two blobs.
float Textord::find_mean_blob_spacing(WERD *word) {
  C_BLOB_IT blob_it;
  blob_it.set_to_list(word->cblob_list());
  if (blob_it.empty()) {
    return 0.0f;
  }

  int32_t total_gap = 0;
  int16_t num_gaps = 0;

  // The first blob only contributes its right edge.
  blob_it.mark_cycle_pt();
  int16_t last_right = blob_it.data()->bounding_box().right();
  blob_it.forward();

  while (!blob_it.cycled_list()) {
    const TBOX box = blob_it.data()->bounding_box();
    total_gap += box.left() - last_right;
    num_gaps++;
    last_right = box.right();
    blob_it.forward();
  }

  return (num_gaps > 0) ? (total_gap / static_cast<float>(num_gaps)) : 0.0f;
}
1605
1606
bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left,
1607
2.51M
                             int16_t right) {
1608
2.51M
  int16_t gap = right - left + 1;
1609
1610
2.51M
  if (tosp_ignore_big_gaps > 999) {
1611
0
    return false; // Don't ignore
1612
0
  }
1613
2.51M
  if (tosp_ignore_big_gaps > 0) {
1614
0
    return (gap > tosp_ignore_big_gaps * row->xheight);
1615
0
  }
1616
2.51M
  if (gap > tosp_ignore_very_big_gaps * row->xheight) {
1617
18.6k
    return true;
1618
18.6k
  }
1619
2.49M
  if (tosp_ignore_big_gaps == 0) {
1620
0
    if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) {
1621
0
      return true;
1622
0
    }
1623
0
    if ((gap > 1.75 * row->xheight) &&
1624
0
        ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) {
1625
0
      return true;
1626
0
    }
1627
2.49M
  } else {
1628
    /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table
1629
     */
1630
2.49M
    if ((gap > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) {
1631
11.2k
      return true;
1632
11.2k
    }
1633
2.49M
  }
1634
2.48M
  return false;
1635
2.49M
}
1636
1637
/**********************************************************************
1638
 * reduced_box_next
1639
 *
1640
 * Compute the bounding box of this blob with merging of x overlaps
1641
 * but no pre-chopping.
1642
 * Then move the iterator on to the start of the next blob.
1643
 * DON'T reduce the box for small things - eg punctuation.
1644
 **********************************************************************/
1645
// Returns the reduced box for the blob at *it, merged with any following
// entries that are joined to it, and advances *it to the next blob that has
// a cblob and is not joined to its predecessor. If the blob already has a
// reduced box stored (red_box_set()), that box is returned directly;
// otherwise the result is computed and stored on the head blob.
TBOX Textord::reduced_box_next(TO_ROW *row,    // current row
                               BLOBNBOX_IT *it // iterator to blobs
) {
  BLOBNBOX *const head_blob = it->data(); // place to store box

  if (head_blob->red_box_set()) {
    // Already computed: just skip over the pieces belonging to this blob.
    const TBOX stored_box = head_blob->reduced_box();
    BLOBNBOX *piece;
    do {
      it->forward();
      piece = it->data();
    } while (piece->cblob() == nullptr || piece->joined_to_prev());
    return stored_box;
  }

  int16_t left_above_xht; // ABOVE xht left limit
  TBOX full_box = head_blob->bounding_box();                                 // full blob bounding box
  TBOX reduced_box = reduced_box_for_blob(head_blob, row, &left_above_xht);  // significant part

  // Absorb following entries until the next real blob.
  for (;;) {
    it->forward();
    BLOBNBOX *piece = it->data();
    if (piece->cblob() == nullptr) {
      // was pre-chopped
      full_box += piece->bounding_box();
      continue;
    }
    if (!piece->joined_to_prev()) {
      break; // next real blob reached
    }
    int16_t piece_left_above_xht;
    reduced_box += reduced_box_for_blob(piece, row, &piece_left_above_xht);
    left_above_xht = std::min(left_above_xht, piece_left_above_xht);
  }

  // Keep the reduced box only if it is non-empty, starts far enough to the
  // left of whatever sticks up above the xheight, and is tall enough;
  // otherwise fall back to the full box.
  const bool keep_reduced =
      (reduced_box.width() > 0) &&
      ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) &&
      (reduced_box.height() > 0.7 * row->xheight);
  if (!keep_reduced) {
    reduced_box = full_box;
  } else {
#ifndef GRAPHICS_DISABLED
    if (textord_show_initial_words) {
      reduced_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW);
    }
#endif
  }

  head_blob->set_reduced_box(reduced_box);
  return reduced_box;
}
1695
1696
/*************************************************************************
1697
 * reduced_box_for_blob()
1698
 * Find box for blob which is the same height and y position as the whole blob,
1699
 * but whose left limit is the left most position of the blob ABOVE the
1700
 * baseline and whose right limit is the right most position of the blob BELOW
1701
 * the xheight.
1702
 *
1703
 *
1704
 * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1705
 *         "home".  Perhaps we need something which says that if the width ABOVE the
1706
 *         xht alone includes the whole of the reduced width, then use the full
1707
 *         blob box - Might still fail on italic F
1708
 *
1709
 *         Alternatively we could be a little less severe and only reduce the
1710
 *         left and right edges by half the difference between the full box and
1711
 *         the reduced box.
1712
 *
1713
 * NOTE that we need to rotate all the coordinates as
1714
 * find_blob_limits finds the y min and max within a specified x band
1715
 *************************************************************************/
1716
2.40M
// Computes the reduced box of a single blob. The result has the blob's full
// vertical extent, its left edge at the leftmost point of the outline above
// the baseline and its right edge at the rightmost point below
// baseline + xheight. *left_above_xht receives the leftmost x of the outline
// above baseline + 1.1 * xheight, or INT16_MAX if nothing reaches that high.
// Returns an empty TBOX when either horizontal probe finds no outline.
TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) {
  // Sentinels: a probe that finds nothing leaves min > max.
  const float no_min = static_cast<float>(INT32_MAX);
  const float no_max = static_cast<float>(-INT32_MAX);

  /* Find baseline of centre of blob */
  const TBOX blob_box = blob->bounding_box();
  const float centre_x = (blob_box.left() + blob_box.right()) / 2.0;
  const float baseline = row->baseline.y(centre_x);

  /*
Find LH limit of blob ABOVE the xht. This is so that we can detect certain
caps ht chars which should NOT have their box reduced: T, Y, V, W etc
*/
  float above_xht_min = no_min;
  float above_xht_max = no_max;
  find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), static_cast<float>(INT16_MAX),
                     above_xht_min, above_xht_max);
  *left_above_xht = (above_xht_min > above_xht_max)
                        ? INT16_MAX // No area above xht
                        : static_cast<int16_t>(std::floor(above_xht_min));

  /*
Find reduced LH limit of blob - the left extent of the region ABOVE the
baseline.
*/
  float left_limit = no_min;
  float unused_max = no_max;
  find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX), left_limit,
                     unused_max);
  if (left_limit > unused_max) {
    return TBOX(); // no area within xht so return empty box
  }

  /*
Find reduced RH limit of blob - the right extent of the region BELOW the xht.
*/
  float unused_min = no_min;
  float right_limit = no_max;
  find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (baseline + row->xheight),
                     unused_min, right_limit);
  if (unused_min > right_limit) {
    return TBOX(); // no area within xht so return empty box
  }

  const ICOORD bottom_left(static_cast<int16_t>(std::floor(left_limit)), blob_box.bottom());
  const ICOORD top_right(static_cast<int16_t>(std::ceil(right_limit)), blob_box.top());
  return TBOX(bottom_left, top_right);
}
1768
} // namespace tesseract