Coverage Report

Created: 2025-07-23 07:12

/src/tesseract/src/textord/topitch.cpp
Line
Count
Source (jump to first uncovered line)
1
/**********************************************************************
2
 * File:        topitch.cpp  (Formerly to_pitch.c)
3
 * Description: Code to determine fixed pitchness and the pitch if fixed.
4
 * Author:      Ray Smith
5
 *
6
 * (C) Copyright 1993, Hewlett-Packard Ltd.
7
 ** Licensed under the Apache License, Version 2.0 (the "License");
8
 ** you may not use this file except in compliance with the License.
9
 ** You may obtain a copy of the License at
10
 ** http://www.apache.org/licenses/LICENSE-2.0
11
 ** Unless required by applicable law or agreed to in writing, software
12
 ** distributed under the License is distributed on an "AS IS" BASIS,
13
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 ** See the License for the specific language governing permissions and
15
 ** limitations under the License.
16
 *
17
 **********************************************************************/
18
19
// Include automatically generated configuration file if running autoconf.
20
#ifdef HAVE_CONFIG_H
21
#  include "config_auto.h"
22
#endif
23
24
#include "topitch.h"
25
26
#include "blobbox.h"
27
#include "drawtord.h"
28
#include "makerow.h"
29
#include "pithsync.h"
30
#include "pitsync1.h"
31
#include "statistc.h"
32
#include "tovars.h"
33
#include "wordseg.h"
34
35
#include "helpers.h"
36
37
#include <memory>
38
39
namespace tesseract {
40
41
static BOOL_VAR(textord_all_prop, false, "All doc is proportial text");
42
BOOL_VAR(textord_debug_pitch_test, false, "Debug on fixed pitch test");
43
static BOOL_VAR(textord_disable_pitch_test, false, "Turn off dp fixed pitch algorithm");
44
BOOL_VAR(textord_fast_pitch_test, false, "Do even faster pitch algorithm");
45
BOOL_VAR(textord_debug_pitch_metric, false, "Write full metric stuff");
46
BOOL_VAR(textord_show_row_cuts, false, "Draw row-level cuts");
47
BOOL_VAR(textord_show_page_cuts, false, "Draw page-level cuts");
48
BOOL_VAR(textord_blockndoc_fixed, false, "Attempt whole doc/block fixed pitch");
49
double_VAR(textord_projection_scale, 0.200, "Ding rate for mid-cuts");
50
double_VAR(textord_balance_factor, 1.0, "Ding rate for unbalanced char cells");
51
52
386k
#define BLOCK_STATS_CLUSTERS 10
53
17.2k
#define MAX_ALLOWED_PITCH 100 // max pixel pitch.
54
55
// Comparison callback for qsort() over an array of float.
// Returns 1 if *arg1 is greater than *arg2, -1 if it is smaller and 0
// otherwise, which yields an ascending sort.
static int sort_floats(const void *arg1, const void *arg2) {
  const float lhs = *reinterpret_cast<const float *>(arg1);
  const float rhs = *reinterpret_cast<const float *>(arg2);
  const float delta = lhs - rhs;
  // The sign of the difference is the result; a zero difference (or one that
  // compares neither above nor below zero) gives 0.
  return static_cast<int>(delta > 0) - static_cast<int>(delta < 0);
}
66
67
/**********************************************************************
68
 * compute_fixed_pitch
69
 *
70
 * Decide whether each row is fixed pitch individually.
71
 * Correlate definite and uncertain results to obtain an individual
72
 * result for each row in the TO_ROW class.
73
 **********************************************************************/
74
75
void compute_fixed_pitch(ICOORD page_tr,             // top right
76
                         TO_BLOCK_LIST *port_blocks, // input list
77
                         float gradient,             // page skew
78
                         FCOORD rotation,            // for drawing
79
17.2k
                         bool testing_on) {          // correct orientation
80
17.2k
  TO_BLOCK_IT block_it;                              // iterator
81
17.2k
  TO_BLOCK *block;                                   // current block;
82
17.2k
  TO_ROW *row;                                       // current row
83
17.2k
  int block_index;                                   // block number
84
17.2k
  int row_index;                                     // row number
85
86
#ifndef GRAPHICS_DISABLED
87
  if (textord_show_initial_words && testing_on) {
88
    if (to_win == nullptr) {
89
      create_to_win(page_tr);
90
    }
91
  }
92
#endif
93
94
17.2k
  block_it.set_to_list(port_blocks);
95
17.2k
  block_index = 1;
96
34.5k
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
97
17.2k
    block = block_it.data();
98
17.2k
    compute_block_pitch(block, rotation, block_index, testing_on);
99
17.2k
    block_index++;
100
17.2k
  }
101
102
17.2k
  if (!try_doc_fixed(page_tr, port_blocks, gradient)) {
103
17.2k
    block_index = 1;
104
34.5k
    for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
105
17.2k
      block = block_it.data();
106
17.2k
      if (!try_block_fixed(block, block_index)) {
107
17.2k
        try_rows_fixed(block, block_index, testing_on);
108
17.2k
      }
109
17.2k
      block_index++;
110
17.2k
    }
111
17.2k
  }
112
113
17.2k
  block_index = 1;
114
34.5k
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
115
17.2k
    block = block_it.data();
116
17.2k
    POLY_BLOCK *pb = block->block->pdblk.poly_block();
117
17.2k
    if (pb != nullptr && !pb->IsText()) {
118
0
      continue; // Non-text doesn't exist!
119
0
    }
120
    // row iterator
121
17.2k
    TO_ROW_IT row_it(block->get_rows());
122
17.2k
    row_index = 1;
123
203k
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
124
185k
      row = row_it.data();
125
185k
      fix_row_pitch(row, block, port_blocks, row_index, block_index);
126
185k
      row_index++;
127
185k
    }
128
17.2k
    block_index++;
129
17.2k
  }
130
#ifndef GRAPHICS_DISABLED
131
  if (textord_show_initial_words && testing_on) {
132
    ScrollView::Update();
133
  }
134
#endif
135
17.2k
}
136
137
/**********************************************************************
138
 * fix_row_pitch
139
 *
140
 * Get a pitch_decision for this row by voting among similar rows in the
141
 * block, then similar rows over all the page, or any other rows at all.
142
 **********************************************************************/
143
144
void fix_row_pitch(TO_ROW *bad_row,        // row to fix
145
                   TO_BLOCK *bad_block,    // block of bad_row
146
                   TO_BLOCK_LIST *blocks,  // blocks to scan
147
                   int32_t row_target,     // number of row
148
185k
                   int32_t block_target) { // number of block
149
185k
  int16_t mid_cuts;
150
185k
  int block_votes;               // votes in block
151
185k
  int like_votes;                // votes over page
152
185k
  int other_votes;               // votes of unlike blocks
153
185k
  int block_index;               // number of block
154
185k
  int maxwidth;                  // max pitch
155
185k
  TO_BLOCK_IT block_it = blocks; // block iterator
156
185k
  TO_BLOCK *block;               // current block
157
185k
  TO_ROW *row;                   // current row
158
185k
  float sp_sd;                   // space deviation
159
185k
  STATS block_stats;             // pitches in block
160
185k
  STATS like_stats;              // pitches in page
161
162
185k
  block_votes = like_votes = other_votes = 0;
163
185k
  maxwidth = static_cast<int32_t>(ceil(bad_row->xheight * textord_words_maxspace));
164
185k
  if (bad_row->pitch_decision != PITCH_DEF_FIXED && bad_row->pitch_decision != PITCH_DEF_PROP) {
165
176k
    block_stats.set_range(0, maxwidth - 1);
166
176k
    like_stats.set_range(0, maxwidth - 1);
167
176k
    block_index = 1;
168
352k
    for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
169
176k
      block = block_it.data();
170
176k
      POLY_BLOCK *pb = block->block->pdblk.poly_block();
171
176k
      if (pb != nullptr && !pb->IsText()) {
172
0
        continue; // Non text doesn't exist!
173
0
      }
174
176k
      TO_ROW_IT row_it(block->get_rows());
175
3.84M
      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
176
3.67M
        row = row_it.data();
177
3.67M
        if ((bad_row->all_caps &&
178
3.67M
             row->xheight + row->ascrise <
179
423k
                 (bad_row->xheight + bad_row->ascrise) * (1 + textord_pitch_rowsimilarity) &&
180
3.67M
             row->xheight + row->ascrise >
181
217k
                 (bad_row->xheight + bad_row->ascrise) * (1 - textord_pitch_rowsimilarity)) ||
182
3.67M
            (!bad_row->all_caps &&
183
3.51M
             row->xheight < bad_row->xheight * (1 + textord_pitch_rowsimilarity) &&
184
3.51M
             row->xheight > bad_row->xheight * (1 - textord_pitch_rowsimilarity))) {
185
2.94M
          if (block_index == block_target) {
186
2.94M
            if (row->pitch_decision == PITCH_DEF_FIXED) {
187
20.0k
              block_votes += textord_words_veto_power;
188
20.0k
              block_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power);
189
2.92M
            } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
190
2.92M
                       row->pitch_decision == PITCH_CORR_FIXED) {
191
43.9k
              block_votes++;
192
43.9k
              block_stats.add(static_cast<int32_t>(row->fixed_pitch), 1);
193
2.88M
            } else if (row->pitch_decision == PITCH_DEF_PROP) {
194
48.3k
              block_votes -= textord_words_veto_power;
195
2.83M
            } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
196
2.83M
                       row->pitch_decision == PITCH_CORR_PROP) {
197
1.34M
              block_votes--;
198
1.34M
            }
199
2.94M
          } else {
200
0
            if (row->pitch_decision == PITCH_DEF_FIXED) {
201
0
              like_votes += textord_words_veto_power;
202
0
              like_stats.add(static_cast<int32_t>(row->fixed_pitch), textord_words_veto_power);
203
0
            } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
204
0
                       row->pitch_decision == PITCH_CORR_FIXED) {
205
0
              like_votes++;
206
0
              like_stats.add(static_cast<int32_t>(row->fixed_pitch), 1);
207
0
            } else if (row->pitch_decision == PITCH_DEF_PROP) {
208
0
              like_votes -= textord_words_veto_power;
209
0
            } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
210
0
                       row->pitch_decision == PITCH_CORR_PROP) {
211
0
              like_votes--;
212
0
            }
213
0
          }
214
2.94M
        } else {
215
728k
          if (row->pitch_decision == PITCH_DEF_FIXED) {
216
2.67k
            other_votes += textord_words_veto_power;
217
725k
          } else if (row->pitch_decision == PITCH_MAYBE_FIXED ||
218
725k
                     row->pitch_decision == PITCH_CORR_FIXED) {
219
5.38k
            other_votes++;
220
720k
          } else if (row->pitch_decision == PITCH_DEF_PROP) {
221
30.9k
            other_votes -= textord_words_veto_power;
222
689k
          } else if (row->pitch_decision == PITCH_MAYBE_PROP ||
223
689k
                     row->pitch_decision == PITCH_CORR_PROP) {
224
350k
            other_votes--;
225
350k
          }
226
728k
        }
227
3.67M
      }
228
176k
      block_index++;
229
176k
    }
230
176k
    if (block_votes > textord_words_veto_power) {
231
3.79k
      bad_row->fixed_pitch = block_stats.ile(0.5);
232
3.79k
      bad_row->pitch_decision = PITCH_CORR_FIXED;
233
172k
    } else if (block_votes <= textord_words_veto_power && like_votes > 0) {
234
0
      bad_row->fixed_pitch = like_stats.ile(0.5);
235
0
      bad_row->pitch_decision = PITCH_CORR_FIXED;
236
172k
    } else {
237
172k
      bad_row->pitch_decision = PITCH_CORR_PROP;
238
172k
      if (block_votes == 0 && like_votes == 0 && other_votes > 0 &&
239
172k
          (textord_debug_pitch_test || textord_debug_pitch_metric)) {
240
0
        tprintf(
241
0
            "Warning:row %d of block %d set prop with no like rows against "
242
0
            "trend\n",
243
0
            row_target, block_target);
244
0
      }
245
172k
    }
246
176k
  }
247
185k
  if (textord_debug_pitch_metric) {
248
0
    tprintf(":b_votes=%d:l_votes=%d:o_votes=%d", block_votes, like_votes, other_votes);
249
0
    tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise);
250
0
  }
251
185k
  if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
252
3.79k
    if (bad_row->fixed_pitch < textord_min_xheight) {
253
1.98k
      if (block_votes > 0) {
254
1.98k
        bad_row->fixed_pitch = block_stats.ile(0.5);
255
1.98k
      } else if (block_votes == 0 && like_votes > 0) {
256
0
        bad_row->fixed_pitch = like_stats.ile(0.5);
257
0
      } else {
258
0
        tprintf("Warning:guessing pitch as xheight on row %d, block %d\n", row_target,
259
0
                block_target);
260
0
        bad_row->fixed_pitch = bad_row->xheight;
261
0
      }
262
1.98k
    }
263
3.79k
    if (bad_row->fixed_pitch < textord_min_xheight) {
264
1.98k
      bad_row->fixed_pitch = (float)textord_min_xheight;
265
1.98k
    }
266
3.79k
    bad_row->kern_size = bad_row->fixed_pitch / 4;
267
3.79k
    bad_row->min_space = static_cast<int32_t>(bad_row->fixed_pitch * 0.6);
268
3.79k
    bad_row->max_nonspace = static_cast<int32_t>(bad_row->fixed_pitch * 0.4);
269
3.79k
    bad_row->space_threshold = (bad_row->min_space + bad_row->max_nonspace) / 2;
270
3.79k
    bad_row->space_size = bad_row->fixed_pitch;
271
3.79k
    if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) {
272
2.63k
      tune_row_pitch(bad_row, &bad_row->projection, bad_row->projection_left,
273
2.63k
                     bad_row->projection_right,
274
2.63k
                     (bad_row->fixed_pitch + bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
275
2.63k
                     sp_sd, mid_cuts, &bad_row->char_cells, false);
276
2.63k
    }
277
182k
  } else if (bad_row->pitch_decision == PITCH_CORR_PROP ||
278
182k
             bad_row->pitch_decision == PITCH_DEF_PROP) {
279
180k
    bad_row->fixed_pitch = 0.0f;
280
180k
    bad_row->char_cells.clear();
281
180k
  }
282
185k
}
283
284
/**********************************************************************
285
 * compute_block_pitch
286
 *
287
 * Decide whether each block is fixed pitch individually.
288
 **********************************************************************/
289
290
void compute_block_pitch(TO_BLOCK *block,     // input list
291
                         FCOORD rotation,     // for drawing
292
                         int32_t block_index, // block number
293
17.2k
                         bool testing_on) {   // correct orientation
294
17.2k
  TBOX block_box;                             // bounding box
295
296
17.2k
  block_box = block->block->pdblk.bounding_box();
297
17.2k
  if (testing_on && textord_debug_pitch_test) {
298
0
    tprintf("Block %d at (%d,%d)->(%d,%d)\n", block_index, block_box.left(), block_box.bottom(),
299
0
            block_box.right(), block_box.top());
300
0
  }
301
17.2k
  block->min_space = static_cast<int32_t>(floor(block->xheight * textord_words_default_minspace));
302
17.2k
  block->max_nonspace = static_cast<int32_t>(ceil(block->xheight * textord_words_default_nonspace));
303
17.2k
  block->fixed_pitch = 0.0f;
304
17.2k
  block->space_size = static_cast<float>(block->min_space);
305
17.2k
  block->kern_size = static_cast<float>(block->max_nonspace);
306
17.2k
  block->pr_nonsp = block->xheight * words_default_prop_nonspace;
307
17.2k
  block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;
308
17.2k
  if (!block->get_rows()->empty()) {
309
16.7k
    ASSERT_HOST(block->xheight > 0);
310
16.7k
    find_repeated_chars(block, textord_show_initial_words && testing_on);
311
#ifndef GRAPHICS_DISABLED
312
    if (textord_show_initial_words && testing_on) {
313
      // overlap_picture_ops(true);
314
      ScrollView::Update();
315
    }
316
#endif
317
16.7k
    compute_rows_pitch(block, block_index, textord_debug_pitch_test && testing_on);
318
16.7k
  }
319
17.2k
}
320
321
/**********************************************************************
322
 * compute_rows_pitch
323
 *
324
 * Decide whether each row is fixed pitch individually.
325
 **********************************************************************/
326
327
bool compute_rows_pitch( // find line stats
328
    TO_BLOCK *block,     // block to do
329
    int32_t block_index, // block number
330
    bool testing_on      // correct orientation
331
16.7k
) {
332
16.7k
  int32_t maxwidth;   // of spaces
333
16.7k
  TO_ROW *row;        // current row
334
16.7k
  int32_t row_index;  // row number.
335
16.7k
  float lower, upper; // cluster thresholds
336
16.7k
  TO_ROW_IT row_it = block->get_rows();
337
338
16.7k
  row_index = 1;
339
202k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
340
185k
    row = row_it.data();
341
185k
    ASSERT_HOST(row->xheight > 0);
342
185k
    row->compute_vertical_projection();
343
185k
    maxwidth = static_cast<int32_t>(ceil(row->xheight * textord_words_maxspace));
344
185k
    if (row_pitch_stats(row, maxwidth, testing_on) &&
345
185k
        find_row_pitch(row, maxwidth, textord_dotmatrix_gap + 1, block, block_index, row_index,
346
128k
                       testing_on)) {
347
14.6k
      if (row->fixed_pitch == 0) {
348
0
        lower = row->pr_nonsp;
349
0
        upper = row->pr_space;
350
0
        row->space_size = upper;
351
0
        row->kern_size = lower;
352
0
      }
353
171k
    } else {
354
171k
      row->fixed_pitch = 0.0f; // insufficient data
355
171k
      row->pitch_decision = PITCH_DUNNO;
356
171k
    }
357
185k
    row_index++;
358
185k
  }
359
16.7k
  return false;
360
16.7k
}
361
362
/**********************************************************************
363
 * try_doc_fixed
364
 *
365
 * Attempt to call the entire document fixed pitch.
366
 **********************************************************************/
367
368
bool try_doc_fixed(             // determine pitch
369
    ICOORD page_tr,             // top right
370
    TO_BLOCK_LIST *port_blocks, // input list
371
    float gradient              // page skew
372
17.2k
) {
373
17.2k
  int16_t master_x; // uniform shifts
374
17.2k
  int16_t pitch;    // median pitch.
375
17.2k
  int x;            // profile coord
376
17.2k
  int prop_blocks;  // correct counts
377
17.2k
  int fixed_blocks;
378
17.2k
  int total_row_count; // total in page
379
                       // iterator
380
17.2k
  TO_BLOCK_IT block_it = port_blocks;
381
17.2k
  TO_BLOCK *block;         // current block;
382
17.2k
  TO_ROW *row;             // current row
383
17.2k
  int16_t projection_left; // edges
384
17.2k
  int16_t projection_right;
385
17.2k
  int16_t row_left; // edges of row
386
17.2k
  int16_t row_right;
387
17.2k
  float master_y;     // uniform shifts
388
17.2k
  float shift_factor; // page skew correction
389
17.2k
  float final_pitch;  // output pitch
390
17.2k
  float row_y;        // baseline
391
17.2k
  STATS projection;   // entire page
392
17.2k
  STATS pitches(0, MAX_ALLOWED_PITCH - 1);
393
  // for median
394
17.2k
  float sp_sd;      // space sd
395
17.2k
  int16_t mid_cuts; // no of cheap cuts
396
17.2k
  float pitch_sd;   // sync rating
397
398
17.2k
  if (!textord_blockndoc_fixed ||
399
17.2k
      block_it.empty() || block_it.data()->get_rows()->empty()) {
400
17.2k
    return false;
401
17.2k
  }
402
0
  shift_factor = gradient / (gradient * gradient + 1);
403
  // row iterator
404
0
  TO_ROW_IT row_it(block_it.data()->get_rows());
405
0
  master_x = row_it.data()->projection_left;
406
0
  master_y = row_it.data()->baseline.y(master_x);
407
0
  projection_left = INT16_MAX;
408
0
  projection_right = -INT16_MAX;
409
0
  prop_blocks = 0;
410
0
  fixed_blocks = 0;
411
0
  total_row_count = 0;
412
413
0
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
414
0
    block = block_it.data();
415
0
    row_it.set_to_list(block->get_rows());
416
0
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
417
0
      row = row_it.data();
418
0
      total_row_count++;
419
0
      if (row->fixed_pitch > 0) {
420
0
        pitches.add(static_cast<int32_t>(row->fixed_pitch), 1);
421
0
      }
422
      // find median
423
0
      row_y = row->baseline.y(master_x);
424
0
      row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));
425
0
      row_right = static_cast<int16_t>(row->projection_right - shift_factor * (master_y - row_y));
426
0
      if (row_left < projection_left) {
427
0
        projection_left = row_left;
428
0
      }
429
0
      if (row_right > projection_right) {
430
0
        projection_right = row_right;
431
0
      }
432
0
    }
433
0
  }
434
0
  if (pitches.get_total() == 0) {
435
0
    return false;
436
0
  }
437
0
  projection.set_range(projection_left, projection_right - 1);
438
439
0
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
440
0
    block = block_it.data();
441
0
    row_it.set_to_list(block->get_rows());
442
0
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
443
0
      row = row_it.data();
444
0
      row_y = row->baseline.y(master_x);
445
0
      row_left = static_cast<int16_t>(row->projection_left - shift_factor * (master_y - row_y));
446
0
      for (x = row->projection_left; x < row->projection_right; x++, row_left++) {
447
0
        projection.add(row_left, row->projection.pile_count(x));
448
0
      }
449
0
    }
450
0
  }
451
452
0
  row_it.set_to_list(block_it.data()->get_rows());
453
0
  row = row_it.data();
454
#ifndef GRAPHICS_DISABLED
455
  if (textord_show_page_cuts && to_win != nullptr) {
456
    projection.plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
457
  }
458
#endif
459
0
  final_pitch = pitches.ile(0.5);
460
0
  pitch = static_cast<int16_t>(final_pitch);
461
0
  pitch_sd = tune_row_pitch(row, &projection, projection_left, projection_right, pitch * 0.75,
462
0
                            final_pitch, sp_sd, mid_cuts, &row->char_cells, false);
463
464
0
  if (textord_debug_pitch_metric) {
465
0
    tprintf(
466
0
        "try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%"
467
0
        "g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
468
0
        prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, pitch_sd / total_row_count,
469
0
        pitch_sd / pitch, pitch_sd / total_row_count / pitch);
470
0
  }
471
472
#ifndef GRAPHICS_DISABLED
473
  if (textord_show_page_cuts && to_win != nullptr) {
474
    float row_shift;              // shift for row
475
    ICOORDELT_LIST *master_cells; // cells for page
476
    master_cells = &row->char_cells;
477
    for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
478
      block = block_it.data();
479
      row_it.set_to_list(block->get_rows());
480
      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
481
        row = row_it.data();
482
        row_y = row->baseline.y(master_x);
483
        row_shift = shift_factor * (master_y - row_y);
484
        plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells);
485
      }
486
    }
487
  }
488
#endif
489
0
  row->char_cells.clear();
490
0
  return false;
491
0
}
492
493
/**********************************************************************
494
 * try_block_fixed
495
 *
496
 * Try to call the entire block fixed.
497
 **********************************************************************/
498
499
bool try_block_fixed(   // find line stats
500
    TO_BLOCK *block,    // block to do
501
    int32_t block_index // block number
502
17.2k
) {
503
17.2k
  return false;
504
17.2k
}
505
506
/**********************************************************************
507
 * try_rows_fixed
508
 *
509
 * Decide whether each row is fixed pitch individually.
510
 **********************************************************************/
511
512
bool try_rows_fixed(     // find line stats
513
    TO_BLOCK *block,     // block to do
514
    int32_t block_index, // block number
515
    bool testing_on      // correct orientation
516
17.2k
) {
517
17.2k
  TO_ROW *row;           // current row
518
17.2k
  int32_t def_fixed = 0; // counters
519
17.2k
  int32_t def_prop = 0;
520
17.2k
  int32_t maybe_fixed = 0;
521
17.2k
  int32_t maybe_prop = 0;
522
17.2k
  int32_t dunno = 0;
523
17.2k
  int32_t corr_fixed = 0;
524
17.2k
  int32_t corr_prop = 0;
525
17.2k
  float lower, upper; // cluster thresholds
526
17.2k
  TO_ROW_IT row_it = block->get_rows();
527
528
203k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
529
185k
    row = row_it.data();
530
185k
    ASSERT_HOST(row->xheight > 0);
531
185k
    if (row->fixed_pitch > 0 && fixed_pitch_row(row, block->block, block_index)) {
532
14.6k
      if (row->fixed_pitch == 0) {
533
0
        lower = row->pr_nonsp;
534
0
        upper = row->pr_space;
535
0
        row->space_size = upper;
536
0
        row->kern_size = lower;
537
0
      }
538
14.6k
    }
539
185k
  }
540
17.2k
  count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,
541
17.2k
                    dunno);
542
17.2k
  if (testing_on &&
543
17.2k
      (textord_debug_pitch_test || textord_blocksall_prop || textord_blocksall_fixed)) {
544
0
    tprintf("Initially:");
545
0
    print_block_counts(block, block_index);
546
0
  }
547
17.2k
  if (def_fixed > def_prop * textord_words_veto_power) {
548
269
    block->pitch_decision = PITCH_DEF_FIXED;
549
17.0k
  } else if (def_prop > def_fixed * textord_words_veto_power) {
550
1.45k
    block->pitch_decision = PITCH_DEF_PROP;
551
15.5k
  } else if (def_fixed > 0 || def_prop > 0) {
552
190
    block->pitch_decision = PITCH_DUNNO;
553
15.3k
  } else if (maybe_fixed > maybe_prop * textord_words_veto_power) {
554
58
    block->pitch_decision = PITCH_MAYBE_FIXED;
555
15.3k
  } else if (maybe_prop > maybe_fixed * textord_words_veto_power) {
556
188
    block->pitch_decision = PITCH_MAYBE_PROP;
557
15.1k
  } else {
558
15.1k
    block->pitch_decision = PITCH_DUNNO;
559
15.1k
  }
560
17.2k
  return false;
561
17.2k
}
562
563
/**********************************************************************
564
 * print_block_counts
565
 *
566
 * Count up how many rows have what decision and print the results.
567
 **********************************************************************/
568
569
void print_block_counts( // find line stats
570
    TO_BLOCK *block,     // block to do
571
    int32_t block_index  // block number
572
0
) {
573
0
  int32_t def_fixed = 0; // counters
574
0
  int32_t def_prop = 0;
575
0
  int32_t maybe_fixed = 0;
576
0
  int32_t maybe_prop = 0;
577
0
  int32_t dunno = 0;
578
0
  int32_t corr_fixed = 0;
579
0
  int32_t corr_prop = 0;
580
581
0
  count_block_votes(block, def_fixed, def_prop, maybe_fixed, maybe_prop, corr_fixed, corr_prop,
582
0
                    dunno);
583
0
  tprintf("Block %d has (%d,%d,%d)", block_index, def_fixed, maybe_fixed, corr_fixed);
584
0
  if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed)) {
585
0
    tprintf(" (Wrongly)");
586
0
  }
587
0
  tprintf(" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
588
0
  if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop)) {
589
0
    tprintf(" (Wrongly)");
590
0
  }
591
0
  tprintf(" prop, %d dunno\n", dunno);
592
0
}
593
594
/**********************************************************************
595
 * count_block_votes
596
 *
597
 * Count the number of rows in the block with each kind of pitch_decision.
598
 **********************************************************************/
599
600
void count_block_votes( // find line stats
601
    TO_BLOCK *block,    // block to do
602
    int32_t &def_fixed, // add to counts
603
    int32_t &def_prop, int32_t &maybe_fixed, int32_t &maybe_prop, int32_t &corr_fixed,
604
17.2k
    int32_t &corr_prop, int32_t &dunno) {
605
17.2k
  TO_ROW *row; // current row
606
17.2k
  TO_ROW_IT row_it = block->get_rows();
607
608
203k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
609
185k
    row = row_it.data();
610
185k
    switch (row->pitch_decision) {
611
171k
      case PITCH_DUNNO:
612
171k
        dunno++;
613
171k
        break;
614
7.88k
      case PITCH_DEF_PROP:
615
7.88k
        def_prop++;
616
7.88k
        break;
617
3.95k
      case PITCH_MAYBE_PROP:
618
3.95k
        maybe_prop++;
619
3.95k
        break;
620
1.77k
      case PITCH_DEF_FIXED:
621
1.77k
        def_fixed++;
622
1.77k
        break;
623
1.06k
      case PITCH_MAYBE_FIXED:
624
1.06k
        maybe_fixed++;
625
1.06k
        break;
626
0
      case PITCH_CORR_PROP:
627
0
        corr_prop++;
628
0
        break;
629
0
      case PITCH_CORR_FIXED:
630
0
        corr_fixed++;
631
0
        break;
632
185k
    }
633
185k
  }
634
17.2k
}
635
636
/**********************************************************************
637
 * row_pitch_stats
638
 *
639
 * Decide whether each row is fixed pitch individually.
640
 **********************************************************************/
641
642
bool row_pitch_stats( // find line stats
643
    TO_ROW *row,      // current row
644
    int32_t maxwidth, // of spaces
645
    bool testing_on   // correct orientation
646
185k
) {
647
185k
  BLOBNBOX *blob;        // current blob
648
185k
  int gap_index;         // current gap
649
185k
  int32_t prev_x;        // end of prev blob
650
185k
  int32_t cluster_count; // no of clusters
651
185k
  int32_t prev_count;    // of clusters
652
185k
  int32_t smooth_factor; // for smoothing stats
653
185k
  TBOX blob_box;         // bounding box
654
185k
  float lower, upper;    // cluster thresholds
655
                         // gap sizes
656
185k
  float gaps[BLOCK_STATS_CLUSTERS];
657
  // blobs
658
185k
  BLOBNBOX_IT blob_it = row->blob_list();
659
185k
  STATS gap_stats(0, maxwidth - 1);
660
185k
  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
661
  // clusters
662
663
185k
  smooth_factor = static_cast<int32_t>(row->xheight * textord_wordstats_smooth_factor + 1.5);
664
185k
  if (!blob_it.empty()) {
665
185k
    prev_x = blob_it.data()->bounding_box().right();
666
185k
    blob_it.forward();
667
2.68M
    while (!blob_it.at_first()) {
668
2.50M
      blob = blob_it.data();
669
2.50M
      if (!blob->joined_to_prev()) {
670
1.33M
        blob_box = blob->bounding_box();
671
1.33M
        if (blob_box.left() - prev_x < maxwidth) {
672
1.33M
          gap_stats.add(blob_box.left() - prev_x, 1);
673
1.33M
        }
674
1.33M
        prev_x = blob_box.right();
675
1.33M
      }
676
2.50M
      blob_it.forward();
677
2.50M
    }
678
185k
  }
679
185k
  if (gap_stats.get_total() == 0) {
680
57.1k
    return false;
681
57.1k
  }
682
128k
  cluster_count = 0;
683
128k
  lower = row->xheight * words_initial_lower;
684
128k
  upper = row->xheight * words_initial_upper;
685
128k
  gap_stats.smooth(smooth_factor);
686
257k
  do {
687
257k
    prev_count = cluster_count;
688
257k
    cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
689
257k
                                      BLOCK_STATS_CLUSTERS, cluster_stats);
690
257k
  } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
691
128k
  if (cluster_count < 1) {
692
0
    return false;
693
0
  }
694
304k
  for (gap_index = 0; gap_index < cluster_count; gap_index++) {
695
175k
    gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
696
175k
  }
697
  // get medians
698
128k
  if (testing_on) {
699
0
    tprintf("cluster_count=%d:", cluster_count);
700
0
    for (gap_index = 0; gap_index < cluster_count; gap_index++) {
701
0
      tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
702
0
    }
703
0
    tprintf("\n");
704
0
  }
705
128k
  qsort(gaps, cluster_count, sizeof(float), sort_floats);
706
707
  // Try to find proportional non-space and space for row.
708
128k
  lower = row->xheight * words_default_prop_nonspace;
709
128k
  upper = row->xheight * textord_words_min_minspace;
710
239k
  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < lower; gap_index++) {
711
110k
    ;
712
110k
  }
713
128k
  if (gap_index == 0) {
714
18.6k
    if (testing_on) {
715
0
      tprintf("No clusters below nonspace threshold!!\n");
716
0
    }
717
18.6k
    if (cluster_count > 1) {
718
4.91k
      row->pr_nonsp = gaps[0];
719
4.91k
      row->pr_space = gaps[1];
720
13.6k
    } else {
721
13.6k
      row->pr_nonsp = lower;
722
13.6k
      row->pr_space = gaps[0];
723
13.6k
    }
724
110k
  } else {
725
110k
    row->pr_nonsp = gaps[gap_index - 1];
726
111k
    while (gap_index < cluster_count && gaps[gap_index] < upper) {
727
1.14k
      gap_index++;
728
1.14k
    }
729
110k
    if (gap_index == cluster_count) {
730
78.7k
      if (testing_on) {
731
0
        tprintf("No clusters above nonspace threshold!!\n");
732
0
      }
733
78.7k
      row->pr_space = lower * textord_spacesize_ratioprop;
734
78.7k
    } else {
735
31.4k
      row->pr_space = gaps[gap_index];
736
31.4k
    }
737
110k
  }
738
739
  // Now try to find the fixed pitch space and non-space.
740
128k
  upper = row->xheight * words_default_fixed_space;
741
281k
  for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] < upper; gap_index++) {
742
152k
    ;
743
152k
  }
744
128k
  if (gap_index == 0) {
745
3.46k
    if (testing_on) {
746
0
      tprintf("No clusters below space threshold!!\n");
747
0
    }
748
3.46k
    row->fp_nonsp = upper;
749
3.46k
    row->fp_space = gaps[0];
750
125k
  } else {
751
125k
    row->fp_nonsp = gaps[gap_index - 1];
752
125k
    if (gap_index == cluster_count) {
753
108k
      if (testing_on) {
754
0
        tprintf("No clusters above space threshold!!\n");
755
0
      }
756
108k
      row->fp_space = row->xheight;
757
108k
    } else {
758
16.4k
      row->fp_space = gaps[gap_index];
759
16.4k
    }
760
125k
  }
761
128k
  if (testing_on) {
762
0
    tprintf(
763
0
        "Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, "
764
0
        "fp_space=%g\n",
765
0
        row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
766
0
  }
767
128k
  return true; // computed some stats
768
128k
}
769
770
/**********************************************************************
771
 * find_row_pitch
772
 *
773
 * Check to see if this row could be fixed pitch using the given spacings.
774
 * Blobs with gaps smaller than the lower threshold are assumed to be one.
775
 * The larger threshold is the word gap threshold.
776
 **********************************************************************/
777
778
bool find_row_pitch(     // find lines
779
    TO_ROW *row,         // row to do
780
    int32_t maxwidth,    // max permitted space
781
    int32_t dm_gap,      // ignorable gaps
782
    TO_BLOCK *block,     // block of row
783
    int32_t block_index, // block_number
784
    int32_t row_index,   // number of row
785
    bool testing_on      // correct orientation
786
128k
) {
787
128k
  bool used_dm_model; // looks like dot matrix
788
128k
  float min_space;    // estimate threshold
789
128k
  float non_space;    // gap size
790
128k
  float gap_iqr;      // interquartile range
791
128k
  float pitch_iqr;
792
128k
  float dm_gap_iqr; // interquartile range
793
128k
  float dm_pitch_iqr;
794
128k
  float dm_pitch;      // pitch with dm on
795
128k
  float pitch;         // revised estimate
796
128k
  float initial_pitch; // guess at pitch
797
128k
  STATS gap_stats(0, maxwidth - 1);
798
  // centre-centre
799
128k
  STATS pitch_stats(0, maxwidth - 1);
800
801
128k
  row->fixed_pitch = 0.0f;
802
128k
  initial_pitch = row->fp_space;
803
128k
  if (initial_pitch > row->xheight * (1 + words_default_fixed_limit)) {
804
5.36k
    initial_pitch = row->xheight; // keep pitch decent
805
5.36k
  }
806
128k
  non_space = row->fp_nonsp;
807
128k
  if (non_space > initial_pitch) {
808
0
    non_space = initial_pitch;
809
0
  }
810
128k
  min_space = (initial_pitch + non_space) / 2;
811
812
128k
  if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false,
813
128k
                         dm_gap)) {
814
125k
    dm_gap_iqr = 0.0001f;
815
125k
    dm_pitch_iqr = maxwidth * 2.0f;
816
125k
    dm_pitch = initial_pitch;
817
125k
  } else {
818
3.14k
    dm_gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
819
3.14k
    dm_pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
820
3.14k
    dm_pitch = pitch_stats.ile(0.5);
821
3.14k
  }
822
128k
  gap_stats.clear();
823
128k
  pitch_stats.clear();
824
128k
  if (!count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch, min_space, true, false, 0)) {
825
116k
    gap_iqr = 0.0001f;
826
116k
    pitch_iqr = maxwidth * 3.0f;
827
116k
  } else {
828
12.7k
    gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
829
12.7k
    pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
830
12.7k
    if (testing_on) {
831
0
      tprintf(
832
0
          "First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, "
833
0
          "pitch=%g\n",
834
0
          initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5));
835
0
    }
836
12.7k
    initial_pitch = pitch_stats.ile(0.5);
837
12.7k
    if (min_space > initial_pitch && count_pitch_stats(row, &gap_stats, &pitch_stats, initial_pitch,
838
3.33k
                                                       initial_pitch, true, false, 0)) {
839
3.25k
      min_space = initial_pitch;
840
3.25k
      gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
841
3.25k
      pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
842
3.25k
      if (testing_on) {
843
0
        tprintf(
844
0
            "Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, "
845
0
            "pitch=%g\n",
846
0
            initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile(0.5));
847
0
      }
848
3.25k
      initial_pitch = pitch_stats.ile(0.5);
849
3.25k
    }
850
12.7k
  }
851
128k
  if (textord_debug_pitch_metric) {
852
0
    tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:", block_index,
853
0
            row_index, 'X', pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
854
0
            pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth
855
0
                ? 'D'
856
0
                : (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
857
0
  }
858
128k
  if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
859
114k
    row->pitch_decision = PITCH_DUNNO;
860
114k
    if (textord_debug_pitch_metric) {
861
0
      tprintf("\n");
862
0
    }
863
114k
    return false; // insufficient data
864
114k
  }
865
14.6k
  if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
866
12.7k
    if (testing_on) {
867
0
      tprintf(
868
0
          "Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, "
869
0
          "dm_gap_iqr=%g\n",
870
0
          pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
871
0
    }
872
12.7k
    gap_iqr = gap_stats.ile(0.75) - gap_stats.ile(0.25);
873
12.7k
    pitch_iqr = pitch_stats.ile(0.75) - pitch_stats.ile(0.25);
874
12.7k
    pitch = pitch_stats.ile(0.5);
875
12.7k
    used_dm_model = false;
876
12.7k
  } else {
877
1.97k
    if (testing_on) {
878
0
      tprintf(
879
0
          "Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, "
880
0
          "dm_gap_iqr=%g\n",
881
0
          pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
882
0
    }
883
1.97k
    gap_iqr = dm_gap_iqr;
884
1.97k
    pitch_iqr = dm_pitch_iqr;
885
1.97k
    pitch = dm_pitch;
886
1.97k
    used_dm_model = true;
887
1.97k
  }
888
14.6k
  if (textord_debug_pitch_metric) {
889
0
    tprintf("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:", pitch_iqr, gap_iqr, pitch);
890
0
    tprintf("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:", pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
891
0
            pitch_iqr < gap_iqr * textord_fpiqr_ratio &&
892
0
                    pitch_iqr < block->xheight * textord_max_pitch_iqr &&
893
0
                    pitch < block->xheight * textord_words_default_maxspace
894
0
                ? 'F'
895
0
                : 'P');
896
0
  }
897
14.6k
  if (pitch_iqr < gap_iqr * textord_fpiqr_ratio &&
898
14.6k
      pitch_iqr < block->xheight * textord_max_pitch_iqr &&
899
14.6k
      pitch < block->xheight * textord_words_default_maxspace) {
900
7.27k
    row->pitch_decision = PITCH_MAYBE_FIXED;
901
7.39k
  } else {
902
7.39k
    row->pitch_decision = PITCH_MAYBE_PROP;
903
7.39k
  }
904
14.6k
  row->fixed_pitch = pitch;
905
14.6k
  row->kern_size = gap_stats.ile(0.5);
906
14.6k
  row->min_space = static_cast<int32_t>(row->fixed_pitch + non_space) / 2;
907
14.6k
  if (row->min_space > row->fixed_pitch) {
908
168
    row->min_space = static_cast<int32_t>(row->fixed_pitch);
909
168
  }
910
14.6k
  row->max_nonspace = row->min_space;
911
14.6k
  row->space_size = row->fixed_pitch;
912
14.6k
  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
913
14.6k
  row->used_dm_model = used_dm_model;
914
14.6k
  return true;
915
128k
}
916
917
/**********************************************************************
918
 * fixed_pitch_row
919
 *
920
 * Check to see if this row could be fixed pitch using the given spacings.
921
 * Blobs with gaps smaller than the lower threshold are assumed to be one.
922
 * The larger threshold is the word gap threshold.
923
 **********************************************************************/
924
925
bool fixed_pitch_row(TO_ROW *row, // row to do
926
                     BLOCK *block,
927
                     int32_t block_index // block_number
928
14.6k
) {
929
14.6k
  const char *res_string; // pitch result
930
14.6k
  int16_t mid_cuts;       // no of cheap cuts
931
14.6k
  float non_space;        // gap size
932
14.6k
  float pitch_sd;         // error on pitch
933
14.6k
  float sp_sd = 0.0f;     // space sd
934
935
14.6k
  non_space = row->fp_nonsp;
936
14.6k
  if (non_space > row->fixed_pitch) {
937
281
    non_space = row->fixed_pitch;
938
281
  }
939
14.6k
  POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;
940
14.6k
  if (textord_all_prop || (pb != nullptr && !pb->IsText())) {
941
    // Set the decision to definitely proportional.
942
0
    pitch_sd = textord_words_def_prop * row->fixed_pitch;
943
0
    row->pitch_decision = PITCH_DEF_PROP;
944
14.6k
  } else {
945
14.6k
    pitch_sd = tune_row_pitch(row, &row->projection, row->projection_left, row->projection_right,
946
14.6k
                              (row->fixed_pitch + non_space * 3) / 4, row->fixed_pitch, sp_sd,
947
14.6k
                              mid_cuts, &row->char_cells, block_index == textord_debug_block);
948
14.6k
    if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch &&
949
14.6k
        ((pitsync_linear_version & 3) < 3 ||
950
2.83k
         ((pitsync_linear_version & 3) >= 3 &&
951
2.83k
          (row->used_dm_model || sp_sd > 20 || (pitch_sd == 0 && sp_sd > 10))))) {
952
2.83k
      if (pitch_sd < textord_words_def_fixed * row->fixed_pitch && !row->all_caps &&
953
2.83k
          ((pitsync_linear_version & 3) < 3 || sp_sd > 20)) {
954
1.77k
        row->pitch_decision = PITCH_DEF_FIXED;
955
1.77k
      } else {
956
1.06k
        row->pitch_decision = PITCH_MAYBE_FIXED;
957
1.06k
      }
958
11.8k
    } else if ((pitsync_linear_version & 3) < 3 || sp_sd > 20 || mid_cuts > 0 ||
959
11.8k
               pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
960
11.8k
      if (pitch_sd < textord_words_def_prop * row->fixed_pitch) {
961
3.95k
        row->pitch_decision = PITCH_MAYBE_PROP;
962
7.88k
      } else {
963
7.88k
        row->pitch_decision = PITCH_DEF_PROP;
964
7.88k
      }
965
11.8k
    } else {
966
0
      row->pitch_decision = PITCH_DUNNO;
967
0
    }
968
14.6k
  }
969
970
14.6k
  if (textord_debug_pitch_metric) {
971
0
    res_string = "??";
972
0
    switch (row->pitch_decision) {
973
0
      case PITCH_DEF_PROP:
974
0
        res_string = "DP";
975
0
        break;
976
0
      case PITCH_MAYBE_PROP:
977
0
        res_string = "MP";
978
0
        break;
979
0
      case PITCH_DEF_FIXED:
980
0
        res_string = "DF";
981
0
        break;
982
0
      case PITCH_MAYBE_FIXED:
983
0
        res_string = "MF";
984
0
        break;
985
0
      default:
986
0
        res_string = "??";
987
0
    }
988
0
    tprintf(":sd/p=%g:occ=%g:init_res=%s\n", pitch_sd / row->fixed_pitch, sp_sd, res_string);
989
0
  }
990
14.6k
  return true;
991
14.6k
}
992
993
/**********************************************************************
994
 * count_pitch_stats
995
 *
996
 * Count up the gap and pitch stats on the block to see if it is fixed pitch.
997
 * Blobs with gaps smaller than the lower threshold are assumed to be one.
998
 * The larger threshold is the word gap threshold.
999
 * The return value indicates whether there were any decent values to use.
1000
 **********************************************************************/
1001
1002
bool count_pitch_stats(  // find lines
1003
    TO_ROW *row,         // row to do
1004
    STATS *gap_stats,    // blob gaps
1005
    STATS *pitch_stats,  // centre-centre stats
1006
    float initial_pitch, // guess at pitch
1007
    float min_space,     // estimate space size
1008
    bool ignore_outsize, // discard big objects
1009
    bool split_outsize,  // split big objects
1010
    int32_t dm_gap       // ignorable gaps
1011
261k
) {
1012
261k
  bool prev_valid; // not word broken
1013
261k
  BLOBNBOX *blob;  // current blob
1014
                   // blobs
1015
261k
  BLOBNBOX_IT blob_it = row->blob_list();
1016
261k
  int32_t prev_right;  // end of prev blob
1017
261k
  int32_t prev_centre; // centre of previous blob
1018
261k
  int32_t x_centre;    // centre of this blob
1019
261k
  int32_t blob_width;  // width of blob
1020
261k
  int32_t width_units; // no of widths in blob
1021
261k
  float width;         // blob width
1022
261k
  TBOX blob_box;       // bounding box
1023
261k
  TBOX joined_box;     // of super blob
1024
1025
261k
  gap_stats->clear();
1026
261k
  pitch_stats->clear();
1027
261k
  if (blob_it.empty()) {
1028
0
    return false;
1029
0
  }
1030
261k
  prev_valid = false;
1031
261k
  prev_centre = 0;
1032
261k
  prev_right = 0; // stop compiler warning
1033
261k
  joined_box = blob_it.data()->bounding_box();
1034
5.19M
  do {
1035
5.19M
    blob_it.forward();
1036
5.19M
    blob = blob_it.data();
1037
5.19M
    if (!blob->joined_to_prev()) {
1038
2.97M
      blob_box = blob->bounding_box();
1039
2.97M
      if ((blob_box.left() - joined_box.right() < dm_gap && !blob_it.at_first()) ||
1040
2.97M
          blob->cblob() == nullptr) {
1041
1.46M
        joined_box += blob_box; // merge blobs
1042
1.50M
      } else {
1043
1.50M
        blob_width = joined_box.width();
1044
1.50M
        if (split_outsize) {
1045
0
          width_units =
1046
0
              static_cast<int32_t>(floor(static_cast<float>(blob_width) / initial_pitch + 0.5));
1047
0
          if (width_units < 1) {
1048
0
            width_units = 1;
1049
0
          }
1050
0
          width_units--;
1051
1.50M
        } else if (ignore_outsize) {
1052
1.50M
          width = static_cast<float>(blob_width) / initial_pitch;
1053
1.50M
          width_units =
1054
1.50M
              width < 1 + words_default_fixed_limit && width > 1 - words_default_fixed_limit ? 0
1055
1.50M
                                                                                             : -1;
1056
1.50M
        } else {
1057
0
          width_units = 0; // everything in
1058
0
        }
1059
1.50M
        x_centre = static_cast<int32_t>(joined_box.left() +
1060
1.50M
                                        (blob_width - width_units * initial_pitch) / 2);
1061
1.50M
        if (prev_valid && width_units >= 0) {
1062
          //                                              if (width_units>0)
1063
          //                                              {
1064
          //                                                      tprintf("wu=%d,
1065
          //                                                      width=%d,
1066
          //                                                      xc=%d, adding
1067
          //                                                      %d\n",
1068
          //                                                              width_units,blob_width,x_centre,x_centre-prev_centre);
1069
          //                                              }
1070
134k
          gap_stats->add(joined_box.left() - prev_right, 1);
1071
134k
          pitch_stats->add(x_centre - prev_centre, 1);
1072
134k
        }
1073
1.50M
        prev_centre = static_cast<int32_t>(x_centre + width_units * initial_pitch);
1074
1.50M
        prev_right = joined_box.right();
1075
1.50M
        prev_valid = blob_box.left() - joined_box.right() < min_space;
1076
1.50M
        prev_valid = prev_valid && width_units >= 0;
1077
1.50M
        joined_box = blob_box;
1078
1.50M
      }
1079
2.97M
    }
1080
5.19M
  } while (!blob_it.at_first());
1081
261k
  return gap_stats->get_total() >= 3;
1082
261k
}
1083
1084
/**********************************************************************
1085
 * tune_row_pitch
1086
 *
1087
 * Use a dp algorithm to fit the character cells and return the sd of
1088
 * the cell size over the row.
1089
 **********************************************************************/
1090
1091
float tune_row_pitch(           // find fp cells
1092
    TO_ROW *row,                // row to do
1093
    STATS *projection,          // vertical projection
1094
    int16_t projection_left,    // edge of projection
1095
    int16_t projection_right,   // edge of projection
1096
    float space_size,           // size of blank
1097
    float &initial_pitch,       // guess at pitch
1098
    float &best_sp_sd,          // space sd
1099
    int16_t &best_mid_cuts,     // no of cheap cuts
1100
    ICOORDELT_LIST *best_cells, // row cells
1101
    bool testing_on             // individual words
1102
17.3k
) {
1103
17.3k
  int pitch_delta;           // offset pitch
1104
17.3k
  int16_t mid_cuts;          // cheap cuts
1105
17.3k
  float pitch_sd;            // current sd
1106
17.3k
  float best_sd;             // best result
1107
17.3k
  float best_pitch;          // pitch for best result
1108
17.3k
  float initial_sd;          // starting error
1109
17.3k
  float sp_sd;               // space sd
1110
17.3k
  ICOORDELT_LIST test_cells; // row cells
1111
17.3k
  ICOORDELT_IT best_it;      // start of best list
1112
1113
17.3k
  if (textord_fast_pitch_test) {
1114
0
    return tune_row_pitch2(row, projection, projection_left, projection_right, space_size,
1115
0
                           initial_pitch, best_sp_sd,
1116
                           // space sd
1117
0
                           best_mid_cuts, best_cells, testing_on);
1118
0
  }
1119
17.3k
  if (textord_disable_pitch_test) {
1120
0
    best_sp_sd = initial_pitch;
1121
0
    return initial_pitch;
1122
0
  }
1123
17.3k
  initial_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1124
17.3k
                                initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on);
1125
17.3k
  best_sd = initial_sd;
1126
17.3k
  best_pitch = initial_pitch;
1127
17.3k
  if (testing_on) {
1128
0
    tprintf("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
1129
0
  }
1130
28.1k
  for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1131
23.9k
    pitch_sd =
1132
23.9k
        compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1133
23.9k
                         initial_pitch + pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);
1134
23.9k
    if (testing_on) {
1135
0
      tprintf("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta, pitch_sd);
1136
0
    }
1137
23.9k
    if (pitch_sd < best_sd) {
1138
8.31k
      best_sd = pitch_sd;
1139
8.31k
      best_mid_cuts = mid_cuts;
1140
8.31k
      best_sp_sd = sp_sd;
1141
8.31k
      best_pitch = initial_pitch + pitch_delta;
1142
8.31k
      best_cells->clear();
1143
8.31k
      best_it.set_to_list(best_cells);
1144
8.31k
      best_it.add_list_after(&test_cells);
1145
15.6k
    } else {
1146
15.6k
      test_cells.clear();
1147
15.6k
    }
1148
23.9k
    if (pitch_sd > initial_sd) {
1149
13.1k
      break; // getting worse
1150
13.1k
    }
1151
23.9k
  }
1152
28.9k
  for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
1153
24.2k
    pitch_sd =
1154
24.2k
        compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1155
24.2k
                         initial_pitch - pitch_delta, sp_sd, mid_cuts, &test_cells, testing_on);
1156
24.2k
    if (testing_on) {
1157
0
      tprintf("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta, pitch_sd);
1158
0
    }
1159
24.2k
    if (pitch_sd < best_sd) {
1160
5.68k
      best_sd = pitch_sd;
1161
5.68k
      best_mid_cuts = mid_cuts;
1162
5.68k
      best_sp_sd = sp_sd;
1163
5.68k
      best_pitch = initial_pitch - pitch_delta;
1164
5.68k
      best_cells->clear();
1165
5.68k
      best_it.set_to_list(best_cells);
1166
5.68k
      best_it.add_list_after(&test_cells);
1167
18.6k
    } else {
1168
18.6k
      test_cells.clear();
1169
18.6k
    }
1170
24.2k
    if (pitch_sd > initial_sd) {
1171
12.6k
      break;
1172
12.6k
    }
1173
24.2k
  }
1174
17.3k
  initial_pitch = best_pitch;
1175
1176
17.3k
  if (textord_debug_pitch_metric) {
1177
0
    print_pitch_sd(row, projection, projection_left, projection_right, space_size, best_pitch);
1178
0
  }
1179
1180
17.3k
  return best_sd;
1181
17.3k
}
1182
1183
/**********************************************************************
1184
 * tune_row_pitch
1185
 *
1186
 * Use a dp algorithm to fit the character cells and return the sd of
1187
 * the cell size over the row.
1188
 **********************************************************************/
1189
1190
float tune_row_pitch2(          // find fp cells
1191
    TO_ROW *row,                // row to do
1192
    STATS *projection,          // vertical projection
1193
    int16_t projection_left,    // edge of projection
1194
    int16_t projection_right,   // edge of projection
1195
    float space_size,           // size of blank
1196
    float &initial_pitch,       // guess at pitch
1197
    float &best_sp_sd,          // space sd
1198
    int16_t &best_mid_cuts,     // no of cheap cuts
1199
    ICOORDELT_LIST *best_cells, // row cells
1200
    bool testing_on             // individual words
1201
0
) {
1202
0
  int pitch_delta;    // offset pitch
1203
0
  int16_t pixel;      // pixel coord
1204
0
  int16_t best_pixel; // pixel coord
1205
0
  int16_t best_delta; // best pitch
1206
0
  int16_t best_pitch; // best pitch
1207
0
  int16_t start;      // of good range
1208
0
  int16_t end;        // of good range
1209
0
  int32_t best_count; // lowest sum
1210
0
  float best_sd;      // best result
1211
1212
0
  best_sp_sd = initial_pitch;
1213
1214
0
  best_pitch = static_cast<int>(initial_pitch);
1215
0
  if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) {
1216
0
    return initial_pitch;
1217
0
  }
1218
0
  std::unique_ptr<STATS[]> sum_proj(new STATS[textord_pitch_range * 2 + 1]); // summed projection
1219
1220
0
  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1221
0
    sum_proj[textord_pitch_range + pitch_delta].set_range(0, best_pitch + pitch_delta);
1222
0
  }
1223
0
  for (pixel = projection_left; pixel <= projection_right; pixel++) {
1224
0
    for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1225
0
      sum_proj[textord_pitch_range + pitch_delta].add(
1226
0
          (pixel - projection_left) % (best_pitch + pitch_delta), projection->pile_count(pixel));
1227
0
    }
1228
0
  }
1229
0
  best_count = sum_proj[textord_pitch_range].pile_count(0);
1230
0
  best_delta = 0;
1231
0
  best_pixel = 0;
1232
0
  for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) {
1233
0
    for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
1234
0
      if (sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel) < best_count) {
1235
0
        best_count = sum_proj[textord_pitch_range + pitch_delta].pile_count(pixel);
1236
0
        best_delta = pitch_delta;
1237
0
        best_pixel = pixel;
1238
0
      }
1239
0
    }
1240
0
  }
1241
0
  if (testing_on) {
1242
0
    tprintf("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n", initial_pitch, best_delta,
1243
0
            best_count);
1244
0
  }
1245
0
  best_pitch += best_delta;
1246
0
  initial_pitch = best_pitch;
1247
0
  best_count++;
1248
0
  best_count += best_count;
1249
0
  for (start = best_pixel - 2;
1250
0
       start > best_pixel - best_pitch &&
1251
0
       sum_proj[textord_pitch_range + best_delta].pile_count(start % best_pitch) <= best_count;
1252
0
       start--) {
1253
0
    ;
1254
0
  }
1255
0
  for (end = best_pixel + 2;
1256
0
       end < best_pixel + best_pitch &&
1257
0
       sum_proj[textord_pitch_range + best_delta].pile_count(end % best_pitch) <= best_count;
1258
0
       end++) {
1259
0
    ;
1260
0
  }
1261
1262
0
  best_sd = compute_pitch_sd(row, projection, projection_left, projection_right, space_size,
1263
0
                             initial_pitch, best_sp_sd, best_mid_cuts, best_cells, testing_on,
1264
0
                             start, end);
1265
0
  if (testing_on) {
1266
0
    tprintf("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch, best_sd);
1267
0
  }
1268
1269
0
  if (textord_debug_pitch_metric) {
1270
0
    print_pitch_sd(row, projection, projection_left, projection_right, space_size, initial_pitch);
1271
0
  }
1272
1273
0
  return best_sd;
1274
0
}
1275
1276
/**********************************************************************
1277
 * compute_pitch_sd
1278
 *
1279
 * Use a dp algorithm to fit the character cells and return the sd of
1280
 * the cell size over the row.
1281
 **********************************************************************/
1282
1283
float compute_pitch_sd(        // find fp cells
1284
    TO_ROW *row,               // row to do
1285
    STATS *projection,         // vertical projection
1286
    int16_t projection_left,   // edge
1287
    int16_t projection_right,  // edge
1288
    float space_size,          // size of blank
1289
    float initial_pitch,       // guess at pitch
1290
    float &sp_sd,              // space sd
1291
    int16_t &mid_cuts,         // no of free cuts
1292
    ICOORDELT_LIST *row_cells, // list of chop pts
1293
    bool testing_on,           // individual words
1294
    int16_t start,             // start of good range
1295
    int16_t end                // end of good range
1296
65.5k
) {
1297
65.5k
  int16_t occupation; // no of cells in word.
1298
                      // blobs
1299
65.5k
  BLOBNBOX_IT blob_it = row->blob_list();
1300
65.5k
  BLOBNBOX_IT start_it;  // start of word
1301
65.5k
  BLOBNBOX_IT plot_it;   // for plotting
1302
65.5k
  int16_t blob_count;    // no of blobs
1303
65.5k
  TBOX blob_box;         // bounding box
1304
65.5k
  TBOX prev_box;         // of super blob
1305
65.5k
  int32_t prev_right;    // of word sync
1306
65.5k
  int scale_factor;      // on scores for big words
1307
65.5k
  int32_t sp_count;      // spaces
1308
65.5k
  FPSEGPT_LIST seg_list; // char cells
1309
65.5k
  FPSEGPT_IT seg_it;     // iterator
1310
65.5k
  int16_t segpos;        // position of segment
1311
65.5k
  int16_t cellpos;       // previous cell boundary
1312
                         // iterator
1313
65.5k
  ICOORDELT_IT cell_it = row_cells;
1314
65.5k
  ICOORDELT *cell;     // new cell
1315
65.5k
  double sqsum;        // sum of squares
1316
65.5k
  double spsum;        // of spaces
1317
65.5k
  double sp_var;       // space error
1318
65.5k
  double word_sync;    // result for word
1319
65.5k
  int32_t total_count; // total blobs
1320
1321
65.5k
  if ((pitsync_linear_version & 3) > 1) {
1322
65.5k
    word_sync = compute_pitch_sd2(row, projection, projection_left, projection_right, initial_pitch,
1323
65.5k
                                  occupation, mid_cuts, row_cells, testing_on, start, end);
1324
65.5k
    sp_sd = occupation;
1325
65.5k
    return word_sync;
1326
65.5k
  }
1327
0
  mid_cuts = 0;
1328
0
  cellpos = 0;
1329
0
  total_count = 0;
1330
0
  sqsum = 0;
1331
0
  sp_count = 0;
1332
0
  spsum = 0;
1333
0
  prev_right = -1;
1334
0
  if (blob_it.empty()) {
1335
0
    return space_size * 10;
1336
0
  }
1337
#ifndef GRAPHICS_DISABLED
1338
  if (testing_on && to_win != nullptr) {
1339
    blob_box = blob_it.data()->bounding_box();
1340
    projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
1341
  }
1342
#endif
1343
0
  start_it = blob_it;
1344
0
  blob_count = 0;
1345
0
  blob_box = box_next(&blob_it); // first blob
1346
0
  blob_it.mark_cycle_pt();
1347
0
  do {
1348
0
    for (; blob_count > 0; blob_count--) {
1349
0
      box_next(&start_it);
1350
0
    }
1351
0
    do {
1352
0
      prev_box = blob_box;
1353
0
      blob_count++;
1354
0
      blob_box = box_next(&blob_it);
1355
0
    } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size);
1356
0
    plot_it = start_it;
1357
0
    if (pitsync_linear_version & 3) {
1358
0
      word_sync = check_pitch_sync2(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2,
1359
0
                                    projection, projection_left, projection_right,
1360
0
                                    row->xheight * textord_projection_scale, occupation, &seg_list,
1361
0
                                    start, end);
1362
0
    } else {
1363
0
      word_sync = check_pitch_sync(&start_it, blob_count, static_cast<int16_t>(initial_pitch), 2,
1364
0
                                   projection, &seg_list);
1365
0
    }
1366
0
    if (testing_on) {
1367
0
      tprintf("Word ending at (%d,%d), len=%d, sync rating=%g, ", prev_box.right(), prev_box.top(),
1368
0
              seg_list.length() - 1, word_sync);
1369
0
      seg_it.set_to_list(&seg_list);
1370
0
      for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1371
0
        if (seg_it.data()->faked) {
1372
0
          tprintf("(F)");
1373
0
        }
1374
0
        tprintf("%d, ", seg_it.data()->position());
1375
        //                              tprintf("C=%g, s=%g, sq=%g\n",
1376
        //                                      seg_it.data()->cost_function(),
1377
        //                                      seg_it.data()->sum(),
1378
        //                                      seg_it.data()->squares());
1379
0
      }
1380
0
      tprintf("\n");
1381
0
    }
1382
#ifndef GRAPHICS_DISABLED
1383
    if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) {
1384
      plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1385
    }
1386
#endif
1387
0
    seg_it.set_to_list(&seg_list);
1388
0
    if (prev_right >= 0) {
1389
0
      sp_var = seg_it.data()->position() - prev_right;
1390
0
      sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;
1391
0
      sp_var *= sp_var;
1392
0
      spsum += sp_var;
1393
0
      sp_count++;
1394
0
    }
1395
0
    for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1396
0
      segpos = seg_it.data()->position();
1397
0
      if (cell_it.empty() || segpos > cellpos + initial_pitch / 2) {
1398
        // big gap
1399
0
        while (!cell_it.empty() && segpos > cellpos + initial_pitch * 3 / 2) {
1400
0
          cell = new ICOORDELT(cellpos + static_cast<int16_t>(initial_pitch), 0);
1401
0
          cell_it.add_after_then_move(cell);
1402
0
          cellpos += static_cast<int16_t>(initial_pitch);
1403
0
        }
1404
        // make new one
1405
0
        cell = new ICOORDELT(segpos, 0);
1406
0
        cell_it.add_after_then_move(cell);
1407
0
        cellpos = segpos;
1408
0
      } else if (segpos > cellpos - initial_pitch / 2) {
1409
0
        cell = cell_it.data();
1410
        // average positions
1411
0
        cell->set_x((cellpos + segpos) / 2);
1412
0
        cellpos = cell->x();
1413
0
      }
1414
0
    }
1415
0
    seg_it.move_to_last();
1416
0
    prev_right = seg_it.data()->position();
1417
0
    if (textord_pitch_scalebigwords) {
1418
0
      scale_factor = (seg_list.length() - 2) / 2;
1419
0
      if (scale_factor < 1) {
1420
0
        scale_factor = 1;
1421
0
      }
1422
0
    } else {
1423
0
      scale_factor = 1;
1424
0
    }
1425
0
    sqsum += word_sync * scale_factor;
1426
0
    total_count += (seg_list.length() - 1) * scale_factor;
1427
0
    seg_list.clear();
1428
0
  } while (!blob_it.cycled_list());
1429
0
  sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;
1430
0
  return total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;
1431
0
}
1432
1433
/**********************************************************************
1434
 * compute_pitch_sd2
1435
 *
1436
 * Use a dp algorithm to fit the character cells and return the sd of
1437
 * the cell size over the row.
1438
 **********************************************************************/
1439
1440
float compute_pitch_sd2(       // find fp cells
1441
    TO_ROW *row,               // row to do
1442
    STATS *projection,         // vertical projection
1443
    int16_t projection_left,   // edge
1444
    int16_t projection_right,  // edge
1445
    float initial_pitch,       // guess at pitch
1446
    int16_t &occupation,       // no of occupied cells
1447
    int16_t &mid_cuts,         // no of free cuts
1448
    ICOORDELT_LIST *row_cells, // list of chop pts
1449
    bool testing_on,           // individual words
1450
    int16_t start,             // start of good range
1451
    int16_t end                // end of good range
1452
65.5k
) {
1453
  // blobs
1454
65.5k
  BLOBNBOX_IT blob_it = row->blob_list();
1455
65.5k
  BLOBNBOX_IT plot_it;
1456
65.5k
  int16_t blob_count;    // no of blobs
1457
65.5k
  TBOX blob_box;         // bounding box
1458
65.5k
  FPSEGPT_LIST seg_list; // char cells
1459
65.5k
  FPSEGPT_IT seg_it;     // iterator
1460
65.5k
  int16_t segpos;        // position of segment
1461
                         // iterator
1462
65.5k
  ICOORDELT_IT cell_it = row_cells;
1463
65.5k
  ICOORDELT *cell;  // new cell
1464
65.5k
  double word_sync; // result for word
1465
1466
65.5k
  mid_cuts = 0;
1467
65.5k
  if (blob_it.empty()) {
1468
0
    occupation = 0;
1469
0
    return initial_pitch * 10;
1470
0
  }
1471
#ifndef GRAPHICS_DISABLED
1472
  if (testing_on && to_win != nullptr) {
1473
    projection->plot(to_win, projection_left, row->intercept(), 1.0f, -1.0f, ScrollView::CORAL);
1474
  }
1475
#endif
1476
65.5k
  blob_count = 0;
1477
65.5k
  blob_it.mark_cycle_pt();
1478
909k
  do {
1479
    // first blob
1480
909k
    blob_box = box_next(&blob_it);
1481
909k
    blob_count++;
1482
909k
  } while (!blob_it.cycled_list());
1483
65.5k
  plot_it = blob_it;
1484
65.5k
  word_sync = check_pitch_sync2(
1485
65.5k
      &blob_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left,
1486
65.5k
      projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, start, end);
1487
65.5k
  if (testing_on) {
1488
0
    tprintf("Row ending at (%d,%d), len=%d, sync rating=%g, ", blob_box.right(), blob_box.top(),
1489
0
            seg_list.length() - 1, word_sync);
1490
0
    seg_it.set_to_list(&seg_list);
1491
0
    for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1492
0
      if (seg_it.data()->faked) {
1493
0
        tprintf("(F)");
1494
0
      }
1495
0
      tprintf("%d, ", seg_it.data()->position());
1496
      //                              tprintf("C=%g, s=%g, sq=%g\n",
1497
      //                                      seg_it.data()->cost_function(),
1498
      //                                      seg_it.data()->sum(),
1499
      //                                      seg_it.data()->squares());
1500
0
    }
1501
0
    tprintf("\n");
1502
0
  }
1503
#ifndef GRAPHICS_DISABLED
1504
  if (textord_show_fixed_cuts && blob_count > 0 && to_win != nullptr) {
1505
    plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
1506
  }
1507
#endif
1508
65.5k
  seg_it.set_to_list(&seg_list);
1509
866k
  for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) {
1510
801k
    segpos = seg_it.data()->position();
1511
    // make new one
1512
801k
    cell = new ICOORDELT(segpos, 0);
1513
801k
    cell_it.add_after_then_move(cell);
1514
801k
    if (seg_it.at_last()) {
1515
65.5k
      mid_cuts = seg_it.data()->cheap_cuts();
1516
65.5k
    }
1517
801k
  }
1518
65.5k
  seg_list.clear();
1519
65.5k
  return occupation > 0 ? sqrt(word_sync / occupation) : initial_pitch * 10;
1520
65.5k
}
1521
1522
/**********************************************************************
1523
 * print_pitch_sd
1524
 *
1525
 * Use a dp algorithm to fit the character cells and return the sd of
1526
 * the cell size over the row.
1527
 **********************************************************************/
1528
1529
void print_pitch_sd(         // find fp cells
1530
    TO_ROW *row,             // row to do
1531
    STATS *projection,       // vertical projection
1532
    int16_t projection_left, // edges //size of blank
1533
    int16_t projection_right, float space_size,
1534
    float initial_pitch // guess at pitch
1535
0
) {
1536
0
  const char *res2;   // pitch result
1537
0
  int16_t occupation; // used cells
1538
0
  float sp_sd;        // space sd
1539
                      // blobs
1540
0
  BLOBNBOX_IT blob_it = row->blob_list();
1541
0
  BLOBNBOX_IT start_it;     // start of word
1542
0
  BLOBNBOX_IT row_start;    // start of row
1543
0
  int16_t blob_count;       // no of blobs
1544
0
  int16_t total_blob_count; // total blobs in line
1545
0
  TBOX blob_box;            // bounding box
1546
0
  TBOX prev_box;            // of super blob
1547
0
  int32_t prev_right;       // of word sync
1548
0
  int scale_factor;         // on scores for big words
1549
0
  int32_t sp_count;         // spaces
1550
0
  FPSEGPT_LIST seg_list;    // char cells
1551
0
  FPSEGPT_IT seg_it;        // iterator
1552
0
  double sqsum;             // sum of squares
1553
0
  double spsum;             // of spaces
1554
0
  double sp_var;            // space error
1555
0
  double word_sync;         // result for word
1556
0
  double total_count;       // total cuts
1557
1558
0
  if (blob_it.empty()) {
1559
0
    return;
1560
0
  }
1561
0
  row_start = blob_it;
1562
0
  total_blob_count = 0;
1563
1564
0
  total_count = 0;
1565
0
  sqsum = 0;
1566
0
  sp_count = 0;
1567
0
  spsum = 0;
1568
0
  prev_right = -1;
1569
0
  blob_it = row_start;
1570
0
  start_it = blob_it;
1571
0
  blob_count = 0;
1572
0
  blob_box = box_next(&blob_it); // first blob
1573
0
  blob_it.mark_cycle_pt();
1574
0
  do {
1575
0
    for (; blob_count > 0; blob_count--) {
1576
0
      box_next(&start_it);
1577
0
    }
1578
0
    do {
1579
0
      prev_box = blob_box;
1580
0
      blob_count++;
1581
0
      blob_box = box_next(&blob_it);
1582
0
    } while (!blob_it.cycled_list() && blob_box.left() - prev_box.right() < space_size);
1583
0
    word_sync = check_pitch_sync2(
1584
0
        &start_it, blob_count, static_cast<int16_t>(initial_pitch), 2, projection, projection_left,
1585
0
        projection_right, row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0);
1586
0
    total_blob_count += blob_count;
1587
0
    seg_it.set_to_list(&seg_list);
1588
0
    if (prev_right >= 0) {
1589
0
      sp_var = seg_it.data()->position() - prev_right;
1590
0
      sp_var -= floor(sp_var / initial_pitch + 0.5) * initial_pitch;
1591
0
      sp_var *= sp_var;
1592
0
      spsum += sp_var;
1593
0
      sp_count++;
1594
0
    }
1595
0
    seg_it.move_to_last();
1596
0
    prev_right = seg_it.data()->position();
1597
0
    if (textord_pitch_scalebigwords) {
1598
0
      scale_factor = (seg_list.length() - 2) / 2;
1599
0
      if (scale_factor < 1) {
1600
0
        scale_factor = 1;
1601
0
      }
1602
0
    } else {
1603
0
      scale_factor = 1;
1604
0
    }
1605
0
    sqsum += word_sync * scale_factor;
1606
0
    total_count += (seg_list.length() - 1) * scale_factor;
1607
0
    seg_list.clear();
1608
0
  } while (!blob_it.cycled_list());
1609
0
  sp_sd = sp_count > 0 ? sqrt(spsum / sp_count) : 0;
1610
0
  word_sync = total_count > 0 ? sqrt(sqsum / total_count) : space_size * 10;
1611
0
  tprintf("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:", word_sync, word_sync / initial_pitch, sp_sd,
1612
0
          word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P');
1613
1614
0
  start_it = row_start;
1615
0
  blob_it = row_start;
1616
0
  word_sync =
1617
0
      check_pitch_sync2(&blob_it, total_blob_count, static_cast<int16_t>(initial_pitch), 2,
1618
0
                        projection, projection_left, projection_right,
1619
0
                        row->xheight * textord_projection_scale, occupation, &seg_list, 0, 0);
1620
0
  if (occupation > 1) {
1621
0
    word_sync /= occupation;
1622
0
  }
1623
0
  word_sync = sqrt(word_sync);
1624
1625
#ifndef GRAPHICS_DISABLED
1626
  if (textord_show_row_cuts && to_win != nullptr) {
1627
    plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list);
1628
  }
1629
#endif
1630
0
  seg_list.clear();
1631
0
  if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {
1632
0
    if (word_sync < textord_words_def_fixed * initial_pitch && !row->all_caps) {
1633
0
      res2 = "DF";
1634
0
    } else {
1635
0
      res2 = "MF";
1636
0
    }
1637
0
  } else {
1638
0
    res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
1639
0
  }
1640
0
  tprintf(
1641
0
      "row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, "
1642
0
      "all_caps=%d\n",
1643
0
      word_sync, word_sync / initial_pitch,
1644
0
      word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P', occupation, res2,
1645
0
      initial_pitch, row->fixed_pitch, row->all_caps);
1646
0
}
1647
1648
/**********************************************************************
1649
 * find_repeated_chars
1650
 *
1651
 * Extract marked leader blobs and put them
1652
 * into words in advance of fixed pitch checking and word generation.
1653
 **********************************************************************/
1654
void find_repeated_chars(TO_BLOCK *block,   // Block to search.
1655
16.7k
                         bool testing_on) { // Debug mode.
1656
16.7k
  POLY_BLOCK *pb = block->block->pdblk.poly_block();
1657
16.7k
  if (pb != nullptr && !pb->IsText()) {
1658
0
    return; // Don't find repeated chars in non-text blocks.
1659
0
  }
1660
1661
16.7k
  TO_ROW *row;
1662
16.7k
  BLOBNBOX_IT box_it;
1663
16.7k
  BLOBNBOX_IT search_it; // forward search
1664
16.7k
  WERD *word;            // new word
1665
16.7k
  TBOX word_box;         // for plotting
1666
16.7k
  int blobcount, repeated_set;
1667
1668
16.7k
  TO_ROW_IT row_it = block->get_rows();
1669
16.7k
  if (row_it.empty()) {
1670
0
    return; // empty block
1671
0
  }
1672
202k
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1673
185k
    row = row_it.data();
1674
185k
    box_it.set_to_list(row->blob_list());
1675
185k
    if (box_it.empty()) {
1676
0
      continue; // no blobs in this row
1677
0
    }
1678
185k
    if (!row->rep_chars_marked()) {
1679
0
      mark_repeated_chars(row);
1680
0
    }
1681
185k
    if (row->num_repeated_sets() == 0) {
1682
185k
      continue; // nothing to do for this row
1683
185k
    }
1684
    // new words
1685
0
    WERD_IT word_it(&row->rep_words);
1686
0
    do {
1687
0
      if (box_it.data()->repeated_set() != 0 && !box_it.data()->joined_to_prev()) {
1688
0
        blobcount = 1;
1689
0
        repeated_set = box_it.data()->repeated_set();
1690
0
        search_it = box_it;
1691
0
        search_it.forward();
1692
0
        while (!search_it.at_first() && search_it.data()->repeated_set() == repeated_set) {
1693
0
          blobcount++;
1694
0
          search_it.forward();
1695
0
        }
1696
        // After the call to make_real_word() all the blobs from this
1697
        // repeated set will be removed from the blob list. box_it will be
1698
        // set to point to the blob after the end of the extracted sequence.
1699
0
        word = make_real_word(&box_it, blobcount, box_it.at_first(), 1);
1700
0
        if (!box_it.empty() && box_it.data()->joined_to_prev()) {
1701
0
          tprintf("Bad box joined to prev at");
1702
0
          box_it.data()->bounding_box().print();
1703
0
          tprintf("After repeated word:");
1704
0
          word->bounding_box().print();
1705
0
        }
1706
0
        ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev());
1707
0
        word->set_flag(W_REP_CHAR, true);
1708
0
        word->set_flag(W_DONT_CHOP, true);
1709
0
        word_it.add_after_then_move(word);
1710
0
      } else {
1711
0
        box_it.forward();
1712
0
      }
1713
0
    } while (!box_it.at_first());
1714
0
  }
1715
16.7k
}
1716
1717
/**********************************************************************
1718
 * plot_fp_word
1719
 *
1720
 * Plot a block of words as if fixed pitch.
1721
 **********************************************************************/
1722
1723
#ifndef GRAPHICS_DISABLED
1724
void plot_fp_word(   // draw block of words
1725
    TO_BLOCK *block, // block to draw
1726
    float pitch,     // pitch to draw with
1727
    float nonspace   // for space threshold
1728
) {
1729
  TO_ROW *row; // current row
1730
  TO_ROW_IT row_it = block->get_rows();
1731
1732
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1733
    row = row_it.data();
1734
    row->min_space = static_cast<int32_t>((pitch + nonspace) / 2);
1735
    row->max_nonspace = row->min_space;
1736
    row->space_threshold = row->min_space;
1737
    plot_word_decisions(to_win, static_cast<int16_t>(pitch), row);
1738
  }
1739
}
1740
#endif
1741
1742
} // namespace tesseract